Safetensors
llava
ViMUL / trainer_state.json
k-m-irfan's picture
Upload folder using huggingface_hub
d48ba4d verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9996942830938551,
"eval_steps": 500,
"global_step": 1635,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 5.6230373155383,
"learning_rate": 2.0000000000000002e-07,
"loss": 1.0665,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 6.01161593196551,
"learning_rate": 4.0000000000000003e-07,
"loss": 1.0829,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 5.53722803622308,
"learning_rate": 6.000000000000001e-07,
"loss": 1.0876,
"step": 3
},
{
"epoch": 0.0,
"grad_norm": 7.93426726662942,
"learning_rate": 8.000000000000001e-07,
"loss": 1.0719,
"step": 4
},
{
"epoch": 0.0,
"grad_norm": 7.031649978841274,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.0572,
"step": 5
},
{
"epoch": 0.0,
"grad_norm": 6.3575449397660835,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.1097,
"step": 6
},
{
"epoch": 0.0,
"grad_norm": 6.573922198067659,
"learning_rate": 1.4000000000000001e-06,
"loss": 1.1593,
"step": 7
},
{
"epoch": 0.0,
"grad_norm": 5.564979871860141,
"learning_rate": 1.6000000000000001e-06,
"loss": 1.0538,
"step": 8
},
{
"epoch": 0.01,
"grad_norm": 6.0509131309088655,
"learning_rate": 1.8000000000000001e-06,
"loss": 1.0715,
"step": 9
},
{
"epoch": 0.01,
"grad_norm": 6.12848977059448,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.9286,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 4.916177727066278,
"learning_rate": 2.2e-06,
"loss": 0.9492,
"step": 11
},
{
"epoch": 0.01,
"grad_norm": 4.8581443117369405,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.0133,
"step": 12
},
{
"epoch": 0.01,
"grad_norm": 3.908502671210593,
"learning_rate": 2.6e-06,
"loss": 0.9243,
"step": 13
},
{
"epoch": 0.01,
"grad_norm": 4.034985649544406,
"learning_rate": 2.8000000000000003e-06,
"loss": 1.0416,
"step": 14
},
{
"epoch": 0.01,
"grad_norm": 3.507758376052119,
"learning_rate": 3e-06,
"loss": 0.9373,
"step": 15
},
{
"epoch": 0.01,
"grad_norm": 3.663272180369727,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.8673,
"step": 16
},
{
"epoch": 0.01,
"grad_norm": 3.508069835907157,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.8894,
"step": 17
},
{
"epoch": 0.01,
"grad_norm": 3.294815456496393,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.8841,
"step": 18
},
{
"epoch": 0.01,
"grad_norm": 2.877754612416487,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.768,
"step": 19
},
{
"epoch": 0.01,
"grad_norm": 2.664239443889974,
"learning_rate": 4.000000000000001e-06,
"loss": 0.7173,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 2.900279841618844,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.7689,
"step": 21
},
{
"epoch": 0.01,
"grad_norm": 3.0487383417411658,
"learning_rate": 4.4e-06,
"loss": 0.7327,
"step": 22
},
{
"epoch": 0.01,
"grad_norm": 2.9928876018893447,
"learning_rate": 4.600000000000001e-06,
"loss": 0.8763,
"step": 23
},
{
"epoch": 0.01,
"grad_norm": 3.031747010513625,
"learning_rate": 4.800000000000001e-06,
"loss": 0.812,
"step": 24
},
{
"epoch": 0.02,
"grad_norm": 2.5408522914684863,
"learning_rate": 5e-06,
"loss": 0.8598,
"step": 25
},
{
"epoch": 0.02,
"grad_norm": 2.7561902015944253,
"learning_rate": 5.2e-06,
"loss": 0.8967,
"step": 26
},
{
"epoch": 0.02,
"grad_norm": 2.976716291176,
"learning_rate": 5.400000000000001e-06,
"loss": 0.7226,
"step": 27
},
{
"epoch": 0.02,
"grad_norm": 2.9056079278585227,
"learning_rate": 5.600000000000001e-06,
"loss": 0.8096,
"step": 28
},
{
"epoch": 0.02,
"grad_norm": 2.4758306534625802,
"learning_rate": 5.8e-06,
"loss": 0.8044,
"step": 29
},
{
"epoch": 0.02,
"grad_norm": 2.3717118099560217,
"learning_rate": 6e-06,
"loss": 0.6461,
"step": 30
},
{
"epoch": 0.02,
"grad_norm": 2.6666631455135335,
"learning_rate": 6.200000000000001e-06,
"loss": 0.8049,
"step": 31
},
{
"epoch": 0.02,
"grad_norm": 2.8733424047344993,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.8736,
"step": 32
},
{
"epoch": 0.02,
"grad_norm": 2.6648200702201637,
"learning_rate": 6.600000000000001e-06,
"loss": 0.7235,
"step": 33
},
{
"epoch": 0.02,
"grad_norm": 2.5601473220515056,
"learning_rate": 6.800000000000001e-06,
"loss": 0.68,
"step": 34
},
{
"epoch": 0.02,
"grad_norm": 2.7840115776082466,
"learning_rate": 7e-06,
"loss": 0.7816,
"step": 35
},
{
"epoch": 0.02,
"grad_norm": 2.524287013051412,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.7471,
"step": 36
},
{
"epoch": 0.02,
"grad_norm": 2.4550748912153613,
"learning_rate": 7.4e-06,
"loss": 0.7719,
"step": 37
},
{
"epoch": 0.02,
"grad_norm": 3.1526664248369936,
"learning_rate": 7.600000000000001e-06,
"loss": 0.8696,
"step": 38
},
{
"epoch": 0.02,
"grad_norm": 2.6121499364302383,
"learning_rate": 7.800000000000002e-06,
"loss": 0.8445,
"step": 39
},
{
"epoch": 0.02,
"grad_norm": 2.862520896543254,
"learning_rate": 8.000000000000001e-06,
"loss": 0.7393,
"step": 40
},
{
"epoch": 0.03,
"grad_norm": 2.4539611276002877,
"learning_rate": 8.2e-06,
"loss": 0.7101,
"step": 41
},
{
"epoch": 0.03,
"grad_norm": 2.4407347141807656,
"learning_rate": 8.400000000000001e-06,
"loss": 0.6975,
"step": 42
},
{
"epoch": 0.03,
"grad_norm": 2.2749278520807903,
"learning_rate": 8.6e-06,
"loss": 0.6533,
"step": 43
},
{
"epoch": 0.03,
"grad_norm": 2.666183851086396,
"learning_rate": 8.8e-06,
"loss": 0.7896,
"step": 44
},
{
"epoch": 0.03,
"grad_norm": 2.6380466110589214,
"learning_rate": 9e-06,
"loss": 0.7127,
"step": 45
},
{
"epoch": 0.03,
"grad_norm": 2.6229206709830577,
"learning_rate": 9.200000000000002e-06,
"loss": 0.8321,
"step": 46
},
{
"epoch": 0.03,
"grad_norm": 2.439693927487141,
"learning_rate": 9.4e-06,
"loss": 0.6645,
"step": 47
},
{
"epoch": 0.03,
"grad_norm": 2.3073039773943127,
"learning_rate": 9.600000000000001e-06,
"loss": 0.6964,
"step": 48
},
{
"epoch": 0.03,
"grad_norm": 2.4261514181880757,
"learning_rate": 9.800000000000001e-06,
"loss": 0.7328,
"step": 49
},
{
"epoch": 0.03,
"grad_norm": 2.6298961031353434,
"learning_rate": 1e-05,
"loss": 0.7843,
"step": 50
},
{
"epoch": 0.03,
"grad_norm": 2.368357019452094,
"learning_rate": 9.999990178426327e-06,
"loss": 0.668,
"step": 51
},
{
"epoch": 0.03,
"grad_norm": 2.7183273232506373,
"learning_rate": 9.999960713743888e-06,
"loss": 0.9064,
"step": 52
},
{
"epoch": 0.03,
"grad_norm": 2.4366531712753416,
"learning_rate": 9.99991160606844e-06,
"loss": 0.6861,
"step": 53
},
{
"epoch": 0.03,
"grad_norm": 2.594914984056206,
"learning_rate": 9.999842855592912e-06,
"loss": 0.6947,
"step": 54
},
{
"epoch": 0.03,
"grad_norm": 2.631921187839759,
"learning_rate": 9.999754462587396e-06,
"loss": 0.8039,
"step": 55
},
{
"epoch": 0.03,
"grad_norm": 2.5158584160059085,
"learning_rate": 9.999646427399155e-06,
"loss": 0.7604,
"step": 56
},
{
"epoch": 0.03,
"grad_norm": 2.4520389413039703,
"learning_rate": 9.999518750452622e-06,
"loss": 0.8429,
"step": 57
},
{
"epoch": 0.04,
"grad_norm": 2.511219649765081,
"learning_rate": 9.99937143224939e-06,
"loss": 0.6068,
"step": 58
},
{
"epoch": 0.04,
"grad_norm": 2.2738544213871683,
"learning_rate": 9.999204473368218e-06,
"loss": 0.7126,
"step": 59
},
{
"epoch": 0.04,
"grad_norm": 2.2995201634238365,
"learning_rate": 9.999017874465028e-06,
"loss": 0.7117,
"step": 60
},
{
"epoch": 0.04,
"grad_norm": 2.3985761980411646,
"learning_rate": 9.998811636272893e-06,
"loss": 0.6838,
"step": 61
},
{
"epoch": 0.04,
"grad_norm": 2.2586065910900595,
"learning_rate": 9.998585759602052e-06,
"loss": 0.6853,
"step": 62
},
{
"epoch": 0.04,
"grad_norm": 2.585278752939056,
"learning_rate": 9.998340245339888e-06,
"loss": 0.8295,
"step": 63
},
{
"epoch": 0.04,
"grad_norm": 2.4881456153581327,
"learning_rate": 9.998075094450935e-06,
"loss": 0.6653,
"step": 64
},
{
"epoch": 0.04,
"grad_norm": 2.2548663647589096,
"learning_rate": 9.997790307976874e-06,
"loss": 0.6354,
"step": 65
},
{
"epoch": 0.04,
"grad_norm": 2.229109711289454,
"learning_rate": 9.997485887036524e-06,
"loss": 0.6932,
"step": 66
},
{
"epoch": 0.04,
"grad_norm": 2.452224143427515,
"learning_rate": 9.997161832825843e-06,
"loss": 0.7654,
"step": 67
},
{
"epoch": 0.04,
"grad_norm": 2.2297142951513824,
"learning_rate": 9.996818146617922e-06,
"loss": 0.6906,
"step": 68
},
{
"epoch": 0.04,
"grad_norm": 2.2791022191610946,
"learning_rate": 9.996454829762973e-06,
"loss": 0.754,
"step": 69
},
{
"epoch": 0.04,
"grad_norm": 2.4219124929553515,
"learning_rate": 9.996071883688333e-06,
"loss": 0.6764,
"step": 70
},
{
"epoch": 0.04,
"grad_norm": 2.3501944220961146,
"learning_rate": 9.99566930989846e-06,
"loss": 0.7615,
"step": 71
},
{
"epoch": 0.04,
"grad_norm": 2.344366272975179,
"learning_rate": 9.995247109974915e-06,
"loss": 0.7134,
"step": 72
},
{
"epoch": 0.04,
"grad_norm": 2.2314263082377934,
"learning_rate": 9.994805285576364e-06,
"loss": 0.7631,
"step": 73
},
{
"epoch": 0.05,
"grad_norm": 2.2834194270284636,
"learning_rate": 9.99434383843857e-06,
"loss": 0.8618,
"step": 74
},
{
"epoch": 0.05,
"grad_norm": 2.1338375917815813,
"learning_rate": 9.99386277037439e-06,
"loss": 0.6911,
"step": 75
},
{
"epoch": 0.05,
"grad_norm": 2.3756649144864954,
"learning_rate": 9.993362083273763e-06,
"loss": 0.6907,
"step": 76
},
{
"epoch": 0.05,
"grad_norm": 2.243315382854752,
"learning_rate": 9.992841779103701e-06,
"loss": 0.7424,
"step": 77
},
{
"epoch": 0.05,
"grad_norm": 2.2756590145094773,
"learning_rate": 9.992301859908289e-06,
"loss": 0.7107,
"step": 78
},
{
"epoch": 0.05,
"grad_norm": 2.4499338417790324,
"learning_rate": 9.991742327808667e-06,
"loss": 0.6014,
"step": 79
},
{
"epoch": 0.05,
"grad_norm": 2.5224681026150524,
"learning_rate": 9.991163185003028e-06,
"loss": 0.7545,
"step": 80
},
{
"epoch": 0.05,
"grad_norm": 2.255631190324587,
"learning_rate": 9.990564433766615e-06,
"loss": 0.6931,
"step": 81
},
{
"epoch": 0.05,
"grad_norm": 2.3236984572992045,
"learning_rate": 9.989946076451693e-06,
"loss": 0.7708,
"step": 82
},
{
"epoch": 0.05,
"grad_norm": 2.3364421780529887,
"learning_rate": 9.989308115487563e-06,
"loss": 0.6633,
"step": 83
},
{
"epoch": 0.05,
"grad_norm": 2.4163016525364336,
"learning_rate": 9.988650553380537e-06,
"loss": 0.7195,
"step": 84
},
{
"epoch": 0.05,
"grad_norm": 2.296872674947527,
"learning_rate": 9.987973392713932e-06,
"loss": 0.6912,
"step": 85
},
{
"epoch": 0.05,
"grad_norm": 2.236387998298346,
"learning_rate": 9.987276636148062e-06,
"loss": 0.6737,
"step": 86
},
{
"epoch": 0.05,
"grad_norm": 2.277025205153288,
"learning_rate": 9.986560286420224e-06,
"loss": 0.7312,
"step": 87
},
{
"epoch": 0.05,
"grad_norm": 2.200531820532158,
"learning_rate": 9.985824346344692e-06,
"loss": 0.6251,
"step": 88
},
{
"epoch": 0.05,
"grad_norm": 2.488944948555403,
"learning_rate": 9.9850688188127e-06,
"loss": 0.7303,
"step": 89
},
{
"epoch": 0.06,
"grad_norm": 2.260371060935904,
"learning_rate": 9.984293706792438e-06,
"loss": 0.7546,
"step": 90
},
{
"epoch": 0.06,
"grad_norm": 2.4449103602612245,
"learning_rate": 9.983499013329035e-06,
"loss": 0.7119,
"step": 91
},
{
"epoch": 0.06,
"grad_norm": 2.0759253415676246,
"learning_rate": 9.982684741544543e-06,
"loss": 0.6844,
"step": 92
},
{
"epoch": 0.06,
"grad_norm": 2.387552887692122,
"learning_rate": 9.981850894637937e-06,
"loss": 0.6649,
"step": 93
},
{
"epoch": 0.06,
"grad_norm": 2.3995087357031797,
"learning_rate": 9.980997475885092e-06,
"loss": 0.6547,
"step": 94
},
{
"epoch": 0.06,
"grad_norm": 2.0904111698207495,
"learning_rate": 9.980124488638774e-06,
"loss": 0.6566,
"step": 95
},
{
"epoch": 0.06,
"grad_norm": 2.156307458383918,
"learning_rate": 9.979231936328627e-06,
"loss": 0.6928,
"step": 96
},
{
"epoch": 0.06,
"grad_norm": 2.24711153397962,
"learning_rate": 9.978319822461156e-06,
"loss": 0.6853,
"step": 97
},
{
"epoch": 0.06,
"grad_norm": 2.1649921192352317,
"learning_rate": 9.97738815061972e-06,
"loss": 0.7694,
"step": 98
},
{
"epoch": 0.06,
"grad_norm": 2.374786305390187,
"learning_rate": 9.976436924464513e-06,
"loss": 0.5882,
"step": 99
},
{
"epoch": 0.06,
"grad_norm": 2.4306832331036436,
"learning_rate": 9.975466147732551e-06,
"loss": 0.7988,
"step": 100
},
{
"epoch": 0.06,
"grad_norm": 2.407124728247618,
"learning_rate": 9.974475824237653e-06,
"loss": 0.7287,
"step": 101
},
{
"epoch": 0.06,
"grad_norm": 2.264722126370796,
"learning_rate": 9.973465957870437e-06,
"loss": 0.629,
"step": 102
},
{
"epoch": 0.06,
"grad_norm": 2.1005973714933273,
"learning_rate": 9.972436552598287e-06,
"loss": 0.6619,
"step": 103
},
{
"epoch": 0.06,
"grad_norm": 2.270262489712689,
"learning_rate": 9.971387612465364e-06,
"loss": 0.7143,
"step": 104
},
{
"epoch": 0.06,
"grad_norm": 2.265439764692705,
"learning_rate": 9.970319141592559e-06,
"loss": 0.7442,
"step": 105
},
{
"epoch": 0.06,
"grad_norm": 2.4319715531705213,
"learning_rate": 9.9692311441775e-06,
"loss": 0.7737,
"step": 106
},
{
"epoch": 0.07,
"grad_norm": 2.491658969929883,
"learning_rate": 9.968123624494525e-06,
"loss": 0.7946,
"step": 107
},
{
"epoch": 0.07,
"grad_norm": 2.027048425981043,
"learning_rate": 9.966996586894669e-06,
"loss": 0.6461,
"step": 108
},
{
"epoch": 0.07,
"grad_norm": 2.3372539030580874,
"learning_rate": 9.965850035805647e-06,
"loss": 0.6859,
"step": 109
},
{
"epoch": 0.07,
"grad_norm": 2.2435812305648057,
"learning_rate": 9.964683975731828e-06,
"loss": 0.7748,
"step": 110
},
{
"epoch": 0.07,
"grad_norm": 2.315025140176691,
"learning_rate": 9.963498411254235e-06,
"loss": 0.659,
"step": 111
},
{
"epoch": 0.07,
"grad_norm": 2.074191768339382,
"learning_rate": 9.96229334703051e-06,
"loss": 0.6609,
"step": 112
},
{
"epoch": 0.07,
"grad_norm": 2.145042227584639,
"learning_rate": 9.961068787794905e-06,
"loss": 0.6756,
"step": 113
},
{
"epoch": 0.07,
"grad_norm": 2.131588951231997,
"learning_rate": 9.959824738358257e-06,
"loss": 0.6334,
"step": 114
},
{
"epoch": 0.07,
"grad_norm": 2.069665378872537,
"learning_rate": 9.958561203607975e-06,
"loss": 0.6443,
"step": 115
},
{
"epoch": 0.07,
"grad_norm": 2.3057314770322646,
"learning_rate": 9.957278188508023e-06,
"loss": 0.7952,
"step": 116
},
{
"epoch": 0.07,
"grad_norm": 2.2378897605544474,
"learning_rate": 9.955975698098887e-06,
"loss": 0.7272,
"step": 117
},
{
"epoch": 0.07,
"grad_norm": 2.0307818678260654,
"learning_rate": 9.954653737497573e-06,
"loss": 0.5701,
"step": 118
},
{
"epoch": 0.07,
"grad_norm": 2.24162131248661,
"learning_rate": 9.953312311897573e-06,
"loss": 0.7793,
"step": 119
},
{
"epoch": 0.07,
"grad_norm": 2.3552427687921393,
"learning_rate": 9.951951426568852e-06,
"loss": 0.7209,
"step": 120
},
{
"epoch": 0.07,
"grad_norm": 2.141178442038793,
"learning_rate": 9.950571086857821e-06,
"loss": 0.6716,
"step": 121
},
{
"epoch": 0.07,
"grad_norm": 2.140261901239331,
"learning_rate": 9.949171298187328e-06,
"loss": 0.6743,
"step": 122
},
{
"epoch": 0.08,
"grad_norm": 2.4084672072357907,
"learning_rate": 9.94775206605662e-06,
"loss": 0.7973,
"step": 123
},
{
"epoch": 0.08,
"grad_norm": 2.025773154875324,
"learning_rate": 9.946313396041334e-06,
"loss": 0.7025,
"step": 124
},
{
"epoch": 0.08,
"grad_norm": 2.150720688365092,
"learning_rate": 9.944855293793477e-06,
"loss": 0.6128,
"step": 125
},
{
"epoch": 0.08,
"grad_norm": 2.161613123811631,
"learning_rate": 9.943377765041385e-06,
"loss": 0.6306,
"step": 126
},
{
"epoch": 0.08,
"grad_norm": 2.593842849691437,
"learning_rate": 9.941880815589726e-06,
"loss": 0.5894,
"step": 127
},
{
"epoch": 0.08,
"grad_norm": 2.504320662466919,
"learning_rate": 9.94036445131946e-06,
"loss": 0.772,
"step": 128
},
{
"epoch": 0.08,
"grad_norm": 2.1848467219695498,
"learning_rate": 9.938828678187816e-06,
"loss": 0.6397,
"step": 129
},
{
"epoch": 0.08,
"grad_norm": 2.27661938480736,
"learning_rate": 9.937273502228283e-06,
"loss": 0.6975,
"step": 130
},
{
"epoch": 0.08,
"grad_norm": 2.3673651764999573,
"learning_rate": 9.935698929550565e-06,
"loss": 0.7621,
"step": 131
},
{
"epoch": 0.08,
"grad_norm": 2.26639915751491,
"learning_rate": 9.934104966340582e-06,
"loss": 0.6551,
"step": 132
},
{
"epoch": 0.08,
"grad_norm": 2.3961035912104927,
"learning_rate": 9.932491618860419e-06,
"loss": 0.7304,
"step": 133
},
{
"epoch": 0.08,
"grad_norm": 2.2515245018919505,
"learning_rate": 9.93085889344832e-06,
"loss": 0.6655,
"step": 134
},
{
"epoch": 0.08,
"grad_norm": 2.254087375166334,
"learning_rate": 9.929206796518663e-06,
"loss": 0.666,
"step": 135
},
{
"epoch": 0.08,
"grad_norm": 2.31262707571602,
"learning_rate": 9.927535334561922e-06,
"loss": 0.7362,
"step": 136
},
{
"epoch": 0.08,
"grad_norm": 2.443869927919863,
"learning_rate": 9.925844514144651e-06,
"loss": 0.6805,
"step": 137
},
{
"epoch": 0.08,
"grad_norm": 2.2594952238504904,
"learning_rate": 9.924134341909459e-06,
"loss": 0.6936,
"step": 138
},
{
"epoch": 0.08,
"grad_norm": 2.3645395585503013,
"learning_rate": 9.922404824574976e-06,
"loss": 0.6318,
"step": 139
},
{
"epoch": 0.09,
"grad_norm": 1.952498408951814,
"learning_rate": 9.920655968935839e-06,
"loss": 0.6884,
"step": 140
},
{
"epoch": 0.09,
"grad_norm": 1.9398379720377739,
"learning_rate": 9.91888778186265e-06,
"loss": 0.5678,
"step": 141
},
{
"epoch": 0.09,
"grad_norm": 2.213422596923249,
"learning_rate": 9.917100270301963e-06,
"loss": 0.6868,
"step": 142
},
{
"epoch": 0.09,
"grad_norm": 2.2025927290666307,
"learning_rate": 9.915293441276246e-06,
"loss": 0.7192,
"step": 143
},
{
"epoch": 0.09,
"grad_norm": 2.80775911051946,
"learning_rate": 9.913467301883863e-06,
"loss": 0.784,
"step": 144
},
{
"epoch": 0.09,
"grad_norm": 2.0925302922228175,
"learning_rate": 9.91162185929904e-06,
"loss": 0.7198,
"step": 145
},
{
"epoch": 0.09,
"grad_norm": 2.334842810434387,
"learning_rate": 9.909757120771835e-06,
"loss": 0.7402,
"step": 146
},
{
"epoch": 0.09,
"grad_norm": 2.3908201091336805,
"learning_rate": 9.907873093628115e-06,
"loss": 0.6787,
"step": 147
},
{
"epoch": 0.09,
"grad_norm": 2.0776209566634316,
"learning_rate": 9.905969785269527e-06,
"loss": 0.6842,
"step": 148
},
{
"epoch": 0.09,
"grad_norm": 1.997004958324354,
"learning_rate": 9.904047203173462e-06,
"loss": 0.5733,
"step": 149
},
{
"epoch": 0.09,
"grad_norm": 2.3139630172641508,
"learning_rate": 9.90210535489303e-06,
"loss": 0.6647,
"step": 150
},
{
"epoch": 0.09,
"grad_norm": 1.9690194811261275,
"learning_rate": 9.90014424805704e-06,
"loss": 0.7311,
"step": 151
},
{
"epoch": 0.09,
"grad_norm": 2.3995016349539617,
"learning_rate": 9.898163890369948e-06,
"loss": 0.7473,
"step": 152
},
{
"epoch": 0.09,
"grad_norm": 2.1433446483892324,
"learning_rate": 9.896164289611849e-06,
"loss": 0.7016,
"step": 153
},
{
"epoch": 0.09,
"grad_norm": 2.190153295128278,
"learning_rate": 9.894145453638433e-06,
"loss": 0.67,
"step": 154
},
{
"epoch": 0.09,
"grad_norm": 2.4110621078855394,
"learning_rate": 9.892107390380959e-06,
"loss": 0.6655,
"step": 155
},
{
"epoch": 0.1,
"grad_norm": 2.2675629481717667,
"learning_rate": 9.890050107846219e-06,
"loss": 0.6459,
"step": 156
},
{
"epoch": 0.1,
"grad_norm": 2.274763942094237,
"learning_rate": 9.887973614116517e-06,
"loss": 0.6077,
"step": 157
},
{
"epoch": 0.1,
"grad_norm": 2.4805064767185314,
"learning_rate": 9.885877917349626e-06,
"loss": 0.7247,
"step": 158
},
{
"epoch": 0.1,
"grad_norm": 2.002019535180498,
"learning_rate": 9.883763025778766e-06,
"loss": 0.5373,
"step": 159
},
{
"epoch": 0.1,
"grad_norm": 2.337028939246668,
"learning_rate": 9.881628947712556e-06,
"loss": 0.7776,
"step": 160
},
{
"epoch": 0.1,
"grad_norm": 2.1479302094948247,
"learning_rate": 9.879475691535e-06,
"loss": 0.6499,
"step": 161
},
{
"epoch": 0.1,
"grad_norm": 1.9593814752135854,
"learning_rate": 9.87730326570545e-06,
"loss": 0.5575,
"step": 162
},
{
"epoch": 0.1,
"grad_norm": 2.3120091764398714,
"learning_rate": 9.875111678758553e-06,
"loss": 0.8117,
"step": 163
},
{
"epoch": 0.1,
"grad_norm": 2.58203181614075,
"learning_rate": 9.872900939304246e-06,
"loss": 0.7774,
"step": 164
},
{
"epoch": 0.1,
"grad_norm": 2.1359404314304173,
"learning_rate": 9.870671056027705e-06,
"loss": 0.7738,
"step": 165
},
{
"epoch": 0.1,
"grad_norm": 2.0558218346618484,
"learning_rate": 9.868422037689316e-06,
"loss": 0.6216,
"step": 166
},
{
"epoch": 0.1,
"grad_norm": 2.243529236580866,
"learning_rate": 9.866153893124638e-06,
"loss": 0.6684,
"step": 167
},
{
"epoch": 0.1,
"grad_norm": 2.1145868718408316,
"learning_rate": 9.863866631244371e-06,
"loss": 0.6847,
"step": 168
},
{
"epoch": 0.1,
"grad_norm": 1.9024551152535656,
"learning_rate": 9.861560261034319e-06,
"loss": 0.5933,
"step": 169
},
{
"epoch": 0.1,
"grad_norm": 2.1434847962939396,
"learning_rate": 9.859234791555356e-06,
"loss": 0.6503,
"step": 170
},
{
"epoch": 0.1,
"grad_norm": 2.1711654094618713,
"learning_rate": 9.856890231943389e-06,
"loss": 0.6768,
"step": 171
},
{
"epoch": 0.11,
"grad_norm": 2.4118520851923524,
"learning_rate": 9.854526591409325e-06,
"loss": 0.7787,
"step": 172
},
{
"epoch": 0.11,
"grad_norm": 2.0666041021999977,
"learning_rate": 9.852143879239032e-06,
"loss": 0.6304,
"step": 173
},
{
"epoch": 0.11,
"grad_norm": 2.0942561413873806,
"learning_rate": 9.849742104793303e-06,
"loss": 0.659,
"step": 174
},
{
"epoch": 0.11,
"grad_norm": 1.9464606223168837,
"learning_rate": 9.847321277507821e-06,
"loss": 0.5119,
"step": 175
},
{
"epoch": 0.11,
"grad_norm": 2.0838258311462443,
"learning_rate": 9.844881406893118e-06,
"loss": 0.6413,
"step": 176
},
{
"epoch": 0.11,
"grad_norm": 2.2413046501319136,
"learning_rate": 9.842422502534542e-06,
"loss": 0.6781,
"step": 177
},
{
"epoch": 0.11,
"grad_norm": 2.0434533347433392,
"learning_rate": 9.839944574092215e-06,
"loss": 0.7173,
"step": 178
},
{
"epoch": 0.11,
"grad_norm": 2.0756948950274956,
"learning_rate": 9.837447631301003e-06,
"loss": 0.691,
"step": 179
},
{
"epoch": 0.11,
"grad_norm": 2.096621527142799,
"learning_rate": 9.834931683970468e-06,
"loss": 0.6164,
"step": 180
},
{
"epoch": 0.11,
"grad_norm": 2.325512127795748,
"learning_rate": 9.832396741984834e-06,
"loss": 0.7617,
"step": 181
},
{
"epoch": 0.11,
"grad_norm": 2.1101717813557723,
"learning_rate": 9.829842815302951e-06,
"loss": 0.5708,
"step": 182
},
{
"epoch": 0.11,
"grad_norm": 2.1629292221166567,
"learning_rate": 9.827269913958247e-06,
"loss": 0.7347,
"step": 183
},
{
"epoch": 0.11,
"grad_norm": 2.3197439288304906,
"learning_rate": 9.8246780480587e-06,
"loss": 0.7264,
"step": 184
},
{
"epoch": 0.11,
"grad_norm": 2.1964017610148843,
"learning_rate": 9.822067227786794e-06,
"loss": 0.759,
"step": 185
},
{
"epoch": 0.11,
"grad_norm": 2.272214121168636,
"learning_rate": 9.819437463399468e-06,
"loss": 0.6904,
"step": 186
},
{
"epoch": 0.11,
"grad_norm": 2.3994827721770684,
"learning_rate": 9.816788765228095e-06,
"loss": 0.7399,
"step": 187
},
{
"epoch": 0.11,
"grad_norm": 2.111638499108041,
"learning_rate": 9.81412114367843e-06,
"loss": 0.7612,
"step": 188
},
{
"epoch": 0.12,
"grad_norm": 1.9840878981791865,
"learning_rate": 9.81143460923057e-06,
"loss": 0.5875,
"step": 189
},
{
"epoch": 0.12,
"grad_norm": 2.1652359918123167,
"learning_rate": 9.808729172438909e-06,
"loss": 0.724,
"step": 190
},
{
"epoch": 0.12,
"grad_norm": 1.9395882749114965,
"learning_rate": 9.80600484393211e-06,
"loss": 0.5528,
"step": 191
},
{
"epoch": 0.12,
"grad_norm": 2.1604646292422527,
"learning_rate": 9.803261634413049e-06,
"loss": 0.5707,
"step": 192
},
{
"epoch": 0.12,
"grad_norm": 2.016941878380451,
"learning_rate": 9.80049955465878e-06,
"loss": 0.6385,
"step": 193
},
{
"epoch": 0.12,
"grad_norm": 2.532345965216657,
"learning_rate": 9.797718615520488e-06,
"loss": 0.6938,
"step": 194
},
{
"epoch": 0.12,
"grad_norm": 2.2511667943525038,
"learning_rate": 9.794918827923458e-06,
"loss": 0.753,
"step": 195
},
{
"epoch": 0.12,
"grad_norm": 2.1447413054072517,
"learning_rate": 9.792100202867014e-06,
"loss": 0.6697,
"step": 196
},
{
"epoch": 0.12,
"grad_norm": 2.189313732305628,
"learning_rate": 9.78926275142449e-06,
"loss": 0.616,
"step": 197
},
{
"epoch": 0.12,
"grad_norm": 2.268919346618177,
"learning_rate": 9.786406484743183e-06,
"loss": 0.7412,
"step": 198
},
{
"epoch": 0.12,
"grad_norm": 2.2333800757193276,
"learning_rate": 9.783531414044304e-06,
"loss": 0.6923,
"step": 199
},
{
"epoch": 0.12,
"grad_norm": 2.055136095603475,
"learning_rate": 9.780637550622943e-06,
"loss": 0.7031,
"step": 200
},
{
"epoch": 0.12,
"grad_norm": 2.226669249666592,
"learning_rate": 9.777724905848013e-06,
"loss": 0.7531,
"step": 201
},
{
"epoch": 0.12,
"grad_norm": 2.232153739326395,
"learning_rate": 9.774793491162221e-06,
"loss": 0.6133,
"step": 202
},
{
"epoch": 0.12,
"grad_norm": 2.264407351963802,
"learning_rate": 9.771843318082008e-06,
"loss": 0.5952,
"step": 203
},
{
"epoch": 0.12,
"grad_norm": 2.294784666626483,
"learning_rate": 9.76887439819751e-06,
"loss": 0.8448,
"step": 204
},
{
"epoch": 0.13,
"grad_norm": 2.1307772179440936,
"learning_rate": 9.765886743172512e-06,
"loss": 0.5948,
"step": 205
},
{
"epoch": 0.13,
"grad_norm": 1.83961485752976,
"learning_rate": 9.762880364744404e-06,
"loss": 0.5447,
"step": 206
},
{
"epoch": 0.13,
"grad_norm": 2.031812871238954,
"learning_rate": 9.759855274724137e-06,
"loss": 0.6538,
"step": 207
},
{
"epoch": 0.13,
"grad_norm": 2.0040113831634745,
"learning_rate": 9.756811484996162e-06,
"loss": 0.6421,
"step": 208
},
{
"epoch": 0.13,
"grad_norm": 1.7984932798626254,
"learning_rate": 9.753749007518407e-06,
"loss": 0.5157,
"step": 209
},
{
"epoch": 0.13,
"grad_norm": 2.0477874542114916,
"learning_rate": 9.750667854322207e-06,
"loss": 0.6199,
"step": 210
},
{
"epoch": 0.13,
"grad_norm": 2.206751428653727,
"learning_rate": 9.747568037512274e-06,
"loss": 0.6161,
"step": 211
},
{
"epoch": 0.13,
"grad_norm": 2.3285004622852767,
"learning_rate": 9.744449569266637e-06,
"loss": 0.7607,
"step": 212
},
{
"epoch": 0.13,
"grad_norm": 2.1011118330445475,
"learning_rate": 9.741312461836606e-06,
"loss": 0.666,
"step": 213
},
{
"epoch": 0.13,
"grad_norm": 2.397695659444179,
"learning_rate": 9.738156727546711e-06,
"loss": 0.7105,
"step": 214
},
{
"epoch": 0.13,
"grad_norm": 1.91331183170689,
"learning_rate": 9.734982378794662e-06,
"loss": 0.619,
"step": 215
},
{
"epoch": 0.13,
"grad_norm": 2.1362268650914125,
"learning_rate": 9.731789428051302e-06,
"loss": 0.7317,
"step": 216
},
{
"epoch": 0.13,
"grad_norm": 2.04421825962035,
"learning_rate": 9.72857788786055e-06,
"loss": 0.6309,
"step": 217
},
{
"epoch": 0.13,
"grad_norm": 2.1550284488031473,
"learning_rate": 9.725347770839356e-06,
"loss": 0.6768,
"step": 218
},
{
"epoch": 0.13,
"grad_norm": 2.049439369305375,
"learning_rate": 9.722099089677655e-06,
"loss": 0.6423,
"step": 219
},
{
"epoch": 0.13,
"grad_norm": 2.122940983855365,
"learning_rate": 9.718831857138308e-06,
"loss": 0.6345,
"step": 220
},
{
"epoch": 0.14,
"grad_norm": 2.274655471484878,
"learning_rate": 9.715546086057066e-06,
"loss": 0.5792,
"step": 221
},
{
"epoch": 0.14,
"grad_norm": 1.8385884175738376,
"learning_rate": 9.712241789342504e-06,
"loss": 0.656,
"step": 222
},
{
"epoch": 0.14,
"grad_norm": 2.2460907183322933,
"learning_rate": 9.708918979975982e-06,
"loss": 0.6417,
"step": 223
},
{
"epoch": 0.14,
"grad_norm": 1.9323218121201529,
"learning_rate": 9.705577671011579e-06,
"loss": 0.6371,
"step": 224
},
{
"epoch": 0.14,
"grad_norm": 2.0382284797399293,
"learning_rate": 9.702217875576067e-06,
"loss": 0.591,
"step": 225
},
{
"epoch": 0.14,
"grad_norm": 2.0698398210796567,
"learning_rate": 9.698839606868835e-06,
"loss": 0.5794,
"step": 226
},
{
"epoch": 0.14,
"grad_norm": 1.9440879438361034,
"learning_rate": 9.69544287816185e-06,
"loss": 0.6745,
"step": 227
},
{
"epoch": 0.14,
"grad_norm": 1.9900929740524849,
"learning_rate": 9.6920277027996e-06,
"loss": 0.6757,
"step": 228
},
{
"epoch": 0.14,
"grad_norm": 2.1940921838343446,
"learning_rate": 9.688594094199043e-06,
"loss": 0.6472,
"step": 229
},
{
"epoch": 0.14,
"grad_norm": 2.1958440427756636,
"learning_rate": 9.685142065849556e-06,
"loss": 0.7342,
"step": 230
},
{
"epoch": 0.14,
"grad_norm": 2.165725186559193,
"learning_rate": 9.681671631312876e-06,
"loss": 0.6485,
"step": 231
},
{
"epoch": 0.14,
"grad_norm": 2.1894781279792443,
"learning_rate": 9.67818280422306e-06,
"loss": 0.6896,
"step": 232
},
{
"epoch": 0.14,
"grad_norm": 2.195740186965468,
"learning_rate": 9.674675598286414e-06,
"loss": 0.6974,
"step": 233
},
{
"epoch": 0.14,
"grad_norm": 2.2452322721170668,
"learning_rate": 9.671150027281449e-06,
"loss": 0.6163,
"step": 234
},
{
"epoch": 0.14,
"grad_norm": 2.326336053478045,
"learning_rate": 9.667606105058828e-06,
"loss": 0.6448,
"step": 235
},
{
"epoch": 0.14,
"grad_norm": 2.0032814883659036,
"learning_rate": 9.66404384554131e-06,
"loss": 0.619,
"step": 236
},
{
"epoch": 0.14,
"grad_norm": 1.932037123804567,
"learning_rate": 9.660463262723691e-06,
"loss": 0.5897,
"step": 237
},
{
"epoch": 0.15,
"grad_norm": 2.340804976271579,
"learning_rate": 9.656864370672757e-06,
"loss": 0.8023,
"step": 238
},
{
"epoch": 0.15,
"grad_norm": 2.1022457172205327,
"learning_rate": 9.653247183527216e-06,
"loss": 0.7218,
"step": 239
},
{
"epoch": 0.15,
"grad_norm": 1.8423012822099027,
"learning_rate": 9.649611715497662e-06,
"loss": 0.6005,
"step": 240
},
{
"epoch": 0.15,
"grad_norm": 1.8546169042416565,
"learning_rate": 9.645957980866499e-06,
"loss": 0.5888,
"step": 241
},
{
"epoch": 0.15,
"grad_norm": 1.9846561311341997,
"learning_rate": 9.642285993987895e-06,
"loss": 0.5579,
"step": 242
},
{
"epoch": 0.15,
"grad_norm": 2.239777371231001,
"learning_rate": 9.63859576928773e-06,
"loss": 0.755,
"step": 243
},
{
"epoch": 0.15,
"grad_norm": 2.0594775910705083,
"learning_rate": 9.634887321263525e-06,
"loss": 0.6442,
"step": 244
},
{
"epoch": 0.15,
"grad_norm": 2.176249216011953,
"learning_rate": 9.631160664484398e-06,
"loss": 0.8016,
"step": 245
},
{
"epoch": 0.15,
"grad_norm": 2.085520659410642,
"learning_rate": 9.627415813591007e-06,
"loss": 0.6368,
"step": 246
},
{
"epoch": 0.15,
"grad_norm": 2.021294737304931,
"learning_rate": 9.623652783295483e-06,
"loss": 0.5614,
"step": 247
},
{
"epoch": 0.15,
"grad_norm": 2.231835038374354,
"learning_rate": 9.619871588381376e-06,
"loss": 0.7216,
"step": 248
},
{
"epoch": 0.15,
"grad_norm": 1.910271586943225,
"learning_rate": 9.616072243703598e-06,
"loss": 0.5791,
"step": 249
},
{
"epoch": 0.15,
"grad_norm": 1.8003560124729412,
"learning_rate": 9.612254764188368e-06,
"loss": 0.5448,
"step": 250
},
{
"epoch": 0.15,
"grad_norm": 2.2007549176054404,
"learning_rate": 9.608419164833152e-06,
"loss": 0.7257,
"step": 251
},
{
"epoch": 0.15,
"grad_norm": 2.0571622186487044,
"learning_rate": 9.604565460706592e-06,
"loss": 0.6335,
"step": 252
},
{
"epoch": 0.15,
"grad_norm": 2.2497466958972154,
"learning_rate": 9.60069366694847e-06,
"loss": 0.6597,
"step": 253
},
{
"epoch": 0.16,
"grad_norm": 1.9612178789599213,
"learning_rate": 9.596803798769626e-06,
"loss": 0.7287,
"step": 254
},
{
"epoch": 0.16,
"grad_norm": 2.1104419163141115,
"learning_rate": 9.592895871451908e-06,
"loss": 0.6671,
"step": 255
},
{
"epoch": 0.16,
"grad_norm": 2.0822889988204305,
"learning_rate": 9.58896990034812e-06,
"loss": 0.7013,
"step": 256
},
{
"epoch": 0.16,
"grad_norm": 2.248918383189871,
"learning_rate": 9.585025900881944e-06,
"loss": 0.7042,
"step": 257
},
{
"epoch": 0.16,
"grad_norm": 2.0495739015390857,
"learning_rate": 9.581063888547895e-06,
"loss": 0.6913,
"step": 258
},
{
"epoch": 0.16,
"grad_norm": 1.759437262151785,
"learning_rate": 9.57708387891125e-06,
"loss": 0.5709,
"step": 259
},
{
"epoch": 0.16,
"grad_norm": 2.420770662182739,
"learning_rate": 9.573085887607991e-06,
"loss": 0.6814,
"step": 260
},
{
"epoch": 0.16,
"grad_norm": 2.130894942110269,
"learning_rate": 9.569069930344746e-06,
"loss": 0.6187,
"step": 261
},
{
"epoch": 0.16,
"grad_norm": 1.9591579536191646,
"learning_rate": 9.565036022898723e-06,
"loss": 0.5882,
"step": 262
},
{
"epoch": 0.16,
"grad_norm": 1.8084671651408435,
"learning_rate": 9.56098418111765e-06,
"loss": 0.6313,
"step": 263
},
{
"epoch": 0.16,
"grad_norm": 2.1526443073933086,
"learning_rate": 9.556914420919711e-06,
"loss": 0.6102,
"step": 264
},
{
"epoch": 0.16,
"grad_norm": 2.194691833732068,
"learning_rate": 9.552826758293487e-06,
"loss": 0.6878,
"step": 265
},
{
"epoch": 0.16,
"grad_norm": 2.501846785947928,
"learning_rate": 9.548721209297889e-06,
"loss": 0.6596,
"step": 266
},
{
"epoch": 0.16,
"grad_norm": 2.0669308931128123,
"learning_rate": 9.544597790062098e-06,
"loss": 0.6224,
"step": 267
},
{
"epoch": 0.16,
"grad_norm": 2.2681106555575052,
"learning_rate": 9.5404565167855e-06,
"loss": 0.6786,
"step": 268
},
{
"epoch": 0.16,
"grad_norm": 2.0576613963336445,
"learning_rate": 9.536297405737624e-06,
"loss": 0.5946,
"step": 269
},
{
"epoch": 0.17,
"grad_norm": 2.0426855396207264,
"learning_rate": 9.532120473258075e-06,
"loss": 0.641,
"step": 270
},
{
"epoch": 0.17,
"grad_norm": 2.1061182708873973,
"learning_rate": 9.527925735756473e-06,
"loss": 0.6379,
"step": 271
},
{
"epoch": 0.17,
"grad_norm": 1.6795883204795699,
"learning_rate": 9.52371320971239e-06,
"loss": 0.5449,
"step": 272
},
{
"epoch": 0.17,
"grad_norm": 1.9653754600396853,
"learning_rate": 9.519482911675278e-06,
"loss": 0.6875,
"step": 273
},
{
"epoch": 0.17,
"grad_norm": 2.1944241074929534,
"learning_rate": 9.51523485826441e-06,
"loss": 0.6387,
"step": 274
},
{
"epoch": 0.17,
"grad_norm": 2.2621502173726418,
"learning_rate": 9.510969066168814e-06,
"loss": 0.7567,
"step": 275
},
{
"epoch": 0.17,
"grad_norm": 2.0713889312949623,
"learning_rate": 9.506685552147208e-06,
"loss": 0.8003,
"step": 276
},
{
"epoch": 0.17,
"grad_norm": 2.161082236049816,
"learning_rate": 9.502384333027929e-06,
"loss": 0.7317,
"step": 277
},
{
"epoch": 0.17,
"grad_norm": 2.099935757711094,
"learning_rate": 9.498065425708878e-06,
"loss": 0.6365,
"step": 278
},
{
"epoch": 0.17,
"grad_norm": 2.0269709971121768,
"learning_rate": 9.493728847157436e-06,
"loss": 0.6275,
"step": 279
},
{
"epoch": 0.17,
"grad_norm": 1.9361261922037705,
"learning_rate": 9.489374614410413e-06,
"loss": 0.6505,
"step": 280
},
{
"epoch": 0.17,
"grad_norm": 2.276393660154319,
"learning_rate": 9.485002744573982e-06,
"loss": 0.7315,
"step": 281
},
{
"epoch": 0.17,
"grad_norm": 1.9425800587711943,
"learning_rate": 9.480613254823595e-06,
"loss": 0.6143,
"step": 282
},
{
"epoch": 0.17,
"grad_norm": 2.2697653664784534,
"learning_rate": 9.476206162403933e-06,
"loss": 0.6727,
"step": 283
},
{
"epoch": 0.17,
"grad_norm": 2.1842308672307063,
"learning_rate": 9.471781484628828e-06,
"loss": 0.6416,
"step": 284
},
{
"epoch": 0.17,
"grad_norm": 1.9855040976893727,
"learning_rate": 9.467339238881199e-06,
"loss": 0.6107,
"step": 285
},
{
"epoch": 0.17,
"grad_norm": 1.9633251460753256,
"learning_rate": 9.462879442612984e-06,
"loss": 0.5977,
"step": 286
},
{
"epoch": 0.18,
"grad_norm": 2.0847506096615094,
"learning_rate": 9.458402113345071e-06,
"loss": 0.5964,
"step": 287
},
{
"epoch": 0.18,
"grad_norm": 2.071971955099866,
"learning_rate": 9.453907268667226e-06,
"loss": 0.6926,
"step": 288
},
{
"epoch": 0.18,
"grad_norm": 2.1433436130539074,
"learning_rate": 9.44939492623803e-06,
"loss": 0.5979,
"step": 289
},
{
"epoch": 0.18,
"grad_norm": 2.262092646829491,
"learning_rate": 9.444865103784803e-06,
"loss": 0.6555,
"step": 290
},
{
"epoch": 0.18,
"grad_norm": 2.1581337027107192,
"learning_rate": 9.440317819103542e-06,
"loss": 0.7022,
"step": 291
},
{
"epoch": 0.18,
"grad_norm": 2.207365600854885,
"learning_rate": 9.435753090058839e-06,
"loss": 0.6298,
"step": 292
},
{
"epoch": 0.18,
"grad_norm": 1.9716493031373659,
"learning_rate": 9.431170934583826e-06,
"loss": 0.6057,
"step": 293
},
{
"epoch": 0.18,
"grad_norm": 1.8605748771934563,
"learning_rate": 9.426571370680094e-06,
"loss": 0.6488,
"step": 294
},
{
"epoch": 0.18,
"grad_norm": 2.101750848753291,
"learning_rate": 9.421954416417624e-06,
"loss": 0.6334,
"step": 295
},
{
"epoch": 0.18,
"grad_norm": 2.189471586472517,
"learning_rate": 9.417320089934721e-06,
"loss": 0.6478,
"step": 296
},
{
"epoch": 0.18,
"grad_norm": 1.8693406953522982,
"learning_rate": 9.412668409437934e-06,
"loss": 0.5423,
"step": 297
},
{
"epoch": 0.18,
"grad_norm": 2.1604549448326207,
"learning_rate": 9.407999393201992e-06,
"loss": 0.6778,
"step": 298
},
{
"epoch": 0.18,
"grad_norm": 2.260164616585325,
"learning_rate": 9.403313059569729e-06,
"loss": 0.7631,
"step": 299
},
{
"epoch": 0.18,
"grad_norm": 2.0264565243677652,
"learning_rate": 9.398609426952019e-06,
"loss": 0.6039,
"step": 300
},
{
"epoch": 0.18,
"grad_norm": 2.0077711587490987,
"learning_rate": 9.393888513827686e-06,
"loss": 0.6003,
"step": 301
},
{
"epoch": 0.18,
"grad_norm": 2.0401684479993563,
"learning_rate": 9.389150338743451e-06,
"loss": 0.6232,
"step": 302
},
{
"epoch": 0.19,
"grad_norm": 2.0592434888026467,
"learning_rate": 9.384394920313847e-06,
"loss": 0.692,
"step": 303
},
{
"epoch": 0.19,
"grad_norm": 2.127639974580799,
"learning_rate": 9.379622277221152e-06,
"loss": 0.6403,
"step": 304
},
{
"epoch": 0.19,
"grad_norm": 2.1637948423090596,
"learning_rate": 9.37483242821531e-06,
"loss": 0.7911,
"step": 305
},
{
"epoch": 0.19,
"grad_norm": 1.96655742278293,
"learning_rate": 9.370025392113866e-06,
"loss": 0.6817,
"step": 306
},
{
"epoch": 0.19,
"grad_norm": 2.4075353559357375,
"learning_rate": 9.365201187801884e-06,
"loss": 0.7468,
"step": 307
},
{
"epoch": 0.19,
"grad_norm": 2.170026110189212,
"learning_rate": 9.360359834231873e-06,
"loss": 0.7148,
"step": 308
},
{
"epoch": 0.19,
"grad_norm": 2.0723680052005378,
"learning_rate": 9.355501350423717e-06,
"loss": 0.6234,
"step": 309
},
{
"epoch": 0.19,
"grad_norm": 2.111461085654852,
"learning_rate": 9.3506257554646e-06,
"loss": 0.6659,
"step": 310
},
{
"epoch": 0.19,
"grad_norm": 2.2496008204864104,
"learning_rate": 9.345733068508929e-06,
"loss": 0.7033,
"step": 311
},
{
"epoch": 0.19,
"grad_norm": 1.9996970862014591,
"learning_rate": 9.340823308778255e-06,
"loss": 0.7255,
"step": 312
},
{
"epoch": 0.19,
"grad_norm": 1.8792750115155255,
"learning_rate": 9.335896495561207e-06,
"loss": 0.6429,
"step": 313
},
{
"epoch": 0.19,
"grad_norm": 2.12929524638482,
"learning_rate": 9.33095264821341e-06,
"loss": 0.6596,
"step": 314
},
{
"epoch": 0.19,
"grad_norm": 1.9025785100638457,
"learning_rate": 9.325991786157405e-06,
"loss": 0.6464,
"step": 315
},
{
"epoch": 0.19,
"grad_norm": 1.9934226422368588,
"learning_rate": 9.321013928882583e-06,
"loss": 0.5929,
"step": 316
},
{
"epoch": 0.19,
"grad_norm": 2.1268235022475697,
"learning_rate": 9.3160190959451e-06,
"loss": 0.6511,
"step": 317
},
{
"epoch": 0.19,
"grad_norm": 1.9740290904745004,
"learning_rate": 9.311007306967805e-06,
"loss": 0.5765,
"step": 318
},
{
"epoch": 0.2,
"grad_norm": 2.0624735759975823,
"learning_rate": 9.305978581640157e-06,
"loss": 0.7006,
"step": 319
},
{
"epoch": 0.2,
"grad_norm": 2.2850117779663144,
"learning_rate": 9.300932939718159e-06,
"loss": 0.6555,
"step": 320
},
{
"epoch": 0.2,
"grad_norm": 1.9373893589189668,
"learning_rate": 9.295870401024266e-06,
"loss": 0.6105,
"step": 321
},
{
"epoch": 0.2,
"grad_norm": 1.9939827504480299,
"learning_rate": 9.290790985447316e-06,
"loss": 0.574,
"step": 322
},
{
"epoch": 0.2,
"grad_norm": 2.518967684022752,
"learning_rate": 9.285694712942453e-06,
"loss": 0.737,
"step": 323
},
{
"epoch": 0.2,
"grad_norm": 2.061941526906131,
"learning_rate": 9.28058160353104e-06,
"loss": 0.6289,
"step": 324
},
{
"epoch": 0.2,
"grad_norm": 2.046986924521927,
"learning_rate": 9.275451677300591e-06,
"loss": 0.6026,
"step": 325
},
{
"epoch": 0.2,
"grad_norm": 1.9643917949664476,
"learning_rate": 9.270304954404688e-06,
"loss": 0.6726,
"step": 326
},
{
"epoch": 0.2,
"grad_norm": 2.030653402715337,
"learning_rate": 9.265141455062894e-06,
"loss": 0.6522,
"step": 327
},
{
"epoch": 0.2,
"grad_norm": 2.083183062824829,
"learning_rate": 9.259961199560686e-06,
"loss": 0.6681,
"step": 328
},
{
"epoch": 0.2,
"grad_norm": 2.0946830011733955,
"learning_rate": 9.254764208249369e-06,
"loss": 0.7092,
"step": 329
},
{
"epoch": 0.2,
"grad_norm": 2.1225126781820283,
"learning_rate": 9.249550501545998e-06,
"loss": 0.67,
"step": 330
},
{
"epoch": 0.2,
"grad_norm": 2.026753617785709,
"learning_rate": 9.244320099933291e-06,
"loss": 0.6578,
"step": 331
},
{
"epoch": 0.2,
"grad_norm": 2.3515094288361125,
"learning_rate": 9.239073023959562e-06,
"loss": 0.7187,
"step": 332
},
{
"epoch": 0.2,
"grad_norm": 2.1066900321641655,
"learning_rate": 9.233809294238625e-06,
"loss": 0.709,
"step": 333
},
{
"epoch": 0.2,
"grad_norm": 2.2215204725890416,
"learning_rate": 9.228528931449724e-06,
"loss": 0.7507,
"step": 334
},
{
"epoch": 0.2,
"grad_norm": 2.2519794194499068,
"learning_rate": 9.22323195633745e-06,
"loss": 0.804,
"step": 335
},
{
"epoch": 0.21,
"grad_norm": 2.1625190316209792,
"learning_rate": 9.217918389711652e-06,
"loss": 0.7088,
"step": 336
},
{
"epoch": 0.21,
"grad_norm": 2.095704698093118,
"learning_rate": 9.21258825244737e-06,
"loss": 0.6989,
"step": 337
},
{
"epoch": 0.21,
"grad_norm": 2.330487081066773,
"learning_rate": 9.207241565484733e-06,
"loss": 0.7033,
"step": 338
},
{
"epoch": 0.21,
"grad_norm": 2.132239589678436,
"learning_rate": 9.201878349828897e-06,
"loss": 0.6656,
"step": 339
},
{
"epoch": 0.21,
"grad_norm": 2.158342662695929,
"learning_rate": 9.196498626549944e-06,
"loss": 0.5794,
"step": 340
},
{
"epoch": 0.21,
"grad_norm": 1.9571490668467135,
"learning_rate": 9.191102416782819e-06,
"loss": 0.5614,
"step": 341
},
{
"epoch": 0.21,
"grad_norm": 2.0828460881254154,
"learning_rate": 9.185689741727229e-06,
"loss": 0.7618,
"step": 342
},
{
"epoch": 0.21,
"grad_norm": 1.945496988662335,
"learning_rate": 9.180260622647565e-06,
"loss": 0.6134,
"step": 343
},
{
"epoch": 0.21,
"grad_norm": 2.1345490610587006,
"learning_rate": 9.174815080872829e-06,
"loss": 0.6491,
"step": 344
},
{
"epoch": 0.21,
"grad_norm": 1.888910241101656,
"learning_rate": 9.169353137796533e-06,
"loss": 0.5433,
"step": 345
},
{
"epoch": 0.21,
"grad_norm": 2.2231218683381346,
"learning_rate": 9.163874814876632e-06,
"loss": 0.6674,
"step": 346
},
{
"epoch": 0.21,
"grad_norm": 1.8397128888601602,
"learning_rate": 9.158380133635425e-06,
"loss": 0.5104,
"step": 347
},
{
"epoch": 0.21,
"grad_norm": 1.8435017185456046,
"learning_rate": 9.152869115659474e-06,
"loss": 0.5708,
"step": 348
},
{
"epoch": 0.21,
"grad_norm": 2.004371953603906,
"learning_rate": 9.147341782599534e-06,
"loss": 0.5923,
"step": 349
},
{
"epoch": 0.21,
"grad_norm": 2.1426206185002523,
"learning_rate": 9.141798156170447e-06,
"loss": 0.6067,
"step": 350
},
{
"epoch": 0.21,
"grad_norm": 2.0707774939518435,
"learning_rate": 9.136238258151063e-06,
"loss": 0.621,
"step": 351
},
{
"epoch": 0.22,
"grad_norm": 2.0756065658076808,
"learning_rate": 9.130662110384163e-06,
"loss": 0.609,
"step": 352
},
{
"epoch": 0.22,
"grad_norm": 1.864542155335991,
"learning_rate": 9.125069734776367e-06,
"loss": 0.5795,
"step": 353
},
{
"epoch": 0.22,
"grad_norm": 2.247405694299018,
"learning_rate": 9.119461153298045e-06,
"loss": 0.6788,
"step": 354
},
{
"epoch": 0.22,
"grad_norm": 2.0281028236357908,
"learning_rate": 9.113836387983239e-06,
"loss": 0.6667,
"step": 355
},
{
"epoch": 0.22,
"grad_norm": 2.1739992658132126,
"learning_rate": 9.108195460929563e-06,
"loss": 0.6559,
"step": 356
},
{
"epoch": 0.22,
"grad_norm": 1.844308015715884,
"learning_rate": 9.10253839429813e-06,
"loss": 0.5637,
"step": 357
},
{
"epoch": 0.22,
"grad_norm": 2.158849134009064,
"learning_rate": 9.096865210313461e-06,
"loss": 0.6977,
"step": 358
},
{
"epoch": 0.22,
"grad_norm": 1.9857083622278322,
"learning_rate": 9.091175931263395e-06,
"loss": 0.7014,
"step": 359
},
{
"epoch": 0.22,
"grad_norm": 2.083743100705083,
"learning_rate": 9.085470579498996e-06,
"loss": 0.6288,
"step": 360
},
{
"epoch": 0.22,
"grad_norm": 2.0848589757887304,
"learning_rate": 9.079749177434481e-06,
"loss": 0.5892,
"step": 361
},
{
"epoch": 0.22,
"grad_norm": 2.2211766443468073,
"learning_rate": 9.074011747547118e-06,
"loss": 0.6756,
"step": 362
},
{
"epoch": 0.22,
"grad_norm": 2.259415733177512,
"learning_rate": 9.068258312377143e-06,
"loss": 0.637,
"step": 363
},
{
"epoch": 0.22,
"grad_norm": 1.8541880063977976,
"learning_rate": 9.06248889452767e-06,
"loss": 0.5564,
"step": 364
},
{
"epoch": 0.22,
"grad_norm": 2.043397669872696,
"learning_rate": 9.056703516664606e-06,
"loss": 0.5995,
"step": 365
},
{
"epoch": 0.22,
"grad_norm": 1.9966425012080062,
"learning_rate": 9.050902201516555e-06,
"loss": 0.5602,
"step": 366
},
{
"epoch": 0.22,
"grad_norm": 2.097676942573622,
"learning_rate": 9.045084971874738e-06,
"loss": 0.669,
"step": 367
},
{
"epoch": 0.23,
"grad_norm": 2.0595811405443016,
"learning_rate": 9.039251850592892e-06,
"loss": 0.6529,
"step": 368
},
{
"epoch": 0.23,
"grad_norm": 2.0255229150761576,
"learning_rate": 9.033402860587187e-06,
"loss": 0.5948,
"step": 369
},
{
"epoch": 0.23,
"grad_norm": 2.0548212104417276,
"learning_rate": 9.027538024836143e-06,
"loss": 0.6584,
"step": 370
},
{
"epoch": 0.23,
"grad_norm": 2.2114936351325465,
"learning_rate": 9.021657366380521e-06,
"loss": 0.6837,
"step": 371
},
{
"epoch": 0.23,
"grad_norm": 1.9893474856689934,
"learning_rate": 9.015760908323253e-06,
"loss": 0.5977,
"step": 372
},
{
"epoch": 0.23,
"grad_norm": 1.9935862578665022,
"learning_rate": 9.009848673829337e-06,
"loss": 0.6574,
"step": 373
},
{
"epoch": 0.23,
"grad_norm": 1.8536984972638404,
"learning_rate": 9.00392068612575e-06,
"loss": 0.5571,
"step": 374
},
{
"epoch": 0.23,
"grad_norm": 2.07272622617217,
"learning_rate": 8.997976968501362e-06,
"loss": 0.6437,
"step": 375
},
{
"epoch": 0.23,
"grad_norm": 1.9669798106315952,
"learning_rate": 8.992017544306834e-06,
"loss": 0.6805,
"step": 376
},
{
"epoch": 0.23,
"grad_norm": 2.243741605970751,
"learning_rate": 8.986042436954538e-06,
"loss": 0.7328,
"step": 377
},
{
"epoch": 0.23,
"grad_norm": 2.011662513116711,
"learning_rate": 8.980051669918458e-06,
"loss": 0.6209,
"step": 378
},
{
"epoch": 0.23,
"grad_norm": 2.1937242214026007,
"learning_rate": 8.974045266734094e-06,
"loss": 0.7434,
"step": 379
},
{
"epoch": 0.23,
"grad_norm": 2.132031298132569,
"learning_rate": 8.96802325099838e-06,
"loss": 0.6832,
"step": 380
},
{
"epoch": 0.23,
"grad_norm": 1.7086865848142259,
"learning_rate": 8.961985646369587e-06,
"loss": 0.5608,
"step": 381
},
{
"epoch": 0.23,
"grad_norm": 1.9009692420702806,
"learning_rate": 8.955932476567224e-06,
"loss": 0.6121,
"step": 382
},
{
"epoch": 0.23,
"grad_norm": 1.9044767808035803,
"learning_rate": 8.949863765371952e-06,
"loss": 0.6172,
"step": 383
},
{
"epoch": 0.23,
"grad_norm": 2.1087095562200946,
"learning_rate": 8.943779536625489e-06,
"loss": 0.7064,
"step": 384
},
{
"epoch": 0.24,
"grad_norm": 2.17610448059507,
"learning_rate": 8.937679814230517e-06,
"loss": 0.6725,
"step": 385
},
{
"epoch": 0.24,
"grad_norm": 2.006215616453568,
"learning_rate": 8.931564622150583e-06,
"loss": 0.6987,
"step": 386
},
{
"epoch": 0.24,
"grad_norm": 1.9223795578578178,
"learning_rate": 8.925433984410012e-06,
"loss": 0.5192,
"step": 387
},
{
"epoch": 0.24,
"grad_norm": 1.8039652484819113,
"learning_rate": 8.919287925093808e-06,
"loss": 0.628,
"step": 388
},
{
"epoch": 0.24,
"grad_norm": 1.769522071377601,
"learning_rate": 8.913126468347561e-06,
"loss": 0.4867,
"step": 389
},
{
"epoch": 0.24,
"grad_norm": 2.290538985245612,
"learning_rate": 8.906949638377352e-06,
"loss": 0.6833,
"step": 390
},
{
"epoch": 0.24,
"grad_norm": 2.1791089656581764,
"learning_rate": 8.900757459449655e-06,
"loss": 0.76,
"step": 391
},
{
"epoch": 0.24,
"grad_norm": 2.0748417472498537,
"learning_rate": 8.894549955891247e-06,
"loss": 0.6931,
"step": 392
},
{
"epoch": 0.24,
"grad_norm": 2.007190815984241,
"learning_rate": 8.888327152089112e-06,
"loss": 0.6713,
"step": 393
},
{
"epoch": 0.24,
"grad_norm": 1.7418731560021379,
"learning_rate": 8.882089072490339e-06,
"loss": 0.5852,
"step": 394
},
{
"epoch": 0.24,
"grad_norm": 1.9279427627473156,
"learning_rate": 8.875835741602031e-06,
"loss": 0.5998,
"step": 395
},
{
"epoch": 0.24,
"grad_norm": 2.011804969137247,
"learning_rate": 8.869567183991208e-06,
"loss": 0.7047,
"step": 396
},
{
"epoch": 0.24,
"grad_norm": 2.0919216640489577,
"learning_rate": 8.86328342428471e-06,
"loss": 0.6773,
"step": 397
},
{
"epoch": 0.24,
"grad_norm": 2.1708903996053994,
"learning_rate": 8.856984487169102e-06,
"loss": 0.6511,
"step": 398
},
{
"epoch": 0.24,
"grad_norm": 2.0482302804600954,
"learning_rate": 8.85067039739057e-06,
"loss": 0.6458,
"step": 399
},
{
"epoch": 0.24,
"grad_norm": 2.2389233691566184,
"learning_rate": 8.84434117975484e-06,
"loss": 0.6042,
"step": 400
},
{
"epoch": 0.25,
"grad_norm": 2.3238794603179365,
"learning_rate": 8.837996859127056e-06,
"loss": 0.6536,
"step": 401
},
{
"epoch": 0.25,
"grad_norm": 1.9634398094275907,
"learning_rate": 8.831637460431708e-06,
"loss": 0.6009,
"step": 402
},
{
"epoch": 0.25,
"grad_norm": 2.1849384771988167,
"learning_rate": 8.825263008652513e-06,
"loss": 0.6747,
"step": 403
},
{
"epoch": 0.25,
"grad_norm": 1.9367137201020725,
"learning_rate": 8.818873528832334e-06,
"loss": 0.5679,
"step": 404
},
{
"epoch": 0.25,
"grad_norm": 1.9444930407351348,
"learning_rate": 8.812469046073069e-06,
"loss": 0.5809,
"step": 405
},
{
"epoch": 0.25,
"grad_norm": 2.0382647745048263,
"learning_rate": 8.806049585535554e-06,
"loss": 0.5664,
"step": 406
},
{
"epoch": 0.25,
"grad_norm": 2.1047083940033944,
"learning_rate": 8.799615172439475e-06,
"loss": 0.5677,
"step": 407
},
{
"epoch": 0.25,
"grad_norm": 1.8990495481992753,
"learning_rate": 8.793165832063254e-06,
"loss": 0.6238,
"step": 408
},
{
"epoch": 0.25,
"grad_norm": 2.096972722468596,
"learning_rate": 8.786701589743965e-06,
"loss": 0.6452,
"step": 409
},
{
"epoch": 0.25,
"grad_norm": 1.8640478732019463,
"learning_rate": 8.780222470877213e-06,
"loss": 0.5267,
"step": 410
},
{
"epoch": 0.25,
"grad_norm": 1.9247739069634147,
"learning_rate": 8.77372850091706e-06,
"loss": 0.6142,
"step": 411
},
{
"epoch": 0.25,
"grad_norm": 2.087695274157492,
"learning_rate": 8.76721970537591e-06,
"loss": 0.6652,
"step": 412
},
{
"epoch": 0.25,
"grad_norm": 1.850788744558352,
"learning_rate": 8.760696109824403e-06,
"loss": 0.5258,
"step": 413
},
{
"epoch": 0.25,
"grad_norm": 2.118016145296157,
"learning_rate": 8.754157739891332e-06,
"loss": 0.6427,
"step": 414
},
{
"epoch": 0.25,
"grad_norm": 2.1471953099208525,
"learning_rate": 8.74760462126353e-06,
"loss": 0.6361,
"step": 415
},
{
"epoch": 0.25,
"grad_norm": 1.9116075357657814,
"learning_rate": 8.741036779685771e-06,
"loss": 0.5885,
"step": 416
},
{
"epoch": 0.25,
"grad_norm": 2.186935678265862,
"learning_rate": 8.734454240960672e-06,
"loss": 0.7819,
"step": 417
},
{
"epoch": 0.26,
"grad_norm": 1.9329278314726581,
"learning_rate": 8.727857030948587e-06,
"loss": 0.6089,
"step": 418
},
{
"epoch": 0.26,
"grad_norm": 2.29559179529083,
"learning_rate": 8.721245175567513e-06,
"loss": 0.6536,
"step": 419
},
{
"epoch": 0.26,
"grad_norm": 1.949685346432584,
"learning_rate": 8.714618700792975e-06,
"loss": 0.588,
"step": 420
},
{
"epoch": 0.26,
"grad_norm": 2.1364335358069555,
"learning_rate": 8.707977632657942e-06,
"loss": 0.5693,
"step": 421
},
{
"epoch": 0.26,
"grad_norm": 2.1052630337837646,
"learning_rate": 8.701321997252707e-06,
"loss": 0.6618,
"step": 422
},
{
"epoch": 0.26,
"grad_norm": 1.9419341419413294,
"learning_rate": 8.694651820724796e-06,
"loss": 0.6432,
"step": 423
},
{
"epoch": 0.26,
"grad_norm": 2.1809423639777847,
"learning_rate": 8.687967129278863e-06,
"loss": 0.6786,
"step": 424
},
{
"epoch": 0.26,
"grad_norm": 2.048911015295105,
"learning_rate": 8.68126794917658e-06,
"loss": 0.6848,
"step": 425
},
{
"epoch": 0.26,
"grad_norm": 2.1090753469834076,
"learning_rate": 8.674554306736545e-06,
"loss": 0.6447,
"step": 426
},
{
"epoch": 0.26,
"grad_norm": 2.106438933355631,
"learning_rate": 8.667826228334173e-06,
"loss": 0.5551,
"step": 427
},
{
"epoch": 0.26,
"grad_norm": 2.0203947279705226,
"learning_rate": 8.66108374040159e-06,
"loss": 0.5717,
"step": 428
},
{
"epoch": 0.26,
"grad_norm": 2.0615093467875854,
"learning_rate": 8.654326869427533e-06,
"loss": 0.6311,
"step": 429
},
{
"epoch": 0.26,
"grad_norm": 2.1329704988537665,
"learning_rate": 8.647555641957243e-06,
"loss": 0.6243,
"step": 430
},
{
"epoch": 0.26,
"grad_norm": 1.8315214353591525,
"learning_rate": 8.640770084592367e-06,
"loss": 0.5547,
"step": 431
},
{
"epoch": 0.26,
"grad_norm": 2.210411556217951,
"learning_rate": 8.633970223990841e-06,
"loss": 0.6408,
"step": 432
},
{
"epoch": 0.26,
"grad_norm": 2.007561414582103,
"learning_rate": 8.627156086866804e-06,
"loss": 0.5894,
"step": 433
},
{
"epoch": 0.27,
"grad_norm": 1.939162087316279,
"learning_rate": 8.620327699990469e-06,
"loss": 0.5772,
"step": 434
},
{
"epoch": 0.27,
"grad_norm": 2.1700787302690094,
"learning_rate": 8.613485090188044e-06,
"loss": 0.6095,
"step": 435
},
{
"epoch": 0.27,
"grad_norm": 2.0168885983710703,
"learning_rate": 8.606628284341603e-06,
"loss": 0.6537,
"step": 436
},
{
"epoch": 0.27,
"grad_norm": 1.9420432087459054,
"learning_rate": 8.599757309388998e-06,
"loss": 0.5503,
"step": 437
},
{
"epoch": 0.27,
"grad_norm": 1.9005641654421328,
"learning_rate": 8.592872192323742e-06,
"loss": 0.5285,
"step": 438
},
{
"epoch": 0.27,
"grad_norm": 2.310866512162385,
"learning_rate": 8.58597296019491e-06,
"loss": 0.6925,
"step": 439
},
{
"epoch": 0.27,
"grad_norm": 2.0185104038237283,
"learning_rate": 8.57905964010703e-06,
"loss": 0.6208,
"step": 440
},
{
"epoch": 0.27,
"grad_norm": 2.227534027585251,
"learning_rate": 8.572132259219973e-06,
"loss": 0.6722,
"step": 441
},
{
"epoch": 0.27,
"grad_norm": 1.972521989095671,
"learning_rate": 8.565190844748852e-06,
"loss": 0.6204,
"step": 442
},
{
"epoch": 0.27,
"grad_norm": 2.192852585817164,
"learning_rate": 8.558235423963912e-06,
"loss": 0.6615,
"step": 443
},
{
"epoch": 0.27,
"grad_norm": 2.1499672574920883,
"learning_rate": 8.551266024190425e-06,
"loss": 0.5939,
"step": 444
},
{
"epoch": 0.27,
"grad_norm": 2.0344036721852303,
"learning_rate": 8.54428267280858e-06,
"loss": 0.609,
"step": 445
},
{
"epoch": 0.27,
"grad_norm": 2.100328047808317,
"learning_rate": 8.537285397253378e-06,
"loss": 0.6728,
"step": 446
},
{
"epoch": 0.27,
"grad_norm": 1.9522804167083359,
"learning_rate": 8.53027422501452e-06,
"loss": 0.5963,
"step": 447
},
{
"epoch": 0.27,
"grad_norm": 2.0276370479958663,
"learning_rate": 8.523249183636303e-06,
"loss": 0.6615,
"step": 448
},
{
"epoch": 0.27,
"grad_norm": 1.969628544118802,
"learning_rate": 8.516210300717519e-06,
"loss": 0.6111,
"step": 449
},
{
"epoch": 0.28,
"grad_norm": 2.029052699494888,
"learning_rate": 8.50915760391132e-06,
"loss": 0.6396,
"step": 450
},
{
"epoch": 0.28,
"grad_norm": 1.8921377970210058,
"learning_rate": 8.502091120925147e-06,
"loss": 0.6135,
"step": 451
},
{
"epoch": 0.28,
"grad_norm": 1.905825397098304,
"learning_rate": 8.49501087952059e-06,
"loss": 0.6531,
"step": 452
},
{
"epoch": 0.28,
"grad_norm": 2.062356135135231,
"learning_rate": 8.487916907513291e-06,
"loss": 0.6511,
"step": 453
},
{
"epoch": 0.28,
"grad_norm": 2.0500672806486047,
"learning_rate": 8.480809232772845e-06,
"loss": 0.6973,
"step": 454
},
{
"epoch": 0.28,
"grad_norm": 1.9780923474909595,
"learning_rate": 8.473687883222665e-06,
"loss": 0.5567,
"step": 455
},
{
"epoch": 0.28,
"grad_norm": 2.001802531470044,
"learning_rate": 8.4665528868399e-06,
"loss": 0.6096,
"step": 456
},
{
"epoch": 0.28,
"grad_norm": 2.0486427239843343,
"learning_rate": 8.459404271655304e-06,
"loss": 0.7061,
"step": 457
},
{
"epoch": 0.28,
"grad_norm": 2.1064266393636113,
"learning_rate": 8.452242065753138e-06,
"loss": 0.6797,
"step": 458
},
{
"epoch": 0.28,
"grad_norm": 2.3915047992203,
"learning_rate": 8.445066297271055e-06,
"loss": 0.6238,
"step": 459
},
{
"epoch": 0.28,
"grad_norm": 2.0029093719066053,
"learning_rate": 8.437876994399992e-06,
"loss": 0.5708,
"step": 460
},
{
"epoch": 0.28,
"grad_norm": 2.0251422666051178,
"learning_rate": 8.430674185384054e-06,
"loss": 0.6305,
"step": 461
},
{
"epoch": 0.28,
"grad_norm": 2.3215947492777222,
"learning_rate": 8.423457898520411e-06,
"loss": 0.6077,
"step": 462
},
{
"epoch": 0.28,
"grad_norm": 1.9799905222032952,
"learning_rate": 8.416228162159178e-06,
"loss": 0.5937,
"step": 463
},
{
"epoch": 0.28,
"grad_norm": 1.9919779229157657,
"learning_rate": 8.408985004703312e-06,
"loss": 0.6588,
"step": 464
},
{
"epoch": 0.28,
"grad_norm": 1.8545854124573158,
"learning_rate": 8.401728454608495e-06,
"loss": 0.5624,
"step": 465
},
{
"epoch": 0.28,
"grad_norm": 1.9951991842396126,
"learning_rate": 8.394458540383021e-06,
"loss": 0.6586,
"step": 466
},
{
"epoch": 0.29,
"grad_norm": 1.9832046641551582,
"learning_rate": 8.387175290587692e-06,
"loss": 0.6178,
"step": 467
},
{
"epoch": 0.29,
"grad_norm": 2.1705961273936456,
"learning_rate": 8.379878733835697e-06,
"loss": 0.6783,
"step": 468
},
{
"epoch": 0.29,
"grad_norm": 2.0865192960586323,
"learning_rate": 8.372568898792504e-06,
"loss": 0.6141,
"step": 469
},
{
"epoch": 0.29,
"grad_norm": 1.965297643743764,
"learning_rate": 8.365245814175744e-06,
"loss": 0.5656,
"step": 470
},
{
"epoch": 0.29,
"grad_norm": 1.994816838265779,
"learning_rate": 8.357909508755106e-06,
"loss": 0.5594,
"step": 471
},
{
"epoch": 0.29,
"grad_norm": 2.081917736412011,
"learning_rate": 8.350560011352217e-06,
"loss": 0.6753,
"step": 472
},
{
"epoch": 0.29,
"grad_norm": 1.9190310357166047,
"learning_rate": 8.343197350840525e-06,
"loss": 0.5778,
"step": 473
},
{
"epoch": 0.29,
"grad_norm": 1.9990825253769382,
"learning_rate": 8.335821556145196e-06,
"loss": 0.5679,
"step": 474
},
{
"epoch": 0.29,
"grad_norm": 1.8601731215327446,
"learning_rate": 8.328432656242998e-06,
"loss": 0.5376,
"step": 475
},
{
"epoch": 0.29,
"grad_norm": 1.8011984252968534,
"learning_rate": 8.321030680162177e-06,
"loss": 0.5679,
"step": 476
},
{
"epoch": 0.29,
"grad_norm": 2.2985243811453637,
"learning_rate": 8.313615656982354e-06,
"loss": 0.6887,
"step": 477
},
{
"epoch": 0.29,
"grad_norm": 2.0168932090236624,
"learning_rate": 8.306187615834411e-06,
"loss": 0.6523,
"step": 478
},
{
"epoch": 0.29,
"grad_norm": 2.007847760050486,
"learning_rate": 8.298746585900367e-06,
"loss": 0.6079,
"step": 479
},
{
"epoch": 0.29,
"grad_norm": 2.1580333815191914,
"learning_rate": 8.291292596413272e-06,
"loss": 0.7007,
"step": 480
},
{
"epoch": 0.29,
"grad_norm": 2.0148089585758857,
"learning_rate": 8.28382567665709e-06,
"loss": 0.6778,
"step": 481
},
{
"epoch": 0.29,
"grad_norm": 2.0624654834089697,
"learning_rate": 8.276345855966579e-06,
"loss": 0.618,
"step": 482
},
{
"epoch": 0.3,
"grad_norm": 1.9930903577238281,
"learning_rate": 8.268853163727184e-06,
"loss": 0.6011,
"step": 483
},
{
"epoch": 0.3,
"grad_norm": 1.9326313111875104,
"learning_rate": 8.26134762937492e-06,
"loss": 0.5755,
"step": 484
},
{
"epoch": 0.3,
"grad_norm": 2.1052378837219283,
"learning_rate": 8.253829282396246e-06,
"loss": 0.576,
"step": 485
},
{
"epoch": 0.3,
"grad_norm": 1.9490986209974357,
"learning_rate": 8.246298152327965e-06,
"loss": 0.4944,
"step": 486
},
{
"epoch": 0.3,
"grad_norm": 2.0980391720214002,
"learning_rate": 8.238754268757092e-06,
"loss": 0.7186,
"step": 487
},
{
"epoch": 0.3,
"grad_norm": 1.881768667514335,
"learning_rate": 8.231197661320755e-06,
"loss": 0.6097,
"step": 488
},
{
"epoch": 0.3,
"grad_norm": 1.9646616396951349,
"learning_rate": 8.223628359706063e-06,
"loss": 0.6717,
"step": 489
},
{
"epoch": 0.3,
"grad_norm": 1.9845900062528004,
"learning_rate": 8.216046393649997e-06,
"loss": 0.5794,
"step": 490
},
{
"epoch": 0.3,
"grad_norm": 1.975691200881602,
"learning_rate": 8.20845179293929e-06,
"loss": 0.6777,
"step": 491
},
{
"epoch": 0.3,
"grad_norm": 1.9790588163074925,
"learning_rate": 8.20084458741032e-06,
"loss": 0.5762,
"step": 492
},
{
"epoch": 0.3,
"grad_norm": 2.033854053229917,
"learning_rate": 8.193224806948975e-06,
"loss": 0.6425,
"step": 493
},
{
"epoch": 0.3,
"grad_norm": 2.0564703779289855,
"learning_rate": 8.185592481490549e-06,
"loss": 0.5421,
"step": 494
},
{
"epoch": 0.3,
"grad_norm": 1.9024282460009037,
"learning_rate": 8.177947641019622e-06,
"loss": 0.5416,
"step": 495
},
{
"epoch": 0.3,
"grad_norm": 1.8428880273743034,
"learning_rate": 8.170290315569937e-06,
"loss": 0.5476,
"step": 496
},
{
"epoch": 0.3,
"grad_norm": 2.0314873709790517,
"learning_rate": 8.16262053522429e-06,
"loss": 0.6254,
"step": 497
},
{
"epoch": 0.3,
"grad_norm": 1.959747747554248,
"learning_rate": 8.154938330114407e-06,
"loss": 0.6715,
"step": 498
},
{
"epoch": 0.31,
"grad_norm": 1.9605352675210954,
"learning_rate": 8.147243730420827e-06,
"loss": 0.5389,
"step": 499
},
{
"epoch": 0.31,
"grad_norm": 1.9808533481893225,
"learning_rate": 8.139536766372775e-06,
"loss": 0.5917,
"step": 500
},
{
"epoch": 0.31,
"grad_norm": 2.124751570239496,
"learning_rate": 8.131817468248064e-06,
"loss": 0.646,
"step": 501
},
{
"epoch": 0.31,
"grad_norm": 1.9453603552598644,
"learning_rate": 8.124085866372952e-06,
"loss": 0.6475,
"step": 502
},
{
"epoch": 0.31,
"grad_norm": 2.284493964086694,
"learning_rate": 8.116341991122038e-06,
"loss": 0.657,
"step": 503
},
{
"epoch": 0.31,
"grad_norm": 2.173487845748996,
"learning_rate": 8.108585872918142e-06,
"loss": 0.6072,
"step": 504
},
{
"epoch": 0.31,
"grad_norm": 1.9740790341680636,
"learning_rate": 8.100817542232175e-06,
"loss": 0.6192,
"step": 505
},
{
"epoch": 0.31,
"grad_norm": 1.9882407145838754,
"learning_rate": 8.09303702958303e-06,
"loss": 0.7174,
"step": 506
},
{
"epoch": 0.31,
"grad_norm": 1.765767752810985,
"learning_rate": 8.085244365537459e-06,
"loss": 0.5659,
"step": 507
},
{
"epoch": 0.31,
"grad_norm": 2.0919873787965018,
"learning_rate": 8.077439580709954e-06,
"loss": 0.7014,
"step": 508
},
{
"epoch": 0.31,
"grad_norm": 2.0909317709072597,
"learning_rate": 8.069622705762619e-06,
"loss": 0.6553,
"step": 509
},
{
"epoch": 0.31,
"grad_norm": 2.0985013077972163,
"learning_rate": 8.06179377140506e-06,
"loss": 0.5996,
"step": 510
},
{
"epoch": 0.31,
"grad_norm": 1.875167456622752,
"learning_rate": 8.05395280839426e-06,
"loss": 0.4977,
"step": 511
},
{
"epoch": 0.31,
"grad_norm": 1.8642775987752205,
"learning_rate": 8.046099847534458e-06,
"loss": 0.516,
"step": 512
},
{
"epoch": 0.31,
"grad_norm": 1.8047762854038711,
"learning_rate": 8.038234919677029e-06,
"loss": 0.5456,
"step": 513
},
{
"epoch": 0.31,
"grad_norm": 2.054027103241828,
"learning_rate": 8.030358055720355e-06,
"loss": 0.6449,
"step": 514
},
{
"epoch": 0.31,
"grad_norm": 1.88938224837625,
"learning_rate": 8.02246928660972e-06,
"loss": 0.5853,
"step": 515
},
{
"epoch": 0.32,
"grad_norm": 1.9949053145025524,
"learning_rate": 8.014568643337175e-06,
"loss": 0.6374,
"step": 516
},
{
"epoch": 0.32,
"grad_norm": 2.181017234415942,
"learning_rate": 8.006656156941418e-06,
"loss": 0.6383,
"step": 517
},
{
"epoch": 0.32,
"grad_norm": 2.1231246586306325,
"learning_rate": 7.998731858507675e-06,
"loss": 0.6517,
"step": 518
},
{
"epoch": 0.32,
"grad_norm": 2.014704090337969,
"learning_rate": 7.990795779167584e-06,
"loss": 0.6203,
"step": 519
},
{
"epoch": 0.32,
"grad_norm": 2.1690171263618785,
"learning_rate": 7.982847950099055e-06,
"loss": 0.7135,
"step": 520
},
{
"epoch": 0.32,
"grad_norm": 1.8051904968061352,
"learning_rate": 7.974888402526166e-06,
"loss": 0.5658,
"step": 521
},
{
"epoch": 0.32,
"grad_norm": 2.0602443463430555,
"learning_rate": 7.966917167719029e-06,
"loss": 0.6524,
"step": 522
},
{
"epoch": 0.32,
"grad_norm": 1.8973152827677298,
"learning_rate": 7.95893427699367e-06,
"loss": 0.5758,
"step": 523
},
{
"epoch": 0.32,
"grad_norm": 2.019571292211095,
"learning_rate": 7.950939761711915e-06,
"loss": 0.6241,
"step": 524
},
{
"epoch": 0.32,
"grad_norm": 1.8785901266881793,
"learning_rate": 7.942933653281245e-06,
"loss": 0.5769,
"step": 525
},
{
"epoch": 0.32,
"grad_norm": 2.166266745685418,
"learning_rate": 7.934915983154698e-06,
"loss": 0.6663,
"step": 526
},
{
"epoch": 0.32,
"grad_norm": 2.123451242286571,
"learning_rate": 7.92688678283073e-06,
"loss": 0.6527,
"step": 527
},
{
"epoch": 0.32,
"grad_norm": 1.9075717966913297,
"learning_rate": 7.918846083853089e-06,
"loss": 0.6569,
"step": 528
},
{
"epoch": 0.32,
"grad_norm": 1.9000996169691746,
"learning_rate": 7.910793917810707e-06,
"loss": 0.6385,
"step": 529
},
{
"epoch": 0.32,
"grad_norm": 1.927434736070484,
"learning_rate": 7.902730316337556e-06,
"loss": 0.5631,
"step": 530
},
{
"epoch": 0.32,
"grad_norm": 1.9790989754571544,
"learning_rate": 7.894655311112545e-06,
"loss": 0.6068,
"step": 531
},
{
"epoch": 0.33,
"grad_norm": 2.1188146600936535,
"learning_rate": 7.886568933859372e-06,
"loss": 0.696,
"step": 532
},
{
"epoch": 0.33,
"grad_norm": 1.9800997991892215,
"learning_rate": 7.878471216346418e-06,
"loss": 0.6283,
"step": 533
},
{
"epoch": 0.33,
"grad_norm": 1.8749813668837976,
"learning_rate": 7.870362190386616e-06,
"loss": 0.5925,
"step": 534
},
{
"epoch": 0.33,
"grad_norm": 2.030181629804673,
"learning_rate": 7.862241887837322e-06,
"loss": 0.5838,
"step": 535
},
{
"epoch": 0.33,
"grad_norm": 2.191116164536583,
"learning_rate": 7.854110340600199e-06,
"loss": 0.6621,
"step": 536
},
{
"epoch": 0.33,
"grad_norm": 2.1322352729861747,
"learning_rate": 7.845967580621082e-06,
"loss": 0.7296,
"step": 537
},
{
"epoch": 0.33,
"grad_norm": 1.9503331877159438,
"learning_rate": 7.837813639889858e-06,
"loss": 0.6131,
"step": 538
},
{
"epoch": 0.33,
"grad_norm": 2.0706877886034802,
"learning_rate": 7.829648550440337e-06,
"loss": 0.6048,
"step": 539
},
{
"epoch": 0.33,
"grad_norm": 2.0116116138534617,
"learning_rate": 7.821472344350131e-06,
"loss": 0.6343,
"step": 540
},
{
"epoch": 0.33,
"grad_norm": 1.9969976539512104,
"learning_rate": 7.813285053740526e-06,
"loss": 0.6453,
"step": 541
},
{
"epoch": 0.33,
"grad_norm": 2.1284306784258638,
"learning_rate": 7.805086710776353e-06,
"loss": 0.6498,
"step": 542
},
{
"epoch": 0.33,
"grad_norm": 1.9199881561880785,
"learning_rate": 7.796877347665861e-06,
"loss": 0.5469,
"step": 543
},
{
"epoch": 0.33,
"grad_norm": 2.190799717584273,
"learning_rate": 7.788656996660596e-06,
"loss": 0.6443,
"step": 544
},
{
"epoch": 0.33,
"grad_norm": 2.0667507128163525,
"learning_rate": 7.780425690055275e-06,
"loss": 0.6689,
"step": 545
},
{
"epoch": 0.33,
"grad_norm": 2.0614204138949077,
"learning_rate": 7.772183460187647e-06,
"loss": 0.7005,
"step": 546
},
{
"epoch": 0.33,
"grad_norm": 1.8873750448102828,
"learning_rate": 7.763930339438383e-06,
"loss": 0.54,
"step": 547
},
{
"epoch": 0.34,
"grad_norm": 1.8982868706163196,
"learning_rate": 7.755666360230933e-06,
"loss": 0.6,
"step": 548
},
{
"epoch": 0.34,
"grad_norm": 1.9491417935122528,
"learning_rate": 7.747391555031414e-06,
"loss": 0.5981,
"step": 549
},
{
"epoch": 0.34,
"grad_norm": 2.082993477568864,
"learning_rate": 7.739105956348465e-06,
"loss": 0.6724,
"step": 550
},
{
"epoch": 0.34,
"grad_norm": 1.9771665196367632,
"learning_rate": 7.730809596733136e-06,
"loss": 0.6199,
"step": 551
},
{
"epoch": 0.34,
"grad_norm": 1.9910315139065318,
"learning_rate": 7.722502508778747e-06,
"loss": 0.6237,
"step": 552
},
{
"epoch": 0.34,
"grad_norm": 1.8173451465224066,
"learning_rate": 7.71418472512077e-06,
"loss": 0.5711,
"step": 553
},
{
"epoch": 0.34,
"grad_norm": 2.017970082027841,
"learning_rate": 7.705856278436696e-06,
"loss": 0.568,
"step": 554
},
{
"epoch": 0.34,
"grad_norm": 1.756994616816012,
"learning_rate": 7.697517201445906e-06,
"loss": 0.4771,
"step": 555
},
{
"epoch": 0.34,
"grad_norm": 1.783316776618109,
"learning_rate": 7.689167526909542e-06,
"loss": 0.5154,
"step": 556
},
{
"epoch": 0.34,
"grad_norm": 2.019720522663777,
"learning_rate": 7.680807287630383e-06,
"loss": 0.6041,
"step": 557
},
{
"epoch": 0.34,
"grad_norm": 2.0783908508210622,
"learning_rate": 7.67243651645271e-06,
"loss": 0.5921,
"step": 558
},
{
"epoch": 0.34,
"grad_norm": 1.8528382833978114,
"learning_rate": 7.664055246262183e-06,
"loss": 0.5604,
"step": 559
},
{
"epoch": 0.34,
"grad_norm": 2.117910334131364,
"learning_rate": 7.655663509985707e-06,
"loss": 0.6059,
"step": 560
},
{
"epoch": 0.34,
"grad_norm": 1.9758606107637775,
"learning_rate": 7.647261340591303e-06,
"loss": 0.6412,
"step": 561
},
{
"epoch": 0.34,
"grad_norm": 1.9290348907447834,
"learning_rate": 7.638848771087982e-06,
"loss": 0.5705,
"step": 562
},
{
"epoch": 0.34,
"grad_norm": 1.8960214594144043,
"learning_rate": 7.63042583452561e-06,
"loss": 0.6163,
"step": 563
},
{
"epoch": 0.34,
"grad_norm": 1.7445681487714644,
"learning_rate": 7.621992563994789e-06,
"loss": 0.5722,
"step": 564
},
{
"epoch": 0.35,
"grad_norm": 1.9279287791801931,
"learning_rate": 7.613548992626711e-06,
"loss": 0.5845,
"step": 565
},
{
"epoch": 0.35,
"grad_norm": 1.8792256339894968,
"learning_rate": 7.605095153593038e-06,
"loss": 0.5947,
"step": 566
},
{
"epoch": 0.35,
"grad_norm": 1.9849662044668719,
"learning_rate": 7.596631080105774e-06,
"loss": 0.6454,
"step": 567
},
{
"epoch": 0.35,
"grad_norm": 2.1721543557468643,
"learning_rate": 7.588156805417126e-06,
"loss": 0.5729,
"step": 568
},
{
"epoch": 0.35,
"grad_norm": 2.0849048969435136,
"learning_rate": 7.5796723628193815e-06,
"loss": 0.5947,
"step": 569
},
{
"epoch": 0.35,
"grad_norm": 2.2968962836408324,
"learning_rate": 7.571177785644766e-06,
"loss": 0.6569,
"step": 570
},
{
"epoch": 0.35,
"grad_norm": 1.9267367648655322,
"learning_rate": 7.562673107265333e-06,
"loss": 0.5691,
"step": 571
},
{
"epoch": 0.35,
"grad_norm": 1.9013543447575418,
"learning_rate": 7.554158361092807e-06,
"loss": 0.5434,
"step": 572
},
{
"epoch": 0.35,
"grad_norm": 1.82007897551597,
"learning_rate": 7.545633580578474e-06,
"loss": 0.6298,
"step": 573
},
{
"epoch": 0.35,
"grad_norm": 2.04394739282291,
"learning_rate": 7.537098799213036e-06,
"loss": 0.622,
"step": 574
},
{
"epoch": 0.35,
"grad_norm": 2.075155956099819,
"learning_rate": 7.528554050526489e-06,
"loss": 0.6556,
"step": 575
},
{
"epoch": 0.35,
"grad_norm": 2.0315943098160236,
"learning_rate": 7.519999368087982e-06,
"loss": 0.6453,
"step": 576
},
{
"epoch": 0.35,
"grad_norm": 2.0428160155679786,
"learning_rate": 7.511434785505693e-06,
"loss": 0.7135,
"step": 577
},
{
"epoch": 0.35,
"grad_norm": 2.222904873771381,
"learning_rate": 7.502860336426696e-06,
"loss": 0.6357,
"step": 578
},
{
"epoch": 0.35,
"grad_norm": 1.7822608620060818,
"learning_rate": 7.494276054536821e-06,
"loss": 0.5291,
"step": 579
},
{
"epoch": 0.35,
"grad_norm": 2.0685923305897624,
"learning_rate": 7.485681973560532e-06,
"loss": 0.5797,
"step": 580
},
{
"epoch": 0.36,
"grad_norm": 1.8859535667625311,
"learning_rate": 7.4770781272607895e-06,
"loss": 0.548,
"step": 581
},
{
"epoch": 0.36,
"grad_norm": 2.2063159373182093,
"learning_rate": 7.468464549438916e-06,
"loss": 0.5926,
"step": 582
},
{
"epoch": 0.36,
"grad_norm": 1.8937819274172978,
"learning_rate": 7.45984127393447e-06,
"loss": 0.6131,
"step": 583
},
{
"epoch": 0.36,
"grad_norm": 2.0362363463005506,
"learning_rate": 7.4512083346251026e-06,
"loss": 0.5862,
"step": 584
},
{
"epoch": 0.36,
"grad_norm": 2.2409987097836717,
"learning_rate": 7.442565765426436e-06,
"loss": 0.6329,
"step": 585
},
{
"epoch": 0.36,
"grad_norm": 1.984238720215282,
"learning_rate": 7.433913600291921e-06,
"loss": 0.6436,
"step": 586
},
{
"epoch": 0.36,
"grad_norm": 2.104431967032393,
"learning_rate": 7.425251873212709e-06,
"loss": 0.7334,
"step": 587
},
{
"epoch": 0.36,
"grad_norm": 2.1700383544690096,
"learning_rate": 7.416580618217515e-06,
"loss": 0.6976,
"step": 588
},
{
"epoch": 0.36,
"grad_norm": 2.2359253381623345,
"learning_rate": 7.407899869372489e-06,
"loss": 0.6529,
"step": 589
},
{
"epoch": 0.36,
"grad_norm": 2.207059006099314,
"learning_rate": 7.399209660781075e-06,
"loss": 0.6742,
"step": 590
},
{
"epoch": 0.36,
"grad_norm": 2.261704338275933,
"learning_rate": 7.390510026583884e-06,
"loss": 0.6153,
"step": 591
},
{
"epoch": 0.36,
"grad_norm": 2.0605566199360004,
"learning_rate": 7.381801000958554e-06,
"loss": 0.6127,
"step": 592
},
{
"epoch": 0.36,
"grad_norm": 1.8448877901078287,
"learning_rate": 7.3730826181196206e-06,
"loss": 0.5658,
"step": 593
},
{
"epoch": 0.36,
"grad_norm": 1.813700393133709,
"learning_rate": 7.364354912318379e-06,
"loss": 0.5713,
"step": 594
},
{
"epoch": 0.36,
"grad_norm": 1.8851240301079237,
"learning_rate": 7.355617917842751e-06,
"loss": 0.551,
"step": 595
},
{
"epoch": 0.36,
"grad_norm": 1.7963234257752434,
"learning_rate": 7.346871669017153e-06,
"loss": 0.5825,
"step": 596
},
{
"epoch": 0.37,
"grad_norm": 1.9811136419250976,
"learning_rate": 7.338116200202352e-06,
"loss": 0.6257,
"step": 597
},
{
"epoch": 0.37,
"grad_norm": 2.113085248629396,
"learning_rate": 7.329351545795345e-06,
"loss": 0.6154,
"step": 598
},
{
"epoch": 0.37,
"grad_norm": 1.664313245894575,
"learning_rate": 7.320577740229208e-06,
"loss": 0.5348,
"step": 599
},
{
"epoch": 0.37,
"grad_norm": 2.0938115916173095,
"learning_rate": 7.311794817972975e-06,
"loss": 0.619,
"step": 600
},
{
"epoch": 0.37,
"grad_norm": 1.9914918015459053,
"learning_rate": 7.3030028135314905e-06,
"loss": 0.5977,
"step": 601
},
{
"epoch": 0.37,
"grad_norm": 1.8973404703566117,
"learning_rate": 7.294201761445284e-06,
"loss": 0.6016,
"step": 602
},
{
"epoch": 0.37,
"grad_norm": 2.686916719491371,
"learning_rate": 7.285391696290427e-06,
"loss": 0.5594,
"step": 603
},
{
"epoch": 0.37,
"grad_norm": 1.8914279547975104,
"learning_rate": 7.276572652678403e-06,
"loss": 0.6548,
"step": 604
},
{
"epoch": 0.37,
"grad_norm": 1.9045214756464477,
"learning_rate": 7.267744665255966e-06,
"loss": 0.5625,
"step": 605
},
{
"epoch": 0.37,
"grad_norm": 2.0508726477606416,
"learning_rate": 7.258907768705006e-06,
"loss": 0.5994,
"step": 606
},
{
"epoch": 0.37,
"grad_norm": 2.1572541720871206,
"learning_rate": 7.2500619977424154e-06,
"loss": 0.6259,
"step": 607
},
{
"epoch": 0.37,
"grad_norm": 1.8740105212119254,
"learning_rate": 7.241207387119953e-06,
"loss": 0.5498,
"step": 608
},
{
"epoch": 0.37,
"grad_norm": 2.11048827570066,
"learning_rate": 7.2323439716241e-06,
"loss": 0.6176,
"step": 609
},
{
"epoch": 0.37,
"grad_norm": 2.138108461906426,
"learning_rate": 7.223471786075934e-06,
"loss": 0.7467,
"step": 610
},
{
"epoch": 0.37,
"grad_norm": 2.1086676582577035,
"learning_rate": 7.214590865330984e-06,
"loss": 0.6513,
"step": 611
},
{
"epoch": 0.37,
"grad_norm": 1.8758955739955738,
"learning_rate": 7.2057012442790975e-06,
"loss": 0.5449,
"step": 612
},
{
"epoch": 0.37,
"grad_norm": 1.682499149886398,
"learning_rate": 7.1968029578443e-06,
"loss": 0.525,
"step": 613
},
{
"epoch": 0.38,
"grad_norm": 1.9868958931777934,
"learning_rate": 7.187896040984661e-06,
"loss": 0.626,
"step": 614
},
{
"epoch": 0.38,
"grad_norm": 1.9273119955565226,
"learning_rate": 7.178980528692161e-06,
"loss": 0.6012,
"step": 615
},
{
"epoch": 0.38,
"grad_norm": 2.0192285563021466,
"learning_rate": 7.170056455992541e-06,
"loss": 0.6065,
"step": 616
},
{
"epoch": 0.38,
"grad_norm": 1.9019456790022062,
"learning_rate": 7.161123857945177e-06,
"loss": 0.6329,
"step": 617
},
{
"epoch": 0.38,
"grad_norm": 2.024610041244123,
"learning_rate": 7.152182769642936e-06,
"loss": 0.6359,
"step": 618
},
{
"epoch": 0.38,
"grad_norm": 1.9325040211739186,
"learning_rate": 7.143233226212042e-06,
"loss": 0.5215,
"step": 619
},
{
"epoch": 0.38,
"grad_norm": 2.012751842328307,
"learning_rate": 7.134275262811935e-06,
"loss": 0.6432,
"step": 620
},
{
"epoch": 0.38,
"grad_norm": 1.7572649666598243,
"learning_rate": 7.1253089146351325e-06,
"loss": 0.5677,
"step": 621
},
{
"epoch": 0.38,
"grad_norm": 1.7788331655165412,
"learning_rate": 7.116334216907097e-06,
"loss": 0.5215,
"step": 622
},
{
"epoch": 0.38,
"grad_norm": 1.9050240439242967,
"learning_rate": 7.107351204886088e-06,
"loss": 0.5178,
"step": 623
},
{
"epoch": 0.38,
"grad_norm": 2.041894936366493,
"learning_rate": 7.098359913863034e-06,
"loss": 0.6043,
"step": 624
},
{
"epoch": 0.38,
"grad_norm": 2.0308197433902797,
"learning_rate": 7.089360379161381e-06,
"loss": 0.6213,
"step": 625
},
{
"epoch": 0.38,
"grad_norm": 1.95222686445269,
"learning_rate": 7.08035263613697e-06,
"loss": 0.5971,
"step": 626
},
{
"epoch": 0.38,
"grad_norm": 2.017912918523442,
"learning_rate": 7.071336720177886e-06,
"loss": 0.6046,
"step": 627
},
{
"epoch": 0.38,
"grad_norm": 2.166790566645372,
"learning_rate": 7.062312666704321e-06,
"loss": 0.5927,
"step": 628
},
{
"epoch": 0.38,
"grad_norm": 2.2400201098243544,
"learning_rate": 7.053280511168437e-06,
"loss": 0.7107,
"step": 629
},
{
"epoch": 0.39,
"grad_norm": 1.8761384322160164,
"learning_rate": 7.044240289054227e-06,
"loss": 0.5877,
"step": 630
},
{
"epoch": 0.39,
"grad_norm": 1.8121190685789235,
"learning_rate": 7.035192035877374e-06,
"loss": 0.5278,
"step": 631
},
{
"epoch": 0.39,
"grad_norm": 2.291146349707187,
"learning_rate": 7.026135787185113e-06,
"loss": 0.6674,
"step": 632
},
{
"epoch": 0.39,
"grad_norm": 1.9115866137344344,
"learning_rate": 7.017071578556088e-06,
"loss": 0.6101,
"step": 633
},
{
"epoch": 0.39,
"grad_norm": 2.2159502369044746,
"learning_rate": 7.007999445600216e-06,
"loss": 0.6451,
"step": 634
},
{
"epoch": 0.39,
"grad_norm": 1.94968047657449,
"learning_rate": 6.998919423958548e-06,
"loss": 0.6115,
"step": 635
},
{
"epoch": 0.39,
"grad_norm": 2.0483508425325208,
"learning_rate": 6.989831549303121e-06,
"loss": 0.5641,
"step": 636
},
{
"epoch": 0.39,
"grad_norm": 2.078362428704396,
"learning_rate": 6.980735857336831e-06,
"loss": 0.5859,
"step": 637
},
{
"epoch": 0.39,
"grad_norm": 2.102194806164863,
"learning_rate": 6.971632383793278e-06,
"loss": 0.5956,
"step": 638
},
{
"epoch": 0.39,
"grad_norm": 2.1562012485508766,
"learning_rate": 6.962521164436641e-06,
"loss": 0.6522,
"step": 639
},
{
"epoch": 0.39,
"grad_norm": 1.942549118113248,
"learning_rate": 6.953402235061519e-06,
"loss": 0.5656,
"step": 640
},
{
"epoch": 0.39,
"grad_norm": 2.032598660713363,
"learning_rate": 6.944275631492813e-06,
"loss": 0.6328,
"step": 641
},
{
"epoch": 0.39,
"grad_norm": 2.120207767189764,
"learning_rate": 6.935141389585562e-06,
"loss": 0.6283,
"step": 642
},
{
"epoch": 0.39,
"grad_norm": 1.846891984881128,
"learning_rate": 6.925999545224819e-06,
"loss": 0.5348,
"step": 643
},
{
"epoch": 0.39,
"grad_norm": 2.2117072258313515,
"learning_rate": 6.916850134325505e-06,
"loss": 0.5428,
"step": 644
},
{
"epoch": 0.39,
"grad_norm": 1.9428888699888005,
"learning_rate": 6.907693192832263e-06,
"loss": 0.6194,
"step": 645
},
{
"epoch": 0.39,
"grad_norm": 2.324654552066874,
"learning_rate": 6.898528756719325e-06,
"loss": 0.6157,
"step": 646
},
{
"epoch": 0.4,
"grad_norm": 2.105488134378262,
"learning_rate": 6.8893568619903625e-06,
"loss": 0.6574,
"step": 647
},
{
"epoch": 0.4,
"grad_norm": 2.0741815083758803,
"learning_rate": 6.8801775446783545e-06,
"loss": 0.681,
"step": 648
},
{
"epoch": 0.4,
"grad_norm": 1.8599982138936229,
"learning_rate": 6.870990840845435e-06,
"loss": 0.532,
"step": 649
},
{
"epoch": 0.4,
"grad_norm": 1.8066163998362903,
"learning_rate": 6.861796786582761e-06,
"loss": 0.5864,
"step": 650
},
{
"epoch": 0.4,
"grad_norm": 2.103633090126261,
"learning_rate": 6.852595418010364e-06,
"loss": 0.6276,
"step": 651
},
{
"epoch": 0.4,
"grad_norm": 2.0271857194621994,
"learning_rate": 6.843386771277012e-06,
"loss": 0.6113,
"step": 652
},
{
"epoch": 0.4,
"grad_norm": 1.9892261757816698,
"learning_rate": 6.834170882560066e-06,
"loss": 0.6066,
"step": 653
},
{
"epoch": 0.4,
"grad_norm": 2.0920982211462142,
"learning_rate": 6.824947788065339e-06,
"loss": 0.6631,
"step": 654
},
{
"epoch": 0.4,
"grad_norm": 1.7304966407527353,
"learning_rate": 6.8157175240269495e-06,
"loss": 0.5458,
"step": 655
},
{
"epoch": 0.4,
"grad_norm": 1.9003138804763595,
"learning_rate": 6.806480126707187e-06,
"loss": 0.6121,
"step": 656
},
{
"epoch": 0.4,
"grad_norm": 1.9727053822571718,
"learning_rate": 6.797235632396362e-06,
"loss": 0.6235,
"step": 657
},
{
"epoch": 0.4,
"grad_norm": 2.1447934975774325,
"learning_rate": 6.787984077412666e-06,
"loss": 0.652,
"step": 658
},
{
"epoch": 0.4,
"grad_norm": 2.0660746773365775,
"learning_rate": 6.7787254981020335e-06,
"loss": 0.6679,
"step": 659
},
{
"epoch": 0.4,
"grad_norm": 2.0622987551332597,
"learning_rate": 6.7694599308379895e-06,
"loss": 0.6033,
"step": 660
},
{
"epoch": 0.4,
"grad_norm": 1.9723578845421632,
"learning_rate": 6.760187412021516e-06,
"loss": 0.6082,
"step": 661
},
{
"epoch": 0.4,
"grad_norm": 1.7982428478028805,
"learning_rate": 6.750907978080902e-06,
"loss": 0.5334,
"step": 662
},
{
"epoch": 0.41,
"grad_norm": 2.036081125390073,
"learning_rate": 6.741621665471607e-06,
"loss": 0.6212,
"step": 663
},
{
"epoch": 0.41,
"grad_norm": 2.1493033458896664,
"learning_rate": 6.732328510676111e-06,
"loss": 0.6751,
"step": 664
},
{
"epoch": 0.41,
"grad_norm": 2.070635996051103,
"learning_rate": 6.723028550203779e-06,
"loss": 0.5758,
"step": 665
},
{
"epoch": 0.41,
"grad_norm": 1.9050719437104036,
"learning_rate": 6.7137218205907036e-06,
"loss": 0.54,
"step": 666
},
{
"epoch": 0.41,
"grad_norm": 1.7928377835662002,
"learning_rate": 6.704408358399583e-06,
"loss": 0.5676,
"step": 667
},
{
"epoch": 0.41,
"grad_norm": 2.027588661623482,
"learning_rate": 6.695088200219557e-06,
"loss": 0.5546,
"step": 668
},
{
"epoch": 0.41,
"grad_norm": 1.9325373078264918,
"learning_rate": 6.6857613826660714e-06,
"loss": 0.5941,
"step": 669
},
{
"epoch": 0.41,
"grad_norm": 1.9172475345332523,
"learning_rate": 6.676427942380741e-06,
"loss": 0.5328,
"step": 670
},
{
"epoch": 0.41,
"grad_norm": 2.1396308238670367,
"learning_rate": 6.667087916031192e-06,
"loss": 0.6748,
"step": 671
},
{
"epoch": 0.41,
"grad_norm": 1.8568393271779622,
"learning_rate": 6.657741340310927e-06,
"loss": 0.5975,
"step": 672
},
{
"epoch": 0.41,
"grad_norm": 2.0294986249307394,
"learning_rate": 6.648388251939177e-06,
"loss": 0.6111,
"step": 673
},
{
"epoch": 0.41,
"grad_norm": 1.9234325700371586,
"learning_rate": 6.639028687660766e-06,
"loss": 0.596,
"step": 674
},
{
"epoch": 0.41,
"grad_norm": 2.0366203200056088,
"learning_rate": 6.629662684245949e-06,
"loss": 0.5688,
"step": 675
},
{
"epoch": 0.41,
"grad_norm": 1.8958934625265222,
"learning_rate": 6.620290278490284e-06,
"loss": 0.5791,
"step": 676
},
{
"epoch": 0.41,
"grad_norm": 1.806822731262611,
"learning_rate": 6.610911507214482e-06,
"loss": 0.5465,
"step": 677
},
{
"epoch": 0.41,
"grad_norm": 1.8876652924106438,
"learning_rate": 6.601526407264261e-06,
"loss": 0.5537,
"step": 678
},
{
"epoch": 0.42,
"grad_norm": 2.0030181777961156,
"learning_rate": 6.592135015510197e-06,
"loss": 0.6045,
"step": 679
},
{
"epoch": 0.42,
"grad_norm": 1.97710315660336,
"learning_rate": 6.5827373688475925e-06,
"loss": 0.5725,
"step": 680
},
{
"epoch": 0.42,
"grad_norm": 2.01659583449962,
"learning_rate": 6.5733335041963175e-06,
"loss": 0.6237,
"step": 681
},
{
"epoch": 0.42,
"grad_norm": 1.8805423446258591,
"learning_rate": 6.563923458500672e-06,
"loss": 0.5479,
"step": 682
},
{
"epoch": 0.42,
"grad_norm": 1.7669993233431147,
"learning_rate": 6.554507268729238e-06,
"loss": 0.5109,
"step": 683
},
{
"epoch": 0.42,
"grad_norm": 1.8269014493705453,
"learning_rate": 6.545084971874738e-06,
"loss": 0.5462,
"step": 684
},
{
"epoch": 0.42,
"grad_norm": 2.0359088067386786,
"learning_rate": 6.535656604953884e-06,
"loss": 0.6384,
"step": 685
},
{
"epoch": 0.42,
"grad_norm": 1.9599189973913222,
"learning_rate": 6.526222205007236e-06,
"loss": 0.5452,
"step": 686
},
{
"epoch": 0.42,
"grad_norm": 1.7645021807661985,
"learning_rate": 6.516781809099055e-06,
"loss": 0.4752,
"step": 687
},
{
"epoch": 0.42,
"grad_norm": 1.9917551528037687,
"learning_rate": 6.507335454317161e-06,
"loss": 0.5545,
"step": 688
},
{
"epoch": 0.42,
"grad_norm": 2.026042072169137,
"learning_rate": 6.497883177772779e-06,
"loss": 0.627,
"step": 689
},
{
"epoch": 0.42,
"grad_norm": 1.8861715324088848,
"learning_rate": 6.488425016600403e-06,
"loss": 0.6235,
"step": 690
},
{
"epoch": 0.42,
"grad_norm": 1.957150362035283,
"learning_rate": 6.4789610079576426e-06,
"loss": 0.5386,
"step": 691
},
{
"epoch": 0.42,
"grad_norm": 2.1453992014090364,
"learning_rate": 6.469491189025081e-06,
"loss": 0.6518,
"step": 692
},
{
"epoch": 0.42,
"grad_norm": 1.765906720730784,
"learning_rate": 6.46001559700613e-06,
"loss": 0.6203,
"step": 693
},
{
"epoch": 0.42,
"grad_norm": 2.260525086713535,
"learning_rate": 6.450534269126878e-06,
"loss": 0.6806,
"step": 694
},
{
"epoch": 0.42,
"grad_norm": 2.171042858463267,
"learning_rate": 6.441047242635947e-06,
"loss": 0.6542,
"step": 695
},
{
"epoch": 0.43,
"grad_norm": 1.9103992956138385,
"learning_rate": 6.431554554804353e-06,
"loss": 0.6342,
"step": 696
},
{
"epoch": 0.43,
"grad_norm": 2.0002914383944974,
"learning_rate": 6.422056242925347e-06,
"loss": 0.561,
"step": 697
},
{
"epoch": 0.43,
"grad_norm": 1.8379292896391608,
"learning_rate": 6.412552344314279e-06,
"loss": 0.5599,
"step": 698
},
{
"epoch": 0.43,
"grad_norm": 1.9748932397312229,
"learning_rate": 6.40304289630844e-06,
"loss": 0.5952,
"step": 699
},
{
"epoch": 0.43,
"grad_norm": 2.1614553712542253,
"learning_rate": 6.3935279362669335e-06,
"loss": 0.6412,
"step": 700
},
{
"epoch": 0.43,
"grad_norm": 1.9859247585151905,
"learning_rate": 6.384007501570509e-06,
"loss": 0.6359,
"step": 701
},
{
"epoch": 0.43,
"grad_norm": 1.8136261988901872,
"learning_rate": 6.374481629621427e-06,
"loss": 0.5893,
"step": 702
},
{
"epoch": 0.43,
"grad_norm": 2.0161944625478574,
"learning_rate": 6.364950357843309e-06,
"loss": 0.5371,
"step": 703
},
{
"epoch": 0.43,
"grad_norm": 1.8707294403008965,
"learning_rate": 6.355413723680991e-06,
"loss": 0.606,
"step": 704
},
{
"epoch": 0.43,
"grad_norm": 2.083870266773342,
"learning_rate": 6.3458717646003746e-06,
"loss": 0.5857,
"step": 705
},
{
"epoch": 0.43,
"grad_norm": 2.21481293408251,
"learning_rate": 6.33632451808828e-06,
"loss": 0.5945,
"step": 706
},
{
"epoch": 0.43,
"grad_norm": 1.8209514169327161,
"learning_rate": 6.326772021652303e-06,
"loss": 0.561,
"step": 707
},
{
"epoch": 0.43,
"grad_norm": 1.7765877330909154,
"learning_rate": 6.317214312820662e-06,
"loss": 0.5808,
"step": 708
},
{
"epoch": 0.43,
"grad_norm": 2.301880122628837,
"learning_rate": 6.307651429142053e-06,
"loss": 0.6169,
"step": 709
},
{
"epoch": 0.43,
"grad_norm": 1.9511858179855806,
"learning_rate": 6.298083408185503e-06,
"loss": 0.5485,
"step": 710
},
{
"epoch": 0.43,
"grad_norm": 1.8733084336465669,
"learning_rate": 6.288510287540221e-06,
"loss": 0.5414,
"step": 711
},
{
"epoch": 0.44,
"grad_norm": 1.9100686536267126,
"learning_rate": 6.278932104815453e-06,
"loss": 0.5177,
"step": 712
},
{
"epoch": 0.44,
"grad_norm": 1.980724894367988,
"learning_rate": 6.269348897640327e-06,
"loss": 0.5847,
"step": 713
},
{
"epoch": 0.44,
"grad_norm": 2.4372622910469515,
"learning_rate": 6.259760703663713e-06,
"loss": 0.6332,
"step": 714
},
{
"epoch": 0.44,
"grad_norm": 1.7666399516614875,
"learning_rate": 6.2501675605540755e-06,
"loss": 0.4731,
"step": 715
},
{
"epoch": 0.44,
"grad_norm": 1.9269831431359743,
"learning_rate": 6.240569505999317e-06,
"loss": 0.5864,
"step": 716
},
{
"epoch": 0.44,
"grad_norm": 2.1889446969469306,
"learning_rate": 6.230966577706637e-06,
"loss": 0.6465,
"step": 717
},
{
"epoch": 0.44,
"grad_norm": 2.0970566330713036,
"learning_rate": 6.221358813402383e-06,
"loss": 0.6136,
"step": 718
},
{
"epoch": 0.44,
"grad_norm": 2.05054134285554,
"learning_rate": 6.211746250831902e-06,
"loss": 0.6313,
"step": 719
},
{
"epoch": 0.44,
"grad_norm": 1.997693167566272,
"learning_rate": 6.202128927759391e-06,
"loss": 0.5838,
"step": 720
},
{
"epoch": 0.44,
"grad_norm": 1.902952046522087,
"learning_rate": 6.192506881967746e-06,
"loss": 0.5913,
"step": 721
},
{
"epoch": 0.44,
"grad_norm": 2.118349972159298,
"learning_rate": 6.182880151258422e-06,
"loss": 0.6401,
"step": 722
},
{
"epoch": 0.44,
"grad_norm": 1.7923389650042116,
"learning_rate": 6.173248773451278e-06,
"loss": 0.4488,
"step": 723
},
{
"epoch": 0.44,
"grad_norm": 2.0358206659939206,
"learning_rate": 6.163612786384426e-06,
"loss": 0.5871,
"step": 724
},
{
"epoch": 0.44,
"grad_norm": 1.9373816200953502,
"learning_rate": 6.153972227914089e-06,
"loss": 0.6472,
"step": 725
},
{
"epoch": 0.44,
"grad_norm": 1.7741801403344204,
"learning_rate": 6.144327135914452e-06,
"loss": 0.5512,
"step": 726
},
{
"epoch": 0.44,
"grad_norm": 2.000681880884647,
"learning_rate": 6.134677548277504e-06,
"loss": 0.6792,
"step": 727
},
{
"epoch": 0.45,
"grad_norm": 2.030783168366151,
"learning_rate": 6.125023502912901e-06,
"loss": 0.6046,
"step": 728
},
{
"epoch": 0.45,
"grad_norm": 2.0794685094461802,
"learning_rate": 6.1153650377478116e-06,
"loss": 0.6356,
"step": 729
},
{
"epoch": 0.45,
"grad_norm": 2.0447620516144394,
"learning_rate": 6.105702190726765e-06,
"loss": 0.6179,
"step": 730
},
{
"epoch": 0.45,
"grad_norm": 2.0749186785935803,
"learning_rate": 6.096034999811507e-06,
"loss": 0.6269,
"step": 731
},
{
"epoch": 0.45,
"grad_norm": 2.099298563572386,
"learning_rate": 6.086363502980848e-06,
"loss": 0.5107,
"step": 732
},
{
"epoch": 0.45,
"grad_norm": 2.143595827433539,
"learning_rate": 6.076687738230517e-06,
"loss": 0.613,
"step": 733
},
{
"epoch": 0.45,
"grad_norm": 2.2534551834297574,
"learning_rate": 6.067007743573007e-06,
"loss": 0.6627,
"step": 734
},
{
"epoch": 0.45,
"grad_norm": 2.196821231032266,
"learning_rate": 6.0573235570374315e-06,
"loss": 0.6868,
"step": 735
},
{
"epoch": 0.45,
"grad_norm": 1.9274444301473674,
"learning_rate": 6.04763521666937e-06,
"loss": 0.6234,
"step": 736
},
{
"epoch": 0.45,
"grad_norm": 2.0827989695986906,
"learning_rate": 6.037942760530722e-06,
"loss": 0.5338,
"step": 737
},
{
"epoch": 0.45,
"grad_norm": 1.7904575978539012,
"learning_rate": 6.028246226699559e-06,
"loss": 0.5255,
"step": 738
},
{
"epoch": 0.45,
"grad_norm": 1.8597042651935416,
"learning_rate": 6.018545653269967e-06,
"loss": 0.5604,
"step": 739
},
{
"epoch": 0.45,
"grad_norm": 1.925996097217488,
"learning_rate": 6.008841078351903e-06,
"loss": 0.5435,
"step": 740
},
{
"epoch": 0.45,
"grad_norm": 1.726239669389769,
"learning_rate": 5.9991325400710506e-06,
"loss": 0.5033,
"step": 741
},
{
"epoch": 0.45,
"grad_norm": 1.8668368547030405,
"learning_rate": 5.9894200765686574e-06,
"loss": 0.5801,
"step": 742
},
{
"epoch": 0.45,
"grad_norm": 2.0334452116466037,
"learning_rate": 5.9797037260013915e-06,
"loss": 0.6715,
"step": 743
},
{
"epoch": 0.45,
"grad_norm": 2.1692961748152384,
"learning_rate": 5.969983526541197e-06,
"loss": 0.6002,
"step": 744
},
{
"epoch": 0.46,
"grad_norm": 1.8242102907354445,
"learning_rate": 5.960259516375134e-06,
"loss": 0.5459,
"step": 745
},
{
"epoch": 0.46,
"grad_norm": 1.9465644945877867,
"learning_rate": 5.950531733705237e-06,
"loss": 0.5633,
"step": 746
},
{
"epoch": 0.46,
"grad_norm": 2.033416173745934,
"learning_rate": 5.940800216748357e-06,
"loss": 0.595,
"step": 747
},
{
"epoch": 0.46,
"grad_norm": 2.029664712146445,
"learning_rate": 5.9310650037360226e-06,
"loss": 0.636,
"step": 748
},
{
"epoch": 0.46,
"grad_norm": 1.949211782466542,
"learning_rate": 5.921326132914275e-06,
"loss": 0.5598,
"step": 749
},
{
"epoch": 0.46,
"grad_norm": 2.130586940499767,
"learning_rate": 5.911583642543532e-06,
"loss": 0.6793,
"step": 750
},
{
"epoch": 0.46,
"grad_norm": 2.037641386476974,
"learning_rate": 5.901837570898425e-06,
"loss": 0.6281,
"step": 751
},
{
"epoch": 0.46,
"grad_norm": 2.063158938340875,
"learning_rate": 5.892087956267659e-06,
"loss": 0.5975,
"step": 752
},
{
"epoch": 0.46,
"grad_norm": 2.0120237220111954,
"learning_rate": 5.88233483695386e-06,
"loss": 0.5072,
"step": 753
},
{
"epoch": 0.46,
"grad_norm": 1.8881476646969595,
"learning_rate": 5.872578251273418e-06,
"loss": 0.5661,
"step": 754
},
{
"epoch": 0.46,
"grad_norm": 1.8984742507906354,
"learning_rate": 5.862818237556344e-06,
"loss": 0.5364,
"step": 755
},
{
"epoch": 0.46,
"grad_norm": 2.041836701931837,
"learning_rate": 5.8530548341461125e-06,
"loss": 0.6654,
"step": 756
},
{
"epoch": 0.46,
"grad_norm": 1.9187386355732121,
"learning_rate": 5.843288079399523e-06,
"loss": 0.5945,
"step": 757
},
{
"epoch": 0.46,
"grad_norm": 1.929660691723023,
"learning_rate": 5.833518011686531e-06,
"loss": 0.523,
"step": 758
},
{
"epoch": 0.46,
"grad_norm": 2.150008834991264,
"learning_rate": 5.823744669390115e-06,
"loss": 0.669,
"step": 759
},
{
"epoch": 0.46,
"grad_norm": 2.124617912097345,
"learning_rate": 5.813968090906117e-06,
"loss": 0.5963,
"step": 760
},
{
"epoch": 0.47,
"grad_norm": 2.0097512818088563,
"learning_rate": 5.804188314643088e-06,
"loss": 0.5946,
"step": 761
},
{
"epoch": 0.47,
"grad_norm": 1.7904830455893548,
"learning_rate": 5.794405379022147e-06,
"loss": 0.5818,
"step": 762
},
{
"epoch": 0.47,
"grad_norm": 2.1097636641498805,
"learning_rate": 5.784619322476822e-06,
"loss": 0.5711,
"step": 763
},
{
"epoch": 0.47,
"grad_norm": 2.1115025364007636,
"learning_rate": 5.774830183452905e-06,
"loss": 0.5844,
"step": 764
},
{
"epoch": 0.47,
"grad_norm": 2.0129995897966833,
"learning_rate": 5.765038000408295e-06,
"loss": 0.5759,
"step": 765
},
{
"epoch": 0.47,
"grad_norm": 1.9329010751228102,
"learning_rate": 5.755242811812851e-06,
"loss": 0.5464,
"step": 766
},
{
"epoch": 0.47,
"grad_norm": 2.0085429975812055,
"learning_rate": 5.74544465614824e-06,
"loss": 0.5751,
"step": 767
},
{
"epoch": 0.47,
"grad_norm": 1.9903327654676763,
"learning_rate": 5.735643571907785e-06,
"loss": 0.6458,
"step": 768
},
{
"epoch": 0.47,
"grad_norm": 2.1067344237718393,
"learning_rate": 5.725839597596312e-06,
"loss": 0.6115,
"step": 769
},
{
"epoch": 0.47,
"grad_norm": 1.9189891230884772,
"learning_rate": 5.716032771730008e-06,
"loss": 0.5441,
"step": 770
},
{
"epoch": 0.47,
"grad_norm": 2.4011718668135993,
"learning_rate": 5.706223132836255e-06,
"loss": 0.5773,
"step": 771
},
{
"epoch": 0.47,
"grad_norm": 2.072775046614376,
"learning_rate": 5.69641071945349e-06,
"loss": 0.6463,
"step": 772
},
{
"epoch": 0.47,
"grad_norm": 1.9935114006477437,
"learning_rate": 5.686595570131048e-06,
"loss": 0.5186,
"step": 773
},
{
"epoch": 0.47,
"grad_norm": 1.844020593200682,
"learning_rate": 5.6767777234290165e-06,
"loss": 0.5469,
"step": 774
},
{
"epoch": 0.47,
"grad_norm": 2.0393335340113743,
"learning_rate": 5.666957217918076e-06,
"loss": 0.6512,
"step": 775
},
{
"epoch": 0.47,
"grad_norm": 1.9394154490786393,
"learning_rate": 5.657134092179354e-06,
"loss": 0.603,
"step": 776
},
{
"epoch": 0.48,
"grad_norm": 2.005329689227122,
"learning_rate": 5.647308384804272e-06,
"loss": 0.6182,
"step": 777
},
{
"epoch": 0.48,
"grad_norm": 2.170791771636413,
"learning_rate": 5.637480134394394e-06,
"loss": 0.6277,
"step": 778
},
{
"epoch": 0.48,
"grad_norm": 2.001689578136617,
"learning_rate": 5.627649379561273e-06,
"loss": 0.6089,
"step": 779
},
{
"epoch": 0.48,
"grad_norm": 2.0454235186622114,
"learning_rate": 5.617816158926303e-06,
"loss": 0.5878,
"step": 780
},
{
"epoch": 0.48,
"grad_norm": 1.8823086840222076,
"learning_rate": 5.607980511120565e-06,
"loss": 0.5706,
"step": 781
},
{
"epoch": 0.48,
"grad_norm": 1.80269943780875,
"learning_rate": 5.598142474784671e-06,
"loss": 0.4961,
"step": 782
},
{
"epoch": 0.48,
"grad_norm": 2.033336270252808,
"learning_rate": 5.588302088568625e-06,
"loss": 0.556,
"step": 783
},
{
"epoch": 0.48,
"grad_norm": 1.9778139575223732,
"learning_rate": 5.578459391131657e-06,
"loss": 0.5302,
"step": 784
},
{
"epoch": 0.48,
"grad_norm": 2.020736561300123,
"learning_rate": 5.568614421142078e-06,
"loss": 0.5978,
"step": 785
},
{
"epoch": 0.48,
"grad_norm": 1.9174222510669499,
"learning_rate": 5.558767217277127e-06,
"loss": 0.4907,
"step": 786
},
{
"epoch": 0.48,
"grad_norm": 1.8250592907674714,
"learning_rate": 5.548917818222818e-06,
"loss": 0.5262,
"step": 787
},
{
"epoch": 0.48,
"grad_norm": 1.966787711230393,
"learning_rate": 5.539066262673793e-06,
"loss": 0.6737,
"step": 788
},
{
"epoch": 0.48,
"grad_norm": 1.855643229521311,
"learning_rate": 5.529212589333163e-06,
"loss": 0.5955,
"step": 789
},
{
"epoch": 0.48,
"grad_norm": 2.1346079614371543,
"learning_rate": 5.5193568369123576e-06,
"loss": 0.5729,
"step": 790
},
{
"epoch": 0.48,
"grad_norm": 2.0515766199556706,
"learning_rate": 5.509499044130977e-06,
"loss": 0.5719,
"step": 791
},
{
"epoch": 0.48,
"grad_norm": 2.0107844202744336,
"learning_rate": 5.4996392497166375e-06,
"loss": 0.6046,
"step": 792
},
{
"epoch": 0.48,
"grad_norm": 1.9275125168152694,
"learning_rate": 5.489777492404818e-06,
"loss": 0.5201,
"step": 793
},
{
"epoch": 0.49,
"grad_norm": 2.0862073890728428,
"learning_rate": 5.479913810938706e-06,
"loss": 0.6474,
"step": 794
},
{
"epoch": 0.49,
"grad_norm": 1.947899506277237,
"learning_rate": 5.470048244069055e-06,
"loss": 0.6276,
"step": 795
},
{
"epoch": 0.49,
"grad_norm": 1.938490880806152,
"learning_rate": 5.46018083055402e-06,
"loss": 0.573,
"step": 796
},
{
"epoch": 0.49,
"grad_norm": 1.9073385387532762,
"learning_rate": 5.450311609159013e-06,
"loss": 0.5404,
"step": 797
},
{
"epoch": 0.49,
"grad_norm": 1.809537621124168,
"learning_rate": 5.4404406186565465e-06,
"loss": 0.541,
"step": 798
},
{
"epoch": 0.49,
"grad_norm": 1.9527437330399584,
"learning_rate": 5.430567897826086e-06,
"loss": 0.6258,
"step": 799
},
{
"epoch": 0.49,
"grad_norm": 1.7508264552185595,
"learning_rate": 5.420693485453893e-06,
"loss": 0.5149,
"step": 800
},
{
"epoch": 0.49,
"grad_norm": 2.0056514367983858,
"learning_rate": 5.410817420332876e-06,
"loss": 0.5755,
"step": 801
},
{
"epoch": 0.49,
"grad_norm": 2.035682132284417,
"learning_rate": 5.400939741262434e-06,
"loss": 0.6091,
"step": 802
},
{
"epoch": 0.49,
"grad_norm": 2.039715629729808,
"learning_rate": 5.39106048704831e-06,
"loss": 0.6284,
"step": 803
},
{
"epoch": 0.49,
"grad_norm": 1.9509583461522269,
"learning_rate": 5.381179696502432e-06,
"loss": 0.6541,
"step": 804
},
{
"epoch": 0.49,
"grad_norm": 2.101000418400194,
"learning_rate": 5.371297408442765e-06,
"loss": 0.719,
"step": 805
},
{
"epoch": 0.49,
"grad_norm": 2.1274177930152187,
"learning_rate": 5.361413661693157e-06,
"loss": 0.6933,
"step": 806
},
{
"epoch": 0.49,
"grad_norm": 1.9350531172343641,
"learning_rate": 5.351528495083187e-06,
"loss": 0.5427,
"step": 807
},
{
"epoch": 0.49,
"grad_norm": 1.9236019600576935,
"learning_rate": 5.341641947448011e-06,
"loss": 0.5427,
"step": 808
},
{
"epoch": 0.49,
"grad_norm": 1.896084411851985,
"learning_rate": 5.331754057628212e-06,
"loss": 0.5404,
"step": 809
},
{
"epoch": 0.5,
"grad_norm": 2.1943365136159345,
"learning_rate": 5.321864864469646e-06,
"loss": 0.6178,
"step": 810
},
{
"epoch": 0.5,
"grad_norm": 2.0214323469529307,
"learning_rate": 5.311974406823288e-06,
"loss": 0.5394,
"step": 811
},
{
"epoch": 0.5,
"grad_norm": 1.867537229859426,
"learning_rate": 5.3020827235450815e-06,
"loss": 0.5502,
"step": 812
},
{
"epoch": 0.5,
"grad_norm": 2.234343663037103,
"learning_rate": 5.292189853495784e-06,
"loss": 0.6277,
"step": 813
},
{
"epoch": 0.5,
"grad_norm": 2.013802275182187,
"learning_rate": 5.282295835540818e-06,
"loss": 0.6056,
"step": 814
},
{
"epoch": 0.5,
"grad_norm": 1.9513906655142625,
"learning_rate": 5.272400708550114e-06,
"loss": 0.5685,
"step": 815
},
{
"epoch": 0.5,
"grad_norm": 1.9338299630529332,
"learning_rate": 5.262504511397959e-06,
"loss": 0.592,
"step": 816
},
{
"epoch": 0.5,
"grad_norm": 1.7548541609411559,
"learning_rate": 5.252607282962843e-06,
"loss": 0.526,
"step": 817
},
{
"epoch": 0.5,
"grad_norm": 2.0616714683528667,
"learning_rate": 5.2427090621273114e-06,
"loss": 0.5529,
"step": 818
},
{
"epoch": 0.5,
"grad_norm": 1.7804791451461532,
"learning_rate": 5.232809887777807e-06,
"loss": 0.5478,
"step": 819
},
{
"epoch": 0.5,
"grad_norm": 1.826725496699057,
"learning_rate": 5.222909798804515e-06,
"loss": 0.5544,
"step": 820
},
{
"epoch": 0.5,
"grad_norm": 2.138811923531637,
"learning_rate": 5.213008834101218e-06,
"loss": 0.643,
"step": 821
},
{
"epoch": 0.5,
"grad_norm": 1.9873736384117076,
"learning_rate": 5.20310703256514e-06,
"loss": 0.6616,
"step": 822
},
{
"epoch": 0.5,
"grad_norm": 1.9208415386150814,
"learning_rate": 5.193204433096787e-06,
"loss": 0.5055,
"step": 823
},
{
"epoch": 0.5,
"grad_norm": 1.9813842045072931,
"learning_rate": 5.183301074599805e-06,
"loss": 0.6327,
"step": 824
},
{
"epoch": 0.5,
"grad_norm": 1.908652107185451,
"learning_rate": 5.173396995980818e-06,
"loss": 0.6359,
"step": 825
},
{
"epoch": 0.51,
"grad_norm": 2.0742468419024847,
"learning_rate": 5.1634922361492845e-06,
"loss": 0.6413,
"step": 826
},
{
"epoch": 0.51,
"grad_norm": 1.9352720515169122,
"learning_rate": 5.153586834017333e-06,
"loss": 0.4937,
"step": 827
},
{
"epoch": 0.51,
"grad_norm": 1.8636055456230387,
"learning_rate": 5.14368082849962e-06,
"loss": 0.5491,
"step": 828
},
{
"epoch": 0.51,
"grad_norm": 2.2525115422822255,
"learning_rate": 5.133774258513168e-06,
"loss": 0.6518,
"step": 829
},
{
"epoch": 0.51,
"grad_norm": 1.976929887451241,
"learning_rate": 5.123867162977224e-06,
"loss": 0.5955,
"step": 830
},
{
"epoch": 0.51,
"grad_norm": 2.1238491116296787,
"learning_rate": 5.1139595808130915e-06,
"loss": 0.5438,
"step": 831
},
{
"epoch": 0.51,
"grad_norm": 1.9460536517410532,
"learning_rate": 5.1040515509439926e-06,
"loss": 0.6111,
"step": 832
},
{
"epoch": 0.51,
"grad_norm": 1.8502322758352145,
"learning_rate": 5.0941431122949044e-06,
"loss": 0.5802,
"step": 833
},
{
"epoch": 0.51,
"grad_norm": 2.1312052237471226,
"learning_rate": 5.08423430379241e-06,
"loss": 0.6531,
"step": 834
},
{
"epoch": 0.51,
"grad_norm": 1.955948461366251,
"learning_rate": 5.074325164364549e-06,
"loss": 0.576,
"step": 835
},
{
"epoch": 0.51,
"grad_norm": 2.2603660355638016,
"learning_rate": 5.064415732940654e-06,
"loss": 0.6709,
"step": 836
},
{
"epoch": 0.51,
"grad_norm": 2.2004715834934854,
"learning_rate": 5.054506048451214e-06,
"loss": 0.7273,
"step": 837
},
{
"epoch": 0.51,
"grad_norm": 1.9625833391118874,
"learning_rate": 5.044596149827705e-06,
"loss": 0.5655,
"step": 838
},
{
"epoch": 0.51,
"grad_norm": 2.0367810488166196,
"learning_rate": 5.034686076002447e-06,
"loss": 0.5503,
"step": 839
},
{
"epoch": 0.51,
"grad_norm": 2.0781271470418865,
"learning_rate": 5.024775865908451e-06,
"loss": 0.5408,
"step": 840
},
{
"epoch": 0.51,
"grad_norm": 1.8174563416303517,
"learning_rate": 5.014865558479257e-06,
"loss": 0.5601,
"step": 841
},
{
"epoch": 0.51,
"grad_norm": 2.04027597278746,
"learning_rate": 5.004955192648791e-06,
"loss": 0.5129,
"step": 842
},
{
"epoch": 0.52,
"grad_norm": 1.929086047655504,
"learning_rate": 4.9950448073512096e-06,
"loss": 0.6012,
"step": 843
},
{
"epoch": 0.52,
"grad_norm": 2.0846476788174018,
"learning_rate": 4.9851344415207455e-06,
"loss": 0.5691,
"step": 844
},
{
"epoch": 0.52,
"grad_norm": 2.015199227101593,
"learning_rate": 4.975224134091551e-06,
"loss": 0.626,
"step": 845
},
{
"epoch": 0.52,
"grad_norm": 2.005830472801361,
"learning_rate": 4.965313923997552e-06,
"loss": 0.5876,
"step": 846
},
{
"epoch": 0.52,
"grad_norm": 2.11312125492647,
"learning_rate": 4.955403850172297e-06,
"loss": 0.5779,
"step": 847
},
{
"epoch": 0.52,
"grad_norm": 1.964404887222109,
"learning_rate": 4.945493951548788e-06,
"loss": 0.5264,
"step": 848
},
{
"epoch": 0.52,
"grad_norm": 1.632455019293396,
"learning_rate": 4.935584267059346e-06,
"loss": 0.4701,
"step": 849
},
{
"epoch": 0.52,
"grad_norm": 1.9988491228675496,
"learning_rate": 4.925674835635455e-06,
"loss": 0.604,
"step": 850
},
{
"epoch": 0.52,
"grad_norm": 1.9517240575905959,
"learning_rate": 4.915765696207591e-06,
"loss": 0.6134,
"step": 851
},
{
"epoch": 0.52,
"grad_norm": 1.8771210243391112,
"learning_rate": 4.905856887705097e-06,
"loss": 0.5352,
"step": 852
},
{
"epoch": 0.52,
"grad_norm": 1.9010355843007118,
"learning_rate": 4.895948449056008e-06,
"loss": 0.5825,
"step": 853
},
{
"epoch": 0.52,
"grad_norm": 1.8640061544368143,
"learning_rate": 4.886040419186909e-06,
"loss": 0.536,
"step": 854
},
{
"epoch": 0.52,
"grad_norm": 2.127850537210119,
"learning_rate": 4.876132837022778e-06,
"loss": 0.7484,
"step": 855
},
{
"epoch": 0.52,
"grad_norm": 2.035416663771683,
"learning_rate": 4.866225741486833e-06,
"loss": 0.5556,
"step": 856
},
{
"epoch": 0.52,
"grad_norm": 1.8112553709887884,
"learning_rate": 4.856319171500382e-06,
"loss": 0.5089,
"step": 857
},
{
"epoch": 0.52,
"grad_norm": 1.7461261116319204,
"learning_rate": 4.846413165982668e-06,
"loss": 0.5798,
"step": 858
},
{
"epoch": 0.53,
"grad_norm": 1.8200383652508103,
"learning_rate": 4.836507763850717e-06,
"loss": 0.5644,
"step": 859
},
{
"epoch": 0.53,
"grad_norm": 2.0210685681015517,
"learning_rate": 4.826603004019182e-06,
"loss": 0.6028,
"step": 860
},
{
"epoch": 0.53,
"grad_norm": 2.0488262467671654,
"learning_rate": 4.816698925400197e-06,
"loss": 0.6634,
"step": 861
},
{
"epoch": 0.53,
"grad_norm": 1.9045043503411678,
"learning_rate": 4.806795566903214e-06,
"loss": 0.5246,
"step": 862
},
{
"epoch": 0.53,
"grad_norm": 1.903132223526836,
"learning_rate": 4.796892967434861e-06,
"loss": 0.5501,
"step": 863
},
{
"epoch": 0.53,
"grad_norm": 1.9775121455691418,
"learning_rate": 4.7869911658987825e-06,
"loss": 0.5821,
"step": 864
},
{
"epoch": 0.53,
"grad_norm": 2.134944135303822,
"learning_rate": 4.777090201195486e-06,
"loss": 0.5914,
"step": 865
},
{
"epoch": 0.53,
"grad_norm": 1.8017818510043424,
"learning_rate": 4.767190112222196e-06,
"loss": 0.5215,
"step": 866
},
{
"epoch": 0.53,
"grad_norm": 1.8986193975250871,
"learning_rate": 4.757290937872689e-06,
"loss": 0.5674,
"step": 867
},
{
"epoch": 0.53,
"grad_norm": 2.198006939268661,
"learning_rate": 4.747392717037158e-06,
"loss": 0.6696,
"step": 868
},
{
"epoch": 0.53,
"grad_norm": 1.9844558939372063,
"learning_rate": 4.737495488602044e-06,
"loss": 0.6495,
"step": 869
},
{
"epoch": 0.53,
"grad_norm": 1.8377231311260462,
"learning_rate": 4.727599291449887e-06,
"loss": 0.526,
"step": 870
},
{
"epoch": 0.53,
"grad_norm": 2.1843148217052795,
"learning_rate": 4.717704164459182e-06,
"loss": 0.6569,
"step": 871
},
{
"epoch": 0.53,
"grad_norm": 2.0731163232163525,
"learning_rate": 4.707810146504217e-06,
"loss": 0.6277,
"step": 872
},
{
"epoch": 0.53,
"grad_norm": 1.8835943474176664,
"learning_rate": 4.697917276454919e-06,
"loss": 0.5287,
"step": 873
},
{
"epoch": 0.53,
"grad_norm": 2.0281931145371828,
"learning_rate": 4.688025593176713e-06,
"loss": 0.5604,
"step": 874
},
{
"epoch": 0.54,
"grad_norm": 1.9088774231682988,
"learning_rate": 4.6781351355303555e-06,
"loss": 0.554,
"step": 875
},
{
"epoch": 0.54,
"grad_norm": 1.9551048202904684,
"learning_rate": 4.668245942371789e-06,
"loss": 0.6467,
"step": 876
},
{
"epoch": 0.54,
"grad_norm": 2.067313801101298,
"learning_rate": 4.658358052551992e-06,
"loss": 0.5992,
"step": 877
},
{
"epoch": 0.54,
"grad_norm": 1.902021092998417,
"learning_rate": 4.648471504916815e-06,
"loss": 0.5812,
"step": 878
},
{
"epoch": 0.54,
"grad_norm": 1.8922807148527254,
"learning_rate": 4.638586338306845e-06,
"loss": 0.5374,
"step": 879
},
{
"epoch": 0.54,
"grad_norm": 1.884819760587392,
"learning_rate": 4.628702591557237e-06,
"loss": 0.5056,
"step": 880
},
{
"epoch": 0.54,
"grad_norm": 1.809064236289934,
"learning_rate": 4.61882030349757e-06,
"loss": 0.5311,
"step": 881
},
{
"epoch": 0.54,
"grad_norm": 1.939206185133062,
"learning_rate": 4.60893951295169e-06,
"loss": 0.5821,
"step": 882
},
{
"epoch": 0.54,
"grad_norm": 2.1796594240586518,
"learning_rate": 4.599060258737567e-06,
"loss": 0.6658,
"step": 883
},
{
"epoch": 0.54,
"grad_norm": 2.103575194199594,
"learning_rate": 4.589182579667125e-06,
"loss": 0.6145,
"step": 884
},
{
"epoch": 0.54,
"grad_norm": 2.159018112419537,
"learning_rate": 4.579306514546107e-06,
"loss": 0.6203,
"step": 885
},
{
"epoch": 0.54,
"grad_norm": 1.9460192510920176,
"learning_rate": 4.569432102173917e-06,
"loss": 0.5578,
"step": 886
},
{
"epoch": 0.54,
"grad_norm": 1.8654041708472648,
"learning_rate": 4.559559381343455e-06,
"loss": 0.528,
"step": 887
},
{
"epoch": 0.54,
"grad_norm": 1.9680995454358476,
"learning_rate": 4.5496883908409905e-06,
"loss": 0.6183,
"step": 888
},
{
"epoch": 0.54,
"grad_norm": 1.9853793075023518,
"learning_rate": 4.539819169445982e-06,
"loss": 0.5658,
"step": 889
},
{
"epoch": 0.54,
"grad_norm": 2.1272265558554695,
"learning_rate": 4.529951755930946e-06,
"loss": 0.6413,
"step": 890
},
{
"epoch": 0.54,
"grad_norm": 2.0536110493039827,
"learning_rate": 4.5200861890612955e-06,
"loss": 0.5394,
"step": 891
},
{
"epoch": 0.55,
"grad_norm": 1.9694789258484728,
"learning_rate": 4.510222507595185e-06,
"loss": 0.5543,
"step": 892
},
{
"epoch": 0.55,
"grad_norm": 2.0637295858493214,
"learning_rate": 4.500360750283363e-06,
"loss": 0.6254,
"step": 893
},
{
"epoch": 0.55,
"grad_norm": 2.2382680799881762,
"learning_rate": 4.490500955869025e-06,
"loss": 0.5594,
"step": 894
},
{
"epoch": 0.55,
"grad_norm": 1.9215522609744207,
"learning_rate": 4.480643163087644e-06,
"loss": 0.5565,
"step": 895
},
{
"epoch": 0.55,
"grad_norm": 1.8992138968072834,
"learning_rate": 4.4707874106668406e-06,
"loss": 0.5549,
"step": 896
},
{
"epoch": 0.55,
"grad_norm": 2.053529626956222,
"learning_rate": 4.460933737326208e-06,
"loss": 0.5997,
"step": 897
},
{
"epoch": 0.55,
"grad_norm": 1.9545793745044062,
"learning_rate": 4.4510821817771825e-06,
"loss": 0.5397,
"step": 898
},
{
"epoch": 0.55,
"grad_norm": 2.085152918955289,
"learning_rate": 4.441232782722875e-06,
"loss": 0.6005,
"step": 899
},
{
"epoch": 0.55,
"grad_norm": 2.1202774407600926,
"learning_rate": 4.431385578857924e-06,
"loss": 0.5819,
"step": 900
},
{
"epoch": 0.55,
"grad_norm": 1.8352961154836602,
"learning_rate": 4.421540608868344e-06,
"loss": 0.5951,
"step": 901
},
{
"epoch": 0.55,
"grad_norm": 2.1495914883931904,
"learning_rate": 4.411697911431376e-06,
"loss": 0.6428,
"step": 902
},
{
"epoch": 0.55,
"grad_norm": 2.1564746769491876,
"learning_rate": 4.4018575252153295e-06,
"loss": 0.6402,
"step": 903
},
{
"epoch": 0.55,
"grad_norm": 1.8954514160537663,
"learning_rate": 4.392019488879438e-06,
"loss": 0.6072,
"step": 904
},
{
"epoch": 0.55,
"grad_norm": 1.8105483820540889,
"learning_rate": 4.382183841073698e-06,
"loss": 0.5387,
"step": 905
},
{
"epoch": 0.55,
"grad_norm": 1.9485751025827374,
"learning_rate": 4.372350620438728e-06,
"loss": 0.531,
"step": 906
},
{
"epoch": 0.55,
"grad_norm": 1.9608862157969138,
"learning_rate": 4.362519865605608e-06,
"loss": 0.5402,
"step": 907
},
{
"epoch": 0.56,
"grad_norm": 1.9691930324266667,
"learning_rate": 4.352691615195729e-06,
"loss": 0.5624,
"step": 908
},
{
"epoch": 0.56,
"grad_norm": 1.8973081884631189,
"learning_rate": 4.342865907820647e-06,
"loss": 0.5595,
"step": 909
},
{
"epoch": 0.56,
"grad_norm": 1.9717587970990957,
"learning_rate": 4.333042782081926e-06,
"loss": 0.662,
"step": 910
},
{
"epoch": 0.56,
"grad_norm": 2.007138023783923,
"learning_rate": 4.323222276570984e-06,
"loss": 0.5723,
"step": 911
},
{
"epoch": 0.56,
"grad_norm": 2.22307977714878,
"learning_rate": 4.313404429868952e-06,
"loss": 0.6789,
"step": 912
},
{
"epoch": 0.56,
"grad_norm": 2.190452780908872,
"learning_rate": 4.303589280546513e-06,
"loss": 0.6045,
"step": 913
},
{
"epoch": 0.56,
"grad_norm": 1.9051078417596634,
"learning_rate": 4.293776867163746e-06,
"loss": 0.5001,
"step": 914
},
{
"epoch": 0.56,
"grad_norm": 2.098076895433394,
"learning_rate": 4.283967228269993e-06,
"loss": 0.6982,
"step": 915
},
{
"epoch": 0.56,
"grad_norm": 2.135564782739449,
"learning_rate": 4.274160402403689e-06,
"loss": 0.6086,
"step": 916
},
{
"epoch": 0.56,
"grad_norm": 2.322595532423094,
"learning_rate": 4.264356428092217e-06,
"loss": 0.6274,
"step": 917
},
{
"epoch": 0.56,
"grad_norm": 1.9740445661634287,
"learning_rate": 4.254555343851762e-06,
"loss": 0.6254,
"step": 918
},
{
"epoch": 0.56,
"grad_norm": 2.1042432186823965,
"learning_rate": 4.24475718818715e-06,
"loss": 0.4925,
"step": 919
},
{
"epoch": 0.56,
"grad_norm": 2.092018762885259,
"learning_rate": 4.234961999591706e-06,
"loss": 0.638,
"step": 920
},
{
"epoch": 0.56,
"grad_norm": 1.819978568221369,
"learning_rate": 4.2251698165470965e-06,
"loss": 0.5285,
"step": 921
},
{
"epoch": 0.56,
"grad_norm": 2.0575179276629685,
"learning_rate": 4.215380677523179e-06,
"loss": 0.5426,
"step": 922
},
{
"epoch": 0.56,
"grad_norm": 1.8926418567324161,
"learning_rate": 4.205594620977854e-06,
"loss": 0.5378,
"step": 923
},
{
"epoch": 0.56,
"grad_norm": 2.055749823208842,
"learning_rate": 4.195811685356914e-06,
"loss": 0.5888,
"step": 924
},
{
"epoch": 0.57,
"grad_norm": 2.0051242608320745,
"learning_rate": 4.186031909093884e-06,
"loss": 0.5652,
"step": 925
},
{
"epoch": 0.57,
"grad_norm": 1.8592910852701108,
"learning_rate": 4.176255330609885e-06,
"loss": 0.487,
"step": 926
},
{
"epoch": 0.57,
"grad_norm": 2.2411604400071674,
"learning_rate": 4.16648198831347e-06,
"loss": 0.6867,
"step": 927
},
{
"epoch": 0.57,
"grad_norm": 2.072512037016527,
"learning_rate": 4.156711920600479e-06,
"loss": 0.6362,
"step": 928
},
{
"epoch": 0.57,
"grad_norm": 1.9250096482155696,
"learning_rate": 4.146945165853888e-06,
"loss": 0.4271,
"step": 929
},
{
"epoch": 0.57,
"grad_norm": 2.1874561938184898,
"learning_rate": 4.137181762443658e-06,
"loss": 0.5753,
"step": 930
},
{
"epoch": 0.57,
"grad_norm": 1.8676645313834617,
"learning_rate": 4.127421748726583e-06,
"loss": 0.5137,
"step": 931
},
{
"epoch": 0.57,
"grad_norm": 2.211228766881823,
"learning_rate": 4.117665163046141e-06,
"loss": 0.6821,
"step": 932
},
{
"epoch": 0.57,
"grad_norm": 2.095689790428209,
"learning_rate": 4.107912043732342e-06,
"loss": 0.5183,
"step": 933
},
{
"epoch": 0.57,
"grad_norm": 2.3214361789624944,
"learning_rate": 4.098162429101576e-06,
"loss": 0.588,
"step": 934
},
{
"epoch": 0.57,
"grad_norm": 1.9276332877997406,
"learning_rate": 4.088416357456471e-06,
"loss": 0.5425,
"step": 935
},
{
"epoch": 0.57,
"grad_norm": 1.8872033903192418,
"learning_rate": 4.0786738670857254e-06,
"loss": 0.5275,
"step": 936
},
{
"epoch": 0.57,
"grad_norm": 2.173402848844034,
"learning_rate": 4.068934996263978e-06,
"loss": 0.6501,
"step": 937
},
{
"epoch": 0.57,
"grad_norm": 2.216415986512064,
"learning_rate": 4.059199783251644e-06,
"loss": 0.5988,
"step": 938
},
{
"epoch": 0.57,
"grad_norm": 1.8838143839651054,
"learning_rate": 4.049468266294765e-06,
"loss": 0.6169,
"step": 939
},
{
"epoch": 0.57,
"grad_norm": 2.028659010558423,
"learning_rate": 4.039740483624869e-06,
"loss": 0.6277,
"step": 940
},
{
"epoch": 0.58,
"grad_norm": 2.0194050107448303,
"learning_rate": 4.030016473458805e-06,
"loss": 0.6028,
"step": 941
},
{
"epoch": 0.58,
"grad_norm": 2.1557586383454574,
"learning_rate": 4.020296273998609e-06,
"loss": 0.6176,
"step": 942
},
{
"epoch": 0.58,
"grad_norm": 1.8682573092015993,
"learning_rate": 4.010579923431346e-06,
"loss": 0.5763,
"step": 943
},
{
"epoch": 0.58,
"grad_norm": 2.1548920939456093,
"learning_rate": 4.00086745992895e-06,
"loss": 0.6331,
"step": 944
},
{
"epoch": 0.58,
"grad_norm": 2.267061719065842,
"learning_rate": 3.991158921648096e-06,
"loss": 0.7066,
"step": 945
},
{
"epoch": 0.58,
"grad_norm": 1.9935522203843874,
"learning_rate": 3.981454346730036e-06,
"loss": 0.5729,
"step": 946
},
{
"epoch": 0.58,
"grad_norm": 1.8026214689706248,
"learning_rate": 3.9717537733004415e-06,
"loss": 0.5706,
"step": 947
},
{
"epoch": 0.58,
"grad_norm": 1.904682640150856,
"learning_rate": 3.9620572394692776e-06,
"loss": 0.5683,
"step": 948
},
{
"epoch": 0.58,
"grad_norm": 2.136529894979737,
"learning_rate": 3.952364783330632e-06,
"loss": 0.651,
"step": 949
},
{
"epoch": 0.58,
"grad_norm": 2.233298404795316,
"learning_rate": 3.942676442962569e-06,
"loss": 0.5268,
"step": 950
},
{
"epoch": 0.58,
"grad_norm": 2.2946703794023486,
"learning_rate": 3.932992256426995e-06,
"loss": 0.676,
"step": 951
},
{
"epoch": 0.58,
"grad_norm": 1.9555055432357573,
"learning_rate": 3.923312261769485e-06,
"loss": 0.598,
"step": 952
},
{
"epoch": 0.58,
"grad_norm": 2.3078155728224212,
"learning_rate": 3.913636497019154e-06,
"loss": 0.6872,
"step": 953
},
{
"epoch": 0.58,
"grad_norm": 1.8574615796272702,
"learning_rate": 3.903965000188495e-06,
"loss": 0.5518,
"step": 954
},
{
"epoch": 0.58,
"grad_norm": 2.213864265081535,
"learning_rate": 3.894297809273237e-06,
"loss": 0.5652,
"step": 955
},
{
"epoch": 0.58,
"grad_norm": 1.8234823525571142,
"learning_rate": 3.884634962252189e-06,
"loss": 0.4526,
"step": 956
},
{
"epoch": 0.59,
"grad_norm": 1.8652657269096666,
"learning_rate": 3.8749764970871e-06,
"loss": 0.5418,
"step": 957
},
{
"epoch": 0.59,
"grad_norm": 2.0976734402107224,
"learning_rate": 3.8653224517224965e-06,
"loss": 0.5637,
"step": 958
},
{
"epoch": 0.59,
"grad_norm": 2.0254191334608826,
"learning_rate": 3.855672864085549e-06,
"loss": 0.5265,
"step": 959
},
{
"epoch": 0.59,
"grad_norm": 1.8196043256300247,
"learning_rate": 3.846027772085912e-06,
"loss": 0.5179,
"step": 960
},
{
"epoch": 0.59,
"grad_norm": 1.931679412683687,
"learning_rate": 3.836387213615576e-06,
"loss": 0.5646,
"step": 961
},
{
"epoch": 0.59,
"grad_norm": 1.9232046934900524,
"learning_rate": 3.826751226548725e-06,
"loss": 0.4793,
"step": 962
},
{
"epoch": 0.59,
"grad_norm": 1.8622914220495714,
"learning_rate": 3.817119848741579e-06,
"loss": 0.5253,
"step": 963
},
{
"epoch": 0.59,
"grad_norm": 2.294972552628036,
"learning_rate": 3.8074931180322544e-06,
"loss": 0.6577,
"step": 964
},
{
"epoch": 0.59,
"grad_norm": 1.9186117148347783,
"learning_rate": 3.7978710722406113e-06,
"loss": 0.5449,
"step": 965
},
{
"epoch": 0.59,
"grad_norm": 2.1934148088583014,
"learning_rate": 3.7882537491680992e-06,
"loss": 0.5944,
"step": 966
},
{
"epoch": 0.59,
"grad_norm": 2.0440451523844816,
"learning_rate": 3.7786411865976167e-06,
"loss": 0.5916,
"step": 967
},
{
"epoch": 0.59,
"grad_norm": 2.18697015319259,
"learning_rate": 3.7690334222933654e-06,
"loss": 0.5679,
"step": 968
},
{
"epoch": 0.59,
"grad_norm": 2.3700476225659957,
"learning_rate": 3.7594304940006846e-06,
"loss": 0.7297,
"step": 969
},
{
"epoch": 0.59,
"grad_norm": 2.032165887398491,
"learning_rate": 3.7498324394459253e-06,
"loss": 0.5391,
"step": 970
},
{
"epoch": 0.59,
"grad_norm": 2.0047253114127006,
"learning_rate": 3.7402392963362878e-06,
"loss": 0.6912,
"step": 971
},
{
"epoch": 0.59,
"grad_norm": 1.925349824119937,
"learning_rate": 3.7306511023596743e-06,
"loss": 0.4714,
"step": 972
},
{
"epoch": 0.59,
"grad_norm": 1.8222273181991067,
"learning_rate": 3.721067895184549e-06,
"loss": 0.5714,
"step": 973
},
{
"epoch": 0.6,
"grad_norm": 1.8642832971234993,
"learning_rate": 3.711489712459779e-06,
"loss": 0.5697,
"step": 974
},
{
"epoch": 0.6,
"grad_norm": 2.028076998382638,
"learning_rate": 3.7019165918144974e-06,
"loss": 0.6216,
"step": 975
},
{
"epoch": 0.6,
"grad_norm": 2.272782312304872,
"learning_rate": 3.6923485708579487e-06,
"loss": 0.4969,
"step": 976
},
{
"epoch": 0.6,
"grad_norm": 2.1112583503988525,
"learning_rate": 3.6827856871793393e-06,
"loss": 0.5942,
"step": 977
},
{
"epoch": 0.6,
"grad_norm": 2.0753754185170243,
"learning_rate": 3.673227978347698e-06,
"loss": 0.5954,
"step": 978
},
{
"epoch": 0.6,
"grad_norm": 1.8568456850511224,
"learning_rate": 3.6636754819117213e-06,
"loss": 0.5574,
"step": 979
},
{
"epoch": 0.6,
"grad_norm": 2.165971383751749,
"learning_rate": 3.6541282353996275e-06,
"loss": 0.5837,
"step": 980
},
{
"epoch": 0.6,
"grad_norm": 1.9460733583421799,
"learning_rate": 3.6445862763190104e-06,
"loss": 0.5682,
"step": 981
},
{
"epoch": 0.6,
"grad_norm": 2.1328727066329525,
"learning_rate": 3.635049642156692e-06,
"loss": 0.6156,
"step": 982
},
{
"epoch": 0.6,
"grad_norm": 2.206066665837199,
"learning_rate": 3.6255183703785735e-06,
"loss": 0.5946,
"step": 983
},
{
"epoch": 0.6,
"grad_norm": 2.0007589219567854,
"learning_rate": 3.615992498429493e-06,
"loss": 0.5819,
"step": 984
},
{
"epoch": 0.6,
"grad_norm": 2.038096288010089,
"learning_rate": 3.6064720637330673e-06,
"loss": 0.5356,
"step": 985
},
{
"epoch": 0.6,
"grad_norm": 1.978722860188176,
"learning_rate": 3.5969571036915596e-06,
"loss": 0.5895,
"step": 986
},
{
"epoch": 0.6,
"grad_norm": 1.9480013936453797,
"learning_rate": 3.587447655685724e-06,
"loss": 0.5308,
"step": 987
},
{
"epoch": 0.6,
"grad_norm": 2.1945763438024453,
"learning_rate": 3.5779437570746536e-06,
"loss": 0.6562,
"step": 988
},
{
"epoch": 0.6,
"grad_norm": 2.048811833634992,
"learning_rate": 3.568445445195647e-06,
"loss": 0.5449,
"step": 989
},
{
"epoch": 0.61,
"grad_norm": 1.9036038375948279,
"learning_rate": 3.5589527573640537e-06,
"loss": 0.5552,
"step": 990
},
{
"epoch": 0.61,
"grad_norm": 1.5575371034983034,
"learning_rate": 3.549465730873124e-06,
"loss": 0.4615,
"step": 991
},
{
"epoch": 0.61,
"grad_norm": 1.981358268031162,
"learning_rate": 3.5399844029938724e-06,
"loss": 0.5655,
"step": 992
},
{
"epoch": 0.61,
"grad_norm": 1.9338027875151251,
"learning_rate": 3.5305088109749196e-06,
"loss": 0.4972,
"step": 993
},
{
"epoch": 0.61,
"grad_norm": 1.9485561300488783,
"learning_rate": 3.5210389920423582e-06,
"loss": 0.5759,
"step": 994
},
{
"epoch": 0.61,
"grad_norm": 1.860842546149849,
"learning_rate": 3.511574983399599e-06,
"loss": 0.5328,
"step": 995
},
{
"epoch": 0.61,
"grad_norm": 2.1337146376104985,
"learning_rate": 3.5021168222272227e-06,
"loss": 0.6441,
"step": 996
},
{
"epoch": 0.61,
"grad_norm": 2.041043993151766,
"learning_rate": 3.49266454568284e-06,
"loss": 0.543,
"step": 997
},
{
"epoch": 0.61,
"grad_norm": 1.9008324125548013,
"learning_rate": 3.4832181909009467e-06,
"loss": 0.5582,
"step": 998
},
{
"epoch": 0.61,
"grad_norm": 2.0009175266246113,
"learning_rate": 3.473777794992765e-06,
"loss": 0.5657,
"step": 999
},
{
"epoch": 0.61,
"grad_norm": 2.0488988575321274,
"learning_rate": 3.4643433950461175e-06,
"loss": 0.5898,
"step": 1000
},
{
"epoch": 0.61,
"grad_norm": 2.164963300418657,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.5981,
"step": 1001
},
{
"epoch": 0.61,
"grad_norm": 1.9763623413542644,
"learning_rate": 3.4454927312707633e-06,
"loss": 0.6106,
"step": 1002
},
{
"epoch": 0.61,
"grad_norm": 1.9962996245557485,
"learning_rate": 3.43607654149933e-06,
"loss": 0.5782,
"step": 1003
},
{
"epoch": 0.61,
"grad_norm": 2.2151735776108,
"learning_rate": 3.4266664958036838e-06,
"loss": 0.5685,
"step": 1004
},
{
"epoch": 0.61,
"grad_norm": 2.018094880353655,
"learning_rate": 3.417262631152409e-06,
"loss": 0.528,
"step": 1005
},
{
"epoch": 0.62,
"grad_norm": 1.7630000234879197,
"learning_rate": 3.4078649844898045e-06,
"loss": 0.5205,
"step": 1006
},
{
"epoch": 0.62,
"grad_norm": 1.7955549765689147,
"learning_rate": 3.3984735927357414e-06,
"loss": 0.4731,
"step": 1007
},
{
"epoch": 0.62,
"grad_norm": 1.8797578836131676,
"learning_rate": 3.3890884927855185e-06,
"loss": 0.5603,
"step": 1008
},
{
"epoch": 0.62,
"grad_norm": 1.909711782728961,
"learning_rate": 3.3797097215097173e-06,
"loss": 0.5129,
"step": 1009
},
{
"epoch": 0.62,
"grad_norm": 1.8389954494108633,
"learning_rate": 3.3703373157540525e-06,
"loss": 0.5193,
"step": 1010
},
{
"epoch": 0.62,
"grad_norm": 1.8474429582879734,
"learning_rate": 3.3609713123392352e-06,
"loss": 0.4737,
"step": 1011
},
{
"epoch": 0.62,
"grad_norm": 2.3345478444238354,
"learning_rate": 3.3516117480608234e-06,
"loss": 0.7071,
"step": 1012
},
{
"epoch": 0.62,
"grad_norm": 2.175175435777652,
"learning_rate": 3.3422586596890742e-06,
"loss": 0.5722,
"step": 1013
},
{
"epoch": 0.62,
"grad_norm": 2.1305366509524055,
"learning_rate": 3.3329120839688102e-06,
"loss": 0.6892,
"step": 1014
},
{
"epoch": 0.62,
"grad_norm": 2.0052757824888037,
"learning_rate": 3.32357205761926e-06,
"loss": 0.5995,
"step": 1015
},
{
"epoch": 0.62,
"grad_norm": 2.0159203979737668,
"learning_rate": 3.314238617333928e-06,
"loss": 0.6025,
"step": 1016
},
{
"epoch": 0.62,
"grad_norm": 2.012244600426063,
"learning_rate": 3.304911799780445e-06,
"loss": 0.5673,
"step": 1017
},
{
"epoch": 0.62,
"grad_norm": 1.9361641471312003,
"learning_rate": 3.295591641600418e-06,
"loss": 0.5838,
"step": 1018
},
{
"epoch": 0.62,
"grad_norm": 1.8304544679156056,
"learning_rate": 3.2862781794092964e-06,
"loss": 0.5585,
"step": 1019
},
{
"epoch": 0.62,
"grad_norm": 2.149167385207215,
"learning_rate": 3.2769714497962235e-06,
"loss": 0.5886,
"step": 1020
},
{
"epoch": 0.62,
"grad_norm": 2.04272728052408,
"learning_rate": 3.267671489323889e-06,
"loss": 0.5355,
"step": 1021
},
{
"epoch": 0.62,
"grad_norm": 1.9965937830873703,
"learning_rate": 3.258378334528393e-06,
"loss": 0.5976,
"step": 1022
},
{
"epoch": 0.63,
"grad_norm": 2.126700139225219,
"learning_rate": 3.249092021919099e-06,
"loss": 0.5431,
"step": 1023
},
{
"epoch": 0.63,
"grad_norm": 1.8129449899934444,
"learning_rate": 3.239812587978485e-06,
"loss": 0.5674,
"step": 1024
},
{
"epoch": 0.63,
"grad_norm": 2.0383597018537865,
"learning_rate": 3.2305400691620126e-06,
"loss": 0.6182,
"step": 1025
},
{
"epoch": 0.63,
"grad_norm": 2.050473137758968,
"learning_rate": 3.221274501897968e-06,
"loss": 0.5404,
"step": 1026
},
{
"epoch": 0.63,
"grad_norm": 1.9168650695196385,
"learning_rate": 3.212015922587335e-06,
"loss": 0.5563,
"step": 1027
},
{
"epoch": 0.63,
"grad_norm": 1.98106980221109,
"learning_rate": 3.2027643676036402e-06,
"loss": 0.5734,
"step": 1028
},
{
"epoch": 0.63,
"grad_norm": 1.847757494976792,
"learning_rate": 3.193519873292815e-06,
"loss": 0.5501,
"step": 1029
},
{
"epoch": 0.63,
"grad_norm": 2.0870831968238965,
"learning_rate": 3.1842824759730518e-06,
"loss": 0.5744,
"step": 1030
},
{
"epoch": 0.63,
"grad_norm": 1.8249978023375093,
"learning_rate": 3.1750522119346626e-06,
"loss": 0.5438,
"step": 1031
},
{
"epoch": 0.63,
"grad_norm": 1.9231526649033666,
"learning_rate": 3.165829117439935e-06,
"loss": 0.529,
"step": 1032
},
{
"epoch": 0.63,
"grad_norm": 2.0472491835741873,
"learning_rate": 3.1566132287229876e-06,
"loss": 0.5332,
"step": 1033
},
{
"epoch": 0.63,
"grad_norm": 2.1920921831656375,
"learning_rate": 3.1474045819896374e-06,
"loss": 0.5604,
"step": 1034
},
{
"epoch": 0.63,
"grad_norm": 1.9909841140079707,
"learning_rate": 3.1382032134172395e-06,
"loss": 0.5111,
"step": 1035
},
{
"epoch": 0.63,
"grad_norm": 1.942915636194669,
"learning_rate": 3.129009159154567e-06,
"loss": 0.5641,
"step": 1036
},
{
"epoch": 0.63,
"grad_norm": 2.11037815260772,
"learning_rate": 3.1198224553216472e-06,
"loss": 0.593,
"step": 1037
},
{
"epoch": 0.63,
"grad_norm": 2.1107440825628494,
"learning_rate": 3.1106431380096374e-06,
"loss": 0.5313,
"step": 1038
},
{
"epoch": 0.64,
"grad_norm": 2.0760892689013324,
"learning_rate": 3.101471243280677e-06,
"loss": 0.5261,
"step": 1039
},
{
"epoch": 0.64,
"grad_norm": 1.9189444275549399,
"learning_rate": 3.092306807167738e-06,
"loss": 0.5436,
"step": 1040
},
{
"epoch": 0.64,
"grad_norm": 2.0604725562399913,
"learning_rate": 3.083149865674496e-06,
"loss": 0.6429,
"step": 1041
},
{
"epoch": 0.64,
"grad_norm": 1.8122634340035755,
"learning_rate": 3.0740004547751824e-06,
"loss": 0.5544,
"step": 1042
},
{
"epoch": 0.64,
"grad_norm": 1.8800400262197519,
"learning_rate": 3.0648586104144397e-06,
"loss": 0.5622,
"step": 1043
},
{
"epoch": 0.64,
"grad_norm": 2.0644669592689127,
"learning_rate": 3.0557243685071874e-06,
"loss": 0.6323,
"step": 1044
},
{
"epoch": 0.64,
"grad_norm": 2.127783852090682,
"learning_rate": 3.0465977649384813e-06,
"loss": 0.6729,
"step": 1045
},
{
"epoch": 0.64,
"grad_norm": 1.7585798179204468,
"learning_rate": 3.03747883556336e-06,
"loss": 0.5283,
"step": 1046
},
{
"epoch": 0.64,
"grad_norm": 2.122048769461784,
"learning_rate": 3.0283676162067234e-06,
"loss": 0.6467,
"step": 1047
},
{
"epoch": 0.64,
"grad_norm": 2.0797175331249282,
"learning_rate": 3.0192641426631707e-06,
"loss": 0.5904,
"step": 1048
},
{
"epoch": 0.64,
"grad_norm": 1.911144315914054,
"learning_rate": 3.010168450696879e-06,
"loss": 0.5504,
"step": 1049
},
{
"epoch": 0.64,
"grad_norm": 1.733484390437723,
"learning_rate": 3.0010805760414544e-06,
"loss": 0.4998,
"step": 1050
},
{
"epoch": 0.64,
"grad_norm": 2.102048422163863,
"learning_rate": 2.9920005543997847e-06,
"loss": 0.5276,
"step": 1051
},
{
"epoch": 0.64,
"grad_norm": 1.9804256500678663,
"learning_rate": 2.982928421443914e-06,
"loss": 0.4796,
"step": 1052
},
{
"epoch": 0.64,
"grad_norm": 2.058252604257508,
"learning_rate": 2.9738642128148887e-06,
"loss": 0.5238,
"step": 1053
},
{
"epoch": 0.64,
"grad_norm": 1.9511201686457833,
"learning_rate": 2.9648079641226267e-06,
"loss": 0.5746,
"step": 1054
},
{
"epoch": 0.65,
"grad_norm": 2.1199181959460947,
"learning_rate": 2.955759710945773e-06,
"loss": 0.5502,
"step": 1055
},
{
"epoch": 0.65,
"grad_norm": 2.3766960612858234,
"learning_rate": 2.946719488831564e-06,
"loss": 0.518,
"step": 1056
},
{
"epoch": 0.65,
"grad_norm": 1.960044833847147,
"learning_rate": 2.93768733329568e-06,
"loss": 0.5366,
"step": 1057
},
{
"epoch": 0.65,
"grad_norm": 2.1052778798118643,
"learning_rate": 2.928663279822116e-06,
"loss": 0.6107,
"step": 1058
},
{
"epoch": 0.65,
"grad_norm": 1.9902325930624214,
"learning_rate": 2.919647363863031e-06,
"loss": 0.5625,
"step": 1059
},
{
"epoch": 0.65,
"grad_norm": 1.8937657013896414,
"learning_rate": 2.910639620838619e-06,
"loss": 0.5431,
"step": 1060
},
{
"epoch": 0.65,
"grad_norm": 2.081716192850459,
"learning_rate": 2.901640086136969e-06,
"loss": 0.504,
"step": 1061
},
{
"epoch": 0.65,
"grad_norm": 2.1189522676194037,
"learning_rate": 2.892648795113912e-06,
"loss": 0.6598,
"step": 1062
},
{
"epoch": 0.65,
"grad_norm": 1.907842433021825,
"learning_rate": 2.8836657830929048e-06,
"loss": 0.5169,
"step": 1063
},
{
"epoch": 0.65,
"grad_norm": 1.9520656325829888,
"learning_rate": 2.874691085364868e-06,
"loss": 0.5536,
"step": 1064
},
{
"epoch": 0.65,
"grad_norm": 2.072673902863609,
"learning_rate": 2.865724737188067e-06,
"loss": 0.579,
"step": 1065
},
{
"epoch": 0.65,
"grad_norm": 2.0924189620942775,
"learning_rate": 2.856766773787959e-06,
"loss": 0.5745,
"step": 1066
},
{
"epoch": 0.65,
"grad_norm": 1.7781952691539604,
"learning_rate": 2.847817230357066e-06,
"loss": 0.5756,
"step": 1067
},
{
"epoch": 0.65,
"grad_norm": 2.034790297885924,
"learning_rate": 2.838876142054825e-06,
"loss": 0.4909,
"step": 1068
},
{
"epoch": 0.65,
"grad_norm": 2.17420374619433,
"learning_rate": 2.8299435440074596e-06,
"loss": 0.5831,
"step": 1069
},
{
"epoch": 0.65,
"grad_norm": 1.9647837412380398,
"learning_rate": 2.8210194713078408e-06,
"loss": 0.5177,
"step": 1070
},
{
"epoch": 0.65,
"grad_norm": 2.2111415620011017,
"learning_rate": 2.81210395901534e-06,
"loss": 0.625,
"step": 1071
},
{
"epoch": 0.66,
"grad_norm": 2.2292783394598406,
"learning_rate": 2.8031970421557035e-06,
"loss": 0.6244,
"step": 1072
},
{
"epoch": 0.66,
"grad_norm": 2.079126116302448,
"learning_rate": 2.7942987557209054e-06,
"loss": 0.5667,
"step": 1073
},
{
"epoch": 0.66,
"grad_norm": 1.8984066910628055,
"learning_rate": 2.785409134669017e-06,
"loss": 0.5423,
"step": 1074
},
{
"epoch": 0.66,
"grad_norm": 1.8127916497049823,
"learning_rate": 2.776528213924068e-06,
"loss": 0.494,
"step": 1075
},
{
"epoch": 0.66,
"grad_norm": 2.06245637765686,
"learning_rate": 2.7676560283759013e-06,
"loss": 0.5621,
"step": 1076
},
{
"epoch": 0.66,
"grad_norm": 2.114616000228047,
"learning_rate": 2.7587926128800503e-06,
"loss": 0.582,
"step": 1077
},
{
"epoch": 0.66,
"grad_norm": 1.8380497026526659,
"learning_rate": 2.7499380022575862e-06,
"loss": 0.5381,
"step": 1078
},
{
"epoch": 0.66,
"grad_norm": 1.898889400333682,
"learning_rate": 2.7410922312949955e-06,
"loss": 0.543,
"step": 1079
},
{
"epoch": 0.66,
"grad_norm": 2.2504985306779677,
"learning_rate": 2.7322553347440368e-06,
"loss": 0.5839,
"step": 1080
},
{
"epoch": 0.66,
"grad_norm": 2.202704207784561,
"learning_rate": 2.723427347321598e-06,
"loss": 0.6228,
"step": 1081
},
{
"epoch": 0.66,
"grad_norm": 2.0144401171881405,
"learning_rate": 2.7146083037095726e-06,
"loss": 0.5422,
"step": 1082
},
{
"epoch": 0.66,
"grad_norm": 1.736757757636721,
"learning_rate": 2.705798238554718e-06,
"loss": 0.5307,
"step": 1083
},
{
"epoch": 0.66,
"grad_norm": 1.8565795545311183,
"learning_rate": 2.696997186468511e-06,
"loss": 0.5413,
"step": 1084
},
{
"epoch": 0.66,
"grad_norm": 2.2977818371394663,
"learning_rate": 2.688205182027026e-06,
"loss": 0.6052,
"step": 1085
},
{
"epoch": 0.66,
"grad_norm": 1.9308033883834883,
"learning_rate": 2.6794222597707937e-06,
"loss": 0.5361,
"step": 1086
},
{
"epoch": 0.66,
"grad_norm": 1.9086632142591924,
"learning_rate": 2.6706484542046564e-06,
"loss": 0.5446,
"step": 1087
},
{
"epoch": 0.67,
"grad_norm": 1.9880975815360458,
"learning_rate": 2.6618837997976497e-06,
"loss": 0.5471,
"step": 1088
},
{
"epoch": 0.67,
"grad_norm": 2.080617319056223,
"learning_rate": 2.6531283309828493e-06,
"loss": 0.6338,
"step": 1089
},
{
"epoch": 0.67,
"grad_norm": 1.9076523429048307,
"learning_rate": 2.6443820821572496e-06,
"loss": 0.5312,
"step": 1090
},
{
"epoch": 0.67,
"grad_norm": 2.0510429772806784,
"learning_rate": 2.635645087681623e-06,
"loss": 0.6337,
"step": 1091
},
{
"epoch": 0.67,
"grad_norm": 1.8261749722012643,
"learning_rate": 2.626917381880381e-06,
"loss": 0.4953,
"step": 1092
},
{
"epoch": 0.67,
"grad_norm": 1.9732654093784376,
"learning_rate": 2.618198999041447e-06,
"loss": 0.5538,
"step": 1093
},
{
"epoch": 0.67,
"grad_norm": 2.012868194526242,
"learning_rate": 2.609489973416118e-06,
"loss": 0.6014,
"step": 1094
},
{
"epoch": 0.67,
"grad_norm": 2.0313320710830274,
"learning_rate": 2.600790339218926e-06,
"loss": 0.5784,
"step": 1095
},
{
"epoch": 0.67,
"grad_norm": 2.1398171593710185,
"learning_rate": 2.5921001306275116e-06,
"loss": 0.5516,
"step": 1096
},
{
"epoch": 0.67,
"grad_norm": 2.120355813263771,
"learning_rate": 2.5834193817824865e-06,
"loss": 0.5909,
"step": 1097
},
{
"epoch": 0.67,
"grad_norm": 2.030214931297693,
"learning_rate": 2.5747481267872925e-06,
"loss": 0.5592,
"step": 1098
},
{
"epoch": 0.67,
"grad_norm": 1.9767602094754053,
"learning_rate": 2.5660863997080808e-06,
"loss": 0.5503,
"step": 1099
},
{
"epoch": 0.67,
"grad_norm": 1.9483830769279278,
"learning_rate": 2.557434234573565e-06,
"loss": 0.5671,
"step": 1100
},
{
"epoch": 0.67,
"grad_norm": 1.9276456895969436,
"learning_rate": 2.548791665374898e-06,
"loss": 0.5127,
"step": 1101
},
{
"epoch": 0.67,
"grad_norm": 1.8090439396291618,
"learning_rate": 2.540158726065532e-06,
"loss": 0.5713,
"step": 1102
},
{
"epoch": 0.67,
"grad_norm": 2.320785088443513,
"learning_rate": 2.5315354505610847e-06,
"loss": 0.6488,
"step": 1103
},
{
"epoch": 0.68,
"grad_norm": 1.9982757056307234,
"learning_rate": 2.522921872739211e-06,
"loss": 0.5425,
"step": 1104
},
{
"epoch": 0.68,
"grad_norm": 1.9334400895537176,
"learning_rate": 2.514318026439469e-06,
"loss": 0.6033,
"step": 1105
},
{
"epoch": 0.68,
"grad_norm": 2.027430128222646,
"learning_rate": 2.50572394546318e-06,
"loss": 0.5551,
"step": 1106
},
{
"epoch": 0.68,
"grad_norm": 2.212527434238601,
"learning_rate": 2.4971396635733043e-06,
"loss": 0.6576,
"step": 1107
},
{
"epoch": 0.68,
"grad_norm": 2.047142954880681,
"learning_rate": 2.488565214494307e-06,
"loss": 0.6133,
"step": 1108
},
{
"epoch": 0.68,
"grad_norm": 1.855345806040437,
"learning_rate": 2.480000631912018e-06,
"loss": 0.5198,
"step": 1109
},
{
"epoch": 0.68,
"grad_norm": 2.1075238338772895,
"learning_rate": 2.471445949473512e-06,
"loss": 0.5667,
"step": 1110
},
{
"epoch": 0.68,
"grad_norm": 1.9673428615144855,
"learning_rate": 2.4629012007869634e-06,
"loss": 0.5715,
"step": 1111
},
{
"epoch": 0.68,
"grad_norm": 2.2624688260095382,
"learning_rate": 2.4543664194215272e-06,
"loss": 0.7673,
"step": 1112
},
{
"epoch": 0.68,
"grad_norm": 2.0285948397058626,
"learning_rate": 2.445841638907194e-06,
"loss": 0.5768,
"step": 1113
},
{
"epoch": 0.68,
"grad_norm": 2.088805134401383,
"learning_rate": 2.4373268927346678e-06,
"loss": 0.5607,
"step": 1114
},
{
"epoch": 0.68,
"grad_norm": 1.7787754535359048,
"learning_rate": 2.428822214355235e-06,
"loss": 0.5723,
"step": 1115
},
{
"epoch": 0.68,
"grad_norm": 2.149439034712146,
"learning_rate": 2.4203276371806206e-06,
"loss": 0.6358,
"step": 1116
},
{
"epoch": 0.68,
"grad_norm": 1.8509674900388537,
"learning_rate": 2.4118431945828757e-06,
"loss": 0.5393,
"step": 1117
},
{
"epoch": 0.68,
"grad_norm": 1.8927674268591554,
"learning_rate": 2.4033689198942272e-06,
"loss": 0.5846,
"step": 1118
},
{
"epoch": 0.68,
"grad_norm": 1.9917147049157173,
"learning_rate": 2.394904846406964e-06,
"loss": 0.6189,
"step": 1119
},
{
"epoch": 0.68,
"grad_norm": 1.9129347000522996,
"learning_rate": 2.3864510073732914e-06,
"loss": 0.5045,
"step": 1120
},
{
"epoch": 0.69,
"grad_norm": 2.050425773481486,
"learning_rate": 2.378007436005214e-06,
"loss": 0.5873,
"step": 1121
},
{
"epoch": 0.69,
"grad_norm": 1.9719631201639327,
"learning_rate": 2.3695741654743913e-06,
"loss": 0.5375,
"step": 1122
},
{
"epoch": 0.69,
"grad_norm": 2.0214460337530946,
"learning_rate": 2.3611512289120208e-06,
"loss": 0.5548,
"step": 1123
},
{
"epoch": 0.69,
"grad_norm": 1.8042574192263385,
"learning_rate": 2.3527386594087003e-06,
"loss": 0.5189,
"step": 1124
},
{
"epoch": 0.69,
"grad_norm": 1.9600273923574374,
"learning_rate": 2.344336490014295e-06,
"loss": 0.5378,
"step": 1125
},
{
"epoch": 0.69,
"grad_norm": 1.9432405793477807,
"learning_rate": 2.3359447537378173e-06,
"loss": 0.5354,
"step": 1126
},
{
"epoch": 0.69,
"grad_norm": 2.04898553718917,
"learning_rate": 2.3275634835472914e-06,
"loss": 0.6216,
"step": 1127
},
{
"epoch": 0.69,
"grad_norm": 1.9391818082210701,
"learning_rate": 2.3191927123696185e-06,
"loss": 0.5523,
"step": 1128
},
{
"epoch": 0.69,
"grad_norm": 1.9965210994972027,
"learning_rate": 2.3108324730904584e-06,
"loss": 0.5929,
"step": 1129
},
{
"epoch": 0.69,
"grad_norm": 1.8480997829156387,
"learning_rate": 2.302482798554096e-06,
"loss": 0.5467,
"step": 1130
},
{
"epoch": 0.69,
"grad_norm": 1.8430449159542786,
"learning_rate": 2.2941437215633043e-06,
"loss": 0.5267,
"step": 1131
},
{
"epoch": 0.69,
"grad_norm": 2.0808145765908543,
"learning_rate": 2.2858152748792316e-06,
"loss": 0.6113,
"step": 1132
},
{
"epoch": 0.69,
"grad_norm": 1.8218623081262797,
"learning_rate": 2.277497491221255e-06,
"loss": 0.4938,
"step": 1133
},
{
"epoch": 0.69,
"grad_norm": 1.9041064505829883,
"learning_rate": 2.269190403266866e-06,
"loss": 0.5633,
"step": 1134
},
{
"epoch": 0.69,
"grad_norm": 1.652734879699611,
"learning_rate": 2.260894043651537e-06,
"loss": 0.5735,
"step": 1135
},
{
"epoch": 0.69,
"grad_norm": 2.2538231832723445,
"learning_rate": 2.2526084449685876e-06,
"loss": 0.6128,
"step": 1136
},
{
"epoch": 0.7,
"grad_norm": 1.7960651904147913,
"learning_rate": 2.244333639769066e-06,
"loss": 0.4856,
"step": 1137
},
{
"epoch": 0.7,
"grad_norm": 1.9343813745783291,
"learning_rate": 2.236069660561619e-06,
"loss": 0.5552,
"step": 1138
},
{
"epoch": 0.7,
"grad_norm": 2.0101528659989025,
"learning_rate": 2.2278165398123538e-06,
"loss": 0.5589,
"step": 1139
},
{
"epoch": 0.7,
"grad_norm": 2.1101574118483826,
"learning_rate": 2.2195743099447257e-06,
"loss": 0.5837,
"step": 1140
},
{
"epoch": 0.7,
"grad_norm": 1.7157523251139841,
"learning_rate": 2.211343003339405e-06,
"loss": 0.4769,
"step": 1141
},
{
"epoch": 0.7,
"grad_norm": 1.7800599491375297,
"learning_rate": 2.203122652334141e-06,
"loss": 0.5251,
"step": 1142
},
{
"epoch": 0.7,
"grad_norm": 1.9387781774592656,
"learning_rate": 2.1949132892236495e-06,
"loss": 0.5669,
"step": 1143
},
{
"epoch": 0.7,
"grad_norm": 1.9580346237978052,
"learning_rate": 2.1867149462594745e-06,
"loss": 0.6192,
"step": 1144
},
{
"epoch": 0.7,
"grad_norm": 1.8050769261944541,
"learning_rate": 2.178527655649868e-06,
"loss": 0.5353,
"step": 1145
},
{
"epoch": 0.7,
"grad_norm": 2.028704396965185,
"learning_rate": 2.1703514495596643e-06,
"loss": 0.565,
"step": 1146
},
{
"epoch": 0.7,
"grad_norm": 1.8852797512232322,
"learning_rate": 2.1621863601101434e-06,
"loss": 0.4691,
"step": 1147
},
{
"epoch": 0.7,
"grad_norm": 2.0307600694751566,
"learning_rate": 2.1540324193789177e-06,
"loss": 0.6075,
"step": 1148
},
{
"epoch": 0.7,
"grad_norm": 2.0635630868289274,
"learning_rate": 2.145889659399801e-06,
"loss": 0.5713,
"step": 1149
},
{
"epoch": 0.7,
"grad_norm": 1.8359324107226915,
"learning_rate": 2.137758112162678e-06,
"loss": 0.5419,
"step": 1150
},
{
"epoch": 0.7,
"grad_norm": 1.9327432699880955,
"learning_rate": 2.1296378096133863e-06,
"loss": 0.5219,
"step": 1151
},
{
"epoch": 0.7,
"grad_norm": 1.9646016130657808,
"learning_rate": 2.1215287836535836e-06,
"loss": 0.5865,
"step": 1152
},
{
"epoch": 0.7,
"grad_norm": 2.053237316493875,
"learning_rate": 2.1134310661406293e-06,
"loss": 0.5495,
"step": 1153
},
{
"epoch": 0.71,
"grad_norm": 1.9224877080515406,
"learning_rate": 2.1053446888874575e-06,
"loss": 0.57,
"step": 1154
},
{
"epoch": 0.71,
"grad_norm": 2.0094429579360296,
"learning_rate": 2.097269683662444e-06,
"loss": 0.5966,
"step": 1155
},
{
"epoch": 0.71,
"grad_norm": 2.214480865585653,
"learning_rate": 2.089206082189294e-06,
"loss": 0.6409,
"step": 1156
},
{
"epoch": 0.71,
"grad_norm": 2.0300256495896516,
"learning_rate": 2.0811539161469126e-06,
"loss": 0.5318,
"step": 1157
},
{
"epoch": 0.71,
"grad_norm": 1.9501820223073412,
"learning_rate": 2.073113217169272e-06,
"loss": 0.5289,
"step": 1158
},
{
"epoch": 0.71,
"grad_norm": 2.073955162988734,
"learning_rate": 2.065084016845301e-06,
"loss": 0.6114,
"step": 1159
},
{
"epoch": 0.71,
"grad_norm": 2.2617011183013505,
"learning_rate": 2.0570663467187556e-06,
"loss": 0.692,
"step": 1160
},
{
"epoch": 0.71,
"grad_norm": 1.9412567792801219,
"learning_rate": 2.049060238288086e-06,
"loss": 0.5781,
"step": 1161
},
{
"epoch": 0.71,
"grad_norm": 1.9161885318665781,
"learning_rate": 2.0410657230063304e-06,
"loss": 0.4698,
"step": 1162
},
{
"epoch": 0.71,
"grad_norm": 1.9595900836322337,
"learning_rate": 2.0330828322809727e-06,
"loss": 0.5868,
"step": 1163
},
{
"epoch": 0.71,
"grad_norm": 1.860761169455946,
"learning_rate": 2.025111597473836e-06,
"loss": 0.5014,
"step": 1164
},
{
"epoch": 0.71,
"grad_norm": 1.9607972138295733,
"learning_rate": 2.0171520499009457e-06,
"loss": 0.5398,
"step": 1165
},
{
"epoch": 0.71,
"grad_norm": 1.8911857797620755,
"learning_rate": 2.009204220832418e-06,
"loss": 0.5382,
"step": 1166
},
{
"epoch": 0.71,
"grad_norm": 2.149350473430931,
"learning_rate": 2.0012681414923254e-06,
"loss": 0.5554,
"step": 1167
},
{
"epoch": 0.71,
"grad_norm": 1.8778243908792742,
"learning_rate": 1.993343843058585e-06,
"loss": 0.5085,
"step": 1168
},
{
"epoch": 0.71,
"grad_norm": 1.9294134892146952,
"learning_rate": 1.9854313566628273e-06,
"loss": 0.5678,
"step": 1169
},
{
"epoch": 0.72,
"grad_norm": 1.9503939087047788,
"learning_rate": 1.977530713390281e-06,
"loss": 0.5656,
"step": 1170
},
{
"epoch": 0.72,
"grad_norm": 2.092855618928881,
"learning_rate": 1.9696419442796474e-06,
"loss": 0.5589,
"step": 1171
},
{
"epoch": 0.72,
"grad_norm": 2.0558632332328366,
"learning_rate": 1.9617650803229736e-06,
"loss": 0.565,
"step": 1172
},
{
"epoch": 0.72,
"grad_norm": 1.8074897396978709,
"learning_rate": 1.953900152465544e-06,
"loss": 0.5278,
"step": 1173
},
{
"epoch": 0.72,
"grad_norm": 1.9979141137083714,
"learning_rate": 1.9460471916057415e-06,
"loss": 0.542,
"step": 1174
},
{
"epoch": 0.72,
"grad_norm": 2.031624720114998,
"learning_rate": 1.9382062285949416e-06,
"loss": 0.4827,
"step": 1175
},
{
"epoch": 0.72,
"grad_norm": 2.109236104508361,
"learning_rate": 1.9303772942373846e-06,
"loss": 0.5567,
"step": 1176
},
{
"epoch": 0.72,
"grad_norm": 2.2956266675459576,
"learning_rate": 1.9225604192900488e-06,
"loss": 0.6067,
"step": 1177
},
{
"epoch": 0.72,
"grad_norm": 1.9743192317750047,
"learning_rate": 1.914755634462542e-06,
"loss": 0.4976,
"step": 1178
},
{
"epoch": 0.72,
"grad_norm": 1.8028667562352623,
"learning_rate": 1.9069629704169723e-06,
"loss": 0.509,
"step": 1179
},
{
"epoch": 0.72,
"grad_norm": 1.7362668980411815,
"learning_rate": 1.8991824577678269e-06,
"loss": 0.5544,
"step": 1180
},
{
"epoch": 0.72,
"grad_norm": 2.0285534879494715,
"learning_rate": 1.8914141270818593e-06,
"loss": 0.4984,
"step": 1181
},
{
"epoch": 0.72,
"grad_norm": 2.046266381772136,
"learning_rate": 1.8836580088779628e-06,
"loss": 0.59,
"step": 1182
},
{
"epoch": 0.72,
"grad_norm": 1.9988343414891951,
"learning_rate": 1.8759141336270486e-06,
"loss": 0.5491,
"step": 1183
},
{
"epoch": 0.72,
"grad_norm": 2.0240438299435968,
"learning_rate": 1.868182531751938e-06,
"loss": 0.5816,
"step": 1184
},
{
"epoch": 0.72,
"grad_norm": 2.1594937237415226,
"learning_rate": 1.8604632336272249e-06,
"loss": 0.5865,
"step": 1185
},
{
"epoch": 0.73,
"grad_norm": 2.0666115681553032,
"learning_rate": 1.8527562695791746e-06,
"loss": 0.5231,
"step": 1186
},
{
"epoch": 0.73,
"grad_norm": 2.0866219869375064,
"learning_rate": 1.8450616698855938e-06,
"loss": 0.5465,
"step": 1187
},
{
"epoch": 0.73,
"grad_norm": 1.9640838158081952,
"learning_rate": 1.8373794647757105e-06,
"loss": 0.5484,
"step": 1188
},
{
"epoch": 0.73,
"grad_norm": 1.8827641315498593,
"learning_rate": 1.8297096844300638e-06,
"loss": 0.5447,
"step": 1189
},
{
"epoch": 0.73,
"grad_norm": 1.9270241151498686,
"learning_rate": 1.8220523589803808e-06,
"loss": 0.5148,
"step": 1190
},
{
"epoch": 0.73,
"grad_norm": 1.8983158181788506,
"learning_rate": 1.8144075185094523e-06,
"loss": 0.5089,
"step": 1191
},
{
"epoch": 0.73,
"grad_norm": 1.9391982904773535,
"learning_rate": 1.8067751930510258e-06,
"loss": 0.6062,
"step": 1192
},
{
"epoch": 0.73,
"grad_norm": 1.8572456201220782,
"learning_rate": 1.799155412589681e-06,
"loss": 0.4707,
"step": 1193
},
{
"epoch": 0.73,
"grad_norm": 2.180002124115602,
"learning_rate": 1.7915482070607094e-06,
"loss": 0.597,
"step": 1194
},
{
"epoch": 0.73,
"grad_norm": 1.8462663478460364,
"learning_rate": 1.783953606350005e-06,
"loss": 0.5577,
"step": 1195
},
{
"epoch": 0.73,
"grad_norm": 1.889638886458579,
"learning_rate": 1.7763716402939385e-06,
"loss": 0.519,
"step": 1196
},
{
"epoch": 0.73,
"grad_norm": 1.726660341693367,
"learning_rate": 1.7688023386792452e-06,
"loss": 0.4718,
"step": 1197
},
{
"epoch": 0.73,
"grad_norm": 2.2128024274172655,
"learning_rate": 1.7612457312429093e-06,
"loss": 0.6105,
"step": 1198
},
{
"epoch": 0.73,
"grad_norm": 1.8977444616718881,
"learning_rate": 1.7537018476720369e-06,
"loss": 0.5442,
"step": 1199
},
{
"epoch": 0.73,
"grad_norm": 1.7605812625109447,
"learning_rate": 1.7461707176037546e-06,
"loss": 0.4897,
"step": 1200
},
{
"epoch": 0.73,
"grad_norm": 2.1962948774783655,
"learning_rate": 1.738652370625082e-06,
"loss": 0.5795,
"step": 1201
},
{
"epoch": 0.73,
"grad_norm": 1.8753753975602796,
"learning_rate": 1.7311468362728163e-06,
"loss": 0.5267,
"step": 1202
},
{
"epoch": 0.74,
"grad_norm": 2.2401308043999877,
"learning_rate": 1.723654144033422e-06,
"loss": 0.5422,
"step": 1203
},
{
"epoch": 0.74,
"grad_norm": 2.360182834014352,
"learning_rate": 1.7161743233429123e-06,
"loss": 0.5932,
"step": 1204
},
{
"epoch": 0.74,
"grad_norm": 2.1509906603775675,
"learning_rate": 1.7087074035867284e-06,
"loss": 0.5336,
"step": 1205
},
{
"epoch": 0.74,
"grad_norm": 2.2167606816262215,
"learning_rate": 1.7012534140996351e-06,
"loss": 0.6204,
"step": 1206
},
{
"epoch": 0.74,
"grad_norm": 2.2516401026227193,
"learning_rate": 1.69381238416559e-06,
"loss": 0.6229,
"step": 1207
},
{
"epoch": 0.74,
"grad_norm": 2.040572811584935,
"learning_rate": 1.6863843430176464e-06,
"loss": 0.5554,
"step": 1208
},
{
"epoch": 0.74,
"grad_norm": 2.001470306966103,
"learning_rate": 1.6789693198378254e-06,
"loss": 0.5494,
"step": 1209
},
{
"epoch": 0.74,
"grad_norm": 2.0804007666629434,
"learning_rate": 1.6715673437570035e-06,
"loss": 0.6031,
"step": 1210
},
{
"epoch": 0.74,
"grad_norm": 2.017960337685253,
"learning_rate": 1.6641784438548048e-06,
"loss": 0.5567,
"step": 1211
},
{
"epoch": 0.74,
"grad_norm": 2.084312076747243,
"learning_rate": 1.6568026491594763e-06,
"loss": 0.5529,
"step": 1212
},
{
"epoch": 0.74,
"grad_norm": 2.016310638065491,
"learning_rate": 1.6494399886477859e-06,
"loss": 0.5525,
"step": 1213
},
{
"epoch": 0.74,
"grad_norm": 1.8351394186785017,
"learning_rate": 1.6420904912448942e-06,
"loss": 0.5631,
"step": 1214
},
{
"epoch": 0.74,
"grad_norm": 1.978522191746191,
"learning_rate": 1.634754185824256e-06,
"loss": 0.5075,
"step": 1215
},
{
"epoch": 0.74,
"grad_norm": 2.223631750342603,
"learning_rate": 1.6274311012074984e-06,
"loss": 0.6659,
"step": 1216
},
{
"epoch": 0.74,
"grad_norm": 1.9821415990981424,
"learning_rate": 1.6201212661643045e-06,
"loss": 0.5744,
"step": 1217
},
{
"epoch": 0.74,
"grad_norm": 1.9744423033984106,
"learning_rate": 1.61282470941231e-06,
"loss": 0.6117,
"step": 1218
},
{
"epoch": 0.75,
"grad_norm": 2.065211898764052,
"learning_rate": 1.6055414596169806e-06,
"loss": 0.5691,
"step": 1219
},
{
"epoch": 0.75,
"grad_norm": 1.98605861724129,
"learning_rate": 1.5982715453915082e-06,
"loss": 0.4985,
"step": 1220
},
{
"epoch": 0.75,
"grad_norm": 2.04457416568264,
"learning_rate": 1.5910149952966898e-06,
"loss": 0.5538,
"step": 1221
},
{
"epoch": 0.75,
"grad_norm": 2.0121702166230895,
"learning_rate": 1.583771837840823e-06,
"loss": 0.5658,
"step": 1222
},
{
"epoch": 0.75,
"grad_norm": 2.0092606234614694,
"learning_rate": 1.5765421014795911e-06,
"loss": 0.5113,
"step": 1223
},
{
"epoch": 0.75,
"grad_norm": 1.8935411378036877,
"learning_rate": 1.569325814615947e-06,
"loss": 0.507,
"step": 1224
},
{
"epoch": 0.75,
"grad_norm": 2.34585820804892,
"learning_rate": 1.562123005600009e-06,
"loss": 0.5769,
"step": 1225
},
{
"epoch": 0.75,
"grad_norm": 2.1528289643549234,
"learning_rate": 1.5549337027289468e-06,
"loss": 0.5501,
"step": 1226
},
{
"epoch": 0.75,
"grad_norm": 1.8301608372784568,
"learning_rate": 1.5477579342468634e-06,
"loss": 0.5208,
"step": 1227
},
{
"epoch": 0.75,
"grad_norm": 2.176141189999809,
"learning_rate": 1.5405957283446987e-06,
"loss": 0.6609,
"step": 1228
},
{
"epoch": 0.75,
"grad_norm": 2.0473489032027565,
"learning_rate": 1.5334471131601025e-06,
"loss": 0.5715,
"step": 1229
},
{
"epoch": 0.75,
"grad_norm": 2.1029826913605647,
"learning_rate": 1.526312116777336e-06,
"loss": 0.4786,
"step": 1230
},
{
"epoch": 0.75,
"grad_norm": 1.9871418869111652,
"learning_rate": 1.5191907672271582e-06,
"loss": 0.4602,
"step": 1231
},
{
"epoch": 0.75,
"grad_norm": 1.9159131882394276,
"learning_rate": 1.5120830924867098e-06,
"loss": 0.508,
"step": 1232
},
{
"epoch": 0.75,
"grad_norm": 2.282250060067885,
"learning_rate": 1.5049891204794125e-06,
"loss": 0.5567,
"step": 1233
},
{
"epoch": 0.75,
"grad_norm": 1.9230524058397154,
"learning_rate": 1.4979088790748553e-06,
"loss": 0.5514,
"step": 1234
},
{
"epoch": 0.76,
"grad_norm": 2.036428797678635,
"learning_rate": 1.4908423960886808e-06,
"loss": 0.5909,
"step": 1235
},
{
"epoch": 0.76,
"grad_norm": 2.1397315181807506,
"learning_rate": 1.4837896992824835e-06,
"loss": 0.6168,
"step": 1236
},
{
"epoch": 0.76,
"grad_norm": 2.197275580787461,
"learning_rate": 1.4767508163636968e-06,
"loss": 0.5636,
"step": 1237
},
{
"epoch": 0.76,
"grad_norm": 1.9450184678272302,
"learning_rate": 1.4697257749854815e-06,
"loss": 0.5576,
"step": 1238
},
{
"epoch": 0.76,
"grad_norm": 1.708185896072239,
"learning_rate": 1.4627146027466248e-06,
"loss": 0.5048,
"step": 1239
},
{
"epoch": 0.76,
"grad_norm": 1.931851292310904,
"learning_rate": 1.4557173271914216e-06,
"loss": 0.6003,
"step": 1240
},
{
"epoch": 0.76,
"grad_norm": 1.7739213428365466,
"learning_rate": 1.4487339758095758e-06,
"loss": 0.4847,
"step": 1241
},
{
"epoch": 0.76,
"grad_norm": 2.065334796741025,
"learning_rate": 1.4417645760360899e-06,
"loss": 0.4995,
"step": 1242
},
{
"epoch": 0.76,
"grad_norm": 1.9060500200669357,
"learning_rate": 1.4348091552511496e-06,
"loss": 0.4772,
"step": 1243
},
{
"epoch": 0.76,
"grad_norm": 1.8926362460364048,
"learning_rate": 1.427867740780028e-06,
"loss": 0.4678,
"step": 1244
},
{
"epoch": 0.76,
"grad_norm": 2.230768332923383,
"learning_rate": 1.4209403598929711e-06,
"loss": 0.5556,
"step": 1245
},
{
"epoch": 0.76,
"grad_norm": 2.2444577467361078,
"learning_rate": 1.4140270398050899e-06,
"loss": 0.6313,
"step": 1246
},
{
"epoch": 0.76,
"grad_norm": 2.0449648982479385,
"learning_rate": 1.407127807676259e-06,
"loss": 0.5457,
"step": 1247
},
{
"epoch": 0.76,
"grad_norm": 2.0517370766830707,
"learning_rate": 1.4002426906110034e-06,
"loss": 0.539,
"step": 1248
},
{
"epoch": 0.76,
"grad_norm": 2.0847967027851375,
"learning_rate": 1.3933717156583975e-06,
"loss": 0.5256,
"step": 1249
},
{
"epoch": 0.76,
"grad_norm": 2.1822442085414386,
"learning_rate": 1.386514909811958e-06,
"loss": 0.5648,
"step": 1250
},
{
"epoch": 0.76,
"grad_norm": 2.01409537100995,
"learning_rate": 1.3796723000095312e-06,
"loss": 0.5878,
"step": 1251
},
{
"epoch": 0.77,
"grad_norm": 2.2139379142084357,
"learning_rate": 1.3728439131331972e-06,
"loss": 0.5724,
"step": 1252
},
{
"epoch": 0.77,
"grad_norm": 2.022116537133516,
"learning_rate": 1.366029776009159e-06,
"loss": 0.5686,
"step": 1253
},
{
"epoch": 0.77,
"grad_norm": 2.3703607571589265,
"learning_rate": 1.3592299154076344e-06,
"loss": 0.676,
"step": 1254
},
{
"epoch": 0.77,
"grad_norm": 2.1435846797265317,
"learning_rate": 1.3524443580427565e-06,
"loss": 0.6176,
"step": 1255
},
{
"epoch": 0.77,
"grad_norm": 1.9476892823094056,
"learning_rate": 1.3456731305724685e-06,
"loss": 0.5245,
"step": 1256
},
{
"epoch": 0.77,
"grad_norm": 1.7780892199670588,
"learning_rate": 1.3389162595984106e-06,
"loss": 0.4913,
"step": 1257
},
{
"epoch": 0.77,
"grad_norm": 2.147087917412656,
"learning_rate": 1.3321737716658284e-06,
"loss": 0.5712,
"step": 1258
},
{
"epoch": 0.77,
"grad_norm": 1.9396671940173766,
"learning_rate": 1.3254456932634557e-06,
"loss": 0.5236,
"step": 1259
},
{
"epoch": 0.77,
"grad_norm": 2.105081861677922,
"learning_rate": 1.3187320508234208e-06,
"loss": 0.528,
"step": 1260
},
{
"epoch": 0.77,
"grad_norm": 2.119521541148906,
"learning_rate": 1.3120328707211394e-06,
"loss": 0.5511,
"step": 1261
},
{
"epoch": 0.77,
"grad_norm": 1.9411630633184176,
"learning_rate": 1.3053481792752044e-06,
"loss": 0.5692,
"step": 1262
},
{
"epoch": 0.77,
"grad_norm": 2.13953797890098,
"learning_rate": 1.298678002747294e-06,
"loss": 0.6083,
"step": 1263
},
{
"epoch": 0.77,
"grad_norm": 1.696024362037954,
"learning_rate": 1.2920223673420584e-06,
"loss": 0.4515,
"step": 1264
},
{
"epoch": 0.77,
"grad_norm": 1.8867892952085517,
"learning_rate": 1.285381299207026e-06,
"loss": 0.5367,
"step": 1265
},
{
"epoch": 0.77,
"grad_norm": 2.052265073795798,
"learning_rate": 1.2787548244324888e-06,
"loss": 0.6345,
"step": 1266
},
{
"epoch": 0.77,
"grad_norm": 1.9697590487372725,
"learning_rate": 1.2721429690514142e-06,
"loss": 0.5131,
"step": 1267
},
{
"epoch": 0.78,
"grad_norm": 1.950928985112725,
"learning_rate": 1.26554575903933e-06,
"loss": 0.5065,
"step": 1268
},
{
"epoch": 0.78,
"grad_norm": 2.040285812654646,
"learning_rate": 1.2589632203142316e-06,
"loss": 0.6118,
"step": 1269
},
{
"epoch": 0.78,
"grad_norm": 2.0858975965094095,
"learning_rate": 1.2523953787364723e-06,
"loss": 0.5986,
"step": 1270
},
{
"epoch": 0.78,
"grad_norm": 1.96463477151841,
"learning_rate": 1.24584226010867e-06,
"loss": 0.5598,
"step": 1271
},
{
"epoch": 0.78,
"grad_norm": 2.1145952141868434,
"learning_rate": 1.2393038901756e-06,
"loss": 0.5922,
"step": 1272
},
{
"epoch": 0.78,
"grad_norm": 1.8968425608898443,
"learning_rate": 1.232780294624093e-06,
"loss": 0.5095,
"step": 1273
},
{
"epoch": 0.78,
"grad_norm": 2.0301980708684866,
"learning_rate": 1.22627149908294e-06,
"loss": 0.5498,
"step": 1274
},
{
"epoch": 0.78,
"grad_norm": 1.8270409790198994,
"learning_rate": 1.2197775291227887e-06,
"loss": 0.4714,
"step": 1275
},
{
"epoch": 0.78,
"grad_norm": 2.0963357615007703,
"learning_rate": 1.2132984102560374e-06,
"loss": 0.6149,
"step": 1276
},
{
"epoch": 0.78,
"grad_norm": 1.9801877339079816,
"learning_rate": 1.2068341679367452e-06,
"loss": 0.5337,
"step": 1277
},
{
"epoch": 0.78,
"grad_norm": 1.980282736079062,
"learning_rate": 1.2003848275605263e-06,
"loss": 0.5857,
"step": 1278
},
{
"epoch": 0.78,
"grad_norm": 1.835028289058003,
"learning_rate": 1.1939504144644464e-06,
"loss": 0.5959,
"step": 1279
},
{
"epoch": 0.78,
"grad_norm": 2.120379341814735,
"learning_rate": 1.1875309539269332e-06,
"loss": 0.5015,
"step": 1280
},
{
"epoch": 0.78,
"grad_norm": 1.93303989363275,
"learning_rate": 1.1811264711676661e-06,
"loss": 0.5125,
"step": 1281
},
{
"epoch": 0.78,
"grad_norm": 1.973773020853456,
"learning_rate": 1.1747369913474866e-06,
"loss": 0.5864,
"step": 1282
},
{
"epoch": 0.78,
"grad_norm": 2.0447822785351994,
"learning_rate": 1.1683625395682935e-06,
"loss": 0.572,
"step": 1283
},
{
"epoch": 0.79,
"grad_norm": 1.9251270954167108,
"learning_rate": 1.1620031408729443e-06,
"loss": 0.5745,
"step": 1284
},
{
"epoch": 0.79,
"grad_norm": 1.881635576782646,
"learning_rate": 1.1556588202451613e-06,
"loss": 0.4638,
"step": 1285
},
{
"epoch": 0.79,
"grad_norm": 1.7896548110439616,
"learning_rate": 1.1493296026094302e-06,
"loss": 0.5252,
"step": 1286
},
{
"epoch": 0.79,
"grad_norm": 1.9326908202122164,
"learning_rate": 1.1430155128309e-06,
"loss": 0.4933,
"step": 1287
},
{
"epoch": 0.79,
"grad_norm": 1.9844579045400434,
"learning_rate": 1.1367165757152905e-06,
"loss": 0.5393,
"step": 1288
},
{
"epoch": 0.79,
"grad_norm": 2.0830711516317573,
"learning_rate": 1.1304328160087935e-06,
"loss": 0.6165,
"step": 1289
},
{
"epoch": 0.79,
"grad_norm": 2.0820587210774875,
"learning_rate": 1.12416425839797e-06,
"loss": 0.5735,
"step": 1290
},
{
"epoch": 0.79,
"grad_norm": 1.9942738518131777,
"learning_rate": 1.1179109275096628e-06,
"loss": 0.5331,
"step": 1291
},
{
"epoch": 0.79,
"grad_norm": 2.09146135494079,
"learning_rate": 1.1116728479108884e-06,
"loss": 0.4912,
"step": 1292
},
{
"epoch": 0.79,
"grad_norm": 2.0181223861753685,
"learning_rate": 1.105450044108753e-06,
"loss": 0.5767,
"step": 1293
},
{
"epoch": 0.79,
"grad_norm": 1.7770312565830746,
"learning_rate": 1.099242540550347e-06,
"loss": 0.5222,
"step": 1294
},
{
"epoch": 0.79,
"grad_norm": 1.9504760795862741,
"learning_rate": 1.0930503616226495e-06,
"loss": 0.605,
"step": 1295
},
{
"epoch": 0.79,
"grad_norm": 1.8039286890109292,
"learning_rate": 1.0868735316524387e-06,
"loss": 0.439,
"step": 1296
},
{
"epoch": 0.79,
"grad_norm": 1.8781587995004858,
"learning_rate": 1.0807120749061923e-06,
"loss": 0.4785,
"step": 1297
},
{
"epoch": 0.79,
"grad_norm": 2.1386175502459466,
"learning_rate": 1.0745660155899878e-06,
"loss": 0.6047,
"step": 1298
},
{
"epoch": 0.79,
"grad_norm": 2.0105202575538255,
"learning_rate": 1.0684353778494166e-06,
"loss": 0.6412,
"step": 1299
},
{
"epoch": 0.79,
"grad_norm": 2.110747721557068,
"learning_rate": 1.0623201857694837e-06,
"loss": 0.5084,
"step": 1300
},
{
"epoch": 0.8,
"grad_norm": 2.0266382461097576,
"learning_rate": 1.056220463374511e-06,
"loss": 0.5513,
"step": 1301
},
{
"epoch": 0.8,
"grad_norm": 1.941131403268958,
"learning_rate": 1.0501362346280492e-06,
"loss": 0.5362,
"step": 1302
},
{
"epoch": 0.8,
"grad_norm": 2.179036766615289,
"learning_rate": 1.0440675234327774e-06,
"loss": 0.5566,
"step": 1303
},
{
"epoch": 0.8,
"grad_norm": 1.9388634477990079,
"learning_rate": 1.0380143536304133e-06,
"loss": 0.5316,
"step": 1304
},
{
"epoch": 0.8,
"grad_norm": 1.9303281784663635,
"learning_rate": 1.0319767490016196e-06,
"loss": 0.5194,
"step": 1305
},
{
"epoch": 0.8,
"grad_norm": 2.3880703426336356,
"learning_rate": 1.0259547332659065e-06,
"loss": 0.7486,
"step": 1306
},
{
"epoch": 0.8,
"grad_norm": 1.9926065703927103,
"learning_rate": 1.0199483300815421e-06,
"loss": 0.527,
"step": 1307
},
{
"epoch": 0.8,
"grad_norm": 1.9341158574219506,
"learning_rate": 1.0139575630454618e-06,
"loss": 0.5403,
"step": 1308
},
{
"epoch": 0.8,
"grad_norm": 1.8567155488677838,
"learning_rate": 1.0079824556931655e-06,
"loss": 0.548,
"step": 1309
},
{
"epoch": 0.8,
"grad_norm": 2.0404242620951436,
"learning_rate": 1.0020230314986395e-06,
"loss": 0.498,
"step": 1310
},
{
"epoch": 0.8,
"grad_norm": 2.0107577837327457,
"learning_rate": 9.960793138742503e-07,
"loss": 0.58,
"step": 1311
},
{
"epoch": 0.8,
"grad_norm": 1.764122033295333,
"learning_rate": 9.901513261706652e-07,
"loss": 0.4909,
"step": 1312
},
{
"epoch": 0.8,
"grad_norm": 2.1074644373274225,
"learning_rate": 9.84239091676748e-07,
"loss": 0.5358,
"step": 1313
},
{
"epoch": 0.8,
"grad_norm": 1.8835155965933714,
"learning_rate": 9.783426336194807e-07,
"loss": 0.5683,
"step": 1314
},
{
"epoch": 0.8,
"grad_norm": 1.9460151844862306,
"learning_rate": 9.724619751638598e-07,
"loss": 0.5901,
"step": 1315
},
{
"epoch": 0.8,
"grad_norm": 1.9960692765948507,
"learning_rate": 9.665971394128137e-07,
"loss": 0.5299,
"step": 1316
},
{
"epoch": 0.81,
"grad_norm": 1.904429551506709,
"learning_rate": 9.607481494071107e-07,
"loss": 0.5077,
"step": 1317
},
{
"epoch": 0.81,
"grad_norm": 2.0807551324391618,
"learning_rate": 9.549150281252633e-07,
"loss": 0.4932,
"step": 1318
},
{
"epoch": 0.81,
"grad_norm": 1.880996194999353,
"learning_rate": 9.490977984834454e-07,
"loss": 0.5256,
"step": 1319
},
{
"epoch": 0.81,
"grad_norm": 2.292702889415888,
"learning_rate": 9.432964833353947e-07,
"loss": 0.5633,
"step": 1320
},
{
"epoch": 0.81,
"grad_norm": 1.8157124323149034,
"learning_rate": 9.375111054723301e-07,
"loss": 0.5443,
"step": 1321
},
{
"epoch": 0.81,
"grad_norm": 1.997225759296561,
"learning_rate": 9.317416876228591e-07,
"loss": 0.6053,
"step": 1322
},
{
"epoch": 0.81,
"grad_norm": 2.2930834747649187,
"learning_rate": 9.259882524528835e-07,
"loss": 0.647,
"step": 1323
},
{
"epoch": 0.81,
"grad_norm": 1.7855930884686897,
"learning_rate": 9.202508225655216e-07,
"loss": 0.4861,
"step": 1324
},
{
"epoch": 0.81,
"grad_norm": 2.041666912419482,
"learning_rate": 9.145294205010058e-07,
"loss": 0.5105,
"step": 1325
},
{
"epoch": 0.81,
"grad_norm": 2.1259688035148496,
"learning_rate": 9.088240687366073e-07,
"loss": 0.6038,
"step": 1326
},
{
"epoch": 0.81,
"grad_norm": 2.0323306545521436,
"learning_rate": 9.0313478968654e-07,
"loss": 0.5853,
"step": 1327
},
{
"epoch": 0.81,
"grad_norm": 2.044128726826921,
"learning_rate": 8.974616057018709e-07,
"loss": 0.5153,
"step": 1328
},
{
"epoch": 0.81,
"grad_norm": 1.944192337336169,
"learning_rate": 8.918045390704383e-07,
"loss": 0.5475,
"step": 1329
},
{
"epoch": 0.81,
"grad_norm": 2.062636307328745,
"learning_rate": 8.861636120167632e-07,
"loss": 0.5959,
"step": 1330
},
{
"epoch": 0.81,
"grad_norm": 2.1486851598365946,
"learning_rate": 8.805388467019549e-07,
"loss": 0.5959,
"step": 1331
},
{
"epoch": 0.81,
"grad_norm": 2.2040420193997483,
"learning_rate": 8.749302652236341e-07,
"loss": 0.6322,
"step": 1332
},
{
"epoch": 0.82,
"grad_norm": 2.20406327847528,
"learning_rate": 8.693378896158377e-07,
"loss": 0.6114,
"step": 1333
},
{
"epoch": 0.82,
"grad_norm": 2.1795279212664886,
"learning_rate": 8.637617418489386e-07,
"loss": 0.5828,
"step": 1334
},
{
"epoch": 0.82,
"grad_norm": 2.041352967900095,
"learning_rate": 8.582018438295553e-07,
"loss": 0.5139,
"step": 1335
},
{
"epoch": 0.82,
"grad_norm": 2.0047043727475167,
"learning_rate": 8.52658217400466e-07,
"loss": 0.5492,
"step": 1336
},
{
"epoch": 0.82,
"grad_norm": 1.9340395107266044,
"learning_rate": 8.471308843405252e-07,
"loss": 0.5404,
"step": 1337
},
{
"epoch": 0.82,
"grad_norm": 1.9443767009288708,
"learning_rate": 8.416198663645775e-07,
"loss": 0.6145,
"step": 1338
},
{
"epoch": 0.82,
"grad_norm": 2.0047578158470385,
"learning_rate": 8.361251851233687e-07,
"loss": 0.5147,
"step": 1339
},
{
"epoch": 0.82,
"grad_norm": 1.9775796297395034,
"learning_rate": 8.306468622034663e-07,
"loss": 0.4914,
"step": 1340
},
{
"epoch": 0.82,
"grad_norm": 2.0350740385108783,
"learning_rate": 8.251849191271727e-07,
"loss": 0.5988,
"step": 1341
},
{
"epoch": 0.82,
"grad_norm": 1.810070925909079,
"learning_rate": 8.197393773524359e-07,
"loss": 0.4841,
"step": 1342
},
{
"epoch": 0.82,
"grad_norm": 1.9563215783299615,
"learning_rate": 8.143102582727741e-07,
"loss": 0.5356,
"step": 1343
},
{
"epoch": 0.82,
"grad_norm": 1.8199362899555016,
"learning_rate": 8.088975832171819e-07,
"loss": 0.4712,
"step": 1344
},
{
"epoch": 0.82,
"grad_norm": 2.309704865979389,
"learning_rate": 8.035013734500557e-07,
"loss": 0.6218,
"step": 1345
},
{
"epoch": 0.82,
"grad_norm": 2.0272627809313923,
"learning_rate": 7.981216501711053e-07,
"loss": 0.5838,
"step": 1346
},
{
"epoch": 0.82,
"grad_norm": 2.0114764889056613,
"learning_rate": 7.927584345152672e-07,
"loss": 0.5609,
"step": 1347
},
{
"epoch": 0.82,
"grad_norm": 1.9745269009451916,
"learning_rate": 7.874117475526305e-07,
"loss": 0.4989,
"step": 1348
},
{
"epoch": 0.82,
"grad_norm": 2.253717109150984,
"learning_rate": 7.820816102883477e-07,
"loss": 0.6223,
"step": 1349
},
{
"epoch": 0.83,
"grad_norm": 2.098840934097801,
"learning_rate": 7.767680436625513e-07,
"loss": 0.5429,
"step": 1350
},
{
"epoch": 0.83,
"grad_norm": 1.9010834528450948,
"learning_rate": 7.714710685502764e-07,
"loss": 0.5055,
"step": 1351
},
{
"epoch": 0.83,
"grad_norm": 2.0272090911880176,
"learning_rate": 7.661907057613766e-07,
"loss": 0.5749,
"step": 1352
},
{
"epoch": 0.83,
"grad_norm": 2.1091229605800645,
"learning_rate": 7.609269760404392e-07,
"loss": 0.5019,
"step": 1353
},
{
"epoch": 0.83,
"grad_norm": 1.9509717065137873,
"learning_rate": 7.556799000667097e-07,
"loss": 0.4808,
"step": 1354
},
{
"epoch": 0.83,
"grad_norm": 1.873704324767879,
"learning_rate": 7.504494984540033e-07,
"loss": 0.4928,
"step": 1355
},
{
"epoch": 0.83,
"grad_norm": 1.987007022534763,
"learning_rate": 7.452357917506309e-07,
"loss": 0.5312,
"step": 1356
},
{
"epoch": 0.83,
"grad_norm": 2.1167223759818503,
"learning_rate": 7.40038800439315e-07,
"loss": 0.5532,
"step": 1357
},
{
"epoch": 0.83,
"grad_norm": 1.8820691821813884,
"learning_rate": 7.348585449371076e-07,
"loss": 0.5615,
"step": 1358
},
{
"epoch": 0.83,
"grad_norm": 1.9800556013383923,
"learning_rate": 7.296950455953145e-07,
"loss": 0.5546,
"step": 1359
},
{
"epoch": 0.83,
"grad_norm": 2.278698097867691,
"learning_rate": 7.245483226994094e-07,
"loss": 0.6373,
"step": 1360
},
{
"epoch": 0.83,
"grad_norm": 1.8404133278696633,
"learning_rate": 7.19418396468961e-07,
"loss": 0.5212,
"step": 1361
},
{
"epoch": 0.83,
"grad_norm": 2.082478736477018,
"learning_rate": 7.14305287057549e-07,
"loss": 0.5405,
"step": 1362
},
{
"epoch": 0.83,
"grad_norm": 2.274543877989927,
"learning_rate": 7.092090145526842e-07,
"loss": 0.5788,
"step": 1363
},
{
"epoch": 0.83,
"grad_norm": 2.1975775615791284,
"learning_rate": 7.041295989757352e-07,
"loss": 0.5705,
"step": 1364
},
{
"epoch": 0.83,
"grad_norm": 2.3957382130192744,
"learning_rate": 6.990670602818412e-07,
"loss": 0.5319,
"step": 1365
},
{
"epoch": 0.84,
"grad_norm": 1.9572975655214617,
"learning_rate": 6.940214183598431e-07,
"loss": 0.4977,
"step": 1366
},
{
"epoch": 0.84,
"grad_norm": 2.0343363601457796,
"learning_rate": 6.889926930321961e-07,
"loss": 0.5601,
"step": 1367
},
{
"epoch": 0.84,
"grad_norm": 2.2173297633145497,
"learning_rate": 6.839809040549017e-07,
"loss": 0.5652,
"step": 1368
},
{
"epoch": 0.84,
"grad_norm": 1.9898806046683502,
"learning_rate": 6.789860711174184e-07,
"loss": 0.5604,
"step": 1369
},
{
"epoch": 0.84,
"grad_norm": 1.755937949976552,
"learning_rate": 6.740082138425963e-07,
"loss": 0.5268,
"step": 1370
},
{
"epoch": 0.84,
"grad_norm": 2.059314359247024,
"learning_rate": 6.690473517865925e-07,
"loss": 0.5516,
"step": 1371
},
{
"epoch": 0.84,
"grad_norm": 1.921092025716401,
"learning_rate": 6.641035044387939e-07,
"loss": 0.5282,
"step": 1372
},
{
"epoch": 0.84,
"grad_norm": 2.1611589441440904,
"learning_rate": 6.591766912217456e-07,
"loss": 0.5721,
"step": 1373
},
{
"epoch": 0.84,
"grad_norm": 2.07998470722096,
"learning_rate": 6.542669314910732e-07,
"loss": 0.616,
"step": 1374
},
{
"epoch": 0.84,
"grad_norm": 1.8276327443296445,
"learning_rate": 6.493742445354012e-07,
"loss": 0.4733,
"step": 1375
},
{
"epoch": 0.84,
"grad_norm": 2.0470888106096568,
"learning_rate": 6.44498649576285e-07,
"loss": 0.6115,
"step": 1376
},
{
"epoch": 0.84,
"grad_norm": 1.930085414769607,
"learning_rate": 6.39640165768129e-07,
"loss": 0.5524,
"step": 1377
},
{
"epoch": 0.84,
"grad_norm": 1.992132029715179,
"learning_rate": 6.347988121981175e-07,
"loss": 0.5116,
"step": 1378
},
{
"epoch": 0.84,
"grad_norm": 1.9749787878274394,
"learning_rate": 6.299746078861346e-07,
"loss": 0.5243,
"step": 1379
},
{
"epoch": 0.84,
"grad_norm": 2.1338239619198003,
"learning_rate": 6.251675717846905e-07,
"loss": 0.6601,
"step": 1380
},
{
"epoch": 0.84,
"grad_norm": 1.9037772081403355,
"learning_rate": 6.203777227788493e-07,
"loss": 0.537,
"step": 1381
},
{
"epoch": 0.85,
"grad_norm": 1.982734111031249,
"learning_rate": 6.156050796861551e-07,
"loss": 0.5447,
"step": 1382
},
{
"epoch": 0.85,
"grad_norm": 1.9647030662707663,
"learning_rate": 6.108496612565507e-07,
"loss": 0.5572,
"step": 1383
},
{
"epoch": 0.85,
"grad_norm": 2.20503978552407,
"learning_rate": 6.061114861723144e-07,
"loss": 0.5847,
"step": 1384
},
{
"epoch": 0.85,
"grad_norm": 1.8985370142150249,
"learning_rate": 6.013905730479824e-07,
"loss": 0.5245,
"step": 1385
},
{
"epoch": 0.85,
"grad_norm": 2.0108479395190204,
"learning_rate": 5.966869404302705e-07,
"loss": 0.4869,
"step": 1386
},
{
"epoch": 0.85,
"grad_norm": 1.6778892025630097,
"learning_rate": 5.920006067980105e-07,
"loss": 0.4713,
"step": 1387
},
{
"epoch": 0.85,
"grad_norm": 1.9622392449880077,
"learning_rate": 5.873315905620685e-07,
"loss": 0.5619,
"step": 1388
},
{
"epoch": 0.85,
"grad_norm": 1.8986949233718315,
"learning_rate": 5.826799100652802e-07,
"loss": 0.5944,
"step": 1389
},
{
"epoch": 0.85,
"grad_norm": 2.0094350555686726,
"learning_rate": 5.780455835823767e-07,
"loss": 0.6029,
"step": 1390
},
{
"epoch": 0.85,
"grad_norm": 2.01549443606308,
"learning_rate": 5.734286293199065e-07,
"loss": 0.5168,
"step": 1391
},
{
"epoch": 0.85,
"grad_norm": 1.9026568713715968,
"learning_rate": 5.688290654161738e-07,
"loss": 0.4661,
"step": 1392
},
{
"epoch": 0.85,
"grad_norm": 2.090067485392997,
"learning_rate": 5.642469099411619e-07,
"loss": 0.5773,
"step": 1393
},
{
"epoch": 0.85,
"grad_norm": 2.041801412484887,
"learning_rate": 5.596821808964592e-07,
"loss": 0.5174,
"step": 1394
},
{
"epoch": 0.85,
"grad_norm": 1.8047865906862048,
"learning_rate": 5.551348962151965e-07,
"loss": 0.5096,
"step": 1395
},
{
"epoch": 0.85,
"grad_norm": 1.7749522813672998,
"learning_rate": 5.506050737619706e-07,
"loss": 0.4149,
"step": 1396
},
{
"epoch": 0.85,
"grad_norm": 1.9669167109351449,
"learning_rate": 5.460927313327746e-07,
"loss": 0.5318,
"step": 1397
},
{
"epoch": 0.85,
"grad_norm": 2.0851119947571397,
"learning_rate": 5.415978866549309e-07,
"loss": 0.5206,
"step": 1398
},
{
"epoch": 0.86,
"grad_norm": 1.770305337825321,
"learning_rate": 5.371205573870169e-07,
"loss": 0.5146,
"step": 1399
},
{
"epoch": 0.86,
"grad_norm": 2.0529803134924793,
"learning_rate": 5.326607611188023e-07,
"loss": 0.5925,
"step": 1400
},
{
"epoch": 0.86,
"grad_norm": 1.8408476413034762,
"learning_rate": 5.282185153711739e-07,
"loss": 0.5419,
"step": 1401
},
{
"epoch": 0.86,
"grad_norm": 2.1277879535451887,
"learning_rate": 5.237938375960683e-07,
"loss": 0.5522,
"step": 1402
},
{
"epoch": 0.86,
"grad_norm": 1.798009767692874,
"learning_rate": 5.19386745176405e-07,
"loss": 0.4908,
"step": 1403
},
{
"epoch": 0.86,
"grad_norm": 1.9218825894337166,
"learning_rate": 5.149972554260191e-07,
"loss": 0.5907,
"step": 1404
},
{
"epoch": 0.86,
"grad_norm": 1.7135646036349135,
"learning_rate": 5.106253855895865e-07,
"loss": 0.5325,
"step": 1405
},
{
"epoch": 0.86,
"grad_norm": 2.0051336782599916,
"learning_rate": 5.062711528425657e-07,
"loss": 0.552,
"step": 1406
},
{
"epoch": 0.86,
"grad_norm": 1.823540052411061,
"learning_rate": 5.019345742911241e-07,
"loss": 0.5279,
"step": 1407
},
{
"epoch": 0.86,
"grad_norm": 1.9498421802995456,
"learning_rate": 4.976156669720706e-07,
"loss": 0.4684,
"step": 1408
},
{
"epoch": 0.86,
"grad_norm": 2.0510089396163464,
"learning_rate": 4.933144478527929e-07,
"loss": 0.5733,
"step": 1409
},
{
"epoch": 0.86,
"grad_norm": 1.718921166851957,
"learning_rate": 4.890309338311861e-07,
"loss": 0.4503,
"step": 1410
},
{
"epoch": 0.86,
"grad_norm": 2.066841848267666,
"learning_rate": 4.847651417355914e-07,
"loss": 0.5523,
"step": 1411
},
{
"epoch": 0.86,
"grad_norm": 1.935126073825529,
"learning_rate": 4.805170883247228e-07,
"loss": 0.5709,
"step": 1412
},
{
"epoch": 0.86,
"grad_norm": 1.7529161435960845,
"learning_rate": 4.7628679028761114e-07,
"loss": 0.4784,
"step": 1413
},
{
"epoch": 0.86,
"grad_norm": 1.9418531589165946,
"learning_rate": 4.720742642435272e-07,
"loss": 0.5417,
"step": 1414
},
{
"epoch": 0.87,
"grad_norm": 2.256822591186601,
"learning_rate": 4.678795267419267e-07,
"loss": 0.5787,
"step": 1415
},
{
"epoch": 0.87,
"grad_norm": 1.7848727741194108,
"learning_rate": 4.63702594262378e-07,
"loss": 0.4068,
"step": 1416
},
{
"epoch": 0.87,
"grad_norm": 2.1119584563313314,
"learning_rate": 4.595434832145013e-07,
"loss": 0.6635,
"step": 1417
},
{
"epoch": 0.87,
"grad_norm": 2.047030257561458,
"learning_rate": 4.554022099379035e-07,
"loss": 0.5171,
"step": 1418
},
{
"epoch": 0.87,
"grad_norm": 1.9273060718802395,
"learning_rate": 4.5127879070211213e-07,
"loss": 0.5597,
"step": 1419
},
{
"epoch": 0.87,
"grad_norm": 2.1177945388342727,
"learning_rate": 4.471732417065144e-07,
"loss": 0.5861,
"step": 1420
},
{
"epoch": 0.87,
"grad_norm": 2.1663172290104002,
"learning_rate": 4.430855790802896e-07,
"loss": 0.5851,
"step": 1421
},
{
"epoch": 0.87,
"grad_norm": 2.1902483639800887,
"learning_rate": 4.3901581888235067e-07,
"loss": 0.5485,
"step": 1422
},
{
"epoch": 0.87,
"grad_norm": 2.27940192829255,
"learning_rate": 4.3496397710127756e-07,
"loss": 0.5683,
"step": 1423
},
{
"epoch": 0.87,
"grad_norm": 1.6787266853728975,
"learning_rate": 4.3093006965525483e-07,
"loss": 0.4487,
"step": 1424
},
{
"epoch": 0.87,
"grad_norm": 2.2173461979846554,
"learning_rate": 4.2691411239201007e-07,
"loss": 0.6181,
"step": 1425
},
{
"epoch": 0.87,
"grad_norm": 2.0362219497157663,
"learning_rate": 4.2291612108875226e-07,
"loss": 0.5827,
"step": 1426
},
{
"epoch": 0.87,
"grad_norm": 1.984683976482598,
"learning_rate": 4.189361114521062e-07,
"loss": 0.5687,
"step": 1427
},
{
"epoch": 0.87,
"grad_norm": 2.010835056818883,
"learning_rate": 4.149740991180573e-07,
"loss": 0.5484,
"step": 1428
},
{
"epoch": 0.87,
"grad_norm": 2.113774248995943,
"learning_rate": 4.1103009965188125e-07,
"loss": 0.598,
"step": 1429
},
{
"epoch": 0.87,
"grad_norm": 2.0151775021994385,
"learning_rate": 4.0710412854809255e-07,
"loss": 0.4896,
"step": 1430
},
{
"epoch": 0.87,
"grad_norm": 1.9514989153137001,
"learning_rate": 4.0319620123037697e-07,
"loss": 0.5659,
"step": 1431
},
{
"epoch": 0.88,
"grad_norm": 2.07614867510403,
"learning_rate": 3.9930633305153177e-07,
"loss": 0.4641,
"step": 1432
},
{
"epoch": 0.88,
"grad_norm": 2.2405590611515933,
"learning_rate": 3.9543453929340834e-07,
"loss": 0.5112,
"step": 1433
},
{
"epoch": 0.88,
"grad_norm": 2.14510497102877,
"learning_rate": 3.9158083516685043e-07,
"loss": 0.6867,
"step": 1434
},
{
"epoch": 0.88,
"grad_norm": 1.8206597677309004,
"learning_rate": 3.8774523581163236e-07,
"loss": 0.5024,
"step": 1435
},
{
"epoch": 0.88,
"grad_norm": 1.9730244343440717,
"learning_rate": 3.8392775629640275e-07,
"loss": 0.6115,
"step": 1436
},
{
"epoch": 0.88,
"grad_norm": 2.0113186406624677,
"learning_rate": 3.80128411618626e-07,
"loss": 0.5308,
"step": 1437
},
{
"epoch": 0.88,
"grad_norm": 2.291565453318076,
"learning_rate": 3.763472167045179e-07,
"loss": 0.5849,
"step": 1438
},
{
"epoch": 0.88,
"grad_norm": 1.865162322112732,
"learning_rate": 3.72584186408993e-07,
"loss": 0.4345,
"step": 1439
},
{
"epoch": 0.88,
"grad_norm": 1.7806466246249277,
"learning_rate": 3.688393355156022e-07,
"loss": 0.4976,
"step": 1440
},
{
"epoch": 0.88,
"grad_norm": 1.837932154589643,
"learning_rate": 3.6511267873647725e-07,
"loss": 0.5382,
"step": 1441
},
{
"epoch": 0.88,
"grad_norm": 1.900042396349841,
"learning_rate": 3.614042307122728e-07,
"loss": 0.5135,
"step": 1442
},
{
"epoch": 0.88,
"grad_norm": 2.0377705675722435,
"learning_rate": 3.577140060121059e-07,
"loss": 0.6439,
"step": 1443
},
{
"epoch": 0.88,
"grad_norm": 1.9812798873664048,
"learning_rate": 3.54042019133502e-07,
"loss": 0.518,
"step": 1444
},
{
"epoch": 0.88,
"grad_norm": 1.9801105899864375,
"learning_rate": 3.5038828450233874e-07,
"loss": 0.5513,
"step": 1445
},
{
"epoch": 0.88,
"grad_norm": 1.787558486401592,
"learning_rate": 3.4675281647278346e-07,
"loss": 0.4717,
"step": 1446
},
{
"epoch": 0.88,
"grad_norm": 1.7350169337556531,
"learning_rate": 3.431356293272442e-07,
"loss": 0.4517,
"step": 1447
},
{
"epoch": 0.89,
"grad_norm": 1.9036391339274414,
"learning_rate": 3.395367372763092e-07,
"loss": 0.4952,
"step": 1448
},
{
"epoch": 0.89,
"grad_norm": 2.1326612564420078,
"learning_rate": 3.3595615445869033e-07,
"loss": 0.665,
"step": 1449
},
{
"epoch": 0.89,
"grad_norm": 1.9766629375982392,
"learning_rate": 3.3239389494117316e-07,
"loss": 0.4712,
"step": 1450
},
{
"epoch": 0.89,
"grad_norm": 2.085629103255329,
"learning_rate": 3.288499727185529e-07,
"loss": 0.5991,
"step": 1451
},
{
"epoch": 0.89,
"grad_norm": 1.9952308886565442,
"learning_rate": 3.253244017135876e-07,
"loss": 0.5492,
"step": 1452
},
{
"epoch": 0.89,
"grad_norm": 1.9823172713501394,
"learning_rate": 3.218171957769411e-07,
"loss": 0.5133,
"step": 1453
},
{
"epoch": 0.89,
"grad_norm": 2.243504252708836,
"learning_rate": 3.183283686871236e-07,
"loss": 0.5375,
"step": 1454
},
{
"epoch": 0.89,
"grad_norm": 2.160814779756097,
"learning_rate": 3.1485793415044483e-07,
"loss": 0.5441,
"step": 1455
},
{
"epoch": 0.89,
"grad_norm": 2.1575822253594397,
"learning_rate": 3.1140590580095777e-07,
"loss": 0.5261,
"step": 1456
},
{
"epoch": 0.89,
"grad_norm": 2.072867103693364,
"learning_rate": 3.079722972004007e-07,
"loss": 0.528,
"step": 1457
},
{
"epoch": 0.89,
"grad_norm": 2.067717194924602,
"learning_rate": 3.0455712183815044e-07,
"loss": 0.5705,
"step": 1458
},
{
"epoch": 0.89,
"grad_norm": 2.2018910614851053,
"learning_rate": 3.011603931311652e-07,
"loss": 0.6087,
"step": 1459
},
{
"epoch": 0.89,
"grad_norm": 1.799733514347377,
"learning_rate": 2.9778212442393373e-07,
"loss": 0.3817,
"step": 1460
},
{
"epoch": 0.89,
"grad_norm": 2.0297829733559096,
"learning_rate": 2.9442232898842184e-07,
"loss": 0.5627,
"step": 1461
},
{
"epoch": 0.89,
"grad_norm": 2.1127229638235314,
"learning_rate": 2.910810200240205e-07,
"loss": 0.6539,
"step": 1462
},
{
"epoch": 0.89,
"grad_norm": 2.2173549226318148,
"learning_rate": 2.877582106574961e-07,
"loss": 0.6292,
"step": 1463
},
{
"epoch": 0.9,
"grad_norm": 1.7665892894148392,
"learning_rate": 2.8445391394293364e-07,
"loss": 0.536,
"step": 1464
},
{
"epoch": 0.9,
"grad_norm": 1.9459724264850673,
"learning_rate": 2.811681428616919e-07,
"loss": 0.506,
"step": 1465
},
{
"epoch": 0.9,
"grad_norm": 2.055519305865498,
"learning_rate": 2.779009103223473e-07,
"loss": 0.5743,
"step": 1466
},
{
"epoch": 0.9,
"grad_norm": 1.927080536920677,
"learning_rate": 2.746522291606463e-07,
"loss": 0.5181,
"step": 1467
},
{
"epoch": 0.9,
"grad_norm": 2.04096009635488,
"learning_rate": 2.7142211213945224e-07,
"loss": 0.564,
"step": 1468
},
{
"epoch": 0.9,
"grad_norm": 1.9317429342953267,
"learning_rate": 2.682105719486994e-07,
"loss": 0.5655,
"step": 1469
},
{
"epoch": 0.9,
"grad_norm": 2.1614143475246803,
"learning_rate": 2.65017621205339e-07,
"loss": 0.5385,
"step": 1470
},
{
"epoch": 0.9,
"grad_norm": 1.9518158323053523,
"learning_rate": 2.61843272453291e-07,
"loss": 0.5292,
"step": 1471
},
{
"epoch": 0.9,
"grad_norm": 1.988402222876475,
"learning_rate": 2.5868753816339574e-07,
"loss": 0.4855,
"step": 1472
},
{
"epoch": 0.9,
"grad_norm": 2.2461460760928973,
"learning_rate": 2.5555043073336394e-07,
"loss": 0.545,
"step": 1473
},
{
"epoch": 0.9,
"grad_norm": 2.016749712197434,
"learning_rate": 2.524319624877275e-07,
"loss": 0.5487,
"step": 1474
},
{
"epoch": 0.9,
"grad_norm": 1.8374656172629769,
"learning_rate": 2.4933214567779473e-07,
"loss": 0.4698,
"step": 1475
},
{
"epoch": 0.9,
"grad_norm": 2.1647972903983224,
"learning_rate": 2.462509924815948e-07,
"loss": 0.6418,
"step": 1476
},
{
"epoch": 0.9,
"grad_norm": 1.9502593053235608,
"learning_rate": 2.4318851500383823e-07,
"loss": 0.5,
"step": 1477
},
{
"epoch": 0.9,
"grad_norm": 1.9211282369606173,
"learning_rate": 2.4014472527586483e-07,
"loss": 0.4927,
"step": 1478
},
{
"epoch": 0.9,
"grad_norm": 2.329177425501773,
"learning_rate": 2.3711963525559544e-07,
"loss": 0.5993,
"step": 1479
},
{
"epoch": 0.9,
"grad_norm": 2.225800165473701,
"learning_rate": 2.3411325682748843e-07,
"loss": 0.6954,
"step": 1480
},
{
"epoch": 0.91,
"grad_norm": 2.0522726914562703,
"learning_rate": 2.3112560180249154e-07,
"loss": 0.5618,
"step": 1481
},
{
"epoch": 0.91,
"grad_norm": 2.1168420702658657,
"learning_rate": 2.2815668191799255e-07,
"loss": 0.5674,
"step": 1482
},
{
"epoch": 0.91,
"grad_norm": 2.032866002096309,
"learning_rate": 2.2520650883777917e-07,
"loss": 0.5903,
"step": 1483
},
{
"epoch": 0.91,
"grad_norm": 1.927079943417791,
"learning_rate": 2.222750941519869e-07,
"loss": 0.5379,
"step": 1484
},
{
"epoch": 0.91,
"grad_norm": 2.230233104141958,
"learning_rate": 2.193624493770591e-07,
"loss": 0.5362,
"step": 1485
},
{
"epoch": 0.91,
"grad_norm": 1.9871866256133242,
"learning_rate": 2.1646858595569754e-07,
"loss": 0.5402,
"step": 1486
},
{
"epoch": 0.91,
"grad_norm": 2.0451780616189827,
"learning_rate": 2.135935152568186e-07,
"loss": 0.5671,
"step": 1487
},
{
"epoch": 0.91,
"grad_norm": 1.9066201288251257,
"learning_rate": 2.107372485755105e-07,
"loss": 0.5467,
"step": 1488
},
{
"epoch": 0.91,
"grad_norm": 1.797950359009013,
"learning_rate": 2.0789979713298714e-07,
"loss": 0.5164,
"step": 1489
},
{
"epoch": 0.91,
"grad_norm": 2.071072453128202,
"learning_rate": 2.0508117207654276e-07,
"loss": 0.5991,
"step": 1490
},
{
"epoch": 0.91,
"grad_norm": 2.317654978785418,
"learning_rate": 2.0228138447951128e-07,
"loss": 0.6293,
"step": 1491
},
{
"epoch": 0.91,
"grad_norm": 2.053251641022129,
"learning_rate": 1.9950044534122138e-07,
"loss": 0.5853,
"step": 1492
},
{
"epoch": 0.91,
"grad_norm": 1.971312933014048,
"learning_rate": 1.9673836558695148e-07,
"loss": 0.4732,
"step": 1493
},
{
"epoch": 0.91,
"grad_norm": 1.9713907215979547,
"learning_rate": 1.9399515606789098e-07,
"loss": 0.6066,
"step": 1494
},
{
"epoch": 0.91,
"grad_norm": 1.9568482450287366,
"learning_rate": 1.9127082756109138e-07,
"loss": 0.547,
"step": 1495
},
{
"epoch": 0.91,
"grad_norm": 1.7163468064474119,
"learning_rate": 1.8856539076943126e-07,
"loss": 0.3999,
"step": 1496
},
{
"epoch": 0.92,
"grad_norm": 1.9178630187217227,
"learning_rate": 1.858788563215702e-07,
"loss": 0.5042,
"step": 1497
},
{
"epoch": 0.92,
"grad_norm": 2.127461173898097,
"learning_rate": 1.8321123477190506e-07,
"loss": 0.6439,
"step": 1498
},
{
"epoch": 0.92,
"grad_norm": 2.231384706660567,
"learning_rate": 1.8056253660053258e-07,
"loss": 0.5077,
"step": 1499
},
{
"epoch": 0.92,
"grad_norm": 1.9635997200687012,
"learning_rate": 1.7793277221320794e-07,
"loss": 0.5042,
"step": 1500
},
{
"epoch": 0.92,
"grad_norm": 2.073065458571614,
"learning_rate": 1.7532195194129964e-07,
"loss": 0.5212,
"step": 1501
},
{
"epoch": 0.92,
"grad_norm": 1.977411836335037,
"learning_rate": 1.7273008604175301e-07,
"loss": 0.5035,
"step": 1502
},
{
"epoch": 0.92,
"grad_norm": 1.9442884403854808,
"learning_rate": 1.7015718469705066e-07,
"loss": 0.5649,
"step": 1503
},
{
"epoch": 0.92,
"grad_norm": 1.9764790370004715,
"learning_rate": 1.6760325801516597e-07,
"loss": 0.5255,
"step": 1504
},
{
"epoch": 0.92,
"grad_norm": 1.9950729966418053,
"learning_rate": 1.6506831602953298e-07,
"loss": 0.5285,
"step": 1505
},
{
"epoch": 0.92,
"grad_norm": 1.876091437155584,
"learning_rate": 1.625523686989977e-07,
"loss": 0.4915,
"step": 1506
},
{
"epoch": 0.92,
"grad_norm": 2.1698152647396465,
"learning_rate": 1.6005542590778521e-07,
"loss": 0.6394,
"step": 1507
},
{
"epoch": 0.92,
"grad_norm": 1.9390084511570913,
"learning_rate": 1.5757749746546037e-07,
"loss": 0.5461,
"step": 1508
},
{
"epoch": 0.92,
"grad_norm": 2.0186157583922135,
"learning_rate": 1.5511859310688326e-07,
"loss": 0.5515,
"step": 1509
},
{
"epoch": 0.92,
"grad_norm": 1.908132809961028,
"learning_rate": 1.5267872249217997e-07,
"loss": 0.4557,
"step": 1510
},
{
"epoch": 0.92,
"grad_norm": 2.189140445691145,
"learning_rate": 1.5025789520669688e-07,
"loss": 0.5904,
"step": 1511
},
{
"epoch": 0.92,
"grad_norm": 2.1203241389390257,
"learning_rate": 1.4785612076096856e-07,
"loss": 0.5698,
"step": 1512
},
{
"epoch": 0.93,
"grad_norm": 1.8864144078816145,
"learning_rate": 1.454734085906756e-07,
"loss": 0.5211,
"step": 1513
},
{
"epoch": 0.93,
"grad_norm": 2.0640726129454867,
"learning_rate": 1.4310976805661237e-07,
"loss": 0.534,
"step": 1514
},
{
"epoch": 0.93,
"grad_norm": 1.9593391676293495,
"learning_rate": 1.407652084446459e-07,
"loss": 0.5575,
"step": 1515
},
{
"epoch": 0.93,
"grad_norm": 1.9956718882503213,
"learning_rate": 1.3843973896568275e-07,
"loss": 0.4995,
"step": 1516
},
{
"epoch": 0.93,
"grad_norm": 2.0309484325202027,
"learning_rate": 1.3613336875563045e-07,
"loss": 0.5561,
"step": 1517
},
{
"epoch": 0.93,
"grad_norm": 2.2977806815622808,
"learning_rate": 1.338461068753627e-07,
"loss": 0.6895,
"step": 1518
},
{
"epoch": 0.93,
"grad_norm": 1.9903192519492166,
"learning_rate": 1.3157796231068497e-07,
"loss": 0.5644,
"step": 1519
},
{
"epoch": 0.93,
"grad_norm": 1.9448808087386893,
"learning_rate": 1.293289439722961e-07,
"loss": 0.5146,
"step": 1520
},
{
"epoch": 0.93,
"grad_norm": 2.115645901357327,
"learning_rate": 1.2709906069575561e-07,
"loss": 0.5702,
"step": 1521
},
{
"epoch": 0.93,
"grad_norm": 2.1808080246002457,
"learning_rate": 1.2488832124144923e-07,
"loss": 0.4805,
"step": 1522
},
{
"epoch": 0.93,
"grad_norm": 1.8228170984929375,
"learning_rate": 1.2269673429455287e-07,
"loss": 0.4851,
"step": 1523
},
{
"epoch": 0.93,
"grad_norm": 2.038456785853388,
"learning_rate": 1.2052430846499984e-07,
"loss": 0.5771,
"step": 1524
},
{
"epoch": 0.93,
"grad_norm": 1.8904101603890644,
"learning_rate": 1.183710522874454e-07,
"loss": 0.4813,
"step": 1525
},
{
"epoch": 0.93,
"grad_norm": 1.7215684414215477,
"learning_rate": 1.1623697422123603e-07,
"loss": 0.4418,
"step": 1526
},
{
"epoch": 0.93,
"grad_norm": 1.949814139587883,
"learning_rate": 1.1412208265037417e-07,
"loss": 0.4467,
"step": 1527
},
{
"epoch": 0.93,
"grad_norm": 1.989698073002338,
"learning_rate": 1.1202638588348413e-07,
"loss": 0.479,
"step": 1528
},
{
"epoch": 0.93,
"grad_norm": 1.9796631752851332,
"learning_rate": 1.0994989215378227e-07,
"loss": 0.6001,
"step": 1529
},
{
"epoch": 0.94,
"grad_norm": 2.0989381422150837,
"learning_rate": 1.0789260961904357e-07,
"loss": 0.5106,
"step": 1530
},
{
"epoch": 0.94,
"grad_norm": 1.971183598874908,
"learning_rate": 1.0585454636156788e-07,
"loss": 0.5654,
"step": 1531
},
{
"epoch": 0.94,
"grad_norm": 1.8203580247750317,
"learning_rate": 1.0383571038815155e-07,
"loss": 0.4136,
"step": 1532
},
{
"epoch": 0.94,
"grad_norm": 2.1265452345564264,
"learning_rate": 1.0183610963005298e-07,
"loss": 0.6466,
"step": 1533
},
{
"epoch": 0.94,
"grad_norm": 1.9020416641784,
"learning_rate": 9.98557519429616e-08,
"loss": 0.5149,
"step": 1534
},
{
"epoch": 0.94,
"grad_norm": 1.9565333994726395,
"learning_rate": 9.789464510697011e-08,
"loss": 0.6182,
"step": 1535
},
{
"epoch": 0.94,
"grad_norm": 2.192339437616818,
"learning_rate": 9.595279682654002e-08,
"loss": 0.5793,
"step": 1536
},
{
"epoch": 0.94,
"grad_norm": 1.752208514931023,
"learning_rate": 9.40302147304739e-08,
"loss": 0.4573,
"step": 1537
},
{
"epoch": 0.94,
"grad_norm": 1.9487144688198685,
"learning_rate": 9.212690637188492e-08,
"loss": 0.5045,
"step": 1538
},
{
"epoch": 0.94,
"grad_norm": 2.1355012550891095,
"learning_rate": 9.024287922816566e-08,
"loss": 0.5376,
"step": 1539
},
{
"epoch": 0.94,
"grad_norm": 1.830404039518353,
"learning_rate": 8.83781407009604e-08,
"loss": 0.5084,
"step": 1540
},
{
"epoch": 0.94,
"grad_norm": 2.0061575400005927,
"learning_rate": 8.653269811613685e-08,
"loss": 0.5405,
"step": 1541
},
{
"epoch": 0.94,
"grad_norm": 1.9051869282056297,
"learning_rate": 8.4706558723755e-08,
"loss": 0.4705,
"step": 1542
},
{
"epoch": 0.94,
"grad_norm": 1.970236377135854,
"learning_rate": 8.289972969803884e-08,
"loss": 0.4761,
"step": 1543
},
{
"epoch": 0.94,
"grad_norm": 2.393702968269512,
"learning_rate": 8.111221813735137e-08,
"loss": 0.5913,
"step": 1544
},
{
"epoch": 0.94,
"grad_norm": 2.1643166252959807,
"learning_rate": 7.934403106416245e-08,
"loss": 0.6399,
"step": 1545
},
{
"epoch": 0.95,
"grad_norm": 2.24965238428619,
"learning_rate": 7.759517542502426e-08,
"loss": 0.5946,
"step": 1546
},
{
"epoch": 0.95,
"grad_norm": 2.0038056323845663,
"learning_rate": 7.586565809054258e-08,
"loss": 0.5606,
"step": 1547
},
{
"epoch": 0.95,
"grad_norm": 2.1007936583122397,
"learning_rate": 7.415548585534949e-08,
"loss": 0.6222,
"step": 1548
},
{
"epoch": 0.95,
"grad_norm": 2.0990617623271497,
"learning_rate": 7.246466543807951e-08,
"loss": 0.6033,
"step": 1549
},
{
"epoch": 0.95,
"grad_norm": 2.235834211479321,
"learning_rate": 7.0793203481338e-08,
"loss": 0.5658,
"step": 1550
},
{
"epoch": 0.95,
"grad_norm": 2.0971634187847843,
"learning_rate": 6.914110655168005e-08,
"loss": 0.5197,
"step": 1551
},
{
"epoch": 0.95,
"grad_norm": 2.2723168272777845,
"learning_rate": 6.750838113958381e-08,
"loss": 0.5444,
"step": 1552
},
{
"epoch": 0.95,
"grad_norm": 2.14031105890706,
"learning_rate": 6.589503365941996e-08,
"loss": 0.5484,
"step": 1553
},
{
"epoch": 0.95,
"grad_norm": 1.9836284579510188,
"learning_rate": 6.430107044943512e-08,
"loss": 0.5281,
"step": 1554
},
{
"epoch": 0.95,
"grad_norm": 1.8255909265594834,
"learning_rate": 6.272649777171902e-08,
"loss": 0.4866,
"step": 1555
},
{
"epoch": 0.95,
"grad_norm": 2.107443031046224,
"learning_rate": 6.117132181218454e-08,
"loss": 0.5199,
"step": 1556
},
{
"epoch": 0.95,
"grad_norm": 2.1682405735610626,
"learning_rate": 5.963554868054167e-08,
"loss": 0.539,
"step": 1557
},
{
"epoch": 0.95,
"grad_norm": 2.24775315981034,
"learning_rate": 5.8119184410274085e-08,
"loss": 0.5139,
"step": 1558
},
{
"epoch": 0.95,
"grad_norm": 1.998700285054948,
"learning_rate": 5.662223495861596e-08,
"loss": 0.5518,
"step": 1559
},
{
"epoch": 0.95,
"grad_norm": 2.006720101430102,
"learning_rate": 5.5144706206525235e-08,
"loss": 0.5034,
"step": 1560
},
{
"epoch": 0.95,
"grad_norm": 1.9240681602880376,
"learning_rate": 5.368660395866643e-08,
"loss": 0.5566,
"step": 1561
},
{
"epoch": 0.96,
"grad_norm": 2.367451906830005,
"learning_rate": 5.2247933943382344e-08,
"loss": 0.6171,
"step": 1562
},
{
"epoch": 0.96,
"grad_norm": 1.9591190023164184,
"learning_rate": 5.0828701812674074e-08,
"loss": 0.5367,
"step": 1563
},
{
"epoch": 0.96,
"grad_norm": 2.0598213561773577,
"learning_rate": 4.94289131421799e-08,
"loss": 0.5737,
"step": 1564
},
{
"epoch": 0.96,
"grad_norm": 2.0711836264717634,
"learning_rate": 4.804857343114977e-08,
"loss": 0.5522,
"step": 1565
},
{
"epoch": 0.96,
"grad_norm": 1.803888225967423,
"learning_rate": 4.668768810242752e-08,
"loss": 0.441,
"step": 1566
},
{
"epoch": 0.96,
"grad_norm": 2.0820741333254507,
"learning_rate": 4.534626250242702e-08,
"loss": 0.5394,
"step": 1567
},
{
"epoch": 0.96,
"grad_norm": 2.2847763509593477,
"learning_rate": 4.4024301901113285e-08,
"loss": 0.5254,
"step": 1568
},
{
"epoch": 0.96,
"grad_norm": 1.9411071290247621,
"learning_rate": 4.2721811491978626e-08,
"loss": 0.5702,
"step": 1569
},
{
"epoch": 0.96,
"grad_norm": 1.745037379504202,
"learning_rate": 4.1438796392025416e-08,
"loss": 0.4511,
"step": 1570
},
{
"epoch": 0.96,
"grad_norm": 2.24676269385492,
"learning_rate": 4.017526164174501e-08,
"loss": 0.5475,
"step": 1571
},
{
"epoch": 0.96,
"grad_norm": 1.8719954199651734,
"learning_rate": 3.8931212205096655e-08,
"loss": 0.5167,
"step": 1572
},
{
"epoch": 0.96,
"grad_norm": 2.1782725251581443,
"learning_rate": 3.770665296949028e-08,
"loss": 0.583,
"step": 1573
},
{
"epoch": 0.96,
"grad_norm": 2.084620351446328,
"learning_rate": 3.650158874576537e-08,
"loss": 0.5573,
"step": 1574
},
{
"epoch": 0.96,
"grad_norm": 1.9942648310254383,
"learning_rate": 3.5316024268172713e-08,
"loss": 0.5195,
"step": 1575
},
{
"epoch": 0.96,
"grad_norm": 1.9276106667007256,
"learning_rate": 3.41499641943549e-08,
"loss": 0.461,
"step": 1576
},
{
"epoch": 0.96,
"grad_norm": 2.0002769940610676,
"learning_rate": 3.3003413105331396e-08,
"loss": 0.5253,
"step": 1577
},
{
"epoch": 0.96,
"grad_norm": 2.3233564755810336,
"learning_rate": 3.187637550547573e-08,
"loss": 0.6343,
"step": 1578
},
{
"epoch": 0.97,
"grad_norm": 1.9669404121882792,
"learning_rate": 3.076885582250111e-08,
"loss": 0.5298,
"step": 1579
},
{
"epoch": 0.97,
"grad_norm": 2.160706870886176,
"learning_rate": 2.9680858407441503e-08,
"loss": 0.5412,
"step": 1580
},
{
"epoch": 0.97,
"grad_norm": 2.1010182979674363,
"learning_rate": 2.8612387534636687e-08,
"loss": 0.5874,
"step": 1581
},
{
"epoch": 0.97,
"grad_norm": 2.3185770521155464,
"learning_rate": 2.756344740171224e-08,
"loss": 0.5676,
"step": 1582
},
{
"epoch": 0.97,
"grad_norm": 1.928726797539864,
"learning_rate": 2.653404212956512e-08,
"loss": 0.5199,
"step": 1583
},
{
"epoch": 0.97,
"grad_norm": 2.374638635528354,
"learning_rate": 2.552417576234756e-08,
"loss": 0.5822,
"step": 1584
},
{
"epoch": 0.97,
"grad_norm": 2.1312842535423666,
"learning_rate": 2.4533852267450976e-08,
"loss": 0.5486,
"step": 1585
},
{
"epoch": 0.97,
"grad_norm": 1.887474697390811,
"learning_rate": 2.3563075535487646e-08,
"loss": 0.5318,
"step": 1586
},
{
"epoch": 0.97,
"grad_norm": 2.162989562166336,
"learning_rate": 2.2611849380280715e-08,
"loss": 0.5646,
"step": 1587
},
{
"epoch": 0.97,
"grad_norm": 1.8865197403491825,
"learning_rate": 2.1680177538845882e-08,
"loss": 0.511,
"step": 1588
},
{
"epoch": 0.97,
"grad_norm": 2.0221844144721084,
"learning_rate": 2.0768063671375292e-08,
"loss": 0.5605,
"step": 1589
},
{
"epoch": 0.97,
"grad_norm": 1.7341292361915335,
"learning_rate": 1.9875511361227562e-08,
"loss": 0.5054,
"step": 1590
},
{
"epoch": 0.97,
"grad_norm": 2.1625957875579194,
"learning_rate": 1.9002524114909438e-08,
"loss": 0.5484,
"step": 1591
},
{
"epoch": 0.97,
"grad_norm": 1.890787277810342,
"learning_rate": 1.8149105362064157e-08,
"loss": 0.4912,
"step": 1592
},
{
"epoch": 0.97,
"grad_norm": 1.89646065518405,
"learning_rate": 1.731525845545812e-08,
"loss": 0.4987,
"step": 1593
},
{
"epoch": 0.97,
"grad_norm": 2.000138281803427,
"learning_rate": 1.6500986670966444e-08,
"loss": 0.5334,
"step": 1594
},
{
"epoch": 0.98,
"grad_norm": 1.9360765732063108,
"learning_rate": 1.5706293207561896e-08,
"loss": 0.5272,
"step": 1595
},
{
"epoch": 0.98,
"grad_norm": 1.8899446974988472,
"learning_rate": 1.4931181187300413e-08,
"loss": 0.5351,
"step": 1596
},
{
"epoch": 0.98,
"grad_norm": 2.083452401449017,
"learning_rate": 1.4175653655309484e-08,
"loss": 0.5654,
"step": 1597
},
{
"epoch": 0.98,
"grad_norm": 1.7902773793269389,
"learning_rate": 1.3439713579777025e-08,
"loss": 0.4598,
"step": 1598
},
{
"epoch": 0.98,
"grad_norm": 1.9347948196954086,
"learning_rate": 1.2723363851939175e-08,
"loss": 0.5546,
"step": 1599
},
{
"epoch": 0.98,
"grad_norm": 2.1793801518923117,
"learning_rate": 1.2026607286068637e-08,
"loss": 0.5184,
"step": 1600
},
{
"epoch": 0.98,
"grad_norm": 2.1162767261686497,
"learning_rate": 1.1349446619463578e-08,
"loss": 0.5576,
"step": 1601
},
{
"epoch": 0.98,
"grad_norm": 1.9064679554702675,
"learning_rate": 1.0691884512437078e-08,
"loss": 0.5593,
"step": 1602
},
{
"epoch": 0.98,
"grad_norm": 1.9599229582469748,
"learning_rate": 1.0053923548307698e-08,
"loss": 0.5226,
"step": 1603
},
{
"epoch": 0.98,
"grad_norm": 2.3703310743026367,
"learning_rate": 9.435566233387261e-09,
"loss": 0.6998,
"step": 1604
},
{
"epoch": 0.98,
"grad_norm": 1.8265190173903945,
"learning_rate": 8.836814996971977e-09,
"loss": 0.5149,
"step": 1605
},
{
"epoch": 0.98,
"grad_norm": 1.919991660951489,
"learning_rate": 8.257672191334664e-09,
"loss": 0.5058,
"step": 1606
},
{
"epoch": 0.98,
"grad_norm": 2.226643529735206,
"learning_rate": 7.698140091712547e-09,
"loss": 0.5828,
"step": 1607
},
{
"epoch": 0.98,
"grad_norm": 1.971584660303256,
"learning_rate": 7.158220896298917e-09,
"loss": 0.5688,
"step": 1608
},
{
"epoch": 0.98,
"grad_norm": 2.0480628954718223,
"learning_rate": 6.637916726237592e-09,
"loss": 0.5851,
"step": 1609
},
{
"epoch": 0.98,
"grad_norm": 2.2325632034699905,
"learning_rate": 6.1372296256101414e-09,
"loss": 0.5965,
"step": 1610
},
{
"epoch": 0.99,
"grad_norm": 2.0574293300744557,
"learning_rate": 5.6561615614314505e-09,
"loss": 0.5216,
"step": 1611
},
{
"epoch": 0.99,
"grad_norm": 2.0626282148992625,
"learning_rate": 5.194714423638059e-09,
"loss": 0.5443,
"step": 1612
},
{
"epoch": 0.99,
"grad_norm": 1.8130730344455217,
"learning_rate": 4.752890025086499e-09,
"loss": 0.5109,
"step": 1613
},
{
"epoch": 0.99,
"grad_norm": 2.1550184229788463,
"learning_rate": 4.330690101539969e-09,
"loss": 0.6121,
"step": 1614
},
{
"epoch": 0.99,
"grad_norm": 1.795454334127577,
"learning_rate": 3.928116311666119e-09,
"loss": 0.4971,
"step": 1615
},
{
"epoch": 0.99,
"grad_norm": 1.8886493347103483,
"learning_rate": 3.5451702370281616e-09,
"loss": 0.4622,
"step": 1616
},
{
"epoch": 0.99,
"grad_norm": 1.7323735035854002,
"learning_rate": 3.181853382079325e-09,
"loss": 0.4841,
"step": 1617
},
{
"epoch": 0.99,
"grad_norm": 1.9823067756781059,
"learning_rate": 2.8381671741567475e-09,
"loss": 0.5521,
"step": 1618
},
{
"epoch": 0.99,
"grad_norm": 2.127563659134435,
"learning_rate": 2.514112963476478e-09,
"loss": 0.5391,
"step": 1619
},
{
"epoch": 0.99,
"grad_norm": 2.124869138735419,
"learning_rate": 2.209692023126819e-09,
"loss": 0.6062,
"step": 1620
},
{
"epoch": 0.99,
"grad_norm": 1.986396902793689,
"learning_rate": 1.9249055490655477e-09,
"loss": 0.4799,
"step": 1621
},
{
"epoch": 0.99,
"grad_norm": 1.9573989864120591,
"learning_rate": 1.6597546601127001e-09,
"loss": 0.5542,
"step": 1622
},
{
"epoch": 0.99,
"grad_norm": 1.8291458436134207,
"learning_rate": 1.4142403979483522e-09,
"loss": 0.4648,
"step": 1623
},
{
"epoch": 0.99,
"grad_norm": 2.0950932153675312,
"learning_rate": 1.1883637271065118e-09,
"loss": 0.5056,
"step": 1624
},
{
"epoch": 0.99,
"grad_norm": 1.8233239812965405,
"learning_rate": 9.821255349734548e-10,
"loss": 0.5067,
"step": 1625
},
{
"epoch": 0.99,
"grad_norm": 2.1447666317815934,
"learning_rate": 7.955266317821731e-10,
"loss": 0.5142,
"step": 1626
},
{
"epoch": 0.99,
"grad_norm": 2.0265645163369683,
"learning_rate": 6.28567750610709e-10,
"loss": 0.5296,
"step": 1627
},
{
"epoch": 1.0,
"grad_norm": 1.8561507428186768,
"learning_rate": 4.812495473788259e-10,
"loss": 0.5011,
"step": 1628
},
{
"epoch": 1.0,
"grad_norm": 2.2576656272188163,
"learning_rate": 3.5357260084523114e-10,
"loss": 0.5452,
"step": 1629
},
{
"epoch": 1.0,
"grad_norm": 2.224588449835144,
"learning_rate": 2.4553741260535667e-10,
"loss": 0.56,
"step": 1630
},
{
"epoch": 1.0,
"grad_norm": 2.027404547376381,
"learning_rate": 1.5714440708913815e-10,
"loss": 0.5433,
"step": 1631
},
{
"epoch": 1.0,
"grad_norm": 1.9249522577325628,
"learning_rate": 8.839393155990472e-11,
"loss": 0.5585,
"step": 1632
},
{
"epoch": 1.0,
"grad_norm": 1.916155539003378,
"learning_rate": 3.9286256113268973e-11,
"loss": 0.5188,
"step": 1633
},
{
"epoch": 1.0,
"grad_norm": 2.0829325561732537,
"learning_rate": 9.821573674906326e-12,
"loss": 0.5721,
"step": 1634
},
{
"epoch": 1.0,
"grad_norm": 2.060500339504095,
"learning_rate": 0.0,
"loss": 0.5242,
"step": 1635
},
{
"epoch": 1.0,
"step": 1635,
"total_flos": 669628105687040.0,
"train_loss": 0.6020827626780997,
"train_runtime": 53189.986,
"train_samples_per_second": 3.936,
"train_steps_per_second": 0.031
}
],
"logging_steps": 1.0,
"max_steps": 1635,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"total_flos": 669628105687040.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}