synthetic1 / trainer_state.json
sedrickkeh's picture
End of training
14a5b4a verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.998016660055534,
"eval_steps": 500,
"global_step": 1575,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0031733439111463705,
"grad_norm": 6.348144432543206,
"learning_rate": 5.063291139240507e-07,
"loss": 0.8982,
"step": 1
},
{
"epoch": 0.006346687822292741,
"grad_norm": 6.383645545622549,
"learning_rate": 1.0126582278481013e-06,
"loss": 0.9021,
"step": 2
},
{
"epoch": 0.009520031733439112,
"grad_norm": 6.2539360712589085,
"learning_rate": 1.518987341772152e-06,
"loss": 0.8937,
"step": 3
},
{
"epoch": 0.012693375644585482,
"grad_norm": 6.225921771088206,
"learning_rate": 2.0253164556962026e-06,
"loss": 0.8949,
"step": 4
},
{
"epoch": 0.01586671955573185,
"grad_norm": 5.8423675974230775,
"learning_rate": 2.5316455696202535e-06,
"loss": 0.8753,
"step": 5
},
{
"epoch": 0.019040063466878223,
"grad_norm": 4.757379141562461,
"learning_rate": 3.037974683544304e-06,
"loss": 0.8416,
"step": 6
},
{
"epoch": 0.022213407378024592,
"grad_norm": 4.272314367321253,
"learning_rate": 3.544303797468355e-06,
"loss": 0.8311,
"step": 7
},
{
"epoch": 0.025386751289170964,
"grad_norm": 2.4318992207766845,
"learning_rate": 4.050632911392405e-06,
"loss": 0.7784,
"step": 8
},
{
"epoch": 0.028560095200317333,
"grad_norm": 2.248095466527857,
"learning_rate": 4.556962025316456e-06,
"loss": 0.7769,
"step": 9
},
{
"epoch": 0.0317334391114637,
"grad_norm": 4.2093273786875836,
"learning_rate": 5.063291139240507e-06,
"loss": 0.7732,
"step": 10
},
{
"epoch": 0.03490678302261008,
"grad_norm": 4.309584499183112,
"learning_rate": 5.569620253164557e-06,
"loss": 0.7619,
"step": 11
},
{
"epoch": 0.038080126933756446,
"grad_norm": 4.357671892488039,
"learning_rate": 6.075949367088608e-06,
"loss": 0.7564,
"step": 12
},
{
"epoch": 0.041253470844902815,
"grad_norm": 3.6078600546011184,
"learning_rate": 6.582278481012659e-06,
"loss": 0.7009,
"step": 13
},
{
"epoch": 0.044426814756049184,
"grad_norm": 3.651249913873882,
"learning_rate": 7.08860759493671e-06,
"loss": 0.699,
"step": 14
},
{
"epoch": 0.04760015866719556,
"grad_norm": 2.9003418519249107,
"learning_rate": 7.5949367088607605e-06,
"loss": 0.6807,
"step": 15
},
{
"epoch": 0.05077350257834193,
"grad_norm": 2.0070454552304744,
"learning_rate": 8.10126582278481e-06,
"loss": 0.6622,
"step": 16
},
{
"epoch": 0.0539468464894883,
"grad_norm": 1.6956361687151162,
"learning_rate": 8.607594936708861e-06,
"loss": 0.6405,
"step": 17
},
{
"epoch": 0.057120190400634666,
"grad_norm": 2.212305308576385,
"learning_rate": 9.113924050632912e-06,
"loss": 0.6294,
"step": 18
},
{
"epoch": 0.06029353431178104,
"grad_norm": 2.2094016296877634,
"learning_rate": 9.620253164556963e-06,
"loss": 0.6225,
"step": 19
},
{
"epoch": 0.0634668782229274,
"grad_norm": 1.6141377735329752,
"learning_rate": 1.0126582278481014e-05,
"loss": 0.611,
"step": 20
},
{
"epoch": 0.06664022213407378,
"grad_norm": 1.204912005316192,
"learning_rate": 1.0632911392405063e-05,
"loss": 0.6073,
"step": 21
},
{
"epoch": 0.06981356604522015,
"grad_norm": 1.3322615853509538,
"learning_rate": 1.1139240506329114e-05,
"loss": 0.5935,
"step": 22
},
{
"epoch": 0.07298690995636652,
"grad_norm": 1.3306388942331462,
"learning_rate": 1.1645569620253165e-05,
"loss": 0.5827,
"step": 23
},
{
"epoch": 0.07616025386751289,
"grad_norm": 0.8751536661021161,
"learning_rate": 1.2151898734177216e-05,
"loss": 0.5805,
"step": 24
},
{
"epoch": 0.07933359777865927,
"grad_norm": 0.9015772829079778,
"learning_rate": 1.2658227848101268e-05,
"loss": 0.5726,
"step": 25
},
{
"epoch": 0.08250694168980563,
"grad_norm": 0.9818736669411537,
"learning_rate": 1.3164556962025317e-05,
"loss": 0.5683,
"step": 26
},
{
"epoch": 0.085680285600952,
"grad_norm": 0.6758237512246137,
"learning_rate": 1.3670886075949368e-05,
"loss": 0.5554,
"step": 27
},
{
"epoch": 0.08885362951209837,
"grad_norm": 0.7798065896044373,
"learning_rate": 1.417721518987342e-05,
"loss": 0.562,
"step": 28
},
{
"epoch": 0.09202697342324474,
"grad_norm": 0.5943455450936407,
"learning_rate": 1.468354430379747e-05,
"loss": 0.5516,
"step": 29
},
{
"epoch": 0.09520031733439112,
"grad_norm": 0.5360484272133458,
"learning_rate": 1.5189873417721521e-05,
"loss": 0.5466,
"step": 30
},
{
"epoch": 0.09837366124553748,
"grad_norm": 0.6427312239326713,
"learning_rate": 1.5696202531645572e-05,
"loss": 0.5282,
"step": 31
},
{
"epoch": 0.10154700515668386,
"grad_norm": 0.52852465695618,
"learning_rate": 1.620253164556962e-05,
"loss": 0.5358,
"step": 32
},
{
"epoch": 0.10472034906783023,
"grad_norm": 0.5252420797220524,
"learning_rate": 1.6708860759493674e-05,
"loss": 0.5292,
"step": 33
},
{
"epoch": 0.1078936929789766,
"grad_norm": 0.6243715389867477,
"learning_rate": 1.7215189873417723e-05,
"loss": 0.5371,
"step": 34
},
{
"epoch": 0.11106703689012297,
"grad_norm": 0.4003457981794737,
"learning_rate": 1.7721518987341772e-05,
"loss": 0.5186,
"step": 35
},
{
"epoch": 0.11424038080126933,
"grad_norm": 0.5457989224110974,
"learning_rate": 1.8227848101265824e-05,
"loss": 0.5223,
"step": 36
},
{
"epoch": 0.1174137247124157,
"grad_norm": 0.3580435227349059,
"learning_rate": 1.8734177215189874e-05,
"loss": 0.5158,
"step": 37
},
{
"epoch": 0.12058706862356208,
"grad_norm": 0.4663870760426878,
"learning_rate": 1.9240506329113926e-05,
"loss": 0.518,
"step": 38
},
{
"epoch": 0.12376041253470844,
"grad_norm": 0.4046971344951154,
"learning_rate": 1.974683544303798e-05,
"loss": 0.5107,
"step": 39
},
{
"epoch": 0.1269337564458548,
"grad_norm": 0.41758513906570793,
"learning_rate": 2.0253164556962028e-05,
"loss": 0.5124,
"step": 40
},
{
"epoch": 0.13010710035700118,
"grad_norm": 0.4062531142420596,
"learning_rate": 2.0759493670886077e-05,
"loss": 0.504,
"step": 41
},
{
"epoch": 0.13328044426814756,
"grad_norm": 0.3998294866521708,
"learning_rate": 2.1265822784810126e-05,
"loss": 0.5025,
"step": 42
},
{
"epoch": 0.13645378817929393,
"grad_norm": 0.3867028742210112,
"learning_rate": 2.177215189873418e-05,
"loss": 0.5043,
"step": 43
},
{
"epoch": 0.1396271320904403,
"grad_norm": 0.43575380555266197,
"learning_rate": 2.2278481012658228e-05,
"loss": 0.4972,
"step": 44
},
{
"epoch": 0.14280047600158668,
"grad_norm": 0.6078264125881937,
"learning_rate": 2.278481012658228e-05,
"loss": 0.4975,
"step": 45
},
{
"epoch": 0.14597381991273303,
"grad_norm": 0.973098853907099,
"learning_rate": 2.329113924050633e-05,
"loss": 0.4998,
"step": 46
},
{
"epoch": 0.1491471638238794,
"grad_norm": 1.2032473307606681,
"learning_rate": 2.379746835443038e-05,
"loss": 0.5023,
"step": 47
},
{
"epoch": 0.15232050773502578,
"grad_norm": 0.517082628902644,
"learning_rate": 2.430379746835443e-05,
"loss": 0.4926,
"step": 48
},
{
"epoch": 0.15549385164617216,
"grad_norm": 0.8487019755795518,
"learning_rate": 2.481012658227848e-05,
"loss": 0.4981,
"step": 49
},
{
"epoch": 0.15866719555731854,
"grad_norm": 1.0951673394024417,
"learning_rate": 2.5316455696202537e-05,
"loss": 0.4962,
"step": 50
},
{
"epoch": 0.16184053946846488,
"grad_norm": 0.755728182616416,
"learning_rate": 2.5822784810126586e-05,
"loss": 0.4962,
"step": 51
},
{
"epoch": 0.16501388337961126,
"grad_norm": 1.3057901440899915,
"learning_rate": 2.6329113924050635e-05,
"loss": 0.4865,
"step": 52
},
{
"epoch": 0.16818722729075763,
"grad_norm": 0.6650026611670792,
"learning_rate": 2.6835443037974687e-05,
"loss": 0.4895,
"step": 53
},
{
"epoch": 0.171360571201904,
"grad_norm": 0.7738295996967581,
"learning_rate": 2.7341772151898737e-05,
"loss": 0.4968,
"step": 54
},
{
"epoch": 0.1745339151130504,
"grad_norm": 0.9000555872432531,
"learning_rate": 2.784810126582279e-05,
"loss": 0.4909,
"step": 55
},
{
"epoch": 0.17770725902419673,
"grad_norm": 0.8162914872438413,
"learning_rate": 2.835443037974684e-05,
"loss": 0.4834,
"step": 56
},
{
"epoch": 0.1808806029353431,
"grad_norm": 0.7906164914396787,
"learning_rate": 2.8860759493670888e-05,
"loss": 0.4865,
"step": 57
},
{
"epoch": 0.18405394684648949,
"grad_norm": 0.9490020845109697,
"learning_rate": 2.936708860759494e-05,
"loss": 0.4908,
"step": 58
},
{
"epoch": 0.18722729075763586,
"grad_norm": 0.7752131353512624,
"learning_rate": 2.987341772151899e-05,
"loss": 0.4782,
"step": 59
},
{
"epoch": 0.19040063466878224,
"grad_norm": 0.84561035331472,
"learning_rate": 3.0379746835443042e-05,
"loss": 0.4703,
"step": 60
},
{
"epoch": 0.1935739785799286,
"grad_norm": 0.5602877470932804,
"learning_rate": 3.088607594936709e-05,
"loss": 0.4775,
"step": 61
},
{
"epoch": 0.19674732249107496,
"grad_norm": 0.6821307536004709,
"learning_rate": 3.1392405063291144e-05,
"loss": 0.4773,
"step": 62
},
{
"epoch": 0.19992066640222134,
"grad_norm": 0.717968030563759,
"learning_rate": 3.1898734177215196e-05,
"loss": 0.4742,
"step": 63
},
{
"epoch": 0.2030940103133677,
"grad_norm": 0.8033651165413181,
"learning_rate": 3.240506329113924e-05,
"loss": 0.4721,
"step": 64
},
{
"epoch": 0.2062673542245141,
"grad_norm": 0.8192446128586057,
"learning_rate": 3.2911392405063295e-05,
"loss": 0.4772,
"step": 65
},
{
"epoch": 0.20944069813566046,
"grad_norm": 1.431814362557968,
"learning_rate": 3.341772151898735e-05,
"loss": 0.4833,
"step": 66
},
{
"epoch": 0.2126140420468068,
"grad_norm": 0.8878416063140385,
"learning_rate": 3.392405063291139e-05,
"loss": 0.4769,
"step": 67
},
{
"epoch": 0.2157873859579532,
"grad_norm": 0.8851153809304894,
"learning_rate": 3.4430379746835445e-05,
"loss": 0.4702,
"step": 68
},
{
"epoch": 0.21896072986909956,
"grad_norm": 0.8786193628266902,
"learning_rate": 3.49367088607595e-05,
"loss": 0.4685,
"step": 69
},
{
"epoch": 0.22213407378024594,
"grad_norm": 0.7405283080781369,
"learning_rate": 3.5443037974683544e-05,
"loss": 0.4717,
"step": 70
},
{
"epoch": 0.22530741769139231,
"grad_norm": 1.043777209641996,
"learning_rate": 3.5949367088607596e-05,
"loss": 0.4739,
"step": 71
},
{
"epoch": 0.22848076160253866,
"grad_norm": 1.3637808792967987,
"learning_rate": 3.645569620253165e-05,
"loss": 0.4609,
"step": 72
},
{
"epoch": 0.23165410551368504,
"grad_norm": 0.5922364571167895,
"learning_rate": 3.69620253164557e-05,
"loss": 0.4633,
"step": 73
},
{
"epoch": 0.2348274494248314,
"grad_norm": 1.092592419482634,
"learning_rate": 3.746835443037975e-05,
"loss": 0.4709,
"step": 74
},
{
"epoch": 0.2380007933359778,
"grad_norm": 1.3150716961544304,
"learning_rate": 3.79746835443038e-05,
"loss": 0.4645,
"step": 75
},
{
"epoch": 0.24117413724712417,
"grad_norm": 0.8868277668057346,
"learning_rate": 3.848101265822785e-05,
"loss": 0.4701,
"step": 76
},
{
"epoch": 0.24434748115827054,
"grad_norm": 1.0869800415170585,
"learning_rate": 3.89873417721519e-05,
"loss": 0.4646,
"step": 77
},
{
"epoch": 0.2475208250694169,
"grad_norm": 1.0125334083010975,
"learning_rate": 3.949367088607596e-05,
"loss": 0.4678,
"step": 78
},
{
"epoch": 0.2506941689805633,
"grad_norm": 1.312025871541637,
"learning_rate": 4e-05,
"loss": 0.4689,
"step": 79
},
{
"epoch": 0.2538675128917096,
"grad_norm": 0.7660489575183855,
"learning_rate": 4.0506329113924056e-05,
"loss": 0.4627,
"step": 80
},
{
"epoch": 0.257040856802856,
"grad_norm": 1.0627032211180467,
"learning_rate": 4.10126582278481e-05,
"loss": 0.4665,
"step": 81
},
{
"epoch": 0.26021420071400236,
"grad_norm": 1.2004403182979329,
"learning_rate": 4.1518987341772154e-05,
"loss": 0.463,
"step": 82
},
{
"epoch": 0.26338754462514874,
"grad_norm": 0.8697213717507531,
"learning_rate": 4.202531645569621e-05,
"loss": 0.4639,
"step": 83
},
{
"epoch": 0.2665608885362951,
"grad_norm": 0.7770576209264962,
"learning_rate": 4.253164556962025e-05,
"loss": 0.4636,
"step": 84
},
{
"epoch": 0.2697342324474415,
"grad_norm": 1.006934185356337,
"learning_rate": 4.3037974683544305e-05,
"loss": 0.4687,
"step": 85
},
{
"epoch": 0.27290757635858787,
"grad_norm": 1.7918476049624474,
"learning_rate": 4.354430379746836e-05,
"loss": 0.4775,
"step": 86
},
{
"epoch": 0.27608092026973424,
"grad_norm": 1.0471693596288953,
"learning_rate": 4.405063291139241e-05,
"loss": 0.4736,
"step": 87
},
{
"epoch": 0.2792542641808806,
"grad_norm": 1.4597496826948222,
"learning_rate": 4.4556962025316456e-05,
"loss": 0.4786,
"step": 88
},
{
"epoch": 0.282427608092027,
"grad_norm": 1.400079131837375,
"learning_rate": 4.506329113924051e-05,
"loss": 0.4699,
"step": 89
},
{
"epoch": 0.28560095200317337,
"grad_norm": 1.1646233119330298,
"learning_rate": 4.556962025316456e-05,
"loss": 0.4721,
"step": 90
},
{
"epoch": 0.2887742959143197,
"grad_norm": 1.5770765835982208,
"learning_rate": 4.607594936708861e-05,
"loss": 0.4741,
"step": 91
},
{
"epoch": 0.29194763982546607,
"grad_norm": 1.2163067580443867,
"learning_rate": 4.658227848101266e-05,
"loss": 0.466,
"step": 92
},
{
"epoch": 0.29512098373661244,
"grad_norm": 1.2720825044452424,
"learning_rate": 4.708860759493671e-05,
"loss": 0.4641,
"step": 93
},
{
"epoch": 0.2982943276477588,
"grad_norm": 1.1086102734888972,
"learning_rate": 4.759493670886076e-05,
"loss": 0.4689,
"step": 94
},
{
"epoch": 0.3014676715589052,
"grad_norm": 1.1621389294628086,
"learning_rate": 4.810126582278481e-05,
"loss": 0.46,
"step": 95
},
{
"epoch": 0.30464101547005157,
"grad_norm": 1.023227048678522,
"learning_rate": 4.860759493670886e-05,
"loss": 0.4664,
"step": 96
},
{
"epoch": 0.30781435938119794,
"grad_norm": 0.6681644577920394,
"learning_rate": 4.911392405063292e-05,
"loss": 0.46,
"step": 97
},
{
"epoch": 0.3109877032923443,
"grad_norm": 0.8942148240973403,
"learning_rate": 4.962025316455696e-05,
"loss": 0.4578,
"step": 98
},
{
"epoch": 0.3141610472034907,
"grad_norm": 0.8129981254381207,
"learning_rate": 5.012658227848102e-05,
"loss": 0.4588,
"step": 99
},
{
"epoch": 0.31733439111463707,
"grad_norm": 0.7399247303363102,
"learning_rate": 5.063291139240507e-05,
"loss": 0.4684,
"step": 100
},
{
"epoch": 0.32050773502578345,
"grad_norm": 0.9208344037352115,
"learning_rate": 5.113924050632911e-05,
"loss": 0.4537,
"step": 101
},
{
"epoch": 0.32368107893692977,
"grad_norm": 0.7718334071956167,
"learning_rate": 5.164556962025317e-05,
"loss": 0.4541,
"step": 102
},
{
"epoch": 0.32685442284807614,
"grad_norm": 0.9193553200855868,
"learning_rate": 5.2151898734177224e-05,
"loss": 0.4639,
"step": 103
},
{
"epoch": 0.3300277667592225,
"grad_norm": 1.0893805228839906,
"learning_rate": 5.265822784810127e-05,
"loss": 0.4591,
"step": 104
},
{
"epoch": 0.3332011106703689,
"grad_norm": 1.3296951795184973,
"learning_rate": 5.316455696202532e-05,
"loss": 0.4609,
"step": 105
},
{
"epoch": 0.33637445458151527,
"grad_norm": 0.749655425862983,
"learning_rate": 5.3670886075949375e-05,
"loss": 0.4552,
"step": 106
},
{
"epoch": 0.33954779849266165,
"grad_norm": 1.0304107904402815,
"learning_rate": 5.417721518987342e-05,
"loss": 0.455,
"step": 107
},
{
"epoch": 0.342721142403808,
"grad_norm": 1.259806934599001,
"learning_rate": 5.468354430379747e-05,
"loss": 0.456,
"step": 108
},
{
"epoch": 0.3458944863149544,
"grad_norm": 0.7667285389018935,
"learning_rate": 5.5189873417721526e-05,
"loss": 0.446,
"step": 109
},
{
"epoch": 0.3490678302261008,
"grad_norm": 0.8396599205938527,
"learning_rate": 5.569620253164558e-05,
"loss": 0.444,
"step": 110
},
{
"epoch": 0.35224117413724715,
"grad_norm": 1.0519977391007944,
"learning_rate": 5.6202531645569624e-05,
"loss": 0.4509,
"step": 111
},
{
"epoch": 0.35541451804839347,
"grad_norm": 1.0013235222472492,
"learning_rate": 5.670886075949368e-05,
"loss": 0.4486,
"step": 112
},
{
"epoch": 0.35858786195953984,
"grad_norm": 1.0607291724487944,
"learning_rate": 5.721518987341773e-05,
"loss": 0.4469,
"step": 113
},
{
"epoch": 0.3617612058706862,
"grad_norm": 0.9739072199836216,
"learning_rate": 5.7721518987341775e-05,
"loss": 0.453,
"step": 114
},
{
"epoch": 0.3649345497818326,
"grad_norm": 0.9878026910095292,
"learning_rate": 5.822784810126583e-05,
"loss": 0.4504,
"step": 115
},
{
"epoch": 0.36810789369297897,
"grad_norm": 1.0083153796649502,
"learning_rate": 5.873417721518988e-05,
"loss": 0.4514,
"step": 116
},
{
"epoch": 0.37128123760412535,
"grad_norm": 1.230069181330947,
"learning_rate": 5.9240506329113926e-05,
"loss": 0.4547,
"step": 117
},
{
"epoch": 0.3744545815152717,
"grad_norm": 0.8910651737656846,
"learning_rate": 5.974683544303798e-05,
"loss": 0.447,
"step": 118
},
{
"epoch": 0.3776279254264181,
"grad_norm": 1.0595908240339857,
"learning_rate": 6.025316455696203e-05,
"loss": 0.4466,
"step": 119
},
{
"epoch": 0.3808012693375645,
"grad_norm": 0.8363383114472455,
"learning_rate": 6.0759493670886084e-05,
"loss": 0.4463,
"step": 120
},
{
"epoch": 0.38397461324871085,
"grad_norm": 0.907469468063755,
"learning_rate": 6.126582278481012e-05,
"loss": 0.4492,
"step": 121
},
{
"epoch": 0.3871479571598572,
"grad_norm": 1.3417514038396015,
"learning_rate": 6.177215189873418e-05,
"loss": 0.4572,
"step": 122
},
{
"epoch": 0.39032130107100355,
"grad_norm": 0.6834706943125461,
"learning_rate": 6.227848101265824e-05,
"loss": 0.4452,
"step": 123
},
{
"epoch": 0.3934946449821499,
"grad_norm": 1.1807244230068368,
"learning_rate": 6.278481012658229e-05,
"loss": 0.4486,
"step": 124
},
{
"epoch": 0.3966679888932963,
"grad_norm": 0.9150568429572349,
"learning_rate": 6.329113924050633e-05,
"loss": 0.4475,
"step": 125
},
{
"epoch": 0.3998413328044427,
"grad_norm": 0.9711686157986756,
"learning_rate": 6.379746835443039e-05,
"loss": 0.4548,
"step": 126
},
{
"epoch": 0.40301467671558905,
"grad_norm": 0.8470526923713568,
"learning_rate": 6.430379746835444e-05,
"loss": 0.4573,
"step": 127
},
{
"epoch": 0.4061880206267354,
"grad_norm": 1.1867353576740691,
"learning_rate": 6.481012658227848e-05,
"loss": 0.4524,
"step": 128
},
{
"epoch": 0.4093613645378818,
"grad_norm": 0.9924058775384482,
"learning_rate": 6.531645569620254e-05,
"loss": 0.4445,
"step": 129
},
{
"epoch": 0.4125347084490282,
"grad_norm": 1.0279428635787766,
"learning_rate": 6.582278481012659e-05,
"loss": 0.4536,
"step": 130
},
{
"epoch": 0.41570805236017455,
"grad_norm": 1.327497625966449,
"learning_rate": 6.632911392405063e-05,
"loss": 0.4517,
"step": 131
},
{
"epoch": 0.4188813962713209,
"grad_norm": 0.8047709339837646,
"learning_rate": 6.68354430379747e-05,
"loss": 0.4507,
"step": 132
},
{
"epoch": 0.42205474018246725,
"grad_norm": 0.6624995618693278,
"learning_rate": 6.734177215189874e-05,
"loss": 0.4416,
"step": 133
},
{
"epoch": 0.4252280840936136,
"grad_norm": 0.8937479460795225,
"learning_rate": 6.784810126582279e-05,
"loss": 0.4413,
"step": 134
},
{
"epoch": 0.42840142800476,
"grad_norm": 1.2288491348657344,
"learning_rate": 6.835443037974685e-05,
"loss": 0.4481,
"step": 135
},
{
"epoch": 0.4315747719159064,
"grad_norm": 0.7888292495946069,
"learning_rate": 6.886075949367089e-05,
"loss": 0.4434,
"step": 136
},
{
"epoch": 0.43474811582705275,
"grad_norm": 0.8552313093639724,
"learning_rate": 6.936708860759494e-05,
"loss": 0.4415,
"step": 137
},
{
"epoch": 0.4379214597381991,
"grad_norm": 1.1555499174697232,
"learning_rate": 6.9873417721519e-05,
"loss": 0.4469,
"step": 138
},
{
"epoch": 0.4410948036493455,
"grad_norm": 0.8107672566991376,
"learning_rate": 7.037974683544304e-05,
"loss": 0.4391,
"step": 139
},
{
"epoch": 0.4442681475604919,
"grad_norm": 0.7123665736327819,
"learning_rate": 7.088607594936709e-05,
"loss": 0.44,
"step": 140
},
{
"epoch": 0.44744149147163825,
"grad_norm": 0.7484877480349433,
"learning_rate": 7.139240506329115e-05,
"loss": 0.4444,
"step": 141
},
{
"epoch": 0.45061483538278463,
"grad_norm": 0.8956448687190801,
"learning_rate": 7.189873417721519e-05,
"loss": 0.4472,
"step": 142
},
{
"epoch": 0.453788179293931,
"grad_norm": 1.1407806224479462,
"learning_rate": 7.240506329113925e-05,
"loss": 0.4549,
"step": 143
},
{
"epoch": 0.4569615232050773,
"grad_norm": 0.6987492103428099,
"learning_rate": 7.29113924050633e-05,
"loss": 0.4376,
"step": 144
},
{
"epoch": 0.4601348671162237,
"grad_norm": 1.0417474479567967,
"learning_rate": 7.341772151898734e-05,
"loss": 0.4496,
"step": 145
},
{
"epoch": 0.4633082110273701,
"grad_norm": 0.9448597075694767,
"learning_rate": 7.39240506329114e-05,
"loss": 0.4446,
"step": 146
},
{
"epoch": 0.46648155493851645,
"grad_norm": 0.891015819461564,
"learning_rate": 7.443037974683545e-05,
"loss": 0.445,
"step": 147
},
{
"epoch": 0.4696548988496628,
"grad_norm": 0.6640649337472588,
"learning_rate": 7.49367088607595e-05,
"loss": 0.4415,
"step": 148
},
{
"epoch": 0.4728282427608092,
"grad_norm": 0.6255801769841464,
"learning_rate": 7.544303797468355e-05,
"loss": 0.4401,
"step": 149
},
{
"epoch": 0.4760015866719556,
"grad_norm": 0.6462120499489888,
"learning_rate": 7.59493670886076e-05,
"loss": 0.4443,
"step": 150
},
{
"epoch": 0.47917493058310195,
"grad_norm": 0.6031979187078812,
"learning_rate": 7.645569620253165e-05,
"loss": 0.4315,
"step": 151
},
{
"epoch": 0.48234827449424833,
"grad_norm": 0.7422409369854028,
"learning_rate": 7.69620253164557e-05,
"loss": 0.4411,
"step": 152
},
{
"epoch": 0.4855216184053947,
"grad_norm": 0.6026165860234792,
"learning_rate": 7.746835443037976e-05,
"loss": 0.4372,
"step": 153
},
{
"epoch": 0.4886949623165411,
"grad_norm": 0.4790898172907393,
"learning_rate": 7.79746835443038e-05,
"loss": 0.4397,
"step": 154
},
{
"epoch": 0.4918683062276874,
"grad_norm": 0.5637573427525947,
"learning_rate": 7.848101265822786e-05,
"loss": 0.4364,
"step": 155
},
{
"epoch": 0.4950416501388338,
"grad_norm": 0.7151994774739086,
"learning_rate": 7.898734177215191e-05,
"loss": 0.4418,
"step": 156
},
{
"epoch": 0.49821499404998015,
"grad_norm": 0.8523966891138255,
"learning_rate": 7.949367088607595e-05,
"loss": 0.4426,
"step": 157
},
{
"epoch": 0.5013883379611266,
"grad_norm": 1.0199664244681628,
"learning_rate": 8e-05,
"loss": 0.4396,
"step": 158
},
{
"epoch": 0.5045616818722729,
"grad_norm": 0.9730498513447443,
"learning_rate": 7.999990169177323e-05,
"loss": 0.4524,
"step": 159
},
{
"epoch": 0.5077350257834192,
"grad_norm": 0.8763687649740503,
"learning_rate": 7.99996067675761e-05,
"loss": 0.4385,
"step": 160
},
{
"epoch": 0.5109083696945657,
"grad_norm": 0.7439185882238727,
"learning_rate": 7.99991152288583e-05,
"loss": 0.4426,
"step": 161
},
{
"epoch": 0.514081713605712,
"grad_norm": 0.8277849239630293,
"learning_rate": 7.999842707803597e-05,
"loss": 0.4336,
"step": 162
},
{
"epoch": 0.5172550575168584,
"grad_norm": 0.6740425300459671,
"learning_rate": 7.999754231849163e-05,
"loss": 0.4371,
"step": 163
},
{
"epoch": 0.5204284014280047,
"grad_norm": 0.6114082854253373,
"learning_rate": 7.999646095457422e-05,
"loss": 0.4421,
"step": 164
},
{
"epoch": 0.5236017453391512,
"grad_norm": 0.8489305672656007,
"learning_rate": 7.999518299159912e-05,
"loss": 0.4366,
"step": 165
},
{
"epoch": 0.5267750892502975,
"grad_norm": 0.7585314746590911,
"learning_rate": 7.999370843584805e-05,
"loss": 0.432,
"step": 166
},
{
"epoch": 0.5299484331614439,
"grad_norm": 0.9398955324495176,
"learning_rate": 7.999203729456902e-05,
"loss": 0.4361,
"step": 167
},
{
"epoch": 0.5331217770725902,
"grad_norm": 1.7667814883362698,
"learning_rate": 7.99901695759764e-05,
"loss": 0.4484,
"step": 168
},
{
"epoch": 0.5362951209837367,
"grad_norm": 0.5640015415710142,
"learning_rate": 7.99881052892508e-05,
"loss": 0.4347,
"step": 169
},
{
"epoch": 0.539468464894883,
"grad_norm": 1.512073308742083,
"learning_rate": 7.998584444453901e-05,
"loss": 0.4454,
"step": 170
},
{
"epoch": 0.5426418088060293,
"grad_norm": 0.9689824722762622,
"learning_rate": 7.998338705295406e-05,
"loss": 0.4386,
"step": 171
},
{
"epoch": 0.5458151527171757,
"grad_norm": 0.7646001072380342,
"learning_rate": 7.9980733126575e-05,
"loss": 0.4358,
"step": 172
},
{
"epoch": 0.548988496628322,
"grad_norm": 0.9621346516255858,
"learning_rate": 7.997788267844699e-05,
"loss": 0.4438,
"step": 173
},
{
"epoch": 0.5521618405394685,
"grad_norm": 0.8313101918847394,
"learning_rate": 7.997483572258112e-05,
"loss": 0.4402,
"step": 174
},
{
"epoch": 0.5553351844506148,
"grad_norm": 0.6402057742934584,
"learning_rate": 7.997159227395449e-05,
"loss": 0.4289,
"step": 175
},
{
"epoch": 0.5585085283617612,
"grad_norm": 0.5861524014663406,
"learning_rate": 7.996815234850994e-05,
"loss": 0.4258,
"step": 176
},
{
"epoch": 0.5616818722729076,
"grad_norm": 0.49147871141369276,
"learning_rate": 7.996451596315613e-05,
"loss": 0.4284,
"step": 177
},
{
"epoch": 0.564855216184054,
"grad_norm": 0.5385589588185274,
"learning_rate": 7.99606831357674e-05,
"loss": 0.4326,
"step": 178
},
{
"epoch": 0.5680285600952003,
"grad_norm": 0.5020636674847232,
"learning_rate": 7.995665388518366e-05,
"loss": 0.4275,
"step": 179
},
{
"epoch": 0.5712019040063467,
"grad_norm": 0.34677645292080367,
"learning_rate": 7.995242823121035e-05,
"loss": 0.4313,
"step": 180
},
{
"epoch": 0.5743752479174931,
"grad_norm": 0.41481246440728614,
"learning_rate": 7.994800619461826e-05,
"loss": 0.428,
"step": 181
},
{
"epoch": 0.5775485918286394,
"grad_norm": 0.4728147459852834,
"learning_rate": 7.994338779714356e-05,
"loss": 0.429,
"step": 182
},
{
"epoch": 0.5807219357397858,
"grad_norm": 0.34714272709199906,
"learning_rate": 7.993857306148757e-05,
"loss": 0.42,
"step": 183
},
{
"epoch": 0.5838952796509321,
"grad_norm": 0.33307391540828185,
"learning_rate": 7.993356201131667e-05,
"loss": 0.4306,
"step": 184
},
{
"epoch": 0.5870686235620786,
"grad_norm": 0.4557580388879463,
"learning_rate": 7.992835467126226e-05,
"loss": 0.4336,
"step": 185
},
{
"epoch": 0.5902419674732249,
"grad_norm": 0.3413621320410112,
"learning_rate": 7.992295106692053e-05,
"loss": 0.4244,
"step": 186
},
{
"epoch": 0.5934153113843713,
"grad_norm": 0.30301313621067855,
"learning_rate": 7.991735122485244e-05,
"loss": 0.4264,
"step": 187
},
{
"epoch": 0.5965886552955176,
"grad_norm": 0.27873012092060434,
"learning_rate": 7.991155517258351e-05,
"loss": 0.4254,
"step": 188
},
{
"epoch": 0.5997619992066641,
"grad_norm": 0.3234296671716833,
"learning_rate": 7.990556293860373e-05,
"loss": 0.4226,
"step": 189
},
{
"epoch": 0.6029353431178104,
"grad_norm": 0.26061056621175,
"learning_rate": 7.989937455236738e-05,
"loss": 0.4212,
"step": 190
},
{
"epoch": 0.6061086870289568,
"grad_norm": 0.32864221465315796,
"learning_rate": 7.989299004429294e-05,
"loss": 0.4226,
"step": 191
},
{
"epoch": 0.6092820309401031,
"grad_norm": 0.4444198405279817,
"learning_rate": 7.988640944576287e-05,
"loss": 0.4266,
"step": 192
},
{
"epoch": 0.6124553748512495,
"grad_norm": 0.5842630238421751,
"learning_rate": 7.987963278912353e-05,
"loss": 0.4239,
"step": 193
},
{
"epoch": 0.6156287187623959,
"grad_norm": 0.8349781810783966,
"learning_rate": 7.9872660107685e-05,
"loss": 0.4249,
"step": 194
},
{
"epoch": 0.6188020626735422,
"grad_norm": 1.167134998920243,
"learning_rate": 7.986549143572085e-05,
"loss": 0.4274,
"step": 195
},
{
"epoch": 0.6219754065846886,
"grad_norm": 0.9339211226210348,
"learning_rate": 7.985812680846804e-05,
"loss": 0.4259,
"step": 196
},
{
"epoch": 0.625148750495835,
"grad_norm": 0.7637681964537499,
"learning_rate": 7.985056626212678e-05,
"loss": 0.4293,
"step": 197
},
{
"epoch": 0.6283220944069814,
"grad_norm": 0.758578666942809,
"learning_rate": 7.984280983386022e-05,
"loss": 0.4268,
"step": 198
},
{
"epoch": 0.6314954383181277,
"grad_norm": 0.7219115681675803,
"learning_rate": 7.983485756179443e-05,
"loss": 0.4284,
"step": 199
},
{
"epoch": 0.6346687822292741,
"grad_norm": 0.7774185389424639,
"learning_rate": 7.98267094850181e-05,
"loss": 0.4357,
"step": 200
},
{
"epoch": 0.6378421261404205,
"grad_norm": 0.8684313423895228,
"learning_rate": 7.981836564358235e-05,
"loss": 0.4364,
"step": 201
},
{
"epoch": 0.6410154700515669,
"grad_norm": 0.9490290231535304,
"learning_rate": 7.980982607850062e-05,
"loss": 0.4316,
"step": 202
},
{
"epoch": 0.6441888139627132,
"grad_norm": 1.1158847171655581,
"learning_rate": 7.980109083174838e-05,
"loss": 0.4326,
"step": 203
},
{
"epoch": 0.6473621578738595,
"grad_norm": 0.5821381704555008,
"learning_rate": 7.979215994626295e-05,
"loss": 0.4248,
"step": 204
},
{
"epoch": 0.650535501785006,
"grad_norm": 0.6159942615763982,
"learning_rate": 7.97830334659433e-05,
"loss": 0.4228,
"step": 205
},
{
"epoch": 0.6537088456961523,
"grad_norm": 0.9854660383074767,
"learning_rate": 7.977371143564986e-05,
"loss": 0.4319,
"step": 206
},
{
"epoch": 0.6568821896072987,
"grad_norm": 0.6753873782895369,
"learning_rate": 7.976419390120422e-05,
"loss": 0.4257,
"step": 207
},
{
"epoch": 0.660055533518445,
"grad_norm": 0.4578459840507763,
"learning_rate": 7.9754480909389e-05,
"loss": 0.4305,
"step": 208
},
{
"epoch": 0.6632288774295915,
"grad_norm": 0.7788354439242892,
"learning_rate": 7.974457250794752e-05,
"loss": 0.4294,
"step": 209
},
{
"epoch": 0.6664022213407378,
"grad_norm": 0.5659494748709242,
"learning_rate": 7.973446874558367e-05,
"loss": 0.4244,
"step": 210
},
{
"epoch": 0.6695755652518842,
"grad_norm": 0.37038793237805434,
"learning_rate": 7.97241696719616e-05,
"loss": 0.4259,
"step": 211
},
{
"epoch": 0.6727489091630305,
"grad_norm": 0.5598930828252042,
"learning_rate": 7.971367533770548e-05,
"loss": 0.424,
"step": 212
},
{
"epoch": 0.6759222530741769,
"grad_norm": 0.38017398481352066,
"learning_rate": 7.97029857943993e-05,
"loss": 0.4259,
"step": 213
},
{
"epoch": 0.6790955969853233,
"grad_norm": 0.34969847402690163,
"learning_rate": 7.969210109458653e-05,
"loss": 0.4224,
"step": 214
},
{
"epoch": 0.6822689408964696,
"grad_norm": 0.4326242238353279,
"learning_rate": 7.968102129176998e-05,
"loss": 0.4217,
"step": 215
},
{
"epoch": 0.685442284807616,
"grad_norm": 0.3678356901641005,
"learning_rate": 7.966974644041142e-05,
"loss": 0.4268,
"step": 216
},
{
"epoch": 0.6886156287187624,
"grad_norm": 0.3291254839559168,
"learning_rate": 7.965827659593138e-05,
"loss": 0.4206,
"step": 217
},
{
"epoch": 0.6917889726299088,
"grad_norm": 0.3567104966136532,
"learning_rate": 7.964661181470887e-05,
"loss": 0.4191,
"step": 218
},
{
"epoch": 0.6949623165410551,
"grad_norm": 0.3146162751453091,
"learning_rate": 7.96347521540811e-05,
"loss": 0.4133,
"step": 219
},
{
"epoch": 0.6981356604522015,
"grad_norm": 0.25903345117835813,
"learning_rate": 7.962269767234315e-05,
"loss": 0.4241,
"step": 220
},
{
"epoch": 0.7013090043633479,
"grad_norm": 0.3697676932101927,
"learning_rate": 7.96104484287478e-05,
"loss": 0.4248,
"step": 221
},
{
"epoch": 0.7044823482744943,
"grad_norm": 0.4012189220411039,
"learning_rate": 7.959800448350507e-05,
"loss": 0.4188,
"step": 222
},
{
"epoch": 0.7076556921856406,
"grad_norm": 0.43083222853217606,
"learning_rate": 7.95853658977821e-05,
"loss": 0.4184,
"step": 223
},
{
"epoch": 0.7108290360967869,
"grad_norm": 0.41949845884539716,
"learning_rate": 7.957253273370275e-05,
"loss": 0.4168,
"step": 224
},
{
"epoch": 0.7140023800079334,
"grad_norm": 0.460716406317274,
"learning_rate": 7.955950505434725e-05,
"loss": 0.4193,
"step": 225
},
{
"epoch": 0.7171757239190797,
"grad_norm": 0.6387648521603024,
"learning_rate": 7.954628292375207e-05,
"loss": 0.428,
"step": 226
},
{
"epoch": 0.7203490678302261,
"grad_norm": 0.7552539307927338,
"learning_rate": 7.953286640690936e-05,
"loss": 0.4243,
"step": 227
},
{
"epoch": 0.7235224117413724,
"grad_norm": 0.6681939851633835,
"learning_rate": 7.951925556976686e-05,
"loss": 0.4256,
"step": 228
},
{
"epoch": 0.7266957556525189,
"grad_norm": 0.6068582323945105,
"learning_rate": 7.950545047922741e-05,
"loss": 0.4231,
"step": 229
},
{
"epoch": 0.7298690995636652,
"grad_norm": 0.5668354270139984,
"learning_rate": 7.949145120314871e-05,
"loss": 0.4216,
"step": 230
},
{
"epoch": 0.7330424434748116,
"grad_norm": 0.5694091160000379,
"learning_rate": 7.947725781034299e-05,
"loss": 0.4173,
"step": 231
},
{
"epoch": 0.7362157873859579,
"grad_norm": 0.546717014419698,
"learning_rate": 7.946287037057657e-05,
"loss": 0.4181,
"step": 232
},
{
"epoch": 0.7393891312971044,
"grad_norm": 0.3630503498245246,
"learning_rate": 7.944828895456968e-05,
"loss": 0.4137,
"step": 233
},
{
"epoch": 0.7425624752082507,
"grad_norm": 0.43965037772138893,
"learning_rate": 7.943351363399593e-05,
"loss": 0.423,
"step": 234
},
{
"epoch": 0.745735819119397,
"grad_norm": 0.6391260482000344,
"learning_rate": 7.941854448148212e-05,
"loss": 0.4161,
"step": 235
},
{
"epoch": 0.7489091630305434,
"grad_norm": 0.5110774940868824,
"learning_rate": 7.94033815706078e-05,
"loss": 0.4128,
"step": 236
},
{
"epoch": 0.7520825069416898,
"grad_norm": 0.3565529564135717,
"learning_rate": 7.938802497590491e-05,
"loss": 0.4138,
"step": 237
},
{
"epoch": 0.7552558508528362,
"grad_norm": 0.4657810664776175,
"learning_rate": 7.937247477285743e-05,
"loss": 0.4161,
"step": 238
},
{
"epoch": 0.7584291947639825,
"grad_norm": 0.4101562014538087,
"learning_rate": 7.935673103790101e-05,
"loss": 0.4198,
"step": 239
},
{
"epoch": 0.761602538675129,
"grad_norm": 0.3474690454226505,
"learning_rate": 7.934079384842255e-05,
"loss": 0.4182,
"step": 240
},
{
"epoch": 0.7647758825862753,
"grad_norm": 0.501425607063933,
"learning_rate": 7.932466328275994e-05,
"loss": 0.4154,
"step": 241
},
{
"epoch": 0.7679492264974217,
"grad_norm": 0.5505691214538797,
"learning_rate": 7.93083394202015e-05,
"loss": 0.4192,
"step": 242
},
{
"epoch": 0.771122570408568,
"grad_norm": 0.48063376141835473,
"learning_rate": 7.929182234098576e-05,
"loss": 0.4142,
"step": 243
},
{
"epoch": 0.7742959143197145,
"grad_norm": 0.4777993396793008,
"learning_rate": 7.927511212630096e-05,
"loss": 0.4166,
"step": 244
},
{
"epoch": 0.7774692582308608,
"grad_norm": 0.7002665548552994,
"learning_rate": 7.925820885828468e-05,
"loss": 0.4222,
"step": 245
},
{
"epoch": 0.7806426021420071,
"grad_norm": 0.9739111639000991,
"learning_rate": 7.924111262002338e-05,
"loss": 0.4252,
"step": 246
},
{
"epoch": 0.7838159460531535,
"grad_norm": 1.0430449575283998,
"learning_rate": 7.922382349555218e-05,
"loss": 0.4252,
"step": 247
},
{
"epoch": 0.7869892899642998,
"grad_norm": 0.7337059616611151,
"learning_rate": 7.92063415698542e-05,
"loss": 0.4144,
"step": 248
},
{
"epoch": 0.7901626338754463,
"grad_norm": 0.5385536889326978,
"learning_rate": 7.918866692886031e-05,
"loss": 0.4226,
"step": 249
},
{
"epoch": 0.7933359777865926,
"grad_norm": 0.7647340729347486,
"learning_rate": 7.917079965944862e-05,
"loss": 0.4201,
"step": 250
},
{
"epoch": 0.796509321697739,
"grad_norm": 0.8450817800646696,
"learning_rate": 7.915273984944412e-05,
"loss": 0.4224,
"step": 251
},
{
"epoch": 0.7996826656088853,
"grad_norm": 0.4920042288499847,
"learning_rate": 7.913448758761821e-05,
"loss": 0.4146,
"step": 252
},
{
"epoch": 0.8028560095200318,
"grad_norm": 0.39445337326690816,
"learning_rate": 7.911604296368826e-05,
"loss": 0.4157,
"step": 253
},
{
"epoch": 0.8060293534311781,
"grad_norm": 0.5853009915458679,
"learning_rate": 7.909740606831719e-05,
"loss": 0.417,
"step": 254
},
{
"epoch": 0.8092026973423245,
"grad_norm": 0.4974087565357452,
"learning_rate": 7.907857699311299e-05,
"loss": 0.4158,
"step": 255
},
{
"epoch": 0.8123760412534708,
"grad_norm": 0.35737310825619095,
"learning_rate": 7.905955583062833e-05,
"loss": 0.4231,
"step": 256
},
{
"epoch": 0.8155493851646172,
"grad_norm": 0.3107479347233733,
"learning_rate": 7.904034267436004e-05,
"loss": 0.4091,
"step": 257
},
{
"epoch": 0.8187227290757636,
"grad_norm": 0.3938008913327802,
"learning_rate": 7.902093761874867e-05,
"loss": 0.4184,
"step": 258
},
{
"epoch": 0.8218960729869099,
"grad_norm": 0.36143088089529846,
"learning_rate": 7.900134075917807e-05,
"loss": 0.4188,
"step": 259
},
{
"epoch": 0.8250694168980564,
"grad_norm": 0.27530043552201605,
"learning_rate": 7.898155219197488e-05,
"loss": 0.409,
"step": 260
},
{
"epoch": 0.8282427608092027,
"grad_norm": 0.30129724789128026,
"learning_rate": 7.896157201440801e-05,
"loss": 0.4114,
"step": 261
},
{
"epoch": 0.8314161047203491,
"grad_norm": 0.3380484464752199,
"learning_rate": 7.894140032468828e-05,
"loss": 0.413,
"step": 262
},
{
"epoch": 0.8345894486314954,
"grad_norm": 0.3928098788344459,
"learning_rate": 7.892103722196782e-05,
"loss": 0.4138,
"step": 263
},
{
"epoch": 0.8377627925426419,
"grad_norm": 0.36724870276604,
"learning_rate": 7.890048280633967e-05,
"loss": 0.4084,
"step": 264
},
{
"epoch": 0.8409361364537882,
"grad_norm": 0.35512151600153574,
"learning_rate": 7.887973717883725e-05,
"loss": 0.4101,
"step": 265
},
{
"epoch": 0.8441094803649345,
"grad_norm": 0.32038340926763376,
"learning_rate": 7.885880044143382e-05,
"loss": 0.4149,
"step": 266
},
{
"epoch": 0.8472828242760809,
"grad_norm": 0.385102685025381,
"learning_rate": 7.883767269704209e-05,
"loss": 0.4141,
"step": 267
},
{
"epoch": 0.8504561681872272,
"grad_norm": 0.41759937563314564,
"learning_rate": 7.88163540495136e-05,
"loss": 0.4168,
"step": 268
},
{
"epoch": 0.8536295120983737,
"grad_norm": 0.47749975244001913,
"learning_rate": 7.879484460363825e-05,
"loss": 0.4164,
"step": 269
},
{
"epoch": 0.85680285600952,
"grad_norm": 0.49048299669029655,
"learning_rate": 7.877314446514385e-05,
"loss": 0.4102,
"step": 270
},
{
"epoch": 0.8599761999206664,
"grad_norm": 0.5033680425309713,
"learning_rate": 7.87512537406955e-05,
"loss": 0.4168,
"step": 271
},
{
"epoch": 0.8631495438318127,
"grad_norm": 0.632095279857067,
"learning_rate": 7.87291725378951e-05,
"loss": 0.4192,
"step": 272
},
{
"epoch": 0.8663228877429592,
"grad_norm": 0.7890821846662114,
"learning_rate": 7.870690096528084e-05,
"loss": 0.4104,
"step": 273
},
{
"epoch": 0.8694962316541055,
"grad_norm": 0.8461099097389182,
"learning_rate": 7.868443913232669e-05,
"loss": 0.4166,
"step": 274
},
{
"epoch": 0.8726695755652519,
"grad_norm": 0.7484663137119127,
"learning_rate": 7.866178714944178e-05,
"loss": 0.4141,
"step": 275
},
{
"epoch": 0.8758429194763983,
"grad_norm": 0.45570202002824084,
"learning_rate": 7.863894512796992e-05,
"loss": 0.4123,
"step": 276
},
{
"epoch": 0.8790162633875446,
"grad_norm": 0.4294812192002864,
"learning_rate": 7.861591318018904e-05,
"loss": 0.4144,
"step": 277
},
{
"epoch": 0.882189607298691,
"grad_norm": 0.58391480443298,
"learning_rate": 7.859269141931065e-05,
"loss": 0.4131,
"step": 278
},
{
"epoch": 0.8853629512098373,
"grad_norm": 0.47580822422237723,
"learning_rate": 7.856927995947925e-05,
"loss": 0.4086,
"step": 279
},
{
"epoch": 0.8885362951209838,
"grad_norm": 0.38129055085982533,
"learning_rate": 7.854567891577179e-05,
"loss": 0.4184,
"step": 280
},
{
"epoch": 0.8917096390321301,
"grad_norm": 0.43052433481163255,
"learning_rate": 7.852188840419711e-05,
"loss": 0.4096,
"step": 281
},
{
"epoch": 0.8948829829432765,
"grad_norm": 0.4433184262594291,
"learning_rate": 7.849790854169536e-05,
"loss": 0.4104,
"step": 282
},
{
"epoch": 0.8980563268544228,
"grad_norm": 0.4435382933392679,
"learning_rate": 7.847373944613745e-05,
"loss": 0.4059,
"step": 283
},
{
"epoch": 0.9012296707655693,
"grad_norm": 0.40447203187517,
"learning_rate": 7.844938123632439e-05,
"loss": 0.4094,
"step": 284
},
{
"epoch": 0.9044030146767156,
"grad_norm": 0.34449120043205606,
"learning_rate": 7.842483403198683e-05,
"loss": 0.408,
"step": 285
},
{
"epoch": 0.907576358587862,
"grad_norm": 0.37462807346909405,
"learning_rate": 7.840009795378436e-05,
"loss": 0.4146,
"step": 286
},
{
"epoch": 0.9107497024990083,
"grad_norm": 0.4704923604854831,
"learning_rate": 7.837517312330498e-05,
"loss": 0.4122,
"step": 287
},
{
"epoch": 0.9139230464101546,
"grad_norm": 0.5097972249194679,
"learning_rate": 7.83500596630645e-05,
"loss": 0.416,
"step": 288
},
{
"epoch": 0.9170963903213011,
"grad_norm": 0.5351289342983636,
"learning_rate": 7.832475769650588e-05,
"loss": 0.415,
"step": 289
},
{
"epoch": 0.9202697342324474,
"grad_norm": 0.4674732233010469,
"learning_rate": 7.829926734799872e-05,
"loss": 0.4107,
"step": 290
},
{
"epoch": 0.9234430781435938,
"grad_norm": 0.4263783054265977,
"learning_rate": 7.827358874283855e-05,
"loss": 0.4131,
"step": 291
},
{
"epoch": 0.9266164220547402,
"grad_norm": 0.5102719397399545,
"learning_rate": 7.824772200724629e-05,
"loss": 0.4096,
"step": 292
},
{
"epoch": 0.9297897659658866,
"grad_norm": 0.5129309020648893,
"learning_rate": 7.822166726836758e-05,
"loss": 0.4077,
"step": 293
},
{
"epoch": 0.9329631098770329,
"grad_norm": 0.45936698859229286,
"learning_rate": 7.819542465427217e-05,
"loss": 0.4072,
"step": 294
},
{
"epoch": 0.9361364537881793,
"grad_norm": 0.35187396508942126,
"learning_rate": 7.816899429395332e-05,
"loss": 0.41,
"step": 295
},
{
"epoch": 0.9393097976993257,
"grad_norm": 0.3773900389073242,
"learning_rate": 7.814237631732711e-05,
"loss": 0.4097,
"step": 296
},
{
"epoch": 0.9424831416104721,
"grad_norm": 0.48196894071104524,
"learning_rate": 7.811557085523187e-05,
"loss": 0.4125,
"step": 297
},
{
"epoch": 0.9456564855216184,
"grad_norm": 0.43803354037854536,
"learning_rate": 7.808857803942741e-05,
"loss": 0.4047,
"step": 298
},
{
"epoch": 0.9488298294327647,
"grad_norm": 0.36020876635054705,
"learning_rate": 7.80613980025946e-05,
"loss": 0.4005,
"step": 299
},
{
"epoch": 0.9520031733439112,
"grad_norm": 0.2741750720795513,
"learning_rate": 7.803403087833444e-05,
"loss": 0.4023,
"step": 300
},
{
"epoch": 0.9551765172550575,
"grad_norm": 0.25720288277565834,
"learning_rate": 7.800647680116764e-05,
"loss": 0.4041,
"step": 301
},
{
"epoch": 0.9583498611662039,
"grad_norm": 0.3359257955372094,
"learning_rate": 7.797873590653381e-05,
"loss": 0.3991,
"step": 302
},
{
"epoch": 0.9615232050773502,
"grad_norm": 0.37500047533306236,
"learning_rate": 7.795080833079084e-05,
"loss": 0.4075,
"step": 303
},
{
"epoch": 0.9646965489884967,
"grad_norm": 0.4231851966084708,
"learning_rate": 7.792269421121429e-05,
"loss": 0.408,
"step": 304
},
{
"epoch": 0.967869892899643,
"grad_norm": 0.46773700878625746,
"learning_rate": 7.78943936859966e-05,
"loss": 0.406,
"step": 305
},
{
"epoch": 0.9710432368107894,
"grad_norm": 0.48276037296021734,
"learning_rate": 7.78659068942465e-05,
"loss": 0.4113,
"step": 306
},
{
"epoch": 0.9742165807219357,
"grad_norm": 0.4520209571383191,
"learning_rate": 7.783723397598829e-05,
"loss": 0.418,
"step": 307
},
{
"epoch": 0.9773899246330822,
"grad_norm": 0.4761928527698079,
"learning_rate": 7.780837507216114e-05,
"loss": 0.406,
"step": 308
},
{
"epoch": 0.9805632685442285,
"grad_norm": 0.6222583620657315,
"learning_rate": 7.777933032461845e-05,
"loss": 0.4115,
"step": 309
},
{
"epoch": 0.9837366124553748,
"grad_norm": 0.6260831309463507,
"learning_rate": 7.775009987612711e-05,
"loss": 0.4079,
"step": 310
},
{
"epoch": 0.9869099563665212,
"grad_norm": 0.44371491044635886,
"learning_rate": 7.772068387036677e-05,
"loss": 0.4057,
"step": 311
},
{
"epoch": 0.9900833002776676,
"grad_norm": 0.3096918566187467,
"learning_rate": 7.769108245192922e-05,
"loss": 0.4134,
"step": 312
},
{
"epoch": 0.993256644188814,
"grad_norm": 0.497339716362162,
"learning_rate": 7.766129576631759e-05,
"loss": 0.41,
"step": 313
},
{
"epoch": 0.9964299880999603,
"grad_norm": 0.5309378071519795,
"learning_rate": 7.763132395994572e-05,
"loss": 0.4023,
"step": 314
},
{
"epoch": 0.9996033320111067,
"grad_norm": 0.44514647594528395,
"learning_rate": 7.760116718013735e-05,
"loss": 0.4079,
"step": 315
},
{
"epoch": 1.0027766759222532,
"grad_norm": 0.7972378708928999,
"learning_rate": 7.757082557512545e-05,
"loss": 0.7527,
"step": 316
},
{
"epoch": 1.0059500198333995,
"grad_norm": 1.9850051036948724,
"learning_rate": 7.75402992940515e-05,
"loss": 0.4284,
"step": 317
},
{
"epoch": 1.0091233637445458,
"grad_norm": 0.5262986098049574,
"learning_rate": 7.750958848696473e-05,
"loss": 0.3962,
"step": 318
},
{
"epoch": 1.0122967076556921,
"grad_norm": 1.5250119834033022,
"learning_rate": 7.747869330482137e-05,
"loss": 0.4258,
"step": 319
},
{
"epoch": 1.0154700515668384,
"grad_norm": 0.5638471189301302,
"learning_rate": 7.744761389948397e-05,
"loss": 0.4036,
"step": 320
},
{
"epoch": 1.018643395477985,
"grad_norm": 0.9646189002979927,
"learning_rate": 7.741635042372059e-05,
"loss": 0.418,
"step": 321
},
{
"epoch": 1.0218167393891313,
"grad_norm": 0.9961382446454835,
"learning_rate": 7.738490303120407e-05,
"loss": 0.4059,
"step": 322
},
{
"epoch": 1.0249900833002776,
"grad_norm": 0.7909414038458441,
"learning_rate": 7.735327187651127e-05,
"loss": 0.4079,
"step": 323
},
{
"epoch": 1.028163427211424,
"grad_norm": 0.5245790298223422,
"learning_rate": 7.732145711512234e-05,
"loss": 0.4063,
"step": 324
},
{
"epoch": 1.0313367711225705,
"grad_norm": 0.7541383994484598,
"learning_rate": 7.728945890341991e-05,
"loss": 0.4016,
"step": 325
},
{
"epoch": 1.0345101150337168,
"grad_norm": 0.484806781405723,
"learning_rate": 7.725727739868837e-05,
"loss": 0.3989,
"step": 326
},
{
"epoch": 1.0376834589448631,
"grad_norm": 0.5797811834279861,
"learning_rate": 7.722491275911302e-05,
"loss": 0.4013,
"step": 327
},
{
"epoch": 1.0408568028560095,
"grad_norm": 0.45973779856894975,
"learning_rate": 7.71923651437794e-05,
"loss": 0.4022,
"step": 328
},
{
"epoch": 1.044030146767156,
"grad_norm": 0.4205253473707581,
"learning_rate": 7.715963471267243e-05,
"loss": 0.394,
"step": 329
},
{
"epoch": 1.0472034906783023,
"grad_norm": 0.3671719170264264,
"learning_rate": 7.712672162667563e-05,
"loss": 0.3953,
"step": 330
},
{
"epoch": 1.0503768345894486,
"grad_norm": 0.3923903715008762,
"learning_rate": 7.709362604757037e-05,
"loss": 0.3941,
"step": 331
},
{
"epoch": 1.053550178500595,
"grad_norm": 0.38611037315155544,
"learning_rate": 7.706034813803501e-05,
"loss": 0.3967,
"step": 332
},
{
"epoch": 1.0567235224117413,
"grad_norm": 0.3205248790594937,
"learning_rate": 7.702688806164419e-05,
"loss": 0.3978,
"step": 333
},
{
"epoch": 1.0598968663228878,
"grad_norm": 0.3851041881561772,
"learning_rate": 7.699324598286794e-05,
"loss": 0.3969,
"step": 334
},
{
"epoch": 1.0630702102340341,
"grad_norm": 0.3048551045247859,
"learning_rate": 7.69594220670709e-05,
"loss": 0.3981,
"step": 335
},
{
"epoch": 1.0662435541451805,
"grad_norm": 0.30649990794634424,
"learning_rate": 7.692541648051156e-05,
"loss": 0.3883,
"step": 336
},
{
"epoch": 1.0694168980563268,
"grad_norm": 0.28629978941116185,
"learning_rate": 7.689122939034135e-05,
"loss": 0.391,
"step": 337
},
{
"epoch": 1.0725902419674733,
"grad_norm": 0.2815013785922316,
"learning_rate": 7.685686096460387e-05,
"loss": 0.3869,
"step": 338
},
{
"epoch": 1.0757635858786196,
"grad_norm": 0.2953463475399726,
"learning_rate": 7.682231137223409e-05,
"loss": 0.3913,
"step": 339
},
{
"epoch": 1.078936929789766,
"grad_norm": 0.2366855254680076,
"learning_rate": 7.678758078305745e-05,
"loss": 0.3883,
"step": 340
},
{
"epoch": 1.0821102737009123,
"grad_norm": 0.22104183899331858,
"learning_rate": 7.67526693677891e-05,
"loss": 0.3926,
"step": 341
},
{
"epoch": 1.0852836176120586,
"grad_norm": 0.22652465216086823,
"learning_rate": 7.671757729803299e-05,
"loss": 0.3831,
"step": 342
},
{
"epoch": 1.0884569615232051,
"grad_norm": 0.2160165389313366,
"learning_rate": 7.668230474628108e-05,
"loss": 0.3921,
"step": 343
},
{
"epoch": 1.0916303054343515,
"grad_norm": 0.2107397076821849,
"learning_rate": 7.664685188591246e-05,
"loss": 0.3855,
"step": 344
},
{
"epoch": 1.0948036493454978,
"grad_norm": 0.24162805284413108,
"learning_rate": 7.661121889119257e-05,
"loss": 0.3864,
"step": 345
},
{
"epoch": 1.097976993256644,
"grad_norm": 0.20058743270523813,
"learning_rate": 7.657540593727218e-05,
"loss": 0.3898,
"step": 346
},
{
"epoch": 1.1011503371677906,
"grad_norm": 0.19024320800558892,
"learning_rate": 7.653941320018672e-05,
"loss": 0.3881,
"step": 347
},
{
"epoch": 1.104323681078937,
"grad_norm": 0.19348467821715107,
"learning_rate": 7.650324085685528e-05,
"loss": 0.3861,
"step": 348
},
{
"epoch": 1.1074970249900833,
"grad_norm": 0.17855591849282623,
"learning_rate": 7.646688908507983e-05,
"loss": 0.3879,
"step": 349
},
{
"epoch": 1.1106703689012296,
"grad_norm": 0.17423718420205644,
"learning_rate": 7.643035806354427e-05,
"loss": 0.393,
"step": 350
},
{
"epoch": 1.113843712812376,
"grad_norm": 0.17435999115090034,
"learning_rate": 7.639364797181359e-05,
"loss": 0.3923,
"step": 351
},
{
"epoch": 1.1170170567235225,
"grad_norm": 0.1709673536337813,
"learning_rate": 7.6356758990333e-05,
"loss": 0.3906,
"step": 352
},
{
"epoch": 1.1201904006346688,
"grad_norm": 0.17317855376977867,
"learning_rate": 7.6319691300427e-05,
"loss": 0.3888,
"step": 353
},
{
"epoch": 1.1233637445458151,
"grad_norm": 0.18174880626145437,
"learning_rate": 7.628244508429856e-05,
"loss": 0.3865,
"step": 354
},
{
"epoch": 1.1265370884569614,
"grad_norm": 0.20331689410780052,
"learning_rate": 7.624502052502814e-05,
"loss": 0.3874,
"step": 355
},
{
"epoch": 1.129710432368108,
"grad_norm": 0.23136698366970124,
"learning_rate": 7.620741780657284e-05,
"loss": 0.3862,
"step": 356
},
{
"epoch": 1.1328837762792543,
"grad_norm": 0.2578608983871263,
"learning_rate": 7.61696371137655e-05,
"loss": 0.3885,
"step": 357
},
{
"epoch": 1.1360571201904006,
"grad_norm": 0.31713285899714155,
"learning_rate": 7.613167863231376e-05,
"loss": 0.3858,
"step": 358
},
{
"epoch": 1.139230464101547,
"grad_norm": 0.4030624344185451,
"learning_rate": 7.609354254879916e-05,
"loss": 0.3894,
"step": 359
},
{
"epoch": 1.1424038080126935,
"grad_norm": 0.5246514767856207,
"learning_rate": 7.605522905067626e-05,
"loss": 0.3852,
"step": 360
},
{
"epoch": 1.1455771519238398,
"grad_norm": 0.5257615152434831,
"learning_rate": 7.601673832627162e-05,
"loss": 0.3892,
"step": 361
},
{
"epoch": 1.1487504958349861,
"grad_norm": 0.47164366725819173,
"learning_rate": 7.597807056478304e-05,
"loss": 0.3941,
"step": 362
},
{
"epoch": 1.1519238397461324,
"grad_norm": 0.40276010096670556,
"learning_rate": 7.593922595627843e-05,
"loss": 0.3895,
"step": 363
},
{
"epoch": 1.1550971836572788,
"grad_norm": 0.3335379837317923,
"learning_rate": 7.590020469169505e-05,
"loss": 0.3868,
"step": 364
},
{
"epoch": 1.1582705275684253,
"grad_norm": 0.389480610213482,
"learning_rate": 7.586100696283845e-05,
"loss": 0.3888,
"step": 365
},
{
"epoch": 1.1614438714795716,
"grad_norm": 0.40641502830320464,
"learning_rate": 7.582163296238158e-05,
"loss": 0.3886,
"step": 366
},
{
"epoch": 1.164617215390718,
"grad_norm": 0.34031545798126167,
"learning_rate": 7.578208288386386e-05,
"loss": 0.3888,
"step": 367
},
{
"epoch": 1.1677905593018643,
"grad_norm": 0.3375442621679508,
"learning_rate": 7.574235692169021e-05,
"loss": 0.3901,
"step": 368
},
{
"epoch": 1.1709639032130108,
"grad_norm": 0.34505650189241893,
"learning_rate": 7.570245527113004e-05,
"loss": 0.3875,
"step": 369
},
{
"epoch": 1.1741372471241571,
"grad_norm": 0.32855600104681076,
"learning_rate": 7.566237812831641e-05,
"loss": 0.3808,
"step": 370
},
{
"epoch": 1.1773105910353034,
"grad_norm": 0.33479682880517575,
"learning_rate": 7.562212569024494e-05,
"loss": 0.3831,
"step": 371
},
{
"epoch": 1.1804839349464498,
"grad_norm": 0.3323687537285638,
"learning_rate": 7.558169815477293e-05,
"loss": 0.3763,
"step": 372
},
{
"epoch": 1.1836572788575963,
"grad_norm": 0.29597912982148444,
"learning_rate": 7.554109572061835e-05,
"loss": 0.384,
"step": 373
},
{
"epoch": 1.1868306227687426,
"grad_norm": 0.2608818292389468,
"learning_rate": 7.550031858735885e-05,
"loss": 0.3844,
"step": 374
},
{
"epoch": 1.190003966679889,
"grad_norm": 0.2362089436062283,
"learning_rate": 7.545936695543084e-05,
"loss": 0.3847,
"step": 375
},
{
"epoch": 1.1931773105910353,
"grad_norm": 0.27249864452918143,
"learning_rate": 7.541824102612839e-05,
"loss": 0.3843,
"step": 376
},
{
"epoch": 1.1963506545021816,
"grad_norm": 0.291889265790341,
"learning_rate": 7.537694100160242e-05,
"loss": 0.385,
"step": 377
},
{
"epoch": 1.1995239984133281,
"grad_norm": 0.26769841877142225,
"learning_rate": 7.533546708485949e-05,
"loss": 0.3857,
"step": 378
},
{
"epoch": 1.2026973423244744,
"grad_norm": 0.25310229652456356,
"learning_rate": 7.529381947976097e-05,
"loss": 0.3842,
"step": 379
},
{
"epoch": 1.2058706862356208,
"grad_norm": 0.3794658972915025,
"learning_rate": 7.525199839102198e-05,
"loss": 0.3853,
"step": 380
},
{
"epoch": 1.209044030146767,
"grad_norm": 0.4433997626787792,
"learning_rate": 7.521000402421039e-05,
"loss": 0.3871,
"step": 381
},
{
"epoch": 1.2122173740579134,
"grad_norm": 0.44670046763987153,
"learning_rate": 7.516783658574575e-05,
"loss": 0.38,
"step": 382
},
{
"epoch": 1.21539071796906,
"grad_norm": 0.4470855938681001,
"learning_rate": 7.51254962828984e-05,
"loss": 0.3883,
"step": 383
},
{
"epoch": 1.2185640618802063,
"grad_norm": 0.37472305499391584,
"learning_rate": 7.508298332378832e-05,
"loss": 0.3874,
"step": 384
},
{
"epoch": 1.2217374057913526,
"grad_norm": 0.2675107029023701,
"learning_rate": 7.504029791738419e-05,
"loss": 0.3797,
"step": 385
},
{
"epoch": 1.224910749702499,
"grad_norm": 0.2692926766700756,
"learning_rate": 7.499744027350236e-05,
"loss": 0.3877,
"step": 386
},
{
"epoch": 1.2280840936136455,
"grad_norm": 0.35704547610000964,
"learning_rate": 7.495441060280577e-05,
"loss": 0.3847,
"step": 387
},
{
"epoch": 1.2312574375247918,
"grad_norm": 0.393025585260643,
"learning_rate": 7.491120911680295e-05,
"loss": 0.3833,
"step": 388
},
{
"epoch": 1.234430781435938,
"grad_norm": 0.34992195520115954,
"learning_rate": 7.486783602784697e-05,
"loss": 0.3804,
"step": 389
},
{
"epoch": 1.2376041253470844,
"grad_norm": 0.26770410504282594,
"learning_rate": 7.48242915491344e-05,
"loss": 0.3792,
"step": 390
},
{
"epoch": 1.240777469258231,
"grad_norm": 0.21538249724047004,
"learning_rate": 7.478057589470429e-05,
"loss": 0.3823,
"step": 391
},
{
"epoch": 1.2439508131693773,
"grad_norm": 0.24012637317741398,
"learning_rate": 7.473668927943703e-05,
"loss": 0.3898,
"step": 392
},
{
"epoch": 1.2471241570805236,
"grad_norm": 0.21927612950882752,
"learning_rate": 7.469263191905342e-05,
"loss": 0.3832,
"step": 393
},
{
"epoch": 1.25029750099167,
"grad_norm": 0.22562831710853817,
"learning_rate": 7.464840403011348e-05,
"loss": 0.3781,
"step": 394
},
{
"epoch": 1.2534708449028162,
"grad_norm": 0.25428834981926673,
"learning_rate": 7.460400583001549e-05,
"loss": 0.3812,
"step": 395
},
{
"epoch": 1.2566441888139628,
"grad_norm": 0.2702984486835505,
"learning_rate": 7.455943753699485e-05,
"loss": 0.3866,
"step": 396
},
{
"epoch": 1.259817532725109,
"grad_norm": 0.24121323703671108,
"learning_rate": 7.451469937012308e-05,
"loss": 0.3873,
"step": 397
},
{
"epoch": 1.2629908766362554,
"grad_norm": 0.18653342749296287,
"learning_rate": 7.446979154930664e-05,
"loss": 0.3825,
"step": 398
},
{
"epoch": 1.2661642205474017,
"grad_norm": 0.16332575184703055,
"learning_rate": 7.4424714295286e-05,
"loss": 0.3815,
"step": 399
},
{
"epoch": 1.269337564458548,
"grad_norm": 0.25403145188276544,
"learning_rate": 7.437946782963434e-05,
"loss": 0.383,
"step": 400
},
{
"epoch": 1.2725109083696946,
"grad_norm": 0.3444161535033614,
"learning_rate": 7.433405237475668e-05,
"loss": 0.3813,
"step": 401
},
{
"epoch": 1.275684252280841,
"grad_norm": 0.3519022102988297,
"learning_rate": 7.428846815388867e-05,
"loss": 0.3855,
"step": 402
},
{
"epoch": 1.2788575961919872,
"grad_norm": 0.33077524829421917,
"learning_rate": 7.424271539109548e-05,
"loss": 0.3821,
"step": 403
},
{
"epoch": 1.2820309401031338,
"grad_norm": 0.3162039421386741,
"learning_rate": 7.419679431127078e-05,
"loss": 0.3851,
"step": 404
},
{
"epoch": 1.28520428401428,
"grad_norm": 0.32717321840623165,
"learning_rate": 7.415070514013554e-05,
"loss": 0.3835,
"step": 405
},
{
"epoch": 1.2883776279254264,
"grad_norm": 0.3260212879527716,
"learning_rate": 7.410444810423703e-05,
"loss": 0.3851,
"step": 406
},
{
"epoch": 1.2915509718365727,
"grad_norm": 0.28141296423668416,
"learning_rate": 7.405802343094761e-05,
"loss": 0.384,
"step": 407
},
{
"epoch": 1.294724315747719,
"grad_norm": 0.29242681028873885,
"learning_rate": 7.401143134846361e-05,
"loss": 0.3845,
"step": 408
},
{
"epoch": 1.2978976596588656,
"grad_norm": 0.3089571656384374,
"learning_rate": 7.396467208580431e-05,
"loss": 0.3876,
"step": 409
},
{
"epoch": 1.301071003570012,
"grad_norm": 0.2727831374787424,
"learning_rate": 7.39177458728107e-05,
"loss": 0.3853,
"step": 410
},
{
"epoch": 1.3042443474811583,
"grad_norm": 0.24218021592794953,
"learning_rate": 7.387065294014444e-05,
"loss": 0.3854,
"step": 411
},
{
"epoch": 1.3074176913923046,
"grad_norm": 0.26309492335546847,
"learning_rate": 7.382339351928664e-05,
"loss": 0.3831,
"step": 412
},
{
"epoch": 1.310591035303451,
"grad_norm": 0.2590023202596478,
"learning_rate": 7.377596784253682e-05,
"loss": 0.3874,
"step": 413
},
{
"epoch": 1.3137643792145974,
"grad_norm": 0.22355363656378588,
"learning_rate": 7.372837614301167e-05,
"loss": 0.3794,
"step": 414
},
{
"epoch": 1.3169377231257438,
"grad_norm": 0.23277360549451911,
"learning_rate": 7.368061865464398e-05,
"loss": 0.3823,
"step": 415
},
{
"epoch": 1.32011106703689,
"grad_norm": 0.29759664882744374,
"learning_rate": 7.363269561218144e-05,
"loss": 0.3854,
"step": 416
},
{
"epoch": 1.3232844109480366,
"grad_norm": 0.37555739230090074,
"learning_rate": 7.358460725118553e-05,
"loss": 0.3796,
"step": 417
},
{
"epoch": 1.326457754859183,
"grad_norm": 0.4302124224232923,
"learning_rate": 7.353635380803031e-05,
"loss": 0.3887,
"step": 418
},
{
"epoch": 1.3296310987703293,
"grad_norm": 0.4773944462131208,
"learning_rate": 7.348793551990132e-05,
"loss": 0.3814,
"step": 419
},
{
"epoch": 1.3328044426814756,
"grad_norm": 0.5403161096787458,
"learning_rate": 7.343935262479433e-05,
"loss": 0.3823,
"step": 420
},
{
"epoch": 1.335977786592622,
"grad_norm": 0.5311329635820989,
"learning_rate": 7.33906053615143e-05,
"loss": 0.3863,
"step": 421
},
{
"epoch": 1.3391511305037684,
"grad_norm": 0.44347216349851915,
"learning_rate": 7.334169396967403e-05,
"loss": 0.3811,
"step": 422
},
{
"epoch": 1.3423244744149148,
"grad_norm": 0.4038867029004121,
"learning_rate": 7.329261868969318e-05,
"loss": 0.3786,
"step": 423
},
{
"epoch": 1.345497818326061,
"grad_norm": 0.42166277469573826,
"learning_rate": 7.324337976279688e-05,
"loss": 0.386,
"step": 424
},
{
"epoch": 1.3486711622372074,
"grad_norm": 0.4142405224624998,
"learning_rate": 7.319397743101478e-05,
"loss": 0.3824,
"step": 425
},
{
"epoch": 1.3518445061483537,
"grad_norm": 0.36700141049097496,
"learning_rate": 7.31444119371796e-05,
"loss": 0.3829,
"step": 426
},
{
"epoch": 1.3550178500595003,
"grad_norm": 0.3135329163503587,
"learning_rate": 7.309468352492616e-05,
"loss": 0.3824,
"step": 427
},
{
"epoch": 1.3581911939706466,
"grad_norm": 0.2823327229949492,
"learning_rate": 7.304479243869007e-05,
"loss": 0.376,
"step": 428
},
{
"epoch": 1.361364537881793,
"grad_norm": 0.30150602495208256,
"learning_rate": 7.299473892370651e-05,
"loss": 0.3792,
"step": 429
},
{
"epoch": 1.3645378817929394,
"grad_norm": 0.2849241799804419,
"learning_rate": 7.294452322600912e-05,
"loss": 0.374,
"step": 430
},
{
"epoch": 1.3677112257040855,
"grad_norm": 0.23906164695179707,
"learning_rate": 7.289414559242871e-05,
"loss": 0.3841,
"step": 431
},
{
"epoch": 1.370884569615232,
"grad_norm": 0.27790217133033873,
"learning_rate": 7.284360627059205e-05,
"loss": 0.3894,
"step": 432
},
{
"epoch": 1.3740579135263784,
"grad_norm": 0.27965618860067953,
"learning_rate": 7.279290550892071e-05,
"loss": 0.3856,
"step": 433
},
{
"epoch": 1.3772312574375247,
"grad_norm": 0.2596553182069096,
"learning_rate": 7.274204355662981e-05,
"loss": 0.3835,
"step": 434
},
{
"epoch": 1.3804046013486713,
"grad_norm": 0.22914836462750776,
"learning_rate": 7.269102066372672e-05,
"loss": 0.3801,
"step": 435
},
{
"epoch": 1.3835779452598176,
"grad_norm": 0.24961150632312626,
"learning_rate": 7.263983708100998e-05,
"loss": 0.3792,
"step": 436
},
{
"epoch": 1.386751289170964,
"grad_norm": 0.2799354977533222,
"learning_rate": 7.258849306006796e-05,
"loss": 0.3785,
"step": 437
},
{
"epoch": 1.3899246330821102,
"grad_norm": 0.2609555149345694,
"learning_rate": 7.253698885327761e-05,
"loss": 0.3877,
"step": 438
},
{
"epoch": 1.3930979769932565,
"grad_norm": 0.25476756664542266,
"learning_rate": 7.24853247138033e-05,
"loss": 0.3865,
"step": 439
},
{
"epoch": 1.396271320904403,
"grad_norm": 0.2853597140584723,
"learning_rate": 7.243350089559555e-05,
"loss": 0.3835,
"step": 440
},
{
"epoch": 1.3994446648155494,
"grad_norm": 0.3037545404410833,
"learning_rate": 7.238151765338974e-05,
"loss": 0.3755,
"step": 441
},
{
"epoch": 1.4026180087266957,
"grad_norm": 0.32448084195253646,
"learning_rate": 7.232937524270486e-05,
"loss": 0.3824,
"step": 442
},
{
"epoch": 1.405791352637842,
"grad_norm": 0.3389903691526235,
"learning_rate": 7.227707391984233e-05,
"loss": 0.3851,
"step": 443
},
{
"epoch": 1.4089646965489884,
"grad_norm": 0.32339033942705137,
"learning_rate": 7.222461394188467e-05,
"loss": 0.3809,
"step": 444
},
{
"epoch": 1.412138040460135,
"grad_norm": 0.3004560485468966,
"learning_rate": 7.217199556669423e-05,
"loss": 0.3753,
"step": 445
},
{
"epoch": 1.4153113843712812,
"grad_norm": 0.2963352362165301,
"learning_rate": 7.211921905291198e-05,
"loss": 0.3845,
"step": 446
},
{
"epoch": 1.4184847282824276,
"grad_norm": 0.3394987023293836,
"learning_rate": 7.20662846599562e-05,
"loss": 0.3829,
"step": 447
},
{
"epoch": 1.421658072193574,
"grad_norm": 0.3691233795245846,
"learning_rate": 7.201319264802118e-05,
"loss": 0.3746,
"step": 448
},
{
"epoch": 1.4248314161047204,
"grad_norm": 0.419112740006911,
"learning_rate": 7.195994327807603e-05,
"loss": 0.3889,
"step": 449
},
{
"epoch": 1.4280047600158667,
"grad_norm": 0.4313621841160134,
"learning_rate": 7.19065368118633e-05,
"loss": 0.3821,
"step": 450
},
{
"epoch": 1.431178103927013,
"grad_norm": 0.3566308145795978,
"learning_rate": 7.185297351189771e-05,
"loss": 0.3844,
"step": 451
},
{
"epoch": 1.4343514478381594,
"grad_norm": 0.3637230498145468,
"learning_rate": 7.179925364146496e-05,
"loss": 0.3765,
"step": 452
},
{
"epoch": 1.437524791749306,
"grad_norm": 0.3468382349782321,
"learning_rate": 7.174537746462027e-05,
"loss": 0.3744,
"step": 453
},
{
"epoch": 1.4406981356604522,
"grad_norm": 0.3131084073191254,
"learning_rate": 7.169134524618723e-05,
"loss": 0.3814,
"step": 454
},
{
"epoch": 1.4438714795715986,
"grad_norm": 0.23863373901146623,
"learning_rate": 7.163715725175641e-05,
"loss": 0.3848,
"step": 455
},
{
"epoch": 1.4470448234827449,
"grad_norm": 0.25976033784025593,
"learning_rate": 7.15828137476841e-05,
"loss": 0.3836,
"step": 456
},
{
"epoch": 1.4502181673938912,
"grad_norm": 0.3648800662769983,
"learning_rate": 7.152831500109096e-05,
"loss": 0.3848,
"step": 457
},
{
"epoch": 1.4533915113050377,
"grad_norm": 0.3566172184643027,
"learning_rate": 7.14736612798608e-05,
"loss": 0.382,
"step": 458
},
{
"epoch": 1.456564855216184,
"grad_norm": 0.3135261324431755,
"learning_rate": 7.141885285263906e-05,
"loss": 0.3741,
"step": 459
},
{
"epoch": 1.4597381991273304,
"grad_norm": 0.3588285727185198,
"learning_rate": 7.136388998883176e-05,
"loss": 0.3836,
"step": 460
},
{
"epoch": 1.462911543038477,
"grad_norm": 0.4144697912185411,
"learning_rate": 7.130877295860396e-05,
"loss": 0.3814,
"step": 461
},
{
"epoch": 1.4660848869496232,
"grad_norm": 0.27886664253240845,
"learning_rate": 7.125350203287856e-05,
"loss": 0.3793,
"step": 462
},
{
"epoch": 1.4692582308607696,
"grad_norm": 0.3160540382900367,
"learning_rate": 7.119807748333488e-05,
"loss": 0.382,
"step": 463
},
{
"epoch": 1.4724315747719159,
"grad_norm": 0.43245832112598737,
"learning_rate": 7.114249958240736e-05,
"loss": 0.3792,
"step": 464
},
{
"epoch": 1.4756049186830622,
"grad_norm": 0.3929381559573092,
"learning_rate": 7.108676860328429e-05,
"loss": 0.3813,
"step": 465
},
{
"epoch": 1.4787782625942087,
"grad_norm": 0.36518334446855616,
"learning_rate": 7.103088481990631e-05,
"loss": 0.3794,
"step": 466
},
{
"epoch": 1.481951606505355,
"grad_norm": 0.4062717060922552,
"learning_rate": 7.097484850696523e-05,
"loss": 0.3788,
"step": 467
},
{
"epoch": 1.4851249504165014,
"grad_norm": 0.3985440491593577,
"learning_rate": 7.091865993990257e-05,
"loss": 0.3839,
"step": 468
},
{
"epoch": 1.4882982943276477,
"grad_norm": 0.2478605758310385,
"learning_rate": 7.086231939490825e-05,
"loss": 0.3822,
"step": 469
},
{
"epoch": 1.491471638238794,
"grad_norm": 0.2198881914089004,
"learning_rate": 7.080582714891922e-05,
"loss": 0.3844,
"step": 470
},
{
"epoch": 1.4946449821499406,
"grad_norm": 0.30913344433263185,
"learning_rate": 7.074918347961812e-05,
"loss": 0.383,
"step": 471
},
{
"epoch": 1.497818326061087,
"grad_norm": 0.3416286926665111,
"learning_rate": 7.069238866543186e-05,
"loss": 0.3836,
"step": 472
},
{
"epoch": 1.5009916699722332,
"grad_norm": 0.26203884165321145,
"learning_rate": 7.063544298553036e-05,
"loss": 0.3857,
"step": 473
},
{
"epoch": 1.5041650138833798,
"grad_norm": 0.2277269534373725,
"learning_rate": 7.0578346719825e-05,
"loss": 0.3768,
"step": 474
},
{
"epoch": 1.5073383577945259,
"grad_norm": 0.29105785846246257,
"learning_rate": 7.052110014896745e-05,
"loss": 0.383,
"step": 475
},
{
"epoch": 1.5105117017056724,
"grad_norm": 0.2943191343272462,
"learning_rate": 7.046370355434814e-05,
"loss": 0.3824,
"step": 476
},
{
"epoch": 1.5136850456168187,
"grad_norm": 0.25257147532228097,
"learning_rate": 7.040615721809495e-05,
"loss": 0.3832,
"step": 477
},
{
"epoch": 1.516858389527965,
"grad_norm": 0.227234673966253,
"learning_rate": 7.03484614230718e-05,
"loss": 0.3744,
"step": 478
},
{
"epoch": 1.5200317334391116,
"grad_norm": 0.24823996008997273,
"learning_rate": 7.029061645287724e-05,
"loss": 0.3796,
"step": 479
},
{
"epoch": 1.5232050773502577,
"grad_norm": 0.2871638979837815,
"learning_rate": 7.023262259184309e-05,
"loss": 0.381,
"step": 480
},
{
"epoch": 1.5263784212614042,
"grad_norm": 0.3204999675625282,
"learning_rate": 7.017448012503306e-05,
"loss": 0.3798,
"step": 481
},
{
"epoch": 1.5295517651725505,
"grad_norm": 0.3133074262497558,
"learning_rate": 7.011618933824124e-05,
"loss": 0.3811,
"step": 482
},
{
"epoch": 1.5327251090836969,
"grad_norm": 0.2699740914992895,
"learning_rate": 7.005775051799088e-05,
"loss": 0.3764,
"step": 483
},
{
"epoch": 1.5358984529948434,
"grad_norm": 0.26103326951221123,
"learning_rate": 6.999916395153279e-05,
"loss": 0.3839,
"step": 484
},
{
"epoch": 1.5390717969059897,
"grad_norm": 0.319889184006099,
"learning_rate": 6.994042992684406e-05,
"loss": 0.3807,
"step": 485
},
{
"epoch": 1.542245140817136,
"grad_norm": 0.3366203795427369,
"learning_rate": 6.988154873262655e-05,
"loss": 0.3859,
"step": 486
},
{
"epoch": 1.5454184847282826,
"grad_norm": 0.31334787713113593,
"learning_rate": 6.982252065830557e-05,
"loss": 0.3801,
"step": 487
},
{
"epoch": 1.5485918286394287,
"grad_norm": 0.2612084722641158,
"learning_rate": 6.976334599402838e-05,
"loss": 0.3818,
"step": 488
},
{
"epoch": 1.5517651725505752,
"grad_norm": 0.23800865263666302,
"learning_rate": 6.970402503066281e-05,
"loss": 0.3796,
"step": 489
},
{
"epoch": 1.5549385164617215,
"grad_norm": 0.20619756878025533,
"learning_rate": 6.96445580597958e-05,
"loss": 0.3784,
"step": 490
},
{
"epoch": 1.5581118603728679,
"grad_norm": 0.22823026276130054,
"learning_rate": 6.958494537373194e-05,
"loss": 0.3898,
"step": 491
},
{
"epoch": 1.5612852042840144,
"grad_norm": 0.2320286176349148,
"learning_rate": 6.952518726549212e-05,
"loss": 0.3776,
"step": 492
},
{
"epoch": 1.5644585481951605,
"grad_norm": 0.18333626484330504,
"learning_rate": 6.946528402881204e-05,
"loss": 0.3768,
"step": 493
},
{
"epoch": 1.567631892106307,
"grad_norm": 0.23594130345003322,
"learning_rate": 6.940523595814073e-05,
"loss": 0.3813,
"step": 494
},
{
"epoch": 1.5708052360174534,
"grad_norm": 0.28075187874529217,
"learning_rate": 6.934504334863915e-05,
"loss": 0.3831,
"step": 495
},
{
"epoch": 1.5739785799285997,
"grad_norm": 0.3164356465842641,
"learning_rate": 6.928470649617876e-05,
"loss": 0.3828,
"step": 496
},
{
"epoch": 1.5771519238397462,
"grad_norm": 0.29758964129105847,
"learning_rate": 6.922422569733998e-05,
"loss": 0.3784,
"step": 497
},
{
"epoch": 1.5803252677508925,
"grad_norm": 0.2444339210595453,
"learning_rate": 6.916360124941084e-05,
"loss": 0.3834,
"step": 498
},
{
"epoch": 1.5834986116620389,
"grad_norm": 0.20948868463222103,
"learning_rate": 6.910283345038542e-05,
"loss": 0.3717,
"step": 499
},
{
"epoch": 1.5866719555731852,
"grad_norm": 0.2172132590543899,
"learning_rate": 6.904192259896247e-05,
"loss": 0.3725,
"step": 500
},
{
"epoch": 1.5898452994843315,
"grad_norm": 0.21384474550035673,
"learning_rate": 6.898086899454387e-05,
"loss": 0.3813,
"step": 501
},
{
"epoch": 1.593018643395478,
"grad_norm": 0.1765294609272397,
"learning_rate": 6.891967293723318e-05,
"loss": 0.3754,
"step": 502
},
{
"epoch": 1.5961919873066244,
"grad_norm": 0.19836511713366814,
"learning_rate": 6.885833472783422e-05,
"loss": 0.3789,
"step": 503
},
{
"epoch": 1.5993653312177707,
"grad_norm": 0.24256580925484045,
"learning_rate": 6.879685466784951e-05,
"loss": 0.3786,
"step": 504
},
{
"epoch": 1.6025386751289172,
"grad_norm": 0.2332208270352636,
"learning_rate": 6.873523305947883e-05,
"loss": 0.378,
"step": 505
},
{
"epoch": 1.6057120190400633,
"grad_norm": 0.2380059856826742,
"learning_rate": 6.867347020561774e-05,
"loss": 0.382,
"step": 506
},
{
"epoch": 1.6088853629512099,
"grad_norm": 0.24371122406892495,
"learning_rate": 6.861156640985607e-05,
"loss": 0.3813,
"step": 507
},
{
"epoch": 1.6120587068623562,
"grad_norm": 0.22447844663148647,
"learning_rate": 6.854952197647643e-05,
"loss": 0.3762,
"step": 508
},
{
"epoch": 1.6152320507735025,
"grad_norm": 0.19727536404273158,
"learning_rate": 6.848733721045275e-05,
"loss": 0.3732,
"step": 509
},
{
"epoch": 1.618405394684649,
"grad_norm": 0.21660640742566128,
"learning_rate": 6.842501241744873e-05,
"loss": 0.3786,
"step": 510
},
{
"epoch": 1.6215787385957952,
"grad_norm": 0.2567200817107154,
"learning_rate": 6.836254790381635e-05,
"loss": 0.3763,
"step": 511
},
{
"epoch": 1.6247520825069417,
"grad_norm": 0.2867034859585703,
"learning_rate": 6.829994397659439e-05,
"loss": 0.3747,
"step": 512
},
{
"epoch": 1.627925426418088,
"grad_norm": 0.3529920349888837,
"learning_rate": 6.823720094350691e-05,
"loss": 0.3801,
"step": 513
},
{
"epoch": 1.6310987703292343,
"grad_norm": 0.393616670612644,
"learning_rate": 6.817431911296174e-05,
"loss": 0.3763,
"step": 514
},
{
"epoch": 1.6342721142403809,
"grad_norm": 0.37774302228207385,
"learning_rate": 6.811129879404892e-05,
"loss": 0.374,
"step": 515
},
{
"epoch": 1.6374454581515272,
"grad_norm": 0.3864964477468986,
"learning_rate": 6.804814029653926e-05,
"loss": 0.3791,
"step": 516
},
{
"epoch": 1.6406188020626735,
"grad_norm": 0.39159495583279336,
"learning_rate": 6.798484393088273e-05,
"loss": 0.3829,
"step": 517
},
{
"epoch": 1.64379214597382,
"grad_norm": 0.39302151980304456,
"learning_rate": 6.792141000820703e-05,
"loss": 0.3824,
"step": 518
},
{
"epoch": 1.6469654898849662,
"grad_norm": 0.36360896642596874,
"learning_rate": 6.785783884031596e-05,
"loss": 0.3761,
"step": 519
},
{
"epoch": 1.6501388337961127,
"grad_norm": 0.3241622597538385,
"learning_rate": 6.779413073968798e-05,
"loss": 0.3781,
"step": 520
},
{
"epoch": 1.653312177707259,
"grad_norm": 0.29559003085786345,
"learning_rate": 6.77302860194746e-05,
"loss": 0.3767,
"step": 521
},
{
"epoch": 1.6564855216184053,
"grad_norm": 0.30944837922176927,
"learning_rate": 6.766630499349888e-05,
"loss": 0.3747,
"step": 522
},
{
"epoch": 1.6596588655295519,
"grad_norm": 0.3338849982759374,
"learning_rate": 6.760218797625389e-05,
"loss": 0.3778,
"step": 523
},
{
"epoch": 1.662832209440698,
"grad_norm": 0.305222030262685,
"learning_rate": 6.753793528290112e-05,
"loss": 0.3861,
"step": 524
},
{
"epoch": 1.6660055533518445,
"grad_norm": 0.23959583080697613,
"learning_rate": 6.747354722926903e-05,
"loss": 0.3845,
"step": 525
},
{
"epoch": 1.6691788972629908,
"grad_norm": 0.28220363781779934,
"learning_rate": 6.740902413185133e-05,
"loss": 0.3788,
"step": 526
},
{
"epoch": 1.6723522411741372,
"grad_norm": 0.2787836540873191,
"learning_rate": 6.734436630780565e-05,
"loss": 0.379,
"step": 527
},
{
"epoch": 1.6755255850852837,
"grad_norm": 0.24464321106241743,
"learning_rate": 6.727957407495174e-05,
"loss": 0.3745,
"step": 528
},
{
"epoch": 1.67869892899643,
"grad_norm": 0.2605023337214791,
"learning_rate": 6.721464775177009e-05,
"loss": 0.3742,
"step": 529
},
{
"epoch": 1.6818722729075763,
"grad_norm": 0.257073037128768,
"learning_rate": 6.71495876574003e-05,
"loss": 0.3696,
"step": 530
},
{
"epoch": 1.685045616818723,
"grad_norm": 0.22905434546418404,
"learning_rate": 6.708439411163948e-05,
"loss": 0.3744,
"step": 531
},
{
"epoch": 1.688218960729869,
"grad_norm": 0.18138198252013102,
"learning_rate": 6.701906743494075e-05,
"loss": 0.3822,
"step": 532
},
{
"epoch": 1.6913923046410155,
"grad_norm": 0.21324403956364202,
"learning_rate": 6.695360794841156e-05,
"loss": 0.374,
"step": 533
},
{
"epoch": 1.6945656485521619,
"grad_norm": 0.2589520378305425,
"learning_rate": 6.688801597381223e-05,
"loss": 0.3766,
"step": 534
},
{
"epoch": 1.6977389924633082,
"grad_norm": 0.2739815290314333,
"learning_rate": 6.68222918335543e-05,
"loss": 0.3787,
"step": 535
},
{
"epoch": 1.7009123363744547,
"grad_norm": 0.2622879792779608,
"learning_rate": 6.675643585069894e-05,
"loss": 0.3762,
"step": 536
},
{
"epoch": 1.7040856802856008,
"grad_norm": 0.3267913358280892,
"learning_rate": 6.669044834895541e-05,
"loss": 0.3785,
"step": 537
},
{
"epoch": 1.7072590241967474,
"grad_norm": 0.32086484201627036,
"learning_rate": 6.662432965267944e-05,
"loss": 0.3761,
"step": 538
},
{
"epoch": 1.7104323681078937,
"grad_norm": 0.23790201809058262,
"learning_rate": 6.655808008687156e-05,
"loss": 0.3779,
"step": 539
},
{
"epoch": 1.71360571201904,
"grad_norm": 0.20836977293389647,
"learning_rate": 6.649169997717571e-05,
"loss": 0.3776,
"step": 540
},
{
"epoch": 1.7167790559301865,
"grad_norm": 0.2053799652061652,
"learning_rate": 6.642518964987739e-05,
"loss": 0.3846,
"step": 541
},
{
"epoch": 1.7199523998413329,
"grad_norm": 0.21743619572422304,
"learning_rate": 6.635854943190221e-05,
"loss": 0.3818,
"step": 542
},
{
"epoch": 1.7231257437524792,
"grad_norm": 0.2330911970783851,
"learning_rate": 6.629177965081428e-05,
"loss": 0.3728,
"step": 543
},
{
"epoch": 1.7262990876636255,
"grad_norm": 0.2971530159306548,
"learning_rate": 6.622488063481454e-05,
"loss": 0.385,
"step": 544
},
{
"epoch": 1.7294724315747718,
"grad_norm": 0.34910792893542064,
"learning_rate": 6.615785271273913e-05,
"loss": 0.375,
"step": 545
},
{
"epoch": 1.7326457754859184,
"grad_norm": 0.3668253793899281,
"learning_rate": 6.609069621405791e-05,
"loss": 0.3785,
"step": 546
},
{
"epoch": 1.7358191193970647,
"grad_norm": 0.2748830512102228,
"learning_rate": 6.602341146887267e-05,
"loss": 0.3825,
"step": 547
},
{
"epoch": 1.738992463308211,
"grad_norm": 0.20904532284444974,
"learning_rate": 6.595599880791562e-05,
"loss": 0.3825,
"step": 548
},
{
"epoch": 1.7421658072193575,
"grad_norm": 0.23319626735985258,
"learning_rate": 6.58884585625477e-05,
"loss": 0.3817,
"step": 549
},
{
"epoch": 1.7453391511305036,
"grad_norm": 0.3101304312167264,
"learning_rate": 6.582079106475702e-05,
"loss": 0.3783,
"step": 550
},
{
"epoch": 1.7485124950416502,
"grad_norm": 0.40115725345017383,
"learning_rate": 6.575299664715714e-05,
"loss": 0.3753,
"step": 551
},
{
"epoch": 1.7516858389527965,
"grad_norm": 0.3987859292934531,
"learning_rate": 6.568507564298553e-05,
"loss": 0.3855,
"step": 552
},
{
"epoch": 1.7548591828639428,
"grad_norm": 0.3456599898982391,
"learning_rate": 6.561702838610186e-05,
"loss": 0.376,
"step": 553
},
{
"epoch": 1.7580325267750894,
"grad_norm": 0.2417290147953845,
"learning_rate": 6.55488552109864e-05,
"loss": 0.3776,
"step": 554
},
{
"epoch": 1.7612058706862355,
"grad_norm": 0.20640270006206152,
"learning_rate": 6.548055645273831e-05,
"loss": 0.3803,
"step": 555
},
{
"epoch": 1.764379214597382,
"grad_norm": 0.2232956579416985,
"learning_rate": 6.541213244707412e-05,
"loss": 0.3752,
"step": 556
},
{
"epoch": 1.7675525585085283,
"grad_norm": 0.2615817344479701,
"learning_rate": 6.534358353032593e-05,
"loss": 0.375,
"step": 557
},
{
"epoch": 1.7707259024196746,
"grad_norm": 0.254734079264447,
"learning_rate": 6.52749100394399e-05,
"loss": 0.3751,
"step": 558
},
{
"epoch": 1.7738992463308212,
"grad_norm": 0.21642500358045758,
"learning_rate": 6.520611231197446e-05,
"loss": 0.3845,
"step": 559
},
{
"epoch": 1.7770725902419675,
"grad_norm": 0.19329608647091265,
"learning_rate": 6.513719068609874e-05,
"loss": 0.3789,
"step": 560
},
{
"epoch": 1.7802459341531138,
"grad_norm": 0.2530286985366438,
"learning_rate": 6.506814550059091e-05,
"loss": 0.377,
"step": 561
},
{
"epoch": 1.7834192780642604,
"grad_norm": 0.2896046782801796,
"learning_rate": 6.499897709483641e-05,
"loss": 0.375,
"step": 562
},
{
"epoch": 1.7865926219754065,
"grad_norm": 0.2712317131665261,
"learning_rate": 6.492968580882644e-05,
"loss": 0.3776,
"step": 563
},
{
"epoch": 1.789765965886553,
"grad_norm": 0.2314286827879497,
"learning_rate": 6.486027198315617e-05,
"loss": 0.3794,
"step": 564
},
{
"epoch": 1.7929393097976993,
"grad_norm": 0.21626322109134916,
"learning_rate": 6.479073595902309e-05,
"loss": 0.3774,
"step": 565
},
{
"epoch": 1.7961126537088457,
"grad_norm": 0.19733332972369577,
"learning_rate": 6.472107807822538e-05,
"loss": 0.3708,
"step": 566
},
{
"epoch": 1.7992859976199922,
"grad_norm": 0.18353210245394538,
"learning_rate": 6.465129868316016e-05,
"loss": 0.3743,
"step": 567
},
{
"epoch": 1.8024593415311383,
"grad_norm": 0.21893010037488972,
"learning_rate": 6.458139811682188e-05,
"loss": 0.3752,
"step": 568
},
{
"epoch": 1.8056326854422848,
"grad_norm": 0.2628816377905819,
"learning_rate": 6.451137672280056e-05,
"loss": 0.3731,
"step": 569
},
{
"epoch": 1.8088060293534312,
"grad_norm": 0.2548108269109919,
"learning_rate": 6.444123484528015e-05,
"loss": 0.3755,
"step": 570
},
{
"epoch": 1.8119793732645775,
"grad_norm": 0.2260163520548028,
"learning_rate": 6.437097282903685e-05,
"loss": 0.3773,
"step": 571
},
{
"epoch": 1.815152717175724,
"grad_norm": 0.19255377161005122,
"learning_rate": 6.430059101943736e-05,
"loss": 0.3788,
"step": 572
},
{
"epoch": 1.8183260610868703,
"grad_norm": 0.16745873690738822,
"learning_rate": 6.423008976243722e-05,
"loss": 0.3742,
"step": 573
},
{
"epoch": 1.8214994049980167,
"grad_norm": 0.24311299014307378,
"learning_rate": 6.415946940457911e-05,
"loss": 0.3781,
"step": 574
},
{
"epoch": 1.824672748909163,
"grad_norm": 0.2692540724073406,
"learning_rate": 6.408873029299115e-05,
"loss": 0.3758,
"step": 575
},
{
"epoch": 1.8278460928203093,
"grad_norm": 0.25293407337643353,
"learning_rate": 6.401787277538515e-05,
"loss": 0.3745,
"step": 576
},
{
"epoch": 1.8310194367314558,
"grad_norm": 0.21866660096950907,
"learning_rate": 6.394689720005499e-05,
"loss": 0.3692,
"step": 577
},
{
"epoch": 1.8341927806426022,
"grad_norm": 0.21682910277524756,
"learning_rate": 6.387580391587477e-05,
"loss": 0.3758,
"step": 578
},
{
"epoch": 1.8373661245537485,
"grad_norm": 0.25230011318318546,
"learning_rate": 6.380459327229727e-05,
"loss": 0.3783,
"step": 579
},
{
"epoch": 1.840539468464895,
"grad_norm": 0.26756389255355956,
"learning_rate": 6.373326561935207e-05,
"loss": 0.3723,
"step": 580
},
{
"epoch": 1.8437128123760411,
"grad_norm": 0.27082347370882653,
"learning_rate": 6.366182130764392e-05,
"loss": 0.3695,
"step": 581
},
{
"epoch": 1.8468861562871877,
"grad_norm": 0.2981854551164244,
"learning_rate": 6.359026068835101e-05,
"loss": 0.3725,
"step": 582
},
{
"epoch": 1.850059500198334,
"grad_norm": 0.30431911893180386,
"learning_rate": 6.351858411322324e-05,
"loss": 0.3754,
"step": 583
},
{
"epoch": 1.8532328441094803,
"grad_norm": 0.29505742082933084,
"learning_rate": 6.344679193458043e-05,
"loss": 0.374,
"step": 584
},
{
"epoch": 1.8564061880206268,
"grad_norm": 0.2999630911197104,
"learning_rate": 6.337488450531068e-05,
"loss": 0.371,
"step": 585
},
{
"epoch": 1.859579531931773,
"grad_norm": 0.35422358202016324,
"learning_rate": 6.330286217886857e-05,
"loss": 0.3689,
"step": 586
},
{
"epoch": 1.8627528758429195,
"grad_norm": 0.31054325154643925,
"learning_rate": 6.323072530927349e-05,
"loss": 0.3803,
"step": 587
},
{
"epoch": 1.8659262197540658,
"grad_norm": 0.27551184384415855,
"learning_rate": 6.31584742511078e-05,
"loss": 0.3731,
"step": 588
},
{
"epoch": 1.8690995636652121,
"grad_norm": 0.31879930611418805,
"learning_rate": 6.308610935951516e-05,
"loss": 0.3767,
"step": 589
},
{
"epoch": 1.8722729075763587,
"grad_norm": 0.33037474746040296,
"learning_rate": 6.301363099019881e-05,
"loss": 0.3751,
"step": 590
},
{
"epoch": 1.875446251487505,
"grad_norm": 0.2595876949175527,
"learning_rate": 6.294103949941975e-05,
"loss": 0.3722,
"step": 591
},
{
"epoch": 1.8786195953986513,
"grad_norm": 0.26859094455834825,
"learning_rate": 6.2868335243995e-05,
"loss": 0.3727,
"step": 592
},
{
"epoch": 1.8817929393097979,
"grad_norm": 0.3602873489651573,
"learning_rate": 6.279551858129588e-05,
"loss": 0.371,
"step": 593
},
{
"epoch": 1.884966283220944,
"grad_norm": 0.4148929616678669,
"learning_rate": 6.272258986924624e-05,
"loss": 0.3726,
"step": 594
},
{
"epoch": 1.8881396271320905,
"grad_norm": 0.3239892420647555,
"learning_rate": 6.26495494663207e-05,
"loss": 0.3754,
"step": 595
},
{
"epoch": 1.8913129710432368,
"grad_norm": 0.2219707121481662,
"learning_rate": 6.257639773154288e-05,
"loss": 0.3763,
"step": 596
},
{
"epoch": 1.8944863149543831,
"grad_norm": 0.24701125744993022,
"learning_rate": 6.250313502448368e-05,
"loss": 0.3769,
"step": 597
},
{
"epoch": 1.8976596588655297,
"grad_norm": 0.2508614116325903,
"learning_rate": 6.24297617052594e-05,
"loss": 0.3797,
"step": 598
},
{
"epoch": 1.9008330027766758,
"grad_norm": 0.24307046170154817,
"learning_rate": 6.23562781345301e-05,
"loss": 0.3768,
"step": 599
},
{
"epoch": 1.9040063466878223,
"grad_norm": 0.23596073863704914,
"learning_rate": 6.228268467349776e-05,
"loss": 0.3764,
"step": 600
},
{
"epoch": 1.9071796905989686,
"grad_norm": 0.287641306711991,
"learning_rate": 6.22089816839045e-05,
"loss": 0.3679,
"step": 601
},
{
"epoch": 1.910353034510115,
"grad_norm": 0.3350000639906152,
"learning_rate": 6.213516952803084e-05,
"loss": 0.3749,
"step": 602
},
{
"epoch": 1.9135263784212615,
"grad_norm": 0.2953436732358185,
"learning_rate": 6.20612485686939e-05,
"loss": 0.3731,
"step": 603
},
{
"epoch": 1.9166997223324078,
"grad_norm": 0.26307310235068826,
"learning_rate": 6.198721916924559e-05,
"loss": 0.3744,
"step": 604
},
{
"epoch": 1.9198730662435541,
"grad_norm": 0.21992342429818196,
"learning_rate": 6.191308169357084e-05,
"loss": 0.3776,
"step": 605
},
{
"epoch": 1.9230464101547007,
"grad_norm": 0.2096604115734754,
"learning_rate": 6.183883650608588e-05,
"loss": 0.3772,
"step": 606
},
{
"epoch": 1.9262197540658468,
"grad_norm": 0.20221136579674542,
"learning_rate": 6.176448397173632e-05,
"loss": 0.3725,
"step": 607
},
{
"epoch": 1.9293930979769933,
"grad_norm": 0.2152338983820113,
"learning_rate": 6.169002445599544e-05,
"loss": 0.3677,
"step": 608
},
{
"epoch": 1.9325664418881396,
"grad_norm": 0.22320752106286315,
"learning_rate": 6.161545832486242e-05,
"loss": 0.3756,
"step": 609
},
{
"epoch": 1.935739785799286,
"grad_norm": 0.2253156227252389,
"learning_rate": 6.154078594486045e-05,
"loss": 0.3714,
"step": 610
},
{
"epoch": 1.9389131297104325,
"grad_norm": 0.2060476121512058,
"learning_rate": 6.146600768303498e-05,
"loss": 0.3765,
"step": 611
},
{
"epoch": 1.9420864736215786,
"grad_norm": 0.25181408887727724,
"learning_rate": 6.139112390695195e-05,
"loss": 0.3736,
"step": 612
},
{
"epoch": 1.9452598175327251,
"grad_norm": 0.28050692887038614,
"learning_rate": 6.13161349846959e-05,
"loss": 0.3711,
"step": 613
},
{
"epoch": 1.9484331614438715,
"grad_norm": 0.2323287898084542,
"learning_rate": 6.124104128486824e-05,
"loss": 0.3787,
"step": 614
},
{
"epoch": 1.9516065053550178,
"grad_norm": 0.22252533467543772,
"learning_rate": 6.11658431765854e-05,
"loss": 0.3761,
"step": 615
},
{
"epoch": 1.9547798492661643,
"grad_norm": 0.23979064807450512,
"learning_rate": 6.109054102947701e-05,
"loss": 0.3768,
"step": 616
},
{
"epoch": 1.9579531931773104,
"grad_norm": 0.20369908867353018,
"learning_rate": 6.101513521368409e-05,
"loss": 0.3717,
"step": 617
},
{
"epoch": 1.961126537088457,
"grad_norm": 0.21979116733171586,
"learning_rate": 6.0939626099857256e-05,
"loss": 0.3719,
"step": 618
},
{
"epoch": 1.9642998809996033,
"grad_norm": 0.2359782325683911,
"learning_rate": 6.086401405915485e-05,
"loss": 0.3697,
"step": 619
},
{
"epoch": 1.9674732249107496,
"grad_norm": 0.22186897194718339,
"learning_rate": 6.0788299463241146e-05,
"loss": 0.3714,
"step": 620
},
{
"epoch": 1.9706465688218961,
"grad_norm": 0.19286916683029354,
"learning_rate": 6.071248268428455e-05,
"loss": 0.3694,
"step": 621
},
{
"epoch": 1.9738199127330425,
"grad_norm": 0.19038716732119104,
"learning_rate": 6.06365640949557e-05,
"loss": 0.3689,
"step": 622
},
{
"epoch": 1.9769932566441888,
"grad_norm": 0.21853406474562687,
"learning_rate": 6.0560544068425704e-05,
"loss": 0.3713,
"step": 623
},
{
"epoch": 1.9801666005553353,
"grad_norm": 0.2519707220662586,
"learning_rate": 6.048442297836424e-05,
"loss": 0.3745,
"step": 624
},
{
"epoch": 1.9833399444664814,
"grad_norm": 0.27348429383893025,
"learning_rate": 6.040820119893781e-05,
"loss": 0.3726,
"step": 625
},
{
"epoch": 1.986513288377628,
"grad_norm": 0.25727914228364634,
"learning_rate": 6.033187910480779e-05,
"loss": 0.3737,
"step": 626
},
{
"epoch": 1.9896866322887743,
"grad_norm": 0.19830437617712135,
"learning_rate": 6.025545707112868e-05,
"loss": 0.3714,
"step": 627
},
{
"epoch": 1.9928599761999206,
"grad_norm": 0.17578894688384492,
"learning_rate": 6.017893547354618e-05,
"loss": 0.3701,
"step": 628
},
{
"epoch": 1.9960333201110672,
"grad_norm": 0.28565651904457573,
"learning_rate": 6.0102314688195466e-05,
"loss": 0.3707,
"step": 629
},
{
"epoch": 1.9992066640222133,
"grad_norm": 0.3512726249295466,
"learning_rate": 6.002559509169917e-05,
"loss": 0.374,
"step": 630
},
{
"epoch": 2.00238000793336,
"grad_norm": 0.5082104882029274,
"learning_rate": 5.994877706116571e-05,
"loss": 0.6682,
"step": 631
},
{
"epoch": 2.0055533518445063,
"grad_norm": 0.7958584718546614,
"learning_rate": 5.9871860974187266e-05,
"loss": 0.3515,
"step": 632
},
{
"epoch": 2.0087266957556524,
"grad_norm": 1.2309453398579198,
"learning_rate": 5.979484720883806e-05,
"loss": 0.3691,
"step": 633
},
{
"epoch": 2.011900039666799,
"grad_norm": 0.6258361012105962,
"learning_rate": 5.971773614367244e-05,
"loss": 0.3592,
"step": 634
},
{
"epoch": 2.015073383577945,
"grad_norm": 0.7180639249872159,
"learning_rate": 5.964052815772298e-05,
"loss": 0.3567,
"step": 635
},
{
"epoch": 2.0182467274890916,
"grad_norm": 0.9313882834628479,
"learning_rate": 5.9563223630498714e-05,
"loss": 0.3601,
"step": 636
},
{
"epoch": 2.021420071400238,
"grad_norm": 0.7992267279088702,
"learning_rate": 5.9485822941983185e-05,
"loss": 0.3532,
"step": 637
},
{
"epoch": 2.0245934153113843,
"grad_norm": 0.507785688644202,
"learning_rate": 5.940832647263262e-05,
"loss": 0.3522,
"step": 638
},
{
"epoch": 2.027766759222531,
"grad_norm": 0.578349620722966,
"learning_rate": 5.933073460337404e-05,
"loss": 0.347,
"step": 639
},
{
"epoch": 2.030940103133677,
"grad_norm": 0.5232367250977141,
"learning_rate": 5.9253047715603384e-05,
"loss": 0.36,
"step": 640
},
{
"epoch": 2.0341134470448234,
"grad_norm": 0.48100594573201455,
"learning_rate": 5.917526619118368e-05,
"loss": 0.3458,
"step": 641
},
{
"epoch": 2.03728679095597,
"grad_norm": 0.3770908533781567,
"learning_rate": 5.909739041244311e-05,
"loss": 0.3491,
"step": 642
},
{
"epoch": 2.040460134867116,
"grad_norm": 0.43489851021818743,
"learning_rate": 5.9019420762173156e-05,
"loss": 0.35,
"step": 643
},
{
"epoch": 2.0436334787782626,
"grad_norm": 0.34486032651494825,
"learning_rate": 5.894135762362673e-05,
"loss": 0.3504,
"step": 644
},
{
"epoch": 2.046806822689409,
"grad_norm": 0.3111171335430375,
"learning_rate": 5.8863201380516255e-05,
"loss": 0.3518,
"step": 645
},
{
"epoch": 2.0499801666005553,
"grad_norm": 0.35964655450159183,
"learning_rate": 5.8784952417011826e-05,
"loss": 0.3492,
"step": 646
},
{
"epoch": 2.053153510511702,
"grad_norm": 0.2793218850087469,
"learning_rate": 5.8706611117739275e-05,
"loss": 0.345,
"step": 647
},
{
"epoch": 2.056326854422848,
"grad_norm": 0.2630228702707809,
"learning_rate": 5.862817786777832e-05,
"loss": 0.3527,
"step": 648
},
{
"epoch": 2.0595001983339944,
"grad_norm": 0.287600094925337,
"learning_rate": 5.854965305266065e-05,
"loss": 0.3472,
"step": 649
},
{
"epoch": 2.062673542245141,
"grad_norm": 0.23415839473852326,
"learning_rate": 5.8471037058368035e-05,
"loss": 0.3472,
"step": 650
},
{
"epoch": 2.065846886156287,
"grad_norm": 0.2480609831522893,
"learning_rate": 5.839233027133041e-05,
"loss": 0.347,
"step": 651
},
{
"epoch": 2.0690202300674336,
"grad_norm": 0.24277577378910167,
"learning_rate": 5.8313533078424016e-05,
"loss": 0.3502,
"step": 652
},
{
"epoch": 2.0721935739785797,
"grad_norm": 0.22888858417929306,
"learning_rate": 5.823464586696947e-05,
"loss": 0.3398,
"step": 653
},
{
"epoch": 2.0753669178897263,
"grad_norm": 0.22777692875348907,
"learning_rate": 5.8155669024729864e-05,
"loss": 0.3502,
"step": 654
},
{
"epoch": 2.078540261800873,
"grad_norm": 0.21302274202843294,
"learning_rate": 5.807660293990887e-05,
"loss": 0.3467,
"step": 655
},
{
"epoch": 2.081713605712019,
"grad_norm": 0.17698466873204569,
"learning_rate": 5.79974480011488e-05,
"loss": 0.3468,
"step": 656
},
{
"epoch": 2.0848869496231655,
"grad_norm": 0.18921626118682697,
"learning_rate": 5.7918204597528755e-05,
"loss": 0.3401,
"step": 657
},
{
"epoch": 2.088060293534312,
"grad_norm": 0.19642637234378701,
"learning_rate": 5.7838873118562656e-05,
"loss": 0.3467,
"step": 658
},
{
"epoch": 2.091233637445458,
"grad_norm": 0.13768919840127494,
"learning_rate": 5.775945395419736e-05,
"loss": 0.3461,
"step": 659
},
{
"epoch": 2.0944069813566046,
"grad_norm": 0.18625327191674967,
"learning_rate": 5.7679947494810707e-05,
"loss": 0.3445,
"step": 660
},
{
"epoch": 2.0975803252677507,
"grad_norm": 0.17171644648932974,
"learning_rate": 5.760035413120967e-05,
"loss": 0.3481,
"step": 661
},
{
"epoch": 2.1007536691788973,
"grad_norm": 0.181873538605243,
"learning_rate": 5.752067425462835e-05,
"loss": 0.348,
"step": 662
},
{
"epoch": 2.103927013090044,
"grad_norm": 0.19042134398803523,
"learning_rate": 5.744090825672615e-05,
"loss": 0.3447,
"step": 663
},
{
"epoch": 2.10710035700119,
"grad_norm": 0.160202051805729,
"learning_rate": 5.7361056529585736e-05,
"loss": 0.3508,
"step": 664
},
{
"epoch": 2.1102737009123365,
"grad_norm": 0.1568895178074113,
"learning_rate": 5.728111946571119e-05,
"loss": 0.3517,
"step": 665
},
{
"epoch": 2.1134470448234826,
"grad_norm": 0.16324890485854626,
"learning_rate": 5.720109745802607e-05,
"loss": 0.3464,
"step": 666
},
{
"epoch": 2.116620388734629,
"grad_norm": 0.1457391254676544,
"learning_rate": 5.712099089987146e-05,
"loss": 0.3475,
"step": 667
},
{
"epoch": 2.1197937326457756,
"grad_norm": 0.12805660018258297,
"learning_rate": 5.704080018500405e-05,
"loss": 0.3531,
"step": 668
},
{
"epoch": 2.1229670765569217,
"grad_norm": 0.14439965378280822,
"learning_rate": 5.696052570759418e-05,
"loss": 0.3395,
"step": 669
},
{
"epoch": 2.1261404204680683,
"grad_norm": 0.14704244500302163,
"learning_rate": 5.6880167862223915e-05,
"loss": 0.3456,
"step": 670
},
{
"epoch": 2.129313764379215,
"grad_norm": 0.1650010303974902,
"learning_rate": 5.6799727043885155e-05,
"loss": 0.3472,
"step": 671
},
{
"epoch": 2.132487108290361,
"grad_norm": 0.13673543011389194,
"learning_rate": 5.671920364797758e-05,
"loss": 0.3446,
"step": 672
},
{
"epoch": 2.1356604522015075,
"grad_norm": 0.1524951210391999,
"learning_rate": 5.66385980703068e-05,
"loss": 0.3409,
"step": 673
},
{
"epoch": 2.1388337961126536,
"grad_norm": 0.1651899044016252,
"learning_rate": 5.655791070708242e-05,
"loss": 0.3458,
"step": 674
},
{
"epoch": 2.1420071400238,
"grad_norm": 0.18171434979507048,
"learning_rate": 5.647714195491599e-05,
"loss": 0.3467,
"step": 675
},
{
"epoch": 2.1451804839349466,
"grad_norm": 0.1895404480414663,
"learning_rate": 5.6396292210819154e-05,
"loss": 0.3458,
"step": 676
},
{
"epoch": 2.1483538278460927,
"grad_norm": 0.1977999320741365,
"learning_rate": 5.6315361872201663e-05,
"loss": 0.3439,
"step": 677
},
{
"epoch": 2.1515271717572393,
"grad_norm": 0.1709340652432958,
"learning_rate": 5.6234351336869425e-05,
"loss": 0.3435,
"step": 678
},
{
"epoch": 2.1547005156683854,
"grad_norm": 0.19031692182789184,
"learning_rate": 5.6153261003022556e-05,
"loss": 0.3449,
"step": 679
},
{
"epoch": 2.157873859579532,
"grad_norm": 0.16069939440920603,
"learning_rate": 5.607209126925337e-05,
"loss": 0.3537,
"step": 680
},
{
"epoch": 2.1610472034906785,
"grad_norm": 0.1433199497560158,
"learning_rate": 5.599084253454452e-05,
"loss": 0.3393,
"step": 681
},
{
"epoch": 2.1642205474018246,
"grad_norm": 0.14956082186311095,
"learning_rate": 5.5909515198266965e-05,
"loss": 0.342,
"step": 682
},
{
"epoch": 2.167393891312971,
"grad_norm": 0.1544287677040818,
"learning_rate": 5.582810966017799e-05,
"loss": 0.3503,
"step": 683
},
{
"epoch": 2.170567235224117,
"grad_norm": 0.15319232608750777,
"learning_rate": 5.574662632041932e-05,
"loss": 0.3467,
"step": 684
},
{
"epoch": 2.1737405791352638,
"grad_norm": 0.14670339038716207,
"learning_rate": 5.566506557951508e-05,
"loss": 0.3422,
"step": 685
},
{
"epoch": 2.1769139230464103,
"grad_norm": 0.16667697899341136,
"learning_rate": 5.558342783836987e-05,
"loss": 0.3424,
"step": 686
},
{
"epoch": 2.1800872669575564,
"grad_norm": 0.1975464831082886,
"learning_rate": 5.550171349826675e-05,
"loss": 0.3461,
"step": 687
},
{
"epoch": 2.183260610868703,
"grad_norm": 0.17888577322944074,
"learning_rate": 5.5419922960865334e-05,
"loss": 0.3469,
"step": 688
},
{
"epoch": 2.186433954779849,
"grad_norm": 0.17541637626798942,
"learning_rate": 5.533805662819975e-05,
"loss": 0.3505,
"step": 689
},
{
"epoch": 2.1896072986909956,
"grad_norm": 0.2198583155605273,
"learning_rate": 5.5256114902676675e-05,
"loss": 0.3447,
"step": 690
},
{
"epoch": 2.192780642602142,
"grad_norm": 0.22694834599020175,
"learning_rate": 5.517409818707343e-05,
"loss": 0.3504,
"step": 691
},
{
"epoch": 2.195953986513288,
"grad_norm": 0.159759032699931,
"learning_rate": 5.50920068845359e-05,
"loss": 0.3429,
"step": 692
},
{
"epoch": 2.1991273304244348,
"grad_norm": 0.11626634052913302,
"learning_rate": 5.500984139857659e-05,
"loss": 0.3465,
"step": 693
},
{
"epoch": 2.2023006743355813,
"grad_norm": 0.16029340678767326,
"learning_rate": 5.492760213307268e-05,
"loss": 0.3493,
"step": 694
},
{
"epoch": 2.2054740182467274,
"grad_norm": 0.20138781571087522,
"learning_rate": 5.484528949226397e-05,
"loss": 0.3448,
"step": 695
},
{
"epoch": 2.208647362157874,
"grad_norm": 0.20673832786589078,
"learning_rate": 5.4762903880750956e-05,
"loss": 0.3472,
"step": 696
},
{
"epoch": 2.21182070606902,
"grad_norm": 0.1945972361781099,
"learning_rate": 5.468044570349282e-05,
"loss": 0.3452,
"step": 697
},
{
"epoch": 2.2149940499801666,
"grad_norm": 0.15260971345475988,
"learning_rate": 5.45979153658054e-05,
"loss": 0.3444,
"step": 698
},
{
"epoch": 2.218167393891313,
"grad_norm": 0.16140310676647643,
"learning_rate": 5.451531327335927e-05,
"loss": 0.3464,
"step": 699
},
{
"epoch": 2.221340737802459,
"grad_norm": 0.1710826897248202,
"learning_rate": 5.4432639832177675e-05,
"loss": 0.3477,
"step": 700
},
{
"epoch": 2.2245140817136058,
"grad_norm": 0.1907595410644264,
"learning_rate": 5.43498954486346e-05,
"loss": 0.3399,
"step": 701
},
{
"epoch": 2.227687425624752,
"grad_norm": 0.16532722473360295,
"learning_rate": 5.426708052945272e-05,
"loss": 0.3505,
"step": 702
},
{
"epoch": 2.2308607695358984,
"grad_norm": 0.1536194473143472,
"learning_rate": 5.4184195481701425e-05,
"loss": 0.3474,
"step": 703
},
{
"epoch": 2.234034113447045,
"grad_norm": 0.1520117940400569,
"learning_rate": 5.4101240712794826e-05,
"loss": 0.3437,
"step": 704
},
{
"epoch": 2.237207457358191,
"grad_norm": 0.16336250647219758,
"learning_rate": 5.401821663048974e-05,
"loss": 0.3484,
"step": 705
},
{
"epoch": 2.2403808012693376,
"grad_norm": 0.18048860842589537,
"learning_rate": 5.393512364288366e-05,
"loss": 0.3461,
"step": 706
},
{
"epoch": 2.243554145180484,
"grad_norm": 0.2077517477600907,
"learning_rate": 5.3851962158412835e-05,
"loss": 0.3469,
"step": 707
},
{
"epoch": 2.2467274890916302,
"grad_norm": 0.23694455387298174,
"learning_rate": 5.3768732585850135e-05,
"loss": 0.3458,
"step": 708
},
{
"epoch": 2.2499008330027768,
"grad_norm": 0.2221303640440427,
"learning_rate": 5.3685435334303144e-05,
"loss": 0.3475,
"step": 709
},
{
"epoch": 2.253074176913923,
"grad_norm": 0.15740414776377862,
"learning_rate": 5.360207081321215e-05,
"loss": 0.3514,
"step": 710
},
{
"epoch": 2.2562475208250694,
"grad_norm": 0.14133878333017164,
"learning_rate": 5.351863943234803e-05,
"loss": 0.3501,
"step": 711
},
{
"epoch": 2.259420864736216,
"grad_norm": 0.23287734656674552,
"learning_rate": 5.343514160181037e-05,
"loss": 0.3476,
"step": 712
},
{
"epoch": 2.262594208647362,
"grad_norm": 0.2747098009784896,
"learning_rate": 5.3351577732025324e-05,
"loss": 0.3467,
"step": 713
},
{
"epoch": 2.2657675525585086,
"grad_norm": 0.2544286593636421,
"learning_rate": 5.3267948233743705e-05,
"loss": 0.3474,
"step": 714
},
{
"epoch": 2.2689408964696547,
"grad_norm": 0.23428301254687212,
"learning_rate": 5.318425351803889e-05,
"loss": 0.3436,
"step": 715
},
{
"epoch": 2.2721142403808012,
"grad_norm": 0.19972273517158506,
"learning_rate": 5.310049399630483e-05,
"loss": 0.3428,
"step": 716
},
{
"epoch": 2.2752875842919478,
"grad_norm": 0.17312802850984527,
"learning_rate": 5.301667008025404e-05,
"loss": 0.3507,
"step": 717
},
{
"epoch": 2.278460928203094,
"grad_norm": 0.1884344961517114,
"learning_rate": 5.293278218191553e-05,
"loss": 0.344,
"step": 718
},
{
"epoch": 2.2816342721142404,
"grad_norm": 0.16251980995579765,
"learning_rate": 5.2848830713632844e-05,
"loss": 0.3524,
"step": 719
},
{
"epoch": 2.284807616025387,
"grad_norm": 0.13519307906275285,
"learning_rate": 5.2764816088061974e-05,
"loss": 0.3518,
"step": 720
},
{
"epoch": 2.287980959936533,
"grad_norm": 0.15371744727582462,
"learning_rate": 5.2680738718169355e-05,
"loss": 0.3412,
"step": 721
},
{
"epoch": 2.2911543038476796,
"grad_norm": 0.14184577228089562,
"learning_rate": 5.2596599017229864e-05,
"loss": 0.3436,
"step": 722
},
{
"epoch": 2.2943276477588257,
"grad_norm": 0.1369004603255518,
"learning_rate": 5.251239739882472e-05,
"loss": 0.3446,
"step": 723
},
{
"epoch": 2.2975009916699722,
"grad_norm": 0.15048696535358383,
"learning_rate": 5.2428134276839525e-05,
"loss": 0.3442,
"step": 724
},
{
"epoch": 2.300674335581119,
"grad_norm": 0.14200830231004927,
"learning_rate": 5.234381006546219e-05,
"loss": 0.3446,
"step": 725
},
{
"epoch": 2.303847679492265,
"grad_norm": 0.16320009543918237,
"learning_rate": 5.2259425179180873e-05,
"loss": 0.3443,
"step": 726
},
{
"epoch": 2.3070210234034114,
"grad_norm": 0.15997774955995675,
"learning_rate": 5.217498003278204e-05,
"loss": 0.346,
"step": 727
},
{
"epoch": 2.3101943673145575,
"grad_norm": 0.17017241599015379,
"learning_rate": 5.209047504134828e-05,
"loss": 0.3436,
"step": 728
},
{
"epoch": 2.313367711225704,
"grad_norm": 0.13479061683911572,
"learning_rate": 5.200591062025641e-05,
"loss": 0.346,
"step": 729
},
{
"epoch": 2.3165410551368506,
"grad_norm": 0.12285044545606259,
"learning_rate": 5.192128718517535e-05,
"loss": 0.3383,
"step": 730
},
{
"epoch": 2.3197143990479967,
"grad_norm": 0.1359930168100327,
"learning_rate": 5.1836605152064076e-05,
"loss": 0.3515,
"step": 731
},
{
"epoch": 2.3228877429591432,
"grad_norm": 0.14719463738675423,
"learning_rate": 5.175186493716963e-05,
"loss": 0.3451,
"step": 732
},
{
"epoch": 2.32606108687029,
"grad_norm": 0.1286663460165603,
"learning_rate": 5.1667066957025e-05,
"loss": 0.3512,
"step": 733
},
{
"epoch": 2.329234430781436,
"grad_norm": 0.10867634667327339,
"learning_rate": 5.158221162844717e-05,
"loss": 0.3439,
"step": 734
},
{
"epoch": 2.3324077746925824,
"grad_norm": 0.1591243176843703,
"learning_rate": 5.1497299368534965e-05,
"loss": 0.3511,
"step": 735
},
{
"epoch": 2.3355811186037285,
"grad_norm": 0.12942608362042907,
"learning_rate": 5.1412330594667075e-05,
"loss": 0.3434,
"step": 736
},
{
"epoch": 2.338754462514875,
"grad_norm": 0.10948773984311001,
"learning_rate": 5.132730572449997e-05,
"loss": 0.3524,
"step": 737
},
{
"epoch": 2.3419278064260216,
"grad_norm": 0.11783857681490648,
"learning_rate": 5.124222517596586e-05,
"loss": 0.349,
"step": 738
},
{
"epoch": 2.3451011503371677,
"grad_norm": 0.13439082971050517,
"learning_rate": 5.115708936727065e-05,
"loss": 0.3405,
"step": 739
},
{
"epoch": 2.3482744942483142,
"grad_norm": 0.11812799919852607,
"learning_rate": 5.1071898716891853e-05,
"loss": 0.3461,
"step": 740
},
{
"epoch": 2.3514478381594603,
"grad_norm": 0.11652694906685064,
"learning_rate": 5.098665364357656e-05,
"loss": 0.3426,
"step": 741
},
{
"epoch": 2.354621182070607,
"grad_norm": 0.139468896494999,
"learning_rate": 5.0901354566339355e-05,
"loss": 0.3426,
"step": 742
},
{
"epoch": 2.3577945259817534,
"grad_norm": 0.13901937078693805,
"learning_rate": 5.0816001904460316e-05,
"loss": 0.3453,
"step": 743
},
{
"epoch": 2.3609678698928995,
"grad_norm": 0.1469507185762338,
"learning_rate": 5.073059607748287e-05,
"loss": 0.3438,
"step": 744
},
{
"epoch": 2.364141213804046,
"grad_norm": 0.12953053633992156,
"learning_rate": 5.064513750521179e-05,
"loss": 0.348,
"step": 745
},
{
"epoch": 2.3673145577151926,
"grad_norm": 0.15355601770329005,
"learning_rate": 5.05596266077111e-05,
"loss": 0.3487,
"step": 746
},
{
"epoch": 2.3704879016263387,
"grad_norm": 0.17548971874850716,
"learning_rate": 5.047406380530205e-05,
"loss": 0.3435,
"step": 747
},
{
"epoch": 2.3736612455374853,
"grad_norm": 0.1336318519267535,
"learning_rate": 5.038844951856101e-05,
"loss": 0.3472,
"step": 748
},
{
"epoch": 2.3768345894486314,
"grad_norm": 0.15489849858402407,
"learning_rate": 5.0302784168317405e-05,
"loss": 0.3507,
"step": 749
},
{
"epoch": 2.380007933359778,
"grad_norm": 0.15730467807777396,
"learning_rate": 5.021706817565168e-05,
"loss": 0.3494,
"step": 750
},
{
"epoch": 2.383181277270924,
"grad_norm": 0.14142472540150533,
"learning_rate": 5.013130196189319e-05,
"loss": 0.3508,
"step": 751
},
{
"epoch": 2.3863546211820705,
"grad_norm": 0.1309447338438405,
"learning_rate": 5.004548594861815e-05,
"loss": 0.3414,
"step": 752
},
{
"epoch": 2.389527965093217,
"grad_norm": 0.13689757661127885,
"learning_rate": 4.995962055764758e-05,
"loss": 0.3462,
"step": 753
},
{
"epoch": 2.392701309004363,
"grad_norm": 0.15155743192346277,
"learning_rate": 4.987370621104518e-05,
"loss": 0.3502,
"step": 754
},
{
"epoch": 2.3958746529155097,
"grad_norm": 0.1771759302131552,
"learning_rate": 4.978774333111532e-05,
"loss": 0.3415,
"step": 755
},
{
"epoch": 2.3990479968266563,
"grad_norm": 0.14297561112928286,
"learning_rate": 4.97017323404009e-05,
"loss": 0.3445,
"step": 756
},
{
"epoch": 2.4022213407378024,
"grad_norm": 0.1359038159105317,
"learning_rate": 4.9615673661681314e-05,
"loss": 0.3431,
"step": 757
},
{
"epoch": 2.405394684648949,
"grad_norm": 0.14972172604238929,
"learning_rate": 4.952956771797039e-05,
"loss": 0.3449,
"step": 758
},
{
"epoch": 2.4085680285600954,
"grad_norm": 0.14098278608768156,
"learning_rate": 4.9443414932514245e-05,
"loss": 0.3446,
"step": 759
},
{
"epoch": 2.4117413724712415,
"grad_norm": 0.1300591686668367,
"learning_rate": 4.935721572878927e-05,
"loss": 0.3469,
"step": 760
},
{
"epoch": 2.414914716382388,
"grad_norm": 0.13525638291526695,
"learning_rate": 4.9270970530499995e-05,
"loss": 0.3457,
"step": 761
},
{
"epoch": 2.418088060293534,
"grad_norm": 0.13706454820860725,
"learning_rate": 4.918467976157704e-05,
"loss": 0.3493,
"step": 762
},
{
"epoch": 2.4212614042046807,
"grad_norm": 0.1396470750888246,
"learning_rate": 4.909834384617505e-05,
"loss": 0.3466,
"step": 763
},
{
"epoch": 2.424434748115827,
"grad_norm": 0.1579884171451332,
"learning_rate": 4.901196320867054e-05,
"loss": 0.3434,
"step": 764
},
{
"epoch": 2.4276080920269734,
"grad_norm": 0.14762421037735238,
"learning_rate": 4.89255382736599e-05,
"loss": 0.3461,
"step": 765
},
{
"epoch": 2.43078143593812,
"grad_norm": 0.1484907586908843,
"learning_rate": 4.883906946595721e-05,
"loss": 0.3429,
"step": 766
},
{
"epoch": 2.433954779849266,
"grad_norm": 0.15347798353398318,
"learning_rate": 4.875255721059223e-05,
"loss": 0.3471,
"step": 767
},
{
"epoch": 2.4371281237604125,
"grad_norm": 0.1291098504201812,
"learning_rate": 4.8666001932808315e-05,
"loss": 0.3492,
"step": 768
},
{
"epoch": 2.440301467671559,
"grad_norm": 0.1277300904749933,
"learning_rate": 4.857940405806022e-05,
"loss": 0.3431,
"step": 769
},
{
"epoch": 2.443474811582705,
"grad_norm": 0.12939888924222556,
"learning_rate": 4.8492764012012146e-05,
"loss": 0.3467,
"step": 770
},
{
"epoch": 2.4466481554938517,
"grad_norm": 0.1224426894168111,
"learning_rate": 4.840608222053553e-05,
"loss": 0.3417,
"step": 771
},
{
"epoch": 2.449821499404998,
"grad_norm": 0.1497720241182753,
"learning_rate": 4.831935910970706e-05,
"loss": 0.3444,
"step": 772
},
{
"epoch": 2.4529948433161444,
"grad_norm": 0.11319605561124714,
"learning_rate": 4.8232595105806486e-05,
"loss": 0.3445,
"step": 773
},
{
"epoch": 2.456168187227291,
"grad_norm": 0.1345350003328084,
"learning_rate": 4.814579063531458e-05,
"loss": 0.3453,
"step": 774
},
{
"epoch": 2.459341531138437,
"grad_norm": 0.14108407549438753,
"learning_rate": 4.8058946124911014e-05,
"loss": 0.3456,
"step": 775
},
{
"epoch": 2.4625148750495836,
"grad_norm": 0.13444987730516147,
"learning_rate": 4.797206200147229e-05,
"loss": 0.3444,
"step": 776
},
{
"epoch": 2.4656882189607296,
"grad_norm": 0.1375640631845117,
"learning_rate": 4.78851386920696e-05,
"loss": 0.3446,
"step": 777
},
{
"epoch": 2.468861562871876,
"grad_norm": 0.14096157244272645,
"learning_rate": 4.779817662396679e-05,
"loss": 0.3478,
"step": 778
},
{
"epoch": 2.4720349067830227,
"grad_norm": 0.12488925047337064,
"learning_rate": 4.771117622461816e-05,
"loss": 0.3413,
"step": 779
},
{
"epoch": 2.475208250694169,
"grad_norm": 0.13287851067977652,
"learning_rate": 4.7624137921666475e-05,
"loss": 0.3489,
"step": 780
},
{
"epoch": 2.4783815946053154,
"grad_norm": 0.13197704125308232,
"learning_rate": 4.753706214294082e-05,
"loss": 0.3455,
"step": 781
},
{
"epoch": 2.481554938516462,
"grad_norm": 0.13265503243246032,
"learning_rate": 4.7449949316454425e-05,
"loss": 0.3439,
"step": 782
},
{
"epoch": 2.484728282427608,
"grad_norm": 0.130704370235696,
"learning_rate": 4.736279987040269e-05,
"loss": 0.3457,
"step": 783
},
{
"epoch": 2.4879016263387546,
"grad_norm": 0.14148325746554727,
"learning_rate": 4.727561423316099e-05,
"loss": 0.3463,
"step": 784
},
{
"epoch": 2.4910749702499007,
"grad_norm": 0.13608094038136562,
"learning_rate": 4.7188392833282575e-05,
"loss": 0.3484,
"step": 785
},
{
"epoch": 2.494248314161047,
"grad_norm": 0.13874721828210285,
"learning_rate": 4.710113609949653e-05,
"loss": 0.346,
"step": 786
},
{
"epoch": 2.4974216580721937,
"grad_norm": 0.14977483842518804,
"learning_rate": 4.701384446070557e-05,
"loss": 0.3424,
"step": 787
},
{
"epoch": 2.50059500198334,
"grad_norm": 0.12967959257083173,
"learning_rate": 4.6926518345984026e-05,
"loss": 0.3406,
"step": 788
},
{
"epoch": 2.5037683458944864,
"grad_norm": 0.12801141105870723,
"learning_rate": 4.683915818457566e-05,
"loss": 0.3396,
"step": 789
},
{
"epoch": 2.5069416898056325,
"grad_norm": 0.13327811790487024,
"learning_rate": 4.67517644058916e-05,
"loss": 0.3403,
"step": 790
},
{
"epoch": 2.510115033716779,
"grad_norm": 0.16238667666380158,
"learning_rate": 4.6664337439508226e-05,
"loss": 0.3411,
"step": 791
},
{
"epoch": 2.5132883776279256,
"grad_norm": 0.16494439303835634,
"learning_rate": 4.657687771516503e-05,
"loss": 0.3409,
"step": 792
},
{
"epoch": 2.5164617215390717,
"grad_norm": 0.14403868840938389,
"learning_rate": 4.6489385662762544e-05,
"loss": 0.3439,
"step": 793
},
{
"epoch": 2.519635065450218,
"grad_norm": 0.13948670219193632,
"learning_rate": 4.640186171236018e-05,
"loss": 0.3455,
"step": 794
},
{
"epoch": 2.5228084093613647,
"grad_norm": 0.16277210257572064,
"learning_rate": 4.6314306294174164e-05,
"loss": 0.3467,
"step": 795
},
{
"epoch": 2.525981753272511,
"grad_norm": 0.17170615713941978,
"learning_rate": 4.622671983857539e-05,
"loss": 0.3447,
"step": 796
},
{
"epoch": 2.5291550971836574,
"grad_norm": 0.15234975751861424,
"learning_rate": 4.6139102776087316e-05,
"loss": 0.3468,
"step": 797
},
{
"epoch": 2.5323284410948035,
"grad_norm": 0.11279769192482242,
"learning_rate": 4.605145553738385e-05,
"loss": 0.3452,
"step": 798
},
{
"epoch": 2.53550178500595,
"grad_norm": 0.13961162984914652,
"learning_rate": 4.5963778553287215e-05,
"loss": 0.3396,
"step": 799
},
{
"epoch": 2.538675128917096,
"grad_norm": 0.15754039363591266,
"learning_rate": 4.587607225476585e-05,
"loss": 0.3464,
"step": 800
},
{
"epoch": 2.5418484728282427,
"grad_norm": 0.15400642858424615,
"learning_rate": 4.57883370729323e-05,
"loss": 0.3429,
"step": 801
},
{
"epoch": 2.545021816739389,
"grad_norm": 0.13587095295609775,
"learning_rate": 4.570057343904107e-05,
"loss": 0.3444,
"step": 802
},
{
"epoch": 2.5481951606505353,
"grad_norm": 0.1661934512206222,
"learning_rate": 4.5612781784486516e-05,
"loss": 0.3456,
"step": 803
},
{
"epoch": 2.551368504561682,
"grad_norm": 0.1818284189001732,
"learning_rate": 4.5524962540800726e-05,
"loss": 0.3454,
"step": 804
},
{
"epoch": 2.5545418484728284,
"grad_norm": 0.134272995751436,
"learning_rate": 4.5437116139651416e-05,
"loss": 0.3458,
"step": 805
},
{
"epoch": 2.5577151923839745,
"grad_norm": 0.11098594445466681,
"learning_rate": 4.5349243012839787e-05,
"loss": 0.3428,
"step": 806
},
{
"epoch": 2.560888536295121,
"grad_norm": 0.1383868261385255,
"learning_rate": 4.52613435922984e-05,
"loss": 0.3433,
"step": 807
},
{
"epoch": 2.5640618802062676,
"grad_norm": 0.13958525369734295,
"learning_rate": 4.517341831008906e-05,
"loss": 0.343,
"step": 808
},
{
"epoch": 2.5672352241174137,
"grad_norm": 0.14299669733617631,
"learning_rate": 4.5085467598400687e-05,
"loss": 0.3481,
"step": 809
},
{
"epoch": 2.57040856802856,
"grad_norm": 0.16300391548684726,
"learning_rate": 4.499749188954721e-05,
"loss": 0.3442,
"step": 810
},
{
"epoch": 2.5735819119397063,
"grad_norm": 0.15939899931970958,
"learning_rate": 4.490949161596545e-05,
"loss": 0.346,
"step": 811
},
{
"epoch": 2.576755255850853,
"grad_norm": 0.11711293584718259,
"learning_rate": 4.4821467210212924e-05,
"loss": 0.3394,
"step": 812
},
{
"epoch": 2.579928599761999,
"grad_norm": 0.12318146826993423,
"learning_rate": 4.473341910496579e-05,
"loss": 0.3391,
"step": 813
},
{
"epoch": 2.5831019436731455,
"grad_norm": 0.12321952201871014,
"learning_rate": 4.464534773301674e-05,
"loss": 0.3424,
"step": 814
},
{
"epoch": 2.586275287584292,
"grad_norm": 0.12259038741152185,
"learning_rate": 4.455725352727276e-05,
"loss": 0.346,
"step": 815
},
{
"epoch": 2.589448631495438,
"grad_norm": 0.14443376513578737,
"learning_rate": 4.446913692075311e-05,
"loss": 0.3436,
"step": 816
},
{
"epoch": 2.5926219754065847,
"grad_norm": 0.1381230568223312,
"learning_rate": 4.438099834658716e-05,
"loss": 0.3444,
"step": 817
},
{
"epoch": 2.595795319317731,
"grad_norm": 0.12800321400776432,
"learning_rate": 4.429283823801227e-05,
"loss": 0.3423,
"step": 818
},
{
"epoch": 2.5989686632288773,
"grad_norm": 0.14739193433103212,
"learning_rate": 4.420465702837162e-05,
"loss": 0.346,
"step": 819
},
{
"epoch": 2.602142007140024,
"grad_norm": 0.14778467346501864,
"learning_rate": 4.4116455151112135e-05,
"loss": 0.3481,
"step": 820
},
{
"epoch": 2.6053153510511704,
"grad_norm": 0.11699717837600367,
"learning_rate": 4.4028233039782336e-05,
"loss": 0.3468,
"step": 821
},
{
"epoch": 2.6084886949623165,
"grad_norm": 0.12377912947544831,
"learning_rate": 4.393999112803017e-05,
"loss": 0.3422,
"step": 822
},
{
"epoch": 2.611662038873463,
"grad_norm": 0.11993635420501132,
"learning_rate": 4.385172984960093e-05,
"loss": 0.3479,
"step": 823
},
{
"epoch": 2.614835382784609,
"grad_norm": 0.1198156205225748,
"learning_rate": 4.3763449638335124e-05,
"loss": 0.3486,
"step": 824
},
{
"epoch": 2.6180087266957557,
"grad_norm": 0.14812630286596426,
"learning_rate": 4.367515092816628e-05,
"loss": 0.3494,
"step": 825
},
{
"epoch": 2.621182070606902,
"grad_norm": 0.11969830749952089,
"learning_rate": 4.3586834153118905e-05,
"loss": 0.343,
"step": 826
},
{
"epoch": 2.6243554145180483,
"grad_norm": 0.1330251306537408,
"learning_rate": 4.3498499747306243e-05,
"loss": 0.342,
"step": 827
},
{
"epoch": 2.627528758429195,
"grad_norm": 0.12546162647602022,
"learning_rate": 4.3410148144928256e-05,
"loss": 0.3425,
"step": 828
},
{
"epoch": 2.630702102340341,
"grad_norm": 0.13410489232741177,
"learning_rate": 4.332177978026943e-05,
"loss": 0.3483,
"step": 829
},
{
"epoch": 2.6338754462514875,
"grad_norm": 0.1232508805552152,
"learning_rate": 4.3233395087696585e-05,
"loss": 0.346,
"step": 830
},
{
"epoch": 2.637048790162634,
"grad_norm": 0.12167537812092089,
"learning_rate": 4.314499450165688e-05,
"loss": 0.3409,
"step": 831
},
{
"epoch": 2.64022213407378,
"grad_norm": 0.13645509206436196,
"learning_rate": 4.305657845667553e-05,
"loss": 0.3462,
"step": 832
},
{
"epoch": 2.6433954779849267,
"grad_norm": 0.10599165500884121,
"learning_rate": 4.296814738735376e-05,
"loss": 0.3464,
"step": 833
},
{
"epoch": 2.6465688218960732,
"grad_norm": 0.13223375794191417,
"learning_rate": 4.2879701728366686e-05,
"loss": 0.3438,
"step": 834
},
{
"epoch": 2.6497421658072193,
"grad_norm": 0.12369661834770018,
"learning_rate": 4.2791241914461076e-05,
"loss": 0.348,
"step": 835
},
{
"epoch": 2.652915509718366,
"grad_norm": 0.1225955125555118,
"learning_rate": 4.270276838045331e-05,
"loss": 0.3396,
"step": 836
},
{
"epoch": 2.656088853629512,
"grad_norm": 0.11973226274198374,
"learning_rate": 4.26142815612272e-05,
"loss": 0.3473,
"step": 837
},
{
"epoch": 2.6592621975406585,
"grad_norm": 0.1302252203210119,
"learning_rate": 4.252578189173186e-05,
"loss": 0.3421,
"step": 838
},
{
"epoch": 2.6624355414518046,
"grad_norm": 0.13620263914799938,
"learning_rate": 4.2437269806979574e-05,
"loss": 0.3447,
"step": 839
},
{
"epoch": 2.665608885362951,
"grad_norm": 0.12216233976282596,
"learning_rate": 4.234874574204364e-05,
"loss": 0.3481,
"step": 840
},
{
"epoch": 2.6687822292740977,
"grad_norm": 0.14511224880697332,
"learning_rate": 4.226021013205626e-05,
"loss": 0.3433,
"step": 841
},
{
"epoch": 2.671955573185244,
"grad_norm": 0.13167003525580787,
"learning_rate": 4.217166341220635e-05,
"loss": 0.3462,
"step": 842
},
{
"epoch": 2.6751289170963903,
"grad_norm": 0.11233528339177638,
"learning_rate": 4.208310601773749e-05,
"loss": 0.3422,
"step": 843
},
{
"epoch": 2.678302261007537,
"grad_norm": 0.116009578862862,
"learning_rate": 4.1994538383945686e-05,
"loss": 0.3382,
"step": 844
},
{
"epoch": 2.681475604918683,
"grad_norm": 0.14193128052476364,
"learning_rate": 4.190596094617729e-05,
"loss": 0.3477,
"step": 845
},
{
"epoch": 2.6846489488298295,
"grad_norm": 0.14084760732606688,
"learning_rate": 4.1817374139826857e-05,
"loss": 0.3416,
"step": 846
},
{
"epoch": 2.687822292740976,
"grad_norm": 0.1182925424974134,
"learning_rate": 4.172877840033496e-05,
"loss": 0.346,
"step": 847
},
{
"epoch": 2.690995636652122,
"grad_norm": 0.14243194869284295,
"learning_rate": 4.164017416318611e-05,
"loss": 0.3319,
"step": 848
},
{
"epoch": 2.6941689805632687,
"grad_norm": 0.14660981394007142,
"learning_rate": 4.155156186390659e-05,
"loss": 0.3454,
"step": 849
},
{
"epoch": 2.697342324474415,
"grad_norm": 0.12282621458948739,
"learning_rate": 4.1462941938062295e-05,
"loss": 0.3425,
"step": 850
},
{
"epoch": 2.7005156683855613,
"grad_norm": 0.11456728642253448,
"learning_rate": 4.137431482125659e-05,
"loss": 0.3437,
"step": 851
},
{
"epoch": 2.7036890122967074,
"grad_norm": 0.13185395211536752,
"learning_rate": 4.128568094912825e-05,
"loss": 0.3437,
"step": 852
},
{
"epoch": 2.706862356207854,
"grad_norm": 0.11115691532463522,
"learning_rate": 4.1197040757349175e-05,
"loss": 0.35,
"step": 853
},
{
"epoch": 2.7100357001190005,
"grad_norm": 0.12965739444200763,
"learning_rate": 4.1108394681622406e-05,
"loss": 0.3469,
"step": 854
},
{
"epoch": 2.7132090440301466,
"grad_norm": 0.12606196792345453,
"learning_rate": 4.101974315767984e-05,
"loss": 0.3424,
"step": 855
},
{
"epoch": 2.716382387941293,
"grad_norm": 0.13377181792852472,
"learning_rate": 4.0931086621280195e-05,
"loss": 0.3449,
"step": 856
},
{
"epoch": 2.7195557318524397,
"grad_norm": 0.1083820821550372,
"learning_rate": 4.0842425508206814e-05,
"loss": 0.3422,
"step": 857
},
{
"epoch": 2.722729075763586,
"grad_norm": 0.14051865023509388,
"learning_rate": 4.075376025426553e-05,
"loss": 0.3432,
"step": 858
},
{
"epoch": 2.7259024196747323,
"grad_norm": 0.1446305958755852,
"learning_rate": 4.0665091295282557e-05,
"loss": 0.3421,
"step": 859
},
{
"epoch": 2.729075763585879,
"grad_norm": 0.1185199085913208,
"learning_rate": 4.0576419067102294e-05,
"loss": 0.3425,
"step": 860
},
{
"epoch": 2.732249107497025,
"grad_norm": 0.12047038069530473,
"learning_rate": 4.04877440055852e-05,
"loss": 0.3433,
"step": 861
},
{
"epoch": 2.735422451408171,
"grad_norm": 0.12298317824143638,
"learning_rate": 4.03990665466057e-05,
"loss": 0.3419,
"step": 862
},
{
"epoch": 2.7385957953193176,
"grad_norm": 0.1134666458695627,
"learning_rate": 4.0310387126049965e-05,
"loss": 0.3449,
"step": 863
},
{
"epoch": 2.741769139230464,
"grad_norm": 0.1151216459243458,
"learning_rate": 4.022170617981383e-05,
"loss": 0.3487,
"step": 864
},
{
"epoch": 2.7449424831416103,
"grad_norm": 0.12438855242604728,
"learning_rate": 4.013302414380062e-05,
"loss": 0.3433,
"step": 865
},
{
"epoch": 2.748115827052757,
"grad_norm": 0.12462238298496425,
"learning_rate": 4.004434145391903e-05,
"loss": 0.3379,
"step": 866
},
{
"epoch": 2.7512891709639034,
"grad_norm": 0.12043847383946583,
"learning_rate": 3.9955658546080975e-05,
"loss": 0.3418,
"step": 867
},
{
"epoch": 2.7544625148750495,
"grad_norm": 0.12670455035239242,
"learning_rate": 3.9866975856199376e-05,
"loss": 0.3374,
"step": 868
},
{
"epoch": 2.757635858786196,
"grad_norm": 0.13869220804125282,
"learning_rate": 3.9778293820186176e-05,
"loss": 0.344,
"step": 869
},
{
"epoch": 2.7608092026973425,
"grad_norm": 0.12510091538071094,
"learning_rate": 3.968961287395004e-05,
"loss": 0.3464,
"step": 870
},
{
"epoch": 2.7639825466084886,
"grad_norm": 0.11788439172448362,
"learning_rate": 3.960093345339432e-05,
"loss": 0.3415,
"step": 871
},
{
"epoch": 2.767155890519635,
"grad_norm": 0.13570817794277007,
"learning_rate": 3.9512255994414804e-05,
"loss": 0.3415,
"step": 872
},
{
"epoch": 2.7703292344307813,
"grad_norm": 0.13940993558742085,
"learning_rate": 3.9423580932897726e-05,
"loss": 0.3423,
"step": 873
},
{
"epoch": 2.773502578341928,
"grad_norm": 0.10631207090804463,
"learning_rate": 3.933490870471745e-05,
"loss": 0.3442,
"step": 874
},
{
"epoch": 2.776675922253074,
"grad_norm": 0.1503520815021571,
"learning_rate": 3.924623974573448e-05,
"loss": 0.3416,
"step": 875
},
{
"epoch": 2.7798492661642205,
"grad_norm": 0.12632156433331662,
"learning_rate": 3.9157574491793185e-05,
"loss": 0.3371,
"step": 876
},
{
"epoch": 2.783022610075367,
"grad_norm": 0.13734903086765304,
"learning_rate": 3.906891337871982e-05,
"loss": 0.3491,
"step": 877
},
{
"epoch": 2.786195953986513,
"grad_norm": 0.15639396884236917,
"learning_rate": 3.898025684232016e-05,
"loss": 0.3472,
"step": 878
},
{
"epoch": 2.7893692978976596,
"grad_norm": 0.12483944951607047,
"learning_rate": 3.889160531837761e-05,
"loss": 0.3381,
"step": 879
},
{
"epoch": 2.792542641808806,
"grad_norm": 0.13742280075928823,
"learning_rate": 3.8802959242650825e-05,
"loss": 0.3411,
"step": 880
},
{
"epoch": 2.7957159857199523,
"grad_norm": 0.12266800245109839,
"learning_rate": 3.8714319050871764e-05,
"loss": 0.3414,
"step": 881
},
{
"epoch": 2.798889329631099,
"grad_norm": 0.12861032154624427,
"learning_rate": 3.862568517874341e-05,
"loss": 0.3392,
"step": 882
},
{
"epoch": 2.8020626735422454,
"grad_norm": 0.11470183191097062,
"learning_rate": 3.8537058061937725e-05,
"loss": 0.343,
"step": 883
},
{
"epoch": 2.8052360174533915,
"grad_norm": 0.12702748667879213,
"learning_rate": 3.8448438136093414e-05,
"loss": 0.3453,
"step": 884
},
{
"epoch": 2.808409361364538,
"grad_norm": 0.11316231412291199,
"learning_rate": 3.8359825836813895e-05,
"loss": 0.3446,
"step": 885
},
{
"epoch": 2.811582705275684,
"grad_norm": 0.12175654245533254,
"learning_rate": 3.827122159966504e-05,
"loss": 0.3464,
"step": 886
},
{
"epoch": 2.8147560491868306,
"grad_norm": 0.12805730071128243,
"learning_rate": 3.818262586017315e-05,
"loss": 0.3421,
"step": 887
},
{
"epoch": 2.8179293930979767,
"grad_norm": 0.13404348902216565,
"learning_rate": 3.8094039053822715e-05,
"loss": 0.3448,
"step": 888
},
{
"epoch": 2.8211027370091233,
"grad_norm": 0.12244995804431573,
"learning_rate": 3.800546161605433e-05,
"loss": 0.3408,
"step": 889
},
{
"epoch": 2.82427608092027,
"grad_norm": 0.11165322485961218,
"learning_rate": 3.791689398226252e-05,
"loss": 0.3448,
"step": 890
},
{
"epoch": 2.827449424831416,
"grad_norm": 0.12392250807575451,
"learning_rate": 3.7828336587793665e-05,
"loss": 0.339,
"step": 891
},
{
"epoch": 2.8306227687425625,
"grad_norm": 0.14372027694900424,
"learning_rate": 3.773978986794376e-05,
"loss": 0.3471,
"step": 892
},
{
"epoch": 2.833796112653709,
"grad_norm": 0.11903041102933404,
"learning_rate": 3.765125425795637e-05,
"loss": 0.3445,
"step": 893
},
{
"epoch": 2.836969456564855,
"grad_norm": 0.11348910315003628,
"learning_rate": 3.7562730193020425e-05,
"loss": 0.34,
"step": 894
},
{
"epoch": 2.8401428004760016,
"grad_norm": 0.12038861322563689,
"learning_rate": 3.747421810826815e-05,
"loss": 0.3389,
"step": 895
},
{
"epoch": 2.843316144387148,
"grad_norm": 0.1097622809874721,
"learning_rate": 3.73857184387728e-05,
"loss": 0.3471,
"step": 896
},
{
"epoch": 2.8464894882982943,
"grad_norm": 0.11347921572685421,
"learning_rate": 3.7297231619546695e-05,
"loss": 0.3443,
"step": 897
},
{
"epoch": 2.849662832209441,
"grad_norm": 0.12369160424846627,
"learning_rate": 3.7208758085538924e-05,
"loss": 0.3432,
"step": 898
},
{
"epoch": 2.852836176120587,
"grad_norm": 0.12357156761028097,
"learning_rate": 3.712029827163332e-05,
"loss": 0.3414,
"step": 899
},
{
"epoch": 2.8560095200317335,
"grad_norm": 0.11376937375822221,
"learning_rate": 3.703185261264624e-05,
"loss": 0.3401,
"step": 900
},
{
"epoch": 2.8591828639428796,
"grad_norm": 0.12349127066450852,
"learning_rate": 3.694342154332449e-05,
"loss": 0.3422,
"step": 901
},
{
"epoch": 2.862356207854026,
"grad_norm": 0.1034961716279667,
"learning_rate": 3.685500549834314e-05,
"loss": 0.3434,
"step": 902
},
{
"epoch": 2.8655295517651727,
"grad_norm": 0.12269370497851256,
"learning_rate": 3.676660491230343e-05,
"loss": 0.3383,
"step": 903
},
{
"epoch": 2.8687028956763188,
"grad_norm": 0.13382966320810855,
"learning_rate": 3.667822021973058e-05,
"loss": 0.3435,
"step": 904
},
{
"epoch": 2.8718762395874653,
"grad_norm": 0.11277938663333244,
"learning_rate": 3.658985185507175e-05,
"loss": 0.3406,
"step": 905
},
{
"epoch": 2.875049583498612,
"grad_norm": 0.1143319724175766,
"learning_rate": 3.6501500252693756e-05,
"loss": 0.3376,
"step": 906
},
{
"epoch": 2.878222927409758,
"grad_norm": 0.11689144421922465,
"learning_rate": 3.641316584688111e-05,
"loss": 0.3453,
"step": 907
},
{
"epoch": 2.8813962713209045,
"grad_norm": 0.11809016351008388,
"learning_rate": 3.632484907183372e-05,
"loss": 0.3481,
"step": 908
},
{
"epoch": 2.884569615232051,
"grad_norm": 0.1276259830733137,
"learning_rate": 3.623655036166489e-05,
"loss": 0.3421,
"step": 909
},
{
"epoch": 2.887742959143197,
"grad_norm": 0.11884475840914586,
"learning_rate": 3.6148270150399074e-05,
"loss": 0.3446,
"step": 910
},
{
"epoch": 2.8909163030543437,
"grad_norm": 0.11087658030941172,
"learning_rate": 3.606000887196985e-05,
"loss": 0.3439,
"step": 911
},
{
"epoch": 2.8940896469654898,
"grad_norm": 0.10980416285438169,
"learning_rate": 3.597176696021767e-05,
"loss": 0.3372,
"step": 912
},
{
"epoch": 2.8972629908766363,
"grad_norm": 0.11601487167763035,
"learning_rate": 3.588354484888787e-05,
"loss": 0.3459,
"step": 913
},
{
"epoch": 2.9004363347877824,
"grad_norm": 0.11368911677025695,
"learning_rate": 3.579534297162838e-05,
"loss": 0.3438,
"step": 914
},
{
"epoch": 2.903609678698929,
"grad_norm": 0.10395283389340351,
"learning_rate": 3.5707161761987745e-05,
"loss": 0.338,
"step": 915
},
{
"epoch": 2.9067830226100755,
"grad_norm": 0.11161882010475302,
"learning_rate": 3.561900165341284e-05,
"loss": 0.3408,
"step": 916
},
{
"epoch": 2.9099563665212216,
"grad_norm": 0.0997500952290493,
"learning_rate": 3.55308630792469e-05,
"loss": 0.3421,
"step": 917
},
{
"epoch": 2.913129710432368,
"grad_norm": 0.1339571225523946,
"learning_rate": 3.544274647272725e-05,
"loss": 0.3442,
"step": 918
},
{
"epoch": 2.9163030543435147,
"grad_norm": 0.11617441220009064,
"learning_rate": 3.535465226698327e-05,
"loss": 0.343,
"step": 919
},
{
"epoch": 2.9194763982546608,
"grad_norm": 0.1410191016195631,
"learning_rate": 3.526658089503421e-05,
"loss": 0.3395,
"step": 920
},
{
"epoch": 2.9226497421658073,
"grad_norm": 0.13899939104575385,
"learning_rate": 3.517853278978708e-05,
"loss": 0.3418,
"step": 921
},
{
"epoch": 2.925823086076954,
"grad_norm": 0.1448300244004051,
"learning_rate": 3.5090508384034554e-05,
"loss": 0.3434,
"step": 922
},
{
"epoch": 2.9289964299881,
"grad_norm": 0.14071342868222997,
"learning_rate": 3.5002508110452796e-05,
"loss": 0.3463,
"step": 923
},
{
"epoch": 2.9321697738992465,
"grad_norm": 0.13757441367946524,
"learning_rate": 3.491453240159932e-05,
"loss": 0.3428,
"step": 924
},
{
"epoch": 2.9353431178103926,
"grad_norm": 0.12380157230283552,
"learning_rate": 3.4826581689910956e-05,
"loss": 0.3425,
"step": 925
},
{
"epoch": 2.938516461721539,
"grad_norm": 0.14057723094623828,
"learning_rate": 3.473865640770161e-05,
"loss": 0.3411,
"step": 926
},
{
"epoch": 2.9416898056326852,
"grad_norm": 0.11650354025690383,
"learning_rate": 3.465075698716022e-05,
"loss": 0.3365,
"step": 927
},
{
"epoch": 2.9448631495438318,
"grad_norm": 0.12980737726549346,
"learning_rate": 3.4562883860348584e-05,
"loss": 0.3399,
"step": 928
},
{
"epoch": 2.9480364934549783,
"grad_norm": 0.1134055745141929,
"learning_rate": 3.447503745919929e-05,
"loss": 0.3414,
"step": 929
},
{
"epoch": 2.9512098373661244,
"grad_norm": 0.1201253976384782,
"learning_rate": 3.43872182155135e-05,
"loss": 0.3457,
"step": 930
},
{
"epoch": 2.954383181277271,
"grad_norm": 0.11035272651701446,
"learning_rate": 3.429942656095895e-05,
"loss": 0.3458,
"step": 931
},
{
"epoch": 2.9575565251884175,
"grad_norm": 0.11154353610592178,
"learning_rate": 3.4211662927067694e-05,
"loss": 0.3439,
"step": 932
},
{
"epoch": 2.9607298690995636,
"grad_norm": 0.11422392455235472,
"learning_rate": 3.412392774523416e-05,
"loss": 0.3382,
"step": 933
},
{
"epoch": 2.96390321301071,
"grad_norm": 0.10421558917351098,
"learning_rate": 3.4036221446712785e-05,
"loss": 0.3442,
"step": 934
},
{
"epoch": 2.9670765569218567,
"grad_norm": 0.10191858553645955,
"learning_rate": 3.3948544462616154e-05,
"loss": 0.3472,
"step": 935
},
{
"epoch": 2.9702499008330028,
"grad_norm": 0.10534746393290977,
"learning_rate": 3.386089722391268e-05,
"loss": 0.3426,
"step": 936
},
{
"epoch": 2.973423244744149,
"grad_norm": 0.10261252386705168,
"learning_rate": 3.3773280161424614e-05,
"loss": 0.3433,
"step": 937
},
{
"epoch": 2.9765965886552954,
"grad_norm": 0.1170256748314465,
"learning_rate": 3.368569370582584e-05,
"loss": 0.3394,
"step": 938
},
{
"epoch": 2.979769932566442,
"grad_norm": 0.11126882366159503,
"learning_rate": 3.359813828763983e-05,
"loss": 0.3397,
"step": 939
},
{
"epoch": 2.982943276477588,
"grad_norm": 0.1198757045135952,
"learning_rate": 3.351061433723746e-05,
"loss": 0.3438,
"step": 940
},
{
"epoch": 2.9861166203887346,
"grad_norm": 0.10112115474813273,
"learning_rate": 3.3423122284834976e-05,
"loss": 0.3342,
"step": 941
},
{
"epoch": 2.989289964299881,
"grad_norm": 0.11249349782419689,
"learning_rate": 3.3335662560491773e-05,
"loss": 0.3429,
"step": 942
},
{
"epoch": 2.9924633082110272,
"grad_norm": 0.1001185547565032,
"learning_rate": 3.3248235594108415e-05,
"loss": 0.3337,
"step": 943
},
{
"epoch": 2.995636652122174,
"grad_norm": 0.11017756335045858,
"learning_rate": 3.316084181542434e-05,
"loss": 0.3375,
"step": 944
},
{
"epoch": 2.9988099960333203,
"grad_norm": 0.12823056575818978,
"learning_rate": 3.307348165401598e-05,
"loss": 0.3306,
"step": 945
},
{
"epoch": 3.0019833399444664,
"grad_norm": 0.27281229082142194,
"learning_rate": 3.2986155539294435e-05,
"loss": 0.6218,
"step": 946
},
{
"epoch": 3.005156683855613,
"grad_norm": 0.2100816419248202,
"learning_rate": 3.2898863900503484e-05,
"loss": 0.3147,
"step": 947
},
{
"epoch": 3.008330027766759,
"grad_norm": 0.17105137980537694,
"learning_rate": 3.281160716671743e-05,
"loss": 0.3116,
"step": 948
},
{
"epoch": 3.0115033716779056,
"grad_norm": 0.22415298912899825,
"learning_rate": 3.2724385766839026e-05,
"loss": 0.3194,
"step": 949
},
{
"epoch": 3.014676715589052,
"grad_norm": 0.19583525189357293,
"learning_rate": 3.263720012959732e-05,
"loss": 0.3173,
"step": 950
},
{
"epoch": 3.0178500595001982,
"grad_norm": 0.15512832486443295,
"learning_rate": 3.255005068354559e-05,
"loss": 0.3157,
"step": 951
},
{
"epoch": 3.021023403411345,
"grad_norm": 0.16721068805673456,
"learning_rate": 3.2462937857059187e-05,
"loss": 0.3178,
"step": 952
},
{
"epoch": 3.024196747322491,
"grad_norm": 0.17107719512173727,
"learning_rate": 3.237586207833353e-05,
"loss": 0.3119,
"step": 953
},
{
"epoch": 3.0273700912336374,
"grad_norm": 0.1585477770306446,
"learning_rate": 3.2288823775381845e-05,
"loss": 0.3134,
"step": 954
},
{
"epoch": 3.030543435144784,
"grad_norm": 0.16343916704345743,
"learning_rate": 3.2201823376033226e-05,
"loss": 0.3132,
"step": 955
},
{
"epoch": 3.03371677905593,
"grad_norm": 0.13291081320812426,
"learning_rate": 3.21148613079304e-05,
"loss": 0.3155,
"step": 956
},
{
"epoch": 3.0368901229670766,
"grad_norm": 0.1495696649421576,
"learning_rate": 3.202793799852772e-05,
"loss": 0.3139,
"step": 957
},
{
"epoch": 3.040063466878223,
"grad_norm": 0.14466866199473793,
"learning_rate": 3.194105387508899e-05,
"loss": 0.3134,
"step": 958
},
{
"epoch": 3.0432368107893693,
"grad_norm": 0.1361091421412387,
"learning_rate": 3.1854209364685436e-05,
"loss": 0.3082,
"step": 959
},
{
"epoch": 3.046410154700516,
"grad_norm": 0.13367023176470594,
"learning_rate": 3.176740489419352e-05,
"loss": 0.3126,
"step": 960
},
{
"epoch": 3.049583498611662,
"grad_norm": 0.13627283601328063,
"learning_rate": 3.168064089029296e-05,
"loss": 0.3134,
"step": 961
},
{
"epoch": 3.0527568425228084,
"grad_norm": 0.13148674412233574,
"learning_rate": 3.159391777946447e-05,
"loss": 0.3162,
"step": 962
},
{
"epoch": 3.055930186433955,
"grad_norm": 0.12808707534046343,
"learning_rate": 3.150723598798787e-05,
"loss": 0.3118,
"step": 963
},
{
"epoch": 3.059103530345101,
"grad_norm": 0.13273988457362224,
"learning_rate": 3.1420595941939786e-05,
"loss": 0.3133,
"step": 964
},
{
"epoch": 3.0622768742562476,
"grad_norm": 0.11421318045881329,
"learning_rate": 3.13339980671917e-05,
"loss": 0.3098,
"step": 965
},
{
"epoch": 3.0654502181673937,
"grad_norm": 0.13703482566876224,
"learning_rate": 3.124744278940777e-05,
"loss": 0.3195,
"step": 966
},
{
"epoch": 3.0686235620785403,
"grad_norm": 0.11316298092381129,
"learning_rate": 3.1160930534042805e-05,
"loss": 0.3172,
"step": 967
},
{
"epoch": 3.071796905989687,
"grad_norm": 0.1189425742935073,
"learning_rate": 3.107446172634012e-05,
"loss": 0.3108,
"step": 968
},
{
"epoch": 3.074970249900833,
"grad_norm": 0.11756376777246882,
"learning_rate": 3.098803679132947e-05,
"loss": 0.3179,
"step": 969
},
{
"epoch": 3.0781435938119794,
"grad_norm": 0.1143776389372859,
"learning_rate": 3.090165615382496e-05,
"loss": 0.3102,
"step": 970
},
{
"epoch": 3.0813169377231255,
"grad_norm": 0.12423804582324503,
"learning_rate": 3.0815320238422974e-05,
"loss": 0.3115,
"step": 971
},
{
"epoch": 3.084490281634272,
"grad_norm": 0.13996003835367157,
"learning_rate": 3.072902946950001e-05,
"loss": 0.3192,
"step": 972
},
{
"epoch": 3.0876636255454186,
"grad_norm": 0.11255035117655975,
"learning_rate": 3.0642784271210734e-05,
"loss": 0.309,
"step": 973
},
{
"epoch": 3.0908369694565647,
"grad_norm": 0.12699451919385885,
"learning_rate": 3.055658506748575e-05,
"loss": 0.3134,
"step": 974
},
{
"epoch": 3.0940103133677113,
"grad_norm": 0.11635660673053044,
"learning_rate": 3.0470432282029614e-05,
"loss": 0.3167,
"step": 975
},
{
"epoch": 3.097183657278858,
"grad_norm": 0.10873609232252231,
"learning_rate": 3.0384326338318685e-05,
"loss": 0.3133,
"step": 976
},
{
"epoch": 3.100357001190004,
"grad_norm": 0.11807314933760477,
"learning_rate": 3.0298267659599115e-05,
"loss": 0.312,
"step": 977
},
{
"epoch": 3.1035303451011504,
"grad_norm": 0.11916222364590734,
"learning_rate": 3.0212256668884695e-05,
"loss": 0.3078,
"step": 978
},
{
"epoch": 3.1067036890122965,
"grad_norm": 0.12272695797856463,
"learning_rate": 3.0126293788954833e-05,
"loss": 0.3122,
"step": 979
},
{
"epoch": 3.109877032923443,
"grad_norm": 0.11540153822434568,
"learning_rate": 3.004037944235242e-05,
"loss": 0.3136,
"step": 980
},
{
"epoch": 3.1130503768345896,
"grad_norm": 0.11618456648740005,
"learning_rate": 2.9954514051381863e-05,
"loss": 0.3166,
"step": 981
},
{
"epoch": 3.1162237207457357,
"grad_norm": 0.11244633521843918,
"learning_rate": 2.9868698038106815e-05,
"loss": 0.3148,
"step": 982
},
{
"epoch": 3.1193970646568823,
"grad_norm": 0.10920070004246661,
"learning_rate": 2.9782931824348328e-05,
"loss": 0.3088,
"step": 983
},
{
"epoch": 3.1225704085680284,
"grad_norm": 0.12174658516975441,
"learning_rate": 2.9697215831682595e-05,
"loss": 0.3165,
"step": 984
},
{
"epoch": 3.125743752479175,
"grad_norm": 0.10643018736017142,
"learning_rate": 2.9611550481439e-05,
"loss": 0.3145,
"step": 985
},
{
"epoch": 3.1289170963903215,
"grad_norm": 0.10684450615726702,
"learning_rate": 2.952593619469795e-05,
"loss": 0.3092,
"step": 986
},
{
"epoch": 3.1320904403014675,
"grad_norm": 0.10941148930845793,
"learning_rate": 2.9440373392288913e-05,
"loss": 0.3138,
"step": 987
},
{
"epoch": 3.135263784212614,
"grad_norm": 0.10382261849589255,
"learning_rate": 2.9354862494788223e-05,
"loss": 0.3119,
"step": 988
},
{
"epoch": 3.13843712812376,
"grad_norm": 0.10958378594318682,
"learning_rate": 2.9269403922517145e-05,
"loss": 0.3099,
"step": 989
},
{
"epoch": 3.1416104720349067,
"grad_norm": 0.12998774556538992,
"learning_rate": 2.9183998095539684e-05,
"loss": 0.3129,
"step": 990
},
{
"epoch": 3.1447838159460533,
"grad_norm": 0.10561418907091862,
"learning_rate": 2.9098645433660655e-05,
"loss": 0.3128,
"step": 991
},
{
"epoch": 3.1479571598571994,
"grad_norm": 0.12592324141074962,
"learning_rate": 2.9013346356423446e-05,
"loss": 0.317,
"step": 992
},
{
"epoch": 3.151130503768346,
"grad_norm": 0.12513756646057633,
"learning_rate": 2.8928101283108153e-05,
"loss": 0.314,
"step": 993
},
{
"epoch": 3.1543038476794925,
"grad_norm": 0.14282969282839109,
"learning_rate": 2.884291063272935e-05,
"loss": 0.3109,
"step": 994
},
{
"epoch": 3.1574771915906386,
"grad_norm": 0.1199544503476591,
"learning_rate": 2.8757774824034146e-05,
"loss": 0.3119,
"step": 995
},
{
"epoch": 3.160650535501785,
"grad_norm": 0.11681362905939408,
"learning_rate": 2.867269427550004e-05,
"loss": 0.3065,
"step": 996
},
{
"epoch": 3.163823879412931,
"grad_norm": 0.11355969210006382,
"learning_rate": 2.8587669405332942e-05,
"loss": 0.3134,
"step": 997
},
{
"epoch": 3.1669972233240777,
"grad_norm": 0.11125944189799743,
"learning_rate": 2.850270063146505e-05,
"loss": 0.3123,
"step": 998
},
{
"epoch": 3.1701705672352243,
"grad_norm": 0.10903398382765295,
"learning_rate": 2.8417788371552847e-05,
"loss": 0.3137,
"step": 999
},
{
"epoch": 3.1733439111463704,
"grad_norm": 0.11735770219676825,
"learning_rate": 2.8332933042974997e-05,
"loss": 0.3143,
"step": 1000
},
{
"epoch": 3.176517255057517,
"grad_norm": 0.09886653400775318,
"learning_rate": 2.824813506283038e-05,
"loss": 0.312,
"step": 1001
},
{
"epoch": 3.179690598968663,
"grad_norm": 0.10362486903153832,
"learning_rate": 2.8163394847935924e-05,
"loss": 0.3063,
"step": 1002
},
{
"epoch": 3.1828639428798096,
"grad_norm": 0.11188362720582715,
"learning_rate": 2.8078712814824657e-05,
"loss": 0.3125,
"step": 1003
},
{
"epoch": 3.186037286790956,
"grad_norm": 0.09559025289026829,
"learning_rate": 2.799408937974359e-05,
"loss": 0.3121,
"step": 1004
},
{
"epoch": 3.189210630702102,
"grad_norm": 0.10812265121811047,
"learning_rate": 2.790952495865173e-05,
"loss": 0.3106,
"step": 1005
},
{
"epoch": 3.1923839746132487,
"grad_norm": 0.1121565061938828,
"learning_rate": 2.7825019967217975e-05,
"loss": 0.3189,
"step": 1006
},
{
"epoch": 3.1955573185243953,
"grad_norm": 0.1004631775145022,
"learning_rate": 2.7740574820819133e-05,
"loss": 0.3137,
"step": 1007
},
{
"epoch": 3.1987306624355414,
"grad_norm": 0.12433594061957719,
"learning_rate": 2.7656189934537815e-05,
"loss": 0.3081,
"step": 1008
},
{
"epoch": 3.201904006346688,
"grad_norm": 0.0976644148795563,
"learning_rate": 2.7571865723160484e-05,
"loss": 0.3108,
"step": 1009
},
{
"epoch": 3.205077350257834,
"grad_norm": 0.12033671452278129,
"learning_rate": 2.7487602601175274e-05,
"loss": 0.3128,
"step": 1010
},
{
"epoch": 3.2082506941689806,
"grad_norm": 0.11018748833970334,
"learning_rate": 2.7403400982770142e-05,
"loss": 0.3151,
"step": 1011
},
{
"epoch": 3.211424038080127,
"grad_norm": 0.11256643868056433,
"learning_rate": 2.731926128183064e-05,
"loss": 0.314,
"step": 1012
},
{
"epoch": 3.214597381991273,
"grad_norm": 0.11109067168322144,
"learning_rate": 2.7235183911938033e-05,
"loss": 0.3138,
"step": 1013
},
{
"epoch": 3.2177707259024197,
"grad_norm": 0.11490231986287124,
"learning_rate": 2.715116928636716e-05,
"loss": 0.3131,
"step": 1014
},
{
"epoch": 3.220944069813566,
"grad_norm": 0.1101964461135929,
"learning_rate": 2.7067217818084475e-05,
"loss": 0.3145,
"step": 1015
},
{
"epoch": 3.2241174137247124,
"grad_norm": 0.10280292922760781,
"learning_rate": 2.6983329919745968e-05,
"loss": 0.3117,
"step": 1016
},
{
"epoch": 3.227290757635859,
"grad_norm": 0.10260424316634227,
"learning_rate": 2.689950600369518e-05,
"loss": 0.3188,
"step": 1017
},
{
"epoch": 3.230464101547005,
"grad_norm": 0.09915026576929485,
"learning_rate": 2.681574648196111e-05,
"loss": 0.3128,
"step": 1018
},
{
"epoch": 3.2336374454581516,
"grad_norm": 0.09342686268392349,
"learning_rate": 2.673205176625631e-05,
"loss": 0.3127,
"step": 1019
},
{
"epoch": 3.236810789369298,
"grad_norm": 0.10375205809388013,
"learning_rate": 2.664842226797468e-05,
"loss": 0.3125,
"step": 1020
},
{
"epoch": 3.239984133280444,
"grad_norm": 0.09480673127381069,
"learning_rate": 2.656485839818964e-05,
"loss": 0.3103,
"step": 1021
},
{
"epoch": 3.2431574771915908,
"grad_norm": 0.0967591082180364,
"learning_rate": 2.648136056765197e-05,
"loss": 0.3158,
"step": 1022
},
{
"epoch": 3.246330821102737,
"grad_norm": 0.10704831819192964,
"learning_rate": 2.639792918678786e-05,
"loss": 0.3165,
"step": 1023
},
{
"epoch": 3.2495041650138834,
"grad_norm": 0.10372920487733647,
"learning_rate": 2.6314564665696853e-05,
"loss": 0.314,
"step": 1024
},
{
"epoch": 3.25267750892503,
"grad_norm": 0.10195810032807612,
"learning_rate": 2.6231267414149882e-05,
"loss": 0.3115,
"step": 1025
},
{
"epoch": 3.255850852836176,
"grad_norm": 0.10118352358935338,
"learning_rate": 2.6148037841587178e-05,
"loss": 0.3124,
"step": 1026
},
{
"epoch": 3.2590241967473226,
"grad_norm": 0.0960144819596772,
"learning_rate": 2.606487635711634e-05,
"loss": 0.3133,
"step": 1027
},
{
"epoch": 3.2621975406584687,
"grad_norm": 0.10341496259177649,
"learning_rate": 2.5981783369510262e-05,
"loss": 0.3133,
"step": 1028
},
{
"epoch": 3.265370884569615,
"grad_norm": 0.09978273187676878,
"learning_rate": 2.589875928720518e-05,
"loss": 0.3092,
"step": 1029
},
{
"epoch": 3.2685442284807618,
"grad_norm": 0.10892472596771698,
"learning_rate": 2.5815804518298575e-05,
"loss": 0.3098,
"step": 1030
},
{
"epoch": 3.271717572391908,
"grad_norm": 0.10275518766631388,
"learning_rate": 2.5732919470547295e-05,
"loss": 0.3083,
"step": 1031
},
{
"epoch": 3.2748909163030544,
"grad_norm": 0.10103072194259327,
"learning_rate": 2.5650104551365412e-05,
"loss": 0.3108,
"step": 1032
},
{
"epoch": 3.278064260214201,
"grad_norm": 0.10324262880702959,
"learning_rate": 2.556736016782234e-05,
"loss": 0.3122,
"step": 1033
},
{
"epoch": 3.281237604125347,
"grad_norm": 0.09926981679742919,
"learning_rate": 2.5484686726640744e-05,
"loss": 0.3107,
"step": 1034
},
{
"epoch": 3.2844109480364936,
"grad_norm": 0.10130743878380367,
"learning_rate": 2.540208463419462e-05,
"loss": 0.3074,
"step": 1035
},
{
"epoch": 3.2875842919476397,
"grad_norm": 0.10557324952221134,
"learning_rate": 2.5319554296507188e-05,
"loss": 0.3131,
"step": 1036
},
{
"epoch": 3.2907576358587862,
"grad_norm": 0.10083350399120836,
"learning_rate": 2.5237096119249058e-05,
"loss": 0.3204,
"step": 1037
},
{
"epoch": 3.2939309797699323,
"grad_norm": 0.0986145583348089,
"learning_rate": 2.5154710507736037e-05,
"loss": 0.3079,
"step": 1038
},
{
"epoch": 3.297104323681079,
"grad_norm": 0.11022375139763324,
"learning_rate": 2.5072397866927335e-05,
"loss": 0.317,
"step": 1039
},
{
"epoch": 3.3002776675922254,
"grad_norm": 0.1125140361024117,
"learning_rate": 2.4990158601423417e-05,
"loss": 0.317,
"step": 1040
},
{
"epoch": 3.3034510115033715,
"grad_norm": 0.11021070433389958,
"learning_rate": 2.4907993115464116e-05,
"loss": 0.3113,
"step": 1041
},
{
"epoch": 3.306624355414518,
"grad_norm": 0.10634404742096815,
"learning_rate": 2.4825901812926574e-05,
"loss": 0.3158,
"step": 1042
},
{
"epoch": 3.3097976993256646,
"grad_norm": 0.11699549952140445,
"learning_rate": 2.474388509732333e-05,
"loss": 0.3111,
"step": 1043
},
{
"epoch": 3.3129710432368107,
"grad_norm": 0.0980534798385101,
"learning_rate": 2.466194337180027e-05,
"loss": 0.3143,
"step": 1044
},
{
"epoch": 3.3161443871479572,
"grad_norm": 0.1194870796488275,
"learning_rate": 2.4580077039134683e-05,
"loss": 0.316,
"step": 1045
},
{
"epoch": 3.3193177310591038,
"grad_norm": 0.10484409113962986,
"learning_rate": 2.449828650173325e-05,
"loss": 0.3135,
"step": 1046
},
{
"epoch": 3.32249107497025,
"grad_norm": 0.10357015345939864,
"learning_rate": 2.441657216163015e-05,
"loss": 0.3109,
"step": 1047
},
{
"epoch": 3.3256644188813964,
"grad_norm": 0.11296601395220406,
"learning_rate": 2.433493442048492e-05,
"loss": 0.3133,
"step": 1048
},
{
"epoch": 3.3288377627925425,
"grad_norm": 0.09961102835039787,
"learning_rate": 2.4253373679580686e-05,
"loss": 0.3158,
"step": 1049
},
{
"epoch": 3.332011106703689,
"grad_norm": 0.10181823522886456,
"learning_rate": 2.4171890339822013e-05,
"loss": 0.3116,
"step": 1050
},
{
"epoch": 3.335184450614835,
"grad_norm": 0.10979798113117561,
"learning_rate": 2.409048480173305e-05,
"loss": 0.3162,
"step": 1051
},
{
"epoch": 3.3383577945259817,
"grad_norm": 0.094779351238252,
"learning_rate": 2.400915746545548e-05,
"loss": 0.315,
"step": 1052
},
{
"epoch": 3.3415311384371282,
"grad_norm": 0.10148023343814727,
"learning_rate": 2.392790873074664e-05,
"loss": 0.3146,
"step": 1053
},
{
"epoch": 3.3447044823482743,
"grad_norm": 0.09289268477261449,
"learning_rate": 2.384673899697746e-05,
"loss": 0.3125,
"step": 1054
},
{
"epoch": 3.347877826259421,
"grad_norm": 0.09684287864696564,
"learning_rate": 2.376564866313058e-05,
"loss": 0.3104,
"step": 1055
},
{
"epoch": 3.3510511701705674,
"grad_norm": 0.09643736175489405,
"learning_rate": 2.3684638127798336e-05,
"loss": 0.3178,
"step": 1056
},
{
"epoch": 3.3542245140817135,
"grad_norm": 0.10635216596639196,
"learning_rate": 2.3603707789180863e-05,
"loss": 0.3136,
"step": 1057
},
{
"epoch": 3.35739785799286,
"grad_norm": 0.09676123073144416,
"learning_rate": 2.3522858045084016e-05,
"loss": 0.3122,
"step": 1058
},
{
"epoch": 3.360571201904006,
"grad_norm": 0.10713967084993489,
"learning_rate": 2.344208929291759e-05,
"loss": 0.3141,
"step": 1059
},
{
"epoch": 3.3637445458151527,
"grad_norm": 0.09842859790655265,
"learning_rate": 2.3361401929693194e-05,
"loss": 0.3192,
"step": 1060
},
{
"epoch": 3.3669178897262992,
"grad_norm": 0.0960717662909176,
"learning_rate": 2.3280796352022442e-05,
"loss": 0.3053,
"step": 1061
},
{
"epoch": 3.3700912336374453,
"grad_norm": 0.10290198906684156,
"learning_rate": 2.3200272956114845e-05,
"loss": 0.3115,
"step": 1062
},
{
"epoch": 3.373264577548592,
"grad_norm": 0.09091365277942409,
"learning_rate": 2.3119832137776088e-05,
"loss": 0.3148,
"step": 1063
},
{
"epoch": 3.376437921459738,
"grad_norm": 0.10196717070207367,
"learning_rate": 2.3039474292405834e-05,
"loss": 0.3111,
"step": 1064
},
{
"epoch": 3.3796112653708845,
"grad_norm": 0.09586271821215989,
"learning_rate": 2.295919981499596e-05,
"loss": 0.3181,
"step": 1065
},
{
"epoch": 3.382784609282031,
"grad_norm": 0.09899945933563967,
"learning_rate": 2.287900910012854e-05,
"loss": 0.3162,
"step": 1066
},
{
"epoch": 3.385957953193177,
"grad_norm": 0.10322988072215468,
"learning_rate": 2.2798902541973945e-05,
"loss": 0.3153,
"step": 1067
},
{
"epoch": 3.3891312971043237,
"grad_norm": 0.09259998496704394,
"learning_rate": 2.2718880534288826e-05,
"loss": 0.3121,
"step": 1068
},
{
"epoch": 3.3923046410154702,
"grad_norm": 0.10569429128390048,
"learning_rate": 2.2638943470414274e-05,
"loss": 0.3125,
"step": 1069
},
{
"epoch": 3.3954779849266163,
"grad_norm": 0.09760048377336895,
"learning_rate": 2.2559091743273855e-05,
"loss": 0.3127,
"step": 1070
},
{
"epoch": 3.398651328837763,
"grad_norm": 0.10906819806313078,
"learning_rate": 2.2479325745371662e-05,
"loss": 0.3135,
"step": 1071
},
{
"epoch": 3.401824672748909,
"grad_norm": 0.09977494398591062,
"learning_rate": 2.239964586879033e-05,
"loss": 0.312,
"step": 1072
},
{
"epoch": 3.4049980166600555,
"grad_norm": 0.10832374432497219,
"learning_rate": 2.2320052505189307e-05,
"loss": 0.3181,
"step": 1073
},
{
"epoch": 3.408171360571202,
"grad_norm": 0.1022824580898682,
"learning_rate": 2.2240546045802657e-05,
"loss": 0.3224,
"step": 1074
},
{
"epoch": 3.411344704482348,
"grad_norm": 0.10034777595087839,
"learning_rate": 2.216112688143735e-05,
"loss": 0.3125,
"step": 1075
},
{
"epoch": 3.4145180483934947,
"grad_norm": 0.10030841402583066,
"learning_rate": 2.2081795402471248e-05,
"loss": 0.3084,
"step": 1076
},
{
"epoch": 3.417691392304641,
"grad_norm": 0.09762803350713443,
"learning_rate": 2.2002551998851214e-05,
"loss": 0.3093,
"step": 1077
},
{
"epoch": 3.4208647362157873,
"grad_norm": 0.1009931981677632,
"learning_rate": 2.192339706009115e-05,
"loss": 0.3142,
"step": 1078
},
{
"epoch": 3.424038080126934,
"grad_norm": 0.10087359052596663,
"learning_rate": 2.1844330975270146e-05,
"loss": 0.3116,
"step": 1079
},
{
"epoch": 3.42721142403808,
"grad_norm": 0.09669963400697884,
"learning_rate": 2.1765354133030537e-05,
"loss": 0.3093,
"step": 1080
},
{
"epoch": 3.4303847679492265,
"grad_norm": 0.10318895349642157,
"learning_rate": 2.1686466921576e-05,
"loss": 0.3139,
"step": 1081
},
{
"epoch": 3.433558111860373,
"grad_norm": 0.100273860386507,
"learning_rate": 2.1607669728669595e-05,
"loss": 0.3147,
"step": 1082
},
{
"epoch": 3.436731455771519,
"grad_norm": 0.10871940977569985,
"learning_rate": 2.152896294163198e-05,
"loss": 0.3195,
"step": 1083
},
{
"epoch": 3.4399047996826657,
"grad_norm": 0.09861412764646266,
"learning_rate": 2.1450346947339354e-05,
"loss": 0.3099,
"step": 1084
},
{
"epoch": 3.443078143593812,
"grad_norm": 0.1036174572244448,
"learning_rate": 2.137182213222168e-05,
"loss": 0.3141,
"step": 1085
},
{
"epoch": 3.4462514875049584,
"grad_norm": 0.10204915828850486,
"learning_rate": 2.1293388882260725e-05,
"loss": 0.3098,
"step": 1086
},
{
"epoch": 3.449424831416105,
"grad_norm": 0.09799894678131774,
"learning_rate": 2.1215047582988195e-05,
"loss": 0.3103,
"step": 1087
},
{
"epoch": 3.452598175327251,
"grad_norm": 0.09504536792403677,
"learning_rate": 2.113679861948376e-05,
"loss": 0.3127,
"step": 1088
},
{
"epoch": 3.4557715192383975,
"grad_norm": 0.09764248583285513,
"learning_rate": 2.1058642376373283e-05,
"loss": 0.3164,
"step": 1089
},
{
"epoch": 3.4589448631495436,
"grad_norm": 0.0906376025849311,
"learning_rate": 2.098057923782685e-05,
"loss": 0.3152,
"step": 1090
},
{
"epoch": 3.46211820706069,
"grad_norm": 0.09359589566718937,
"learning_rate": 2.0902609587556896e-05,
"loss": 0.3117,
"step": 1091
},
{
"epoch": 3.4652915509718367,
"grad_norm": 0.09548394143187244,
"learning_rate": 2.082473380881632e-05,
"loss": 0.3115,
"step": 1092
},
{
"epoch": 3.468464894882983,
"grad_norm": 0.09338528887344875,
"learning_rate": 2.074695228439663e-05,
"loss": 0.3114,
"step": 1093
},
{
"epoch": 3.4716382387941294,
"grad_norm": 0.09884602966359947,
"learning_rate": 2.066926539662598e-05,
"loss": 0.3145,
"step": 1094
},
{
"epoch": 3.474811582705276,
"grad_norm": 0.09230713872679078,
"learning_rate": 2.0591673527367386e-05,
"loss": 0.31,
"step": 1095
},
{
"epoch": 3.477984926616422,
"grad_norm": 0.09898085658702097,
"learning_rate": 2.051417705801681e-05,
"loss": 0.3106,
"step": 1096
},
{
"epoch": 3.4811582705275685,
"grad_norm": 0.08889141823884165,
"learning_rate": 2.04367763695013e-05,
"loss": 0.3096,
"step": 1097
},
{
"epoch": 3.4843316144387146,
"grad_norm": 0.10424786455484286,
"learning_rate": 2.0359471842277014e-05,
"loss": 0.3125,
"step": 1098
},
{
"epoch": 3.487504958349861,
"grad_norm": 0.09373054000252577,
"learning_rate": 2.0282263856327575e-05,
"loss": 0.3095,
"step": 1099
},
{
"epoch": 3.4906783022610077,
"grad_norm": 0.09743243651722891,
"learning_rate": 2.0205152791161942e-05,
"loss": 0.3138,
"step": 1100
},
{
"epoch": 3.493851646172154,
"grad_norm": 0.09903803728619201,
"learning_rate": 2.0128139025812737e-05,
"loss": 0.3117,
"step": 1101
},
{
"epoch": 3.4970249900833004,
"grad_norm": 0.09797020262865515,
"learning_rate": 2.00512229388343e-05,
"loss": 0.3083,
"step": 1102
},
{
"epoch": 3.5001983339944465,
"grad_norm": 0.10766478943557996,
"learning_rate": 1.9974404908300837e-05,
"loss": 0.3139,
"step": 1103
},
{
"epoch": 3.503371677905593,
"grad_norm": 0.09779359977262553,
"learning_rate": 1.9897685311804547e-05,
"loss": 0.3106,
"step": 1104
},
{
"epoch": 3.5065450218167395,
"grad_norm": 0.09387274555393152,
"learning_rate": 1.982106452645382e-05,
"loss": 0.3105,
"step": 1105
},
{
"epoch": 3.5097183657278856,
"grad_norm": 0.10203359221398621,
"learning_rate": 1.9744542928871335e-05,
"loss": 0.3131,
"step": 1106
},
{
"epoch": 3.512891709639032,
"grad_norm": 0.0940533035661825,
"learning_rate": 1.966812089519223e-05,
"loss": 0.3105,
"step": 1107
},
{
"epoch": 3.5160650535501787,
"grad_norm": 0.11077496419812648,
"learning_rate": 1.959179880106219e-05,
"loss": 0.3124,
"step": 1108
},
{
"epoch": 3.519238397461325,
"grad_norm": 0.08886436421169637,
"learning_rate": 1.9515577021635766e-05,
"loss": 0.3133,
"step": 1109
},
{
"epoch": 3.5224117413724714,
"grad_norm": 0.1058427262771896,
"learning_rate": 1.9439455931574306e-05,
"loss": 0.3188,
"step": 1110
},
{
"epoch": 3.5255850852836175,
"grad_norm": 0.0943621814009501,
"learning_rate": 1.9363435905044303e-05,
"loss": 0.3115,
"step": 1111
},
{
"epoch": 3.528758429194764,
"grad_norm": 0.10860537135562053,
"learning_rate": 1.9287517315715455e-05,
"loss": 0.3109,
"step": 1112
},
{
"epoch": 3.53193177310591,
"grad_norm": 0.08975638711859225,
"learning_rate": 1.9211700536758867e-05,
"loss": 0.3147,
"step": 1113
},
{
"epoch": 3.5351051170170567,
"grad_norm": 0.09431070654205716,
"learning_rate": 1.9135985940845167e-05,
"loss": 0.3061,
"step": 1114
},
{
"epoch": 3.538278460928203,
"grad_norm": 0.09176047436666471,
"learning_rate": 1.9060373900142758e-05,
"loss": 0.3039,
"step": 1115
},
{
"epoch": 3.5414518048393493,
"grad_norm": 0.09566553815030118,
"learning_rate": 1.898486478631591e-05,
"loss": 0.3096,
"step": 1116
},
{
"epoch": 3.544625148750496,
"grad_norm": 0.09398037913009233,
"learning_rate": 1.8909458970523003e-05,
"loss": 0.3117,
"step": 1117
},
{
"epoch": 3.5477984926616424,
"grad_norm": 0.09297530409510038,
"learning_rate": 1.8834156823414592e-05,
"loss": 0.3143,
"step": 1118
},
{
"epoch": 3.5509718365727885,
"grad_norm": 0.09616519717958327,
"learning_rate": 1.8758958715131763e-05,
"loss": 0.3134,
"step": 1119
},
{
"epoch": 3.554145180483935,
"grad_norm": 0.09595619987699143,
"learning_rate": 1.8683865015304107e-05,
"loss": 0.3167,
"step": 1120
},
{
"epoch": 3.5573185243950816,
"grad_norm": 0.09428857006752461,
"learning_rate": 1.860887609304806e-05,
"loss": 0.32,
"step": 1121
},
{
"epoch": 3.5604918683062277,
"grad_norm": 0.09288431779361411,
"learning_rate": 1.853399231696502e-05,
"loss": 0.3105,
"step": 1122
},
{
"epoch": 3.563665212217374,
"grad_norm": 0.09478695944187812,
"learning_rate": 1.845921405513957e-05,
"loss": 0.3136,
"step": 1123
},
{
"epoch": 3.5668385561285203,
"grad_norm": 0.09259086063066573,
"learning_rate": 1.838454167513759e-05,
"loss": 0.3082,
"step": 1124
},
{
"epoch": 3.570011900039667,
"grad_norm": 0.09731407575423262,
"learning_rate": 1.8309975544004563e-05,
"loss": 0.3097,
"step": 1125
},
{
"epoch": 3.573185243950813,
"grad_norm": 0.08908940261693532,
"learning_rate": 1.8235516028263693e-05,
"loss": 0.3102,
"step": 1126
},
{
"epoch": 3.5763585878619595,
"grad_norm": 0.0994114922720132,
"learning_rate": 1.8161163493914138e-05,
"loss": 0.3127,
"step": 1127
},
{
"epoch": 3.579531931773106,
"grad_norm": 0.09284958185097307,
"learning_rate": 1.808691830642915e-05,
"loss": 0.306,
"step": 1128
},
{
"epoch": 3.582705275684252,
"grad_norm": 0.09838434020935204,
"learning_rate": 1.8012780830754428e-05,
"loss": 0.3142,
"step": 1129
},
{
"epoch": 3.5858786195953987,
"grad_norm": 0.0957701843609928,
"learning_rate": 1.7938751431306108e-05,
"loss": 0.3059,
"step": 1130
},
{
"epoch": 3.589051963506545,
"grad_norm": 0.10452299557787663,
"learning_rate": 1.7864830471969158e-05,
"loss": 0.3149,
"step": 1131
},
{
"epoch": 3.5922253074176913,
"grad_norm": 0.09852568670954905,
"learning_rate": 1.77910183160955e-05,
"loss": 0.318,
"step": 1132
},
{
"epoch": 3.595398651328838,
"grad_norm": 0.08964997440395585,
"learning_rate": 1.771731532650226e-05,
"loss": 0.3132,
"step": 1133
},
{
"epoch": 3.5985719952399844,
"grad_norm": 0.10225504250213735,
"learning_rate": 1.7643721865469913e-05,
"loss": 0.3125,
"step": 1134
},
{
"epoch": 3.6017453391511305,
"grad_norm": 0.09337353095498398,
"learning_rate": 1.757023829474061e-05,
"loss": 0.3096,
"step": 1135
},
{
"epoch": 3.604918683062277,
"grad_norm": 0.09729026605466504,
"learning_rate": 1.7496864975516326e-05,
"loss": 0.3143,
"step": 1136
},
{
"epoch": 3.608092026973423,
"grad_norm": 0.09648684875445444,
"learning_rate": 1.7423602268457124e-05,
"loss": 0.3176,
"step": 1137
},
{
"epoch": 3.6112653708845697,
"grad_norm": 0.0891461740774676,
"learning_rate": 1.7350450533679298e-05,
"loss": 0.3076,
"step": 1138
},
{
"epoch": 3.6144387147957158,
"grad_norm": 0.09136934909265221,
"learning_rate": 1.7277410130753775e-05,
"loss": 0.3113,
"step": 1139
},
{
"epoch": 3.6176120587068623,
"grad_norm": 0.09282078764180805,
"learning_rate": 1.7204481418704136e-05,
"loss": 0.3088,
"step": 1140
},
{
"epoch": 3.620785402618009,
"grad_norm": 0.08738434824899044,
"learning_rate": 1.7131664756005012e-05,
"loss": 0.3112,
"step": 1141
},
{
"epoch": 3.623958746529155,
"grad_norm": 0.09634650667327464,
"learning_rate": 1.705896050058025e-05,
"loss": 0.3159,
"step": 1142
},
{
"epoch": 3.6271320904403015,
"grad_norm": 0.09161796695924315,
"learning_rate": 1.698636900980119e-05,
"loss": 0.3087,
"step": 1143
},
{
"epoch": 3.630305434351448,
"grad_norm": 0.08736032109536832,
"learning_rate": 1.6913890640484844e-05,
"loss": 0.3137,
"step": 1144
},
{
"epoch": 3.633478778262594,
"grad_norm": 0.09745072572329685,
"learning_rate": 1.6841525748892216e-05,
"loss": 0.3128,
"step": 1145
},
{
"epoch": 3.6366521221737407,
"grad_norm": 0.08662042757890395,
"learning_rate": 1.6769274690726523e-05,
"loss": 0.3131,
"step": 1146
},
{
"epoch": 3.639825466084887,
"grad_norm": 0.09312666065952754,
"learning_rate": 1.6697137821131443e-05,
"loss": 0.3086,
"step": 1147
},
{
"epoch": 3.6429988099960333,
"grad_norm": 0.10075435906006584,
"learning_rate": 1.6625115494689327e-05,
"loss": 0.3144,
"step": 1148
},
{
"epoch": 3.6461721539071794,
"grad_norm": 0.08168920500142361,
"learning_rate": 1.6553208065419585e-05,
"loss": 0.3067,
"step": 1149
},
{
"epoch": 3.649345497818326,
"grad_norm": 0.09473700986154443,
"learning_rate": 1.648141588677677e-05,
"loss": 0.3086,
"step": 1150
},
{
"epoch": 3.6525188417294725,
"grad_norm": 0.09995319497546384,
"learning_rate": 1.6409739311648985e-05,
"loss": 0.3153,
"step": 1151
},
{
"epoch": 3.6556921856406186,
"grad_norm": 0.08763944506301463,
"learning_rate": 1.633817869235608e-05,
"loss": 0.3109,
"step": 1152
},
{
"epoch": 3.658865529551765,
"grad_norm": 0.09881602368631207,
"learning_rate": 1.626673438064795e-05,
"loss": 0.321,
"step": 1153
},
{
"epoch": 3.6620388734629117,
"grad_norm": 0.09238325300250558,
"learning_rate": 1.6195406727702746e-05,
"loss": 0.3176,
"step": 1154
},
{
"epoch": 3.665212217374058,
"grad_norm": 0.09179321263341135,
"learning_rate": 1.6124196084125235e-05,
"loss": 0.3149,
"step": 1155
},
{
"epoch": 3.6683855612852043,
"grad_norm": 0.09632257418168569,
"learning_rate": 1.6053102799945026e-05,
"loss": 0.3152,
"step": 1156
},
{
"epoch": 3.671558905196351,
"grad_norm": 0.10636674102365737,
"learning_rate": 1.5982127224614867e-05,
"loss": 0.3183,
"step": 1157
},
{
"epoch": 3.674732249107497,
"grad_norm": 0.0872410805303305,
"learning_rate": 1.5911269707008857e-05,
"loss": 0.3114,
"step": 1158
},
{
"epoch": 3.6779055930186435,
"grad_norm": 0.09875023791232529,
"learning_rate": 1.5840530595420903e-05,
"loss": 0.3061,
"step": 1159
},
{
"epoch": 3.6810789369297896,
"grad_norm": 0.09455931044972642,
"learning_rate": 1.5769910237562798e-05,
"loss": 0.3069,
"step": 1160
},
{
"epoch": 3.684252280840936,
"grad_norm": 0.08864062113346818,
"learning_rate": 1.5699408980562653e-05,
"loss": 0.3091,
"step": 1161
},
{
"epoch": 3.6874256247520822,
"grad_norm": 0.0957648926194275,
"learning_rate": 1.562902717096316e-05,
"loss": 0.3105,
"step": 1162
},
{
"epoch": 3.690598968663229,
"grad_norm": 0.08822389225988175,
"learning_rate": 1.5558765154719867e-05,
"loss": 0.312,
"step": 1163
},
{
"epoch": 3.6937723125743753,
"grad_norm": 0.09144318120715113,
"learning_rate": 1.5488623277199463e-05,
"loss": 0.3133,
"step": 1164
},
{
"epoch": 3.6969456564855214,
"grad_norm": 0.08207249013636533,
"learning_rate": 1.5418601883178138e-05,
"loss": 0.3093,
"step": 1165
},
{
"epoch": 3.700119000396668,
"grad_norm": 0.08659915132311609,
"learning_rate": 1.5348701316839844e-05,
"loss": 0.3118,
"step": 1166
},
{
"epoch": 3.7032923443078145,
"grad_norm": 0.0829362247581557,
"learning_rate": 1.5278921921774624e-05,
"loss": 0.3101,
"step": 1167
},
{
"epoch": 3.7064656882189606,
"grad_norm": 0.08830303321169561,
"learning_rate": 1.5209264040976911e-05,
"loss": 0.3121,
"step": 1168
},
{
"epoch": 3.709639032130107,
"grad_norm": 0.08371651283489175,
"learning_rate": 1.5139728016843846e-05,
"loss": 0.3121,
"step": 1169
},
{
"epoch": 3.7128123760412537,
"grad_norm": 0.09394104777007455,
"learning_rate": 1.507031419117357e-05,
"loss": 0.3128,
"step": 1170
},
{
"epoch": 3.7159857199524,
"grad_norm": 0.08410977786060891,
"learning_rate": 1.5001022905163596e-05,
"loss": 0.3085,
"step": 1171
},
{
"epoch": 3.7191590638635463,
"grad_norm": 0.09126456097238246,
"learning_rate": 1.49318544994091e-05,
"loss": 0.3134,
"step": 1172
},
{
"epoch": 3.7223324077746924,
"grad_norm": 0.08545709007632069,
"learning_rate": 1.4862809313901268e-05,
"loss": 0.3154,
"step": 1173
},
{
"epoch": 3.725505751685839,
"grad_norm": 0.08780137354204715,
"learning_rate": 1.4793887688025534e-05,
"loss": 0.311,
"step": 1174
},
{
"epoch": 3.728679095596985,
"grad_norm": 0.09036873460150745,
"learning_rate": 1.4725089960560106e-05,
"loss": 0.3092,
"step": 1175
},
{
"epoch": 3.7318524395081316,
"grad_norm": 0.0906949944979453,
"learning_rate": 1.4656416469674067e-05,
"loss": 0.3087,
"step": 1176
},
{
"epoch": 3.735025783419278,
"grad_norm": 0.08524397104099314,
"learning_rate": 1.4587867552925886e-05,
"loss": 0.3097,
"step": 1177
},
{
"epoch": 3.7381991273304243,
"grad_norm": 0.0831715872678655,
"learning_rate": 1.4519443547261692e-05,
"loss": 0.3091,
"step": 1178
},
{
"epoch": 3.741372471241571,
"grad_norm": 0.08787932200139537,
"learning_rate": 1.445114478901362e-05,
"loss": 0.3104,
"step": 1179
},
{
"epoch": 3.7445458151527173,
"grad_norm": 0.09013424171138545,
"learning_rate": 1.4382971613898145e-05,
"loss": 0.3133,
"step": 1180
},
{
"epoch": 3.7477191590638634,
"grad_norm": 0.08872383417712887,
"learning_rate": 1.431492435701447e-05,
"loss": 0.3103,
"step": 1181
},
{
"epoch": 3.75089250297501,
"grad_norm": 0.0964431058984415,
"learning_rate": 1.424700335284286e-05,
"loss": 0.3142,
"step": 1182
},
{
"epoch": 3.7540658468861565,
"grad_norm": 0.08591965622221728,
"learning_rate": 1.4179208935243e-05,
"loss": 0.3075,
"step": 1183
},
{
"epoch": 3.7572391907973026,
"grad_norm": 0.08719874856870091,
"learning_rate": 1.4111541437452294e-05,
"loss": 0.3111,
"step": 1184
},
{
"epoch": 3.760412534708449,
"grad_norm": 0.09819733047781874,
"learning_rate": 1.4044001192084391e-05,
"loss": 0.3099,
"step": 1185
},
{
"epoch": 3.7635858786195953,
"grad_norm": 0.0836739072391128,
"learning_rate": 1.3976588531127334e-05,
"loss": 0.3115,
"step": 1186
},
{
"epoch": 3.766759222530742,
"grad_norm": 0.09633657842787988,
"learning_rate": 1.3909303785942089e-05,
"loss": 0.3154,
"step": 1187
},
{
"epoch": 3.769932566441888,
"grad_norm": 0.08398570887749328,
"learning_rate": 1.3842147287260863e-05,
"loss": 0.3114,
"step": 1188
},
{
"epoch": 3.7731059103530344,
"grad_norm": 0.08881126144850926,
"learning_rate": 1.3775119365185484e-05,
"loss": 0.3135,
"step": 1189
},
{
"epoch": 3.776279254264181,
"grad_norm": 0.087196679652796,
"learning_rate": 1.3708220349185731e-05,
"loss": 0.3096,
"step": 1190
},
{
"epoch": 3.779452598175327,
"grad_norm": 0.0818848922205358,
"learning_rate": 1.3641450568097794e-05,
"loss": 0.3072,
"step": 1191
},
{
"epoch": 3.7826259420864736,
"grad_norm": 0.09311600595200806,
"learning_rate": 1.3574810350122625e-05,
"loss": 0.3104,
"step": 1192
},
{
"epoch": 3.78579928599762,
"grad_norm": 0.0838102155532246,
"learning_rate": 1.350830002282431e-05,
"loss": 0.3128,
"step": 1193
},
{
"epoch": 3.7889726299087663,
"grad_norm": 0.08273644119209583,
"learning_rate": 1.344191991312843e-05,
"loss": 0.3099,
"step": 1194
},
{
"epoch": 3.792145973819913,
"grad_norm": 0.08673064009083564,
"learning_rate": 1.3375670347320577e-05,
"loss": 0.3108,
"step": 1195
},
{
"epoch": 3.7953193177310593,
"grad_norm": 0.08342237037133293,
"learning_rate": 1.330955165104459e-05,
"loss": 0.3074,
"step": 1196
},
{
"epoch": 3.7984926616422054,
"grad_norm": 0.08544215414242184,
"learning_rate": 1.3243564149301058e-05,
"loss": 0.3092,
"step": 1197
},
{
"epoch": 3.801666005553352,
"grad_norm": 0.07858190033954376,
"learning_rate": 1.3177708166445702e-05,
"loss": 0.3085,
"step": 1198
},
{
"epoch": 3.804839349464498,
"grad_norm": 0.0853891038054064,
"learning_rate": 1.311198402618778e-05,
"loss": 0.307,
"step": 1199
},
{
"epoch": 3.8080126933756446,
"grad_norm": 0.07989799358178139,
"learning_rate": 1.3046392051588454e-05,
"loss": 0.3128,
"step": 1200
},
{
"epoch": 3.8111860372867907,
"grad_norm": 0.08828442224939745,
"learning_rate": 1.2980932565059261e-05,
"loss": 0.3148,
"step": 1201
},
{
"epoch": 3.8143593811979373,
"grad_norm": 0.08068725629005853,
"learning_rate": 1.2915605888360516e-05,
"loss": 0.3177,
"step": 1202
},
{
"epoch": 3.817532725109084,
"grad_norm": 0.08257356138701971,
"learning_rate": 1.2850412342599712e-05,
"loss": 0.3119,
"step": 1203
},
{
"epoch": 3.82070606902023,
"grad_norm": 0.0866333410804087,
"learning_rate": 1.2785352248229907e-05,
"loss": 0.3098,
"step": 1204
},
{
"epoch": 3.8238794129313765,
"grad_norm": 0.08166886458082445,
"learning_rate": 1.2720425925048274e-05,
"loss": 0.3125,
"step": 1205
},
{
"epoch": 3.827052756842523,
"grad_norm": 0.08920230072749204,
"learning_rate": 1.2655633692194367e-05,
"loss": 0.3106,
"step": 1206
},
{
"epoch": 3.830226100753669,
"grad_norm": 0.08897293804748853,
"learning_rate": 1.259097586814867e-05,
"loss": 0.3146,
"step": 1207
},
{
"epoch": 3.8333994446648156,
"grad_norm": 0.08018138419305465,
"learning_rate": 1.2526452770730986e-05,
"loss": 0.3138,
"step": 1208
},
{
"epoch": 3.836572788575962,
"grad_norm": 0.08612406310222656,
"learning_rate": 1.246206471709889e-05,
"loss": 0.3125,
"step": 1209
},
{
"epoch": 3.8397461324871083,
"grad_norm": 0.08560142904793838,
"learning_rate": 1.2397812023746124e-05,
"loss": 0.3098,
"step": 1210
},
{
"epoch": 3.842919476398255,
"grad_norm": 0.08086605996564016,
"learning_rate": 1.2333695006501127e-05,
"loss": 0.3099,
"step": 1211
},
{
"epoch": 3.846092820309401,
"grad_norm": 0.08291451210131907,
"learning_rate": 1.2269713980525407e-05,
"loss": 0.3057,
"step": 1212
},
{
"epoch": 3.8492661642205475,
"grad_norm": 0.08303662204470749,
"learning_rate": 1.2205869260312034e-05,
"loss": 0.3133,
"step": 1213
},
{
"epoch": 3.8524395081316936,
"grad_norm": 0.08226024846397162,
"learning_rate": 1.2142161159684034e-05,
"loss": 0.3158,
"step": 1214
},
{
"epoch": 3.85561285204284,
"grad_norm": 0.08613484735188547,
"learning_rate": 1.2078589991792983e-05,
"loss": 0.3118,
"step": 1215
},
{
"epoch": 3.8587861959539866,
"grad_norm": 0.0844370020344609,
"learning_rate": 1.2015156069117278e-05,
"loss": 0.311,
"step": 1216
},
{
"epoch": 3.8619595398651327,
"grad_norm": 0.08157413068957893,
"learning_rate": 1.195185970346075e-05,
"loss": 0.3118,
"step": 1217
},
{
"epoch": 3.8651328837762793,
"grad_norm": 0.0809161660461467,
"learning_rate": 1.1888701205951084e-05,
"loss": 0.3074,
"step": 1218
},
{
"epoch": 3.868306227687426,
"grad_norm": 0.08569592239748124,
"learning_rate": 1.1825680887038274e-05,
"loss": 0.3136,
"step": 1219
},
{
"epoch": 3.871479571598572,
"grad_norm": 0.08144000303149429,
"learning_rate": 1.1762799056493095e-05,
"loss": 0.3117,
"step": 1220
},
{
"epoch": 3.8746529155097185,
"grad_norm": 0.08023713096676821,
"learning_rate": 1.1700056023405622e-05,
"loss": 0.3085,
"step": 1221
},
{
"epoch": 3.877826259420865,
"grad_norm": 0.08533264894900448,
"learning_rate": 1.1637452096183663e-05,
"loss": 0.3112,
"step": 1222
},
{
"epoch": 3.880999603332011,
"grad_norm": 0.08294444189988791,
"learning_rate": 1.1574987582551293e-05,
"loss": 0.312,
"step": 1223
},
{
"epoch": 3.884172947243157,
"grad_norm": 0.07997988212163627,
"learning_rate": 1.1512662789547249e-05,
"loss": 0.3102,
"step": 1224
},
{
"epoch": 3.8873462911543037,
"grad_norm": 0.08618908178309605,
"learning_rate": 1.1450478023523575e-05,
"loss": 0.3104,
"step": 1225
},
{
"epoch": 3.8905196350654503,
"grad_norm": 0.08855469703733576,
"learning_rate": 1.1388433590143939e-05,
"loss": 0.3099,
"step": 1226
},
{
"epoch": 3.8936929789765964,
"grad_norm": 0.08401029593007889,
"learning_rate": 1.1326529794382264e-05,
"loss": 0.3064,
"step": 1227
},
{
"epoch": 3.896866322887743,
"grad_norm": 0.08447977219178371,
"learning_rate": 1.1264766940521171e-05,
"loss": 0.3144,
"step": 1228
},
{
"epoch": 3.9000396667988895,
"grad_norm": 0.08858538731350699,
"learning_rate": 1.1203145332150505e-05,
"loss": 0.3121,
"step": 1229
},
{
"epoch": 3.9032130107100356,
"grad_norm": 0.0909725806781119,
"learning_rate": 1.1141665272165789e-05,
"loss": 0.3109,
"step": 1230
},
{
"epoch": 3.906386354621182,
"grad_norm": 0.08293088335470532,
"learning_rate": 1.1080327062766827e-05,
"loss": 0.3125,
"step": 1231
},
{
"epoch": 3.9095596985323287,
"grad_norm": 0.08569314487810697,
"learning_rate": 1.1019131005456143e-05,
"loss": 0.3083,
"step": 1232
},
{
"epoch": 3.9127330424434748,
"grad_norm": 0.08315219817463836,
"learning_rate": 1.0958077401037542e-05,
"loss": 0.313,
"step": 1233
},
{
"epoch": 3.9159063863546213,
"grad_norm": 0.08276037981985424,
"learning_rate": 1.0897166549614573e-05,
"loss": 0.314,
"step": 1234
},
{
"epoch": 3.9190797302657674,
"grad_norm": 0.08774104502041317,
"learning_rate": 1.0836398750589172e-05,
"loss": 0.3071,
"step": 1235
},
{
"epoch": 3.922253074176914,
"grad_norm": 0.08859302501482878,
"learning_rate": 1.0775774302660027e-05,
"loss": 0.3148,
"step": 1236
},
{
"epoch": 3.92542641808806,
"grad_norm": 0.08667466575751302,
"learning_rate": 1.0715293503821256e-05,
"loss": 0.309,
"step": 1237
},
{
"epoch": 3.9285997619992066,
"grad_norm": 0.08724856226631149,
"learning_rate": 1.0654956651360857e-05,
"loss": 0.3147,
"step": 1238
},
{
"epoch": 3.931773105910353,
"grad_norm": 0.08515851001054704,
"learning_rate": 1.0594764041859293e-05,
"loss": 0.312,
"step": 1239
},
{
"epoch": 3.934946449821499,
"grad_norm": 0.08664134089549802,
"learning_rate": 1.0534715971187976e-05,
"loss": 0.3065,
"step": 1240
},
{
"epoch": 3.9381197937326458,
"grad_norm": 0.07937577751856649,
"learning_rate": 1.0474812734507886e-05,
"loss": 0.3079,
"step": 1241
},
{
"epoch": 3.9412931376437923,
"grad_norm": 0.08269171013905462,
"learning_rate": 1.041505462626807e-05,
"loss": 0.3078,
"step": 1242
},
{
"epoch": 3.9444664815549384,
"grad_norm": 0.08011960205561683,
"learning_rate": 1.0355441940204215e-05,
"loss": 0.3066,
"step": 1243
},
{
"epoch": 3.947639825466085,
"grad_norm": 0.07819385992242202,
"learning_rate": 1.0295974969337186e-05,
"loss": 0.3128,
"step": 1244
},
{
"epoch": 3.9508131693772315,
"grad_norm": 0.08250283712608554,
"learning_rate": 1.0236654005971625e-05,
"loss": 0.3181,
"step": 1245
},
{
"epoch": 3.9539865132883776,
"grad_norm": 0.6384901300040913,
"learning_rate": 1.017747934169444e-05,
"loss": 0.3072,
"step": 1246
},
{
"epoch": 3.957159857199524,
"grad_norm": 0.08198945954442947,
"learning_rate": 1.0118451267373462e-05,
"loss": 0.3088,
"step": 1247
},
{
"epoch": 3.96033320111067,
"grad_norm": 0.08781474473700562,
"learning_rate": 1.0059570073155953e-05,
"loss": 0.3119,
"step": 1248
},
{
"epoch": 3.9635065450218168,
"grad_norm": 0.08246668180469044,
"learning_rate": 1.0000836048467221e-05,
"loss": 0.313,
"step": 1249
},
{
"epoch": 3.966679888932963,
"grad_norm": 0.08889236389023081,
"learning_rate": 9.942249482009117e-06,
"loss": 0.3148,
"step": 1250
},
{
"epoch": 3.9698532328441094,
"grad_norm": 0.08718334142633781,
"learning_rate": 9.88381066175876e-06,
"loss": 0.3098,
"step": 1251
},
{
"epoch": 3.973026576755256,
"grad_norm": 0.08701871231451917,
"learning_rate": 9.825519874966952e-06,
"loss": 0.3132,
"step": 1252
},
{
"epoch": 3.976199920666402,
"grad_norm": 0.08864822845899448,
"learning_rate": 9.767377408156906e-06,
"loss": 0.3141,
"step": 1253
},
{
"epoch": 3.9793732645775486,
"grad_norm": 0.08251988049787,
"learning_rate": 9.709383547122764e-06,
"loss": 0.3106,
"step": 1254
},
{
"epoch": 3.982546608488695,
"grad_norm": 0.0829769224836818,
"learning_rate": 9.651538576928213e-06,
"loss": 0.3147,
"step": 1255
},
{
"epoch": 3.9857199523998412,
"grad_norm": 0.08191823227244799,
"learning_rate": 9.59384278190505e-06,
"loss": 0.3068,
"step": 1256
},
{
"epoch": 3.9888932963109878,
"grad_norm": 0.08231464325661744,
"learning_rate": 9.53629644565186e-06,
"loss": 0.311,
"step": 1257
},
{
"epoch": 3.9920666402221343,
"grad_norm": 0.07952802959102342,
"learning_rate": 9.478899851032554e-06,
"loss": 0.3092,
"step": 1258
},
{
"epoch": 3.9952399841332804,
"grad_norm": 0.07741615193611691,
"learning_rate": 9.421653280175014e-06,
"loss": 0.3043,
"step": 1259
},
{
"epoch": 3.998413328044427,
"grad_norm": 0.0792129238358985,
"learning_rate": 9.364557014469651e-06,
"loss": 0.3106,
"step": 1260
},
{
"epoch": 4.0015866719555735,
"grad_norm": 0.2372544363090627,
"learning_rate": 9.307611334568137e-06,
"loss": 0.5642,
"step": 1261
},
{
"epoch": 4.00476001586672,
"grad_norm": 0.12928047394977374,
"learning_rate": 9.250816520381884e-06,
"loss": 0.2916,
"step": 1262
},
{
"epoch": 4.007933359777866,
"grad_norm": 0.1547813755040527,
"learning_rate": 9.194172851080778e-06,
"loss": 0.2904,
"step": 1263
},
{
"epoch": 4.011106703689013,
"grad_norm": 0.12678189534036333,
"learning_rate": 9.137680605091753e-06,
"loss": 0.2871,
"step": 1264
},
{
"epoch": 4.014280047600159,
"grad_norm": 0.13337837439164088,
"learning_rate": 9.081340060097443e-06,
"loss": 0.286,
"step": 1265
},
{
"epoch": 4.017453391511305,
"grad_norm": 0.1460976490959922,
"learning_rate": 9.025151493034779e-06,
"loss": 0.288,
"step": 1266
},
{
"epoch": 4.020626735422451,
"grad_norm": 0.11381826357536112,
"learning_rate": 8.969115180093699e-06,
"loss": 0.2894,
"step": 1267
},
{
"epoch": 4.023800079333598,
"grad_norm": 0.1292486747466223,
"learning_rate": 8.91323139671572e-06,
"loss": 0.2931,
"step": 1268
},
{
"epoch": 4.026973423244744,
"grad_norm": 0.1243312232469112,
"learning_rate": 8.857500417592648e-06,
"loss": 0.2908,
"step": 1269
},
{
"epoch": 4.03014676715589,
"grad_norm": 0.10431292634709807,
"learning_rate": 8.801922516665127e-06,
"loss": 0.285,
"step": 1270
},
{
"epoch": 4.033320111067037,
"grad_norm": 0.11511588958038933,
"learning_rate": 8.746497967121445e-06,
"loss": 0.288,
"step": 1271
},
{
"epoch": 4.036493454978183,
"grad_norm": 0.09400500609718773,
"learning_rate": 8.69122704139604e-06,
"loss": 0.2906,
"step": 1272
},
{
"epoch": 4.039666798889329,
"grad_norm": 0.10377314324554619,
"learning_rate": 8.636110011168246e-06,
"loss": 0.2847,
"step": 1273
},
{
"epoch": 4.042840142800476,
"grad_norm": 0.09999767041994467,
"learning_rate": 8.58114714736094e-06,
"loss": 0.2892,
"step": 1274
},
{
"epoch": 4.046013486711622,
"grad_norm": 0.0879076384755368,
"learning_rate": 8.526338720139225e-06,
"loss": 0.2866,
"step": 1275
},
{
"epoch": 4.0491868306227685,
"grad_norm": 0.10595158327559934,
"learning_rate": 8.471684998909033e-06,
"loss": 0.2856,
"step": 1276
},
{
"epoch": 4.0523601745339155,
"grad_norm": 0.09994029668981493,
"learning_rate": 8.417186252315904e-06,
"loss": 0.2862,
"step": 1277
},
{
"epoch": 4.055533518445062,
"grad_norm": 0.08748710664677468,
"learning_rate": 8.362842748243593e-06,
"loss": 0.2888,
"step": 1278
},
{
"epoch": 4.058706862356208,
"grad_norm": 0.1057862773429233,
"learning_rate": 8.308654753812785e-06,
"loss": 0.2922,
"step": 1279
},
{
"epoch": 4.061880206267354,
"grad_norm": 0.10805451904620589,
"learning_rate": 8.254622535379733e-06,
"loss": 0.2933,
"step": 1280
},
{
"epoch": 4.065053550178501,
"grad_norm": 0.08529498056433478,
"learning_rate": 8.200746358535054e-06,
"loss": 0.289,
"step": 1281
},
{
"epoch": 4.068226894089647,
"grad_norm": 0.10025625808348602,
"learning_rate": 8.147026488102288e-06,
"loss": 0.2861,
"step": 1282
},
{
"epoch": 4.071400238000793,
"grad_norm": 0.10466816524732167,
"learning_rate": 8.093463188136712e-06,
"loss": 0.2879,
"step": 1283
},
{
"epoch": 4.07457358191194,
"grad_norm": 0.09159217995048068,
"learning_rate": 8.04005672192397e-06,
"loss": 0.2911,
"step": 1284
},
{
"epoch": 4.077746925823086,
"grad_norm": 0.09573656877621275,
"learning_rate": 7.986807351978827e-06,
"loss": 0.2858,
"step": 1285
},
{
"epoch": 4.080920269734232,
"grad_norm": 0.09231500426704371,
"learning_rate": 7.933715340043822e-06,
"loss": 0.2906,
"step": 1286
},
{
"epoch": 4.084093613645379,
"grad_norm": 0.09110127087440445,
"learning_rate": 7.880780947088031e-06,
"loss": 0.2888,
"step": 1287
},
{
"epoch": 4.087266957556525,
"grad_norm": 0.09178186024677654,
"learning_rate": 7.82800443330578e-06,
"loss": 0.2845,
"step": 1288
},
{
"epoch": 4.090440301467671,
"grad_norm": 0.08699713547366167,
"learning_rate": 7.77538605811535e-06,
"loss": 0.2949,
"step": 1289
},
{
"epoch": 4.093613645378818,
"grad_norm": 0.08651443976613882,
"learning_rate": 7.722926080157673e-06,
"loss": 0.2915,
"step": 1290
},
{
"epoch": 4.096786989289964,
"grad_norm": 0.08839398111073136,
"learning_rate": 7.670624757295151e-06,
"loss": 0.2895,
"step": 1291
},
{
"epoch": 4.0999603332011105,
"grad_norm": 0.07840294963123258,
"learning_rate": 7.618482346610276e-06,
"loss": 0.2842,
"step": 1292
},
{
"epoch": 4.103133677112257,
"grad_norm": 0.08242092139423192,
"learning_rate": 7.566499104404452e-06,
"loss": 0.2848,
"step": 1293
},
{
"epoch": 4.106307021023404,
"grad_norm": 0.08497119568891674,
"learning_rate": 7.514675286196698e-06,
"loss": 0.2914,
"step": 1294
},
{
"epoch": 4.10948036493455,
"grad_norm": 0.0810225885617694,
"learning_rate": 7.46301114672241e-06,
"loss": 0.2872,
"step": 1295
},
{
"epoch": 4.112653708845696,
"grad_norm": 0.08718653499899484,
"learning_rate": 7.411506939932058e-06,
"loss": 0.2875,
"step": 1296
},
{
"epoch": 4.115827052756843,
"grad_norm": 0.08535338762467477,
"learning_rate": 7.360162918990021e-06,
"loss": 0.2846,
"step": 1297
},
{
"epoch": 4.119000396667989,
"grad_norm": 0.07685735750488655,
"learning_rate": 7.308979336273281e-06,
"loss": 0.2843,
"step": 1298
},
{
"epoch": 4.122173740579135,
"grad_norm": 0.08263564119903752,
"learning_rate": 7.257956443370209e-06,
"loss": 0.2905,
"step": 1299
},
{
"epoch": 4.125347084490282,
"grad_norm": 0.08566614638966435,
"learning_rate": 7.20709449107928e-06,
"loss": 0.289,
"step": 1300
},
{
"epoch": 4.128520428401428,
"grad_norm": 0.08073179020805406,
"learning_rate": 7.156393729407956e-06,
"loss": 0.285,
"step": 1301
},
{
"epoch": 4.131693772312574,
"grad_norm": 0.08607718507088445,
"learning_rate": 7.1058544075712995e-06,
"loss": 0.2958,
"step": 1302
},
{
"epoch": 4.134867116223721,
"grad_norm": 0.0773081314831222,
"learning_rate": 7.055476773990881e-06,
"loss": 0.2829,
"step": 1303
},
{
"epoch": 4.138040460134867,
"grad_norm": 0.08091752523907358,
"learning_rate": 7.0052610762934905e-06,
"loss": 0.2864,
"step": 1304
},
{
"epoch": 4.141213804046013,
"grad_norm": 0.08363422913570277,
"learning_rate": 6.955207561309944e-06,
"loss": 0.2949,
"step": 1305
},
{
"epoch": 4.1443871479571595,
"grad_norm": 0.07855930876619456,
"learning_rate": 6.905316475073842e-06,
"loss": 0.2921,
"step": 1306
},
{
"epoch": 4.147560491868306,
"grad_norm": 0.07964756493675755,
"learning_rate": 6.855588062820407e-06,
"loss": 0.2895,
"step": 1307
},
{
"epoch": 4.1507338357794525,
"grad_norm": 0.0788278091150506,
"learning_rate": 6.806022568985233e-06,
"loss": 0.2864,
"step": 1308
},
{
"epoch": 4.153907179690599,
"grad_norm": 0.08049149360175006,
"learning_rate": 6.756620237203124e-06,
"loss": 0.2825,
"step": 1309
},
{
"epoch": 4.157080523601746,
"grad_norm": 0.080868151969321,
"learning_rate": 6.707381310306833e-06,
"loss": 0.2892,
"step": 1310
},
{
"epoch": 4.160253867512892,
"grad_norm": 0.07798552901381123,
"learning_rate": 6.658306030325978e-06,
"loss": 0.2865,
"step": 1311
},
{
"epoch": 4.163427211424038,
"grad_norm": 0.0805335375512832,
"learning_rate": 6.60939463848572e-06,
"loss": 0.2911,
"step": 1312
},
{
"epoch": 4.166600555335185,
"grad_norm": 0.07518550741681629,
"learning_rate": 6.560647375205676e-06,
"loss": 0.2882,
"step": 1313
},
{
"epoch": 4.169773899246331,
"grad_norm": 0.0781854434454258,
"learning_rate": 6.512064480098694e-06,
"loss": 0.2891,
"step": 1314
},
{
"epoch": 4.172947243157477,
"grad_norm": 0.07987762578525559,
"learning_rate": 6.4636461919697034e-06,
"loss": 0.2856,
"step": 1315
},
{
"epoch": 4.176120587068624,
"grad_norm": 0.07606785506312735,
"learning_rate": 6.4153927488144865e-06,
"loss": 0.2845,
"step": 1316
},
{
"epoch": 4.17929393097977,
"grad_norm": 0.07637479917962597,
"learning_rate": 6.367304387818567e-06,
"loss": 0.277,
"step": 1317
},
{
"epoch": 4.182467274890916,
"grad_norm": 0.08032228596283143,
"learning_rate": 6.31938134535603e-06,
"loss": 0.2869,
"step": 1318
},
{
"epoch": 4.185640618802062,
"grad_norm": 0.07742967651427472,
"learning_rate": 6.271623856988336e-06,
"loss": 0.2841,
"step": 1319
},
{
"epoch": 4.188813962713209,
"grad_norm": 0.0808988691015848,
"learning_rate": 6.224032157463184e-06,
"loss": 0.2906,
"step": 1320
},
{
"epoch": 4.191987306624355,
"grad_norm": 0.07749448537178391,
"learning_rate": 6.176606480713365e-06,
"loss": 0.2898,
"step": 1321
},
{
"epoch": 4.1951606505355015,
"grad_norm": 0.07984359673939721,
"learning_rate": 6.129347059855572e-06,
"loss": 0.2883,
"step": 1322
},
{
"epoch": 4.1983339944466485,
"grad_norm": 0.07874063845144524,
"learning_rate": 6.082254127189302e-06,
"loss": 0.2863,
"step": 1323
},
{
"epoch": 4.2015073383577946,
"grad_norm": 0.07786902211545077,
"learning_rate": 6.035327914195694e-06,
"loss": 0.2911,
"step": 1324
},
{
"epoch": 4.204680682268941,
"grad_norm": 0.08187795024389079,
"learning_rate": 5.988568651536399e-06,
"loss": 0.2928,
"step": 1325
},
{
"epoch": 4.207854026180088,
"grad_norm": 0.07811082790053991,
"learning_rate": 5.941976569052394e-06,
"loss": 0.2855,
"step": 1326
},
{
"epoch": 4.211027370091234,
"grad_norm": 0.0781331548320789,
"learning_rate": 5.895551895762968e-06,
"loss": 0.2873,
"step": 1327
},
{
"epoch": 4.21420071400238,
"grad_norm": 0.08057455261417476,
"learning_rate": 5.849294859864456e-06,
"loss": 0.2862,
"step": 1328
},
{
"epoch": 4.217374057913526,
"grad_norm": 0.08046289285253863,
"learning_rate": 5.8032056887292345e-06,
"loss": 0.2926,
"step": 1329
},
{
"epoch": 4.220547401824673,
"grad_norm": 0.07901814464604347,
"learning_rate": 5.757284608904528e-06,
"loss": 0.2927,
"step": 1330
},
{
"epoch": 4.223720745735819,
"grad_norm": 0.08241125484928784,
"learning_rate": 5.711531846111351e-06,
"loss": 0.2919,
"step": 1331
},
{
"epoch": 4.226894089646965,
"grad_norm": 0.07943725546881736,
"learning_rate": 5.6659476252433285e-06,
"loss": 0.2858,
"step": 1332
},
{
"epoch": 4.230067433558112,
"grad_norm": 0.07690103398901868,
"learning_rate": 5.620532170365667e-06,
"loss": 0.288,
"step": 1333
},
{
"epoch": 4.233240777469258,
"grad_norm": 0.07982460591594524,
"learning_rate": 5.5752857047140086e-06,
"loss": 0.292,
"step": 1334
},
{
"epoch": 4.236414121380404,
"grad_norm": 0.07809749969035885,
"learning_rate": 5.530208450693355e-06,
"loss": 0.2895,
"step": 1335
},
{
"epoch": 4.239587465291551,
"grad_norm": 0.07723500752452818,
"learning_rate": 5.48530062987692e-06,
"loss": 0.2924,
"step": 1336
},
{
"epoch": 4.242760809202697,
"grad_norm": 0.07756528859698934,
"learning_rate": 5.440562463005154e-06,
"loss": 0.2923,
"step": 1337
},
{
"epoch": 4.2459341531138435,
"grad_norm": 0.07872620120384079,
"learning_rate": 5.395994169984522e-06,
"loss": 0.2858,
"step": 1338
},
{
"epoch": 4.2491074970249905,
"grad_norm": 0.07841592614920617,
"learning_rate": 5.351595969886529e-06,
"loss": 0.2948,
"step": 1339
},
{
"epoch": 4.252280840936137,
"grad_norm": 0.07403115107349581,
"learning_rate": 5.307368080946584e-06,
"loss": 0.2853,
"step": 1340
},
{
"epoch": 4.255454184847283,
"grad_norm": 0.07885594656165801,
"learning_rate": 5.263310720562973e-06,
"loss": 0.2929,
"step": 1341
},
{
"epoch": 4.25862752875843,
"grad_norm": 0.07590630094007132,
"learning_rate": 5.219424105295719e-06,
"loss": 0.2859,
"step": 1342
},
{
"epoch": 4.261800872669576,
"grad_norm": 0.07840740302270648,
"learning_rate": 5.175708450865595e-06,
"loss": 0.2858,
"step": 1343
},
{
"epoch": 4.264974216580722,
"grad_norm": 0.07673578995611278,
"learning_rate": 5.1321639721530325e-06,
"loss": 0.2889,
"step": 1344
},
{
"epoch": 4.268147560491868,
"grad_norm": 0.07951575168272773,
"learning_rate": 5.088790883197061e-06,
"loss": 0.2896,
"step": 1345
},
{
"epoch": 4.271320904403015,
"grad_norm": 0.08030218736730213,
"learning_rate": 5.045589397194231e-06,
"loss": 0.2865,
"step": 1346
},
{
"epoch": 4.274494248314161,
"grad_norm": 0.07842325630849278,
"learning_rate": 5.0025597264976446e-06,
"loss": 0.2858,
"step": 1347
},
{
"epoch": 4.277667592225307,
"grad_norm": 0.07773496721087116,
"learning_rate": 4.9597020826158114e-06,
"loss": 0.2901,
"step": 1348
},
{
"epoch": 4.280840936136454,
"grad_norm": 0.07839435165688115,
"learning_rate": 4.917016676211686e-06,
"loss": 0.2952,
"step": 1349
},
{
"epoch": 4.2840142800476,
"grad_norm": 0.08007319222215062,
"learning_rate": 4.8745037171016045e-06,
"loss": 0.2895,
"step": 1350
},
{
"epoch": 4.287187623958746,
"grad_norm": 0.0802943162822994,
"learning_rate": 4.832163414254254e-06,
"loss": 0.2854,
"step": 1351
},
{
"epoch": 4.290360967869893,
"grad_norm": 0.07670078924225301,
"learning_rate": 4.789995975789619e-06,
"loss": 0.2866,
"step": 1352
},
{
"epoch": 4.293534311781039,
"grad_norm": 0.07571900940799264,
"learning_rate": 4.748001608978015e-06,
"loss": 0.2864,
"step": 1353
},
{
"epoch": 4.2967076556921855,
"grad_norm": 0.07887124085195305,
"learning_rate": 4.70618052023903e-06,
"loss": 0.2861,
"step": 1354
},
{
"epoch": 4.299880999603332,
"grad_norm": 0.07844552010826854,
"learning_rate": 4.664532915140525e-06,
"loss": 0.2861,
"step": 1355
},
{
"epoch": 4.303054343514479,
"grad_norm": 0.0758283591701092,
"learning_rate": 4.623058998397585e-06,
"loss": 0.2827,
"step": 1356
},
{
"epoch": 4.306227687425625,
"grad_norm": 0.07567360663444131,
"learning_rate": 4.581758973871609e-06,
"loss": 0.2852,
"step": 1357
},
{
"epoch": 4.309401031336771,
"grad_norm": 0.07698958638459642,
"learning_rate": 4.540633044569172e-06,
"loss": 0.2838,
"step": 1358
},
{
"epoch": 4.312574375247918,
"grad_norm": 0.07772937829274414,
"learning_rate": 4.499681412641148e-06,
"loss": 0.2895,
"step": 1359
},
{
"epoch": 4.315747719159064,
"grad_norm": 0.07637398650887665,
"learning_rate": 4.4589042793816525e-06,
"loss": 0.2921,
"step": 1360
},
{
"epoch": 4.31892106307021,
"grad_norm": 0.07633260293292246,
"learning_rate": 4.418301845227073e-06,
"loss": 0.2829,
"step": 1361
},
{
"epoch": 4.322094406981357,
"grad_norm": 0.07860481054083629,
"learning_rate": 4.377874309755065e-06,
"loss": 0.2866,
"step": 1362
},
{
"epoch": 4.325267750892503,
"grad_norm": 0.0762763983665852,
"learning_rate": 4.337621871683597e-06,
"loss": 0.2833,
"step": 1363
},
{
"epoch": 4.328441094803649,
"grad_norm": 0.0766144117665982,
"learning_rate": 4.297544728869958e-06,
"loss": 0.2901,
"step": 1364
},
{
"epoch": 4.331614438714796,
"grad_norm": 0.07937899829074807,
"learning_rate": 4.257643078309808e-06,
"loss": 0.2924,
"step": 1365
},
{
"epoch": 4.334787782625942,
"grad_norm": 0.07527289810281566,
"learning_rate": 4.2179171161361365e-06,
"loss": 0.2848,
"step": 1366
},
{
"epoch": 4.337961126537088,
"grad_norm": 0.07688154141825307,
"learning_rate": 4.178367037618429e-06,
"loss": 0.2895,
"step": 1367
},
{
"epoch": 4.341134470448234,
"grad_norm": 0.07790856052535607,
"learning_rate": 4.138993037161565e-06,
"loss": 0.293,
"step": 1368
},
{
"epoch": 4.344307814359381,
"grad_norm": 0.0782212752825844,
"learning_rate": 4.099795308304954e-06,
"loss": 0.2873,
"step": 1369
},
{
"epoch": 4.3474811582705275,
"grad_norm": 0.07864542088050423,
"learning_rate": 4.060774043721565e-06,
"loss": 0.2895,
"step": 1370
},
{
"epoch": 4.350654502181674,
"grad_norm": 0.07539046652733071,
"learning_rate": 4.0219294352169714e-06,
"loss": 0.2928,
"step": 1371
},
{
"epoch": 4.353827846092821,
"grad_norm": 0.07802077980079632,
"learning_rate": 3.983261673728378e-06,
"loss": 0.2907,
"step": 1372
},
{
"epoch": 4.357001190003967,
"grad_norm": 0.07914754106412068,
"learning_rate": 3.94477094932376e-06,
"loss": 0.2897,
"step": 1373
},
{
"epoch": 4.360174533915113,
"grad_norm": 0.07815313150549123,
"learning_rate": 3.906457451200845e-06,
"loss": 0.2823,
"step": 1374
},
{
"epoch": 4.36334787782626,
"grad_norm": 0.0783643686014188,
"learning_rate": 3.8683213676862585e-06,
"loss": 0.2902,
"step": 1375
},
{
"epoch": 4.366521221737406,
"grad_norm": 0.07899805915070553,
"learning_rate": 3.830362886234502e-06,
"loss": 0.2915,
"step": 1376
},
{
"epoch": 4.369694565648552,
"grad_norm": 0.07616018845948855,
"learning_rate": 3.7925821934271655e-06,
"loss": 0.2853,
"step": 1377
},
{
"epoch": 4.372867909559698,
"grad_norm": 0.07604900073546127,
"learning_rate": 3.7549794749718673e-06,
"loss": 0.2918,
"step": 1378
},
{
"epoch": 4.376041253470845,
"grad_norm": 0.07611014329101679,
"learning_rate": 3.717554915701449e-06,
"loss": 0.2833,
"step": 1379
},
{
"epoch": 4.379214597381991,
"grad_norm": 0.07836395731025556,
"learning_rate": 3.680308699573005e-06,
"loss": 0.2879,
"step": 1380
},
{
"epoch": 4.382387941293137,
"grad_norm": 0.07492430726816589,
"learning_rate": 3.64324100966702e-06,
"loss": 0.2806,
"step": 1381
},
{
"epoch": 4.385561285204284,
"grad_norm": 0.07741032733958537,
"learning_rate": 3.606352028186426e-06,
"loss": 0.2897,
"step": 1382
},
{
"epoch": 4.38873462911543,
"grad_norm": 0.07451962544381002,
"learning_rate": 3.5696419364557433e-06,
"loss": 0.2893,
"step": 1383
},
{
"epoch": 4.391907973026576,
"grad_norm": 0.08019767472904286,
"learning_rate": 3.533110914920177e-06,
"loss": 0.2884,
"step": 1384
},
{
"epoch": 4.395081316937723,
"grad_norm": 0.07731877454550821,
"learning_rate": 3.4967591431447256e-06,
"loss": 0.2902,
"step": 1385
},
{
"epoch": 4.3982546608488695,
"grad_norm": 0.08113895307948948,
"learning_rate": 3.460586799813288e-06,
"loss": 0.2906,
"step": 1386
},
{
"epoch": 4.401428004760016,
"grad_norm": 0.07677456138444168,
"learning_rate": 3.4245940627278284e-06,
"loss": 0.2902,
"step": 1387
},
{
"epoch": 4.404601348671163,
"grad_norm": 0.07648163374651913,
"learning_rate": 3.388781108807444e-06,
"loss": 0.2879,
"step": 1388
},
{
"epoch": 4.407774692582309,
"grad_norm": 0.0758938270041518,
"learning_rate": 3.3531481140875345e-06,
"loss": 0.2934,
"step": 1389
},
{
"epoch": 4.410948036493455,
"grad_norm": 0.07252099034254757,
"learning_rate": 3.317695253718931e-06,
"loss": 0.2855,
"step": 1390
},
{
"epoch": 4.414121380404602,
"grad_norm": 0.08007509732355841,
"learning_rate": 3.2824227019670272e-06,
"loss": 0.29,
"step": 1391
},
{
"epoch": 4.417294724315748,
"grad_norm": 0.07359637512325612,
"learning_rate": 3.2473306322109078e-06,
"loss": 0.2846,
"step": 1392
},
{
"epoch": 4.420468068226894,
"grad_norm": 0.07691247352169074,
"learning_rate": 3.2124192169425573e-06,
"loss": 0.2938,
"step": 1393
},
{
"epoch": 4.42364141213804,
"grad_norm": 0.07370622338875793,
"learning_rate": 3.177688627765916e-06,
"loss": 0.2838,
"step": 1394
},
{
"epoch": 4.426814756049187,
"grad_norm": 0.0756129323864927,
"learning_rate": 3.143139035396128e-06,
"loss": 0.292,
"step": 1395
},
{
"epoch": 4.429988099960333,
"grad_norm": 0.0775966898230028,
"learning_rate": 3.1087706096586535e-06,
"loss": 0.2929,
"step": 1396
},
{
"epoch": 4.433161443871479,
"grad_norm": 0.07808391129944373,
"learning_rate": 3.0745835194884435e-06,
"loss": 0.2897,
"step": 1397
},
{
"epoch": 4.436334787782626,
"grad_norm": 0.07349160355298512,
"learning_rate": 3.0405779329290987e-06,
"loss": 0.2821,
"step": 1398
},
{
"epoch": 4.439508131693772,
"grad_norm": 0.07471990105942067,
"learning_rate": 3.0067540171320674e-06,
"loss": 0.2868,
"step": 1399
},
{
"epoch": 4.442681475604918,
"grad_norm": 0.07304458565439569,
"learning_rate": 2.973111938355815e-06,
"loss": 0.289,
"step": 1400
},
{
"epoch": 4.445854819516065,
"grad_norm": 0.07542736354862137,
"learning_rate": 2.9396518619650007e-06,
"loss": 0.2837,
"step": 1401
},
{
"epoch": 4.4490281634272115,
"grad_norm": 0.07580500670249878,
"learning_rate": 2.90637395242964e-06,
"loss": 0.2886,
"step": 1402
},
{
"epoch": 4.452201507338358,
"grad_norm": 0.07483645701648972,
"learning_rate": 2.8732783733243754e-06,
"loss": 0.2883,
"step": 1403
},
{
"epoch": 4.455374851249504,
"grad_norm": 0.0734355618152173,
"learning_rate": 2.8403652873275755e-06,
"loss": 0.2896,
"step": 1404
},
{
"epoch": 4.458548195160651,
"grad_norm": 0.07743307777656919,
"learning_rate": 2.8076348562206024e-06,
"loss": 0.2945,
"step": 1405
},
{
"epoch": 4.461721539071797,
"grad_norm": 0.07285494607732518,
"learning_rate": 2.7750872408869843e-06,
"loss": 0.2823,
"step": 1406
},
{
"epoch": 4.464894882982943,
"grad_norm": 0.07654578475317778,
"learning_rate": 2.7427226013116448e-06,
"loss": 0.2893,
"step": 1407
},
{
"epoch": 4.46806822689409,
"grad_norm": 0.07415127461281228,
"learning_rate": 2.7105410965800928e-06,
"loss": 0.2881,
"step": 1408
},
{
"epoch": 4.471241570805236,
"grad_norm": 0.07473740527863117,
"learning_rate": 2.678542884877664e-06,
"loss": 0.2861,
"step": 1409
},
{
"epoch": 4.474414914716382,
"grad_norm": 0.07455517075536294,
"learning_rate": 2.646728123488731e-06,
"loss": 0.2868,
"step": 1410
},
{
"epoch": 4.477588258627529,
"grad_norm": 0.07402864039008861,
"learning_rate": 2.6150969687959426e-06,
"loss": 0.2891,
"step": 1411
},
{
"epoch": 4.480761602538675,
"grad_norm": 0.07506645131586324,
"learning_rate": 2.5836495762794078e-06,
"loss": 0.2875,
"step": 1412
},
{
"epoch": 4.483934946449821,
"grad_norm": 0.07687328694069057,
"learning_rate": 2.552386100516033e-06,
"loss": 0.2889,
"step": 1413
},
{
"epoch": 4.487108290360968,
"grad_norm": 0.07292287204353926,
"learning_rate": 2.521306695178636e-06,
"loss": 0.2862,
"step": 1414
},
{
"epoch": 4.490281634272114,
"grad_norm": 0.07378533131418447,
"learning_rate": 2.4904115130352845e-06,
"loss": 0.2858,
"step": 1415
},
{
"epoch": 4.4934549781832605,
"grad_norm": 0.07434204604179016,
"learning_rate": 2.459700705948507e-06,
"loss": 0.2855,
"step": 1416
},
{
"epoch": 4.496628322094407,
"grad_norm": 0.07431002205125276,
"learning_rate": 2.429174424874563e-06,
"loss": 0.2816,
"step": 1417
},
{
"epoch": 4.4998016660055535,
"grad_norm": 0.07821248980927963,
"learning_rate": 2.398832819862662e-06,
"loss": 0.2884,
"step": 1418
},
{
"epoch": 4.5029750099167,
"grad_norm": 0.07520177957755667,
"learning_rate": 2.3686760400542853e-06,
"loss": 0.2912,
"step": 1419
},
{
"epoch": 4.506148353827846,
"grad_norm": 0.07186434245067173,
"learning_rate": 2.3387042336824097e-06,
"loss": 0.287,
"step": 1420
},
{
"epoch": 4.509321697738993,
"grad_norm": 0.07671516440501819,
"learning_rate": 2.3089175480707926e-06,
"loss": 0.2904,
"step": 1421
},
{
"epoch": 4.512495041650139,
"grad_norm": 0.07403348893774686,
"learning_rate": 2.2793161296332354e-06,
"loss": 0.2848,
"step": 1422
},
{
"epoch": 4.515668385561285,
"grad_norm": 0.07485509243857365,
"learning_rate": 2.2499001238729034e-06,
"loss": 0.2889,
"step": 1423
},
{
"epoch": 4.518841729472432,
"grad_norm": 0.07341240180495677,
"learning_rate": 2.2206696753815527e-06,
"loss": 0.2876,
"step": 1424
},
{
"epoch": 4.522015073383578,
"grad_norm": 0.07545046460513817,
"learning_rate": 2.191624927838865e-06,
"loss": 0.2803,
"step": 1425
},
{
"epoch": 4.525188417294724,
"grad_norm": 0.0729065721268795,
"learning_rate": 2.1627660240117177e-06,
"loss": 0.2907,
"step": 1426
},
{
"epoch": 4.52836176120587,
"grad_norm": 0.0739827535708724,
"learning_rate": 2.134093105753503e-06,
"loss": 0.2875,
"step": 1427
},
{
"epoch": 4.531535105117017,
"grad_norm": 0.0737465566063914,
"learning_rate": 2.1056063140034013e-06,
"loss": 0.2886,
"step": 1428
},
{
"epoch": 4.534708449028163,
"grad_norm": 0.07481103353206092,
"learning_rate": 2.0773057887857105e-06,
"loss": 0.2859,
"step": 1429
},
{
"epoch": 4.537881792939309,
"grad_norm": 0.07443664446659384,
"learning_rate": 2.049191669209156e-06,
"loss": 0.2877,
"step": 1430
},
{
"epoch": 4.541055136850456,
"grad_norm": 0.07271868744467536,
"learning_rate": 2.0212640934662087e-06,
"loss": 0.2808,
"step": 1431
},
{
"epoch": 4.5442284807616025,
"grad_norm": 0.0731348085785222,
"learning_rate": 1.99352319883237e-06,
"loss": 0.2859,
"step": 1432
},
{
"epoch": 4.547401824672749,
"grad_norm": 0.0728659918076379,
"learning_rate": 1.9659691216655697e-06,
"loss": 0.2817,
"step": 1433
},
{
"epoch": 4.5505751685838955,
"grad_norm": 0.07052892858321479,
"learning_rate": 1.9386019974054182e-06,
"loss": 0.2845,
"step": 1434
},
{
"epoch": 4.553748512495042,
"grad_norm": 0.07162865810925105,
"learning_rate": 1.9114219605725905e-06,
"loss": 0.2852,
"step": 1435
},
{
"epoch": 4.556921856406188,
"grad_norm": 0.07216631582905737,
"learning_rate": 1.8844291447681496e-06,
"loss": 0.2893,
"step": 1436
},
{
"epoch": 4.560095200317335,
"grad_norm": 0.07454657617595882,
"learning_rate": 1.857623682672891e-06,
"loss": 0.2884,
"step": 1437
},
{
"epoch": 4.563268544228481,
"grad_norm": 0.07192684325273382,
"learning_rate": 1.8310057060466845e-06,
"loss": 0.2921,
"step": 1438
},
{
"epoch": 4.566441888139627,
"grad_norm": 0.07346684054783836,
"learning_rate": 1.8045753457278303e-06,
"loss": 0.287,
"step": 1439
},
{
"epoch": 4.569615232050774,
"grad_norm": 0.07368537492116049,
"learning_rate": 1.7783327316324238e-06,
"loss": 0.2874,
"step": 1440
},
{
"epoch": 4.57278857596192,
"grad_norm": 0.07300900030526943,
"learning_rate": 1.752277992753717e-06,
"loss": 0.2827,
"step": 1441
},
{
"epoch": 4.575961919873066,
"grad_norm": 0.0735865281943053,
"learning_rate": 1.726411257161451e-06,
"loss": 0.2902,
"step": 1442
},
{
"epoch": 4.579135263784213,
"grad_norm": 0.07218495117782045,
"learning_rate": 1.700732652001289e-06,
"loss": 0.2878,
"step": 1443
},
{
"epoch": 4.582308607695359,
"grad_norm": 0.07237820497971356,
"learning_rate": 1.6752423034941223e-06,
"loss": 0.2914,
"step": 1444
},
{
"epoch": 4.585481951606505,
"grad_norm": 0.07169874424676832,
"learning_rate": 1.6499403369355115e-06,
"loss": 0.2906,
"step": 1445
},
{
"epoch": 4.588655295517651,
"grad_norm": 0.07237942647540188,
"learning_rate": 1.6248268766950204e-06,
"loss": 0.2871,
"step": 1446
},
{
"epoch": 4.591828639428798,
"grad_norm": 0.07206522725627988,
"learning_rate": 1.5999020462156511e-06,
"loss": 0.2872,
"step": 1447
},
{
"epoch": 4.5950019833399445,
"grad_norm": 0.07232160129557513,
"learning_rate": 1.5751659680131792e-06,
"loss": 0.2859,
"step": 1448
},
{
"epoch": 4.598175327251091,
"grad_norm": 0.07224139682967294,
"learning_rate": 1.5506187636756109e-06,
"loss": 0.2899,
"step": 1449
},
{
"epoch": 4.601348671162238,
"grad_norm": 0.07350195322568023,
"learning_rate": 1.5262605538625574e-06,
"loss": 0.2888,
"step": 1450
},
{
"epoch": 4.604522015073384,
"grad_norm": 0.0724134729249542,
"learning_rate": 1.5020914583046398e-06,
"loss": 0.2872,
"step": 1451
},
{
"epoch": 4.60769535898453,
"grad_norm": 0.07146812102253265,
"learning_rate": 1.4781115958028935e-06,
"loss": 0.2863,
"step": 1452
},
{
"epoch": 4.610868702895676,
"grad_norm": 0.07365565598868938,
"learning_rate": 1.4543210842282229e-06,
"loss": 0.2891,
"step": 1453
},
{
"epoch": 4.614042046806823,
"grad_norm": 0.07266133951722602,
"learning_rate": 1.4307200405207656e-06,
"loss": 0.2875,
"step": 1454
},
{
"epoch": 4.617215390717969,
"grad_norm": 0.07255421353863033,
"learning_rate": 1.4073085806893593e-06,
"loss": 0.2933,
"step": 1455
},
{
"epoch": 4.620388734629115,
"grad_norm": 0.0738467244304907,
"learning_rate": 1.3840868198109658e-06,
"loss": 0.2853,
"step": 1456
},
{
"epoch": 4.623562078540262,
"grad_norm": 0.07193567501371838,
"learning_rate": 1.3610548720300965e-06,
"loss": 0.2846,
"step": 1457
},
{
"epoch": 4.626735422451408,
"grad_norm": 0.07215628995818985,
"learning_rate": 1.3382128505582315e-06,
"loss": 0.2907,
"step": 1458
},
{
"epoch": 4.629908766362554,
"grad_norm": 0.07250278706923594,
"learning_rate": 1.315560867673318e-06,
"loss": 0.2892,
"step": 1459
},
{
"epoch": 4.633082110273701,
"grad_norm": 0.07078718377132749,
"learning_rate": 1.2930990347191607e-06,
"loss": 0.2834,
"step": 1460
},
{
"epoch": 4.636255454184847,
"grad_norm": 0.07349937695742143,
"learning_rate": 1.2708274621049134e-06,
"loss": 0.2861,
"step": 1461
},
{
"epoch": 4.639428798095993,
"grad_norm": 0.07204842558684392,
"learning_rate": 1.2487462593045075e-06,
"loss": 0.2904,
"step": 1462
},
{
"epoch": 4.64260214200714,
"grad_norm": 0.0721734105457553,
"learning_rate": 1.2268555348561529e-06,
"loss": 0.2835,
"step": 1463
},
{
"epoch": 4.6457754859182865,
"grad_norm": 0.0720032080278003,
"learning_rate": 1.20515539636175e-06,
"loss": 0.2872,
"step": 1464
},
{
"epoch": 4.648948829829433,
"grad_norm": 0.07173144513365656,
"learning_rate": 1.183645950486416e-06,
"loss": 0.2903,
"step": 1465
},
{
"epoch": 4.65212217374058,
"grad_norm": 0.07173493810236914,
"learning_rate": 1.1623273029579195e-06,
"loss": 0.293,
"step": 1466
},
{
"epoch": 4.655295517651726,
"grad_norm": 0.07082496375705209,
"learning_rate": 1.141199558566184e-06,
"loss": 0.286,
"step": 1467
},
{
"epoch": 4.658468861562872,
"grad_norm": 0.07192243350970179,
"learning_rate": 1.1202628211627587e-06,
"loss": 0.2865,
"step": 1468
},
{
"epoch": 4.661642205474018,
"grad_norm": 0.07242387413423897,
"learning_rate": 1.099517193660331e-06,
"loss": 0.2934,
"step": 1469
},
{
"epoch": 4.664815549385165,
"grad_norm": 0.0733740588484503,
"learning_rate": 1.0789627780321888e-06,
"loss": 0.2905,
"step": 1470
},
{
"epoch": 4.667988893296311,
"grad_norm": 0.07206002165507971,
"learning_rate": 1.058599675311731e-06,
"loss": 0.287,
"step": 1471
},
{
"epoch": 4.671162237207457,
"grad_norm": 0.07127887265831032,
"learning_rate": 1.0384279855919944e-06,
"loss": 0.2896,
"step": 1472
},
{
"epoch": 4.674335581118604,
"grad_norm": 0.07358907596433474,
"learning_rate": 1.0184478080251315e-06,
"loss": 0.2879,
"step": 1473
},
{
"epoch": 4.67750892502975,
"grad_norm": 0.0714280248007389,
"learning_rate": 9.986592408219286e-07,
"loss": 0.2868,
"step": 1474
},
{
"epoch": 4.680682268940896,
"grad_norm": 0.0711643498792862,
"learning_rate": 9.7906238125133e-07,
"loss": 0.2877,
"step": 1475
},
{
"epoch": 4.683855612852043,
"grad_norm": 0.07071027647717443,
"learning_rate": 9.59657325639971e-07,
"loss": 0.2875,
"step": 1476
},
{
"epoch": 4.687028956763189,
"grad_norm": 0.07098768877099393,
"learning_rate": 9.404441693716771e-07,
"loss": 0.2889,
"step": 1477
},
{
"epoch": 4.690202300674335,
"grad_norm": 0.0713859105686287,
"learning_rate": 9.2142300688701e-07,
"loss": 0.292,
"step": 1478
},
{
"epoch": 4.6933756445854815,
"grad_norm": 0.07126866567296886,
"learning_rate": 9.025939316828203e-07,
"loss": 0.2846,
"step": 1479
},
{
"epoch": 4.6965489884966285,
"grad_norm": 0.0723870642231539,
"learning_rate": 8.839570363117445e-07,
"loss": 0.288,
"step": 1480
},
{
"epoch": 4.699722332407775,
"grad_norm": 0.07141709931411543,
"learning_rate": 8.655124123817926e-07,
"loss": 0.2851,
"step": 1481
},
{
"epoch": 4.702895676318921,
"grad_norm": 0.07173481548872705,
"learning_rate": 8.47260150555882e-07,
"loss": 0.2869,
"step": 1482
},
{
"epoch": 4.706069020230068,
"grad_norm": 0.07243043839490769,
"learning_rate": 8.292003405513882e-07,
"loss": 0.2863,
"step": 1483
},
{
"epoch": 4.709242364141214,
"grad_norm": 0.07029557557454981,
"learning_rate": 8.113330711397016e-07,
"loss": 0.2873,
"step": 1484
},
{
"epoch": 4.71241570805236,
"grad_norm": 0.0704646381143378,
"learning_rate": 7.936584301458006e-07,
"loss": 0.2889,
"step": 1485
},
{
"epoch": 4.715589051963507,
"grad_norm": 0.07189911994697401,
"learning_rate": 7.761765044478209e-07,
"loss": 0.2844,
"step": 1486
},
{
"epoch": 4.718762395874653,
"grad_norm": 0.07145859830072916,
"learning_rate": 7.588873799766161e-07,
"loss": 0.29,
"step": 1487
},
{
"epoch": 4.721935739785799,
"grad_norm": 0.0711567541075634,
"learning_rate": 7.417911417153401e-07,
"loss": 0.2919,
"step": 1488
},
{
"epoch": 4.725109083696946,
"grad_norm": 0.07204569153095995,
"learning_rate": 7.248878736990428e-07,
"loss": 0.2877,
"step": 1489
},
{
"epoch": 4.728282427608092,
"grad_norm": 0.07125020942303784,
"learning_rate": 7.081776590142352e-07,
"loss": 0.2858,
"step": 1490
},
{
"epoch": 4.731455771519238,
"grad_norm": 0.06994584906572929,
"learning_rate": 6.916605797984987e-07,
"loss": 0.2807,
"step": 1491
},
{
"epoch": 4.734629115430385,
"grad_norm": 0.0715883706850393,
"learning_rate": 6.753367172400716e-07,
"loss": 0.2859,
"step": 1492
},
{
"epoch": 4.737802459341531,
"grad_norm": 0.07097376897268061,
"learning_rate": 6.59206151577454e-07,
"loss": 0.2831,
"step": 1493
},
{
"epoch": 4.740975803252677,
"grad_norm": 0.07016656413072693,
"learning_rate": 6.432689620990084e-07,
"loss": 0.2814,
"step": 1494
},
{
"epoch": 4.7441491471638235,
"grad_norm": 0.07157617127762622,
"learning_rate": 6.275252271425736e-07,
"loss": 0.2838,
"step": 1495
},
{
"epoch": 4.7473224910749705,
"grad_norm": 0.07273178665389977,
"learning_rate": 6.119750240950906e-07,
"loss": 0.2892,
"step": 1496
},
{
"epoch": 4.750495834986117,
"grad_norm": 0.07183066506951129,
"learning_rate": 5.96618429392204e-07,
"loss": 0.2838,
"step": 1497
},
{
"epoch": 4.753669178897263,
"grad_norm": 0.07085345548841952,
"learning_rate": 5.814555185178838e-07,
"loss": 0.2924,
"step": 1498
},
{
"epoch": 4.75684252280841,
"grad_norm": 0.071668882520716,
"learning_rate": 5.664863660040843e-07,
"loss": 0.2882,
"step": 1499
},
{
"epoch": 4.760015866719556,
"grad_norm": 0.07232430109380229,
"learning_rate": 5.517110454303387e-07,
"loss": 0.2922,
"step": 1500
},
{
"epoch": 4.763189210630702,
"grad_norm": 0.07157437165230283,
"learning_rate": 5.371296294234318e-07,
"loss": 0.2818,
"step": 1501
},
{
"epoch": 4.766362554541848,
"grad_norm": 0.07138983987689239,
"learning_rate": 5.227421896570217e-07,
"loss": 0.2896,
"step": 1502
},
{
"epoch": 4.769535898452995,
"grad_norm": 0.0728983951265643,
"learning_rate": 5.085487968512892e-07,
"loss": 0.2837,
"step": 1503
},
{
"epoch": 4.772709242364141,
"grad_norm": 0.07175621867152891,
"learning_rate": 4.945495207725958e-07,
"loss": 0.2838,
"step": 1504
},
{
"epoch": 4.775882586275287,
"grad_norm": 0.07008101989246392,
"learning_rate": 4.807444302331509e-07,
"loss": 0.2891,
"step": 1505
},
{
"epoch": 4.779055930186434,
"grad_norm": 0.07114350131264044,
"learning_rate": 4.671335930906429e-07,
"loss": 0.2884,
"step": 1506
},
{
"epoch": 4.78222927409758,
"grad_norm": 0.0712547000013449,
"learning_rate": 4.537170762479459e-07,
"loss": 0.2865,
"step": 1507
},
{
"epoch": 4.785402618008726,
"grad_norm": 0.07044621429669759,
"learning_rate": 4.404949456527474e-07,
"loss": 0.2803,
"step": 1508
},
{
"epoch": 4.788575961919873,
"grad_norm": 0.07132128750657112,
"learning_rate": 4.274672662972679e-07,
"loss": 0.2904,
"step": 1509
},
{
"epoch": 4.791749305831019,
"grad_norm": 0.07164963300558302,
"learning_rate": 4.146341022179057e-07,
"loss": 0.2908,
"step": 1510
},
{
"epoch": 4.7949226497421655,
"grad_norm": 0.07165377120863803,
"learning_rate": 4.019955164949352e-07,
"loss": 0.2874,
"step": 1511
},
{
"epoch": 4.7980959936533125,
"grad_norm": 0.07017965132137843,
"learning_rate": 3.8955157125221356e-07,
"loss": 0.2849,
"step": 1512
},
{
"epoch": 4.801269337564459,
"grad_norm": 0.0706363848720524,
"learning_rate": 3.7730232765685213e-07,
"loss": 0.2861,
"step": 1513
},
{
"epoch": 4.804442681475605,
"grad_norm": 0.07329406027888734,
"learning_rate": 3.6524784591891013e-07,
"loss": 0.2873,
"step": 1514
},
{
"epoch": 4.807616025386752,
"grad_norm": 0.06880746155049348,
"learning_rate": 3.5338818529113253e-07,
"loss": 0.2869,
"step": 1515
},
{
"epoch": 4.810789369297898,
"grad_norm": 0.07176951769611108,
"learning_rate": 3.417234040686257e-07,
"loss": 0.2935,
"step": 1516
},
{
"epoch": 4.813962713209044,
"grad_norm": 0.07173903151971806,
"learning_rate": 3.302535595885914e-07,
"loss": 0.292,
"step": 1517
},
{
"epoch": 4.817136057120191,
"grad_norm": 0.06981608234215353,
"learning_rate": 3.1897870823002883e-07,
"loss": 0.2875,
"step": 1518
},
{
"epoch": 4.820309401031337,
"grad_norm": 0.07079666453671589,
"learning_rate": 3.078989054134729e-07,
"loss": 0.2852,
"step": 1519
},
{
"epoch": 4.823482744942483,
"grad_norm": 0.07073176594785087,
"learning_rate": 2.9701420560071417e-07,
"loss": 0.2905,
"step": 1520
},
{
"epoch": 4.826656088853629,
"grad_norm": 0.06983353352491713,
"learning_rate": 2.863246622945237e-07,
"loss": 0.2851,
"step": 1521
},
{
"epoch": 4.829829432764776,
"grad_norm": 0.07067428777959145,
"learning_rate": 2.758303280384045e-07,
"loss": 0.2873,
"step": 1522
},
{
"epoch": 4.833002776675922,
"grad_norm": 0.07102800424361236,
"learning_rate": 2.6553125441633355e-07,
"loss": 0.2871,
"step": 1523
},
{
"epoch": 4.836176120587068,
"grad_norm": 0.07060381894814903,
"learning_rate": 2.5542749205248683e-07,
"loss": 0.285,
"step": 1524
},
{
"epoch": 4.839349464498215,
"grad_norm": 0.07059262948871846,
"learning_rate": 2.4551909061101275e-07,
"loss": 0.2838,
"step": 1525
},
{
"epoch": 4.8425228084093614,
"grad_norm": 0.0700522721522073,
"learning_rate": 2.3580609879578332e-07,
"loss": 0.2913,
"step": 1526
},
{
"epoch": 4.8456961523205075,
"grad_norm": 0.07311091919771467,
"learning_rate": 2.2628856435015e-07,
"loss": 0.2903,
"step": 1527
},
{
"epoch": 4.848869496231654,
"grad_norm": 0.06916920833217112,
"learning_rate": 2.1696653405670398e-07,
"loss": 0.2831,
"step": 1528
},
{
"epoch": 4.852042840142801,
"grad_norm": 0.0709586661157069,
"learning_rate": 2.0784005373706729e-07,
"loss": 0.2892,
"step": 1529
},
{
"epoch": 4.855216184053947,
"grad_norm": 0.06984666564128461,
"learning_rate": 1.9890916825163086e-07,
"loss": 0.2895,
"step": 1530
},
{
"epoch": 4.858389527965093,
"grad_norm": 0.07227174002725144,
"learning_rate": 1.9017392149938585e-07,
"loss": 0.2856,
"step": 1531
},
{
"epoch": 4.86156287187624,
"grad_norm": 0.06981243660335461,
"learning_rate": 1.816343564176526e-07,
"loss": 0.2841,
"step": 1532
},
{
"epoch": 4.864736215787386,
"grad_norm": 0.06955676961459405,
"learning_rate": 1.7329051498191196e-07,
"loss": 0.2853,
"step": 1533
},
{
"epoch": 4.867909559698532,
"grad_norm": 0.07044007567746591,
"learning_rate": 1.6514243820556996e-07,
"loss": 0.2892,
"step": 1534
},
{
"epoch": 4.871082903609679,
"grad_norm": 0.07011876795238156,
"learning_rate": 1.5719016613978012e-07,
"loss": 0.2888,
"step": 1535
},
{
"epoch": 4.874256247520825,
"grad_norm": 0.0711153254470578,
"learning_rate": 1.4943373787323468e-07,
"loss": 0.2875,
"step": 1536
},
{
"epoch": 4.877429591431971,
"grad_norm": 0.07033022450802037,
"learning_rate": 1.4187319153196488e-07,
"loss": 0.2937,
"step": 1537
},
{
"epoch": 4.880602935343118,
"grad_norm": 0.07169239899295766,
"learning_rate": 1.3450856427916325e-07,
"loss": 0.2911,
"step": 1538
},
{
"epoch": 4.883776279254264,
"grad_norm": 0.07061272545363809,
"learning_rate": 1.2733989231500597e-07,
"loss": 0.2852,
"step": 1539
},
{
"epoch": 4.88694962316541,
"grad_norm": 0.07142348008808704,
"learning_rate": 1.2036721087646642e-07,
"loss": 0.2881,
"step": 1540
},
{
"epoch": 4.890122967076557,
"grad_norm": 0.07491018067145877,
"learning_rate": 1.1359055423713295e-07,
"loss": 0.2883,
"step": 1541
},
{
"epoch": 4.8932963109877035,
"grad_norm": 0.07043352846998585,
"learning_rate": 1.070099557070714e-07,
"loss": 0.2884,
"step": 1542
},
{
"epoch": 4.8964696548988496,
"grad_norm": 0.0703268500097442,
"learning_rate": 1.0062544763262516e-07,
"loss": 0.2886,
"step": 1543
},
{
"epoch": 4.899642998809996,
"grad_norm": 0.06967989026912552,
"learning_rate": 9.44370613962775e-08,
"loss": 0.2875,
"step": 1544
},
{
"epoch": 4.902816342721143,
"grad_norm": 0.07091190304068178,
"learning_rate": 8.844482741649174e-08,
"loss": 0.2846,
"step": 1545
},
{
"epoch": 4.905989686632289,
"grad_norm": 0.0704157347762079,
"learning_rate": 8.264877514756465e-08,
"loss": 0.2915,
"step": 1546
},
{
"epoch": 4.909163030543435,
"grad_norm": 0.07197642287777141,
"learning_rate": 7.704893307947547e-08,
"loss": 0.2886,
"step": 1547
},
{
"epoch": 4.912336374454582,
"grad_norm": 0.06954677224905088,
"learning_rate": 7.164532873775276e-08,
"loss": 0.288,
"step": 1548
},
{
"epoch": 4.915509718365728,
"grad_norm": 0.07029671235552006,
"learning_rate": 6.643798868333662e-08,
"loss": 0.2868,
"step": 1549
},
{
"epoch": 4.918683062276874,
"grad_norm": 0.07101459809224268,
"learning_rate": 6.142693851244109e-08,
"loss": 0.2871,
"step": 1550
},
{
"epoch": 4.921856406188021,
"grad_norm": 0.07028381337765902,
"learning_rate": 5.661220285643865e-08,
"loss": 0.2923,
"step": 1551
},
{
"epoch": 4.925029750099167,
"grad_norm": 0.07105984885088223,
"learning_rate": 5.199380538174037e-08,
"loss": 0.2935,
"step": 1552
},
{
"epoch": 4.928203094010313,
"grad_norm": 0.07233897145123062,
"learning_rate": 4.7571768789667075e-08,
"loss": 0.2875,
"step": 1553
},
{
"epoch": 4.931376437921459,
"grad_norm": 0.0703092103811166,
"learning_rate": 4.3346114816347207e-08,
"loss": 0.2858,
"step": 1554
},
{
"epoch": 4.934549781832606,
"grad_norm": 0.0706968242740516,
"learning_rate": 3.931686423261027e-08,
"loss": 0.2872,
"step": 1555
},
{
"epoch": 4.937723125743752,
"grad_norm": 0.069673162302404,
"learning_rate": 3.5484036843875804e-08,
"loss": 0.2882,
"step": 1556
},
{
"epoch": 4.9408964696548985,
"grad_norm": 0.06968426384379744,
"learning_rate": 3.1847651490068964e-08,
"loss": 0.2855,
"step": 1557
},
{
"epoch": 4.9440698135660455,
"grad_norm": 0.07194410996026296,
"learning_rate": 2.8407726045522886e-08,
"loss": 0.2928,
"step": 1558
},
{
"epoch": 4.947243157477192,
"grad_norm": 0.06910814568161171,
"learning_rate": 2.5164277418880945e-08,
"loss": 0.2853,
"step": 1559
},
{
"epoch": 4.950416501388338,
"grad_norm": 0.07039708426854659,
"learning_rate": 2.2117321553030146e-08,
"loss": 0.2896,
"step": 1560
},
{
"epoch": 4.953589845299485,
"grad_norm": 0.07008182260440685,
"learning_rate": 1.9266873425012323e-08,
"loss": 0.2892,
"step": 1561
},
{
"epoch": 4.956763189210631,
"grad_norm": 0.06932142266239405,
"learning_rate": 1.6612947045953064e-08,
"loss": 0.2852,
"step": 1562
},
{
"epoch": 4.959936533121777,
"grad_norm": 0.06970140934176305,
"learning_rate": 1.4155555460990677e-08,
"loss": 0.2893,
"step": 1563
},
{
"epoch": 4.963109877032924,
"grad_norm": 0.07142752676643119,
"learning_rate": 1.1894710749214e-08,
"loss": 0.2883,
"step": 1564
},
{
"epoch": 4.96628322094407,
"grad_norm": 0.07011403382472035,
"learning_rate": 9.83042402360912e-09,
"loss": 0.2877,
"step": 1565
},
{
"epoch": 4.969456564855216,
"grad_norm": 0.06985655852080626,
"learning_rate": 7.962705430988315e-09,
"loss": 0.2908,
"step": 1566
},
{
"epoch": 4.972629908766363,
"grad_norm": 0.07205354220071218,
"learning_rate": 6.291564151963414e-09,
"loss": 0.2885,
"step": 1567
},
{
"epoch": 4.975803252677509,
"grad_norm": 0.06900070418548365,
"learning_rate": 4.817008400879175e-09,
"loss": 0.2877,
"step": 1568
},
{
"epoch": 4.978976596588655,
"grad_norm": 0.07029900380853947,
"learning_rate": 3.539045425777765e-09,
"loss": 0.2868,
"step": 1569
},
{
"epoch": 4.982149940499801,
"grad_norm": 0.070893441701857,
"learning_rate": 2.4576815083809933e-09,
"loss": 0.2859,
"step": 1570
},
{
"epoch": 4.985323284410948,
"grad_norm": 0.07118447299049073,
"learning_rate": 1.572921964032581e-09,
"loss": 0.2897,
"step": 1571
},
{
"epoch": 4.988496628322094,
"grad_norm": 0.07138726064563218,
"learning_rate": 8.847711416937188e-10,
"loss": 0.2912,
"step": 1572
},
{
"epoch": 4.9916699722332405,
"grad_norm": 0.06968196998745611,
"learning_rate": 3.9323242390754137e-10,
"loss": 0.2841,
"step": 1573
},
{
"epoch": 4.9948433161443875,
"grad_norm": 0.07155054271656291,
"learning_rate": 9.830822678136288e-11,
"loss": 0.2906,
"step": 1574
},
{
"epoch": 4.998016660055534,
"grad_norm": 0.07077675462284239,
"learning_rate": 0.0,
"loss": 0.2858,
"step": 1575
},
{
"epoch": 4.998016660055534,
"step": 1575,
"total_flos": 3.782923812788083e+19,
"train_loss": 0.3593273034549895,
"train_runtime": 93234.1699,
"train_samples_per_second": 8.651,
"train_steps_per_second": 0.017
}
],
"logging_steps": 1.0,
"max_steps": 1575,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.782923812788083e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}