{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1173,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025575447570332483,
"grad_norm": 60.07620508040347,
"learning_rate": 0.0,
"loss": 10.9714,
"step": 1
},
{
"epoch": 0.005115089514066497,
"grad_norm": 60.511635982681035,
"learning_rate": 4.2372881355932204e-07,
"loss": 11.044,
"step": 2
},
{
"epoch": 0.0076726342710997444,
"grad_norm": 61.57012701086648,
"learning_rate": 8.474576271186441e-07,
"loss": 10.9687,
"step": 3
},
{
"epoch": 0.010230179028132993,
"grad_norm": 62.423863746635334,
"learning_rate": 1.2711864406779662e-06,
"loss": 10.9132,
"step": 4
},
{
"epoch": 0.01278772378516624,
"grad_norm": 60.51018546257131,
"learning_rate": 1.6949152542372882e-06,
"loss": 11.0108,
"step": 5
},
{
"epoch": 0.015345268542199489,
"grad_norm": 66.2795306712718,
"learning_rate": 2.11864406779661e-06,
"loss": 10.7022,
"step": 6
},
{
"epoch": 0.017902813299232736,
"grad_norm": 68.66164801562074,
"learning_rate": 2.5423728813559323e-06,
"loss": 10.6058,
"step": 7
},
{
"epoch": 0.020460358056265986,
"grad_norm": 107.4660149943893,
"learning_rate": 2.9661016949152545e-06,
"loss": 9.0593,
"step": 8
},
{
"epoch": 0.023017902813299233,
"grad_norm": 122.48386436910788,
"learning_rate": 3.3898305084745763e-06,
"loss": 8.4522,
"step": 9
},
{
"epoch": 0.02557544757033248,
"grad_norm": 125.82848908671042,
"learning_rate": 3.813559322033899e-06,
"loss": 5.6693,
"step": 10
},
{
"epoch": 0.028132992327365727,
"grad_norm": 52.58888004444451,
"learning_rate": 4.23728813559322e-06,
"loss": 3.0629,
"step": 11
},
{
"epoch": 0.030690537084398978,
"grad_norm": 37.39340585668415,
"learning_rate": 4.6610169491525425e-06,
"loss": 2.376,
"step": 12
},
{
"epoch": 0.03324808184143223,
"grad_norm": 28.735337125133064,
"learning_rate": 5.084745762711865e-06,
"loss": 2.1006,
"step": 13
},
{
"epoch": 0.03580562659846547,
"grad_norm": 6.3291764630351315,
"learning_rate": 5.508474576271187e-06,
"loss": 1.2756,
"step": 14
},
{
"epoch": 0.03836317135549872,
"grad_norm": 4.690308248334096,
"learning_rate": 5.932203389830509e-06,
"loss": 1.2509,
"step": 15
},
{
"epoch": 0.04092071611253197,
"grad_norm": 3.5468348254384843,
"learning_rate": 6.3559322033898304e-06,
"loss": 1.1712,
"step": 16
},
{
"epoch": 0.043478260869565216,
"grad_norm": 2.676492989643342,
"learning_rate": 6.779661016949153e-06,
"loss": 1.055,
"step": 17
},
{
"epoch": 0.04603580562659847,
"grad_norm": 2.1888510444313205,
"learning_rate": 7.203389830508475e-06,
"loss": 1.0324,
"step": 18
},
{
"epoch": 0.04859335038363171,
"grad_norm": 55.49598040447309,
"learning_rate": 7.627118644067798e-06,
"loss": 0.9577,
"step": 19
},
{
"epoch": 0.05115089514066496,
"grad_norm": 18.10939464017419,
"learning_rate": 8.050847457627118e-06,
"loss": 0.8841,
"step": 20
},
{
"epoch": 0.05370843989769821,
"grad_norm": 1.783845830738153,
"learning_rate": 8.47457627118644e-06,
"loss": 0.8704,
"step": 21
},
{
"epoch": 0.056265984654731455,
"grad_norm": 1.2295478253717957,
"learning_rate": 8.898305084745763e-06,
"loss": 0.829,
"step": 22
},
{
"epoch": 0.058823529411764705,
"grad_norm": 1.0279978849315632,
"learning_rate": 9.322033898305085e-06,
"loss": 0.8196,
"step": 23
},
{
"epoch": 0.061381074168797956,
"grad_norm": 0.8982739673565904,
"learning_rate": 9.745762711864407e-06,
"loss": 0.7903,
"step": 24
},
{
"epoch": 0.0639386189258312,
"grad_norm": 0.7588801023963194,
"learning_rate": 1.016949152542373e-05,
"loss": 0.7177,
"step": 25
},
{
"epoch": 0.06649616368286446,
"grad_norm": 1.0123370131062162,
"learning_rate": 1.0593220338983052e-05,
"loss": 0.7536,
"step": 26
},
{
"epoch": 0.06905370843989769,
"grad_norm": 0.7910316066634632,
"learning_rate": 1.1016949152542374e-05,
"loss": 0.6874,
"step": 27
},
{
"epoch": 0.07161125319693094,
"grad_norm": 0.7192937721653079,
"learning_rate": 1.1440677966101696e-05,
"loss": 0.6942,
"step": 28
},
{
"epoch": 0.0741687979539642,
"grad_norm": 0.6367048650637959,
"learning_rate": 1.1864406779661018e-05,
"loss": 0.652,
"step": 29
},
{
"epoch": 0.07672634271099744,
"grad_norm": 0.6890008346231932,
"learning_rate": 1.228813559322034e-05,
"loss": 0.6527,
"step": 30
},
{
"epoch": 0.0792838874680307,
"grad_norm": 0.7018774861427414,
"learning_rate": 1.2711864406779661e-05,
"loss": 0.6389,
"step": 31
},
{
"epoch": 0.08184143222506395,
"grad_norm": 0.6934531307251741,
"learning_rate": 1.3135593220338985e-05,
"loss": 0.6612,
"step": 32
},
{
"epoch": 0.08439897698209718,
"grad_norm": 0.4547490187162451,
"learning_rate": 1.3559322033898305e-05,
"loss": 0.6272,
"step": 33
},
{
"epoch": 0.08695652173913043,
"grad_norm": 0.5411681099025528,
"learning_rate": 1.3983050847457627e-05,
"loss": 0.6326,
"step": 34
},
{
"epoch": 0.08951406649616368,
"grad_norm": 0.5591394716745298,
"learning_rate": 1.440677966101695e-05,
"loss": 0.6139,
"step": 35
},
{
"epoch": 0.09207161125319693,
"grad_norm": 0.4569913550931653,
"learning_rate": 1.4830508474576272e-05,
"loss": 0.6073,
"step": 36
},
{
"epoch": 0.09462915601023018,
"grad_norm": 0.4147309558729621,
"learning_rate": 1.5254237288135596e-05,
"loss": 0.6017,
"step": 37
},
{
"epoch": 0.09718670076726342,
"grad_norm": 0.44578293274404973,
"learning_rate": 1.5677966101694916e-05,
"loss": 0.578,
"step": 38
},
{
"epoch": 0.09974424552429667,
"grad_norm": 0.44759576906101894,
"learning_rate": 1.6101694915254237e-05,
"loss": 0.5725,
"step": 39
},
{
"epoch": 0.10230179028132992,
"grad_norm": 0.521441753506374,
"learning_rate": 1.652542372881356e-05,
"loss": 0.6091,
"step": 40
},
{
"epoch": 0.10485933503836317,
"grad_norm": 0.3633683810476169,
"learning_rate": 1.694915254237288e-05,
"loss": 0.591,
"step": 41
},
{
"epoch": 0.10741687979539642,
"grad_norm": 0.38875293035716313,
"learning_rate": 1.7372881355932205e-05,
"loss": 0.5684,
"step": 42
},
{
"epoch": 0.10997442455242967,
"grad_norm": 0.4050488399781334,
"learning_rate": 1.7796610169491526e-05,
"loss": 0.5604,
"step": 43
},
{
"epoch": 0.11253196930946291,
"grad_norm": 0.35484531528744356,
"learning_rate": 1.8220338983050846e-05,
"loss": 0.5588,
"step": 44
},
{
"epoch": 0.11508951406649616,
"grad_norm": 0.3558009349640067,
"learning_rate": 1.864406779661017e-05,
"loss": 0.5772,
"step": 45
},
{
"epoch": 0.11764705882352941,
"grad_norm": 0.3631599278698065,
"learning_rate": 1.906779661016949e-05,
"loss": 0.5567,
"step": 46
},
{
"epoch": 0.12020460358056266,
"grad_norm": 0.29178893481388374,
"learning_rate": 1.9491525423728814e-05,
"loss": 0.5575,
"step": 47
},
{
"epoch": 0.12276214833759591,
"grad_norm": 0.28512370332661957,
"learning_rate": 1.9915254237288135e-05,
"loss": 0.545,
"step": 48
},
{
"epoch": 0.12531969309462915,
"grad_norm": 0.33383686916439004,
"learning_rate": 2.033898305084746e-05,
"loss": 0.5395,
"step": 49
},
{
"epoch": 0.1278772378516624,
"grad_norm": 0.3302302589173117,
"learning_rate": 2.076271186440678e-05,
"loss": 0.5654,
"step": 50
},
{
"epoch": 0.13043478260869565,
"grad_norm": 0.25804408924344063,
"learning_rate": 2.1186440677966103e-05,
"loss": 0.545,
"step": 51
},
{
"epoch": 0.1329923273657289,
"grad_norm": 0.27338682999676506,
"learning_rate": 2.1610169491525427e-05,
"loss": 0.5417,
"step": 52
},
{
"epoch": 0.13554987212276215,
"grad_norm": 0.25924856640229854,
"learning_rate": 2.2033898305084748e-05,
"loss": 0.5435,
"step": 53
},
{
"epoch": 0.13810741687979539,
"grad_norm": 0.25667969517909306,
"learning_rate": 2.245762711864407e-05,
"loss": 0.5027,
"step": 54
},
{
"epoch": 0.14066496163682865,
"grad_norm": 0.2651721483714715,
"learning_rate": 2.2881355932203392e-05,
"loss": 0.5148,
"step": 55
},
{
"epoch": 0.1432225063938619,
"grad_norm": 0.2695589091283933,
"learning_rate": 2.3305084745762712e-05,
"loss": 0.5421,
"step": 56
},
{
"epoch": 0.14578005115089515,
"grad_norm": 0.2660946246807775,
"learning_rate": 2.3728813559322036e-05,
"loss": 0.5302,
"step": 57
},
{
"epoch": 0.1483375959079284,
"grad_norm": 0.2572598707834026,
"learning_rate": 2.4152542372881357e-05,
"loss": 0.5494,
"step": 58
},
{
"epoch": 0.15089514066496162,
"grad_norm": 0.25796653370038297,
"learning_rate": 2.457627118644068e-05,
"loss": 0.5173,
"step": 59
},
{
"epoch": 0.1534526854219949,
"grad_norm": 0.26719666930574326,
"learning_rate": 2.5e-05,
"loss": 0.5318,
"step": 60
},
{
"epoch": 0.15601023017902813,
"grad_norm": 0.2415395019191131,
"learning_rate": 2.5423728813559322e-05,
"loss": 0.533,
"step": 61
},
{
"epoch": 0.1585677749360614,
"grad_norm": 0.2731503593131359,
"learning_rate": 2.5847457627118642e-05,
"loss": 0.5138,
"step": 62
},
{
"epoch": 0.16112531969309463,
"grad_norm": 0.23021339667231472,
"learning_rate": 2.627118644067797e-05,
"loss": 0.506,
"step": 63
},
{
"epoch": 0.1636828644501279,
"grad_norm": 0.2438183399920384,
"learning_rate": 2.669491525423729e-05,
"loss": 0.4933,
"step": 64
},
{
"epoch": 0.16624040920716113,
"grad_norm": 0.25625774549395297,
"learning_rate": 2.711864406779661e-05,
"loss": 0.5275,
"step": 65
},
{
"epoch": 0.16879795396419436,
"grad_norm": 0.2523483723490555,
"learning_rate": 2.754237288135593e-05,
"loss": 0.517,
"step": 66
},
{
"epoch": 0.17135549872122763,
"grad_norm": 0.24599282565528238,
"learning_rate": 2.7966101694915255e-05,
"loss": 0.5105,
"step": 67
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.25271072320627247,
"learning_rate": 2.838983050847458e-05,
"loss": 0.4947,
"step": 68
},
{
"epoch": 0.17647058823529413,
"grad_norm": 0.26800675870234536,
"learning_rate": 2.88135593220339e-05,
"loss": 0.5018,
"step": 69
},
{
"epoch": 0.17902813299232737,
"grad_norm": 0.22967309445842915,
"learning_rate": 2.9237288135593223e-05,
"loss": 0.5127,
"step": 70
},
{
"epoch": 0.1815856777493606,
"grad_norm": 0.2936501608494599,
"learning_rate": 2.9661016949152544e-05,
"loss": 0.5067,
"step": 71
},
{
"epoch": 0.18414322250639387,
"grad_norm": 0.3944135766030376,
"learning_rate": 3.0084745762711864e-05,
"loss": 0.5189,
"step": 72
},
{
"epoch": 0.1867007672634271,
"grad_norm": 0.266923293934136,
"learning_rate": 3.050847457627119e-05,
"loss": 0.5099,
"step": 73
},
{
"epoch": 0.18925831202046037,
"grad_norm": 0.25718984553900326,
"learning_rate": 3.093220338983051e-05,
"loss": 0.5024,
"step": 74
},
{
"epoch": 0.1918158567774936,
"grad_norm": 0.23516139958516855,
"learning_rate": 3.135593220338983e-05,
"loss": 0.4961,
"step": 75
},
{
"epoch": 0.19437340153452684,
"grad_norm": 0.2629972539950733,
"learning_rate": 3.177966101694915e-05,
"loss": 0.4858,
"step": 76
},
{
"epoch": 0.1969309462915601,
"grad_norm": 0.2397591843698089,
"learning_rate": 3.2203389830508473e-05,
"loss": 0.5022,
"step": 77
},
{
"epoch": 0.19948849104859334,
"grad_norm": 0.2488143296082389,
"learning_rate": 3.26271186440678e-05,
"loss": 0.5008,
"step": 78
},
{
"epoch": 0.2020460358056266,
"grad_norm": 0.284022517588893,
"learning_rate": 3.305084745762712e-05,
"loss": 0.4944,
"step": 79
},
{
"epoch": 0.20460358056265984,
"grad_norm": 0.2585535341280856,
"learning_rate": 3.347457627118644e-05,
"loss": 0.4681,
"step": 80
},
{
"epoch": 0.2071611253196931,
"grad_norm": 0.27227808307258267,
"learning_rate": 3.389830508474576e-05,
"loss": 0.4798,
"step": 81
},
{
"epoch": 0.20971867007672634,
"grad_norm": 0.27943220348506814,
"learning_rate": 3.432203389830508e-05,
"loss": 0.4869,
"step": 82
},
{
"epoch": 0.21227621483375958,
"grad_norm": 0.2591147558052403,
"learning_rate": 3.474576271186441e-05,
"loss": 0.5002,
"step": 83
},
{
"epoch": 0.21483375959079284,
"grad_norm": 0.26199419848962174,
"learning_rate": 3.516949152542373e-05,
"loss": 0.4848,
"step": 84
},
{
"epoch": 0.21739130434782608,
"grad_norm": 0.2560452706817345,
"learning_rate": 3.559322033898305e-05,
"loss": 0.4796,
"step": 85
},
{
"epoch": 0.21994884910485935,
"grad_norm": 0.3104926180958261,
"learning_rate": 3.601694915254237e-05,
"loss": 0.4857,
"step": 86
},
{
"epoch": 0.22250639386189258,
"grad_norm": 0.2595037856684306,
"learning_rate": 3.644067796610169e-05,
"loss": 0.4786,
"step": 87
},
{
"epoch": 0.22506393861892582,
"grad_norm": 0.28985166506581866,
"learning_rate": 3.686440677966102e-05,
"loss": 0.4733,
"step": 88
},
{
"epoch": 0.22762148337595908,
"grad_norm": 0.2900856188045173,
"learning_rate": 3.728813559322034e-05,
"loss": 0.4893,
"step": 89
},
{
"epoch": 0.23017902813299232,
"grad_norm": 0.3181961782523891,
"learning_rate": 3.771186440677966e-05,
"loss": 0.4836,
"step": 90
},
{
"epoch": 0.23273657289002558,
"grad_norm": 0.3524322519656808,
"learning_rate": 3.813559322033898e-05,
"loss": 0.4858,
"step": 91
},
{
"epoch": 0.23529411764705882,
"grad_norm": 0.277143774625197,
"learning_rate": 3.855932203389831e-05,
"loss": 0.4602,
"step": 92
},
{
"epoch": 0.23785166240409208,
"grad_norm": 0.3152846596099472,
"learning_rate": 3.898305084745763e-05,
"loss": 0.4861,
"step": 93
},
{
"epoch": 0.24040920716112532,
"grad_norm": 0.3108040600900486,
"learning_rate": 3.940677966101695e-05,
"loss": 0.4735,
"step": 94
},
{
"epoch": 0.24296675191815856,
"grad_norm": 0.3636456936928106,
"learning_rate": 3.983050847457627e-05,
"loss": 0.4927,
"step": 95
},
{
"epoch": 0.24552429667519182,
"grad_norm": 0.281719824056263,
"learning_rate": 4.025423728813559e-05,
"loss": 0.478,
"step": 96
},
{
"epoch": 0.24808184143222506,
"grad_norm": 0.31572505604740536,
"learning_rate": 4.067796610169492e-05,
"loss": 0.4782,
"step": 97
},
{
"epoch": 0.2506393861892583,
"grad_norm": 0.3265923715391404,
"learning_rate": 4.110169491525424e-05,
"loss": 0.4769,
"step": 98
},
{
"epoch": 0.2531969309462916,
"grad_norm": 0.28803267079398887,
"learning_rate": 4.152542372881356e-05,
"loss": 0.4729,
"step": 99
},
{
"epoch": 0.2557544757033248,
"grad_norm": 0.3650171432061163,
"learning_rate": 4.1949152542372886e-05,
"loss": 0.4686,
"step": 100
},
{
"epoch": 0.25831202046035806,
"grad_norm": 0.3208885876586653,
"learning_rate": 4.2372881355932206e-05,
"loss": 0.4756,
"step": 101
},
{
"epoch": 0.2608695652173913,
"grad_norm": 0.3018386311182313,
"learning_rate": 4.279661016949153e-05,
"loss": 0.4898,
"step": 102
},
{
"epoch": 0.26342710997442453,
"grad_norm": 0.35043017471200005,
"learning_rate": 4.3220338983050854e-05,
"loss": 0.4791,
"step": 103
},
{
"epoch": 0.2659846547314578,
"grad_norm": 0.34067263771788764,
"learning_rate": 4.3644067796610175e-05,
"loss": 0.4605,
"step": 104
},
{
"epoch": 0.26854219948849106,
"grad_norm": 0.30101429979539257,
"learning_rate": 4.4067796610169495e-05,
"loss": 0.4736,
"step": 105
},
{
"epoch": 0.2710997442455243,
"grad_norm": 0.30707206512082585,
"learning_rate": 4.4491525423728816e-05,
"loss": 0.4822,
"step": 106
},
{
"epoch": 0.27365728900255754,
"grad_norm": 0.39306930698809855,
"learning_rate": 4.491525423728814e-05,
"loss": 0.4586,
"step": 107
},
{
"epoch": 0.27621483375959077,
"grad_norm": 0.2793625552949932,
"learning_rate": 4.533898305084746e-05,
"loss": 0.4824,
"step": 108
},
{
"epoch": 0.27877237851662406,
"grad_norm": 0.39226221347711837,
"learning_rate": 4.5762711864406784e-05,
"loss": 0.4576,
"step": 109
},
{
"epoch": 0.2813299232736573,
"grad_norm": 0.3030667831941101,
"learning_rate": 4.6186440677966104e-05,
"loss": 0.4624,
"step": 110
},
{
"epoch": 0.28388746803069054,
"grad_norm": 0.3273613222535301,
"learning_rate": 4.6610169491525425e-05,
"loss": 0.4799,
"step": 111
},
{
"epoch": 0.2864450127877238,
"grad_norm": 0.2863063658186757,
"learning_rate": 4.703389830508475e-05,
"loss": 0.4669,
"step": 112
},
{
"epoch": 0.289002557544757,
"grad_norm": 0.33232608459400076,
"learning_rate": 4.745762711864407e-05,
"loss": 0.4825,
"step": 113
},
{
"epoch": 0.2915601023017903,
"grad_norm": 0.3600411712420216,
"learning_rate": 4.788135593220339e-05,
"loss": 0.4683,
"step": 114
},
{
"epoch": 0.29411764705882354,
"grad_norm": 0.27761199193640784,
"learning_rate": 4.8305084745762714e-05,
"loss": 0.4755,
"step": 115
},
{
"epoch": 0.2966751918158568,
"grad_norm": 0.3144400589669658,
"learning_rate": 4.8728813559322034e-05,
"loss": 0.4566,
"step": 116
},
{
"epoch": 0.29923273657289,
"grad_norm": 0.35513580765557595,
"learning_rate": 4.915254237288136e-05,
"loss": 0.459,
"step": 117
},
{
"epoch": 0.30179028132992325,
"grad_norm": 0.2738899153960894,
"learning_rate": 4.957627118644068e-05,
"loss": 0.4657,
"step": 118
},
{
"epoch": 0.30434782608695654,
"grad_norm": 0.3433568900683564,
"learning_rate": 5e-05,
"loss": 0.4594,
"step": 119
},
{
"epoch": 0.3069053708439898,
"grad_norm": 0.2765901256428289,
"learning_rate": 4.9952606635071094e-05,
"loss": 0.4754,
"step": 120
},
{
"epoch": 0.309462915601023,
"grad_norm": 0.3551164959173657,
"learning_rate": 4.990521327014218e-05,
"loss": 0.4616,
"step": 121
},
{
"epoch": 0.31202046035805625,
"grad_norm": 0.29005640287933204,
"learning_rate": 4.985781990521327e-05,
"loss": 0.4671,
"step": 122
},
{
"epoch": 0.3145780051150895,
"grad_norm": 0.3733034604083977,
"learning_rate": 4.981042654028436e-05,
"loss": 0.4891,
"step": 123
},
{
"epoch": 0.3171355498721228,
"grad_norm": 0.28943964192066307,
"learning_rate": 4.976303317535545e-05,
"loss": 0.4658,
"step": 124
},
{
"epoch": 0.319693094629156,
"grad_norm": 0.34596305401543703,
"learning_rate": 4.9715639810426544e-05,
"loss": 0.4601,
"step": 125
},
{
"epoch": 0.32225063938618925,
"grad_norm": 0.30408705183314516,
"learning_rate": 4.9668246445497635e-05,
"loss": 0.4392,
"step": 126
},
{
"epoch": 0.3248081841432225,
"grad_norm": 0.2934769724426176,
"learning_rate": 4.9620853080568726e-05,
"loss": 0.4755,
"step": 127
},
{
"epoch": 0.3273657289002558,
"grad_norm": 0.3194601339022468,
"learning_rate": 4.957345971563981e-05,
"loss": 0.455,
"step": 128
},
{
"epoch": 0.329923273657289,
"grad_norm": 0.2765722150148559,
"learning_rate": 4.95260663507109e-05,
"loss": 0.4371,
"step": 129
},
{
"epoch": 0.33248081841432225,
"grad_norm": 0.3098968524738735,
"learning_rate": 4.9478672985781994e-05,
"loss": 0.4479,
"step": 130
},
{
"epoch": 0.3350383631713555,
"grad_norm": 0.29058110177351354,
"learning_rate": 4.9431279620853085e-05,
"loss": 0.4638,
"step": 131
},
{
"epoch": 0.3375959079283887,
"grad_norm": 0.34878186474460904,
"learning_rate": 4.938388625592417e-05,
"loss": 0.4589,
"step": 132
},
{
"epoch": 0.340153452685422,
"grad_norm": 0.34103367199010814,
"learning_rate": 4.933649289099526e-05,
"loss": 0.4494,
"step": 133
},
{
"epoch": 0.34271099744245526,
"grad_norm": 0.3024000321891373,
"learning_rate": 4.928909952606635e-05,
"loss": 0.4642,
"step": 134
},
{
"epoch": 0.3452685421994885,
"grad_norm": 0.3120266717266376,
"learning_rate": 4.9241706161137443e-05,
"loss": 0.4494,
"step": 135
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.3437694967932959,
"learning_rate": 4.919431279620853e-05,
"loss": 0.4385,
"step": 136
},
{
"epoch": 0.35038363171355497,
"grad_norm": 0.3561886653860422,
"learning_rate": 4.9146919431279626e-05,
"loss": 0.4503,
"step": 137
},
{
"epoch": 0.35294117647058826,
"grad_norm": 0.3265621404114159,
"learning_rate": 4.909952606635072e-05,
"loss": 0.4528,
"step": 138
},
{
"epoch": 0.3554987212276215,
"grad_norm": 0.39021732468276327,
"learning_rate": 4.90521327014218e-05,
"loss": 0.47,
"step": 139
},
{
"epoch": 0.35805626598465473,
"grad_norm": 0.2892525783443359,
"learning_rate": 4.900473933649289e-05,
"loss": 0.4425,
"step": 140
},
{
"epoch": 0.36061381074168797,
"grad_norm": 0.35290539690783224,
"learning_rate": 4.8957345971563985e-05,
"loss": 0.4575,
"step": 141
},
{
"epoch": 0.3631713554987212,
"grad_norm": 0.4428368742434025,
"learning_rate": 4.8909952606635076e-05,
"loss": 0.4722,
"step": 142
},
{
"epoch": 0.3657289002557545,
"grad_norm": 0.2735345303292492,
"learning_rate": 4.886255924170616e-05,
"loss": 0.4553,
"step": 143
},
{
"epoch": 0.36828644501278773,
"grad_norm": 0.4438915131124858,
"learning_rate": 4.881516587677725e-05,
"loss": 0.4721,
"step": 144
},
{
"epoch": 0.37084398976982097,
"grad_norm": 0.3281658095262752,
"learning_rate": 4.876777251184834e-05,
"loss": 0.4378,
"step": 145
},
{
"epoch": 0.3734015345268542,
"grad_norm": 0.3710338165695333,
"learning_rate": 4.8720379146919435e-05,
"loss": 0.4764,
"step": 146
},
{
"epoch": 0.37595907928388744,
"grad_norm": 0.35926803120990913,
"learning_rate": 4.867298578199052e-05,
"loss": 0.4552,
"step": 147
},
{
"epoch": 0.37851662404092073,
"grad_norm": 0.36794824845872526,
"learning_rate": 4.862559241706162e-05,
"loss": 0.4578,
"step": 148
},
{
"epoch": 0.38107416879795397,
"grad_norm": 0.31318291286449124,
"learning_rate": 4.857819905213271e-05,
"loss": 0.4351,
"step": 149
},
{
"epoch": 0.3836317135549872,
"grad_norm": 0.33033923224683864,
"learning_rate": 4.853080568720379e-05,
"loss": 0.4565,
"step": 150
},
{
"epoch": 0.38618925831202044,
"grad_norm": 0.30424131577956276,
"learning_rate": 4.8483412322274884e-05,
"loss": 0.4403,
"step": 151
},
{
"epoch": 0.3887468030690537,
"grad_norm": 0.28074085140395005,
"learning_rate": 4.8436018957345976e-05,
"loss": 0.4486,
"step": 152
},
{
"epoch": 0.391304347826087,
"grad_norm": 0.3579125827021185,
"learning_rate": 4.838862559241707e-05,
"loss": 0.4625,
"step": 153
},
{
"epoch": 0.3938618925831202,
"grad_norm": 0.3057863908214165,
"learning_rate": 4.834123222748815e-05,
"loss": 0.4361,
"step": 154
},
{
"epoch": 0.39641943734015345,
"grad_norm": 0.28441580945568773,
"learning_rate": 4.829383886255924e-05,
"loss": 0.4341,
"step": 155
},
{
"epoch": 0.3989769820971867,
"grad_norm": 0.28566109258674055,
"learning_rate": 4.8246445497630334e-05,
"loss": 0.441,
"step": 156
},
{
"epoch": 0.40153452685422,
"grad_norm": 0.3002441202365732,
"learning_rate": 4.819905213270142e-05,
"loss": 0.4329,
"step": 157
},
{
"epoch": 0.4040920716112532,
"grad_norm": 0.3199646866784537,
"learning_rate": 4.815165876777251e-05,
"loss": 0.4446,
"step": 158
},
{
"epoch": 0.40664961636828645,
"grad_norm": 0.2928518681501388,
"learning_rate": 4.810426540284361e-05,
"loss": 0.428,
"step": 159
},
{
"epoch": 0.4092071611253197,
"grad_norm": 0.3946927235529595,
"learning_rate": 4.80568720379147e-05,
"loss": 0.4421,
"step": 160
},
{
"epoch": 0.4117647058823529,
"grad_norm": 0.30774149759921665,
"learning_rate": 4.8009478672985784e-05,
"loss": 0.4679,
"step": 161
},
{
"epoch": 0.4143222506393862,
"grad_norm": 0.3644907390867006,
"learning_rate": 4.7962085308056876e-05,
"loss": 0.4516,
"step": 162
},
{
"epoch": 0.41687979539641945,
"grad_norm": 0.31614856501701377,
"learning_rate": 4.791469194312797e-05,
"loss": 0.4539,
"step": 163
},
{
"epoch": 0.4194373401534527,
"grad_norm": 0.30645090377175094,
"learning_rate": 4.786729857819905e-05,
"loss": 0.4346,
"step": 164
},
{
"epoch": 0.4219948849104859,
"grad_norm": 0.34220416537004444,
"learning_rate": 4.781990521327014e-05,
"loss": 0.4437,
"step": 165
},
{
"epoch": 0.42455242966751916,
"grad_norm": 0.29009367411374415,
"learning_rate": 4.7772511848341234e-05,
"loss": 0.4478,
"step": 166
},
{
"epoch": 0.42710997442455245,
"grad_norm": 0.3080387840957786,
"learning_rate": 4.7725118483412326e-05,
"loss": 0.4365,
"step": 167
},
{
"epoch": 0.4296675191815857,
"grad_norm": 0.30741939240017874,
"learning_rate": 4.767772511848341e-05,
"loss": 0.4588,
"step": 168
},
{
"epoch": 0.4322250639386189,
"grad_norm": 0.3198498782578863,
"learning_rate": 4.76303317535545e-05,
"loss": 0.438,
"step": 169
},
{
"epoch": 0.43478260869565216,
"grad_norm": 0.34750707859647,
"learning_rate": 4.758293838862559e-05,
"loss": 0.4543,
"step": 170
},
{
"epoch": 0.4373401534526854,
"grad_norm": 0.3106322104274765,
"learning_rate": 4.7535545023696684e-05,
"loss": 0.4567,
"step": 171
},
{
"epoch": 0.4398976982097187,
"grad_norm": 0.30192961843031885,
"learning_rate": 4.7488151658767775e-05,
"loss": 0.4342,
"step": 172
},
{
"epoch": 0.4424552429667519,
"grad_norm": 0.28068686110702473,
"learning_rate": 4.744075829383887e-05,
"loss": 0.4246,
"step": 173
},
{
"epoch": 0.44501278772378516,
"grad_norm": 0.343504552181982,
"learning_rate": 4.739336492890996e-05,
"loss": 0.4515,
"step": 174
},
{
"epoch": 0.4475703324808184,
"grad_norm": 0.27995937978607677,
"learning_rate": 4.734597156398104e-05,
"loss": 0.4423,
"step": 175
},
{
"epoch": 0.45012787723785164,
"grad_norm": 0.3040416539136848,
"learning_rate": 4.7298578199052134e-05,
"loss": 0.45,
"step": 176
},
{
"epoch": 0.45268542199488493,
"grad_norm": 0.31835031373188166,
"learning_rate": 4.7251184834123225e-05,
"loss": 0.4532,
"step": 177
},
{
"epoch": 0.45524296675191817,
"grad_norm": 0.3414414505648522,
"learning_rate": 4.720379146919432e-05,
"loss": 0.4498,
"step": 178
},
{
"epoch": 0.4578005115089514,
"grad_norm": 0.3673972403213916,
"learning_rate": 4.71563981042654e-05,
"loss": 0.444,
"step": 179
},
{
"epoch": 0.46035805626598464,
"grad_norm": 0.2994655863634162,
"learning_rate": 4.710900473933649e-05,
"loss": 0.4436,
"step": 180
},
{
"epoch": 0.4629156010230179,
"grad_norm": 0.2979340261572654,
"learning_rate": 4.7061611374407584e-05,
"loss": 0.4338,
"step": 181
},
{
"epoch": 0.46547314578005117,
"grad_norm": 0.3259496116943633,
"learning_rate": 4.7014218009478675e-05,
"loss": 0.4495,
"step": 182
},
{
"epoch": 0.4680306905370844,
"grad_norm": 0.23888073915231028,
"learning_rate": 4.6966824644549767e-05,
"loss": 0.4284,
"step": 183
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.32689979789920254,
"learning_rate": 4.691943127962086e-05,
"loss": 0.4307,
"step": 184
},
{
"epoch": 0.4731457800511509,
"grad_norm": 0.25079547977758637,
"learning_rate": 4.687203791469195e-05,
"loss": 0.4421,
"step": 185
},
{
"epoch": 0.47570332480818417,
"grad_norm": 0.2867599117175957,
"learning_rate": 4.6824644549763034e-05,
"loss": 0.4208,
"step": 186
},
{
"epoch": 0.4782608695652174,
"grad_norm": 0.30676226767943815,
"learning_rate": 4.6777251184834125e-05,
"loss": 0.4473,
"step": 187
},
{
"epoch": 0.48081841432225064,
"grad_norm": 0.2535226915718885,
"learning_rate": 4.6729857819905216e-05,
"loss": 0.4341,
"step": 188
},
{
"epoch": 0.4833759590792839,
"grad_norm": 0.2953685479977593,
"learning_rate": 4.668246445497631e-05,
"loss": 0.4296,
"step": 189
},
{
"epoch": 0.4859335038363171,
"grad_norm": 0.24557281792948057,
"learning_rate": 4.663507109004739e-05,
"loss": 0.4507,
"step": 190
},
{
"epoch": 0.4884910485933504,
"grad_norm": 0.2738208517596116,
"learning_rate": 4.6587677725118484e-05,
"loss": 0.4285,
"step": 191
},
{
"epoch": 0.49104859335038364,
"grad_norm": 0.28109008439258515,
"learning_rate": 4.6540284360189575e-05,
"loss": 0.4396,
"step": 192
},
{
"epoch": 0.4936061381074169,
"grad_norm": 0.2793263219783419,
"learning_rate": 4.6492890995260666e-05,
"loss": 0.4334,
"step": 193
},
{
"epoch": 0.4961636828644501,
"grad_norm": 0.2679578064695335,
"learning_rate": 4.644549763033176e-05,
"loss": 0.4425,
"step": 194
},
{
"epoch": 0.49872122762148335,
"grad_norm": 0.22379280473483837,
"learning_rate": 4.639810426540285e-05,
"loss": 0.4366,
"step": 195
},
{
"epoch": 0.5012787723785166,
"grad_norm": 0.24785033078885174,
"learning_rate": 4.635071090047394e-05,
"loss": 0.4309,
"step": 196
},
{
"epoch": 0.5038363171355499,
"grad_norm": 0.24670000195823377,
"learning_rate": 4.6303317535545025e-05,
"loss": 0.4417,
"step": 197
},
{
"epoch": 0.5063938618925832,
"grad_norm": 0.2930253060170641,
"learning_rate": 4.6255924170616116e-05,
"loss": 0.4375,
"step": 198
},
{
"epoch": 0.5089514066496164,
"grad_norm": 0.25825281391527216,
"learning_rate": 4.620853080568721e-05,
"loss": 0.4069,
"step": 199
},
{
"epoch": 0.5115089514066496,
"grad_norm": 0.26224408452770004,
"learning_rate": 4.616113744075829e-05,
"loss": 0.4324,
"step": 200
},
{
"epoch": 0.5140664961636828,
"grad_norm": 0.25990930281801855,
"learning_rate": 4.6113744075829384e-05,
"loss": 0.4345,
"step": 201
},
{
"epoch": 0.5166240409207161,
"grad_norm": 0.268851283978036,
"learning_rate": 4.6066350710900475e-05,
"loss": 0.4459,
"step": 202
},
{
"epoch": 0.5191815856777494,
"grad_norm": 0.24959358046946803,
"learning_rate": 4.6018957345971566e-05,
"loss": 0.429,
"step": 203
},
{
"epoch": 0.5217391304347826,
"grad_norm": 0.25540467279864626,
"learning_rate": 4.597156398104265e-05,
"loss": 0.4348,
"step": 204
},
{
"epoch": 0.5242966751918159,
"grad_norm": 0.3130713054404299,
"learning_rate": 4.592417061611375e-05,
"loss": 0.4271,
"step": 205
},
{
"epoch": 0.5268542199488491,
"grad_norm": 0.2688748449663916,
"learning_rate": 4.587677725118484e-05,
"loss": 0.442,
"step": 206
},
{
"epoch": 0.5294117647058824,
"grad_norm": 0.28683589425397626,
"learning_rate": 4.5829383886255925e-05,
"loss": 0.4333,
"step": 207
},
{
"epoch": 0.5319693094629157,
"grad_norm": 0.271763985489389,
"learning_rate": 4.5781990521327016e-05,
"loss": 0.4438,
"step": 208
},
{
"epoch": 0.5345268542199488,
"grad_norm": 0.2885843579908882,
"learning_rate": 4.573459715639811e-05,
"loss": 0.4419,
"step": 209
},
{
"epoch": 0.5370843989769821,
"grad_norm": 0.28754217783051483,
"learning_rate": 4.56872037914692e-05,
"loss": 0.4355,
"step": 210
},
{
"epoch": 0.5396419437340153,
"grad_norm": 0.2737511286441873,
"learning_rate": 4.563981042654028e-05,
"loss": 0.4387,
"step": 211
},
{
"epoch": 0.5421994884910486,
"grad_norm": 0.27934016374689097,
"learning_rate": 4.5592417061611375e-05,
"loss": 0.4349,
"step": 212
},
{
"epoch": 0.5447570332480819,
"grad_norm": 0.26735219691819356,
"learning_rate": 4.5545023696682466e-05,
"loss": 0.4134,
"step": 213
},
{
"epoch": 0.5473145780051151,
"grad_norm": 0.23887968506376323,
"learning_rate": 4.549763033175356e-05,
"loss": 0.4229,
"step": 214
},
{
"epoch": 0.5498721227621484,
"grad_norm": 0.3011075198266259,
"learning_rate": 4.545023696682464e-05,
"loss": 0.428,
"step": 215
},
{
"epoch": 0.5524296675191815,
"grad_norm": 0.2637321441272665,
"learning_rate": 4.540284360189574e-05,
"loss": 0.4363,
"step": 216
},
{
"epoch": 0.5549872122762148,
"grad_norm": 0.29296145440427007,
"learning_rate": 4.535545023696683e-05,
"loss": 0.4304,
"step": 217
},
{
"epoch": 0.5575447570332481,
"grad_norm": 0.30674762583017257,
"learning_rate": 4.5308056872037916e-05,
"loss": 0.4298,
"step": 218
},
{
"epoch": 0.5601023017902813,
"grad_norm": 0.3143323294898988,
"learning_rate": 4.526066350710901e-05,
"loss": 0.4174,
"step": 219
},
{
"epoch": 0.5626598465473146,
"grad_norm": 0.32231260882420976,
"learning_rate": 4.52132701421801e-05,
"loss": 0.4167,
"step": 220
},
{
"epoch": 0.5652173913043478,
"grad_norm": 0.3083722811455428,
"learning_rate": 4.516587677725119e-05,
"loss": 0.4146,
"step": 221
},
{
"epoch": 0.5677749360613811,
"grad_norm": 0.29120067898722657,
"learning_rate": 4.5118483412322274e-05,
"loss": 0.4341,
"step": 222
},
{
"epoch": 0.5703324808184144,
"grad_norm": 0.31085600501836047,
"learning_rate": 4.5071090047393366e-05,
"loss": 0.4368,
"step": 223
},
{
"epoch": 0.5728900255754475,
"grad_norm": 0.2562962629149674,
"learning_rate": 4.502369668246446e-05,
"loss": 0.4505,
"step": 224
},
{
"epoch": 0.5754475703324808,
"grad_norm": 0.3229335775809623,
"learning_rate": 4.497630331753555e-05,
"loss": 0.4281,
"step": 225
},
{
"epoch": 0.578005115089514,
"grad_norm": 0.2540883724081723,
"learning_rate": 4.492890995260663e-05,
"loss": 0.4345,
"step": 226
},
{
"epoch": 0.5805626598465473,
"grad_norm": 0.2886423143864352,
"learning_rate": 4.488151658767773e-05,
"loss": 0.4252,
"step": 227
},
{
"epoch": 0.5831202046035806,
"grad_norm": 0.25233412822407364,
"learning_rate": 4.483412322274882e-05,
"loss": 0.4366,
"step": 228
},
{
"epoch": 0.5856777493606138,
"grad_norm": 0.3098472836225145,
"learning_rate": 4.478672985781991e-05,
"loss": 0.4363,
"step": 229
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.27067664311480977,
"learning_rate": 4.4739336492891e-05,
"loss": 0.4354,
"step": 230
},
{
"epoch": 0.5907928388746803,
"grad_norm": 0.28985639348209424,
"learning_rate": 4.469194312796209e-05,
"loss": 0.4441,
"step": 231
},
{
"epoch": 0.5933503836317136,
"grad_norm": 0.24685436630203944,
"learning_rate": 4.464454976303318e-05,
"loss": 0.4249,
"step": 232
},
{
"epoch": 0.5959079283887468,
"grad_norm": 0.2415267361110554,
"learning_rate": 4.4597156398104266e-05,
"loss": 0.4218,
"step": 233
},
{
"epoch": 0.59846547314578,
"grad_norm": 0.2690111434121743,
"learning_rate": 4.454976303317536e-05,
"loss": 0.4597,
"step": 234
},
{
"epoch": 0.6010230179028133,
"grad_norm": 0.24515241676488578,
"learning_rate": 4.450236966824645e-05,
"loss": 0.4484,
"step": 235
},
{
"epoch": 0.6035805626598465,
"grad_norm": 0.27035232285201444,
"learning_rate": 4.445497630331753e-05,
"loss": 0.4241,
"step": 236
},
{
"epoch": 0.6061381074168798,
"grad_norm": 0.24712864164146403,
"learning_rate": 4.4407582938388624e-05,
"loss": 0.4351,
"step": 237
},
{
"epoch": 0.6086956521739131,
"grad_norm": 0.2756970755602701,
"learning_rate": 4.4360189573459716e-05,
"loss": 0.424,
"step": 238
},
{
"epoch": 0.6112531969309463,
"grad_norm": 0.219814601291788,
"learning_rate": 4.431279620853081e-05,
"loss": 0.4355,
"step": 239
},
{
"epoch": 0.6138107416879796,
"grad_norm": 0.3022287967822769,
"learning_rate": 4.42654028436019e-05,
"loss": 0.421,
"step": 240
},
{
"epoch": 0.6163682864450127,
"grad_norm": 0.25187957786419013,
"learning_rate": 4.421800947867299e-05,
"loss": 0.3987,
"step": 241
},
{
"epoch": 0.618925831202046,
"grad_norm": 0.2906641083550279,
"learning_rate": 4.417061611374408e-05,
"loss": 0.4094,
"step": 242
},
{
"epoch": 0.6214833759590793,
"grad_norm": 0.276150078017692,
"learning_rate": 4.4123222748815165e-05,
"loss": 0.4336,
"step": 243
},
{
"epoch": 0.6240409207161125,
"grad_norm": 0.31066268816197273,
"learning_rate": 4.407582938388626e-05,
"loss": 0.4327,
"step": 244
},
{
"epoch": 0.6265984654731458,
"grad_norm": 0.2741673358883194,
"learning_rate": 4.402843601895735e-05,
"loss": 0.4389,
"step": 245
},
{
"epoch": 0.629156010230179,
"grad_norm": 0.2836157982013865,
"learning_rate": 4.398104265402844e-05,
"loss": 0.4197,
"step": 246
},
{
"epoch": 0.6317135549872123,
"grad_norm": 0.2785562060260622,
"learning_rate": 4.3933649289099524e-05,
"loss": 0.4118,
"step": 247
},
{
"epoch": 0.6342710997442456,
"grad_norm": 0.2562708631634233,
"learning_rate": 4.3886255924170615e-05,
"loss": 0.4313,
"step": 248
},
{
"epoch": 0.6368286445012787,
"grad_norm": 0.3006474659338952,
"learning_rate": 4.383886255924171e-05,
"loss": 0.4369,
"step": 249
},
{
"epoch": 0.639386189258312,
"grad_norm": 0.2457393144167786,
"learning_rate": 4.37914691943128e-05,
"loss": 0.4262,
"step": 250
},
{
"epoch": 0.6419437340153452,
"grad_norm": 0.2613054151983516,
"learning_rate": 4.374407582938389e-05,
"loss": 0.4156,
"step": 251
},
{
"epoch": 0.6445012787723785,
"grad_norm": 0.2560975612132112,
"learning_rate": 4.369668246445498e-05,
"loss": 0.4327,
"step": 252
},
{
"epoch": 0.6470588235294118,
"grad_norm": 0.2615125383682568,
"learning_rate": 4.364928909952607e-05,
"loss": 0.4239,
"step": 253
},
{
"epoch": 0.649616368286445,
"grad_norm": 0.2703032829220517,
"learning_rate": 4.3601895734597157e-05,
"loss": 0.4326,
"step": 254
},
{
"epoch": 0.6521739130434783,
"grad_norm": 0.27271514949565595,
"learning_rate": 4.355450236966825e-05,
"loss": 0.4265,
"step": 255
},
{
"epoch": 0.6547314578005116,
"grad_norm": 0.28726916782564577,
"learning_rate": 4.350710900473934e-05,
"loss": 0.4357,
"step": 256
},
{
"epoch": 0.6572890025575447,
"grad_norm": 0.24344125190622717,
"learning_rate": 4.345971563981043e-05,
"loss": 0.4209,
"step": 257
},
{
"epoch": 0.659846547314578,
"grad_norm": 0.2779762711774089,
"learning_rate": 4.3412322274881515e-05,
"loss": 0.4402,
"step": 258
},
{
"epoch": 0.6624040920716112,
"grad_norm": 0.2833066194766303,
"learning_rate": 4.3364928909952606e-05,
"loss": 0.4309,
"step": 259
},
{
"epoch": 0.6649616368286445,
"grad_norm": 0.264439200242611,
"learning_rate": 4.33175355450237e-05,
"loss": 0.4234,
"step": 260
},
{
"epoch": 0.6675191815856778,
"grad_norm": 0.24820943480335378,
"learning_rate": 4.327014218009479e-05,
"loss": 0.3998,
"step": 261
},
{
"epoch": 0.670076726342711,
"grad_norm": 0.25992990540498473,
"learning_rate": 4.322274881516588e-05,
"loss": 0.4168,
"step": 262
},
{
"epoch": 0.6726342710997443,
"grad_norm": 0.261861520036362,
"learning_rate": 4.317535545023697e-05,
"loss": 0.4148,
"step": 263
},
{
"epoch": 0.6751918158567775,
"grad_norm": 0.26644356287497634,
"learning_rate": 4.312796208530806e-05,
"loss": 0.4345,
"step": 264
},
{
"epoch": 0.6777493606138107,
"grad_norm": 0.2666945078617733,
"learning_rate": 4.308056872037915e-05,
"loss": 0.429,
"step": 265
},
{
"epoch": 0.680306905370844,
"grad_norm": 0.23998424454638398,
"learning_rate": 4.303317535545024e-05,
"loss": 0.4226,
"step": 266
},
{
"epoch": 0.6828644501278772,
"grad_norm": 0.2530577923125785,
"learning_rate": 4.298578199052133e-05,
"loss": 0.4078,
"step": 267
},
{
"epoch": 0.6854219948849105,
"grad_norm": 0.2532304497913718,
"learning_rate": 4.293838862559242e-05,
"loss": 0.4157,
"step": 268
},
{
"epoch": 0.6879795396419437,
"grad_norm": 0.25183699854529734,
"learning_rate": 4.2890995260663506e-05,
"loss": 0.4213,
"step": 269
},
{
"epoch": 0.690537084398977,
"grad_norm": 0.26547907225114403,
"learning_rate": 4.28436018957346e-05,
"loss": 0.4213,
"step": 270
},
{
"epoch": 0.6930946291560103,
"grad_norm": 0.247119162528651,
"learning_rate": 4.279620853080569e-05,
"loss": 0.4441,
"step": 271
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.2802387698001237,
"learning_rate": 4.2748815165876774e-05,
"loss": 0.3961,
"step": 272
},
{
"epoch": 0.6982097186700768,
"grad_norm": 0.26948037359199567,
"learning_rate": 4.270142180094787e-05,
"loss": 0.4245,
"step": 273
},
{
"epoch": 0.7007672634271099,
"grad_norm": 0.26130649009882045,
"learning_rate": 4.265402843601896e-05,
"loss": 0.4322,
"step": 274
},
{
"epoch": 0.7033248081841432,
"grad_norm": 0.27770444806162603,
"learning_rate": 4.260663507109005e-05,
"loss": 0.4202,
"step": 275
},
{
"epoch": 0.7058823529411765,
"grad_norm": 0.2725938450444014,
"learning_rate": 4.255924170616114e-05,
"loss": 0.4287,
"step": 276
},
{
"epoch": 0.7084398976982097,
"grad_norm": 0.27389105466937425,
"learning_rate": 4.251184834123223e-05,
"loss": 0.4409,
"step": 277
},
{
"epoch": 0.710997442455243,
"grad_norm": 0.2559589781819663,
"learning_rate": 4.246445497630332e-05,
"loss": 0.4239,
"step": 278
},
{
"epoch": 0.7135549872122762,
"grad_norm": 0.27905262687916627,
"learning_rate": 4.2417061611374406e-05,
"loss": 0.4012,
"step": 279
},
{
"epoch": 0.7161125319693095,
"grad_norm": 0.23334151341578974,
"learning_rate": 4.23696682464455e-05,
"loss": 0.4046,
"step": 280
},
{
"epoch": 0.7186700767263428,
"grad_norm": 0.268496141900923,
"learning_rate": 4.232227488151659e-05,
"loss": 0.4333,
"step": 281
},
{
"epoch": 0.7212276214833759,
"grad_norm": 0.23917839522777942,
"learning_rate": 4.227488151658768e-05,
"loss": 0.4199,
"step": 282
},
{
"epoch": 0.7237851662404092,
"grad_norm": 0.2550111302110382,
"learning_rate": 4.2227488151658765e-05,
"loss": 0.4323,
"step": 283
},
{
"epoch": 0.7263427109974424,
"grad_norm": 0.23586654241099228,
"learning_rate": 4.218009478672986e-05,
"loss": 0.4332,
"step": 284
},
{
"epoch": 0.7289002557544757,
"grad_norm": 0.23396222749154336,
"learning_rate": 4.2132701421800954e-05,
"loss": 0.4269,
"step": 285
},
{
"epoch": 0.731457800511509,
"grad_norm": 0.24604087944261607,
"learning_rate": 4.208530805687204e-05,
"loss": 0.4255,
"step": 286
},
{
"epoch": 0.7340153452685422,
"grad_norm": 0.2357151023733209,
"learning_rate": 4.203791469194313e-05,
"loss": 0.4153,
"step": 287
},
{
"epoch": 0.7365728900255755,
"grad_norm": 0.2737358478652627,
"learning_rate": 4.199052132701422e-05,
"loss": 0.4122,
"step": 288
},
{
"epoch": 0.7391304347826086,
"grad_norm": 0.24451563415626945,
"learning_rate": 4.194312796208531e-05,
"loss": 0.4185,
"step": 289
},
{
"epoch": 0.7416879795396419,
"grad_norm": 0.2541923450282548,
"learning_rate": 4.18957345971564e-05,
"loss": 0.433,
"step": 290
},
{
"epoch": 0.7442455242966752,
"grad_norm": 0.26598804764295264,
"learning_rate": 4.184834123222749e-05,
"loss": 0.4331,
"step": 291
},
{
"epoch": 0.7468030690537084,
"grad_norm": 0.2652230008961156,
"learning_rate": 4.180094786729858e-05,
"loss": 0.4055,
"step": 292
},
{
"epoch": 0.7493606138107417,
"grad_norm": 0.2795715968348066,
"learning_rate": 4.175355450236967e-05,
"loss": 0.445,
"step": 293
},
{
"epoch": 0.7519181585677749,
"grad_norm": 0.27501165211060447,
"learning_rate": 4.1706161137440756e-05,
"loss": 0.4159,
"step": 294
},
{
"epoch": 0.7544757033248082,
"grad_norm": 0.2739979913068721,
"learning_rate": 4.1658767772511854e-05,
"loss": 0.4104,
"step": 295
},
{
"epoch": 0.7570332480818415,
"grad_norm": 0.28465485095011533,
"learning_rate": 4.1611374407582945e-05,
"loss": 0.4244,
"step": 296
},
{
"epoch": 0.7595907928388747,
"grad_norm": 0.23976720007281227,
"learning_rate": 4.156398104265403e-05,
"loss": 0.4295,
"step": 297
},
{
"epoch": 0.7621483375959079,
"grad_norm": 0.3088498871060993,
"learning_rate": 4.151658767772512e-05,
"loss": 0.4092,
"step": 298
},
{
"epoch": 0.7647058823529411,
"grad_norm": 0.22770658779763117,
"learning_rate": 4.146919431279621e-05,
"loss": 0.4269,
"step": 299
},
{
"epoch": 0.7672634271099744,
"grad_norm": 0.27958609884503893,
"learning_rate": 4.1421800947867304e-05,
"loss": 0.4044,
"step": 300
},
{
"epoch": 0.7698209718670077,
"grad_norm": 0.21729140703255853,
"learning_rate": 4.137440758293839e-05,
"loss": 0.4131,
"step": 301
},
{
"epoch": 0.7723785166240409,
"grad_norm": 0.2685972778786351,
"learning_rate": 4.132701421800948e-05,
"loss": 0.4207,
"step": 302
},
{
"epoch": 0.7749360613810742,
"grad_norm": 0.22146302445276972,
"learning_rate": 4.127962085308057e-05,
"loss": 0.4242,
"step": 303
},
{
"epoch": 0.7774936061381074,
"grad_norm": 0.246088542123556,
"learning_rate": 4.123222748815166e-05,
"loss": 0.4141,
"step": 304
},
{
"epoch": 0.7800511508951407,
"grad_norm": 0.2601313122186582,
"learning_rate": 4.118483412322275e-05,
"loss": 0.4139,
"step": 305
},
{
"epoch": 0.782608695652174,
"grad_norm": 0.24464325806612688,
"learning_rate": 4.113744075829384e-05,
"loss": 0.4163,
"step": 306
},
{
"epoch": 0.7851662404092071,
"grad_norm": 0.2651808280050511,
"learning_rate": 4.1090047393364936e-05,
"loss": 0.4306,
"step": 307
},
{
"epoch": 0.7877237851662404,
"grad_norm": 0.30621497925153024,
"learning_rate": 4.104265402843602e-05,
"loss": 0.4142,
"step": 308
},
{
"epoch": 0.7902813299232737,
"grad_norm": 0.27574828742072455,
"learning_rate": 4.099526066350711e-05,
"loss": 0.4194,
"step": 309
},
{
"epoch": 0.7928388746803069,
"grad_norm": 0.2646206797692572,
"learning_rate": 4.0947867298578204e-05,
"loss": 0.4338,
"step": 310
},
{
"epoch": 0.7953964194373402,
"grad_norm": 0.2953561111239538,
"learning_rate": 4.090047393364929e-05,
"loss": 0.4354,
"step": 311
},
{
"epoch": 0.7979539641943734,
"grad_norm": 0.2679304891781562,
"learning_rate": 4.085308056872038e-05,
"loss": 0.3996,
"step": 312
},
{
"epoch": 0.8005115089514067,
"grad_norm": 0.2614240488716786,
"learning_rate": 4.080568720379147e-05,
"loss": 0.4177,
"step": 313
},
{
"epoch": 0.80306905370844,
"grad_norm": 0.265506214792124,
"learning_rate": 4.075829383886256e-05,
"loss": 0.4229,
"step": 314
},
{
"epoch": 0.8056265984654731,
"grad_norm": 0.27403664060217564,
"learning_rate": 4.071090047393365e-05,
"loss": 0.4111,
"step": 315
},
{
"epoch": 0.8081841432225064,
"grad_norm": 0.27566927450054673,
"learning_rate": 4.066350710900474e-05,
"loss": 0.4186,
"step": 316
},
{
"epoch": 0.8107416879795396,
"grad_norm": 0.2432325969682962,
"learning_rate": 4.061611374407583e-05,
"loss": 0.4317,
"step": 317
},
{
"epoch": 0.8132992327365729,
"grad_norm": 0.3100835908713653,
"learning_rate": 4.056872037914692e-05,
"loss": 0.4263,
"step": 318
},
{
"epoch": 0.8158567774936062,
"grad_norm": 0.22437352803704477,
"learning_rate": 4.052132701421801e-05,
"loss": 0.4255,
"step": 319
},
{
"epoch": 0.8184143222506394,
"grad_norm": 0.2922741159014344,
"learning_rate": 4.0473933649289103e-05,
"loss": 0.4211,
"step": 320
},
{
"epoch": 0.8209718670076727,
"grad_norm": 0.24988657961825148,
"learning_rate": 4.0426540284360195e-05,
"loss": 0.424,
"step": 321
},
{
"epoch": 0.8235294117647058,
"grad_norm": 0.26255657346142036,
"learning_rate": 4.037914691943128e-05,
"loss": 0.4227,
"step": 322
},
{
"epoch": 0.8260869565217391,
"grad_norm": 0.30330186929682673,
"learning_rate": 4.033175355450237e-05,
"loss": 0.4074,
"step": 323
},
{
"epoch": 0.8286445012787724,
"grad_norm": 0.2545401531861922,
"learning_rate": 4.028436018957346e-05,
"loss": 0.4229,
"step": 324
},
{
"epoch": 0.8312020460358056,
"grad_norm": 0.33076162934856534,
"learning_rate": 4.023696682464455e-05,
"loss": 0.4236,
"step": 325
},
{
"epoch": 0.8337595907928389,
"grad_norm": 0.2595060997444347,
"learning_rate": 4.018957345971564e-05,
"loss": 0.3957,
"step": 326
},
{
"epoch": 0.8363171355498721,
"grad_norm": 0.27331289296315875,
"learning_rate": 4.014218009478673e-05,
"loss": 0.3975,
"step": 327
},
{
"epoch": 0.8388746803069054,
"grad_norm": 0.3262160188917236,
"learning_rate": 4.009478672985782e-05,
"loss": 0.4149,
"step": 328
},
{
"epoch": 0.8414322250639387,
"grad_norm": 0.28084918191171054,
"learning_rate": 4.004739336492891e-05,
"loss": 0.3999,
"step": 329
},
{
"epoch": 0.8439897698209718,
"grad_norm": 0.28489832877056614,
"learning_rate": 4e-05,
"loss": 0.4134,
"step": 330
},
{
"epoch": 0.8465473145780051,
"grad_norm": 0.28245197788542203,
"learning_rate": 3.9952606635071095e-05,
"loss": 0.4169,
"step": 331
},
{
"epoch": 0.8491048593350383,
"grad_norm": 0.2637012101965425,
"learning_rate": 3.9905213270142186e-05,
"loss": 0.4218,
"step": 332
},
{
"epoch": 0.8516624040920716,
"grad_norm": 0.25239050580322964,
"learning_rate": 3.985781990521327e-05,
"loss": 0.403,
"step": 333
},
{
"epoch": 0.8542199488491049,
"grad_norm": 0.3242230481439879,
"learning_rate": 3.981042654028436e-05,
"loss": 0.4413,
"step": 334
},
{
"epoch": 0.8567774936061381,
"grad_norm": 0.284310422864808,
"learning_rate": 3.976303317535545e-05,
"loss": 0.3992,
"step": 335
},
{
"epoch": 0.8593350383631714,
"grad_norm": 0.44681533776592774,
"learning_rate": 3.9715639810426545e-05,
"loss": 0.427,
"step": 336
},
{
"epoch": 0.8618925831202046,
"grad_norm": 0.276866045564762,
"learning_rate": 3.966824644549763e-05,
"loss": 0.4166,
"step": 337
},
{
"epoch": 0.8644501278772379,
"grad_norm": 0.2645666241102728,
"learning_rate": 3.962085308056872e-05,
"loss": 0.4018,
"step": 338
},
{
"epoch": 0.8670076726342711,
"grad_norm": 0.24575688741880164,
"learning_rate": 3.957345971563981e-05,
"loss": 0.4103,
"step": 339
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.27234369778819617,
"learning_rate": 3.95260663507109e-05,
"loss": 0.4178,
"step": 340
},
{
"epoch": 0.8721227621483376,
"grad_norm": 0.2510614688018398,
"learning_rate": 3.9478672985781994e-05,
"loss": 0.4124,
"step": 341
},
{
"epoch": 0.8746803069053708,
"grad_norm": 0.26748644233845587,
"learning_rate": 3.9431279620853086e-05,
"loss": 0.414,
"step": 342
},
{
"epoch": 0.8772378516624041,
"grad_norm": 0.2839803657663104,
"learning_rate": 3.938388625592418e-05,
"loss": 0.4089,
"step": 343
},
{
"epoch": 0.8797953964194374,
"grad_norm": 0.2797111880377059,
"learning_rate": 3.933649289099526e-05,
"loss": 0.4143,
"step": 344
},
{
"epoch": 0.8823529411764706,
"grad_norm": 0.2889574353504485,
"learning_rate": 3.928909952606635e-05,
"loss": 0.4224,
"step": 345
},
{
"epoch": 0.8849104859335039,
"grad_norm": 0.2661360330372934,
"learning_rate": 3.9241706161137444e-05,
"loss": 0.4002,
"step": 346
},
{
"epoch": 0.887468030690537,
"grad_norm": 0.2771219438556858,
"learning_rate": 3.919431279620853e-05,
"loss": 0.4113,
"step": 347
},
{
"epoch": 0.8900255754475703,
"grad_norm": 0.27519975509219513,
"learning_rate": 3.914691943127962e-05,
"loss": 0.4191,
"step": 348
},
{
"epoch": 0.8925831202046036,
"grad_norm": 0.2928986459591194,
"learning_rate": 3.909952606635071e-05,
"loss": 0.4233,
"step": 349
},
{
"epoch": 0.8951406649616368,
"grad_norm": 0.2516706010333049,
"learning_rate": 3.90521327014218e-05,
"loss": 0.4012,
"step": 350
},
{
"epoch": 0.8976982097186701,
"grad_norm": 0.2408367305868911,
"learning_rate": 3.900473933649289e-05,
"loss": 0.409,
"step": 351
},
{
"epoch": 0.9002557544757033,
"grad_norm": 0.27279698596719115,
"learning_rate": 3.8957345971563986e-05,
"loss": 0.4068,
"step": 352
},
{
"epoch": 0.9028132992327366,
"grad_norm": 0.25724144072901317,
"learning_rate": 3.890995260663508e-05,
"loss": 0.4048,
"step": 353
},
{
"epoch": 0.9053708439897699,
"grad_norm": 0.22699353109114737,
"learning_rate": 3.886255924170616e-05,
"loss": 0.4005,
"step": 354
},
{
"epoch": 0.907928388746803,
"grad_norm": 0.248751842398148,
"learning_rate": 3.881516587677725e-05,
"loss": 0.403,
"step": 355
},
{
"epoch": 0.9104859335038363,
"grad_norm": 0.29922749419927136,
"learning_rate": 3.8767772511848344e-05,
"loss": 0.4255,
"step": 356
},
{
"epoch": 0.9130434782608695,
"grad_norm": 0.24165253803081185,
"learning_rate": 3.8720379146919435e-05,
"loss": 0.3939,
"step": 357
},
{
"epoch": 0.9156010230179028,
"grad_norm": 0.26769384614675706,
"learning_rate": 3.867298578199052e-05,
"loss": 0.3881,
"step": 358
},
{
"epoch": 0.9181585677749361,
"grad_norm": 0.24501952061738294,
"learning_rate": 3.862559241706161e-05,
"loss": 0.412,
"step": 359
},
{
"epoch": 0.9207161125319693,
"grad_norm": 0.27781395797316877,
"learning_rate": 3.85781990521327e-05,
"loss": 0.4293,
"step": 360
},
{
"epoch": 0.9232736572890026,
"grad_norm": 0.22892488592677732,
"learning_rate": 3.8530805687203794e-05,
"loss": 0.4069,
"step": 361
},
{
"epoch": 0.9258312020460358,
"grad_norm": 0.258222796507594,
"learning_rate": 3.848341232227488e-05,
"loss": 0.4177,
"step": 362
},
{
"epoch": 0.928388746803069,
"grad_norm": 0.22668062864053168,
"learning_rate": 3.843601895734598e-05,
"loss": 0.4012,
"step": 363
},
{
"epoch": 0.9309462915601023,
"grad_norm": 0.29919710610032196,
"learning_rate": 3.838862559241707e-05,
"loss": 0.3971,
"step": 364
},
{
"epoch": 0.9335038363171355,
"grad_norm": 0.25611276582674614,
"learning_rate": 3.834123222748815e-05,
"loss": 0.4071,
"step": 365
},
{
"epoch": 0.9360613810741688,
"grad_norm": 0.24646222411688562,
"learning_rate": 3.8293838862559244e-05,
"loss": 0.3993,
"step": 366
},
{
"epoch": 0.9386189258312021,
"grad_norm": 0.26353127236434676,
"learning_rate": 3.8246445497630335e-05,
"loss": 0.4042,
"step": 367
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.23542690864108376,
"learning_rate": 3.8199052132701427e-05,
"loss": 0.3971,
"step": 368
},
{
"epoch": 0.9437340153452686,
"grad_norm": 0.28245650020992513,
"learning_rate": 3.815165876777251e-05,
"loss": 0.4044,
"step": 369
},
{
"epoch": 0.9462915601023018,
"grad_norm": 0.241903542321646,
"learning_rate": 3.81042654028436e-05,
"loss": 0.4054,
"step": 370
},
{
"epoch": 0.948849104859335,
"grad_norm": 0.2378788913607164,
"learning_rate": 3.8056872037914694e-05,
"loss": 0.4125,
"step": 371
},
{
"epoch": 0.9514066496163683,
"grad_norm": 0.2804121730578267,
"learning_rate": 3.8009478672985785e-05,
"loss": 0.421,
"step": 372
},
{
"epoch": 0.9539641943734015,
"grad_norm": 0.23484030136789275,
"learning_rate": 3.796208530805687e-05,
"loss": 0.4112,
"step": 373
},
{
"epoch": 0.9565217391304348,
"grad_norm": 0.27396114311616715,
"learning_rate": 3.791469194312796e-05,
"loss": 0.4204,
"step": 374
},
{
"epoch": 0.959079283887468,
"grad_norm": 0.2393059668656863,
"learning_rate": 3.786729857819906e-05,
"loss": 0.4221,
"step": 375
},
{
"epoch": 0.9616368286445013,
"grad_norm": 0.2482404610854873,
"learning_rate": 3.7819905213270144e-05,
"loss": 0.4078,
"step": 376
},
{
"epoch": 0.9641943734015346,
"grad_norm": 0.2187225932256056,
"learning_rate": 3.7772511848341235e-05,
"loss": 0.4012,
"step": 377
},
{
"epoch": 0.9667519181585678,
"grad_norm": 0.2876923866076237,
"learning_rate": 3.7725118483412326e-05,
"loss": 0.4252,
"step": 378
},
{
"epoch": 0.969309462915601,
"grad_norm": 0.21352306199272536,
"learning_rate": 3.767772511848342e-05,
"loss": 0.3966,
"step": 379
},
{
"epoch": 0.9718670076726342,
"grad_norm": 0.24918249981369345,
"learning_rate": 3.76303317535545e-05,
"loss": 0.415,
"step": 380
},
{
"epoch": 0.9744245524296675,
"grad_norm": 0.23387991185870513,
"learning_rate": 3.7582938388625594e-05,
"loss": 0.4127,
"step": 381
},
{
"epoch": 0.9769820971867008,
"grad_norm": 0.29074570845452075,
"learning_rate": 3.7535545023696685e-05,
"loss": 0.4275,
"step": 382
},
{
"epoch": 0.979539641943734,
"grad_norm": 0.26317053639627985,
"learning_rate": 3.748815165876777e-05,
"loss": 0.4294,
"step": 383
},
{
"epoch": 0.9820971867007673,
"grad_norm": 0.2686101535519852,
"learning_rate": 3.744075829383886e-05,
"loss": 0.4025,
"step": 384
},
{
"epoch": 0.9846547314578005,
"grad_norm": 0.23512146035375164,
"learning_rate": 3.739336492890995e-05,
"loss": 0.4156,
"step": 385
},
{
"epoch": 0.9872122762148338,
"grad_norm": 0.2445528540082093,
"learning_rate": 3.734597156398105e-05,
"loss": 0.411,
"step": 386
},
{
"epoch": 0.989769820971867,
"grad_norm": 0.25629958731478186,
"learning_rate": 3.7298578199052135e-05,
"loss": 0.4146,
"step": 387
},
{
"epoch": 0.9923273657289002,
"grad_norm": 0.22796776248894252,
"learning_rate": 3.7251184834123226e-05,
"loss": 0.4087,
"step": 388
},
{
"epoch": 0.9948849104859335,
"grad_norm": 0.2958838240159185,
"learning_rate": 3.720379146919432e-05,
"loss": 0.4099,
"step": 389
},
{
"epoch": 0.9974424552429667,
"grad_norm": 0.29381146676513115,
"learning_rate": 3.71563981042654e-05,
"loss": 0.414,
"step": 390
},
{
"epoch": 1.0,
"grad_norm": 0.24652387465895462,
"learning_rate": 3.7109004739336493e-05,
"loss": 0.406,
"step": 391
},
{
"epoch": 1.0025575447570332,
"grad_norm": 0.34877694988477503,
"learning_rate": 3.7061611374407585e-05,
"loss": 0.3503,
"step": 392
},
{
"epoch": 1.0051150895140666,
"grad_norm": 0.253308743207643,
"learning_rate": 3.7014218009478676e-05,
"loss": 0.3396,
"step": 393
},
{
"epoch": 1.0076726342710998,
"grad_norm": 0.25868870232786395,
"learning_rate": 3.696682464454976e-05,
"loss": 0.3479,
"step": 394
},
{
"epoch": 1.010230179028133,
"grad_norm": 0.2971613524674976,
"learning_rate": 3.691943127962085e-05,
"loss": 0.3422,
"step": 395
},
{
"epoch": 1.0127877237851663,
"grad_norm": 0.2906921720555448,
"learning_rate": 3.687203791469194e-05,
"loss": 0.3554,
"step": 396
},
{
"epoch": 1.0153452685421995,
"grad_norm": 0.2875908700888308,
"learning_rate": 3.6824644549763035e-05,
"loss": 0.3291,
"step": 397
},
{
"epoch": 1.0179028132992327,
"grad_norm": 0.26243291597976126,
"learning_rate": 3.6777251184834126e-05,
"loss": 0.3563,
"step": 398
},
{
"epoch": 1.020460358056266,
"grad_norm": 0.2730126516412927,
"learning_rate": 3.672985781990522e-05,
"loss": 0.3292,
"step": 399
},
{
"epoch": 1.0230179028132993,
"grad_norm": 0.29682604006588903,
"learning_rate": 3.668246445497631e-05,
"loss": 0.3461,
"step": 400
},
{
"epoch": 1.0255754475703325,
"grad_norm": 0.2494027953748241,
"learning_rate": 3.663507109004739e-05,
"loss": 0.3491,
"step": 401
},
{
"epoch": 1.0281329923273657,
"grad_norm": 0.2538094727914758,
"learning_rate": 3.6587677725118485e-05,
"loss": 0.3406,
"step": 402
},
{
"epoch": 1.030690537084399,
"grad_norm": 0.28915662612861087,
"learning_rate": 3.6540284360189576e-05,
"loss": 0.3408,
"step": 403
},
{
"epoch": 1.0332480818414322,
"grad_norm": 0.24591203347051302,
"learning_rate": 3.649289099526067e-05,
"loss": 0.337,
"step": 404
},
{
"epoch": 1.0358056265984654,
"grad_norm": 0.2871114071516867,
"learning_rate": 3.644549763033175e-05,
"loss": 0.3347,
"step": 405
},
{
"epoch": 1.0383631713554988,
"grad_norm": 0.2524744240235806,
"learning_rate": 3.639810426540284e-05,
"loss": 0.3441,
"step": 406
},
{
"epoch": 1.040920716112532,
"grad_norm": 0.2630583826634349,
"learning_rate": 3.6350710900473935e-05,
"loss": 0.3099,
"step": 407
},
{
"epoch": 1.0434782608695652,
"grad_norm": 0.2570358498212408,
"learning_rate": 3.6303317535545026e-05,
"loss": 0.3211,
"step": 408
},
{
"epoch": 1.0460358056265984,
"grad_norm": 0.26431307397410003,
"learning_rate": 3.625592417061612e-05,
"loss": 0.35,
"step": 409
},
{
"epoch": 1.0485933503836318,
"grad_norm": 0.27463747349361467,
"learning_rate": 3.620853080568721e-05,
"loss": 0.3494,
"step": 410
},
{
"epoch": 1.051150895140665,
"grad_norm": 0.24666921280022072,
"learning_rate": 3.61611374407583e-05,
"loss": 0.341,
"step": 411
},
{
"epoch": 1.0537084398976981,
"grad_norm": 0.2505495844763562,
"learning_rate": 3.6113744075829384e-05,
"loss": 0.3412,
"step": 412
},
{
"epoch": 1.0562659846547315,
"grad_norm": 0.2506374206193608,
"learning_rate": 3.6066350710900476e-05,
"loss": 0.3229,
"step": 413
},
{
"epoch": 1.0588235294117647,
"grad_norm": 0.24249666251287566,
"learning_rate": 3.601895734597157e-05,
"loss": 0.3329,
"step": 414
},
{
"epoch": 1.061381074168798,
"grad_norm": 0.2618704099040068,
"learning_rate": 3.597156398104266e-05,
"loss": 0.3462,
"step": 415
},
{
"epoch": 1.0639386189258313,
"grad_norm": 0.25454325976096887,
"learning_rate": 3.592417061611374e-05,
"loss": 0.319,
"step": 416
},
{
"epoch": 1.0664961636828645,
"grad_norm": 0.3012500683553219,
"learning_rate": 3.5876777251184834e-05,
"loss": 0.3452,
"step": 417
},
{
"epoch": 1.0690537084398977,
"grad_norm": 0.2310352458746118,
"learning_rate": 3.5829383886255926e-05,
"loss": 0.3203,
"step": 418
},
{
"epoch": 1.0716112531969308,
"grad_norm": 0.2867380051579317,
"learning_rate": 3.578199052132701e-05,
"loss": 0.3408,
"step": 419
},
{
"epoch": 1.0741687979539642,
"grad_norm": 0.24642924252308632,
"learning_rate": 3.573459715639811e-05,
"loss": 0.3247,
"step": 420
},
{
"epoch": 1.0767263427109974,
"grad_norm": 0.22539243089747027,
"learning_rate": 3.56872037914692e-05,
"loss": 0.3282,
"step": 421
},
{
"epoch": 1.0792838874680306,
"grad_norm": 0.2508510372019925,
"learning_rate": 3.563981042654029e-05,
"loss": 0.3444,
"step": 422
},
{
"epoch": 1.081841432225064,
"grad_norm": 0.25272955853952195,
"learning_rate": 3.5592417061611376e-05,
"loss": 0.3366,
"step": 423
},
{
"epoch": 1.0843989769820972,
"grad_norm": 0.2272026636889727,
"learning_rate": 3.554502369668247e-05,
"loss": 0.3516,
"step": 424
},
{
"epoch": 1.0869565217391304,
"grad_norm": 0.27462834447503987,
"learning_rate": 3.549763033175356e-05,
"loss": 0.3374,
"step": 425
},
{
"epoch": 1.0895140664961638,
"grad_norm": 0.2128026835115876,
"learning_rate": 3.545023696682464e-05,
"loss": 0.3336,
"step": 426
},
{
"epoch": 1.092071611253197,
"grad_norm": 0.2354379611105053,
"learning_rate": 3.5402843601895734e-05,
"loss": 0.3379,
"step": 427
},
{
"epoch": 1.0946291560102301,
"grad_norm": 0.224481929706568,
"learning_rate": 3.5355450236966825e-05,
"loss": 0.3564,
"step": 428
},
{
"epoch": 1.0971867007672633,
"grad_norm": 0.21368714222847607,
"learning_rate": 3.530805687203792e-05,
"loss": 0.3141,
"step": 429
},
{
"epoch": 1.0997442455242967,
"grad_norm": 0.23974097995132895,
"learning_rate": 3.5260663507109e-05,
"loss": 0.3313,
"step": 430
},
{
"epoch": 1.10230179028133,
"grad_norm": 1.5113616428603722,
"learning_rate": 3.52132701421801e-05,
"loss": 0.3288,
"step": 431
},
{
"epoch": 1.104859335038363,
"grad_norm": 0.2584287203231182,
"learning_rate": 3.516587677725119e-05,
"loss": 0.3347,
"step": 432
},
{
"epoch": 1.1074168797953965,
"grad_norm": 0.19574817933562375,
"learning_rate": 3.5118483412322275e-05,
"loss": 0.3204,
"step": 433
},
{
"epoch": 1.1099744245524297,
"grad_norm": 0.24894576509043242,
"learning_rate": 3.507109004739337e-05,
"loss": 0.3406,
"step": 434
},
{
"epoch": 1.1125319693094629,
"grad_norm": 0.22605511670011208,
"learning_rate": 3.502369668246446e-05,
"loss": 0.3372,
"step": 435
},
{
"epoch": 1.1150895140664963,
"grad_norm": 0.2426838758794149,
"learning_rate": 3.497630331753555e-05,
"loss": 0.3354,
"step": 436
},
{
"epoch": 1.1176470588235294,
"grad_norm": 0.22312946234793202,
"learning_rate": 3.4928909952606634e-05,
"loss": 0.3283,
"step": 437
},
{
"epoch": 1.1202046035805626,
"grad_norm": 0.24548486238399964,
"learning_rate": 3.4881516587677725e-05,
"loss": 0.3428,
"step": 438
},
{
"epoch": 1.1227621483375958,
"grad_norm": 0.22862518373154317,
"learning_rate": 3.4834123222748817e-05,
"loss": 0.3296,
"step": 439
},
{
"epoch": 1.1253196930946292,
"grad_norm": 0.2415393319131855,
"learning_rate": 3.478672985781991e-05,
"loss": 0.3366,
"step": 440
},
{
"epoch": 1.1278772378516624,
"grad_norm": 0.24350557581856444,
"learning_rate": 3.473933649289099e-05,
"loss": 0.3331,
"step": 441
},
{
"epoch": 1.1304347826086956,
"grad_norm": 0.22906222897576142,
"learning_rate": 3.4691943127962084e-05,
"loss": 0.3383,
"step": 442
},
{
"epoch": 1.132992327365729,
"grad_norm": 0.9570276654007269,
"learning_rate": 3.464454976303318e-05,
"loss": 0.3383,
"step": 443
},
{
"epoch": 1.1355498721227621,
"grad_norm": 0.4221562491079337,
"learning_rate": 3.4597156398104267e-05,
"loss": 0.3493,
"step": 444
},
{
"epoch": 1.1381074168797953,
"grad_norm": 0.7247823264413438,
"learning_rate": 3.454976303317536e-05,
"loss": 0.3503,
"step": 445
},
{
"epoch": 1.1406649616368287,
"grad_norm": 0.27292208588198535,
"learning_rate": 3.450236966824645e-05,
"loss": 0.3491,
"step": 446
},
{
"epoch": 1.143222506393862,
"grad_norm": 0.24846288711065778,
"learning_rate": 3.445497630331754e-05,
"loss": 0.34,
"step": 447
},
{
"epoch": 1.145780051150895,
"grad_norm": 0.28289846837651944,
"learning_rate": 3.4407582938388625e-05,
"loss": 0.3395,
"step": 448
},
{
"epoch": 1.1483375959079285,
"grad_norm": 0.20360202393964177,
"learning_rate": 3.4360189573459716e-05,
"loss": 0.336,
"step": 449
},
{
"epoch": 1.1508951406649617,
"grad_norm": 0.26795912135731376,
"learning_rate": 3.431279620853081e-05,
"loss": 0.3363,
"step": 450
},
{
"epoch": 1.1534526854219949,
"grad_norm": 0.24482207535162454,
"learning_rate": 3.42654028436019e-05,
"loss": 0.3263,
"step": 451
},
{
"epoch": 1.156010230179028,
"grad_norm": 1.091037041185637,
"learning_rate": 3.4218009478672984e-05,
"loss": 0.3319,
"step": 452
},
{
"epoch": 1.1585677749360614,
"grad_norm": 0.25708621832570655,
"learning_rate": 3.4170616113744075e-05,
"loss": 0.3456,
"step": 453
},
{
"epoch": 1.1611253196930946,
"grad_norm": 0.22978489335863728,
"learning_rate": 3.412322274881517e-05,
"loss": 0.3453,
"step": 454
},
{
"epoch": 1.1636828644501278,
"grad_norm": 0.23661531026909,
"learning_rate": 3.407582938388626e-05,
"loss": 0.3311,
"step": 455
},
{
"epoch": 1.1662404092071612,
"grad_norm": 0.26058801823717986,
"learning_rate": 3.402843601895735e-05,
"loss": 0.3293,
"step": 456
},
{
"epoch": 1.1687979539641944,
"grad_norm": 0.2213831357242978,
"learning_rate": 3.398104265402844e-05,
"loss": 0.3448,
"step": 457
},
{
"epoch": 1.1713554987212276,
"grad_norm": 0.22314933582706364,
"learning_rate": 3.393364928909953e-05,
"loss": 0.3319,
"step": 458
},
{
"epoch": 1.1739130434782608,
"grad_norm": 0.2368130291318867,
"learning_rate": 3.3886255924170616e-05,
"loss": 0.3417,
"step": 459
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.23299474747082943,
"learning_rate": 3.383886255924171e-05,
"loss": 0.3434,
"step": 460
},
{
"epoch": 1.1790281329923273,
"grad_norm": 0.23122859916384542,
"learning_rate": 3.37914691943128e-05,
"loss": 0.353,
"step": 461
},
{
"epoch": 1.1815856777493605,
"grad_norm": 0.23133544704817127,
"learning_rate": 3.3744075829383883e-05,
"loss": 0.3268,
"step": 462
},
{
"epoch": 1.184143222506394,
"grad_norm": 0.2279777026926834,
"learning_rate": 3.3696682464454975e-05,
"loss": 0.3427,
"step": 463
},
{
"epoch": 1.186700767263427,
"grad_norm": 0.24354305412664243,
"learning_rate": 3.3649289099526066e-05,
"loss": 0.3416,
"step": 464
},
{
"epoch": 1.1892583120204603,
"grad_norm": 0.23175483156511392,
"learning_rate": 3.360189573459716e-05,
"loss": 0.3253,
"step": 465
},
{
"epoch": 1.1918158567774937,
"grad_norm": 0.2505719500530794,
"learning_rate": 3.355450236966825e-05,
"loss": 0.357,
"step": 466
},
{
"epoch": 1.1943734015345269,
"grad_norm": 0.23794330588394164,
"learning_rate": 3.350710900473934e-05,
"loss": 0.3368,
"step": 467
},
{
"epoch": 1.19693094629156,
"grad_norm": 0.24430596176344385,
"learning_rate": 3.345971563981043e-05,
"loss": 0.3387,
"step": 468
},
{
"epoch": 1.1994884910485935,
"grad_norm": 0.22981260995180924,
"learning_rate": 3.3412322274881516e-05,
"loss": 0.3394,
"step": 469
},
{
"epoch": 1.2020460358056266,
"grad_norm": 0.26211223679278534,
"learning_rate": 3.336492890995261e-05,
"loss": 0.3319,
"step": 470
},
{
"epoch": 1.2046035805626598,
"grad_norm": 0.20949166867985375,
"learning_rate": 3.33175355450237e-05,
"loss": 0.3247,
"step": 471
},
{
"epoch": 1.207161125319693,
"grad_norm": 0.26920054172152863,
"learning_rate": 3.327014218009479e-05,
"loss": 0.3266,
"step": 472
},
{
"epoch": 1.2097186700767264,
"grad_norm": 0.23259269182122375,
"learning_rate": 3.3222748815165875e-05,
"loss": 0.336,
"step": 473
},
{
"epoch": 1.2122762148337596,
"grad_norm": 0.2544872114348285,
"learning_rate": 3.3175355450236966e-05,
"loss": 0.3288,
"step": 474
},
{
"epoch": 1.2148337595907928,
"grad_norm": 0.23096314256849135,
"learning_rate": 3.312796208530806e-05,
"loss": 0.338,
"step": 475
},
{
"epoch": 1.2173913043478262,
"grad_norm": 0.2714305850528602,
"learning_rate": 3.308056872037915e-05,
"loss": 0.3447,
"step": 476
},
{
"epoch": 1.2199488491048593,
"grad_norm": 0.27398730927997655,
"learning_rate": 3.303317535545024e-05,
"loss": 0.328,
"step": 477
},
{
"epoch": 1.2225063938618925,
"grad_norm": 0.21842573163699253,
"learning_rate": 3.298578199052133e-05,
"loss": 0.3434,
"step": 478
},
{
"epoch": 1.2250639386189257,
"grad_norm": 0.24231426743387735,
"learning_rate": 3.293838862559242e-05,
"loss": 0.3355,
"step": 479
},
{
"epoch": 1.227621483375959,
"grad_norm": 0.23387954201665254,
"learning_rate": 3.289099526066351e-05,
"loss": 0.3456,
"step": 480
},
{
"epoch": 1.2301790281329923,
"grad_norm": 0.2240321236806126,
"learning_rate": 3.28436018957346e-05,
"loss": 0.3376,
"step": 481
},
{
"epoch": 1.2327365728900257,
"grad_norm": 0.2261690321581938,
"learning_rate": 3.279620853080569e-05,
"loss": 0.3313,
"step": 482
},
{
"epoch": 1.2352941176470589,
"grad_norm": 0.27592615919196145,
"learning_rate": 3.274881516587678e-05,
"loss": 0.3385,
"step": 483
},
{
"epoch": 1.237851662404092,
"grad_norm": 0.1983777165414548,
"learning_rate": 3.2701421800947866e-05,
"loss": 0.3344,
"step": 484
},
{
"epoch": 1.2404092071611252,
"grad_norm": 0.25775855180422713,
"learning_rate": 3.265402843601896e-05,
"loss": 0.349,
"step": 485
},
{
"epoch": 1.2429667519181586,
"grad_norm": 0.21057070064196648,
"learning_rate": 3.260663507109005e-05,
"loss": 0.3226,
"step": 486
},
{
"epoch": 1.2455242966751918,
"grad_norm": 0.25264888053163403,
"learning_rate": 3.255924170616114e-05,
"loss": 0.3405,
"step": 487
},
{
"epoch": 1.248081841432225,
"grad_norm": 0.20358857621290893,
"learning_rate": 3.251184834123223e-05,
"loss": 0.3384,
"step": 488
},
{
"epoch": 1.2506393861892584,
"grad_norm": 0.2221188350040554,
"learning_rate": 3.246445497630332e-05,
"loss": 0.3592,
"step": 489
},
{
"epoch": 1.2531969309462916,
"grad_norm": 0.22907812456671411,
"learning_rate": 3.2417061611374414e-05,
"loss": 0.3509,
"step": 490
},
{
"epoch": 1.2557544757033248,
"grad_norm": 0.2203229089376764,
"learning_rate": 3.23696682464455e-05,
"loss": 0.3157,
"step": 491
},
{
"epoch": 1.258312020460358,
"grad_norm": 0.22923138047926875,
"learning_rate": 3.232227488151659e-05,
"loss": 0.3321,
"step": 492
},
{
"epoch": 1.2608695652173914,
"grad_norm": 0.20989998077940591,
"learning_rate": 3.227488151658768e-05,
"loss": 0.3293,
"step": 493
},
{
"epoch": 1.2634271099744245,
"grad_norm": 0.21568832380392375,
"learning_rate": 3.222748815165877e-05,
"loss": 0.3466,
"step": 494
},
{
"epoch": 1.265984654731458,
"grad_norm": 0.22068990151180867,
"learning_rate": 3.218009478672986e-05,
"loss": 0.3346,
"step": 495
},
{
"epoch": 1.2685421994884911,
"grad_norm": 0.22072570403379316,
"learning_rate": 3.213270142180095e-05,
"loss": 0.3415,
"step": 496
},
{
"epoch": 1.2710997442455243,
"grad_norm": 0.22130125503638862,
"learning_rate": 3.208530805687204e-05,
"loss": 0.3237,
"step": 497
},
{
"epoch": 1.2736572890025575,
"grad_norm": 0.21888368739994798,
"learning_rate": 3.2037914691943124e-05,
"loss": 0.3232,
"step": 498
},
{
"epoch": 1.2762148337595907,
"grad_norm": 0.23237851118888075,
"learning_rate": 3.1990521327014215e-05,
"loss": 0.3362,
"step": 499
},
{
"epoch": 1.278772378516624,
"grad_norm": 0.20408840058481117,
"learning_rate": 3.1943127962085314e-05,
"loss": 0.331,
"step": 500
},
{
"epoch": 1.2813299232736572,
"grad_norm": 0.25671575639210675,
"learning_rate": 3.18957345971564e-05,
"loss": 0.3483,
"step": 501
},
{
"epoch": 1.2838874680306906,
"grad_norm": 0.21146534380332607,
"learning_rate": 3.184834123222749e-05,
"loss": 0.3179,
"step": 502
},
{
"epoch": 1.2864450127877238,
"grad_norm": 0.22647977672526137,
"learning_rate": 3.180094786729858e-05,
"loss": 0.3371,
"step": 503
},
{
"epoch": 1.289002557544757,
"grad_norm": 0.2024444911259877,
"learning_rate": 3.175355450236967e-05,
"loss": 0.3208,
"step": 504
},
{
"epoch": 1.2915601023017902,
"grad_norm": 0.23292871699588752,
"learning_rate": 3.170616113744076e-05,
"loss": 0.3325,
"step": 505
},
{
"epoch": 1.2941176470588236,
"grad_norm": 0.24617143877927294,
"learning_rate": 3.165876777251185e-05,
"loss": 0.3456,
"step": 506
},
{
"epoch": 1.2966751918158568,
"grad_norm": 0.21521749139279983,
"learning_rate": 3.161137440758294e-05,
"loss": 0.3632,
"step": 507
},
{
"epoch": 1.29923273657289,
"grad_norm": 0.23710165276474784,
"learning_rate": 3.156398104265403e-05,
"loss": 0.3258,
"step": 508
},
{
"epoch": 1.3017902813299234,
"grad_norm": 0.2182566275526034,
"learning_rate": 3.1516587677725115e-05,
"loss": 0.3254,
"step": 509
},
{
"epoch": 1.3043478260869565,
"grad_norm": 0.2567309562573808,
"learning_rate": 3.1469194312796207e-05,
"loss": 0.3393,
"step": 510
},
{
"epoch": 1.3069053708439897,
"grad_norm": 0.21736576266820642,
"learning_rate": 3.1421800947867305e-05,
"loss": 0.3237,
"step": 511
},
{
"epoch": 1.309462915601023,
"grad_norm": 0.2476914139591077,
"learning_rate": 3.137440758293839e-05,
"loss": 0.3387,
"step": 512
},
{
"epoch": 1.3120204603580563,
"grad_norm": 0.21116880886723996,
"learning_rate": 3.132701421800948e-05,
"loss": 0.3342,
"step": 513
},
{
"epoch": 1.3145780051150895,
"grad_norm": 0.2282427820832504,
"learning_rate": 3.127962085308057e-05,
"loss": 0.3406,
"step": 514
},
{
"epoch": 1.317135549872123,
"grad_norm": 0.220656045586937,
"learning_rate": 3.123222748815166e-05,
"loss": 0.3481,
"step": 515
},
{
"epoch": 1.319693094629156,
"grad_norm": 0.21477244949218188,
"learning_rate": 3.118483412322275e-05,
"loss": 0.3417,
"step": 516
},
{
"epoch": 1.3222506393861893,
"grad_norm": 0.2123179538890313,
"learning_rate": 3.113744075829384e-05,
"loss": 0.3374,
"step": 517
},
{
"epoch": 1.3248081841432224,
"grad_norm": 0.20966406562861323,
"learning_rate": 3.109004739336493e-05,
"loss": 0.332,
"step": 518
},
{
"epoch": 1.3273657289002558,
"grad_norm": 0.1967874776267105,
"learning_rate": 3.104265402843602e-05,
"loss": 0.327,
"step": 519
},
{
"epoch": 1.329923273657289,
"grad_norm": 0.21447737012880227,
"learning_rate": 3.0995260663507106e-05,
"loss": 0.3342,
"step": 520
},
{
"epoch": 1.3324808184143222,
"grad_norm": 0.22702076072063435,
"learning_rate": 3.09478672985782e-05,
"loss": 0.357,
"step": 521
},
{
"epoch": 1.3350383631713556,
"grad_norm": 0.24746439681290008,
"learning_rate": 3.0900473933649296e-05,
"loss": 0.3489,
"step": 522
},
{
"epoch": 1.3375959079283888,
"grad_norm": 0.21236354577498476,
"learning_rate": 3.085308056872038e-05,
"loss": 0.309,
"step": 523
},
{
"epoch": 1.340153452685422,
"grad_norm": 0.21060912049882632,
"learning_rate": 3.080568720379147e-05,
"loss": 0.3191,
"step": 524
},
{
"epoch": 1.3427109974424551,
"grad_norm": 0.20505413275714032,
"learning_rate": 3.075829383886256e-05,
"loss": 0.3451,
"step": 525
},
{
"epoch": 1.3452685421994885,
"grad_norm": 0.24615234141005185,
"learning_rate": 3.0710900473933654e-05,
"loss": 0.3528,
"step": 526
},
{
"epoch": 1.3478260869565217,
"grad_norm": 0.22369032901485378,
"learning_rate": 3.066350710900474e-05,
"loss": 0.3284,
"step": 527
},
{
"epoch": 1.350383631713555,
"grad_norm": 0.22838183924629246,
"learning_rate": 3.061611374407583e-05,
"loss": 0.3374,
"step": 528
},
{
"epoch": 1.3529411764705883,
"grad_norm": 0.27105416556647893,
"learning_rate": 3.056872037914692e-05,
"loss": 0.3464,
"step": 529
},
{
"epoch": 1.3554987212276215,
"grad_norm": 0.20957984851168898,
"learning_rate": 3.052132701421801e-05,
"loss": 0.3284,
"step": 530
},
{
"epoch": 1.3580562659846547,
"grad_norm": 0.2320992461077895,
"learning_rate": 3.0473933649289098e-05,
"loss": 0.3486,
"step": 531
},
{
"epoch": 1.3606138107416879,
"grad_norm": 0.21826085112206514,
"learning_rate": 3.042654028436019e-05,
"loss": 0.3385,
"step": 532
},
{
"epoch": 1.3631713554987213,
"grad_norm": 0.2098867818685027,
"learning_rate": 3.0379146919431277e-05,
"loss": 0.3281,
"step": 533
},
{
"epoch": 1.3657289002557544,
"grad_norm": 0.20672602706389986,
"learning_rate": 3.0331753554502375e-05,
"loss": 0.3347,
"step": 534
},
{
"epoch": 1.3682864450127878,
"grad_norm": 0.19992143809636548,
"learning_rate": 3.0284360189573463e-05,
"loss": 0.3359,
"step": 535
},
{
"epoch": 1.370843989769821,
"grad_norm": 0.20352905078357075,
"learning_rate": 3.023696682464455e-05,
"loss": 0.3378,
"step": 536
},
{
"epoch": 1.3734015345268542,
"grad_norm": 0.20917931960339106,
"learning_rate": 3.0189573459715642e-05,
"loss": 0.3477,
"step": 537
},
{
"epoch": 1.3759590792838874,
"grad_norm": 0.19805266052365922,
"learning_rate": 3.014218009478673e-05,
"loss": 0.3268,
"step": 538
},
{
"epoch": 1.3785166240409208,
"grad_norm": 0.19502494739762932,
"learning_rate": 3.009478672985782e-05,
"loss": 0.3493,
"step": 539
},
{
"epoch": 1.381074168797954,
"grad_norm": 0.19983564710769386,
"learning_rate": 3.004739336492891e-05,
"loss": 0.3428,
"step": 540
},
{
"epoch": 1.3836317135549872,
"grad_norm": 0.22658682357477436,
"learning_rate": 3e-05,
"loss": 0.3378,
"step": 541
},
{
"epoch": 1.3861892583120206,
"grad_norm": 0.23466719921978055,
"learning_rate": 2.995260663507109e-05,
"loss": 0.3353,
"step": 542
},
{
"epoch": 1.3887468030690537,
"grad_norm": 0.21015550969200184,
"learning_rate": 2.990521327014218e-05,
"loss": 0.35,
"step": 543
},
{
"epoch": 1.391304347826087,
"grad_norm": 0.2614967884472048,
"learning_rate": 2.9857819905213268e-05,
"loss": 0.3458,
"step": 544
},
{
"epoch": 1.39386189258312,
"grad_norm": 0.20442313390946987,
"learning_rate": 2.9810426540284363e-05,
"loss": 0.3197,
"step": 545
},
{
"epoch": 1.3964194373401535,
"grad_norm": 0.22800479961193035,
"learning_rate": 2.9763033175355454e-05,
"loss": 0.3216,
"step": 546
},
{
"epoch": 1.3989769820971867,
"grad_norm": 0.23631248009519853,
"learning_rate": 2.9715639810426542e-05,
"loss": 0.3475,
"step": 547
},
{
"epoch": 1.40153452685422,
"grad_norm": 0.2148560333286399,
"learning_rate": 2.9668246445497633e-05,
"loss": 0.3321,
"step": 548
},
{
"epoch": 1.4040920716112533,
"grad_norm": 0.22336842918171954,
"learning_rate": 2.962085308056872e-05,
"loss": 0.3488,
"step": 549
},
{
"epoch": 1.4066496163682864,
"grad_norm": 0.21805777627104153,
"learning_rate": 2.9573459715639813e-05,
"loss": 0.3377,
"step": 550
},
{
"epoch": 1.4092071611253196,
"grad_norm": 0.197363455632153,
"learning_rate": 2.95260663507109e-05,
"loss": 0.3395,
"step": 551
},
{
"epoch": 1.4117647058823528,
"grad_norm": 0.273730892459262,
"learning_rate": 2.9478672985781992e-05,
"loss": 0.3637,
"step": 552
},
{
"epoch": 1.4143222506393862,
"grad_norm": 0.21968147378847208,
"learning_rate": 2.943127962085308e-05,
"loss": 0.3458,
"step": 553
},
{
"epoch": 1.4168797953964194,
"grad_norm": 0.22407752561098943,
"learning_rate": 2.938388625592417e-05,
"loss": 0.3341,
"step": 554
},
{
"epoch": 1.4194373401534528,
"grad_norm": 0.24706387013454778,
"learning_rate": 2.933649289099526e-05,
"loss": 0.3393,
"step": 555
},
{
"epoch": 1.421994884910486,
"grad_norm": 0.23943822236699114,
"learning_rate": 2.9289099526066354e-05,
"loss": 0.3476,
"step": 556
},
{
"epoch": 1.4245524296675192,
"grad_norm": 0.21133262016275636,
"learning_rate": 2.9241706161137445e-05,
"loss": 0.3317,
"step": 557
},
{
"epoch": 1.4271099744245523,
"grad_norm": 0.2187205757977556,
"learning_rate": 2.9194312796208533e-05,
"loss": 0.3344,
"step": 558
},
{
"epoch": 1.4296675191815857,
"grad_norm": 0.2278521035319285,
"learning_rate": 2.9146919431279624e-05,
"loss": 0.3391,
"step": 559
},
{
"epoch": 1.432225063938619,
"grad_norm": 0.22984861625880482,
"learning_rate": 2.9099526066350712e-05,
"loss": 0.3289,
"step": 560
},
{
"epoch": 1.434782608695652,
"grad_norm": 0.2126484088527432,
"learning_rate": 2.9052132701421804e-05,
"loss": 0.3216,
"step": 561
},
{
"epoch": 1.4373401534526855,
"grad_norm": 0.21963164389365947,
"learning_rate": 2.9004739336492892e-05,
"loss": 0.3366,
"step": 562
},
{
"epoch": 1.4398976982097187,
"grad_norm": 0.2271030491918638,
"learning_rate": 2.8957345971563983e-05,
"loss": 0.3287,
"step": 563
},
{
"epoch": 1.4424552429667519,
"grad_norm": 0.22596502012606307,
"learning_rate": 2.890995260663507e-05,
"loss": 0.3445,
"step": 564
},
{
"epoch": 1.445012787723785,
"grad_norm": 0.2092819883191256,
"learning_rate": 2.8862559241706162e-05,
"loss": 0.3188,
"step": 565
},
{
"epoch": 1.4475703324808185,
"grad_norm": 0.2085679869485133,
"learning_rate": 2.881516587677725e-05,
"loss": 0.3297,
"step": 566
},
{
"epoch": 1.4501278772378516,
"grad_norm": 0.20731318095051873,
"learning_rate": 2.8767772511848338e-05,
"loss": 0.3254,
"step": 567
},
{
"epoch": 1.452685421994885,
"grad_norm": 0.22614524862117436,
"learning_rate": 2.8720379146919436e-05,
"loss": 0.3389,
"step": 568
},
{
"epoch": 1.4552429667519182,
"grad_norm": 0.22116772067000307,
"learning_rate": 2.8672985781990524e-05,
"loss": 0.3372,
"step": 569
},
{
"epoch": 1.4578005115089514,
"grad_norm": 0.2081439129486148,
"learning_rate": 2.8625592417061616e-05,
"loss": 0.3185,
"step": 570
},
{
"epoch": 1.4603580562659846,
"grad_norm": 0.2101126677570276,
"learning_rate": 2.8578199052132704e-05,
"loss": 0.3386,
"step": 571
},
{
"epoch": 1.4629156010230178,
"grad_norm": 0.20857004085030162,
"learning_rate": 2.853080568720379e-05,
"loss": 0.3317,
"step": 572
},
{
"epoch": 1.4654731457800512,
"grad_norm": 0.21972466939910235,
"learning_rate": 2.8483412322274883e-05,
"loss": 0.3421,
"step": 573
},
{
"epoch": 1.4680306905370843,
"grad_norm": 0.22670934909893178,
"learning_rate": 2.843601895734597e-05,
"loss": 0.3373,
"step": 574
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.20335752165987916,
"learning_rate": 2.8388625592417062e-05,
"loss": 0.3489,
"step": 575
},
{
"epoch": 1.473145780051151,
"grad_norm": 0.21670951576300224,
"learning_rate": 2.834123222748815e-05,
"loss": 0.3436,
"step": 576
},
{
"epoch": 1.4757033248081841,
"grad_norm": 0.24188198119161047,
"learning_rate": 2.829383886255924e-05,
"loss": 0.346,
"step": 577
},
{
"epoch": 1.4782608695652173,
"grad_norm": 0.19284248575531912,
"learning_rate": 2.824644549763033e-05,
"loss": 0.3412,
"step": 578
},
{
"epoch": 1.4808184143222507,
"grad_norm": 0.21408001651811503,
"learning_rate": 2.8199052132701424e-05,
"loss": 0.3359,
"step": 579
},
{
"epoch": 1.4833759590792839,
"grad_norm": 0.23043383318843624,
"learning_rate": 2.8151658767772515e-05,
"loss": 0.3352,
"step": 580
},
{
"epoch": 1.485933503836317,
"grad_norm": 0.19637762705882086,
"learning_rate": 2.8104265402843603e-05,
"loss": 0.3335,
"step": 581
},
{
"epoch": 1.4884910485933505,
"grad_norm": 0.21814477890754627,
"learning_rate": 2.8056872037914695e-05,
"loss": 0.3219,
"step": 582
},
{
"epoch": 1.4910485933503836,
"grad_norm": 0.24377115034783173,
"learning_rate": 2.8009478672985783e-05,
"loss": 0.349,
"step": 583
},
{
"epoch": 1.4936061381074168,
"grad_norm": 0.20020466798938577,
"learning_rate": 2.7962085308056874e-05,
"loss": 0.3364,
"step": 584
},
{
"epoch": 1.49616368286445,
"grad_norm": 0.2363047057326481,
"learning_rate": 2.7914691943127962e-05,
"loss": 0.3565,
"step": 585
},
{
"epoch": 1.4987212276214834,
"grad_norm": 0.21956341661227455,
"learning_rate": 2.7867298578199053e-05,
"loss": 0.3377,
"step": 586
},
{
"epoch": 1.5012787723785166,
"grad_norm": 0.2267586568563103,
"learning_rate": 2.781990521327014e-05,
"loss": 0.328,
"step": 587
},
{
"epoch": 1.50383631713555,
"grad_norm": 0.2047581074184725,
"learning_rate": 2.7772511848341233e-05,
"loss": 0.3468,
"step": 588
},
{
"epoch": 1.5063938618925832,
"grad_norm": 0.24688978065050932,
"learning_rate": 2.772511848341232e-05,
"loss": 0.3279,
"step": 589
},
{
"epoch": 1.5089514066496164,
"grad_norm": 0.21201942656506023,
"learning_rate": 2.7677725118483415e-05,
"loss": 0.3435,
"step": 590
},
{
"epoch": 1.5115089514066495,
"grad_norm": 0.21407415709345273,
"learning_rate": 2.7630331753554507e-05,
"loss": 0.3338,
"step": 591
},
{
"epoch": 1.5140664961636827,
"grad_norm": 0.24244329625085864,
"learning_rate": 2.7582938388625595e-05,
"loss": 0.3353,
"step": 592
},
{
"epoch": 1.5166240409207161,
"grad_norm": 0.21106578023597755,
"learning_rate": 2.7535545023696686e-05,
"loss": 0.3185,
"step": 593
},
{
"epoch": 1.5191815856777495,
"grad_norm": 0.22046969326673913,
"learning_rate": 2.7488151658767774e-05,
"loss": 0.3405,
"step": 594
},
{
"epoch": 1.5217391304347827,
"grad_norm": 0.22082584514229614,
"learning_rate": 2.7440758293838865e-05,
"loss": 0.3374,
"step": 595
},
{
"epoch": 1.5242966751918159,
"grad_norm": 0.2214039800077301,
"learning_rate": 2.7393364928909953e-05,
"loss": 0.3408,
"step": 596
},
{
"epoch": 1.526854219948849,
"grad_norm": 0.21162564133453074,
"learning_rate": 2.7345971563981044e-05,
"loss": 0.3223,
"step": 597
},
{
"epoch": 1.5294117647058822,
"grad_norm": 0.21038119410478973,
"learning_rate": 2.7298578199052132e-05,
"loss": 0.3232,
"step": 598
},
{
"epoch": 1.5319693094629157,
"grad_norm": 0.2232877311097297,
"learning_rate": 2.7251184834123224e-05,
"loss": 0.3569,
"step": 599
},
{
"epoch": 1.5345268542199488,
"grad_norm": 0.21018531562144588,
"learning_rate": 2.720379146919431e-05,
"loss": 0.3572,
"step": 600
},
{
"epoch": 1.5370843989769822,
"grad_norm": 0.18432057304329444,
"learning_rate": 2.7156398104265403e-05,
"loss": 0.3239,
"step": 601
},
{
"epoch": 1.5396419437340154,
"grad_norm": 0.23256686957170164,
"learning_rate": 2.7109004739336498e-05,
"loss": 0.327,
"step": 602
},
{
"epoch": 1.5421994884910486,
"grad_norm": 0.2168659241371808,
"learning_rate": 2.7061611374407586e-05,
"loss": 0.3345,
"step": 603
},
{
"epoch": 1.5447570332480818,
"grad_norm": 0.2066176620461704,
"learning_rate": 2.7014218009478677e-05,
"loss": 0.3263,
"step": 604
},
{
"epoch": 1.547314578005115,
"grad_norm": 0.2510122104603682,
"learning_rate": 2.6966824644549765e-05,
"loss": 0.3383,
"step": 605
},
{
"epoch": 1.5498721227621484,
"grad_norm": 0.21620946293671467,
"learning_rate": 2.6919431279620856e-05,
"loss": 0.3421,
"step": 606
},
{
"epoch": 1.5524296675191815,
"grad_norm": 0.2374053609246905,
"learning_rate": 2.6872037914691944e-05,
"loss": 0.3395,
"step": 607
},
{
"epoch": 1.554987212276215,
"grad_norm": 0.23310585207272871,
"learning_rate": 2.6824644549763032e-05,
"loss": 0.3328,
"step": 608
},
{
"epoch": 1.5575447570332481,
"grad_norm": 0.21706136950371865,
"learning_rate": 2.6777251184834124e-05,
"loss": 0.3291,
"step": 609
},
{
"epoch": 1.5601023017902813,
"grad_norm": 0.2557349212164624,
"learning_rate": 2.672985781990521e-05,
"loss": 0.3329,
"step": 610
},
{
"epoch": 1.5626598465473145,
"grad_norm": 0.21369332563945545,
"learning_rate": 2.6682464454976303e-05,
"loss": 0.3403,
"step": 611
},
{
"epoch": 1.5652173913043477,
"grad_norm": 0.22249413300101917,
"learning_rate": 2.663507109004739e-05,
"loss": 0.3295,
"step": 612
},
{
"epoch": 1.567774936061381,
"grad_norm": 0.2352350138406436,
"learning_rate": 2.658767772511849e-05,
"loss": 0.3491,
"step": 613
},
{
"epoch": 1.5703324808184145,
"grad_norm": 0.21444744975093857,
"learning_rate": 2.6540284360189577e-05,
"loss": 0.3515,
"step": 614
},
{
"epoch": 1.5728900255754477,
"grad_norm": 0.20106535936796008,
"learning_rate": 2.6492890995260665e-05,
"loss": 0.3415,
"step": 615
},
{
"epoch": 1.5754475703324808,
"grad_norm": 0.24355009019358553,
"learning_rate": 2.6445497630331756e-05,
"loss": 0.3382,
"step": 616
},
{
"epoch": 1.578005115089514,
"grad_norm": 0.19632069646990316,
"learning_rate": 2.6398104265402844e-05,
"loss": 0.3344,
"step": 617
},
{
"epoch": 1.5805626598465472,
"grad_norm": 0.2041069390497847,
"learning_rate": 2.6350710900473935e-05,
"loss": 0.3314,
"step": 618
},
{
"epoch": 1.5831202046035806,
"grad_norm": 0.23307172936850007,
"learning_rate": 2.6303317535545023e-05,
"loss": 0.3279,
"step": 619
},
{
"epoch": 1.5856777493606138,
"grad_norm": 0.22016471550055694,
"learning_rate": 2.6255924170616115e-05,
"loss": 0.3238,
"step": 620
},
{
"epoch": 1.5882352941176472,
"grad_norm": 0.21161098152167546,
"learning_rate": 2.6208530805687203e-05,
"loss": 0.3382,
"step": 621
},
{
"epoch": 1.5907928388746804,
"grad_norm": 0.23095386869319134,
"learning_rate": 2.6161137440758294e-05,
"loss": 0.3316,
"step": 622
},
{
"epoch": 1.5933503836317136,
"grad_norm": 0.2258130781665819,
"learning_rate": 2.6113744075829382e-05,
"loss": 0.3184,
"step": 623
},
{
"epoch": 1.5959079283887467,
"grad_norm": 0.2101743033652242,
"learning_rate": 2.6066350710900477e-05,
"loss": 0.3525,
"step": 624
},
{
"epoch": 1.59846547314578,
"grad_norm": 0.23556388836544728,
"learning_rate": 2.6018957345971568e-05,
"loss": 0.3596,
"step": 625
},
{
"epoch": 1.6010230179028133,
"grad_norm": 0.2173806495933601,
"learning_rate": 2.5971563981042656e-05,
"loss": 0.3367,
"step": 626
},
{
"epoch": 1.6035805626598465,
"grad_norm": 0.21332385463283657,
"learning_rate": 2.5924170616113747e-05,
"loss": 0.3371,
"step": 627
},
{
"epoch": 1.60613810741688,
"grad_norm": 0.20121738409593162,
"learning_rate": 2.5876777251184835e-05,
"loss": 0.3308,
"step": 628
},
{
"epoch": 1.608695652173913,
"grad_norm": 0.22736961793911684,
"learning_rate": 2.5829383886255927e-05,
"loss": 0.3453,
"step": 629
},
{
"epoch": 1.6112531969309463,
"grad_norm": 0.19872074468079123,
"learning_rate": 2.5781990521327014e-05,
"loss": 0.3278,
"step": 630
},
{
"epoch": 1.6138107416879794,
"grad_norm": 0.2314685609756946,
"learning_rate": 2.5734597156398106e-05,
"loss": 0.3466,
"step": 631
},
{
"epoch": 1.6163682864450126,
"grad_norm": 0.21359598281755646,
"learning_rate": 2.5687203791469194e-05,
"loss": 0.3568,
"step": 632
},
{
"epoch": 1.618925831202046,
"grad_norm": 0.2410455323018816,
"learning_rate": 2.5639810426540285e-05,
"loss": 0.3212,
"step": 633
},
{
"epoch": 1.6214833759590794,
"grad_norm": 0.253509763295898,
"learning_rate": 2.5592417061611373e-05,
"loss": 0.3589,
"step": 634
},
{
"epoch": 1.6240409207161126,
"grad_norm": 0.22712797799055953,
"learning_rate": 2.5545023696682464e-05,
"loss": 0.3349,
"step": 635
},
{
"epoch": 1.6265984654731458,
"grad_norm": 0.22386259809972237,
"learning_rate": 2.549763033175356e-05,
"loss": 0.3261,
"step": 636
},
{
"epoch": 1.629156010230179,
"grad_norm": 0.2605466792154154,
"learning_rate": 2.5450236966824647e-05,
"loss": 0.3435,
"step": 637
},
{
"epoch": 1.6317135549872122,
"grad_norm": 0.20761172721493334,
"learning_rate": 2.540284360189574e-05,
"loss": 0.3251,
"step": 638
},
{
"epoch": 1.6342710997442456,
"grad_norm": 0.24685722210051553,
"learning_rate": 2.5355450236966826e-05,
"loss": 0.3235,
"step": 639
},
{
"epoch": 1.6368286445012787,
"grad_norm": 0.21434302571838307,
"learning_rate": 2.5308056872037918e-05,
"loss": 0.3207,
"step": 640
},
{
"epoch": 1.6393861892583121,
"grad_norm": 0.21184549514913412,
"learning_rate": 2.5260663507109006e-05,
"loss": 0.3238,
"step": 641
},
{
"epoch": 1.6419437340153453,
"grad_norm": 0.21252363567202226,
"learning_rate": 2.5213270142180097e-05,
"loss": 0.323,
"step": 642
},
{
"epoch": 1.6445012787723785,
"grad_norm": 0.21829115176694264,
"learning_rate": 2.5165876777251185e-05,
"loss": 0.3321,
"step": 643
},
{
"epoch": 1.6470588235294117,
"grad_norm": 0.20922978833957703,
"learning_rate": 2.5118483412322273e-05,
"loss": 0.326,
"step": 644
},
{
"epoch": 1.6496163682864449,
"grad_norm": 0.2091095674028597,
"learning_rate": 2.5071090047393364e-05,
"loss": 0.3076,
"step": 645
},
{
"epoch": 1.6521739130434783,
"grad_norm": 0.20714939228335966,
"learning_rate": 2.5023696682464452e-05,
"loss": 0.3304,
"step": 646
},
{
"epoch": 1.6547314578005117,
"grad_norm": 0.19116018976328764,
"learning_rate": 2.4976303317535547e-05,
"loss": 0.3274,
"step": 647
},
{
"epoch": 1.6572890025575449,
"grad_norm": 0.19243515464801864,
"learning_rate": 2.4928909952606635e-05,
"loss": 0.3397,
"step": 648
},
{
"epoch": 1.659846547314578,
"grad_norm": 0.22171437826424312,
"learning_rate": 2.4881516587677726e-05,
"loss": 0.3404,
"step": 649
},
{
"epoch": 1.6624040920716112,
"grad_norm": 0.18811862144302852,
"learning_rate": 2.4834123222748817e-05,
"loss": 0.3294,
"step": 650
},
{
"epoch": 1.6649616368286444,
"grad_norm": 0.20726034225043538,
"learning_rate": 2.4786729857819905e-05,
"loss": 0.3376,
"step": 651
},
{
"epoch": 1.6675191815856778,
"grad_norm": 0.22641125026229106,
"learning_rate": 2.4739336492890997e-05,
"loss": 0.3315,
"step": 652
},
{
"epoch": 1.670076726342711,
"grad_norm": 0.19760668759690572,
"learning_rate": 2.4691943127962085e-05,
"loss": 0.3484,
"step": 653
},
{
"epoch": 1.6726342710997444,
"grad_norm": 0.2036460572936716,
"learning_rate": 2.4644549763033176e-05,
"loss": 0.3405,
"step": 654
},
{
"epoch": 1.6751918158567776,
"grad_norm": 0.19580889345429936,
"learning_rate": 2.4597156398104264e-05,
"loss": 0.3311,
"step": 655
},
{
"epoch": 1.6777493606138107,
"grad_norm": 0.20331485010582212,
"learning_rate": 2.454976303317536e-05,
"loss": 0.3319,
"step": 656
},
{
"epoch": 1.680306905370844,
"grad_norm": 0.2003381154185122,
"learning_rate": 2.4502369668246447e-05,
"loss": 0.3338,
"step": 657
},
{
"epoch": 1.682864450127877,
"grad_norm": 0.22901909585607055,
"learning_rate": 2.4454976303317538e-05,
"loss": 0.3439,
"step": 658
},
{
"epoch": 1.6854219948849105,
"grad_norm": 0.2072701167914152,
"learning_rate": 2.4407582938388626e-05,
"loss": 0.3299,
"step": 659
},
{
"epoch": 1.6879795396419437,
"grad_norm": 0.2156044161532469,
"learning_rate": 2.4360189573459717e-05,
"loss": 0.3356,
"step": 660
},
{
"epoch": 1.690537084398977,
"grad_norm": 0.22960331769603365,
"learning_rate": 2.431279620853081e-05,
"loss": 0.3211,
"step": 661
},
{
"epoch": 1.6930946291560103,
"grad_norm": 0.184836419291593,
"learning_rate": 2.4265402843601897e-05,
"loss": 0.3134,
"step": 662
},
{
"epoch": 1.6956521739130435,
"grad_norm": 0.22152975307273395,
"learning_rate": 2.4218009478672988e-05,
"loss": 0.3556,
"step": 663
},
{
"epoch": 1.6982097186700766,
"grad_norm": 0.27533636995577504,
"learning_rate": 2.4170616113744076e-05,
"loss": 0.333,
"step": 664
},
{
"epoch": 1.7007672634271098,
"grad_norm": 0.20239642573133182,
"learning_rate": 2.4123222748815167e-05,
"loss": 0.3244,
"step": 665
},
{
"epoch": 1.7033248081841432,
"grad_norm": 0.19215048920041694,
"learning_rate": 2.4075829383886255e-05,
"loss": 0.3261,
"step": 666
},
{
"epoch": 1.7058823529411766,
"grad_norm": 0.21226322101300024,
"learning_rate": 2.402843601895735e-05,
"loss": 0.3357,
"step": 667
},
{
"epoch": 1.7084398976982098,
"grad_norm": 0.22539028864743468,
"learning_rate": 2.3981042654028438e-05,
"loss": 0.3472,
"step": 668
},
{
"epoch": 1.710997442455243,
"grad_norm": 0.23393010371109055,
"learning_rate": 2.3933649289099526e-05,
"loss": 0.3325,
"step": 669
},
{
"epoch": 1.7135549872122762,
"grad_norm": 0.1735369909355323,
"learning_rate": 2.3886255924170617e-05,
"loss": 0.3158,
"step": 670
},
{
"epoch": 1.7161125319693094,
"grad_norm": 0.21921508136082404,
"learning_rate": 2.3838862559241705e-05,
"loss": 0.35,
"step": 671
},
{
"epoch": 1.7186700767263428,
"grad_norm": 0.21982061308675563,
"learning_rate": 2.3791469194312796e-05,
"loss": 0.3479,
"step": 672
},
{
"epoch": 1.721227621483376,
"grad_norm": 0.2169093947973993,
"learning_rate": 2.3744075829383888e-05,
"loss": 0.3318,
"step": 673
},
{
"epoch": 1.7237851662404093,
"grad_norm": 0.20360889372476712,
"learning_rate": 2.369668246445498e-05,
"loss": 0.334,
"step": 674
},
{
"epoch": 1.7263427109974425,
"grad_norm": 0.2174062686096523,
"learning_rate": 2.3649289099526067e-05,
"loss": 0.3385,
"step": 675
},
{
"epoch": 1.7289002557544757,
"grad_norm": 0.20684367968994177,
"learning_rate": 2.360189573459716e-05,
"loss": 0.3375,
"step": 676
},
{
"epoch": 1.7314578005115089,
"grad_norm": 0.19965154462316637,
"learning_rate": 2.3554502369668246e-05,
"loss": 0.3253,
"step": 677
},
{
"epoch": 1.734015345268542,
"grad_norm": 0.21474011017766587,
"learning_rate": 2.3507109004739338e-05,
"loss": 0.3382,
"step": 678
},
{
"epoch": 1.7365728900255755,
"grad_norm": 0.22746922194428235,
"learning_rate": 2.345971563981043e-05,
"loss": 0.324,
"step": 679
},
{
"epoch": 1.7391304347826086,
"grad_norm": 0.2104457674935215,
"learning_rate": 2.3412322274881517e-05,
"loss": 0.3392,
"step": 680
},
{
"epoch": 1.741687979539642,
"grad_norm": 0.20283023890137888,
"learning_rate": 2.3364928909952608e-05,
"loss": 0.3291,
"step": 681
},
{
"epoch": 1.7442455242966752,
"grad_norm": 0.21461127150907236,
"learning_rate": 2.3317535545023696e-05,
"loss": 0.3386,
"step": 682
},
{
"epoch": 1.7468030690537084,
"grad_norm": 0.1909336493281626,
"learning_rate": 2.3270142180094788e-05,
"loss": 0.3175,
"step": 683
},
{
"epoch": 1.7493606138107416,
"grad_norm": 0.21259577247012493,
"learning_rate": 2.322274881516588e-05,
"loss": 0.3403,
"step": 684
},
{
"epoch": 1.7519181585677748,
"grad_norm": 0.20066175215287518,
"learning_rate": 2.317535545023697e-05,
"loss": 0.3285,
"step": 685
},
{
"epoch": 1.7544757033248082,
"grad_norm": 0.7581076529268704,
"learning_rate": 2.3127962085308058e-05,
"loss": 0.363,
"step": 686
},
{
"epoch": 1.7570332480818416,
"grad_norm": 0.22657067765448413,
"learning_rate": 2.3080568720379146e-05,
"loss": 0.3435,
"step": 687
},
{
"epoch": 1.7595907928388748,
"grad_norm": 0.2217418832124222,
"learning_rate": 2.3033175355450237e-05,
"loss": 0.3197,
"step": 688
},
{
"epoch": 1.762148337595908,
"grad_norm": 0.21459082938179172,
"learning_rate": 2.2985781990521325e-05,
"loss": 0.3505,
"step": 689
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.20738951497933816,
"learning_rate": 2.293838862559242e-05,
"loss": 0.321,
"step": 690
},
{
"epoch": 1.7672634271099743,
"grad_norm": 0.22785561899819126,
"learning_rate": 2.2890995260663508e-05,
"loss": 0.3424,
"step": 691
},
{
"epoch": 1.7698209718670077,
"grad_norm": 0.22927074980811404,
"learning_rate": 2.28436018957346e-05,
"loss": 0.3351,
"step": 692
},
{
"epoch": 1.772378516624041,
"grad_norm": 0.23347434762189972,
"learning_rate": 2.2796208530805687e-05,
"loss": 0.347,
"step": 693
},
{
"epoch": 1.7749360613810743,
"grad_norm": 0.2330189859527237,
"learning_rate": 2.274881516587678e-05,
"loss": 0.3341,
"step": 694
},
{
"epoch": 1.7774936061381075,
"grad_norm": 0.25074043381573513,
"learning_rate": 2.270142180094787e-05,
"loss": 0.3172,
"step": 695
},
{
"epoch": 1.7800511508951407,
"grad_norm": 0.21374906832842885,
"learning_rate": 2.2654028436018958e-05,
"loss": 0.339,
"step": 696
},
{
"epoch": 1.7826086956521738,
"grad_norm": 0.25168218613406507,
"learning_rate": 2.260663507109005e-05,
"loss": 0.3325,
"step": 697
},
{
"epoch": 1.785166240409207,
"grad_norm": 0.2403091187285791,
"learning_rate": 2.2559241706161137e-05,
"loss": 0.3478,
"step": 698
},
{
"epoch": 1.7877237851662404,
"grad_norm": 0.22187397630061947,
"learning_rate": 2.251184834123223e-05,
"loss": 0.3452,
"step": 699
},
{
"epoch": 1.7902813299232738,
"grad_norm": 0.24752310282755516,
"learning_rate": 2.2464454976303317e-05,
"loss": 0.3356,
"step": 700
},
{
"epoch": 1.792838874680307,
"grad_norm": 0.2084033534950943,
"learning_rate": 2.241706161137441e-05,
"loss": 0.3259,
"step": 701
},
{
"epoch": 1.7953964194373402,
"grad_norm": 0.2355859217064896,
"learning_rate": 2.23696682464455e-05,
"loss": 0.3425,
"step": 702
},
{
"epoch": 1.7979539641943734,
"grad_norm": 0.21447292569876703,
"learning_rate": 2.232227488151659e-05,
"loss": 0.3153,
"step": 703
},
{
"epoch": 1.8005115089514065,
"grad_norm": 0.20854337096420522,
"learning_rate": 2.227488151658768e-05,
"loss": 0.3352,
"step": 704
},
{
"epoch": 1.80306905370844,
"grad_norm": 0.22064595096377312,
"learning_rate": 2.2227488151658766e-05,
"loss": 0.3228,
"step": 705
},
{
"epoch": 1.8056265984654731,
"grad_norm": 0.23748592354665862,
"learning_rate": 2.2180094786729858e-05,
"loss": 0.3407,
"step": 706
},
{
"epoch": 1.8081841432225065,
"grad_norm": 0.25098201166842826,
"learning_rate": 2.213270142180095e-05,
"loss": 0.3533,
"step": 707
},
{
"epoch": 1.8107416879795397,
"grad_norm": 0.2789258681226503,
"learning_rate": 2.208530805687204e-05,
"loss": 0.3405,
"step": 708
},
{
"epoch": 1.813299232736573,
"grad_norm": 0.21924763977982134,
"learning_rate": 2.203791469194313e-05,
"loss": 0.3209,
"step": 709
},
{
"epoch": 1.815856777493606,
"grad_norm": 0.24534901252195856,
"learning_rate": 2.199052132701422e-05,
"loss": 0.3228,
"step": 710
},
{
"epoch": 1.8184143222506393,
"grad_norm": 0.23769380073414784,
"learning_rate": 2.1943127962085308e-05,
"loss": 0.3319,
"step": 711
},
{
"epoch": 1.8209718670076727,
"grad_norm": 0.20966116422671724,
"learning_rate": 2.18957345971564e-05,
"loss": 0.3255,
"step": 712
},
{
"epoch": 1.8235294117647058,
"grad_norm": 0.2278495662047266,
"learning_rate": 2.184834123222749e-05,
"loss": 0.3234,
"step": 713
},
{
"epoch": 1.8260869565217392,
"grad_norm": 0.22895416972072405,
"learning_rate": 2.1800947867298578e-05,
"loss": 0.3189,
"step": 714
},
{
"epoch": 1.8286445012787724,
"grad_norm": 0.2086902846375283,
"learning_rate": 2.175355450236967e-05,
"loss": 0.3472,
"step": 715
},
{
"epoch": 1.8312020460358056,
"grad_norm": 0.19855684843219606,
"learning_rate": 2.1706161137440758e-05,
"loss": 0.3458,
"step": 716
},
{
"epoch": 1.8337595907928388,
"grad_norm": 0.23552439546401155,
"learning_rate": 2.165876777251185e-05,
"loss": 0.336,
"step": 717
},
{
"epoch": 1.836317135549872,
"grad_norm": 0.20685861123790114,
"learning_rate": 2.161137440758294e-05,
"loss": 0.336,
"step": 718
},
{
"epoch": 1.8388746803069054,
"grad_norm": 0.19887491717386577,
"learning_rate": 2.156398104265403e-05,
"loss": 0.3396,
"step": 719
},
{
"epoch": 1.8414322250639388,
"grad_norm": 0.2509814669536259,
"learning_rate": 2.151658767772512e-05,
"loss": 0.3441,
"step": 720
},
{
"epoch": 1.843989769820972,
"grad_norm": 0.19522319866892376,
"learning_rate": 2.146919431279621e-05,
"loss": 0.3106,
"step": 721
},
{
"epoch": 1.8465473145780051,
"grad_norm": 0.18974799063588516,
"learning_rate": 2.14218009478673e-05,
"loss": 0.3262,
"step": 722
},
{
"epoch": 1.8491048593350383,
"grad_norm": 0.20477382204756456,
"learning_rate": 2.1374407582938387e-05,
"loss": 0.3429,
"step": 723
},
{
"epoch": 1.8516624040920715,
"grad_norm": 0.2142522572136313,
"learning_rate": 2.132701421800948e-05,
"loss": 0.3308,
"step": 724
},
{
"epoch": 1.854219948849105,
"grad_norm": 0.1979056292881532,
"learning_rate": 2.127962085308057e-05,
"loss": 0.3346,
"step": 725
},
{
"epoch": 1.856777493606138,
"grad_norm": 0.20554303110251226,
"learning_rate": 2.123222748815166e-05,
"loss": 0.3583,
"step": 726
},
{
"epoch": 1.8593350383631715,
"grad_norm": 0.1984154565321334,
"learning_rate": 2.118483412322275e-05,
"loss": 0.3196,
"step": 727
},
{
"epoch": 1.8618925831202047,
"grad_norm": 0.2008595114867567,
"learning_rate": 2.113744075829384e-05,
"loss": 0.3427,
"step": 728
},
{
"epoch": 1.8644501278772379,
"grad_norm": 0.21531627949148452,
"learning_rate": 2.109004739336493e-05,
"loss": 0.3415,
"step": 729
},
{
"epoch": 1.867007672634271,
"grad_norm": 0.20193118670494573,
"learning_rate": 2.104265402843602e-05,
"loss": 0.3371,
"step": 730
},
{
"epoch": 1.8695652173913042,
"grad_norm": 0.21944975819432885,
"learning_rate": 2.099526066350711e-05,
"loss": 0.3217,
"step": 731
},
{
"epoch": 1.8721227621483376,
"grad_norm": 0.23381023417915053,
"learning_rate": 2.09478672985782e-05,
"loss": 0.3406,
"step": 732
},
{
"epoch": 1.8746803069053708,
"grad_norm": 0.19300009053421657,
"learning_rate": 2.090047393364929e-05,
"loss": 0.3199,
"step": 733
},
{
"epoch": 1.8772378516624042,
"grad_norm": 0.19576466530600098,
"learning_rate": 2.0853080568720378e-05,
"loss": 0.3215,
"step": 734
},
{
"epoch": 1.8797953964194374,
"grad_norm": 0.21787819537132525,
"learning_rate": 2.0805687203791473e-05,
"loss": 0.3359,
"step": 735
},
{
"epoch": 1.8823529411764706,
"grad_norm": 0.20623605117402122,
"learning_rate": 2.075829383886256e-05,
"loss": 0.3396,
"step": 736
},
{
"epoch": 1.8849104859335037,
"grad_norm": 0.19807063269430017,
"learning_rate": 2.0710900473933652e-05,
"loss": 0.3321,
"step": 737
},
{
"epoch": 1.887468030690537,
"grad_norm": 0.21340084606280826,
"learning_rate": 2.066350710900474e-05,
"loss": 0.3376,
"step": 738
},
{
"epoch": 1.8900255754475703,
"grad_norm": 0.2117778713699439,
"learning_rate": 2.061611374407583e-05,
"loss": 0.318,
"step": 739
},
{
"epoch": 1.8925831202046037,
"grad_norm": 0.19496876658086634,
"learning_rate": 2.056872037914692e-05,
"loss": 0.3275,
"step": 740
},
{
"epoch": 1.895140664961637,
"grad_norm": 0.22772399554231024,
"learning_rate": 2.052132701421801e-05,
"loss": 0.3456,
"step": 741
},
{
"epoch": 1.89769820971867,
"grad_norm": 0.19861753270620258,
"learning_rate": 2.0473933649289102e-05,
"loss": 0.3364,
"step": 742
},
{
"epoch": 1.9002557544757033,
"grad_norm": 0.2101418258514019,
"learning_rate": 2.042654028436019e-05,
"loss": 0.3324,
"step": 743
},
{
"epoch": 1.9028132992327365,
"grad_norm": 0.19738568484825283,
"learning_rate": 2.037914691943128e-05,
"loss": 0.3458,
"step": 744
},
{
"epoch": 1.9053708439897699,
"grad_norm": 0.22341732627665845,
"learning_rate": 2.033175355450237e-05,
"loss": 0.3472,
"step": 745
},
{
"epoch": 1.907928388746803,
"grad_norm": 0.20794044931146008,
"learning_rate": 2.028436018957346e-05,
"loss": 0.3367,
"step": 746
},
{
"epoch": 1.9104859335038364,
"grad_norm": 0.20491964629174395,
"learning_rate": 2.0236966824644552e-05,
"loss": 0.3223,
"step": 747
},
{
"epoch": 1.9130434782608696,
"grad_norm": 0.20189894495265795,
"learning_rate": 2.018957345971564e-05,
"loss": 0.3362,
"step": 748
},
{
"epoch": 1.9156010230179028,
"grad_norm": 0.2048150503497566,
"learning_rate": 2.014218009478673e-05,
"loss": 0.3426,
"step": 749
},
{
"epoch": 1.918158567774936,
"grad_norm": 0.21954225182865705,
"learning_rate": 2.009478672985782e-05,
"loss": 0.3528,
"step": 750
},
{
"epoch": 1.9207161125319692,
"grad_norm": 0.22214844960655306,
"learning_rate": 2.004739336492891e-05,
"loss": 0.3491,
"step": 751
},
{
"epoch": 1.9232736572890026,
"grad_norm": 0.2002610790388588,
"learning_rate": 2e-05,
"loss": 0.3324,
"step": 752
},
{
"epoch": 1.9258312020460358,
"grad_norm": 0.23222016864966347,
"learning_rate": 1.9952606635071093e-05,
"loss": 0.3292,
"step": 753
},
{
"epoch": 1.9283887468030692,
"grad_norm": 0.2207542823663722,
"learning_rate": 1.990521327014218e-05,
"loss": 0.3385,
"step": 754
},
{
"epoch": 1.9309462915601023,
"grad_norm": 0.22749264244194325,
"learning_rate": 1.9857819905213272e-05,
"loss": 0.316,
"step": 755
},
{
"epoch": 1.9335038363171355,
"grad_norm": 0.1977916254111309,
"learning_rate": 1.981042654028436e-05,
"loss": 0.3395,
"step": 756
},
{
"epoch": 1.9360613810741687,
"grad_norm": 0.19556691281474403,
"learning_rate": 1.976303317535545e-05,
"loss": 0.3355,
"step": 757
},
{
"epoch": 1.938618925831202,
"grad_norm": 0.1962514353937156,
"learning_rate": 1.9715639810426543e-05,
"loss": 0.3156,
"step": 758
},
{
"epoch": 1.9411764705882353,
"grad_norm": 0.23567156259437158,
"learning_rate": 1.966824644549763e-05,
"loss": 0.3413,
"step": 759
},
{
"epoch": 1.9437340153452687,
"grad_norm": 0.1980383962745943,
"learning_rate": 1.9620853080568722e-05,
"loss": 0.323,
"step": 760
},
{
"epoch": 1.9462915601023019,
"grad_norm": 0.19505875547934262,
"learning_rate": 1.957345971563981e-05,
"loss": 0.3342,
"step": 761
},
{
"epoch": 1.948849104859335,
"grad_norm": 0.22978204914718386,
"learning_rate": 1.95260663507109e-05,
"loss": 0.3438,
"step": 762
},
{
"epoch": 1.9514066496163682,
"grad_norm": 0.19344201193147603,
"learning_rate": 1.9478672985781993e-05,
"loss": 0.3118,
"step": 763
},
{
"epoch": 1.9539641943734014,
"grad_norm": 0.18582466291193162,
"learning_rate": 1.943127962085308e-05,
"loss": 0.3375,
"step": 764
},
{
"epoch": 1.9565217391304348,
"grad_norm": 0.21401678800134463,
"learning_rate": 1.9383886255924172e-05,
"loss": 0.3384,
"step": 765
},
{
"epoch": 1.959079283887468,
"grad_norm": 0.19342159241258478,
"learning_rate": 1.933649289099526e-05,
"loss": 0.3206,
"step": 766
},
{
"epoch": 1.9616368286445014,
"grad_norm": 0.19399605381147378,
"learning_rate": 1.928909952606635e-05,
"loss": 0.3384,
"step": 767
},
{
"epoch": 1.9641943734015346,
"grad_norm": 0.20148408812790133,
"learning_rate": 1.924170616113744e-05,
"loss": 0.3225,
"step": 768
},
{
"epoch": 1.9667519181585678,
"grad_norm": 0.18715476457309554,
"learning_rate": 1.9194312796208534e-05,
"loss": 0.3288,
"step": 769
},
{
"epoch": 1.969309462915601,
"grad_norm": 0.18815259504289839,
"learning_rate": 1.9146919431279622e-05,
"loss": 0.3142,
"step": 770
},
{
"epoch": 1.9718670076726341,
"grad_norm": 0.20640473592890973,
"learning_rate": 1.9099526066350713e-05,
"loss": 0.3191,
"step": 771
},
{
"epoch": 1.9744245524296675,
"grad_norm": 0.19662779268863564,
"learning_rate": 1.90521327014218e-05,
"loss": 0.3207,
"step": 772
},
{
"epoch": 1.976982097186701,
"grad_norm": 0.2066568679986916,
"learning_rate": 1.9004739336492893e-05,
"loss": 0.3454,
"step": 773
},
{
"epoch": 1.979539641943734,
"grad_norm": 0.20322475668496154,
"learning_rate": 1.895734597156398e-05,
"loss": 0.3234,
"step": 774
},
{
"epoch": 1.9820971867007673,
"grad_norm": 0.19059283154521114,
"learning_rate": 1.8909952606635072e-05,
"loss": 0.3313,
"step": 775
},
{
"epoch": 1.9846547314578005,
"grad_norm": 0.21406140221632183,
"learning_rate": 1.8862559241706163e-05,
"loss": 0.3465,
"step": 776
},
{
"epoch": 1.9872122762148337,
"grad_norm": 0.21139212441518793,
"learning_rate": 1.881516587677725e-05,
"loss": 0.3261,
"step": 777
},
{
"epoch": 1.989769820971867,
"grad_norm": 0.19320779992691875,
"learning_rate": 1.8767772511848342e-05,
"loss": 0.3199,
"step": 778
},
{
"epoch": 1.9923273657289002,
"grad_norm": 0.1948553869588904,
"learning_rate": 1.872037914691943e-05,
"loss": 0.3295,
"step": 779
},
{
"epoch": 1.9948849104859336,
"grad_norm": 0.19898631153447896,
"learning_rate": 1.8672985781990525e-05,
"loss": 0.3185,
"step": 780
},
{
"epoch": 1.9974424552429668,
"grad_norm": 0.20500622742531077,
"learning_rate": 1.8625592417061613e-05,
"loss": 0.3367,
"step": 781
},
{
"epoch": 2.0,
"grad_norm": 0.18745671376713552,
"learning_rate": 1.85781990521327e-05,
"loss": 0.3035,
"step": 782
},
{
"epoch": 2.002557544757033,
"grad_norm": 0.2785878708893465,
"learning_rate": 1.8530805687203792e-05,
"loss": 0.2615,
"step": 783
},
{
"epoch": 2.0051150895140664,
"grad_norm": 0.20397480769360993,
"learning_rate": 1.848341232227488e-05,
"loss": 0.2434,
"step": 784
},
{
"epoch": 2.0076726342710995,
"grad_norm": 0.2923962743620856,
"learning_rate": 1.843601895734597e-05,
"loss": 0.2478,
"step": 785
},
{
"epoch": 2.010230179028133,
"grad_norm": 0.25689369914334176,
"learning_rate": 1.8388625592417063e-05,
"loss": 0.2448,
"step": 786
},
{
"epoch": 2.0127877237851663,
"grad_norm": 0.23710484976355836,
"learning_rate": 1.8341232227488154e-05,
"loss": 0.257,
"step": 787
},
{
"epoch": 2.0153452685421995,
"grad_norm": 0.29563461441097083,
"learning_rate": 1.8293838862559242e-05,
"loss": 0.2521,
"step": 788
},
{
"epoch": 2.0179028132992327,
"grad_norm": 0.2381040612370418,
"learning_rate": 1.8246445497630334e-05,
"loss": 0.2499,
"step": 789
},
{
"epoch": 2.020460358056266,
"grad_norm": 0.2291439129489046,
"learning_rate": 1.819905213270142e-05,
"loss": 0.2438,
"step": 790
},
{
"epoch": 2.023017902813299,
"grad_norm": 0.28685620757378183,
"learning_rate": 1.8151658767772513e-05,
"loss": 0.2553,
"step": 791
},
{
"epoch": 2.0255754475703327,
"grad_norm": 0.21147497245529764,
"learning_rate": 1.8104265402843604e-05,
"loss": 0.252,
"step": 792
},
{
"epoch": 2.028132992327366,
"grad_norm": 0.22446603408981508,
"learning_rate": 1.8056872037914692e-05,
"loss": 0.2536,
"step": 793
},
{
"epoch": 2.030690537084399,
"grad_norm": 0.24541367333886985,
"learning_rate": 1.8009478672985784e-05,
"loss": 0.2504,
"step": 794
},
{
"epoch": 2.0332480818414322,
"grad_norm": 0.22514879404996416,
"learning_rate": 1.796208530805687e-05,
"loss": 0.2605,
"step": 795
},
{
"epoch": 2.0358056265984654,
"grad_norm": 0.20624678594072715,
"learning_rate": 1.7914691943127963e-05,
"loss": 0.2612,
"step": 796
},
{
"epoch": 2.0383631713554986,
"grad_norm": 0.21342231575903908,
"learning_rate": 1.7867298578199054e-05,
"loss": 0.2499,
"step": 797
},
{
"epoch": 2.040920716112532,
"grad_norm": 0.22708020169166784,
"learning_rate": 1.7819905213270146e-05,
"loss": 0.2573,
"step": 798
},
{
"epoch": 2.0434782608695654,
"grad_norm": 0.20671082360929366,
"learning_rate": 1.7772511848341233e-05,
"loss": 0.2517,
"step": 799
},
{
"epoch": 2.0460358056265986,
"grad_norm": 0.20470461882320312,
"learning_rate": 1.772511848341232e-05,
"loss": 0.2441,
"step": 800
},
{
"epoch": 2.0485933503836318,
"grad_norm": 0.20251032207130173,
"learning_rate": 1.7677725118483413e-05,
"loss": 0.2547,
"step": 801
},
{
"epoch": 2.051150895140665,
"grad_norm": 0.20368951509303784,
"learning_rate": 1.76303317535545e-05,
"loss": 0.2439,
"step": 802
},
{
"epoch": 2.053708439897698,
"grad_norm": 0.19922183550926562,
"learning_rate": 1.7582938388625595e-05,
"loss": 0.2401,
"step": 803
},
{
"epoch": 2.0562659846547313,
"grad_norm": 0.21378847417361496,
"learning_rate": 1.7535545023696683e-05,
"loss": 0.2598,
"step": 804
},
{
"epoch": 2.0588235294117645,
"grad_norm": 0.2093916676403955,
"learning_rate": 1.7488151658767775e-05,
"loss": 0.2562,
"step": 805
},
{
"epoch": 2.061381074168798,
"grad_norm": 0.2148148853889112,
"learning_rate": 1.7440758293838863e-05,
"loss": 0.2543,
"step": 806
},
{
"epoch": 2.0639386189258313,
"grad_norm": 0.20365914748452466,
"learning_rate": 1.7393364928909954e-05,
"loss": 0.248,
"step": 807
},
{
"epoch": 2.0664961636828645,
"grad_norm": 0.21066398897720096,
"learning_rate": 1.7345971563981042e-05,
"loss": 0.2638,
"step": 808
},
{
"epoch": 2.0690537084398977,
"grad_norm": 0.20804166422941303,
"learning_rate": 1.7298578199052133e-05,
"loss": 0.2542,
"step": 809
},
{
"epoch": 2.071611253196931,
"grad_norm": 0.18674967472128892,
"learning_rate": 1.7251184834123225e-05,
"loss": 0.2405,
"step": 810
},
{
"epoch": 2.074168797953964,
"grad_norm": 0.1906175209072609,
"learning_rate": 1.7203791469194313e-05,
"loss": 0.2342,
"step": 811
},
{
"epoch": 2.0767263427109977,
"grad_norm": 0.2100046063283888,
"learning_rate": 1.7156398104265404e-05,
"loss": 0.2432,
"step": 812
},
{
"epoch": 2.079283887468031,
"grad_norm": 0.1967925906674926,
"learning_rate": 1.7109004739336492e-05,
"loss": 0.2413,
"step": 813
},
{
"epoch": 2.081841432225064,
"grad_norm": 0.1985022110628129,
"learning_rate": 1.7061611374407587e-05,
"loss": 0.2412,
"step": 814
},
{
"epoch": 2.084398976982097,
"grad_norm": 0.2004462205861864,
"learning_rate": 1.7014218009478674e-05,
"loss": 0.2608,
"step": 815
},
{
"epoch": 2.0869565217391304,
"grad_norm": 0.2126154513787664,
"learning_rate": 1.6966824644549766e-05,
"loss": 0.2419,
"step": 816
},
{
"epoch": 2.0895140664961636,
"grad_norm": 0.21724682158013556,
"learning_rate": 1.6919431279620854e-05,
"loss": 0.2555,
"step": 817
},
{
"epoch": 2.0920716112531967,
"grad_norm": 0.20957633230824144,
"learning_rate": 1.6872037914691942e-05,
"loss": 0.2434,
"step": 818
},
{
"epoch": 2.0946291560102304,
"grad_norm": 0.1852153835527483,
"learning_rate": 1.6824644549763033e-05,
"loss": 0.2408,
"step": 819
},
{
"epoch": 2.0971867007672635,
"grad_norm": 0.22086697836670513,
"learning_rate": 1.6777251184834124e-05,
"loss": 0.2582,
"step": 820
},
{
"epoch": 2.0997442455242967,
"grad_norm": 0.24261708505812196,
"learning_rate": 1.6729857819905216e-05,
"loss": 0.2632,
"step": 821
},
{
"epoch": 2.10230179028133,
"grad_norm": 0.18366952389698496,
"learning_rate": 1.6682464454976304e-05,
"loss": 0.2433,
"step": 822
},
{
"epoch": 2.104859335038363,
"grad_norm": 0.2038172463973163,
"learning_rate": 1.6635071090047395e-05,
"loss": 0.2525,
"step": 823
},
{
"epoch": 2.1074168797953963,
"grad_norm": 0.2012679343362801,
"learning_rate": 1.6587677725118483e-05,
"loss": 0.249,
"step": 824
},
{
"epoch": 2.10997442455243,
"grad_norm": 0.19324190678914918,
"learning_rate": 1.6540284360189574e-05,
"loss": 0.2476,
"step": 825
},
{
"epoch": 2.112531969309463,
"grad_norm": 0.19308515698590148,
"learning_rate": 1.6492890995260666e-05,
"loss": 0.2545,
"step": 826
},
{
"epoch": 2.1150895140664963,
"grad_norm": 0.20072878909780828,
"learning_rate": 1.6445497630331754e-05,
"loss": 0.2493,
"step": 827
},
{
"epoch": 2.1176470588235294,
"grad_norm": 0.21529840999791708,
"learning_rate": 1.6398104265402845e-05,
"loss": 0.2505,
"step": 828
},
{
"epoch": 2.1202046035805626,
"grad_norm": 0.190291814924438,
"learning_rate": 1.6350710900473933e-05,
"loss": 0.2568,
"step": 829
},
{
"epoch": 2.122762148337596,
"grad_norm": 0.1843567434491544,
"learning_rate": 1.6303317535545024e-05,
"loss": 0.235,
"step": 830
},
{
"epoch": 2.125319693094629,
"grad_norm": 0.20192839632170334,
"learning_rate": 1.6255924170616116e-05,
"loss": 0.2518,
"step": 831
},
{
"epoch": 2.1278772378516626,
"grad_norm": 0.19505086113061484,
"learning_rate": 1.6208530805687207e-05,
"loss": 0.2422,
"step": 832
},
{
"epoch": 2.130434782608696,
"grad_norm": 0.18413293323513488,
"learning_rate": 1.6161137440758295e-05,
"loss": 0.2481,
"step": 833
},
{
"epoch": 2.132992327365729,
"grad_norm": 0.19660864149905055,
"learning_rate": 1.6113744075829386e-05,
"loss": 0.2482,
"step": 834
},
{
"epoch": 2.135549872122762,
"grad_norm": 0.19108123965299506,
"learning_rate": 1.6066350710900474e-05,
"loss": 0.2488,
"step": 835
},
{
"epoch": 2.1381074168797953,
"grad_norm": 0.2054493861311576,
"learning_rate": 1.6018957345971562e-05,
"loss": 0.2508,
"step": 836
},
{
"epoch": 2.1406649616368285,
"grad_norm": 0.19933140761961352,
"learning_rate": 1.5971563981042657e-05,
"loss": 0.2526,
"step": 837
},
{
"epoch": 2.1432225063938617,
"grad_norm": 0.18520997915505424,
"learning_rate": 1.5924170616113745e-05,
"loss": 0.2553,
"step": 838
},
{
"epoch": 2.1457800511508953,
"grad_norm": 0.18142714347687713,
"learning_rate": 1.5876777251184836e-05,
"loss": 0.2404,
"step": 839
},
{
"epoch": 2.1483375959079285,
"grad_norm": 0.19332393510145196,
"learning_rate": 1.5829383886255924e-05,
"loss": 0.2608,
"step": 840
},
{
"epoch": 2.1508951406649617,
"grad_norm": 0.18239849204776917,
"learning_rate": 1.5781990521327015e-05,
"loss": 0.2472,
"step": 841
},
{
"epoch": 2.153452685421995,
"grad_norm": 0.19432247568701047,
"learning_rate": 1.5734597156398103e-05,
"loss": 0.2509,
"step": 842
},
{
"epoch": 2.156010230179028,
"grad_norm": 0.1891425736304601,
"learning_rate": 1.5687203791469195e-05,
"loss": 0.2544,
"step": 843
},
{
"epoch": 2.1585677749360612,
"grad_norm": 0.1776945543749591,
"learning_rate": 1.5639810426540286e-05,
"loss": 0.2418,
"step": 844
},
{
"epoch": 2.1611253196930944,
"grad_norm": 0.19454352996860633,
"learning_rate": 1.5592417061611374e-05,
"loss": 0.2578,
"step": 845
},
{
"epoch": 2.163682864450128,
"grad_norm": 0.19387855469120038,
"learning_rate": 1.5545023696682465e-05,
"loss": 0.2562,
"step": 846
},
{
"epoch": 2.166240409207161,
"grad_norm": 0.1884476249381793,
"learning_rate": 1.5497630331753553e-05,
"loss": 0.2435,
"step": 847
},
{
"epoch": 2.1687979539641944,
"grad_norm": 0.19682354969261456,
"learning_rate": 1.5450236966824648e-05,
"loss": 0.245,
"step": 848
},
{
"epoch": 2.1713554987212276,
"grad_norm": 0.19646857607869206,
"learning_rate": 1.5402843601895736e-05,
"loss": 0.2421,
"step": 849
},
{
"epoch": 2.1739130434782608,
"grad_norm": 0.1878274831496743,
"learning_rate": 1.5355450236966827e-05,
"loss": 0.2602,
"step": 850
},
{
"epoch": 2.176470588235294,
"grad_norm": 0.2203759013180319,
"learning_rate": 1.5308056872037915e-05,
"loss": 0.26,
"step": 851
},
{
"epoch": 2.1790281329923276,
"grad_norm": 0.20353045344689538,
"learning_rate": 1.5260663507109007e-05,
"loss": 0.2548,
"step": 852
},
{
"epoch": 2.1815856777493607,
"grad_norm": 0.17917216373663247,
"learning_rate": 1.5213270142180094e-05,
"loss": 0.2481,
"step": 853
},
{
"epoch": 2.184143222506394,
"grad_norm": 0.19965741458148648,
"learning_rate": 1.5165876777251187e-05,
"loss": 0.2369,
"step": 854
},
{
"epoch": 2.186700767263427,
"grad_norm": 0.1983107544529902,
"learning_rate": 1.5118483412322275e-05,
"loss": 0.2649,
"step": 855
},
{
"epoch": 2.1892583120204603,
"grad_norm": 0.18889444027577523,
"learning_rate": 1.5071090047393365e-05,
"loss": 0.2537,
"step": 856
},
{
"epoch": 2.1918158567774935,
"grad_norm": 0.1778943432272497,
"learning_rate": 1.5023696682464455e-05,
"loss": 0.2416,
"step": 857
},
{
"epoch": 2.1943734015345266,
"grad_norm": 0.1864614456662679,
"learning_rate": 1.4976303317535544e-05,
"loss": 0.2552,
"step": 858
},
{
"epoch": 2.1969309462915603,
"grad_norm": 0.20406945944259086,
"learning_rate": 1.4928909952606634e-05,
"loss": 0.2445,
"step": 859
},
{
"epoch": 2.1994884910485935,
"grad_norm": 0.19912968488704036,
"learning_rate": 1.4881516587677727e-05,
"loss": 0.2483,
"step": 860
},
{
"epoch": 2.2020460358056266,
"grad_norm": 0.1971404080483016,
"learning_rate": 1.4834123222748817e-05,
"loss": 0.2485,
"step": 861
},
{
"epoch": 2.20460358056266,
"grad_norm": 0.1906866495437284,
"learning_rate": 1.4786729857819906e-05,
"loss": 0.2422,
"step": 862
},
{
"epoch": 2.207161125319693,
"grad_norm": 0.2236746863882317,
"learning_rate": 1.4739336492890996e-05,
"loss": 0.2526,
"step": 863
},
{
"epoch": 2.209718670076726,
"grad_norm": 0.20479615550169253,
"learning_rate": 1.4691943127962086e-05,
"loss": 0.2406,
"step": 864
},
{
"epoch": 2.21227621483376,
"grad_norm": 0.18952772024384357,
"learning_rate": 1.4644549763033177e-05,
"loss": 0.2473,
"step": 865
},
{
"epoch": 2.214833759590793,
"grad_norm": 0.21288572261909536,
"learning_rate": 1.4597156398104267e-05,
"loss": 0.2421,
"step": 866
},
{
"epoch": 2.217391304347826,
"grad_norm": 0.22140557077938572,
"learning_rate": 1.4549763033175356e-05,
"loss": 0.2475,
"step": 867
},
{
"epoch": 2.2199488491048593,
"grad_norm": 0.20708774144192757,
"learning_rate": 1.4502369668246446e-05,
"loss": 0.2673,
"step": 868
},
{
"epoch": 2.2225063938618925,
"grad_norm": 0.18720660014130933,
"learning_rate": 1.4454976303317535e-05,
"loss": 0.247,
"step": 869
},
{
"epoch": 2.2250639386189257,
"grad_norm": 0.22218057616305048,
"learning_rate": 1.4407582938388625e-05,
"loss": 0.2563,
"step": 870
},
{
"epoch": 2.227621483375959,
"grad_norm": 0.19461791848551566,
"learning_rate": 1.4360189573459718e-05,
"loss": 0.2411,
"step": 871
},
{
"epoch": 2.2301790281329925,
"grad_norm": 0.18465999777437872,
"learning_rate": 1.4312796208530808e-05,
"loss": 0.2436,
"step": 872
},
{
"epoch": 2.2327365728900257,
"grad_norm": 0.1869706742832914,
"learning_rate": 1.4265402843601896e-05,
"loss": 0.2468,
"step": 873
},
{
"epoch": 2.235294117647059,
"grad_norm": 0.19859678673304443,
"learning_rate": 1.4218009478672985e-05,
"loss": 0.2629,
"step": 874
},
{
"epoch": 2.237851662404092,
"grad_norm": 0.18894735140342547,
"learning_rate": 1.4170616113744075e-05,
"loss": 0.2463,
"step": 875
},
{
"epoch": 2.2404092071611252,
"grad_norm": 0.1841073857339832,
"learning_rate": 1.4123222748815165e-05,
"loss": 0.2443,
"step": 876
},
{
"epoch": 2.2429667519181584,
"grad_norm": 0.19766216004613152,
"learning_rate": 1.4075829383886258e-05,
"loss": 0.2445,
"step": 877
},
{
"epoch": 2.2455242966751916,
"grad_norm": 0.20409991732668273,
"learning_rate": 1.4028436018957347e-05,
"loss": 0.2708,
"step": 878
},
{
"epoch": 2.2480818414322252,
"grad_norm": 0.19977719707950095,
"learning_rate": 1.3981042654028437e-05,
"loss": 0.2518,
"step": 879
},
{
"epoch": 2.2506393861892584,
"grad_norm": 0.2053796828512668,
"learning_rate": 1.3933649289099527e-05,
"loss": 0.2495,
"step": 880
},
{
"epoch": 2.2531969309462916,
"grad_norm": 0.17832792645098117,
"learning_rate": 1.3886255924170616e-05,
"loss": 0.2556,
"step": 881
},
{
"epoch": 2.2557544757033248,
"grad_norm": 0.18840256764724986,
"learning_rate": 1.3838862559241708e-05,
"loss": 0.2451,
"step": 882
},
{
"epoch": 2.258312020460358,
"grad_norm": 0.19398836581670234,
"learning_rate": 1.3791469194312797e-05,
"loss": 0.2473,
"step": 883
},
{
"epoch": 2.260869565217391,
"grad_norm": 0.20303902146790734,
"learning_rate": 1.3744075829383887e-05,
"loss": 0.2597,
"step": 884
},
{
"epoch": 2.2634271099744243,
"grad_norm": 0.18720894136927904,
"learning_rate": 1.3696682464454977e-05,
"loss": 0.2434,
"step": 885
},
{
"epoch": 2.265984654731458,
"grad_norm": 0.18987210304857877,
"learning_rate": 1.3649289099526066e-05,
"loss": 0.2525,
"step": 886
},
{
"epoch": 2.268542199488491,
"grad_norm": 0.2048273825139193,
"learning_rate": 1.3601895734597156e-05,
"loss": 0.2455,
"step": 887
},
{
"epoch": 2.2710997442455243,
"grad_norm": 0.19576486403594287,
"learning_rate": 1.3554502369668249e-05,
"loss": 0.2613,
"step": 888
},
{
"epoch": 2.2736572890025575,
"grad_norm": 0.20157435141172714,
"learning_rate": 1.3507109004739339e-05,
"loss": 0.2537,
"step": 889
},
{
"epoch": 2.2762148337595907,
"grad_norm": 0.18228152643827863,
"learning_rate": 1.3459715639810428e-05,
"loss": 0.2513,
"step": 890
},
{
"epoch": 2.2787723785166243,
"grad_norm": 0.19555632064091633,
"learning_rate": 1.3412322274881516e-05,
"loss": 0.2586,
"step": 891
},
{
"epoch": 2.2813299232736575,
"grad_norm": 0.203816174533527,
"learning_rate": 1.3364928909952606e-05,
"loss": 0.2442,
"step": 892
},
{
"epoch": 2.2838874680306906,
"grad_norm": 0.2029139098001244,
"learning_rate": 1.3317535545023695e-05,
"loss": 0.2561,
"step": 893
},
{
"epoch": 2.286445012787724,
"grad_norm": 0.19048244262223243,
"learning_rate": 1.3270142180094788e-05,
"loss": 0.2548,
"step": 894
},
{
"epoch": 2.289002557544757,
"grad_norm": 0.19391847669904372,
"learning_rate": 1.3222748815165878e-05,
"loss": 0.2421,
"step": 895
},
{
"epoch": 2.29156010230179,
"grad_norm": 0.17981132307135597,
"learning_rate": 1.3175355450236968e-05,
"loss": 0.2532,
"step": 896
},
{
"epoch": 2.2941176470588234,
"grad_norm": 0.17858235830519362,
"learning_rate": 1.3127962085308057e-05,
"loss": 0.2404,
"step": 897
},
{
"epoch": 2.296675191815857,
"grad_norm": 0.19117496497677566,
"learning_rate": 1.3080568720379147e-05,
"loss": 0.2593,
"step": 898
},
{
"epoch": 2.29923273657289,
"grad_norm": 0.2016899881448073,
"learning_rate": 1.3033175355450238e-05,
"loss": 0.2528,
"step": 899
},
{
"epoch": 2.3017902813299234,
"grad_norm": 0.1810650437144312,
"learning_rate": 1.2985781990521328e-05,
"loss": 0.2402,
"step": 900
},
{
"epoch": 2.3043478260869565,
"grad_norm": 0.19291047749671594,
"learning_rate": 1.2938388625592418e-05,
"loss": 0.2461,
"step": 901
},
{
"epoch": 2.3069053708439897,
"grad_norm": 0.1939959846671169,
"learning_rate": 1.2890995260663507e-05,
"loss": 0.2473,
"step": 902
},
{
"epoch": 2.309462915601023,
"grad_norm": 0.1863110176956983,
"learning_rate": 1.2843601895734597e-05,
"loss": 0.2364,
"step": 903
},
{
"epoch": 2.312020460358056,
"grad_norm": 0.17566806664980533,
"learning_rate": 1.2796208530805687e-05,
"loss": 0.2482,
"step": 904
},
{
"epoch": 2.3145780051150897,
"grad_norm": 0.19920352232738897,
"learning_rate": 1.274881516587678e-05,
"loss": 0.2559,
"step": 905
},
{
"epoch": 2.317135549872123,
"grad_norm": 0.1953502116868402,
"learning_rate": 1.270142180094787e-05,
"loss": 0.2408,
"step": 906
},
{
"epoch": 2.319693094629156,
"grad_norm": 0.18651854725906564,
"learning_rate": 1.2654028436018959e-05,
"loss": 0.2523,
"step": 907
},
{
"epoch": 2.3222506393861893,
"grad_norm": 0.1894806065906189,
"learning_rate": 1.2606635071090048e-05,
"loss": 0.2651,
"step": 908
},
{
"epoch": 2.3248081841432224,
"grad_norm": 0.18839186702018404,
"learning_rate": 1.2559241706161136e-05,
"loss": 0.2474,
"step": 909
},
{
"epoch": 2.3273657289002556,
"grad_norm": 0.19140520747243725,
"learning_rate": 1.2511848341232226e-05,
"loss": 0.2393,
"step": 910
},
{
"epoch": 2.329923273657289,
"grad_norm": 0.18330215327131463,
"learning_rate": 1.2464454976303317e-05,
"loss": 0.2528,
"step": 911
},
{
"epoch": 2.3324808184143224,
"grad_norm": 0.1932126436646379,
"learning_rate": 1.2417061611374409e-05,
"loss": 0.2565,
"step": 912
},
{
"epoch": 2.3350383631713556,
"grad_norm": 0.1950356336934161,
"learning_rate": 1.2369668246445498e-05,
"loss": 0.2457,
"step": 913
},
{
"epoch": 2.337595907928389,
"grad_norm": 0.17865872468905974,
"learning_rate": 1.2322274881516588e-05,
"loss": 0.2425,
"step": 914
},
{
"epoch": 2.340153452685422,
"grad_norm": 0.18504654975711932,
"learning_rate": 1.227488151658768e-05,
"loss": 0.2577,
"step": 915
},
{
"epoch": 2.342710997442455,
"grad_norm": 0.20222565063208944,
"learning_rate": 1.2227488151658769e-05,
"loss": 0.2581,
"step": 916
},
{
"epoch": 2.3452685421994883,
"grad_norm": 0.1838472381815511,
"learning_rate": 1.2180094786729859e-05,
"loss": 0.2542,
"step": 917
},
{
"epoch": 2.3478260869565215,
"grad_norm": 0.18346333495631081,
"learning_rate": 1.2132701421800948e-05,
"loss": 0.2366,
"step": 918
},
{
"epoch": 2.350383631713555,
"grad_norm": 0.18792845931699567,
"learning_rate": 1.2085308056872038e-05,
"loss": 0.2397,
"step": 919
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.18552873313226068,
"learning_rate": 1.2037914691943128e-05,
"loss": 0.2476,
"step": 920
},
{
"epoch": 2.3554987212276215,
"grad_norm": 0.18568764961833162,
"learning_rate": 1.1990521327014219e-05,
"loss": 0.2464,
"step": 921
},
{
"epoch": 2.3580562659846547,
"grad_norm": 0.19910274628884306,
"learning_rate": 1.1943127962085309e-05,
"loss": 0.2616,
"step": 922
},
{
"epoch": 2.360613810741688,
"grad_norm": 0.19023555512921334,
"learning_rate": 1.1895734597156398e-05,
"loss": 0.2468,
"step": 923
},
{
"epoch": 2.363171355498721,
"grad_norm": 0.18498994847552913,
"learning_rate": 1.184834123222749e-05,
"loss": 0.238,
"step": 924
},
{
"epoch": 2.3657289002557547,
"grad_norm": 0.19230699836861417,
"learning_rate": 1.180094786729858e-05,
"loss": 0.2499,
"step": 925
},
{
"epoch": 2.368286445012788,
"grad_norm": 0.18992356411703937,
"learning_rate": 1.1753554502369669e-05,
"loss": 0.2486,
"step": 926
},
{
"epoch": 2.370843989769821,
"grad_norm": 0.194179037810583,
"learning_rate": 1.1706161137440758e-05,
"loss": 0.2419,
"step": 927
},
{
"epoch": 2.373401534526854,
"grad_norm": 0.19309339760618768,
"learning_rate": 1.1658767772511848e-05,
"loss": 0.2498,
"step": 928
},
{
"epoch": 2.3759590792838874,
"grad_norm": 0.18311942664171635,
"learning_rate": 1.161137440758294e-05,
"loss": 0.2508,
"step": 929
},
{
"epoch": 2.3785166240409206,
"grad_norm": 0.1851345033868292,
"learning_rate": 1.1563981042654029e-05,
"loss": 0.2458,
"step": 930
},
{
"epoch": 2.381074168797954,
"grad_norm": 0.20539574184587675,
"learning_rate": 1.1516587677725119e-05,
"loss": 0.2535,
"step": 931
},
{
"epoch": 2.3836317135549874,
"grad_norm": 0.1892100521466075,
"learning_rate": 1.146919431279621e-05,
"loss": 0.2429,
"step": 932
},
{
"epoch": 2.3861892583120206,
"grad_norm": 0.18603312629992957,
"learning_rate": 1.14218009478673e-05,
"loss": 0.2509,
"step": 933
},
{
"epoch": 2.3887468030690537,
"grad_norm": 0.19500218393174892,
"learning_rate": 1.137440758293839e-05,
"loss": 0.2566,
"step": 934
},
{
"epoch": 2.391304347826087,
"grad_norm": 0.1826094688821734,
"learning_rate": 1.1327014218009479e-05,
"loss": 0.2463,
"step": 935
},
{
"epoch": 2.39386189258312,
"grad_norm": 0.2035505913630252,
"learning_rate": 1.1279620853080569e-05,
"loss": 0.2453,
"step": 936
},
{
"epoch": 2.3964194373401533,
"grad_norm": 0.1986182294740978,
"learning_rate": 1.1232227488151658e-05,
"loss": 0.2558,
"step": 937
},
{
"epoch": 2.398976982097187,
"grad_norm": 0.19707252280447218,
"learning_rate": 1.118483412322275e-05,
"loss": 0.2434,
"step": 938
},
{
"epoch": 2.40153452685422,
"grad_norm": 0.18477355859208972,
"learning_rate": 1.113744075829384e-05,
"loss": 0.2593,
"step": 939
},
{
"epoch": 2.4040920716112533,
"grad_norm": 0.18942958453392783,
"learning_rate": 1.1090047393364929e-05,
"loss": 0.2356,
"step": 940
},
{
"epoch": 2.4066496163682864,
"grad_norm": 0.19115020121012594,
"learning_rate": 1.104265402843602e-05,
"loss": 0.2586,
"step": 941
},
{
"epoch": 2.4092071611253196,
"grad_norm": 0.1907623500410302,
"learning_rate": 1.099526066350711e-05,
"loss": 0.2466,
"step": 942
},
{
"epoch": 2.411764705882353,
"grad_norm": 0.20551739715864323,
"learning_rate": 1.09478672985782e-05,
"loss": 0.2641,
"step": 943
},
{
"epoch": 2.414322250639386,
"grad_norm": 0.2037628239144751,
"learning_rate": 1.0900473933649289e-05,
"loss": 0.2581,
"step": 944
},
{
"epoch": 2.4168797953964196,
"grad_norm": 0.20606502494179982,
"learning_rate": 1.0853080568720379e-05,
"loss": 0.2455,
"step": 945
},
{
"epoch": 2.419437340153453,
"grad_norm": 0.19957207626288198,
"learning_rate": 1.080568720379147e-05,
"loss": 0.2417,
"step": 946
},
{
"epoch": 2.421994884910486,
"grad_norm": 0.19178247803581283,
"learning_rate": 1.075829383886256e-05,
"loss": 0.2491,
"step": 947
},
{
"epoch": 2.424552429667519,
"grad_norm": 0.1829585466891497,
"learning_rate": 1.071090047393365e-05,
"loss": 0.2622,
"step": 948
},
{
"epoch": 2.4271099744245523,
"grad_norm": 0.19009770566404205,
"learning_rate": 1.066350710900474e-05,
"loss": 0.2472,
"step": 949
},
{
"epoch": 2.4296675191815855,
"grad_norm": 0.18826570727560837,
"learning_rate": 1.061611374407583e-05,
"loss": 0.2435,
"step": 950
},
{
"epoch": 2.4322250639386187,
"grad_norm": 0.18359777168223823,
"learning_rate": 1.056872037914692e-05,
"loss": 0.2563,
"step": 951
},
{
"epoch": 2.4347826086956523,
"grad_norm": 0.18743774264051472,
"learning_rate": 1.052132701421801e-05,
"loss": 0.2501,
"step": 952
},
{
"epoch": 2.4373401534526855,
"grad_norm": 0.18190848955579414,
"learning_rate": 1.04739336492891e-05,
"loss": 0.2419,
"step": 953
},
{
"epoch": 2.4398976982097187,
"grad_norm": 0.1869056453658558,
"learning_rate": 1.0426540284360189e-05,
"loss": 0.2468,
"step": 954
},
{
"epoch": 2.442455242966752,
"grad_norm": 0.19325837584457362,
"learning_rate": 1.037914691943128e-05,
"loss": 0.2398,
"step": 955
},
{
"epoch": 2.445012787723785,
"grad_norm": 0.18958397138054392,
"learning_rate": 1.033175355450237e-05,
"loss": 0.2587,
"step": 956
},
{
"epoch": 2.4475703324808182,
"grad_norm": 0.1802569001638857,
"learning_rate": 1.028436018957346e-05,
"loss": 0.249,
"step": 957
},
{
"epoch": 2.4501278772378514,
"grad_norm": 0.19473776964299236,
"learning_rate": 1.0236966824644551e-05,
"loss": 0.2504,
"step": 958
},
{
"epoch": 2.452685421994885,
"grad_norm": 0.19318565328468898,
"learning_rate": 1.018957345971564e-05,
"loss": 0.2504,
"step": 959
},
{
"epoch": 2.455242966751918,
"grad_norm": 0.1935892471549821,
"learning_rate": 1.014218009478673e-05,
"loss": 0.2608,
"step": 960
},
{
"epoch": 2.4578005115089514,
"grad_norm": 0.2128184199845009,
"learning_rate": 1.009478672985782e-05,
"loss": 0.2547,
"step": 961
},
{
"epoch": 2.4603580562659846,
"grad_norm": 0.1894940142447489,
"learning_rate": 1.004739336492891e-05,
"loss": 0.2654,
"step": 962
},
{
"epoch": 2.4629156010230178,
"grad_norm": 0.18093993857309348,
"learning_rate": 1e-05,
"loss": 0.2214,
"step": 963
},
{
"epoch": 2.4654731457800514,
"grad_norm": 0.19178096580173365,
"learning_rate": 9.95260663507109e-06,
"loss": 0.2428,
"step": 964
},
{
"epoch": 2.4680306905370846,
"grad_norm": 0.17992616710306839,
"learning_rate": 9.90521327014218e-06,
"loss": 0.2316,
"step": 965
},
{
"epoch": 2.4705882352941178,
"grad_norm": 0.19203487435465688,
"learning_rate": 9.857819905213271e-06,
"loss": 0.2461,
"step": 966
},
{
"epoch": 2.473145780051151,
"grad_norm": 0.18682427737935345,
"learning_rate": 9.810426540284361e-06,
"loss": 0.2546,
"step": 967
},
{
"epoch": 2.475703324808184,
"grad_norm": 0.18859469892005637,
"learning_rate": 9.76303317535545e-06,
"loss": 0.2622,
"step": 968
},
{
"epoch": 2.4782608695652173,
"grad_norm": 0.18438885948965986,
"learning_rate": 9.71563981042654e-06,
"loss": 0.2545,
"step": 969
},
{
"epoch": 2.4808184143222505,
"grad_norm": 0.19779141574116138,
"learning_rate": 9.66824644549763e-06,
"loss": 0.244,
"step": 970
},
{
"epoch": 2.483375959079284,
"grad_norm": 0.19565635148681754,
"learning_rate": 9.62085308056872e-06,
"loss": 0.2659,
"step": 971
},
{
"epoch": 2.4859335038363173,
"grad_norm": 0.17628127887779274,
"learning_rate": 9.573459715639811e-06,
"loss": 0.2445,
"step": 972
},
{
"epoch": 2.4884910485933505,
"grad_norm": 0.1800315251692981,
"learning_rate": 9.5260663507109e-06,
"loss": 0.2524,
"step": 973
},
{
"epoch": 2.4910485933503836,
"grad_norm": 0.1863667327196091,
"learning_rate": 9.47867298578199e-06,
"loss": 0.2555,
"step": 974
},
{
"epoch": 2.493606138107417,
"grad_norm": 0.19474788386072336,
"learning_rate": 9.431279620853082e-06,
"loss": 0.2607,
"step": 975
},
{
"epoch": 2.49616368286445,
"grad_norm": 0.18695877540681222,
"learning_rate": 9.383886255924171e-06,
"loss": 0.2594,
"step": 976
},
{
"epoch": 2.498721227621483,
"grad_norm": 0.18123819527715856,
"learning_rate": 9.336492890995263e-06,
"loss": 0.2388,
"step": 977
},
{
"epoch": 2.501278772378517,
"grad_norm": 0.1805022447990822,
"learning_rate": 9.28909952606635e-06,
"loss": 0.2611,
"step": 978
},
{
"epoch": 2.50383631713555,
"grad_norm": 0.20725044894441963,
"learning_rate": 9.24170616113744e-06,
"loss": 0.2597,
"step": 979
},
{
"epoch": 2.506393861892583,
"grad_norm": 0.17978028465292306,
"learning_rate": 9.194312796208532e-06,
"loss": 0.2546,
"step": 980
},
{
"epoch": 2.5089514066496164,
"grad_norm": 0.19895373521772294,
"learning_rate": 9.146919431279621e-06,
"loss": 0.2592,
"step": 981
},
{
"epoch": 2.5115089514066495,
"grad_norm": 0.1908106662263474,
"learning_rate": 9.09952606635071e-06,
"loss": 0.2617,
"step": 982
},
{
"epoch": 2.5140664961636827,
"grad_norm": 0.17851227882118903,
"learning_rate": 9.052132701421802e-06,
"loss": 0.2438,
"step": 983
},
{
"epoch": 2.516624040920716,
"grad_norm": 0.18752114855298738,
"learning_rate": 9.004739336492892e-06,
"loss": 0.2443,
"step": 984
},
{
"epoch": 2.5191815856777495,
"grad_norm": 0.2066530632997492,
"learning_rate": 8.957345971563981e-06,
"loss": 0.2518,
"step": 985
},
{
"epoch": 2.5217391304347827,
"grad_norm": 0.17903017399321067,
"learning_rate": 8.909952606635073e-06,
"loss": 0.2551,
"step": 986
},
{
"epoch": 2.524296675191816,
"grad_norm": 0.1813455062016161,
"learning_rate": 8.86255924170616e-06,
"loss": 0.2477,
"step": 987
},
{
"epoch": 2.526854219948849,
"grad_norm": 0.19991953255257042,
"learning_rate": 8.81516587677725e-06,
"loss": 0.2518,
"step": 988
},
{
"epoch": 2.5294117647058822,
"grad_norm": 0.19321012124999268,
"learning_rate": 8.767772511848342e-06,
"loss": 0.2586,
"step": 989
},
{
"epoch": 2.531969309462916,
"grad_norm": 0.17877646902912023,
"learning_rate": 8.720379146919431e-06,
"loss": 0.2439,
"step": 990
},
{
"epoch": 2.5345268542199486,
"grad_norm": 0.18016347358333068,
"learning_rate": 8.672985781990521e-06,
"loss": 0.2504,
"step": 991
},
{
"epoch": 2.5370843989769822,
"grad_norm": 0.19598330654437415,
"learning_rate": 8.625592417061612e-06,
"loss": 0.2402,
"step": 992
},
{
"epoch": 2.5396419437340154,
"grad_norm": 0.19085013146167623,
"learning_rate": 8.578199052132702e-06,
"loss": 0.2518,
"step": 993
},
{
"epoch": 2.5421994884910486,
"grad_norm": 0.1835210269356536,
"learning_rate": 8.530805687203793e-06,
"loss": 0.2457,
"step": 994
},
{
"epoch": 2.544757033248082,
"grad_norm": 0.1808657315125499,
"learning_rate": 8.483412322274883e-06,
"loss": 0.2581,
"step": 995
},
{
"epoch": 2.547314578005115,
"grad_norm": 0.17348100084546994,
"learning_rate": 8.436018957345971e-06,
"loss": 0.2361,
"step": 996
},
{
"epoch": 2.5498721227621486,
"grad_norm": 0.18526137266320347,
"learning_rate": 8.388625592417062e-06,
"loss": 0.2518,
"step": 997
},
{
"epoch": 2.5524296675191813,
"grad_norm": 0.1866598354500443,
"learning_rate": 8.341232227488152e-06,
"loss": 0.2489,
"step": 998
},
{
"epoch": 2.554987212276215,
"grad_norm": 0.1866669094430096,
"learning_rate": 8.293838862559241e-06,
"loss": 0.2612,
"step": 999
},
{
"epoch": 2.557544757033248,
"grad_norm": 0.20188335606638064,
"learning_rate": 8.246445497630333e-06,
"loss": 0.2394,
"step": 1000
},
{
"epoch": 2.5601023017902813,
"grad_norm": 0.17491742752344783,
"learning_rate": 8.199052132701422e-06,
"loss": 0.2377,
"step": 1001
},
{
"epoch": 2.5626598465473145,
"grad_norm": 0.17572988640896128,
"learning_rate": 8.151658767772512e-06,
"loss": 0.2523,
"step": 1002
},
{
"epoch": 2.5652173913043477,
"grad_norm": 0.1850486906047413,
"learning_rate": 8.104265402843603e-06,
"loss": 0.2604,
"step": 1003
},
{
"epoch": 2.5677749360613813,
"grad_norm": 0.18456694735716317,
"learning_rate": 8.056872037914693e-06,
"loss": 0.2579,
"step": 1004
},
{
"epoch": 2.5703324808184145,
"grad_norm": 0.1989825021126641,
"learning_rate": 8.009478672985781e-06,
"loss": 0.2456,
"step": 1005
},
{
"epoch": 2.5728900255754477,
"grad_norm": 0.19392351458658866,
"learning_rate": 7.962085308056872e-06,
"loss": 0.2542,
"step": 1006
},
{
"epoch": 2.575447570332481,
"grad_norm": 0.1803390415874974,
"learning_rate": 7.914691943127962e-06,
"loss": 0.243,
"step": 1007
},
{
"epoch": 2.578005115089514,
"grad_norm": 0.18345024591378195,
"learning_rate": 7.867298578199052e-06,
"loss": 0.2451,
"step": 1008
},
{
"epoch": 2.580562659846547,
"grad_norm": 0.1941629539774514,
"learning_rate": 7.819905213270143e-06,
"loss": 0.2484,
"step": 1009
},
{
"epoch": 2.5831202046035804,
"grad_norm": 0.20207081751890732,
"learning_rate": 7.772511848341233e-06,
"loss": 0.2562,
"step": 1010
},
{
"epoch": 2.585677749360614,
"grad_norm": 0.18062688142042024,
"learning_rate": 7.725118483412324e-06,
"loss": 0.2454,
"step": 1011
},
{
"epoch": 2.588235294117647,
"grad_norm": 0.18172987412926497,
"learning_rate": 7.677725118483414e-06,
"loss": 0.258,
"step": 1012
},
{
"epoch": 2.5907928388746804,
"grad_norm": 0.1910447725518475,
"learning_rate": 7.630331753554503e-06,
"loss": 0.2547,
"step": 1013
},
{
"epoch": 2.5933503836317136,
"grad_norm": 0.18183009657939525,
"learning_rate": 7.582938388625594e-06,
"loss": 0.2477,
"step": 1014
},
{
"epoch": 2.5959079283887467,
"grad_norm": 0.19084392574095072,
"learning_rate": 7.5355450236966825e-06,
"loss": 0.2535,
"step": 1015
},
{
"epoch": 2.59846547314578,
"grad_norm": 0.19660855958741716,
"learning_rate": 7.488151658767772e-06,
"loss": 0.2542,
"step": 1016
},
{
"epoch": 2.601023017902813,
"grad_norm": 0.19119145619102845,
"learning_rate": 7.4407582938388635e-06,
"loss": 0.2652,
"step": 1017
},
{
"epoch": 2.6035805626598467,
"grad_norm": 0.18403920655569364,
"learning_rate": 7.393364928909953e-06,
"loss": 0.2507,
"step": 1018
},
{
"epoch": 2.60613810741688,
"grad_norm": 0.18051231326303438,
"learning_rate": 7.345971563981043e-06,
"loss": 0.2457,
"step": 1019
},
{
"epoch": 2.608695652173913,
"grad_norm": 0.18075516190798283,
"learning_rate": 7.298578199052133e-06,
"loss": 0.252,
"step": 1020
},
{
"epoch": 2.6112531969309463,
"grad_norm": 0.1788096745482508,
"learning_rate": 7.251184834123223e-06,
"loss": 0.2436,
"step": 1021
},
{
"epoch": 2.6138107416879794,
"grad_norm": 0.1824092891963844,
"learning_rate": 7.2037914691943126e-06,
"loss": 0.2432,
"step": 1022
},
{
"epoch": 2.6163682864450126,
"grad_norm": 0.16862388714463997,
"learning_rate": 7.156398104265404e-06,
"loss": 0.2432,
"step": 1023
},
{
"epoch": 2.618925831202046,
"grad_norm": 0.17677820353677443,
"learning_rate": 7.109004739336493e-06,
"loss": 0.2454,
"step": 1024
},
{
"epoch": 2.6214833759590794,
"grad_norm": 0.1749578021536912,
"learning_rate": 7.061611374407582e-06,
"loss": 0.2516,
"step": 1025
},
{
"epoch": 2.6240409207161126,
"grad_norm": 0.1746709607811344,
"learning_rate": 7.014218009478674e-06,
"loss": 0.2393,
"step": 1026
},
{
"epoch": 2.626598465473146,
"grad_norm": 0.1774898930232003,
"learning_rate": 6.966824644549763e-06,
"loss": 0.2488,
"step": 1027
},
{
"epoch": 2.629156010230179,
"grad_norm": 0.17292145541011766,
"learning_rate": 6.919431279620854e-06,
"loss": 0.2439,
"step": 1028
},
{
"epoch": 2.631713554987212,
"grad_norm": 0.25017047237469586,
"learning_rate": 6.8720379146919435e-06,
"loss": 0.2666,
"step": 1029
},
{
"epoch": 2.634271099744246,
"grad_norm": 0.1802705434620767,
"learning_rate": 6.824644549763033e-06,
"loss": 0.2611,
"step": 1030
},
{
"epoch": 2.6368286445012785,
"grad_norm": 0.18448765710220488,
"learning_rate": 6.7772511848341244e-06,
"loss": 0.2407,
"step": 1031
},
{
"epoch": 2.639386189258312,
"grad_norm": 0.1740367294783914,
"learning_rate": 6.729857819905214e-06,
"loss": 0.2459,
"step": 1032
},
{
"epoch": 2.6419437340153453,
"grad_norm": 0.17677782819853,
"learning_rate": 6.682464454976303e-06,
"loss": 0.2531,
"step": 1033
},
{
"epoch": 2.6445012787723785,
"grad_norm": 0.18226239340272907,
"learning_rate": 6.635071090047394e-06,
"loss": 0.2488,
"step": 1034
},
{
"epoch": 2.6470588235294117,
"grad_norm": 0.17405875174384855,
"learning_rate": 6.587677725118484e-06,
"loss": 0.2445,
"step": 1035
},
{
"epoch": 2.649616368286445,
"grad_norm": 0.1776309384953249,
"learning_rate": 6.5402843601895735e-06,
"loss": 0.2401,
"step": 1036
},
{
"epoch": 2.6521739130434785,
"grad_norm": 0.18334105397117928,
"learning_rate": 6.492890995260664e-06,
"loss": 0.2503,
"step": 1037
},
{
"epoch": 2.6547314578005117,
"grad_norm": 0.17986542354916346,
"learning_rate": 6.445497630331754e-06,
"loss": 0.2489,
"step": 1038
},
{
"epoch": 2.657289002557545,
"grad_norm": 0.17690617151115767,
"learning_rate": 6.398104265402843e-06,
"loss": 0.2403,
"step": 1039
},
{
"epoch": 2.659846547314578,
"grad_norm": 0.18280012481225613,
"learning_rate": 6.350710900473935e-06,
"loss": 0.2498,
"step": 1040
},
{
"epoch": 2.662404092071611,
"grad_norm": 0.17506102024381995,
"learning_rate": 6.303317535545024e-06,
"loss": 0.2308,
"step": 1041
},
{
"epoch": 2.6649616368286444,
"grad_norm": 0.18790705934428378,
"learning_rate": 6.255924170616113e-06,
"loss": 0.2529,
"step": 1042
},
{
"epoch": 2.6675191815856776,
"grad_norm": 0.1892712596775323,
"learning_rate": 6.208530805687204e-06,
"loss": 0.2432,
"step": 1043
},
{
"epoch": 2.670076726342711,
"grad_norm": 0.19005999786944971,
"learning_rate": 6.161137440758294e-06,
"loss": 0.2423,
"step": 1044
},
{
"epoch": 2.6726342710997444,
"grad_norm": 0.1845872169401998,
"learning_rate": 6.1137440758293845e-06,
"loss": 0.2593,
"step": 1045
},
{
"epoch": 2.6751918158567776,
"grad_norm": 0.18704678458411442,
"learning_rate": 6.066350710900474e-06,
"loss": 0.2391,
"step": 1046
},
{
"epoch": 2.6777493606138107,
"grad_norm": 0.17913018163417851,
"learning_rate": 6.018957345971564e-06,
"loss": 0.2405,
"step": 1047
},
{
"epoch": 2.680306905370844,
"grad_norm": 0.19472560672322844,
"learning_rate": 5.971563981042654e-06,
"loss": 0.2505,
"step": 1048
},
{
"epoch": 2.682864450127877,
"grad_norm": 0.18019457992632396,
"learning_rate": 5.924170616113745e-06,
"loss": 0.24,
"step": 1049
},
{
"epoch": 2.6854219948849103,
"grad_norm": 0.17330920592908153,
"learning_rate": 5.876777251184834e-06,
"loss": 0.2544,
"step": 1050
},
{
"epoch": 2.687979539641944,
"grad_norm": 0.1815334540303024,
"learning_rate": 5.829383886255924e-06,
"loss": 0.2466,
"step": 1051
},
{
"epoch": 2.690537084398977,
"grad_norm": 0.19496083605348435,
"learning_rate": 5.7819905213270145e-06,
"loss": 0.252,
"step": 1052
},
{
"epoch": 2.6930946291560103,
"grad_norm": 0.19241164389164786,
"learning_rate": 5.734597156398105e-06,
"loss": 0.2504,
"step": 1053
},
{
"epoch": 2.6956521739130435,
"grad_norm": 0.45538670179365104,
"learning_rate": 5.687203791469195e-06,
"loss": 0.26,
"step": 1054
},
{
"epoch": 2.6982097186700766,
"grad_norm": 0.18008218699297715,
"learning_rate": 5.639810426540284e-06,
"loss": 0.2493,
"step": 1055
},
{
"epoch": 2.70076726342711,
"grad_norm": 0.18340270921249122,
"learning_rate": 5.592417061611375e-06,
"loss": 0.237,
"step": 1056
},
{
"epoch": 2.703324808184143,
"grad_norm": 0.22199762048479815,
"learning_rate": 5.5450236966824644e-06,
"loss": 0.2509,
"step": 1057
},
{
"epoch": 2.7058823529411766,
"grad_norm": 0.1761889836482758,
"learning_rate": 5.497630331753555e-06,
"loss": 0.2502,
"step": 1058
},
{
"epoch": 2.70843989769821,
"grad_norm": 0.1788757958818801,
"learning_rate": 5.4502369668246446e-06,
"loss": 0.2548,
"step": 1059
},
{
"epoch": 2.710997442455243,
"grad_norm": 0.17679336863109477,
"learning_rate": 5.402843601895735e-06,
"loss": 0.2511,
"step": 1060
},
{
"epoch": 2.713554987212276,
"grad_norm": 0.18320215458542266,
"learning_rate": 5.355450236966825e-06,
"loss": 0.2394,
"step": 1061
},
{
"epoch": 2.7161125319693094,
"grad_norm": 0.1954816098192057,
"learning_rate": 5.308056872037915e-06,
"loss": 0.2686,
"step": 1062
},
{
"epoch": 2.718670076726343,
"grad_norm": 0.17601162272580267,
"learning_rate": 5.260663507109005e-06,
"loss": 0.2428,
"step": 1063
},
{
"epoch": 2.7212276214833757,
"grad_norm": 0.17667083168107164,
"learning_rate": 5.2132701421800945e-06,
"loss": 0.2538,
"step": 1064
},
{
"epoch": 2.7237851662404093,
"grad_norm": 0.17964593927923134,
"learning_rate": 5.165876777251185e-06,
"loss": 0.2515,
"step": 1065
},
{
"epoch": 2.7263427109974425,
"grad_norm": 0.18415215339401061,
"learning_rate": 5.1184834123222755e-06,
"loss": 0.2467,
"step": 1066
},
{
"epoch": 2.7289002557544757,
"grad_norm": 0.18206983493291928,
"learning_rate": 5.071090047393365e-06,
"loss": 0.2505,
"step": 1067
},
{
"epoch": 2.731457800511509,
"grad_norm": 0.17702957807117964,
"learning_rate": 5.023696682464455e-06,
"loss": 0.2481,
"step": 1068
},
{
"epoch": 2.734015345268542,
"grad_norm": 0.18941559797605062,
"learning_rate": 4.976303317535545e-06,
"loss": 0.2515,
"step": 1069
},
{
"epoch": 2.7365728900255757,
"grad_norm": 0.17866021653822645,
"learning_rate": 4.928909952606636e-06,
"loss": 0.2573,
"step": 1070
},
{
"epoch": 2.7391304347826084,
"grad_norm": 0.1771695946103805,
"learning_rate": 4.881516587677725e-06,
"loss": 0.2468,
"step": 1071
},
{
"epoch": 2.741687979539642,
"grad_norm": 0.18877889489990468,
"learning_rate": 4.834123222748815e-06,
"loss": 0.2513,
"step": 1072
},
{
"epoch": 2.7442455242966752,
"grad_norm": 0.18206987667415467,
"learning_rate": 4.7867298578199055e-06,
"loss": 0.261,
"step": 1073
},
{
"epoch": 2.7468030690537084,
"grad_norm": 0.17701923743513961,
"learning_rate": 4.739336492890995e-06,
"loss": 0.2382,
"step": 1074
},
{
"epoch": 2.7493606138107416,
"grad_norm": 0.17540739260356206,
"learning_rate": 4.691943127962086e-06,
"loss": 0.2252,
"step": 1075
},
{
"epoch": 2.7519181585677748,
"grad_norm": 0.18630964353133092,
"learning_rate": 4.644549763033175e-06,
"loss": 0.2641,
"step": 1076
},
{
"epoch": 2.7544757033248084,
"grad_norm": 0.18014938799060165,
"learning_rate": 4.597156398104266e-06,
"loss": 0.2474,
"step": 1077
},
{
"epoch": 2.7570332480818416,
"grad_norm": 0.17307796418005664,
"learning_rate": 4.549763033175355e-06,
"loss": 0.2494,
"step": 1078
},
{
"epoch": 2.7595907928388748,
"grad_norm": 0.17931809836460505,
"learning_rate": 4.502369668246446e-06,
"loss": 0.2489,
"step": 1079
},
{
"epoch": 2.762148337595908,
"grad_norm": 0.18594644827320997,
"learning_rate": 4.454976303317536e-06,
"loss": 0.2592,
"step": 1080
},
{
"epoch": 2.764705882352941,
"grad_norm": 0.1761943169656133,
"learning_rate": 4.407582938388625e-06,
"loss": 0.2478,
"step": 1081
},
{
"epoch": 2.7672634271099743,
"grad_norm": 0.173916317774858,
"learning_rate": 4.360189573459716e-06,
"loss": 0.247,
"step": 1082
},
{
"epoch": 2.7698209718670075,
"grad_norm": 0.17873537641991927,
"learning_rate": 4.312796208530806e-06,
"loss": 0.2506,
"step": 1083
},
{
"epoch": 2.772378516624041,
"grad_norm": 0.16911433381467955,
"learning_rate": 4.265402843601897e-06,
"loss": 0.2508,
"step": 1084
},
{
"epoch": 2.7749360613810743,
"grad_norm": 0.1791375462513097,
"learning_rate": 4.2180094786729854e-06,
"loss": 0.2388,
"step": 1085
},
{
"epoch": 2.7774936061381075,
"grad_norm": 0.1784797467796097,
"learning_rate": 4.170616113744076e-06,
"loss": 0.2572,
"step": 1086
},
{
"epoch": 2.7800511508951407,
"grad_norm": 0.18492104457145292,
"learning_rate": 4.123222748815166e-06,
"loss": 0.2384,
"step": 1087
},
{
"epoch": 2.782608695652174,
"grad_norm": 0.18012702780394255,
"learning_rate": 4.075829383886256e-06,
"loss": 0.2307,
"step": 1088
},
{
"epoch": 2.785166240409207,
"grad_norm": 0.17320341741197393,
"learning_rate": 4.0284360189573465e-06,
"loss": 0.2383,
"step": 1089
},
{
"epoch": 2.78772378516624,
"grad_norm": 0.17354330549732871,
"learning_rate": 3.981042654028436e-06,
"loss": 0.2284,
"step": 1090
},
{
"epoch": 2.790281329923274,
"grad_norm": 0.1817183194818464,
"learning_rate": 3.933649289099526e-06,
"loss": 0.2325,
"step": 1091
},
{
"epoch": 2.792838874680307,
"grad_norm": 0.1740902128176595,
"learning_rate": 3.886255924170616e-06,
"loss": 0.2464,
"step": 1092
},
{
"epoch": 2.79539641943734,
"grad_norm": 0.17979740364634914,
"learning_rate": 3.838862559241707e-06,
"loss": 0.2448,
"step": 1093
},
{
"epoch": 2.7979539641943734,
"grad_norm": 0.18910478213308557,
"learning_rate": 3.791469194312797e-06,
"loss": 0.2529,
"step": 1094
},
{
"epoch": 2.8005115089514065,
"grad_norm": 0.17562387593048473,
"learning_rate": 3.744075829383886e-06,
"loss": 0.2516,
"step": 1095
},
{
"epoch": 2.80306905370844,
"grad_norm": 0.17037649183598133,
"learning_rate": 3.6966824644549766e-06,
"loss": 0.2304,
"step": 1096
},
{
"epoch": 2.805626598465473,
"grad_norm": 0.1865715453669857,
"learning_rate": 3.6492890995260666e-06,
"loss": 0.2484,
"step": 1097
},
{
"epoch": 2.8081841432225065,
"grad_norm": 0.17956564469501413,
"learning_rate": 3.6018957345971563e-06,
"loss": 0.2429,
"step": 1098
},
{
"epoch": 2.8107416879795397,
"grad_norm": 0.17380201982016105,
"learning_rate": 3.5545023696682464e-06,
"loss": 0.2475,
"step": 1099
},
{
"epoch": 2.813299232736573,
"grad_norm": 0.18949661964378972,
"learning_rate": 3.507109004739337e-06,
"loss": 0.254,
"step": 1100
},
{
"epoch": 2.815856777493606,
"grad_norm": 0.18281900420620492,
"learning_rate": 3.459715639810427e-06,
"loss": 0.246,
"step": 1101
},
{
"epoch": 2.8184143222506393,
"grad_norm": 0.19046092151157248,
"learning_rate": 3.4123222748815165e-06,
"loss": 0.252,
"step": 1102
},
{
"epoch": 2.820971867007673,
"grad_norm": 0.17912805262085352,
"learning_rate": 3.364928909952607e-06,
"loss": 0.2528,
"step": 1103
},
{
"epoch": 2.8235294117647056,
"grad_norm": 0.16539721286530049,
"learning_rate": 3.317535545023697e-06,
"loss": 0.2452,
"step": 1104
},
{
"epoch": 2.8260869565217392,
"grad_norm": 0.18089995003561432,
"learning_rate": 3.2701421800947867e-06,
"loss": 0.2442,
"step": 1105
},
{
"epoch": 2.8286445012787724,
"grad_norm": 0.17961058615086165,
"learning_rate": 3.222748815165877e-06,
"loss": 0.2532,
"step": 1106
},
{
"epoch": 2.8312020460358056,
"grad_norm": 0.17670809278729904,
"learning_rate": 3.1753554502369673e-06,
"loss": 0.2469,
"step": 1107
},
{
"epoch": 2.833759590792839,
"grad_norm": 0.17184672240491808,
"learning_rate": 3.1279620853080565e-06,
"loss": 0.253,
"step": 1108
},
{
"epoch": 2.836317135549872,
"grad_norm": 0.18342580492222546,
"learning_rate": 3.080568720379147e-06,
"loss": 0.2447,
"step": 1109
},
{
"epoch": 2.8388746803069056,
"grad_norm": 0.17195526482252144,
"learning_rate": 3.033175355450237e-06,
"loss": 0.2463,
"step": 1110
},
{
"epoch": 2.8414322250639388,
"grad_norm": 0.1792058703015505,
"learning_rate": 2.985781990521327e-06,
"loss": 0.2498,
"step": 1111
},
{
"epoch": 2.843989769820972,
"grad_norm": 0.17565132753951782,
"learning_rate": 2.938388625592417e-06,
"loss": 0.2553,
"step": 1112
},
{
"epoch": 2.846547314578005,
"grad_norm": 0.18056116078607748,
"learning_rate": 2.8909952606635073e-06,
"loss": 0.254,
"step": 1113
},
{
"epoch": 2.8491048593350383,
"grad_norm": 0.17874160432603925,
"learning_rate": 2.8436018957345973e-06,
"loss": 0.246,
"step": 1114
},
{
"epoch": 2.8516624040920715,
"grad_norm": 0.17126107844733118,
"learning_rate": 2.7962085308056874e-06,
"loss": 0.2437,
"step": 1115
},
{
"epoch": 2.8542199488491047,
"grad_norm": 0.16804735225501954,
"learning_rate": 2.7488151658767775e-06,
"loss": 0.2338,
"step": 1116
},
{
"epoch": 2.8567774936061383,
"grad_norm": 0.17871874445538027,
"learning_rate": 2.7014218009478675e-06,
"loss": 0.2504,
"step": 1117
},
{
"epoch": 2.8593350383631715,
"grad_norm": 0.16605891064626507,
"learning_rate": 2.6540284360189576e-06,
"loss": 0.2431,
"step": 1118
},
{
"epoch": 2.8618925831202047,
"grad_norm": 0.1803054733026333,
"learning_rate": 2.6066350710900472e-06,
"loss": 0.2508,
"step": 1119
},
{
"epoch": 2.864450127877238,
"grad_norm": 0.17422585403639088,
"learning_rate": 2.5592417061611377e-06,
"loss": 0.2462,
"step": 1120
},
{
"epoch": 2.867007672634271,
"grad_norm": 0.17364151884560752,
"learning_rate": 2.5118483412322274e-06,
"loss": 0.2583,
"step": 1121
},
{
"epoch": 2.869565217391304,
"grad_norm": 0.1746615119917103,
"learning_rate": 2.464454976303318e-06,
"loss": 0.2421,
"step": 1122
},
{
"epoch": 2.8721227621483374,
"grad_norm": 0.17498173832710498,
"learning_rate": 2.4170616113744075e-06,
"loss": 0.2437,
"step": 1123
},
{
"epoch": 2.874680306905371,
"grad_norm": 0.16581915880183135,
"learning_rate": 2.3696682464454976e-06,
"loss": 0.2398,
"step": 1124
},
{
"epoch": 2.877237851662404,
"grad_norm": 0.16997023135551514,
"learning_rate": 2.3222748815165876e-06,
"loss": 0.2467,
"step": 1125
},
{
"epoch": 2.8797953964194374,
"grad_norm": 0.1691406192934443,
"learning_rate": 2.2748815165876777e-06,
"loss": 0.238,
"step": 1126
},
{
"epoch": 2.8823529411764706,
"grad_norm": 0.178904797391566,
"learning_rate": 2.227488151658768e-06,
"loss": 0.2664,
"step": 1127
},
{
"epoch": 2.8849104859335037,
"grad_norm": 0.17160493563940152,
"learning_rate": 2.180094786729858e-06,
"loss": 0.2405,
"step": 1128
},
{
"epoch": 2.887468030690537,
"grad_norm": 0.17542566078202718,
"learning_rate": 2.1327014218009483e-06,
"loss": 0.2514,
"step": 1129
},
{
"epoch": 2.89002557544757,
"grad_norm": 0.18375078276401854,
"learning_rate": 2.085308056872038e-06,
"loss": 0.2558,
"step": 1130
},
{
"epoch": 2.8925831202046037,
"grad_norm": 0.1746613382211159,
"learning_rate": 2.037914691943128e-06,
"loss": 0.2498,
"step": 1131
},
{
"epoch": 2.895140664961637,
"grad_norm": 0.17688085965152694,
"learning_rate": 1.990521327014218e-06,
"loss": 0.2439,
"step": 1132
},
{
"epoch": 2.89769820971867,
"grad_norm": 0.17262679123443198,
"learning_rate": 1.943127962085308e-06,
"loss": 0.232,
"step": 1133
},
{
"epoch": 2.9002557544757033,
"grad_norm": 0.16308493274857086,
"learning_rate": 1.8957345971563984e-06,
"loss": 0.2372,
"step": 1134
},
{
"epoch": 2.9028132992327365,
"grad_norm": 0.17307065518752035,
"learning_rate": 1.8483412322274883e-06,
"loss": 0.2496,
"step": 1135
},
{
"epoch": 2.90537084398977,
"grad_norm": 0.17570971869354174,
"learning_rate": 1.8009478672985781e-06,
"loss": 0.255,
"step": 1136
},
{
"epoch": 2.907928388746803,
"grad_norm": 0.17365101323268625,
"learning_rate": 1.7535545023696684e-06,
"loss": 0.2463,
"step": 1137
},
{
"epoch": 2.9104859335038364,
"grad_norm": 0.17347423801664208,
"learning_rate": 1.7061611374407583e-06,
"loss": 0.2596,
"step": 1138
},
{
"epoch": 2.9130434782608696,
"grad_norm": 0.16784563506361547,
"learning_rate": 1.6587677725118486e-06,
"loss": 0.2389,
"step": 1139
},
{
"epoch": 2.915601023017903,
"grad_norm": 0.1815931456665935,
"learning_rate": 1.6113744075829384e-06,
"loss": 0.2601,
"step": 1140
},
{
"epoch": 2.918158567774936,
"grad_norm": 0.179248730479389,
"learning_rate": 1.5639810426540283e-06,
"loss": 0.2613,
"step": 1141
},
{
"epoch": 2.920716112531969,
"grad_norm": 0.16773919256420614,
"learning_rate": 1.5165876777251185e-06,
"loss": 0.2439,
"step": 1142
},
{
"epoch": 2.923273657289003,
"grad_norm": 0.17107783453591816,
"learning_rate": 1.4691943127962086e-06,
"loss": 0.2457,
"step": 1143
},
{
"epoch": 2.9258312020460355,
"grad_norm": 0.17079854678851109,
"learning_rate": 1.4218009478672987e-06,
"loss": 0.2435,
"step": 1144
},
{
"epoch": 2.928388746803069,
"grad_norm": 0.16349978016152963,
"learning_rate": 1.3744075829383887e-06,
"loss": 0.2304,
"step": 1145
},
{
"epoch": 2.9309462915601023,
"grad_norm": 0.1673539709565629,
"learning_rate": 1.3270142180094788e-06,
"loss": 0.2436,
"step": 1146
},
{
"epoch": 2.9335038363171355,
"grad_norm": 0.1708287520831406,
"learning_rate": 1.2796208530805689e-06,
"loss": 0.2622,
"step": 1147
},
{
"epoch": 2.9360613810741687,
"grad_norm": 0.17066789815162614,
"learning_rate": 1.232227488151659e-06,
"loss": 0.2493,
"step": 1148
},
{
"epoch": 2.938618925831202,
"grad_norm": 0.1716388233712423,
"learning_rate": 1.1848341232227488e-06,
"loss": 0.2501,
"step": 1149
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.18539699654347663,
"learning_rate": 1.1374407582938388e-06,
"loss": 0.2544,
"step": 1150
},
{
"epoch": 2.9437340153452687,
"grad_norm": 0.1685342419724911,
"learning_rate": 1.090047393364929e-06,
"loss": 0.2563,
"step": 1151
},
{
"epoch": 2.946291560102302,
"grad_norm": 0.17685764883456795,
"learning_rate": 1.042654028436019e-06,
"loss": 0.2484,
"step": 1152
},
{
"epoch": 2.948849104859335,
"grad_norm": 0.17233103902620187,
"learning_rate": 9.95260663507109e-07,
"loss": 0.2471,
"step": 1153
},
{
"epoch": 2.9514066496163682,
"grad_norm": 0.17927170332506637,
"learning_rate": 9.478672985781992e-07,
"loss": 0.2463,
"step": 1154
},
{
"epoch": 2.9539641943734014,
"grad_norm": 0.16939187906182812,
"learning_rate": 9.004739336492891e-07,
"loss": 0.2449,
"step": 1155
},
{
"epoch": 2.9565217391304346,
"grad_norm": 0.16994779580235087,
"learning_rate": 8.530805687203791e-07,
"loss": 0.2449,
"step": 1156
},
{
"epoch": 2.959079283887468,
"grad_norm": 0.1699706601774477,
"learning_rate": 8.056872037914692e-07,
"loss": 0.2481,
"step": 1157
},
{
"epoch": 2.9616368286445014,
"grad_norm": 0.16870847051946605,
"learning_rate": 7.582938388625593e-07,
"loss": 0.246,
"step": 1158
},
{
"epoch": 2.9641943734015346,
"grad_norm": 0.1698101486377668,
"learning_rate": 7.109004739336493e-07,
"loss": 0.2554,
"step": 1159
},
{
"epoch": 2.9667519181585678,
"grad_norm": 0.16651213070393764,
"learning_rate": 6.635071090047394e-07,
"loss": 0.2358,
"step": 1160
},
{
"epoch": 2.969309462915601,
"grad_norm": 0.17237546004566973,
"learning_rate": 6.161137440758295e-07,
"loss": 0.2593,
"step": 1161
},
{
"epoch": 2.971867007672634,
"grad_norm": 0.1669612454811503,
"learning_rate": 5.687203791469194e-07,
"loss": 0.2422,
"step": 1162
},
{
"epoch": 2.9744245524296673,
"grad_norm": 0.16627677687780293,
"learning_rate": 5.213270142180095e-07,
"loss": 0.2524,
"step": 1163
},
{
"epoch": 2.976982097186701,
"grad_norm": 0.17381593936793757,
"learning_rate": 4.739336492890996e-07,
"loss": 0.2605,
"step": 1164
},
{
"epoch": 2.979539641943734,
"grad_norm": 0.1685052599832634,
"learning_rate": 4.2654028436018957e-07,
"loss": 0.2436,
"step": 1165
},
{
"epoch": 2.9820971867007673,
"grad_norm": 0.16629494329700928,
"learning_rate": 3.7914691943127963e-07,
"loss": 0.2509,
"step": 1166
},
{
"epoch": 2.9846547314578005,
"grad_norm": 0.17193426032210676,
"learning_rate": 3.317535545023697e-07,
"loss": 0.2525,
"step": 1167
},
{
"epoch": 2.9872122762148337,
"grad_norm": 0.1691249872952514,
"learning_rate": 2.843601895734597e-07,
"loss": 0.2471,
"step": 1168
},
{
"epoch": 2.9897698209718673,
"grad_norm": 0.16940746272899151,
"learning_rate": 2.369668246445498e-07,
"loss": 0.2421,
"step": 1169
},
{
"epoch": 2.9923273657289,
"grad_norm": 0.16950720483754556,
"learning_rate": 1.8957345971563982e-07,
"loss": 0.252,
"step": 1170
},
{
"epoch": 2.9948849104859336,
"grad_norm": 0.16465075098818885,
"learning_rate": 1.4218009478672986e-07,
"loss": 0.246,
"step": 1171
},
{
"epoch": 2.997442455242967,
"grad_norm": 0.1658083222308387,
"learning_rate": 9.478672985781991e-08,
"loss": 0.2591,
"step": 1172
},
{
"epoch": 3.0,
"grad_norm": 0.17583311315224384,
"learning_rate": 4.7393364928909954e-08,
"loss": 0.2248,
"step": 1173
},
{
"epoch": 3.0,
"step": 1173,
"total_flos": 1.3044690334083187e+19,
"train_loss": 0.4372467596944539,
"train_runtime": 36845.5005,
"train_samples_per_second": 0.509,
"train_steps_per_second": 0.032
}
],
"logging_steps": 1,
"max_steps": 1173,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3044690334083187e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}