{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.990403071017274,
"eval_steps": 500,
"global_step": 1950,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025591810620601407,
"grad_norm": 5.72460191514523,
"learning_rate": 2.0512820512820514e-07,
"loss": 0.8785,
"step": 1
},
{
"epoch": 0.005118362124120281,
"grad_norm": 5.959578433623624,
"learning_rate": 4.102564102564103e-07,
"loss": 0.8678,
"step": 2
},
{
"epoch": 0.007677543186180422,
"grad_norm": 5.902957688136316,
"learning_rate": 6.153846153846155e-07,
"loss": 0.8968,
"step": 3
},
{
"epoch": 0.010236724248240563,
"grad_norm": 5.4723551383557805,
"learning_rate": 8.205128205128206e-07,
"loss": 0.8418,
"step": 4
},
{
"epoch": 0.012795905310300703,
"grad_norm": 5.787912289462209,
"learning_rate": 1.0256410256410257e-06,
"loss": 0.886,
"step": 5
},
{
"epoch": 0.015355086372360844,
"grad_norm": 5.557852691465803,
"learning_rate": 1.230769230769231e-06,
"loss": 0.9033,
"step": 6
},
{
"epoch": 0.017914267434420986,
"grad_norm": 5.362076082832634,
"learning_rate": 1.4358974358974359e-06,
"loss": 0.8629,
"step": 7
},
{
"epoch": 0.020473448496481125,
"grad_norm": 4.370734339586599,
"learning_rate": 1.6410256410256412e-06,
"loss": 0.8157,
"step": 8
},
{
"epoch": 0.023032629558541268,
"grad_norm": 4.493266725432695,
"learning_rate": 1.8461538461538465e-06,
"loss": 0.8392,
"step": 9
},
{
"epoch": 0.025591810620601407,
"grad_norm": 4.121132813839215,
"learning_rate": 2.0512820512820513e-06,
"loss": 0.8246,
"step": 10
},
{
"epoch": 0.02815099168266155,
"grad_norm": 2.4085810485641095,
"learning_rate": 2.2564102564102566e-06,
"loss": 0.8009,
"step": 11
},
{
"epoch": 0.030710172744721688,
"grad_norm": 2.3319618373499087,
"learning_rate": 2.461538461538462e-06,
"loss": 0.8189,
"step": 12
},
{
"epoch": 0.03326935380678183,
"grad_norm": 2.050198471153189,
"learning_rate": 2.666666666666667e-06,
"loss": 0.7671,
"step": 13
},
{
"epoch": 0.03582853486884197,
"grad_norm": 2.5524552964667926,
"learning_rate": 2.8717948717948717e-06,
"loss": 0.7587,
"step": 14
},
{
"epoch": 0.03838771593090211,
"grad_norm": 3.44805626463955,
"learning_rate": 3.0769230769230774e-06,
"loss": 0.779,
"step": 15
},
{
"epoch": 0.04094689699296225,
"grad_norm": 3.559383059115368,
"learning_rate": 3.2820512820512823e-06,
"loss": 0.758,
"step": 16
},
{
"epoch": 0.04350607805502239,
"grad_norm": 3.33809383465494,
"learning_rate": 3.487179487179487e-06,
"loss": 0.7136,
"step": 17
},
{
"epoch": 0.046065259117082535,
"grad_norm": 3.2168388473087757,
"learning_rate": 3.692307692307693e-06,
"loss": 0.7369,
"step": 18
},
{
"epoch": 0.04862444017914268,
"grad_norm": 2.5872597896051728,
"learning_rate": 3.897435897435898e-06,
"loss": 0.7323,
"step": 19
},
{
"epoch": 0.05118362124120281,
"grad_norm": 2.0282374599316957,
"learning_rate": 4.102564102564103e-06,
"loss": 0.6876,
"step": 20
},
{
"epoch": 0.053742802303262956,
"grad_norm": 1.5964078137928233,
"learning_rate": 4.307692307692308e-06,
"loss": 0.6678,
"step": 21
},
{
"epoch": 0.0563019833653231,
"grad_norm": 1.2909627910012984,
"learning_rate": 4.512820512820513e-06,
"loss": 0.662,
"step": 22
},
{
"epoch": 0.05886116442738324,
"grad_norm": 1.3177057124827334,
"learning_rate": 4.717948717948718e-06,
"loss": 0.6594,
"step": 23
},
{
"epoch": 0.061420345489443376,
"grad_norm": 1.24942825931957,
"learning_rate": 4.923076923076924e-06,
"loss": 0.626,
"step": 24
},
{
"epoch": 0.06397952655150352,
"grad_norm": 1.2673077068864744,
"learning_rate": 5.128205128205128e-06,
"loss": 0.6347,
"step": 25
},
{
"epoch": 0.06653870761356366,
"grad_norm": 1.1563656900829429,
"learning_rate": 5.333333333333334e-06,
"loss": 0.6329,
"step": 26
},
{
"epoch": 0.0690978886756238,
"grad_norm": 1.044094666100426,
"learning_rate": 5.538461538461539e-06,
"loss": 0.6254,
"step": 27
},
{
"epoch": 0.07165706973768395,
"grad_norm": 0.9466038321235274,
"learning_rate": 5.743589743589743e-06,
"loss": 0.6295,
"step": 28
},
{
"epoch": 0.07421625079974409,
"grad_norm": 0.7981163236308523,
"learning_rate": 5.948717948717949e-06,
"loss": 0.6155,
"step": 29
},
{
"epoch": 0.07677543186180422,
"grad_norm": 0.9968280326546483,
"learning_rate": 6.153846153846155e-06,
"loss": 0.6113,
"step": 30
},
{
"epoch": 0.07933461292386436,
"grad_norm": 0.9260190035124614,
"learning_rate": 6.358974358974359e-06,
"loss": 0.636,
"step": 31
},
{
"epoch": 0.0818937939859245,
"grad_norm": 0.8477667711908703,
"learning_rate": 6.564102564102565e-06,
"loss": 0.63,
"step": 32
},
{
"epoch": 0.08445297504798464,
"grad_norm": 0.6532816121925329,
"learning_rate": 6.76923076923077e-06,
"loss": 0.6108,
"step": 33
},
{
"epoch": 0.08701215611004479,
"grad_norm": 0.7821169327010173,
"learning_rate": 6.974358974358974e-06,
"loss": 0.6408,
"step": 34
},
{
"epoch": 0.08957133717210493,
"grad_norm": 0.7393409032705888,
"learning_rate": 7.17948717948718e-06,
"loss": 0.5915,
"step": 35
},
{
"epoch": 0.09213051823416507,
"grad_norm": 0.6644188521415291,
"learning_rate": 7.384615384615386e-06,
"loss": 0.5961,
"step": 36
},
{
"epoch": 0.09468969929622521,
"grad_norm": 0.593116184422468,
"learning_rate": 7.58974358974359e-06,
"loss": 0.5971,
"step": 37
},
{
"epoch": 0.09724888035828536,
"grad_norm": 0.6919315440965734,
"learning_rate": 7.794871794871796e-06,
"loss": 0.5725,
"step": 38
},
{
"epoch": 0.09980806142034548,
"grad_norm": 0.6959155805001593,
"learning_rate": 8.000000000000001e-06,
"loss": 0.5685,
"step": 39
},
{
"epoch": 0.10236724248240563,
"grad_norm": 0.6398643666989815,
"learning_rate": 8.205128205128205e-06,
"loss": 0.6138,
"step": 40
},
{
"epoch": 0.10492642354446577,
"grad_norm": 0.6361698523409376,
"learning_rate": 8.410256410256411e-06,
"loss": 0.5849,
"step": 41
},
{
"epoch": 0.10748560460652591,
"grad_norm": 0.5303957188864263,
"learning_rate": 8.615384615384617e-06,
"loss": 0.5533,
"step": 42
},
{
"epoch": 0.11004478566858605,
"grad_norm": 0.5370326395081426,
"learning_rate": 8.820512820512821e-06,
"loss": 0.5833,
"step": 43
},
{
"epoch": 0.1126039667306462,
"grad_norm": 0.590660924049006,
"learning_rate": 9.025641025641027e-06,
"loss": 0.5888,
"step": 44
},
{
"epoch": 0.11516314779270634,
"grad_norm": 0.540686200850521,
"learning_rate": 9.230769230769232e-06,
"loss": 0.5624,
"step": 45
},
{
"epoch": 0.11772232885476648,
"grad_norm": 0.5777571666796689,
"learning_rate": 9.435897435897436e-06,
"loss": 0.5851,
"step": 46
},
{
"epoch": 0.12028150991682661,
"grad_norm": 0.537318145066867,
"learning_rate": 9.641025641025642e-06,
"loss": 0.5644,
"step": 47
},
{
"epoch": 0.12284069097888675,
"grad_norm": 0.5211947061981134,
"learning_rate": 9.846153846153848e-06,
"loss": 0.5591,
"step": 48
},
{
"epoch": 0.1253998720409469,
"grad_norm": 0.5397404148687415,
"learning_rate": 1.0051282051282052e-05,
"loss": 0.5843,
"step": 49
},
{
"epoch": 0.12795905310300704,
"grad_norm": 0.5297926804183296,
"learning_rate": 1.0256410256410256e-05,
"loss": 0.5277,
"step": 50
},
{
"epoch": 0.13051823416506717,
"grad_norm": 0.605652388150248,
"learning_rate": 1.0461538461538463e-05,
"loss": 0.5724,
"step": 51
},
{
"epoch": 0.13307741522712732,
"grad_norm": 0.45885002080359344,
"learning_rate": 1.0666666666666667e-05,
"loss": 0.5381,
"step": 52
},
{
"epoch": 0.13563659628918745,
"grad_norm": 0.5930020802013372,
"learning_rate": 1.0871794871794871e-05,
"loss": 0.6045,
"step": 53
},
{
"epoch": 0.1381957773512476,
"grad_norm": 0.5808129528368039,
"learning_rate": 1.1076923076923079e-05,
"loss": 0.5222,
"step": 54
},
{
"epoch": 0.14075495841330773,
"grad_norm": 0.5154128262574531,
"learning_rate": 1.1282051282051283e-05,
"loss": 0.5578,
"step": 55
},
{
"epoch": 0.1433141394753679,
"grad_norm": 0.5390655268219918,
"learning_rate": 1.1487179487179487e-05,
"loss": 0.5436,
"step": 56
},
{
"epoch": 0.14587332053742802,
"grad_norm": 0.5586414645653933,
"learning_rate": 1.1692307692307694e-05,
"loss": 0.5353,
"step": 57
},
{
"epoch": 0.14843250159948818,
"grad_norm": 0.6534105047151474,
"learning_rate": 1.1897435897435898e-05,
"loss": 0.5392,
"step": 58
},
{
"epoch": 0.1509916826615483,
"grad_norm": 0.5945111461514314,
"learning_rate": 1.2102564102564102e-05,
"loss": 0.5542,
"step": 59
},
{
"epoch": 0.15355086372360843,
"grad_norm": 0.6505380466948517,
"learning_rate": 1.230769230769231e-05,
"loss": 0.5252,
"step": 60
},
{
"epoch": 0.1561100447856686,
"grad_norm": 0.6510336772233184,
"learning_rate": 1.2512820512820514e-05,
"loss": 0.5683,
"step": 61
},
{
"epoch": 0.15866922584772872,
"grad_norm": 0.5509745528884461,
"learning_rate": 1.2717948717948718e-05,
"loss": 0.5205,
"step": 62
},
{
"epoch": 0.16122840690978887,
"grad_norm": 0.5917102526960739,
"learning_rate": 1.2923076923076925e-05,
"loss": 0.5603,
"step": 63
},
{
"epoch": 0.163787587971849,
"grad_norm": 0.560110790537656,
"learning_rate": 1.312820512820513e-05,
"loss": 0.545,
"step": 64
},
{
"epoch": 0.16634676903390916,
"grad_norm": 0.5678363411921677,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.5287,
"step": 65
},
{
"epoch": 0.1689059500959693,
"grad_norm": 0.5819806020456917,
"learning_rate": 1.353846153846154e-05,
"loss": 0.5535,
"step": 66
},
{
"epoch": 0.17146513115802944,
"grad_norm": 0.5558896303005907,
"learning_rate": 1.3743589743589745e-05,
"loss": 0.5648,
"step": 67
},
{
"epoch": 0.17402431222008957,
"grad_norm": 0.5818297224628268,
"learning_rate": 1.3948717948717949e-05,
"loss": 0.545,
"step": 68
},
{
"epoch": 0.1765834932821497,
"grad_norm": 0.5604385516101225,
"learning_rate": 1.4153846153846156e-05,
"loss": 0.5625,
"step": 69
},
{
"epoch": 0.17914267434420986,
"grad_norm": 0.5262255043539227,
"learning_rate": 1.435897435897436e-05,
"loss": 0.553,
"step": 70
},
{
"epoch": 0.18170185540626999,
"grad_norm": 0.5449392960823104,
"learning_rate": 1.4564102564102564e-05,
"loss": 0.532,
"step": 71
},
{
"epoch": 0.18426103646833014,
"grad_norm": 0.5757069226256301,
"learning_rate": 1.4769230769230772e-05,
"loss": 0.538,
"step": 72
},
{
"epoch": 0.18682021753039027,
"grad_norm": 0.5241911795419738,
"learning_rate": 1.4974358974358976e-05,
"loss": 0.537,
"step": 73
},
{
"epoch": 0.18937939859245043,
"grad_norm": 0.4755835874750557,
"learning_rate": 1.517948717948718e-05,
"loss": 0.5015,
"step": 74
},
{
"epoch": 0.19193857965451055,
"grad_norm": 0.5816260566225624,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.5317,
"step": 75
},
{
"epoch": 0.1944977607165707,
"grad_norm": 0.5324895484169254,
"learning_rate": 1.558974358974359e-05,
"loss": 0.503,
"step": 76
},
{
"epoch": 0.19705694177863084,
"grad_norm": 0.5873638181887759,
"learning_rate": 1.5794871794871795e-05,
"loss": 0.5571,
"step": 77
},
{
"epoch": 0.19961612284069097,
"grad_norm": 0.5466165214856151,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.5177,
"step": 78
},
{
"epoch": 0.20217530390275112,
"grad_norm": 0.6209989234883905,
"learning_rate": 1.6205128205128207e-05,
"loss": 0.5268,
"step": 79
},
{
"epoch": 0.20473448496481125,
"grad_norm": 0.5479973413858492,
"learning_rate": 1.641025641025641e-05,
"loss": 0.5096,
"step": 80
},
{
"epoch": 0.2072936660268714,
"grad_norm": 0.7512485115452843,
"learning_rate": 1.6615384615384618e-05,
"loss": 0.5398,
"step": 81
},
{
"epoch": 0.20985284708893154,
"grad_norm": 0.5988795561569944,
"learning_rate": 1.6820512820512822e-05,
"loss": 0.5054,
"step": 82
},
{
"epoch": 0.2124120281509917,
"grad_norm": 0.6349565283068188,
"learning_rate": 1.7025641025641026e-05,
"loss": 0.5301,
"step": 83
},
{
"epoch": 0.21497120921305182,
"grad_norm": 0.7482046004578073,
"learning_rate": 1.7230769230769234e-05,
"loss": 0.5451,
"step": 84
},
{
"epoch": 0.21753039027511195,
"grad_norm": 0.5452089019203088,
"learning_rate": 1.7435897435897438e-05,
"loss": 0.537,
"step": 85
},
{
"epoch": 0.2200895713371721,
"grad_norm": 0.6062443857702727,
"learning_rate": 1.7641025641025642e-05,
"loss": 0.5057,
"step": 86
},
{
"epoch": 0.22264875239923224,
"grad_norm": 0.6133096727031904,
"learning_rate": 1.784615384615385e-05,
"loss": 0.549,
"step": 87
},
{
"epoch": 0.2252079334612924,
"grad_norm": 0.6734829746990577,
"learning_rate": 1.8051282051282053e-05,
"loss": 0.5549,
"step": 88
},
{
"epoch": 0.22776711452335252,
"grad_norm": 0.5692319701895174,
"learning_rate": 1.8256410256410257e-05,
"loss": 0.5053,
"step": 89
},
{
"epoch": 0.23032629558541268,
"grad_norm": 0.6551305606242741,
"learning_rate": 1.8461538461538465e-05,
"loss": 0.5451,
"step": 90
},
{
"epoch": 0.2328854766474728,
"grad_norm": 0.5869184912875696,
"learning_rate": 1.866666666666667e-05,
"loss": 0.5573,
"step": 91
},
{
"epoch": 0.23544465770953296,
"grad_norm": 0.6175475389576918,
"learning_rate": 1.8871794871794873e-05,
"loss": 0.5239,
"step": 92
},
{
"epoch": 0.2380038387715931,
"grad_norm": 0.6923526622405791,
"learning_rate": 1.907692307692308e-05,
"loss": 0.5178,
"step": 93
},
{
"epoch": 0.24056301983365322,
"grad_norm": 0.6169855947769446,
"learning_rate": 1.9282051282051284e-05,
"loss": 0.5583,
"step": 94
},
{
"epoch": 0.24312220089571338,
"grad_norm": 0.8905504059133514,
"learning_rate": 1.9487179487179488e-05,
"loss": 0.5577,
"step": 95
},
{
"epoch": 0.2456813819577735,
"grad_norm": 0.7206999368653747,
"learning_rate": 1.9692307692307696e-05,
"loss": 0.5248,
"step": 96
},
{
"epoch": 0.24824056301983366,
"grad_norm": 0.5689479892809097,
"learning_rate": 1.98974358974359e-05,
"loss": 0.5143,
"step": 97
},
{
"epoch": 0.2507997440818938,
"grad_norm": 0.7120200186242683,
"learning_rate": 2.0102564102564104e-05,
"loss": 0.5363,
"step": 98
},
{
"epoch": 0.2533589251439539,
"grad_norm": 0.5649172258139606,
"learning_rate": 2.0307692307692308e-05,
"loss": 0.5232,
"step": 99
},
{
"epoch": 0.2559181062060141,
"grad_norm": 0.8530948421377756,
"learning_rate": 2.0512820512820512e-05,
"loss": 0.5127,
"step": 100
},
{
"epoch": 0.25847728726807423,
"grad_norm": 0.7220421419640349,
"learning_rate": 2.0717948717948723e-05,
"loss": 0.5472,
"step": 101
},
{
"epoch": 0.26103646833013433,
"grad_norm": 0.7179821332235292,
"learning_rate": 2.0923076923076927e-05,
"loss": 0.4748,
"step": 102
},
{
"epoch": 0.2635956493921945,
"grad_norm": 0.5969399443763903,
"learning_rate": 2.112820512820513e-05,
"loss": 0.4869,
"step": 103
},
{
"epoch": 0.26615483045425464,
"grad_norm": 0.7652225150209184,
"learning_rate": 2.1333333333333335e-05,
"loss": 0.5048,
"step": 104
},
{
"epoch": 0.2687140115163148,
"grad_norm": 0.7026938678210959,
"learning_rate": 2.153846153846154e-05,
"loss": 0.5351,
"step": 105
},
{
"epoch": 0.2712731925783749,
"grad_norm": 0.6625081293533241,
"learning_rate": 2.1743589743589743e-05,
"loss": 0.5071,
"step": 106
},
{
"epoch": 0.27383237364043506,
"grad_norm": 0.7338103048357757,
"learning_rate": 2.1948717948717954e-05,
"loss": 0.5214,
"step": 107
},
{
"epoch": 0.2763915547024952,
"grad_norm": 0.7157427738707126,
"learning_rate": 2.2153846153846158e-05,
"loss": 0.521,
"step": 108
},
{
"epoch": 0.27895073576455537,
"grad_norm": 0.8391389612996835,
"learning_rate": 2.235897435897436e-05,
"loss": 0.5114,
"step": 109
},
{
"epoch": 0.28150991682661547,
"grad_norm": 0.6739896722237592,
"learning_rate": 2.2564102564102566e-05,
"loss": 0.4607,
"step": 110
},
{
"epoch": 0.2840690978886756,
"grad_norm": 0.7325968758566463,
"learning_rate": 2.276923076923077e-05,
"loss": 0.5114,
"step": 111
},
{
"epoch": 0.2866282789507358,
"grad_norm": 0.9192765458484284,
"learning_rate": 2.2974358974358974e-05,
"loss": 0.5164,
"step": 112
},
{
"epoch": 0.2891874600127959,
"grad_norm": 0.7638911037686114,
"learning_rate": 2.3179487179487184e-05,
"loss": 0.5056,
"step": 113
},
{
"epoch": 0.29174664107485604,
"grad_norm": 0.7754803757011083,
"learning_rate": 2.338461538461539e-05,
"loss": 0.5137,
"step": 114
},
{
"epoch": 0.2943058221369162,
"grad_norm": 0.7313220696083259,
"learning_rate": 2.3589743589743593e-05,
"loss": 0.5171,
"step": 115
},
{
"epoch": 0.29686500319897635,
"grad_norm": 0.8944240636016003,
"learning_rate": 2.3794871794871797e-05,
"loss": 0.5631,
"step": 116
},
{
"epoch": 0.29942418426103645,
"grad_norm": 0.827344957741263,
"learning_rate": 2.4e-05,
"loss": 0.5329,
"step": 117
},
{
"epoch": 0.3019833653230966,
"grad_norm": 0.7763009278418379,
"learning_rate": 2.4205128205128205e-05,
"loss": 0.5341,
"step": 118
},
{
"epoch": 0.30454254638515676,
"grad_norm": 0.7998736408167985,
"learning_rate": 2.4410256410256415e-05,
"loss": 0.52,
"step": 119
},
{
"epoch": 0.30710172744721687,
"grad_norm": 0.7411952795822903,
"learning_rate": 2.461538461538462e-05,
"loss": 0.5418,
"step": 120
},
{
"epoch": 0.309660908509277,
"grad_norm": 0.659770656309478,
"learning_rate": 2.4820512820512824e-05,
"loss": 0.5195,
"step": 121
},
{
"epoch": 0.3122200895713372,
"grad_norm": 0.8056693118680838,
"learning_rate": 2.5025641025641028e-05,
"loss": 0.5215,
"step": 122
},
{
"epoch": 0.31477927063339733,
"grad_norm": 0.9631898506281213,
"learning_rate": 2.523076923076923e-05,
"loss": 0.5283,
"step": 123
},
{
"epoch": 0.31733845169545744,
"grad_norm": 0.7096814914325649,
"learning_rate": 2.5435897435897436e-05,
"loss": 0.5155,
"step": 124
},
{
"epoch": 0.3198976327575176,
"grad_norm": 1.028582043530853,
"learning_rate": 2.5641025641025646e-05,
"loss": 0.5394,
"step": 125
},
{
"epoch": 0.32245681381957775,
"grad_norm": 0.7239590324246933,
"learning_rate": 2.584615384615385e-05,
"loss": 0.5446,
"step": 126
},
{
"epoch": 0.32501599488163785,
"grad_norm": 1.0571455117998556,
"learning_rate": 2.6051282051282054e-05,
"loss": 0.5335,
"step": 127
},
{
"epoch": 0.327575175943698,
"grad_norm": 1.0256105590142106,
"learning_rate": 2.625641025641026e-05,
"loss": 0.5418,
"step": 128
},
{
"epoch": 0.33013435700575816,
"grad_norm": 1.0959117099820284,
"learning_rate": 2.6461538461538463e-05,
"loss": 0.5544,
"step": 129
},
{
"epoch": 0.3326935380678183,
"grad_norm": 0.7463871798931493,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.4965,
"step": 130
},
{
"epoch": 0.3352527191298784,
"grad_norm": 1.0194762534931083,
"learning_rate": 2.687179487179487e-05,
"loss": 0.4776,
"step": 131
},
{
"epoch": 0.3378119001919386,
"grad_norm": 0.7119748311745303,
"learning_rate": 2.707692307692308e-05,
"loss": 0.5203,
"step": 132
},
{
"epoch": 0.34037108125399873,
"grad_norm": 0.920481871489979,
"learning_rate": 2.7282051282051285e-05,
"loss": 0.5142,
"step": 133
},
{
"epoch": 0.3429302623160589,
"grad_norm": 0.7065977141822832,
"learning_rate": 2.748717948717949e-05,
"loss": 0.5349,
"step": 134
},
{
"epoch": 0.345489443378119,
"grad_norm": 0.9570189898635619,
"learning_rate": 2.7692307692307694e-05,
"loss": 0.556,
"step": 135
},
{
"epoch": 0.34804862444017914,
"grad_norm": 0.8774520896998024,
"learning_rate": 2.7897435897435898e-05,
"loss": 0.5221,
"step": 136
},
{
"epoch": 0.3506078055022393,
"grad_norm": 0.9754340258356095,
"learning_rate": 2.81025641025641e-05,
"loss": 0.5165,
"step": 137
},
{
"epoch": 0.3531669865642994,
"grad_norm": 0.9512013949257682,
"learning_rate": 2.8307692307692312e-05,
"loss": 0.5094,
"step": 138
},
{
"epoch": 0.35572616762635956,
"grad_norm": 1.0101279645456138,
"learning_rate": 2.8512820512820516e-05,
"loss": 0.5134,
"step": 139
},
{
"epoch": 0.3582853486884197,
"grad_norm": 0.9392456947082269,
"learning_rate": 2.871794871794872e-05,
"loss": 0.5154,
"step": 140
},
{
"epoch": 0.36084452975047987,
"grad_norm": 1.064205953704163,
"learning_rate": 2.8923076923076925e-05,
"loss": 0.5422,
"step": 141
},
{
"epoch": 0.36340371081253997,
"grad_norm": 0.8361885058587943,
"learning_rate": 2.912820512820513e-05,
"loss": 0.5045,
"step": 142
},
{
"epoch": 0.3659628918746001,
"grad_norm": 1.0678541131176078,
"learning_rate": 2.9333333333333333e-05,
"loss": 0.4843,
"step": 143
},
{
"epoch": 0.3685220729366603,
"grad_norm": 0.6800509515379447,
"learning_rate": 2.9538461538461543e-05,
"loss": 0.5256,
"step": 144
},
{
"epoch": 0.3710812539987204,
"grad_norm": 0.7904486157434544,
"learning_rate": 2.9743589743589747e-05,
"loss": 0.5225,
"step": 145
},
{
"epoch": 0.37364043506078054,
"grad_norm": 0.816468325578623,
"learning_rate": 2.994871794871795e-05,
"loss": 0.5389,
"step": 146
},
{
"epoch": 0.3761996161228407,
"grad_norm": 0.7918935996515184,
"learning_rate": 3.0153846153846155e-05,
"loss": 0.5014,
"step": 147
},
{
"epoch": 0.37875879718490085,
"grad_norm": 0.7555828789735101,
"learning_rate": 3.035897435897436e-05,
"loss": 0.5236,
"step": 148
},
{
"epoch": 0.38131797824696095,
"grad_norm": 1.0603633188732544,
"learning_rate": 3.0564102564102564e-05,
"loss": 0.5271,
"step": 149
},
{
"epoch": 0.3838771593090211,
"grad_norm": 0.8796724653002846,
"learning_rate": 3.0769230769230774e-05,
"loss": 0.5214,
"step": 150
},
{
"epoch": 0.38643634037108127,
"grad_norm": 0.6823417266648101,
"learning_rate": 3.097435897435898e-05,
"loss": 0.492,
"step": 151
},
{
"epoch": 0.3889955214331414,
"grad_norm": 0.9675008798265416,
"learning_rate": 3.117948717948718e-05,
"loss": 0.5282,
"step": 152
},
{
"epoch": 0.3915547024952015,
"grad_norm": 1.0136226084949147,
"learning_rate": 3.1384615384615386e-05,
"loss": 0.5044,
"step": 153
},
{
"epoch": 0.3941138835572617,
"grad_norm": 0.9351502869426284,
"learning_rate": 3.158974358974359e-05,
"loss": 0.5006,
"step": 154
},
{
"epoch": 0.39667306461932184,
"grad_norm": 0.9882679082998469,
"learning_rate": 3.1794871794871795e-05,
"loss": 0.5013,
"step": 155
},
{
"epoch": 0.39923224568138194,
"grad_norm": 0.9382011251547424,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.4777,
"step": 156
},
{
"epoch": 0.4017914267434421,
"grad_norm": 0.9879747473370469,
"learning_rate": 3.220512820512821e-05,
"loss": 0.4718,
"step": 157
},
{
"epoch": 0.40435060780550225,
"grad_norm": 0.9016818417869712,
"learning_rate": 3.2410256410256413e-05,
"loss": 0.5294,
"step": 158
},
{
"epoch": 0.4069097888675624,
"grad_norm": 0.9938632562876675,
"learning_rate": 3.261538461538462e-05,
"loss": 0.4991,
"step": 159
},
{
"epoch": 0.4094689699296225,
"grad_norm": 1.3410049525059016,
"learning_rate": 3.282051282051282e-05,
"loss": 0.5095,
"step": 160
},
{
"epoch": 0.41202815099168266,
"grad_norm": 0.8871685101904818,
"learning_rate": 3.3025641025641025e-05,
"loss": 0.5483,
"step": 161
},
{
"epoch": 0.4145873320537428,
"grad_norm": 1.2268108807413454,
"learning_rate": 3.3230769230769236e-05,
"loss": 0.5174,
"step": 162
},
{
"epoch": 0.4171465131158029,
"grad_norm": 0.9220772123108049,
"learning_rate": 3.343589743589744e-05,
"loss": 0.5181,
"step": 163
},
{
"epoch": 0.4197056941778631,
"grad_norm": 1.0658979698672157,
"learning_rate": 3.3641025641025644e-05,
"loss": 0.539,
"step": 164
},
{
"epoch": 0.42226487523992323,
"grad_norm": 0.97733215203732,
"learning_rate": 3.384615384615385e-05,
"loss": 0.5191,
"step": 165
},
{
"epoch": 0.4248240563019834,
"grad_norm": 1.1290947988408833,
"learning_rate": 3.405128205128205e-05,
"loss": 0.5169,
"step": 166
},
{
"epoch": 0.4273832373640435,
"grad_norm": 0.8361689719032068,
"learning_rate": 3.4256410256410256e-05,
"loss": 0.5114,
"step": 167
},
{
"epoch": 0.42994241842610365,
"grad_norm": 0.9928527289292229,
"learning_rate": 3.446153846153847e-05,
"loss": 0.5123,
"step": 168
},
{
"epoch": 0.4325015994881638,
"grad_norm": 0.7704183020179163,
"learning_rate": 3.466666666666667e-05,
"loss": 0.4828,
"step": 169
},
{
"epoch": 0.4350607805502239,
"grad_norm": 0.9283903900385092,
"learning_rate": 3.4871794871794875e-05,
"loss": 0.5226,
"step": 170
},
{
"epoch": 0.43761996161228406,
"grad_norm": 0.925519484340918,
"learning_rate": 3.507692307692308e-05,
"loss": 0.4956,
"step": 171
},
{
"epoch": 0.4401791426743442,
"grad_norm": 0.9993231714335514,
"learning_rate": 3.5282051282051283e-05,
"loss": 0.5355,
"step": 172
},
{
"epoch": 0.44273832373640437,
"grad_norm": 1.311826250820302,
"learning_rate": 3.548717948717949e-05,
"loss": 0.5237,
"step": 173
},
{
"epoch": 0.44529750479846447,
"grad_norm": 0.9273485381049265,
"learning_rate": 3.56923076923077e-05,
"loss": 0.5131,
"step": 174
},
{
"epoch": 0.44785668586052463,
"grad_norm": 1.248607874192031,
"learning_rate": 3.58974358974359e-05,
"loss": 0.4972,
"step": 175
},
{
"epoch": 0.4504158669225848,
"grad_norm": 0.8222900953155937,
"learning_rate": 3.6102564102564106e-05,
"loss": 0.5364,
"step": 176
},
{
"epoch": 0.45297504798464494,
"grad_norm": 1.123693838028357,
"learning_rate": 3.630769230769231e-05,
"loss": 0.5219,
"step": 177
},
{
"epoch": 0.45553422904670504,
"grad_norm": 0.7807365104513995,
"learning_rate": 3.6512820512820514e-05,
"loss": 0.5182,
"step": 178
},
{
"epoch": 0.4580934101087652,
"grad_norm": 0.9231872939650723,
"learning_rate": 3.671794871794872e-05,
"loss": 0.5235,
"step": 179
},
{
"epoch": 0.46065259117082535,
"grad_norm": 0.8053927537642723,
"learning_rate": 3.692307692307693e-05,
"loss": 0.5202,
"step": 180
},
{
"epoch": 0.46321177223288545,
"grad_norm": 1.1551405361338565,
"learning_rate": 3.712820512820513e-05,
"loss": 0.5173,
"step": 181
},
{
"epoch": 0.4657709532949456,
"grad_norm": 0.9439986501141405,
"learning_rate": 3.733333333333334e-05,
"loss": 0.5232,
"step": 182
},
{
"epoch": 0.46833013435700577,
"grad_norm": 1.106193836601767,
"learning_rate": 3.753846153846154e-05,
"loss": 0.5533,
"step": 183
},
{
"epoch": 0.4708893154190659,
"grad_norm": 0.794287368963475,
"learning_rate": 3.7743589743589745e-05,
"loss": 0.4762,
"step": 184
},
{
"epoch": 0.473448496481126,
"grad_norm": 0.8611752707863242,
"learning_rate": 3.794871794871795e-05,
"loss": 0.5274,
"step": 185
},
{
"epoch": 0.4760076775431862,
"grad_norm": 0.8817686680939495,
"learning_rate": 3.815384615384616e-05,
"loss": 0.5188,
"step": 186
},
{
"epoch": 0.47856685860524634,
"grad_norm": 0.8845212101910219,
"learning_rate": 3.8358974358974364e-05,
"loss": 0.5218,
"step": 187
},
{
"epoch": 0.48112603966730644,
"grad_norm": 0.663678181823649,
"learning_rate": 3.856410256410257e-05,
"loss": 0.5227,
"step": 188
},
{
"epoch": 0.4836852207293666,
"grad_norm": 0.755313719038617,
"learning_rate": 3.876923076923077e-05,
"loss": 0.5426,
"step": 189
},
{
"epoch": 0.48624440179142675,
"grad_norm": 0.68323576325022,
"learning_rate": 3.8974358974358976e-05,
"loss": 0.5254,
"step": 190
},
{
"epoch": 0.4888035828534869,
"grad_norm": 0.7623898782087327,
"learning_rate": 3.917948717948718e-05,
"loss": 0.5011,
"step": 191
},
{
"epoch": 0.491362763915547,
"grad_norm": 0.6832707042582635,
"learning_rate": 3.938461538461539e-05,
"loss": 0.5318,
"step": 192
},
{
"epoch": 0.49392194497760716,
"grad_norm": 0.8383249840589115,
"learning_rate": 3.9589743589743595e-05,
"loss": 0.5244,
"step": 193
},
{
"epoch": 0.4964811260396673,
"grad_norm": 0.6885029349814699,
"learning_rate": 3.97948717948718e-05,
"loss": 0.5068,
"step": 194
},
{
"epoch": 0.4990403071017274,
"grad_norm": 0.9336320586686789,
"learning_rate": 4e-05,
"loss": 0.5013,
"step": 195
},
{
"epoch": 0.5015994881637876,
"grad_norm": 1.1273619177371816,
"learning_rate": 3.999996795609852e-05,
"loss": 0.4924,
"step": 196
},
{
"epoch": 0.5041586692258477,
"grad_norm": 0.8484780646183507,
"learning_rate": 3.9999871824496765e-05,
"loss": 0.4774,
"step": 197
},
{
"epoch": 0.5067178502879078,
"grad_norm": 0.7566071063905553,
"learning_rate": 3.999971160550277e-05,
"loss": 0.5328,
"step": 198
},
{
"epoch": 0.509277031349968,
"grad_norm": 0.963017285802582,
"learning_rate": 3.999948729962994e-05,
"loss": 0.535,
"step": 199
},
{
"epoch": 0.5118362124120281,
"grad_norm": 0.6417865549660366,
"learning_rate": 3.9999198907597046e-05,
"loss": 0.502,
"step": 200
},
{
"epoch": 0.5143953934740882,
"grad_norm": 0.8877386892780115,
"learning_rate": 3.999884643032821e-05,
"loss": 0.513,
"step": 201
},
{
"epoch": 0.5169545745361485,
"grad_norm": 0.7952767722764698,
"learning_rate": 3.999842986895289e-05,
"loss": 0.5121,
"step": 202
},
{
"epoch": 0.5195137555982086,
"grad_norm": 0.7352919644974529,
"learning_rate": 3.999794922480593e-05,
"loss": 0.4881,
"step": 203
},
{
"epoch": 0.5220729366602687,
"grad_norm": 0.7101035899255743,
"learning_rate": 3.9997404499427494e-05,
"loss": 0.4941,
"step": 204
},
{
"epoch": 0.5246321177223289,
"grad_norm": 0.7457517555879764,
"learning_rate": 3.9996795694563096e-05,
"loss": 0.5128,
"step": 205
},
{
"epoch": 0.527191298784389,
"grad_norm": 0.6573802709079243,
"learning_rate": 3.999612281216358e-05,
"loss": 0.4949,
"step": 206
},
{
"epoch": 0.5297504798464492,
"grad_norm": 0.7445707355838409,
"learning_rate": 3.9995385854385124e-05,
"loss": 0.5036,
"step": 207
},
{
"epoch": 0.5323096609085093,
"grad_norm": 0.7713581724495707,
"learning_rate": 3.999458482358924e-05,
"loss": 0.4985,
"step": 208
},
{
"epoch": 0.5348688419705694,
"grad_norm": 0.7752015257631751,
"learning_rate": 3.9993719722342726e-05,
"loss": 0.508,
"step": 209
},
{
"epoch": 0.5374280230326296,
"grad_norm": 1.133508847111212,
"learning_rate": 3.999279055341771e-05,
"loss": 0.5261,
"step": 210
},
{
"epoch": 0.5399872040946897,
"grad_norm": 0.7726822279135389,
"learning_rate": 3.999179731979162e-05,
"loss": 0.4672,
"step": 211
},
{
"epoch": 0.5425463851567498,
"grad_norm": 0.7766807053995074,
"learning_rate": 3.9990740024647154e-05,
"loss": 0.4984,
"step": 212
},
{
"epoch": 0.54510556621881,
"grad_norm": 0.5303932042183875,
"learning_rate": 3.9989618671372304e-05,
"loss": 0.5129,
"step": 213
},
{
"epoch": 0.5476647472808701,
"grad_norm": 0.6026485573392468,
"learning_rate": 3.998843326356032e-05,
"loss": 0.5127,
"step": 214
},
{
"epoch": 0.5502239283429302,
"grad_norm": 0.6118913537653141,
"learning_rate": 3.998718380500971e-05,
"loss": 0.458,
"step": 215
},
{
"epoch": 0.5527831094049904,
"grad_norm": 0.6445812096951002,
"learning_rate": 3.998587029972423e-05,
"loss": 0.495,
"step": 216
},
{
"epoch": 0.5553422904670505,
"grad_norm": 0.7741176770417552,
"learning_rate": 3.998449275191286e-05,
"loss": 0.5095,
"step": 217
},
{
"epoch": 0.5579014715291107,
"grad_norm": 0.6972261092630921,
"learning_rate": 3.9983051165989814e-05,
"loss": 0.4871,
"step": 218
},
{
"epoch": 0.5604606525911708,
"grad_norm": 0.6664826347703592,
"learning_rate": 3.998154554657448e-05,
"loss": 0.5137,
"step": 219
},
{
"epoch": 0.5630198336532309,
"grad_norm": 0.6334253599100873,
"learning_rate": 3.997997589849145e-05,
"loss": 0.5494,
"step": 220
},
{
"epoch": 0.5655790147152912,
"grad_norm": 0.7145250281069672,
"learning_rate": 3.99783422267705e-05,
"loss": 0.5219,
"step": 221
},
{
"epoch": 0.5681381957773513,
"grad_norm": 0.8804491997042745,
"learning_rate": 3.997664453664654e-05,
"loss": 0.5305,
"step": 222
},
{
"epoch": 0.5706973768394114,
"grad_norm": 0.5941374342473514,
"learning_rate": 3.9974882833559634e-05,
"loss": 0.492,
"step": 223
},
{
"epoch": 0.5732565579014716,
"grad_norm": 0.8541812515528515,
"learning_rate": 3.997305712315497e-05,
"loss": 0.4994,
"step": 224
},
{
"epoch": 0.5758157389635317,
"grad_norm": 0.8858077653575287,
"learning_rate": 3.9971167411282835e-05,
"loss": 0.5268,
"step": 225
},
{
"epoch": 0.5783749200255918,
"grad_norm": 1.0807623477992674,
"learning_rate": 3.9969213703998606e-05,
"loss": 0.5047,
"step": 226
},
{
"epoch": 0.580934101087652,
"grad_norm": 0.8213345788017161,
"learning_rate": 3.9967196007562725e-05,
"loss": 0.5302,
"step": 227
},
{
"epoch": 0.5834932821497121,
"grad_norm": 0.6055767495165976,
"learning_rate": 3.996511432844067e-05,
"loss": 0.4833,
"step": 228
},
{
"epoch": 0.5860524632117722,
"grad_norm": 0.9265228707425465,
"learning_rate": 3.996296867330296e-05,
"loss": 0.5146,
"step": 229
},
{
"epoch": 0.5886116442738324,
"grad_norm": 1.228465580948079,
"learning_rate": 3.99607590490251e-05,
"loss": 0.474,
"step": 230
},
{
"epoch": 0.5911708253358925,
"grad_norm": 0.8130275473999699,
"learning_rate": 3.9958485462687606e-05,
"loss": 0.4767,
"step": 231
},
{
"epoch": 0.5937300063979527,
"grad_norm": 0.780191780172052,
"learning_rate": 3.995614792157592e-05,
"loss": 0.5037,
"step": 232
},
{
"epoch": 0.5962891874600128,
"grad_norm": 0.9566872727407667,
"learning_rate": 3.995374643318045e-05,
"loss": 0.5152,
"step": 233
},
{
"epoch": 0.5988483685220729,
"grad_norm": 1.1646368305147163,
"learning_rate": 3.9951281005196486e-05,
"loss": 0.5329,
"step": 234
},
{
"epoch": 0.6014075495841331,
"grad_norm": 0.7216132566248876,
"learning_rate": 3.9948751645524235e-05,
"loss": 0.5285,
"step": 235
},
{
"epoch": 0.6039667306461932,
"grad_norm": 1.0575511476108006,
"learning_rate": 3.994615836226874e-05,
"loss": 0.5364,
"step": 236
},
{
"epoch": 0.6065259117082533,
"grad_norm": 0.9622069565170775,
"learning_rate": 3.994350116373991e-05,
"loss": 0.5067,
"step": 237
},
{
"epoch": 0.6090850927703135,
"grad_norm": 0.7168012454794431,
"learning_rate": 3.9940780058452416e-05,
"loss": 0.5012,
"step": 238
},
{
"epoch": 0.6116442738323736,
"grad_norm": 0.8302983155644609,
"learning_rate": 3.9937995055125774e-05,
"loss": 0.5282,
"step": 239
},
{
"epoch": 0.6142034548944337,
"grad_norm": 0.7497453674792344,
"learning_rate": 3.9935146162684206e-05,
"loss": 0.4992,
"step": 240
},
{
"epoch": 0.6167626359564939,
"grad_norm": 0.8217939024893259,
"learning_rate": 3.993223339025667e-05,
"loss": 0.4895,
"step": 241
},
{
"epoch": 0.619321817018554,
"grad_norm": 0.7219771415337329,
"learning_rate": 3.992925674717683e-05,
"loss": 0.4636,
"step": 242
},
{
"epoch": 0.6218809980806143,
"grad_norm": 0.9341076990511636,
"learning_rate": 3.9926216242983017e-05,
"loss": 0.5008,
"step": 243
},
{
"epoch": 0.6244401791426744,
"grad_norm": 0.7304570272364406,
"learning_rate": 3.9923111887418185e-05,
"loss": 0.4921,
"step": 244
},
{
"epoch": 0.6269993602047345,
"grad_norm": 1.0471475376229655,
"learning_rate": 3.9919943690429906e-05,
"loss": 0.4768,
"step": 245
},
{
"epoch": 0.6295585412667947,
"grad_norm": 0.693079198064159,
"learning_rate": 3.991671166217031e-05,
"loss": 0.4786,
"step": 246
},
{
"epoch": 0.6321177223288548,
"grad_norm": 1.1561745746369312,
"learning_rate": 3.991341581299609e-05,
"loss": 0.5182,
"step": 247
},
{
"epoch": 0.6346769033909149,
"grad_norm": 0.8741051048237501,
"learning_rate": 3.991005615346843e-05,
"loss": 0.5024,
"step": 248
},
{
"epoch": 0.6372360844529751,
"grad_norm": 0.9086658125226366,
"learning_rate": 3.990663269435298e-05,
"loss": 0.4974,
"step": 249
},
{
"epoch": 0.6397952655150352,
"grad_norm": 0.9761201164077572,
"learning_rate": 3.9903145446619837e-05,
"loss": 0.5112,
"step": 250
},
{
"epoch": 0.6423544465770953,
"grad_norm": 0.8136560445992532,
"learning_rate": 3.989959442144352e-05,
"loss": 0.4971,
"step": 251
},
{
"epoch": 0.6449136276391555,
"grad_norm": 0.6200675715165651,
"learning_rate": 3.989597963020289e-05,
"loss": 0.506,
"step": 252
},
{
"epoch": 0.6474728087012156,
"grad_norm": 0.9727978732394552,
"learning_rate": 3.989230108448115e-05,
"loss": 0.5132,
"step": 253
},
{
"epoch": 0.6500319897632757,
"grad_norm": 0.7588308378509555,
"learning_rate": 3.9888558796065784e-05,
"loss": 0.4948,
"step": 254
},
{
"epoch": 0.6525911708253359,
"grad_norm": 0.8984908598875354,
"learning_rate": 3.9884752776948564e-05,
"loss": 0.4912,
"step": 255
},
{
"epoch": 0.655150351887396,
"grad_norm": 0.9180268932937014,
"learning_rate": 3.988088303932545e-05,
"loss": 0.5058,
"step": 256
},
{
"epoch": 0.6577095329494562,
"grad_norm": 0.9874072428401991,
"learning_rate": 3.987694959559658e-05,
"loss": 0.5265,
"step": 257
},
{
"epoch": 0.6602687140115163,
"grad_norm": 0.7774534286927767,
"learning_rate": 3.9872952458366267e-05,
"loss": 0.5116,
"step": 258
},
{
"epoch": 0.6628278950735764,
"grad_norm": 0.8189986183875947,
"learning_rate": 3.9868891640442874e-05,
"loss": 0.507,
"step": 259
},
{
"epoch": 0.6653870761356366,
"grad_norm": 0.6489509247329256,
"learning_rate": 3.9864767154838864e-05,
"loss": 0.5111,
"step": 260
},
{
"epoch": 0.6679462571976967,
"grad_norm": 0.8761101947097708,
"learning_rate": 3.986057901477069e-05,
"loss": 0.5127,
"step": 261
},
{
"epoch": 0.6705054382597568,
"grad_norm": 0.9687654727768278,
"learning_rate": 3.985632723365878e-05,
"loss": 0.547,
"step": 262
},
{
"epoch": 0.673064619321817,
"grad_norm": 0.7867771900738217,
"learning_rate": 3.985201182512752e-05,
"loss": 0.516,
"step": 263
},
{
"epoch": 0.6756238003838771,
"grad_norm": 0.6453678386295493,
"learning_rate": 3.984763280300514e-05,
"loss": 0.481,
"step": 264
},
{
"epoch": 0.6781829814459372,
"grad_norm": 0.7765079788191963,
"learning_rate": 3.9843190181323744e-05,
"loss": 0.4913,
"step": 265
},
{
"epoch": 0.6807421625079975,
"grad_norm": 0.6373936761246601,
"learning_rate": 3.983868397431923e-05,
"loss": 0.5133,
"step": 266
},
{
"epoch": 0.6833013435700576,
"grad_norm": 0.730921099373597,
"learning_rate": 3.983411419643125e-05,
"loss": 0.5339,
"step": 267
},
{
"epoch": 0.6858605246321178,
"grad_norm": 0.8152333844378884,
"learning_rate": 3.982948086230312e-05,
"loss": 0.4865,
"step": 268
},
{
"epoch": 0.6884197056941779,
"grad_norm": 0.7100466271823358,
"learning_rate": 3.9824783986781897e-05,
"loss": 0.49,
"step": 269
},
{
"epoch": 0.690978886756238,
"grad_norm": 0.8934016406293627,
"learning_rate": 3.982002358491817e-05,
"loss": 0.5208,
"step": 270
},
{
"epoch": 0.6935380678182982,
"grad_norm": 0.6756901306503084,
"learning_rate": 3.981519967196614e-05,
"loss": 0.5191,
"step": 271
},
{
"epoch": 0.6960972488803583,
"grad_norm": 0.8009942346681957,
"learning_rate": 3.98103122633835e-05,
"loss": 0.5067,
"step": 272
},
{
"epoch": 0.6986564299424184,
"grad_norm": 0.6923671981740297,
"learning_rate": 3.980536137483141e-05,
"loss": 0.4868,
"step": 273
},
{
"epoch": 0.7012156110044786,
"grad_norm": 0.9566550041102404,
"learning_rate": 3.980034702217445e-05,
"loss": 0.5398,
"step": 274
},
{
"epoch": 0.7037747920665387,
"grad_norm": 0.8440959667430001,
"learning_rate": 3.979526922148058e-05,
"loss": 0.4658,
"step": 275
},
{
"epoch": 0.7063339731285988,
"grad_norm": 0.8588196985974998,
"learning_rate": 3.9790127989021024e-05,
"loss": 0.5052,
"step": 276
},
{
"epoch": 0.708893154190659,
"grad_norm": 0.7562049758522937,
"learning_rate": 3.978492334127032e-05,
"loss": 0.5267,
"step": 277
},
{
"epoch": 0.7114523352527191,
"grad_norm": 0.632945819901848,
"learning_rate": 3.977965529490618e-05,
"loss": 0.4618,
"step": 278
},
{
"epoch": 0.7140115163147792,
"grad_norm": 0.845801145881355,
"learning_rate": 3.9774323866809485e-05,
"loss": 0.4683,
"step": 279
},
{
"epoch": 0.7165706973768394,
"grad_norm": 0.7371708961015324,
"learning_rate": 3.9768929074064206e-05,
"loss": 0.5364,
"step": 280
},
{
"epoch": 0.7191298784388995,
"grad_norm": 0.7371186918676583,
"learning_rate": 3.976347093395736e-05,
"loss": 0.5061,
"step": 281
},
{
"epoch": 0.7216890595009597,
"grad_norm": 0.7032178872579914,
"learning_rate": 3.9757949463978975e-05,
"loss": 0.5242,
"step": 282
},
{
"epoch": 0.7242482405630198,
"grad_norm": 0.7617726502548777,
"learning_rate": 3.9752364681821973e-05,
"loss": 0.4888,
"step": 283
},
{
"epoch": 0.7268074216250799,
"grad_norm": 0.7903937882632156,
"learning_rate": 3.9746716605382186e-05,
"loss": 0.5203,
"step": 284
},
{
"epoch": 0.7293666026871402,
"grad_norm": 0.7645749463474476,
"learning_rate": 3.9741005252758255e-05,
"loss": 0.5116,
"step": 285
},
{
"epoch": 0.7319257837492003,
"grad_norm": 0.6220992771519867,
"learning_rate": 3.973523064225159e-05,
"loss": 0.4671,
"step": 286
},
{
"epoch": 0.7344849648112604,
"grad_norm": 0.5115715364977104,
"learning_rate": 3.972939279236627e-05,
"loss": 0.4565,
"step": 287
},
{
"epoch": 0.7370441458733206,
"grad_norm": 0.7021026496208411,
"learning_rate": 3.9723491721809076e-05,
"loss": 0.5379,
"step": 288
},
{
"epoch": 0.7396033269353807,
"grad_norm": 0.5201603136030488,
"learning_rate": 3.971752744948932e-05,
"loss": 0.4692,
"step": 289
},
{
"epoch": 0.7421625079974408,
"grad_norm": 0.8208102312039668,
"learning_rate": 3.971149999451886e-05,
"loss": 0.4882,
"step": 290
},
{
"epoch": 0.744721689059501,
"grad_norm": 1.119540735969476,
"learning_rate": 3.970540937621201e-05,
"loss": 0.5097,
"step": 291
},
{
"epoch": 0.7472808701215611,
"grad_norm": 0.7635743214902218,
"learning_rate": 3.9699255614085495e-05,
"loss": 0.5101,
"step": 292
},
{
"epoch": 0.7498400511836213,
"grad_norm": 0.8126931797929314,
"learning_rate": 3.969303872785837e-05,
"loss": 0.4889,
"step": 293
},
{
"epoch": 0.7523992322456814,
"grad_norm": 0.7624486728902423,
"learning_rate": 3.9686758737451955e-05,
"loss": 0.4966,
"step": 294
},
{
"epoch": 0.7549584133077415,
"grad_norm": 0.5229197667386186,
"learning_rate": 3.9680415662989806e-05,
"loss": 0.4886,
"step": 295
},
{
"epoch": 0.7575175943698017,
"grad_norm": 0.6766454901060076,
"learning_rate": 3.967400952479759e-05,
"loss": 0.4661,
"step": 296
},
{
"epoch": 0.7600767754318618,
"grad_norm": 0.6030935943397303,
"learning_rate": 3.966754034340308e-05,
"loss": 0.5526,
"step": 297
},
{
"epoch": 0.7626359564939219,
"grad_norm": 0.688481808856117,
"learning_rate": 3.966100813953607e-05,
"loss": 0.5191,
"step": 298
},
{
"epoch": 0.7651951375559821,
"grad_norm": 0.8026340827764397,
"learning_rate": 3.965441293412827e-05,
"loss": 0.4875,
"step": 299
},
{
"epoch": 0.7677543186180422,
"grad_norm": 0.5877161986618623,
"learning_rate": 3.9647754748313294e-05,
"loss": 0.5581,
"step": 300
},
{
"epoch": 0.7703134996801023,
"grad_norm": 0.7230843693397704,
"learning_rate": 3.964103360342658e-05,
"loss": 0.4941,
"step": 301
},
{
"epoch": 0.7728726807421625,
"grad_norm": 0.6543565273084903,
"learning_rate": 3.963424952100529e-05,
"loss": 0.4749,
"step": 302
},
{
"epoch": 0.7754318618042226,
"grad_norm": 0.6083305140129942,
"learning_rate": 3.962740252278827e-05,
"loss": 0.506,
"step": 303
},
{
"epoch": 0.7779910428662828,
"grad_norm": 0.6712573537812702,
"learning_rate": 3.962049263071598e-05,
"loss": 0.4752,
"step": 304
},
{
"epoch": 0.780550223928343,
"grad_norm": 0.8280090720406486,
"learning_rate": 3.96135198669304e-05,
"loss": 0.4891,
"step": 305
},
{
"epoch": 0.783109404990403,
"grad_norm": 0.6598880067981137,
"learning_rate": 3.960648425377499e-05,
"loss": 0.4947,
"step": 306
},
{
"epoch": 0.7856685860524633,
"grad_norm": 0.7265475529158774,
"learning_rate": 3.95993858137946e-05,
"loss": 0.4989,
"step": 307
},
{
"epoch": 0.7882277671145234,
"grad_norm": 0.5888767351942641,
"learning_rate": 3.959222456973541e-05,
"loss": 0.4829,
"step": 308
},
{
"epoch": 0.7907869481765835,
"grad_norm": 0.6375937306972569,
"learning_rate": 3.958500054454482e-05,
"loss": 0.4574,
"step": 309
},
{
"epoch": 0.7933461292386437,
"grad_norm": 0.6933840977854485,
"learning_rate": 3.957771376137144e-05,
"loss": 0.5059,
"step": 310
},
{
"epoch": 0.7959053103007038,
"grad_norm": 0.6896810504909161,
"learning_rate": 3.9570364243564966e-05,
"loss": 0.4992,
"step": 311
},
{
"epoch": 0.7984644913627639,
"grad_norm": 0.6468420638732157,
"learning_rate": 3.9562952014676116e-05,
"loss": 0.496,
"step": 312
},
{
"epoch": 0.8010236724248241,
"grad_norm": 0.6274952773967104,
"learning_rate": 3.955547709845656e-05,
"loss": 0.4874,
"step": 313
},
{
"epoch": 0.8035828534868842,
"grad_norm": 0.5653574700934775,
"learning_rate": 3.9547939518858856e-05,
"loss": 0.5067,
"step": 314
},
{
"epoch": 0.8061420345489443,
"grad_norm": 0.6550905511901227,
"learning_rate": 3.954033930003634e-05,
"loss": 0.5212,
"step": 315
},
{
"epoch": 0.8087012156110045,
"grad_norm": 0.7494837380638635,
"learning_rate": 3.953267646634309e-05,
"loss": 0.505,
"step": 316
},
{
"epoch": 0.8112603966730646,
"grad_norm": 0.635770951888326,
"learning_rate": 3.95249510423338e-05,
"loss": 0.4967,
"step": 317
},
{
"epoch": 0.8138195777351248,
"grad_norm": 0.7410641008583149,
"learning_rate": 3.9517163052763756e-05,
"loss": 0.4773,
"step": 318
},
{
"epoch": 0.8163787587971849,
"grad_norm": 0.7149351574867076,
"learning_rate": 3.9509312522588704e-05,
"loss": 0.4709,
"step": 319
},
{
"epoch": 0.818937939859245,
"grad_norm": 0.823519098221886,
"learning_rate": 3.9501399476964806e-05,
"loss": 0.4867,
"step": 320
},
{
"epoch": 0.8214971209213052,
"grad_norm": 0.7163722784021596,
"learning_rate": 3.9493423941248564e-05,
"loss": 0.507,
"step": 321
},
{
"epoch": 0.8240563019833653,
"grad_norm": 0.5792885526249222,
"learning_rate": 3.948538594099668e-05,
"loss": 0.4863,
"step": 322
},
{
"epoch": 0.8266154830454254,
"grad_norm": 0.8674821812665616,
"learning_rate": 3.9477285501966064e-05,
"loss": 0.4497,
"step": 323
},
{
"epoch": 0.8291746641074856,
"grad_norm": 0.6695505549743577,
"learning_rate": 3.946912265011368e-05,
"loss": 0.4853,
"step": 324
},
{
"epoch": 0.8317338451695457,
"grad_norm": 0.6407618610078097,
"learning_rate": 3.946089741159648e-05,
"loss": 0.4742,
"step": 325
},
{
"epoch": 0.8342930262316058,
"grad_norm": 0.8505071134562596,
"learning_rate": 3.9452609812771346e-05,
"loss": 0.5346,
"step": 326
},
{
"epoch": 0.836852207293666,
"grad_norm": 0.5413673046728109,
"learning_rate": 3.944425988019498e-05,
"loss": 0.4677,
"step": 327
},
{
"epoch": 0.8394113883557262,
"grad_norm": 0.9828950428531091,
"learning_rate": 3.9435847640623806e-05,
"loss": 0.4808,
"step": 328
},
{
"epoch": 0.8419705694177864,
"grad_norm": 0.6480252890065408,
"learning_rate": 3.942737312101394e-05,
"loss": 0.5019,
"step": 329
},
{
"epoch": 0.8445297504798465,
"grad_norm": 0.9281412616435286,
"learning_rate": 3.9418836348521045e-05,
"loss": 0.5069,
"step": 330
},
{
"epoch": 0.8470889315419066,
"grad_norm": 0.6511602292237915,
"learning_rate": 3.941023735050027e-05,
"loss": 0.5135,
"step": 331
},
{
"epoch": 0.8496481126039668,
"grad_norm": 0.649990025588154,
"learning_rate": 3.9401576154506155e-05,
"loss": 0.4721,
"step": 332
},
{
"epoch": 0.8522072936660269,
"grad_norm": 0.8525830113834602,
"learning_rate": 3.9392852788292556e-05,
"loss": 0.4747,
"step": 333
},
{
"epoch": 0.854766474728087,
"grad_norm": 0.8122595340814978,
"learning_rate": 3.938406727981254e-05,
"loss": 0.5036,
"step": 334
},
{
"epoch": 0.8573256557901472,
"grad_norm": 0.6813807690997764,
"learning_rate": 3.937521965721831e-05,
"loss": 0.4778,
"step": 335
},
{
"epoch": 0.8598848368522073,
"grad_norm": 0.742372369654133,
"learning_rate": 3.936630994886109e-05,
"loss": 0.4912,
"step": 336
},
{
"epoch": 0.8624440179142674,
"grad_norm": 0.6932498968117697,
"learning_rate": 3.9357338183291066e-05,
"loss": 0.5033,
"step": 337
},
{
"epoch": 0.8650031989763276,
"grad_norm": 0.7002201428035697,
"learning_rate": 3.934830438925728e-05,
"loss": 0.4843,
"step": 338
},
{
"epoch": 0.8675623800383877,
"grad_norm": 0.6143454063707157,
"learning_rate": 3.933920859570753e-05,
"loss": 0.4959,
"step": 339
},
{
"epoch": 0.8701215611004478,
"grad_norm": 0.5609771595796579,
"learning_rate": 3.933005083178828e-05,
"loss": 0.4778,
"step": 340
},
{
"epoch": 0.872680742162508,
"grad_norm": 0.5872526379907818,
"learning_rate": 3.932083112684459e-05,
"loss": 0.4736,
"step": 341
},
{
"epoch": 0.8752399232245681,
"grad_norm": 0.5259898431650297,
"learning_rate": 3.931154951041998e-05,
"loss": 0.5061,
"step": 342
},
{
"epoch": 0.8777991042866283,
"grad_norm": 0.6462230152484912,
"learning_rate": 3.930220601225638e-05,
"loss": 0.503,
"step": 343
},
{
"epoch": 0.8803582853486884,
"grad_norm": 0.6210066174968442,
"learning_rate": 3.9292800662294e-05,
"loss": 0.4592,
"step": 344
},
{
"epoch": 0.8829174664107485,
"grad_norm": 0.7033811879806838,
"learning_rate": 3.928333349067125e-05,
"loss": 0.4839,
"step": 345
},
{
"epoch": 0.8854766474728087,
"grad_norm": 0.7112080589811673,
"learning_rate": 3.927380452772464e-05,
"loss": 0.4833,
"step": 346
},
{
"epoch": 0.8880358285348688,
"grad_norm": 0.5763115473730898,
"learning_rate": 3.926421380398869e-05,
"loss": 0.5128,
"step": 347
},
{
"epoch": 0.8905950095969289,
"grad_norm": 0.493333488186968,
"learning_rate": 3.925456135019582e-05,
"loss": 0.4777,
"step": 348
},
{
"epoch": 0.8931541906589892,
"grad_norm": 0.5969713695047262,
"learning_rate": 3.924484719727625e-05,
"loss": 0.5548,
"step": 349
},
{
"epoch": 0.8957133717210493,
"grad_norm": 0.5743073508713653,
"learning_rate": 3.923507137635792e-05,
"loss": 0.4993,
"step": 350
},
{
"epoch": 0.8982725527831094,
"grad_norm": 0.5816084208432016,
"learning_rate": 3.922523391876638e-05,
"loss": 0.4974,
"step": 351
},
{
"epoch": 0.9008317338451696,
"grad_norm": 0.6913456504659746,
"learning_rate": 3.921533485602467e-05,
"loss": 0.5038,
"step": 352
},
{
"epoch": 0.9033909149072297,
"grad_norm": 0.5710576834895075,
"learning_rate": 3.920537421985327e-05,
"loss": 0.469,
"step": 353
},
{
"epoch": 0.9059500959692899,
"grad_norm": 0.831878724155053,
"learning_rate": 3.9195352042169924e-05,
"loss": 0.5178,
"step": 354
},
{
"epoch": 0.90850927703135,
"grad_norm": 0.5623208840830399,
"learning_rate": 3.9185268355089606e-05,
"loss": 0.4892,
"step": 355
},
{
"epoch": 0.9110684580934101,
"grad_norm": 0.8167633776332197,
"learning_rate": 3.9175123190924384e-05,
"loss": 0.5193,
"step": 356
},
{
"epoch": 0.9136276391554703,
"grad_norm": 0.5899572202270219,
"learning_rate": 3.916491658218333e-05,
"loss": 0.4739,
"step": 357
},
{
"epoch": 0.9161868202175304,
"grad_norm": 0.7206391505617225,
"learning_rate": 3.9154648561572386e-05,
"loss": 0.4752,
"step": 358
},
{
"epoch": 0.9187460012795905,
"grad_norm": 0.7210244727385913,
"learning_rate": 3.91443191619943e-05,
"loss": 0.4789,
"step": 359
},
{
"epoch": 0.9213051823416507,
"grad_norm": 0.7320868272015927,
"learning_rate": 3.913392841654851e-05,
"loss": 0.503,
"step": 360
},
{
"epoch": 0.9238643634037108,
"grad_norm": 0.8003299130587506,
"learning_rate": 3.9123476358531e-05,
"loss": 0.4917,
"step": 361
},
{
"epoch": 0.9264235444657709,
"grad_norm": 0.7467581757493919,
"learning_rate": 3.911296302143426e-05,
"loss": 0.4973,
"step": 362
},
{
"epoch": 0.9289827255278311,
"grad_norm": 0.695841862059534,
"learning_rate": 3.9102388438947104e-05,
"loss": 0.5014,
"step": 363
},
{
"epoch": 0.9315419065898912,
"grad_norm": 0.8452597656759123,
"learning_rate": 3.909175264495464e-05,
"loss": 0.4528,
"step": 364
},
{
"epoch": 0.9341010876519513,
"grad_norm": 0.8328804446765739,
"learning_rate": 3.9081055673538093e-05,
"loss": 0.5014,
"step": 365
},
{
"epoch": 0.9366602687140115,
"grad_norm": 0.8976120702595518,
"learning_rate": 3.907029755897473e-05,
"loss": 0.4767,
"step": 366
},
{
"epoch": 0.9392194497760716,
"grad_norm": 0.6729987700598375,
"learning_rate": 3.905947833573775e-05,
"loss": 0.4758,
"step": 367
},
{
"epoch": 0.9417786308381318,
"grad_norm": 0.965419537056689,
"learning_rate": 3.904859803849617e-05,
"loss": 0.4952,
"step": 368
},
{
"epoch": 0.944337811900192,
"grad_norm": 0.8355270729808283,
"learning_rate": 3.903765670211469e-05,
"loss": 0.5048,
"step": 369
},
{
"epoch": 0.946896992962252,
"grad_norm": 0.7603742900226873,
"learning_rate": 3.902665436165364e-05,
"loss": 0.491,
"step": 370
},
{
"epoch": 0.9494561740243123,
"grad_norm": 0.7054418825885687,
"learning_rate": 3.901559105236881e-05,
"loss": 0.4654,
"step": 371
},
{
"epoch": 0.9520153550863724,
"grad_norm": 0.6840867683040738,
"learning_rate": 3.9004466809711343e-05,
"loss": 0.4789,
"step": 372
},
{
"epoch": 0.9545745361484325,
"grad_norm": 0.7107919372914869,
"learning_rate": 3.8993281669327664e-05,
"loss": 0.5041,
"step": 373
},
{
"epoch": 0.9571337172104927,
"grad_norm": 0.5705893437651786,
"learning_rate": 3.8982035667059327e-05,
"loss": 0.4724,
"step": 374
},
{
"epoch": 0.9596928982725528,
"grad_norm": 0.6398479886368891,
"learning_rate": 3.897072883894291e-05,
"loss": 0.512,
"step": 375
},
{
"epoch": 0.9622520793346129,
"grad_norm": 0.6496978842533705,
"learning_rate": 3.895936122120991e-05,
"loss": 0.4998,
"step": 376
},
{
"epoch": 0.9648112603966731,
"grad_norm": 0.6636946077689827,
"learning_rate": 3.8947932850286585e-05,
"loss": 0.5105,
"step": 377
},
{
"epoch": 0.9673704414587332,
"grad_norm": 0.8146680744120419,
"learning_rate": 3.893644376279392e-05,
"loss": 0.5081,
"step": 378
},
{
"epoch": 0.9699296225207934,
"grad_norm": 0.6043005275708316,
"learning_rate": 3.8924893995547427e-05,
"loss": 0.465,
"step": 379
},
{
"epoch": 0.9724888035828535,
"grad_norm": 0.7737447625323293,
"learning_rate": 3.8913283585557054e-05,
"loss": 0.4745,
"step": 380
},
{
"epoch": 0.9750479846449136,
"grad_norm": 0.6584366000203595,
"learning_rate": 3.89016125700271e-05,
"loss": 0.4646,
"step": 381
},
{
"epoch": 0.9776071657069738,
"grad_norm": 0.7267670466528602,
"learning_rate": 3.888988098635604e-05,
"loss": 0.5443,
"step": 382
},
{
"epoch": 0.9801663467690339,
"grad_norm": 0.8281518075048059,
"learning_rate": 3.8878088872136446e-05,
"loss": 0.5175,
"step": 383
},
{
"epoch": 0.982725527831094,
"grad_norm": 0.7524480266101858,
"learning_rate": 3.8866236265154864e-05,
"loss": 0.4752,
"step": 384
},
{
"epoch": 0.9852847088931542,
"grad_norm": 0.8364410578252669,
"learning_rate": 3.885432320339167e-05,
"loss": 0.4752,
"step": 385
},
{
"epoch": 0.9878438899552143,
"grad_norm": 0.684899934501505,
"learning_rate": 3.884234972502095e-05,
"loss": 0.4931,
"step": 386
},
{
"epoch": 0.9904030710172744,
"grad_norm": 0.6158115167694288,
"learning_rate": 3.88303158684104e-05,
"loss": 0.4861,
"step": 387
},
{
"epoch": 0.9929622520793346,
"grad_norm": 0.7078911086493701,
"learning_rate": 3.8818221672121204e-05,
"loss": 0.4921,
"step": 388
},
{
"epoch": 0.9955214331413947,
"grad_norm": 0.5372294629344029,
"learning_rate": 3.8806067174907876e-05,
"loss": 0.513,
"step": 389
},
{
"epoch": 0.9980806142034548,
"grad_norm": 0.7744974417312304,
"learning_rate": 3.879385241571817e-05,
"loss": 0.4669,
"step": 390
},
{
"epoch": 1.000639795265515,
"grad_norm": 0.649847374858299,
"learning_rate": 3.878157743369294e-05,
"loss": 0.6153,
"step": 391
},
{
"epoch": 1.0031989763275753,
"grad_norm": 0.7557922196471385,
"learning_rate": 3.876924226816602e-05,
"loss": 0.3771,
"step": 392
},
{
"epoch": 1.0057581573896353,
"grad_norm": 0.720395789720446,
"learning_rate": 3.875684695866409e-05,
"loss": 0.4498,
"step": 393
},
{
"epoch": 1.0083173384516955,
"grad_norm": 0.6684707254214445,
"learning_rate": 3.874439154490656e-05,
"loss": 0.4581,
"step": 394
},
{
"epoch": 1.0108765195137557,
"grad_norm": 0.6845617164901984,
"learning_rate": 3.873187606680543e-05,
"loss": 0.4382,
"step": 395
},
{
"epoch": 1.0134357005758157,
"grad_norm": 0.6687231254947439,
"learning_rate": 3.871930056446518e-05,
"loss": 0.3945,
"step": 396
},
{
"epoch": 1.0159948816378759,
"grad_norm": 0.8306778742589912,
"learning_rate": 3.870666507818262e-05,
"loss": 0.4194,
"step": 397
},
{
"epoch": 1.018554062699936,
"grad_norm": 0.7593090405257436,
"learning_rate": 3.869396964844679e-05,
"loss": 0.4378,
"step": 398
},
{
"epoch": 1.021113243761996,
"grad_norm": 0.545915139395705,
"learning_rate": 3.8681214315938786e-05,
"loss": 0.4009,
"step": 399
},
{
"epoch": 1.0236724248240563,
"grad_norm": 0.8728350624043225,
"learning_rate": 3.866839912153168e-05,
"loss": 0.4239,
"step": 400
},
{
"epoch": 1.0262316058861165,
"grad_norm": 0.7685743303640589,
"learning_rate": 3.8655524106290345e-05,
"loss": 0.4433,
"step": 401
},
{
"epoch": 1.0287907869481765,
"grad_norm": 0.6436857445045538,
"learning_rate": 3.864258931147136e-05,
"loss": 0.4135,
"step": 402
},
{
"epoch": 1.0313499680102367,
"grad_norm": 0.6043825799230796,
"learning_rate": 3.862959477852285e-05,
"loss": 0.4511,
"step": 403
},
{
"epoch": 1.033909149072297,
"grad_norm": 0.864882027042227,
"learning_rate": 3.8616540549084366e-05,
"loss": 0.4281,
"step": 404
},
{
"epoch": 1.036468330134357,
"grad_norm": 0.5669229687455682,
"learning_rate": 3.860342666498677e-05,
"loss": 0.4265,
"step": 405
},
{
"epoch": 1.0390275111964171,
"grad_norm": 1.092925757327788,
"learning_rate": 3.859025316825204e-05,
"loss": 0.4171,
"step": 406
},
{
"epoch": 1.0415866922584773,
"grad_norm": 0.6235272525172618,
"learning_rate": 3.8577020101093214e-05,
"loss": 0.3889,
"step": 407
},
{
"epoch": 1.0441458733205373,
"grad_norm": 0.7044477218033379,
"learning_rate": 3.856372750591419e-05,
"loss": 0.4268,
"step": 408
},
{
"epoch": 1.0467050543825975,
"grad_norm": 0.6758623607211073,
"learning_rate": 3.8550375425309643e-05,
"loss": 0.376,
"step": 409
},
{
"epoch": 1.0492642354446577,
"grad_norm": 0.703616580836483,
"learning_rate": 3.853696390206484e-05,
"loss": 0.4782,
"step": 410
},
{
"epoch": 1.051823416506718,
"grad_norm": 0.7120884633418646,
"learning_rate": 3.8523492979155534e-05,
"loss": 0.4156,
"step": 411
},
{
"epoch": 1.054382597568778,
"grad_norm": 0.7074414036356858,
"learning_rate": 3.850996269974782e-05,
"loss": 0.4044,
"step": 412
},
{
"epoch": 1.0569417786308382,
"grad_norm": 0.6805988413911291,
"learning_rate": 3.849637310719799e-05,
"loss": 0.4659,
"step": 413
},
{
"epoch": 1.0595009596928984,
"grad_norm": 0.751339193242575,
"learning_rate": 3.84827242450524e-05,
"loss": 0.4362,
"step": 414
},
{
"epoch": 1.0620601407549584,
"grad_norm": 0.8444583705618753,
"learning_rate": 3.846901615704734e-05,
"loss": 0.4671,
"step": 415
},
{
"epoch": 1.0646193218170186,
"grad_norm": 0.5494970933209252,
"learning_rate": 3.845524888710885e-05,
"loss": 0.4192,
"step": 416
},
{
"epoch": 1.0671785028790788,
"grad_norm": 0.6871657555654107,
"learning_rate": 3.844142247935265e-05,
"loss": 0.4392,
"step": 417
},
{
"epoch": 1.0697376839411388,
"grad_norm": 0.6255544924834299,
"learning_rate": 3.842753697808395e-05,
"loss": 0.4098,
"step": 418
},
{
"epoch": 1.072296865003199,
"grad_norm": 0.5236343059225219,
"learning_rate": 3.84135924277973e-05,
"loss": 0.4039,
"step": 419
},
{
"epoch": 1.0748560460652592,
"grad_norm": 0.7081136566843064,
"learning_rate": 3.839958887317649e-05,
"loss": 0.42,
"step": 420
},
{
"epoch": 1.0774152271273192,
"grad_norm": 0.630614377419562,
"learning_rate": 3.838552635909436e-05,
"loss": 0.4065,
"step": 421
},
{
"epoch": 1.0799744081893794,
"grad_norm": 0.5615204386321961,
"learning_rate": 3.8371404930612704e-05,
"loss": 0.4146,
"step": 422
},
{
"epoch": 1.0825335892514396,
"grad_norm": 0.6565436867003143,
"learning_rate": 3.835722463298208e-05,
"loss": 0.4064,
"step": 423
},
{
"epoch": 1.0850927703134996,
"grad_norm": 0.5737679071890739,
"learning_rate": 3.83429855116417e-05,
"loss": 0.4552,
"step": 424
},
{
"epoch": 1.0876519513755598,
"grad_norm": 0.6434114879042129,
"learning_rate": 3.832868761221926e-05,
"loss": 0.4441,
"step": 425
},
{
"epoch": 1.09021113243762,
"grad_norm": 0.6708602456648777,
"learning_rate": 3.831433098053082e-05,
"loss": 0.4022,
"step": 426
},
{
"epoch": 1.09277031349968,
"grad_norm": 0.623494522486998,
"learning_rate": 3.829991566258061e-05,
"loss": 0.4043,
"step": 427
},
{
"epoch": 1.0953294945617402,
"grad_norm": 0.698870377709763,
"learning_rate": 3.828544170456094e-05,
"loss": 0.4559,
"step": 428
},
{
"epoch": 1.0978886756238004,
"grad_norm": 0.6409469981980804,
"learning_rate": 3.827090915285202e-05,
"loss": 0.423,
"step": 429
},
{
"epoch": 1.1004478566858604,
"grad_norm": 0.7366884864992727,
"learning_rate": 3.825631805402182e-05,
"loss": 0.4878,
"step": 430
},
{
"epoch": 1.1030070377479206,
"grad_norm": 0.6346910715057278,
"learning_rate": 3.824166845482591e-05,
"loss": 0.3875,
"step": 431
},
{
"epoch": 1.1055662188099808,
"grad_norm": 0.598625329081273,
"learning_rate": 3.8226960402207316e-05,
"loss": 0.4201,
"step": 432
},
{
"epoch": 1.108125399872041,
"grad_norm": 0.7955423553177828,
"learning_rate": 3.821219394329638e-05,
"loss": 0.468,
"step": 433
},
{
"epoch": 1.110684580934101,
"grad_norm": 0.6767598095676679,
"learning_rate": 3.81973691254106e-05,
"loss": 0.4104,
"step": 434
},
{
"epoch": 1.1132437619961613,
"grad_norm": 0.669121013587649,
"learning_rate": 3.818248599605448e-05,
"loss": 0.3625,
"step": 435
},
{
"epoch": 1.1158029430582213,
"grad_norm": 0.9836919729491361,
"learning_rate": 3.816754460291936e-05,
"loss": 0.4852,
"step": 436
},
{
"epoch": 1.1183621241202815,
"grad_norm": 0.9574882546263436,
"learning_rate": 3.8152544993883305e-05,
"loss": 0.4003,
"step": 437
},
{
"epoch": 1.1209213051823417,
"grad_norm": 0.6647251327930666,
"learning_rate": 3.813748721701091e-05,
"loss": 0.4202,
"step": 438
},
{
"epoch": 1.1234804862444019,
"grad_norm": 1.05006176773705,
"learning_rate": 3.812237132055317e-05,
"loss": 0.4341,
"step": 439
},
{
"epoch": 1.1260396673064619,
"grad_norm": 0.7815506825876288,
"learning_rate": 3.810719735294731e-05,
"loss": 0.4748,
"step": 440
},
{
"epoch": 1.128598848368522,
"grad_norm": 1.0347544390312602,
"learning_rate": 3.809196536281665e-05,
"loss": 0.4248,
"step": 441
},
{
"epoch": 1.1311580294305823,
"grad_norm": 0.8757539929940961,
"learning_rate": 3.807667539897041e-05,
"loss": 0.3786,
"step": 442
},
{
"epoch": 1.1337172104926423,
"grad_norm": 0.869699779958049,
"learning_rate": 3.8061327510403624e-05,
"loss": 0.4397,
"step": 443
},
{
"epoch": 1.1362763915547025,
"grad_norm": 1.0544887595956864,
"learning_rate": 3.80459217462969e-05,
"loss": 0.4333,
"step": 444
},
{
"epoch": 1.1388355726167627,
"grad_norm": 0.7148540444466959,
"learning_rate": 3.8030458156016326e-05,
"loss": 0.432,
"step": 445
},
{
"epoch": 1.1413947536788227,
"grad_norm": 0.8417941713288487,
"learning_rate": 3.801493678911326e-05,
"loss": 0.4414,
"step": 446
},
{
"epoch": 1.143953934740883,
"grad_norm": 0.9182856799978838,
"learning_rate": 3.799935769532425e-05,
"loss": 0.4318,
"step": 447
},
{
"epoch": 1.1465131158029431,
"grad_norm": 0.6142048538916006,
"learning_rate": 3.798372092457076e-05,
"loss": 0.3898,
"step": 448
},
{
"epoch": 1.1490722968650031,
"grad_norm": 0.8780972905895119,
"learning_rate": 3.796802652695911e-05,
"loss": 0.5123,
"step": 449
},
{
"epoch": 1.1516314779270633,
"grad_norm": 0.7702566431359069,
"learning_rate": 3.795227455278029e-05,
"loss": 0.3752,
"step": 450
},
{
"epoch": 1.1541906589891235,
"grad_norm": 0.6619464755572778,
"learning_rate": 3.7936465052509744e-05,
"loss": 0.4028,
"step": 451
},
{
"epoch": 1.1567498400511835,
"grad_norm": 0.8748332649924442,
"learning_rate": 3.79205980768073e-05,
"loss": 0.4178,
"step": 452
},
{
"epoch": 1.1593090211132437,
"grad_norm": 0.5921826549497154,
"learning_rate": 3.790467367651694e-05,
"loss": 0.4034,
"step": 453
},
{
"epoch": 1.161868202175304,
"grad_norm": 0.8231905572303784,
"learning_rate": 3.788869190266664e-05,
"loss": 0.4934,
"step": 454
},
{
"epoch": 1.164427383237364,
"grad_norm": 0.7187628790605729,
"learning_rate": 3.787265280646825e-05,
"loss": 0.4113,
"step": 455
},
{
"epoch": 1.1669865642994242,
"grad_norm": 0.6550020914082988,
"learning_rate": 3.785655643931728e-05,
"loss": 0.4038,
"step": 456
},
{
"epoch": 1.1695457453614844,
"grad_norm": 0.7857598212931832,
"learning_rate": 3.784040285279279e-05,
"loss": 0.4083,
"step": 457
},
{
"epoch": 1.1721049264235446,
"grad_norm": 0.6478292689928322,
"learning_rate": 3.782419209865716e-05,
"loss": 0.387,
"step": 458
},
{
"epoch": 1.1746641074856046,
"grad_norm": 0.6645094823023456,
"learning_rate": 3.780792422885597e-05,
"loss": 0.3904,
"step": 459
},
{
"epoch": 1.1772232885476648,
"grad_norm": 0.6316724059123985,
"learning_rate": 3.7791599295517825e-05,
"loss": 0.4225,
"step": 460
},
{
"epoch": 1.1797824696097248,
"grad_norm": 0.5440033363984303,
"learning_rate": 3.777521735095418e-05,
"loss": 0.4116,
"step": 461
},
{
"epoch": 1.182341650671785,
"grad_norm": 0.7448918620076476,
"learning_rate": 3.7758778447659184e-05,
"loss": 0.4272,
"step": 462
},
{
"epoch": 1.1849008317338452,
"grad_norm": 0.5835830556541663,
"learning_rate": 3.774228263830948e-05,
"loss": 0.3958,
"step": 463
},
{
"epoch": 1.1874600127959054,
"grad_norm": 0.5982038873563293,
"learning_rate": 3.772572997576409e-05,
"loss": 0.4053,
"step": 464
},
{
"epoch": 1.1900191938579654,
"grad_norm": 0.5062712599156436,
"learning_rate": 3.7709120513064196e-05,
"loss": 0.3874,
"step": 465
},
{
"epoch": 1.1925783749200256,
"grad_norm": 0.606586616558905,
"learning_rate": 3.769245430343301e-05,
"loss": 0.4528,
"step": 466
},
{
"epoch": 1.1951375559820858,
"grad_norm": 0.7220231131524425,
"learning_rate": 3.767573140027556e-05,
"loss": 0.433,
"step": 467
},
{
"epoch": 1.1976967370441458,
"grad_norm": 0.6706551361731272,
"learning_rate": 3.7658951857178544e-05,
"loss": 0.443,
"step": 468
},
{
"epoch": 1.200255918106206,
"grad_norm": 0.5717527361383908,
"learning_rate": 3.764211572791017e-05,
"loss": 0.4669,
"step": 469
},
{
"epoch": 1.2028150991682662,
"grad_norm": 0.8106231588399798,
"learning_rate": 3.762522306641998e-05,
"loss": 0.406,
"step": 470
},
{
"epoch": 1.2053742802303262,
"grad_norm": 0.5810325663311328,
"learning_rate": 3.760827392683863e-05,
"loss": 0.4304,
"step": 471
},
{
"epoch": 1.2079334612923864,
"grad_norm": 0.8071567509837944,
"learning_rate": 3.759126836347779e-05,
"loss": 0.4044,
"step": 472
},
{
"epoch": 1.2104926423544466,
"grad_norm": 0.700979544013499,
"learning_rate": 3.757420643082991e-05,
"loss": 0.4397,
"step": 473
},
{
"epoch": 1.2130518234165066,
"grad_norm": 0.5863406678085621,
"learning_rate": 3.755708818356809e-05,
"loss": 0.4099,
"step": 474
},
{
"epoch": 1.2156110044785668,
"grad_norm": 0.6016554944675842,
"learning_rate": 3.7539913676545874e-05,
"loss": 0.4107,
"step": 475
},
{
"epoch": 1.218170185540627,
"grad_norm": 0.5831332187240583,
"learning_rate": 3.7522682964797066e-05,
"loss": 0.4023,
"step": 476
},
{
"epoch": 1.220729366602687,
"grad_norm": 0.6093541377057935,
"learning_rate": 3.75053961035356e-05,
"loss": 0.4301,
"step": 477
},
{
"epoch": 1.2232885476647473,
"grad_norm": 0.6465778983982824,
"learning_rate": 3.748805314815532e-05,
"loss": 0.3933,
"step": 478
},
{
"epoch": 1.2258477287268075,
"grad_norm": 0.5808579764017139,
"learning_rate": 3.7470654154229834e-05,
"loss": 0.4386,
"step": 479
},
{
"epoch": 1.2284069097888675,
"grad_norm": 0.7856607698111114,
"learning_rate": 3.745319917751229e-05,
"loss": 0.4201,
"step": 480
},
{
"epoch": 1.2309660908509277,
"grad_norm": 0.5050539819501365,
"learning_rate": 3.743568827393525e-05,
"loss": 0.4773,
"step": 481
},
{
"epoch": 1.2335252719129879,
"grad_norm": 0.8385075664472458,
"learning_rate": 3.741812149961049e-05,
"loss": 0.4041,
"step": 482
},
{
"epoch": 1.236084452975048,
"grad_norm": 0.624705443326678,
"learning_rate": 3.740049891082879e-05,
"loss": 0.4157,
"step": 483
},
{
"epoch": 1.238643634037108,
"grad_norm": 0.5436565794572649,
"learning_rate": 3.738282056405981e-05,
"loss": 0.3959,
"step": 484
},
{
"epoch": 1.2412028150991683,
"grad_norm": 0.785730847073682,
"learning_rate": 3.736508651595188e-05,
"loss": 0.4413,
"step": 485
},
{
"epoch": 1.2437619961612283,
"grad_norm": 0.5059153650301063,
"learning_rate": 3.734729682333179e-05,
"loss": 0.4033,
"step": 486
},
{
"epoch": 1.2463211772232885,
"grad_norm": 0.609111900762071,
"learning_rate": 3.732945154320467e-05,
"loss": 0.4282,
"step": 487
},
{
"epoch": 1.2488803582853487,
"grad_norm": 0.5621990676055562,
"learning_rate": 3.731155073275375e-05,
"loss": 0.411,
"step": 488
},
{
"epoch": 1.251439539347409,
"grad_norm": 0.5476414987684265,
"learning_rate": 3.729359444934022e-05,
"loss": 0.4217,
"step": 489
},
{
"epoch": 1.253998720409469,
"grad_norm": 0.5674756992559286,
"learning_rate": 3.727558275050301e-05,
"loss": 0.461,
"step": 490
},
{
"epoch": 1.2565579014715291,
"grad_norm": 0.6949881690129194,
"learning_rate": 3.725751569395863e-05,
"loss": 0.4621,
"step": 491
},
{
"epoch": 1.2591170825335891,
"grad_norm": 0.5458610089583471,
"learning_rate": 3.723939333760099e-05,
"loss": 0.4508,
"step": 492
},
{
"epoch": 1.2616762635956493,
"grad_norm": 0.5906724584503589,
"learning_rate": 3.7221215739501176e-05,
"loss": 0.4276,
"step": 493
},
{
"epoch": 1.2642354446577095,
"grad_norm": 0.4800178261816677,
"learning_rate": 3.720298295790732e-05,
"loss": 0.3921,
"step": 494
},
{
"epoch": 1.2667946257197698,
"grad_norm": 0.6636123062743209,
"learning_rate": 3.718469505124434e-05,
"loss": 0.4584,
"step": 495
},
{
"epoch": 1.2693538067818297,
"grad_norm": 0.5232376990167991,
"learning_rate": 3.716635207811385e-05,
"loss": 0.4168,
"step": 496
},
{
"epoch": 1.27191298784389,
"grad_norm": 0.6767875708314827,
"learning_rate": 3.714795409729388e-05,
"loss": 0.4379,
"step": 497
},
{
"epoch": 1.2744721689059502,
"grad_norm": 0.5605992428107962,
"learning_rate": 3.712950116773875e-05,
"loss": 0.4074,
"step": 498
},
{
"epoch": 1.2770313499680102,
"grad_norm": 0.5932800163478181,
"learning_rate": 3.711099334857884e-05,
"loss": 0.4194,
"step": 499
},
{
"epoch": 1.2795905310300704,
"grad_norm": 0.5438179500986763,
"learning_rate": 3.709243069912041e-05,
"loss": 0.3917,
"step": 500
},
{
"epoch": 1.2821497120921306,
"grad_norm": 0.7190898552769511,
"learning_rate": 3.707381327884545e-05,
"loss": 0.4717,
"step": 501
},
{
"epoch": 1.2847088931541908,
"grad_norm": 0.6816367378411385,
"learning_rate": 3.705514114741142e-05,
"loss": 0.3782,
"step": 502
},
{
"epoch": 1.2872680742162508,
"grad_norm": 0.5116844048821922,
"learning_rate": 3.703641436465114e-05,
"loss": 0.4225,
"step": 503
},
{
"epoch": 1.289827255278311,
"grad_norm": 0.6736005325770137,
"learning_rate": 3.70176329905725e-05,
"loss": 0.4444,
"step": 504
},
{
"epoch": 1.292386436340371,
"grad_norm": 0.5864619464379411,
"learning_rate": 3.699879708535838e-05,
"loss": 0.4354,
"step": 505
},
{
"epoch": 1.2949456174024312,
"grad_norm": 0.6078408475459052,
"learning_rate": 3.6979906709366334e-05,
"loss": 0.453,
"step": 506
},
{
"epoch": 1.2975047984644914,
"grad_norm": 0.7409797972431034,
"learning_rate": 3.696096192312852e-05,
"loss": 0.4365,
"step": 507
},
{
"epoch": 1.3000639795265516,
"grad_norm": 0.4775651025928874,
"learning_rate": 3.694196278735142e-05,
"loss": 0.4391,
"step": 508
},
{
"epoch": 1.3026231605886116,
"grad_norm": 0.5823632608853917,
"learning_rate": 3.692290936291568e-05,
"loss": 0.3875,
"step": 509
},
{
"epoch": 1.3051823416506718,
"grad_norm": 0.5307154121726435,
"learning_rate": 3.69038017108759e-05,
"loss": 0.4059,
"step": 510
},
{
"epoch": 1.3077415227127318,
"grad_norm": 0.5835304311686238,
"learning_rate": 3.688463989246045e-05,
"loss": 0.4505,
"step": 511
},
{
"epoch": 1.310300703774792,
"grad_norm": 0.5459622175115579,
"learning_rate": 3.686542396907128e-05,
"loss": 0.3994,
"step": 512
},
{
"epoch": 1.3128598848368522,
"grad_norm": 0.5266102200698167,
"learning_rate": 3.6846154002283696e-05,
"loss": 0.3954,
"step": 513
},
{
"epoch": 1.3154190658989124,
"grad_norm": 0.49436862141147236,
"learning_rate": 3.68268300538462e-05,
"loss": 0.4417,
"step": 514
},
{
"epoch": 1.3179782469609724,
"grad_norm": 0.5981532644758176,
"learning_rate": 3.680745218568026e-05,
"loss": 0.4382,
"step": 515
},
{
"epoch": 1.3205374280230326,
"grad_norm": 0.4757174461989479,
"learning_rate": 3.678802045988012e-05,
"loss": 0.3686,
"step": 516
},
{
"epoch": 1.3230966090850926,
"grad_norm": 0.6321671394432529,
"learning_rate": 3.676853493871262e-05,
"loss": 0.4418,
"step": 517
},
{
"epoch": 1.3256557901471528,
"grad_norm": 0.44736313515673864,
"learning_rate": 3.674899568461696e-05,
"loss": 0.4235,
"step": 518
},
{
"epoch": 1.328214971209213,
"grad_norm": 0.5494531167802622,
"learning_rate": 3.6729402760204535e-05,
"loss": 0.4069,
"step": 519
},
{
"epoch": 1.3307741522712733,
"grad_norm": 0.5068999873204574,
"learning_rate": 3.6709756228258735e-05,
"loss": 0.4284,
"step": 520
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.506062505003235,
"learning_rate": 3.669005615173469e-05,
"loss": 0.4438,
"step": 521
},
{
"epoch": 1.3358925143953935,
"grad_norm": 0.6379310859811307,
"learning_rate": 3.667030259375915e-05,
"loss": 0.4142,
"step": 522
},
{
"epoch": 1.3384516954574537,
"grad_norm": 0.42277797644227044,
"learning_rate": 3.665049561763021e-05,
"loss": 0.3805,
"step": 523
},
{
"epoch": 1.3410108765195137,
"grad_norm": 0.6090026974823245,
"learning_rate": 3.663063528681716e-05,
"loss": 0.4136,
"step": 524
},
{
"epoch": 1.3435700575815739,
"grad_norm": 0.46562372404588254,
"learning_rate": 3.6610721664960236e-05,
"loss": 0.4354,
"step": 525
},
{
"epoch": 1.346129238643634,
"grad_norm": 0.6500313998419536,
"learning_rate": 3.659075481587046e-05,
"loss": 0.4283,
"step": 526
},
{
"epoch": 1.3486884197056943,
"grad_norm": 0.5211253922160387,
"learning_rate": 3.65707348035294e-05,
"loss": 0.4255,
"step": 527
},
{
"epoch": 1.3512476007677543,
"grad_norm": 0.6220670330163766,
"learning_rate": 3.6550661692089e-05,
"loss": 0.4191,
"step": 528
},
{
"epoch": 1.3538067818298145,
"grad_norm": 0.544711113768934,
"learning_rate": 3.6530535545871326e-05,
"loss": 0.436,
"step": 529
},
{
"epoch": 1.3563659628918745,
"grad_norm": 0.7052970063859283,
"learning_rate": 3.65103564293684e-05,
"loss": 0.4949,
"step": 530
},
{
"epoch": 1.3589251439539347,
"grad_norm": 0.5145642841555808,
"learning_rate": 3.6490124407242007e-05,
"loss": 0.4131,
"step": 531
},
{
"epoch": 1.361484325015995,
"grad_norm": 0.5857771172798699,
"learning_rate": 3.646983954432342e-05,
"loss": 0.4146,
"step": 532
},
{
"epoch": 1.3640435060780551,
"grad_norm": 0.4920261736044566,
"learning_rate": 3.644950190561325e-05,
"loss": 0.4284,
"step": 533
},
{
"epoch": 1.3666026871401151,
"grad_norm": 0.5687057750503046,
"learning_rate": 3.642911155628124e-05,
"loss": 0.4514,
"step": 534
},
{
"epoch": 1.3691618682021753,
"grad_norm": 0.5724699735691123,
"learning_rate": 3.640866856166601e-05,
"loss": 0.4539,
"step": 535
},
{
"epoch": 1.3717210492642353,
"grad_norm": 0.5749209613889618,
"learning_rate": 3.6388172987274913e-05,
"loss": 0.3865,
"step": 536
},
{
"epoch": 1.3742802303262955,
"grad_norm": 0.6228395395499405,
"learning_rate": 3.636762489878374e-05,
"loss": 0.4075,
"step": 537
},
{
"epoch": 1.3768394113883557,
"grad_norm": 0.6903037733166263,
"learning_rate": 3.63470243620366e-05,
"loss": 0.4312,
"step": 538
},
{
"epoch": 1.379398592450416,
"grad_norm": 0.678573324042214,
"learning_rate": 3.632637144304565e-05,
"loss": 0.4806,
"step": 539
},
{
"epoch": 1.381957773512476,
"grad_norm": 0.6951420732428104,
"learning_rate": 3.6305666207990886e-05,
"loss": 0.439,
"step": 540
},
{
"epoch": 1.3845169545745362,
"grad_norm": 0.5961242888653673,
"learning_rate": 3.628490872321998e-05,
"loss": 0.4205,
"step": 541
},
{
"epoch": 1.3870761356365962,
"grad_norm": 0.7217418974601812,
"learning_rate": 3.626409905524799e-05,
"loss": 0.4707,
"step": 542
},
{
"epoch": 1.3896353166986564,
"grad_norm": 0.45054510458685476,
"learning_rate": 3.624323727075723e-05,
"loss": 0.4145,
"step": 543
},
{
"epoch": 1.3921944977607166,
"grad_norm": 0.7284286885213026,
"learning_rate": 3.622232343659698e-05,
"loss": 0.4299,
"step": 544
},
{
"epoch": 1.3947536788227768,
"grad_norm": 0.6769810527885796,
"learning_rate": 3.6201357619783336e-05,
"loss": 0.4163,
"step": 545
},
{
"epoch": 1.3973128598848368,
"grad_norm": 0.5221114603904703,
"learning_rate": 3.6180339887498953e-05,
"loss": 0.4443,
"step": 546
},
{
"epoch": 1.399872040946897,
"grad_norm": 0.6008896686645977,
"learning_rate": 3.615927030709284e-05,
"loss": 0.4318,
"step": 547
},
{
"epoch": 1.4024312220089572,
"grad_norm": 0.5343608825415525,
"learning_rate": 3.613814894608016e-05,
"loss": 0.4623,
"step": 548
},
{
"epoch": 1.4049904030710172,
"grad_norm": 0.691771745080796,
"learning_rate": 3.6116975872141984e-05,
"loss": 0.4624,
"step": 549
},
{
"epoch": 1.4075495841330774,
"grad_norm": 0.6259414433121822,
"learning_rate": 3.609575115312511e-05,
"loss": 0.4508,
"step": 550
},
{
"epoch": 1.4101087651951376,
"grad_norm": 0.6649399879213752,
"learning_rate": 3.607447485704182e-05,
"loss": 0.4143,
"step": 551
},
{
"epoch": 1.4126679462571978,
"grad_norm": 0.47253136462127165,
"learning_rate": 3.605314705206966e-05,
"loss": 0.4106,
"step": 552
},
{
"epoch": 1.4152271273192578,
"grad_norm": 0.5556693107195737,
"learning_rate": 3.603176780655124e-05,
"loss": 0.4616,
"step": 553
},
{
"epoch": 1.417786308381318,
"grad_norm": 0.4021083541093729,
"learning_rate": 3.601033718899401e-05,
"loss": 0.3928,
"step": 554
},
{
"epoch": 1.420345489443378,
"grad_norm": 0.5351580026934937,
"learning_rate": 3.598885526807003e-05,
"loss": 0.4661,
"step": 555
},
{
"epoch": 1.4229046705054382,
"grad_norm": 0.5247684064361327,
"learning_rate": 3.596732211261574e-05,
"loss": 0.4303,
"step": 556
},
{
"epoch": 1.4254638515674984,
"grad_norm": 0.49894549324782106,
"learning_rate": 3.594573779163179e-05,
"loss": 0.3938,
"step": 557
},
{
"epoch": 1.4280230326295587,
"grad_norm": 0.5751022801506976,
"learning_rate": 3.5924102374282754e-05,
"loss": 0.4401,
"step": 558
},
{
"epoch": 1.4305822136916186,
"grad_norm": 0.4560617808969142,
"learning_rate": 3.590241592989696e-05,
"loss": 0.4241,
"step": 559
},
{
"epoch": 1.4331413947536789,
"grad_norm": 0.6245655178581035,
"learning_rate": 3.5880678527966224e-05,
"loss": 0.4138,
"step": 560
},
{
"epoch": 1.4357005758157388,
"grad_norm": 0.5355963534405355,
"learning_rate": 3.5858890238145674e-05,
"loss": 0.4145,
"step": 561
},
{
"epoch": 1.438259756877799,
"grad_norm": 0.6422997737126074,
"learning_rate": 3.583705113025348e-05,
"loss": 0.4554,
"step": 562
},
{
"epoch": 1.4408189379398593,
"grad_norm": 0.6761962261433756,
"learning_rate": 3.581516127427068e-05,
"loss": 0.4176,
"step": 563
},
{
"epoch": 1.4433781190019195,
"grad_norm": 0.6187534229527232,
"learning_rate": 3.5793220740340904e-05,
"loss": 0.4255,
"step": 564
},
{
"epoch": 1.4459373000639795,
"grad_norm": 0.5255489769173007,
"learning_rate": 3.577122959877017e-05,
"loss": 0.4147,
"step": 565
},
{
"epoch": 1.4484964811260397,
"grad_norm": 0.5786362132356377,
"learning_rate": 3.57491879200267e-05,
"loss": 0.4018,
"step": 566
},
{
"epoch": 1.4510556621880997,
"grad_norm": 0.5297316242089579,
"learning_rate": 3.572709577474062e-05,
"loss": 0.4446,
"step": 567
},
{
"epoch": 1.4536148432501599,
"grad_norm": 0.5131558041266011,
"learning_rate": 3.570495323370378e-05,
"loss": 0.4475,
"step": 568
},
{
"epoch": 1.45617402431222,
"grad_norm": 0.7521443480114743,
"learning_rate": 3.568276036786952e-05,
"loss": 0.4091,
"step": 569
},
{
"epoch": 1.4587332053742803,
"grad_norm": 0.5969703958136027,
"learning_rate": 3.566051724835245e-05,
"loss": 0.4283,
"step": 570
},
{
"epoch": 1.4612923864363403,
"grad_norm": 0.4037709403552394,
"learning_rate": 3.5638223946428194e-05,
"loss": 0.4271,
"step": 571
},
{
"epoch": 1.4638515674984005,
"grad_norm": 0.5680529949916449,
"learning_rate": 3.561588053353319e-05,
"loss": 0.4253,
"step": 572
},
{
"epoch": 1.4664107485604607,
"grad_norm": 0.4379512610916079,
"learning_rate": 3.559348708126445e-05,
"loss": 0.3955,
"step": 573
},
{
"epoch": 1.4689699296225207,
"grad_norm": 0.6425572888305521,
"learning_rate": 3.557104366137934e-05,
"loss": 0.4208,
"step": 574
},
{
"epoch": 1.471529110684581,
"grad_norm": 0.5147430638168887,
"learning_rate": 3.554855034579532e-05,
"loss": 0.4206,
"step": 575
},
{
"epoch": 1.4740882917466411,
"grad_norm": 0.5773864382218094,
"learning_rate": 3.552600720658976e-05,
"loss": 0.3936,
"step": 576
},
{
"epoch": 1.4766474728087013,
"grad_norm": 0.5531560341828542,
"learning_rate": 3.550341431599967e-05,
"loss": 0.4674,
"step": 577
},
{
"epoch": 1.4792066538707613,
"grad_norm": 0.5447609179432047,
"learning_rate": 3.5480771746421494e-05,
"loss": 0.4032,
"step": 578
},
{
"epoch": 1.4817658349328215,
"grad_norm": 0.572196110734813,
"learning_rate": 3.545807957041084e-05,
"loss": 0.4509,
"step": 579
},
{
"epoch": 1.4843250159948815,
"grad_norm": 0.5028398333246705,
"learning_rate": 3.5435337860682304e-05,
"loss": 0.3334,
"step": 580
},
{
"epoch": 1.4868841970569417,
"grad_norm": 0.7160945031568345,
"learning_rate": 3.54125466901092e-05,
"loss": 0.4598,
"step": 581
},
{
"epoch": 1.489443378119002,
"grad_norm": 0.6791351765101247,
"learning_rate": 3.538970613172332e-05,
"loss": 0.4055,
"step": 582
},
{
"epoch": 1.4920025591810622,
"grad_norm": 0.8140085732718708,
"learning_rate": 3.536681625871474e-05,
"loss": 0.3982,
"step": 583
},
{
"epoch": 1.4945617402431222,
"grad_norm": 0.6600156473699682,
"learning_rate": 3.534387714443153e-05,
"loss": 0.4283,
"step": 584
},
{
"epoch": 1.4971209213051824,
"grad_norm": 0.9072053140385462,
"learning_rate": 3.532088886237956e-05,
"loss": 0.461,
"step": 585
},
{
"epoch": 1.4996801023672424,
"grad_norm": 0.5291141400904751,
"learning_rate": 3.5297851486222274e-05,
"loss": 0.4105,
"step": 586
},
{
"epoch": 1.5022392834293026,
"grad_norm": 0.8711742022509097,
"learning_rate": 3.527476508978039e-05,
"loss": 0.4266,
"step": 587
},
{
"epoch": 1.5047984644913628,
"grad_norm": 0.5011541715943939,
"learning_rate": 3.525162974703174e-05,
"loss": 0.4681,
"step": 588
},
{
"epoch": 1.507357645553423,
"grad_norm": 0.7884765548368484,
"learning_rate": 3.5228445532110996e-05,
"loss": 0.4341,
"step": 589
},
{
"epoch": 1.5099168266154832,
"grad_norm": 0.5740501726759667,
"learning_rate": 3.520521251930941e-05,
"loss": 0.4128,
"step": 590
},
{
"epoch": 1.5124760076775432,
"grad_norm": 0.5460891296171442,
"learning_rate": 3.518193078307463e-05,
"loss": 0.4188,
"step": 591
},
{
"epoch": 1.5150351887396032,
"grad_norm": 0.7055940264802735,
"learning_rate": 3.515860039801043e-05,
"loss": 0.3965,
"step": 592
},
{
"epoch": 1.5175943698016634,
"grad_norm": 0.5874392809254365,
"learning_rate": 3.513522143887645e-05,
"loss": 0.4918,
"step": 593
},
{
"epoch": 1.5201535508637236,
"grad_norm": 0.6245938510885568,
"learning_rate": 3.5111793980588006e-05,
"loss": 0.4285,
"step": 594
},
{
"epoch": 1.5227127319257838,
"grad_norm": 0.48774952340541083,
"learning_rate": 3.5088318098215805e-05,
"loss": 0.4013,
"step": 595
},
{
"epoch": 1.525271912987844,
"grad_norm": 0.4468453631601922,
"learning_rate": 3.506479386698575e-05,
"loss": 0.3958,
"step": 596
},
{
"epoch": 1.527831094049904,
"grad_norm": 0.5538376365976897,
"learning_rate": 3.5041221362278644e-05,
"loss": 0.4347,
"step": 597
},
{
"epoch": 1.530390275111964,
"grad_norm": 0.4775499856280007,
"learning_rate": 3.5017600659629986e-05,
"loss": 0.4484,
"step": 598
},
{
"epoch": 1.5329494561740242,
"grad_norm": 0.5259243816931762,
"learning_rate": 3.499393183472973e-05,
"loss": 0.4211,
"step": 599
},
{
"epoch": 1.5355086372360844,
"grad_norm": 0.5198109524806964,
"learning_rate": 3.497021496342203e-05,
"loss": 0.4363,
"step": 600
},
{
"epoch": 1.5380678182981447,
"grad_norm": 0.4696941806509467,
"learning_rate": 3.494645012170498e-05,
"loss": 0.4295,
"step": 601
},
{
"epoch": 1.5406269993602049,
"grad_norm": 0.6474700305067774,
"learning_rate": 3.4922637385730406e-05,
"loss": 0.5,
"step": 602
},
{
"epoch": 1.5431861804222649,
"grad_norm": 0.4793948522260617,
"learning_rate": 3.489877683180362e-05,
"loss": 0.3845,
"step": 603
},
{
"epoch": 1.545745361484325,
"grad_norm": 0.539237627319592,
"learning_rate": 3.487486853638314e-05,
"loss": 0.4356,
"step": 604
},
{
"epoch": 1.548304542546385,
"grad_norm": 0.5532057074478213,
"learning_rate": 3.485091257608047e-05,
"loss": 0.3891,
"step": 605
},
{
"epoch": 1.5508637236084453,
"grad_norm": 0.6992089262871057,
"learning_rate": 3.482690902765984e-05,
"loss": 0.4571,
"step": 606
},
{
"epoch": 1.5534229046705055,
"grad_norm": 0.4638019008983174,
"learning_rate": 3.4802857968038e-05,
"loss": 0.4188,
"step": 607
},
{
"epoch": 1.5559820857325657,
"grad_norm": 0.7914532265925823,
"learning_rate": 3.4778759474283936e-05,
"loss": 0.4534,
"step": 608
},
{
"epoch": 1.5585412667946257,
"grad_norm": 0.5295039816146513,
"learning_rate": 3.475461362361861e-05,
"loss": 0.4001,
"step": 609
},
{
"epoch": 1.561100447856686,
"grad_norm": 0.6132941939025509,
"learning_rate": 3.473042049341474e-05,
"loss": 0.4225,
"step": 610
},
{
"epoch": 1.5636596289187459,
"grad_norm": 0.714057587477597,
"learning_rate": 3.470618016119658e-05,
"loss": 0.4136,
"step": 611
},
{
"epoch": 1.566218809980806,
"grad_norm": 0.5320432430474729,
"learning_rate": 3.468189270463959e-05,
"loss": 0.4004,
"step": 612
},
{
"epoch": 1.5687779910428663,
"grad_norm": 0.66784032505952,
"learning_rate": 3.465755820157026e-05,
"loss": 0.4065,
"step": 613
},
{
"epoch": 1.5713371721049265,
"grad_norm": 0.49152795733031895,
"learning_rate": 3.463317672996583e-05,
"loss": 0.3791,
"step": 614
},
{
"epoch": 1.5738963531669867,
"grad_norm": 0.555185505506747,
"learning_rate": 3.4608748367954064e-05,
"loss": 0.4633,
"step": 615
},
{
"epoch": 1.5764555342290467,
"grad_norm": 0.4980166871980404,
"learning_rate": 3.4584273193812956e-05,
"loss": 0.4252,
"step": 616
},
{
"epoch": 1.5790147152911067,
"grad_norm": 0.589732498310526,
"learning_rate": 3.45597512859705e-05,
"loss": 0.4839,
"step": 617
},
{
"epoch": 1.581573896353167,
"grad_norm": 0.5743441085203349,
"learning_rate": 3.4535182723004466e-05,
"loss": 0.4062,
"step": 618
},
{
"epoch": 1.5841330774152271,
"grad_norm": 0.49126577666625093,
"learning_rate": 3.451056758364212e-05,
"loss": 0.4135,
"step": 619
},
{
"epoch": 1.5866922584772873,
"grad_norm": 0.5037192947792559,
"learning_rate": 3.4485905946759965e-05,
"loss": 0.459,
"step": 620
},
{
"epoch": 1.5892514395393476,
"grad_norm": 0.5439325021578344,
"learning_rate": 3.446119789138351e-05,
"loss": 0.3882,
"step": 621
},
{
"epoch": 1.5918106206014075,
"grad_norm": 0.5020489297705218,
"learning_rate": 3.443644349668701e-05,
"loss": 0.4053,
"step": 622
},
{
"epoch": 1.5943698016634675,
"grad_norm": 0.5634072314676822,
"learning_rate": 3.4411642841993185e-05,
"loss": 0.4065,
"step": 623
},
{
"epoch": 1.5969289827255277,
"grad_norm": 0.4911032231149214,
"learning_rate": 3.438679600677303e-05,
"loss": 0.4207,
"step": 624
},
{
"epoch": 1.599488163787588,
"grad_norm": 0.6463093595410542,
"learning_rate": 3.4361903070645484e-05,
"loss": 0.4195,
"step": 625
},
{
"epoch": 1.6020473448496482,
"grad_norm": 0.5395555876619189,
"learning_rate": 3.433696411337723e-05,
"loss": 0.4359,
"step": 626
},
{
"epoch": 1.6046065259117084,
"grad_norm": 0.5456191870776568,
"learning_rate": 3.431197921488242e-05,
"loss": 0.4325,
"step": 627
},
{
"epoch": 1.6071657069737684,
"grad_norm": 0.5181489907237284,
"learning_rate": 3.4286948455222425e-05,
"loss": 0.4262,
"step": 628
},
{
"epoch": 1.6097248880358286,
"grad_norm": 0.4542522655783656,
"learning_rate": 3.426187191460555e-05,
"loss": 0.4008,
"step": 629
},
{
"epoch": 1.6122840690978886,
"grad_norm": 0.5748315523058384,
"learning_rate": 3.423674967338681e-05,
"loss": 0.4613,
"step": 630
},
{
"epoch": 1.6148432501599488,
"grad_norm": 0.4893291757606303,
"learning_rate": 3.421158181206769e-05,
"loss": 0.411,
"step": 631
},
{
"epoch": 1.617402431222009,
"grad_norm": 0.580837787265809,
"learning_rate": 3.418636841129582e-05,
"loss": 0.417,
"step": 632
},
{
"epoch": 1.6199616122840692,
"grad_norm": 0.6495075917348687,
"learning_rate": 3.416110955186477e-05,
"loss": 0.4817,
"step": 633
},
{
"epoch": 1.6225207933461292,
"grad_norm": 0.45060911216179744,
"learning_rate": 3.4135805314713804e-05,
"loss": 0.4033,
"step": 634
},
{
"epoch": 1.6250799744081894,
"grad_norm": 0.5998653037164785,
"learning_rate": 3.411045578092754e-05,
"loss": 0.3912,
"step": 635
},
{
"epoch": 1.6276391554702494,
"grad_norm": 0.5310655122776353,
"learning_rate": 3.4085061031735794e-05,
"loss": 0.4313,
"step": 636
},
{
"epoch": 1.6301983365323096,
"grad_norm": 0.6388088047336757,
"learning_rate": 3.405962114851324e-05,
"loss": 0.4433,
"step": 637
},
{
"epoch": 1.6327575175943698,
"grad_norm": 0.5039515005309823,
"learning_rate": 3.4034136212779195e-05,
"loss": 0.414,
"step": 638
},
{
"epoch": 1.63531669865643,
"grad_norm": 0.6351418548004399,
"learning_rate": 3.4008606306197336e-05,
"loss": 0.4271,
"step": 639
},
{
"epoch": 1.6378758797184902,
"grad_norm": 0.5382251510311172,
"learning_rate": 3.398303151057543e-05,
"loss": 0.4223,
"step": 640
},
{
"epoch": 1.6404350607805502,
"grad_norm": 0.5213424824749899,
"learning_rate": 3.3957411907865123e-05,
"loss": 0.4169,
"step": 641
},
{
"epoch": 1.6429942418426102,
"grad_norm": 0.6114074897511479,
"learning_rate": 3.393174758016161e-05,
"loss": 0.4141,
"step": 642
},
{
"epoch": 1.6455534229046704,
"grad_norm": 0.5077805851469291,
"learning_rate": 3.39060386097034e-05,
"loss": 0.4354,
"step": 643
},
{
"epoch": 1.6481126039667306,
"grad_norm": 0.5648032929681525,
"learning_rate": 3.3880285078872076e-05,
"loss": 0.3944,
"step": 644
},
{
"epoch": 1.6506717850287909,
"grad_norm": 0.5623890506817537,
"learning_rate": 3.385448707019199e-05,
"loss": 0.463,
"step": 645
},
{
"epoch": 1.653230966090851,
"grad_norm": 0.47253413270859174,
"learning_rate": 3.382864466633003e-05,
"loss": 0.4179,
"step": 646
},
{
"epoch": 1.655790147152911,
"grad_norm": 0.6667403121893724,
"learning_rate": 3.3802757950095346e-05,
"loss": 0.4401,
"step": 647
},
{
"epoch": 1.658349328214971,
"grad_norm": 0.48362697457689036,
"learning_rate": 3.377682700443907e-05,
"loss": 0.4294,
"step": 648
},
{
"epoch": 1.6609085092770313,
"grad_norm": 0.49305745599661777,
"learning_rate": 3.375085191245407e-05,
"loss": 0.4166,
"step": 649
},
{
"epoch": 1.6634676903390915,
"grad_norm": 0.5645696987568555,
"learning_rate": 3.372483275737468e-05,
"loss": 0.3922,
"step": 650
},
{
"epoch": 1.6660268714011517,
"grad_norm": 0.6497192793696049,
"learning_rate": 3.3698769622576404e-05,
"loss": 0.484,
"step": 651
},
{
"epoch": 1.668586052463212,
"grad_norm": 0.6004795627408669,
"learning_rate": 3.367266259157572e-05,
"loss": 0.4744,
"step": 652
},
{
"epoch": 1.671145233525272,
"grad_norm": 0.6035991357132837,
"learning_rate": 3.364651174802974e-05,
"loss": 0.4576,
"step": 653
},
{
"epoch": 1.673704414587332,
"grad_norm": 0.5638974284467396,
"learning_rate": 3.3620317175735945e-05,
"loss": 0.3829,
"step": 654
},
{
"epoch": 1.676263595649392,
"grad_norm": 0.6144746966380764,
"learning_rate": 3.359407895863199e-05,
"loss": 0.4219,
"step": 655
},
{
"epoch": 1.6788227767114523,
"grad_norm": 0.6330635207423285,
"learning_rate": 3.356779718079534e-05,
"loss": 0.3939,
"step": 656
},
{
"epoch": 1.6813819577735125,
"grad_norm": 0.6511893943323425,
"learning_rate": 3.3541471926443084e-05,
"loss": 0.4626,
"step": 657
},
{
"epoch": 1.6839411388355727,
"grad_norm": 0.7313805162264239,
"learning_rate": 3.3515103279931584e-05,
"loss": 0.4443,
"step": 658
},
{
"epoch": 1.6865003198976327,
"grad_norm": 0.4567231810719478,
"learning_rate": 3.3488691325756294e-05,
"loss": 0.4072,
"step": 659
},
{
"epoch": 1.689059500959693,
"grad_norm": 0.6498464270828352,
"learning_rate": 3.34622361485514e-05,
"loss": 0.4532,
"step": 660
},
{
"epoch": 1.691618682021753,
"grad_norm": 0.46427052887792153,
"learning_rate": 3.343573783308964e-05,
"loss": 0.4266,
"step": 661
},
{
"epoch": 1.6941778630838131,
"grad_norm": 0.5468516675156209,
"learning_rate": 3.340919646428193e-05,
"loss": 0.4208,
"step": 662
},
{
"epoch": 1.6967370441458733,
"grad_norm": 0.5751528606477143,
"learning_rate": 3.3382612127177166e-05,
"loss": 0.4146,
"step": 663
},
{
"epoch": 1.6992962252079336,
"grad_norm": 0.5793295201191827,
"learning_rate": 3.335598490696196e-05,
"loss": 0.4623,
"step": 664
},
{
"epoch": 1.7018554062699938,
"grad_norm": 0.7077864391730174,
"learning_rate": 3.332931488896029e-05,
"loss": 0.4459,
"step": 665
},
{
"epoch": 1.7044145873320538,
"grad_norm": 0.4786371924890489,
"learning_rate": 3.330260215863332e-05,
"loss": 0.3967,
"step": 666
},
{
"epoch": 1.7069737683941137,
"grad_norm": 0.71627935746326,
"learning_rate": 3.327584680157904e-05,
"loss": 0.4466,
"step": 667
},
{
"epoch": 1.709532949456174,
"grad_norm": 0.5672877205591593,
"learning_rate": 3.3249048903532075e-05,
"loss": 0.4245,
"step": 668
},
{
"epoch": 1.7120921305182342,
"grad_norm": 0.5967392434258936,
"learning_rate": 3.322220855036333e-05,
"loss": 0.4399,
"step": 669
},
{
"epoch": 1.7146513115802944,
"grad_norm": 0.619642871257619,
"learning_rate": 3.319532582807977e-05,
"loss": 0.4429,
"step": 670
},
{
"epoch": 1.7172104926423546,
"grad_norm": 0.5718993726856418,
"learning_rate": 3.316840082282412e-05,
"loss": 0.4049,
"step": 671
},
{
"epoch": 1.7197696737044146,
"grad_norm": 0.7158788807881703,
"learning_rate": 3.314143362087462e-05,
"loss": 0.465,
"step": 672
},
{
"epoch": 1.7223288547664746,
"grad_norm": 0.49801282221595167,
"learning_rate": 3.3114424308644686e-05,
"loss": 0.4304,
"step": 673
},
{
"epoch": 1.7248880358285348,
"grad_norm": 0.732000530911472,
"learning_rate": 3.3087372972682703e-05,
"loss": 0.4496,
"step": 674
},
{
"epoch": 1.727447216890595,
"grad_norm": 0.5016534768936702,
"learning_rate": 3.30602796996717e-05,
"loss": 0.4196,
"step": 675
},
{
"epoch": 1.7300063979526552,
"grad_norm": 0.6732998458752849,
"learning_rate": 3.303314457642911e-05,
"loss": 0.4377,
"step": 676
},
{
"epoch": 1.7325655790147154,
"grad_norm": 0.512135853555358,
"learning_rate": 3.300596768990644e-05,
"loss": 0.4032,
"step": 677
},
{
"epoch": 1.7351247600767754,
"grad_norm": 0.5565097137954512,
"learning_rate": 3.297874912718902e-05,
"loss": 0.4124,
"step": 678
},
{
"epoch": 1.7376839411388356,
"grad_norm": 0.5579693456602185,
"learning_rate": 3.2951488975495785e-05,
"loss": 0.4493,
"step": 679
},
{
"epoch": 1.7402431222008956,
"grad_norm": 0.5767956410825538,
"learning_rate": 3.2924187322178865e-05,
"loss": 0.4701,
"step": 680
},
{
"epoch": 1.7428023032629558,
"grad_norm": 0.474585132199083,
"learning_rate": 3.2896844254723414e-05,
"loss": 0.4118,
"step": 681
},
{
"epoch": 1.745361484325016,
"grad_norm": 0.44776574599085095,
"learning_rate": 3.28694598607473e-05,
"loss": 0.3928,
"step": 682
},
{
"epoch": 1.7479206653870762,
"grad_norm": 0.4613485770735262,
"learning_rate": 3.28420342280008e-05,
"loss": 0.4185,
"step": 683
},
{
"epoch": 1.7504798464491362,
"grad_norm": 0.6343181624558385,
"learning_rate": 3.281456744436634e-05,
"loss": 0.4133,
"step": 684
},
{
"epoch": 1.7530390275111964,
"grad_norm": 0.5493779464654077,
"learning_rate": 3.278705959785821e-05,
"loss": 0.4671,
"step": 685
},
{
"epoch": 1.7555982085732564,
"grad_norm": 0.5441386561154584,
"learning_rate": 3.2759510776622274e-05,
"loss": 0.4453,
"step": 686
},
{
"epoch": 1.7581573896353166,
"grad_norm": 0.629823120577556,
"learning_rate": 3.273192106893572e-05,
"loss": 0.3839,
"step": 687
},
{
"epoch": 1.7607165706973769,
"grad_norm": 0.549530453190963,
"learning_rate": 3.270429056320672e-05,
"loss": 0.4502,
"step": 688
},
{
"epoch": 1.763275751759437,
"grad_norm": 0.6031975081301458,
"learning_rate": 3.26766193479742e-05,
"loss": 0.4738,
"step": 689
},
{
"epoch": 1.7658349328214973,
"grad_norm": 0.48136220411058944,
"learning_rate": 3.2648907511907544e-05,
"loss": 0.4036,
"step": 690
},
{
"epoch": 1.7683941138835573,
"grad_norm": 0.6253783336704821,
"learning_rate": 3.262115514380628e-05,
"loss": 0.4081,
"step": 691
},
{
"epoch": 1.7709532949456173,
"grad_norm": 0.47939071537048983,
"learning_rate": 3.25933623325998e-05,
"loss": 0.4314,
"step": 692
},
{
"epoch": 1.7735124760076775,
"grad_norm": 0.4518873833371369,
"learning_rate": 3.256552916734713e-05,
"loss": 0.3986,
"step": 693
},
{
"epoch": 1.7760716570697377,
"grad_norm": 0.6074017672602955,
"learning_rate": 3.25376557372366e-05,
"loss": 0.4324,
"step": 694
},
{
"epoch": 1.778630838131798,
"grad_norm": 0.470674972052956,
"learning_rate": 3.250974213158555e-05,
"loss": 0.3933,
"step": 695
},
{
"epoch": 1.781190019193858,
"grad_norm": 0.6255159824430874,
"learning_rate": 3.248178843984006e-05,
"loss": 0.4252,
"step": 696
},
{
"epoch": 1.783749200255918,
"grad_norm": 0.5371760909763971,
"learning_rate": 3.245379475157465e-05,
"loss": 0.4778,
"step": 697
},
{
"epoch": 1.786308381317978,
"grad_norm": 0.5168010335153898,
"learning_rate": 3.242576115649205e-05,
"loss": 0.4229,
"step": 698
},
{
"epoch": 1.7888675623800383,
"grad_norm": 0.49166449165933496,
"learning_rate": 3.239768774442281e-05,
"loss": 0.4005,
"step": 699
},
{
"epoch": 1.7914267434420985,
"grad_norm": 0.4932386148580624,
"learning_rate": 3.23695746053251e-05,
"loss": 0.4163,
"step": 700
},
{
"epoch": 1.7939859245041587,
"grad_norm": 0.5880360652699835,
"learning_rate": 3.2341421829284394e-05,
"loss": 0.4413,
"step": 701
},
{
"epoch": 1.796545105566219,
"grad_norm": 0.4625923123089751,
"learning_rate": 3.2313229506513167e-05,
"loss": 0.426,
"step": 702
},
{
"epoch": 1.799104286628279,
"grad_norm": 0.5109208128171377,
"learning_rate": 3.228499772735062e-05,
"loss": 0.393,
"step": 703
},
{
"epoch": 1.8016634676903391,
"grad_norm": 0.4806179945017673,
"learning_rate": 3.2256726582262384e-05,
"loss": 0.4479,
"step": 704
},
{
"epoch": 1.8042226487523991,
"grad_norm": 0.4970665904869278,
"learning_rate": 3.222841616184025e-05,
"loss": 0.4318,
"step": 705
},
{
"epoch": 1.8067818298144593,
"grad_norm": 0.5666482942373245,
"learning_rate": 3.220006655680183e-05,
"loss": 0.4245,
"step": 706
},
{
"epoch": 1.8093410108765196,
"grad_norm": 0.4999181748027583,
"learning_rate": 3.2171677857990334e-05,
"loss": 0.4372,
"step": 707
},
{
"epoch": 1.8119001919385798,
"grad_norm": 0.47380754642052403,
"learning_rate": 3.2143250156374226e-05,
"loss": 0.3926,
"step": 708
},
{
"epoch": 1.8144593730006398,
"grad_norm": 0.6008131918939661,
"learning_rate": 3.211478354304695e-05,
"loss": 0.4533,
"step": 709
},
{
"epoch": 1.8170185540627,
"grad_norm": 0.5144973230871912,
"learning_rate": 3.208627810922665e-05,
"loss": 0.4352,
"step": 710
},
{
"epoch": 1.81957773512476,
"grad_norm": 0.4699623742277227,
"learning_rate": 3.2057733946255844e-05,
"loss": 0.3852,
"step": 711
},
{
"epoch": 1.8221369161868202,
"grad_norm": 0.4961706904135912,
"learning_rate": 3.202915114560118e-05,
"loss": 0.4445,
"step": 712
},
{
"epoch": 1.8246960972488804,
"grad_norm": 0.561279443158619,
"learning_rate": 3.200052979885309e-05,
"loss": 0.4802,
"step": 713
},
{
"epoch": 1.8272552783109406,
"grad_norm": 0.4557933345271347,
"learning_rate": 3.197186999772555e-05,
"loss": 0.4029,
"step": 714
},
{
"epoch": 1.8298144593730008,
"grad_norm": 0.589112376636283,
"learning_rate": 3.194317183405573e-05,
"loss": 0.4563,
"step": 715
},
{
"epoch": 1.8323736404350608,
"grad_norm": 0.5161326824368248,
"learning_rate": 3.191443539980374e-05,
"loss": 0.4556,
"step": 716
},
{
"epoch": 1.8349328214971208,
"grad_norm": 0.464424728984026,
"learning_rate": 3.188566078705235e-05,
"loss": 0.4044,
"step": 717
},
{
"epoch": 1.837492002559181,
"grad_norm": 0.47067836600914265,
"learning_rate": 3.1856848088006636e-05,
"loss": 0.4335,
"step": 718
},
{
"epoch": 1.8400511836212412,
"grad_norm": 0.5769491747912345,
"learning_rate": 3.182799739499371e-05,
"loss": 0.4407,
"step": 719
},
{
"epoch": 1.8426103646833014,
"grad_norm": 0.4932465377591071,
"learning_rate": 3.1799108800462466e-05,
"loss": 0.4328,
"step": 720
},
{
"epoch": 1.8451695457453616,
"grad_norm": 0.488480860113565,
"learning_rate": 3.177018239698322e-05,
"loss": 0.4235,
"step": 721
},
{
"epoch": 1.8477287268074216,
"grad_norm": 0.48833589337954714,
"learning_rate": 3.1741218277247466e-05,
"loss": 0.4132,
"step": 722
},
{
"epoch": 1.8502879078694816,
"grad_norm": 0.39099920314827113,
"learning_rate": 3.1712216534067536e-05,
"loss": 0.4265,
"step": 723
},
{
"epoch": 1.8528470889315418,
"grad_norm": 0.45542998139885993,
"learning_rate": 3.168317726037634e-05,
"loss": 0.3971,
"step": 724
},
{
"epoch": 1.855406269993602,
"grad_norm": 0.46738782533250195,
"learning_rate": 3.1654100549227024e-05,
"loss": 0.4559,
"step": 725
},
{
"epoch": 1.8579654510556622,
"grad_norm": 0.40700257974662957,
"learning_rate": 3.1624986493792735e-05,
"loss": 0.4135,
"step": 726
},
{
"epoch": 1.8605246321177225,
"grad_norm": 0.4797435460256252,
"learning_rate": 3.159583518736625e-05,
"loss": 0.4463,
"step": 727
},
{
"epoch": 1.8630838131797824,
"grad_norm": 0.47740901056002083,
"learning_rate": 3.156664672335973e-05,
"loss": 0.3884,
"step": 728
},
{
"epoch": 1.8656429942418427,
"grad_norm": 0.49631875720308977,
"learning_rate": 3.153742119530441e-05,
"loss": 0.4162,
"step": 729
},
{
"epoch": 1.8682021753039026,
"grad_norm": 0.49356825084947964,
"learning_rate": 3.1508158696850275e-05,
"loss": 0.4329,
"step": 730
},
{
"epoch": 1.8707613563659629,
"grad_norm": 0.4599036508089157,
"learning_rate": 3.1478859321765796e-05,
"loss": 0.428,
"step": 731
},
{
"epoch": 1.873320537428023,
"grad_norm": 0.48294927528675924,
"learning_rate": 3.144952316393758e-05,
"loss": 0.4058,
"step": 732
},
{
"epoch": 1.8758797184900833,
"grad_norm": 0.5890205448596298,
"learning_rate": 3.142015031737016e-05,
"loss": 0.4776,
"step": 733
},
{
"epoch": 1.8784388995521433,
"grad_norm": 0.4923421153314979,
"learning_rate": 3.139074087618556e-05,
"loss": 0.4045,
"step": 734
},
{
"epoch": 1.8809980806142035,
"grad_norm": 0.5087618925492778,
"learning_rate": 3.136129493462312e-05,
"loss": 0.4275,
"step": 735
},
{
"epoch": 1.8835572616762635,
"grad_norm": 0.5648227631500222,
"learning_rate": 3.133181258703912e-05,
"loss": 0.4727,
"step": 736
},
{
"epoch": 1.8861164427383237,
"grad_norm": 0.5234994479526746,
"learning_rate": 3.1302293927906516e-05,
"loss": 0.3967,
"step": 737
},
{
"epoch": 1.888675623800384,
"grad_norm": 0.6807348510101979,
"learning_rate": 3.1272739051814594e-05,
"loss": 0.4551,
"step": 738
},
{
"epoch": 1.891234804862444,
"grad_norm": 0.4969832272412207,
"learning_rate": 3.1243148053468715e-05,
"loss": 0.3773,
"step": 739
},
{
"epoch": 1.8937939859245043,
"grad_norm": 0.6317866448620022,
"learning_rate": 3.121352102768998e-05,
"loss": 0.4389,
"step": 740
},
{
"epoch": 1.8963531669865643,
"grad_norm": 0.5131091797313253,
"learning_rate": 3.1183858069414936e-05,
"loss": 0.4458,
"step": 741
},
{
"epoch": 1.8989123480486243,
"grad_norm": 0.4972035729160381,
"learning_rate": 3.115415927369529e-05,
"loss": 0.4451,
"step": 742
},
{
"epoch": 1.9014715291106845,
"grad_norm": 0.47895024578706524,
"learning_rate": 3.112442473569754e-05,
"loss": 0.4324,
"step": 743
},
{
"epoch": 1.9040307101727447,
"grad_norm": 0.4691676491006599,
"learning_rate": 3.109465455070278e-05,
"loss": 0.4035,
"step": 744
},
{
"epoch": 1.906589891234805,
"grad_norm": 0.4736286307947326,
"learning_rate": 3.106484881410628e-05,
"loss": 0.4446,
"step": 745
},
{
"epoch": 1.9091490722968651,
"grad_norm": 0.4359831705290721,
"learning_rate": 3.103500762141725e-05,
"loss": 0.3829,
"step": 746
},
{
"epoch": 1.9117082533589251,
"grad_norm": 0.462916560551583,
"learning_rate": 3.1005131068258506e-05,
"loss": 0.4107,
"step": 747
},
{
"epoch": 1.9142674344209851,
"grad_norm": 0.47173985630781595,
"learning_rate": 3.09752192503662e-05,
"loss": 0.3903,
"step": 748
},
{
"epoch": 1.9168266154830453,
"grad_norm": 0.43246703137114556,
"learning_rate": 3.094527226358945e-05,
"loss": 0.4091,
"step": 749
},
{
"epoch": 1.9193857965451055,
"grad_norm": 0.5056443186885541,
"learning_rate": 3.091529020389009e-05,
"loss": 0.4837,
"step": 750
},
{
"epoch": 1.9219449776071658,
"grad_norm": 0.49376735973598973,
"learning_rate": 3.088527316734235e-05,
"loss": 0.4124,
"step": 751
},
{
"epoch": 1.924504158669226,
"grad_norm": 0.5428642850508197,
"learning_rate": 3.08552212501325e-05,
"loss": 0.4304,
"step": 752
},
{
"epoch": 1.927063339731286,
"grad_norm": 0.5365719180678239,
"learning_rate": 3.082513454855863e-05,
"loss": 0.405,
"step": 753
},
{
"epoch": 1.9296225207933462,
"grad_norm": 0.4559502049703374,
"learning_rate": 3.079501315903026e-05,
"loss": 0.445,
"step": 754
},
{
"epoch": 1.9321817018554062,
"grad_norm": 0.5222902812164878,
"learning_rate": 3.076485717806808e-05,
"loss": 0.3726,
"step": 755
},
{
"epoch": 1.9347408829174664,
"grad_norm": 0.5737561385558596,
"learning_rate": 3.073466670230361e-05,
"loss": 0.4588,
"step": 756
},
{
"epoch": 1.9373000639795266,
"grad_norm": 0.43383831135928497,
"learning_rate": 3.070444182847891e-05,
"loss": 0.4006,
"step": 757
},
{
"epoch": 1.9398592450415868,
"grad_norm": 0.49738529963698463,
"learning_rate": 3.067418265344628e-05,
"loss": 0.404,
"step": 758
},
{
"epoch": 1.9424184261036468,
"grad_norm": 0.4779872060995513,
"learning_rate": 3.0643889274167926e-05,
"loss": 0.4642,
"step": 759
},
{
"epoch": 1.944977607165707,
"grad_norm": 0.4501703649941174,
"learning_rate": 3.061356178771564e-05,
"loss": 0.3845,
"step": 760
},
{
"epoch": 1.947536788227767,
"grad_norm": 0.566851781049989,
"learning_rate": 3.058320029127052e-05,
"loss": 0.4603,
"step": 761
},
{
"epoch": 1.9500959692898272,
"grad_norm": 0.41716803055724166,
"learning_rate": 3.055280488212266e-05,
"loss": 0.3988,
"step": 762
},
{
"epoch": 1.9526551503518874,
"grad_norm": 0.5321354765650695,
"learning_rate": 3.052237565767079e-05,
"loss": 0.4633,
"step": 763
},
{
"epoch": 1.9552143314139476,
"grad_norm": 0.5101148541262678,
"learning_rate": 3.0491912715422047e-05,
"loss": 0.4154,
"step": 764
},
{
"epoch": 1.9577735124760078,
"grad_norm": 0.44138127982821407,
"learning_rate": 3.0461416152991555e-05,
"loss": 0.3971,
"step": 765
},
{
"epoch": 1.9603326935380678,
"grad_norm": 0.5119670448118282,
"learning_rate": 3.043088606810221e-05,
"loss": 0.4344,
"step": 766
},
{
"epoch": 1.9628918746001278,
"grad_norm": 0.5844510240848945,
"learning_rate": 3.0400322558584308e-05,
"loss": 0.4369,
"step": 767
},
{
"epoch": 1.965451055662188,
"grad_norm": 0.45663704360586077,
"learning_rate": 3.0369725722375274e-05,
"loss": 0.4666,
"step": 768
},
{
"epoch": 1.9680102367242482,
"grad_norm": 0.539565481061931,
"learning_rate": 3.0339095657519292e-05,
"loss": 0.4359,
"step": 769
},
{
"epoch": 1.9705694177863085,
"grad_norm": 0.5385392681842599,
"learning_rate": 3.0308432462167045e-05,
"loss": 0.4264,
"step": 770
},
{
"epoch": 1.9731285988483687,
"grad_norm": 0.432889165826209,
"learning_rate": 3.0277736234575378e-05,
"loss": 0.3845,
"step": 771
},
{
"epoch": 1.9756877799104287,
"grad_norm": 0.5443873180170078,
"learning_rate": 3.0247007073106976e-05,
"loss": 0.406,
"step": 772
},
{
"epoch": 1.9782469609724886,
"grad_norm": 0.5012354762450505,
"learning_rate": 3.0216245076230062e-05,
"loss": 0.4334,
"step": 773
},
{
"epoch": 1.9808061420345489,
"grad_norm": 0.5232208647955975,
"learning_rate": 3.0185450342518075e-05,
"loss": 0.4268,
"step": 774
},
{
"epoch": 1.983365323096609,
"grad_norm": 0.4513266845951912,
"learning_rate": 3.015462297064936e-05,
"loss": 0.3783,
"step": 775
},
{
"epoch": 1.9859245041586693,
"grad_norm": 0.5054305167039745,
"learning_rate": 3.0123763059406835e-05,
"loss": 0.4148,
"step": 776
},
{
"epoch": 1.9884836852207295,
"grad_norm": 0.47543662649122564,
"learning_rate": 3.009287070767771e-05,
"loss": 0.4083,
"step": 777
},
{
"epoch": 1.9910428662827895,
"grad_norm": 0.5147396246458542,
"learning_rate": 3.0061946014453113e-05,
"loss": 0.406,
"step": 778
},
{
"epoch": 1.9936020473448497,
"grad_norm": 0.537028842282906,
"learning_rate": 3.0030989078827848e-05,
"loss": 0.386,
"step": 779
},
{
"epoch": 1.9961612284069097,
"grad_norm": 0.4661586754448457,
"learning_rate": 3.0000000000000004e-05,
"loss": 0.4218,
"step": 780
},
{
"epoch": 1.99872040946897,
"grad_norm": 0.4261635908664324,
"learning_rate": 2.9968978877270672e-05,
"loss": 0.4449,
"step": 781
},
{
"epoch": 2.00127959053103,
"grad_norm": 0.6837214787749436,
"learning_rate": 2.9937925810043654e-05,
"loss": 0.4318,
"step": 782
},
{
"epoch": 2.0038387715930903,
"grad_norm": 0.451386066705997,
"learning_rate": 2.990684089782507e-05,
"loss": 0.3612,
"step": 783
},
{
"epoch": 2.0063979526551505,
"grad_norm": 0.7078432089850583,
"learning_rate": 2.987572424022311e-05,
"loss": 0.3505,
"step": 784
},
{
"epoch": 2.0089571337172103,
"grad_norm": 0.5081289643217769,
"learning_rate": 2.98445759369477e-05,
"loss": 0.3149,
"step": 785
},
{
"epoch": 2.0115163147792705,
"grad_norm": 0.5849221231819475,
"learning_rate": 2.9813396087810134e-05,
"loss": 0.3514,
"step": 786
},
{
"epoch": 2.0140754958413307,
"grad_norm": 0.48371629559591783,
"learning_rate": 2.9782184792722845e-05,
"loss": 0.3339,
"step": 787
},
{
"epoch": 2.016634676903391,
"grad_norm": 0.6063864453716905,
"learning_rate": 2.9750942151698968e-05,
"loss": 0.389,
"step": 788
},
{
"epoch": 2.019193857965451,
"grad_norm": 0.6277898887247666,
"learning_rate": 2.971966826485212e-05,
"loss": 0.3283,
"step": 789
},
{
"epoch": 2.0217530390275114,
"grad_norm": 0.5884137115714144,
"learning_rate": 2.9688363232396056e-05,
"loss": 0.3353,
"step": 790
},
{
"epoch": 2.024312220089571,
"grad_norm": 0.5242949584836221,
"learning_rate": 2.9657027154644294e-05,
"loss": 0.3059,
"step": 791
},
{
"epoch": 2.0268714011516313,
"grad_norm": 0.5023037783914988,
"learning_rate": 2.962566013200986e-05,
"loss": 0.3433,
"step": 792
},
{
"epoch": 2.0294305822136915,
"grad_norm": 0.6201000972339953,
"learning_rate": 2.959426226500493e-05,
"loss": 0.318,
"step": 793
},
{
"epoch": 2.0319897632757518,
"grad_norm": 0.44395380528610195,
"learning_rate": 2.9562833654240518e-05,
"loss": 0.3401,
"step": 794
},
{
"epoch": 2.034548944337812,
"grad_norm": 0.43056205609112264,
"learning_rate": 2.9531374400426158e-05,
"loss": 0.2937,
"step": 795
},
{
"epoch": 2.037108125399872,
"grad_norm": 0.4803245756648518,
"learning_rate": 2.949988460436958e-05,
"loss": 0.3396,
"step": 796
},
{
"epoch": 2.0396673064619324,
"grad_norm": 0.41525308273215306,
"learning_rate": 2.946836436697636e-05,
"loss": 0.3508,
"step": 797
},
{
"epoch": 2.042226487523992,
"grad_norm": 0.4033224019153147,
"learning_rate": 2.943681378924964e-05,
"loss": 0.307,
"step": 798
},
{
"epoch": 2.0447856685860524,
"grad_norm": 0.4163421073725909,
"learning_rate": 2.94052329722898e-05,
"loss": 0.3145,
"step": 799
},
{
"epoch": 2.0473448496481126,
"grad_norm": 0.374364306892129,
"learning_rate": 2.9373622017294075e-05,
"loss": 0.3412,
"step": 800
},
{
"epoch": 2.049904030710173,
"grad_norm": 0.3911100371820488,
"learning_rate": 2.934198102555631e-05,
"loss": 0.3046,
"step": 801
},
{
"epoch": 2.052463211772233,
"grad_norm": 0.4467542980832922,
"learning_rate": 2.9310310098466588e-05,
"loss": 0.291,
"step": 802
},
{
"epoch": 2.055022392834293,
"grad_norm": 0.42396484047274274,
"learning_rate": 2.92786093375109e-05,
"loss": 0.3268,
"step": 803
},
{
"epoch": 2.057581573896353,
"grad_norm": 0.52821272561634,
"learning_rate": 2.924687884427087e-05,
"loss": 0.3699,
"step": 804
},
{
"epoch": 2.060140754958413,
"grad_norm": 0.4278559612529404,
"learning_rate": 2.9215118720423375e-05,
"loss": 0.3389,
"step": 805
},
{
"epoch": 2.0626999360204734,
"grad_norm": 0.4868265035371802,
"learning_rate": 2.9183329067740235e-05,
"loss": 0.2993,
"step": 806
},
{
"epoch": 2.0652591170825336,
"grad_norm": 0.41003428936435155,
"learning_rate": 2.9151509988087912e-05,
"loss": 0.3138,
"step": 807
},
{
"epoch": 2.067818298144594,
"grad_norm": 0.4548780109348631,
"learning_rate": 2.911966158342713e-05,
"loss": 0.3298,
"step": 808
},
{
"epoch": 2.070377479206654,
"grad_norm": 0.42982485262636566,
"learning_rate": 2.9087783955812628e-05,
"loss": 0.3493,
"step": 809
},
{
"epoch": 2.072936660268714,
"grad_norm": 0.37659830424896135,
"learning_rate": 2.9055877207392752e-05,
"loss": 0.2905,
"step": 810
},
{
"epoch": 2.075495841330774,
"grad_norm": 0.6047986137094586,
"learning_rate": 2.9023941440409164e-05,
"loss": 0.3921,
"step": 811
},
{
"epoch": 2.0780550223928342,
"grad_norm": 0.4024787680486599,
"learning_rate": 2.899197675719653e-05,
"loss": 0.3126,
"step": 812
},
{
"epoch": 2.0806142034548945,
"grad_norm": 0.4864391147176377,
"learning_rate": 2.8959983260182166e-05,
"loss": 0.3259,
"step": 813
},
{
"epoch": 2.0831733845169547,
"grad_norm": 0.4289885322757846,
"learning_rate": 2.8927961051885716e-05,
"loss": 0.3327,
"step": 814
},
{
"epoch": 2.085732565579015,
"grad_norm": 0.46469758441400516,
"learning_rate": 2.8895910234918828e-05,
"loss": 0.3566,
"step": 815
},
{
"epoch": 2.0882917466410746,
"grad_norm": 0.47717231725597103,
"learning_rate": 2.886383091198483e-05,
"loss": 0.3543,
"step": 816
},
{
"epoch": 2.090850927703135,
"grad_norm": 0.40073207808377775,
"learning_rate": 2.8831723185878382e-05,
"loss": 0.2954,
"step": 817
},
{
"epoch": 2.093410108765195,
"grad_norm": 0.5629907330757088,
"learning_rate": 2.8799587159485166e-05,
"loss": 0.3302,
"step": 818
},
{
"epoch": 2.0959692898272553,
"grad_norm": 0.5537969468557435,
"learning_rate": 2.876742293578155e-05,
"loss": 0.3323,
"step": 819
},
{
"epoch": 2.0985284708893155,
"grad_norm": 0.5750723337033808,
"learning_rate": 2.873523061783426e-05,
"loss": 0.3083,
"step": 820
},
{
"epoch": 2.1010876519513757,
"grad_norm": 0.5333136369374486,
"learning_rate": 2.8703010308800034e-05,
"loss": 0.3516,
"step": 821
},
{
"epoch": 2.103646833013436,
"grad_norm": 0.5946698263030077,
"learning_rate": 2.8670762111925313e-05,
"loss": 0.3337,
"step": 822
},
{
"epoch": 2.1062060140754957,
"grad_norm": 0.4783241662438903,
"learning_rate": 2.863848613054591e-05,
"loss": 0.302,
"step": 823
},
{
"epoch": 2.108765195137556,
"grad_norm": 0.42094897221075406,
"learning_rate": 2.8606182468086654e-05,
"loss": 0.3739,
"step": 824
},
{
"epoch": 2.111324376199616,
"grad_norm": 0.5112995720272753,
"learning_rate": 2.8573851228061084e-05,
"loss": 0.3328,
"step": 825
},
{
"epoch": 2.1138835572616763,
"grad_norm": 0.4268589732083703,
"learning_rate": 2.8541492514071115e-05,
"loss": 0.3199,
"step": 826
},
{
"epoch": 2.1164427383237365,
"grad_norm": 0.40030928394161236,
"learning_rate": 2.850910642980668e-05,
"loss": 0.3229,
"step": 827
},
{
"epoch": 2.1190019193857967,
"grad_norm": 0.4208663528647155,
"learning_rate": 2.8476693079045432e-05,
"loss": 0.3475,
"step": 828
},
{
"epoch": 2.1215611004478565,
"grad_norm": 0.43920626817956737,
"learning_rate": 2.8444252565652397e-05,
"loss": 0.3395,
"step": 829
},
{
"epoch": 2.1241202815099167,
"grad_norm": 0.4129992695563009,
"learning_rate": 2.8411784993579633e-05,
"loss": 0.2742,
"step": 830
},
{
"epoch": 2.126679462571977,
"grad_norm": 0.4454585391740596,
"learning_rate": 2.8379290466865906e-05,
"loss": 0.3328,
"step": 831
},
{
"epoch": 2.129238643634037,
"grad_norm": 0.46723501336721224,
"learning_rate": 2.834676908963636e-05,
"loss": 0.3379,
"step": 832
},
{
"epoch": 2.1317978246960974,
"grad_norm": 0.5238858504216463,
"learning_rate": 2.8314220966102177e-05,
"loss": 0.3621,
"step": 833
},
{
"epoch": 2.1343570057581576,
"grad_norm": 0.4375600366447412,
"learning_rate": 2.828164620056024e-05,
"loss": 0.3031,
"step": 834
},
{
"epoch": 2.1369161868202173,
"grad_norm": 0.4011400320445024,
"learning_rate": 2.8249044897392814e-05,
"loss": 0.3167,
"step": 835
},
{
"epoch": 2.1394753678822775,
"grad_norm": 0.48667210285852947,
"learning_rate": 2.8216417161067187e-05,
"loss": 0.3517,
"step": 836
},
{
"epoch": 2.1420345489443378,
"grad_norm": 0.540946535276379,
"learning_rate": 2.818376309613535e-05,
"loss": 0.3276,
"step": 837
},
{
"epoch": 2.144593730006398,
"grad_norm": 0.41449908449590483,
"learning_rate": 2.8151082807233684e-05,
"loss": 0.3429,
"step": 838
},
{
"epoch": 2.147152911068458,
"grad_norm": 0.4411596533715045,
"learning_rate": 2.811837639908257e-05,
"loss": 0.3064,
"step": 839
},
{
"epoch": 2.1497120921305184,
"grad_norm": 0.4049429223396906,
"learning_rate": 2.80856439764861e-05,
"loss": 0.3212,
"step": 840
},
{
"epoch": 2.1522712731925786,
"grad_norm": 0.41603945550088506,
"learning_rate": 2.8052885644331742e-05,
"loss": 0.3097,
"step": 841
},
{
"epoch": 2.1548304542546384,
"grad_norm": 0.526968199578847,
"learning_rate": 2.8020101507589958e-05,
"loss": 0.3547,
"step": 842
},
{
"epoch": 2.1573896353166986,
"grad_norm": 0.38305125359444786,
"learning_rate": 2.798729167131391e-05,
"loss": 0.3027,
"step": 843
},
{
"epoch": 2.159948816378759,
"grad_norm": 0.6336647706261161,
"learning_rate": 2.795445624063913e-05,
"loss": 0.3806,
"step": 844
},
{
"epoch": 2.162507997440819,
"grad_norm": 0.44531937519080506,
"learning_rate": 2.792159532078314e-05,
"loss": 0.3323,
"step": 845
},
{
"epoch": 2.165067178502879,
"grad_norm": 0.44290733063906507,
"learning_rate": 2.7888709017045146e-05,
"loss": 0.3237,
"step": 846
},
{
"epoch": 2.167626359564939,
"grad_norm": 0.4370183293736626,
"learning_rate": 2.7855797434805695e-05,
"loss": 0.338,
"step": 847
},
{
"epoch": 2.170185540626999,
"grad_norm": 0.4164027353779343,
"learning_rate": 2.782286067952634e-05,
"loss": 0.3278,
"step": 848
},
{
"epoch": 2.1727447216890594,
"grad_norm": 0.49409168279966853,
"learning_rate": 2.7789898856749297e-05,
"loss": 0.3568,
"step": 849
},
{
"epoch": 2.1753039027511196,
"grad_norm": 0.43481015195691675,
"learning_rate": 2.77569120720971e-05,
"loss": 0.3356,
"step": 850
},
{
"epoch": 2.17786308381318,
"grad_norm": 0.4983975940240211,
"learning_rate": 2.772390043127228e-05,
"loss": 0.3373,
"step": 851
},
{
"epoch": 2.18042226487524,
"grad_norm": 0.43692987588400956,
"learning_rate": 2.7690864040057023e-05,
"loss": 0.3108,
"step": 852
},
{
"epoch": 2.1829814459373003,
"grad_norm": 0.44866752393409093,
"learning_rate": 2.7657803004312797e-05,
"loss": 0.3347,
"step": 853
},
{
"epoch": 2.18554062699936,
"grad_norm": 0.5027966160971863,
"learning_rate": 2.7624717429980067e-05,
"loss": 0.3536,
"step": 854
},
{
"epoch": 2.1880998080614202,
"grad_norm": 0.4093435861731407,
"learning_rate": 2.7591607423077932e-05,
"loss": 0.2917,
"step": 855
},
{
"epoch": 2.1906589891234804,
"grad_norm": 0.49615217011265117,
"learning_rate": 2.755847308970376e-05,
"loss": 0.3502,
"step": 856
},
{
"epoch": 2.1932181701855407,
"grad_norm": 0.3967350798943657,
"learning_rate": 2.752531453603288e-05,
"loss": 0.3177,
"step": 857
},
{
"epoch": 2.195777351247601,
"grad_norm": 0.4774434168760954,
"learning_rate": 2.7492131868318247e-05,
"loss": 0.3616,
"step": 858
},
{
"epoch": 2.198336532309661,
"grad_norm": 0.4533916271890891,
"learning_rate": 2.7458925192890057e-05,
"loss": 0.3235,
"step": 859
},
{
"epoch": 2.200895713371721,
"grad_norm": 0.4666533097303878,
"learning_rate": 2.7425694616155474e-05,
"loss": 0.362,
"step": 860
},
{
"epoch": 2.203454894433781,
"grad_norm": 0.5256588022807218,
"learning_rate": 2.739244024459822e-05,
"loss": 0.3577,
"step": 861
},
{
"epoch": 2.2060140754958413,
"grad_norm": 0.42484253391437565,
"learning_rate": 2.7359162184778276e-05,
"loss": 0.3297,
"step": 862
},
{
"epoch": 2.2085732565579015,
"grad_norm": 0.44868873474156656,
"learning_rate": 2.7325860543331533e-05,
"loss": 0.336,
"step": 863
},
{
"epoch": 2.2111324376199617,
"grad_norm": 0.4732703525530866,
"learning_rate": 2.7292535426969436e-05,
"loss": 0.3057,
"step": 864
},
{
"epoch": 2.213691618682022,
"grad_norm": 0.477868292020973,
"learning_rate": 2.7259186942478656e-05,
"loss": 0.308,
"step": 865
},
{
"epoch": 2.216250799744082,
"grad_norm": 0.4842331398481323,
"learning_rate": 2.7225815196720767e-05,
"loss": 0.3145,
"step": 866
},
{
"epoch": 2.218809980806142,
"grad_norm": 0.5381115021925195,
"learning_rate": 2.7192420296631835e-05,
"loss": 0.3798,
"step": 867
},
{
"epoch": 2.221369161868202,
"grad_norm": 0.4236836146413328,
"learning_rate": 2.7159002349222178e-05,
"loss": 0.3237,
"step": 868
},
{
"epoch": 2.2239283429302623,
"grad_norm": 0.5479188963928002,
"learning_rate": 2.7125561461575924e-05,
"loss": 0.3832,
"step": 869
},
{
"epoch": 2.2264875239923225,
"grad_norm": 0.45518106829461097,
"learning_rate": 2.7092097740850712e-05,
"loss": 0.3048,
"step": 870
},
{
"epoch": 2.2290467050543827,
"grad_norm": 0.4441896488412185,
"learning_rate": 2.7058611294277378e-05,
"loss": 0.3141,
"step": 871
},
{
"epoch": 2.2316058861164425,
"grad_norm": 0.43355994332482317,
"learning_rate": 2.702510222915956e-05,
"loss": 0.3107,
"step": 872
},
{
"epoch": 2.2341650671785027,
"grad_norm": 0.5312269604580118,
"learning_rate": 2.6991570652873357e-05,
"loss": 0.3404,
"step": 873
},
{
"epoch": 2.236724248240563,
"grad_norm": 0.3995110246728028,
"learning_rate": 2.6958016672867048e-05,
"loss": 0.3122,
"step": 874
},
{
"epoch": 2.239283429302623,
"grad_norm": 0.4611276323390611,
"learning_rate": 2.692444039666066e-05,
"loss": 0.317,
"step": 875
},
{
"epoch": 2.2418426103646834,
"grad_norm": 0.5307192324613822,
"learning_rate": 2.6890841931845674e-05,
"loss": 0.3579,
"step": 876
},
{
"epoch": 2.2444017914267436,
"grad_norm": 0.41152344841446314,
"learning_rate": 2.68572213860847e-05,
"loss": 0.3278,
"step": 877
},
{
"epoch": 2.2469609724888038,
"grad_norm": 0.439703599513908,
"learning_rate": 2.6823578867111072e-05,
"loss": 0.3207,
"step": 878
},
{
"epoch": 2.2495201535508635,
"grad_norm": 0.48436216914849156,
"learning_rate": 2.6789914482728546e-05,
"loss": 0.3923,
"step": 879
},
{
"epoch": 2.2520793346129238,
"grad_norm": 0.4065191432791332,
"learning_rate": 2.6756228340810946e-05,
"loss": 0.3092,
"step": 880
},
{
"epoch": 2.254638515674984,
"grad_norm": 0.45257368911274026,
"learning_rate": 2.6722520549301813e-05,
"loss": 0.3201,
"step": 881
},
{
"epoch": 2.257197696737044,
"grad_norm": 0.47676366709922463,
"learning_rate": 2.6688791216214064e-05,
"loss": 0.3552,
"step": 882
},
{
"epoch": 2.2597568777991044,
"grad_norm": 0.4388935470431639,
"learning_rate": 2.6655040449629646e-05,
"loss": 0.3117,
"step": 883
},
{
"epoch": 2.2623160588611646,
"grad_norm": 0.49852041258030133,
"learning_rate": 2.6621268357699165e-05,
"loss": 0.2986,
"step": 884
},
{
"epoch": 2.2648752399232244,
"grad_norm": 0.5574815275071192,
"learning_rate": 2.6587475048641596e-05,
"loss": 0.3652,
"step": 885
},
{
"epoch": 2.2674344209852846,
"grad_norm": 0.532498078005579,
"learning_rate": 2.655366063074388e-05,
"loss": 0.3361,
"step": 886
},
{
"epoch": 2.269993602047345,
"grad_norm": 0.4905390491427929,
"learning_rate": 2.6519825212360607e-05,
"loss": 0.2904,
"step": 887
},
{
"epoch": 2.272552783109405,
"grad_norm": 0.4548356990918924,
"learning_rate": 2.6485968901913658e-05,
"loss": 0.3383,
"step": 888
},
{
"epoch": 2.275111964171465,
"grad_norm": 0.4527361560109495,
"learning_rate": 2.6452091807891855e-05,
"loss": 0.3395,
"step": 889
},
{
"epoch": 2.2776711452335254,
"grad_norm": 0.4444282648587205,
"learning_rate": 2.6418194038850634e-05,
"loss": 0.3155,
"step": 890
},
{
"epoch": 2.2802303262955856,
"grad_norm": 0.37005966275705604,
"learning_rate": 2.6384275703411666e-05,
"loss": 0.3172,
"step": 891
},
{
"epoch": 2.2827895073576454,
"grad_norm": 0.4280292706382066,
"learning_rate": 2.635033691026253e-05,
"loss": 0.3643,
"step": 892
},
{
"epoch": 2.2853486884197056,
"grad_norm": 0.46336387808601265,
"learning_rate": 2.6316377768156366e-05,
"loss": 0.3516,
"step": 893
},
{
"epoch": 2.287907869481766,
"grad_norm": 0.3675246620437907,
"learning_rate": 2.6282398385911503e-05,
"loss": 0.2782,
"step": 894
},
{
"epoch": 2.290467050543826,
"grad_norm": 0.4188680082069283,
"learning_rate": 2.624839887241115e-05,
"loss": 0.3521,
"step": 895
},
{
"epoch": 2.2930262316058863,
"grad_norm": 0.42684239051457756,
"learning_rate": 2.6214379336603016e-05,
"loss": 0.2909,
"step": 896
},
{
"epoch": 2.295585412667946,
"grad_norm": 0.36815581001539444,
"learning_rate": 2.618033988749895e-05,
"loss": 0.3068,
"step": 897
},
{
"epoch": 2.2981445937300062,
"grad_norm": 0.4355098607462678,
"learning_rate": 2.614628063417464e-05,
"loss": 0.3561,
"step": 898
},
{
"epoch": 2.3007037747920664,
"grad_norm": 0.4278330160593537,
"learning_rate": 2.6112201685769224e-05,
"loss": 0.3265,
"step": 899
},
{
"epoch": 2.3032629558541267,
"grad_norm": 0.39014595589180573,
"learning_rate": 2.607810315148494e-05,
"loss": 0.3569,
"step": 900
},
{
"epoch": 2.305822136916187,
"grad_norm": 0.4667698942028393,
"learning_rate": 2.60439851405868e-05,
"loss": 0.3631,
"step": 901
},
{
"epoch": 2.308381317978247,
"grad_norm": 0.41965973806882434,
"learning_rate": 2.600984776240222e-05,
"loss": 0.3248,
"step": 902
},
{
"epoch": 2.3109404990403073,
"grad_norm": 0.44724491924157506,
"learning_rate": 2.5975691126320678e-05,
"loss": 0.3854,
"step": 903
},
{
"epoch": 2.313499680102367,
"grad_norm": 0.46251594524874595,
"learning_rate": 2.5941515341793366e-05,
"loss": 0.3503,
"step": 904
},
{
"epoch": 2.3160588611644273,
"grad_norm": 0.35346075618792994,
"learning_rate": 2.5907320518332827e-05,
"loss": 0.3309,
"step": 905
},
{
"epoch": 2.3186180422264875,
"grad_norm": 0.3826935478844609,
"learning_rate": 2.587310676551262e-05,
"loss": 0.2894,
"step": 906
},
{
"epoch": 2.3211772232885477,
"grad_norm": 0.416397755264069,
"learning_rate": 2.5838874192966953e-05,
"loss": 0.3716,
"step": 907
},
{
"epoch": 2.323736404350608,
"grad_norm": 0.3982851139464845,
"learning_rate": 2.5804622910390348e-05,
"loss": 0.2833,
"step": 908
},
{
"epoch": 2.326295585412668,
"grad_norm": 0.4591104893849011,
"learning_rate": 2.5770353027537276e-05,
"loss": 0.3277,
"step": 909
},
{
"epoch": 2.328854766474728,
"grad_norm": 0.382725152261151,
"learning_rate": 2.5736064654221808e-05,
"loss": 0.323,
"step": 910
},
{
"epoch": 2.331413947536788,
"grad_norm": 0.5087928821054611,
"learning_rate": 2.5701757900317277e-05,
"loss": 0.3314,
"step": 911
},
{
"epoch": 2.3339731285988483,
"grad_norm": 0.3914001278691039,
"learning_rate": 2.5667432875755904e-05,
"loss": 0.3508,
"step": 912
},
{
"epoch": 2.3365323096609085,
"grad_norm": 0.4510480545622787,
"learning_rate": 2.5633089690528455e-05,
"loss": 0.3529,
"step": 913
},
{
"epoch": 2.3390914907229687,
"grad_norm": 0.4134813709280613,
"learning_rate": 2.559872845468391e-05,
"loss": 0.3286,
"step": 914
},
{
"epoch": 2.341650671785029,
"grad_norm": 0.40105410165144934,
"learning_rate": 2.5564349278329056e-05,
"loss": 0.2852,
"step": 915
},
{
"epoch": 2.344209852847089,
"grad_norm": 0.49998212235296036,
"learning_rate": 2.5529952271628192e-05,
"loss": 0.2916,
"step": 916
},
{
"epoch": 2.346769033909149,
"grad_norm": 0.46734623561200184,
"learning_rate": 2.5495537544802757e-05,
"loss": 0.3497,
"step": 917
},
{
"epoch": 2.349328214971209,
"grad_norm": 0.43677669206755015,
"learning_rate": 2.5461105208130953e-05,
"loss": 0.359,
"step": 918
},
{
"epoch": 2.3518873960332694,
"grad_norm": 0.49010454865810016,
"learning_rate": 2.542665537194742e-05,
"loss": 0.3368,
"step": 919
},
{
"epoch": 2.3544465770953296,
"grad_norm": 0.370850360816377,
"learning_rate": 2.539218814664288e-05,
"loss": 0.3222,
"step": 920
},
{
"epoch": 2.3570057581573898,
"grad_norm": 0.46886497417633327,
"learning_rate": 2.5357703642663766e-05,
"loss": 0.3633,
"step": 921
},
{
"epoch": 2.3595649392194495,
"grad_norm": 0.4227514997155462,
"learning_rate": 2.5323201970511883e-05,
"loss": 0.3497,
"step": 922
},
{
"epoch": 2.3621241202815098,
"grad_norm": 0.4015401134012503,
"learning_rate": 2.528868324074405e-05,
"loss": 0.3076,
"step": 923
},
{
"epoch": 2.36468330134357,
"grad_norm": 0.4012146844135177,
"learning_rate": 2.525414756397174e-05,
"loss": 0.3117,
"step": 924
},
{
"epoch": 2.36724248240563,
"grad_norm": 0.3724913651532696,
"learning_rate": 2.521959505086075e-05,
"loss": 0.2948,
"step": 925
},
{
"epoch": 2.3698016634676904,
"grad_norm": 0.44515489828057647,
"learning_rate": 2.5185025812130794e-05,
"loss": 0.3624,
"step": 926
},
{
"epoch": 2.3723608445297506,
"grad_norm": 0.4333489454322345,
"learning_rate": 2.5150439958555205e-05,
"loss": 0.3254,
"step": 927
},
{
"epoch": 2.374920025591811,
"grad_norm": 0.4284602049159506,
"learning_rate": 2.5115837600960564e-05,
"loss": 0.3232,
"step": 928
},
{
"epoch": 2.3774792066538706,
"grad_norm": 0.4327763714080197,
"learning_rate": 2.5081218850226315e-05,
"loss": 0.3213,
"step": 929
},
{
"epoch": 2.380038387715931,
"grad_norm": 0.4092391708238846,
"learning_rate": 2.5046583817284437e-05,
"loss": 0.3645,
"step": 930
},
{
"epoch": 2.382597568777991,
"grad_norm": 0.3874157650273361,
"learning_rate": 2.5011932613119098e-05,
"loss": 0.3546,
"step": 931
},
{
"epoch": 2.385156749840051,
"grad_norm": 0.4154060512286611,
"learning_rate": 2.497726534876627e-05,
"loss": 0.3724,
"step": 932
},
{
"epoch": 2.3877159309021114,
"grad_norm": 0.3520365071455179,
"learning_rate": 2.4942582135313393e-05,
"loss": 0.3171,
"step": 933
},
{
"epoch": 2.3902751119641716,
"grad_norm": 0.4129362868719995,
"learning_rate": 2.490788308389902e-05,
"loss": 0.3081,
"step": 934
},
{
"epoch": 2.3928342930262314,
"grad_norm": 0.3826332240853489,
"learning_rate": 2.487316830571244e-05,
"loss": 0.3167,
"step": 935
},
{
"epoch": 2.3953934740882916,
"grad_norm": 0.41073512437749543,
"learning_rate": 2.4838437911993355e-05,
"loss": 0.2872,
"step": 936
},
{
"epoch": 2.397952655150352,
"grad_norm": 0.41060249122236425,
"learning_rate": 2.48036920140315e-05,
"loss": 0.3331,
"step": 937
},
{
"epoch": 2.400511836212412,
"grad_norm": 0.39939347922246277,
"learning_rate": 2.4768930723166266e-05,
"loss": 0.309,
"step": 938
},
{
"epoch": 2.4030710172744723,
"grad_norm": 0.44800610692896503,
"learning_rate": 2.473415415078642e-05,
"loss": 0.3301,
"step": 939
},
{
"epoch": 2.4056301983365325,
"grad_norm": 0.45045037484262557,
"learning_rate": 2.4699362408329646e-05,
"loss": 0.3545,
"step": 940
},
{
"epoch": 2.4081893793985927,
"grad_norm": 0.4516520833709128,
"learning_rate": 2.466455560728227e-05,
"loss": 0.3219,
"step": 941
},
{
"epoch": 2.4107485604606524,
"grad_norm": 0.4140396756771499,
"learning_rate": 2.4629733859178867e-05,
"loss": 0.3312,
"step": 942
},
{
"epoch": 2.4133077415227127,
"grad_norm": 0.4242325070475781,
"learning_rate": 2.4594897275601887e-05,
"loss": 0.3657,
"step": 943
},
{
"epoch": 2.415866922584773,
"grad_norm": 0.3534056791478892,
"learning_rate": 2.456004596818135e-05,
"loss": 0.2875,
"step": 944
},
{
"epoch": 2.418426103646833,
"grad_norm": 0.4504625178937038,
"learning_rate": 2.4525180048594452e-05,
"loss": 0.3947,
"step": 945
},
{
"epoch": 2.4209852847088933,
"grad_norm": 0.3713456955088067,
"learning_rate": 2.4490299628565168e-05,
"loss": 0.3365,
"step": 946
},
{
"epoch": 2.423544465770953,
"grad_norm": 0.4139332630376369,
"learning_rate": 2.4455404819864e-05,
"loss": 0.3213,
"step": 947
},
{
"epoch": 2.4261036468330133,
"grad_norm": 0.39284365253142334,
"learning_rate": 2.4420495734307527e-05,
"loss": 0.3707,
"step": 948
},
{
"epoch": 2.4286628278950735,
"grad_norm": 0.44240961049247096,
"learning_rate": 2.4385572483758066e-05,
"loss": 0.373,
"step": 949
},
{
"epoch": 2.4312220089571337,
"grad_norm": 0.41468110454884644,
"learning_rate": 2.435063518012335e-05,
"loss": 0.3791,
"step": 950
},
{
"epoch": 2.433781190019194,
"grad_norm": 0.38546461774505014,
"learning_rate": 2.4315683935356127e-05,
"loss": 0.3092,
"step": 951
},
{
"epoch": 2.436340371081254,
"grad_norm": 0.3973539449011059,
"learning_rate": 2.4280718861453814e-05,
"loss": 0.3537,
"step": 952
},
{
"epoch": 2.4388995521433143,
"grad_norm": 0.40087880001543535,
"learning_rate": 2.424574007045816e-05,
"loss": 0.3513,
"step": 953
},
{
"epoch": 2.441458733205374,
"grad_norm": 0.4363352682087938,
"learning_rate": 2.421074767445485e-05,
"loss": 0.3168,
"step": 954
},
{
"epoch": 2.4440179142674343,
"grad_norm": 0.387588478700538,
"learning_rate": 2.4175741785573177e-05,
"loss": 0.3156,
"step": 955
},
{
"epoch": 2.4465770953294945,
"grad_norm": 0.43136617250905906,
"learning_rate": 2.4140722515985666e-05,
"loss": 0.3396,
"step": 956
},
{
"epoch": 2.4491362763915547,
"grad_norm": 0.4356259978949205,
"learning_rate": 2.4105689977907722e-05,
"loss": 0.3633,
"step": 957
},
{
"epoch": 2.451695457453615,
"grad_norm": 0.3939046327707216,
"learning_rate": 2.407064428359726e-05,
"loss": 0.3367,
"step": 958
},
{
"epoch": 2.454254638515675,
"grad_norm": 0.44530753935780215,
"learning_rate": 2.4035585545354353e-05,
"loss": 0.2652,
"step": 959
},
{
"epoch": 2.456813819577735,
"grad_norm": 0.39324736708789354,
"learning_rate": 2.4000513875520892e-05,
"loss": 0.3497,
"step": 960
},
{
"epoch": 2.459373000639795,
"grad_norm": 0.3745102508002373,
"learning_rate": 2.396542938648018e-05,
"loss": 0.351,
"step": 961
},
{
"epoch": 2.4619321817018553,
"grad_norm": 0.433148063417755,
"learning_rate": 2.3930332190656604e-05,
"loss": 0.3226,
"step": 962
},
{
"epoch": 2.4644913627639156,
"grad_norm": 0.5060978327975577,
"learning_rate": 2.3895222400515282e-05,
"loss": 0.3944,
"step": 963
},
{
"epoch": 2.4670505438259758,
"grad_norm": 0.3831333684566055,
"learning_rate": 2.3860100128561677e-05,
"loss": 0.303,
"step": 964
},
{
"epoch": 2.469609724888036,
"grad_norm": 0.4834696949807748,
"learning_rate": 2.3824965487341247e-05,
"loss": 0.36,
"step": 965
},
{
"epoch": 2.472168905950096,
"grad_norm": 0.48539696684918826,
"learning_rate": 2.3789818589439094e-05,
"loss": 0.3418,
"step": 966
},
{
"epoch": 2.474728087012156,
"grad_norm": 0.3663525699221002,
"learning_rate": 2.375465954747959e-05,
"loss": 0.2906,
"step": 967
},
{
"epoch": 2.477287268074216,
"grad_norm": 0.4444877062926493,
"learning_rate": 2.371948847412602e-05,
"loss": 0.3281,
"step": 968
},
{
"epoch": 2.4798464491362764,
"grad_norm": 0.37267946857207057,
"learning_rate": 2.3684305482080233e-05,
"loss": 0.3214,
"step": 969
},
{
"epoch": 2.4824056301983366,
"grad_norm": 0.422309895496103,
"learning_rate": 2.3649110684082258e-05,
"loss": 0.3309,
"step": 970
},
{
"epoch": 2.484964811260397,
"grad_norm": 0.366896575024139,
"learning_rate": 2.361390419290995e-05,
"loss": 0.3359,
"step": 971
},
{
"epoch": 2.4875239923224566,
"grad_norm": 0.4252498965109737,
"learning_rate": 2.357868612137866e-05,
"loss": 0.3162,
"step": 972
},
{
"epoch": 2.490083173384517,
"grad_norm": 0.4454769676233995,
"learning_rate": 2.3543456582340815e-05,
"loss": 0.3458,
"step": 973
},
{
"epoch": 2.492642354446577,
"grad_norm": 0.504528012046428,
"learning_rate": 2.3508215688685607e-05,
"loss": 0.3783,
"step": 974
},
{
"epoch": 2.495201535508637,
"grad_norm": 0.3791982649378316,
"learning_rate": 2.3472963553338614e-05,
"loss": 0.3439,
"step": 975
},
{
"epoch": 2.4977607165706974,
"grad_norm": 0.3958397655771158,
"learning_rate": 2.3437700289261417e-05,
"loss": 0.3098,
"step": 976
},
{
"epoch": 2.5003198976327576,
"grad_norm": 0.4716678361651927,
"learning_rate": 2.3402426009451288e-05,
"loss": 0.3442,
"step": 977
},
{
"epoch": 2.502879078694818,
"grad_norm": 0.41505752875646384,
"learning_rate": 2.3367140826940768e-05,
"loss": 0.3393,
"step": 978
},
{
"epoch": 2.505438259756878,
"grad_norm": 0.4831411264450984,
"learning_rate": 2.333184485479737e-05,
"loss": 0.3406,
"step": 979
},
{
"epoch": 2.507997440818938,
"grad_norm": 0.44259478488091053,
"learning_rate": 2.3296538206123134e-05,
"loss": 0.3498,
"step": 980
},
{
"epoch": 2.510556621880998,
"grad_norm": 0.39650938475151654,
"learning_rate": 2.326122099405435e-05,
"loss": 0.3218,
"step": 981
},
{
"epoch": 2.5131158029430583,
"grad_norm": 0.44478690078566685,
"learning_rate": 2.3225893331761143e-05,
"loss": 0.3354,
"step": 982
},
{
"epoch": 2.5156749840051185,
"grad_norm": 0.4617579108787994,
"learning_rate": 2.319055533244712e-05,
"loss": 0.3689,
"step": 983
},
{
"epoch": 2.5182341650671782,
"grad_norm": 0.36510971786258006,
"learning_rate": 2.315520710934903e-05,
"loss": 0.3189,
"step": 984
},
{
"epoch": 2.5207933461292384,
"grad_norm": 0.5962978268524062,
"learning_rate": 2.311984877573636e-05,
"loss": 0.3785,
"step": 985
},
{
"epoch": 2.5233525271912987,
"grad_norm": 0.4286182290118198,
"learning_rate": 2.3084480444911006e-05,
"loss": 0.2969,
"step": 986
},
{
"epoch": 2.525911708253359,
"grad_norm": 0.406129472684799,
"learning_rate": 2.304910223020691e-05,
"loss": 0.3622,
"step": 987
},
{
"epoch": 2.528470889315419,
"grad_norm": 0.40352070167371706,
"learning_rate": 2.3013714244989665e-05,
"loss": 0.3003,
"step": 988
},
{
"epoch": 2.5310300703774793,
"grad_norm": 0.41853745249254193,
"learning_rate": 2.2978316602656183e-05,
"loss": 0.3545,
"step": 989
},
{
"epoch": 2.5335892514395395,
"grad_norm": 0.3751248697656993,
"learning_rate": 2.2942909416634326e-05,
"loss": 0.3317,
"step": 990
},
{
"epoch": 2.5361484325015997,
"grad_norm": 0.38457641982344676,
"learning_rate": 2.290749280038252e-05,
"loss": 0.3186,
"step": 991
},
{
"epoch": 2.5387076135636595,
"grad_norm": 0.43410698680189885,
"learning_rate": 2.2872066867389434e-05,
"loss": 0.3819,
"step": 992
},
{
"epoch": 2.5412667946257197,
"grad_norm": 0.3827463318912182,
"learning_rate": 2.2836631731173577e-05,
"loss": 0.3428,
"step": 993
},
{
"epoch": 2.54382597568778,
"grad_norm": 0.3626009354081465,
"learning_rate": 2.2801187505282948e-05,
"loss": 0.3313,
"step": 994
},
{
"epoch": 2.54638515674984,
"grad_norm": 0.3879708806451702,
"learning_rate": 2.2765734303294666e-05,
"loss": 0.302,
"step": 995
},
{
"epoch": 2.5489443378119003,
"grad_norm": 0.3615193094932171,
"learning_rate": 2.2730272238814636e-05,
"loss": 0.3022,
"step": 996
},
{
"epoch": 2.55150351887396,
"grad_norm": 0.38109707078147037,
"learning_rate": 2.2694801425477136e-05,
"loss": 0.3199,
"step": 997
},
{
"epoch": 2.5540626999360203,
"grad_norm": 0.35624704442372485,
"learning_rate": 2.2659321976944507e-05,
"loss": 0.3394,
"step": 998
},
{
"epoch": 2.5566218809980805,
"grad_norm": 0.5236454693659701,
"learning_rate": 2.2623834006906732e-05,
"loss": 0.3254,
"step": 999
},
{
"epoch": 2.5591810620601407,
"grad_norm": 0.4793516094510245,
"learning_rate": 2.2588337629081107e-05,
"loss": 0.4122,
"step": 1000
},
{
"epoch": 2.561740243122201,
"grad_norm": 0.32519173651998734,
"learning_rate": 2.25528329572119e-05,
"loss": 0.2782,
"step": 1001
},
{
"epoch": 2.564299424184261,
"grad_norm": 0.47513564073653997,
"learning_rate": 2.25173201050699e-05,
"loss": 0.4075,
"step": 1002
},
{
"epoch": 2.5668586052463214,
"grad_norm": 0.34327531232567976,
"learning_rate": 2.248179918645216e-05,
"loss": 0.2602,
"step": 1003
},
{
"epoch": 2.5694177863083816,
"grad_norm": 0.42876499217691605,
"learning_rate": 2.2446270315181566e-05,
"loss": 0.3538,
"step": 1004
},
{
"epoch": 2.5719769673704413,
"grad_norm": 0.38643082957234787,
"learning_rate": 2.2410733605106462e-05,
"loss": 0.3331,
"step": 1005
},
{
"epoch": 2.5745361484325016,
"grad_norm": 0.3845952145329833,
"learning_rate": 2.237518917010035e-05,
"loss": 0.3068,
"step": 1006
},
{
"epoch": 2.5770953294945618,
"grad_norm": 0.398304345128026,
"learning_rate": 2.233963712406147e-05,
"loss": 0.3455,
"step": 1007
},
{
"epoch": 2.579654510556622,
"grad_norm": 0.4106067027436424,
"learning_rate": 2.2304077580912423e-05,
"loss": 0.3266,
"step": 1008
},
{
"epoch": 2.5822136916186818,
"grad_norm": 0.3515875497757696,
"learning_rate": 2.2268510654599885e-05,
"loss": 0.3089,
"step": 1009
},
{
"epoch": 2.584772872680742,
"grad_norm": 0.34230034407291976,
"learning_rate": 2.2232936459094158e-05,
"loss": 0.37,
"step": 1010
},
{
"epoch": 2.587332053742802,
"grad_norm": 0.3409277384030245,
"learning_rate": 2.2197355108388835e-05,
"loss": 0.3425,
"step": 1011
},
{
"epoch": 2.5898912348048624,
"grad_norm": 0.3659124451915072,
"learning_rate": 2.216176671650045e-05,
"loss": 0.3417,
"step": 1012
},
{
"epoch": 2.5924504158669226,
"grad_norm": 0.34644737723549984,
"learning_rate": 2.2126171397468105e-05,
"loss": 0.3048,
"step": 1013
},
{
"epoch": 2.595009596928983,
"grad_norm": 0.34192043331418503,
"learning_rate": 2.209056926535307e-05,
"loss": 0.3245,
"step": 1014
},
{
"epoch": 2.597568777991043,
"grad_norm": 0.4024225219500372,
"learning_rate": 2.205496043423849e-05,
"loss": 0.3501,
"step": 1015
},
{
"epoch": 2.6001279590531032,
"grad_norm": 0.3444122394151278,
"learning_rate": 2.2019345018228922e-05,
"loss": 0.3403,
"step": 1016
},
{
"epoch": 2.602687140115163,
"grad_norm": 0.41753443691652886,
"learning_rate": 2.1983723131450088e-05,
"loss": 0.3609,
"step": 1017
},
{
"epoch": 2.605246321177223,
"grad_norm": 0.42275585901863255,
"learning_rate": 2.194809488804839e-05,
"loss": 0.3427,
"step": 1018
},
{
"epoch": 2.6078055022392834,
"grad_norm": 0.3513925245942965,
"learning_rate": 2.1912460402190625e-05,
"loss": 0.2984,
"step": 1019
},
{
"epoch": 2.6103646833013436,
"grad_norm": 0.4845404009383636,
"learning_rate": 2.1876819788063586e-05,
"loss": 0.342,
"step": 1020
},
{
"epoch": 2.612923864363404,
"grad_norm": 0.38632744480954595,
"learning_rate": 2.1841173159873718e-05,
"loss": 0.3178,
"step": 1021
},
{
"epoch": 2.6154830454254636,
"grad_norm": 0.40341965144316216,
"learning_rate": 2.1805520631846705e-05,
"loss": 0.3454,
"step": 1022
},
{
"epoch": 2.618042226487524,
"grad_norm": 0.5429183555857332,
"learning_rate": 2.176986231822717e-05,
"loss": 0.3407,
"step": 1023
},
{
"epoch": 2.620601407549584,
"grad_norm": 0.3693931345744361,
"learning_rate": 2.173419833327826e-05,
"loss": 0.2931,
"step": 1024
},
{
"epoch": 2.6231605886116443,
"grad_norm": 0.41461427921632693,
"learning_rate": 2.16985287912813e-05,
"loss": 0.3462,
"step": 1025
},
{
"epoch": 2.6257197696737045,
"grad_norm": 0.35793827637412173,
"learning_rate": 2.166285380653541e-05,
"loss": 0.2649,
"step": 1026
},
{
"epoch": 2.6282789507357647,
"grad_norm": 0.43455345040805726,
"learning_rate": 2.1627173493357167e-05,
"loss": 0.3432,
"step": 1027
},
{
"epoch": 2.630838131797825,
"grad_norm": 0.36886933829443885,
"learning_rate": 2.1591487966080215e-05,
"loss": 0.3106,
"step": 1028
},
{
"epoch": 2.633397312859885,
"grad_norm": 0.3991822617060509,
"learning_rate": 2.1555797339054898e-05,
"loss": 0.3621,
"step": 1029
},
{
"epoch": 2.635956493921945,
"grad_norm": 0.3967391590295086,
"learning_rate": 2.1520101726647922e-05,
"loss": 0.3711,
"step": 1030
},
{
"epoch": 2.638515674984005,
"grad_norm": 0.3577916760514241,
"learning_rate": 2.1484401243241947e-05,
"loss": 0.2945,
"step": 1031
},
{
"epoch": 2.6410748560460653,
"grad_norm": 0.40155422081290365,
"learning_rate": 2.1448696003235252e-05,
"loss": 0.3366,
"step": 1032
},
{
"epoch": 2.6436340371081255,
"grad_norm": 0.36541079152322986,
"learning_rate": 2.1412986121041355e-05,
"loss": 0.2932,
"step": 1033
},
{
"epoch": 2.6461932181701853,
"grad_norm": 0.3484241132798254,
"learning_rate": 2.1377271711088655e-05,
"loss": 0.3339,
"step": 1034
},
{
"epoch": 2.6487523992322455,
"grad_norm": 0.43519634098943255,
"learning_rate": 2.1341552887820048e-05,
"loss": 0.3762,
"step": 1035
},
{
"epoch": 2.6513115802943057,
"grad_norm": 0.33001353721510546,
"learning_rate": 2.1305829765692588e-05,
"loss": 0.3277,
"step": 1036
},
{
"epoch": 2.653870761356366,
"grad_norm": 0.3720609129475248,
"learning_rate": 2.1270102459177093e-05,
"loss": 0.3101,
"step": 1037
},
{
"epoch": 2.656429942418426,
"grad_norm": 0.3837314560637158,
"learning_rate": 2.123437108275779e-05,
"loss": 0.351,
"step": 1038
},
{
"epoch": 2.6589891234804863,
"grad_norm": 0.40361073664191494,
"learning_rate": 2.119863575093195e-05,
"loss": 0.3171,
"step": 1039
},
{
"epoch": 2.6615483045425465,
"grad_norm": 0.35854323369583274,
"learning_rate": 2.1162896578209517e-05,
"loss": 0.3253,
"step": 1040
},
{
"epoch": 2.6641074856046068,
"grad_norm": 0.37248590893696937,
"learning_rate": 2.112715367911275e-05,
"loss": 0.3511,
"step": 1041
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.3453384873567075,
"learning_rate": 2.1091407168175836e-05,
"loss": 0.3414,
"step": 1042
},
{
"epoch": 2.6692258477287267,
"grad_norm": 0.40821485869737884,
"learning_rate": 2.1055657159944545e-05,
"loss": 0.3326,
"step": 1043
},
{
"epoch": 2.671785028790787,
"grad_norm": 0.36123413084691214,
"learning_rate": 2.1019903768975852e-05,
"loss": 0.3298,
"step": 1044
},
{
"epoch": 2.674344209852847,
"grad_norm": 0.3404084610072503,
"learning_rate": 2.0984147109837564e-05,
"loss": 0.3047,
"step": 1045
},
{
"epoch": 2.6769033909149074,
"grad_norm": 0.4454775171788898,
"learning_rate": 2.094838729710798e-05,
"loss": 0.3679,
"step": 1046
},
{
"epoch": 2.679462571976967,
"grad_norm": 0.3571602753390297,
"learning_rate": 2.0912624445375483e-05,
"loss": 0.3261,
"step": 1047
},
{
"epoch": 2.6820217530390273,
"grad_norm": 0.332410280409988,
"learning_rate": 2.0876858669238206e-05,
"loss": 0.3114,
"step": 1048
},
{
"epoch": 2.6845809341010876,
"grad_norm": 0.42377420739915694,
"learning_rate": 2.0841090083303643e-05,
"loss": 0.342,
"step": 1049
},
{
"epoch": 2.6871401151631478,
"grad_norm": 0.3395618485708725,
"learning_rate": 2.0805318802188307e-05,
"loss": 0.3157,
"step": 1050
},
{
"epoch": 2.689699296225208,
"grad_norm": 0.3545398294326781,
"learning_rate": 2.0769544940517326e-05,
"loss": 0.3207,
"step": 1051
},
{
"epoch": 2.692258477287268,
"grad_norm": 0.41494596285417495,
"learning_rate": 2.0733768612924137e-05,
"loss": 0.3294,
"step": 1052
},
{
"epoch": 2.6948176583493284,
"grad_norm": 0.3439663523935823,
"learning_rate": 2.0697989934050025e-05,
"loss": 0.2852,
"step": 1053
},
{
"epoch": 2.6973768394113886,
"grad_norm": 0.41679625940370135,
"learning_rate": 2.0662209018543836e-05,
"loss": 0.3548,
"step": 1054
},
{
"epoch": 2.6999360204734484,
"grad_norm": 0.4309626747124301,
"learning_rate": 2.0626425981061608e-05,
"loss": 0.326,
"step": 1055
},
{
"epoch": 2.7024952015355086,
"grad_norm": 0.3700268039056913,
"learning_rate": 2.0590640936266132e-05,
"loss": 0.3346,
"step": 1056
},
{
"epoch": 2.705054382597569,
"grad_norm": 0.4092764582830664,
"learning_rate": 2.0554853998826652e-05,
"loss": 0.3432,
"step": 1057
},
{
"epoch": 2.707613563659629,
"grad_norm": 0.3003644777646875,
"learning_rate": 2.0519065283418494e-05,
"loss": 0.257,
"step": 1058
},
{
"epoch": 2.710172744721689,
"grad_norm": 0.36206552407899595,
"learning_rate": 2.0483274904722647e-05,
"loss": 0.3339,
"step": 1059
},
{
"epoch": 2.712731925783749,
"grad_norm": 0.41404402491302905,
"learning_rate": 2.0447482977425465e-05,
"loss": 0.3705,
"step": 1060
},
{
"epoch": 2.715291106845809,
"grad_norm": 0.35467256324797414,
"learning_rate": 2.0411689616218234e-05,
"loss": 0.3173,
"step": 1061
},
{
"epoch": 2.7178502879078694,
"grad_norm": 0.39211100334061155,
"learning_rate": 2.037589493579685e-05,
"loss": 0.336,
"step": 1062
},
{
"epoch": 2.7204094689699296,
"grad_norm": 0.3752676154141884,
"learning_rate": 2.034009905086144e-05,
"loss": 0.339,
"step": 1063
},
{
"epoch": 2.72296865003199,
"grad_norm": 0.41294533749554696,
"learning_rate": 2.0304302076115987e-05,
"loss": 0.3187,
"step": 1064
},
{
"epoch": 2.72552783109405,
"grad_norm": 0.37876928739351945,
"learning_rate": 2.0268504126267952e-05,
"loss": 0.2895,
"step": 1065
},
{
"epoch": 2.7280870121561103,
"grad_norm": 0.4001922065674535,
"learning_rate": 2.0232705316027946e-05,
"loss": 0.3153,
"step": 1066
},
{
"epoch": 2.73064619321817,
"grad_norm": 0.8048596079823415,
"learning_rate": 2.019690576010931e-05,
"loss": 0.3593,
"step": 1067
},
{
"epoch": 2.7332053742802302,
"grad_norm": 0.38282646966391626,
"learning_rate": 2.0161105573227798e-05,
"loss": 0.3035,
"step": 1068
},
{
"epoch": 2.7357645553422905,
"grad_norm": 0.46315568817195285,
"learning_rate": 2.0125304870101184e-05,
"loss": 0.3751,
"step": 1069
},
{
"epoch": 2.7383237364043507,
"grad_norm": 0.4109495278872969,
"learning_rate": 2.008950376544887e-05,
"loss": 0.3346,
"step": 1070
},
{
"epoch": 2.740882917466411,
"grad_norm": 0.4857096813595627,
"learning_rate": 2.005370237399157e-05,
"loss": 0.328,
"step": 1071
},
{
"epoch": 2.7434420985284707,
"grad_norm": 0.39791730028337013,
"learning_rate": 2.0017900810450923e-05,
"loss": 0.2865,
"step": 1072
},
{
"epoch": 2.746001279590531,
"grad_norm": 0.4465305517364689,
"learning_rate": 1.9982099189549087e-05,
"loss": 0.3647,
"step": 1073
},
{
"epoch": 2.748560460652591,
"grad_norm": 0.41005369965327937,
"learning_rate": 1.9946297626008432e-05,
"loss": 0.3151,
"step": 1074
},
{
"epoch": 2.7511196417146513,
"grad_norm": 0.49665282457932985,
"learning_rate": 1.9910496234551132e-05,
"loss": 0.3809,
"step": 1075
},
{
"epoch": 2.7536788227767115,
"grad_norm": 0.37361149681320743,
"learning_rate": 1.9874695129898826e-05,
"loss": 0.3221,
"step": 1076
},
{
"epoch": 2.7562380038387717,
"grad_norm": 0.51447395549755,
"learning_rate": 1.9838894426772205e-05,
"loss": 0.3677,
"step": 1077
},
{
"epoch": 2.758797184900832,
"grad_norm": 0.4601671007615494,
"learning_rate": 1.9803094239890692e-05,
"loss": 0.3519,
"step": 1078
},
{
"epoch": 2.761356365962892,
"grad_norm": 0.39680274509707913,
"learning_rate": 1.9767294683972064e-05,
"loss": 0.3521,
"step": 1079
},
{
"epoch": 2.763915547024952,
"grad_norm": 0.43847108432837295,
"learning_rate": 1.9731495873732055e-05,
"loss": 0.3346,
"step": 1080
},
{
"epoch": 2.766474728087012,
"grad_norm": 0.4078264869228389,
"learning_rate": 1.969569792388402e-05,
"loss": 0.3624,
"step": 1081
},
{
"epoch": 2.7690339091490723,
"grad_norm": 0.37340927869032653,
"learning_rate": 1.9659900949138562e-05,
"loss": 0.3252,
"step": 1082
},
{
"epoch": 2.7715930902111325,
"grad_norm": 0.7622901676621391,
"learning_rate": 1.9624105064203157e-05,
"loss": 0.3829,
"step": 1083
},
{
"epoch": 2.7741522712731923,
"grad_norm": 0.4249232215662602,
"learning_rate": 1.9588310383781773e-05,
"loss": 0.342,
"step": 1084
},
{
"epoch": 2.7767114523352525,
"grad_norm": 0.41402606002990694,
"learning_rate": 1.9552517022574542e-05,
"loss": 0.3756,
"step": 1085
},
{
"epoch": 2.7792706333973127,
"grad_norm": 0.48132928718497536,
"learning_rate": 1.951672509527736e-05,
"loss": 0.3263,
"step": 1086
},
{
"epoch": 2.781829814459373,
"grad_norm": 0.3954278720969905,
"learning_rate": 1.9480934716581513e-05,
"loss": 0.3209,
"step": 1087
},
{
"epoch": 2.784388995521433,
"grad_norm": 0.39183072418087983,
"learning_rate": 1.944514600117335e-05,
"loss": 0.3807,
"step": 1088
},
{
"epoch": 2.7869481765834934,
"grad_norm": 0.4620346036605357,
"learning_rate": 1.940935906373388e-05,
"loss": 0.3576,
"step": 1089
},
{
"epoch": 2.7895073576455536,
"grad_norm": 0.36599489933850005,
"learning_rate": 1.93735740189384e-05,
"loss": 0.3239,
"step": 1090
},
{
"epoch": 2.792066538707614,
"grad_norm": 0.3686130358461154,
"learning_rate": 1.9337790981456164e-05,
"loss": 0.331,
"step": 1091
},
{
"epoch": 2.7946257197696736,
"grad_norm": 0.38425163759635245,
"learning_rate": 1.930201006594999e-05,
"loss": 0.2916,
"step": 1092
},
{
"epoch": 2.7971849008317338,
"grad_norm": 0.4405563689728695,
"learning_rate": 1.926623138707587e-05,
"loss": 0.3361,
"step": 1093
},
{
"epoch": 2.799744081893794,
"grad_norm": 0.38706329631394476,
"learning_rate": 1.923045505948267e-05,
"loss": 0.323,
"step": 1094
},
{
"epoch": 2.802303262955854,
"grad_norm": 0.4708041341171456,
"learning_rate": 1.9194681197811703e-05,
"loss": 0.3378,
"step": 1095
},
{
"epoch": 2.8048624440179144,
"grad_norm": 0.43844070620974923,
"learning_rate": 1.915890991669636e-05,
"loss": 0.331,
"step": 1096
},
{
"epoch": 2.807421625079974,
"grad_norm": 0.4532919809550908,
"learning_rate": 1.9123141330761804e-05,
"loss": 0.3863,
"step": 1097
},
{
"epoch": 2.8099808061420344,
"grad_norm": 0.48356206097375876,
"learning_rate": 1.9087375554624527e-05,
"loss": 0.3241,
"step": 1098
},
{
"epoch": 2.8125399872040946,
"grad_norm": 0.37016571224063527,
"learning_rate": 1.9051612702892028e-05,
"loss": 0.3035,
"step": 1099
},
{
"epoch": 2.815099168266155,
"grad_norm": 0.5488710646000331,
"learning_rate": 1.901585289016244e-05,
"loss": 0.3365,
"step": 1100
},
{
"epoch": 2.817658349328215,
"grad_norm": 0.39186801965858076,
"learning_rate": 1.898009623102415e-05,
"loss": 0.3171,
"step": 1101
},
{
"epoch": 2.8202175303902752,
"grad_norm": 0.396128192090393,
"learning_rate": 1.894434284005546e-05,
"loss": 0.2926,
"step": 1102
},
{
"epoch": 2.8227767114523354,
"grad_norm": 0.4498006361647164,
"learning_rate": 1.890859283182417e-05,
"loss": 0.339,
"step": 1103
},
{
"epoch": 2.8253358925143957,
"grad_norm": 0.4421659448994112,
"learning_rate": 1.887284632088725e-05,
"loss": 0.3512,
"step": 1104
},
{
"epoch": 2.8278950735764554,
"grad_norm": 0.40391454153472733,
"learning_rate": 1.8837103421790486e-05,
"loss": 0.3431,
"step": 1105
},
{
"epoch": 2.8304542546385156,
"grad_norm": 0.4029197127830964,
"learning_rate": 1.8801364249068053e-05,
"loss": 0.3083,
"step": 1106
},
{
"epoch": 2.833013435700576,
"grad_norm": 0.3636262656158161,
"learning_rate": 1.8765628917242213e-05,
"loss": 0.2695,
"step": 1107
},
{
"epoch": 2.835572616762636,
"grad_norm": 0.41088999848761776,
"learning_rate": 1.8729897540822914e-05,
"loss": 0.3191,
"step": 1108
},
{
"epoch": 2.838131797824696,
"grad_norm": 0.4141748812616923,
"learning_rate": 1.8694170234307415e-05,
"loss": 0.3494,
"step": 1109
},
{
"epoch": 2.840690978886756,
"grad_norm": 0.40924209313260995,
"learning_rate": 1.8658447112179952e-05,
"loss": 0.3424,
"step": 1110
},
{
"epoch": 2.8432501599488162,
"grad_norm": 0.38985761612852116,
"learning_rate": 1.8622728288911358e-05,
"loss": 0.3367,
"step": 1111
},
{
"epoch": 2.8458093410108765,
"grad_norm": 0.38078814823399454,
"learning_rate": 1.858701387895865e-05,
"loss": 0.3143,
"step": 1112
},
{
"epoch": 2.8483685220729367,
"grad_norm": 0.34959904397806785,
"learning_rate": 1.8551303996764755e-05,
"loss": 0.3007,
"step": 1113
},
{
"epoch": 2.850927703134997,
"grad_norm": 0.4183989425563707,
"learning_rate": 1.8515598756758064e-05,
"loss": 0.359,
"step": 1114
},
{
"epoch": 2.853486884197057,
"grad_norm": 0.37111554641813266,
"learning_rate": 1.8479898273352084e-05,
"loss": 0.3381,
"step": 1115
},
{
"epoch": 2.8560460652591173,
"grad_norm": 0.36998674648942864,
"learning_rate": 1.8444202660945105e-05,
"loss": 0.3317,
"step": 1116
},
{
"epoch": 2.858605246321177,
"grad_norm": 0.4020148068654659,
"learning_rate": 1.8408512033919798e-05,
"loss": 0.3048,
"step": 1117
},
{
"epoch": 2.8611644273832373,
"grad_norm": 0.4005462960969913,
"learning_rate": 1.837282650664284e-05,
"loss": 0.3603,
"step": 1118
},
{
"epoch": 2.8637236084452975,
"grad_norm": 0.3444120356474249,
"learning_rate": 1.8337146193464595e-05,
"loss": 0.2898,
"step": 1119
},
{
"epoch": 2.8662827895073577,
"grad_norm": 0.39082290145745685,
"learning_rate": 1.83014712087187e-05,
"loss": 0.3131,
"step": 1120
},
{
"epoch": 2.868841970569418,
"grad_norm": 0.35821645503904304,
"learning_rate": 1.8265801666721744e-05,
"loss": 0.3433,
"step": 1121
},
{
"epoch": 2.8714011516314777,
"grad_norm": 0.41668733158509796,
"learning_rate": 1.8230137681772836e-05,
"loss": 0.3567,
"step": 1122
},
{
"epoch": 2.873960332693538,
"grad_norm": 0.34804389069027475,
"learning_rate": 1.8194479368153298e-05,
"loss": 0.3136,
"step": 1123
},
{
"epoch": 2.876519513755598,
"grad_norm": 0.3911488779456878,
"learning_rate": 1.8158826840126292e-05,
"loss": 0.3412,
"step": 1124
},
{
"epoch": 2.8790786948176583,
"grad_norm": 0.44396897741245006,
"learning_rate": 1.8123180211936417e-05,
"loss": 0.3644,
"step": 1125
},
{
"epoch": 2.8816378758797185,
"grad_norm": 0.3818666952140594,
"learning_rate": 1.808753959780938e-05,
"loss": 0.2988,
"step": 1126
},
{
"epoch": 2.8841970569417787,
"grad_norm": 0.3819899108018794,
"learning_rate": 1.805190511195162e-05,
"loss": 0.3304,
"step": 1127
},
{
"epoch": 2.886756238003839,
"grad_norm": 0.3988449634404456,
"learning_rate": 1.801627686854992e-05,
"loss": 0.3413,
"step": 1128
},
{
"epoch": 2.889315419065899,
"grad_norm": 0.4410132899232368,
"learning_rate": 1.7980654981771074e-05,
"loss": 0.3725,
"step": 1129
},
{
"epoch": 2.891874600127959,
"grad_norm": 0.31362864495515286,
"learning_rate": 1.794503956576152e-05,
"loss": 0.2833,
"step": 1130
},
{
"epoch": 2.894433781190019,
"grad_norm": 0.411109247445083,
"learning_rate": 1.7909430734646936e-05,
"loss": 0.3297,
"step": 1131
},
{
"epoch": 2.8969929622520794,
"grad_norm": 0.3360115333630458,
"learning_rate": 1.78738286025319e-05,
"loss": 0.2985,
"step": 1132
},
{
"epoch": 2.8995521433141396,
"grad_norm": 0.3728330681966131,
"learning_rate": 1.7838233283499554e-05,
"loss": 0.378,
"step": 1133
},
{
"epoch": 2.9021113243761993,
"grad_norm": 0.3539275837478167,
"learning_rate": 1.780264489161117e-05,
"loss": 0.3638,
"step": 1134
},
{
"epoch": 2.9046705054382596,
"grad_norm": 0.33582742300915935,
"learning_rate": 1.776706354090585e-05,
"loss": 0.3383,
"step": 1135
},
{
"epoch": 2.9072296865003198,
"grad_norm": 0.38396584024437336,
"learning_rate": 1.7731489345400118e-05,
"loss": 0.3116,
"step": 1136
},
{
"epoch": 2.90978886756238,
"grad_norm": 0.3121417283022767,
"learning_rate": 1.769592241908758e-05,
"loss": 0.3089,
"step": 1137
},
{
"epoch": 2.91234804862444,
"grad_norm": 0.37871598245894667,
"learning_rate": 1.766036287593854e-05,
"loss": 0.3504,
"step": 1138
},
{
"epoch": 2.9149072296865004,
"grad_norm": 0.4083761440481677,
"learning_rate": 1.762481082989965e-05,
"loss": 0.3338,
"step": 1139
},
{
"epoch": 2.9174664107485606,
"grad_norm": 0.3759166218369834,
"learning_rate": 1.758926639489354e-05,
"loss": 0.3448,
"step": 1140
},
{
"epoch": 2.920025591810621,
"grad_norm": 0.3918883448687237,
"learning_rate": 1.755372968481844e-05,
"loss": 0.3465,
"step": 1141
},
{
"epoch": 2.9225847728726806,
"grad_norm": 0.45281102397931977,
"learning_rate": 1.7518200813547842e-05,
"loss": 0.352,
"step": 1142
},
{
"epoch": 2.925143953934741,
"grad_norm": 0.3403187481784673,
"learning_rate": 1.748267989493011e-05,
"loss": 0.2767,
"step": 1143
},
{
"epoch": 2.927703134996801,
"grad_norm": 0.32374732234103554,
"learning_rate": 1.7447167042788108e-05,
"loss": 0.3003,
"step": 1144
},
{
"epoch": 2.9302623160588612,
"grad_norm": 0.43821825023278765,
"learning_rate": 1.7411662370918893e-05,
"loss": 0.3365,
"step": 1145
},
{
"epoch": 2.9328214971209214,
"grad_norm": 0.3735728621894312,
"learning_rate": 1.7376165993093278e-05,
"loss": 0.3164,
"step": 1146
},
{
"epoch": 2.935380678182981,
"grad_norm": 0.3713647905663265,
"learning_rate": 1.7340678023055496e-05,
"loss": 0.3237,
"step": 1147
},
{
"epoch": 2.9379398592450414,
"grad_norm": 0.40116903737296333,
"learning_rate": 1.7305198574522864e-05,
"loss": 0.3614,
"step": 1148
},
{
"epoch": 2.9404990403071016,
"grad_norm": 0.40054289621797295,
"learning_rate": 1.7269727761185374e-05,
"loss": 0.334,
"step": 1149
},
{
"epoch": 2.943058221369162,
"grad_norm": 0.3925230509455669,
"learning_rate": 1.7234265696705344e-05,
"loss": 0.2959,
"step": 1150
},
{
"epoch": 2.945617402431222,
"grad_norm": 0.42214888830794545,
"learning_rate": 1.7198812494717062e-05,
"loss": 0.3776,
"step": 1151
},
{
"epoch": 2.9481765834932823,
"grad_norm": 0.3616779488648713,
"learning_rate": 1.7163368268826433e-05,
"loss": 0.3016,
"step": 1152
},
{
"epoch": 2.9507357645553425,
"grad_norm": 0.38477907353760216,
"learning_rate": 1.7127933132610573e-05,
"loss": 0.3073,
"step": 1153
},
{
"epoch": 2.9532949456174027,
"grad_norm": 0.38583388740216534,
"learning_rate": 1.7092507199617482e-05,
"loss": 0.3303,
"step": 1154
},
{
"epoch": 2.9558541266794625,
"grad_norm": 0.4615746603773426,
"learning_rate": 1.7057090583365678e-05,
"loss": 0.3944,
"step": 1155
},
{
"epoch": 2.9584133077415227,
"grad_norm": 0.3793974621003137,
"learning_rate": 1.7021683397343823e-05,
"loss": 0.3298,
"step": 1156
},
{
"epoch": 2.960972488803583,
"grad_norm": 0.4920742763843501,
"learning_rate": 1.698628575501034e-05,
"loss": 0.3401,
"step": 1157
},
{
"epoch": 2.963531669865643,
"grad_norm": 0.4104914994234224,
"learning_rate": 1.6950897769793093e-05,
"loss": 0.3268,
"step": 1158
},
{
"epoch": 2.966090850927703,
"grad_norm": 0.41153736245664496,
"learning_rate": 1.6915519555089e-05,
"loss": 0.3594,
"step": 1159
},
{
"epoch": 2.968650031989763,
"grad_norm": 0.3875070318404897,
"learning_rate": 1.6880151224263646e-05,
"loss": 0.3398,
"step": 1160
},
{
"epoch": 2.9712092130518233,
"grad_norm": 0.44887352669211456,
"learning_rate": 1.6844792890650976e-05,
"loss": 0.2813,
"step": 1161
},
{
"epoch": 2.9737683941138835,
"grad_norm": 0.3800558590649599,
"learning_rate": 1.680944466755289e-05,
"loss": 0.3635,
"step": 1162
},
{
"epoch": 2.9763275751759437,
"grad_norm": 0.3971504175952064,
"learning_rate": 1.6774106668238867e-05,
"loss": 0.3146,
"step": 1163
},
{
"epoch": 2.978886756238004,
"grad_norm": 0.4715791982640647,
"learning_rate": 1.673877900594566e-05,
"loss": 0.3553,
"step": 1164
},
{
"epoch": 2.981445937300064,
"grad_norm": 0.30859611389638464,
"learning_rate": 1.6703461793876876e-05,
"loss": 0.2989,
"step": 1165
},
{
"epoch": 2.9840051183621243,
"grad_norm": 0.3973251808749978,
"learning_rate": 1.6668155145202638e-05,
"loss": 0.3579,
"step": 1166
},
{
"epoch": 2.986564299424184,
"grad_norm": 0.446835333586439,
"learning_rate": 1.6632859173059232e-05,
"loss": 0.3258,
"step": 1167
},
{
"epoch": 2.9891234804862443,
"grad_norm": 0.3811823018283798,
"learning_rate": 1.6597573990548722e-05,
"loss": 0.3201,
"step": 1168
},
{
"epoch": 2.9916826615483045,
"grad_norm": 0.375872795491289,
"learning_rate": 1.6562299710738586e-05,
"loss": 0.3255,
"step": 1169
},
{
"epoch": 2.9942418426103647,
"grad_norm": 0.4029499625289535,
"learning_rate": 1.6527036446661396e-05,
"loss": 0.307,
"step": 1170
},
{
"epoch": 2.996801023672425,
"grad_norm": 0.4156370523912452,
"learning_rate": 1.6491784311314403e-05,
"loss": 0.3797,
"step": 1171
},
{
"epoch": 2.9993602047344847,
"grad_norm": 0.4189485161365368,
"learning_rate": 1.6456543417659192e-05,
"loss": 0.3488,
"step": 1172
},
{
"epoch": 3.001919385796545,
"grad_norm": 0.5633124123177335,
"learning_rate": 1.6421313878621344e-05,
"loss": 0.2857,
"step": 1173
},
{
"epoch": 3.004478566858605,
"grad_norm": 0.39083213560124314,
"learning_rate": 1.6386095807090047e-05,
"loss": 0.2379,
"step": 1174
},
{
"epoch": 3.0070377479206654,
"grad_norm": 0.5240113784249277,
"learning_rate": 1.635088931591775e-05,
"loss": 0.2245,
"step": 1175
},
{
"epoch": 3.0095969289827256,
"grad_norm": 0.6354763063642779,
"learning_rate": 1.631569451791977e-05,
"loss": 0.2869,
"step": 1176
},
{
"epoch": 3.012156110044786,
"grad_norm": 0.4893508226750199,
"learning_rate": 1.628051152587398e-05,
"loss": 0.2703,
"step": 1177
},
{
"epoch": 3.014715291106846,
"grad_norm": 0.4524451646086151,
"learning_rate": 1.6245340452520414e-05,
"loss": 0.2176,
"step": 1178
},
{
"epoch": 3.0172744721689058,
"grad_norm": 0.4778994330296646,
"learning_rate": 1.6210181410560912e-05,
"loss": 0.2571,
"step": 1179
},
{
"epoch": 3.019833653230966,
"grad_norm": 0.37599046051577073,
"learning_rate": 1.6175034512658753e-05,
"loss": 0.2338,
"step": 1180
},
{
"epoch": 3.022392834293026,
"grad_norm": 0.45320853578403025,
"learning_rate": 1.613989987143833e-05,
"loss": 0.2325,
"step": 1181
},
{
"epoch": 3.0249520153550864,
"grad_norm": 0.43340502469781866,
"learning_rate": 1.610477759948472e-05,
"loss": 0.2767,
"step": 1182
},
{
"epoch": 3.0275111964171466,
"grad_norm": 0.38361490841329304,
"learning_rate": 1.6069667809343396e-05,
"loss": 0.2445,
"step": 1183
},
{
"epoch": 3.030070377479207,
"grad_norm": 0.39878208398916914,
"learning_rate": 1.603457061351983e-05,
"loss": 0.2506,
"step": 1184
},
{
"epoch": 3.0326295585412666,
"grad_norm": 0.44131555640934655,
"learning_rate": 1.5999486124479115e-05,
"loss": 0.2588,
"step": 1185
},
{
"epoch": 3.035188739603327,
"grad_norm": 0.36228403498534006,
"learning_rate": 1.5964414454645647e-05,
"loss": 0.2394,
"step": 1186
},
{
"epoch": 3.037747920665387,
"grad_norm": 0.40556573549758734,
"learning_rate": 1.5929355716402754e-05,
"loss": 0.2422,
"step": 1187
},
{
"epoch": 3.0403071017274472,
"grad_norm": 0.46971240300411676,
"learning_rate": 1.5894310022092288e-05,
"loss": 0.2536,
"step": 1188
},
{
"epoch": 3.0428662827895074,
"grad_norm": 0.37074278168598435,
"learning_rate": 1.5859277484014338e-05,
"loss": 0.2262,
"step": 1189
},
{
"epoch": 3.0454254638515676,
"grad_norm": 0.4101230047135583,
"learning_rate": 1.5824258214426833e-05,
"loss": 0.2501,
"step": 1190
},
{
"epoch": 3.047984644913628,
"grad_norm": 0.4316482731662535,
"learning_rate": 1.5789252325545157e-05,
"loss": 0.2766,
"step": 1191
},
{
"epoch": 3.0505438259756876,
"grad_norm": 0.36783848268146724,
"learning_rate": 1.5754259929541848e-05,
"loss": 0.2401,
"step": 1192
},
{
"epoch": 3.053103007037748,
"grad_norm": 0.4061482848745174,
"learning_rate": 1.5719281138546186e-05,
"loss": 0.2508,
"step": 1193
},
{
"epoch": 3.055662188099808,
"grad_norm": 0.382539152433566,
"learning_rate": 1.568431606464388e-05,
"loss": 0.2489,
"step": 1194
},
{
"epoch": 3.0582213691618683,
"grad_norm": 0.3652697874982772,
"learning_rate": 1.5649364819876655e-05,
"loss": 0.2429,
"step": 1195
},
{
"epoch": 3.0607805502239285,
"grad_norm": 0.42737097970366417,
"learning_rate": 1.561442751624193e-05,
"loss": 0.256,
"step": 1196
},
{
"epoch": 3.0633397312859887,
"grad_norm": 0.3207166801748589,
"learning_rate": 1.557950426569248e-05,
"loss": 0.1962,
"step": 1197
},
{
"epoch": 3.0658989123480485,
"grad_norm": 0.3943172590265861,
"learning_rate": 1.5544595180136003e-05,
"loss": 0.2519,
"step": 1198
},
{
"epoch": 3.0684580934101087,
"grad_norm": 0.36274192661719984,
"learning_rate": 1.550970037143483e-05,
"loss": 0.2279,
"step": 1199
},
{
"epoch": 3.071017274472169,
"grad_norm": 0.3630874200444502,
"learning_rate": 1.547481995140556e-05,
"loss": 0.2516,
"step": 1200
},
{
"epoch": 3.073576455534229,
"grad_norm": 0.3518754730219404,
"learning_rate": 1.5439954031818652e-05,
"loss": 0.2329,
"step": 1201
},
{
"epoch": 3.0761356365962893,
"grad_norm": 0.37790098887949486,
"learning_rate": 1.5405102724398113e-05,
"loss": 0.2677,
"step": 1202
},
{
"epoch": 3.0786948176583495,
"grad_norm": 0.35043382673558215,
"learning_rate": 1.5370266140821143e-05,
"loss": 0.2294,
"step": 1203
},
{
"epoch": 3.0812539987204093,
"grad_norm": 0.3731184820167596,
"learning_rate": 1.5335444392717738e-05,
"loss": 0.2319,
"step": 1204
},
{
"epoch": 3.0838131797824695,
"grad_norm": 0.3395760795759123,
"learning_rate": 1.5300637591670357e-05,
"loss": 0.2333,
"step": 1205
},
{
"epoch": 3.0863723608445297,
"grad_norm": 0.3530521161404101,
"learning_rate": 1.5265845849213588e-05,
"loss": 0.2458,
"step": 1206
},
{
"epoch": 3.08893154190659,
"grad_norm": 0.2968062718035343,
"learning_rate": 1.523106927683374e-05,
"loss": 0.1984,
"step": 1207
},
{
"epoch": 3.09149072296865,
"grad_norm": 0.34618157328728927,
"learning_rate": 1.5196307985968509e-05,
"loss": 0.2338,
"step": 1208
},
{
"epoch": 3.0940499040307103,
"grad_norm": 0.3991365653818135,
"learning_rate": 1.5161562088006649e-05,
"loss": 0.2639,
"step": 1209
},
{
"epoch": 3.09660908509277,
"grad_norm": 0.3337465568445769,
"learning_rate": 1.5126831694287564e-05,
"loss": 0.2354,
"step": 1210
},
{
"epoch": 3.0991682661548303,
"grad_norm": 0.39228609678359605,
"learning_rate": 1.5092116916100982e-05,
"loss": 0.2737,
"step": 1211
},
{
"epoch": 3.1017274472168905,
"grad_norm": 0.3216556126081721,
"learning_rate": 1.5057417864686607e-05,
"loss": 0.2237,
"step": 1212
},
{
"epoch": 3.1042866282789507,
"grad_norm": 0.3567015015151436,
"learning_rate": 1.5022734651233737e-05,
"loss": 0.2568,
"step": 1213
},
{
"epoch": 3.106845809341011,
"grad_norm": 0.35178848138592544,
"learning_rate": 1.4988067386880904e-05,
"loss": 0.2276,
"step": 1214
},
{
"epoch": 3.109404990403071,
"grad_norm": 0.3511504661864566,
"learning_rate": 1.4953416182715566e-05,
"loss": 0.2699,
"step": 1215
},
{
"epoch": 3.1119641714651314,
"grad_norm": 0.3425398327341164,
"learning_rate": 1.4918781149773694e-05,
"loss": 0.2677,
"step": 1216
},
{
"epoch": 3.114523352527191,
"grad_norm": 0.36160910451306577,
"learning_rate": 1.4884162399039439e-05,
"loss": 0.2545,
"step": 1217
},
{
"epoch": 3.1170825335892514,
"grad_norm": 0.37097329916252125,
"learning_rate": 1.4849560041444795e-05,
"loss": 0.2609,
"step": 1218
},
{
"epoch": 3.1196417146513116,
"grad_norm": 0.36352782561925345,
"learning_rate": 1.4814974187869218e-05,
"loss": 0.2236,
"step": 1219
},
{
"epoch": 3.122200895713372,
"grad_norm": 0.3551683721423837,
"learning_rate": 1.478040494913926e-05,
"loss": 0.2244,
"step": 1220
},
{
"epoch": 3.124760076775432,
"grad_norm": 0.3308813359796844,
"learning_rate": 1.4745852436028262e-05,
"loss": 0.2591,
"step": 1221
},
{
"epoch": 3.127319257837492,
"grad_norm": 0.3507830733548493,
"learning_rate": 1.4711316759255963e-05,
"loss": 0.2453,
"step": 1222
},
{
"epoch": 3.129878438899552,
"grad_norm": 0.33582461966461585,
"learning_rate": 1.4676798029488123e-05,
"loss": 0.2593,
"step": 1223
},
{
"epoch": 3.132437619961612,
"grad_norm": 0.3507842455435477,
"learning_rate": 1.464229635733624e-05,
"loss": 0.2372,
"step": 1224
},
{
"epoch": 3.1349968010236724,
"grad_norm": 0.3318567188084375,
"learning_rate": 1.460781185335713e-05,
"loss": 0.231,
"step": 1225
},
{
"epoch": 3.1375559820857326,
"grad_norm": 0.3188990523059626,
"learning_rate": 1.4573344628052588e-05,
"loss": 0.2376,
"step": 1226
},
{
"epoch": 3.140115163147793,
"grad_norm": 0.3664030290111237,
"learning_rate": 1.4538894791869052e-05,
"loss": 0.2585,
"step": 1227
},
{
"epoch": 3.142674344209853,
"grad_norm": 0.3456582759492691,
"learning_rate": 1.4504462455197248e-05,
"loss": 0.2295,
"step": 1228
},
{
"epoch": 3.145233525271913,
"grad_norm": 0.3071644333690587,
"learning_rate": 1.4470047728371813e-05,
"loss": 0.2113,
"step": 1229
},
{
"epoch": 3.147792706333973,
"grad_norm": 0.33706220227684885,
"learning_rate": 1.443565072167095e-05,
"loss": 0.2286,
"step": 1230
},
{
"epoch": 3.1503518873960332,
"grad_norm": 0.3213473350814528,
"learning_rate": 1.4401271545316096e-05,
"loss": 0.2333,
"step": 1231
},
{
"epoch": 3.1529110684580934,
"grad_norm": 0.32806923950956013,
"learning_rate": 1.436691030947155e-05,
"loss": 0.2338,
"step": 1232
},
{
"epoch": 3.1554702495201536,
"grad_norm": 0.33540430311298725,
"learning_rate": 1.43325671242441e-05,
"loss": 0.209,
"step": 1233
},
{
"epoch": 3.158029430582214,
"grad_norm": 0.3208377254757641,
"learning_rate": 1.4298242099682726e-05,
"loss": 0.245,
"step": 1234
},
{
"epoch": 3.1605886116442736,
"grad_norm": 0.3427654538840671,
"learning_rate": 1.4263935345778202e-05,
"loss": 0.2521,
"step": 1235
},
{
"epoch": 3.163147792706334,
"grad_norm": 0.35551971354398254,
"learning_rate": 1.4229646972462732e-05,
"loss": 0.2338,
"step": 1236
},
{
"epoch": 3.165706973768394,
"grad_norm": 0.33357596367989273,
"learning_rate": 1.419537708960966e-05,
"loss": 0.2322,
"step": 1237
},
{
"epoch": 3.1682661548304543,
"grad_norm": 0.3416940872307819,
"learning_rate": 1.4161125807033059e-05,
"loss": 0.24,
"step": 1238
},
{
"epoch": 3.1708253358925145,
"grad_norm": 0.3259027337159305,
"learning_rate": 1.412689323448739e-05,
"loss": 0.2705,
"step": 1239
},
{
"epoch": 3.1733845169545747,
"grad_norm": 0.33778026989222404,
"learning_rate": 1.409267948166718e-05,
"loss": 0.2335,
"step": 1240
},
{
"epoch": 3.175943698016635,
"grad_norm": 0.3425941944724759,
"learning_rate": 1.4058484658206646e-05,
"loss": 0.2684,
"step": 1241
},
{
"epoch": 3.1785028790786947,
"grad_norm": 0.304680436913791,
"learning_rate": 1.4024308873679327e-05,
"loss": 0.2181,
"step": 1242
},
{
"epoch": 3.181062060140755,
"grad_norm": 0.3522850109826806,
"learning_rate": 1.3990152237597787e-05,
"loss": 0.2572,
"step": 1243
},
{
"epoch": 3.183621241202815,
"grad_norm": 0.3209721557320742,
"learning_rate": 1.3956014859413211e-05,
"loss": 0.2337,
"step": 1244
},
{
"epoch": 3.1861804222648753,
"grad_norm": 0.3293098487746776,
"learning_rate": 1.3921896848515064e-05,
"loss": 0.2411,
"step": 1245
},
{
"epoch": 3.1887396033269355,
"grad_norm": 0.30365057870700035,
"learning_rate": 1.388779831423078e-05,
"loss": 0.2291,
"step": 1246
},
{
"epoch": 3.1912987843889957,
"grad_norm": 0.3131208709907512,
"learning_rate": 1.3853719365825357e-05,
"loss": 0.2352,
"step": 1247
},
{
"epoch": 3.1938579654510555,
"grad_norm": 0.3376254463164988,
"learning_rate": 1.3819660112501054e-05,
"loss": 0.2625,
"step": 1248
},
{
"epoch": 3.1964171465131157,
"grad_norm": 0.3069287683421629,
"learning_rate": 1.3785620663396992e-05,
"loss": 0.2229,
"step": 1249
},
{
"epoch": 3.198976327575176,
"grad_norm": 0.3316589759980029,
"learning_rate": 1.3751601127588849e-05,
"loss": 0.245,
"step": 1250
},
{
"epoch": 3.201535508637236,
"grad_norm": 0.30527916893181595,
"learning_rate": 1.37176016140885e-05,
"loss": 0.2346,
"step": 1251
},
{
"epoch": 3.2040946896992963,
"grad_norm": 0.34900918414936455,
"learning_rate": 1.3683622231843644e-05,
"loss": 0.2392,
"step": 1252
},
{
"epoch": 3.2066538707613566,
"grad_norm": 0.30641882677939075,
"learning_rate": 1.364966308973747e-05,
"loss": 0.2218,
"step": 1253
},
{
"epoch": 3.2092130518234163,
"grad_norm": 0.3296381755475144,
"learning_rate": 1.3615724296588342e-05,
"loss": 0.2566,
"step": 1254
},
{
"epoch": 3.2117722328854765,
"grad_norm": 0.34080590800970306,
"learning_rate": 1.3581805961149371e-05,
"loss": 0.2518,
"step": 1255
},
{
"epoch": 3.2143314139475367,
"grad_norm": 0.31502390005004344,
"learning_rate": 1.3547908192108143e-05,
"loss": 0.2288,
"step": 1256
},
{
"epoch": 3.216890595009597,
"grad_norm": 0.30635056034248115,
"learning_rate": 1.3514031098086349e-05,
"loss": 0.2539,
"step": 1257
},
{
"epoch": 3.219449776071657,
"grad_norm": 0.34066214746897916,
"learning_rate": 1.3480174787639397e-05,
"loss": 0.2664,
"step": 1258
},
{
"epoch": 3.2220089571337174,
"grad_norm": 0.31764511418435903,
"learning_rate": 1.3446339369256121e-05,
"loss": 0.2067,
"step": 1259
},
{
"epoch": 3.224568138195777,
"grad_norm": 0.29852956617495935,
"learning_rate": 1.341252495135841e-05,
"loss": 0.2298,
"step": 1260
},
{
"epoch": 3.2271273192578374,
"grad_norm": 0.34615186747664684,
"learning_rate": 1.3378731642300841e-05,
"loss": 0.2488,
"step": 1261
},
{
"epoch": 3.2296865003198976,
"grad_norm": 0.31284863193899576,
"learning_rate": 1.3344959550370362e-05,
"loss": 0.222,
"step": 1262
},
{
"epoch": 3.232245681381958,
"grad_norm": 0.3198015399733083,
"learning_rate": 1.3311208783785945e-05,
"loss": 0.2561,
"step": 1263
},
{
"epoch": 3.234804862444018,
"grad_norm": 0.33484963926651445,
"learning_rate": 1.327747945069819e-05,
"loss": 0.2532,
"step": 1264
},
{
"epoch": 3.237364043506078,
"grad_norm": 0.3505508918710989,
"learning_rate": 1.324377165918906e-05,
"loss": 0.253,
"step": 1265
},
{
"epoch": 3.2399232245681384,
"grad_norm": 0.3317100885612124,
"learning_rate": 1.3210085517271459e-05,
"loss": 0.2488,
"step": 1266
},
{
"epoch": 3.242482405630198,
"grad_norm": 0.3183951056600632,
"learning_rate": 1.3176421132888936e-05,
"loss": 0.2206,
"step": 1267
},
{
"epoch": 3.2450415866922584,
"grad_norm": 0.7798174468009574,
"learning_rate": 1.3142778613915308e-05,
"loss": 0.3465,
"step": 1268
},
{
"epoch": 3.2476007677543186,
"grad_norm": 0.3276263112485529,
"learning_rate": 1.3109158068154329e-05,
"loss": 0.2206,
"step": 1269
},
{
"epoch": 3.250159948816379,
"grad_norm": 0.35582948619273064,
"learning_rate": 1.3075559603339354e-05,
"loss": 0.2272,
"step": 1270
},
{
"epoch": 3.252719129878439,
"grad_norm": 0.3425439408964034,
"learning_rate": 1.304198332713296e-05,
"loss": 0.2587,
"step": 1271
},
{
"epoch": 3.255278310940499,
"grad_norm": 0.35588463150095667,
"learning_rate": 1.3008429347126641e-05,
"loss": 0.2585,
"step": 1272
},
{
"epoch": 3.257837492002559,
"grad_norm": 0.3354553998723496,
"learning_rate": 1.2974897770840448e-05,
"loss": 0.2067,
"step": 1273
},
{
"epoch": 3.260396673064619,
"grad_norm": 0.3247184453320128,
"learning_rate": 1.2941388705722627e-05,
"loss": 0.2449,
"step": 1274
},
{
"epoch": 3.2629558541266794,
"grad_norm": 0.3269101886184072,
"learning_rate": 1.2907902259149287e-05,
"loss": 0.2454,
"step": 1275
},
{
"epoch": 3.2655150351887396,
"grad_norm": 0.34277751654037186,
"learning_rate": 1.2874438538424086e-05,
"loss": 0.2267,
"step": 1276
},
{
"epoch": 3.2680742162508,
"grad_norm": 0.3425839528353915,
"learning_rate": 1.2840997650777829e-05,
"loss": 0.2289,
"step": 1277
},
{
"epoch": 3.27063339731286,
"grad_norm": 0.32496809601312776,
"learning_rate": 1.2807579703368162e-05,
"loss": 0.2437,
"step": 1278
},
{
"epoch": 3.27319257837492,
"grad_norm": 0.37627417428336984,
"learning_rate": 1.2774184803279245e-05,
"loss": 0.2196,
"step": 1279
},
{
"epoch": 3.27575175943698,
"grad_norm": 0.30844464297394786,
"learning_rate": 1.274081305752135e-05,
"loss": 0.2148,
"step": 1280
},
{
"epoch": 3.2783109404990403,
"grad_norm": 0.32041752442694194,
"learning_rate": 1.2707464573030572e-05,
"loss": 0.2495,
"step": 1281
},
{
"epoch": 3.2808701215611005,
"grad_norm": 0.3247468389566215,
"learning_rate": 1.2674139456668479e-05,
"loss": 0.2558,
"step": 1282
},
{
"epoch": 3.2834293026231607,
"grad_norm": 0.30941778730595587,
"learning_rate": 1.2640837815221731e-05,
"loss": 0.2238,
"step": 1283
},
{
"epoch": 3.285988483685221,
"grad_norm": 0.3397180703699647,
"learning_rate": 1.260755975540178e-05,
"loss": 0.2405,
"step": 1284
},
{
"epoch": 3.2885476647472807,
"grad_norm": 0.31915117073868005,
"learning_rate": 1.2574305383844528e-05,
"loss": 0.2396,
"step": 1285
},
{
"epoch": 3.291106845809341,
"grad_norm": 0.2977168854647766,
"learning_rate": 1.2541074807109945e-05,
"loss": 0.2286,
"step": 1286
},
{
"epoch": 3.293666026871401,
"grad_norm": 0.3141534078265832,
"learning_rate": 1.250786813168176e-05,
"loss": 0.2291,
"step": 1287
},
{
"epoch": 3.2962252079334613,
"grad_norm": 0.3250362176047104,
"learning_rate": 1.2474685463967125e-05,
"loss": 0.2353,
"step": 1288
},
{
"epoch": 3.2987843889955215,
"grad_norm": 0.5783304188096524,
"learning_rate": 1.2441526910296253e-05,
"loss": 0.2316,
"step": 1289
},
{
"epoch": 3.3013435700575817,
"grad_norm": 0.31254991008955707,
"learning_rate": 1.2408392576922075e-05,
"loss": 0.2336,
"step": 1290
},
{
"epoch": 3.303902751119642,
"grad_norm": 0.31041323932723247,
"learning_rate": 1.2375282570019933e-05,
"loss": 0.2457,
"step": 1291
},
{
"epoch": 3.3064619321817017,
"grad_norm": 0.32993719976229857,
"learning_rate": 1.2342196995687212e-05,
"loss": 0.2588,
"step": 1292
},
{
"epoch": 3.309021113243762,
"grad_norm": 0.3016426692910031,
"learning_rate": 1.2309135959942986e-05,
"loss": 0.2577,
"step": 1293
},
{
"epoch": 3.311580294305822,
"grad_norm": 0.34298650349077,
"learning_rate": 1.227609956872772e-05,
"loss": 0.2386,
"step": 1294
},
{
"epoch": 3.3141394753678823,
"grad_norm": 0.3233823231490881,
"learning_rate": 1.2243087927902905e-05,
"loss": 0.2203,
"step": 1295
},
{
"epoch": 3.3166986564299425,
"grad_norm": 0.3431401051489707,
"learning_rate": 1.2210101143250708e-05,
"loss": 0.2369,
"step": 1296
},
{
"epoch": 3.3192578374920023,
"grad_norm": 0.30536095566488874,
"learning_rate": 1.2177139320473663e-05,
"loss": 0.209,
"step": 1297
},
{
"epoch": 3.3218170185540625,
"grad_norm": 0.3380078591507895,
"learning_rate": 1.2144202565194311e-05,
"loss": 0.2793,
"step": 1298
},
{
"epoch": 3.3243761996161227,
"grad_norm": 0.3327840853187567,
"learning_rate": 1.211129098295486e-05,
"loss": 0.2473,
"step": 1299
},
{
"epoch": 3.326935380678183,
"grad_norm": 0.29406663008971645,
"learning_rate": 1.2078404679216864e-05,
"loss": 0.2056,
"step": 1300
},
{
"epoch": 3.329494561740243,
"grad_norm": 0.31662925879264764,
"learning_rate": 1.2045543759360876e-05,
"loss": 0.2443,
"step": 1301
},
{
"epoch": 3.3320537428023034,
"grad_norm": 0.322730147707437,
"learning_rate": 1.2012708328686093e-05,
"loss": 0.2143,
"step": 1302
},
{
"epoch": 3.3346129238643636,
"grad_norm": 0.2975751708993095,
"learning_rate": 1.1979898492410049e-05,
"loss": 0.2385,
"step": 1303
},
{
"epoch": 3.3371721049264234,
"grad_norm": 0.3636857886009555,
"learning_rate": 1.1947114355668265e-05,
"loss": 0.2623,
"step": 1304
},
{
"epoch": 3.3397312859884836,
"grad_norm": 0.3199715781542156,
"learning_rate": 1.1914356023513904e-05,
"loss": 0.2605,
"step": 1305
},
{
"epoch": 3.342290467050544,
"grad_norm": 0.32209675935492127,
"learning_rate": 1.1881623600917437e-05,
"loss": 0.2474,
"step": 1306
},
{
"epoch": 3.344849648112604,
"grad_norm": 0.3146940770709782,
"learning_rate": 1.1848917192766322e-05,
"loss": 0.1886,
"step": 1307
},
{
"epoch": 3.347408829174664,
"grad_norm": 0.3515135762125379,
"learning_rate": 1.1816236903864656e-05,
"loss": 0.2639,
"step": 1308
},
{
"epoch": 3.3499680102367244,
"grad_norm": 0.3273044475438286,
"learning_rate": 1.1783582838932821e-05,
"loss": 0.2681,
"step": 1309
},
{
"epoch": 3.352527191298784,
"grad_norm": 0.3241556852063347,
"learning_rate": 1.1750955102607193e-05,
"loss": 0.2148,
"step": 1310
},
{
"epoch": 3.3550863723608444,
"grad_norm": 0.34788552766120195,
"learning_rate": 1.1718353799439766e-05,
"loss": 0.2328,
"step": 1311
},
{
"epoch": 3.3576455534229046,
"grad_norm": 0.31285926538820524,
"learning_rate": 1.1685779033897827e-05,
"loss": 0.2139,
"step": 1312
},
{
"epoch": 3.360204734484965,
"grad_norm": 0.34091923953794956,
"learning_rate": 1.1653230910363645e-05,
"loss": 0.2522,
"step": 1313
},
{
"epoch": 3.362763915547025,
"grad_norm": 0.3093765842790142,
"learning_rate": 1.1620709533134104e-05,
"loss": 0.25,
"step": 1314
},
{
"epoch": 3.3653230966090852,
"grad_norm": 0.3332035792467053,
"learning_rate": 1.1588215006420374e-05,
"loss": 0.2729,
"step": 1315
},
{
"epoch": 3.3678822776711455,
"grad_norm": 0.3076095736538766,
"learning_rate": 1.1555747434347606e-05,
"loss": 0.2076,
"step": 1316
},
{
"epoch": 3.370441458733205,
"grad_norm": 0.30288813913337326,
"learning_rate": 1.1523306920954571e-05,
"loss": 0.2449,
"step": 1317
},
{
"epoch": 3.3730006397952654,
"grad_norm": 0.3702724394333769,
"learning_rate": 1.1490893570193328e-05,
"loss": 0.2646,
"step": 1318
},
{
"epoch": 3.3755598208573256,
"grad_norm": 0.35103580998810946,
"learning_rate": 1.1458507485928891e-05,
"loss": 0.2634,
"step": 1319
},
{
"epoch": 3.378119001919386,
"grad_norm": 0.3037095018272702,
"learning_rate": 1.1426148771938915e-05,
"loss": 0.2229,
"step": 1320
},
{
"epoch": 3.380678182981446,
"grad_norm": 0.3192579058292554,
"learning_rate": 1.139381753191335e-05,
"loss": 0.2489,
"step": 1321
},
{
"epoch": 3.383237364043506,
"grad_norm": 0.32501023665860496,
"learning_rate": 1.1361513869454092e-05,
"loss": 0.2407,
"step": 1322
},
{
"epoch": 3.385796545105566,
"grad_norm": 0.35596917491983554,
"learning_rate": 1.1329237888074691e-05,
"loss": 0.2437,
"step": 1323
},
{
"epoch": 3.3883557261676263,
"grad_norm": 0.33172031186682566,
"learning_rate": 1.129698969119998e-05,
"loss": 0.2623,
"step": 1324
},
{
"epoch": 3.3909149072296865,
"grad_norm": 0.2995594606117323,
"learning_rate": 1.1264769382165748e-05,
"loss": 0.1996,
"step": 1325
},
{
"epoch": 3.3934740882917467,
"grad_norm": 0.3194047630014032,
"learning_rate": 1.123257706421845e-05,
"loss": 0.236,
"step": 1326
},
{
"epoch": 3.396033269353807,
"grad_norm": 0.33068857427400655,
"learning_rate": 1.1200412840514839e-05,
"loss": 0.2244,
"step": 1327
},
{
"epoch": 3.398592450415867,
"grad_norm": 0.36837193881845204,
"learning_rate": 1.1168276814121621e-05,
"loss": 0.2828,
"step": 1328
},
{
"epoch": 3.401151631477927,
"grad_norm": 0.3076965971301543,
"learning_rate": 1.1136169088015177e-05,
"loss": 0.2241,
"step": 1329
},
{
"epoch": 3.403710812539987,
"grad_norm": 0.3411885790050691,
"learning_rate": 1.110408976508118e-05,
"loss": 0.2232,
"step": 1330
},
{
"epoch": 3.4062699936020473,
"grad_norm": 0.3263500540450158,
"learning_rate": 1.107203894811429e-05,
"loss": 0.2572,
"step": 1331
},
{
"epoch": 3.4088291746641075,
"grad_norm": 0.3416605787702754,
"learning_rate": 1.1040016739817836e-05,
"loss": 0.2433,
"step": 1332
},
{
"epoch": 3.4113883557261677,
"grad_norm": 0.335195547574942,
"learning_rate": 1.1008023242803477e-05,
"loss": 0.2648,
"step": 1333
},
{
"epoch": 3.413947536788228,
"grad_norm": 0.36538157527653864,
"learning_rate": 1.097605855959084e-05,
"loss": 0.2286,
"step": 1334
},
{
"epoch": 3.4165067178502877,
"grad_norm": 0.2912108672412734,
"learning_rate": 1.094412279260726e-05,
"loss": 0.2073,
"step": 1335
},
{
"epoch": 3.419065898912348,
"grad_norm": 0.32352858434290793,
"learning_rate": 1.0912216044187382e-05,
"loss": 0.2725,
"step": 1336
},
{
"epoch": 3.421625079974408,
"grad_norm": 0.30980460582268804,
"learning_rate": 1.0880338416572872e-05,
"loss": 0.242,
"step": 1337
},
{
"epoch": 3.4241842610364683,
"grad_norm": 0.3026357282953144,
"learning_rate": 1.0848490011912096e-05,
"loss": 0.2207,
"step": 1338
},
{
"epoch": 3.4267434420985285,
"grad_norm": 0.3222649336637817,
"learning_rate": 1.0816670932259763e-05,
"loss": 0.2196,
"step": 1339
},
{
"epoch": 3.4293026231605888,
"grad_norm": 0.3346500533447882,
"learning_rate": 1.0784881279576635e-05,
"loss": 0.2187,
"step": 1340
},
{
"epoch": 3.431861804222649,
"grad_norm": 0.3122079565048836,
"learning_rate": 1.0753121155729133e-05,
"loss": 0.2227,
"step": 1341
},
{
"epoch": 3.4344209852847087,
"grad_norm": 0.3240510909707239,
"learning_rate": 1.07213906624891e-05,
"loss": 0.2231,
"step": 1342
},
{
"epoch": 3.436980166346769,
"grad_norm": 0.3235912618403718,
"learning_rate": 1.0689689901533424e-05,
"loss": 0.2492,
"step": 1343
},
{
"epoch": 3.439539347408829,
"grad_norm": 0.3040119908970231,
"learning_rate": 1.0658018974443692e-05,
"loss": 0.1984,
"step": 1344
},
{
"epoch": 3.4420985284708894,
"grad_norm": 0.340863607236755,
"learning_rate": 1.0626377982705929e-05,
"loss": 0.2349,
"step": 1345
},
{
"epoch": 3.4446577095329496,
"grad_norm": 0.32795701173977326,
"learning_rate": 1.059476702771021e-05,
"loss": 0.2529,
"step": 1346
},
{
"epoch": 3.4472168905950094,
"grad_norm": 0.31132450713720333,
"learning_rate": 1.056318621075036e-05,
"loss": 0.2095,
"step": 1347
},
{
"epoch": 3.4497760716570696,
"grad_norm": 0.3254097118432526,
"learning_rate": 1.0531635633023644e-05,
"loss": 0.2358,
"step": 1348
},
{
"epoch": 3.4523352527191298,
"grad_norm": 0.3017269372689714,
"learning_rate": 1.050011539563043e-05,
"loss": 0.2247,
"step": 1349
},
{
"epoch": 3.45489443378119,
"grad_norm": 0.3283903326525304,
"learning_rate": 1.0468625599573842e-05,
"loss": 0.2718,
"step": 1350
},
{
"epoch": 3.45745361484325,
"grad_norm": 0.31738351502037276,
"learning_rate": 1.0437166345759489e-05,
"loss": 0.2345,
"step": 1351
},
{
"epoch": 3.4600127959053104,
"grad_norm": 0.30402755053632596,
"learning_rate": 1.0405737734995083e-05,
"loss": 0.2057,
"step": 1352
},
{
"epoch": 3.4625719769673706,
"grad_norm": 0.33488642167297444,
"learning_rate": 1.037433986799015e-05,
"loss": 0.2439,
"step": 1353
},
{
"epoch": 3.4651311580294304,
"grad_norm": 0.30914851197686366,
"learning_rate": 1.034297284535571e-05,
"loss": 0.2028,
"step": 1354
},
{
"epoch": 3.4676903390914906,
"grad_norm": 0.32175431813825445,
"learning_rate": 1.0311636767603952e-05,
"loss": 0.2439,
"step": 1355
},
{
"epoch": 3.470249520153551,
"grad_norm": 0.3395487986448244,
"learning_rate": 1.028033173514788e-05,
"loss": 0.2502,
"step": 1356
},
{
"epoch": 3.472808701215611,
"grad_norm": 0.3042126709214444,
"learning_rate": 1.0249057848301043e-05,
"loss": 0.2395,
"step": 1357
},
{
"epoch": 3.4753678822776712,
"grad_norm": 0.3183501854149144,
"learning_rate": 1.0217815207277165e-05,
"loss": 0.2234,
"step": 1358
},
{
"epoch": 3.4779270633397315,
"grad_norm": 0.3233478623491546,
"learning_rate": 1.0186603912189867e-05,
"loss": 0.2589,
"step": 1359
},
{
"epoch": 3.480486244401791,
"grad_norm": 0.3018286569760461,
"learning_rate": 1.0155424063052306e-05,
"loss": 0.2401,
"step": 1360
},
{
"epoch": 3.4830454254638514,
"grad_norm": 0.3174313407841064,
"learning_rate": 1.0124275759776889e-05,
"loss": 0.2399,
"step": 1361
},
{
"epoch": 3.4856046065259116,
"grad_norm": 0.307048568974569,
"learning_rate": 1.0093159102174938e-05,
"loss": 0.2291,
"step": 1362
},
{
"epoch": 3.488163787587972,
"grad_norm": 0.3132031600114937,
"learning_rate": 1.006207418995636e-05,
"loss": 0.2086,
"step": 1363
},
{
"epoch": 3.490722968650032,
"grad_norm": 0.34596427815653313,
"learning_rate": 1.0031021122729328e-05,
"loss": 0.2497,
"step": 1364
},
{
"epoch": 3.4932821497120923,
"grad_norm": 0.2986896060364163,
"learning_rate": 1.0000000000000006e-05,
"loss": 0.2379,
"step": 1365
},
{
"epoch": 3.4958413307741525,
"grad_norm": 0.3319066544902576,
"learning_rate": 9.969010921172155e-06,
"loss": 0.2542,
"step": 1366
},
{
"epoch": 3.4984005118362123,
"grad_norm": 0.3053208810307986,
"learning_rate": 9.938053985546883e-06,
"loss": 0.2299,
"step": 1367
},
{
"epoch": 3.5009596928982725,
"grad_norm": 0.3736366846122222,
"learning_rate": 9.907129292322298e-06,
"loss": 0.2676,
"step": 1368
},
{
"epoch": 3.5035188739603327,
"grad_norm": 0.33175316766942814,
"learning_rate": 9.876236940593173e-06,
"loss": 0.2753,
"step": 1369
},
{
"epoch": 3.506078055022393,
"grad_norm": 0.3276624133983928,
"learning_rate": 9.84537702935065e-06,
"loss": 0.2745,
"step": 1370
},
{
"epoch": 3.508637236084453,
"grad_norm": 0.3067138284048095,
"learning_rate": 9.814549657481935e-06,
"loss": 0.2201,
"step": 1371
},
{
"epoch": 3.511196417146513,
"grad_norm": 0.3489342217784152,
"learning_rate": 9.783754923769946e-06,
"loss": 0.2402,
"step": 1372
},
{
"epoch": 3.513755598208573,
"grad_norm": 0.3103704194146652,
"learning_rate": 9.752992926893027e-06,
"loss": 0.231,
"step": 1373
},
{
"epoch": 3.5163147792706333,
"grad_norm": 0.29799307682543535,
"learning_rate": 9.722263765424628e-06,
"loss": 0.2103,
"step": 1374
},
{
"epoch": 3.5188739603326935,
"grad_norm": 0.3245882511730939,
"learning_rate": 9.691567537832964e-06,
"loss": 0.2547,
"step": 1375
},
{
"epoch": 3.5214331413947537,
"grad_norm": 0.29350433799889125,
"learning_rate": 9.660904342480715e-06,
"loss": 0.2083,
"step": 1376
},
{
"epoch": 3.523992322456814,
"grad_norm": 0.3333636077305378,
"learning_rate": 9.630274277624729e-06,
"loss": 0.2837,
"step": 1377
},
{
"epoch": 3.526551503518874,
"grad_norm": 0.33952864266921756,
"learning_rate": 9.599677441415694e-06,
"loss": 0.2313,
"step": 1378
},
{
"epoch": 3.5291106845809344,
"grad_norm": 0.3012959852140507,
"learning_rate": 9.5691139318978e-06,
"loss": 0.2171,
"step": 1379
},
{
"epoch": 3.531669865642994,
"grad_norm": 0.3318689398716116,
"learning_rate": 9.538583847008452e-06,
"loss": 0.2366,
"step": 1380
},
{
"epoch": 3.5342290467050543,
"grad_norm": 0.3358375103639254,
"learning_rate": 9.508087284577963e-06,
"loss": 0.2402,
"step": 1381
},
{
"epoch": 3.5367882277671145,
"grad_norm": 0.30920378089911293,
"learning_rate": 9.477624342329209e-06,
"loss": 0.2143,
"step": 1382
},
{
"epoch": 3.5393474088291748,
"grad_norm": 0.3169427298803479,
"learning_rate": 9.447195117877343e-06,
"loss": 0.2285,
"step": 1383
},
{
"epoch": 3.541906589891235,
"grad_norm": 0.3112954457690554,
"learning_rate": 9.416799708729486e-06,
"loss": 0.2315,
"step": 1384
},
{
"epoch": 3.5444657709532947,
"grad_norm": 0.3238511016385153,
"learning_rate": 9.386438212284372e-06,
"loss": 0.2252,
"step": 1385
},
{
"epoch": 3.547024952015355,
"grad_norm": 0.3031398199187957,
"learning_rate": 9.356110725832081e-06,
"loss": 0.2376,
"step": 1386
},
{
"epoch": 3.549584133077415,
"grad_norm": 0.3105091167975465,
"learning_rate": 9.325817346553725e-06,
"loss": 0.2689,
"step": 1387
},
{
"epoch": 3.5521433141394754,
"grad_norm": 0.31208594113425225,
"learning_rate": 9.295558171521093e-06,
"loss": 0.2278,
"step": 1388
},
{
"epoch": 3.5547024952015356,
"grad_norm": 0.31485619105429463,
"learning_rate": 9.265333297696395e-06,
"loss": 0.242,
"step": 1389
},
{
"epoch": 3.557261676263596,
"grad_norm": 0.31606147283215824,
"learning_rate": 9.235142821931928e-06,
"loss": 0.2363,
"step": 1390
},
{
"epoch": 3.559820857325656,
"grad_norm": 0.3150525527068536,
"learning_rate": 9.204986840969749e-06,
"loss": 0.2199,
"step": 1391
},
{
"epoch": 3.5623800383877158,
"grad_norm": 0.31857444893477177,
"learning_rate": 9.174865451441375e-06,
"loss": 0.2283,
"step": 1392
},
{
"epoch": 3.564939219449776,
"grad_norm": 0.30466028849006704,
"learning_rate": 9.1447787498675e-06,
"loss": 0.232,
"step": 1393
},
{
"epoch": 3.567498400511836,
"grad_norm": 0.34031824974175295,
"learning_rate": 9.114726832657658e-06,
"loss": 0.2663,
"step": 1394
},
{
"epoch": 3.5700575815738964,
"grad_norm": 0.3261116373502211,
"learning_rate": 9.084709796109907e-06,
"loss": 0.2489,
"step": 1395
},
{
"epoch": 3.5726167626359566,
"grad_norm": 0.30217642557332414,
"learning_rate": 9.054727736410555e-06,
"loss": 0.2613,
"step": 1396
},
{
"epoch": 3.5751759436980164,
"grad_norm": 0.3029175133984261,
"learning_rate": 9.02478074963381e-06,
"loss": 0.2263,
"step": 1397
},
{
"epoch": 3.5777351247600766,
"grad_norm": 0.3613891446327612,
"learning_rate": 8.994868931741499e-06,
"loss": 0.2658,
"step": 1398
},
{
"epoch": 3.580294305822137,
"grad_norm": 0.32375645597232505,
"learning_rate": 8.964992378582758e-06,
"loss": 0.2458,
"step": 1399
},
{
"epoch": 3.582853486884197,
"grad_norm": 0.346977280248451,
"learning_rate": 8.93515118589373e-06,
"loss": 0.2673,
"step": 1400
},
{
"epoch": 3.5854126679462572,
"grad_norm": 0.3091670769782311,
"learning_rate": 8.905345449297223e-06,
"loss": 0.2517,
"step": 1401
},
{
"epoch": 3.5879718490083174,
"grad_norm": 0.3033606651844572,
"learning_rate": 8.87557526430246e-06,
"loss": 0.2012,
"step": 1402
},
{
"epoch": 3.5905310300703777,
"grad_norm": 0.3115678004260849,
"learning_rate": 8.845840726304723e-06,
"loss": 0.2297,
"step": 1403
},
{
"epoch": 3.593090211132438,
"grad_norm": 0.3441565775702763,
"learning_rate": 8.816141930585067e-06,
"loss": 0.2542,
"step": 1404
},
{
"epoch": 3.5956493921944976,
"grad_norm": 0.30703493196426435,
"learning_rate": 8.786478972310023e-06,
"loss": 0.2342,
"step": 1405
},
{
"epoch": 3.598208573256558,
"grad_norm": 0.31549384345069187,
"learning_rate": 8.756851946531294e-06,
"loss": 0.247,
"step": 1406
},
{
"epoch": 3.600767754318618,
"grad_norm": 0.29102001056426585,
"learning_rate": 8.72726094818541e-06,
"loss": 0.2074,
"step": 1407
},
{
"epoch": 3.6033269353806783,
"grad_norm": 0.33490062048588876,
"learning_rate": 8.697706072093493e-06,
"loss": 0.2541,
"step": 1408
},
{
"epoch": 3.6058861164427385,
"grad_norm": 0.3124637481108502,
"learning_rate": 8.668187412960887e-06,
"loss": 0.2437,
"step": 1409
},
{
"epoch": 3.6084452975047983,
"grad_norm": 0.3113279052119832,
"learning_rate": 8.638705065376887e-06,
"loss": 0.2389,
"step": 1410
},
{
"epoch": 3.6110044785668585,
"grad_norm": 0.3194418676306725,
"learning_rate": 8.609259123814443e-06,
"loss": 0.2549,
"step": 1411
},
{
"epoch": 3.6135636596289187,
"grad_norm": 0.28910975717043763,
"learning_rate": 8.579849682629844e-06,
"loss": 0.2246,
"step": 1412
},
{
"epoch": 3.616122840690979,
"grad_norm": 0.31307324283444393,
"learning_rate": 8.550476836062419e-06,
"loss": 0.2425,
"step": 1413
},
{
"epoch": 3.618682021753039,
"grad_norm": 0.3040084877742325,
"learning_rate": 8.521140678234214e-06,
"loss": 0.2361,
"step": 1414
},
{
"epoch": 3.6212412028150993,
"grad_norm": 0.3130765747434785,
"learning_rate": 8.491841303149728e-06,
"loss": 0.2272,
"step": 1415
},
{
"epoch": 3.6238003838771595,
"grad_norm": 0.30265698644043443,
"learning_rate": 8.462578804695595e-06,
"loss": 0.2701,
"step": 1416
},
{
"epoch": 3.6263595649392193,
"grad_norm": 0.2987350574519827,
"learning_rate": 8.43335327664027e-06,
"loss": 0.2177,
"step": 1417
},
{
"epoch": 3.6289187460012795,
"grad_norm": 0.3166676881180338,
"learning_rate": 8.404164812633755e-06,
"loss": 0.2756,
"step": 1418
},
{
"epoch": 3.6314779270633397,
"grad_norm": 0.3033675615693007,
"learning_rate": 8.375013506207275e-06,
"loss": 0.2136,
"step": 1419
},
{
"epoch": 3.6340371081254,
"grad_norm": 0.30761103254564687,
"learning_rate": 8.345899450772975e-06,
"loss": 0.2535,
"step": 1420
},
{
"epoch": 3.63659628918746,
"grad_norm": 0.2982029035930307,
"learning_rate": 8.316822739623662e-06,
"loss": 0.2165,
"step": 1421
},
{
"epoch": 3.63915547024952,
"grad_norm": 0.31969771781374523,
"learning_rate": 8.287783465932466e-06,
"loss": 0.257,
"step": 1422
},
{
"epoch": 3.64171465131158,
"grad_norm": 0.2837299669230536,
"learning_rate": 8.258781722752535e-06,
"loss": 0.224,
"step": 1423
},
{
"epoch": 3.6442738323736403,
"grad_norm": 0.3012916036117272,
"learning_rate": 8.229817603016786e-06,
"loss": 0.2246,
"step": 1424
},
{
"epoch": 3.6468330134357005,
"grad_norm": 0.31189818144182524,
"learning_rate": 8.200891199537549e-06,
"loss": 0.2695,
"step": 1425
},
{
"epoch": 3.6493921944977608,
"grad_norm": 0.2997238939361931,
"learning_rate": 8.1720026050063e-06,
"loss": 0.1862,
"step": 1426
},
{
"epoch": 3.651951375559821,
"grad_norm": 0.34075811134574374,
"learning_rate": 8.143151911993374e-06,
"loss": 0.2619,
"step": 1427
},
{
"epoch": 3.654510556621881,
"grad_norm": 0.3146837641261533,
"learning_rate": 8.114339212947655e-06,
"loss": 0.2396,
"step": 1428
},
{
"epoch": 3.6570697376839414,
"grad_norm": 0.325379060971747,
"learning_rate": 8.085564600196258e-06,
"loss": 0.2435,
"step": 1429
},
{
"epoch": 3.659628918746001,
"grad_norm": 0.33089426856010606,
"learning_rate": 8.056828165944282e-06,
"loss": 0.2459,
"step": 1430
},
{
"epoch": 3.6621880998080614,
"grad_norm": 0.3111195583478781,
"learning_rate": 8.028130002274459e-06,
"loss": 0.2328,
"step": 1431
},
{
"epoch": 3.6647472808701216,
"grad_norm": 0.2917405967267679,
"learning_rate": 7.999470201146915e-06,
"loss": 0.2273,
"step": 1432
},
{
"epoch": 3.667306461932182,
"grad_norm": 0.3072708256616894,
"learning_rate": 7.970848854398825e-06,
"loss": 0.2616,
"step": 1433
},
{
"epoch": 3.669865642994242,
"grad_norm": 0.3274866801130236,
"learning_rate": 7.942266053744155e-06,
"loss": 0.2469,
"step": 1434
},
{
"epoch": 3.6724248240563018,
"grad_norm": 0.3206452949252231,
"learning_rate": 7.913721890773354e-06,
"loss": 0.2265,
"step": 1435
},
{
"epoch": 3.674984005118362,
"grad_norm": 0.3076598617812908,
"learning_rate": 7.885216456953053e-06,
"loss": 0.2167,
"step": 1436
},
{
"epoch": 3.677543186180422,
"grad_norm": 0.2979466544850749,
"learning_rate": 7.856749843625777e-06,
"loss": 0.2203,
"step": 1437
},
{
"epoch": 3.6801023672424824,
"grad_norm": 0.3224437385684691,
"learning_rate": 7.828322142009672e-06,
"loss": 0.2473,
"step": 1438
},
{
"epoch": 3.6826615483045426,
"grad_norm": 0.31485417513081154,
"learning_rate": 7.799933443198173e-06,
"loss": 0.2606,
"step": 1439
},
{
"epoch": 3.685220729366603,
"grad_norm": 0.3243639601134415,
"learning_rate": 7.771583838159756e-06,
"loss": 0.2633,
"step": 1440
},
{
"epoch": 3.687779910428663,
"grad_norm": 0.32893510730247094,
"learning_rate": 7.743273417737617e-06,
"loss": 0.2531,
"step": 1441
},
{
"epoch": 3.690339091490723,
"grad_norm": 0.31189557871340884,
"learning_rate": 7.715002272649388e-06,
"loss": 0.2403,
"step": 1442
},
{
"epoch": 3.692898272552783,
"grad_norm": 0.2901780629695267,
"learning_rate": 7.686770493486835e-06,
"loss": 0.2517,
"step": 1443
},
{
"epoch": 3.6954574536148432,
"grad_norm": 0.3159906947616901,
"learning_rate": 7.65857817071561e-06,
"loss": 0.2492,
"step": 1444
},
{
"epoch": 3.6980166346769034,
"grad_norm": 0.3098159857766409,
"learning_rate": 7.630425394674903e-06,
"loss": 0.2341,
"step": 1445
},
{
"epoch": 3.7005758157389637,
"grad_norm": 0.31575476226888965,
"learning_rate": 7.602312255577193e-06,
"loss": 0.2416,
"step": 1446
},
{
"epoch": 3.7031349968010234,
"grad_norm": 0.31087089571752347,
"learning_rate": 7.574238843507957e-06,
"loss": 0.2673,
"step": 1447
},
{
"epoch": 3.7056941778630836,
"grad_norm": 0.3209375688192084,
"learning_rate": 7.546205248425353e-06,
"loss": 0.2313,
"step": 1448
},
{
"epoch": 3.708253358925144,
"grad_norm": 0.30680259294814516,
"learning_rate": 7.518211560159949e-06,
"loss": 0.2187,
"step": 1449
},
{
"epoch": 3.710812539987204,
"grad_norm": 0.3064495121568587,
"learning_rate": 7.49025786841445e-06,
"loss": 0.2161,
"step": 1450
},
{
"epoch": 3.7133717210492643,
"grad_norm": 0.29214767577744066,
"learning_rate": 7.462344262763399e-06,
"loss": 0.2339,
"step": 1451
},
{
"epoch": 3.7159309021113245,
"grad_norm": 0.3146911137467561,
"learning_rate": 7.434470832652865e-06,
"loss": 0.2464,
"step": 1452
},
{
"epoch": 3.7184900831733847,
"grad_norm": 0.3499806012960674,
"learning_rate": 7.406637667400205e-06,
"loss": 0.2246,
"step": 1453
},
{
"epoch": 3.721049264235445,
"grad_norm": 0.30724891177758956,
"learning_rate": 7.378844856193736e-06,
"loss": 0.272,
"step": 1454
},
{
"epoch": 3.7236084452975047,
"grad_norm": 0.29711759225447126,
"learning_rate": 7.3510924880924575e-06,
"loss": 0.2205,
"step": 1455
},
{
"epoch": 3.726167626359565,
"grad_norm": 0.31114069148352147,
"learning_rate": 7.323380652025794e-06,
"loss": 0.2619,
"step": 1456
},
{
"epoch": 3.728726807421625,
"grad_norm": 0.3019555109712794,
"learning_rate": 7.295709436793284e-06,
"loss": 0.2526,
"step": 1457
},
{
"epoch": 3.7312859884836853,
"grad_norm": 0.30587326696750855,
"learning_rate": 7.268078931064293e-06,
"loss": 0.2156,
"step": 1458
},
{
"epoch": 3.7338451695457455,
"grad_norm": 0.3010387774085701,
"learning_rate": 7.2404892233777334e-06,
"loss": 0.2343,
"step": 1459
},
{
"epoch": 3.7364043506078053,
"grad_norm": 0.30265924975544334,
"learning_rate": 7.212940402141808e-06,
"loss": 0.2542,
"step": 1460
},
{
"epoch": 3.7389635316698655,
"grad_norm": 0.2899279021405336,
"learning_rate": 7.185432555633672e-06,
"loss": 0.2263,
"step": 1461
},
{
"epoch": 3.7415227127319257,
"grad_norm": 0.3197841686852777,
"learning_rate": 7.1579657719992045e-06,
"loss": 0.2665,
"step": 1462
},
{
"epoch": 3.744081893793986,
"grad_norm": 0.27978040220930955,
"learning_rate": 7.130540139252704e-06,
"loss": 0.2338,
"step": 1463
},
{
"epoch": 3.746641074856046,
"grad_norm": 0.31288929736620646,
"learning_rate": 7.1031557452765934e-06,
"loss": 0.2372,
"step": 1464
},
{
"epoch": 3.7492002559181064,
"grad_norm": 0.3098001081263227,
"learning_rate": 7.075812677821145e-06,
"loss": 0.2221,
"step": 1465
},
{
"epoch": 3.7517594369801666,
"grad_norm": 0.29645463690795515,
"learning_rate": 7.048511024504223e-06,
"loss": 0.2439,
"step": 1466
},
{
"epoch": 3.7543186180422263,
"grad_norm": 0.30305748324499643,
"learning_rate": 7.021250872810983e-06,
"loss": 0.2447,
"step": 1467
},
{
"epoch": 3.7568777991042865,
"grad_norm": 0.3180001382034292,
"learning_rate": 6.9940323100935725e-06,
"loss": 0.2455,
"step": 1468
},
{
"epoch": 3.7594369801663468,
"grad_norm": 0.30172748918488546,
"learning_rate": 6.966855423570898e-06,
"loss": 0.2319,
"step": 1469
},
{
"epoch": 3.761996161228407,
"grad_norm": 0.30342551799145645,
"learning_rate": 6.939720300328303e-06,
"loss": 0.2283,
"step": 1470
},
{
"epoch": 3.764555342290467,
"grad_norm": 0.31295405822100386,
"learning_rate": 6.9126270273173e-06,
"loss": 0.2361,
"step": 1471
},
{
"epoch": 3.767114523352527,
"grad_norm": 0.2970422267078146,
"learning_rate": 6.885575691355315e-06,
"loss": 0.1965,
"step": 1472
},
{
"epoch": 3.769673704414587,
"grad_norm": 0.32398586323455586,
"learning_rate": 6.858566379125389e-06,
"loss": 0.2661,
"step": 1473
},
{
"epoch": 3.7722328854766474,
"grad_norm": 0.28922597868223626,
"learning_rate": 6.831599177175879e-06,
"loss": 0.222,
"step": 1474
},
{
"epoch": 3.7747920665387076,
"grad_norm": 0.296927401509587,
"learning_rate": 6.8046741719202385e-06,
"loss": 0.2262,
"step": 1475
},
{
"epoch": 3.777351247600768,
"grad_norm": 0.318579910031184,
"learning_rate": 6.777791449636681e-06,
"loss": 0.2455,
"step": 1476
},
{
"epoch": 3.779910428662828,
"grad_norm": 0.30482817177680954,
"learning_rate": 6.7509510964679305e-06,
"loss": 0.2376,
"step": 1477
},
{
"epoch": 3.782469609724888,
"grad_norm": 0.3043184575100951,
"learning_rate": 6.724153198420957e-06,
"loss": 0.2508,
"step": 1478
},
{
"epoch": 3.7850287907869484,
"grad_norm": 0.29471146448676344,
"learning_rate": 6.697397841366686e-06,
"loss": 0.2219,
"step": 1479
},
{
"epoch": 3.787587971849008,
"grad_norm": 0.2763016151280136,
"learning_rate": 6.67068511103971e-06,
"loss": 0.2092,
"step": 1480
},
{
"epoch": 3.7901471529110684,
"grad_norm": 0.29319933516975266,
"learning_rate": 6.644015093038049e-06,
"loss": 0.2042,
"step": 1481
},
{
"epoch": 3.7927063339731286,
"grad_norm": 0.3160040922162459,
"learning_rate": 6.617387872822842e-06,
"loss": 0.2269,
"step": 1482
},
{
"epoch": 3.795265515035189,
"grad_norm": 0.31249964016432036,
"learning_rate": 6.590803535718082e-06,
"loss": 0.2841,
"step": 1483
},
{
"epoch": 3.797824696097249,
"grad_norm": 0.29901036986186935,
"learning_rate": 6.564262166910367e-06,
"loss": 0.2096,
"step": 1484
},
{
"epoch": 3.800383877159309,
"grad_norm": 0.3151541438953574,
"learning_rate": 6.537763851448593e-06,
"loss": 0.2215,
"step": 1485
},
{
"epoch": 3.802943058221369,
"grad_norm": 0.3238104601226396,
"learning_rate": 6.511308674243711e-06,
"loss": 0.2493,
"step": 1486
},
{
"epoch": 3.8055022392834292,
"grad_norm": 0.30561591970597685,
"learning_rate": 6.484896720068421e-06,
"loss": 0.238,
"step": 1487
},
{
"epoch": 3.8080614203454894,
"grad_norm": 0.28920704087260063,
"learning_rate": 6.458528073556925e-06,
"loss": 0.2685,
"step": 1488
},
{
"epoch": 3.8106206014075497,
"grad_norm": 0.3038589031703289,
"learning_rate": 6.432202819204667e-06,
"loss": 0.248,
"step": 1489
},
{
"epoch": 3.81317978246961,
"grad_norm": 0.3385452502303158,
"learning_rate": 6.4059210413680175e-06,
"loss": 0.2503,
"step": 1490
},
{
"epoch": 3.81573896353167,
"grad_norm": 0.2768232976810782,
"learning_rate": 6.379682824264055e-06,
"loss": 0.2164,
"step": 1491
},
{
"epoch": 3.81829814459373,
"grad_norm": 0.3009349939503682,
"learning_rate": 6.353488251970275e-06,
"loss": 0.2366,
"step": 1492
},
{
"epoch": 3.82085732565579,
"grad_norm": 0.2928819289728828,
"learning_rate": 6.327337408424281e-06,
"loss": 0.2332,
"step": 1493
},
{
"epoch": 3.8234165067178503,
"grad_norm": 0.2870974660694633,
"learning_rate": 6.301230377423595e-06,
"loss": 0.2,
"step": 1494
},
{
"epoch": 3.8259756877799105,
"grad_norm": 0.3081786003102801,
"learning_rate": 6.275167242625331e-06,
"loss": 0.2414,
"step": 1495
},
{
"epoch": 3.8285348688419707,
"grad_norm": 0.2790089490684267,
"learning_rate": 6.2491480875459336e-06,
"loss": 0.215,
"step": 1496
},
{
"epoch": 3.8310940499040305,
"grad_norm": 0.29306281039648446,
"learning_rate": 6.223172995560935e-06,
"loss": 0.2679,
"step": 1497
},
{
"epoch": 3.8336532309660907,
"grad_norm": 0.2762204818599091,
"learning_rate": 6.1972420499046635e-06,
"loss": 0.2192,
"step": 1498
},
{
"epoch": 3.836212412028151,
"grad_norm": 0.29828532304173333,
"learning_rate": 6.171355333669973e-06,
"loss": 0.2441,
"step": 1499
},
{
"epoch": 3.838771593090211,
"grad_norm": 0.32131992187351277,
"learning_rate": 6.145512929808013e-06,
"loss": 0.229,
"step": 1500
},
{
"epoch": 3.8413307741522713,
"grad_norm": 0.3043046372632,
"learning_rate": 6.119714921127933e-06,
"loss": 0.2694,
"step": 1501
},
{
"epoch": 3.8438899552143315,
"grad_norm": 0.2717132569091513,
"learning_rate": 6.093961390296603e-06,
"loss": 0.2254,
"step": 1502
},
{
"epoch": 3.8464491362763917,
"grad_norm": 0.30029687863634635,
"learning_rate": 6.068252419838399e-06,
"loss": 0.2326,
"step": 1503
},
{
"epoch": 3.849008317338452,
"grad_norm": 0.27288392014891966,
"learning_rate": 6.042588092134878e-06,
"loss": 0.2163,
"step": 1504
},
{
"epoch": 3.8515674984005117,
"grad_norm": 0.283307260045485,
"learning_rate": 6.016968489424572e-06,
"loss": 0.2312,
"step": 1505
},
{
"epoch": 3.854126679462572,
"grad_norm": 0.2919987165904583,
"learning_rate": 5.991393693802674e-06,
"loss": 0.2533,
"step": 1506
},
{
"epoch": 3.856685860524632,
"grad_norm": 0.2968617574713475,
"learning_rate": 5.96586378722081e-06,
"loss": 0.2397,
"step": 1507
},
{
"epoch": 3.8592450415866923,
"grad_norm": 0.28294719312430794,
"learning_rate": 5.940378851486766e-06,
"loss": 0.2302,
"step": 1508
},
{
"epoch": 3.8618042226487526,
"grad_norm": 0.2885647154013107,
"learning_rate": 5.9149389682642165e-06,
"loss": 0.2429,
"step": 1509
},
{
"epoch": 3.8643634037108123,
"grad_norm": 0.29335993884299777,
"learning_rate": 5.889544219072465e-06,
"loss": 0.2347,
"step": 1510
},
{
"epoch": 3.8669225847728725,
"grad_norm": 0.30407531909262675,
"learning_rate": 5.864194685286206e-06,
"loss": 0.2405,
"step": 1511
},
{
"epoch": 3.8694817658349328,
"grad_norm": 0.29889145730228683,
"learning_rate": 5.838890448135228e-06,
"loss": 0.2373,
"step": 1512
},
{
"epoch": 3.872040946896993,
"grad_norm": 0.3002231215499318,
"learning_rate": 5.81363158870418e-06,
"loss": 0.2316,
"step": 1513
},
{
"epoch": 3.874600127959053,
"grad_norm": 0.3246106606552585,
"learning_rate": 5.788418187932314e-06,
"loss": 0.2365,
"step": 1514
},
{
"epoch": 3.8771593090211134,
"grad_norm": 0.29292707103941984,
"learning_rate": 5.7632503266131925e-06,
"loss": 0.2087,
"step": 1515
},
{
"epoch": 3.8797184900831736,
"grad_norm": 0.3255584037433501,
"learning_rate": 5.7381280853944585e-06,
"loss": 0.2807,
"step": 1516
},
{
"epoch": 3.8822776711452334,
"grad_norm": 0.2912419112819368,
"learning_rate": 5.713051544777584e-06,
"loss": 0.2218,
"step": 1517
},
{
"epoch": 3.8848368522072936,
"grad_norm": 0.3048580778970364,
"learning_rate": 5.688020785117581e-06,
"loss": 0.2753,
"step": 1518
},
{
"epoch": 3.887396033269354,
"grad_norm": 0.29167337976733226,
"learning_rate": 5.66303588662277e-06,
"loss": 0.2329,
"step": 1519
},
{
"epoch": 3.889955214331414,
"grad_norm": 0.2992142371051243,
"learning_rate": 5.638096929354522e-06,
"loss": 0.2268,
"step": 1520
},
{
"epoch": 3.892514395393474,
"grad_norm": 0.3027423583297044,
"learning_rate": 5.613203993226981e-06,
"loss": 0.221,
"step": 1521
},
{
"epoch": 3.895073576455534,
"grad_norm": 0.3021686605260421,
"learning_rate": 5.588357158006821e-06,
"loss": 0.252,
"step": 1522
},
{
"epoch": 3.897632757517594,
"grad_norm": 0.2996327213886315,
"learning_rate": 5.563556503312997e-06,
"loss": 0.2318,
"step": 1523
},
{
"epoch": 3.9001919385796544,
"grad_norm": 0.2957035292429405,
"learning_rate": 5.538802108616494e-06,
"loss": 0.239,
"step": 1524
},
{
"epoch": 3.9027511196417146,
"grad_norm": 0.309492243720712,
"learning_rate": 5.514094053240035e-06,
"loss": 0.2228,
"step": 1525
},
{
"epoch": 3.905310300703775,
"grad_norm": 0.6416163470606975,
"learning_rate": 5.489432416357885e-06,
"loss": 0.2326,
"step": 1526
},
{
"epoch": 3.907869481765835,
"grad_norm": 0.297847179245253,
"learning_rate": 5.46481727699554e-06,
"loss": 0.2346,
"step": 1527
},
{
"epoch": 3.9104286628278953,
"grad_norm": 0.30350301665082174,
"learning_rate": 5.440248714029508e-06,
"loss": 0.2478,
"step": 1528
},
{
"epoch": 3.9129878438899555,
"grad_norm": 0.29944760683896515,
"learning_rate": 5.415726806187052e-06,
"loss": 0.2306,
"step": 1529
},
{
"epoch": 3.9155470249520152,
"grad_norm": 0.28720986057591197,
"learning_rate": 5.39125163204594e-06,
"loss": 0.2057,
"step": 1530
},
{
"epoch": 3.9181062060140754,
"grad_norm": 0.2868190209794002,
"learning_rate": 5.3668232700341735e-06,
"loss": 0.2278,
"step": 1531
},
{
"epoch": 3.9206653870761357,
"grad_norm": 0.3118363419035033,
"learning_rate": 5.342441798429747e-06,
"loss": 0.2518,
"step": 1532
},
{
"epoch": 3.923224568138196,
"grad_norm": 0.27782593526187427,
"learning_rate": 5.318107295360424e-06,
"loss": 0.2334,
"step": 1533
},
{
"epoch": 3.925783749200256,
"grad_norm": 0.29231670514977803,
"learning_rate": 5.293819838803429e-06,
"loss": 0.2198,
"step": 1534
},
{
"epoch": 3.928342930262316,
"grad_norm": 0.30148858674026124,
"learning_rate": 5.269579506585259e-06,
"loss": 0.2291,
"step": 1535
},
{
"epoch": 3.930902111324376,
"grad_norm": 0.2942793998750714,
"learning_rate": 5.245386376381398e-06,
"loss": 0.2235,
"step": 1536
},
{
"epoch": 3.9334612923864363,
"grad_norm": 0.30230762629355007,
"learning_rate": 5.221240525716071e-06,
"loss": 0.2182,
"step": 1537
},
{
"epoch": 3.9360204734484965,
"grad_norm": 0.2838869405763987,
"learning_rate": 5.197142031961999e-06,
"loss": 0.2531,
"step": 1538
},
{
"epoch": 3.9385796545105567,
"grad_norm": 0.284590980219594,
"learning_rate": 5.17309097234016e-06,
"loss": 0.2235,
"step": 1539
},
{
"epoch": 3.941138835572617,
"grad_norm": 0.2805331306681004,
"learning_rate": 5.149087423919541e-06,
"loss": 0.1941,
"step": 1540
},
{
"epoch": 3.943698016634677,
"grad_norm": 0.3227111819969921,
"learning_rate": 5.125131463616863e-06,
"loss": 0.2598,
"step": 1541
},
{
"epoch": 3.946257197696737,
"grad_norm": 0.3095815791809101,
"learning_rate": 5.101223168196381e-06,
"loss": 0.26,
"step": 1542
},
{
"epoch": 3.948816378758797,
"grad_norm": 0.28306994750627523,
"learning_rate": 5.077362614269599e-06,
"loss": 0.2214,
"step": 1543
},
{
"epoch": 3.9513755598208573,
"grad_norm": 0.30833267171559586,
"learning_rate": 5.05354987829503e-06,
"loss": 0.2473,
"step": 1544
},
{
"epoch": 3.9539347408829175,
"grad_norm": 0.29154031796148494,
"learning_rate": 5.029785036577976e-06,
"loss": 0.231,
"step": 1545
},
{
"epoch": 3.9564939219449777,
"grad_norm": 0.3230057370683127,
"learning_rate": 5.0060681652702745e-06,
"loss": 0.2538,
"step": 1546
},
{
"epoch": 3.9590531030070375,
"grad_norm": 0.28553506909372656,
"learning_rate": 4.982399340370017e-06,
"loss": 0.231,
"step": 1547
},
{
"epoch": 3.9616122840690977,
"grad_norm": 0.30170947130683173,
"learning_rate": 4.958778637721364e-06,
"loss": 0.2454,
"step": 1548
},
{
"epoch": 3.964171465131158,
"grad_norm": 0.28267032534811537,
"learning_rate": 4.935206133014259e-06,
"loss": 0.2417,
"step": 1549
},
{
"epoch": 3.966730646193218,
"grad_norm": 0.2954487998931121,
"learning_rate": 4.911681901784198e-06,
"loss": 0.2319,
"step": 1550
},
{
"epoch": 3.9692898272552783,
"grad_norm": 0.32228832699706966,
"learning_rate": 4.8882060194119985e-06,
"loss": 0.2282,
"step": 1551
},
{
"epoch": 3.9718490083173386,
"grad_norm": 0.34818873755980667,
"learning_rate": 4.864778561123555e-06,
"loss": 0.2718,
"step": 1552
},
{
"epoch": 3.9744081893793988,
"grad_norm": 0.27407731345565545,
"learning_rate": 4.841399601989574e-06,
"loss": 0.2039,
"step": 1553
},
{
"epoch": 3.976967370441459,
"grad_norm": 0.2749332266585077,
"learning_rate": 4.8180692169253714e-06,
"loss": 0.2181,
"step": 1554
},
{
"epoch": 3.9795265515035187,
"grad_norm": 0.3010112395338718,
"learning_rate": 4.794787480690597e-06,
"loss": 0.2232,
"step": 1555
},
{
"epoch": 3.982085732565579,
"grad_norm": 0.29171650447929676,
"learning_rate": 4.771554467889012e-06,
"loss": 0.2391,
"step": 1556
},
{
"epoch": 3.984644913627639,
"grad_norm": 0.2906873549139878,
"learning_rate": 4.74837025296826e-06,
"loss": 0.2297,
"step": 1557
},
{
"epoch": 3.9872040946896994,
"grad_norm": 0.3106548346493187,
"learning_rate": 4.725234910219609e-06,
"loss": 0.2564,
"step": 1558
},
{
"epoch": 3.9897632757517596,
"grad_norm": 0.29178940946369886,
"learning_rate": 4.702148513777731e-06,
"loss": 0.2457,
"step": 1559
},
{
"epoch": 3.9923224568138194,
"grad_norm": 0.290141222179049,
"learning_rate": 4.679111137620442e-06,
"loss": 0.2007,
"step": 1560
},
{
"epoch": 3.9948816378758796,
"grad_norm": 0.3078661741665255,
"learning_rate": 4.656122855568477e-06,
"loss": 0.2416,
"step": 1561
},
{
"epoch": 3.99744081893794,
"grad_norm": 0.28827757162372486,
"learning_rate": 4.63318374128527e-06,
"loss": 0.2416,
"step": 1562
},
{
"epoch": 4.0,
"grad_norm": 0.36059866962884213,
"learning_rate": 4.610293868276681e-06,
"loss": 0.286,
"step": 1563
},
{
"epoch": 4.00255918106206,
"grad_norm": 0.4806949756871284,
"learning_rate": 4.587453309890804e-06,
"loss": 0.1829,
"step": 1564
},
{
"epoch": 4.00511836212412,
"grad_norm": 0.4071198782663298,
"learning_rate": 4.5646621393177e-06,
"loss": 0.2002,
"step": 1565
},
{
"epoch": 4.007677543186181,
"grad_norm": 0.29904364596653477,
"learning_rate": 4.541920429589168e-06,
"loss": 0.1689,
"step": 1566
},
{
"epoch": 4.010236724248241,
"grad_norm": 0.319382339625052,
"learning_rate": 4.519228253578514e-06,
"loss": 0.162,
"step": 1567
},
{
"epoch": 4.012795905310301,
"grad_norm": 0.44482796159027516,
"learning_rate": 4.496585684000332e-06,
"loss": 0.1905,
"step": 1568
},
{
"epoch": 4.015355086372361,
"grad_norm": 0.4881520882605733,
"learning_rate": 4.47399279341024e-06,
"loss": 0.1883,
"step": 1569
},
{
"epoch": 4.017914267434421,
"grad_norm": 0.39543877818036155,
"learning_rate": 4.451449654204685e-06,
"loss": 0.1792,
"step": 1570
},
{
"epoch": 4.020473448496481,
"grad_norm": 0.33428133649360503,
"learning_rate": 4.428956338620671e-06,
"loss": 0.1549,
"step": 1571
},
{
"epoch": 4.023032629558541,
"grad_norm": 0.39094054957770324,
"learning_rate": 4.406512918735555e-06,
"loss": 0.168,
"step": 1572
},
{
"epoch": 4.025591810620601,
"grad_norm": 0.41624590579275506,
"learning_rate": 4.384119466466816e-06,
"loss": 0.1546,
"step": 1573
},
{
"epoch": 4.028150991682661,
"grad_norm": 0.41050595250529065,
"learning_rate": 4.361776053571816e-06,
"loss": 0.1553,
"step": 1574
},
{
"epoch": 4.030710172744722,
"grad_norm": 0.338525098415926,
"learning_rate": 4.339482751647557e-06,
"loss": 0.1672,
"step": 1575
},
{
"epoch": 4.033269353806782,
"grad_norm": 0.3227337964991407,
"learning_rate": 4.317239632130485e-06,
"loss": 0.1694,
"step": 1576
},
{
"epoch": 4.035828534868842,
"grad_norm": 0.3126102999935797,
"learning_rate": 4.295046766296224e-06,
"loss": 0.1652,
"step": 1577
},
{
"epoch": 4.038387715930902,
"grad_norm": 0.3419323105009614,
"learning_rate": 4.272904225259387e-06,
"loss": 0.1643,
"step": 1578
},
{
"epoch": 4.0409468969929625,
"grad_norm": 0.35539152039667277,
"learning_rate": 4.250812079973301e-06,
"loss": 0.1693,
"step": 1579
},
{
"epoch": 4.043506078055023,
"grad_norm": 0.34150875416867105,
"learning_rate": 4.228770401229824e-06,
"loss": 0.1676,
"step": 1580
},
{
"epoch": 4.046065259117083,
"grad_norm": 0.31333649874909303,
"learning_rate": 4.206779259659102e-06,
"loss": 0.1837,
"step": 1581
},
{
"epoch": 4.048624440179142,
"grad_norm": 0.29497085250511934,
"learning_rate": 4.184838725729326e-06,
"loss": 0.1606,
"step": 1582
},
{
"epoch": 4.0511836212412025,
"grad_norm": 0.28865884769293576,
"learning_rate": 4.1629488697465195e-06,
"loss": 0.1701,
"step": 1583
},
{
"epoch": 4.053742802303263,
"grad_norm": 0.30617690195804087,
"learning_rate": 4.141109761854332e-06,
"loss": 0.1586,
"step": 1584
},
{
"epoch": 4.056301983365323,
"grad_norm": 0.32345308536745493,
"learning_rate": 4.119321472033779e-06,
"loss": 0.1787,
"step": 1585
},
{
"epoch": 4.058861164427383,
"grad_norm": 0.29444459546640134,
"learning_rate": 4.097584070103042e-06,
"loss": 0.153,
"step": 1586
},
{
"epoch": 4.061420345489443,
"grad_norm": 0.28258351174881763,
"learning_rate": 4.075897625717249e-06,
"loss": 0.1593,
"step": 1587
},
{
"epoch": 4.0639795265515035,
"grad_norm": 0.3000401537763792,
"learning_rate": 4.054262208368216e-06,
"loss": 0.1805,
"step": 1588
},
{
"epoch": 4.066538707613564,
"grad_norm": 0.31652390372720957,
"learning_rate": 4.032677887384262e-06,
"loss": 0.1702,
"step": 1589
},
{
"epoch": 4.069097888675624,
"grad_norm": 0.3228892663985873,
"learning_rate": 4.011144731929981e-06,
"loss": 0.1913,
"step": 1590
},
{
"epoch": 4.071657069737684,
"grad_norm": 0.31373728199242906,
"learning_rate": 3.989662811005992e-06,
"loss": 0.1727,
"step": 1591
},
{
"epoch": 4.074216250799744,
"grad_norm": 0.30059854075379616,
"learning_rate": 3.96823219344876e-06,
"loss": 0.2085,
"step": 1592
},
{
"epoch": 4.076775431861805,
"grad_norm": 0.29253073860156936,
"learning_rate": 3.9468529479303445e-06,
"loss": 0.1746,
"step": 1593
},
{
"epoch": 4.079334612923865,
"grad_norm": 0.2925598790253486,
"learning_rate": 3.925525142958189e-06,
"loss": 0.1949,
"step": 1594
},
{
"epoch": 4.081893793985924,
"grad_norm": 0.29501284249932824,
"learning_rate": 3.904248846874894e-06,
"loss": 0.1665,
"step": 1595
},
{
"epoch": 4.084452975047984,
"grad_norm": 0.3128356519747454,
"learning_rate": 3.883024127858017e-06,
"loss": 0.1725,
"step": 1596
},
{
"epoch": 4.0870121561100445,
"grad_norm": 0.2888374275584094,
"learning_rate": 3.861851053919847e-06,
"loss": 0.1873,
"step": 1597
},
{
"epoch": 4.089571337172105,
"grad_norm": 0.28170591176204707,
"learning_rate": 3.840729692907164e-06,
"loss": 0.1789,
"step": 1598
},
{
"epoch": 4.092130518234165,
"grad_norm": 0.2851212955027544,
"learning_rate": 3.819660112501053e-06,
"loss": 0.1587,
"step": 1599
},
{
"epoch": 4.094689699296225,
"grad_norm": 0.2794461018913548,
"learning_rate": 3.7986423802166705e-06,
"loss": 0.1564,
"step": 1600
},
{
"epoch": 4.097248880358285,
"grad_norm": 0.29566827038245036,
"learning_rate": 3.7776765634030234e-06,
"loss": 0.1636,
"step": 1601
},
{
"epoch": 4.099808061420346,
"grad_norm": 0.2933200780850988,
"learning_rate": 3.756762729242773e-06,
"loss": 0.1991,
"step": 1602
},
{
"epoch": 4.102367242482406,
"grad_norm": 0.2761865132545396,
"learning_rate": 3.7359009447520112e-06,
"loss": 0.165,
"step": 1603
},
{
"epoch": 4.104926423544466,
"grad_norm": 0.2661987317855668,
"learning_rate": 3.715091276780023e-06,
"loss": 0.1897,
"step": 1604
},
{
"epoch": 4.107485604606526,
"grad_norm": 0.2898297360177148,
"learning_rate": 3.694333792009115e-06,
"loss": 0.1967,
"step": 1605
},
{
"epoch": 4.110044785668586,
"grad_norm": 0.30088937431780177,
"learning_rate": 3.6736285569543585e-06,
"loss": 0.1705,
"step": 1606
},
{
"epoch": 4.112603966730646,
"grad_norm": 0.3076179252763136,
"learning_rate": 3.652975637963401e-06,
"loss": 0.1865,
"step": 1607
},
{
"epoch": 4.115163147792706,
"grad_norm": 0.2666186074605756,
"learning_rate": 3.632375101216259e-06,
"loss": 0.1804,
"step": 1608
},
{
"epoch": 4.117722328854766,
"grad_norm": 0.264680141837875,
"learning_rate": 3.6118270127250954e-06,
"loss": 0.139,
"step": 1609
},
{
"epoch": 4.120281509916826,
"grad_norm": 0.26713666643272443,
"learning_rate": 3.5913314383339937e-06,
"loss": 0.1533,
"step": 1610
},
{
"epoch": 4.122840690978887,
"grad_norm": 0.28782921605361467,
"learning_rate": 3.5708884437187673e-06,
"loss": 0.1614,
"step": 1611
},
{
"epoch": 4.125399872040947,
"grad_norm": 0.27318365156538243,
"learning_rate": 3.5504980943867538e-06,
"loss": 0.1868,
"step": 1612
},
{
"epoch": 4.127959053103007,
"grad_norm": 0.27437741262556425,
"learning_rate": 3.53016045567659e-06,
"loss": 0.1634,
"step": 1613
},
{
"epoch": 4.130518234165067,
"grad_norm": 0.27443522507309115,
"learning_rate": 3.509875592757999e-06,
"loss": 0.2041,
"step": 1614
},
{
"epoch": 4.1330774152271275,
"grad_norm": 0.2733625632699625,
"learning_rate": 3.4896435706316e-06,
"loss": 0.1676,
"step": 1615
},
{
"epoch": 4.135636596289188,
"grad_norm": 0.28425889785798236,
"learning_rate": 3.469464454128684e-06,
"loss": 0.1714,
"step": 1616
},
{
"epoch": 4.138195777351248,
"grad_norm": 0.27651054747700926,
"learning_rate": 3.4493383079110054e-06,
"loss": 0.2032,
"step": 1617
},
{
"epoch": 4.140754958413308,
"grad_norm": 0.27095766306492064,
"learning_rate": 3.429265196470599e-06,
"loss": 0.1654,
"step": 1618
},
{
"epoch": 4.143314139475368,
"grad_norm": 0.27829913330629624,
"learning_rate": 3.409245184129546e-06,
"loss": 0.1753,
"step": 1619
},
{
"epoch": 4.145873320537428,
"grad_norm": 0.26642028532837686,
"learning_rate": 3.3892783350397675e-06,
"loss": 0.1605,
"step": 1620
},
{
"epoch": 4.148432501599488,
"grad_norm": 0.2671909078689866,
"learning_rate": 3.369364713182848e-06,
"loss": 0.1546,
"step": 1621
},
{
"epoch": 4.150991682661548,
"grad_norm": 0.2744629320037609,
"learning_rate": 3.349504382369795e-06,
"loss": 0.1606,
"step": 1622
},
{
"epoch": 4.153550863723608,
"grad_norm": 0.2655886430602391,
"learning_rate": 3.329697406240855e-06,
"loss": 0.1802,
"step": 1623
},
{
"epoch": 4.1561100447856685,
"grad_norm": 0.2735670466849154,
"learning_rate": 3.309943848265311e-06,
"loss": 0.1685,
"step": 1624
},
{
"epoch": 4.158669225847729,
"grad_norm": 0.2733935619603659,
"learning_rate": 3.290243771741275e-06,
"loss": 0.1712,
"step": 1625
},
{
"epoch": 4.161228406909789,
"grad_norm": 0.28420268431811874,
"learning_rate": 3.2705972397954655e-06,
"loss": 0.1888,
"step": 1626
},
{
"epoch": 4.163787587971849,
"grad_norm": 0.26985232539202264,
"learning_rate": 3.2510043153830486e-06,
"loss": 0.1877,
"step": 1627
},
{
"epoch": 4.166346769033909,
"grad_norm": 0.2656409578402704,
"learning_rate": 3.231465061287391e-06,
"loss": 0.1844,
"step": 1628
},
{
"epoch": 4.1689059500959695,
"grad_norm": 0.2870535364269227,
"learning_rate": 3.211979540119883e-06,
"loss": 0.1489,
"step": 1629
},
{
"epoch": 4.17146513115803,
"grad_norm": 0.2751438862963445,
"learning_rate": 3.1925478143197418e-06,
"loss": 0.1651,
"step": 1630
},
{
"epoch": 4.17402431222009,
"grad_norm": 0.28504383266802613,
"learning_rate": 3.1731699461537958e-06,
"loss": 0.1809,
"step": 1631
},
{
"epoch": 4.176583493282149,
"grad_norm": 0.28160560698590886,
"learning_rate": 3.153845997716303e-06,
"loss": 0.1608,
"step": 1632
},
{
"epoch": 4.1791426743442095,
"grad_norm": 0.2792655962671141,
"learning_rate": 3.1345760309287264e-06,
"loss": 0.1486,
"step": 1633
},
{
"epoch": 4.18170185540627,
"grad_norm": 0.2835400810692843,
"learning_rate": 3.1153601075395533e-06,
"loss": 0.1742,
"step": 1634
},
{
"epoch": 4.18426103646833,
"grad_norm": 0.27354882443074413,
"learning_rate": 3.0961982891241083e-06,
"loss": 0.1892,
"step": 1635
},
{
"epoch": 4.18682021753039,
"grad_norm": 0.2593652179219975,
"learning_rate": 3.0770906370843234e-06,
"loss": 0.176,
"step": 1636
},
{
"epoch": 4.18937939859245,
"grad_norm": 0.28067593594699425,
"learning_rate": 3.058037212648579e-06,
"loss": 0.1942,
"step": 1637
},
{
"epoch": 4.1919385796545106,
"grad_norm": 0.27960007176651136,
"learning_rate": 3.039038076871481e-06,
"loss": 0.1722,
"step": 1638
},
{
"epoch": 4.194497760716571,
"grad_norm": 0.2784512143122092,
"learning_rate": 3.02009329063367e-06,
"loss": 0.1819,
"step": 1639
},
{
"epoch": 4.197056941778631,
"grad_norm": 0.291087706558686,
"learning_rate": 3.001202914641628e-06,
"loss": 0.1855,
"step": 1640
},
{
"epoch": 4.199616122840691,
"grad_norm": 0.2731191212662854,
"learning_rate": 2.9823670094275e-06,
"loss": 0.1671,
"step": 1641
},
{
"epoch": 4.202175303902751,
"grad_norm": 0.27430309267830155,
"learning_rate": 2.9635856353488645e-06,
"loss": 0.1731,
"step": 1642
},
{
"epoch": 4.204734484964812,
"grad_norm": 0.26629467870174495,
"learning_rate": 2.9448588525885746e-06,
"loss": 0.1845,
"step": 1643
},
{
"epoch": 4.207293666026872,
"grad_norm": 0.2742753809709845,
"learning_rate": 2.9261867211545603e-06,
"loss": 0.1748,
"step": 1644
},
{
"epoch": 4.209852847088931,
"grad_norm": 0.28012849721285554,
"learning_rate": 2.907569300879596e-06,
"loss": 0.1994,
"step": 1645
},
{
"epoch": 4.212412028150991,
"grad_norm": 0.28016509946069285,
"learning_rate": 2.889006651421169e-06,
"loss": 0.1788,
"step": 1646
},
{
"epoch": 4.214971209213052,
"grad_norm": 0.2811917062883201,
"learning_rate": 2.870498832261257e-06,
"loss": 0.1486,
"step": 1647
},
{
"epoch": 4.217530390275112,
"grad_norm": 0.277577622075296,
"learning_rate": 2.85204590270612e-06,
"loss": 0.1832,
"step": 1648
},
{
"epoch": 4.220089571337172,
"grad_norm": 0.27124773514140205,
"learning_rate": 2.8336479218861556e-06,
"loss": 0.1626,
"step": 1649
},
{
"epoch": 4.222648752399232,
"grad_norm": 0.26604650947349917,
"learning_rate": 2.815304948755664e-06,
"loss": 0.1686,
"step": 1650
},
{
"epoch": 4.225207933461292,
"grad_norm": 0.273081026747494,
"learning_rate": 2.7970170420926957e-06,
"loss": 0.1713,
"step": 1651
},
{
"epoch": 4.227767114523353,
"grad_norm": 0.2906700784230057,
"learning_rate": 2.778784260498828e-06,
"loss": 0.1681,
"step": 1652
},
{
"epoch": 4.230326295585413,
"grad_norm": 0.29269194422721684,
"learning_rate": 2.7606066623990145e-06,
"loss": 0.1869,
"step": 1653
},
{
"epoch": 4.232885476647473,
"grad_norm": 0.28591911142148163,
"learning_rate": 2.742484306041373e-06,
"loss": 0.174,
"step": 1654
},
{
"epoch": 4.235444657709533,
"grad_norm": 0.27682074794775224,
"learning_rate": 2.7244172494969978e-06,
"loss": 0.1855,
"step": 1655
},
{
"epoch": 4.2380038387715935,
"grad_norm": 0.302986579035124,
"learning_rate": 2.7064055506597875e-06,
"loss": 0.1641,
"step": 1656
},
{
"epoch": 4.240563019833653,
"grad_norm": 0.2799225480334416,
"learning_rate": 2.688449267246258e-06,
"loss": 0.1923,
"step": 1657
},
{
"epoch": 4.243122200895713,
"grad_norm": 0.38988709458435794,
"learning_rate": 2.6705484567953386e-06,
"loss": 0.2104,
"step": 1658
},
{
"epoch": 4.245681381957773,
"grad_norm": 0.26949955966216177,
"learning_rate": 2.6527031766682142e-06,
"loss": 0.1718,
"step": 1659
},
{
"epoch": 4.248240563019833,
"grad_norm": 0.2788156254648664,
"learning_rate": 2.6349134840481294e-06,
"loss": 0.1711,
"step": 1660
},
{
"epoch": 4.250799744081894,
"grad_norm": 0.2621763225250691,
"learning_rate": 2.6171794359401957e-06,
"loss": 0.1532,
"step": 1661
},
{
"epoch": 4.253358925143954,
"grad_norm": 0.2865555062851034,
"learning_rate": 2.599501089171217e-06,
"loss": 0.1552,
"step": 1662
},
{
"epoch": 4.255918106206014,
"grad_norm": 0.26224793954089864,
"learning_rate": 2.581878500389523e-06,
"loss": 0.1755,
"step": 1663
},
{
"epoch": 4.258477287268074,
"grad_norm": 0.2721003139035216,
"learning_rate": 2.564311726064754e-06,
"loss": 0.1898,
"step": 1664
},
{
"epoch": 4.2610364683301345,
"grad_norm": 0.27269538230220364,
"learning_rate": 2.546800822487714e-06,
"loss": 0.1698,
"step": 1665
},
{
"epoch": 4.263595649392195,
"grad_norm": 0.28827484525986047,
"learning_rate": 2.5293458457701726e-06,
"loss": 0.2087,
"step": 1666
},
{
"epoch": 4.266154830454255,
"grad_norm": 0.27259819354187614,
"learning_rate": 2.5119468518446844e-06,
"loss": 0.18,
"step": 1667
},
{
"epoch": 4.268714011516315,
"grad_norm": 0.28656968871426663,
"learning_rate": 2.494603896464405e-06,
"loss": 0.1818,
"step": 1668
},
{
"epoch": 4.271273192578375,
"grad_norm": 0.28475736704342813,
"learning_rate": 2.47731703520294e-06,
"loss": 0.1888,
"step": 1669
},
{
"epoch": 4.273832373640435,
"grad_norm": 0.2816641273674954,
"learning_rate": 2.4600863234541338e-06,
"loss": 0.186,
"step": 1670
},
{
"epoch": 4.276391554702495,
"grad_norm": 0.2836263153168214,
"learning_rate": 2.4429118164319076e-06,
"loss": 0.1554,
"step": 1671
},
{
"epoch": 4.278950735764555,
"grad_norm": 0.2840636606374706,
"learning_rate": 2.4257935691700897e-06,
"loss": 0.2089,
"step": 1672
},
{
"epoch": 4.281509916826615,
"grad_norm": 0.2745376280727821,
"learning_rate": 2.408731636522217e-06,
"loss": 0.1579,
"step": 1673
},
{
"epoch": 4.2840690978886755,
"grad_norm": 0.2605630953509163,
"learning_rate": 2.3917260731613733e-06,
"loss": 0.1903,
"step": 1674
},
{
"epoch": 4.286628278950736,
"grad_norm": 0.2740285831465967,
"learning_rate": 2.374776933580025e-06,
"loss": 0.1725,
"step": 1675
},
{
"epoch": 4.289187460012796,
"grad_norm": 0.28095336757761535,
"learning_rate": 2.35788427208983e-06,
"loss": 0.1867,
"step": 1676
},
{
"epoch": 4.291746641074856,
"grad_norm": 0.27154745065005187,
"learning_rate": 2.3410481428214602e-06,
"loss": 0.1613,
"step": 1677
},
{
"epoch": 4.294305822136916,
"grad_norm": 0.27345791664879376,
"learning_rate": 2.324268599724451e-06,
"loss": 0.1667,
"step": 1678
},
{
"epoch": 4.296865003198977,
"grad_norm": 0.2732090679685769,
"learning_rate": 2.307545696566997e-06,
"loss": 0.1657,
"step": 1679
},
{
"epoch": 4.299424184261037,
"grad_norm": 0.2697521270279199,
"learning_rate": 2.2908794869358044e-06,
"loss": 0.1897,
"step": 1680
},
{
"epoch": 4.301983365323097,
"grad_norm": 0.27114579546466616,
"learning_rate": 2.274270024235912e-06,
"loss": 0.188,
"step": 1681
},
{
"epoch": 4.304542546385157,
"grad_norm": 0.2729061433296195,
"learning_rate": 2.2577173616905256e-06,
"loss": 0.1595,
"step": 1682
},
{
"epoch": 4.3071017274472165,
"grad_norm": 0.2906131303633033,
"learning_rate": 2.2412215523408266e-06,
"loss": 0.1737,
"step": 1683
},
{
"epoch": 4.309660908509277,
"grad_norm": 0.2768622060092854,
"learning_rate": 2.2247826490458223e-06,
"loss": 0.1796,
"step": 1684
},
{
"epoch": 4.312220089571337,
"grad_norm": 0.27867740126032275,
"learning_rate": 2.2084007044821764e-06,
"loss": 0.1565,
"step": 1685
},
{
"epoch": 4.314779270633397,
"grad_norm": 0.2723637032411945,
"learning_rate": 2.1920757711440354e-06,
"loss": 0.1756,
"step": 1686
},
{
"epoch": 4.317338451695457,
"grad_norm": 0.2676292277176362,
"learning_rate": 2.1758079013428435e-06,
"loss": 0.1683,
"step": 1687
},
{
"epoch": 4.319897632757518,
"grad_norm": 0.29143070017187106,
"learning_rate": 2.159597147207213e-06,
"loss": 0.1697,
"step": 1688
},
{
"epoch": 4.322456813819578,
"grad_norm": 0.27260777733690406,
"learning_rate": 2.143443560682721e-06,
"loss": 0.1788,
"step": 1689
},
{
"epoch": 4.325015994881638,
"grad_norm": 0.27238765087958206,
"learning_rate": 2.127347193531757e-06,
"loss": 0.1704,
"step": 1690
},
{
"epoch": 4.327575175943698,
"grad_norm": 0.3004112415700906,
"learning_rate": 2.1113080973333643e-06,
"loss": 0.1684,
"step": 1691
},
{
"epoch": 4.330134357005758,
"grad_norm": 0.27426577172310523,
"learning_rate": 2.0953263234830667e-06,
"loss": 0.1541,
"step": 1692
},
{
"epoch": 4.332693538067819,
"grad_norm": 0.27454753389749326,
"learning_rate": 2.0794019231926986e-06,
"loss": 0.1861,
"step": 1693
},
{
"epoch": 4.335252719129878,
"grad_norm": 0.2809263966115645,
"learning_rate": 2.0635349474902598e-06,
"loss": 0.1785,
"step": 1694
},
{
"epoch": 4.337811900191938,
"grad_norm": 0.2690301934416836,
"learning_rate": 2.0477254472197237e-06,
"loss": 0.1896,
"step": 1695
},
{
"epoch": 4.340371081253998,
"grad_norm": 0.2682604924926407,
"learning_rate": 2.0319734730408935e-06,
"loss": 0.1775,
"step": 1696
},
{
"epoch": 4.342930262316059,
"grad_norm": 0.2696807174487812,
"learning_rate": 2.016279075429246e-06,
"loss": 0.1903,
"step": 1697
},
{
"epoch": 4.345489443378119,
"grad_norm": 0.2665228127768651,
"learning_rate": 2.0006423046757596e-06,
"loss": 0.1754,
"step": 1698
},
{
"epoch": 4.348048624440179,
"grad_norm": 0.2694146119632947,
"learning_rate": 1.985063210886735e-06,
"loss": 0.1549,
"step": 1699
},
{
"epoch": 4.350607805502239,
"grad_norm": 0.2822224216458224,
"learning_rate": 1.96954184398368e-06,
"loss": 0.1362,
"step": 1700
},
{
"epoch": 4.3531669865642995,
"grad_norm": 0.2615035547353888,
"learning_rate": 1.9540782537031045e-06,
"loss": 0.1586,
"step": 1701
},
{
"epoch": 4.35572616762636,
"grad_norm": 0.269471616538485,
"learning_rate": 1.9386724895963805e-06,
"loss": 0.1612,
"step": 1702
},
{
"epoch": 4.35828534868842,
"grad_norm": 0.2682599270036803,
"learning_rate": 1.9233246010295903e-06,
"loss": 0.1822,
"step": 1703
},
{
"epoch": 4.36084452975048,
"grad_norm": 0.2632188336157985,
"learning_rate": 1.908034637183356e-06,
"loss": 0.1815,
"step": 1704
},
{
"epoch": 4.36340371081254,
"grad_norm": 0.2615148499789861,
"learning_rate": 1.8928026470526917e-06,
"loss": 0.1545,
"step": 1705
},
{
"epoch": 4.3659628918746005,
"grad_norm": 0.274824297046551,
"learning_rate": 1.8776286794468346e-06,
"loss": 0.1476,
"step": 1706
},
{
"epoch": 4.36852207293666,
"grad_norm": 0.27527733450113034,
"learning_rate": 1.8625127829890922e-06,
"loss": 0.2037,
"step": 1707
},
{
"epoch": 4.37108125399872,
"grad_norm": 0.27423935602322536,
"learning_rate": 1.8474550061166984e-06,
"loss": 0.1719,
"step": 1708
},
{
"epoch": 4.37364043506078,
"grad_norm": 0.27364608091349185,
"learning_rate": 1.8324553970806436e-06,
"loss": 0.1664,
"step": 1709
},
{
"epoch": 4.3761996161228405,
"grad_norm": 0.2796166648988934,
"learning_rate": 1.817514003945524e-06,
"loss": 0.1953,
"step": 1710
},
{
"epoch": 4.378758797184901,
"grad_norm": 0.26079246660216127,
"learning_rate": 1.802630874589404e-06,
"loss": 0.1641,
"step": 1711
},
{
"epoch": 4.381317978246961,
"grad_norm": 0.2999617994892891,
"learning_rate": 1.787806056703627e-06,
"loss": 0.1718,
"step": 1712
},
{
"epoch": 4.383877159309021,
"grad_norm": 0.2889063824640918,
"learning_rate": 1.7730395977926917e-06,
"loss": 0.1653,
"step": 1713
},
{
"epoch": 4.386436340371081,
"grad_norm": 0.26748602837639757,
"learning_rate": 1.758331545174099e-06,
"loss": 0.1842,
"step": 1714
},
{
"epoch": 4.3889955214331415,
"grad_norm": 0.2806929556508416,
"learning_rate": 1.743681945978184e-06,
"loss": 0.1586,
"step": 1715
},
{
"epoch": 4.391554702495202,
"grad_norm": 0.29287158041254,
"learning_rate": 1.7290908471479805e-06,
"loss": 0.1761,
"step": 1716
},
{
"epoch": 4.394113883557262,
"grad_norm": 0.28949556938232984,
"learning_rate": 1.7145582954390638e-06,
"loss": 0.1831,
"step": 1717
},
{
"epoch": 4.396673064619322,
"grad_norm": 0.2707679886069612,
"learning_rate": 1.7000843374193987e-06,
"loss": 0.1796,
"step": 1718
},
{
"epoch": 4.399232245681382,
"grad_norm": 0.26434041057826485,
"learning_rate": 1.6856690194691872e-06,
"loss": 0.1812,
"step": 1719
},
{
"epoch": 4.401791426743442,
"grad_norm": 0.2717307275192052,
"learning_rate": 1.6713123877807413e-06,
"loss": 0.1618,
"step": 1720
},
{
"epoch": 4.404350607805502,
"grad_norm": 0.26427198325914797,
"learning_rate": 1.6570144883582994e-06,
"loss": 0.1485,
"step": 1721
},
{
"epoch": 4.406909788867562,
"grad_norm": 0.2830739919541902,
"learning_rate": 1.6427753670179214e-06,
"loss": 0.1628,
"step": 1722
},
{
"epoch": 4.409468969929622,
"grad_norm": 0.2667553896951004,
"learning_rate": 1.6285950693872999e-06,
"loss": 0.1887,
"step": 1723
},
{
"epoch": 4.4120281509916826,
"grad_norm": 0.2681400499833261,
"learning_rate": 1.614473640905645e-06,
"loss": 0.1629,
"step": 1724
},
{
"epoch": 4.414587332053743,
"grad_norm": 0.2629743389937269,
"learning_rate": 1.6004111268235156e-06,
"loss": 0.2008,
"step": 1725
},
{
"epoch": 4.417146513115803,
"grad_norm": 0.285465240532548,
"learning_rate": 1.5864075722027017e-06,
"loss": 0.191,
"step": 1726
},
{
"epoch": 4.419705694177863,
"grad_norm": 0.2683638382247192,
"learning_rate": 1.5724630219160553e-06,
"loss": 0.2073,
"step": 1727
},
{
"epoch": 4.422264875239923,
"grad_norm": 0.2829211001794818,
"learning_rate": 1.5585775206473508e-06,
"loss": 0.1568,
"step": 1728
},
{
"epoch": 4.424824056301984,
"grad_norm": 0.2788212658151338,
"learning_rate": 1.5447511128911542e-06,
"loss": 0.1728,
"step": 1729
},
{
"epoch": 4.427383237364044,
"grad_norm": 0.2848579162361746,
"learning_rate": 1.5309838429526714e-06,
"loss": 0.1904,
"step": 1730
},
{
"epoch": 4.429942418426104,
"grad_norm": 0.2654662056800488,
"learning_rate": 1.5172757549476024e-06,
"loss": 0.166,
"step": 1731
},
{
"epoch": 4.432501599488164,
"grad_norm": 0.285577343916777,
"learning_rate": 1.5036268928020125e-06,
"loss": 0.195,
"step": 1732
},
{
"epoch": 4.435060780550224,
"grad_norm": 0.25545559192222317,
"learning_rate": 1.4900373002521851e-06,
"loss": 0.1706,
"step": 1733
},
{
"epoch": 4.437619961612284,
"grad_norm": 0.279484081091582,
"learning_rate": 1.4765070208444732e-06,
"loss": 0.1909,
"step": 1734
},
{
"epoch": 4.440179142674344,
"grad_norm": 0.27394133244756325,
"learning_rate": 1.4630360979351644e-06,
"loss": 0.1955,
"step": 1735
},
{
"epoch": 4.442738323736404,
"grad_norm": 0.27730795832891525,
"learning_rate": 1.4496245746903626e-06,
"loss": 0.1668,
"step": 1736
},
{
"epoch": 4.445297504798464,
"grad_norm": 0.2515943739407271,
"learning_rate": 1.4362724940858109e-06,
"loss": 0.173,
"step": 1737
},
{
"epoch": 4.447856685860525,
"grad_norm": 0.25367847682125305,
"learning_rate": 1.422979898906789e-06,
"loss": 0.1639,
"step": 1738
},
{
"epoch": 4.450415866922585,
"grad_norm": 0.27558480853746065,
"learning_rate": 1.4097468317479623e-06,
"loss": 0.1633,
"step": 1739
},
{
"epoch": 4.452975047984645,
"grad_norm": 0.27695396643812065,
"learning_rate": 1.396573335013236e-06,
"loss": 0.1808,
"step": 1740
},
{
"epoch": 4.455534229046705,
"grad_norm": 0.2804193028236503,
"learning_rate": 1.3834594509156319e-06,
"loss": 0.1673,
"step": 1741
},
{
"epoch": 4.4580934101087655,
"grad_norm": 0.2782333366929398,
"learning_rate": 1.3704052214771513e-06,
"loss": 0.1971,
"step": 1742
},
{
"epoch": 4.460652591170826,
"grad_norm": 0.2777400443098731,
"learning_rate": 1.3574106885286465e-06,
"loss": 0.1737,
"step": 1743
},
{
"epoch": 4.463211772232885,
"grad_norm": 0.2764493972670724,
"learning_rate": 1.344475893709658e-06,
"loss": 0.1904,
"step": 1744
},
{
"epoch": 4.465770953294945,
"grad_norm": 0.2752241285220294,
"learning_rate": 1.3316008784683265e-06,
"loss": 0.1613,
"step": 1745
},
{
"epoch": 4.468330134357005,
"grad_norm": 0.2693372116468191,
"learning_rate": 1.3187856840612167e-06,
"loss": 0.1627,
"step": 1746
},
{
"epoch": 4.470889315419066,
"grad_norm": 0.26299523604064184,
"learning_rate": 1.3060303515532135e-06,
"loss": 0.1644,
"step": 1747
},
{
"epoch": 4.473448496481126,
"grad_norm": 0.26861009102213246,
"learning_rate": 1.2933349218173774e-06,
"loss": 0.1748,
"step": 1748
},
{
"epoch": 4.476007677543186,
"grad_norm": 0.266256378668002,
"learning_rate": 1.2806994355348224e-06,
"loss": 0.1717,
"step": 1749
},
{
"epoch": 4.478566858605246,
"grad_norm": 0.2783524667972571,
"learning_rate": 1.2681239331945695e-06,
"loss": 0.1739,
"step": 1750
},
{
"epoch": 4.4811260396673065,
"grad_norm": 0.2807754665043445,
"learning_rate": 1.2556084550934423e-06,
"loss": 0.163,
"step": 1751
},
{
"epoch": 4.483685220729367,
"grad_norm": 0.2751132941559695,
"learning_rate": 1.2431530413359138e-06,
"loss": 0.1596,
"step": 1752
},
{
"epoch": 4.486244401791427,
"grad_norm": 0.27965370827809377,
"learning_rate": 1.2307577318339825e-06,
"loss": 0.1764,
"step": 1753
},
{
"epoch": 4.488803582853487,
"grad_norm": 0.27090435786248723,
"learning_rate": 1.2184225663070604e-06,
"loss": 0.1904,
"step": 1754
},
{
"epoch": 4.491362763915547,
"grad_norm": 0.2830802635501525,
"learning_rate": 1.2061475842818337e-06,
"loss": 0.1785,
"step": 1755
},
{
"epoch": 4.4939219449776076,
"grad_norm": 0.2624221347168147,
"learning_rate": 1.1939328250921278e-06,
"loss": 0.1804,
"step": 1756
},
{
"epoch": 4.496481126039667,
"grad_norm": 0.27670735162368,
"learning_rate": 1.1817783278788042e-06,
"loss": 0.1534,
"step": 1757
},
{
"epoch": 4.499040307101727,
"grad_norm": 0.2755467325350106,
"learning_rate": 1.169684131589608e-06,
"loss": 0.1791,
"step": 1758
},
{
"epoch": 4.501599488163787,
"grad_norm": 0.2750192315338786,
"learning_rate": 1.1576502749790608e-06,
"loss": 0.1721,
"step": 1759
},
{
"epoch": 4.5041586692258475,
"grad_norm": 0.26825366429953873,
"learning_rate": 1.1456767966083393e-06,
"loss": 0.1739,
"step": 1760
},
{
"epoch": 4.506717850287908,
"grad_norm": 0.26461882189193386,
"learning_rate": 1.1337637348451369e-06,
"loss": 0.1836,
"step": 1761
},
{
"epoch": 4.509277031349968,
"grad_norm": 0.27170229996613754,
"learning_rate": 1.1219111278635575e-06,
"loss": 0.1746,
"step": 1762
},
{
"epoch": 4.511836212412028,
"grad_norm": 0.28612289439672206,
"learning_rate": 1.1101190136439689e-06,
"loss": 0.1664,
"step": 1763
},
{
"epoch": 4.514395393474088,
"grad_norm": 0.2814719237385938,
"learning_rate": 1.0983874299729092e-06,
"loss": 0.1552,
"step": 1764
},
{
"epoch": 4.516954574536149,
"grad_norm": 0.27224408532725913,
"learning_rate": 1.086716414442952e-06,
"loss": 0.155,
"step": 1765
},
{
"epoch": 4.519513755598209,
"grad_norm": 0.2683837888920839,
"learning_rate": 1.0751060044525797e-06,
"loss": 0.1947,
"step": 1766
},
{
"epoch": 4.522072936660269,
"grad_norm": 0.266405093166955,
"learning_rate": 1.0635562372060825e-06,
"loss": 0.179,
"step": 1767
},
{
"epoch": 4.524632117722329,
"grad_norm": 0.26568191781978007,
"learning_rate": 1.052067149713416e-06,
"loss": 0.1595,
"step": 1768
},
{
"epoch": 4.527191298784389,
"grad_norm": 0.27613787388854283,
"learning_rate": 1.0406387787900974e-06,
"loss": 0.2022,
"step": 1769
},
{
"epoch": 4.529750479846449,
"grad_norm": 0.2783446591602134,
"learning_rate": 1.0292711610570904e-06,
"loss": 0.1965,
"step": 1770
},
{
"epoch": 4.532309660908509,
"grad_norm": 0.2754628182404677,
"learning_rate": 1.0179643329406752e-06,
"loss": 0.1796,
"step": 1771
},
{
"epoch": 4.534868841970569,
"grad_norm": 0.2717991423747503,
"learning_rate": 1.0067183306723384e-06,
"loss": 0.1872,
"step": 1772
},
{
"epoch": 4.537428023032629,
"grad_norm": 0.26023938540588254,
"learning_rate": 9.955331902886645e-07,
"loss": 0.1645,
"step": 1773
},
{
"epoch": 4.53998720409469,
"grad_norm": 0.2697243580148783,
"learning_rate": 9.844089476312035e-07,
"loss": 0.1736,
"step": 1774
},
{
"epoch": 4.54254638515675,
"grad_norm": 0.27089652411524956,
"learning_rate": 9.733456383463658e-07,
"loss": 0.156,
"step": 1775
},
{
"epoch": 4.54510556621881,
"grad_norm": 0.2625263168411182,
"learning_rate": 9.62343297885313e-07,
"loss": 0.1709,
"step": 1776
},
{
"epoch": 4.54766474728087,
"grad_norm": 0.2720147925441457,
"learning_rate": 9.514019615038395e-07,
"loss": 0.1609,
"step": 1777
},
{
"epoch": 4.55022392834293,
"grad_norm": 0.26862738106885103,
"learning_rate": 9.40521664262255e-07,
"loss": 0.1823,
"step": 1778
},
{
"epoch": 4.552783109404991,
"grad_norm": 0.2858477259373205,
"learning_rate": 9.297024410252753e-07,
"loss": 0.1719,
"step": 1779
},
{
"epoch": 4.555342290467051,
"grad_norm": 0.2792231695337476,
"learning_rate": 9.189443264619102e-07,
"loss": 0.2187,
"step": 1780
},
{
"epoch": 4.557901471529111,
"grad_norm": 0.2722587468079133,
"learning_rate": 9.082473550453619e-07,
"loss": 0.1581,
"step": 1781
},
{
"epoch": 4.560460652591171,
"grad_norm": 0.25994947891197123,
"learning_rate": 8.976115610528957e-07,
"loss": 0.1813,
"step": 1782
},
{
"epoch": 4.563019833653231,
"grad_norm": 0.2685596280130304,
"learning_rate": 8.870369785657451e-07,
"loss": 0.1637,
"step": 1783
},
{
"epoch": 4.565579014715291,
"grad_norm": 0.2624613954232775,
"learning_rate": 8.765236414690026e-07,
"loss": 0.1867,
"step": 1784
},
{
"epoch": 4.568138195777351,
"grad_norm": 0.26985980601394455,
"learning_rate": 8.660715834514977e-07,
"loss": 0.1812,
"step": 1785
},
{
"epoch": 4.570697376839411,
"grad_norm": 0.2782580432929674,
"learning_rate": 8.556808380057013e-07,
"loss": 0.1551,
"step": 1786
},
{
"epoch": 4.5732565579014715,
"grad_norm": 0.3229561788111089,
"learning_rate": 8.453514384276196e-07,
"loss": 0.1665,
"step": 1787
},
{
"epoch": 4.575815738963532,
"grad_norm": 0.26676739208882927,
"learning_rate": 8.350834178166755e-07,
"loss": 0.2019,
"step": 1788
},
{
"epoch": 4.578374920025592,
"grad_norm": 0.25638061426053027,
"learning_rate": 8.248768090756143e-07,
"loss": 0.1623,
"step": 1789
},
{
"epoch": 4.580934101087652,
"grad_norm": 0.2769760353268046,
"learning_rate": 8.147316449103959e-07,
"loss": 0.193,
"step": 1790
},
{
"epoch": 4.583493282149712,
"grad_norm": 0.2827217753260577,
"learning_rate": 8.046479578300803e-07,
"loss": 0.1573,
"step": 1791
},
{
"epoch": 4.5860524632117725,
"grad_norm": 0.267728272299288,
"learning_rate": 7.946257801467339e-07,
"loss": 0.1534,
"step": 1792
},
{
"epoch": 4.588611644273833,
"grad_norm": 0.26899124519431056,
"learning_rate": 7.846651439753273e-07,
"loss": 0.1785,
"step": 1793
},
{
"epoch": 4.591170825335892,
"grad_norm": 0.2655562652017706,
"learning_rate": 7.747660812336221e-07,
"loss": 0.1632,
"step": 1794
},
{
"epoch": 4.593730006397953,
"grad_norm": 0.2912418780943663,
"learning_rate": 7.649286236420806e-07,
"loss": 0.1664,
"step": 1795
},
{
"epoch": 4.5962891874600125,
"grad_norm": 0.2773582855251603,
"learning_rate": 7.551528027237553e-07,
"loss": 0.1649,
"step": 1796
},
{
"epoch": 4.598848368522073,
"grad_norm": 0.2706350032862212,
"learning_rate": 7.454386498041865e-07,
"loss": 0.1897,
"step": 1797
},
{
"epoch": 4.601407549584133,
"grad_norm": 0.27987597117336843,
"learning_rate": 7.357861960113121e-07,
"loss": 0.1806,
"step": 1798
},
{
"epoch": 4.603966730646193,
"grad_norm": 0.2554799929519513,
"learning_rate": 7.261954722753595e-07,
"loss": 0.1454,
"step": 1799
},
{
"epoch": 4.606525911708253,
"grad_norm": 0.28194077726489003,
"learning_rate": 7.166665093287539e-07,
"loss": 0.1956,
"step": 1800
},
{
"epoch": 4.6090850927703135,
"grad_norm": 0.27206485970301414,
"learning_rate": 7.071993377060038e-07,
"loss": 0.1813,
"step": 1801
},
{
"epoch": 4.611644273832374,
"grad_norm": 0.27639368969275124,
"learning_rate": 6.977939877436224e-07,
"loss": 0.1937,
"step": 1802
},
{
"epoch": 4.614203454894434,
"grad_norm": 0.26700294636297844,
"learning_rate": 6.884504895800237e-07,
"loss": 0.159,
"step": 1803
},
{
"epoch": 4.616762635956494,
"grad_norm": 0.2715005815453172,
"learning_rate": 6.791688731554158e-07,
"loss": 0.1608,
"step": 1804
},
{
"epoch": 4.619321817018554,
"grad_norm": 0.27127828240291824,
"learning_rate": 6.69949168211721e-07,
"loss": 0.1857,
"step": 1805
},
{
"epoch": 4.621880998080615,
"grad_norm": 0.28402081462443657,
"learning_rate": 6.607914042924756e-07,
"loss": 0.1918,
"step": 1806
},
{
"epoch": 4.624440179142674,
"grad_norm": 0.26263908410916775,
"learning_rate": 6.516956107427241e-07,
"loss": 0.1569,
"step": 1807
},
{
"epoch": 4.626999360204734,
"grad_norm": 0.27371755225997646,
"learning_rate": 6.426618167089338e-07,
"loss": 0.1557,
"step": 1808
},
{
"epoch": 4.629558541266794,
"grad_norm": 0.26959266513847036,
"learning_rate": 6.336900511389133e-07,
"loss": 0.1733,
"step": 1809
},
{
"epoch": 4.6321177223288545,
"grad_norm": 0.27453758652223553,
"learning_rate": 6.247803427816945e-07,
"loss": 0.1635,
"step": 1810
},
{
"epoch": 4.634676903390915,
"grad_norm": 0.2673151789681698,
"learning_rate": 6.159327201874598e-07,
"loss": 0.1709,
"step": 1811
},
{
"epoch": 4.637236084452975,
"grad_norm": 0.2702926085830735,
"learning_rate": 6.071472117074462e-07,
"loss": 0.1815,
"step": 1812
},
{
"epoch": 4.639795265515035,
"grad_norm": 0.2788070786022333,
"learning_rate": 5.984238454938496e-07,
"loss": 0.1527,
"step": 1813
},
{
"epoch": 4.642354446577095,
"grad_norm": 0.27358568856995236,
"learning_rate": 5.897626494997366e-07,
"loss": 0.1785,
"step": 1814
},
{
"epoch": 4.644913627639156,
"grad_norm": 0.2718095549716457,
"learning_rate": 5.811636514789598e-07,
"loss": 0.1853,
"step": 1815
},
{
"epoch": 4.647472808701216,
"grad_norm": 0.27759832517042105,
"learning_rate": 5.726268789860645e-07,
"loss": 0.1646,
"step": 1816
},
{
"epoch": 4.650031989763276,
"grad_norm": 0.26320625609355736,
"learning_rate": 5.641523593761977e-07,
"loss": 0.1723,
"step": 1817
},
{
"epoch": 4.652591170825336,
"grad_norm": 0.27780583001556897,
"learning_rate": 5.557401198050327e-07,
"loss": 0.184,
"step": 1818
},
{
"epoch": 4.6551503518873965,
"grad_norm": 0.27504183175562963,
"learning_rate": 5.473901872286602e-07,
"loss": 0.1712,
"step": 1819
},
{
"epoch": 4.657709532949456,
"grad_norm": 0.2774680321446144,
"learning_rate": 5.391025884035239e-07,
"loss": 0.1817,
"step": 1820
},
{
"epoch": 4.660268714011516,
"grad_norm": 0.26555781569772313,
"learning_rate": 5.308773498863251e-07,
"loss": 0.1576,
"step": 1821
},
{
"epoch": 4.662827895073576,
"grad_norm": 0.33797160433489215,
"learning_rate": 5.22714498033936e-07,
"loss": 0.1929,
"step": 1822
},
{
"epoch": 4.665387076135636,
"grad_norm": 0.28245555374717063,
"learning_rate": 5.146140590033199e-07,
"loss": 0.1869,
"step": 1823
},
{
"epoch": 4.667946257197697,
"grad_norm": 0.27464067240369455,
"learning_rate": 5.065760587514446e-07,
"loss": 0.1902,
"step": 1824
},
{
"epoch": 4.670505438259757,
"grad_norm": 0.2672943490021358,
"learning_rate": 4.986005230351954e-07,
"loss": 0.188,
"step": 1825
},
{
"epoch": 4.673064619321817,
"grad_norm": 0.2657663358279065,
"learning_rate": 4.906874774113024e-07,
"loss": 0.184,
"step": 1826
},
{
"epoch": 4.675623800383877,
"grad_norm": 0.2674714454963707,
"learning_rate": 4.828369472362493e-07,
"loss": 0.1469,
"step": 1827
},
{
"epoch": 4.6781829814459375,
"grad_norm": 0.2882898088898947,
"learning_rate": 4.750489576662021e-07,
"loss": 0.162,
"step": 1828
},
{
"epoch": 4.680742162507998,
"grad_norm": 0.27727023737142387,
"learning_rate": 4.6732353365691374e-07,
"loss": 0.1543,
"step": 1829
},
{
"epoch": 4.683301343570058,
"grad_norm": 0.2636650641131126,
"learning_rate": 4.5966069996365993e-07,
"loss": 0.1561,
"step": 1830
},
{
"epoch": 4.685860524632118,
"grad_norm": 0.2682699322744399,
"learning_rate": 4.5206048114114775e-07,
"loss": 0.1673,
"step": 1831
},
{
"epoch": 4.688419705694178,
"grad_norm": 0.2743352871936966,
"learning_rate": 4.4452290154344046e-07,
"loss": 0.1807,
"step": 1832
},
{
"epoch": 4.690978886756238,
"grad_norm": 0.2770317090716807,
"learning_rate": 4.3704798532388624e-07,
"loss": 0.2129,
"step": 1833
},
{
"epoch": 4.693538067818298,
"grad_norm": 0.280836808159879,
"learning_rate": 4.296357564350362e-07,
"loss": 0.1604,
"step": 1834
},
{
"epoch": 4.696097248880358,
"grad_norm": 0.26525175245500215,
"learning_rate": 4.22286238628562e-07,
"loss": 0.1763,
"step": 1835
},
{
"epoch": 4.698656429942418,
"grad_norm": 0.2700214270271814,
"learning_rate": 4.1499945545518283e-07,
"loss": 0.154,
"step": 1836
},
{
"epoch": 4.7012156110044785,
"grad_norm": 0.24327556501731853,
"learning_rate": 4.077754302645964e-07,
"loss": 0.1616,
"step": 1837
},
{
"epoch": 4.703774792066539,
"grad_norm": 0.2652109590879505,
"learning_rate": 4.006141862054014e-07,
"loss": 0.1809,
"step": 1838
},
{
"epoch": 4.706333973128599,
"grad_norm": 0.26799443888146,
"learning_rate": 3.935157462250128e-07,
"loss": 0.1799,
"step": 1839
},
{
"epoch": 4.708893154190659,
"grad_norm": 0.26417342730564347,
"learning_rate": 3.8648013306960664e-07,
"loss": 0.1697,
"step": 1840
},
{
"epoch": 4.711452335252719,
"grad_norm": 0.2655166510116645,
"learning_rate": 3.7950736928402674e-07,
"loss": 0.1354,
"step": 1841
},
{
"epoch": 4.7140115163147795,
"grad_norm": 0.27018720652857536,
"learning_rate": 3.7259747721173134e-07,
"loss": 0.1568,
"step": 1842
},
{
"epoch": 4.71657069737684,
"grad_norm": 0.26944814432184416,
"learning_rate": 3.6575047899471085e-07,
"loss": 0.1539,
"step": 1843
},
{
"epoch": 4.719129878438899,
"grad_norm": 0.273820427322093,
"learning_rate": 3.5896639657342134e-07,
"loss": 0.1566,
"step": 1844
},
{
"epoch": 4.72168905950096,
"grad_norm": 0.25408644087595794,
"learning_rate": 3.522452516867048e-07,
"loss": 0.1751,
"step": 1845
},
{
"epoch": 4.7242482405630195,
"grad_norm": 0.282638324070005,
"learning_rate": 3.455870658717353e-07,
"loss": 0.1788,
"step": 1846
},
{
"epoch": 4.72680742162508,
"grad_norm": 0.27645267654347633,
"learning_rate": 3.3899186046393526e-07,
"loss": 0.1856,
"step": 1847
},
{
"epoch": 4.72936660268714,
"grad_norm": 0.28435322569370036,
"learning_rate": 3.324596565969174e-07,
"loss": 0.1903,
"step": 1848
},
{
"epoch": 4.7319257837492,
"grad_norm": 0.2665475946312596,
"learning_rate": 3.2599047520241123e-07,
"loss": 0.1625,
"step": 1849
},
{
"epoch": 4.73448496481126,
"grad_norm": 0.2848927094305906,
"learning_rate": 3.1958433701019697e-07,
"loss": 0.2058,
"step": 1850
},
{
"epoch": 4.737044145873321,
"grad_norm": 0.2704317535606738,
"learning_rate": 3.1324126254804524e-07,
"loss": 0.1868,
"step": 1851
},
{
"epoch": 4.739603326935381,
"grad_norm": 0.2800291692162128,
"learning_rate": 3.069612721416371e-07,
"loss": 0.1794,
"step": 1852
},
{
"epoch": 4.742162507997441,
"grad_norm": 0.27622038521669706,
"learning_rate": 3.007443859145087e-07,
"loss": 0.1701,
"step": 1853
},
{
"epoch": 4.744721689059501,
"grad_norm": 0.28051196613093177,
"learning_rate": 2.9459062378799806e-07,
"loss": 0.193,
"step": 1854
},
{
"epoch": 4.747280870121561,
"grad_norm": 0.26937886948023293,
"learning_rate": 2.8850000548115155e-07,
"loss": 0.1645,
"step": 1855
},
{
"epoch": 4.749840051183622,
"grad_norm": 0.2574745053917364,
"learning_rate": 2.8247255051068845e-07,
"loss": 0.1711,
"step": 1856
},
{
"epoch": 4.752399232245681,
"grad_norm": 0.2716275327438086,
"learning_rate": 2.7650827819093005e-07,
"loss": 0.1699,
"step": 1857
},
{
"epoch": 4.754958413307741,
"grad_norm": 0.2590049108849854,
"learning_rate": 2.706072076337285e-07,
"loss": 0.1648,
"step": 1858
},
{
"epoch": 4.757517594369801,
"grad_norm": 0.27267669634639347,
"learning_rate": 2.647693577484156e-07,
"loss": 0.1887,
"step": 1859
},
{
"epoch": 4.760076775431862,
"grad_norm": 0.27747570018737217,
"learning_rate": 2.5899474724174313e-07,
"loss": 0.1822,
"step": 1860
},
{
"epoch": 4.762635956493922,
"grad_norm": 0.27256177644643004,
"learning_rate": 2.532833946178137e-07,
"loss": 0.1833,
"step": 1861
},
{
"epoch": 4.765195137555982,
"grad_norm": 0.26875210471690963,
"learning_rate": 2.4763531817802777e-07,
"loss": 0.1634,
"step": 1862
},
{
"epoch": 4.767754318618042,
"grad_norm": 0.2841356724669023,
"learning_rate": 2.4205053602103015e-07,
"loss": 0.1716,
"step": 1863
},
{
"epoch": 4.770313499680102,
"grad_norm": 0.28261688509298977,
"learning_rate": 2.365290660426389e-07,
"loss": 0.1804,
"step": 1864
},
{
"epoch": 4.772872680742163,
"grad_norm": 0.2673985339554513,
"learning_rate": 2.3107092593579905e-07,
"loss": 0.17,
"step": 1865
},
{
"epoch": 4.775431861804223,
"grad_norm": 0.2644854641715479,
"learning_rate": 2.2567613319051997e-07,
"loss": 0.1624,
"step": 1866
},
{
"epoch": 4.777991042866283,
"grad_norm": 0.2657701185136481,
"learning_rate": 2.2034470509382234e-07,
"loss": 0.1967,
"step": 1867
},
{
"epoch": 4.780550223928343,
"grad_norm": 0.26830383283496656,
"learning_rate": 2.1507665872968264e-07,
"loss": 0.1743,
"step": 1868
},
{
"epoch": 4.7831094049904035,
"grad_norm": 0.26825661403886375,
"learning_rate": 2.0987201097897757e-07,
"loss": 0.1697,
"step": 1869
},
{
"epoch": 4.785668586052463,
"grad_norm": 0.25032287801653486,
"learning_rate": 2.0473077851942858e-07,
"loss": 0.1692,
"step": 1870
},
{
"epoch": 4.788227767114523,
"grad_norm": 0.2540438269392125,
"learning_rate": 1.9965297782554848e-07,
"loss": 0.1594,
"step": 1871
},
{
"epoch": 4.790786948176583,
"grad_norm": 0.25648856154725486,
"learning_rate": 1.9463862516859277e-07,
"loss": 0.1862,
"step": 1872
},
{
"epoch": 4.7933461292386434,
"grad_norm": 0.27520225342246696,
"learning_rate": 1.896877366165062e-07,
"loss": 0.1625,
"step": 1873
},
{
"epoch": 4.795905310300704,
"grad_norm": 0.2879456654071117,
"learning_rate": 1.8480032803386505e-07,
"loss": 0.1647,
"step": 1874
},
{
"epoch": 4.798464491362764,
"grad_norm": 0.26113619907647856,
"learning_rate": 1.799764150818306e-07,
"loss": 0.1556,
"step": 1875
},
{
"epoch": 4.801023672424824,
"grad_norm": 0.2684336534763457,
"learning_rate": 1.7521601321810687e-07,
"loss": 0.1686,
"step": 1876
},
{
"epoch": 4.803582853486884,
"grad_norm": 0.287932659535979,
"learning_rate": 1.7051913769687623e-07,
"loss": 0.1549,
"step": 1877
},
{
"epoch": 4.8061420345489445,
"grad_norm": 0.26143215817508003,
"learning_rate": 1.658858035687594e-07,
"loss": 0.1796,
"step": 1878
},
{
"epoch": 4.808701215611005,
"grad_norm": 0.26742321309706935,
"learning_rate": 1.6131602568076887e-07,
"loss": 0.1723,
"step": 1879
},
{
"epoch": 4.811260396673065,
"grad_norm": 0.26866517810628654,
"learning_rate": 1.5680981867625566e-07,
"loss": 0.1631,
"step": 1880
},
{
"epoch": 4.813819577735125,
"grad_norm": 0.2664673668196368,
"learning_rate": 1.5236719699486256e-07,
"loss": 0.1595,
"step": 1881
},
{
"epoch": 4.816378758797185,
"grad_norm": 0.26624359763267497,
"learning_rate": 1.479881748724865e-07,
"loss": 0.174,
"step": 1882
},
{
"epoch": 4.818937939859245,
"grad_norm": 0.26586320390850965,
"learning_rate": 1.4367276634122073e-07,
"loss": 0.1733,
"step": 1883
},
{
"epoch": 4.821497120921305,
"grad_norm": 0.2734079332163835,
"learning_rate": 1.3942098522931491e-07,
"loss": 0.1524,
"step": 1884
},
{
"epoch": 4.824056301983365,
"grad_norm": 0.26860707446851667,
"learning_rate": 1.3523284516113955e-07,
"loss": 0.1801,
"step": 1885
},
{
"epoch": 4.826615483045425,
"grad_norm": 0.26293219622877667,
"learning_rate": 1.3110835955712831e-07,
"loss": 0.1789,
"step": 1886
},
{
"epoch": 4.8291746641074855,
"grad_norm": 0.26548180261766674,
"learning_rate": 1.2704754163374022e-07,
"loss": 0.1643,
"step": 1887
},
{
"epoch": 4.831733845169546,
"grad_norm": 0.27466550793343814,
"learning_rate": 1.2305040440342198e-07,
"loss": 0.1417,
"step": 1888
},
{
"epoch": 4.834293026231606,
"grad_norm": 0.27044679875019695,
"learning_rate": 1.1911696067455902e-07,
"loss": 0.1862,
"step": 1889
},
{
"epoch": 4.836852207293666,
"grad_norm": 0.2667382729572236,
"learning_rate": 1.1524722305144231e-07,
"loss": 0.1671,
"step": 1890
},
{
"epoch": 4.839411388355726,
"grad_norm": 0.26077082259648754,
"learning_rate": 1.114412039342172e-07,
"loss": 0.1583,
"step": 1891
},
{
"epoch": 4.841970569417787,
"grad_norm": 0.2683982318957702,
"learning_rate": 1.0769891551885903e-07,
"loss": 0.1689,
"step": 1892
},
{
"epoch": 4.844529750479847,
"grad_norm": 0.26442562426087834,
"learning_rate": 1.0402036979711317e-07,
"loss": 0.1901,
"step": 1893
},
{
"epoch": 4.847088931541906,
"grad_norm": 0.2628418022630685,
"learning_rate": 1.0040557855648169e-07,
"loss": 0.1628,
"step": 1894
},
{
"epoch": 4.849648112603967,
"grad_norm": 0.2643402568604478,
"learning_rate": 9.685455338016347e-08,
"loss": 0.1769,
"step": 1895
},
{
"epoch": 4.8522072936660265,
"grad_norm": 0.27073662782178587,
"learning_rate": 9.336730564702745e-08,
"loss": 0.163,
"step": 1896
},
{
"epoch": 4.854766474728087,
"grad_norm": 0.2592523308295433,
"learning_rate": 8.994384653157718e-08,
"loss": 0.1748,
"step": 1897
},
{
"epoch": 4.857325655790147,
"grad_norm": 0.26364468600135543,
"learning_rate": 8.658418700391302e-08,
"loss": 0.1658,
"step": 1898
},
{
"epoch": 4.859884836852207,
"grad_norm": 0.26261286147824464,
"learning_rate": 8.328833782969003e-08,
"loss": 0.166,
"step": 1899
},
{
"epoch": 4.862444017914267,
"grad_norm": 0.2752608130158951,
"learning_rate": 8.005630957010014e-08,
"loss": 0.1832,
"step": 1900
},
{
"epoch": 4.865003198976328,
"grad_norm": 0.27593480590108715,
"learning_rate": 7.688811258181883e-08,
"loss": 0.1838,
"step": 1901
},
{
"epoch": 4.867562380038388,
"grad_norm": 0.27236472207879586,
"learning_rate": 7.378375701698748e-08,
"loss": 0.1898,
"step": 1902
},
{
"epoch": 4.870121561100448,
"grad_norm": 0.25195081103744715,
"learning_rate": 7.074325282317329e-08,
"loss": 0.1754,
"step": 1903
},
{
"epoch": 4.872680742162508,
"grad_norm": 0.2869783000463503,
"learning_rate": 6.776660974333605e-08,
"loss": 0.1572,
"step": 1904
},
{
"epoch": 4.8752399232245685,
"grad_norm": 0.27441367536012884,
"learning_rate": 6.485383731580142e-08,
"loss": 0.1766,
"step": 1905
},
{
"epoch": 4.877799104286629,
"grad_norm": 0.28772532733219014,
"learning_rate": 6.200494487422771e-08,
"loss": 0.1794,
"step": 1906
},
{
"epoch": 4.880358285348688,
"grad_norm": 0.27889648675155043,
"learning_rate": 5.921994154758137e-08,
"loss": 0.17,
"step": 1907
},
{
"epoch": 4.882917466410748,
"grad_norm": 0.25762876059244744,
"learning_rate": 5.649883626009933e-08,
"loss": 0.1415,
"step": 1908
},
{
"epoch": 4.885476647472808,
"grad_norm": 0.27418736611080513,
"learning_rate": 5.3841637731260054e-08,
"loss": 0.1637,
"step": 1909
},
{
"epoch": 4.888035828534869,
"grad_norm": 0.2644534739010174,
"learning_rate": 5.1248354475768034e-08,
"loss": 0.1856,
"step": 1910
},
{
"epoch": 4.890595009596929,
"grad_norm": 0.26943083477416585,
"learning_rate": 4.871899480351605e-08,
"loss": 0.1833,
"step": 1911
},
{
"epoch": 4.893154190658989,
"grad_norm": 0.27096897776649476,
"learning_rate": 4.6253566819554066e-08,
"loss": 0.1646,
"step": 1912
},
{
"epoch": 4.895713371721049,
"grad_norm": 0.2810986979004101,
"learning_rate": 4.385207842407813e-08,
"loss": 0.1688,
"step": 1913
},
{
"epoch": 4.8982725527831095,
"grad_norm": 0.2780463267788789,
"learning_rate": 4.151453731239707e-08,
"loss": 0.1889,
"step": 1914
},
{
"epoch": 4.90083173384517,
"grad_norm": 0.2740470136715985,
"learning_rate": 3.924095097489922e-08,
"loss": 0.1771,
"step": 1915
},
{
"epoch": 4.90339091490723,
"grad_norm": 0.2670609796470983,
"learning_rate": 3.703132669704568e-08,
"loss": 0.1767,
"step": 1916
},
{
"epoch": 4.90595009596929,
"grad_norm": 0.276634001409615,
"learning_rate": 3.4885671559332645e-08,
"loss": 0.1698,
"step": 1917
},
{
"epoch": 4.90850927703135,
"grad_norm": 0.2836253559347624,
"learning_rate": 3.280399243727806e-08,
"loss": 0.1434,
"step": 1918
},
{
"epoch": 4.9110684580934105,
"grad_norm": 0.2596326694355661,
"learning_rate": 3.078629600139271e-08,
"loss": 0.1738,
"step": 1919
},
{
"epoch": 4.91362763915547,
"grad_norm": 0.26138854550688545,
"learning_rate": 2.8832588717164766e-08,
"loss": 0.1698,
"step": 1920
},
{
"epoch": 4.91618682021753,
"grad_norm": 0.2629381551374675,
"learning_rate": 2.694287684503083e-08,
"loss": 0.1803,
"step": 1921
},
{
"epoch": 4.91874600127959,
"grad_norm": 0.2707296291029451,
"learning_rate": 2.511716644036932e-08,
"loss": 0.2076,
"step": 1922
},
{
"epoch": 4.9213051823416505,
"grad_norm": 0.2643795979993967,
"learning_rate": 2.3355463353467168e-08,
"loss": 0.1737,
"step": 1923
},
{
"epoch": 4.923864363403711,
"grad_norm": 0.2687386354716593,
"learning_rate": 2.1657773229508684e-08,
"loss": 0.1525,
"step": 1924
},
{
"epoch": 4.926423544465771,
"grad_norm": 0.25773983452315097,
"learning_rate": 2.0024101508555604e-08,
"loss": 0.1611,
"step": 1925
},
{
"epoch": 4.928982725527831,
"grad_norm": 0.2759068990299683,
"learning_rate": 1.8454453425527098e-08,
"loss": 0.149,
"step": 1926
},
{
"epoch": 4.931541906589891,
"grad_norm": 0.2658729323365574,
"learning_rate": 1.6948834010190874e-08,
"loss": 0.1928,
"step": 1927
},
{
"epoch": 4.9341010876519515,
"grad_norm": 0.28178214512822436,
"learning_rate": 1.550724808713877e-08,
"loss": 0.1885,
"step": 1928
},
{
"epoch": 4.936660268714012,
"grad_norm": 0.28860699599060785,
"learning_rate": 1.4129700275771208e-08,
"loss": 0.1466,
"step": 1929
},
{
"epoch": 4.939219449776072,
"grad_norm": 0.2584677694793919,
"learning_rate": 1.281619499029274e-08,
"loss": 0.1844,
"step": 1930
},
{
"epoch": 4.941778630838132,
"grad_norm": 0.2704355939225655,
"learning_rate": 1.1566736439685422e-08,
"loss": 0.1687,
"step": 1931
},
{
"epoch": 4.944337811900192,
"grad_norm": 0.2647721632155255,
"learning_rate": 1.0381328627702136e-08,
"loss": 0.1398,
"step": 1932
},
{
"epoch": 4.946896992962252,
"grad_norm": 0.26415602194940657,
"learning_rate": 9.259975352848838e-09,
"loss": 0.1665,
"step": 1933
},
{
"epoch": 4.949456174024312,
"grad_norm": 0.26483516576483956,
"learning_rate": 8.20268020838455e-09,
"loss": 0.1632,
"step": 1934
},
{
"epoch": 4.952015355086372,
"grad_norm": 0.26643152935588643,
"learning_rate": 7.209446582292501e-09,
"loss": 0.1563,
"step": 1935
},
{
"epoch": 4.954574536148432,
"grad_norm": 0.25910553240725315,
"learning_rate": 6.2802776572779005e-09,
"loss": 0.1467,
"step": 1936
},
{
"epoch": 4.957133717210493,
"grad_norm": 0.25618053832139825,
"learning_rate": 5.415176410765721e-09,
"loss": 0.1586,
"step": 1937
},
{
"epoch": 4.959692898272553,
"grad_norm": 0.2519718295297774,
"learning_rate": 4.614145614876275e-09,
"loss": 0.175,
"step": 1938
},
{
"epoch": 4.962252079334613,
"grad_norm": 0.266276772335997,
"learning_rate": 3.877187836422991e-09,
"loss": 0.1893,
"step": 1939
},
{
"epoch": 4.964811260396673,
"grad_norm": 0.27130663744108346,
"learning_rate": 3.2043054369057523e-09,
"loss": 0.1928,
"step": 1940
},
{
"epoch": 4.967370441458733,
"grad_norm": 0.2716068333936989,
"learning_rate": 2.5955005725064597e-09,
"loss": 0.1714,
"step": 1941
},
{
"epoch": 4.969929622520794,
"grad_norm": 0.26663386055671573,
"learning_rate": 2.0507751940690434e-09,
"loss": 0.1648,
"step": 1942
},
{
"epoch": 4.972488803582854,
"grad_norm": 0.26494109005360517,
"learning_rate": 1.5701310471083476e-09,
"loss": 0.1591,
"step": 1943
},
{
"epoch": 4.975047984644913,
"grad_norm": 0.2655087142061252,
"learning_rate": 1.1535696717945855e-09,
"loss": 0.1437,
"step": 1944
},
{
"epoch": 4.977607165706974,
"grad_norm": 0.27477087152296875,
"learning_rate": 8.010924029533406e-10,
"loss": 0.1491,
"step": 1945
},
{
"epoch": 4.980166346769034,
"grad_norm": 0.2590976709734058,
"learning_rate": 5.127003700589051e-10,
"loss": 0.1584,
"step": 1946
},
{
"epoch": 4.982725527831094,
"grad_norm": 0.267447188689341,
"learning_rate": 2.8839449723205847e-10,
"loss": 0.1975,
"step": 1947
},
{
"epoch": 4.985284708893154,
"grad_norm": 0.2598735449781636,
"learning_rate": 1.2817550323784843e-10,
"loss": 0.2008,
"step": 1948
},
{
"epoch": 4.987843889955214,
"grad_norm": 0.26368976010696143,
"learning_rate": 3.2043901478928666e-11,
"loss": 0.1795,
"step": 1949
},
{
"epoch": 4.990403071017274,
"grad_norm": 0.2645697161648628,
"learning_rate": 0.0,
"loss": 0.1707,
"step": 1950
},
{
"epoch": 4.990403071017274,
"step": 1950,
"total_flos": 3.115960359367213e+18,
"train_loss": 0.34092234334120386,
"train_runtime": 73926.9587,
"train_samples_per_second": 3.382,
"train_steps_per_second": 0.026
}
],
"logging_steps": 1.0,
"max_steps": 1950,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.115960359367213e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}