{
"best_global_step": 1800,
"best_metric": 0.74,
"best_model_checkpoint": "/mnt/parscratch/users/acr24wz/etu/topcon/qwen3_4B/cpt_model/balanced/finetuned/all/checkpoint-1800",
"epoch": 2.0642662458757712,
"eval_steps": 100,
"global_step": 1800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011476115334959117,
"grad_norm": 201.0,
"learning_rate": 0.0,
"loss": 18.5701,
"step": 1
},
{
"epoch": 0.0022952230669918234,
"grad_norm": 1392.0,
"learning_rate": 4.587155963302753e-08,
"loss": 12.0441,
"step": 2
},
{
"epoch": 0.0034428346004877347,
"grad_norm": 161.0,
"learning_rate": 9.174311926605506e-08,
"loss": 14.3223,
"step": 3
},
{
"epoch": 0.004590446133983647,
"grad_norm": 150.0,
"learning_rate": 1.376146788990826e-07,
"loss": 10.3759,
"step": 4
},
{
"epoch": 0.005738057667479558,
"grad_norm": 141.0,
"learning_rate": 1.8348623853211012e-07,
"loss": 11.4624,
"step": 5
},
{
"epoch": 0.006885669200975469,
"grad_norm": 159.0,
"learning_rate": 2.2935779816513764e-07,
"loss": 10.5127,
"step": 6
},
{
"epoch": 0.008033280734471382,
"grad_norm": 166.0,
"learning_rate": 2.752293577981652e-07,
"loss": 15.7339,
"step": 7
},
{
"epoch": 0.009180892267967294,
"grad_norm": 310.0,
"learning_rate": 3.211009174311927e-07,
"loss": 21.2237,
"step": 8
},
{
"epoch": 0.010328503801463204,
"grad_norm": 153.0,
"learning_rate": 3.6697247706422023e-07,
"loss": 11.4438,
"step": 9
},
{
"epoch": 0.011476115334959116,
"grad_norm": 146.0,
"learning_rate": 4.128440366972478e-07,
"loss": 14.27,
"step": 10
},
{
"epoch": 0.012623726868455028,
"grad_norm": 185.0,
"learning_rate": 4.587155963302753e-07,
"loss": 17.4331,
"step": 11
},
{
"epoch": 0.013771338401950939,
"grad_norm": 157.0,
"learning_rate": 5.045871559633028e-07,
"loss": 16.0972,
"step": 12
},
{
"epoch": 0.014918949935446851,
"grad_norm": 146.0,
"learning_rate": 5.504587155963304e-07,
"loss": 10.9198,
"step": 13
},
{
"epoch": 0.016066561468942763,
"grad_norm": 194.0,
"learning_rate": 5.963302752293579e-07,
"loss": 14.1635,
"step": 14
},
{
"epoch": 0.017214173002438674,
"grad_norm": 162.0,
"learning_rate": 6.422018348623854e-07,
"loss": 11.853,
"step": 15
},
{
"epoch": 0.018361784535934587,
"grad_norm": 160.0,
"learning_rate": 6.880733944954129e-07,
"loss": 12.7435,
"step": 16
},
{
"epoch": 0.019509396069430498,
"grad_norm": 158.0,
"learning_rate": 7.339449541284405e-07,
"loss": 11.8396,
"step": 17
},
{
"epoch": 0.020657007602926408,
"grad_norm": 165.0,
"learning_rate": 7.79816513761468e-07,
"loss": 11.5206,
"step": 18
},
{
"epoch": 0.021804619136422322,
"grad_norm": 177.0,
"learning_rate": 8.256880733944956e-07,
"loss": 11.5111,
"step": 19
},
{
"epoch": 0.022952230669918233,
"grad_norm": 157.0,
"learning_rate": 8.71559633027523e-07,
"loss": 14.7131,
"step": 20
},
{
"epoch": 0.024099842203414143,
"grad_norm": 172.0,
"learning_rate": 9.174311926605506e-07,
"loss": 11.1313,
"step": 21
},
{
"epoch": 0.025247453736910057,
"grad_norm": 194.0,
"learning_rate": 9.633027522935782e-07,
"loss": 16.6221,
"step": 22
},
{
"epoch": 0.026395065270405967,
"grad_norm": 268.0,
"learning_rate": 1.0091743119266057e-06,
"loss": 15.6201,
"step": 23
},
{
"epoch": 0.027542676803901878,
"grad_norm": 218.0,
"learning_rate": 1.055045871559633e-06,
"loss": 11.6275,
"step": 24
},
{
"epoch": 0.02869028833739779,
"grad_norm": 158.0,
"learning_rate": 1.1009174311926608e-06,
"loss": 15.5678,
"step": 25
},
{
"epoch": 0.029837899870893702,
"grad_norm": 236.0,
"learning_rate": 1.1467889908256882e-06,
"loss": 15.4133,
"step": 26
},
{
"epoch": 0.030985511404389616,
"grad_norm": 131.0,
"learning_rate": 1.1926605504587159e-06,
"loss": 12.6406,
"step": 27
},
{
"epoch": 0.032133122937885526,
"grad_norm": 139.0,
"learning_rate": 1.2385321100917433e-06,
"loss": 15.0131,
"step": 28
},
{
"epoch": 0.03328073447138144,
"grad_norm": 150.0,
"learning_rate": 1.2844036697247707e-06,
"loss": 13.4583,
"step": 29
},
{
"epoch": 0.03442834600487735,
"grad_norm": 166.0,
"learning_rate": 1.3302752293577984e-06,
"loss": 15.6894,
"step": 30
},
{
"epoch": 0.03557595753837326,
"grad_norm": 168.0,
"learning_rate": 1.3761467889908258e-06,
"loss": 13.8435,
"step": 31
},
{
"epoch": 0.036723569071869175,
"grad_norm": 191.0,
"learning_rate": 1.4220183486238535e-06,
"loss": 8.6607,
"step": 32
},
{
"epoch": 0.03787118060536508,
"grad_norm": 140.0,
"learning_rate": 1.467889908256881e-06,
"loss": 12.4132,
"step": 33
},
{
"epoch": 0.039018792138860996,
"grad_norm": 201.0,
"learning_rate": 1.5137614678899084e-06,
"loss": 13.8843,
"step": 34
},
{
"epoch": 0.04016640367235691,
"grad_norm": 164.0,
"learning_rate": 1.559633027522936e-06,
"loss": 15.1008,
"step": 35
},
{
"epoch": 0.041314015205852817,
"grad_norm": 284.0,
"learning_rate": 1.6055045871559635e-06,
"loss": 21.6636,
"step": 36
},
{
"epoch": 0.04246162673934873,
"grad_norm": 394.0,
"learning_rate": 1.6513761467889911e-06,
"loss": 14.0359,
"step": 37
},
{
"epoch": 0.043609238272844644,
"grad_norm": 161.0,
"learning_rate": 1.6972477064220186e-06,
"loss": 13.4677,
"step": 38
},
{
"epoch": 0.04475684980634055,
"grad_norm": 164.0,
"learning_rate": 1.743119266055046e-06,
"loss": 8.3447,
"step": 39
},
{
"epoch": 0.045904461339836465,
"grad_norm": 148.0,
"learning_rate": 1.7889908256880737e-06,
"loss": 6.5679,
"step": 40
},
{
"epoch": 0.04705207287333238,
"grad_norm": 168.0,
"learning_rate": 1.8348623853211011e-06,
"loss": 15.6762,
"step": 41
},
{
"epoch": 0.048199684406828286,
"grad_norm": 164.0,
"learning_rate": 1.8807339449541288e-06,
"loss": 6.2052,
"step": 42
},
{
"epoch": 0.0493472959403242,
"grad_norm": 103.5,
"learning_rate": 1.9266055045871564e-06,
"loss": 8.8464,
"step": 43
},
{
"epoch": 0.050494907473820114,
"grad_norm": 372.0,
"learning_rate": 1.9724770642201837e-06,
"loss": 15.5934,
"step": 44
},
{
"epoch": 0.05164251900731602,
"grad_norm": 210.0,
"learning_rate": 2.0183486238532113e-06,
"loss": 11.6431,
"step": 45
},
{
"epoch": 0.052790130540811935,
"grad_norm": 125.5,
"learning_rate": 2.064220183486239e-06,
"loss": 8.5935,
"step": 46
},
{
"epoch": 0.05393774207430785,
"grad_norm": 102.5,
"learning_rate": 2.110091743119266e-06,
"loss": 9.1192,
"step": 47
},
{
"epoch": 0.055085353607803755,
"grad_norm": 143.0,
"learning_rate": 2.155963302752294e-06,
"loss": 8.8696,
"step": 48
},
{
"epoch": 0.05623296514129967,
"grad_norm": 137.0,
"learning_rate": 2.2018348623853215e-06,
"loss": 11.2497,
"step": 49
},
{
"epoch": 0.05738057667479558,
"grad_norm": 129.0,
"learning_rate": 2.2477064220183487e-06,
"loss": 11.6115,
"step": 50
},
{
"epoch": 0.05852818820829149,
"grad_norm": 149.0,
"learning_rate": 2.2935779816513764e-06,
"loss": 13.9466,
"step": 51
},
{
"epoch": 0.059675799741787404,
"grad_norm": 120.5,
"learning_rate": 2.339449541284404e-06,
"loss": 9.2116,
"step": 52
},
{
"epoch": 0.06082341127528332,
"grad_norm": 240.0,
"learning_rate": 2.3853211009174317e-06,
"loss": 21.5138,
"step": 53
},
{
"epoch": 0.06197102280877923,
"grad_norm": 112.0,
"learning_rate": 2.431192660550459e-06,
"loss": 9.7849,
"step": 54
},
{
"epoch": 0.06311863434227515,
"grad_norm": 135.0,
"learning_rate": 2.4770642201834866e-06,
"loss": 12.795,
"step": 55
},
{
"epoch": 0.06426624587577105,
"grad_norm": 222.0,
"learning_rate": 2.522935779816514e-06,
"loss": 15.822,
"step": 56
},
{
"epoch": 0.06541385740926696,
"grad_norm": 137.0,
"learning_rate": 2.5688073394495415e-06,
"loss": 14.4134,
"step": 57
},
{
"epoch": 0.06656146894276288,
"grad_norm": 100.0,
"learning_rate": 2.6146788990825687e-06,
"loss": 10.1907,
"step": 58
},
{
"epoch": 0.06770908047625879,
"grad_norm": 154.0,
"learning_rate": 2.6605504587155968e-06,
"loss": 14.2824,
"step": 59
},
{
"epoch": 0.0688566920097547,
"grad_norm": 109.0,
"learning_rate": 2.706422018348624e-06,
"loss": 8.6855,
"step": 60
},
{
"epoch": 0.07000430354325061,
"grad_norm": 198.0,
"learning_rate": 2.7522935779816517e-06,
"loss": 14.2233,
"step": 61
},
{
"epoch": 0.07115191507674652,
"grad_norm": 134.0,
"learning_rate": 2.798165137614679e-06,
"loss": 11.8776,
"step": 62
},
{
"epoch": 0.07229952661024243,
"grad_norm": 112.5,
"learning_rate": 2.844036697247707e-06,
"loss": 9.321,
"step": 63
},
{
"epoch": 0.07344713814373835,
"grad_norm": 118.0,
"learning_rate": 2.8899082568807342e-06,
"loss": 10.727,
"step": 64
},
{
"epoch": 0.07459474967723426,
"grad_norm": 167.0,
"learning_rate": 2.935779816513762e-06,
"loss": 13.9879,
"step": 65
},
{
"epoch": 0.07574236121073016,
"grad_norm": 100.0,
"learning_rate": 2.981651376146789e-06,
"loss": 7.9334,
"step": 66
},
{
"epoch": 0.07688997274422608,
"grad_norm": 153.0,
"learning_rate": 3.0275229357798168e-06,
"loss": 14.9343,
"step": 67
},
{
"epoch": 0.07803758427772199,
"grad_norm": 119.0,
"learning_rate": 3.073394495412844e-06,
"loss": 8.6676,
"step": 68
},
{
"epoch": 0.0791851958112179,
"grad_norm": 117.5,
"learning_rate": 3.119266055045872e-06,
"loss": 10.101,
"step": 69
},
{
"epoch": 0.08033280734471382,
"grad_norm": 85.5,
"learning_rate": 3.1651376146788993e-06,
"loss": 7.5899,
"step": 70
},
{
"epoch": 0.08148041887820973,
"grad_norm": 174.0,
"learning_rate": 3.211009174311927e-06,
"loss": 15.9673,
"step": 71
},
{
"epoch": 0.08262803041170563,
"grad_norm": 221.0,
"learning_rate": 3.256880733944954e-06,
"loss": 14.1455,
"step": 72
},
{
"epoch": 0.08377564194520155,
"grad_norm": 172.0,
"learning_rate": 3.3027522935779823e-06,
"loss": 15.9228,
"step": 73
},
{
"epoch": 0.08492325347869746,
"grad_norm": 144.0,
"learning_rate": 3.3486238532110095e-06,
"loss": 12.6043,
"step": 74
},
{
"epoch": 0.08607086501219337,
"grad_norm": 118.5,
"learning_rate": 3.394495412844037e-06,
"loss": 9.2068,
"step": 75
},
{
"epoch": 0.08721847654568929,
"grad_norm": 147.0,
"learning_rate": 3.4403669724770644e-06,
"loss": 11.8722,
"step": 76
},
{
"epoch": 0.0883660880791852,
"grad_norm": 119.5,
"learning_rate": 3.486238532110092e-06,
"loss": 10.4207,
"step": 77
},
{
"epoch": 0.0895136996126811,
"grad_norm": 170.0,
"learning_rate": 3.5321100917431193e-06,
"loss": 14.4936,
"step": 78
},
{
"epoch": 0.09066131114617702,
"grad_norm": 183.0,
"learning_rate": 3.5779816513761473e-06,
"loss": 14.2192,
"step": 79
},
{
"epoch": 0.09180892267967293,
"grad_norm": 128.0,
"learning_rate": 3.6238532110091746e-06,
"loss": 12.4628,
"step": 80
},
{
"epoch": 0.09295653421316884,
"grad_norm": 100.0,
"learning_rate": 3.6697247706422022e-06,
"loss": 5.9004,
"step": 81
},
{
"epoch": 0.09410414574666476,
"grad_norm": 163.0,
"learning_rate": 3.7155963302752295e-06,
"loss": 12.766,
"step": 82
},
{
"epoch": 0.09525175728016066,
"grad_norm": 202.0,
"learning_rate": 3.7614678899082575e-06,
"loss": 14.3118,
"step": 83
},
{
"epoch": 0.09639936881365657,
"grad_norm": 314.0,
"learning_rate": 3.8073394495412848e-06,
"loss": 12.7559,
"step": 84
},
{
"epoch": 0.09754698034715249,
"grad_norm": 100.0,
"learning_rate": 3.853211009174313e-06,
"loss": 7.6448,
"step": 85
},
{
"epoch": 0.0986945918806484,
"grad_norm": 135.0,
"learning_rate": 3.89908256880734e-06,
"loss": 11.1222,
"step": 86
},
{
"epoch": 0.0998422034141443,
"grad_norm": 176.0,
"learning_rate": 3.944954128440367e-06,
"loss": 11.0153,
"step": 87
},
{
"epoch": 0.10098981494764023,
"grad_norm": 130.0,
"learning_rate": 3.9908256880733945e-06,
"loss": 11.3109,
"step": 88
},
{
"epoch": 0.10213742648113613,
"grad_norm": 176.0,
"learning_rate": 4.036697247706423e-06,
"loss": 11.3729,
"step": 89
},
{
"epoch": 0.10328503801463204,
"grad_norm": 132.0,
"learning_rate": 4.08256880733945e-06,
"loss": 10.5579,
"step": 90
},
{
"epoch": 0.10443264954812796,
"grad_norm": 126.0,
"learning_rate": 4.128440366972478e-06,
"loss": 9.2442,
"step": 91
},
{
"epoch": 0.10558026108162387,
"grad_norm": 151.0,
"learning_rate": 4.174311926605505e-06,
"loss": 13.6998,
"step": 92
},
{
"epoch": 0.10672787261511978,
"grad_norm": 99.5,
"learning_rate": 4.220183486238532e-06,
"loss": 8.6871,
"step": 93
},
{
"epoch": 0.1078754841486157,
"grad_norm": 128.0,
"learning_rate": 4.26605504587156e-06,
"loss": 8.4919,
"step": 94
},
{
"epoch": 0.1090230956821116,
"grad_norm": 132.0,
"learning_rate": 4.311926605504588e-06,
"loss": 8.9568,
"step": 95
},
{
"epoch": 0.11017070721560751,
"grad_norm": 135.0,
"learning_rate": 4.357798165137615e-06,
"loss": 11.2536,
"step": 96
},
{
"epoch": 0.11131831874910343,
"grad_norm": 141.0,
"learning_rate": 4.403669724770643e-06,
"loss": 10.4686,
"step": 97
},
{
"epoch": 0.11246593028259934,
"grad_norm": 78.5,
"learning_rate": 4.44954128440367e-06,
"loss": 4.7855,
"step": 98
},
{
"epoch": 0.11361354181609525,
"grad_norm": 126.5,
"learning_rate": 4.4954128440366975e-06,
"loss": 8.6237,
"step": 99
},
{
"epoch": 0.11476115334959117,
"grad_norm": 104.5,
"learning_rate": 4.541284403669725e-06,
"loss": 6.5662,
"step": 100
},
{
"epoch": 0.11476115334959117,
"eval_accuracy": 0.46,
"eval_loss": 10.765486717224121,
"eval_runtime": 49.6485,
"eval_samples_per_second": 2.014,
"eval_steps_per_second": 2.014,
"step": 100
},
{
"epoch": 0.11590876488308707,
"grad_norm": 103.0,
"learning_rate": 4.587155963302753e-06,
"loss": 6.5649,
"step": 101
},
{
"epoch": 0.11705637641658298,
"grad_norm": 144.0,
"learning_rate": 4.63302752293578e-06,
"loss": 8.2535,
"step": 102
},
{
"epoch": 0.1182039879500789,
"grad_norm": 135.0,
"learning_rate": 4.678899082568808e-06,
"loss": 11.0001,
"step": 103
},
{
"epoch": 0.11935159948357481,
"grad_norm": 109.0,
"learning_rate": 4.724770642201835e-06,
"loss": 8.2321,
"step": 104
},
{
"epoch": 0.12049921101707071,
"grad_norm": 134.0,
"learning_rate": 4.770642201834863e-06,
"loss": 10.8236,
"step": 105
},
{
"epoch": 0.12164682255056664,
"grad_norm": 133.0,
"learning_rate": 4.816513761467891e-06,
"loss": 10.03,
"step": 106
},
{
"epoch": 0.12279443408406254,
"grad_norm": 148.0,
"learning_rate": 4.862385321100918e-06,
"loss": 13.1908,
"step": 107
},
{
"epoch": 0.12394204561755846,
"grad_norm": 64.0,
"learning_rate": 4.908256880733945e-06,
"loss": 3.6086,
"step": 108
},
{
"epoch": 0.12508965715105436,
"grad_norm": 139.0,
"learning_rate": 4.954128440366973e-06,
"loss": 10.9146,
"step": 109
},
{
"epoch": 0.1262372686845503,
"grad_norm": 100.5,
"learning_rate": 5e-06,
"loss": 7.6266,
"step": 110
},
{
"epoch": 0.1273848802180462,
"grad_norm": 102.0,
"learning_rate": 5.045871559633028e-06,
"loss": 8.3553,
"step": 111
},
{
"epoch": 0.1285324917515421,
"grad_norm": 145.0,
"learning_rate": 5.091743119266055e-06,
"loss": 8.7646,
"step": 112
},
{
"epoch": 0.129680103285038,
"grad_norm": 178.0,
"learning_rate": 5.137614678899083e-06,
"loss": 12.6374,
"step": 113
},
{
"epoch": 0.13082771481853392,
"grad_norm": 91.5,
"learning_rate": 5.18348623853211e-06,
"loss": 5.8455,
"step": 114
},
{
"epoch": 0.13197532635202983,
"grad_norm": 122.0,
"learning_rate": 5.229357798165137e-06,
"loss": 9.7438,
"step": 115
},
{
"epoch": 0.13312293788552576,
"grad_norm": 192.0,
"learning_rate": 5.275229357798165e-06,
"loss": 9.9915,
"step": 116
},
{
"epoch": 0.13427054941902167,
"grad_norm": 108.0,
"learning_rate": 5.3211009174311936e-06,
"loss": 7.6686,
"step": 117
},
{
"epoch": 0.13541816095251757,
"grad_norm": 153.0,
"learning_rate": 5.366972477064221e-06,
"loss": 10.4111,
"step": 118
},
{
"epoch": 0.13656577248601348,
"grad_norm": 139.0,
"learning_rate": 5.412844036697248e-06,
"loss": 6.46,
"step": 119
},
{
"epoch": 0.1377133840195094,
"grad_norm": 79.0,
"learning_rate": 5.458715596330275e-06,
"loss": 5.2337,
"step": 120
},
{
"epoch": 0.1388609955530053,
"grad_norm": 114.5,
"learning_rate": 5.504587155963303e-06,
"loss": 5.2836,
"step": 121
},
{
"epoch": 0.14000860708650123,
"grad_norm": 99.5,
"learning_rate": 5.5504587155963306e-06,
"loss": 7.6412,
"step": 122
},
{
"epoch": 0.14115621861999714,
"grad_norm": 147.0,
"learning_rate": 5.596330275229358e-06,
"loss": 9.4328,
"step": 123
},
{
"epoch": 0.14230383015349304,
"grad_norm": 114.5,
"learning_rate": 5.642201834862385e-06,
"loss": 7.6121,
"step": 124
},
{
"epoch": 0.14345144168698895,
"grad_norm": 131.0,
"learning_rate": 5.688073394495414e-06,
"loss": 8.1481,
"step": 125
},
{
"epoch": 0.14459905322048486,
"grad_norm": 124.5,
"learning_rate": 5.733944954128441e-06,
"loss": 6.9154,
"step": 126
},
{
"epoch": 0.14574666475398076,
"grad_norm": 125.0,
"learning_rate": 5.7798165137614684e-06,
"loss": 7.5579,
"step": 127
},
{
"epoch": 0.1468942762874767,
"grad_norm": 69.0,
"learning_rate": 5.825688073394496e-06,
"loss": 4.5767,
"step": 128
},
{
"epoch": 0.1480418878209726,
"grad_norm": 136.0,
"learning_rate": 5.871559633027524e-06,
"loss": 8.226,
"step": 129
},
{
"epoch": 0.1491894993544685,
"grad_norm": 592.0,
"learning_rate": 5.917431192660551e-06,
"loss": 4.7686,
"step": 130
},
{
"epoch": 0.15033711088796442,
"grad_norm": 199.0,
"learning_rate": 5.963302752293578e-06,
"loss": 8.1914,
"step": 131
},
{
"epoch": 0.15148472242146033,
"grad_norm": 99.0,
"learning_rate": 6.0091743119266054e-06,
"loss": 4.6827,
"step": 132
},
{
"epoch": 0.15263233395495623,
"grad_norm": 79.0,
"learning_rate": 6.0550458715596335e-06,
"loss": 4.0036,
"step": 133
},
{
"epoch": 0.15377994548845217,
"grad_norm": 104.0,
"learning_rate": 6.100917431192661e-06,
"loss": 5.5383,
"step": 134
},
{
"epoch": 0.15492755702194808,
"grad_norm": 89.5,
"learning_rate": 6.146788990825688e-06,
"loss": 5.6737,
"step": 135
},
{
"epoch": 0.15607516855544398,
"grad_norm": 126.0,
"learning_rate": 6.192660550458715e-06,
"loss": 6.3379,
"step": 136
},
{
"epoch": 0.1572227800889399,
"grad_norm": 106.0,
"learning_rate": 6.238532110091744e-06,
"loss": 5.8609,
"step": 137
},
{
"epoch": 0.1583703916224358,
"grad_norm": 74.0,
"learning_rate": 6.284403669724771e-06,
"loss": 2.5903,
"step": 138
},
{
"epoch": 0.1595180031559317,
"grad_norm": 166.0,
"learning_rate": 6.330275229357799e-06,
"loss": 6.5836,
"step": 139
},
{
"epoch": 0.16066561468942764,
"grad_norm": 132.0,
"learning_rate": 6.376146788990826e-06,
"loss": 4.7941,
"step": 140
},
{
"epoch": 0.16181322622292355,
"grad_norm": 79.5,
"learning_rate": 6.422018348623854e-06,
"loss": 3.4315,
"step": 141
},
{
"epoch": 0.16296083775641945,
"grad_norm": 90.0,
"learning_rate": 6.467889908256881e-06,
"loss": 2.8439,
"step": 142
},
{
"epoch": 0.16410844928991536,
"grad_norm": 147.0,
"learning_rate": 6.513761467889908e-06,
"loss": 5.9459,
"step": 143
},
{
"epoch": 0.16525606082341127,
"grad_norm": 127.5,
"learning_rate": 6.559633027522936e-06,
"loss": 5.9421,
"step": 144
},
{
"epoch": 0.1664036723569072,
"grad_norm": 108.5,
"learning_rate": 6.6055045871559645e-06,
"loss": 4.3347,
"step": 145
},
{
"epoch": 0.1675512838904031,
"grad_norm": 110.0,
"learning_rate": 6.651376146788992e-06,
"loss": 2.862,
"step": 146
},
{
"epoch": 0.16869889542389901,
"grad_norm": 91.5,
"learning_rate": 6.697247706422019e-06,
"loss": 3.0382,
"step": 147
},
{
"epoch": 0.16984650695739492,
"grad_norm": 90.0,
"learning_rate": 6.743119266055046e-06,
"loss": 2.4137,
"step": 148
},
{
"epoch": 0.17099411849089083,
"grad_norm": 205.0,
"learning_rate": 6.788990825688074e-06,
"loss": 3.6585,
"step": 149
},
{
"epoch": 0.17214173002438674,
"grad_norm": 132.0,
"learning_rate": 6.8348623853211015e-06,
"loss": 3.3452,
"step": 150
},
{
"epoch": 0.17328934155788267,
"grad_norm": 102.5,
"learning_rate": 6.880733944954129e-06,
"loss": 2.6872,
"step": 151
},
{
"epoch": 0.17443695309137858,
"grad_norm": 92.5,
"learning_rate": 6.926605504587156e-06,
"loss": 2.7081,
"step": 152
},
{
"epoch": 0.17558456462487448,
"grad_norm": 97.0,
"learning_rate": 6.972477064220184e-06,
"loss": 1.789,
"step": 153
},
{
"epoch": 0.1767321761583704,
"grad_norm": 96.5,
"learning_rate": 7.018348623853211e-06,
"loss": 2.1933,
"step": 154
},
{
"epoch": 0.1778797876918663,
"grad_norm": 117.0,
"learning_rate": 7.0642201834862385e-06,
"loss": 1.5972,
"step": 155
},
{
"epoch": 0.1790273992253622,
"grad_norm": 70.0,
"learning_rate": 7.110091743119267e-06,
"loss": 1.6302,
"step": 156
},
{
"epoch": 0.18017501075885814,
"grad_norm": 50.0,
"learning_rate": 7.155963302752295e-06,
"loss": 1.1936,
"step": 157
},
{
"epoch": 0.18132262229235405,
"grad_norm": 71.5,
"learning_rate": 7.201834862385322e-06,
"loss": 1.2134,
"step": 158
},
{
"epoch": 0.18247023382584995,
"grad_norm": 37.75,
"learning_rate": 7.247706422018349e-06,
"loss": 0.8042,
"step": 159
},
{
"epoch": 0.18361784535934586,
"grad_norm": 54.75,
"learning_rate": 7.293577981651376e-06,
"loss": 0.7016,
"step": 160
},
{
"epoch": 0.18476545689284177,
"grad_norm": 141.0,
"learning_rate": 7.3394495412844045e-06,
"loss": 1.6214,
"step": 161
},
{
"epoch": 0.18591306842633767,
"grad_norm": 118.0,
"learning_rate": 7.385321100917432e-06,
"loss": 1.4091,
"step": 162
},
{
"epoch": 0.1870606799598336,
"grad_norm": 42.0,
"learning_rate": 7.431192660550459e-06,
"loss": 0.636,
"step": 163
},
{
"epoch": 0.18820829149332952,
"grad_norm": 165.0,
"learning_rate": 7.477064220183486e-06,
"loss": 1.8344,
"step": 164
},
{
"epoch": 0.18935590302682542,
"grad_norm": 62.0,
"learning_rate": 7.522935779816515e-06,
"loss": 0.5202,
"step": 165
},
{
"epoch": 0.19050351456032133,
"grad_norm": 87.5,
"learning_rate": 7.568807339449542e-06,
"loss": 1.1639,
"step": 166
},
{
"epoch": 0.19165112609381724,
"grad_norm": 53.0,
"learning_rate": 7.6146788990825695e-06,
"loss": 0.907,
"step": 167
},
{
"epoch": 0.19279873762731314,
"grad_norm": 59.5,
"learning_rate": 7.660550458715596e-06,
"loss": 1.0624,
"step": 168
},
{
"epoch": 0.19394634916080908,
"grad_norm": 37.0,
"learning_rate": 7.706422018348626e-06,
"loss": 0.6051,
"step": 169
},
{
"epoch": 0.19509396069430499,
"grad_norm": 50.75,
"learning_rate": 7.752293577981652e-06,
"loss": 0.9568,
"step": 170
},
{
"epoch": 0.1962415722278009,
"grad_norm": 99.5,
"learning_rate": 7.79816513761468e-06,
"loss": 1.0009,
"step": 171
},
{
"epoch": 0.1973891837612968,
"grad_norm": 58.75,
"learning_rate": 7.844036697247707e-06,
"loss": 1.2179,
"step": 172
},
{
"epoch": 0.1985367952947927,
"grad_norm": 30.375,
"learning_rate": 7.889908256880735e-06,
"loss": 0.2789,
"step": 173
},
{
"epoch": 0.1996844068282886,
"grad_norm": 48.5,
"learning_rate": 7.935779816513763e-06,
"loss": 0.7911,
"step": 174
},
{
"epoch": 0.20083201836178455,
"grad_norm": 42.25,
"learning_rate": 7.981651376146789e-06,
"loss": 0.8686,
"step": 175
},
{
"epoch": 0.20197962989528045,
"grad_norm": 141.0,
"learning_rate": 8.027522935779817e-06,
"loss": 1.1276,
"step": 176
},
{
"epoch": 0.20312724142877636,
"grad_norm": 156.0,
"learning_rate": 8.073394495412845e-06,
"loss": 0.8758,
"step": 177
},
{
"epoch": 0.20427485296227227,
"grad_norm": 32.75,
"learning_rate": 8.119266055045872e-06,
"loss": 0.6642,
"step": 178
},
{
"epoch": 0.20542246449576818,
"grad_norm": 56.25,
"learning_rate": 8.1651376146789e-06,
"loss": 1.0594,
"step": 179
},
{
"epoch": 0.20657007602926408,
"grad_norm": 34.5,
"learning_rate": 8.211009174311926e-06,
"loss": 0.6556,
"step": 180
},
{
"epoch": 0.20771768756276002,
"grad_norm": 80.5,
"learning_rate": 8.256880733944956e-06,
"loss": 0.8868,
"step": 181
},
{
"epoch": 0.20886529909625592,
"grad_norm": 47.5,
"learning_rate": 8.302752293577982e-06,
"loss": 0.7725,
"step": 182
},
{
"epoch": 0.21001291062975183,
"grad_norm": 23.0,
"learning_rate": 8.34862385321101e-06,
"loss": 0.6719,
"step": 183
},
{
"epoch": 0.21116052216324774,
"grad_norm": 72.0,
"learning_rate": 8.394495412844037e-06,
"loss": 0.8492,
"step": 184
},
{
"epoch": 0.21230813369674364,
"grad_norm": 73.0,
"learning_rate": 8.440366972477065e-06,
"loss": 0.7163,
"step": 185
},
{
"epoch": 0.21345574523023955,
"grad_norm": 82.0,
"learning_rate": 8.486238532110093e-06,
"loss": 0.7227,
"step": 186
},
{
"epoch": 0.2146033567637355,
"grad_norm": 48.75,
"learning_rate": 8.53211009174312e-06,
"loss": 0.8237,
"step": 187
},
{
"epoch": 0.2157509682972314,
"grad_norm": 30.75,
"learning_rate": 8.577981651376147e-06,
"loss": 0.7007,
"step": 188
},
{
"epoch": 0.2168985798307273,
"grad_norm": 76.0,
"learning_rate": 8.623853211009175e-06,
"loss": 0.7568,
"step": 189
},
{
"epoch": 0.2180461913642232,
"grad_norm": 126.0,
"learning_rate": 8.669724770642203e-06,
"loss": 0.8139,
"step": 190
},
{
"epoch": 0.21919380289771911,
"grad_norm": 67.5,
"learning_rate": 8.71559633027523e-06,
"loss": 0.7062,
"step": 191
},
{
"epoch": 0.22034141443121502,
"grad_norm": 26.375,
"learning_rate": 8.761467889908258e-06,
"loss": 0.5425,
"step": 192
},
{
"epoch": 0.22148902596471096,
"grad_norm": 105.5,
"learning_rate": 8.807339449541286e-06,
"loss": 0.8822,
"step": 193
},
{
"epoch": 0.22263663749820686,
"grad_norm": 131.0,
"learning_rate": 8.853211009174312e-06,
"loss": 0.9047,
"step": 194
},
{
"epoch": 0.22378424903170277,
"grad_norm": 56.5,
"learning_rate": 8.89908256880734e-06,
"loss": 0.5039,
"step": 195
},
{
"epoch": 0.22493186056519868,
"grad_norm": 73.5,
"learning_rate": 8.944954128440367e-06,
"loss": 0.7597,
"step": 196
},
{
"epoch": 0.22607947209869458,
"grad_norm": 56.25,
"learning_rate": 8.990825688073395e-06,
"loss": 0.742,
"step": 197
},
{
"epoch": 0.2272270836321905,
"grad_norm": 71.5,
"learning_rate": 9.036697247706423e-06,
"loss": 0.892,
"step": 198
},
{
"epoch": 0.22837469516568643,
"grad_norm": 33.0,
"learning_rate": 9.08256880733945e-06,
"loss": 0.6746,
"step": 199
},
{
"epoch": 0.22952230669918233,
"grad_norm": 95.0,
"learning_rate": 9.128440366972477e-06,
"loss": 0.8428,
"step": 200
},
{
"epoch": 0.22952230669918233,
"eval_accuracy": 0.23,
"eval_loss": 0.7526699900627136,
"eval_runtime": 49.2923,
"eval_samples_per_second": 2.029,
"eval_steps_per_second": 2.029,
"step": 200
},
{
"epoch": 0.23066991823267824,
"grad_norm": 43.0,
"learning_rate": 9.174311926605506e-06,
"loss": 0.6504,
"step": 201
},
{
"epoch": 0.23181752976617415,
"grad_norm": 46.75,
"learning_rate": 9.220183486238534e-06,
"loss": 0.7568,
"step": 202
},
{
"epoch": 0.23296514129967005,
"grad_norm": 76.5,
"learning_rate": 9.26605504587156e-06,
"loss": 0.5601,
"step": 203
},
{
"epoch": 0.23411275283316596,
"grad_norm": 82.0,
"learning_rate": 9.311926605504588e-06,
"loss": 0.6661,
"step": 204
},
{
"epoch": 0.2352603643666619,
"grad_norm": 63.75,
"learning_rate": 9.357798165137616e-06,
"loss": 0.7619,
"step": 205
},
{
"epoch": 0.2364079759001578,
"grad_norm": 28.5,
"learning_rate": 9.403669724770643e-06,
"loss": 0.6332,
"step": 206
},
{
"epoch": 0.2375555874336537,
"grad_norm": 48.75,
"learning_rate": 9.44954128440367e-06,
"loss": 0.8103,
"step": 207
},
{
"epoch": 0.23870319896714962,
"grad_norm": 32.25,
"learning_rate": 9.495412844036697e-06,
"loss": 0.8623,
"step": 208
},
{
"epoch": 0.23985081050064552,
"grad_norm": 51.25,
"learning_rate": 9.541284403669727e-06,
"loss": 0.6734,
"step": 209
},
{
"epoch": 0.24099842203414143,
"grad_norm": 106.0,
"learning_rate": 9.587155963302753e-06,
"loss": 0.7637,
"step": 210
},
{
"epoch": 0.24214603356763736,
"grad_norm": 43.5,
"learning_rate": 9.633027522935781e-06,
"loss": 0.6827,
"step": 211
},
{
"epoch": 0.24329364510113327,
"grad_norm": 56.25,
"learning_rate": 9.678899082568808e-06,
"loss": 0.9193,
"step": 212
},
{
"epoch": 0.24444125663462918,
"grad_norm": 67.5,
"learning_rate": 9.724770642201836e-06,
"loss": 0.8784,
"step": 213
},
{
"epoch": 0.24558886816812509,
"grad_norm": 61.0,
"learning_rate": 9.770642201834864e-06,
"loss": 0.6853,
"step": 214
},
{
"epoch": 0.246736479701621,
"grad_norm": 33.5,
"learning_rate": 9.81651376146789e-06,
"loss": 0.6893,
"step": 215
},
{
"epoch": 0.24788409123511693,
"grad_norm": 20.5,
"learning_rate": 9.862385321100918e-06,
"loss": 0.6858,
"step": 216
},
{
"epoch": 0.24903170276861283,
"grad_norm": 51.0,
"learning_rate": 9.908256880733946e-06,
"loss": 0.5894,
"step": 217
},
{
"epoch": 0.2501793143021087,
"grad_norm": 61.0,
"learning_rate": 9.954128440366973e-06,
"loss": 0.9096,
"step": 218
},
{
"epoch": 0.2513269258356047,
"grad_norm": 28.625,
"learning_rate": 1e-05,
"loss": 0.801,
"step": 219
},
{
"epoch": 0.2524745373691006,
"grad_norm": 41.0,
"learning_rate": 1.0045871559633029e-05,
"loss": 0.6585,
"step": 220
},
{
"epoch": 0.2536221489025965,
"grad_norm": 39.75,
"learning_rate": 1.0091743119266055e-05,
"loss": 0.7587,
"step": 221
},
{
"epoch": 0.2547697604360924,
"grad_norm": 40.0,
"learning_rate": 1.0137614678899083e-05,
"loss": 0.7094,
"step": 222
},
{
"epoch": 0.2559173719695883,
"grad_norm": 684.0,
"learning_rate": 1.018348623853211e-05,
"loss": 0.7388,
"step": 223
},
{
"epoch": 0.2570649835030842,
"grad_norm": 54.5,
"learning_rate": 1.0229357798165138e-05,
"loss": 0.7495,
"step": 224
},
{
"epoch": 0.2582125950365801,
"grad_norm": 65.5,
"learning_rate": 1.0275229357798166e-05,
"loss": 0.834,
"step": 225
},
{
"epoch": 0.259360206570076,
"grad_norm": 68.5,
"learning_rate": 1.0321100917431192e-05,
"loss": 0.9911,
"step": 226
},
{
"epoch": 0.26050781810357193,
"grad_norm": 59.75,
"learning_rate": 1.036697247706422e-05,
"loss": 0.7996,
"step": 227
},
{
"epoch": 0.26165542963706784,
"grad_norm": 39.0,
"learning_rate": 1.041284403669725e-05,
"loss": 0.7586,
"step": 228
},
{
"epoch": 0.26280304117056374,
"grad_norm": 41.0,
"learning_rate": 1.0458715596330275e-05,
"loss": 0.6575,
"step": 229
},
{
"epoch": 0.26395065270405965,
"grad_norm": 25.75,
"learning_rate": 1.0504587155963305e-05,
"loss": 0.5676,
"step": 230
},
{
"epoch": 0.2650982642375556,
"grad_norm": 38.5,
"learning_rate": 1.055045871559633e-05,
"loss": 0.7107,
"step": 231
},
{
"epoch": 0.2662458757710515,
"grad_norm": 29.0,
"learning_rate": 1.0596330275229359e-05,
"loss": 0.5768,
"step": 232
},
{
"epoch": 0.26739348730454743,
"grad_norm": 67.0,
"learning_rate": 1.0642201834862387e-05,
"loss": 0.8002,
"step": 233
},
{
"epoch": 0.26854109883804334,
"grad_norm": 92.0,
"learning_rate": 1.0688073394495414e-05,
"loss": 0.9373,
"step": 234
},
{
"epoch": 0.26968871037153924,
"grad_norm": 95.0,
"learning_rate": 1.0733944954128442e-05,
"loss": 0.9883,
"step": 235
},
{
"epoch": 0.27083632190503515,
"grad_norm": 32.25,
"learning_rate": 1.077981651376147e-05,
"loss": 0.3327,
"step": 236
},
{
"epoch": 0.27198393343853106,
"grad_norm": 38.75,
"learning_rate": 1.0825688073394496e-05,
"loss": 0.9128,
"step": 237
},
{
"epoch": 0.27313154497202696,
"grad_norm": 113.5,
"learning_rate": 1.0871559633027524e-05,
"loss": 0.7185,
"step": 238
},
{
"epoch": 0.27427915650552287,
"grad_norm": 78.5,
"learning_rate": 1.091743119266055e-05,
"loss": 0.7406,
"step": 239
},
{
"epoch": 0.2754267680390188,
"grad_norm": 54.25,
"learning_rate": 1.0963302752293579e-05,
"loss": 0.5355,
"step": 240
},
{
"epoch": 0.2765743795725147,
"grad_norm": 88.0,
"learning_rate": 1.1009174311926607e-05,
"loss": 0.7876,
"step": 241
},
{
"epoch": 0.2777219911060106,
"grad_norm": 25.125,
"learning_rate": 1.1055045871559633e-05,
"loss": 0.7005,
"step": 242
},
{
"epoch": 0.27886960263950655,
"grad_norm": 62.0,
"learning_rate": 1.1100917431192661e-05,
"loss": 0.6772,
"step": 243
},
{
"epoch": 0.28001721417300246,
"grad_norm": 88.5,
"learning_rate": 1.114678899082569e-05,
"loss": 0.7296,
"step": 244
},
{
"epoch": 0.28116482570649837,
"grad_norm": 29.375,
"learning_rate": 1.1192660550458716e-05,
"loss": 0.7339,
"step": 245
},
{
"epoch": 0.2823124372399943,
"grad_norm": 21.75,
"learning_rate": 1.1238532110091744e-05,
"loss": 0.5743,
"step": 246
},
{
"epoch": 0.2834600487734902,
"grad_norm": 127.5,
"learning_rate": 1.128440366972477e-05,
"loss": 0.9532,
"step": 247
},
{
"epoch": 0.2846076603069861,
"grad_norm": 97.0,
"learning_rate": 1.1330275229357798e-05,
"loss": 0.9855,
"step": 248
},
{
"epoch": 0.285755271840482,
"grad_norm": 54.25,
"learning_rate": 1.1376146788990828e-05,
"loss": 0.6011,
"step": 249
},
{
"epoch": 0.2869028833739779,
"grad_norm": 27.125,
"learning_rate": 1.1422018348623853e-05,
"loss": 0.4934,
"step": 250
},
{
"epoch": 0.2880504949074738,
"grad_norm": 156.0,
"learning_rate": 1.1467889908256882e-05,
"loss": 1.0312,
"step": 251
},
{
"epoch": 0.2891981064409697,
"grad_norm": 31.5,
"learning_rate": 1.151376146788991e-05,
"loss": 0.6735,
"step": 252
},
{
"epoch": 0.2903457179744656,
"grad_norm": 26.0,
"learning_rate": 1.1559633027522937e-05,
"loss": 0.5176,
"step": 253
},
{
"epoch": 0.29149332950796153,
"grad_norm": 28.0,
"learning_rate": 1.1605504587155965e-05,
"loss": 0.7067,
"step": 254
},
{
"epoch": 0.2926409410414575,
"grad_norm": 50.75,
"learning_rate": 1.1651376146788991e-05,
"loss": 0.5816,
"step": 255
},
{
"epoch": 0.2937885525749534,
"grad_norm": 33.0,
"learning_rate": 1.169724770642202e-05,
"loss": 0.5099,
"step": 256
},
{
"epoch": 0.2949361641084493,
"grad_norm": 63.25,
"learning_rate": 1.1743119266055047e-05,
"loss": 0.6038,
"step": 257
},
{
"epoch": 0.2960837756419452,
"grad_norm": 152.0,
"learning_rate": 1.1788990825688074e-05,
"loss": 1.2612,
"step": 258
},
{
"epoch": 0.2972313871754411,
"grad_norm": 55.5,
"learning_rate": 1.1834862385321102e-05,
"loss": 0.8309,
"step": 259
},
{
"epoch": 0.298378998708937,
"grad_norm": 49.75,
"learning_rate": 1.188073394495413e-05,
"loss": 0.7434,
"step": 260
},
{
"epoch": 0.29952661024243293,
"grad_norm": 38.25,
"learning_rate": 1.1926605504587156e-05,
"loss": 0.6988,
"step": 261
},
{
"epoch": 0.30067422177592884,
"grad_norm": 31.25,
"learning_rate": 1.1972477064220184e-05,
"loss": 0.674,
"step": 262
},
{
"epoch": 0.30182183330942475,
"grad_norm": 61.25,
"learning_rate": 1.2018348623853211e-05,
"loss": 0.8105,
"step": 263
},
{
"epoch": 0.30296944484292065,
"grad_norm": 67.0,
"learning_rate": 1.2064220183486239e-05,
"loss": 0.7834,
"step": 264
},
{
"epoch": 0.30411705637641656,
"grad_norm": 34.0,
"learning_rate": 1.2110091743119267e-05,
"loss": 0.6694,
"step": 265
},
{
"epoch": 0.30526466790991247,
"grad_norm": 48.75,
"learning_rate": 1.2155963302752293e-05,
"loss": 0.4389,
"step": 266
},
{
"epoch": 0.30641227944340843,
"grad_norm": 45.0,
"learning_rate": 1.2201834862385321e-05,
"loss": 0.9619,
"step": 267
},
{
"epoch": 0.30755989097690434,
"grad_norm": 27.25,
"learning_rate": 1.2247706422018351e-05,
"loss": 0.8181,
"step": 268
},
{
"epoch": 0.30870750251040024,
"grad_norm": 78.5,
"learning_rate": 1.2293577981651376e-05,
"loss": 0.8289,
"step": 269
},
{
"epoch": 0.30985511404389615,
"grad_norm": 29.625,
"learning_rate": 1.2339449541284406e-05,
"loss": 0.66,
"step": 270
},
{
"epoch": 0.31100272557739206,
"grad_norm": 51.25,
"learning_rate": 1.238532110091743e-05,
"loss": 0.6833,
"step": 271
},
{
"epoch": 0.31215033711088797,
"grad_norm": 45.0,
"learning_rate": 1.243119266055046e-05,
"loss": 0.6545,
"step": 272
},
{
"epoch": 0.3132979486443839,
"grad_norm": 35.5,
"learning_rate": 1.2477064220183488e-05,
"loss": 0.6642,
"step": 273
},
{
"epoch": 0.3144455601778798,
"grad_norm": 27.75,
"learning_rate": 1.2522935779816515e-05,
"loss": 0.7786,
"step": 274
},
{
"epoch": 0.3155931717113757,
"grad_norm": 103.0,
"learning_rate": 1.2568807339449543e-05,
"loss": 0.9578,
"step": 275
},
{
"epoch": 0.3167407832448716,
"grad_norm": 61.75,
"learning_rate": 1.261467889908257e-05,
"loss": 0.5513,
"step": 276
},
{
"epoch": 0.3178883947783675,
"grad_norm": 86.5,
"learning_rate": 1.2660550458715597e-05,
"loss": 0.855,
"step": 277
},
{
"epoch": 0.3190360063118634,
"grad_norm": 47.0,
"learning_rate": 1.2706422018348625e-05,
"loss": 0.7903,
"step": 278
},
{
"epoch": 0.32018361784535937,
"grad_norm": 21.125,
"learning_rate": 1.2752293577981652e-05,
"loss": 0.6084,
"step": 279
},
{
"epoch": 0.3213312293788553,
"grad_norm": 53.0,
"learning_rate": 1.279816513761468e-05,
"loss": 0.7655,
"step": 280
},
{
"epoch": 0.3224788409123512,
"grad_norm": 69.0,
"learning_rate": 1.2844036697247708e-05,
"loss": 0.7763,
"step": 281
},
{
"epoch": 0.3236264524458471,
"grad_norm": 98.0,
"learning_rate": 1.2889908256880734e-05,
"loss": 0.8355,
"step": 282
},
{
"epoch": 0.324774063979343,
"grad_norm": 65.0,
"learning_rate": 1.2935779816513762e-05,
"loss": 0.7071,
"step": 283
},
{
"epoch": 0.3259216755128389,
"grad_norm": 25.75,
"learning_rate": 1.298165137614679e-05,
"loss": 0.8358,
"step": 284
},
{
"epoch": 0.3270692870463348,
"grad_norm": 48.25,
"learning_rate": 1.3027522935779817e-05,
"loss": 0.7069,
"step": 285
},
{
"epoch": 0.3282168985798307,
"grad_norm": 27.75,
"learning_rate": 1.3073394495412845e-05,
"loss": 0.601,
"step": 286
},
{
"epoch": 0.3293645101133266,
"grad_norm": 44.25,
"learning_rate": 1.3119266055045871e-05,
"loss": 0.6844,
"step": 287
},
{
"epoch": 0.33051212164682253,
"grad_norm": 73.0,
"learning_rate": 1.31651376146789e-05,
"loss": 1.5458,
"step": 288
},
{
"epoch": 0.33165973318031844,
"grad_norm": 36.0,
"learning_rate": 1.3211009174311929e-05,
"loss": 0.8631,
"step": 289
},
{
"epoch": 0.3328073447138144,
"grad_norm": 60.25,
"learning_rate": 1.3256880733944954e-05,
"loss": 0.7894,
"step": 290
},
{
"epoch": 0.3339549562473103,
"grad_norm": 46.75,
"learning_rate": 1.3302752293577984e-05,
"loss": 0.7715,
"step": 291
},
{
"epoch": 0.3351025677808062,
"grad_norm": 17.5,
"learning_rate": 1.3348623853211012e-05,
"loss": 0.5756,
"step": 292
},
{
"epoch": 0.3362501793143021,
"grad_norm": 32.0,
"learning_rate": 1.3394495412844038e-05,
"loss": 0.7155,
"step": 293
},
{
"epoch": 0.33739779084779803,
"grad_norm": 96.5,
"learning_rate": 1.3440366972477066e-05,
"loss": 0.6518,
"step": 294
},
{
"epoch": 0.33854540238129394,
"grad_norm": 37.75,
"learning_rate": 1.3486238532110092e-05,
"loss": 0.6962,
"step": 295
},
{
"epoch": 0.33969301391478984,
"grad_norm": 57.5,
"learning_rate": 1.353211009174312e-05,
"loss": 0.5687,
"step": 296
},
{
"epoch": 0.34084062544828575,
"grad_norm": 20.125,
"learning_rate": 1.3577981651376149e-05,
"loss": 0.7647,
"step": 297
},
{
"epoch": 0.34198823698178166,
"grad_norm": 22.75,
"learning_rate": 1.3623853211009175e-05,
"loss": 0.7313,
"step": 298
},
{
"epoch": 0.34313584851527756,
"grad_norm": 71.0,
"learning_rate": 1.3669724770642203e-05,
"loss": 0.8702,
"step": 299
},
{
"epoch": 0.34428346004877347,
"grad_norm": 70.0,
"learning_rate": 1.3715596330275231e-05,
"loss": 0.7895,
"step": 300
},
{
"epoch": 0.34428346004877347,
"eval_accuracy": 0.22,
"eval_loss": 0.6987403631210327,
"eval_runtime": 49.3136,
"eval_samples_per_second": 2.028,
"eval_steps_per_second": 2.028,
"step": 300
},
{
"epoch": 0.3454310715822694,
"grad_norm": 32.0,
"learning_rate": 1.3761467889908258e-05,
"loss": 0.6857,
"step": 301
},
{
"epoch": 0.34657868311576534,
"grad_norm": 58.25,
"learning_rate": 1.3807339449541286e-05,
"loss": 0.6662,
"step": 302
},
{
"epoch": 0.34772629464926125,
"grad_norm": 26.875,
"learning_rate": 1.3853211009174312e-05,
"loss": 0.5594,
"step": 303
},
{
"epoch": 0.34887390618275715,
"grad_norm": 36.5,
"learning_rate": 1.389908256880734e-05,
"loss": 0.6889,
"step": 304
},
{
"epoch": 0.35002151771625306,
"grad_norm": 49.0,
"learning_rate": 1.3944954128440368e-05,
"loss": 0.6969,
"step": 305
},
{
"epoch": 0.35116912924974897,
"grad_norm": 173.0,
"learning_rate": 1.3990825688073395e-05,
"loss": 0.6462,
"step": 306
},
{
"epoch": 0.3523167407832449,
"grad_norm": 60.0,
"learning_rate": 1.4036697247706423e-05,
"loss": 0.539,
"step": 307
},
{
"epoch": 0.3534643523167408,
"grad_norm": 27.125,
"learning_rate": 1.4082568807339452e-05,
"loss": 0.855,
"step": 308
},
{
"epoch": 0.3546119638502367,
"grad_norm": 61.5,
"learning_rate": 1.4128440366972477e-05,
"loss": 0.7295,
"step": 309
},
{
"epoch": 0.3557595753837326,
"grad_norm": 56.25,
"learning_rate": 1.4174311926605507e-05,
"loss": 0.8013,
"step": 310
},
{
"epoch": 0.3569071869172285,
"grad_norm": 19.375,
"learning_rate": 1.4220183486238533e-05,
"loss": 0.6061,
"step": 311
},
{
"epoch": 0.3580547984507244,
"grad_norm": 22.75,
"learning_rate": 1.4266055045871561e-05,
"loss": 0.2982,
"step": 312
},
{
"epoch": 0.3592024099842203,
"grad_norm": 102.0,
"learning_rate": 1.431192660550459e-05,
"loss": 1.1087,
"step": 313
},
{
"epoch": 0.3603500215177163,
"grad_norm": 42.25,
"learning_rate": 1.4357798165137616e-05,
"loss": 0.8645,
"step": 314
},
{
"epoch": 0.3614976330512122,
"grad_norm": 112.0,
"learning_rate": 1.4403669724770644e-05,
"loss": 1.2745,
"step": 315
},
{
"epoch": 0.3626452445847081,
"grad_norm": 21.125,
"learning_rate": 1.4449541284403672e-05,
"loss": 0.3714,
"step": 316
},
{
"epoch": 0.363792856118204,
"grad_norm": 87.0,
"learning_rate": 1.4495412844036698e-05,
"loss": 0.9653,
"step": 317
},
{
"epoch": 0.3649404676516999,
"grad_norm": 82.0,
"learning_rate": 1.4541284403669726e-05,
"loss": 0.982,
"step": 318
},
{
"epoch": 0.3660880791851958,
"grad_norm": 27.375,
"learning_rate": 1.4587155963302753e-05,
"loss": 0.4173,
"step": 319
},
{
"epoch": 0.3672356907186917,
"grad_norm": 74.0,
"learning_rate": 1.463302752293578e-05,
"loss": 0.9199,
"step": 320
},
{
"epoch": 0.3683833022521876,
"grad_norm": 57.0,
"learning_rate": 1.4678899082568809e-05,
"loss": 0.6555,
"step": 321
},
{
"epoch": 0.36953091378568353,
"grad_norm": 40.25,
"learning_rate": 1.4724770642201835e-05,
"loss": 0.5512,
"step": 322
},
{
"epoch": 0.37067852531917944,
"grad_norm": 51.0,
"learning_rate": 1.4770642201834863e-05,
"loss": 0.8541,
"step": 323
},
{
"epoch": 0.37182613685267535,
"grad_norm": 112.0,
"learning_rate": 1.4816513761467891e-05,
"loss": 1.031,
"step": 324
},
{
"epoch": 0.37297374838617126,
"grad_norm": 17.625,
"learning_rate": 1.4862385321100918e-05,
"loss": 0.448,
"step": 325
},
{
"epoch": 0.3741213599196672,
"grad_norm": 109.0,
"learning_rate": 1.4908256880733946e-05,
"loss": 1.0731,
"step": 326
},
{
"epoch": 0.3752689714531631,
"grad_norm": 76.0,
"learning_rate": 1.4954128440366972e-05,
"loss": 0.9293,
"step": 327
},
{
"epoch": 0.37641658298665903,
"grad_norm": 142.0,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.2647,
"step": 328
},
{
"epoch": 0.37756419452015494,
"grad_norm": 17.0,
"learning_rate": 1.504587155963303e-05,
"loss": 0.5858,
"step": 329
},
{
"epoch": 0.37871180605365085,
"grad_norm": 63.5,
"learning_rate": 1.5091743119266057e-05,
"loss": 1.1073,
"step": 330
},
{
"epoch": 0.37985941758714675,
"grad_norm": 23.125,
"learning_rate": 1.5137614678899085e-05,
"loss": 0.6616,
"step": 331
},
{
"epoch": 0.38100702912064266,
"grad_norm": 23.5,
"learning_rate": 1.5183486238532111e-05,
"loss": 0.81,
"step": 332
},
{
"epoch": 0.38215464065413857,
"grad_norm": 44.5,
"learning_rate": 1.5229357798165139e-05,
"loss": 0.7774,
"step": 333
},
{
"epoch": 0.3833022521876345,
"grad_norm": 53.0,
"learning_rate": 1.5275229357798167e-05,
"loss": 0.7527,
"step": 334
},
{
"epoch": 0.3844498637211304,
"grad_norm": 26.0,
"learning_rate": 1.5321100917431192e-05,
"loss": 0.5953,
"step": 335
},
{
"epoch": 0.3855974752546263,
"grad_norm": 81.5,
"learning_rate": 1.536697247706422e-05,
"loss": 1.1549,
"step": 336
},
{
"epoch": 0.3867450867881222,
"grad_norm": 40.25,
"learning_rate": 1.541284403669725e-05,
"loss": 0.6953,
"step": 337
},
{
"epoch": 0.38789269832161816,
"grad_norm": 59.5,
"learning_rate": 1.5458715596330276e-05,
"loss": 0.9157,
"step": 338
},
{
"epoch": 0.38904030985511406,
"grad_norm": 37.25,
"learning_rate": 1.5504587155963304e-05,
"loss": 0.6101,
"step": 339
},
{
"epoch": 0.39018792138860997,
"grad_norm": 47.75,
"learning_rate": 1.555045871559633e-05,
"loss": 0.6971,
"step": 340
},
{
"epoch": 0.3913355329221059,
"grad_norm": 35.5,
"learning_rate": 1.559633027522936e-05,
"loss": 0.6038,
"step": 341
},
{
"epoch": 0.3924831444556018,
"grad_norm": 258.0,
"learning_rate": 1.564220183486239e-05,
"loss": 0.7838,
"step": 342
},
{
"epoch": 0.3936307559890977,
"grad_norm": 19.625,
"learning_rate": 1.5688073394495413e-05,
"loss": 0.6458,
"step": 343
},
{
"epoch": 0.3947783675225936,
"grad_norm": 78.5,
"learning_rate": 1.573394495412844e-05,
"loss": 0.8405,
"step": 344
},
{
"epoch": 0.3959259790560895,
"grad_norm": 118.5,
"learning_rate": 1.577981651376147e-05,
"loss": 1.0364,
"step": 345
},
{
"epoch": 0.3970735905895854,
"grad_norm": 30.0,
"learning_rate": 1.5825688073394497e-05,
"loss": 0.5703,
"step": 346
},
{
"epoch": 0.3982212021230813,
"grad_norm": 60.75,
"learning_rate": 1.5871559633027525e-05,
"loss": 0.8595,
"step": 347
},
{
"epoch": 0.3993688136565772,
"grad_norm": 78.5,
"learning_rate": 1.591743119266055e-05,
"loss": 0.8161,
"step": 348
},
{
"epoch": 0.40051642519007313,
"grad_norm": 33.75,
"learning_rate": 1.5963302752293578e-05,
"loss": 0.7062,
"step": 349
},
{
"epoch": 0.4016640367235691,
"grad_norm": 20.75,
"learning_rate": 1.6009174311926606e-05,
"loss": 0.825,
"step": 350
},
{
"epoch": 0.402811648257065,
"grad_norm": 20.375,
"learning_rate": 1.6055045871559634e-05,
"loss": 0.5635,
"step": 351
},
{
"epoch": 0.4039592597905609,
"grad_norm": 26.0,
"learning_rate": 1.6100917431192662e-05,
"loss": 0.7392,
"step": 352
},
{
"epoch": 0.4051068713240568,
"grad_norm": 39.75,
"learning_rate": 1.614678899082569e-05,
"loss": 0.6261,
"step": 353
},
{
"epoch": 0.4062544828575527,
"grad_norm": 40.0,
"learning_rate": 1.6192660550458715e-05,
"loss": 0.6046,
"step": 354
},
{
"epoch": 0.40740209439104863,
"grad_norm": 106.0,
"learning_rate": 1.6238532110091743e-05,
"loss": 0.9682,
"step": 355
},
{
"epoch": 0.40854970592454454,
"grad_norm": 51.25,
"learning_rate": 1.628440366972477e-05,
"loss": 0.7811,
"step": 356
},
{
"epoch": 0.40969731745804044,
"grad_norm": 38.0,
"learning_rate": 1.63302752293578e-05,
"loss": 0.8129,
"step": 357
},
{
"epoch": 0.41084492899153635,
"grad_norm": 25.375,
"learning_rate": 1.6376146788990827e-05,
"loss": 0.5273,
"step": 358
},
{
"epoch": 0.41199254052503226,
"grad_norm": 34.0,
"learning_rate": 1.6422018348623852e-05,
"loss": 0.5413,
"step": 359
},
{
"epoch": 0.41314015205852817,
"grad_norm": 19.25,
"learning_rate": 1.6467889908256884e-05,
"loss": 0.5278,
"step": 360
},
{
"epoch": 0.4142877635920241,
"grad_norm": 24.0,
"learning_rate": 1.6513761467889912e-05,
"loss": 0.6991,
"step": 361
},
{
"epoch": 0.41543537512552003,
"grad_norm": 180.0,
"learning_rate": 1.6559633027522936e-05,
"loss": 0.7462,
"step": 362
},
{
"epoch": 0.41658298665901594,
"grad_norm": 27.125,
"learning_rate": 1.6605504587155964e-05,
"loss": 0.7057,
"step": 363
},
{
"epoch": 0.41773059819251185,
"grad_norm": 75.0,
"learning_rate": 1.6651376146788993e-05,
"loss": 0.8487,
"step": 364
},
{
"epoch": 0.41887820972600776,
"grad_norm": 52.5,
"learning_rate": 1.669724770642202e-05,
"loss": 0.83,
"step": 365
},
{
"epoch": 0.42002582125950366,
"grad_norm": 18.5,
"learning_rate": 1.674311926605505e-05,
"loss": 0.4915,
"step": 366
},
{
"epoch": 0.42117343279299957,
"grad_norm": 76.0,
"learning_rate": 1.6788990825688073e-05,
"loss": 0.6684,
"step": 367
},
{
"epoch": 0.4223210443264955,
"grad_norm": 21.625,
"learning_rate": 1.68348623853211e-05,
"loss": 0.5262,
"step": 368
},
{
"epoch": 0.4234686558599914,
"grad_norm": 13.375,
"learning_rate": 1.688073394495413e-05,
"loss": 0.5778,
"step": 369
},
{
"epoch": 0.4246162673934873,
"grad_norm": 20.375,
"learning_rate": 1.6926605504587158e-05,
"loss": 0.5191,
"step": 370
},
{
"epoch": 0.4257638789269832,
"grad_norm": 91.5,
"learning_rate": 1.6972477064220186e-05,
"loss": 1.0079,
"step": 371
},
{
"epoch": 0.4269114904604791,
"grad_norm": 55.25,
"learning_rate": 1.701834862385321e-05,
"loss": 0.6378,
"step": 372
},
{
"epoch": 0.42805910199397507,
"grad_norm": 39.5,
"learning_rate": 1.706422018348624e-05,
"loss": 0.836,
"step": 373
},
{
"epoch": 0.429206713527471,
"grad_norm": 42.75,
"learning_rate": 1.7110091743119267e-05,
"loss": 0.5683,
"step": 374
},
{
"epoch": 0.4303543250609669,
"grad_norm": 60.25,
"learning_rate": 1.7155963302752295e-05,
"loss": 0.4543,
"step": 375
},
{
"epoch": 0.4315019365944628,
"grad_norm": 21.375,
"learning_rate": 1.7201834862385323e-05,
"loss": 0.5242,
"step": 376
},
{
"epoch": 0.4326495481279587,
"grad_norm": 17.625,
"learning_rate": 1.724770642201835e-05,
"loss": 0.6393,
"step": 377
},
{
"epoch": 0.4337971596614546,
"grad_norm": 21.875,
"learning_rate": 1.7293577981651376e-05,
"loss": 0.5476,
"step": 378
},
{
"epoch": 0.4349447711949505,
"grad_norm": 56.25,
"learning_rate": 1.7339449541284407e-05,
"loss": 0.7973,
"step": 379
},
{
"epoch": 0.4360923827284464,
"grad_norm": 80.0,
"learning_rate": 1.738532110091743e-05,
"loss": 0.8487,
"step": 380
},
{
"epoch": 0.4372399942619423,
"grad_norm": 46.5,
"learning_rate": 1.743119266055046e-05,
"loss": 0.8605,
"step": 381
},
{
"epoch": 0.43838760579543823,
"grad_norm": 65.0,
"learning_rate": 1.7477064220183488e-05,
"loss": 0.8858,
"step": 382
},
{
"epoch": 0.43953521732893414,
"grad_norm": 87.0,
"learning_rate": 1.7522935779816516e-05,
"loss": 0.7342,
"step": 383
},
{
"epoch": 0.44068282886243004,
"grad_norm": 108.5,
"learning_rate": 1.7568807339449544e-05,
"loss": 0.8372,
"step": 384
},
{
"epoch": 0.441830440395926,
"grad_norm": 38.5,
"learning_rate": 1.7614678899082572e-05,
"loss": 1.0963,
"step": 385
},
{
"epoch": 0.4429780519294219,
"grad_norm": 21.875,
"learning_rate": 1.7660550458715597e-05,
"loss": 0.737,
"step": 386
},
{
"epoch": 0.4441256634629178,
"grad_norm": 34.25,
"learning_rate": 1.7706422018348625e-05,
"loss": 0.7902,
"step": 387
},
{
"epoch": 0.4452732749964137,
"grad_norm": 116.0,
"learning_rate": 1.7752293577981653e-05,
"loss": 0.875,
"step": 388
},
{
"epoch": 0.44642088652990963,
"grad_norm": 66.5,
"learning_rate": 1.779816513761468e-05,
"loss": 0.9535,
"step": 389
},
{
"epoch": 0.44756849806340554,
"grad_norm": 46.25,
"learning_rate": 1.784403669724771e-05,
"loss": 0.7879,
"step": 390
},
{
"epoch": 0.44871610959690145,
"grad_norm": 48.75,
"learning_rate": 1.7889908256880734e-05,
"loss": 0.6081,
"step": 391
},
{
"epoch": 0.44986372113039735,
"grad_norm": 32.5,
"learning_rate": 1.7935779816513762e-05,
"loss": 0.6908,
"step": 392
},
{
"epoch": 0.45101133266389326,
"grad_norm": 34.0,
"learning_rate": 1.798165137614679e-05,
"loss": 0.6664,
"step": 393
},
{
"epoch": 0.45215894419738917,
"grad_norm": 74.5,
"learning_rate": 1.8027522935779818e-05,
"loss": 0.6012,
"step": 394
},
{
"epoch": 0.4533065557308851,
"grad_norm": 33.25,
"learning_rate": 1.8073394495412846e-05,
"loss": 0.6278,
"step": 395
},
{
"epoch": 0.454454167264381,
"grad_norm": 19.25,
"learning_rate": 1.811926605504587e-05,
"loss": 0.6279,
"step": 396
},
{
"epoch": 0.45560177879787694,
"grad_norm": 33.5,
"learning_rate": 1.81651376146789e-05,
"loss": 0.689,
"step": 397
},
{
"epoch": 0.45674939033137285,
"grad_norm": 34.25,
"learning_rate": 1.821100917431193e-05,
"loss": 0.4764,
"step": 398
},
{
"epoch": 0.45789700186486876,
"grad_norm": 144.0,
"learning_rate": 1.8256880733944955e-05,
"loss": 1.3598,
"step": 399
},
{
"epoch": 0.45904461339836466,
"grad_norm": 105.5,
"learning_rate": 1.8302752293577983e-05,
"loss": 0.9441,
"step": 400
},
{
"epoch": 0.45904461339836466,
"eval_accuracy": 0.37,
"eval_loss": 0.7958357334136963,
"eval_runtime": 49.9294,
"eval_samples_per_second": 2.003,
"eval_steps_per_second": 2.003,
"step": 400
},
{
"epoch": 0.46019222493186057,
"grad_norm": 125.5,
"learning_rate": 1.834862385321101e-05,
"loss": 1.1444,
"step": 401
},
{
"epoch": 0.4613398364653565,
"grad_norm": 67.0,
"learning_rate": 1.839449541284404e-05,
"loss": 0.694,
"step": 402
},
{
"epoch": 0.4624874479988524,
"grad_norm": 48.75,
"learning_rate": 1.8440366972477067e-05,
"loss": 0.7125,
"step": 403
},
{
"epoch": 0.4636350595323483,
"grad_norm": 60.5,
"learning_rate": 1.8486238532110092e-05,
"loss": 0.6703,
"step": 404
},
{
"epoch": 0.4647826710658442,
"grad_norm": 51.0,
"learning_rate": 1.853211009174312e-05,
"loss": 0.6631,
"step": 405
},
{
"epoch": 0.4659302825993401,
"grad_norm": 51.75,
"learning_rate": 1.8577981651376148e-05,
"loss": 0.7814,
"step": 406
},
{
"epoch": 0.467077894132836,
"grad_norm": 22.875,
"learning_rate": 1.8623853211009176e-05,
"loss": 0.5642,
"step": 407
},
{
"epoch": 0.4682255056663319,
"grad_norm": 105.0,
"learning_rate": 1.8669724770642204e-05,
"loss": 0.884,
"step": 408
},
{
"epoch": 0.4693731171998279,
"grad_norm": 48.5,
"learning_rate": 1.8715596330275232e-05,
"loss": 0.5543,
"step": 409
},
{
"epoch": 0.4705207287333238,
"grad_norm": 26.875,
"learning_rate": 1.8761467889908257e-05,
"loss": 0.6461,
"step": 410
},
{
"epoch": 0.4716683402668197,
"grad_norm": 23.125,
"learning_rate": 1.8807339449541285e-05,
"loss": 0.5786,
"step": 411
},
{
"epoch": 0.4728159518003156,
"grad_norm": 37.75,
"learning_rate": 1.8853211009174313e-05,
"loss": 0.6921,
"step": 412
},
{
"epoch": 0.4739635633338115,
"grad_norm": 23.625,
"learning_rate": 1.889908256880734e-05,
"loss": 0.4189,
"step": 413
},
{
"epoch": 0.4751111748673074,
"grad_norm": 108.5,
"learning_rate": 1.894495412844037e-05,
"loss": 1.0126,
"step": 414
},
{
"epoch": 0.4762587864008033,
"grad_norm": 138.0,
"learning_rate": 1.8990825688073394e-05,
"loss": 1.2399,
"step": 415
},
{
"epoch": 0.47740639793429923,
"grad_norm": 31.375,
"learning_rate": 1.9036697247706422e-05,
"loss": 0.4347,
"step": 416
},
{
"epoch": 0.47855400946779514,
"grad_norm": 120.5,
"learning_rate": 1.9082568807339454e-05,
"loss": 1.1874,
"step": 417
},
{
"epoch": 0.47970162100129105,
"grad_norm": 25.5,
"learning_rate": 1.912844036697248e-05,
"loss": 0.6172,
"step": 418
},
{
"epoch": 0.48084923253478695,
"grad_norm": 29.0,
"learning_rate": 1.9174311926605506e-05,
"loss": 0.7072,
"step": 419
},
{
"epoch": 0.48199684406828286,
"grad_norm": 40.75,
"learning_rate": 1.9220183486238534e-05,
"loss": 0.8408,
"step": 420
},
{
"epoch": 0.4831444556017788,
"grad_norm": 17.5,
"learning_rate": 1.9266055045871563e-05,
"loss": 0.6384,
"step": 421
},
{
"epoch": 0.48429206713527473,
"grad_norm": 26.375,
"learning_rate": 1.931192660550459e-05,
"loss": 0.7132,
"step": 422
},
{
"epoch": 0.48543967866877064,
"grad_norm": 41.0,
"learning_rate": 1.9357798165137615e-05,
"loss": 0.6823,
"step": 423
},
{
"epoch": 0.48658729020226654,
"grad_norm": 36.0,
"learning_rate": 1.9403669724770643e-05,
"loss": 0.5629,
"step": 424
},
{
"epoch": 0.48773490173576245,
"grad_norm": 93.5,
"learning_rate": 1.944954128440367e-05,
"loss": 1.028,
"step": 425
},
{
"epoch": 0.48888251326925836,
"grad_norm": 70.5,
"learning_rate": 1.94954128440367e-05,
"loss": 0.7085,
"step": 426
},
{
"epoch": 0.49003012480275426,
"grad_norm": 17.875,
"learning_rate": 1.9541284403669728e-05,
"loss": 0.5299,
"step": 427
},
{
"epoch": 0.49117773633625017,
"grad_norm": 58.75,
"learning_rate": 1.9587155963302752e-05,
"loss": 0.9028,
"step": 428
},
{
"epoch": 0.4923253478697461,
"grad_norm": 45.25,
"learning_rate": 1.963302752293578e-05,
"loss": 0.8021,
"step": 429
},
{
"epoch": 0.493472959403242,
"grad_norm": 69.0,
"learning_rate": 1.967889908256881e-05,
"loss": 0.696,
"step": 430
},
{
"epoch": 0.4946205709367379,
"grad_norm": 44.0,
"learning_rate": 1.9724770642201837e-05,
"loss": 0.5913,
"step": 431
},
{
"epoch": 0.49576818247023385,
"grad_norm": 71.5,
"learning_rate": 1.9770642201834865e-05,
"loss": 0.8661,
"step": 432
},
{
"epoch": 0.49691579400372976,
"grad_norm": 80.5,
"learning_rate": 1.9816513761467893e-05,
"loss": 1.109,
"step": 433
},
{
"epoch": 0.49806340553722567,
"grad_norm": 86.0,
"learning_rate": 1.9862385321100917e-05,
"loss": 1.0316,
"step": 434
},
{
"epoch": 0.4992110170707216,
"grad_norm": 30.375,
"learning_rate": 1.9908256880733945e-05,
"loss": 0.7336,
"step": 435
},
{
"epoch": 0.5003586286042174,
"grad_norm": 42.75,
"learning_rate": 1.9954128440366974e-05,
"loss": 0.7081,
"step": 436
},
{
"epoch": 0.5015062401377134,
"grad_norm": 20.75,
"learning_rate": 2e-05,
"loss": 0.5407,
"step": 437
},
{
"epoch": 0.5026538516712094,
"grad_norm": 117.0,
"learning_rate": 1.999490316004078e-05,
"loss": 1.1049,
"step": 438
},
{
"epoch": 0.5038014632047052,
"grad_norm": 126.5,
"learning_rate": 1.998980632008155e-05,
"loss": 1.1828,
"step": 439
},
{
"epoch": 0.5049490747382012,
"grad_norm": 120.5,
"learning_rate": 1.9984709480122327e-05,
"loss": 1.3274,
"step": 440
},
{
"epoch": 0.506096686271697,
"grad_norm": 80.0,
"learning_rate": 1.99796126401631e-05,
"loss": 0.9327,
"step": 441
},
{
"epoch": 0.507244297805193,
"grad_norm": 105.0,
"learning_rate": 1.9974515800203875e-05,
"loss": 1.1003,
"step": 442
},
{
"epoch": 0.5083919093386888,
"grad_norm": 106.5,
"learning_rate": 1.9969418960244652e-05,
"loss": 1.0261,
"step": 443
},
{
"epoch": 0.5095395208721848,
"grad_norm": 36.5,
"learning_rate": 1.9964322120285426e-05,
"loss": 0.695,
"step": 444
},
{
"epoch": 0.5106871324056806,
"grad_norm": 15.8125,
"learning_rate": 1.99592252803262e-05,
"loss": 0.6983,
"step": 445
},
{
"epoch": 0.5118347439391766,
"grad_norm": 53.5,
"learning_rate": 1.9954128440366974e-05,
"loss": 0.7186,
"step": 446
},
{
"epoch": 0.5129823554726725,
"grad_norm": 12.25,
"learning_rate": 1.9949031600407747e-05,
"loss": 0.6272,
"step": 447
},
{
"epoch": 0.5141299670061684,
"grad_norm": 16.125,
"learning_rate": 1.9943934760448525e-05,
"loss": 0.5849,
"step": 448
},
{
"epoch": 0.5152775785396643,
"grad_norm": 36.25,
"learning_rate": 1.99388379204893e-05,
"loss": 0.5905,
"step": 449
},
{
"epoch": 0.5164251900731602,
"grad_norm": 24.25,
"learning_rate": 1.9933741080530073e-05,
"loss": 0.4751,
"step": 450
},
{
"epoch": 0.5175728016066562,
"grad_norm": 15.9375,
"learning_rate": 1.9928644240570846e-05,
"loss": 0.4372,
"step": 451
},
{
"epoch": 0.518720413140152,
"grad_norm": 18.375,
"learning_rate": 1.9923547400611624e-05,
"loss": 0.6552,
"step": 452
},
{
"epoch": 0.519868024673648,
"grad_norm": 13.6875,
"learning_rate": 1.9918450560652398e-05,
"loss": 0.6515,
"step": 453
},
{
"epoch": 0.5210156362071439,
"grad_norm": 13.6875,
"learning_rate": 1.991335372069317e-05,
"loss": 0.5219,
"step": 454
},
{
"epoch": 0.5221632477406398,
"grad_norm": 70.0,
"learning_rate": 1.9908256880733945e-05,
"loss": 0.694,
"step": 455
},
{
"epoch": 0.5233108592741357,
"grad_norm": 47.75,
"learning_rate": 1.990316004077472e-05,
"loss": 1.0051,
"step": 456
},
{
"epoch": 0.5244584708076316,
"grad_norm": 98.5,
"learning_rate": 1.9898063200815497e-05,
"loss": 0.8809,
"step": 457
},
{
"epoch": 0.5256060823411275,
"grad_norm": 22.875,
"learning_rate": 1.989296636085627e-05,
"loss": 0.6882,
"step": 458
},
{
"epoch": 0.5267536938746235,
"grad_norm": 103.0,
"learning_rate": 1.9887869520897044e-05,
"loss": 0.8227,
"step": 459
},
{
"epoch": 0.5279013054081193,
"grad_norm": 41.5,
"learning_rate": 1.9882772680937822e-05,
"loss": 0.5851,
"step": 460
},
{
"epoch": 0.5290489169416153,
"grad_norm": 16.125,
"learning_rate": 1.9877675840978596e-05,
"loss": 0.6286,
"step": 461
},
{
"epoch": 0.5301965284751112,
"grad_norm": 40.0,
"learning_rate": 1.987257900101937e-05,
"loss": 0.3909,
"step": 462
},
{
"epoch": 0.5313441400086071,
"grad_norm": 180.0,
"learning_rate": 1.9867482161060147e-05,
"loss": 1.4089,
"step": 463
},
{
"epoch": 0.532491751542103,
"grad_norm": 67.5,
"learning_rate": 1.9862385321100917e-05,
"loss": 0.7342,
"step": 464
},
{
"epoch": 0.5336393630755989,
"grad_norm": 146.0,
"learning_rate": 1.9857288481141695e-05,
"loss": 1.1024,
"step": 465
},
{
"epoch": 0.5347869746090949,
"grad_norm": 112.0,
"learning_rate": 1.985219164118247e-05,
"loss": 0.7631,
"step": 466
},
{
"epoch": 0.5359345861425907,
"grad_norm": 51.25,
"learning_rate": 1.9847094801223243e-05,
"loss": 0.8592,
"step": 467
},
{
"epoch": 0.5370821976760867,
"grad_norm": 53.5,
"learning_rate": 1.984199796126402e-05,
"loss": 0.772,
"step": 468
},
{
"epoch": 0.5382298092095825,
"grad_norm": 168.0,
"learning_rate": 1.9836901121304794e-05,
"loss": 1.3238,
"step": 469
},
{
"epoch": 0.5393774207430785,
"grad_norm": 56.25,
"learning_rate": 1.9831804281345568e-05,
"loss": 0.686,
"step": 470
},
{
"epoch": 0.5405250322765743,
"grad_norm": 140.0,
"learning_rate": 1.982670744138634e-05,
"loss": 1.3487,
"step": 471
},
{
"epoch": 0.5416726438100703,
"grad_norm": 26.25,
"learning_rate": 1.9821610601427115e-05,
"loss": 0.654,
"step": 472
},
{
"epoch": 0.5428202553435661,
"grad_norm": 96.0,
"learning_rate": 1.9816513761467893e-05,
"loss": 0.8932,
"step": 473
},
{
"epoch": 0.5439678668770621,
"grad_norm": 118.5,
"learning_rate": 1.9811416921508667e-05,
"loss": 0.9886,
"step": 474
},
{
"epoch": 0.5451154784105581,
"grad_norm": 34.75,
"learning_rate": 1.980632008154944e-05,
"loss": 0.5023,
"step": 475
},
{
"epoch": 0.5462630899440539,
"grad_norm": 28.125,
"learning_rate": 1.9801223241590214e-05,
"loss": 0.4678,
"step": 476
},
{
"epoch": 0.5474107014775499,
"grad_norm": 27.875,
"learning_rate": 1.9796126401630992e-05,
"loss": 0.5802,
"step": 477
},
{
"epoch": 0.5485583130110457,
"grad_norm": 9.5625,
"learning_rate": 1.9791029561671766e-05,
"loss": 0.3969,
"step": 478
},
{
"epoch": 0.5497059245445417,
"grad_norm": 30.875,
"learning_rate": 1.978593272171254e-05,
"loss": 0.4119,
"step": 479
},
{
"epoch": 0.5508535360780376,
"grad_norm": 17.25,
"learning_rate": 1.9780835881753317e-05,
"loss": 0.5645,
"step": 480
},
{
"epoch": 0.5520011476115335,
"grad_norm": 47.75,
"learning_rate": 1.9775739041794087e-05,
"loss": 0.6168,
"step": 481
},
{
"epoch": 0.5531487591450294,
"grad_norm": 14.1875,
"learning_rate": 1.9770642201834865e-05,
"loss": 0.5325,
"step": 482
},
{
"epoch": 0.5542963706785253,
"grad_norm": 41.25,
"learning_rate": 1.976554536187564e-05,
"loss": 0.5013,
"step": 483
},
{
"epoch": 0.5554439822120212,
"grad_norm": 18.75,
"learning_rate": 1.9760448521916412e-05,
"loss": 0.4441,
"step": 484
},
{
"epoch": 0.5565915937455171,
"grad_norm": 147.0,
"learning_rate": 1.975535168195719e-05,
"loss": 1.585,
"step": 485
},
{
"epoch": 0.5577392052790131,
"grad_norm": 64.0,
"learning_rate": 1.9750254841997964e-05,
"loss": 0.958,
"step": 486
},
{
"epoch": 0.558886816812509,
"grad_norm": 135.0,
"learning_rate": 1.9745158002038738e-05,
"loss": 1.4838,
"step": 487
},
{
"epoch": 0.5600344283460049,
"grad_norm": 141.0,
"learning_rate": 1.974006116207951e-05,
"loss": 1.6651,
"step": 488
},
{
"epoch": 0.5611820398795008,
"grad_norm": 108.0,
"learning_rate": 1.9734964322120285e-05,
"loss": 0.9729,
"step": 489
},
{
"epoch": 0.5623296514129967,
"grad_norm": 35.25,
"learning_rate": 1.9729867482161063e-05,
"loss": 0.5966,
"step": 490
},
{
"epoch": 0.5634772629464926,
"grad_norm": 34.5,
"learning_rate": 1.9724770642201837e-05,
"loss": 0.7337,
"step": 491
},
{
"epoch": 0.5646248744799885,
"grad_norm": 19.875,
"learning_rate": 1.971967380224261e-05,
"loss": 0.4022,
"step": 492
},
{
"epoch": 0.5657724860134844,
"grad_norm": 29.375,
"learning_rate": 1.9714576962283384e-05,
"loss": 0.618,
"step": 493
},
{
"epoch": 0.5669200975469804,
"grad_norm": 75.5,
"learning_rate": 1.970948012232416e-05,
"loss": 0.6627,
"step": 494
},
{
"epoch": 0.5680677090804762,
"grad_norm": 104.5,
"learning_rate": 1.9704383282364936e-05,
"loss": 0.9524,
"step": 495
},
{
"epoch": 0.5692153206139722,
"grad_norm": 91.0,
"learning_rate": 1.969928644240571e-05,
"loss": 0.7282,
"step": 496
},
{
"epoch": 0.570362932147468,
"grad_norm": 95.5,
"learning_rate": 1.9694189602446487e-05,
"loss": 0.9184,
"step": 497
},
{
"epoch": 0.571510543680964,
"grad_norm": 23.625,
"learning_rate": 1.9689092762487257e-05,
"loss": 0.6252,
"step": 498
},
{
"epoch": 0.57265815521446,
"grad_norm": 55.25,
"learning_rate": 1.9683995922528035e-05,
"loss": 0.77,
"step": 499
},
{
"epoch": 0.5738057667479558,
"grad_norm": 49.75,
"learning_rate": 1.967889908256881e-05,
"loss": 0.5024,
"step": 500
},
{
"epoch": 0.5738057667479558,
"eval_accuracy": 0.56,
"eval_loss": 0.5818310379981995,
"eval_runtime": 49.317,
"eval_samples_per_second": 2.028,
"eval_steps_per_second": 2.028,
"step": 500
},
{
"epoch": 0.5749533782814518,
"grad_norm": 72.0,
"learning_rate": 1.9673802242609582e-05,
"loss": 0.5565,
"step": 501
},
{
"epoch": 0.5761009898149476,
"grad_norm": 17.0,
"learning_rate": 1.966870540265036e-05,
"loss": 0.5465,
"step": 502
},
{
"epoch": 0.5772486013484436,
"grad_norm": 16.5,
"learning_rate": 1.9663608562691134e-05,
"loss": 0.7208,
"step": 503
},
{
"epoch": 0.5783962128819394,
"grad_norm": 13.75,
"learning_rate": 1.9658511722731907e-05,
"loss": 0.5784,
"step": 504
},
{
"epoch": 0.5795438244154354,
"grad_norm": 17.875,
"learning_rate": 1.9653414882772685e-05,
"loss": 0.7433,
"step": 505
},
{
"epoch": 0.5806914359489312,
"grad_norm": 55.5,
"learning_rate": 1.9648318042813455e-05,
"loss": 0.5593,
"step": 506
},
{
"epoch": 0.5818390474824272,
"grad_norm": 26.0,
"learning_rate": 1.9643221202854233e-05,
"loss": 0.4981,
"step": 507
},
{
"epoch": 0.5829866590159231,
"grad_norm": 45.5,
"learning_rate": 1.9638124362895006e-05,
"loss": 0.6998,
"step": 508
},
{
"epoch": 0.584134270549419,
"grad_norm": 64.5,
"learning_rate": 1.963302752293578e-05,
"loss": 0.331,
"step": 509
},
{
"epoch": 0.585281882082915,
"grad_norm": 15.75,
"learning_rate": 1.9627930682976558e-05,
"loss": 0.5757,
"step": 510
},
{
"epoch": 0.5864294936164108,
"grad_norm": 78.0,
"learning_rate": 1.962283384301733e-05,
"loss": 0.5458,
"step": 511
},
{
"epoch": 0.5875771051499068,
"grad_norm": 12.6875,
"learning_rate": 1.9617737003058106e-05,
"loss": 0.4577,
"step": 512
},
{
"epoch": 0.5887247166834026,
"grad_norm": 94.5,
"learning_rate": 1.961264016309888e-05,
"loss": 1.0295,
"step": 513
},
{
"epoch": 0.5898723282168986,
"grad_norm": 62.75,
"learning_rate": 1.9607543323139657e-05,
"loss": 0.6586,
"step": 514
},
{
"epoch": 0.5910199397503945,
"grad_norm": 12.8125,
"learning_rate": 1.960244648318043e-05,
"loss": 0.4499,
"step": 515
},
{
"epoch": 0.5921675512838904,
"grad_norm": 41.25,
"learning_rate": 1.9597349643221205e-05,
"loss": 0.6115,
"step": 516
},
{
"epoch": 0.5933151628173863,
"grad_norm": 33.5,
"learning_rate": 1.959225280326198e-05,
"loss": 0.6823,
"step": 517
},
{
"epoch": 0.5944627743508822,
"grad_norm": 67.5,
"learning_rate": 1.9587155963302752e-05,
"loss": 0.7254,
"step": 518
},
{
"epoch": 0.5956103858843781,
"grad_norm": 21.75,
"learning_rate": 1.958205912334353e-05,
"loss": 0.6258,
"step": 519
},
{
"epoch": 0.596757997417874,
"grad_norm": 20.125,
"learning_rate": 1.9576962283384304e-05,
"loss": 0.782,
"step": 520
},
{
"epoch": 0.59790560895137,
"grad_norm": 55.0,
"learning_rate": 1.9571865443425077e-05,
"loss": 0.6427,
"step": 521
},
{
"epoch": 0.5990532204848659,
"grad_norm": 21.375,
"learning_rate": 1.9566768603465855e-05,
"loss": 0.5042,
"step": 522
},
{
"epoch": 0.6002008320183618,
"grad_norm": 84.0,
"learning_rate": 1.9561671763506625e-05,
"loss": 0.7413,
"step": 523
},
{
"epoch": 0.6013484435518577,
"grad_norm": 32.25,
"learning_rate": 1.9556574923547403e-05,
"loss": 0.4809,
"step": 524
},
{
"epoch": 0.6024960550853536,
"grad_norm": 29.25,
"learning_rate": 1.9551478083588176e-05,
"loss": 0.5245,
"step": 525
},
{
"epoch": 0.6036436666188495,
"grad_norm": 115.0,
"learning_rate": 1.954638124362895e-05,
"loss": 1.0439,
"step": 526
},
{
"epoch": 0.6047912781523455,
"grad_norm": 155.0,
"learning_rate": 1.9541284403669728e-05,
"loss": 1.6477,
"step": 527
},
{
"epoch": 0.6059388896858413,
"grad_norm": 39.25,
"learning_rate": 1.95361875637105e-05,
"loss": 0.555,
"step": 528
},
{
"epoch": 0.6070865012193373,
"grad_norm": 23.0,
"learning_rate": 1.9531090723751275e-05,
"loss": 0.478,
"step": 529
},
{
"epoch": 0.6082341127528331,
"grad_norm": 43.25,
"learning_rate": 1.9525993883792053e-05,
"loss": 0.6068,
"step": 530
},
{
"epoch": 0.6093817242863291,
"grad_norm": 44.5,
"learning_rate": 1.9520897043832823e-05,
"loss": 0.248,
"step": 531
},
{
"epoch": 0.6105293358198249,
"grad_norm": 20.75,
"learning_rate": 1.95158002038736e-05,
"loss": 0.4505,
"step": 532
},
{
"epoch": 0.6116769473533209,
"grad_norm": 41.5,
"learning_rate": 1.9510703363914374e-05,
"loss": 0.3902,
"step": 533
},
{
"epoch": 0.6128245588868169,
"grad_norm": 38.75,
"learning_rate": 1.950560652395515e-05,
"loss": 0.5029,
"step": 534
},
{
"epoch": 0.6139721704203127,
"grad_norm": 131.0,
"learning_rate": 1.9500509683995926e-05,
"loss": 1.2225,
"step": 535
},
{
"epoch": 0.6151197819538087,
"grad_norm": 85.5,
"learning_rate": 1.94954128440367e-05,
"loss": 0.8337,
"step": 536
},
{
"epoch": 0.6162673934873045,
"grad_norm": 43.25,
"learning_rate": 1.9490316004077473e-05,
"loss": 0.5878,
"step": 537
},
{
"epoch": 0.6174150050208005,
"grad_norm": 12.875,
"learning_rate": 1.9485219164118247e-05,
"loss": 0.4961,
"step": 538
},
{
"epoch": 0.6185626165542963,
"grad_norm": 77.0,
"learning_rate": 1.9480122324159025e-05,
"loss": 0.9027,
"step": 539
},
{
"epoch": 0.6197102280877923,
"grad_norm": 46.5,
"learning_rate": 1.94750254841998e-05,
"loss": 0.7113,
"step": 540
},
{
"epoch": 0.6208578396212882,
"grad_norm": 77.5,
"learning_rate": 1.9469928644240572e-05,
"loss": 0.7001,
"step": 541
},
{
"epoch": 0.6220054511547841,
"grad_norm": 68.5,
"learning_rate": 1.9464831804281346e-05,
"loss": 0.6916,
"step": 542
},
{
"epoch": 0.62315306268828,
"grad_norm": 77.0,
"learning_rate": 1.945973496432212e-05,
"loss": 0.7548,
"step": 543
},
{
"epoch": 0.6243006742217759,
"grad_norm": 85.0,
"learning_rate": 1.9454638124362898e-05,
"loss": 0.8164,
"step": 544
},
{
"epoch": 0.6254482857552719,
"grad_norm": 15.0625,
"learning_rate": 1.944954128440367e-05,
"loss": 0.603,
"step": 545
},
{
"epoch": 0.6265958972887677,
"grad_norm": 23.25,
"learning_rate": 1.9444444444444445e-05,
"loss": 0.4311,
"step": 546
},
{
"epoch": 0.6277435088222637,
"grad_norm": 10.1875,
"learning_rate": 1.9439347604485223e-05,
"loss": 0.3436,
"step": 547
},
{
"epoch": 0.6288911203557596,
"grad_norm": 75.0,
"learning_rate": 1.9434250764525993e-05,
"loss": 0.8814,
"step": 548
},
{
"epoch": 0.6300387318892555,
"grad_norm": 62.0,
"learning_rate": 1.942915392456677e-05,
"loss": 0.939,
"step": 549
},
{
"epoch": 0.6311863434227514,
"grad_norm": 52.0,
"learning_rate": 1.9424057084607544e-05,
"loss": 0.411,
"step": 550
},
{
"epoch": 0.6323339549562473,
"grad_norm": 127.5,
"learning_rate": 1.9418960244648318e-05,
"loss": 1.655,
"step": 551
},
{
"epoch": 0.6334815664897432,
"grad_norm": 96.0,
"learning_rate": 1.9413863404689096e-05,
"loss": 1.4065,
"step": 552
},
{
"epoch": 0.6346291780232391,
"grad_norm": 52.5,
"learning_rate": 1.940876656472987e-05,
"loss": 0.7391,
"step": 553
},
{
"epoch": 0.635776789556735,
"grad_norm": 78.0,
"learning_rate": 1.9403669724770643e-05,
"loss": 0.9576,
"step": 554
},
{
"epoch": 0.636924401090231,
"grad_norm": 91.0,
"learning_rate": 1.9398572884811417e-05,
"loss": 1.0132,
"step": 555
},
{
"epoch": 0.6380720126237268,
"grad_norm": 14.8125,
"learning_rate": 1.9393476044852195e-05,
"loss": 0.734,
"step": 556
},
{
"epoch": 0.6392196241572228,
"grad_norm": 63.75,
"learning_rate": 1.938837920489297e-05,
"loss": 0.6127,
"step": 557
},
{
"epoch": 0.6403672356907187,
"grad_norm": 19.625,
"learning_rate": 1.9383282364933742e-05,
"loss": 0.5999,
"step": 558
},
{
"epoch": 0.6415148472242146,
"grad_norm": 21.625,
"learning_rate": 1.9378185524974516e-05,
"loss": 0.7446,
"step": 559
},
{
"epoch": 0.6426624587577106,
"grad_norm": 26.375,
"learning_rate": 1.937308868501529e-05,
"loss": 0.6067,
"step": 560
},
{
"epoch": 0.6438100702912064,
"grad_norm": 130.0,
"learning_rate": 1.9367991845056068e-05,
"loss": 1.0849,
"step": 561
},
{
"epoch": 0.6449576818247024,
"grad_norm": 26.875,
"learning_rate": 1.936289500509684e-05,
"loss": 0.4882,
"step": 562
},
{
"epoch": 0.6461052933581982,
"grad_norm": 13.375,
"learning_rate": 1.9357798165137615e-05,
"loss": 0.6071,
"step": 563
},
{
"epoch": 0.6472529048916942,
"grad_norm": 23.625,
"learning_rate": 1.9352701325178393e-05,
"loss": 0.7541,
"step": 564
},
{
"epoch": 0.64840051642519,
"grad_norm": 22.5,
"learning_rate": 1.9347604485219163e-05,
"loss": 0.6343,
"step": 565
},
{
"epoch": 0.649548127958686,
"grad_norm": 37.75,
"learning_rate": 1.934250764525994e-05,
"loss": 0.629,
"step": 566
},
{
"epoch": 0.6506957394921818,
"grad_norm": 33.25,
"learning_rate": 1.9337410805300714e-05,
"loss": 0.6112,
"step": 567
},
{
"epoch": 0.6518433510256778,
"grad_norm": 23.625,
"learning_rate": 1.9332313965341488e-05,
"loss": 0.6854,
"step": 568
},
{
"epoch": 0.6529909625591738,
"grad_norm": 25.125,
"learning_rate": 1.9327217125382266e-05,
"loss": 0.5574,
"step": 569
},
{
"epoch": 0.6541385740926696,
"grad_norm": 22.125,
"learning_rate": 1.932212028542304e-05,
"loss": 0.4604,
"step": 570
},
{
"epoch": 0.6552861856261656,
"grad_norm": 10.625,
"learning_rate": 1.9317023445463813e-05,
"loss": 0.409,
"step": 571
},
{
"epoch": 0.6564337971596614,
"grad_norm": 11.375,
"learning_rate": 1.931192660550459e-05,
"loss": 0.4102,
"step": 572
},
{
"epoch": 0.6575814086931574,
"grad_norm": 74.5,
"learning_rate": 1.9306829765545365e-05,
"loss": 0.5481,
"step": 573
},
{
"epoch": 0.6587290202266533,
"grad_norm": 109.5,
"learning_rate": 1.930173292558614e-05,
"loss": 0.886,
"step": 574
},
{
"epoch": 0.6598766317601492,
"grad_norm": 48.75,
"learning_rate": 1.9296636085626912e-05,
"loss": 0.6536,
"step": 575
},
{
"epoch": 0.6610242432936451,
"grad_norm": 57.75,
"learning_rate": 1.9291539245667686e-05,
"loss": 0.902,
"step": 576
},
{
"epoch": 0.662171854827141,
"grad_norm": 61.5,
"learning_rate": 1.9286442405708464e-05,
"loss": 0.7151,
"step": 577
},
{
"epoch": 0.6633194663606369,
"grad_norm": 36.25,
"learning_rate": 1.9281345565749237e-05,
"loss": 0.6232,
"step": 578
},
{
"epoch": 0.6644670778941328,
"grad_norm": 11.6875,
"learning_rate": 1.927624872579001e-05,
"loss": 0.3918,
"step": 579
},
{
"epoch": 0.6656146894276288,
"grad_norm": 13.9375,
"learning_rate": 1.9271151885830785e-05,
"loss": 0.7175,
"step": 580
},
{
"epoch": 0.6667623009611247,
"grad_norm": 17.875,
"learning_rate": 1.9266055045871563e-05,
"loss": 0.7939,
"step": 581
},
{
"epoch": 0.6679099124946206,
"grad_norm": 13.9375,
"learning_rate": 1.9260958205912336e-05,
"loss": 0.6663,
"step": 582
},
{
"epoch": 0.6690575240281165,
"grad_norm": 17.75,
"learning_rate": 1.925586136595311e-05,
"loss": 0.512,
"step": 583
},
{
"epoch": 0.6702051355616124,
"grad_norm": 24.625,
"learning_rate": 1.9250764525993884e-05,
"loss": 0.8056,
"step": 584
},
{
"epoch": 0.6713527470951083,
"grad_norm": 46.75,
"learning_rate": 1.9245667686034658e-05,
"loss": 0.6661,
"step": 585
},
{
"epoch": 0.6725003586286042,
"grad_norm": 23.5,
"learning_rate": 1.9240570846075435e-05,
"loss": 0.6705,
"step": 586
},
{
"epoch": 0.6736479701621001,
"grad_norm": 55.5,
"learning_rate": 1.923547400611621e-05,
"loss": 0.7411,
"step": 587
},
{
"epoch": 0.6747955816955961,
"grad_norm": 34.25,
"learning_rate": 1.9230377166156983e-05,
"loss": 0.6056,
"step": 588
},
{
"epoch": 0.6759431932290919,
"grad_norm": 107.5,
"learning_rate": 1.922528032619776e-05,
"loss": 0.9975,
"step": 589
},
{
"epoch": 0.6770908047625879,
"grad_norm": 14.1875,
"learning_rate": 1.9220183486238534e-05,
"loss": 0.5988,
"step": 590
},
{
"epoch": 0.6782384162960837,
"grad_norm": 132.0,
"learning_rate": 1.921508664627931e-05,
"loss": 1.1598,
"step": 591
},
{
"epoch": 0.6793860278295797,
"grad_norm": 15.6875,
"learning_rate": 1.9209989806320086e-05,
"loss": 0.4172,
"step": 592
},
{
"epoch": 0.6805336393630756,
"grad_norm": 38.25,
"learning_rate": 1.9204892966360856e-05,
"loss": 0.5714,
"step": 593
},
{
"epoch": 0.6816812508965715,
"grad_norm": 14.0,
"learning_rate": 1.9199796126401633e-05,
"loss": 0.4887,
"step": 594
},
{
"epoch": 0.6828288624300675,
"grad_norm": 41.5,
"learning_rate": 1.9194699286442407e-05,
"loss": 0.6648,
"step": 595
},
{
"epoch": 0.6839764739635633,
"grad_norm": 13.875,
"learning_rate": 1.918960244648318e-05,
"loss": 0.6589,
"step": 596
},
{
"epoch": 0.6851240854970593,
"grad_norm": 35.0,
"learning_rate": 1.918450560652396e-05,
"loss": 1.0174,
"step": 597
},
{
"epoch": 0.6862716970305551,
"grad_norm": 27.875,
"learning_rate": 1.9179408766564732e-05,
"loss": 0.7711,
"step": 598
},
{
"epoch": 0.6874193085640511,
"grad_norm": 48.25,
"learning_rate": 1.9174311926605506e-05,
"loss": 0.4402,
"step": 599
},
{
"epoch": 0.6885669200975469,
"grad_norm": 49.0,
"learning_rate": 1.916921508664628e-05,
"loss": 0.4483,
"step": 600
},
{
"epoch": 0.6885669200975469,
"eval_accuracy": 0.63,
"eval_loss": 0.6301568150520325,
"eval_runtime": 49.333,
"eval_samples_per_second": 2.027,
"eval_steps_per_second": 2.027,
"step": 600
},
{
"epoch": 0.6897145316310429,
"grad_norm": 30.25,
"learning_rate": 1.9164118246687054e-05,
"loss": 0.4694,
"step": 601
},
{
"epoch": 0.6908621431645388,
"grad_norm": 29.875,
"learning_rate": 1.915902140672783e-05,
"loss": 0.374,
"step": 602
},
{
"epoch": 0.6920097546980347,
"grad_norm": 70.5,
"learning_rate": 1.9153924566768605e-05,
"loss": 0.941,
"step": 603
},
{
"epoch": 0.6931573662315307,
"grad_norm": 87.0,
"learning_rate": 1.914882772680938e-05,
"loss": 0.906,
"step": 604
},
{
"epoch": 0.6943049777650265,
"grad_norm": 14.0625,
"learning_rate": 1.9143730886850153e-05,
"loss": 0.406,
"step": 605
},
{
"epoch": 0.6954525892985225,
"grad_norm": 55.25,
"learning_rate": 1.913863404689093e-05,
"loss": 0.779,
"step": 606
},
{
"epoch": 0.6966002008320183,
"grad_norm": 36.5,
"learning_rate": 1.9133537206931704e-05,
"loss": 0.5805,
"step": 607
},
{
"epoch": 0.6977478123655143,
"grad_norm": 23.75,
"learning_rate": 1.912844036697248e-05,
"loss": 0.6406,
"step": 608
},
{
"epoch": 0.6988954238990102,
"grad_norm": 39.25,
"learning_rate": 1.9123343527013256e-05,
"loss": 0.4615,
"step": 609
},
{
"epoch": 0.7000430354325061,
"grad_norm": 67.5,
"learning_rate": 1.9118246687054026e-05,
"loss": 0.8637,
"step": 610
},
{
"epoch": 0.701190646966002,
"grad_norm": 22.5,
"learning_rate": 1.9113149847094803e-05,
"loss": 0.4727,
"step": 611
},
{
"epoch": 0.7023382584994979,
"grad_norm": 27.125,
"learning_rate": 1.9108053007135577e-05,
"loss": 0.6226,
"step": 612
},
{
"epoch": 0.7034858700329938,
"grad_norm": 62.5,
"learning_rate": 1.910295616717635e-05,
"loss": 0.6596,
"step": 613
},
{
"epoch": 0.7046334815664897,
"grad_norm": 40.0,
"learning_rate": 1.909785932721713e-05,
"loss": 0.5424,
"step": 614
},
{
"epoch": 0.7057810930999856,
"grad_norm": 62.5,
"learning_rate": 1.9092762487257902e-05,
"loss": 0.6348,
"step": 615
},
{
"epoch": 0.7069287046334816,
"grad_norm": 80.5,
"learning_rate": 1.9087665647298676e-05,
"loss": 0.9329,
"step": 616
},
{
"epoch": 0.7080763161669775,
"grad_norm": 101.5,
"learning_rate": 1.9082568807339454e-05,
"loss": 1.0578,
"step": 617
},
{
"epoch": 0.7092239277004734,
"grad_norm": 23.375,
"learning_rate": 1.9077471967380224e-05,
"loss": 0.6725,
"step": 618
},
{
"epoch": 0.7103715392339693,
"grad_norm": 42.0,
"learning_rate": 1.9072375127421e-05,
"loss": 0.6087,
"step": 619
},
{
"epoch": 0.7115191507674652,
"grad_norm": 37.0,
"learning_rate": 1.9067278287461775e-05,
"loss": 0.6237,
"step": 620
},
{
"epoch": 0.7126667623009612,
"grad_norm": 18.5,
"learning_rate": 1.906218144750255e-05,
"loss": 0.6327,
"step": 621
},
{
"epoch": 0.713814373834457,
"grad_norm": 27.375,
"learning_rate": 1.9057084607543327e-05,
"loss": 0.7476,
"step": 622
},
{
"epoch": 0.714961985367953,
"grad_norm": 28.25,
"learning_rate": 1.90519877675841e-05,
"loss": 0.7312,
"step": 623
},
{
"epoch": 0.7161095969014488,
"grad_norm": 57.75,
"learning_rate": 1.9046890927624874e-05,
"loss": 0.6385,
"step": 624
},
{
"epoch": 0.7172572084349448,
"grad_norm": 61.75,
"learning_rate": 1.9041794087665648e-05,
"loss": 0.3454,
"step": 625
},
{
"epoch": 0.7184048199684406,
"grad_norm": 39.0,
"learning_rate": 1.9036697247706422e-05,
"loss": 0.6641,
"step": 626
},
{
"epoch": 0.7195524315019366,
"grad_norm": 62.5,
"learning_rate": 1.9031600407747196e-05,
"loss": 0.8142,
"step": 627
},
{
"epoch": 0.7207000430354326,
"grad_norm": 16.375,
"learning_rate": 1.9026503567787973e-05,
"loss": 0.6127,
"step": 628
},
{
"epoch": 0.7218476545689284,
"grad_norm": 57.75,
"learning_rate": 1.9021406727828747e-05,
"loss": 0.7968,
"step": 629
},
{
"epoch": 0.7229952661024244,
"grad_norm": 25.375,
"learning_rate": 1.901630988786952e-05,
"loss": 0.4927,
"step": 630
},
{
"epoch": 0.7241428776359202,
"grad_norm": 48.25,
"learning_rate": 1.90112130479103e-05,
"loss": 0.6833,
"step": 631
},
{
"epoch": 0.7252904891694162,
"grad_norm": 54.75,
"learning_rate": 1.9006116207951072e-05,
"loss": 0.6494,
"step": 632
},
{
"epoch": 0.726438100702912,
"grad_norm": 54.5,
"learning_rate": 1.9001019367991846e-05,
"loss": 0.2914,
"step": 633
},
{
"epoch": 0.727585712236408,
"grad_norm": 10.125,
"learning_rate": 1.8995922528032624e-05,
"loss": 0.745,
"step": 634
},
{
"epoch": 0.7287333237699039,
"grad_norm": 45.75,
"learning_rate": 1.8990825688073394e-05,
"loss": 0.8078,
"step": 635
},
{
"epoch": 0.7298809353033998,
"grad_norm": 95.0,
"learning_rate": 1.898572884811417e-05,
"loss": 0.9361,
"step": 636
},
{
"epoch": 0.7310285468368957,
"grad_norm": 64.5,
"learning_rate": 1.8980632008154945e-05,
"loss": 0.5982,
"step": 637
},
{
"epoch": 0.7321761583703916,
"grad_norm": 22.25,
"learning_rate": 1.897553516819572e-05,
"loss": 0.5722,
"step": 638
},
{
"epoch": 0.7333237699038875,
"grad_norm": 51.5,
"learning_rate": 1.8970438328236496e-05,
"loss": 0.7216,
"step": 639
},
{
"epoch": 0.7344713814373834,
"grad_norm": 18.5,
"learning_rate": 1.896534148827727e-05,
"loss": 0.5961,
"step": 640
},
{
"epoch": 0.7356189929708794,
"grad_norm": 47.75,
"learning_rate": 1.8960244648318044e-05,
"loss": 0.6,
"step": 641
},
{
"epoch": 0.7367666045043753,
"grad_norm": 67.0,
"learning_rate": 1.8955147808358818e-05,
"loss": 0.7799,
"step": 642
},
{
"epoch": 0.7379142160378712,
"grad_norm": 90.5,
"learning_rate": 1.8950050968399592e-05,
"loss": 0.979,
"step": 643
},
{
"epoch": 0.7390618275713671,
"grad_norm": 47.0,
"learning_rate": 1.894495412844037e-05,
"loss": 0.6637,
"step": 644
},
{
"epoch": 0.740209439104863,
"grad_norm": 28.875,
"learning_rate": 1.8939857288481143e-05,
"loss": 0.7095,
"step": 645
},
{
"epoch": 0.7413570506383589,
"grad_norm": 70.5,
"learning_rate": 1.8934760448521917e-05,
"loss": 0.7767,
"step": 646
},
{
"epoch": 0.7425046621718548,
"grad_norm": 67.0,
"learning_rate": 1.892966360856269e-05,
"loss": 0.8117,
"step": 647
},
{
"epoch": 0.7436522737053507,
"grad_norm": 47.0,
"learning_rate": 1.892456676860347e-05,
"loss": 0.7253,
"step": 648
},
{
"epoch": 0.7447998852388467,
"grad_norm": 42.25,
"learning_rate": 1.8919469928644242e-05,
"loss": 0.5712,
"step": 649
},
{
"epoch": 0.7459474967723425,
"grad_norm": 10.3125,
"learning_rate": 1.8914373088685016e-05,
"loss": 0.4933,
"step": 650
},
{
"epoch": 0.7470951083058385,
"grad_norm": 17.875,
"learning_rate": 1.8909276248725793e-05,
"loss": 0.468,
"step": 651
},
{
"epoch": 0.7482427198393344,
"grad_norm": 61.75,
"learning_rate": 1.8904179408766564e-05,
"loss": 0.7518,
"step": 652
},
{
"epoch": 0.7493903313728303,
"grad_norm": 71.0,
"learning_rate": 1.889908256880734e-05,
"loss": 0.8373,
"step": 653
},
{
"epoch": 0.7505379429063262,
"grad_norm": 26.75,
"learning_rate": 1.8893985728848115e-05,
"loss": 0.7643,
"step": 654
},
{
"epoch": 0.7516855544398221,
"grad_norm": 28.25,
"learning_rate": 1.888888888888889e-05,
"loss": 0.5331,
"step": 655
},
{
"epoch": 0.7528331659733181,
"grad_norm": 8.375,
"learning_rate": 1.8883792048929666e-05,
"loss": 0.4439,
"step": 656
},
{
"epoch": 0.7539807775068139,
"grad_norm": 17.875,
"learning_rate": 1.887869520897044e-05,
"loss": 0.5831,
"step": 657
},
{
"epoch": 0.7551283890403099,
"grad_norm": 34.25,
"learning_rate": 1.8873598369011214e-05,
"loss": 0.5412,
"step": 658
},
{
"epoch": 0.7562760005738057,
"grad_norm": 50.75,
"learning_rate": 1.886850152905199e-05,
"loss": 0.5549,
"step": 659
},
{
"epoch": 0.7574236121073017,
"grad_norm": 56.0,
"learning_rate": 1.8863404689092762e-05,
"loss": 0.4515,
"step": 660
},
{
"epoch": 0.7585712236407975,
"grad_norm": 76.5,
"learning_rate": 1.885830784913354e-05,
"loss": 0.8915,
"step": 661
},
{
"epoch": 0.7597188351742935,
"grad_norm": 88.0,
"learning_rate": 1.8853211009174313e-05,
"loss": 0.7725,
"step": 662
},
{
"epoch": 0.7608664467077895,
"grad_norm": 32.0,
"learning_rate": 1.8848114169215087e-05,
"loss": 0.674,
"step": 663
},
{
"epoch": 0.7620140582412853,
"grad_norm": 37.0,
"learning_rate": 1.8843017329255864e-05,
"loss": 0.4771,
"step": 664
},
{
"epoch": 0.7631616697747813,
"grad_norm": 47.5,
"learning_rate": 1.883792048929664e-05,
"loss": 0.665,
"step": 665
},
{
"epoch": 0.7643092813082771,
"grad_norm": 49.0,
"learning_rate": 1.8832823649337412e-05,
"loss": 0.5971,
"step": 666
},
{
"epoch": 0.7654568928417731,
"grad_norm": 17.875,
"learning_rate": 1.8827726809378186e-05,
"loss": 0.507,
"step": 667
},
{
"epoch": 0.766604504375269,
"grad_norm": 17.5,
"learning_rate": 1.8822629969418963e-05,
"loss": 0.3678,
"step": 668
},
{
"epoch": 0.7677521159087649,
"grad_norm": 39.5,
"learning_rate": 1.8817533129459737e-05,
"loss": 0.5468,
"step": 669
},
{
"epoch": 0.7688997274422608,
"grad_norm": 26.375,
"learning_rate": 1.881243628950051e-05,
"loss": 0.4158,
"step": 670
},
{
"epoch": 0.7700473389757567,
"grad_norm": 70.5,
"learning_rate": 1.8807339449541285e-05,
"loss": 0.8145,
"step": 671
},
{
"epoch": 0.7711949505092526,
"grad_norm": 57.0,
"learning_rate": 1.880224260958206e-05,
"loss": 0.5283,
"step": 672
},
{
"epoch": 0.7723425620427485,
"grad_norm": 15.875,
"learning_rate": 1.8797145769622836e-05,
"loss": 0.6116,
"step": 673
},
{
"epoch": 0.7734901735762444,
"grad_norm": 10.6875,
"learning_rate": 1.879204892966361e-05,
"loss": 0.6081,
"step": 674
},
{
"epoch": 0.7746377851097404,
"grad_norm": 41.0,
"learning_rate": 1.8786952089704384e-05,
"loss": 0.6481,
"step": 675
},
{
"epoch": 0.7757853966432363,
"grad_norm": 13.375,
"learning_rate": 1.878185524974516e-05,
"loss": 0.4866,
"step": 676
},
{
"epoch": 0.7769330081767322,
"grad_norm": 32.75,
"learning_rate": 1.8776758409785932e-05,
"loss": 0.7627,
"step": 677
},
{
"epoch": 0.7780806197102281,
"grad_norm": 83.0,
"learning_rate": 1.877166156982671e-05,
"loss": 0.8497,
"step": 678
},
{
"epoch": 0.779228231243724,
"grad_norm": 46.75,
"learning_rate": 1.8766564729867483e-05,
"loss": 0.3555,
"step": 679
},
{
"epoch": 0.7803758427772199,
"grad_norm": 67.0,
"learning_rate": 1.8761467889908257e-05,
"loss": 0.7386,
"step": 680
},
{
"epoch": 0.7815234543107158,
"grad_norm": 98.5,
"learning_rate": 1.8756371049949034e-05,
"loss": 0.6743,
"step": 681
},
{
"epoch": 0.7826710658442118,
"grad_norm": 52.25,
"learning_rate": 1.8751274209989808e-05,
"loss": 0.6193,
"step": 682
},
{
"epoch": 0.7838186773777076,
"grad_norm": 18.0,
"learning_rate": 1.8746177370030582e-05,
"loss": 0.4905,
"step": 683
},
{
"epoch": 0.7849662889112036,
"grad_norm": 37.75,
"learning_rate": 1.874108053007136e-05,
"loss": 0.4043,
"step": 684
},
{
"epoch": 0.7861139004446994,
"grad_norm": 22.25,
"learning_rate": 1.8735983690112133e-05,
"loss": 0.5243,
"step": 685
},
{
"epoch": 0.7872615119781954,
"grad_norm": 21.75,
"learning_rate": 1.8730886850152907e-05,
"loss": 0.4342,
"step": 686
},
{
"epoch": 0.7884091235116913,
"grad_norm": 31.375,
"learning_rate": 1.872579001019368e-05,
"loss": 0.5206,
"step": 687
},
{
"epoch": 0.7895567350451872,
"grad_norm": 21.25,
"learning_rate": 1.8720693170234455e-05,
"loss": 0.6048,
"step": 688
},
{
"epoch": 0.7907043465786832,
"grad_norm": 49.0,
"learning_rate": 1.8715596330275232e-05,
"loss": 0.5758,
"step": 689
},
{
"epoch": 0.791851958112179,
"grad_norm": 26.75,
"learning_rate": 1.8710499490316006e-05,
"loss": 0.5446,
"step": 690
},
{
"epoch": 0.792999569645675,
"grad_norm": 28.5,
"learning_rate": 1.870540265035678e-05,
"loss": 0.4504,
"step": 691
},
{
"epoch": 0.7941471811791708,
"grad_norm": 28.375,
"learning_rate": 1.8700305810397554e-05,
"loss": 0.7349,
"step": 692
},
{
"epoch": 0.7952947927126668,
"grad_norm": 102.0,
"learning_rate": 1.869520897043833e-05,
"loss": 0.6082,
"step": 693
},
{
"epoch": 0.7964424042461626,
"grad_norm": 87.0,
"learning_rate": 1.8690112130479105e-05,
"loss": 0.7548,
"step": 694
},
{
"epoch": 0.7975900157796586,
"grad_norm": 24.875,
"learning_rate": 1.868501529051988e-05,
"loss": 0.7732,
"step": 695
},
{
"epoch": 0.7987376273131545,
"grad_norm": 44.0,
"learning_rate": 1.8679918450560653e-05,
"loss": 0.5928,
"step": 696
},
{
"epoch": 0.7998852388466504,
"grad_norm": 24.375,
"learning_rate": 1.8674821610601427e-05,
"loss": 0.5727,
"step": 697
},
{
"epoch": 0.8010328503801463,
"grad_norm": 72.5,
"learning_rate": 1.8669724770642204e-05,
"loss": 0.811,
"step": 698
},
{
"epoch": 0.8021804619136422,
"grad_norm": 13.25,
"learning_rate": 1.8664627930682978e-05,
"loss": 0.2618,
"step": 699
},
{
"epoch": 0.8033280734471382,
"grad_norm": 34.5,
"learning_rate": 1.8659531090723752e-05,
"loss": 0.8214,
"step": 700
},
{
"epoch": 0.8033280734471382,
"eval_accuracy": 0.61,
"eval_loss": 0.5829592347145081,
"eval_runtime": 49.9174,
"eval_samples_per_second": 2.003,
"eval_steps_per_second": 2.003,
"step": 700
},
{
"epoch": 0.804475684980634,
"grad_norm": 25.125,
"learning_rate": 1.865443425076453e-05,
"loss": 0.5568,
"step": 701
},
{
"epoch": 0.80562329651413,
"grad_norm": 14.9375,
"learning_rate": 1.86493374108053e-05,
"loss": 0.3704,
"step": 702
},
{
"epoch": 0.8067709080476259,
"grad_norm": 15.8125,
"learning_rate": 1.8644240570846077e-05,
"loss": 0.4246,
"step": 703
},
{
"epoch": 0.8079185195811218,
"grad_norm": 16.125,
"learning_rate": 1.863914373088685e-05,
"loss": 0.3896,
"step": 704
},
{
"epoch": 0.8090661311146177,
"grad_norm": 50.5,
"learning_rate": 1.8634046890927625e-05,
"loss": 0.3966,
"step": 705
},
{
"epoch": 0.8102137426481136,
"grad_norm": 45.25,
"learning_rate": 1.8628950050968402e-05,
"loss": 0.3742,
"step": 706
},
{
"epoch": 0.8113613541816095,
"grad_norm": 39.75,
"learning_rate": 1.8623853211009176e-05,
"loss": 0.4672,
"step": 707
},
{
"epoch": 0.8125089657151054,
"grad_norm": 39.25,
"learning_rate": 1.861875637104995e-05,
"loss": 0.6046,
"step": 708
},
{
"epoch": 0.8136565772486013,
"grad_norm": 38.25,
"learning_rate": 1.8613659531090724e-05,
"loss": 1.0867,
"step": 709
},
{
"epoch": 0.8148041887820973,
"grad_norm": 26.875,
"learning_rate": 1.86085626911315e-05,
"loss": 0.3141,
"step": 710
},
{
"epoch": 0.8159518003155932,
"grad_norm": 53.0,
"learning_rate": 1.8603465851172275e-05,
"loss": 0.8153,
"step": 711
},
{
"epoch": 0.8170994118490891,
"grad_norm": 35.0,
"learning_rate": 1.859836901121305e-05,
"loss": 0.7676,
"step": 712
},
{
"epoch": 0.818247023382585,
"grad_norm": 55.75,
"learning_rate": 1.8593272171253823e-05,
"loss": 0.5664,
"step": 713
},
{
"epoch": 0.8193946349160809,
"grad_norm": 14.5,
"learning_rate": 1.8588175331294597e-05,
"loss": 0.6436,
"step": 714
},
{
"epoch": 0.8205422464495769,
"grad_norm": 29.0,
"learning_rate": 1.8583078491335374e-05,
"loss": 0.4355,
"step": 715
},
{
"epoch": 0.8216898579830727,
"grad_norm": 82.5,
"learning_rate": 1.8577981651376148e-05,
"loss": 1.2766,
"step": 716
},
{
"epoch": 0.8228374695165687,
"grad_norm": 37.5,
"learning_rate": 1.8572884811416922e-05,
"loss": 0.4578,
"step": 717
},
{
"epoch": 0.8239850810500645,
"grad_norm": 16.875,
"learning_rate": 1.85677879714577e-05,
"loss": 0.5334,
"step": 718
},
{
"epoch": 0.8251326925835605,
"grad_norm": 22.875,
"learning_rate": 1.856269113149847e-05,
"loss": 0.5546,
"step": 719
},
{
"epoch": 0.8262803041170563,
"grad_norm": 106.0,
"learning_rate": 1.8557594291539247e-05,
"loss": 0.9589,
"step": 720
},
{
"epoch": 0.8274279156505523,
"grad_norm": 21.75,
"learning_rate": 1.855249745158002e-05,
"loss": 0.5682,
"step": 721
},
{
"epoch": 0.8285755271840483,
"grad_norm": 52.75,
"learning_rate": 1.8547400611620795e-05,
"loss": 0.4809,
"step": 722
},
{
"epoch": 0.8297231387175441,
"grad_norm": 19.0,
"learning_rate": 1.8542303771661572e-05,
"loss": 0.3774,
"step": 723
},
{
"epoch": 0.8308707502510401,
"grad_norm": 25.125,
"learning_rate": 1.8537206931702346e-05,
"loss": 0.4828,
"step": 724
},
{
"epoch": 0.8320183617845359,
"grad_norm": 34.0,
"learning_rate": 1.853211009174312e-05,
"loss": 0.4859,
"step": 725
},
{
"epoch": 0.8331659733180319,
"grad_norm": 48.75,
"learning_rate": 1.8527013251783897e-05,
"loss": 0.5612,
"step": 726
},
{
"epoch": 0.8343135848515277,
"grad_norm": 45.5,
"learning_rate": 1.852191641182467e-05,
"loss": 0.5976,
"step": 727
},
{
"epoch": 0.8354611963850237,
"grad_norm": 52.0,
"learning_rate": 1.8516819571865445e-05,
"loss": 0.6551,
"step": 728
},
{
"epoch": 0.8366088079185195,
"grad_norm": 23.75,
"learning_rate": 1.851172273190622e-05,
"loss": 0.5022,
"step": 729
},
{
"epoch": 0.8377564194520155,
"grad_norm": 71.5,
"learning_rate": 1.8506625891946993e-05,
"loss": 0.8474,
"step": 730
},
{
"epoch": 0.8389040309855114,
"grad_norm": 67.5,
"learning_rate": 1.850152905198777e-05,
"loss": 0.6511,
"step": 731
},
{
"epoch": 0.8400516425190073,
"grad_norm": 36.5,
"learning_rate": 1.8496432212028544e-05,
"loss": 0.5485,
"step": 732
},
{
"epoch": 0.8411992540525032,
"grad_norm": 63.25,
"learning_rate": 1.8491335372069318e-05,
"loss": 0.7833,
"step": 733
},
{
"epoch": 0.8423468655859991,
"grad_norm": 11.625,
"learning_rate": 1.8486238532110092e-05,
"loss": 0.5295,
"step": 734
},
{
"epoch": 0.8434944771194951,
"grad_norm": 44.25,
"learning_rate": 1.848114169215087e-05,
"loss": 0.4733,
"step": 735
},
{
"epoch": 0.844642088652991,
"grad_norm": 6.90625,
"learning_rate": 1.8476044852191643e-05,
"loss": 0.2207,
"step": 736
},
{
"epoch": 0.8457897001864869,
"grad_norm": 63.0,
"learning_rate": 1.8470948012232417e-05,
"loss": 0.5543,
"step": 737
},
{
"epoch": 0.8469373117199828,
"grad_norm": 19.5,
"learning_rate": 1.846585117227319e-05,
"loss": 0.4689,
"step": 738
},
{
"epoch": 0.8480849232534787,
"grad_norm": 14.6875,
"learning_rate": 1.8460754332313965e-05,
"loss": 0.5446,
"step": 739
},
{
"epoch": 0.8492325347869746,
"grad_norm": 68.5,
"learning_rate": 1.8455657492354742e-05,
"loss": 0.6652,
"step": 740
},
{
"epoch": 0.8503801463204705,
"grad_norm": 25.0,
"learning_rate": 1.8450560652395516e-05,
"loss": 0.3413,
"step": 741
},
{
"epoch": 0.8515277578539664,
"grad_norm": 43.5,
"learning_rate": 1.844546381243629e-05,
"loss": 0.5552,
"step": 742
},
{
"epoch": 0.8526753693874624,
"grad_norm": 50.25,
"learning_rate": 1.8440366972477067e-05,
"loss": 0.5051,
"step": 743
},
{
"epoch": 0.8538229809209582,
"grad_norm": 30.625,
"learning_rate": 1.843527013251784e-05,
"loss": 0.5442,
"step": 744
},
{
"epoch": 0.8549705924544542,
"grad_norm": 21.75,
"learning_rate": 1.8430173292558615e-05,
"loss": 0.724,
"step": 745
},
{
"epoch": 0.8561182039879501,
"grad_norm": 23.0,
"learning_rate": 1.8425076452599392e-05,
"loss": 0.6982,
"step": 746
},
{
"epoch": 0.857265815521446,
"grad_norm": 34.5,
"learning_rate": 1.8419979612640163e-05,
"loss": 0.4268,
"step": 747
},
{
"epoch": 0.858413427054942,
"grad_norm": 12.0,
"learning_rate": 1.841488277268094e-05,
"loss": 0.5611,
"step": 748
},
{
"epoch": 0.8595610385884378,
"grad_norm": 20.5,
"learning_rate": 1.8409785932721714e-05,
"loss": 0.552,
"step": 749
},
{
"epoch": 0.8607086501219338,
"grad_norm": 35.0,
"learning_rate": 1.8404689092762488e-05,
"loss": 0.5549,
"step": 750
},
{
"epoch": 0.8618562616554296,
"grad_norm": 43.5,
"learning_rate": 1.8399592252803265e-05,
"loss": 0.5242,
"step": 751
},
{
"epoch": 0.8630038731889256,
"grad_norm": 13.875,
"learning_rate": 1.839449541284404e-05,
"loss": 0.3858,
"step": 752
},
{
"epoch": 0.8641514847224214,
"grad_norm": 45.25,
"learning_rate": 1.8389398572884813e-05,
"loss": 0.6547,
"step": 753
},
{
"epoch": 0.8652990962559174,
"grad_norm": 66.5,
"learning_rate": 1.8384301732925587e-05,
"loss": 0.5999,
"step": 754
},
{
"epoch": 0.8664467077894132,
"grad_norm": 31.25,
"learning_rate": 1.837920489296636e-05,
"loss": 0.6402,
"step": 755
},
{
"epoch": 0.8675943193229092,
"grad_norm": 16.125,
"learning_rate": 1.8374108053007138e-05,
"loss": 0.4183,
"step": 756
},
{
"epoch": 0.868741930856405,
"grad_norm": 34.75,
"learning_rate": 1.8369011213047912e-05,
"loss": 0.46,
"step": 757
},
{
"epoch": 0.869889542389901,
"grad_norm": 11.6875,
"learning_rate": 1.8363914373088686e-05,
"loss": 0.5179,
"step": 758
},
{
"epoch": 0.871037153923397,
"grad_norm": 60.0,
"learning_rate": 1.835881753312946e-05,
"loss": 0.5217,
"step": 759
},
{
"epoch": 0.8721847654568928,
"grad_norm": 9.75,
"learning_rate": 1.8353720693170237e-05,
"loss": 0.3772,
"step": 760
},
{
"epoch": 0.8733323769903888,
"grad_norm": 11.75,
"learning_rate": 1.834862385321101e-05,
"loss": 0.5367,
"step": 761
},
{
"epoch": 0.8744799885238846,
"grad_norm": 41.5,
"learning_rate": 1.8343527013251785e-05,
"loss": 0.573,
"step": 762
},
{
"epoch": 0.8756276000573806,
"grad_norm": 34.25,
"learning_rate": 1.8338430173292562e-05,
"loss": 0.1937,
"step": 763
},
{
"epoch": 0.8767752115908765,
"grad_norm": 23.125,
"learning_rate": 1.8333333333333333e-05,
"loss": 0.4614,
"step": 764
},
{
"epoch": 0.8779228231243724,
"grad_norm": 46.5,
"learning_rate": 1.832823649337411e-05,
"loss": 0.442,
"step": 765
},
{
"epoch": 0.8790704346578683,
"grad_norm": 74.5,
"learning_rate": 1.8323139653414884e-05,
"loss": 0.9426,
"step": 766
},
{
"epoch": 0.8802180461913642,
"grad_norm": 65.5,
"learning_rate": 1.8318042813455658e-05,
"loss": 0.8281,
"step": 767
},
{
"epoch": 0.8813656577248601,
"grad_norm": 30.375,
"learning_rate": 1.8312945973496435e-05,
"loss": 0.6035,
"step": 768
},
{
"epoch": 0.882513269258356,
"grad_norm": 89.5,
"learning_rate": 1.830784913353721e-05,
"loss": 0.8813,
"step": 769
},
{
"epoch": 0.883660880791852,
"grad_norm": 71.5,
"learning_rate": 1.8302752293577983e-05,
"loss": 0.7809,
"step": 770
},
{
"epoch": 0.8848084923253479,
"grad_norm": 87.5,
"learning_rate": 1.829765545361876e-05,
"loss": 0.9051,
"step": 771
},
{
"epoch": 0.8859561038588438,
"grad_norm": 78.5,
"learning_rate": 1.829255861365953e-05,
"loss": 0.8777,
"step": 772
},
{
"epoch": 0.8871037153923397,
"grad_norm": 38.5,
"learning_rate": 1.8287461773700308e-05,
"loss": 0.3393,
"step": 773
},
{
"epoch": 0.8882513269258356,
"grad_norm": 34.0,
"learning_rate": 1.8282364933741082e-05,
"loss": 0.4772,
"step": 774
},
{
"epoch": 0.8893989384593315,
"grad_norm": 42.25,
"learning_rate": 1.8277268093781856e-05,
"loss": 0.5136,
"step": 775
},
{
"epoch": 0.8905465499928275,
"grad_norm": 51.25,
"learning_rate": 1.8272171253822633e-05,
"loss": 0.4965,
"step": 776
},
{
"epoch": 0.8916941615263233,
"grad_norm": 23.5,
"learning_rate": 1.8267074413863407e-05,
"loss": 0.6667,
"step": 777
},
{
"epoch": 0.8928417730598193,
"grad_norm": 69.0,
"learning_rate": 1.826197757390418e-05,
"loss": 0.7309,
"step": 778
},
{
"epoch": 0.8939893845933151,
"grad_norm": 13.5625,
"learning_rate": 1.8256880733944955e-05,
"loss": 0.5041,
"step": 779
},
{
"epoch": 0.8951369961268111,
"grad_norm": 115.0,
"learning_rate": 1.825178389398573e-05,
"loss": 0.9974,
"step": 780
},
{
"epoch": 0.8962846076603069,
"grad_norm": 19.125,
"learning_rate": 1.8246687054026503e-05,
"loss": 0.5556,
"step": 781
},
{
"epoch": 0.8974322191938029,
"grad_norm": 52.5,
"learning_rate": 1.824159021406728e-05,
"loss": 0.7631,
"step": 782
},
{
"epoch": 0.8985798307272989,
"grad_norm": 10.4375,
"learning_rate": 1.8236493374108054e-05,
"loss": 0.6217,
"step": 783
},
{
"epoch": 0.8997274422607947,
"grad_norm": 20.625,
"learning_rate": 1.8231396534148828e-05,
"loss": 0.4404,
"step": 784
},
{
"epoch": 0.9008750537942907,
"grad_norm": 81.0,
"learning_rate": 1.8226299694189605e-05,
"loss": 0.8382,
"step": 785
},
{
"epoch": 0.9020226653277865,
"grad_norm": 25.125,
"learning_rate": 1.822120285423038e-05,
"loss": 0.465,
"step": 786
},
{
"epoch": 0.9031702768612825,
"grad_norm": 4.90625,
"learning_rate": 1.8216106014271153e-05,
"loss": 0.2211,
"step": 787
},
{
"epoch": 0.9043178883947783,
"grad_norm": 18.25,
"learning_rate": 1.821100917431193e-05,
"loss": 0.5354,
"step": 788
},
{
"epoch": 0.9054654999282743,
"grad_norm": 22.625,
"learning_rate": 1.82059123343527e-05,
"loss": 0.4656,
"step": 789
},
{
"epoch": 0.9066131114617701,
"grad_norm": 22.125,
"learning_rate": 1.8200815494393478e-05,
"loss": 0.7412,
"step": 790
},
{
"epoch": 0.9077607229952661,
"grad_norm": 18.125,
"learning_rate": 1.8195718654434252e-05,
"loss": 0.5085,
"step": 791
},
{
"epoch": 0.908908334528762,
"grad_norm": 17.625,
"learning_rate": 1.8190621814475026e-05,
"loss": 0.4275,
"step": 792
},
{
"epoch": 0.9100559460622579,
"grad_norm": 74.5,
"learning_rate": 1.8185524974515803e-05,
"loss": 0.8506,
"step": 793
},
{
"epoch": 0.9112035575957539,
"grad_norm": 59.75,
"learning_rate": 1.8180428134556577e-05,
"loss": 0.5863,
"step": 794
},
{
"epoch": 0.9123511691292497,
"grad_norm": 17.875,
"learning_rate": 1.817533129459735e-05,
"loss": 0.5018,
"step": 795
},
{
"epoch": 0.9134987806627457,
"grad_norm": 73.0,
"learning_rate": 1.8170234454638125e-05,
"loss": 0.7539,
"step": 796
},
{
"epoch": 0.9146463921962416,
"grad_norm": 38.0,
"learning_rate": 1.81651376146789e-05,
"loss": 0.6021,
"step": 797
},
{
"epoch": 0.9157940037297375,
"grad_norm": 34.75,
"learning_rate": 1.8160040774719676e-05,
"loss": 0.4989,
"step": 798
},
{
"epoch": 0.9169416152632334,
"grad_norm": 29.25,
"learning_rate": 1.815494393476045e-05,
"loss": 0.5503,
"step": 799
},
{
"epoch": 0.9180892267967293,
"grad_norm": 113.0,
"learning_rate": 1.8149847094801224e-05,
"loss": 0.7238,
"step": 800
},
{
"epoch": 0.9180892267967293,
"eval_accuracy": 0.67,
"eval_loss": 0.5950115323066711,
"eval_runtime": 49.3005,
"eval_samples_per_second": 2.028,
"eval_steps_per_second": 2.028,
"step": 800
},
{
"epoch": 0.9192368383302252,
"grad_norm": 12.9375,
"learning_rate": 1.8144750254841998e-05,
"loss": 0.632,
"step": 801
},
{
"epoch": 0.9203844498637211,
"grad_norm": 49.75,
"learning_rate": 1.8139653414882775e-05,
"loss": 0.6413,
"step": 802
},
{
"epoch": 0.921532061397217,
"grad_norm": 13.125,
"learning_rate": 1.813455657492355e-05,
"loss": 0.5482,
"step": 803
},
{
"epoch": 0.922679672930713,
"grad_norm": 20.125,
"learning_rate": 1.8129459734964323e-05,
"loss": 0.5773,
"step": 804
},
{
"epoch": 0.9238272844642089,
"grad_norm": 100.0,
"learning_rate": 1.81243628950051e-05,
"loss": 1.35,
"step": 805
},
{
"epoch": 0.9249748959977048,
"grad_norm": 25.75,
"learning_rate": 1.811926605504587e-05,
"loss": 0.5234,
"step": 806
},
{
"epoch": 0.9261225075312007,
"grad_norm": 37.0,
"learning_rate": 1.8114169215086648e-05,
"loss": 0.473,
"step": 807
},
{
"epoch": 0.9272701190646966,
"grad_norm": 29.0,
"learning_rate": 1.8109072375127422e-05,
"loss": 0.4716,
"step": 808
},
{
"epoch": 0.9284177305981925,
"grad_norm": 22.0,
"learning_rate": 1.8103975535168196e-05,
"loss": 0.5146,
"step": 809
},
{
"epoch": 0.9295653421316884,
"grad_norm": 11.75,
"learning_rate": 1.8098878695208973e-05,
"loss": 0.6532,
"step": 810
},
{
"epoch": 0.9307129536651844,
"grad_norm": 14.375,
"learning_rate": 1.8093781855249747e-05,
"loss": 0.5441,
"step": 811
},
{
"epoch": 0.9318605651986802,
"grad_norm": 42.0,
"learning_rate": 1.808868501529052e-05,
"loss": 0.4905,
"step": 812
},
{
"epoch": 0.9330081767321762,
"grad_norm": 64.0,
"learning_rate": 1.8083588175331298e-05,
"loss": 0.8364,
"step": 813
},
{
"epoch": 0.934155788265672,
"grad_norm": 28.875,
"learning_rate": 1.807849133537207e-05,
"loss": 0.414,
"step": 814
},
{
"epoch": 0.935303399799168,
"grad_norm": 7.6875,
"learning_rate": 1.8073394495412846e-05,
"loss": 0.3923,
"step": 815
},
{
"epoch": 0.9364510113326638,
"grad_norm": 32.25,
"learning_rate": 1.806829765545362e-05,
"loss": 0.5358,
"step": 816
},
{
"epoch": 0.9375986228661598,
"grad_norm": 57.5,
"learning_rate": 1.8063200815494394e-05,
"loss": 0.4813,
"step": 817
},
{
"epoch": 0.9387462343996558,
"grad_norm": 33.5,
"learning_rate": 1.805810397553517e-05,
"loss": 0.4693,
"step": 818
},
{
"epoch": 0.9398938459331516,
"grad_norm": 35.0,
"learning_rate": 1.8053007135575945e-05,
"loss": 0.3321,
"step": 819
},
{
"epoch": 0.9410414574666476,
"grad_norm": 19.75,
"learning_rate": 1.804791029561672e-05,
"loss": 0.5709,
"step": 820
},
{
"epoch": 0.9421890690001434,
"grad_norm": 133.0,
"learning_rate": 1.8042813455657493e-05,
"loss": 1.0803,
"step": 821
},
{
"epoch": 0.9433366805336394,
"grad_norm": 7.90625,
"learning_rate": 1.803771661569827e-05,
"loss": 0.5182,
"step": 822
},
{
"epoch": 0.9444842920671352,
"grad_norm": 25.375,
"learning_rate": 1.8032619775739044e-05,
"loss": 0.6489,
"step": 823
},
{
"epoch": 0.9456319036006312,
"grad_norm": 24.125,
"learning_rate": 1.8027522935779818e-05,
"loss": 0.5298,
"step": 824
},
{
"epoch": 0.9467795151341271,
"grad_norm": 26.5,
"learning_rate": 1.8022426095820592e-05,
"loss": 0.4967,
"step": 825
},
{
"epoch": 0.947927126667623,
"grad_norm": 55.5,
"learning_rate": 1.8017329255861366e-05,
"loss": 0.7623,
"step": 826
},
{
"epoch": 0.9490747382011189,
"grad_norm": 38.75,
"learning_rate": 1.8012232415902143e-05,
"loss": 0.3873,
"step": 827
},
{
"epoch": 0.9502223497346148,
"grad_norm": 36.5,
"learning_rate": 1.8007135575942917e-05,
"loss": 0.4885,
"step": 828
},
{
"epoch": 0.9513699612681108,
"grad_norm": 36.75,
"learning_rate": 1.800203873598369e-05,
"loss": 0.4786,
"step": 829
},
{
"epoch": 0.9525175728016066,
"grad_norm": 15.9375,
"learning_rate": 1.7996941896024468e-05,
"loss": 0.7344,
"step": 830
},
{
"epoch": 0.9536651843351026,
"grad_norm": 35.75,
"learning_rate": 1.799184505606524e-05,
"loss": 0.4364,
"step": 831
},
{
"epoch": 0.9548127958685985,
"grad_norm": 60.5,
"learning_rate": 1.7986748216106016e-05,
"loss": 0.4018,
"step": 832
},
{
"epoch": 0.9559604074020944,
"grad_norm": 29.5,
"learning_rate": 1.798165137614679e-05,
"loss": 0.5492,
"step": 833
},
{
"epoch": 0.9571080189355903,
"grad_norm": 51.75,
"learning_rate": 1.7976554536187564e-05,
"loss": 0.5751,
"step": 834
},
{
"epoch": 0.9582556304690862,
"grad_norm": 27.0,
"learning_rate": 1.797145769622834e-05,
"loss": 0.4943,
"step": 835
},
{
"epoch": 0.9594032420025821,
"grad_norm": 40.75,
"learning_rate": 1.7966360856269115e-05,
"loss": 0.6622,
"step": 836
},
{
"epoch": 0.960550853536078,
"grad_norm": 82.0,
"learning_rate": 1.796126401630989e-05,
"loss": 0.6737,
"step": 837
},
{
"epoch": 0.9616984650695739,
"grad_norm": 11.125,
"learning_rate": 1.7956167176350666e-05,
"loss": 0.4544,
"step": 838
},
{
"epoch": 0.9628460766030699,
"grad_norm": 18.125,
"learning_rate": 1.795107033639144e-05,
"loss": 0.5389,
"step": 839
},
{
"epoch": 0.9639936881365657,
"grad_norm": 15.9375,
"learning_rate": 1.7945973496432214e-05,
"loss": 0.1783,
"step": 840
},
{
"epoch": 0.9651412996700617,
"grad_norm": 53.5,
"learning_rate": 1.7940876656472988e-05,
"loss": 0.3035,
"step": 841
},
{
"epoch": 0.9662889112035576,
"grad_norm": 52.75,
"learning_rate": 1.7935779816513762e-05,
"loss": 0.6946,
"step": 842
},
{
"epoch": 0.9674365227370535,
"grad_norm": 34.75,
"learning_rate": 1.793068297655454e-05,
"loss": 0.5466,
"step": 843
},
{
"epoch": 0.9685841342705495,
"grad_norm": 21.875,
"learning_rate": 1.7925586136595313e-05,
"loss": 0.4619,
"step": 844
},
{
"epoch": 0.9697317458040453,
"grad_norm": 50.5,
"learning_rate": 1.7920489296636087e-05,
"loss": 0.6513,
"step": 845
},
{
"epoch": 0.9708793573375413,
"grad_norm": 15.125,
"learning_rate": 1.791539245667686e-05,
"loss": 0.379,
"step": 846
},
{
"epoch": 0.9720269688710371,
"grad_norm": 10.625,
"learning_rate": 1.7910295616717638e-05,
"loss": 0.5085,
"step": 847
},
{
"epoch": 0.9731745804045331,
"grad_norm": 12.6875,
"learning_rate": 1.7905198776758412e-05,
"loss": 0.5272,
"step": 848
},
{
"epoch": 0.9743221919380289,
"grad_norm": 27.5,
"learning_rate": 1.7900101936799186e-05,
"loss": 1.0062,
"step": 849
},
{
"epoch": 0.9754698034715249,
"grad_norm": 127.5,
"learning_rate": 1.789500509683996e-05,
"loss": 1.0798,
"step": 850
},
{
"epoch": 0.9766174150050208,
"grad_norm": 103.5,
"learning_rate": 1.7889908256880734e-05,
"loss": 1.1638,
"step": 851
},
{
"epoch": 0.9777650265385167,
"grad_norm": 69.0,
"learning_rate": 1.788481141692151e-05,
"loss": 0.9011,
"step": 852
},
{
"epoch": 0.9789126380720127,
"grad_norm": 86.5,
"learning_rate": 1.7879714576962285e-05,
"loss": 0.8197,
"step": 853
},
{
"epoch": 0.9800602496055085,
"grad_norm": 15.5,
"learning_rate": 1.787461773700306e-05,
"loss": 0.6134,
"step": 854
},
{
"epoch": 0.9812078611390045,
"grad_norm": 110.5,
"learning_rate": 1.7869520897043836e-05,
"loss": 0.919,
"step": 855
},
{
"epoch": 0.9823554726725003,
"grad_norm": 30.125,
"learning_rate": 1.786442405708461e-05,
"loss": 0.5746,
"step": 856
},
{
"epoch": 0.9835030842059963,
"grad_norm": 100.0,
"learning_rate": 1.7859327217125384e-05,
"loss": 0.3361,
"step": 857
},
{
"epoch": 0.9846506957394922,
"grad_norm": 60.0,
"learning_rate": 1.7854230377166158e-05,
"loss": 0.6782,
"step": 858
},
{
"epoch": 0.9857983072729881,
"grad_norm": 62.5,
"learning_rate": 1.7849133537206932e-05,
"loss": 0.8552,
"step": 859
},
{
"epoch": 0.986945918806484,
"grad_norm": 38.75,
"learning_rate": 1.784403669724771e-05,
"loss": 0.7251,
"step": 860
},
{
"epoch": 0.9880935303399799,
"grad_norm": 84.5,
"learning_rate": 1.7838939857288483e-05,
"loss": 0.9825,
"step": 861
},
{
"epoch": 0.9892411418734758,
"grad_norm": 32.75,
"learning_rate": 1.7833843017329257e-05,
"loss": 0.2631,
"step": 862
},
{
"epoch": 0.9903887534069717,
"grad_norm": 101.5,
"learning_rate": 1.782874617737003e-05,
"loss": 1.0281,
"step": 863
},
{
"epoch": 0.9915363649404677,
"grad_norm": 36.75,
"learning_rate": 1.7823649337410808e-05,
"loss": 0.6591,
"step": 864
},
{
"epoch": 0.9926839764739636,
"grad_norm": 23.75,
"learning_rate": 1.7818552497451582e-05,
"loss": 0.2017,
"step": 865
},
{
"epoch": 0.9938315880074595,
"grad_norm": 11.0625,
"learning_rate": 1.7813455657492356e-05,
"loss": 0.6496,
"step": 866
},
{
"epoch": 0.9949791995409554,
"grad_norm": 52.0,
"learning_rate": 1.780835881753313e-05,
"loss": 0.7726,
"step": 867
},
{
"epoch": 0.9961268110744513,
"grad_norm": 8.9375,
"learning_rate": 1.7803261977573904e-05,
"loss": 0.2688,
"step": 868
},
{
"epoch": 0.9972744226079472,
"grad_norm": 29.625,
"learning_rate": 1.779816513761468e-05,
"loss": 0.4991,
"step": 869
},
{
"epoch": 0.9984220341414431,
"grad_norm": 6.59375,
"learning_rate": 1.7793068297655455e-05,
"loss": 0.3232,
"step": 870
},
{
"epoch": 0.999569645674939,
"grad_norm": 39.25,
"learning_rate": 1.778797145769623e-05,
"loss": 0.4866,
"step": 871
},
{
"epoch": 1.0,
"grad_norm": 31.25,
"learning_rate": 1.7782874617737006e-05,
"loss": 0.1617,
"step": 872
},
{
"epoch": 1.0011476115334959,
"grad_norm": 54.25,
"learning_rate": 1.7777777777777777e-05,
"loss": 0.5081,
"step": 873
},
{
"epoch": 1.002295223066992,
"grad_norm": 59.5,
"learning_rate": 1.7772680937818554e-05,
"loss": 0.6284,
"step": 874
},
{
"epoch": 1.0034428346004878,
"grad_norm": 62.0,
"learning_rate": 1.7767584097859328e-05,
"loss": 0.6364,
"step": 875
},
{
"epoch": 1.0045904461339836,
"grad_norm": 99.5,
"learning_rate": 1.7762487257900102e-05,
"loss": 1.521,
"step": 876
},
{
"epoch": 1.0057380576674795,
"grad_norm": 105.5,
"learning_rate": 1.775739041794088e-05,
"loss": 1.0837,
"step": 877
},
{
"epoch": 1.0068856692009756,
"grad_norm": 117.0,
"learning_rate": 1.7752293577981653e-05,
"loss": 1.0871,
"step": 878
},
{
"epoch": 1.0080332807344714,
"grad_norm": 91.5,
"learning_rate": 1.7747196738022427e-05,
"loss": 0.7927,
"step": 879
},
{
"epoch": 1.0091808922679673,
"grad_norm": 68.5,
"learning_rate": 1.7742099898063204e-05,
"loss": 0.6309,
"step": 880
},
{
"epoch": 1.010328503801463,
"grad_norm": 11.3125,
"learning_rate": 1.7737003058103978e-05,
"loss": 0.3369,
"step": 881
},
{
"epoch": 1.0114761153349592,
"grad_norm": 11.125,
"learning_rate": 1.7731906218144752e-05,
"loss": 0.2181,
"step": 882
},
{
"epoch": 1.012623726868455,
"grad_norm": 23.75,
"learning_rate": 1.7726809378185526e-05,
"loss": 0.4936,
"step": 883
},
{
"epoch": 1.0137713384019509,
"grad_norm": 26.25,
"learning_rate": 1.77217125382263e-05,
"loss": 0.5372,
"step": 884
},
{
"epoch": 1.014918949935447,
"grad_norm": 23.375,
"learning_rate": 1.7716615698267077e-05,
"loss": 0.2898,
"step": 885
},
{
"epoch": 1.0160665614689428,
"grad_norm": 61.25,
"learning_rate": 1.771151885830785e-05,
"loss": 1.0463,
"step": 886
},
{
"epoch": 1.0172141730024387,
"grad_norm": 16.125,
"learning_rate": 1.7706422018348625e-05,
"loss": 0.3061,
"step": 887
},
{
"epoch": 1.0183617845359345,
"grad_norm": 83.5,
"learning_rate": 1.77013251783894e-05,
"loss": 0.7545,
"step": 888
},
{
"epoch": 1.0195093960694306,
"grad_norm": 57.75,
"learning_rate": 1.7696228338430176e-05,
"loss": 0.7643,
"step": 889
},
{
"epoch": 1.0206570076029264,
"grad_norm": 19.125,
"learning_rate": 1.769113149847095e-05,
"loss": 0.6013,
"step": 890
},
{
"epoch": 1.0218046191364223,
"grad_norm": 12.25,
"learning_rate": 1.7686034658511724e-05,
"loss": 0.4579,
"step": 891
},
{
"epoch": 1.0229522306699181,
"grad_norm": 38.25,
"learning_rate": 1.7680937818552498e-05,
"loss": 0.4669,
"step": 892
},
{
"epoch": 1.0240998422034142,
"grad_norm": 68.0,
"learning_rate": 1.767584097859327e-05,
"loss": 0.4824,
"step": 893
},
{
"epoch": 1.02524745373691,
"grad_norm": 10.5625,
"learning_rate": 1.767074413863405e-05,
"loss": 0.5689,
"step": 894
},
{
"epoch": 1.026395065270406,
"grad_norm": 8.875,
"learning_rate": 1.7665647298674823e-05,
"loss": 0.3161,
"step": 895
},
{
"epoch": 1.0275426768039018,
"grad_norm": 23.625,
"learning_rate": 1.7660550458715597e-05,
"loss": 0.4443,
"step": 896
},
{
"epoch": 1.0286902883373978,
"grad_norm": 15.75,
"learning_rate": 1.7655453618756374e-05,
"loss": 0.2331,
"step": 897
},
{
"epoch": 1.0298378998708937,
"grad_norm": 8.25,
"learning_rate": 1.7650356778797148e-05,
"loss": 0.3554,
"step": 898
},
{
"epoch": 1.0309855114043895,
"grad_norm": 14.5,
"learning_rate": 1.7645259938837922e-05,
"loss": 0.6107,
"step": 899
},
{
"epoch": 1.0321331229378856,
"grad_norm": 42.25,
"learning_rate": 1.76401630988787e-05,
"loss": 0.3624,
"step": 900
},
{
"epoch": 1.0321331229378856,
"eval_accuracy": 0.64,
"eval_loss": 0.6176496744155884,
"eval_runtime": 49.5336,
"eval_samples_per_second": 2.019,
"eval_steps_per_second": 2.019,
"step": 900
},
{
"epoch": 1.0332807344713815,
"grad_norm": 23.0,
"learning_rate": 1.763506625891947e-05,
"loss": 0.4606,
"step": 901
},
{
"epoch": 1.0344283460048773,
"grad_norm": 50.75,
"learning_rate": 1.7629969418960247e-05,
"loss": 0.5176,
"step": 902
},
{
"epoch": 1.0355759575383732,
"grad_norm": 58.0,
"learning_rate": 1.762487257900102e-05,
"loss": 0.3688,
"step": 903
},
{
"epoch": 1.0367235690718692,
"grad_norm": 36.0,
"learning_rate": 1.7619775739041795e-05,
"loss": 0.7414,
"step": 904
},
{
"epoch": 1.037871180605365,
"grad_norm": 28.5,
"learning_rate": 1.7614678899082572e-05,
"loss": 0.8468,
"step": 905
},
{
"epoch": 1.039018792138861,
"grad_norm": 26.25,
"learning_rate": 1.7609582059123346e-05,
"loss": 0.4338,
"step": 906
},
{
"epoch": 1.040166403672357,
"grad_norm": 122.5,
"learning_rate": 1.760448521916412e-05,
"loss": 0.9431,
"step": 907
},
{
"epoch": 1.0413140152058529,
"grad_norm": 15.375,
"learning_rate": 1.7599388379204894e-05,
"loss": 0.5602,
"step": 908
},
{
"epoch": 1.0424616267393487,
"grad_norm": 96.5,
"learning_rate": 1.7594291539245668e-05,
"loss": 0.6268,
"step": 909
},
{
"epoch": 1.0436092382728446,
"grad_norm": 59.25,
"learning_rate": 1.7589194699286445e-05,
"loss": 0.404,
"step": 910
},
{
"epoch": 1.0447568498063406,
"grad_norm": 30.5,
"learning_rate": 1.758409785932722e-05,
"loss": 0.5772,
"step": 911
},
{
"epoch": 1.0459044613398365,
"grad_norm": 15.875,
"learning_rate": 1.7579001019367993e-05,
"loss": 0.4666,
"step": 912
},
{
"epoch": 1.0470520728733324,
"grad_norm": 20.25,
"learning_rate": 1.7573904179408767e-05,
"loss": 0.4576,
"step": 913
},
{
"epoch": 1.0481996844068282,
"grad_norm": 33.5,
"learning_rate": 1.7568807339449544e-05,
"loss": 0.4427,
"step": 914
},
{
"epoch": 1.0493472959403243,
"grad_norm": 6.59375,
"learning_rate": 1.7563710499490318e-05,
"loss": 0.1466,
"step": 915
},
{
"epoch": 1.0504949074738201,
"grad_norm": 44.0,
"learning_rate": 1.7558613659531092e-05,
"loss": 0.3564,
"step": 916
},
{
"epoch": 1.051642519007316,
"grad_norm": 26.75,
"learning_rate": 1.755351681957187e-05,
"loss": 0.6648,
"step": 917
},
{
"epoch": 1.0527901305408118,
"grad_norm": 53.5,
"learning_rate": 1.754841997961264e-05,
"loss": 0.5389,
"step": 918
},
{
"epoch": 1.053937742074308,
"grad_norm": 23.875,
"learning_rate": 1.7543323139653417e-05,
"loss": 0.4424,
"step": 919
},
{
"epoch": 1.0550853536078038,
"grad_norm": 34.75,
"learning_rate": 1.753822629969419e-05,
"loss": 0.4035,
"step": 920
},
{
"epoch": 1.0562329651412996,
"grad_norm": 25.125,
"learning_rate": 1.7533129459734965e-05,
"loss": 0.5704,
"step": 921
},
{
"epoch": 1.0573805766747957,
"grad_norm": 25.625,
"learning_rate": 1.7528032619775742e-05,
"loss": 0.7077,
"step": 922
},
{
"epoch": 1.0585281882082915,
"grad_norm": 11.25,
"learning_rate": 1.7522935779816516e-05,
"loss": 0.228,
"step": 923
},
{
"epoch": 1.0596757997417874,
"grad_norm": 31.125,
"learning_rate": 1.751783893985729e-05,
"loss": 0.5716,
"step": 924
},
{
"epoch": 1.0608234112752832,
"grad_norm": 21.25,
"learning_rate": 1.7512742099898067e-05,
"loss": 0.2658,
"step": 925
},
{
"epoch": 1.0619710228087793,
"grad_norm": 12.875,
"learning_rate": 1.7507645259938838e-05,
"loss": 0.2415,
"step": 926
},
{
"epoch": 1.0631186343422752,
"grad_norm": 100.5,
"learning_rate": 1.7502548419979615e-05,
"loss": 1.0011,
"step": 927
},
{
"epoch": 1.064266245875771,
"grad_norm": 67.0,
"learning_rate": 1.749745158002039e-05,
"loss": 0.6753,
"step": 928
},
{
"epoch": 1.0654138574092669,
"grad_norm": 109.0,
"learning_rate": 1.7492354740061163e-05,
"loss": 0.8631,
"step": 929
},
{
"epoch": 1.066561468942763,
"grad_norm": 68.5,
"learning_rate": 1.7487257900101937e-05,
"loss": 1.0799,
"step": 930
},
{
"epoch": 1.0677090804762588,
"grad_norm": 74.0,
"learning_rate": 1.7482161060142714e-05,
"loss": 0.5419,
"step": 931
},
{
"epoch": 1.0688566920097546,
"grad_norm": 61.25,
"learning_rate": 1.7477064220183488e-05,
"loss": 0.6041,
"step": 932
},
{
"epoch": 1.0700043035432507,
"grad_norm": 32.25,
"learning_rate": 1.7471967380224262e-05,
"loss": 0.8215,
"step": 933
},
{
"epoch": 1.0711519150767466,
"grad_norm": 45.25,
"learning_rate": 1.746687054026504e-05,
"loss": 0.5843,
"step": 934
},
{
"epoch": 1.0722995266102424,
"grad_norm": 104.5,
"learning_rate": 1.746177370030581e-05,
"loss": 1.3222,
"step": 935
},
{
"epoch": 1.0734471381437383,
"grad_norm": 56.0,
"learning_rate": 1.7456676860346587e-05,
"loss": 0.5504,
"step": 936
},
{
"epoch": 1.0745947496772343,
"grad_norm": 54.0,
"learning_rate": 1.745158002038736e-05,
"loss": 0.8466,
"step": 937
},
{
"epoch": 1.0757423612107302,
"grad_norm": 27.375,
"learning_rate": 1.7446483180428135e-05,
"loss": 0.9508,
"step": 938
},
{
"epoch": 1.076889972744226,
"grad_norm": 14.625,
"learning_rate": 1.7441386340468912e-05,
"loss": 0.3969,
"step": 939
},
{
"epoch": 1.078037584277722,
"grad_norm": 75.0,
"learning_rate": 1.7436289500509686e-05,
"loss": 0.9936,
"step": 940
},
{
"epoch": 1.079185195811218,
"grad_norm": 51.75,
"learning_rate": 1.743119266055046e-05,
"loss": 0.5978,
"step": 941
},
{
"epoch": 1.0803328073447138,
"grad_norm": 57.5,
"learning_rate": 1.7426095820591237e-05,
"loss": 0.6549,
"step": 942
},
{
"epoch": 1.0814804188782097,
"grad_norm": 19.625,
"learning_rate": 1.7420998980632008e-05,
"loss": 0.4942,
"step": 943
},
{
"epoch": 1.0826280304117057,
"grad_norm": 83.0,
"learning_rate": 1.7415902140672785e-05,
"loss": 0.6702,
"step": 944
},
{
"epoch": 1.0837756419452016,
"grad_norm": 42.5,
"learning_rate": 1.741080530071356e-05,
"loss": 0.6299,
"step": 945
},
{
"epoch": 1.0849232534786974,
"grad_norm": 19.25,
"learning_rate": 1.7405708460754333e-05,
"loss": 0.5421,
"step": 946
},
{
"epoch": 1.0860708650121933,
"grad_norm": 34.0,
"learning_rate": 1.740061162079511e-05,
"loss": 0.7019,
"step": 947
},
{
"epoch": 1.0872184765456894,
"grad_norm": 34.0,
"learning_rate": 1.7395514780835884e-05,
"loss": 0.5919,
"step": 948
},
{
"epoch": 1.0883660880791852,
"grad_norm": 17.875,
"learning_rate": 1.7390417940876658e-05,
"loss": 0.2788,
"step": 949
},
{
"epoch": 1.089513699612681,
"grad_norm": 16.0,
"learning_rate": 1.738532110091743e-05,
"loss": 0.7744,
"step": 950
},
{
"epoch": 1.090661311146177,
"grad_norm": 61.5,
"learning_rate": 1.7380224260958206e-05,
"loss": 0.6198,
"step": 951
},
{
"epoch": 1.091808922679673,
"grad_norm": 17.5,
"learning_rate": 1.7375127420998983e-05,
"loss": 0.5995,
"step": 952
},
{
"epoch": 1.0929565342131689,
"grad_norm": 15.0,
"learning_rate": 1.7370030581039757e-05,
"loss": 0.4392,
"step": 953
},
{
"epoch": 1.0941041457466647,
"grad_norm": 54.75,
"learning_rate": 1.736493374108053e-05,
"loss": 0.4673,
"step": 954
},
{
"epoch": 1.0952517572801606,
"grad_norm": 31.5,
"learning_rate": 1.7359836901121305e-05,
"loss": 0.5318,
"step": 955
},
{
"epoch": 1.0963993688136566,
"grad_norm": 35.0,
"learning_rate": 1.7354740061162082e-05,
"loss": 0.5184,
"step": 956
},
{
"epoch": 1.0975469803471525,
"grad_norm": 23.75,
"learning_rate": 1.7349643221202856e-05,
"loss": 0.5015,
"step": 957
},
{
"epoch": 1.0986945918806483,
"grad_norm": 54.0,
"learning_rate": 1.734454638124363e-05,
"loss": 0.5254,
"step": 958
},
{
"epoch": 1.0998422034141444,
"grad_norm": 10.375,
"learning_rate": 1.7339449541284407e-05,
"loss": 0.4739,
"step": 959
},
{
"epoch": 1.1009898149476403,
"grad_norm": 23.5,
"learning_rate": 1.7334352701325177e-05,
"loss": 0.5565,
"step": 960
},
{
"epoch": 1.102137426481136,
"grad_norm": 11.875,
"learning_rate": 1.7329255861365955e-05,
"loss": 0.3887,
"step": 961
},
{
"epoch": 1.103285038014632,
"grad_norm": 10.875,
"learning_rate": 1.732415902140673e-05,
"loss": 0.6166,
"step": 962
},
{
"epoch": 1.104432649548128,
"grad_norm": 43.75,
"learning_rate": 1.7319062181447503e-05,
"loss": 0.9438,
"step": 963
},
{
"epoch": 1.1055802610816239,
"grad_norm": 17.375,
"learning_rate": 1.731396534148828e-05,
"loss": 0.6131,
"step": 964
},
{
"epoch": 1.1067278726151197,
"grad_norm": 36.5,
"learning_rate": 1.7308868501529054e-05,
"loss": 0.5897,
"step": 965
},
{
"epoch": 1.1078754841486158,
"grad_norm": 34.25,
"learning_rate": 1.7303771661569828e-05,
"loss": 0.473,
"step": 966
},
{
"epoch": 1.1090230956821117,
"grad_norm": 23.75,
"learning_rate": 1.7298674821610605e-05,
"loss": 0.6736,
"step": 967
},
{
"epoch": 1.1101707072156075,
"grad_norm": 47.25,
"learning_rate": 1.7293577981651376e-05,
"loss": 0.4113,
"step": 968
},
{
"epoch": 1.1113183187491034,
"grad_norm": 13.6875,
"learning_rate": 1.7288481141692153e-05,
"loss": 0.2634,
"step": 969
},
{
"epoch": 1.1124659302825994,
"grad_norm": 36.75,
"learning_rate": 1.7283384301732927e-05,
"loss": 0.5289,
"step": 970
},
{
"epoch": 1.1136135418160953,
"grad_norm": 28.625,
"learning_rate": 1.72782874617737e-05,
"loss": 0.5775,
"step": 971
},
{
"epoch": 1.1147611533495911,
"grad_norm": 42.25,
"learning_rate": 1.7273190621814478e-05,
"loss": 0.7163,
"step": 972
},
{
"epoch": 1.115908764883087,
"grad_norm": 57.0,
"learning_rate": 1.7268093781855252e-05,
"loss": 0.5009,
"step": 973
},
{
"epoch": 1.117056376416583,
"grad_norm": 22.375,
"learning_rate": 1.7262996941896026e-05,
"loss": 0.4101,
"step": 974
},
{
"epoch": 1.118203987950079,
"grad_norm": 41.25,
"learning_rate": 1.72579001019368e-05,
"loss": 0.4195,
"step": 975
},
{
"epoch": 1.1193515994835748,
"grad_norm": 17.625,
"learning_rate": 1.7252803261977577e-05,
"loss": 0.4409,
"step": 976
},
{
"epoch": 1.1204992110170706,
"grad_norm": 18.375,
"learning_rate": 1.724770642201835e-05,
"loss": 0.4041,
"step": 977
},
{
"epoch": 1.1216468225505667,
"grad_norm": 39.0,
"learning_rate": 1.7242609582059125e-05,
"loss": 0.6333,
"step": 978
},
{
"epoch": 1.1227944340840625,
"grad_norm": 62.25,
"learning_rate": 1.72375127420999e-05,
"loss": 0.648,
"step": 979
},
{
"epoch": 1.1239420456175584,
"grad_norm": 57.0,
"learning_rate": 1.7232415902140673e-05,
"loss": 0.5549,
"step": 980
},
{
"epoch": 1.1250896571510545,
"grad_norm": 17.875,
"learning_rate": 1.722731906218145e-05,
"loss": 0.3829,
"step": 981
},
{
"epoch": 1.1262372686845503,
"grad_norm": 23.5,
"learning_rate": 1.7222222222222224e-05,
"loss": 0.3594,
"step": 982
},
{
"epoch": 1.1273848802180462,
"grad_norm": 53.0,
"learning_rate": 1.7217125382262998e-05,
"loss": 0.6625,
"step": 983
},
{
"epoch": 1.128532491751542,
"grad_norm": 49.75,
"learning_rate": 1.7212028542303775e-05,
"loss": 0.4887,
"step": 984
},
{
"epoch": 1.129680103285038,
"grad_norm": 15.0,
"learning_rate": 1.7206931702344545e-05,
"loss": 0.5548,
"step": 985
},
{
"epoch": 1.130827714818534,
"grad_norm": 40.5,
"learning_rate": 1.7201834862385323e-05,
"loss": 0.7024,
"step": 986
},
{
"epoch": 1.1319753263520298,
"grad_norm": 58.25,
"learning_rate": 1.7196738022426097e-05,
"loss": 0.4027,
"step": 987
},
{
"epoch": 1.1331229378855259,
"grad_norm": 70.5,
"learning_rate": 1.719164118246687e-05,
"loss": 0.6295,
"step": 988
},
{
"epoch": 1.1342705494190217,
"grad_norm": 34.25,
"learning_rate": 1.7186544342507648e-05,
"loss": 0.3274,
"step": 989
},
{
"epoch": 1.1354181609525176,
"grad_norm": 20.0,
"learning_rate": 1.7181447502548422e-05,
"loss": 0.1818,
"step": 990
},
{
"epoch": 1.1365657724860134,
"grad_norm": 46.5,
"learning_rate": 1.7176350662589196e-05,
"loss": 0.4344,
"step": 991
},
{
"epoch": 1.1377133840195093,
"grad_norm": 60.0,
"learning_rate": 1.7171253822629973e-05,
"loss": 0.3682,
"step": 992
},
{
"epoch": 1.1388609955530054,
"grad_norm": 30.25,
"learning_rate": 1.7166156982670747e-05,
"loss": 0.4771,
"step": 993
},
{
"epoch": 1.1400086070865012,
"grad_norm": 23.375,
"learning_rate": 1.716106014271152e-05,
"loss": 0.4939,
"step": 994
},
{
"epoch": 1.141156218619997,
"grad_norm": 17.625,
"learning_rate": 1.7155963302752295e-05,
"loss": 0.6885,
"step": 995
},
{
"epoch": 1.1423038301534931,
"grad_norm": 64.5,
"learning_rate": 1.715086646279307e-05,
"loss": 0.8163,
"step": 996
},
{
"epoch": 1.143451441686989,
"grad_norm": 39.5,
"learning_rate": 1.7145769622833846e-05,
"loss": 0.3577,
"step": 997
},
{
"epoch": 1.1445990532204848,
"grad_norm": 8.6875,
"learning_rate": 1.714067278287462e-05,
"loss": 0.202,
"step": 998
},
{
"epoch": 1.1457466647539807,
"grad_norm": 52.0,
"learning_rate": 1.7135575942915394e-05,
"loss": 0.5541,
"step": 999
},
{
"epoch": 1.1468942762874768,
"grad_norm": 22.125,
"learning_rate": 1.7130479102956168e-05,
"loss": 0.2125,
"step": 1000
},
{
"epoch": 1.1468942762874768,
"eval_accuracy": 0.6,
"eval_loss": 0.5487725734710693,
"eval_runtime": 50.2711,
"eval_samples_per_second": 1.989,
"eval_steps_per_second": 1.989,
"step": 1000
},
{
"epoch": 1.1480418878209726,
"grad_norm": 21.25,
"learning_rate": 1.7125382262996945e-05,
"loss": 0.3415,
"step": 1001
},
{
"epoch": 1.1491894993544685,
"grad_norm": 39.5,
"learning_rate": 1.712028542303772e-05,
"loss": 0.6746,
"step": 1002
},
{
"epoch": 1.1503371108879645,
"grad_norm": 16.875,
"learning_rate": 1.7115188583078493e-05,
"loss": 0.7315,
"step": 1003
},
{
"epoch": 1.1514847224214604,
"grad_norm": 13.6875,
"learning_rate": 1.7110091743119267e-05,
"loss": 0.5293,
"step": 1004
},
{
"epoch": 1.1526323339549562,
"grad_norm": 10.4375,
"learning_rate": 1.710499490316004e-05,
"loss": 0.4509,
"step": 1005
},
{
"epoch": 1.153779945488452,
"grad_norm": 18.375,
"learning_rate": 1.7099898063200818e-05,
"loss": 0.3469,
"step": 1006
},
{
"epoch": 1.1549275570219482,
"grad_norm": 12.375,
"learning_rate": 1.709480122324159e-05,
"loss": 0.4868,
"step": 1007
},
{
"epoch": 1.156075168555444,
"grad_norm": 57.5,
"learning_rate": 1.7089704383282366e-05,
"loss": 0.5211,
"step": 1008
},
{
"epoch": 1.1572227800889399,
"grad_norm": 13.875,
"learning_rate": 1.7084607543323143e-05,
"loss": 0.3623,
"step": 1009
},
{
"epoch": 1.1583703916224357,
"grad_norm": 69.0,
"learning_rate": 1.7079510703363917e-05,
"loss": 0.274,
"step": 1010
},
{
"epoch": 1.1595180031559318,
"grad_norm": 18.5,
"learning_rate": 1.707441386340469e-05,
"loss": 0.2365,
"step": 1011
},
{
"epoch": 1.1606656146894276,
"grad_norm": 40.25,
"learning_rate": 1.7069317023445465e-05,
"loss": 0.2999,
"step": 1012
},
{
"epoch": 1.1618132262229235,
"grad_norm": 57.5,
"learning_rate": 1.706422018348624e-05,
"loss": 0.5137,
"step": 1013
},
{
"epoch": 1.1629608377564193,
"grad_norm": 20.875,
"learning_rate": 1.7059123343527016e-05,
"loss": 0.6691,
"step": 1014
},
{
"epoch": 1.1641084492899154,
"grad_norm": 30.875,
"learning_rate": 1.705402650356779e-05,
"loss": 0.6642,
"step": 1015
},
{
"epoch": 1.1652560608234113,
"grad_norm": 15.9375,
"learning_rate": 1.7048929663608564e-05,
"loss": 0.2695,
"step": 1016
},
{
"epoch": 1.1664036723569071,
"grad_norm": 88.5,
"learning_rate": 1.7043832823649338e-05,
"loss": 0.8211,
"step": 1017
},
{
"epoch": 1.1675512838904032,
"grad_norm": 45.5,
"learning_rate": 1.7038735983690115e-05,
"loss": 0.7956,
"step": 1018
},
{
"epoch": 1.168698895423899,
"grad_norm": 80.0,
"learning_rate": 1.703363914373089e-05,
"loss": 0.8805,
"step": 1019
},
{
"epoch": 1.169846506957395,
"grad_norm": 15.1875,
"learning_rate": 1.7028542303771663e-05,
"loss": 0.5262,
"step": 1020
},
{
"epoch": 1.1709941184908907,
"grad_norm": 60.75,
"learning_rate": 1.7023445463812437e-05,
"loss": 1.1968,
"step": 1021
},
{
"epoch": 1.1721417300243868,
"grad_norm": 31.375,
"learning_rate": 1.701834862385321e-05,
"loss": 0.744,
"step": 1022
},
{
"epoch": 1.1732893415578827,
"grad_norm": 20.625,
"learning_rate": 1.7013251783893988e-05,
"loss": 0.3321,
"step": 1023
},
{
"epoch": 1.1744369530913785,
"grad_norm": 19.875,
"learning_rate": 1.700815494393476e-05,
"loss": 0.4447,
"step": 1024
},
{
"epoch": 1.1755845646248746,
"grad_norm": 23.625,
"learning_rate": 1.7003058103975536e-05,
"loss": 0.3697,
"step": 1025
},
{
"epoch": 1.1767321761583704,
"grad_norm": 20.625,
"learning_rate": 1.6997961264016313e-05,
"loss": 0.2759,
"step": 1026
},
{
"epoch": 1.1778797876918663,
"grad_norm": 58.75,
"learning_rate": 1.6992864424057087e-05,
"loss": 0.7182,
"step": 1027
},
{
"epoch": 1.1790273992253621,
"grad_norm": 6.96875,
"learning_rate": 1.698776758409786e-05,
"loss": 0.1403,
"step": 1028
},
{
"epoch": 1.1801750107588582,
"grad_norm": 24.875,
"learning_rate": 1.6982670744138638e-05,
"loss": 0.513,
"step": 1029
},
{
"epoch": 1.181322622292354,
"grad_norm": 15.8125,
"learning_rate": 1.697757390417941e-05,
"loss": 0.5238,
"step": 1030
},
{
"epoch": 1.18247023382585,
"grad_norm": 22.0,
"learning_rate": 1.6972477064220186e-05,
"loss": 0.445,
"step": 1031
},
{
"epoch": 1.1836178453593458,
"grad_norm": 86.5,
"learning_rate": 1.696738022426096e-05,
"loss": 0.7085,
"step": 1032
},
{
"epoch": 1.1847654568928419,
"grad_norm": 63.75,
"learning_rate": 1.6962283384301734e-05,
"loss": 1.0473,
"step": 1033
},
{
"epoch": 1.1859130684263377,
"grad_norm": 63.5,
"learning_rate": 1.695718654434251e-05,
"loss": 0.3947,
"step": 1034
},
{
"epoch": 1.1870606799598336,
"grad_norm": 12.375,
"learning_rate": 1.6952089704383285e-05,
"loss": 0.3453,
"step": 1035
},
{
"epoch": 1.1882082914933294,
"grad_norm": 9.1875,
"learning_rate": 1.694699286442406e-05,
"loss": 0.2802,
"step": 1036
},
{
"epoch": 1.1893559030268255,
"grad_norm": 29.75,
"learning_rate": 1.6941896024464833e-05,
"loss": 0.543,
"step": 1037
},
{
"epoch": 1.1905035145603213,
"grad_norm": 60.75,
"learning_rate": 1.6936799184505606e-05,
"loss": 0.7737,
"step": 1038
},
{
"epoch": 1.1916511260938172,
"grad_norm": 38.0,
"learning_rate": 1.6931702344546384e-05,
"loss": 0.7725,
"step": 1039
},
{
"epoch": 1.1927987376273133,
"grad_norm": 9.9375,
"learning_rate": 1.6926605504587158e-05,
"loss": 0.2875,
"step": 1040
},
{
"epoch": 1.193946349160809,
"grad_norm": 53.5,
"learning_rate": 1.692150866462793e-05,
"loss": 0.6683,
"step": 1041
},
{
"epoch": 1.195093960694305,
"grad_norm": 58.25,
"learning_rate": 1.6916411824668705e-05,
"loss": 0.4406,
"step": 1042
},
{
"epoch": 1.1962415722278008,
"grad_norm": 46.0,
"learning_rate": 1.6911314984709483e-05,
"loss": 0.6739,
"step": 1043
},
{
"epoch": 1.1973891837612969,
"grad_norm": 47.25,
"learning_rate": 1.6906218144750257e-05,
"loss": 0.278,
"step": 1044
},
{
"epoch": 1.1985367952947927,
"grad_norm": 42.5,
"learning_rate": 1.690112130479103e-05,
"loss": 0.4348,
"step": 1045
},
{
"epoch": 1.1996844068282886,
"grad_norm": 20.0,
"learning_rate": 1.6896024464831804e-05,
"loss": 0.1145,
"step": 1046
},
{
"epoch": 1.2008320183617847,
"grad_norm": 36.0,
"learning_rate": 1.689092762487258e-05,
"loss": 0.7542,
"step": 1047
},
{
"epoch": 1.2019796298952805,
"grad_norm": 7.5625,
"learning_rate": 1.6885830784913356e-05,
"loss": 0.1475,
"step": 1048
},
{
"epoch": 1.2031272414287764,
"grad_norm": 54.75,
"learning_rate": 1.688073394495413e-05,
"loss": 0.3082,
"step": 1049
},
{
"epoch": 1.2042748529622722,
"grad_norm": 24.5,
"learning_rate": 1.6875637104994903e-05,
"loss": 0.8289,
"step": 1050
},
{
"epoch": 1.205422464495768,
"grad_norm": 69.5,
"learning_rate": 1.687054026503568e-05,
"loss": 0.7198,
"step": 1051
},
{
"epoch": 1.2065700760292641,
"grad_norm": 57.25,
"learning_rate": 1.6865443425076455e-05,
"loss": 0.3022,
"step": 1052
},
{
"epoch": 1.20771768756276,
"grad_norm": 43.5,
"learning_rate": 1.686034658511723e-05,
"loss": 0.5085,
"step": 1053
},
{
"epoch": 1.2088652990962558,
"grad_norm": 16.75,
"learning_rate": 1.6855249745158006e-05,
"loss": 0.4784,
"step": 1054
},
{
"epoch": 1.210012910629752,
"grad_norm": 14.3125,
"learning_rate": 1.6850152905198776e-05,
"loss": 0.3837,
"step": 1055
},
{
"epoch": 1.2111605221632478,
"grad_norm": 9.625,
"learning_rate": 1.6845056065239554e-05,
"loss": 0.2057,
"step": 1056
},
{
"epoch": 1.2123081336967436,
"grad_norm": 23.625,
"learning_rate": 1.6839959225280328e-05,
"loss": 0.9273,
"step": 1057
},
{
"epoch": 1.2134557452302395,
"grad_norm": 27.125,
"learning_rate": 1.68348623853211e-05,
"loss": 0.5371,
"step": 1058
},
{
"epoch": 1.2146033567637355,
"grad_norm": 55.75,
"learning_rate": 1.682976554536188e-05,
"loss": 0.5682,
"step": 1059
},
{
"epoch": 1.2157509682972314,
"grad_norm": 55.25,
"learning_rate": 1.6824668705402653e-05,
"loss": 0.4674,
"step": 1060
},
{
"epoch": 1.2168985798307272,
"grad_norm": 18.625,
"learning_rate": 1.6819571865443427e-05,
"loss": 0.515,
"step": 1061
},
{
"epoch": 1.2180461913642233,
"grad_norm": 118.5,
"learning_rate": 1.68144750254842e-05,
"loss": 1.1109,
"step": 1062
},
{
"epoch": 1.2191938028977192,
"grad_norm": 45.5,
"learning_rate": 1.6809378185524974e-05,
"loss": 0.2941,
"step": 1063
},
{
"epoch": 1.220341414431215,
"grad_norm": 21.0,
"learning_rate": 1.6804281345565752e-05,
"loss": 0.4562,
"step": 1064
},
{
"epoch": 1.2214890259647109,
"grad_norm": 9.375,
"learning_rate": 1.6799184505606526e-05,
"loss": 0.1801,
"step": 1065
},
{
"epoch": 1.222636637498207,
"grad_norm": 12.5,
"learning_rate": 1.67940876656473e-05,
"loss": 0.2672,
"step": 1066
},
{
"epoch": 1.2237842490317028,
"grad_norm": 77.5,
"learning_rate": 1.6788990825688073e-05,
"loss": 0.6601,
"step": 1067
},
{
"epoch": 1.2249318605651986,
"grad_norm": 12.0625,
"learning_rate": 1.678389398572885e-05,
"loss": 0.1519,
"step": 1068
},
{
"epoch": 1.2260794720986945,
"grad_norm": 24.75,
"learning_rate": 1.6778797145769625e-05,
"loss": 0.5777,
"step": 1069
},
{
"epoch": 1.2272270836321906,
"grad_norm": 31.125,
"learning_rate": 1.67737003058104e-05,
"loss": 0.7135,
"step": 1070
},
{
"epoch": 1.2283746951656864,
"grad_norm": 63.5,
"learning_rate": 1.6768603465851176e-05,
"loss": 0.8834,
"step": 1071
},
{
"epoch": 1.2295223066991823,
"grad_norm": 64.0,
"learning_rate": 1.6763506625891946e-05,
"loss": 0.8991,
"step": 1072
},
{
"epoch": 1.2306699182326781,
"grad_norm": 21.375,
"learning_rate": 1.6758409785932724e-05,
"loss": 0.2697,
"step": 1073
},
{
"epoch": 1.2318175297661742,
"grad_norm": 44.0,
"learning_rate": 1.6753312945973498e-05,
"loss": 0.4755,
"step": 1074
},
{
"epoch": 1.23296514129967,
"grad_norm": 28.875,
"learning_rate": 1.674821610601427e-05,
"loss": 0.3531,
"step": 1075
},
{
"epoch": 1.234112752833166,
"grad_norm": 33.0,
"learning_rate": 1.674311926605505e-05,
"loss": 0.1501,
"step": 1076
},
{
"epoch": 1.235260364366662,
"grad_norm": 23.0,
"learning_rate": 1.6738022426095823e-05,
"loss": 0.7386,
"step": 1077
},
{
"epoch": 1.2364079759001578,
"grad_norm": 18.75,
"learning_rate": 1.6732925586136597e-05,
"loss": 0.1371,
"step": 1078
},
{
"epoch": 1.2375555874336537,
"grad_norm": 32.25,
"learning_rate": 1.672782874617737e-05,
"loss": 0.4283,
"step": 1079
},
{
"epoch": 1.2387031989671495,
"grad_norm": 15.25,
"learning_rate": 1.6722731906218144e-05,
"loss": 0.221,
"step": 1080
},
{
"epoch": 1.2398508105006456,
"grad_norm": 30.625,
"learning_rate": 1.671763506625892e-05,
"loss": 0.4018,
"step": 1081
},
{
"epoch": 1.2409984220341415,
"grad_norm": 67.0,
"learning_rate": 1.6712538226299696e-05,
"loss": 0.9727,
"step": 1082
},
{
"epoch": 1.2421460335676373,
"grad_norm": 31.375,
"learning_rate": 1.670744138634047e-05,
"loss": 0.4461,
"step": 1083
},
{
"epoch": 1.2432936451011334,
"grad_norm": 15.9375,
"learning_rate": 1.6702344546381243e-05,
"loss": 0.4182,
"step": 1084
},
{
"epoch": 1.2444412566346292,
"grad_norm": 58.25,
"learning_rate": 1.669724770642202e-05,
"loss": 0.6867,
"step": 1085
},
{
"epoch": 1.245588868168125,
"grad_norm": 30.75,
"learning_rate": 1.6692150866462795e-05,
"loss": 0.3318,
"step": 1086
},
{
"epoch": 1.246736479701621,
"grad_norm": 52.5,
"learning_rate": 1.668705402650357e-05,
"loss": 0.4148,
"step": 1087
},
{
"epoch": 1.247884091235117,
"grad_norm": 22.125,
"learning_rate": 1.6681957186544346e-05,
"loss": 0.5934,
"step": 1088
},
{
"epoch": 1.2490317027686129,
"grad_norm": 33.5,
"learning_rate": 1.6676860346585116e-05,
"loss": 0.2049,
"step": 1089
},
{
"epoch": 1.2501793143021087,
"grad_norm": 37.5,
"learning_rate": 1.6671763506625894e-05,
"loss": 0.3963,
"step": 1090
},
{
"epoch": 1.2513269258356048,
"grad_norm": 18.5,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.2542,
"step": 1091
},
{
"epoch": 1.2524745373691006,
"grad_norm": 22.75,
"learning_rate": 1.666156982670744e-05,
"loss": 0.608,
"step": 1092
},
{
"epoch": 1.2536221489025965,
"grad_norm": 29.625,
"learning_rate": 1.665647298674822e-05,
"loss": 0.6878,
"step": 1093
},
{
"epoch": 1.2547697604360923,
"grad_norm": 30.125,
"learning_rate": 1.6651376146788993e-05,
"loss": 0.172,
"step": 1094
},
{
"epoch": 1.2559173719695882,
"grad_norm": 32.75,
"learning_rate": 1.6646279306829766e-05,
"loss": 0.535,
"step": 1095
},
{
"epoch": 1.2570649835030843,
"grad_norm": 53.25,
"learning_rate": 1.6641182466870544e-05,
"loss": 0.5492,
"step": 1096
},
{
"epoch": 1.2582125950365801,
"grad_norm": 29.0,
"learning_rate": 1.6636085626911314e-05,
"loss": 0.3528,
"step": 1097
},
{
"epoch": 1.259360206570076,
"grad_norm": 72.5,
"learning_rate": 1.663098878695209e-05,
"loss": 0.5466,
"step": 1098
},
{
"epoch": 1.260507818103572,
"grad_norm": 33.25,
"learning_rate": 1.6625891946992865e-05,
"loss": 0.2994,
"step": 1099
},
{
"epoch": 1.261655429637068,
"grad_norm": 44.5,
"learning_rate": 1.662079510703364e-05,
"loss": 0.5398,
"step": 1100
},
{
"epoch": 1.261655429637068,
"eval_accuracy": 0.56,
"eval_loss": 0.5154783129692078,
"eval_runtime": 49.6732,
"eval_samples_per_second": 2.013,
"eval_steps_per_second": 2.013,
"step": 1100
},
{
"epoch": 1.2628030411705637,
"grad_norm": 23.5,
"learning_rate": 1.6615698267074417e-05,
"loss": 0.6216,
"step": 1101
},
{
"epoch": 1.2639506527040596,
"grad_norm": 42.0,
"learning_rate": 1.661060142711519e-05,
"loss": 0.2358,
"step": 1102
},
{
"epoch": 1.2650982642375557,
"grad_norm": 32.75,
"learning_rate": 1.6605504587155964e-05,
"loss": 0.2636,
"step": 1103
},
{
"epoch": 1.2662458757710515,
"grad_norm": 19.875,
"learning_rate": 1.660040774719674e-05,
"loss": 0.2472,
"step": 1104
},
{
"epoch": 1.2673934873045474,
"grad_norm": 11.8125,
"learning_rate": 1.6595310907237516e-05,
"loss": 0.1907,
"step": 1105
},
{
"epoch": 1.2685410988380434,
"grad_norm": 74.0,
"learning_rate": 1.659021406727829e-05,
"loss": 0.8738,
"step": 1106
},
{
"epoch": 1.2696887103715393,
"grad_norm": 39.5,
"learning_rate": 1.6585117227319063e-05,
"loss": 0.4113,
"step": 1107
},
{
"epoch": 1.2708363219050351,
"grad_norm": 34.25,
"learning_rate": 1.6580020387359837e-05,
"loss": 0.7458,
"step": 1108
},
{
"epoch": 1.271983933438531,
"grad_norm": 130.0,
"learning_rate": 1.657492354740061e-05,
"loss": 1.2238,
"step": 1109
},
{
"epoch": 1.2731315449720269,
"grad_norm": 37.5,
"learning_rate": 1.656982670744139e-05,
"loss": 0.6994,
"step": 1110
},
{
"epoch": 1.274279156505523,
"grad_norm": 83.5,
"learning_rate": 1.6564729867482163e-05,
"loss": 1.1055,
"step": 1111
},
{
"epoch": 1.2754267680390188,
"grad_norm": 7.15625,
"learning_rate": 1.6559633027522936e-05,
"loss": 0.1926,
"step": 1112
},
{
"epoch": 1.2765743795725146,
"grad_norm": 151.0,
"learning_rate": 1.6554536187563714e-05,
"loss": 0.4989,
"step": 1113
},
{
"epoch": 1.2777219911060107,
"grad_norm": 43.75,
"learning_rate": 1.6549439347604484e-05,
"loss": 0.4593,
"step": 1114
},
{
"epoch": 1.2788696026395066,
"grad_norm": 23.5,
"learning_rate": 1.654434250764526e-05,
"loss": 0.2898,
"step": 1115
},
{
"epoch": 1.2800172141730024,
"grad_norm": 37.5,
"learning_rate": 1.6539245667686035e-05,
"loss": 0.3342,
"step": 1116
},
{
"epoch": 1.2811648257064983,
"grad_norm": 35.25,
"learning_rate": 1.653414882772681e-05,
"loss": 0.4059,
"step": 1117
},
{
"epoch": 1.2823124372399943,
"grad_norm": 17.875,
"learning_rate": 1.6529051987767587e-05,
"loss": 0.3272,
"step": 1118
},
{
"epoch": 1.2834600487734902,
"grad_norm": 59.25,
"learning_rate": 1.652395514780836e-05,
"loss": 0.5725,
"step": 1119
},
{
"epoch": 1.284607660306986,
"grad_norm": 66.0,
"learning_rate": 1.6518858307849134e-05,
"loss": 0.8477,
"step": 1120
},
{
"epoch": 1.285755271840482,
"grad_norm": 67.0,
"learning_rate": 1.6513761467889912e-05,
"loss": 0.5421,
"step": 1121
},
{
"epoch": 1.286902883373978,
"grad_norm": 23.75,
"learning_rate": 1.6508664627930682e-05,
"loss": 0.457,
"step": 1122
},
{
"epoch": 1.2880504949074738,
"grad_norm": 22.875,
"learning_rate": 1.650356778797146e-05,
"loss": 0.5799,
"step": 1123
},
{
"epoch": 1.2891981064409697,
"grad_norm": 48.25,
"learning_rate": 1.6498470948012233e-05,
"loss": 0.5672,
"step": 1124
},
{
"epoch": 1.2903457179744655,
"grad_norm": 31.625,
"learning_rate": 1.6493374108053007e-05,
"loss": 0.6196,
"step": 1125
},
{
"epoch": 1.2914933295079616,
"grad_norm": 79.5,
"learning_rate": 1.6488277268093785e-05,
"loss": 0.6727,
"step": 1126
},
{
"epoch": 1.2926409410414574,
"grad_norm": 55.25,
"learning_rate": 1.648318042813456e-05,
"loss": 0.6848,
"step": 1127
},
{
"epoch": 1.2937885525749535,
"grad_norm": 49.5,
"learning_rate": 1.6478083588175332e-05,
"loss": 1.015,
"step": 1128
},
{
"epoch": 1.2949361641084494,
"grad_norm": 30.375,
"learning_rate": 1.6472986748216106e-05,
"loss": 0.9048,
"step": 1129
},
{
"epoch": 1.2960837756419452,
"grad_norm": 65.5,
"learning_rate": 1.6467889908256884e-05,
"loss": 0.7712,
"step": 1130
},
{
"epoch": 1.297231387175441,
"grad_norm": 14.8125,
"learning_rate": 1.6462793068297658e-05,
"loss": 0.1942,
"step": 1131
},
{
"epoch": 1.298378998708937,
"grad_norm": 57.0,
"learning_rate": 1.645769622833843e-05,
"loss": 0.5278,
"step": 1132
},
{
"epoch": 1.299526610242433,
"grad_norm": 20.125,
"learning_rate": 1.6452599388379205e-05,
"loss": 0.3787,
"step": 1133
},
{
"epoch": 1.3006742217759288,
"grad_norm": 20.875,
"learning_rate": 1.644750254841998e-05,
"loss": 0.347,
"step": 1134
},
{
"epoch": 1.3018218333094247,
"grad_norm": 36.25,
"learning_rate": 1.6442405708460757e-05,
"loss": 0.613,
"step": 1135
},
{
"epoch": 1.3029694448429208,
"grad_norm": 46.75,
"learning_rate": 1.643730886850153e-05,
"loss": 0.3531,
"step": 1136
},
{
"epoch": 1.3041170563764166,
"grad_norm": 51.5,
"learning_rate": 1.6432212028542304e-05,
"loss": 0.4654,
"step": 1137
},
{
"epoch": 1.3052646679099125,
"grad_norm": 59.5,
"learning_rate": 1.642711518858308e-05,
"loss": 0.6825,
"step": 1138
},
{
"epoch": 1.3064122794434083,
"grad_norm": 20.125,
"learning_rate": 1.6422018348623852e-05,
"loss": 0.5258,
"step": 1139
},
{
"epoch": 1.3075598909769044,
"grad_norm": 21.375,
"learning_rate": 1.641692150866463e-05,
"loss": 0.2334,
"step": 1140
},
{
"epoch": 1.3087075025104002,
"grad_norm": 57.5,
"learning_rate": 1.6411824668705403e-05,
"loss": 0.9003,
"step": 1141
},
{
"epoch": 1.309855114043896,
"grad_norm": 61.5,
"learning_rate": 1.6406727828746177e-05,
"loss": 0.6237,
"step": 1142
},
{
"epoch": 1.3110027255773922,
"grad_norm": 20.875,
"learning_rate": 1.6401630988786955e-05,
"loss": 0.3164,
"step": 1143
},
{
"epoch": 1.312150337110888,
"grad_norm": 48.5,
"learning_rate": 1.639653414882773e-05,
"loss": 0.4018,
"step": 1144
},
{
"epoch": 1.3132979486443839,
"grad_norm": 56.0,
"learning_rate": 1.6391437308868502e-05,
"loss": 0.7092,
"step": 1145
},
{
"epoch": 1.3144455601778797,
"grad_norm": 38.75,
"learning_rate": 1.638634046890928e-05,
"loss": 0.5181,
"step": 1146
},
{
"epoch": 1.3155931717113756,
"grad_norm": 40.5,
"learning_rate": 1.6381243628950054e-05,
"loss": 0.3165,
"step": 1147
},
{
"epoch": 1.3167407832448716,
"grad_norm": 32.25,
"learning_rate": 1.6376146788990827e-05,
"loss": 0.7836,
"step": 1148
},
{
"epoch": 1.3178883947783675,
"grad_norm": 61.5,
"learning_rate": 1.63710499490316e-05,
"loss": 0.7627,
"step": 1149
},
{
"epoch": 1.3190360063118634,
"grad_norm": 79.5,
"learning_rate": 1.6365953109072375e-05,
"loss": 0.9356,
"step": 1150
},
{
"epoch": 1.3201836178453594,
"grad_norm": 189.0,
"learning_rate": 1.6360856269113153e-05,
"loss": 1.0391,
"step": 1151
},
{
"epoch": 1.3213312293788553,
"grad_norm": 20.25,
"learning_rate": 1.6355759429153926e-05,
"loss": 0.6406,
"step": 1152
},
{
"epoch": 1.3224788409123511,
"grad_norm": 26.375,
"learning_rate": 1.63506625891947e-05,
"loss": 0.3832,
"step": 1153
},
{
"epoch": 1.323626452445847,
"grad_norm": 33.0,
"learning_rate": 1.6345565749235474e-05,
"loss": 0.4041,
"step": 1154
},
{
"epoch": 1.324774063979343,
"grad_norm": 23.625,
"learning_rate": 1.634046890927625e-05,
"loss": 0.3527,
"step": 1155
},
{
"epoch": 1.325921675512839,
"grad_norm": 99.5,
"learning_rate": 1.6335372069317022e-05,
"loss": 0.9746,
"step": 1156
},
{
"epoch": 1.3270692870463348,
"grad_norm": 45.0,
"learning_rate": 1.63302752293578e-05,
"loss": 0.2891,
"step": 1157
},
{
"epoch": 1.3282168985798308,
"grad_norm": 56.5,
"learning_rate": 1.6325178389398573e-05,
"loss": 0.8078,
"step": 1158
},
{
"epoch": 1.3293645101133267,
"grad_norm": 16.125,
"learning_rate": 1.6320081549439347e-05,
"loss": 0.6181,
"step": 1159
},
{
"epoch": 1.3305121216468225,
"grad_norm": 31.5,
"learning_rate": 1.6314984709480125e-05,
"loss": 0.3313,
"step": 1160
},
{
"epoch": 1.3316597331803184,
"grad_norm": 11.75,
"learning_rate": 1.63098878695209e-05,
"loss": 0.2764,
"step": 1161
},
{
"epoch": 1.3328073447138145,
"grad_norm": 32.0,
"learning_rate": 1.6304791029561672e-05,
"loss": 0.6992,
"step": 1162
},
{
"epoch": 1.3339549562473103,
"grad_norm": 40.25,
"learning_rate": 1.629969418960245e-05,
"loss": 0.4695,
"step": 1163
},
{
"epoch": 1.3351025677808062,
"grad_norm": 60.75,
"learning_rate": 1.6294597349643224e-05,
"loss": 0.5952,
"step": 1164
},
{
"epoch": 1.3362501793143022,
"grad_norm": 52.75,
"learning_rate": 1.6289500509683997e-05,
"loss": 0.4987,
"step": 1165
},
{
"epoch": 1.337397790847798,
"grad_norm": 28.25,
"learning_rate": 1.628440366972477e-05,
"loss": 0.31,
"step": 1166
},
{
"epoch": 1.338545402381294,
"grad_norm": 21.25,
"learning_rate": 1.6279306829765545e-05,
"loss": 0.4244,
"step": 1167
},
{
"epoch": 1.3396930139147898,
"grad_norm": 30.75,
"learning_rate": 1.6274209989806323e-05,
"loss": 0.5522,
"step": 1168
},
{
"epoch": 1.3408406254482856,
"grad_norm": 18.5,
"learning_rate": 1.6269113149847096e-05,
"loss": 0.3786,
"step": 1169
},
{
"epoch": 1.3419882369817817,
"grad_norm": 14.6875,
"learning_rate": 1.626401630988787e-05,
"loss": 0.0966,
"step": 1170
},
{
"epoch": 1.3431358485152776,
"grad_norm": 50.0,
"learning_rate": 1.6258919469928644e-05,
"loss": 0.4607,
"step": 1171
},
{
"epoch": 1.3442834600487734,
"grad_norm": 27.375,
"learning_rate": 1.625382262996942e-05,
"loss": 0.847,
"step": 1172
},
{
"epoch": 1.3454310715822695,
"grad_norm": 13.0625,
"learning_rate": 1.6248725790010195e-05,
"loss": 0.4091,
"step": 1173
},
{
"epoch": 1.3465786831157653,
"grad_norm": 16.0,
"learning_rate": 1.624362895005097e-05,
"loss": 0.2403,
"step": 1174
},
{
"epoch": 1.3477262946492612,
"grad_norm": 37.25,
"learning_rate": 1.6238532110091743e-05,
"loss": 0.421,
"step": 1175
},
{
"epoch": 1.348873906182757,
"grad_norm": 55.25,
"learning_rate": 1.6233435270132517e-05,
"loss": 0.662,
"step": 1176
},
{
"epoch": 1.3500215177162531,
"grad_norm": 40.5,
"learning_rate": 1.6228338430173294e-05,
"loss": 0.4565,
"step": 1177
},
{
"epoch": 1.351169129249749,
"grad_norm": 14.3125,
"learning_rate": 1.622324159021407e-05,
"loss": 0.4465,
"step": 1178
},
{
"epoch": 1.3523167407832448,
"grad_norm": 28.625,
"learning_rate": 1.6218144750254842e-05,
"loss": 0.3729,
"step": 1179
},
{
"epoch": 1.353464352316741,
"grad_norm": 55.25,
"learning_rate": 1.621304791029562e-05,
"loss": 0.3222,
"step": 1180
},
{
"epoch": 1.3546119638502367,
"grad_norm": 22.875,
"learning_rate": 1.6207951070336393e-05,
"loss": 0.437,
"step": 1181
},
{
"epoch": 1.3557595753837326,
"grad_norm": 38.0,
"learning_rate": 1.6202854230377167e-05,
"loss": 0.651,
"step": 1182
},
{
"epoch": 1.3569071869172284,
"grad_norm": 21.625,
"learning_rate": 1.6197757390417945e-05,
"loss": 0.4508,
"step": 1183
},
{
"epoch": 1.3580547984507243,
"grad_norm": 27.875,
"learning_rate": 1.6192660550458715e-05,
"loss": 0.3005,
"step": 1184
},
{
"epoch": 1.3592024099842204,
"grad_norm": 42.0,
"learning_rate": 1.6187563710499492e-05,
"loss": 0.2964,
"step": 1185
},
{
"epoch": 1.3603500215177162,
"grad_norm": 26.125,
"learning_rate": 1.6182466870540266e-05,
"loss": 0.6497,
"step": 1186
},
{
"epoch": 1.3614976330512123,
"grad_norm": 20.375,
"learning_rate": 1.617737003058104e-05,
"loss": 0.3097,
"step": 1187
},
{
"epoch": 1.3626452445847081,
"grad_norm": 56.5,
"learning_rate": 1.6172273190621818e-05,
"loss": 0.437,
"step": 1188
},
{
"epoch": 1.363792856118204,
"grad_norm": 69.5,
"learning_rate": 1.616717635066259e-05,
"loss": 0.4491,
"step": 1189
},
{
"epoch": 1.3649404676516999,
"grad_norm": 58.75,
"learning_rate": 1.6162079510703365e-05,
"loss": 0.4697,
"step": 1190
},
{
"epoch": 1.3660880791851957,
"grad_norm": 15.0,
"learning_rate": 1.615698267074414e-05,
"loss": 0.2935,
"step": 1191
},
{
"epoch": 1.3672356907186918,
"grad_norm": 69.0,
"learning_rate": 1.6151885830784913e-05,
"loss": 0.8532,
"step": 1192
},
{
"epoch": 1.3683833022521876,
"grad_norm": 27.5,
"learning_rate": 1.614678899082569e-05,
"loss": 0.3305,
"step": 1193
},
{
"epoch": 1.3695309137856835,
"grad_norm": 65.5,
"learning_rate": 1.6141692150866464e-05,
"loss": 0.6747,
"step": 1194
},
{
"epoch": 1.3706785253191796,
"grad_norm": 39.5,
"learning_rate": 1.6136595310907238e-05,
"loss": 0.4378,
"step": 1195
},
{
"epoch": 1.3718261368526754,
"grad_norm": 36.25,
"learning_rate": 1.6131498470948012e-05,
"loss": 0.4756,
"step": 1196
},
{
"epoch": 1.3729737483861713,
"grad_norm": 25.75,
"learning_rate": 1.612640163098879e-05,
"loss": 0.2116,
"step": 1197
},
{
"epoch": 1.374121359919667,
"grad_norm": 51.5,
"learning_rate": 1.6121304791029563e-05,
"loss": 0.6976,
"step": 1198
},
{
"epoch": 1.3752689714531632,
"grad_norm": 28.25,
"learning_rate": 1.6116207951070337e-05,
"loss": 0.3644,
"step": 1199
},
{
"epoch": 1.376416582986659,
"grad_norm": 21.25,
"learning_rate": 1.6111111111111115e-05,
"loss": 0.4288,
"step": 1200
},
{
"epoch": 1.376416582986659,
"eval_accuracy": 0.61,
"eval_loss": 0.5443911552429199,
"eval_runtime": 49.3817,
"eval_samples_per_second": 2.025,
"eval_steps_per_second": 2.025,
"step": 1200
},
{
"epoch": 1.3775641945201549,
"grad_norm": 16.75,
"learning_rate": 1.6106014271151885e-05,
"loss": 0.5041,
"step": 1201
},
{
"epoch": 1.378711806053651,
"grad_norm": 50.25,
"learning_rate": 1.6100917431192662e-05,
"loss": 0.5077,
"step": 1202
},
{
"epoch": 1.3798594175871468,
"grad_norm": 15.875,
"learning_rate": 1.6095820591233436e-05,
"loss": 0.2509,
"step": 1203
},
{
"epoch": 1.3810070291206427,
"grad_norm": 52.5,
"learning_rate": 1.609072375127421e-05,
"loss": 0.6619,
"step": 1204
},
{
"epoch": 1.3821546406541385,
"grad_norm": 27.0,
"learning_rate": 1.6085626911314988e-05,
"loss": 0.3906,
"step": 1205
},
{
"epoch": 1.3833022521876344,
"grad_norm": 28.5,
"learning_rate": 1.608053007135576e-05,
"loss": 0.506,
"step": 1206
},
{
"epoch": 1.3844498637211304,
"grad_norm": 34.25,
"learning_rate": 1.6075433231396535e-05,
"loss": 0.3932,
"step": 1207
},
{
"epoch": 1.3855974752546263,
"grad_norm": 36.75,
"learning_rate": 1.6070336391437313e-05,
"loss": 0.5362,
"step": 1208
},
{
"epoch": 1.3867450867881221,
"grad_norm": 52.5,
"learning_rate": 1.6065239551478083e-05,
"loss": 0.5699,
"step": 1209
},
{
"epoch": 1.3878926983216182,
"grad_norm": 45.5,
"learning_rate": 1.606014271151886e-05,
"loss": 0.5685,
"step": 1210
},
{
"epoch": 1.389040309855114,
"grad_norm": 60.25,
"learning_rate": 1.6055045871559634e-05,
"loss": 0.9313,
"step": 1211
},
{
"epoch": 1.39018792138861,
"grad_norm": 38.0,
"learning_rate": 1.6049949031600408e-05,
"loss": 0.5542,
"step": 1212
},
{
"epoch": 1.3913355329221058,
"grad_norm": 40.75,
"learning_rate": 1.6044852191641186e-05,
"loss": 0.8328,
"step": 1213
},
{
"epoch": 1.3924831444556018,
"grad_norm": 42.25,
"learning_rate": 1.603975535168196e-05,
"loss": 0.2783,
"step": 1214
},
{
"epoch": 1.3936307559890977,
"grad_norm": 39.5,
"learning_rate": 1.6034658511722733e-05,
"loss": 0.5385,
"step": 1215
},
{
"epoch": 1.3947783675225935,
"grad_norm": 42.75,
"learning_rate": 1.6029561671763507e-05,
"loss": 0.5375,
"step": 1216
},
{
"epoch": 1.3959259790560896,
"grad_norm": 22.25,
"learning_rate": 1.602446483180428e-05,
"loss": 0.6028,
"step": 1217
},
{
"epoch": 1.3970735905895855,
"grad_norm": 34.0,
"learning_rate": 1.601936799184506e-05,
"loss": 0.5153,
"step": 1218
},
{
"epoch": 1.3982212021230813,
"grad_norm": 79.5,
"learning_rate": 1.6014271151885832e-05,
"loss": 0.7959,
"step": 1219
},
{
"epoch": 1.3993688136565772,
"grad_norm": 28.0,
"learning_rate": 1.6009174311926606e-05,
"loss": 0.271,
"step": 1220
},
{
"epoch": 1.400516425190073,
"grad_norm": 76.5,
"learning_rate": 1.600407747196738e-05,
"loss": 0.6952,
"step": 1221
},
{
"epoch": 1.401664036723569,
"grad_norm": 21.875,
"learning_rate": 1.5998980632008157e-05,
"loss": 0.452,
"step": 1222
},
{
"epoch": 1.402811648257065,
"grad_norm": 70.5,
"learning_rate": 1.599388379204893e-05,
"loss": 0.5592,
"step": 1223
},
{
"epoch": 1.403959259790561,
"grad_norm": 17.875,
"learning_rate": 1.5988786952089705e-05,
"loss": 0.451,
"step": 1224
},
{
"epoch": 1.4051068713240569,
"grad_norm": 30.125,
"learning_rate": 1.5983690112130483e-05,
"loss": 0.4143,
"step": 1225
},
{
"epoch": 1.4062544828575527,
"grad_norm": 25.625,
"learning_rate": 1.5978593272171253e-05,
"loss": 0.454,
"step": 1226
},
{
"epoch": 1.4074020943910486,
"grad_norm": 24.625,
"learning_rate": 1.597349643221203e-05,
"loss": 0.4827,
"step": 1227
},
{
"epoch": 1.4085497059245444,
"grad_norm": 14.6875,
"learning_rate": 1.5968399592252804e-05,
"loss": 0.1517,
"step": 1228
},
{
"epoch": 1.4096973174580405,
"grad_norm": 12.3125,
"learning_rate": 1.5963302752293578e-05,
"loss": 0.4015,
"step": 1229
},
{
"epoch": 1.4108449289915364,
"grad_norm": 59.0,
"learning_rate": 1.5958205912334355e-05,
"loss": 0.5366,
"step": 1230
},
{
"epoch": 1.4119925405250322,
"grad_norm": 11.1875,
"learning_rate": 1.595310907237513e-05,
"loss": 0.3743,
"step": 1231
},
{
"epoch": 1.4131401520585283,
"grad_norm": 18.75,
"learning_rate": 1.5948012232415903e-05,
"loss": 0.4668,
"step": 1232
},
{
"epoch": 1.4142877635920241,
"grad_norm": 50.75,
"learning_rate": 1.5942915392456677e-05,
"loss": 0.3211,
"step": 1233
},
{
"epoch": 1.41543537512552,
"grad_norm": 41.5,
"learning_rate": 1.593781855249745e-05,
"loss": 0.5208,
"step": 1234
},
{
"epoch": 1.4165829866590158,
"grad_norm": 16.5,
"learning_rate": 1.593272171253823e-05,
"loss": 0.2334,
"step": 1235
},
{
"epoch": 1.417730598192512,
"grad_norm": 72.0,
"learning_rate": 1.5927624872579002e-05,
"loss": 0.4065,
"step": 1236
},
{
"epoch": 1.4188782097260078,
"grad_norm": 21.0,
"learning_rate": 1.5922528032619776e-05,
"loss": 0.4257,
"step": 1237
},
{
"epoch": 1.4200258212595036,
"grad_norm": 18.75,
"learning_rate": 1.591743119266055e-05,
"loss": 0.3615,
"step": 1238
},
{
"epoch": 1.4211734327929997,
"grad_norm": 54.5,
"learning_rate": 1.5912334352701327e-05,
"loss": 0.2902,
"step": 1239
},
{
"epoch": 1.4223210443264955,
"grad_norm": 8.3125,
"learning_rate": 1.59072375127421e-05,
"loss": 0.1653,
"step": 1240
},
{
"epoch": 1.4234686558599914,
"grad_norm": 18.125,
"learning_rate": 1.5902140672782875e-05,
"loss": 0.3842,
"step": 1241
},
{
"epoch": 1.4246162673934872,
"grad_norm": 85.0,
"learning_rate": 1.5897043832823652e-05,
"loss": 0.7718,
"step": 1242
},
{
"epoch": 1.425763878926983,
"grad_norm": 27.125,
"learning_rate": 1.5891946992864423e-05,
"loss": 0.195,
"step": 1243
},
{
"epoch": 1.4269114904604792,
"grad_norm": 31.125,
"learning_rate": 1.58868501529052e-05,
"loss": 0.5963,
"step": 1244
},
{
"epoch": 1.428059101993975,
"grad_norm": 67.0,
"learning_rate": 1.5881753312945974e-05,
"loss": 0.709,
"step": 1245
},
{
"epoch": 1.429206713527471,
"grad_norm": 20.25,
"learning_rate": 1.5876656472986748e-05,
"loss": 0.3003,
"step": 1246
},
{
"epoch": 1.430354325060967,
"grad_norm": 40.25,
"learning_rate": 1.5871559633027525e-05,
"loss": 0.7344,
"step": 1247
},
{
"epoch": 1.4315019365944628,
"grad_norm": 26.75,
"learning_rate": 1.58664627930683e-05,
"loss": 1.0281,
"step": 1248
},
{
"epoch": 1.4326495481279586,
"grad_norm": 49.25,
"learning_rate": 1.5861365953109073e-05,
"loss": 0.3,
"step": 1249
},
{
"epoch": 1.4337971596614545,
"grad_norm": 27.125,
"learning_rate": 1.585626911314985e-05,
"loss": 0.5945,
"step": 1250
},
{
"epoch": 1.4349447711949506,
"grad_norm": 41.5,
"learning_rate": 1.585117227319062e-05,
"loss": 0.6692,
"step": 1251
},
{
"epoch": 1.4360923827284464,
"grad_norm": 14.375,
"learning_rate": 1.58460754332314e-05,
"loss": 0.3908,
"step": 1252
},
{
"epoch": 1.4372399942619423,
"grad_norm": 77.5,
"learning_rate": 1.5840978593272172e-05,
"loss": 0.7376,
"step": 1253
},
{
"epoch": 1.4383876057954383,
"grad_norm": 29.75,
"learning_rate": 1.5835881753312946e-05,
"loss": 0.355,
"step": 1254
},
{
"epoch": 1.4395352173289342,
"grad_norm": 32.0,
"learning_rate": 1.5830784913353723e-05,
"loss": 0.7525,
"step": 1255
},
{
"epoch": 1.44068282886243,
"grad_norm": 42.75,
"learning_rate": 1.5825688073394497e-05,
"loss": 0.2832,
"step": 1256
},
{
"epoch": 1.441830440395926,
"grad_norm": 21.125,
"learning_rate": 1.582059123343527e-05,
"loss": 0.3375,
"step": 1257
},
{
"epoch": 1.442978051929422,
"grad_norm": 33.25,
"learning_rate": 1.5815494393476045e-05,
"loss": 0.3517,
"step": 1258
},
{
"epoch": 1.4441256634629178,
"grad_norm": 35.0,
"learning_rate": 1.5810397553516822e-05,
"loss": 0.382,
"step": 1259
},
{
"epoch": 1.4452732749964137,
"grad_norm": 53.75,
"learning_rate": 1.5805300713557596e-05,
"loss": 0.3113,
"step": 1260
},
{
"epoch": 1.4464208865299097,
"grad_norm": 43.75,
"learning_rate": 1.580020387359837e-05,
"loss": 0.3177,
"step": 1261
},
{
"epoch": 1.4475684980634056,
"grad_norm": 35.0,
"learning_rate": 1.5795107033639144e-05,
"loss": 0.3791,
"step": 1262
},
{
"epoch": 1.4487161095969014,
"grad_norm": 45.0,
"learning_rate": 1.5790010193679918e-05,
"loss": 0.4492,
"step": 1263
},
{
"epoch": 1.4498637211303973,
"grad_norm": 27.875,
"learning_rate": 1.5784913353720695e-05,
"loss": 0.3343,
"step": 1264
},
{
"epoch": 1.4510113326638931,
"grad_norm": 19.125,
"learning_rate": 1.577981651376147e-05,
"loss": 0.8559,
"step": 1265
},
{
"epoch": 1.4521589441973892,
"grad_norm": 8.0,
"learning_rate": 1.5774719673802243e-05,
"loss": 0.1379,
"step": 1266
},
{
"epoch": 1.453306555730885,
"grad_norm": 49.5,
"learning_rate": 1.576962283384302e-05,
"loss": 0.4941,
"step": 1267
},
{
"epoch": 1.454454167264381,
"grad_norm": 84.5,
"learning_rate": 1.576452599388379e-05,
"loss": 1.4308,
"step": 1268
},
{
"epoch": 1.455601778797877,
"grad_norm": 92.0,
"learning_rate": 1.5759429153924568e-05,
"loss": 0.9692,
"step": 1269
},
{
"epoch": 1.4567493903313729,
"grad_norm": 88.0,
"learning_rate": 1.5754332313965342e-05,
"loss": 0.9589,
"step": 1270
},
{
"epoch": 1.4578970018648687,
"grad_norm": 50.25,
"learning_rate": 1.5749235474006116e-05,
"loss": 0.5352,
"step": 1271
},
{
"epoch": 1.4590446133983646,
"grad_norm": 42.0,
"learning_rate": 1.5744138634046893e-05,
"loss": 0.3708,
"step": 1272
},
{
"epoch": 1.4601922249318606,
"grad_norm": 35.0,
"learning_rate": 1.5739041794087667e-05,
"loss": 0.7022,
"step": 1273
},
{
"epoch": 1.4613398364653565,
"grad_norm": 13.375,
"learning_rate": 1.573394495412844e-05,
"loss": 0.3201,
"step": 1274
},
{
"epoch": 1.4624874479988523,
"grad_norm": 87.5,
"learning_rate": 1.572884811416922e-05,
"loss": 0.576,
"step": 1275
},
{
"epoch": 1.4636350595323484,
"grad_norm": 68.5,
"learning_rate": 1.5723751274209992e-05,
"loss": 0.5697,
"step": 1276
},
{
"epoch": 1.4647826710658443,
"grad_norm": 31.75,
"learning_rate": 1.5718654434250766e-05,
"loss": 0.4631,
"step": 1277
},
{
"epoch": 1.46593028259934,
"grad_norm": 19.5,
"learning_rate": 1.571355759429154e-05,
"loss": 0.4516,
"step": 1278
},
{
"epoch": 1.467077894132836,
"grad_norm": 52.0,
"learning_rate": 1.5708460754332314e-05,
"loss": 0.6808,
"step": 1279
},
{
"epoch": 1.4682255056663318,
"grad_norm": 17.875,
"learning_rate": 1.570336391437309e-05,
"loss": 0.3936,
"step": 1280
},
{
"epoch": 1.4693731171998279,
"grad_norm": 24.25,
"learning_rate": 1.5698267074413865e-05,
"loss": 0.4196,
"step": 1281
},
{
"epoch": 1.4705207287333237,
"grad_norm": 111.0,
"learning_rate": 1.569317023445464e-05,
"loss": 0.8228,
"step": 1282
},
{
"epoch": 1.4716683402668198,
"grad_norm": 36.5,
"learning_rate": 1.5688073394495413e-05,
"loss": 0.5546,
"step": 1283
},
{
"epoch": 1.4728159518003157,
"grad_norm": 40.5,
"learning_rate": 1.568297655453619e-05,
"loss": 0.4347,
"step": 1284
},
{
"epoch": 1.4739635633338115,
"grad_norm": 59.75,
"learning_rate": 1.5677879714576964e-05,
"loss": 0.8506,
"step": 1285
},
{
"epoch": 1.4751111748673074,
"grad_norm": 58.25,
"learning_rate": 1.5672782874617738e-05,
"loss": 0.4958,
"step": 1286
},
{
"epoch": 1.4762587864008032,
"grad_norm": 41.5,
"learning_rate": 1.5667686034658512e-05,
"loss": 0.6571,
"step": 1287
},
{
"epoch": 1.4774063979342993,
"grad_norm": 20.75,
"learning_rate": 1.5662589194699286e-05,
"loss": 0.2749,
"step": 1288
},
{
"epoch": 1.4785540094677951,
"grad_norm": 24.875,
"learning_rate": 1.5657492354740063e-05,
"loss": 0.552,
"step": 1289
},
{
"epoch": 1.479701621001291,
"grad_norm": 24.625,
"learning_rate": 1.5652395514780837e-05,
"loss": 0.5655,
"step": 1290
},
{
"epoch": 1.480849232534787,
"grad_norm": 71.0,
"learning_rate": 1.564729867482161e-05,
"loss": 1.1072,
"step": 1291
},
{
"epoch": 1.481996844068283,
"grad_norm": 56.5,
"learning_rate": 1.564220183486239e-05,
"loss": 0.9029,
"step": 1292
},
{
"epoch": 1.4831444556017788,
"grad_norm": 75.0,
"learning_rate": 1.563710499490316e-05,
"loss": 0.8671,
"step": 1293
},
{
"epoch": 1.4842920671352746,
"grad_norm": 68.5,
"learning_rate": 1.5632008154943936e-05,
"loss": 0.6165,
"step": 1294
},
{
"epoch": 1.4854396786687707,
"grad_norm": 57.5,
"learning_rate": 1.5626911314984713e-05,
"loss": 0.4413,
"step": 1295
},
{
"epoch": 1.4865872902022665,
"grad_norm": 38.25,
"learning_rate": 1.5621814475025484e-05,
"loss": 0.4508,
"step": 1296
},
{
"epoch": 1.4877349017357624,
"grad_norm": 22.375,
"learning_rate": 1.561671763506626e-05,
"loss": 0.4694,
"step": 1297
},
{
"epoch": 1.4888825132692585,
"grad_norm": 19.625,
"learning_rate": 1.5611620795107035e-05,
"loss": 0.4833,
"step": 1298
},
{
"epoch": 1.4900301248027543,
"grad_norm": 74.0,
"learning_rate": 1.560652395514781e-05,
"loss": 0.6443,
"step": 1299
},
{
"epoch": 1.4911777363362502,
"grad_norm": 30.25,
"learning_rate": 1.5601427115188586e-05,
"loss": 0.5003,
"step": 1300
},
{
"epoch": 1.4911777363362502,
"eval_accuracy": 0.64,
"eval_loss": 0.5184877514839172,
"eval_runtime": 49.6613,
"eval_samples_per_second": 2.014,
"eval_steps_per_second": 2.014,
"step": 1300
},
{
"epoch": 1.492325347869746,
"grad_norm": 26.5,
"learning_rate": 1.559633027522936e-05,
"loss": 0.4356,
"step": 1301
},
{
"epoch": 1.4934729594032419,
"grad_norm": 93.0,
"learning_rate": 1.5591233435270134e-05,
"loss": 0.6945,
"step": 1302
},
{
"epoch": 1.494620570936738,
"grad_norm": 84.0,
"learning_rate": 1.5586136595310908e-05,
"loss": 0.7059,
"step": 1303
},
{
"epoch": 1.4957681824702338,
"grad_norm": 84.5,
"learning_rate": 1.5581039755351682e-05,
"loss": 0.8654,
"step": 1304
},
{
"epoch": 1.4969157940037299,
"grad_norm": 79.5,
"learning_rate": 1.5575942915392456e-05,
"loss": 0.8112,
"step": 1305
},
{
"epoch": 1.4980634055372257,
"grad_norm": 41.75,
"learning_rate": 1.5570846075433233e-05,
"loss": 1.0995,
"step": 1306
},
{
"epoch": 1.4992110170707216,
"grad_norm": 28.625,
"learning_rate": 1.5565749235474007e-05,
"loss": 0.8355,
"step": 1307
},
{
"epoch": 1.5003586286042174,
"grad_norm": 67.5,
"learning_rate": 1.556065239551478e-05,
"loss": 0.7727,
"step": 1308
},
{
"epoch": 1.5015062401377133,
"grad_norm": 17.5,
"learning_rate": 1.555555555555556e-05,
"loss": 0.4682,
"step": 1309
},
{
"epoch": 1.5026538516712094,
"grad_norm": 18.625,
"learning_rate": 1.555045871559633e-05,
"loss": 0.2126,
"step": 1310
},
{
"epoch": 1.5038014632047052,
"grad_norm": 15.75,
"learning_rate": 1.5545361875637106e-05,
"loss": 0.4916,
"step": 1311
},
{
"epoch": 1.5049490747382013,
"grad_norm": 31.625,
"learning_rate": 1.554026503567788e-05,
"loss": 0.2308,
"step": 1312
},
{
"epoch": 1.5060966862716971,
"grad_norm": 51.75,
"learning_rate": 1.5535168195718654e-05,
"loss": 1.0898,
"step": 1313
},
{
"epoch": 1.507244297805193,
"grad_norm": 31.75,
"learning_rate": 1.553007135575943e-05,
"loss": 0.4099,
"step": 1314
},
{
"epoch": 1.5083919093386888,
"grad_norm": 88.0,
"learning_rate": 1.5524974515800205e-05,
"loss": 0.9649,
"step": 1315
},
{
"epoch": 1.5095395208721847,
"grad_norm": 24.75,
"learning_rate": 1.551987767584098e-05,
"loss": 1.0352,
"step": 1316
},
{
"epoch": 1.5106871324056805,
"grad_norm": 13.625,
"learning_rate": 1.5514780835881756e-05,
"loss": 0.3537,
"step": 1317
},
{
"epoch": 1.5118347439391766,
"grad_norm": 94.0,
"learning_rate": 1.550968399592253e-05,
"loss": 0.9038,
"step": 1318
},
{
"epoch": 1.5129823554726725,
"grad_norm": 26.0,
"learning_rate": 1.5504587155963304e-05,
"loss": 0.346,
"step": 1319
},
{
"epoch": 1.5141299670061685,
"grad_norm": 44.25,
"learning_rate": 1.5499490316004078e-05,
"loss": 0.7941,
"step": 1320
},
{
"epoch": 1.5152775785396644,
"grad_norm": 27.75,
"learning_rate": 1.5494393476044852e-05,
"loss": 0.3747,
"step": 1321
},
{
"epoch": 1.5164251900731602,
"grad_norm": 97.5,
"learning_rate": 1.548929663608563e-05,
"loss": 0.9651,
"step": 1322
},
{
"epoch": 1.517572801606656,
"grad_norm": 10.6875,
"learning_rate": 1.5484199796126403e-05,
"loss": 0.2523,
"step": 1323
},
{
"epoch": 1.518720413140152,
"grad_norm": 23.25,
"learning_rate": 1.5479102956167177e-05,
"loss": 0.5667,
"step": 1324
},
{
"epoch": 1.519868024673648,
"grad_norm": 22.25,
"learning_rate": 1.547400611620795e-05,
"loss": 0.3108,
"step": 1325
},
{
"epoch": 1.5210156362071439,
"grad_norm": 29.125,
"learning_rate": 1.5468909276248728e-05,
"loss": 0.5994,
"step": 1326
},
{
"epoch": 1.52216324774064,
"grad_norm": 37.0,
"learning_rate": 1.5463812436289502e-05,
"loss": 0.6835,
"step": 1327
},
{
"epoch": 1.5233108592741358,
"grad_norm": 35.0,
"learning_rate": 1.5458715596330276e-05,
"loss": 0.3934,
"step": 1328
},
{
"epoch": 1.5244584708076316,
"grad_norm": 42.5,
"learning_rate": 1.545361875637105e-05,
"loss": 0.4904,
"step": 1329
},
{
"epoch": 1.5256060823411275,
"grad_norm": 79.5,
"learning_rate": 1.5448521916411824e-05,
"loss": 0.8999,
"step": 1330
},
{
"epoch": 1.5267536938746233,
"grad_norm": 51.25,
"learning_rate": 1.54434250764526e-05,
"loss": 0.5231,
"step": 1331
},
{
"epoch": 1.5279013054081192,
"grad_norm": 53.5,
"learning_rate": 1.5438328236493375e-05,
"loss": 0.6297,
"step": 1332
},
{
"epoch": 1.5290489169416153,
"grad_norm": 65.5,
"learning_rate": 1.543323139653415e-05,
"loss": 0.5863,
"step": 1333
},
{
"epoch": 1.5301965284751113,
"grad_norm": 44.0,
"learning_rate": 1.5428134556574926e-05,
"loss": 0.402,
"step": 1334
},
{
"epoch": 1.5313441400086072,
"grad_norm": 54.0,
"learning_rate": 1.54230377166157e-05,
"loss": 0.5476,
"step": 1335
},
{
"epoch": 1.532491751542103,
"grad_norm": 40.5,
"learning_rate": 1.5417940876656474e-05,
"loss": 0.4921,
"step": 1336
},
{
"epoch": 1.533639363075599,
"grad_norm": 15.125,
"learning_rate": 1.541284403669725e-05,
"loss": 0.4748,
"step": 1337
},
{
"epoch": 1.5347869746090947,
"grad_norm": 35.25,
"learning_rate": 1.5407747196738022e-05,
"loss": 0.5071,
"step": 1338
},
{
"epoch": 1.5359345861425906,
"grad_norm": 26.5,
"learning_rate": 1.54026503567788e-05,
"loss": 0.2151,
"step": 1339
},
{
"epoch": 1.5370821976760867,
"grad_norm": 32.5,
"learning_rate": 1.5397553516819573e-05,
"loss": 0.4312,
"step": 1340
},
{
"epoch": 1.5382298092095825,
"grad_norm": 80.5,
"learning_rate": 1.5392456676860347e-05,
"loss": 0.6625,
"step": 1341
},
{
"epoch": 1.5393774207430786,
"grad_norm": 46.25,
"learning_rate": 1.5387359836901124e-05,
"loss": 0.3488,
"step": 1342
},
{
"epoch": 1.5405250322765744,
"grad_norm": 41.75,
"learning_rate": 1.5382262996941898e-05,
"loss": 0.5342,
"step": 1343
},
{
"epoch": 1.5416726438100703,
"grad_norm": 44.0,
"learning_rate": 1.5377166156982672e-05,
"loss": 0.4736,
"step": 1344
},
{
"epoch": 1.5428202553435661,
"grad_norm": 11.625,
"learning_rate": 1.5372069317023446e-05,
"loss": 0.3527,
"step": 1345
},
{
"epoch": 1.543967866877062,
"grad_norm": 31.75,
"learning_rate": 1.536697247706422e-05,
"loss": 0.6221,
"step": 1346
},
{
"epoch": 1.545115478410558,
"grad_norm": 47.0,
"learning_rate": 1.5361875637104997e-05,
"loss": 0.6081,
"step": 1347
},
{
"epoch": 1.546263089944054,
"grad_norm": 22.5,
"learning_rate": 1.535677879714577e-05,
"loss": 0.4948,
"step": 1348
},
{
"epoch": 1.54741070147755,
"grad_norm": 82.5,
"learning_rate": 1.5351681957186545e-05,
"loss": 0.7993,
"step": 1349
},
{
"epoch": 1.5485583130110459,
"grad_norm": 45.5,
"learning_rate": 1.534658511722732e-05,
"loss": 0.6413,
"step": 1350
},
{
"epoch": 1.5497059245445417,
"grad_norm": 14.875,
"learning_rate": 1.5341488277268096e-05,
"loss": 0.4485,
"step": 1351
},
{
"epoch": 1.5508535360780376,
"grad_norm": 50.0,
"learning_rate": 1.533639143730887e-05,
"loss": 1.0687,
"step": 1352
},
{
"epoch": 1.5520011476115334,
"grad_norm": 58.75,
"learning_rate": 1.5331294597349644e-05,
"loss": 1.0185,
"step": 1353
},
{
"epoch": 1.5531487591450293,
"grad_norm": 43.0,
"learning_rate": 1.532619775739042e-05,
"loss": 0.5074,
"step": 1354
},
{
"epoch": 1.5542963706785253,
"grad_norm": 47.25,
"learning_rate": 1.5321100917431192e-05,
"loss": 0.3446,
"step": 1355
},
{
"epoch": 1.5554439822120212,
"grad_norm": 37.25,
"learning_rate": 1.531600407747197e-05,
"loss": 0.3883,
"step": 1356
},
{
"epoch": 1.5565915937455173,
"grad_norm": 79.0,
"learning_rate": 1.5310907237512743e-05,
"loss": 0.8577,
"step": 1357
},
{
"epoch": 1.557739205279013,
"grad_norm": 45.0,
"learning_rate": 1.5305810397553517e-05,
"loss": 0.5937,
"step": 1358
},
{
"epoch": 1.558886816812509,
"grad_norm": 56.25,
"learning_rate": 1.5300713557594294e-05,
"loss": 0.8568,
"step": 1359
},
{
"epoch": 1.5600344283460048,
"grad_norm": 33.25,
"learning_rate": 1.5295616717635068e-05,
"loss": 0.4064,
"step": 1360
},
{
"epoch": 1.5611820398795007,
"grad_norm": 59.0,
"learning_rate": 1.5290519877675842e-05,
"loss": 0.6067,
"step": 1361
},
{
"epoch": 1.5623296514129967,
"grad_norm": 11.75,
"learning_rate": 1.528542303771662e-05,
"loss": 0.3149,
"step": 1362
},
{
"epoch": 1.5634772629464926,
"grad_norm": 38.0,
"learning_rate": 1.528032619775739e-05,
"loss": 0.5482,
"step": 1363
},
{
"epoch": 1.5646248744799887,
"grad_norm": 43.0,
"learning_rate": 1.5275229357798167e-05,
"loss": 0.3758,
"step": 1364
},
{
"epoch": 1.5657724860134845,
"grad_norm": 17.625,
"learning_rate": 1.527013251783894e-05,
"loss": 0.0865,
"step": 1365
},
{
"epoch": 1.5669200975469804,
"grad_norm": 79.0,
"learning_rate": 1.5265035677879715e-05,
"loss": 1.07,
"step": 1366
},
{
"epoch": 1.5680677090804762,
"grad_norm": 86.5,
"learning_rate": 1.5259938837920492e-05,
"loss": 1.2776,
"step": 1367
},
{
"epoch": 1.569215320613972,
"grad_norm": 65.0,
"learning_rate": 1.5254841997961264e-05,
"loss": 1.0829,
"step": 1368
},
{
"epoch": 1.570362932147468,
"grad_norm": 11.25,
"learning_rate": 1.524974515800204e-05,
"loss": 0.1616,
"step": 1369
},
{
"epoch": 1.571510543680964,
"grad_norm": 175.0,
"learning_rate": 1.5244648318042814e-05,
"loss": 0.788,
"step": 1370
},
{
"epoch": 1.57265815521446,
"grad_norm": 52.25,
"learning_rate": 1.523955147808359e-05,
"loss": 0.6801,
"step": 1371
},
{
"epoch": 1.573805766747956,
"grad_norm": 90.0,
"learning_rate": 1.5234454638124365e-05,
"loss": 1.1125,
"step": 1372
},
{
"epoch": 1.5749533782814518,
"grad_norm": 69.5,
"learning_rate": 1.5229357798165139e-05,
"loss": 0.7275,
"step": 1373
},
{
"epoch": 1.5761009898149476,
"grad_norm": 21.625,
"learning_rate": 1.5224260958205915e-05,
"loss": 0.2809,
"step": 1374
},
{
"epoch": 1.5772486013484435,
"grad_norm": 41.5,
"learning_rate": 1.5219164118246687e-05,
"loss": 1.0073,
"step": 1375
},
{
"epoch": 1.5783962128819393,
"grad_norm": 65.5,
"learning_rate": 1.5214067278287462e-05,
"loss": 0.6342,
"step": 1376
},
{
"epoch": 1.5795438244154354,
"grad_norm": 19.5,
"learning_rate": 1.5208970438328238e-05,
"loss": 0.2352,
"step": 1377
},
{
"epoch": 1.5806914359489312,
"grad_norm": 17.125,
"learning_rate": 1.5203873598369012e-05,
"loss": 0.5829,
"step": 1378
},
{
"epoch": 1.5818390474824273,
"grad_norm": 40.25,
"learning_rate": 1.5198776758409788e-05,
"loss": 0.5567,
"step": 1379
},
{
"epoch": 1.5829866590159232,
"grad_norm": 20.0,
"learning_rate": 1.5193679918450561e-05,
"loss": 0.6663,
"step": 1380
},
{
"epoch": 1.584134270549419,
"grad_norm": 84.0,
"learning_rate": 1.5188583078491337e-05,
"loss": 0.6399,
"step": 1381
},
{
"epoch": 1.5852818820829149,
"grad_norm": 13.5,
"learning_rate": 1.5183486238532111e-05,
"loss": 0.3913,
"step": 1382
},
{
"epoch": 1.5864294936164107,
"grad_norm": 39.75,
"learning_rate": 1.5178389398572887e-05,
"loss": 0.4537,
"step": 1383
},
{
"epoch": 1.5875771051499068,
"grad_norm": 10.3125,
"learning_rate": 1.5173292558613662e-05,
"loss": 0.5198,
"step": 1384
},
{
"epoch": 1.5887247166834026,
"grad_norm": 26.75,
"learning_rate": 1.5168195718654434e-05,
"loss": 0.4686,
"step": 1385
},
{
"epoch": 1.5898723282168987,
"grad_norm": 57.25,
"learning_rate": 1.516309887869521e-05,
"loss": 0.5172,
"step": 1386
},
{
"epoch": 1.5910199397503946,
"grad_norm": 82.0,
"learning_rate": 1.5158002038735984e-05,
"loss": 0.9411,
"step": 1387
},
{
"epoch": 1.5921675512838904,
"grad_norm": 31.0,
"learning_rate": 1.515290519877676e-05,
"loss": 0.3182,
"step": 1388
},
{
"epoch": 1.5933151628173863,
"grad_norm": 79.5,
"learning_rate": 1.5147808358817535e-05,
"loss": 0.7013,
"step": 1389
},
{
"epoch": 1.5944627743508821,
"grad_norm": 17.875,
"learning_rate": 1.5142711518858309e-05,
"loss": 0.5569,
"step": 1390
},
{
"epoch": 1.595610385884378,
"grad_norm": 23.375,
"learning_rate": 1.5137614678899085e-05,
"loss": 0.5306,
"step": 1391
},
{
"epoch": 1.596757997417874,
"grad_norm": 11.5,
"learning_rate": 1.5132517838939857e-05,
"loss": 0.2887,
"step": 1392
},
{
"epoch": 1.5979056089513701,
"grad_norm": 22.5,
"learning_rate": 1.5127420998980632e-05,
"loss": 0.5286,
"step": 1393
},
{
"epoch": 1.599053220484866,
"grad_norm": 21.0,
"learning_rate": 1.5122324159021408e-05,
"loss": 0.3716,
"step": 1394
},
{
"epoch": 1.6002008320183618,
"grad_norm": 63.25,
"learning_rate": 1.5117227319062182e-05,
"loss": 0.7257,
"step": 1395
},
{
"epoch": 1.6013484435518577,
"grad_norm": 7.6875,
"learning_rate": 1.5112130479102958e-05,
"loss": 0.127,
"step": 1396
},
{
"epoch": 1.6024960550853535,
"grad_norm": 17.0,
"learning_rate": 1.5107033639143731e-05,
"loss": 0.2272,
"step": 1397
},
{
"epoch": 1.6036436666188494,
"grad_norm": 30.875,
"learning_rate": 1.5101936799184507e-05,
"loss": 0.4778,
"step": 1398
},
{
"epoch": 1.6047912781523455,
"grad_norm": 19.5,
"learning_rate": 1.5096839959225283e-05,
"loss": 0.5537,
"step": 1399
},
{
"epoch": 1.6059388896858413,
"grad_norm": 57.5,
"learning_rate": 1.5091743119266057e-05,
"loss": 0.6817,
"step": 1400
},
{
"epoch": 1.6059388896858413,
"eval_accuracy": 0.63,
"eval_loss": 0.49080872535705566,
"eval_runtime": 49.7511,
"eval_samples_per_second": 2.01,
"eval_steps_per_second": 2.01,
"step": 1400
},
{
"epoch": 1.6070865012193374,
"grad_norm": 14.125,
"learning_rate": 1.5086646279306832e-05,
"loss": 0.6062,
"step": 1401
},
{
"epoch": 1.6082341127528332,
"grad_norm": 9.625,
"learning_rate": 1.5081549439347604e-05,
"loss": 0.2577,
"step": 1402
},
{
"epoch": 1.609381724286329,
"grad_norm": 19.25,
"learning_rate": 1.507645259938838e-05,
"loss": 0.4531,
"step": 1403
},
{
"epoch": 1.610529335819825,
"grad_norm": 41.0,
"learning_rate": 1.5071355759429156e-05,
"loss": 0.6092,
"step": 1404
},
{
"epoch": 1.6116769473533208,
"grad_norm": 34.0,
"learning_rate": 1.506625891946993e-05,
"loss": 0.4515,
"step": 1405
},
{
"epoch": 1.6128245588868169,
"grad_norm": 23.75,
"learning_rate": 1.5061162079510705e-05,
"loss": 0.5269,
"step": 1406
},
{
"epoch": 1.6139721704203127,
"grad_norm": 31.75,
"learning_rate": 1.5056065239551479e-05,
"loss": 0.4641,
"step": 1407
},
{
"epoch": 1.6151197819538088,
"grad_norm": 33.75,
"learning_rate": 1.5050968399592255e-05,
"loss": 0.3172,
"step": 1408
},
{
"epoch": 1.6162673934873046,
"grad_norm": 9.9375,
"learning_rate": 1.504587155963303e-05,
"loss": 0.231,
"step": 1409
},
{
"epoch": 1.6174150050208005,
"grad_norm": 10.5,
"learning_rate": 1.5040774719673802e-05,
"loss": 0.19,
"step": 1410
},
{
"epoch": 1.6185626165542963,
"grad_norm": 26.375,
"learning_rate": 1.5035677879714578e-05,
"loss": 0.6969,
"step": 1411
},
{
"epoch": 1.6197102280877922,
"grad_norm": 11.5625,
"learning_rate": 1.5030581039755352e-05,
"loss": 0.3084,
"step": 1412
},
{
"epoch": 1.620857839621288,
"grad_norm": 33.5,
"learning_rate": 1.5025484199796127e-05,
"loss": 0.9029,
"step": 1413
},
{
"epoch": 1.6220054511547841,
"grad_norm": 26.75,
"learning_rate": 1.5020387359836903e-05,
"loss": 0.7869,
"step": 1414
},
{
"epoch": 1.62315306268828,
"grad_norm": 18.5,
"learning_rate": 1.5015290519877677e-05,
"loss": 0.555,
"step": 1415
},
{
"epoch": 1.624300674221776,
"grad_norm": 26.25,
"learning_rate": 1.5010193679918453e-05,
"loss": 0.8343,
"step": 1416
},
{
"epoch": 1.625448285755272,
"grad_norm": 18.625,
"learning_rate": 1.5005096839959225e-05,
"loss": 0.4117,
"step": 1417
},
{
"epoch": 1.6265958972887677,
"grad_norm": 30.75,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.3806,
"step": 1418
},
{
"epoch": 1.6277435088222636,
"grad_norm": 24.375,
"learning_rate": 1.4994903160040778e-05,
"loss": 0.4463,
"step": 1419
},
{
"epoch": 1.6288911203557594,
"grad_norm": 14.5,
"learning_rate": 1.498980632008155e-05,
"loss": 0.1973,
"step": 1420
},
{
"epoch": 1.6300387318892555,
"grad_norm": 19.25,
"learning_rate": 1.4984709480122325e-05,
"loss": 0.689,
"step": 1421
},
{
"epoch": 1.6311863434227514,
"grad_norm": 21.125,
"learning_rate": 1.49796126401631e-05,
"loss": 0.5135,
"step": 1422
},
{
"epoch": 1.6323339549562474,
"grad_norm": 29.5,
"learning_rate": 1.4974515800203875e-05,
"loss": 0.3149,
"step": 1423
},
{
"epoch": 1.6334815664897433,
"grad_norm": 13.1875,
"learning_rate": 1.496941896024465e-05,
"loss": 0.2246,
"step": 1424
},
{
"epoch": 1.6346291780232391,
"grad_norm": 76.0,
"learning_rate": 1.4964322120285424e-05,
"loss": 0.7469,
"step": 1425
},
{
"epoch": 1.635776789556735,
"grad_norm": 90.5,
"learning_rate": 1.49592252803262e-05,
"loss": 0.8995,
"step": 1426
},
{
"epoch": 1.6369244010902309,
"grad_norm": 23.25,
"learning_rate": 1.4954128440366972e-05,
"loss": 0.7444,
"step": 1427
},
{
"epoch": 1.6380720126237267,
"grad_norm": 18.0,
"learning_rate": 1.4949031600407748e-05,
"loss": 0.4238,
"step": 1428
},
{
"epoch": 1.6392196241572228,
"grad_norm": 31.25,
"learning_rate": 1.4943934760448523e-05,
"loss": 0.4251,
"step": 1429
},
{
"epoch": 1.6403672356907189,
"grad_norm": 20.0,
"learning_rate": 1.4938837920489297e-05,
"loss": 0.6363,
"step": 1430
},
{
"epoch": 1.6415148472242147,
"grad_norm": 42.0,
"learning_rate": 1.4933741080530073e-05,
"loss": 0.6076,
"step": 1431
},
{
"epoch": 1.6426624587577106,
"grad_norm": 62.0,
"learning_rate": 1.4928644240570847e-05,
"loss": 0.5137,
"step": 1432
},
{
"epoch": 1.6438100702912064,
"grad_norm": 26.125,
"learning_rate": 1.4923547400611623e-05,
"loss": 0.4482,
"step": 1433
},
{
"epoch": 1.6449576818247023,
"grad_norm": 66.0,
"learning_rate": 1.4918450560652398e-05,
"loss": 0.9944,
"step": 1434
},
{
"epoch": 1.646105293358198,
"grad_norm": 21.25,
"learning_rate": 1.491335372069317e-05,
"loss": 0.2451,
"step": 1435
},
{
"epoch": 1.6472529048916942,
"grad_norm": 18.0,
"learning_rate": 1.4908256880733946e-05,
"loss": 0.699,
"step": 1436
},
{
"epoch": 1.64840051642519,
"grad_norm": 17.125,
"learning_rate": 1.490316004077472e-05,
"loss": 0.4074,
"step": 1437
},
{
"epoch": 1.649548127958686,
"grad_norm": 59.25,
"learning_rate": 1.4898063200815495e-05,
"loss": 0.4132,
"step": 1438
},
{
"epoch": 1.650695739492182,
"grad_norm": 20.5,
"learning_rate": 1.4892966360856271e-05,
"loss": 0.339,
"step": 1439
},
{
"epoch": 1.6518433510256778,
"grad_norm": 27.0,
"learning_rate": 1.4887869520897045e-05,
"loss": 0.3662,
"step": 1440
},
{
"epoch": 1.6529909625591737,
"grad_norm": 9.5625,
"learning_rate": 1.488277268093782e-05,
"loss": 0.3819,
"step": 1441
},
{
"epoch": 1.6541385740926695,
"grad_norm": 130.0,
"learning_rate": 1.4877675840978594e-05,
"loss": 0.4787,
"step": 1442
},
{
"epoch": 1.6552861856261656,
"grad_norm": 29.375,
"learning_rate": 1.487257900101937e-05,
"loss": 0.4502,
"step": 1443
},
{
"epoch": 1.6564337971596614,
"grad_norm": 28.125,
"learning_rate": 1.4867482161060146e-05,
"loss": 0.4953,
"step": 1444
},
{
"epoch": 1.6575814086931575,
"grad_norm": 36.0,
"learning_rate": 1.4862385321100918e-05,
"loss": 0.9421,
"step": 1445
},
{
"epoch": 1.6587290202266534,
"grad_norm": 35.0,
"learning_rate": 1.4857288481141693e-05,
"loss": 0.3018,
"step": 1446
},
{
"epoch": 1.6598766317601492,
"grad_norm": 18.625,
"learning_rate": 1.4852191641182467e-05,
"loss": 0.2527,
"step": 1447
},
{
"epoch": 1.661024243293645,
"grad_norm": 12.5,
"learning_rate": 1.4847094801223243e-05,
"loss": 0.3482,
"step": 1448
},
{
"epoch": 1.662171854827141,
"grad_norm": 18.75,
"learning_rate": 1.4841997961264019e-05,
"loss": 0.1798,
"step": 1449
},
{
"epoch": 1.6633194663606368,
"grad_norm": 6.34375,
"learning_rate": 1.4836901121304792e-05,
"loss": 0.1118,
"step": 1450
},
{
"epoch": 1.6644670778941328,
"grad_norm": 21.375,
"learning_rate": 1.4831804281345568e-05,
"loss": 0.5154,
"step": 1451
},
{
"epoch": 1.665614689427629,
"grad_norm": 57.75,
"learning_rate": 1.482670744138634e-05,
"loss": 0.845,
"step": 1452
},
{
"epoch": 1.6667623009611248,
"grad_norm": 31.875,
"learning_rate": 1.4821610601427116e-05,
"loss": 0.6743,
"step": 1453
},
{
"epoch": 1.6679099124946206,
"grad_norm": 30.5,
"learning_rate": 1.4816513761467891e-05,
"loss": 0.6286,
"step": 1454
},
{
"epoch": 1.6690575240281165,
"grad_norm": 26.25,
"learning_rate": 1.4811416921508665e-05,
"loss": 0.2807,
"step": 1455
},
{
"epoch": 1.6702051355616123,
"grad_norm": 21.25,
"learning_rate": 1.4806320081549441e-05,
"loss": 0.5438,
"step": 1456
},
{
"epoch": 1.6713527470951082,
"grad_norm": 15.875,
"learning_rate": 1.4801223241590215e-05,
"loss": 0.4873,
"step": 1457
},
{
"epoch": 1.6725003586286042,
"grad_norm": 12.3125,
"learning_rate": 1.479612640163099e-05,
"loss": 0.2455,
"step": 1458
},
{
"epoch": 1.6736479701621,
"grad_norm": 36.25,
"learning_rate": 1.4791029561671764e-05,
"loss": 0.6741,
"step": 1459
},
{
"epoch": 1.6747955816955962,
"grad_norm": 36.25,
"learning_rate": 1.478593272171254e-05,
"loss": 0.2113,
"step": 1460
},
{
"epoch": 1.675943193229092,
"grad_norm": 15.0625,
"learning_rate": 1.4780835881753316e-05,
"loss": 0.3223,
"step": 1461
},
{
"epoch": 1.6770908047625879,
"grad_norm": 70.5,
"learning_rate": 1.4775739041794088e-05,
"loss": 0.7413,
"step": 1462
},
{
"epoch": 1.6782384162960837,
"grad_norm": 50.25,
"learning_rate": 1.4770642201834863e-05,
"loss": 0.5802,
"step": 1463
},
{
"epoch": 1.6793860278295796,
"grad_norm": 13.3125,
"learning_rate": 1.4765545361875637e-05,
"loss": 0.3988,
"step": 1464
},
{
"epoch": 1.6805336393630756,
"grad_norm": 22.125,
"learning_rate": 1.4760448521916413e-05,
"loss": 0.2763,
"step": 1465
},
{
"epoch": 1.6816812508965715,
"grad_norm": 63.5,
"learning_rate": 1.4755351681957188e-05,
"loss": 0.5855,
"step": 1466
},
{
"epoch": 1.6828288624300676,
"grad_norm": 41.75,
"learning_rate": 1.4750254841997962e-05,
"loss": 0.413,
"step": 1467
},
{
"epoch": 1.6839764739635634,
"grad_norm": 75.0,
"learning_rate": 1.4745158002038738e-05,
"loss": 1.2905,
"step": 1468
},
{
"epoch": 1.6851240854970593,
"grad_norm": 35.0,
"learning_rate": 1.474006116207951e-05,
"loss": 0.5774,
"step": 1469
},
{
"epoch": 1.6862716970305551,
"grad_norm": 28.25,
"learning_rate": 1.4734964322120286e-05,
"loss": 0.8901,
"step": 1470
},
{
"epoch": 1.687419308564051,
"grad_norm": 35.0,
"learning_rate": 1.4729867482161061e-05,
"loss": 0.457,
"step": 1471
},
{
"epoch": 1.6885669200975468,
"grad_norm": 34.0,
"learning_rate": 1.4724770642201835e-05,
"loss": 0.4638,
"step": 1472
},
{
"epoch": 1.689714531631043,
"grad_norm": 37.5,
"learning_rate": 1.4719673802242611e-05,
"loss": 0.5084,
"step": 1473
},
{
"epoch": 1.6908621431645388,
"grad_norm": 26.625,
"learning_rate": 1.4714576962283385e-05,
"loss": 0.2821,
"step": 1474
},
{
"epoch": 1.6920097546980348,
"grad_norm": 34.25,
"learning_rate": 1.470948012232416e-05,
"loss": 0.3812,
"step": 1475
},
{
"epoch": 1.6931573662315307,
"grad_norm": 59.0,
"learning_rate": 1.4704383282364936e-05,
"loss": 0.5477,
"step": 1476
},
{
"epoch": 1.6943049777650265,
"grad_norm": 28.25,
"learning_rate": 1.469928644240571e-05,
"loss": 0.6984,
"step": 1477
},
{
"epoch": 1.6954525892985224,
"grad_norm": 69.5,
"learning_rate": 1.4694189602446486e-05,
"loss": 0.7855,
"step": 1478
},
{
"epoch": 1.6966002008320182,
"grad_norm": 49.0,
"learning_rate": 1.4689092762487258e-05,
"loss": 0.984,
"step": 1479
},
{
"epoch": 1.6977478123655143,
"grad_norm": 22.875,
"learning_rate": 1.4683995922528033e-05,
"loss": 0.6088,
"step": 1480
},
{
"epoch": 1.6988954238990102,
"grad_norm": 17.875,
"learning_rate": 1.4678899082568809e-05,
"loss": 0.1793,
"step": 1481
},
{
"epoch": 1.7000430354325062,
"grad_norm": 22.375,
"learning_rate": 1.4673802242609583e-05,
"loss": 0.4399,
"step": 1482
},
{
"epoch": 1.701190646966002,
"grad_norm": 44.75,
"learning_rate": 1.4668705402650358e-05,
"loss": 0.8196,
"step": 1483
},
{
"epoch": 1.702338258499498,
"grad_norm": 17.75,
"learning_rate": 1.4663608562691132e-05,
"loss": 0.3481,
"step": 1484
},
{
"epoch": 1.7034858700329938,
"grad_norm": 30.75,
"learning_rate": 1.4658511722731908e-05,
"loss": 0.5881,
"step": 1485
},
{
"epoch": 1.7046334815664896,
"grad_norm": 54.5,
"learning_rate": 1.4653414882772684e-05,
"loss": 0.9103,
"step": 1486
},
{
"epoch": 1.7057810930999855,
"grad_norm": 22.0,
"learning_rate": 1.4648318042813456e-05,
"loss": 0.9757,
"step": 1487
},
{
"epoch": 1.7069287046334816,
"grad_norm": 41.25,
"learning_rate": 1.4643221202854231e-05,
"loss": 0.2791,
"step": 1488
},
{
"epoch": 1.7080763161669776,
"grad_norm": 72.5,
"learning_rate": 1.4638124362895005e-05,
"loss": 0.6413,
"step": 1489
},
{
"epoch": 1.7092239277004735,
"grad_norm": 31.25,
"learning_rate": 1.463302752293578e-05,
"loss": 0.6097,
"step": 1490
},
{
"epoch": 1.7103715392339693,
"grad_norm": 31.625,
"learning_rate": 1.4627930682976556e-05,
"loss": 0.6532,
"step": 1491
},
{
"epoch": 1.7115191507674652,
"grad_norm": 23.75,
"learning_rate": 1.462283384301733e-05,
"loss": 0.5511,
"step": 1492
},
{
"epoch": 1.712666762300961,
"grad_norm": 44.25,
"learning_rate": 1.4617737003058106e-05,
"loss": 0.5933,
"step": 1493
},
{
"epoch": 1.713814373834457,
"grad_norm": 175.0,
"learning_rate": 1.461264016309888e-05,
"loss": 0.8476,
"step": 1494
},
{
"epoch": 1.714961985367953,
"grad_norm": 12.0625,
"learning_rate": 1.4607543323139655e-05,
"loss": 0.2916,
"step": 1495
},
{
"epoch": 1.7161095969014488,
"grad_norm": 40.0,
"learning_rate": 1.4602446483180431e-05,
"loss": 0.4779,
"step": 1496
},
{
"epoch": 1.717257208434945,
"grad_norm": 19.25,
"learning_rate": 1.4597349643221203e-05,
"loss": 0.3403,
"step": 1497
},
{
"epoch": 1.7184048199684407,
"grad_norm": 20.125,
"learning_rate": 1.4592252803261979e-05,
"loss": 0.4278,
"step": 1498
},
{
"epoch": 1.7195524315019366,
"grad_norm": 11.125,
"learning_rate": 1.4587155963302753e-05,
"loss": 0.4435,
"step": 1499
},
{
"epoch": 1.7207000430354324,
"grad_norm": 47.75,
"learning_rate": 1.4582059123343528e-05,
"loss": 0.6405,
"step": 1500
},
{
"epoch": 1.7207000430354324,
"eval_accuracy": 0.64,
"eval_loss": 0.4719592034816742,
"eval_runtime": 49.6324,
"eval_samples_per_second": 2.015,
"eval_steps_per_second": 2.015,
"step": 1500
},
{
"epoch": 1.7218476545689283,
"grad_norm": 11.5,
"learning_rate": 1.4576962283384304e-05,
"loss": 0.3975,
"step": 1501
},
{
"epoch": 1.7229952661024244,
"grad_norm": 20.875,
"learning_rate": 1.4571865443425078e-05,
"loss": 0.3939,
"step": 1502
},
{
"epoch": 1.7241428776359202,
"grad_norm": 44.25,
"learning_rate": 1.4566768603465853e-05,
"loss": 0.7124,
"step": 1503
},
{
"epoch": 1.7252904891694163,
"grad_norm": 33.0,
"learning_rate": 1.4561671763506626e-05,
"loss": 0.5179,
"step": 1504
},
{
"epoch": 1.7264381007029121,
"grad_norm": 13.9375,
"learning_rate": 1.4556574923547401e-05,
"loss": 0.6342,
"step": 1505
},
{
"epoch": 1.727585712236408,
"grad_norm": 20.75,
"learning_rate": 1.4551478083588177e-05,
"loss": 0.397,
"step": 1506
},
{
"epoch": 1.7287333237699039,
"grad_norm": 12.375,
"learning_rate": 1.454638124362895e-05,
"loss": 0.3495,
"step": 1507
},
{
"epoch": 1.7298809353033997,
"grad_norm": 53.75,
"learning_rate": 1.4541284403669726e-05,
"loss": 0.5092,
"step": 1508
},
{
"epoch": 1.7310285468368956,
"grad_norm": 14.25,
"learning_rate": 1.45361875637105e-05,
"loss": 0.1927,
"step": 1509
},
{
"epoch": 1.7321761583703916,
"grad_norm": 20.875,
"learning_rate": 1.4531090723751276e-05,
"loss": 0.7156,
"step": 1510
},
{
"epoch": 1.7333237699038875,
"grad_norm": 8.6875,
"learning_rate": 1.4525993883792051e-05,
"loss": 0.3399,
"step": 1511
},
{
"epoch": 1.7344713814373836,
"grad_norm": 16.125,
"learning_rate": 1.4520897043832824e-05,
"loss": 0.5978,
"step": 1512
},
{
"epoch": 1.7356189929708794,
"grad_norm": 42.0,
"learning_rate": 1.45158002038736e-05,
"loss": 0.9311,
"step": 1513
},
{
"epoch": 1.7367666045043753,
"grad_norm": 70.5,
"learning_rate": 1.4510703363914373e-05,
"loss": 0.7334,
"step": 1514
},
{
"epoch": 1.737914216037871,
"grad_norm": 16.625,
"learning_rate": 1.4505606523955149e-05,
"loss": 0.4106,
"step": 1515
},
{
"epoch": 1.739061827571367,
"grad_norm": 12.0,
"learning_rate": 1.4500509683995924e-05,
"loss": 0.2984,
"step": 1516
},
{
"epoch": 1.740209439104863,
"grad_norm": 27.125,
"learning_rate": 1.4495412844036698e-05,
"loss": 0.3245,
"step": 1517
},
{
"epoch": 1.7413570506383589,
"grad_norm": 40.25,
"learning_rate": 1.4490316004077474e-05,
"loss": 0.5248,
"step": 1518
},
{
"epoch": 1.742504662171855,
"grad_norm": 15.5,
"learning_rate": 1.4485219164118248e-05,
"loss": 0.3244,
"step": 1519
},
{
"epoch": 1.7436522737053508,
"grad_norm": 70.5,
"learning_rate": 1.4480122324159023e-05,
"loss": 0.9236,
"step": 1520
},
{
"epoch": 1.7447998852388467,
"grad_norm": 30.625,
"learning_rate": 1.4475025484199799e-05,
"loss": 0.8874,
"step": 1521
},
{
"epoch": 1.7459474967723425,
"grad_norm": 11.6875,
"learning_rate": 1.4469928644240571e-05,
"loss": 0.3286,
"step": 1522
},
{
"epoch": 1.7470951083058384,
"grad_norm": 26.875,
"learning_rate": 1.4464831804281347e-05,
"loss": 0.3404,
"step": 1523
},
{
"epoch": 1.7482427198393344,
"grad_norm": 15.375,
"learning_rate": 1.445973496432212e-05,
"loss": 0.4482,
"step": 1524
},
{
"epoch": 1.7493903313728303,
"grad_norm": 27.0,
"learning_rate": 1.4454638124362896e-05,
"loss": 0.476,
"step": 1525
},
{
"epoch": 1.7505379429063264,
"grad_norm": 20.5,
"learning_rate": 1.4449541284403672e-05,
"loss": 0.3796,
"step": 1526
},
{
"epoch": 1.7516855544398222,
"grad_norm": 47.75,
"learning_rate": 1.4444444444444446e-05,
"loss": 0.5618,
"step": 1527
},
{
"epoch": 1.752833165973318,
"grad_norm": 29.5,
"learning_rate": 1.4439347604485221e-05,
"loss": 0.4359,
"step": 1528
},
{
"epoch": 1.753980777506814,
"grad_norm": 52.25,
"learning_rate": 1.4434250764525994e-05,
"loss": 0.6163,
"step": 1529
},
{
"epoch": 1.7551283890403098,
"grad_norm": 19.125,
"learning_rate": 1.442915392456677e-05,
"loss": 0.5202,
"step": 1530
},
{
"epoch": 1.7562760005738056,
"grad_norm": 14.0,
"learning_rate": 1.4424057084607545e-05,
"loss": 0.3921,
"step": 1531
},
{
"epoch": 1.7574236121073017,
"grad_norm": 64.0,
"learning_rate": 1.4418960244648319e-05,
"loss": 0.7896,
"step": 1532
},
{
"epoch": 1.7585712236407975,
"grad_norm": 23.5,
"learning_rate": 1.4413863404689094e-05,
"loss": 0.4141,
"step": 1533
},
{
"epoch": 1.7597188351742936,
"grad_norm": 39.75,
"learning_rate": 1.4408766564729868e-05,
"loss": 0.8279,
"step": 1534
},
{
"epoch": 1.7608664467077895,
"grad_norm": 60.5,
"learning_rate": 1.4403669724770644e-05,
"loss": 0.6541,
"step": 1535
},
{
"epoch": 1.7620140582412853,
"grad_norm": 22.375,
"learning_rate": 1.4398572884811418e-05,
"loss": 0.4579,
"step": 1536
},
{
"epoch": 1.7631616697747812,
"grad_norm": 34.5,
"learning_rate": 1.4393476044852193e-05,
"loss": 0.3177,
"step": 1537
},
{
"epoch": 1.764309281308277,
"grad_norm": 13.9375,
"learning_rate": 1.4388379204892969e-05,
"loss": 0.405,
"step": 1538
},
{
"epoch": 1.765456892841773,
"grad_norm": 45.25,
"learning_rate": 1.4383282364933741e-05,
"loss": 0.4536,
"step": 1539
},
{
"epoch": 1.766604504375269,
"grad_norm": 15.0,
"learning_rate": 1.4378185524974517e-05,
"loss": 0.658,
"step": 1540
},
{
"epoch": 1.767752115908765,
"grad_norm": 23.125,
"learning_rate": 1.437308868501529e-05,
"loss": 0.5647,
"step": 1541
},
{
"epoch": 1.7688997274422609,
"grad_norm": 49.5,
"learning_rate": 1.4367991845056066e-05,
"loss": 0.6544,
"step": 1542
},
{
"epoch": 1.7700473389757567,
"grad_norm": 14.625,
"learning_rate": 1.4362895005096842e-05,
"loss": 0.3288,
"step": 1543
},
{
"epoch": 1.7711949505092526,
"grad_norm": 14.875,
"learning_rate": 1.4357798165137616e-05,
"loss": 0.5407,
"step": 1544
},
{
"epoch": 1.7723425620427484,
"grad_norm": 69.0,
"learning_rate": 1.4352701325178391e-05,
"loss": 0.4395,
"step": 1545
},
{
"epoch": 1.7734901735762443,
"grad_norm": 32.5,
"learning_rate": 1.4347604485219164e-05,
"loss": 0.4165,
"step": 1546
},
{
"epoch": 1.7746377851097404,
"grad_norm": 52.25,
"learning_rate": 1.434250764525994e-05,
"loss": 0.455,
"step": 1547
},
{
"epoch": 1.7757853966432364,
"grad_norm": 26.875,
"learning_rate": 1.4337410805300715e-05,
"loss": 0.5133,
"step": 1548
},
{
"epoch": 1.7769330081767323,
"grad_norm": 63.75,
"learning_rate": 1.4332313965341489e-05,
"loss": 0.8173,
"step": 1549
},
{
"epoch": 1.7780806197102281,
"grad_norm": 69.5,
"learning_rate": 1.4327217125382264e-05,
"loss": 0.7585,
"step": 1550
},
{
"epoch": 1.779228231243724,
"grad_norm": 12.25,
"learning_rate": 1.4322120285423038e-05,
"loss": 0.4586,
"step": 1551
},
{
"epoch": 1.7803758427772198,
"grad_norm": 76.0,
"learning_rate": 1.4317023445463814e-05,
"loss": 0.6924,
"step": 1552
},
{
"epoch": 1.7815234543107157,
"grad_norm": 12.4375,
"learning_rate": 1.431192660550459e-05,
"loss": 0.3333,
"step": 1553
},
{
"epoch": 1.7826710658442118,
"grad_norm": 23.5,
"learning_rate": 1.4306829765545363e-05,
"loss": 0.7329,
"step": 1554
},
{
"epoch": 1.7838186773777076,
"grad_norm": 22.875,
"learning_rate": 1.4301732925586139e-05,
"loss": 0.2949,
"step": 1555
},
{
"epoch": 1.7849662889112037,
"grad_norm": 52.0,
"learning_rate": 1.4296636085626911e-05,
"loss": 0.6708,
"step": 1556
},
{
"epoch": 1.7861139004446995,
"grad_norm": 75.0,
"learning_rate": 1.4291539245667687e-05,
"loss": 0.6416,
"step": 1557
},
{
"epoch": 1.7872615119781954,
"grad_norm": 16.0,
"learning_rate": 1.4286442405708462e-05,
"loss": 0.1615,
"step": 1558
},
{
"epoch": 1.7884091235116912,
"grad_norm": 13.8125,
"learning_rate": 1.4281345565749236e-05,
"loss": 0.2567,
"step": 1559
},
{
"epoch": 1.789556735045187,
"grad_norm": 27.125,
"learning_rate": 1.4276248725790012e-05,
"loss": 0.3011,
"step": 1560
},
{
"epoch": 1.7907043465786832,
"grad_norm": 37.5,
"learning_rate": 1.4271151885830786e-05,
"loss": 0.4136,
"step": 1561
},
{
"epoch": 1.791851958112179,
"grad_norm": 64.0,
"learning_rate": 1.4266055045871561e-05,
"loss": 0.5132,
"step": 1562
},
{
"epoch": 1.792999569645675,
"grad_norm": 23.5,
"learning_rate": 1.4260958205912337e-05,
"loss": 0.8581,
"step": 1563
},
{
"epoch": 1.794147181179171,
"grad_norm": 35.75,
"learning_rate": 1.4255861365953109e-05,
"loss": 0.4336,
"step": 1564
},
{
"epoch": 1.7952947927126668,
"grad_norm": 34.5,
"learning_rate": 1.4250764525993885e-05,
"loss": 0.7922,
"step": 1565
},
{
"epoch": 1.7964424042461626,
"grad_norm": 12.375,
"learning_rate": 1.4245667686034659e-05,
"loss": 0.385,
"step": 1566
},
{
"epoch": 1.7975900157796585,
"grad_norm": 22.125,
"learning_rate": 1.4240570846075434e-05,
"loss": 0.1375,
"step": 1567
},
{
"epoch": 1.7987376273131543,
"grad_norm": 49.25,
"learning_rate": 1.423547400611621e-05,
"loss": 0.2854,
"step": 1568
},
{
"epoch": 1.7998852388466504,
"grad_norm": 74.5,
"learning_rate": 1.4230377166156984e-05,
"loss": 0.8727,
"step": 1569
},
{
"epoch": 1.8010328503801463,
"grad_norm": 5.875,
"learning_rate": 1.422528032619776e-05,
"loss": 0.0896,
"step": 1570
},
{
"epoch": 1.8021804619136423,
"grad_norm": 23.0,
"learning_rate": 1.4220183486238533e-05,
"loss": 0.709,
"step": 1571
},
{
"epoch": 1.8033280734471382,
"grad_norm": 8.5,
"learning_rate": 1.4215086646279309e-05,
"loss": 0.1583,
"step": 1572
},
{
"epoch": 1.804475684980634,
"grad_norm": 45.25,
"learning_rate": 1.4209989806320084e-05,
"loss": 0.3676,
"step": 1573
},
{
"epoch": 1.80562329651413,
"grad_norm": 14.125,
"learning_rate": 1.4204892966360857e-05,
"loss": 0.2197,
"step": 1574
},
{
"epoch": 1.8067709080476257,
"grad_norm": 8.6875,
"learning_rate": 1.4199796126401632e-05,
"loss": 0.2591,
"step": 1575
},
{
"epoch": 1.8079185195811218,
"grad_norm": 43.25,
"learning_rate": 1.4194699286442406e-05,
"loss": 0.4846,
"step": 1576
},
{
"epoch": 1.8090661311146177,
"grad_norm": 32.0,
"learning_rate": 1.4189602446483182e-05,
"loss": 0.2703,
"step": 1577
},
{
"epoch": 1.8102137426481137,
"grad_norm": 46.25,
"learning_rate": 1.4184505606523957e-05,
"loss": 0.6256,
"step": 1578
},
{
"epoch": 1.8113613541816096,
"grad_norm": 38.25,
"learning_rate": 1.4179408766564731e-05,
"loss": 1.0764,
"step": 1579
},
{
"epoch": 1.8125089657151054,
"grad_norm": 21.625,
"learning_rate": 1.4174311926605507e-05,
"loss": 0.1879,
"step": 1580
},
{
"epoch": 1.8136565772486013,
"grad_norm": 25.375,
"learning_rate": 1.4169215086646279e-05,
"loss": 0.8602,
"step": 1581
},
{
"epoch": 1.8148041887820971,
"grad_norm": 73.0,
"learning_rate": 1.4164118246687055e-05,
"loss": 0.6298,
"step": 1582
},
{
"epoch": 1.8159518003155932,
"grad_norm": 33.0,
"learning_rate": 1.415902140672783e-05,
"loss": 0.3714,
"step": 1583
},
{
"epoch": 1.817099411849089,
"grad_norm": 13.9375,
"learning_rate": 1.4153924566768604e-05,
"loss": 0.2252,
"step": 1584
},
{
"epoch": 1.8182470233825851,
"grad_norm": 42.5,
"learning_rate": 1.414882772680938e-05,
"loss": 0.577,
"step": 1585
},
{
"epoch": 1.819394634916081,
"grad_norm": 28.375,
"learning_rate": 1.4143730886850154e-05,
"loss": 0.5294,
"step": 1586
},
{
"epoch": 1.8205422464495769,
"grad_norm": 29.25,
"learning_rate": 1.413863404689093e-05,
"loss": 0.4661,
"step": 1587
},
{
"epoch": 1.8216898579830727,
"grad_norm": 15.6875,
"learning_rate": 1.4133537206931705e-05,
"loss": 0.358,
"step": 1588
},
{
"epoch": 1.8228374695165686,
"grad_norm": 42.0,
"learning_rate": 1.4128440366972477e-05,
"loss": 0.5276,
"step": 1589
},
{
"epoch": 1.8239850810500644,
"grad_norm": 98.5,
"learning_rate": 1.4123343527013254e-05,
"loss": 0.6566,
"step": 1590
},
{
"epoch": 1.8251326925835605,
"grad_norm": 37.0,
"learning_rate": 1.4118246687054027e-05,
"loss": 0.2234,
"step": 1591
},
{
"epoch": 1.8262803041170563,
"grad_norm": 49.5,
"learning_rate": 1.4113149847094802e-05,
"loss": 0.5727,
"step": 1592
},
{
"epoch": 1.8274279156505524,
"grad_norm": 31.75,
"learning_rate": 1.4108053007135578e-05,
"loss": 0.7391,
"step": 1593
},
{
"epoch": 1.8285755271840483,
"grad_norm": 81.0,
"learning_rate": 1.4102956167176352e-05,
"loss": 0.762,
"step": 1594
},
{
"epoch": 1.829723138717544,
"grad_norm": 56.0,
"learning_rate": 1.4097859327217127e-05,
"loss": 0.371,
"step": 1595
},
{
"epoch": 1.83087075025104,
"grad_norm": 33.75,
"learning_rate": 1.4092762487257901e-05,
"loss": 0.5857,
"step": 1596
},
{
"epoch": 1.8320183617845358,
"grad_norm": 15.0625,
"learning_rate": 1.4087665647298677e-05,
"loss": 0.2163,
"step": 1597
},
{
"epoch": 1.8331659733180319,
"grad_norm": 21.25,
"learning_rate": 1.4082568807339452e-05,
"loss": 0.4766,
"step": 1598
},
{
"epoch": 1.8343135848515277,
"grad_norm": 49.75,
"learning_rate": 1.4077471967380225e-05,
"loss": 0.3923,
"step": 1599
},
{
"epoch": 1.8354611963850238,
"grad_norm": 38.25,
"learning_rate": 1.4072375127421e-05,
"loss": 0.445,
"step": 1600
},
{
"epoch": 1.8354611963850238,
"eval_accuracy": 0.69,
"eval_loss": 0.5018435120582581,
"eval_runtime": 49.4827,
"eval_samples_per_second": 2.021,
"eval_steps_per_second": 2.021,
"step": 1600
},
{
"epoch": 1.8366088079185197,
"grad_norm": 36.25,
"learning_rate": 1.4067278287461774e-05,
"loss": 0.7721,
"step": 1601
},
{
"epoch": 1.8377564194520155,
"grad_norm": 26.875,
"learning_rate": 1.406218144750255e-05,
"loss": 0.9496,
"step": 1602
},
{
"epoch": 1.8389040309855114,
"grad_norm": 47.5,
"learning_rate": 1.4057084607543325e-05,
"loss": 0.5079,
"step": 1603
},
{
"epoch": 1.8400516425190072,
"grad_norm": 14.0625,
"learning_rate": 1.40519877675841e-05,
"loss": 0.2523,
"step": 1604
},
{
"epoch": 1.841199254052503,
"grad_norm": 36.5,
"learning_rate": 1.4046890927624875e-05,
"loss": 0.6013,
"step": 1605
},
{
"epoch": 1.8423468655859991,
"grad_norm": 33.25,
"learning_rate": 1.4041794087665647e-05,
"loss": 0.4822,
"step": 1606
},
{
"epoch": 1.8434944771194952,
"grad_norm": 12.6875,
"learning_rate": 1.4036697247706423e-05,
"loss": 0.4923,
"step": 1607
},
{
"epoch": 1.844642088652991,
"grad_norm": 15.0625,
"learning_rate": 1.4031600407747196e-05,
"loss": 0.2454,
"step": 1608
},
{
"epoch": 1.845789700186487,
"grad_norm": 20.5,
"learning_rate": 1.4026503567787972e-05,
"loss": 0.2125,
"step": 1609
},
{
"epoch": 1.8469373117199828,
"grad_norm": 18.875,
"learning_rate": 1.4021406727828748e-05,
"loss": 0.6114,
"step": 1610
},
{
"epoch": 1.8480849232534786,
"grad_norm": 37.25,
"learning_rate": 1.4016309887869522e-05,
"loss": 0.4515,
"step": 1611
},
{
"epoch": 1.8492325347869745,
"grad_norm": 10.5,
"learning_rate": 1.4011213047910297e-05,
"loss": 0.2101,
"step": 1612
},
{
"epoch": 1.8503801463204705,
"grad_norm": 33.5,
"learning_rate": 1.4006116207951071e-05,
"loss": 0.5889,
"step": 1613
},
{
"epoch": 1.8515277578539664,
"grad_norm": 19.875,
"learning_rate": 1.4001019367991847e-05,
"loss": 0.4676,
"step": 1614
},
{
"epoch": 1.8526753693874625,
"grad_norm": 51.0,
"learning_rate": 1.3995922528032622e-05,
"loss": 0.5021,
"step": 1615
},
{
"epoch": 1.8538229809209583,
"grad_norm": 38.0,
"learning_rate": 1.3990825688073395e-05,
"loss": 0.6099,
"step": 1616
},
{
"epoch": 1.8549705924544542,
"grad_norm": 49.75,
"learning_rate": 1.398572884811417e-05,
"loss": 0.6493,
"step": 1617
},
{
"epoch": 1.85611820398795,
"grad_norm": 14.5625,
"learning_rate": 1.3980632008154944e-05,
"loss": 0.1851,
"step": 1618
},
{
"epoch": 1.8572658155214459,
"grad_norm": 74.5,
"learning_rate": 1.397553516819572e-05,
"loss": 0.625,
"step": 1619
},
{
"epoch": 1.858413427054942,
"grad_norm": 49.25,
"learning_rate": 1.3970438328236495e-05,
"loss": 0.4501,
"step": 1620
},
{
"epoch": 1.8595610385884378,
"grad_norm": 36.0,
"learning_rate": 1.3965341488277269e-05,
"loss": 0.4769,
"step": 1621
},
{
"epoch": 1.8607086501219339,
"grad_norm": 72.5,
"learning_rate": 1.3960244648318045e-05,
"loss": 0.6018,
"step": 1622
},
{
"epoch": 1.8618562616554297,
"grad_norm": 28.75,
"learning_rate": 1.3955147808358817e-05,
"loss": 0.4446,
"step": 1623
},
{
"epoch": 1.8630038731889256,
"grad_norm": 58.5,
"learning_rate": 1.3950050968399593e-05,
"loss": 0.7133,
"step": 1624
},
{
"epoch": 1.8641514847224214,
"grad_norm": 14.6875,
"learning_rate": 1.3944954128440368e-05,
"loss": 0.2074,
"step": 1625
},
{
"epoch": 1.8652990962559173,
"grad_norm": 54.25,
"learning_rate": 1.3939857288481142e-05,
"loss": 0.3376,
"step": 1626
},
{
"epoch": 1.8664467077894131,
"grad_norm": 23.0,
"learning_rate": 1.3934760448521918e-05,
"loss": 0.5169,
"step": 1627
},
{
"epoch": 1.8675943193229092,
"grad_norm": 55.5,
"learning_rate": 1.3929663608562692e-05,
"loss": 0.3699,
"step": 1628
},
{
"epoch": 1.868741930856405,
"grad_norm": 18.125,
"learning_rate": 1.3924566768603467e-05,
"loss": 0.207,
"step": 1629
},
{
"epoch": 1.8698895423899011,
"grad_norm": 26.5,
"learning_rate": 1.3919469928644243e-05,
"loss": 0.4332,
"step": 1630
},
{
"epoch": 1.871037153923397,
"grad_norm": 22.75,
"learning_rate": 1.3914373088685017e-05,
"loss": 0.4837,
"step": 1631
},
{
"epoch": 1.8721847654568928,
"grad_norm": 69.5,
"learning_rate": 1.3909276248725792e-05,
"loss": 0.5754,
"step": 1632
},
{
"epoch": 1.8733323769903887,
"grad_norm": 16.25,
"learning_rate": 1.3904179408766564e-05,
"loss": 0.2141,
"step": 1633
},
{
"epoch": 1.8744799885238845,
"grad_norm": 35.75,
"learning_rate": 1.389908256880734e-05,
"loss": 0.3208,
"step": 1634
},
{
"epoch": 1.8756276000573806,
"grad_norm": 29.75,
"learning_rate": 1.3893985728848116e-05,
"loss": 0.6767,
"step": 1635
},
{
"epoch": 1.8767752115908765,
"grad_norm": 19.375,
"learning_rate": 1.388888888888889e-05,
"loss": 0.1118,
"step": 1636
},
{
"epoch": 1.8779228231243725,
"grad_norm": 15.8125,
"learning_rate": 1.3883792048929665e-05,
"loss": 0.1238,
"step": 1637
},
{
"epoch": 1.8790704346578684,
"grad_norm": 31.875,
"learning_rate": 1.3878695208970439e-05,
"loss": 0.5031,
"step": 1638
},
{
"epoch": 1.8802180461913642,
"grad_norm": 40.25,
"learning_rate": 1.3873598369011215e-05,
"loss": 0.8107,
"step": 1639
},
{
"epoch": 1.88136565772486,
"grad_norm": 25.0,
"learning_rate": 1.386850152905199e-05,
"loss": 0.3873,
"step": 1640
},
{
"epoch": 1.882513269258356,
"grad_norm": 78.0,
"learning_rate": 1.3863404689092762e-05,
"loss": 1.1926,
"step": 1641
},
{
"epoch": 1.883660880791852,
"grad_norm": 34.25,
"learning_rate": 1.3858307849133538e-05,
"loss": 0.5274,
"step": 1642
},
{
"epoch": 1.8848084923253479,
"grad_norm": 15.125,
"learning_rate": 1.3853211009174312e-05,
"loss": 0.4215,
"step": 1643
},
{
"epoch": 1.885956103858844,
"grad_norm": 28.0,
"learning_rate": 1.3848114169215088e-05,
"loss": 0.2697,
"step": 1644
},
{
"epoch": 1.8871037153923398,
"grad_norm": 34.25,
"learning_rate": 1.3843017329255863e-05,
"loss": 0.2025,
"step": 1645
},
{
"epoch": 1.8882513269258356,
"grad_norm": 91.5,
"learning_rate": 1.3837920489296637e-05,
"loss": 0.7438,
"step": 1646
},
{
"epoch": 1.8893989384593315,
"grad_norm": 88.0,
"learning_rate": 1.3832823649337413e-05,
"loss": 0.9659,
"step": 1647
},
{
"epoch": 1.8905465499928273,
"grad_norm": 26.875,
"learning_rate": 1.3827726809378187e-05,
"loss": 0.2307,
"step": 1648
},
{
"epoch": 1.8916941615263232,
"grad_norm": 13.375,
"learning_rate": 1.3822629969418962e-05,
"loss": 0.3359,
"step": 1649
},
{
"epoch": 1.8928417730598193,
"grad_norm": 72.0,
"learning_rate": 1.3817533129459738e-05,
"loss": 0.5043,
"step": 1650
},
{
"epoch": 1.8939893845933151,
"grad_norm": 46.75,
"learning_rate": 1.381243628950051e-05,
"loss": 0.4365,
"step": 1651
},
{
"epoch": 1.8951369961268112,
"grad_norm": 27.0,
"learning_rate": 1.3807339449541286e-05,
"loss": 0.4578,
"step": 1652
},
{
"epoch": 1.896284607660307,
"grad_norm": 49.75,
"learning_rate": 1.380224260958206e-05,
"loss": 0.645,
"step": 1653
},
{
"epoch": 1.897432219193803,
"grad_norm": 58.25,
"learning_rate": 1.3797145769622835e-05,
"loss": 0.7014,
"step": 1654
},
{
"epoch": 1.8985798307272987,
"grad_norm": 41.75,
"learning_rate": 1.379204892966361e-05,
"loss": 0.6419,
"step": 1655
},
{
"epoch": 1.8997274422607946,
"grad_norm": 49.5,
"learning_rate": 1.3786952089704385e-05,
"loss": 0.6695,
"step": 1656
},
{
"epoch": 1.9008750537942907,
"grad_norm": 34.5,
"learning_rate": 1.378185524974516e-05,
"loss": 0.324,
"step": 1657
},
{
"epoch": 1.9020226653277865,
"grad_norm": 14.25,
"learning_rate": 1.3776758409785932e-05,
"loss": 0.289,
"step": 1658
},
{
"epoch": 1.9031702768612826,
"grad_norm": 20.875,
"learning_rate": 1.3771661569826708e-05,
"loss": 0.2563,
"step": 1659
},
{
"epoch": 1.9043178883947784,
"grad_norm": 26.0,
"learning_rate": 1.3766564729867484e-05,
"loss": 0.7482,
"step": 1660
},
{
"epoch": 1.9054654999282743,
"grad_norm": 14.4375,
"learning_rate": 1.3761467889908258e-05,
"loss": 0.4048,
"step": 1661
},
{
"epoch": 1.9066131114617701,
"grad_norm": 51.25,
"learning_rate": 1.3756371049949033e-05,
"loss": 0.6209,
"step": 1662
},
{
"epoch": 1.907760722995266,
"grad_norm": 30.875,
"learning_rate": 1.3751274209989807e-05,
"loss": 0.7158,
"step": 1663
},
{
"epoch": 1.9089083345287619,
"grad_norm": 9.4375,
"learning_rate": 1.3746177370030583e-05,
"loss": 0.1511,
"step": 1664
},
{
"epoch": 1.910055946062258,
"grad_norm": 19.125,
"learning_rate": 1.3741080530071358e-05,
"loss": 0.3676,
"step": 1665
},
{
"epoch": 1.911203557595754,
"grad_norm": 19.875,
"learning_rate": 1.3735983690112132e-05,
"loss": 0.3149,
"step": 1666
},
{
"epoch": 1.9123511691292499,
"grad_norm": 22.25,
"learning_rate": 1.3730886850152908e-05,
"loss": 0.2507,
"step": 1667
},
{
"epoch": 1.9134987806627457,
"grad_norm": 13.0,
"learning_rate": 1.372579001019368e-05,
"loss": 0.5281,
"step": 1668
},
{
"epoch": 1.9146463921962416,
"grad_norm": 22.625,
"learning_rate": 1.3720693170234456e-05,
"loss": 0.3352,
"step": 1669
},
{
"epoch": 1.9157940037297374,
"grad_norm": 25.625,
"learning_rate": 1.3715596330275231e-05,
"loss": 0.3003,
"step": 1670
},
{
"epoch": 1.9169416152632333,
"grad_norm": 37.5,
"learning_rate": 1.3710499490316005e-05,
"loss": 0.2462,
"step": 1671
},
{
"epoch": 1.9180892267967293,
"grad_norm": 20.75,
"learning_rate": 1.370540265035678e-05,
"loss": 0.6685,
"step": 1672
},
{
"epoch": 1.9192368383302252,
"grad_norm": 30.0,
"learning_rate": 1.3700305810397555e-05,
"loss": 0.5793,
"step": 1673
},
{
"epoch": 1.9203844498637213,
"grad_norm": 67.5,
"learning_rate": 1.369520897043833e-05,
"loss": 0.5628,
"step": 1674
},
{
"epoch": 1.921532061397217,
"grad_norm": 68.0,
"learning_rate": 1.3690112130479106e-05,
"loss": 0.3445,
"step": 1675
},
{
"epoch": 1.922679672930713,
"grad_norm": 18.375,
"learning_rate": 1.3685015290519878e-05,
"loss": 0.3626,
"step": 1676
},
{
"epoch": 1.9238272844642088,
"grad_norm": 26.875,
"learning_rate": 1.3679918450560654e-05,
"loss": 0.8984,
"step": 1677
},
{
"epoch": 1.9249748959977047,
"grad_norm": 27.125,
"learning_rate": 1.3674821610601427e-05,
"loss": 0.4586,
"step": 1678
},
{
"epoch": 1.9261225075312007,
"grad_norm": 62.0,
"learning_rate": 1.3669724770642203e-05,
"loss": 0.7513,
"step": 1679
},
{
"epoch": 1.9272701190646966,
"grad_norm": 38.25,
"learning_rate": 1.3664627930682979e-05,
"loss": 0.4712,
"step": 1680
},
{
"epoch": 1.9284177305981927,
"grad_norm": 19.125,
"learning_rate": 1.3659531090723753e-05,
"loss": 0.2701,
"step": 1681
},
{
"epoch": 1.9295653421316885,
"grad_norm": 16.625,
"learning_rate": 1.3654434250764528e-05,
"loss": 0.3847,
"step": 1682
},
{
"epoch": 1.9307129536651844,
"grad_norm": 52.0,
"learning_rate": 1.36493374108053e-05,
"loss": 0.4352,
"step": 1683
},
{
"epoch": 1.9318605651986802,
"grad_norm": 100.0,
"learning_rate": 1.3644240570846076e-05,
"loss": 1.0839,
"step": 1684
},
{
"epoch": 1.933008176732176,
"grad_norm": 53.25,
"learning_rate": 1.363914373088685e-05,
"loss": 0.5791,
"step": 1685
},
{
"epoch": 1.934155788265672,
"grad_norm": 55.5,
"learning_rate": 1.3634046890927625e-05,
"loss": 0.7248,
"step": 1686
},
{
"epoch": 1.935303399799168,
"grad_norm": 16.5,
"learning_rate": 1.3628950050968401e-05,
"loss": 0.3914,
"step": 1687
},
{
"epoch": 1.9364510113326638,
"grad_norm": 43.5,
"learning_rate": 1.3623853211009175e-05,
"loss": 0.348,
"step": 1688
},
{
"epoch": 1.93759862286616,
"grad_norm": 34.25,
"learning_rate": 1.361875637104995e-05,
"loss": 0.4504,
"step": 1689
},
{
"epoch": 1.9387462343996558,
"grad_norm": 32.5,
"learning_rate": 1.3613659531090724e-05,
"loss": 0.4256,
"step": 1690
},
{
"epoch": 1.9398938459331516,
"grad_norm": 17.125,
"learning_rate": 1.36085626911315e-05,
"loss": 0.2441,
"step": 1691
},
{
"epoch": 1.9410414574666475,
"grad_norm": 31.625,
"learning_rate": 1.3603465851172276e-05,
"loss": 0.5579,
"step": 1692
},
{
"epoch": 1.9421890690001433,
"grad_norm": 29.75,
"learning_rate": 1.3598369011213048e-05,
"loss": 0.8088,
"step": 1693
},
{
"epoch": 1.9433366805336394,
"grad_norm": 51.5,
"learning_rate": 1.3593272171253823e-05,
"loss": 0.6118,
"step": 1694
},
{
"epoch": 1.9444842920671352,
"grad_norm": 27.875,
"learning_rate": 1.3588175331294597e-05,
"loss": 0.2742,
"step": 1695
},
{
"epoch": 1.9456319036006313,
"grad_norm": 11.8125,
"learning_rate": 1.3583078491335373e-05,
"loss": 0.2417,
"step": 1696
},
{
"epoch": 1.9467795151341272,
"grad_norm": 30.75,
"learning_rate": 1.3577981651376149e-05,
"loss": 0.236,
"step": 1697
},
{
"epoch": 1.947927126667623,
"grad_norm": 28.5,
"learning_rate": 1.3572884811416922e-05,
"loss": 0.299,
"step": 1698
},
{
"epoch": 1.9490747382011189,
"grad_norm": 15.125,
"learning_rate": 1.3567787971457698e-05,
"loss": 0.1027,
"step": 1699
},
{
"epoch": 1.9502223497346147,
"grad_norm": 35.25,
"learning_rate": 1.356269113149847e-05,
"loss": 0.2869,
"step": 1700
},
{
"epoch": 1.9502223497346147,
"eval_accuracy": 0.72,
"eval_loss": 0.4552258551120758,
"eval_runtime": 49.3148,
"eval_samples_per_second": 2.028,
"eval_steps_per_second": 2.028,
"step": 1700
},
{
"epoch": 1.9513699612681108,
"grad_norm": 27.875,
"learning_rate": 1.3557594291539246e-05,
"loss": 0.9343,
"step": 1701
},
{
"epoch": 1.9525175728016066,
"grad_norm": 31.25,
"learning_rate": 1.3552497451580021e-05,
"loss": 0.5365,
"step": 1702
},
{
"epoch": 1.9536651843351027,
"grad_norm": 56.25,
"learning_rate": 1.3547400611620795e-05,
"loss": 0.6064,
"step": 1703
},
{
"epoch": 1.9548127958685986,
"grad_norm": 8.875,
"learning_rate": 1.3542303771661571e-05,
"loss": 0.2503,
"step": 1704
},
{
"epoch": 1.9559604074020944,
"grad_norm": 23.25,
"learning_rate": 1.3537206931702345e-05,
"loss": 0.6551,
"step": 1705
},
{
"epoch": 1.9571080189355903,
"grad_norm": 26.75,
"learning_rate": 1.353211009174312e-05,
"loss": 0.4402,
"step": 1706
},
{
"epoch": 1.9582556304690861,
"grad_norm": 19.75,
"learning_rate": 1.3527013251783896e-05,
"loss": 0.5219,
"step": 1707
},
{
"epoch": 1.959403242002582,
"grad_norm": 109.0,
"learning_rate": 1.352191641182467e-05,
"loss": 0.698,
"step": 1708
},
{
"epoch": 1.960550853536078,
"grad_norm": 81.5,
"learning_rate": 1.3516819571865446e-05,
"loss": 0.5249,
"step": 1709
},
{
"epoch": 1.961698465069574,
"grad_norm": 29.125,
"learning_rate": 1.3511722731906218e-05,
"loss": 0.5226,
"step": 1710
},
{
"epoch": 1.96284607660307,
"grad_norm": 59.75,
"learning_rate": 1.3506625891946993e-05,
"loss": 0.6405,
"step": 1711
},
{
"epoch": 1.9639936881365658,
"grad_norm": 21.5,
"learning_rate": 1.3501529051987769e-05,
"loss": 0.4448,
"step": 1712
},
{
"epoch": 1.9651412996700617,
"grad_norm": 34.75,
"learning_rate": 1.3496432212028543e-05,
"loss": 0.667,
"step": 1713
},
{
"epoch": 1.9662889112035575,
"grad_norm": 15.125,
"learning_rate": 1.3491335372069319e-05,
"loss": 0.4765,
"step": 1714
},
{
"epoch": 1.9674365227370534,
"grad_norm": 31.0,
"learning_rate": 1.3486238532110092e-05,
"loss": 0.2273,
"step": 1715
},
{
"epoch": 1.9685841342705495,
"grad_norm": 20.75,
"learning_rate": 1.3481141692150868e-05,
"loss": 0.3604,
"step": 1716
},
{
"epoch": 1.9697317458040453,
"grad_norm": 39.5,
"learning_rate": 1.3476044852191644e-05,
"loss": 0.4167,
"step": 1717
},
{
"epoch": 1.9708793573375414,
"grad_norm": 28.5,
"learning_rate": 1.3470948012232416e-05,
"loss": 0.4476,
"step": 1718
},
{
"epoch": 1.9720269688710372,
"grad_norm": 19.25,
"learning_rate": 1.3465851172273191e-05,
"loss": 0.5297,
"step": 1719
},
{
"epoch": 1.973174580404533,
"grad_norm": 65.0,
"learning_rate": 1.3460754332313965e-05,
"loss": 0.8327,
"step": 1720
},
{
"epoch": 1.974322191938029,
"grad_norm": 23.875,
"learning_rate": 1.3455657492354741e-05,
"loss": 0.1996,
"step": 1721
},
{
"epoch": 1.9754698034715248,
"grad_norm": 23.0,
"learning_rate": 1.3450560652395517e-05,
"loss": 0.4416,
"step": 1722
},
{
"epoch": 1.9766174150050206,
"grad_norm": 11.4375,
"learning_rate": 1.344546381243629e-05,
"loss": 0.2721,
"step": 1723
},
{
"epoch": 1.9777650265385167,
"grad_norm": 35.0,
"learning_rate": 1.3440366972477066e-05,
"loss": 0.5629,
"step": 1724
},
{
"epoch": 1.9789126380720128,
"grad_norm": 67.5,
"learning_rate": 1.343527013251784e-05,
"loss": 0.6305,
"step": 1725
},
{
"epoch": 1.9800602496055086,
"grad_norm": 32.75,
"learning_rate": 1.3430173292558616e-05,
"loss": 0.2927,
"step": 1726
},
{
"epoch": 1.9812078611390045,
"grad_norm": 35.25,
"learning_rate": 1.3425076452599391e-05,
"loss": 0.238,
"step": 1727
},
{
"epoch": 1.9823554726725003,
"grad_norm": 20.875,
"learning_rate": 1.3419979612640163e-05,
"loss": 0.4392,
"step": 1728
},
{
"epoch": 1.9835030842059962,
"grad_norm": 44.0,
"learning_rate": 1.3414882772680939e-05,
"loss": 0.398,
"step": 1729
},
{
"epoch": 1.984650695739492,
"grad_norm": 26.0,
"learning_rate": 1.3409785932721713e-05,
"loss": 0.7501,
"step": 1730
},
{
"epoch": 1.9857983072729881,
"grad_norm": 20.75,
"learning_rate": 1.3404689092762488e-05,
"loss": 0.3494,
"step": 1731
},
{
"epoch": 1.986945918806484,
"grad_norm": 30.875,
"learning_rate": 1.3399592252803264e-05,
"loss": 1.1064,
"step": 1732
},
{
"epoch": 1.98809353033998,
"grad_norm": 30.0,
"learning_rate": 1.3394495412844038e-05,
"loss": 0.6117,
"step": 1733
},
{
"epoch": 1.989241141873476,
"grad_norm": 16.875,
"learning_rate": 1.3389398572884814e-05,
"loss": 0.3173,
"step": 1734
},
{
"epoch": 1.9903887534069717,
"grad_norm": 15.75,
"learning_rate": 1.3384301732925586e-05,
"loss": 0.4467,
"step": 1735
},
{
"epoch": 1.9915363649404676,
"grad_norm": 49.0,
"learning_rate": 1.3379204892966361e-05,
"loss": 0.7462,
"step": 1736
},
{
"epoch": 1.9926839764739634,
"grad_norm": 22.25,
"learning_rate": 1.3374108053007137e-05,
"loss": 0.4648,
"step": 1737
},
{
"epoch": 1.9938315880074595,
"grad_norm": 41.25,
"learning_rate": 1.3369011213047911e-05,
"loss": 0.2781,
"step": 1738
},
{
"epoch": 1.9949791995409554,
"grad_norm": 22.625,
"learning_rate": 1.3363914373088686e-05,
"loss": 0.6798,
"step": 1739
},
{
"epoch": 1.9961268110744514,
"grad_norm": 61.5,
"learning_rate": 1.335881753312946e-05,
"loss": 0.4519,
"step": 1740
},
{
"epoch": 1.9972744226079473,
"grad_norm": 22.375,
"learning_rate": 1.3353720693170236e-05,
"loss": 0.7196,
"step": 1741
},
{
"epoch": 1.9984220341414431,
"grad_norm": 9.9375,
"learning_rate": 1.3348623853211012e-05,
"loss": 0.2518,
"step": 1742
},
{
"epoch": 1.999569645674939,
"grad_norm": 24.0,
"learning_rate": 1.3343527013251785e-05,
"loss": 0.5652,
"step": 1743
},
{
"epoch": 2.0,
"grad_norm": 43.0,
"learning_rate": 1.3338430173292561e-05,
"loss": 0.2228,
"step": 1744
},
{
"epoch": 2.001147611533496,
"grad_norm": 12.75,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.2022,
"step": 1745
},
{
"epoch": 2.0022952230669917,
"grad_norm": 38.5,
"learning_rate": 1.3328236493374109e-05,
"loss": 0.5685,
"step": 1746
},
{
"epoch": 2.0034428346004876,
"grad_norm": 17.875,
"learning_rate": 1.3323139653414884e-05,
"loss": 0.4231,
"step": 1747
},
{
"epoch": 2.004590446133984,
"grad_norm": 30.125,
"learning_rate": 1.3318042813455658e-05,
"loss": 0.4095,
"step": 1748
},
{
"epoch": 2.0057380576674797,
"grad_norm": 9.125,
"learning_rate": 1.3312945973496434e-05,
"loss": 0.2246,
"step": 1749
},
{
"epoch": 2.0068856692009756,
"grad_norm": 13.3125,
"learning_rate": 1.3307849133537208e-05,
"loss": 0.3579,
"step": 1750
},
{
"epoch": 2.0080332807344714,
"grad_norm": 43.25,
"learning_rate": 1.3302752293577984e-05,
"loss": 0.2611,
"step": 1751
},
{
"epoch": 2.0091808922679673,
"grad_norm": 29.125,
"learning_rate": 1.3297655453618759e-05,
"loss": 0.1953,
"step": 1752
},
{
"epoch": 2.010328503801463,
"grad_norm": 25.625,
"learning_rate": 1.3292558613659531e-05,
"loss": 0.3513,
"step": 1753
},
{
"epoch": 2.011476115334959,
"grad_norm": 16.5,
"learning_rate": 1.3287461773700307e-05,
"loss": 0.2294,
"step": 1754
},
{
"epoch": 2.012623726868455,
"grad_norm": 7.34375,
"learning_rate": 1.328236493374108e-05,
"loss": 0.1434,
"step": 1755
},
{
"epoch": 2.013771338401951,
"grad_norm": 18.625,
"learning_rate": 1.3277268093781856e-05,
"loss": 0.5704,
"step": 1756
},
{
"epoch": 2.014918949935447,
"grad_norm": 47.25,
"learning_rate": 1.3272171253822632e-05,
"loss": 0.4283,
"step": 1757
},
{
"epoch": 2.016066561468943,
"grad_norm": 9.6875,
"learning_rate": 1.3267074413863406e-05,
"loss": 0.2391,
"step": 1758
},
{
"epoch": 2.0172141730024387,
"grad_norm": 34.5,
"learning_rate": 1.3261977573904182e-05,
"loss": 0.248,
"step": 1759
},
{
"epoch": 2.0183617845359345,
"grad_norm": 11.9375,
"learning_rate": 1.3256880733944954e-05,
"loss": 0.2023,
"step": 1760
},
{
"epoch": 2.0195093960694304,
"grad_norm": 27.375,
"learning_rate": 1.325178389398573e-05,
"loss": 0.261,
"step": 1761
},
{
"epoch": 2.020657007602926,
"grad_norm": 10.875,
"learning_rate": 1.3246687054026503e-05,
"loss": 0.224,
"step": 1762
},
{
"epoch": 2.0218046191364225,
"grad_norm": 24.375,
"learning_rate": 1.3241590214067279e-05,
"loss": 0.2463,
"step": 1763
},
{
"epoch": 2.0229522306699184,
"grad_norm": 18.5,
"learning_rate": 1.3236493374108054e-05,
"loss": 0.2944,
"step": 1764
},
{
"epoch": 2.024099842203414,
"grad_norm": 24.0,
"learning_rate": 1.3231396534148828e-05,
"loss": 0.315,
"step": 1765
},
{
"epoch": 2.02524745373691,
"grad_norm": 54.0,
"learning_rate": 1.3226299694189604e-05,
"loss": 0.3818,
"step": 1766
},
{
"epoch": 2.026395065270406,
"grad_norm": 18.5,
"learning_rate": 1.3221202854230378e-05,
"loss": 0.3524,
"step": 1767
},
{
"epoch": 2.0275426768039018,
"grad_norm": 19.75,
"learning_rate": 1.3216106014271153e-05,
"loss": 0.3522,
"step": 1768
},
{
"epoch": 2.0286902883373976,
"grad_norm": 35.0,
"learning_rate": 1.3211009174311929e-05,
"loss": 0.3279,
"step": 1769
},
{
"epoch": 2.029837899870894,
"grad_norm": 56.75,
"learning_rate": 1.3205912334352701e-05,
"loss": 1.3613,
"step": 1770
},
{
"epoch": 2.0309855114043898,
"grad_norm": 28.0,
"learning_rate": 1.3200815494393477e-05,
"loss": 0.2122,
"step": 1771
},
{
"epoch": 2.0321331229378856,
"grad_norm": 37.0,
"learning_rate": 1.319571865443425e-05,
"loss": 0.1997,
"step": 1772
},
{
"epoch": 2.0332807344713815,
"grad_norm": 6.0,
"learning_rate": 1.3190621814475026e-05,
"loss": 0.0679,
"step": 1773
},
{
"epoch": 2.0344283460048773,
"grad_norm": 16.75,
"learning_rate": 1.3185524974515802e-05,
"loss": 0.3065,
"step": 1774
},
{
"epoch": 2.035575957538373,
"grad_norm": 33.5,
"learning_rate": 1.3180428134556576e-05,
"loss": 0.2069,
"step": 1775
},
{
"epoch": 2.036723569071869,
"grad_norm": 27.25,
"learning_rate": 1.3175331294597351e-05,
"loss": 0.2496,
"step": 1776
},
{
"epoch": 2.037871180605365,
"grad_norm": 29.25,
"learning_rate": 1.3170234454638124e-05,
"loss": 0.3496,
"step": 1777
},
{
"epoch": 2.039018792138861,
"grad_norm": 22.375,
"learning_rate": 1.31651376146789e-05,
"loss": 0.302,
"step": 1778
},
{
"epoch": 2.040166403672357,
"grad_norm": 30.375,
"learning_rate": 1.3160040774719675e-05,
"loss": 0.5632,
"step": 1779
},
{
"epoch": 2.041314015205853,
"grad_norm": 61.0,
"learning_rate": 1.3154943934760449e-05,
"loss": 0.3705,
"step": 1780
},
{
"epoch": 2.0424616267393487,
"grad_norm": 9.1875,
"learning_rate": 1.3149847094801224e-05,
"loss": 0.1027,
"step": 1781
},
{
"epoch": 2.0436092382728446,
"grad_norm": 13.75,
"learning_rate": 1.3144750254841998e-05,
"loss": 0.0892,
"step": 1782
},
{
"epoch": 2.0447568498063404,
"grad_norm": 25.0,
"learning_rate": 1.3139653414882774e-05,
"loss": 0.6006,
"step": 1783
},
{
"epoch": 2.0459044613398363,
"grad_norm": 11.0625,
"learning_rate": 1.313455657492355e-05,
"loss": 0.1804,
"step": 1784
},
{
"epoch": 2.0470520728733326,
"grad_norm": 32.75,
"learning_rate": 1.3129459734964323e-05,
"loss": 0.3527,
"step": 1785
},
{
"epoch": 2.0481996844068284,
"grad_norm": 98.5,
"learning_rate": 1.3124362895005099e-05,
"loss": 0.8699,
"step": 1786
},
{
"epoch": 2.0493472959403243,
"grad_norm": 40.5,
"learning_rate": 1.3119266055045871e-05,
"loss": 0.193,
"step": 1787
},
{
"epoch": 2.05049490747382,
"grad_norm": 27.75,
"learning_rate": 1.3114169215086647e-05,
"loss": 0.3553,
"step": 1788
},
{
"epoch": 2.051642519007316,
"grad_norm": 14.4375,
"learning_rate": 1.3109072375127422e-05,
"loss": 0.1813,
"step": 1789
},
{
"epoch": 2.052790130540812,
"grad_norm": 17.75,
"learning_rate": 1.3103975535168196e-05,
"loss": 0.0898,
"step": 1790
},
{
"epoch": 2.0539377420743077,
"grad_norm": 9.25,
"learning_rate": 1.3098878695208972e-05,
"loss": 0.1149,
"step": 1791
},
{
"epoch": 2.0550853536078035,
"grad_norm": 46.0,
"learning_rate": 1.3093781855249746e-05,
"loss": 0.2937,
"step": 1792
},
{
"epoch": 2.0562329651413,
"grad_norm": 17.125,
"learning_rate": 1.3088685015290521e-05,
"loss": 0.3886,
"step": 1793
},
{
"epoch": 2.0573805766747957,
"grad_norm": 25.875,
"learning_rate": 1.3083588175331297e-05,
"loss": 0.2858,
"step": 1794
},
{
"epoch": 2.0585281882082915,
"grad_norm": 42.5,
"learning_rate": 1.307849133537207e-05,
"loss": 0.6463,
"step": 1795
},
{
"epoch": 2.0596757997417874,
"grad_norm": 97.5,
"learning_rate": 1.3073394495412845e-05,
"loss": 0.9309,
"step": 1796
},
{
"epoch": 2.0608234112752832,
"grad_norm": 27.25,
"learning_rate": 1.3068297655453619e-05,
"loss": 0.3763,
"step": 1797
},
{
"epoch": 2.061971022808779,
"grad_norm": 137.0,
"learning_rate": 1.3063200815494394e-05,
"loss": 1.1044,
"step": 1798
},
{
"epoch": 2.063118634342275,
"grad_norm": 12.1875,
"learning_rate": 1.305810397553517e-05,
"loss": 0.1574,
"step": 1799
},
{
"epoch": 2.0642662458757712,
"grad_norm": 22.875,
"learning_rate": 1.3053007135575944e-05,
"loss": 0.1174,
"step": 1800
},
{
"epoch": 2.0642662458757712,
"eval_accuracy": 0.74,
"eval_loss": 0.4835154712200165,
"eval_runtime": 49.2987,
"eval_samples_per_second": 2.028,
"eval_steps_per_second": 2.028,
"step": 1800
}
],
"logging_steps": 1,
"max_steps": 4360,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.358825065150048e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}