diff --git "a/cost_to_push_frequency_2128/checkpoint-40000/trainer_state.json" "b/cost_to_push_frequency_2128/checkpoint-40000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/cost_to_push_frequency_2128/checkpoint-40000/trainer_state.json"
@@ -0,0 +1,6003 @@
+{
+  "best_global_step": 40000,
+  "best_metric": 3.5569655895233154,
+  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/cost_to_push_frequency_2128/checkpoint-40000",
+  "epoch": 11.648261401362921,
+  "eval_steps": 1000,
+  "global_step": 40000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.014561127613722406,
+      "grad_norm": 1.0827219486236572,
+      "learning_rate": 0.000294,
+      "loss": 8.4429,
+      "step": 50
+    },
+    {
+      "epoch": 0.029122255227444813,
+      "grad_norm": 0.9950307011604309,
+      "learning_rate": 0.0005939999999999999,
+      "loss": 6.7538,
+      "step": 100
+    },
+    {
+      "epoch": 0.04368338284116722,
+      "grad_norm": 0.40593206882476807,
+      "learning_rate": 0.0005998286713286713,
+      "loss": 6.3529,
+      "step": 150
+    },
+    {
+      "epoch": 0.058244510454889625,
+      "grad_norm": 0.5188880562782288,
+      "learning_rate": 0.0005996538461538461,
+      "loss": 6.1387,
+      "step": 200
+    },
+    {
+      "epoch": 0.07280563806861204,
+      "grad_norm": 0.5219882726669312,
+      "learning_rate": 0.0005994790209790209,
+      "loss": 5.9936,
+      "step": 250
+    },
+    {
+      "epoch": 0.08736676568233444,
+      "grad_norm": 0.4981943368911743,
+      "learning_rate": 0.0005993041958041958,
+      "loss": 5.8475,
+      "step": 300
+    },
+    {
+      "epoch": 0.10192789329605685,
+      "grad_norm": 0.419317364692688,
+      "learning_rate": 0.0005991293706293705,
+      "loss": 5.731,
+      "step": 350
+    },
+    {
+      "epoch": 0.11648902090977925,
+      "grad_norm": 0.4203638732433319,
+      "learning_rate": 0.0005989545454545454,
+      "loss": 5.6254,
+      "step": 400
+    },
+    {
+      "epoch": 0.13105014852350166,
+      "grad_norm": 0.5592066645622253,
+      "learning_rate": 0.0005987797202797202,
+      "loss": 5.5068,
+      "step": 450
+    },
+    {
+      "epoch": 0.14561127613722408,
+      "grad_norm": 0.465763658285141,
+      "learning_rate": 0.000598604895104895,
+      "loss": 5.4002,
+      "step": 500
+    },
+    {
+      "epoch": 0.16017240375094646,
+      "grad_norm": 0.4648076295852661,
+      "learning_rate": 0.0005984300699300698,
+      "loss": 5.3308,
+      "step": 550
+    },
+    {
+      "epoch": 0.17473353136466888,
+      "grad_norm": 0.49379315972328186,
+      "learning_rate": 0.0005982552447552447,
+      "loss": 5.2609,
+      "step": 600
+    },
+    {
+      "epoch": 0.1892946589783913,
+      "grad_norm": 0.4596584439277649,
+      "learning_rate": 0.0005980804195804195,
+      "loss": 5.1905,
+      "step": 650
+    },
+    {
+      "epoch": 0.2038557865921137,
+      "grad_norm": 0.40508726239204407,
+      "learning_rate": 0.0005979055944055943,
+      "loss": 5.1331,
+      "step": 700
+    },
+    {
+      "epoch": 0.2184169142058361,
+      "grad_norm": 0.3763967752456665,
+      "learning_rate": 0.0005977307692307691,
+      "loss": 5.0753,
+      "step": 750
+    },
+    {
+      "epoch": 0.2329780418195585,
+      "grad_norm": 0.4820829927921295,
+      "learning_rate": 0.000597555944055944,
+      "loss": 5.0201,
+      "step": 800
+    },
+    {
+      "epoch": 0.24753916943328091,
+      "grad_norm": 0.4278320074081421,
+      "learning_rate": 0.0005973811188811188,
+      "loss": 4.9577,
+      "step": 850
+    },
+    {
+      "epoch": 0.2621002970470033,
+      "grad_norm": 0.4293597340583801,
+      "learning_rate": 0.0005972062937062936,
+      "loss": 4.9213,
+      "step": 900
+    },
+    {
+      "epoch": 0.27666142466072574,
+      "grad_norm": 0.43348240852355957,
+      "learning_rate": 0.0005970314685314685,
+      "loss": 4.8786,
+      "step": 950
+    },
+    {
+      "epoch": 0.29122255227444815,
+      "grad_norm": 0.4520808160305023,
+      "learning_rate": 0.0005968566433566433,
+      "loss": 4.8181,
+      "step": 1000
+    },
+    {
+      "epoch": 0.29122255227444815,
+      "eval_accuracy": 0.2556966724844482,
+      "eval_loss": 4.745121955871582,
+      "eval_runtime": 179.4217,
+      "eval_samples_per_second": 92.776,
+      "eval_steps_per_second": 5.802,
+      "step": 1000
+    },
+    {
+      "epoch": 0.30578367988817057,
+      "grad_norm": 0.45122841000556946,
+      "learning_rate": 0.0005966818181818181,
+      "loss": 4.7877,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3203448075018929,
+      "grad_norm": 0.47879835963249207,
+      "learning_rate": 0.0005965069930069929,
+      "loss": 4.7428,
+      "step": 1100
+    },
+    {
+      "epoch": 0.33490593511561534,
+      "grad_norm": 0.4831642210483551,
+      "learning_rate": 0.0005963321678321677,
+      "loss": 4.6996,
+      "step": 1150
+    },
+    {
+      "epoch": 0.34946706272933775,
+      "grad_norm": 0.4561481177806854,
+      "learning_rate": 0.0005961573426573425,
+      "loss": 4.6659,
+      "step": 1200
+    },
+    {
+      "epoch": 0.36402819034306017,
+      "grad_norm": 0.4561339020729065,
+      "learning_rate": 0.0005959825174825174,
+      "loss": 4.6367,
+      "step": 1250
+    },
+    {
+      "epoch": 0.3785893179567826,
+      "grad_norm": 0.4436923861503601,
+      "learning_rate": 0.0005958076923076922,
+      "loss": 4.6064,
+      "step": 1300
+    },
+    {
+      "epoch": 0.393150445570505,
+      "grad_norm": 0.46087032556533813,
+      "learning_rate": 0.000595632867132867,
+      "loss": 4.5797,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4077115731842274,
+      "grad_norm": 0.47251585125923157,
+      "learning_rate": 0.0005954580419580418,
+      "loss": 4.547,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4222727007979498,
+      "grad_norm": 0.43149644136428833,
+      "learning_rate": 0.0005952832167832168,
+      "loss": 4.5216,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4368338284116722,
+      "grad_norm": 0.3600349724292755,
+      "learning_rate": 0.0005951083916083916,
+      "loss": 4.5129,
+      "step": 1500
+    },
+    {
+      "epoch": 0.4513949560253946,
+      "grad_norm": 0.42545634508132935,
+      "learning_rate": 0.0005949335664335664,
+      "loss": 4.478,
+      "step": 1550
+    },
+    {
+      "epoch": 0.465956083639117,
+      "grad_norm": 0.4261489808559418,
+      "learning_rate": 0.0005947587412587413,
+      "loss": 4.466,
+      "step": 1600
+    },
+    {
+      "epoch": 0.4805172112528394,
+      "grad_norm": 0.382684588432312,
+      "learning_rate": 0.0005945839160839161,
+      "loss": 4.4463,
+      "step": 1650
+    },
+    {
+      "epoch": 0.49507833886656183,
+      "grad_norm": 0.4798526465892792,
+      "learning_rate": 0.0005944090909090909,
+      "loss": 4.419,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5096394664802842,
+      "grad_norm": 0.4271828830242157,
+      "learning_rate": 0.0005942342657342657,
+      "loss": 4.4065,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5242005940940067,
+      "grad_norm": 0.4648028016090393,
+      "learning_rate": 0.0005940594405594406,
+      "loss": 4.389,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5387617217077291,
+      "grad_norm": 0.46727654337882996,
+      "learning_rate": 0.0005938846153846153,
+      "loss": 4.3739,
+      "step": 1850
+    },
+    {
+      "epoch": 0.5533228493214515,
+      "grad_norm": 0.4359632432460785,
+      "learning_rate": 0.0005937097902097902,
+      "loss": 4.3727,
+      "step": 1900
+    },
+    {
+      "epoch": 0.5678839769351739,
+      "grad_norm": 0.39883190393447876,
+      "learning_rate": 0.000593534965034965,
+      "loss": 4.3559,
+      "step": 1950
+    },
+    {
+      "epoch": 0.5824451045488963,
+      "grad_norm": 0.4254516661167145,
+      "learning_rate": 0.0005933601398601398,
+      "loss": 4.3438,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5824451045488963,
+      "eval_accuracy": 0.29953294727340574,
+      "eval_loss": 4.282804489135742,
+      "eval_runtime": 179.6292,
+      "eval_samples_per_second": 92.669,
+      "eval_steps_per_second": 5.795,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5970062321626187,
+      "grad_norm": 0.39681392908096313,
+      "learning_rate": 0.0005931853146853146,
+      "loss": 4.3252,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6115673597763411,
+      "grad_norm": 0.36488792300224304,
+      "learning_rate": 0.0005930104895104895,
+      "loss": 4.3158,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6261284873900634,
+      "grad_norm": 0.4375183582305908,
+      "learning_rate": 0.0005928356643356643,
+      "loss": 4.299,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6406896150037859,
+      "grad_norm": 0.38287097215652466,
+      "learning_rate": 0.0005926608391608391,
+      "loss": 4.2941,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6552507426175083,
+      "grad_norm": 0.3945271968841553,
+      "learning_rate": 0.000592486013986014,
+      "loss": 4.2685,
+      "step": 2250
+    },
+    {
+      "epoch": 0.6698118702312307,
+      "grad_norm": 0.3807995617389679,
+      "learning_rate": 0.0005923111888111888,
+      "loss": 4.2773,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6843729978449531,
+      "grad_norm": 0.3736141324043274,
+      "learning_rate": 0.0005921363636363636,
+      "loss": 4.2439,
+      "step": 2350
+    },
+    {
+      "epoch": 0.6989341254586755,
+      "grad_norm": 0.37925609946250916,
+      "learning_rate": 0.0005919615384615384,
+      "loss": 4.2377,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7134952530723979,
+      "grad_norm": 0.40228238701820374,
+      "learning_rate": 0.0005917867132867133,
+      "loss": 4.2397,
+      "step": 2450
+    },
+    {
+      "epoch": 0.7280563806861203,
+      "grad_norm": 0.3505542278289795,
+      "learning_rate": 0.0005916118881118881,
+      "loss": 4.2359,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7426175082998427,
+      "grad_norm": 0.40058302879333496,
+      "learning_rate": 0.0005914370629370629,
+      "loss": 4.2241,
+      "step": 2550
+    },
+    {
+      "epoch": 0.7571786359135652,
+      "grad_norm": 0.3788367509841919,
+      "learning_rate": 0.0005912622377622377,
+      "loss": 4.2107,
+      "step": 2600
+    },
+    {
+      "epoch": 0.7717397635272876,
+      "grad_norm": 0.3747999668121338,
+      "learning_rate": 0.0005910874125874125,
+      "loss": 4.2,
+      "step": 2650
+    },
+    {
+      "epoch": 0.78630089114101,
+      "grad_norm": 0.40086600184440613,
+      "learning_rate": 0.0005909125874125873,
+      "loss": 4.1915,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8008620187547324,
+      "grad_norm": 0.36495792865753174,
+      "learning_rate": 0.0005907377622377622,
+      "loss": 4.1941,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8154231463684548,
+      "grad_norm": 0.3766659200191498,
+      "learning_rate": 0.000590562937062937,
+      "loss": 4.1739,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8299842739821772,
+      "grad_norm": 0.3640320301055908,
+      "learning_rate": 0.0005903881118881118,
+      "loss": 4.1626,
+      "step": 2850
+    },
+    {
+      "epoch": 0.8445454015958996,
+      "grad_norm": 0.3703969717025757,
+      "learning_rate": 0.0005902132867132867,
+      "loss": 4.1557,
+      "step": 2900
+    },
+    {
+      "epoch": 0.8591065292096219,
+      "grad_norm": 0.3352505564689636,
+      "learning_rate": 0.0005900384615384615,
+      "loss": 4.1426,
+      "step": 2950
+    },
+    {
+      "epoch": 0.8736676568233444,
+      "grad_norm": 0.3644249141216278,
+      "learning_rate": 0.0005898636363636363,
+      "loss": 4.1483,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8736676568233444,
+      "eval_accuracy": 0.31578797630784283,
+      "eval_loss": 4.0948615074157715,
+      "eval_runtime": 179.7252,
+      "eval_samples_per_second": 92.619,
+      "eval_steps_per_second": 5.792,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8882287844370668,
+      "grad_norm": 0.3400101065635681,
+      "learning_rate": 0.0005896888111888111,
+      "loss": 4.1436,
+      "step": 3050
+    },
+    {
+      "epoch": 0.9027899120507892,
+      "grad_norm": 0.3571796417236328,
+      "learning_rate": 0.000589513986013986,
+      "loss": 4.1302,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9173510396645116,
+      "grad_norm": 0.34732452034950256,
+      "learning_rate": 0.0005893391608391608,
+      "loss": 4.1203,
+      "step": 3150
+    },
+    {
+      "epoch": 0.931912167278234,
+      "grad_norm": 0.36288565397262573,
+      "learning_rate": 0.0005891643356643356,
+      "loss": 4.1241,
+      "step": 3200
+    },
+    {
+      "epoch": 0.9464732948919564,
+      "grad_norm": 0.34131136536598206,
+      "learning_rate": 0.0005889895104895104,
+      "loss": 4.1136,
+      "step": 3250
+    },
+    {
+      "epoch": 0.9610344225056788,
+      "grad_norm": 0.35798367857933044,
+      "learning_rate": 0.0005888146853146853,
+      "loss": 4.1029,
+      "step": 3300
+    },
+    {
+      "epoch": 0.9755955501194012,
+      "grad_norm": 0.3709186613559723,
+      "learning_rate": 0.00058863986013986,
+      "loss": 4.0891,
+      "step": 3350
+    },
+    {
+      "epoch": 0.9901566777331237,
+      "grad_norm": 0.3378744423389435,
+      "learning_rate": 0.0005884650349650349,
+      "loss": 4.0959,
+      "step": 3400
+    },
+    {
+      "epoch": 1.004659560836391,
+      "grad_norm": 0.3469085097312927,
+      "learning_rate": 0.0005882902097902097,
+      "loss": 4.0733,
+      "step": 3450
+    },
+    {
+      "epoch": 1.0192206884501136,
+      "grad_norm": 0.3355250954627991,
+      "learning_rate": 0.0005881153846153845,
+      "loss": 4.0135,
+      "step": 3500
+    },
+    {
+      "epoch": 1.033781816063836,
+      "grad_norm": 0.34765860438346863,
+      "learning_rate": 0.0005879405594405594,
+      "loss": 4.0131,
+      "step": 3550
+    },
+    {
+      "epoch": 1.0483429436775584,
+      "grad_norm": 0.3484998941421509,
+      "learning_rate": 0.0005877657342657342,
+      "loss": 4.0352,
+      "step": 3600
+    },
+    {
+      "epoch": 1.0629040712912807,
+      "grad_norm": 0.34341979026794434,
+      "learning_rate": 0.000587590909090909,
+      "loss": 4.0047,
+      "step": 3650
+    },
+    {
+      "epoch": 1.0774651989050033,
+      "grad_norm": 0.36538752913475037,
+      "learning_rate": 0.0005874160839160838,
+      "loss": 4.0016,
+      "step": 3700
+    },
+    {
+      "epoch": 1.0920263265187256,
+      "grad_norm": 0.3458220064640045,
+      "learning_rate": 0.0005872412587412587,
+      "loss": 4.0163,
+      "step": 3750
+    },
+    {
+      "epoch": 1.106587454132448,
+      "grad_norm": 0.3493204414844513,
+      "learning_rate": 0.0005870664335664335,
+      "loss": 4.0035,
+      "step": 3800
+    },
+    {
+      "epoch": 1.1211485817461704,
+      "grad_norm": 0.3274590075016022,
+      "learning_rate": 0.0005868916083916083,
+      "loss": 4.0167,
+      "step": 3850
+    },
+    {
+      "epoch": 1.135709709359893,
+      "grad_norm": 0.3461831510066986,
+      "learning_rate": 0.0005867167832167831,
+      "loss": 3.99,
+      "step": 3900
+    },
+    {
+      "epoch": 1.1502708369736152,
+      "grad_norm": 0.3442121148109436,
+      "learning_rate": 0.000586541958041958,
+      "loss": 3.9825,
+      "step": 3950
+    },
+    {
+      "epoch": 1.1648319645873377,
+      "grad_norm": 0.3337996006011963,
+      "learning_rate": 0.0005863671328671328,
+      "loss": 3.9794,
+      "step": 4000
+    },
+    {
+      "epoch": 1.1648319645873377,
+      "eval_accuracy": 0.32491283320475906,
+      "eval_loss": 3.989677906036377,
+      "eval_runtime": 179.7843,
+      "eval_samples_per_second": 92.589,
+      "eval_steps_per_second": 5.79,
+      "step": 4000
+    },
+    {
+      "epoch": 1.17939309220106,
+      "grad_norm": 0.33036714792251587,
+      "learning_rate": 0.0005861923076923076,
+      "loss": 3.9821,
+      "step": 4050
+    },
+    {
+      "epoch": 1.1939542198147826,
+      "grad_norm": 0.33033114671707153,
+      "learning_rate": 0.0005860174825174824,
+      "loss": 3.9925,
+      "step": 4100
+    },
+    {
+      "epoch": 1.2085153474285049,
+      "grad_norm": 0.3445809781551361,
+      "learning_rate": 0.0005858426573426573,
+      "loss": 3.9873,
+      "step": 4150
+    },
+    {
+      "epoch": 1.2230764750422272,
+      "grad_norm": 0.32692384719848633,
+      "learning_rate": 0.000585667832167832,
+      "loss": 3.9814,
+      "step": 4200
+    },
+    {
+      "epoch": 1.2376376026559497,
+      "grad_norm": 0.3487424850463867,
+      "learning_rate": 0.000585493006993007,
+      "loss": 3.9712,
+      "step": 4250
+    },
+    {
+      "epoch": 1.2521987302696722,
+      "grad_norm": 0.345749169588089,
+      "learning_rate": 0.0005853181818181817,
+      "loss": 3.9784,
+      "step": 4300
+    },
+    {
+      "epoch": 1.2667598578833945,
+      "grad_norm": 0.36335498094558716,
+      "learning_rate": 0.0005851433566433565,
+      "loss": 3.9808,
+      "step": 4350
+    },
+    {
+      "epoch": 1.2813209854971168,
+      "grad_norm": 0.31872642040252686,
+      "learning_rate": 0.0005849685314685315,
+      "loss": 3.9746,
+      "step": 4400
+    },
+    {
+      "epoch": 1.2958821131108393,
+      "grad_norm": 0.357146680355072,
+      "learning_rate": 0.0005847937062937063,
+      "loss": 3.9645,
+      "step": 4450
+    },
+    {
+      "epoch": 1.3104432407245616,
+      "grad_norm": 0.325870543718338,
+      "learning_rate": 0.0005846188811188811,
+      "loss": 3.9639,
+      "step": 4500
+    },
+    {
+      "epoch": 1.3250043683382842,
+      "grad_norm": 0.3136429488658905,
+      "learning_rate": 0.0005844440559440559,
+      "loss": 3.9582,
+      "step": 4550
+    },
+    {
+      "epoch": 1.3395654959520065,
+      "grad_norm": 0.35432639718055725,
+      "learning_rate": 0.0005842692307692308,
+      "loss": 3.9456,
+      "step": 4600
+    },
+    {
+      "epoch": 1.354126623565729,
+      "grad_norm": 0.3514183759689331,
+      "learning_rate": 0.0005840944055944056,
+      "loss": 3.9475,
+      "step": 4650
+    },
+    {
+      "epoch": 1.3686877511794513,
+      "grad_norm": 0.33868497610092163,
+      "learning_rate": 0.0005839195804195804,
+      "loss": 3.9486,
+      "step": 4700
+    },
+    {
+      "epoch": 1.3832488787931738,
+      "grad_norm": 0.3391216993331909,
+      "learning_rate": 0.0005837447552447552,
+      "loss": 3.9525,
+      "step": 4750
+    },
+    {
+      "epoch": 1.3978100064068961,
+      "grad_norm": 0.34010815620422363,
+      "learning_rate": 0.0005835699300699301,
+      "loss": 3.947,
+      "step": 4800
+    },
+    {
+      "epoch": 1.4123711340206184,
+      "grad_norm": 0.3243875205516815,
+      "learning_rate": 0.0005833951048951048,
+      "loss": 3.9515,
+      "step": 4850
+    },
+    {
+      "epoch": 1.426932261634341,
+      "grad_norm": 0.35085731744766235,
+      "learning_rate": 0.0005832202797202797,
+      "loss": 3.9402,
+      "step": 4900
+    },
+    {
+      "epoch": 1.4414933892480635,
+      "grad_norm": 0.34375637769699097,
+      "learning_rate": 0.0005830454545454546,
+      "loss": 3.9424,
+      "step": 4950
+    },
+    {
+      "epoch": 1.4560545168617858,
+      "grad_norm": 0.3360918164253235,
+      "learning_rate": 0.0005828706293706293,
+      "loss": 3.946,
+      "step": 5000
+    },
+    {
+      "epoch": 1.4560545168617858,
+      "eval_accuracy": 0.3315629972163526,
+      "eval_loss": 3.9168527126312256,
+      "eval_runtime": 179.6234,
+      "eval_samples_per_second": 92.672,
+      "eval_steps_per_second": 5.795,
+      "step": 5000
+    },
+    {
+      "epoch": 1.470615644475508,
+      "grad_norm": 0.3384229838848114,
+      "learning_rate": 0.0005826958041958042,
+      "loss": 3.9282,
+      "step": 5050
+    },
+    {
+      "epoch": 1.4851767720892306,
+      "grad_norm": 0.3160015642642975,
+      "learning_rate": 0.000582520979020979,
+      "loss": 3.9223,
+      "step": 5100
+    },
+    {
+      "epoch": 1.4997378997029531,
+      "grad_norm": 0.31337279081344604,
+      "learning_rate": 0.0005823461538461538,
+      "loss": 3.9139,
+      "step": 5150
+    },
+    {
+      "epoch": 1.5142990273166754,
+      "grad_norm": 0.3430108428001404,
+      "learning_rate": 0.0005821713286713286,
+      "loss": 3.9192,
+      "step": 5200
+    },
+    {
+      "epoch": 1.5288601549303977,
+      "grad_norm": 0.32244783639907837,
+      "learning_rate": 0.0005819965034965035,
+      "loss": 3.9181,
+      "step": 5250
+    },
+    {
+      "epoch": 1.5434212825441203,
+      "grad_norm": 0.32754674553871155,
+      "learning_rate": 0.0005818216783216783,
+      "loss": 3.9076,
+      "step": 5300
+    },
+    {
+      "epoch": 1.5579824101578428,
+      "grad_norm": 0.3257962167263031,
+      "learning_rate": 0.0005816468531468531,
+      "loss": 3.9091,
+      "step": 5350
+    },
+    {
+      "epoch": 1.572543537771565,
+      "grad_norm": 0.319021999835968,
+      "learning_rate": 0.0005814720279720279,
+      "loss": 3.8997,
+      "step": 5400
+    },
+    {
+      "epoch": 1.5871046653852874,
+      "grad_norm": 0.34583571553230286,
+      "learning_rate": 0.0005812972027972028,
+      "loss": 3.9082,
+      "step": 5450
+    },
+    {
+      "epoch": 1.6016657929990097,
+      "grad_norm": 0.31768912076950073,
+      "learning_rate": 0.0005811223776223776,
+      "loss": 3.9119,
+      "step": 5500
+    },
+    {
+      "epoch": 1.6162269206127322,
+      "grad_norm": 0.30981358885765076,
+      "learning_rate": 0.0005809475524475524,
+      "loss": 3.8991,
+      "step": 5550
+    },
+    {
+      "epoch": 1.6307880482264547,
+      "grad_norm": 0.3583605885505676,
+      "learning_rate": 0.0005807727272727272,
+      "loss": 3.8898,
+      "step": 5600
+    },
+    {
+      "epoch": 1.645349175840177,
+      "grad_norm": 0.35432425141334534,
+      "learning_rate": 0.0005805979020979021,
+      "loss": 3.9067,
+      "step": 5650
+    },
+    {
+      "epoch": 1.6599103034538993,
+      "grad_norm": 0.32656440138816833,
+      "learning_rate": 0.0005804230769230769,
+      "loss": 3.8878,
+      "step": 5700
+    },
+    {
+      "epoch": 1.6744714310676219,
+      "grad_norm": 0.32895249128341675,
+      "learning_rate": 0.0005802482517482517,
+      "loss": 3.8858,
+      "step": 5750
+    },
+    {
+      "epoch": 1.6890325586813444,
+      "grad_norm": 0.3573879897594452,
+      "learning_rate": 0.0005800734265734265,
+      "loss": 3.8995,
+      "step": 5800
+    },
+    {
+      "epoch": 1.7035936862950667,
+      "grad_norm": 0.3116515278816223,
+      "learning_rate": 0.0005798986013986013,
+      "loss": 3.8855,
+      "step": 5850
+    },
+    {
+      "epoch": 1.718154813908789,
+      "grad_norm": 0.32921165227890015,
+      "learning_rate": 0.0005797237762237762,
+      "loss": 3.8858,
+      "step": 5900
+    },
+    {
+      "epoch": 1.7327159415225115,
+      "grad_norm": 0.32322996854782104,
+      "learning_rate": 0.000579548951048951,
+      "loss": 3.8747,
+      "step": 5950
+    },
+    {
+      "epoch": 1.747277069136234,
+      "grad_norm": 0.3198484778404236,
+      "learning_rate": 0.0005793741258741258,
+      "loss": 3.8796,
+      "step": 6000
+    },
+    {
+      "epoch": 1.747277069136234,
+      "eval_accuracy": 0.33665428105410394,
+      "eval_loss": 3.859868049621582,
+      "eval_runtime": 179.7598,
+      "eval_samples_per_second": 92.601,
+      "eval_steps_per_second": 5.791,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7618381967499563,
+      "grad_norm": 0.32858818769454956,
+      "learning_rate": 0.0005791993006993006,
+      "loss": 3.8737,
+      "step": 6050
+    },
+    {
+      "epoch": 1.7763993243636786,
+      "grad_norm": 0.31307506561279297,
+      "learning_rate": 0.0005790244755244755,
+      "loss": 3.8731,
+      "step": 6100
+    },
+    {
+      "epoch": 1.7909604519774012,
+      "grad_norm": 0.32378000020980835,
+      "learning_rate": 0.0005788496503496503,
+      "loss": 3.8751,
+      "step": 6150
+    },
+    {
+      "epoch": 1.8055215795911237,
+      "grad_norm": 0.3218482434749603,
+      "learning_rate": 0.0005786748251748251,
+      "loss": 3.8731,
+      "step": 6200
+    },
+    {
+      "epoch": 1.820082707204846,
+      "grad_norm": 0.3510587215423584,
+      "learning_rate": 0.0005784999999999999,
+      "loss": 3.8621,
+      "step": 6250
+    },
+    {
+      "epoch": 1.8346438348185683,
+      "grad_norm": 0.32646113634109497,
+      "learning_rate": 0.0005783251748251748,
+      "loss": 3.8652,
+      "step": 6300
+    },
+    {
+      "epoch": 1.8492049624322906,
+      "grad_norm": 0.34067031741142273,
+      "learning_rate": 0.0005781503496503496,
+      "loss": 3.8638,
+      "step": 6350
+    },
+    {
+      "epoch": 1.8637660900460131,
+      "grad_norm": 0.327680766582489,
+      "learning_rate": 0.0005779755244755244,
+      "loss": 3.8617,
+      "step": 6400
+    },
+    {
+      "epoch": 1.8783272176597356,
+      "grad_norm": 0.31625163555145264,
+      "learning_rate": 0.0005778006993006993,
+      "loss": 3.8561,
+      "step": 6450
+    },
+    {
+      "epoch": 1.892888345273458,
+      "grad_norm": 0.312741219997406,
+      "learning_rate": 0.000577625874125874,
+      "loss": 3.842,
+      "step": 6500
+    },
+    {
+      "epoch": 1.9074494728871803,
+      "grad_norm": 0.32632362842559814,
+      "learning_rate": 0.0005774510489510489,
+      "loss": 3.8528,
+      "step": 6550
+    },
+    {
+      "epoch": 1.9220106005009028,
+      "grad_norm": 0.32156306505203247,
+      "learning_rate": 0.0005772762237762237,
+      "loss": 3.8587,
+      "step": 6600
+    },
+    {
+      "epoch": 1.9365717281146253,
+      "grad_norm": 0.3177630305290222,
+      "learning_rate": 0.0005771013986013985,
+      "loss": 3.8592,
+      "step": 6650
+    },
+    {
+      "epoch": 1.9511328557283476,
+      "grad_norm": 0.3381432890892029,
+      "learning_rate": 0.0005769265734265733,
+      "loss": 3.8487,
+      "step": 6700
+    },
+    {
+      "epoch": 1.96569398334207,
+      "grad_norm": 0.31193795800209045,
+      "learning_rate": 0.0005767517482517482,
+      "loss": 3.8599,
+      "step": 6750
+    },
+    {
+      "epoch": 1.9802551109557924,
+      "grad_norm": 0.33586713671684265,
+      "learning_rate": 0.000576576923076923,
+      "loss": 3.8439,
+      "step": 6800
+    },
+    {
+      "epoch": 1.994816238569515,
+      "grad_norm": 0.3259575068950653,
+      "learning_rate": 0.0005764020979020978,
+      "loss": 3.8482,
+      "step": 6850
+    },
+    {
+      "epoch": 2.009319121672782,
+      "grad_norm": 0.3125501275062561,
+      "learning_rate": 0.0005762272727272726,
+      "loss": 3.7815,
+      "step": 6900
+    },
+    {
+      "epoch": 2.023880249286505,
+      "grad_norm": 0.3336809575557709,
+      "learning_rate": 0.0005760524475524475,
+      "loss": 3.7473,
+      "step": 6950
+    },
+    {
+      "epoch": 2.038441376900227,
+      "grad_norm": 0.3166639804840088,
+      "learning_rate": 0.0005758776223776223,
+      "loss": 3.7474,
+      "step": 7000
+    },
+    {
+      "epoch": 2.038441376900227,
+      "eval_accuracy": 0.34135384628406934,
+      "eval_loss": 3.8145618438720703,
+      "eval_runtime": 179.8334,
+      "eval_samples_per_second": 92.563,
+      "eval_steps_per_second": 5.789,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0530025045139495,
+      "grad_norm": 0.3304164409637451,
+      "learning_rate": 0.0005757027972027971,
+      "loss": 3.7559,
+      "step": 7050
+    },
+    {
+      "epoch": 2.067563632127672,
+      "grad_norm": 0.35328182578086853,
+      "learning_rate": 0.000575527972027972,
+      "loss": 3.741,
+      "step": 7100
+    },
+    {
+      "epoch": 2.0821247597413945,
+      "grad_norm": 0.3486672043800354,
+      "learning_rate": 0.0005753531468531468,
+      "loss": 3.751,
+      "step": 7150
+    },
+    {
+      "epoch": 2.096685887355117,
+      "grad_norm": 0.32075631618499756,
+      "learning_rate": 0.0005751783216783216,
+      "loss": 3.7516,
+      "step": 7200
+    },
+    {
+      "epoch": 2.111247014968839,
+      "grad_norm": 0.3235573172569275,
+      "learning_rate": 0.0005750034965034964,
+      "loss": 3.7561,
+      "step": 7250
+    },
+    {
+      "epoch": 2.1258081425825615,
+      "grad_norm": 0.32960283756256104,
+      "learning_rate": 0.0005748286713286712,
+      "loss": 3.7471,
+      "step": 7300
+    },
+    {
+      "epoch": 2.140369270196284,
+      "grad_norm": 0.3249431848526001,
+      "learning_rate": 0.000574653846153846,
+      "loss": 3.7479,
+      "step": 7350
+    },
+    {
+      "epoch": 2.1549303978100065,
+      "grad_norm": 0.32068416476249695,
+      "learning_rate": 0.000574479020979021,
+      "loss": 3.7515,
+      "step": 7400
+    },
+    {
+      "epoch": 2.169491525423729,
+      "grad_norm": 0.35874906182289124,
+      "learning_rate": 0.0005743041958041958,
+      "loss": 3.7665,
+      "step": 7450
+    },
+    {
+      "epoch": 2.184052653037451,
+      "grad_norm": 0.34327706694602966,
+      "learning_rate": 0.0005741293706293706,
+      "loss": 3.7511,
+      "step": 7500
+    },
+    {
+      "epoch": 2.198613780651174,
+      "grad_norm": 0.3151525855064392,
+      "learning_rate": 0.0005739545454545454,
+      "loss": 3.7454,
+      "step": 7550
+    },
+    {
+      "epoch": 2.213174908264896,
+      "grad_norm": 0.3023368716239929,
+      "learning_rate": 0.0005737797202797203,
+      "loss": 3.7624,
+      "step": 7600
+    },
+    {
+      "epoch": 2.2277360358786185,
+      "grad_norm": 0.3228301703929901,
+      "learning_rate": 0.0005736048951048951,
+      "loss": 3.7529,
+      "step": 7650
+    },
+    {
+      "epoch": 2.2422971634923408,
+      "grad_norm": 0.33145347237586975,
+      "learning_rate": 0.0005734300699300699,
+      "loss": 3.76,
+      "step": 7700
+    },
+    {
+      "epoch": 2.256858291106063,
+      "grad_norm": 0.31790366768836975,
+      "learning_rate": 0.0005732552447552448,
+      "loss": 3.7657,
+      "step": 7750
+    },
+    {
+      "epoch": 2.271419418719786,
+      "grad_norm": 0.32009178400039673,
+      "learning_rate": 0.0005730804195804196,
+      "loss": 3.7592,
+      "step": 7800
+    },
+    {
+      "epoch": 2.285980546333508,
+      "grad_norm": 0.31966885924339294,
+      "learning_rate": 0.0005729055944055944,
+      "loss": 3.7606,
+      "step": 7850
+    },
+    {
+      "epoch": 2.3005416739472304,
+      "grad_norm": 0.3291054368019104,
+      "learning_rate": 0.0005727307692307692,
+      "loss": 3.7479,
+      "step": 7900
+    },
+    {
+      "epoch": 2.3151028015609527,
+      "grad_norm": 0.33194002509117126,
+      "learning_rate": 0.0005725559440559441,
+      "loss": 3.757,
+      "step": 7950
+    },
+    {
+      "epoch": 2.3296639291746755,
+      "grad_norm": 0.30678218603134155,
+      "learning_rate": 0.0005723811188811188,
+      "loss": 3.7545,
+      "step": 8000
+    },
+    {
+      "epoch": 2.3296639291746755,
+      "eval_accuracy": 0.34440224469340025,
+      "eval_loss": 3.782811164855957,
+      "eval_runtime": 179.7459,
+      "eval_samples_per_second": 92.609,
+      "eval_steps_per_second": 5.792,
+      "step": 8000
+    },
+    {
+      "epoch": 2.3442250567883978,
+      "grad_norm": 0.31450313329696655,
+      "learning_rate": 0.0005722062937062937,
+      "loss": 3.7648,
+      "step": 8050
+    },
+    {
+      "epoch": 2.35878618440212,
+      "grad_norm": 0.3125315308570862,
+      "learning_rate": 0.0005720314685314685,
+      "loss": 3.7461,
+      "step": 8100
+    },
+    {
+      "epoch": 2.3733473120158424,
+      "grad_norm": 0.3463304936885834,
+      "learning_rate": 0.0005718566433566433,
+      "loss": 3.7542,
+      "step": 8150
+    },
+    {
+      "epoch": 2.387908439629565,
+      "grad_norm": 0.3375414311885834,
+      "learning_rate": 0.0005716818181818181,
+      "loss": 3.7424,
+      "step": 8200
+    },
+    {
+      "epoch": 2.4024695672432874,
+      "grad_norm": 0.3216915428638458,
+      "learning_rate": 0.000571506993006993,
+      "loss": 3.7559,
+      "step": 8250
+    },
+    {
+      "epoch": 2.4170306948570097,
+      "grad_norm": 0.37400275468826294,
+      "learning_rate": 0.0005713321678321678,
+      "loss": 3.7556,
+      "step": 8300
+    },
+    {
+      "epoch": 2.431591822470732,
+      "grad_norm": 0.3273051977157593,
+      "learning_rate": 0.0005711573426573426,
+      "loss": 3.7541,
+      "step": 8350
+    },
+    {
+      "epoch": 2.4461529500844543,
+      "grad_norm": 0.31118476390838623,
+      "learning_rate": 0.0005709825174825175,
+      "loss": 3.7479,
+      "step": 8400
+    },
+    {
+      "epoch": 2.460714077698177,
+      "grad_norm": 0.33436667919158936,
+      "learning_rate": 0.0005708076923076923,
+      "loss": 3.7398,
+      "step": 8450
+    },
+    {
+      "epoch": 2.4752752053118994,
+      "grad_norm": 0.32443201541900635,
+      "learning_rate": 0.0005706328671328671,
+      "loss": 3.7483,
+      "step": 8500
+    },
+    {
+      "epoch": 2.4898363329256217,
+      "grad_norm": 0.3430940806865692,
+      "learning_rate": 0.0005704580419580419,
+      "loss": 3.75,
+      "step": 8550
+    },
+    {
+      "epoch": 2.5043974605393444,
+      "grad_norm": 0.31686174869537354,
+      "learning_rate": 0.0005702832167832168,
+      "loss": 3.7418,
+      "step": 8600
+    },
+    {
+      "epoch": 2.5189585881530667,
+      "grad_norm": 0.3173408508300781,
+      "learning_rate": 0.0005701083916083916,
+      "loss": 3.7437,
+      "step": 8650
+    },
+    {
+      "epoch": 2.533519715766789,
+      "grad_norm": 0.3175743818283081,
+      "learning_rate": 0.0005699335664335664,
+      "loss": 3.7417,
+      "step": 8700
+    },
+    {
+      "epoch": 2.5480808433805113,
+      "grad_norm": 0.3153781592845917,
+      "learning_rate": 0.0005697587412587412,
+      "loss": 3.7459,
+      "step": 8750
+    },
+    {
+      "epoch": 2.5626419709942336,
+      "grad_norm": 0.3198295831680298,
+      "learning_rate": 0.000569583916083916,
+      "loss": 3.7524,
+      "step": 8800
+    },
+    {
+      "epoch": 2.5772030986079564,
+      "grad_norm": 0.31497374176979065,
+      "learning_rate": 0.0005694090909090908,
+      "loss": 3.7366,
+      "step": 8850
+    },
+    {
+      "epoch": 2.5917642262216787,
+      "grad_norm": 0.3190245032310486,
+      "learning_rate": 0.0005692342657342657,
+      "loss": 3.7408,
+      "step": 8900
+    },
+    {
+      "epoch": 2.606325353835401,
+      "grad_norm": 0.3084900975227356,
+      "learning_rate": 0.0005690594405594405,
+      "loss": 3.7355,
+      "step": 8950
+    },
+    {
+      "epoch": 2.6208864814491233,
+      "grad_norm": 0.3053756356239319,
+      "learning_rate": 0.0005688846153846153,
+      "loss": 3.7487,
+      "step": 9000
+    },
+    {
+      "epoch": 2.6208864814491233,
+      "eval_accuracy": 0.34699638118781967,
+      "eval_loss": 3.7568321228027344,
+      "eval_runtime": 179.8658,
+      "eval_samples_per_second": 92.547,
+      "eval_steps_per_second": 5.788,
+      "step": 9000
+    },
+    {
+      "epoch": 2.6354476090628456,
+      "grad_norm": 0.3176893889904022,
+      "learning_rate": 0.0005687097902097901,
+      "loss": 3.7455,
+      "step": 9050
+    },
+    {
+      "epoch": 2.6500087366765683,
+      "grad_norm": 0.3208650052547455,
+      "learning_rate": 0.000568534965034965,
+      "loss": 3.7457,
+      "step": 9100
+    },
+    {
+      "epoch": 2.6645698642902906,
+      "grad_norm": 0.3182576596736908,
+      "learning_rate": 0.0005683601398601398,
+      "loss": 3.7312,
+      "step": 9150
+    },
+    {
+      "epoch": 2.679130991904013,
+      "grad_norm": 0.31629255414009094,
+      "learning_rate": 0.0005681853146853146,
+      "loss": 3.7155,
+      "step": 9200
+    },
+    {
+      "epoch": 2.6936921195177357,
+      "grad_norm": 0.33148428797721863,
+      "learning_rate": 0.0005680104895104895,
+      "loss": 3.7379,
+      "step": 9250
+    },
+    {
+      "epoch": 2.708253247131458,
+      "grad_norm": 0.3020288646221161,
+      "learning_rate": 0.0005678356643356643,
+      "loss": 3.7264,
+      "step": 9300
+    },
+    {
+      "epoch": 2.7228143747451803,
+      "grad_norm": 0.34346917271614075,
+      "learning_rate": 0.0005676608391608391,
+      "loss": 3.7374,
+      "step": 9350
+    },
+    {
+      "epoch": 2.7373755023589026,
+      "grad_norm": 0.31063133478164673,
+      "learning_rate": 0.0005674860139860139,
+      "loss": 3.7298,
+      "step": 9400
+    },
+    {
+      "epoch": 2.751936629972625,
+      "grad_norm": 0.31841859221458435,
+      "learning_rate": 0.0005673111888111888,
+      "loss": 3.7237,
+      "step": 9450
+    },
+    {
+      "epoch": 2.7664977575863476,
+      "grad_norm": 0.3212113082408905,
+      "learning_rate": 0.0005671363636363635,
+      "loss": 3.7389,
+      "step": 9500
+    },
+    {
+      "epoch": 2.78105888520007,
+      "grad_norm": 0.319784551858902,
+      "learning_rate": 0.0005669615384615384,
+      "loss": 3.7401,
+      "step": 9550
+    },
+    {
+      "epoch": 2.7956200128137922,
+      "grad_norm": 0.31253302097320557,
+      "learning_rate": 0.0005667867132867132,
+      "loss": 3.7299,
+      "step": 9600
+    },
+    {
+      "epoch": 2.8101811404275145,
+      "grad_norm": 0.3241884708404541,
+      "learning_rate": 0.000566611888111888,
+      "loss": 3.7281,
+      "step": 9650
+    },
+    {
+      "epoch": 2.824742268041237,
+      "grad_norm": 0.3327905833721161,
+      "learning_rate": 0.0005664370629370628,
+      "loss": 3.7403,
+      "step": 9700
+    },
+    {
+      "epoch": 2.8393033956549596,
+      "grad_norm": 0.33363252878189087,
+      "learning_rate": 0.0005662622377622377,
+      "loss": 3.7429,
+      "step": 9750
+    },
+    {
+      "epoch": 2.853864523268682,
+      "grad_norm": 0.3250058591365814,
+      "learning_rate": 0.0005660874125874125,
+      "loss": 3.7313,
+      "step": 9800
+    },
+    {
+      "epoch": 2.868425650882404,
+      "grad_norm": 0.3366358280181885,
+      "learning_rate": 0.0005659125874125873,
+      "loss": 3.732,
+      "step": 9850
+    },
+    {
+      "epoch": 2.882986778496127,
+      "grad_norm": 0.3395000100135803,
+      "learning_rate": 0.0005657377622377622,
+      "loss": 3.7283,
+      "step": 9900
+    },
+    {
+      "epoch": 2.8975479061098492,
+      "grad_norm": 0.30396348237991333,
+      "learning_rate": 0.000565562937062937,
+      "loss": 3.7282,
+      "step": 9950
+    },
+    {
+      "epoch": 2.9121090337235715,
+      "grad_norm": 0.310280442237854,
+      "learning_rate": 0.0005653881118881118,
+      "loss": 3.7222,
+      "step": 10000
+    },
+    {
+      "epoch": 2.9121090337235715,
+      "eval_accuracy": 0.3495220962447447,
+      "eval_loss": 3.729001045227051,
+      "eval_runtime": 179.9042,
+      "eval_samples_per_second": 92.527,
+      "eval_steps_per_second": 5.786,
+      "step": 10000
+    },
+    {
+      "epoch": 2.926670161337294,
+      "grad_norm": 0.3250355124473572,
+      "learning_rate": 0.0005652132867132866,
+      "loss": 3.7166,
+      "step": 10050
+    },
+    {
+      "epoch": 2.941231288951016,
+      "grad_norm": 0.30567246675491333,
+      "learning_rate": 0.0005650384615384615,
+      "loss": 3.7325,
+      "step": 10100
+    },
+    {
+      "epoch": 2.955792416564739,
+      "grad_norm": 0.34791237115859985,
+      "learning_rate": 0.0005648636363636363,
+      "loss": 3.7056,
+      "step": 10150
+    },
+    {
+      "epoch": 2.970353544178461,
+      "grad_norm": 0.31332409381866455,
+      "learning_rate": 0.0005646888111888111,
+      "loss": 3.7251,
+      "step": 10200
+    },
+    {
+      "epoch": 2.9849146717921835,
+      "grad_norm": 0.2971247136592865,
+      "learning_rate": 0.000564513986013986,
+      "loss": 3.7126,
+      "step": 10250
+    },
+    {
+      "epoch": 2.9994757994059063,
+      "grad_norm": 0.32203900814056396,
+      "learning_rate": 0.0005643391608391607,
+      "loss": 3.7198,
+      "step": 10300
+    },
+    {
+      "epoch": 3.0139786825091734,
+      "grad_norm": 0.3143203854560852,
+      "learning_rate": 0.0005641643356643355,
+      "loss": 3.6176,
+      "step": 10350
+    },
+    {
+      "epoch": 3.0285398101228957,
+      "grad_norm": 0.33899393677711487,
+      "learning_rate": 0.0005639895104895105,
+      "loss": 3.6184,
+      "step": 10400
+    },
+    {
+      "epoch": 3.0431009377366185,
+      "grad_norm": 0.33629149198532104,
+      "learning_rate": 0.0005638146853146853,
+      "loss": 3.6218,
+      "step": 10450
+    },
+    {
+      "epoch": 3.057662065350341,
+      "grad_norm": 0.33977800607681274,
+      "learning_rate": 0.0005636398601398601,
+      "loss": 3.6169,
+      "step": 10500
+    },
+    {
+      "epoch": 3.072223192964063,
+      "grad_norm": 0.3242505192756653,
+      "learning_rate": 0.000563465034965035,
+      "loss": 3.6248,
+      "step": 10550
+    },
+    {
+      "epoch": 3.0867843205777854,
+      "grad_norm": 0.33569052815437317,
+      "learning_rate": 0.0005632902097902098,
+      "loss": 3.6438,
+      "step": 10600
+    },
+    {
+      "epoch": 3.101345448191508,
+      "grad_norm": 0.3249237835407257,
+      "learning_rate": 0.0005631153846153846,
+      "loss": 3.6286,
+      "step": 10650
+    },
+    {
+      "epoch": 3.1159065758052304,
+      "grad_norm": 0.3126699924468994,
+      "learning_rate": 0.0005629405594405594,
+      "loss": 3.6282,
+      "step": 10700
+    },
+    {
+      "epoch": 3.1304677034189528,
+      "grad_norm": 0.3072546720504761,
+      "learning_rate": 0.0005627657342657343,
+      "loss": 3.6303,
+      "step": 10750
+    },
+    {
+      "epoch": 3.145028831032675,
+      "grad_norm": 0.30215486884117126,
+      "learning_rate": 0.0005625909090909091,
+      "loss": 3.6246,
+      "step": 10800
+    },
+    {
+      "epoch": 3.1595899586463974,
+      "grad_norm": 0.30103379487991333,
+      "learning_rate": 0.0005624160839160839,
+      "loss": 3.632,
+      "step": 10850
+    },
+    {
+      "epoch": 3.17415108626012,
+      "grad_norm": 0.40593844652175903,
+      "learning_rate": 0.0005622412587412587,
+      "loss": 3.6411,
+      "step": 10900
+    },
+    {
+      "epoch": 3.1887122138738424,
+      "grad_norm": 0.30845344066619873,
+      "learning_rate": 0.0005620664335664336,
+      "loss": 3.6405,
+      "step": 10950
+    },
+    {
+      "epoch": 3.2032733414875647,
+      "grad_norm": 0.31571993231773376,
+      "learning_rate": 0.0005618916083916083,
+      "loss": 3.6434,
+      "step": 11000
+    },
+    {
+      "epoch": 3.2032733414875647,
+      "eval_accuracy": 0.3514291968616427,
+      "eval_loss": 3.715721607208252,
+      "eval_runtime": 179.7371,
+      "eval_samples_per_second": 92.613,
+      "eval_steps_per_second": 5.792,
+      "step": 11000
+    },
+    {
+      "epoch": 3.217834469101287,
+      "grad_norm": 0.3318782448768616,
+      "learning_rate": 0.0005617167832167832,
+      "loss": 3.6317,
+      "step": 11050
+    },
+    {
+      "epoch": 3.2323955967150098,
+      "grad_norm": 0.33287033438682556,
+      "learning_rate": 0.000561541958041958,
+      "loss": 3.6503,
+      "step": 11100
+    },
+    {
+      "epoch": 3.246956724328732,
+      "grad_norm": 0.3447157144546509,
+      "learning_rate": 0.0005613671328671328,
+      "loss": 3.6464,
+      "step": 11150
+    },
+    {
+      "epoch": 3.2615178519424544,
+      "grad_norm": 0.31866371631622314,
+      "learning_rate": 0.0005611923076923077,
+      "loss": 3.6459,
+      "step": 11200
+    },
+    {
+      "epoch": 3.2760789795561767,
+      "grad_norm": 0.3190111517906189,
+      "learning_rate": 0.0005610174825174825,
+      "loss": 3.6375,
+      "step": 11250
+    },
+    {
+      "epoch": 3.2906401071698994,
+      "grad_norm": 0.3384534418582916,
+      "learning_rate": 0.0005608426573426573,
+      "loss": 3.6297,
+      "step": 11300
+    },
+    {
+      "epoch": 3.3052012347836217,
+      "grad_norm": 0.3122884929180145,
+      "learning_rate": 0.0005606678321678321,
+      "loss": 3.6488,
+      "step": 11350
+    },
+    {
+      "epoch": 3.319762362397344,
+      "grad_norm": 0.3280264139175415,
+      "learning_rate": 0.000560493006993007,
+      "loss": 3.6409,
+      "step": 11400
+    },
+    {
+      "epoch": 3.3343234900110663,
+      "grad_norm": 0.3291660249233246,
+      "learning_rate": 0.0005603181818181818,
+      "loss": 3.6371,
+      "step": 11450
+    },
+    {
+      "epoch": 3.3488846176247886,
+      "grad_norm": 0.3122524619102478,
+      "learning_rate": 0.0005601433566433566,
+      "loss": 3.6437,
+      "step": 11500
+    },
+    {
+      "epoch": 3.3634457452385114,
+      "grad_norm": 0.3195066452026367,
+      "learning_rate": 0.0005599685314685314,
+      "loss": 3.654,
+      "step": 11550
+    },
+    {
+      "epoch": 3.3780068728522337,
+      "grad_norm": 0.32396697998046875,
+      "learning_rate": 0.0005597937062937063,
+      "loss": 3.6451,
+      "step": 11600
+    },
+    {
+      "epoch": 3.392568000465956,
+      "grad_norm": 0.31407713890075684,
+      "learning_rate": 0.0005596188811188811,
+      "loss": 3.6336,
+      "step": 11650
+    },
+    {
+      "epoch": 3.4071291280796787,
+      "grad_norm": 0.31519898772239685,
+      "learning_rate": 0.0005594440559440559,
+      "loss": 3.6432,
+      "step": 11700
+    },
+    {
+      "epoch": 3.421690255693401,
+      "grad_norm": 0.33295854926109314,
+      "learning_rate": 0.0005592692307692307,
+      "loss": 3.6271,
+      "step": 11750
+    },
+    {
+      "epoch": 3.4362513833071233,
+      "grad_norm": 0.3175846338272095,
+      "learning_rate": 0.0005590944055944055,
+      "loss": 3.6419,
+      "step": 11800
+    },
+    {
+      "epoch": 3.4508125109208456,
+      "grad_norm": 0.3179056942462921,
+      "learning_rate": 0.0005589195804195803,
+      "loss": 3.649,
+      "step": 11850
+    },
+    {
+      "epoch": 3.465373638534568,
+      "grad_norm": 0.31343457102775574,
+      "learning_rate": 0.0005587447552447552,
+      "loss": 3.6439,
+      "step": 11900
+    },
+    {
+      "epoch": 3.4799347661482907,
+      "grad_norm": 0.3348383903503418,
+      "learning_rate": 0.00055856993006993,
+      "loss": 3.6389,
+      "step": 11950
+    },
+    {
+      "epoch": 3.494495893762013,
+      "grad_norm": 0.33012107014656067,
+      "learning_rate": 0.0005583951048951048,
+      "loss": 3.6493,
+      "step": 12000
+    },
+    {
+      "epoch": 3.494495893762013,
+      "eval_accuracy": 0.3528794491862669,
+      "eval_loss": 3.7000977993011475,
+      "eval_runtime": 179.7672,
+      "eval_samples_per_second": 92.598,
+      "eval_steps_per_second": 5.791,
+      "step": 12000
+    },
+    {
+      "epoch": 3.5090570213757353,
+      "grad_norm": 0.30933046340942383,
+      "learning_rate": 0.0005582202797202797,
+      "loss": 3.6413,
+      "step": 12050
+    },
+    {
+      "epoch": 3.523618148989458,
+      "grad_norm": 0.3057238757610321,
+      "learning_rate": 0.0005580454545454545,
+      "loss": 3.6378,
+      "step": 12100
+    },
+    {
+      "epoch": 3.53817927660318,
+      "grad_norm": 0.3380361497402191,
+      "learning_rate": 0.0005578706293706293,
+      "loss": 3.6462,
+      "step": 12150
+    },
+    {
+      "epoch": 3.5527404042169026,
+      "grad_norm": 0.32907187938690186,
+      "learning_rate": 0.0005576958041958041,
+      "loss": 3.6464,
+      "step": 12200
+    },
+    {
+      "epoch": 3.567301531830625,
+      "grad_norm": 0.3162597417831421,
+      "learning_rate": 0.000557520979020979,
+      "loss": 3.656,
+      "step": 12250
+    },
+    {
+      "epoch": 3.5818626594443472,
+      "grad_norm": 0.3106593191623688,
+      "learning_rate": 0.0005573461538461538,
+      "loss": 3.6515,
+      "step": 12300
+    },
+    {
+      "epoch": 3.59642378705807,
+      "grad_norm": 0.29408252239227295,
+      "learning_rate": 0.0005571713286713286,
+      "loss": 3.6489,
+      "step": 12350
+    },
+    {
+      "epoch": 3.6109849146717923,
+      "grad_norm": 0.3639216721057892,
+      "learning_rate": 0.0005569965034965034,
+      "loss": 3.6424,
+      "step": 12400
+    },
+    {
+      "epoch": 3.6255460422855146,
+      "grad_norm": 0.31863993406295776,
+      "learning_rate": 0.0005568216783216783,
+      "loss": 3.6446,
+      "step": 12450
+    },
+    {
+      "epoch": 3.640107169899237,
+      "grad_norm": 0.3066108822822571,
+      "learning_rate": 0.000556646853146853,
+      "loss": 3.6432,
+      "step": 12500
+    },
+    {
+      "epoch": 3.654668297512959,
+      "grad_norm": 0.30826711654663086,
+      "learning_rate": 0.0005564720279720279,
+      "loss": 3.6457,
+      "step": 12550
+    },
+    {
+      "epoch": 3.669229425126682,
+      "grad_norm": 0.3210170567035675,
+      "learning_rate": 0.0005562972027972027,
+      "loss": 3.6411,
+      "step": 12600
+    },
+    {
+      "epoch": 3.6837905527404042,
+      "grad_norm": 0.31402987241744995,
+      "learning_rate": 0.0005561223776223775,
+      "loss": 3.6542,
+      "step": 12650
+    },
+    {
+      "epoch": 3.6983516803541265,
+      "grad_norm": 0.33224406838417053,
+      "learning_rate": 0.0005559475524475524,
+      "loss": 3.6385,
+      "step": 12700
+    },
+    {
+      "epoch": 3.7129128079678493,
+      "grad_norm": 0.3081912398338318,
+      "learning_rate": 0.0005557727272727272,
+      "loss": 3.6361,
+      "step": 12750
+    },
+    {
+      "epoch": 3.7274739355815716,
+      "grad_norm": 0.31198635697364807,
+      "learning_rate": 0.000555597902097902,
+      "loss": 3.6456,
+      "step": 12800
+    },
+    {
+      "epoch": 3.742035063195294,
+      "grad_norm": 0.31249940395355225,
+      "learning_rate": 0.0005554230769230768,
+      "loss": 3.6301,
+      "step": 12850
+    },
+    {
+      "epoch": 3.756596190809016,
+      "grad_norm": 0.29419270157814026,
+      "learning_rate": 0.0005552482517482517,
+      "loss": 3.6319,
+      "step": 12900
+    },
+    {
+      "epoch": 3.7711573184227385,
+      "grad_norm": 0.3123679459095001,
+      "learning_rate": 0.0005550734265734265,
+      "loss": 3.6439,
+      "step": 12950
+    },
+    {
+      "epoch": 3.7857184460364612,
+      "grad_norm": 0.3085649013519287,
+      "learning_rate": 0.0005548986013986013,
+      "loss": 3.6429,
+      "step": 13000
+    },
+    {
+      "epoch": 3.7857184460364612,
+      "eval_accuracy": 0.3545970388800704,
+      "eval_loss": 3.6809747219085693,
+      "eval_runtime": 179.6902,
+      "eval_samples_per_second": 92.637,
+      "eval_steps_per_second": 5.793,
+      "step": 13000
+    },
+    {
+      "epoch": 3.8002795736501835,
+      "grad_norm": 0.3226883113384247,
+      "learning_rate": 0.0005547237762237761,
+      "loss": 3.644,
+      "step": 13050
+    },
+    {
+      "epoch": 3.814840701263906,
+      "grad_norm": 0.32543593645095825,
+      "learning_rate": 0.000554548951048951,
+      "loss": 3.6455,
+      "step": 13100
+    },
+    {
+      "epoch": 3.829401828877628,
+      "grad_norm": 0.313363254070282,
+      "learning_rate": 0.0005543741258741258,
+      "loss": 3.647,
+      "step": 13150
+    },
+    {
+      "epoch": 3.8439629564913504,
+      "grad_norm": 0.3085945248603821,
+      "learning_rate": 0.0005541993006993006,
+      "loss": 3.6409,
+      "step": 13200
+    },
+    {
+      "epoch": 3.858524084105073,
+      "grad_norm": 0.32422712445259094,
+      "learning_rate": 0.0005540244755244756,
+      "loss": 3.6415,
+      "step": 13250
+    },
+    {
+      "epoch": 3.8730852117187955,
+      "grad_norm": 0.31334224343299866,
+      "learning_rate": 0.0005538496503496502,
+      "loss": 3.6376,
+      "step": 13300
+    },
+    {
+      "epoch": 3.887646339332518,
+      "grad_norm": 0.3215864598751068,
+      "learning_rate": 0.0005536748251748252,
+      "loss": 3.6382,
+      "step": 13350
+    },
+    {
+      "epoch": 3.9022074669462405,
+      "grad_norm": 0.32258346676826477,
+      "learning_rate": 0.0005535,
+      "loss": 3.6334,
+      "step": 13400
+    },
+    {
+      "epoch": 3.916768594559963,
+      "grad_norm": 0.32085853815078735,
+      "learning_rate": 0.0005533251748251748,
+      "loss": 3.6264,
+      "step": 13450
+    },
+    {
+      "epoch": 3.931329722173685,
+      "grad_norm": 0.30639684200286865,
+      "learning_rate": 0.0005531503496503496,
+      "loss": 3.6552,
+      "step": 13500
+    },
+    {
+      "epoch": 3.9458908497874075,
+      "grad_norm": 0.31769323348999023,
+      "learning_rate": 0.0005529755244755245,
+      "loss": 3.6226,
+      "step": 13550
+    },
+    {
+      "epoch": 3.9604519774011298,
+      "grad_norm": 0.31194061040878296,
+      "learning_rate": 0.0005528006993006993,
+      "loss": 3.6418,
+      "step": 13600
+    },
+    {
+      "epoch": 3.9750131050148525,
+      "grad_norm": 0.326402485370636,
+      "learning_rate": 0.0005526258741258741,
+      "loss": 3.6404,
+      "step": 13650
+    },
+    {
+      "epoch": 3.989574232628575,
+      "grad_norm": 0.3246409595012665,
+      "learning_rate": 0.0005524510489510489,
+      "loss": 3.6294,
+      "step": 13700
+    },
+    {
+      "epoch": 4.004077115731842,
+      "grad_norm": 0.32423749566078186,
+      "learning_rate": 0.0005522762237762238,
+      "loss": 3.6146,
+      "step": 13750
+    },
+    {
+      "epoch": 4.018638243345564,
+      "grad_norm": 0.311954140663147,
+      "learning_rate": 0.0005521013986013986,
+      "loss": 3.5222,
+      "step": 13800
+    },
+    {
+      "epoch": 4.033199370959287,
+      "grad_norm": 0.31119635701179504,
+      "learning_rate": 0.0005519265734265734,
+      "loss": 3.5373,
+      "step": 13850
+    },
+    {
+      "epoch": 4.04776049857301,
+      "grad_norm": 0.31895068287849426,
+      "learning_rate": 0.0005517517482517482,
+      "loss": 3.5214,
+      "step": 13900
+    },
+    {
+      "epoch": 4.062321626186732,
+      "grad_norm": 0.34818094968795776,
+      "learning_rate": 0.0005515769230769231,
+      "loss": 3.5465,
+      "step": 13950
+    },
+    {
+      "epoch": 4.076882753800454,
+      "grad_norm": 0.33164742588996887,
+      "learning_rate": 0.0005514020979020979,
+      "loss": 3.5427,
+      "step": 14000
+    },
+    {
+      "epoch": 4.076882753800454,
+      "eval_accuracy": 0.3561751993215227,
+      "eval_loss": 3.6737630367279053,
+      "eval_runtime": 179.6974,
+      "eval_samples_per_second": 92.633,
+      "eval_steps_per_second": 5.793,
+      "step": 14000
+    },
+    {
+      "epoch": 4.091443881414177,
+      "grad_norm": 0.3311789333820343,
+      "learning_rate": 0.0005512272727272727,
+      "loss": 3.5457,
+      "step": 14050
+    },
+    {
+      "epoch": 4.106005009027899,
+      "grad_norm": 0.3225516378879547,
+      "learning_rate": 0.0005510524475524475,
+      "loss": 3.5393,
+      "step": 14100
+    },
+    {
+      "epoch": 4.120566136641622,
+      "grad_norm": 0.3110713064670563,
+      "learning_rate": 0.0005508776223776223,
+      "loss": 3.5559,
+      "step": 14150
+    },
+    {
+      "epoch": 4.135127264255344,
+      "grad_norm": 0.32352516055107117,
+      "learning_rate": 0.0005507027972027972,
+      "loss": 3.557,
+      "step": 14200
+    },
+    {
+      "epoch": 4.149688391869066,
+      "grad_norm": 0.32771018147468567,
+      "learning_rate": 0.000550527972027972,
+      "loss": 3.5614,
+      "step": 14250
+    },
+    {
+      "epoch": 4.164249519482789,
+      "grad_norm": 0.3170819580554962,
+      "learning_rate": 0.0005503531468531468,
+      "loss": 3.5519,
+      "step": 14300
+    },
+    {
+      "epoch": 4.178810647096511,
+      "grad_norm": 0.3334265947341919,
+      "learning_rate": 0.0005501783216783216,
+      "loss": 3.5502,
+      "step": 14350
+    },
+    {
+      "epoch": 4.193371774710234,
+      "grad_norm": 0.30677902698516846,
+      "learning_rate": 0.0005500034965034965,
+      "loss": 3.574,
+      "step": 14400
+    },
+    {
+      "epoch": 4.207932902323956,
+      "grad_norm": 0.33088985085487366,
+      "learning_rate": 0.0005498286713286713,
+      "loss": 3.5655,
+      "step": 14450
+    },
+    {
+      "epoch": 4.222494029937678,
+      "grad_norm": 0.31959256529808044,
+      "learning_rate": 0.0005496538461538461,
+      "loss": 3.5559,
+      "step": 14500
+    },
+    {
+      "epoch": 4.237055157551401,
+      "grad_norm": 0.31475120782852173,
+      "learning_rate": 0.0005494790209790209,
+      "loss": 3.559,
+      "step": 14550
+    },
+    {
+      "epoch": 4.251616285165123,
+      "grad_norm": 0.3372187912464142,
+      "learning_rate": 0.0005493041958041958,
+      "loss": 3.568,
+      "step": 14600
+    },
+    {
+      "epoch": 4.266177412778846,
+      "grad_norm": 0.3159469962120056,
+      "learning_rate": 0.0005491293706293706,
+      "loss": 3.5742,
+      "step": 14650
+    },
+    {
+      "epoch": 4.280738540392568,
+      "grad_norm": 0.34496167302131653,
+      "learning_rate": 0.0005489545454545454,
+      "loss": 3.569,
+      "step": 14700
+    },
+    {
+      "epoch": 4.29529966800629,
+      "grad_norm": 0.3201475441455841,
+      "learning_rate": 0.0005487797202797203,
+      "loss": 3.573,
+      "step": 14750
+    },
+    {
+      "epoch": 4.309860795620013,
+      "grad_norm": 0.3239315450191498,
+      "learning_rate": 0.000548604895104895,
+      "loss": 3.577,
+      "step": 14800
+    },
+    {
+      "epoch": 4.324421923233735,
+      "grad_norm": 0.30931442975997925,
+      "learning_rate": 0.0005484300699300699,
+      "loss": 3.5692,
+      "step": 14850
+    },
+    {
+      "epoch": 4.338983050847458,
+      "grad_norm": 0.3285701870918274,
+      "learning_rate": 0.0005482552447552447,
+      "loss": 3.566,
+      "step": 14900
+    },
+    {
+      "epoch": 4.35354417846118,
+      "grad_norm": 0.325842022895813,
+      "learning_rate": 0.0005480804195804195,
+      "loss": 3.5647,
+      "step": 14950
+    },
+    {
+      "epoch": 4.368105306074902,
+      "grad_norm": 0.3167710304260254,
+      "learning_rate": 0.0005479055944055943,
+      "loss": 3.5735,
+      "step": 15000
+    },
+    {
+      "epoch": 4.368105306074902,
+      "eval_accuracy": 0.3571037087946,
+      "eval_loss": 3.659233570098877,
+      "eval_runtime": 179.796,
+      "eval_samples_per_second": 92.583,
+      "eval_steps_per_second": 5.79,
+      "step": 15000
+    },
+    {
+      "epoch": 4.382666433688625,
+      "grad_norm": 0.3091343939304352,
+      "learning_rate": 0.0005477307692307692,
+      "loss": 3.5822,
+      "step": 15050
+    },
+    {
+      "epoch": 4.397227561302348,
+      "grad_norm": 0.33039334416389465,
+      "learning_rate": 0.000547555944055944,
+      "loss": 3.586,
+      "step": 15100
+    },
+    {
+      "epoch": 4.41178868891607,
+      "grad_norm": 0.30892929434776306,
+      "learning_rate": 0.0005473811188811188,
+      "loss": 3.5716,
+      "step": 15150
+    },
+    {
+      "epoch": 4.426349816529792,
+      "grad_norm": 0.3354114592075348,
+      "learning_rate": 0.0005472062937062936,
+      "loss": 3.5646,
+      "step": 15200
+    },
+    {
+      "epoch": 4.440910944143514,
+      "grad_norm": 0.3432832360267639,
+      "learning_rate": 0.0005470314685314685,
+      "loss": 3.5779,
+      "step": 15250
+    },
+    {
+      "epoch": 4.455472071757237,
+      "grad_norm": 0.3167623281478882,
+      "learning_rate": 0.0005468566433566433,
+      "loss": 3.5657,
+      "step": 15300
+    },
+    {
+      "epoch": 4.47003319937096,
+      "grad_norm": 0.3280886113643646,
+      "learning_rate": 0.0005466818181818181,
+      "loss": 3.5732,
+      "step": 15350
+    },
+    {
+      "epoch": 4.4845943269846815,
+      "grad_norm": 0.3291832208633423,
+      "learning_rate": 0.000546506993006993,
+      "loss": 3.5683,
+      "step": 15400
+    },
+    {
+      "epoch": 4.499155454598404,
+      "grad_norm": 0.31101885437965393,
+      "learning_rate": 0.0005463321678321678,
+      "loss": 3.5722,
+      "step": 15450
+    },
+    {
+      "epoch": 4.513716582212126,
+      "grad_norm": 0.3118363320827484,
+      "learning_rate": 0.0005461573426573426,
+      "loss": 3.5855,
+      "step": 15500
+    },
+    {
+      "epoch": 4.528277709825849,
+      "grad_norm": 0.31627270579338074,
+      "learning_rate": 0.0005459825174825174,
+      "loss": 3.5871,
+      "step": 15550
+    },
+    {
+      "epoch": 4.542838837439572,
+      "grad_norm": 0.32284530997276306,
+      "learning_rate": 0.0005458076923076922,
+      "loss": 3.5754,
+      "step": 15600
+    },
+    {
+      "epoch": 4.5573999650532935,
+      "grad_norm": 0.32503610849380493,
+      "learning_rate": 0.000545632867132867,
+      "loss": 3.569,
+      "step": 15650
+    },
+    {
+      "epoch": 4.571961092667016,
+      "grad_norm": 0.3345843553543091,
+      "learning_rate": 0.0005454580419580419,
+      "loss": 3.566,
+      "step": 15700
+    },
+    {
+      "epoch": 4.586522220280738,
+      "grad_norm": 0.31699925661087036,
+      "learning_rate": 0.0005452832167832167,
+      "loss": 3.5757,
+      "step": 15750
+    },
+    {
+      "epoch": 4.601083347894461,
+      "grad_norm": 0.3411146402359009,
+      "learning_rate": 0.0005451083916083915,
+      "loss": 3.5894,
+      "step": 15800
+    },
+    {
+      "epoch": 4.615644475508184,
+      "grad_norm": 0.31675615906715393,
+      "learning_rate": 0.0005449335664335663,
+      "loss": 3.5752,
+      "step": 15850
+    },
+    {
+      "epoch": 4.630205603121905,
+      "grad_norm": 0.3413219153881073,
+      "learning_rate": 0.0005447587412587412,
+      "loss": 3.5711,
+      "step": 15900
+    },
+    {
+      "epoch": 4.644766730735628,
+      "grad_norm": 0.3177620470523834,
+      "learning_rate": 0.000544583916083916,
+      "loss": 3.5798,
+      "step": 15950
+    },
+    {
+      "epoch": 4.659327858349351,
+      "grad_norm": 0.31724312901496887,
+      "learning_rate": 0.0005444090909090908,
+      "loss": 3.5796,
+      "step": 16000
+    },
+    {
+      "epoch": 4.659327858349351,
+      "eval_accuracy": 0.35869268499593115,
+      "eval_loss": 3.648486375808716,
+      "eval_runtime": 179.9066,
+      "eval_samples_per_second": 92.526,
+      "eval_steps_per_second": 5.786,
+      "step": 16000
+    },
+    {
+      "epoch": 4.673888985963073,
+      "grad_norm": 0.32944586873054504,
+      "learning_rate": 0.0005442342657342657,
+      "loss": 3.5742,
+      "step": 16050
+    },
+    {
+      "epoch": 4.6884501135767955,
+      "grad_norm": 0.320095956325531,
+      "learning_rate": 0.0005440594405594405,
+      "loss": 3.5843,
+      "step": 16100
+    },
+    {
+      "epoch": 4.703011241190518,
+      "grad_norm": 0.3284047245979309,
+      "learning_rate": 0.0005438846153846153,
+      "loss": 3.566,
+      "step": 16150
+    },
+    {
+      "epoch": 4.71757236880424,
+      "grad_norm": 0.338379830121994,
+      "learning_rate": 0.0005437097902097901,
+      "loss": 3.5667,
+      "step": 16200
+    },
+    {
+      "epoch": 4.732133496417963,
+      "grad_norm": 0.3109598159790039,
+      "learning_rate": 0.0005435349650349651,
+      "loss": 3.5742,
+      "step": 16250
+    },
+    {
+      "epoch": 4.746694624031685,
+      "grad_norm": 0.30519962310791016,
+      "learning_rate": 0.0005433601398601397,
+      "loss": 3.5789,
+      "step": 16300
+    },
+    {
+      "epoch": 4.7612557516454075,
+      "grad_norm": 0.3150230944156647,
+      "learning_rate": 0.0005431853146853147,
+      "loss": 3.5769,
+      "step": 16350
+    },
+    {
+      "epoch": 4.77581687925913,
+      "grad_norm": 0.29910922050476074,
+      "learning_rate": 0.0005430104895104895,
+      "loss": 3.5761,
+      "step": 16400
+    },
+    {
+      "epoch": 4.790378006872852,
+      "grad_norm": 0.3157634437084198,
+      "learning_rate": 0.0005428356643356643,
+      "loss": 3.5709,
+      "step": 16450
+    },
+    {
+      "epoch": 4.804939134486575,
+      "grad_norm": 0.3214448094367981,
+      "learning_rate": 0.0005426608391608391,
+      "loss": 3.5804,
+      "step": 16500
+    },
+    {
+      "epoch": 4.819500262100297,
+      "grad_norm": 0.31892773509025574,
+      "learning_rate": 0.000542486013986014,
+      "loss": 3.5899,
+      "step": 16550
+    },
+    {
+      "epoch": 4.834061389714019,
+      "grad_norm": 0.3179968595504761,
+      "learning_rate": 0.0005423111888111888,
+      "loss": 3.5709,
+      "step": 16600
+    },
+    {
+      "epoch": 4.848622517327742,
+      "grad_norm": 0.33231818675994873,
+      "learning_rate": 0.0005421363636363636,
+      "loss": 3.5737,
+      "step": 16650
+    },
+    {
+      "epoch": 4.863183644941464,
+      "grad_norm": 0.30390241742134094,
+      "learning_rate": 0.0005419615384615385,
+      "loss": 3.572,
+      "step": 16700
+    },
+    {
+      "epoch": 4.877744772555187,
+      "grad_norm": 0.3263714909553528,
+      "learning_rate": 0.0005417867132867133,
+      "loss": 3.5714,
+      "step": 16750
+    },
+    {
+      "epoch": 4.892305900168909,
+      "grad_norm": 0.31608420610427856,
+      "learning_rate": 0.0005416118881118881,
+      "loss": 3.573,
+      "step": 16800
+    },
+    {
+      "epoch": 4.906867027782631,
+      "grad_norm": 0.3054676353931427,
+      "learning_rate": 0.0005414370629370629,
+      "loss": 3.5793,
+      "step": 16850
+    },
+    {
+      "epoch": 4.921428155396354,
+      "grad_norm": 0.3099980354309082,
+      "learning_rate": 0.0005412622377622378,
+      "loss": 3.5697,
+      "step": 16900
+    },
+    {
+      "epoch": 4.935989283010076,
+      "grad_norm": 0.29981857538223267,
+      "learning_rate": 0.0005410874125874126,
+      "loss": 3.5735,
+      "step": 16950
+    },
+    {
+      "epoch": 4.950550410623799,
+      "grad_norm": 0.3208276033401489,
+      "learning_rate": 0.0005409125874125874,
+      "loss": 3.5819,
+      "step": 17000
+    },
+    {
+      "epoch": 4.950550410623799,
+      "eval_accuracy": 0.3599148658622406,
+      "eval_loss": 3.634756326675415,
+      "eval_runtime": 179.7751,
+      "eval_samples_per_second": 92.593,
+      "eval_steps_per_second": 5.791,
+      "step": 17000
+    },
+    {
+      "epoch": 4.9651115382375215,
+      "grad_norm": 0.310529500246048,
+      "learning_rate": 0.0005407377622377622,
+      "loss": 3.5832,
+      "step": 17050
+    },
+    {
+      "epoch": 4.979672665851243,
+      "grad_norm": 0.32999780774116516,
+      "learning_rate": 0.000540562937062937,
+      "loss": 3.5711,
+      "step": 17100
+    },
+    {
+      "epoch": 4.994233793464966,
+      "grad_norm": 0.3354627192020416,
+      "learning_rate": 0.0005403881118881118,
+      "loss": 3.5734,
+      "step": 17150
+    },
+    {
+      "epoch": 5.008736676568233,
+      "grad_norm": 0.35508137941360474,
+      "learning_rate": 0.0005402132867132867,
+      "loss": 3.5155,
+      "step": 17200
+    },
+    {
+      "epoch": 5.023297804181956,
+      "grad_norm": 0.31227484345436096,
+      "learning_rate": 0.0005400384615384615,
+      "loss": 3.4713,
+      "step": 17250
+    },
+    {
+      "epoch": 5.037858931795678,
+      "grad_norm": 0.31459367275238037,
+      "learning_rate": 0.0005398636363636363,
+      "loss": 3.48,
+      "step": 17300
+    },
+    {
+      "epoch": 5.052420059409401,
+      "grad_norm": 0.31045621633529663,
+      "learning_rate": 0.0005396888111888111,
+      "loss": 3.4772,
+      "step": 17350
+    },
+    {
+      "epoch": 5.066981187023123,
+      "grad_norm": 0.3227365016937256,
+      "learning_rate": 0.000539513986013986,
+      "loss": 3.4702,
+      "step": 17400
+    },
+    {
+      "epoch": 5.081542314636845,
+      "grad_norm": 0.30600887537002563,
+      "learning_rate": 0.0005393391608391608,
+      "loss": 3.4776,
+      "step": 17450
+    },
+    {
+      "epoch": 5.096103442250568,
+      "grad_norm": 0.3312874138355255,
+      "learning_rate": 0.0005391643356643356,
+      "loss": 3.4876,
+      "step": 17500
+    },
+    {
+      "epoch": 5.110664569864291,
+      "grad_norm": 0.3330562114715576,
+      "learning_rate": 0.0005389895104895105,
+      "loss": 3.4802,
+      "step": 17550
+    },
+    {
+      "epoch": 5.125225697478013,
+      "grad_norm": 0.32655513286590576,
+      "learning_rate": 0.0005388146853146853,
+      "loss": 3.4899,
+      "step": 17600
+    },
+    {
+      "epoch": 5.139786825091735,
+      "grad_norm": 0.34551799297332764,
+      "learning_rate": 0.0005386398601398601,
+      "loss": 3.493,
+      "step": 17650
+    },
+    {
+      "epoch": 5.154347952705457,
+      "grad_norm": 0.3142414093017578,
+      "learning_rate": 0.0005384650349650349,
+      "loss": 3.5019,
+      "step": 17700
+    },
+    {
+      "epoch": 5.16890908031918,
+      "grad_norm": 0.3235276937484741,
+      "learning_rate": 0.0005382902097902098,
+      "loss": 3.4889,
+      "step": 17750
+    },
+    {
+      "epoch": 5.183470207932903,
+      "grad_norm": 0.3249594569206238,
+      "learning_rate": 0.0005381153846153845,
+      "loss": 3.4947,
+      "step": 17800
+    },
+    {
+      "epoch": 5.1980313355466246,
+      "grad_norm": 0.32166171073913574,
+      "learning_rate": 0.0005379405594405594,
+      "loss": 3.5064,
+      "step": 17850
+    },
+    {
+      "epoch": 5.212592463160347,
+      "grad_norm": 0.3284703195095062,
+      "learning_rate": 0.0005377657342657342,
+      "loss": 3.5105,
+      "step": 17900
+    },
+    {
+      "epoch": 5.227153590774069,
+      "grad_norm": 0.32744383811950684,
+      "learning_rate": 0.000537590909090909,
+      "loss": 3.5143,
+      "step": 17950
+    },
+    {
+      "epoch": 5.241714718387792,
+      "grad_norm": 0.312739759683609,
+      "learning_rate": 0.0005374160839160838,
+      "loss": 3.5007,
+      "step": 18000
+    },
+    {
+      "epoch": 5.241714718387792,
+      "eval_accuracy": 0.36018232079402723,
+      "eval_loss": 3.6365652084350586,
+      "eval_runtime": 179.8304,
+      "eval_samples_per_second": 92.565,
+      "eval_steps_per_second": 5.789,
+      "step": 18000
+    },
+    {
+      "epoch": 5.256275846001515,
+      "grad_norm": 0.31837671995162964,
+      "learning_rate": 0.0005372412587412587,
+      "loss": 3.5128,
+      "step": 18050
+    },
+    {
+      "epoch": 5.2708369736152365,
+      "grad_norm": 0.33519458770751953,
+      "learning_rate": 0.0005370664335664335,
+      "loss": 3.5119,
+      "step": 18100
+    },
+    {
+      "epoch": 5.285398101228959,
+      "grad_norm": 0.34740373492240906,
+      "learning_rate": 0.0005368916083916083,
+      "loss": 3.5228,
+      "step": 18150
+    },
+    {
+      "epoch": 5.299959228842681,
+      "grad_norm": 0.34328994154930115,
+      "learning_rate": 0.0005367167832167832,
+      "loss": 3.5142,
+      "step": 18200
+    },
+    {
+      "epoch": 5.314520356456404,
+      "grad_norm": 0.3207642436027527,
+      "learning_rate": 0.000536541958041958,
+      "loss": 3.5114,
+      "step": 18250
+    },
+    {
+      "epoch": 5.329081484070127,
+      "grad_norm": 0.335101455450058,
+      "learning_rate": 0.0005363671328671328,
+      "loss": 3.5175,
+      "step": 18300
+    },
+    {
+      "epoch": 5.3436426116838485,
+      "grad_norm": 0.34362977743148804,
+      "learning_rate": 0.0005361923076923076,
+      "loss": 3.519,
+      "step": 18350
+    },
+    {
+      "epoch": 5.358203739297571,
+      "grad_norm": 0.3147866725921631,
+      "learning_rate": 0.0005360174825174825,
+      "loss": 3.5155,
+      "step": 18400
+    },
+    {
+      "epoch": 5.372764866911294,
+      "grad_norm": 0.33346375823020935,
+      "learning_rate": 0.0005358426573426573,
+      "loss": 3.5163,
+      "step": 18450
+    },
+    {
+      "epoch": 5.387325994525016,
+      "grad_norm": 0.3331373631954193,
+      "learning_rate": 0.0005356678321678321,
+      "loss": 3.5133,
+      "step": 18500
+    },
+    {
+      "epoch": 5.401887122138739,
+      "grad_norm": 0.3066289722919464,
+      "learning_rate": 0.0005354930069930069,
+      "loss": 3.5202,
+      "step": 18550
+    },
+    {
+      "epoch": 5.41644824975246,
+      "grad_norm": 0.32293954491615295,
+      "learning_rate": 0.0005353181818181817,
+      "loss": 3.5251,
+      "step": 18600
+    },
+    {
+      "epoch": 5.431009377366183,
+      "grad_norm": 0.33153200149536133,
+      "learning_rate": 0.0005351433566433565,
+      "loss": 3.5089,
+      "step": 18650
+    },
+    {
+      "epoch": 5.445570504979906,
+      "grad_norm": 0.32844340801239014,
+      "learning_rate": 0.0005349685314685314,
+      "loss": 3.5175,
+      "step": 18700
+    },
+    {
+      "epoch": 5.460131632593628,
+      "grad_norm": 0.33013710379600525,
+      "learning_rate": 0.0005347937062937062,
+      "loss": 3.5155,
+      "step": 18750
+    },
+    {
+      "epoch": 5.4746927602073505,
+      "grad_norm": 0.318752259016037,
+      "learning_rate": 0.000534618881118881,
+      "loss": 3.5226,
+      "step": 18800
+    },
+    {
+      "epoch": 5.489253887821073,
+      "grad_norm": 0.3632429242134094,
+      "learning_rate": 0.0005344440559440559,
+      "loss": 3.5256,
+      "step": 18850
+    },
+    {
+      "epoch": 5.503815015434795,
+      "grad_norm": 0.31200987100601196,
+      "learning_rate": 0.0005342692307692307,
+      "loss": 3.5277,
+      "step": 18900
+    },
+    {
+      "epoch": 5.518376143048518,
+      "grad_norm": 0.35066500306129456,
+      "learning_rate": 0.0005340944055944055,
+      "loss": 3.5224,
+      "step": 18950
+    },
+    {
+      "epoch": 5.53293727066224,
+      "grad_norm": 0.3067936301231384,
+      "learning_rate": 0.0005339195804195803,
+      "loss": 3.5156,
+      "step": 19000
+    },
+    {
+      "epoch": 5.53293727066224,
+      "eval_accuracy": 0.3610321808827682,
+      "eval_loss": 3.6285228729248047,
+      "eval_runtime": 180.0932,
+      "eval_samples_per_second": 92.43,
+      "eval_steps_per_second": 5.78,
+      "step": 19000
+    },
+    {
+      "epoch": 5.5474983982759625,
+      "grad_norm": 0.3141394853591919,
+      "learning_rate": 0.0005337447552447552,
+      "loss": 3.5173,
+      "step": 19050
+    },
+    {
+      "epoch": 5.562059525889685,
+      "grad_norm": 0.334416925907135,
+      "learning_rate": 0.00053356993006993,
+      "loss": 3.5189,
+      "step": 19100
+    },
+    {
+      "epoch": 5.576620653503407,
+      "grad_norm": 0.3050374686717987,
+      "learning_rate": 0.0005333951048951048,
+      "loss": 3.5142,
+      "step": 19150
+    },
+    {
+      "epoch": 5.59118178111713,
+      "grad_norm": 0.33711856603622437,
+      "learning_rate": 0.0005332202797202796,
+      "loss": 3.5282,
+      "step": 19200
+    },
+    {
+      "epoch": 5.605742908730852,
+      "grad_norm": 0.34378382563591003,
+      "learning_rate": 0.0005330454545454546,
+      "loss": 3.5195,
+      "step": 19250
+    },
+    {
+      "epoch": 5.620304036344574,
+      "grad_norm": 0.3297707736492157,
+      "learning_rate": 0.0005328706293706292,
+      "loss": 3.532,
+      "step": 19300
+    },
+    {
+      "epoch": 5.634865163958297,
+      "grad_norm": 0.33016687631607056,
+      "learning_rate": 0.0005326958041958042,
+      "loss": 3.5425,
+      "step": 19350
+    },
+    {
+      "epoch": 5.649426291572019,
+      "grad_norm": 0.34170061349868774,
+      "learning_rate": 0.000532520979020979,
+      "loss": 3.5282,
+      "step": 19400
+    },
+    {
+      "epoch": 5.663987419185742,
+      "grad_norm": 0.3264179825782776,
+      "learning_rate": 0.0005323461538461538,
+      "loss": 3.5302,
+      "step": 19450
+    },
+    {
+      "epoch": 5.6785485467994645,
+      "grad_norm": 0.3002929091453552,
+      "learning_rate": 0.0005321713286713287,
+      "loss": 3.5267,
+      "step": 19500
+    },
+    {
+      "epoch": 5.693109674413186,
+      "grad_norm": 0.35670411586761475,
+      "learning_rate": 0.0005319965034965035,
+      "loss": 3.5173,
+      "step": 19550
+    },
+    {
+      "epoch": 5.707670802026909,
+      "grad_norm": 0.3164016902446747,
+      "learning_rate": 0.0005318216783216783,
+      "loss": 3.5437,
+      "step": 19600
+    },
+    {
+      "epoch": 5.722231929640631,
+      "grad_norm": 0.3452078700065613,
+      "learning_rate": 0.0005316468531468531,
+      "loss": 3.5239,
+      "step": 19650
+    },
+    {
+      "epoch": 5.736793057254354,
+      "grad_norm": 0.3179798424243927,
+      "learning_rate": 0.000531472027972028,
+      "loss": 3.5254,
+      "step": 19700
+    },
+    {
+      "epoch": 5.7513541848680765,
+      "grad_norm": 0.32574138045310974,
+      "learning_rate": 0.0005312972027972028,
+      "loss": 3.5335,
+      "step": 19750
+    },
+    {
+      "epoch": 5.765915312481798,
+      "grad_norm": 0.32392826676368713,
+      "learning_rate": 0.0005311223776223776,
+      "loss": 3.5268,
+      "step": 19800
+    },
+    {
+      "epoch": 5.780476440095521,
+      "grad_norm": 0.34594979882240295,
+      "learning_rate": 0.0005309475524475524,
+      "loss": 3.534,
+      "step": 19850
+    },
+    {
+      "epoch": 5.795037567709244,
+      "grad_norm": 0.31376367807388306,
+      "learning_rate": 0.0005307727272727273,
+      "loss": 3.5306,
+      "step": 19900
+    },
+    {
+      "epoch": 5.809598695322966,
+      "grad_norm": 0.32450011372566223,
+      "learning_rate": 0.0005305979020979021,
+      "loss": 3.5337,
+      "step": 19950
+    },
+    {
+      "epoch": 5.824159822936688,
+      "grad_norm": 0.30886128544807434,
+      "learning_rate": 0.0005304230769230769,
+      "loss": 3.5239,
+      "step": 20000
+    },
+    {
+      "epoch": 5.824159822936688,
+      "eval_accuracy": 0.3622778742705534,
+      "eval_loss": 3.6140716075897217,
+      "eval_runtime": 180.1478,
+      "eval_samples_per_second": 92.402,
+      "eval_steps_per_second": 5.779,
+      "step": 20000
+    },
+    {
+      "epoch": 5.83872095055041,
+      "grad_norm": 0.32404589653015137,
+      "learning_rate": 0.0005302482517482517,
+      "loss": 3.5409,
+      "step": 20050
+    },
+    {
+      "epoch": 5.853282078164133,
+      "grad_norm": 0.30877238512039185,
+      "learning_rate": 0.0005300734265734265,
+      "loss": 3.5373,
+      "step": 20100
+    },
+    {
+      "epoch": 5.867843205777856,
+      "grad_norm": 0.31356489658355713,
+      "learning_rate": 0.0005298986013986013,
+      "loss": 3.5219,
+      "step": 20150
+    },
+    {
+      "epoch": 5.882404333391578,
+      "grad_norm": 0.30876606702804565,
+      "learning_rate": 0.0005297237762237762,
+      "loss": 3.529,
+      "step": 20200
+    },
+    {
+      "epoch": 5.8969654610053,
+      "grad_norm": 0.3364260494709015,
+      "learning_rate": 0.000529548951048951,
+      "loss": 3.5252,
+      "step": 20250
+    },
+    {
+      "epoch": 5.911526588619022,
+      "grad_norm": 0.3011105954647064,
+      "learning_rate": 0.0005293741258741258,
+      "loss": 3.5244,
+      "step": 20300
+    },
+    {
+      "epoch": 5.926087716232745,
+      "grad_norm": 0.31753775477409363,
+      "learning_rate": 0.0005291993006993007,
+      "loss": 3.5309,
+      "step": 20350
+    },
+    {
+      "epoch": 5.940648843846468,
+      "grad_norm": 0.3421807289123535,
+      "learning_rate": 0.0005290244755244755,
+      "loss": 3.537,
+      "step": 20400
+    },
+    {
+      "epoch": 5.95520997146019,
+      "grad_norm": 0.3219417631626129,
+      "learning_rate": 0.0005288496503496503,
+      "loss": 3.5311,
+      "step": 20450
+    },
+    {
+      "epoch": 5.969771099073912,
+      "grad_norm": 0.3096925616264343,
+      "learning_rate": 0.0005286748251748251,
+      "loss": 3.5334,
+      "step": 20500
+    },
+    {
+      "epoch": 5.984332226687634,
+      "grad_norm": 0.3308550715446472,
+      "learning_rate": 0.0005285,
+      "loss": 3.5199,
+      "step": 20550
+    },
+    {
+      "epoch": 5.998893354301357,
+      "grad_norm": 0.31948336958885193,
+      "learning_rate": 0.0005283251748251748,
+      "loss": 3.5393,
+      "step": 20600
+    },
+    {
+      "epoch": 6.013396237404625,
+      "grad_norm": 0.31365492939949036,
+      "learning_rate": 0.0005281503496503496,
+      "loss": 3.432,
+      "step": 20650
+    },
+    {
+      "epoch": 6.027957365018347,
+      "grad_norm": 0.32687506079673767,
+      "learning_rate": 0.0005279755244755244,
+      "loss": 3.4276,
+      "step": 20700
+    },
+    {
+      "epoch": 6.04251849263207,
+      "grad_norm": 0.32380980253219604,
+      "learning_rate": 0.0005278006993006993,
+      "loss": 3.4312,
+      "step": 20750
+    },
+    {
+      "epoch": 6.0570796202457915,
+      "grad_norm": 0.3151368498802185,
+      "learning_rate": 0.000527625874125874,
+      "loss": 3.4158,
+      "step": 20800
+    },
+    {
+      "epoch": 6.071640747859514,
+      "grad_norm": 0.315514475107193,
+      "learning_rate": 0.0005274510489510489,
+      "loss": 3.4395,
+      "step": 20850
+    },
+    {
+      "epoch": 6.086201875473237,
+      "grad_norm": 0.32791003584861755,
+      "learning_rate": 0.0005272762237762238,
+      "loss": 3.4373,
+      "step": 20900
+    },
+    {
+      "epoch": 6.100763003086959,
+      "grad_norm": 0.3153580129146576,
+      "learning_rate": 0.0005271013986013985,
+      "loss": 3.4479,
+      "step": 20950
+    },
+    {
+      "epoch": 6.115324130700682,
+      "grad_norm": 0.34948551654815674,
+      "learning_rate": 0.0005269265734265734,
+      "loss": 3.4463,
+      "step": 21000
+    },
+    {
+      "epoch": 6.115324130700682,
+      "eval_accuracy": 0.3622545968742924,
+      "eval_loss": 3.6173741817474365,
+      "eval_runtime": 179.8785,
+      "eval_samples_per_second": 92.54,
+      "eval_steps_per_second": 5.787,
+      "step": 21000
+    },
+    {
+      "epoch": 6.1298852583144035,
+      "grad_norm": 0.3471393883228302,
+      "learning_rate": 0.0005267517482517482,
+      "loss": 3.4418,
+      "step": 21050
+    },
+    {
+      "epoch": 6.144446385928126,
+      "grad_norm": 0.32299190759658813,
+      "learning_rate": 0.000526576923076923,
+      "loss": 3.462,
+      "step": 21100
+    },
+    {
+      "epoch": 6.159007513541849,
+      "grad_norm": 0.3276447355747223,
+      "learning_rate": 0.0005264020979020978,
+      "loss": 3.4441,
+      "step": 21150
+    },
+    {
+      "epoch": 6.173568641155571,
+      "grad_norm": 0.3275761604309082,
+      "learning_rate": 0.0005262272727272727,
+      "loss": 3.4414,
+      "step": 21200
+    },
+    {
+      "epoch": 6.1881297687692935,
+      "grad_norm": 0.32831233739852905,
+      "learning_rate": 0.0005260524475524475,
+      "loss": 3.4573,
+      "step": 21250
+    },
+    {
+      "epoch": 6.202690896383016,
+      "grad_norm": 0.32581037282943726,
+      "learning_rate": 0.0005258776223776223,
+      "loss": 3.4413,
+      "step": 21300
+    },
+    {
+      "epoch": 6.217252023996738,
+      "grad_norm": 0.3218664228916168,
+      "learning_rate": 0.0005257027972027971,
+      "loss": 3.4494,
+      "step": 21350
+    },
+    {
+      "epoch": 6.231813151610461,
+      "grad_norm": 0.34039339423179626,
+      "learning_rate": 0.000525527972027972,
+      "loss": 3.458,
+      "step": 21400
+    },
+    {
+      "epoch": 6.246374279224183,
+      "grad_norm": 0.3327193260192871,
+      "learning_rate": 0.0005253531468531468,
+      "loss": 3.4557,
+      "step": 21450
+    },
+    {
+      "epoch": 6.2609354068379055,
+      "grad_norm": 0.3233095705509186,
+      "learning_rate": 0.0005251783216783216,
+      "loss": 3.4511,
+      "step": 21500
+    },
+    {
+      "epoch": 6.275496534451628,
+      "grad_norm": 0.3496866822242737,
+      "learning_rate": 0.0005250034965034965,
+      "loss": 3.4622,
+      "step": 21550
+    },
+    {
+      "epoch": 6.29005766206535,
+      "grad_norm": 0.3645714521408081,
+      "learning_rate": 0.0005248286713286712,
+      "loss": 3.4517,
+      "step": 21600
+    },
+    {
+      "epoch": 6.304618789679073,
+      "grad_norm": 0.3256557881832123,
+      "learning_rate": 0.0005246538461538461,
+      "loss": 3.4739,
+      "step": 21650
+    },
+    {
+      "epoch": 6.319179917292795,
+      "grad_norm": 0.3235686719417572,
+      "learning_rate": 0.0005244790209790209,
+      "loss": 3.4725,
+      "step": 21700
+    },
+    {
+      "epoch": 6.3337410449065175,
+      "grad_norm": 0.3351970911026001,
+      "learning_rate": 0.0005243041958041957,
+      "loss": 3.4857,
+      "step": 21750
+    },
+    {
+      "epoch": 6.34830217252024,
+      "grad_norm": 0.3423496186733246,
+      "learning_rate": 0.0005241293706293705,
+      "loss": 3.4746,
+      "step": 21800
+    },
+    {
+      "epoch": 6.362863300133962,
+      "grad_norm": 0.3310966491699219,
+      "learning_rate": 0.0005239545454545454,
+      "loss": 3.4763,
+      "step": 21850
+    },
+    {
+      "epoch": 6.377424427747685,
+      "grad_norm": 0.31002819538116455,
+      "learning_rate": 0.0005237797202797202,
+      "loss": 3.4714,
+      "step": 21900
+    },
+    {
+      "epoch": 6.391985555361408,
+      "grad_norm": 0.3289186358451843,
+      "learning_rate": 0.000523604895104895,
+      "loss": 3.4637,
+      "step": 21950
+    },
+    {
+      "epoch": 6.406546682975129,
+      "grad_norm": 0.3141127824783325,
+      "learning_rate": 0.0005234300699300698,
+      "loss": 3.4779,
+      "step": 22000
+    },
+    {
+      "epoch": 6.406546682975129,
+      "eval_accuracy": 0.36285122710673956,
+      "eval_loss": 3.610785722732544,
+      "eval_runtime": 179.7092,
+      "eval_samples_per_second": 92.627,
+      "eval_steps_per_second": 5.793,
+      "step": 22000
+    },
+    {
+      "epoch": 6.421107810588852,
+      "grad_norm": 0.3150128722190857,
+      "learning_rate": 0.0005232552447552447,
+      "loss": 3.4842,
+      "step": 22050
+    },
+    {
+      "epoch": 6.435668938202574,
+      "grad_norm": 0.3259349465370178,
+      "learning_rate": 0.0005230804195804195,
+      "loss": 3.4848,
+      "step": 22100
+    },
+    {
+      "epoch": 6.450230065816297,
+      "grad_norm": 0.32301968336105347,
+      "learning_rate": 0.0005229055944055943,
+      "loss": 3.4818,
+      "step": 22150
+    },
+    {
+      "epoch": 6.4647911934300195,
+      "grad_norm": 0.3123028874397278,
+      "learning_rate": 0.0005227307692307691,
+      "loss": 3.4914,
+      "step": 22200
+    },
+    {
+      "epoch": 6.479352321043741,
+      "grad_norm": 0.3286699652671814,
+      "learning_rate": 0.0005225559440559441,
+      "loss": 3.4875,
+      "step": 22250
+    },
+    {
+      "epoch": 6.493913448657464,
+      "grad_norm": 0.3313329517841339,
+      "learning_rate": 0.0005223811188811189,
+      "loss": 3.4791,
+      "step": 22300
+    },
+    {
+      "epoch": 6.508474576271187,
+      "grad_norm": 0.31018057465553284,
+      "learning_rate": 0.0005222062937062937,
+      "loss": 3.4807,
+      "step": 22350
+    },
+    {
+      "epoch": 6.523035703884909,
+      "grad_norm": 0.32716143131256104,
+      "learning_rate": 0.0005220314685314686,
+      "loss": 3.4846,
+      "step": 22400
+    },
+    {
+      "epoch": 6.5375968314986315,
+      "grad_norm": 0.3213047981262207,
+      "learning_rate": 0.0005218566433566433,
+      "loss": 3.4959,
+      "step": 22450
+    },
+    {
+      "epoch": 6.552157959112353,
+      "grad_norm": 0.3478303849697113,
+      "learning_rate": 0.0005216818181818182,
+      "loss": 3.4794,
+      "step": 22500
+    },
+    {
+      "epoch": 6.566719086726076,
+      "grad_norm": 0.334625780582428,
+      "learning_rate": 0.000521506993006993,
+      "loss": 3.4993,
+      "step": 22550
+    },
+    {
+      "epoch": 6.581280214339799,
+      "grad_norm": 0.3324287235736847,
+      "learning_rate": 0.0005213321678321678,
+      "loss": 3.4947,
+      "step": 22600
+    },
+    {
+      "epoch": 6.595841341953521,
+      "grad_norm": 0.3208302855491638,
+      "learning_rate": 0.0005211573426573426,
+      "loss": 3.4816,
+      "step": 22650
+    },
+    {
+      "epoch": 6.610402469567243,
+      "grad_norm": 0.3206283152103424,
+      "learning_rate": 0.0005209825174825175,
+      "loss": 3.4811,
+      "step": 22700
+    },
+    {
+      "epoch": 6.624963597180965,
+      "grad_norm": 0.3405255377292633,
+      "learning_rate": 0.0005208076923076923,
+      "loss": 3.4839,
+      "step": 22750
+    },
+    {
+      "epoch": 6.639524724794688,
+      "grad_norm": 0.33559542894363403,
+      "learning_rate": 0.0005206328671328671,
+      "loss": 3.4962,
+      "step": 22800
+    },
+    {
+      "epoch": 6.654085852408411,
+      "grad_norm": 0.3277864456176758,
+      "learning_rate": 0.0005204580419580419,
+      "loss": 3.4831,
+      "step": 22850
+    },
+    {
+      "epoch": 6.668646980022133,
+      "grad_norm": 0.3352718949317932,
+      "learning_rate": 0.0005202832167832168,
+      "loss": 3.4782,
+      "step": 22900
+    },
+    {
+      "epoch": 6.683208107635855,
+      "grad_norm": 0.31568098068237305,
+      "learning_rate": 0.0005201083916083916,
+      "loss": 3.4802,
+      "step": 22950
+    },
+    {
+      "epoch": 6.697769235249577,
+      "grad_norm": 0.3398934602737427,
+      "learning_rate": 0.0005199335664335664,
+      "loss": 3.4888,
+      "step": 23000
+    },
+    {
+      "epoch": 6.697769235249577,
+      "eval_accuracy": 0.3639860589557666,
+      "eval_loss": 3.598484992980957,
+      "eval_runtime": 179.7416,
+      "eval_samples_per_second": 92.611,
+      "eval_steps_per_second": 5.792,
+      "step": 23000
+    },
+    {
+      "epoch": 6.7123303628633,
+      "grad_norm": 0.30721819400787354,
+      "learning_rate": 0.0005197587412587413,
+      "loss": 3.4866,
+      "step": 23050
+    },
+    {
+      "epoch": 6.726891490477023,
+      "grad_norm": 0.3224666714668274,
+      "learning_rate": 0.0005195839160839161,
+      "loss": 3.4968,
+      "step": 23100
+    },
+    {
+      "epoch": 6.741452618090745,
+      "grad_norm": 0.32522931694984436,
+      "learning_rate": 0.0005194090909090909,
+      "loss": 3.4878,
+      "step": 23150
+    },
+    {
+      "epoch": 6.756013745704467,
+      "grad_norm": 0.31341007351875305,
+      "learning_rate": 0.0005192342657342657,
+      "loss": 3.4833,
+      "step": 23200
+    },
+    {
+      "epoch": 6.77057487331819,
+      "grad_norm": 0.3186572790145874,
+      "learning_rate": 0.0005190594405594405,
+      "loss": 3.4975,
+      "step": 23250
+    },
+    {
+      "epoch": 6.785136000931912,
+      "grad_norm": 0.33995872735977173,
+      "learning_rate": 0.0005188846153846153,
+      "loss": 3.4878,
+      "step": 23300
+    },
+    {
+      "epoch": 6.799697128545635,
+      "grad_norm": 0.3231462836265564,
+      "learning_rate": 0.0005187097902097902,
+      "loss": 3.4897,
+      "step": 23350
+    },
+    {
+      "epoch": 6.814258256159357,
+      "grad_norm": 0.31064069271087646,
+      "learning_rate": 0.000518534965034965,
+      "loss": 3.4984,
+      "step": 23400
+    },
+    {
+      "epoch": 6.828819383773079,
+      "grad_norm": 0.31749048829078674,
+      "learning_rate": 0.0005183601398601398,
+      "loss": 3.4867,
+      "step": 23450
+    },
+    {
+      "epoch": 6.843380511386802,
+      "grad_norm": 0.31053680181503296,
+      "learning_rate": 0.0005181853146853146,
+      "loss": 3.4937,
+      "step": 23500
+    },
+    {
+      "epoch": 6.857941639000524,
+      "grad_norm": 0.3226015269756317,
+      "learning_rate": 0.0005180104895104895,
+      "loss": 3.4918,
+      "step": 23550
+    },
+    {
+      "epoch": 6.872502766614247,
+      "grad_norm": 0.3255876302719116,
+      "learning_rate": 0.0005178356643356643,
+      "loss": 3.4998,
+      "step": 23600
+    },
+    {
+      "epoch": 6.887063894227969,
+      "grad_norm": 0.32611915469169617,
+      "learning_rate": 0.0005176608391608391,
+      "loss": 3.4874,
+      "step": 23650
+    },
+    {
+      "epoch": 6.901625021841691,
+      "grad_norm": 0.3349880874156952,
+      "learning_rate": 0.000517486013986014,
+      "loss": 3.4898,
+      "step": 23700
+    },
+    {
+      "epoch": 6.916186149455414,
+      "grad_norm": 0.32357269525527954,
+      "learning_rate": 0.0005173111888111888,
+      "loss": 3.4919,
+      "step": 23750
+    },
+    {
+      "epoch": 6.930747277069136,
+      "grad_norm": 0.30893370509147644,
+      "learning_rate": 0.0005171363636363636,
+      "loss": 3.488,
+      "step": 23800
+    },
+    {
+      "epoch": 6.945308404682859,
+      "grad_norm": 0.34728315472602844,
+      "learning_rate": 0.0005169615384615384,
+      "loss": 3.4851,
+      "step": 23850
+    },
+    {
+      "epoch": 6.959869532296581,
+      "grad_norm": 0.34141796827316284,
+      "learning_rate": 0.0005167867132867133,
+      "loss": 3.4924,
+      "step": 23900
+    },
+    {
+      "epoch": 6.974430659910303,
+      "grad_norm": 0.33731377124786377,
+      "learning_rate": 0.000516611888111888,
+      "loss": 3.4936,
+      "step": 23950
+    },
+    {
+      "epoch": 6.988991787524026,
+      "grad_norm": 0.330599308013916,
+      "learning_rate": 0.0005164370629370629,
+      "loss": 3.4999,
+      "step": 24000
+    },
+    {
+      "epoch": 6.988991787524026,
+      "eval_accuracy": 0.36480441226573007,
+      "eval_loss": 3.590554714202881,
+      "eval_runtime": 179.6543,
+      "eval_samples_per_second": 92.656,
+      "eval_steps_per_second": 5.794,
+      "step": 24000
+    },
+    {
+      "epoch": 7.003494670627293,
+      "grad_norm": 0.3449358642101288,
+      "learning_rate": 0.0005162622377622377,
+      "loss": 3.471,
+      "step": 24050
+    },
+    {
+      "epoch": 7.018055798241016,
+      "grad_norm": 0.35293149948120117,
+      "learning_rate": 0.0005160874125874125,
+      "loss": 3.3888,
+      "step": 24100
+    },
+    {
+      "epoch": 7.032616925854738,
+      "grad_norm": 0.3265637755393982,
+      "learning_rate": 0.0005159125874125873,
+      "loss": 3.3925,
+      "step": 24150
+    },
+    {
+      "epoch": 7.0471780534684605,
+      "grad_norm": 0.32121822237968445,
+      "learning_rate": 0.0005157377622377622,
+      "loss": 3.3897,
+      "step": 24200
+    },
+    {
+      "epoch": 7.061739181082183,
+      "grad_norm": 0.3485367000102997,
+      "learning_rate": 0.000515562937062937,
+      "loss": 3.3965,
+      "step": 24250
+    },
+    {
+      "epoch": 7.076300308695905,
+      "grad_norm": 0.32369834184646606,
+      "learning_rate": 0.0005153881118881118,
+      "loss": 3.4083,
+      "step": 24300
+    },
+    {
+      "epoch": 7.090861436309628,
+      "grad_norm": 0.3367840349674225,
+      "learning_rate": 0.0005152132867132867,
+      "loss": 3.4072,
+      "step": 24350
+    },
+    {
+      "epoch": 7.105422563923351,
+      "grad_norm": 0.3350302278995514,
+      "learning_rate": 0.0005150384615384615,
+      "loss": 3.403,
+      "step": 24400
+    },
+    {
+      "epoch": 7.1199836915370724,
+      "grad_norm": 0.3556578755378723,
+      "learning_rate": 0.0005148636363636363,
+      "loss": 3.3993,
+      "step": 24450
+    },
+    {
+      "epoch": 7.134544819150795,
+      "grad_norm": 0.33493995666503906,
+      "learning_rate": 0.0005146888111888111,
+      "loss": 3.4033,
+      "step": 24500
+    },
+    {
+      "epoch": 7.149105946764517,
+      "grad_norm": 0.3266991674900055,
+      "learning_rate": 0.000514513986013986,
+      "loss": 3.4133,
+      "step": 24550
+    },
+    {
+      "epoch": 7.16366707437824,
+      "grad_norm": 0.33190712332725525,
+      "learning_rate": 0.0005143391608391608,
+      "loss": 3.4191,
+      "step": 24600
+    },
+    {
+      "epoch": 7.1782282019919625,
+      "grad_norm": 0.33754125237464905,
+      "learning_rate": 0.0005141643356643356,
+      "loss": 3.4116,
+      "step": 24650
+    },
+    {
+      "epoch": 7.192789329605684,
+      "grad_norm": 0.3015083074569702,
+      "learning_rate": 0.0005139895104895104,
+      "loss": 3.4225,
+      "step": 24700
+    },
+    {
+      "epoch": 7.207350457219407,
+      "grad_norm": 0.3270661532878876,
+      "learning_rate": 0.0005138146853146852,
+      "loss": 3.4205,
+      "step": 24750
+    },
+    {
+      "epoch": 7.22191158483313,
+      "grad_norm": 0.3491705656051636,
+      "learning_rate": 0.00051363986013986,
+      "loss": 3.4145,
+      "step": 24800
+    },
+    {
+      "epoch": 7.236472712446852,
+      "grad_norm": 0.3363984525203705,
+      "learning_rate": 0.0005134650349650349,
+      "loss": 3.4213,
+      "step": 24850
+    },
+    {
+      "epoch": 7.2510338400605745,
+      "grad_norm": 0.33105769753456116,
+      "learning_rate": 0.0005132902097902097,
+      "loss": 3.4229,
+      "step": 24900
+    },
+    {
+      "epoch": 7.265594967674296,
+      "grad_norm": 0.3505908250808716,
+      "learning_rate": 0.0005131153846153845,
+      "loss": 3.4278,
+      "step": 24950
+    },
+    {
+      "epoch": 7.280156095288019,
+      "grad_norm": 0.3380582332611084,
+      "learning_rate": 0.0005129405594405594,
+      "loss": 3.428,
+      "step": 25000
+    },
+    {
+      "epoch": 7.280156095288019,
+      "eval_accuracy": 0.36480793914395143,
+      "eval_loss": 3.5974769592285156,
+      "eval_runtime": 179.8567,
+      "eval_samples_per_second": 92.551,
+      "eval_steps_per_second": 5.788,
+      "step": 25000
+    },
+    {
+      "epoch": 7.294717222901742,
+      "grad_norm": 0.31806254386901855,
+      "learning_rate": 0.0005127657342657342,
+      "loss": 3.4173,
+      "step": 25050
+    },
+    {
+      "epoch": 7.309278350515464,
+      "grad_norm": 0.3278155028820038,
+      "learning_rate": 0.000512590909090909,
+      "loss": 3.4293,
+      "step": 25100
+    },
+    {
+      "epoch": 7.3238394781291865,
+      "grad_norm": 0.31251752376556396,
+      "learning_rate": 0.0005124160839160838,
+      "loss": 3.4386,
+      "step": 25150
+    },
+    {
+      "epoch": 7.338400605742908,
+      "grad_norm": 0.3372874855995178,
+      "learning_rate": 0.0005122412587412588,
+      "loss": 3.4216,
+      "step": 25200
+    },
+    {
+      "epoch": 7.352961733356631,
+      "grad_norm": 0.32962003350257874,
+      "learning_rate": 0.0005120664335664336,
+      "loss": 3.4316,
+      "step": 25250
+    },
+    {
+      "epoch": 7.367522860970354,
+      "grad_norm": 0.3354533612728119,
+      "learning_rate": 0.0005118916083916084,
+      "loss": 3.4331,
+      "step": 25300
+    },
+    {
+      "epoch": 7.382083988584076,
+      "grad_norm": 0.32760855555534363,
+      "learning_rate": 0.0005117167832167832,
+      "loss": 3.4298,
+      "step": 25350
+    },
+    {
+      "epoch": 7.396645116197798,
+      "grad_norm": 0.323398232460022,
+      "learning_rate": 0.0005115419580419581,
+      "loss": 3.4329,
+      "step": 25400
+    },
+    {
+      "epoch": 7.411206243811521,
+      "grad_norm": 0.3129633665084839,
+      "learning_rate": 0.0005113671328671328,
+      "loss": 3.4451,
+      "step": 25450
+    },
+    {
+      "epoch": 7.425767371425243,
+      "grad_norm": 0.308672159910202,
+      "learning_rate": 0.0005111923076923077,
+      "loss": 3.4402,
+      "step": 25500
+    },
+    {
+      "epoch": 7.440328499038966,
+      "grad_norm": 0.3408229649066925,
+      "learning_rate": 0.0005110174825174825,
+      "loss": 3.4397,
+      "step": 25550
+    },
+    {
+      "epoch": 7.454889626652688,
+      "grad_norm": 0.320758581161499,
+      "learning_rate": 0.0005108426573426573,
+      "loss": 3.4447,
+      "step": 25600
+    },
+    {
+      "epoch": 7.46945075426641,
+      "grad_norm": 0.33821046352386475,
+      "learning_rate": 0.0005106678321678321,
+      "loss": 3.4399,
+      "step": 25650
+    },
+    {
+      "epoch": 7.484011881880133,
+      "grad_norm": 0.32798120379447937,
+      "learning_rate": 0.000510493006993007,
+      "loss": 3.4441,
+      "step": 25700
+    },
+    {
+      "epoch": 7.498573009493855,
+      "grad_norm": 0.36191534996032715,
+      "learning_rate": 0.0005103181818181818,
+      "loss": 3.4465,
+      "step": 25750
+    },
+    {
+      "epoch": 7.513134137107578,
+      "grad_norm": 0.333870530128479,
+      "learning_rate": 0.0005101433566433566,
+      "loss": 3.4483,
+      "step": 25800
+    },
+    {
+      "epoch": 7.5276952647213005,
+      "grad_norm": 0.3584294319152832,
+      "learning_rate": 0.0005099685314685315,
+      "loss": 3.4472,
+      "step": 25850
+    },
+    {
+      "epoch": 7.542256392335022,
+      "grad_norm": 0.3232259750366211,
+      "learning_rate": 0.0005097937062937063,
+      "loss": 3.4551,
+      "step": 25900
+    },
+    {
+      "epoch": 7.556817519948745,
+      "grad_norm": 0.34521010518074036,
+      "learning_rate": 0.0005096188811188811,
+      "loss": 3.4492,
+      "step": 25950
+    },
+    {
+      "epoch": 7.571378647562467,
+      "grad_norm": 0.3537822365760803,
+      "learning_rate": 0.0005094440559440559,
+      "loss": 3.4552,
+      "step": 26000
+    },
+    {
+      "epoch": 7.571378647562467,
+      "eval_accuracy": 0.36535131351525596,
+      "eval_loss": 3.590632200241089,
+      "eval_runtime": 179.7252,
+      "eval_samples_per_second": 92.619,
+      "eval_steps_per_second": 5.792,
+      "step": 26000
+    },
+    {
+      "epoch": 7.58593977517619,
+      "grad_norm": 0.3477293848991394,
+      "learning_rate": 0.0005092692307692308,
+      "loss": 3.4485,
+      "step": 26050
+    },
+    {
+      "epoch": 7.600500902789912,
+      "grad_norm": 0.3164335787296295,
+      "learning_rate": 0.0005090944055944056,
+      "loss": 3.4537,
+      "step": 26100
+    },
+    {
+      "epoch": 7.615062030403634,
+      "grad_norm": 0.31365999579429626,
+      "learning_rate": 0.0005089195804195804,
+      "loss": 3.4563,
+      "step": 26150
+    },
+    {
+      "epoch": 7.629623158017357,
+      "grad_norm": 0.33597031235694885,
+      "learning_rate": 0.0005087447552447552,
+      "loss": 3.4469,
+      "step": 26200
+    },
+    {
+      "epoch": 7.644184285631079,
+      "grad_norm": 0.33030572533607483,
+      "learning_rate": 0.00050856993006993,
+      "loss": 3.4471,
+      "step": 26250
+    },
+    {
+      "epoch": 7.658745413244802,
+      "grad_norm": 0.34268873929977417,
+      "learning_rate": 0.0005083951048951048,
+      "loss": 3.4566,
+      "step": 26300
+    },
+    {
+      "epoch": 7.673306540858524,
+      "grad_norm": 0.34644824266433716,
+      "learning_rate": 0.0005082202797202797,
+      "loss": 3.4572,
+      "step": 26350
+    },
+    {
+      "epoch": 7.687867668472246,
+      "grad_norm": 0.3286401331424713,
+      "learning_rate": 0.0005080454545454545,
+      "loss": 3.4614,
+      "step": 26400
+    },
+    {
+      "epoch": 7.702428796085969,
+      "grad_norm": 0.3406911790370941,
+      "learning_rate": 0.0005078706293706293,
+      "loss": 3.4613,
+      "step": 26450
+    },
+    {
+      "epoch": 7.716989923699691,
+      "grad_norm": 0.32939502596855164,
+      "learning_rate": 0.0005076958041958042,
+      "loss": 3.4677,
+      "step": 26500
+    },
+    {
+      "epoch": 7.731551051313414,
+      "grad_norm": 0.33044230937957764,
+      "learning_rate": 0.000507520979020979,
+      "loss": 3.4601,
+      "step": 26550
+    },
+    {
+      "epoch": 7.746112178927136,
+      "grad_norm": 0.315995454788208,
+      "learning_rate": 0.0005073461538461538,
+      "loss": 3.4459,
+      "step": 26600
+    },
+    {
+      "epoch": 7.760673306540858,
+      "grad_norm": 0.35745933651924133,
+      "learning_rate": 0.0005071713286713286,
+      "loss": 3.4574,
+      "step": 26650
+    },
+    {
+      "epoch": 7.775234434154581,
+      "grad_norm": 0.3426244258880615,
+      "learning_rate": 0.0005069965034965035,
+      "loss": 3.4537,
+      "step": 26700
+    },
+    {
+      "epoch": 7.789795561768304,
+      "grad_norm": 0.3141034245491028,
+      "learning_rate": 0.0005068216783216783,
+      "loss": 3.4541,
+      "step": 26750
+    },
+    {
+      "epoch": 7.8043566893820255,
+      "grad_norm": 0.34187954664230347,
+      "learning_rate": 0.0005066468531468531,
+      "loss": 3.4703,
+      "step": 26800
+    },
+    {
+      "epoch": 7.818917816995748,
+      "grad_norm": 0.32608917355537415,
+      "learning_rate": 0.0005064720279720279,
+      "loss": 3.4433,
+      "step": 26850
+    },
+    {
+      "epoch": 7.833478944609471,
+      "grad_norm": 0.30253276228904724,
+      "learning_rate": 0.0005062972027972028,
+      "loss": 3.4582,
+      "step": 26900
+    },
+    {
+      "epoch": 7.848040072223193,
+      "grad_norm": 0.3292168378829956,
+      "learning_rate": 0.0005061223776223775,
+      "loss": 3.4609,
+      "step": 26950
+    },
+    {
+      "epoch": 7.862601199836916,
+      "grad_norm": 0.3352425992488861,
+      "learning_rate": 0.0005059475524475524,
+      "loss": 3.4646,
+      "step": 27000
+    },
+    {
+      "epoch": 7.862601199836916,
+      "eval_accuracy": 0.3661527378097569,
+      "eval_loss": 3.5805842876434326,
+      "eval_runtime": 179.8829,
+      "eval_samples_per_second": 92.538,
+      "eval_steps_per_second": 5.787,
+      "step": 27000
+    },
+    {
+      "epoch": 7.8771623274506375,
+      "grad_norm": 0.33013811707496643,
+      "learning_rate": 0.0005057727272727272,
+      "loss": 3.4593,
+      "step": 27050
+    },
+    {
+      "epoch": 7.89172345506436,
+      "grad_norm": 0.3326588273048401,
+      "learning_rate": 0.000505597902097902,
+      "loss": 3.4569,
+      "step": 27100
+    },
+    {
+      "epoch": 7.906284582678083,
+      "grad_norm": 0.3302481472492218,
+      "learning_rate": 0.0005054230769230769,
+      "loss": 3.4571,
+      "step": 27150
+    },
+    {
+      "epoch": 7.920845710291805,
+      "grad_norm": 0.3329846262931824,
+      "learning_rate": 0.0005052482517482517,
+      "loss": 3.471,
+      "step": 27200
+    },
+    {
+      "epoch": 7.935406837905528,
+      "grad_norm": 0.3458568751811981,
+      "learning_rate": 0.0005050734265734265,
+      "loss": 3.4765,
+      "step": 27250
+    },
+    {
+      "epoch": 7.9499679655192494,
+      "grad_norm": 0.3226156532764435,
+      "learning_rate": 0.0005048986013986013,
+      "loss": 3.4703,
+      "step": 27300
+    },
+    {
+      "epoch": 7.964529093132972,
+      "grad_norm": 0.34230631589889526,
+      "learning_rate": 0.0005047237762237762,
+      "loss": 3.4688,
+      "step": 27350
+    },
+    {
+      "epoch": 7.979090220746695,
+      "grad_norm": 0.31827977299690247,
+      "learning_rate": 0.000504548951048951,
+      "loss": 3.4583,
+      "step": 27400
+    },
+    {
+      "epoch": 7.993651348360417,
+      "grad_norm": 0.32115548849105835,
+      "learning_rate": 0.0005043741258741258,
+      "loss": 3.4671,
+      "step": 27450
+    },
+    {
+      "epoch": 8.008154231463685,
+      "grad_norm": 0.3317052721977234,
+      "learning_rate": 0.0005041993006993006,
+      "loss": 3.4071,
+      "step": 27500
+    },
+    {
+      "epoch": 8.022715359077408,
+      "grad_norm": 0.3432307541370392,
+      "learning_rate": 0.0005040244755244755,
+      "loss": 3.3524,
+      "step": 27550
+    },
+    {
+      "epoch": 8.037276486691129,
+      "grad_norm": 0.3754727244377136,
+      "learning_rate": 0.0005038496503496503,
+      "loss": 3.3459,
+      "step": 27600
+    },
+    {
+      "epoch": 8.051837614304851,
+      "grad_norm": 0.35126733779907227,
+      "learning_rate": 0.0005036748251748251,
+      "loss": 3.3583,
+      "step": 27650
+    },
+    {
+      "epoch": 8.066398741918574,
+      "grad_norm": 0.3542656898498535,
+      "learning_rate": 0.0005034999999999999,
+      "loss": 3.3705,
+      "step": 27700
+    },
+    {
+      "epoch": 8.080959869532297,
+      "grad_norm": 0.34104204177856445,
+      "learning_rate": 0.0005033251748251747,
+      "loss": 3.3645,
+      "step": 27750
+    },
+    {
+      "epoch": 8.09552099714602,
+      "grad_norm": 0.34891462326049805,
+      "learning_rate": 0.0005031503496503496,
+      "loss": 3.3766,
+      "step": 27800
+    },
+    {
+      "epoch": 8.11008212475974,
+      "grad_norm": 0.3348483145236969,
+      "learning_rate": 0.0005029755244755244,
+      "loss": 3.3758,
+      "step": 27850
+    },
+    {
+      "epoch": 8.124643252373463,
+      "grad_norm": 0.35943523049354553,
+      "learning_rate": 0.0005028006993006992,
+      "loss": 3.3722,
+      "step": 27900
+    },
+    {
+      "epoch": 8.139204379987186,
+      "grad_norm": 0.360538125038147,
+      "learning_rate": 0.000502625874125874,
+      "loss": 3.3764,
+      "step": 27950
+    },
+    {
+      "epoch": 8.153765507600909,
+      "grad_norm": 0.34357813000679016,
+      "learning_rate": 0.000502451048951049,
+      "loss": 3.3921,
+      "step": 28000
+    },
+    {
+      "epoch": 8.153765507600909,
+      "eval_accuracy": 0.3661058303294128,
+      "eval_loss": 3.58715558052063,
+      "eval_runtime": 180.3137,
+      "eval_samples_per_second": 92.317,
+      "eval_steps_per_second": 5.773,
+      "step": 28000
+    },
+    {
+      "epoch": 8.168326635214632,
+      "grad_norm": 0.3614567220211029,
+      "learning_rate": 0.0005022762237762237,
+      "loss": 3.3796,
+      "step": 28050
+    },
+    {
+      "epoch": 8.182887762828354,
+      "grad_norm": 0.33387571573257446,
+      "learning_rate": 0.0005021013986013985,
+      "loss": 3.383,
+      "step": 28100
+    },
+    {
+      "epoch": 8.197448890442075,
+      "grad_norm": 0.3599357008934021,
+      "learning_rate": 0.0005019265734265733,
+      "loss": 3.3902,
+      "step": 28150
+    },
+    {
+      "epoch": 8.212010018055798,
+      "grad_norm": 0.3254016041755676,
+      "learning_rate": 0.0005017517482517483,
+      "loss": 3.3856,
+      "step": 28200
+    },
+    {
+      "epoch": 8.22657114566952,
+      "grad_norm": 0.3269076347351074,
+      "learning_rate": 0.0005015769230769231,
+      "loss": 3.3963,
+      "step": 28250
+    },
+    {
+      "epoch": 8.241132273283243,
+      "grad_norm": 0.3196601867675781,
+      "learning_rate": 0.0005014020979020979,
+      "loss": 3.3973,
+      "step": 28300
+    },
+    {
+      "epoch": 8.255693400896966,
+      "grad_norm": 0.3544836640357971,
+      "learning_rate": 0.0005012272727272727,
+      "loss": 3.3985,
+      "step": 28350
+    },
+    {
+      "epoch": 8.270254528510687,
+      "grad_norm": 0.33133646845817566,
+      "learning_rate": 0.0005010524475524476,
+      "loss": 3.3971,
+      "step": 28400
+    },
+    {
+      "epoch": 8.28481565612441,
+      "grad_norm": 0.365125834941864,
+      "learning_rate": 0.0005008776223776223,
+      "loss": 3.3979,
+      "step": 28450
+    },
+    {
+      "epoch": 8.299376783738133,
+      "grad_norm": 0.3482271730899811,
+      "learning_rate": 0.0005007027972027972,
+      "loss": 3.4087,
+      "step": 28500
+    },
+    {
+      "epoch": 8.313937911351855,
+      "grad_norm": 0.3457016050815582,
+      "learning_rate": 0.000500527972027972,
+      "loss": 3.4072,
+      "step": 28550
+    },
+    {
+      "epoch": 8.328499038965578,
+      "grad_norm": 0.3350307047367096,
+      "learning_rate": 0.0005003531468531468,
+      "loss": 3.3984,
+      "step": 28600
+    },
+    {
+      "epoch": 8.3430601665793,
+      "grad_norm": 0.33122938871383667,
+      "learning_rate": 0.0005001783216783217,
+      "loss": 3.4036,
+      "step": 28650
+    },
+    {
+      "epoch": 8.357621294193022,
+      "grad_norm": 0.3646140992641449,
+      "learning_rate": 0.0005000034965034965,
+      "loss": 3.4019,
+      "step": 28700
+    },
+    {
+      "epoch": 8.372182421806745,
+      "grad_norm": 0.339650422334671,
+      "learning_rate": 0.0004998286713286713,
+      "loss": 3.3992,
+      "step": 28750
+    },
+    {
+      "epoch": 8.386743549420467,
+      "grad_norm": 0.31742623448371887,
+      "learning_rate": 0.0004996538461538461,
+      "loss": 3.404,
+      "step": 28800
+    },
+    {
+      "epoch": 8.40130467703419,
+      "grad_norm": 0.3145395815372467,
+      "learning_rate": 0.000499479020979021,
+      "loss": 3.3995,
+      "step": 28850
+    },
+    {
+      "epoch": 8.415865804647911,
+      "grad_norm": 0.34881776571273804,
+      "learning_rate": 0.0004993041958041958,
+      "loss": 3.4046,
+      "step": 28900
+    },
+    {
+      "epoch": 8.430426932261634,
+      "grad_norm": 0.3403722941875458,
+      "learning_rate": 0.0004991293706293706,
+      "loss": 3.4167,
+      "step": 28950
+    },
+    {
+      "epoch": 8.444988059875357,
+      "grad_norm": 0.3250523507595062,
+      "learning_rate": 0.0004989545454545454,
+      "loss": 3.4086,
+      "step": 29000
+    },
+    {
+      "epoch": 8.444988059875357,
+      "eval_accuracy": 0.3662244510002579,
+      "eval_loss": 3.5857577323913574,
+      "eval_runtime": 179.7516,
+      "eval_samples_per_second": 92.606,
+      "eval_steps_per_second": 5.791,
+      "step": 29000
+    },
+    {
+      "epoch": 8.45954918748908,
+      "grad_norm": 0.3275490403175354,
+      "learning_rate": 0.0004987797202797203,
+      "loss": 3.4166,
+      "step": 29050
+    },
+    {
+      "epoch": 8.474110315102802,
+      "grad_norm": 0.31486833095550537,
+      "learning_rate": 0.0004986048951048951,
+      "loss": 3.4041,
+      "step": 29100
+    },
+    {
+      "epoch": 8.488671442716523,
+      "grad_norm": 0.3729318082332611,
+      "learning_rate": 0.0004984300699300699,
+      "loss": 3.4167,
+      "step": 29150
+    },
+    {
+      "epoch": 8.503232570330246,
+      "grad_norm": 0.3305770456790924,
+      "learning_rate": 0.0004982552447552448,
+      "loss": 3.4228,
+      "step": 29200
+    },
+    {
+      "epoch": 8.517793697943969,
+      "grad_norm": 0.3442740738391876,
+      "learning_rate": 0.0004980804195804195,
+      "loss": 3.406,
+      "step": 29250
+    },
+    {
+      "epoch": 8.532354825557691,
+      "grad_norm": 0.32196056842803955,
+      "learning_rate": 0.0004979055944055944,
+      "loss": 3.4296,
+      "step": 29300
+    },
+    {
+      "epoch": 8.546915953171414,
+      "grad_norm": 0.3387078642845154,
+      "learning_rate": 0.0004977307692307692,
+      "loss": 3.4227,
+      "step": 29350
+    },
+    {
+      "epoch": 8.561477080785137,
+      "grad_norm": 0.32302534580230713,
+      "learning_rate": 0.000497555944055944,
+      "loss": 3.414,
+      "step": 29400
+    },
+    {
+      "epoch": 8.576038208398858,
+      "grad_norm": 0.3491160571575165,
+      "learning_rate": 0.0004973811188811188,
+      "loss": 3.4214,
+      "step": 29450
+    },
+    {
+      "epoch": 8.59059933601258,
+      "grad_norm": 0.32889190316200256,
+      "learning_rate": 0.0004972062937062937,
+      "loss": 3.4281,
+      "step": 29500
+    },
+    {
+      "epoch": 8.605160463626303,
+      "grad_norm": 0.32402417063713074,
+      "learning_rate": 0.0004970314685314685,
+      "loss": 3.4171,
+      "step": 29550
+    },
+    {
+      "epoch": 8.619721591240026,
+      "grad_norm": 0.3430418074131012,
+      "learning_rate": 0.0004968566433566433,
+      "loss": 3.4293,
+      "step": 29600
+    },
+    {
+      "epoch": 8.634282718853749,
+      "grad_norm": 0.34214910864830017,
+      "learning_rate": 0.0004966818181818181,
+      "loss": 3.4223,
+      "step": 29650
+    },
+    {
+      "epoch": 8.64884384646747,
+      "grad_norm": 0.3425740897655487,
+      "learning_rate": 0.000496506993006993,
+      "loss": 3.4194,
+      "step": 29700
+    },
+    {
+      "epoch": 8.663404974081192,
+      "grad_norm": 0.34497156739234924,
+      "learning_rate": 0.0004963321678321678,
+      "loss": 3.4311,
+      "step": 29750
+    },
+    {
+      "epoch": 8.677966101694915,
+      "grad_norm": 0.35663503408432007,
+      "learning_rate": 0.0004961573426573426,
+      "loss": 3.431,
+      "step": 29800
+    },
+    {
+      "epoch": 8.692527229308638,
+      "grad_norm": 0.34114986658096313,
+      "learning_rate": 0.0004959825174825175,
+      "loss": 3.4287,
+      "step": 29850
+    },
+    {
+      "epoch": 8.70708835692236,
+      "grad_norm": 0.3398053050041199,
+      "learning_rate": 0.0004958076923076923,
+      "loss": 3.4288,
+      "step": 29900
+    },
+    {
+      "epoch": 8.721649484536082,
+      "grad_norm": 0.34339818358421326,
+      "learning_rate": 0.0004956328671328671,
+      "loss": 3.4414,
+      "step": 29950
+    },
+    {
+      "epoch": 8.736210612149804,
+      "grad_norm": 0.3127419352531433,
+      "learning_rate": 0.0004954580419580419,
+      "loss": 3.4342,
+      "step": 30000
+    },
+    {
+      "epoch": 8.736210612149804,
+      "eval_accuracy": 0.36726194101037535,
+      "eval_loss": 3.5715725421905518,
+      "eval_runtime": 179.7611,
+      "eval_samples_per_second": 92.601,
+      "eval_steps_per_second": 5.791,
+      "step": 30000
+    },
+    {
+      "epoch": 8.750771739763527,
+      "grad_norm": 0.3249102532863617,
+      "learning_rate": 0.0004952832167832167,
+      "loss": 3.428,
+      "step": 30050
+    },
+    {
+      "epoch": 8.76533286737725,
+      "grad_norm": 0.34389105439186096,
+      "learning_rate": 0.0004951083916083915,
+      "loss": 3.4286,
+      "step": 30100
+    },
+    {
+      "epoch": 8.779893994990973,
+      "grad_norm": 0.34607070684432983,
+      "learning_rate": 0.0004949335664335664,
+      "loss": 3.4325,
+      "step": 30150
+    },
+    {
+      "epoch": 8.794455122604695,
+      "grad_norm": 0.33967599272727966,
+      "learning_rate": 0.0004947587412587412,
+      "loss": 3.4188,
+      "step": 30200
+    },
+    {
+      "epoch": 8.809016250218416,
+      "grad_norm": 0.34365391731262207,
+      "learning_rate": 0.000494583916083916,
+      "loss": 3.4258,
+      "step": 30250
+    },
+    {
+      "epoch": 8.82357737783214,
+      "grad_norm": 0.31158357858657837,
+      "learning_rate": 0.0004944090909090908,
+      "loss": 3.4302,
+      "step": 30300
+    },
+    {
+      "epoch": 8.838138505445862,
+      "grad_norm": 0.3425881564617157,
+      "learning_rate": 0.0004942342657342657,
+      "loss": 3.4471,
+      "step": 30350
+    },
+    {
+      "epoch": 8.852699633059585,
+      "grad_norm": 0.33694136142730713,
+      "learning_rate": 0.0004940594405594405,
+      "loss": 3.4363,
+      "step": 30400
+    },
+    {
+      "epoch": 8.867260760673307,
+      "grad_norm": 0.33916687965393066,
+      "learning_rate": 0.0004938846153846153,
+      "loss": 3.4398,
+      "step": 30450
+    },
+    {
+      "epoch": 8.881821888287028,
+      "grad_norm": 0.3424004912376404,
+      "learning_rate": 0.0004937097902097901,
+      "loss": 3.4373,
+      "step": 30500
+    },
+    {
+      "epoch": 8.896383015900751,
+      "grad_norm": 0.3579810857772827,
+      "learning_rate": 0.000493534965034965,
+      "loss": 3.4423,
+      "step": 30550
+    },
+    {
+      "epoch": 8.910944143514474,
+      "grad_norm": 0.35978007316589355,
+      "learning_rate": 0.0004933601398601398,
+      "loss": 3.4223,
+      "step": 30600
+    },
+    {
+      "epoch": 8.925505271128197,
+      "grad_norm": 0.34889093041419983,
+      "learning_rate": 0.0004931853146853146,
+      "loss": 3.4384,
+      "step": 30650
+    },
+    {
+      "epoch": 8.94006639874192,
+      "grad_norm": 0.3178730010986328,
+      "learning_rate": 0.0004930104895104895,
+      "loss": 3.4316,
+      "step": 30700
+    },
+    {
+      "epoch": 8.95462752635564,
+      "grad_norm": 0.3225439190864563,
+      "learning_rate": 0.0004928356643356642,
+      "loss": 3.4376,
+      "step": 30750
+    },
+    {
+      "epoch": 8.969188653969363,
+      "grad_norm": 0.32753077149391174,
+      "learning_rate": 0.0004926608391608391,
+      "loss": 3.4457,
+      "step": 30800
+    },
+    {
+      "epoch": 8.983749781583086,
+      "grad_norm": 0.3687169551849365,
+      "learning_rate": 0.0004924860139860139,
+      "loss": 3.4323,
+      "step": 30850
+    },
+    {
+      "epoch": 8.998310909196809,
+      "grad_norm": 0.3431978225708008,
+      "learning_rate": 0.0004923111888111887,
+      "loss": 3.4443,
+      "step": 30900
+    },
+    {
+      "epoch": 9.012813792300076,
+      "grad_norm": 0.3386369049549103,
+      "learning_rate": 0.0004921363636363635,
+      "loss": 3.3368,
+      "step": 30950
+    },
+    {
+      "epoch": 9.027374919913798,
+      "grad_norm": 0.35466766357421875,
+      "learning_rate": 0.0004919615384615384,
+      "loss": 3.3258,
+      "step": 31000
+    },
+    {
+      "epoch": 9.027374919913798,
+      "eval_accuracy": 0.36759758225444167,
+      "eval_loss": 3.57794451713562,
+      "eval_runtime": 184.3404,
+      "eval_samples_per_second": 90.3,
+      "eval_steps_per_second": 5.647,
+      "step": 31000
+    },
+    {
+      "epoch": 9.041936047527521,
+      "grad_norm": 0.3356529176235199,
+      "learning_rate": 0.0004917867132867132,
+      "loss": 3.3184,
+      "step": 31050
+    },
+    {
+      "epoch": 9.056497175141242,
+      "grad_norm": 0.3212270140647888,
+      "learning_rate": 0.000491611888111888,
+      "loss": 3.3383,
+      "step": 31100
+    },
+    {
+      "epoch": 9.071058302754965,
+      "grad_norm": 0.3324335813522339,
+      "learning_rate": 0.0004914370629370628,
+      "loss": 3.3392,
+      "step": 31150
+    },
+    {
+      "epoch": 9.085619430368688,
+      "grad_norm": 0.32331228256225586,
+      "learning_rate": 0.0004912622377622378,
+      "loss": 3.3521,
+      "step": 31200
+    },
+    {
+      "epoch": 9.10018055798241,
+      "grad_norm": 0.31954678893089294,
+      "learning_rate": 0.0004910874125874126,
+      "loss": 3.3435,
+      "step": 31250
+    },
+    {
+      "epoch": 9.114741685596133,
+      "grad_norm": 0.32974445819854736,
+      "learning_rate": 0.0004909125874125874,
+      "loss": 3.3501,
+      "step": 31300
+    },
+    {
+      "epoch": 9.129302813209854,
+      "grad_norm": 0.35506731271743774,
+      "learning_rate": 0.0004907377622377623,
+      "loss": 3.3467,
+      "step": 31350
+    },
+    {
+      "epoch": 9.143863940823577,
+      "grad_norm": 0.32969748973846436,
+      "learning_rate": 0.0004905629370629371,
+      "loss": 3.357,
+      "step": 31400
+    },
+    {
+      "epoch": 9.1584250684373,
+      "grad_norm": 0.3305834233760834,
+      "learning_rate": 0.0004903881118881119,
+      "loss": 3.3573,
+      "step": 31450
+    },
+    {
+      "epoch": 9.172986196051022,
+      "grad_norm": 0.33574923872947693,
+      "learning_rate": 0.0004902132867132867,
+      "loss": 3.3572,
+      "step": 31500
+    },
+    {
+      "epoch": 9.187547323664745,
+      "grad_norm": 0.32476624846458435,
+      "learning_rate": 0.0004900384615384615,
+      "loss": 3.3454,
+      "step": 31550
+    },
+    {
+      "epoch": 9.202108451278466,
+      "grad_norm": 0.36604878306388855,
+      "learning_rate": 0.0004898636363636363,
+      "loss": 3.3598,
+      "step": 31600
+    },
+    {
+      "epoch": 9.216669578892189,
+      "grad_norm": 0.3407774567604065,
+      "learning_rate": 0.0004896888111888112,
+      "loss": 3.3655,
+      "step": 31650
+    },
+    {
+      "epoch": 9.231230706505912,
+      "grad_norm": 0.3136043846607208,
+      "learning_rate": 0.000489513986013986,
+      "loss": 3.3658,
+      "step": 31700
+    },
+    {
+      "epoch": 9.245791834119634,
+      "grad_norm": 0.34752407670021057,
+      "learning_rate": 0.0004893391608391608,
+      "loss": 3.374,
+      "step": 31750
+    },
+    {
+      "epoch": 9.260352961733357,
+      "grad_norm": 0.33697524666786194,
+      "learning_rate": 0.0004891643356643356,
+      "loss": 3.3715,
+      "step": 31800
+    },
+    {
+      "epoch": 9.27491408934708,
+      "grad_norm": 0.3399849832057953,
+      "learning_rate": 0.0004889895104895105,
+      "loss": 3.3781,
+      "step": 31850
+    },
+    {
+      "epoch": 9.2894752169608,
+      "grad_norm": 0.32320427894592285,
+      "learning_rate": 0.0004888146853146853,
+      "loss": 3.3606,
+      "step": 31900
+    },
+    {
+      "epoch": 9.304036344574524,
+      "grad_norm": 0.3273387849330902,
+      "learning_rate": 0.0004886398601398601,
+      "loss": 3.3726,
+      "step": 31950
+    },
+    {
+      "epoch": 9.318597472188246,
+      "grad_norm": 0.33997225761413574,
+      "learning_rate": 0.000488465034965035,
+      "loss": 3.3831,
+      "step": 32000
+    },
+    {
+      "epoch": 9.318597472188246,
+      "eval_accuracy": 0.3672809861527707,
+      "eval_loss": 3.577822685241699,
+      "eval_runtime": 186.0219,
+      "eval_samples_per_second": 89.484,
+      "eval_steps_per_second": 5.596,
+      "step": 32000
+    },
+    {
+      "epoch": 9.333158599801969,
+      "grad_norm": 0.3397623896598816,
+      "learning_rate": 0.0004882902097902098,
+      "loss": 3.3802,
+      "step": 32050
+    },
+    {
+      "epoch": 9.347719727415692,
+      "grad_norm": 0.3965780735015869,
+      "learning_rate": 0.0004881153846153846,
+      "loss": 3.3961,
+      "step": 32100
+    },
+    {
+      "epoch": 9.362280855029413,
+      "grad_norm": 0.32509127259254456,
+      "learning_rate": 0.0004879405594405594,
+      "loss": 3.392,
+      "step": 32150
+    },
+    {
+      "epoch": 9.376841982643136,
+      "grad_norm": 0.3580123484134674,
+      "learning_rate": 0.00048776573426573424,
+      "loss": 3.3685,
+      "step": 32200
+    },
+    {
+      "epoch": 9.391403110256858,
+      "grad_norm": 0.33572641015052795,
+      "learning_rate": 0.00048759090909090904,
+      "loss": 3.3738,
+      "step": 32250
+    },
+    {
+      "epoch": 9.405964237870581,
+      "grad_norm": 0.34592849016189575,
+      "learning_rate": 0.0004874160839160839,
+      "loss": 3.3792,
+      "step": 32300
+    },
+    {
+      "epoch": 9.420525365484304,
+      "grad_norm": 0.39023056626319885,
+      "learning_rate": 0.0004872412587412587,
+      "loss": 3.3712,
+      "step": 32350
+    },
+    {
+      "epoch": 9.435086493098025,
+      "grad_norm": 0.3557857871055603,
+      "learning_rate": 0.00048706643356643354,
+      "loss": 3.3959,
+      "step": 32400
+    },
+    {
+      "epoch": 9.449647620711747,
+      "grad_norm": 0.3627590537071228,
+      "learning_rate": 0.00048689160839160834,
+      "loss": 3.3798,
+      "step": 32450
+    },
+    {
+      "epoch": 9.46420874832547,
+      "grad_norm": 0.34032562375068665,
+      "learning_rate": 0.0004867167832167832,
+      "loss": 3.3949,
+      "step": 32500
+    },
+    {
+      "epoch": 9.478769875939193,
+      "grad_norm": 0.32405319809913635,
+      "learning_rate": 0.00048654195804195794,
+      "loss": 3.382,
+      "step": 32550
+    },
+    {
+      "epoch": 9.493331003552916,
+      "grad_norm": 0.34905362129211426,
+      "learning_rate": 0.00048636713286713285,
+      "loss": 3.3953,
+      "step": 32600
+    },
+    {
+      "epoch": 9.507892131166638,
+      "grad_norm": 0.3418472409248352,
+      "learning_rate": 0.0004861923076923077,
+      "loss": 3.3972,
+      "step": 32650
+    },
+    {
+      "epoch": 9.52245325878036,
+      "grad_norm": 0.3480176031589508,
+      "learning_rate": 0.00048601748251748245,
+      "loss": 3.3983,
+      "step": 32700
+    },
+    {
+      "epoch": 9.537014386394082,
+      "grad_norm": 0.3377174139022827,
+      "learning_rate": 0.0004858426573426573,
+      "loss": 3.3821,
+      "step": 32750
+    },
+    {
+      "epoch": 9.551575514007805,
+      "grad_norm": 0.3357522487640381,
+      "learning_rate": 0.0004856678321678321,
+      "loss": 3.3975,
+      "step": 32800
+    },
+    {
+      "epoch": 9.566136641621528,
+      "grad_norm": 0.32841238379478455,
+      "learning_rate": 0.00048549300699300696,
+      "loss": 3.3976,
+      "step": 32850
+    },
+    {
+      "epoch": 9.58069776923525,
+      "grad_norm": 0.33749887347221375,
+      "learning_rate": 0.00048531818181818176,
+      "loss": 3.4136,
+      "step": 32900
+    },
+    {
+      "epoch": 9.595258896848971,
+      "grad_norm": 0.3626416325569153,
+      "learning_rate": 0.0004851433566433566,
+      "loss": 3.3985,
+      "step": 32950
+    },
+    {
+      "epoch": 9.609820024462694,
+      "grad_norm": 0.36860400438308716,
+      "learning_rate": 0.0004849685314685314,
+      "loss": 3.3965,
+      "step": 33000
+    },
+    {
+      "epoch": 9.609820024462694,
+      "eval_accuracy": 0.36782729958925975,
+      "eval_loss": 3.5715696811676025,
+      "eval_runtime": 183.923,
+      "eval_samples_per_second": 90.505,
+      "eval_steps_per_second": 5.66,
+      "step": 33000
+    },
+    {
+      "epoch": 9.624381152076417,
+      "grad_norm": 0.3426574468612671,
+      "learning_rate": 0.00048479370629370627,
+      "loss": 3.4014,
+      "step": 33050
+    },
+    {
+      "epoch": 9.63894227969014,
+      "grad_norm": 0.3384750783443451,
+      "learning_rate": 0.00048461888111888106,
+      "loss": 3.4112,
+      "step": 33100
+    },
+    {
+      "epoch": 9.653503407303862,
+      "grad_norm": 0.35202690958976746,
+      "learning_rate": 0.0004844440559440559,
+      "loss": 3.4129,
+      "step": 33150
+    },
+    {
+      "epoch": 9.668064534917583,
+      "grad_norm": 0.355497270822525,
+      "learning_rate": 0.0004842692307692307,
+      "loss": 3.3892,
+      "step": 33200
+    },
+    {
+      "epoch": 9.682625662531306,
+      "grad_norm": 0.32850146293640137,
+      "learning_rate": 0.00048409440559440557,
+      "loss": 3.4,
+      "step": 33250
+    },
+    {
+      "epoch": 9.697186790145029,
+      "grad_norm": 0.3368713855743408,
+      "learning_rate": 0.0004839195804195803,
+      "loss": 3.4127,
+      "step": 33300
+    },
+    {
+      "epoch": 9.711747917758752,
+      "grad_norm": 0.3568696677684784,
+      "learning_rate": 0.0004837447552447552,
+      "loss": 3.3944,
+      "step": 33350
+    },
+    {
+      "epoch": 9.726309045372474,
+      "grad_norm": 0.32732048630714417,
+      "learning_rate": 0.0004835699300699301,
+      "loss": 3.4031,
+      "step": 33400
+    },
+    {
+      "epoch": 9.740870172986195,
+      "grad_norm": 0.3446010649204254,
+      "learning_rate": 0.0004833951048951048,
+      "loss": 3.4169,
+      "step": 33450
+    },
+    {
+      "epoch": 9.755431300599918,
+      "grad_norm": 0.32168522477149963,
+      "learning_rate": 0.0004832202797202797,
+      "loss": 3.4106,
+      "step": 33500
+    },
+    {
+      "epoch": 9.76999242821364,
+      "grad_norm": 0.35548439621925354,
+      "learning_rate": 0.0004830454545454545,
+      "loss": 3.3974,
+      "step": 33550
+    },
+    {
+      "epoch": 9.784553555827364,
+      "grad_norm": 0.3315522074699402,
+      "learning_rate": 0.00048287062937062933,
+      "loss": 3.4138,
+      "step": 33600
+    },
+    {
+      "epoch": 9.799114683441086,
+      "grad_norm": 0.33013713359832764,
+      "learning_rate": 0.00048269580419580413,
+      "loss": 3.4092,
+      "step": 33650
+    },
+    {
+      "epoch": 9.813675811054807,
+      "grad_norm": 0.34848856925964355,
+      "learning_rate": 0.000482520979020979,
+      "loss": 3.4104,
+      "step": 33700
+    },
+    {
+      "epoch": 9.82823693866853,
+      "grad_norm": 0.32687628269195557,
+      "learning_rate": 0.0004823461538461538,
+      "loss": 3.4018,
+      "step": 33750
+    },
+    {
+      "epoch": 9.842798066282253,
+      "grad_norm": 0.36140188574790955,
+      "learning_rate": 0.00048217132867132864,
+      "loss": 3.4006,
+      "step": 33800
+    },
+    {
+      "epoch": 9.857359193895975,
+      "grad_norm": 0.32018741965293884,
+      "learning_rate": 0.00048199650349650344,
+      "loss": 3.4013,
+      "step": 33850
+    },
+    {
+      "epoch": 9.871920321509698,
+      "grad_norm": 0.34114909172058105,
+      "learning_rate": 0.0004818216783216783,
+      "loss": 3.3959,
+      "step": 33900
+    },
+    {
+      "epoch": 9.88648144912342,
+      "grad_norm": 0.32743898034095764,
+      "learning_rate": 0.0004816468531468531,
+      "loss": 3.4191,
+      "step": 33950
+    },
+    {
+      "epoch": 9.901042576737142,
+      "grad_norm": 0.37873852252960205,
+      "learning_rate": 0.00048147202797202795,
+      "loss": 3.4187,
+      "step": 34000
+    },
+    {
+      "epoch": 9.901042576737142,
+      "eval_accuracy": 0.36834492774954836,
+      "eval_loss": 3.561105728149414,
+      "eval_runtime": 179.7864,
+      "eval_samples_per_second": 92.588,
+      "eval_steps_per_second": 5.79,
+      "step": 34000
+    },
+    {
+      "epoch": 9.915603704350865,
+      "grad_norm": 0.33194535970687866,
+      "learning_rate": 0.0004812972027972028,
+      "loss": 3.4104,
+      "step": 34050
+    },
+    {
+      "epoch": 9.930164831964587,
+      "grad_norm": 0.3524761497974396,
+      "learning_rate": 0.0004811223776223776,
+      "loss": 3.4132,
+      "step": 34100
+    },
+    {
+      "epoch": 9.94472595957831,
+      "grad_norm": 0.3482424020767212,
+      "learning_rate": 0.00048094755244755245,
+      "loss": 3.4059,
+      "step": 34150
+    },
+    {
+      "epoch": 9.959287087192033,
+      "grad_norm": 0.350705087184906,
+      "learning_rate": 0.0004807727272727272,
+      "loss": 3.4047,
+      "step": 34200
+    },
+    {
+      "epoch": 9.973848214805754,
+      "grad_norm": 0.33391574025154114,
+      "learning_rate": 0.00048059790209790205,
+      "loss": 3.4213,
+      "step": 34250
+    },
+    {
+      "epoch": 9.988409342419477,
+      "grad_norm": 0.3514692187309265,
+      "learning_rate": 0.00048042307692307685,
+      "loss": 3.4138,
+      "step": 34300
+    },
+    {
+      "epoch": 10.002912225522744,
+      "grad_norm": 0.3422977030277252,
+      "learning_rate": 0.0004802482517482517,
+      "loss": 3.3925,
+      "step": 34350
+    },
+    {
+      "epoch": 10.017473353136467,
+      "grad_norm": 0.33694586157798767,
+      "learning_rate": 0.0004800734265734265,
+      "loss": 3.296,
+      "step": 34400
+    },
+    {
+      "epoch": 10.03203448075019,
+      "grad_norm": 0.3611753284931183,
+      "learning_rate": 0.00047989860139860136,
+      "loss": 3.2954,
+      "step": 34450
+    },
+    {
+      "epoch": 10.046595608363912,
+      "grad_norm": 0.3551093637943268,
+      "learning_rate": 0.00047972377622377616,
+      "loss": 3.2935,
+      "step": 34500
+    },
+    {
+      "epoch": 10.061156735977635,
+      "grad_norm": 0.37375402450561523,
+      "learning_rate": 0.000479548951048951,
+      "loss": 3.3175,
+      "step": 34550
+    },
+    {
+      "epoch": 10.075717863591356,
+      "grad_norm": 0.365528404712677,
+      "learning_rate": 0.0004793741258741258,
+      "loss": 3.3185,
+      "step": 34600
+    },
+    {
+      "epoch": 10.090278991205079,
+      "grad_norm": 0.35895606875419617,
+      "learning_rate": 0.00047919930069930067,
+      "loss": 3.3174,
+      "step": 34650
+    },
+    {
+      "epoch": 10.104840118818801,
+      "grad_norm": 0.32946503162384033,
+      "learning_rate": 0.0004790244755244755,
+      "loss": 3.32,
+      "step": 34700
+    },
+    {
+      "epoch": 10.119401246432524,
+      "grad_norm": 0.33243829011917114,
+      "learning_rate": 0.0004788496503496503,
+      "loss": 3.3142,
+      "step": 34750
+    },
+    {
+      "epoch": 10.133962374046247,
+      "grad_norm": 0.3511507511138916,
+      "learning_rate": 0.0004786748251748252,
+      "loss": 3.3314,
+      "step": 34800
+    },
+    {
+      "epoch": 10.148523501659968,
+      "grad_norm": 0.3584575653076172,
+      "learning_rate": 0.0004785,
+      "loss": 3.3345,
+      "step": 34850
+    },
+    {
+      "epoch": 10.16308462927369,
+      "grad_norm": 0.3298545479774475,
+      "learning_rate": 0.00047832517482517483,
+      "loss": 3.339,
+      "step": 34900
+    },
+    {
+      "epoch": 10.177645756887413,
+      "grad_norm": 0.3483952581882477,
+      "learning_rate": 0.0004781503496503496,
+      "loss": 3.324,
+      "step": 34950
+    },
+    {
+      "epoch": 10.192206884501136,
+      "grad_norm": 0.3524647057056427,
+      "learning_rate": 0.00047797552447552443,
+      "loss": 3.344,
+      "step": 35000
+    },
+    {
+      "epoch": 10.192206884501136,
+      "eval_accuracy": 0.3683687929588463,
+      "eval_loss": 3.5726046562194824,
+      "eval_runtime": 180.0575,
+      "eval_samples_per_second": 92.448,
+      "eval_steps_per_second": 5.781,
+      "step": 35000
+    },
+    {
+      "epoch": 10.206768012114859,
+      "grad_norm": 0.3376518189907074,
+      "learning_rate": 0.00047780069930069923,
+      "loss": 3.3347,
+      "step": 35050
+    },
+    {
+      "epoch": 10.221329139728581,
+      "grad_norm": 0.3457695543766022,
+      "learning_rate": 0.0004776258741258741,
+      "loss": 3.3483,
+      "step": 35100
+    },
+    {
+      "epoch": 10.235890267342302,
+      "grad_norm": 0.37430188059806824,
+      "learning_rate": 0.0004774510489510489,
+      "loss": 3.3552,
+      "step": 35150
+    },
+    {
+      "epoch": 10.250451394956025,
+      "grad_norm": 0.3510351777076721,
+      "learning_rate": 0.00047727622377622374,
+      "loss": 3.3549,
+      "step": 35200
+    },
+    {
+      "epoch": 10.265012522569748,
+      "grad_norm": 0.37889590859413147,
+      "learning_rate": 0.00047710139860139854,
+      "loss": 3.3416,
+      "step": 35250
+    },
+    {
+      "epoch": 10.27957365018347,
+      "grad_norm": 0.3422775864601135,
+      "learning_rate": 0.0004769265734265734,
+      "loss": 3.3383,
+      "step": 35300
+    },
+    {
+      "epoch": 10.294134777797193,
+      "grad_norm": 0.38626229763031006,
+      "learning_rate": 0.0004767517482517482,
+      "loss": 3.3461,
+      "step": 35350
+    },
+    {
+      "epoch": 10.308695905410914,
+      "grad_norm": 0.3493908643722534,
+      "learning_rate": 0.00047657692307692304,
+      "loss": 3.3535,
+      "step": 35400
+    },
+    {
+      "epoch": 10.323257033024637,
+      "grad_norm": 0.35432669520378113,
+      "learning_rate": 0.0004764020979020979,
+      "loss": 3.3555,
+      "step": 35450
+    },
+    {
+      "epoch": 10.33781816063836,
+      "grad_norm": 0.3410918116569519,
+      "learning_rate": 0.0004762272727272727,
+      "loss": 3.3469,
+      "step": 35500
+    },
+    {
+      "epoch": 10.352379288252083,
+      "grad_norm": 0.36023515462875366,
+      "learning_rate": 0.00047605244755244755,
+      "loss": 3.3584,
+      "step": 35550
+    },
+    {
+      "epoch": 10.366940415865805,
+      "grad_norm": 0.3287743330001831,
+      "learning_rate": 0.00047587762237762235,
+      "loss": 3.3674,
+      "step": 35600
+    },
+    {
+      "epoch": 10.381501543479526,
+      "grad_norm": 0.3435341715812683,
+      "learning_rate": 0.0004757027972027972,
+      "loss": 3.3646,
+      "step": 35650
+    },
+    {
+      "epoch": 10.396062671093249,
+      "grad_norm": 0.3478064239025116,
+      "learning_rate": 0.00047552797202797195,
+      "loss": 3.3675,
+      "step": 35700
+    },
+    {
+      "epoch": 10.410623798706972,
+      "grad_norm": 0.3641142249107361,
+      "learning_rate": 0.0004753531468531468,
+      "loss": 3.3591,
+      "step": 35750
+    },
+    {
+      "epoch": 10.425184926320695,
+      "grad_norm": 0.3393605649471283,
+      "learning_rate": 0.0004751783216783216,
+      "loss": 3.3615,
+      "step": 35800
+    },
+    {
+      "epoch": 10.439746053934417,
+      "grad_norm": 0.38742467761039734,
+      "learning_rate": 0.00047500349650349646,
+      "loss": 3.3701,
+      "step": 35850
+    },
+    {
+      "epoch": 10.454307181548138,
+      "grad_norm": 0.37009376287460327,
+      "learning_rate": 0.00047482867132867126,
+      "loss": 3.3576,
+      "step": 35900
+    },
+    {
+      "epoch": 10.468868309161861,
+      "grad_norm": 0.36964377760887146,
+      "learning_rate": 0.0004746538461538461,
+      "loss": 3.3546,
+      "step": 35950
+    },
+    {
+      "epoch": 10.483429436775584,
+      "grad_norm": 0.3347964882850647,
+      "learning_rate": 0.0004744790209790209,
+      "loss": 3.3789,
+      "step": 36000
+    },
+    {
+      "epoch": 10.483429436775584,
+      "eval_accuracy": 0.3682949636414124,
+      "eval_loss": 3.5694682598114014,
+      "eval_runtime": 179.8767,
+      "eval_samples_per_second": 92.541,
+      "eval_steps_per_second": 5.787,
+      "step": 36000
+    },
+    {
+      "epoch": 10.497990564389307,
+      "grad_norm": 0.3568975329399109,
+      "learning_rate": 0.00047430419580419576,
+      "loss": 3.3692,
+      "step": 36050
+    },
+    {
+      "epoch": 10.51255169200303,
+      "grad_norm": 0.3243386447429657,
+      "learning_rate": 0.0004741293706293706,
+      "loss": 3.3693,
+      "step": 36100
+    },
+    {
+      "epoch": 10.52711281961675,
+      "grad_norm": 0.3336549997329712,
+      "learning_rate": 0.0004739545454545454,
+      "loss": 3.3785,
+      "step": 36150
+    },
+    {
+      "epoch": 10.541673947230473,
+      "grad_norm": 0.3561848998069763,
+      "learning_rate": 0.00047377972027972027,
+      "loss": 3.3691,
+      "step": 36200
+    },
+    {
+      "epoch": 10.556235074844196,
+      "grad_norm": 0.356851726770401,
+      "learning_rate": 0.00047360489510489507,
+      "loss": 3.3643,
+      "step": 36250
+    },
+    {
+      "epoch": 10.570796202457919,
+      "grad_norm": 0.33825376629829407,
+      "learning_rate": 0.0004734300699300699,
+      "loss": 3.3913,
+      "step": 36300
+    },
+    {
+      "epoch": 10.585357330071641,
+      "grad_norm": 0.3185909390449524,
+      "learning_rate": 0.0004732552447552447,
+      "loss": 3.3784,
+      "step": 36350
+    },
+    {
+      "epoch": 10.599918457685362,
+      "grad_norm": 0.3499145805835724,
+      "learning_rate": 0.0004730804195804196,
+      "loss": 3.379,
+      "step": 36400
+    },
+    {
+      "epoch": 10.614479585299085,
+      "grad_norm": null,
+      "learning_rate": 0.0004729055944055943,
+      "loss": 3.3796,
+      "step": 36450
+    },
+    {
+      "epoch": 10.629040712912808,
+      "grad_norm": 0.34612196683883667,
+      "learning_rate": 0.0004727307692307692,
+      "loss": 3.3845,
+      "step": 36500
+    },
+    {
+      "epoch": 10.64360184052653,
+      "grad_norm": 0.3680227994918823,
+      "learning_rate": 0.000472555944055944,
+      "loss": 3.3761,
+      "step": 36550
+    },
+    {
+      "epoch": 10.658162968140253,
+      "grad_norm": 0.36743855476379395,
+      "learning_rate": 0.00047238111888111883,
+      "loss": 3.3684,
+      "step": 36600
+    },
+    {
+      "epoch": 10.672724095753976,
+      "grad_norm": 0.33822494745254517,
+      "learning_rate": 0.00047220629370629363,
+      "loss": 3.3854,
+      "step": 36650
+    },
+    {
+      "epoch": 10.687285223367697,
+      "grad_norm": 0.3840745687484741,
+      "learning_rate": 0.0004720314685314685,
+      "loss": 3.3676,
+      "step": 36700
+    },
+    {
+      "epoch": 10.70184635098142,
+      "grad_norm": 0.3411411941051483,
+      "learning_rate": 0.0004718566433566433,
+      "loss": 3.3732,
+      "step": 36750
+    },
+    {
+      "epoch": 10.716407478595142,
+      "grad_norm": 0.3389338552951813,
+      "learning_rate": 0.00047168181818181814,
+      "loss": 3.3848,
+      "step": 36800
+    },
+    {
+      "epoch": 10.730968606208865,
+      "grad_norm": 0.33619245886802673,
+      "learning_rate": 0.000471506993006993,
+      "loss": 3.3761,
+      "step": 36850
+    },
+    {
+      "epoch": 10.745529733822588,
+      "grad_norm": 0.35158175230026245,
+      "learning_rate": 0.0004713321678321678,
+      "loss": 3.3823,
+      "step": 36900
+    },
+    {
+      "epoch": 10.760090861436309,
+      "grad_norm": 0.3377283811569214,
+      "learning_rate": 0.00047115734265734265,
+      "loss": 3.4041,
+      "step": 36950
+    },
+    {
+      "epoch": 10.774651989050032,
+      "grad_norm": 0.3622257113456726,
+      "learning_rate": 0.00047098251748251745,
+      "loss": 3.4005,
+      "step": 37000
+    },
+    {
+      "epoch": 10.774651989050032,
+      "eval_accuracy": 0.36925674333237796,
+      "eval_loss": 3.5602827072143555,
+      "eval_runtime": 179.7742,
+      "eval_samples_per_second": 92.594,
+      "eval_steps_per_second": 5.791,
+      "step": 37000
+    },
+    {
+      "epoch": 10.789213116663754,
+      "grad_norm": 0.3160146474838257,
+      "learning_rate": 0.0004708076923076923,
+      "loss": 3.3912,
+      "step": 37050
+    },
+    {
+      "epoch": 10.803774244277477,
+      "grad_norm": 0.3281710147857666,
+      "learning_rate": 0.0004706328671328671,
+      "loss": 3.3967,
+      "step": 37100
+    },
+    {
+      "epoch": 10.8183353718912,
+      "grad_norm": 0.3841243386268616,
+      "learning_rate": 0.00047045804195804195,
+      "loss": 3.3979,
+      "step": 37150
+    },
+    {
+      "epoch": 10.83289649950492,
+      "grad_norm": 0.33870750665664673,
+      "learning_rate": 0.0004702832167832167,
+      "loss": 3.3879,
+      "step": 37200
+    },
+    {
+      "epoch": 10.847457627118644,
+      "grad_norm": 0.34300288558006287,
+      "learning_rate": 0.00047010839160839155,
+      "loss": 3.3959,
+      "step": 37250
+    },
+    {
+      "epoch": 10.862018754732366,
+      "grad_norm": 0.35645779967308044,
+      "learning_rate": 0.00046993356643356635,
+      "loss": 3.3974,
+      "step": 37300
+    },
+    {
+      "epoch": 10.876579882346089,
+      "grad_norm": 0.35175466537475586,
+      "learning_rate": 0.0004697587412587412,
+      "loss": 3.3888,
+      "step": 37350
+    },
+    {
+      "epoch": 10.891141009959812,
+      "grad_norm": 0.3222729563713074,
+      "learning_rate": 0.000469583916083916,
+      "loss": 3.3981,
+      "step": 37400
+    },
+    {
+      "epoch": 10.905702137573535,
+      "grad_norm": 0.32414838671684265,
+      "learning_rate": 0.00046940909090909086,
+      "loss": 3.391,
+      "step": 37450
+    },
+    {
+      "epoch": 10.920263265187256,
+      "grad_norm": 0.34362274408340454,
+      "learning_rate": 0.0004692342657342657,
+      "loss": 3.3832,
+      "step": 37500
+    },
+    {
+      "epoch": 10.934824392800978,
+      "grad_norm": 0.3682989776134491,
+      "learning_rate": 0.0004690594405594405,
+      "loss": 3.3894,
+      "step": 37550
+    },
+    {
+      "epoch": 10.949385520414701,
+      "grad_norm": 0.3218347728252411,
+      "learning_rate": 0.00046888461538461537,
+      "loss": 3.3901,
+      "step": 37600
+    },
+    {
+      "epoch": 10.963946648028424,
+      "grad_norm": 0.37173983454704285,
+      "learning_rate": 0.00046870979020979017,
+      "loss": 3.3884,
+      "step": 37650
+    },
+    {
+      "epoch": 10.978507775642147,
+      "grad_norm": 0.37107351422309875,
+      "learning_rate": 0.000468534965034965,
+      "loss": 3.3922,
+      "step": 37700
+    },
+    {
+      "epoch": 10.993068903255867,
+      "grad_norm": 0.35204780101776123,
+      "learning_rate": 0.0004683601398601398,
+      "loss": 3.3753,
+      "step": 37750
+    },
+    {
+      "epoch": 11.007571786359136,
+      "grad_norm": 0.33582913875579834,
+      "learning_rate": 0.0004681853146853147,
+      "loss": 3.3251,
+      "step": 37800
+    },
+    {
+      "epoch": 11.022132913972857,
+      "grad_norm": 0.33512166142463684,
+      "learning_rate": 0.0004680104895104895,
+      "loss": 3.2653,
+      "step": 37850
+    },
+    {
+      "epoch": 11.03669404158658,
+      "grad_norm": 0.3530137240886688,
+      "learning_rate": 0.00046783566433566433,
+      "loss": 3.2888,
+      "step": 37900
+    },
+    {
+      "epoch": 11.051255169200303,
+      "grad_norm": 0.3322924077510834,
+      "learning_rate": 0.0004676608391608391,
+      "loss": 3.304,
+      "step": 37950
+    },
+    {
+      "epoch": 11.065816296814026,
+      "grad_norm": 0.34434235095977783,
+      "learning_rate": 0.00046748601398601393,
+      "loss": 3.2886,
+      "step": 38000
+    },
+    {
+      "epoch": 11.065816296814026,
+      "eval_accuracy": 0.36925674333237796,
+      "eval_loss": 3.5678048133850098,
+      "eval_runtime": 179.7265,
+      "eval_samples_per_second": 92.618,
+      "eval_steps_per_second": 5.792,
+      "step": 38000
+    },
+    {
+      "epoch": 11.080377424427748,
+      "grad_norm": 0.3417208790779114,
+      "learning_rate": 0.00046731118881118873,
+      "loss": 3.2852,
+      "step": 38050
+    },
+    {
+      "epoch": 11.09493855204147,
+      "grad_norm": 0.36699753999710083,
+      "learning_rate": 0.0004671363636363636,
+      "loss": 3.2885,
+      "step": 38100
+    },
+    {
+      "epoch": 11.109499679655192,
+      "grad_norm": 0.336487740278244,
+      "learning_rate": 0.00046696153846153844,
+      "loss": 3.309,
+      "step": 38150
+    },
+    {
+      "epoch": 11.124060807268915,
+      "grad_norm": 0.35883960127830505,
+      "learning_rate": 0.00046678671328671324,
+      "loss": 3.3019,
+      "step": 38200
+    },
+    {
+      "epoch": 11.138621934882638,
+      "grad_norm": 0.3807854950428009,
+      "learning_rate": 0.0004666118881118881,
+      "loss": 3.3034,
+      "step": 38250
+    },
+    {
+      "epoch": 11.15318306249636,
+      "grad_norm": 0.3594328463077545,
+      "learning_rate": 0.0004664370629370629,
+      "loss": 3.2984,
+      "step": 38300
+    },
+    {
+      "epoch": 11.167744190110081,
+      "grad_norm": 0.344275563955307,
+      "learning_rate": 0.00046626223776223774,
+      "loss": 3.3229,
+      "step": 38350
+    },
+    {
+      "epoch": 11.182305317723804,
+      "grad_norm": 0.3437211811542511,
+      "learning_rate": 0.00046608741258741254,
+      "loss": 3.3241,
+      "step": 38400
+    },
+    {
+      "epoch": 11.196866445337527,
+      "grad_norm": 0.3443619906902313,
+      "learning_rate": 0.0004659125874125874,
+      "loss": 3.319,
+      "step": 38450
+    },
+    {
+      "epoch": 11.21142757295125,
+      "grad_norm": 0.3490242063999176,
+      "learning_rate": 0.0004657377622377622,
+      "loss": 3.3212,
+      "step": 38500
+    },
+    {
+      "epoch": 11.225988700564972,
+      "grad_norm": 0.33340850472450256,
+      "learning_rate": 0.00046556293706293705,
+      "loss": 3.3279,
+      "step": 38550
+    },
+    {
+      "epoch": 11.240549828178693,
+      "grad_norm": 0.33329835534095764,
+      "learning_rate": 0.00046538811188811185,
+      "loss": 3.3262,
+      "step": 38600
+    },
+    {
+      "epoch": 11.255110955792416,
+      "grad_norm": 0.3586530387401581,
+      "learning_rate": 0.0004652132867132867,
+      "loss": 3.3209,
+      "step": 38650
+    },
+    {
+      "epoch": 11.269672083406139,
+      "grad_norm": 0.3424071669578552,
+      "learning_rate": 0.00046503846153846145,
+      "loss": 3.3249,
+      "step": 38700
+    },
+    {
+      "epoch": 11.284233211019862,
+      "grad_norm": 0.36820656061172485,
+      "learning_rate": 0.0004648636363636363,
+      "loss": 3.3411,
+      "step": 38750
+    },
+    {
+      "epoch": 11.298794338633584,
+      "grad_norm": 0.33335718512535095,
+      "learning_rate": 0.0004646888111888111,
+      "loss": 3.3367,
+      "step": 38800
+    },
+    {
+      "epoch": 11.313355466247307,
+      "grad_norm": 0.3624469041824341,
+      "learning_rate": 0.00046451398601398596,
+      "loss": 3.3382,
+      "step": 38850
+    },
+    {
+      "epoch": 11.327916593861028,
+      "grad_norm": 0.3541378974914551,
+      "learning_rate": 0.0004643391608391608,
+      "loss": 3.335,
+      "step": 38900
+    },
+    {
+      "epoch": 11.34247772147475,
+      "grad_norm": 0.3461047112941742,
+      "learning_rate": 0.0004641643356643356,
+      "loss": 3.3381,
+      "step": 38950
+    },
+    {
+      "epoch": 11.357038849088473,
+      "grad_norm": 0.33990010619163513,
+      "learning_rate": 0.00046398951048951046,
+      "loss": 3.339,
+      "step": 39000
+    },
+    {
+      "epoch": 11.357038849088473,
+      "eval_accuracy": 0.36928096122949794,
+      "eval_loss": 3.5659894943237305,
+      "eval_runtime": 179.756,
+      "eval_samples_per_second": 92.603,
+      "eval_steps_per_second": 5.791,
+      "step": 39000
+    },
+    {
+      "epoch": 11.371599976702196,
+      "grad_norm": 0.32082730531692505,
+      "learning_rate": 0.00046381468531468526,
+      "loss": 3.3427,
+      "step": 39050
+    },
+    {
+      "epoch": 11.386161104315919,
+      "grad_norm": 0.34358900785446167,
+      "learning_rate": 0.0004636398601398601,
+      "loss": 3.3326,
+      "step": 39100
+    },
+    {
+      "epoch": 11.40072223192964,
+      "grad_norm": 0.3438430428504944,
+      "learning_rate": 0.0004634650349650349,
+      "loss": 3.3397,
+      "step": 39150
+    },
+    {
+      "epoch": 11.415283359543363,
+      "grad_norm": 0.33695173263549805,
+      "learning_rate": 0.00046329020979020977,
+      "loss": 3.3419,
+      "step": 39200
+    },
+    {
+      "epoch": 11.429844487157085,
+      "grad_norm": 0.35443180799484253,
+      "learning_rate": 0.00046311538461538457,
+      "loss": 3.3384,
+      "step": 39250
+    },
+    {
+      "epoch": 11.444405614770808,
+      "grad_norm": 0.3469848930835724,
+      "learning_rate": 0.0004629405594405594,
+      "loss": 3.3428,
+      "step": 39300
+    },
+    {
+      "epoch": 11.458966742384531,
+      "grad_norm": 0.3489019274711609,
+      "learning_rate": 0.0004627657342657342,
+      "loss": 3.3367,
+      "step": 39350
+    },
+    {
+      "epoch": 11.473527869998252,
+      "grad_norm": 0.34575167298316956,
+      "learning_rate": 0.0004625909090909091,
+      "loss": 3.3421,
+      "step": 39400
+    },
+    {
+      "epoch": 11.488088997611975,
+      "grad_norm": 0.36594027280807495,
+      "learning_rate": 0.0004624160839160838,
+      "loss": 3.3443,
+      "step": 39450
+    },
+    {
+      "epoch": 11.502650125225697,
+      "grad_norm": 0.37035486102104187,
+      "learning_rate": 0.0004622412587412587,
+      "loss": 3.3556,
+      "step": 39500
+    },
+    {
+      "epoch": 11.51721125283942,
+      "grad_norm": 0.378604918718338,
+      "learning_rate": 0.00046206643356643353,
+      "loss": 3.3449,
+      "step": 39550
+    },
+    {
+      "epoch": 11.531772380453143,
+      "grad_norm": 0.37700214982032776,
+      "learning_rate": 0.00046189160839160833,
+      "loss": 3.3588,
+      "step": 39600
+    },
+    {
+      "epoch": 11.546333508066864,
+      "grad_norm": 0.3387846350669861,
+      "learning_rate": 0.0004617167832167832,
+      "loss": 3.3551,
+      "step": 39650
+    },
+    {
+      "epoch": 11.560894635680587,
+      "grad_norm": 0.35091421008110046,
+      "learning_rate": 0.000461541958041958,
+      "loss": 3.3484,
+      "step": 39700
+    },
+    {
+      "epoch": 11.57545576329431,
+      "grad_norm": 0.36825278401374817,
+      "learning_rate": 0.00046136713286713284,
+      "loss": 3.3578,
+      "step": 39750
+    },
+    {
+      "epoch": 11.590016890908032,
+      "grad_norm": 0.3793783485889435,
+      "learning_rate": 0.00046119230769230764,
+      "loss": 3.3567,
+      "step": 39800
+    },
+    {
+      "epoch": 11.604578018521755,
+      "grad_norm": 0.35660460591316223,
+      "learning_rate": 0.0004610174825174825,
+      "loss": 3.3665,
+      "step": 39850
+    },
+    {
+      "epoch": 11.619139146135478,
+      "grad_norm": 0.3568241000175476,
+      "learning_rate": 0.0004608426573426573,
+      "loss": 3.3687,
+      "step": 39900
+    },
+    {
+      "epoch": 11.633700273749199,
+      "grad_norm": 0.3692575991153717,
+      "learning_rate": 0.00046066783216783215,
+      "loss": 3.3489,
+      "step": 39950
+    },
+    {
+      "epoch": 11.648261401362921,
+      "grad_norm": 0.33610156178474426,
+      "learning_rate": 0.00046049300699300695,
+      "loss": 3.3622,
+      "step": 40000
+    },
+    {
+      "epoch": 11.648261401362921,
+      "eval_accuracy": 0.36969924898655154,
+      "eval_loss": 3.5569655895233154,
+      "eval_runtime": 179.7612,
+      "eval_samples_per_second": 92.601,
+      "eval_steps_per_second": 5.791,
+      "step": 40000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 171700,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 10000,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 20,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 0
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.3607701815296e+17,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}