diff --git "a/last_to_push_frequency_3591/checkpoint-30000/trainer_state.json" "b/last_to_push_frequency_3591/checkpoint-30000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/last_to_push_frequency_3591/checkpoint-30000/trainer_state.json"
@@ -0,0 +1,4513 @@
+{
+  "best_global_step": 30000,
+  "best_metric": 3.57382869720459,
+  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_push_frequency_3591/checkpoint-30000",
+  "epoch": 8.736210612149804,
+  "eval_steps": 1000,
+  "global_step": 30000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.014561127613722406,
+      "grad_norm": 0.8466977477073669,
+      "learning_rate": 0.000294,
+      "loss": 8.4098,
+      "step": 50
+    },
+    {
+      "epoch": 0.029122255227444813,
+      "grad_norm": 0.6842089295387268,
+      "learning_rate": 0.0005939999999999999,
+      "loss": 6.7014,
+      "step": 100
+    },
+    {
+      "epoch": 0.04368338284116722,
+      "grad_norm": 0.7142037749290466,
+      "learning_rate": 0.0005998286713286713,
+      "loss": 6.34,
+      "step": 150
+    },
+    {
+      "epoch": 0.058244510454889625,
+      "grad_norm": 0.5938196778297424,
+      "learning_rate": 0.0005996538461538461,
+      "loss": 6.139,
+      "step": 200
+    },
+    {
+      "epoch": 0.07280563806861204,
+      "grad_norm": 0.3856406807899475,
+      "learning_rate": 0.0005994790209790209,
+      "loss": 5.9921,
+      "step": 250
+    },
+    {
+      "epoch": 0.08736676568233444,
+      "grad_norm": 0.39986154437065125,
+      "learning_rate": 0.0005993041958041958,
+      "loss": 5.8431,
+      "step": 300
+    },
+    {
+      "epoch": 0.10192789329605685,
+      "grad_norm": 0.5341420769691467,
+      "learning_rate": 0.0005991293706293705,
+      "loss": 5.7043,
+      "step": 350
+    },
+    {
+      "epoch": 0.11648902090977925,
+      "grad_norm": 0.4427669644355774,
+      "learning_rate": 0.0005989545454545454,
+      "loss": 5.6019,
+      "step": 400
+    },
+    {
+      "epoch": 0.13105014852350166,
+      "grad_norm": 0.45295220613479614,
+      "learning_rate": 0.0005987797202797202,
+      "loss": 5.4996,
+      "step": 450
+    },
+    {
+      "epoch": 0.14561127613722408,
+      "grad_norm": 0.5014562606811523,
+      "learning_rate": 0.000598604895104895,
+      "loss": 5.3991,
+      "step": 500
+    },
+    {
+      "epoch": 0.16017240375094646,
+      "grad_norm": 0.4532122015953064,
+      "learning_rate": 0.0005984300699300698,
+      "loss": 5.3103,
+      "step": 550
+    },
+    {
+      "epoch": 0.17473353136466888,
+      "grad_norm": 0.45960476994514465,
+      "learning_rate": 0.0005982552447552447,
+      "loss": 5.2526,
+      "step": 600
+    },
+    {
+      "epoch": 0.1892946589783913,
+      "grad_norm": 0.4369155764579773,
+      "learning_rate": 0.0005980804195804195,
+      "loss": 5.1734,
+      "step": 650
+    },
+    {
+      "epoch": 0.2038557865921137,
+      "grad_norm": 0.48178285360336304,
+      "learning_rate": 0.0005979055944055943,
+      "loss": 5.1075,
+      "step": 700
+    },
+    {
+      "epoch": 0.2184169142058361,
+      "grad_norm": 0.44058653712272644,
+      "learning_rate": 0.0005977307692307691,
+      "loss": 5.0671,
+      "step": 750
+    },
+    {
+      "epoch": 0.2329780418195585,
+      "grad_norm": 0.45520836114883423,
+      "learning_rate": 0.000597555944055944,
+      "loss": 5.0176,
+      "step": 800
+    },
+    {
+      "epoch": 0.24753916943328091,
+      "grad_norm": 0.4496179521083832,
+      "learning_rate": 0.0005973811188811188,
+      "loss": 4.972,
+      "step": 850
+    },
+    {
+      "epoch": 0.2621002970470033,
+      "grad_norm": 0.5189023613929749,
+      "learning_rate": 0.0005972062937062936,
+      "loss": 4.9247,
+      "step": 900
+    },
+    {
+      "epoch": 0.27666142466072574,
+      "grad_norm": 0.542473554611206,
+      "learning_rate": 0.0005970314685314685,
+      "loss": 4.8671,
+      "step": 950
+    },
+    {
+      "epoch": 0.29122255227444815,
+      "grad_norm": 0.45608243346214294,
+      "learning_rate": 0.0005968566433566433,
+      "loss": 4.8193,
+      "step": 1000
+    },
+    {
+      "epoch": 0.29122255227444815,
+      "eval_accuracy": 0.2555685292424054,
+      "eval_loss": 4.740924835205078,
+      "eval_runtime": 182.37,
+      "eval_samples_per_second": 91.276,
+      "eval_steps_per_second": 5.708,
+      "step": 1000
+    },
+    {
+      "epoch": 0.30578367988817057,
+      "grad_norm": 0.4338395893573761,
+      "learning_rate": 0.0005966818181818181,
+      "loss": 4.7728,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3203448075018929,
+      "grad_norm": 0.5276835560798645,
+      "learning_rate": 0.0005965069930069929,
+      "loss": 4.7444,
+      "step": 1100
+    },
+    {
+      "epoch": 0.33490593511561534,
+      "grad_norm": 0.44655585289001465,
+      "learning_rate": 0.0005963321678321677,
+      "loss": 4.6967,
+      "step": 1150
+    },
+    {
+      "epoch": 0.34946706272933775,
+      "grad_norm": 0.38414475321769714,
+      "learning_rate": 0.0005961573426573425,
+      "loss": 4.6614,
+      "step": 1200
+    },
+    {
+      "epoch": 0.36402819034306017,
+      "grad_norm": 0.4639468193054199,
+      "learning_rate": 0.0005959825174825174,
+      "loss": 4.6234,
+      "step": 1250
+    },
+    {
+      "epoch": 0.3785893179567826,
+      "grad_norm": 0.4747146964073181,
+      "learning_rate": 0.0005958076923076922,
+      "loss": 4.6,
+      "step": 1300
+    },
+    {
+      "epoch": 0.393150445570505,
+      "grad_norm": 0.5148531198501587,
+      "learning_rate": 0.000595632867132867,
+      "loss": 4.5744,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4077115731842274,
+      "grad_norm": 0.4540764391422272,
+      "learning_rate": 0.0005954580419580418,
+      "loss": 4.5458,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4222727007979498,
+      "grad_norm": 0.4469960033893585,
+      "learning_rate": 0.0005952832167832168,
+      "loss": 4.5213,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4368338284116722,
+      "grad_norm": 0.4068995416164398,
+      "learning_rate": 0.0005951083916083916,
+      "loss": 4.4989,
+      "step": 1500
+    },
+    {
+      "epoch": 0.4513949560253946,
+      "grad_norm": 0.36514100432395935,
+      "learning_rate": 0.0005949335664335664,
+      "loss": 4.4959,
+      "step": 1550
+    },
+    {
+      "epoch": 0.465956083639117,
+      "grad_norm": 0.41950780153274536,
+      "learning_rate": 0.0005947587412587413,
+      "loss": 4.4468,
+      "step": 1600
+    },
+    {
+      "epoch": 0.4805172112528394,
+      "grad_norm": 0.42646339535713196,
+      "learning_rate": 0.0005945839160839161,
+      "loss": 4.4366,
+      "step": 1650
+    },
+    {
+      "epoch": 0.49507833886656183,
+      "grad_norm": 0.3759300112724304,
+      "learning_rate": 0.0005944090909090909,
+      "loss": 4.425,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5096394664802842,
+      "grad_norm": 0.40312129259109497,
+      "learning_rate": 0.0005942342657342657,
+      "loss": 4.3986,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5242005940940067,
+      "grad_norm": 0.3916018009185791,
+      "learning_rate": 0.0005940594405594406,
+      "loss": 4.3877,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5387617217077291,
+      "grad_norm": 0.36997199058532715,
+      "learning_rate": 0.0005938846153846153,
+      "loss": 4.3737,
+      "step": 1850
+    },
+    {
+      "epoch": 0.5533228493214515,
+      "grad_norm": 0.4766369163990021,
+      "learning_rate": 0.0005937097902097902,
+      "loss": 4.3561,
+      "step": 1900
+    },
+    {
+      "epoch": 0.5678839769351739,
+      "grad_norm": 0.4964778423309326,
+      "learning_rate": 0.000593534965034965,
+      "loss": 4.3618,
+      "step": 1950
+    },
+    {
+      "epoch": 0.5824451045488963,
+      "grad_norm": 0.43863603472709656,
+      "learning_rate": 0.0005933601398601398,
+      "loss": 4.3232,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5824451045488963,
+      "eval_accuracy": 0.29954776016193546,
+      "eval_loss": 4.281495094299316,
+      "eval_runtime": 182.5417,
+      "eval_samples_per_second": 91.19,
+      "eval_steps_per_second": 5.703,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5970062321626187,
+      "grad_norm": 0.3895237147808075,
+      "learning_rate": 0.0005931853146853146,
+      "loss": 4.3153,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6115673597763411,
+      "grad_norm": 0.35456663370132446,
+      "learning_rate": 0.0005930104895104895,
+      "loss": 4.3017,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6261284873900634,
+      "grad_norm": 0.36657285690307617,
+      "learning_rate": 0.0005928356643356643,
+      "loss": 4.3021,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6406896150037859,
+      "grad_norm": 0.3920918107032776,
+      "learning_rate": 0.0005926608391608391,
+      "loss": 4.2813,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6552507426175083,
+      "grad_norm": 0.3744465708732605,
+      "learning_rate": 0.000592486013986014,
+      "loss": 4.2666,
+      "step": 2250
+    },
+    {
+      "epoch": 0.6698118702312307,
+      "grad_norm": 0.38081395626068115,
+      "learning_rate": 0.0005923111888111888,
+      "loss": 4.2704,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6843729978449531,
+      "grad_norm": 0.36270445585250854,
+      "learning_rate": 0.0005921363636363636,
+      "loss": 4.24,
+      "step": 2350
+    },
+    {
+      "epoch": 0.6989341254586755,
+      "grad_norm": 0.40953528881073,
+      "learning_rate": 0.0005919615384615384,
+      "loss": 4.2401,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7134952530723979,
+      "grad_norm": 0.36897608637809753,
+      "learning_rate": 0.0005917867132867133,
+      "loss": 4.2372,
+      "step": 2450
+    },
+    {
+      "epoch": 0.7280563806861203,
+      "grad_norm": 0.35832679271698,
+      "learning_rate": 0.0005916118881118881,
+      "loss": 4.2331,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7426175082998427,
+      "grad_norm": 0.38073498010635376,
+      "learning_rate": 0.0005914370629370629,
+      "loss": 4.2068,
+      "step": 2550
+    },
+    {
+      "epoch": 0.7571786359135652,
+      "grad_norm": 0.4003191888332367,
+      "learning_rate": 0.0005912622377622377,
+      "loss": 4.2046,
+      "step": 2600
+    },
+    {
+      "epoch": 0.7717397635272876,
+      "grad_norm": 0.3635654151439667,
+      "learning_rate": 0.0005910874125874125,
+      "loss": 4.1995,
+      "step": 2650
+    },
+    {
+      "epoch": 0.78630089114101,
+      "grad_norm": 0.35873937606811523,
+      "learning_rate": 0.0005909125874125873,
+      "loss": 4.2033,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8008620187547324,
+      "grad_norm": 0.3539746105670929,
+      "learning_rate": 0.0005907377622377622,
+      "loss": 4.1872,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8154231463684548,
+      "grad_norm": 0.37548962235450745,
+      "learning_rate": 0.000590562937062937,
+      "loss": 4.168,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8299842739821772,
+      "grad_norm": 0.3538251221179962,
+      "learning_rate": 0.0005903881118881118,
+      "loss": 4.179,
+      "step": 2850
+    },
+    {
+      "epoch": 0.8445454015958996,
+      "grad_norm": 0.3582804501056671,
+      "learning_rate": 0.0005902132867132867,
+      "loss": 4.1529,
+      "step": 2900
+    },
+    {
+      "epoch": 0.8591065292096219,
+      "grad_norm": 0.3666636049747467,
+      "learning_rate": 0.0005900384615384615,
+      "loss": 4.1502,
+      "step": 2950
+    },
+    {
+      "epoch": 0.8736676568233444,
+      "grad_norm": 0.37483951449394226,
+      "learning_rate": 0.0005898636363636363,
+      "loss": 4.1292,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8736676568233444,
+      "eval_accuracy": 0.31617228847136397,
+      "eval_loss": 4.089555263519287,
+      "eval_runtime": 182.8843,
+      "eval_samples_per_second": 91.019,
+      "eval_steps_per_second": 5.692,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8882287844370668,
+      "grad_norm": 0.3864665925502777,
+      "learning_rate": 0.0005896888111888111,
+      "loss": 4.1193,
+      "step": 3050
+    },
+    {
+      "epoch": 0.9027899120507892,
+      "grad_norm": 0.347319632768631,
+      "learning_rate": 0.000589513986013986,
+      "loss": 4.1179,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9173510396645116,
+      "grad_norm": 0.3676200807094574,
+      "learning_rate": 0.0005893391608391608,
+      "loss": 4.1154,
+      "step": 3150
+    },
+    {
+      "epoch": 0.931912167278234,
+      "grad_norm": 0.3425697088241577,
+      "learning_rate": 0.0005891643356643356,
+      "loss": 4.1291,
+      "step": 3200
+    },
+    {
+      "epoch": 0.9464732948919564,
+      "grad_norm": 0.3959852457046509,
+      "learning_rate": 0.0005889895104895104,
+      "loss": 4.1053,
+      "step": 3250
+    },
+    {
+      "epoch": 0.9610344225056788,
+      "grad_norm": 0.3836718499660492,
+      "learning_rate": 0.0005888146853146853,
+      "loss": 4.1068,
+      "step": 3300
+    },
+    {
+      "epoch": 0.9755955501194012,
+      "grad_norm": 0.4188532531261444,
+      "learning_rate": 0.00058863986013986,
+      "loss": 4.0989,
+      "step": 3350
+    },
+    {
+      "epoch": 0.9901566777331237,
+      "grad_norm": 0.33719179034233093,
+      "learning_rate": 0.0005884650349650349,
+      "loss": 4.0963,
+      "step": 3400
+    },
+    {
+      "epoch": 1.004659560836391,
+      "grad_norm": 0.36592167615890503,
+      "learning_rate": 0.0005882902097902097,
+      "loss": 4.0577,
+      "step": 3450
+    },
+    {
+      "epoch": 1.0192206884501136,
+      "grad_norm": 0.3381110727787018,
+      "learning_rate": 0.0005881153846153845,
+      "loss": 4.0016,
+      "step": 3500
+    },
+    {
+      "epoch": 1.033781816063836,
+      "grad_norm": 0.35888952016830444,
+      "learning_rate": 0.0005879405594405594,
+      "loss": 4.0032,
+      "step": 3550
+    },
+    {
+      "epoch": 1.0483429436775584,
+      "grad_norm": 0.35463494062423706,
+      "learning_rate": 0.0005877657342657342,
+      "loss": 4.0018,
+      "step": 3600
+    },
+    {
+      "epoch": 1.0629040712912807,
+      "grad_norm": 0.3339248299598694,
+      "learning_rate": 0.000587590909090909,
+      "loss": 4.0116,
+      "step": 3650
+    },
+    {
+      "epoch": 1.0774651989050033,
+      "grad_norm": 0.37050846219062805,
+      "learning_rate": 0.0005874160839160838,
+      "loss": 4.0031,
+      "step": 3700
+    },
+    {
+      "epoch": 1.0920263265187256,
+      "grad_norm": 0.37132593989372253,
+      "learning_rate": 0.0005872412587412587,
+      "loss": 3.996,
+      "step": 3750
+    },
+    {
+      "epoch": 1.106587454132448,
+      "grad_norm": 0.3583703339099884,
+      "learning_rate": 0.0005870664335664335,
+      "loss": 3.9926,
+      "step": 3800
+    },
+    {
+      "epoch": 1.1211485817461704,
+      "grad_norm": 0.32278576493263245,
+      "learning_rate": 0.0005868916083916083,
+      "loss": 3.9928,
+      "step": 3850
+    },
+    {
+      "epoch": 1.135709709359893,
+      "grad_norm": 0.46608036756515503,
+      "learning_rate": 0.0005867167832167831,
+      "loss": 3.9823,
+      "step": 3900
+    },
+    {
+      "epoch": 1.1502708369736152,
+      "grad_norm": 0.3258165717124939,
+      "learning_rate": 0.000586541958041958,
+      "loss": 3.99,
+      "step": 3950
+    },
+    {
+      "epoch": 1.1648319645873377,
+      "grad_norm": 0.36865976452827454,
+      "learning_rate": 0.0005863671328671328,
+      "loss": 3.9822,
+      "step": 4000
+    },
+    {
+      "epoch": 1.1648319645873377,
+      "eval_accuracy": 0.3258375806744003,
+      "eval_loss": 3.9846527576446533,
+      "eval_runtime": 182.7722,
+      "eval_samples_per_second": 91.075,
+      "eval_steps_per_second": 5.696,
+      "step": 4000
+    },
+    {
+      "epoch": 1.17939309220106,
+      "grad_norm": 0.3421526551246643,
+      "learning_rate": 0.0005861923076923076,
+      "loss": 3.995,
+      "step": 4050
+    },
+    {
+      "epoch": 1.1939542198147826,
+      "grad_norm": 0.34738972783088684,
+      "learning_rate": 0.0005860174825174824,
+      "loss": 3.9821,
+      "step": 4100
+    },
+    {
+      "epoch": 1.2085153474285049,
+      "grad_norm": 0.33670172095298767,
+      "learning_rate": 0.0005858426573426573,
+      "loss": 3.9627,
+      "step": 4150
+    },
+    {
+      "epoch": 1.2230764750422272,
+      "grad_norm": 0.3570351004600525,
+      "learning_rate": 0.000585667832167832,
+      "loss": 3.9683,
+      "step": 4200
+    },
+    {
+      "epoch": 1.2376376026559497,
+      "grad_norm": 0.34402894973754883,
+      "learning_rate": 0.000585493006993007,
+      "loss": 3.9625,
+      "step": 4250
+    },
+    {
+      "epoch": 1.2521987302696722,
+      "grad_norm": 0.3514033555984497,
+      "learning_rate": 0.0005853181818181817,
+      "loss": 3.9698,
+      "step": 4300
+    },
+    {
+      "epoch": 1.2667598578833945,
+      "grad_norm": 0.34714460372924805,
+      "learning_rate": 0.0005851433566433565,
+      "loss": 3.9567,
+      "step": 4350
+    },
+    {
+      "epoch": 1.2813209854971168,
+      "grad_norm": 0.34446805715560913,
+      "learning_rate": 0.0005849685314685315,
+      "loss": 3.9564,
+      "step": 4400
+    },
+    {
+      "epoch": 1.2958821131108393,
+      "grad_norm": 0.3477458655834198,
+      "learning_rate": 0.0005847937062937063,
+      "loss": 3.9627,
+      "step": 4450
+    },
+    {
+      "epoch": 1.3104432407245616,
+      "grad_norm": 0.32496753334999084,
+      "learning_rate": 0.0005846188811188811,
+      "loss": 3.9557,
+      "step": 4500
+    },
+    {
+      "epoch": 1.3250043683382842,
+      "grad_norm": 0.37617602944374084,
+      "learning_rate": 0.0005844440559440559,
+      "loss": 3.9518,
+      "step": 4550
+    },
+    {
+      "epoch": 1.3395654959520065,
+      "grad_norm": 0.35174936056137085,
+      "learning_rate": 0.0005842692307692308,
+      "loss": 3.9558,
+      "step": 4600
+    },
+    {
+      "epoch": 1.354126623565729,
+      "grad_norm": 0.36389508843421936,
+      "learning_rate": 0.0005840944055944056,
+      "loss": 3.9488,
+      "step": 4650
+    },
+    {
+      "epoch": 1.3686877511794513,
+      "grad_norm": 0.3404442071914673,
+      "learning_rate": 0.0005839195804195804,
+      "loss": 3.9436,
+      "step": 4700
+    },
+    {
+      "epoch": 1.3832488787931738,
+      "grad_norm": 0.3432350158691406,
+      "learning_rate": 0.0005837447552447552,
+      "loss": 3.934,
+      "step": 4750
+    },
+    {
+      "epoch": 1.3978100064068961,
+      "grad_norm": 0.3505391776561737,
+      "learning_rate": 0.0005835699300699301,
+      "loss": 3.9345,
+      "step": 4800
+    },
+    {
+      "epoch": 1.4123711340206184,
+      "grad_norm": 0.3216480016708374,
+      "learning_rate": 0.0005833951048951048,
+      "loss": 3.9266,
+      "step": 4850
+    },
+    {
+      "epoch": 1.426932261634341,
+      "grad_norm": 0.3275972902774811,
+      "learning_rate": 0.0005832202797202797,
+      "loss": 3.9393,
+      "step": 4900
+    },
+    {
+      "epoch": 1.4414933892480635,
+      "grad_norm": 0.35102906823158264,
+      "learning_rate": 0.0005830454545454546,
+      "loss": 3.9359,
+      "step": 4950
+    },
+    {
+      "epoch": 1.4560545168617858,
+      "grad_norm": 0.3343009650707245,
+      "learning_rate": 0.0005828706293706293,
+      "loss": 3.9236,
+      "step": 5000
+    },
+    {
+      "epoch": 1.4560545168617858,
+      "eval_accuracy": 0.33192285635753893,
+      "eval_loss": 3.9100613594055176,
+      "eval_runtime": 184.3964,
+      "eval_samples_per_second": 90.273,
+      "eval_steps_per_second": 5.645,
+      "step": 5000
+    },
+    {
+      "epoch": 1.470615644475508,
+      "grad_norm": 0.32075586915016174,
+      "learning_rate": 0.0005826958041958042,
+      "loss": 3.923,
+      "step": 5050
+    },
+    {
+      "epoch": 1.4851767720892306,
+      "grad_norm": 0.3327905237674713,
+      "learning_rate": 0.000582520979020979,
+      "loss": 3.9322,
+      "step": 5100
+    },
+    {
+      "epoch": 1.4997378997029531,
+      "grad_norm": 0.34814631938934326,
+      "learning_rate": 0.0005823461538461538,
+      "loss": 3.9174,
+      "step": 5150
+    },
+    {
+      "epoch": 1.5142990273166754,
+      "grad_norm": 0.3348972797393799,
+      "learning_rate": 0.0005821713286713286,
+      "loss": 3.9043,
+      "step": 5200
+    },
+    {
+      "epoch": 1.5288601549303977,
+      "grad_norm": 0.3347136676311493,
+      "learning_rate": 0.0005819965034965035,
+      "loss": 3.9046,
+      "step": 5250
+    },
+    {
+      "epoch": 1.5434212825441203,
+      "grad_norm": 0.336887001991272,
+      "learning_rate": 0.0005818216783216783,
+      "loss": 3.915,
+      "step": 5300
+    },
+    {
+      "epoch": 1.5579824101578428,
+      "grad_norm": 0.3282018005847931,
+      "learning_rate": 0.0005816468531468531,
+      "loss": 3.9147,
+      "step": 5350
+    },
+    {
+      "epoch": 1.572543537771565,
+      "grad_norm": 0.34454506635665894,
+      "learning_rate": 0.0005814720279720279,
+      "loss": 3.9073,
+      "step": 5400
+    },
+    {
+      "epoch": 1.5871046653852874,
+      "grad_norm": 0.3495652377605438,
+      "learning_rate": 0.0005812972027972028,
+      "loss": 3.8947,
+      "step": 5450
+    },
+    {
+      "epoch": 1.6016657929990097,
+      "grad_norm": 0.33489686250686646,
+      "learning_rate": 0.0005811223776223776,
+      "loss": 3.9016,
+      "step": 5500
+    },
+    {
+      "epoch": 1.6162269206127322,
+      "grad_norm": 0.3139170706272125,
+      "learning_rate": 0.0005809475524475524,
+      "loss": 3.8813,
+      "step": 5550
+    },
+    {
+      "epoch": 1.6307880482264547,
+      "grad_norm": 0.3473421335220337,
+      "learning_rate": 0.0005807727272727272,
+      "loss": 3.8912,
+      "step": 5600
+    },
+    {
+      "epoch": 1.645349175840177,
+      "grad_norm": 0.3421488106250763,
+      "learning_rate": 0.0005805979020979021,
+      "loss": 3.8781,
+      "step": 5650
+    },
+    {
+      "epoch": 1.6599103034538993,
+      "grad_norm": 0.3326078951358795,
+      "learning_rate": 0.0005804230769230769,
+      "loss": 3.8902,
+      "step": 5700
+    },
+    {
+      "epoch": 1.6744714310676219,
+      "grad_norm": 0.34903430938720703,
+      "learning_rate": 0.0005802482517482517,
+      "loss": 3.8875,
+      "step": 5750
+    },
+    {
+      "epoch": 1.6890325586813444,
+      "grad_norm": 0.337380051612854,
+      "learning_rate": 0.0005800734265734265,
+      "loss": 3.8923,
+      "step": 5800
+    },
+    {
+      "epoch": 1.7035936862950667,
+      "grad_norm": 0.36713698506355286,
+      "learning_rate": 0.0005798986013986013,
+      "loss": 3.8702,
+      "step": 5850
+    },
+    {
+      "epoch": 1.718154813908789,
+      "grad_norm": 0.34020107984542847,
+      "learning_rate": 0.0005797237762237762,
+      "loss": 3.8835,
+      "step": 5900
+    },
+    {
+      "epoch": 1.7327159415225115,
+      "grad_norm": 0.325308233499527,
+      "learning_rate": 0.000579548951048951,
+      "loss": 3.8765,
+      "step": 5950
+    },
+    {
+      "epoch": 1.747277069136234,
+      "grad_norm": 0.3312442898750305,
+      "learning_rate": 0.0005793741258741258,
+      "loss": 3.8783,
+      "step": 6000
+    },
+    {
+      "epoch": 1.747277069136234,
+      "eval_accuracy": 0.3375568091909506,
+      "eval_loss": 3.8538968563079834,
+      "eval_runtime": 182.6837,
+      "eval_samples_per_second": 91.119,
+      "eval_steps_per_second": 5.698,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7618381967499563,
+      "grad_norm": 0.3400241732597351,
+      "learning_rate": 0.0005791993006993006,
+      "loss": 3.8657,
+      "step": 6050
+    },
+    {
+      "epoch": 1.7763993243636786,
+      "grad_norm": 0.3185133934020996,
+      "learning_rate": 0.0005790244755244755,
+      "loss": 3.8761,
+      "step": 6100
+    },
+    {
+      "epoch": 1.7909604519774012,
+      "grad_norm": 0.3425745964050293,
+      "learning_rate": 0.0005788496503496503,
+      "loss": 3.8653,
+      "step": 6150
+    },
+    {
+      "epoch": 1.8055215795911237,
+      "grad_norm": 0.31458574533462524,
+      "learning_rate": 0.0005786748251748251,
+      "loss": 3.8693,
+      "step": 6200
+    },
+    {
+      "epoch": 1.820082707204846,
+      "grad_norm": 0.33563944697380066,
+      "learning_rate": 0.0005784999999999999,
+      "loss": 3.8654,
+      "step": 6250
+    },
+    {
+      "epoch": 1.8346438348185683,
+      "grad_norm": 0.3224489092826843,
+      "learning_rate": 0.0005783251748251748,
+      "loss": 3.8461,
+      "step": 6300
+    },
+    {
+      "epoch": 1.8492049624322906,
+      "grad_norm": 0.3258609175682068,
+      "learning_rate": 0.0005781503496503496,
+      "loss": 3.857,
+      "step": 6350
+    },
+    {
+      "epoch": 1.8637660900460131,
+      "grad_norm": 0.31683316826820374,
+      "learning_rate": 0.0005779755244755244,
+      "loss": 3.8613,
+      "step": 6400
+    },
+    {
+      "epoch": 1.8783272176597356,
+      "grad_norm": 0.3589027523994446,
+      "learning_rate": 0.0005778006993006993,
+      "loss": 3.8568,
+      "step": 6450
+    },
+    {
+      "epoch": 1.892888345273458,
+      "grad_norm": 0.3233237862586975,
+      "learning_rate": 0.000577625874125874,
+      "loss": 3.8619,
+      "step": 6500
+    },
+    {
+      "epoch": 1.9074494728871803,
+      "grad_norm": 0.3178718388080597,
+      "learning_rate": 0.0005774510489510489,
+      "loss": 3.8528,
+      "step": 6550
+    },
+    {
+      "epoch": 1.9220106005009028,
+      "grad_norm": 0.3323863446712494,
+      "learning_rate": 0.0005772762237762237,
+      "loss": 3.8432,
+      "step": 6600
+    },
+    {
+      "epoch": 1.9365717281146253,
+      "grad_norm": 0.3357718586921692,
+      "learning_rate": 0.0005771013986013985,
+      "loss": 3.8408,
+      "step": 6650
+    },
+    {
+      "epoch": 1.9511328557283476,
+      "grad_norm": 0.3276062607765198,
+      "learning_rate": 0.0005769265734265733,
+      "loss": 3.8554,
+      "step": 6700
+    },
+    {
+      "epoch": 1.96569398334207,
+      "grad_norm": 0.35431572794914246,
+      "learning_rate": 0.0005767517482517482,
+      "loss": 3.8394,
+      "step": 6750
+    },
+    {
+      "epoch": 1.9802551109557924,
+      "grad_norm": 0.3411652147769928,
+      "learning_rate": 0.000576576923076923,
+      "loss": 3.8523,
+      "step": 6800
+    },
+    {
+      "epoch": 1.994816238569515,
+      "grad_norm": 0.32246044278144836,
+      "learning_rate": 0.0005764020979020978,
+      "loss": 3.8446,
+      "step": 6850
+    },
+    {
+      "epoch": 2.009319121672782,
+      "grad_norm": 0.3211682438850403,
+      "learning_rate": 0.0005762272727272726,
+      "loss": 3.7718,
+      "step": 6900
+    },
+    {
+      "epoch": 2.023880249286505,
+      "grad_norm": 0.327195405960083,
+      "learning_rate": 0.0005760524475524475,
+      "loss": 3.7531,
+      "step": 6950
+    },
+    {
+      "epoch": 2.038441376900227,
+      "grad_norm": 0.327593058347702,
+      "learning_rate": 0.0005758776223776223,
+      "loss": 3.7409,
+      "step": 7000
+    },
+    {
+      "epoch": 2.038441376900227,
+      "eval_accuracy": 0.3418543103036807,
+      "eval_loss": 3.807173013687134,
+      "eval_runtime": 182.7201,
+      "eval_samples_per_second": 91.101,
+      "eval_steps_per_second": 5.697,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0530025045139495,
+      "grad_norm": 0.31732338666915894,
+      "learning_rate": 0.0005757027972027971,
+      "loss": 3.7579,
+      "step": 7050
+    },
+    {
+      "epoch": 2.067563632127672,
+      "grad_norm": 0.31844356656074524,
+      "learning_rate": 0.000575527972027972,
+      "loss": 3.7387,
+      "step": 7100
+    },
+    {
+      "epoch": 2.0821247597413945,
+      "grad_norm": 0.3230411410331726,
+      "learning_rate": 0.0005753531468531468,
+      "loss": 3.751,
+      "step": 7150
+    },
+    {
+      "epoch": 2.096685887355117,
+      "grad_norm": 0.32469573616981506,
+      "learning_rate": 0.0005751783216783216,
+      "loss": 3.7505,
+      "step": 7200
+    },
+    {
+      "epoch": 2.111247014968839,
+      "grad_norm": 0.32758161425590515,
+      "learning_rate": 0.0005750034965034964,
+      "loss": 3.749,
+      "step": 7250
+    },
+    {
+      "epoch": 2.1258081425825615,
+      "grad_norm": 0.32330748438835144,
+      "learning_rate": 0.0005748286713286712,
+      "loss": 3.7479,
+      "step": 7300
+    },
+    {
+      "epoch": 2.140369270196284,
+      "grad_norm": 0.3409372568130493,
+      "learning_rate": 0.000574653846153846,
+      "loss": 3.7407,
+      "step": 7350
+    },
+    {
+      "epoch": 2.1549303978100065,
+      "grad_norm": 0.3466344177722931,
+      "learning_rate": 0.000574479020979021,
+      "loss": 3.7507,
+      "step": 7400
+    },
+    {
+      "epoch": 2.169491525423729,
+      "grad_norm": 0.3374614119529724,
+      "learning_rate": 0.0005743041958041958,
+      "loss": 3.752,
+      "step": 7450
+    },
+    {
+      "epoch": 2.184052653037451,
+      "grad_norm": 0.31741416454315186,
+      "learning_rate": 0.0005741293706293706,
+      "loss": 3.7516,
+      "step": 7500
+    },
+    {
+      "epoch": 2.198613780651174,
+      "grad_norm": 0.3337715268135071,
+      "learning_rate": 0.0005739545454545454,
+      "loss": 3.7466,
+      "step": 7550
+    },
+    {
+      "epoch": 2.213174908264896,
+      "grad_norm": 0.3211740255355835,
+      "learning_rate": 0.0005737797202797203,
+      "loss": 3.7529,
+      "step": 7600
+    },
+    {
+      "epoch": 2.2277360358786185,
+      "grad_norm": 0.3633030652999878,
+      "learning_rate": 0.0005736048951048951,
+      "loss": 3.7411,
+      "step": 7650
+    },
+    {
+      "epoch": 2.2422971634923408,
+      "grad_norm": 0.3291577100753784,
+      "learning_rate": 0.0005734300699300699,
+      "loss": 3.7522,
+      "step": 7700
+    },
+    {
+      "epoch": 2.256858291106063,
+      "grad_norm": 0.32250499725341797,
+      "learning_rate": 0.0005732552447552448,
+      "loss": 3.7576,
+      "step": 7750
+    },
+    {
+      "epoch": 2.271419418719786,
+      "grad_norm": 0.30450233817100525,
+      "learning_rate": 0.0005730804195804196,
+      "loss": 3.7453,
+      "step": 7800
+    },
+    {
+      "epoch": 2.285980546333508,
+      "grad_norm": 0.30964726209640503,
+      "learning_rate": 0.0005729055944055944,
+      "loss": 3.755,
+      "step": 7850
+    },
+    {
+      "epoch": 2.3005416739472304,
+      "grad_norm": 0.31292903423309326,
+      "learning_rate": 0.0005727307692307692,
+      "loss": 3.7539,
+      "step": 7900
+    },
+    {
+      "epoch": 2.3151028015609527,
+      "grad_norm": 0.3473982512950897,
+      "learning_rate": 0.0005725559440559441,
+      "loss": 3.7509,
+      "step": 7950
+    },
+    {
+      "epoch": 2.3296639291746755,
+      "grad_norm": 0.35157695412635803,
+      "learning_rate": 0.0005723811188811188,
+      "loss": 3.7556,
+      "step": 8000
+    },
+    {
+      "epoch": 2.3296639291746755,
+      "eval_accuracy": 0.3448868377610154,
+      "eval_loss": 3.778634786605835,
+      "eval_runtime": 184.554,
+      "eval_samples_per_second": 90.196,
+      "eval_steps_per_second": 5.641,
+      "step": 8000
+    },
+    {
+      "epoch": 2.3442250567883978,
+      "grad_norm": 0.3254748582839966,
+      "learning_rate": 0.0005722062937062937,
+      "loss": 3.7366,
+      "step": 8050
+    },
+    {
+      "epoch": 2.35878618440212,
+      "grad_norm": 0.33637189865112305,
+      "learning_rate": 0.0005720314685314685,
+      "loss": 3.749,
+      "step": 8100
+    },
+    {
+      "epoch": 2.3733473120158424,
+      "grad_norm": 0.3259914517402649,
+      "learning_rate": 0.0005718566433566433,
+      "loss": 3.7439,
+      "step": 8150
+    },
+    {
+      "epoch": 2.387908439629565,
+      "grad_norm": 0.31481459736824036,
+      "learning_rate": 0.0005716818181818181,
+      "loss": 3.7376,
+      "step": 8200
+    },
+    {
+      "epoch": 2.4024695672432874,
+      "grad_norm": 0.3136726915836334,
+      "learning_rate": 0.000571506993006993,
+      "loss": 3.7284,
+      "step": 8250
+    },
+    {
+      "epoch": 2.4170306948570097,
+      "grad_norm": 0.32896652817726135,
+      "learning_rate": 0.0005713321678321678,
+      "loss": 3.7478,
+      "step": 8300
+    },
+    {
+      "epoch": 2.431591822470732,
+      "grad_norm": 0.3367721140384674,
+      "learning_rate": 0.0005711573426573426,
+      "loss": 3.7418,
+      "step": 8350
+    },
+    {
+      "epoch": 2.4461529500844543,
+      "grad_norm": 0.32438716292381287,
+      "learning_rate": 0.0005709825174825175,
+      "loss": 3.752,
+      "step": 8400
+    },
+    {
+      "epoch": 2.460714077698177,
+      "grad_norm": 0.3197824954986572,
+      "learning_rate": 0.0005708076923076923,
+      "loss": 3.7425,
+      "step": 8450
+    },
+    {
+      "epoch": 2.4752752053118994,
+      "grad_norm": 0.3408988416194916,
+      "learning_rate": 0.0005706328671328671,
+      "loss": 3.7424,
+      "step": 8500
+    },
+    {
+      "epoch": 2.4898363329256217,
+      "grad_norm": 0.3419415354728699,
+      "learning_rate": 0.0005704580419580419,
+      "loss": 3.7546,
+      "step": 8550
+    },
+    {
+      "epoch": 2.5043974605393444,
+      "grad_norm": 0.3166530132293701,
+      "learning_rate": 0.0005702832167832168,
+      "loss": 3.7339,
+      "step": 8600
+    },
+    {
+      "epoch": 2.5189585881530667,
+      "grad_norm": 0.3352281451225281,
+      "learning_rate": 0.0005701083916083916,
+      "loss": 3.7278,
+      "step": 8650
+    },
+    {
+      "epoch": 2.533519715766789,
+      "grad_norm": 0.31808385252952576,
+      "learning_rate": 0.0005699335664335664,
+      "loss": 3.7421,
+      "step": 8700
+    },
+    {
+      "epoch": 2.5480808433805113,
+      "grad_norm": 0.3182724714279175,
+      "learning_rate": 0.0005697587412587412,
+      "loss": 3.7366,
+      "step": 8750
+    },
+    {
+      "epoch": 2.5626419709942336,
+      "grad_norm": 0.3110598623752594,
+      "learning_rate": 0.000569583916083916,
+      "loss": 3.7345,
+      "step": 8800
+    },
+    {
+      "epoch": 2.5772030986079564,
+      "grad_norm": 0.3083634674549103,
+      "learning_rate": 0.0005694090909090908,
+      "loss": 3.7335,
+      "step": 8850
+    },
+    {
+      "epoch": 2.5917642262216787,
+      "grad_norm": 0.31325507164001465,
+      "learning_rate": 0.0005692342657342657,
+      "loss": 3.735,
+      "step": 8900
+    },
+    {
+      "epoch": 2.606325353835401,
+      "grad_norm": 0.325335830450058,
+      "learning_rate": 0.0005690594405594405,
+      "loss": 3.7352,
+      "step": 8950
+    },
+    {
+      "epoch": 2.6208864814491233,
+      "grad_norm": 0.3095828592777252,
+      "learning_rate": 0.0005688846153846153,
+      "loss": 3.7318,
+      "step": 9000
+    },
+    {
+      "epoch": 2.6208864814491233,
+      "eval_accuracy": 0.34765026441006025,
+      "eval_loss": 3.7488150596618652,
+      "eval_runtime": 181.151,
+      "eval_samples_per_second": 91.89,
+      "eval_steps_per_second": 5.747,
+      "step": 9000
+    },
+    {
+      "epoch": 2.6354476090628456,
+      "grad_norm": 0.32213926315307617,
+      "learning_rate": 0.0005687097902097901,
+      "loss": 3.7308,
+      "step": 9050
+    },
+    {
+      "epoch": 2.6500087366765683,
+      "grad_norm": 0.3269072473049164,
+      "learning_rate": 0.000568534965034965,
+      "loss": 3.7258,
+      "step": 9100
+    },
+    {
+      "epoch": 2.6645698642902906,
+      "grad_norm": 0.3074745833873749,
+      "learning_rate": 0.0005683601398601398,
+      "loss": 3.7307,
+      "step": 9150
+    },
+    {
+      "epoch": 2.679130991904013,
+      "grad_norm": 0.3062356412410736,
+      "learning_rate": 0.0005681853146853146,
+      "loss": 3.7292,
+      "step": 9200
+    },
+    {
+      "epoch": 2.6936921195177357,
+      "grad_norm": 0.3062867820262909,
+      "learning_rate": 0.0005680104895104895,
+      "loss": 3.7162,
+      "step": 9250
+    },
+    {
+      "epoch": 2.708253247131458,
+      "grad_norm": 0.3553263545036316,
+      "learning_rate": 0.0005678356643356643,
+      "loss": 3.7239,
+      "step": 9300
+    },
+    {
+      "epoch": 2.7228143747451803,
+      "grad_norm": 0.31543877720832825,
+      "learning_rate": 0.0005676608391608391,
+      "loss": 3.7271,
+      "step": 9350
+    },
+    {
+      "epoch": 2.7373755023589026,
+      "grad_norm": 0.3128660023212433,
+      "learning_rate": 0.0005674860139860139,
+      "loss": 3.7304,
+      "step": 9400
+    },
+    {
+      "epoch": 2.751936629972625,
+      "grad_norm": 0.31978827714920044,
+      "learning_rate": 0.0005673111888111888,
+      "loss": 3.7252,
+      "step": 9450
+    },
+    {
+      "epoch": 2.7664977575863476,
+      "grad_norm": 0.3351915776729584,
+      "learning_rate": 0.0005671363636363635,
+      "loss": 3.7215,
+      "step": 9500
+    },
+    {
+      "epoch": 2.78105888520007,
+      "grad_norm": 0.32556962966918945,
+      "learning_rate": 0.0005669615384615384,
+      "loss": 3.7109,
+      "step": 9550
+    },
+    {
+      "epoch": 2.7956200128137922,
+      "grad_norm": 0.30667567253112793,
+      "learning_rate": 0.0005667867132867132,
+      "loss": 3.7309,
+      "step": 9600
+    },
+    {
+      "epoch": 2.8101811404275145,
+      "grad_norm": 0.2958901524543762,
+      "learning_rate": 0.000566611888111888,
+      "loss": 3.7281,
+      "step": 9650
+    },
+    {
+      "epoch": 2.824742268041237,
+      "grad_norm": 0.3355960249900818,
+      "learning_rate": 0.0005664370629370628,
+      "loss": 3.7114,
+      "step": 9700
+    },
+    {
+      "epoch": 2.8393033956549596,
+      "grad_norm": 0.3073790967464447,
+      "learning_rate": 0.0005662622377622377,
+      "loss": 3.7258,
+      "step": 9750
+    },
+    {
+      "epoch": 2.853864523268682,
+      "grad_norm": 0.30925753712654114,
+      "learning_rate": 0.0005660874125874125,
+      "loss": 3.721,
+      "step": 9800
+    },
+    {
+      "epoch": 2.868425650882404,
+      "grad_norm": 0.3250259459018707,
+      "learning_rate": 0.0005659125874125873,
+      "loss": 3.7338,
+      "step": 9850
+    },
+    {
+      "epoch": 2.882986778496127,
+      "grad_norm": 0.3341335356235504,
+      "learning_rate": 0.0005657377622377622,
+      "loss": 3.7089,
+      "step": 9900
+    },
+    {
+      "epoch": 2.8975479061098492,
+      "grad_norm": 0.325226366519928,
+      "learning_rate": 0.000565562937062937,
+      "loss": 3.7195,
+      "step": 9950
+    },
+    {
+      "epoch": 2.9121090337235715,
+      "grad_norm": 0.32413652539253235,
+      "learning_rate": 0.0005653881118881118,
+      "loss": 3.7152,
+      "step": 10000
+    },
+    {
+      "epoch": 2.9121090337235715,
+      "eval_accuracy": 0.3496235527749125,
+      "eval_loss": 3.7240853309631348,
+      "eval_runtime": 180.9488,
+      "eval_samples_per_second": 91.993,
+      "eval_steps_per_second": 5.753,
+      "step": 10000
+    },
+    {
+      "epoch": 2.926670161337294,
+      "grad_norm": 0.3107230067253113,
+      "learning_rate": 0.0005652132867132866,
+      "loss": 3.7198,
+      "step": 10050
+    },
+    {
+      "epoch": 2.941231288951016,
+      "grad_norm": 0.3085416257381439,
+      "learning_rate": 0.0005650384615384615,
+      "loss": 3.7137,
+      "step": 10100
+    },
+    {
+      "epoch": 2.955792416564739,
+      "grad_norm": 0.3245227634906769,
+      "learning_rate": 0.0005648636363636363,
+      "loss": 3.7077,
+      "step": 10150
+    },
+    {
+      "epoch": 2.970353544178461,
+      "grad_norm": 0.31815105676651,
+      "learning_rate": 0.0005646888111888111,
+      "loss": 3.7133,
+      "step": 10200
+    },
+    {
+      "epoch": 2.9849146717921835,
+      "grad_norm": 0.3235960900783539,
+      "learning_rate": 0.000564513986013986,
+      "loss": 3.7025,
+      "step": 10250
+    },
+    {
+      "epoch": 2.9994757994059063,
+      "grad_norm": 0.34538596868515015,
+      "learning_rate": 0.0005643391608391607,
+      "loss": 3.7129,
+      "step": 10300
+    },
+    {
+      "epoch": 3.0139786825091734,
+      "grad_norm": 0.31785765290260315,
+      "learning_rate": 0.0005641643356643355,
+      "loss": 3.6054,
+      "step": 10350
+    },
+    {
+      "epoch": 3.0285398101228957,
+      "grad_norm": 0.3313532769680023,
+      "learning_rate": 0.0005639895104895105,
+      "loss": 3.6084,
+      "step": 10400
+    },
+    {
+      "epoch": 3.0431009377366185,
+      "grad_norm": 0.3206842839717865,
+      "learning_rate": 0.0005638146853146853,
+      "loss": 3.6094,
+      "step": 10450
+    },
+    {
+      "epoch": 3.057662065350341,
+      "grad_norm": 0.3367646336555481,
+      "learning_rate": 0.0005636398601398601,
+      "loss": 3.6184,
+      "step": 10500
+    },
+    {
+      "epoch": 3.072223192964063,
+      "grad_norm": 0.3168485164642334,
+      "learning_rate": 0.000563465034965035,
+      "loss": 3.6267,
+      "step": 10550
+    },
+    {
+      "epoch": 3.0867843205777854,
+      "grad_norm": 0.3227006196975708,
+      "learning_rate": 0.0005632902097902098,
+      "loss": 3.6165,
+      "step": 10600
+    },
+    {
+      "epoch": 3.101345448191508,
+      "grad_norm": 0.3266565799713135,
+      "learning_rate": 0.0005631153846153846,
+      "loss": 3.6185,
+      "step": 10650
+    },
+    {
+      "epoch": 3.1159065758052304,
+      "grad_norm": 0.3158946633338928,
+      "learning_rate": 0.0005629405594405594,
+      "loss": 3.6104,
+      "step": 10700
+    },
+    {
+      "epoch": 3.1304677034189528,
+      "grad_norm": 0.32722142338752747,
+      "learning_rate": 0.0005627657342657343,
+      "loss": 3.619,
+      "step": 10750
+    },
+    {
+      "epoch": 3.145028831032675,
+      "grad_norm": 0.30216503143310547,
+      "learning_rate": 0.0005625909090909091,
+      "loss": 3.629,
+      "step": 10800
+    },
+    {
+      "epoch": 3.1595899586463974,
+      "grad_norm": 0.3186177909374237,
+      "learning_rate": 0.0005624160839160839,
+      "loss": 3.6195,
+      "step": 10850
+    },
+    {
+      "epoch": 3.17415108626012,
+      "grad_norm": 0.3162807822227478,
+      "learning_rate": 0.0005622412587412587,
+      "loss": 3.6169,
+      "step": 10900
+    },
+    {
+      "epoch": 3.1887122138738424,
+      "grad_norm": 0.33053115010261536,
+      "learning_rate": 0.0005620664335664336,
+      "loss": 3.6338,
+      "step": 10950
+    },
+    {
+      "epoch": 3.2032733414875647,
+      "grad_norm": 0.30891671776771545,
+      "learning_rate": 0.0005618916083916083,
+      "loss": 3.6271,
+      "step": 11000
+    },
+    {
+      "epoch": 3.2032733414875647,
+      "eval_accuracy": 0.3519776264250645,
+      "eval_loss": 3.709869384765625,
+      "eval_runtime": 180.919,
+      "eval_samples_per_second": 92.008,
+      "eval_steps_per_second": 5.754,
+      "step": 11000
+    },
+    {
+      "epoch": 3.217834469101287,
+      "grad_norm": 0.33202117681503296,
+      "learning_rate": 0.0005617167832167832,
+      "loss": 3.6235,
+      "step": 11050
+    },
+    {
+      "epoch": 3.2323955967150098,
+      "grad_norm": 0.3139243423938751,
+      "learning_rate": 0.000561541958041958,
+      "loss": 3.6294,
+      "step": 11100
+    },
+    {
+      "epoch": 3.246956724328732,
+      "grad_norm": 0.3314415216445923,
+      "learning_rate": 0.0005613671328671328,
+      "loss": 3.6461,
+      "step": 11150
+    },
+    {
+      "epoch": 3.2615178519424544,
+      "grad_norm": 0.3135373592376709,
+      "learning_rate": 0.0005611923076923077,
+      "loss": 3.6442,
+      "step": 11200
+    },
+    {
+      "epoch": 3.2760789795561767,
+      "grad_norm": 0.32920390367507935,
+      "learning_rate": 0.0005610174825174825,
+      "loss": 3.6266,
+      "step": 11250
+    },
+    {
+      "epoch": 3.2906401071698994,
+      "grad_norm": 0.3141324520111084,
+      "learning_rate": 0.0005608426573426573,
+      "loss": 3.641,
+      "step": 11300
+    },
+    {
+      "epoch": 3.3052012347836217,
+      "grad_norm": 0.3209163546562195,
+      "learning_rate": 0.0005606678321678321,
+      "loss": 3.6214,
+      "step": 11350
+    },
+    {
+      "epoch": 3.319762362397344,
+      "grad_norm": 0.33475250005722046,
+      "learning_rate": 0.000560493006993007,
+      "loss": 3.6157,
+      "step": 11400
+    },
+    {
+      "epoch": 3.3343234900110663,
+      "grad_norm": 0.33062008023262024,
+      "learning_rate": 0.0005603181818181818,
+      "loss": 3.6323,
+      "step": 11450
+    },
+    {
+      "epoch": 3.3488846176247886,
+      "grad_norm": 0.32883119583129883,
+      "learning_rate": 0.0005601433566433566,
+      "loss": 3.6385,
+      "step": 11500
+    },
+    {
+      "epoch": 3.3634457452385114,
+      "grad_norm": 0.31883203983306885,
+      "learning_rate": 0.0005599685314685314,
+      "loss": 3.6394,
+      "step": 11550
+    },
+    {
+      "epoch": 3.3780068728522337,
+      "grad_norm": 0.30644893646240234,
+      "learning_rate": 0.0005597937062937063,
+      "loss": 3.6358,
+      "step": 11600
+    },
+    {
+      "epoch": 3.392568000465956,
+      "grad_norm": 0.30966854095458984,
+      "learning_rate": 0.0005596188811188811,
+      "loss": 3.6315,
+      "step": 11650
+    },
+    {
+      "epoch": 3.4071291280796787,
+      "grad_norm": 0.33119064569473267,
+      "learning_rate": 0.0005594440559440559,
+      "loss": 3.6342,
+      "step": 11700
+    },
+    {
+      "epoch": 3.421690255693401,
+      "grad_norm": 0.340206116437912,
+      "learning_rate": 0.0005592692307692307,
+      "loss": 3.6359,
+      "step": 11750
+    },
+    {
+      "epoch": 3.4362513833071233,
+      "grad_norm": 0.3317243754863739,
+      "learning_rate": 0.0005590944055944055,
+      "loss": 3.6418,
+      "step": 11800
+    },
+    {
+      "epoch": 3.4508125109208456,
+      "grad_norm": 0.35198959708213806,
+      "learning_rate": 0.0005589195804195803,
+      "loss": 3.6433,
+      "step": 11850
+    },
+    {
+      "epoch": 3.465373638534568,
+      "grad_norm": 0.3379652202129364,
+      "learning_rate": 0.0005587447552447552,
+      "loss": 3.63,
+      "step": 11900
+    },
+    {
+      "epoch": 3.4799347661482907,
+      "grad_norm": 0.3148704171180725,
+      "learning_rate": 0.00055856993006993,
+      "loss": 3.6433,
+      "step": 11950
+    },
+    {
+      "epoch": 3.494495893762013,
+      "grad_norm": 0.32295045256614685,
+      "learning_rate": 0.0005583951048951048,
+      "loss": 3.637,
+      "step": 12000
+    },
+    {
+      "epoch": 3.494495893762013,
+      "eval_accuracy": 0.3538141894775353,
+      "eval_loss": 3.6922500133514404,
+      "eval_runtime": 181.0231,
+      "eval_samples_per_second": 91.955,
+      "eval_steps_per_second": 5.751,
+      "step": 12000
+    },
+    {
+      "epoch": 3.5090570213757353,
+      "grad_norm": 0.33041051030158997,
+      "learning_rate": 0.0005582202797202797,
+      "loss": 3.6393,
+      "step": 12050
+    },
+    {
+      "epoch": 3.523618148989458,
+      "grad_norm": 0.30944448709487915,
+      "learning_rate": 0.0005580454545454545,
+      "loss": 3.636,
+      "step": 12100
+    },
+    {
+      "epoch": 3.53817927660318,
+      "grad_norm": 0.3318426311016083,
+      "learning_rate": 0.0005578706293706293,
+      "loss": 3.6265,
+      "step": 12150
+    },
+    {
+      "epoch": 3.5527404042169026,
+      "grad_norm": 0.31078043580055237,
+      "learning_rate": 0.0005576958041958041,
+      "loss": 3.6335,
+      "step": 12200
+    },
+    {
+      "epoch": 3.567301531830625,
+      "grad_norm": 0.31694719195365906,
+      "learning_rate": 0.000557520979020979,
+      "loss": 3.6387,
+      "step": 12250
+    },
+    {
+      "epoch": 3.5818626594443472,
+      "grad_norm": 0.32518377900123596,
+      "learning_rate": 0.0005573461538461538,
+      "loss": 3.6292,
+      "step": 12300
+    },
+    {
+      "epoch": 3.59642378705807,
+      "grad_norm": 0.33264607191085815,
+      "learning_rate": 0.0005571713286713286,
+      "loss": 3.6424,
+      "step": 12350
+    },
+    {
+      "epoch": 3.6109849146717923,
+      "grad_norm": 0.31201255321502686,
+      "learning_rate": 0.0005569965034965034,
+      "loss": 3.6499,
+      "step": 12400
+    },
+    {
+      "epoch": 3.6255460422855146,
+      "grad_norm": 0.32355648279190063,
+      "learning_rate": 0.0005568216783216783,
+      "loss": 3.6397,
+      "step": 12450
+    },
+    {
+      "epoch": 3.640107169899237,
+      "grad_norm": 0.3250090479850769,
+      "learning_rate": 0.000556646853146853,
+      "loss": 3.6408,
+      "step": 12500
+    },
+    {
+      "epoch": 3.654668297512959,
+      "grad_norm": 0.3053436577320099,
+      "learning_rate": 0.0005564720279720279,
+      "loss": 3.6246,
+      "step": 12550
+    },
+    {
+      "epoch": 3.669229425126682,
+      "grad_norm": 0.3021223545074463,
+      "learning_rate": 0.0005562972027972027,
+      "loss": 3.6422,
+      "step": 12600
+    },
+    {
+      "epoch": 3.6837905527404042,
+      "grad_norm": 0.30464479327201843,
+      "learning_rate": 0.0005561223776223775,
+      "loss": 3.6258,
+      "step": 12650
+    },
+    {
+      "epoch": 3.6983516803541265,
+      "grad_norm": 0.32816681265830994,
+      "learning_rate": 0.0005559475524475524,
+      "loss": 3.6374,
+      "step": 12700
+    },
+    {
+      "epoch": 3.7129128079678493,
+      "grad_norm": 0.3056129217147827,
+      "learning_rate": 0.0005557727272727272,
+      "loss": 3.6422,
+      "step": 12750
+    },
+    {
+      "epoch": 3.7274739355815716,
+      "grad_norm": 0.3144519031047821,
+      "learning_rate": 0.000555597902097902,
+      "loss": 3.6329,
+      "step": 12800
+    },
+    {
+      "epoch": 3.742035063195294,
+      "grad_norm": 0.3042009472846985,
+      "learning_rate": 0.0005554230769230768,
+      "loss": 3.6296,
+      "step": 12850
+    },
+    {
+      "epoch": 3.756596190809016,
+      "grad_norm": 0.3230903446674347,
+      "learning_rate": 0.0005552482517482517,
+      "loss": 3.6342,
+      "step": 12900
+    },
+    {
+      "epoch": 3.7711573184227385,
+      "grad_norm": 0.30217060446739197,
+      "learning_rate": 0.0005550734265734265,
+      "loss": 3.638,
+      "step": 12950
+    },
+    {
+      "epoch": 3.7857184460364612,
+      "grad_norm": 0.3292492628097534,
+      "learning_rate": 0.0005548986013986013,
+      "loss": 3.6368,
+      "step": 13000
+    },
+    {
+      "epoch": 3.7857184460364612,
+      "eval_accuracy": 0.35537424527745126,
+      "eval_loss": 3.6732499599456787,
+      "eval_runtime": 181.1768,
+      "eval_samples_per_second": 91.877,
+      "eval_steps_per_second": 5.746,
+      "step": 13000
+    },
+    {
+      "epoch": 3.8002795736501835,
+      "grad_norm": 0.3120541274547577,
+      "learning_rate": 0.0005547237762237761,
+      "loss": 3.6445,
+      "step": 13050
+    },
+    {
+      "epoch": 3.814840701263906,
+      "grad_norm": 0.3175022006034851,
+      "learning_rate": 0.000554548951048951,
+      "loss": 3.6454,
+      "step": 13100
+    },
+    {
+      "epoch": 3.829401828877628,
+      "grad_norm": 0.3213273286819458,
+      "learning_rate": 0.0005543741258741258,
+      "loss": 3.6236,
+      "step": 13150
+    },
+    {
+      "epoch": 3.8439629564913504,
+      "grad_norm": 0.3022514283657074,
+      "learning_rate": 0.0005541993006993006,
+      "loss": 3.6383,
+      "step": 13200
+    },
+    {
+      "epoch": 3.858524084105073,
+      "grad_norm": 0.3136868476867676,
+      "learning_rate": 0.0005540244755244756,
+      "loss": 3.6342,
+      "step": 13250
+    },
+    {
+      "epoch": 3.8730852117187955,
+      "grad_norm": 0.3210989236831665,
+      "learning_rate": 0.0005538496503496502,
+      "loss": 3.636,
+      "step": 13300
+    },
+    {
+      "epoch": 3.887646339332518,
+      "grad_norm": 0.31793737411499023,
+      "learning_rate": 0.0005536748251748252,
+      "loss": 3.6384,
+      "step": 13350
+    },
+    {
+      "epoch": 3.9022074669462405,
+      "grad_norm": 0.3172999620437622,
+      "learning_rate": 0.0005535,
+      "loss": 3.6322,
+      "step": 13400
+    },
+    {
+      "epoch": 3.916768594559963,
+      "grad_norm": 0.3040863573551178,
+      "learning_rate": 0.0005533251748251748,
+      "loss": 3.6361,
+      "step": 13450
+    },
+    {
+      "epoch": 3.931329722173685,
+      "grad_norm": 0.2977621555328369,
+      "learning_rate": 0.0005531503496503496,
+      "loss": 3.6293,
+      "step": 13500
+    },
+    {
+      "epoch": 3.9458908497874075,
+      "grad_norm": 0.32015374302864075,
+      "learning_rate": 0.0005529755244755245,
+      "loss": 3.6437,
+      "step": 13550
+    },
+    {
+      "epoch": 3.9604519774011298,
+      "grad_norm": 0.30774080753326416,
+      "learning_rate": 0.0005528006993006993,
+      "loss": 3.6305,
+      "step": 13600
+    },
+    {
+      "epoch": 3.9750131050148525,
+      "grad_norm": 0.32130375504493713,
+      "learning_rate": 0.0005526258741258741,
+      "loss": 3.6176,
+      "step": 13650
+    },
+    {
+      "epoch": 3.989574232628575,
+      "grad_norm": 0.3188425600528717,
+      "learning_rate": 0.0005524510489510489,
+      "loss": 3.6412,
+      "step": 13700
+    },
+    {
+      "epoch": 4.004077115731842,
+      "grad_norm": 0.31326737999916077,
+      "learning_rate": 0.0005522762237762238,
+      "loss": 3.599,
+      "step": 13750
+    },
+    {
+      "epoch": 4.018638243345564,
+      "grad_norm": 0.30300453305244446,
+      "learning_rate": 0.0005521013986013986,
+      "loss": 3.5165,
+      "step": 13800
+    },
+    {
+      "epoch": 4.033199370959287,
+      "grad_norm": 0.3266518712043762,
+      "learning_rate": 0.0005519265734265734,
+      "loss": 3.5381,
+      "step": 13850
+    },
+    {
+      "epoch": 4.04776049857301,
+      "grad_norm": 0.3203023076057434,
+      "learning_rate": 0.0005517517482517482,
+      "loss": 3.5187,
+      "step": 13900
+    },
+    {
+      "epoch": 4.062321626186732,
+      "grad_norm": 0.3111482262611389,
+      "learning_rate": 0.0005515769230769231,
+      "loss": 3.5362,
+      "step": 13950
+    },
+    {
+      "epoch": 4.076882753800454,
+      "grad_norm": 0.34344446659088135,
+      "learning_rate": 0.0005514020979020979,
+      "loss": 3.5417,
+      "step": 14000
+    },
+    {
+      "epoch": 4.076882753800454,
+      "eval_accuracy": 0.35658572794648924,
+      "eval_loss": 3.6685900688171387,
+      "eval_runtime": 180.9843,
+      "eval_samples_per_second": 91.975,
+      "eval_steps_per_second": 5.752,
+      "step": 14000
+    },
+    {
+      "epoch": 4.091443881414177,
+      "grad_norm": 0.3067654073238373,
+      "learning_rate": 0.0005512272727272727,
+      "loss": 3.539,
+      "step": 14050
+    },
+    {
+      "epoch": 4.106005009027899,
+      "grad_norm": 0.30828049778938293,
+      "learning_rate": 0.0005510524475524475,
+      "loss": 3.5469,
+      "step": 14100
+    },
+    {
+      "epoch": 4.120566136641622,
+      "grad_norm": 0.31522074341773987,
+      "learning_rate": 0.0005508776223776223,
+      "loss": 3.5362,
+      "step": 14150
+    },
+    {
+      "epoch": 4.135127264255344,
+      "grad_norm": 0.3391481637954712,
+      "learning_rate": 0.0005507027972027972,
+      "loss": 3.5471,
+      "step": 14200
+    },
+    {
+      "epoch": 4.149688391869066,
+      "grad_norm": 0.3163962662220001,
+      "learning_rate": 0.000550527972027972,
+      "loss": 3.5409,
+      "step": 14250
+    },
+    {
+      "epoch": 4.164249519482789,
+      "grad_norm": 0.3650486171245575,
+      "learning_rate": 0.0005503531468531468,
+      "loss": 3.5534,
+      "step": 14300
+    },
+    {
+      "epoch": 4.178810647096511,
+      "grad_norm": 0.3179774284362793,
+      "learning_rate": 0.0005501783216783216,
+      "loss": 3.5485,
+      "step": 14350
+    },
+    {
+      "epoch": 4.193371774710234,
+      "grad_norm": 0.3414445221424103,
+      "learning_rate": 0.0005500034965034965,
+      "loss": 3.5559,
+      "step": 14400
+    },
+    {
+      "epoch": 4.207932902323956,
+      "grad_norm": 0.3165196180343628,
+      "learning_rate": 0.0005498286713286713,
+      "loss": 3.5449,
+      "step": 14450
+    },
+    {
+      "epoch": 4.222494029937678,
+      "grad_norm": 0.2997719347476959,
+      "learning_rate": 0.0005496538461538461,
+      "loss": 3.5552,
+      "step": 14500
+    },
+    {
+      "epoch": 4.237055157551401,
+      "grad_norm": 0.3242793679237366,
+      "learning_rate": 0.0005494790209790209,
+      "loss": 3.5649,
+      "step": 14550
+    },
+    {
+      "epoch": 4.251616285165123,
+      "grad_norm": 0.3411242663860321,
+      "learning_rate": 0.0005493041958041958,
+      "loss": 3.5586,
+      "step": 14600
+    },
+    {
+      "epoch": 4.266177412778846,
+      "grad_norm": 0.3131914734840393,
+      "learning_rate": 0.0005491293706293706,
+      "loss": 3.5638,
+      "step": 14650
+    },
+    {
+      "epoch": 4.280738540392568,
+      "grad_norm": 0.3113696873188019,
+      "learning_rate": 0.0005489545454545454,
+      "loss": 3.5583,
+      "step": 14700
+    },
+    {
+      "epoch": 4.29529966800629,
+      "grad_norm": 0.3232826590538025,
+      "learning_rate": 0.0005487797202797203,
+      "loss": 3.5635,
+      "step": 14750
+    },
+    {
+      "epoch": 4.309860795620013,
+      "grad_norm": 0.3019048273563385,
+      "learning_rate": 0.000548604895104895,
+      "loss": 3.5583,
+      "step": 14800
+    },
+    {
+      "epoch": 4.324421923233735,
+      "grad_norm": 0.30543556809425354,
+      "learning_rate": 0.0005484300699300699,
+      "loss": 3.5636,
+      "step": 14850
+    },
+    {
+      "epoch": 4.338983050847458,
+      "grad_norm": 0.327394038438797,
+      "learning_rate": 0.0005482552447552447,
+      "loss": 3.5602,
+      "step": 14900
+    },
+    {
+      "epoch": 4.35354417846118,
+      "grad_norm": 0.30762121081352234,
+      "learning_rate": 0.0005480804195804195,
+      "loss": 3.555,
+      "step": 14950
+    },
+    {
+      "epoch": 4.368105306074902,
+      "grad_norm": 0.3166114389896393,
+      "learning_rate": 0.0005479055944055943,
+      "loss": 3.5573,
+      "step": 15000
+    },
+    {
+      "epoch": 4.368105306074902,
+      "eval_accuracy": 0.3576027620629228,
+      "eval_loss": 3.6566529273986816,
+      "eval_runtime": 181.0498,
+      "eval_samples_per_second": 91.942,
+      "eval_steps_per_second": 5.75,
+      "step": 15000
+    },
+    {
+      "epoch": 4.382666433688625,
+      "grad_norm": 0.3189477026462555,
+      "learning_rate": 0.0005477307692307692,
+      "loss": 3.5526,
+      "step": 15050
+    },
+    {
+      "epoch": 4.397227561302348,
+      "grad_norm": 0.31075966358184814,
+      "learning_rate": 0.000547555944055944,
+      "loss": 3.5699,
+      "step": 15100
+    },
+    {
+      "epoch": 4.41178868891607,
+      "grad_norm": 0.33162787556648254,
+      "learning_rate": 0.0005473811188811188,
+      "loss": 3.573,
+      "step": 15150
+    },
+    {
+      "epoch": 4.426349816529792,
+      "grad_norm": 0.31193026900291443,
+      "learning_rate": 0.0005472062937062936,
+      "loss": 3.5564,
+      "step": 15200
+    },
+    {
+      "epoch": 4.440910944143514,
+      "grad_norm": 0.30927059054374695,
+      "learning_rate": 0.0005470314685314685,
+      "loss": 3.5524,
+      "step": 15250
+    },
+    {
+      "epoch": 4.455472071757237,
+      "grad_norm": 0.31786641478538513,
+      "learning_rate": 0.0005468566433566433,
+      "loss": 3.5712,
+      "step": 15300
+    },
+    {
+      "epoch": 4.47003319937096,
+      "grad_norm": 0.31433233618736267,
+      "learning_rate": 0.0005466818181818181,
+      "loss": 3.5586,
+      "step": 15350
+    },
+    {
+      "epoch": 4.4845943269846815,
+      "grad_norm": 0.3186981976032257,
+      "learning_rate": 0.000546506993006993,
+      "loss": 3.571,
+      "step": 15400
+    },
+    {
+      "epoch": 4.499155454598404,
+      "grad_norm": 0.31594040989875793,
+      "learning_rate": 0.0005463321678321678,
+      "loss": 3.5668,
+      "step": 15450
+    },
+    {
+      "epoch": 4.513716582212126,
+      "grad_norm": 0.3135336637496948,
+      "learning_rate": 0.0005461573426573426,
+      "loss": 3.5652,
+      "step": 15500
+    },
+    {
+      "epoch": 4.528277709825849,
+      "grad_norm": 0.37292537093162537,
+      "learning_rate": 0.0005459825174825174,
+      "loss": 3.5808,
+      "step": 15550
+    },
+    {
+      "epoch": 4.542838837439572,
+      "grad_norm": 0.3025301694869995,
+      "learning_rate": 0.0005458076923076922,
+      "loss": 3.5782,
+      "step": 15600
+    },
+    {
+      "epoch": 4.5573999650532935,
+      "grad_norm": 0.3217926025390625,
+      "learning_rate": 0.000545632867132867,
+      "loss": 3.567,
+      "step": 15650
+    },
+    {
+      "epoch": 4.571961092667016,
+      "grad_norm": 0.33000338077545166,
+      "learning_rate": 0.0005454580419580419,
+      "loss": 3.5761,
+      "step": 15700
+    },
+    {
+      "epoch": 4.586522220280738,
+      "grad_norm": 0.3380807340145111,
+      "learning_rate": 0.0005452832167832167,
+      "loss": 3.5729,
+      "step": 15750
+    },
+    {
+      "epoch": 4.601083347894461,
+      "grad_norm": 0.2971727252006531,
+      "learning_rate": 0.0005451083916083915,
+      "loss": 3.5666,
+      "step": 15800
+    },
+    {
+      "epoch": 4.615644475508184,
+      "grad_norm": 0.32622653245925903,
+      "learning_rate": 0.0005449335664335663,
+      "loss": 3.5744,
+      "step": 15850
+    },
+    {
+      "epoch": 4.630205603121905,
+      "grad_norm": 0.33031386137008667,
+      "learning_rate": 0.0005447587412587412,
+      "loss": 3.5836,
+      "step": 15900
+    },
+    {
+      "epoch": 4.644766730735628,
+      "grad_norm": 0.31544092297554016,
+      "learning_rate": 0.000544583916083916,
+      "loss": 3.58,
+      "step": 15950
+    },
+    {
+      "epoch": 4.659327858349351,
+      "grad_norm": 0.32821419835090637,
+      "learning_rate": 0.0005444090909090908,
+      "loss": 3.5738,
+      "step": 16000
+    },
+    {
+      "epoch": 4.659327858349351,
+      "eval_accuracy": 0.35873136309375875,
+      "eval_loss": 3.642806053161621,
+      "eval_runtime": 181.1419,
+      "eval_samples_per_second": 91.895,
+      "eval_steps_per_second": 5.747,
+      "step": 16000
+    },
+    {
+      "epoch": 4.673888985963073,
+      "grad_norm": 0.313347727060318,
+      "learning_rate": 0.0005442342657342657,
+      "loss": 3.5561,
+      "step": 16050
+    },
+    {
+      "epoch": 4.6884501135767955,
+      "grad_norm": 0.29170653223991394,
+      "learning_rate": 0.0005440594405594405,
+      "loss": 3.5704,
+      "step": 16100
+    },
+    {
+      "epoch": 4.703011241190518,
+      "grad_norm": 0.3108586370944977,
+      "learning_rate": 0.0005438846153846153,
+      "loss": 3.5616,
+      "step": 16150
+    },
+    {
+      "epoch": 4.71757236880424,
+      "grad_norm": 0.31426018476486206,
+      "learning_rate": 0.0005437097902097901,
+      "loss": 3.569,
+      "step": 16200
+    },
+    {
+      "epoch": 4.732133496417963,
+      "grad_norm": 0.3121700584888458,
+      "learning_rate": 0.0005435349650349651,
+      "loss": 3.5648,
+      "step": 16250
+    },
+    {
+      "epoch": 4.746694624031685,
+      "grad_norm": 0.3235621154308319,
+      "learning_rate": 0.0005433601398601397,
+      "loss": 3.5713,
+      "step": 16300
+    },
+    {
+      "epoch": 4.7612557516454075,
+      "grad_norm": 0.3033587336540222,
+      "learning_rate": 0.0005431853146853147,
+      "loss": 3.5571,
+      "step": 16350
+    },
+    {
+      "epoch": 4.77581687925913,
+      "grad_norm": 0.3489319086074829,
+      "learning_rate": 0.0005430104895104895,
+      "loss": 3.5774,
+      "step": 16400
+    },
+    {
+      "epoch": 4.790378006872852,
+      "grad_norm": 0.3031330108642578,
+      "learning_rate": 0.0005428356643356643,
+      "loss": 3.571,
+      "step": 16450
+    },
+    {
+      "epoch": 4.804939134486575,
+      "grad_norm": 0.31163015961647034,
+      "learning_rate": 0.0005426608391608391,
+      "loss": 3.5646,
+      "step": 16500
+    },
+    {
+      "epoch": 4.819500262100297,
+      "grad_norm": 0.32922112941741943,
+      "learning_rate": 0.000542486013986014,
+      "loss": 3.5692,
+      "step": 16550
+    },
+    {
+      "epoch": 4.834061389714019,
+      "grad_norm": 0.32564711570739746,
+      "learning_rate": 0.0005423111888111888,
+      "loss": 3.5759,
+      "step": 16600
+    },
+    {
+      "epoch": 4.848622517327742,
+      "grad_norm": 0.3296327590942383,
+      "learning_rate": 0.0005421363636363636,
+      "loss": 3.5785,
+      "step": 16650
+    },
+    {
+      "epoch": 4.863183644941464,
+      "grad_norm": 0.3310580253601074,
+      "learning_rate": 0.0005419615384615385,
+      "loss": 3.5685,
+      "step": 16700
+    },
+    {
+      "epoch": 4.877744772555187,
+      "grad_norm": 0.31274470686912537,
+      "learning_rate": 0.0005417867132867133,
+      "loss": 3.5587,
+      "step": 16750
+    },
+    {
+      "epoch": 4.892305900168909,
+      "grad_norm": 0.323668897151947,
+      "learning_rate": 0.0005416118881118881,
+      "loss": 3.5657,
+      "step": 16800
+    },
+    {
+      "epoch": 4.906867027782631,
+      "grad_norm": 0.3014122247695923,
+      "learning_rate": 0.0005414370629370629,
+      "loss": 3.5707,
+      "step": 16850
+    },
+    {
+      "epoch": 4.921428155396354,
+      "grad_norm": 0.32767361402511597,
+      "learning_rate": 0.0005412622377622378,
+      "loss": 3.5724,
+      "step": 16900
+    },
+    {
+      "epoch": 4.935989283010076,
+      "grad_norm": 0.3055601119995117,
+      "learning_rate": 0.0005410874125874126,
+      "loss": 3.56,
+      "step": 16950
+    },
+    {
+      "epoch": 4.950550410623799,
+      "grad_norm": 0.30220872163772583,
+      "learning_rate": 0.0005409125874125874,
+      "loss": 3.585,
+      "step": 17000
+    },
+    {
+      "epoch": 4.950550410623799,
+      "eval_accuracy": 0.360180674917524,
+      "eval_loss": 3.6294431686401367,
+      "eval_runtime": 181.1546,
+      "eval_samples_per_second": 91.888,
+      "eval_steps_per_second": 5.746,
+      "step": 17000
+    },
+    {
+      "epoch": 4.9651115382375215,
+      "grad_norm": 0.34501272439956665,
+      "learning_rate": 0.0005407377622377622,
+      "loss": 3.5698,
+      "step": 17050
+    },
+    {
+      "epoch": 4.979672665851243,
+      "grad_norm": 0.31808799505233765,
+      "learning_rate": 0.000540562937062937,
+      "loss": 3.5684,
+      "step": 17100
+    },
+    {
+      "epoch": 4.994233793464966,
+      "grad_norm": 0.30529871582984924,
+      "learning_rate": 0.0005403881118881118,
+      "loss": 3.5667,
+      "step": 17150
+    },
+    {
+      "epoch": 5.008736676568233,
+      "grad_norm": 0.32560649514198303,
+      "learning_rate": 0.0005402132867132867,
+      "loss": 3.5124,
+      "step": 17200
+    },
+    {
+      "epoch": 5.023297804181956,
+      "grad_norm": 0.33668822050094604,
+      "learning_rate": 0.0005400384615384615,
+      "loss": 3.448,
+      "step": 17250
+    },
+    {
+      "epoch": 5.037858931795678,
+      "grad_norm": 0.3430330753326416,
+      "learning_rate": 0.0005398636363636363,
+      "loss": 3.462,
+      "step": 17300
+    },
+    {
+      "epoch": 5.052420059409401,
+      "grad_norm": 0.32133740186691284,
+      "learning_rate": 0.0005396888111888111,
+      "loss": 3.4736,
+      "step": 17350
+    },
+    {
+      "epoch": 5.066981187023123,
+      "grad_norm": 0.30943241715431213,
+      "learning_rate": 0.000539513986013986,
+      "loss": 3.4657,
+      "step": 17400
+    },
+    {
+      "epoch": 5.081542314636845,
+      "grad_norm": 0.3143688142299652,
+      "learning_rate": 0.0005393391608391608,
+      "loss": 3.4726,
+      "step": 17450
+    },
+    {
+      "epoch": 5.096103442250568,
+      "grad_norm": 0.3386751711368561,
+      "learning_rate": 0.0005391643356643356,
+      "loss": 3.4804,
+      "step": 17500
+    },
+    {
+      "epoch": 5.110664569864291,
+      "grad_norm": 0.31570807099342346,
+      "learning_rate": 0.0005389895104895105,
+      "loss": 3.4793,
+      "step": 17550
+    },
+    {
+      "epoch": 5.125225697478013,
+      "grad_norm": 0.32351601123809814,
+      "learning_rate": 0.0005388146853146853,
+      "loss": 3.4856,
+      "step": 17600
+    },
+    {
+      "epoch": 5.139786825091735,
+      "grad_norm": 0.30925703048706055,
+      "learning_rate": 0.0005386398601398601,
+      "loss": 3.486,
+      "step": 17650
+    },
+    {
+      "epoch": 5.154347952705457,
+      "grad_norm": 0.31706124544143677,
+      "learning_rate": 0.0005384650349650349,
+      "loss": 3.4876,
+      "step": 17700
+    },
+    {
+      "epoch": 5.16890908031918,
+      "grad_norm": 0.3108503818511963,
+      "learning_rate": 0.0005382902097902098,
+      "loss": 3.4841,
+      "step": 17750
+    },
+    {
+      "epoch": 5.183470207932903,
+      "grad_norm": 0.3236485719680786,
+      "learning_rate": 0.0005381153846153845,
+      "loss": 3.4851,
+      "step": 17800
+    },
+    {
+      "epoch": 5.1980313355466246,
+      "grad_norm": 0.30849018692970276,
+      "learning_rate": 0.0005379405594405594,
+      "loss": 3.4838,
+      "step": 17850
+    },
+    {
+      "epoch": 5.212592463160347,
+      "grad_norm": 0.3113718032836914,
+      "learning_rate": 0.0005377657342657342,
+      "loss": 3.476,
+      "step": 17900
+    },
+    {
+      "epoch": 5.227153590774069,
+      "grad_norm": 0.35100990533828735,
+      "learning_rate": 0.000537590909090909,
+      "loss": 3.4976,
+      "step": 17950
+    },
+    {
+      "epoch": 5.241714718387792,
+      "grad_norm": 0.3076687455177307,
+      "learning_rate": 0.0005374160839160838,
+      "loss": 3.5044,
+      "step": 18000
+    },
+    {
+      "epoch": 5.241714718387792,
+      "eval_accuracy": 0.3602995307135839,
+      "eval_loss": 3.6342709064483643,
+      "eval_runtime": 181.2085,
+      "eval_samples_per_second": 91.861,
+      "eval_steps_per_second": 5.745,
+      "step": 18000
+    },
+    {
+      "epoch": 5.256275846001515,
+      "grad_norm": 0.3252745270729065,
+      "learning_rate": 0.0005372412587412587,
+      "loss": 3.5038,
+      "step": 18050
+    },
+    {
+      "epoch": 5.2708369736152365,
+      "grad_norm": 0.31030163168907166,
+      "learning_rate": 0.0005370664335664335,
+      "loss": 3.49,
+      "step": 18100
+    },
+    {
+      "epoch": 5.285398101228959,
+      "grad_norm": 0.36103177070617676,
+      "learning_rate": 0.0005368916083916083,
+      "loss": 3.492,
+      "step": 18150
+    },
+    {
+      "epoch": 5.299959228842681,
+      "grad_norm": 0.32504352927207947,
+      "learning_rate": 0.0005367167832167832,
+      "loss": 3.4948,
+      "step": 18200
+    },
+    {
+      "epoch": 5.314520356456404,
+      "grad_norm": 0.3331771790981293,
+      "learning_rate": 0.000536541958041958,
+      "loss": 3.5126,
+      "step": 18250
+    },
+    {
+      "epoch": 5.329081484070127,
+      "grad_norm": 0.31116366386413574,
+      "learning_rate": 0.0005363671328671328,
+      "loss": 3.5076,
+      "step": 18300
+    },
+    {
+      "epoch": 5.3436426116838485,
+      "grad_norm": 0.3158913850784302,
+      "learning_rate": 0.0005361923076923076,
+      "loss": 3.5178,
+      "step": 18350
+    },
+    {
+      "epoch": 5.358203739297571,
+      "grad_norm": 0.32401207089424133,
+      "learning_rate": 0.0005360174825174825,
+      "loss": 3.5149,
+      "step": 18400
+    },
+    {
+      "epoch": 5.372764866911294,
+      "grad_norm": 0.340380996465683,
+      "learning_rate": 0.0005358426573426573,
+      "loss": 3.5078,
+      "step": 18450
+    },
+    {
+      "epoch": 5.387325994525016,
+      "grad_norm": 0.31110548973083496,
+      "learning_rate": 0.0005356678321678321,
+      "loss": 3.5107,
+      "step": 18500
+    },
+    {
+      "epoch": 5.401887122138739,
+      "grad_norm": 0.32105040550231934,
+      "learning_rate": 0.0005354930069930069,
+      "loss": 3.4964,
+      "step": 18550
+    },
+    {
+      "epoch": 5.41644824975246,
+      "grad_norm": 0.3346503674983978,
+      "learning_rate": 0.0005353181818181817,
+      "loss": 3.5029,
+      "step": 18600
+    },
+    {
+      "epoch": 5.431009377366183,
+      "grad_norm": 0.3195594549179077,
+      "learning_rate": 0.0005351433566433565,
+      "loss": 3.5149,
+      "step": 18650
+    },
+    {
+      "epoch": 5.445570504979906,
+      "grad_norm": 0.3379792273044586,
+      "learning_rate": 0.0005349685314685314,
+      "loss": 3.5073,
+      "step": 18700
+    },
+    {
+      "epoch": 5.460131632593628,
+      "grad_norm": 0.3183291256427765,
+      "learning_rate": 0.0005347937062937062,
+      "loss": 3.5091,
+      "step": 18750
+    },
+    {
+      "epoch": 5.4746927602073505,
+      "grad_norm": 0.3273472785949707,
+      "learning_rate": 0.000534618881118881,
+      "loss": 3.5196,
+      "step": 18800
+    },
+    {
+      "epoch": 5.489253887821073,
+      "grad_norm": 0.3325451910495758,
+      "learning_rate": 0.0005344440559440559,
+      "loss": 3.5159,
+      "step": 18850
+    },
+    {
+      "epoch": 5.503815015434795,
+      "grad_norm": 0.33070364594459534,
+      "learning_rate": 0.0005342692307692307,
+      "loss": 3.518,
+      "step": 18900
+    },
+    {
+      "epoch": 5.518376143048518,
+      "grad_norm": 0.33015328645706177,
+      "learning_rate": 0.0005340944055944055,
+      "loss": 3.5238,
+      "step": 18950
+    },
+    {
+      "epoch": 5.53293727066224,
+      "grad_norm": 0.307849645614624,
+      "learning_rate": 0.0005339195804195803,
+      "loss": 3.5108,
+      "step": 19000
+    },
+    {
+      "epoch": 5.53293727066224,
+      "eval_accuracy": 0.36167172146690857,
+      "eval_loss": 3.62229061126709,
+      "eval_runtime": 181.6251,
+      "eval_samples_per_second": 91.65,
+      "eval_steps_per_second": 5.732,
+      "step": 19000
+    },
+    {
+      "epoch": 5.5474983982759625,
+      "grad_norm": 0.2991660237312317,
+      "learning_rate": 0.0005337447552447552,
+      "loss": 3.5153,
+      "step": 19050
+    },
+    {
+      "epoch": 5.562059525889685,
+      "grad_norm": 0.33114874362945557,
+      "learning_rate": 0.00053356993006993,
+      "loss": 3.5165,
+      "step": 19100
+    },
+    {
+      "epoch": 5.576620653503407,
+      "grad_norm": 0.3147285282611847,
+      "learning_rate": 0.0005333951048951048,
+      "loss": 3.5263,
+      "step": 19150
+    },
+    {
+      "epoch": 5.59118178111713,
+      "grad_norm": 0.32433706521987915,
+      "learning_rate": 0.0005332202797202796,
+      "loss": 3.5213,
+      "step": 19200
+    },
+    {
+      "epoch": 5.605742908730852,
+      "grad_norm": 0.386737197637558,
+      "learning_rate": 0.0005330454545454546,
+      "loss": 3.5211,
+      "step": 19250
+    },
+    {
+      "epoch": 5.620304036344574,
+      "grad_norm": 0.32082608342170715,
+      "learning_rate": 0.0005328706293706292,
+      "loss": 3.53,
+      "step": 19300
+    },
+    {
+      "epoch": 5.634865163958297,
+      "grad_norm": 0.2935941219329834,
+      "learning_rate": 0.0005326958041958042,
+      "loss": 3.5197,
+      "step": 19350
+    },
+    {
+      "epoch": 5.649426291572019,
+      "grad_norm": 0.31667831540107727,
+      "learning_rate": 0.000532520979020979,
+      "loss": 3.5095,
+      "step": 19400
+    },
+    {
+      "epoch": 5.663987419185742,
+      "grad_norm": 0.306999534368515,
+      "learning_rate": 0.0005323461538461538,
+      "loss": 3.5187,
+      "step": 19450
+    },
+    {
+      "epoch": 5.6785485467994645,
+      "grad_norm": 0.30406883358955383,
+      "learning_rate": 0.0005321713286713287,
+      "loss": 3.5199,
+      "step": 19500
+    },
+    {
+      "epoch": 5.693109674413186,
+      "grad_norm": 0.3210119903087616,
+      "learning_rate": 0.0005319965034965035,
+      "loss": 3.527,
+      "step": 19550
+    },
+    {
+      "epoch": 5.707670802026909,
+      "grad_norm": 0.33628612756729126,
+      "learning_rate": 0.0005318216783216783,
+      "loss": 3.5168,
+      "step": 19600
+    },
+    {
+      "epoch": 5.722231929640631,
+      "grad_norm": 0.3189808428287506,
+      "learning_rate": 0.0005316468531468531,
+      "loss": 3.5246,
+      "step": 19650
+    },
+    {
+      "epoch": 5.736793057254354,
+      "grad_norm": 0.3225274682044983,
+      "learning_rate": 0.000531472027972028,
+      "loss": 3.5168,
+      "step": 19700
+    },
+    {
+      "epoch": 5.7513541848680765,
+      "grad_norm": 0.32404860854148865,
+      "learning_rate": 0.0005312972027972028,
+      "loss": 3.5305,
+      "step": 19750
+    },
+    {
+      "epoch": 5.765915312481798,
+      "grad_norm": 0.33228975534439087,
+      "learning_rate": 0.0005311223776223776,
+      "loss": 3.5191,
+      "step": 19800
+    },
+    {
+      "epoch": 5.780476440095521,
+      "grad_norm": 0.3346024453639984,
+      "learning_rate": 0.0005309475524475524,
+      "loss": 3.5233,
+      "step": 19850
+    },
+    {
+      "epoch": 5.795037567709244,
+      "grad_norm": 0.30757418274879456,
+      "learning_rate": 0.0005307727272727273,
+      "loss": 3.5203,
+      "step": 19900
+    },
+    {
+      "epoch": 5.809598695322966,
+      "grad_norm": 0.33240988850593567,
+      "learning_rate": 0.0005305979020979021,
+      "loss": 3.5232,
+      "step": 19950
+    },
+    {
+      "epoch": 5.824159822936688,
+      "grad_norm": 0.3051111698150635,
+      "learning_rate": 0.0005304230769230769,
+      "loss": 3.5355,
+      "step": 20000
+    },
+    {
+      "epoch": 5.824159822936688,
+      "eval_accuracy": 0.3623371258246723,
+      "eval_loss": 3.6103367805480957,
+      "eval_runtime": 181.4963,
+      "eval_samples_per_second": 91.715,
+      "eval_steps_per_second": 5.736,
+      "step": 20000
+    },
+    {
+      "epoch": 5.83872095055041,
+      "grad_norm": 0.3242921233177185,
+      "learning_rate": 0.0005302482517482517,
+      "loss": 3.5295,
+      "step": 20050
+    },
+    {
+      "epoch": 5.853282078164133,
+      "grad_norm": 0.3176933526992798,
+      "learning_rate": 0.0005300734265734265,
+      "loss": 3.5191,
+      "step": 20100
+    },
+    {
+      "epoch": 5.867843205777856,
+      "grad_norm": 0.34109026193618774,
+      "learning_rate": 0.0005298986013986013,
+      "loss": 3.5357,
+      "step": 20150
+    },
+    {
+      "epoch": 5.882404333391578,
+      "grad_norm": 0.2950345575809479,
+      "learning_rate": 0.0005297237762237762,
+      "loss": 3.5309,
+      "step": 20200
+    },
+    {
+      "epoch": 5.8969654610053,
+      "grad_norm": 0.30492424964904785,
+      "learning_rate": 0.000529548951048951,
+      "loss": 3.5426,
+      "step": 20250
+    },
+    {
+      "epoch": 5.911526588619022,
+      "grad_norm": 0.3427123725414276,
+      "learning_rate": 0.0005293741258741258,
+      "loss": 3.5293,
+      "step": 20300
+    },
+    {
+      "epoch": 5.926087716232745,
+      "grad_norm": 0.30794087052345276,
+      "learning_rate": 0.0005291993006993007,
+      "loss": 3.5297,
+      "step": 20350
+    },
+    {
+      "epoch": 5.940648843846468,
+      "grad_norm": 0.30005186796188354,
+      "learning_rate": 0.0005290244755244755,
+      "loss": 3.5312,
+      "step": 20400
+    },
+    {
+      "epoch": 5.95520997146019,
+      "grad_norm": 0.3218655586242676,
+      "learning_rate": 0.0005288496503496503,
+      "loss": 3.5224,
+      "step": 20450
+    },
+    {
+      "epoch": 5.969771099073912,
+      "grad_norm": 0.3385027348995209,
+      "learning_rate": 0.0005286748251748251,
+      "loss": 3.5319,
+      "step": 20500
+    },
+    {
+      "epoch": 5.984332226687634,
+      "grad_norm": 0.33277666568756104,
+      "learning_rate": 0.0005285,
+      "loss": 3.5229,
+      "step": 20550
+    },
+    {
+      "epoch": 5.998893354301357,
+      "grad_norm": 0.2984437346458435,
+      "learning_rate": 0.0005283251748251748,
+      "loss": 3.5246,
+      "step": 20600
+    },
+    {
+      "epoch": 6.013396237404625,
+      "grad_norm": 0.3273489773273468,
+      "learning_rate": 0.0005281503496503496,
+      "loss": 3.4096,
+      "step": 20650
+    },
+    {
+      "epoch": 6.027957365018347,
+      "grad_norm": 0.32948926091194153,
+      "learning_rate": 0.0005279755244755244,
+      "loss": 3.4094,
+      "step": 20700
+    },
+    {
+      "epoch": 6.04251849263207,
+      "grad_norm": 0.3348131775856018,
+      "learning_rate": 0.0005278006993006993,
+      "loss": 3.4196,
+      "step": 20750
+    },
+    {
+      "epoch": 6.0570796202457915,
+      "grad_norm": 0.3126414716243744,
+      "learning_rate": 0.000527625874125874,
+      "loss": 3.4201,
+      "step": 20800
+    },
+    {
+      "epoch": 6.071640747859514,
+      "grad_norm": 0.33334097266197205,
+      "learning_rate": 0.0005274510489510489,
+      "loss": 3.4269,
+      "step": 20850
+    },
+    {
+      "epoch": 6.086201875473237,
+      "grad_norm": 0.32628950476646423,
+      "learning_rate": 0.0005272762237762238,
+      "loss": 3.432,
+      "step": 20900
+    },
+    {
+      "epoch": 6.100763003086959,
+      "grad_norm": 0.3498513996601105,
+      "learning_rate": 0.0005271013986013985,
+      "loss": 3.425,
+      "step": 20950
+    },
+    {
+      "epoch": 6.115324130700682,
+      "grad_norm": 0.29833361506462097,
+      "learning_rate": 0.0005269265734265734,
+      "loss": 3.4252,
+      "step": 21000
+    },
+    {
+      "epoch": 6.115324130700682,
+      "eval_accuracy": 0.36289260914453686,
+      "eval_loss": 3.613590717315674,
+      "eval_runtime": 181.366,
+      "eval_samples_per_second": 91.781,
+      "eval_steps_per_second": 5.74,
+      "step": 21000
+    },
+    {
+      "epoch": 6.1298852583144035,
+      "grad_norm": 0.3302992880344391,
+      "learning_rate": 0.0005267517482517482,
+      "loss": 3.4361,
+      "step": 21050
+    },
+    {
+      "epoch": 6.144446385928126,
+      "grad_norm": 0.3029468059539795,
+      "learning_rate": 0.000526576923076923,
+      "loss": 3.4468,
+      "step": 21100
+    },
+    {
+      "epoch": 6.159007513541849,
+      "grad_norm": 0.3242226243019104,
+      "learning_rate": 0.0005264020979020978,
+      "loss": 3.4352,
+      "step": 21150
+    },
+    {
+      "epoch": 6.173568641155571,
+      "grad_norm": 0.31961268186569214,
+      "learning_rate": 0.0005262272727272727,
+      "loss": 3.4358,
+      "step": 21200
+    },
+    {
+      "epoch": 6.1881297687692935,
+      "grad_norm": 0.33194205164909363,
+      "learning_rate": 0.0005260524475524475,
+      "loss": 3.4442,
+      "step": 21250
+    },
+    {
+      "epoch": 6.202690896383016,
+      "grad_norm": 0.32972991466522217,
+      "learning_rate": 0.0005258776223776223,
+      "loss": 3.4521,
+      "step": 21300
+    },
+    {
+      "epoch": 6.217252023996738,
+      "grad_norm": 0.31378456950187683,
+      "learning_rate": 0.0005257027972027971,
+      "loss": 3.4469,
+      "step": 21350
+    },
+    {
+      "epoch": 6.231813151610461,
+      "grad_norm": 0.3384927809238434,
+      "learning_rate": 0.000525527972027972,
+      "loss": 3.4621,
+      "step": 21400
+    },
+    {
+      "epoch": 6.246374279224183,
+      "grad_norm": 0.3361877202987671,
+      "learning_rate": 0.0005253531468531468,
+      "loss": 3.4574,
+      "step": 21450
+    },
+    {
+      "epoch": 6.2609354068379055,
+      "grad_norm": 0.3300999402999878,
+      "learning_rate": 0.0005251783216783216,
+      "loss": 3.4494,
+      "step": 21500
+    },
+    {
+      "epoch": 6.275496534451628,
+      "grad_norm": 0.31691470742225647,
+      "learning_rate": 0.0005250034965034965,
+      "loss": 3.4443,
+      "step": 21550
+    },
+    {
+      "epoch": 6.29005766206535,
+      "grad_norm": 0.30609896779060364,
+      "learning_rate": 0.0005248286713286712,
+      "loss": 3.465,
+      "step": 21600
+    },
+    {
+      "epoch": 6.304618789679073,
+      "grad_norm": 0.3142344653606415,
+      "learning_rate": 0.0005246538461538461,
+      "loss": 3.4489,
+      "step": 21650
+    },
+    {
+      "epoch": 6.319179917292795,
+      "grad_norm": 0.3219060003757477,
+      "learning_rate": 0.0005244790209790209,
+      "loss": 3.4663,
+      "step": 21700
+    },
+    {
+      "epoch": 6.3337410449065175,
+      "grad_norm": 0.3099410831928253,
+      "learning_rate": 0.0005243041958041957,
+      "loss": 3.472,
+      "step": 21750
+    },
+    {
+      "epoch": 6.34830217252024,
+      "grad_norm": 0.31502556800842285,
+      "learning_rate": 0.0005241293706293705,
+      "loss": 3.4804,
+      "step": 21800
+    },
+    {
+      "epoch": 6.362863300133962,
+      "grad_norm": 0.30814129114151,
+      "learning_rate": 0.0005239545454545454,
+      "loss": 3.4601,
+      "step": 21850
+    },
+    {
+      "epoch": 6.377424427747685,
+      "grad_norm": 0.3281274735927582,
+      "learning_rate": 0.0005237797202797202,
+      "loss": 3.4743,
+      "step": 21900
+    },
+    {
+      "epoch": 6.391985555361408,
+      "grad_norm": 0.33183300495147705,
+      "learning_rate": 0.000523604895104895,
+      "loss": 3.4548,
+      "step": 21950
+    },
+    {
+      "epoch": 6.406546682975129,
+      "grad_norm": 0.33608192205429077,
+      "learning_rate": 0.0005234300699300698,
+      "loss": 3.4717,
+      "step": 22000
+    },
+    {
+      "epoch": 6.406546682975129,
+      "eval_accuracy": 0.3633190087214996,
+      "eval_loss": 3.6065382957458496,
+      "eval_runtime": 181.0468,
+      "eval_samples_per_second": 91.943,
+      "eval_steps_per_second": 5.75,
+      "step": 22000
+    },
+    {
+      "epoch": 6.421107810588852,
+      "grad_norm": 0.3260321319103241,
+      "learning_rate": 0.0005232552447552447,
+      "loss": 3.4682,
+      "step": 22050
+    },
+    {
+      "epoch": 6.435668938202574,
+      "grad_norm": 0.33026236295700073,
+      "learning_rate": 0.0005230804195804195,
+      "loss": 3.4802,
+      "step": 22100
+    },
+    {
+      "epoch": 6.450230065816297,
+      "grad_norm": 0.30417993664741516,
+      "learning_rate": 0.0005229055944055943,
+      "loss": 3.476,
+      "step": 22150
+    },
+    {
+      "epoch": 6.4647911934300195,
+      "grad_norm": 0.3218960464000702,
+      "learning_rate": 0.0005227307692307691,
+      "loss": 3.4754,
+      "step": 22200
+    },
+    {
+      "epoch": 6.479352321043741,
+      "grad_norm": 0.31791952252388,
+      "learning_rate": 0.0005225559440559441,
+      "loss": 3.4712,
+      "step": 22250
+    },
+    {
+      "epoch": 6.493913448657464,
+      "grad_norm": 0.31013575196266174,
+      "learning_rate": 0.0005223811188811189,
+      "loss": 3.4725,
+      "step": 22300
+    },
+    {
+      "epoch": 6.508474576271187,
+      "grad_norm": 0.3217204213142395,
+      "learning_rate": 0.0005222062937062937,
+      "loss": 3.4725,
+      "step": 22350
+    },
+    {
+      "epoch": 6.523035703884909,
+      "grad_norm": 0.31372180581092834,
+      "learning_rate": 0.0005220314685314686,
+      "loss": 3.4723,
+      "step": 22400
+    },
+    {
+      "epoch": 6.5375968314986315,
+      "grad_norm": 0.3291754126548767,
+      "learning_rate": 0.0005218566433566433,
+      "loss": 3.4768,
+      "step": 22450
+    },
+    {
+      "epoch": 6.552157959112353,
+      "grad_norm": 0.3256649672985077,
+      "learning_rate": 0.0005216818181818182,
+      "loss": 3.4629,
+      "step": 22500
+    },
+    {
+      "epoch": 6.566719086726076,
+      "grad_norm": 0.323404461145401,
+      "learning_rate": 0.000521506993006993,
+      "loss": 3.4747,
+      "step": 22550
+    },
+    {
+      "epoch": 6.581280214339799,
+      "grad_norm": 0.33667072653770447,
+      "learning_rate": 0.0005213321678321678,
+      "loss": 3.4652,
+      "step": 22600
+    },
+    {
+      "epoch": 6.595841341953521,
+      "grad_norm": 0.3154466450214386,
+      "learning_rate": 0.0005211573426573426,
+      "loss": 3.4836,
+      "step": 22650
+    },
+    {
+      "epoch": 6.610402469567243,
+      "grad_norm": 0.34323811531066895,
+      "learning_rate": 0.0005209825174825175,
+      "loss": 3.4798,
+      "step": 22700
+    },
+    {
+      "epoch": 6.624963597180965,
+      "grad_norm": 0.3353569209575653,
+      "learning_rate": 0.0005208076923076923,
+      "loss": 3.4756,
+      "step": 22750
+    },
+    {
+      "epoch": 6.639524724794688,
+      "grad_norm": 0.3173069655895233,
+      "learning_rate": 0.0005206328671328671,
+      "loss": 3.4838,
+      "step": 22800
+    },
+    {
+      "epoch": 6.654085852408411,
+      "grad_norm": 0.3082946836948395,
+      "learning_rate": 0.0005204580419580419,
+      "loss": 3.4859,
+      "step": 22850
+    },
+    {
+      "epoch": 6.668646980022133,
+      "grad_norm": 0.32030490040779114,
+      "learning_rate": 0.0005202832167832168,
+      "loss": 3.4902,
+      "step": 22900
+    },
+    {
+      "epoch": 6.683208107635855,
+      "grad_norm": 0.3350687623023987,
+      "learning_rate": 0.0005201083916083916,
+      "loss": 3.4842,
+      "step": 22950
+    },
+    {
+      "epoch": 6.697769235249577,
+      "grad_norm": 0.324341356754303,
+      "learning_rate": 0.0005199335664335664,
+      "loss": 3.4676,
+      "step": 23000
+    },
+    {
+      "epoch": 6.697769235249577,
+      "eval_accuracy": 0.3642195382940208,
+      "eval_loss": 3.5976805686950684,
+      "eval_runtime": 181.2142,
+      "eval_samples_per_second": 91.858,
+      "eval_steps_per_second": 5.745,
+      "step": 23000
+    },
+    {
+      "epoch": 6.7123303628633,
+      "grad_norm": 0.31654438376426697,
+      "learning_rate": 0.0005197587412587413,
+      "loss": 3.487,
+      "step": 23050
+    },
+    {
+      "epoch": 6.726891490477023,
+      "grad_norm": 0.324179083108902,
+      "learning_rate": 0.0005195839160839161,
+      "loss": 3.4772,
+      "step": 23100
+    },
+    {
+      "epoch": 6.741452618090745,
+      "grad_norm": 0.3194001019001007,
+      "learning_rate": 0.0005194090909090909,
+      "loss": 3.4873,
+      "step": 23150
+    },
+    {
+      "epoch": 6.756013745704467,
+      "grad_norm": 0.3092760741710663,
+      "learning_rate": 0.0005192342657342657,
+      "loss": 3.4903,
+      "step": 23200
+    },
+    {
+      "epoch": 6.77057487331819,
+      "grad_norm": 0.3114040195941925,
+      "learning_rate": 0.0005190594405594405,
+      "loss": 3.4924,
+      "step": 23250
+    },
+    {
+      "epoch": 6.785136000931912,
+      "grad_norm": 0.3235475420951843,
+      "learning_rate": 0.0005188846153846153,
+      "loss": 3.4826,
+      "step": 23300
+    },
+    {
+      "epoch": 6.799697128545635,
+      "grad_norm": 0.3458871841430664,
+      "learning_rate": 0.0005187097902097902,
+      "loss": 3.4872,
+      "step": 23350
+    },
+    {
+      "epoch": 6.814258256159357,
+      "grad_norm": 0.34217914938926697,
+      "learning_rate": 0.000518534965034965,
+      "loss": 3.4947,
+      "step": 23400
+    },
+    {
+      "epoch": 6.828819383773079,
+      "grad_norm": 0.3139799237251282,
+      "learning_rate": 0.0005183601398601398,
+      "loss": 3.4925,
+      "step": 23450
+    },
+    {
+      "epoch": 6.843380511386802,
+      "grad_norm": 0.33358052372932434,
+      "learning_rate": 0.0005181853146853146,
+      "loss": 3.4904,
+      "step": 23500
+    },
+    {
+      "epoch": 6.857941639000524,
+      "grad_norm": 0.32350531220436096,
+      "learning_rate": 0.0005180104895104895,
+      "loss": 3.4951,
+      "step": 23550
+    },
+    {
+      "epoch": 6.872502766614247,
+      "grad_norm": 0.32974010705947876,
+      "learning_rate": 0.0005178356643356643,
+      "loss": 3.493,
+      "step": 23600
+    },
+    {
+      "epoch": 6.887063894227969,
+      "grad_norm": 0.33600473403930664,
+      "learning_rate": 0.0005176608391608391,
+      "loss": 3.4893,
+      "step": 23650
+    },
+    {
+      "epoch": 6.901625021841691,
+      "grad_norm": 0.32952529191970825,
+      "learning_rate": 0.000517486013986014,
+      "loss": 3.4971,
+      "step": 23700
+    },
+    {
+      "epoch": 6.916186149455414,
+      "grad_norm": 0.325702041387558,
+      "learning_rate": 0.0005173111888111888,
+      "loss": 3.4979,
+      "step": 23750
+    },
+    {
+      "epoch": 6.930747277069136,
+      "grad_norm": 0.32848599553108215,
+      "learning_rate": 0.0005171363636363636,
+      "loss": 3.4868,
+      "step": 23800
+    },
+    {
+      "epoch": 6.945308404682859,
+      "grad_norm": 0.3094465732574463,
+      "learning_rate": 0.0005169615384615384,
+      "loss": 3.4919,
+      "step": 23850
+    },
+    {
+      "epoch": 6.959869532296581,
+      "grad_norm": 0.3336765468120575,
+      "learning_rate": 0.0005167867132867133,
+      "loss": 3.4885,
+      "step": 23900
+    },
+    {
+      "epoch": 6.974430659910303,
+      "grad_norm": 0.32281845808029175,
+      "learning_rate": 0.000516611888111888,
+      "loss": 3.4788,
+      "step": 23950
+    },
+    {
+      "epoch": 6.988991787524026,
+      "grad_norm": 0.37186485528945923,
+      "learning_rate": 0.0005164370629370629,
+      "loss": 3.4867,
+      "step": 24000
+    },
+    {
+      "epoch": 6.988991787524026,
+      "eval_accuracy": 0.3649564207170708,
+      "eval_loss": 3.5883593559265137,
+      "eval_runtime": 181.1707,
+      "eval_samples_per_second": 91.88,
+      "eval_steps_per_second": 5.746,
+      "step": 24000
+    },
+    {
+      "epoch": 7.003494670627293,
+      "grad_norm": 0.31783783435821533,
+      "learning_rate": 0.0005162622377622377,
+      "loss": 3.459,
+      "step": 24050
+    },
+    {
+      "epoch": 7.018055798241016,
+      "grad_norm": 0.34084847569465637,
+      "learning_rate": 0.0005160874125874125,
+      "loss": 3.379,
+      "step": 24100
+    },
+    {
+      "epoch": 7.032616925854738,
+      "grad_norm": 0.3169894218444824,
+      "learning_rate": 0.0005159125874125873,
+      "loss": 3.3842,
+      "step": 24150
+    },
+    {
+      "epoch": 7.0471780534684605,
+      "grad_norm": 0.32522594928741455,
+      "learning_rate": 0.0005157377622377622,
+      "loss": 3.3877,
+      "step": 24200
+    },
+    {
+      "epoch": 7.061739181082183,
+      "grad_norm": 0.3461341857910156,
+      "learning_rate": 0.000515562937062937,
+      "loss": 3.3905,
+      "step": 24250
+    },
+    {
+      "epoch": 7.076300308695905,
+      "grad_norm": 0.328916996717453,
+      "learning_rate": 0.0005153881118881118,
+      "loss": 3.3949,
+      "step": 24300
+    },
+    {
+      "epoch": 7.090861436309628,
+      "grad_norm": 0.34981271624565125,
+      "learning_rate": 0.0005152132867132867,
+      "loss": 3.399,
+      "step": 24350
+    },
+    {
+      "epoch": 7.105422563923351,
+      "grad_norm": 0.3182472884654999,
+      "learning_rate": 0.0005150384615384615,
+      "loss": 3.4032,
+      "step": 24400
+    },
+    {
+      "epoch": 7.1199836915370724,
+      "grad_norm": 0.3407321870326996,
+      "learning_rate": 0.0005148636363636363,
+      "loss": 3.4098,
+      "step": 24450
+    },
+    {
+      "epoch": 7.134544819150795,
+      "grad_norm": 0.31166911125183105,
+      "learning_rate": 0.0005146888111888111,
+      "loss": 3.3845,
+      "step": 24500
+    },
+    {
+      "epoch": 7.149105946764517,
+      "grad_norm": 0.3245963454246521,
+      "learning_rate": 0.000514513986013986,
+      "loss": 3.4064,
+      "step": 24550
+    },
+    {
+      "epoch": 7.16366707437824,
+      "grad_norm": 0.35094335675239563,
+      "learning_rate": 0.0005143391608391608,
+      "loss": 3.4088,
+      "step": 24600
+    },
+    {
+      "epoch": 7.1782282019919625,
+      "grad_norm": 0.31868040561676025,
+      "learning_rate": 0.0005141643356643356,
+      "loss": 3.4047,
+      "step": 24650
+    },
+    {
+      "epoch": 7.192789329605684,
+      "grad_norm": 0.32330843806266785,
+      "learning_rate": 0.0005139895104895104,
+      "loss": 3.4007,
+      "step": 24700
+    },
+    {
+      "epoch": 7.207350457219407,
+      "grad_norm": 0.34358319640159607,
+      "learning_rate": 0.0005138146853146852,
+      "loss": 3.4181,
+      "step": 24750
+    },
+    {
+      "epoch": 7.22191158483313,
+      "grad_norm": 0.3291915953159332,
+      "learning_rate": 0.00051363986013986,
+      "loss": 3.4165,
+      "step": 24800
+    },
+    {
+      "epoch": 7.236472712446852,
+      "grad_norm": 0.31977158784866333,
+      "learning_rate": 0.0005134650349650349,
+      "loss": 3.4246,
+      "step": 24850
+    },
+    {
+      "epoch": 7.2510338400605745,
+      "grad_norm": 0.33271709084510803,
+      "learning_rate": 0.0005132902097902097,
+      "loss": 3.4101,
+      "step": 24900
+    },
+    {
+      "epoch": 7.265594967674296,
+      "grad_norm": 0.3385375738143921,
+      "learning_rate": 0.0005131153846153845,
+      "loss": 3.4185,
+      "step": 24950
+    },
+    {
+      "epoch": 7.280156095288019,
+      "grad_norm": 0.359745055437088,
+      "learning_rate": 0.0005129405594405594,
+      "loss": 3.4248,
+      "step": 25000
+    },
+    {
+      "epoch": 7.280156095288019,
+      "eval_accuracy": 0.36494066732768204,
+      "eval_loss": 3.5982604026794434,
+      "eval_runtime": 181.1034,
+      "eval_samples_per_second": 91.914,
+      "eval_steps_per_second": 5.748,
+      "step": 25000
+    },
+    {
+      "epoch": 7.294717222901742,
+      "grad_norm": 0.32137957215309143,
+      "learning_rate": 0.0005127657342657342,
+      "loss": 3.4147,
+      "step": 25050
+    },
+    {
+      "epoch": 7.309278350515464,
+      "grad_norm": 0.3147618770599365,
+      "learning_rate": 0.000512590909090909,
+      "loss": 3.4202,
+      "step": 25100
+    },
+    {
+      "epoch": 7.3238394781291865,
+      "grad_norm": 0.3232477605342865,
+      "learning_rate": 0.0005124160839160838,
+      "loss": 3.4304,
+      "step": 25150
+    },
+    {
+      "epoch": 7.338400605742908,
+      "grad_norm": 0.3342421352863312,
+      "learning_rate": 0.0005122412587412588,
+      "loss": 3.4261,
+      "step": 25200
+    },
+    {
+      "epoch": 7.352961733356631,
+      "grad_norm": 0.3140034079551697,
+      "learning_rate": 0.0005120664335664336,
+      "loss": 3.435,
+      "step": 25250
+    },
+    {
+      "epoch": 7.367522860970354,
+      "grad_norm": 0.33676761388778687,
+      "learning_rate": 0.0005118916083916084,
+      "loss": 3.4313,
+      "step": 25300
+    },
+    {
+      "epoch": 7.382083988584076,
+      "grad_norm": 0.33039015531539917,
+      "learning_rate": 0.0005117167832167832,
+      "loss": 3.435,
+      "step": 25350
+    },
+    {
+      "epoch": 7.396645116197798,
+      "grad_norm": 0.34258249402046204,
+      "learning_rate": 0.0005115419580419581,
+      "loss": 3.4223,
+      "step": 25400
+    },
+    {
+      "epoch": 7.411206243811521,
+      "grad_norm": 0.3384631276130676,
+      "learning_rate": 0.0005113671328671328,
+      "loss": 3.4243,
+      "step": 25450
+    },
+    {
+      "epoch": 7.425767371425243,
+      "grad_norm": 0.32956787943840027,
+      "learning_rate": 0.0005111923076923077,
+      "loss": 3.4416,
+      "step": 25500
+    },
+    {
+      "epoch": 7.440328499038966,
+      "grad_norm": 0.3352907598018646,
+      "learning_rate": 0.0005110174825174825,
+      "loss": 3.4284,
+      "step": 25550
+    },
+    {
+      "epoch": 7.454889626652688,
+      "grad_norm": 0.3281717002391815,
+      "learning_rate": 0.0005108426573426573,
+      "loss": 3.4443,
+      "step": 25600
+    },
+    {
+      "epoch": 7.46945075426641,
+      "grad_norm": 0.34179285168647766,
+      "learning_rate": 0.0005106678321678321,
+      "loss": 3.435,
+      "step": 25650
+    },
+    {
+      "epoch": 7.484011881880133,
+      "grad_norm": 0.35076916217803955,
+      "learning_rate": 0.000510493006993007,
+      "loss": 3.4447,
+      "step": 25700
+    },
+    {
+      "epoch": 7.498573009493855,
+      "grad_norm": 0.3248736262321472,
+      "learning_rate": 0.0005103181818181818,
+      "loss": 3.4386,
+      "step": 25750
+    },
+    {
+      "epoch": 7.513134137107578,
+      "grad_norm": 0.3269752860069275,
+      "learning_rate": 0.0005101433566433566,
+      "loss": 3.4369,
+      "step": 25800
+    },
+    {
+      "epoch": 7.5276952647213005,
+      "grad_norm": 0.343089759349823,
+      "learning_rate": 0.0005099685314685315,
+      "loss": 3.447,
+      "step": 25850
+    },
+    {
+      "epoch": 7.542256392335022,
+      "grad_norm": 0.3274691104888916,
+      "learning_rate": 0.0005097937062937063,
+      "loss": 3.4378,
+      "step": 25900
+    },
+    {
+      "epoch": 7.556817519948745,
+      "grad_norm": 0.31769490242004395,
+      "learning_rate": 0.0005096188811188811,
+      "loss": 3.4429,
+      "step": 25950
+    },
+    {
+      "epoch": 7.571378647562467,
+      "grad_norm": 0.30322396755218506,
+      "learning_rate": 0.0005094440559440559,
+      "loss": 3.4455,
+      "step": 26000
+    },
+    {
+      "epoch": 7.571378647562467,
+      "eval_accuracy": 0.3655577534538131,
+      "eval_loss": 3.586430072784424,
+      "eval_runtime": 181.088,
+      "eval_samples_per_second": 91.922,
+      "eval_steps_per_second": 5.749,
+      "step": 26000
+    },
+    {
+      "epoch": 7.58593977517619,
+      "grad_norm": 0.3594802916049957,
+      "learning_rate": 0.0005092692307692308,
+      "loss": 3.4449,
+      "step": 26050
+    },
+    {
+      "epoch": 7.600500902789912,
+      "grad_norm": 0.34348103404045105,
+      "learning_rate": 0.0005090944055944056,
+      "loss": 3.4474,
+      "step": 26100
+    },
+    {
+      "epoch": 7.615062030403634,
+      "grad_norm": 0.32391390204429626,
+      "learning_rate": 0.0005089195804195804,
+      "loss": 3.445,
+      "step": 26150
+    },
+    {
+      "epoch": 7.629623158017357,
+      "grad_norm": 0.3242975175380707,
+      "learning_rate": 0.0005087447552447552,
+      "loss": 3.4404,
+      "step": 26200
+    },
+    {
+      "epoch": 7.644184285631079,
+      "grad_norm": 0.35450035333633423,
+      "learning_rate": 0.00050856993006993,
+      "loss": 3.4475,
+      "step": 26250
+    },
+    {
+      "epoch": 7.658745413244802,
+      "grad_norm": 0.32055407762527466,
+      "learning_rate": 0.0005083951048951048,
+      "loss": 3.4406,
+      "step": 26300
+    },
+    {
+      "epoch": 7.673306540858524,
+      "grad_norm": 0.31129223108291626,
+      "learning_rate": 0.0005082202797202797,
+      "loss": 3.4532,
+      "step": 26350
+    },
+    {
+      "epoch": 7.687867668472246,
+      "grad_norm": 0.33475667238235474,
+      "learning_rate": 0.0005080454545454545,
+      "loss": 3.4582,
+      "step": 26400
+    },
+    {
+      "epoch": 7.702428796085969,
+      "grad_norm": 0.3636000454425812,
+      "learning_rate": 0.0005078706293706293,
+      "loss": 3.442,
+      "step": 26450
+    },
+    {
+      "epoch": 7.716989923699691,
+      "grad_norm": 0.3021737039089203,
+      "learning_rate": 0.0005076958041958042,
+      "loss": 3.4586,
+      "step": 26500
+    },
+    {
+      "epoch": 7.731551051313414,
+      "grad_norm": 0.32579872012138367,
+      "learning_rate": 0.000507520979020979,
+      "loss": 3.4468,
+      "step": 26550
+    },
+    {
+      "epoch": 7.746112178927136,
+      "grad_norm": 0.3306429982185364,
+      "learning_rate": 0.0005073461538461538,
+      "loss": 3.4508,
+      "step": 26600
+    },
+    {
+      "epoch": 7.760673306540858,
+      "grad_norm": 0.33379238843917847,
+      "learning_rate": 0.0005071713286713286,
+      "loss": 3.4554,
+      "step": 26650
+    },
+    {
+      "epoch": 7.775234434154581,
+      "grad_norm": 0.3489669859409332,
+      "learning_rate": 0.0005069965034965035,
+      "loss": 3.4479,
+      "step": 26700
+    },
+    {
+      "epoch": 7.789795561768304,
+      "grad_norm": 0.3256928324699402,
+      "learning_rate": 0.0005068216783216783,
+      "loss": 3.4562,
+      "step": 26750
+    },
+    {
+      "epoch": 7.8043566893820255,
+      "grad_norm": 0.30848199129104614,
+      "learning_rate": 0.0005066468531468531,
+      "loss": 3.4483,
+      "step": 26800
+    },
+    {
+      "epoch": 7.818917816995748,
+      "grad_norm": 0.3393913507461548,
+      "learning_rate": 0.0005064720279720279,
+      "loss": 3.4709,
+      "step": 26850
+    },
+    {
+      "epoch": 7.833478944609471,
+      "grad_norm": 0.31153106689453125,
+      "learning_rate": 0.0005062972027972028,
+      "loss": 3.4658,
+      "step": 26900
+    },
+    {
+      "epoch": 7.848040072223193,
+      "grad_norm": 0.328171044588089,
+      "learning_rate": 0.0005061223776223775,
+      "loss": 3.4517,
+      "step": 26950
+    },
+    {
+      "epoch": 7.862601199836916,
+      "grad_norm": 0.32433098554611206,
+      "learning_rate": 0.0005059475524475524,
+      "loss": 3.4606,
+      "step": 27000
+    },
+    {
+      "epoch": 7.862601199836916,
+      "eval_accuracy": 0.366670836220475,
+      "eval_loss": 3.5772430896759033,
+      "eval_runtime": 181.1731,
+      "eval_samples_per_second": 91.879,
+      "eval_steps_per_second": 5.746,
+      "step": 27000
+    },
+    {
+      "epoch": 7.8771623274506375,
+      "grad_norm": 0.3007674515247345,
+      "learning_rate": 0.0005057727272727272,
+      "loss": 3.4629,
+      "step": 27050
+    },
+    {
+      "epoch": 7.89172345506436,
+      "grad_norm": 0.3290211856365204,
+      "learning_rate": 0.000505597902097902,
+      "loss": 3.4581,
+      "step": 27100
+    },
+    {
+      "epoch": 7.906284582678083,
+      "grad_norm": 0.3350273072719574,
+      "learning_rate": 0.0005054230769230769,
+      "loss": 3.4522,
+      "step": 27150
+    },
+    {
+      "epoch": 7.920845710291805,
+      "grad_norm": 0.3044837713241577,
+      "learning_rate": 0.0005052482517482517,
+      "loss": 3.4548,
+      "step": 27200
+    },
+    {
+      "epoch": 7.935406837905528,
+      "grad_norm": 0.3208082914352417,
+      "learning_rate": 0.0005050734265734265,
+      "loss": 3.4407,
+      "step": 27250
+    },
+    {
+      "epoch": 7.9499679655192494,
+      "grad_norm": 0.33409732580184937,
+      "learning_rate": 0.0005048986013986013,
+      "loss": 3.4498,
+      "step": 27300
+    },
+    {
+      "epoch": 7.964529093132972,
+      "grad_norm": 0.33270469307899475,
+      "learning_rate": 0.0005047237762237762,
+      "loss": 3.4592,
+      "step": 27350
+    },
+    {
+      "epoch": 7.979090220746695,
+      "grad_norm": 0.336601585149765,
+      "learning_rate": 0.000504548951048951,
+      "loss": 3.4603,
+      "step": 27400
+    },
+    {
+      "epoch": 7.993651348360417,
+      "grad_norm": 0.32460638880729675,
+      "learning_rate": 0.0005043741258741258,
+      "loss": 3.4516,
+      "step": 27450
+    },
+    {
+      "epoch": 8.008154231463685,
+      "grad_norm": 0.33659544587135315,
+      "learning_rate": 0.0005041993006993006,
+      "loss": 3.404,
+      "step": 27500
+    },
+    {
+      "epoch": 8.022715359077408,
+      "grad_norm": 0.3334440588951111,
+      "learning_rate": 0.0005040244755244755,
+      "loss": 3.3413,
+      "step": 27550
+    },
+    {
+      "epoch": 8.037276486691129,
+      "grad_norm": 0.32501035928726196,
+      "learning_rate": 0.0005038496503496503,
+      "loss": 3.3455,
+      "step": 27600
+    },
+    {
+      "epoch": 8.051837614304851,
+      "grad_norm": 0.33923956751823425,
+      "learning_rate": 0.0005036748251748251,
+      "loss": 3.3632,
+      "step": 27650
+    },
+    {
+      "epoch": 8.066398741918574,
+      "grad_norm": 0.3274213671684265,
+      "learning_rate": 0.0005034999999999999,
+      "loss": 3.3568,
+      "step": 27700
+    },
+    {
+      "epoch": 8.080959869532297,
+      "grad_norm": 0.3114986717700958,
+      "learning_rate": 0.0005033251748251747,
+      "loss": 3.3671,
+      "step": 27750
+    },
+    {
+      "epoch": 8.09552099714602,
+      "grad_norm": 0.33176517486572266,
+      "learning_rate": 0.0005031503496503496,
+      "loss": 3.3705,
+      "step": 27800
+    },
+    {
+      "epoch": 8.11008212475974,
+      "grad_norm": 0.3320193290710449,
+      "learning_rate": 0.0005029755244755244,
+      "loss": 3.365,
+      "step": 27850
+    },
+    {
+      "epoch": 8.124643252373463,
+      "grad_norm": 0.3689771890640259,
+      "learning_rate": 0.0005028006993006992,
+      "loss": 3.3782,
+      "step": 27900
+    },
+    {
+      "epoch": 8.139204379987186,
+      "grad_norm": 0.32468920946121216,
+      "learning_rate": 0.000502625874125874,
+      "loss": 3.372,
+      "step": 27950
+    },
+    {
+      "epoch": 8.153765507600909,
+      "grad_norm": 0.3300785422325134,
+      "learning_rate": 0.000502451048951049,
+      "loss": 3.3815,
+      "step": 28000
+    },
+    {
+      "epoch": 8.153765507600909,
+      "eval_accuracy": 0.3664210156797952,
+      "eval_loss": 3.589651584625244,
+      "eval_runtime": 181.0562,
+      "eval_samples_per_second": 91.938,
+      "eval_steps_per_second": 5.75,
+      "step": 28000
+    },
+    {
+      "epoch": 8.168326635214632,
+      "grad_norm": 0.3502894639968872,
+      "learning_rate": 0.0005022762237762237,
+      "loss": 3.3722,
+      "step": 28050
+    },
+    {
+      "epoch": 8.182887762828354,
+      "grad_norm": 0.32582199573516846,
+      "learning_rate": 0.0005021013986013985,
+      "loss": 3.3825,
+      "step": 28100
+    },
+    {
+      "epoch": 8.197448890442075,
+      "grad_norm": 0.3195355534553528,
+      "learning_rate": 0.0005019265734265733,
+      "loss": 3.3769,
+      "step": 28150
+    },
+    {
+      "epoch": 8.212010018055798,
+      "grad_norm": 0.3466898500919342,
+      "learning_rate": 0.0005017517482517483,
+      "loss": 3.3755,
+      "step": 28200
+    },
+    {
+      "epoch": 8.22657114566952,
+      "grad_norm": 0.31910231709480286,
+      "learning_rate": 0.0005015769230769231,
+      "loss": 3.3877,
+      "step": 28250
+    },
+    {
+      "epoch": 8.241132273283243,
+      "grad_norm": 0.3181063234806061,
+      "learning_rate": 0.0005014020979020979,
+      "loss": 3.3976,
+      "step": 28300
+    },
+    {
+      "epoch": 8.255693400896966,
+      "grad_norm": 0.3086499869823456,
+      "learning_rate": 0.0005012272727272727,
+      "loss": 3.3798,
+      "step": 28350
+    },
+    {
+      "epoch": 8.270254528510687,
+      "grad_norm": 0.32086050510406494,
+      "learning_rate": 0.0005010524475524476,
+      "loss": 3.3847,
+      "step": 28400
+    },
+    {
+      "epoch": 8.28481565612441,
+      "grad_norm": 0.33809781074523926,
+      "learning_rate": 0.0005008776223776223,
+      "loss": 3.4049,
+      "step": 28450
+    },
+    {
+      "epoch": 8.299376783738133,
+      "grad_norm": 0.365345299243927,
+      "learning_rate": 0.0005007027972027972,
+      "loss": 3.3976,
+      "step": 28500
+    },
+    {
+      "epoch": 8.313937911351855,
+      "grad_norm": 0.31952720880508423,
+      "learning_rate": 0.000500527972027972,
+      "loss": 3.4071,
+      "step": 28550
+    },
+    {
+      "epoch": 8.328499038965578,
+      "grad_norm": 0.3320535123348236,
+      "learning_rate": 0.0005003531468531468,
+      "loss": 3.4125,
+      "step": 28600
+    },
+    {
+      "epoch": 8.3430601665793,
+      "grad_norm": 0.3217061758041382,
+      "learning_rate": 0.0005001783216783217,
+      "loss": 3.3984,
+      "step": 28650
+    },
+    {
+      "epoch": 8.357621294193022,
+      "grad_norm": 0.3326950967311859,
+      "learning_rate": 0.0005000034965034965,
+      "loss": 3.3986,
+      "step": 28700
+    },
+    {
+      "epoch": 8.372182421806745,
+      "grad_norm": 0.33330121636390686,
+      "learning_rate": 0.0004998286713286713,
+      "loss": 3.3976,
+      "step": 28750
+    },
+    {
+      "epoch": 8.386743549420467,
+      "grad_norm": 0.3166747987270355,
+      "learning_rate": 0.0004996538461538461,
+      "loss": 3.4002,
+      "step": 28800
+    },
+    {
+      "epoch": 8.40130467703419,
+      "grad_norm": 0.34696197509765625,
+      "learning_rate": 0.000499479020979021,
+      "loss": 3.4108,
+      "step": 28850
+    },
+    {
+      "epoch": 8.415865804647911,
+      "grad_norm": 0.3111904263496399,
+      "learning_rate": 0.0004993041958041958,
+      "loss": 3.3988,
+      "step": 28900
+    },
+    {
+      "epoch": 8.430426932261634,
+      "grad_norm": 0.3558425307273865,
+      "learning_rate": 0.0004991293706293706,
+      "loss": 3.4024,
+      "step": 28950
+    },
+    {
+      "epoch": 8.444988059875357,
+      "grad_norm": 0.37141433358192444,
+      "learning_rate": 0.0004989545454545454,
+      "loss": 3.4063,
+      "step": 29000
+    },
+    {
+      "epoch": 8.444988059875357,
+      "eval_accuracy": 0.3664435877004119,
+      "eval_loss": 3.581801414489746,
+      "eval_runtime": 181.2088,
+      "eval_samples_per_second": 91.861,
+      "eval_steps_per_second": 5.745,
+      "step": 29000
+    },
+    {
+      "epoch": 8.45954918748908,
+      "grad_norm": 0.31924572587013245,
+      "learning_rate": 0.0004987797202797203,
+      "loss": 3.4258,
+      "step": 29050
+    },
+    {
+      "epoch": 8.474110315102802,
+      "grad_norm": 0.3425278961658478,
+      "learning_rate": 0.0004986048951048951,
+      "loss": 3.4101,
+      "step": 29100
+    },
+    {
+      "epoch": 8.488671442716523,
+      "grad_norm": 0.3478877544403076,
+      "learning_rate": 0.0004984300699300699,
+      "loss": 3.4145,
+      "step": 29150
+    },
+    {
+      "epoch": 8.503232570330246,
+      "grad_norm": 0.35354796051979065,
+      "learning_rate": 0.0004982552447552448,
+      "loss": 3.4072,
+      "step": 29200
+    },
+    {
+      "epoch": 8.517793697943969,
+      "grad_norm": 0.32307496666908264,
+      "learning_rate": 0.0004980804195804195,
+      "loss": 3.4305,
+      "step": 29250
+    },
+    {
+      "epoch": 8.532354825557691,
+      "grad_norm": 0.3157086670398712,
+      "learning_rate": 0.0004979055944055944,
+      "loss": 3.4064,
+      "step": 29300
+    },
+    {
+      "epoch": 8.546915953171414,
+      "grad_norm": 0.3342621326446533,
+      "learning_rate": 0.0004977307692307692,
+      "loss": 3.3926,
+      "step": 29350
+    },
+    {
+      "epoch": 8.561477080785137,
+      "grad_norm": 0.31849896907806396,
+      "learning_rate": 0.000497555944055944,
+      "loss": 3.4163,
+      "step": 29400
+    },
+    {
+      "epoch": 8.576038208398858,
+      "grad_norm": 0.31333833932876587,
+      "learning_rate": 0.0004973811188811188,
+      "loss": 3.4039,
+      "step": 29450
+    },
+    {
+      "epoch": 8.59059933601258,
+      "grad_norm": 0.3372572660446167,
+      "learning_rate": 0.0004972062937062937,
+      "loss": 3.4262,
+      "step": 29500
+    },
+    {
+      "epoch": 8.605160463626303,
+      "grad_norm": 0.3162132799625397,
+      "learning_rate": 0.0004970314685314685,
+      "loss": 3.4172,
+      "step": 29550
+    },
+    {
+      "epoch": 8.619721591240026,
+      "grad_norm": 0.3454541862010956,
+      "learning_rate": 0.0004968566433566433,
+      "loss": 3.4144,
+      "step": 29600
+    },
+    {
+      "epoch": 8.634282718853749,
+      "grad_norm": 0.3138660192489624,
+      "learning_rate": 0.0004966818181818181,
+      "loss": 3.4081,
+      "step": 29650
+    },
+    {
+      "epoch": 8.64884384646747,
+      "grad_norm": 0.3220912516117096,
+      "learning_rate": 0.000496506993006993,
+      "loss": 3.417,
+      "step": 29700
+    },
+    {
+      "epoch": 8.663404974081192,
+      "grad_norm": 0.32789018750190735,
+      "learning_rate": 0.0004963321678321678,
+      "loss": 3.4131,
+      "step": 29750
+    },
+    {
+      "epoch": 8.677966101694915,
+      "grad_norm": 0.3420283794403076,
+      "learning_rate": 0.0004961573426573426,
+      "loss": 3.4202,
+      "step": 29800
+    },
+    {
+      "epoch": 8.692527229308638,
+      "grad_norm": 0.35813525319099426,
+      "learning_rate": 0.0004959825174825175,
+      "loss": 3.4203,
+      "step": 29850
+    },
+    {
+      "epoch": 8.70708835692236,
+      "grad_norm": 0.33372020721435547,
+      "learning_rate": 0.0004958076923076923,
+      "loss": 3.4255,
+      "step": 29900
+    },
+    {
+      "epoch": 8.721649484536082,
+      "grad_norm": 0.3299264907836914,
+      "learning_rate": 0.0004956328671328671,
+      "loss": 3.4189,
+      "step": 29950
+    },
+    {
+      "epoch": 8.736210612149804,
+      "grad_norm": 0.34100714325904846,
+      "learning_rate": 0.0004954580419580419,
+      "loss": 3.4294,
+      "step": 30000
+    },
+    {
+      "epoch": 8.736210612149804,
+      "eval_accuracy": 0.367117809253729,
+      "eval_loss": 3.57382869720459,
+      "eval_runtime": 181.1298,
+      "eval_samples_per_second": 91.901,
+      "eval_steps_per_second": 5.747,
+      "step": 30000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 171700,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 10000,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 20,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 0
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.2705907007488e+17,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}