diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,7033 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.36469730123997085,
+  "eval_steps": 500,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00036469730123997083,
+      "grad_norm": 0.2268046736717224,
+      "learning_rate": 4e-05,
+      "loss": 0.6811,
+      "step": 1
+    },
+    {
+      "epoch": 0.0007293946024799417,
+      "grad_norm": 0.2326797991991043,
+      "learning_rate": 8e-05,
+      "loss": 0.5105,
+      "step": 2
+    },
+    {
+      "epoch": 0.0010940919037199124,
+      "grad_norm": 0.22285917401313782,
+      "learning_rate": 0.00012,
+      "loss": 0.5868,
+      "step": 3
+    },
+    {
+      "epoch": 0.0014587892049598833,
+      "grad_norm": 0.19026874005794525,
+      "learning_rate": 0.00016,
+      "loss": 0.4699,
+      "step": 4
+    },
+    {
+      "epoch": 0.0018234865061998542,
+      "grad_norm": 0.2471790909767151,
+      "learning_rate": 0.0002,
+      "loss": 0.6535,
+      "step": 5
+    },
+    {
+      "epoch": 0.002188183807439825,
+      "grad_norm": 0.24426604807376862,
+      "learning_rate": 0.00019992692729265622,
+      "loss": 0.5943,
+      "step": 6
+    },
+    {
+      "epoch": 0.002552881108679796,
+      "grad_norm": 0.22199125587940216,
+      "learning_rate": 0.0001998538545853124,
+      "loss": 0.5346,
+      "step": 7
+    },
+    {
+      "epoch": 0.0029175784099197666,
+      "grad_norm": 0.28295931220054626,
+      "learning_rate": 0.00019978078187796858,
+      "loss": 0.5446,
+      "step": 8
+    },
+    {
+      "epoch": 0.0032822757111597373,
+      "grad_norm": 0.28008294105529785,
+      "learning_rate": 0.0001997077091706248,
+      "loss": 0.714,
+      "step": 9
+    },
+    {
+      "epoch": 0.0036469730123997084,
+      "grad_norm": 0.23789669573307037,
+      "learning_rate": 0.00019963463646328097,
+      "loss": 0.505,
+      "step": 10
+    },
+    {
+      "epoch": 0.0040116703136396795,
+      "grad_norm": 0.31461969017982483,
+      "learning_rate": 0.00019956156375593718,
+      "loss": 0.6611,
+      "step": 11
+    },
+    {
+      "epoch": 0.00437636761487965,
+      "grad_norm": 0.31836387515068054,
+      "learning_rate": 0.00019948849104859336,
+      "loss": 0.5954,
+      "step": 12
+    },
+    {
+      "epoch": 0.004741064916119621,
+      "grad_norm": 0.24425436556339264,
+      "learning_rate": 0.00019941541834124954,
+      "loss": 0.4776,
+      "step": 13
+    },
+    {
+      "epoch": 0.005105762217359592,
+      "grad_norm": 0.2626420259475708,
+      "learning_rate": 0.00019934234563390575,
+      "loss": 0.451,
+      "step": 14
+    },
+    {
+      "epoch": 0.005470459518599562,
+      "grad_norm": 0.3038848042488098,
+      "learning_rate": 0.00019926927292656196,
+      "loss": 0.7207,
+      "step": 15
+    },
+    {
+      "epoch": 0.005835156819839533,
+      "grad_norm": 0.25323376059532166,
+      "learning_rate": 0.0001991962002192181,
+      "loss": 0.4251,
+      "step": 16
+    },
+    {
+      "epoch": 0.006199854121079504,
+      "grad_norm": 0.3116416931152344,
+      "learning_rate": 0.00019912312751187432,
+      "loss": 0.724,
+      "step": 17
+    },
+    {
+      "epoch": 0.006564551422319475,
+      "grad_norm": 0.28092512488365173,
+      "learning_rate": 0.00019905005480453053,
+      "loss": 0.6591,
+      "step": 18
+    },
+    {
+      "epoch": 0.006929248723559446,
+      "grad_norm": 0.3688206076622009,
+      "learning_rate": 0.0001989769820971867,
+      "loss": 0.8551,
+      "step": 19
+    },
+    {
+      "epoch": 0.007293946024799417,
+      "grad_norm": 0.2335461676120758,
+      "learning_rate": 0.00019890390938984292,
+      "loss": 0.3711,
+      "step": 20
+    },
+    {
+      "epoch": 0.007658643326039387,
+      "grad_norm": 0.28437790274620056,
+      "learning_rate": 0.0001988308366824991,
+      "loss": 0.4625,
+      "step": 21
+    },
+    {
+      "epoch": 0.008023340627279359,
+      "grad_norm": 0.31511029601097107,
+      "learning_rate": 0.00019875776397515528,
+      "loss": 0.6102,
+      "step": 22
+    },
+    {
+      "epoch": 0.008388037928519328,
+      "grad_norm": 0.2957281768321991,
+      "learning_rate": 0.0001986846912678115,
+      "loss": 0.6009,
+      "step": 23
+    },
+    {
+      "epoch": 0.0087527352297593,
+      "grad_norm": 0.26605701446533203,
+      "learning_rate": 0.00019861161856046767,
+      "loss": 0.4492,
+      "step": 24
+    },
+    {
+      "epoch": 0.00911743253099927,
+      "grad_norm": 0.2744329273700714,
+      "learning_rate": 0.00019853854585312385,
+      "loss": 0.4166,
+      "step": 25
+    },
+    {
+      "epoch": 0.009482129832239242,
+      "grad_norm": 0.3793030083179474,
+      "learning_rate": 0.00019846547314578006,
+      "loss": 0.7185,
+      "step": 26
+    },
+    {
+      "epoch": 0.009846827133479213,
+      "grad_norm": 0.3891250789165497,
+      "learning_rate": 0.00019839240043843624,
+      "loss": 0.9231,
+      "step": 27
+    },
+    {
+      "epoch": 0.010211524434719184,
+      "grad_norm": 0.4095641076564789,
+      "learning_rate": 0.00019831932773109245,
+      "loss": 0.706,
+      "step": 28
+    },
+    {
+      "epoch": 0.010576221735959153,
+      "grad_norm": 0.3578352928161621,
+      "learning_rate": 0.00019824625502374866,
+      "loss": 0.5935,
+      "step": 29
+    },
+    {
+      "epoch": 0.010940919037199124,
+      "grad_norm": 0.3827204406261444,
+      "learning_rate": 0.0001981731823164048,
+      "loss": 0.8403,
+      "step": 30
+    },
+    {
+      "epoch": 0.011305616338439095,
+      "grad_norm": 0.3178398609161377,
+      "learning_rate": 0.00019810010960906102,
+      "loss": 0.588,
+      "step": 31
+    },
+    {
+      "epoch": 0.011670313639679067,
+      "grad_norm": 0.3158668279647827,
+      "learning_rate": 0.00019802703690171723,
+      "loss": 0.5753,
+      "step": 32
+    },
+    {
+      "epoch": 0.012035010940919038,
+      "grad_norm": 0.26300448179244995,
+      "learning_rate": 0.0001979539641943734,
+      "loss": 0.573,
+      "step": 33
+    },
+    {
+      "epoch": 0.012399708242159009,
+      "grad_norm": 0.2760365605354309,
+      "learning_rate": 0.0001978808914870296,
+      "loss": 0.5887,
+      "step": 34
+    },
+    {
+      "epoch": 0.012764405543398978,
+      "grad_norm": 0.34060901403427124,
+      "learning_rate": 0.0001978078187796858,
+      "loss": 0.7347,
+      "step": 35
+    },
+    {
+      "epoch": 0.01312910284463895,
+      "grad_norm": 0.2601701319217682,
+      "learning_rate": 0.00019773474607234198,
+      "loss": 0.5684,
+      "step": 36
+    },
+    {
+      "epoch": 0.01349380014587892,
+      "grad_norm": 0.33629322052001953,
+      "learning_rate": 0.0001976616733649982,
+      "loss": 0.8124,
+      "step": 37
+    },
+    {
+      "epoch": 0.013858497447118891,
+      "grad_norm": 0.36825031042099,
+      "learning_rate": 0.00019758860065765437,
+      "loss": 0.8472,
+      "step": 38
+    },
+    {
+      "epoch": 0.014223194748358862,
+      "grad_norm": 0.28288817405700684,
+      "learning_rate": 0.00019751552795031055,
+      "loss": 0.76,
+      "step": 39
+    },
+    {
+      "epoch": 0.014587892049598834,
+      "grad_norm": 0.29537615180015564,
+      "learning_rate": 0.00019744245524296676,
+      "loss": 0.7321,
+      "step": 40
+    },
+    {
+      "epoch": 0.014952589350838803,
+      "grad_norm": 0.34148740768432617,
+      "learning_rate": 0.00019736938253562297,
+      "loss": 0.8342,
+      "step": 41
+    },
+    {
+      "epoch": 0.015317286652078774,
+      "grad_norm": 0.292447566986084,
+      "learning_rate": 0.00019729630982827915,
+      "loss": 0.6134,
+      "step": 42
+    },
+    {
+      "epoch": 0.015681983953318747,
+      "grad_norm": 0.2850889265537262,
+      "learning_rate": 0.00019722323712093533,
+      "loss": 0.6998,
+      "step": 43
+    },
+    {
+      "epoch": 0.016046681254558718,
+      "grad_norm": 0.3336108326911926,
+      "learning_rate": 0.00019715016441359154,
+      "loss": 0.8365,
+      "step": 44
+    },
+    {
+      "epoch": 0.016411378555798686,
+      "grad_norm": 0.34880322217941284,
+      "learning_rate": 0.00019707709170624772,
+      "loss": 0.8137,
+      "step": 45
+    },
+    {
+      "epoch": 0.016776075857038657,
+      "grad_norm": 0.27575618028640747,
+      "learning_rate": 0.00019700401899890393,
+      "loss": 0.7041,
+      "step": 46
+    },
+    {
+      "epoch": 0.017140773158278628,
+      "grad_norm": 0.32200825214385986,
+      "learning_rate": 0.0001969309462915601,
+      "loss": 0.9865,
+      "step": 47
+    },
+    {
+      "epoch": 0.0175054704595186,
+      "grad_norm": 0.31082266569137573,
+      "learning_rate": 0.0001968578735842163,
+      "loss": 0.7872,
+      "step": 48
+    },
+    {
+      "epoch": 0.01787016776075857,
+      "grad_norm": 0.3037458658218384,
+      "learning_rate": 0.0001967848008768725,
+      "loss": 1.005,
+      "step": 49
+    },
+    {
+      "epoch": 0.01823486506199854,
+      "grad_norm": 0.22701998054981232,
+      "learning_rate": 0.0001967117281695287,
+      "loss": 0.4318,
+      "step": 50
+    },
+    {
+      "epoch": 0.018599562363238512,
+      "grad_norm": 0.27476680278778076,
+      "learning_rate": 0.00019663865546218486,
+      "loss": 0.6872,
+      "step": 51
+    },
+    {
+      "epoch": 0.018964259664478483,
+      "grad_norm": 0.2562110424041748,
+      "learning_rate": 0.00019656558275484107,
+      "loss": 0.6356,
+      "step": 52
+    },
+    {
+      "epoch": 0.019328956965718454,
+      "grad_norm": 0.2805593013763428,
+      "learning_rate": 0.00019649251004749728,
+      "loss": 0.8285,
+      "step": 53
+    },
+    {
+      "epoch": 0.019693654266958426,
+      "grad_norm": 0.32811108231544495,
+      "learning_rate": 0.00019641943734015346,
+      "loss": 0.8711,
+      "step": 54
+    },
+    {
+      "epoch": 0.020058351568198397,
+      "grad_norm": 0.24847714602947235,
+      "learning_rate": 0.00019634636463280967,
+      "loss": 0.6721,
+      "step": 55
+    },
+    {
+      "epoch": 0.020423048869438368,
+      "grad_norm": 0.2888585031032562,
+      "learning_rate": 0.00019627329192546585,
+      "loss": 0.786,
+      "step": 56
+    },
+    {
+      "epoch": 0.020787746170678335,
+      "grad_norm": 0.2474086582660675,
+      "learning_rate": 0.00019620021921812203,
+      "loss": 0.6735,
+      "step": 57
+    },
+    {
+      "epoch": 0.021152443471918306,
+      "grad_norm": 0.28172221779823303,
+      "learning_rate": 0.00019612714651077824,
+      "loss": 0.7776,
+      "step": 58
+    },
+    {
+      "epoch": 0.021517140773158278,
+      "grad_norm": 0.3260613977909088,
+      "learning_rate": 0.00019605407380343442,
+      "loss": 0.8941,
+      "step": 59
+    },
+    {
+      "epoch": 0.02188183807439825,
+      "grad_norm": 0.2589282989501953,
+      "learning_rate": 0.0001959810010960906,
+      "loss": 0.6758,
+      "step": 60
+    },
+    {
+      "epoch": 0.02224653537563822,
+      "grad_norm": 0.2978575825691223,
+      "learning_rate": 0.0001959079283887468,
+      "loss": 0.7084,
+      "step": 61
+    },
+    {
+      "epoch": 0.02261123267687819,
+      "grad_norm": 0.2522169351577759,
+      "learning_rate": 0.000195834855681403,
+      "loss": 0.6634,
+      "step": 62
+    },
+    {
+      "epoch": 0.022975929978118162,
+      "grad_norm": 0.3184927999973297,
+      "learning_rate": 0.0001957617829740592,
+      "loss": 0.9409,
+      "step": 63
+    },
+    {
+      "epoch": 0.023340627279358133,
+      "grad_norm": 0.24009554088115692,
+      "learning_rate": 0.00019568871026671538,
+      "loss": 0.6867,
+      "step": 64
+    },
+    {
+      "epoch": 0.023705324580598104,
+      "grad_norm": 0.2735375761985779,
+      "learning_rate": 0.00019561563755937157,
+      "loss": 0.6533,
+      "step": 65
+    },
+    {
+      "epoch": 0.024070021881838075,
+      "grad_norm": 0.2777388095855713,
+      "learning_rate": 0.00019554256485202777,
+      "loss": 0.7517,
+      "step": 66
+    },
+    {
+      "epoch": 0.024434719183078046,
+      "grad_norm": 0.271108478307724,
+      "learning_rate": 0.00019546949214468398,
+      "loss": 0.8724,
+      "step": 67
+    },
+    {
+      "epoch": 0.024799416484318017,
+      "grad_norm": 0.22648799419403076,
+      "learning_rate": 0.00019539641943734016,
+      "loss": 0.523,
+      "step": 68
+    },
+    {
+      "epoch": 0.02516411378555799,
+      "grad_norm": 0.2377820461988449,
+      "learning_rate": 0.00019532334672999634,
+      "loss": 0.5099,
+      "step": 69
+    },
+    {
+      "epoch": 0.025528811086797956,
+      "grad_norm": 0.22092792391777039,
+      "learning_rate": 0.00019525027402265255,
+      "loss": 0.5447,
+      "step": 70
+    },
+    {
+      "epoch": 0.025893508388037927,
+      "grad_norm": 0.26954007148742676,
+      "learning_rate": 0.00019517720131530873,
+      "loss": 0.6141,
+      "step": 71
+    },
+    {
+      "epoch": 0.0262582056892779,
+      "grad_norm": 0.2562531530857086,
+      "learning_rate": 0.00019510412860796494,
+      "loss": 0.7991,
+      "step": 72
+    },
+    {
+      "epoch": 0.02662290299051787,
+      "grad_norm": 0.28438082337379456,
+      "learning_rate": 0.00019503105590062112,
+      "loss": 0.9198,
+      "step": 73
+    },
+    {
+      "epoch": 0.02698760029175784,
+      "grad_norm": 0.1985676884651184,
+      "learning_rate": 0.0001949579831932773,
+      "loss": 0.4564,
+      "step": 74
+    },
+    {
+      "epoch": 0.02735229759299781,
+      "grad_norm": 0.23653608560562134,
+      "learning_rate": 0.00019488491048593351,
+      "loss": 0.6741,
+      "step": 75
+    },
+    {
+      "epoch": 0.027716994894237783,
+      "grad_norm": 0.18751463294029236,
+      "learning_rate": 0.00019481183777858972,
+      "loss": 0.378,
+      "step": 76
+    },
+    {
+      "epoch": 0.028081692195477754,
+      "grad_norm": 0.2654147148132324,
+      "learning_rate": 0.0001947387650712459,
+      "loss": 0.6615,
+      "step": 77
+    },
+    {
+      "epoch": 0.028446389496717725,
+      "grad_norm": 0.2540780007839203,
+      "learning_rate": 0.00019466569236390208,
+      "loss": 0.661,
+      "step": 78
+    },
+    {
+      "epoch": 0.028811086797957696,
+      "grad_norm": 0.2665940821170807,
+      "learning_rate": 0.0001945926196565583,
+      "loss": 0.7608,
+      "step": 79
+    },
+    {
+      "epoch": 0.029175784099197667,
+      "grad_norm": 0.27822214365005493,
+      "learning_rate": 0.00019451954694921447,
+      "loss": 0.9142,
+      "step": 80
+    },
+    {
+      "epoch": 0.02954048140043764,
+      "grad_norm": 0.26205846667289734,
+      "learning_rate": 0.00019444647424187068,
+      "loss": 0.7416,
+      "step": 81
+    },
+    {
+      "epoch": 0.029905178701677606,
+      "grad_norm": 0.2633398175239563,
+      "learning_rate": 0.00019437340153452686,
+      "loss": 0.6727,
+      "step": 82
+    },
+    {
+      "epoch": 0.030269876002917577,
+      "grad_norm": 0.2297828495502472,
+      "learning_rate": 0.00019430032882718305,
+      "loss": 0.5744,
+      "step": 83
+    },
+    {
+      "epoch": 0.030634573304157548,
+      "grad_norm": 0.27555879950523376,
+      "learning_rate": 0.00019422725611983925,
+      "loss": 0.7682,
+      "step": 84
+    },
+    {
+      "epoch": 0.03099927060539752,
+      "grad_norm": 0.2495211362838745,
+      "learning_rate": 0.00019415418341249546,
+      "loss": 0.7603,
+      "step": 85
+    },
+    {
+      "epoch": 0.031363967906637494,
+      "grad_norm": 0.24564798176288605,
+      "learning_rate": 0.00019408111070515162,
+      "loss": 0.6745,
+      "step": 86
+    },
+    {
+      "epoch": 0.03172866520787746,
+      "grad_norm": 0.2123216986656189,
+      "learning_rate": 0.00019400803799780783,
+      "loss": 0.5624,
+      "step": 87
+    },
+    {
+      "epoch": 0.032093362509117436,
+      "grad_norm": 0.26791101694107056,
+      "learning_rate": 0.00019393496529046403,
+      "loss": 0.8236,
+      "step": 88
+    },
+    {
+      "epoch": 0.032458059810357404,
+      "grad_norm": 0.24294224381446838,
+      "learning_rate": 0.00019386189258312021,
+      "loss": 0.7493,
+      "step": 89
+    },
+    {
+      "epoch": 0.03282275711159737,
+      "grad_norm": 0.25676801800727844,
+      "learning_rate": 0.00019378881987577642,
+      "loss": 0.8281,
+      "step": 90
+    },
+    {
+      "epoch": 0.033187454412837346,
+      "grad_norm": 0.22630107402801514,
+      "learning_rate": 0.0001937157471684326,
+      "loss": 0.6335,
+      "step": 91
+    },
+    {
+      "epoch": 0.03355215171407731,
+      "grad_norm": 0.25208771228790283,
+      "learning_rate": 0.00019364267446108879,
+      "loss": 0.7753,
+      "step": 92
+    },
+    {
+      "epoch": 0.03391684901531729,
+      "grad_norm": 0.23288992047309875,
+      "learning_rate": 0.000193569601753745,
+      "loss": 0.683,
+      "step": 93
+    },
+    {
+      "epoch": 0.034281546316557256,
+      "grad_norm": 0.2340337187051773,
+      "learning_rate": 0.00019349652904640118,
+      "loss": 0.7111,
+      "step": 94
+    },
+    {
+      "epoch": 0.03464624361779723,
+      "grad_norm": 0.24910978972911835,
+      "learning_rate": 0.00019342345633905736,
+      "loss": 0.8183,
+      "step": 95
+    },
+    {
+      "epoch": 0.0350109409190372,
+      "grad_norm": 0.23724719882011414,
+      "learning_rate": 0.00019335038363171357,
+      "loss": 0.7408,
+      "step": 96
+    },
+    {
+      "epoch": 0.03537563822027717,
+      "grad_norm": 0.2057395875453949,
+      "learning_rate": 0.00019327731092436975,
+      "loss": 0.5099,
+      "step": 97
+    },
+    {
+      "epoch": 0.03574033552151714,
+      "grad_norm": 0.22540345788002014,
+      "learning_rate": 0.00019320423821702595,
+      "loss": 0.7391,
+      "step": 98
+    },
+    {
+      "epoch": 0.036105032822757115,
+      "grad_norm": 0.2615845799446106,
+      "learning_rate": 0.00019313116550968214,
+      "loss": 0.7005,
+      "step": 99
+    },
+    {
+      "epoch": 0.03646973012399708,
+      "grad_norm": 0.19165730476379395,
+      "learning_rate": 0.00019305809280233832,
+      "loss": 0.4355,
+      "step": 100
+    },
+    {
+      "epoch": 0.03683442742523705,
+      "grad_norm": 0.22737336158752441,
+      "learning_rate": 0.00019298502009499453,
+      "loss": 0.7579,
+      "step": 101
+    },
+    {
+      "epoch": 0.037199124726477024,
+      "grad_norm": 0.21350006759166718,
+      "learning_rate": 0.00019291194738765073,
+      "loss": 0.6608,
+      "step": 102
+    },
+    {
+      "epoch": 0.03756382202771699,
+      "grad_norm": 0.24027219414710999,
+      "learning_rate": 0.00019283887468030692,
+      "loss": 0.8235,
+      "step": 103
+    },
+    {
+      "epoch": 0.03792851932895697,
+      "grad_norm": 0.21983422338962555,
+      "learning_rate": 0.0001927658019729631,
+      "loss": 0.5319,
+      "step": 104
+    },
+    {
+      "epoch": 0.038293216630196934,
+      "grad_norm": 0.26618310809135437,
+      "learning_rate": 0.0001926927292656193,
+      "loss": 0.9259,
+      "step": 105
+    },
+    {
+      "epoch": 0.03865791393143691,
+      "grad_norm": 0.24729526042938232,
+      "learning_rate": 0.0001926196565582755,
+      "loss": 0.7653,
+      "step": 106
+    },
+    {
+      "epoch": 0.039022611232676876,
+      "grad_norm": 0.2595866024494171,
+      "learning_rate": 0.0001925465838509317,
+      "loss": 0.7662,
+      "step": 107
+    },
+    {
+      "epoch": 0.03938730853391685,
+      "grad_norm": 0.16923396289348602,
+      "learning_rate": 0.00019247351114358788,
+      "loss": 0.3119,
+      "step": 108
+    },
+    {
+      "epoch": 0.03975200583515682,
+      "grad_norm": 0.2592317759990692,
+      "learning_rate": 0.00019240043843624406,
+      "loss": 0.6767,
+      "step": 109
+    },
+    {
+      "epoch": 0.04011670313639679,
+      "grad_norm": 0.2139894813299179,
+      "learning_rate": 0.00019232736572890027,
+      "loss": 0.5058,
+      "step": 110
+    },
+    {
+      "epoch": 0.04048140043763676,
+      "grad_norm": 0.2439870685338974,
+      "learning_rate": 0.00019225429302155647,
+      "loss": 0.807,
+      "step": 111
+    },
+    {
+      "epoch": 0.040846097738876735,
+      "grad_norm": 0.26212504506111145,
+      "learning_rate": 0.00019218122031421263,
+      "loss": 0.9942,
+      "step": 112
+    },
+    {
+      "epoch": 0.0412107950401167,
+      "grad_norm": 0.26018884778022766,
+      "learning_rate": 0.00019210814760686884,
+      "loss": 0.6573,
+      "step": 113
+    },
+    {
+      "epoch": 0.04157549234135667,
+      "grad_norm": 0.20158423483371735,
+      "learning_rate": 0.00019203507489952505,
+      "loss": 0.4533,
+      "step": 114
+    },
+    {
+      "epoch": 0.041940189642596645,
+      "grad_norm": 0.2270892709493637,
+      "learning_rate": 0.00019196200219218123,
+      "loss": 0.5983,
+      "step": 115
+    },
+    {
+      "epoch": 0.04230488694383661,
+      "grad_norm": 0.2140335738658905,
+      "learning_rate": 0.00019188892948483744,
+      "loss": 0.5686,
+      "step": 116
+    },
+    {
+      "epoch": 0.04266958424507659,
+      "grad_norm": 0.25761061906814575,
+      "learning_rate": 0.00019181585677749362,
+      "loss": 0.7238,
+      "step": 117
+    },
+    {
+      "epoch": 0.043034281546316555,
+      "grad_norm": 0.24370697140693665,
+      "learning_rate": 0.0001917427840701498,
+      "loss": 0.7342,
+      "step": 118
+    },
+    {
+      "epoch": 0.04339897884755653,
+      "grad_norm": 0.2632579207420349,
+      "learning_rate": 0.000191669711362806,
+      "loss": 0.8978,
+      "step": 119
+    },
+    {
+      "epoch": 0.0437636761487965,
+      "grad_norm": 0.22956986725330353,
+      "learning_rate": 0.00019159663865546221,
+      "loss": 0.6409,
+      "step": 120
+    },
+    {
+      "epoch": 0.04412837345003647,
+      "grad_norm": 0.24121011793613434,
+      "learning_rate": 0.00019152356594811837,
+      "loss": 0.7556,
+      "step": 121
+    },
+    {
+      "epoch": 0.04449307075127644,
+      "grad_norm": 0.2375144511461258,
+      "learning_rate": 0.00019145049324077458,
+      "loss": 0.7303,
+      "step": 122
+    },
+    {
+      "epoch": 0.044857768052516414,
+      "grad_norm": 0.22452694177627563,
+      "learning_rate": 0.00019137742053343079,
+      "loss": 0.6456,
+      "step": 123
+    },
+    {
+      "epoch": 0.04522246535375638,
+      "grad_norm": 0.2590137720108032,
+      "learning_rate": 0.00019130434782608697,
+      "loss": 0.7438,
+      "step": 124
+    },
+    {
+      "epoch": 0.045587162654996356,
+      "grad_norm": 0.2955920696258545,
+      "learning_rate": 0.00019123127511874318,
+      "loss": 1.0269,
+      "step": 125
+    },
+    {
+      "epoch": 0.045951859956236324,
+      "grad_norm": 0.21322080492973328,
+      "learning_rate": 0.00019115820241139936,
+      "loss": 0.6143,
+      "step": 126
+    },
+    {
+      "epoch": 0.04631655725747629,
+      "grad_norm": 0.20497886836528778,
+      "learning_rate": 0.00019108512970405554,
+      "loss": 0.5871,
+      "step": 127
+    },
+    {
+      "epoch": 0.046681254558716266,
+      "grad_norm": 0.26355209946632385,
+      "learning_rate": 0.00019101205699671175,
+      "loss": 0.8077,
+      "step": 128
+    },
+    {
+      "epoch": 0.047045951859956234,
+      "grad_norm": 0.21869653463363647,
+      "learning_rate": 0.00019093898428936793,
+      "loss": 0.6736,
+      "step": 129
+    },
+    {
+      "epoch": 0.04741064916119621,
+      "grad_norm": 0.21741290390491486,
+      "learning_rate": 0.0001908659115820241,
+      "loss": 0.597,
+      "step": 130
+    },
+    {
+      "epoch": 0.047775346462436176,
+      "grad_norm": 0.2503755986690521,
+      "learning_rate": 0.00019079283887468032,
+      "loss": 0.8038,
+      "step": 131
+    },
+    {
+      "epoch": 0.04814004376367615,
+      "grad_norm": 0.2087285965681076,
+      "learning_rate": 0.0001907197661673365,
+      "loss": 0.5409,
+      "step": 132
+    },
+    {
+      "epoch": 0.04850474106491612,
+      "grad_norm": 0.2347562611103058,
+      "learning_rate": 0.0001906466934599927,
+      "loss": 0.6741,
+      "step": 133
+    },
+    {
+      "epoch": 0.04886943836615609,
+      "grad_norm": 0.23479056358337402,
+      "learning_rate": 0.0001905736207526489,
+      "loss": 0.7257,
+      "step": 134
+    },
+    {
+      "epoch": 0.04923413566739606,
+      "grad_norm": 0.2217235416173935,
+      "learning_rate": 0.00019050054804530507,
+      "loss": 0.6539,
+      "step": 135
+    },
+    {
+      "epoch": 0.049598832968636035,
+      "grad_norm": 0.23342272639274597,
+      "learning_rate": 0.00019042747533796128,
+      "loss": 0.6716,
+      "step": 136
+    },
+    {
+      "epoch": 0.049963530269876,
+      "grad_norm": 0.231741800904274,
+      "learning_rate": 0.00019035440263061749,
+      "loss": 0.6563,
+      "step": 137
+    },
+    {
+      "epoch": 0.05032822757111598,
+      "grad_norm": 0.2353263646364212,
+      "learning_rate": 0.00019028132992327367,
+      "loss": 0.7612,
+      "step": 138
+    },
+    {
+      "epoch": 0.050692924872355945,
+      "grad_norm": 0.21605569124221802,
+      "learning_rate": 0.00019020825721592985,
+      "loss": 0.6007,
+      "step": 139
+    },
+    {
+      "epoch": 0.05105762217359591,
+      "grad_norm": 0.25579389929771423,
+      "learning_rate": 0.00019013518450858606,
+      "loss": 0.8586,
+      "step": 140
+    },
+    {
+      "epoch": 0.05142231947483589,
+      "grad_norm": 0.24197165668010712,
+      "learning_rate": 0.00019006211180124224,
+      "loss": 0.7442,
+      "step": 141
+    },
+    {
+      "epoch": 0.051787016776075855,
+      "grad_norm": 0.21442811191082,
+      "learning_rate": 0.00018998903909389845,
+      "loss": 0.6356,
+      "step": 142
+    },
+    {
+      "epoch": 0.05215171407731583,
+      "grad_norm": 0.23524148762226105,
+      "learning_rate": 0.00018991596638655463,
+      "loss": 0.8438,
+      "step": 143
+    },
+    {
+      "epoch": 0.0525164113785558,
+      "grad_norm": 0.1977744698524475,
+      "learning_rate": 0.0001898428936792108,
+      "loss": 0.6249,
+      "step": 144
+    },
+    {
+      "epoch": 0.05288110867979577,
+      "grad_norm": 0.22285562753677368,
+      "learning_rate": 0.00018976982097186702,
+      "loss": 0.6603,
+      "step": 145
+    },
+    {
+      "epoch": 0.05324580598103574,
+      "grad_norm": 0.223031684756279,
+      "learning_rate": 0.00018969674826452323,
+      "loss": 0.7324,
+      "step": 146
+    },
+    {
+      "epoch": 0.053610503282275714,
+      "grad_norm": 0.200596883893013,
+      "learning_rate": 0.00018962367555717938,
+      "loss": 0.546,
+      "step": 147
+    },
+    {
+      "epoch": 0.05397520058351568,
+      "grad_norm": 0.23952823877334595,
+      "learning_rate": 0.0001895506028498356,
+      "loss": 0.909,
+      "step": 148
+    },
+    {
+      "epoch": 0.054339897884755656,
+      "grad_norm": 0.15994083881378174,
+      "learning_rate": 0.0001894775301424918,
+      "loss": 0.3073,
+      "step": 149
+    },
+    {
+      "epoch": 0.05470459518599562,
+      "grad_norm": 0.2209351807832718,
+      "learning_rate": 0.00018940445743514798,
+      "loss": 0.6757,
+      "step": 150
+    },
+    {
+      "epoch": 0.05506929248723559,
+      "grad_norm": 0.19749605655670166,
+      "learning_rate": 0.0001893313847278042,
+      "loss": 0.5071,
+      "step": 151
+    },
+    {
+      "epoch": 0.055433989788475566,
+      "grad_norm": 0.23921646177768707,
+      "learning_rate": 0.00018925831202046037,
+      "loss": 0.7567,
+      "step": 152
+    },
+    {
+      "epoch": 0.05579868708971553,
+      "grad_norm": 0.21612580120563507,
+      "learning_rate": 0.00018918523931311655,
+      "loss": 0.6329,
+      "step": 153
+    },
+    {
+      "epoch": 0.05616338439095551,
+      "grad_norm": 0.21304431557655334,
+      "learning_rate": 0.00018911216660577276,
+      "loss": 0.6612,
+      "step": 154
+    },
+    {
+      "epoch": 0.056528081692195475,
+      "grad_norm": 0.23509572446346283,
+      "learning_rate": 0.00018903909389842897,
+      "loss": 0.6723,
+      "step": 155
+    },
+    {
+      "epoch": 0.05689277899343545,
+      "grad_norm": 0.23066239058971405,
+      "learning_rate": 0.00018896602119108512,
+      "loss": 0.7588,
+      "step": 156
+    },
+    {
+      "epoch": 0.05725747629467542,
+      "grad_norm": 0.2865854799747467,
+      "learning_rate": 0.00018889294848374133,
+      "loss": 1.0649,
+      "step": 157
+    },
+    {
+      "epoch": 0.05762217359591539,
+      "grad_norm": 0.21136869490146637,
+      "learning_rate": 0.00018881987577639754,
+      "loss": 0.6532,
+      "step": 158
+    },
+    {
+      "epoch": 0.05798687089715536,
+      "grad_norm": 0.2841559648513794,
+      "learning_rate": 0.00018874680306905372,
+      "loss": 1.047,
+      "step": 159
+    },
+    {
+      "epoch": 0.058351568198395334,
+      "grad_norm": 0.23411215841770172,
+      "learning_rate": 0.0001886737303617099,
+      "loss": 0.6954,
+      "step": 160
+    },
+    {
+      "epoch": 0.0587162654996353,
+      "grad_norm": 0.25536251068115234,
+      "learning_rate": 0.0001886006576543661,
+      "loss": 0.8371,
+      "step": 161
+    },
+    {
+      "epoch": 0.05908096280087528,
+      "grad_norm": 0.24174682796001434,
+      "learning_rate": 0.0001885275849470223,
+      "loss": 0.6944,
+      "step": 162
+    },
+    {
+      "epoch": 0.059445660102115244,
+      "grad_norm": 0.21550963819026947,
+      "learning_rate": 0.0001884545122396785,
+      "loss": 0.6978,
+      "step": 163
+    },
+    {
+      "epoch": 0.05981035740335521,
+      "grad_norm": 0.1993122398853302,
+      "learning_rate": 0.00018838143953233468,
+      "loss": 0.4768,
+      "step": 164
+    },
+    {
+      "epoch": 0.060175054704595186,
+      "grad_norm": 0.22367194294929504,
+      "learning_rate": 0.00018830836682499086,
+      "loss": 0.6238,
+      "step": 165
+    },
+    {
+      "epoch": 0.060539752005835154,
+      "grad_norm": 0.20643705129623413,
+      "learning_rate": 0.00018823529411764707,
+      "loss": 0.598,
+      "step": 166
+    },
+    {
+      "epoch": 0.06090444930707513,
+      "grad_norm": 0.21671368181705475,
+      "learning_rate": 0.00018816222141030325,
+      "loss": 0.7071,
+      "step": 167
+    },
+    {
+      "epoch": 0.061269146608315096,
+      "grad_norm": 0.19023250043392181,
+      "learning_rate": 0.00018808914870295946,
+      "loss": 0.4923,
+      "step": 168
+    },
+    {
+      "epoch": 0.06163384390955507,
+      "grad_norm": 0.26334115862846375,
+      "learning_rate": 0.00018801607599561564,
+      "loss": 0.8478,
+      "step": 169
+    },
+    {
+      "epoch": 0.06199854121079504,
+      "grad_norm": 0.2204032838344574,
+      "learning_rate": 0.00018794300328827182,
+      "loss": 0.6593,
+      "step": 170
+    },
+    {
+      "epoch": 0.06236323851203501,
+      "grad_norm": 0.20991367101669312,
+      "learning_rate": 0.00018786993058092803,
+      "loss": 0.6712,
+      "step": 171
+    },
+    {
+      "epoch": 0.06272793581327499,
+      "grad_norm": 0.2253231555223465,
+      "learning_rate": 0.00018779685787358424,
+      "loss": 0.6589,
+      "step": 172
+    },
+    {
+      "epoch": 0.06309263311451495,
+      "grad_norm": 0.1936040222644806,
+      "learning_rate": 0.00018772378516624042,
+      "loss": 0.515,
+      "step": 173
+    },
+    {
+      "epoch": 0.06345733041575492,
+      "grad_norm": 0.24025693535804749,
+      "learning_rate": 0.0001876507124588966,
+      "loss": 0.8256,
+      "step": 174
+    },
+    {
+      "epoch": 0.0638220277169949,
+      "grad_norm": 0.2465592622756958,
+      "learning_rate": 0.0001875776397515528,
+      "loss": 0.9898,
+      "step": 175
+    },
+    {
+      "epoch": 0.06418672501823487,
+      "grad_norm": 0.20442670583724976,
+      "learning_rate": 0.000187504567044209,
+      "loss": 0.6724,
+      "step": 176
+    },
+    {
+      "epoch": 0.06455142231947483,
+      "grad_norm": 0.19670630991458893,
+      "learning_rate": 0.0001874314943368652,
+      "loss": 0.6449,
+      "step": 177
+    },
+    {
+      "epoch": 0.06491611962071481,
+      "grad_norm": 0.1966039091348648,
+      "learning_rate": 0.00018735842162952138,
+      "loss": 0.787,
+      "step": 178
+    },
+    {
+      "epoch": 0.06528081692195478,
+      "grad_norm": 0.21741807460784912,
+      "learning_rate": 0.00018728534892217756,
+      "loss": 0.8073,
+      "step": 179
+    },
+    {
+      "epoch": 0.06564551422319474,
+      "grad_norm": 0.20191621780395508,
+      "learning_rate": 0.00018721227621483377,
+      "loss": 0.6297,
+      "step": 180
+    },
+    {
+      "epoch": 0.06601021152443472,
+      "grad_norm": 0.21392996609210968,
+      "learning_rate": 0.00018713920350748998,
+      "loss": 0.6611,
+      "step": 181
+    },
+    {
+      "epoch": 0.06637490882567469,
+      "grad_norm": 0.21892090141773224,
+      "learning_rate": 0.00018706613080014613,
+      "loss": 0.7913,
+      "step": 182
+    },
+    {
+      "epoch": 0.06673960612691467,
+      "grad_norm": 0.240064337849617,
+      "learning_rate": 0.00018699305809280234,
+      "loss": 0.74,
+      "step": 183
+    },
+    {
+      "epoch": 0.06710430342815463,
+      "grad_norm": 0.2156010866165161,
+      "learning_rate": 0.00018691998538545855,
+      "loss": 0.6928,
+      "step": 184
+    },
+    {
+      "epoch": 0.0674690007293946,
+      "grad_norm": 0.19305868446826935,
+      "learning_rate": 0.00018684691267811473,
+      "loss": 0.6712,
+      "step": 185
+    },
+    {
+      "epoch": 0.06783369803063458,
+      "grad_norm": 0.24134770035743713,
+      "learning_rate": 0.00018677383997077094,
+      "loss": 0.7784,
+      "step": 186
+    },
+    {
+      "epoch": 0.06819839533187455,
+      "grad_norm": 0.21951039135456085,
+      "learning_rate": 0.00018670076726342712,
+      "loss": 0.7536,
+      "step": 187
+    },
+    {
+      "epoch": 0.06856309263311451,
+      "grad_norm": 0.22668084502220154,
+      "learning_rate": 0.0001866276945560833,
+      "loss": 0.7914,
+      "step": 188
+    },
+    {
+      "epoch": 0.06892778993435449,
+      "grad_norm": 0.23703083395957947,
+      "learning_rate": 0.0001865546218487395,
+      "loss": 0.8562,
+      "step": 189
+    },
+    {
+      "epoch": 0.06929248723559446,
+      "grad_norm": 0.18408019840717316,
+      "learning_rate": 0.00018648154914139572,
+      "loss": 0.5408,
+      "step": 190
+    },
+    {
+      "epoch": 0.06965718453683442,
+      "grad_norm": 0.168971985578537,
+      "learning_rate": 0.00018640847643405187,
+      "loss": 0.5145,
+      "step": 191
+    },
+    {
+      "epoch": 0.0700218818380744,
+      "grad_norm": 0.2313617318868637,
+      "learning_rate": 0.00018633540372670808,
+      "loss": 0.7116,
+      "step": 192
+    },
+    {
+      "epoch": 0.07038657913931437,
+      "grad_norm": 0.22079357504844666,
+      "learning_rate": 0.0001862623310193643,
+      "loss": 0.7453,
+      "step": 193
+    },
+    {
+      "epoch": 0.07075127644055434,
+      "grad_norm": 0.20398060977458954,
+      "learning_rate": 0.00018618925831202047,
+      "loss": 0.6914,
+      "step": 194
+    },
+    {
+      "epoch": 0.0711159737417943,
+      "grad_norm": 0.21486474573612213,
+      "learning_rate": 0.00018611618560467665,
+      "loss": 0.7252,
+      "step": 195
+    },
+    {
+      "epoch": 0.07148067104303428,
+      "grad_norm": 0.21152055263519287,
+      "learning_rate": 0.00018604311289733286,
+      "loss": 0.676,
+      "step": 196
+    },
+    {
+      "epoch": 0.07184536834427425,
+      "grad_norm": 0.2242351919412613,
+      "learning_rate": 0.00018597004018998904,
+      "loss": 0.8428,
+      "step": 197
+    },
+    {
+      "epoch": 0.07221006564551423,
+      "grad_norm": 0.19831842184066772,
+      "learning_rate": 0.00018589696748264525,
+      "loss": 0.6379,
+      "step": 198
+    },
+    {
+      "epoch": 0.07257476294675419,
+      "grad_norm": 0.2057400494813919,
+      "learning_rate": 0.00018582389477530143,
+      "loss": 0.7417,
+      "step": 199
+    },
+    {
+      "epoch": 0.07293946024799416,
+      "grad_norm": 0.17779530584812164,
+      "learning_rate": 0.0001857508220679576,
+      "loss": 0.6891,
+      "step": 200
+    },
+    {
+      "epoch": 0.07330415754923414,
+      "grad_norm": 0.20354579389095306,
+      "learning_rate": 0.00018567774936061382,
+      "loss": 0.7052,
+      "step": 201
+    },
+    {
+      "epoch": 0.0736688548504741,
+      "grad_norm": 0.17558813095092773,
+      "learning_rate": 0.00018560467665327,
+      "loss": 0.6295,
+      "step": 202
+    },
+    {
+      "epoch": 0.07403355215171407,
+      "grad_norm": 0.1900980919599533,
+      "learning_rate": 0.0001855316039459262,
+      "loss": 0.6312,
+      "step": 203
+    },
+    {
+      "epoch": 0.07439824945295405,
+      "grad_norm": 0.2224852293729782,
+      "learning_rate": 0.0001854585312385824,
+      "loss": 0.7633,
+      "step": 204
+    },
+    {
+      "epoch": 0.07476294675419402,
+      "grad_norm": 0.2300332933664322,
+      "learning_rate": 0.00018538545853123857,
+      "loss": 0.8564,
+      "step": 205
+    },
+    {
+      "epoch": 0.07512764405543398,
+      "grad_norm": 0.24671219289302826,
+      "learning_rate": 0.00018531238582389478,
+      "loss": 0.9927,
+      "step": 206
+    },
+    {
+      "epoch": 0.07549234135667396,
+      "grad_norm": 0.1760961413383484,
+      "learning_rate": 0.000185239313116551,
+      "loss": 0.5943,
+      "step": 207
+    },
+    {
+      "epoch": 0.07585703865791393,
+      "grad_norm": 0.25457900762557983,
+      "learning_rate": 0.00018516624040920717,
+      "loss": 1.0929,
+      "step": 208
+    },
+    {
+      "epoch": 0.07622173595915391,
+      "grad_norm": 0.1756330132484436,
+      "learning_rate": 0.00018509316770186335,
+      "loss": 0.5191,
+      "step": 209
+    },
+    {
+      "epoch": 0.07658643326039387,
+      "grad_norm": 0.22464942932128906,
+      "learning_rate": 0.00018502009499451956,
+      "loss": 0.9786,
+      "step": 210
+    },
+    {
+      "epoch": 0.07695113056163384,
+      "grad_norm": 0.17986498773097992,
+      "learning_rate": 0.00018494702228717574,
+      "loss": 0.5692,
+      "step": 211
+    },
+    {
+      "epoch": 0.07731582786287382,
+      "grad_norm": 0.20685848593711853,
+      "learning_rate": 0.00018487394957983195,
+      "loss": 0.7599,
+      "step": 212
+    },
+    {
+      "epoch": 0.07768052516411379,
+      "grad_norm": 0.19135037064552307,
+      "learning_rate": 0.00018480087687248813,
+      "loss": 0.5762,
+      "step": 213
+    },
+    {
+      "epoch": 0.07804522246535375,
+      "grad_norm": 0.18071472644805908,
+      "learning_rate": 0.00018472780416514431,
+      "loss": 0.4915,
+      "step": 214
+    },
+    {
+      "epoch": 0.07840991976659373,
+      "grad_norm": 0.2219233512878418,
+      "learning_rate": 0.00018465473145780052,
+      "loss": 0.9144,
+      "step": 215
+    },
+    {
+      "epoch": 0.0787746170678337,
+      "grad_norm": 0.20180021226406097,
+      "learning_rate": 0.00018458165875045673,
+      "loss": 0.753,
+      "step": 216
+    },
+    {
+      "epoch": 0.07913931436907366,
+      "grad_norm": 0.18585549294948578,
+      "learning_rate": 0.00018450858604311289,
+      "loss": 0.6177,
+      "step": 217
+    },
+    {
+      "epoch": 0.07950401167031364,
+      "grad_norm": 0.21946971118450165,
+      "learning_rate": 0.0001844355133357691,
+      "loss": 0.8311,
+      "step": 218
+    },
+    {
+      "epoch": 0.07986870897155361,
+      "grad_norm": 0.22317932546138763,
+      "learning_rate": 0.0001843624406284253,
+      "loss": 0.831,
+      "step": 219
+    },
+    {
+      "epoch": 0.08023340627279359,
+      "grad_norm": 0.20973660051822662,
+      "learning_rate": 0.00018428936792108148,
+      "loss": 0.718,
+      "step": 220
+    },
+    {
+      "epoch": 0.08059810357403355,
+      "grad_norm": 0.19049975275993347,
+      "learning_rate": 0.0001842162952137377,
+      "loss": 0.6143,
+      "step": 221
+    },
+    {
+      "epoch": 0.08096280087527352,
+      "grad_norm": 0.22633568942546844,
+      "learning_rate": 0.00018414322250639387,
+      "loss": 0.7479,
+      "step": 222
+    },
+    {
+      "epoch": 0.0813274981765135,
+      "grad_norm": 0.22375738620758057,
+      "learning_rate": 0.00018407014979905005,
+      "loss": 0.783,
+      "step": 223
+    },
+    {
+      "epoch": 0.08169219547775347,
+      "grad_norm": 0.20708216726779938,
+      "learning_rate": 0.00018399707709170626,
+      "loss": 0.7068,
+      "step": 224
+    },
+    {
+      "epoch": 0.08205689277899343,
+      "grad_norm": 0.2057557851076126,
+      "learning_rate": 0.00018392400438436247,
+      "loss": 0.7246,
+      "step": 225
+    },
+    {
+      "epoch": 0.0824215900802334,
+      "grad_norm": 0.21535396575927734,
+      "learning_rate": 0.00018385093167701863,
+      "loss": 0.6725,
+      "step": 226
+    },
+    {
+      "epoch": 0.08278628738147338,
+      "grad_norm": 0.17876707017421722,
+      "learning_rate": 0.00018377785896967483,
+      "loss": 0.5595,
+      "step": 227
+    },
+    {
+      "epoch": 0.08315098468271334,
+      "grad_norm": 0.17822523415088654,
+      "learning_rate": 0.00018370478626233104,
+      "loss": 0.5312,
+      "step": 228
+    },
+    {
+      "epoch": 0.08351568198395332,
+      "grad_norm": 0.24496309459209442,
+      "learning_rate": 0.00018363171355498722,
+      "loss": 1.0328,
+      "step": 229
+    },
+    {
+      "epoch": 0.08388037928519329,
+      "grad_norm": 0.2105628103017807,
+      "learning_rate": 0.0001835586408476434,
+      "loss": 0.6597,
+      "step": 230
+    },
+    {
+      "epoch": 0.08424507658643327,
+      "grad_norm": 0.1954609751701355,
+      "learning_rate": 0.0001834855681402996,
+      "loss": 0.7691,
+      "step": 231
+    },
+    {
+      "epoch": 0.08460977388767323,
+      "grad_norm": 0.18443147838115692,
+      "learning_rate": 0.0001834124954329558,
+      "loss": 0.634,
+      "step": 232
+    },
+    {
+      "epoch": 0.0849744711889132,
+      "grad_norm": 0.20477977395057678,
+      "learning_rate": 0.000183339422725612,
+      "loss": 0.6758,
+      "step": 233
+    },
+    {
+      "epoch": 0.08533916849015317,
+      "grad_norm": 0.21266460418701172,
+      "learning_rate": 0.00018326635001826818,
+      "loss": 0.9446,
+      "step": 234
+    },
+    {
+      "epoch": 0.08570386579139315,
+      "grad_norm": 0.2169213891029358,
+      "learning_rate": 0.00018319327731092437,
+      "loss": 0.755,
+      "step": 235
+    },
+    {
+      "epoch": 0.08606856309263311,
+      "grad_norm": 0.19145172834396362,
+      "learning_rate": 0.00018312020460358057,
+      "loss": 0.7846,
+      "step": 236
+    },
+    {
+      "epoch": 0.08643326039387308,
+      "grad_norm": 0.1787746399641037,
+      "learning_rate": 0.00018304713189623676,
+      "loss": 0.6359,
+      "step": 237
+    },
+    {
+      "epoch": 0.08679795769511306,
+      "grad_norm": 0.21680796146392822,
+      "learning_rate": 0.00018297405918889296,
+      "loss": 0.6408,
+      "step": 238
+    },
+    {
+      "epoch": 0.08716265499635302,
+      "grad_norm": 0.20149710774421692,
+      "learning_rate": 0.00018290098648154914,
+      "loss": 0.695,
+      "step": 239
+    },
+    {
+      "epoch": 0.087527352297593,
+      "grad_norm": 0.20793262124061584,
+      "learning_rate": 0.00018282791377420533,
+      "loss": 0.7159,
+      "step": 240
+    },
+    {
+      "epoch": 0.08789204959883297,
+      "grad_norm": 0.21236906945705414,
+      "learning_rate": 0.00018275484106686153,
+      "loss": 0.8924,
+      "step": 241
+    },
+    {
+      "epoch": 0.08825674690007294,
+      "grad_norm": 0.17754322290420532,
+      "learning_rate": 0.00018268176835951774,
+      "loss": 0.5931,
+      "step": 242
+    },
+    {
+      "epoch": 0.0886214442013129,
+      "grad_norm": 0.21361956000328064,
+      "learning_rate": 0.00018260869565217392,
+      "loss": 0.852,
+      "step": 243
+    },
+    {
+      "epoch": 0.08898614150255288,
+      "grad_norm": 0.19755081832408905,
+      "learning_rate": 0.0001825356229448301,
+      "loss": 0.7846,
+      "step": 244
+    },
+    {
+      "epoch": 0.08935083880379285,
+      "grad_norm": 0.17052070796489716,
+      "learning_rate": 0.00018246255023748631,
+      "loss": 0.4757,
+      "step": 245
+    },
+    {
+      "epoch": 0.08971553610503283,
+      "grad_norm": 0.17633679509162903,
+      "learning_rate": 0.0001823894775301425,
+      "loss": 0.3075,
+      "step": 246
+    },
+    {
+      "epoch": 0.09008023340627279,
+      "grad_norm": 0.18541620671749115,
+      "learning_rate": 0.0001823164048227987,
+      "loss": 0.6268,
+      "step": 247
+    },
+    {
+      "epoch": 0.09044493070751276,
+      "grad_norm": 0.18999099731445312,
+      "learning_rate": 0.00018224333211545488,
+      "loss": 0.6236,
+      "step": 248
+    },
+    {
+      "epoch": 0.09080962800875274,
+      "grad_norm": 0.17926354706287384,
+      "learning_rate": 0.00018217025940811107,
+      "loss": 0.6153,
+      "step": 249
+    },
+    {
+      "epoch": 0.09117432530999271,
+      "grad_norm": 0.161162868142128,
+      "learning_rate": 0.00018209718670076727,
+      "loss": 0.436,
+      "step": 250
+    },
+    {
+      "epoch": 0.09153902261123267,
+      "grad_norm": 0.17199409008026123,
+      "learning_rate": 0.00018202411399342348,
+      "loss": 0.5008,
+      "step": 251
+    },
+    {
+      "epoch": 0.09190371991247265,
+      "grad_norm": 0.20446273684501648,
+      "learning_rate": 0.00018195104128607964,
+      "loss": 0.8099,
+      "step": 252
+    },
+    {
+      "epoch": 0.09226841721371262,
+      "grad_norm": 0.22668351233005524,
+      "learning_rate": 0.00018187796857873585,
+      "loss": 0.8039,
+      "step": 253
+    },
+    {
+      "epoch": 0.09263311451495258,
+      "grad_norm": 0.20580340921878815,
+      "learning_rate": 0.00018180489587139205,
+      "loss": 0.8551,
+      "step": 254
+    },
+    {
+      "epoch": 0.09299781181619256,
+      "grad_norm": 0.20698761940002441,
+      "learning_rate": 0.00018173182316404824,
+      "loss": 0.7958,
+      "step": 255
+    },
+    {
+      "epoch": 0.09336250911743253,
+      "grad_norm": 0.18675172328948975,
+      "learning_rate": 0.00018165875045670444,
+      "loss": 0.6952,
+      "step": 256
+    },
+    {
+      "epoch": 0.0937272064186725,
+      "grad_norm": 0.21192388236522675,
+      "learning_rate": 0.00018158567774936063,
+      "loss": 0.906,
+      "step": 257
+    },
+    {
+      "epoch": 0.09409190371991247,
+      "grad_norm": 0.20454420149326324,
+      "learning_rate": 0.0001815126050420168,
+      "loss": 0.8052,
+      "step": 258
+    },
+    {
+      "epoch": 0.09445660102115244,
+      "grad_norm": 0.2223392277956009,
+      "learning_rate": 0.00018143953233467301,
+      "loss": 1.0307,
+      "step": 259
+    },
+    {
+      "epoch": 0.09482129832239242,
+      "grad_norm": 0.1847476363182068,
+      "learning_rate": 0.00018136645962732922,
+      "loss": 0.7384,
+      "step": 260
+    },
+    {
+      "epoch": 0.09518599562363239,
+      "grad_norm": 0.1858607977628708,
+      "learning_rate": 0.00018129338691998538,
+      "loss": 0.6157,
+      "step": 261
+    },
+    {
+      "epoch": 0.09555069292487235,
+      "grad_norm": 0.19958609342575073,
+      "learning_rate": 0.00018122031421264159,
+      "loss": 0.7196,
+      "step": 262
+    },
+    {
+      "epoch": 0.09591539022611233,
+      "grad_norm": 0.22875526547431946,
+      "learning_rate": 0.0001811472415052978,
+      "loss": 0.864,
+      "step": 263
+    },
+    {
+      "epoch": 0.0962800875273523,
+      "grad_norm": 0.15967297554016113,
+      "learning_rate": 0.00018107416879795398,
+      "loss": 0.5185,
+      "step": 264
+    },
+    {
+      "epoch": 0.09664478482859226,
+      "grad_norm": 0.15776892006397247,
+      "learning_rate": 0.00018100109609061016,
+      "loss": 0.5136,
+      "step": 265
+    },
+    {
+      "epoch": 0.09700948212983224,
+      "grad_norm": 0.17968744039535522,
+      "learning_rate": 0.00018092802338326637,
+      "loss": 0.6538,
+      "step": 266
+    },
+    {
+      "epoch": 0.09737417943107221,
+      "grad_norm": 0.19834838807582855,
+      "learning_rate": 0.00018085495067592255,
+      "loss": 0.7448,
+      "step": 267
+    },
+    {
+      "epoch": 0.09773887673231219,
+      "grad_norm": 0.18585754930973053,
+      "learning_rate": 0.00018078187796857875,
+      "loss": 0.6166,
+      "step": 268
+    },
+    {
+      "epoch": 0.09810357403355215,
+      "grad_norm": 0.21760611236095428,
+      "learning_rate": 0.00018070880526123494,
+      "loss": 0.9431,
+      "step": 269
+    },
+    {
+      "epoch": 0.09846827133479212,
+      "grad_norm": 0.17871254682540894,
+      "learning_rate": 0.00018063573255389112,
+      "loss": 0.5939,
+      "step": 270
+    },
+    {
+      "epoch": 0.0988329686360321,
+      "grad_norm": 0.16932818293571472,
+      "learning_rate": 0.00018056265984654733,
+      "loss": 0.5615,
+      "step": 271
+    },
+    {
+      "epoch": 0.09919766593727207,
+      "grad_norm": 0.18163429200649261,
+      "learning_rate": 0.0001804895871392035,
+      "loss": 0.6572,
+      "step": 272
+    },
+    {
+      "epoch": 0.09956236323851203,
+      "grad_norm": 0.17625723779201508,
+      "learning_rate": 0.00018041651443185972,
+      "loss": 0.5748,
+      "step": 273
+    },
+    {
+      "epoch": 0.099927060539752,
+      "grad_norm": 0.19214142858982086,
+      "learning_rate": 0.0001803434417245159,
+      "loss": 0.7589,
+      "step": 274
+    },
+    {
+      "epoch": 0.10029175784099198,
+      "grad_norm": 0.19191130995750427,
+      "learning_rate": 0.00018027036901717208,
+      "loss": 0.7201,
+      "step": 275
+    },
+    {
+      "epoch": 0.10065645514223195,
+      "grad_norm": 0.18246126174926758,
+      "learning_rate": 0.0001801972963098283,
+      "loss": 0.7919,
+      "step": 276
+    },
+    {
+      "epoch": 0.10102115244347191,
+      "grad_norm": 0.17984451353549957,
+      "learning_rate": 0.0001801242236024845,
+      "loss": 0.6746,
+      "step": 277
+    },
+    {
+      "epoch": 0.10138584974471189,
+      "grad_norm": 0.18245050311088562,
+      "learning_rate": 0.00018005115089514068,
+      "loss": 0.5672,
+      "step": 278
+    },
+    {
+      "epoch": 0.10175054704595186,
+      "grad_norm": 0.1899084746837616,
+      "learning_rate": 0.00017997807818779686,
+      "loss": 0.6676,
+      "step": 279
+    },
+    {
+      "epoch": 0.10211524434719182,
+      "grad_norm": 0.21027837693691254,
+      "learning_rate": 0.00017990500548045307,
+      "loss": 0.8676,
+      "step": 280
+    },
+    {
+      "epoch": 0.1024799416484318,
+      "grad_norm": 0.178825244307518,
+      "learning_rate": 0.00017983193277310925,
+      "loss": 0.5617,
+      "step": 281
+    },
+    {
+      "epoch": 0.10284463894967177,
+      "grad_norm": 0.18551243841648102,
+      "learning_rate": 0.00017975886006576546,
+      "loss": 0.7091,
+      "step": 282
+    },
+    {
+      "epoch": 0.10320933625091175,
+      "grad_norm": 0.18075218796730042,
+      "learning_rate": 0.00017968578735842164,
+      "loss": 0.7194,
+      "step": 283
+    },
+    {
+      "epoch": 0.10357403355215171,
+      "grad_norm": 0.1333538293838501,
+      "learning_rate": 0.00017961271465107782,
+      "loss": 0.3161,
+      "step": 284
+    },
+    {
+      "epoch": 0.10393873085339168,
+      "grad_norm": 0.18160590529441833,
+      "learning_rate": 0.00017953964194373403,
+      "loss": 0.7355,
+      "step": 285
+    },
+    {
+      "epoch": 0.10430342815463166,
+      "grad_norm": 0.19034148752689362,
+      "learning_rate": 0.00017946656923639024,
+      "loss": 0.7279,
+      "step": 286
+    },
+    {
+      "epoch": 0.10466812545587163,
+      "grad_norm": 0.18591929972171783,
+      "learning_rate": 0.0001793934965290464,
+      "loss": 0.7353,
+      "step": 287
+    },
+    {
+      "epoch": 0.1050328227571116,
+      "grad_norm": 0.19063667953014374,
+      "learning_rate": 0.0001793204238217026,
+      "loss": 0.7322,
+      "step": 288
+    },
+    {
+      "epoch": 0.10539752005835157,
+      "grad_norm": 0.20110805332660675,
+      "learning_rate": 0.0001792473511143588,
+      "loss": 0.869,
+      "step": 289
+    },
+    {
+      "epoch": 0.10576221735959154,
+      "grad_norm": 0.18062824010849,
+      "learning_rate": 0.000179174278407015,
+      "loss": 0.6839,
+      "step": 290
+    },
+    {
+      "epoch": 0.1061269146608315,
+      "grad_norm": 0.21643158793449402,
+      "learning_rate": 0.00017910120569967117,
+      "loss": 0.7626,
+      "step": 291
+    },
+    {
+      "epoch": 0.10649161196207148,
+      "grad_norm": 0.20745426416397095,
+      "learning_rate": 0.00017902813299232738,
+      "loss": 0.8748,
+      "step": 292
+    },
+    {
+      "epoch": 0.10685630926331145,
+      "grad_norm": 0.1999918520450592,
+      "learning_rate": 0.00017895506028498356,
+      "loss": 0.7267,
+      "step": 293
+    },
+    {
+      "epoch": 0.10722100656455143,
+      "grad_norm": 0.17721255123615265,
+      "learning_rate": 0.00017888198757763977,
+      "loss": 0.788,
+      "step": 294
+    },
+    {
+      "epoch": 0.10758570386579139,
+      "grad_norm": 0.19578172266483307,
+      "learning_rate": 0.00017880891487029598,
+      "loss": 0.6801,
+      "step": 295
+    },
+    {
+      "epoch": 0.10795040116703136,
+      "grad_norm": 0.20748168230056763,
+      "learning_rate": 0.00017873584216295213,
+      "loss": 0.7624,
+      "step": 296
+    },
+    {
+      "epoch": 0.10831509846827134,
+      "grad_norm": 0.17874249815940857,
+      "learning_rate": 0.00017866276945560834,
+      "loss": 0.7029,
+      "step": 297
+    },
+    {
+      "epoch": 0.10867979576951131,
+      "grad_norm": 0.21527762711048126,
+      "learning_rate": 0.00017858969674826455,
+      "loss": 0.9119,
+      "step": 298
+    },
+    {
+      "epoch": 0.10904449307075127,
+      "grad_norm": 0.16523544490337372,
+      "learning_rate": 0.00017851662404092073,
+      "loss": 0.406,
+      "step": 299
+    },
+    {
+      "epoch": 0.10940919037199125,
+      "grad_norm": 0.18175430595874786,
+      "learning_rate": 0.0001784435513335769,
+      "loss": 0.6711,
+      "step": 300
+    },
+    {
+      "epoch": 0.10977388767323122,
+      "grad_norm": 0.19601181149482727,
+      "learning_rate": 0.00017837047862623312,
+      "loss": 0.7117,
+      "step": 301
+    },
+    {
+      "epoch": 0.11013858497447118,
+      "grad_norm": 0.1821203976869583,
+      "learning_rate": 0.0001782974059188893,
+      "loss": 0.6163,
+      "step": 302
+    },
+    {
+      "epoch": 0.11050328227571116,
+      "grad_norm": 0.1985776126384735,
+      "learning_rate": 0.0001782243332115455,
+      "loss": 0.6357,
+      "step": 303
+    },
+    {
+      "epoch": 0.11086797957695113,
+      "grad_norm": 0.18439283967018127,
+      "learning_rate": 0.0001781512605042017,
+      "loss": 0.6007,
+      "step": 304
+    },
+    {
+      "epoch": 0.1112326768781911,
+      "grad_norm": 0.19900138676166534,
+      "learning_rate": 0.00017807818779685787,
+      "loss": 0.794,
+      "step": 305
+    },
+    {
+      "epoch": 0.11159737417943107,
+      "grad_norm": 0.1659189611673355,
+      "learning_rate": 0.00017800511508951408,
+      "loss": 0.4824,
+      "step": 306
+    },
+    {
+      "epoch": 0.11196207148067104,
+      "grad_norm": 0.19788797199726105,
+      "learning_rate": 0.00017793204238217026,
+      "loss": 0.6806,
+      "step": 307
+    },
+    {
+      "epoch": 0.11232676878191102,
+      "grad_norm": 0.19467923045158386,
+      "learning_rate": 0.00017785896967482647,
+      "loss": 0.7515,
+      "step": 308
+    },
+    {
+      "epoch": 0.11269146608315099,
+      "grad_norm": 0.20231202244758606,
+      "learning_rate": 0.00017778589696748265,
+      "loss": 0.7302,
+      "step": 309
+    },
+    {
+      "epoch": 0.11305616338439095,
+      "grad_norm": 0.1800953596830368,
+      "learning_rate": 0.00017771282426013883,
+      "loss": 0.5432,
+      "step": 310
+    },
+    {
+      "epoch": 0.11342086068563093,
+      "grad_norm": 0.15572136640548706,
+      "learning_rate": 0.00017763975155279504,
+      "loss": 0.4674,
+      "step": 311
+    },
+    {
+      "epoch": 0.1137855579868709,
+      "grad_norm": 0.1948738545179367,
+      "learning_rate": 0.00017756667884545125,
+      "loss": 0.8442,
+      "step": 312
+    },
+    {
+      "epoch": 0.11415025528811087,
+      "grad_norm": 0.1789115071296692,
+      "learning_rate": 0.00017749360613810743,
+      "loss": 0.6009,
+      "step": 313
+    },
+    {
+      "epoch": 0.11451495258935084,
+      "grad_norm": 0.22927749156951904,
+      "learning_rate": 0.0001774205334307636,
+      "loss": 0.866,
+      "step": 314
+    },
+    {
+      "epoch": 0.11487964989059081,
+      "grad_norm": 0.2268659621477127,
+      "learning_rate": 0.00017734746072341982,
+      "loss": 0.918,
+      "step": 315
+    },
+    {
+      "epoch": 0.11524434719183078,
+      "grad_norm": 0.14317840337753296,
+      "learning_rate": 0.000177274388016076,
+      "loss": 0.3706,
+      "step": 316
+    },
+    {
+      "epoch": 0.11560904449307075,
+      "grad_norm": 0.1896270215511322,
+      "learning_rate": 0.0001772013153087322,
+      "loss": 0.7108,
+      "step": 317
+    },
+    {
+      "epoch": 0.11597374179431072,
+      "grad_norm": 0.18470154702663422,
+      "learning_rate": 0.0001771282426013884,
+      "loss": 0.8041,
+      "step": 318
+    },
+    {
+      "epoch": 0.1163384390955507,
+      "grad_norm": 0.1888645738363266,
+      "learning_rate": 0.00017705516989404457,
+      "loss": 0.7601,
+      "step": 319
+    },
+    {
+      "epoch": 0.11670313639679067,
+      "grad_norm": 0.19202347099781036,
+      "learning_rate": 0.00017698209718670078,
+      "loss": 0.6529,
+      "step": 320
+    },
+    {
+      "epoch": 0.11706783369803063,
+      "grad_norm": 0.18936879932880402,
+      "learning_rate": 0.000176909024479357,
+      "loss": 0.768,
+      "step": 321
+    },
+    {
+      "epoch": 0.1174325309992706,
+      "grad_norm": 0.18530336022377014,
+      "learning_rate": 0.00017683595177201314,
+      "loss": 0.7997,
+      "step": 322
+    },
+    {
+      "epoch": 0.11779722830051058,
+      "grad_norm": 0.1945243924856186,
+      "learning_rate": 0.00017676287906466935,
+      "loss": 0.8864,
+      "step": 323
+    },
+    {
+      "epoch": 0.11816192560175055,
+      "grad_norm": 0.16887742280960083,
+      "learning_rate": 0.00017668980635732556,
+      "loss": 0.8014,
+      "step": 324
+    },
+    {
+      "epoch": 0.11852662290299051,
+      "grad_norm": 0.20715931057929993,
+      "learning_rate": 0.00017661673364998174,
+      "loss": 0.9641,
+      "step": 325
+    },
+    {
+      "epoch": 0.11889132020423049,
+      "grad_norm": 0.18339566886425018,
+      "learning_rate": 0.00017654366094263792,
+      "loss": 0.6723,
+      "step": 326
+    },
+    {
+      "epoch": 0.11925601750547046,
+      "grad_norm": 0.1606166958808899,
+      "learning_rate": 0.00017647058823529413,
+      "loss": 0.499,
+      "step": 327
+    },
+    {
+      "epoch": 0.11962071480671042,
+      "grad_norm": 0.17019236087799072,
+      "learning_rate": 0.0001763975155279503,
+      "loss": 0.7332,
+      "step": 328
+    },
+    {
+      "epoch": 0.1199854121079504,
+      "grad_norm": 0.15148764848709106,
+      "learning_rate": 0.00017632444282060652,
+      "loss": 0.477,
+      "step": 329
+    },
+    {
+      "epoch": 0.12035010940919037,
+      "grad_norm": 0.182627871632576,
+      "learning_rate": 0.00017625137011326273,
+      "loss": 0.7443,
+      "step": 330
+    },
+    {
+      "epoch": 0.12071480671043035,
+      "grad_norm": 0.18017525970935822,
+      "learning_rate": 0.00017617829740591888,
+      "loss": 0.6439,
+      "step": 331
+    },
+    {
+      "epoch": 0.12107950401167031,
+      "grad_norm": 0.1846102774143219,
+      "learning_rate": 0.0001761052246985751,
+      "loss": 0.7023,
+      "step": 332
+    },
+    {
+      "epoch": 0.12144420131291028,
+      "grad_norm": 0.19771961867809296,
+      "learning_rate": 0.0001760321519912313,
+      "loss": 0.7772,
+      "step": 333
+    },
+    {
+      "epoch": 0.12180889861415026,
+      "grad_norm": 0.18164925277233124,
+      "learning_rate": 0.00017595907928388748,
+      "loss": 0.7348,
+      "step": 334
+    },
+    {
+      "epoch": 0.12217359591539023,
+      "grad_norm": 0.182021364569664,
+      "learning_rate": 0.00017588600657654366,
+      "loss": 0.7034,
+      "step": 335
+    },
+    {
+      "epoch": 0.12253829321663019,
+      "grad_norm": 0.1674763709306717,
+      "learning_rate": 0.00017581293386919987,
+      "loss": 0.5969,
+      "step": 336
+    },
+    {
+      "epoch": 0.12290299051787017,
+      "grad_norm": 0.18998658657073975,
+      "learning_rate": 0.00017573986116185605,
+      "loss": 0.723,
+      "step": 337
+    },
+    {
+      "epoch": 0.12326768781911014,
+      "grad_norm": 0.20089052617549896,
+      "learning_rate": 0.00017566678845451226,
+      "loss": 0.8216,
+      "step": 338
+    },
+    {
+      "epoch": 0.12363238512035012,
+      "grad_norm": 0.18273848295211792,
+      "learning_rate": 0.00017559371574716844,
+      "loss": 0.6634,
+      "step": 339
+    },
+    {
+      "epoch": 0.12399708242159008,
+      "grad_norm": 0.18198589980602264,
+      "learning_rate": 0.00017552064303982462,
+      "loss": 0.7026,
+      "step": 340
+    },
+    {
+      "epoch": 0.12436177972283005,
+      "grad_norm": 0.17685921490192413,
+      "learning_rate": 0.00017544757033248083,
+      "loss": 0.5641,
+      "step": 341
+    },
+    {
+      "epoch": 0.12472647702407003,
+      "grad_norm": 0.20872358977794647,
+      "learning_rate": 0.000175374497625137,
+      "loss": 0.851,
+      "step": 342
+    },
+    {
+      "epoch": 0.12509117432531,
+      "grad_norm": 0.19250471889972687,
+      "learning_rate": 0.00017530142491779322,
+      "loss": 0.6265,
+      "step": 343
+    },
+    {
+      "epoch": 0.12545587162654998,
+      "grad_norm": 0.18738161027431488,
+      "learning_rate": 0.0001752283522104494,
+      "loss": 0.7612,
+      "step": 344
+    },
+    {
+      "epoch": 0.12582056892778992,
+      "grad_norm": 0.188717782497406,
+      "learning_rate": 0.00017515527950310558,
+      "loss": 0.768,
+      "step": 345
+    },
+    {
+      "epoch": 0.1261852662290299,
+      "grad_norm": 0.16923002898693085,
+      "learning_rate": 0.0001750822067957618,
+      "loss": 0.6071,
+      "step": 346
+    },
+    {
+      "epoch": 0.12654996353026987,
+      "grad_norm": 0.1732785850763321,
+      "learning_rate": 0.000175009134088418,
+      "loss": 0.5788,
+      "step": 347
+    },
+    {
+      "epoch": 0.12691466083150985,
+      "grad_norm": 0.15573865175247192,
+      "learning_rate": 0.00017493606138107418,
+      "loss": 0.5516,
+      "step": 348
+    },
+    {
+      "epoch": 0.12727935813274982,
+      "grad_norm": 0.17997129261493683,
+      "learning_rate": 0.00017486298867373036,
+      "loss": 0.5627,
+      "step": 349
+    },
+    {
+      "epoch": 0.1276440554339898,
+      "grad_norm": 0.17864130437374115,
+      "learning_rate": 0.00017478991596638657,
+      "loss": 0.6458,
+      "step": 350
+    },
+    {
+      "epoch": 0.12800875273522977,
+      "grad_norm": 0.17441441118717194,
+      "learning_rate": 0.00017471684325904275,
+      "loss": 0.5879,
+      "step": 351
+    },
+    {
+      "epoch": 0.12837345003646974,
+      "grad_norm": 0.16584378480911255,
+      "learning_rate": 0.00017464377055169896,
+      "loss": 0.571,
+      "step": 352
+    },
+    {
+      "epoch": 0.1287381473377097,
+      "grad_norm": 0.16552752256393433,
+      "learning_rate": 0.00017457069784435514,
+      "loss": 0.4731,
+      "step": 353
+    },
+    {
+      "epoch": 0.12910284463894967,
+      "grad_norm": 0.19212619960308075,
+      "learning_rate": 0.00017449762513701132,
+      "loss": 0.7846,
+      "step": 354
+    },
+    {
+      "epoch": 0.12946754194018964,
+      "grad_norm": 0.15730014443397522,
+      "learning_rate": 0.00017442455242966753,
+      "loss": 0.5544,
+      "step": 355
+    },
+    {
+      "epoch": 0.12983223924142961,
+      "grad_norm": 0.16712331771850586,
+      "learning_rate": 0.00017435147972232374,
+      "loss": 0.5702,
+      "step": 356
+    },
+    {
+      "epoch": 0.1301969365426696,
+      "grad_norm": 0.19101271033287048,
+      "learning_rate": 0.0001742784070149799,
+      "loss": 0.697,
+      "step": 357
+    },
+    {
+      "epoch": 0.13056163384390956,
+      "grad_norm": 0.15471835434436798,
+      "learning_rate": 0.0001742053343076361,
+      "loss": 0.4917,
+      "step": 358
+    },
+    {
+      "epoch": 0.13092633114514954,
+      "grad_norm": 0.19589225947856903,
+      "learning_rate": 0.0001741322616002923,
+      "loss": 0.7135,
+      "step": 359
+    },
+    {
+      "epoch": 0.13129102844638948,
+      "grad_norm": 0.22023500502109528,
+      "learning_rate": 0.0001740591888929485,
+      "loss": 0.8744,
+      "step": 360
+    },
+    {
+      "epoch": 0.13165572574762946,
+      "grad_norm": 0.17470814287662506,
+      "learning_rate": 0.00017398611618560467,
+      "loss": 0.5644,
+      "step": 361
+    },
+    {
+      "epoch": 0.13202042304886943,
+      "grad_norm": 0.16508780419826508,
+      "learning_rate": 0.00017391304347826088,
+      "loss": 0.4774,
+      "step": 362
+    },
+    {
+      "epoch": 0.1323851203501094,
+      "grad_norm": 0.18133142590522766,
+      "learning_rate": 0.00017383997077091706,
+      "loss": 0.6806,
+      "step": 363
+    },
+    {
+      "epoch": 0.13274981765134938,
+      "grad_norm": 0.19952940940856934,
+      "learning_rate": 0.00017376689806357327,
+      "loss": 0.7776,
+      "step": 364
+    },
+    {
+      "epoch": 0.13311451495258936,
+      "grad_norm": 0.17585842311382294,
+      "learning_rate": 0.00017369382535622948,
+      "loss": 0.7543,
+      "step": 365
+    },
+    {
+      "epoch": 0.13347921225382933,
+      "grad_norm": 0.19367621839046478,
+      "learning_rate": 0.00017362075264888563,
+      "loss": 0.585,
+      "step": 366
+    },
+    {
+      "epoch": 0.13384390955506928,
+      "grad_norm": 0.1928108036518097,
+      "learning_rate": 0.00017354767994154184,
+      "loss": 0.6656,
+      "step": 367
+    },
+    {
+      "epoch": 0.13420860685630925,
+      "grad_norm": 0.19390811026096344,
+      "learning_rate": 0.00017347460723419805,
+      "loss": 0.713,
+      "step": 368
+    },
+    {
+      "epoch": 0.13457330415754923,
+      "grad_norm": 0.19284166395664215,
+      "learning_rate": 0.00017340153452685423,
+      "loss": 0.7414,
+      "step": 369
+    },
+    {
+      "epoch": 0.1349380014587892,
+      "grad_norm": 0.16773255169391632,
+      "learning_rate": 0.0001733284618195104,
+      "loss": 0.5786,
+      "step": 370
+    },
+    {
+      "epoch": 0.13530269876002918,
+      "grad_norm": 0.15630416572093964,
+      "learning_rate": 0.00017325538911216662,
+      "loss": 0.4774,
+      "step": 371
+    },
+    {
+      "epoch": 0.13566739606126915,
+      "grad_norm": 0.1833135038614273,
+      "learning_rate": 0.0001731823164048228,
+      "loss": 0.7178,
+      "step": 372
+    },
+    {
+      "epoch": 0.13603209336250913,
+      "grad_norm": 0.19849684834480286,
+      "learning_rate": 0.000173109243697479,
+      "loss": 0.7354,
+      "step": 373
+    },
+    {
+      "epoch": 0.1363967906637491,
+      "grad_norm": 0.17072516679763794,
+      "learning_rate": 0.0001730361709901352,
+      "loss": 0.5945,
+      "step": 374
+    },
+    {
+      "epoch": 0.13676148796498905,
+      "grad_norm": 0.1784793585538864,
+      "learning_rate": 0.00017296309828279137,
+      "loss": 0.6322,
+      "step": 375
+    },
+    {
+      "epoch": 0.13712618526622902,
+      "grad_norm": 0.192670539021492,
+      "learning_rate": 0.00017289002557544758,
+      "loss": 0.7612,
+      "step": 376
+    },
+    {
+      "epoch": 0.137490882567469,
+      "grad_norm": 0.1875181794166565,
+      "learning_rate": 0.00017281695286810376,
+      "loss": 0.7439,
+      "step": 377
+    },
+    {
+      "epoch": 0.13785557986870897,
+      "grad_norm": 0.1796884834766388,
+      "learning_rate": 0.00017274388016075997,
+      "loss": 0.8694,
+      "step": 378
+    },
+    {
+      "epoch": 0.13822027716994895,
+      "grad_norm": 0.20059190690517426,
+      "learning_rate": 0.00017267080745341615,
+      "loss": 0.8027,
+      "step": 379
+    },
+    {
+      "epoch": 0.13858497447118892,
+      "grad_norm": 0.19425062835216522,
+      "learning_rate": 0.00017259773474607236,
+      "loss": 0.8948,
+      "step": 380
+    },
+    {
+      "epoch": 0.1389496717724289,
+      "grad_norm": 0.20872867107391357,
+      "learning_rate": 0.00017252466203872854,
+      "loss": 0.9552,
+      "step": 381
+    },
+    {
+      "epoch": 0.13931436907366884,
+      "grad_norm": 0.1698973923921585,
+      "learning_rate": 0.00017245158933138475,
+      "loss": 0.5604,
+      "step": 382
+    },
+    {
+      "epoch": 0.13967906637490882,
+      "grad_norm": 0.19797375798225403,
+      "learning_rate": 0.00017237851662404093,
+      "loss": 0.8872,
+      "step": 383
+    },
+    {
+      "epoch": 0.1400437636761488,
+      "grad_norm": 0.16452281177043915,
+      "learning_rate": 0.00017230544391669711,
+      "loss": 0.5724,
+      "step": 384
+    },
+    {
+      "epoch": 0.14040846097738877,
+      "grad_norm": 0.17133468389511108,
+      "learning_rate": 0.00017223237120935332,
+      "loss": 0.592,
+      "step": 385
+    },
+    {
+      "epoch": 0.14077315827862874,
+      "grad_norm": 0.17517192661762238,
+      "learning_rate": 0.0001721592985020095,
+      "loss": 0.5597,
+      "step": 386
+    },
+    {
+      "epoch": 0.14113785557986872,
+      "grad_norm": 0.17406406998634338,
+      "learning_rate": 0.00017208622579466569,
+      "loss": 0.7118,
+      "step": 387
+    },
+    {
+      "epoch": 0.1415025528811087,
+      "grad_norm": 0.17395231127738953,
+      "learning_rate": 0.0001720131530873219,
+      "loss": 0.6616,
+      "step": 388
+    },
+    {
+      "epoch": 0.14186725018234866,
+      "grad_norm": 0.21478557586669922,
+      "learning_rate": 0.00017194008037997807,
+      "loss": 0.8517,
+      "step": 389
+    },
+    {
+      "epoch": 0.1422319474835886,
+      "grad_norm": 0.22842730581760406,
+      "learning_rate": 0.00017186700767263428,
+      "loss": 0.9611,
+      "step": 390
+    },
+    {
+      "epoch": 0.14259664478482859,
+      "grad_norm": 0.18962237238883972,
+      "learning_rate": 0.0001717939349652905,
+      "loss": 0.7606,
+      "step": 391
+    },
+    {
+      "epoch": 0.14296134208606856,
+      "grad_norm": 0.19581514596939087,
+      "learning_rate": 0.00017172086225794665,
+      "loss": 0.892,
+      "step": 392
+    },
+    {
+      "epoch": 0.14332603938730853,
+      "grad_norm": 0.18178801238536835,
+      "learning_rate": 0.00017164778955060285,
+      "loss": 0.7477,
+      "step": 393
+    },
+    {
+      "epoch": 0.1436907366885485,
+      "grad_norm": 0.18961721658706665,
+      "learning_rate": 0.00017157471684325906,
+      "loss": 0.771,
+      "step": 394
+    },
+    {
+      "epoch": 0.14405543398978848,
+      "grad_norm": 0.18671804666519165,
+      "learning_rate": 0.00017150164413591524,
+      "loss": 0.7383,
+      "step": 395
+    },
+    {
+      "epoch": 0.14442013129102846,
+      "grad_norm": 0.17833957076072693,
+      "learning_rate": 0.00017142857142857143,
+      "loss": 0.6363,
+      "step": 396
+    },
+    {
+      "epoch": 0.1447848285922684,
+      "grad_norm": 0.18321546912193298,
+      "learning_rate": 0.00017135549872122763,
+      "loss": 0.7068,
+      "step": 397
+    },
+    {
+      "epoch": 0.14514952589350838,
+      "grad_norm": 0.20233039557933807,
+      "learning_rate": 0.00017128242601388381,
+      "loss": 0.8699,
+      "step": 398
+    },
+    {
+      "epoch": 0.14551422319474835,
+      "grad_norm": 0.1749749630689621,
+      "learning_rate": 0.00017120935330654002,
+      "loss": 0.7355,
+      "step": 399
+    },
+    {
+      "epoch": 0.14587892049598833,
+      "grad_norm": 0.14458586275577545,
+      "learning_rate": 0.00017113628059919623,
+      "loss": 0.427,
+      "step": 400
+    },
+    {
+      "epoch": 0.1462436177972283,
+      "grad_norm": 0.183969646692276,
+      "learning_rate": 0.00017106320789185239,
+      "loss": 0.6166,
+      "step": 401
+    },
+    {
+      "epoch": 0.14660831509846828,
+      "grad_norm": 0.15677833557128906,
+      "learning_rate": 0.0001709901351845086,
+      "loss": 0.4161,
+      "step": 402
+    },
+    {
+      "epoch": 0.14697301239970825,
+      "grad_norm": 0.20313799381256104,
+      "learning_rate": 0.0001709170624771648,
+      "loss": 0.8189,
+      "step": 403
+    },
+    {
+      "epoch": 0.1473377097009482,
+      "grad_norm": 0.17821913957595825,
+      "learning_rate": 0.00017084398976982098,
+      "loss": 0.6365,
+      "step": 404
+    },
+    {
+      "epoch": 0.14770240700218817,
+      "grad_norm": 0.1630881130695343,
+      "learning_rate": 0.00017077091706247717,
+      "loss": 0.5623,
+      "step": 405
+    },
+    {
+      "epoch": 0.14806710430342815,
+      "grad_norm": 0.20716772973537445,
+      "learning_rate": 0.00017069784435513337,
+      "loss": 0.9347,
+      "step": 406
+    },
+    {
+      "epoch": 0.14843180160466812,
+      "grad_norm": 0.17428599298000336,
+      "learning_rate": 0.00017062477164778956,
+      "loss": 0.6188,
+      "step": 407
+    },
+    {
+      "epoch": 0.1487964989059081,
+      "grad_norm": 0.15823887288570404,
+      "learning_rate": 0.00017055169894044576,
+      "loss": 0.5995,
+      "step": 408
+    },
+    {
+      "epoch": 0.14916119620714807,
+      "grad_norm": 0.1716741919517517,
+      "learning_rate": 0.00017047862623310194,
+      "loss": 0.6407,
+      "step": 409
+    },
+    {
+      "epoch": 0.14952589350838805,
+      "grad_norm": 0.16747424006462097,
+      "learning_rate": 0.00017040555352575813,
+      "loss": 0.5822,
+      "step": 410
+    },
+    {
+      "epoch": 0.14989059080962802,
+      "grad_norm": 0.14587102830410004,
+      "learning_rate": 0.00017033248081841433,
+      "loss": 0.431,
+      "step": 411
+    },
+    {
+      "epoch": 0.15025528811086797,
+      "grad_norm": 0.1754886656999588,
+      "learning_rate": 0.00017025940811107052,
+      "loss": 0.5515,
+      "step": 412
+    },
+    {
+      "epoch": 0.15061998541210794,
+      "grad_norm": 0.20467835664749146,
+      "learning_rate": 0.00017018633540372672,
+      "loss": 0.8614,
+      "step": 413
+    },
+    {
+      "epoch": 0.15098468271334792,
+      "grad_norm": 0.2041863650083542,
+      "learning_rate": 0.0001701132626963829,
+      "loss": 0.88,
+      "step": 414
+    },
+    {
+      "epoch": 0.1513493800145879,
+      "grad_norm": 0.2153850644826889,
+      "learning_rate": 0.00017004018998903911,
+      "loss": 0.9241,
+      "step": 415
+    },
+    {
+      "epoch": 0.15171407731582787,
+      "grad_norm": 0.20441819727420807,
+      "learning_rate": 0.0001699671172816953,
+      "loss": 0.9127,
+      "step": 416
+    },
+    {
+      "epoch": 0.15207877461706784,
+      "grad_norm": 0.17975957691669464,
+      "learning_rate": 0.0001698940445743515,
+      "loss": 0.6395,
+      "step": 417
+    },
+    {
+      "epoch": 0.15244347191830782,
+      "grad_norm": 0.1702582836151123,
+      "learning_rate": 0.00016982097186700768,
+      "loss": 0.5951,
+      "step": 418
+    },
+    {
+      "epoch": 0.15280816921954776,
+      "grad_norm": 0.1543644517660141,
+      "learning_rate": 0.00016974789915966387,
+      "loss": 0.5093,
+      "step": 419
+    },
+    {
+      "epoch": 0.15317286652078774,
+      "grad_norm": 0.1747189462184906,
+      "learning_rate": 0.00016967482645232007,
+      "loss": 0.5314,
+      "step": 420
+    },
+    {
+      "epoch": 0.1535375638220277,
+      "grad_norm": 0.22516067326068878,
+      "learning_rate": 0.00016960175374497626,
+      "loss": 0.9766,
+      "step": 421
+    },
+    {
+      "epoch": 0.1539022611232677,
+      "grad_norm": 0.21535086631774902,
+      "learning_rate": 0.00016952868103763244,
+      "loss": 0.8287,
+      "step": 422
+    },
+    {
+      "epoch": 0.15426695842450766,
+      "grad_norm": 0.17004358768463135,
+      "learning_rate": 0.00016945560833028865,
+      "loss": 0.6612,
+      "step": 423
+    },
+    {
+      "epoch": 0.15463165572574764,
+      "grad_norm": 0.18636055290699005,
+      "learning_rate": 0.00016938253562294483,
+      "loss": 0.784,
+      "step": 424
+    },
+    {
+      "epoch": 0.1549963530269876,
+      "grad_norm": 0.1781081259250641,
+      "learning_rate": 0.00016930946291560104,
+      "loss": 0.5275,
+      "step": 425
+    },
+    {
+      "epoch": 0.15536105032822758,
+      "grad_norm": 0.19579234719276428,
+      "learning_rate": 0.00016923639020825724,
+      "loss": 0.7712,
+      "step": 426
+    },
+    {
+      "epoch": 0.15572574762946753,
+      "grad_norm": 0.18568329513072968,
+      "learning_rate": 0.0001691633175009134,
+      "loss": 0.7509,
+      "step": 427
+    },
+    {
+      "epoch": 0.1560904449307075,
+      "grad_norm": 0.17547035217285156,
+      "learning_rate": 0.0001690902447935696,
+      "loss": 0.711,
+      "step": 428
+    },
+    {
+      "epoch": 0.15645514223194748,
+      "grad_norm": 0.19599728286266327,
+      "learning_rate": 0.00016901717208622581,
+      "loss": 0.9256,
+      "step": 429
+    },
+    {
+      "epoch": 0.15681983953318746,
+      "grad_norm": 0.18903636932373047,
+      "learning_rate": 0.000168944099378882,
+      "loss": 0.7167,
+      "step": 430
+    },
+    {
+      "epoch": 0.15718453683442743,
+      "grad_norm": 0.14108788967132568,
+      "learning_rate": 0.00016887102667153818,
+      "loss": 0.4165,
+      "step": 431
+    },
+    {
+      "epoch": 0.1575492341356674,
+      "grad_norm": 0.14891500771045685,
+      "learning_rate": 0.00016879795396419439,
+      "loss": 0.4908,
+      "step": 432
+    },
+    {
+      "epoch": 0.15791393143690738,
+      "grad_norm": 0.18601083755493164,
+      "learning_rate": 0.00016872488125685057,
+      "loss": 0.6856,
+      "step": 433
+    },
+    {
+      "epoch": 0.15827862873814733,
+      "grad_norm": 0.1775410771369934,
+      "learning_rate": 0.00016865180854950678,
+      "loss": 0.6805,
+      "step": 434
+    },
+    {
+      "epoch": 0.1586433260393873,
+      "grad_norm": 0.19800400733947754,
+      "learning_rate": 0.00016857873584216296,
+      "loss": 0.7518,
+      "step": 435
+    },
+    {
+      "epoch": 0.15900802334062727,
+      "grad_norm": 0.21951526403427124,
+      "learning_rate": 0.00016850566313481914,
+      "loss": 0.9934,
+      "step": 436
+    },
+    {
+      "epoch": 0.15937272064186725,
+      "grad_norm": 0.15916769206523895,
+      "learning_rate": 0.00016843259042747535,
+      "loss": 0.5548,
+      "step": 437
+    },
+    {
+      "epoch": 0.15973741794310722,
+      "grad_norm": 0.20369085669517517,
+      "learning_rate": 0.00016835951772013155,
+      "loss": 0.876,
+      "step": 438
+    },
+    {
+      "epoch": 0.1601021152443472,
+      "grad_norm": 0.16725899279117584,
+      "learning_rate": 0.00016828644501278774,
+      "loss": 0.6011,
+      "step": 439
+    },
+    {
+      "epoch": 0.16046681254558717,
+      "grad_norm": 0.15776625275611877,
+      "learning_rate": 0.00016821337230544392,
+      "loss": 0.4744,
+      "step": 440
+    },
+    {
+      "epoch": 0.16083150984682712,
+      "grad_norm": 0.20461910963058472,
+      "learning_rate": 0.00016814029959810013,
+      "loss": 0.8574,
+      "step": 441
+    },
+    {
+      "epoch": 0.1611962071480671,
+      "grad_norm": 0.1932179033756256,
+      "learning_rate": 0.0001680672268907563,
+      "loss": 0.7493,
+      "step": 442
+    },
+    {
+      "epoch": 0.16156090444930707,
+      "grad_norm": 0.17809215188026428,
+      "learning_rate": 0.00016799415418341252,
+      "loss": 0.6159,
+      "step": 443
+    },
+    {
+      "epoch": 0.16192560175054704,
+      "grad_norm": 0.19475284218788147,
+      "learning_rate": 0.0001679210814760687,
+      "loss": 0.7342,
+      "step": 444
+    },
+    {
+      "epoch": 0.16229029905178702,
+      "grad_norm": 0.17827290296554565,
+      "learning_rate": 0.00016784800876872488,
+      "loss": 0.7018,
+      "step": 445
+    },
+    {
+      "epoch": 0.162654996353027,
+      "grad_norm": 0.18410883843898773,
+      "learning_rate": 0.0001677749360613811,
+      "loss": 0.689,
+      "step": 446
+    },
+    {
+      "epoch": 0.16301969365426697,
+      "grad_norm": 0.161324605345726,
+      "learning_rate": 0.00016770186335403727,
+      "loss": 0.5249,
+      "step": 447
+    },
+    {
+      "epoch": 0.16338439095550694,
+      "grad_norm": 0.1743507832288742,
+      "learning_rate": 0.00016762879064669348,
+      "loss": 0.7147,
+      "step": 448
+    },
+    {
+      "epoch": 0.1637490882567469,
+      "grad_norm": 0.18522591888904572,
+      "learning_rate": 0.00016755571793934966,
+      "loss": 0.6813,
+      "step": 449
+    },
+    {
+      "epoch": 0.16411378555798686,
+      "grad_norm": 0.18846698105335236,
+      "learning_rate": 0.00016748264523200587,
+      "loss": 0.8582,
+      "step": 450
+    },
+    {
+      "epoch": 0.16447848285922684,
+      "grad_norm": 0.18033850193023682,
+      "learning_rate": 0.00016740957252466205,
+      "loss": 0.7465,
+      "step": 451
+    },
+    {
+      "epoch": 0.1648431801604668,
+      "grad_norm": 0.1762772500514984,
+      "learning_rate": 0.00016733649981731826,
+      "loss": 0.6501,
+      "step": 452
+    },
+    {
+      "epoch": 0.1652078774617068,
+      "grad_norm": 0.20865213871002197,
+      "learning_rate": 0.00016726342710997444,
+      "loss": 0.5579,
+      "step": 453
+    },
+    {
+      "epoch": 0.16557257476294676,
+      "grad_norm": 0.20231905579566956,
+      "learning_rate": 0.00016719035440263062,
+      "loss": 0.8055,
+      "step": 454
+    },
+    {
+      "epoch": 0.16593727206418674,
+      "grad_norm": 0.1755821406841278,
+      "learning_rate": 0.00016711728169528683,
+      "loss": 0.6538,
+      "step": 455
+    },
+    {
+      "epoch": 0.16630196936542668,
+      "grad_norm": 0.18525098264217377,
+      "learning_rate": 0.000167044208987943,
+      "loss": 0.6979,
+      "step": 456
+    },
+    {
+      "epoch": 0.16666666666666666,
+      "grad_norm": 0.18403197824954987,
+      "learning_rate": 0.0001669711362805992,
+      "loss": 0.8222,
+      "step": 457
+    },
+    {
+      "epoch": 0.16703136396790663,
+      "grad_norm": 0.1567261666059494,
+      "learning_rate": 0.0001668980635732554,
+      "loss": 0.5104,
+      "step": 458
+    },
+    {
+      "epoch": 0.1673960612691466,
+      "grad_norm": 0.1938832849264145,
+      "learning_rate": 0.00016682499086591158,
+      "loss": 0.6972,
+      "step": 459
+    },
+    {
+      "epoch": 0.16776075857038658,
+      "grad_norm": 0.19050729274749756,
+      "learning_rate": 0.0001667519181585678,
+      "loss": 0.7917,
+      "step": 460
+    },
+    {
+      "epoch": 0.16812545587162656,
+      "grad_norm": 0.22484062612056732,
+      "learning_rate": 0.000166678845451224,
+      "loss": 1.0339,
+      "step": 461
+    },
+    {
+      "epoch": 0.16849015317286653,
+      "grad_norm": 0.18188448250293732,
+      "learning_rate": 0.00016660577274388015,
+      "loss": 0.6295,
+      "step": 462
+    },
+    {
+      "epoch": 0.1688548504741065,
+      "grad_norm": 0.17250071465969086,
+      "learning_rate": 0.00016653270003653636,
+      "loss": 0.5911,
+      "step": 463
+    },
+    {
+      "epoch": 0.16921954777534645,
+      "grad_norm": 0.177398219704628,
+      "learning_rate": 0.00016645962732919257,
+      "loss": 0.6524,
+      "step": 464
+    },
+    {
+      "epoch": 0.16958424507658643,
+      "grad_norm": 0.18105994164943695,
+      "learning_rate": 0.00016638655462184875,
+      "loss": 0.6554,
+      "step": 465
+    },
+    {
+      "epoch": 0.1699489423778264,
+      "grad_norm": 0.16810593008995056,
+      "learning_rate": 0.00016631348191450493,
+      "loss": 0.5617,
+      "step": 466
+    },
+    {
+      "epoch": 0.17031363967906638,
+      "grad_norm": 0.18866056203842163,
+      "learning_rate": 0.00016624040920716114,
+      "loss": 0.848,
+      "step": 467
+    },
+    {
+      "epoch": 0.17067833698030635,
+      "grad_norm": 0.18827949464321136,
+      "learning_rate": 0.00016616733649981732,
+      "loss": 0.6413,
+      "step": 468
+    },
+    {
+      "epoch": 0.17104303428154632,
+      "grad_norm": 0.16476339101791382,
+      "learning_rate": 0.00016609426379247353,
+      "loss": 0.6205,
+      "step": 469
+    },
+    {
+      "epoch": 0.1714077315827863,
+      "grad_norm": 0.1794668436050415,
+      "learning_rate": 0.0001660211910851297,
+      "loss": 0.557,
+      "step": 470
+    },
+    {
+      "epoch": 0.17177242888402625,
+      "grad_norm": 0.18404535949230194,
+      "learning_rate": 0.0001659481183777859,
+      "loss": 0.7329,
+      "step": 471
+    },
+    {
+      "epoch": 0.17213712618526622,
+      "grad_norm": 0.1756718009710312,
+      "learning_rate": 0.0001658750456704421,
+      "loss": 0.5631,
+      "step": 472
+    },
+    {
+      "epoch": 0.1725018234865062,
+      "grad_norm": 0.17083744704723358,
+      "learning_rate": 0.0001658019729630983,
+      "loss": 0.6473,
+      "step": 473
+    },
+    {
+      "epoch": 0.17286652078774617,
+      "grad_norm": 0.17385685443878174,
+      "learning_rate": 0.0001657289002557545,
+      "loss": 0.6402,
+      "step": 474
+    },
+    {
+      "epoch": 0.17323121808898614,
+      "grad_norm": 0.16864272952079773,
+      "learning_rate": 0.00016565582754841067,
+      "loss": 0.5569,
+      "step": 475
+    },
+    {
+      "epoch": 0.17359591539022612,
+      "grad_norm": 0.1955600082874298,
+      "learning_rate": 0.00016558275484106688,
+      "loss": 0.8456,
+      "step": 476
+    },
+    {
+      "epoch": 0.1739606126914661,
+      "grad_norm": 0.1845158338546753,
+      "learning_rate": 0.00016550968213372306,
+      "loss": 0.6691,
+      "step": 477
+    },
+    {
+      "epoch": 0.17432530999270604,
+      "grad_norm": 0.14487138390541077,
+      "learning_rate": 0.00016543660942637927,
+      "loss": 0.3899,
+      "step": 478
+    },
+    {
+      "epoch": 0.17469000729394601,
+      "grad_norm": 0.1681089848279953,
+      "learning_rate": 0.00016536353671903545,
+      "loss": 0.5527,
+      "step": 479
+    },
+    {
+      "epoch": 0.175054704595186,
+      "grad_norm": 0.1934751272201538,
+      "learning_rate": 0.00016529046401169163,
+      "loss": 0.764,
+      "step": 480
+    },
+    {
+      "epoch": 0.17541940189642596,
+      "grad_norm": 0.17804409563541412,
+      "learning_rate": 0.00016521739130434784,
+      "loss": 0.6075,
+      "step": 481
+    },
+    {
+      "epoch": 0.17578409919766594,
+      "grad_norm": 0.17334255576133728,
+      "learning_rate": 0.00016514431859700402,
+      "loss": 0.5402,
+      "step": 482
+    },
+    {
+      "epoch": 0.1761487964989059,
+      "grad_norm": 0.166969433426857,
+      "learning_rate": 0.0001650712458896602,
+      "loss": 0.5684,
+      "step": 483
+    },
+    {
+      "epoch": 0.1765134938001459,
+      "grad_norm": 0.17517952620983124,
+      "learning_rate": 0.0001649981731823164,
+      "loss": 0.6743,
+      "step": 484
+    },
+    {
+      "epoch": 0.17687819110138586,
+      "grad_norm": 0.18539521098136902,
+      "learning_rate": 0.00016492510047497262,
+      "loss": 0.7196,
+      "step": 485
+    },
+    {
+      "epoch": 0.1772428884026258,
+      "grad_norm": 0.18387848138809204,
+      "learning_rate": 0.0001648520277676288,
+      "loss": 0.6336,
+      "step": 486
+    },
+    {
+      "epoch": 0.17760758570386578,
+      "grad_norm": 0.18015360832214355,
+      "learning_rate": 0.000164778955060285,
+      "loss": 0.7254,
+      "step": 487
+    },
+    {
+      "epoch": 0.17797228300510576,
+      "grad_norm": 0.1397644728422165,
+      "learning_rate": 0.0001647058823529412,
+      "loss": 0.3431,
+      "step": 488
+    },
+    {
+      "epoch": 0.17833698030634573,
+      "grad_norm": 0.22470806539058685,
+      "learning_rate": 0.00016463280964559737,
+      "loss": 0.9921,
+      "step": 489
+    },
+    {
+      "epoch": 0.1787016776075857,
+      "grad_norm": 0.2027674913406372,
+      "learning_rate": 0.00016455973693825358,
+      "loss": 0.8432,
+      "step": 490
+    },
+    {
+      "epoch": 0.17906637490882568,
+      "grad_norm": 0.16964022815227509,
+      "learning_rate": 0.00016448666423090976,
+      "loss": 0.659,
+      "step": 491
+    },
+    {
+      "epoch": 0.17943107221006566,
+      "grad_norm": 0.18222148716449738,
+      "learning_rate": 0.00016441359152356594,
+      "loss": 0.6367,
+      "step": 492
+    },
+    {
+      "epoch": 0.1797957695113056,
+      "grad_norm": 0.19195525348186493,
+      "learning_rate": 0.00016434051881622215,
+      "loss": 0.657,
+      "step": 493
+    },
+    {
+      "epoch": 0.18016046681254558,
+      "grad_norm": 0.17382116615772247,
+      "learning_rate": 0.00016426744610887833,
+      "loss": 0.6367,
+      "step": 494
+    },
+    {
+      "epoch": 0.18052516411378555,
+      "grad_norm": 0.18641424179077148,
+      "learning_rate": 0.00016419437340153454,
+      "loss": 0.7523,
+      "step": 495
+    },
+    {
+      "epoch": 0.18088986141502553,
+      "grad_norm": 0.1832839697599411,
+      "learning_rate": 0.00016412130069419075,
+      "loss": 0.7381,
+      "step": 496
+    },
+    {
+      "epoch": 0.1812545587162655,
+      "grad_norm": 0.1541210114955902,
+      "learning_rate": 0.0001640482279868469,
+      "loss": 0.5941,
+      "step": 497
+    },
+    {
+      "epoch": 0.18161925601750548,
+      "grad_norm": 0.22230711579322815,
+      "learning_rate": 0.0001639751552795031,
+      "loss": 0.9853,
+      "step": 498
+    },
+    {
+      "epoch": 0.18198395331874545,
+      "grad_norm": 0.17912793159484863,
+      "learning_rate": 0.00016390208257215932,
+      "loss": 0.6753,
+      "step": 499
+    },
+    {
+      "epoch": 0.18234865061998543,
+      "grad_norm": 0.1971275359392166,
+      "learning_rate": 0.0001638290098648155,
+      "loss": 0.7659,
+      "step": 500
+    },
+    {
+      "epoch": 0.18271334792122537,
+      "grad_norm": 0.19420406222343445,
+      "learning_rate": 0.00016375593715747168,
+      "loss": 0.7616,
+      "step": 501
+    },
+    {
+      "epoch": 0.18307804522246535,
+      "grad_norm": 0.16759932041168213,
+      "learning_rate": 0.0001636828644501279,
+      "loss": 0.5594,
+      "step": 502
+    },
+    {
+      "epoch": 0.18344274252370532,
+      "grad_norm": 0.1994057297706604,
+      "learning_rate": 0.00016360979174278407,
+      "loss": 0.8499,
+      "step": 503
+    },
+    {
+      "epoch": 0.1838074398249453,
+      "grad_norm": 0.1942475587129593,
+      "learning_rate": 0.00016353671903544028,
+      "loss": 0.8477,
+      "step": 504
+    },
+    {
+      "epoch": 0.18417213712618527,
+      "grad_norm": 0.20013171434402466,
+      "learning_rate": 0.00016346364632809646,
+      "loss": 0.8432,
+      "step": 505
+    },
+    {
+      "epoch": 0.18453683442742524,
+      "grad_norm": 0.19940058887004852,
+      "learning_rate": 0.00016339057362075264,
+      "loss": 0.9063,
+      "step": 506
+    },
+    {
+      "epoch": 0.18490153172866522,
+      "grad_norm": 0.17371918261051178,
+      "learning_rate": 0.00016331750091340885,
+      "loss": 0.755,
+      "step": 507
+    },
+    {
+      "epoch": 0.18526622902990517,
+      "grad_norm": 0.1687958985567093,
+      "learning_rate": 0.00016324442820606506,
+      "loss": 0.6164,
+      "step": 508
+    },
+    {
+      "epoch": 0.18563092633114514,
+      "grad_norm": 0.1572989672422409,
+      "learning_rate": 0.00016317135549872124,
+      "loss": 0.5366,
+      "step": 509
+    },
+    {
+      "epoch": 0.18599562363238512,
+      "grad_norm": 0.1621757596731186,
+      "learning_rate": 0.00016309828279137742,
+      "loss": 0.572,
+      "step": 510
+    },
+    {
+      "epoch": 0.1863603209336251,
+      "grad_norm": 0.16226086020469666,
+      "learning_rate": 0.00016302521008403363,
+      "loss": 0.6112,
+      "step": 511
+    },
+    {
+      "epoch": 0.18672501823486506,
+      "grad_norm": 0.19324566423892975,
+      "learning_rate": 0.0001629521373766898,
+      "loss": 0.8287,
+      "step": 512
+    },
+    {
+      "epoch": 0.18708971553610504,
+      "grad_norm": 0.18370205163955688,
+      "learning_rate": 0.00016287906466934602,
+      "loss": 0.7856,
+      "step": 513
+    },
+    {
+      "epoch": 0.187454412837345,
+      "grad_norm": 0.22035083174705505,
+      "learning_rate": 0.0001628059919620022,
+      "loss": 0.9329,
+      "step": 514
+    },
+    {
+      "epoch": 0.187819110138585,
+      "grad_norm": 0.1782984435558319,
+      "learning_rate": 0.00016273291925465838,
+      "loss": 0.8342,
+      "step": 515
+    },
+    {
+      "epoch": 0.18818380743982493,
+      "grad_norm": 0.23118692636489868,
+      "learning_rate": 0.0001626598465473146,
+      "loss": 0.9808,
+      "step": 516
+    },
+    {
+      "epoch": 0.1885485047410649,
+      "grad_norm": 0.1947745978832245,
+      "learning_rate": 0.00016258677383997077,
+      "loss": 0.8156,
+      "step": 517
+    },
+    {
+      "epoch": 0.18891320204230488,
+      "grad_norm": 0.18419331312179565,
+      "learning_rate": 0.00016251370113262695,
+      "loss": 0.7476,
+      "step": 518
+    },
+    {
+      "epoch": 0.18927789934354486,
+      "grad_norm": 0.1725061982870102,
+      "learning_rate": 0.00016244062842528316,
+      "loss": 0.646,
+      "step": 519
+    },
+    {
+      "epoch": 0.18964259664478483,
+      "grad_norm": 0.17075173556804657,
+      "learning_rate": 0.00016236755571793937,
+      "loss": 0.5723,
+      "step": 520
+    },
+    {
+      "epoch": 0.1900072939460248,
+      "grad_norm": 0.17799945175647736,
+      "learning_rate": 0.00016229448301059555,
+      "loss": 0.8289,
+      "step": 521
+    },
+    {
+      "epoch": 0.19037199124726478,
+      "grad_norm": 0.18537406623363495,
+      "learning_rate": 0.00016222141030325176,
+      "loss": 0.8107,
+      "step": 522
+    },
+    {
+      "epoch": 0.19073668854850473,
+      "grad_norm": 0.20047864317893982,
+      "learning_rate": 0.00016214833759590794,
+      "loss": 0.9138,
+      "step": 523
+    },
+    {
+      "epoch": 0.1911013858497447,
+      "grad_norm": 0.20184080302715302,
+      "learning_rate": 0.00016207526488856412,
+      "loss": 0.8424,
+      "step": 524
+    },
+    {
+      "epoch": 0.19146608315098468,
+      "grad_norm": 0.2016812413930893,
+      "learning_rate": 0.00016200219218122033,
+      "loss": 0.971,
+      "step": 525
+    },
+    {
+      "epoch": 0.19183078045222465,
+      "grad_norm": 0.17220567166805267,
+      "learning_rate": 0.0001619291194738765,
+      "loss": 0.6097,
+      "step": 526
+    },
+    {
+      "epoch": 0.19219547775346463,
+      "grad_norm": 0.16681768000125885,
+      "learning_rate": 0.0001618560467665327,
+      "loss": 0.6361,
+      "step": 527
+    },
+    {
+      "epoch": 0.1925601750547046,
+      "grad_norm": 0.1976706087589264,
+      "learning_rate": 0.0001617829740591889,
+      "loss": 0.8039,
+      "step": 528
+    },
+    {
+      "epoch": 0.19292487235594458,
+      "grad_norm": 0.14784552156925201,
+      "learning_rate": 0.00016170990135184508,
+      "loss": 0.4833,
+      "step": 529
+    },
+    {
+      "epoch": 0.19328956965718452,
+      "grad_norm": 0.18679118156433105,
+      "learning_rate": 0.0001616368286445013,
+      "loss": 0.8576,
+      "step": 530
+    },
+    {
+      "epoch": 0.1936542669584245,
+      "grad_norm": 0.1557287573814392,
+      "learning_rate": 0.0001615637559371575,
+      "loss": 0.5705,
+      "step": 531
+    },
+    {
+      "epoch": 0.19401896425966447,
+      "grad_norm": 0.19311803579330444,
+      "learning_rate": 0.00016149068322981365,
+      "loss": 0.7767,
+      "step": 532
+    },
+    {
+      "epoch": 0.19438366156090445,
+      "grad_norm": 0.14209668338298798,
+      "learning_rate": 0.00016141761052246986,
+      "loss": 0.4313,
+      "step": 533
+    },
+    {
+      "epoch": 0.19474835886214442,
+      "grad_norm": 0.15875539183616638,
+      "learning_rate": 0.00016134453781512607,
+      "loss": 0.5665,
+      "step": 534
+    },
+    {
+      "epoch": 0.1951130561633844,
+      "grad_norm": 0.17052525281906128,
+      "learning_rate": 0.00016127146510778225,
+      "loss": 0.507,
+      "step": 535
+    },
+    {
+      "epoch": 0.19547775346462437,
+      "grad_norm": 0.19006414711475372,
+      "learning_rate": 0.00016119839240043843,
+      "loss": 0.6866,
+      "step": 536
+    },
+    {
+      "epoch": 0.19584245076586435,
+      "grad_norm": 0.1550920307636261,
+      "learning_rate": 0.00016112531969309464,
+      "loss": 0.537,
+      "step": 537
+    },
+    {
+      "epoch": 0.1962071480671043,
+      "grad_norm": 0.17453649640083313,
+      "learning_rate": 0.00016105224698575082,
+      "loss": 0.5476,
+      "step": 538
+    },
+    {
+      "epoch": 0.19657184536834427,
+      "grad_norm": 0.1762418895959854,
+      "learning_rate": 0.00016097917427840703,
+      "loss": 0.7063,
+      "step": 539
+    },
+    {
+      "epoch": 0.19693654266958424,
+      "grad_norm": 0.1798284500837326,
+      "learning_rate": 0.0001609061015710632,
+      "loss": 0.6109,
+      "step": 540
+    },
+    {
+      "epoch": 0.19730123997082422,
+      "grad_norm": 0.18691901862621307,
+      "learning_rate": 0.0001608330288637194,
+      "loss": 0.7341,
+      "step": 541
+    },
+    {
+      "epoch": 0.1976659372720642,
+      "grad_norm": 0.21478383243083954,
+      "learning_rate": 0.0001607599561563756,
+      "loss": 0.8744,
+      "step": 542
+    },
+    {
+      "epoch": 0.19803063457330417,
+      "grad_norm": 0.17121157050132751,
+      "learning_rate": 0.0001606868834490318,
+      "loss": 0.54,
+      "step": 543
+    },
+    {
+      "epoch": 0.19839533187454414,
+      "grad_norm": 0.1967833936214447,
+      "learning_rate": 0.000160613810741688,
+      "loss": 0.5669,
+      "step": 544
+    },
+    {
+      "epoch": 0.1987600291757841,
+      "grad_norm": 0.16104774177074432,
+      "learning_rate": 0.00016054073803434417,
+      "loss": 0.4524,
+      "step": 545
+    },
+    {
+      "epoch": 0.19912472647702406,
+      "grad_norm": 0.17480875551700592,
+      "learning_rate": 0.00016046766532700038,
+      "loss": 0.6578,
+      "step": 546
+    },
+    {
+      "epoch": 0.19948942377826404,
+      "grad_norm": 0.16630177199840546,
+      "learning_rate": 0.00016039459261965656,
+      "loss": 0.6371,
+      "step": 547
+    },
+    {
+      "epoch": 0.199854121079504,
+      "grad_norm": 0.18494483828544617,
+      "learning_rate": 0.00016032151991231277,
+      "loss": 0.7218,
+      "step": 548
+    },
+    {
+      "epoch": 0.20021881838074398,
+      "grad_norm": 0.17873701453208923,
+      "learning_rate": 0.00016024844720496895,
+      "loss": 0.6248,
+      "step": 549
+    },
+    {
+      "epoch": 0.20058351568198396,
+      "grad_norm": 0.16306859254837036,
+      "learning_rate": 0.00016017537449762513,
+      "loss": 0.5553,
+      "step": 550
+    },
+    {
+      "epoch": 0.20094821298322393,
+      "grad_norm": 0.1972372829914093,
+      "learning_rate": 0.00016010230179028134,
+      "loss": 0.8172,
+      "step": 551
+    },
+    {
+      "epoch": 0.2013129102844639,
+      "grad_norm": 0.21976488828659058,
+      "learning_rate": 0.00016002922908293752,
+      "loss": 1.0276,
+      "step": 552
+    },
+    {
+      "epoch": 0.20167760758570386,
+      "grad_norm": 0.16156013309955597,
+      "learning_rate": 0.0001599561563755937,
+      "loss": 0.5599,
+      "step": 553
+    },
+    {
+      "epoch": 0.20204230488694383,
+      "grad_norm": 0.18124879896640778,
+      "learning_rate": 0.00015988308366824991,
+      "loss": 0.6038,
+      "step": 554
+    },
+    {
+      "epoch": 0.2024070021881838,
+      "grad_norm": 0.16870321333408356,
+      "learning_rate": 0.00015981001096090612,
+      "loss": 0.6956,
+      "step": 555
+    },
+    {
+      "epoch": 0.20277169948942378,
+      "grad_norm": 0.18470539152622223,
+      "learning_rate": 0.0001597369382535623,
+      "loss": 0.7895,
+      "step": 556
+    },
+    {
+      "epoch": 0.20313639679066375,
+      "grad_norm": 0.1810760498046875,
+      "learning_rate": 0.0001596638655462185,
+      "loss": 0.7416,
+      "step": 557
+    },
+    {
+      "epoch": 0.20350109409190373,
+      "grad_norm": 0.19518287479877472,
+      "learning_rate": 0.0001595907928388747,
+      "loss": 0.8724,
+      "step": 558
+    },
+    {
+      "epoch": 0.2038657913931437,
+      "grad_norm": 0.16267555952072144,
+      "learning_rate": 0.00015951772013153087,
+      "loss": 0.6208,
+      "step": 559
+    },
+    {
+      "epoch": 0.20423048869438365,
+      "grad_norm": 0.19488964974880219,
+      "learning_rate": 0.00015944464742418708,
+      "loss": 0.8636,
+      "step": 560
+    },
+    {
+      "epoch": 0.20459518599562362,
+      "grad_norm": 0.1762942373752594,
+      "learning_rate": 0.00015937157471684326,
+      "loss": 0.6351,
+      "step": 561
+    },
+    {
+      "epoch": 0.2049598832968636,
+      "grad_norm": 0.1685190051794052,
+      "learning_rate": 0.00015929850200949945,
+      "loss": 0.7862,
+      "step": 562
+    },
+    {
+      "epoch": 0.20532458059810357,
+      "grad_norm": 0.17033015191555023,
+      "learning_rate": 0.00015922542930215565,
+      "loss": 0.7065,
+      "step": 563
+    },
+    {
+      "epoch": 0.20568927789934355,
+      "grad_norm": 0.1639910638332367,
+      "learning_rate": 0.00015915235659481184,
+      "loss": 0.5806,
+      "step": 564
+    },
+    {
+      "epoch": 0.20605397520058352,
+      "grad_norm": 0.16588236391544342,
+      "learning_rate": 0.00015907928388746804,
+      "loss": 0.639,
+      "step": 565
+    },
+    {
+      "epoch": 0.2064186725018235,
+      "grad_norm": 0.1781357228755951,
+      "learning_rate": 0.00015900621118012423,
+      "loss": 0.7193,
+      "step": 566
+    },
+    {
+      "epoch": 0.20678336980306344,
+      "grad_norm": 0.2006876915693283,
+      "learning_rate": 0.0001589331384727804,
+      "loss": 0.8255,
+      "step": 567
+    },
+    {
+      "epoch": 0.20714806710430342,
+      "grad_norm": 0.24458666145801544,
+      "learning_rate": 0.00015886006576543661,
+      "loss": 0.7577,
+      "step": 568
+    },
+    {
+      "epoch": 0.2075127644055434,
+      "grad_norm": 0.18870210647583008,
+      "learning_rate": 0.00015878699305809282,
+      "loss": 0.7712,
+      "step": 569
+    },
+    {
+      "epoch": 0.20787746170678337,
+      "grad_norm": 0.1800808161497116,
+      "learning_rate": 0.000158713920350749,
+      "loss": 0.5715,
+      "step": 570
+    },
+    {
+      "epoch": 0.20824215900802334,
+      "grad_norm": 0.16994856297969818,
+      "learning_rate": 0.00015864084764340519,
+      "loss": 0.6102,
+      "step": 571
+    },
+    {
+      "epoch": 0.20860685630926332,
+      "grad_norm": 0.21051564812660217,
+      "learning_rate": 0.0001585677749360614,
+      "loss": 0.9531,
+      "step": 572
+    },
+    {
+      "epoch": 0.2089715536105033,
+      "grad_norm": 0.17979969084262848,
+      "learning_rate": 0.00015849470222871758,
+      "loss": 0.6933,
+      "step": 573
+    },
+    {
+      "epoch": 0.20933625091174327,
+      "grad_norm": 0.17062553763389587,
+      "learning_rate": 0.00015842162952137378,
+      "loss": 0.5809,
+      "step": 574
+    },
+    {
+      "epoch": 0.2097009482129832,
+      "grad_norm": 0.1818009465932846,
+      "learning_rate": 0.00015834855681402997,
+      "loss": 0.6487,
+      "step": 575
+    },
+    {
+      "epoch": 0.2100656455142232,
+      "grad_norm": 0.19140908122062683,
+      "learning_rate": 0.00015827548410668615,
+      "loss": 0.7887,
+      "step": 576
+    },
+    {
+      "epoch": 0.21043034281546316,
+      "grad_norm": 0.15934517979621887,
+      "learning_rate": 0.00015820241139934236,
+      "loss": 0.5761,
+      "step": 577
+    },
+    {
+      "epoch": 0.21079504011670314,
+      "grad_norm": 0.16064375638961792,
+      "learning_rate": 0.00015812933869199856,
+      "loss": 0.5708,
+      "step": 578
+    },
+    {
+      "epoch": 0.2111597374179431,
+      "grad_norm": 0.18466386198997498,
+      "learning_rate": 0.00015805626598465474,
+      "loss": 0.7036,
+      "step": 579
+    },
+    {
+      "epoch": 0.21152443471918309,
+      "grad_norm": 0.17533251643180847,
+      "learning_rate": 0.00015798319327731093,
+      "loss": 0.801,
+      "step": 580
+    },
+    {
+      "epoch": 0.21188913202042306,
+      "grad_norm": 0.1585182249546051,
+      "learning_rate": 0.00015791012056996713,
+      "loss": 0.5439,
+      "step": 581
+    },
+    {
+      "epoch": 0.212253829321663,
+      "grad_norm": 0.18301182985305786,
+      "learning_rate": 0.00015783704786262332,
+      "loss": 0.6721,
+      "step": 582
+    },
+    {
+      "epoch": 0.21261852662290298,
+      "grad_norm": 0.19599053263664246,
+      "learning_rate": 0.00015776397515527952,
+      "loss": 0.7901,
+      "step": 583
+    },
+    {
+      "epoch": 0.21298322392414296,
+      "grad_norm": 0.19529075920581818,
+      "learning_rate": 0.0001576909024479357,
+      "loss": 0.7955,
+      "step": 584
+    },
+    {
+      "epoch": 0.21334792122538293,
+      "grad_norm": 0.17965692281723022,
+      "learning_rate": 0.0001576178297405919,
+      "loss": 0.6761,
+      "step": 585
+    },
+    {
+      "epoch": 0.2137126185266229,
+      "grad_norm": 0.15989278256893158,
+      "learning_rate": 0.0001575447570332481,
+      "loss": 0.5406,
+      "step": 586
+    },
+    {
+      "epoch": 0.21407731582786288,
+      "grad_norm": 0.18571379780769348,
+      "learning_rate": 0.00015747168432590428,
+      "loss": 0.784,
+      "step": 587
+    },
+    {
+      "epoch": 0.21444201312910285,
+      "grad_norm": 0.17124933004379272,
+      "learning_rate": 0.00015739861161856046,
+      "loss": 0.6769,
+      "step": 588
+    },
+    {
+      "epoch": 0.21480671043034283,
+      "grad_norm": 0.2020951509475708,
+      "learning_rate": 0.00015732553891121667,
+      "loss": 0.9287,
+      "step": 589
+    },
+    {
+      "epoch": 0.21517140773158278,
+      "grad_norm": 0.1693318635225296,
+      "learning_rate": 0.00015725246620387287,
+      "loss": 0.6093,
+      "step": 590
+    },
+    {
+      "epoch": 0.21553610503282275,
+      "grad_norm": 0.1748688966035843,
+      "learning_rate": 0.00015717939349652906,
+      "loss": 0.6981,
+      "step": 591
+    },
+    {
+      "epoch": 0.21590080233406272,
+      "grad_norm": 0.15914174914360046,
+      "learning_rate": 0.00015710632078918526,
+      "loss": 0.5255,
+      "step": 592
+    },
+    {
+      "epoch": 0.2162654996353027,
+      "grad_norm": 0.16605016589164734,
+      "learning_rate": 0.00015703324808184145,
+      "loss": 0.613,
+      "step": 593
+    },
+    {
+      "epoch": 0.21663019693654267,
+      "grad_norm": 0.13033540546894073,
+      "learning_rate": 0.00015696017537449763,
+      "loss": 0.2626,
+      "step": 594
+    },
+    {
+      "epoch": 0.21699489423778265,
+      "grad_norm": 0.17081047594547272,
+      "learning_rate": 0.00015688710266715384,
+      "loss": 0.6266,
+      "step": 595
+    },
+    {
+      "epoch": 0.21735959153902262,
+      "grad_norm": 0.18738578259944916,
+      "learning_rate": 0.00015681402995981002,
+      "loss": 0.7505,
+      "step": 596
+    },
+    {
+      "epoch": 0.21772428884026257,
+      "grad_norm": 0.18220360577106476,
+      "learning_rate": 0.0001567409572524662,
+      "loss": 0.6359,
+      "step": 597
+    },
+    {
+      "epoch": 0.21808898614150254,
+      "grad_norm": 0.12919798493385315,
+      "learning_rate": 0.0001566678845451224,
+      "loss": 0.2671,
+      "step": 598
+    },
+    {
+      "epoch": 0.21845368344274252,
+      "grad_norm": 0.19884516298770905,
+      "learning_rate": 0.0001565948118377786,
+      "loss": 0.7276,
+      "step": 599
+    },
+    {
+      "epoch": 0.2188183807439825,
+      "grad_norm": 0.21019931137561798,
+      "learning_rate": 0.0001565217391304348,
+      "loss": 0.8272,
+      "step": 600
+    },
+    {
+      "epoch": 0.21918307804522247,
+      "grad_norm": 0.18614532053470612,
+      "learning_rate": 0.00015644866642309098,
+      "loss": 0.6676,
+      "step": 601
+    },
+    {
+      "epoch": 0.21954777534646244,
+      "grad_norm": 0.1795649528503418,
+      "learning_rate": 0.00015637559371574716,
+      "loss": 0.7105,
+      "step": 602
+    },
+    {
+      "epoch": 0.21991247264770242,
+      "grad_norm": 0.2079457938671112,
+      "learning_rate": 0.00015630252100840337,
+      "loss": 1.0025,
+      "step": 603
+    },
+    {
+      "epoch": 0.22027716994894236,
+      "grad_norm": 0.16998709738254547,
+      "learning_rate": 0.00015622944830105958,
+      "loss": 0.6432,
+      "step": 604
+    },
+    {
+      "epoch": 0.22064186725018234,
+      "grad_norm": 0.20293568074703217,
+      "learning_rate": 0.00015615637559371576,
+      "loss": 0.9977,
+      "step": 605
+    },
+    {
+      "epoch": 0.2210065645514223,
+      "grad_norm": 0.1662643551826477,
+      "learning_rate": 0.00015608330288637194,
+      "loss": 0.5779,
+      "step": 606
+    },
+    {
+      "epoch": 0.2213712618526623,
+      "grad_norm": 0.21681144833564758,
+      "learning_rate": 0.00015601023017902815,
+      "loss": 0.9133,
+      "step": 607
+    },
+    {
+      "epoch": 0.22173595915390226,
+      "grad_norm": 0.1861148327589035,
+      "learning_rate": 0.00015593715747168433,
+      "loss": 0.854,
+      "step": 608
+    },
+    {
+      "epoch": 0.22210065645514224,
+      "grad_norm": 0.15234194695949554,
+      "learning_rate": 0.00015586408476434054,
+      "loss": 0.5688,
+      "step": 609
+    },
+    {
+      "epoch": 0.2224653537563822,
+      "grad_norm": 0.18444177508354187,
+      "learning_rate": 0.00015579101205699672,
+      "loss": 0.8598,
+      "step": 610
+    },
+    {
+      "epoch": 0.22283005105762219,
+      "grad_norm": 0.1650354266166687,
+      "learning_rate": 0.0001557179393496529,
+      "loss": 0.5645,
+      "step": 611
+    },
+    {
+      "epoch": 0.22319474835886213,
+      "grad_norm": 0.1783570498228073,
+      "learning_rate": 0.0001556448666423091,
+      "loss": 0.6918,
+      "step": 612
+    },
+    {
+      "epoch": 0.2235594456601021,
+      "grad_norm": 0.201633483171463,
+      "learning_rate": 0.00015557179393496532,
+      "loss": 0.9097,
+      "step": 613
+    },
+    {
+      "epoch": 0.22392414296134208,
+      "grad_norm": 0.1808750182390213,
+      "learning_rate": 0.00015549872122762147,
+      "loss": 0.5749,
+      "step": 614
+    },
+    {
+      "epoch": 0.22428884026258206,
+      "grad_norm": 0.18188488483428955,
+      "learning_rate": 0.00015542564852027768,
+      "loss": 0.7137,
+      "step": 615
+    },
+    {
+      "epoch": 0.22465353756382203,
+      "grad_norm": 0.1791074126958847,
+      "learning_rate": 0.0001553525758129339,
+      "loss": 0.8679,
+      "step": 616
+    },
+    {
+      "epoch": 0.225018234865062,
+      "grad_norm": 0.18888451159000397,
+      "learning_rate": 0.00015527950310559007,
+      "loss": 0.8296,
+      "step": 617
+    },
+    {
+      "epoch": 0.22538293216630198,
+      "grad_norm": 0.1985502392053604,
+      "learning_rate": 0.00015520643039824628,
+      "loss": 0.8388,
+      "step": 618
+    },
+    {
+      "epoch": 0.22574762946754193,
+      "grad_norm": 0.20040510594844818,
+      "learning_rate": 0.00015513335769090246,
+      "loss": 0.8381,
+      "step": 619
+    },
+    {
+      "epoch": 0.2261123267687819,
+      "grad_norm": 0.20879292488098145,
+      "learning_rate": 0.00015506028498355864,
+      "loss": 0.9998,
+      "step": 620
+    },
+    {
+      "epoch": 0.22647702407002188,
+      "grad_norm": 0.18888017535209656,
+      "learning_rate": 0.00015498721227621485,
+      "loss": 0.6915,
+      "step": 621
+    },
+    {
+      "epoch": 0.22684172137126185,
+      "grad_norm": 0.19784311950206757,
+      "learning_rate": 0.00015491413956887103,
+      "loss": 1.054,
+      "step": 622
+    },
+    {
+      "epoch": 0.22720641867250183,
+      "grad_norm": 0.16005344688892365,
+      "learning_rate": 0.0001548410668615272,
+      "loss": 0.4895,
+      "step": 623
+    },
+    {
+      "epoch": 0.2275711159737418,
+      "grad_norm": 0.20644895732402802,
+      "learning_rate": 0.00015476799415418342,
+      "loss": 0.8316,
+      "step": 624
+    },
+    {
+      "epoch": 0.22793581327498177,
+      "grad_norm": 0.18263429403305054,
+      "learning_rate": 0.00015469492144683963,
+      "loss": 0.7931,
+      "step": 625
+    },
+    {
+      "epoch": 0.22830051057622175,
+      "grad_norm": 0.18260689079761505,
+      "learning_rate": 0.0001546218487394958,
+      "loss": 0.6584,
+      "step": 626
+    },
+    {
+      "epoch": 0.2286652078774617,
+      "grad_norm": 0.20142361521720886,
+      "learning_rate": 0.00015454877603215202,
+      "loss": 0.7865,
+      "step": 627
+    },
+    {
+      "epoch": 0.22902990517870167,
+      "grad_norm": 0.163302943110466,
+      "learning_rate": 0.0001544757033248082,
+      "loss": 0.6348,
+      "step": 628
+    },
+    {
+      "epoch": 0.22939460247994164,
+      "grad_norm": 0.1729688048362732,
+      "learning_rate": 0.00015440263061746438,
+      "loss": 0.6776,
+      "step": 629
+    },
+    {
+      "epoch": 0.22975929978118162,
+      "grad_norm": 0.167247012257576,
+      "learning_rate": 0.0001543295579101206,
+      "loss": 0.6274,
+      "step": 630
+    },
+    {
+      "epoch": 0.2301239970824216,
+      "grad_norm": 0.17160290479660034,
+      "learning_rate": 0.00015425648520277677,
+      "loss": 0.6194,
+      "step": 631
+    },
+    {
+      "epoch": 0.23048869438366157,
+      "grad_norm": 0.17698507010936737,
+      "learning_rate": 0.00015418341249543295,
+      "loss": 0.7014,
+      "step": 632
+    },
+    {
+      "epoch": 0.23085339168490154,
+      "grad_norm": 0.1988571584224701,
+      "learning_rate": 0.00015411033978808916,
+      "loss": 0.8195,
+      "step": 633
+    },
+    {
+      "epoch": 0.2312180889861415,
+      "grad_norm": 0.18846118450164795,
+      "learning_rate": 0.00015403726708074534,
+      "loss": 0.711,
+      "step": 634
+    },
+    {
+      "epoch": 0.23158278628738146,
+      "grad_norm": 0.1780976951122284,
+      "learning_rate": 0.00015396419437340155,
+      "loss": 0.6782,
+      "step": 635
+    },
+    {
+      "epoch": 0.23194748358862144,
+      "grad_norm": 0.17137964069843292,
+      "learning_rate": 0.00015389112166605773,
+      "loss": 0.6011,
+      "step": 636
+    },
+    {
+      "epoch": 0.2323121808898614,
+      "grad_norm": 0.18975742161273956,
+      "learning_rate": 0.0001538180489587139,
+      "loss": 0.6786,
+      "step": 637
+    },
+    {
+      "epoch": 0.2326768781911014,
+      "grad_norm": 0.17563198506832123,
+      "learning_rate": 0.00015374497625137012,
+      "loss": 0.6222,
+      "step": 638
+    },
+    {
+      "epoch": 0.23304157549234136,
+      "grad_norm": 0.17805379629135132,
+      "learning_rate": 0.00015367190354402633,
+      "loss": 0.6959,
+      "step": 639
+    },
+    {
+      "epoch": 0.23340627279358134,
+      "grad_norm": 0.1799224615097046,
+      "learning_rate": 0.0001535988308366825,
+      "loss": 0.6508,
+      "step": 640
+    },
+    {
+      "epoch": 0.2337709700948213,
+      "grad_norm": 0.1822444349527359,
+      "learning_rate": 0.0001535257581293387,
+      "loss": 0.6701,
+      "step": 641
+    },
+    {
+      "epoch": 0.23413566739606126,
+      "grad_norm": 0.18402761220932007,
+      "learning_rate": 0.0001534526854219949,
+      "loss": 0.7839,
+      "step": 642
+    },
+    {
+      "epoch": 0.23450036469730123,
+      "grad_norm": 0.1818612664937973,
+      "learning_rate": 0.00015337961271465108,
+      "loss": 0.7694,
+      "step": 643
+    },
+    {
+      "epoch": 0.2348650619985412,
+      "grad_norm": 0.17730721831321716,
+      "learning_rate": 0.0001533065400073073,
+      "loss": 0.6292,
+      "step": 644
+    },
+    {
+      "epoch": 0.23522975929978118,
+      "grad_norm": 0.1790621429681778,
+      "learning_rate": 0.00015323346729996347,
+      "loss": 0.729,
+      "step": 645
+    },
+    {
+      "epoch": 0.23559445660102116,
+      "grad_norm": 0.17606894671916962,
+      "learning_rate": 0.00015316039459261965,
+      "loss": 0.6977,
+      "step": 646
+    },
+    {
+      "epoch": 0.23595915390226113,
+      "grad_norm": 0.18288281559944153,
+      "learning_rate": 0.00015308732188527586,
+      "loss": 0.7042,
+      "step": 647
+    },
+    {
+      "epoch": 0.2363238512035011,
+      "grad_norm": 0.17753863334655762,
+      "learning_rate": 0.00015301424917793207,
+      "loss": 0.7813,
+      "step": 648
+    },
+    {
+      "epoch": 0.23668854850474105,
+      "grad_norm": 0.1613297015428543,
+      "learning_rate": 0.00015294117647058822,
+      "loss": 0.5043,
+      "step": 649
+    },
+    {
+      "epoch": 0.23705324580598103,
+      "grad_norm": 0.19487784802913666,
+      "learning_rate": 0.00015286810376324443,
+      "loss": 0.8338,
+      "step": 650
+    },
+    {
+      "epoch": 0.237417943107221,
+      "grad_norm": 0.17686185240745544,
+      "learning_rate": 0.00015279503105590064,
+      "loss": 0.6406,
+      "step": 651
+    },
+    {
+      "epoch": 0.23778264040846098,
+      "grad_norm": 0.18062898516654968,
+      "learning_rate": 0.00015272195834855682,
+      "loss": 0.772,
+      "step": 652
+    },
+    {
+      "epoch": 0.23814733770970095,
+      "grad_norm": 0.1765890270471573,
+      "learning_rate": 0.00015264888564121303,
+      "loss": 0.6313,
+      "step": 653
+    },
+    {
+      "epoch": 0.23851203501094093,
+      "grad_norm": 0.18338140845298767,
+      "learning_rate": 0.0001525758129338692,
+      "loss": 0.6944,
+      "step": 654
+    },
+    {
+      "epoch": 0.2388767323121809,
+      "grad_norm": 0.16114097833633423,
+      "learning_rate": 0.0001525027402265254,
+      "loss": 0.4307,
+      "step": 655
+    },
+    {
+      "epoch": 0.23924142961342085,
+      "grad_norm": 0.18049006164073944,
+      "learning_rate": 0.0001524296675191816,
+      "loss": 0.6055,
+      "step": 656
+    },
+    {
+      "epoch": 0.23960612691466082,
+      "grad_norm": 0.17693577706813812,
+      "learning_rate": 0.00015235659481183778,
+      "loss": 0.6953,
+      "step": 657
+    },
+    {
+      "epoch": 0.2399708242159008,
+      "grad_norm": 0.17377278208732605,
+      "learning_rate": 0.00015228352210449396,
+      "loss": 0.616,
+      "step": 658
+    },
+    {
+      "epoch": 0.24033552151714077,
+      "grad_norm": 0.2092607021331787,
+      "learning_rate": 0.00015221044939715017,
+      "loss": 0.9593,
+      "step": 659
+    },
+    {
+      "epoch": 0.24070021881838075,
+      "grad_norm": 0.18305638432502747,
+      "learning_rate": 0.00015213737668980638,
+      "loss": 0.7737,
+      "step": 660
+    },
+    {
+      "epoch": 0.24106491611962072,
+      "grad_norm": 0.17872866988182068,
+      "learning_rate": 0.00015206430398246256,
+      "loss": 0.6324,
+      "step": 661
+    },
+    {
+      "epoch": 0.2414296134208607,
+      "grad_norm": 0.16150373220443726,
+      "learning_rate": 0.00015199123127511874,
+      "loss": 0.4876,
+      "step": 662
+    },
+    {
+      "epoch": 0.24179431072210067,
+      "grad_norm": 0.16832824051380157,
+      "learning_rate": 0.00015191815856777495,
+      "loss": 0.5724,
+      "step": 663
+    },
+    {
+      "epoch": 0.24215900802334062,
+      "grad_norm": 0.17251615226268768,
+      "learning_rate": 0.00015184508586043113,
+      "loss": 0.6452,
+      "step": 664
+    },
+    {
+      "epoch": 0.2425237053245806,
+      "grad_norm": 0.1856648325920105,
+      "learning_rate": 0.00015177201315308734,
+      "loss": 0.6863,
+      "step": 665
+    },
+    {
+      "epoch": 0.24288840262582057,
+      "grad_norm": 0.17599360644817352,
+      "learning_rate": 0.00015169894044574352,
+      "loss": 0.573,
+      "step": 666
+    },
+    {
+      "epoch": 0.24325309992706054,
+      "grad_norm": 0.1808532327413559,
+      "learning_rate": 0.0001516258677383997,
+      "loss": 0.6037,
+      "step": 667
+    },
+    {
+      "epoch": 0.24361779722830051,
+      "grad_norm": 0.18831709027290344,
+      "learning_rate": 0.0001515527950310559,
+      "loss": 0.8105,
+      "step": 668
+    },
+    {
+      "epoch": 0.2439824945295405,
+      "grad_norm": 0.1883680522441864,
+      "learning_rate": 0.0001514797223237121,
+      "loss": 0.7836,
+      "step": 669
+    },
+    {
+      "epoch": 0.24434719183078046,
+      "grad_norm": 0.18449294567108154,
+      "learning_rate": 0.0001514066496163683,
+      "loss": 0.74,
+      "step": 670
+    },
+    {
+      "epoch": 0.2447118891320204,
+      "grad_norm": 0.14520494639873505,
+      "learning_rate": 0.00015133357690902448,
+      "loss": 0.4257,
+      "step": 671
+    },
+    {
+      "epoch": 0.24507658643326038,
+      "grad_norm": 0.16999614238739014,
+      "learning_rate": 0.00015126050420168066,
+      "loss": 0.5913,
+      "step": 672
+    },
+    {
+      "epoch": 0.24544128373450036,
+      "grad_norm": 0.19084987044334412,
+      "learning_rate": 0.00015118743149433687,
+      "loss": 0.7187,
+      "step": 673
+    },
+    {
+      "epoch": 0.24580598103574033,
+      "grad_norm": 0.16345396637916565,
+      "learning_rate": 0.00015111435878699308,
+      "loss": 0.5708,
+      "step": 674
+    },
+    {
+      "epoch": 0.2461706783369803,
+      "grad_norm": 0.19265908002853394,
+      "learning_rate": 0.00015104128607964926,
+      "loss": 0.5983,
+      "step": 675
+    },
+    {
+      "epoch": 0.24653537563822028,
+      "grad_norm": 0.20187486708164215,
+      "learning_rate": 0.00015096821337230544,
+      "loss": 0.7784,
+      "step": 676
+    },
+    {
+      "epoch": 0.24690007293946026,
+      "grad_norm": 0.19124649465084076,
+      "learning_rate": 0.00015089514066496165,
+      "loss": 0.7033,
+      "step": 677
+    },
+    {
+      "epoch": 0.24726477024070023,
+      "grad_norm": 0.18420779705047607,
+      "learning_rate": 0.00015082206795761783,
+      "loss": 0.7342,
+      "step": 678
+    },
+    {
+      "epoch": 0.24762946754194018,
+      "grad_norm": 0.18304283916950226,
+      "learning_rate": 0.00015074899525027404,
+      "loss": 0.7119,
+      "step": 679
+    },
+    {
+      "epoch": 0.24799416484318015,
+      "grad_norm": 0.17313408851623535,
+      "learning_rate": 0.00015067592254293022,
+      "loss": 0.7298,
+      "step": 680
+    },
+    {
+      "epoch": 0.24835886214442013,
+      "grad_norm": 0.1871861219406128,
+      "learning_rate": 0.0001506028498355864,
+      "loss": 0.6186,
+      "step": 681
+    },
+    {
+      "epoch": 0.2487235594456601,
+      "grad_norm": 0.20435944199562073,
+      "learning_rate": 0.0001505297771282426,
+      "loss": 0.843,
+      "step": 682
+    },
+    {
+      "epoch": 0.24908825674690008,
+      "grad_norm": 0.16381756961345673,
+      "learning_rate": 0.00015045670442089882,
+      "loss": 0.5857,
+      "step": 683
+    },
+    {
+      "epoch": 0.24945295404814005,
+      "grad_norm": 0.18176475167274475,
+      "learning_rate": 0.00015038363171355497,
+      "loss": 0.7148,
+      "step": 684
+    },
+    {
+      "epoch": 0.24981765134938003,
+      "grad_norm": 0.1591276228427887,
+      "learning_rate": 0.00015031055900621118,
+      "loss": 0.6348,
+      "step": 685
+    },
+    {
+      "epoch": 0.25018234865062,
+      "grad_norm": 0.16380302608013153,
+      "learning_rate": 0.0001502374862988674,
+      "loss": 0.5438,
+      "step": 686
+    },
+    {
+      "epoch": 0.25054704595185995,
+      "grad_norm": 0.16611479222774506,
+      "learning_rate": 0.00015016441359152357,
+      "loss": 0.509,
+      "step": 687
+    },
+    {
+      "epoch": 0.25091174325309995,
+      "grad_norm": 0.188828706741333,
+      "learning_rate": 0.00015009134088417978,
+      "loss": 0.7431,
+      "step": 688
+    },
+    {
+      "epoch": 0.2512764405543399,
+      "grad_norm": 0.16808100044727325,
+      "learning_rate": 0.00015001826817683596,
+      "loss": 0.5523,
+      "step": 689
+    },
+    {
+      "epoch": 0.25164113785557984,
+      "grad_norm": 0.20357143878936768,
+      "learning_rate": 0.00014994519546949214,
+      "loss": 0.7993,
+      "step": 690
+    },
+    {
+      "epoch": 0.25200583515681985,
+      "grad_norm": 0.18831704556941986,
+      "learning_rate": 0.00014987212276214835,
+      "loss": 0.7521,
+      "step": 691
+    },
+    {
+      "epoch": 0.2523705324580598,
+      "grad_norm": 0.17688477039337158,
+      "learning_rate": 0.00014979905005480453,
+      "loss": 0.6743,
+      "step": 692
+    },
+    {
+      "epoch": 0.2527352297592998,
+      "grad_norm": 0.1944332718849182,
+      "learning_rate": 0.00014972597734746071,
+      "loss": 0.869,
+      "step": 693
+    },
+    {
+      "epoch": 0.25309992706053974,
+      "grad_norm": 0.1805860847234726,
+      "learning_rate": 0.00014965290464011692,
+      "loss": 0.7884,
+      "step": 694
+    },
+    {
+      "epoch": 0.25346462436177974,
+      "grad_norm": 0.149339959025383,
+      "learning_rate": 0.00014957983193277313,
+      "loss": 0.4521,
+      "step": 695
+    },
+    {
+      "epoch": 0.2538293216630197,
+      "grad_norm": 0.16970299184322357,
+      "learning_rate": 0.0001495067592254293,
+      "loss": 0.6262,
+      "step": 696
+    },
+    {
+      "epoch": 0.25419401896425964,
+      "grad_norm": 0.17406289279460907,
+      "learning_rate": 0.0001494336865180855,
+      "loss": 0.7258,
+      "step": 697
+    },
+    {
+      "epoch": 0.25455871626549964,
+      "grad_norm": 0.18558435142040253,
+      "learning_rate": 0.0001493606138107417,
+      "loss": 0.6835,
+      "step": 698
+    },
+    {
+      "epoch": 0.2549234135667396,
+      "grad_norm": 0.16771887242794037,
+      "learning_rate": 0.00014928754110339788,
+      "loss": 0.506,
+      "step": 699
+    },
+    {
+      "epoch": 0.2552881108679796,
+      "grad_norm": 0.2056199610233307,
+      "learning_rate": 0.0001492144683960541,
+      "loss": 0.8879,
+      "step": 700
+    },
+    {
+      "epoch": 0.25565280816921954,
+      "grad_norm": 0.16422101855278015,
+      "learning_rate": 0.00014914139568871027,
+      "loss": 0.4785,
+      "step": 701
+    },
+    {
+      "epoch": 0.25601750547045954,
+      "grad_norm": 0.17392300069332123,
+      "learning_rate": 0.00014906832298136645,
+      "loss": 0.667,
+      "step": 702
+    },
+    {
+      "epoch": 0.2563822027716995,
+      "grad_norm": 0.18036240339279175,
+      "learning_rate": 0.00014899525027402266,
+      "loss": 0.6646,
+      "step": 703
+    },
+    {
+      "epoch": 0.2567469000729395,
+      "grad_norm": 0.19860827922821045,
+      "learning_rate": 0.00014892217756667884,
+      "loss": 0.8341,
+      "step": 704
+    },
+    {
+      "epoch": 0.25711159737417943,
+      "grad_norm": 0.15973514318466187,
+      "learning_rate": 0.00014884910485933505,
+      "loss": 0.5357,
+      "step": 705
+    },
+    {
+      "epoch": 0.2574762946754194,
+      "grad_norm": 0.17433969676494598,
+      "learning_rate": 0.00014877603215199123,
+      "loss": 0.6615,
+      "step": 706
+    },
+    {
+      "epoch": 0.2578409919766594,
+      "grad_norm": 0.17215828597545624,
+      "learning_rate": 0.00014870295944464742,
+      "loss": 0.547,
+      "step": 707
+    },
+    {
+      "epoch": 0.25820568927789933,
+      "grad_norm": 0.19459031522274017,
+      "learning_rate": 0.00014862988673730362,
+      "loss": 0.8744,
+      "step": 708
+    },
+    {
+      "epoch": 0.25857038657913933,
+      "grad_norm": 0.18972012400627136,
+      "learning_rate": 0.00014855681402995983,
+      "loss": 0.7687,
+      "step": 709
+    },
+    {
+      "epoch": 0.2589350838803793,
+      "grad_norm": 0.17814265191555023,
+      "learning_rate": 0.00014848374132261599,
+      "loss": 0.627,
+      "step": 710
+    },
+    {
+      "epoch": 0.2592997811816193,
+      "grad_norm": 0.19156868755817413,
+      "learning_rate": 0.0001484106686152722,
+      "loss": 0.882,
+      "step": 711
+    },
+    {
+      "epoch": 0.25966447848285923,
+      "grad_norm": 0.17542894184589386,
+      "learning_rate": 0.0001483375959079284,
+      "loss": 0.7863,
+      "step": 712
+    },
+    {
+      "epoch": 0.2600291757840992,
+      "grad_norm": 0.15451830625534058,
+      "learning_rate": 0.00014826452320058458,
+      "loss": 0.5987,
+      "step": 713
+    },
+    {
+      "epoch": 0.2603938730853392,
+      "grad_norm": 0.17926959693431854,
+      "learning_rate": 0.0001481914504932408,
+      "loss": 0.6238,
+      "step": 714
+    },
+    {
+      "epoch": 0.2607585703865791,
+      "grad_norm": 0.15617339313030243,
+      "learning_rate": 0.00014811837778589697,
+      "loss": 0.611,
+      "step": 715
+    },
+    {
+      "epoch": 0.2611232676878191,
+      "grad_norm": 0.18667645752429962,
+      "learning_rate": 0.00014804530507855316,
+      "loss": 0.657,
+      "step": 716
+    },
+    {
+      "epoch": 0.2614879649890591,
+      "grad_norm": 0.1547141671180725,
+      "learning_rate": 0.00014797223237120936,
+      "loss": 0.4469,
+      "step": 717
+    },
+    {
+      "epoch": 0.2618526622902991,
+      "grad_norm": 0.18968114256858826,
+      "learning_rate": 0.00014789915966386557,
+      "loss": 0.8392,
+      "step": 718
+    },
+    {
+      "epoch": 0.262217359591539,
+      "grad_norm": 0.1685071587562561,
+      "learning_rate": 0.00014782608695652173,
+      "loss": 0.5476,
+      "step": 719
+    },
+    {
+      "epoch": 0.26258205689277897,
+      "grad_norm": 0.18805775046348572,
+      "learning_rate": 0.00014775301424917793,
+      "loss": 0.7899,
+      "step": 720
+    },
+    {
+      "epoch": 0.262946754194019,
+      "grad_norm": 0.1985326111316681,
+      "learning_rate": 0.00014767994154183414,
+      "loss": 0.8315,
+      "step": 721
+    },
+    {
+      "epoch": 0.2633114514952589,
+      "grad_norm": 0.18817968666553497,
+      "learning_rate": 0.00014760686883449032,
+      "loss": 0.8077,
+      "step": 722
+    },
+    {
+      "epoch": 0.2636761487964989,
+      "grad_norm": 0.1707736700773239,
+      "learning_rate": 0.00014753379612714653,
+      "loss": 0.5055,
+      "step": 723
+    },
+    {
+      "epoch": 0.26404084609773887,
+      "grad_norm": 0.17432935535907745,
+      "learning_rate": 0.00014746072341980271,
+      "loss": 0.6428,
+      "step": 724
+    },
+    {
+      "epoch": 0.26440554339897887,
+      "grad_norm": 0.19772885739803314,
+      "learning_rate": 0.0001473876507124589,
+      "loss": 0.8951,
+      "step": 725
+    },
+    {
+      "epoch": 0.2647702407002188,
+      "grad_norm": 0.18301154673099518,
+      "learning_rate": 0.0001473145780051151,
+      "loss": 0.8608,
+      "step": 726
+    },
+    {
+      "epoch": 0.26513493800145876,
+      "grad_norm": 0.17923039197921753,
+      "learning_rate": 0.00014724150529777129,
+      "loss": 0.674,
+      "step": 727
+    },
+    {
+      "epoch": 0.26549963530269877,
+      "grad_norm": 0.1921747922897339,
+      "learning_rate": 0.00014716843259042747,
+      "loss": 0.7752,
+      "step": 728
+    },
+    {
+      "epoch": 0.2658643326039387,
+      "grad_norm": 0.17289309203624725,
+      "learning_rate": 0.00014709535988308367,
+      "loss": 0.7519,
+      "step": 729
+    },
+    {
+      "epoch": 0.2662290299051787,
+      "grad_norm": 0.1981101930141449,
+      "learning_rate": 0.00014702228717573988,
+      "loss": 0.8573,
+      "step": 730
+    },
+    {
+      "epoch": 0.26659372720641866,
+      "grad_norm": 0.16859839856624603,
+      "learning_rate": 0.00014694921446839606,
+      "loss": 0.6,
+      "step": 731
+    },
+    {
+      "epoch": 0.26695842450765866,
+      "grad_norm": 0.15981674194335938,
+      "learning_rate": 0.00014687614176105225,
+      "loss": 0.434,
+      "step": 732
+    },
+    {
+      "epoch": 0.2673231218088986,
+      "grad_norm": 0.15087354183197021,
+      "learning_rate": 0.00014680306905370845,
+      "loss": 0.4438,
+      "step": 733
+    },
+    {
+      "epoch": 0.26768781911013856,
+      "grad_norm": 0.21009470522403717,
+      "learning_rate": 0.00014672999634636464,
+      "loss": 0.92,
+      "step": 734
+    },
+    {
+      "epoch": 0.26805251641137856,
+      "grad_norm": 0.17488998174667358,
+      "learning_rate": 0.00014665692363902084,
+      "loss": 0.6819,
+      "step": 735
+    },
+    {
+      "epoch": 0.2684172137126185,
+      "grad_norm": 0.19854167103767395,
+      "learning_rate": 0.00014658385093167703,
+      "loss": 0.832,
+      "step": 736
+    },
+    {
+      "epoch": 0.2687819110138585,
+      "grad_norm": 0.17115044593811035,
+      "learning_rate": 0.0001465107782243332,
+      "loss": 0.6293,
+      "step": 737
+    },
+    {
+      "epoch": 0.26914660831509846,
+      "grad_norm": 0.14838315546512604,
+      "learning_rate": 0.00014643770551698941,
+      "loss": 0.3817,
+      "step": 738
+    },
+    {
+      "epoch": 0.26951130561633846,
+      "grad_norm": 0.1810576319694519,
+      "learning_rate": 0.0001463646328096456,
+      "loss": 0.6395,
+      "step": 739
+    },
+    {
+      "epoch": 0.2698760029175784,
+      "grad_norm": 0.2036665976047516,
+      "learning_rate": 0.0001462915601023018,
+      "loss": 0.855,
+      "step": 740
+    },
+    {
+      "epoch": 0.2702407002188184,
+      "grad_norm": 0.1664070338010788,
+      "learning_rate": 0.00014621848739495799,
+      "loss": 0.6324,
+      "step": 741
+    },
+    {
+      "epoch": 0.27060539752005836,
+      "grad_norm": 0.21777962148189545,
+      "learning_rate": 0.00014614541468761417,
+      "loss": 0.8487,
+      "step": 742
+    },
+    {
+      "epoch": 0.2709700948212983,
+      "grad_norm": 0.16543632745742798,
+      "learning_rate": 0.00014607234198027038,
+      "loss": 0.5855,
+      "step": 743
+    },
+    {
+      "epoch": 0.2713347921225383,
+      "grad_norm": 0.18875911831855774,
+      "learning_rate": 0.00014599926927292658,
+      "loss": 0.7444,
+      "step": 744
+    },
+    {
+      "epoch": 0.27169948942377825,
+      "grad_norm": 0.19937555491924286,
+      "learning_rate": 0.00014592619656558274,
+      "loss": 0.8593,
+      "step": 745
+    },
+    {
+      "epoch": 0.27206418672501825,
+      "grad_norm": 0.17685411870479584,
+      "learning_rate": 0.00014585312385823895,
+      "loss": 0.7563,
+      "step": 746
+    },
+    {
+      "epoch": 0.2724288840262582,
+      "grad_norm": 0.16490668058395386,
+      "learning_rate": 0.00014578005115089515,
+      "loss": 0.6355,
+      "step": 747
+    },
+    {
+      "epoch": 0.2727935813274982,
+      "grad_norm": 0.17879438400268555,
+      "learning_rate": 0.00014570697844355134,
+      "loss": 0.7345,
+      "step": 748
+    },
+    {
+      "epoch": 0.27315827862873815,
+      "grad_norm": 0.15404187142848969,
+      "learning_rate": 0.00014563390573620754,
+      "loss": 0.4345,
+      "step": 749
+    },
+    {
+      "epoch": 0.2735229759299781,
+      "grad_norm": 0.18319928646087646,
+      "learning_rate": 0.00014556083302886373,
+      "loss": 0.6891,
+      "step": 750
+    },
+    {
+      "epoch": 0.2738876732312181,
+      "grad_norm": 0.173502117395401,
+      "learning_rate": 0.0001454877603215199,
+      "loss": 0.6031,
+      "step": 751
+    },
+    {
+      "epoch": 0.27425237053245805,
+      "grad_norm": 0.16543245315551758,
+      "learning_rate": 0.00014541468761417612,
+      "loss": 0.5698,
+      "step": 752
+    },
+    {
+      "epoch": 0.27461706783369805,
+      "grad_norm": 0.16900089383125305,
+      "learning_rate": 0.00014534161490683232,
+      "loss": 0.7195,
+      "step": 753
+    },
+    {
+      "epoch": 0.274981765134938,
+      "grad_norm": 0.1920769065618515,
+      "learning_rate": 0.00014526854219948848,
+      "loss": 0.7794,
+      "step": 754
+    },
+    {
+      "epoch": 0.275346462436178,
+      "grad_norm": 0.15764226019382477,
+      "learning_rate": 0.0001451954694921447,
+      "loss": 0.5931,
+      "step": 755
+    },
+    {
+      "epoch": 0.27571115973741794,
+      "grad_norm": 0.16820603609085083,
+      "learning_rate": 0.0001451223967848009,
+      "loss": 0.5739,
+      "step": 756
+    },
+    {
+      "epoch": 0.2760758570386579,
+      "grad_norm": 0.15538586676120758,
+      "learning_rate": 0.00014504932407745708,
+      "loss": 0.4564,
+      "step": 757
+    },
+    {
+      "epoch": 0.2764405543398979,
+      "grad_norm": 0.1703750640153885,
+      "learning_rate": 0.00014497625137011328,
+      "loss": 0.6289,
+      "step": 758
+    },
+    {
+      "epoch": 0.27680525164113784,
+      "grad_norm": 0.17181243002414703,
+      "learning_rate": 0.00014490317866276947,
+      "loss": 0.6888,
+      "step": 759
+    },
+    {
+      "epoch": 0.27716994894237784,
+      "grad_norm": 0.17971667647361755,
+      "learning_rate": 0.00014483010595542565,
+      "loss": 0.6178,
+      "step": 760
+    },
+    {
+      "epoch": 0.2775346462436178,
+      "grad_norm": 0.16959045827388763,
+      "learning_rate": 0.00014475703324808186,
+      "loss": 0.6029,
+      "step": 761
+    },
+    {
+      "epoch": 0.2778993435448578,
+      "grad_norm": 0.18184369802474976,
+      "learning_rate": 0.00014468396054073804,
+      "loss": 0.769,
+      "step": 762
+    },
+    {
+      "epoch": 0.27826404084609774,
+      "grad_norm": 0.18276362121105194,
+      "learning_rate": 0.00014461088783339422,
+      "loss": 0.6981,
+      "step": 763
+    },
+    {
+      "epoch": 0.2786287381473377,
+      "grad_norm": 0.14473621547222137,
+      "learning_rate": 0.00014453781512605043,
+      "loss": 0.4821,
+      "step": 764
+    },
+    {
+      "epoch": 0.2789934354485777,
+      "grad_norm": 0.16732734441757202,
+      "learning_rate": 0.00014446474241870664,
+      "loss": 0.5346,
+      "step": 765
+    },
+    {
+      "epoch": 0.27935813274981763,
+      "grad_norm": 0.17172355949878693,
+      "learning_rate": 0.00014439166971136282,
+      "loss": 0.6361,
+      "step": 766
+    },
+    {
+      "epoch": 0.27972283005105764,
+      "grad_norm": 0.19567203521728516,
+      "learning_rate": 0.000144318597004019,
+      "loss": 0.859,
+      "step": 767
+    },
+    {
+      "epoch": 0.2800875273522976,
+      "grad_norm": 0.1898382008075714,
+      "learning_rate": 0.0001442455242966752,
+      "loss": 0.8217,
+      "step": 768
+    },
+    {
+      "epoch": 0.2804522246535376,
+      "grad_norm": 0.1976533979177475,
+      "learning_rate": 0.0001441724515893314,
+      "loss": 0.8709,
+      "step": 769
+    },
+    {
+      "epoch": 0.28081692195477753,
+      "grad_norm": 0.15758675336837769,
+      "learning_rate": 0.0001440993788819876,
+      "loss": 0.5522,
+      "step": 770
+    },
+    {
+      "epoch": 0.2811816192560175,
+      "grad_norm": 0.19185684621334076,
+      "learning_rate": 0.00014402630617464378,
+      "loss": 0.7182,
+      "step": 771
+    },
+    {
+      "epoch": 0.2815463165572575,
+      "grad_norm": 0.16831618547439575,
+      "learning_rate": 0.00014395323346729996,
+      "loss": 0.5841,
+      "step": 772
+    },
+    {
+      "epoch": 0.28191101385849743,
+      "grad_norm": 0.207386776804924,
+      "learning_rate": 0.00014388016075995617,
+      "loss": 0.8525,
+      "step": 773
+    },
+    {
+      "epoch": 0.28227571115973743,
+      "grad_norm": 0.17020228505134583,
+      "learning_rate": 0.00014380708805261235,
+      "loss": 0.5534,
+      "step": 774
+    },
+    {
+      "epoch": 0.2826404084609774,
+      "grad_norm": 0.18354672193527222,
+      "learning_rate": 0.00014373401534526856,
+      "loss": 0.7006,
+      "step": 775
+    },
+    {
+      "epoch": 0.2830051057622174,
+      "grad_norm": 0.16657932102680206,
+      "learning_rate": 0.00014366094263792474,
+      "loss": 0.6063,
+      "step": 776
+    },
+    {
+      "epoch": 0.2833698030634573,
+      "grad_norm": 0.15975604951381683,
+      "learning_rate": 0.00014358786993058092,
+      "loss": 0.4616,
+      "step": 777
+    },
+    {
+      "epoch": 0.28373450036469733,
+      "grad_norm": 0.17766812443733215,
+      "learning_rate": 0.00014351479722323713,
+      "loss": 0.7521,
+      "step": 778
+    },
+    {
+      "epoch": 0.2840991976659373,
+      "grad_norm": 0.18797791004180908,
+      "learning_rate": 0.00014344172451589334,
+      "loss": 0.7563,
+      "step": 779
+    },
+    {
+      "epoch": 0.2844638949671772,
+      "grad_norm": 0.17721839249134064,
+      "learning_rate": 0.0001433686518085495,
+      "loss": 0.8557,
+      "step": 780
+    },
+    {
+      "epoch": 0.2848285922684172,
+      "grad_norm": 0.18058152496814728,
+      "learning_rate": 0.0001432955791012057,
+      "loss": 0.6517,
+      "step": 781
+    },
+    {
+      "epoch": 0.28519328956965717,
+      "grad_norm": 0.1826111078262329,
+      "learning_rate": 0.0001432225063938619,
+      "loss": 0.6134,
+      "step": 782
+    },
+    {
+      "epoch": 0.2855579868708972,
+      "grad_norm": 0.1896212249994278,
+      "learning_rate": 0.0001431494336865181,
+      "loss": 0.7902,
+      "step": 783
+    },
+    {
+      "epoch": 0.2859226841721371,
+      "grad_norm": 0.1763574331998825,
+      "learning_rate": 0.0001430763609791743,
+      "loss": 0.6796,
+      "step": 784
+    },
+    {
+      "epoch": 0.2862873814733771,
+      "grad_norm": 0.16747136414051056,
+      "learning_rate": 0.00014300328827183048,
+      "loss": 0.5986,
+      "step": 785
+    },
+    {
+      "epoch": 0.28665207877461707,
+      "grad_norm": 0.18107502162456512,
+      "learning_rate": 0.00014293021556448666,
+      "loss": 0.6232,
+      "step": 786
+    },
+    {
+      "epoch": 0.287016776075857,
+      "grad_norm": 0.17364108562469482,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 0.6362,
+      "step": 787
+    },
+    {
+      "epoch": 0.287381473377097,
+      "grad_norm": 0.21109223365783691,
+      "learning_rate": 0.00014278407014979908,
+      "loss": 0.9846,
+      "step": 788
+    },
+    {
+      "epoch": 0.28774617067833697,
+      "grad_norm": 0.1875358521938324,
+      "learning_rate": 0.00014271099744245523,
+      "loss": 0.7414,
+      "step": 789
+    },
+    {
+      "epoch": 0.28811086797957697,
+      "grad_norm": 0.17905642092227936,
+      "learning_rate": 0.00014263792473511144,
+      "loss": 0.875,
+      "step": 790
+    },
+    {
+      "epoch": 0.2884755652808169,
+      "grad_norm": 0.16830816864967346,
+      "learning_rate": 0.00014256485202776765,
+      "loss": 0.6712,
+      "step": 791
+    },
+    {
+      "epoch": 0.2888402625820569,
+      "grad_norm": 0.1730622798204422,
+      "learning_rate": 0.00014249177932042383,
+      "loss": 0.5806,
+      "step": 792
+    },
+    {
+      "epoch": 0.28920495988329686,
+      "grad_norm": 0.15828505158424377,
+      "learning_rate": 0.00014241870661308,
+      "loss": 0.4814,
+      "step": 793
+    },
+    {
+      "epoch": 0.2895696571845368,
+      "grad_norm": 0.16246497631072998,
+      "learning_rate": 0.00014234563390573622,
+      "loss": 0.4964,
+      "step": 794
+    },
+    {
+      "epoch": 0.2899343544857768,
+      "grad_norm": 0.1828710287809372,
+      "learning_rate": 0.0001422725611983924,
+      "loss": 0.7139,
+      "step": 795
+    },
+    {
+      "epoch": 0.29029905178701676,
+      "grad_norm": 0.15251024067401886,
+      "learning_rate": 0.0001421994884910486,
+      "loss": 0.4463,
+      "step": 796
+    },
+    {
+      "epoch": 0.29066374908825676,
+      "grad_norm": 0.17780308425426483,
+      "learning_rate": 0.0001421264157837048,
+      "loss": 0.6125,
+      "step": 797
+    },
+    {
+      "epoch": 0.2910284463894967,
+      "grad_norm": 0.19187119603157043,
+      "learning_rate": 0.00014205334307636097,
+      "loss": 0.814,
+      "step": 798
+    },
+    {
+      "epoch": 0.2913931436907367,
+      "grad_norm": 0.20423109829425812,
+      "learning_rate": 0.00014198027036901718,
+      "loss": 0.8008,
+      "step": 799
+    },
+    {
+      "epoch": 0.29175784099197666,
+      "grad_norm": 0.17211808264255524,
+      "learning_rate": 0.0001419071976616734,
+      "loss": 0.5667,
+      "step": 800
+    },
+    {
+      "epoch": 0.2921225382932166,
+      "grad_norm": 0.20684252679347992,
+      "learning_rate": 0.00014183412495432957,
+      "loss": 0.7646,
+      "step": 801
+    },
+    {
+      "epoch": 0.2924872355944566,
+      "grad_norm": 0.15838950872421265,
+      "learning_rate": 0.00014176105224698575,
+      "loss": 0.5542,
+      "step": 802
+    },
+    {
+      "epoch": 0.29285193289569655,
+      "grad_norm": 0.19353005290031433,
+      "learning_rate": 0.00014168797953964196,
+      "loss": 0.7211,
+      "step": 803
+    },
+    {
+      "epoch": 0.29321663019693656,
+      "grad_norm": 0.1527222841978073,
+      "learning_rate": 0.00014161490683229814,
+      "loss": 0.5183,
+      "step": 804
+    },
+    {
+      "epoch": 0.2935813274981765,
+      "grad_norm": 0.15716342628002167,
+      "learning_rate": 0.00014154183412495435,
+      "loss": 0.5197,
+      "step": 805
+    },
+    {
+      "epoch": 0.2939460247994165,
+      "grad_norm": 0.2023271769285202,
+      "learning_rate": 0.00014146876141761053,
+      "loss": 0.8512,
+      "step": 806
+    },
+    {
+      "epoch": 0.29431072210065645,
+      "grad_norm": 0.17139078676700592,
+      "learning_rate": 0.0001413956887102667,
+      "loss": 0.5161,
+      "step": 807
+    },
+    {
+      "epoch": 0.2946754194018964,
+      "grad_norm": 0.15848784148693085,
+      "learning_rate": 0.00014132261600292292,
+      "loss": 0.6017,
+      "step": 808
+    },
+    {
+      "epoch": 0.2950401167031364,
+      "grad_norm": 0.1857565939426422,
+      "learning_rate": 0.0001412495432955791,
+      "loss": 0.7406,
+      "step": 809
+    },
+    {
+      "epoch": 0.29540481400437635,
+      "grad_norm": 0.17538581788539886,
+      "learning_rate": 0.0001411764705882353,
+      "loss": 0.615,
+      "step": 810
+    },
+    {
+      "epoch": 0.29576951130561635,
+      "grad_norm": 0.18965482711791992,
+      "learning_rate": 0.0001411033978808915,
+      "loss": 0.7869,
+      "step": 811
+    },
+    {
+      "epoch": 0.2961342086068563,
+      "grad_norm": 0.1757965087890625,
+      "learning_rate": 0.00014103032517354767,
+      "loss": 0.7217,
+      "step": 812
+    },
+    {
+      "epoch": 0.2964989059080963,
+      "grad_norm": 0.20843826234340668,
+      "learning_rate": 0.00014095725246620388,
+      "loss": 0.9802,
+      "step": 813
+    },
+    {
+      "epoch": 0.29686360320933625,
+      "grad_norm": 0.17155392467975616,
+      "learning_rate": 0.0001408841797588601,
+      "loss": 0.67,
+      "step": 814
+    },
+    {
+      "epoch": 0.29722830051057625,
+      "grad_norm": 0.1699131578207016,
+      "learning_rate": 0.00014081110705151624,
+      "loss": 0.5432,
+      "step": 815
+    },
+    {
+      "epoch": 0.2975929978118162,
+      "grad_norm": 0.2160065770149231,
+      "learning_rate": 0.00014073803434417245,
+      "loss": 0.9022,
+      "step": 816
+    },
+    {
+      "epoch": 0.29795769511305614,
+      "grad_norm": 0.17137810587882996,
+      "learning_rate": 0.00014066496163682866,
+      "loss": 0.5853,
+      "step": 817
+    },
+    {
+      "epoch": 0.29832239241429614,
+      "grad_norm": 0.18035869300365448,
+      "learning_rate": 0.00014059188892948484,
+      "loss": 0.6308,
+      "step": 818
+    },
+    {
+      "epoch": 0.2986870897155361,
+      "grad_norm": 0.15042151510715485,
+      "learning_rate": 0.00014051881622214105,
+      "loss": 0.4335,
+      "step": 819
+    },
+    {
+      "epoch": 0.2990517870167761,
+      "grad_norm": 0.19387514889240265,
+      "learning_rate": 0.00014044574351479723,
+      "loss": 0.8104,
+      "step": 820
+    },
+    {
+      "epoch": 0.29941648431801604,
+      "grad_norm": 0.1495147943496704,
+      "learning_rate": 0.0001403726708074534,
+      "loss": 0.4551,
+      "step": 821
+    },
+    {
+      "epoch": 0.29978118161925604,
+      "grad_norm": 0.1305743157863617,
+      "learning_rate": 0.00014029959810010962,
+      "loss": 0.3016,
+      "step": 822
+    },
+    {
+      "epoch": 0.300145878920496,
+      "grad_norm": 0.18238870799541473,
+      "learning_rate": 0.00014022652539276583,
+      "loss": 0.6105,
+      "step": 823
+    },
+    {
+      "epoch": 0.30051057622173594,
+      "grad_norm": 0.1913590431213379,
+      "learning_rate": 0.00014015345268542198,
+      "loss": 0.8108,
+      "step": 824
+    },
+    {
+      "epoch": 0.30087527352297594,
+      "grad_norm": 0.15131442248821259,
+      "learning_rate": 0.0001400803799780782,
+      "loss": 0.5041,
+      "step": 825
+    },
+    {
+      "epoch": 0.3012399708242159,
+      "grad_norm": 0.15665921568870544,
+      "learning_rate": 0.0001400073072707344,
+      "loss": 0.5691,
+      "step": 826
+    },
+    {
+      "epoch": 0.3016046681254559,
+      "grad_norm": 0.1675909459590912,
+      "learning_rate": 0.00013993423456339058,
+      "loss": 0.6212,
+      "step": 827
+    },
+    {
+      "epoch": 0.30196936542669583,
+      "grad_norm": 0.2064395546913147,
+      "learning_rate": 0.00013986116185604676,
+      "loss": 0.8267,
+      "step": 828
+    },
+    {
+      "epoch": 0.30233406272793584,
+      "grad_norm": 0.18026086688041687,
+      "learning_rate": 0.00013978808914870297,
+      "loss": 0.7102,
+      "step": 829
+    },
+    {
+      "epoch": 0.3026987600291758,
+      "grad_norm": 0.18619216978549957,
+      "learning_rate": 0.00013971501644135915,
+      "loss": 0.5986,
+      "step": 830
+    },
+    {
+      "epoch": 0.30306345733041573,
+      "grad_norm": 0.20318515598773956,
+      "learning_rate": 0.00013964194373401536,
+      "loss": 0.9558,
+      "step": 831
+    },
+    {
+      "epoch": 0.30342815463165573,
+      "grad_norm": 0.1473841667175293,
+      "learning_rate": 0.00013956887102667154,
+      "loss": 0.3906,
+      "step": 832
+    },
+    {
+      "epoch": 0.3037928519328957,
+      "grad_norm": 0.1872483193874359,
+      "learning_rate": 0.00013949579831932772,
+      "loss": 0.7164,
+      "step": 833
+    },
+    {
+      "epoch": 0.3041575492341357,
+      "grad_norm": 0.1955832690000534,
+      "learning_rate": 0.00013942272561198393,
+      "loss": 0.7684,
+      "step": 834
+    },
+    {
+      "epoch": 0.30452224653537563,
+      "grad_norm": 0.17416808009147644,
+      "learning_rate": 0.00013934965290464014,
+      "loss": 0.6543,
+      "step": 835
+    },
+    {
+      "epoch": 0.30488694383661563,
+      "grad_norm": 0.18365338444709778,
+      "learning_rate": 0.00013927658019729632,
+      "loss": 0.6797,
+      "step": 836
+    },
+    {
+      "epoch": 0.3052516411378556,
+      "grad_norm": 0.159471333026886,
+      "learning_rate": 0.0001392035074899525,
+      "loss": 0.5944,
+      "step": 837
+    },
+    {
+      "epoch": 0.3056163384390955,
+      "grad_norm": 0.1895028054714203,
+      "learning_rate": 0.0001391304347826087,
+      "loss": 0.7348,
+      "step": 838
+    },
+    {
+      "epoch": 0.3059810357403355,
+      "grad_norm": 0.19214889407157898,
+      "learning_rate": 0.0001390573620752649,
+      "loss": 0.8888,
+      "step": 839
+    },
+    {
+      "epoch": 0.3063457330415755,
+      "grad_norm": 0.18355461955070496,
+      "learning_rate": 0.0001389842893679211,
+      "loss": 0.7066,
+      "step": 840
+    },
+    {
+      "epoch": 0.3067104303428155,
+      "grad_norm": 0.20794498920440674,
+      "learning_rate": 0.00013891121666057728,
+      "loss": 0.7818,
+      "step": 841
+    },
+    {
+      "epoch": 0.3070751276440554,
+      "grad_norm": 0.20993681252002716,
+      "learning_rate": 0.00013883814395323346,
+      "loss": 0.8961,
+      "step": 842
+    },
+    {
+      "epoch": 0.3074398249452954,
+      "grad_norm": 0.17933285236358643,
+      "learning_rate": 0.00013876507124588967,
+      "loss": 0.7162,
+      "step": 843
+    },
+    {
+      "epoch": 0.3078045222465354,
+      "grad_norm": 0.1597253382205963,
+      "learning_rate": 0.00013869199853854585,
+      "loss": 0.4908,
+      "step": 844
+    },
+    {
+      "epoch": 0.3081692195477753,
+      "grad_norm": 0.17529501020908356,
+      "learning_rate": 0.00013861892583120206,
+      "loss": 0.5254,
+      "step": 845
+    },
+    {
+      "epoch": 0.3085339168490153,
+      "grad_norm": 0.19237765669822693,
+      "learning_rate": 0.00013854585312385824,
+      "loss": 0.7377,
+      "step": 846
+    },
+    {
+      "epoch": 0.30889861415025527,
+      "grad_norm": 0.18043018877506256,
+      "learning_rate": 0.00013847278041651442,
+      "loss": 0.6784,
+      "step": 847
+    },
+    {
+      "epoch": 0.30926331145149527,
+      "grad_norm": 0.1529788225889206,
+      "learning_rate": 0.00013839970770917063,
+      "loss": 0.5048,
+      "step": 848
+    },
+    {
+      "epoch": 0.3096280087527352,
+      "grad_norm": 0.18892444670200348,
+      "learning_rate": 0.00013832663500182684,
+      "loss": 0.7093,
+      "step": 849
+    },
+    {
+      "epoch": 0.3099927060539752,
+      "grad_norm": 0.17523162066936493,
+      "learning_rate": 0.000138253562294483,
+      "loss": 0.6947,
+      "step": 850
+    },
+    {
+      "epoch": 0.31035740335521517,
+      "grad_norm": 0.17124755680561066,
+      "learning_rate": 0.0001381804895871392,
+      "loss": 0.6009,
+      "step": 851
+    },
+    {
+      "epoch": 0.31072210065645517,
+      "grad_norm": 0.19225868582725525,
+      "learning_rate": 0.0001381074168797954,
+      "loss": 0.8058,
+      "step": 852
+    },
+    {
+      "epoch": 0.3110867979576951,
+      "grad_norm": 0.19115358591079712,
+      "learning_rate": 0.0001380343441724516,
+      "loss": 0.6689,
+      "step": 853
+    },
+    {
+      "epoch": 0.31145149525893506,
+      "grad_norm": 0.15115682780742645,
+      "learning_rate": 0.0001379612714651078,
+      "loss": 0.3348,
+      "step": 854
+    },
+    {
+      "epoch": 0.31181619256017507,
+      "grad_norm": 0.18653741478919983,
+      "learning_rate": 0.00013788819875776398,
+      "loss": 0.872,
+      "step": 855
+    },
+    {
+      "epoch": 0.312180889861415,
+      "grad_norm": 0.18661542236804962,
+      "learning_rate": 0.00013781512605042016,
+      "loss": 0.9351,
+      "step": 856
+    },
+    {
+      "epoch": 0.312545587162655,
+      "grad_norm": 0.1910124570131302,
+      "learning_rate": 0.00013774205334307637,
+      "loss": 0.8575,
+      "step": 857
+    },
+    {
+      "epoch": 0.31291028446389496,
+      "grad_norm": 0.18687231838703156,
+      "learning_rate": 0.00013766898063573258,
+      "loss": 0.7134,
+      "step": 858
+    },
+    {
+      "epoch": 0.31327498176513496,
+      "grad_norm": 0.17739079892635345,
+      "learning_rate": 0.00013759590792838873,
+      "loss": 0.6473,
+      "step": 859
+    },
+    {
+      "epoch": 0.3136396790663749,
+      "grad_norm": 0.16609562933444977,
+      "learning_rate": 0.00013752283522104494,
+      "loss": 0.511,
+      "step": 860
+    },
+    {
+      "epoch": 0.31400437636761486,
+      "grad_norm": 0.17272962629795074,
+      "learning_rate": 0.00013744976251370115,
+      "loss": 0.5939,
+      "step": 861
+    },
+    {
+      "epoch": 0.31436907366885486,
+      "grad_norm": 0.196741983294487,
+      "learning_rate": 0.00013737668980635733,
+      "loss": 0.6241,
+      "step": 862
+    },
+    {
+      "epoch": 0.3147337709700948,
+      "grad_norm": 0.16731449961662292,
+      "learning_rate": 0.00013730361709901351,
+      "loss": 0.6294,
+      "step": 863
+    },
+    {
+      "epoch": 0.3150984682713348,
+      "grad_norm": 0.17899174988269806,
+      "learning_rate": 0.00013723054439166972,
+      "loss": 0.724,
+      "step": 864
+    },
+    {
+      "epoch": 0.31546316557257476,
+      "grad_norm": 0.18915481865406036,
+      "learning_rate": 0.0001371574716843259,
+      "loss": 0.6508,
+      "step": 865
+    },
+    {
+      "epoch": 0.31582786287381476,
+      "grad_norm": 0.17585769295692444,
+      "learning_rate": 0.0001370843989769821,
+      "loss": 0.736,
+      "step": 866
+    },
+    {
+      "epoch": 0.3161925601750547,
+      "grad_norm": 0.16982930898666382,
+      "learning_rate": 0.0001370113262696383,
+      "loss": 0.5676,
+      "step": 867
+    },
+    {
+      "epoch": 0.31655725747629465,
+      "grad_norm": 0.17551635205745697,
+      "learning_rate": 0.00013693825356229447,
+      "loss": 0.621,
+      "step": 868
+    },
+    {
+      "epoch": 0.31692195477753465,
+      "grad_norm": 0.19762268662452698,
+      "learning_rate": 0.00013686518085495068,
+      "loss": 0.872,
+      "step": 869
+    },
+    {
+      "epoch": 0.3172866520787746,
+      "grad_norm": 0.16374340653419495,
+      "learning_rate": 0.0001367921081476069,
+      "loss": 0.4612,
+      "step": 870
+    },
+    {
+      "epoch": 0.3176513493800146,
+      "grad_norm": 0.20281797647476196,
+      "learning_rate": 0.00013671903544026307,
+      "loss": 0.8791,
+      "step": 871
+    },
+    {
+      "epoch": 0.31801604668125455,
+      "grad_norm": 0.1752830445766449,
+      "learning_rate": 0.00013664596273291925,
+      "loss": 0.7158,
+      "step": 872
+    },
+    {
+      "epoch": 0.31838074398249455,
+      "grad_norm": 0.1833350956439972,
+      "learning_rate": 0.00013657289002557546,
+      "loss": 0.7307,
+      "step": 873
+    },
+    {
+      "epoch": 0.3187454412837345,
+      "grad_norm": 0.158727765083313,
+      "learning_rate": 0.00013649981731823164,
+      "loss": 0.4426,
+      "step": 874
+    },
+    {
+      "epoch": 0.31911013858497445,
+      "grad_norm": 0.1881689429283142,
+      "learning_rate": 0.00013642674461088785,
+      "loss": 0.6806,
+      "step": 875
+    },
+    {
+      "epoch": 0.31947483588621445,
+      "grad_norm": 0.18638849258422852,
+      "learning_rate": 0.00013635367190354403,
+      "loss": 0.5562,
+      "step": 876
+    },
+    {
+      "epoch": 0.3198395331874544,
+      "grad_norm": 0.17578531801700592,
+      "learning_rate": 0.00013628059919620022,
+      "loss": 0.6174,
+      "step": 877
+    },
+    {
+      "epoch": 0.3202042304886944,
+      "grad_norm": 0.17400258779525757,
+      "learning_rate": 0.00013620752648885642,
+      "loss": 0.6818,
+      "step": 878
+    },
+    {
+      "epoch": 0.32056892778993434,
+      "grad_norm": 0.17320291697978973,
+      "learning_rate": 0.0001361344537815126,
+      "loss": 0.59,
+      "step": 879
+    },
+    {
+      "epoch": 0.32093362509117435,
+      "grad_norm": 0.1907418966293335,
+      "learning_rate": 0.0001360613810741688,
+      "loss": 0.8411,
+      "step": 880
+    },
+    {
+      "epoch": 0.3212983223924143,
+      "grad_norm": 0.1831212341785431,
+      "learning_rate": 0.000135988308366825,
+      "loss": 0.7538,
+      "step": 881
+    },
+    {
+      "epoch": 0.32166301969365424,
+      "grad_norm": 0.16207090020179749,
+      "learning_rate": 0.00013591523565948118,
+      "loss": 0.5343,
+      "step": 882
+    },
+    {
+      "epoch": 0.32202771699489424,
+      "grad_norm": 0.16630573570728302,
+      "learning_rate": 0.00013584216295213738,
+      "loss": 0.5422,
+      "step": 883
+    },
+    {
+      "epoch": 0.3223924142961342,
+      "grad_norm": 0.16689611971378326,
+      "learning_rate": 0.0001357690902447936,
+      "loss": 0.682,
+      "step": 884
+    },
+    {
+      "epoch": 0.3227571115973742,
+      "grad_norm": 0.1957065463066101,
+      "learning_rate": 0.00013569601753744975,
+      "loss": 0.8429,
+      "step": 885
+    },
+    {
+      "epoch": 0.32312180889861414,
+      "grad_norm": 0.16777260601520538,
+      "learning_rate": 0.00013562294483010596,
+      "loss": 0.5122,
+      "step": 886
+    },
+    {
+      "epoch": 0.32348650619985414,
+      "grad_norm": 0.18913322687149048,
+      "learning_rate": 0.00013554987212276216,
+      "loss": 0.8262,
+      "step": 887
+    },
+    {
+      "epoch": 0.3238512035010941,
+      "grad_norm": 0.15186062455177307,
+      "learning_rate": 0.00013547679941541834,
+      "loss": 0.4259,
+      "step": 888
+    },
+    {
+      "epoch": 0.3242159008023341,
+      "grad_norm": 0.17760516703128815,
+      "learning_rate": 0.00013540372670807453,
+      "loss": 0.6425,
+      "step": 889
+    },
+    {
+      "epoch": 0.32458059810357404,
+      "grad_norm": 0.17440609633922577,
+      "learning_rate": 0.00013533065400073073,
+      "loss": 0.795,
+      "step": 890
+    },
+    {
+      "epoch": 0.324945295404814,
+      "grad_norm": 0.1680755466222763,
+      "learning_rate": 0.00013525758129338692,
+      "loss": 0.6218,
+      "step": 891
+    },
+    {
+      "epoch": 0.325309992706054,
+      "grad_norm": 0.18546250462532043,
+      "learning_rate": 0.00013518450858604312,
+      "loss": 0.7299,
+      "step": 892
+    },
+    {
+      "epoch": 0.32567469000729393,
+      "grad_norm": 0.2105245590209961,
+      "learning_rate": 0.00013511143587869933,
+      "loss": 0.8526,
+      "step": 893
+    },
+    {
+      "epoch": 0.32603938730853393,
+      "grad_norm": 0.18736779689788818,
+      "learning_rate": 0.0001350383631713555,
+      "loss": 0.7205,
+      "step": 894
+    },
+    {
+      "epoch": 0.3264040846097739,
+      "grad_norm": 0.17326989769935608,
+      "learning_rate": 0.0001349652904640117,
+      "loss": 0.5746,
+      "step": 895
+    },
+    {
+      "epoch": 0.3267687819110139,
+      "grad_norm": 0.21187415719032288,
+      "learning_rate": 0.0001348922177566679,
+      "loss": 0.9321,
+      "step": 896
+    },
+    {
+      "epoch": 0.32713347921225383,
+      "grad_norm": 0.18079346418380737,
+      "learning_rate": 0.00013481914504932409,
+      "loss": 0.6063,
+      "step": 897
+    },
+    {
+      "epoch": 0.3274981765134938,
+      "grad_norm": 0.2063044011592865,
+      "learning_rate": 0.00013474607234198027,
+      "loss": 0.8192,
+      "step": 898
+    },
+    {
+      "epoch": 0.3278628738147338,
+      "grad_norm": 0.1921169012784958,
+      "learning_rate": 0.00013467299963463647,
+      "loss": 0.9365,
+      "step": 899
+    },
+    {
+      "epoch": 0.3282275711159737,
+      "grad_norm": 0.19340583682060242,
+      "learning_rate": 0.00013459992692729266,
+      "loss": 0.7314,
+      "step": 900
+    },
+    {
+      "epoch": 0.32859226841721373,
+      "grad_norm": 0.19069619476795197,
+      "learning_rate": 0.00013452685421994886,
+      "loss": 0.7089,
+      "step": 901
+    },
+    {
+      "epoch": 0.3289569657184537,
+      "grad_norm": 0.1918506920337677,
+      "learning_rate": 0.00013445378151260507,
+      "loss": 0.7538,
+      "step": 902
+    },
+    {
+      "epoch": 0.3293216630196937,
+      "grad_norm": 0.18830406665802002,
+      "learning_rate": 0.00013438070880526123,
+      "loss": 0.6583,
+      "step": 903
+    },
+    {
+      "epoch": 0.3296863603209336,
+      "grad_norm": 0.18953083455562592,
+      "learning_rate": 0.00013430763609791744,
+      "loss": 0.6886,
+      "step": 904
+    },
+    {
+      "epoch": 0.33005105762217357,
+      "grad_norm": 0.1635250300168991,
+      "learning_rate": 0.00013423456339057364,
+      "loss": 0.6201,
+      "step": 905
+    },
+    {
+      "epoch": 0.3304157549234136,
+      "grad_norm": 0.19504040479660034,
+      "learning_rate": 0.00013416149068322983,
+      "loss": 0.6456,
+      "step": 906
+    },
+    {
+      "epoch": 0.3307804522246535,
+      "grad_norm": 0.1988976150751114,
+      "learning_rate": 0.000134088417975886,
+      "loss": 0.7649,
+      "step": 907
+    },
+    {
+      "epoch": 0.3311451495258935,
+      "grad_norm": 0.17592492699623108,
+      "learning_rate": 0.00013401534526854221,
+      "loss": 0.6112,
+      "step": 908
+    },
+    {
+      "epoch": 0.33150984682713347,
+      "grad_norm": 0.19126677513122559,
+      "learning_rate": 0.0001339422725611984,
+      "loss": 0.7045,
+      "step": 909
+    },
+    {
+      "epoch": 0.33187454412837347,
+      "grad_norm": 0.1643197238445282,
+      "learning_rate": 0.0001338691998538546,
+      "loss": 0.4433,
+      "step": 910
+    },
+    {
+      "epoch": 0.3322392414296134,
+      "grad_norm": 0.19282923638820648,
+      "learning_rate": 0.00013379612714651079,
+      "loss": 0.7357,
+      "step": 911
+    },
+    {
+      "epoch": 0.33260393873085337,
+      "grad_norm": 0.17284700274467468,
+      "learning_rate": 0.00013372305443916697,
+      "loss": 0.8092,
+      "step": 912
+    },
+    {
+      "epoch": 0.33296863603209337,
+      "grad_norm": 0.2016737312078476,
+      "learning_rate": 0.00013364998173182318,
+      "loss": 0.8372,
+      "step": 913
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.1924041360616684,
+      "learning_rate": 0.00013357690902447936,
+      "loss": 0.7576,
+      "step": 914
+    },
+    {
+      "epoch": 0.3336980306345733,
+      "grad_norm": 0.18545666337013245,
+      "learning_rate": 0.00013350383631713557,
+      "loss": 0.7559,
+      "step": 915
+    },
+    {
+      "epoch": 0.33406272793581326,
+      "grad_norm": 0.1433466225862503,
+      "learning_rate": 0.00013343076360979175,
+      "loss": 0.4667,
+      "step": 916
+    },
+    {
+      "epoch": 0.33442742523705327,
+      "grad_norm": 0.1683466136455536,
+      "learning_rate": 0.00013335769090244793,
+      "loss": 0.5951,
+      "step": 917
+    },
+    {
+      "epoch": 0.3347921225382932,
+      "grad_norm": 0.18137226998806,
+      "learning_rate": 0.00013328461819510414,
+      "loss": 0.6964,
+      "step": 918
+    },
+    {
+      "epoch": 0.33515681983953316,
+      "grad_norm": 0.19752834737300873,
+      "learning_rate": 0.00013321154548776034,
+      "loss": 0.7344,
+      "step": 919
+    },
+    {
+      "epoch": 0.33552151714077316,
+      "grad_norm": 0.17333728075027466,
+      "learning_rate": 0.0001331384727804165,
+      "loss": 0.5658,
+      "step": 920
+    },
+    {
+      "epoch": 0.3358862144420131,
+      "grad_norm": 0.17877991497516632,
+      "learning_rate": 0.0001330654000730727,
+      "loss": 0.6542,
+      "step": 921
+    },
+    {
+      "epoch": 0.3362509117432531,
+      "grad_norm": 0.15863974392414093,
+      "learning_rate": 0.00013299232736572892,
+      "loss": 0.4601,
+      "step": 922
+    },
+    {
+      "epoch": 0.33661560904449306,
+      "grad_norm": 0.20868968963623047,
+      "learning_rate": 0.0001329192546583851,
+      "loss": 0.7883,
+      "step": 923
+    },
+    {
+      "epoch": 0.33698030634573306,
+      "grad_norm": 0.20271888375282288,
+      "learning_rate": 0.00013284618195104128,
+      "loss": 0.9605,
+      "step": 924
+    },
+    {
+      "epoch": 0.337345003646973,
+      "grad_norm": 0.19558852910995483,
+      "learning_rate": 0.0001327731092436975,
+      "loss": 0.7866,
+      "step": 925
+    },
+    {
+      "epoch": 0.337709700948213,
+      "grad_norm": 0.2165563404560089,
+      "learning_rate": 0.00013270003653635367,
+      "loss": 0.9694,
+      "step": 926
+    },
+    {
+      "epoch": 0.33807439824945296,
+      "grad_norm": 0.18585020303726196,
+      "learning_rate": 0.00013262696382900988,
+      "loss": 0.7127,
+      "step": 927
+    },
+    {
+      "epoch": 0.3384390955506929,
+      "grad_norm": 0.15885639190673828,
+      "learning_rate": 0.00013255389112166608,
+      "loss": 0.4634,
+      "step": 928
+    },
+    {
+      "epoch": 0.3388037928519329,
+      "grad_norm": 0.18902234733104706,
+      "learning_rate": 0.00013248081841432224,
+      "loss": 0.7553,
+      "step": 929
+    },
+    {
+      "epoch": 0.33916849015317285,
+      "grad_norm": 0.1625453233718872,
+      "learning_rate": 0.00013240774570697845,
+      "loss": 0.6045,
+      "step": 930
+    },
+    {
+      "epoch": 0.33953318745441285,
+      "grad_norm": 0.1839369386434555,
+      "learning_rate": 0.00013233467299963466,
+      "loss": 0.6966,
+      "step": 931
+    },
+    {
+      "epoch": 0.3398978847556528,
+      "grad_norm": 0.1871074140071869,
+      "learning_rate": 0.00013226160029229084,
+      "loss": 0.7223,
+      "step": 932
+    },
+    {
+      "epoch": 0.3402625820568928,
+      "grad_norm": 0.19105811417102814,
+      "learning_rate": 0.00013218852758494702,
+      "loss": 0.8262,
+      "step": 933
+    },
+    {
+      "epoch": 0.34062727935813275,
+      "grad_norm": 0.19458365440368652,
+      "learning_rate": 0.00013211545487760323,
+      "loss": 0.6728,
+      "step": 934
+    },
+    {
+      "epoch": 0.3409919766593727,
+      "grad_norm": 0.17245818674564362,
+      "learning_rate": 0.0001320423821702594,
+      "loss": 0.6373,
+      "step": 935
+    },
+    {
+      "epoch": 0.3413566739606127,
+      "grad_norm": 0.17466460168361664,
+      "learning_rate": 0.00013196930946291562,
+      "loss": 0.6054,
+      "step": 936
+    },
+    {
+      "epoch": 0.34172137126185265,
+      "grad_norm": 0.1496109813451767,
+      "learning_rate": 0.0001318962367555718,
+      "loss": 0.5585,
+      "step": 937
+    },
+    {
+      "epoch": 0.34208606856309265,
+      "grad_norm": 0.16135789453983307,
+      "learning_rate": 0.00013182316404822798,
+      "loss": 0.4524,
+      "step": 938
+    },
+    {
+      "epoch": 0.3424507658643326,
+      "grad_norm": 0.18663141131401062,
+      "learning_rate": 0.0001317500913408842,
+      "loss": 0.6951,
+      "step": 939
+    },
+    {
+      "epoch": 0.3428154631655726,
+      "grad_norm": 0.15193338692188263,
+      "learning_rate": 0.0001316770186335404,
+      "loss": 0.5151,
+      "step": 940
+    },
+    {
+      "epoch": 0.34318016046681254,
+      "grad_norm": 0.16860604286193848,
+      "learning_rate": 0.00013160394592619658,
+      "loss": 0.5776,
+      "step": 941
+    },
+    {
+      "epoch": 0.3435448577680525,
+      "grad_norm": 0.18972420692443848,
+      "learning_rate": 0.00013153087321885276,
+      "loss": 0.7691,
+      "step": 942
+    },
+    {
+      "epoch": 0.3439095550692925,
+      "grad_norm": 0.196933776140213,
+      "learning_rate": 0.00013145780051150897,
+      "loss": 0.8165,
+      "step": 943
+    },
+    {
+      "epoch": 0.34427425237053244,
+      "grad_norm": 0.19200679659843445,
+      "learning_rate": 0.00013138472780416515,
+      "loss": 0.7874,
+      "step": 944
+    },
+    {
+      "epoch": 0.34463894967177244,
+      "grad_norm": 0.1795893758535385,
+      "learning_rate": 0.00013131165509682136,
+      "loss": 0.7507,
+      "step": 945
+    },
+    {
+      "epoch": 0.3450036469730124,
+      "grad_norm": 0.18410655856132507,
+      "learning_rate": 0.00013123858238947754,
+      "loss": 0.7354,
+      "step": 946
+    },
+    {
+      "epoch": 0.3453683442742524,
+      "grad_norm": 0.1739976406097412,
+      "learning_rate": 0.00013116550968213372,
+      "loss": 0.6932,
+      "step": 947
+    },
+    {
+      "epoch": 0.34573304157549234,
+      "grad_norm": 0.14160172641277313,
+      "learning_rate": 0.00013109243697478993,
+      "loss": 0.351,
+      "step": 948
+    },
+    {
+      "epoch": 0.3460977388767323,
+      "grad_norm": 0.14415137469768524,
+      "learning_rate": 0.0001310193642674461,
+      "loss": 0.4202,
+      "step": 949
+    },
+    {
+      "epoch": 0.3464624361779723,
+      "grad_norm": 0.2061617225408554,
+      "learning_rate": 0.00013094629156010232,
+      "loss": 0.9679,
+      "step": 950
+    },
+    {
+      "epoch": 0.34682713347921224,
+      "grad_norm": 0.20319141447544098,
+      "learning_rate": 0.0001308732188527585,
+      "loss": 0.8076,
+      "step": 951
+    },
+    {
+      "epoch": 0.34719183078045224,
+      "grad_norm": 0.17571642994880676,
+      "learning_rate": 0.00013080014614541468,
+      "loss": 0.6941,
+      "step": 952
+    },
+    {
+      "epoch": 0.3475565280816922,
+      "grad_norm": 0.177334725856781,
+      "learning_rate": 0.0001307270734380709,
+      "loss": 0.7511,
+      "step": 953
+    },
+    {
+      "epoch": 0.3479212253829322,
+      "grad_norm": 0.2112066000699997,
+      "learning_rate": 0.0001306540007307271,
+      "loss": 1.0981,
+      "step": 954
+    },
+    {
+      "epoch": 0.34828592268417213,
+      "grad_norm": 0.18469132483005524,
+      "learning_rate": 0.00013058092802338325,
+      "loss": 0.8159,
+      "step": 955
+    },
+    {
+      "epoch": 0.3486506199854121,
+      "grad_norm": 0.17193461954593658,
+      "learning_rate": 0.00013050785531603946,
+      "loss": 0.6005,
+      "step": 956
+    },
+    {
+      "epoch": 0.3490153172866521,
+      "grad_norm": 0.21006590127944946,
+      "learning_rate": 0.00013043478260869567,
+      "loss": 1.0101,
+      "step": 957
+    },
+    {
+      "epoch": 0.34938001458789203,
+      "grad_norm": 0.1526053547859192,
+      "learning_rate": 0.00013036170990135185,
+      "loss": 0.5219,
+      "step": 958
+    },
+    {
+      "epoch": 0.34974471188913203,
+      "grad_norm": 0.18546460568904877,
+      "learning_rate": 0.00013028863719400803,
+      "loss": 0.7272,
+      "step": 959
+    },
+    {
+      "epoch": 0.350109409190372,
+      "grad_norm": 0.1910969465970993,
+      "learning_rate": 0.00013021556448666424,
+      "loss": 0.8131,
+      "step": 960
+    },
+    {
+      "epoch": 0.350474106491612,
+      "grad_norm": 0.17536579072475433,
+      "learning_rate": 0.00013014249177932042,
+      "loss": 0.62,
+      "step": 961
+    },
+    {
+      "epoch": 0.3508388037928519,
+      "grad_norm": 0.18440979719161987,
+      "learning_rate": 0.00013006941907197663,
+      "loss": 0.7427,
+      "step": 962
+    },
+    {
+      "epoch": 0.35120350109409193,
+      "grad_norm": 0.1736113727092743,
+      "learning_rate": 0.00012999634636463284,
+      "loss": 0.6271,
+      "step": 963
+    },
+    {
+      "epoch": 0.3515681983953319,
+      "grad_norm": 0.1437050998210907,
+      "learning_rate": 0.000129923273657289,
+      "loss": 0.448,
+      "step": 964
+    },
+    {
+      "epoch": 0.3519328956965718,
+      "grad_norm": 0.17444917559623718,
+      "learning_rate": 0.0001298502009499452,
+      "loss": 0.5675,
+      "step": 965
+    },
+    {
+      "epoch": 0.3522975929978118,
+      "grad_norm": 0.2057693749666214,
+      "learning_rate": 0.0001297771282426014,
+      "loss": 0.9973,
+      "step": 966
+    },
+    {
+      "epoch": 0.3526622902990518,
+      "grad_norm": 0.1640542894601822,
+      "learning_rate": 0.0001297040555352576,
+      "loss": 0.5839,
+      "step": 967
+    },
+    {
+      "epoch": 0.3530269876002918,
+      "grad_norm": 0.1888854056596756,
+      "learning_rate": 0.00012963098282791377,
+      "loss": 0.7098,
+      "step": 968
+    },
+    {
+      "epoch": 0.3533916849015317,
+      "grad_norm": 0.1694556325674057,
+      "learning_rate": 0.00012955791012056998,
+      "loss": 0.5281,
+      "step": 969
+    },
+    {
+      "epoch": 0.3537563822027717,
+      "grad_norm": 0.17687252163887024,
+      "learning_rate": 0.00012948483741322616,
+      "loss": 0.6114,
+      "step": 970
+    },
+    {
+      "epoch": 0.35412107950401167,
+      "grad_norm": 0.1951674520969391,
+      "learning_rate": 0.00012941176470588237,
+      "loss": 0.8128,
+      "step": 971
+    },
+    {
+      "epoch": 0.3544857768052516,
+      "grad_norm": 0.20023071765899658,
+      "learning_rate": 0.00012933869199853855,
+      "loss": 0.8015,
+      "step": 972
+    },
+    {
+      "epoch": 0.3548504741064916,
+      "grad_norm": 0.18741564452648163,
+      "learning_rate": 0.00012926561929119473,
+      "loss": 0.7011,
+      "step": 973
+    },
+    {
+      "epoch": 0.35521517140773157,
+      "grad_norm": 0.13944192230701447,
+      "learning_rate": 0.00012919254658385094,
+      "loss": 0.365,
+      "step": 974
+    },
+    {
+      "epoch": 0.35557986870897157,
+      "grad_norm": 0.20607557892799377,
+      "learning_rate": 0.00012911947387650715,
+      "loss": 0.89,
+      "step": 975
+    },
+    {
+      "epoch": 0.3559445660102115,
+      "grad_norm": 0.2182752937078476,
+      "learning_rate": 0.00012904640116916333,
+      "loss": 0.9494,
+      "step": 976
+    },
+    {
+      "epoch": 0.3563092633114515,
+      "grad_norm": 0.18262708187103271,
+      "learning_rate": 0.0001289733284618195,
+      "loss": 0.6899,
+      "step": 977
+    },
+    {
+      "epoch": 0.35667396061269147,
+      "grad_norm": 0.18693357706069946,
+      "learning_rate": 0.00012890025575447572,
+      "loss": 0.7838,
+      "step": 978
+    },
+    {
+      "epoch": 0.3570386579139314,
+      "grad_norm": 0.19558003544807434,
+      "learning_rate": 0.0001288271830471319,
+      "loss": 0.6937,
+      "step": 979
+    },
+    {
+      "epoch": 0.3574033552151714,
+      "grad_norm": 0.1773812621831894,
+      "learning_rate": 0.0001287541103397881,
+      "loss": 0.6103,
+      "step": 980
+    },
+    {
+      "epoch": 0.35776805251641136,
+      "grad_norm": 0.19865770637989044,
+      "learning_rate": 0.0001286810376324443,
+      "loss": 0.884,
+      "step": 981
+    },
+    {
+      "epoch": 0.35813274981765136,
+      "grad_norm": 0.17361897230148315,
+      "learning_rate": 0.00012860796492510047,
+      "loss": 0.6312,
+      "step": 982
+    },
+    {
+      "epoch": 0.3584974471188913,
+      "grad_norm": 0.17946158349514008,
+      "learning_rate": 0.00012853489221775668,
+      "loss": 0.7898,
+      "step": 983
+    },
+    {
+      "epoch": 0.3588621444201313,
+      "grad_norm": 0.1990206092596054,
+      "learning_rate": 0.00012846181951041286,
+      "loss": 0.821,
+      "step": 984
+    },
+    {
+      "epoch": 0.35922684172137126,
+      "grad_norm": 0.16487091779708862,
+      "learning_rate": 0.00012838874680306904,
+      "loss": 0.6642,
+      "step": 985
+    },
+    {
+      "epoch": 0.3595915390226112,
+      "grad_norm": 0.17961853742599487,
+      "learning_rate": 0.00012831567409572525,
+      "loss": 0.8118,
+      "step": 986
+    },
+    {
+      "epoch": 0.3599562363238512,
+      "grad_norm": 0.16912826895713806,
+      "learning_rate": 0.00012824260138838143,
+      "loss": 0.6378,
+      "step": 987
+    },
+    {
+      "epoch": 0.36032093362509116,
+      "grad_norm": 0.19518031179904938,
+      "learning_rate": 0.00012816952868103764,
+      "loss": 0.9833,
+      "step": 988
+    },
+    {
+      "epoch": 0.36068563092633116,
+      "grad_norm": 0.18844519555568695,
+      "learning_rate": 0.00012809645597369385,
+      "loss": 0.7191,
+      "step": 989
+    },
+    {
+      "epoch": 0.3610503282275711,
+      "grad_norm": 0.18332988023757935,
+      "learning_rate": 0.00012802338326635,
+      "loss": 0.7232,
+      "step": 990
+    },
+    {
+      "epoch": 0.3614150255288111,
+      "grad_norm": 0.18674488365650177,
+      "learning_rate": 0.0001279503105590062,
+      "loss": 0.7617,
+      "step": 991
+    },
+    {
+      "epoch": 0.36177972283005105,
+      "grad_norm": 0.15862171351909637,
+      "learning_rate": 0.00012787723785166242,
+      "loss": 0.5411,
+      "step": 992
+    },
+    {
+      "epoch": 0.36214442013129106,
+      "grad_norm": 0.15820543467998505,
+      "learning_rate": 0.0001278041651443186,
+      "loss": 0.5198,
+      "step": 993
+    },
+    {
+      "epoch": 0.362509117432531,
+      "grad_norm": 0.19230028986930847,
+      "learning_rate": 0.00012773109243697478,
+      "loss": 0.7863,
+      "step": 994
+    },
+    {
+      "epoch": 0.36287381473377095,
+      "grad_norm": 0.18754363059997559,
+      "learning_rate": 0.000127658019729631,
+      "loss": 0.7216,
+      "step": 995
+    },
+    {
+      "epoch": 0.36323851203501095,
+      "grad_norm": 0.19463679194450378,
+      "learning_rate": 0.00012758494702228717,
+      "loss": 0.8963,
+      "step": 996
+    },
+    {
+      "epoch": 0.3636032093362509,
+      "grad_norm": 0.17682290077209473,
+      "learning_rate": 0.00012751187431494338,
+      "loss": 0.7009,
+      "step": 997
+    },
+    {
+      "epoch": 0.3639679066374909,
+      "grad_norm": 0.20536212623119354,
+      "learning_rate": 0.0001274388016075996,
+      "loss": 0.9973,
+      "step": 998
+    },
+    {
+      "epoch": 0.36433260393873085,
+      "grad_norm": 0.2060231864452362,
+      "learning_rate": 0.00012736572890025574,
+      "loss": 0.8184,
+      "step": 999
+    },
+    {
+      "epoch": 0.36469730123997085,
+      "grad_norm": 0.19924308359622955,
+      "learning_rate": 0.00012729265619291195,
+      "loss": 0.7663,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 2742,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.490016210832998e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}