diff --git "a/checkpoint-1000/trainer_state.json" "b/checkpoint-1000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoint-1000/trainer_state.json"
@@ -0,0 +1,7034 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5216484089723527,
+  "eval_steps": 500,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005216484089723526,
+      "grad_norm": 3.6680614948272705,
+      "learning_rate": 0.0,
+      "loss": 0.541,
+      "step": 1
+    },
+    {
+      "epoch": 0.0010432968179447052,
+      "grad_norm": 1.7092307806015015,
+      "learning_rate": 5e-06,
+      "loss": 0.2639,
+      "step": 2
+    },
+    {
+      "epoch": 0.001564945226917058,
+      "grad_norm": 1.990319013595581,
+      "learning_rate": 1e-05,
+      "loss": 0.3118,
+      "step": 3
+    },
+    {
+      "epoch": 0.0020865936358894104,
+      "grad_norm": 3.750917434692383,
+      "learning_rate": 1.5e-05,
+      "loss": 0.4562,
+      "step": 4
+    },
+    {
+      "epoch": 0.0026082420448617634,
+      "grad_norm": 4.690845966339111,
+      "learning_rate": 2e-05,
+      "loss": 0.673,
+      "step": 5
+    },
+    {
+      "epoch": 0.003129890453834116,
+      "grad_norm": 1.4218288660049438,
+      "learning_rate": 2.5e-05,
+      "loss": 0.2984,
+      "step": 6
+    },
+    {
+      "epoch": 0.0036515388628064684,
+      "grad_norm": 4.896511077880859,
+      "learning_rate": 3e-05,
+      "loss": 0.7113,
+      "step": 7
+    },
+    {
+      "epoch": 0.004173187271778821,
+      "grad_norm": 2.5787155628204346,
+      "learning_rate": 3.5000000000000004e-05,
+      "loss": 0.4226,
+      "step": 8
+    },
+    {
+      "epoch": 0.004694835680751174,
+      "grad_norm": 1.028937578201294,
+      "learning_rate": 4e-05,
+      "loss": 0.1873,
+      "step": 9
+    },
+    {
+      "epoch": 0.005216484089723527,
+      "grad_norm": 3.9262092113494873,
+      "learning_rate": 4.4999999999999996e-05,
+      "loss": 0.5728,
+      "step": 10
+    },
+    {
+      "epoch": 0.005738132498695879,
+      "grad_norm": 4.9360198974609375,
+      "learning_rate": 5e-05,
+      "loss": 0.725,
+      "step": 11
+    },
+    {
+      "epoch": 0.006259780907668232,
+      "grad_norm": 4.287437915802002,
+      "learning_rate": 5.5e-05,
+      "loss": 0.6361,
+      "step": 12
+    },
+    {
+      "epoch": 0.006781429316640584,
+      "grad_norm": 1.3290928602218628,
+      "learning_rate": 6e-05,
+      "loss": 0.3109,
+      "step": 13
+    },
+    {
+      "epoch": 0.007303077725612937,
+      "grad_norm": 2.0050501823425293,
+      "learning_rate": 6.500000000000001e-05,
+      "loss": 0.4099,
+      "step": 14
+    },
+    {
+      "epoch": 0.00782472613458529,
+      "grad_norm": 4.360481262207031,
+      "learning_rate": 7.000000000000001e-05,
+      "loss": 0.6363,
+      "step": 15
+    },
+    {
+      "epoch": 0.008346374543557642,
+      "grad_norm": 3.9680511951446533,
+      "learning_rate": 7.5e-05,
+      "loss": 0.6124,
+      "step": 16
+    },
+    {
+      "epoch": 0.008868022952529996,
+      "grad_norm": 1.701784610748291,
+      "learning_rate": 8e-05,
+      "loss": 0.3439,
+      "step": 17
+    },
+    {
+      "epoch": 0.009389671361502348,
+      "grad_norm": 4.544748783111572,
+      "learning_rate": 8.5e-05,
+      "loss": 0.6253,
+      "step": 18
+    },
+    {
+      "epoch": 0.0099113197704747,
+      "grad_norm": 4.58634090423584,
+      "learning_rate": 8.999999999999999e-05,
+      "loss": 0.7669,
+      "step": 19
+    },
+    {
+      "epoch": 0.010432968179447054,
+      "grad_norm": 2.89898419380188,
+      "learning_rate": 9.5e-05,
+      "loss": 0.512,
+      "step": 20
+    },
+    {
+      "epoch": 0.010954616588419406,
+      "grad_norm": 2.61112904548645,
+      "learning_rate": 0.0001,
+      "loss": 0.542,
+      "step": 21
+    },
+    {
+      "epoch": 0.011476264997391758,
+      "grad_norm": 3.217054843902588,
+      "learning_rate": 0.000105,
+      "loss": 0.4959,
+      "step": 22
+    },
+    {
+      "epoch": 0.011997913406364111,
+      "grad_norm": 2.569636821746826,
+      "learning_rate": 0.00011,
+      "loss": 0.3918,
+      "step": 23
+    },
+    {
+      "epoch": 0.012519561815336464,
+      "grad_norm": 1.4626373052597046,
+      "learning_rate": 0.000115,
+      "loss": 0.3316,
+      "step": 24
+    },
+    {
+      "epoch": 0.013041210224308816,
+      "grad_norm": 1.2480732202529907,
+      "learning_rate": 0.00012,
+      "loss": 0.3484,
+      "step": 25
+    },
+    {
+      "epoch": 0.013562858633281168,
+      "grad_norm": 2.5430543422698975,
+      "learning_rate": 0.000125,
+      "loss": 0.4699,
+      "step": 26
+    },
+    {
+      "epoch": 0.014084507042253521,
+      "grad_norm": 1.7051862478256226,
+      "learning_rate": 0.00013000000000000002,
+      "loss": 0.2139,
+      "step": 27
+    },
+    {
+      "epoch": 0.014606155451225874,
+      "grad_norm": 1.1670981645584106,
+      "learning_rate": 0.000135,
+      "loss": 0.3883,
+      "step": 28
+    },
+    {
+      "epoch": 0.015127803860198226,
+      "grad_norm": 1.336538314819336,
+      "learning_rate": 0.00014000000000000001,
+      "loss": 0.3714,
+      "step": 29
+    },
+    {
+      "epoch": 0.01564945226917058,
+      "grad_norm": 2.018078565597534,
+      "learning_rate": 0.000145,
+      "loss": 0.2301,
+      "step": 30
+    },
+    {
+      "epoch": 0.01617110067814293,
+      "grad_norm": 1.5743223428726196,
+      "learning_rate": 0.00015,
+      "loss": 0.2935,
+      "step": 31
+    },
+    {
+      "epoch": 0.016692749087115284,
+      "grad_norm": 1.2724987268447876,
+      "learning_rate": 0.000155,
+      "loss": 0.3141,
+      "step": 32
+    },
+    {
+      "epoch": 0.017214397496087636,
+      "grad_norm": 2.2347893714904785,
+      "learning_rate": 0.00016,
+      "loss": 0.2917,
+      "step": 33
+    },
+    {
+      "epoch": 0.01773604590505999,
+      "grad_norm": 1.6726069450378418,
+      "learning_rate": 0.000165,
+      "loss": 0.377,
+      "step": 34
+    },
+    {
+      "epoch": 0.018257694314032343,
+      "grad_norm": 1.2217071056365967,
+      "learning_rate": 0.00017,
+      "loss": 0.3027,
+      "step": 35
+    },
+    {
+      "epoch": 0.018779342723004695,
+      "grad_norm": 1.3436322212219238,
+      "learning_rate": 0.000175,
+      "loss": 0.2853,
+      "step": 36
+    },
+    {
+      "epoch": 0.019300991131977047,
+      "grad_norm": 1.2247120141983032,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 0.2967,
+      "step": 37
+    },
+    {
+      "epoch": 0.0198226395409494,
+      "grad_norm": 1.0636978149414062,
+      "learning_rate": 0.000185,
+      "loss": 0.2745,
+      "step": 38
+    },
+    {
+      "epoch": 0.02034428794992175,
+      "grad_norm": 1.302099347114563,
+      "learning_rate": 0.00019,
+      "loss": 0.2688,
+      "step": 39
+    },
+    {
+      "epoch": 0.020865936358894107,
+      "grad_norm": 1.0052679777145386,
+      "learning_rate": 0.00019500000000000002,
+      "loss": 0.2539,
+      "step": 40
+    },
+    {
+      "epoch": 0.02138758476786646,
+      "grad_norm": 1.0164084434509277,
+      "learning_rate": 0.0002,
+      "loss": 0.1978,
+      "step": 41
+    },
+    {
+      "epoch": 0.02190923317683881,
+      "grad_norm": 1.3891016244888306,
+      "learning_rate": 0.000205,
+      "loss": 0.3189,
+      "step": 42
+    },
+    {
+      "epoch": 0.022430881585811163,
+      "grad_norm": 0.960986852645874,
+      "learning_rate": 0.00021,
+      "loss": 0.2321,
+      "step": 43
+    },
+    {
+      "epoch": 0.022952529994783515,
+      "grad_norm": 0.9918408393859863,
+      "learning_rate": 0.000215,
+      "loss": 0.2359,
+      "step": 44
+    },
+    {
+      "epoch": 0.023474178403755867,
+      "grad_norm": 1.190205693244934,
+      "learning_rate": 0.00022,
+      "loss": 0.2347,
+      "step": 45
+    },
+    {
+      "epoch": 0.023995826812728223,
+      "grad_norm": 0.7985232472419739,
+      "learning_rate": 0.00022500000000000002,
+      "loss": 0.2048,
+      "step": 46
+    },
+    {
+      "epoch": 0.024517475221700575,
+      "grad_norm": 0.5192842483520508,
+      "learning_rate": 0.00023,
+      "loss": 0.1116,
+      "step": 47
+    },
+    {
+      "epoch": 0.025039123630672927,
+      "grad_norm": 1.1033375263214111,
+      "learning_rate": 0.000235,
+      "loss": 0.2665,
+      "step": 48
+    },
+    {
+      "epoch": 0.02556077203964528,
+      "grad_norm": 0.7089418172836304,
+      "learning_rate": 0.00024,
+      "loss": 0.1639,
+      "step": 49
+    },
+    {
+      "epoch": 0.02608242044861763,
+      "grad_norm": 1.08647882938385,
+      "learning_rate": 0.000245,
+      "loss": 0.2072,
+      "step": 50
+    },
+    {
+      "epoch": 0.026604068857589983,
+      "grad_norm": 0.9901174902915955,
+      "learning_rate": 0.00025,
+      "loss": 0.2035,
+      "step": 51
+    },
+    {
+      "epoch": 0.027125717266562335,
+      "grad_norm": 0.6938351988792419,
+      "learning_rate": 0.000255,
+      "loss": 0.1851,
+      "step": 52
+    },
+    {
+      "epoch": 0.02764736567553469,
+      "grad_norm": 0.8392678499221802,
+      "learning_rate": 0.00026000000000000003,
+      "loss": 0.1918,
+      "step": 53
+    },
+    {
+      "epoch": 0.028169014084507043,
+      "grad_norm": 0.5979602932929993,
+      "learning_rate": 0.00026500000000000004,
+      "loss": 0.1243,
+      "step": 54
+    },
+    {
+      "epoch": 0.028690662493479395,
+      "grad_norm": 0.7119799852371216,
+      "learning_rate": 0.00027,
+      "loss": 0.1594,
+      "step": 55
+    },
+    {
+      "epoch": 0.029212310902451747,
+      "grad_norm": 0.5519995093345642,
+      "learning_rate": 0.000275,
+      "loss": 0.078,
+      "step": 56
+    },
+    {
+      "epoch": 0.0297339593114241,
+      "grad_norm": 0.5917723774909973,
+      "learning_rate": 0.00028000000000000003,
+      "loss": 0.134,
+      "step": 57
+    },
+    {
+      "epoch": 0.03025560772039645,
+      "grad_norm": 0.6265603303909302,
+      "learning_rate": 0.000285,
+      "loss": 0.1848,
+      "step": 58
+    },
+    {
+      "epoch": 0.030777256129368807,
+      "grad_norm": 1.0653454065322876,
+      "learning_rate": 0.00029,
+      "loss": 0.1831,
+      "step": 59
+    },
+    {
+      "epoch": 0.03129890453834116,
+      "grad_norm": 0.3466293513774872,
+      "learning_rate": 0.000295,
+      "loss": 0.0878,
+      "step": 60
+    },
+    {
+      "epoch": 0.03182055294731351,
+      "grad_norm": 0.5498062372207642,
+      "learning_rate": 0.0003,
+      "loss": 0.1733,
+      "step": 61
+    },
+    {
+      "epoch": 0.03234220135628586,
+      "grad_norm": 0.7708966135978699,
+      "learning_rate": 0.000305,
+      "loss": 0.1975,
+      "step": 62
+    },
+    {
+      "epoch": 0.03286384976525822,
+      "grad_norm": 0.7717278003692627,
+      "learning_rate": 0.00031,
+      "loss": 0.1863,
+      "step": 63
+    },
+    {
+      "epoch": 0.03338549817423057,
+      "grad_norm": 0.8076028823852539,
+      "learning_rate": 0.000315,
+      "loss": 0.1938,
+      "step": 64
+    },
+    {
+      "epoch": 0.03390714658320292,
+      "grad_norm": 0.5629755258560181,
+      "learning_rate": 0.00032,
+      "loss": 0.1471,
+      "step": 65
+    },
+    {
+      "epoch": 0.03442879499217527,
+      "grad_norm": 0.5237282514572144,
+      "learning_rate": 0.00032500000000000004,
+      "loss": 0.1244,
+      "step": 66
+    },
+    {
+      "epoch": 0.03495044340114763,
+      "grad_norm": 0.7248942852020264,
+      "learning_rate": 0.00033,
+      "loss": 0.1933,
+      "step": 67
+    },
+    {
+      "epoch": 0.03547209181011998,
+      "grad_norm": 0.49564772844314575,
+      "learning_rate": 0.000335,
+      "loss": 0.1389,
+      "step": 68
+    },
+    {
+      "epoch": 0.03599374021909233,
+      "grad_norm": 0.4806594252586365,
+      "learning_rate": 0.00034,
+      "loss": 0.1295,
+      "step": 69
+    },
+    {
+      "epoch": 0.036515388628064686,
+      "grad_norm": 0.39995619654655457,
+      "learning_rate": 0.000345,
+      "loss": 0.1324,
+      "step": 70
+    },
+    {
+      "epoch": 0.037037037037037035,
+      "grad_norm": 0.6496027708053589,
+      "learning_rate": 0.00035,
+      "loss": 0.1002,
+      "step": 71
+    },
+    {
+      "epoch": 0.03755868544600939,
+      "grad_norm": 0.5661569237709045,
+      "learning_rate": 0.000355,
+      "loss": 0.1277,
+      "step": 72
+    },
+    {
+      "epoch": 0.03808033385498174,
+      "grad_norm": 0.49875250458717346,
+      "learning_rate": 0.00035999999999999997,
+      "loss": 0.1322,
+      "step": 73
+    },
+    {
+      "epoch": 0.038601982263954095,
+      "grad_norm": 0.44551461935043335,
+      "learning_rate": 0.000365,
+      "loss": 0.1278,
+      "step": 74
+    },
+    {
+      "epoch": 0.03912363067292645,
+      "grad_norm": 0.3314933478832245,
+      "learning_rate": 0.00037,
+      "loss": 0.0918,
+      "step": 75
+    },
+    {
+      "epoch": 0.0396452790818988,
+      "grad_norm": 0.3463922441005707,
+      "learning_rate": 0.000375,
+      "loss": 0.0948,
+      "step": 76
+    },
+    {
+      "epoch": 0.040166927490871154,
+      "grad_norm": 0.5401505827903748,
+      "learning_rate": 0.00038,
+      "loss": 0.1574,
+      "step": 77
+    },
+    {
+      "epoch": 0.0406885758998435,
+      "grad_norm": 0.39233317971229553,
+      "learning_rate": 0.00038500000000000003,
+      "loss": 0.1312,
+      "step": 78
+    },
+    {
+      "epoch": 0.04121022430881586,
+      "grad_norm": 0.4380398988723755,
+      "learning_rate": 0.00039000000000000005,
+      "loss": 0.0601,
+      "step": 79
+    },
+    {
+      "epoch": 0.041731872717788214,
+      "grad_norm": 0.3931694030761719,
+      "learning_rate": 0.000395,
+      "loss": 0.0962,
+      "step": 80
+    },
+    {
+      "epoch": 0.04225352112676056,
+      "grad_norm": 0.3566243648529053,
+      "learning_rate": 0.0004,
+      "loss": 0.1137,
+      "step": 81
+    },
+    {
+      "epoch": 0.04277516953573292,
+      "grad_norm": 0.40159469842910767,
+      "learning_rate": 0.00040500000000000003,
+      "loss": 0.1128,
+      "step": 82
+    },
+    {
+      "epoch": 0.04329681794470527,
+      "grad_norm": 0.30474773049354553,
+      "learning_rate": 0.00041,
+      "loss": 0.0922,
+      "step": 83
+    },
+    {
+      "epoch": 0.04381846635367762,
+      "grad_norm": 0.31177017092704773,
+      "learning_rate": 0.000415,
+      "loss": 0.1015,
+      "step": 84
+    },
+    {
+      "epoch": 0.04434011476264997,
+      "grad_norm": 0.3996855914592743,
+      "learning_rate": 0.00042,
+      "loss": 0.1266,
+      "step": 85
+    },
+    {
+      "epoch": 0.044861763171622326,
+      "grad_norm": 0.2281728833913803,
+      "learning_rate": 0.000425,
+      "loss": 0.0758,
+      "step": 86
+    },
+    {
+      "epoch": 0.04538341158059468,
+      "grad_norm": 0.5169669985771179,
+      "learning_rate": 0.00043,
+      "loss": 0.1092,
+      "step": 87
+    },
+    {
+      "epoch": 0.04590505998956703,
+      "grad_norm": 0.5525585412979126,
+      "learning_rate": 0.000435,
+      "loss": 0.1226,
+      "step": 88
+    },
+    {
+      "epoch": 0.046426708398539386,
+      "grad_norm": 0.33093884587287903,
+      "learning_rate": 0.00044,
+      "loss": 0.0879,
+      "step": 89
+    },
+    {
+      "epoch": 0.046948356807511735,
+      "grad_norm": 0.3713582158088684,
+      "learning_rate": 0.00044500000000000003,
+      "loss": 0.1121,
+      "step": 90
+    },
+    {
+      "epoch": 0.04747000521648409,
+      "grad_norm": 0.565517246723175,
+      "learning_rate": 0.00045000000000000004,
+      "loss": 0.1469,
+      "step": 91
+    },
+    {
+      "epoch": 0.047991653625456446,
+      "grad_norm": 0.31801939010620117,
+      "learning_rate": 0.000455,
+      "loss": 0.0915,
+      "step": 92
+    },
+    {
+      "epoch": 0.048513302034428794,
+      "grad_norm": 0.42586401104927063,
+      "learning_rate": 0.00046,
+      "loss": 0.0411,
+      "step": 93
+    },
+    {
+      "epoch": 0.04903495044340115,
+      "grad_norm": 0.42403289675712585,
+      "learning_rate": 0.000465,
+      "loss": 0.0589,
+      "step": 94
+    },
+    {
+      "epoch": 0.0495565988523735,
+      "grad_norm": 0.2604529559612274,
+      "learning_rate": 0.00047,
+      "loss": 0.0779,
+      "step": 95
+    },
+    {
+      "epoch": 0.050078247261345854,
+      "grad_norm": 0.32257840037345886,
+      "learning_rate": 0.000475,
+      "loss": 0.0958,
+      "step": 96
+    },
+    {
+      "epoch": 0.0505998956703182,
+      "grad_norm": 0.2648946940898895,
+      "learning_rate": 0.00048,
+      "loss": 0.0591,
+      "step": 97
+    },
+    {
+      "epoch": 0.05112154407929056,
+      "grad_norm": 0.26664629578590393,
+      "learning_rate": 0.00048499999999999997,
+      "loss": 0.0607,
+      "step": 98
+    },
+    {
+      "epoch": 0.051643192488262914,
+      "grad_norm": 0.2891658544540405,
+      "learning_rate": 0.00049,
+      "loss": 0.0478,
+      "step": 99
+    },
+    {
+      "epoch": 0.05216484089723526,
+      "grad_norm": 0.35936883091926575,
+      "learning_rate": 0.000495,
+      "loss": 0.1126,
+      "step": 100
+    },
+    {
+      "epoch": 0.05268648930620762,
+      "grad_norm": 0.3226841986179352,
+      "learning_rate": 0.0005,
+      "loss": 0.0995,
+      "step": 101
+    },
+    {
+      "epoch": 0.053208137715179966,
+      "grad_norm": 0.2140406370162964,
+      "learning_rate": 0.0004994444444444445,
+      "loss": 0.0636,
+      "step": 102
+    },
+    {
+      "epoch": 0.05372978612415232,
+      "grad_norm": 0.28297877311706543,
+      "learning_rate": 0.0004988888888888889,
+      "loss": 0.0674,
+      "step": 103
+    },
+    {
+      "epoch": 0.05425143453312467,
+      "grad_norm": 0.27131739258766174,
+      "learning_rate": 0.0004983333333333334,
+      "loss": 0.0657,
+      "step": 104
+    },
+    {
+      "epoch": 0.054773082942097026,
+      "grad_norm": 0.28402701020240784,
+      "learning_rate": 0.0004977777777777778,
+      "loss": 0.0894,
+      "step": 105
+    },
+    {
+      "epoch": 0.05529473135106938,
+      "grad_norm": 0.33924373984336853,
+      "learning_rate": 0.0004972222222222222,
+      "loss": 0.1264,
+      "step": 106
+    },
+    {
+      "epoch": 0.05581637976004173,
+      "grad_norm": 0.3655984401702881,
+      "learning_rate": 0.0004966666666666666,
+      "loss": 0.0828,
+      "step": 107
+    },
+    {
+      "epoch": 0.056338028169014086,
+      "grad_norm": 0.2262953370809555,
+      "learning_rate": 0.0004961111111111111,
+      "loss": 0.0662,
+      "step": 108
+    },
+    {
+      "epoch": 0.056859676577986434,
+      "grad_norm": 0.23988084495067596,
+      "learning_rate": 0.0004955555555555556,
+      "loss": 0.0672,
+      "step": 109
+    },
+    {
+      "epoch": 0.05738132498695879,
+      "grad_norm": 0.228820338845253,
+      "learning_rate": 0.000495,
+      "loss": 0.0615,
+      "step": 110
+    },
+    {
+      "epoch": 0.057902973395931145,
+      "grad_norm": 0.32484373450279236,
+      "learning_rate": 0.0004944444444444445,
+      "loss": 0.0833,
+      "step": 111
+    },
+    {
+      "epoch": 0.058424621804903494,
+      "grad_norm": 0.22520330548286438,
+      "learning_rate": 0.0004938888888888889,
+      "loss": 0.0767,
+      "step": 112
+    },
+    {
+      "epoch": 0.05894627021387585,
+      "grad_norm": 0.4783564805984497,
+      "learning_rate": 0.0004933333333333334,
+      "loss": 0.0999,
+      "step": 113
+    },
+    {
+      "epoch": 0.0594679186228482,
+      "grad_norm": 0.2565033733844757,
+      "learning_rate": 0.0004927777777777777,
+      "loss": 0.0819,
+      "step": 114
+    },
+    {
+      "epoch": 0.059989567031820554,
+      "grad_norm": 0.19332879781723022,
+      "learning_rate": 0.0004922222222222222,
+      "loss": 0.0702,
+      "step": 115
+    },
+    {
+      "epoch": 0.0605112154407929,
+      "grad_norm": 0.2507823705673218,
+      "learning_rate": 0.0004916666666666666,
+      "loss": 0.076,
+      "step": 116
+    },
+    {
+      "epoch": 0.06103286384976526,
+      "grad_norm": 0.29689472913742065,
+      "learning_rate": 0.0004911111111111111,
+      "loss": 0.0748,
+      "step": 117
+    },
+    {
+      "epoch": 0.06155451225873761,
+      "grad_norm": 0.34821203351020813,
+      "learning_rate": 0.0004905555555555556,
+      "loss": 0.0949,
+      "step": 118
+    },
+    {
+      "epoch": 0.06207616066770996,
+      "grad_norm": 0.25025618076324463,
+      "learning_rate": 0.00049,
+      "loss": 0.0813,
+      "step": 119
+    },
+    {
+      "epoch": 0.06259780907668232,
+      "grad_norm": 0.23138757050037384,
+      "learning_rate": 0.0004894444444444445,
+      "loss": 0.0806,
+      "step": 120
+    },
+    {
+      "epoch": 0.06311945748565467,
+      "grad_norm": 0.25655433535575867,
+      "learning_rate": 0.0004888888888888889,
+      "loss": 0.0864,
+      "step": 121
+    },
+    {
+      "epoch": 0.06364110589462701,
+      "grad_norm": 0.2863710820674896,
+      "learning_rate": 0.0004883333333333333,
+      "loss": 0.0659,
+      "step": 122
+    },
+    {
+      "epoch": 0.06416275430359937,
+      "grad_norm": 0.2628318965435028,
+      "learning_rate": 0.0004877777777777778,
+      "loss": 0.0746,
+      "step": 123
+    },
+    {
+      "epoch": 0.06468440271257173,
+      "grad_norm": 0.2095496952533722,
+      "learning_rate": 0.0004872222222222222,
+      "loss": 0.0746,
+      "step": 124
+    },
+    {
+      "epoch": 0.06520605112154408,
+      "grad_norm": 0.25687775015830994,
+      "learning_rate": 0.0004866666666666667,
+      "loss": 0.0867,
+      "step": 125
+    },
+    {
+      "epoch": 0.06572769953051644,
+      "grad_norm": 0.3623638153076172,
+      "learning_rate": 0.0004861111111111111,
+      "loss": 0.0859,
+      "step": 126
+    },
+    {
+      "epoch": 0.06624934793948878,
+      "grad_norm": 0.22254744172096252,
+      "learning_rate": 0.0004855555555555556,
+      "loss": 0.0956,
+      "step": 127
+    },
+    {
+      "epoch": 0.06677099634846113,
+      "grad_norm": 0.42705070972442627,
+      "learning_rate": 0.00048499999999999997,
+      "loss": 0.0885,
+      "step": 128
+    },
+    {
+      "epoch": 0.06729264475743349,
+      "grad_norm": 0.23360145092010498,
+      "learning_rate": 0.00048444444444444446,
+      "loss": 0.0521,
+      "step": 129
+    },
+    {
+      "epoch": 0.06781429316640585,
+      "grad_norm": 0.1959061473608017,
+      "learning_rate": 0.0004838888888888889,
+      "loss": 0.043,
+      "step": 130
+    },
+    {
+      "epoch": 0.0683359415753782,
+      "grad_norm": 0.32006219029426575,
+      "learning_rate": 0.00048333333333333334,
+      "loss": 0.0942,
+      "step": 131
+    },
+    {
+      "epoch": 0.06885758998435054,
+      "grad_norm": 0.20010985434055328,
+      "learning_rate": 0.0004827777777777778,
+      "loss": 0.0645,
+      "step": 132
+    },
+    {
+      "epoch": 0.0693792383933229,
+      "grad_norm": 0.18007700145244598,
+      "learning_rate": 0.0004822222222222222,
+      "loss": 0.0593,
+      "step": 133
+    },
+    {
+      "epoch": 0.06990088680229525,
+      "grad_norm": 0.23080182075500488,
+      "learning_rate": 0.0004816666666666667,
+      "loss": 0.069,
+      "step": 134
+    },
+    {
+      "epoch": 0.07042253521126761,
+      "grad_norm": 0.16220460832118988,
+      "learning_rate": 0.0004811111111111111,
+      "loss": 0.0499,
+      "step": 135
+    },
+    {
+      "epoch": 0.07094418362023996,
+      "grad_norm": 0.19325301051139832,
+      "learning_rate": 0.0004805555555555556,
+      "loss": 0.0616,
+      "step": 136
+    },
+    {
+      "epoch": 0.0714658320292123,
+      "grad_norm": 0.16364900767803192,
+      "learning_rate": 0.00048,
+      "loss": 0.0612,
+      "step": 137
+    },
+    {
+      "epoch": 0.07198748043818466,
+      "grad_norm": 0.15745937824249268,
+      "learning_rate": 0.00047944444444444445,
+      "loss": 0.0526,
+      "step": 138
+    },
+    {
+      "epoch": 0.07250912884715702,
+      "grad_norm": 0.22706539928913116,
+      "learning_rate": 0.0004788888888888889,
+      "loss": 0.067,
+      "step": 139
+    },
+    {
+      "epoch": 0.07303077725612937,
+      "grad_norm": 0.22147034108638763,
+      "learning_rate": 0.0004783333333333333,
+      "loss": 0.0684,
+      "step": 140
+    },
+    {
+      "epoch": 0.07355242566510173,
+      "grad_norm": 0.2623853385448456,
+      "learning_rate": 0.0004777777777777778,
+      "loss": 0.0491,
+      "step": 141
+    },
+    {
+      "epoch": 0.07407407407407407,
+      "grad_norm": 0.1899435669183731,
+      "learning_rate": 0.00047722222222222225,
+      "loss": 0.029,
+      "step": 142
+    },
+    {
+      "epoch": 0.07459572248304643,
+      "grad_norm": 0.2669859826564789,
+      "learning_rate": 0.0004766666666666667,
+      "loss": 0.064,
+      "step": 143
+    },
+    {
+      "epoch": 0.07511737089201878,
+      "grad_norm": 0.18063829839229584,
+      "learning_rate": 0.0004761111111111111,
+      "loss": 0.0624,
+      "step": 144
+    },
+    {
+      "epoch": 0.07563901930099114,
+      "grad_norm": 0.22147716581821442,
+      "learning_rate": 0.00047555555555555556,
+      "loss": 0.0544,
+      "step": 145
+    },
+    {
+      "epoch": 0.07616066770996348,
+      "grad_norm": 0.30522170662879944,
+      "learning_rate": 0.000475,
+      "loss": 0.077,
+      "step": 146
+    },
+    {
+      "epoch": 0.07668231611893583,
+      "grad_norm": 0.15942497551441193,
+      "learning_rate": 0.00047444444444444444,
+      "loss": 0.0372,
+      "step": 147
+    },
+    {
+      "epoch": 0.07720396452790819,
+      "grad_norm": 0.1456826627254486,
+      "learning_rate": 0.00047388888888888893,
+      "loss": 0.0423,
+      "step": 148
+    },
+    {
+      "epoch": 0.07772561293688054,
+      "grad_norm": 0.17793269455432892,
+      "learning_rate": 0.00047333333333333336,
+      "loss": 0.0559,
+      "step": 149
+    },
+    {
+      "epoch": 0.0782472613458529,
+      "grad_norm": 0.152329221367836,
+      "learning_rate": 0.0004727777777777778,
+      "loss": 0.0266,
+      "step": 150
+    },
+    {
+      "epoch": 0.07876890975482524,
+      "grad_norm": 0.19327858090400696,
+      "learning_rate": 0.00047222222222222224,
+      "loss": 0.0608,
+      "step": 151
+    },
+    {
+      "epoch": 0.0792905581637976,
+      "grad_norm": 0.15060095489025116,
+      "learning_rate": 0.0004716666666666667,
+      "loss": 0.0461,
+      "step": 152
+    },
+    {
+      "epoch": 0.07981220657276995,
+      "grad_norm": 0.1864742785692215,
+      "learning_rate": 0.0004711111111111111,
+      "loss": 0.0724,
+      "step": 153
+    },
+    {
+      "epoch": 0.08033385498174231,
+      "grad_norm": 0.1422508805990219,
+      "learning_rate": 0.00047055555555555555,
+      "loss": 0.0325,
+      "step": 154
+    },
+    {
+      "epoch": 0.08085550339071466,
+      "grad_norm": 0.21115481853485107,
+      "learning_rate": 0.00047,
+      "loss": 0.0535,
+      "step": 155
+    },
+    {
+      "epoch": 0.081377151799687,
+      "grad_norm": 0.2197350263595581,
+      "learning_rate": 0.0004694444444444445,
+      "loss": 0.0703,
+      "step": 156
+    },
+    {
+      "epoch": 0.08189880020865936,
+      "grad_norm": 0.1608528196811676,
+      "learning_rate": 0.0004688888888888889,
+      "loss": 0.0447,
+      "step": 157
+    },
+    {
+      "epoch": 0.08242044861763172,
+      "grad_norm": 0.1445985585451126,
+      "learning_rate": 0.00046833333333333335,
+      "loss": 0.0469,
+      "step": 158
+    },
+    {
+      "epoch": 0.08294209702660407,
+      "grad_norm": 0.25215667486190796,
+      "learning_rate": 0.0004677777777777778,
+      "loss": 0.0709,
+      "step": 159
+    },
+    {
+      "epoch": 0.08346374543557643,
+      "grad_norm": 0.14391636848449707,
+      "learning_rate": 0.0004672222222222222,
+      "loss": 0.0457,
+      "step": 160
+    },
+    {
+      "epoch": 0.08398539384454877,
+      "grad_norm": 0.29619306325912476,
+      "learning_rate": 0.00046666666666666666,
+      "loss": 0.0765,
+      "step": 161
+    },
+    {
+      "epoch": 0.08450704225352113,
+      "grad_norm": 0.15701289474964142,
+      "learning_rate": 0.0004661111111111111,
+      "loss": 0.0418,
+      "step": 162
+    },
+    {
+      "epoch": 0.08502869066249348,
+      "grad_norm": 0.1698683649301529,
+      "learning_rate": 0.0004655555555555556,
+      "loss": 0.0294,
+      "step": 163
+    },
+    {
+      "epoch": 0.08555033907146584,
+      "grad_norm": 0.12165573239326477,
+      "learning_rate": 0.000465,
+      "loss": 0.0258,
+      "step": 164
+    },
+    {
+      "epoch": 0.08607198748043818,
+      "grad_norm": 0.1611219197511673,
+      "learning_rate": 0.00046444444444444446,
+      "loss": 0.0491,
+      "step": 165
+    },
+    {
+      "epoch": 0.08659363588941053,
+      "grad_norm": 0.1486036628484726,
+      "learning_rate": 0.0004638888888888889,
+      "loss": 0.0479,
+      "step": 166
+    },
+    {
+      "epoch": 0.08711528429838289,
+      "grad_norm": 0.13054965436458588,
+      "learning_rate": 0.00046333333333333334,
+      "loss": 0.0401,
+      "step": 167
+    },
+    {
+      "epoch": 0.08763693270735524,
+      "grad_norm": 0.15433131158351898,
+      "learning_rate": 0.0004627777777777778,
+      "loss": 0.048,
+      "step": 168
+    },
+    {
+      "epoch": 0.0881585811163276,
+      "grad_norm": 0.17511604726314545,
+      "learning_rate": 0.0004622222222222222,
+      "loss": 0.0569,
+      "step": 169
+    },
+    {
+      "epoch": 0.08868022952529994,
+      "grad_norm": 0.1398395150899887,
+      "learning_rate": 0.0004616666666666667,
+      "loss": 0.034,
+      "step": 170
+    },
+    {
+      "epoch": 0.0892018779342723,
+      "grad_norm": 0.15484075248241425,
+      "learning_rate": 0.00046111111111111114,
+      "loss": 0.0514,
+      "step": 171
+    },
+    {
+      "epoch": 0.08972352634324465,
+      "grad_norm": 0.17851784825325012,
+      "learning_rate": 0.0004605555555555556,
+      "loss": 0.0571,
+      "step": 172
+    },
+    {
+      "epoch": 0.09024517475221701,
+      "grad_norm": 0.18745650351047516,
+      "learning_rate": 0.00046,
+      "loss": 0.0523,
+      "step": 173
+    },
+    {
+      "epoch": 0.09076682316118936,
+      "grad_norm": 0.18322691321372986,
+      "learning_rate": 0.00045944444444444445,
+      "loss": 0.0642,
+      "step": 174
+    },
+    {
+      "epoch": 0.0912884715701617,
+      "grad_norm": 0.1173708513379097,
+      "learning_rate": 0.0004588888888888889,
+      "loss": 0.0267,
+      "step": 175
+    },
+    {
+      "epoch": 0.09181011997913406,
+      "grad_norm": 0.1754874438047409,
+      "learning_rate": 0.0004583333333333333,
+      "loss": 0.0657,
+      "step": 176
+    },
+    {
+      "epoch": 0.09233176838810642,
+      "grad_norm": 0.13830502331256866,
+      "learning_rate": 0.0004577777777777778,
+      "loss": 0.0433,
+      "step": 177
+    },
+    {
+      "epoch": 0.09285341679707877,
+      "grad_norm": 0.11174938827753067,
+      "learning_rate": 0.0004572222222222222,
+      "loss": 0.04,
+      "step": 178
+    },
+    {
+      "epoch": 0.09337506520605113,
+      "grad_norm": 0.1829378753900528,
+      "learning_rate": 0.0004566666666666667,
+      "loss": 0.0453,
+      "step": 179
+    },
+    {
+      "epoch": 0.09389671361502347,
+      "grad_norm": 0.10748015344142914,
+      "learning_rate": 0.0004561111111111111,
+      "loss": 0.05,
+      "step": 180
+    },
+    {
+      "epoch": 0.09441836202399582,
+      "grad_norm": 0.1160806268453598,
+      "learning_rate": 0.00045555555555555556,
+      "loss": 0.0245,
+      "step": 181
+    },
+    {
+      "epoch": 0.09494001043296818,
+      "grad_norm": 0.12387479841709137,
+      "learning_rate": 0.000455,
+      "loss": 0.0259,
+      "step": 182
+    },
+    {
+      "epoch": 0.09546165884194054,
+      "grad_norm": 0.1586403250694275,
+      "learning_rate": 0.00045444444444444444,
+      "loss": 0.0378,
+      "step": 183
+    },
+    {
+      "epoch": 0.09598330725091289,
+      "grad_norm": 0.18905822932720184,
+      "learning_rate": 0.00045388888888888893,
+      "loss": 0.0484,
+      "step": 184
+    },
+    {
+      "epoch": 0.09650495565988523,
+      "grad_norm": 0.17541544139385223,
+      "learning_rate": 0.0004533333333333333,
+      "loss": 0.0503,
+      "step": 185
+    },
+    {
+      "epoch": 0.09702660406885759,
+      "grad_norm": 0.1083071306347847,
+      "learning_rate": 0.0004527777777777778,
+      "loss": 0.0439,
+      "step": 186
+    },
+    {
+      "epoch": 0.09754825247782994,
+      "grad_norm": 0.10464104264974594,
+      "learning_rate": 0.00045222222222222224,
+      "loss": 0.0271,
+      "step": 187
+    },
+    {
+      "epoch": 0.0980699008868023,
+      "grad_norm": 0.18022054433822632,
+      "learning_rate": 0.0004516666666666667,
+      "loss": 0.0589,
+      "step": 188
+    },
+    {
+      "epoch": 0.09859154929577464,
+      "grad_norm": 0.18715251982212067,
+      "learning_rate": 0.0004511111111111111,
+      "loss": 0.0489,
+      "step": 189
+    },
+    {
+      "epoch": 0.099113197704747,
+      "grad_norm": 0.10440787672996521,
+      "learning_rate": 0.00045055555555555555,
+      "loss": 0.0221,
+      "step": 190
+    },
+    {
+      "epoch": 0.09963484611371935,
+      "grad_norm": 0.11525921523571014,
+      "learning_rate": 0.00045000000000000004,
+      "loss": 0.0427,
+      "step": 191
+    },
+    {
+      "epoch": 0.10015649452269171,
+      "grad_norm": 0.1573028564453125,
+      "learning_rate": 0.0004494444444444444,
+      "loss": 0.04,
+      "step": 192
+    },
+    {
+      "epoch": 0.10067814293166406,
+      "grad_norm": 0.15942253172397614,
+      "learning_rate": 0.0004488888888888889,
+      "loss": 0.045,
+      "step": 193
+    },
+    {
+      "epoch": 0.1011997913406364,
+      "grad_norm": 0.2997572422027588,
+      "learning_rate": 0.0004483333333333333,
+      "loss": 0.062,
+      "step": 194
+    },
+    {
+      "epoch": 0.10172143974960876,
+      "grad_norm": 0.1859196424484253,
+      "learning_rate": 0.0004477777777777778,
+      "loss": 0.0496,
+      "step": 195
+    },
+    {
+      "epoch": 0.10224308815858112,
+      "grad_norm": 0.1265893131494522,
+      "learning_rate": 0.0004472222222222222,
+      "loss": 0.0457,
+      "step": 196
+    },
+    {
+      "epoch": 0.10276473656755347,
+      "grad_norm": 0.16036029160022736,
+      "learning_rate": 0.00044666666666666666,
+      "loss": 0.046,
+      "step": 197
+    },
+    {
+      "epoch": 0.10328638497652583,
+      "grad_norm": 0.10421448945999146,
+      "learning_rate": 0.00044611111111111115,
+      "loss": 0.033,
+      "step": 198
+    },
+    {
+      "epoch": 0.10380803338549817,
+      "grad_norm": 0.12321974337100983,
+      "learning_rate": 0.00044555555555555554,
+      "loss": 0.0458,
+      "step": 199
+    },
+    {
+      "epoch": 0.10432968179447052,
+      "grad_norm": 0.13863791525363922,
+      "learning_rate": 0.00044500000000000003,
+      "loss": 0.0221,
+      "step": 200
+    },
+    {
+      "epoch": 0.10485133020344288,
+      "grad_norm": 0.11896353214979172,
+      "learning_rate": 0.0004444444444444444,
+      "loss": 0.0477,
+      "step": 201
+    },
+    {
+      "epoch": 0.10537297861241524,
+      "grad_norm": 0.1473105251789093,
+      "learning_rate": 0.0004438888888888889,
+      "loss": 0.056,
+      "step": 202
+    },
+    {
+      "epoch": 0.10589462702138759,
+      "grad_norm": 0.15058237314224243,
+      "learning_rate": 0.00044333333333333334,
+      "loss": 0.0477,
+      "step": 203
+    },
+    {
+      "epoch": 0.10641627543035993,
+      "grad_norm": 0.10770102590322495,
+      "learning_rate": 0.0004427777777777778,
+      "loss": 0.0316,
+      "step": 204
+    },
+    {
+      "epoch": 0.10693792383933229,
+      "grad_norm": 0.13766999542713165,
+      "learning_rate": 0.00044222222222222227,
+      "loss": 0.041,
+      "step": 205
+    },
+    {
+      "epoch": 0.10745957224830464,
+      "grad_norm": 0.11786706745624542,
+      "learning_rate": 0.00044166666666666665,
+      "loss": 0.0302,
+      "step": 206
+    },
+    {
+      "epoch": 0.107981220657277,
+      "grad_norm": 0.10209888964891434,
+      "learning_rate": 0.00044111111111111114,
+      "loss": 0.0396,
+      "step": 207
+    },
+    {
+      "epoch": 0.10850286906624934,
+      "grad_norm": 0.13609950244426727,
+      "learning_rate": 0.0004405555555555555,
+      "loss": 0.0394,
+      "step": 208
+    },
+    {
+      "epoch": 0.1090245174752217,
+      "grad_norm": 0.11915361881256104,
+      "learning_rate": 0.00044,
+      "loss": 0.0421,
+      "step": 209
+    },
+    {
+      "epoch": 0.10954616588419405,
+      "grad_norm": 0.11170439422130585,
+      "learning_rate": 0.0004394444444444445,
+      "loss": 0.0395,
+      "step": 210
+    },
+    {
+      "epoch": 0.11006781429316641,
+      "grad_norm": 0.12584055960178375,
+      "learning_rate": 0.0004388888888888889,
+      "loss": 0.0534,
+      "step": 211
+    },
+    {
+      "epoch": 0.11058946270213876,
+      "grad_norm": 0.1454746276140213,
+      "learning_rate": 0.0004383333333333334,
+      "loss": 0.0469,
+      "step": 212
+    },
+    {
+      "epoch": 0.1111111111111111,
+      "grad_norm": 0.10297106951475143,
+      "learning_rate": 0.00043777777777777776,
+      "loss": 0.0359,
+      "step": 213
+    },
+    {
+      "epoch": 0.11163275952008346,
+      "grad_norm": 0.10994141548871994,
+      "learning_rate": 0.00043722222222222225,
+      "loss": 0.0404,
+      "step": 214
+    },
+    {
+      "epoch": 0.11215440792905582,
+      "grad_norm": 0.13165079057216644,
+      "learning_rate": 0.00043666666666666664,
+      "loss": 0.0475,
+      "step": 215
+    },
+    {
+      "epoch": 0.11267605633802817,
+      "grad_norm": 0.11115416139364243,
+      "learning_rate": 0.00043611111111111113,
+      "loss": 0.0351,
+      "step": 216
+    },
+    {
+      "epoch": 0.11319770474700053,
+      "grad_norm": 0.15927758812904358,
+      "learning_rate": 0.0004355555555555555,
+      "loss": 0.0468,
+      "step": 217
+    },
+    {
+      "epoch": 0.11371935315597287,
+      "grad_norm": 0.0941813513636589,
+      "learning_rate": 0.000435,
+      "loss": 0.0337,
+      "step": 218
+    },
+    {
+      "epoch": 0.11424100156494522,
+      "grad_norm": 0.10850685834884644,
+      "learning_rate": 0.0004344444444444445,
+      "loss": 0.0211,
+      "step": 219
+    },
+    {
+      "epoch": 0.11476264997391758,
+      "grad_norm": 0.0790611058473587,
+      "learning_rate": 0.0004338888888888889,
+      "loss": 0.0196,
+      "step": 220
+    },
+    {
+      "epoch": 0.11528429838288994,
+      "grad_norm": 0.10849782079458237,
+      "learning_rate": 0.00043333333333333337,
+      "loss": 0.04,
+      "step": 221
+    },
+    {
+      "epoch": 0.11580594679186229,
+      "grad_norm": 0.09607880562543869,
+      "learning_rate": 0.00043277777777777775,
+      "loss": 0.0291,
+      "step": 222
+    },
+    {
+      "epoch": 0.11632759520083463,
+      "grad_norm": 0.17959930002689362,
+      "learning_rate": 0.00043222222222222224,
+      "loss": 0.0426,
+      "step": 223
+    },
+    {
+      "epoch": 0.11684924360980699,
+      "grad_norm": 0.08865644782781601,
+      "learning_rate": 0.0004316666666666667,
+      "loss": 0.0321,
+      "step": 224
+    },
+    {
+      "epoch": 0.11737089201877934,
+      "grad_norm": 0.17324934899806976,
+      "learning_rate": 0.0004311111111111111,
+      "loss": 0.0537,
+      "step": 225
+    },
+    {
+      "epoch": 0.1178925404277517,
+      "grad_norm": 0.10226263850927353,
+      "learning_rate": 0.0004305555555555556,
+      "loss": 0.0342,
+      "step": 226
+    },
+    {
+      "epoch": 0.11841418883672405,
+      "grad_norm": 0.10456152260303497,
+      "learning_rate": 0.00043,
+      "loss": 0.039,
+      "step": 227
+    },
+    {
+      "epoch": 0.1189358372456964,
+      "grad_norm": 0.10196290910243988,
+      "learning_rate": 0.0004294444444444445,
+      "loss": 0.0329,
+      "step": 228
+    },
+    {
+      "epoch": 0.11945748565466875,
+      "grad_norm": 0.12004778534173965,
+      "learning_rate": 0.00042888888888888886,
+      "loss": 0.0434,
+      "step": 229
+    },
+    {
+      "epoch": 0.11997913406364111,
+      "grad_norm": 0.10152442753314972,
+      "learning_rate": 0.00042833333333333335,
+      "loss": 0.0305,
+      "step": 230
+    },
+    {
+      "epoch": 0.12050078247261346,
+      "grad_norm": 0.1072554886341095,
+      "learning_rate": 0.0004277777777777778,
+      "loss": 0.0407,
+      "step": 231
+    },
+    {
+      "epoch": 0.1210224308815858,
+      "grad_norm": 0.08478479087352753,
+      "learning_rate": 0.00042722222222222223,
+      "loss": 0.0375,
+      "step": 232
+    },
+    {
+      "epoch": 0.12154407929055816,
+      "grad_norm": 0.11901957541704178,
+      "learning_rate": 0.0004266666666666667,
+      "loss": 0.0281,
+      "step": 233
+    },
+    {
+      "epoch": 0.12206572769953052,
+      "grad_norm": 0.097981758415699,
+      "learning_rate": 0.0004261111111111111,
+      "loss": 0.0365,
+      "step": 234
+    },
+    {
+      "epoch": 0.12258737610850287,
+      "grad_norm": 0.08464547991752625,
+      "learning_rate": 0.0004255555555555556,
+      "loss": 0.0227,
+      "step": 235
+    },
+    {
+      "epoch": 0.12310902451747523,
+      "grad_norm": 0.18886807560920715,
+      "learning_rate": 0.000425,
+      "loss": 0.0494,
+      "step": 236
+    },
+    {
+      "epoch": 0.12363067292644757,
+      "grad_norm": 0.08432997763156891,
+      "learning_rate": 0.00042444444444444447,
+      "loss": 0.031,
+      "step": 237
+    },
+    {
+      "epoch": 0.12415232133541992,
+      "grad_norm": 0.24738061428070068,
+      "learning_rate": 0.0004238888888888889,
+      "loss": 0.0611,
+      "step": 238
+    },
+    {
+      "epoch": 0.12467396974439228,
+      "grad_norm": 0.11955960839986801,
+      "learning_rate": 0.00042333333333333334,
+      "loss": 0.0481,
+      "step": 239
+    },
+    {
+      "epoch": 0.12519561815336464,
+      "grad_norm": 0.132662832736969,
+      "learning_rate": 0.0004227777777777778,
+      "loss": 0.0432,
+      "step": 240
+    },
+    {
+      "epoch": 0.12571726656233698,
+      "grad_norm": 0.08496639877557755,
+      "learning_rate": 0.0004222222222222222,
+      "loss": 0.0328,
+      "step": 241
+    },
+    {
+      "epoch": 0.12623891497130935,
+      "grad_norm": 0.13830861449241638,
+      "learning_rate": 0.0004216666666666667,
+      "loss": 0.0336,
+      "step": 242
+    },
+    {
+      "epoch": 0.1267605633802817,
+      "grad_norm": 0.12200845032930374,
+      "learning_rate": 0.0004211111111111111,
+      "loss": 0.0346,
+      "step": 243
+    },
+    {
+      "epoch": 0.12728221178925403,
+      "grad_norm": 0.10438041388988495,
+      "learning_rate": 0.0004205555555555556,
+      "loss": 0.039,
+      "step": 244
+    },
+    {
+      "epoch": 0.1278038601982264,
+      "grad_norm": 0.10238846391439438,
+      "learning_rate": 0.00042,
+      "loss": 0.0442,
+      "step": 245
+    },
+    {
+      "epoch": 0.12832550860719874,
+      "grad_norm": 0.10930721461772919,
+      "learning_rate": 0.00041944444444444445,
+      "loss": 0.0426,
+      "step": 246
+    },
+    {
+      "epoch": 0.1288471570161711,
+      "grad_norm": 0.09867265820503235,
+      "learning_rate": 0.0004188888888888889,
+      "loss": 0.0402,
+      "step": 247
+    },
+    {
+      "epoch": 0.12936880542514345,
+      "grad_norm": 0.1137848049402237,
+      "learning_rate": 0.00041833333333333333,
+      "loss": 0.0278,
+      "step": 248
+    },
+    {
+      "epoch": 0.1298904538341158,
+      "grad_norm": 0.1364007592201233,
+      "learning_rate": 0.0004177777777777778,
+      "loss": 0.0437,
+      "step": 249
+    },
+    {
+      "epoch": 0.13041210224308816,
+      "grad_norm": 0.09385659545660019,
+      "learning_rate": 0.0004172222222222222,
+      "loss": 0.0353,
+      "step": 250
+    },
+    {
+      "epoch": 0.1309337506520605,
+      "grad_norm": 0.1302153617143631,
+      "learning_rate": 0.0004166666666666667,
+      "loss": 0.0287,
+      "step": 251
+    },
+    {
+      "epoch": 0.13145539906103287,
+      "grad_norm": 0.09976278990507126,
+      "learning_rate": 0.00041611111111111113,
+      "loss": 0.0381,
+      "step": 252
+    },
+    {
+      "epoch": 0.13197704747000522,
+      "grad_norm": 0.0966271236538887,
+      "learning_rate": 0.00041555555555555557,
+      "loss": 0.0204,
+      "step": 253
+    },
+    {
+      "epoch": 0.13249869587897756,
+      "grad_norm": 0.0773528590798378,
+      "learning_rate": 0.000415,
+      "loss": 0.0285,
+      "step": 254
+    },
+    {
+      "epoch": 0.13302034428794993,
+      "grad_norm": 0.2350674420595169,
+      "learning_rate": 0.00041444444444444444,
+      "loss": 0.0511,
+      "step": 255
+    },
+    {
+      "epoch": 0.13354199269692227,
+      "grad_norm": 0.08375384658575058,
+      "learning_rate": 0.0004138888888888889,
+      "loss": 0.0341,
+      "step": 256
+    },
+    {
+      "epoch": 0.13406364110589464,
+      "grad_norm": 0.09229125827550888,
+      "learning_rate": 0.0004133333333333333,
+      "loss": 0.0329,
+      "step": 257
+    },
+    {
+      "epoch": 0.13458528951486698,
+      "grad_norm": 0.08750821650028229,
+      "learning_rate": 0.0004127777777777778,
+      "loss": 0.0283,
+      "step": 258
+    },
+    {
+      "epoch": 0.13510693792383932,
+      "grad_norm": 0.075618676841259,
+      "learning_rate": 0.00041222222222222224,
+      "loss": 0.0291,
+      "step": 259
+    },
+    {
+      "epoch": 0.1356285863328117,
+      "grad_norm": 0.16954250633716583,
+      "learning_rate": 0.0004116666666666667,
+      "loss": 0.0441,
+      "step": 260
+    },
+    {
+      "epoch": 0.13615023474178403,
+      "grad_norm": 0.07529555261135101,
+      "learning_rate": 0.0004111111111111111,
+      "loss": 0.0139,
+      "step": 261
+    },
+    {
+      "epoch": 0.1366718831507564,
+      "grad_norm": 0.08884407579898834,
+      "learning_rate": 0.00041055555555555555,
+      "loss": 0.0299,
+      "step": 262
+    },
+    {
+      "epoch": 0.13719353155972874,
+      "grad_norm": 0.09607396274805069,
+      "learning_rate": 0.00041,
+      "loss": 0.0337,
+      "step": 263
+    },
+    {
+      "epoch": 0.13771517996870108,
+      "grad_norm": 0.08453882485628128,
+      "learning_rate": 0.00040944444444444443,
+      "loss": 0.0315,
+      "step": 264
+    },
+    {
+      "epoch": 0.13823682837767345,
+      "grad_norm": 0.09629228711128235,
+      "learning_rate": 0.0004088888888888889,
+      "loss": 0.0181,
+      "step": 265
+    },
+    {
+      "epoch": 0.1387584767866458,
+      "grad_norm": 0.07212290167808533,
+      "learning_rate": 0.00040833333333333336,
+      "loss": 0.0262,
+      "step": 266
+    },
+    {
+      "epoch": 0.13928012519561817,
+      "grad_norm": 0.09192827343940735,
+      "learning_rate": 0.0004077777777777778,
+      "loss": 0.018,
+      "step": 267
+    },
+    {
+      "epoch": 0.1398017736045905,
+      "grad_norm": 0.10876122117042542,
+      "learning_rate": 0.00040722222222222223,
+      "loss": 0.0316,
+      "step": 268
+    },
+    {
+      "epoch": 0.14032342201356285,
+      "grad_norm": 0.10796765238046646,
+      "learning_rate": 0.00040666666666666667,
+      "loss": 0.0323,
+      "step": 269
+    },
+    {
+      "epoch": 0.14084507042253522,
+      "grad_norm": 0.08297892659902573,
+      "learning_rate": 0.0004061111111111111,
+      "loss": 0.0244,
+      "step": 270
+    },
+    {
+      "epoch": 0.14136671883150756,
+      "grad_norm": 0.09534858912229538,
+      "learning_rate": 0.00040555555555555554,
+      "loss": 0.0279,
+      "step": 271
+    },
+    {
+      "epoch": 0.14188836724047993,
+      "grad_norm": 0.07854770123958588,
+      "learning_rate": 0.00040500000000000003,
+      "loss": 0.032,
+      "step": 272
+    },
+    {
+      "epoch": 0.14241001564945227,
+      "grad_norm": 0.16025401651859283,
+      "learning_rate": 0.00040444444444444447,
+      "loss": 0.0413,
+      "step": 273
+    },
+    {
+      "epoch": 0.1429316640584246,
+      "grad_norm": 0.07919424772262573,
+      "learning_rate": 0.0004038888888888889,
+      "loss": 0.0277,
+      "step": 274
+    },
+    {
+      "epoch": 0.14345331246739698,
+      "grad_norm": 0.07335282117128372,
+      "learning_rate": 0.00040333333333333334,
+      "loss": 0.0363,
+      "step": 275
+    },
+    {
+      "epoch": 0.14397496087636932,
+      "grad_norm": 0.1280767321586609,
+      "learning_rate": 0.0004027777777777778,
+      "loss": 0.0402,
+      "step": 276
+    },
+    {
+      "epoch": 0.1444966092853417,
+      "grad_norm": 0.11371007561683655,
+      "learning_rate": 0.0004022222222222222,
+      "loss": 0.0312,
+      "step": 277
+    },
+    {
+      "epoch": 0.14501825769431403,
+      "grad_norm": 0.12229876220226288,
+      "learning_rate": 0.00040166666666666665,
+      "loss": 0.0357,
+      "step": 278
+    },
+    {
+      "epoch": 0.14553990610328638,
+      "grad_norm": 0.11436333507299423,
+      "learning_rate": 0.0004011111111111111,
+      "loss": 0.0135,
+      "step": 279
+    },
+    {
+      "epoch": 0.14606155451225875,
+      "grad_norm": 0.08084696531295776,
+      "learning_rate": 0.0004005555555555556,
+      "loss": 0.0302,
+      "step": 280
+    },
+    {
+      "epoch": 0.1465832029212311,
+      "grad_norm": 0.09421739727258682,
+      "learning_rate": 0.0004,
+      "loss": 0.0376,
+      "step": 281
+    },
+    {
+      "epoch": 0.14710485133020346,
+      "grad_norm": 0.0744849219918251,
+      "learning_rate": 0.00039944444444444446,
+      "loss": 0.0291,
+      "step": 282
+    },
+    {
+      "epoch": 0.1476264997391758,
+      "grad_norm": 0.06754301488399506,
+      "learning_rate": 0.0003988888888888889,
+      "loss": 0.0262,
+      "step": 283
+    },
+    {
+      "epoch": 0.14814814814814814,
+      "grad_norm": 0.06215747445821762,
+      "learning_rate": 0.00039833333333333333,
+      "loss": 0.0223,
+      "step": 284
+    },
+    {
+      "epoch": 0.1486697965571205,
+      "grad_norm": 0.10289556533098221,
+      "learning_rate": 0.00039777777777777777,
+      "loss": 0.0401,
+      "step": 285
+    },
+    {
+      "epoch": 0.14919144496609285,
+      "grad_norm": 0.10723885893821716,
+      "learning_rate": 0.0003972222222222222,
+      "loss": 0.018,
+      "step": 286
+    },
+    {
+      "epoch": 0.1497130933750652,
+      "grad_norm": 0.12683100998401642,
+      "learning_rate": 0.0003966666666666667,
+      "loss": 0.0155,
+      "step": 287
+    },
+    {
+      "epoch": 0.15023474178403756,
+      "grad_norm": 0.10709403455257416,
+      "learning_rate": 0.00039611111111111113,
+      "loss": 0.0186,
+      "step": 288
+    },
+    {
+      "epoch": 0.1507563901930099,
+      "grad_norm": 0.09857751429080963,
+      "learning_rate": 0.00039555555555555557,
+      "loss": 0.0311,
+      "step": 289
+    },
+    {
+      "epoch": 0.15127803860198227,
+      "grad_norm": 0.07990946620702744,
+      "learning_rate": 0.000395,
+      "loss": 0.032,
+      "step": 290
+    },
+    {
+      "epoch": 0.15179968701095461,
+      "grad_norm": 0.06873098760843277,
+      "learning_rate": 0.00039444444444444444,
+      "loss": 0.0163,
+      "step": 291
+    },
+    {
+      "epoch": 0.15232133541992696,
+      "grad_norm": 0.0788077712059021,
+      "learning_rate": 0.00039388888888888893,
+      "loss": 0.0319,
+      "step": 292
+    },
+    {
+      "epoch": 0.15284298382889933,
+      "grad_norm": 0.08789033442735672,
+      "learning_rate": 0.0003933333333333333,
+      "loss": 0.0352,
+      "step": 293
+    },
+    {
+      "epoch": 0.15336463223787167,
+      "grad_norm": 0.10574653744697571,
+      "learning_rate": 0.0003927777777777778,
+      "loss": 0.0411,
+      "step": 294
+    },
+    {
+      "epoch": 0.15388628064684404,
+      "grad_norm": 0.08198726177215576,
+      "learning_rate": 0.00039222222222222225,
+      "loss": 0.0286,
+      "step": 295
+    },
+    {
+      "epoch": 0.15440792905581638,
+      "grad_norm": 0.2811417579650879,
+      "learning_rate": 0.0003916666666666667,
+      "loss": 0.0508,
+      "step": 296
+    },
+    {
+      "epoch": 0.15492957746478872,
+      "grad_norm": 0.1203279122710228,
+      "learning_rate": 0.0003911111111111111,
+      "loss": 0.0384,
+      "step": 297
+    },
+    {
+      "epoch": 0.1554512258737611,
+      "grad_norm": 0.08802422881126404,
+      "learning_rate": 0.00039055555555555556,
+      "loss": 0.0305,
+      "step": 298
+    },
+    {
+      "epoch": 0.15597287428273343,
+      "grad_norm": 0.05368930101394653,
+      "learning_rate": 0.00039000000000000005,
+      "loss": 0.0167,
+      "step": 299
+    },
+    {
+      "epoch": 0.1564945226917058,
+      "grad_norm": 0.16041633486747742,
+      "learning_rate": 0.00038944444444444443,
+      "loss": 0.047,
+      "step": 300
+    },
+    {
+      "epoch": 0.15701617110067814,
+      "grad_norm": 0.06771723926067352,
+      "learning_rate": 0.0003888888888888889,
+      "loss": 0.0242,
+      "step": 301
+    },
+    {
+      "epoch": 0.15753781950965048,
+      "grad_norm": 0.09745685011148453,
+      "learning_rate": 0.0003883333333333333,
+      "loss": 0.0121,
+      "step": 302
+    },
+    {
+      "epoch": 0.15805946791862285,
+      "grad_norm": 0.1079089567065239,
+      "learning_rate": 0.0003877777777777778,
+      "loss": 0.0331,
+      "step": 303
+    },
+    {
+      "epoch": 0.1585811163275952,
+      "grad_norm": 0.07800073176622391,
+      "learning_rate": 0.00038722222222222223,
+      "loss": 0.0325,
+      "step": 304
+    },
+    {
+      "epoch": 0.15910276473656756,
+      "grad_norm": 0.13546329736709595,
+      "learning_rate": 0.00038666666666666667,
+      "loss": 0.0296,
+      "step": 305
+    },
+    {
+      "epoch": 0.1596244131455399,
+      "grad_norm": 0.0735045000910759,
+      "learning_rate": 0.00038611111111111116,
+      "loss": 0.0275,
+      "step": 306
+    },
+    {
+      "epoch": 0.16014606155451225,
+      "grad_norm": 0.056763097643852234,
+      "learning_rate": 0.00038555555555555554,
+      "loss": 0.025,
+      "step": 307
+    },
+    {
+      "epoch": 0.16066770996348462,
+      "grad_norm": 0.0723307803273201,
+      "learning_rate": 0.00038500000000000003,
+      "loss": 0.0269,
+      "step": 308
+    },
+    {
+      "epoch": 0.16118935837245696,
+      "grad_norm": 0.07295756787061691,
+      "learning_rate": 0.0003844444444444444,
+      "loss": 0.0224,
+      "step": 309
+    },
+    {
+      "epoch": 0.16171100678142933,
+      "grad_norm": 0.1010420173406601,
+      "learning_rate": 0.0003838888888888889,
+      "loss": 0.0154,
+      "step": 310
+    },
+    {
+      "epoch": 0.16223265519040167,
+      "grad_norm": 0.10790162533521652,
+      "learning_rate": 0.00038333333333333334,
+      "loss": 0.0334,
+      "step": 311
+    },
+    {
+      "epoch": 0.162754303599374,
+      "grad_norm": 0.06171411648392677,
+      "learning_rate": 0.0003827777777777778,
+      "loss": 0.026,
+      "step": 312
+    },
+    {
+      "epoch": 0.16327595200834638,
+      "grad_norm": 0.0646505281329155,
+      "learning_rate": 0.0003822222222222223,
+      "loss": 0.0283,
+      "step": 313
+    },
+    {
+      "epoch": 0.16379760041731872,
+      "grad_norm": 0.1241549476981163,
+      "learning_rate": 0.00038166666666666666,
+      "loss": 0.0433,
+      "step": 314
+    },
+    {
+      "epoch": 0.1643192488262911,
+      "grad_norm": 0.08475686609745026,
+      "learning_rate": 0.00038111111111111115,
+      "loss": 0.0264,
+      "step": 315
+    },
+    {
+      "epoch": 0.16484089723526343,
+      "grad_norm": 0.1006927415728569,
+      "learning_rate": 0.00038055555555555553,
+      "loss": 0.0373,
+      "step": 316
+    },
+    {
+      "epoch": 0.16536254564423578,
+      "grad_norm": 0.08395830541849136,
+      "learning_rate": 0.00038,
+      "loss": 0.0151,
+      "step": 317
+    },
+    {
+      "epoch": 0.16588419405320814,
+      "grad_norm": 0.05780460685491562,
+      "learning_rate": 0.0003794444444444444,
+      "loss": 0.018,
+      "step": 318
+    },
+    {
+      "epoch": 0.1664058424621805,
+      "grad_norm": 0.08385057002305984,
+      "learning_rate": 0.0003788888888888889,
+      "loss": 0.0347,
+      "step": 319
+    },
+    {
+      "epoch": 0.16692749087115286,
+      "grad_norm": 0.0629425197839737,
+      "learning_rate": 0.0003783333333333334,
+      "loss": 0.0288,
+      "step": 320
+    },
+    {
+      "epoch": 0.1674491392801252,
+      "grad_norm": 0.07353231310844421,
+      "learning_rate": 0.00037777777777777777,
+      "loss": 0.0318,
+      "step": 321
+    },
+    {
+      "epoch": 0.16797078768909754,
+      "grad_norm": 0.06632209569215775,
+      "learning_rate": 0.00037722222222222226,
+      "loss": 0.0286,
+      "step": 322
+    },
+    {
+      "epoch": 0.1684924360980699,
+      "grad_norm": 0.10224422067403793,
+      "learning_rate": 0.00037666666666666664,
+      "loss": 0.0287,
+      "step": 323
+    },
+    {
+      "epoch": 0.16901408450704225,
+      "grad_norm": 0.07615455985069275,
+      "learning_rate": 0.00037611111111111113,
+      "loss": 0.0245,
+      "step": 324
+    },
+    {
+      "epoch": 0.16953573291601462,
+      "grad_norm": 0.08341842144727707,
+      "learning_rate": 0.0003755555555555555,
+      "loss": 0.0272,
+      "step": 325
+    },
+    {
+      "epoch": 0.17005738132498696,
+      "grad_norm": 0.06340507417917252,
+      "learning_rate": 0.000375,
+      "loss": 0.0251,
+      "step": 326
+    },
+    {
+      "epoch": 0.1705790297339593,
+      "grad_norm": 0.05245117098093033,
+      "learning_rate": 0.0003744444444444445,
+      "loss": 0.016,
+      "step": 327
+    },
+    {
+      "epoch": 0.17110067814293167,
+      "grad_norm": 0.07821597903966904,
+      "learning_rate": 0.0003738888888888889,
+      "loss": 0.0166,
+      "step": 328
+    },
+    {
+      "epoch": 0.17162232655190401,
+      "grad_norm": 0.05091237649321556,
+      "learning_rate": 0.0003733333333333334,
+      "loss": 0.0169,
+      "step": 329
+    },
+    {
+      "epoch": 0.17214397496087636,
+      "grad_norm": 0.11584059149026871,
+      "learning_rate": 0.00037277777777777776,
+      "loss": 0.0424,
+      "step": 330
+    },
+    {
+      "epoch": 0.17266562336984873,
+      "grad_norm": 0.08996029943227768,
+      "learning_rate": 0.00037222222222222225,
+      "loss": 0.0287,
+      "step": 331
+    },
+    {
+      "epoch": 0.17318727177882107,
+      "grad_norm": 0.06258998066186905,
+      "learning_rate": 0.00037166666666666663,
+      "loss": 0.0243,
+      "step": 332
+    },
+    {
+      "epoch": 0.17370892018779344,
+      "grad_norm": 0.06734970957040787,
+      "learning_rate": 0.0003711111111111111,
+      "loss": 0.0294,
+      "step": 333
+    },
+    {
+      "epoch": 0.17423056859676578,
+      "grad_norm": 0.06081216409802437,
+      "learning_rate": 0.0003705555555555556,
+      "loss": 0.0262,
+      "step": 334
+    },
+    {
+      "epoch": 0.17475221700573812,
+      "grad_norm": 0.06397537142038345,
+      "learning_rate": 0.00037,
+      "loss": 0.0161,
+      "step": 335
+    },
+    {
+      "epoch": 0.1752738654147105,
+      "grad_norm": 0.07987434417009354,
+      "learning_rate": 0.0003694444444444445,
+      "loss": 0.0301,
+      "step": 336
+    },
+    {
+      "epoch": 0.17579551382368283,
+      "grad_norm": 0.09395250678062439,
+      "learning_rate": 0.00036888888888888887,
+      "loss": 0.0289,
+      "step": 337
+    },
+    {
+      "epoch": 0.1763171622326552,
+      "grad_norm": 0.05801301822066307,
+      "learning_rate": 0.00036833333333333336,
+      "loss": 0.0326,
+      "step": 338
+    },
+    {
+      "epoch": 0.17683881064162754,
+      "grad_norm": 0.06285756826400757,
+      "learning_rate": 0.00036777777777777774,
+      "loss": 0.0235,
+      "step": 339
+    },
+    {
+      "epoch": 0.17736045905059988,
+      "grad_norm": 0.06429009139537811,
+      "learning_rate": 0.00036722222222222223,
+      "loss": 0.0136,
+      "step": 340
+    },
+    {
+      "epoch": 0.17788210745957225,
+      "grad_norm": 0.05570930242538452,
+      "learning_rate": 0.00036666666666666667,
+      "loss": 0.0205,
+      "step": 341
+    },
+    {
+      "epoch": 0.1784037558685446,
+      "grad_norm": 0.061478108167648315,
+      "learning_rate": 0.0003661111111111111,
+      "loss": 0.026,
+      "step": 342
+    },
+    {
+      "epoch": 0.17892540427751696,
+      "grad_norm": 0.07520420104265213,
+      "learning_rate": 0.0003655555555555556,
+      "loss": 0.0265,
+      "step": 343
+    },
+    {
+      "epoch": 0.1794470526864893,
+      "grad_norm": 0.047426123172044754,
+      "learning_rate": 0.000365,
+      "loss": 0.0144,
+      "step": 344
+    },
+    {
+      "epoch": 0.17996870109546165,
+      "grad_norm": 0.09971431642770767,
+      "learning_rate": 0.00036444444444444447,
+      "loss": 0.0359,
+      "step": 345
+    },
+    {
+      "epoch": 0.18049034950443402,
+      "grad_norm": 0.0507560633122921,
+      "learning_rate": 0.00036388888888888886,
+      "loss": 0.0203,
+      "step": 346
+    },
+    {
+      "epoch": 0.18101199791340636,
+      "grad_norm": 0.09610850363969803,
+      "learning_rate": 0.00036333333333333335,
+      "loss": 0.0352,
+      "step": 347
+    },
+    {
+      "epoch": 0.18153364632237873,
+      "grad_norm": 0.04846423119306564,
+      "learning_rate": 0.0003627777777777778,
+      "loss": 0.0162,
+      "step": 348
+    },
+    {
+      "epoch": 0.18205529473135107,
+      "grad_norm": 0.15771976113319397,
+      "learning_rate": 0.0003622222222222222,
+      "loss": 0.0323,
+      "step": 349
+    },
+    {
+      "epoch": 0.1825769431403234,
+      "grad_norm": 0.07306705415248871,
+      "learning_rate": 0.0003616666666666667,
+      "loss": 0.0217,
+      "step": 350
+    },
+    {
+      "epoch": 0.18309859154929578,
+      "grad_norm": 0.05630479007959366,
+      "learning_rate": 0.0003611111111111111,
+      "loss": 0.0149,
+      "step": 351
+    },
+    {
+      "epoch": 0.18362023995826812,
+      "grad_norm": 0.08934023231267929,
+      "learning_rate": 0.0003605555555555556,
+      "loss": 0.0113,
+      "step": 352
+    },
+    {
+      "epoch": 0.1841418883672405,
+      "grad_norm": 0.1724640429019928,
+      "learning_rate": 0.00035999999999999997,
+      "loss": 0.0435,
+      "step": 353
+    },
+    {
+      "epoch": 0.18466353677621283,
+      "grad_norm": 0.06963273137807846,
+      "learning_rate": 0.00035944444444444446,
+      "loss": 0.0307,
+      "step": 354
+    },
+    {
+      "epoch": 0.18518518518518517,
+      "grad_norm": 0.06084301322698593,
+      "learning_rate": 0.0003588888888888889,
+      "loss": 0.0306,
+      "step": 355
+    },
+    {
+      "epoch": 0.18570683359415754,
+      "grad_norm": 0.07648872584104538,
+      "learning_rate": 0.00035833333333333333,
+      "loss": 0.0271,
+      "step": 356
+    },
+    {
+      "epoch": 0.18622848200312989,
+      "grad_norm": 0.07619331032037735,
+      "learning_rate": 0.00035777777777777777,
+      "loss": 0.0171,
+      "step": 357
+    },
+    {
+      "epoch": 0.18675013041210226,
+      "grad_norm": 0.08520349115133286,
+      "learning_rate": 0.0003572222222222222,
+      "loss": 0.0284,
+      "step": 358
+    },
+    {
+      "epoch": 0.1872717788210746,
+      "grad_norm": 0.057310912758111954,
+      "learning_rate": 0.0003566666666666667,
+      "loss": 0.0223,
+      "step": 359
+    },
+    {
+      "epoch": 0.18779342723004694,
+      "grad_norm": 0.08222941309213638,
+      "learning_rate": 0.0003561111111111111,
+      "loss": 0.0296,
+      "step": 360
+    },
+    {
+      "epoch": 0.1883150756390193,
+      "grad_norm": 0.08427579700946808,
+      "learning_rate": 0.00035555555555555557,
+      "loss": 0.031,
+      "step": 361
+    },
+    {
+      "epoch": 0.18883672404799165,
+      "grad_norm": 0.06189948692917824,
+      "learning_rate": 0.000355,
+      "loss": 0.0273,
+      "step": 362
+    },
+    {
+      "epoch": 0.18935837245696402,
+      "grad_norm": 0.07053495943546295,
+      "learning_rate": 0.00035444444444444445,
+      "loss": 0.0197,
+      "step": 363
+    },
+    {
+      "epoch": 0.18988002086593636,
+      "grad_norm": 0.0913248062133789,
+      "learning_rate": 0.0003538888888888889,
+      "loss": 0.0336,
+      "step": 364
+    },
+    {
+      "epoch": 0.1904016692749087,
+      "grad_norm": 0.062306199222803116,
+      "learning_rate": 0.0003533333333333333,
+      "loss": 0.0291,
+      "step": 365
+    },
+    {
+      "epoch": 0.19092331768388107,
+      "grad_norm": 0.09297792613506317,
+      "learning_rate": 0.0003527777777777778,
+      "loss": 0.0338,
+      "step": 366
+    },
+    {
+      "epoch": 0.1914449660928534,
+      "grad_norm": 0.12690134346485138,
+      "learning_rate": 0.00035222222222222225,
+      "loss": 0.0399,
+      "step": 367
+    },
+    {
+      "epoch": 0.19196661450182578,
+      "grad_norm": 0.07451540231704712,
+      "learning_rate": 0.0003516666666666667,
+      "loss": 0.0364,
+      "step": 368
+    },
+    {
+      "epoch": 0.19248826291079812,
+      "grad_norm": 0.09954366087913513,
+      "learning_rate": 0.0003511111111111111,
+      "loss": 0.036,
+      "step": 369
+    },
+    {
+      "epoch": 0.19300991131977047,
+      "grad_norm": 0.07105272263288498,
+      "learning_rate": 0.00035055555555555556,
+      "loss": 0.0226,
+      "step": 370
+    },
+    {
+      "epoch": 0.19353155972874284,
+      "grad_norm": 0.06857888400554657,
+      "learning_rate": 0.00035,
+      "loss": 0.0305,
+      "step": 371
+    },
+    {
+      "epoch": 0.19405320813771518,
+      "grad_norm": 0.060487356036901474,
+      "learning_rate": 0.00034944444444444443,
+      "loss": 0.0279,
+      "step": 372
+    },
+    {
+      "epoch": 0.19457485654668752,
+      "grad_norm": 0.07935786992311478,
+      "learning_rate": 0.0003488888888888889,
+      "loss": 0.0377,
+      "step": 373
+    },
+    {
+      "epoch": 0.1950965049556599,
+      "grad_norm": 0.10610669106245041,
+      "learning_rate": 0.00034833333333333336,
+      "loss": 0.0198,
+      "step": 374
+    },
+    {
+      "epoch": 0.19561815336463223,
+      "grad_norm": 0.06738949567079544,
+      "learning_rate": 0.0003477777777777778,
+      "loss": 0.0321,
+      "step": 375
+    },
+    {
+      "epoch": 0.1961398017736046,
+      "grad_norm": 0.09995345771312714,
+      "learning_rate": 0.00034722222222222224,
+      "loss": 0.0168,
+      "step": 376
+    },
+    {
+      "epoch": 0.19666145018257694,
+      "grad_norm": 0.07820367068052292,
+      "learning_rate": 0.00034666666666666667,
+      "loss": 0.0192,
+      "step": 377
+    },
+    {
+      "epoch": 0.19718309859154928,
+      "grad_norm": 0.05883244797587395,
+      "learning_rate": 0.0003461111111111111,
+      "loss": 0.0217,
+      "step": 378
+    },
+    {
+      "epoch": 0.19770474700052165,
+      "grad_norm": 0.06929990649223328,
+      "learning_rate": 0.00034555555555555555,
+      "loss": 0.0248,
+      "step": 379
+    },
+    {
+      "epoch": 0.198226395409494,
+      "grad_norm": 0.060583919286727905,
+      "learning_rate": 0.000345,
+      "loss": 0.0204,
+      "step": 380
+    },
+    {
+      "epoch": 0.19874804381846636,
+      "grad_norm": 0.08263508230447769,
+      "learning_rate": 0.0003444444444444445,
+      "loss": 0.0261,
+      "step": 381
+    },
+    {
+      "epoch": 0.1992696922274387,
+      "grad_norm": 0.07354709506034851,
+      "learning_rate": 0.0003438888888888889,
+      "loss": 0.0328,
+      "step": 382
+    },
+    {
+      "epoch": 0.19979134063641105,
+      "grad_norm": 0.09151386469602585,
+      "learning_rate": 0.00034333333333333335,
+      "loss": 0.0315,
+      "step": 383
+    },
+    {
+      "epoch": 0.20031298904538342,
+      "grad_norm": 0.06800325959920883,
+      "learning_rate": 0.0003427777777777778,
+      "loss": 0.0265,
+      "step": 384
+    },
+    {
+      "epoch": 0.20083463745435576,
+      "grad_norm": 0.05672604963183403,
+      "learning_rate": 0.0003422222222222222,
+      "loss": 0.0216,
+      "step": 385
+    },
+    {
+      "epoch": 0.20135628586332813,
+      "grad_norm": 0.07447244226932526,
+      "learning_rate": 0.00034166666666666666,
+      "loss": 0.0253,
+      "step": 386
+    },
+    {
+      "epoch": 0.20187793427230047,
+      "grad_norm": 0.051845699548721313,
+      "learning_rate": 0.0003411111111111111,
+      "loss": 0.0291,
+      "step": 387
+    },
+    {
+      "epoch": 0.2023995826812728,
+      "grad_norm": 0.04311797395348549,
+      "learning_rate": 0.0003405555555555556,
+      "loss": 0.0189,
+      "step": 388
+    },
+    {
+      "epoch": 0.20292123109024518,
+      "grad_norm": 0.08901547640562057,
+      "learning_rate": 0.00034,
+      "loss": 0.0313,
+      "step": 389
+    },
+    {
+      "epoch": 0.20344287949921752,
+      "grad_norm": 0.05055601894855499,
+      "learning_rate": 0.00033944444444444446,
+      "loss": 0.0231,
+      "step": 390
+    },
+    {
+      "epoch": 0.2039645279081899,
+      "grad_norm": 0.04943820461630821,
+      "learning_rate": 0.0003388888888888889,
+      "loss": 0.021,
+      "step": 391
+    },
+    {
+      "epoch": 0.20448617631716223,
+      "grad_norm": 0.0558842197060585,
+      "learning_rate": 0.00033833333333333334,
+      "loss": 0.0251,
+      "step": 392
+    },
+    {
+      "epoch": 0.20500782472613457,
+      "grad_norm": 0.06570509821176529,
+      "learning_rate": 0.00033777777777777777,
+      "loss": 0.012,
+      "step": 393
+    },
+    {
+      "epoch": 0.20552947313510694,
+      "grad_norm": 0.13640566170215607,
+      "learning_rate": 0.0003372222222222222,
+      "loss": 0.0396,
+      "step": 394
+    },
+    {
+      "epoch": 0.20605112154407929,
+      "grad_norm": 0.05271435156464577,
+      "learning_rate": 0.0003366666666666667,
+      "loss": 0.028,
+      "step": 395
+    },
+    {
+      "epoch": 0.20657276995305165,
+      "grad_norm": 0.04778929427266121,
+      "learning_rate": 0.00033611111111111114,
+      "loss": 0.0126,
+      "step": 396
+    },
+    {
+      "epoch": 0.207094418362024,
+      "grad_norm": 0.04178643599152565,
+      "learning_rate": 0.0003355555555555556,
+      "loss": 0.0148,
+      "step": 397
+    },
+    {
+      "epoch": 0.20761606677099634,
+      "grad_norm": 0.05933418869972229,
+      "learning_rate": 0.000335,
+      "loss": 0.0218,
+      "step": 398
+    },
+    {
+      "epoch": 0.2081377151799687,
+      "grad_norm": 0.05561219900846481,
+      "learning_rate": 0.00033444444444444445,
+      "loss": 0.0167,
+      "step": 399
+    },
+    {
+      "epoch": 0.20865936358894105,
+      "grad_norm": 0.0431622713804245,
+      "learning_rate": 0.0003338888888888889,
+      "loss": 0.0263,
+      "step": 400
+    },
+    {
+      "epoch": 0.20918101199791342,
+      "grad_norm": 0.06121833994984627,
+      "learning_rate": 0.0003333333333333333,
+      "loss": 0.0224,
+      "step": 401
+    },
+    {
+      "epoch": 0.20970266040688576,
+      "grad_norm": 0.0923122763633728,
+      "learning_rate": 0.0003327777777777778,
+      "loss": 0.0264,
+      "step": 402
+    },
+    {
+      "epoch": 0.2102243088158581,
+      "grad_norm": 0.045166369527578354,
+      "learning_rate": 0.0003322222222222222,
+      "loss": 0.0135,
+      "step": 403
+    },
+    {
+      "epoch": 0.21074595722483047,
+      "grad_norm": 0.08227386325597763,
+      "learning_rate": 0.0003316666666666667,
+      "loss": 0.0253,
+      "step": 404
+    },
+    {
+      "epoch": 0.2112676056338028,
+      "grad_norm": 0.0516192689538002,
+      "learning_rate": 0.0003311111111111111,
+      "loss": 0.0278,
+      "step": 405
+    },
+    {
+      "epoch": 0.21178925404277518,
+      "grad_norm": 0.11895351111888885,
+      "learning_rate": 0.00033055555555555556,
+      "loss": 0.0333,
+      "step": 406
+    },
+    {
+      "epoch": 0.21231090245174752,
+      "grad_norm": 0.062691330909729,
+      "learning_rate": 0.00033,
+      "loss": 0.0278,
+      "step": 407
+    },
+    {
+      "epoch": 0.21283255086071987,
+      "grad_norm": 0.04849197715520859,
+      "learning_rate": 0.00032944444444444444,
+      "loss": 0.0274,
+      "step": 408
+    },
+    {
+      "epoch": 0.21335419926969224,
+      "grad_norm": 0.12979350984096527,
+      "learning_rate": 0.0003288888888888889,
+      "loss": 0.0311,
+      "step": 409
+    },
+    {
+      "epoch": 0.21387584767866458,
+      "grad_norm": 0.09985975176095963,
+      "learning_rate": 0.0003283333333333333,
+      "loss": 0.0343,
+      "step": 410
+    },
+    {
+      "epoch": 0.21439749608763695,
+      "grad_norm": 0.05388106778264046,
+      "learning_rate": 0.0003277777777777778,
+      "loss": 0.0119,
+      "step": 411
+    },
+    {
+      "epoch": 0.2149191444966093,
+      "grad_norm": 0.061424896121025085,
+      "learning_rate": 0.00032722222222222224,
+      "loss": 0.0258,
+      "step": 412
+    },
+    {
+      "epoch": 0.21544079290558163,
+      "grad_norm": 0.05304243043065071,
+      "learning_rate": 0.0003266666666666667,
+      "loss": 0.0252,
+      "step": 413
+    },
+    {
+      "epoch": 0.215962441314554,
+      "grad_norm": 0.06124117597937584,
+      "learning_rate": 0.0003261111111111111,
+      "loss": 0.0265,
+      "step": 414
+    },
+    {
+      "epoch": 0.21648408972352634,
+      "grad_norm": 0.0462118461728096,
+      "learning_rate": 0.00032555555555555555,
+      "loss": 0.018,
+      "step": 415
+    },
+    {
+      "epoch": 0.21700573813249868,
+      "grad_norm": 0.05339544638991356,
+      "learning_rate": 0.00032500000000000004,
+      "loss": 0.015,
+      "step": 416
+    },
+    {
+      "epoch": 0.21752738654147105,
+      "grad_norm": 0.04562271013855934,
+      "learning_rate": 0.0003244444444444444,
+      "loss": 0.0222,
+      "step": 417
+    },
+    {
+      "epoch": 0.2180490349504434,
+      "grad_norm": 0.06268814951181412,
+      "learning_rate": 0.0003238888888888889,
+      "loss": 0.017,
+      "step": 418
+    },
+    {
+      "epoch": 0.21857068335941576,
+      "grad_norm": 0.07897916436195374,
+      "learning_rate": 0.0003233333333333333,
+      "loss": 0.0362,
+      "step": 419
+    },
+    {
+      "epoch": 0.2190923317683881,
+      "grad_norm": 0.0633966326713562,
+      "learning_rate": 0.0003227777777777778,
+      "loss": 0.016,
+      "step": 420
+    },
+    {
+      "epoch": 0.21961398017736045,
+      "grad_norm": 0.06430386751890182,
+      "learning_rate": 0.0003222222222222222,
+      "loss": 0.029,
+      "step": 421
+    },
+    {
+      "epoch": 0.22013562858633282,
+      "grad_norm": 0.07986673712730408,
+      "learning_rate": 0.00032166666666666666,
+      "loss": 0.0339,
+      "step": 422
+    },
+    {
+      "epoch": 0.22065727699530516,
+      "grad_norm": 0.05497688055038452,
+      "learning_rate": 0.00032111111111111115,
+      "loss": 0.0135,
+      "step": 423
+    },
+    {
+      "epoch": 0.22117892540427753,
+      "grad_norm": 0.07314179837703705,
+      "learning_rate": 0.00032055555555555554,
+      "loss": 0.0274,
+      "step": 424
+    },
+    {
+      "epoch": 0.22170057381324987,
+      "grad_norm": 0.045945893973112106,
+      "learning_rate": 0.00032,
+      "loss": 0.0239,
+      "step": 425
+    },
+    {
+      "epoch": 0.2222222222222222,
+      "grad_norm": 0.11235559731721878,
+      "learning_rate": 0.0003194444444444444,
+      "loss": 0.0302,
+      "step": 426
+    },
+    {
+      "epoch": 0.22274387063119458,
+      "grad_norm": 0.06201227381825447,
+      "learning_rate": 0.0003188888888888889,
+      "loss": 0.0319,
+      "step": 427
+    },
+    {
+      "epoch": 0.22326551904016692,
+      "grad_norm": 0.05009296536445618,
+      "learning_rate": 0.00031833333333333334,
+      "loss": 0.0175,
+      "step": 428
+    },
+    {
+      "epoch": 0.2237871674491393,
+      "grad_norm": 0.06590239703655243,
+      "learning_rate": 0.0003177777777777778,
+      "loss": 0.0275,
+      "step": 429
+    },
+    {
+      "epoch": 0.22430881585811163,
+      "grad_norm": 0.06859228760004044,
+      "learning_rate": 0.00031722222222222227,
+      "loss": 0.0282,
+      "step": 430
+    },
+    {
+      "epoch": 0.22483046426708397,
+      "grad_norm": 0.04625241830945015,
+      "learning_rate": 0.00031666666666666665,
+      "loss": 0.025,
+      "step": 431
+    },
+    {
+      "epoch": 0.22535211267605634,
+      "grad_norm": 0.06035872921347618,
+      "learning_rate": 0.00031611111111111114,
+      "loss": 0.0301,
+      "step": 432
+    },
+    {
+      "epoch": 0.22587376108502868,
+      "grad_norm": 0.09716615080833435,
+      "learning_rate": 0.0003155555555555555,
+      "loss": 0.0135,
+      "step": 433
+    },
+    {
+      "epoch": 0.22639540949400105,
+      "grad_norm": 0.060461174696683884,
+      "learning_rate": 0.000315,
+      "loss": 0.0237,
+      "step": 434
+    },
+    {
+      "epoch": 0.2269170579029734,
+      "grad_norm": 0.0631205216050148,
+      "learning_rate": 0.0003144444444444445,
+      "loss": 0.0136,
+      "step": 435
+    },
+    {
+      "epoch": 0.22743870631194574,
+      "grad_norm": 0.044443339109420776,
+      "learning_rate": 0.0003138888888888889,
+      "loss": 0.0237,
+      "step": 436
+    },
+    {
+      "epoch": 0.2279603547209181,
+      "grad_norm": 0.04617779329419136,
+      "learning_rate": 0.0003133333333333334,
+      "loss": 0.0246,
+      "step": 437
+    },
+    {
+      "epoch": 0.22848200312989045,
+      "grad_norm": 0.040022511035203934,
+      "learning_rate": 0.00031277777777777776,
+      "loss": 0.0291,
+      "step": 438
+    },
+    {
+      "epoch": 0.22900365153886282,
+      "grad_norm": 0.10207943618297577,
+      "learning_rate": 0.00031222222222222225,
+      "loss": 0.0289,
+      "step": 439
+    },
+    {
+      "epoch": 0.22952529994783516,
+      "grad_norm": 0.052271150052547455,
+      "learning_rate": 0.00031166666666666663,
+      "loss": 0.0275,
+      "step": 440
+    },
+    {
+      "epoch": 0.2300469483568075,
+      "grad_norm": 0.05472894012928009,
+      "learning_rate": 0.0003111111111111111,
+      "loss": 0.0237,
+      "step": 441
+    },
+    {
+      "epoch": 0.23056859676577987,
+      "grad_norm": 0.051506489515304565,
+      "learning_rate": 0.0003105555555555555,
+      "loss": 0.0267,
+      "step": 442
+    },
+    {
+      "epoch": 0.2310902451747522,
+      "grad_norm": 0.12625692784786224,
+      "learning_rate": 0.00031,
+      "loss": 0.0295,
+      "step": 443
+    },
+    {
+      "epoch": 0.23161189358372458,
+      "grad_norm": 0.052463430911302567,
+      "learning_rate": 0.0003094444444444445,
+      "loss": 0.0222,
+      "step": 444
+    },
+    {
+      "epoch": 0.23213354199269692,
+      "grad_norm": 0.07140383124351501,
+      "learning_rate": 0.0003088888888888889,
+      "loss": 0.0274,
+      "step": 445
+    },
+    {
+      "epoch": 0.23265519040166927,
+      "grad_norm": 0.04128264635801315,
+      "learning_rate": 0.00030833333333333337,
+      "loss": 0.0195,
+      "step": 446
+    },
+    {
+      "epoch": 0.23317683881064163,
+      "grad_norm": 0.09302657097578049,
+      "learning_rate": 0.00030777777777777775,
+      "loss": 0.0372,
+      "step": 447
+    },
+    {
+      "epoch": 0.23369848721961398,
+      "grad_norm": 0.07428940385580063,
+      "learning_rate": 0.00030722222222222224,
+      "loss": 0.0232,
+      "step": 448
+    },
+    {
+      "epoch": 0.23422013562858635,
+      "grad_norm": 0.08673116564750671,
+      "learning_rate": 0.0003066666666666667,
+      "loss": 0.0292,
+      "step": 449
+    },
+    {
+      "epoch": 0.2347417840375587,
+      "grad_norm": 0.06983821839094162,
+      "learning_rate": 0.0003061111111111111,
+      "loss": 0.0299,
+      "step": 450
+    },
+    {
+      "epoch": 0.23526343244653103,
+      "grad_norm": 0.05890791490674019,
+      "learning_rate": 0.0003055555555555556,
+      "loss": 0.0159,
+      "step": 451
+    },
+    {
+      "epoch": 0.2357850808555034,
+      "grad_norm": 0.05678574740886688,
+      "learning_rate": 0.000305,
+      "loss": 0.0194,
+      "step": 452
+    },
+    {
+      "epoch": 0.23630672926447574,
+      "grad_norm": 0.06532768905162811,
+      "learning_rate": 0.0003044444444444445,
+      "loss": 0.0222,
+      "step": 453
+    },
+    {
+      "epoch": 0.2368283776734481,
+      "grad_norm": 0.04442572221159935,
+      "learning_rate": 0.00030388888888888886,
+      "loss": 0.0204,
+      "step": 454
+    },
+    {
+      "epoch": 0.23735002608242045,
+      "grad_norm": 0.06691122055053711,
+      "learning_rate": 0.00030333333333333335,
+      "loss": 0.0277,
+      "step": 455
+    },
+    {
+      "epoch": 0.2378716744913928,
+      "grad_norm": 0.061397284269332886,
+      "learning_rate": 0.0003027777777777778,
+      "loss": 0.015,
+      "step": 456
+    },
+    {
+      "epoch": 0.23839332290036516,
+      "grad_norm": 0.058687739074230194,
+      "learning_rate": 0.0003022222222222222,
+      "loss": 0.0307,
+      "step": 457
+    },
+    {
+      "epoch": 0.2389149713093375,
+      "grad_norm": 0.07386653125286102,
+      "learning_rate": 0.0003016666666666667,
+      "loss": 0.0291,
+      "step": 458
+    },
+    {
+      "epoch": 0.23943661971830985,
+      "grad_norm": 0.06010840833187103,
+      "learning_rate": 0.0003011111111111111,
+      "loss": 0.02,
+      "step": 459
+    },
+    {
+      "epoch": 0.23995826812728221,
+      "grad_norm": 0.05542018637061119,
+      "learning_rate": 0.0003005555555555556,
+      "loss": 0.0185,
+      "step": 460
+    },
+    {
+      "epoch": 0.24047991653625456,
+      "grad_norm": 0.07411111891269684,
+      "learning_rate": 0.0003,
+      "loss": 0.013,
+      "step": 461
+    },
+    {
+      "epoch": 0.24100156494522693,
+      "grad_norm": 0.05750228837132454,
+      "learning_rate": 0.00029944444444444446,
+      "loss": 0.0157,
+      "step": 462
+    },
+    {
+      "epoch": 0.24152321335419927,
+      "grad_norm": 0.04710470885038376,
+      "learning_rate": 0.0002988888888888889,
+      "loss": 0.0194,
+      "step": 463
+    },
+    {
+      "epoch": 0.2420448617631716,
+      "grad_norm": 0.058672945946455,
+      "learning_rate": 0.00029833333333333334,
+      "loss": 0.0138,
+      "step": 464
+    },
+    {
+      "epoch": 0.24256651017214398,
+      "grad_norm": 0.048859693109989166,
+      "learning_rate": 0.0002977777777777778,
+      "loss": 0.0177,
+      "step": 465
+    },
+    {
+      "epoch": 0.24308815858111632,
+      "grad_norm": 0.0428357794880867,
+      "learning_rate": 0.0002972222222222222,
+      "loss": 0.013,
+      "step": 466
+    },
+    {
+      "epoch": 0.2436098069900887,
+      "grad_norm": 0.053479310125112534,
+      "learning_rate": 0.0002966666666666667,
+      "loss": 0.0238,
+      "step": 467
+    },
+    {
+      "epoch": 0.24413145539906103,
+      "grad_norm": 0.05971763655543327,
+      "learning_rate": 0.0002961111111111111,
+      "loss": 0.0258,
+      "step": 468
+    },
+    {
+      "epoch": 0.24465310380803337,
+      "grad_norm": 0.045527439564466476,
+      "learning_rate": 0.0002955555555555556,
+      "loss": 0.0219,
+      "step": 469
+    },
+    {
+      "epoch": 0.24517475221700574,
+      "grad_norm": 0.04123241826891899,
+      "learning_rate": 0.000295,
+      "loss": 0.0233,
+      "step": 470
+    },
+    {
+      "epoch": 0.24569640062597808,
+      "grad_norm": 0.17581242322921753,
+      "learning_rate": 0.00029444444444444445,
+      "loss": 0.0359,
+      "step": 471
+    },
+    {
+      "epoch": 0.24621804903495045,
+      "grad_norm": 0.09290144592523575,
+      "learning_rate": 0.0002938888888888889,
+      "loss": 0.0256,
+      "step": 472
+    },
+    {
+      "epoch": 0.2467396974439228,
+      "grad_norm": 0.03519435226917267,
+      "learning_rate": 0.0002933333333333333,
+      "loss": 0.0198,
+      "step": 473
+    },
+    {
+      "epoch": 0.24726134585289514,
+      "grad_norm": 0.04338726028800011,
+      "learning_rate": 0.0002927777777777778,
+      "loss": 0.0118,
+      "step": 474
+    },
+    {
+      "epoch": 0.2477829942618675,
+      "grad_norm": 0.1172214224934578,
+      "learning_rate": 0.0002922222222222222,
+      "loss": 0.0324,
+      "step": 475
+    },
+    {
+      "epoch": 0.24830464267083985,
+      "grad_norm": 0.06438913941383362,
+      "learning_rate": 0.0002916666666666667,
+      "loss": 0.0241,
+      "step": 476
+    },
+    {
+      "epoch": 0.24882629107981222,
+      "grad_norm": 0.14249597489833832,
+      "learning_rate": 0.00029111111111111113,
+      "loss": 0.0356,
+      "step": 477
+    },
+    {
+      "epoch": 0.24934793948878456,
+      "grad_norm": 0.045848701149225235,
+      "learning_rate": 0.00029055555555555556,
+      "loss": 0.0155,
+      "step": 478
+    },
+    {
+      "epoch": 0.2498695878977569,
+      "grad_norm": 0.05208711698651314,
+      "learning_rate": 0.00029,
+      "loss": 0.0243,
+      "step": 479
+    },
+    {
+      "epoch": 0.25039123630672927,
+      "grad_norm": 0.05537666380405426,
+      "learning_rate": 0.00028944444444444444,
+      "loss": 0.0308,
+      "step": 480
+    },
+    {
+      "epoch": 0.2509128847157016,
+      "grad_norm": 0.049218740314245224,
+      "learning_rate": 0.0002888888888888889,
+      "loss": 0.0149,
+      "step": 481
+    },
+    {
+      "epoch": 0.25143453312467395,
+      "grad_norm": 0.050829824060201645,
+      "learning_rate": 0.0002883333333333333,
+      "loss": 0.0252,
+      "step": 482
+    },
+    {
+      "epoch": 0.2519561815336463,
+      "grad_norm": 0.047700315713882446,
+      "learning_rate": 0.0002877777777777778,
+      "loss": 0.024,
+      "step": 483
+    },
+    {
+      "epoch": 0.2524778299426187,
+      "grad_norm": 0.07238329201936722,
+      "learning_rate": 0.00028722222222222224,
+      "loss": 0.0214,
+      "step": 484
+    },
+    {
+      "epoch": 0.25299947835159103,
+      "grad_norm": 0.04379934072494507,
+      "learning_rate": 0.0002866666666666667,
+      "loss": 0.0259,
+      "step": 485
+    },
+    {
+      "epoch": 0.2535211267605634,
+      "grad_norm": 0.0694945827126503,
+      "learning_rate": 0.0002861111111111111,
+      "loss": 0.0183,
+      "step": 486
+    },
+    {
+      "epoch": 0.2540427751695357,
+      "grad_norm": 0.056973714381456375,
+      "learning_rate": 0.00028555555555555555,
+      "loss": 0.0272,
+      "step": 487
+    },
+    {
+      "epoch": 0.25456442357850806,
+      "grad_norm": 0.05177774280309677,
+      "learning_rate": 0.000285,
+      "loss": 0.0275,
+      "step": 488
+    },
+    {
+      "epoch": 0.25508607198748046,
+      "grad_norm": 0.08025998622179031,
+      "learning_rate": 0.0002844444444444444,
+      "loss": 0.0275,
+      "step": 489
+    },
+    {
+      "epoch": 0.2556077203964528,
+      "grad_norm": 0.04469485580921173,
+      "learning_rate": 0.0002838888888888889,
+      "loss": 0.0233,
+      "step": 490
+    },
+    {
+      "epoch": 0.25612936880542514,
+      "grad_norm": 0.06418605148792267,
+      "learning_rate": 0.00028333333333333335,
+      "loss": 0.0315,
+      "step": 491
+    },
+    {
+      "epoch": 0.2566510172143975,
+      "grad_norm": 0.09197323769330978,
+      "learning_rate": 0.0002827777777777778,
+      "loss": 0.0252,
+      "step": 492
+    },
+    {
+      "epoch": 0.2571726656233698,
+      "grad_norm": 0.058225080370903015,
+      "learning_rate": 0.00028222222222222223,
+      "loss": 0.0273,
+      "step": 493
+    },
+    {
+      "epoch": 0.2576943140323422,
+      "grad_norm": 0.03813721984624863,
+      "learning_rate": 0.00028166666666666666,
+      "loss": 0.0244,
+      "step": 494
+    },
+    {
+      "epoch": 0.25821596244131456,
+      "grad_norm": 0.07051227241754532,
+      "learning_rate": 0.0002811111111111111,
+      "loss": 0.0243,
+      "step": 495
+    },
+    {
+      "epoch": 0.2587376108502869,
+      "grad_norm": 0.07289328426122665,
+      "learning_rate": 0.00028055555555555554,
+      "loss": 0.0275,
+      "step": 496
+    },
+    {
+      "epoch": 0.25925925925925924,
+      "grad_norm": 0.08466362953186035,
+      "learning_rate": 0.00028000000000000003,
+      "loss": 0.0107,
+      "step": 497
+    },
+    {
+      "epoch": 0.2597809076682316,
+      "grad_norm": 0.06937390565872192,
+      "learning_rate": 0.00027944444444444447,
+      "loss": 0.0277,
+      "step": 498
+    },
+    {
+      "epoch": 0.260302556077204,
+      "grad_norm": 0.07908118516206741,
+      "learning_rate": 0.0002788888888888889,
+      "loss": 0.013,
+      "step": 499
+    },
+    {
+      "epoch": 0.2608242044861763,
+      "grad_norm": 0.05328572168946266,
+      "learning_rate": 0.00027833333333333334,
+      "loss": 0.0241,
+      "step": 500
+    },
+    {
+      "epoch": 0.26134585289514867,
+      "grad_norm": 0.039753999561071396,
+      "learning_rate": 0.0002777777777777778,
+      "loss": 0.0184,
+      "step": 501
+    },
+    {
+      "epoch": 0.261867501304121,
+      "grad_norm": 0.046303607523441315,
+      "learning_rate": 0.0002772222222222222,
+      "loss": 0.024,
+      "step": 502
+    },
+    {
+      "epoch": 0.26238914971309335,
+      "grad_norm": 0.07334708422422409,
+      "learning_rate": 0.00027666666666666665,
+      "loss": 0.0117,
+      "step": 503
+    },
+    {
+      "epoch": 0.26291079812206575,
+      "grad_norm": 0.08413495868444443,
+      "learning_rate": 0.0002761111111111111,
+      "loss": 0.032,
+      "step": 504
+    },
+    {
+      "epoch": 0.2634324465310381,
+      "grad_norm": 0.05528826639056206,
+      "learning_rate": 0.0002755555555555556,
+      "loss": 0.0263,
+      "step": 505
+    },
+    {
+      "epoch": 0.26395409494001043,
+      "grad_norm": 0.04823659360408783,
+      "learning_rate": 0.000275,
+      "loss": 0.0272,
+      "step": 506
+    },
+    {
+      "epoch": 0.2644757433489828,
+      "grad_norm": 0.06403996050357819,
+      "learning_rate": 0.00027444444444444445,
+      "loss": 0.016,
+      "step": 507
+    },
+    {
+      "epoch": 0.2649973917579551,
+      "grad_norm": 0.044559504836797714,
+      "learning_rate": 0.0002738888888888889,
+      "loss": 0.024,
+      "step": 508
+    },
+    {
+      "epoch": 0.2655190401669275,
+      "grad_norm": 0.050583697855472565,
+      "learning_rate": 0.00027333333333333333,
+      "loss": 0.0227,
+      "step": 509
+    },
+    {
+      "epoch": 0.26604068857589985,
+      "grad_norm": 0.06379957497119904,
+      "learning_rate": 0.00027277777777777776,
+      "loss": 0.0353,
+      "step": 510
+    },
+    {
+      "epoch": 0.2665623369848722,
+      "grad_norm": 0.05423254147171974,
+      "learning_rate": 0.0002722222222222222,
+      "loss": 0.023,
+      "step": 511
+    },
+    {
+      "epoch": 0.26708398539384454,
+      "grad_norm": 0.05240878462791443,
+      "learning_rate": 0.0002716666666666667,
+      "loss": 0.0249,
+      "step": 512
+    },
+    {
+      "epoch": 0.2676056338028169,
+      "grad_norm": 0.0713438093662262,
+      "learning_rate": 0.00027111111111111113,
+      "loss": 0.0258,
+      "step": 513
+    },
+    {
+      "epoch": 0.2681272822117893,
+      "grad_norm": 0.05144781991839409,
+      "learning_rate": 0.00027055555555555557,
+      "loss": 0.0262,
+      "step": 514
+    },
+    {
+      "epoch": 0.2686489306207616,
+      "grad_norm": 0.052076008170843124,
+      "learning_rate": 0.00027,
+      "loss": 0.0106,
+      "step": 515
+    },
+    {
+      "epoch": 0.26917057902973396,
+      "grad_norm": 0.05983636528253555,
+      "learning_rate": 0.00026944444444444444,
+      "loss": 0.0271,
+      "step": 516
+    },
+    {
+      "epoch": 0.2696922274387063,
+      "grad_norm": 0.05891017988324165,
+      "learning_rate": 0.00026888888888888893,
+      "loss": 0.0267,
+      "step": 517
+    },
+    {
+      "epoch": 0.27021387584767864,
+      "grad_norm": 0.06906379759311676,
+      "learning_rate": 0.0002683333333333333,
+      "loss": 0.0268,
+      "step": 518
+    },
+    {
+      "epoch": 0.27073552425665104,
+      "grad_norm": 0.07758453488349915,
+      "learning_rate": 0.0002677777777777778,
+      "loss": 0.0254,
+      "step": 519
+    },
+    {
+      "epoch": 0.2712571726656234,
+      "grad_norm": 0.04410182312130928,
+      "learning_rate": 0.00026722222222222224,
+      "loss": 0.0206,
+      "step": 520
+    },
+    {
+      "epoch": 0.2717788210745957,
+      "grad_norm": 0.04364776983857155,
+      "learning_rate": 0.0002666666666666667,
+      "loss": 0.0256,
+      "step": 521
+    },
+    {
+      "epoch": 0.27230046948356806,
+      "grad_norm": 0.067947156727314,
+      "learning_rate": 0.0002661111111111111,
+      "loss": 0.0276,
+      "step": 522
+    },
+    {
+      "epoch": 0.2728221178925404,
+      "grad_norm": 0.04528547078371048,
+      "learning_rate": 0.00026555555555555555,
+      "loss": 0.0243,
+      "step": 523
+    },
+    {
+      "epoch": 0.2733437663015128,
+      "grad_norm": 0.06837579607963562,
+      "learning_rate": 0.00026500000000000004,
+      "loss": 0.0105,
+      "step": 524
+    },
+    {
+      "epoch": 0.27386541471048514,
+      "grad_norm": 0.09349052608013153,
+      "learning_rate": 0.00026444444444444443,
+      "loss": 0.0316,
+      "step": 525
+    },
+    {
+      "epoch": 0.2743870631194575,
+      "grad_norm": 0.04515364021062851,
+      "learning_rate": 0.0002638888888888889,
+      "loss": 0.0225,
+      "step": 526
+    },
+    {
+      "epoch": 0.2749087115284298,
+      "grad_norm": 0.05468389391899109,
+      "learning_rate": 0.0002633333333333333,
+      "loss": 0.0158,
+      "step": 527
+    },
+    {
+      "epoch": 0.27543035993740217,
+      "grad_norm": 0.058201421052217484,
+      "learning_rate": 0.0002627777777777778,
+      "loss": 0.0237,
+      "step": 528
+    },
+    {
+      "epoch": 0.27595200834637457,
+      "grad_norm": 0.0540071465075016,
+      "learning_rate": 0.00026222222222222223,
+      "loss": 0.0256,
+      "step": 529
+    },
+    {
+      "epoch": 0.2764736567553469,
+      "grad_norm": 0.05313161760568619,
+      "learning_rate": 0.00026166666666666667,
+      "loss": 0.0208,
+      "step": 530
+    },
+    {
+      "epoch": 0.27699530516431925,
+      "grad_norm": 0.10889704525470734,
+      "learning_rate": 0.00026111111111111116,
+      "loss": 0.0329,
+      "step": 531
+    },
+    {
+      "epoch": 0.2775169535732916,
+      "grad_norm": 0.04877515137195587,
+      "learning_rate": 0.00026055555555555554,
+      "loss": 0.0285,
+      "step": 532
+    },
+    {
+      "epoch": 0.27803860198226393,
+      "grad_norm": 0.07354211062192917,
+      "learning_rate": 0.00026000000000000003,
+      "loss": 0.0326,
+      "step": 533
+    },
+    {
+      "epoch": 0.27856025039123633,
+      "grad_norm": 0.04824311286211014,
+      "learning_rate": 0.0002594444444444444,
+      "loss": 0.0216,
+      "step": 534
+    },
+    {
+      "epoch": 0.27908189880020867,
+      "grad_norm": 0.07306429743766785,
+      "learning_rate": 0.0002588888888888889,
+      "loss": 0.0265,
+      "step": 535
+    },
+    {
+      "epoch": 0.279603547209181,
+      "grad_norm": 0.07081807404756546,
+      "learning_rate": 0.00025833333333333334,
+      "loss": 0.0131,
+      "step": 536
+    },
+    {
+      "epoch": 0.28012519561815336,
+      "grad_norm": 0.06269505620002747,
+      "learning_rate": 0.0002577777777777778,
+      "loss": 0.0273,
+      "step": 537
+    },
+    {
+      "epoch": 0.2806468440271257,
+      "grad_norm": 0.042418792843818665,
+      "learning_rate": 0.00025722222222222227,
+      "loss": 0.0251,
+      "step": 538
+    },
+    {
+      "epoch": 0.2811684924360981,
+      "grad_norm": 0.0451393648982048,
+      "learning_rate": 0.00025666666666666665,
+      "loss": 0.0192,
+      "step": 539
+    },
+    {
+      "epoch": 0.28169014084507044,
+      "grad_norm": 0.07902763038873672,
+      "learning_rate": 0.00025611111111111114,
+      "loss": 0.0156,
+      "step": 540
+    },
+    {
+      "epoch": 0.2822117892540428,
+      "grad_norm": 0.046156350523233414,
+      "learning_rate": 0.00025555555555555553,
+      "loss": 0.0229,
+      "step": 541
+    },
+    {
+      "epoch": 0.2827334376630151,
+      "grad_norm": 0.04461774230003357,
+      "learning_rate": 0.000255,
+      "loss": 0.0242,
+      "step": 542
+    },
+    {
+      "epoch": 0.28325508607198746,
+      "grad_norm": 0.05216965079307556,
+      "learning_rate": 0.0002544444444444444,
+      "loss": 0.0198,
+      "step": 543
+    },
+    {
+      "epoch": 0.28377673448095986,
+      "grad_norm": 0.07048270106315613,
+      "learning_rate": 0.0002538888888888889,
+      "loss": 0.0163,
+      "step": 544
+    },
+    {
+      "epoch": 0.2842983828899322,
+      "grad_norm": 0.06380292773246765,
+      "learning_rate": 0.0002533333333333334,
+      "loss": 0.017,
+      "step": 545
+    },
+    {
+      "epoch": 0.28482003129890454,
+      "grad_norm": 0.047167252749204636,
+      "learning_rate": 0.00025277777777777777,
+      "loss": 0.0236,
+      "step": 546
+    },
+    {
+      "epoch": 0.2853416797078769,
+      "grad_norm": 0.0429813452064991,
+      "learning_rate": 0.00025222222222222226,
+      "loss": 0.0271,
+      "step": 547
+    },
+    {
+      "epoch": 0.2858633281168492,
+      "grad_norm": 0.11126257479190826,
+      "learning_rate": 0.00025166666666666664,
+      "loss": 0.0342,
+      "step": 548
+    },
+    {
+      "epoch": 0.2863849765258216,
+      "grad_norm": 0.03414067253470421,
+      "learning_rate": 0.00025111111111111113,
+      "loss": 0.0181,
+      "step": 549
+    },
+    {
+      "epoch": 0.28690662493479396,
+      "grad_norm": 0.04244649410247803,
+      "learning_rate": 0.0002505555555555555,
+      "loss": 0.0214,
+      "step": 550
+    },
+    {
+      "epoch": 0.2874282733437663,
+      "grad_norm": 0.06161463260650635,
+      "learning_rate": 0.00025,
+      "loss": 0.0161,
+      "step": 551
+    },
+    {
+      "epoch": 0.28794992175273865,
+      "grad_norm": 0.042669475078582764,
+      "learning_rate": 0.00024944444444444444,
+      "loss": 0.0248,
+      "step": 552
+    },
+    {
+      "epoch": 0.288471570161711,
+      "grad_norm": 0.08395751565694809,
+      "learning_rate": 0.0002488888888888889,
+      "loss": 0.0305,
+      "step": 553
+    },
+    {
+      "epoch": 0.2889932185706834,
+      "grad_norm": 0.038790151476860046,
+      "learning_rate": 0.0002483333333333333,
+      "loss": 0.0249,
+      "step": 554
+    },
+    {
+      "epoch": 0.2895148669796557,
+      "grad_norm": 0.04883798584342003,
+      "learning_rate": 0.0002477777777777778,
+      "loss": 0.0267,
+      "step": 555
+    },
+    {
+      "epoch": 0.29003651538862807,
+      "grad_norm": 0.056415002793073654,
+      "learning_rate": 0.00024722222222222224,
+      "loss": 0.0204,
+      "step": 556
+    },
+    {
+      "epoch": 0.2905581637976004,
+      "grad_norm": 0.06406931579113007,
+      "learning_rate": 0.0002466666666666667,
+      "loss": 0.0164,
+      "step": 557
+    },
+    {
+      "epoch": 0.29107981220657275,
+      "grad_norm": 0.06549858301877975,
+      "learning_rate": 0.0002461111111111111,
+      "loss": 0.0259,
+      "step": 558
+    },
+    {
+      "epoch": 0.29160146061554515,
+      "grad_norm": 0.05236493796110153,
+      "learning_rate": 0.00024555555555555556,
+      "loss": 0.0182,
+      "step": 559
+    },
+    {
+      "epoch": 0.2921231090245175,
+      "grad_norm": 0.05339088663458824,
+      "learning_rate": 0.000245,
+      "loss": 0.0231,
+      "step": 560
+    },
+    {
+      "epoch": 0.29264475743348983,
+      "grad_norm": 0.052736297249794006,
+      "learning_rate": 0.00024444444444444443,
+      "loss": 0.0108,
+      "step": 561
+    },
+    {
+      "epoch": 0.2931664058424622,
+      "grad_norm": 0.03991522639989853,
+      "learning_rate": 0.0002438888888888889,
+      "loss": 0.0135,
+      "step": 562
+    },
+    {
+      "epoch": 0.2936880542514345,
+      "grad_norm": 0.053174279630184174,
+      "learning_rate": 0.00024333333333333336,
+      "loss": 0.0222,
+      "step": 563
+    },
+    {
+      "epoch": 0.2942097026604069,
+      "grad_norm": 0.07549899816513062,
+      "learning_rate": 0.0002427777777777778,
+      "loss": 0.0236,
+      "step": 564
+    },
+    {
+      "epoch": 0.29473135106937925,
+      "grad_norm": 0.04507315158843994,
+      "learning_rate": 0.00024222222222222223,
+      "loss": 0.0219,
+      "step": 565
+    },
+    {
+      "epoch": 0.2952529994783516,
+      "grad_norm": 0.05438590794801712,
+      "learning_rate": 0.00024166666666666667,
+      "loss": 0.0135,
+      "step": 566
+    },
+    {
+      "epoch": 0.29577464788732394,
+      "grad_norm": 0.04266679286956787,
+      "learning_rate": 0.0002411111111111111,
+      "loss": 0.0125,
+      "step": 567
+    },
+    {
+      "epoch": 0.2962962962962963,
+      "grad_norm": 0.08215553313493729,
+      "learning_rate": 0.00024055555555555554,
+      "loss": 0.0296,
+      "step": 568
+    },
+    {
+      "epoch": 0.2968179447052686,
+      "grad_norm": 0.13914398849010468,
+      "learning_rate": 0.00024,
+      "loss": 0.0346,
+      "step": 569
+    },
+    {
+      "epoch": 0.297339593114241,
+      "grad_norm": 0.035860706120729446,
+      "learning_rate": 0.00023944444444444444,
+      "loss": 0.0145,
+      "step": 570
+    },
+    {
+      "epoch": 0.29786124152321336,
+      "grad_norm": 0.038400448858737946,
+      "learning_rate": 0.0002388888888888889,
+      "loss": 0.024,
+      "step": 571
+    },
+    {
+      "epoch": 0.2983828899321857,
+      "grad_norm": 0.07077977806329727,
+      "learning_rate": 0.00023833333333333334,
+      "loss": 0.0293,
+      "step": 572
+    },
+    {
+      "epoch": 0.29890453834115804,
+      "grad_norm": 0.05618384853005409,
+      "learning_rate": 0.00023777777777777778,
+      "loss": 0.0268,
+      "step": 573
+    },
+    {
+      "epoch": 0.2994261867501304,
+      "grad_norm": 0.041202716529369354,
+      "learning_rate": 0.00023722222222222222,
+      "loss": 0.0191,
+      "step": 574
+    },
+    {
+      "epoch": 0.2999478351591028,
+      "grad_norm": 0.06533535569906235,
+      "learning_rate": 0.00023666666666666668,
+      "loss": 0.0253,
+      "step": 575
+    },
+    {
+      "epoch": 0.3004694835680751,
+      "grad_norm": 0.04670009762048721,
+      "learning_rate": 0.00023611111111111112,
+      "loss": 0.0257,
+      "step": 576
+    },
+    {
+      "epoch": 0.30099113197704747,
+      "grad_norm": 0.039087213575839996,
+      "learning_rate": 0.00023555555555555556,
+      "loss": 0.025,
+      "step": 577
+    },
+    {
+      "epoch": 0.3015127803860198,
+      "grad_norm": 0.06907133758068085,
+      "learning_rate": 0.000235,
+      "loss": 0.024,
+      "step": 578
+    },
+    {
+      "epoch": 0.30203442879499215,
+      "grad_norm": 0.0662020891904831,
+      "learning_rate": 0.00023444444444444446,
+      "loss": 0.0235,
+      "step": 579
+    },
+    {
+      "epoch": 0.30255607720396455,
+      "grad_norm": 0.03638078272342682,
+      "learning_rate": 0.0002338888888888889,
+      "loss": 0.0193,
+      "step": 580
+    },
+    {
+      "epoch": 0.3030777256129369,
+      "grad_norm": 0.05564034357666969,
+      "learning_rate": 0.00023333333333333333,
+      "loss": 0.0164,
+      "step": 581
+    },
+    {
+      "epoch": 0.30359937402190923,
+      "grad_norm": 0.08538271486759186,
+      "learning_rate": 0.0002327777777777778,
+      "loss": 0.0109,
+      "step": 582
+    },
+    {
+      "epoch": 0.30412102243088157,
+      "grad_norm": 0.06491502374410629,
+      "learning_rate": 0.00023222222222222223,
+      "loss": 0.03,
+      "step": 583
+    },
+    {
+      "epoch": 0.3046426708398539,
+      "grad_norm": 0.051391441375017166,
+      "learning_rate": 0.00023166666666666667,
+      "loss": 0.0239,
+      "step": 584
+    },
+    {
+      "epoch": 0.3051643192488263,
+      "grad_norm": 0.041383545845746994,
+      "learning_rate": 0.0002311111111111111,
+      "loss": 0.0235,
+      "step": 585
+    },
+    {
+      "epoch": 0.30568596765779865,
+      "grad_norm": 0.06924084573984146,
+      "learning_rate": 0.00023055555555555557,
+      "loss": 0.0278,
+      "step": 586
+    },
+    {
+      "epoch": 0.306207616066771,
+      "grad_norm": 0.04862818121910095,
+      "learning_rate": 0.00023,
+      "loss": 0.0102,
+      "step": 587
+    },
+    {
+      "epoch": 0.30672926447574334,
+      "grad_norm": 0.041307900100946426,
+      "learning_rate": 0.00022944444444444444,
+      "loss": 0.024,
+      "step": 588
+    },
+    {
+      "epoch": 0.3072509128847157,
+      "grad_norm": 0.06442257761955261,
+      "learning_rate": 0.0002288888888888889,
+      "loss": 0.0247,
+      "step": 589
+    },
+    {
+      "epoch": 0.3077725612936881,
+      "grad_norm": 0.0453709252178669,
+      "learning_rate": 0.00022833333333333334,
+      "loss": 0.0184,
+      "step": 590
+    },
+    {
+      "epoch": 0.3082942097026604,
+      "grad_norm": 0.08471877872943878,
+      "learning_rate": 0.00022777777777777778,
+      "loss": 0.0319,
+      "step": 591
+    },
+    {
+      "epoch": 0.30881585811163276,
+      "grad_norm": 0.0712469220161438,
+      "learning_rate": 0.00022722222222222222,
+      "loss": 0.0281,
+      "step": 592
+    },
+    {
+      "epoch": 0.3093375065206051,
+      "grad_norm": 0.0397157222032547,
+      "learning_rate": 0.00022666666666666666,
+      "loss": 0.024,
+      "step": 593
+    },
+    {
+      "epoch": 0.30985915492957744,
+      "grad_norm": 0.03950037062168121,
+      "learning_rate": 0.00022611111111111112,
+      "loss": 0.0239,
+      "step": 594
+    },
+    {
+      "epoch": 0.31038080333854984,
+      "grad_norm": 0.05540947616100311,
+      "learning_rate": 0.00022555555555555556,
+      "loss": 0.0261,
+      "step": 595
+    },
+    {
+      "epoch": 0.3109024517475222,
+      "grad_norm": 0.06277068704366684,
+      "learning_rate": 0.00022500000000000002,
+      "loss": 0.0172,
+      "step": 596
+    },
+    {
+      "epoch": 0.3114241001564945,
+      "grad_norm": 0.06810203939676285,
+      "learning_rate": 0.00022444444444444446,
+      "loss": 0.0234,
+      "step": 597
+    },
+    {
+      "epoch": 0.31194574856546686,
+      "grad_norm": 0.03866199776530266,
+      "learning_rate": 0.0002238888888888889,
+      "loss": 0.0246,
+      "step": 598
+    },
+    {
+      "epoch": 0.3124673969744392,
+      "grad_norm": 0.034964367747306824,
+      "learning_rate": 0.00022333333333333333,
+      "loss": 0.0221,
+      "step": 599
+    },
+    {
+      "epoch": 0.3129890453834116,
+      "grad_norm": 0.045828189700841904,
+      "learning_rate": 0.00022277777777777777,
+      "loss": 0.0222,
+      "step": 600
+    },
+    {
+      "epoch": 0.31351069379238394,
+      "grad_norm": 0.038094017654657364,
+      "learning_rate": 0.0002222222222222222,
+      "loss": 0.0201,
+      "step": 601
+    },
+    {
+      "epoch": 0.3140323422013563,
+      "grad_norm": 0.09865216165781021,
+      "learning_rate": 0.00022166666666666667,
+      "loss": 0.0313,
+      "step": 602
+    },
+    {
+      "epoch": 0.3145539906103286,
+      "grad_norm": 0.03507848456501961,
+      "learning_rate": 0.00022111111111111113,
+      "loss": 0.0213,
+      "step": 603
+    },
+    {
+      "epoch": 0.31507563901930097,
+      "grad_norm": 0.07374833524227142,
+      "learning_rate": 0.00022055555555555557,
+      "loss": 0.0289,
+      "step": 604
+    },
+    {
+      "epoch": 0.31559728742827337,
+      "grad_norm": 0.06977537274360657,
+      "learning_rate": 0.00022,
+      "loss": 0.0158,
+      "step": 605
+    },
+    {
+      "epoch": 0.3161189358372457,
+      "grad_norm": 0.07651909440755844,
+      "learning_rate": 0.00021944444444444444,
+      "loss": 0.0158,
+      "step": 606
+    },
+    {
+      "epoch": 0.31664058424621805,
+      "grad_norm": 0.05928812175989151,
+      "learning_rate": 0.00021888888888888888,
+      "loss": 0.0128,
+      "step": 607
+    },
+    {
+      "epoch": 0.3171622326551904,
+      "grad_norm": 0.047975439578294754,
+      "learning_rate": 0.00021833333333333332,
+      "loss": 0.0219,
+      "step": 608
+    },
+    {
+      "epoch": 0.31768388106416273,
+      "grad_norm": 0.036830369383096695,
+      "learning_rate": 0.00021777777777777776,
+      "loss": 0.0187,
+      "step": 609
+    },
+    {
+      "epoch": 0.31820552947313513,
+      "grad_norm": 0.03662344813346863,
+      "learning_rate": 0.00021722222222222225,
+      "loss": 0.0212,
+      "step": 610
+    },
+    {
+      "epoch": 0.31872717788210747,
+      "grad_norm": 0.038968924432992935,
+      "learning_rate": 0.00021666666666666668,
+      "loss": 0.0229,
+      "step": 611
+    },
+    {
+      "epoch": 0.3192488262910798,
+      "grad_norm": 0.03521070256829262,
+      "learning_rate": 0.00021611111111111112,
+      "loss": 0.0223,
+      "step": 612
+    },
+    {
+      "epoch": 0.31977047470005215,
+      "grad_norm": 0.05176553502678871,
+      "learning_rate": 0.00021555555555555556,
+      "loss": 0.0156,
+      "step": 613
+    },
+    {
+      "epoch": 0.3202921231090245,
+      "grad_norm": 0.07710260897874832,
+      "learning_rate": 0.000215,
+      "loss": 0.0256,
+      "step": 614
+    },
+    {
+      "epoch": 0.3208137715179969,
+      "grad_norm": 0.044599100947380066,
+      "learning_rate": 0.00021444444444444443,
+      "loss": 0.0144,
+      "step": 615
+    },
+    {
+      "epoch": 0.32133541992696923,
+      "grad_norm": 0.04929358512163162,
+      "learning_rate": 0.0002138888888888889,
+      "loss": 0.0256,
+      "step": 616
+    },
+    {
+      "epoch": 0.3218570683359416,
+      "grad_norm": 0.04041970521211624,
+      "learning_rate": 0.00021333333333333336,
+      "loss": 0.0257,
+      "step": 617
+    },
+    {
+      "epoch": 0.3223787167449139,
+      "grad_norm": 0.09948903322219849,
+      "learning_rate": 0.0002127777777777778,
+      "loss": 0.0284,
+      "step": 618
+    },
+    {
+      "epoch": 0.32290036515388626,
+      "grad_norm": 0.08420311659574509,
+      "learning_rate": 0.00021222222222222223,
+      "loss": 0.0271,
+      "step": 619
+    },
+    {
+      "epoch": 0.32342201356285866,
+      "grad_norm": 0.05317756533622742,
+      "learning_rate": 0.00021166666666666667,
+      "loss": 0.0106,
+      "step": 620
+    },
+    {
+      "epoch": 0.323943661971831,
+      "grad_norm": 0.08581392467021942,
+      "learning_rate": 0.0002111111111111111,
+      "loss": 0.0241,
+      "step": 621
+    },
+    {
+      "epoch": 0.32446531038080334,
+      "grad_norm": 0.04383018612861633,
+      "learning_rate": 0.00021055555555555554,
+      "loss": 0.0216,
+      "step": 622
+    },
+    {
+      "epoch": 0.3249869587897757,
+      "grad_norm": 0.04261196032166481,
+      "learning_rate": 0.00021,
+      "loss": 0.0242,
+      "step": 623
+    },
+    {
+      "epoch": 0.325508607198748,
+      "grad_norm": 0.06482464075088501,
+      "learning_rate": 0.00020944444444444445,
+      "loss": 0.0251,
+      "step": 624
+    },
+    {
+      "epoch": 0.3260302556077204,
+      "grad_norm": 0.045221809297800064,
+      "learning_rate": 0.0002088888888888889,
+      "loss": 0.0213,
+      "step": 625
+    },
+    {
+      "epoch": 0.32655190401669276,
+      "grad_norm": 0.07963284850120544,
+      "learning_rate": 0.00020833333333333335,
+      "loss": 0.0312,
+      "step": 626
+    },
+    {
+      "epoch": 0.3270735524256651,
+      "grad_norm": 0.11016980558633804,
+      "learning_rate": 0.00020777777777777778,
+      "loss": 0.0333,
+      "step": 627
+    },
+    {
+      "epoch": 0.32759520083463745,
+      "grad_norm": 0.038222573697566986,
+      "learning_rate": 0.00020722222222222222,
+      "loss": 0.0224,
+      "step": 628
+    },
+    {
+      "epoch": 0.3281168492436098,
+      "grad_norm": 0.05089324340224266,
+      "learning_rate": 0.00020666666666666666,
+      "loss": 0.0242,
+      "step": 629
+    },
+    {
+      "epoch": 0.3286384976525822,
+      "grad_norm": 0.05707726627588272,
+      "learning_rate": 0.00020611111111111112,
+      "loss": 0.024,
+      "step": 630
+    },
+    {
+      "epoch": 0.3291601460615545,
+      "grad_norm": 0.04010495916008949,
+      "learning_rate": 0.00020555555555555556,
+      "loss": 0.0236,
+      "step": 631
+    },
+    {
+      "epoch": 0.32968179447052687,
+      "grad_norm": 0.03976452723145485,
+      "learning_rate": 0.000205,
+      "loss": 0.0213,
+      "step": 632
+    },
+    {
+      "epoch": 0.3302034428794992,
+      "grad_norm": 0.042373333126306534,
+      "learning_rate": 0.00020444444444444446,
+      "loss": 0.0283,
+      "step": 633
+    },
+    {
+      "epoch": 0.33072509128847155,
+      "grad_norm": 0.06429338455200195,
+      "learning_rate": 0.0002038888888888889,
+      "loss": 0.0165,
+      "step": 634
+    },
+    {
+      "epoch": 0.33124673969744395,
+      "grad_norm": 0.03952011466026306,
+      "learning_rate": 0.00020333333333333333,
+      "loss": 0.0237,
+      "step": 635
+    },
+    {
+      "epoch": 0.3317683881064163,
+      "grad_norm": 0.061460334807634354,
+      "learning_rate": 0.00020277777777777777,
+      "loss": 0.0177,
+      "step": 636
+    },
+    {
+      "epoch": 0.33229003651538863,
+      "grad_norm": 0.05908782780170441,
+      "learning_rate": 0.00020222222222222223,
+      "loss": 0.0223,
+      "step": 637
+    },
+    {
+      "epoch": 0.332811684924361,
+      "grad_norm": 0.04075014218688011,
+      "learning_rate": 0.00020166666666666667,
+      "loss": 0.013,
+      "step": 638
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.04613792523741722,
+      "learning_rate": 0.0002011111111111111,
+      "loss": 0.0178,
+      "step": 639
+    },
+    {
+      "epoch": 0.3338549817423057,
+      "grad_norm": 0.04457557201385498,
+      "learning_rate": 0.00020055555555555555,
+      "loss": 0.0139,
+      "step": 640
+    },
+    {
+      "epoch": 0.33437663015127805,
+      "grad_norm": 0.04981216415762901,
+      "learning_rate": 0.0002,
+      "loss": 0.0213,
+      "step": 641
+    },
+    {
+      "epoch": 0.3348982785602504,
+      "grad_norm": 0.04061228781938553,
+      "learning_rate": 0.00019944444444444445,
+      "loss": 0.0224,
+      "step": 642
+    },
+    {
+      "epoch": 0.33541992696922274,
+      "grad_norm": 0.0416584387421608,
+      "learning_rate": 0.00019888888888888888,
+      "loss": 0.024,
+      "step": 643
+    },
+    {
+      "epoch": 0.3359415753781951,
+      "grad_norm": 0.06171800196170807,
+      "learning_rate": 0.00019833333333333335,
+      "loss": 0.0148,
+      "step": 644
+    },
+    {
+      "epoch": 0.3364632237871675,
+      "grad_norm": 0.03681602329015732,
+      "learning_rate": 0.00019777777777777778,
+      "loss": 0.0177,
+      "step": 645
+    },
+    {
+      "epoch": 0.3369848721961398,
+      "grad_norm": 0.03308926522731781,
+      "learning_rate": 0.00019722222222222222,
+      "loss": 0.0181,
+      "step": 646
+    },
+    {
+      "epoch": 0.33750652060511216,
+      "grad_norm": 0.04177209734916687,
+      "learning_rate": 0.00019666666666666666,
+      "loss": 0.0284,
+      "step": 647
+    },
+    {
+      "epoch": 0.3380281690140845,
+      "grad_norm": 0.04754487797617912,
+      "learning_rate": 0.00019611111111111112,
+      "loss": 0.0187,
+      "step": 648
+    },
+    {
+      "epoch": 0.33854981742305684,
+      "grad_norm": 0.042892929166555405,
+      "learning_rate": 0.00019555555555555556,
+      "loss": 0.0273,
+      "step": 649
+    },
+    {
+      "epoch": 0.33907146583202924,
+      "grad_norm": 0.04137077182531357,
+      "learning_rate": 0.00019500000000000002,
+      "loss": 0.0134,
+      "step": 650
+    },
+    {
+      "epoch": 0.3395931142410016,
+      "grad_norm": 0.04692915827035904,
+      "learning_rate": 0.00019444444444444446,
+      "loss": 0.0212,
+      "step": 651
+    },
+    {
+      "epoch": 0.3401147626499739,
+      "grad_norm": 0.038703177124261856,
+      "learning_rate": 0.0001938888888888889,
+      "loss": 0.021,
+      "step": 652
+    },
+    {
+      "epoch": 0.34063641105894626,
+      "grad_norm": 0.03845067694783211,
+      "learning_rate": 0.00019333333333333333,
+      "loss": 0.0173,
+      "step": 653
+    },
+    {
+      "epoch": 0.3411580594679186,
+      "grad_norm": 0.0640154704451561,
+      "learning_rate": 0.00019277777777777777,
+      "loss": 0.0095,
+      "step": 654
+    },
+    {
+      "epoch": 0.34167970787689095,
+      "grad_norm": 0.043140705674886703,
+      "learning_rate": 0.0001922222222222222,
+      "loss": 0.024,
+      "step": 655
+    },
+    {
+      "epoch": 0.34220135628586335,
+      "grad_norm": 0.05687430500984192,
+      "learning_rate": 0.00019166666666666667,
+      "loss": 0.0254,
+      "step": 656
+    },
+    {
+      "epoch": 0.3427230046948357,
+      "grad_norm": 0.07442109286785126,
+      "learning_rate": 0.00019111111111111114,
+      "loss": 0.0281,
+      "step": 657
+    },
+    {
+      "epoch": 0.34324465310380803,
+      "grad_norm": 0.04823756590485573,
+      "learning_rate": 0.00019055555555555557,
+      "loss": 0.0232,
+      "step": 658
+    },
+    {
+      "epoch": 0.34376630151278037,
+      "grad_norm": 0.04826001450419426,
+      "learning_rate": 0.00019,
+      "loss": 0.0203,
+      "step": 659
+    },
+    {
+      "epoch": 0.3442879499217527,
+      "grad_norm": 0.044953636825084686,
+      "learning_rate": 0.00018944444444444445,
+      "loss": 0.0226,
+      "step": 660
+    },
+    {
+      "epoch": 0.3448095983307251,
+      "grad_norm": 0.0559956319630146,
+      "learning_rate": 0.00018888888888888888,
+      "loss": 0.0231,
+      "step": 661
+    },
+    {
+      "epoch": 0.34533124673969745,
+      "grad_norm": 0.07470231503248215,
+      "learning_rate": 0.00018833333333333332,
+      "loss": 0.0241,
+      "step": 662
+    },
+    {
+      "epoch": 0.3458528951486698,
+      "grad_norm": 0.044720359146595,
+      "learning_rate": 0.00018777777777777776,
+      "loss": 0.0098,
+      "step": 663
+    },
+    {
+      "epoch": 0.34637454355764213,
+      "grad_norm": 0.027653418481349945,
+      "learning_rate": 0.00018722222222222225,
+      "loss": 0.0148,
+      "step": 664
+    },
+    {
+      "epoch": 0.3468961919666145,
+      "grad_norm": 0.10543134808540344,
+      "learning_rate": 0.0001866666666666667,
+      "loss": 0.0332,
+      "step": 665
+    },
+    {
+      "epoch": 0.3474178403755869,
+      "grad_norm": 0.031183289363980293,
+      "learning_rate": 0.00018611111111111112,
+      "loss": 0.0213,
+      "step": 666
+    },
+    {
+      "epoch": 0.3479394887845592,
+      "grad_norm": 0.09472862631082535,
+      "learning_rate": 0.00018555555555555556,
+      "loss": 0.0234,
+      "step": 667
+    },
+    {
+      "epoch": 0.34846113719353156,
+      "grad_norm": 0.08138673007488251,
+      "learning_rate": 0.000185,
+      "loss": 0.0294,
+      "step": 668
+    },
+    {
+      "epoch": 0.3489827856025039,
+      "grad_norm": 0.07677923887968063,
+      "learning_rate": 0.00018444444444444443,
+      "loss": 0.0147,
+      "step": 669
+    },
+    {
+      "epoch": 0.34950443401147624,
+      "grad_norm": 0.11338996142148972,
+      "learning_rate": 0.00018388888888888887,
+      "loss": 0.0339,
+      "step": 670
+    },
+    {
+      "epoch": 0.35002608242044864,
+      "grad_norm": 0.05841991677880287,
+      "learning_rate": 0.00018333333333333334,
+      "loss": 0.0283,
+      "step": 671
+    },
+    {
+      "epoch": 0.350547730829421,
+      "grad_norm": 0.06760915368795395,
+      "learning_rate": 0.0001827777777777778,
+      "loss": 0.0295,
+      "step": 672
+    },
+    {
+      "epoch": 0.3510693792383933,
+      "grad_norm": 0.042751237750053406,
+      "learning_rate": 0.00018222222222222224,
+      "loss": 0.0161,
+      "step": 673
+    },
+    {
+      "epoch": 0.35159102764736566,
+      "grad_norm": 0.06219693645834923,
+      "learning_rate": 0.00018166666666666667,
+      "loss": 0.014,
+      "step": 674
+    },
+    {
+      "epoch": 0.352112676056338,
+      "grad_norm": 0.03916552662849426,
+      "learning_rate": 0.0001811111111111111,
+      "loss": 0.0255,
+      "step": 675
+    },
+    {
+      "epoch": 0.3526343244653104,
+      "grad_norm": 0.04349285736680031,
+      "learning_rate": 0.00018055555555555555,
+      "loss": 0.0212,
+      "step": 676
+    },
+    {
+      "epoch": 0.35315597287428274,
+      "grad_norm": 0.04847508296370506,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 0.0109,
+      "step": 677
+    },
+    {
+      "epoch": 0.3536776212832551,
+      "grad_norm": 0.07422209531068802,
+      "learning_rate": 0.00017944444444444445,
+      "loss": 0.0222,
+      "step": 678
+    },
+    {
+      "epoch": 0.3541992696922274,
+      "grad_norm": 0.040487054735422134,
+      "learning_rate": 0.00017888888888888889,
+      "loss": 0.0175,
+      "step": 679
+    },
+    {
+      "epoch": 0.35472091810119977,
+      "grad_norm": 0.039799537509679794,
+      "learning_rate": 0.00017833333333333335,
+      "loss": 0.0203,
+      "step": 680
+    },
+    {
+      "epoch": 0.35524256651017216,
+      "grad_norm": 0.04392382130026817,
+      "learning_rate": 0.00017777777777777779,
+      "loss": 0.0184,
+      "step": 681
+    },
+    {
+      "epoch": 0.3557642149191445,
+      "grad_norm": 0.03797397017478943,
+      "learning_rate": 0.00017722222222222222,
+      "loss": 0.0232,
+      "step": 682
+    },
+    {
+      "epoch": 0.35628586332811685,
+      "grad_norm": 0.04700712487101555,
+      "learning_rate": 0.00017666666666666666,
+      "loss": 0.0165,
+      "step": 683
+    },
+    {
+      "epoch": 0.3568075117370892,
+      "grad_norm": 0.03243011608719826,
+      "learning_rate": 0.00017611111111111112,
+      "loss": 0.0196,
+      "step": 684
+    },
+    {
+      "epoch": 0.35732916014606153,
+      "grad_norm": 0.03285043686628342,
+      "learning_rate": 0.00017555555555555556,
+      "loss": 0.0207,
+      "step": 685
+    },
+    {
+      "epoch": 0.35785080855503393,
+      "grad_norm": 0.0681452751159668,
+      "learning_rate": 0.000175,
+      "loss": 0.0273,
+      "step": 686
+    },
+    {
+      "epoch": 0.35837245696400627,
+      "grad_norm": 0.05350063368678093,
+      "learning_rate": 0.00017444444444444446,
+      "loss": 0.0218,
+      "step": 687
+    },
+    {
+      "epoch": 0.3588941053729786,
+      "grad_norm": 0.05729454755783081,
+      "learning_rate": 0.0001738888888888889,
+      "loss": 0.0245,
+      "step": 688
+    },
+    {
+      "epoch": 0.35941575378195095,
+      "grad_norm": 0.04530972242355347,
+      "learning_rate": 0.00017333333333333334,
+      "loss": 0.0211,
+      "step": 689
+    },
+    {
+      "epoch": 0.3599374021909233,
+      "grad_norm": 0.029124662280082703,
+      "learning_rate": 0.00017277777777777777,
+      "loss": 0.0189,
+      "step": 690
+    },
+    {
+      "epoch": 0.3604590505998957,
+      "grad_norm": 0.023517435416579247,
+      "learning_rate": 0.00017222222222222224,
+      "loss": 0.0136,
+      "step": 691
+    },
+    {
+      "epoch": 0.36098069900886803,
+      "grad_norm": 0.03519543632864952,
+      "learning_rate": 0.00017166666666666667,
+      "loss": 0.0148,
+      "step": 692
+    },
+    {
+      "epoch": 0.3615023474178404,
+      "grad_norm": 0.05330968275666237,
+      "learning_rate": 0.0001711111111111111,
+      "loss": 0.0149,
+      "step": 693
+    },
+    {
+      "epoch": 0.3620239958268127,
+      "grad_norm": 0.03034808114171028,
+      "learning_rate": 0.00017055555555555555,
+      "loss": 0.0198,
+      "step": 694
+    },
+    {
+      "epoch": 0.36254564423578506,
+      "grad_norm": 0.04207657277584076,
+      "learning_rate": 0.00017,
+      "loss": 0.0244,
+      "step": 695
+    },
+    {
+      "epoch": 0.36306729264475746,
+      "grad_norm": 0.0516652911901474,
+      "learning_rate": 0.00016944444444444445,
+      "loss": 0.0209,
+      "step": 696
+    },
+    {
+      "epoch": 0.3635889410537298,
+      "grad_norm": 0.07596902549266815,
+      "learning_rate": 0.00016888888888888889,
+      "loss": 0.0286,
+      "step": 697
+    },
+    {
+      "epoch": 0.36411058946270214,
+      "grad_norm": 0.042746830731630325,
+      "learning_rate": 0.00016833333333333335,
+      "loss": 0.0149,
+      "step": 698
+    },
+    {
+      "epoch": 0.3646322378716745,
+      "grad_norm": 0.0824197307229042,
+      "learning_rate": 0.0001677777777777778,
+      "loss": 0.0271,
+      "step": 699
+    },
+    {
+      "epoch": 0.3651538862806468,
+      "grad_norm": 0.0710834488272667,
+      "learning_rate": 0.00016722222222222222,
+      "loss": 0.0243,
+      "step": 700
+    },
+    {
+      "epoch": 0.3656755346896192,
+      "grad_norm": 0.03268688917160034,
+      "learning_rate": 0.00016666666666666666,
+      "loss": 0.0233,
+      "step": 701
+    },
+    {
+      "epoch": 0.36619718309859156,
+      "grad_norm": 0.032433949410915375,
+      "learning_rate": 0.0001661111111111111,
+      "loss": 0.0197,
+      "step": 702
+    },
+    {
+      "epoch": 0.3667188315075639,
+      "grad_norm": 0.048944734036922455,
+      "learning_rate": 0.00016555555555555556,
+      "loss": 0.0255,
+      "step": 703
+    },
+    {
+      "epoch": 0.36724047991653624,
+      "grad_norm": 0.030319012701511383,
+      "learning_rate": 0.000165,
+      "loss": 0.0198,
+      "step": 704
+    },
+    {
+      "epoch": 0.3677621283255086,
+      "grad_norm": 0.060266073793172836,
+      "learning_rate": 0.00016444444444444446,
+      "loss": 0.0226,
+      "step": 705
+    },
+    {
+      "epoch": 0.368283776734481,
+      "grad_norm": 0.06388845294713974,
+      "learning_rate": 0.0001638888888888889,
+      "loss": 0.0148,
+      "step": 706
+    },
+    {
+      "epoch": 0.3688054251434533,
+      "grad_norm": 0.03695458918809891,
+      "learning_rate": 0.00016333333333333334,
+      "loss": 0.022,
+      "step": 707
+    },
+    {
+      "epoch": 0.36932707355242567,
+      "grad_norm": 0.04076375067234039,
+      "learning_rate": 0.00016277777777777777,
+      "loss": 0.0267,
+      "step": 708
+    },
+    {
+      "epoch": 0.369848721961398,
+      "grad_norm": 0.06942404806613922,
+      "learning_rate": 0.0001622222222222222,
+      "loss": 0.0286,
+      "step": 709
+    },
+    {
+      "epoch": 0.37037037037037035,
+      "grad_norm": 0.04396967217326164,
+      "learning_rate": 0.00016166666666666665,
+      "loss": 0.0202,
+      "step": 710
+    },
+    {
+      "epoch": 0.37089201877934275,
+      "grad_norm": 0.03774530813097954,
+      "learning_rate": 0.0001611111111111111,
+      "loss": 0.025,
+      "step": 711
+    },
+    {
+      "epoch": 0.3714136671883151,
+      "grad_norm": 0.03641577437520027,
+      "learning_rate": 0.00016055555555555558,
+      "loss": 0.0232,
+      "step": 712
+    },
+    {
+      "epoch": 0.37193531559728743,
+      "grad_norm": 0.04867683723568916,
+      "learning_rate": 0.00016,
+      "loss": 0.0215,
+      "step": 713
+    },
+    {
+      "epoch": 0.37245696400625977,
+      "grad_norm": 0.03597405552864075,
+      "learning_rate": 0.00015944444444444445,
+      "loss": 0.0185,
+      "step": 714
+    },
+    {
+      "epoch": 0.3729786124152321,
+      "grad_norm": 0.11010299623012543,
+      "learning_rate": 0.0001588888888888889,
+      "loss": 0.0331,
+      "step": 715
+    },
+    {
+      "epoch": 0.3735002608242045,
+      "grad_norm": 0.0590214878320694,
+      "learning_rate": 0.00015833333333333332,
+      "loss": 0.0244,
+      "step": 716
+    },
+    {
+      "epoch": 0.37402190923317685,
+      "grad_norm": 0.046834152191877365,
+      "learning_rate": 0.00015777777777777776,
+      "loss": 0.0185,
+      "step": 717
+    },
+    {
+      "epoch": 0.3745435576421492,
+      "grad_norm": 0.05189788341522217,
+      "learning_rate": 0.00015722222222222225,
+      "loss": 0.0169,
+      "step": 718
+    },
+    {
+      "epoch": 0.37506520605112154,
+      "grad_norm": 0.0672694593667984,
+      "learning_rate": 0.0001566666666666667,
+      "loss": 0.0149,
+      "step": 719
+    },
+    {
+      "epoch": 0.3755868544600939,
+      "grad_norm": 0.04932631924748421,
+      "learning_rate": 0.00015611111111111113,
+      "loss": 0.0139,
+      "step": 720
+    },
+    {
+      "epoch": 0.3761085028690663,
+      "grad_norm": 0.02602981962263584,
+      "learning_rate": 0.00015555555555555556,
+      "loss": 0.0131,
+      "step": 721
+    },
+    {
+      "epoch": 0.3766301512780386,
+      "grad_norm": 0.04524467512965202,
+      "learning_rate": 0.000155,
+      "loss": 0.0131,
+      "step": 722
+    },
+    {
+      "epoch": 0.37715179968701096,
+      "grad_norm": 0.05330897122621536,
+      "learning_rate": 0.00015444444444444444,
+      "loss": 0.024,
+      "step": 723
+    },
+    {
+      "epoch": 0.3776734480959833,
+      "grad_norm": 0.06539610773324966,
+      "learning_rate": 0.00015388888888888887,
+      "loss": 0.0228,
+      "step": 724
+    },
+    {
+      "epoch": 0.37819509650495564,
+      "grad_norm": 0.050068728625774384,
+      "learning_rate": 0.00015333333333333334,
+      "loss": 0.0181,
+      "step": 725
+    },
+    {
+      "epoch": 0.37871674491392804,
+      "grad_norm": 0.05034396052360535,
+      "learning_rate": 0.0001527777777777778,
+      "loss": 0.019,
+      "step": 726
+    },
+    {
+      "epoch": 0.3792383933229004,
+      "grad_norm": 0.03978167846798897,
+      "learning_rate": 0.00015222222222222224,
+      "loss": 0.0118,
+      "step": 727
+    },
+    {
+      "epoch": 0.3797600417318727,
+      "grad_norm": 0.06867479532957077,
+      "learning_rate": 0.00015166666666666668,
+      "loss": 0.023,
+      "step": 728
+    },
+    {
+      "epoch": 0.38028169014084506,
+      "grad_norm": 0.06552889198064804,
+      "learning_rate": 0.0001511111111111111,
+      "loss": 0.0294,
+      "step": 729
+    },
+    {
+      "epoch": 0.3808033385498174,
+      "grad_norm": 0.04288003221154213,
+      "learning_rate": 0.00015055555555555555,
+      "loss": 0.0247,
+      "step": 730
+    },
+    {
+      "epoch": 0.3813249869587898,
+      "grad_norm": 0.03031757101416588,
+      "learning_rate": 0.00015,
+      "loss": 0.0179,
+      "step": 731
+    },
+    {
+      "epoch": 0.38184663536776214,
+      "grad_norm": 0.031997255980968475,
+      "learning_rate": 0.00014944444444444445,
+      "loss": 0.0174,
+      "step": 732
+    },
+    {
+      "epoch": 0.3823682837767345,
+      "grad_norm": 0.030668631196022034,
+      "learning_rate": 0.0001488888888888889,
+      "loss": 0.0189,
+      "step": 733
+    },
+    {
+      "epoch": 0.3828899321857068,
+      "grad_norm": 0.06306172162294388,
+      "learning_rate": 0.00014833333333333335,
+      "loss": 0.0111,
+      "step": 734
+    },
+    {
+      "epoch": 0.38341158059467917,
+      "grad_norm": 0.0309304092079401,
+      "learning_rate": 0.0001477777777777778,
+      "loss": 0.0207,
+      "step": 735
+    },
+    {
+      "epoch": 0.38393322900365157,
+      "grad_norm": 0.037320055067539215,
+      "learning_rate": 0.00014722222222222223,
+      "loss": 0.0164,
+      "step": 736
+    },
+    {
+      "epoch": 0.3844548774126239,
+      "grad_norm": 0.03826696798205376,
+      "learning_rate": 0.00014666666666666666,
+      "loss": 0.0235,
+      "step": 737
+    },
+    {
+      "epoch": 0.38497652582159625,
+      "grad_norm": 0.06731518357992172,
+      "learning_rate": 0.0001461111111111111,
+      "loss": 0.0261,
+      "step": 738
+    },
+    {
+      "epoch": 0.3854981742305686,
+      "grad_norm": 0.06984197348356247,
+      "learning_rate": 0.00014555555555555556,
+      "loss": 0.0247,
+      "step": 739
+    },
+    {
+      "epoch": 0.38601982263954093,
+      "grad_norm": 0.044138602912425995,
+      "learning_rate": 0.000145,
+      "loss": 0.0268,
+      "step": 740
+    },
+    {
+      "epoch": 0.3865414710485133,
+      "grad_norm": 0.02548188902437687,
+      "learning_rate": 0.00014444444444444444,
+      "loss": 0.0183,
+      "step": 741
+    },
+    {
+      "epoch": 0.38706311945748567,
+      "grad_norm": 0.040708597749471664,
+      "learning_rate": 0.0001438888888888889,
+      "loss": 0.0191,
+      "step": 742
+    },
+    {
+      "epoch": 0.387584767866458,
+      "grad_norm": 0.05437528342008591,
+      "learning_rate": 0.00014333333333333334,
+      "loss": 0.0149,
+      "step": 743
+    },
+    {
+      "epoch": 0.38810641627543035,
+      "grad_norm": 0.039601244032382965,
+      "learning_rate": 0.00014277777777777778,
+      "loss": 0.0237,
+      "step": 744
+    },
+    {
+      "epoch": 0.3886280646844027,
+      "grad_norm": 0.04201721027493477,
+      "learning_rate": 0.0001422222222222222,
+      "loss": 0.0227,
+      "step": 745
+    },
+    {
+      "epoch": 0.38914971309337504,
+      "grad_norm": 0.032314058393239975,
+      "learning_rate": 0.00014166666666666668,
+      "loss": 0.0181,
+      "step": 746
+    },
+    {
+      "epoch": 0.38967136150234744,
+      "grad_norm": 0.04769531264901161,
+      "learning_rate": 0.00014111111111111111,
+      "loss": 0.0181,
+      "step": 747
+    },
+    {
+      "epoch": 0.3901930099113198,
+      "grad_norm": 0.04128763824701309,
+      "learning_rate": 0.00014055555555555555,
+      "loss": 0.0222,
+      "step": 748
+    },
+    {
+      "epoch": 0.3907146583202921,
+      "grad_norm": 0.03540586680173874,
+      "learning_rate": 0.00014000000000000001,
+      "loss": 0.0208,
+      "step": 749
+    },
+    {
+      "epoch": 0.39123630672926446,
+      "grad_norm": 0.0364544615149498,
+      "learning_rate": 0.00013944444444444445,
+      "loss": 0.012,
+      "step": 750
+    },
+    {
+      "epoch": 0.3917579551382368,
+      "grad_norm": 0.07143893837928772,
+      "learning_rate": 0.0001388888888888889,
+      "loss": 0.0236,
+      "step": 751
+    },
+    {
+      "epoch": 0.3922796035472092,
+      "grad_norm": 0.03759491816163063,
+      "learning_rate": 0.00013833333333333333,
+      "loss": 0.0266,
+      "step": 752
+    },
+    {
+      "epoch": 0.39280125195618154,
+      "grad_norm": 0.05017688497900963,
+      "learning_rate": 0.0001377777777777778,
+      "loss": 0.0216,
+      "step": 753
+    },
+    {
+      "epoch": 0.3933229003651539,
+      "grad_norm": 0.05127080902457237,
+      "learning_rate": 0.00013722222222222223,
+      "loss": 0.0115,
+      "step": 754
+    },
+    {
+      "epoch": 0.3938445487741262,
+      "grad_norm": 0.04553140699863434,
+      "learning_rate": 0.00013666666666666666,
+      "loss": 0.0164,
+      "step": 755
+    },
+    {
+      "epoch": 0.39436619718309857,
+      "grad_norm": 0.04521241411566734,
+      "learning_rate": 0.0001361111111111111,
+      "loss": 0.0217,
+      "step": 756
+    },
+    {
+      "epoch": 0.39488784559207096,
+      "grad_norm": 0.029037892818450928,
+      "learning_rate": 0.00013555555555555556,
+      "loss": 0.024,
+      "step": 757
+    },
+    {
+      "epoch": 0.3954094940010433,
+      "grad_norm": 0.04166054353117943,
+      "learning_rate": 0.000135,
+      "loss": 0.0234,
+      "step": 758
+    },
+    {
+      "epoch": 0.39593114241001565,
+      "grad_norm": 0.043482281267642975,
+      "learning_rate": 0.00013444444444444447,
+      "loss": 0.0208,
+      "step": 759
+    },
+    {
+      "epoch": 0.396452790818988,
+      "grad_norm": 0.06500442326068878,
+      "learning_rate": 0.0001338888888888889,
+      "loss": 0.009,
+      "step": 760
+    },
+    {
+      "epoch": 0.39697443922796033,
+      "grad_norm": 0.042982008308172226,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.0143,
+      "step": 761
+    },
+    {
+      "epoch": 0.3974960876369327,
+      "grad_norm": 0.045418255031108856,
+      "learning_rate": 0.00013277777777777778,
+      "loss": 0.0218,
+      "step": 762
+    },
+    {
+      "epoch": 0.39801773604590507,
+      "grad_norm": 0.033716801553964615,
+      "learning_rate": 0.00013222222222222221,
+      "loss": 0.0204,
+      "step": 763
+    },
+    {
+      "epoch": 0.3985393844548774,
+      "grad_norm": 0.05949891358613968,
+      "learning_rate": 0.00013166666666666665,
+      "loss": 0.0205,
+      "step": 764
+    },
+    {
+      "epoch": 0.39906103286384975,
+      "grad_norm": 0.03850205987691879,
+      "learning_rate": 0.00013111111111111111,
+      "loss": 0.0165,
+      "step": 765
+    },
+    {
+      "epoch": 0.3995826812728221,
+      "grad_norm": 0.04359976202249527,
+      "learning_rate": 0.00013055555555555558,
+      "loss": 0.0204,
+      "step": 766
+    },
+    {
+      "epoch": 0.4001043296817945,
+      "grad_norm": 0.05350225791335106,
+      "learning_rate": 0.00013000000000000002,
+      "loss": 0.0262,
+      "step": 767
+    },
+    {
+      "epoch": 0.40062597809076683,
+      "grad_norm": 0.04726016893982887,
+      "learning_rate": 0.00012944444444444445,
+      "loss": 0.0107,
+      "step": 768
+    },
+    {
+      "epoch": 0.4011476264997392,
+      "grad_norm": 0.03490623086690903,
+      "learning_rate": 0.0001288888888888889,
+      "loss": 0.0163,
+      "step": 769
+    },
+    {
+      "epoch": 0.4016692749087115,
+      "grad_norm": 0.04292970895767212,
+      "learning_rate": 0.00012833333333333333,
+      "loss": 0.0193,
+      "step": 770
+    },
+    {
+      "epoch": 0.40219092331768386,
+      "grad_norm": 0.13282786309719086,
+      "learning_rate": 0.00012777777777777776,
+      "loss": 0.0298,
+      "step": 771
+    },
+    {
+      "epoch": 0.40271257172665625,
+      "grad_norm": 0.04199996963143349,
+      "learning_rate": 0.0001272222222222222,
+      "loss": 0.015,
+      "step": 772
+    },
+    {
+      "epoch": 0.4032342201356286,
+      "grad_norm": 0.061165716499090195,
+      "learning_rate": 0.0001266666666666667,
+      "loss": 0.0274,
+      "step": 773
+    },
+    {
+      "epoch": 0.40375586854460094,
+      "grad_norm": 0.08557713776826859,
+      "learning_rate": 0.00012611111111111113,
+      "loss": 0.0246,
+      "step": 774
+    },
+    {
+      "epoch": 0.4042775169535733,
+      "grad_norm": 0.034305475652217865,
+      "learning_rate": 0.00012555555555555557,
+      "loss": 0.0235,
+      "step": 775
+    },
+    {
+      "epoch": 0.4047991653625456,
+      "grad_norm": 0.04216960445046425,
+      "learning_rate": 0.000125,
+      "loss": 0.0218,
+      "step": 776
+    },
+    {
+      "epoch": 0.405320813771518,
+      "grad_norm": 0.07115976512432098,
+      "learning_rate": 0.00012444444444444444,
+      "loss": 0.0298,
+      "step": 777
+    },
+    {
+      "epoch": 0.40584246218049036,
+      "grad_norm": 0.06568846851587296,
+      "learning_rate": 0.0001238888888888889,
+      "loss": 0.0227,
+      "step": 778
+    },
+    {
+      "epoch": 0.4063641105894627,
+      "grad_norm": 0.04704579710960388,
+      "learning_rate": 0.00012333333333333334,
+      "loss": 0.0119,
+      "step": 779
+    },
+    {
+      "epoch": 0.40688575899843504,
+      "grad_norm": 0.0535866804420948,
+      "learning_rate": 0.00012277777777777778,
+      "loss": 0.0266,
+      "step": 780
+    },
+    {
+      "epoch": 0.4074074074074074,
+      "grad_norm": 0.06528276950120926,
+      "learning_rate": 0.00012222222222222221,
+      "loss": 0.0278,
+      "step": 781
+    },
+    {
+      "epoch": 0.4079290558163798,
+      "grad_norm": 0.039144959300756454,
+      "learning_rate": 0.00012166666666666668,
+      "loss": 0.0124,
+      "step": 782
+    },
+    {
+      "epoch": 0.4084507042253521,
+      "grad_norm": 0.04178335890173912,
+      "learning_rate": 0.00012111111111111112,
+      "loss": 0.0213,
+      "step": 783
+    },
+    {
+      "epoch": 0.40897235263432447,
+      "grad_norm": 0.041467975825071335,
+      "learning_rate": 0.00012055555555555555,
+      "loss": 0.0192,
+      "step": 784
+    },
+    {
+      "epoch": 0.4094940010432968,
+      "grad_norm": 0.03834041580557823,
+      "learning_rate": 0.00012,
+      "loss": 0.0195,
+      "step": 785
+    },
+    {
+      "epoch": 0.41001564945226915,
+      "grad_norm": 0.03077683411538601,
+      "learning_rate": 0.00011944444444444445,
+      "loss": 0.0196,
+      "step": 786
+    },
+    {
+      "epoch": 0.41053729786124155,
+      "grad_norm": 0.03556019440293312,
+      "learning_rate": 0.00011888888888888889,
+      "loss": 0.0226,
+      "step": 787
+    },
+    {
+      "epoch": 0.4110589462702139,
+      "grad_norm": 0.030661238357424736,
+      "learning_rate": 0.00011833333333333334,
+      "loss": 0.0223,
+      "step": 788
+    },
+    {
+      "epoch": 0.41158059467918623,
+      "grad_norm": 0.028493182733654976,
+      "learning_rate": 0.00011777777777777778,
+      "loss": 0.018,
+      "step": 789
+    },
+    {
+      "epoch": 0.41210224308815857,
+      "grad_norm": 0.02826119214296341,
+      "learning_rate": 0.00011722222222222223,
+      "loss": 0.0126,
+      "step": 790
+    },
+    {
+      "epoch": 0.4126238914971309,
+      "grad_norm": 0.04553418606519699,
+      "learning_rate": 0.00011666666666666667,
+      "loss": 0.0262,
+      "step": 791
+    },
+    {
+      "epoch": 0.4131455399061033,
+      "grad_norm": 0.0435551293194294,
+      "learning_rate": 0.00011611111111111112,
+      "loss": 0.0188,
+      "step": 792
+    },
+    {
+      "epoch": 0.41366718831507565,
+      "grad_norm": 0.05089694261550903,
+      "learning_rate": 0.00011555555555555555,
+      "loss": 0.0117,
+      "step": 793
+    },
+    {
+      "epoch": 0.414188836724048,
+      "grad_norm": 0.034144748002290726,
+      "learning_rate": 0.000115,
+      "loss": 0.0115,
+      "step": 794
+    },
+    {
+      "epoch": 0.41471048513302033,
+      "grad_norm": 0.06659701466560364,
+      "learning_rate": 0.00011444444444444445,
+      "loss": 0.0319,
+      "step": 795
+    },
+    {
+      "epoch": 0.4152321335419927,
+      "grad_norm": 0.05749977380037308,
+      "learning_rate": 0.00011388888888888889,
+      "loss": 0.0182,
+      "step": 796
+    },
+    {
+      "epoch": 0.4157537819509651,
+      "grad_norm": 0.040132977068424225,
+      "learning_rate": 0.00011333333333333333,
+      "loss": 0.0254,
+      "step": 797
+    },
+    {
+      "epoch": 0.4162754303599374,
+      "grad_norm": 0.028033824637532234,
+      "learning_rate": 0.00011277777777777778,
+      "loss": 0.0214,
+      "step": 798
+    },
+    {
+      "epoch": 0.41679707876890976,
+      "grad_norm": 0.03379250690340996,
+      "learning_rate": 0.00011222222222222223,
+      "loss": 0.0199,
+      "step": 799
+    },
+    {
+      "epoch": 0.4173187271778821,
+      "grad_norm": 0.056704215705394745,
+      "learning_rate": 0.00011166666666666667,
+      "loss": 0.0271,
+      "step": 800
+    },
+    {
+      "epoch": 0.41784037558685444,
+      "grad_norm": 0.10203065723180771,
+      "learning_rate": 0.0001111111111111111,
+      "loss": 0.0327,
+      "step": 801
+    },
+    {
+      "epoch": 0.41836202399582684,
+      "grad_norm": 0.030047588050365448,
+      "learning_rate": 0.00011055555555555557,
+      "loss": 0.0132,
+      "step": 802
+    },
+    {
+      "epoch": 0.4188836724047992,
+      "grad_norm": 0.040803615003824234,
+      "learning_rate": 0.00011,
+      "loss": 0.0244,
+      "step": 803
+    },
+    {
+      "epoch": 0.4194053208137715,
+      "grad_norm": 0.0868738442659378,
+      "learning_rate": 0.00010944444444444444,
+      "loss": 0.0292,
+      "step": 804
+    },
+    {
+      "epoch": 0.41992696922274386,
+      "grad_norm": 0.04418834671378136,
+      "learning_rate": 0.00010888888888888888,
+      "loss": 0.0203,
+      "step": 805
+    },
+    {
+      "epoch": 0.4204486176317162,
+      "grad_norm": 0.032158080488443375,
+      "learning_rate": 0.00010833333333333334,
+      "loss": 0.0205,
+      "step": 806
+    },
+    {
+      "epoch": 0.4209702660406886,
+      "grad_norm": 0.033949948847293854,
+      "learning_rate": 0.00010777777777777778,
+      "loss": 0.0118,
+      "step": 807
+    },
+    {
+      "epoch": 0.42149191444966094,
+      "grad_norm": 0.05019507557153702,
+      "learning_rate": 0.00010722222222222222,
+      "loss": 0.0226,
+      "step": 808
+    },
+    {
+      "epoch": 0.4220135628586333,
+      "grad_norm": 0.04622408375144005,
+      "learning_rate": 0.00010666666666666668,
+      "loss": 0.0208,
+      "step": 809
+    },
+    {
+      "epoch": 0.4225352112676056,
+      "grad_norm": 0.0362313911318779,
+      "learning_rate": 0.00010611111111111112,
+      "loss": 0.0181,
+      "step": 810
+    },
+    {
+      "epoch": 0.42305685967657797,
+      "grad_norm": 0.041581764817237854,
+      "learning_rate": 0.00010555555555555555,
+      "loss": 0.021,
+      "step": 811
+    },
+    {
+      "epoch": 0.42357850808555036,
+      "grad_norm": 0.06415802985429764,
+      "learning_rate": 0.000105,
+      "loss": 0.0241,
+      "step": 812
+    },
+    {
+      "epoch": 0.4241001564945227,
+      "grad_norm": 0.036627769470214844,
+      "learning_rate": 0.00010444444444444445,
+      "loss": 0.0241,
+      "step": 813
+    },
+    {
+      "epoch": 0.42462180490349505,
+      "grad_norm": 0.10887697339057922,
+      "learning_rate": 0.00010388888888888889,
+      "loss": 0.0328,
+      "step": 814
+    },
+    {
+      "epoch": 0.4251434533124674,
+      "grad_norm": 0.06744590401649475,
+      "learning_rate": 0.00010333333333333333,
+      "loss": 0.0155,
+      "step": 815
+    },
+    {
+      "epoch": 0.42566510172143973,
+      "grad_norm": 0.04498301446437836,
+      "learning_rate": 0.00010277777777777778,
+      "loss": 0.0124,
+      "step": 816
+    },
+    {
+      "epoch": 0.42618675013041213,
+      "grad_norm": 0.04123969003558159,
+      "learning_rate": 0.00010222222222222223,
+      "loss": 0.0209,
+      "step": 817
+    },
+    {
+      "epoch": 0.42670839853938447,
+      "grad_norm": 0.05105501785874367,
+      "learning_rate": 0.00010166666666666667,
+      "loss": 0.0214,
+      "step": 818
+    },
+    {
+      "epoch": 0.4272300469483568,
+      "grad_norm": 0.0853237509727478,
+      "learning_rate": 0.00010111111111111112,
+      "loss": 0.023,
+      "step": 819
+    },
+    {
+      "epoch": 0.42775169535732915,
+      "grad_norm": 0.03969847410917282,
+      "learning_rate": 0.00010055555555555555,
+      "loss": 0.0214,
+      "step": 820
+    },
+    {
+      "epoch": 0.4282733437663015,
+      "grad_norm": 0.028099317103624344,
+      "learning_rate": 0.0001,
+      "loss": 0.0193,
+      "step": 821
+    },
+    {
+      "epoch": 0.4287949921752739,
+      "grad_norm": 0.058374159038066864,
+      "learning_rate": 9.944444444444444e-05,
+      "loss": 0.0234,
+      "step": 822
+    },
+    {
+      "epoch": 0.42931664058424623,
+      "grad_norm": 0.07853943854570389,
+      "learning_rate": 9.888888888888889e-05,
+      "loss": 0.0301,
+      "step": 823
+    },
+    {
+      "epoch": 0.4298382889932186,
+      "grad_norm": 0.04180357977747917,
+      "learning_rate": 9.833333333333333e-05,
+      "loss": 0.0233,
+      "step": 824
+    },
+    {
+      "epoch": 0.4303599374021909,
+      "grad_norm": 0.046311184763908386,
+      "learning_rate": 9.777777777777778e-05,
+      "loss": 0.0219,
+      "step": 825
+    },
+    {
+      "epoch": 0.43088158581116326,
+      "grad_norm": 0.046774689108133316,
+      "learning_rate": 9.722222222222223e-05,
+      "loss": 0.0264,
+      "step": 826
+    },
+    {
+      "epoch": 0.43140323422013566,
+      "grad_norm": 0.06261669844388962,
+      "learning_rate": 9.666666666666667e-05,
+      "loss": 0.024,
+      "step": 827
+    },
+    {
+      "epoch": 0.431924882629108,
+      "grad_norm": 0.03115753084421158,
+      "learning_rate": 9.61111111111111e-05,
+      "loss": 0.0217,
+      "step": 828
+    },
+    {
+      "epoch": 0.43244653103808034,
+      "grad_norm": 0.054340995848178864,
+      "learning_rate": 9.555555555555557e-05,
+      "loss": 0.0181,
+      "step": 829
+    },
+    {
+      "epoch": 0.4329681794470527,
+      "grad_norm": 0.032995328307151794,
+      "learning_rate": 9.5e-05,
+      "loss": 0.0152,
+      "step": 830
+    },
+    {
+      "epoch": 0.433489827856025,
+      "grad_norm": 0.04482552409172058,
+      "learning_rate": 9.444444444444444e-05,
+      "loss": 0.0179,
+      "step": 831
+    },
+    {
+      "epoch": 0.43401147626499736,
+      "grad_norm": 0.03367936983704567,
+      "learning_rate": 9.388888888888888e-05,
+      "loss": 0.0205,
+      "step": 832
+    },
+    {
+      "epoch": 0.43453312467396976,
+      "grad_norm": 0.06362450867891312,
+      "learning_rate": 9.333333333333334e-05,
+      "loss": 0.0111,
+      "step": 833
+    },
+    {
+      "epoch": 0.4350547730829421,
+      "grad_norm": 0.04869982600212097,
+      "learning_rate": 9.277777777777778e-05,
+      "loss": 0.0259,
+      "step": 834
+    },
+    {
+      "epoch": 0.43557642149191445,
+      "grad_norm": 0.038218311965465546,
+      "learning_rate": 9.222222222222222e-05,
+      "loss": 0.0213,
+      "step": 835
+    },
+    {
+      "epoch": 0.4360980699008868,
+      "grad_norm": 0.0287641528993845,
+      "learning_rate": 9.166666666666667e-05,
+      "loss": 0.0202,
+      "step": 836
+    },
+    {
+      "epoch": 0.43661971830985913,
+      "grad_norm": 0.032309625297784805,
+      "learning_rate": 9.111111111111112e-05,
+      "loss": 0.0226,
+      "step": 837
+    },
+    {
+      "epoch": 0.4371413667188315,
+      "grad_norm": 0.031242694705724716,
+      "learning_rate": 9.055555555555556e-05,
+      "loss": 0.0209,
+      "step": 838
+    },
+    {
+      "epoch": 0.43766301512780387,
+      "grad_norm": 0.03623700514435768,
+      "learning_rate": 8.999999999999999e-05,
+      "loss": 0.0217,
+      "step": 839
+    },
+    {
+      "epoch": 0.4381846635367762,
+      "grad_norm": 0.04697221890091896,
+      "learning_rate": 8.944444444444444e-05,
+      "loss": 0.0235,
+      "step": 840
+    },
+    {
+      "epoch": 0.43870631194574855,
+      "grad_norm": 0.046198632568120956,
+      "learning_rate": 8.888888888888889e-05,
+      "loss": 0.0111,
+      "step": 841
+    },
+    {
+      "epoch": 0.4392279603547209,
+      "grad_norm": 0.05326194688677788,
+      "learning_rate": 8.833333333333333e-05,
+      "loss": 0.025,
+      "step": 842
+    },
+    {
+      "epoch": 0.4397496087636933,
+      "grad_norm": 0.052906736731529236,
+      "learning_rate": 8.777777777777778e-05,
+      "loss": 0.0204,
+      "step": 843
+    },
+    {
+      "epoch": 0.44027125717266563,
+      "grad_norm": 0.0386294387280941,
+      "learning_rate": 8.722222222222223e-05,
+      "loss": 0.0211,
+      "step": 844
+    },
+    {
+      "epoch": 0.440792905581638,
+      "grad_norm": 0.04268417879939079,
+      "learning_rate": 8.666666666666667e-05,
+      "loss": 0.0186,
+      "step": 845
+    },
+    {
+      "epoch": 0.4413145539906103,
+      "grad_norm": 0.07896760106086731,
+      "learning_rate": 8.611111111111112e-05,
+      "loss": 0.0295,
+      "step": 846
+    },
+    {
+      "epoch": 0.44183620239958266,
+      "grad_norm": 0.06323965638875961,
+      "learning_rate": 8.555555555555556e-05,
+      "loss": 0.0271,
+      "step": 847
+    },
+    {
+      "epoch": 0.44235785080855505,
+      "grad_norm": 0.045192502439022064,
+      "learning_rate": 8.5e-05,
+      "loss": 0.023,
+      "step": 848
+    },
+    {
+      "epoch": 0.4428794992175274,
+      "grad_norm": 0.03172018378973007,
+      "learning_rate": 8.444444444444444e-05,
+      "loss": 0.0211,
+      "step": 849
+    },
+    {
+      "epoch": 0.44340114762649974,
+      "grad_norm": 0.07090255618095398,
+      "learning_rate": 8.38888888888889e-05,
+      "loss": 0.0243,
+      "step": 850
+    },
+    {
+      "epoch": 0.4439227960354721,
+      "grad_norm": 0.04297306388616562,
+      "learning_rate": 8.333333333333333e-05,
+      "loss": 0.0232,
+      "step": 851
+    },
+    {
+      "epoch": 0.4444444444444444,
+      "grad_norm": 0.057149000465869904,
+      "learning_rate": 8.277777777777778e-05,
+      "loss": 0.0241,
+      "step": 852
+    },
+    {
+      "epoch": 0.4449660928534168,
+      "grad_norm": 0.037716496735811234,
+      "learning_rate": 8.222222222222223e-05,
+      "loss": 0.0196,
+      "step": 853
+    },
+    {
+      "epoch": 0.44548774126238916,
+      "grad_norm": 0.030622676014900208,
+      "learning_rate": 8.166666666666667e-05,
+      "loss": 0.0215,
+      "step": 854
+    },
+    {
+      "epoch": 0.4460093896713615,
+      "grad_norm": 0.03952807933092117,
+      "learning_rate": 8.11111111111111e-05,
+      "loss": 0.0189,
+      "step": 855
+    },
+    {
+      "epoch": 0.44653103808033384,
+      "grad_norm": 0.0395282618701458,
+      "learning_rate": 8.055555555555556e-05,
+      "loss": 0.0154,
+      "step": 856
+    },
+    {
+      "epoch": 0.4470526864893062,
+      "grad_norm": 0.02904989756643772,
+      "learning_rate": 8e-05,
+      "loss": 0.0233,
+      "step": 857
+    },
+    {
+      "epoch": 0.4475743348982786,
+      "grad_norm": 0.033239513635635376,
+      "learning_rate": 7.944444444444444e-05,
+      "loss": 0.0231,
+      "step": 858
+    },
+    {
+      "epoch": 0.4480959833072509,
+      "grad_norm": 0.04544852674007416,
+      "learning_rate": 7.888888888888888e-05,
+      "loss": 0.0226,
+      "step": 859
+    },
+    {
+      "epoch": 0.44861763171622326,
+      "grad_norm": 0.07550681382417679,
+      "learning_rate": 7.833333333333334e-05,
+      "loss": 0.0238,
+      "step": 860
+    },
+    {
+      "epoch": 0.4491392801251956,
+      "grad_norm": 0.06995867937803268,
+      "learning_rate": 7.777777777777778e-05,
+      "loss": 0.0284,
+      "step": 861
+    },
+    {
+      "epoch": 0.44966092853416795,
+      "grad_norm": 0.030304618179798126,
+      "learning_rate": 7.722222222222222e-05,
+      "loss": 0.0205,
+      "step": 862
+    },
+    {
+      "epoch": 0.45018257694314034,
+      "grad_norm": 0.04170898720622063,
+      "learning_rate": 7.666666666666667e-05,
+      "loss": 0.0232,
+      "step": 863
+    },
+    {
+      "epoch": 0.4507042253521127,
+      "grad_norm": 0.05225684493780136,
+      "learning_rate": 7.611111111111112e-05,
+      "loss": 0.0101,
+      "step": 864
+    },
+    {
+      "epoch": 0.45122587376108503,
+      "grad_norm": 0.0283154658973217,
+      "learning_rate": 7.555555555555556e-05,
+      "loss": 0.0176,
+      "step": 865
+    },
+    {
+      "epoch": 0.45174752217005737,
+      "grad_norm": 0.03328422084450722,
+      "learning_rate": 7.5e-05,
+      "loss": 0.0232,
+      "step": 866
+    },
+    {
+      "epoch": 0.4522691705790297,
+      "grad_norm": 0.025384338572621346,
+      "learning_rate": 7.444444444444444e-05,
+      "loss": 0.0144,
+      "step": 867
+    },
+    {
+      "epoch": 0.4527908189880021,
+      "grad_norm": 0.038840651512145996,
+      "learning_rate": 7.38888888888889e-05,
+      "loss": 0.0177,
+      "step": 868
+    },
+    {
+      "epoch": 0.45331246739697445,
+      "grad_norm": 0.039001982659101486,
+      "learning_rate": 7.333333333333333e-05,
+      "loss": 0.0114,
+      "step": 869
+    },
+    {
+      "epoch": 0.4538341158059468,
+      "grad_norm": 0.03812301158905029,
+      "learning_rate": 7.277777777777778e-05,
+      "loss": 0.0207,
+      "step": 870
+    },
+    {
+      "epoch": 0.45435576421491913,
+      "grad_norm": 0.04113636165857315,
+      "learning_rate": 7.222222222222222e-05,
+      "loss": 0.0194,
+      "step": 871
+    },
+    {
+      "epoch": 0.4548774126238915,
+      "grad_norm": 0.038486793637275696,
+      "learning_rate": 7.166666666666667e-05,
+      "loss": 0.0196,
+      "step": 872
+    },
+    {
+      "epoch": 0.45539906103286387,
+      "grad_norm": 0.03968929871916771,
+      "learning_rate": 7.11111111111111e-05,
+      "loss": 0.0203,
+      "step": 873
+    },
+    {
+      "epoch": 0.4559207094418362,
+      "grad_norm": 0.031823720782995224,
+      "learning_rate": 7.055555555555556e-05,
+      "loss": 0.0191,
+      "step": 874
+    },
+    {
+      "epoch": 0.45644235785080856,
+      "grad_norm": 0.04647579789161682,
+      "learning_rate": 7.000000000000001e-05,
+      "loss": 0.021,
+      "step": 875
+    },
+    {
+      "epoch": 0.4569640062597809,
+      "grad_norm": 0.07104521989822388,
+      "learning_rate": 6.944444444444444e-05,
+      "loss": 0.0226,
+      "step": 876
+    },
+    {
+      "epoch": 0.45748565466875324,
+      "grad_norm": 0.03164111077785492,
+      "learning_rate": 6.88888888888889e-05,
+      "loss": 0.0191,
+      "step": 877
+    },
+    {
+      "epoch": 0.45800730307772564,
+      "grad_norm": 0.05744309350848198,
+      "learning_rate": 6.833333333333333e-05,
+      "loss": 0.0209,
+      "step": 878
+    },
+    {
+      "epoch": 0.458528951486698,
+      "grad_norm": 0.0594148151576519,
+      "learning_rate": 6.777777777777778e-05,
+      "loss": 0.0226,
+      "step": 879
+    },
+    {
+      "epoch": 0.4590505998956703,
+      "grad_norm": 0.06638067960739136,
+      "learning_rate": 6.722222222222223e-05,
+      "loss": 0.0277,
+      "step": 880
+    },
+    {
+      "epoch": 0.45957224830464266,
+      "grad_norm": 0.07358410954475403,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.0295,
+      "step": 881
+    },
+    {
+      "epoch": 0.460093896713615,
+      "grad_norm": 0.04828910902142525,
+      "learning_rate": 6.611111111111111e-05,
+      "loss": 0.0229,
+      "step": 882
+    },
+    {
+      "epoch": 0.4606155451225874,
+      "grad_norm": 0.038606926798820496,
+      "learning_rate": 6.555555555555556e-05,
+      "loss": 0.0103,
+      "step": 883
+    },
+    {
+      "epoch": 0.46113719353155974,
+      "grad_norm": 0.03224022313952446,
+      "learning_rate": 6.500000000000001e-05,
+      "loss": 0.0197,
+      "step": 884
+    },
+    {
+      "epoch": 0.4616588419405321,
+      "grad_norm": 0.036784786731004715,
+      "learning_rate": 6.444444444444444e-05,
+      "loss": 0.0191,
+      "step": 885
+    },
+    {
+      "epoch": 0.4621804903495044,
+      "grad_norm": 0.033140335232019424,
+      "learning_rate": 6.388888888888888e-05,
+      "loss": 0.0202,
+      "step": 886
+    },
+    {
+      "epoch": 0.46270213875847677,
+      "grad_norm": 0.03425415977835655,
+      "learning_rate": 6.333333333333335e-05,
+      "loss": 0.0209,
+      "step": 887
+    },
+    {
+      "epoch": 0.46322378716744916,
+      "grad_norm": 0.045110322535037994,
+      "learning_rate": 6.277777777777778e-05,
+      "loss": 0.0208,
+      "step": 888
+    },
+    {
+      "epoch": 0.4637454355764215,
+      "grad_norm": 0.04547950625419617,
+      "learning_rate": 6.222222222222222e-05,
+      "loss": 0.0236,
+      "step": 889
+    },
+    {
+      "epoch": 0.46426708398539385,
+      "grad_norm": 0.045051079243421555,
+      "learning_rate": 6.166666666666667e-05,
+      "loss": 0.018,
+      "step": 890
+    },
+    {
+      "epoch": 0.4647887323943662,
+      "grad_norm": 0.04237956181168556,
+      "learning_rate": 6.111111111111111e-05,
+      "loss": 0.021,
+      "step": 891
+    },
+    {
+      "epoch": 0.46531038080333853,
+      "grad_norm": 0.042051415890455246,
+      "learning_rate": 6.055555555555556e-05,
+      "loss": 0.0168,
+      "step": 892
+    },
+    {
+      "epoch": 0.4658320292123109,
+      "grad_norm": 0.031854745000600815,
+      "learning_rate": 6e-05,
+      "loss": 0.0253,
+      "step": 893
+    },
+    {
+      "epoch": 0.46635367762128327,
+      "grad_norm": 0.04082592576742172,
+      "learning_rate": 5.9444444444444445e-05,
+      "loss": 0.024,
+      "step": 894
+    },
+    {
+      "epoch": 0.4668753260302556,
+      "grad_norm": 0.033791981637477875,
+      "learning_rate": 5.888888888888889e-05,
+      "loss": 0.0238,
+      "step": 895
+    },
+    {
+      "epoch": 0.46739697443922795,
+      "grad_norm": 0.04776583984494209,
+      "learning_rate": 5.833333333333333e-05,
+      "loss": 0.0253,
+      "step": 896
+    },
+    {
+      "epoch": 0.4679186228482003,
+      "grad_norm": 0.04382411763072014,
+      "learning_rate": 5.7777777777777776e-05,
+      "loss": 0.0129,
+      "step": 897
+    },
+    {
+      "epoch": 0.4684402712571727,
+      "grad_norm": 0.051006391644477844,
+      "learning_rate": 5.722222222222223e-05,
+      "loss": 0.0112,
+      "step": 898
+    },
+    {
+      "epoch": 0.46896191966614503,
+      "grad_norm": 0.03245469182729721,
+      "learning_rate": 5.6666666666666664e-05,
+      "loss": 0.0178,
+      "step": 899
+    },
+    {
+      "epoch": 0.4694835680751174,
+      "grad_norm": 0.0461103580892086,
+      "learning_rate": 5.6111111111111114e-05,
+      "loss": 0.0178,
+      "step": 900
+    },
+    {
+      "epoch": 0.4700052164840897,
+      "grad_norm": 0.047334831207990646,
+      "learning_rate": 5.555555555555555e-05,
+      "loss": 0.0221,
+      "step": 901
+    },
+    {
+      "epoch": 0.47052686489306206,
+      "grad_norm": 0.03664927929639816,
+      "learning_rate": 5.5e-05,
+      "loss": 0.0202,
+      "step": 902
+    },
+    {
+      "epoch": 0.47104851330203446,
+      "grad_norm": 0.023868730291724205,
+      "learning_rate": 5.444444444444444e-05,
+      "loss": 0.0115,
+      "step": 903
+    },
+    {
+      "epoch": 0.4715701617110068,
+      "grad_norm": 0.045030467212200165,
+      "learning_rate": 5.388888888888889e-05,
+      "loss": 0.0162,
+      "step": 904
+    },
+    {
+      "epoch": 0.47209181011997914,
+      "grad_norm": 0.03776118904352188,
+      "learning_rate": 5.333333333333334e-05,
+      "loss": 0.0197,
+      "step": 905
+    },
+    {
+      "epoch": 0.4726134585289515,
+      "grad_norm": 0.034501805901527405,
+      "learning_rate": 5.277777777777778e-05,
+      "loss": 0.025,
+      "step": 906
+    },
+    {
+      "epoch": 0.4731351069379238,
+      "grad_norm": 0.043729010969400406,
+      "learning_rate": 5.222222222222223e-05,
+      "loss": 0.0266,
+      "step": 907
+    },
+    {
+      "epoch": 0.4736567553468962,
+      "grad_norm": 0.10858713090419769,
+      "learning_rate": 5.1666666666666664e-05,
+      "loss": 0.03,
+      "step": 908
+    },
+    {
+      "epoch": 0.47417840375586856,
+      "grad_norm": 0.04872022941708565,
+      "learning_rate": 5.1111111111111115e-05,
+      "loss": 0.0112,
+      "step": 909
+    },
+    {
+      "epoch": 0.4747000521648409,
+      "grad_norm": 0.06106869503855705,
+      "learning_rate": 5.055555555555556e-05,
+      "loss": 0.0146,
+      "step": 910
+    },
+    {
+      "epoch": 0.47522170057381324,
+      "grad_norm": 0.07267069071531296,
+      "learning_rate": 5e-05,
+      "loss": 0.0299,
+      "step": 911
+    },
+    {
+      "epoch": 0.4757433489827856,
+      "grad_norm": 0.0418294258415699,
+      "learning_rate": 4.9444444444444446e-05,
+      "loss": 0.0237,
+      "step": 912
+    },
+    {
+      "epoch": 0.476264997391758,
+      "grad_norm": 0.03087545745074749,
+      "learning_rate": 4.888888888888889e-05,
+      "loss": 0.0118,
+      "step": 913
+    },
+    {
+      "epoch": 0.4767866458007303,
+      "grad_norm": 0.028627268970012665,
+      "learning_rate": 4.8333333333333334e-05,
+      "loss": 0.0223,
+      "step": 914
+    },
+    {
+      "epoch": 0.47730829420970267,
+      "grad_norm": 0.0332285612821579,
+      "learning_rate": 4.7777777777777784e-05,
+      "loss": 0.0207,
+      "step": 915
+    },
+    {
+      "epoch": 0.477829942618675,
+      "grad_norm": 0.034126222133636475,
+      "learning_rate": 4.722222222222222e-05,
+      "loss": 0.0109,
+      "step": 916
+    },
+    {
+      "epoch": 0.47835159102764735,
+      "grad_norm": 0.04092434421181679,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.0196,
+      "step": 917
+    },
+    {
+      "epoch": 0.4788732394366197,
+      "grad_norm": 0.03659692779183388,
+      "learning_rate": 4.611111111111111e-05,
+      "loss": 0.0158,
+      "step": 918
+    },
+    {
+      "epoch": 0.4793948878455921,
+      "grad_norm": 0.029037335887551308,
+      "learning_rate": 4.555555555555556e-05,
+      "loss": 0.0127,
+      "step": 919
+    },
+    {
+      "epoch": 0.47991653625456443,
+      "grad_norm": 0.03405088558793068,
+      "learning_rate": 4.4999999999999996e-05,
+      "loss": 0.02,
+      "step": 920
+    },
+    {
+      "epoch": 0.48043818466353677,
+      "grad_norm": 0.04277755320072174,
+      "learning_rate": 4.4444444444444447e-05,
+      "loss": 0.0153,
+      "step": 921
+    },
+    {
+      "epoch": 0.4809598330725091,
+      "grad_norm": 0.03835497051477432,
+      "learning_rate": 4.388888888888889e-05,
+      "loss": 0.0231,
+      "step": 922
+    },
+    {
+      "epoch": 0.48148148148148145,
+      "grad_norm": 0.04431463032960892,
+      "learning_rate": 4.3333333333333334e-05,
+      "loss": 0.0194,
+      "step": 923
+    },
+    {
+      "epoch": 0.48200312989045385,
+      "grad_norm": 0.04221165552735329,
+      "learning_rate": 4.277777777777778e-05,
+      "loss": 0.024,
+      "step": 924
+    },
+    {
+      "epoch": 0.4825247782994262,
+      "grad_norm": 0.09658390283584595,
+      "learning_rate": 4.222222222222222e-05,
+      "loss": 0.0292,
+      "step": 925
+    },
+    {
+      "epoch": 0.48304642670839854,
+      "grad_norm": 0.061060868203639984,
+      "learning_rate": 4.1666666666666665e-05,
+      "loss": 0.028,
+      "step": 926
+    },
+    {
+      "epoch": 0.4835680751173709,
+      "grad_norm": 0.052690718322992325,
+      "learning_rate": 4.1111111111111116e-05,
+      "loss": 0.022,
+      "step": 927
+    },
+    {
+      "epoch": 0.4840897235263432,
+      "grad_norm": 0.04050262272357941,
+      "learning_rate": 4.055555555555555e-05,
+      "loss": 0.0163,
+      "step": 928
+    },
+    {
+      "epoch": 0.4846113719353156,
+      "grad_norm": 0.03896137326955795,
+      "learning_rate": 4e-05,
+      "loss": 0.0182,
+      "step": 929
+    },
+    {
+      "epoch": 0.48513302034428796,
+      "grad_norm": 0.04256868362426758,
+      "learning_rate": 3.944444444444444e-05,
+      "loss": 0.0188,
+      "step": 930
+    },
+    {
+      "epoch": 0.4856546687532603,
+      "grad_norm": 0.02733660116791725,
+      "learning_rate": 3.888888888888889e-05,
+      "loss": 0.0222,
+      "step": 931
+    },
+    {
+      "epoch": 0.48617631716223264,
+      "grad_norm": 0.05308909714221954,
+      "learning_rate": 3.8333333333333334e-05,
+      "loss": 0.0192,
+      "step": 932
+    },
+    {
+      "epoch": 0.486697965571205,
+      "grad_norm": 0.02588343620300293,
+      "learning_rate": 3.777777777777778e-05,
+      "loss": 0.0133,
+      "step": 933
+    },
+    {
+      "epoch": 0.4872196139801774,
+      "grad_norm": 0.03014085628092289,
+      "learning_rate": 3.722222222222222e-05,
+      "loss": 0.0118,
+      "step": 934
+    },
+    {
+      "epoch": 0.4877412623891497,
+      "grad_norm": 0.043832968920469284,
+      "learning_rate": 3.6666666666666666e-05,
+      "loss": 0.0147,
+      "step": 935
+    },
+    {
+      "epoch": 0.48826291079812206,
+      "grad_norm": 0.054664142429828644,
+      "learning_rate": 3.611111111111111e-05,
+      "loss": 0.0109,
+      "step": 936
+    },
+    {
+      "epoch": 0.4887845592070944,
+      "grad_norm": 0.06583913415670395,
+      "learning_rate": 3.555555555555555e-05,
+      "loss": 0.028,
+      "step": 937
+    },
+    {
+      "epoch": 0.48930620761606675,
+      "grad_norm": 0.06476892530918121,
+      "learning_rate": 3.5000000000000004e-05,
+      "loss": 0.0286,
+      "step": 938
+    },
+    {
+      "epoch": 0.48982785602503914,
+      "grad_norm": 0.056334152817726135,
+      "learning_rate": 3.444444444444445e-05,
+      "loss": 0.0172,
+      "step": 939
+    },
+    {
+      "epoch": 0.4903495044340115,
+      "grad_norm": 0.04867906495928764,
+      "learning_rate": 3.388888888888889e-05,
+      "loss": 0.0196,
+      "step": 940
+    },
+    {
+      "epoch": 0.4908711528429838,
+      "grad_norm": 0.03338843584060669,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 0.0201,
+      "step": 941
+    },
+    {
+      "epoch": 0.49139280125195617,
+      "grad_norm": 0.03972908481955528,
+      "learning_rate": 3.277777777777778e-05,
+      "loss": 0.0223,
+      "step": 942
+    },
+    {
+      "epoch": 0.4919144496609285,
+      "grad_norm": 0.03122125379741192,
+      "learning_rate": 3.222222222222222e-05,
+      "loss": 0.0181,
+      "step": 943
+    },
+    {
+      "epoch": 0.4924360980699009,
+      "grad_norm": 0.03441190719604492,
+      "learning_rate": 3.166666666666667e-05,
+      "loss": 0.0178,
+      "step": 944
+    },
+    {
+      "epoch": 0.49295774647887325,
+      "grad_norm": 0.024923836812376976,
+      "learning_rate": 3.111111111111111e-05,
+      "loss": 0.017,
+      "step": 945
+    },
+    {
+      "epoch": 0.4934793948878456,
+      "grad_norm": 0.09072909504175186,
+      "learning_rate": 3.0555555555555554e-05,
+      "loss": 0.0322,
+      "step": 946
+    },
+    {
+      "epoch": 0.49400104329681793,
+      "grad_norm": 0.04489835724234581,
+      "learning_rate": 3e-05,
+      "loss": 0.0216,
+      "step": 947
+    },
+    {
+      "epoch": 0.4945226917057903,
+      "grad_norm": 0.03687606751918793,
+      "learning_rate": 2.9444444444444445e-05,
+      "loss": 0.0202,
+      "step": 948
+    },
+    {
+      "epoch": 0.49504434011476267,
+      "grad_norm": 0.04640423133969307,
+      "learning_rate": 2.8888888888888888e-05,
+      "loss": 0.0152,
+      "step": 949
+    },
+    {
+      "epoch": 0.495565988523735,
+      "grad_norm": 0.07281932234764099,
+      "learning_rate": 2.8333333333333332e-05,
+      "loss": 0.0341,
+      "step": 950
+    },
+    {
+      "epoch": 0.49608763693270735,
+      "grad_norm": 0.047392841428518295,
+      "learning_rate": 2.7777777777777776e-05,
+      "loss": 0.0154,
+      "step": 951
+    },
+    {
+      "epoch": 0.4966092853416797,
+      "grad_norm": 0.041269198060035706,
+      "learning_rate": 2.722222222222222e-05,
+      "loss": 0.0208,
+      "step": 952
+    },
+    {
+      "epoch": 0.49713093375065204,
+      "grad_norm": 0.028984997421503067,
+      "learning_rate": 2.666666666666667e-05,
+      "loss": 0.0186,
+      "step": 953
+    },
+    {
+      "epoch": 0.49765258215962443,
+      "grad_norm": 0.03882346302270889,
+      "learning_rate": 2.6111111111111114e-05,
+      "loss": 0.0182,
+      "step": 954
+    },
+    {
+      "epoch": 0.4981742305685968,
+      "grad_norm": 0.050562262535095215,
+      "learning_rate": 2.5555555555555557e-05,
+      "loss": 0.0182,
+      "step": 955
+    },
+    {
+      "epoch": 0.4986958789775691,
+      "grad_norm": 0.03481961786746979,
+      "learning_rate": 2.5e-05,
+      "loss": 0.0191,
+      "step": 956
+    },
+    {
+      "epoch": 0.49921752738654146,
+      "grad_norm": 0.06581820547580719,
+      "learning_rate": 2.4444444444444445e-05,
+      "loss": 0.0234,
+      "step": 957
+    },
+    {
+      "epoch": 0.4997391757955138,
+      "grad_norm": 0.020346660166978836,
+      "learning_rate": 2.3888888888888892e-05,
+      "loss": 0.0119,
+      "step": 958
+    },
+    {
+      "epoch": 0.5002608242044861,
+      "grad_norm": 0.04314263164997101,
+      "learning_rate": 2.3333333333333336e-05,
+      "loss": 0.0205,
+      "step": 959
+    },
+    {
+      "epoch": 0.5007824726134585,
+      "grad_norm": 0.0439009889960289,
+      "learning_rate": 2.277777777777778e-05,
+      "loss": 0.0194,
+      "step": 960
+    },
+    {
+      "epoch": 0.5013041210224308,
+      "grad_norm": 0.02872501127421856,
+      "learning_rate": 2.2222222222222223e-05,
+      "loss": 0.024,
+      "step": 961
+    },
+    {
+      "epoch": 0.5018257694314032,
+      "grad_norm": 0.07443469762802124,
+      "learning_rate": 2.1666666666666667e-05,
+      "loss": 0.0281,
+      "step": 962
+    },
+    {
+      "epoch": 0.5023474178403756,
+      "grad_norm": 0.04083111509680748,
+      "learning_rate": 2.111111111111111e-05,
+      "loss": 0.0192,
+      "step": 963
+    },
+    {
+      "epoch": 0.5028690662493479,
+      "grad_norm": 0.08003074675798416,
+      "learning_rate": 2.0555555555555558e-05,
+      "loss": 0.025,
+      "step": 964
+    },
+    {
+      "epoch": 0.5033907146583203,
+      "grad_norm": 0.05287975072860718,
+      "learning_rate": 2e-05,
+      "loss": 0.0237,
+      "step": 965
+    },
+    {
+      "epoch": 0.5039123630672926,
+      "grad_norm": 0.033477749675512314,
+      "learning_rate": 1.9444444444444445e-05,
+      "loss": 0.0221,
+      "step": 966
+    },
+    {
+      "epoch": 0.504434011476265,
+      "grad_norm": 0.06147143989801407,
+      "learning_rate": 1.888888888888889e-05,
+      "loss": 0.0139,
+      "step": 967
+    },
+    {
+      "epoch": 0.5049556598852374,
+      "grad_norm": 0.031191756948828697,
+      "learning_rate": 1.8333333333333333e-05,
+      "loss": 0.0201,
+      "step": 968
+    },
+    {
+      "epoch": 0.5054773082942097,
+      "grad_norm": 0.054284486919641495,
+      "learning_rate": 1.7777777777777777e-05,
+      "loss": 0.0194,
+      "step": 969
+    },
+    {
+      "epoch": 0.5059989567031821,
+      "grad_norm": 0.03909388557076454,
+      "learning_rate": 1.7222222222222224e-05,
+      "loss": 0.025,
+      "step": 970
+    },
+    {
+      "epoch": 0.5065206051121544,
+      "grad_norm": 0.04850156232714653,
+      "learning_rate": 1.6666666666666667e-05,
+      "loss": 0.0226,
+      "step": 971
+    },
+    {
+      "epoch": 0.5070422535211268,
+      "grad_norm": 0.058491021394729614,
+      "learning_rate": 1.611111111111111e-05,
+      "loss": 0.0101,
+      "step": 972
+    },
+    {
+      "epoch": 0.5075639019300991,
+      "grad_norm": 0.0658450648188591,
+      "learning_rate": 1.5555555555555555e-05,
+      "loss": 0.0195,
+      "step": 973
+    },
+    {
+      "epoch": 0.5080855503390714,
+      "grad_norm": 0.03677137568593025,
+      "learning_rate": 1.5e-05,
+      "loss": 0.0173,
+      "step": 974
+    },
+    {
+      "epoch": 0.5086071987480438,
+      "grad_norm": 0.06385330110788345,
+      "learning_rate": 1.4444444444444444e-05,
+      "loss": 0.0244,
+      "step": 975
+    },
+    {
+      "epoch": 0.5091288471570161,
+      "grad_norm": 0.028939418494701385,
+      "learning_rate": 1.3888888888888888e-05,
+      "loss": 0.0178,
+      "step": 976
+    },
+    {
+      "epoch": 0.5096504955659885,
+      "grad_norm": 0.04115728661417961,
+      "learning_rate": 1.3333333333333335e-05,
+      "loss": 0.0235,
+      "step": 977
+    },
+    {
+      "epoch": 0.5101721439749609,
+      "grad_norm": 0.06474485248327255,
+      "learning_rate": 1.2777777777777779e-05,
+      "loss": 0.0288,
+      "step": 978
+    },
+    {
+      "epoch": 0.5106937923839332,
+      "grad_norm": 0.04489276930689812,
+      "learning_rate": 1.2222222222222222e-05,
+      "loss": 0.0244,
+      "step": 979
+    },
+    {
+      "epoch": 0.5112154407929056,
+      "grad_norm": 0.04168180748820305,
+      "learning_rate": 1.1666666666666668e-05,
+      "loss": 0.0236,
+      "step": 980
+    },
+    {
+      "epoch": 0.5117370892018779,
+      "grad_norm": 0.03408731147646904,
+      "learning_rate": 1.1111111111111112e-05,
+      "loss": 0.0176,
+      "step": 981
+    },
+    {
+      "epoch": 0.5122587376108503,
+      "grad_norm": 0.025826234370470047,
+      "learning_rate": 1.0555555555555555e-05,
+      "loss": 0.0169,
+      "step": 982
+    },
+    {
+      "epoch": 0.5127803860198227,
+      "grad_norm": 0.08169863373041153,
+      "learning_rate": 1e-05,
+      "loss": 0.0199,
+      "step": 983
+    },
+    {
+      "epoch": 0.513302034428795,
+      "grad_norm": 0.09390078485012054,
+      "learning_rate": 9.444444444444445e-06,
+      "loss": 0.0296,
+      "step": 984
+    },
+    {
+      "epoch": 0.5138236828377674,
+      "grad_norm": 0.04345531761646271,
+      "learning_rate": 8.888888888888888e-06,
+      "loss": 0.0256,
+      "step": 985
+    },
+    {
+      "epoch": 0.5143453312467396,
+      "grad_norm": 0.029856091365218163,
+      "learning_rate": 8.333333333333334e-06,
+      "loss": 0.0248,
+      "step": 986
+    },
+    {
+      "epoch": 0.514866979655712,
+      "grad_norm": 0.057649165391922,
+      "learning_rate": 7.777777777777777e-06,
+      "loss": 0.0096,
+      "step": 987
+    },
+    {
+      "epoch": 0.5153886280646844,
+      "grad_norm": 0.03002311848104,
+      "learning_rate": 7.222222222222222e-06,
+      "loss": 0.0166,
+      "step": 988
+    },
+    {
+      "epoch": 0.5159102764736567,
+      "grad_norm": 0.05182252824306488,
+      "learning_rate": 6.6666666666666675e-06,
+      "loss": 0.0265,
+      "step": 989
+    },
+    {
+      "epoch": 0.5164319248826291,
+      "grad_norm": 0.069261334836483,
+      "learning_rate": 6.111111111111111e-06,
+      "loss": 0.0257,
+      "step": 990
+    },
+    {
+      "epoch": 0.5169535732916014,
+      "grad_norm": 0.024985987693071365,
+      "learning_rate": 5.555555555555556e-06,
+      "loss": 0.0212,
+      "step": 991
+    },
+    {
+      "epoch": 0.5174752217005738,
+      "grad_norm": 0.03711266443133354,
+      "learning_rate": 5e-06,
+      "loss": 0.0196,
+      "step": 992
+    },
+    {
+      "epoch": 0.5179968701095462,
+      "grad_norm": 0.041288308799266815,
+      "learning_rate": 4.444444444444444e-06,
+      "loss": 0.0225,
+      "step": 993
+    },
+    {
+      "epoch": 0.5185185185185185,
+      "grad_norm": 0.05295979604125023,
+      "learning_rate": 3.888888888888889e-06,
+      "loss": 0.0227,
+      "step": 994
+    },
+    {
+      "epoch": 0.5190401669274909,
+      "grad_norm": 0.02941884472966194,
+      "learning_rate": 3.3333333333333337e-06,
+      "loss": 0.0189,
+      "step": 995
+    },
+    {
+      "epoch": 0.5195618153364632,
+      "grad_norm": 0.04401032626628876,
+      "learning_rate": 2.777777777777778e-06,
+      "loss": 0.0222,
+      "step": 996
+    },
+    {
+      "epoch": 0.5200834637454356,
+      "grad_norm": 0.03393331170082092,
+      "learning_rate": 2.222222222222222e-06,
+      "loss": 0.0199,
+      "step": 997
+    },
+    {
+      "epoch": 0.520605112154408,
+      "grad_norm": 0.08748078346252441,
+      "learning_rate": 1.6666666666666669e-06,
+      "loss": 0.0312,
+      "step": 998
+    },
+    {
+      "epoch": 0.5211267605633803,
+      "grad_norm": 0.029830964282155037,
+      "learning_rate": 1.111111111111111e-06,
+      "loss": 0.0189,
+      "step": 999
+    },
+    {
+      "epoch": 0.5216484089723527,
+      "grad_norm": 0.06009744107723236,
+      "learning_rate": 5.555555555555555e-07,
+      "loss": 0.0088,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 512,
+  "trial_name": null,
+  "trial_params": null
+}