diff --git "a/checkpoint-1000/trainer_state.json" "b/checkpoint-1000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoint-1000/trainer_state.json"
@@ -0,0 +1,7034 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.26048450117218025,
+  "eval_steps": 500,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00026048450117218026,
+      "grad_norm": 6.6370320320129395,
+      "learning_rate": 0.0,
+      "loss": 1.5389,
+      "step": 1
+    },
+    {
+      "epoch": 0.0005209690023443605,
+      "grad_norm": 7.1702704429626465,
+      "learning_rate": 5e-06,
+      "loss": 1.3234,
+      "step": 2
+    },
+    {
+      "epoch": 0.0007814535035165407,
+      "grad_norm": 8.348443984985352,
+      "learning_rate": 1e-05,
+      "loss": 1.2501,
+      "step": 3
+    },
+    {
+      "epoch": 0.001041938004688721,
+      "grad_norm": 7.599966526031494,
+      "learning_rate": 1.5e-05,
+      "loss": 1.4676,
+      "step": 4
+    },
+    {
+      "epoch": 0.0013024225058609013,
+      "grad_norm": 5.925275802612305,
+      "learning_rate": 2e-05,
+      "loss": 1.1892,
+      "step": 5
+    },
+    {
+      "epoch": 0.0015629070070330815,
+      "grad_norm": 7.288003921508789,
+      "learning_rate": 2.5e-05,
+      "loss": 1.492,
+      "step": 6
+    },
+    {
+      "epoch": 0.0018233915082052619,
+      "grad_norm": 7.2629218101501465,
+      "learning_rate": 3e-05,
+      "loss": 1.3525,
+      "step": 7
+    },
+    {
+      "epoch": 0.002083876009377442,
+      "grad_norm": 9.106829643249512,
+      "learning_rate": 3.5000000000000004e-05,
+      "loss": 1.6033,
+      "step": 8
+    },
+    {
+      "epoch": 0.0023443605105496223,
+      "grad_norm": 8.875594139099121,
+      "learning_rate": 4e-05,
+      "loss": 1.5238,
+      "step": 9
+    },
+    {
+      "epoch": 0.0026048450117218025,
+      "grad_norm": 7.383709907531738,
+      "learning_rate": 4.4999999999999996e-05,
+      "loss": 1.4859,
+      "step": 10
+    },
+    {
+      "epoch": 0.0028653295128939827,
+      "grad_norm": 7.463179111480713,
+      "learning_rate": 5e-05,
+      "loss": 1.3504,
+      "step": 11
+    },
+    {
+      "epoch": 0.003125814014066163,
+      "grad_norm": 7.137135028839111,
+      "learning_rate": 5.5e-05,
+      "loss": 1.358,
+      "step": 12
+    },
+    {
+      "epoch": 0.003386298515238343,
+      "grad_norm": 5.68809175491333,
+      "learning_rate": 6e-05,
+      "loss": 1.2017,
+      "step": 13
+    },
+    {
+      "epoch": 0.0036467830164105238,
+      "grad_norm": 6.024169921875,
+      "learning_rate": 6.500000000000001e-05,
+      "loss": 1.1491,
+      "step": 14
+    },
+    {
+      "epoch": 0.003907267517582704,
+      "grad_norm": 5.510103225708008,
+      "learning_rate": 7.000000000000001e-05,
+      "loss": 1.1809,
+      "step": 15
+    },
+    {
+      "epoch": 0.004167752018754884,
+      "grad_norm": 6.086293697357178,
+      "learning_rate": 7.5e-05,
+      "loss": 1.2069,
+      "step": 16
+    },
+    {
+      "epoch": 0.004428236519927064,
+      "grad_norm": 5.8847551345825195,
+      "learning_rate": 8e-05,
+      "loss": 1.2933,
+      "step": 17
+    },
+    {
+      "epoch": 0.004688721021099245,
+      "grad_norm": 5.263647079467773,
+      "learning_rate": 8.5e-05,
+      "loss": 1.0476,
+      "step": 18
+    },
+    {
+      "epoch": 0.004949205522271425,
+      "grad_norm": 5.684865951538086,
+      "learning_rate": 8.999999999999999e-05,
+      "loss": 1.1442,
+      "step": 19
+    },
+    {
+      "epoch": 0.005209690023443605,
+      "grad_norm": 4.671970844268799,
+      "learning_rate": 9.5e-05,
+      "loss": 1.0422,
+      "step": 20
+    },
+    {
+      "epoch": 0.005470174524615785,
+      "grad_norm": 7.935784816741943,
+      "learning_rate": 0.0001,
+      "loss": 1.1025,
+      "step": 21
+    },
+    {
+      "epoch": 0.0057306590257879654,
+      "grad_norm": 4.634947299957275,
+      "learning_rate": 0.000105,
+      "loss": 0.9849,
+      "step": 22
+    },
+    {
+      "epoch": 0.005991143526960146,
+      "grad_norm": 4.8161516189575195,
+      "learning_rate": 0.00011,
+      "loss": 0.9843,
+      "step": 23
+    },
+    {
+      "epoch": 0.006251628028132326,
+      "grad_norm": 4.3339762687683105,
+      "learning_rate": 0.000115,
+      "loss": 0.8714,
+      "step": 24
+    },
+    {
+      "epoch": 0.006512112529304506,
+      "grad_norm": 3.4047181606292725,
+      "learning_rate": 0.00012,
+      "loss": 0.8898,
+      "step": 25
+    },
+    {
+      "epoch": 0.006772597030476686,
+      "grad_norm": 4.15224552154541,
+      "learning_rate": 0.000125,
+      "loss": 1.0079,
+      "step": 26
+    },
+    {
+      "epoch": 0.0070330815316488665,
+      "grad_norm": 3.5006914138793945,
+      "learning_rate": 0.00013000000000000002,
+      "loss": 0.8013,
+      "step": 27
+    },
+    {
+      "epoch": 0.0072935660328210476,
+      "grad_norm": 2.773101806640625,
+      "learning_rate": 0.000135,
+      "loss": 0.7086,
+      "step": 28
+    },
+    {
+      "epoch": 0.007554050533993228,
+      "grad_norm": 2.4410135746002197,
+      "learning_rate": 0.00014000000000000001,
+      "loss": 0.7198,
+      "step": 29
+    },
+    {
+      "epoch": 0.007814535035165408,
+      "grad_norm": 2.5674309730529785,
+      "learning_rate": 0.000145,
+      "loss": 0.6359,
+      "step": 30
+    },
+    {
+      "epoch": 0.008075019536337588,
+      "grad_norm": 2.310837984085083,
+      "learning_rate": 0.00015,
+      "loss": 0.6039,
+      "step": 31
+    },
+    {
+      "epoch": 0.008335504037509768,
+      "grad_norm": 2.4884161949157715,
+      "learning_rate": 0.000155,
+      "loss": 0.8962,
+      "step": 32
+    },
+    {
+      "epoch": 0.008595988538681949,
+      "grad_norm": 5.428861141204834,
+      "learning_rate": 0.00016,
+      "loss": 0.5576,
+      "step": 33
+    },
+    {
+      "epoch": 0.008856473039854129,
+      "grad_norm": 2.035452127456665,
+      "learning_rate": 0.000165,
+      "loss": 0.5866,
+      "step": 34
+    },
+    {
+      "epoch": 0.009116957541026309,
+      "grad_norm": 4.757160663604736,
+      "learning_rate": 0.00017,
+      "loss": 0.4413,
+      "step": 35
+    },
+    {
+      "epoch": 0.00937744204219849,
+      "grad_norm": 2.8071913719177246,
+      "learning_rate": 0.000175,
+      "loss": 0.7013,
+      "step": 36
+    },
+    {
+      "epoch": 0.00963792654337067,
+      "grad_norm": 3.3390369415283203,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 0.348,
+      "step": 37
+    },
+    {
+      "epoch": 0.00989841104454285,
+      "grad_norm": 2.469451665878296,
+      "learning_rate": 0.000185,
+      "loss": 0.72,
+      "step": 38
+    },
+    {
+      "epoch": 0.01015889554571503,
+      "grad_norm": 2.7830817699432373,
+      "learning_rate": 0.00019,
+      "loss": 0.671,
+      "step": 39
+    },
+    {
+      "epoch": 0.01041938004688721,
+      "grad_norm": 3.005566358566284,
+      "learning_rate": 0.00019500000000000002,
+      "loss": 0.5808,
+      "step": 40
+    },
+    {
+      "epoch": 0.01067986454805939,
+      "grad_norm": 2.8901026248931885,
+      "learning_rate": 0.0002,
+      "loss": 0.6022,
+      "step": 41
+    },
+    {
+      "epoch": 0.01094034904923157,
+      "grad_norm": 2.004911422729492,
+      "learning_rate": 0.000205,
+      "loss": 0.525,
+      "step": 42
+    },
+    {
+      "epoch": 0.01120083355040375,
+      "grad_norm": 2.9986109733581543,
+      "learning_rate": 0.00021,
+      "loss": 0.6073,
+      "step": 43
+    },
+    {
+      "epoch": 0.011461318051575931,
+      "grad_norm": 3.4304168224334717,
+      "learning_rate": 0.000215,
+      "loss": 0.5203,
+      "step": 44
+    },
+    {
+      "epoch": 0.011721802552748111,
+      "grad_norm": 2.295295000076294,
+      "learning_rate": 0.00022,
+      "loss": 0.3148,
+      "step": 45
+    },
+    {
+      "epoch": 0.011982287053920291,
+      "grad_norm": 3.9490885734558105,
+      "learning_rate": 0.00022500000000000002,
+      "loss": 0.5378,
+      "step": 46
+    },
+    {
+      "epoch": 0.012242771555092472,
+      "grad_norm": 2.3454151153564453,
+      "learning_rate": 0.00023,
+      "loss": 0.3085,
+      "step": 47
+    },
+    {
+      "epoch": 0.012503256056264652,
+      "grad_norm": 2.9150779247283936,
+      "learning_rate": 0.000235,
+      "loss": 0.432,
+      "step": 48
+    },
+    {
+      "epoch": 0.012763740557436832,
+      "grad_norm": 2.1253578662872314,
+      "learning_rate": 0.00024,
+      "loss": 0.1773,
+      "step": 49
+    },
+    {
+      "epoch": 0.013024225058609012,
+      "grad_norm": 3.5161190032958984,
+      "learning_rate": 0.000245,
+      "loss": 0.581,
+      "step": 50
+    },
+    {
+      "epoch": 0.013284709559781192,
+      "grad_norm": 1.8895039558410645,
+      "learning_rate": 0.00025,
+      "loss": 0.4554,
+      "step": 51
+    },
+    {
+      "epoch": 0.013545194060953373,
+      "grad_norm": 1.1252281665802002,
+      "learning_rate": 0.000255,
+      "loss": 0.3409,
+      "step": 52
+    },
+    {
+      "epoch": 0.013805678562125553,
+      "grad_norm": 2.6543619632720947,
+      "learning_rate": 0.00026000000000000003,
+      "loss": 0.594,
+      "step": 53
+    },
+    {
+      "epoch": 0.014066163063297733,
+      "grad_norm": 3.3003315925598145,
+      "learning_rate": 0.00026500000000000004,
+      "loss": 0.5753,
+      "step": 54
+    },
+    {
+      "epoch": 0.014326647564469915,
+      "grad_norm": 2.486830234527588,
+      "learning_rate": 0.00027,
+      "loss": 0.2907,
+      "step": 55
+    },
+    {
+      "epoch": 0.014587132065642095,
+      "grad_norm": 1.5953302383422852,
+      "learning_rate": 0.000275,
+      "loss": 0.4006,
+      "step": 56
+    },
+    {
+      "epoch": 0.014847616566814275,
+      "grad_norm": 1.8115977048873901,
+      "learning_rate": 0.00028000000000000003,
+      "loss": 0.2677,
+      "step": 57
+    },
+    {
+      "epoch": 0.015108101067986456,
+      "grad_norm": 2.282597780227661,
+      "learning_rate": 0.000285,
+      "loss": 0.6526,
+      "step": 58
+    },
+    {
+      "epoch": 0.015368585569158636,
+      "grad_norm": 1.4348944425582886,
+      "learning_rate": 0.00029,
+      "loss": 0.4291,
+      "step": 59
+    },
+    {
+      "epoch": 0.015629070070330816,
+      "grad_norm": 2.0866997241973877,
+      "learning_rate": 0.000295,
+      "loss": 0.1811,
+      "step": 60
+    },
+    {
+      "epoch": 0.015889554571502994,
+      "grad_norm": 1.6576564311981201,
+      "learning_rate": 0.0003,
+      "loss": 0.4088,
+      "step": 61
+    },
+    {
+      "epoch": 0.016150039072675176,
+      "grad_norm": 1.635674238204956,
+      "learning_rate": 0.000305,
+      "loss": 0.4438,
+      "step": 62
+    },
+    {
+      "epoch": 0.016410523573847355,
+      "grad_norm": 1.140415072441101,
+      "learning_rate": 0.00031,
+      "loss": 0.3589,
+      "step": 63
+    },
+    {
+      "epoch": 0.016671008075019537,
+      "grad_norm": 1.7953686714172363,
+      "learning_rate": 0.000315,
+      "loss": 0.2778,
+      "step": 64
+    },
+    {
+      "epoch": 0.016931492576191715,
+      "grad_norm": 2.2324233055114746,
+      "learning_rate": 0.00032,
+      "loss": 0.5049,
+      "step": 65
+    },
+    {
+      "epoch": 0.017191977077363897,
+      "grad_norm": 2.036297559738159,
+      "learning_rate": 0.00032500000000000004,
+      "loss": 0.2636,
+      "step": 66
+    },
+    {
+      "epoch": 0.017452461578536076,
+      "grad_norm": 2.2596747875213623,
+      "learning_rate": 0.00033,
+      "loss": 0.3743,
+      "step": 67
+    },
+    {
+      "epoch": 0.017712946079708258,
+      "grad_norm": 1.6777313947677612,
+      "learning_rate": 0.000335,
+      "loss": 0.3978,
+      "step": 68
+    },
+    {
+      "epoch": 0.017973430580880436,
+      "grad_norm": 1.6452847719192505,
+      "learning_rate": 0.00034,
+      "loss": 0.1836,
+      "step": 69
+    },
+    {
+      "epoch": 0.018233915082052618,
+      "grad_norm": 1.7216978073120117,
+      "learning_rate": 0.000345,
+      "loss": 0.4191,
+      "step": 70
+    },
+    {
+      "epoch": 0.018494399583224796,
+      "grad_norm": 1.7111387252807617,
+      "learning_rate": 0.00035,
+      "loss": 0.1812,
+      "step": 71
+    },
+    {
+      "epoch": 0.01875488408439698,
+      "grad_norm": 1.6676584482192993,
+      "learning_rate": 0.000355,
+      "loss": 0.4526,
+      "step": 72
+    },
+    {
+      "epoch": 0.019015368585569157,
+      "grad_norm": 0.9286651611328125,
+      "learning_rate": 0.00035999999999999997,
+      "loss": 0.2746,
+      "step": 73
+    },
+    {
+      "epoch": 0.01927585308674134,
+      "grad_norm": 3.234783411026001,
+      "learning_rate": 0.000365,
+      "loss": 0.5224,
+      "step": 74
+    },
+    {
+      "epoch": 0.01953633758791352,
+      "grad_norm": 1.3695653676986694,
+      "learning_rate": 0.00037,
+      "loss": 0.3308,
+      "step": 75
+    },
+    {
+      "epoch": 0.0197968220890857,
+      "grad_norm": 2.9995968341827393,
+      "learning_rate": 0.000375,
+      "loss": 0.4817,
+      "step": 76
+    },
+    {
+      "epoch": 0.02005730659025788,
+      "grad_norm": 1.8912553787231445,
+      "learning_rate": 0.00038,
+      "loss": 0.2722,
+      "step": 77
+    },
+    {
+      "epoch": 0.02031779109143006,
+      "grad_norm": 1.3702706098556519,
+      "learning_rate": 0.00038500000000000003,
+      "loss": 0.1064,
+      "step": 78
+    },
+    {
+      "epoch": 0.02057827559260224,
+      "grad_norm": 0.9273198246955872,
+      "learning_rate": 0.00039000000000000005,
+      "loss": 0.1233,
+      "step": 79
+    },
+    {
+      "epoch": 0.02083876009377442,
+      "grad_norm": 0.8209530711174011,
+      "learning_rate": 0.000395,
+      "loss": 0.2622,
+      "step": 80
+    },
+    {
+      "epoch": 0.021099244594946602,
+      "grad_norm": 1.4749599695205688,
+      "learning_rate": 0.0004,
+      "loss": 0.2999,
+      "step": 81
+    },
+    {
+      "epoch": 0.02135972909611878,
+      "grad_norm": 1.1133017539978027,
+      "learning_rate": 0.00040500000000000003,
+      "loss": 0.2929,
+      "step": 82
+    },
+    {
+      "epoch": 0.021620213597290962,
+      "grad_norm": 1.235826015472412,
+      "learning_rate": 0.00041,
+      "loss": 0.3106,
+      "step": 83
+    },
+    {
+      "epoch": 0.02188069809846314,
+      "grad_norm": 0.9904353022575378,
+      "learning_rate": 0.000415,
+      "loss": 0.2101,
+      "step": 84
+    },
+    {
+      "epoch": 0.022141182599635323,
+      "grad_norm": 1.2953742742538452,
+      "learning_rate": 0.00042,
+      "loss": 0.1131,
+      "step": 85
+    },
+    {
+      "epoch": 0.0224016671008075,
+      "grad_norm": 1.10429048538208,
+      "learning_rate": 0.000425,
+      "loss": 0.2727,
+      "step": 86
+    },
+    {
+      "epoch": 0.022662151601979683,
+      "grad_norm": 1.048660159111023,
+      "learning_rate": 0.00043,
+      "loss": 0.1082,
+      "step": 87
+    },
+    {
+      "epoch": 0.022922636103151862,
+      "grad_norm": 0.857686460018158,
+      "learning_rate": 0.000435,
+      "loss": 0.1475,
+      "step": 88
+    },
+    {
+      "epoch": 0.023183120604324044,
+      "grad_norm": 0.9353561401367188,
+      "learning_rate": 0.00044,
+      "loss": 0.2123,
+      "step": 89
+    },
+    {
+      "epoch": 0.023443605105496222,
+      "grad_norm": 1.4590015411376953,
+      "learning_rate": 0.00044500000000000003,
+      "loss": 0.2988,
+      "step": 90
+    },
+    {
+      "epoch": 0.023704089606668404,
+      "grad_norm": 0.9181132316589355,
+      "learning_rate": 0.00045000000000000004,
+      "loss": 0.2321,
+      "step": 91
+    },
+    {
+      "epoch": 0.023964574107840583,
+      "grad_norm": 0.7688923478126526,
+      "learning_rate": 0.000455,
+      "loss": 0.16,
+      "step": 92
+    },
+    {
+      "epoch": 0.024225058609012765,
+      "grad_norm": 1.0974979400634766,
+      "learning_rate": 0.00046,
+      "loss": 0.2135,
+      "step": 93
+    },
+    {
+      "epoch": 0.024485543110184943,
+      "grad_norm": 1.083938717842102,
+      "learning_rate": 0.000465,
+      "loss": 0.1931,
+      "step": 94
+    },
+    {
+      "epoch": 0.024746027611357125,
+      "grad_norm": 0.5162568688392639,
+      "learning_rate": 0.00047,
+      "loss": 0.0853,
+      "step": 95
+    },
+    {
+      "epoch": 0.025006512112529303,
+      "grad_norm": 0.8454329967498779,
+      "learning_rate": 0.000475,
+      "loss": 0.1723,
+      "step": 96
+    },
+    {
+      "epoch": 0.025266996613701485,
+      "grad_norm": 0.9237842559814453,
+      "learning_rate": 0.00048,
+      "loss": 0.1858,
+      "step": 97
+    },
+    {
+      "epoch": 0.025527481114873664,
+      "grad_norm": 0.8391311168670654,
+      "learning_rate": 0.00048499999999999997,
+      "loss": 0.1558,
+      "step": 98
+    },
+    {
+      "epoch": 0.025787965616045846,
+      "grad_norm": 0.7986068725585938,
+      "learning_rate": 0.00049,
+      "loss": 0.2043,
+      "step": 99
+    },
+    {
+      "epoch": 0.026048450117218024,
+      "grad_norm": 0.3467917740345001,
+      "learning_rate": 0.000495,
+      "loss": 0.0386,
+      "step": 100
+    },
+    {
+      "epoch": 0.026308934618390206,
+      "grad_norm": 1.4537785053253174,
+      "learning_rate": 0.0005,
+      "loss": 0.2298,
+      "step": 101
+    },
+    {
+      "epoch": 0.026569419119562385,
+      "grad_norm": 0.5565273761749268,
+      "learning_rate": 0.0004994444444444445,
+      "loss": 0.0893,
+      "step": 102
+    },
+    {
+      "epoch": 0.026829903620734567,
+      "grad_norm": 0.5200175642967224,
+      "learning_rate": 0.0004988888888888889,
+      "loss": 0.1191,
+      "step": 103
+    },
+    {
+      "epoch": 0.027090388121906745,
+      "grad_norm": 0.8346852660179138,
+      "learning_rate": 0.0004983333333333334,
+      "loss": 0.1709,
+      "step": 104
+    },
+    {
+      "epoch": 0.027350872623078927,
+      "grad_norm": 0.6015453934669495,
+      "learning_rate": 0.0004977777777777778,
+      "loss": 0.1095,
+      "step": 105
+    },
+    {
+      "epoch": 0.027611357124251105,
+      "grad_norm": 0.5995053052902222,
+      "learning_rate": 0.0004972222222222222,
+      "loss": 0.1382,
+      "step": 106
+    },
+    {
+      "epoch": 0.027871841625423287,
+      "grad_norm": 0.857565701007843,
+      "learning_rate": 0.0004966666666666666,
+      "loss": 0.1751,
+      "step": 107
+    },
+    {
+      "epoch": 0.028132326126595466,
+      "grad_norm": 0.561795175075531,
+      "learning_rate": 0.0004961111111111111,
+      "loss": 0.1364,
+      "step": 108
+    },
+    {
+      "epoch": 0.028392810627767648,
+      "grad_norm": 0.6582000851631165,
+      "learning_rate": 0.0004955555555555556,
+      "loss": 0.1348,
+      "step": 109
+    },
+    {
+      "epoch": 0.02865329512893983,
+      "grad_norm": 0.5225309729576111,
+      "learning_rate": 0.000495,
+      "loss": 0.0939,
+      "step": 110
+    },
+    {
+      "epoch": 0.02891377963011201,
+      "grad_norm": 0.6684510707855225,
+      "learning_rate": 0.0004944444444444445,
+      "loss": 0.1658,
+      "step": 111
+    },
+    {
+      "epoch": 0.02917426413128419,
+      "grad_norm": 0.6758474111557007,
+      "learning_rate": 0.0004938888888888889,
+      "loss": 0.0703,
+      "step": 112
+    },
+    {
+      "epoch": 0.02943474863245637,
+      "grad_norm": 0.7549937963485718,
+      "learning_rate": 0.0004933333333333334,
+      "loss": 0.0794,
+      "step": 113
+    },
+    {
+      "epoch": 0.02969523313362855,
+      "grad_norm": 0.4596688747406006,
+      "learning_rate": 0.0004927777777777777,
+      "loss": 0.1018,
+      "step": 114
+    },
+    {
+      "epoch": 0.02995571763480073,
+      "grad_norm": 0.48921626806259155,
+      "learning_rate": 0.0004922222222222222,
+      "loss": 0.1206,
+      "step": 115
+    },
+    {
+      "epoch": 0.03021620213597291,
+      "grad_norm": 0.5874961614608765,
+      "learning_rate": 0.0004916666666666666,
+      "loss": 0.1024,
+      "step": 116
+    },
+    {
+      "epoch": 0.03047668663714509,
+      "grad_norm": 0.4092181921005249,
+      "learning_rate": 0.0004911111111111111,
+      "loss": 0.1152,
+      "step": 117
+    },
+    {
+      "epoch": 0.03073717113831727,
+      "grad_norm": 0.731638491153717,
+      "learning_rate": 0.0004905555555555556,
+      "loss": 0.1052,
+      "step": 118
+    },
+    {
+      "epoch": 0.03099765563948945,
+      "grad_norm": 0.5791296362876892,
+      "learning_rate": 0.00049,
+      "loss": 0.1155,
+      "step": 119
+    },
+    {
+      "epoch": 0.03125814014066163,
+      "grad_norm": 0.524922788143158,
+      "learning_rate": 0.0004894444444444445,
+      "loss": 0.1225,
+      "step": 120
+    },
+    {
+      "epoch": 0.03151862464183381,
+      "grad_norm": 0.43153440952301025,
+      "learning_rate": 0.0004888888888888889,
+      "loss": 0.1042,
+      "step": 121
+    },
+    {
+      "epoch": 0.03177910914300599,
+      "grad_norm": 0.5489442348480225,
+      "learning_rate": 0.0004883333333333333,
+      "loss": 0.0436,
+      "step": 122
+    },
+    {
+      "epoch": 0.032039593644178174,
+      "grad_norm": 0.47173041105270386,
+      "learning_rate": 0.0004877777777777778,
+      "loss": 0.0986,
+      "step": 123
+    },
+    {
+      "epoch": 0.03230007814535035,
+      "grad_norm": 0.6360733509063721,
+      "learning_rate": 0.0004872222222222222,
+      "loss": 0.1181,
+      "step": 124
+    },
+    {
+      "epoch": 0.03256056264652253,
+      "grad_norm": 0.294552743434906,
+      "learning_rate": 0.0004866666666666667,
+      "loss": 0.0407,
+      "step": 125
+    },
+    {
+      "epoch": 0.03282104714769471,
+      "grad_norm": 0.44740888476371765,
+      "learning_rate": 0.0004861111111111111,
+      "loss": 0.0477,
+      "step": 126
+    },
+    {
+      "epoch": 0.033081531648866895,
+      "grad_norm": 0.8320237398147583,
+      "learning_rate": 0.0004855555555555556,
+      "loss": 0.1399,
+      "step": 127
+    },
+    {
+      "epoch": 0.033342016150039074,
+      "grad_norm": 0.4137701392173767,
+      "learning_rate": 0.00048499999999999997,
+      "loss": 0.1035,
+      "step": 128
+    },
+    {
+      "epoch": 0.03360250065121125,
+      "grad_norm": 0.6063897609710693,
+      "learning_rate": 0.00048444444444444446,
+      "loss": 0.1225,
+      "step": 129
+    },
+    {
+      "epoch": 0.03386298515238343,
+      "grad_norm": 0.3804122507572174,
+      "learning_rate": 0.0004838888888888889,
+      "loss": 0.0626,
+      "step": 130
+    },
+    {
+      "epoch": 0.034123469653555616,
+      "grad_norm": 0.370593398809433,
+      "learning_rate": 0.00048333333333333334,
+      "loss": 0.0533,
+      "step": 131
+    },
+    {
+      "epoch": 0.034383954154727794,
+      "grad_norm": 0.5119293928146362,
+      "learning_rate": 0.0004827777777777778,
+      "loss": 0.1296,
+      "step": 132
+    },
+    {
+      "epoch": 0.03464443865589997,
+      "grad_norm": 0.3838193118572235,
+      "learning_rate": 0.0004822222222222222,
+      "loss": 0.0839,
+      "step": 133
+    },
+    {
+      "epoch": 0.03490492315707215,
+      "grad_norm": 0.36693644523620605,
+      "learning_rate": 0.0004816666666666667,
+      "loss": 0.0754,
+      "step": 134
+    },
+    {
+      "epoch": 0.03516540765824434,
+      "grad_norm": 0.3589820861816406,
+      "learning_rate": 0.0004811111111111111,
+      "loss": 0.0829,
+      "step": 135
+    },
+    {
+      "epoch": 0.035425892159416515,
+      "grad_norm": 0.5229784250259399,
+      "learning_rate": 0.0004805555555555556,
+      "loss": 0.0547,
+      "step": 136
+    },
+    {
+      "epoch": 0.035686376660588694,
+      "grad_norm": 0.4046158194541931,
+      "learning_rate": 0.00048,
+      "loss": 0.1089,
+      "step": 137
+    },
+    {
+      "epoch": 0.03594686116176087,
+      "grad_norm": 0.3599977195262909,
+      "learning_rate": 0.00047944444444444445,
+      "loss": 0.0847,
+      "step": 138
+    },
+    {
+      "epoch": 0.03620734566293306,
+      "grad_norm": 0.4707350730895996,
+      "learning_rate": 0.0004788888888888889,
+      "loss": 0.0469,
+      "step": 139
+    },
+    {
+      "epoch": 0.036467830164105236,
+      "grad_norm": 0.43708017468452454,
+      "learning_rate": 0.0004783333333333333,
+      "loss": 0.1026,
+      "step": 140
+    },
+    {
+      "epoch": 0.036728314665277415,
+      "grad_norm": 0.38362443447113037,
+      "learning_rate": 0.0004777777777777778,
+      "loss": 0.081,
+      "step": 141
+    },
+    {
+      "epoch": 0.03698879916644959,
+      "grad_norm": 0.3944476842880249,
+      "learning_rate": 0.00047722222222222225,
+      "loss": 0.0975,
+      "step": 142
+    },
+    {
+      "epoch": 0.03724928366762178,
+      "grad_norm": 0.3799622356891632,
+      "learning_rate": 0.0004766666666666667,
+      "loss": 0.0762,
+      "step": 143
+    },
+    {
+      "epoch": 0.03750976816879396,
+      "grad_norm": 0.26444610953330994,
+      "learning_rate": 0.0004761111111111111,
+      "loss": 0.0391,
+      "step": 144
+    },
+    {
+      "epoch": 0.037770252669966135,
+      "grad_norm": 0.2609337866306305,
+      "learning_rate": 0.00047555555555555556,
+      "loss": 0.0339,
+      "step": 145
+    },
+    {
+      "epoch": 0.038030737171138314,
+      "grad_norm": 0.31581151485443115,
+      "learning_rate": 0.000475,
+      "loss": 0.0478,
+      "step": 146
+    },
+    {
+      "epoch": 0.0382912216723105,
+      "grad_norm": 0.24990220367908478,
+      "learning_rate": 0.00047444444444444444,
+      "loss": 0.0547,
+      "step": 147
+    },
+    {
+      "epoch": 0.03855170617348268,
+      "grad_norm": 0.25984567403793335,
+      "learning_rate": 0.00047388888888888893,
+      "loss": 0.0292,
+      "step": 148
+    },
+    {
+      "epoch": 0.038812190674654856,
+      "grad_norm": 0.2555845379829407,
+      "learning_rate": 0.00047333333333333336,
+      "loss": 0.0603,
+      "step": 149
+    },
+    {
+      "epoch": 0.03907267517582704,
+      "grad_norm": 0.2423526793718338,
+      "learning_rate": 0.0004727777777777778,
+      "loss": 0.0433,
+      "step": 150
+    },
+    {
+      "epoch": 0.03933315967699922,
+      "grad_norm": 0.3917102813720703,
+      "learning_rate": 0.00047222222222222224,
+      "loss": 0.0856,
+      "step": 151
+    },
+    {
+      "epoch": 0.0395936441781714,
+      "grad_norm": 0.25814005732536316,
+      "learning_rate": 0.0004716666666666667,
+      "loss": 0.03,
+      "step": 152
+    },
+    {
+      "epoch": 0.03985412867934358,
+      "grad_norm": 0.5332754254341125,
+      "learning_rate": 0.0004711111111111111,
+      "loss": 0.0926,
+      "step": 153
+    },
+    {
+      "epoch": 0.04011461318051576,
+      "grad_norm": 0.41863763332366943,
+      "learning_rate": 0.00047055555555555555,
+      "loss": 0.0781,
+      "step": 154
+    },
+    {
+      "epoch": 0.04037509768168794,
+      "grad_norm": 0.18758471310138702,
+      "learning_rate": 0.00047,
+      "loss": 0.02,
+      "step": 155
+    },
+    {
+      "epoch": 0.04063558218286012,
+      "grad_norm": 0.4820327162742615,
+      "learning_rate": 0.0004694444444444445,
+      "loss": 0.0925,
+      "step": 156
+    },
+    {
+      "epoch": 0.0408960666840323,
+      "grad_norm": 0.35083940625190735,
+      "learning_rate": 0.0004688888888888889,
+      "loss": 0.0765,
+      "step": 157
+    },
+    {
+      "epoch": 0.04115655118520448,
+      "grad_norm": 0.17448937892913818,
+      "learning_rate": 0.00046833333333333335,
+      "loss": 0.0336,
+      "step": 158
+    },
+    {
+      "epoch": 0.04141703568637666,
+      "grad_norm": 0.2029111683368683,
+      "learning_rate": 0.0004677777777777778,
+      "loss": 0.0254,
+      "step": 159
+    },
+    {
+      "epoch": 0.04167752018754884,
+      "grad_norm": 0.362997442483902,
+      "learning_rate": 0.0004672222222222222,
+      "loss": 0.065,
+      "step": 160
+    },
+    {
+      "epoch": 0.04193800468872102,
+      "grad_norm": 0.17652612924575806,
+      "learning_rate": 0.00046666666666666666,
+      "loss": 0.0459,
+      "step": 161
+    },
+    {
+      "epoch": 0.042198489189893204,
+      "grad_norm": 0.3325017988681793,
+      "learning_rate": 0.0004661111111111111,
+      "loss": 0.0725,
+      "step": 162
+    },
+    {
+      "epoch": 0.04245897369106538,
+      "grad_norm": 0.2500215172767639,
+      "learning_rate": 0.0004655555555555556,
+      "loss": 0.0643,
+      "step": 163
+    },
+    {
+      "epoch": 0.04271945819223756,
+      "grad_norm": 0.2853871285915375,
+      "learning_rate": 0.000465,
+      "loss": 0.0267,
+      "step": 164
+    },
+    {
+      "epoch": 0.04297994269340974,
+      "grad_norm": 0.19212019443511963,
+      "learning_rate": 0.00046444444444444446,
+      "loss": 0.0423,
+      "step": 165
+    },
+    {
+      "epoch": 0.043240427194581925,
+      "grad_norm": 0.5235925316810608,
+      "learning_rate": 0.0004638888888888889,
+      "loss": 0.1148,
+      "step": 166
+    },
+    {
+      "epoch": 0.0435009116957541,
+      "grad_norm": 0.27955177426338196,
+      "learning_rate": 0.00046333333333333334,
+      "loss": 0.0639,
+      "step": 167
+    },
+    {
+      "epoch": 0.04376139619692628,
+      "grad_norm": 0.21516919136047363,
+      "learning_rate": 0.0004627777777777778,
+      "loss": 0.0604,
+      "step": 168
+    },
+    {
+      "epoch": 0.04402188069809846,
+      "grad_norm": 0.23670804500579834,
+      "learning_rate": 0.0004622222222222222,
+      "loss": 0.0622,
+      "step": 169
+    },
+    {
+      "epoch": 0.044282365199270646,
+      "grad_norm": 0.2706695795059204,
+      "learning_rate": 0.0004616666666666667,
+      "loss": 0.0689,
+      "step": 170
+    },
+    {
+      "epoch": 0.044542849700442824,
+      "grad_norm": 0.29786598682403564,
+      "learning_rate": 0.00046111111111111114,
+      "loss": 0.0594,
+      "step": 171
+    },
+    {
+      "epoch": 0.044803334201615,
+      "grad_norm": 0.24643343687057495,
+      "learning_rate": 0.0004605555555555556,
+      "loss": 0.0708,
+      "step": 172
+    },
+    {
+      "epoch": 0.04506381870278718,
+      "grad_norm": 0.299601286649704,
+      "learning_rate": 0.00046,
+      "loss": 0.0747,
+      "step": 173
+    },
+    {
+      "epoch": 0.04532430320395937,
+      "grad_norm": 0.3954971432685852,
+      "learning_rate": 0.00045944444444444445,
+      "loss": 0.0777,
+      "step": 174
+    },
+    {
+      "epoch": 0.045584787705131545,
+      "grad_norm": 0.21053896844387054,
+      "learning_rate": 0.0004588888888888889,
+      "loss": 0.0515,
+      "step": 175
+    },
+    {
+      "epoch": 0.045845272206303724,
+      "grad_norm": 0.2881135642528534,
+      "learning_rate": 0.0004583333333333333,
+      "loss": 0.0686,
+      "step": 176
+    },
+    {
+      "epoch": 0.0461057567074759,
+      "grad_norm": 0.283166766166687,
+      "learning_rate": 0.0004577777777777778,
+      "loss": 0.0365,
+      "step": 177
+    },
+    {
+      "epoch": 0.04636624120864809,
+      "grad_norm": 0.24174754321575165,
+      "learning_rate": 0.0004572222222222222,
+      "loss": 0.0647,
+      "step": 178
+    },
+    {
+      "epoch": 0.046626725709820266,
+      "grad_norm": 0.24825502932071686,
+      "learning_rate": 0.0004566666666666667,
+      "loss": 0.0673,
+      "step": 179
+    },
+    {
+      "epoch": 0.046887210210992444,
+      "grad_norm": 0.2801763117313385,
+      "learning_rate": 0.0004561111111111111,
+      "loss": 0.0284,
+      "step": 180
+    },
+    {
+      "epoch": 0.04714769471216462,
+      "grad_norm": 0.3177853226661682,
+      "learning_rate": 0.00045555555555555556,
+      "loss": 0.0751,
+      "step": 181
+    },
+    {
+      "epoch": 0.04740817921333681,
+      "grad_norm": 0.2763389050960541,
+      "learning_rate": 0.000455,
+      "loss": 0.0491,
+      "step": 182
+    },
+    {
+      "epoch": 0.04766866371450899,
+      "grad_norm": 0.2378890961408615,
+      "learning_rate": 0.00045444444444444444,
+      "loss": 0.0527,
+      "step": 183
+    },
+    {
+      "epoch": 0.047929148215681165,
+      "grad_norm": 0.24222926795482635,
+      "learning_rate": 0.00045388888888888893,
+      "loss": 0.0664,
+      "step": 184
+    },
+    {
+      "epoch": 0.04818963271685335,
+      "grad_norm": 0.23114658892154694,
+      "learning_rate": 0.0004533333333333333,
+      "loss": 0.0494,
+      "step": 185
+    },
+    {
+      "epoch": 0.04845011721802553,
+      "grad_norm": 0.22762534022331238,
+      "learning_rate": 0.0004527777777777778,
+      "loss": 0.071,
+      "step": 186
+    },
+    {
+      "epoch": 0.04871060171919771,
+      "grad_norm": 0.2370385229587555,
+      "learning_rate": 0.00045222222222222224,
+      "loss": 0.0387,
+      "step": 187
+    },
+    {
+      "epoch": 0.048971086220369886,
+      "grad_norm": 0.22486446797847748,
+      "learning_rate": 0.0004516666666666667,
+      "loss": 0.0547,
+      "step": 188
+    },
+    {
+      "epoch": 0.04923157072154207,
+      "grad_norm": 0.20467261970043182,
+      "learning_rate": 0.0004511111111111111,
+      "loss": 0.0623,
+      "step": 189
+    },
+    {
+      "epoch": 0.04949205522271425,
+      "grad_norm": 0.1958482414484024,
+      "learning_rate": 0.00045055555555555555,
+      "loss": 0.038,
+      "step": 190
+    },
+    {
+      "epoch": 0.04975253972388643,
+      "grad_norm": 0.2064603567123413,
+      "learning_rate": 0.00045000000000000004,
+      "loss": 0.0485,
+      "step": 191
+    },
+    {
+      "epoch": 0.05001302422505861,
+      "grad_norm": 0.21092765033245087,
+      "learning_rate": 0.0004494444444444444,
+      "loss": 0.0401,
+      "step": 192
+    },
+    {
+      "epoch": 0.05027350872623079,
+      "grad_norm": 0.19756095111370087,
+      "learning_rate": 0.0004488888888888889,
+      "loss": 0.0498,
+      "step": 193
+    },
+    {
+      "epoch": 0.05053399322740297,
+      "grad_norm": 0.21518975496292114,
+      "learning_rate": 0.0004483333333333333,
+      "loss": 0.0166,
+      "step": 194
+    },
+    {
+      "epoch": 0.05079447772857515,
+      "grad_norm": 0.2586614489555359,
+      "learning_rate": 0.0004477777777777778,
+      "loss": 0.0551,
+      "step": 195
+    },
+    {
+      "epoch": 0.05105496222974733,
+      "grad_norm": 0.2525339126586914,
+      "learning_rate": 0.0004472222222222222,
+      "loss": 0.0576,
+      "step": 196
+    },
+    {
+      "epoch": 0.05131544673091951,
+      "grad_norm": 0.26852843165397644,
+      "learning_rate": 0.00044666666666666666,
+      "loss": 0.0457,
+      "step": 197
+    },
+    {
+      "epoch": 0.05157593123209169,
+      "grad_norm": 0.2254427671432495,
+      "learning_rate": 0.00044611111111111115,
+      "loss": 0.0414,
+      "step": 198
+    },
+    {
+      "epoch": 0.05183641573326387,
+      "grad_norm": 0.14540040493011475,
+      "learning_rate": 0.00044555555555555554,
+      "loss": 0.0223,
+      "step": 199
+    },
+    {
+      "epoch": 0.05209690023443605,
+      "grad_norm": 0.21755613386631012,
+      "learning_rate": 0.00044500000000000003,
+      "loss": 0.0495,
+      "step": 200
+    },
+    {
+      "epoch": 0.052357384735608234,
+      "grad_norm": 0.33412784337997437,
+      "learning_rate": 0.0004444444444444444,
+      "loss": 0.063,
+      "step": 201
+    },
+    {
+      "epoch": 0.05261786923678041,
+      "grad_norm": 0.30330124497413635,
+      "learning_rate": 0.0004438888888888889,
+      "loss": 0.0639,
+      "step": 202
+    },
+    {
+      "epoch": 0.05287835373795259,
+      "grad_norm": 0.3984980583190918,
+      "learning_rate": 0.00044333333333333334,
+      "loss": 0.0699,
+      "step": 203
+    },
+    {
+      "epoch": 0.05313883823912477,
+      "grad_norm": 0.22607579827308655,
+      "learning_rate": 0.0004427777777777778,
+      "loss": 0.0498,
+      "step": 204
+    },
+    {
+      "epoch": 0.053399322740296955,
+      "grad_norm": 0.20679153501987457,
+      "learning_rate": 0.00044222222222222227,
+      "loss": 0.0608,
+      "step": 205
+    },
+    {
+      "epoch": 0.05365980724146913,
+      "grad_norm": 0.2578828036785126,
+      "learning_rate": 0.00044166666666666665,
+      "loss": 0.0686,
+      "step": 206
+    },
+    {
+      "epoch": 0.05392029174264131,
+      "grad_norm": 0.20136725902557373,
+      "learning_rate": 0.00044111111111111114,
+      "loss": 0.0314,
+      "step": 207
+    },
+    {
+      "epoch": 0.05418077624381349,
+      "grad_norm": 0.2162970006465912,
+      "learning_rate": 0.0004405555555555555,
+      "loss": 0.0275,
+      "step": 208
+    },
+    {
+      "epoch": 0.054441260744985676,
+      "grad_norm": 0.22596463561058044,
+      "learning_rate": 0.00044,
+      "loss": 0.05,
+      "step": 209
+    },
+    {
+      "epoch": 0.054701745246157854,
+      "grad_norm": 0.21236006915569305,
+      "learning_rate": 0.0004394444444444445,
+      "loss": 0.0634,
+      "step": 210
+    },
+    {
+      "epoch": 0.05496222974733003,
+      "grad_norm": 0.22657150030136108,
+      "learning_rate": 0.0004388888888888889,
+      "loss": 0.0194,
+      "step": 211
+    },
+    {
+      "epoch": 0.05522271424850221,
+      "grad_norm": 0.2614911198616028,
+      "learning_rate": 0.0004383333333333334,
+      "loss": 0.0223,
+      "step": 212
+    },
+    {
+      "epoch": 0.055483198749674396,
+      "grad_norm": 0.19223269820213318,
+      "learning_rate": 0.00043777777777777776,
+      "loss": 0.0342,
+      "step": 213
+    },
+    {
+      "epoch": 0.055743683250846575,
+      "grad_norm": 0.16092517971992493,
+      "learning_rate": 0.00043722222222222225,
+      "loss": 0.0264,
+      "step": 214
+    },
+    {
+      "epoch": 0.05600416775201875,
+      "grad_norm": 0.17417079210281372,
+      "learning_rate": 0.00043666666666666664,
+      "loss": 0.0235,
+      "step": 215
+    },
+    {
+      "epoch": 0.05626465225319093,
+      "grad_norm": 0.22072814404964447,
+      "learning_rate": 0.00043611111111111113,
+      "loss": 0.0431,
+      "step": 216
+    },
+    {
+      "epoch": 0.05652513675436312,
+      "grad_norm": 0.27902325987815857,
+      "learning_rate": 0.0004355555555555555,
+      "loss": 0.0578,
+      "step": 217
+    },
+    {
+      "epoch": 0.056785621255535296,
+      "grad_norm": 0.14471304416656494,
+      "learning_rate": 0.000435,
+      "loss": 0.0272,
+      "step": 218
+    },
+    {
+      "epoch": 0.057046105756707474,
+      "grad_norm": 0.19169339537620544,
+      "learning_rate": 0.0004344444444444445,
+      "loss": 0.0516,
+      "step": 219
+    },
+    {
+      "epoch": 0.05730659025787966,
+      "grad_norm": 0.2244272381067276,
+      "learning_rate": 0.0004338888888888889,
+      "loss": 0.046,
+      "step": 220
+    },
+    {
+      "epoch": 0.05756707475905184,
+      "grad_norm": 0.12852348387241364,
+      "learning_rate": 0.00043333333333333337,
+      "loss": 0.0373,
+      "step": 221
+    },
+    {
+      "epoch": 0.05782755926022402,
+      "grad_norm": 0.2068084180355072,
+      "learning_rate": 0.00043277777777777775,
+      "loss": 0.0597,
+      "step": 222
+    },
+    {
+      "epoch": 0.058088043761396195,
+      "grad_norm": 0.2222401201725006,
+      "learning_rate": 0.00043222222222222224,
+      "loss": 0.0412,
+      "step": 223
+    },
+    {
+      "epoch": 0.05834852826256838,
+      "grad_norm": 0.22024814784526825,
+      "learning_rate": 0.0004316666666666667,
+      "loss": 0.045,
+      "step": 224
+    },
+    {
+      "epoch": 0.05860901276374056,
+      "grad_norm": 0.1876782774925232,
+      "learning_rate": 0.0004311111111111111,
+      "loss": 0.0433,
+      "step": 225
+    },
+    {
+      "epoch": 0.05886949726491274,
+      "grad_norm": 0.3719448149204254,
+      "learning_rate": 0.0004305555555555556,
+      "loss": 0.0645,
+      "step": 226
+    },
+    {
+      "epoch": 0.059129981766084916,
+      "grad_norm": 0.18171130120754242,
+      "learning_rate": 0.00043,
+      "loss": 0.0452,
+      "step": 227
+    },
+    {
+      "epoch": 0.0593904662672571,
+      "grad_norm": 0.178982675075531,
+      "learning_rate": 0.0004294444444444445,
+      "loss": 0.0497,
+      "step": 228
+    },
+    {
+      "epoch": 0.05965095076842928,
+      "grad_norm": 0.1660393476486206,
+      "learning_rate": 0.00042888888888888886,
+      "loss": 0.0478,
+      "step": 229
+    },
+    {
+      "epoch": 0.05991143526960146,
+      "grad_norm": 0.1843854784965515,
+      "learning_rate": 0.00042833333333333335,
+      "loss": 0.0522,
+      "step": 230
+    },
+    {
+      "epoch": 0.06017191977077364,
+      "grad_norm": 0.12932512164115906,
+      "learning_rate": 0.0004277777777777778,
+      "loss": 0.0233,
+      "step": 231
+    },
+    {
+      "epoch": 0.06043240427194582,
+      "grad_norm": 0.2272115796804428,
+      "learning_rate": 0.00042722222222222223,
+      "loss": 0.044,
+      "step": 232
+    },
+    {
+      "epoch": 0.060692888773118,
+      "grad_norm": 0.15523988008499146,
+      "learning_rate": 0.0004266666666666667,
+      "loss": 0.0275,
+      "step": 233
+    },
+    {
+      "epoch": 0.06095337327429018,
+      "grad_norm": 0.1535874903202057,
+      "learning_rate": 0.0004261111111111111,
+      "loss": 0.0316,
+      "step": 234
+    },
+    {
+      "epoch": 0.06121385777546236,
+      "grad_norm": 0.23498567938804626,
+      "learning_rate": 0.0004255555555555556,
+      "loss": 0.054,
+      "step": 235
+    },
+    {
+      "epoch": 0.06147434227663454,
+      "grad_norm": 0.1743401437997818,
+      "learning_rate": 0.000425,
+      "loss": 0.051,
+      "step": 236
+    },
+    {
+      "epoch": 0.06173482677780672,
+      "grad_norm": 0.1762365847826004,
+      "learning_rate": 0.00042444444444444447,
+      "loss": 0.0464,
+      "step": 237
+    },
+    {
+      "epoch": 0.0619953112789789,
+      "grad_norm": 0.18202027678489685,
+      "learning_rate": 0.0004238888888888889,
+      "loss": 0.0197,
+      "step": 238
+    },
+    {
+      "epoch": 0.06225579578015108,
+      "grad_norm": 0.20636488497257233,
+      "learning_rate": 0.00042333333333333334,
+      "loss": 0.054,
+      "step": 239
+    },
+    {
+      "epoch": 0.06251628028132326,
+      "grad_norm": 0.12590542435646057,
+      "learning_rate": 0.0004227777777777778,
+      "loss": 0.023,
+      "step": 240
+    },
+    {
+      "epoch": 0.06277676478249544,
+      "grad_norm": 0.19617991149425507,
+      "learning_rate": 0.0004222222222222222,
+      "loss": 0.0166,
+      "step": 241
+    },
+    {
+      "epoch": 0.06303724928366762,
+      "grad_norm": 0.20318199694156647,
+      "learning_rate": 0.0004216666666666667,
+      "loss": 0.0567,
+      "step": 242
+    },
+    {
+      "epoch": 0.0632977337848398,
+      "grad_norm": 0.14517085254192352,
+      "learning_rate": 0.0004211111111111111,
+      "loss": 0.0196,
+      "step": 243
+    },
+    {
+      "epoch": 0.06355821828601198,
+      "grad_norm": 0.15447178483009338,
+      "learning_rate": 0.0004205555555555556,
+      "loss": 0.0387,
+      "step": 244
+    },
+    {
+      "epoch": 0.06381870278718416,
+      "grad_norm": 0.17070916295051575,
+      "learning_rate": 0.00042,
+      "loss": 0.0461,
+      "step": 245
+    },
+    {
+      "epoch": 0.06407918728835635,
+      "grad_norm": 0.15433409810066223,
+      "learning_rate": 0.00041944444444444445,
+      "loss": 0.019,
+      "step": 246
+    },
+    {
+      "epoch": 0.06433967178952853,
+      "grad_norm": 0.183025062084198,
+      "learning_rate": 0.0004188888888888889,
+      "loss": 0.0444,
+      "step": 247
+    },
+    {
+      "epoch": 0.0646001562907007,
+      "grad_norm": 0.385356068611145,
+      "learning_rate": 0.00041833333333333333,
+      "loss": 0.0547,
+      "step": 248
+    },
+    {
+      "epoch": 0.06486064079187288,
+      "grad_norm": 0.21771393716335297,
+      "learning_rate": 0.0004177777777777778,
+      "loss": 0.0509,
+      "step": 249
+    },
+    {
+      "epoch": 0.06512112529304506,
+      "grad_norm": 0.21005302667617798,
+      "learning_rate": 0.0004172222222222222,
+      "loss": 0.0433,
+      "step": 250
+    },
+    {
+      "epoch": 0.06538160979421724,
+      "grad_norm": 0.13587549328804016,
+      "learning_rate": 0.0004166666666666667,
+      "loss": 0.038,
+      "step": 251
+    },
+    {
+      "epoch": 0.06564209429538942,
+      "grad_norm": 0.17643588781356812,
+      "learning_rate": 0.00041611111111111113,
+      "loss": 0.0468,
+      "step": 252
+    },
+    {
+      "epoch": 0.0659025787965616,
+      "grad_norm": 0.1694176346063614,
+      "learning_rate": 0.00041555555555555557,
+      "loss": 0.0372,
+      "step": 253
+    },
+    {
+      "epoch": 0.06616306329773379,
+      "grad_norm": 0.20141932368278503,
+      "learning_rate": 0.000415,
+      "loss": 0.0529,
+      "step": 254
+    },
+    {
+      "epoch": 0.06642354779890597,
+      "grad_norm": 0.22460468113422394,
+      "learning_rate": 0.00041444444444444444,
+      "loss": 0.0441,
+      "step": 255
+    },
+    {
+      "epoch": 0.06668403230007815,
+      "grad_norm": 0.16733771562576294,
+      "learning_rate": 0.0004138888888888889,
+      "loss": 0.0475,
+      "step": 256
+    },
+    {
+      "epoch": 0.06694451680125033,
+      "grad_norm": 0.1671062558889389,
+      "learning_rate": 0.0004133333333333333,
+      "loss": 0.0357,
+      "step": 257
+    },
+    {
+      "epoch": 0.0672050013024225,
+      "grad_norm": 0.16501788794994354,
+      "learning_rate": 0.0004127777777777778,
+      "loss": 0.0477,
+      "step": 258
+    },
+    {
+      "epoch": 0.06746548580359468,
+      "grad_norm": 0.23321153223514557,
+      "learning_rate": 0.00041222222222222224,
+      "loss": 0.0233,
+      "step": 259
+    },
+    {
+      "epoch": 0.06772597030476686,
+      "grad_norm": 0.21765446662902832,
+      "learning_rate": 0.0004116666666666667,
+      "loss": 0.0379,
+      "step": 260
+    },
+    {
+      "epoch": 0.06798645480593905,
+      "grad_norm": 0.15531405806541443,
+      "learning_rate": 0.0004111111111111111,
+      "loss": 0.036,
+      "step": 261
+    },
+    {
+      "epoch": 0.06824693930711123,
+      "grad_norm": 0.2108466476202011,
+      "learning_rate": 0.00041055555555555555,
+      "loss": 0.0232,
+      "step": 262
+    },
+    {
+      "epoch": 0.06850742380828341,
+      "grad_norm": 0.1868615299463272,
+      "learning_rate": 0.00041,
+      "loss": 0.0524,
+      "step": 263
+    },
+    {
+      "epoch": 0.06876790830945559,
+      "grad_norm": 0.2014734148979187,
+      "learning_rate": 0.00040944444444444443,
+      "loss": 0.0513,
+      "step": 264
+    },
+    {
+      "epoch": 0.06902839281062777,
+      "grad_norm": 0.1071903333067894,
+      "learning_rate": 0.0004088888888888889,
+      "loss": 0.0344,
+      "step": 265
+    },
+    {
+      "epoch": 0.06928887731179995,
+      "grad_norm": 0.11115298420190811,
+      "learning_rate": 0.00040833333333333336,
+      "loss": 0.038,
+      "step": 266
+    },
+    {
+      "epoch": 0.06954936181297212,
+      "grad_norm": 0.2003505676984787,
+      "learning_rate": 0.0004077777777777778,
+      "loss": 0.049,
+      "step": 267
+    },
+    {
+      "epoch": 0.0698098463141443,
+      "grad_norm": 0.1403959095478058,
+      "learning_rate": 0.00040722222222222223,
+      "loss": 0.0324,
+      "step": 268
+    },
+    {
+      "epoch": 0.0700703308153165,
+      "grad_norm": 0.16135309636592865,
+      "learning_rate": 0.00040666666666666667,
+      "loss": 0.0367,
+      "step": 269
+    },
+    {
+      "epoch": 0.07033081531648867,
+      "grad_norm": 0.14055243134498596,
+      "learning_rate": 0.0004061111111111111,
+      "loss": 0.0387,
+      "step": 270
+    },
+    {
+      "epoch": 0.07059129981766085,
+      "grad_norm": 0.2191935032606125,
+      "learning_rate": 0.00040555555555555554,
+      "loss": 0.0469,
+      "step": 271
+    },
+    {
+      "epoch": 0.07085178431883303,
+      "grad_norm": 0.1192723885178566,
+      "learning_rate": 0.00040500000000000003,
+      "loss": 0.0198,
+      "step": 272
+    },
+    {
+      "epoch": 0.07111226882000521,
+      "grad_norm": 0.12160241603851318,
+      "learning_rate": 0.00040444444444444447,
+      "loss": 0.0291,
+      "step": 273
+    },
+    {
+      "epoch": 0.07137275332117739,
+      "grad_norm": 0.11681188642978668,
+      "learning_rate": 0.0004038888888888889,
+      "loss": 0.0214,
+      "step": 274
+    },
+    {
+      "epoch": 0.07163323782234957,
+      "grad_norm": 0.16357356309890747,
+      "learning_rate": 0.00040333333333333334,
+      "loss": 0.0386,
+      "step": 275
+    },
+    {
+      "epoch": 0.07189372232352174,
+      "grad_norm": 0.21038782596588135,
+      "learning_rate": 0.0004027777777777778,
+      "loss": 0.0487,
+      "step": 276
+    },
+    {
+      "epoch": 0.07215420682469394,
+      "grad_norm": 0.16823112964630127,
+      "learning_rate": 0.0004022222222222222,
+      "loss": 0.0398,
+      "step": 277
+    },
+    {
+      "epoch": 0.07241469132586612,
+      "grad_norm": 0.11350996047258377,
+      "learning_rate": 0.00040166666666666665,
+      "loss": 0.023,
+      "step": 278
+    },
+    {
+      "epoch": 0.0726751758270383,
+      "grad_norm": 0.12088004499673843,
+      "learning_rate": 0.0004011111111111111,
+      "loss": 0.0367,
+      "step": 279
+    },
+    {
+      "epoch": 0.07293566032821047,
+      "grad_norm": 0.20815615355968475,
+      "learning_rate": 0.0004005555555555556,
+      "loss": 0.0339,
+      "step": 280
+    },
+    {
+      "epoch": 0.07319614482938265,
+      "grad_norm": 0.20457801222801208,
+      "learning_rate": 0.0004,
+      "loss": 0.0462,
+      "step": 281
+    },
+    {
+      "epoch": 0.07345662933055483,
+      "grad_norm": 0.2844444811344147,
+      "learning_rate": 0.00039944444444444446,
+      "loss": 0.0542,
+      "step": 282
+    },
+    {
+      "epoch": 0.07371711383172701,
+      "grad_norm": 0.19987139105796814,
+      "learning_rate": 0.0003988888888888889,
+      "loss": 0.0179,
+      "step": 283
+    },
+    {
+      "epoch": 0.07397759833289919,
+      "grad_norm": 0.14339959621429443,
+      "learning_rate": 0.00039833333333333333,
+      "loss": 0.0449,
+      "step": 284
+    },
+    {
+      "epoch": 0.07423808283407138,
+      "grad_norm": 0.15933018922805786,
+      "learning_rate": 0.00039777777777777777,
+      "loss": 0.027,
+      "step": 285
+    },
+    {
+      "epoch": 0.07449856733524356,
+      "grad_norm": 0.207759290933609,
+      "learning_rate": 0.0003972222222222222,
+      "loss": 0.0461,
+      "step": 286
+    },
+    {
+      "epoch": 0.07475905183641574,
+      "grad_norm": 0.13354656100273132,
+      "learning_rate": 0.0003966666666666667,
+      "loss": 0.0415,
+      "step": 287
+    },
+    {
+      "epoch": 0.07501953633758791,
+      "grad_norm": 0.1208604946732521,
+      "learning_rate": 0.00039611111111111113,
+      "loss": 0.0393,
+      "step": 288
+    },
+    {
+      "epoch": 0.07528002083876009,
+      "grad_norm": 0.1399313062429428,
+      "learning_rate": 0.00039555555555555557,
+      "loss": 0.043,
+      "step": 289
+    },
+    {
+      "epoch": 0.07554050533993227,
+      "grad_norm": 0.1408071219921112,
+      "learning_rate": 0.000395,
+      "loss": 0.015,
+      "step": 290
+    },
+    {
+      "epoch": 0.07580098984110445,
+      "grad_norm": 0.11537426710128784,
+      "learning_rate": 0.00039444444444444444,
+      "loss": 0.0345,
+      "step": 291
+    },
+    {
+      "epoch": 0.07606147434227663,
+      "grad_norm": 0.12704655528068542,
+      "learning_rate": 0.00039388888888888893,
+      "loss": 0.0399,
+      "step": 292
+    },
+    {
+      "epoch": 0.07632195884344882,
+      "grad_norm": 0.09692219644784927,
+      "learning_rate": 0.0003933333333333333,
+      "loss": 0.0328,
+      "step": 293
+    },
+    {
+      "epoch": 0.076582443344621,
+      "grad_norm": 0.12996357679367065,
+      "learning_rate": 0.0003927777777777778,
+      "loss": 0.0151,
+      "step": 294
+    },
+    {
+      "epoch": 0.07684292784579318,
+      "grad_norm": 0.0913880318403244,
+      "learning_rate": 0.00039222222222222225,
+      "loss": 0.0328,
+      "step": 295
+    },
+    {
+      "epoch": 0.07710341234696536,
+      "grad_norm": 0.10965622216463089,
+      "learning_rate": 0.0003916666666666667,
+      "loss": 0.0296,
+      "step": 296
+    },
+    {
+      "epoch": 0.07736389684813753,
+      "grad_norm": 0.1479872316122055,
+      "learning_rate": 0.0003911111111111111,
+      "loss": 0.0421,
+      "step": 297
+    },
+    {
+      "epoch": 0.07762438134930971,
+      "grad_norm": 0.10420911759138107,
+      "learning_rate": 0.00039055555555555556,
+      "loss": 0.0227,
+      "step": 298
+    },
+    {
+      "epoch": 0.07788486585048189,
+      "grad_norm": 0.13351784646511078,
+      "learning_rate": 0.00039000000000000005,
+      "loss": 0.0341,
+      "step": 299
+    },
+    {
+      "epoch": 0.07814535035165408,
+      "grad_norm": 0.11381152272224426,
+      "learning_rate": 0.00038944444444444443,
+      "loss": 0.0381,
+      "step": 300
+    },
+    {
+      "epoch": 0.07840583485282626,
+      "grad_norm": 0.1092122346162796,
+      "learning_rate": 0.0003888888888888889,
+      "loss": 0.0247,
+      "step": 301
+    },
+    {
+      "epoch": 0.07866631935399844,
+      "grad_norm": 0.11427556723356247,
+      "learning_rate": 0.0003883333333333333,
+      "loss": 0.0396,
+      "step": 302
+    },
+    {
+      "epoch": 0.07892680385517062,
+      "grad_norm": 0.25716203451156616,
+      "learning_rate": 0.0003877777777777778,
+      "loss": 0.0503,
+      "step": 303
+    },
+    {
+      "epoch": 0.0791872883563428,
+      "grad_norm": 0.10936491191387177,
+      "learning_rate": 0.00038722222222222223,
+      "loss": 0.0345,
+      "step": 304
+    },
+    {
+      "epoch": 0.07944777285751498,
+      "grad_norm": 0.10200309008359909,
+      "learning_rate": 0.00038666666666666667,
+      "loss": 0.0307,
+      "step": 305
+    },
+    {
+      "epoch": 0.07970825735868715,
+      "grad_norm": 0.15819424390792847,
+      "learning_rate": 0.00038611111111111116,
+      "loss": 0.0375,
+      "step": 306
+    },
+    {
+      "epoch": 0.07996874185985933,
+      "grad_norm": 0.14317704737186432,
+      "learning_rate": 0.00038555555555555554,
+      "loss": 0.0374,
+      "step": 307
+    },
+    {
+      "epoch": 0.08022922636103152,
+      "grad_norm": 0.14639945328235626,
+      "learning_rate": 0.00038500000000000003,
+      "loss": 0.0306,
+      "step": 308
+    },
+    {
+      "epoch": 0.0804897108622037,
+      "grad_norm": 0.1522209346294403,
+      "learning_rate": 0.0003844444444444444,
+      "loss": 0.0158,
+      "step": 309
+    },
+    {
+      "epoch": 0.08075019536337588,
+      "grad_norm": 0.1352933645248413,
+      "learning_rate": 0.0003838888888888889,
+      "loss": 0.0191,
+      "step": 310
+    },
+    {
+      "epoch": 0.08101067986454806,
+      "grad_norm": 0.30525916814804077,
+      "learning_rate": 0.00038333333333333334,
+      "loss": 0.0529,
+      "step": 311
+    },
+    {
+      "epoch": 0.08127116436572024,
+      "grad_norm": 0.2127600610256195,
+      "learning_rate": 0.0003827777777777778,
+      "loss": 0.0545,
+      "step": 312
+    },
+    {
+      "epoch": 0.08153164886689242,
+      "grad_norm": 0.12473171949386597,
+      "learning_rate": 0.0003822222222222223,
+      "loss": 0.0174,
+      "step": 313
+    },
+    {
+      "epoch": 0.0817921333680646,
+      "grad_norm": 0.13822594285011292,
+      "learning_rate": 0.00038166666666666666,
+      "loss": 0.0129,
+      "step": 314
+    },
+    {
+      "epoch": 0.08205261786923677,
+      "grad_norm": 0.1683114618062973,
+      "learning_rate": 0.00038111111111111115,
+      "loss": 0.0214,
+      "step": 315
+    },
+    {
+      "epoch": 0.08231310237040897,
+      "grad_norm": 0.09103227406740189,
+      "learning_rate": 0.00038055555555555553,
+      "loss": 0.0307,
+      "step": 316
+    },
+    {
+      "epoch": 0.08257358687158114,
+      "grad_norm": 0.10524292290210724,
+      "learning_rate": 0.00038,
+      "loss": 0.0205,
+      "step": 317
+    },
+    {
+      "epoch": 0.08283407137275332,
+      "grad_norm": 0.15706130862236023,
+      "learning_rate": 0.0003794444444444444,
+      "loss": 0.037,
+      "step": 318
+    },
+    {
+      "epoch": 0.0830945558739255,
+      "grad_norm": 0.14827808737754822,
+      "learning_rate": 0.0003788888888888889,
+      "loss": 0.0332,
+      "step": 319
+    },
+    {
+      "epoch": 0.08335504037509768,
+      "grad_norm": 0.09460010379552841,
+      "learning_rate": 0.0003783333333333334,
+      "loss": 0.032,
+      "step": 320
+    },
+    {
+      "epoch": 0.08361552487626986,
+      "grad_norm": 0.16332273185253143,
+      "learning_rate": 0.00037777777777777777,
+      "loss": 0.0176,
+      "step": 321
+    },
+    {
+      "epoch": 0.08387600937744204,
+      "grad_norm": 0.2653479278087616,
+      "learning_rate": 0.00037722222222222226,
+      "loss": 0.046,
+      "step": 322
+    },
+    {
+      "epoch": 0.08413649387861422,
+      "grad_norm": 0.22567397356033325,
+      "learning_rate": 0.00037666666666666664,
+      "loss": 0.0397,
+      "step": 323
+    },
+    {
+      "epoch": 0.08439697837978641,
+      "grad_norm": 0.09909996390342712,
+      "learning_rate": 0.00037611111111111113,
+      "loss": 0.0347,
+      "step": 324
+    },
+    {
+      "epoch": 0.08465746288095859,
+      "grad_norm": 0.14052069187164307,
+      "learning_rate": 0.0003755555555555555,
+      "loss": 0.0426,
+      "step": 325
+    },
+    {
+      "epoch": 0.08491794738213077,
+      "grad_norm": 0.10061012208461761,
+      "learning_rate": 0.000375,
+      "loss": 0.0339,
+      "step": 326
+    },
+    {
+      "epoch": 0.08517843188330294,
+      "grad_norm": 0.07894690334796906,
+      "learning_rate": 0.0003744444444444445,
+      "loss": 0.0251,
+      "step": 327
+    },
+    {
+      "epoch": 0.08543891638447512,
+      "grad_norm": 0.09962338209152222,
+      "learning_rate": 0.0003738888888888889,
+      "loss": 0.0349,
+      "step": 328
+    },
+    {
+      "epoch": 0.0856994008856473,
+      "grad_norm": 0.18012109398841858,
+      "learning_rate": 0.0003733333333333334,
+      "loss": 0.0151,
+      "step": 329
+    },
+    {
+      "epoch": 0.08595988538681948,
+      "grad_norm": 0.11095000058412552,
+      "learning_rate": 0.00037277777777777776,
+      "loss": 0.0377,
+      "step": 330
+    },
+    {
+      "epoch": 0.08622036988799167,
+      "grad_norm": 0.06870577484369278,
+      "learning_rate": 0.00037222222222222225,
+      "loss": 0.0188,
+      "step": 331
+    },
+    {
+      "epoch": 0.08648085438916385,
+      "grad_norm": 0.24816635251045227,
+      "learning_rate": 0.00037166666666666663,
+      "loss": 0.0391,
+      "step": 332
+    },
+    {
+      "epoch": 0.08674133889033603,
+      "grad_norm": 0.10784301161766052,
+      "learning_rate": 0.0003711111111111111,
+      "loss": 0.0187,
+      "step": 333
+    },
+    {
+      "epoch": 0.0870018233915082,
+      "grad_norm": 0.11542753875255585,
+      "learning_rate": 0.0003705555555555556,
+      "loss": 0.0393,
+      "step": 334
+    },
+    {
+      "epoch": 0.08726230789268039,
+      "grad_norm": 0.1449226289987564,
+      "learning_rate": 0.00037,
+      "loss": 0.0379,
+      "step": 335
+    },
+    {
+      "epoch": 0.08752279239385256,
+      "grad_norm": 0.165171280503273,
+      "learning_rate": 0.0003694444444444445,
+      "loss": 0.0475,
+      "step": 336
+    },
+    {
+      "epoch": 0.08778327689502474,
+      "grad_norm": 0.16520822048187256,
+      "learning_rate": 0.00036888888888888887,
+      "loss": 0.0392,
+      "step": 337
+    },
+    {
+      "epoch": 0.08804376139619692,
+      "grad_norm": 0.10801101475954056,
+      "learning_rate": 0.00036833333333333336,
+      "loss": 0.0374,
+      "step": 338
+    },
+    {
+      "epoch": 0.08830424589736911,
+      "grad_norm": 0.13719044625759125,
+      "learning_rate": 0.00036777777777777774,
+      "loss": 0.0293,
+      "step": 339
+    },
+    {
+      "epoch": 0.08856473039854129,
+      "grad_norm": 0.13164854049682617,
+      "learning_rate": 0.00036722222222222223,
+      "loss": 0.0159,
+      "step": 340
+    },
+    {
+      "epoch": 0.08882521489971347,
+      "grad_norm": 0.16117313504219055,
+      "learning_rate": 0.00036666666666666667,
+      "loss": 0.0238,
+      "step": 341
+    },
+    {
+      "epoch": 0.08908569940088565,
+      "grad_norm": 0.11216874420642853,
+      "learning_rate": 0.0003661111111111111,
+      "loss": 0.034,
+      "step": 342
+    },
+    {
+      "epoch": 0.08934618390205783,
+      "grad_norm": 0.10498247295618057,
+      "learning_rate": 0.0003655555555555556,
+      "loss": 0.0335,
+      "step": 343
+    },
+    {
+      "epoch": 0.08960666840323,
+      "grad_norm": 0.083678238093853,
+      "learning_rate": 0.000365,
+      "loss": 0.0318,
+      "step": 344
+    },
+    {
+      "epoch": 0.08986715290440218,
+      "grad_norm": 0.09851270914077759,
+      "learning_rate": 0.00036444444444444447,
+      "loss": 0.032,
+      "step": 345
+    },
+    {
+      "epoch": 0.09012763740557436,
+      "grad_norm": 0.15901733934879303,
+      "learning_rate": 0.00036388888888888886,
+      "loss": 0.0369,
+      "step": 346
+    },
+    {
+      "epoch": 0.09038812190674655,
+      "grad_norm": 0.1602892279624939,
+      "learning_rate": 0.00036333333333333335,
+      "loss": 0.0144,
+      "step": 347
+    },
+    {
+      "epoch": 0.09064860640791873,
+      "grad_norm": 0.0990363284945488,
+      "learning_rate": 0.0003627777777777778,
+      "loss": 0.0307,
+      "step": 348
+    },
+    {
+      "epoch": 0.09090909090909091,
+      "grad_norm": 0.09152241796255112,
+      "learning_rate": 0.0003622222222222222,
+      "loss": 0.0174,
+      "step": 349
+    },
+    {
+      "epoch": 0.09116957541026309,
+      "grad_norm": 0.11197351664304733,
+      "learning_rate": 0.0003616666666666667,
+      "loss": 0.0291,
+      "step": 350
+    },
+    {
+      "epoch": 0.09143005991143527,
+      "grad_norm": 0.11259925365447998,
+      "learning_rate": 0.0003611111111111111,
+      "loss": 0.0118,
+      "step": 351
+    },
+    {
+      "epoch": 0.09169054441260745,
+      "grad_norm": 0.07788604497909546,
+      "learning_rate": 0.0003605555555555556,
+      "loss": 0.0289,
+      "step": 352
+    },
+    {
+      "epoch": 0.09195102891377963,
+      "grad_norm": 0.06775746494531631,
+      "learning_rate": 0.00035999999999999997,
+      "loss": 0.0253,
+      "step": 353
+    },
+    {
+      "epoch": 0.0922115134149518,
+      "grad_norm": 0.13518136739730835,
+      "learning_rate": 0.00035944444444444446,
+      "loss": 0.0404,
+      "step": 354
+    },
+    {
+      "epoch": 0.092471997916124,
+      "grad_norm": 0.1233261302113533,
+      "learning_rate": 0.0003588888888888889,
+      "loss": 0.0331,
+      "step": 355
+    },
+    {
+      "epoch": 0.09273248241729617,
+      "grad_norm": 0.1096162497997284,
+      "learning_rate": 0.00035833333333333333,
+      "loss": 0.0423,
+      "step": 356
+    },
+    {
+      "epoch": 0.09299296691846835,
+      "grad_norm": 0.1044531762599945,
+      "learning_rate": 0.00035777777777777777,
+      "loss": 0.0159,
+      "step": 357
+    },
+    {
+      "epoch": 0.09325345141964053,
+      "grad_norm": 0.09880375862121582,
+      "learning_rate": 0.0003572222222222222,
+      "loss": 0.0315,
+      "step": 358
+    },
+    {
+      "epoch": 0.09351393592081271,
+      "grad_norm": 0.16216698288917542,
+      "learning_rate": 0.0003566666666666667,
+      "loss": 0.038,
+      "step": 359
+    },
+    {
+      "epoch": 0.09377442042198489,
+      "grad_norm": 0.10374052822589874,
+      "learning_rate": 0.0003561111111111111,
+      "loss": 0.017,
+      "step": 360
+    },
+    {
+      "epoch": 0.09403490492315707,
+      "grad_norm": 0.09040326625108719,
+      "learning_rate": 0.00035555555555555557,
+      "loss": 0.0306,
+      "step": 361
+    },
+    {
+      "epoch": 0.09429538942432925,
+      "grad_norm": 0.088216632604599,
+      "learning_rate": 0.000355,
+      "loss": 0.025,
+      "step": 362
+    },
+    {
+      "epoch": 0.09455587392550144,
+      "grad_norm": 0.13739970326423645,
+      "learning_rate": 0.00035444444444444445,
+      "loss": 0.0315,
+      "step": 363
+    },
+    {
+      "epoch": 0.09481635842667362,
+      "grad_norm": 0.0974138155579567,
+      "learning_rate": 0.0003538888888888889,
+      "loss": 0.0284,
+      "step": 364
+    },
+    {
+      "epoch": 0.0950768429278458,
+      "grad_norm": 0.08695337176322937,
+      "learning_rate": 0.0003533333333333333,
+      "loss": 0.0228,
+      "step": 365
+    },
+    {
+      "epoch": 0.09533732742901797,
+      "grad_norm": 0.1133357360959053,
+      "learning_rate": 0.0003527777777777778,
+      "loss": 0.0247,
+      "step": 366
+    },
+    {
+      "epoch": 0.09559781193019015,
+      "grad_norm": 0.11041553318500519,
+      "learning_rate": 0.00035222222222222225,
+      "loss": 0.0196,
+      "step": 367
+    },
+    {
+      "epoch": 0.09585829643136233,
+      "grad_norm": 0.08988118171691895,
+      "learning_rate": 0.0003516666666666667,
+      "loss": 0.018,
+      "step": 368
+    },
+    {
+      "epoch": 0.09611878093253451,
+      "grad_norm": 0.08557140082120895,
+      "learning_rate": 0.0003511111111111111,
+      "loss": 0.0189,
+      "step": 369
+    },
+    {
+      "epoch": 0.0963792654337067,
+      "grad_norm": 0.08144374936819077,
+      "learning_rate": 0.00035055555555555556,
+      "loss": 0.0195,
+      "step": 370
+    },
+    {
+      "epoch": 0.09663974993487888,
+      "grad_norm": 0.18180625140666962,
+      "learning_rate": 0.00035,
+      "loss": 0.0412,
+      "step": 371
+    },
+    {
+      "epoch": 0.09690023443605106,
+      "grad_norm": 0.1168271005153656,
+      "learning_rate": 0.00034944444444444443,
+      "loss": 0.015,
+      "step": 372
+    },
+    {
+      "epoch": 0.09716071893722324,
+      "grad_norm": 0.08017319440841675,
+      "learning_rate": 0.0003488888888888889,
+      "loss": 0.0309,
+      "step": 373
+    },
+    {
+      "epoch": 0.09742120343839542,
+      "grad_norm": 0.16148969531059265,
+      "learning_rate": 0.00034833333333333336,
+      "loss": 0.0344,
+      "step": 374
+    },
+    {
+      "epoch": 0.0976816879395676,
+      "grad_norm": 0.07820136100053787,
+      "learning_rate": 0.0003477777777777778,
+      "loss": 0.0154,
+      "step": 375
+    },
+    {
+      "epoch": 0.09794217244073977,
+      "grad_norm": 0.1732979118824005,
+      "learning_rate": 0.00034722222222222224,
+      "loss": 0.0362,
+      "step": 376
+    },
+    {
+      "epoch": 0.09820265694191195,
+      "grad_norm": 0.16460788249969482,
+      "learning_rate": 0.00034666666666666667,
+      "loss": 0.0399,
+      "step": 377
+    },
+    {
+      "epoch": 0.09846314144308414,
+      "grad_norm": 0.17509953677654266,
+      "learning_rate": 0.0003461111111111111,
+      "loss": 0.0384,
+      "step": 378
+    },
+    {
+      "epoch": 0.09872362594425632,
+      "grad_norm": 0.09137140214443207,
+      "learning_rate": 0.00034555555555555555,
+      "loss": 0.0124,
+      "step": 379
+    },
+    {
+      "epoch": 0.0989841104454285,
+      "grad_norm": 0.0854978933930397,
+      "learning_rate": 0.000345,
+      "loss": 0.028,
+      "step": 380
+    },
+    {
+      "epoch": 0.09924459494660068,
+      "grad_norm": 0.09197288006544113,
+      "learning_rate": 0.0003444444444444445,
+      "loss": 0.0147,
+      "step": 381
+    },
+    {
+      "epoch": 0.09950507944777286,
+      "grad_norm": 0.15859054028987885,
+      "learning_rate": 0.0003438888888888889,
+      "loss": 0.0423,
+      "step": 382
+    },
+    {
+      "epoch": 0.09976556394894504,
+      "grad_norm": 0.09122268110513687,
+      "learning_rate": 0.00034333333333333335,
+      "loss": 0.0288,
+      "step": 383
+    },
+    {
+      "epoch": 0.10002604845011721,
+      "grad_norm": 0.1572301834821701,
+      "learning_rate": 0.0003427777777777778,
+      "loss": 0.0392,
+      "step": 384
+    },
+    {
+      "epoch": 0.10028653295128939,
+      "grad_norm": 0.1745997965335846,
+      "learning_rate": 0.0003422222222222222,
+      "loss": 0.0401,
+      "step": 385
+    },
+    {
+      "epoch": 0.10054701745246158,
+      "grad_norm": 0.12563872337341309,
+      "learning_rate": 0.00034166666666666666,
+      "loss": 0.0359,
+      "step": 386
+    },
+    {
+      "epoch": 0.10080750195363376,
+      "grad_norm": 0.14481599628925323,
+      "learning_rate": 0.0003411111111111111,
+      "loss": 0.0458,
+      "step": 387
+    },
+    {
+      "epoch": 0.10106798645480594,
+      "grad_norm": 0.10502296686172485,
+      "learning_rate": 0.0003405555555555556,
+      "loss": 0.03,
+      "step": 388
+    },
+    {
+      "epoch": 0.10132847095597812,
+      "grad_norm": 0.12301287800073624,
+      "learning_rate": 0.00034,
+      "loss": 0.0286,
+      "step": 389
+    },
+    {
+      "epoch": 0.1015889554571503,
+      "grad_norm": 0.08884134888648987,
+      "learning_rate": 0.00033944444444444446,
+      "loss": 0.0148,
+      "step": 390
+    },
+    {
+      "epoch": 0.10184943995832248,
+      "grad_norm": 0.17433515191078186,
+      "learning_rate": 0.0003388888888888889,
+      "loss": 0.0149,
+      "step": 391
+    },
+    {
+      "epoch": 0.10210992445949466,
+      "grad_norm": 0.11985349655151367,
+      "learning_rate": 0.00033833333333333334,
+      "loss": 0.036,
+      "step": 392
+    },
+    {
+      "epoch": 0.10237040896066683,
+      "grad_norm": 0.17169484496116638,
+      "learning_rate": 0.00033777777777777777,
+      "loss": 0.0214,
+      "step": 393
+    },
+    {
+      "epoch": 0.10263089346183903,
+      "grad_norm": 0.1162295863032341,
+      "learning_rate": 0.0003372222222222222,
+      "loss": 0.0432,
+      "step": 394
+    },
+    {
+      "epoch": 0.1028913779630112,
+      "grad_norm": 0.1161981150507927,
+      "learning_rate": 0.0003366666666666667,
+      "loss": 0.0345,
+      "step": 395
+    },
+    {
+      "epoch": 0.10315186246418338,
+      "grad_norm": 0.08045128732919693,
+      "learning_rate": 0.00033611111111111114,
+      "loss": 0.0147,
+      "step": 396
+    },
+    {
+      "epoch": 0.10341234696535556,
+      "grad_norm": 0.0739087387919426,
+      "learning_rate": 0.0003355555555555556,
+      "loss": 0.0237,
+      "step": 397
+    },
+    {
+      "epoch": 0.10367283146652774,
+      "grad_norm": 0.10391904413700104,
+      "learning_rate": 0.000335,
+      "loss": 0.0302,
+      "step": 398
+    },
+    {
+      "epoch": 0.10393331596769992,
+      "grad_norm": 0.10834187269210815,
+      "learning_rate": 0.00033444444444444445,
+      "loss": 0.0354,
+      "step": 399
+    },
+    {
+      "epoch": 0.1041938004688721,
+      "grad_norm": 0.12172620743513107,
+      "learning_rate": 0.0003338888888888889,
+      "loss": 0.0142,
+      "step": 400
+    },
+    {
+      "epoch": 0.10445428497004428,
+      "grad_norm": 0.07939380407333374,
+      "learning_rate": 0.0003333333333333333,
+      "loss": 0.0207,
+      "step": 401
+    },
+    {
+      "epoch": 0.10471476947121647,
+      "grad_norm": 0.07943827658891678,
+      "learning_rate": 0.0003327777777777778,
+      "loss": 0.0263,
+      "step": 402
+    },
+    {
+      "epoch": 0.10497525397238865,
+      "grad_norm": 0.08589910715818405,
+      "learning_rate": 0.0003322222222222222,
+      "loss": 0.0305,
+      "step": 403
+    },
+    {
+      "epoch": 0.10523573847356082,
+      "grad_norm": 0.07241426408290863,
+      "learning_rate": 0.0003316666666666667,
+      "loss": 0.0282,
+      "step": 404
+    },
+    {
+      "epoch": 0.105496222974733,
+      "grad_norm": 0.11648601293563843,
+      "learning_rate": 0.0003311111111111111,
+      "loss": 0.0157,
+      "step": 405
+    },
+    {
+      "epoch": 0.10575670747590518,
+      "grad_norm": 0.06991654634475708,
+      "learning_rate": 0.00033055555555555556,
+      "loss": 0.0289,
+      "step": 406
+    },
+    {
+      "epoch": 0.10601719197707736,
+      "grad_norm": 0.14965589344501495,
+      "learning_rate": 0.00033,
+      "loss": 0.0356,
+      "step": 407
+    },
+    {
+      "epoch": 0.10627767647824954,
+      "grad_norm": 0.11378724128007889,
+      "learning_rate": 0.00032944444444444444,
+      "loss": 0.0351,
+      "step": 408
+    },
+    {
+      "epoch": 0.10653816097942173,
+      "grad_norm": 0.13678783178329468,
+      "learning_rate": 0.0003288888888888889,
+      "loss": 0.0315,
+      "step": 409
+    },
+    {
+      "epoch": 0.10679864548059391,
+      "grad_norm": 0.09216827154159546,
+      "learning_rate": 0.0003283333333333333,
+      "loss": 0.0294,
+      "step": 410
+    },
+    {
+      "epoch": 0.10705912998176609,
+      "grad_norm": 0.08084861934185028,
+      "learning_rate": 0.0003277777777777778,
+      "loss": 0.0239,
+      "step": 411
+    },
+    {
+      "epoch": 0.10731961448293827,
+      "grad_norm": 0.0898488387465477,
+      "learning_rate": 0.00032722222222222224,
+      "loss": 0.0187,
+      "step": 412
+    },
+    {
+      "epoch": 0.10758009898411044,
+      "grad_norm": 0.14007185399532318,
+      "learning_rate": 0.0003266666666666667,
+      "loss": 0.0327,
+      "step": 413
+    },
+    {
+      "epoch": 0.10784058348528262,
+      "grad_norm": 0.0782909020781517,
+      "learning_rate": 0.0003261111111111111,
+      "loss": 0.0301,
+      "step": 414
+    },
+    {
+      "epoch": 0.1081010679864548,
+      "grad_norm": 0.12210501730442047,
+      "learning_rate": 0.00032555555555555555,
+      "loss": 0.0356,
+      "step": 415
+    },
+    {
+      "epoch": 0.10836155248762698,
+      "grad_norm": 0.10311099886894226,
+      "learning_rate": 0.00032500000000000004,
+      "loss": 0.0161,
+      "step": 416
+    },
+    {
+      "epoch": 0.10862203698879917,
+      "grad_norm": 0.10908329486846924,
+      "learning_rate": 0.0003244444444444444,
+      "loss": 0.0168,
+      "step": 417
+    },
+    {
+      "epoch": 0.10888252148997135,
+      "grad_norm": 0.09817449748516083,
+      "learning_rate": 0.0003238888888888889,
+      "loss": 0.0118,
+      "step": 418
+    },
+    {
+      "epoch": 0.10914300599114353,
+      "grad_norm": 0.061673179268836975,
+      "learning_rate": 0.0003233333333333333,
+      "loss": 0.0199,
+      "step": 419
+    },
+    {
+      "epoch": 0.10940349049231571,
+      "grad_norm": 0.12285150587558746,
+      "learning_rate": 0.0003227777777777778,
+      "loss": 0.0347,
+      "step": 420
+    },
+    {
+      "epoch": 0.10966397499348789,
+      "grad_norm": 0.1721012145280838,
+      "learning_rate": 0.0003222222222222222,
+      "loss": 0.0337,
+      "step": 421
+    },
+    {
+      "epoch": 0.10992445949466007,
+      "grad_norm": 0.10047407448291779,
+      "learning_rate": 0.00032166666666666666,
+      "loss": 0.0138,
+      "step": 422
+    },
+    {
+      "epoch": 0.11018494399583224,
+      "grad_norm": 0.09609969705343246,
+      "learning_rate": 0.00032111111111111115,
+      "loss": 0.0115,
+      "step": 423
+    },
+    {
+      "epoch": 0.11044542849700442,
+      "grad_norm": 0.17365185916423798,
+      "learning_rate": 0.00032055555555555554,
+      "loss": 0.0384,
+      "step": 424
+    },
+    {
+      "epoch": 0.11070591299817661,
+      "grad_norm": 0.2545756697654724,
+      "learning_rate": 0.00032,
+      "loss": 0.0486,
+      "step": 425
+    },
+    {
+      "epoch": 0.11096639749934879,
+      "grad_norm": 0.07526978850364685,
+      "learning_rate": 0.0003194444444444444,
+      "loss": 0.0224,
+      "step": 426
+    },
+    {
+      "epoch": 0.11122688200052097,
+      "grad_norm": 0.08974594622850418,
+      "learning_rate": 0.0003188888888888889,
+      "loss": 0.0134,
+      "step": 427
+    },
+    {
+      "epoch": 0.11148736650169315,
+      "grad_norm": 0.13914678990840912,
+      "learning_rate": 0.00031833333333333334,
+      "loss": 0.0163,
+      "step": 428
+    },
+    {
+      "epoch": 0.11174785100286533,
+      "grad_norm": 0.08919675648212433,
+      "learning_rate": 0.0003177777777777778,
+      "loss": 0.0126,
+      "step": 429
+    },
+    {
+      "epoch": 0.1120083355040375,
+      "grad_norm": 0.13034509122371674,
+      "learning_rate": 0.00031722222222222227,
+      "loss": 0.0289,
+      "step": 430
+    },
+    {
+      "epoch": 0.11226882000520969,
+      "grad_norm": 0.09612299501895905,
+      "learning_rate": 0.00031666666666666665,
+      "loss": 0.0183,
+      "step": 431
+    },
+    {
+      "epoch": 0.11252930450638186,
+      "grad_norm": 0.09348250180482864,
+      "learning_rate": 0.00031611111111111114,
+      "loss": 0.0322,
+      "step": 432
+    },
+    {
+      "epoch": 0.11278978900755406,
+      "grad_norm": 0.14950884878635406,
+      "learning_rate": 0.0003155555555555555,
+      "loss": 0.0411,
+      "step": 433
+    },
+    {
+      "epoch": 0.11305027350872623,
+      "grad_norm": 0.13522475957870483,
+      "learning_rate": 0.000315,
+      "loss": 0.039,
+      "step": 434
+    },
+    {
+      "epoch": 0.11331075800989841,
+      "grad_norm": 0.0809144377708435,
+      "learning_rate": 0.0003144444444444445,
+      "loss": 0.0305,
+      "step": 435
+    },
+    {
+      "epoch": 0.11357124251107059,
+      "grad_norm": 0.09236025810241699,
+      "learning_rate": 0.0003138888888888889,
+      "loss": 0.0131,
+      "step": 436
+    },
+    {
+      "epoch": 0.11383172701224277,
+      "grad_norm": 0.15586403012275696,
+      "learning_rate": 0.0003133333333333334,
+      "loss": 0.037,
+      "step": 437
+    },
+    {
+      "epoch": 0.11409221151341495,
+      "grad_norm": 0.12417721003293991,
+      "learning_rate": 0.00031277777777777776,
+      "loss": 0.0375,
+      "step": 438
+    },
+    {
+      "epoch": 0.11435269601458713,
+      "grad_norm": 0.09874928742647171,
+      "learning_rate": 0.00031222222222222225,
+      "loss": 0.0311,
+      "step": 439
+    },
+    {
+      "epoch": 0.11461318051575932,
+      "grad_norm": 0.18468396365642548,
+      "learning_rate": 0.00031166666666666663,
+      "loss": 0.0391,
+      "step": 440
+    },
+    {
+      "epoch": 0.1148736650169315,
+      "grad_norm": 0.12530437111854553,
+      "learning_rate": 0.0003111111111111111,
+      "loss": 0.0329,
+      "step": 441
+    },
+    {
+      "epoch": 0.11513414951810368,
+      "grad_norm": 0.14760665595531464,
+      "learning_rate": 0.0003105555555555555,
+      "loss": 0.0329,
+      "step": 442
+    },
+    {
+      "epoch": 0.11539463401927585,
+      "grad_norm": 0.05746585503220558,
+      "learning_rate": 0.00031,
+      "loss": 0.0236,
+      "step": 443
+    },
+    {
+      "epoch": 0.11565511852044803,
+      "grad_norm": 0.09533420950174332,
+      "learning_rate": 0.0003094444444444445,
+      "loss": 0.0158,
+      "step": 444
+    },
+    {
+      "epoch": 0.11591560302162021,
+      "grad_norm": 0.1520976573228836,
+      "learning_rate": 0.0003088888888888889,
+      "loss": 0.0129,
+      "step": 445
+    },
+    {
+      "epoch": 0.11617608752279239,
+      "grad_norm": 0.07816439867019653,
+      "learning_rate": 0.00030833333333333337,
+      "loss": 0.0301,
+      "step": 446
+    },
+    {
+      "epoch": 0.11643657202396457,
+      "grad_norm": 0.08245697617530823,
+      "learning_rate": 0.00030777777777777775,
+      "loss": 0.0322,
+      "step": 447
+    },
+    {
+      "epoch": 0.11669705652513676,
+      "grad_norm": 0.0839148461818695,
+      "learning_rate": 0.00030722222222222224,
+      "loss": 0.0295,
+      "step": 448
+    },
+    {
+      "epoch": 0.11695754102630894,
+      "grad_norm": 0.14252392947673798,
+      "learning_rate": 0.0003066666666666667,
+      "loss": 0.0397,
+      "step": 449
+    },
+    {
+      "epoch": 0.11721802552748112,
+      "grad_norm": 0.15466183423995972,
+      "learning_rate": 0.0003061111111111111,
+      "loss": 0.0147,
+      "step": 450
+    },
+    {
+      "epoch": 0.1174785100286533,
+      "grad_norm": 0.06841623783111572,
+      "learning_rate": 0.0003055555555555556,
+      "loss": 0.0261,
+      "step": 451
+    },
+    {
+      "epoch": 0.11773899452982547,
+      "grad_norm": 0.11777552217245102,
+      "learning_rate": 0.000305,
+      "loss": 0.0149,
+      "step": 452
+    },
+    {
+      "epoch": 0.11799947903099765,
+      "grad_norm": 0.09684286266565323,
+      "learning_rate": 0.0003044444444444445,
+      "loss": 0.0346,
+      "step": 453
+    },
+    {
+      "epoch": 0.11825996353216983,
+      "grad_norm": 0.06864480674266815,
+      "learning_rate": 0.00030388888888888886,
+      "loss": 0.0295,
+      "step": 454
+    },
+    {
+      "epoch": 0.11852044803334201,
+      "grad_norm": 0.08115808665752411,
+      "learning_rate": 0.00030333333333333335,
+      "loss": 0.0265,
+      "step": 455
+    },
+    {
+      "epoch": 0.1187809325345142,
+      "grad_norm": 0.11336829513311386,
+      "learning_rate": 0.0003027777777777778,
+      "loss": 0.0281,
+      "step": 456
+    },
+    {
+      "epoch": 0.11904141703568638,
+      "grad_norm": 0.0856219008564949,
+      "learning_rate": 0.0003022222222222222,
+      "loss": 0.0307,
+      "step": 457
+    },
+    {
+      "epoch": 0.11930190153685856,
+      "grad_norm": 0.07512032985687256,
+      "learning_rate": 0.0003016666666666667,
+      "loss": 0.0294,
+      "step": 458
+    },
+    {
+      "epoch": 0.11956238603803074,
+      "grad_norm": 0.10245606303215027,
+      "learning_rate": 0.0003011111111111111,
+      "loss": 0.0307,
+      "step": 459
+    },
+    {
+      "epoch": 0.11982287053920292,
+      "grad_norm": 0.08995255082845688,
+      "learning_rate": 0.0003005555555555556,
+      "loss": 0.0276,
+      "step": 460
+    },
+    {
+      "epoch": 0.1200833550403751,
+      "grad_norm": 0.08044669777154922,
+      "learning_rate": 0.0003,
+      "loss": 0.0269,
+      "step": 461
+    },
+    {
+      "epoch": 0.12034383954154727,
+      "grad_norm": 0.07865863293409348,
+      "learning_rate": 0.00029944444444444446,
+      "loss": 0.0315,
+      "step": 462
+    },
+    {
+      "epoch": 0.12060432404271945,
+      "grad_norm": 0.09515385329723358,
+      "learning_rate": 0.0002988888888888889,
+      "loss": 0.0312,
+      "step": 463
+    },
+    {
+      "epoch": 0.12086480854389164,
+      "grad_norm": 0.06927935779094696,
+      "learning_rate": 0.00029833333333333334,
+      "loss": 0.0159,
+      "step": 464
+    },
+    {
+      "epoch": 0.12112529304506382,
+      "grad_norm": 0.1430736929178238,
+      "learning_rate": 0.0002977777777777778,
+      "loss": 0.0163,
+      "step": 465
+    },
+    {
+      "epoch": 0.121385777546236,
+      "grad_norm": 0.09657033532857895,
+      "learning_rate": 0.0002972222222222222,
+      "loss": 0.0268,
+      "step": 466
+    },
+    {
+      "epoch": 0.12164626204740818,
+      "grad_norm": 0.06322190165519714,
+      "learning_rate": 0.0002966666666666667,
+      "loss": 0.0283,
+      "step": 467
+    },
+    {
+      "epoch": 0.12190674654858036,
+      "grad_norm": 0.09344757348299026,
+      "learning_rate": 0.0002961111111111111,
+      "loss": 0.0121,
+      "step": 468
+    },
+    {
+      "epoch": 0.12216723104975254,
+      "grad_norm": 0.08868320286273956,
+      "learning_rate": 0.0002955555555555556,
+      "loss": 0.0265,
+      "step": 469
+    },
+    {
+      "epoch": 0.12242771555092472,
+      "grad_norm": 0.11758578568696976,
+      "learning_rate": 0.000295,
+      "loss": 0.0165,
+      "step": 470
+    },
+    {
+      "epoch": 0.1226882000520969,
+      "grad_norm": 0.09136319160461426,
+      "learning_rate": 0.00029444444444444445,
+      "loss": 0.0184,
+      "step": 471
+    },
+    {
+      "epoch": 0.12294868455326909,
+      "grad_norm": 0.12742240726947784,
+      "learning_rate": 0.0002938888888888889,
+      "loss": 0.0311,
+      "step": 472
+    },
+    {
+      "epoch": 0.12320916905444126,
+      "grad_norm": 0.11562032252550125,
+      "learning_rate": 0.0002933333333333333,
+      "loss": 0.034,
+      "step": 473
+    },
+    {
+      "epoch": 0.12346965355561344,
+      "grad_norm": 0.07075949758291245,
+      "learning_rate": 0.0002927777777777778,
+      "loss": 0.0261,
+      "step": 474
+    },
+    {
+      "epoch": 0.12373013805678562,
+      "grad_norm": 0.20507678389549255,
+      "learning_rate": 0.0002922222222222222,
+      "loss": 0.0402,
+      "step": 475
+    },
+    {
+      "epoch": 0.1239906225579578,
+      "grad_norm": 0.06373949348926544,
+      "learning_rate": 0.0002916666666666667,
+      "loss": 0.0274,
+      "step": 476
+    },
+    {
+      "epoch": 0.12425110705912998,
+      "grad_norm": 0.06538847833871841,
+      "learning_rate": 0.00029111111111111113,
+      "loss": 0.025,
+      "step": 477
+    },
+    {
+      "epoch": 0.12451159156030216,
+      "grad_norm": 0.16323456168174744,
+      "learning_rate": 0.00029055555555555556,
+      "loss": 0.0366,
+      "step": 478
+    },
+    {
+      "epoch": 0.12477207606147435,
+      "grad_norm": 0.07732845097780228,
+      "learning_rate": 0.00029,
+      "loss": 0.0284,
+      "step": 479
+    },
+    {
+      "epoch": 0.12503256056264653,
+      "grad_norm": 0.06152462214231491,
+      "learning_rate": 0.00028944444444444444,
+      "loss": 0.0244,
+      "step": 480
+    },
+    {
+      "epoch": 0.1252930450638187,
+      "grad_norm": 0.10735264420509338,
+      "learning_rate": 0.0002888888888888889,
+      "loss": 0.0124,
+      "step": 481
+    },
+    {
+      "epoch": 0.12555352956499088,
+      "grad_norm": 0.07874636352062225,
+      "learning_rate": 0.0002883333333333333,
+      "loss": 0.0286,
+      "step": 482
+    },
+    {
+      "epoch": 0.12581401406616308,
+      "grad_norm": 0.07407646626234055,
+      "learning_rate": 0.0002877777777777778,
+      "loss": 0.0261,
+      "step": 483
+    },
+    {
+      "epoch": 0.12607449856733524,
+      "grad_norm": 0.07773268967866898,
+      "learning_rate": 0.00028722222222222224,
+      "loss": 0.0148,
+      "step": 484
+    },
+    {
+      "epoch": 0.12633498306850743,
+      "grad_norm": 0.0872015506029129,
+      "learning_rate": 0.0002866666666666667,
+      "loss": 0.0303,
+      "step": 485
+    },
+    {
+      "epoch": 0.1265954675696796,
+      "grad_norm": 0.08122217655181885,
+      "learning_rate": 0.0002861111111111111,
+      "loss": 0.0262,
+      "step": 486
+    },
+    {
+      "epoch": 0.1268559520708518,
+      "grad_norm": 0.1467408388853073,
+      "learning_rate": 0.00028555555555555555,
+      "loss": 0.0306,
+      "step": 487
+    },
+    {
+      "epoch": 0.12711643657202396,
+      "grad_norm": 0.07901852577924728,
+      "learning_rate": 0.000285,
+      "loss": 0.0147,
+      "step": 488
+    },
+    {
+      "epoch": 0.12737692107319615,
+      "grad_norm": 0.08532578498125076,
+      "learning_rate": 0.0002844444444444444,
+      "loss": 0.0258,
+      "step": 489
+    },
+    {
+      "epoch": 0.1276374055743683,
+      "grad_norm": 0.08219342678785324,
+      "learning_rate": 0.0002838888888888889,
+      "loss": 0.0252,
+      "step": 490
+    },
+    {
+      "epoch": 0.1278978900755405,
+      "grad_norm": 0.07493036240339279,
+      "learning_rate": 0.00028333333333333335,
+      "loss": 0.0259,
+      "step": 491
+    },
+    {
+      "epoch": 0.1281583745767127,
+      "grad_norm": 0.10418077558279037,
+      "learning_rate": 0.0002827777777777778,
+      "loss": 0.0274,
+      "step": 492
+    },
+    {
+      "epoch": 0.12841885907788486,
+      "grad_norm": 0.07653704285621643,
+      "learning_rate": 0.00028222222222222223,
+      "loss": 0.0281,
+      "step": 493
+    },
+    {
+      "epoch": 0.12867934357905705,
+      "grad_norm": 0.06863569468259811,
+      "learning_rate": 0.00028166666666666666,
+      "loss": 0.0185,
+      "step": 494
+    },
+    {
+      "epoch": 0.12893982808022922,
+      "grad_norm": 0.08756248652935028,
+      "learning_rate": 0.0002811111111111111,
+      "loss": 0.0298,
+      "step": 495
+    },
+    {
+      "epoch": 0.1292003125814014,
+      "grad_norm": 0.12433123588562012,
+      "learning_rate": 0.00028055555555555554,
+      "loss": 0.0337,
+      "step": 496
+    },
+    {
+      "epoch": 0.12946079708257358,
+      "grad_norm": 0.14893244206905365,
+      "learning_rate": 0.00028000000000000003,
+      "loss": 0.0381,
+      "step": 497
+    },
+    {
+      "epoch": 0.12972128158374577,
+      "grad_norm": 0.13264304399490356,
+      "learning_rate": 0.00027944444444444447,
+      "loss": 0.0279,
+      "step": 498
+    },
+    {
+      "epoch": 0.12998176608491796,
+      "grad_norm": 0.12043178826570511,
+      "learning_rate": 0.0002788888888888889,
+      "loss": 0.0143,
+      "step": 499
+    },
+    {
+      "epoch": 0.13024225058609012,
+      "grad_norm": 0.09020431339740753,
+      "learning_rate": 0.00027833333333333334,
+      "loss": 0.0328,
+      "step": 500
+    },
+    {
+      "epoch": 0.13050273508726232,
+      "grad_norm": 0.0732375830411911,
+      "learning_rate": 0.0002777777777777778,
+      "loss": 0.0331,
+      "step": 501
+    },
+    {
+      "epoch": 0.13076321958843448,
+      "grad_norm": 0.13811439275741577,
+      "learning_rate": 0.0002772222222222222,
+      "loss": 0.0375,
+      "step": 502
+    },
+    {
+      "epoch": 0.13102370408960667,
+      "grad_norm": 0.14354398846626282,
+      "learning_rate": 0.00027666666666666665,
+      "loss": 0.0319,
+      "step": 503
+    },
+    {
+      "epoch": 0.13128418859077884,
+      "grad_norm": 0.12450239062309265,
+      "learning_rate": 0.0002761111111111111,
+      "loss": 0.0141,
+      "step": 504
+    },
+    {
+      "epoch": 0.13154467309195103,
+      "grad_norm": 0.10675760358572006,
+      "learning_rate": 0.0002755555555555556,
+      "loss": 0.0178,
+      "step": 505
+    },
+    {
+      "epoch": 0.1318051575931232,
+      "grad_norm": 0.08877679705619812,
+      "learning_rate": 0.000275,
+      "loss": 0.0202,
+      "step": 506
+    },
+    {
+      "epoch": 0.1320656420942954,
+      "grad_norm": 0.08938851207494736,
+      "learning_rate": 0.00027444444444444445,
+      "loss": 0.0155,
+      "step": 507
+    },
+    {
+      "epoch": 0.13232612659546758,
+      "grad_norm": 0.08893389254808426,
+      "learning_rate": 0.0002738888888888889,
+      "loss": 0.0218,
+      "step": 508
+    },
+    {
+      "epoch": 0.13258661109663974,
+      "grad_norm": 0.09104225039482117,
+      "learning_rate": 0.00027333333333333333,
+      "loss": 0.0137,
+      "step": 509
+    },
+    {
+      "epoch": 0.13284709559781194,
+      "grad_norm": 0.09981877356767654,
+      "learning_rate": 0.00027277777777777776,
+      "loss": 0.0336,
+      "step": 510
+    },
+    {
+      "epoch": 0.1331075800989841,
+      "grad_norm": 0.1007775142788887,
+      "learning_rate": 0.0002722222222222222,
+      "loss": 0.0315,
+      "step": 511
+    },
+    {
+      "epoch": 0.1333680646001563,
+      "grad_norm": 0.18223972618579865,
+      "learning_rate": 0.0002716666666666667,
+      "loss": 0.0153,
+      "step": 512
+    },
+    {
+      "epoch": 0.13362854910132846,
+      "grad_norm": 0.06076887995004654,
+      "learning_rate": 0.00027111111111111113,
+      "loss": 0.0262,
+      "step": 513
+    },
+    {
+      "epoch": 0.13388903360250065,
+      "grad_norm": 0.08838528394699097,
+      "learning_rate": 0.00027055555555555557,
+      "loss": 0.0298,
+      "step": 514
+    },
+    {
+      "epoch": 0.13414951810367284,
+      "grad_norm": 0.09148416668176651,
+      "learning_rate": 0.00027,
+      "loss": 0.0284,
+      "step": 515
+    },
+    {
+      "epoch": 0.134410002604845,
+      "grad_norm": 0.059513382613658905,
+      "learning_rate": 0.00026944444444444444,
+      "loss": 0.0128,
+      "step": 516
+    },
+    {
+      "epoch": 0.1346704871060172,
+      "grad_norm": 0.06906077265739441,
+      "learning_rate": 0.00026888888888888893,
+      "loss": 0.0323,
+      "step": 517
+    },
+    {
+      "epoch": 0.13493097160718937,
+      "grad_norm": 0.05184634402394295,
+      "learning_rate": 0.0002683333333333333,
+      "loss": 0.0227,
+      "step": 518
+    },
+    {
+      "epoch": 0.13519145610836156,
+      "grad_norm": 0.1272115856409073,
+      "learning_rate": 0.0002677777777777778,
+      "loss": 0.03,
+      "step": 519
+    },
+    {
+      "epoch": 0.13545194060953372,
+      "grad_norm": 0.061580728739500046,
+      "learning_rate": 0.00026722222222222224,
+      "loss": 0.0274,
+      "step": 520
+    },
+    {
+      "epoch": 0.13571242511070591,
+      "grad_norm": 0.09900429099798203,
+      "learning_rate": 0.0002666666666666667,
+      "loss": 0.0363,
+      "step": 521
+    },
+    {
+      "epoch": 0.1359729096118781,
+      "grad_norm": 0.05738164857029915,
+      "learning_rate": 0.0002661111111111111,
+      "loss": 0.0224,
+      "step": 522
+    },
+    {
+      "epoch": 0.13623339411305027,
+      "grad_norm": 0.12190201878547668,
+      "learning_rate": 0.00026555555555555555,
+      "loss": 0.0325,
+      "step": 523
+    },
+    {
+      "epoch": 0.13649387861422246,
+      "grad_norm": 0.103856660425663,
+      "learning_rate": 0.00026500000000000004,
+      "loss": 0.0324,
+      "step": 524
+    },
+    {
+      "epoch": 0.13675436311539463,
+      "grad_norm": 0.06949201226234436,
+      "learning_rate": 0.00026444444444444443,
+      "loss": 0.0266,
+      "step": 525
+    },
+    {
+      "epoch": 0.13701484761656682,
+      "grad_norm": 0.09444725513458252,
+      "learning_rate": 0.0002638888888888889,
+      "loss": 0.0281,
+      "step": 526
+    },
+    {
+      "epoch": 0.13727533211773899,
+      "grad_norm": 0.08735501766204834,
+      "learning_rate": 0.0002633333333333333,
+      "loss": 0.0312,
+      "step": 527
+    },
+    {
+      "epoch": 0.13753581661891118,
+      "grad_norm": 0.14742058515548706,
+      "learning_rate": 0.0002627777777777778,
+      "loss": 0.0139,
+      "step": 528
+    },
+    {
+      "epoch": 0.13779630112008334,
+      "grad_norm": 0.07838045060634613,
+      "learning_rate": 0.00026222222222222223,
+      "loss": 0.0202,
+      "step": 529
+    },
+    {
+      "epoch": 0.13805678562125553,
+      "grad_norm": 0.09870190173387527,
+      "learning_rate": 0.00026166666666666667,
+      "loss": 0.0339,
+      "step": 530
+    },
+    {
+      "epoch": 0.13831727012242773,
+      "grad_norm": 0.08367832750082016,
+      "learning_rate": 0.00026111111111111116,
+      "loss": 0.032,
+      "step": 531
+    },
+    {
+      "epoch": 0.1385777546235999,
+      "grad_norm": 0.106461301445961,
+      "learning_rate": 0.00026055555555555554,
+      "loss": 0.0316,
+      "step": 532
+    },
+    {
+      "epoch": 0.13883823912477208,
+      "grad_norm": 0.0788130983710289,
+      "learning_rate": 0.00026000000000000003,
+      "loss": 0.0282,
+      "step": 533
+    },
+    {
+      "epoch": 0.13909872362594425,
+      "grad_norm": 0.07943234592676163,
+      "learning_rate": 0.0002594444444444444,
+      "loss": 0.0258,
+      "step": 534
+    },
+    {
+      "epoch": 0.13935920812711644,
+      "grad_norm": 0.09006848186254501,
+      "learning_rate": 0.0002588888888888889,
+      "loss": 0.0233,
+      "step": 535
+    },
+    {
+      "epoch": 0.1396196926282886,
+      "grad_norm": 0.08987578749656677,
+      "learning_rate": 0.00025833333333333334,
+      "loss": 0.0351,
+      "step": 536
+    },
+    {
+      "epoch": 0.1398801771294608,
+      "grad_norm": 0.17907755076885223,
+      "learning_rate": 0.0002577777777777778,
+      "loss": 0.0405,
+      "step": 537
+    },
+    {
+      "epoch": 0.140140661630633,
+      "grad_norm": 0.09651251882314682,
+      "learning_rate": 0.00025722222222222227,
+      "loss": 0.031,
+      "step": 538
+    },
+    {
+      "epoch": 0.14040114613180515,
+      "grad_norm": 0.1170162782073021,
+      "learning_rate": 0.00025666666666666665,
+      "loss": 0.0158,
+      "step": 539
+    },
+    {
+      "epoch": 0.14066163063297735,
+      "grad_norm": 0.06463764607906342,
+      "learning_rate": 0.00025611111111111114,
+      "loss": 0.0248,
+      "step": 540
+    },
+    {
+      "epoch": 0.1409221151341495,
+      "grad_norm": 0.10253310203552246,
+      "learning_rate": 0.00025555555555555553,
+      "loss": 0.0138,
+      "step": 541
+    },
+    {
+      "epoch": 0.1411825996353217,
+      "grad_norm": 0.07588808983564377,
+      "learning_rate": 0.000255,
+      "loss": 0.0138,
+      "step": 542
+    },
+    {
+      "epoch": 0.14144308413649387,
+      "grad_norm": 0.07522521913051605,
+      "learning_rate": 0.0002544444444444444,
+      "loss": 0.0237,
+      "step": 543
+    },
+    {
+      "epoch": 0.14170356863766606,
+      "grad_norm": 0.0933796614408493,
+      "learning_rate": 0.0002538888888888889,
+      "loss": 0.0296,
+      "step": 544
+    },
+    {
+      "epoch": 0.14196405313883823,
+      "grad_norm": 0.06249965354800224,
+      "learning_rate": 0.0002533333333333334,
+      "loss": 0.0134,
+      "step": 545
+    },
+    {
+      "epoch": 0.14222453764001042,
+      "grad_norm": 0.12490858137607574,
+      "learning_rate": 0.00025277777777777777,
+      "loss": 0.0129,
+      "step": 546
+    },
+    {
+      "epoch": 0.1424850221411826,
+      "grad_norm": 0.08053889870643616,
+      "learning_rate": 0.00025222222222222226,
+      "loss": 0.0231,
+      "step": 547
+    },
+    {
+      "epoch": 0.14274550664235477,
+      "grad_norm": 0.06593835353851318,
+      "learning_rate": 0.00025166666666666664,
+      "loss": 0.0238,
+      "step": 548
+    },
+    {
+      "epoch": 0.14300599114352697,
+      "grad_norm": 0.05316567048430443,
+      "learning_rate": 0.00025111111111111113,
+      "loss": 0.0225,
+      "step": 549
+    },
+    {
+      "epoch": 0.14326647564469913,
+      "grad_norm": 0.0695248618721962,
+      "learning_rate": 0.0002505555555555555,
+      "loss": 0.0294,
+      "step": 550
+    },
+    {
+      "epoch": 0.14352696014587132,
+      "grad_norm": 0.08083239197731018,
+      "learning_rate": 0.00025,
+      "loss": 0.0324,
+      "step": 551
+    },
+    {
+      "epoch": 0.1437874446470435,
+      "grad_norm": 0.06797367334365845,
+      "learning_rate": 0.00024944444444444444,
+      "loss": 0.0238,
+      "step": 552
+    },
+    {
+      "epoch": 0.14404792914821568,
+      "grad_norm": 0.07074914127588272,
+      "learning_rate": 0.0002488888888888889,
+      "loss": 0.0128,
+      "step": 553
+    },
+    {
+      "epoch": 0.14430841364938787,
+      "grad_norm": 0.06064210459589958,
+      "learning_rate": 0.0002483333333333333,
+      "loss": 0.0258,
+      "step": 554
+    },
+    {
+      "epoch": 0.14456889815056004,
+      "grad_norm": 0.04608076810836792,
+      "learning_rate": 0.0002477777777777778,
+      "loss": 0.0212,
+      "step": 555
+    },
+    {
+      "epoch": 0.14482938265173223,
+      "grad_norm": 0.08264319598674774,
+      "learning_rate": 0.00024722222222222224,
+      "loss": 0.023,
+      "step": 556
+    },
+    {
+      "epoch": 0.1450898671529044,
+      "grad_norm": 0.06639570742845535,
+      "learning_rate": 0.0002466666666666667,
+      "loss": 0.0303,
+      "step": 557
+    },
+    {
+      "epoch": 0.1453503516540766,
+      "grad_norm": 0.11838199943304062,
+      "learning_rate": 0.0002461111111111111,
+      "loss": 0.0122,
+      "step": 558
+    },
+    {
+      "epoch": 0.14561083615524875,
+      "grad_norm": 0.08091646432876587,
+      "learning_rate": 0.00024555555555555556,
+      "loss": 0.0136,
+      "step": 559
+    },
+    {
+      "epoch": 0.14587132065642094,
+      "grad_norm": 0.06345510482788086,
+      "learning_rate": 0.000245,
+      "loss": 0.0238,
+      "step": 560
+    },
+    {
+      "epoch": 0.14613180515759314,
+      "grad_norm": 0.1701902151107788,
+      "learning_rate": 0.00024444444444444443,
+      "loss": 0.032,
+      "step": 561
+    },
+    {
+      "epoch": 0.1463922896587653,
+      "grad_norm": 0.10313287377357483,
+      "learning_rate": 0.0002438888888888889,
+      "loss": 0.0254,
+      "step": 562
+    },
+    {
+      "epoch": 0.1466527741599375,
+      "grad_norm": 0.09362874925136566,
+      "learning_rate": 0.00024333333333333336,
+      "loss": 0.0245,
+      "step": 563
+    },
+    {
+      "epoch": 0.14691325866110966,
+      "grad_norm": 0.0804872065782547,
+      "learning_rate": 0.0002427777777777778,
+      "loss": 0.0247,
+      "step": 564
+    },
+    {
+      "epoch": 0.14717374316228185,
+      "grad_norm": 0.16796275973320007,
+      "learning_rate": 0.00024222222222222223,
+      "loss": 0.0286,
+      "step": 565
+    },
+    {
+      "epoch": 0.14743422766345401,
+      "grad_norm": 0.13599511981010437,
+      "learning_rate": 0.00024166666666666667,
+      "loss": 0.0314,
+      "step": 566
+    },
+    {
+      "epoch": 0.1476947121646262,
+      "grad_norm": 0.04849720001220703,
+      "learning_rate": 0.0002411111111111111,
+      "loss": 0.0224,
+      "step": 567
+    },
+    {
+      "epoch": 0.14795519666579837,
+      "grad_norm": 0.06677763164043427,
+      "learning_rate": 0.00024055555555555554,
+      "loss": 0.01,
+      "step": 568
+    },
+    {
+      "epoch": 0.14821568116697056,
+      "grad_norm": 0.05729574337601662,
+      "learning_rate": 0.00024,
+      "loss": 0.024,
+      "step": 569
+    },
+    {
+      "epoch": 0.14847616566814276,
+      "grad_norm": 0.11312732100486755,
+      "learning_rate": 0.00023944444444444444,
+      "loss": 0.0264,
+      "step": 570
+    },
+    {
+      "epoch": 0.14873665016931492,
+      "grad_norm": 0.058812785893678665,
+      "learning_rate": 0.0002388888888888889,
+      "loss": 0.0282,
+      "step": 571
+    },
+    {
+      "epoch": 0.1489971346704871,
+      "grad_norm": 0.06908426433801651,
+      "learning_rate": 0.00023833333333333334,
+      "loss": 0.0176,
+      "step": 572
+    },
+    {
+      "epoch": 0.14925761917165928,
+      "grad_norm": 0.06809442490339279,
+      "learning_rate": 0.00023777777777777778,
+      "loss": 0.0253,
+      "step": 573
+    },
+    {
+      "epoch": 0.14951810367283147,
+      "grad_norm": 0.12843027710914612,
+      "learning_rate": 0.00023722222222222222,
+      "loss": 0.0123,
+      "step": 574
+    },
+    {
+      "epoch": 0.14977858817400364,
+      "grad_norm": 0.1014859676361084,
+      "learning_rate": 0.00023666666666666668,
+      "loss": 0.0203,
+      "step": 575
+    },
+    {
+      "epoch": 0.15003907267517583,
+      "grad_norm": 0.08219820261001587,
+      "learning_rate": 0.00023611111111111112,
+      "loss": 0.0158,
+      "step": 576
+    },
+    {
+      "epoch": 0.15029955717634802,
+      "grad_norm": 0.0556500069797039,
+      "learning_rate": 0.00023555555555555556,
+      "loss": 0.0128,
+      "step": 577
+    },
+    {
+      "epoch": 0.15056004167752018,
+      "grad_norm": 0.07403457164764404,
+      "learning_rate": 0.000235,
+      "loss": 0.0177,
+      "step": 578
+    },
+    {
+      "epoch": 0.15082052617869238,
+      "grad_norm": 0.05322965234518051,
+      "learning_rate": 0.00023444444444444446,
+      "loss": 0.0242,
+      "step": 579
+    },
+    {
+      "epoch": 0.15108101067986454,
+      "grad_norm": 0.1096285954117775,
+      "learning_rate": 0.0002338888888888889,
+      "loss": 0.0302,
+      "step": 580
+    },
+    {
+      "epoch": 0.15134149518103673,
+      "grad_norm": 0.047204066067934036,
+      "learning_rate": 0.00023333333333333333,
+      "loss": 0.0202,
+      "step": 581
+    },
+    {
+      "epoch": 0.1516019796822089,
+      "grad_norm": 0.07682661712169647,
+      "learning_rate": 0.0002327777777777778,
+      "loss": 0.0277,
+      "step": 582
+    },
+    {
+      "epoch": 0.1518624641833811,
+      "grad_norm": 0.0762876346707344,
+      "learning_rate": 0.00023222222222222223,
+      "loss": 0.0282,
+      "step": 583
+    },
+    {
+      "epoch": 0.15212294868455326,
+      "grad_norm": 0.24234281480312347,
+      "learning_rate": 0.00023166666666666667,
+      "loss": 0.0425,
+      "step": 584
+    },
+    {
+      "epoch": 0.15238343318572545,
+      "grad_norm": 0.10953521728515625,
+      "learning_rate": 0.0002311111111111111,
+      "loss": 0.0321,
+      "step": 585
+    },
+    {
+      "epoch": 0.15264391768689764,
+      "grad_norm": 0.05023661628365517,
+      "learning_rate": 0.00023055555555555557,
+      "loss": 0.0115,
+      "step": 586
+    },
+    {
+      "epoch": 0.1529044021880698,
+      "grad_norm": 0.06634744256734848,
+      "learning_rate": 0.00023,
+      "loss": 0.0135,
+      "step": 587
+    },
+    {
+      "epoch": 0.153164886689242,
+      "grad_norm": 0.06897839158773422,
+      "learning_rate": 0.00022944444444444444,
+      "loss": 0.0256,
+      "step": 588
+    },
+    {
+      "epoch": 0.15342537119041416,
+      "grad_norm": 0.06687745451927185,
+      "learning_rate": 0.0002288888888888889,
+      "loss": 0.0176,
+      "step": 589
+    },
+    {
+      "epoch": 0.15368585569158635,
+      "grad_norm": 0.053371988236904144,
+      "learning_rate": 0.00022833333333333334,
+      "loss": 0.0221,
+      "step": 590
+    },
+    {
+      "epoch": 0.15394634019275852,
+      "grad_norm": 0.04696241766214371,
+      "learning_rate": 0.00022777777777777778,
+      "loss": 0.022,
+      "step": 591
+    },
+    {
+      "epoch": 0.1542068246939307,
+      "grad_norm": 0.07133936136960983,
+      "learning_rate": 0.00022722222222222222,
+      "loss": 0.0279,
+      "step": 592
+    },
+    {
+      "epoch": 0.1544673091951029,
+      "grad_norm": 0.06719057261943817,
+      "learning_rate": 0.00022666666666666666,
+      "loss": 0.0248,
+      "step": 593
+    },
+    {
+      "epoch": 0.15472779369627507,
+      "grad_norm": 0.04650402441620827,
+      "learning_rate": 0.00022611111111111112,
+      "loss": 0.0116,
+      "step": 594
+    },
+    {
+      "epoch": 0.15498827819744726,
+      "grad_norm": 0.07789336889982224,
+      "learning_rate": 0.00022555555555555556,
+      "loss": 0.0279,
+      "step": 595
+    },
+    {
+      "epoch": 0.15524876269861942,
+      "grad_norm": 0.04695094749331474,
+      "learning_rate": 0.00022500000000000002,
+      "loss": 0.0254,
+      "step": 596
+    },
+    {
+      "epoch": 0.15550924719979162,
+      "grad_norm": 0.11447637528181076,
+      "learning_rate": 0.00022444444444444446,
+      "loss": 0.0299,
+      "step": 597
+    },
+    {
+      "epoch": 0.15576973170096378,
+      "grad_norm": 0.059543974697589874,
+      "learning_rate": 0.0002238888888888889,
+      "loss": 0.0262,
+      "step": 598
+    },
+    {
+      "epoch": 0.15603021620213597,
+      "grad_norm": 0.11290789395570755,
+      "learning_rate": 0.00022333333333333333,
+      "loss": 0.035,
+      "step": 599
+    },
+    {
+      "epoch": 0.15629070070330817,
+      "grad_norm": 0.0641266405582428,
+      "learning_rate": 0.00022277777777777777,
+      "loss": 0.0239,
+      "step": 600
+    },
+    {
+      "epoch": 0.15655118520448033,
+      "grad_norm": 0.06315059214830399,
+      "learning_rate": 0.0002222222222222222,
+      "loss": 0.0248,
+      "step": 601
+    },
+    {
+      "epoch": 0.15681166970565252,
+      "grad_norm": 0.08061481267213821,
+      "learning_rate": 0.00022166666666666667,
+      "loss": 0.0263,
+      "step": 602
+    },
+    {
+      "epoch": 0.1570721542068247,
+      "grad_norm": 0.07757250219583511,
+      "learning_rate": 0.00022111111111111113,
+      "loss": 0.0197,
+      "step": 603
+    },
+    {
+      "epoch": 0.15733263870799688,
+      "grad_norm": 0.10401252657175064,
+      "learning_rate": 0.00022055555555555557,
+      "loss": 0.0136,
+      "step": 604
+    },
+    {
+      "epoch": 0.15759312320916904,
+      "grad_norm": 0.10702945291996002,
+      "learning_rate": 0.00022,
+      "loss": 0.03,
+      "step": 605
+    },
+    {
+      "epoch": 0.15785360771034124,
+      "grad_norm": 0.08410105109214783,
+      "learning_rate": 0.00021944444444444444,
+      "loss": 0.0297,
+      "step": 606
+    },
+    {
+      "epoch": 0.1581140922115134,
+      "grad_norm": 0.11355652660131454,
+      "learning_rate": 0.00021888888888888888,
+      "loss": 0.0328,
+      "step": 607
+    },
+    {
+      "epoch": 0.1583745767126856,
+      "grad_norm": 0.0707426518201828,
+      "learning_rate": 0.00021833333333333332,
+      "loss": 0.0265,
+      "step": 608
+    },
+    {
+      "epoch": 0.1586350612138578,
+      "grad_norm": 0.07182347774505615,
+      "learning_rate": 0.00021777777777777776,
+      "loss": 0.0267,
+      "step": 609
+    },
+    {
+      "epoch": 0.15889554571502995,
+      "grad_norm": 0.04591052234172821,
+      "learning_rate": 0.00021722222222222225,
+      "loss": 0.0253,
+      "step": 610
+    },
+    {
+      "epoch": 0.15915603021620214,
+      "grad_norm": 0.0790751576423645,
+      "learning_rate": 0.00021666666666666668,
+      "loss": 0.0306,
+      "step": 611
+    },
+    {
+      "epoch": 0.1594165147173743,
+      "grad_norm": 0.10152866691350937,
+      "learning_rate": 0.00021611111111111112,
+      "loss": 0.0158,
+      "step": 612
+    },
+    {
+      "epoch": 0.1596769992185465,
+      "grad_norm": 0.057877685874700546,
+      "learning_rate": 0.00021555555555555556,
+      "loss": 0.0273,
+      "step": 613
+    },
+    {
+      "epoch": 0.15993748371971866,
+      "grad_norm": 0.0936063900589943,
+      "learning_rate": 0.000215,
+      "loss": 0.0143,
+      "step": 614
+    },
+    {
+      "epoch": 0.16019796822089086,
+      "grad_norm": 0.0547916404902935,
+      "learning_rate": 0.00021444444444444443,
+      "loss": 0.0268,
+      "step": 615
+    },
+    {
+      "epoch": 0.16045845272206305,
+      "grad_norm": 0.0787026435136795,
+      "learning_rate": 0.0002138888888888889,
+      "loss": 0.0269,
+      "step": 616
+    },
+    {
+      "epoch": 0.16071893722323521,
+      "grad_norm": 0.08008725196123123,
+      "learning_rate": 0.00021333333333333336,
+      "loss": 0.0311,
+      "step": 617
+    },
+    {
+      "epoch": 0.1609794217244074,
+      "grad_norm": 0.07616233080625534,
+      "learning_rate": 0.0002127777777777778,
+      "loss": 0.0268,
+      "step": 618
+    },
+    {
+      "epoch": 0.16123990622557957,
+      "grad_norm": 0.06189870834350586,
+      "learning_rate": 0.00021222222222222223,
+      "loss": 0.0254,
+      "step": 619
+    },
+    {
+      "epoch": 0.16150039072675176,
+      "grad_norm": 0.06891774386167526,
+      "learning_rate": 0.00021166666666666667,
+      "loss": 0.0272,
+      "step": 620
+    },
+    {
+      "epoch": 0.16176087522792393,
+      "grad_norm": 0.08759953081607819,
+      "learning_rate": 0.0002111111111111111,
+      "loss": 0.0214,
+      "step": 621
+    },
+    {
+      "epoch": 0.16202135972909612,
+      "grad_norm": 0.05686959624290466,
+      "learning_rate": 0.00021055555555555554,
+      "loss": 0.0203,
+      "step": 622
+    },
+    {
+      "epoch": 0.1622818442302683,
+      "grad_norm": 0.09747463464736938,
+      "learning_rate": 0.00021,
+      "loss": 0.0245,
+      "step": 623
+    },
+    {
+      "epoch": 0.16254232873144048,
+      "grad_norm": 0.04596638306975365,
+      "learning_rate": 0.00020944444444444445,
+      "loss": 0.0248,
+      "step": 624
+    },
+    {
+      "epoch": 0.16280281323261267,
+      "grad_norm": 0.08737804740667343,
+      "learning_rate": 0.0002088888888888889,
+      "loss": 0.0314,
+      "step": 625
+    },
+    {
+      "epoch": 0.16306329773378483,
+      "grad_norm": 0.07385338097810745,
+      "learning_rate": 0.00020833333333333335,
+      "loss": 0.0292,
+      "step": 626
+    },
+    {
+      "epoch": 0.16332378223495703,
+      "grad_norm": 0.0636276826262474,
+      "learning_rate": 0.00020777777777777778,
+      "loss": 0.0268,
+      "step": 627
+    },
+    {
+      "epoch": 0.1635842667361292,
+      "grad_norm": 0.07636599242687225,
+      "learning_rate": 0.00020722222222222222,
+      "loss": 0.0274,
+      "step": 628
+    },
+    {
+      "epoch": 0.16384475123730138,
+      "grad_norm": 0.09945692121982574,
+      "learning_rate": 0.00020666666666666666,
+      "loss": 0.0155,
+      "step": 629
+    },
+    {
+      "epoch": 0.16410523573847355,
+      "grad_norm": 0.08397584408521652,
+      "learning_rate": 0.00020611111111111112,
+      "loss": 0.0281,
+      "step": 630
+    },
+    {
+      "epoch": 0.16436572023964574,
+      "grad_norm": 0.061054617166519165,
+      "learning_rate": 0.00020555555555555556,
+      "loss": 0.0255,
+      "step": 631
+    },
+    {
+      "epoch": 0.16462620474081793,
+      "grad_norm": 0.059529922902584076,
+      "learning_rate": 0.000205,
+      "loss": 0.0207,
+      "step": 632
+    },
+    {
+      "epoch": 0.1648866892419901,
+      "grad_norm": 0.0805809274315834,
+      "learning_rate": 0.00020444444444444446,
+      "loss": 0.0242,
+      "step": 633
+    },
+    {
+      "epoch": 0.1651471737431623,
+      "grad_norm": 0.15958362817764282,
+      "learning_rate": 0.0002038888888888889,
+      "loss": 0.0345,
+      "step": 634
+    },
+    {
+      "epoch": 0.16540765824433445,
+      "grad_norm": 0.07853345572948456,
+      "learning_rate": 0.00020333333333333333,
+      "loss": 0.0091,
+      "step": 635
+    },
+    {
+      "epoch": 0.16566814274550665,
+      "grad_norm": 0.1266714334487915,
+      "learning_rate": 0.00020277777777777777,
+      "loss": 0.0132,
+      "step": 636
+    },
+    {
+      "epoch": 0.1659286272466788,
+      "grad_norm": 0.08651676028966904,
+      "learning_rate": 0.00020222222222222223,
+      "loss": 0.0249,
+      "step": 637
+    },
+    {
+      "epoch": 0.166189111747851,
+      "grad_norm": 0.0951671376824379,
+      "learning_rate": 0.00020166666666666667,
+      "loss": 0.0292,
+      "step": 638
+    },
+    {
+      "epoch": 0.1664495962490232,
+      "grad_norm": 0.06016332656145096,
+      "learning_rate": 0.0002011111111111111,
+      "loss": 0.0186,
+      "step": 639
+    },
+    {
+      "epoch": 0.16671008075019536,
+      "grad_norm": 0.03814365342259407,
+      "learning_rate": 0.00020055555555555555,
+      "loss": 0.0136,
+      "step": 640
+    },
+    {
+      "epoch": 0.16697056525136755,
+      "grad_norm": 0.12929277122020721,
+      "learning_rate": 0.0002,
+      "loss": 0.0368,
+      "step": 641
+    },
+    {
+      "epoch": 0.16723104975253972,
+      "grad_norm": 0.10242114216089249,
+      "learning_rate": 0.00019944444444444445,
+      "loss": 0.0143,
+      "step": 642
+    },
+    {
+      "epoch": 0.1674915342537119,
+      "grad_norm": 0.09811166673898697,
+      "learning_rate": 0.00019888888888888888,
+      "loss": 0.0325,
+      "step": 643
+    },
+    {
+      "epoch": 0.16775201875488407,
+      "grad_norm": 0.05625639855861664,
+      "learning_rate": 0.00019833333333333335,
+      "loss": 0.025,
+      "step": 644
+    },
+    {
+      "epoch": 0.16801250325605627,
+      "grad_norm": 0.12204310297966003,
+      "learning_rate": 0.00019777777777777778,
+      "loss": 0.0318,
+      "step": 645
+    },
+    {
+      "epoch": 0.16827298775722843,
+      "grad_norm": 0.04521187022328377,
+      "learning_rate": 0.00019722222222222222,
+      "loss": 0.0218,
+      "step": 646
+    },
+    {
+      "epoch": 0.16853347225840062,
+      "grad_norm": 0.0765163004398346,
+      "learning_rate": 0.00019666666666666666,
+      "loss": 0.0205,
+      "step": 647
+    },
+    {
+      "epoch": 0.16879395675957282,
+      "grad_norm": 0.06920886039733887,
+      "learning_rate": 0.00019611111111111112,
+      "loss": 0.0195,
+      "step": 648
+    },
+    {
+      "epoch": 0.16905444126074498,
+      "grad_norm": 0.07708664238452911,
+      "learning_rate": 0.00019555555555555556,
+      "loss": 0.0149,
+      "step": 649
+    },
+    {
+      "epoch": 0.16931492576191717,
+      "grad_norm": 0.05546024441719055,
+      "learning_rate": 0.00019500000000000002,
+      "loss": 0.0223,
+      "step": 650
+    },
+    {
+      "epoch": 0.16957541026308934,
+      "grad_norm": 0.04842771962285042,
+      "learning_rate": 0.00019444444444444446,
+      "loss": 0.0205,
+      "step": 651
+    },
+    {
+      "epoch": 0.16983589476426153,
+      "grad_norm": 0.16406475007534027,
+      "learning_rate": 0.0001938888888888889,
+      "loss": 0.0318,
+      "step": 652
+    },
+    {
+      "epoch": 0.1700963792654337,
+      "grad_norm": 0.0651065856218338,
+      "learning_rate": 0.00019333333333333333,
+      "loss": 0.0203,
+      "step": 653
+    },
+    {
+      "epoch": 0.1703568637666059,
+      "grad_norm": 0.10570181161165237,
+      "learning_rate": 0.00019277777777777777,
+      "loss": 0.0323,
+      "step": 654
+    },
+    {
+      "epoch": 0.17061734826777808,
+      "grad_norm": 0.04322061687707901,
+      "learning_rate": 0.0001922222222222222,
+      "loss": 0.0218,
+      "step": 655
+    },
+    {
+      "epoch": 0.17087783276895024,
+      "grad_norm": 0.10667745769023895,
+      "learning_rate": 0.00019166666666666667,
+      "loss": 0.0313,
+      "step": 656
+    },
+    {
+      "epoch": 0.17113831727012244,
+      "grad_norm": 0.09213763475418091,
+      "learning_rate": 0.00019111111111111114,
+      "loss": 0.0272,
+      "step": 657
+    },
+    {
+      "epoch": 0.1713988017712946,
+      "grad_norm": 0.05325629189610481,
+      "learning_rate": 0.00019055555555555557,
+      "loss": 0.0223,
+      "step": 658
+    },
+    {
+      "epoch": 0.1716592862724668,
+      "grad_norm": 0.1126520112156868,
+      "learning_rate": 0.00019,
+      "loss": 0.0281,
+      "step": 659
+    },
+    {
+      "epoch": 0.17191977077363896,
+      "grad_norm": 0.09467708319425583,
+      "learning_rate": 0.00018944444444444445,
+      "loss": 0.0294,
+      "step": 660
+    },
+    {
+      "epoch": 0.17218025527481115,
+      "grad_norm": 0.05817677453160286,
+      "learning_rate": 0.00018888888888888888,
+      "loss": 0.0265,
+      "step": 661
+    },
+    {
+      "epoch": 0.17244073977598334,
+      "grad_norm": 0.08164044469594955,
+      "learning_rate": 0.00018833333333333332,
+      "loss": 0.0099,
+      "step": 662
+    },
+    {
+      "epoch": 0.1727012242771555,
+      "grad_norm": 0.05309176817536354,
+      "learning_rate": 0.00018777777777777776,
+      "loss": 0.0286,
+      "step": 663
+    },
+    {
+      "epoch": 0.1729617087783277,
+      "grad_norm": 0.050173692405223846,
+      "learning_rate": 0.00018722222222222225,
+      "loss": 0.0212,
+      "step": 664
+    },
+    {
+      "epoch": 0.17322219327949986,
+      "grad_norm": 0.06619109958410263,
+      "learning_rate": 0.0001866666666666667,
+      "loss": 0.0157,
+      "step": 665
+    },
+    {
+      "epoch": 0.17348267778067206,
+      "grad_norm": 0.12874183058738708,
+      "learning_rate": 0.00018611111111111112,
+      "loss": 0.0326,
+      "step": 666
+    },
+    {
+      "epoch": 0.17374316228184422,
+      "grad_norm": 0.12340325862169266,
+      "learning_rate": 0.00018555555555555556,
+      "loss": 0.0326,
+      "step": 667
+    },
+    {
+      "epoch": 0.1740036467830164,
+      "grad_norm": 0.05308026820421219,
+      "learning_rate": 0.000185,
+      "loss": 0.0222,
+      "step": 668
+    },
+    {
+      "epoch": 0.17426413128418858,
+      "grad_norm": 0.07470495998859406,
+      "learning_rate": 0.00018444444444444443,
+      "loss": 0.0232,
+      "step": 669
+    },
+    {
+      "epoch": 0.17452461578536077,
+      "grad_norm": 0.11508367210626602,
+      "learning_rate": 0.00018388888888888887,
+      "loss": 0.0311,
+      "step": 670
+    },
+    {
+      "epoch": 0.17478510028653296,
+      "grad_norm": 0.09353961795568466,
+      "learning_rate": 0.00018333333333333334,
+      "loss": 0.0157,
+      "step": 671
+    },
+    {
+      "epoch": 0.17504558478770513,
+      "grad_norm": 0.085630401968956,
+      "learning_rate": 0.0001827777777777778,
+      "loss": 0.0159,
+      "step": 672
+    },
+    {
+      "epoch": 0.17530606928887732,
+      "grad_norm": 0.07081904262304306,
+      "learning_rate": 0.00018222222222222224,
+      "loss": 0.024,
+      "step": 673
+    },
+    {
+      "epoch": 0.17556655379004948,
+      "grad_norm": 0.09265665709972382,
+      "learning_rate": 0.00018166666666666667,
+      "loss": 0.0313,
+      "step": 674
+    },
+    {
+      "epoch": 0.17582703829122168,
+      "grad_norm": 0.16763915121555328,
+      "learning_rate": 0.0001811111111111111,
+      "loss": 0.038,
+      "step": 675
+    },
+    {
+      "epoch": 0.17608752279239384,
+      "grad_norm": 0.10551124811172485,
+      "learning_rate": 0.00018055555555555555,
+      "loss": 0.0107,
+      "step": 676
+    },
+    {
+      "epoch": 0.17634800729356603,
+      "grad_norm": 0.07576940208673477,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 0.0139,
+      "step": 677
+    },
+    {
+      "epoch": 0.17660849179473823,
+      "grad_norm": 0.08700092881917953,
+      "learning_rate": 0.00017944444444444445,
+      "loss": 0.0229,
+      "step": 678
+    },
+    {
+      "epoch": 0.1768689762959104,
+      "grad_norm": 0.12369963526725769,
+      "learning_rate": 0.00017888888888888889,
+      "loss": 0.0107,
+      "step": 679
+    },
+    {
+      "epoch": 0.17712946079708258,
+      "grad_norm": 0.07566480338573456,
+      "learning_rate": 0.00017833333333333335,
+      "loss": 0.0106,
+      "step": 680
+    },
+    {
+      "epoch": 0.17738994529825475,
+      "grad_norm": 0.039756081998348236,
+      "learning_rate": 0.00017777777777777779,
+      "loss": 0.0224,
+      "step": 681
+    },
+    {
+      "epoch": 0.17765042979942694,
+      "grad_norm": 0.06256339699029922,
+      "learning_rate": 0.00017722222222222222,
+      "loss": 0.0302,
+      "step": 682
+    },
+    {
+      "epoch": 0.1779109143005991,
+      "grad_norm": 0.05882665514945984,
+      "learning_rate": 0.00017666666666666666,
+      "loss": 0.027,
+      "step": 683
+    },
+    {
+      "epoch": 0.1781713988017713,
+      "grad_norm": 0.05948967859148979,
+      "learning_rate": 0.00017611111111111112,
+      "loss": 0.021,
+      "step": 684
+    },
+    {
+      "epoch": 0.17843188330294346,
+      "grad_norm": 0.06567700952291489,
+      "learning_rate": 0.00017555555555555556,
+      "loss": 0.0269,
+      "step": 685
+    },
+    {
+      "epoch": 0.17869236780411565,
+      "grad_norm": 0.05989934876561165,
+      "learning_rate": 0.000175,
+      "loss": 0.0244,
+      "step": 686
+    },
+    {
+      "epoch": 0.17895285230528785,
+      "grad_norm": 0.06580004841089249,
+      "learning_rate": 0.00017444444444444446,
+      "loss": 0.0154,
+      "step": 687
+    },
+    {
+      "epoch": 0.17921333680646,
+      "grad_norm": 0.0769931897521019,
+      "learning_rate": 0.0001738888888888889,
+      "loss": 0.0116,
+      "step": 688
+    },
+    {
+      "epoch": 0.1794738213076322,
+      "grad_norm": 0.09212974458932877,
+      "learning_rate": 0.00017333333333333334,
+      "loss": 0.028,
+      "step": 689
+    },
+    {
+      "epoch": 0.17973430580880437,
+      "grad_norm": 0.0643521323800087,
+      "learning_rate": 0.00017277777777777777,
+      "loss": 0.0133,
+      "step": 690
+    },
+    {
+      "epoch": 0.17999479030997656,
+      "grad_norm": 0.0669914111495018,
+      "learning_rate": 0.00017222222222222224,
+      "loss": 0.0247,
+      "step": 691
+    },
+    {
+      "epoch": 0.18025527481114872,
+      "grad_norm": 0.08314863592386246,
+      "learning_rate": 0.00017166666666666667,
+      "loss": 0.0267,
+      "step": 692
+    },
+    {
+      "epoch": 0.18051575931232092,
+      "grad_norm": 0.0718664899468422,
+      "learning_rate": 0.0001711111111111111,
+      "loss": 0.017,
+      "step": 693
+    },
+    {
+      "epoch": 0.1807762438134931,
+      "grad_norm": 0.06642474979162216,
+      "learning_rate": 0.00017055555555555555,
+      "loss": 0.0266,
+      "step": 694
+    },
+    {
+      "epoch": 0.18103672831466527,
+      "grad_norm": 0.05796753242611885,
+      "learning_rate": 0.00017,
+      "loss": 0.0241,
+      "step": 695
+    },
+    {
+      "epoch": 0.18129721281583747,
+      "grad_norm": 0.055620770901441574,
+      "learning_rate": 0.00016944444444444445,
+      "loss": 0.013,
+      "step": 696
+    },
+    {
+      "epoch": 0.18155769731700963,
+      "grad_norm": 0.0542733296751976,
+      "learning_rate": 0.00016888888888888889,
+      "loss": 0.0119,
+      "step": 697
+    },
+    {
+      "epoch": 0.18181818181818182,
+      "grad_norm": 0.06157843396067619,
+      "learning_rate": 0.00016833333333333335,
+      "loss": 0.0113,
+      "step": 698
+    },
+    {
+      "epoch": 0.182078666319354,
+      "grad_norm": 0.07051915675401688,
+      "learning_rate": 0.0001677777777777778,
+      "loss": 0.0114,
+      "step": 699
+    },
+    {
+      "epoch": 0.18233915082052618,
+      "grad_norm": 0.05601470172405243,
+      "learning_rate": 0.00016722222222222222,
+      "loss": 0.0241,
+      "step": 700
+    },
+    {
+      "epoch": 0.18259963532169837,
+      "grad_norm": 0.19451101124286652,
+      "learning_rate": 0.00016666666666666666,
+      "loss": 0.0328,
+      "step": 701
+    },
+    {
+      "epoch": 0.18286011982287054,
+      "grad_norm": 0.058869846165180206,
+      "learning_rate": 0.0001661111111111111,
+      "loss": 0.0223,
+      "step": 702
+    },
+    {
+      "epoch": 0.18312060432404273,
+      "grad_norm": 0.07163263857364655,
+      "learning_rate": 0.00016555555555555556,
+      "loss": 0.0237,
+      "step": 703
+    },
+    {
+      "epoch": 0.1833810888252149,
+      "grad_norm": 0.09661490470170975,
+      "learning_rate": 0.000165,
+      "loss": 0.0138,
+      "step": 704
+    },
+    {
+      "epoch": 0.1836415733263871,
+      "grad_norm": 0.13651227951049805,
+      "learning_rate": 0.00016444444444444446,
+      "loss": 0.0278,
+      "step": 705
+    },
+    {
+      "epoch": 0.18390205782755925,
+      "grad_norm": 0.03849536180496216,
+      "learning_rate": 0.0001638888888888889,
+      "loss": 0.0213,
+      "step": 706
+    },
+    {
+      "epoch": 0.18416254232873144,
+      "grad_norm": 0.12398000806570053,
+      "learning_rate": 0.00016333333333333334,
+      "loss": 0.0293,
+      "step": 707
+    },
+    {
+      "epoch": 0.1844230268299036,
+      "grad_norm": 0.061215635389089584,
+      "learning_rate": 0.00016277777777777777,
+      "loss": 0.013,
+      "step": 708
+    },
+    {
+      "epoch": 0.1846835113310758,
+      "grad_norm": 0.07608167082071304,
+      "learning_rate": 0.0001622222222222222,
+      "loss": 0.028,
+      "step": 709
+    },
+    {
+      "epoch": 0.184943995832248,
+      "grad_norm": 0.1440596580505371,
+      "learning_rate": 0.00016166666666666665,
+      "loss": 0.029,
+      "step": 710
+    },
+    {
+      "epoch": 0.18520448033342016,
+      "grad_norm": 0.045715004205703735,
+      "learning_rate": 0.0001611111111111111,
+      "loss": 0.0129,
+      "step": 711
+    },
+    {
+      "epoch": 0.18546496483459235,
+      "grad_norm": 0.16964435577392578,
+      "learning_rate": 0.00016055555555555558,
+      "loss": 0.035,
+      "step": 712
+    },
+    {
+      "epoch": 0.18572544933576451,
+      "grad_norm": 0.06140347942709923,
+      "learning_rate": 0.00016,
+      "loss": 0.0259,
+      "step": 713
+    },
+    {
+      "epoch": 0.1859859338369367,
+      "grad_norm": 0.05893239751458168,
+      "learning_rate": 0.00015944444444444445,
+      "loss": 0.0206,
+      "step": 714
+    },
+    {
+      "epoch": 0.18624641833810887,
+      "grad_norm": 0.04311307892203331,
+      "learning_rate": 0.0001588888888888889,
+      "loss": 0.0251,
+      "step": 715
+    },
+    {
+      "epoch": 0.18650690283928106,
+      "grad_norm": 0.07638710737228394,
+      "learning_rate": 0.00015833333333333332,
+      "loss": 0.0253,
+      "step": 716
+    },
+    {
+      "epoch": 0.18676738734045326,
+      "grad_norm": 0.12614621222019196,
+      "learning_rate": 0.00015777777777777776,
+      "loss": 0.0291,
+      "step": 717
+    },
+    {
+      "epoch": 0.18702787184162542,
+      "grad_norm": 0.05761735513806343,
+      "learning_rate": 0.00015722222222222225,
+      "loss": 0.0245,
+      "step": 718
+    },
+    {
+      "epoch": 0.1872883563427976,
+      "grad_norm": 0.058171264827251434,
+      "learning_rate": 0.0001566666666666667,
+      "loss": 0.0259,
+      "step": 719
+    },
+    {
+      "epoch": 0.18754884084396978,
+      "grad_norm": 0.11938812583684921,
+      "learning_rate": 0.00015611111111111113,
+      "loss": 0.0319,
+      "step": 720
+    },
+    {
+      "epoch": 0.18780932534514197,
+      "grad_norm": 0.08229276537895203,
+      "learning_rate": 0.00015555555555555556,
+      "loss": 0.0287,
+      "step": 721
+    },
+    {
+      "epoch": 0.18806980984631413,
+      "grad_norm": 0.06222108379006386,
+      "learning_rate": 0.000155,
+      "loss": 0.0121,
+      "step": 722
+    },
+    {
+      "epoch": 0.18833029434748633,
+      "grad_norm": 0.06809267401695251,
+      "learning_rate": 0.00015444444444444444,
+      "loss": 0.0104,
+      "step": 723
+    },
+    {
+      "epoch": 0.1885907788486585,
+      "grad_norm": 0.11717727780342102,
+      "learning_rate": 0.00015388888888888887,
+      "loss": 0.0296,
+      "step": 724
+    },
+    {
+      "epoch": 0.18885126334983068,
+      "grad_norm": 0.0870775431394577,
+      "learning_rate": 0.00015333333333333334,
+      "loss": 0.0233,
+      "step": 725
+    },
+    {
+      "epoch": 0.18911174785100288,
+      "grad_norm": 0.04679020866751671,
+      "learning_rate": 0.0001527777777777778,
+      "loss": 0.0214,
+      "step": 726
+    },
+    {
+      "epoch": 0.18937223235217504,
+      "grad_norm": 0.04551999270915985,
+      "learning_rate": 0.00015222222222222224,
+      "loss": 0.0155,
+      "step": 727
+    },
+    {
+      "epoch": 0.18963271685334723,
+      "grad_norm": 0.06846822053194046,
+      "learning_rate": 0.00015166666666666668,
+      "loss": 0.0258,
+      "step": 728
+    },
+    {
+      "epoch": 0.1898932013545194,
+      "grad_norm": 0.08462876081466675,
+      "learning_rate": 0.0001511111111111111,
+      "loss": 0.0148,
+      "step": 729
+    },
+    {
+      "epoch": 0.1901536858556916,
+      "grad_norm": 0.1039138212800026,
+      "learning_rate": 0.00015055555555555555,
+      "loss": 0.0194,
+      "step": 730
+    },
+    {
+      "epoch": 0.19041417035686375,
+      "grad_norm": 0.07241775840520859,
+      "learning_rate": 0.00015,
+      "loss": 0.0239,
+      "step": 731
+    },
+    {
+      "epoch": 0.19067465485803595,
+      "grad_norm": 0.04232200235128403,
+      "learning_rate": 0.00014944444444444445,
+      "loss": 0.0204,
+      "step": 732
+    },
+    {
+      "epoch": 0.19093513935920814,
+      "grad_norm": 0.05513433367013931,
+      "learning_rate": 0.0001488888888888889,
+      "loss": 0.0275,
+      "step": 733
+    },
+    {
+      "epoch": 0.1911956238603803,
+      "grad_norm": 0.08720387518405914,
+      "learning_rate": 0.00014833333333333335,
+      "loss": 0.0092,
+      "step": 734
+    },
+    {
+      "epoch": 0.1914561083615525,
+      "grad_norm": 0.07920187711715698,
+      "learning_rate": 0.0001477777777777778,
+      "loss": 0.0271,
+      "step": 735
+    },
+    {
+      "epoch": 0.19171659286272466,
+      "grad_norm": 0.11916014552116394,
+      "learning_rate": 0.00014722222222222223,
+      "loss": 0.0162,
+      "step": 736
+    },
+    {
+      "epoch": 0.19197707736389685,
+      "grad_norm": 0.08308955281972885,
+      "learning_rate": 0.00014666666666666666,
+      "loss": 0.0289,
+      "step": 737
+    },
+    {
+      "epoch": 0.19223756186506902,
+      "grad_norm": 0.03911888971924782,
+      "learning_rate": 0.0001461111111111111,
+      "loss": 0.0141,
+      "step": 738
+    },
+    {
+      "epoch": 0.1924980463662412,
+      "grad_norm": 0.05872703716158867,
+      "learning_rate": 0.00014555555555555556,
+      "loss": 0.0254,
+      "step": 739
+    },
+    {
+      "epoch": 0.1927585308674134,
+      "grad_norm": 0.05465229973196983,
+      "learning_rate": 0.000145,
+      "loss": 0.0279,
+      "step": 740
+    },
+    {
+      "epoch": 0.19301901536858557,
+      "grad_norm": 0.07703054696321487,
+      "learning_rate": 0.00014444444444444444,
+      "loss": 0.024,
+      "step": 741
+    },
+    {
+      "epoch": 0.19327949986975776,
+      "grad_norm": 0.09856285899877548,
+      "learning_rate": 0.0001438888888888889,
+      "loss": 0.0217,
+      "step": 742
+    },
+    {
+      "epoch": 0.19353998437092992,
+      "grad_norm": 0.04211313650012016,
+      "learning_rate": 0.00014333333333333334,
+      "loss": 0.0217,
+      "step": 743
+    },
+    {
+      "epoch": 0.19380046887210212,
+      "grad_norm": 0.1467122882604599,
+      "learning_rate": 0.00014277777777777778,
+      "loss": 0.0354,
+      "step": 744
+    },
+    {
+      "epoch": 0.19406095337327428,
+      "grad_norm": 0.09239381551742554,
+      "learning_rate": 0.0001422222222222222,
+      "loss": 0.0295,
+      "step": 745
+    },
+    {
+      "epoch": 0.19432143787444647,
+      "grad_norm": 0.06980609148740768,
+      "learning_rate": 0.00014166666666666668,
+      "loss": 0.0118,
+      "step": 746
+    },
+    {
+      "epoch": 0.19458192237561864,
+      "grad_norm": 0.08493070304393768,
+      "learning_rate": 0.00014111111111111111,
+      "loss": 0.0236,
+      "step": 747
+    },
+    {
+      "epoch": 0.19484240687679083,
+      "grad_norm": 0.054070018231868744,
+      "learning_rate": 0.00014055555555555555,
+      "loss": 0.0228,
+      "step": 748
+    },
+    {
+      "epoch": 0.19510289137796302,
+      "grad_norm": 0.0668550506234169,
+      "learning_rate": 0.00014000000000000001,
+      "loss": 0.0251,
+      "step": 749
+    },
+    {
+      "epoch": 0.1953633758791352,
+      "grad_norm": 0.050128865987062454,
+      "learning_rate": 0.00013944444444444445,
+      "loss": 0.0231,
+      "step": 750
+    },
+    {
+      "epoch": 0.19562386038030738,
+      "grad_norm": 0.042184095829725266,
+      "learning_rate": 0.0001388888888888889,
+      "loss": 0.02,
+      "step": 751
+    },
+    {
+      "epoch": 0.19588434488147954,
+      "grad_norm": 0.07815464586019516,
+      "learning_rate": 0.00013833333333333333,
+      "loss": 0.0124,
+      "step": 752
+    },
+    {
+      "epoch": 0.19614482938265174,
+      "grad_norm": 0.03611977770924568,
+      "learning_rate": 0.0001377777777777778,
+      "loss": 0.0228,
+      "step": 753
+    },
+    {
+      "epoch": 0.1964053138838239,
+      "grad_norm": 0.05367238447070122,
+      "learning_rate": 0.00013722222222222223,
+      "loss": 0.0245,
+      "step": 754
+    },
+    {
+      "epoch": 0.1966657983849961,
+      "grad_norm": 0.05393598973751068,
+      "learning_rate": 0.00013666666666666666,
+      "loss": 0.025,
+      "step": 755
+    },
+    {
+      "epoch": 0.19692628288616829,
+      "grad_norm": 0.06653586030006409,
+      "learning_rate": 0.0001361111111111111,
+      "loss": 0.0225,
+      "step": 756
+    },
+    {
+      "epoch": 0.19718676738734045,
+      "grad_norm": 0.04225403815507889,
+      "learning_rate": 0.00013555555555555556,
+      "loss": 0.0233,
+      "step": 757
+    },
+    {
+      "epoch": 0.19744725188851264,
+      "grad_norm": 0.0775744616985321,
+      "learning_rate": 0.000135,
+      "loss": 0.0297,
+      "step": 758
+    },
+    {
+      "epoch": 0.1977077363896848,
+      "grad_norm": 0.04823022335767746,
+      "learning_rate": 0.00013444444444444447,
+      "loss": 0.0228,
+      "step": 759
+    },
+    {
+      "epoch": 0.197968220890857,
+      "grad_norm": 0.058353208005428314,
+      "learning_rate": 0.0001338888888888889,
+      "loss": 0.0203,
+      "step": 760
+    },
+    {
+      "epoch": 0.19822870539202916,
+      "grad_norm": 0.089630126953125,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.0121,
+      "step": 761
+    },
+    {
+      "epoch": 0.19848918989320136,
+      "grad_norm": 0.038025736808776855,
+      "learning_rate": 0.00013277777777777778,
+      "loss": 0.0213,
+      "step": 762
+    },
+    {
+      "epoch": 0.19874967439437352,
+      "grad_norm": 0.05469721555709839,
+      "learning_rate": 0.00013222222222222221,
+      "loss": 0.0223,
+      "step": 763
+    },
+    {
+      "epoch": 0.1990101588955457,
+      "grad_norm": 0.07665356993675232,
+      "learning_rate": 0.00013166666666666665,
+      "loss": 0.0223,
+      "step": 764
+    },
+    {
+      "epoch": 0.1992706433967179,
+      "grad_norm": 0.11525499075651169,
+      "learning_rate": 0.00013111111111111111,
+      "loss": 0.0304,
+      "step": 765
+    },
+    {
+      "epoch": 0.19953112789789007,
+      "grad_norm": 0.10458825528621674,
+      "learning_rate": 0.00013055555555555558,
+      "loss": 0.0283,
+      "step": 766
+    },
+    {
+      "epoch": 0.19979161239906226,
+      "grad_norm": 0.1617119461297989,
+      "learning_rate": 0.00013000000000000002,
+      "loss": 0.0326,
+      "step": 767
+    },
+    {
+      "epoch": 0.20005209690023443,
+      "grad_norm": 0.0581621415913105,
+      "learning_rate": 0.00012944444444444445,
+      "loss": 0.0241,
+      "step": 768
+    },
+    {
+      "epoch": 0.20031258140140662,
+      "grad_norm": 0.08246870338916779,
+      "learning_rate": 0.0001288888888888889,
+      "loss": 0.0268,
+      "step": 769
+    },
+    {
+      "epoch": 0.20057306590257878,
+      "grad_norm": 0.06548018008470535,
+      "learning_rate": 0.00012833333333333333,
+      "loss": 0.0288,
+      "step": 770
+    },
+    {
+      "epoch": 0.20083355040375098,
+      "grad_norm": 0.03866685554385185,
+      "learning_rate": 0.00012777777777777776,
+      "loss": 0.0217,
+      "step": 771
+    },
+    {
+      "epoch": 0.20109403490492317,
+      "grad_norm": 0.10325771570205688,
+      "learning_rate": 0.0001272222222222222,
+      "loss": 0.0202,
+      "step": 772
+    },
+    {
+      "epoch": 0.20135451940609533,
+      "grad_norm": 0.08021347224712372,
+      "learning_rate": 0.0001266666666666667,
+      "loss": 0.0281,
+      "step": 773
+    },
+    {
+      "epoch": 0.20161500390726753,
+      "grad_norm": 0.1363510936498642,
+      "learning_rate": 0.00012611111111111113,
+      "loss": 0.0294,
+      "step": 774
+    },
+    {
+      "epoch": 0.2018754884084397,
+      "grad_norm": 0.05307513847947121,
+      "learning_rate": 0.00012555555555555557,
+      "loss": 0.024,
+      "step": 775
+    },
+    {
+      "epoch": 0.20213597290961188,
+      "grad_norm": 0.08089610934257507,
+      "learning_rate": 0.000125,
+      "loss": 0.0287,
+      "step": 776
+    },
+    {
+      "epoch": 0.20239645741078405,
+      "grad_norm": 0.10350208729505539,
+      "learning_rate": 0.00012444444444444444,
+      "loss": 0.0136,
+      "step": 777
+    },
+    {
+      "epoch": 0.20265694191195624,
+      "grad_norm": 0.08581133931875229,
+      "learning_rate": 0.0001238888888888889,
+      "loss": 0.0269,
+      "step": 778
+    },
+    {
+      "epoch": 0.20291742641312843,
+      "grad_norm": 0.07267531007528305,
+      "learning_rate": 0.00012333333333333334,
+      "loss": 0.0305,
+      "step": 779
+    },
+    {
+      "epoch": 0.2031779109143006,
+      "grad_norm": 0.06246737763285637,
+      "learning_rate": 0.00012277777777777778,
+      "loss": 0.0279,
+      "step": 780
+    },
+    {
+      "epoch": 0.2034383954154728,
+      "grad_norm": 0.07415787130594254,
+      "learning_rate": 0.00012222222222222221,
+      "loss": 0.026,
+      "step": 781
+    },
+    {
+      "epoch": 0.20369887991664495,
+      "grad_norm": 0.06334596872329712,
+      "learning_rate": 0.00012166666666666668,
+      "loss": 0.0264,
+      "step": 782
+    },
+    {
+      "epoch": 0.20395936441781715,
+      "grad_norm": 0.08752144128084183,
+      "learning_rate": 0.00012111111111111112,
+      "loss": 0.0262,
+      "step": 783
+    },
+    {
+      "epoch": 0.2042198489189893,
+      "grad_norm": 0.04901771619915962,
+      "learning_rate": 0.00012055555555555555,
+      "loss": 0.0262,
+      "step": 784
+    },
+    {
+      "epoch": 0.2044803334201615,
+      "grad_norm": 0.0815124586224556,
+      "learning_rate": 0.00012,
+      "loss": 0.0093,
+      "step": 785
+    },
+    {
+      "epoch": 0.20474081792133367,
+      "grad_norm": 0.07053158432245255,
+      "learning_rate": 0.00011944444444444445,
+      "loss": 0.0236,
+      "step": 786
+    },
+    {
+      "epoch": 0.20500130242250586,
+      "grad_norm": 0.0743735209107399,
+      "learning_rate": 0.00011888888888888889,
+      "loss": 0.0272,
+      "step": 787
+    },
+    {
+      "epoch": 0.20526178692367805,
+      "grad_norm": 0.05355112999677658,
+      "learning_rate": 0.00011833333333333334,
+      "loss": 0.0264,
+      "step": 788
+    },
+    {
+      "epoch": 0.20552227142485022,
+      "grad_norm": 0.06432649493217468,
+      "learning_rate": 0.00011777777777777778,
+      "loss": 0.0108,
+      "step": 789
+    },
+    {
+      "epoch": 0.2057827559260224,
+      "grad_norm": 0.05454564467072487,
+      "learning_rate": 0.00011722222222222223,
+      "loss": 0.0204,
+      "step": 790
+    },
+    {
+      "epoch": 0.20604324042719457,
+      "grad_norm": 0.05235034599900246,
+      "learning_rate": 0.00011666666666666667,
+      "loss": 0.012,
+      "step": 791
+    },
+    {
+      "epoch": 0.20630372492836677,
+      "grad_norm": 0.04313506931066513,
+      "learning_rate": 0.00011611111111111112,
+      "loss": 0.024,
+      "step": 792
+    },
+    {
+      "epoch": 0.20656420942953893,
+      "grad_norm": 0.047931794077157974,
+      "learning_rate": 0.00011555555555555555,
+      "loss": 0.0209,
+      "step": 793
+    },
+    {
+      "epoch": 0.20682469393071112,
+      "grad_norm": 0.05677259713411331,
+      "learning_rate": 0.000115,
+      "loss": 0.0117,
+      "step": 794
+    },
+    {
+      "epoch": 0.20708517843188332,
+      "grad_norm": 0.04583245888352394,
+      "learning_rate": 0.00011444444444444445,
+      "loss": 0.0233,
+      "step": 795
+    },
+    {
+      "epoch": 0.20734566293305548,
+      "grad_norm": 0.06760893762111664,
+      "learning_rate": 0.00011388888888888889,
+      "loss": 0.0286,
+      "step": 796
+    },
+    {
+      "epoch": 0.20760614743422767,
+      "grad_norm": 0.06912104785442352,
+      "learning_rate": 0.00011333333333333333,
+      "loss": 0.0299,
+      "step": 797
+    },
+    {
+      "epoch": 0.20786663193539984,
+      "grad_norm": 0.10582921653985977,
+      "learning_rate": 0.00011277777777777778,
+      "loss": 0.0117,
+      "step": 798
+    },
+    {
+      "epoch": 0.20812711643657203,
+      "grad_norm": 0.059685759246349335,
+      "learning_rate": 0.00011222222222222223,
+      "loss": 0.026,
+      "step": 799
+    },
+    {
+      "epoch": 0.2083876009377442,
+      "grad_norm": 0.043049488216638565,
+      "learning_rate": 0.00011166666666666667,
+      "loss": 0.0197,
+      "step": 800
+    },
+    {
+      "epoch": 0.2086480854389164,
+      "grad_norm": 0.08262094855308533,
+      "learning_rate": 0.0001111111111111111,
+      "loss": 0.0282,
+      "step": 801
+    },
+    {
+      "epoch": 0.20890856994008855,
+      "grad_norm": 0.05336850881576538,
+      "learning_rate": 0.00011055555555555557,
+      "loss": 0.0227,
+      "step": 802
+    },
+    {
+      "epoch": 0.20916905444126074,
+      "grad_norm": 0.042703427374362946,
+      "learning_rate": 0.00011,
+      "loss": 0.0198,
+      "step": 803
+    },
+    {
+      "epoch": 0.20942953894243294,
+      "grad_norm": 0.09256606549024582,
+      "learning_rate": 0.00010944444444444444,
+      "loss": 0.0115,
+      "step": 804
+    },
+    {
+      "epoch": 0.2096900234436051,
+      "grad_norm": 0.0701405480504036,
+      "learning_rate": 0.00010888888888888888,
+      "loss": 0.0268,
+      "step": 805
+    },
+    {
+      "epoch": 0.2099505079447773,
+      "grad_norm": 0.09650083631277084,
+      "learning_rate": 0.00010833333333333334,
+      "loss": 0.023,
+      "step": 806
+    },
+    {
+      "epoch": 0.21021099244594946,
+      "grad_norm": 0.04848174378275871,
+      "learning_rate": 0.00010777777777777778,
+      "loss": 0.0129,
+      "step": 807
+    },
+    {
+      "epoch": 0.21047147694712165,
+      "grad_norm": 0.06695965677499771,
+      "learning_rate": 0.00010722222222222222,
+      "loss": 0.0281,
+      "step": 808
+    },
+    {
+      "epoch": 0.21073196144829381,
+      "grad_norm": 0.08163263648748398,
+      "learning_rate": 0.00010666666666666668,
+      "loss": 0.0254,
+      "step": 809
+    },
+    {
+      "epoch": 0.210992445949466,
+      "grad_norm": 0.12193469703197479,
+      "learning_rate": 0.00010611111111111112,
+      "loss": 0.0321,
+      "step": 810
+    },
+    {
+      "epoch": 0.2112529304506382,
+      "grad_norm": 0.04213068634271622,
+      "learning_rate": 0.00010555555555555555,
+      "loss": 0.0251,
+      "step": 811
+    },
+    {
+      "epoch": 0.21151341495181036,
+      "grad_norm": 0.04552720487117767,
+      "learning_rate": 0.000105,
+      "loss": 0.0193,
+      "step": 812
+    },
+    {
+      "epoch": 0.21177389945298256,
+      "grad_norm": 0.054694145917892456,
+      "learning_rate": 0.00010444444444444445,
+      "loss": 0.0237,
+      "step": 813
+    },
+    {
+      "epoch": 0.21203438395415472,
+      "grad_norm": 0.06649375706911087,
+      "learning_rate": 0.00010388888888888889,
+      "loss": 0.0207,
+      "step": 814
+    },
+    {
+      "epoch": 0.2122948684553269,
+      "grad_norm": 0.042342595756053925,
+      "learning_rate": 0.00010333333333333333,
+      "loss": 0.0217,
+      "step": 815
+    },
+    {
+      "epoch": 0.21255535295649908,
+      "grad_norm": 0.0994374081492424,
+      "learning_rate": 0.00010277777777777778,
+      "loss": 0.0169,
+      "step": 816
+    },
+    {
+      "epoch": 0.21281583745767127,
+      "grad_norm": 0.05945296213030815,
+      "learning_rate": 0.00010222222222222223,
+      "loss": 0.0269,
+      "step": 817
+    },
+    {
+      "epoch": 0.21307632195884346,
+      "grad_norm": 0.04200858250260353,
+      "learning_rate": 0.00010166666666666667,
+      "loss": 0.0238,
+      "step": 818
+    },
+    {
+      "epoch": 0.21333680646001563,
+      "grad_norm": 0.07417359948158264,
+      "learning_rate": 0.00010111111111111112,
+      "loss": 0.0264,
+      "step": 819
+    },
+    {
+      "epoch": 0.21359729096118782,
+      "grad_norm": 0.06793615967035294,
+      "learning_rate": 0.00010055555555555555,
+      "loss": 0.0271,
+      "step": 820
+    },
+    {
+      "epoch": 0.21385777546235998,
+      "grad_norm": 0.05132850632071495,
+      "learning_rate": 0.0001,
+      "loss": 0.0183,
+      "step": 821
+    },
+    {
+      "epoch": 0.21411825996353218,
+      "grad_norm": 0.11458087712526321,
+      "learning_rate": 9.944444444444444e-05,
+      "loss": 0.0307,
+      "step": 822
+    },
+    {
+      "epoch": 0.21437874446470434,
+      "grad_norm": 0.09691832959651947,
+      "learning_rate": 9.888888888888889e-05,
+      "loss": 0.0153,
+      "step": 823
+    },
+    {
+      "epoch": 0.21463922896587653,
+      "grad_norm": 0.08158433437347412,
+      "learning_rate": 9.833333333333333e-05,
+      "loss": 0.0192,
+      "step": 824
+    },
+    {
+      "epoch": 0.2148997134670487,
+      "grad_norm": 0.09115268290042877,
+      "learning_rate": 9.777777777777778e-05,
+      "loss": 0.018,
+      "step": 825
+    },
+    {
+      "epoch": 0.2151601979682209,
+      "grad_norm": 0.05657673254609108,
+      "learning_rate": 9.722222222222223e-05,
+      "loss": 0.0253,
+      "step": 826
+    },
+    {
+      "epoch": 0.21542068246939308,
+      "grad_norm": 0.05650609731674194,
+      "learning_rate": 9.666666666666667e-05,
+      "loss": 0.0232,
+      "step": 827
+    },
+    {
+      "epoch": 0.21568116697056525,
+      "grad_norm": 0.06089344248175621,
+      "learning_rate": 9.61111111111111e-05,
+      "loss": 0.014,
+      "step": 828
+    },
+    {
+      "epoch": 0.21594165147173744,
+      "grad_norm": 0.05349273234605789,
+      "learning_rate": 9.555555555555557e-05,
+      "loss": 0.0233,
+      "step": 829
+    },
+    {
+      "epoch": 0.2162021359729096,
+      "grad_norm": 0.04966244846582413,
+      "learning_rate": 9.5e-05,
+      "loss": 0.0242,
+      "step": 830
+    },
+    {
+      "epoch": 0.2164626204740818,
+      "grad_norm": 0.11511816829442978,
+      "learning_rate": 9.444444444444444e-05,
+      "loss": 0.0275,
+      "step": 831
+    },
+    {
+      "epoch": 0.21672310497525396,
+      "grad_norm": 0.057164501398801804,
+      "learning_rate": 9.388888888888888e-05,
+      "loss": 0.0122,
+      "step": 832
+    },
+    {
+      "epoch": 0.21698358947642615,
+      "grad_norm": 0.08348594605922699,
+      "learning_rate": 9.333333333333334e-05,
+      "loss": 0.0281,
+      "step": 833
+    },
+    {
+      "epoch": 0.21724407397759835,
+      "grad_norm": 0.10066075623035431,
+      "learning_rate": 9.277777777777778e-05,
+      "loss": 0.0268,
+      "step": 834
+    },
+    {
+      "epoch": 0.2175045584787705,
+      "grad_norm": 0.05197849124670029,
+      "learning_rate": 9.222222222222222e-05,
+      "loss": 0.0221,
+      "step": 835
+    },
+    {
+      "epoch": 0.2177650429799427,
+      "grad_norm": 0.07499635219573975,
+      "learning_rate": 9.166666666666667e-05,
+      "loss": 0.0243,
+      "step": 836
+    },
+    {
+      "epoch": 0.21802552748111487,
+      "grad_norm": 0.04976842552423477,
+      "learning_rate": 9.111111111111112e-05,
+      "loss": 0.023,
+      "step": 837
+    },
+    {
+      "epoch": 0.21828601198228706,
+      "grad_norm": 0.13596895337104797,
+      "learning_rate": 9.055555555555556e-05,
+      "loss": 0.0316,
+      "step": 838
+    },
+    {
+      "epoch": 0.21854649648345922,
+      "grad_norm": 0.09843513369560242,
+      "learning_rate": 8.999999999999999e-05,
+      "loss": 0.0298,
+      "step": 839
+    },
+    {
+      "epoch": 0.21880698098463142,
+      "grad_norm": 0.03935172036290169,
+      "learning_rate": 8.944444444444444e-05,
+      "loss": 0.0221,
+      "step": 840
+    },
+    {
+      "epoch": 0.2190674654858036,
+      "grad_norm": 0.06101774051785469,
+      "learning_rate": 8.888888888888889e-05,
+      "loss": 0.023,
+      "step": 841
+    },
+    {
+      "epoch": 0.21932794998697577,
+      "grad_norm": 0.07553184777498245,
+      "learning_rate": 8.833333333333333e-05,
+      "loss": 0.0207,
+      "step": 842
+    },
+    {
+      "epoch": 0.21958843448814797,
+      "grad_norm": 0.055677421391010284,
+      "learning_rate": 8.777777777777778e-05,
+      "loss": 0.0214,
+      "step": 843
+    },
+    {
+      "epoch": 0.21984891898932013,
+      "grad_norm": 0.06566982716321945,
+      "learning_rate": 8.722222222222223e-05,
+      "loss": 0.026,
+      "step": 844
+    },
+    {
+      "epoch": 0.22010940349049232,
+      "grad_norm": 0.09868303686380386,
+      "learning_rate": 8.666666666666667e-05,
+      "loss": 0.0275,
+      "step": 845
+    },
+    {
+      "epoch": 0.2203698879916645,
+      "grad_norm": 0.06977616250514984,
+      "learning_rate": 8.611111111111112e-05,
+      "loss": 0.0202,
+      "step": 846
+    },
+    {
+      "epoch": 0.22063037249283668,
+      "grad_norm": 0.06692767888307571,
+      "learning_rate": 8.555555555555556e-05,
+      "loss": 0.0269,
+      "step": 847
+    },
+    {
+      "epoch": 0.22089085699400884,
+      "grad_norm": 0.08198261260986328,
+      "learning_rate": 8.5e-05,
+      "loss": 0.0094,
+      "step": 848
+    },
+    {
+      "epoch": 0.22115134149518104,
+      "grad_norm": 0.057990413159132004,
+      "learning_rate": 8.444444444444444e-05,
+      "loss": 0.0244,
+      "step": 849
+    },
+    {
+      "epoch": 0.22141182599635323,
+      "grad_norm": 0.06253142654895782,
+      "learning_rate": 8.38888888888889e-05,
+      "loss": 0.0276,
+      "step": 850
+    },
+    {
+      "epoch": 0.2216723104975254,
+      "grad_norm": 0.09076081961393356,
+      "learning_rate": 8.333333333333333e-05,
+      "loss": 0.0134,
+      "step": 851
+    },
+    {
+      "epoch": 0.22193279499869759,
+      "grad_norm": 0.04802235588431358,
+      "learning_rate": 8.277777777777778e-05,
+      "loss": 0.0192,
+      "step": 852
+    },
+    {
+      "epoch": 0.22219327949986975,
+      "grad_norm": 0.07106220722198486,
+      "learning_rate": 8.222222222222223e-05,
+      "loss": 0.0217,
+      "step": 853
+    },
+    {
+      "epoch": 0.22245376400104194,
+      "grad_norm": 0.0669749453663826,
+      "learning_rate": 8.166666666666667e-05,
+      "loss": 0.0158,
+      "step": 854
+    },
+    {
+      "epoch": 0.2227142485022141,
+      "grad_norm": 0.05683424696326256,
+      "learning_rate": 8.11111111111111e-05,
+      "loss": 0.0262,
+      "step": 855
+    },
+    {
+      "epoch": 0.2229747330033863,
+      "grad_norm": 0.05684027448296547,
+      "learning_rate": 8.055555555555556e-05,
+      "loss": 0.0242,
+      "step": 856
+    },
+    {
+      "epoch": 0.2232352175045585,
+      "grad_norm": 0.058487534523010254,
+      "learning_rate": 8e-05,
+      "loss": 0.0182,
+      "step": 857
+    },
+    {
+      "epoch": 0.22349570200573066,
+      "grad_norm": 0.059550922363996506,
+      "learning_rate": 7.944444444444444e-05,
+      "loss": 0.0248,
+      "step": 858
+    },
+    {
+      "epoch": 0.22375618650690285,
+      "grad_norm": 0.07189222425222397,
+      "learning_rate": 7.888888888888888e-05,
+      "loss": 0.0239,
+      "step": 859
+    },
+    {
+      "epoch": 0.224016671008075,
+      "grad_norm": 0.04999905824661255,
+      "learning_rate": 7.833333333333334e-05,
+      "loss": 0.0135,
+      "step": 860
+    },
+    {
+      "epoch": 0.2242771555092472,
+      "grad_norm": 0.05286647006869316,
+      "learning_rate": 7.777777777777778e-05,
+      "loss": 0.025,
+      "step": 861
+    },
+    {
+      "epoch": 0.22453764001041937,
+      "grad_norm": 0.07563666254281998,
+      "learning_rate": 7.722222222222222e-05,
+      "loss": 0.0269,
+      "step": 862
+    },
+    {
+      "epoch": 0.22479812451159156,
+      "grad_norm": 0.08469083905220032,
+      "learning_rate": 7.666666666666667e-05,
+      "loss": 0.0261,
+      "step": 863
+    },
+    {
+      "epoch": 0.22505860901276373,
+      "grad_norm": 0.10381034016609192,
+      "learning_rate": 7.611111111111112e-05,
+      "loss": 0.0322,
+      "step": 864
+    },
+    {
+      "epoch": 0.22531909351393592,
+      "grad_norm": 0.048530541360378265,
+      "learning_rate": 7.555555555555556e-05,
+      "loss": 0.0109,
+      "step": 865
+    },
+    {
+      "epoch": 0.2255795780151081,
+      "grad_norm": 0.05683276802301407,
+      "learning_rate": 7.5e-05,
+      "loss": 0.026,
+      "step": 866
+    },
+    {
+      "epoch": 0.22584006251628028,
+      "grad_norm": 0.05248989164829254,
+      "learning_rate": 7.444444444444444e-05,
+      "loss": 0.0246,
+      "step": 867
+    },
+    {
+      "epoch": 0.22610054701745247,
+      "grad_norm": 0.046281568706035614,
+      "learning_rate": 7.38888888888889e-05,
+      "loss": 0.0143,
+      "step": 868
+    },
+    {
+      "epoch": 0.22636103151862463,
+      "grad_norm": 0.08264920115470886,
+      "learning_rate": 7.333333333333333e-05,
+      "loss": 0.0289,
+      "step": 869
+    },
+    {
+      "epoch": 0.22662151601979683,
+      "grad_norm": 0.06127788498997688,
+      "learning_rate": 7.277777777777778e-05,
+      "loss": 0.0241,
+      "step": 870
+    },
+    {
+      "epoch": 0.226882000520969,
+      "grad_norm": 0.06106068938970566,
+      "learning_rate": 7.222222222222222e-05,
+      "loss": 0.0239,
+      "step": 871
+    },
+    {
+      "epoch": 0.22714248502214118,
+      "grad_norm": 0.053920209407806396,
+      "learning_rate": 7.166666666666667e-05,
+      "loss": 0.0223,
+      "step": 872
+    },
+    {
+      "epoch": 0.22740296952331338,
+      "grad_norm": 0.0667242705821991,
+      "learning_rate": 7.11111111111111e-05,
+      "loss": 0.0134,
+      "step": 873
+    },
+    {
+      "epoch": 0.22766345402448554,
+      "grad_norm": 0.05814678221940994,
+      "learning_rate": 7.055555555555556e-05,
+      "loss": 0.0239,
+      "step": 874
+    },
+    {
+      "epoch": 0.22792393852565773,
+      "grad_norm": 0.10684846341609955,
+      "learning_rate": 7.000000000000001e-05,
+      "loss": 0.0114,
+      "step": 875
+    },
+    {
+      "epoch": 0.2281844230268299,
+      "grad_norm": 0.0764312744140625,
+      "learning_rate": 6.944444444444444e-05,
+      "loss": 0.0116,
+      "step": 876
+    },
+    {
+      "epoch": 0.2284449075280021,
+      "grad_norm": 0.10544411092996597,
+      "learning_rate": 6.88888888888889e-05,
+      "loss": 0.0277,
+      "step": 877
+    },
+    {
+      "epoch": 0.22870539202917425,
+      "grad_norm": 0.12729933857917786,
+      "learning_rate": 6.833333333333333e-05,
+      "loss": 0.0301,
+      "step": 878
+    },
+    {
+      "epoch": 0.22896587653034645,
+      "grad_norm": 0.05990598350763321,
+      "learning_rate": 6.777777777777778e-05,
+      "loss": 0.0227,
+      "step": 879
+    },
+    {
+      "epoch": 0.22922636103151864,
+      "grad_norm": 0.1211724728345871,
+      "learning_rate": 6.722222222222223e-05,
+      "loss": 0.0122,
+      "step": 880
+    },
+    {
+      "epoch": 0.2294868455326908,
+      "grad_norm": 0.09479571133852005,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.0265,
+      "step": 881
+    },
+    {
+      "epoch": 0.229747330033863,
+      "grad_norm": 0.0977165549993515,
+      "learning_rate": 6.611111111111111e-05,
+      "loss": 0.0257,
+      "step": 882
+    },
+    {
+      "epoch": 0.23000781453503516,
+      "grad_norm": 0.08902498334646225,
+      "learning_rate": 6.555555555555556e-05,
+      "loss": 0.0278,
+      "step": 883
+    },
+    {
+      "epoch": 0.23026829903620735,
+      "grad_norm": 0.05326511338353157,
+      "learning_rate": 6.500000000000001e-05,
+      "loss": 0.0134,
+      "step": 884
+    },
+    {
+      "epoch": 0.23052878353737952,
+      "grad_norm": 0.06463765352964401,
+      "learning_rate": 6.444444444444444e-05,
+      "loss": 0.0093,
+      "step": 885
+    },
+    {
+      "epoch": 0.2307892680385517,
+      "grad_norm": 0.06279236078262329,
+      "learning_rate": 6.388888888888888e-05,
+      "loss": 0.0295,
+      "step": 886
+    },
+    {
+      "epoch": 0.23104975253972387,
+      "grad_norm": 0.0766410231590271,
+      "learning_rate": 6.333333333333335e-05,
+      "loss": 0.0266,
+      "step": 887
+    },
+    {
+      "epoch": 0.23131023704089607,
+      "grad_norm": 0.04724525287747383,
+      "learning_rate": 6.277777777777778e-05,
+      "loss": 0.0211,
+      "step": 888
+    },
+    {
+      "epoch": 0.23157072154206826,
+      "grad_norm": 0.10141412168741226,
+      "learning_rate": 6.222222222222222e-05,
+      "loss": 0.0325,
+      "step": 889
+    },
+    {
+      "epoch": 0.23183120604324042,
+      "grad_norm": 0.04504161700606346,
+      "learning_rate": 6.166666666666667e-05,
+      "loss": 0.0209,
+      "step": 890
+    },
+    {
+      "epoch": 0.23209169054441262,
+      "grad_norm": 0.05162983760237694,
+      "learning_rate": 6.111111111111111e-05,
+      "loss": 0.0203,
+      "step": 891
+    },
+    {
+      "epoch": 0.23235217504558478,
+      "grad_norm": 0.06397057324647903,
+      "learning_rate": 6.055555555555556e-05,
+      "loss": 0.0221,
+      "step": 892
+    },
+    {
+      "epoch": 0.23261265954675697,
+      "grad_norm": 0.14115869998931885,
+      "learning_rate": 6e-05,
+      "loss": 0.0341,
+      "step": 893
+    },
+    {
+      "epoch": 0.23287314404792914,
+      "grad_norm": 0.05720777064561844,
+      "learning_rate": 5.9444444444444445e-05,
+      "loss": 0.029,
+      "step": 894
+    },
+    {
+      "epoch": 0.23313362854910133,
+      "grad_norm": 0.059804365038871765,
+      "learning_rate": 5.888888888888889e-05,
+      "loss": 0.03,
+      "step": 895
+    },
+    {
+      "epoch": 0.23339411305027352,
+      "grad_norm": 0.06636619567871094,
+      "learning_rate": 5.833333333333333e-05,
+      "loss": 0.0116,
+      "step": 896
+    },
+    {
+      "epoch": 0.2336545975514457,
+      "grad_norm": 0.0481412410736084,
+      "learning_rate": 5.7777777777777776e-05,
+      "loss": 0.0213,
+      "step": 897
+    },
+    {
+      "epoch": 0.23391508205261788,
+      "grad_norm": 0.051565125584602356,
+      "learning_rate": 5.722222222222223e-05,
+      "loss": 0.0263,
+      "step": 898
+    },
+    {
+      "epoch": 0.23417556655379004,
+      "grad_norm": 0.04665525630116463,
+      "learning_rate": 5.6666666666666664e-05,
+      "loss": 0.02,
+      "step": 899
+    },
+    {
+      "epoch": 0.23443605105496224,
+      "grad_norm": 0.046372778713703156,
+      "learning_rate": 5.6111111111111114e-05,
+      "loss": 0.022,
+      "step": 900
+    },
+    {
+      "epoch": 0.2346965355561344,
+      "grad_norm": 0.04924992471933365,
+      "learning_rate": 5.555555555555555e-05,
+      "loss": 0.0251,
+      "step": 901
+    },
+    {
+      "epoch": 0.2349570200573066,
+      "grad_norm": 0.09490156918764114,
+      "learning_rate": 5.5e-05,
+      "loss": 0.0137,
+      "step": 902
+    },
+    {
+      "epoch": 0.23521750455847876,
+      "grad_norm": 0.05161282792687416,
+      "learning_rate": 5.444444444444444e-05,
+      "loss": 0.0109,
+      "step": 903
+    },
+    {
+      "epoch": 0.23547798905965095,
+      "grad_norm": 0.0635368600487709,
+      "learning_rate": 5.388888888888889e-05,
+      "loss": 0.0214,
+      "step": 904
+    },
+    {
+      "epoch": 0.23573847356082314,
+      "grad_norm": 0.050677914172410965,
+      "learning_rate": 5.333333333333334e-05,
+      "loss": 0.0226,
+      "step": 905
+    },
+    {
+      "epoch": 0.2359989580619953,
+      "grad_norm": 0.036224327981472015,
+      "learning_rate": 5.277777777777778e-05,
+      "loss": 0.01,
+      "step": 906
+    },
+    {
+      "epoch": 0.2362594425631675,
+      "grad_norm": 0.08436845988035202,
+      "learning_rate": 5.222222222222223e-05,
+      "loss": 0.029,
+      "step": 907
+    },
+    {
+      "epoch": 0.23651992706433966,
+      "grad_norm": 0.05044053867459297,
+      "learning_rate": 5.1666666666666664e-05,
+      "loss": 0.0199,
+      "step": 908
+    },
+    {
+      "epoch": 0.23678041156551186,
+      "grad_norm": 0.09900317341089249,
+      "learning_rate": 5.1111111111111115e-05,
+      "loss": 0.0124,
+      "step": 909
+    },
+    {
+      "epoch": 0.23704089606668402,
+      "grad_norm": 0.1304093301296234,
+      "learning_rate": 5.055555555555556e-05,
+      "loss": 0.0112,
+      "step": 910
+    },
+    {
+      "epoch": 0.2373013805678562,
+      "grad_norm": 0.06725986301898956,
+      "learning_rate": 5e-05,
+      "loss": 0.0266,
+      "step": 911
+    },
+    {
+      "epoch": 0.2375618650690284,
+      "grad_norm": 0.07616691291332245,
+      "learning_rate": 4.9444444444444446e-05,
+      "loss": 0.0287,
+      "step": 912
+    },
+    {
+      "epoch": 0.23782234957020057,
+      "grad_norm": 0.07480709999799728,
+      "learning_rate": 4.888888888888889e-05,
+      "loss": 0.0289,
+      "step": 913
+    },
+    {
+      "epoch": 0.23808283407137276,
+      "grad_norm": 0.05376133695244789,
+      "learning_rate": 4.8333333333333334e-05,
+      "loss": 0.0262,
+      "step": 914
+    },
+    {
+      "epoch": 0.23834331857254493,
+      "grad_norm": 0.07016518712043762,
+      "learning_rate": 4.7777777777777784e-05,
+      "loss": 0.0227,
+      "step": 915
+    },
+    {
+      "epoch": 0.23860380307371712,
+      "grad_norm": 0.07425282895565033,
+      "learning_rate": 4.722222222222222e-05,
+      "loss": 0.0126,
+      "step": 916
+    },
+    {
+      "epoch": 0.23886428757488928,
+      "grad_norm": 0.05164991691708565,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.0243,
+      "step": 917
+    },
+    {
+      "epoch": 0.23912477207606148,
+      "grad_norm": 0.11435671150684357,
+      "learning_rate": 4.611111111111111e-05,
+      "loss": 0.0301,
+      "step": 918
+    },
+    {
+      "epoch": 0.23938525657723367,
+      "grad_norm": 0.08265142142772675,
+      "learning_rate": 4.555555555555556e-05,
+      "loss": 0.0254,
+      "step": 919
+    },
+    {
+      "epoch": 0.23964574107840583,
+      "grad_norm": 0.08810949325561523,
+      "learning_rate": 4.4999999999999996e-05,
+      "loss": 0.0108,
+      "step": 920
+    },
+    {
+      "epoch": 0.23990622557957803,
+      "grad_norm": 0.07355548441410065,
+      "learning_rate": 4.4444444444444447e-05,
+      "loss": 0.0244,
+      "step": 921
+    },
+    {
+      "epoch": 0.2401667100807502,
+      "grad_norm": 0.07502081990242004,
+      "learning_rate": 4.388888888888889e-05,
+      "loss": 0.0123,
+      "step": 922
+    },
+    {
+      "epoch": 0.24042719458192238,
+      "grad_norm": 0.13509482145309448,
+      "learning_rate": 4.3333333333333334e-05,
+      "loss": 0.0231,
+      "step": 923
+    },
+    {
+      "epoch": 0.24068767908309455,
+      "grad_norm": 0.057144083082675934,
+      "learning_rate": 4.277777777777778e-05,
+      "loss": 0.0096,
+      "step": 924
+    },
+    {
+      "epoch": 0.24094816358426674,
+      "grad_norm": 0.0596635565161705,
+      "learning_rate": 4.222222222222222e-05,
+      "loss": 0.01,
+      "step": 925
+    },
+    {
+      "epoch": 0.2412086480854389,
+      "grad_norm": 0.09015180170536041,
+      "learning_rate": 4.1666666666666665e-05,
+      "loss": 0.0265,
+      "step": 926
+    },
+    {
+      "epoch": 0.2414691325866111,
+      "grad_norm": 0.11118481308221817,
+      "learning_rate": 4.1111111111111116e-05,
+      "loss": 0.0287,
+      "step": 927
+    },
+    {
+      "epoch": 0.2417296170877833,
+      "grad_norm": 0.05947785824537277,
+      "learning_rate": 4.055555555555555e-05,
+      "loss": 0.016,
+      "step": 928
+    },
+    {
+      "epoch": 0.24199010158895545,
+      "grad_norm": 0.06007053330540657,
+      "learning_rate": 4e-05,
+      "loss": 0.0173,
+      "step": 929
+    },
+    {
+      "epoch": 0.24225058609012765,
+      "grad_norm": 0.04201246052980423,
+      "learning_rate": 3.944444444444444e-05,
+      "loss": 0.025,
+      "step": 930
+    },
+    {
+      "epoch": 0.2425110705912998,
+      "grad_norm": 0.057011738419532776,
+      "learning_rate": 3.888888888888889e-05,
+      "loss": 0.0258,
+      "step": 931
+    },
+    {
+      "epoch": 0.242771555092472,
+      "grad_norm": 0.05665633827447891,
+      "learning_rate": 3.8333333333333334e-05,
+      "loss": 0.0128,
+      "step": 932
+    },
+    {
+      "epoch": 0.24303203959364417,
+      "grad_norm": 0.050919968634843826,
+      "learning_rate": 3.777777777777778e-05,
+      "loss": 0.0261,
+      "step": 933
+    },
+    {
+      "epoch": 0.24329252409481636,
+      "grad_norm": 0.07048945873975754,
+      "learning_rate": 3.722222222222222e-05,
+      "loss": 0.0112,
+      "step": 934
+    },
+    {
+      "epoch": 0.24355300859598855,
+      "grad_norm": 0.05634898692369461,
+      "learning_rate": 3.6666666666666666e-05,
+      "loss": 0.0202,
+      "step": 935
+    },
+    {
+      "epoch": 0.24381349309716072,
+      "grad_norm": 0.03418530523777008,
+      "learning_rate": 3.611111111111111e-05,
+      "loss": 0.0239,
+      "step": 936
+    },
+    {
+      "epoch": 0.2440739775983329,
+      "grad_norm": 0.08094564080238342,
+      "learning_rate": 3.555555555555555e-05,
+      "loss": 0.0138,
+      "step": 937
+    },
+    {
+      "epoch": 0.24433446209950507,
+      "grad_norm": 0.14068718254566193,
+      "learning_rate": 3.5000000000000004e-05,
+      "loss": 0.0305,
+      "step": 938
+    },
+    {
+      "epoch": 0.24459494660067727,
+      "grad_norm": 0.08950838446617126,
+      "learning_rate": 3.444444444444445e-05,
+      "loss": 0.0177,
+      "step": 939
+    },
+    {
+      "epoch": 0.24485543110184943,
+      "grad_norm": 0.04776313155889511,
+      "learning_rate": 3.388888888888889e-05,
+      "loss": 0.0243,
+      "step": 940
+    },
+    {
+      "epoch": 0.24511591560302162,
+      "grad_norm": 0.06609267741441727,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 0.0221,
+      "step": 941
+    },
+    {
+      "epoch": 0.2453764001041938,
+      "grad_norm": 0.05166230723261833,
+      "learning_rate": 3.277777777777778e-05,
+      "loss": 0.0102,
+      "step": 942
+    },
+    {
+      "epoch": 0.24563688460536598,
+      "grad_norm": 0.06623659282922745,
+      "learning_rate": 3.222222222222222e-05,
+      "loss": 0.029,
+      "step": 943
+    },
+    {
+      "epoch": 0.24589736910653817,
+      "grad_norm": 0.06607527285814285,
+      "learning_rate": 3.166666666666667e-05,
+      "loss": 0.0276,
+      "step": 944
+    },
+    {
+      "epoch": 0.24615785360771034,
+      "grad_norm": 0.09493245929479599,
+      "learning_rate": 3.111111111111111e-05,
+      "loss": 0.0259,
+      "step": 945
+    },
+    {
+      "epoch": 0.24641833810888253,
+      "grad_norm": 0.14745251834392548,
+      "learning_rate": 3.0555555555555554e-05,
+      "loss": 0.0311,
+      "step": 946
+    },
+    {
+      "epoch": 0.2466788226100547,
+      "grad_norm": 0.0724267065525055,
+      "learning_rate": 3e-05,
+      "loss": 0.0313,
+      "step": 947
+    },
+    {
+      "epoch": 0.24693930711122689,
+      "grad_norm": 0.13564924895763397,
+      "learning_rate": 2.9444444444444445e-05,
+      "loss": 0.0281,
+      "step": 948
+    },
+    {
+      "epoch": 0.24719979161239905,
+      "grad_norm": 0.05825835093855858,
+      "learning_rate": 2.8888888888888888e-05,
+      "loss": 0.0203,
+      "step": 949
+    },
+    {
+      "epoch": 0.24746027611357124,
+      "grad_norm": 0.031234711408615112,
+      "learning_rate": 2.8333333333333332e-05,
+      "loss": 0.0199,
+      "step": 950
+    },
+    {
+      "epoch": 0.24772076061474343,
+      "grad_norm": 0.08138159662485123,
+      "learning_rate": 2.7777777777777776e-05,
+      "loss": 0.0108,
+      "step": 951
+    },
+    {
+      "epoch": 0.2479812451159156,
+      "grad_norm": 0.04411553591489792,
+      "learning_rate": 2.722222222222222e-05,
+      "loss": 0.0219,
+      "step": 952
+    },
+    {
+      "epoch": 0.2482417296170878,
+      "grad_norm": 0.07893390953540802,
+      "learning_rate": 2.666666666666667e-05,
+      "loss": 0.0264,
+      "step": 953
+    },
+    {
+      "epoch": 0.24850221411825996,
+      "grad_norm": 0.08252941071987152,
+      "learning_rate": 2.6111111111111114e-05,
+      "loss": 0.0125,
+      "step": 954
+    },
+    {
+      "epoch": 0.24876269861943215,
+      "grad_norm": 0.057603947818279266,
+      "learning_rate": 2.5555555555555557e-05,
+      "loss": 0.0097,
+      "step": 955
+    },
+    {
+      "epoch": 0.2490231831206043,
+      "grad_norm": 0.14112702012062073,
+      "learning_rate": 2.5e-05,
+      "loss": 0.0313,
+      "step": 956
+    },
+    {
+      "epoch": 0.2492836676217765,
+      "grad_norm": 0.10659433901309967,
+      "learning_rate": 2.4444444444444445e-05,
+      "loss": 0.0269,
+      "step": 957
+    },
+    {
+      "epoch": 0.2495441521229487,
+      "grad_norm": 0.06049094721674919,
+      "learning_rate": 2.3888888888888892e-05,
+      "loss": 0.0235,
+      "step": 958
+    },
+    {
+      "epoch": 0.24980463662412086,
+      "grad_norm": 0.057612448930740356,
+      "learning_rate": 2.3333333333333336e-05,
+      "loss": 0.0205,
+      "step": 959
+    },
+    {
+      "epoch": 0.25006512112529306,
+      "grad_norm": 0.05991975963115692,
+      "learning_rate": 2.277777777777778e-05,
+      "loss": 0.0124,
+      "step": 960
+    },
+    {
+      "epoch": 0.2503256056264652,
+      "grad_norm": 0.04733411222696304,
+      "learning_rate": 2.2222222222222223e-05,
+      "loss": 0.0232,
+      "step": 961
+    },
+    {
+      "epoch": 0.2505860901276374,
+      "grad_norm": 0.06568509340286255,
+      "learning_rate": 2.1666666666666667e-05,
+      "loss": 0.0113,
+      "step": 962
+    },
+    {
+      "epoch": 0.2508465746288096,
+      "grad_norm": 0.07371300458908081,
+      "learning_rate": 2.111111111111111e-05,
+      "loss": 0.0199,
+      "step": 963
+    },
+    {
+      "epoch": 0.25110705912998177,
+      "grad_norm": 0.04667287319898605,
+      "learning_rate": 2.0555555555555558e-05,
+      "loss": 0.0213,
+      "step": 964
+    },
+    {
+      "epoch": 0.25136754363115393,
+      "grad_norm": 0.05440174788236618,
+      "learning_rate": 2e-05,
+      "loss": 0.0226,
+      "step": 965
+    },
+    {
+      "epoch": 0.25162802813232615,
+      "grad_norm": 0.06468810141086578,
+      "learning_rate": 1.9444444444444445e-05,
+      "loss": 0.0102,
+      "step": 966
+    },
+    {
+      "epoch": 0.2518885126334983,
+      "grad_norm": 0.08703803271055222,
+      "learning_rate": 1.888888888888889e-05,
+      "loss": 0.0249,
+      "step": 967
+    },
+    {
+      "epoch": 0.2521489971346705,
+      "grad_norm": 0.06582577526569366,
+      "learning_rate": 1.8333333333333333e-05,
+      "loss": 0.0165,
+      "step": 968
+    },
+    {
+      "epoch": 0.25240948163584265,
+      "grad_norm": 0.05731911212205887,
+      "learning_rate": 1.7777777777777777e-05,
+      "loss": 0.0267,
+      "step": 969
+    },
+    {
+      "epoch": 0.25266996613701487,
+      "grad_norm": 0.07839024066925049,
+      "learning_rate": 1.7222222222222224e-05,
+      "loss": 0.0252,
+      "step": 970
+    },
+    {
+      "epoch": 0.25293045063818703,
+      "grad_norm": 0.03709130361676216,
+      "learning_rate": 1.6666666666666667e-05,
+      "loss": 0.0203,
+      "step": 971
+    },
+    {
+      "epoch": 0.2531909351393592,
+      "grad_norm": 0.1385352462530136,
+      "learning_rate": 1.611111111111111e-05,
+      "loss": 0.032,
+      "step": 972
+    },
+    {
+      "epoch": 0.25345141964053136,
+      "grad_norm": 0.0869651809334755,
+      "learning_rate": 1.5555555555555555e-05,
+      "loss": 0.0292,
+      "step": 973
+    },
+    {
+      "epoch": 0.2537119041417036,
+      "grad_norm": 0.04381627216935158,
+      "learning_rate": 1.5e-05,
+      "loss": 0.0215,
+      "step": 974
+    },
+    {
+      "epoch": 0.25397238864287575,
+      "grad_norm": 0.08942004293203354,
+      "learning_rate": 1.4444444444444444e-05,
+      "loss": 0.0281,
+      "step": 975
+    },
+    {
+      "epoch": 0.2542328731440479,
+      "grad_norm": 0.054757945239543915,
+      "learning_rate": 1.3888888888888888e-05,
+      "loss": 0.0232,
+      "step": 976
+    },
+    {
+      "epoch": 0.25449335764522013,
+      "grad_norm": 0.04899635165929794,
+      "learning_rate": 1.3333333333333335e-05,
+      "loss": 0.0218,
+      "step": 977
+    },
+    {
+      "epoch": 0.2547538421463923,
+      "grad_norm": 0.07881903648376465,
+      "learning_rate": 1.2777777777777779e-05,
+      "loss": 0.0116,
+      "step": 978
+    },
+    {
+      "epoch": 0.25501432664756446,
+      "grad_norm": 0.06364961713552475,
+      "learning_rate": 1.2222222222222222e-05,
+      "loss": 0.0277,
+      "step": 979
+    },
+    {
+      "epoch": 0.2552748111487366,
+      "grad_norm": 0.15875524282455444,
+      "learning_rate": 1.1666666666666668e-05,
+      "loss": 0.0327,
+      "step": 980
+    },
+    {
+      "epoch": 0.25553529564990884,
+      "grad_norm": 0.10088494420051575,
+      "learning_rate": 1.1111111111111112e-05,
+      "loss": 0.0099,
+      "step": 981
+    },
+    {
+      "epoch": 0.255795780151081,
+      "grad_norm": 0.08318589627742767,
+      "learning_rate": 1.0555555555555555e-05,
+      "loss": 0.0313,
+      "step": 982
+    },
+    {
+      "epoch": 0.2560562646522532,
+      "grad_norm": 0.05513027682900429,
+      "learning_rate": 1e-05,
+      "loss": 0.0237,
+      "step": 983
+    },
+    {
+      "epoch": 0.2563167491534254,
+      "grad_norm": 0.07061628252267838,
+      "learning_rate": 9.444444444444445e-06,
+      "loss": 0.023,
+      "step": 984
+    },
+    {
+      "epoch": 0.25657723365459756,
+      "grad_norm": 0.054420240223407745,
+      "learning_rate": 8.888888888888888e-06,
+      "loss": 0.0241,
+      "step": 985
+    },
+    {
+      "epoch": 0.2568377181557697,
+      "grad_norm": 0.03888707980513573,
+      "learning_rate": 8.333333333333334e-06,
+      "loss": 0.0128,
+      "step": 986
+    },
+    {
+      "epoch": 0.2570982026569419,
+      "grad_norm": 0.04444512352347374,
+      "learning_rate": 7.777777777777777e-06,
+      "loss": 0.0237,
+      "step": 987
+    },
+    {
+      "epoch": 0.2573586871581141,
+      "grad_norm": 0.07981324940919876,
+      "learning_rate": 7.222222222222222e-06,
+      "loss": 0.0291,
+      "step": 988
+    },
+    {
+      "epoch": 0.2576191716592863,
+      "grad_norm": 0.07422112673521042,
+      "learning_rate": 6.6666666666666675e-06,
+      "loss": 0.025,
+      "step": 989
+    },
+    {
+      "epoch": 0.25787965616045844,
+      "grad_norm": 0.0461212582886219,
+      "learning_rate": 6.111111111111111e-06,
+      "loss": 0.0259,
+      "step": 990
+    },
+    {
+      "epoch": 0.25814014066163066,
+      "grad_norm": 0.04493691772222519,
+      "learning_rate": 5.555555555555556e-06,
+      "loss": 0.0217,
+      "step": 991
+    },
+    {
+      "epoch": 0.2584006251628028,
+      "grad_norm": 0.05476246401667595,
+      "learning_rate": 5e-06,
+      "loss": 0.0223,
+      "step": 992
+    },
+    {
+      "epoch": 0.258661109663975,
+      "grad_norm": 0.05926680192351341,
+      "learning_rate": 4.444444444444444e-06,
+      "loss": 0.0229,
+      "step": 993
+    },
+    {
+      "epoch": 0.25892159416514715,
+      "grad_norm": 0.04447260499000549,
+      "learning_rate": 3.888888888888889e-06,
+      "loss": 0.0228,
+      "step": 994
+    },
+    {
+      "epoch": 0.25918207866631937,
+      "grad_norm": 0.09865443408489227,
+      "learning_rate": 3.3333333333333337e-06,
+      "loss": 0.0278,
+      "step": 995
+    },
+    {
+      "epoch": 0.25944256316749154,
+      "grad_norm": 0.1337546706199646,
+      "learning_rate": 2.777777777777778e-06,
+      "loss": 0.02,
+      "step": 996
+    },
+    {
+      "epoch": 0.2597030476686637,
+      "grad_norm": 0.06787464022636414,
+      "learning_rate": 2.222222222222222e-06,
+      "loss": 0.0264,
+      "step": 997
+    },
+    {
+      "epoch": 0.2599635321698359,
+      "grad_norm": 0.15675954520702362,
+      "learning_rate": 1.6666666666666669e-06,
+      "loss": 0.0334,
+      "step": 998
+    },
+    {
+      "epoch": 0.2602240166710081,
+      "grad_norm": 0.06898515671491623,
+      "learning_rate": 1.111111111111111e-06,
+      "loss": 0.025,
+      "step": 999
+    },
+    {
+      "epoch": 0.26048450117218025,
+      "grad_norm": 0.08689691126346588,
+      "learning_rate": 5.555555555555555e-07,
+      "loss": 0.0125,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 256,
+  "trial_name": null,
+  "trial_params": null
+}