{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6666666666666666, "eval_steps": 2000, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003333333333333333, "grad_norm": 31.625, "learning_rate": 0.0001, "loss": 7.5985, "loss/crossentropy": 2.1267614088952542, "loss/hidden": 3.50078125, "loss/jsd": 0.0, "loss/logits": 0.2063240222632885, "step": 10 }, { "epoch": 0.0006666666666666666, "grad_norm": 29.75, "grad_norm_var": 4.032291666666667, "learning_rate": 0.0001, "loss": 7.7168, "loss/crossentropy": 2.0330664247274397, "loss/hidden": 3.586328125, "loss/jsd": 0.0, "loss/logits": 0.21499024610966444, "step": 20 }, { "epoch": 0.001, "grad_norm": 32.5, "grad_norm_var": 4.584375, "learning_rate": 0.0001, "loss": 7.541, "loss/crossentropy": 2.004887755215168, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.21890099346637726, "step": 30 }, { "epoch": 0.0013333333333333333, "grad_norm": 29.5, "grad_norm_var": 11.2275390625, "learning_rate": 0.0001, "loss": 7.5741, "loss/crossentropy": 1.921644088253379, "loss/hidden": 3.48984375, "loss/jsd": 0.0, "loss/logits": 0.1869568184018135, "step": 40 }, { "epoch": 0.0016666666666666668, "grad_norm": 28.125, "grad_norm_var": 30.218684895833334, "learning_rate": 0.0001, "loss": 7.6759, "loss/crossentropy": 2.0211001455783846, "loss/hidden": 3.51640625, "loss/jsd": 0.0, "loss/logits": 0.20038237608969212, "step": 50 }, { "epoch": 0.002, "grad_norm": 32.0, "grad_norm_var": 28.772330729166665, "learning_rate": 0.0001, "loss": 7.6072, "loss/crossentropy": 1.9673272311687469, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.21945994403213262, "step": 60 }, { "epoch": 0.0023333333333333335, "grad_norm": 31.875, "grad_norm_var": 23.434309895833334, "learning_rate": 0.0001, "loss": 7.6091, "loss/crossentropy": 1.8958056002855301, "loss/hidden": 3.476171875, "loss/jsd": 0.0, "loss/logits": 0.20005326718091965, "step": 70 }, { "epoch": 0.0026666666666666666, "grad_norm": 29.875, "grad_norm_var": 30.380989583333335, "learning_rate": 0.0001, "loss": 7.5738, "loss/crossentropy": 1.9986739233136177, "loss/hidden": 3.543359375, "loss/jsd": 0.0, "loss/logits": 0.20020230654627086, "step": 80 }, { "epoch": 0.003, "grad_norm": 28.75, "grad_norm_var": 23.71875, "learning_rate": 0.0001, "loss": 7.5856, "loss/crossentropy": 1.9644637256860733, "loss/hidden": 3.583203125, "loss/jsd": 0.0, "loss/logits": 0.20230205450206995, "step": 90 }, { "epoch": 0.0033333333333333335, "grad_norm": 76.0, "grad_norm_var": 141.28333333333333, "learning_rate": 0.0001, "loss": 7.5535, "loss/crossentropy": 2.0592997409403324, "loss/hidden": 3.477734375, "loss/jsd": 0.0, "loss/logits": 0.1948918802663684, "step": 100 }, { "epoch": 0.0036666666666666666, "grad_norm": 30.625, "grad_norm_var": 135.15104166666666, "learning_rate": 0.0001, "loss": 7.5903, "loss/crossentropy": 2.063865052163601, "loss/hidden": 3.45859375, "loss/jsd": 0.0, "loss/logits": 0.19530703937634825, "step": 110 }, { "epoch": 0.004, "grad_norm": 28.875, "grad_norm_var": 17.206184895833335, "learning_rate": 0.0001, "loss": 7.5741, "loss/crossentropy": 1.8623701207339765, "loss/hidden": 3.578515625, "loss/jsd": 0.0, "loss/logits": 0.1913905506953597, "step": 120 }, { "epoch": 0.004333333333333333, "grad_norm": 32.5, "grad_norm_var": 23.064322916666665, "learning_rate": 0.0001, "loss": 7.6511, "loss/crossentropy": 2.0091651529073715, "loss/hidden": 3.553125, "loss/jsd": 0.0, "loss/logits": 0.2141670197248459, "step": 130 }, { "epoch": 0.004666666666666667, "grad_norm": 27.375, "grad_norm_var": 103.30514322916666, "learning_rate": 0.0001, "loss": 7.4431, "loss/crossentropy": 1.916436170786619, "loss/hidden": 3.51796875, "loss/jsd": 0.0, "loss/logits": 0.20381243024021387, "step": 140 }, { "epoch": 0.005, "grad_norm": 39.75, "grad_norm_var": 96.50208333333333, "learning_rate": 0.0001, "loss": 7.5559, "loss/crossentropy": 1.9664984509348868, "loss/hidden": 3.58203125, "loss/jsd": 0.0, "loss/logits": 0.21185472514480352, "step": 150 }, { "epoch": 0.005333333333333333, "grad_norm": 38.5, "grad_norm_var": 12.889322916666666, "learning_rate": 0.0001, "loss": 7.5639, "loss/crossentropy": 1.7829985275864602, "loss/hidden": 3.515234375, "loss/jsd": 0.0, "loss/logits": 0.18775589521974326, "step": 160 }, { "epoch": 0.005666666666666667, "grad_norm": 30.625, "grad_norm_var": 7.327018229166667, "learning_rate": 0.0001, "loss": 7.6669, "loss/crossentropy": 2.119756256043911, "loss/hidden": 3.48125, "loss/jsd": 0.0, "loss/logits": 0.19864867273718118, "step": 170 }, { "epoch": 0.006, "grad_norm": 28.5, "grad_norm_var": 11.115625, "learning_rate": 0.0001, "loss": 7.58, "loss/crossentropy": 2.0513909574598075, "loss/hidden": 3.4046875, "loss/jsd": 0.0, "loss/logits": 0.19242641516029835, "step": 180 }, { "epoch": 0.006333333333333333, "grad_norm": 30.5, "grad_norm_var": 12.895768229166666, "learning_rate": 0.0001, "loss": 7.5609, "loss/crossentropy": 2.12065304517746, "loss/hidden": 3.562890625, "loss/jsd": 0.0, "loss/logits": 0.20189347472041846, "step": 190 }, { "epoch": 0.006666666666666667, "grad_norm": 32.25, "grad_norm_var": 6.068489583333333, "learning_rate": 0.0001, "loss": 7.4733, "loss/crossentropy": 1.8765578493475914, "loss/hidden": 3.44921875, "loss/jsd": 0.0, "loss/logits": 0.17754658330231904, "step": 200 }, { "epoch": 0.007, "grad_norm": 30.875, "grad_norm_var": 8.3837890625, "learning_rate": 0.0001, "loss": 7.6002, "loss/crossentropy": 2.1129152834415437, "loss/hidden": 3.607421875, "loss/jsd": 0.0, "loss/logits": 0.21554278153926135, "step": 210 }, { "epoch": 0.007333333333333333, "grad_norm": 36.5, "grad_norm_var": 8.062239583333334, "learning_rate": 0.0001, "loss": 7.5878, "loss/crossentropy": 2.1251440078020094, "loss/hidden": 3.526953125, "loss/jsd": 0.0, "loss/logits": 0.21148672625422477, "step": 220 }, { "epoch": 0.007666666666666666, "grad_norm": 30.25, "grad_norm_var": 3.3749553105552276e+18, "learning_rate": 0.0001, "loss": 7.5887, "loss/crossentropy": 1.9448822535574437, "loss/hidden": 3.531640625, "loss/jsd": 0.0, "loss/logits": 0.19272698145359754, "step": 230 }, { "epoch": 0.008, "grad_norm": 32.25, "grad_norm_var": 7.077018229166667, "learning_rate": 0.0001, "loss": 7.5141, "loss/crossentropy": 1.948906348645687, "loss/hidden": 3.45625, "loss/jsd": 0.0, "loss/logits": 0.1882854463532567, "step": 240 }, { "epoch": 0.008333333333333333, "grad_norm": 27.625, "grad_norm_var": 10.489583333333334, "learning_rate": 0.0001, "loss": 7.5826, "loss/crossentropy": 2.0311428889632226, "loss/hidden": 3.553515625, "loss/jsd": 0.0, "loss/logits": 0.2118887808173895, "step": 250 }, { "epoch": 0.008666666666666666, "grad_norm": 38.5, "grad_norm_var": 2.5671221310702746e+18, "learning_rate": 0.0001, "loss": 7.5344, "loss/crossentropy": 2.0367442041635515, "loss/hidden": 3.51875, "loss/jsd": 0.0, "loss/logits": 0.1990184025838971, "step": 260 }, { "epoch": 0.009, "grad_norm": 29.5, "grad_norm_var": 2.5671221307030984e+18, "learning_rate": 0.0001, "loss": 7.6333, "loss/crossentropy": 2.0081590503454207, "loss/hidden": 3.587109375, "loss/jsd": 0.0, "loss/logits": 0.1952247340232134, "step": 270 }, { "epoch": 0.009333333333333334, "grad_norm": 32.25, "grad_norm_var": 4.65625, "learning_rate": 0.0001, "loss": 7.5652, "loss/crossentropy": 2.0006632819771766, "loss/hidden": 3.6578125, "loss/jsd": 0.0, "loss/logits": 0.2109442435204983, "step": 280 }, { "epoch": 0.009666666666666667, "grad_norm": 28.25, "grad_norm_var": 4.8712890625, "learning_rate": 0.0001, "loss": 7.6176, "loss/crossentropy": 1.9936409659683705, "loss/hidden": 3.62734375, "loss/jsd": 0.0, "loss/logits": 0.20991227701306342, "step": 290 }, { "epoch": 0.01, "grad_norm": 29.875, "grad_norm_var": 1.2395833333333333, "learning_rate": 0.0001, "loss": 7.5556, "loss/crossentropy": 1.9283976651728154, "loss/hidden": 3.505078125, "loss/jsd": 0.0, "loss/logits": 0.1817620201036334, "step": 300 }, { "epoch": 0.010333333333333333, "grad_norm": 28.875, "grad_norm_var": 2.6978515625, "learning_rate": 0.0001, "loss": 7.536, "loss/crossentropy": 1.9715700536966323, "loss/hidden": 3.447265625, "loss/jsd": 0.0, "loss/logits": 0.19542323499917985, "step": 310 }, { "epoch": 0.010666666666666666, "grad_norm": 28.5, "grad_norm_var": 3.8268229166666665, "learning_rate": 0.0001, "loss": 7.5057, "loss/crossentropy": 1.901275810599327, "loss/hidden": 3.5296875, "loss/jsd": 0.0, "loss/logits": 0.2020751972682774, "step": 320 }, { "epoch": 0.011, "grad_norm": 32.25, "grad_norm_var": 9.017643229166667, "learning_rate": 0.0001, "loss": 7.6704, "loss/crossentropy": 2.16257778853178, "loss/hidden": 3.51484375, "loss/jsd": 0.0, "loss/logits": 0.20814207065850496, "step": 330 }, { "epoch": 0.011333333333333334, "grad_norm": 30.25, "grad_norm_var": 22.987955729166668, "learning_rate": 0.0001, "loss": 7.7235, "loss/crossentropy": 2.1863152951002123, "loss/hidden": 3.580859375, "loss/jsd": 0.0, "loss/logits": 0.22101893909275533, "step": 340 }, { "epoch": 0.011666666666666667, "grad_norm": 29.0, "grad_norm_var": 14.775455729166667, "learning_rate": 0.0001, "loss": 7.7208, "loss/crossentropy": 2.0950776278972625, "loss/hidden": 3.582421875, "loss/jsd": 0.0, "loss/logits": 0.21649912521243095, "step": 350 }, { "epoch": 0.012, "grad_norm": 28.875, "grad_norm_var": 14.079622395833333, "learning_rate": 0.0001, "loss": 7.5629, "loss/crossentropy": 2.157699775695801, "loss/hidden": 3.594921875, "loss/jsd": 0.0, "loss/logits": 0.21237569116055965, "step": 360 }, { "epoch": 0.012333333333333333, "grad_norm": 34.5, "grad_norm_var": 8.6619140625, "learning_rate": 0.0001, "loss": 7.5906, "loss/crossentropy": 2.003641249984503, "loss/hidden": 3.60625, "loss/jsd": 0.0, "loss/logits": 0.214218404982239, "step": 370 }, { "epoch": 0.012666666666666666, "grad_norm": 31.375, "grad_norm_var": 7.7134765625, "learning_rate": 0.0001, "loss": 7.4848, "loss/crossentropy": 2.04550613835454, "loss/hidden": 3.49453125, "loss/jsd": 0.0, "loss/logits": 0.1962023086845875, "step": 380 }, { "epoch": 0.013, "grad_norm": 32.25, "grad_norm_var": 17.018489583333334, "learning_rate": 0.0001, "loss": 7.6107, "loss/crossentropy": 1.9983752451837062, "loss/hidden": 3.4640625, "loss/jsd": 0.0, "loss/logits": 0.18816513549536468, "step": 390 }, { "epoch": 0.013333333333333334, "grad_norm": 27.375, "grad_norm_var": 13.6853515625, "learning_rate": 0.0001, "loss": 7.5517, "loss/crossentropy": 1.9227100484073163, "loss/hidden": 3.52578125, "loss/jsd": 0.0, "loss/logits": 0.19887849427759646, "step": 400 }, { "epoch": 0.013666666666666667, "grad_norm": 29.375, "grad_norm_var": 6.414322916666666, "learning_rate": 0.0001, "loss": 7.4839, "loss/crossentropy": 1.8909222275018691, "loss/hidden": 3.52890625, "loss/jsd": 0.0, "loss/logits": 0.20509518096223472, "step": 410 }, { "epoch": 0.014, "grad_norm": 31.25, "grad_norm_var": 43.6775390625, "learning_rate": 0.0001, "loss": 7.5711, "loss/crossentropy": 2.0496916867792607, "loss/hidden": 3.542578125, "loss/jsd": 0.0, "loss/logits": 0.20816515628248453, "step": 420 }, { "epoch": 0.014333333333333333, "grad_norm": 28.375, "grad_norm_var": 43.920833333333334, "learning_rate": 0.0001, "loss": 7.5683, "loss/crossentropy": 1.8629249297082424, "loss/hidden": 3.600390625, "loss/jsd": 0.0, "loss/logits": 0.22204021718353034, "step": 430 }, { "epoch": 0.014666666666666666, "grad_norm": 36.5, "grad_norm_var": 4.4666015625, "learning_rate": 0.0001, "loss": 7.6361, "loss/crossentropy": 2.144109159708023, "loss/hidden": 3.5453125, "loss/jsd": 0.0, "loss/logits": 0.2209251705557108, "step": 440 }, { "epoch": 0.015, "grad_norm": 39.5, "grad_norm_var": 11.8416015625, "learning_rate": 0.0001, "loss": 7.5079, "loss/crossentropy": 2.1198726154863836, "loss/hidden": 3.487890625, "loss/jsd": 0.0, "loss/logits": 0.20496611315757035, "step": 450 }, { "epoch": 0.015333333333333332, "grad_norm": 27.625, "grad_norm_var": 9.52265625, "learning_rate": 0.0001, "loss": 7.6017, "loss/crossentropy": 1.9195545315742493, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.20273383799940348, "step": 460 }, { "epoch": 0.015666666666666666, "grad_norm": 28.0, "grad_norm_var": 3.6997395833333333, "learning_rate": 0.0001, "loss": 7.522, "loss/crossentropy": 1.843916654586792, "loss/hidden": 3.503515625, "loss/jsd": 0.0, "loss/logits": 0.18868891876190902, "step": 470 }, { "epoch": 0.016, "grad_norm": 29.0, "grad_norm_var": 6.409830729166667, "learning_rate": 0.0001, "loss": 7.6407, "loss/crossentropy": 2.101169949769974, "loss/hidden": 3.45546875, "loss/jsd": 0.0, "loss/logits": 0.1965132836252451, "step": 480 }, { "epoch": 0.01633333333333333, "grad_norm": 28.0, "grad_norm_var": 5.893489583333333, "learning_rate": 0.0001, "loss": 7.6128, "loss/crossentropy": 1.7744137354195118, "loss/hidden": 3.688671875, "loss/jsd": 0.0, "loss/logits": 0.22763985134661197, "step": 490 }, { "epoch": 0.016666666666666666, "grad_norm": 30.625, "grad_norm_var": 2.4702473958333333, "learning_rate": 0.0001, "loss": 7.4634, "loss/crossentropy": 1.9821113154292107, "loss/hidden": 3.426953125, "loss/jsd": 0.0, "loss/logits": 0.19102557199075818, "step": 500 }, { "epoch": 0.017, "grad_norm": 33.25, "grad_norm_var": 3.252795169989198e+18, "learning_rate": 0.0001, "loss": 7.5738, "loss/crossentropy": 1.9145816348493099, "loss/hidden": 3.45078125, "loss/jsd": 0.0, "loss/logits": 0.1794050358235836, "step": 510 }, { "epoch": 0.017333333333333333, "grad_norm": 30.0, "grad_norm_var": 3.252795169974168e+18, "learning_rate": 0.0001, "loss": 7.4755, "loss/crossentropy": 1.8432543367147445, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.17617368968203664, "step": 520 }, { "epoch": 0.017666666666666667, "grad_norm": 28.0, "grad_norm_var": 4.024934895833334, "learning_rate": 0.0001, "loss": 7.5833, "loss/crossentropy": 1.8588541634380817, "loss/hidden": 3.59921875, "loss/jsd": 0.0, "loss/logits": 0.222337419167161, "step": 530 }, { "epoch": 0.018, "grad_norm": 32.25, "grad_norm_var": 5.722330729166667, "learning_rate": 0.0001, "loss": 7.5707, "loss/crossentropy": 2.078941762447357, "loss/hidden": 3.549609375, "loss/jsd": 0.0, "loss/logits": 0.20959145110100508, "step": 540 }, { "epoch": 0.018333333333333333, "grad_norm": 34.25, "grad_norm_var": 4.277018229166667, "learning_rate": 0.0001, "loss": 7.5294, "loss/crossentropy": 1.9379733189940453, "loss/hidden": 3.53515625, "loss/jsd": 0.0, "loss/logits": 0.20881600230932235, "step": 550 }, { "epoch": 0.018666666666666668, "grad_norm": 32.25, "grad_norm_var": 7.69765625, "learning_rate": 0.0001, "loss": 7.6373, "loss/crossentropy": 1.9783288672566415, "loss/hidden": 3.58515625, "loss/jsd": 0.0, "loss/logits": 0.21193231288343667, "step": 560 }, { "epoch": 0.019, "grad_norm": 30.0, "grad_norm_var": 2.1447265625, "learning_rate": 0.0001, "loss": 7.6105, "loss/crossentropy": 1.9430489577353, "loss/hidden": 3.543359375, "loss/jsd": 0.0, "loss/logits": 0.20412963908165693, "step": 570 }, { "epoch": 0.019333333333333334, "grad_norm": 28.5, "grad_norm_var": 3.871875, "learning_rate": 0.0001, "loss": 7.4554, "loss/crossentropy": 1.9607401743531228, "loss/hidden": 3.516796875, "loss/jsd": 0.0, "loss/logits": 0.19354354813694954, "step": 580 }, { "epoch": 0.019666666666666666, "grad_norm": 28.875, "grad_norm_var": 2.22265625, "learning_rate": 0.0001, "loss": 7.5661, "loss/crossentropy": 2.1669919759035112, "loss/hidden": 3.520703125, "loss/jsd": 0.0, "loss/logits": 0.21281926594674588, "step": 590 }, { "epoch": 0.02, "grad_norm": 28.0, "grad_norm_var": 6.128059895833333, "learning_rate": 0.0001, "loss": 7.6051, "loss/crossentropy": 1.9848819866776466, "loss/hidden": 3.417578125, "loss/jsd": 0.0, "loss/logits": 0.19332607751712202, "step": 600 }, { "epoch": 0.02033333333333333, "grad_norm": 32.25, "grad_norm_var": 5.8806640625, "learning_rate": 0.0001, "loss": 7.6721, "loss/crossentropy": 1.8939708933234214, "loss/hidden": 3.6921875, "loss/jsd": 0.0, "loss/logits": 0.21814953703433276, "step": 610 }, { "epoch": 0.020666666666666667, "grad_norm": 30.5, "grad_norm_var": 3.1447265625, "learning_rate": 0.0001, "loss": 7.5335, "loss/crossentropy": 2.0598485633730887, "loss/hidden": 3.501953125, "loss/jsd": 0.0, "loss/logits": 0.20407293895259498, "step": 620 }, { "epoch": 0.021, "grad_norm": 30.75, "grad_norm_var": 1.7806640625, "learning_rate": 0.0001, "loss": 7.6016, "loss/crossentropy": 2.089304950833321, "loss/hidden": 3.5296875, "loss/jsd": 0.0, "loss/logits": 0.21185609493404628, "step": 630 }, { "epoch": 0.021333333333333333, "grad_norm": 28.5, "grad_norm_var": 1.9010416666666667, "learning_rate": 0.0001, "loss": 7.4758, "loss/crossentropy": 1.9482996821403504, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.1972294919192791, "step": 640 }, { "epoch": 0.021666666666666667, "grad_norm": 28.875, "grad_norm_var": 4.358072916666667, "learning_rate": 0.0001, "loss": 7.5755, "loss/crossentropy": 1.944211595505476, "loss/hidden": 3.52890625, "loss/jsd": 0.0, "loss/logits": 0.2032023023813963, "step": 650 }, { "epoch": 0.022, "grad_norm": 29.75, "grad_norm_var": 6.462434895833334, "learning_rate": 0.0001, "loss": 7.5883, "loss/crossentropy": 2.0811010405421255, "loss/hidden": 3.65234375, "loss/jsd": 0.0, "loss/logits": 0.2275617804378271, "step": 660 }, { "epoch": 0.022333333333333334, "grad_norm": 30.625, "grad_norm_var": 2.432747395833333, "learning_rate": 0.0001, "loss": 7.563, "loss/crossentropy": 1.9055639967322349, "loss/hidden": 3.44609375, "loss/jsd": 0.0, "loss/logits": 0.17874624766409397, "step": 670 }, { "epoch": 0.02266666666666667, "grad_norm": 32.0, "grad_norm_var": 1.7207682291666666, "learning_rate": 0.0001, "loss": 7.4911, "loss/crossentropy": 1.9971644386649132, "loss/hidden": 3.60234375, "loss/jsd": 0.0, "loss/logits": 0.21614729724824427, "step": 680 }, { "epoch": 0.023, "grad_norm": 29.375, "grad_norm_var": 4.27265625, "learning_rate": 0.0001, "loss": 7.4699, "loss/crossentropy": 1.7947118416428567, "loss/hidden": 3.528515625, "loss/jsd": 0.0, "loss/logits": 0.19064230471849442, "step": 690 }, { "epoch": 0.023333333333333334, "grad_norm": 44.75, "grad_norm_var": 2.9871953734742487e+18, "learning_rate": 0.0001, "loss": 7.6738, "loss/crossentropy": 2.002936027944088, "loss/hidden": 3.569140625, "loss/jsd": 0.0, "loss/logits": 0.20915955789387225, "step": 700 }, { "epoch": 0.023666666666666666, "grad_norm": 34.0, "grad_norm_var": 5.215599895383044e+18, "learning_rate": 0.0001, "loss": 7.6325, "loss/crossentropy": 2.1164340645074846, "loss/hidden": 3.86953125, "loss/jsd": 0.0, "loss/logits": 0.2152573699131608, "step": 710 }, { "epoch": 0.024, "grad_norm": 37.5, "grad_norm_var": 2.786672611922455e+18, "learning_rate": 0.0001, "loss": 7.5198, "loss/crossentropy": 1.98669326454401, "loss/hidden": 3.548828125, "loss/jsd": 0.0, "loss/logits": 0.19720418509095908, "step": 720 }, { "epoch": 0.024333333333333332, "grad_norm": 28.375, "grad_norm_var": 7.020768229166666, "learning_rate": 0.0001, "loss": 7.434, "loss/crossentropy": 2.0505421236157417, "loss/hidden": 3.48515625, "loss/jsd": 0.0, "loss/logits": 0.2041106302291155, "step": 730 }, { "epoch": 0.024666666666666667, "grad_norm": 31.625, "grad_norm_var": 2.09140625, "learning_rate": 0.0001, "loss": 7.6193, "loss/crossentropy": 2.248919141292572, "loss/hidden": 3.48828125, "loss/jsd": 0.0, "loss/logits": 0.20840255357325077, "step": 740 }, { "epoch": 0.025, "grad_norm": 28.125, "grad_norm_var": 2.169791666666667, "learning_rate": 0.0001, "loss": 7.509, "loss/crossentropy": 1.864149733632803, "loss/hidden": 3.543359375, "loss/jsd": 0.0, "loss/logits": 0.19409853629767895, "step": 750 }, { "epoch": 0.025333333333333333, "grad_norm": 29.5, "grad_norm_var": 3.064583333333333, "learning_rate": 0.0001, "loss": 7.5649, "loss/crossentropy": 2.054806835949421, "loss/hidden": 3.597265625, "loss/jsd": 0.0, "loss/logits": 0.21673853546380997, "step": 760 }, { "epoch": 0.025666666666666667, "grad_norm": 29.75, "grad_norm_var": 2.7035807291666667, "learning_rate": 0.0001, "loss": 7.4756, "loss/crossentropy": 2.0617753110826014, "loss/hidden": 3.443359375, "loss/jsd": 0.0, "loss/logits": 0.206906277872622, "step": 770 }, { "epoch": 0.026, "grad_norm": 31.5, "grad_norm_var": 2.6372395833333333, "learning_rate": 0.0001, "loss": 7.4702, "loss/crossentropy": 1.8962275065481662, "loss/hidden": 3.608984375, "loss/jsd": 0.0, "loss/logits": 0.19846606981009246, "step": 780 }, { "epoch": 0.026333333333333334, "grad_norm": 33.5, "grad_norm_var": 3.169205729166667, "learning_rate": 0.0001, "loss": 7.5526, "loss/crossentropy": 2.071788703650236, "loss/hidden": 3.512109375, "loss/jsd": 0.0, "loss/logits": 0.20218130983412266, "step": 790 }, { "epoch": 0.02666666666666667, "grad_norm": 30.875, "grad_norm_var": 3.065625, "learning_rate": 0.0001, "loss": 7.4885, "loss/crossentropy": 2.0777721792459487, "loss/hidden": 3.453515625, "loss/jsd": 0.0, "loss/logits": 0.1973621778190136, "step": 800 }, { "epoch": 0.027, "grad_norm": 33.0, "grad_norm_var": 2.6634765625, "learning_rate": 0.0001, "loss": 7.5415, "loss/crossentropy": 2.004931616783142, "loss/hidden": 3.50546875, "loss/jsd": 0.0, "loss/logits": 0.2041475849226117, "step": 810 }, { "epoch": 0.027333333333333334, "grad_norm": 29.875, "grad_norm_var": 8.7400390625, "learning_rate": 0.0001, "loss": 7.5747, "loss/crossentropy": 1.9893924653530122, "loss/hidden": 3.52578125, "loss/jsd": 0.0, "loss/logits": 0.20147222764790057, "step": 820 }, { "epoch": 0.027666666666666666, "grad_norm": 29.375, "grad_norm_var": 6.619791666666667, "learning_rate": 0.0001, "loss": 7.5374, "loss/crossentropy": 2.0037991762161256, "loss/hidden": 3.61796875, "loss/jsd": 0.0, "loss/logits": 0.21304882485419513, "step": 830 }, { "epoch": 0.028, "grad_norm": 27.25, "grad_norm_var": 13.094205729166667, "learning_rate": 0.0001, "loss": 7.5467, "loss/crossentropy": 2.165927970409393, "loss/hidden": 3.61328125, "loss/jsd": 0.0, "loss/logits": 0.22674734387546777, "step": 840 }, { "epoch": 0.028333333333333332, "grad_norm": 33.0, "grad_norm_var": 4.539909863774495e+18, "learning_rate": 0.0001, "loss": 7.6389, "loss/crossentropy": 2.002170477807522, "loss/hidden": 3.64453125, "loss/jsd": 0.0, "loss/logits": 0.2155582347884774, "step": 850 }, { "epoch": 0.028666666666666667, "grad_norm": 31.25, "grad_norm_var": 13.001497395833333, "learning_rate": 0.0001, "loss": 7.4992, "loss/crossentropy": 1.9676588103175163, "loss/hidden": 3.56015625, "loss/jsd": 0.0, "loss/logits": 0.21938053257763385, "step": 860 }, { "epoch": 0.029, "grad_norm": 29.375, "grad_norm_var": 2.3837890625, "learning_rate": 0.0001, "loss": 7.4194, "loss/crossentropy": 2.071619277447462, "loss/hidden": 3.418359375, "loss/jsd": 0.0, "loss/logits": 0.18815423799678682, "step": 870 }, { "epoch": 0.029333333333333333, "grad_norm": 31.875, "grad_norm_var": 4.96015625, "learning_rate": 0.0001, "loss": 7.5124, "loss/crossentropy": 1.8726294487714767, "loss/hidden": 3.544140625, "loss/jsd": 0.0, "loss/logits": 0.1933526873588562, "step": 880 }, { "epoch": 0.029666666666666668, "grad_norm": 31.5, "grad_norm_var": 4.031184895833333, "learning_rate": 0.0001, "loss": 7.483, "loss/crossentropy": 2.0754525646567346, "loss/hidden": 3.452734375, "loss/jsd": 0.0, "loss/logits": 0.19888003207743168, "step": 890 }, { "epoch": 0.03, "grad_norm": 27.75, "grad_norm_var": 3.1328868321504614e+18, "learning_rate": 0.0001, "loss": 7.5394, "loss/crossentropy": 1.982106475532055, "loss/hidden": 3.522265625, "loss/jsd": 0.0, "loss/logits": 0.19500507693737745, "step": 900 }, { "epoch": 0.030333333333333334, "grad_norm": 28.375, "grad_norm_var": 3.3374348958333333, "learning_rate": 0.0001, "loss": 7.4658, "loss/crossentropy": 2.0982042878866194, "loss/hidden": 3.450390625, "loss/jsd": 0.0, "loss/logits": 0.19667920079082252, "step": 910 }, { "epoch": 0.030666666666666665, "grad_norm": 27.625, "grad_norm_var": 15.303580729166667, "learning_rate": 0.0001, "loss": 7.5156, "loss/crossentropy": 1.917236404120922, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.1794217735528946, "step": 920 }, { "epoch": 0.031, "grad_norm": 29.875, "grad_norm_var": 2.491080729166667, "learning_rate": 0.0001, "loss": 7.3778, "loss/crossentropy": 1.8236326985061169, "loss/hidden": 3.599609375, "loss/jsd": 0.0, "loss/logits": 0.1855462996289134, "step": 930 }, { "epoch": 0.03133333333333333, "grad_norm": 29.75, "grad_norm_var": 1.6958333333333333, "learning_rate": 0.0001, "loss": 7.5085, "loss/crossentropy": 1.9527893960475922, "loss/hidden": 3.597265625, "loss/jsd": 0.0, "loss/logits": 0.1947738079354167, "step": 940 }, { "epoch": 0.03166666666666667, "grad_norm": 31.625, "grad_norm_var": 6.542643229166667, "learning_rate": 0.0001, "loss": 7.4885, "loss/crossentropy": 2.021645413711667, "loss/hidden": 3.491796875, "loss/jsd": 0.0, "loss/logits": 0.21985882464796305, "step": 950 }, { "epoch": 0.032, "grad_norm": 30.5, "grad_norm_var": 5.34765625, "learning_rate": 0.0001, "loss": 7.5288, "loss/crossentropy": 2.0543280750513078, "loss/hidden": 3.4421875, "loss/jsd": 0.0, "loss/logits": 0.20037652570754289, "step": 960 }, { "epoch": 0.03233333333333333, "grad_norm": 28.875, "grad_norm_var": 4.117643229166666, "learning_rate": 0.0001, "loss": 7.577, "loss/crossentropy": 2.075811302661896, "loss/hidden": 3.483984375, "loss/jsd": 0.0, "loss/logits": 0.2037598794326186, "step": 970 }, { "epoch": 0.03266666666666666, "grad_norm": 30.875, "grad_norm_var": 4.921875, "learning_rate": 0.0001, "loss": 7.4566, "loss/crossentropy": 1.7558796517550945, "loss/hidden": 3.585546875, "loss/jsd": 0.0, "loss/logits": 0.2151224732398987, "step": 980 }, { "epoch": 0.033, "grad_norm": 27.75, "grad_norm_var": 3.86640625, "learning_rate": 0.0001, "loss": 7.6352, "loss/crossentropy": 2.005723576247692, "loss/hidden": 3.525, "loss/jsd": 0.0, "loss/logits": 0.21079379208385945, "step": 990 }, { "epoch": 0.03333333333333333, "grad_norm": 29.0, "grad_norm_var": 2.881705729166667, "learning_rate": 0.0001, "loss": 7.5591, "loss/crossentropy": 1.9675343804061414, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.1875331589952111, "step": 1000 }, { "epoch": 0.033666666666666664, "grad_norm": 28.5, "grad_norm_var": 2.2447265625, "learning_rate": 0.0001, "loss": 7.6778, "loss/crossentropy": 2.1516902424395083, "loss/hidden": 3.5578125, "loss/jsd": 0.0, "loss/logits": 0.2169111574999988, "step": 1010 }, { "epoch": 0.034, "grad_norm": 34.5, "grad_norm_var": 4.061458333333333, "learning_rate": 0.0001, "loss": 7.5063, "loss/crossentropy": 2.0061569780111315, "loss/hidden": 3.56015625, "loss/jsd": 0.0, "loss/logits": 0.21480353213846684, "step": 1020 }, { "epoch": 0.034333333333333334, "grad_norm": 29.875, "grad_norm_var": 3.676041666666667, "learning_rate": 0.0001, "loss": 7.4549, "loss/crossentropy": 1.790242201089859, "loss/hidden": 3.583984375, "loss/jsd": 0.0, "loss/logits": 0.2009488895535469, "step": 1030 }, { "epoch": 0.034666666666666665, "grad_norm": 30.25, "grad_norm_var": 4.843489583333334, "learning_rate": 0.0001, "loss": 7.6435, "loss/crossentropy": 1.8725737452507019, "loss/hidden": 3.560546875, "loss/jsd": 0.0, "loss/logits": 0.1986979153007269, "step": 1040 }, { "epoch": 0.035, "grad_norm": 31.125, "grad_norm_var": 7.880989583333333, "learning_rate": 0.0001, "loss": 7.5452, "loss/crossentropy": 1.9032835066318512, "loss/hidden": 3.6078125, "loss/jsd": 0.0, "loss/logits": 0.22343007065355777, "step": 1050 }, { "epoch": 0.035333333333333335, "grad_norm": 27.75, "grad_norm_var": 3.1936848958333335, "learning_rate": 0.0001, "loss": 7.4307, "loss/crossentropy": 1.8790418922901153, "loss/hidden": 3.509765625, "loss/jsd": 0.0, "loss/logits": 0.19635042268782854, "step": 1060 }, { "epoch": 0.035666666666666666, "grad_norm": 27.5, "grad_norm_var": 19.636393229166668, "learning_rate": 0.0001, "loss": 7.5323, "loss/crossentropy": 2.105144755542278, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.20146814184263348, "step": 1070 }, { "epoch": 0.036, "grad_norm": 28.75, "grad_norm_var": 1.5229166666666667, "learning_rate": 0.0001, "loss": 7.4547, "loss/crossentropy": 1.9635142974555493, "loss/hidden": 3.596484375, "loss/jsd": 0.0, "loss/logits": 0.2115407356992364, "step": 1080 }, { "epoch": 0.036333333333333336, "grad_norm": 27.5, "grad_norm_var": 1.99140625, "learning_rate": 0.0001, "loss": 7.4156, "loss/crossentropy": 1.7195671685039997, "loss/hidden": 3.52421875, "loss/jsd": 0.0, "loss/logits": 0.17583819925785066, "step": 1090 }, { "epoch": 0.03666666666666667, "grad_norm": 30.625, "grad_norm_var": 16.251497395833333, "learning_rate": 0.0001, "loss": 7.5598, "loss/crossentropy": 2.0989751145243645, "loss/hidden": 3.512890625, "loss/jsd": 0.0, "loss/logits": 0.21162394005805255, "step": 1100 }, { "epoch": 0.037, "grad_norm": 31.875, "grad_norm_var": 16.667643229166668, "learning_rate": 0.0001, "loss": 7.44, "loss/crossentropy": 1.8476200565695762, "loss/hidden": 3.656640625, "loss/jsd": 0.0, "loss/logits": 0.21428941674530505, "step": 1110 }, { "epoch": 0.037333333333333336, "grad_norm": 27.875, "grad_norm_var": 4.141080729166666, "learning_rate": 0.0001, "loss": 7.5599, "loss/crossentropy": 2.0233672678470613, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.20636487621814012, "step": 1120 }, { "epoch": 0.03766666666666667, "grad_norm": 30.625, "grad_norm_var": 3.905989583333333, "learning_rate": 0.0001, "loss": 7.5396, "loss/crossentropy": 2.0706439450383187, "loss/hidden": 3.6375, "loss/jsd": 0.0, "loss/logits": 0.22444433607161046, "step": 1130 }, { "epoch": 0.038, "grad_norm": 40.0, "grad_norm_var": 9.1681640625, "learning_rate": 0.0001, "loss": 7.4816, "loss/crossentropy": 2.0604072071611883, "loss/hidden": 3.46328125, "loss/jsd": 0.0, "loss/logits": 0.19570945519953967, "step": 1140 }, { "epoch": 0.03833333333333333, "grad_norm": 27.125, "grad_norm_var": 8.778125, "learning_rate": 0.0001, "loss": 7.496, "loss/crossentropy": 2.003468566387892, "loss/hidden": 3.533203125, "loss/jsd": 0.0, "loss/logits": 0.20729636624455453, "step": 1150 }, { "epoch": 0.03866666666666667, "grad_norm": 26.875, "grad_norm_var": 4.26875, "learning_rate": 0.0001, "loss": 7.5086, "loss/crossentropy": 2.0594829618930817, "loss/hidden": 3.520703125, "loss/jsd": 0.0, "loss/logits": 0.19975599888712167, "step": 1160 }, { "epoch": 0.039, "grad_norm": 30.0, "grad_norm_var": 5.464322916666666, "learning_rate": 0.0001, "loss": 7.4157, "loss/crossentropy": 2.1024828396737574, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.20702101960778235, "step": 1170 }, { "epoch": 0.03933333333333333, "grad_norm": 30.875, "grad_norm_var": 5.27890625, "learning_rate": 0.0001, "loss": 7.4596, "loss/crossentropy": 1.8360055424273014, "loss/hidden": 3.494140625, "loss/jsd": 0.0, "loss/logits": 0.19995237458497286, "step": 1180 }, { "epoch": 0.03966666666666667, "grad_norm": 28.5, "grad_norm_var": 2.24765625, "learning_rate": 0.0001, "loss": 7.4699, "loss/crossentropy": 1.8730700716376305, "loss/hidden": 3.46640625, "loss/jsd": 0.0, "loss/logits": 0.1805972701869905, "step": 1190 }, { "epoch": 0.04, "grad_norm": 30.0, "grad_norm_var": 1.39140625, "learning_rate": 0.0001, "loss": 7.4527, "loss/crossentropy": 1.9279061555862427, "loss/hidden": 3.613671875, "loss/jsd": 0.0, "loss/logits": 0.19899049140512942, "step": 1200 }, { "epoch": 0.04033333333333333, "grad_norm": 30.25, "grad_norm_var": 2.6393229166666665, "learning_rate": 0.0001, "loss": 7.5003, "loss/crossentropy": 1.9523996852338315, "loss/hidden": 3.4671875, "loss/jsd": 0.0, "loss/logits": 0.18847337635234, "step": 1210 }, { "epoch": 0.04066666666666666, "grad_norm": 28.625, "grad_norm_var": 4.0712890625, "learning_rate": 0.0001, "loss": 7.3692, "loss/crossentropy": 1.869452066719532, "loss/hidden": 3.46015625, "loss/jsd": 0.0, "loss/logits": 0.18939079586416482, "step": 1220 }, { "epoch": 0.041, "grad_norm": 27.125, "grad_norm_var": 4.7, "learning_rate": 0.0001, "loss": 7.4955, "loss/crossentropy": 1.8472121506929398, "loss/hidden": 3.525390625, "loss/jsd": 0.0, "loss/logits": 0.19386024083942174, "step": 1230 }, { "epoch": 0.04133333333333333, "grad_norm": 31.0, "grad_norm_var": 6.287955729166667, "learning_rate": 0.0001, "loss": 7.5464, "loss/crossentropy": 2.074781009554863, "loss/hidden": 3.437109375, "loss/jsd": 0.0, "loss/logits": 0.1918861323967576, "step": 1240 }, { "epoch": 0.041666666666666664, "grad_norm": 28.25, "grad_norm_var": 7.2625, "learning_rate": 0.0001, "loss": 7.4926, "loss/crossentropy": 2.0296138398349286, "loss/hidden": 3.51484375, "loss/jsd": 0.0, "loss/logits": 0.20444901417940856, "step": 1250 }, { "epoch": 0.042, "grad_norm": 34.25, "grad_norm_var": 4.368489583333333, "learning_rate": 0.0001, "loss": 7.4304, "loss/crossentropy": 1.9001075483858585, "loss/hidden": 3.496875, "loss/jsd": 0.0, "loss/logits": 0.20093492306768895, "step": 1260 }, { "epoch": 0.042333333333333334, "grad_norm": 34.25, "grad_norm_var": 5.839518229166667, "learning_rate": 0.0001, "loss": 7.5735, "loss/crossentropy": 2.0429010786116124, "loss/hidden": 3.440625, "loss/jsd": 0.0, "loss/logits": 0.1947106197476387, "step": 1270 }, { "epoch": 0.042666666666666665, "grad_norm": 28.125, "grad_norm_var": 19.803059895833332, "learning_rate": 0.0001, "loss": 7.4188, "loss/crossentropy": 1.9738387122750283, "loss/hidden": 3.42109375, "loss/jsd": 0.0, "loss/logits": 0.19369605761021375, "step": 1280 }, { "epoch": 0.043, "grad_norm": 28.5, "grad_norm_var": 20.662955729166665, "learning_rate": 0.0001, "loss": 7.5718, "loss/crossentropy": 2.069811438769102, "loss/hidden": 3.5921875, "loss/jsd": 0.0, "loss/logits": 0.2095336861908436, "step": 1290 }, { "epoch": 0.043333333333333335, "grad_norm": 29.75, "grad_norm_var": 9.688997395833333, "learning_rate": 0.0001, "loss": 7.5433, "loss/crossentropy": 1.8691002488136292, "loss/hidden": 3.55625, "loss/jsd": 0.0, "loss/logits": 0.20217573698610067, "step": 1300 }, { "epoch": 0.043666666666666666, "grad_norm": 31.5, "grad_norm_var": 8.06875, "learning_rate": 0.0001, "loss": 7.6259, "loss/crossentropy": 1.9233153462409973, "loss/hidden": 3.442578125, "loss/jsd": 0.0, "loss/logits": 0.1821169566363096, "step": 1310 }, { "epoch": 0.044, "grad_norm": 28.625, "grad_norm_var": 2.505143229166667, "learning_rate": 0.0001, "loss": 7.5391, "loss/crossentropy": 1.886833480745554, "loss/hidden": 3.531640625, "loss/jsd": 0.0, "loss/logits": 0.18800980802625417, "step": 1320 }, { "epoch": 0.044333333333333336, "grad_norm": 62.5, "grad_norm_var": 74.24368489583334, "learning_rate": 0.0001, "loss": 7.4192, "loss/crossentropy": 2.0140346214175224, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.20020044017583133, "step": 1330 }, { "epoch": 0.04466666666666667, "grad_norm": 30.5, "grad_norm_var": 67.02180989583333, "learning_rate": 0.0001, "loss": 7.641, "loss/crossentropy": 1.987047991529107, "loss/hidden": 3.42734375, "loss/jsd": 0.0, "loss/logits": 0.1943076512310654, "step": 1340 }, { "epoch": 0.045, "grad_norm": 29.25, "grad_norm_var": 4.337434895833334, "learning_rate": 0.0001, "loss": 7.3804, "loss/crossentropy": 1.9446437805891037, "loss/hidden": 3.51875, "loss/jsd": 0.0, "loss/logits": 0.19996042568236588, "step": 1350 }, { "epoch": 0.04533333333333334, "grad_norm": 27.875, "grad_norm_var": 37.121875, "learning_rate": 0.0001, "loss": 7.3962, "loss/crossentropy": 2.0235562570393086, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.18199211172759533, "step": 1360 }, { "epoch": 0.04566666666666667, "grad_norm": 28.0, "grad_norm_var": 30.05625, "learning_rate": 0.0001, "loss": 7.5547, "loss/crossentropy": 2.0059662126004696, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.1931003224104643, "step": 1370 }, { "epoch": 0.046, "grad_norm": 30.625, "grad_norm_var": 1.6393229166666667, "learning_rate": 0.0001, "loss": 7.5566, "loss/crossentropy": 2.0615221828222277, "loss/hidden": 3.4, "loss/jsd": 0.0, "loss/logits": 0.1939302109181881, "step": 1380 }, { "epoch": 0.04633333333333333, "grad_norm": 31.125, "grad_norm_var": 2.4155598958333333, "learning_rate": 0.0001, "loss": 7.3674, "loss/crossentropy": 2.0334793105721474, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.20573036475107073, "step": 1390 }, { "epoch": 0.04666666666666667, "grad_norm": 28.75, "grad_norm_var": 3.4385416666666666, "learning_rate": 0.0001, "loss": 7.5564, "loss/crossentropy": 2.0022457137703897, "loss/hidden": 3.47734375, "loss/jsd": 0.0, "loss/logits": 0.2054836593568325, "step": 1400 }, { "epoch": 0.047, "grad_norm": 31.0, "grad_norm_var": 2.569205729166667, "learning_rate": 0.0001, "loss": 7.5129, "loss/crossentropy": 2.0882332563400268, "loss/hidden": 3.5875, "loss/jsd": 0.0, "loss/logits": 0.2245011145249009, "step": 1410 }, { "epoch": 0.04733333333333333, "grad_norm": 28.75, "grad_norm_var": 3.309309895833333, "learning_rate": 0.0001, "loss": 7.6391, "loss/crossentropy": 2.117540512979031, "loss/hidden": 3.49140625, "loss/jsd": 0.0, "loss/logits": 0.21007234025746585, "step": 1420 }, { "epoch": 0.04766666666666667, "grad_norm": 29.25, "grad_norm_var": 2.426041666666667, "learning_rate": 0.0001, "loss": 7.5825, "loss/crossentropy": 1.9723884835839272, "loss/hidden": 3.619140625, "loss/jsd": 0.0, "loss/logits": 0.22350366972386837, "step": 1430 }, { "epoch": 0.048, "grad_norm": 28.25, "grad_norm_var": 6.510416666666667, "learning_rate": 0.0001, "loss": 7.5028, "loss/crossentropy": 2.130250224471092, "loss/hidden": 3.476171875, "loss/jsd": 0.0, "loss/logits": 0.204136916808784, "step": 1440 }, { "epoch": 0.04833333333333333, "grad_norm": 32.25, "grad_norm_var": 5.830208333333333, "learning_rate": 0.0001, "loss": 7.6264, "loss/crossentropy": 1.9263621412217617, "loss/hidden": 3.5265625, "loss/jsd": 0.0, "loss/logits": 0.20204023886471986, "step": 1450 }, { "epoch": 0.048666666666666664, "grad_norm": 30.25, "grad_norm_var": 3.077018229166667, "learning_rate": 0.0001, "loss": 7.6508, "loss/crossentropy": 2.015178495645523, "loss/hidden": 3.505859375, "loss/jsd": 0.0, "loss/logits": 0.2149656626395881, "step": 1460 }, { "epoch": 0.049, "grad_norm": 28.75, "grad_norm_var": 1.6364583333333333, "learning_rate": 0.0001, "loss": 7.396, "loss/crossentropy": 1.7446117386221887, "loss/hidden": 3.53984375, "loss/jsd": 0.0, "loss/logits": 0.18610622957348824, "step": 1470 }, { "epoch": 0.04933333333333333, "grad_norm": 29.5, "grad_norm_var": 2.1093098958333334, "learning_rate": 0.0001, "loss": 7.5106, "loss/crossentropy": 1.928684476017952, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.19403766375035048, "step": 1480 }, { "epoch": 0.049666666666666665, "grad_norm": 34.25, "grad_norm_var": 3.076041666666667, "learning_rate": 0.0001, "loss": 7.626, "loss/crossentropy": 1.9855677127838134, "loss/hidden": 3.621484375, "loss/jsd": 0.0, "loss/logits": 0.2084784339182079, "step": 1490 }, { "epoch": 0.05, "grad_norm": 34.0, "grad_norm_var": 4.855989583333334, "learning_rate": 0.0001, "loss": 7.5278, "loss/crossentropy": 1.9957743145525455, "loss/hidden": 3.4625, "loss/jsd": 0.0, "loss/logits": 0.18782012313604354, "step": 1500 }, { "epoch": 0.050333333333333334, "grad_norm": 29.5, "grad_norm_var": 5.1384765625, "learning_rate": 0.0001, "loss": 7.5201, "loss/crossentropy": 1.7874414373189211, "loss/hidden": 3.560546875, "loss/jsd": 0.0, "loss/logits": 0.19774688063189388, "step": 1510 }, { "epoch": 0.050666666666666665, "grad_norm": 28.375, "grad_norm_var": 3.6166666666666667, "learning_rate": 0.0001, "loss": 7.5365, "loss/crossentropy": 1.976015865802765, "loss/hidden": 3.511328125, "loss/jsd": 0.0, "loss/logits": 0.19546866938471794, "step": 1520 }, { "epoch": 0.051, "grad_norm": 31.25, "grad_norm_var": 4.483072916666667, "learning_rate": 0.0001, "loss": 7.5963, "loss/crossentropy": 2.012376007437706, "loss/hidden": 3.490234375, "loss/jsd": 0.0, "loss/logits": 0.19421723317354916, "step": 1530 }, { "epoch": 0.051333333333333335, "grad_norm": 29.0, "grad_norm_var": 3.6143229166666666, "learning_rate": 0.0001, "loss": 7.5606, "loss/crossentropy": 1.973476481437683, "loss/hidden": 3.407421875, "loss/jsd": 0.0, "loss/logits": 0.18947902042418718, "step": 1540 }, { "epoch": 0.051666666666666666, "grad_norm": 31.25, "grad_norm_var": 2.216080729166667, "learning_rate": 0.0001, "loss": 7.5841, "loss/crossentropy": 2.0963733606040478, "loss/hidden": 3.63046875, "loss/jsd": 0.0, "loss/logits": 0.21203735722228884, "step": 1550 }, { "epoch": 0.052, "grad_norm": 31.125, "grad_norm_var": 2.939322916666667, "learning_rate": 0.0001, "loss": 7.5262, "loss/crossentropy": 2.087253449857235, "loss/hidden": 3.505859375, "loss/jsd": 0.0, "loss/logits": 0.20268339216709136, "step": 1560 }, { "epoch": 0.052333333333333336, "grad_norm": 27.875, "grad_norm_var": 19.0056640625, "learning_rate": 0.0001, "loss": 7.4507, "loss/crossentropy": 1.9263526067137717, "loss/hidden": 3.570703125, "loss/jsd": 0.0, "loss/logits": 0.20883142184466125, "step": 1570 }, { "epoch": 0.05266666666666667, "grad_norm": 34.5, "grad_norm_var": 58.7853515625, "learning_rate": 0.0001, "loss": 7.5061, "loss/crossentropy": 2.00187024474144, "loss/hidden": 3.547265625, "loss/jsd": 0.0, "loss/logits": 0.20301075298339127, "step": 1580 }, { "epoch": 0.053, "grad_norm": 28.25, "grad_norm_var": 363.8733723958333, "learning_rate": 0.0001, "loss": 7.6123, "loss/crossentropy": 2.1360982835292814, "loss/hidden": 3.5203125, "loss/jsd": 0.0, "loss/logits": 0.22095724791288376, "step": 1590 }, { "epoch": 0.05333333333333334, "grad_norm": 40.0, "grad_norm_var": 299.7135416666667, "learning_rate": 0.0001, "loss": 7.5589, "loss/crossentropy": 2.0119175627827643, "loss/hidden": 3.5109375, "loss/jsd": 0.0, "loss/logits": 0.19130879566073417, "step": 1600 }, { "epoch": 0.05366666666666667, "grad_norm": 71.0, "grad_norm_var": 107.38118489583333, "learning_rate": 0.0001, "loss": 7.3823, "loss/crossentropy": 2.086880347132683, "loss/hidden": 3.458203125, "loss/jsd": 0.0, "loss/logits": 0.20045112436637283, "step": 1610 }, { "epoch": 0.054, "grad_norm": 57.75, "grad_norm_var": 284.3822916666667, "learning_rate": 0.0001, "loss": 7.4543, "loss/crossentropy": 1.8442568637430667, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.18611445426940917, "step": 1620 }, { "epoch": 0.05433333333333333, "grad_norm": 38.0, "grad_norm_var": 2.758736203563506e+18, "learning_rate": 0.0001, "loss": 7.5437, "loss/crossentropy": 2.0272573217749597, "loss/hidden": 3.5796875, "loss/jsd": 0.0, "loss/logits": 0.2182312898337841, "step": 1630 }, { "epoch": 0.05466666666666667, "grad_norm": 42.0, "grad_norm_var": 132.63723958333333, "learning_rate": 0.0001, "loss": 7.4523, "loss/crossentropy": 2.2197774812579154, "loss/hidden": 3.434375, "loss/jsd": 0.0, "loss/logits": 0.20501965284347534, "step": 1640 }, { "epoch": 0.055, "grad_norm": 31.5, "grad_norm_var": 72.67083333333333, "learning_rate": 0.0001, "loss": 7.4527, "loss/crossentropy": 2.0575285345315932, "loss/hidden": 3.54375, "loss/jsd": 0.0, "loss/logits": 0.20768663324415684, "step": 1650 }, { "epoch": 0.05533333333333333, "grad_norm": 29.5, "grad_norm_var": 4.2572265625, "learning_rate": 0.0001, "loss": 7.2883, "loss/crossentropy": 1.9474568977952003, "loss/hidden": 3.45703125, "loss/jsd": 0.0, "loss/logits": 0.18284566076472403, "step": 1660 }, { "epoch": 0.05566666666666667, "grad_norm": 30.5, "grad_norm_var": 2.751822916666667, "learning_rate": 0.0001, "loss": 7.3877, "loss/crossentropy": 1.9012431338429452, "loss/hidden": 3.51796875, "loss/jsd": 0.0, "loss/logits": 0.20468210186809302, "step": 1670 }, { "epoch": 0.056, "grad_norm": 31.875, "grad_norm_var": 173.35358072916668, "learning_rate": 0.0001, "loss": 7.5338, "loss/crossentropy": 1.9648082010447978, "loss/hidden": 3.48828125, "loss/jsd": 0.0, "loss/logits": 0.20034276638180018, "step": 1680 }, { "epoch": 0.05633333333333333, "grad_norm": 27.875, "grad_norm_var": 174.56770833333334, "learning_rate": 0.0001, "loss": 7.5064, "loss/crossentropy": 1.945329135656357, "loss/hidden": 3.471875, "loss/jsd": 0.0, "loss/logits": 0.1990867467597127, "step": 1690 }, { "epoch": 0.056666666666666664, "grad_norm": 30.5, "grad_norm_var": 1.6837890625, "learning_rate": 0.0001, "loss": 7.5574, "loss/crossentropy": 2.179739661514759, "loss/hidden": 3.46640625, "loss/jsd": 0.0, "loss/logits": 0.21293067559599876, "step": 1700 }, { "epoch": 0.057, "grad_norm": 29.25, "grad_norm_var": 1.4046223958333333, "learning_rate": 0.0001, "loss": 7.4038, "loss/crossentropy": 2.059504656493664, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.19147460404783487, "step": 1710 }, { "epoch": 0.05733333333333333, "grad_norm": 31.625, "grad_norm_var": 18.357291666666665, "learning_rate": 0.0001, "loss": 7.5845, "loss/crossentropy": 2.033545310795307, "loss/hidden": 3.606640625, "loss/jsd": 0.0, "loss/logits": 0.2197973679751158, "step": 1720 }, { "epoch": 0.057666666666666665, "grad_norm": 28.875, "grad_norm_var": 19.99375, "learning_rate": 0.0001, "loss": 7.4686, "loss/crossentropy": 1.9618622936308383, "loss/hidden": 3.49375, "loss/jsd": 0.0, "loss/logits": 0.19450215119868516, "step": 1730 }, { "epoch": 0.058, "grad_norm": 27.625, "grad_norm_var": 3.3577473958333335, "learning_rate": 0.0001, "loss": 7.4657, "loss/crossentropy": 2.002457691729069, "loss/hidden": 3.440625, "loss/jsd": 0.0, "loss/logits": 0.21016842536628247, "step": 1740 }, { "epoch": 0.058333333333333334, "grad_norm": 28.125, "grad_norm_var": 5.837239583333333, "learning_rate": 0.0001, "loss": 7.4309, "loss/crossentropy": 2.003567434847355, "loss/hidden": 3.488671875, "loss/jsd": 0.0, "loss/logits": 0.191750480979681, "step": 1750 }, { "epoch": 0.058666666666666666, "grad_norm": 31.0, "grad_norm_var": 6.869205729166667, "learning_rate": 0.0001, "loss": 7.5323, "loss/crossentropy": 1.9891509354114532, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.19737645965069534, "step": 1760 }, { "epoch": 0.059, "grad_norm": 33.5, "grad_norm_var": 9.762239583333333, "learning_rate": 0.0001, "loss": 7.5347, "loss/crossentropy": 1.9060836732387543, "loss/hidden": 3.525390625, "loss/jsd": 0.0, "loss/logits": 0.19073443934321405, "step": 1770 }, { "epoch": 0.059333333333333335, "grad_norm": 53.25, "grad_norm_var": 817.7145833333333, "learning_rate": 0.0001, "loss": 7.4702, "loss/crossentropy": 1.9535066038370132, "loss/hidden": 3.259375, "loss/jsd": 0.0, "loss/logits": 0.18470509704202415, "step": 1780 }, { "epoch": 0.059666666666666666, "grad_norm": 28.75, "grad_norm_var": 824.4478515625, "learning_rate": 0.0001, "loss": 7.4281, "loss/crossentropy": 2.1046823799610137, "loss/hidden": 3.465625, "loss/jsd": 0.0, "loss/logits": 0.20299233607947825, "step": 1790 }, { "epoch": 0.06, "grad_norm": 28.25, "grad_norm_var": 8.218489583333334, "learning_rate": 0.0001, "loss": 7.4213, "loss/crossentropy": 2.0691867701709272, "loss/hidden": 3.36171875, "loss/jsd": 0.0, "loss/logits": 0.18451128397136926, "step": 1800 }, { "epoch": 0.060333333333333336, "grad_norm": 29.5, "grad_norm_var": 4.63515625, "learning_rate": 0.0001, "loss": 7.5827, "loss/crossentropy": 2.0910270109772684, "loss/hidden": 3.478125, "loss/jsd": 0.0, "loss/logits": 0.2041981004178524, "step": 1810 }, { "epoch": 0.06066666666666667, "grad_norm": 30.5, "grad_norm_var": 2.809309895833333, "learning_rate": 0.0001, "loss": 7.3922, "loss/crossentropy": 2.2195225834846495, "loss/hidden": 3.541796875, "loss/jsd": 0.0, "loss/logits": 0.20947412233799695, "step": 1820 }, { "epoch": 0.061, "grad_norm": 31.75, "grad_norm_var": 2.1025390625, "learning_rate": 0.0001, "loss": 7.4482, "loss/crossentropy": 1.883380150794983, "loss/hidden": 3.505859375, "loss/jsd": 0.0, "loss/logits": 0.19172603711485864, "step": 1830 }, { "epoch": 0.06133333333333333, "grad_norm": 30.5, "grad_norm_var": 2.983268229166667, "learning_rate": 0.0001, "loss": 7.574, "loss/crossentropy": 2.0509617626667023, "loss/hidden": 3.44921875, "loss/jsd": 0.0, "loss/logits": 0.19509849231690168, "step": 1840 }, { "epoch": 0.06166666666666667, "grad_norm": 30.5, "grad_norm_var": 2.6035807291666666, "learning_rate": 0.0001, "loss": 7.491, "loss/crossentropy": 1.8894333105534316, "loss/hidden": 3.55390625, "loss/jsd": 0.0, "loss/logits": 0.20632429774850608, "step": 1850 }, { "epoch": 0.062, "grad_norm": 31.5, "grad_norm_var": 2.0327473958333333, "learning_rate": 0.0001, "loss": 7.5918, "loss/crossentropy": 1.8879051633179187, "loss/hidden": 3.482421875, "loss/jsd": 0.0, "loss/logits": 0.19158746097236873, "step": 1860 }, { "epoch": 0.06233333333333333, "grad_norm": 28.75, "grad_norm_var": 1.1364583333333333, "learning_rate": 0.0001, "loss": 7.482, "loss/crossentropy": 2.0157490566372873, "loss/hidden": 3.435546875, "loss/jsd": 0.0, "loss/logits": 0.19888350684195757, "step": 1870 }, { "epoch": 0.06266666666666666, "grad_norm": 28.875, "grad_norm_var": 3.5567057291666666, "learning_rate": 0.0001, "loss": 7.5322, "loss/crossentropy": 1.945923997461796, "loss/hidden": 3.52109375, "loss/jsd": 0.0, "loss/logits": 0.19232689775526524, "step": 1880 }, { "epoch": 0.063, "grad_norm": 29.0, "grad_norm_var": 17.6541015625, "learning_rate": 0.0001, "loss": 7.6363, "loss/crossentropy": 1.9230536341667175, "loss/hidden": 3.57421875, "loss/jsd": 0.0, "loss/logits": 0.22092546205967664, "step": 1890 }, { "epoch": 0.06333333333333334, "grad_norm": 30.375, "grad_norm_var": 4.918489583333334, "learning_rate": 0.0001, "loss": 7.512, "loss/crossentropy": 1.948906946182251, "loss/hidden": 3.51640625, "loss/jsd": 0.0, "loss/logits": 0.21105497963726522, "step": 1900 }, { "epoch": 0.06366666666666666, "grad_norm": 28.75, "grad_norm_var": 4.620833333333334, "learning_rate": 0.0001, "loss": 7.7102, "loss/crossentropy": 1.9838243424892426, "loss/hidden": 3.5484375, "loss/jsd": 0.0, "loss/logits": 0.19815999493002892, "step": 1910 }, { "epoch": 0.064, "grad_norm": 28.625, "grad_norm_var": 3.915625, "learning_rate": 0.0001, "loss": 7.4002, "loss/crossentropy": 1.9684632889926434, "loss/hidden": 3.526953125, "loss/jsd": 0.0, "loss/logits": 0.19348899647593498, "step": 1920 }, { "epoch": 0.06433333333333334, "grad_norm": 30.0, "grad_norm_var": 2.445572916666667, "learning_rate": 0.0001, "loss": 7.4213, "loss/crossentropy": 1.9885004058480262, "loss/hidden": 3.471484375, "loss/jsd": 0.0, "loss/logits": 0.197100811265409, "step": 1930 }, { "epoch": 0.06466666666666666, "grad_norm": 28.75, "grad_norm_var": 8.6291015625, "learning_rate": 0.0001, "loss": 7.5265, "loss/crossentropy": 2.053832122683525, "loss/hidden": 3.48359375, "loss/jsd": 0.0, "loss/logits": 0.2043491993099451, "step": 1940 }, { "epoch": 0.065, "grad_norm": 30.375, "grad_norm_var": 8.452083333333333, "learning_rate": 0.0001, "loss": 7.5805, "loss/crossentropy": 2.155242443084717, "loss/hidden": 3.528125, "loss/jsd": 0.0, "loss/logits": 0.20595550388097764, "step": 1950 }, { "epoch": 0.06533333333333333, "grad_norm": 30.125, "grad_norm_var": 2.8052083333333333, "learning_rate": 0.0001, "loss": 7.4906, "loss/crossentropy": 2.013193578273058, "loss/hidden": 3.5484375, "loss/jsd": 0.0, "loss/logits": 0.20787378288805486, "step": 1960 }, { "epoch": 0.06566666666666666, "grad_norm": 29.375, "grad_norm_var": 1.5103515625, "learning_rate": 0.0001, "loss": 7.3576, "loss/crossentropy": 2.026726779341698, "loss/hidden": 3.49453125, "loss/jsd": 0.0, "loss/logits": 0.19324612077325581, "step": 1970 }, { "epoch": 0.066, "grad_norm": 32.75, "grad_norm_var": 5.118489583333333, "learning_rate": 0.0001, "loss": 7.5116, "loss/crossentropy": 1.8675480149686337, "loss/hidden": 3.545703125, "loss/jsd": 0.0, "loss/logits": 0.1889248845167458, "step": 1980 }, { "epoch": 0.06633333333333333, "grad_norm": 30.875, "grad_norm_var": 2.917708333333333, "learning_rate": 0.0001, "loss": 7.3717, "loss/crossentropy": 1.7394216984510422, "loss/hidden": 3.452734375, "loss/jsd": 0.0, "loss/logits": 0.18252216521650552, "step": 1990 }, { "epoch": 0.06666666666666667, "grad_norm": 30.75, "grad_norm_var": 3.0083333333333333, "learning_rate": 0.0001, "loss": 7.3815, "loss/crossentropy": 1.95072568282485, "loss/hidden": 3.511328125, "loss/jsd": 0.0, "loss/logits": 0.21236553341150283, "step": 2000 }, { "epoch": 0.067, "grad_norm": 28.375, "grad_norm_var": 3.2223307291666665, "learning_rate": 0.0001, "loss": 7.592, "loss/crossentropy": 2.0375755399465563, "loss/hidden": 3.487109375, "loss/jsd": 0.0, "loss/logits": 0.20556780323386192, "step": 2010 }, { "epoch": 0.06733333333333333, "grad_norm": 29.625, "grad_norm_var": 1.8113932291666666, "learning_rate": 0.0001, "loss": 7.4696, "loss/crossentropy": 1.8140352122485637, "loss/hidden": 3.45625, "loss/jsd": 0.0, "loss/logits": 0.18291605981066822, "step": 2020 }, { "epoch": 0.06766666666666667, "grad_norm": 27.625, "grad_norm_var": 5.895768229166666, "learning_rate": 0.0001, "loss": 7.5348, "loss/crossentropy": 2.0320239901542663, "loss/hidden": 3.612890625, "loss/jsd": 0.0, "loss/logits": 0.21361415404826403, "step": 2030 }, { "epoch": 0.068, "grad_norm": 28.375, "grad_norm_var": 2.146875, "learning_rate": 0.0001, "loss": 7.5543, "loss/crossentropy": 2.0248512759804727, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.19521952457726002, "step": 2040 }, { "epoch": 0.06833333333333333, "grad_norm": 30.375, "grad_norm_var": 2.7177083333333334, "learning_rate": 0.0001, "loss": 7.4225, "loss/crossentropy": 1.965277999639511, "loss/hidden": 3.501953125, "loss/jsd": 0.0, "loss/logits": 0.20597195848822594, "step": 2050 }, { "epoch": 0.06866666666666667, "grad_norm": 29.0, "grad_norm_var": 2.30625, "learning_rate": 0.0001, "loss": 7.4416, "loss/crossentropy": 1.9709086209535598, "loss/hidden": 3.5109375, "loss/jsd": 0.0, "loss/logits": 0.2123000852763653, "step": 2060 }, { "epoch": 0.069, "grad_norm": 29.125, "grad_norm_var": 1.6749348958333334, "learning_rate": 0.0001, "loss": 7.5543, "loss/crossentropy": 1.8571318089962006, "loss/hidden": 3.4890625, "loss/jsd": 0.0, "loss/logits": 0.18433686401695012, "step": 2070 }, { "epoch": 0.06933333333333333, "grad_norm": 50.5, "grad_norm_var": 29.994205729166666, "learning_rate": 0.0001, "loss": 7.5624, "loss/crossentropy": 1.8412291273474692, "loss/hidden": 3.52578125, "loss/jsd": 0.0, "loss/logits": 0.1968109119683504, "step": 2080 }, { "epoch": 0.06966666666666667, "grad_norm": 28.25, "grad_norm_var": 30.145833333333332, "learning_rate": 0.0001, "loss": 7.5685, "loss/crossentropy": 1.9417820394039154, "loss/hidden": 3.557421875, "loss/jsd": 0.0, "loss/logits": 0.2025158878415823, "step": 2090 }, { "epoch": 0.07, "grad_norm": 29.75, "grad_norm_var": 26.91875, "learning_rate": 0.0001, "loss": 7.5357, "loss/crossentropy": 1.9219476103782653, "loss/hidden": 3.52734375, "loss/jsd": 0.0, "loss/logits": 0.20465447269380094, "step": 2100 }, { "epoch": 0.07033333333333333, "grad_norm": 29.625, "grad_norm_var": 1.2348307291666667, "learning_rate": 0.0001, "loss": 7.4517, "loss/crossentropy": 2.0867193929851053, "loss/hidden": 3.462890625, "loss/jsd": 0.0, "loss/logits": 0.2029378518462181, "step": 2110 }, { "epoch": 0.07066666666666667, "grad_norm": 31.25, "grad_norm_var": 2.0041015625, "learning_rate": 0.0001, "loss": 7.5168, "loss/crossentropy": 1.946194139868021, "loss/hidden": 3.52265625, "loss/jsd": 0.0, "loss/logits": 0.19791226089000702, "step": 2120 }, { "epoch": 0.071, "grad_norm": 29.25, "grad_norm_var": 1.9291666666666667, "learning_rate": 0.0001, "loss": 7.5498, "loss/crossentropy": 2.1164715617895125, "loss/hidden": 3.549609375, "loss/jsd": 0.0, "loss/logits": 0.2019685409963131, "step": 2130 }, { "epoch": 0.07133333333333333, "grad_norm": 29.25, "grad_norm_var": 3.914322916666667, "learning_rate": 0.0001, "loss": 7.5419, "loss/crossentropy": 1.9712113544344902, "loss/hidden": 3.4640625, "loss/jsd": 0.0, "loss/logits": 0.19878581538796425, "step": 2140 }, { "epoch": 0.07166666666666667, "grad_norm": 31.125, "grad_norm_var": 4.383333333333334, "learning_rate": 0.0001, "loss": 7.4941, "loss/crossentropy": 2.029564140737057, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.18580430280417204, "step": 2150 }, { "epoch": 0.072, "grad_norm": 33.75, "grad_norm_var": 3.0952473958333333, "learning_rate": 0.0001, "loss": 7.5539, "loss/crossentropy": 2.108410653471947, "loss/hidden": 3.550390625, "loss/jsd": 0.0, "loss/logits": 0.19773362781852483, "step": 2160 }, { "epoch": 0.07233333333333333, "grad_norm": 29.125, "grad_norm_var": 3.7225065370769597e+18, "learning_rate": 0.0001, "loss": 7.5807, "loss/crossentropy": 2.200056713819504, "loss/hidden": 3.42578125, "loss/jsd": 0.0, "loss/logits": 0.21192360743880273, "step": 2170 }, { "epoch": 0.07266666666666667, "grad_norm": 29.125, "grad_norm_var": 2.825, "learning_rate": 0.0001, "loss": 7.5171, "loss/crossentropy": 1.9891901642084122, "loss/hidden": 3.4890625, "loss/jsd": 0.0, "loss/logits": 0.20712667908519505, "step": 2180 }, { "epoch": 0.073, "grad_norm": 29.25, "grad_norm_var": 3.9176432291666665, "learning_rate": 0.0001, "loss": 7.4697, "loss/crossentropy": 2.06245077252388, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.19077986814081668, "step": 2190 }, { "epoch": 0.07333333333333333, "grad_norm": 31.375, "grad_norm_var": 2.7848307291666665, "learning_rate": 0.0001, "loss": 7.3934, "loss/crossentropy": 2.0596141688525678, "loss/hidden": 3.433984375, "loss/jsd": 0.0, "loss/logits": 0.19412127723917366, "step": 2200 }, { "epoch": 0.07366666666666667, "grad_norm": 29.125, "grad_norm_var": 1.5712890625, "learning_rate": 0.0001, "loss": 7.6225, "loss/crossentropy": 1.9819763131439685, "loss/hidden": 3.535546875, "loss/jsd": 0.0, "loss/logits": 0.21018738839775325, "step": 2210 }, { "epoch": 0.074, "grad_norm": 31.375, "grad_norm_var": 1.33125, "learning_rate": 0.0001, "loss": 7.5582, "loss/crossentropy": 2.1160483695566654, "loss/hidden": 3.45546875, "loss/jsd": 0.0, "loss/logits": 0.19892175681889057, "step": 2220 }, { "epoch": 0.07433333333333333, "grad_norm": 29.875, "grad_norm_var": 2.655143229166667, "learning_rate": 0.0001, "loss": 7.4979, "loss/crossentropy": 2.024607817828655, "loss/hidden": 3.529296875, "loss/jsd": 0.0, "loss/logits": 0.20980050768703223, "step": 2230 }, { "epoch": 0.07466666666666667, "grad_norm": 28.25, "grad_norm_var": 29.4181640625, "learning_rate": 0.0001, "loss": 7.5534, "loss/crossentropy": 2.139413857460022, "loss/hidden": 3.5875, "loss/jsd": 0.0, "loss/logits": 0.22047825269401072, "step": 2240 }, { "epoch": 0.075, "grad_norm": 31.375, "grad_norm_var": 26.749739583333334, "learning_rate": 0.0001, "loss": 7.5299, "loss/crossentropy": 1.8568807385861874, "loss/hidden": 3.67109375, "loss/jsd": 0.0, "loss/logits": 0.22608924899250268, "step": 2250 }, { "epoch": 0.07533333333333334, "grad_norm": 29.75, "grad_norm_var": 2.7306640625, "learning_rate": 0.0001, "loss": 7.3968, "loss/crossentropy": 1.928690955042839, "loss/hidden": 3.4984375, "loss/jsd": 0.0, "loss/logits": 0.19692064113914967, "step": 2260 }, { "epoch": 0.07566666666666666, "grad_norm": 32.0, "grad_norm_var": 5.160872395833334, "learning_rate": 0.0001, "loss": 7.3918, "loss/crossentropy": 1.9859561018645764, "loss/hidden": 3.458984375, "loss/jsd": 0.0, "loss/logits": 0.19398031858727335, "step": 2270 }, { "epoch": 0.076, "grad_norm": 29.5, "grad_norm_var": 3.1626528095259156e+18, "learning_rate": 0.0001, "loss": 7.5856, "loss/crossentropy": 1.9928829759359359, "loss/hidden": 3.58671875, "loss/jsd": 0.0, "loss/logits": 0.20894276313483715, "step": 2280 }, { "epoch": 0.07633333333333334, "grad_norm": 29.25, "grad_norm_var": 0.9072916666666667, "learning_rate": 0.0001, "loss": 7.4803, "loss/crossentropy": 1.9579173117876052, "loss/hidden": 3.435546875, "loss/jsd": 0.0, "loss/logits": 0.19308385336771608, "step": 2290 }, { "epoch": 0.07666666666666666, "grad_norm": 33.5, "grad_norm_var": 5.972916666666666, "learning_rate": 0.0001, "loss": 7.6019, "loss/crossentropy": 2.1964300483465196, "loss/hidden": 3.565625, "loss/jsd": 0.0, "loss/logits": 0.22039370723068713, "step": 2300 }, { "epoch": 0.077, "grad_norm": 31.625, "grad_norm_var": 5.362434895833333, "learning_rate": 0.0001, "loss": 7.5746, "loss/crossentropy": 2.0767961286008356, "loss/hidden": 3.48203125, "loss/jsd": 0.0, "loss/logits": 0.21204420514404773, "step": 2310 }, { "epoch": 0.07733333333333334, "grad_norm": 29.625, "grad_norm_var": 4.051497395833334, "learning_rate": 0.0001, "loss": 7.5052, "loss/crossentropy": 1.9535674713551998, "loss/hidden": 3.584765625, "loss/jsd": 0.0, "loss/logits": 0.19966216515749693, "step": 2320 }, { "epoch": 0.07766666666666666, "grad_norm": 29.5, "grad_norm_var": 1.6087890625, "learning_rate": 0.0001, "loss": 7.4516, "loss/crossentropy": 2.075386855006218, "loss/hidden": 3.45859375, "loss/jsd": 0.0, "loss/logits": 0.19604461081326008, "step": 2330 }, { "epoch": 0.078, "grad_norm": 30.125, "grad_norm_var": 1.8488932291666667, "learning_rate": 0.0001, "loss": 7.434, "loss/crossentropy": 2.1180974088609217, "loss/hidden": 3.51171875, "loss/jsd": 0.0, "loss/logits": 0.21024074107408525, "step": 2340 }, { "epoch": 0.07833333333333334, "grad_norm": 28.25, "grad_norm_var": 1.73515625, "learning_rate": 0.0001, "loss": 7.2779, "loss/crossentropy": 1.7643262982368468, "loss/hidden": 3.41796875, "loss/jsd": 0.0, "loss/logits": 0.17727267583832146, "step": 2350 }, { "epoch": 0.07866666666666666, "grad_norm": 32.5, "grad_norm_var": 3.0254557291666666, "learning_rate": 0.0001, "loss": 7.6358, "loss/crossentropy": 2.13492615967989, "loss/hidden": 3.455078125, "loss/jsd": 0.0, "loss/logits": 0.20045919958502054, "step": 2360 }, { "epoch": 0.079, "grad_norm": 29.875, "grad_norm_var": 6.226822916666666, "learning_rate": 0.0001, "loss": 7.5431, "loss/crossentropy": 2.001661640405655, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.19230220075696708, "step": 2370 }, { "epoch": 0.07933333333333334, "grad_norm": 33.0, "grad_norm_var": 2.353125, "learning_rate": 0.0001, "loss": 7.4993, "loss/crossentropy": 1.9042383052408696, "loss/hidden": 3.55390625, "loss/jsd": 0.0, "loss/logits": 0.2107909569516778, "step": 2380 }, { "epoch": 0.07966666666666666, "grad_norm": 29.375, "grad_norm_var": 2.780143229166667, "learning_rate": 0.0001, "loss": 7.5453, "loss/crossentropy": 1.8663247779011727, "loss/hidden": 3.607421875, "loss/jsd": 0.0, "loss/logits": 0.1925799923017621, "step": 2390 }, { "epoch": 0.08, "grad_norm": 28.625, "grad_norm_var": 1.0434895833333333, "learning_rate": 0.0001, "loss": 7.5259, "loss/crossentropy": 1.8550945937633514, "loss/hidden": 3.572265625, "loss/jsd": 0.0, "loss/logits": 0.21014908161014317, "step": 2400 }, { "epoch": 0.08033333333333334, "grad_norm": 29.0, "grad_norm_var": 2.1556640625, "learning_rate": 0.0001, "loss": 7.4168, "loss/crossentropy": 2.02880796790123, "loss/hidden": 3.50234375, "loss/jsd": 0.0, "loss/logits": 0.20425355471670628, "step": 2410 }, { "epoch": 0.08066666666666666, "grad_norm": 34.5, "grad_norm_var": 6.866080729166667, "learning_rate": 0.0001, "loss": 7.4415, "loss/crossentropy": 1.9024710722267628, "loss/hidden": 3.515234375, "loss/jsd": 0.0, "loss/logits": 0.1873930080793798, "step": 2420 }, { "epoch": 0.081, "grad_norm": 29.25, "grad_norm_var": 4.495768229166667, "learning_rate": 0.0001, "loss": 7.4689, "loss/crossentropy": 1.9507659994065762, "loss/hidden": 3.588671875, "loss/jsd": 0.0, "loss/logits": 0.1926257025450468, "step": 2430 }, { "epoch": 0.08133333333333333, "grad_norm": 29.0, "grad_norm_var": 2.129622395833333, "learning_rate": 0.0001, "loss": 7.5494, "loss/crossentropy": 1.894545779377222, "loss/hidden": 3.56875, "loss/jsd": 0.0, "loss/logits": 0.1897579187527299, "step": 2440 }, { "epoch": 0.08166666666666667, "grad_norm": 28.25, "grad_norm_var": 3.222606979622747e+18, "learning_rate": 0.0001, "loss": 7.5604, "loss/crossentropy": 2.0058958128094675, "loss/hidden": 3.610546875, "loss/jsd": 0.0, "loss/logits": 0.19538369607180356, "step": 2450 }, { "epoch": 0.082, "grad_norm": 28.625, "grad_norm_var": 6.909309895833333, "learning_rate": 0.0001, "loss": 7.3757, "loss/crossentropy": 1.9315983682870865, "loss/hidden": 3.5171875, "loss/jsd": 0.0, "loss/logits": 0.1938073981553316, "step": 2460 }, { "epoch": 0.08233333333333333, "grad_norm": 28.875, "grad_norm_var": 2.927018229166667, "learning_rate": 0.0001, "loss": 7.4538, "loss/crossentropy": 1.950503447651863, "loss/hidden": 3.491015625, "loss/jsd": 0.0, "loss/logits": 0.19425667226314544, "step": 2470 }, { "epoch": 0.08266666666666667, "grad_norm": 30.375, "grad_norm_var": 4.36015625, "learning_rate": 0.0001, "loss": 7.5083, "loss/crossentropy": 1.960983681678772, "loss/hidden": 3.43359375, "loss/jsd": 0.0, "loss/logits": 0.1997334610670805, "step": 2480 }, { "epoch": 0.083, "grad_norm": 28.5, "grad_norm_var": 6.917643229166667, "learning_rate": 0.0001, "loss": 7.4835, "loss/crossentropy": 1.9694616302847863, "loss/hidden": 3.5359375, "loss/jsd": 0.0, "loss/logits": 0.2186092620715499, "step": 2490 }, { "epoch": 0.08333333333333333, "grad_norm": 30.5, "grad_norm_var": 3.0874348958333333, "learning_rate": 0.0001, "loss": 7.4791, "loss/crossentropy": 1.9896290645003318, "loss/hidden": 3.501953125, "loss/jsd": 0.0, "loss/logits": 0.19468545503914356, "step": 2500 }, { "epoch": 0.08366666666666667, "grad_norm": 29.25, "grad_norm_var": 2.1550427683902804e+18, "learning_rate": 0.0001, "loss": 7.4959, "loss/crossentropy": 1.9156237840652466, "loss/hidden": 3.777734375, "loss/jsd": 0.0, "loss/logits": 0.20535588413476943, "step": 2510 }, { "epoch": 0.084, "grad_norm": 27.125, "grad_norm_var": 2.155042768420864e+18, "learning_rate": 0.0001, "loss": 7.366, "loss/crossentropy": 1.8602532722055911, "loss/hidden": 3.521875, "loss/jsd": 0.0, "loss/logits": 0.19043652582913637, "step": 2520 }, { "epoch": 0.08433333333333333, "grad_norm": 31.125, "grad_norm_var": 2.3958333333333335, "learning_rate": 0.0001, "loss": 7.5602, "loss/crossentropy": 2.101513335108757, "loss/hidden": 3.520703125, "loss/jsd": 0.0, "loss/logits": 0.2020590901374817, "step": 2530 }, { "epoch": 0.08466666666666667, "grad_norm": 28.5, "grad_norm_var": 1.9997395833333333, "learning_rate": 0.0001, "loss": 7.519, "loss/crossentropy": 1.9717224732041359, "loss/hidden": 3.658203125, "loss/jsd": 0.0, "loss/logits": 0.2198735598474741, "step": 2540 }, { "epoch": 0.085, "grad_norm": 30.5, "grad_norm_var": 1.2739583333333333, "learning_rate": 0.0001, "loss": 7.4124, "loss/crossentropy": 1.9950795613229275, "loss/hidden": 3.441015625, "loss/jsd": 0.0, "loss/logits": 0.1929251885972917, "step": 2550 }, { "epoch": 0.08533333333333333, "grad_norm": 27.625, "grad_norm_var": 19.298958333333335, "learning_rate": 0.0001, "loss": 7.533, "loss/crossentropy": 1.9461016044020654, "loss/hidden": 3.495703125, "loss/jsd": 0.0, "loss/logits": 0.21008020546287298, "step": 2560 }, { "epoch": 0.08566666666666667, "grad_norm": 27.625, "grad_norm_var": 24.687239583333334, "learning_rate": 0.0001, "loss": 7.4751, "loss/crossentropy": 2.0097773715853693, "loss/hidden": 3.42265625, "loss/jsd": 0.0, "loss/logits": 0.1930695092305541, "step": 2570 }, { "epoch": 0.086, "grad_norm": 29.25, "grad_norm_var": 2.4145182291666667, "learning_rate": 0.0001, "loss": 7.5301, "loss/crossentropy": 2.0881737142801287, "loss/hidden": 3.5140625, "loss/jsd": 0.0, "loss/logits": 0.2016575714573264, "step": 2580 }, { "epoch": 0.08633333333333333, "grad_norm": 27.375, "grad_norm_var": 1.1768229166666666, "learning_rate": 0.0001, "loss": 7.4214, "loss/crossentropy": 1.9433169059455395, "loss/hidden": 3.441796875, "loss/jsd": 0.0, "loss/logits": 0.18515117829665542, "step": 2590 }, { "epoch": 0.08666666666666667, "grad_norm": 27.125, "grad_norm_var": 21.9025390625, "learning_rate": 0.0001, "loss": 7.4042, "loss/crossentropy": 2.065168938785791, "loss/hidden": 3.596484375, "loss/jsd": 0.0, "loss/logits": 0.21064424477517604, "step": 2600 }, { "epoch": 0.087, "grad_norm": 28.5, "grad_norm_var": 22.8962890625, "learning_rate": 0.0001, "loss": 7.3812, "loss/crossentropy": 1.9288912743330002, "loss/hidden": 3.508203125, "loss/jsd": 0.0, "loss/logits": 0.19405354280024767, "step": 2610 }, { "epoch": 0.08733333333333333, "grad_norm": 28.25, "grad_norm_var": 28.0869140625, "learning_rate": 0.0001, "loss": 7.3727, "loss/crossentropy": 1.9190235704183578, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.18091363273561, "step": 2620 }, { "epoch": 0.08766666666666667, "grad_norm": 26.375, "grad_norm_var": 5.84140625, "learning_rate": 0.0001, "loss": 7.5212, "loss/crossentropy": 1.9575423367321492, "loss/hidden": 3.48046875, "loss/jsd": 0.0, "loss/logits": 0.19920362159609795, "step": 2630 }, { "epoch": 0.088, "grad_norm": 28.5, "grad_norm_var": 2.8291666666666666, "learning_rate": 0.0001, "loss": 7.4275, "loss/crossentropy": 1.9045813247561454, "loss/hidden": 3.49375, "loss/jsd": 0.0, "loss/logits": 0.1949614692479372, "step": 2640 }, { "epoch": 0.08833333333333333, "grad_norm": 28.75, "grad_norm_var": 2.24765625, "learning_rate": 0.0001, "loss": 7.3944, "loss/crossentropy": 2.020074427127838, "loss/hidden": 3.428515625, "loss/jsd": 0.0, "loss/logits": 0.18035217132419348, "step": 2650 }, { "epoch": 0.08866666666666667, "grad_norm": 30.0, "grad_norm_var": 2.6337890625, "learning_rate": 0.0001, "loss": 7.4416, "loss/crossentropy": 2.0263232976198196, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.19060131050646306, "step": 2660 }, { "epoch": 0.089, "grad_norm": 31.125, "grad_norm_var": 1.6306640625, "learning_rate": 0.0001, "loss": 7.4651, "loss/crossentropy": 1.9355658136308194, "loss/hidden": 3.4515625, "loss/jsd": 0.0, "loss/logits": 0.20016200989484786, "step": 2670 }, { "epoch": 0.08933333333333333, "grad_norm": 27.5, "grad_norm_var": 1.553125, "learning_rate": 0.0001, "loss": 7.4609, "loss/crossentropy": 1.9335235960781574, "loss/hidden": 3.46796875, "loss/jsd": 0.0, "loss/logits": 0.19086429215967654, "step": 2680 }, { "epoch": 0.08966666666666667, "grad_norm": 27.5, "grad_norm_var": 2.049934895833333, "learning_rate": 0.0001, "loss": 7.3724, "loss/crossentropy": 2.019812647998333, "loss/hidden": 3.486328125, "loss/jsd": 0.0, "loss/logits": 0.19540303545072674, "step": 2690 }, { "epoch": 0.09, "grad_norm": 36.75, "grad_norm_var": 7.087955729166667, "learning_rate": 0.0001, "loss": 7.5417, "loss/crossentropy": 1.9292104691267014, "loss/hidden": 3.437890625, "loss/jsd": 0.0, "loss/logits": 0.1910783352330327, "step": 2700 }, { "epoch": 0.09033333333333333, "grad_norm": 27.0, "grad_norm_var": 6.6291015625, "learning_rate": 0.0001, "loss": 7.4822, "loss/crossentropy": 2.077114895731211, "loss/hidden": 3.501171875, "loss/jsd": 0.0, "loss/logits": 0.20367261301726103, "step": 2710 }, { "epoch": 0.09066666666666667, "grad_norm": 32.5, "grad_norm_var": 2.3650390625, "learning_rate": 0.0001, "loss": 7.4912, "loss/crossentropy": 2.010223163664341, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.18317762911319732, "step": 2720 }, { "epoch": 0.091, "grad_norm": 28.75, "grad_norm_var": 2.6832682291666665, "learning_rate": 0.0001, "loss": 7.4297, "loss/crossentropy": 1.8867005869746207, "loss/hidden": 3.423828125, "loss/jsd": 0.0, "loss/logits": 0.18299706783145667, "step": 2730 }, { "epoch": 0.09133333333333334, "grad_norm": 29.625, "grad_norm_var": 2.3926432291666666, "learning_rate": 0.0001, "loss": 7.4149, "loss/crossentropy": 1.9446359343826771, "loss/hidden": 3.497265625, "loss/jsd": 0.0, "loss/logits": 0.1956596726551652, "step": 2740 }, { "epoch": 0.09166666666666666, "grad_norm": 31.25, "grad_norm_var": 7.225455729166667, "learning_rate": 0.0001, "loss": 7.4118, "loss/crossentropy": 1.9953445553779603, "loss/hidden": 3.516796875, "loss/jsd": 0.0, "loss/logits": 0.2088780239224434, "step": 2750 }, { "epoch": 0.092, "grad_norm": 29.5, "grad_norm_var": 5.553059895833333, "learning_rate": 0.0001, "loss": 7.5014, "loss/crossentropy": 2.0024605989456177, "loss/hidden": 3.5828125, "loss/jsd": 0.0, "loss/logits": 0.20880002910271286, "step": 2760 }, { "epoch": 0.09233333333333334, "grad_norm": 28.625, "grad_norm_var": 11.9275390625, "learning_rate": 0.0001, "loss": 7.4792, "loss/crossentropy": 1.9933758333325386, "loss/hidden": 3.4703125, "loss/jsd": 0.0, "loss/logits": 0.19116427507251502, "step": 2770 }, { "epoch": 0.09266666666666666, "grad_norm": 32.25, "grad_norm_var": 14.883268229166667, "learning_rate": 0.0001, "loss": 7.4646, "loss/crossentropy": 1.9825053632259368, "loss/hidden": 3.4796875, "loss/jsd": 0.0, "loss/logits": 0.1976021321490407, "step": 2780 }, { "epoch": 0.093, "grad_norm": 29.125, "grad_norm_var": 1.3792273674783076e+18, "learning_rate": 0.0001, "loss": 7.5405, "loss/crossentropy": 1.9630035154521466, "loss/hidden": 3.52578125, "loss/jsd": 0.0, "loss/logits": 0.1928799333050847, "step": 2790 }, { "epoch": 0.09333333333333334, "grad_norm": 29.875, "grad_norm_var": 1.379227367429374e+18, "learning_rate": 0.0001, "loss": 7.5114, "loss/crossentropy": 1.9186689764261247, "loss/hidden": 3.533203125, "loss/jsd": 0.0, "loss/logits": 0.18643324170261621, "step": 2800 }, { "epoch": 0.09366666666666666, "grad_norm": 28.625, "grad_norm_var": 2.5176432291666666, "learning_rate": 0.0001, "loss": 7.4271, "loss/crossentropy": 1.8392992101609706, "loss/hidden": 3.480859375, "loss/jsd": 0.0, "loss/logits": 0.18913925625383854, "step": 2810 }, { "epoch": 0.094, "grad_norm": 29.5, "grad_norm_var": 2.591666666666667, "learning_rate": 0.0001, "loss": 7.4786, "loss/crossentropy": 1.8614527031779289, "loss/hidden": 3.521484375, "loss/jsd": 0.0, "loss/logits": 0.21286203451454638, "step": 2820 }, { "epoch": 0.09433333333333334, "grad_norm": 30.25, "grad_norm_var": 3.7728515625, "learning_rate": 0.0001, "loss": 7.5528, "loss/crossentropy": 1.9886460095643996, "loss/hidden": 3.514453125, "loss/jsd": 0.0, "loss/logits": 0.1992162003181875, "step": 2830 }, { "epoch": 0.09466666666666666, "grad_norm": 30.375, "grad_norm_var": 2.366080729166667, "learning_rate": 0.0001, "loss": 7.4912, "loss/crossentropy": 1.8799399740993976, "loss/hidden": 3.419140625, "loss/jsd": 0.0, "loss/logits": 0.19175071343779565, "step": 2840 }, { "epoch": 0.095, "grad_norm": 29.125, "grad_norm_var": 0.8955729166666667, "learning_rate": 0.0001, "loss": 7.3382, "loss/crossentropy": 1.9913848645985126, "loss/hidden": 3.46171875, "loss/jsd": 0.0, "loss/logits": 0.1934125845320523, "step": 2850 }, { "epoch": 0.09533333333333334, "grad_norm": 32.5, "grad_norm_var": 2.2223307291666665, "learning_rate": 0.0001, "loss": 7.4471, "loss/crossentropy": 1.955909602344036, "loss/hidden": 3.544140625, "loss/jsd": 0.0, "loss/logits": 0.1989471236243844, "step": 2860 }, { "epoch": 0.09566666666666666, "grad_norm": 29.625, "grad_norm_var": 4.77890625, "learning_rate": 0.0001, "loss": 7.4136, "loss/crossentropy": 1.9344482243061065, "loss/hidden": 3.50390625, "loss/jsd": 0.0, "loss/logits": 0.19827343709766865, "step": 2870 }, { "epoch": 0.096, "grad_norm": 30.375, "grad_norm_var": 1.7364583333333334, "learning_rate": 0.0001, "loss": 7.4133, "loss/crossentropy": 2.044962373375893, "loss/hidden": 3.5453125, "loss/jsd": 0.0, "loss/logits": 0.20929175913333892, "step": 2880 }, { "epoch": 0.09633333333333334, "grad_norm": 30.375, "grad_norm_var": 1.8535807291666666, "learning_rate": 0.0001, "loss": 7.503, "loss/crossentropy": 1.999411664903164, "loss/hidden": 3.472265625, "loss/jsd": 0.0, "loss/logits": 0.19163514766842127, "step": 2890 }, { "epoch": 0.09666666666666666, "grad_norm": 28.5, "grad_norm_var": 1.625, "learning_rate": 0.0001, "loss": 7.4411, "loss/crossentropy": 2.1730361245572567, "loss/hidden": 3.415625, "loss/jsd": 0.0, "loss/logits": 0.2091605866327882, "step": 2900 }, { "epoch": 0.097, "grad_norm": 29.125, "grad_norm_var": 2.363541666666667, "learning_rate": 0.0001, "loss": 7.5065, "loss/crossentropy": 1.9977510288357734, "loss/hidden": 3.565625, "loss/jsd": 0.0, "loss/logits": 0.20206534005701543, "step": 2910 }, { "epoch": 0.09733333333333333, "grad_norm": 30.75, "grad_norm_var": 2.434309895833333, "learning_rate": 0.0001, "loss": 7.5452, "loss/crossentropy": 1.9643309473991395, "loss/hidden": 3.4875, "loss/jsd": 0.0, "loss/logits": 0.19875732865184545, "step": 2920 }, { "epoch": 0.09766666666666667, "grad_norm": 34.25, "grad_norm_var": 4.233268229166667, "learning_rate": 0.0001, "loss": 7.465, "loss/crossentropy": 1.9926663398742677, "loss/hidden": 3.475, "loss/jsd": 0.0, "loss/logits": 0.1921684741973877, "step": 2930 }, { "epoch": 0.098, "grad_norm": 28.75, "grad_norm_var": 4.276041666666667, "learning_rate": 0.0001, "loss": 7.55, "loss/crossentropy": 1.9251036688685417, "loss/hidden": 3.41015625, "loss/jsd": 0.0, "loss/logits": 0.18784591825678945, "step": 2940 }, { "epoch": 0.09833333333333333, "grad_norm": 28.125, "grad_norm_var": 2.2018229166666665, "learning_rate": 0.0001, "loss": 7.5089, "loss/crossentropy": 1.976527239382267, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.1919506398960948, "step": 2950 }, { "epoch": 0.09866666666666667, "grad_norm": 27.25, "grad_norm_var": 16.328059895833334, "learning_rate": 0.0001, "loss": 7.3554, "loss/crossentropy": 1.6932103797793387, "loss/hidden": 3.453515625, "loss/jsd": 0.0, "loss/logits": 0.16905276710167527, "step": 2960 }, { "epoch": 0.099, "grad_norm": 46.0, "grad_norm_var": 3.499367251026449e+18, "learning_rate": 0.0001, "loss": 7.5036, "loss/crossentropy": 1.9110350444912911, "loss/hidden": 3.503515625, "loss/jsd": 0.0, "loss/logits": 0.19965738523751497, "step": 2970 }, { "epoch": 0.09933333333333333, "grad_norm": 28.375, "grad_norm_var": 3.4993672500677356e+18, "learning_rate": 0.0001, "loss": 7.3093, "loss/crossentropy": 1.876964795216918, "loss/hidden": 3.438671875, "loss/jsd": 0.0, "loss/logits": 0.1779957885853946, "step": 2980 }, { "epoch": 0.09966666666666667, "grad_norm": 29.5, "grad_norm_var": 1.9677083333333334, "learning_rate": 0.0001, "loss": 7.4409, "loss/crossentropy": 2.0366775766015053, "loss/hidden": 3.5015625, "loss/jsd": 0.0, "loss/logits": 0.20072081238031386, "step": 2990 }, { "epoch": 0.1, "grad_norm": 30.375, "grad_norm_var": 4.94140625, "learning_rate": 0.0001, "loss": 7.4282, "loss/crossentropy": 1.9014973968267441, "loss/hidden": 3.565234375, "loss/jsd": 0.0, "loss/logits": 0.1985214444808662, "step": 3000 }, { "epoch": 0.10033333333333333, "grad_norm": 33.25, "grad_norm_var": 7.516666666666667, "learning_rate": 0.0001, "loss": 7.5137, "loss/crossentropy": 2.0006577894091606, "loss/hidden": 3.52734375, "loss/jsd": 0.0, "loss/logits": 0.19276103507727385, "step": 3010 }, { "epoch": 0.10066666666666667, "grad_norm": 28.25, "grad_norm_var": 3.405989583333333, "learning_rate": 0.0001, "loss": 7.4728, "loss/crossentropy": 2.014061541855335, "loss/hidden": 3.4875, "loss/jsd": 0.0, "loss/logits": 0.20625524381175636, "step": 3020 }, { "epoch": 0.101, "grad_norm": 28.25, "grad_norm_var": 1.7518229166666666, "learning_rate": 0.0001, "loss": 7.399, "loss/crossentropy": 1.9580177992582322, "loss/hidden": 3.454296875, "loss/jsd": 0.0, "loss/logits": 0.2090502878651023, "step": 3030 }, { "epoch": 0.10133333333333333, "grad_norm": 28.75, "grad_norm_var": 0.9192057291666667, "learning_rate": 0.0001, "loss": 7.4486, "loss/crossentropy": 1.8475046925246716, "loss/hidden": 3.57421875, "loss/jsd": 0.0, "loss/logits": 0.18773396629840136, "step": 3040 }, { "epoch": 0.10166666666666667, "grad_norm": 31.125, "grad_norm_var": 1.54140625, "learning_rate": 0.0001, "loss": 7.5471, "loss/crossentropy": 1.930780842527747, "loss/hidden": 3.41015625, "loss/jsd": 0.0, "loss/logits": 0.17681049406528473, "step": 3050 }, { "epoch": 0.102, "grad_norm": 31.125, "grad_norm_var": 1.8046223958333334, "learning_rate": 0.0001, "loss": 7.4486, "loss/crossentropy": 2.0486811682581902, "loss/hidden": 3.5265625, "loss/jsd": 0.0, "loss/logits": 0.19173399601131677, "step": 3060 }, { "epoch": 0.10233333333333333, "grad_norm": 31.375, "grad_norm_var": 2.2442057291666666, "learning_rate": 0.0001, "loss": 7.4665, "loss/crossentropy": 2.0373094201087953, "loss/hidden": 3.3953125, "loss/jsd": 0.0, "loss/logits": 0.18425635807216167, "step": 3070 }, { "epoch": 0.10266666666666667, "grad_norm": 31.25, "grad_norm_var": 3.753059895833333, "learning_rate": 0.0001, "loss": 7.4216, "loss/crossentropy": 1.9081890523433684, "loss/hidden": 3.458203125, "loss/jsd": 0.0, "loss/logits": 0.19185817260295152, "step": 3080 }, { "epoch": 0.103, "grad_norm": 30.375, "grad_norm_var": 3.3072265625, "learning_rate": 0.0001, "loss": 7.4026, "loss/crossentropy": 1.6877060309052467, "loss/hidden": 3.542578125, "loss/jsd": 0.0, "loss/logits": 0.1845168405212462, "step": 3090 }, { "epoch": 0.10333333333333333, "grad_norm": 26.875, "grad_norm_var": 3.599739583333333, "learning_rate": 0.0001, "loss": 7.4931, "loss/crossentropy": 2.122732860594988, "loss/hidden": 3.556640625, "loss/jsd": 0.0, "loss/logits": 0.20256736762821675, "step": 3100 }, { "epoch": 0.10366666666666667, "grad_norm": 30.375, "grad_norm_var": 2.778125, "learning_rate": 0.0001, "loss": 7.4364, "loss/crossentropy": 1.945944369584322, "loss/hidden": 3.60859375, "loss/jsd": 0.0, "loss/logits": 0.1929264385253191, "step": 3110 }, { "epoch": 0.104, "grad_norm": 29.125, "grad_norm_var": 2.52265625, "learning_rate": 0.0001, "loss": 7.4465, "loss/crossentropy": 2.189981409907341, "loss/hidden": 3.39609375, "loss/jsd": 0.0, "loss/logits": 0.19659098321571947, "step": 3120 }, { "epoch": 0.10433333333333333, "grad_norm": 28.5, "grad_norm_var": 2.410872395833333, "learning_rate": 0.0001, "loss": 7.4694, "loss/crossentropy": 1.9503503561019897, "loss/hidden": 3.46953125, "loss/jsd": 0.0, "loss/logits": 0.18633831990882754, "step": 3130 }, { "epoch": 0.10466666666666667, "grad_norm": 37.25, "grad_norm_var": 6.328580729166666, "learning_rate": 0.0001, "loss": 7.4823, "loss/crossentropy": 2.050146286934614, "loss/hidden": 3.530078125, "loss/jsd": 0.0, "loss/logits": 0.21134810987859964, "step": 3140 }, { "epoch": 0.105, "grad_norm": 28.75, "grad_norm_var": 5.5181640625, "learning_rate": 0.0001, "loss": 7.4594, "loss/crossentropy": 1.8215602859854698, "loss/hidden": 3.496484375, "loss/jsd": 0.0, "loss/logits": 0.19511280804872513, "step": 3150 }, { "epoch": 0.10533333333333333, "grad_norm": 32.0, "grad_norm_var": 5.401497395833333, "learning_rate": 0.0001, "loss": 7.5666, "loss/crossentropy": 1.944676174223423, "loss/hidden": 3.393359375, "loss/jsd": 0.0, "loss/logits": 0.18761635795235634, "step": 3160 }, { "epoch": 0.10566666666666667, "grad_norm": 33.75, "grad_norm_var": 13.608268229166667, "learning_rate": 0.0001, "loss": 7.4786, "loss/crossentropy": 1.7963935866951943, "loss/hidden": 3.510546875, "loss/jsd": 0.0, "loss/logits": 0.18216567151248456, "step": 3170 }, { "epoch": 0.106, "grad_norm": 28.0, "grad_norm_var": 14.433333333333334, "learning_rate": 0.0001, "loss": 7.352, "loss/crossentropy": 1.9989204011857509, "loss/hidden": 3.52578125, "loss/jsd": 0.0, "loss/logits": 0.19957533106207848, "step": 3180 }, { "epoch": 0.10633333333333334, "grad_norm": 27.875, "grad_norm_var": 12.511393229166666, "learning_rate": 0.0001, "loss": 7.4518, "loss/crossentropy": 2.0659667015075684, "loss/hidden": 3.57890625, "loss/jsd": 0.0, "loss/logits": 0.2092963818460703, "step": 3190 }, { "epoch": 0.10666666666666667, "grad_norm": 29.875, "grad_norm_var": 1.5212890625, "learning_rate": 0.0001, "loss": 7.4653, "loss/crossentropy": 2.0060829371213913, "loss/hidden": 3.54140625, "loss/jsd": 0.0, "loss/logits": 0.20476709287613631, "step": 3200 }, { "epoch": 0.107, "grad_norm": 30.625, "grad_norm_var": 0.8997395833333334, "learning_rate": 0.0001, "loss": 7.4395, "loss/crossentropy": 1.8213725358247757, "loss/hidden": 3.6984375, "loss/jsd": 0.0, "loss/logits": 0.20085674282163382, "step": 3210 }, { "epoch": 0.10733333333333334, "grad_norm": 29.125, "grad_norm_var": 2.6103515625, "learning_rate": 0.0001, "loss": 7.4265, "loss/crossentropy": 1.998078039288521, "loss/hidden": 3.50234375, "loss/jsd": 0.0, "loss/logits": 0.2023037345148623, "step": 3220 }, { "epoch": 0.10766666666666666, "grad_norm": 31.625, "grad_norm_var": 2.316666666666667, "learning_rate": 0.0001, "loss": 7.4808, "loss/crossentropy": 2.005017626285553, "loss/hidden": 3.653515625, "loss/jsd": 0.0, "loss/logits": 0.20823571495711804, "step": 3230 }, { "epoch": 0.108, "grad_norm": 30.0, "grad_norm_var": 11.2416015625, "learning_rate": 0.0001, "loss": 7.3296, "loss/crossentropy": 2.1028595566749573, "loss/hidden": 3.4578125, "loss/jsd": 0.0, "loss/logits": 0.19267723504453899, "step": 3240 }, { "epoch": 0.10833333333333334, "grad_norm": 27.875, "grad_norm_var": 2.089583333333333, "learning_rate": 0.0001, "loss": 7.4162, "loss/crossentropy": 2.038830915093422, "loss/hidden": 3.49296875, "loss/jsd": 0.0, "loss/logits": 0.1994033705443144, "step": 3250 }, { "epoch": 0.10866666666666666, "grad_norm": 29.25, "grad_norm_var": 1.3622395833333334, "learning_rate": 0.0001, "loss": 7.4696, "loss/crossentropy": 1.9886814475059509, "loss/hidden": 3.47734375, "loss/jsd": 0.0, "loss/logits": 0.19070003218948842, "step": 3260 }, { "epoch": 0.109, "grad_norm": 30.0, "grad_norm_var": 3.3999348958333333, "learning_rate": 0.0001, "loss": 7.5318, "loss/crossentropy": 1.9805981650948525, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.1879159839823842, "step": 3270 }, { "epoch": 0.10933333333333334, "grad_norm": 31.0, "grad_norm_var": 21.5478515625, "learning_rate": 0.0001, "loss": 7.4546, "loss/crossentropy": 1.9455841317772866, "loss/hidden": 3.52265625, "loss/jsd": 0.0, "loss/logits": 0.19233653256669642, "step": 3280 }, { "epoch": 0.10966666666666666, "grad_norm": 28.75, "grad_norm_var": 3.5518229166666666, "learning_rate": 0.0001, "loss": 7.3583, "loss/crossentropy": 2.0373078525066375, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.1944910105317831, "step": 3290 }, { "epoch": 0.11, "grad_norm": 32.25, "grad_norm_var": 3.436879804510608e+18, "learning_rate": 0.0001, "loss": 7.5337, "loss/crossentropy": 2.1838977314531802, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.21365603134036065, "step": 3300 }, { "epoch": 0.11033333333333334, "grad_norm": 28.875, "grad_norm_var": 3.4368798027648686e+18, "learning_rate": 0.0001, "loss": 7.4496, "loss/crossentropy": 1.8488673001527787, "loss/hidden": 3.462109375, "loss/jsd": 0.0, "loss/logits": 0.17365747494623066, "step": 3310 }, { "epoch": 0.11066666666666666, "grad_norm": 29.75, "grad_norm_var": 7.133333333333334, "learning_rate": 0.0001, "loss": 7.4407, "loss/crossentropy": 1.9391836307942867, "loss/hidden": 3.565234375, "loss/jsd": 0.0, "loss/logits": 0.19759995695203542, "step": 3320 }, { "epoch": 0.111, "grad_norm": 28.875, "grad_norm_var": 2.7134765625, "learning_rate": 0.0001, "loss": 7.4751, "loss/crossentropy": 2.071904855966568, "loss/hidden": 3.45, "loss/jsd": 0.0, "loss/logits": 0.1974649203941226, "step": 3330 }, { "epoch": 0.11133333333333334, "grad_norm": 28.375, "grad_norm_var": 4.101822916666666, "learning_rate": 0.0001, "loss": 7.4933, "loss/crossentropy": 1.945932475477457, "loss/hidden": 3.459765625, "loss/jsd": 0.0, "loss/logits": 0.1945926444604993, "step": 3340 }, { "epoch": 0.11166666666666666, "grad_norm": 28.375, "grad_norm_var": 3.693489583333333, "learning_rate": 0.0001, "loss": 7.4492, "loss/crossentropy": 1.9980404734611512, "loss/hidden": 3.498046875, "loss/jsd": 0.0, "loss/logits": 0.19402206502854824, "step": 3350 }, { "epoch": 0.112, "grad_norm": 27.75, "grad_norm_var": 2.5973307291666665, "learning_rate": 0.0001, "loss": 7.2742, "loss/crossentropy": 2.0758365228772164, "loss/hidden": 3.48203125, "loss/jsd": 0.0, "loss/logits": 0.19238017238676547, "step": 3360 }, { "epoch": 0.11233333333333333, "grad_norm": 27.25, "grad_norm_var": 2.4572265625, "learning_rate": 0.0001, "loss": 7.3725, "loss/crossentropy": 1.937923902273178, "loss/hidden": 3.502734375, "loss/jsd": 0.0, "loss/logits": 0.19878114890307189, "step": 3370 }, { "epoch": 0.11266666666666666, "grad_norm": 28.125, "grad_norm_var": 2.7400390625, "learning_rate": 0.0001, "loss": 7.4162, "loss/crossentropy": 1.9594831869006157, "loss/hidden": 3.425, "loss/jsd": 0.0, "loss/logits": 0.18990084500983356, "step": 3380 }, { "epoch": 0.113, "grad_norm": 31.125, "grad_norm_var": 8.0994140625, "learning_rate": 0.0001, "loss": 7.4095, "loss/crossentropy": 1.8118527330458165, "loss/hidden": 3.537109375, "loss/jsd": 0.0, "loss/logits": 0.19681523069739343, "step": 3390 }, { "epoch": 0.11333333333333333, "grad_norm": 30.125, "grad_norm_var": 6.30390625, "learning_rate": 0.0001, "loss": 7.4939, "loss/crossentropy": 1.9089997105300427, "loss/hidden": 3.558984375, "loss/jsd": 0.0, "loss/logits": 0.2114435363560915, "step": 3400 }, { "epoch": 0.11366666666666667, "grad_norm": 28.625, "grad_norm_var": 5.4744140625, "learning_rate": 0.0001, "loss": 7.4432, "loss/crossentropy": 2.0572441786527635, "loss/hidden": 3.46796875, "loss/jsd": 0.0, "loss/logits": 0.19825842864811422, "step": 3410 }, { "epoch": 0.114, "grad_norm": 29.5, "grad_norm_var": 2.6363932291666665, "learning_rate": 0.0001, "loss": 7.479, "loss/crossentropy": 1.9038099765777587, "loss/hidden": 3.501953125, "loss/jsd": 0.0, "loss/logits": 0.1826524894684553, "step": 3420 }, { "epoch": 0.11433333333333333, "grad_norm": 29.25, "grad_norm_var": 2.309375, "learning_rate": 0.0001, "loss": 7.5365, "loss/crossentropy": 2.017984043061733, "loss/hidden": 3.569921875, "loss/jsd": 0.0, "loss/logits": 0.2075114218518138, "step": 3430 }, { "epoch": 0.11466666666666667, "grad_norm": 31.5, "grad_norm_var": 6.413541666666666, "learning_rate": 0.0001, "loss": 7.5588, "loss/crossentropy": 1.9223331391811371, "loss/hidden": 3.567578125, "loss/jsd": 0.0, "loss/logits": 0.203357866499573, "step": 3440 }, { "epoch": 0.115, "grad_norm": 27.125, "grad_norm_var": 10.917708333333334, "learning_rate": 0.0001, "loss": 7.3951, "loss/crossentropy": 1.8734836861491204, "loss/hidden": 3.399609375, "loss/jsd": 0.0, "loss/logits": 0.1840468027628958, "step": 3450 }, { "epoch": 0.11533333333333333, "grad_norm": 29.0, "grad_norm_var": 7.1416015625, "learning_rate": 0.0001, "loss": 7.4645, "loss/crossentropy": 1.9262547835707664, "loss/hidden": 3.49453125, "loss/jsd": 0.0, "loss/logits": 0.20320513006299734, "step": 3460 }, { "epoch": 0.11566666666666667, "grad_norm": 31.875, "grad_norm_var": 4.643684895833333, "learning_rate": 0.0001, "loss": 7.4002, "loss/crossentropy": 2.042244290560484, "loss/hidden": 3.50546875, "loss/jsd": 0.0, "loss/logits": 0.20483402479439974, "step": 3470 }, { "epoch": 0.116, "grad_norm": 29.125, "grad_norm_var": 3.153059895833333, "learning_rate": 0.0001, "loss": 7.5874, "loss/crossentropy": 2.112857761979103, "loss/hidden": 3.543359375, "loss/jsd": 0.0, "loss/logits": 0.2121016476303339, "step": 3480 }, { "epoch": 0.11633333333333333, "grad_norm": 31.625, "grad_norm_var": 4.187239583333334, "learning_rate": 0.0001, "loss": 7.3873, "loss/crossentropy": 1.9675170451402664, "loss/hidden": 3.497265625, "loss/jsd": 0.0, "loss/logits": 0.19640588220208882, "step": 3490 }, { "epoch": 0.11666666666666667, "grad_norm": 27.375, "grad_norm_var": 6.042122395833333, "learning_rate": 0.0001, "loss": 7.4707, "loss/crossentropy": 1.8870328783988952, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.18739598374813796, "step": 3500 }, { "epoch": 0.117, "grad_norm": 32.25, "grad_norm_var": 6.116080729166667, "learning_rate": 0.0001, "loss": 7.451, "loss/crossentropy": 2.1010326854884624, "loss/hidden": 3.5203125, "loss/jsd": 0.0, "loss/logits": 0.20313938818871974, "step": 3510 }, { "epoch": 0.11733333333333333, "grad_norm": 31.375, "grad_norm_var": 4.824739583333334, "learning_rate": 0.0001, "loss": 7.4738, "loss/crossentropy": 1.979208241403103, "loss/hidden": 3.55390625, "loss/jsd": 0.0, "loss/logits": 0.21524249725043773, "step": 3520 }, { "epoch": 0.11766666666666667, "grad_norm": 27.25, "grad_norm_var": 3.8983723958333334, "learning_rate": 0.0001, "loss": 7.4112, "loss/crossentropy": 1.9373849138617516, "loss/hidden": 3.534375, "loss/jsd": 0.0, "loss/logits": 0.20901197157800197, "step": 3530 }, { "epoch": 0.118, "grad_norm": 33.25, "grad_norm_var": 2.2799472881345495e+18, "learning_rate": 0.0001, "loss": 7.5303, "loss/crossentropy": 1.9143501874059439, "loss/hidden": 3.45390625, "loss/jsd": 0.0, "loss/logits": 0.1845899167470634, "step": 3540 }, { "epoch": 0.11833333333333333, "grad_norm": 29.25, "grad_norm_var": 11.645572916666667, "learning_rate": 0.0001, "loss": 7.3518, "loss/crossentropy": 1.864604126662016, "loss/hidden": 3.4015625, "loss/jsd": 0.0, "loss/logits": 0.1810127807315439, "step": 3550 }, { "epoch": 0.11866666666666667, "grad_norm": 26.875, "grad_norm_var": 2.8238932291666665, "learning_rate": 0.0001, "loss": 7.3168, "loss/crossentropy": 1.8864627651870252, "loss/hidden": 3.516015625, "loss/jsd": 0.0, "loss/logits": 0.192725289426744, "step": 3560 }, { "epoch": 0.119, "grad_norm": 37.75, "grad_norm_var": 6.1353515625, "learning_rate": 0.0001, "loss": 7.4636, "loss/crossentropy": 2.0385304942727087, "loss/hidden": 3.51875, "loss/jsd": 0.0, "loss/logits": 0.20133369509130716, "step": 3570 }, { "epoch": 0.11933333333333333, "grad_norm": 27.25, "grad_norm_var": 5.903125, "learning_rate": 0.0001, "loss": 7.361, "loss/crossentropy": 1.9261676244437695, "loss/hidden": 3.48203125, "loss/jsd": 0.0, "loss/logits": 0.19395384658128023, "step": 3580 }, { "epoch": 0.11966666666666667, "grad_norm": 28.0, "grad_norm_var": 3.160416666666667, "learning_rate": 0.0001, "loss": 7.3347, "loss/crossentropy": 1.9396270588040352, "loss/hidden": 3.586328125, "loss/jsd": 0.0, "loss/logits": 0.19555766582489015, "step": 3590 }, { "epoch": 0.12, "grad_norm": 28.375, "grad_norm_var": 3.6207682291666665, "learning_rate": 0.0001, "loss": 7.3577, "loss/crossentropy": 2.1443623453378677, "loss/hidden": 3.455078125, "loss/jsd": 0.0, "loss/logits": 0.21093165650963783, "step": 3600 }, { "epoch": 0.12033333333333333, "grad_norm": 28.625, "grad_norm_var": 2.205208333333333, "learning_rate": 0.0001, "loss": 7.4796, "loss/crossentropy": 2.0280169516801836, "loss/hidden": 3.516015625, "loss/jsd": 0.0, "loss/logits": 0.199892882630229, "step": 3610 }, { "epoch": 0.12066666666666667, "grad_norm": 33.25, "grad_norm_var": 5.2125, "learning_rate": 0.0001, "loss": 7.5701, "loss/crossentropy": 1.9959274344146252, "loss/hidden": 3.4875, "loss/jsd": 0.0, "loss/logits": 0.20606609880924226, "step": 3620 }, { "epoch": 0.121, "grad_norm": 28.25, "grad_norm_var": 4.215625, "learning_rate": 0.0001, "loss": 7.3634, "loss/crossentropy": 1.8985692247748376, "loss/hidden": 3.391015625, "loss/jsd": 0.0, "loss/logits": 0.18893487583845853, "step": 3630 }, { "epoch": 0.12133333333333333, "grad_norm": 29.125, "grad_norm_var": 1.4552083333333334, "learning_rate": 0.0001, "loss": 7.4034, "loss/crossentropy": 1.928818552196026, "loss/hidden": 3.5109375, "loss/jsd": 0.0, "loss/logits": 0.1885963099077344, "step": 3640 }, { "epoch": 0.12166666666666667, "grad_norm": 30.625, "grad_norm_var": 1.603125, "learning_rate": 0.0001, "loss": 7.4808, "loss/crossentropy": 2.0303805127739905, "loss/hidden": 3.548046875, "loss/jsd": 0.0, "loss/logits": 0.20974878408014774, "step": 3650 }, { "epoch": 0.122, "grad_norm": 30.75, "grad_norm_var": 2.04375, "learning_rate": 0.0001, "loss": 7.3728, "loss/crossentropy": 2.056822224706411, "loss/hidden": 3.457421875, "loss/jsd": 0.0, "loss/logits": 0.1907132336869836, "step": 3660 }, { "epoch": 0.12233333333333334, "grad_norm": 28.125, "grad_norm_var": 0.9934895833333334, "learning_rate": 0.0001, "loss": 7.5691, "loss/crossentropy": 2.234497883915901, "loss/hidden": 3.5125, "loss/jsd": 0.0, "loss/logits": 0.21699313148856164, "step": 3670 }, { "epoch": 0.12266666666666666, "grad_norm": 30.5, "grad_norm_var": 0.8806640625, "learning_rate": 0.0001, "loss": 7.4024, "loss/crossentropy": 1.9364117056131362, "loss/hidden": 3.526171875, "loss/jsd": 0.0, "loss/logits": 0.19387449249625205, "step": 3680 }, { "epoch": 0.123, "grad_norm": 33.75, "grad_norm_var": 6.7744140625, "learning_rate": 0.0001, "loss": 7.4407, "loss/crossentropy": 1.942778319120407, "loss/hidden": 3.42578125, "loss/jsd": 0.0, "loss/logits": 0.19070099834352733, "step": 3690 }, { "epoch": 0.12333333333333334, "grad_norm": 27.875, "grad_norm_var": 6.880143229166666, "learning_rate": 0.0001, "loss": 7.3831, "loss/crossentropy": 2.1350757479667664, "loss/hidden": 3.435546875, "loss/jsd": 0.0, "loss/logits": 0.19981242381036282, "step": 3700 }, { "epoch": 0.12366666666666666, "grad_norm": 27.25, "grad_norm_var": 2.5559895833333335, "learning_rate": 0.0001, "loss": 7.3988, "loss/crossentropy": 1.9379477031528949, "loss/hidden": 3.59765625, "loss/jsd": 0.0, "loss/logits": 0.20407978985458614, "step": 3710 }, { "epoch": 0.124, "grad_norm": 30.875, "grad_norm_var": 2.409309895833333, "learning_rate": 0.0001, "loss": 7.3331, "loss/crossentropy": 1.8865982994437218, "loss/hidden": 3.443359375, "loss/jsd": 0.0, "loss/logits": 0.19101537987589837, "step": 3720 }, { "epoch": 0.12433333333333334, "grad_norm": 29.375, "grad_norm_var": 2.8604166666666666, "learning_rate": 0.0001, "loss": 7.5101, "loss/crossentropy": 2.0952261596918107, "loss/hidden": 3.51640625, "loss/jsd": 0.0, "loss/logits": 0.19895972684025764, "step": 3730 }, { "epoch": 0.12466666666666666, "grad_norm": 33.5, "grad_norm_var": 6.4375, "learning_rate": 0.0001, "loss": 7.5632, "loss/crossentropy": 2.070046865940094, "loss/hidden": 3.457421875, "loss/jsd": 0.0, "loss/logits": 0.1971546915359795, "step": 3740 }, { "epoch": 0.125, "grad_norm": 29.125, "grad_norm_var": 5.905143229166667, "learning_rate": 0.0001, "loss": 7.4182, "loss/crossentropy": 2.0312545858323574, "loss/hidden": 3.4875, "loss/jsd": 0.0, "loss/logits": 0.18728531878441573, "step": 3750 }, { "epoch": 0.12533333333333332, "grad_norm": 30.875, "grad_norm_var": 2.4119140625, "learning_rate": 0.0001, "loss": 7.5519, "loss/crossentropy": 1.9634180948138238, "loss/hidden": 3.530859375, "loss/jsd": 0.0, "loss/logits": 0.18896894976496698, "step": 3760 }, { "epoch": 0.12566666666666668, "grad_norm": 29.75, "grad_norm_var": 3.748893229166667, "learning_rate": 0.0001, "loss": 7.4405, "loss/crossentropy": 2.0053124725818634, "loss/hidden": 3.608984375, "loss/jsd": 0.0, "loss/logits": 0.22514045294374227, "step": 3770 }, { "epoch": 0.126, "grad_norm": 29.0, "grad_norm_var": 3.351822916666667, "learning_rate": 0.0001, "loss": 7.4796, "loss/crossentropy": 1.9557438887655736, "loss/hidden": 3.532421875, "loss/jsd": 0.0, "loss/logits": 0.203252624720335, "step": 3780 }, { "epoch": 0.12633333333333333, "grad_norm": 30.375, "grad_norm_var": 3.246875, "learning_rate": 0.0001, "loss": 7.4182, "loss/crossentropy": 2.019977870583534, "loss/hidden": 3.50625, "loss/jsd": 0.0, "loss/logits": 0.2028788050636649, "step": 3790 }, { "epoch": 0.12666666666666668, "grad_norm": 32.5, "grad_norm_var": 1.520598173133203e+18, "learning_rate": 0.0001, "loss": 7.6338, "loss/crossentropy": 2.0642739944159985, "loss/hidden": 3.48828125, "loss/jsd": 0.0, "loss/logits": 0.19320375341922044, "step": 3800 }, { "epoch": 0.127, "grad_norm": 30.5, "grad_norm_var": 2.6333333333333333, "learning_rate": 0.0001, "loss": 7.552, "loss/crossentropy": 1.99695186316967, "loss/hidden": 3.544140625, "loss/jsd": 0.0, "loss/logits": 0.20978426598012448, "step": 3810 }, { "epoch": 0.12733333333333333, "grad_norm": 29.25, "grad_norm_var": 2.1051432291666665, "learning_rate": 0.0001, "loss": 7.416, "loss/crossentropy": 2.0920657381415366, "loss/hidden": 3.419140625, "loss/jsd": 0.0, "loss/logits": 0.2062800783663988, "step": 3820 }, { "epoch": 0.12766666666666668, "grad_norm": 31.625, "grad_norm_var": 10.94140625, "learning_rate": 0.0001, "loss": 7.6019, "loss/crossentropy": 1.9800303630530833, "loss/hidden": 3.430859375, "loss/jsd": 0.0, "loss/logits": 0.19484127070754767, "step": 3830 }, { "epoch": 0.128, "grad_norm": 34.5, "grad_norm_var": 3.974739583333333, "learning_rate": 0.0001, "loss": 7.3943, "loss/crossentropy": 2.0763349503278734, "loss/hidden": 3.421484375, "loss/jsd": 0.0, "loss/logits": 0.19928722847253083, "step": 3840 }, { "epoch": 0.12833333333333333, "grad_norm": 7482638336.0, "grad_norm_var": 3.4993672493350605e+18, "learning_rate": 0.0001, "loss": 7.5376, "loss/crossentropy": 1.8679085165262221, "loss/hidden": 3.60625, "loss/jsd": 0.0, "loss/logits": 0.2087004542350769, "step": 3850 }, { "epoch": 0.12866666666666668, "grad_norm": 28.375, "grad_norm_var": 3.4993672505042227e+18, "learning_rate": 0.0001, "loss": 7.4715, "loss/crossentropy": 2.040581877529621, "loss/hidden": 3.455078125, "loss/jsd": 0.0, "loss/logits": 0.19136519059538842, "step": 3860 }, { "epoch": 0.129, "grad_norm": 27.375, "grad_norm_var": 3.8684895833333335, "learning_rate": 0.0001, "loss": 7.5768, "loss/crossentropy": 2.0877299554646016, "loss/hidden": 3.653125, "loss/jsd": 0.0, "loss/logits": 0.2131496659014374, "step": 3870 }, { "epoch": 0.12933333333333333, "grad_norm": 28.25, "grad_norm_var": 3.22265625, "learning_rate": 0.0001, "loss": 7.4144, "loss/crossentropy": 1.7788232088088989, "loss/hidden": 3.578515625, "loss/jsd": 0.0, "loss/logits": 0.1871545252390206, "step": 3880 }, { "epoch": 0.12966666666666668, "grad_norm": 28.625, "grad_norm_var": 2.131705729166667, "learning_rate": 0.0001, "loss": 7.5155, "loss/crossentropy": 2.099466894567013, "loss/hidden": 3.35625, "loss/jsd": 0.0, "loss/logits": 0.17847845293581485, "step": 3890 }, { "epoch": 0.13, "grad_norm": 31.875, "grad_norm_var": 1.934375, "learning_rate": 0.0001, "loss": 7.4731, "loss/crossentropy": 1.994307056069374, "loss/hidden": 3.490234375, "loss/jsd": 0.0, "loss/logits": 0.19897048063576223, "step": 3900 }, { "epoch": 0.13033333333333333, "grad_norm": 32.0, "grad_norm_var": 2.2103515625, "learning_rate": 0.0001, "loss": 7.4861, "loss/crossentropy": 2.0831891864538195, "loss/hidden": 3.4109375, "loss/jsd": 0.0, "loss/logits": 0.18841091617941857, "step": 3910 }, { "epoch": 0.13066666666666665, "grad_norm": 29.0, "grad_norm_var": 1.7030598958333334, "learning_rate": 0.0001, "loss": 7.4696, "loss/crossentropy": 2.0479237228631972, "loss/hidden": 3.483203125, "loss/jsd": 0.0, "loss/logits": 0.19577913228422403, "step": 3920 }, { "epoch": 0.131, "grad_norm": 31.25, "grad_norm_var": 1.7166666666666666, "learning_rate": 0.0001, "loss": 7.3434, "loss/crossentropy": 1.7236192353069781, "loss/hidden": 3.39921875, "loss/jsd": 0.0, "loss/logits": 0.17098939018324016, "step": 3930 }, { "epoch": 0.13133333333333333, "grad_norm": 28.625, "grad_norm_var": 1.6582682291666666, "learning_rate": 0.0001, "loss": 7.4801, "loss/crossentropy": 1.9546857163310052, "loss/hidden": 3.60078125, "loss/jsd": 0.0, "loss/logits": 0.19484964944422245, "step": 3940 }, { "epoch": 0.13166666666666665, "grad_norm": 30.75, "grad_norm_var": 2.3053504046853084e+18, "learning_rate": 0.0001, "loss": 7.4119, "loss/crossentropy": 1.9890823870897294, "loss/hidden": 3.41484375, "loss/jsd": 0.0, "loss/logits": 0.19035766888409852, "step": 3950 }, { "epoch": 0.132, "grad_norm": 36.25, "grad_norm_var": 2.3053504032175816e+18, "learning_rate": 0.0001, "loss": 7.5061, "loss/crossentropy": 1.8188596323132515, "loss/hidden": 3.556640625, "loss/jsd": 0.0, "loss/logits": 0.19044051151722668, "step": 3960 }, { "epoch": 0.13233333333333333, "grad_norm": 33.0, "grad_norm_var": 5.155143229166667, "learning_rate": 0.0001, "loss": 7.5002, "loss/crossentropy": 1.9718340262770653, "loss/hidden": 3.4328125, "loss/jsd": 0.0, "loss/logits": 0.1962990122847259, "step": 3970 }, { "epoch": 0.13266666666666665, "grad_norm": 28.25, "grad_norm_var": 59.3822265625, "learning_rate": 0.0001, "loss": 7.441, "loss/crossentropy": 1.8465905606746673, "loss/hidden": 3.467578125, "loss/jsd": 0.0, "loss/logits": 0.18071764558553696, "step": 3980 }, { "epoch": 0.133, "grad_norm": 30.625, "grad_norm_var": 59.7259765625, "learning_rate": 0.0001, "loss": 7.4784, "loss/crossentropy": 2.049452635645866, "loss/hidden": 3.440625, "loss/jsd": 0.0, "loss/logits": 0.19957882836461066, "step": 3990 }, { "epoch": 0.13333333333333333, "grad_norm": 27.375, "grad_norm_var": 2.0322265625, "learning_rate": 0.0001, "loss": 7.5144, "loss/crossentropy": 1.9349242925643921, "loss/hidden": 3.568359375, "loss/jsd": 0.0, "loss/logits": 0.19886549953371285, "step": 4000 }, { "epoch": 0.13366666666666666, "grad_norm": 31.375, "grad_norm_var": 2.3622395833333334, "learning_rate": 0.0001, "loss": 7.4662, "loss/crossentropy": 1.9725325986742974, "loss/hidden": 3.52265625, "loss/jsd": 0.0, "loss/logits": 0.19714612942188978, "step": 4010 }, { "epoch": 0.134, "grad_norm": 31.625, "grad_norm_var": 2.34140625, "learning_rate": 0.0001, "loss": 7.4965, "loss/crossentropy": 1.9797401160001755, "loss/hidden": 3.455859375, "loss/jsd": 0.0, "loss/logits": 0.19259887989610433, "step": 4020 }, { "epoch": 0.13433333333333333, "grad_norm": 30.125, "grad_norm_var": 3.78125, "learning_rate": 0.0001, "loss": 7.5898, "loss/crossentropy": 1.9201873689889908, "loss/hidden": 3.576953125, "loss/jsd": 0.0, "loss/logits": 0.2038030235096812, "step": 4030 }, { "epoch": 0.13466666666666666, "grad_norm": 30.875, "grad_norm_var": 5.3337890625, "learning_rate": 0.0001, "loss": 7.4327, "loss/crossentropy": 2.1032419338822366, "loss/hidden": 3.476171875, "loss/jsd": 0.0, "loss/logits": 0.20691804829984903, "step": 4040 }, { "epoch": 0.135, "grad_norm": 29.875, "grad_norm_var": 3.7827473958333333, "learning_rate": 0.0001, "loss": 7.5277, "loss/crossentropy": 2.169746695458889, "loss/hidden": 3.437890625, "loss/jsd": 0.0, "loss/logits": 0.20103545431047679, "step": 4050 }, { "epoch": 0.13533333333333333, "grad_norm": 31.75, "grad_norm_var": 2.973372395833333, "learning_rate": 0.0001, "loss": 7.3943, "loss/crossentropy": 1.9815341517329217, "loss/hidden": 3.500390625, "loss/jsd": 0.0, "loss/logits": 0.20097548943012952, "step": 4060 }, { "epoch": 0.13566666666666666, "grad_norm": 28.625, "grad_norm_var": 1.525, "learning_rate": 0.0001, "loss": 7.3709, "loss/crossentropy": 2.050188249349594, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.19337886217981576, "step": 4070 }, { "epoch": 0.136, "grad_norm": 30.125, "grad_norm_var": 0.8205729166666667, "learning_rate": 0.0001, "loss": 7.4022, "loss/crossentropy": 1.9687265530228615, "loss/hidden": 3.396875, "loss/jsd": 0.0, "loss/logits": 0.1929249044507742, "step": 4080 }, { "epoch": 0.13633333333333333, "grad_norm": 29.375, "grad_norm_var": 39.81295572916667, "learning_rate": 0.0001, "loss": 7.5802, "loss/crossentropy": 2.006483754515648, "loss/hidden": 3.5359375, "loss/jsd": 0.0, "loss/logits": 0.20347609724849464, "step": 4090 }, { "epoch": 0.13666666666666666, "grad_norm": 29.5, "grad_norm_var": 167.5009765625, "learning_rate": 0.0001, "loss": 7.4448, "loss/crossentropy": 1.8040572479367256, "loss/hidden": 3.46484375, "loss/jsd": 0.0, "loss/logits": 0.19855520855635406, "step": 4100 }, { "epoch": 0.137, "grad_norm": 28.5, "grad_norm_var": 2.606184895833333, "learning_rate": 0.0001, "loss": 7.5065, "loss/crossentropy": 1.7555107027292252, "loss/hidden": 3.523828125, "loss/jsd": 0.0, "loss/logits": 0.1794820530805737, "step": 4110 }, { "epoch": 0.13733333333333334, "grad_norm": 31.625, "grad_norm_var": 2.4344770496656573e+18, "learning_rate": 0.0001, "loss": 7.5115, "loss/crossentropy": 2.0089548528194427, "loss/hidden": 3.53359375, "loss/jsd": 0.0, "loss/logits": 0.19754117913544178, "step": 4120 }, { "epoch": 0.13766666666666666, "grad_norm": 29.5, "grad_norm_var": 2.434477049691662e+18, "learning_rate": 0.0001, "loss": 7.4954, "loss/crossentropy": 2.0060373283922672, "loss/hidden": 3.525390625, "loss/jsd": 0.0, "loss/logits": 0.19557616151869298, "step": 4130 }, { "epoch": 0.138, "grad_norm": 30.125, "grad_norm_var": 79.81295572916666, "learning_rate": 0.0001, "loss": 7.433, "loss/crossentropy": 2.165028876066208, "loss/hidden": 3.467578125, "loss/jsd": 0.0, "loss/logits": 0.20818099621683359, "step": 4140 }, { "epoch": 0.13833333333333334, "grad_norm": 31.875, "grad_norm_var": 83.54212239583333, "learning_rate": 0.0001, "loss": 7.4617, "loss/crossentropy": 1.9516791716217994, "loss/hidden": 3.502734375, "loss/jsd": 0.0, "loss/logits": 0.18915393725037574, "step": 4150 }, { "epoch": 0.13866666666666666, "grad_norm": 29.0, "grad_norm_var": 2.7811848958333334, "learning_rate": 0.0001, "loss": 7.3217, "loss/crossentropy": 1.9394512429833413, "loss/hidden": 3.538671875, "loss/jsd": 0.0, "loss/logits": 0.20393375800922514, "step": 4160 }, { "epoch": 0.139, "grad_norm": 30.625, "grad_norm_var": 2.405989583333333, "learning_rate": 0.0001, "loss": 7.4611, "loss/crossentropy": 1.8435194082558155, "loss/hidden": 3.548828125, "loss/jsd": 0.0, "loss/logits": 0.18950867913663388, "step": 4170 }, { "epoch": 0.13933333333333334, "grad_norm": 28.5, "grad_norm_var": 6.318684895833333, "learning_rate": 0.0001, "loss": 7.4523, "loss/crossentropy": 1.807323095947504, "loss/hidden": 3.536328125, "loss/jsd": 0.0, "loss/logits": 0.20113845579326153, "step": 4180 }, { "epoch": 0.13966666666666666, "grad_norm": 29.125, "grad_norm_var": 7.7384765625, "learning_rate": 0.0001, "loss": 7.4015, "loss/crossentropy": 1.9143501825630664, "loss/hidden": 3.3734375, "loss/jsd": 0.0, "loss/logits": 0.1735602943226695, "step": 4190 }, { "epoch": 0.14, "grad_norm": 29.875, "grad_norm_var": 4.010872395833333, "learning_rate": 0.0001, "loss": 7.3273, "loss/crossentropy": 1.9778856128454207, "loss/hidden": 3.463671875, "loss/jsd": 0.0, "loss/logits": 0.18200624100863932, "step": 4200 }, { "epoch": 0.14033333333333334, "grad_norm": 30.5, "grad_norm_var": 2.5916015625, "learning_rate": 0.0001, "loss": 7.5658, "loss/crossentropy": 2.1734305538237093, "loss/hidden": 3.53203125, "loss/jsd": 0.0, "loss/logits": 0.220436554774642, "step": 4210 }, { "epoch": 0.14066666666666666, "grad_norm": 28.125, "grad_norm_var": 7.5462890625, "learning_rate": 0.0001, "loss": 7.5633, "loss/crossentropy": 2.1664201892912387, "loss/hidden": 3.552734375, "loss/jsd": 0.0, "loss/logits": 0.20990947782993316, "step": 4220 }, { "epoch": 0.141, "grad_norm": 26.75, "grad_norm_var": 8.26015625, "learning_rate": 0.0001, "loss": 7.3841, "loss/crossentropy": 1.8498427130281925, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.20062209563329816, "step": 4230 }, { "epoch": 0.14133333333333334, "grad_norm": 27.875, "grad_norm_var": 3.0747395833333333, "learning_rate": 0.0001, "loss": 7.3119, "loss/crossentropy": 1.868785708397627, "loss/hidden": 3.481640625, "loss/jsd": 0.0, "loss/logits": 0.18442066367715598, "step": 4240 }, { "epoch": 0.14166666666666666, "grad_norm": 32.5, "grad_norm_var": 4.773893229166666, "learning_rate": 0.0001, "loss": 7.398, "loss/crossentropy": 1.926427349448204, "loss/hidden": 3.454296875, "loss/jsd": 0.0, "loss/logits": 0.19232564913108946, "step": 4250 }, { "epoch": 0.142, "grad_norm": 28.875, "grad_norm_var": 6.0125, "learning_rate": 0.0001, "loss": 7.3891, "loss/crossentropy": 1.8769178196787835, "loss/hidden": 3.34765625, "loss/jsd": 0.0, "loss/logits": 0.18023333363234997, "step": 4260 }, { "epoch": 0.14233333333333334, "grad_norm": 30.125, "grad_norm_var": 5.557747395833333, "learning_rate": 0.0001, "loss": 7.4766, "loss/crossentropy": 1.8873730070888997, "loss/hidden": 3.542578125, "loss/jsd": 0.0, "loss/logits": 0.20799308605492114, "step": 4270 }, { "epoch": 0.14266666666666666, "grad_norm": 31.625, "grad_norm_var": 3.214583333333333, "learning_rate": 0.0001, "loss": 7.4486, "loss/crossentropy": 2.0190445192158224, "loss/hidden": 3.476953125, "loss/jsd": 0.0, "loss/logits": 0.19258085153996946, "step": 4280 }, { "epoch": 0.143, "grad_norm": 30.75, "grad_norm_var": 2.442643229166667, "learning_rate": 0.0001, "loss": 7.384, "loss/crossentropy": 1.9068180739879608, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.1906971350312233, "step": 4290 }, { "epoch": 0.14333333333333334, "grad_norm": 30.875, "grad_norm_var": 3.0247395833333335, "learning_rate": 0.0001, "loss": 7.5427, "loss/crossentropy": 2.003492370247841, "loss/hidden": 3.42890625, "loss/jsd": 0.0, "loss/logits": 0.19313554968684912, "step": 4300 }, { "epoch": 0.14366666666666666, "grad_norm": 28.625, "grad_norm_var": 1.1369140625, "learning_rate": 0.0001, "loss": 7.3942, "loss/crossentropy": 1.9972608074545861, "loss/hidden": 3.427734375, "loss/jsd": 0.0, "loss/logits": 0.18862742614001035, "step": 4310 }, { "epoch": 0.144, "grad_norm": 30.875, "grad_norm_var": 3.459830729166667, "learning_rate": 0.0001, "loss": 7.4604, "loss/crossentropy": 2.0986968971788884, "loss/hidden": 3.51875, "loss/jsd": 0.0, "loss/logits": 0.22898746514692903, "step": 4320 }, { "epoch": 0.14433333333333334, "grad_norm": 31.125, "grad_norm_var": 4.0056640625, "learning_rate": 0.0001, "loss": 7.5097, "loss/crossentropy": 2.0047623462975026, "loss/hidden": 3.540625, "loss/jsd": 0.0, "loss/logits": 0.20348726231604813, "step": 4330 }, { "epoch": 0.14466666666666667, "grad_norm": 29.75, "grad_norm_var": 4.396809895833333, "learning_rate": 0.0001, "loss": 7.4206, "loss/crossentropy": 1.8941109143197536, "loss/hidden": 3.456640625, "loss/jsd": 0.0, "loss/logits": 0.17960004024207593, "step": 4340 }, { "epoch": 0.145, "grad_norm": 28.625, "grad_norm_var": 1.8309895833333334, "learning_rate": 0.0001, "loss": 7.4266, "loss/crossentropy": 1.9382505618035792, "loss/hidden": 3.424609375, "loss/jsd": 0.0, "loss/logits": 0.18392374292016028, "step": 4350 }, { "epoch": 0.14533333333333334, "grad_norm": 31.75, "grad_norm_var": 3.309375, "learning_rate": 0.0001, "loss": 7.4006, "loss/crossentropy": 1.903117323666811, "loss/hidden": 3.508203125, "loss/jsd": 0.0, "loss/logits": 0.19998977733775974, "step": 4360 }, { "epoch": 0.14566666666666667, "grad_norm": 30.125, "grad_norm_var": 11.7806640625, "learning_rate": 0.0001, "loss": 7.4673, "loss/crossentropy": 2.044017915427685, "loss/hidden": 3.659375, "loss/jsd": 0.0, "loss/logits": 0.21902335993945599, "step": 4370 }, { "epoch": 0.146, "grad_norm": 29.625, "grad_norm_var": 12.326041666666667, "learning_rate": 0.0001, "loss": 7.491, "loss/crossentropy": 1.9895926266908646, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.19060881985351444, "step": 4380 }, { "epoch": 0.14633333333333334, "grad_norm": 30.75, "grad_norm_var": 2.4160807291666666, "learning_rate": 0.0001, "loss": 7.3001, "loss/crossentropy": 2.082070082426071, "loss/hidden": 3.466796875, "loss/jsd": 0.0, "loss/logits": 0.19202737975865602, "step": 4390 }, { "epoch": 0.14666666666666667, "grad_norm": 35.0, "grad_norm_var": 10.678125, "learning_rate": 0.0001, "loss": 7.459, "loss/crossentropy": 2.007606502622366, "loss/hidden": 3.508984375, "loss/jsd": 0.0, "loss/logits": 0.19886050876230002, "step": 4400 }, { "epoch": 0.147, "grad_norm": 29.125, "grad_norm_var": 3.345572916666667, "learning_rate": 0.0001, "loss": 7.2873, "loss/crossentropy": 1.8293472424149513, "loss/hidden": 3.551953125, "loss/jsd": 0.0, "loss/logits": 0.18400842538103462, "step": 4410 }, { "epoch": 0.14733333333333334, "grad_norm": 33.5, "grad_norm_var": 20.941666666666666, "learning_rate": 0.0001, "loss": 7.3938, "loss/crossentropy": 1.8778200969099998, "loss/hidden": 3.543359375, "loss/jsd": 0.0, "loss/logits": 0.21048751659691334, "step": 4420 }, { "epoch": 0.14766666666666667, "grad_norm": 30.25, "grad_norm_var": 21.690625, "learning_rate": 0.0001, "loss": 7.3056, "loss/crossentropy": 1.9629560872912406, "loss/hidden": 3.57890625, "loss/jsd": 0.0, "loss/logits": 0.197906737588346, "step": 4430 }, { "epoch": 0.148, "grad_norm": 33.0, "grad_norm_var": 3.284375, "learning_rate": 0.0001, "loss": 7.4357, "loss/crossentropy": 1.9894877552986145, "loss/hidden": 3.583984375, "loss/jsd": 0.0, "loss/logits": 0.20565036609768866, "step": 4440 }, { "epoch": 0.14833333333333334, "grad_norm": 39.5, "grad_norm_var": 3.2831241010079073e+18, "learning_rate": 0.0001, "loss": 7.4525, "loss/crossentropy": 1.8737743824720383, "loss/hidden": 3.483984375, "loss/jsd": 0.0, "loss/logits": 0.1965087026357651, "step": 4450 }, { "epoch": 0.14866666666666667, "grad_norm": 30.25, "grad_norm_var": 3.283124099754649e+18, "learning_rate": 0.0001, "loss": 7.5507, "loss/crossentropy": 2.0716402977705, "loss/hidden": 3.40703125, "loss/jsd": 0.0, "loss/logits": 0.20167440176010132, "step": 4460 }, { "epoch": 0.149, "grad_norm": 30.0, "grad_norm_var": 4.410416666666666, "learning_rate": 0.0001, "loss": 7.4175, "loss/crossentropy": 1.89669204428792, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.21117562893778086, "step": 4470 }, { "epoch": 0.14933333333333335, "grad_norm": 29.875, "grad_norm_var": 3.9358723958333335, "learning_rate": 0.0001, "loss": 7.3422, "loss/crossentropy": 1.8715087167918683, "loss/hidden": 3.616015625, "loss/jsd": 0.0, "loss/logits": 0.20132542541250587, "step": 4480 }, { "epoch": 0.14966666666666667, "grad_norm": 29.5, "grad_norm_var": 20.022330729166665, "learning_rate": 0.0001, "loss": 7.4029, "loss/crossentropy": 1.9388548903167249, "loss/hidden": 3.425390625, "loss/jsd": 0.0, "loss/logits": 0.18345501432195305, "step": 4490 }, { "epoch": 0.15, "grad_norm": 28.125, "grad_norm_var": 12.9666015625, "learning_rate": 0.0001, "loss": 7.5074, "loss/crossentropy": 2.068153513967991, "loss/hidden": 3.556640625, "loss/jsd": 0.0, "loss/logits": 0.21649410519748927, "step": 4500 }, { "epoch": 0.15033333333333335, "grad_norm": 26.875, "grad_norm_var": 13.52890625, "learning_rate": 0.0001, "loss": 7.2949, "loss/crossentropy": 2.05457132011652, "loss/hidden": 3.423046875, "loss/jsd": 0.0, "loss/logits": 0.18980348333716393, "step": 4510 }, { "epoch": 0.15066666666666667, "grad_norm": 28.75, "grad_norm_var": 19.64765625, "learning_rate": 0.0001, "loss": 7.3884, "loss/crossentropy": 1.9403226912021636, "loss/hidden": 3.56796875, "loss/jsd": 0.0, "loss/logits": 0.1966216434724629, "step": 4520 }, { "epoch": 0.151, "grad_norm": 28.25, "grad_norm_var": 12.336393229166667, "learning_rate": 0.0001, "loss": 7.5821, "loss/crossentropy": 2.1127709090709685, "loss/hidden": 3.464453125, "loss/jsd": 0.0, "loss/logits": 0.20213118921965362, "step": 4530 }, { "epoch": 0.15133333333333332, "grad_norm": 28.5, "grad_norm_var": 34.92493489583333, "learning_rate": 0.0001, "loss": 7.5213, "loss/crossentropy": 2.013382240384817, "loss/hidden": 3.516015625, "loss/jsd": 0.0, "loss/logits": 0.19650817457586528, "step": 4540 }, { "epoch": 0.15166666666666667, "grad_norm": 31.125, "grad_norm_var": 2.4184895833333333, "learning_rate": 0.0001, "loss": 7.5043, "loss/crossentropy": 2.0900513559579847, "loss/hidden": 3.43671875, "loss/jsd": 0.0, "loss/logits": 0.192518649995327, "step": 4550 }, { "epoch": 0.152, "grad_norm": 31.375, "grad_norm_var": 37.118489583333336, "learning_rate": 0.0001, "loss": 7.4182, "loss/crossentropy": 1.7882040813565254, "loss/hidden": 3.377734375, "loss/jsd": 0.0, "loss/logits": 0.18156962506473065, "step": 4560 }, { "epoch": 0.15233333333333332, "grad_norm": 28.625, "grad_norm_var": 37.0306640625, "learning_rate": 0.0001, "loss": 7.3733, "loss/crossentropy": 2.005775400996208, "loss/hidden": 3.440234375, "loss/jsd": 0.0, "loss/logits": 0.1880094451829791, "step": 4570 }, { "epoch": 0.15266666666666667, "grad_norm": 31.0, "grad_norm_var": 3.379166666666667, "learning_rate": 0.0001, "loss": 7.4664, "loss/crossentropy": 1.9818413272500037, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.1752587029710412, "step": 4580 }, { "epoch": 0.153, "grad_norm": 27.875, "grad_norm_var": 1.7518229166666666, "learning_rate": 0.0001, "loss": 7.2726, "loss/crossentropy": 1.8078279815614224, "loss/hidden": 3.500390625, "loss/jsd": 0.0, "loss/logits": 0.19076116774231194, "step": 4590 }, { "epoch": 0.15333333333333332, "grad_norm": 29.25, "grad_norm_var": 12.182747395833333, "learning_rate": 0.0001, "loss": 7.3288, "loss/crossentropy": 2.1259574025869368, "loss/hidden": 3.445703125, "loss/jsd": 0.0, "loss/logits": 0.20191418156027793, "step": 4600 }, { "epoch": 0.15366666666666667, "grad_norm": 28.75, "grad_norm_var": 11.820833333333333, "learning_rate": 0.0001, "loss": 7.4235, "loss/crossentropy": 1.9628787368535996, "loss/hidden": 3.547265625, "loss/jsd": 0.0, "loss/logits": 0.1948615150526166, "step": 4610 }, { "epoch": 0.154, "grad_norm": 27.25, "grad_norm_var": 2.877083333333333, "learning_rate": 0.0001, "loss": 7.4381, "loss/crossentropy": 2.0635317392647265, "loss/hidden": 3.39921875, "loss/jsd": 0.0, "loss/logits": 0.195272920653224, "step": 4620 }, { "epoch": 0.15433333333333332, "grad_norm": 28.875, "grad_norm_var": 1.6124348958333334, "learning_rate": 0.0001, "loss": 7.4073, "loss/crossentropy": 1.9314781568944455, "loss/hidden": 3.553515625, "loss/jsd": 0.0, "loss/logits": 0.2065671019256115, "step": 4630 }, { "epoch": 0.15466666666666667, "grad_norm": 28.125, "grad_norm_var": 2.151497395833333, "learning_rate": 0.0001, "loss": 7.4882, "loss/crossentropy": 2.0347410716116427, "loss/hidden": 3.414453125, "loss/jsd": 0.0, "loss/logits": 0.19105215203016995, "step": 4640 }, { "epoch": 0.155, "grad_norm": 29.75, "grad_norm_var": 5.44375, "learning_rate": 0.0001, "loss": 7.435, "loss/crossentropy": 1.9775021493434906, "loss/hidden": 3.6421875, "loss/jsd": 0.0, "loss/logits": 0.20977960508316756, "step": 4650 }, { "epoch": 0.15533333333333332, "grad_norm": 28.625, "grad_norm_var": 3.214583333333333, "learning_rate": 0.0001, "loss": 7.2812, "loss/crossentropy": 2.0121472641825675, "loss/hidden": 3.49375, "loss/jsd": 0.0, "loss/logits": 0.1979538168758154, "step": 4660 }, { "epoch": 0.15566666666666668, "grad_norm": 31.125, "grad_norm_var": 6.596875, "learning_rate": 0.0001, "loss": 7.4243, "loss/crossentropy": 2.007686108723283, "loss/hidden": 3.53671875, "loss/jsd": 0.0, "loss/logits": 0.21594900633208453, "step": 4670 }, { "epoch": 0.156, "grad_norm": 28.875, "grad_norm_var": 2.8968098958333335, "learning_rate": 0.0001, "loss": 7.4143, "loss/crossentropy": 1.9774284347891808, "loss/hidden": 3.492578125, "loss/jsd": 0.0, "loss/logits": 0.1945602297782898, "step": 4680 }, { "epoch": 0.15633333333333332, "grad_norm": 30.75, "grad_norm_var": 22.474739583333335, "learning_rate": 0.0001, "loss": 7.3322, "loss/crossentropy": 1.964871782064438, "loss/hidden": 3.433984375, "loss/jsd": 0.0, "loss/logits": 0.1918278619647026, "step": 4690 }, { "epoch": 0.15666666666666668, "grad_norm": 30.375, "grad_norm_var": 11.680989583333334, "learning_rate": 0.0001, "loss": 7.3856, "loss/crossentropy": 1.9804026313126086, "loss/hidden": 3.43125, "loss/jsd": 0.0, "loss/logits": 0.18859584210440516, "step": 4700 }, { "epoch": 0.157, "grad_norm": 29.5, "grad_norm_var": 16.248958333333334, "learning_rate": 0.0001, "loss": 7.3516, "loss/crossentropy": 1.9562460504472257, "loss/hidden": 3.498046875, "loss/jsd": 0.0, "loss/logits": 0.19623687621206046, "step": 4710 }, { "epoch": 0.15733333333333333, "grad_norm": 29.5, "grad_norm_var": 13.3228515625, "learning_rate": 0.0001, "loss": 7.4173, "loss/crossentropy": 2.004583294689655, "loss/hidden": 3.5328125, "loss/jsd": 0.0, "loss/logits": 0.20441180877387524, "step": 4720 }, { "epoch": 0.15766666666666668, "grad_norm": 29.375, "grad_norm_var": 6.094791666666667, "learning_rate": 0.0001, "loss": 7.4712, "loss/crossentropy": 2.067339327186346, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.2090114824473858, "step": 4730 }, { "epoch": 0.158, "grad_norm": 29.625, "grad_norm_var": 5.458333333333333, "learning_rate": 0.0001, "loss": 7.4735, "loss/crossentropy": 1.951393010467291, "loss/hidden": 3.4671875, "loss/jsd": 0.0, "loss/logits": 0.18563756169751286, "step": 4740 }, { "epoch": 0.15833333333333333, "grad_norm": 31.625, "grad_norm_var": 2.736458333333333, "learning_rate": 0.0001, "loss": 7.4906, "loss/crossentropy": 2.0527772501111032, "loss/hidden": 3.503125, "loss/jsd": 0.0, "loss/logits": 0.21584454253315927, "step": 4750 }, { "epoch": 0.15866666666666668, "grad_norm": 34.75, "grad_norm_var": 5.941666666666666, "learning_rate": 0.0001, "loss": 7.3276, "loss/crossentropy": 1.9436825871467591, "loss/hidden": 3.3375, "loss/jsd": 0.0, "loss/logits": 0.17539917454123496, "step": 4760 }, { "epoch": 0.159, "grad_norm": 27.25, "grad_norm_var": 11.901822916666667, "learning_rate": 0.0001, "loss": 7.3668, "loss/crossentropy": 1.9678773760795594, "loss/hidden": 3.43984375, "loss/jsd": 0.0, "loss/logits": 0.18869450595229864, "step": 4770 }, { "epoch": 0.15933333333333333, "grad_norm": 28.875, "grad_norm_var": 2.11015625, "learning_rate": 0.0001, "loss": 7.2127, "loss/crossentropy": 1.8911981835961342, "loss/hidden": 3.468359375, "loss/jsd": 0.0, "loss/logits": 0.1796493912115693, "step": 4780 }, { "epoch": 0.15966666666666668, "grad_norm": 31.25, "grad_norm_var": 3.073958333333333, "learning_rate": 0.0001, "loss": 7.4219, "loss/crossentropy": 2.0220508128404617, "loss/hidden": 3.42890625, "loss/jsd": 0.0, "loss/logits": 0.20494700744748115, "step": 4790 }, { "epoch": 0.16, "grad_norm": 41.0, "grad_norm_var": 11.120572916666667, "learning_rate": 0.0001, "loss": 7.3432, "loss/crossentropy": 2.122326224297285, "loss/hidden": 3.412109375, "loss/jsd": 0.0, "loss/logits": 0.19381498489528895, "step": 4800 }, { "epoch": 0.16033333333333333, "grad_norm": 28.75, "grad_norm_var": 9.030989583333334, "learning_rate": 0.0001, "loss": 7.4743, "loss/crossentropy": 2.0681777626276014, "loss/hidden": 3.526953125, "loss/jsd": 0.0, "loss/logits": 0.20463107377290726, "step": 4810 }, { "epoch": 0.16066666666666668, "grad_norm": 29.375, "grad_norm_var": 1.8309895833333334, "learning_rate": 0.0001, "loss": 7.3658, "loss/crossentropy": 2.152210661768913, "loss/hidden": 3.52265625, "loss/jsd": 0.0, "loss/logits": 0.20886727850884199, "step": 4820 }, { "epoch": 0.161, "grad_norm": 31.875, "grad_norm_var": 3.4301432291666667, "learning_rate": 0.0001, "loss": 7.497, "loss/crossentropy": 2.109544586390257, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.18917251005768776, "step": 4830 }, { "epoch": 0.16133333333333333, "grad_norm": 30.125, "grad_norm_var": 8.333268229166666, "learning_rate": 0.0001, "loss": 7.4723, "loss/crossentropy": 2.091415516287088, "loss/hidden": 3.3640625, "loss/jsd": 0.0, "loss/logits": 0.19270135853439568, "step": 4840 }, { "epoch": 0.16166666666666665, "grad_norm": 31.0, "grad_norm_var": 7.190559895833333, "learning_rate": 0.0001, "loss": 7.2524, "loss/crossentropy": 2.109858328104019, "loss/hidden": 3.359765625, "loss/jsd": 0.0, "loss/logits": 0.19128049910068512, "step": 4850 }, { "epoch": 0.162, "grad_norm": 28.75, "grad_norm_var": 2.8462890625, "learning_rate": 0.0001, "loss": 7.2315, "loss/crossentropy": 1.8589928612112998, "loss/hidden": 3.32421875, "loss/jsd": 0.0, "loss/logits": 0.17420709859579803, "step": 4860 }, { "epoch": 0.16233333333333333, "grad_norm": 29.625, "grad_norm_var": 9.934309895833334, "learning_rate": 0.0001, "loss": 7.3778, "loss/crossentropy": 1.938297626376152, "loss/hidden": 3.425, "loss/jsd": 0.0, "loss/logits": 0.18825909486040474, "step": 4870 }, { "epoch": 0.16266666666666665, "grad_norm": 30.25, "grad_norm_var": 10.312239583333334, "learning_rate": 0.0001, "loss": 7.2623, "loss/crossentropy": 2.0889032699167727, "loss/hidden": 3.3375, "loss/jsd": 0.0, "loss/logits": 0.19101898428052663, "step": 4880 }, { "epoch": 0.163, "grad_norm": 45.25, "grad_norm_var": 15.93125, "learning_rate": 0.0001, "loss": 7.3132, "loss/crossentropy": 1.9759422302246095, "loss/hidden": 3.40859375, "loss/jsd": 0.0, "loss/logits": 0.18456655219197274, "step": 4890 }, { "epoch": 0.16333333333333333, "grad_norm": 29.125, "grad_norm_var": 16.37265625, "learning_rate": 0.0001, "loss": 7.3572, "loss/crossentropy": 1.9565439254045487, "loss/hidden": 3.448828125, "loss/jsd": 0.0, "loss/logits": 0.19956080857664346, "step": 4900 }, { "epoch": 0.16366666666666665, "grad_norm": 30.625, "grad_norm_var": 2.937239583333333, "learning_rate": 0.0001, "loss": 7.3634, "loss/crossentropy": 1.9473642885684967, "loss/hidden": 3.3859375, "loss/jsd": 0.0, "loss/logits": 0.178626499325037, "step": 4910 }, { "epoch": 0.164, "grad_norm": 30.875, "grad_norm_var": 4.63125, "learning_rate": 0.0001, "loss": 7.4743, "loss/crossentropy": 1.9560952335596085, "loss/hidden": 3.551953125, "loss/jsd": 0.0, "loss/logits": 0.20588978324085475, "step": 4920 }, { "epoch": 0.16433333333333333, "grad_norm": 28.625, "grad_norm_var": 1.8582682291666666, "learning_rate": 0.0001, "loss": 7.595, "loss/crossentropy": 2.018819783627987, "loss/hidden": 3.434765625, "loss/jsd": 0.0, "loss/logits": 0.2051872259005904, "step": 4930 }, { "epoch": 0.16466666666666666, "grad_norm": 29.875, "grad_norm_var": 2.3622395833333334, "learning_rate": 0.0001, "loss": 7.4515, "loss/crossentropy": 1.9606310427188873, "loss/hidden": 3.443359375, "loss/jsd": 0.0, "loss/logits": 0.19167055282741785, "step": 4940 }, { "epoch": 0.165, "grad_norm": 29.125, "grad_norm_var": 2.533072916666667, "learning_rate": 0.0001, "loss": 7.4268, "loss/crossentropy": 1.928441097587347, "loss/hidden": 3.539453125, "loss/jsd": 0.0, "loss/logits": 0.20089686335995793, "step": 4950 }, { "epoch": 0.16533333333333333, "grad_norm": 30.375, "grad_norm_var": 2.14765625, "learning_rate": 0.0001, "loss": 7.3508, "loss/crossentropy": 1.9665434598922729, "loss/hidden": 3.42734375, "loss/jsd": 0.0, "loss/logits": 0.18130797445774077, "step": 4960 }, { "epoch": 0.16566666666666666, "grad_norm": 30.75, "grad_norm_var": 3.468489583333333, "learning_rate": 0.0001, "loss": 7.4512, "loss/crossentropy": 2.080355668067932, "loss/hidden": 3.509375, "loss/jsd": 0.0, "loss/logits": 0.20991535894572735, "step": 4970 }, { "epoch": 0.166, "grad_norm": 29.0, "grad_norm_var": 5.7072265625, "learning_rate": 0.0001, "loss": 7.223, "loss/crossentropy": 2.004133949428797, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.18373343236744405, "step": 4980 }, { "epoch": 0.16633333333333333, "grad_norm": 31.375, "grad_norm_var": 6.067122395833334, "learning_rate": 0.0001, "loss": 7.4039, "loss/crossentropy": 2.026054282486439, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.19996197037398816, "step": 4990 }, { "epoch": 0.16666666666666666, "grad_norm": 29.375, "grad_norm_var": 3.226822916666667, "learning_rate": 0.0001, "loss": 7.4364, "loss/crossentropy": 2.03589124083519, "loss/hidden": 3.508203125, "loss/jsd": 0.0, "loss/logits": 0.21297280862927437, "step": 5000 }, { "epoch": 0.167, "grad_norm": 30.75, "grad_norm_var": 6.6822265625, "learning_rate": 0.0001, "loss": 7.4048, "loss/crossentropy": 2.0024717673659325, "loss/hidden": 3.562109375, "loss/jsd": 0.0, "loss/logits": 0.19518446139991283, "step": 5010 }, { "epoch": 0.16733333333333333, "grad_norm": 30.25, "grad_norm_var": 4.198893229166667, "learning_rate": 0.0001, "loss": 7.4676, "loss/crossentropy": 2.0565529733896257, "loss/hidden": 3.4625, "loss/jsd": 0.0, "loss/logits": 0.19624960534274577, "step": 5020 }, { "epoch": 0.16766666666666666, "grad_norm": 32.5, "grad_norm_var": 4.139583333333333, "learning_rate": 0.0001, "loss": 7.3768, "loss/crossentropy": 1.8958128660917282, "loss/hidden": 3.504296875, "loss/jsd": 0.0, "loss/logits": 0.18569972533732654, "step": 5030 }, { "epoch": 0.168, "grad_norm": 29.5, "grad_norm_var": 3.130989583333333, "learning_rate": 0.0001, "loss": 7.2547, "loss/crossentropy": 1.9598616436123848, "loss/hidden": 3.415625, "loss/jsd": 0.0, "loss/logits": 0.17397645227611064, "step": 5040 }, { "epoch": 0.16833333333333333, "grad_norm": 30.125, "grad_norm_var": 2.7955729166666665, "learning_rate": 0.0001, "loss": 7.3663, "loss/crossentropy": 2.0068357408046724, "loss/hidden": 3.38046875, "loss/jsd": 0.0, "loss/logits": 0.17594784051179885, "step": 5050 }, { "epoch": 0.16866666666666666, "grad_norm": 33.25, "grad_norm_var": 5.05, "learning_rate": 0.0001, "loss": 7.338, "loss/crossentropy": 2.130343735218048, "loss/hidden": 3.367578125, "loss/jsd": 0.0, "loss/logits": 0.19646532330662012, "step": 5060 }, { "epoch": 0.169, "grad_norm": 28.25, "grad_norm_var": 21.246875, "learning_rate": 0.0001, "loss": 7.3955, "loss/crossentropy": 2.081761783361435, "loss/hidden": 3.45703125, "loss/jsd": 0.0, "loss/logits": 0.20056925769895315, "step": 5070 }, { "epoch": 0.16933333333333334, "grad_norm": 31.25, "grad_norm_var": 2.96015625, "learning_rate": 0.0001, "loss": 7.3737, "loss/crossentropy": 2.0474599339067936, "loss/hidden": 3.506640625, "loss/jsd": 0.0, "loss/logits": 0.20129019785672425, "step": 5080 }, { "epoch": 0.16966666666666666, "grad_norm": 29.75, "grad_norm_var": 6.481705729166666, "learning_rate": 0.0001, "loss": 7.3547, "loss/crossentropy": 1.8864617981016636, "loss/hidden": 3.493359375, "loss/jsd": 0.0, "loss/logits": 0.1911185685545206, "step": 5090 }, { "epoch": 0.17, "grad_norm": 28.5, "grad_norm_var": 4.433333333333334, "learning_rate": 0.0001, "loss": 7.3539, "loss/crossentropy": 1.9873776763677597, "loss/hidden": 3.429296875, "loss/jsd": 0.0, "loss/logits": 0.19648409485816956, "step": 5100 }, { "epoch": 0.17033333333333334, "grad_norm": 31.25, "grad_norm_var": 5.862434895833333, "learning_rate": 0.0001, "loss": 7.2765, "loss/crossentropy": 1.844328733533621, "loss/hidden": 3.444140625, "loss/jsd": 0.0, "loss/logits": 0.18084424240514635, "step": 5110 }, { "epoch": 0.17066666666666666, "grad_norm": 26.75, "grad_norm_var": 6.743489583333333, "learning_rate": 0.0001, "loss": 7.2578, "loss/crossentropy": 1.7846877314150333, "loss/hidden": 3.423828125, "loss/jsd": 0.0, "loss/logits": 0.1764072419144213, "step": 5120 }, { "epoch": 0.171, "grad_norm": 32.5, "grad_norm_var": 6.239583333333333, "learning_rate": 0.0001, "loss": 7.4607, "loss/crossentropy": 1.89234356880188, "loss/hidden": 3.44296875, "loss/jsd": 0.0, "loss/logits": 0.18658601325005292, "step": 5130 }, { "epoch": 0.17133333333333334, "grad_norm": 30.5, "grad_norm_var": 3.7379557291666665, "learning_rate": 0.0001, "loss": 7.3643, "loss/crossentropy": 2.0938477486371996, "loss/hidden": 3.400390625, "loss/jsd": 0.0, "loss/logits": 0.19564860872924328, "step": 5140 }, { "epoch": 0.17166666666666666, "grad_norm": 30.75, "grad_norm_var": 1.3104166666666666, "learning_rate": 0.0001, "loss": 7.4057, "loss/crossentropy": 2.0897373229265215, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.1867282001301646, "step": 5150 }, { "epoch": 0.172, "grad_norm": 29.375, "grad_norm_var": 28481.4353515625, "learning_rate": 0.0001, "loss": 7.4076, "loss/crossentropy": 1.9207363367080688, "loss/hidden": 3.61171875, "loss/jsd": 0.0, "loss/logits": 0.28685810351744295, "step": 5160 }, { "epoch": 0.17233333333333334, "grad_norm": 25.625, "grad_norm_var": 23.9322265625, "learning_rate": 0.0001, "loss": 7.3675, "loss/crossentropy": 1.873939599096775, "loss/hidden": 3.535546875, "loss/jsd": 0.0, "loss/logits": 0.19013922344893217, "step": 5170 }, { "epoch": 0.17266666666666666, "grad_norm": 32.25, "grad_norm_var": 9.347330729166666, "learning_rate": 0.0001, "loss": 7.2568, "loss/crossentropy": 2.017928695678711, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.193964634090662, "step": 5180 }, { "epoch": 0.173, "grad_norm": 27.0, "grad_norm_var": 8.343489583333334, "learning_rate": 0.0001, "loss": 7.1721, "loss/crossentropy": 1.8719376981258393, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.18648912850767374, "step": 5190 }, { "epoch": 0.17333333333333334, "grad_norm": 30.75, "grad_norm_var": 1.7416666666666667, "learning_rate": 0.0001, "loss": 7.3347, "loss/crossentropy": 1.8576467014849185, "loss/hidden": 3.60234375, "loss/jsd": 0.0, "loss/logits": 0.2010865157470107, "step": 5200 }, { "epoch": 0.17366666666666666, "grad_norm": 31.125, "grad_norm_var": 4.6166015625, "learning_rate": 0.0001, "loss": 7.3891, "loss/crossentropy": 2.06594540476799, "loss/hidden": 3.50625, "loss/jsd": 0.0, "loss/logits": 0.19884777553379535, "step": 5210 }, { "epoch": 0.174, "grad_norm": 29.125, "grad_norm_var": 7.029166666666667, "learning_rate": 0.0001, "loss": 7.3833, "loss/crossentropy": 2.0358584992587567, "loss/hidden": 3.475390625, "loss/jsd": 0.0, "loss/logits": 0.20064938953146338, "step": 5220 }, { "epoch": 0.17433333333333334, "grad_norm": 30.375, "grad_norm_var": 3.7051432291666666, "learning_rate": 0.0001, "loss": 7.3781, "loss/crossentropy": 2.1645820796489716, "loss/hidden": 3.51015625, "loss/jsd": 0.0, "loss/logits": 0.20085810907185078, "step": 5230 }, { "epoch": 0.17466666666666666, "grad_norm": 30.5, "grad_norm_var": 22245.993684895835, "learning_rate": 0.0001, "loss": 7.485, "loss/crossentropy": 1.9308386735618115, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.18304080897942185, "step": 5240 }, { "epoch": 0.175, "grad_norm": 29.125, "grad_norm_var": 10.624739583333334, "learning_rate": 0.0001, "loss": 7.3798, "loss/crossentropy": 1.9915927998721599, "loss/hidden": 3.430859375, "loss/jsd": 0.0, "loss/logits": 0.18986997604370118, "step": 5250 }, { "epoch": 0.17533333333333334, "grad_norm": 28.75, "grad_norm_var": 5.6634765625, "learning_rate": 0.0001, "loss": 7.3904, "loss/crossentropy": 2.0106333076953886, "loss/hidden": 3.446875, "loss/jsd": 0.0, "loss/logits": 0.1946441799402237, "step": 5260 }, { "epoch": 0.17566666666666667, "grad_norm": 29.625, "grad_norm_var": 1.6080729166666667, "learning_rate": 0.0001, "loss": 7.2941, "loss/crossentropy": 1.8494907207787037, "loss/hidden": 3.509375, "loss/jsd": 0.0, "loss/logits": 0.18036314714699983, "step": 5270 }, { "epoch": 0.176, "grad_norm": 31.0, "grad_norm_var": 3.223372395833333, "learning_rate": 0.0001, "loss": 7.4209, "loss/crossentropy": 1.9802790865302087, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.18280974719673396, "step": 5280 }, { "epoch": 0.17633333333333334, "grad_norm": 28.5, "grad_norm_var": 2.4936848958333333, "learning_rate": 0.0001, "loss": 7.4055, "loss/crossentropy": 2.094613905251026, "loss/hidden": 3.485546875, "loss/jsd": 0.0, "loss/logits": 0.19085414353758096, "step": 5290 }, { "epoch": 0.17666666666666667, "grad_norm": 31.0, "grad_norm_var": 2.9416015625, "learning_rate": 0.0001, "loss": 7.3633, "loss/crossentropy": 2.0021329719573258, "loss/hidden": 3.374609375, "loss/jsd": 0.0, "loss/logits": 0.18117959122173488, "step": 5300 }, { "epoch": 0.177, "grad_norm": 33.0, "grad_norm_var": 1.7962890625, "learning_rate": 0.0001, "loss": 7.3813, "loss/crossentropy": 2.04015157520771, "loss/hidden": 3.474609375, "loss/jsd": 0.0, "loss/logits": 0.19033405743539333, "step": 5310 }, { "epoch": 0.17733333333333334, "grad_norm": 30.5, "grad_norm_var": 2.0520833333333335, "learning_rate": 0.0001, "loss": 7.3902, "loss/crossentropy": 2.047923192381859, "loss/hidden": 3.461328125, "loss/jsd": 0.0, "loss/logits": 0.18923539966344832, "step": 5320 }, { "epoch": 0.17766666666666667, "grad_norm": 33.25, "grad_norm_var": 3.9583333333333335, "learning_rate": 0.0001, "loss": 7.3515, "loss/crossentropy": 2.0359393090009688, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.18528680447489024, "step": 5330 }, { "epoch": 0.178, "grad_norm": 31.375, "grad_norm_var": 2.979622395833333, "learning_rate": 0.0001, "loss": 7.4063, "loss/crossentropy": 2.060121662914753, "loss/hidden": 3.48203125, "loss/jsd": 0.0, "loss/logits": 0.1836360890418291, "step": 5340 }, { "epoch": 0.17833333333333334, "grad_norm": 39.0, "grad_norm_var": 3.132886832423336e+18, "learning_rate": 0.0001, "loss": 7.3706, "loss/crossentropy": 1.9369598262012004, "loss/hidden": 3.403515625, "loss/jsd": 0.0, "loss/logits": 0.18403992811217904, "step": 5350 }, { "epoch": 0.17866666666666667, "grad_norm": 30.375, "grad_norm_var": 5.397221642608347e+18, "learning_rate": 0.0001, "loss": 7.3213, "loss/crossentropy": 1.936340370774269, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.18127203043550252, "step": 5360 }, { "epoch": 0.179, "grad_norm": 30.75, "grad_norm_var": 2.648398030652799e+18, "learning_rate": 0.0001, "loss": 7.3104, "loss/crossentropy": 1.9734927944839, "loss/hidden": 3.49375, "loss/jsd": 0.0, "loss/logits": 0.18735061455518007, "step": 5370 }, { "epoch": 0.17933333333333334, "grad_norm": 31.25, "grad_norm_var": 3.8676432291666667, "learning_rate": 0.0001, "loss": 7.2672, "loss/crossentropy": 1.8653094552457332, "loss/hidden": 3.606640625, "loss/jsd": 0.0, "loss/logits": 0.17940121721476315, "step": 5380 }, { "epoch": 0.17966666666666667, "grad_norm": 31.0, "grad_norm_var": 3.6962890625, "learning_rate": 0.0001, "loss": 7.3552, "loss/crossentropy": 1.9043451480567455, "loss/hidden": 3.483984375, "loss/jsd": 0.0, "loss/logits": 0.20052610374987126, "step": 5390 }, { "epoch": 0.18, "grad_norm": 33.25, "grad_norm_var": 2.169205729166667, "learning_rate": 0.0001, "loss": 7.3057, "loss/crossentropy": 2.0129118353128432, "loss/hidden": 3.551953125, "loss/jsd": 0.0, "loss/logits": 0.20012324377894403, "step": 5400 }, { "epoch": 0.18033333333333335, "grad_norm": 32.5, "grad_norm_var": 4.830989583333333, "learning_rate": 0.0001, "loss": 7.3247, "loss/crossentropy": 1.7881770990788937, "loss/hidden": 3.441015625, "loss/jsd": 0.0, "loss/logits": 0.18300773110240698, "step": 5410 }, { "epoch": 0.18066666666666667, "grad_norm": 36.0, "grad_norm_var": 3.256705729166667, "learning_rate": 0.0001, "loss": 7.4076, "loss/crossentropy": 1.9842897772789, "loss/hidden": 3.564453125, "loss/jsd": 0.0, "loss/logits": 0.21424803081899882, "step": 5420 }, { "epoch": 0.181, "grad_norm": 26.5, "grad_norm_var": 6.137955729166666, "learning_rate": 0.0001, "loss": 7.286, "loss/crossentropy": 1.9534772280603647, "loss/hidden": 3.454296875, "loss/jsd": 0.0, "loss/logits": 0.17796809347346426, "step": 5430 }, { "epoch": 0.18133333333333335, "grad_norm": 31.5, "grad_norm_var": 4.573893229166667, "learning_rate": 0.0001, "loss": 7.4771, "loss/crossentropy": 2.026873242855072, "loss/hidden": 3.38828125, "loss/jsd": 0.0, "loss/logits": 0.19246806176379322, "step": 5440 }, { "epoch": 0.18166666666666667, "grad_norm": 30.5, "grad_norm_var": 4.36015625, "learning_rate": 0.0001, "loss": 7.318, "loss/crossentropy": 1.9746992245316506, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.19005284905433656, "step": 5450 }, { "epoch": 0.182, "grad_norm": 28.875, "grad_norm_var": 2.065625, "learning_rate": 0.0001, "loss": 7.3343, "loss/crossentropy": 1.927635481953621, "loss/hidden": 3.46171875, "loss/jsd": 0.0, "loss/logits": 0.1925207343418151, "step": 5460 }, { "epoch": 0.18233333333333332, "grad_norm": 31.125, "grad_norm_var": 3.1077473958333335, "learning_rate": 0.0001, "loss": 7.3545, "loss/crossentropy": 1.8923965506255627, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.18610275611281396, "step": 5470 }, { "epoch": 0.18266666666666667, "grad_norm": 29.75, "grad_norm_var": 4.97265625, "learning_rate": 0.0001, "loss": 7.2613, "loss/crossentropy": 2.072747530043125, "loss/hidden": 3.39453125, "loss/jsd": 0.0, "loss/logits": 0.18907211050391198, "step": 5480 }, { "epoch": 0.183, "grad_norm": 49.25, "grad_norm_var": 2.703285650488313e+18, "learning_rate": 0.0001, "loss": 7.4121, "loss/crossentropy": 2.0045800119638444, "loss/hidden": 3.383203125, "loss/jsd": 0.0, "loss/logits": 0.20290692001581193, "step": 5490 }, { "epoch": 0.18333333333333332, "grad_norm": 30.0, "grad_norm_var": 2.7032856496182743e+18, "learning_rate": 0.0001, "loss": 7.3229, "loss/crossentropy": 2.0179566562175753, "loss/hidden": 3.423828125, "loss/jsd": 0.0, "loss/logits": 0.1930710466578603, "step": 5500 }, { "epoch": 0.18366666666666667, "grad_norm": 27.625, "grad_norm_var": 2.251822916666667, "learning_rate": 0.0001, "loss": 7.3555, "loss/crossentropy": 2.018584023416042, "loss/hidden": 3.45859375, "loss/jsd": 0.0, "loss/logits": 0.18761964254081248, "step": 5510 }, { "epoch": 0.184, "grad_norm": 29.125, "grad_norm_var": 6.187239583333334, "learning_rate": 0.0001, "loss": 7.3375, "loss/crossentropy": 1.91555445343256, "loss/hidden": 3.495703125, "loss/jsd": 0.0, "loss/logits": 0.21089686956256629, "step": 5520 }, { "epoch": 0.18433333333333332, "grad_norm": 30.25, "grad_norm_var": 4.8837890625, "learning_rate": 0.0001, "loss": 7.3337, "loss/crossentropy": 2.031555511802435, "loss/hidden": 3.453515625, "loss/jsd": 0.0, "loss/logits": 0.18945427071303128, "step": 5530 }, { "epoch": 0.18466666666666667, "grad_norm": 29.875, "grad_norm_var": 1.9296223958333334, "learning_rate": 0.0001, "loss": 7.4364, "loss/crossentropy": 2.052650284767151, "loss/hidden": 3.56484375, "loss/jsd": 0.0, "loss/logits": 0.20762767251580955, "step": 5540 }, { "epoch": 0.185, "grad_norm": 31.75, "grad_norm_var": 6.653125, "learning_rate": 0.0001, "loss": 7.3599, "loss/crossentropy": 2.1282555937767027, "loss/hidden": 3.540625, "loss/jsd": 0.0, "loss/logits": 0.20784689877182244, "step": 5550 }, { "epoch": 0.18533333333333332, "grad_norm": 31.875, "grad_norm_var": 5.094791666666667, "learning_rate": 0.0001, "loss": 7.3537, "loss/crossentropy": 2.0559830352663995, "loss/hidden": 3.46171875, "loss/jsd": 0.0, "loss/logits": 0.1966929741203785, "step": 5560 }, { "epoch": 0.18566666666666667, "grad_norm": 34.75, "grad_norm_var": 2.501822916666667, "learning_rate": 0.0001, "loss": 7.2851, "loss/crossentropy": 2.02881633117795, "loss/hidden": 3.449609375, "loss/jsd": 0.0, "loss/logits": 0.18872611671686174, "step": 5570 }, { "epoch": 0.186, "grad_norm": 34.5, "grad_norm_var": 4.120572916666666, "learning_rate": 0.0001, "loss": 7.2671, "loss/crossentropy": 2.080699609220028, "loss/hidden": 3.493359375, "loss/jsd": 0.0, "loss/logits": 0.19963632430881262, "step": 5580 }, { "epoch": 0.18633333333333332, "grad_norm": 27.5, "grad_norm_var": 3.3744140625, "learning_rate": 0.0001, "loss": 7.3147, "loss/crossentropy": 1.7757218964397907, "loss/hidden": 3.48203125, "loss/jsd": 0.0, "loss/logits": 0.17035862654447556, "step": 5590 }, { "epoch": 0.18666666666666668, "grad_norm": 30.125, "grad_norm_var": 3.4247395833333334, "learning_rate": 0.0001, "loss": 7.485, "loss/crossentropy": 2.069795151799917, "loss/hidden": 3.368359375, "loss/jsd": 0.0, "loss/logits": 0.18994259741157293, "step": 5600 }, { "epoch": 0.187, "grad_norm": 29.0, "grad_norm_var": 1.7333333333333334, "learning_rate": 0.0001, "loss": 7.4504, "loss/crossentropy": 2.0682191736996174, "loss/hidden": 3.555859375, "loss/jsd": 0.0, "loss/logits": 0.21530677266418935, "step": 5610 }, { "epoch": 0.18733333333333332, "grad_norm": 33.5, "grad_norm_var": 2.0952473958333333, "learning_rate": 0.0001, "loss": 7.3842, "loss/crossentropy": 2.0398148208856584, "loss/hidden": 3.465625, "loss/jsd": 0.0, "loss/logits": 0.19317151457071305, "step": 5620 }, { "epoch": 0.18766666666666668, "grad_norm": 29.75, "grad_norm_var": 14.5025390625, "learning_rate": 0.0001, "loss": 7.3125, "loss/crossentropy": 1.9017793960869311, "loss/hidden": 3.428515625, "loss/jsd": 0.0, "loss/logits": 0.17554700216278435, "step": 5630 }, { "epoch": 0.188, "grad_norm": 30.625, "grad_norm_var": 14.926041666666666, "learning_rate": 0.0001, "loss": 7.4144, "loss/crossentropy": 2.078898024559021, "loss/hidden": 3.5046875, "loss/jsd": 0.0, "loss/logits": 0.21142531074583532, "step": 5640 }, { "epoch": 0.18833333333333332, "grad_norm": 27.5, "grad_norm_var": 2.2295632672191764e+18, "learning_rate": 0.0001, "loss": 7.2782, "loss/crossentropy": 1.9787876695394515, "loss/hidden": 3.371875, "loss/jsd": 0.0, "loss/logits": 0.17243995368480683, "step": 5650 }, { "epoch": 0.18866666666666668, "grad_norm": 28.875, "grad_norm_var": 3.5353515625, "learning_rate": 0.0001, "loss": 7.4111, "loss/crossentropy": 1.893690215051174, "loss/hidden": 3.377734375, "loss/jsd": 0.0, "loss/logits": 0.17849329607561232, "step": 5660 }, { "epoch": 0.189, "grad_norm": 30.375, "grad_norm_var": 3.0322265625, "learning_rate": 0.0001, "loss": 7.2889, "loss/crossentropy": 1.8960665926337241, "loss/hidden": 3.46328125, "loss/jsd": 0.0, "loss/logits": 0.19380444549024106, "step": 5670 }, { "epoch": 0.18933333333333333, "grad_norm": 30.5, "grad_norm_var": 36.87291666666667, "learning_rate": 0.0001, "loss": 7.4085, "loss/crossentropy": 2.060232773423195, "loss/hidden": 3.4421875, "loss/jsd": 0.0, "loss/logits": 0.19537428338080645, "step": 5680 }, { "epoch": 0.18966666666666668, "grad_norm": 28.75, "grad_norm_var": 4.095833333333333, "learning_rate": 0.0001, "loss": 7.2736, "loss/crossentropy": 2.0471127212047575, "loss/hidden": 3.530859375, "loss/jsd": 0.0, "loss/logits": 0.20839319732040168, "step": 5690 }, { "epoch": 0.19, "grad_norm": 30.0, "grad_norm_var": 3.70390625, "learning_rate": 0.0001, "loss": 7.3682, "loss/crossentropy": 1.9329538971185685, "loss/hidden": 3.309375, "loss/jsd": 0.0, "loss/logits": 0.16797176375985146, "step": 5700 }, { "epoch": 0.19033333333333333, "grad_norm": 30.625, "grad_norm_var": 27.408072916666665, "learning_rate": 0.0001, "loss": 7.3244, "loss/crossentropy": 2.034928911924362, "loss/hidden": 3.450390625, "loss/jsd": 0.0, "loss/logits": 0.19508280903100966, "step": 5710 }, { "epoch": 0.19066666666666668, "grad_norm": 27.375, "grad_norm_var": 36.09765625, "learning_rate": 0.0001, "loss": 7.2737, "loss/crossentropy": 1.8476089045405388, "loss/hidden": 3.474609375, "loss/jsd": 0.0, "loss/logits": 0.1921651756390929, "step": 5720 }, { "epoch": 0.191, "grad_norm": 30.0, "grad_norm_var": 2.1830729166666667, "learning_rate": 0.0001, "loss": 7.3703, "loss/crossentropy": 1.9578117609024048, "loss/hidden": 3.45703125, "loss/jsd": 0.0, "loss/logits": 0.1835012746974826, "step": 5730 }, { "epoch": 0.19133333333333333, "grad_norm": 32.0, "grad_norm_var": 3.84765625, "learning_rate": 0.0001, "loss": 7.2875, "loss/crossentropy": 1.8632203750312328, "loss/hidden": 3.503125, "loss/jsd": 0.0, "loss/logits": 0.18514488767832518, "step": 5740 }, { "epoch": 0.19166666666666668, "grad_norm": 27.625, "grad_norm_var": 2.42890625, "learning_rate": 0.0001, "loss": 7.371, "loss/crossentropy": 1.9854460023343563, "loss/hidden": 3.379296875, "loss/jsd": 0.0, "loss/logits": 0.18058872912079096, "step": 5750 }, { "epoch": 0.192, "grad_norm": 31.125, "grad_norm_var": 3.936393229166667, "learning_rate": 0.0001, "loss": 7.3803, "loss/crossentropy": 2.0978697665035724, "loss/hidden": 3.395703125, "loss/jsd": 0.0, "loss/logits": 0.209872105717659, "step": 5760 }, { "epoch": 0.19233333333333333, "grad_norm": 32.0, "grad_norm_var": 3.208268229166667, "learning_rate": 0.0001, "loss": 7.4535, "loss/crossentropy": 2.065116219967604, "loss/hidden": 3.575, "loss/jsd": 0.0, "loss/logits": 0.19948470462113618, "step": 5770 }, { "epoch": 0.19266666666666668, "grad_norm": 30.125, "grad_norm_var": 35.37962239583333, "learning_rate": 0.0001, "loss": 7.3409, "loss/crossentropy": 2.14624527990818, "loss/hidden": 3.48828125, "loss/jsd": 0.0, "loss/logits": 0.20528145432472228, "step": 5780 }, { "epoch": 0.193, "grad_norm": 31.5, "grad_norm_var": 38.417643229166664, "learning_rate": 0.0001, "loss": 7.2623, "loss/crossentropy": 2.0024950690567493, "loss/hidden": 3.398046875, "loss/jsd": 0.0, "loss/logits": 0.18867999948561193, "step": 5790 }, { "epoch": 0.19333333333333333, "grad_norm": 27.25, "grad_norm_var": 4.762434895833334, "learning_rate": 0.0001, "loss": 7.268, "loss/crossentropy": 1.9846145622432232, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.16974254972301422, "step": 5800 }, { "epoch": 0.19366666666666665, "grad_norm": 30.625, "grad_norm_var": 2.626822916666667, "learning_rate": 0.0001, "loss": 7.3117, "loss/crossentropy": 1.890152809768915, "loss/hidden": 3.44375, "loss/jsd": 0.0, "loss/logits": 0.1841204732656479, "step": 5810 }, { "epoch": 0.194, "grad_norm": 30.125, "grad_norm_var": 1.6802083333333333, "learning_rate": 0.0001, "loss": 7.3136, "loss/crossentropy": 1.8292512103915215, "loss/hidden": 3.48515625, "loss/jsd": 0.0, "loss/logits": 0.18547479510307313, "step": 5820 }, { "epoch": 0.19433333333333333, "grad_norm": 31.5, "grad_norm_var": 2.871326210624079e+18, "learning_rate": 0.0001, "loss": 7.306, "loss/crossentropy": 1.8880173660814763, "loss/hidden": 3.515234375, "loss/jsd": 0.0, "loss/logits": 0.18677659574896097, "step": 5830 }, { "epoch": 0.19466666666666665, "grad_norm": 30.625, "grad_norm_var": 2.871326210694683e+18, "learning_rate": 0.0001, "loss": 7.3747, "loss/crossentropy": 1.9971879363059997, "loss/hidden": 3.465625, "loss/jsd": 0.0, "loss/logits": 0.2169063776731491, "step": 5840 }, { "epoch": 0.195, "grad_norm": 38.0, "grad_norm_var": 6.53515625, "learning_rate": 0.0001, "loss": 7.3934, "loss/crossentropy": 1.9483453705906868, "loss/hidden": 3.407421875, "loss/jsd": 0.0, "loss/logits": 0.18222760818898678, "step": 5850 }, { "epoch": 0.19533333333333333, "grad_norm": 34.25, "grad_norm_var": 6.166666666666667, "learning_rate": 0.0001, "loss": 7.409, "loss/crossentropy": 2.091171472519636, "loss/hidden": 3.42109375, "loss/jsd": 0.0, "loss/logits": 0.19173091007396578, "step": 5860 }, { "epoch": 0.19566666666666666, "grad_norm": 29.0, "grad_norm_var": 5.058268229166667, "learning_rate": 0.0001, "loss": 7.3592, "loss/crossentropy": 2.046744999289513, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.20100857987999915, "step": 5870 }, { "epoch": 0.196, "grad_norm": 32.75, "grad_norm_var": 4.084309895833333, "learning_rate": 0.0001, "loss": 7.3293, "loss/crossentropy": 2.082516020536423, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.19173082280904055, "step": 5880 }, { "epoch": 0.19633333333333333, "grad_norm": 28.625, "grad_norm_var": 3.626030989548768e+18, "learning_rate": 0.0001, "loss": 7.2048, "loss/crossentropy": 1.945667991042137, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.17691243859007955, "step": 5890 }, { "epoch": 0.19666666666666666, "grad_norm": 32.5, "grad_norm_var": 3.626030989398018e+18, "learning_rate": 0.0001, "loss": 7.4434, "loss/crossentropy": 2.0542997077107428, "loss/hidden": 3.313671875, "loss/jsd": 0.0, "loss/logits": 0.17775741945952178, "step": 5900 }, { "epoch": 0.197, "grad_norm": 30.375, "grad_norm_var": 2.22265625, "learning_rate": 0.0001, "loss": 7.3073, "loss/crossentropy": 1.9394507549703122, "loss/hidden": 3.465234375, "loss/jsd": 0.0, "loss/logits": 0.2011955610476434, "step": 5910 }, { "epoch": 0.19733333333333333, "grad_norm": 27.875, "grad_norm_var": 4.385416666666667, "learning_rate": 0.0001, "loss": 7.2413, "loss/crossentropy": 2.033419676870108, "loss/hidden": 3.416796875, "loss/jsd": 0.0, "loss/logits": 0.18376299599185586, "step": 5920 }, { "epoch": 0.19766666666666666, "grad_norm": 29.0, "grad_norm_var": 4.7119140625, "learning_rate": 0.0001, "loss": 7.2217, "loss/crossentropy": 1.981913087517023, "loss/hidden": 3.657421875, "loss/jsd": 0.0, "loss/logits": 0.2043977279216051, "step": 5930 }, { "epoch": 0.198, "grad_norm": 28.375, "grad_norm_var": 3.4905598958333335, "learning_rate": 0.0001, "loss": 7.4346, "loss/crossentropy": 2.1703044548630714, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.19806311912834645, "step": 5940 }, { "epoch": 0.19833333333333333, "grad_norm": 28.125, "grad_norm_var": 3.2122395833333335, "learning_rate": 0.0001, "loss": 7.2651, "loss/crossentropy": 1.9395112864673139, "loss/hidden": 3.38203125, "loss/jsd": 0.0, "loss/logits": 0.17924010213464497, "step": 5950 }, { "epoch": 0.19866666666666666, "grad_norm": 31.25, "grad_norm_var": 5.6087890625, "learning_rate": 0.0001, "loss": 7.3445, "loss/crossentropy": 2.0103931710124017, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.19979343116283416, "step": 5960 }, { "epoch": 0.199, "grad_norm": 28.0, "grad_norm_var": 5.909309895833333, "learning_rate": 0.0001, "loss": 7.4126, "loss/crossentropy": 1.9055282182991504, "loss/hidden": 3.565625, "loss/jsd": 0.0, "loss/logits": 0.1991718839854002, "step": 5970 }, { "epoch": 0.19933333333333333, "grad_norm": 31.75, "grad_norm_var": 5.3375, "learning_rate": 0.0001, "loss": 7.334, "loss/crossentropy": 2.145095956325531, "loss/hidden": 3.490234375, "loss/jsd": 0.0, "loss/logits": 0.20446721836924553, "step": 5980 }, { "epoch": 0.19966666666666666, "grad_norm": 30.25, "grad_norm_var": 18.624739583333334, "learning_rate": 0.0001, "loss": 7.3692, "loss/crossentropy": 2.0063738718628885, "loss/hidden": 3.417578125, "loss/jsd": 0.0, "loss/logits": 0.1998368751257658, "step": 5990 }, { "epoch": 0.2, "grad_norm": 33.75, "grad_norm_var": 3.321875, "learning_rate": 0.0001, "loss": 7.2548, "loss/crossentropy": 1.7764502555131911, "loss/hidden": 3.317578125, "loss/jsd": 0.0, "loss/logits": 0.1665659283287823, "step": 6000 }, { "epoch": 0.20033333333333334, "grad_norm": 27.5, "grad_norm_var": 32.6369140625, "learning_rate": 0.0001, "loss": 7.3241, "loss/crossentropy": 1.9885497316718102, "loss/hidden": 3.521875, "loss/jsd": 0.0, "loss/logits": 0.1915190517436713, "step": 6010 }, { "epoch": 0.20066666666666666, "grad_norm": 28.75, "grad_norm_var": 34.471875, "learning_rate": 0.0001, "loss": 7.3214, "loss/crossentropy": 1.9869511708617211, "loss/hidden": 3.455859375, "loss/jsd": 0.0, "loss/logits": 0.18836192023009063, "step": 6020 }, { "epoch": 0.201, "grad_norm": 32.0, "grad_norm_var": 51.337239583333336, "learning_rate": 0.0001, "loss": 7.4415, "loss/crossentropy": 2.1181409060955048, "loss/hidden": 3.419140625, "loss/jsd": 0.0, "loss/logits": 0.18881170060485603, "step": 6030 }, { "epoch": 0.20133333333333334, "grad_norm": 28.375, "grad_norm_var": 56.0837890625, "learning_rate": 0.0001, "loss": 7.2331, "loss/crossentropy": 1.9832341864705085, "loss/hidden": 3.50078125, "loss/jsd": 0.0, "loss/logits": 0.191112170368433, "step": 6040 }, { "epoch": 0.20166666666666666, "grad_norm": 29.75, "grad_norm_var": 2.2041015625, "learning_rate": 0.0001, "loss": 7.3901, "loss/crossentropy": 1.8436450406908989, "loss/hidden": 3.478125, "loss/jsd": 0.0, "loss/logits": 0.19769377280026673, "step": 6050 }, { "epoch": 0.202, "grad_norm": 27.0, "grad_norm_var": 2.875, "learning_rate": 0.0001, "loss": 7.2967, "loss/crossentropy": 1.880169765651226, "loss/hidden": 3.358203125, "loss/jsd": 0.0, "loss/logits": 0.17581943878903986, "step": 6060 }, { "epoch": 0.20233333333333334, "grad_norm": 31.375, "grad_norm_var": 3.26015625, "learning_rate": 0.0001, "loss": 7.2969, "loss/crossentropy": 1.7483499720692635, "loss/hidden": 3.45, "loss/jsd": 0.0, "loss/logits": 0.1957072077319026, "step": 6070 }, { "epoch": 0.20266666666666666, "grad_norm": 31.125, "grad_norm_var": 10.3166015625, "learning_rate": 0.0001, "loss": 7.275, "loss/crossentropy": 1.8378936417400837, "loss/hidden": 3.53359375, "loss/jsd": 0.0, "loss/logits": 0.1904732353053987, "step": 6080 }, { "epoch": 0.203, "grad_norm": 31.375, "grad_norm_var": 2.010872395833333, "learning_rate": 0.0001, "loss": 7.3663, "loss/crossentropy": 1.9716134004294872, "loss/hidden": 3.400390625, "loss/jsd": 0.0, "loss/logits": 0.18555732611566783, "step": 6090 }, { "epoch": 0.20333333333333334, "grad_norm": 31.625, "grad_norm_var": 3.441666666666667, "learning_rate": 0.0001, "loss": 7.2864, "loss/crossentropy": 1.969330693781376, "loss/hidden": 3.361328125, "loss/jsd": 0.0, "loss/logits": 0.1776351287961006, "step": 6100 }, { "epoch": 0.20366666666666666, "grad_norm": 31.25, "grad_norm_var": 3.888541666666667, "learning_rate": 0.0001, "loss": 7.2876, "loss/crossentropy": 1.865223766863346, "loss/hidden": 3.605078125, "loss/jsd": 0.0, "loss/logits": 0.1961459070444107, "step": 6110 }, { "epoch": 0.204, "grad_norm": 30.0, "grad_norm_var": 3.278580729166667, "learning_rate": 0.0001, "loss": 7.2675, "loss/crossentropy": 1.9460454016923905, "loss/hidden": 3.325390625, "loss/jsd": 0.0, "loss/logits": 0.17724429341033102, "step": 6120 }, { "epoch": 0.20433333333333334, "grad_norm": 31.5, "grad_norm_var": 9.9994140625, "learning_rate": 0.0001, "loss": 7.4274, "loss/crossentropy": 2.0674363002181053, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.18237694036215543, "step": 6130 }, { "epoch": 0.20466666666666666, "grad_norm": 28.0, "grad_norm_var": 9.646875, "learning_rate": 0.0001, "loss": 7.3852, "loss/crossentropy": 2.0288800582289697, "loss/hidden": 3.471875, "loss/jsd": 0.0, "loss/logits": 0.19911267701536417, "step": 6140 }, { "epoch": 0.205, "grad_norm": 28.25, "grad_norm_var": 1.94765625, "learning_rate": 0.0001, "loss": 7.3639, "loss/crossentropy": 1.9112939991056919, "loss/hidden": 3.438671875, "loss/jsd": 0.0, "loss/logits": 0.18210796043276786, "step": 6150 }, { "epoch": 0.20533333333333334, "grad_norm": 27.75, "grad_norm_var": 2.2681640625, "learning_rate": 0.0001, "loss": 7.3276, "loss/crossentropy": 2.0238750651478767, "loss/hidden": 3.438671875, "loss/jsd": 0.0, "loss/logits": 0.18928587958216667, "step": 6160 }, { "epoch": 0.20566666666666666, "grad_norm": 30.625, "grad_norm_var": 8.153059895833334, "learning_rate": 0.0001, "loss": 7.3289, "loss/crossentropy": 2.01273692548275, "loss/hidden": 3.408984375, "loss/jsd": 0.0, "loss/logits": 0.1841313811019063, "step": 6170 }, { "epoch": 0.206, "grad_norm": 30.0, "grad_norm_var": 6.958333333333333, "learning_rate": 0.0001, "loss": 7.3823, "loss/crossentropy": 2.1281500928103925, "loss/hidden": 3.467578125, "loss/jsd": 0.0, "loss/logits": 0.20614625760354102, "step": 6180 }, { "epoch": 0.20633333333333334, "grad_norm": 29.75, "grad_norm_var": 1.4666015625, "learning_rate": 0.0001, "loss": 7.2801, "loss/crossentropy": 1.8715808756649495, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.179180480632931, "step": 6190 }, { "epoch": 0.20666666666666667, "grad_norm": 31.0, "grad_norm_var": 0.9332682291666666, "learning_rate": 0.0001, "loss": 7.3495, "loss/crossentropy": 1.9199820198118687, "loss/hidden": 3.48046875, "loss/jsd": 0.0, "loss/logits": 0.17939657289534808, "step": 6200 }, { "epoch": 0.207, "grad_norm": 33.5, "grad_norm_var": 4.127018229166667, "learning_rate": 0.0001, "loss": 7.3009, "loss/crossentropy": 1.949132316559553, "loss/hidden": 3.446484375, "loss/jsd": 0.0, "loss/logits": 0.18577333334833385, "step": 6210 }, { "epoch": 0.20733333333333334, "grad_norm": 30.625, "grad_norm_var": 4.646875, "learning_rate": 0.0001, "loss": 7.4682, "loss/crossentropy": 2.08066081777215, "loss/hidden": 3.52265625, "loss/jsd": 0.0, "loss/logits": 0.20601688781753183, "step": 6220 }, { "epoch": 0.20766666666666667, "grad_norm": 30.375, "grad_norm_var": 3.245833333333333, "learning_rate": 0.0001, "loss": 7.4239, "loss/crossentropy": 1.811847558617592, "loss/hidden": 3.518359375, "loss/jsd": 0.0, "loss/logits": 0.18636459894478322, "step": 6230 }, { "epoch": 0.208, "grad_norm": 27.5, "grad_norm_var": 88.42291666666667, "learning_rate": 0.0001, "loss": 7.3899, "loss/crossentropy": 2.132224730402231, "loss/hidden": 3.386328125, "loss/jsd": 0.0, "loss/logits": 0.1857683427631855, "step": 6240 }, { "epoch": 0.20833333333333334, "grad_norm": 29.625, "grad_norm_var": 63.7681640625, "learning_rate": 0.0001, "loss": 7.3067, "loss/crossentropy": 2.131772571802139, "loss/hidden": 3.380859375, "loss/jsd": 0.0, "loss/logits": 0.1882297296077013, "step": 6250 }, { "epoch": 0.20866666666666667, "grad_norm": 43.75, "grad_norm_var": 29.556184895833333, "learning_rate": 0.0001, "loss": 7.2405, "loss/crossentropy": 2.014256003499031, "loss/hidden": 3.44765625, "loss/jsd": 0.0, "loss/logits": 0.19100084751844407, "step": 6260 }, { "epoch": 0.209, "grad_norm": 38.0, "grad_norm_var": 15.995572916666667, "learning_rate": 0.0001, "loss": 7.3157, "loss/crossentropy": 2.039869359135628, "loss/hidden": 3.48125, "loss/jsd": 0.0, "loss/logits": 0.2083653502166271, "step": 6270 }, { "epoch": 0.20933333333333334, "grad_norm": 30.0, "grad_norm_var": 5.902018229166667, "learning_rate": 0.0001, "loss": 7.4018, "loss/crossentropy": 1.9745189398527145, "loss/hidden": 3.399609375, "loss/jsd": 0.0, "loss/logits": 0.185630620457232, "step": 6280 }, { "epoch": 0.20966666666666667, "grad_norm": 29.625, "grad_norm_var": 10.48515625, "learning_rate": 0.0001, "loss": 7.3814, "loss/crossentropy": 1.9497050486505032, "loss/hidden": 3.501953125, "loss/jsd": 0.0, "loss/logits": 0.1931807903572917, "step": 6290 }, { "epoch": 0.21, "grad_norm": 27.75, "grad_norm_var": 48.146875, "learning_rate": 0.0001, "loss": 7.2649, "loss/crossentropy": 2.077213482558727, "loss/hidden": 3.460546875, "loss/jsd": 0.0, "loss/logits": 0.18777444660663606, "step": 6300 }, { "epoch": 0.21033333333333334, "grad_norm": 32.0, "grad_norm_var": 3.499367249490949e+18, "learning_rate": 0.0001, "loss": 7.3138, "loss/crossentropy": 1.9385949403047562, "loss/hidden": 3.499609375, "loss/jsd": 0.0, "loss/logits": 0.19814892020076513, "step": 6310 }, { "epoch": 0.21066666666666667, "grad_norm": 28.125, "grad_norm_var": 11.679166666666667, "learning_rate": 0.0001, "loss": 7.4059, "loss/crossentropy": 2.0608648508787155, "loss/hidden": 3.434375, "loss/jsd": 0.0, "loss/logits": 0.1940284344367683, "step": 6320 }, { "epoch": 0.211, "grad_norm": 29.625, "grad_norm_var": 4.7259765625, "learning_rate": 0.0001, "loss": 7.3703, "loss/crossentropy": 2.0461514562368395, "loss/hidden": 3.4328125, "loss/jsd": 0.0, "loss/logits": 0.18261052761226892, "step": 6330 }, { "epoch": 0.21133333333333335, "grad_norm": 32.5, "grad_norm_var": 4.4837890625, "learning_rate": 0.0001, "loss": 7.4977, "loss/crossentropy": 1.966506128013134, "loss/hidden": 3.463671875, "loss/jsd": 0.0, "loss/logits": 0.2104006376117468, "step": 6340 }, { "epoch": 0.21166666666666667, "grad_norm": 28.625, "grad_norm_var": 3.8087890625, "learning_rate": 0.0001, "loss": 7.2643, "loss/crossentropy": 2.1535836666822434, "loss/hidden": 3.354296875, "loss/jsd": 0.0, "loss/logits": 0.1898369684815407, "step": 6350 }, { "epoch": 0.212, "grad_norm": 30.25, "grad_norm_var": 1.9905598958333333, "learning_rate": 0.0001, "loss": 7.1956, "loss/crossentropy": 1.9430210620164872, "loss/hidden": 3.463671875, "loss/jsd": 0.0, "loss/logits": 0.19202467575669288, "step": 6360 }, { "epoch": 0.21233333333333335, "grad_norm": 29.0, "grad_norm_var": 1.4809895833333333, "learning_rate": 0.0001, "loss": 7.4215, "loss/crossentropy": 2.0876922219991685, "loss/hidden": 3.4890625, "loss/jsd": 0.0, "loss/logits": 0.21219893172383308, "step": 6370 }, { "epoch": 0.21266666666666667, "grad_norm": 30.25, "grad_norm_var": 1.7957682291666666, "learning_rate": 0.0001, "loss": 7.3145, "loss/crossentropy": 1.9804712519049645, "loss/hidden": 3.445703125, "loss/jsd": 0.0, "loss/logits": 0.192524179443717, "step": 6380 }, { "epoch": 0.213, "grad_norm": 29.125, "grad_norm_var": 1.6983723958333334, "learning_rate": 0.0001, "loss": 7.213, "loss/crossentropy": 1.8352730557322503, "loss/hidden": 3.479296875, "loss/jsd": 0.0, "loss/logits": 0.18429453689604997, "step": 6390 }, { "epoch": 0.21333333333333335, "grad_norm": 30.125, "grad_norm_var": 1.3733723958333333, "learning_rate": 0.0001, "loss": 7.2637, "loss/crossentropy": 2.06046840660274, "loss/hidden": 3.471484375, "loss/jsd": 0.0, "loss/logits": 0.19282194171100855, "step": 6400 }, { "epoch": 0.21366666666666667, "grad_norm": 29.125, "grad_norm_var": 1.7942057291666667, "learning_rate": 0.0001, "loss": 7.1694, "loss/crossentropy": 1.891466201096773, "loss/hidden": 3.345703125, "loss/jsd": 0.0, "loss/logits": 0.17797610815614462, "step": 6410 }, { "epoch": 0.214, "grad_norm": 30.0, "grad_norm_var": 3.7660807291666667, "learning_rate": 0.0001, "loss": 7.2893, "loss/crossentropy": 1.8469770409166812, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.18063903097063302, "step": 6420 }, { "epoch": 0.21433333333333332, "grad_norm": 31.75, "grad_norm_var": 2.58125, "learning_rate": 0.0001, "loss": 7.3657, "loss/crossentropy": 1.9388149872422218, "loss/hidden": 3.45625, "loss/jsd": 0.0, "loss/logits": 0.18306618183851242, "step": 6430 }, { "epoch": 0.21466666666666667, "grad_norm": 28.25, "grad_norm_var": 1.7692057291666667, "learning_rate": 0.0001, "loss": 7.2973, "loss/crossentropy": 2.0079730078577995, "loss/hidden": 3.383984375, "loss/jsd": 0.0, "loss/logits": 0.184094506688416, "step": 6440 }, { "epoch": 0.215, "grad_norm": 32.25, "grad_norm_var": 7.4400390625, "learning_rate": 0.0001, "loss": 7.2151, "loss/crossentropy": 1.8909920528531075, "loss/hidden": 3.44609375, "loss/jsd": 0.0, "loss/logits": 0.18189897965639829, "step": 6450 }, { "epoch": 0.21533333333333332, "grad_norm": 30.375, "grad_norm_var": 9.513541666666667, "learning_rate": 0.0001, "loss": 7.3737, "loss/crossentropy": 1.8438223473727704, "loss/hidden": 3.489453125, "loss/jsd": 0.0, "loss/logits": 0.18530392716638744, "step": 6460 }, { "epoch": 0.21566666666666667, "grad_norm": 29.5, "grad_norm_var": 4.8134765625, "learning_rate": 0.0001, "loss": 7.4841, "loss/crossentropy": 2.020700005441904, "loss/hidden": 3.499609375, "loss/jsd": 0.0, "loss/logits": 0.19707159847021102, "step": 6470 }, { "epoch": 0.216, "grad_norm": 31.875, "grad_norm_var": 2.142708333333333, "learning_rate": 0.0001, "loss": 7.5326, "loss/crossentropy": 2.0630332618951797, "loss/hidden": 3.405078125, "loss/jsd": 0.0, "loss/logits": 0.1946509636938572, "step": 6480 }, { "epoch": 0.21633333333333332, "grad_norm": 31.125, "grad_norm_var": 1.33515625, "learning_rate": 0.0001, "loss": 7.3632, "loss/crossentropy": 2.0810040444135667, "loss/hidden": 3.415234375, "loss/jsd": 0.0, "loss/logits": 0.18950448725372554, "step": 6490 }, { "epoch": 0.21666666666666667, "grad_norm": 28.5, "grad_norm_var": 1.2809895833333333, "learning_rate": 0.0001, "loss": 7.3679, "loss/crossentropy": 1.9578623324632645, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.19137522336095572, "step": 6500 }, { "epoch": 0.217, "grad_norm": 28.25, "grad_norm_var": 2.0541015625, "learning_rate": 0.0001, "loss": 7.336, "loss/crossentropy": 1.8093761764466763, "loss/hidden": 3.532421875, "loss/jsd": 0.0, "loss/logits": 0.19337214762344956, "step": 6510 }, { "epoch": 0.21733333333333332, "grad_norm": 27.875, "grad_norm_var": 7.051041666666666, "learning_rate": 0.0001, "loss": 7.2384, "loss/crossentropy": 2.037687784433365, "loss/hidden": 3.462890625, "loss/jsd": 0.0, "loss/logits": 0.19133354183286427, "step": 6520 }, { "epoch": 0.21766666666666667, "grad_norm": 29.75, "grad_norm_var": 3.0497395833333334, "learning_rate": 0.0001, "loss": 7.3143, "loss/crossentropy": 1.7976041294634342, "loss/hidden": 3.4328125, "loss/jsd": 0.0, "loss/logits": 0.17570479568094016, "step": 6530 }, { "epoch": 0.218, "grad_norm": 31.125, "grad_norm_var": 3.627083333333333, "learning_rate": 0.0001, "loss": 7.3231, "loss/crossentropy": 1.8348692402243614, "loss/hidden": 3.523046875, "loss/jsd": 0.0, "loss/logits": 0.18097719233483076, "step": 6540 }, { "epoch": 0.21833333333333332, "grad_norm": 32.75, "grad_norm_var": 1.2833333333333334, "learning_rate": 0.0001, "loss": 7.3818, "loss/crossentropy": 1.9135801509022712, "loss/hidden": 3.50078125, "loss/jsd": 0.0, "loss/logits": 0.1899829575791955, "step": 6550 }, { "epoch": 0.21866666666666668, "grad_norm": 33.25, "grad_norm_var": 2.30625, "learning_rate": 0.0001, "loss": 7.3885, "loss/crossentropy": 1.9283490404486656, "loss/hidden": 3.444921875, "loss/jsd": 0.0, "loss/logits": 0.1905393574386835, "step": 6560 }, { "epoch": 0.219, "grad_norm": 29.25, "grad_norm_var": 1.4832682291666666, "learning_rate": 0.0001, "loss": 7.263, "loss/crossentropy": 2.0127401903271673, "loss/hidden": 3.46328125, "loss/jsd": 0.0, "loss/logits": 0.1945334179326892, "step": 6570 }, { "epoch": 0.21933333333333332, "grad_norm": 29.5, "grad_norm_var": 1.6764973958333333, "learning_rate": 0.0001, "loss": 7.3412, "loss/crossentropy": 2.103078302741051, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.194960336945951, "step": 6580 }, { "epoch": 0.21966666666666668, "grad_norm": 33.25, "grad_norm_var": 4.38125, "learning_rate": 0.0001, "loss": 7.4064, "loss/crossentropy": 1.918302746117115, "loss/hidden": 3.5359375, "loss/jsd": 0.0, "loss/logits": 0.19589321874082088, "step": 6590 }, { "epoch": 0.22, "grad_norm": 31.125, "grad_norm_var": 1.5622564698231844e+18, "learning_rate": 0.0001, "loss": 7.5441, "loss/crossentropy": 2.053830276429653, "loss/hidden": 3.529296875, "loss/jsd": 0.0, "loss/logits": 0.2077662143856287, "step": 6600 }, { "epoch": 0.22033333333333333, "grad_norm": 30.125, "grad_norm_var": 1.5622564698336003e+18, "learning_rate": 0.0001, "loss": 7.3042, "loss/crossentropy": 1.9595064498484134, "loss/hidden": 3.3734375, "loss/jsd": 0.0, "loss/logits": 0.18946341704577208, "step": 6610 }, { "epoch": 0.22066666666666668, "grad_norm": 29.375, "grad_norm_var": 3.120247395833333, "learning_rate": 0.0001, "loss": 7.2245, "loss/crossentropy": 1.7036143302917481, "loss/hidden": 3.44375, "loss/jsd": 0.0, "loss/logits": 0.16726189181208612, "step": 6620 }, { "epoch": 0.221, "grad_norm": 30.625, "grad_norm_var": 2.2738932291666667, "learning_rate": 0.0001, "loss": 7.3502, "loss/crossentropy": 2.0200803853571414, "loss/hidden": 3.412890625, "loss/jsd": 0.0, "loss/logits": 0.19582360396161674, "step": 6630 }, { "epoch": 0.22133333333333333, "grad_norm": 32.5, "grad_norm_var": 2.883072916666667, "learning_rate": 0.0001, "loss": 7.4062, "loss/crossentropy": 2.20606330037117, "loss/hidden": 3.46953125, "loss/jsd": 0.0, "loss/logits": 0.20634721163660288, "step": 6640 }, { "epoch": 0.22166666666666668, "grad_norm": 30.75, "grad_norm_var": 1.43515625, "learning_rate": 0.0001, "loss": 7.4132, "loss/crossentropy": 1.975147820264101, "loss/hidden": 3.466015625, "loss/jsd": 0.0, "loss/logits": 0.19400889575481414, "step": 6650 }, { "epoch": 0.222, "grad_norm": 29.25, "grad_norm_var": 1.3129557291666667, "learning_rate": 0.0001, "loss": 7.385, "loss/crossentropy": 1.9891602620482445, "loss/hidden": 3.297265625, "loss/jsd": 0.0, "loss/logits": 0.1783045081421733, "step": 6660 }, { "epoch": 0.22233333333333333, "grad_norm": 29.875, "grad_norm_var": 1.6275390625, "learning_rate": 0.0001, "loss": 7.4385, "loss/crossentropy": 1.898167619854212, "loss/hidden": 3.4171875, "loss/jsd": 0.0, "loss/logits": 0.18494944917038084, "step": 6670 }, { "epoch": 0.22266666666666668, "grad_norm": 35.25, "grad_norm_var": 9.96015625, "learning_rate": 0.0001, "loss": 7.3739, "loss/crossentropy": 1.9484342962503434, "loss/hidden": 3.50703125, "loss/jsd": 0.0, "loss/logits": 0.19534599408507347, "step": 6680 }, { "epoch": 0.223, "grad_norm": 29.75, "grad_norm_var": 11.58515625, "learning_rate": 0.0001, "loss": 7.3351, "loss/crossentropy": 1.9404854811728, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.18892935551702977, "step": 6690 }, { "epoch": 0.22333333333333333, "grad_norm": 29.75, "grad_norm_var": 1.4872395833333334, "learning_rate": 0.0001, "loss": 7.3311, "loss/crossentropy": 2.0992536753416062, "loss/hidden": 3.49609375, "loss/jsd": 0.0, "loss/logits": 0.20873716473579407, "step": 6700 }, { "epoch": 0.22366666666666668, "grad_norm": 28.5, "grad_norm_var": 3.6332682291666667, "learning_rate": 0.0001, "loss": 7.2208, "loss/crossentropy": 1.9632959835231305, "loss/hidden": 3.456640625, "loss/jsd": 0.0, "loss/logits": 0.19332881616428493, "step": 6710 }, { "epoch": 0.224, "grad_norm": 29.625, "grad_norm_var": 2.0395833333333333, "learning_rate": 0.0001, "loss": 7.3792, "loss/crossentropy": 2.09559326171875, "loss/hidden": 3.497265625, "loss/jsd": 0.0, "loss/logits": 0.22215770855545997, "step": 6720 }, { "epoch": 0.22433333333333333, "grad_norm": 30.25, "grad_norm_var": 2.3018229166666666, "learning_rate": 0.0001, "loss": 7.3044, "loss/crossentropy": 2.010968768596649, "loss/hidden": 3.460546875, "loss/jsd": 0.0, "loss/logits": 0.19315333794802428, "step": 6730 }, { "epoch": 0.22466666666666665, "grad_norm": 31.125, "grad_norm_var": 7.139518229166667, "learning_rate": 0.0001, "loss": 7.2818, "loss/crossentropy": 2.0015950806438925, "loss/hidden": 3.39140625, "loss/jsd": 0.0, "loss/logits": 0.18405240308493376, "step": 6740 }, { "epoch": 0.225, "grad_norm": 27.375, "grad_norm_var": 8.919205729166666, "learning_rate": 0.0001, "loss": 7.3321, "loss/crossentropy": 2.1138732716441155, "loss/hidden": 3.39609375, "loss/jsd": 0.0, "loss/logits": 0.2000407662242651, "step": 6750 }, { "epoch": 0.22533333333333333, "grad_norm": 30.375, "grad_norm_var": 6.335872395833333, "learning_rate": 0.0001, "loss": 7.405, "loss/crossentropy": 2.002257353067398, "loss/hidden": 3.366015625, "loss/jsd": 0.0, "loss/logits": 0.1841747172176838, "step": 6760 }, { "epoch": 0.22566666666666665, "grad_norm": 28.75, "grad_norm_var": 3.161393229166667, "learning_rate": 0.0001, "loss": 7.2154, "loss/crossentropy": 1.7655216559767724, "loss/hidden": 3.530859375, "loss/jsd": 0.0, "loss/logits": 0.1844769461080432, "step": 6770 }, { "epoch": 0.226, "grad_norm": 29.375, "grad_norm_var": 2.2697265625, "learning_rate": 0.0001, "loss": 7.3142, "loss/crossentropy": 1.912297861278057, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.18460046257823706, "step": 6780 }, { "epoch": 0.22633333333333333, "grad_norm": 31.0, "grad_norm_var": 1.9999348958333334, "learning_rate": 0.0001, "loss": 7.4455, "loss/crossentropy": 2.1156294137239455, "loss/hidden": 3.398046875, "loss/jsd": 0.0, "loss/logits": 0.19422024730592966, "step": 6790 }, { "epoch": 0.22666666666666666, "grad_norm": 29.625, "grad_norm_var": 2.5744140625, "learning_rate": 0.0001, "loss": 7.48, "loss/crossentropy": 2.0104686178267004, "loss/hidden": 3.4796875, "loss/jsd": 0.0, "loss/logits": 0.20698846243321894, "step": 6800 }, { "epoch": 0.227, "grad_norm": 30.25, "grad_norm_var": 3.169791666666667, "learning_rate": 0.0001, "loss": 7.255, "loss/crossentropy": 1.9206750318408012, "loss/hidden": 3.35234375, "loss/jsd": 0.0, "loss/logits": 0.1844486749265343, "step": 6810 }, { "epoch": 0.22733333333333333, "grad_norm": 29.625, "grad_norm_var": 2.59140625, "learning_rate": 0.0001, "loss": 7.4141, "loss/crossentropy": 2.018783361464739, "loss/hidden": 3.5375, "loss/jsd": 0.0, "loss/logits": 0.19597072144970298, "step": 6820 }, { "epoch": 0.22766666666666666, "grad_norm": 32.5, "grad_norm_var": 2.1113932291666666, "learning_rate": 0.0001, "loss": 7.2688, "loss/crossentropy": 1.9475677818059922, "loss/hidden": 3.445703125, "loss/jsd": 0.0, "loss/logits": 0.18650611974298953, "step": 6830 }, { "epoch": 0.228, "grad_norm": 28.875, "grad_norm_var": 5.405989583333334, "learning_rate": 0.0001, "loss": 7.2923, "loss/crossentropy": 1.9802114933729171, "loss/hidden": 3.498046875, "loss/jsd": 0.0, "loss/logits": 0.20384657364338638, "step": 6840 }, { "epoch": 0.22833333333333333, "grad_norm": 30.375, "grad_norm_var": 8.486393229166667, "learning_rate": 0.0001, "loss": 7.319, "loss/crossentropy": 1.973275475203991, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.19071720112115145, "step": 6850 }, { "epoch": 0.22866666666666666, "grad_norm": 30.75, "grad_norm_var": 7.258333333333334, "learning_rate": 0.0001, "loss": 7.4568, "loss/crossentropy": 1.9262220814824105, "loss/hidden": 3.3875, "loss/jsd": 0.0, "loss/logits": 0.18887665551155805, "step": 6860 }, { "epoch": 0.229, "grad_norm": 28.25, "grad_norm_var": 3.9358723958333335, "learning_rate": 0.0001, "loss": 7.4488, "loss/crossentropy": 1.910850578546524, "loss/hidden": 3.51171875, "loss/jsd": 0.0, "loss/logits": 0.1951694213785231, "step": 6870 }, { "epoch": 0.22933333333333333, "grad_norm": 40.0, "grad_norm_var": 8.751822916666667, "learning_rate": 0.0001, "loss": 7.2825, "loss/crossentropy": 2.0395314007997514, "loss/hidden": 3.488671875, "loss/jsd": 0.0, "loss/logits": 0.2050021268427372, "step": 6880 }, { "epoch": 0.22966666666666666, "grad_norm": 30.25, "grad_norm_var": 8.251041666666667, "learning_rate": 0.0001, "loss": 7.3104, "loss/crossentropy": 1.852508794516325, "loss/hidden": 3.401171875, "loss/jsd": 0.0, "loss/logits": 0.17959043076261877, "step": 6890 }, { "epoch": 0.23, "grad_norm": 30.25, "grad_norm_var": 1.4785807291666666, "learning_rate": 0.0001, "loss": 7.4023, "loss/crossentropy": 1.9877615801990032, "loss/hidden": 3.49765625, "loss/jsd": 0.0, "loss/logits": 0.19488161485642194, "step": 6900 }, { "epoch": 0.23033333333333333, "grad_norm": 31.75, "grad_norm_var": 2.86640625, "learning_rate": 0.0001, "loss": 7.2581, "loss/crossentropy": 1.8980389423668385, "loss/hidden": 3.450390625, "loss/jsd": 0.0, "loss/logits": 0.1893090806901455, "step": 6910 }, { "epoch": 0.23066666666666666, "grad_norm": 28.625, "grad_norm_var": 12.0509765625, "learning_rate": 0.0001, "loss": 7.2319, "loss/crossentropy": 1.9217184364795685, "loss/hidden": 3.43515625, "loss/jsd": 0.0, "loss/logits": 0.19511478617787362, "step": 6920 }, { "epoch": 0.231, "grad_norm": 39.5, "grad_norm_var": 18.258072916666666, "learning_rate": 0.0001, "loss": 7.3584, "loss/crossentropy": 2.102051305770874, "loss/hidden": 3.42265625, "loss/jsd": 0.0, "loss/logits": 0.1989365177229047, "step": 6930 }, { "epoch": 0.23133333333333334, "grad_norm": 29.125, "grad_norm_var": 16.690625, "learning_rate": 0.0001, "loss": 7.2739, "loss/crossentropy": 1.9943840712308885, "loss/hidden": 3.46328125, "loss/jsd": 0.0, "loss/logits": 0.1889357293024659, "step": 6940 }, { "epoch": 0.23166666666666666, "grad_norm": 36.25, "grad_norm_var": 11.562434895833333, "learning_rate": 0.0001, "loss": 7.3303, "loss/crossentropy": 1.8482569068670274, "loss/hidden": 3.51328125, "loss/jsd": 0.0, "loss/logits": 0.20613361336290836, "step": 6950 }, { "epoch": 0.232, "grad_norm": 30.25, "grad_norm_var": 9.173372395833333, "learning_rate": 0.0001, "loss": 7.3945, "loss/crossentropy": 1.9270221143960953, "loss/hidden": 3.510546875, "loss/jsd": 0.0, "loss/logits": 0.19819915164262056, "step": 6960 }, { "epoch": 0.23233333333333334, "grad_norm": 30.625, "grad_norm_var": 5.056705729166667, "learning_rate": 0.0001, "loss": 7.3302, "loss/crossentropy": 1.9942929029464722, "loss/hidden": 3.40234375, "loss/jsd": 0.0, "loss/logits": 0.18619541339576245, "step": 6970 }, { "epoch": 0.23266666666666666, "grad_norm": 29.5, "grad_norm_var": 3.2747395833333335, "learning_rate": 0.0001, "loss": 7.2886, "loss/crossentropy": 1.7987700045108794, "loss/hidden": 3.498828125, "loss/jsd": 0.0, "loss/logits": 0.18905182252638042, "step": 6980 }, { "epoch": 0.233, "grad_norm": 30.125, "grad_norm_var": 10.946809895833333, "learning_rate": 0.0001, "loss": 7.2407, "loss/crossentropy": 1.8742084830999375, "loss/hidden": 3.492578125, "loss/jsd": 0.0, "loss/logits": 0.19347118120640516, "step": 6990 }, { "epoch": 0.23333333333333334, "grad_norm": 31.125, "grad_norm_var": 9.051822916666667, "learning_rate": 0.0001, "loss": 7.3667, "loss/crossentropy": 1.8891173392534255, "loss/hidden": 3.460546875, "loss/jsd": 0.0, "loss/logits": 0.18332072868943214, "step": 7000 }, { "epoch": 0.23366666666666666, "grad_norm": 26.75, "grad_norm_var": 10.78125, "learning_rate": 0.0001, "loss": 7.405, "loss/crossentropy": 1.9603838928043842, "loss/hidden": 3.533203125, "loss/jsd": 0.0, "loss/logits": 0.18752118339762092, "step": 7010 }, { "epoch": 0.234, "grad_norm": 31.125, "grad_norm_var": 5.297916666666667, "learning_rate": 0.0001, "loss": 7.2918, "loss/crossentropy": 2.131310170888901, "loss/hidden": 3.426953125, "loss/jsd": 0.0, "loss/logits": 0.19405119232833384, "step": 7020 }, { "epoch": 0.23433333333333334, "grad_norm": 32.25, "grad_norm_var": 2.630989583333333, "learning_rate": 0.0001, "loss": 7.3856, "loss/crossentropy": 2.019246442615986, "loss/hidden": 3.623046875, "loss/jsd": 0.0, "loss/logits": 0.207649864256382, "step": 7030 }, { "epoch": 0.23466666666666666, "grad_norm": 32.75, "grad_norm_var": 9.786458333333334, "learning_rate": 0.0001, "loss": 7.3111, "loss/crossentropy": 2.064815706759691, "loss/hidden": 3.394140625, "loss/jsd": 0.0, "loss/logits": 0.194681753590703, "step": 7040 }, { "epoch": 0.235, "grad_norm": 32.75, "grad_norm_var": 6.3572265625, "learning_rate": 0.0001, "loss": 7.3827, "loss/crossentropy": 1.979122743010521, "loss/hidden": 3.4859375, "loss/jsd": 0.0, "loss/logits": 0.19751068884506823, "step": 7050 }, { "epoch": 0.23533333333333334, "grad_norm": 31.875, "grad_norm_var": 4.62265625, "learning_rate": 0.0001, "loss": 7.3785, "loss/crossentropy": 1.8898464910686017, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.18709402102977038, "step": 7060 }, { "epoch": 0.23566666666666666, "grad_norm": 33.5, "grad_norm_var": 6.995833333333334, "learning_rate": 0.0001, "loss": 7.3671, "loss/crossentropy": 2.1073614060878754, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.1862022250890732, "step": 7070 }, { "epoch": 0.236, "grad_norm": 31.125, "grad_norm_var": 3.85390625, "learning_rate": 0.0001, "loss": 7.2427, "loss/crossentropy": 2.099861590564251, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.19578987415879964, "step": 7080 }, { "epoch": 0.23633333333333334, "grad_norm": 27.0, "grad_norm_var": 9.3509765625, "learning_rate": 0.0001, "loss": 7.272, "loss/crossentropy": 2.109695206582546, "loss/hidden": 3.386328125, "loss/jsd": 0.0, "loss/logits": 0.21443557739257812, "step": 7090 }, { "epoch": 0.23666666666666666, "grad_norm": 27.0, "grad_norm_var": 3.754946527641364e+18, "learning_rate": 0.0001, "loss": 7.3467, "loss/crossentropy": 2.0120409660041334, "loss/hidden": 3.380859375, "loss/jsd": 0.0, "loss/logits": 0.17846396705135703, "step": 7100 }, { "epoch": 0.237, "grad_norm": 28.5, "grad_norm_var": 3.0737770866871547e+18, "learning_rate": 0.0001, "loss": 7.3718, "loss/crossentropy": 1.9359865628182888, "loss/hidden": 3.4390625, "loss/jsd": 0.0, "loss/logits": 0.18694842401891948, "step": 7110 }, { "epoch": 0.23733333333333334, "grad_norm": 28.625, "grad_norm_var": 6.5875, "learning_rate": 0.0001, "loss": 7.1279, "loss/crossentropy": 1.8954229757189751, "loss/hidden": 3.439453125, "loss/jsd": 0.0, "loss/logits": 0.19327899385243655, "step": 7120 }, { "epoch": 0.23766666666666666, "grad_norm": 29.875, "grad_norm_var": 2.3223307291666666, "learning_rate": 0.0001, "loss": 7.354, "loss/crossentropy": 1.967520573735237, "loss/hidden": 3.366796875, "loss/jsd": 0.0, "loss/logits": 0.18168523479253054, "step": 7130 }, { "epoch": 0.238, "grad_norm": 30.875, "grad_norm_var": 1.4372395833333333, "learning_rate": 0.0001, "loss": 7.4303, "loss/crossentropy": 2.062156692147255, "loss/hidden": 3.462109375, "loss/jsd": 0.0, "loss/logits": 0.210116427205503, "step": 7140 }, { "epoch": 0.23833333333333334, "grad_norm": 30.375, "grad_norm_var": 2.1624348958333335, "learning_rate": 0.0001, "loss": 7.4864, "loss/crossentropy": 2.157193061709404, "loss/hidden": 3.48671875, "loss/jsd": 0.0, "loss/logits": 0.20984416529536248, "step": 7150 }, { "epoch": 0.23866666666666667, "grad_norm": 30.375, "grad_norm_var": 2.161393229166667, "learning_rate": 0.0001, "loss": 7.4228, "loss/crossentropy": 2.053968481719494, "loss/hidden": 3.36328125, "loss/jsd": 0.0, "loss/logits": 0.18599526956677437, "step": 7160 }, { "epoch": 0.239, "grad_norm": 26.375, "grad_norm_var": 3.61015625, "learning_rate": 0.0001, "loss": 7.2267, "loss/crossentropy": 1.947546560317278, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.18496082462370395, "step": 7170 }, { "epoch": 0.23933333333333334, "grad_norm": 30.5, "grad_norm_var": 1.7789922008484544e+18, "learning_rate": 0.0001, "loss": 7.261, "loss/crossentropy": 2.036028765141964, "loss/hidden": 3.42578125, "loss/jsd": 0.0, "loss/logits": 0.1958589216694236, "step": 7180 }, { "epoch": 0.23966666666666667, "grad_norm": 31.0, "grad_norm_var": 11.690559895833333, "learning_rate": 0.0001, "loss": 7.305, "loss/crossentropy": 1.866253786534071, "loss/hidden": 3.323828125, "loss/jsd": 0.0, "loss/logits": 0.16376810520887375, "step": 7190 }, { "epoch": 0.24, "grad_norm": 32.5, "grad_norm_var": 2.17890625, "learning_rate": 0.0001, "loss": 7.4379, "loss/crossentropy": 1.96115712672472, "loss/hidden": 3.400390625, "loss/jsd": 0.0, "loss/logits": 0.192328049428761, "step": 7200 }, { "epoch": 0.24033333333333334, "grad_norm": 29.25, "grad_norm_var": 1.6634765625, "learning_rate": 0.0001, "loss": 7.2315, "loss/crossentropy": 1.9648781597614289, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.18070857264101506, "step": 7210 }, { "epoch": 0.24066666666666667, "grad_norm": 30.375, "grad_norm_var": 4.070572916666666, "learning_rate": 0.0001, "loss": 7.3206, "loss/crossentropy": 2.0259562619030476, "loss/hidden": 3.47421875, "loss/jsd": 0.0, "loss/logits": 0.19872309640049934, "step": 7220 }, { "epoch": 0.241, "grad_norm": 30.875, "grad_norm_var": 6.051497395833334, "learning_rate": 0.0001, "loss": 7.3024, "loss/crossentropy": 1.8907338812947274, "loss/hidden": 3.330859375, "loss/jsd": 0.0, "loss/logits": 0.1759904881939292, "step": 7230 }, { "epoch": 0.24133333333333334, "grad_norm": 29.875, "grad_norm_var": 3.607291666666667, "learning_rate": 0.0001, "loss": 7.3385, "loss/crossentropy": 2.0046985894441605, "loss/hidden": 3.39765625, "loss/jsd": 0.0, "loss/logits": 0.19136561723425985, "step": 7240 }, { "epoch": 0.24166666666666667, "grad_norm": 30.25, "grad_norm_var": 2.6479166666666667, "learning_rate": 0.0001, "loss": 7.365, "loss/crossentropy": 2.0102310717105865, "loss/hidden": 3.367578125, "loss/jsd": 0.0, "loss/logits": 0.1963226730003953, "step": 7250 }, { "epoch": 0.242, "grad_norm": 30.625, "grad_norm_var": 2.263541666666667, "learning_rate": 0.0001, "loss": 7.2282, "loss/crossentropy": 1.8803921200335025, "loss/hidden": 3.383984375, "loss/jsd": 0.0, "loss/logits": 0.1832737321034074, "step": 7260 }, { "epoch": 0.24233333333333335, "grad_norm": 33.5, "grad_norm_var": 6.659375, "learning_rate": 0.0001, "loss": 7.3217, "loss/crossentropy": 1.9700173437595367, "loss/hidden": 3.49140625, "loss/jsd": 0.0, "loss/logits": 0.19216696545481682, "step": 7270 }, { "epoch": 0.24266666666666667, "grad_norm": 27.875, "grad_norm_var": 7.0353515625, "learning_rate": 0.0001, "loss": 7.3395, "loss/crossentropy": 2.068162256106734, "loss/hidden": 3.38828125, "loss/jsd": 0.0, "loss/logits": 0.19025424188002943, "step": 7280 }, { "epoch": 0.243, "grad_norm": 27.375, "grad_norm_var": 3.457291666666667, "learning_rate": 0.0001, "loss": 7.3684, "loss/crossentropy": 1.8937971897423267, "loss/hidden": 3.33125, "loss/jsd": 0.0, "loss/logits": 0.17642544778063893, "step": 7290 }, { "epoch": 0.24333333333333335, "grad_norm": 30.5, "grad_norm_var": 8.23125, "learning_rate": 0.0001, "loss": 7.227, "loss/crossentropy": 2.0714763421565294, "loss/hidden": 3.405859375, "loss/jsd": 0.0, "loss/logits": 0.18636261858046055, "step": 7300 }, { "epoch": 0.24366666666666667, "grad_norm": 32.5, "grad_norm_var": 1.86875, "learning_rate": 0.0001, "loss": 7.3551, "loss/crossentropy": 1.9897849723696708, "loss/hidden": 3.48203125, "loss/jsd": 0.0, "loss/logits": 0.2001850029453635, "step": 7310 }, { "epoch": 0.244, "grad_norm": 34.5, "grad_norm_var": 19.971875, "learning_rate": 0.0001, "loss": 7.3407, "loss/crossentropy": 1.9122483849525451, "loss/hidden": 3.50390625, "loss/jsd": 0.0, "loss/logits": 0.21700796904042363, "step": 7320 }, { "epoch": 0.24433333333333335, "grad_norm": 35.0, "grad_norm_var": 20.904166666666665, "learning_rate": 0.0001, "loss": 7.4534, "loss/crossentropy": 2.123218533396721, "loss/hidden": 3.47734375, "loss/jsd": 0.0, "loss/logits": 0.2062240641564131, "step": 7330 }, { "epoch": 0.24466666666666667, "grad_norm": 31.125, "grad_norm_var": 11.270833333333334, "learning_rate": 0.0001, "loss": 7.3618, "loss/crossentropy": 1.9315256215631962, "loss/hidden": 3.446484375, "loss/jsd": 0.0, "loss/logits": 0.18345213318243622, "step": 7340 }, { "epoch": 0.245, "grad_norm": 30.25, "grad_norm_var": 10.262239583333333, "learning_rate": 0.0001, "loss": 7.3964, "loss/crossentropy": 1.8020236909389495, "loss/hidden": 3.518359375, "loss/jsd": 0.0, "loss/logits": 0.1893236802890897, "step": 7350 }, { "epoch": 0.24533333333333332, "grad_norm": 30.625, "grad_norm_var": 1.2983723958333333, "learning_rate": 0.0001, "loss": 7.3563, "loss/crossentropy": 1.9838907152414322, "loss/hidden": 3.435546875, "loss/jsd": 0.0, "loss/logits": 0.18070107288658618, "step": 7360 }, { "epoch": 0.24566666666666667, "grad_norm": 30.0, "grad_norm_var": 5.038541666666666, "learning_rate": 0.0001, "loss": 7.3635, "loss/crossentropy": 1.9865519054234029, "loss/hidden": 3.495703125, "loss/jsd": 0.0, "loss/logits": 0.19929917454719542, "step": 7370 }, { "epoch": 0.246, "grad_norm": 33.5, "grad_norm_var": 7.249739583333334, "learning_rate": 0.0001, "loss": 7.3496, "loss/crossentropy": 1.8449418544769287, "loss/hidden": 3.493359375, "loss/jsd": 0.0, "loss/logits": 0.19202936105430127, "step": 7380 }, { "epoch": 0.24633333333333332, "grad_norm": 28.0, "grad_norm_var": 5.972916666666666, "learning_rate": 0.0001, "loss": 7.2578, "loss/crossentropy": 2.053005726635456, "loss/hidden": 3.51953125, "loss/jsd": 0.0, "loss/logits": 0.20847741970792413, "step": 7390 }, { "epoch": 0.24666666666666667, "grad_norm": 27.5, "grad_norm_var": 3.3385416666666665, "learning_rate": 0.0001, "loss": 7.2673, "loss/crossentropy": 1.9644143544137478, "loss/hidden": 3.34609375, "loss/jsd": 0.0, "loss/logits": 0.17471248637884856, "step": 7400 }, { "epoch": 0.247, "grad_norm": 29.0, "grad_norm_var": 1.1546223958333333, "learning_rate": 0.0001, "loss": 7.2628, "loss/crossentropy": 2.1313725471496583, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.19471158571541308, "step": 7410 }, { "epoch": 0.24733333333333332, "grad_norm": 32.75, "grad_norm_var": 2.7768229166666667, "learning_rate": 0.0001, "loss": 7.2851, "loss/crossentropy": 1.904316257685423, "loss/hidden": 3.415625, "loss/jsd": 0.0, "loss/logits": 0.18897277116775513, "step": 7420 }, { "epoch": 0.24766666666666667, "grad_norm": 32.25, "grad_norm_var": 5.80390625, "learning_rate": 0.0001, "loss": 7.3904, "loss/crossentropy": 1.9632198244333268, "loss/hidden": 3.4703125, "loss/jsd": 0.0, "loss/logits": 0.20001174192875623, "step": 7430 }, { "epoch": 0.248, "grad_norm": 33.25, "grad_norm_var": 6.3125, "learning_rate": 0.0001, "loss": 7.3558, "loss/crossentropy": 1.9358988516032696, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.19074198668822645, "step": 7440 }, { "epoch": 0.24833333333333332, "grad_norm": 30.375, "grad_norm_var": 5.476822916666666, "learning_rate": 0.0001, "loss": 7.2968, "loss/crossentropy": 1.9899767950177192, "loss/hidden": 3.314453125, "loss/jsd": 0.0, "loss/logits": 0.1809317298233509, "step": 7450 }, { "epoch": 0.24866666666666667, "grad_norm": 27.375, "grad_norm_var": 3.5308220784435354e+18, "learning_rate": 0.0001, "loss": 7.2952, "loss/crossentropy": 1.9510671958327293, "loss/hidden": 3.45625, "loss/jsd": 0.0, "loss/logits": 0.21452984046190976, "step": 7460 }, { "epoch": 0.249, "grad_norm": 29.75, "grad_norm_var": 27.307747395833335, "learning_rate": 0.0001, "loss": 7.2243, "loss/crossentropy": 2.0617476493120193, "loss/hidden": 3.42890625, "loss/jsd": 0.0, "loss/logits": 0.1929461531341076, "step": 7470 }, { "epoch": 0.24933333333333332, "grad_norm": 32.25, "grad_norm_var": 27.96015625, "learning_rate": 0.0001, "loss": 7.3216, "loss/crossentropy": 2.0663102462887766, "loss/hidden": 3.444921875, "loss/jsd": 0.0, "loss/logits": 0.19353907331824302, "step": 7480 }, { "epoch": 0.24966666666666668, "grad_norm": 35.5, "grad_norm_var": 35.0291015625, "learning_rate": 0.0001, "loss": 7.2194, "loss/crossentropy": 1.9002568796277046, "loss/hidden": 3.502734375, "loss/jsd": 0.0, "loss/logits": 0.1915289015509188, "step": 7490 }, { "epoch": 0.25, "grad_norm": 30.125, "grad_norm_var": 34.30618489583333, "learning_rate": 0.0001, "loss": 7.3825, "loss/crossentropy": 1.9785274967551232, "loss/hidden": 3.43046875, "loss/jsd": 0.0, "loss/logits": 0.19954648409038783, "step": 7500 }, { "epoch": 0.25033333333333335, "grad_norm": 28.75, "grad_norm_var": 2.5863932291666667, "learning_rate": 0.0001, "loss": 7.2345, "loss/crossentropy": 2.0088417567312717, "loss/hidden": 3.45, "loss/jsd": 0.0, "loss/logits": 0.18816519435495138, "step": 7510 }, { "epoch": 0.25066666666666665, "grad_norm": 29.875, "grad_norm_var": 2.635872395833333, "learning_rate": 0.0001, "loss": 7.4131, "loss/crossentropy": 2.0666195660829545, "loss/hidden": 3.4171875, "loss/jsd": 0.0, "loss/logits": 0.18278340790420772, "step": 7520 }, { "epoch": 0.251, "grad_norm": 28.875, "grad_norm_var": 1.5447916666666666, "learning_rate": 0.0001, "loss": 7.2734, "loss/crossentropy": 1.8732483729720115, "loss/hidden": 3.379296875, "loss/jsd": 0.0, "loss/logits": 0.18512406963855027, "step": 7530 }, { "epoch": 0.25133333333333335, "grad_norm": 31.25, "grad_norm_var": 2.9302083333333333, "learning_rate": 0.0001, "loss": 7.3634, "loss/crossentropy": 2.033722445368767, "loss/hidden": 3.455078125, "loss/jsd": 0.0, "loss/logits": 0.19375329315662385, "step": 7540 }, { "epoch": 0.25166666666666665, "grad_norm": 29.75, "grad_norm_var": 5.15390625, "learning_rate": 0.0001, "loss": 7.3466, "loss/crossentropy": 1.9976776465773582, "loss/hidden": 3.368359375, "loss/jsd": 0.0, "loss/logits": 0.1890201298519969, "step": 7550 }, { "epoch": 0.252, "grad_norm": 28.0, "grad_norm_var": 1.7583333333333333, "learning_rate": 0.0001, "loss": 7.2496, "loss/crossentropy": 1.987569659948349, "loss/hidden": 3.388671875, "loss/jsd": 0.0, "loss/logits": 0.18123315144330263, "step": 7560 }, { "epoch": 0.25233333333333335, "grad_norm": 27.0, "grad_norm_var": 28.96640625, "learning_rate": 0.0001, "loss": 7.3994, "loss/crossentropy": 2.0847848281264305, "loss/hidden": 3.526953125, "loss/jsd": 0.0, "loss/logits": 0.22442668601870536, "step": 7570 }, { "epoch": 0.25266666666666665, "grad_norm": 32.0, "grad_norm_var": 16.585872395833334, "learning_rate": 0.0001, "loss": 7.2715, "loss/crossentropy": 1.939840593934059, "loss/hidden": 3.461328125, "loss/jsd": 0.0, "loss/logits": 0.18043840080499648, "step": 7580 }, { "epoch": 0.253, "grad_norm": 28.75, "grad_norm_var": 5.293684895833334, "learning_rate": 0.0001, "loss": 7.1627, "loss/crossentropy": 1.9052984058856963, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.17015632186084986, "step": 7590 }, { "epoch": 0.25333333333333335, "grad_norm": 28.125, "grad_norm_var": 4.480989583333334, "learning_rate": 0.0001, "loss": 7.258, "loss/crossentropy": 1.9569363608956336, "loss/hidden": 3.457421875, "loss/jsd": 0.0, "loss/logits": 0.18446199083700776, "step": 7600 }, { "epoch": 0.25366666666666665, "grad_norm": 44.5, "grad_norm_var": 2.4607245902264924e+18, "learning_rate": 0.0001, "loss": 7.2972, "loss/crossentropy": 1.9432177975773812, "loss/hidden": 3.41640625, "loss/jsd": 0.0, "loss/logits": 0.18713300973176955, "step": 7610 }, { "epoch": 0.254, "grad_norm": 35.75, "grad_norm_var": 2.460724588246047e+18, "learning_rate": 0.0001, "loss": 7.3374, "loss/crossentropy": 1.889748415350914, "loss/hidden": 3.560546875, "loss/jsd": 0.0, "loss/logits": 0.19464636873453856, "step": 7620 }, { "epoch": 0.25433333333333336, "grad_norm": 30.625, "grad_norm_var": 5.033072916666667, "learning_rate": 0.0001, "loss": 7.3526, "loss/crossentropy": 2.1514240980148314, "loss/hidden": 3.523828125, "loss/jsd": 0.0, "loss/logits": 0.20267823729664086, "step": 7630 }, { "epoch": 0.25466666666666665, "grad_norm": 33.0, "grad_norm_var": 5.110416666666667, "learning_rate": 0.0001, "loss": 7.3412, "loss/crossentropy": 1.935009729862213, "loss/hidden": 3.550390625, "loss/jsd": 0.0, "loss/logits": 0.1941109588369727, "step": 7640 }, { "epoch": 0.255, "grad_norm": 29.0, "grad_norm_var": 6.605989583333334, "learning_rate": 0.0001, "loss": 7.2934, "loss/crossentropy": 2.0410706609487534, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.20210143011063336, "step": 7650 }, { "epoch": 0.25533333333333336, "grad_norm": 29.25, "grad_norm_var": 3.00390625, "learning_rate": 0.0001, "loss": 7.2023, "loss/crossentropy": 1.9123263359069824, "loss/hidden": 3.44609375, "loss/jsd": 0.0, "loss/logits": 0.1924495290964842, "step": 7660 }, { "epoch": 0.25566666666666665, "grad_norm": 32.0, "grad_norm_var": 2.9861680010809226e+18, "learning_rate": 0.0001, "loss": 7.2859, "loss/crossentropy": 2.0652717113494874, "loss/hidden": 3.334765625, "loss/jsd": 0.0, "loss/logits": 0.18412147276103497, "step": 7670 }, { "epoch": 0.256, "grad_norm": 30.5, "grad_norm_var": 3.678580729166667, "learning_rate": 0.0001, "loss": 7.4117, "loss/crossentropy": 2.0581321813166142, "loss/hidden": 3.48828125, "loss/jsd": 0.0, "loss/logits": 0.19974921364337206, "step": 7680 }, { "epoch": 0.25633333333333336, "grad_norm": 28.625, "grad_norm_var": 4.483072916666667, "learning_rate": 0.0001, "loss": 7.4104, "loss/crossentropy": 2.047048556804657, "loss/hidden": 3.462109375, "loss/jsd": 0.0, "loss/logits": 0.20386108793318272, "step": 7690 }, { "epoch": 0.25666666666666665, "grad_norm": 33.75, "grad_norm_var": 3.6104166666666666, "learning_rate": 0.0001, "loss": 7.2871, "loss/crossentropy": 2.0517066448926924, "loss/hidden": 3.346484375, "loss/jsd": 0.0, "loss/logits": 0.19028980899602174, "step": 7700 }, { "epoch": 0.257, "grad_norm": 27.875, "grad_norm_var": 2.7462890625, "learning_rate": 0.0001, "loss": 7.3147, "loss/crossentropy": 2.1791652791202067, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.18664524629712104, "step": 7710 }, { "epoch": 0.25733333333333336, "grad_norm": 29.875, "grad_norm_var": 5.0494140625, "learning_rate": 0.0001, "loss": 7.3238, "loss/crossentropy": 2.1375704288482664, "loss/hidden": 3.405078125, "loss/jsd": 0.0, "loss/logits": 0.201789260096848, "step": 7720 }, { "epoch": 0.25766666666666665, "grad_norm": 28.5, "grad_norm_var": 5.6197265625, "learning_rate": 0.0001, "loss": 7.4161, "loss/crossentropy": 1.9807209730148316, "loss/hidden": 3.5125, "loss/jsd": 0.0, "loss/logits": 0.18879797048866748, "step": 7730 }, { "epoch": 0.258, "grad_norm": 32.75, "grad_norm_var": 5.491666666666666, "learning_rate": 0.0001, "loss": 7.4437, "loss/crossentropy": 1.8658494256436824, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.18538361974060535, "step": 7740 }, { "epoch": 0.25833333333333336, "grad_norm": 28.5, "grad_norm_var": 13.5962890625, "learning_rate": 0.0001, "loss": 7.2948, "loss/crossentropy": 2.048648712784052, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.1845292759127915, "step": 7750 }, { "epoch": 0.25866666666666666, "grad_norm": 31.375, "grad_norm_var": 5.664322916666666, "learning_rate": 0.0001, "loss": 7.501, "loss/crossentropy": 1.9704545319080353, "loss/hidden": 3.4796875, "loss/jsd": 0.0, "loss/logits": 0.20459708366543056, "step": 7760 }, { "epoch": 0.259, "grad_norm": 29.125, "grad_norm_var": 4.0103515625, "learning_rate": 0.0001, "loss": 7.45, "loss/crossentropy": 1.9425039686262608, "loss/hidden": 3.389453125, "loss/jsd": 0.0, "loss/logits": 0.1815475813113153, "step": 7770 }, { "epoch": 0.25933333333333336, "grad_norm": 29.375, "grad_norm_var": 2.3843098958333333, "learning_rate": 0.0001, "loss": 7.3397, "loss/crossentropy": 1.8276194736361504, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.18503880705684422, "step": 7780 }, { "epoch": 0.25966666666666666, "grad_norm": 30.375, "grad_norm_var": 2.9833333333333334, "learning_rate": 0.0001, "loss": 7.4588, "loss/crossentropy": 2.063306473195553, "loss/hidden": 3.55390625, "loss/jsd": 0.0, "loss/logits": 0.1996358722448349, "step": 7790 }, { "epoch": 0.26, "grad_norm": 29.375, "grad_norm_var": 3.833268229166667, "learning_rate": 0.0001, "loss": 7.3457, "loss/crossentropy": 1.9770326487720014, "loss/hidden": 3.44921875, "loss/jsd": 0.0, "loss/logits": 0.18442671336233615, "step": 7800 }, { "epoch": 0.26033333333333336, "grad_norm": 28.5, "grad_norm_var": 11.9125, "learning_rate": 0.0001, "loss": 7.3145, "loss/crossentropy": 2.0291145533323287, "loss/hidden": 3.43984375, "loss/jsd": 0.0, "loss/logits": 0.19249846283346414, "step": 7810 }, { "epoch": 0.26066666666666666, "grad_norm": 29.5, "grad_norm_var": 1.7051432291666666, "learning_rate": 0.0001, "loss": 7.3419, "loss/crossentropy": 1.9219007447361947, "loss/hidden": 3.451171875, "loss/jsd": 0.0, "loss/logits": 0.18752799583598972, "step": 7820 }, { "epoch": 0.261, "grad_norm": 28.25, "grad_norm_var": 4.77890625, "learning_rate": 0.0001, "loss": 7.3201, "loss/crossentropy": 2.0963807731866835, "loss/hidden": 3.418359375, "loss/jsd": 0.0, "loss/logits": 0.1966357234865427, "step": 7830 }, { "epoch": 0.2613333333333333, "grad_norm": 33.0, "grad_norm_var": 22.650455729166666, "learning_rate": 0.0001, "loss": 7.425, "loss/crossentropy": 1.9799813508987427, "loss/hidden": 3.465234375, "loss/jsd": 0.0, "loss/logits": 0.1927992183715105, "step": 7840 }, { "epoch": 0.26166666666666666, "grad_norm": 29.875, "grad_norm_var": 4.1125, "learning_rate": 0.0001, "loss": 7.3573, "loss/crossentropy": 2.055854082852602, "loss/hidden": 3.475, "loss/jsd": 0.0, "loss/logits": 0.19721447993069888, "step": 7850 }, { "epoch": 0.262, "grad_norm": 30.25, "grad_norm_var": 2.459375, "learning_rate": 0.0001, "loss": 7.3734, "loss/crossentropy": 1.9555838972330093, "loss/hidden": 3.45703125, "loss/jsd": 0.0, "loss/logits": 0.18575125150382518, "step": 7860 }, { "epoch": 0.2623333333333333, "grad_norm": 31.625, "grad_norm_var": 2.6723307291666667, "learning_rate": 0.0001, "loss": 7.3786, "loss/crossentropy": 2.012501245737076, "loss/hidden": 3.41640625, "loss/jsd": 0.0, "loss/logits": 0.19071923177689315, "step": 7870 }, { "epoch": 0.26266666666666666, "grad_norm": 29.25, "grad_norm_var": 1.7947265625, "learning_rate": 0.0001, "loss": 7.3585, "loss/crossentropy": 1.8456360399723053, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.1722103778272867, "step": 7880 }, { "epoch": 0.263, "grad_norm": 27.875, "grad_norm_var": 23.788541666666667, "learning_rate": 0.0001, "loss": 7.3694, "loss/crossentropy": 1.9698335975408554, "loss/hidden": 3.383203125, "loss/jsd": 0.0, "loss/logits": 0.18809431046247482, "step": 7890 }, { "epoch": 0.2633333333333333, "grad_norm": 31.5, "grad_norm_var": 4.664583333333334, "learning_rate": 0.0001, "loss": 7.2586, "loss/crossentropy": 2.0510014094412328, "loss/hidden": 3.61640625, "loss/jsd": 0.0, "loss/logits": 0.21446718443185092, "step": 7900 }, { "epoch": 0.26366666666666666, "grad_norm": 28.75, "grad_norm_var": 4.1400390625, "learning_rate": 0.0001, "loss": 7.3741, "loss/crossentropy": 2.055051451921463, "loss/hidden": 3.49296875, "loss/jsd": 0.0, "loss/logits": 0.19334657546132802, "step": 7910 }, { "epoch": 0.264, "grad_norm": 28.75, "grad_norm_var": 2.084830729166667, "learning_rate": 0.0001, "loss": 7.2851, "loss/crossentropy": 2.0605675019323826, "loss/hidden": 3.41796875, "loss/jsd": 0.0, "loss/logits": 0.19418431911617517, "step": 7920 }, { "epoch": 0.2643333333333333, "grad_norm": 27.875, "grad_norm_var": 4.06015625, "learning_rate": 0.0001, "loss": 7.2417, "loss/crossentropy": 1.8355554074048996, "loss/hidden": 3.348828125, "loss/jsd": 0.0, "loss/logits": 0.1804340412840247, "step": 7930 }, { "epoch": 0.26466666666666666, "grad_norm": 31.125, "grad_norm_var": 11.37890625, "learning_rate": 0.0001, "loss": 7.4464, "loss/crossentropy": 2.0627989590168, "loss/hidden": 3.418359375, "loss/jsd": 0.0, "loss/logits": 0.18591106981039046, "step": 7940 }, { "epoch": 0.265, "grad_norm": 33.25, "grad_norm_var": 2.4061848958333334, "learning_rate": 0.0001, "loss": 7.3323, "loss/crossentropy": 1.9670493304729462, "loss/hidden": 3.495703125, "loss/jsd": 0.0, "loss/logits": 0.19227916207164525, "step": 7950 }, { "epoch": 0.2653333333333333, "grad_norm": 28.125, "grad_norm_var": 12.860416666666667, "learning_rate": 0.0001, "loss": 7.336, "loss/crossentropy": 1.9905685119330883, "loss/hidden": 3.389453125, "loss/jsd": 0.0, "loss/logits": 0.18828013353049755, "step": 7960 }, { "epoch": 0.26566666666666666, "grad_norm": 30.5, "grad_norm_var": 31.223372395833334, "learning_rate": 0.0001, "loss": 7.3916, "loss/crossentropy": 1.9982903897762299, "loss/hidden": 3.514453125, "loss/jsd": 0.0, "loss/logits": 0.20455247219651937, "step": 7970 }, { "epoch": 0.266, "grad_norm": 29.25, "grad_norm_var": 14.158072916666667, "learning_rate": 0.0001, "loss": 7.3886, "loss/crossentropy": 2.055228200554848, "loss/hidden": 3.4734375, "loss/jsd": 0.0, "loss/logits": 0.19732489865273237, "step": 7980 }, { "epoch": 0.2663333333333333, "grad_norm": 36.25, "grad_norm_var": 9.364322916666667, "learning_rate": 0.0001, "loss": 7.3196, "loss/crossentropy": 1.8684360593557359, "loss/hidden": 3.43671875, "loss/jsd": 0.0, "loss/logits": 0.1813243744894862, "step": 7990 }, { "epoch": 0.26666666666666666, "grad_norm": 28.5, "grad_norm_var": 7.924739583333333, "learning_rate": 0.0001, "loss": 7.3477, "loss/crossentropy": 1.9357227340340615, "loss/hidden": 3.446484375, "loss/jsd": 0.0, "loss/logits": 0.1878168947994709, "step": 8000 }, { "epoch": 0.267, "grad_norm": 28.25, "grad_norm_var": 2.758736220491298e+18, "learning_rate": 0.0001, "loss": 7.3018, "loss/crossentropy": 1.8669291988015175, "loss/hidden": 3.384765625, "loss/jsd": 0.0, "loss/logits": 0.17982392217963933, "step": 8010 }, { "epoch": 0.2673333333333333, "grad_norm": 32.75, "grad_norm_var": 2.75873622109339e+18, "learning_rate": 0.0001, "loss": 7.3287, "loss/crossentropy": 1.9699197702109814, "loss/hidden": 3.454296875, "loss/jsd": 0.0, "loss/logits": 0.17906449669972063, "step": 8020 }, { "epoch": 0.26766666666666666, "grad_norm": 33.75, "grad_norm_var": 5.6931640625, "learning_rate": 0.0001, "loss": 7.3859, "loss/crossentropy": 2.0828740768134595, "loss/hidden": 3.4265625, "loss/jsd": 0.0, "loss/logits": 0.20037321876734496, "step": 8030 }, { "epoch": 0.268, "grad_norm": 28.25, "grad_norm_var": 11.562955729166667, "learning_rate": 0.0001, "loss": 7.2938, "loss/crossentropy": 2.0559828370809554, "loss/hidden": 3.353125, "loss/jsd": 0.0, "loss/logits": 0.187124184332788, "step": 8040 }, { "epoch": 0.2683333333333333, "grad_norm": 30.0, "grad_norm_var": 6.915559895833334, "learning_rate": 0.0001, "loss": 7.3632, "loss/crossentropy": 1.945170171558857, "loss/hidden": 3.48125, "loss/jsd": 0.0, "loss/logits": 0.2090400829911232, "step": 8050 }, { "epoch": 0.26866666666666666, "grad_norm": 29.875, "grad_norm_var": 22.574739583333333, "learning_rate": 0.0001, "loss": 7.3822, "loss/crossentropy": 2.0073591828346253, "loss/hidden": 3.469140625, "loss/jsd": 0.0, "loss/logits": 0.19188429061323403, "step": 8060 }, { "epoch": 0.269, "grad_norm": 42.0, "grad_norm_var": 27.505989583333335, "learning_rate": 0.0001, "loss": 7.325, "loss/crossentropy": 1.8576781779527665, "loss/hidden": 3.362109375, "loss/jsd": 0.0, "loss/logits": 0.18224390475079416, "step": 8070 }, { "epoch": 0.2693333333333333, "grad_norm": 26.625, "grad_norm_var": 15.282747395833333, "learning_rate": 0.0001, "loss": 7.3035, "loss/crossentropy": 2.1271346174180508, "loss/hidden": 3.451171875, "loss/jsd": 0.0, "loss/logits": 0.20457899756729603, "step": 8080 }, { "epoch": 0.26966666666666667, "grad_norm": 27.75, "grad_norm_var": 6.512239583333334, "learning_rate": 0.0001, "loss": 7.3705, "loss/crossentropy": 2.0375955060124396, "loss/hidden": 3.51015625, "loss/jsd": 0.0, "loss/logits": 0.20770179554820062, "step": 8090 }, { "epoch": 0.27, "grad_norm": 32.5, "grad_norm_var": 6.6134765625, "learning_rate": 0.0001, "loss": 7.3383, "loss/crossentropy": 1.8748702734708786, "loss/hidden": 3.422265625, "loss/jsd": 0.0, "loss/logits": 0.17912552300840617, "step": 8100 }, { "epoch": 0.2703333333333333, "grad_norm": 31.875, "grad_norm_var": 54.78723958333333, "learning_rate": 0.0001, "loss": 7.4723, "loss/crossentropy": 1.8621863305568696, "loss/hidden": 3.421484375, "loss/jsd": 0.0, "loss/logits": 0.18697217144072056, "step": 8110 }, { "epoch": 0.27066666666666667, "grad_norm": 31.375, "grad_norm_var": 3.0921223958333335, "learning_rate": 0.0001, "loss": 7.2634, "loss/crossentropy": 2.0922766856849195, "loss/hidden": 3.53203125, "loss/jsd": 0.0, "loss/logits": 0.20104624517261982, "step": 8120 }, { "epoch": 0.271, "grad_norm": 35.0, "grad_norm_var": 4.480208333333334, "learning_rate": 0.0001, "loss": 7.3847, "loss/crossentropy": 1.9210853602737188, "loss/hidden": 3.51484375, "loss/jsd": 0.0, "loss/logits": 0.1958038537763059, "step": 8130 }, { "epoch": 0.2713333333333333, "grad_norm": 26.75, "grad_norm_var": 31.6431640625, "learning_rate": 0.0001, "loss": 7.2776, "loss/crossentropy": 2.0486815750598906, "loss/hidden": 3.452734375, "loss/jsd": 0.0, "loss/logits": 0.19873659461736679, "step": 8140 }, { "epoch": 0.27166666666666667, "grad_norm": 29.25, "grad_norm_var": 31.712955729166666, "learning_rate": 0.0001, "loss": 7.3314, "loss/crossentropy": 1.9269000887870789, "loss/hidden": 3.38125, "loss/jsd": 0.0, "loss/logits": 0.1707632358185947, "step": 8150 }, { "epoch": 0.272, "grad_norm": 29.375, "grad_norm_var": 1.2525390625, "learning_rate": 0.0001, "loss": 7.3156, "loss/crossentropy": 1.8214071549475193, "loss/hidden": 3.48984375, "loss/jsd": 0.0, "loss/logits": 0.18597929794341325, "step": 8160 }, { "epoch": 0.2723333333333333, "grad_norm": 30.0, "grad_norm_var": 1.371875, "learning_rate": 0.0001, "loss": 7.4009, "loss/crossentropy": 2.050736790895462, "loss/hidden": 3.519921875, "loss/jsd": 0.0, "loss/logits": 0.18771201074123384, "step": 8170 }, { "epoch": 0.27266666666666667, "grad_norm": 31.125, "grad_norm_var": 1.5317057291666667, "learning_rate": 0.0001, "loss": 7.2903, "loss/crossentropy": 1.8649342566728593, "loss/hidden": 3.387109375, "loss/jsd": 0.0, "loss/logits": 0.18692498989403247, "step": 8180 }, { "epoch": 0.273, "grad_norm": 28.75, "grad_norm_var": 5.073372395833333, "learning_rate": 0.0001, "loss": 7.3703, "loss/crossentropy": 1.9420620132237674, "loss/hidden": 3.45703125, "loss/jsd": 0.0, "loss/logits": 0.1863358210772276, "step": 8190 }, { "epoch": 0.2733333333333333, "grad_norm": 31.0, "grad_norm_var": 5.0759765625, "learning_rate": 0.0001, "loss": 7.3263, "loss/crossentropy": 1.920608750730753, "loss/hidden": 3.4359375, "loss/jsd": 0.0, "loss/logits": 0.1867589635774493, "step": 8200 }, { "epoch": 0.27366666666666667, "grad_norm": 39.0, "grad_norm_var": 13.3837890625, "learning_rate": 0.0001, "loss": 7.2725, "loss/crossentropy": 2.0436389118433, "loss/hidden": 3.450390625, "loss/jsd": 0.0, "loss/logits": 0.19375408291816712, "step": 8210 }, { "epoch": 0.274, "grad_norm": 28.875, "grad_norm_var": 8.051822916666667, "learning_rate": 0.0001, "loss": 7.2602, "loss/crossentropy": 2.089376961439848, "loss/hidden": 3.44609375, "loss/jsd": 0.0, "loss/logits": 0.19999252073466778, "step": 8220 }, { "epoch": 0.2743333333333333, "grad_norm": 37.25, "grad_norm_var": 2.1550427676073439e+18, "learning_rate": 0.0001, "loss": 7.3357, "loss/crossentropy": 2.097749339044094, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.2029523065313697, "step": 8230 }, { "epoch": 0.27466666666666667, "grad_norm": 51.25, "grad_norm_var": 2.1550427643716132e+18, "learning_rate": 0.0001, "loss": 7.4194, "loss/crossentropy": 2.0293630376458167, "loss/hidden": 3.548828125, "loss/jsd": 0.0, "loss/logits": 0.20408941209316253, "step": 8240 }, { "epoch": 0.275, "grad_norm": 29.125, "grad_norm_var": 132.28098958333334, "learning_rate": 0.0001, "loss": 7.3619, "loss/crossentropy": 1.9629296898841857, "loss/hidden": 3.500390625, "loss/jsd": 0.0, "loss/logits": 0.19220858979970218, "step": 8250 }, { "epoch": 0.2753333333333333, "grad_norm": 29.25, "grad_norm_var": 2.526497395833333, "learning_rate": 0.0001, "loss": 7.3186, "loss/crossentropy": 2.1308686450123786, "loss/hidden": 3.460546875, "loss/jsd": 0.0, "loss/logits": 0.19903687853366137, "step": 8260 }, { "epoch": 0.27566666666666667, "grad_norm": 29.75, "grad_norm_var": 1.9770182291666667, "learning_rate": 0.0001, "loss": 7.309, "loss/crossentropy": 2.144218336045742, "loss/hidden": 3.434765625, "loss/jsd": 0.0, "loss/logits": 0.1956016467884183, "step": 8270 }, { "epoch": 0.276, "grad_norm": 29.125, "grad_norm_var": 2.3514973958333334, "learning_rate": 0.0001, "loss": 7.2394, "loss/crossentropy": 1.90976689606905, "loss/hidden": 3.58046875, "loss/jsd": 0.0, "loss/logits": 0.1956272816285491, "step": 8280 }, { "epoch": 0.2763333333333333, "grad_norm": 30.625, "grad_norm_var": 2.088997395833333, "learning_rate": 0.0001, "loss": 7.2996, "loss/crossentropy": 1.9870160937309265, "loss/hidden": 3.405859375, "loss/jsd": 0.0, "loss/logits": 0.18253450635820628, "step": 8290 }, { "epoch": 0.27666666666666667, "grad_norm": 31.75, "grad_norm_var": 2.5747395833333333, "learning_rate": 0.0001, "loss": 7.3964, "loss/crossentropy": 2.033074514567852, "loss/hidden": 3.3765625, "loss/jsd": 0.0, "loss/logits": 0.18438865775242447, "step": 8300 }, { "epoch": 0.277, "grad_norm": 28.875, "grad_norm_var": 4.203059895833333, "learning_rate": 0.0001, "loss": 7.275, "loss/crossentropy": 1.9217052049934864, "loss/hidden": 3.44609375, "loss/jsd": 0.0, "loss/logits": 0.18465679194778203, "step": 8310 }, { "epoch": 0.2773333333333333, "grad_norm": 31.875, "grad_norm_var": 20.545572916666668, "learning_rate": 0.0001, "loss": 7.4689, "loss/crossentropy": 2.006330582499504, "loss/hidden": 3.484765625, "loss/jsd": 0.0, "loss/logits": 0.1989718060940504, "step": 8320 }, { "epoch": 0.2776666666666667, "grad_norm": 29.5, "grad_norm_var": 7.373372395833333, "learning_rate": 0.0001, "loss": 7.2977, "loss/crossentropy": 1.992536310479045, "loss/hidden": 3.479296875, "loss/jsd": 0.0, "loss/logits": 0.1899057521484792, "step": 8330 }, { "epoch": 0.278, "grad_norm": 31.75, "grad_norm_var": 11.7431640625, "learning_rate": 0.0001, "loss": 7.4395, "loss/crossentropy": 1.9114972099661827, "loss/hidden": 3.43203125, "loss/jsd": 0.0, "loss/logits": 0.17868827991187572, "step": 8340 }, { "epoch": 0.2783333333333333, "grad_norm": 31.25, "grad_norm_var": 7.262434895833334, "learning_rate": 0.0001, "loss": 7.3715, "loss/crossentropy": 2.0973041877150536, "loss/hidden": 3.460546875, "loss/jsd": 0.0, "loss/logits": 0.19581574592739343, "step": 8350 }, { "epoch": 0.2786666666666667, "grad_norm": 30.0, "grad_norm_var": 3.4650390625, "learning_rate": 0.0001, "loss": 7.3661, "loss/crossentropy": 2.0369765244424345, "loss/hidden": 3.432421875, "loss/jsd": 0.0, "loss/logits": 0.18274428797885775, "step": 8360 }, { "epoch": 0.279, "grad_norm": 30.75, "grad_norm_var": 5.201822916666667, "learning_rate": 0.0001, "loss": 7.3142, "loss/crossentropy": 1.978867343068123, "loss/hidden": 3.399609375, "loss/jsd": 0.0, "loss/logits": 0.1828209361061454, "step": 8370 }, { "epoch": 0.2793333333333333, "grad_norm": 30.625, "grad_norm_var": 3.434375, "learning_rate": 0.0001, "loss": 7.3887, "loss/crossentropy": 1.9353401854634285, "loss/hidden": 3.587890625, "loss/jsd": 0.0, "loss/logits": 0.20520148221403361, "step": 8380 }, { "epoch": 0.2796666666666667, "grad_norm": 28.5, "grad_norm_var": 1.9780598958333333, "learning_rate": 0.0001, "loss": 7.3416, "loss/crossentropy": 1.9438924878835677, "loss/hidden": 3.585546875, "loss/jsd": 0.0, "loss/logits": 0.19969125539064408, "step": 8390 }, { "epoch": 0.28, "grad_norm": 30.625, "grad_norm_var": 29.139518229166665, "learning_rate": 0.0001, "loss": 7.3589, "loss/crossentropy": 1.88441639021039, "loss/hidden": 3.437890625, "loss/jsd": 0.0, "loss/logits": 0.17577803563326597, "step": 8400 }, { "epoch": 0.2803333333333333, "grad_norm": 28.0, "grad_norm_var": 35.37057291666667, "learning_rate": 0.0001, "loss": 7.42, "loss/crossentropy": 1.88421476110816, "loss/hidden": 3.439453125, "loss/jsd": 0.0, "loss/logits": 0.1753213142976165, "step": 8410 }, { "epoch": 0.2806666666666667, "grad_norm": 31.125, "grad_norm_var": 11.818684895833334, "learning_rate": 0.0001, "loss": 7.3256, "loss/crossentropy": 1.847325573116541, "loss/hidden": 3.521484375, "loss/jsd": 0.0, "loss/logits": 0.18998681753873825, "step": 8420 }, { "epoch": 0.281, "grad_norm": 29.875, "grad_norm_var": 6.065625, "learning_rate": 0.0001, "loss": 7.2691, "loss/crossentropy": 1.977832904458046, "loss/hidden": 3.512109375, "loss/jsd": 0.0, "loss/logits": 0.20181752406060696, "step": 8430 }, { "epoch": 0.2813333333333333, "grad_norm": 29.875, "grad_norm_var": 9.573958333333334, "learning_rate": 0.0001, "loss": 7.2851, "loss/crossentropy": 1.960239465534687, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.19656131733208895, "step": 8440 }, { "epoch": 0.2816666666666667, "grad_norm": 29.25, "grad_norm_var": 9.00390625, "learning_rate": 0.0001, "loss": 7.229, "loss/crossentropy": 1.9075956016778945, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.17083253003656865, "step": 8450 }, { "epoch": 0.282, "grad_norm": 28.875, "grad_norm_var": 28.992708333333333, "learning_rate": 0.0001, "loss": 7.349, "loss/crossentropy": 1.8438440788537265, "loss/hidden": 3.531640625, "loss/jsd": 0.0, "loss/logits": 0.18178284978494047, "step": 8460 }, { "epoch": 0.2823333333333333, "grad_norm": 29.25, "grad_norm_var": 32.5603515625, "learning_rate": 0.0001, "loss": 7.3202, "loss/crossentropy": 1.8627922646701336, "loss/hidden": 3.42578125, "loss/jsd": 0.0, "loss/logits": 0.1979638964869082, "step": 8470 }, { "epoch": 0.2826666666666667, "grad_norm": 28.125, "grad_norm_var": 4.4416015625, "learning_rate": 0.0001, "loss": 7.3711, "loss/crossentropy": 1.9426662161946298, "loss/hidden": 3.455859375, "loss/jsd": 0.0, "loss/logits": 0.18118238784372806, "step": 8480 }, { "epoch": 0.283, "grad_norm": 33.5, "grad_norm_var": 3.313593763742631e+18, "learning_rate": 0.0001, "loss": 7.3629, "loss/crossentropy": 2.0790310174226763, "loss/hidden": 3.47890625, "loss/jsd": 0.0, "loss/logits": 0.1985883366316557, "step": 8490 }, { "epoch": 0.2833333333333333, "grad_norm": 31.0, "grad_norm_var": 3.313593763340642e+18, "learning_rate": 0.0001, "loss": 7.3192, "loss/crossentropy": 1.9939626894891262, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.18139566630125045, "step": 8500 }, { "epoch": 0.2836666666666667, "grad_norm": 33.0, "grad_norm_var": 2.828125, "learning_rate": 0.0001, "loss": 7.2396, "loss/crossentropy": 2.0894299373030663, "loss/hidden": 3.36953125, "loss/jsd": 0.0, "loss/logits": 0.1916669135913253, "step": 8510 }, { "epoch": 0.284, "grad_norm": 32.25, "grad_norm_var": 2.3587890625, "learning_rate": 0.0001, "loss": 7.2874, "loss/crossentropy": 1.9241364471614362, "loss/hidden": 3.40703125, "loss/jsd": 0.0, "loss/logits": 0.18163993773050607, "step": 8520 }, { "epoch": 0.2843333333333333, "grad_norm": 29.625, "grad_norm_var": 1.8358723958333334, "learning_rate": 0.0001, "loss": 7.2587, "loss/crossentropy": 1.9555907040834426, "loss/hidden": 3.398046875, "loss/jsd": 0.0, "loss/logits": 0.19127986058592797, "step": 8530 }, { "epoch": 0.2846666666666667, "grad_norm": 28.875, "grad_norm_var": 7.772330729166667, "learning_rate": 0.0001, "loss": 7.4092, "loss/crossentropy": 1.760101752728224, "loss/hidden": 3.50078125, "loss/jsd": 0.0, "loss/logits": 0.1969465149566531, "step": 8540 }, { "epoch": 0.285, "grad_norm": 45.25, "grad_norm_var": 20.293489583333333, "learning_rate": 0.0001, "loss": 7.2317, "loss/crossentropy": 1.9078102983534335, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.1824373509734869, "step": 8550 }, { "epoch": 0.2853333333333333, "grad_norm": 29.75, "grad_norm_var": 15.6541015625, "learning_rate": 0.0001, "loss": 7.2472, "loss/crossentropy": 1.8875003397464751, "loss/hidden": 3.346875, "loss/jsd": 0.0, "loss/logits": 0.17615189412608742, "step": 8560 }, { "epoch": 0.2856666666666667, "grad_norm": 29.375, "grad_norm_var": 2.388541666666667, "learning_rate": 0.0001, "loss": 7.3681, "loss/crossentropy": 1.9380451321601868, "loss/hidden": 3.473828125, "loss/jsd": 0.0, "loss/logits": 0.19927144553512335, "step": 8570 }, { "epoch": 0.286, "grad_norm": 32.25, "grad_norm_var": 4.416666666666667, "learning_rate": 0.0001, "loss": 7.4075, "loss/crossentropy": 1.8619311809539796, "loss/hidden": 3.51640625, "loss/jsd": 0.0, "loss/logits": 0.19381586387753486, "step": 8580 }, { "epoch": 0.28633333333333333, "grad_norm": 32.5, "grad_norm_var": 3.9119140625, "learning_rate": 0.0001, "loss": 7.2894, "loss/crossentropy": 2.2431411504745484, "loss/hidden": 3.3921875, "loss/jsd": 0.0, "loss/logits": 0.20185342598706485, "step": 8590 }, { "epoch": 0.2866666666666667, "grad_norm": 32.0, "grad_norm_var": 2.6770833333333335, "learning_rate": 0.0001, "loss": 7.3623, "loss/crossentropy": 2.0561891317367555, "loss/hidden": 3.531640625, "loss/jsd": 0.0, "loss/logits": 0.1970042482018471, "step": 8600 }, { "epoch": 0.287, "grad_norm": 28.375, "grad_norm_var": 3.2177083333333334, "learning_rate": 0.0001, "loss": 7.428, "loss/crossentropy": 2.136035445332527, "loss/hidden": 3.393359375, "loss/jsd": 0.0, "loss/logits": 0.19230358973145484, "step": 8610 }, { "epoch": 0.28733333333333333, "grad_norm": 31.625, "grad_norm_var": 2.7775390625, "learning_rate": 0.0001, "loss": 7.3422, "loss/crossentropy": 1.8791760236024857, "loss/hidden": 3.387109375, "loss/jsd": 0.0, "loss/logits": 0.18087298078462483, "step": 8620 }, { "epoch": 0.2876666666666667, "grad_norm": 31.25, "grad_norm_var": 1.2733723958333334, "learning_rate": 0.0001, "loss": 7.3706, "loss/crossentropy": 2.014942783117294, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.19941186718642712, "step": 8630 }, { "epoch": 0.288, "grad_norm": 28.125, "grad_norm_var": 6.577018229166667, "learning_rate": 0.0001, "loss": 7.3391, "loss/crossentropy": 1.9097432620823382, "loss/hidden": 3.532421875, "loss/jsd": 0.0, "loss/logits": 0.18962045703083277, "step": 8640 }, { "epoch": 0.28833333333333333, "grad_norm": 30.5, "grad_norm_var": 5.6150390625, "learning_rate": 0.0001, "loss": 7.2254, "loss/crossentropy": 2.0556643187999724, "loss/hidden": 3.42734375, "loss/jsd": 0.0, "loss/logits": 0.1865043956786394, "step": 8650 }, { "epoch": 0.2886666666666667, "grad_norm": 30.75, "grad_norm_var": 2.880989583333333, "learning_rate": 0.0001, "loss": 7.3058, "loss/crossentropy": 2.0936836436390878, "loss/hidden": 3.44765625, "loss/jsd": 0.0, "loss/logits": 0.19982548868283628, "step": 8660 }, { "epoch": 0.289, "grad_norm": 30.5, "grad_norm_var": 3.3207682291666667, "learning_rate": 0.0001, "loss": 7.2018, "loss/crossentropy": 1.8737747095525266, "loss/hidden": 3.426953125, "loss/jsd": 0.0, "loss/logits": 0.1872003685683012, "step": 8670 }, { "epoch": 0.28933333333333333, "grad_norm": 30.0, "grad_norm_var": 2.9181640625, "learning_rate": 0.0001, "loss": 7.3607, "loss/crossentropy": 1.9043843269348144, "loss/hidden": 3.494140625, "loss/jsd": 0.0, "loss/logits": 0.19682506024837493, "step": 8680 }, { "epoch": 0.2896666666666667, "grad_norm": 29.75, "grad_norm_var": 4.110416666666667, "learning_rate": 0.0001, "loss": 7.3538, "loss/crossentropy": 1.9493150204420089, "loss/hidden": 3.487890625, "loss/jsd": 0.0, "loss/logits": 0.20106997862458229, "step": 8690 }, { "epoch": 0.29, "grad_norm": 31.125, "grad_norm_var": 10.063541666666667, "learning_rate": 0.0001, "loss": 7.448, "loss/crossentropy": 1.9216491512954235, "loss/hidden": 3.542578125, "loss/jsd": 0.0, "loss/logits": 0.21330687329173087, "step": 8700 }, { "epoch": 0.29033333333333333, "grad_norm": 29.125, "grad_norm_var": 10.748893229166667, "learning_rate": 0.0001, "loss": 7.3429, "loss/crossentropy": 1.9247693940997124, "loss/hidden": 3.509375, "loss/jsd": 0.0, "loss/logits": 0.19770433623343706, "step": 8710 }, { "epoch": 0.2906666666666667, "grad_norm": 28.25, "grad_norm_var": 2.3018229166666666, "learning_rate": 0.0001, "loss": 7.3149, "loss/crossentropy": 1.9863500609993934, "loss/hidden": 3.38046875, "loss/jsd": 0.0, "loss/logits": 0.1769007002003491, "step": 8720 }, { "epoch": 0.291, "grad_norm": 34.25, "grad_norm_var": 35.44583333333333, "learning_rate": 0.0001, "loss": 7.3615, "loss/crossentropy": 2.0112742967903614, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.17948293145745992, "step": 8730 }, { "epoch": 0.29133333333333333, "grad_norm": 28.375, "grad_norm_var": 6.162955729166667, "learning_rate": 0.0001, "loss": 7.3094, "loss/crossentropy": 1.8403407216072083, "loss/hidden": 3.483984375, "loss/jsd": 0.0, "loss/logits": 0.18989183455705644, "step": 8740 }, { "epoch": 0.2916666666666667, "grad_norm": 30.25, "grad_norm_var": 2.1550427675584102e+18, "learning_rate": 0.0001, "loss": 7.3196, "loss/crossentropy": 1.9659276127815246, "loss/hidden": 3.73671875, "loss/jsd": 0.0, "loss/logits": 0.18590187281370163, "step": 8750 }, { "epoch": 0.292, "grad_norm": 33.0, "grad_norm_var": 2.1550427671547085e+18, "learning_rate": 0.0001, "loss": 7.3724, "loss/crossentropy": 2.042274435609579, "loss/hidden": 3.3625, "loss/jsd": 0.0, "loss/logits": 0.19375519566237925, "step": 8760 }, { "epoch": 0.29233333333333333, "grad_norm": 31.25, "grad_norm_var": 5.773893229166666, "learning_rate": 0.0001, "loss": 7.4005, "loss/crossentropy": 1.998110396414995, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.1897134262137115, "step": 8770 }, { "epoch": 0.2926666666666667, "grad_norm": 33.25, "grad_norm_var": 2.6372395833333333, "learning_rate": 0.0001, "loss": 7.3422, "loss/crossentropy": 1.8424124024808406, "loss/hidden": 3.596484375, "loss/jsd": 0.0, "loss/logits": 0.18675317373126746, "step": 8780 }, { "epoch": 0.293, "grad_norm": 32.25, "grad_norm_var": 3.778580729166667, "learning_rate": 0.0001, "loss": 7.2658, "loss/crossentropy": 2.1098380774259566, "loss/hidden": 3.46640625, "loss/jsd": 0.0, "loss/logits": 0.1918635480105877, "step": 8790 }, { "epoch": 0.29333333333333333, "grad_norm": 29.375, "grad_norm_var": 3.0166666666666666, "learning_rate": 0.0001, "loss": 7.302, "loss/crossentropy": 1.8855247870087624, "loss/hidden": 3.53359375, "loss/jsd": 0.0, "loss/logits": 0.19457933884114026, "step": 8800 }, { "epoch": 0.2936666666666667, "grad_norm": 51.75, "grad_norm_var": 30.243489583333332, "learning_rate": 0.0001, "loss": 7.4593, "loss/crossentropy": 2.078700874745846, "loss/hidden": 3.406640625, "loss/jsd": 0.0, "loss/logits": 0.19752534870058297, "step": 8810 }, { "epoch": 0.294, "grad_norm": 30.125, "grad_norm_var": 29.0494140625, "learning_rate": 0.0001, "loss": 7.4636, "loss/crossentropy": 2.0901213392615317, "loss/hidden": 3.552734375, "loss/jsd": 0.0, "loss/logits": 0.19952785074710847, "step": 8820 }, { "epoch": 0.29433333333333334, "grad_norm": 29.0, "grad_norm_var": 6.85625, "learning_rate": 0.0001, "loss": 7.3036, "loss/crossentropy": 2.087279887497425, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.19126114062964916, "step": 8830 }, { "epoch": 0.2946666666666667, "grad_norm": 30.0, "grad_norm_var": 2.8622395833333334, "learning_rate": 0.0001, "loss": 7.3207, "loss/crossentropy": 1.9822525084018707, "loss/hidden": 3.37265625, "loss/jsd": 0.0, "loss/logits": 0.18684150632470847, "step": 8840 }, { "epoch": 0.295, "grad_norm": 30.75, "grad_norm_var": 4.747916666666667, "learning_rate": 0.0001, "loss": 7.3465, "loss/crossentropy": 1.9542634725570678, "loss/hidden": 3.512890625, "loss/jsd": 0.0, "loss/logits": 0.1968481082469225, "step": 8850 }, { "epoch": 0.29533333333333334, "grad_norm": 30.125, "grad_norm_var": 3.2822916666666666, "learning_rate": 0.0001, "loss": 7.2925, "loss/crossentropy": 1.9117444053292274, "loss/hidden": 3.494140625, "loss/jsd": 0.0, "loss/logits": 0.1910472957417369, "step": 8860 }, { "epoch": 0.2956666666666667, "grad_norm": 28.875, "grad_norm_var": 3.0864583333333333, "learning_rate": 0.0001, "loss": 7.2364, "loss/crossentropy": 1.8577214300632476, "loss/hidden": 3.484765625, "loss/jsd": 0.0, "loss/logits": 0.19214668683707714, "step": 8870 }, { "epoch": 0.296, "grad_norm": 28.375, "grad_norm_var": 2.631705729166667, "learning_rate": 0.0001, "loss": 7.3732, "loss/crossentropy": 1.9562288388609885, "loss/hidden": 3.414453125, "loss/jsd": 0.0, "loss/logits": 0.18718476369976997, "step": 8880 }, { "epoch": 0.29633333333333334, "grad_norm": 29.75, "grad_norm_var": 2.80390625, "learning_rate": 0.0001, "loss": 7.4691, "loss/crossentropy": 2.0526602521538733, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.18736082967370749, "step": 8890 }, { "epoch": 0.2966666666666667, "grad_norm": 32.5, "grad_norm_var": 2.2848307291666665, "learning_rate": 0.0001, "loss": 7.3632, "loss/crossentropy": 2.1331785932183265, "loss/hidden": 3.42578125, "loss/jsd": 0.0, "loss/logits": 0.1979058114811778, "step": 8900 }, { "epoch": 0.297, "grad_norm": 29.25, "grad_norm_var": 2.0666015625, "learning_rate": 0.0001, "loss": 7.2501, "loss/crossentropy": 1.9319664545357227, "loss/hidden": 3.43203125, "loss/jsd": 0.0, "loss/logits": 0.18770635463297367, "step": 8910 }, { "epoch": 0.29733333333333334, "grad_norm": 31.375, "grad_norm_var": 1.465625, "learning_rate": 0.0001, "loss": 7.2874, "loss/crossentropy": 1.901083593070507, "loss/hidden": 3.49921875, "loss/jsd": 0.0, "loss/logits": 0.1807714020833373, "step": 8920 }, { "epoch": 0.2976666666666667, "grad_norm": 30.75, "grad_norm_var": 1.9343098958333333, "learning_rate": 0.0001, "loss": 7.4112, "loss/crossentropy": 2.027828375995159, "loss/hidden": 3.462109375, "loss/jsd": 0.0, "loss/logits": 0.19485928257927299, "step": 8930 }, { "epoch": 0.298, "grad_norm": 31.375, "grad_norm_var": 1.8426432291666666, "learning_rate": 0.0001, "loss": 7.351, "loss/crossentropy": 2.1440160870552063, "loss/hidden": 3.47578125, "loss/jsd": 0.0, "loss/logits": 0.20018482208251953, "step": 8940 }, { "epoch": 0.29833333333333334, "grad_norm": 28.5, "grad_norm_var": 2.373893229166667, "learning_rate": 0.0001, "loss": 7.3506, "loss/crossentropy": 2.2288490504026415, "loss/hidden": 3.430078125, "loss/jsd": 0.0, "loss/logits": 0.20186269003897905, "step": 8950 }, { "epoch": 0.2986666666666667, "grad_norm": 29.375, "grad_norm_var": 4.409375, "learning_rate": 0.0001, "loss": 7.3271, "loss/crossentropy": 1.9429566815495491, "loss/hidden": 3.47890625, "loss/jsd": 0.0, "loss/logits": 0.19475207198411226, "step": 8960 }, { "epoch": 0.299, "grad_norm": 31.25, "grad_norm_var": 8.1166015625, "learning_rate": 0.0001, "loss": 7.3641, "loss/crossentropy": 1.8804932191967965, "loss/hidden": 3.540625, "loss/jsd": 0.0, "loss/logits": 0.18377144820988178, "step": 8970 }, { "epoch": 0.29933333333333334, "grad_norm": 33.25, "grad_norm_var": 4.85390625, "learning_rate": 0.0001, "loss": 7.4229, "loss/crossentropy": 2.133847635984421, "loss/hidden": 3.45, "loss/jsd": 0.0, "loss/logits": 0.19583620335906743, "step": 8980 }, { "epoch": 0.2996666666666667, "grad_norm": 32.25, "grad_norm_var": 5.415559895833334, "learning_rate": 0.0001, "loss": 7.4075, "loss/crossentropy": 1.9043404765427112, "loss/hidden": 3.494921875, "loss/jsd": 0.0, "loss/logits": 0.18776802252978086, "step": 8990 }, { "epoch": 0.3, "grad_norm": 34.75, "grad_norm_var": 6.513541666666667, "learning_rate": 0.0001, "loss": 7.2113, "loss/crossentropy": 2.0467875778675078, "loss/hidden": 3.4515625, "loss/jsd": 0.0, "loss/logits": 0.18717477563768625, "step": 9000 }, { "epoch": 0.30033333333333334, "grad_norm": 30.875, "grad_norm_var": 13.496875, "learning_rate": 0.0001, "loss": 7.3626, "loss/crossentropy": 1.9334401845932008, "loss/hidden": 3.601953125, "loss/jsd": 0.0, "loss/logits": 0.20905897868797182, "step": 9010 }, { "epoch": 0.3006666666666667, "grad_norm": 31.625, "grad_norm_var": 15.729166666666666, "learning_rate": 0.0001, "loss": 7.4237, "loss/crossentropy": 2.058571843802929, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.20395366256125272, "step": 9020 }, { "epoch": 0.301, "grad_norm": 30.75, "grad_norm_var": 7.420247395833333, "learning_rate": 0.0001, "loss": 7.3638, "loss/crossentropy": 2.075588658452034, "loss/hidden": 3.4125, "loss/jsd": 0.0, "loss/logits": 0.1875262087211013, "step": 9030 }, { "epoch": 0.30133333333333334, "grad_norm": 28.5, "grad_norm_var": 3.0608723958333335, "learning_rate": 0.0001, "loss": 7.2874, "loss/crossentropy": 1.9345158755779266, "loss/hidden": 3.369921875, "loss/jsd": 0.0, "loss/logits": 0.1736679593101144, "step": 9040 }, { "epoch": 0.3016666666666667, "grad_norm": 29.125, "grad_norm_var": 3.4160807291666666, "learning_rate": 0.0001, "loss": 7.3695, "loss/crossentropy": 1.9337209820747376, "loss/hidden": 3.50625, "loss/jsd": 0.0, "loss/logits": 0.18637216966599227, "step": 9050 }, { "epoch": 0.302, "grad_norm": 29.875, "grad_norm_var": 14.2181640625, "learning_rate": 0.0001, "loss": 7.292, "loss/crossentropy": 2.1024497002363205, "loss/hidden": 3.41328125, "loss/jsd": 0.0, "loss/logits": 0.19641367606818677, "step": 9060 }, { "epoch": 0.30233333333333334, "grad_norm": 30.0, "grad_norm_var": 374.7514973958333, "learning_rate": 0.0001, "loss": 7.3461, "loss/crossentropy": 2.0792969286441805, "loss/hidden": 3.371875, "loss/jsd": 0.0, "loss/logits": 0.20074989479035138, "step": 9070 }, { "epoch": 0.30266666666666664, "grad_norm": 28.625, "grad_norm_var": 366.4358723958333, "learning_rate": 0.0001, "loss": 7.2908, "loss/crossentropy": 2.0501320138573647, "loss/hidden": 3.396484375, "loss/jsd": 0.0, "loss/logits": 0.18744063824415208, "step": 9080 }, { "epoch": 0.303, "grad_norm": 30.0, "grad_norm_var": 3.79765625, "learning_rate": 0.0001, "loss": 7.2487, "loss/crossentropy": 1.9363606587052344, "loss/hidden": 3.357421875, "loss/jsd": 0.0, "loss/logits": 0.17125512538477777, "step": 9090 }, { "epoch": 0.30333333333333334, "grad_norm": 29.125, "grad_norm_var": 1.8041015625, "learning_rate": 0.0001, "loss": 7.2687, "loss/crossentropy": 1.9517054468393327, "loss/hidden": 3.448828125, "loss/jsd": 0.0, "loss/logits": 0.18131932485848665, "step": 9100 }, { "epoch": 0.30366666666666664, "grad_norm": 28.0, "grad_norm_var": 1.9796223958333334, "learning_rate": 0.0001, "loss": 7.1987, "loss/crossentropy": 1.9462280698120593, "loss/hidden": 3.346875, "loss/jsd": 0.0, "loss/logits": 0.1759795421268791, "step": 9110 }, { "epoch": 0.304, "grad_norm": 29.875, "grad_norm_var": 1.840625, "learning_rate": 0.0001, "loss": 7.4889, "loss/crossentropy": 1.9569612272083758, "loss/hidden": 3.389453125, "loss/jsd": 0.0, "loss/logits": 0.1883821003139019, "step": 9120 }, { "epoch": 0.30433333333333334, "grad_norm": 29.875, "grad_norm_var": 0.9244140625, "learning_rate": 0.0001, "loss": 7.2561, "loss/crossentropy": 1.9265499994158746, "loss/hidden": 3.40234375, "loss/jsd": 0.0, "loss/logits": 0.17618877990171314, "step": 9130 }, { "epoch": 0.30466666666666664, "grad_norm": 28.25, "grad_norm_var": 2.7822916666666666, "learning_rate": 0.0001, "loss": 7.2528, "loss/crossentropy": 2.08494171500206, "loss/hidden": 3.503125, "loss/jsd": 0.0, "loss/logits": 0.19873665682971478, "step": 9140 }, { "epoch": 0.305, "grad_norm": 31.5, "grad_norm_var": 29.430208333333333, "learning_rate": 0.0001, "loss": 7.3378, "loss/crossentropy": 1.980225134640932, "loss/hidden": 3.43984375, "loss/jsd": 0.0, "loss/logits": 0.1899002842605114, "step": 9150 }, { "epoch": 0.30533333333333335, "grad_norm": 32.0, "grad_norm_var": 15.6322265625, "learning_rate": 0.0001, "loss": 7.2762, "loss/crossentropy": 1.9473119281232356, "loss/hidden": 3.33359375, "loss/jsd": 0.0, "loss/logits": 0.1737335043027997, "step": 9160 }, { "epoch": 0.30566666666666664, "grad_norm": 32.0, "grad_norm_var": 6.580989583333333, "learning_rate": 0.0001, "loss": 7.37, "loss/crossentropy": 1.977689068391919, "loss/hidden": 3.45078125, "loss/jsd": 0.0, "loss/logits": 0.1920790748205036, "step": 9170 }, { "epoch": 0.306, "grad_norm": 32.5, "grad_norm_var": 17.620247395833335, "learning_rate": 0.0001, "loss": 7.3158, "loss/crossentropy": 1.9225960478186608, "loss/hidden": 3.323046875, "loss/jsd": 0.0, "loss/logits": 0.17651781998574734, "step": 9180 }, { "epoch": 0.30633333333333335, "grad_norm": 29.5, "grad_norm_var": 3.161393229166667, "learning_rate": 0.0001, "loss": 7.339, "loss/crossentropy": 2.045545983314514, "loss/hidden": 3.428515625, "loss/jsd": 0.0, "loss/logits": 0.2043112374842167, "step": 9190 }, { "epoch": 0.30666666666666664, "grad_norm": 30.875, "grad_norm_var": 69.78483072916667, "learning_rate": 0.0001, "loss": 7.391, "loss/crossentropy": 2.011800418049097, "loss/hidden": 3.50234375, "loss/jsd": 0.0, "loss/logits": 0.20068616131320596, "step": 9200 }, { "epoch": 0.307, "grad_norm": 29.625, "grad_norm_var": 146.88951822916667, "learning_rate": 0.0001, "loss": 7.363, "loss/crossentropy": 2.089944151043892, "loss/hidden": 3.524609375, "loss/jsd": 0.0, "loss/logits": 0.21082677990198134, "step": 9210 }, { "epoch": 0.30733333333333335, "grad_norm": 31.875, "grad_norm_var": 103.13020833333333, "learning_rate": 0.0001, "loss": 7.3587, "loss/crossentropy": 1.9774355866014957, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.18693090369924903, "step": 9220 }, { "epoch": 0.30766666666666664, "grad_norm": 29.75, "grad_norm_var": 420.5122395833333, "learning_rate": 0.0001, "loss": 7.4336, "loss/crossentropy": 1.935514609515667, "loss/hidden": 3.519140625, "loss/jsd": 0.0, "loss/logits": 0.18990224320441484, "step": 9230 }, { "epoch": 0.308, "grad_norm": 30.875, "grad_norm_var": 10.349934895833334, "learning_rate": 0.0001, "loss": 7.2557, "loss/crossentropy": 1.9142539575695992, "loss/hidden": 3.480859375, "loss/jsd": 0.0, "loss/logits": 0.18657770249992608, "step": 9240 }, { "epoch": 0.30833333333333335, "grad_norm": 29.0, "grad_norm_var": 1.8030598958333333, "learning_rate": 0.0001, "loss": 7.3508, "loss/crossentropy": 2.035041029006243, "loss/hidden": 3.458203125, "loss/jsd": 0.0, "loss/logits": 0.19015967566519976, "step": 9250 }, { "epoch": 0.30866666666666664, "grad_norm": 28.75, "grad_norm_var": 0.89765625, "learning_rate": 0.0001, "loss": 7.3229, "loss/crossentropy": 2.00568625703454, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.1828642250970006, "step": 9260 }, { "epoch": 0.309, "grad_norm": 33.0, "grad_norm_var": 2.1494140625, "learning_rate": 0.0001, "loss": 7.3218, "loss/crossentropy": 1.7389535933732987, "loss/hidden": 3.44375, "loss/jsd": 0.0, "loss/logits": 0.18015089384280145, "step": 9270 }, { "epoch": 0.30933333333333335, "grad_norm": 30.75, "grad_norm_var": 3.3744140625, "learning_rate": 0.0001, "loss": 7.4333, "loss/crossentropy": 1.75627715960145, "loss/hidden": 3.458984375, "loss/jsd": 0.0, "loss/logits": 0.19567092433571814, "step": 9280 }, { "epoch": 0.30966666666666665, "grad_norm": 29.0, "grad_norm_var": 6.0009765625, "learning_rate": 0.0001, "loss": 7.3185, "loss/crossentropy": 2.1120258182287217, "loss/hidden": 3.498046875, "loss/jsd": 0.0, "loss/logits": 0.217480574734509, "step": 9290 }, { "epoch": 0.31, "grad_norm": 29.5, "grad_norm_var": 6.132291666666666, "learning_rate": 0.0001, "loss": 7.2932, "loss/crossentropy": 1.9872212290763855, "loss/hidden": 3.3953125, "loss/jsd": 0.0, "loss/logits": 0.18652648851275444, "step": 9300 }, { "epoch": 0.31033333333333335, "grad_norm": 28.0, "grad_norm_var": 2.1416015625, "learning_rate": 0.0001, "loss": 7.3201, "loss/crossentropy": 1.9554287418723106, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.17851617969572545, "step": 9310 }, { "epoch": 0.31066666666666665, "grad_norm": 29.25, "grad_norm_var": 1.83515625, "learning_rate": 0.0001, "loss": 7.2511, "loss/crossentropy": 1.8785304114222527, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.1744052106514573, "step": 9320 }, { "epoch": 0.311, "grad_norm": 32.25, "grad_norm_var": 3.015230291984882e+18, "learning_rate": 0.0001, "loss": 7.2979, "loss/crossentropy": 1.945047279447317, "loss/hidden": 3.630078125, "loss/jsd": 0.0, "loss/logits": 0.19515041969716548, "step": 9330 }, { "epoch": 0.31133333333333335, "grad_norm": 27.875, "grad_norm_var": 7.887955729166666, "learning_rate": 0.0001, "loss": 7.3083, "loss/crossentropy": 2.0693819135427476, "loss/hidden": 3.4484375, "loss/jsd": 0.0, "loss/logits": 0.1896443862468004, "step": 9340 }, { "epoch": 0.31166666666666665, "grad_norm": 44.0, "grad_norm_var": 13.896875, "learning_rate": 0.0001, "loss": 7.4178, "loss/crossentropy": 2.1485478207468987, "loss/hidden": 3.487109375, "loss/jsd": 0.0, "loss/logits": 0.2088997619226575, "step": 9350 }, { "epoch": 0.312, "grad_norm": 29.375, "grad_norm_var": 14.501497395833333, "learning_rate": 0.0001, "loss": 7.4623, "loss/crossentropy": 2.078371644765139, "loss/hidden": 3.48125, "loss/jsd": 0.0, "loss/logits": 0.2104900972917676, "step": 9360 }, { "epoch": 0.31233333333333335, "grad_norm": 32.25, "grad_norm_var": 3.721875, "learning_rate": 0.0001, "loss": 7.4459, "loss/crossentropy": 2.163683497905731, "loss/hidden": 3.50546875, "loss/jsd": 0.0, "loss/logits": 0.21479407381266355, "step": 9370 }, { "epoch": 0.31266666666666665, "grad_norm": 29.875, "grad_norm_var": 5.030143229166667, "learning_rate": 0.0001, "loss": 7.3623, "loss/crossentropy": 2.0053013376891613, "loss/hidden": 3.422265625, "loss/jsd": 0.0, "loss/logits": 0.190034668892622, "step": 9380 }, { "epoch": 0.313, "grad_norm": 31.625, "grad_norm_var": 47.515559895833334, "learning_rate": 0.0001, "loss": 7.382, "loss/crossentropy": 1.914030884206295, "loss/hidden": 3.477734375, "loss/jsd": 0.0, "loss/logits": 0.18562994264066218, "step": 9390 }, { "epoch": 0.31333333333333335, "grad_norm": 27.25, "grad_norm_var": 52.94973958333333, "learning_rate": 0.0001, "loss": 7.0695, "loss/crossentropy": 1.8673397563397884, "loss/hidden": 3.28359375, "loss/jsd": 0.0, "loss/logits": 0.16249247072264553, "step": 9400 }, { "epoch": 0.31366666666666665, "grad_norm": 30.125, "grad_norm_var": 5.2853515625, "learning_rate": 0.0001, "loss": 7.3777, "loss/crossentropy": 2.1112064689397814, "loss/hidden": 3.46015625, "loss/jsd": 0.0, "loss/logits": 0.19787968825548888, "step": 9410 }, { "epoch": 0.314, "grad_norm": 28.25, "grad_norm_var": 4.78125, "learning_rate": 0.0001, "loss": 7.2457, "loss/crossentropy": 2.0864884607493877, "loss/hidden": 3.3953125, "loss/jsd": 0.0, "loss/logits": 0.19186802459880709, "step": 9420 }, { "epoch": 0.31433333333333335, "grad_norm": 28.375, "grad_norm_var": 1.6634765625, "learning_rate": 0.0001, "loss": 7.2828, "loss/crossentropy": 1.8906163558363915, "loss/hidden": 3.336328125, "loss/jsd": 0.0, "loss/logits": 0.1704605108126998, "step": 9430 }, { "epoch": 0.31466666666666665, "grad_norm": 29.625, "grad_norm_var": 1.7541015625, "learning_rate": 0.0001, "loss": 7.2591, "loss/crossentropy": 1.9289519362151624, "loss/hidden": 3.4640625, "loss/jsd": 0.0, "loss/logits": 0.18976721679791808, "step": 9440 }, { "epoch": 0.315, "grad_norm": 29.625, "grad_norm_var": 2.2705729166666666, "learning_rate": 0.0001, "loss": 7.3286, "loss/crossentropy": 1.8244129970669747, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.1864110093563795, "step": 9450 }, { "epoch": 0.31533333333333335, "grad_norm": 35.75, "grad_norm_var": 35501.7572265625, "learning_rate": 0.0001, "loss": 7.4225, "loss/crossentropy": 1.9283518977463245, "loss/hidden": 3.416015625, "loss/jsd": 0.0, "loss/logits": 0.17858725944533943, "step": 9460 }, { "epoch": 0.31566666666666665, "grad_norm": 28.5, "grad_norm_var": 59.66223958333333, "learning_rate": 0.0001, "loss": 7.396, "loss/crossentropy": 2.020549309253693, "loss/hidden": 3.61875, "loss/jsd": 0.0, "loss/logits": 0.19497359059751035, "step": 9470 }, { "epoch": 0.316, "grad_norm": 29.5, "grad_norm_var": 1.5291666666666666, "learning_rate": 0.0001, "loss": 7.2946, "loss/crossentropy": 1.9159901462495328, "loss/hidden": 3.452734375, "loss/jsd": 0.0, "loss/logits": 0.19018601728603243, "step": 9480 }, { "epoch": 0.31633333333333336, "grad_norm": 29.5, "grad_norm_var": 2.35390625, "learning_rate": 0.0001, "loss": 7.3213, "loss/crossentropy": 2.020467573404312, "loss/hidden": 3.501953125, "loss/jsd": 0.0, "loss/logits": 0.19673708435148002, "step": 9490 }, { "epoch": 0.31666666666666665, "grad_norm": 30.5, "grad_norm_var": 2.7900390625, "learning_rate": 0.0001, "loss": 7.2781, "loss/crossentropy": 2.0468151047825813, "loss/hidden": 3.475, "loss/jsd": 0.0, "loss/logits": 0.18640288040041925, "step": 9500 }, { "epoch": 0.317, "grad_norm": 31.375, "grad_norm_var": 2.2306640625, "learning_rate": 0.0001, "loss": 7.3455, "loss/crossentropy": 2.056964437663555, "loss/hidden": 3.514453125, "loss/jsd": 0.0, "loss/logits": 0.1945600088685751, "step": 9510 }, { "epoch": 0.31733333333333336, "grad_norm": 26.375, "grad_norm_var": 2.730208333333333, "learning_rate": 0.0001, "loss": 7.2137, "loss/crossentropy": 1.9038727797567845, "loss/hidden": 3.38515625, "loss/jsd": 0.0, "loss/logits": 0.16782705076038837, "step": 9520 }, { "epoch": 0.31766666666666665, "grad_norm": 25.75, "grad_norm_var": 4.22265625, "learning_rate": 0.0001, "loss": 7.293, "loss/crossentropy": 1.9754691384732723, "loss/hidden": 3.471484375, "loss/jsd": 0.0, "loss/logits": 0.19230299722403288, "step": 9530 }, { "epoch": 0.318, "grad_norm": 34.75, "grad_norm_var": 11.052018229166666, "learning_rate": 0.0001, "loss": 7.4297, "loss/crossentropy": 2.135791355371475, "loss/hidden": 3.540234375, "loss/jsd": 0.0, "loss/logits": 0.22480202931910753, "step": 9540 }, { "epoch": 0.31833333333333336, "grad_norm": 31.5, "grad_norm_var": 8.505989583333333, "learning_rate": 0.0001, "loss": 7.3588, "loss/crossentropy": 1.9053886331617833, "loss/hidden": 3.41484375, "loss/jsd": 0.0, "loss/logits": 0.1867375772446394, "step": 9550 }, { "epoch": 0.31866666666666665, "grad_norm": 31.75, "grad_norm_var": 2.0624348958333334, "learning_rate": 0.0001, "loss": 7.2422, "loss/crossentropy": 1.872929535806179, "loss/hidden": 3.473046875, "loss/jsd": 0.0, "loss/logits": 0.18716121800243854, "step": 9560 }, { "epoch": 0.319, "grad_norm": 29.0, "grad_norm_var": 8.78515625, "learning_rate": 0.0001, "loss": 7.2807, "loss/crossentropy": 1.9732257407158613, "loss/hidden": 3.4484375, "loss/jsd": 0.0, "loss/logits": 0.18972884975373744, "step": 9570 }, { "epoch": 0.31933333333333336, "grad_norm": 34.75, "grad_norm_var": 22.859309895833334, "learning_rate": 0.0001, "loss": 7.2604, "loss/crossentropy": 1.889311669766903, "loss/hidden": 3.437109375, "loss/jsd": 0.0, "loss/logits": 0.1850642416626215, "step": 9580 }, { "epoch": 0.31966666666666665, "grad_norm": 28.5, "grad_norm_var": 13.089322916666667, "learning_rate": 0.0001, "loss": 7.335, "loss/crossentropy": 1.9001601874828338, "loss/hidden": 3.45546875, "loss/jsd": 0.0, "loss/logits": 0.18188440129160882, "step": 9590 }, { "epoch": 0.32, "grad_norm": 572.0, "grad_norm_var": 18308.473893229166, "learning_rate": 0.0001, "loss": 7.4051, "loss/crossentropy": 1.9268567018210887, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.2223429461941123, "step": 9600 }, { "epoch": 0.32033333333333336, "grad_norm": 31.125, "grad_norm_var": 23382.36875, "learning_rate": 0.0001, "loss": 7.67, "loss/crossentropy": 1.9797904796898365, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.19619830762967466, "step": 9610 }, { "epoch": 0.32066666666666666, "grad_norm": 30.0, "grad_norm_var": 3.49140625, "learning_rate": 0.0001, "loss": 7.4502, "loss/crossentropy": 2.0388568580150603, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.1866995297372341, "step": 9620 }, { "epoch": 0.321, "grad_norm": 30.625, "grad_norm_var": 3.3119140625, "learning_rate": 0.0001, "loss": 7.4449, "loss/crossentropy": 1.992645274847746, "loss/hidden": 3.501953125, "loss/jsd": 0.0, "loss/logits": 0.19259049566462635, "step": 9630 }, { "epoch": 0.32133333333333336, "grad_norm": 33.0, "grad_norm_var": 3.91015625, "learning_rate": 0.0001, "loss": 7.3752, "loss/crossentropy": 1.9369973316788673, "loss/hidden": 3.476953125, "loss/jsd": 0.0, "loss/logits": 0.187690419703722, "step": 9640 }, { "epoch": 0.32166666666666666, "grad_norm": 30.5, "grad_norm_var": 3.6270182291666666, "learning_rate": 0.0001, "loss": 7.4122, "loss/crossentropy": 1.885891205072403, "loss/hidden": 3.5890625, "loss/jsd": 0.0, "loss/logits": 0.2046023676171899, "step": 9650 }, { "epoch": 0.322, "grad_norm": 29.5, "grad_norm_var": 11.27265625, "learning_rate": 0.0001, "loss": 7.2613, "loss/crossentropy": 1.9987530410289764, "loss/hidden": 3.526171875, "loss/jsd": 0.0, "loss/logits": 0.1897335320711136, "step": 9660 }, { "epoch": 0.32233333333333336, "grad_norm": 27.875, "grad_norm_var": 13.169205729166666, "learning_rate": 0.0001, "loss": 7.3019, "loss/crossentropy": 2.050529868900776, "loss/hidden": 3.41484375, "loss/jsd": 0.0, "loss/logits": 0.17973064891994, "step": 9670 }, { "epoch": 0.32266666666666666, "grad_norm": 28.625, "grad_norm_var": 1.7518229166666666, "learning_rate": 0.0001, "loss": 7.3111, "loss/crossentropy": 1.9904332220554353, "loss/hidden": 3.59921875, "loss/jsd": 0.0, "loss/logits": 0.20654886029660702, "step": 9680 }, { "epoch": 0.323, "grad_norm": 31.5, "grad_norm_var": 235.7775390625, "learning_rate": 0.0001, "loss": 7.4141, "loss/crossentropy": 2.116143609955907, "loss/hidden": 3.480078125, "loss/jsd": 0.0, "loss/logits": 0.19894695193506778, "step": 9690 }, { "epoch": 0.3233333333333333, "grad_norm": 29.75, "grad_norm_var": 10.9634765625, "learning_rate": 0.0001, "loss": 7.4346, "loss/crossentropy": 1.9542381793260575, "loss/hidden": 3.55625, "loss/jsd": 0.0, "loss/logits": 0.200473203137517, "step": 9700 }, { "epoch": 0.32366666666666666, "grad_norm": 34.25, "grad_norm_var": 23.1181640625, "learning_rate": 0.0001, "loss": 7.3457, "loss/crossentropy": 2.1027556218206884, "loss/hidden": 3.451171875, "loss/jsd": 0.0, "loss/logits": 0.20998089499771594, "step": 9710 }, { "epoch": 0.324, "grad_norm": 36.0, "grad_norm_var": 16.9041015625, "learning_rate": 0.0001, "loss": 7.3908, "loss/crossentropy": 1.8684283286333083, "loss/hidden": 3.5265625, "loss/jsd": 0.0, "loss/logits": 0.18103307764977217, "step": 9720 }, { "epoch": 0.3243333333333333, "grad_norm": 30.875, "grad_norm_var": 5.7634765625, "learning_rate": 0.0001, "loss": 7.4314, "loss/crossentropy": 2.074372976273298, "loss/hidden": 3.462109375, "loss/jsd": 0.0, "loss/logits": 0.19300632625818254, "step": 9730 }, { "epoch": 0.32466666666666666, "grad_norm": 34.25, "grad_norm_var": 5.192708333333333, "learning_rate": 0.0001, "loss": 7.4315, "loss/crossentropy": 1.9514388039708137, "loss/hidden": 3.415625, "loss/jsd": 0.0, "loss/logits": 0.1828241540119052, "step": 9740 }, { "epoch": 0.325, "grad_norm": 30.0, "grad_norm_var": 10.3572265625, "learning_rate": 0.0001, "loss": 7.4181, "loss/crossentropy": 1.935892714560032, "loss/hidden": 3.450390625, "loss/jsd": 0.0, "loss/logits": 0.18711116462945937, "step": 9750 }, { "epoch": 0.3253333333333333, "grad_norm": 28.0, "grad_norm_var": 3.1499348958333333, "learning_rate": 0.0001, "loss": 7.4113, "loss/crossentropy": 2.0968233779072762, "loss/hidden": 3.544921875, "loss/jsd": 0.0, "loss/logits": 0.1940503792837262, "step": 9760 }, { "epoch": 0.32566666666666666, "grad_norm": 30.75, "grad_norm_var": 3.5624348958333334, "learning_rate": 0.0001, "loss": 7.2677, "loss/crossentropy": 1.9734324872493745, "loss/hidden": 3.542578125, "loss/jsd": 0.0, "loss/logits": 0.19653846565634012, "step": 9770 }, { "epoch": 0.326, "grad_norm": 31.125, "grad_norm_var": 1.2134765625, "learning_rate": 0.0001, "loss": 7.4344, "loss/crossentropy": 2.038902147114277, "loss/hidden": 3.323828125, "loss/jsd": 0.0, "loss/logits": 0.1759825510904193, "step": 9780 }, { "epoch": 0.3263333333333333, "grad_norm": 32.0, "grad_norm_var": 2.1582682291666666, "learning_rate": 0.0001, "loss": 7.4001, "loss/crossentropy": 2.1815779954195023, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.1962712539359927, "step": 9790 }, { "epoch": 0.32666666666666666, "grad_norm": 29.75, "grad_norm_var": 7.801822916666667, "learning_rate": 0.0001, "loss": 7.4007, "loss/crossentropy": 2.0599596932530404, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.185849554464221, "step": 9800 }, { "epoch": 0.327, "grad_norm": 43.75, "grad_norm_var": 42.9478515625, "learning_rate": 0.0001, "loss": 7.3017, "loss/crossentropy": 2.0855581283569338, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.18200179412961007, "step": 9810 }, { "epoch": 0.3273333333333333, "grad_norm": 28.375, "grad_norm_var": 14.214322916666667, "learning_rate": 0.0001, "loss": 7.3786, "loss/crossentropy": 1.9182983696460725, "loss/hidden": 3.409765625, "loss/jsd": 0.0, "loss/logits": 0.17902343580499291, "step": 9820 }, { "epoch": 0.32766666666666666, "grad_norm": 30.375, "grad_norm_var": 2.1973307291666666, "learning_rate": 0.0001, "loss": 7.2874, "loss/crossentropy": 1.9535210832953454, "loss/hidden": 3.423046875, "loss/jsd": 0.0, "loss/logits": 0.18177984785288573, "step": 9830 }, { "epoch": 0.328, "grad_norm": 31.75, "grad_norm_var": 6.67265625, "learning_rate": 0.0001, "loss": 7.4661, "loss/crossentropy": 2.015442591905594, "loss/hidden": 3.505078125, "loss/jsd": 0.0, "loss/logits": 0.1924414152279496, "step": 9840 }, { "epoch": 0.3283333333333333, "grad_norm": 30.25, "grad_norm_var": 7.33125, "learning_rate": 0.0001, "loss": 7.397, "loss/crossentropy": 2.091397388279438, "loss/hidden": 3.4171875, "loss/jsd": 0.0, "loss/logits": 0.19350529219955206, "step": 9850 }, { "epoch": 0.32866666666666666, "grad_norm": 28.25, "grad_norm_var": 8.139583333333333, "learning_rate": 0.0001, "loss": 7.2994, "loss/crossentropy": 2.043536502867937, "loss/hidden": 3.45078125, "loss/jsd": 0.0, "loss/logits": 0.19860589876770973, "step": 9860 }, { "epoch": 0.329, "grad_norm": 31.5, "grad_norm_var": 2.9052083333333334, "learning_rate": 0.0001, "loss": 7.3729, "loss/crossentropy": 2.03020788654685, "loss/hidden": 3.510546875, "loss/jsd": 0.0, "loss/logits": 0.21006795475259424, "step": 9870 }, { "epoch": 0.3293333333333333, "grad_norm": 28.0, "grad_norm_var": 4.396875, "learning_rate": 0.0001, "loss": 7.3752, "loss/crossentropy": 2.1508992075920106, "loss/hidden": 3.41171875, "loss/jsd": 0.0, "loss/logits": 0.19727412853389978, "step": 9880 }, { "epoch": 0.32966666666666666, "grad_norm": 28.5, "grad_norm_var": 4.215559895833334, "learning_rate": 0.0001, "loss": 7.3392, "loss/crossentropy": 1.9275569841265678, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.19292535874992608, "step": 9890 }, { "epoch": 0.33, "grad_norm": 30.625, "grad_norm_var": 6.151822916666666, "learning_rate": 0.0001, "loss": 7.3575, "loss/crossentropy": 2.051329031586647, "loss/hidden": 3.401171875, "loss/jsd": 0.0, "loss/logits": 0.1873044328764081, "step": 9900 }, { "epoch": 0.3303333333333333, "grad_norm": 30.625, "grad_norm_var": 4.434375, "learning_rate": 0.0001, "loss": 7.3029, "loss/crossentropy": 1.9287674106657504, "loss/hidden": 3.61953125, "loss/jsd": 0.0, "loss/logits": 0.2245039775967598, "step": 9910 }, { "epoch": 0.33066666666666666, "grad_norm": 30.125, "grad_norm_var": 3.2739583333333333, "learning_rate": 0.0001, "loss": 7.3085, "loss/crossentropy": 2.070566202700138, "loss/hidden": 3.536328125, "loss/jsd": 0.0, "loss/logits": 0.21454092003405095, "step": 9920 }, { "epoch": 0.331, "grad_norm": 28.75, "grad_norm_var": 2.409309895833333, "learning_rate": 0.0001, "loss": 7.4148, "loss/crossentropy": 2.065828824788332, "loss/hidden": 3.533984375, "loss/jsd": 0.0, "loss/logits": 0.19369891891255975, "step": 9930 }, { "epoch": 0.3313333333333333, "grad_norm": 30.375, "grad_norm_var": 3.9822265625, "learning_rate": 0.0001, "loss": 7.4414, "loss/crossentropy": 2.073743884265423, "loss/hidden": 3.49296875, "loss/jsd": 0.0, "loss/logits": 0.19146320801228284, "step": 9940 }, { "epoch": 0.33166666666666667, "grad_norm": 29.375, "grad_norm_var": 1.8504557291666666, "learning_rate": 0.0001, "loss": 7.346, "loss/crossentropy": 1.9760347455739975, "loss/hidden": 3.471875, "loss/jsd": 0.0, "loss/logits": 0.19117477182298898, "step": 9950 }, { "epoch": 0.332, "grad_norm": 29.75, "grad_norm_var": 2.067122395833333, "learning_rate": 0.0001, "loss": 7.3117, "loss/crossentropy": 2.1041652515530584, "loss/hidden": 3.46015625, "loss/jsd": 0.0, "loss/logits": 0.19804592076689004, "step": 9960 }, { "epoch": 0.3323333333333333, "grad_norm": 33.0, "grad_norm_var": 9.158333333333333, "learning_rate": 0.0001, "loss": 7.4392, "loss/crossentropy": 1.9661363780498504, "loss/hidden": 3.499609375, "loss/jsd": 0.0, "loss/logits": 0.18938839323818685, "step": 9970 }, { "epoch": 0.33266666666666667, "grad_norm": 29.625, "grad_norm_var": 6.170768229166667, "learning_rate": 0.0001, "loss": 7.3532, "loss/crossentropy": 2.020737300813198, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.20692225946113468, "step": 9980 }, { "epoch": 0.333, "grad_norm": 28.125, "grad_norm_var": 1.1426432291666666, "learning_rate": 0.0001, "loss": 7.3008, "loss/crossentropy": 1.9223391257226468, "loss/hidden": 3.446875, "loss/jsd": 0.0, "loss/logits": 0.18638489693403243, "step": 9990 }, { "epoch": 0.3333333333333333, "grad_norm": 29.25, "grad_norm_var": 2.292122395833333, "learning_rate": 0.0001, "loss": 7.3246, "loss/crossentropy": 2.0244668796658516, "loss/hidden": 3.556640625, "loss/jsd": 0.0, "loss/logits": 0.21203572396188974, "step": 10000 }, { "epoch": 0.33366666666666667, "grad_norm": 30.375, "grad_norm_var": 3.7747395833333335, "learning_rate": 0.0001, "loss": 7.3702, "loss/crossentropy": 1.9584268510341645, "loss/hidden": 3.469921875, "loss/jsd": 0.0, "loss/logits": 0.19440346471965314, "step": 10010 }, { "epoch": 0.334, "grad_norm": 29.75, "grad_norm_var": 5.855143229166667, "learning_rate": 0.0001, "loss": 7.387, "loss/crossentropy": 2.1112962484359743, "loss/hidden": 3.391796875, "loss/jsd": 0.0, "loss/logits": 0.19202029425650835, "step": 10020 }, { "epoch": 0.3343333333333333, "grad_norm": 27.125, "grad_norm_var": 1.320750941671732e+18, "learning_rate": 0.0001, "loss": 7.342, "loss/crossentropy": 1.8209781922399997, "loss/hidden": 3.299609375, "loss/jsd": 0.0, "loss/logits": 0.17487489972263576, "step": 10030 }, { "epoch": 0.33466666666666667, "grad_norm": 33.5, "grad_norm_var": 2.988541666666667, "learning_rate": 0.0001, "loss": 7.2582, "loss/crossentropy": 1.8906087152659894, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.17698880061507224, "step": 10040 }, { "epoch": 0.335, "grad_norm": 34.25, "grad_norm_var": 5.179622395833333, "learning_rate": 0.0001, "loss": 7.2299, "loss/crossentropy": 1.8849868863821029, "loss/hidden": 3.501953125, "loss/jsd": 0.0, "loss/logits": 0.185193127207458, "step": 10050 }, { "epoch": 0.3353333333333333, "grad_norm": 27.875, "grad_norm_var": 6.8119140625, "learning_rate": 0.0001, "loss": 7.2972, "loss/crossentropy": 2.024214446544647, "loss/hidden": 3.48046875, "loss/jsd": 0.0, "loss/logits": 0.19046234507113696, "step": 10060 }, { "epoch": 0.33566666666666667, "grad_norm": 29.875, "grad_norm_var": 3.0375, "learning_rate": 0.0001, "loss": 7.2278, "loss/crossentropy": 1.924572029709816, "loss/hidden": 3.46640625, "loss/jsd": 0.0, "loss/logits": 0.1903694735839963, "step": 10070 }, { "epoch": 0.336, "grad_norm": 28.75, "grad_norm_var": 10.2869140625, "learning_rate": 0.0001, "loss": 7.2964, "loss/crossentropy": 2.2319858193397524, "loss/hidden": 3.378515625, "loss/jsd": 0.0, "loss/logits": 0.1950693041086197, "step": 10080 }, { "epoch": 0.3363333333333333, "grad_norm": 27.625, "grad_norm_var": 9.9072265625, "learning_rate": 0.0001, "loss": 7.2993, "loss/crossentropy": 2.0440548956394196, "loss/hidden": 3.43359375, "loss/jsd": 0.0, "loss/logits": 0.18242222890257836, "step": 10090 }, { "epoch": 0.33666666666666667, "grad_norm": 28.875, "grad_norm_var": 2.1994140625, "learning_rate": 0.0001, "loss": 7.1928, "loss/crossentropy": 1.9509951189160346, "loss/hidden": 3.434765625, "loss/jsd": 0.0, "loss/logits": 0.19150491170585154, "step": 10100 }, { "epoch": 0.337, "grad_norm": 28.25, "grad_norm_var": 1.6934895833333334, "learning_rate": 0.0001, "loss": 7.2871, "loss/crossentropy": 2.059817015379667, "loss/hidden": 3.346875, "loss/jsd": 0.0, "loss/logits": 0.18072485290467738, "step": 10110 }, { "epoch": 0.3373333333333333, "grad_norm": 32.5, "grad_norm_var": 3.9801432291666665, "learning_rate": 0.0001, "loss": 7.3276, "loss/crossentropy": 1.8736907504498959, "loss/hidden": 3.465625, "loss/jsd": 0.0, "loss/logits": 0.17721042595803738, "step": 10120 }, { "epoch": 0.33766666666666667, "grad_norm": 30.625, "grad_norm_var": 3.8372395833333335, "learning_rate": 0.0001, "loss": 7.302, "loss/crossentropy": 1.8703613385558129, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.17593158315867186, "step": 10130 }, { "epoch": 0.338, "grad_norm": 28.5, "grad_norm_var": 3.565625, "learning_rate": 0.0001, "loss": 7.2464, "loss/crossentropy": 1.9277216866612434, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2045893307775259, "step": 10140 }, { "epoch": 0.3383333333333333, "grad_norm": 27.875, "grad_norm_var": 3.818489583333333, "learning_rate": 0.0001, "loss": 7.257, "loss/crossentropy": 1.9138327628374099, "loss/hidden": 3.468359375, "loss/jsd": 0.0, "loss/logits": 0.1965902404859662, "step": 10150 }, { "epoch": 0.33866666666666667, "grad_norm": 29.5, "grad_norm_var": 1.9166015625, "learning_rate": 0.0001, "loss": 7.3987, "loss/crossentropy": 1.9521941810846328, "loss/hidden": 3.4421875, "loss/jsd": 0.0, "loss/logits": 0.19101754343137145, "step": 10160 }, { "epoch": 0.339, "grad_norm": 29.625, "grad_norm_var": 2.8705729166666667, "learning_rate": 0.0001, "loss": 7.2499, "loss/crossentropy": 2.0758435159921644, "loss/hidden": 3.38046875, "loss/jsd": 0.0, "loss/logits": 0.18974261675029994, "step": 10170 }, { "epoch": 0.3393333333333333, "grad_norm": 31.125, "grad_norm_var": 2.798958333333333, "learning_rate": 0.0001, "loss": 7.4574, "loss/crossentropy": 2.156281590461731, "loss/hidden": 3.43984375, "loss/jsd": 0.0, "loss/logits": 0.2202395310625434, "step": 10180 }, { "epoch": 0.3396666666666667, "grad_norm": 32.25, "grad_norm_var": 2.42890625, "learning_rate": 0.0001, "loss": 7.4308, "loss/crossentropy": 2.0224428713321685, "loss/hidden": 3.396875, "loss/jsd": 0.0, "loss/logits": 0.19194987900555133, "step": 10190 }, { "epoch": 0.34, "grad_norm": 29.625, "grad_norm_var": 2.162239583333333, "learning_rate": 0.0001, "loss": 7.3199, "loss/crossentropy": 1.959970773756504, "loss/hidden": 3.434765625, "loss/jsd": 0.0, "loss/logits": 0.18226667661219836, "step": 10200 }, { "epoch": 0.3403333333333333, "grad_norm": 29.25, "grad_norm_var": 1.51640625, "learning_rate": 0.0001, "loss": 7.37, "loss/crossentropy": 1.8382697485387325, "loss/hidden": 3.467578125, "loss/jsd": 0.0, "loss/logits": 0.1870664067566395, "step": 10210 }, { "epoch": 0.3406666666666667, "grad_norm": 26.75, "grad_norm_var": 22.426822916666666, "learning_rate": 0.0001, "loss": 7.2256, "loss/crossentropy": 1.9372355163097381, "loss/hidden": 3.545703125, "loss/jsd": 0.0, "loss/logits": 0.19385621156543492, "step": 10220 }, { "epoch": 0.341, "grad_norm": 30.0, "grad_norm_var": 22.719791666666666, "learning_rate": 0.0001, "loss": 7.2463, "loss/crossentropy": 1.9387185171246528, "loss/hidden": 3.365625, "loss/jsd": 0.0, "loss/logits": 0.190226436406374, "step": 10230 }, { "epoch": 0.3413333333333333, "grad_norm": 28.0, "grad_norm_var": 3.409309895833333, "learning_rate": 0.0001, "loss": 7.384, "loss/crossentropy": 2.01721738576889, "loss/hidden": 3.43046875, "loss/jsd": 0.0, "loss/logits": 0.19744803626090288, "step": 10240 }, { "epoch": 0.3416666666666667, "grad_norm": 32.0, "grad_norm_var": 1.1311848958333333, "learning_rate": 0.0001, "loss": 7.3738, "loss/crossentropy": 2.1306037183851005, "loss/hidden": 3.33828125, "loss/jsd": 0.0, "loss/logits": 0.18626119964756072, "step": 10250 }, { "epoch": 0.342, "grad_norm": 30.375, "grad_norm_var": 2.705208333333333, "learning_rate": 0.0001, "loss": 7.3304, "loss/crossentropy": 1.9760440967977047, "loss/hidden": 3.39453125, "loss/jsd": 0.0, "loss/logits": 0.1743823006749153, "step": 10260 }, { "epoch": 0.3423333333333333, "grad_norm": 33.0, "grad_norm_var": 3.866080729166667, "learning_rate": 0.0001, "loss": 7.3247, "loss/crossentropy": 1.9067096278071403, "loss/hidden": 3.491796875, "loss/jsd": 0.0, "loss/logits": 0.18993092682212592, "step": 10270 }, { "epoch": 0.3426666666666667, "grad_norm": 27.875, "grad_norm_var": 4.789518229166666, "learning_rate": 0.0001, "loss": 7.3058, "loss/crossentropy": 1.9344390481710434, "loss/hidden": 3.434765625, "loss/jsd": 0.0, "loss/logits": 0.1799757920205593, "step": 10280 }, { "epoch": 0.343, "grad_norm": 31.125, "grad_norm_var": 2.3393229166666667, "learning_rate": 0.0001, "loss": 7.4636, "loss/crossentropy": 2.1442542493343355, "loss/hidden": 3.356640625, "loss/jsd": 0.0, "loss/logits": 0.18748485147953034, "step": 10290 }, { "epoch": 0.3433333333333333, "grad_norm": 35.5, "grad_norm_var": 3.841080729166667, "learning_rate": 0.0001, "loss": 7.2128, "loss/crossentropy": 1.7490588132292033, "loss/hidden": 3.414453125, "loss/jsd": 0.0, "loss/logits": 0.17819906566292049, "step": 10300 }, { "epoch": 0.3436666666666667, "grad_norm": 27.375, "grad_norm_var": 32.50305989583333, "learning_rate": 0.0001, "loss": 7.3543, "loss/crossentropy": 2.0724628672003744, "loss/hidden": 3.473046875, "loss/jsd": 0.0, "loss/logits": 0.19344675689935684, "step": 10310 }, { "epoch": 0.344, "grad_norm": 31.375, "grad_norm_var": 31.676822916666666, "learning_rate": 0.0001, "loss": 7.3339, "loss/crossentropy": 2.096195527911186, "loss/hidden": 3.4171875, "loss/jsd": 0.0, "loss/logits": 0.19898923728615045, "step": 10320 }, { "epoch": 0.3443333333333333, "grad_norm": 35.0, "grad_norm_var": 2.61640625, "learning_rate": 0.0001, "loss": 7.312, "loss/crossentropy": 2.027948096394539, "loss/hidden": 3.391796875, "loss/jsd": 0.0, "loss/logits": 0.1865765569731593, "step": 10330 }, { "epoch": 0.3446666666666667, "grad_norm": 29.0, "grad_norm_var": 3.027083333333333, "learning_rate": 0.0001, "loss": 7.3044, "loss/crossentropy": 2.01663868278265, "loss/hidden": 3.384765625, "loss/jsd": 0.0, "loss/logits": 0.18688542265444993, "step": 10340 }, { "epoch": 0.345, "grad_norm": 29.125, "grad_norm_var": 3.0747395833333333, "learning_rate": 0.0001, "loss": 7.2779, "loss/crossentropy": 2.0468428120017053, "loss/hidden": 3.358203125, "loss/jsd": 0.0, "loss/logits": 0.1800114180892706, "step": 10350 }, { "epoch": 0.3453333333333333, "grad_norm": 30.75, "grad_norm_var": 3.799934895833333, "learning_rate": 0.0001, "loss": 7.3994, "loss/crossentropy": 2.1751383394002914, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.19666383285075426, "step": 10360 }, { "epoch": 0.3456666666666667, "grad_norm": 30.25, "grad_norm_var": 2.57265625, "learning_rate": 0.0001, "loss": 7.3979, "loss/crossentropy": 2.0285842917859553, "loss/hidden": 3.54921875, "loss/jsd": 0.0, "loss/logits": 0.19900108613073825, "step": 10370 }, { "epoch": 0.346, "grad_norm": 29.75, "grad_norm_var": 1.6514973958333334, "learning_rate": 0.0001, "loss": 7.3511, "loss/crossentropy": 1.8374190464615823, "loss/hidden": 3.476953125, "loss/jsd": 0.0, "loss/logits": 0.19095862470567226, "step": 10380 }, { "epoch": 0.3463333333333333, "grad_norm": 29.0, "grad_norm_var": 3.8889973958333335, "learning_rate": 0.0001, "loss": 7.3309, "loss/crossentropy": 1.9641722440719604, "loss/hidden": 3.428515625, "loss/jsd": 0.0, "loss/logits": 0.1871830655261874, "step": 10390 }, { "epoch": 0.3466666666666667, "grad_norm": 29.125, "grad_norm_var": 4.235872395833334, "learning_rate": 0.0001, "loss": 7.3571, "loss/crossentropy": 1.8391975611448288, "loss/hidden": 3.478125, "loss/jsd": 0.0, "loss/logits": 0.18625678215175867, "step": 10400 }, { "epoch": 0.347, "grad_norm": 28.875, "grad_norm_var": 2.0400390625, "learning_rate": 0.0001, "loss": 7.2293, "loss/crossentropy": 1.9488431803882122, "loss/hidden": 3.463671875, "loss/jsd": 0.0, "loss/logits": 0.1848501109983772, "step": 10410 }, { "epoch": 0.3473333333333333, "grad_norm": 28.75, "grad_norm_var": 3.225, "learning_rate": 0.0001, "loss": 7.479, "loss/crossentropy": 2.073763258755207, "loss/hidden": 3.492578125, "loss/jsd": 0.0, "loss/logits": 0.2017035260796547, "step": 10420 }, { "epoch": 0.3476666666666667, "grad_norm": 29.5, "grad_norm_var": 3.7905598958333333, "learning_rate": 0.0001, "loss": 7.3495, "loss/crossentropy": 2.0903085105121137, "loss/hidden": 3.576953125, "loss/jsd": 0.0, "loss/logits": 0.21501483637839555, "step": 10430 }, { "epoch": 0.348, "grad_norm": 31.875, "grad_norm_var": 2.4999348958333334, "learning_rate": 0.0001, "loss": 7.2697, "loss/crossentropy": 1.8777795553207397, "loss/hidden": 3.5421875, "loss/jsd": 0.0, "loss/logits": 0.18941189125180244, "step": 10440 }, { "epoch": 0.34833333333333333, "grad_norm": 37.0, "grad_norm_var": 5.517122395833334, "learning_rate": 0.0001, "loss": 7.3266, "loss/crossentropy": 2.043860497325659, "loss/hidden": 3.3375, "loss/jsd": 0.0, "loss/logits": 0.1810751306824386, "step": 10450 }, { "epoch": 0.3486666666666667, "grad_norm": 30.625, "grad_norm_var": 5.372916666666667, "learning_rate": 0.0001, "loss": 7.3875, "loss/crossentropy": 2.1117652893066405, "loss/hidden": 3.4515625, "loss/jsd": 0.0, "loss/logits": 0.2107222313992679, "step": 10460 }, { "epoch": 0.349, "grad_norm": 31.0, "grad_norm_var": 2.095572916666667, "learning_rate": 0.0001, "loss": 7.3084, "loss/crossentropy": 2.0538375422358515, "loss/hidden": 3.355859375, "loss/jsd": 0.0, "loss/logits": 0.18809327706694604, "step": 10470 }, { "epoch": 0.34933333333333333, "grad_norm": 29.75, "grad_norm_var": 1.8947916666666667, "learning_rate": 0.0001, "loss": 7.3058, "loss/crossentropy": 1.8379462979733945, "loss/hidden": 3.39296875, "loss/jsd": 0.0, "loss/logits": 0.17395716523751617, "step": 10480 }, { "epoch": 0.3496666666666667, "grad_norm": 34.75, "grad_norm_var": 2.403125, "learning_rate": 0.0001, "loss": 7.3712, "loss/crossentropy": 1.9381973564624786, "loss/hidden": 3.39140625, "loss/jsd": 0.0, "loss/logits": 0.18506800020113587, "step": 10490 }, { "epoch": 0.35, "grad_norm": 40.25, "grad_norm_var": 2.2546849048681252e+18, "learning_rate": 0.0001, "loss": 7.4194, "loss/crossentropy": 2.0407851874828338, "loss/hidden": 3.509765625, "loss/jsd": 0.0, "loss/logits": 0.20148079190403223, "step": 10500 }, { "epoch": 0.35033333333333333, "grad_norm": 28.0, "grad_norm_var": 112.378125, "learning_rate": 0.0001, "loss": 7.1889, "loss/crossentropy": 1.8268959112465382, "loss/hidden": 3.38671875, "loss/jsd": 0.0, "loss/logits": 0.1751222198829055, "step": 10510 }, { "epoch": 0.3506666666666667, "grad_norm": 37.0, "grad_norm_var": 10.7494140625, "learning_rate": 0.0001, "loss": 7.3692, "loss/crossentropy": 1.8408964581787586, "loss/hidden": 3.301171875, "loss/jsd": 0.0, "loss/logits": 0.17180111538618803, "step": 10520 }, { "epoch": 0.351, "grad_norm": 27.875, "grad_norm_var": 8.537239583333333, "learning_rate": 0.0001, "loss": 7.2579, "loss/crossentropy": 1.8571249485015868, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.17471247361972927, "step": 10530 }, { "epoch": 0.35133333333333333, "grad_norm": 30.375, "grad_norm_var": 1.0635416666666666, "learning_rate": 0.0001, "loss": 7.3263, "loss/crossentropy": 1.9803461730480194, "loss/hidden": 3.36875, "loss/jsd": 0.0, "loss/logits": 0.16973881814628838, "step": 10540 }, { "epoch": 0.3516666666666667, "grad_norm": 31.125, "grad_norm_var": 16.555989583333332, "learning_rate": 0.0001, "loss": 7.4006, "loss/crossentropy": 1.9425342857837677, "loss/hidden": 3.578515625, "loss/jsd": 0.0, "loss/logits": 0.20237513203173876, "step": 10550 }, { "epoch": 0.352, "grad_norm": 29.375, "grad_norm_var": 19.328059895833334, "learning_rate": 0.0001, "loss": 7.21, "loss/crossentropy": 1.7168657176196576, "loss/hidden": 3.492578125, "loss/jsd": 0.0, "loss/logits": 0.19453960945829749, "step": 10560 }, { "epoch": 0.35233333333333333, "grad_norm": 32.5, "grad_norm_var": 2.854622395833333, "learning_rate": 0.0001, "loss": 7.4076, "loss/crossentropy": 2.0559593670070173, "loss/hidden": 3.525390625, "loss/jsd": 0.0, "loss/logits": 0.21057201102375983, "step": 10570 }, { "epoch": 0.3526666666666667, "grad_norm": 28.5, "grad_norm_var": 2.0801432291666666, "learning_rate": 0.0001, "loss": 7.23, "loss/crossentropy": 2.010274365544319, "loss/hidden": 3.369140625, "loss/jsd": 0.0, "loss/logits": 0.18564467392861844, "step": 10580 }, { "epoch": 0.353, "grad_norm": 28.375, "grad_norm_var": 4.68515625, "learning_rate": 0.0001, "loss": 7.4125, "loss/crossentropy": 2.03059800863266, "loss/hidden": 3.48984375, "loss/jsd": 0.0, "loss/logits": 0.2098971266299486, "step": 10590 }, { "epoch": 0.35333333333333333, "grad_norm": 32.75, "grad_norm_var": 4.137434895833334, "learning_rate": 0.0001, "loss": 7.266, "loss/crossentropy": 1.7149709843099117, "loss/hidden": 3.45390625, "loss/jsd": 0.0, "loss/logits": 0.1627730711363256, "step": 10600 }, { "epoch": 0.3536666666666667, "grad_norm": 28.875, "grad_norm_var": 6.114518229166666, "learning_rate": 0.0001, "loss": 7.3615, "loss/crossentropy": 2.0237951397895815, "loss/hidden": 3.523828125, "loss/jsd": 0.0, "loss/logits": 0.19934715293347835, "step": 10610 }, { "epoch": 0.354, "grad_norm": 31.75, "grad_norm_var": 4.555989583333333, "learning_rate": 0.0001, "loss": 7.3343, "loss/crossentropy": 1.9398625552654267, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.1788936140947044, "step": 10620 }, { "epoch": 0.35433333333333333, "grad_norm": 29.375, "grad_norm_var": 0.9863932291666667, "learning_rate": 0.0001, "loss": 7.4046, "loss/crossentropy": 2.2210143193602563, "loss/hidden": 3.348828125, "loss/jsd": 0.0, "loss/logits": 0.19667954742908478, "step": 10630 }, { "epoch": 0.3546666666666667, "grad_norm": 31.5, "grad_norm_var": 2.249739583333333, "learning_rate": 0.0001, "loss": 7.2826, "loss/crossentropy": 2.0427276700735093, "loss/hidden": 3.474609375, "loss/jsd": 0.0, "loss/logits": 0.19667604491114615, "step": 10640 }, { "epoch": 0.355, "grad_norm": 31.0, "grad_norm_var": 3.120833333333333, "learning_rate": 0.0001, "loss": 7.3429, "loss/crossentropy": 1.9891425952315331, "loss/hidden": 3.46328125, "loss/jsd": 0.0, "loss/logits": 0.1858096459880471, "step": 10650 }, { "epoch": 0.35533333333333333, "grad_norm": 35.75, "grad_norm_var": 4.804166666666666, "learning_rate": 0.0001, "loss": 7.291, "loss/crossentropy": 1.8062189549207688, "loss/hidden": 3.6046875, "loss/jsd": 0.0, "loss/logits": 0.19808114618062972, "step": 10660 }, { "epoch": 0.3556666666666667, "grad_norm": 28.625, "grad_norm_var": 4.758072916666666, "learning_rate": 0.0001, "loss": 7.4547, "loss/crossentropy": 1.9023952938616275, "loss/hidden": 3.39375, "loss/jsd": 0.0, "loss/logits": 0.16940380418673157, "step": 10670 }, { "epoch": 0.356, "grad_norm": 31.75, "grad_norm_var": 2.684309895833333, "learning_rate": 0.0001, "loss": 7.4414, "loss/crossentropy": 1.9917625807225705, "loss/hidden": 3.491015625, "loss/jsd": 0.0, "loss/logits": 0.19237943794578313, "step": 10680 }, { "epoch": 0.35633333333333334, "grad_norm": 29.0, "grad_norm_var": 1.7208333333333334, "learning_rate": 0.0001, "loss": 7.2503, "loss/crossentropy": 1.8534020900726318, "loss/hidden": 3.5328125, "loss/jsd": 0.0, "loss/logits": 0.18708914816379546, "step": 10690 }, { "epoch": 0.3566666666666667, "grad_norm": 28.25, "grad_norm_var": 7.1119140625, "learning_rate": 0.0001, "loss": 7.2343, "loss/crossentropy": 1.8421179167926311, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.17422460094094278, "step": 10700 }, { "epoch": 0.357, "grad_norm": 32.0, "grad_norm_var": 1.4546223958333333, "learning_rate": 0.0001, "loss": 7.3771, "loss/crossentropy": 1.855978224426508, "loss/hidden": 3.4625, "loss/jsd": 0.0, "loss/logits": 0.1802343945018947, "step": 10710 }, { "epoch": 0.35733333333333334, "grad_norm": 29.5, "grad_norm_var": 2.0160807291666667, "learning_rate": 0.0001, "loss": 7.3373, "loss/crossentropy": 2.013573457300663, "loss/hidden": 3.459765625, "loss/jsd": 0.0, "loss/logits": 0.19378562774509192, "step": 10720 }, { "epoch": 0.3576666666666667, "grad_norm": 32.0, "grad_norm_var": 2.459375, "learning_rate": 0.0001, "loss": 7.3042, "loss/crossentropy": 1.9303156457841397, "loss/hidden": 3.454296875, "loss/jsd": 0.0, "loss/logits": 0.21380280051380396, "step": 10730 }, { "epoch": 0.358, "grad_norm": 30.375, "grad_norm_var": 28.172916666666666, "learning_rate": 0.0001, "loss": 7.3259, "loss/crossentropy": 2.051717396825552, "loss/hidden": 3.46484375, "loss/jsd": 0.0, "loss/logits": 0.19064336661249398, "step": 10740 }, { "epoch": 0.35833333333333334, "grad_norm": 30.375, "grad_norm_var": 3.479622395833333, "learning_rate": 0.0001, "loss": 7.4542, "loss/crossentropy": 2.022121731936932, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.19138841945677995, "step": 10750 }, { "epoch": 0.3586666666666667, "grad_norm": 30.875, "grad_norm_var": 6.176822916666667, "learning_rate": 0.0001, "loss": 7.3554, "loss/crossentropy": 1.8345908604562282, "loss/hidden": 3.396875, "loss/jsd": 0.0, "loss/logits": 0.17368071423843504, "step": 10760 }, { "epoch": 0.359, "grad_norm": 29.125, "grad_norm_var": 6.619205729166667, "learning_rate": 0.0001, "loss": 7.2979, "loss/crossentropy": 2.1068938851356505, "loss/hidden": 3.43203125, "loss/jsd": 0.0, "loss/logits": 0.20207747891545297, "step": 10770 }, { "epoch": 0.35933333333333334, "grad_norm": 28.125, "grad_norm_var": 2.1759765625, "learning_rate": 0.0001, "loss": 7.2643, "loss/crossentropy": 1.9664267249405385, "loss/hidden": 3.542578125, "loss/jsd": 0.0, "loss/logits": 0.18419204112142323, "step": 10780 }, { "epoch": 0.3596666666666667, "grad_norm": 29.25, "grad_norm_var": 3.4155598958333333, "learning_rate": 0.0001, "loss": 7.4352, "loss/crossentropy": 2.0276738509535788, "loss/hidden": 3.39296875, "loss/jsd": 0.0, "loss/logits": 0.18934254106134177, "step": 10790 }, { "epoch": 0.36, "grad_norm": 28.125, "grad_norm_var": 2.7900390625, "learning_rate": 0.0001, "loss": 7.3371, "loss/crossentropy": 2.0140382796525955, "loss/hidden": 3.440625, "loss/jsd": 0.0, "loss/logits": 0.19737642928957938, "step": 10800 }, { "epoch": 0.36033333333333334, "grad_norm": 29.75, "grad_norm_var": 17.4947265625, "learning_rate": 0.0001, "loss": 7.2051, "loss/crossentropy": 1.8624363638460637, "loss/hidden": 3.395703125, "loss/jsd": 0.0, "loss/logits": 0.1820073583163321, "step": 10810 }, { "epoch": 0.3606666666666667, "grad_norm": 29.25, "grad_norm_var": 19.808072916666667, "learning_rate": 0.0001, "loss": 7.3056, "loss/crossentropy": 2.039102651178837, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.18711361046880484, "step": 10820 }, { "epoch": 0.361, "grad_norm": 39.0, "grad_norm_var": 10.036458333333334, "learning_rate": 0.0001, "loss": 7.2705, "loss/crossentropy": 2.011190990358591, "loss/hidden": 3.39140625, "loss/jsd": 0.0, "loss/logits": 0.1817564379889518, "step": 10830 }, { "epoch": 0.36133333333333334, "grad_norm": 32.25, "grad_norm_var": 6.141666666666667, "learning_rate": 0.0001, "loss": 7.3481, "loss/crossentropy": 1.9442940011620522, "loss/hidden": 3.509375, "loss/jsd": 0.0, "loss/logits": 0.19470974542200564, "step": 10840 }, { "epoch": 0.3616666666666667, "grad_norm": 31.875, "grad_norm_var": 2.85390625, "learning_rate": 0.0001, "loss": 7.401, "loss/crossentropy": 1.962213009595871, "loss/hidden": 3.50546875, "loss/jsd": 0.0, "loss/logits": 0.193806504458189, "step": 10850 }, { "epoch": 0.362, "grad_norm": 34.5, "grad_norm_var": 51.62890625, "learning_rate": 0.0001, "loss": 7.3433, "loss/crossentropy": 1.9836720064282418, "loss/hidden": 3.41015625, "loss/jsd": 0.0, "loss/logits": 0.19824857506901025, "step": 10860 }, { "epoch": 0.36233333333333334, "grad_norm": 46.75, "grad_norm_var": 128.83014322916668, "learning_rate": 0.0001, "loss": 7.2578, "loss/crossentropy": 1.9726637572050094, "loss/hidden": 3.452734375, "loss/jsd": 0.0, "loss/logits": 0.20154060889035463, "step": 10870 }, { "epoch": 0.3626666666666667, "grad_norm": 43.25, "grad_norm_var": 93.84055989583334, "learning_rate": 0.0001, "loss": 7.1637, "loss/crossentropy": 1.9339853323996068, "loss/hidden": 3.418359375, "loss/jsd": 0.0, "loss/logits": 0.1780510688200593, "step": 10880 }, { "epoch": 0.363, "grad_norm": 41.5, "grad_norm_var": 59.430989583333336, "learning_rate": 0.0001, "loss": 7.1386, "loss/crossentropy": 1.991148991137743, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.1790566130541265, "step": 10890 }, { "epoch": 0.36333333333333334, "grad_norm": 38.5, "grad_norm_var": 22.449934895833334, "learning_rate": 0.0001, "loss": 7.2013, "loss/crossentropy": 1.999082264304161, "loss/hidden": 3.437890625, "loss/jsd": 0.0, "loss/logits": 0.21667805183678865, "step": 10900 }, { "epoch": 0.3636666666666667, "grad_norm": 68.5, "grad_norm_var": 105.7, "learning_rate": 0.0001, "loss": 7.2038, "loss/crossentropy": 1.9558657839894296, "loss/hidden": 3.357421875, "loss/jsd": 0.0, "loss/logits": 0.18473593555390835, "step": 10910 }, { "epoch": 0.364, "grad_norm": 29.625, "grad_norm_var": 119.47024739583334, "learning_rate": 0.0001, "loss": 7.3083, "loss/crossentropy": 2.095320004224777, "loss/hidden": 3.465234375, "loss/jsd": 0.0, "loss/logits": 0.1940501093864441, "step": 10920 }, { "epoch": 0.36433333333333334, "grad_norm": 36.25, "grad_norm_var": 16.53515625, "learning_rate": 0.0001, "loss": 7.2246, "loss/crossentropy": 1.9507791988551617, "loss/hidden": 3.3796875, "loss/jsd": 0.0, "loss/logits": 0.17501657009124755, "step": 10930 }, { "epoch": 0.36466666666666664, "grad_norm": 29.0, "grad_norm_var": 8.79765625, "learning_rate": 0.0001, "loss": 7.2231, "loss/crossentropy": 1.8772604808211326, "loss/hidden": 3.380859375, "loss/jsd": 0.0, "loss/logits": 0.18701328663155437, "step": 10940 }, { "epoch": 0.365, "grad_norm": 31.0, "grad_norm_var": 5.7181640625, "learning_rate": 0.0001, "loss": 7.3604, "loss/crossentropy": 1.971347527205944, "loss/hidden": 3.451171875, "loss/jsd": 0.0, "loss/logits": 0.2003970267251134, "step": 10950 }, { "epoch": 0.36533333333333334, "grad_norm": 33.75, "grad_norm_var": 9.2087890625, "learning_rate": 0.0001, "loss": 7.3319, "loss/crossentropy": 1.8551844030618667, "loss/hidden": 3.43515625, "loss/jsd": 0.0, "loss/logits": 0.1821993922814727, "step": 10960 }, { "epoch": 0.36566666666666664, "grad_norm": 36.0, "grad_norm_var": 8.335416666666667, "learning_rate": 0.0001, "loss": 7.3108, "loss/crossentropy": 1.9584839902818203, "loss/hidden": 3.466015625, "loss/jsd": 0.0, "loss/logits": 0.19148140586912632, "step": 10970 }, { "epoch": 0.366, "grad_norm": 27.375, "grad_norm_var": 8.914322916666666, "learning_rate": 0.0001, "loss": 7.0811, "loss/crossentropy": 1.739734296873212, "loss/hidden": 3.39765625, "loss/jsd": 0.0, "loss/logits": 0.17282773293554782, "step": 10980 }, { "epoch": 0.36633333333333334, "grad_norm": 28.5, "grad_norm_var": 10.101497395833333, "learning_rate": 0.0001, "loss": 7.3265, "loss/crossentropy": 2.0793154716491697, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.1731626907363534, "step": 10990 }, { "epoch": 0.36666666666666664, "grad_norm": 29.875, "grad_norm_var": 12.083268229166666, "learning_rate": 0.0001, "loss": 7.2872, "loss/crossentropy": 1.9717597007751464, "loss/hidden": 3.55859375, "loss/jsd": 0.0, "loss/logits": 0.20700400788336992, "step": 11000 }, { "epoch": 0.367, "grad_norm": 32.75, "grad_norm_var": 428.325, "learning_rate": 0.0001, "loss": 7.3197, "loss/crossentropy": 1.9812920153141023, "loss/hidden": 3.5421875, "loss/jsd": 0.0, "loss/logits": 0.1934831079095602, "step": 11010 }, { "epoch": 0.36733333333333335, "grad_norm": 31.25, "grad_norm_var": 6.582747395833334, "learning_rate": 0.0001, "loss": 7.1897, "loss/crossentropy": 1.9106600619852543, "loss/hidden": 3.3265625, "loss/jsd": 0.0, "loss/logits": 0.1756363559514284, "step": 11020 }, { "epoch": 0.36766666666666664, "grad_norm": 29.875, "grad_norm_var": 5.092122395833333, "learning_rate": 0.0001, "loss": 7.2408, "loss/crossentropy": 1.8943538144230843, "loss/hidden": 3.5375, "loss/jsd": 0.0, "loss/logits": 0.200858642347157, "step": 11030 }, { "epoch": 0.368, "grad_norm": 30.375, "grad_norm_var": 4.266080729166666, "learning_rate": 0.0001, "loss": 7.2531, "loss/crossentropy": 2.014334838092327, "loss/hidden": 3.46484375, "loss/jsd": 0.0, "loss/logits": 0.19320399397984148, "step": 11040 }, { "epoch": 0.36833333333333335, "grad_norm": 30.375, "grad_norm_var": 3.60390625, "learning_rate": 0.0001, "loss": 7.2295, "loss/crossentropy": 2.045776247233152, "loss/hidden": 3.4171875, "loss/jsd": 0.0, "loss/logits": 0.18819756340235472, "step": 11050 }, { "epoch": 0.36866666666666664, "grad_norm": 32.5, "grad_norm_var": 3.91875, "learning_rate": 0.0001, "loss": 7.3359, "loss/crossentropy": 1.9073569118976592, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.1949919333681464, "step": 11060 }, { "epoch": 0.369, "grad_norm": 28.375, "grad_norm_var": 4.01015625, "learning_rate": 0.0001, "loss": 7.2347, "loss/crossentropy": 1.822785534709692, "loss/hidden": 3.42265625, "loss/jsd": 0.0, "loss/logits": 0.17801302755251527, "step": 11070 }, { "epoch": 0.36933333333333335, "grad_norm": 29.125, "grad_norm_var": 5.398958333333334, "learning_rate": 0.0001, "loss": 7.3647, "loss/crossentropy": 1.921547295153141, "loss/hidden": 3.52578125, "loss/jsd": 0.0, "loss/logits": 0.1915199004113674, "step": 11080 }, { "epoch": 0.36966666666666664, "grad_norm": 27.5, "grad_norm_var": 7.317708333333333, "learning_rate": 0.0001, "loss": 7.227, "loss/crossentropy": 2.0283150181174276, "loss/hidden": 3.49921875, "loss/jsd": 0.0, "loss/logits": 0.204646809771657, "step": 11090 }, { "epoch": 0.37, "grad_norm": 28.625, "grad_norm_var": 9.4125, "learning_rate": 0.0001, "loss": 7.1386, "loss/crossentropy": 1.8994303956627845, "loss/hidden": 3.416796875, "loss/jsd": 0.0, "loss/logits": 0.17796771945431827, "step": 11100 }, { "epoch": 0.37033333333333335, "grad_norm": 34.25, "grad_norm_var": 3.3551432291666665, "learning_rate": 0.0001, "loss": 7.2874, "loss/crossentropy": 2.08044124096632, "loss/hidden": 3.41015625, "loss/jsd": 0.0, "loss/logits": 0.19769861306995154, "step": 11110 }, { "epoch": 0.37066666666666664, "grad_norm": 30.5, "grad_norm_var": 5.558072916666666, "learning_rate": 0.0001, "loss": 7.3009, "loss/crossentropy": 1.9382298722863198, "loss/hidden": 3.47421875, "loss/jsd": 0.0, "loss/logits": 0.1852950079366565, "step": 11120 }, { "epoch": 0.371, "grad_norm": 31.375, "grad_norm_var": 7.114322916666667, "learning_rate": 0.0001, "loss": 7.24, "loss/crossentropy": 2.124358855187893, "loss/hidden": 3.378125, "loss/jsd": 0.0, "loss/logits": 0.1883678013458848, "step": 11130 }, { "epoch": 0.37133333333333335, "grad_norm": 32.0, "grad_norm_var": 1.9947265625, "learning_rate": 0.0001, "loss": 7.3562, "loss/crossentropy": 2.1665535509586333, "loss/hidden": 3.332421875, "loss/jsd": 0.0, "loss/logits": 0.18988040778785945, "step": 11140 }, { "epoch": 0.37166666666666665, "grad_norm": 28.125, "grad_norm_var": 3.3124348958333334, "learning_rate": 0.0001, "loss": 7.2833, "loss/crossentropy": 1.8662879288196563, "loss/hidden": 3.47734375, "loss/jsd": 0.0, "loss/logits": 0.19359316304326057, "step": 11150 }, { "epoch": 0.372, "grad_norm": 29.0, "grad_norm_var": 4.192708333333333, "learning_rate": 0.0001, "loss": 7.3772, "loss/crossentropy": 2.083827328681946, "loss/hidden": 3.37578125, "loss/jsd": 0.0, "loss/logits": 0.1930124057456851, "step": 11160 }, { "epoch": 0.37233333333333335, "grad_norm": 27.5, "grad_norm_var": 3.8061848958333333, "learning_rate": 0.0001, "loss": 7.2488, "loss/crossentropy": 1.9408250719308853, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.18979046139866113, "step": 11170 }, { "epoch": 0.37266666666666665, "grad_norm": 29.0, "grad_norm_var": 1.5934895833333333, "learning_rate": 0.0001, "loss": 7.2662, "loss/crossentropy": 1.983980904519558, "loss/hidden": 3.337890625, "loss/jsd": 0.0, "loss/logits": 0.18191885035485028, "step": 11180 }, { "epoch": 0.373, "grad_norm": 28.5, "grad_norm_var": 13.3431640625, "learning_rate": 0.0001, "loss": 7.3454, "loss/crossentropy": 2.050442532449961, "loss/hidden": 3.462109375, "loss/jsd": 0.0, "loss/logits": 0.18640944473445414, "step": 11190 }, { "epoch": 0.37333333333333335, "grad_norm": 29.875, "grad_norm_var": 12.226822916666666, "learning_rate": 0.0001, "loss": 7.2504, "loss/crossentropy": 1.8866645142436027, "loss/hidden": 3.383984375, "loss/jsd": 0.0, "loss/logits": 0.18141323197633027, "step": 11200 }, { "epoch": 0.37366666666666665, "grad_norm": 32.25, "grad_norm_var": 10.409375, "learning_rate": 0.0001, "loss": 7.2505, "loss/crossentropy": 1.9389467768371105, "loss/hidden": 3.427734375, "loss/jsd": 0.0, "loss/logits": 0.20237924791872502, "step": 11210 }, { "epoch": 0.374, "grad_norm": 28.125, "grad_norm_var": 14.354166666666666, "learning_rate": 0.0001, "loss": 7.3194, "loss/crossentropy": 1.8974710524082183, "loss/hidden": 3.39921875, "loss/jsd": 0.0, "loss/logits": 0.1762718055397272, "step": 11220 }, { "epoch": 0.37433333333333335, "grad_norm": 35.5, "grad_norm_var": 10.313541666666667, "learning_rate": 0.0001, "loss": 7.3402, "loss/crossentropy": 2.039930585026741, "loss/hidden": 3.53203125, "loss/jsd": 0.0, "loss/logits": 0.2104919407516718, "step": 11230 }, { "epoch": 0.37466666666666665, "grad_norm": 28.375, "grad_norm_var": 4.6166015625, "learning_rate": 0.0001, "loss": 7.365, "loss/crossentropy": 1.8570226393640041, "loss/hidden": 3.523828125, "loss/jsd": 0.0, "loss/logits": 0.1934488659724593, "step": 11240 }, { "epoch": 0.375, "grad_norm": 30.25, "grad_norm_var": 1.5410807291666666, "learning_rate": 0.0001, "loss": 7.3019, "loss/crossentropy": 1.9520951524376868, "loss/hidden": 3.507421875, "loss/jsd": 0.0, "loss/logits": 0.19358678720891476, "step": 11250 }, { "epoch": 0.37533333333333335, "grad_norm": 30.625, "grad_norm_var": 1.9520833333333334, "learning_rate": 0.0001, "loss": 7.2311, "loss/crossentropy": 1.8577851578593254, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.18313570264726878, "step": 11260 }, { "epoch": 0.37566666666666665, "grad_norm": 28.5, "grad_norm_var": 6.084309895833333, "learning_rate": 0.0001, "loss": 7.2725, "loss/crossentropy": 2.025163532793522, "loss/hidden": 3.3875, "loss/jsd": 0.0, "loss/logits": 0.18662182968109847, "step": 11270 }, { "epoch": 0.376, "grad_norm": 29.625, "grad_norm_var": 11.378580729166666, "learning_rate": 0.0001, "loss": 7.2665, "loss/crossentropy": 2.0567542009055613, "loss/hidden": 3.4484375, "loss/jsd": 0.0, "loss/logits": 0.20304533503949643, "step": 11280 }, { "epoch": 0.37633333333333335, "grad_norm": 31.5, "grad_norm_var": 9.76875, "learning_rate": 0.0001, "loss": 7.3736, "loss/crossentropy": 2.0652671001851557, "loss/hidden": 3.345703125, "loss/jsd": 0.0, "loss/logits": 0.18979566507041454, "step": 11290 }, { "epoch": 0.37666666666666665, "grad_norm": 29.75, "grad_norm_var": 1.72890625, "learning_rate": 0.0001, "loss": 7.2907, "loss/crossentropy": 2.1199740901589395, "loss/hidden": 3.33125, "loss/jsd": 0.0, "loss/logits": 0.18486832501366735, "step": 11300 }, { "epoch": 0.377, "grad_norm": 34.25, "grad_norm_var": 61.1212890625, "learning_rate": 0.0001, "loss": 7.2891, "loss/crossentropy": 2.004186879098415, "loss/hidden": 3.455078125, "loss/jsd": 0.0, "loss/logits": 0.18873196374624968, "step": 11310 }, { "epoch": 0.37733333333333335, "grad_norm": 31.25, "grad_norm_var": 62.38587239583333, "learning_rate": 0.0001, "loss": 7.3144, "loss/crossentropy": 2.1030808970332147, "loss/hidden": 3.36484375, "loss/jsd": 0.0, "loss/logits": 0.18695523235946893, "step": 11320 }, { "epoch": 0.37766666666666665, "grad_norm": 31.25, "grad_norm_var": 2.292122395833333, "learning_rate": 0.0001, "loss": 7.2876, "loss/crossentropy": 1.8798470810055732, "loss/hidden": 3.530859375, "loss/jsd": 0.0, "loss/logits": 0.19491705913096666, "step": 11330 }, { "epoch": 0.378, "grad_norm": 28.625, "grad_norm_var": 5.128059895833333, "learning_rate": 0.0001, "loss": 7.3545, "loss/crossentropy": 1.976115906983614, "loss/hidden": 3.44375, "loss/jsd": 0.0, "loss/logits": 0.19396853167563677, "step": 11340 }, { "epoch": 0.37833333333333335, "grad_norm": 50.25, "grad_norm_var": 32.458333333333336, "learning_rate": 0.0001, "loss": 7.3213, "loss/crossentropy": 1.9236118108034135, "loss/hidden": 3.49453125, "loss/jsd": 0.0, "loss/logits": 0.1946556368842721, "step": 11350 }, { "epoch": 0.37866666666666665, "grad_norm": 29.125, "grad_norm_var": 29.816666666666666, "learning_rate": 0.0001, "loss": 7.4583, "loss/crossentropy": 2.006420224905014, "loss/hidden": 3.415625, "loss/jsd": 0.0, "loss/logits": 0.19698526579886674, "step": 11360 }, { "epoch": 0.379, "grad_norm": 28.625, "grad_norm_var": 19.915559895833333, "learning_rate": 0.0001, "loss": 7.2804, "loss/crossentropy": 2.0187849469482897, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.19657179582864046, "step": 11370 }, { "epoch": 0.37933333333333336, "grad_norm": 32.25, "grad_norm_var": 6.720833333333333, "learning_rate": 0.0001, "loss": 7.3778, "loss/crossentropy": 1.97935349047184, "loss/hidden": 3.398828125, "loss/jsd": 0.0, "loss/logits": 0.18184738792479038, "step": 11380 }, { "epoch": 0.37966666666666665, "grad_norm": 29.75, "grad_norm_var": 20.3478515625, "learning_rate": 0.0001, "loss": 7.3626, "loss/crossentropy": 1.941196110844612, "loss/hidden": 3.457421875, "loss/jsd": 0.0, "loss/logits": 0.18417117949575185, "step": 11390 }, { "epoch": 0.38, "grad_norm": 28.5, "grad_norm_var": 23.555989583333332, "learning_rate": 0.0001, "loss": 7.1861, "loss/crossentropy": 1.8577854558825493, "loss/hidden": 3.424609375, "loss/jsd": 0.0, "loss/logits": 0.17986119631677866, "step": 11400 }, { "epoch": 0.38033333333333336, "grad_norm": 28.5, "grad_norm_var": 6.016080729166666, "learning_rate": 0.0001, "loss": 7.1142, "loss/crossentropy": 2.0140000000596046, "loss/hidden": 3.3609375, "loss/jsd": 0.0, "loss/logits": 0.17388365007936954, "step": 11410 }, { "epoch": 0.38066666666666665, "grad_norm": 29.875, "grad_norm_var": 3.428125, "learning_rate": 0.0001, "loss": 7.3519, "loss/crossentropy": 2.038056407123804, "loss/hidden": 3.44609375, "loss/jsd": 0.0, "loss/logits": 0.18921603113412858, "step": 11420 }, { "epoch": 0.381, "grad_norm": 29.75, "grad_norm_var": 2.7895182291666667, "learning_rate": 0.0001, "loss": 7.3085, "loss/crossentropy": 1.9776552334427833, "loss/hidden": 3.471875, "loss/jsd": 0.0, "loss/logits": 0.19061334989964962, "step": 11430 }, { "epoch": 0.38133333333333336, "grad_norm": 29.25, "grad_norm_var": 2.9155598958333333, "learning_rate": 0.0001, "loss": 7.3867, "loss/crossentropy": 1.8662955924868583, "loss/hidden": 3.337890625, "loss/jsd": 0.0, "loss/logits": 0.17751897126436234, "step": 11440 }, { "epoch": 0.38166666666666665, "grad_norm": 29.5, "grad_norm_var": 20.0197265625, "learning_rate": 0.0001, "loss": 7.4094, "loss/crossentropy": 2.132743002474308, "loss/hidden": 3.455859375, "loss/jsd": 0.0, "loss/logits": 0.20174112562090157, "step": 11450 }, { "epoch": 0.382, "grad_norm": 28.75, "grad_norm_var": 9.4259765625, "learning_rate": 0.0001, "loss": 7.2847, "loss/crossentropy": 2.095056842267513, "loss/hidden": 3.39765625, "loss/jsd": 0.0, "loss/logits": 0.19043400809168815, "step": 11460 }, { "epoch": 0.38233333333333336, "grad_norm": 29.125, "grad_norm_var": 2.9884765625, "learning_rate": 0.0001, "loss": 7.3759, "loss/crossentropy": 1.9882144063711167, "loss/hidden": 3.414453125, "loss/jsd": 0.0, "loss/logits": 0.18963760938495397, "step": 11470 }, { "epoch": 0.38266666666666665, "grad_norm": 32.0, "grad_norm_var": 14.9375, "learning_rate": 0.0001, "loss": 7.2853, "loss/crossentropy": 1.6863661363720894, "loss/hidden": 3.434375, "loss/jsd": 0.0, "loss/logits": 0.17473275251686574, "step": 11480 }, { "epoch": 0.383, "grad_norm": 34.75, "grad_norm_var": 15.467643229166667, "learning_rate": 0.0001, "loss": 7.3164, "loss/crossentropy": 1.993257286399603, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.1981072451453656, "step": 11490 }, { "epoch": 0.38333333333333336, "grad_norm": 28.75, "grad_norm_var": 7.1587890625, "learning_rate": 0.0001, "loss": 7.3117, "loss/crossentropy": 2.096546545624733, "loss/hidden": 3.373046875, "loss/jsd": 0.0, "loss/logits": 0.19491787813603878, "step": 11500 }, { "epoch": 0.38366666666666666, "grad_norm": 33.0, "grad_norm_var": 4.074739583333334, "learning_rate": 0.0001, "loss": 7.3319, "loss/crossentropy": 2.0012700825929643, "loss/hidden": 3.371875, "loss/jsd": 0.0, "loss/logits": 0.19879192877560853, "step": 11510 }, { "epoch": 0.384, "grad_norm": 39.25, "grad_norm_var": 7.322916666666667, "learning_rate": 0.0001, "loss": 7.2685, "loss/crossentropy": 1.84815816283226, "loss/hidden": 3.383203125, "loss/jsd": 0.0, "loss/logits": 0.1769365394487977, "step": 11520 }, { "epoch": 0.38433333333333336, "grad_norm": 30.375, "grad_norm_var": 9.282291666666667, "learning_rate": 0.0001, "loss": 7.156, "loss/crossentropy": 1.8190936289727688, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.17446368308737875, "step": 11530 }, { "epoch": 0.38466666666666666, "grad_norm": 29.75, "grad_norm_var": 6.3353515625, "learning_rate": 0.0001, "loss": 7.2761, "loss/crossentropy": 1.9552165806293487, "loss/hidden": 3.496484375, "loss/jsd": 0.0, "loss/logits": 0.19698831299319863, "step": 11540 }, { "epoch": 0.385, "grad_norm": 33.0, "grad_norm_var": 8.31015625, "learning_rate": 0.0001, "loss": 7.2215, "loss/crossentropy": 1.8027352683246136, "loss/hidden": 3.333984375, "loss/jsd": 0.0, "loss/logits": 0.16687594885006546, "step": 11550 }, { "epoch": 0.38533333333333336, "grad_norm": 27.25, "grad_norm_var": 8.7181640625, "learning_rate": 0.0001, "loss": 7.1856, "loss/crossentropy": 1.9099013559520244, "loss/hidden": 3.353125, "loss/jsd": 0.0, "loss/logits": 0.18390046283602715, "step": 11560 }, { "epoch": 0.38566666666666666, "grad_norm": 29.25, "grad_norm_var": 14.662239583333333, "learning_rate": 0.0001, "loss": 7.3542, "loss/crossentropy": 2.0861241780221462, "loss/hidden": 3.45859375, "loss/jsd": 0.0, "loss/logits": 0.18345039603300392, "step": 11570 }, { "epoch": 0.386, "grad_norm": 30.0, "grad_norm_var": 7.45390625, "learning_rate": 0.0001, "loss": 7.2619, "loss/crossentropy": 1.8799682959914208, "loss/hidden": 3.36328125, "loss/jsd": 0.0, "loss/logits": 0.19429171830415726, "step": 11580 }, { "epoch": 0.3863333333333333, "grad_norm": 37.75, "grad_norm_var": 28.7806640625, "learning_rate": 0.0001, "loss": 7.2384, "loss/crossentropy": 1.9282376438379287, "loss/hidden": 3.44296875, "loss/jsd": 0.0, "loss/logits": 0.18505943827331067, "step": 11590 }, { "epoch": 0.38666666666666666, "grad_norm": 29.5, "grad_norm_var": 6.515559895833333, "learning_rate": 0.0001, "loss": 7.3731, "loss/crossentropy": 2.0443509936332704, "loss/hidden": 3.522265625, "loss/jsd": 0.0, "loss/logits": 0.20932536013424397, "step": 11600 }, { "epoch": 0.387, "grad_norm": 30.5, "grad_norm_var": 6.696809895833334, "learning_rate": 0.0001, "loss": 7.324, "loss/crossentropy": 1.856078075617552, "loss/hidden": 3.4109375, "loss/jsd": 0.0, "loss/logits": 0.1707373504526913, "step": 11610 }, { "epoch": 0.3873333333333333, "grad_norm": 30.75, "grad_norm_var": 12.4041015625, "learning_rate": 0.0001, "loss": 7.2623, "loss/crossentropy": 2.0671856701374054, "loss/hidden": 3.375390625, "loss/jsd": 0.0, "loss/logits": 0.18363108430057765, "step": 11620 }, { "epoch": 0.38766666666666666, "grad_norm": 28.0, "grad_norm_var": 13.002018229166667, "learning_rate": 0.0001, "loss": 7.3935, "loss/crossentropy": 1.959590031951666, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.1910666786134243, "step": 11630 }, { "epoch": 0.388, "grad_norm": 34.25, "grad_norm_var": 14.6900390625, "learning_rate": 0.0001, "loss": 7.3088, "loss/crossentropy": 2.075892502069473, "loss/hidden": 3.3171875, "loss/jsd": 0.0, "loss/logits": 0.18459956999868155, "step": 11640 }, { "epoch": 0.3883333333333333, "grad_norm": 28.5, "grad_norm_var": 17.138997395833332, "learning_rate": 0.0001, "loss": 7.2949, "loss/crossentropy": 1.8865861922502518, "loss/hidden": 3.44296875, "loss/jsd": 0.0, "loss/logits": 0.19968612883239983, "step": 11650 }, { "epoch": 0.38866666666666666, "grad_norm": 29.125, "grad_norm_var": 6.85, "learning_rate": 0.0001, "loss": 7.2809, "loss/crossentropy": 1.9073393203318119, "loss/hidden": 3.448046875, "loss/jsd": 0.0, "loss/logits": 0.18070550030097365, "step": 11660 }, { "epoch": 0.389, "grad_norm": 28.625, "grad_norm_var": 5.130143229166666, "learning_rate": 0.0001, "loss": 7.1172, "loss/crossentropy": 2.0171332240104674, "loss/hidden": 3.433984375, "loss/jsd": 0.0, "loss/logits": 0.18159891851246357, "step": 11670 }, { "epoch": 0.3893333333333333, "grad_norm": 27.25, "grad_norm_var": 2.1962890625, "learning_rate": 0.0001, "loss": 7.1721, "loss/crossentropy": 2.0392806589603425, "loss/hidden": 3.391796875, "loss/jsd": 0.0, "loss/logits": 0.18174661844968795, "step": 11680 }, { "epoch": 0.38966666666666666, "grad_norm": 28.75, "grad_norm_var": 8.901041666666666, "learning_rate": 0.0001, "loss": 7.341, "loss/crossentropy": 1.9692971974611282, "loss/hidden": 3.428515625, "loss/jsd": 0.0, "loss/logits": 0.18224202431738376, "step": 11690 }, { "epoch": 0.39, "grad_norm": 27.875, "grad_norm_var": 1.8988932291666667, "learning_rate": 0.0001, "loss": 7.2343, "loss/crossentropy": 1.912377967685461, "loss/hidden": 3.517578125, "loss/jsd": 0.0, "loss/logits": 0.18551887096837164, "step": 11700 }, { "epoch": 0.3903333333333333, "grad_norm": 26.5, "grad_norm_var": 3.04765625, "learning_rate": 0.0001, "loss": 7.1865, "loss/crossentropy": 1.829298423230648, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.17527710273861885, "step": 11710 }, { "epoch": 0.39066666666666666, "grad_norm": 44.0, "grad_norm_var": 19.176822916666666, "learning_rate": 0.0001, "loss": 7.2462, "loss/crossentropy": 1.8122045263648032, "loss/hidden": 3.468359375, "loss/jsd": 0.0, "loss/logits": 0.17783846724778413, "step": 11720 }, { "epoch": 0.391, "grad_norm": 29.0, "grad_norm_var": 16.9837890625, "learning_rate": 0.0001, "loss": 7.2204, "loss/crossentropy": 1.9650872960686683, "loss/hidden": 3.44375, "loss/jsd": 0.0, "loss/logits": 0.18882772848010063, "step": 11730 }, { "epoch": 0.3913333333333333, "grad_norm": 27.625, "grad_norm_var": 17.8291015625, "learning_rate": 0.0001, "loss": 7.3414, "loss/crossentropy": 2.009046438336372, "loss/hidden": 3.31484375, "loss/jsd": 0.0, "loss/logits": 0.1767511872574687, "step": 11740 }, { "epoch": 0.39166666666666666, "grad_norm": 31.5, "grad_norm_var": 1.76875, "learning_rate": 0.0001, "loss": 7.2054, "loss/crossentropy": 1.932949498295784, "loss/hidden": 3.366015625, "loss/jsd": 0.0, "loss/logits": 0.18321319725364446, "step": 11750 }, { "epoch": 0.392, "grad_norm": 29.25, "grad_norm_var": 2.4622395833333335, "learning_rate": 0.0001, "loss": 7.2481, "loss/crossentropy": 2.1433114275336265, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.19988296404480935, "step": 11760 }, { "epoch": 0.3923333333333333, "grad_norm": 29.5, "grad_norm_var": 1.83125, "learning_rate": 0.0001, "loss": 7.2077, "loss/crossentropy": 2.0650949962437153, "loss/hidden": 3.409765625, "loss/jsd": 0.0, "loss/logits": 0.19057194776833059, "step": 11770 }, { "epoch": 0.39266666666666666, "grad_norm": 28.125, "grad_norm_var": 2.0061848958333335, "learning_rate": 0.0001, "loss": 7.3879, "loss/crossentropy": 2.024843217432499, "loss/hidden": 3.342578125, "loss/jsd": 0.0, "loss/logits": 0.18240368627011777, "step": 11780 }, { "epoch": 0.393, "grad_norm": 30.375, "grad_norm_var": 1.43515625, "learning_rate": 0.0001, "loss": 7.2563, "loss/crossentropy": 2.128304650634527, "loss/hidden": 3.482421875, "loss/jsd": 0.0, "loss/logits": 0.22087500467896462, "step": 11790 }, { "epoch": 0.3933333333333333, "grad_norm": 29.25, "grad_norm_var": 1.7270182291666667, "learning_rate": 0.0001, "loss": 7.27, "loss/crossentropy": 1.8697856433689595, "loss/hidden": 3.465234375, "loss/jsd": 0.0, "loss/logits": 0.18453482510522007, "step": 11800 }, { "epoch": 0.39366666666666666, "grad_norm": 30.375, "grad_norm_var": 4.525, "learning_rate": 0.0001, "loss": 7.3767, "loss/crossentropy": 2.0748488709330557, "loss/hidden": 3.473828125, "loss/jsd": 0.0, "loss/logits": 0.21022844426333903, "step": 11810 }, { "epoch": 0.394, "grad_norm": 30.375, "grad_norm_var": 14.034375, "learning_rate": 0.0001, "loss": 7.3479, "loss/crossentropy": 1.8195515662431716, "loss/hidden": 3.345703125, "loss/jsd": 0.0, "loss/logits": 0.17000398375093936, "step": 11820 }, { "epoch": 0.3943333333333333, "grad_norm": 32.75, "grad_norm_var": 54.39108072916667, "learning_rate": 0.0001, "loss": 7.4182, "loss/crossentropy": 1.885833379626274, "loss/hidden": 3.497265625, "loss/jsd": 0.0, "loss/logits": 0.20400924338027834, "step": 11830 }, { "epoch": 0.39466666666666667, "grad_norm": 30.0, "grad_norm_var": 49.18723958333333, "learning_rate": 0.0001, "loss": 7.247, "loss/crossentropy": 1.8978069722652435, "loss/hidden": 3.401171875, "loss/jsd": 0.0, "loss/logits": 0.18015803284943105, "step": 11840 }, { "epoch": 0.395, "grad_norm": 27.875, "grad_norm_var": 3.4962890625, "learning_rate": 0.0001, "loss": 7.0722, "loss/crossentropy": 1.7836640104651451, "loss/hidden": 3.47890625, "loss/jsd": 0.0, "loss/logits": 0.17640065625309945, "step": 11850 }, { "epoch": 0.3953333333333333, "grad_norm": 30.0, "grad_norm_var": 11.417122395833333, "learning_rate": 0.0001, "loss": 7.347, "loss/crossentropy": 1.756436760723591, "loss/hidden": 3.38125, "loss/jsd": 0.0, "loss/logits": 0.18073352687060834, "step": 11860 }, { "epoch": 0.39566666666666667, "grad_norm": 31.0, "grad_norm_var": 9.781705729166667, "learning_rate": 0.0001, "loss": 7.2364, "loss/crossentropy": 1.8596447683870792, "loss/hidden": 3.465234375, "loss/jsd": 0.0, "loss/logits": 0.19009564155712724, "step": 11870 }, { "epoch": 0.396, "grad_norm": 33.5, "grad_norm_var": 3.3447916666666666, "learning_rate": 0.0001, "loss": 7.2895, "loss/crossentropy": 1.8756249353289605, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.19262257143855094, "step": 11880 }, { "epoch": 0.3963333333333333, "grad_norm": 32.25, "grad_norm_var": 8.4494140625, "learning_rate": 0.0001, "loss": 7.4226, "loss/crossentropy": 2.0137244418263434, "loss/hidden": 3.43671875, "loss/jsd": 0.0, "loss/logits": 0.18827710039913653, "step": 11890 }, { "epoch": 0.39666666666666667, "grad_norm": 47.75, "grad_norm_var": 2.928465628755709e+18, "learning_rate": 0.0001, "loss": 7.4369, "loss/crossentropy": 2.0084817469120027, "loss/hidden": 3.39921875, "loss/jsd": 0.0, "loss/logits": 0.1825602987781167, "step": 11900 }, { "epoch": 0.397, "grad_norm": 28.875, "grad_norm_var": 2.928465628841273e+18, "learning_rate": 0.0001, "loss": 7.2882, "loss/crossentropy": 1.8848083645105362, "loss/hidden": 3.493359375, "loss/jsd": 0.0, "loss/logits": 0.19836503118276597, "step": 11910 }, { "epoch": 0.3973333333333333, "grad_norm": 29.25, "grad_norm_var": 2.3053504042993976e+18, "learning_rate": 0.0001, "loss": 7.2818, "loss/crossentropy": 2.0195236086845396, "loss/hidden": 3.299609375, "loss/jsd": 0.0, "loss/logits": 0.18611480128020047, "step": 11920 }, { "epoch": 0.39766666666666667, "grad_norm": 35.5, "grad_norm_var": 2.3053504045208218e+18, "learning_rate": 0.0001, "loss": 7.2346, "loss/crossentropy": 1.9294698767364025, "loss/hidden": 3.3765625, "loss/jsd": 0.0, "loss/logits": 0.18436552435159684, "step": 11930 }, { "epoch": 0.398, "grad_norm": 29.375, "grad_norm_var": 4.400455729166667, "learning_rate": 0.0001, "loss": 7.2805, "loss/crossentropy": 2.0649065375328064, "loss/hidden": 3.490234375, "loss/jsd": 0.0, "loss/logits": 0.197495798394084, "step": 11940 }, { "epoch": 0.3983333333333333, "grad_norm": 33.0, "grad_norm_var": 2.728125, "learning_rate": 0.0001, "loss": 7.1597, "loss/crossentropy": 1.9504266425967216, "loss/hidden": 3.501953125, "loss/jsd": 0.0, "loss/logits": 0.21152498312294482, "step": 11950 }, { "epoch": 0.39866666666666667, "grad_norm": 30.125, "grad_norm_var": 3.14765625, "learning_rate": 0.0001, "loss": 7.3125, "loss/crossentropy": 2.0500699996948244, "loss/hidden": 3.366015625, "loss/jsd": 0.0, "loss/logits": 0.18475822145119308, "step": 11960 }, { "epoch": 0.399, "grad_norm": 29.75, "grad_norm_var": 3.3622395833333334, "learning_rate": 0.0001, "loss": 7.2891, "loss/crossentropy": 1.982466061413288, "loss/hidden": 3.407421875, "loss/jsd": 0.0, "loss/logits": 0.19263978507369756, "step": 11970 }, { "epoch": 0.3993333333333333, "grad_norm": 29.375, "grad_norm_var": 6.780989583333334, "learning_rate": 0.0001, "loss": 7.2962, "loss/crossentropy": 1.9395920529961586, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.18587411902844905, "step": 11980 }, { "epoch": 0.39966666666666667, "grad_norm": 32.0, "grad_norm_var": 2.74765625, "learning_rate": 0.0001, "loss": 7.3323, "loss/crossentropy": 2.0551164120435716, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.18735264260321854, "step": 11990 }, { "epoch": 0.4, "grad_norm": 27.125, "grad_norm_var": 2.5962890625, "learning_rate": 0.0001, "loss": 7.2116, "loss/crossentropy": 1.9154511280357838, "loss/hidden": 3.4125, "loss/jsd": 0.0, "loss/logits": 0.17923201527446508, "step": 12000 }, { "epoch": 0.4003333333333333, "grad_norm": 29.375, "grad_norm_var": 2.4457682291666667, "learning_rate": 0.0001, "loss": 7.3203, "loss/crossentropy": 2.175585137307644, "loss/hidden": 3.455078125, "loss/jsd": 0.0, "loss/logits": 0.19585834592580795, "step": 12010 }, { "epoch": 0.40066666666666667, "grad_norm": 28.375, "grad_norm_var": 2.839583333333333, "learning_rate": 0.0001, "loss": 7.2251, "loss/crossentropy": 1.905546525120735, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.19275268036872148, "step": 12020 }, { "epoch": 0.401, "grad_norm": 29.5, "grad_norm_var": 3.4916666666666667, "learning_rate": 0.0001, "loss": 7.3761, "loss/crossentropy": 1.9497522547841073, "loss/hidden": 3.41640625, "loss/jsd": 0.0, "loss/logits": 0.18617938784882426, "step": 12030 }, { "epoch": 0.4013333333333333, "grad_norm": 26.875, "grad_norm_var": 4.615625, "learning_rate": 0.0001, "loss": 7.2231, "loss/crossentropy": 1.9682360365986824, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.18768627978861332, "step": 12040 }, { "epoch": 0.40166666666666667, "grad_norm": 26.375, "grad_norm_var": 4.0125, "learning_rate": 0.0001, "loss": 7.198, "loss/crossentropy": 2.025482263416052, "loss/hidden": 3.46484375, "loss/jsd": 0.0, "loss/logits": 0.20312102157622575, "step": 12050 }, { "epoch": 0.402, "grad_norm": 30.125, "grad_norm_var": 2.1775390625, "learning_rate": 0.0001, "loss": 7.2769, "loss/crossentropy": 1.9865700535476207, "loss/hidden": 3.4671875, "loss/jsd": 0.0, "loss/logits": 0.19534757919609547, "step": 12060 }, { "epoch": 0.4023333333333333, "grad_norm": 28.875, "grad_norm_var": 6.545572916666667, "learning_rate": 0.0001, "loss": 7.2585, "loss/crossentropy": 1.9391134083271027, "loss/hidden": 3.395703125, "loss/jsd": 0.0, "loss/logits": 0.17897734902799128, "step": 12070 }, { "epoch": 0.4026666666666667, "grad_norm": 31.125, "grad_norm_var": 5.603125, "learning_rate": 0.0001, "loss": 7.3356, "loss/crossentropy": 2.0374672904610636, "loss/hidden": 3.37421875, "loss/jsd": 0.0, "loss/logits": 0.1868133692070842, "step": 12080 }, { "epoch": 0.403, "grad_norm": 32.25, "grad_norm_var": 5.5244140625, "learning_rate": 0.0001, "loss": 7.271, "loss/crossentropy": 1.8563154302537441, "loss/hidden": 3.43359375, "loss/jsd": 0.0, "loss/logits": 0.177560431137681, "step": 12090 }, { "epoch": 0.4033333333333333, "grad_norm": 32.0, "grad_norm_var": 2.6864583333333334, "learning_rate": 0.0001, "loss": 7.3052, "loss/crossentropy": 1.9043956212699413, "loss/hidden": 3.470703125, "loss/jsd": 0.0, "loss/logits": 0.19902166882529854, "step": 12100 }, { "epoch": 0.4036666666666667, "grad_norm": 33.0, "grad_norm_var": 2.819205729166667, "learning_rate": 0.0001, "loss": 7.2616, "loss/crossentropy": 1.998912625014782, "loss/hidden": 3.431640625, "loss/jsd": 0.0, "loss/logits": 0.1801791839301586, "step": 12110 }, { "epoch": 0.404, "grad_norm": 30.0, "grad_norm_var": 2.3705729166666667, "learning_rate": 0.0001, "loss": 7.2223, "loss/crossentropy": 1.8267780005931855, "loss/hidden": 3.35625, "loss/jsd": 0.0, "loss/logits": 0.17697003651410342, "step": 12120 }, { "epoch": 0.4043333333333333, "grad_norm": 28.0, "grad_norm_var": 2.2645833333333334, "learning_rate": 0.0001, "loss": 7.2585, "loss/crossentropy": 1.9727560937404633, "loss/hidden": 3.49609375, "loss/jsd": 0.0, "loss/logits": 0.19548005219548942, "step": 12130 }, { "epoch": 0.4046666666666667, "grad_norm": 28.75, "grad_norm_var": 1.7739583333333333, "learning_rate": 0.0001, "loss": 7.1308, "loss/crossentropy": 2.039293521642685, "loss/hidden": 3.3015625, "loss/jsd": 0.0, "loss/logits": 0.17662172471173107, "step": 12140 }, { "epoch": 0.405, "grad_norm": 29.0, "grad_norm_var": 0.90390625, "learning_rate": 0.0001, "loss": 7.3654, "loss/crossentropy": 1.9189713895320892, "loss/hidden": 3.49296875, "loss/jsd": 0.0, "loss/logits": 0.2085176944732666, "step": 12150 }, { "epoch": 0.4053333333333333, "grad_norm": 28.875, "grad_norm_var": 1.9343098958333333, "learning_rate": 0.0001, "loss": 7.374, "loss/crossentropy": 1.9505802497267724, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.18279812578111887, "step": 12160 }, { "epoch": 0.4056666666666667, "grad_norm": 28.75, "grad_norm_var": 3.4702473958333333, "learning_rate": 0.0001, "loss": 7.493, "loss/crossentropy": 2.0895782366394995, "loss/hidden": 3.38125, "loss/jsd": 0.0, "loss/logits": 0.18484587790444493, "step": 12170 }, { "epoch": 0.406, "grad_norm": 29.25, "grad_norm_var": 3.611458333333333, "learning_rate": 0.0001, "loss": 7.4029, "loss/crossentropy": 1.9143709048628808, "loss/hidden": 3.63046875, "loss/jsd": 0.0, "loss/logits": 0.2049937192350626, "step": 12180 }, { "epoch": 0.4063333333333333, "grad_norm": 30.125, "grad_norm_var": 1.9947265625, "learning_rate": 0.0001, "loss": 7.3265, "loss/crossentropy": 1.9104190312325955, "loss/hidden": 3.513671875, "loss/jsd": 0.0, "loss/logits": 0.19608210287988187, "step": 12190 }, { "epoch": 0.4066666666666667, "grad_norm": 29.625, "grad_norm_var": 1.4020182291666667, "learning_rate": 0.0001, "loss": 7.2603, "loss/crossentropy": 1.9053283423185348, "loss/hidden": 3.40859375, "loss/jsd": 0.0, "loss/logits": 0.1867800692562014, "step": 12200 }, { "epoch": 0.407, "grad_norm": 30.125, "grad_norm_var": 3.1434895833333334, "learning_rate": 0.0001, "loss": 7.3971, "loss/crossentropy": 2.2144618704915047, "loss/hidden": 3.446875, "loss/jsd": 0.0, "loss/logits": 0.1966156730428338, "step": 12210 }, { "epoch": 0.4073333333333333, "grad_norm": 29.0, "grad_norm_var": 1.8926432291666666, "learning_rate": 0.0001, "loss": 7.3548, "loss/crossentropy": 2.1393819943070413, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.20189669951796532, "step": 12220 }, { "epoch": 0.4076666666666667, "grad_norm": 28.125, "grad_norm_var": 2.637955729166667, "learning_rate": 0.0001, "loss": 7.393, "loss/crossentropy": 1.9698265984654426, "loss/hidden": 3.491796875, "loss/jsd": 0.0, "loss/logits": 0.1954586585983634, "step": 12230 }, { "epoch": 0.408, "grad_norm": 28.875, "grad_norm_var": 3.534375, "learning_rate": 0.0001, "loss": 7.3133, "loss/crossentropy": 1.9470518462359905, "loss/hidden": 3.4, "loss/jsd": 0.0, "loss/logits": 0.19640843686647713, "step": 12240 }, { "epoch": 0.4083333333333333, "grad_norm": 29.375, "grad_norm_var": 4.572916666666667, "learning_rate": 0.0001, "loss": 7.2611, "loss/crossentropy": 1.8714442759752274, "loss/hidden": 3.5140625, "loss/jsd": 0.0, "loss/logits": 0.19231631876900793, "step": 12250 }, { "epoch": 0.4086666666666667, "grad_norm": 30.125, "grad_norm_var": 5.143489583333333, "learning_rate": 0.0001, "loss": 7.2364, "loss/crossentropy": 1.9278406761586666, "loss/hidden": 3.403515625, "loss/jsd": 0.0, "loss/logits": 0.19184371568262576, "step": 12260 }, { "epoch": 0.409, "grad_norm": 30.125, "grad_norm_var": 1.3239583333333333, "learning_rate": 0.0001, "loss": 7.2529, "loss/crossentropy": 1.8813231438398361, "loss/hidden": 3.514453125, "loss/jsd": 0.0, "loss/logits": 0.2041698221117258, "step": 12270 }, { "epoch": 0.4093333333333333, "grad_norm": 29.375, "grad_norm_var": 2.747916666666667, "learning_rate": 0.0001, "loss": 7.2761, "loss/crossentropy": 2.1753661304712297, "loss/hidden": 3.36875, "loss/jsd": 0.0, "loss/logits": 0.18603641577064992, "step": 12280 }, { "epoch": 0.4096666666666667, "grad_norm": 30.5, "grad_norm_var": 2.20390625, "learning_rate": 0.0001, "loss": 7.2515, "loss/crossentropy": 2.0634463764727116, "loss/hidden": 3.367578125, "loss/jsd": 0.0, "loss/logits": 0.1910073021426797, "step": 12290 }, { "epoch": 0.41, "grad_norm": 29.125, "grad_norm_var": 3.5541015625, "learning_rate": 0.0001, "loss": 7.293, "loss/crossentropy": 1.9679187454283238, "loss/hidden": 3.482421875, "loss/jsd": 0.0, "loss/logits": 0.19251346932724117, "step": 12300 }, { "epoch": 0.4103333333333333, "grad_norm": 29.125, "grad_norm_var": 5.3150390625, "learning_rate": 0.0001, "loss": 7.3171, "loss/crossentropy": 2.0756116971373557, "loss/hidden": 3.470703125, "loss/jsd": 0.0, "loss/logits": 0.19804721754044294, "step": 12310 }, { "epoch": 0.4106666666666667, "grad_norm": 33.0, "grad_norm_var": 4.003125, "learning_rate": 0.0001, "loss": 7.3842, "loss/crossentropy": 1.9135066859424115, "loss/hidden": 3.381640625, "loss/jsd": 0.0, "loss/logits": 0.19638098925352096, "step": 12320 }, { "epoch": 0.411, "grad_norm": 28.75, "grad_norm_var": 3.076041666666667, "learning_rate": 0.0001, "loss": 7.2634, "loss/crossentropy": 1.9726336173713208, "loss/hidden": 3.51015625, "loss/jsd": 0.0, "loss/logits": 0.18783859070390463, "step": 12330 }, { "epoch": 0.41133333333333333, "grad_norm": 29.25, "grad_norm_var": 1.5858723958333334, "learning_rate": 0.0001, "loss": 7.4304, "loss/crossentropy": 2.0639717251062395, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.19512560907751322, "step": 12340 }, { "epoch": 0.4116666666666667, "grad_norm": 31.75, "grad_norm_var": 2.8059895833333335, "learning_rate": 0.0001, "loss": 7.2125, "loss/crossentropy": 1.9434161961078644, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.20312565043568612, "step": 12350 }, { "epoch": 0.412, "grad_norm": 29.375, "grad_norm_var": 1.9577473958333333, "learning_rate": 0.0001, "loss": 7.2484, "loss/crossentropy": 1.990830972790718, "loss/hidden": 3.57734375, "loss/jsd": 0.0, "loss/logits": 0.2072274126112461, "step": 12360 }, { "epoch": 0.41233333333333333, "grad_norm": 29.25, "grad_norm_var": 1.575, "learning_rate": 0.0001, "loss": 7.3732, "loss/crossentropy": 2.19725182056427, "loss/hidden": 3.47421875, "loss/jsd": 0.0, "loss/logits": 0.20620937421917915, "step": 12370 }, { "epoch": 0.4126666666666667, "grad_norm": 29.375, "grad_norm_var": 1.48125, "learning_rate": 0.0001, "loss": 7.2273, "loss/crossentropy": 1.9830413609743118, "loss/hidden": 3.35234375, "loss/jsd": 0.0, "loss/logits": 0.1823252897709608, "step": 12380 }, { "epoch": 0.413, "grad_norm": 28.75, "grad_norm_var": 2.4150390625, "learning_rate": 0.0001, "loss": 7.4405, "loss/crossentropy": 1.964363656938076, "loss/hidden": 3.480859375, "loss/jsd": 0.0, "loss/logits": 0.19101094100624322, "step": 12390 }, { "epoch": 0.41333333333333333, "grad_norm": 29.0, "grad_norm_var": 3.2916015625, "learning_rate": 0.0001, "loss": 7.3694, "loss/crossentropy": 2.075885979831219, "loss/hidden": 3.446875, "loss/jsd": 0.0, "loss/logits": 0.20802584663033485, "step": 12400 }, { "epoch": 0.4136666666666667, "grad_norm": 31.125, "grad_norm_var": 2.470833333333333, "learning_rate": 0.0001, "loss": 7.3727, "loss/crossentropy": 1.876482492685318, "loss/hidden": 3.496875, "loss/jsd": 0.0, "loss/logits": 0.1959961110725999, "step": 12410 }, { "epoch": 0.414, "grad_norm": 30.125, "grad_norm_var": 3.333268229166667, "learning_rate": 0.0001, "loss": 7.3735, "loss/crossentropy": 1.9241852715611458, "loss/hidden": 3.5484375, "loss/jsd": 0.0, "loss/logits": 0.19385703448206187, "step": 12420 }, { "epoch": 0.41433333333333333, "grad_norm": 31.75, "grad_norm_var": 4.6375, "learning_rate": 0.0001, "loss": 7.4871, "loss/crossentropy": 2.1150256127119063, "loss/hidden": 3.49453125, "loss/jsd": 0.0, "loss/logits": 0.20613473132252694, "step": 12430 }, { "epoch": 0.4146666666666667, "grad_norm": 28.75, "grad_norm_var": 4.005208333333333, "learning_rate": 0.0001, "loss": 7.2489, "loss/crossentropy": 1.890253783762455, "loss/hidden": 3.469140625, "loss/jsd": 0.0, "loss/logits": 0.18485627602785826, "step": 12440 }, { "epoch": 0.415, "grad_norm": 28.25, "grad_norm_var": 5.032747395833334, "learning_rate": 0.0001, "loss": 7.323, "loss/crossentropy": 2.0493284076452256, "loss/hidden": 3.3921875, "loss/jsd": 0.0, "loss/logits": 0.1929210675880313, "step": 12450 }, { "epoch": 0.41533333333333333, "grad_norm": 29.375, "grad_norm_var": 1.8832682291666667, "learning_rate": 0.0001, "loss": 7.3382, "loss/crossentropy": 2.0090419702231883, "loss/hidden": 3.401171875, "loss/jsd": 0.0, "loss/logits": 0.18987978184595705, "step": 12460 }, { "epoch": 0.4156666666666667, "grad_norm": 27.75, "grad_norm_var": 3.3306640625, "learning_rate": 0.0001, "loss": 7.2969, "loss/crossentropy": 1.7926045283675194, "loss/hidden": 3.459375, "loss/jsd": 0.0, "loss/logits": 0.18079517502337694, "step": 12470 }, { "epoch": 0.416, "grad_norm": 31.0, "grad_norm_var": 32.37962239583333, "learning_rate": 0.0001, "loss": 7.4179, "loss/crossentropy": 1.8877079389989375, "loss/hidden": 3.366796875, "loss/jsd": 0.0, "loss/logits": 0.16747383624315262, "step": 12480 }, { "epoch": 0.41633333333333333, "grad_norm": 29.875, "grad_norm_var": 3.439322916666667, "learning_rate": 0.0001, "loss": 7.3037, "loss/crossentropy": 1.9167642079293727, "loss/hidden": 3.28046875, "loss/jsd": 0.0, "loss/logits": 0.1714684186503291, "step": 12490 }, { "epoch": 0.4166666666666667, "grad_norm": 30.75, "grad_norm_var": 1.4863932291666666, "learning_rate": 0.0001, "loss": 7.2426, "loss/crossentropy": 1.9011107690632343, "loss/hidden": 3.41171875, "loss/jsd": 0.0, "loss/logits": 0.18280915603972972, "step": 12500 }, { "epoch": 0.417, "grad_norm": 31.5, "grad_norm_var": 2.7895182291666667, "learning_rate": 0.0001, "loss": 7.2797, "loss/crossentropy": 1.822603452950716, "loss/hidden": 3.36640625, "loss/jsd": 0.0, "loss/logits": 0.19100228594616056, "step": 12510 }, { "epoch": 0.41733333333333333, "grad_norm": 7985954816.0, "grad_norm_var": 7.534892725942109e+18, "learning_rate": 0.0001, "loss": 7.3089, "loss/crossentropy": 1.7775569193065166, "loss/hidden": 3.495703125, "loss/jsd": 0.0, "loss/logits": 0.17659799195826054, "step": 12520 }, { "epoch": 0.4176666666666667, "grad_norm": 32.0, "grad_norm_var": 7.53489272269411e+18, "learning_rate": 0.0001, "loss": 7.3223, "loss/crossentropy": 1.9730999276041985, "loss/hidden": 3.57578125, "loss/jsd": 0.0, "loss/logits": 0.2058877520263195, "step": 12530 }, { "epoch": 0.418, "grad_norm": 31.0, "grad_norm_var": 18.452018229166665, "learning_rate": 0.0001, "loss": 7.3705, "loss/crossentropy": 2.063557520508766, "loss/hidden": 3.422265625, "loss/jsd": 0.0, "loss/logits": 0.19392988570034503, "step": 12540 }, { "epoch": 0.41833333333333333, "grad_norm": 30.25, "grad_norm_var": 3.371875, "learning_rate": 0.0001, "loss": 7.3015, "loss/crossentropy": 2.0945934891700744, "loss/hidden": 3.378125, "loss/jsd": 0.0, "loss/logits": 0.19292769618332387, "step": 12550 }, { "epoch": 0.4186666666666667, "grad_norm": 28.625, "grad_norm_var": 4.092643229166667, "learning_rate": 0.0001, "loss": 7.3267, "loss/crossentropy": 2.110990159213543, "loss/hidden": 3.440625, "loss/jsd": 0.0, "loss/logits": 0.20304199429228903, "step": 12560 }, { "epoch": 0.419, "grad_norm": 30.5, "grad_norm_var": 2.184830729166667, "learning_rate": 0.0001, "loss": 7.3984, "loss/crossentropy": 1.9135117128491401, "loss/hidden": 3.48515625, "loss/jsd": 0.0, "loss/logits": 0.189224199205637, "step": 12570 }, { "epoch": 0.41933333333333334, "grad_norm": 29.125, "grad_norm_var": 1.5393229166666667, "learning_rate": 0.0001, "loss": 7.245, "loss/crossentropy": 1.9838401451706886, "loss/hidden": 3.376953125, "loss/jsd": 0.0, "loss/logits": 0.17927548866719006, "step": 12580 }, { "epoch": 0.4196666666666667, "grad_norm": 32.75, "grad_norm_var": 1.8916666666666666, "learning_rate": 0.0001, "loss": 7.3304, "loss/crossentropy": 1.9070615381002427, "loss/hidden": 3.5109375, "loss/jsd": 0.0, "loss/logits": 0.18272874988615512, "step": 12590 }, { "epoch": 0.42, "grad_norm": 28.375, "grad_norm_var": 3.0400390625, "learning_rate": 0.0001, "loss": 7.3981, "loss/crossentropy": 1.986302938312292, "loss/hidden": 3.34296875, "loss/jsd": 0.0, "loss/logits": 0.18646600283682346, "step": 12600 }, { "epoch": 0.42033333333333334, "grad_norm": 29.5, "grad_norm_var": 3.245833333333333, "learning_rate": 0.0001, "loss": 7.2969, "loss/crossentropy": 1.7935962483286858, "loss/hidden": 3.39140625, "loss/jsd": 0.0, "loss/logits": 0.1729273796081543, "step": 12610 }, { "epoch": 0.4206666666666667, "grad_norm": 32.75, "grad_norm_var": 12.6525390625, "learning_rate": 0.0001, "loss": 7.2635, "loss/crossentropy": 1.978981538116932, "loss/hidden": 3.491796875, "loss/jsd": 0.0, "loss/logits": 0.19728896729648113, "step": 12620 }, { "epoch": 0.421, "grad_norm": 28.375, "grad_norm_var": 57.72805989583333, "learning_rate": 0.0001, "loss": 7.2204, "loss/crossentropy": 1.938880129158497, "loss/hidden": 3.385546875, "loss/jsd": 0.0, "loss/logits": 0.19008896285668014, "step": 12630 }, { "epoch": 0.42133333333333334, "grad_norm": 30.0, "grad_norm_var": 54.670572916666664, "learning_rate": 0.0001, "loss": 7.1908, "loss/crossentropy": 2.0417967908084393, "loss/hidden": 3.391015625, "loss/jsd": 0.0, "loss/logits": 0.19047454055398702, "step": 12640 }, { "epoch": 0.4216666666666667, "grad_norm": 30.125, "grad_norm_var": 8.8931640625, "learning_rate": 0.0001, "loss": 7.3077, "loss/crossentropy": 1.9125272504985333, "loss/hidden": 3.465234375, "loss/jsd": 0.0, "loss/logits": 0.18889417024329305, "step": 12650 }, { "epoch": 0.422, "grad_norm": 30.625, "grad_norm_var": 7.56015625, "learning_rate": 0.0001, "loss": 7.4319, "loss/crossentropy": 1.9714299574494363, "loss/hidden": 3.37578125, "loss/jsd": 0.0, "loss/logits": 0.1815888261422515, "step": 12660 }, { "epoch": 0.42233333333333334, "grad_norm": 27.75, "grad_norm_var": 7.889583333333333, "learning_rate": 0.0001, "loss": 7.3988, "loss/crossentropy": 2.024210865795612, "loss/hidden": 3.409765625, "loss/jsd": 0.0, "loss/logits": 0.1945285452529788, "step": 12670 }, { "epoch": 0.4226666666666667, "grad_norm": 30.625, "grad_norm_var": 2.349739583333333, "learning_rate": 0.0001, "loss": 7.2893, "loss/crossentropy": 1.8954189397394656, "loss/hidden": 3.44296875, "loss/jsd": 0.0, "loss/logits": 0.18777471426874398, "step": 12680 }, { "epoch": 0.423, "grad_norm": 30.75, "grad_norm_var": 1.5541666666666667, "learning_rate": 0.0001, "loss": 7.2448, "loss/crossentropy": 2.0001342222094536, "loss/hidden": 3.38984375, "loss/jsd": 0.0, "loss/logits": 0.1725354392081499, "step": 12690 }, { "epoch": 0.42333333333333334, "grad_norm": 32.75, "grad_norm_var": 1.6452473958333333, "learning_rate": 0.0001, "loss": 7.4517, "loss/crossentropy": 2.0649070352315904, "loss/hidden": 3.4828125, "loss/jsd": 0.0, "loss/logits": 0.20432663131505252, "step": 12700 }, { "epoch": 0.4236666666666667, "grad_norm": 28.25, "grad_norm_var": 2.265625, "learning_rate": 0.0001, "loss": 7.1798, "loss/crossentropy": 2.004515118896961, "loss/hidden": 3.382421875, "loss/jsd": 0.0, "loss/logits": 0.18856901675462723, "step": 12710 }, { "epoch": 0.424, "grad_norm": 33.25, "grad_norm_var": 9.315559895833333, "learning_rate": 0.0001, "loss": 7.1834, "loss/crossentropy": 1.868245577812195, "loss/hidden": 3.405859375, "loss/jsd": 0.0, "loss/logits": 0.17804253660142422, "step": 12720 }, { "epoch": 0.42433333333333334, "grad_norm": 28.625, "grad_norm_var": 15.6228515625, "learning_rate": 0.0001, "loss": 7.3254, "loss/crossentropy": 1.9993156149983407, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.1985454322770238, "step": 12730 }, { "epoch": 0.4246666666666667, "grad_norm": 34.5, "grad_norm_var": 11.248893229166667, "learning_rate": 0.0001, "loss": 7.2367, "loss/crossentropy": 1.9248070381581783, "loss/hidden": 3.523046875, "loss/jsd": 0.0, "loss/logits": 0.19863908234983682, "step": 12740 }, { "epoch": 0.425, "grad_norm": 32.75, "grad_norm_var": 3.8447916666666666, "learning_rate": 0.0001, "loss": 7.1767, "loss/crossentropy": 1.9554524950683116, "loss/hidden": 3.4046875, "loss/jsd": 0.0, "loss/logits": 0.1819900662638247, "step": 12750 }, { "epoch": 0.42533333333333334, "grad_norm": 29.5, "grad_norm_var": 2.8843098958333333, "learning_rate": 0.0001, "loss": 7.3063, "loss/crossentropy": 2.0244558215141297, "loss/hidden": 3.456640625, "loss/jsd": 0.0, "loss/logits": 0.1899258963763714, "step": 12760 }, { "epoch": 0.4256666666666667, "grad_norm": 31.625, "grad_norm_var": 6.8884765625, "learning_rate": 0.0001, "loss": 7.371, "loss/crossentropy": 1.943205365538597, "loss/hidden": 3.394140625, "loss/jsd": 0.0, "loss/logits": 0.17850949354469775, "step": 12770 }, { "epoch": 0.426, "grad_norm": 33.75, "grad_norm_var": 6.843489583333334, "learning_rate": 0.0001, "loss": 7.3094, "loss/crossentropy": 1.942814826965332, "loss/hidden": 3.351171875, "loss/jsd": 0.0, "loss/logits": 0.18254665341228246, "step": 12780 }, { "epoch": 0.42633333333333334, "grad_norm": 27.75, "grad_norm_var": 3.54140625, "learning_rate": 0.0001, "loss": 7.292, "loss/crossentropy": 2.0417085126042367, "loss/hidden": 3.42734375, "loss/jsd": 0.0, "loss/logits": 0.18853002320975065, "step": 12790 }, { "epoch": 0.4266666666666667, "grad_norm": 29.0, "grad_norm_var": 6.2603515625, "learning_rate": 0.0001, "loss": 7.3253, "loss/crossentropy": 2.074806372821331, "loss/hidden": 3.391796875, "loss/jsd": 0.0, "loss/logits": 0.1911831771954894, "step": 12800 }, { "epoch": 0.427, "grad_norm": 33.5, "grad_norm_var": 7.91015625, "learning_rate": 0.0001, "loss": 7.3655, "loss/crossentropy": 1.9392616674304008, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.19804707486182452, "step": 12810 }, { "epoch": 0.42733333333333334, "grad_norm": 31.125, "grad_norm_var": 1.9926432291666667, "learning_rate": 0.0001, "loss": 7.3838, "loss/crossentropy": 1.956505012512207, "loss/hidden": 3.53515625, "loss/jsd": 0.0, "loss/logits": 0.20886103585362434, "step": 12820 }, { "epoch": 0.42766666666666664, "grad_norm": 32.0, "grad_norm_var": 2.3108723958333335, "learning_rate": 0.0001, "loss": 7.3007, "loss/crossentropy": 1.9242730617523194, "loss/hidden": 3.5140625, "loss/jsd": 0.0, "loss/logits": 0.19543598499149084, "step": 12830 }, { "epoch": 0.428, "grad_norm": 33.25, "grad_norm_var": 3.103125, "learning_rate": 0.0001, "loss": 7.2915, "loss/crossentropy": 2.08651515096426, "loss/hidden": 3.369140625, "loss/jsd": 0.0, "loss/logits": 0.18487875685095786, "step": 12840 }, { "epoch": 0.42833333333333334, "grad_norm": 29.875, "grad_norm_var": 3.3119140625, "learning_rate": 0.0001, "loss": 7.1402, "loss/crossentropy": 2.041348946094513, "loss/hidden": 3.300390625, "loss/jsd": 0.0, "loss/logits": 0.17283011246472596, "step": 12850 }, { "epoch": 0.42866666666666664, "grad_norm": 31.625, "grad_norm_var": 1.5858723958333334, "learning_rate": 0.0001, "loss": 7.2906, "loss/crossentropy": 1.996045397222042, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.1960317576304078, "step": 12860 }, { "epoch": 0.429, "grad_norm": 32.0, "grad_norm_var": 2.74140625, "learning_rate": 0.0001, "loss": 7.2965, "loss/crossentropy": 1.703165267407894, "loss/hidden": 3.500390625, "loss/jsd": 0.0, "loss/logits": 0.18170032938942313, "step": 12870 }, { "epoch": 0.42933333333333334, "grad_norm": 31.25, "grad_norm_var": 3.3072265625, "learning_rate": 0.0001, "loss": 7.3157, "loss/crossentropy": 1.9796103611588478, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.18570774439722298, "step": 12880 }, { "epoch": 0.42966666666666664, "grad_norm": 30.0, "grad_norm_var": 2.414322916666667, "learning_rate": 0.0001, "loss": 7.3958, "loss/crossentropy": 1.9772098012268544, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.18324623247608543, "step": 12890 }, { "epoch": 0.43, "grad_norm": 31.125, "grad_norm_var": 2.2135416666666665, "learning_rate": 0.0001, "loss": 7.2182, "loss/crossentropy": 2.05758897960186, "loss/hidden": 3.353125, "loss/jsd": 0.0, "loss/logits": 0.1902267148718238, "step": 12900 }, { "epoch": 0.43033333333333335, "grad_norm": 29.375, "grad_norm_var": 2.6455729166666666, "learning_rate": 0.0001, "loss": 7.2487, "loss/crossentropy": 2.04788166359067, "loss/hidden": 3.405078125, "loss/jsd": 0.0, "loss/logits": 0.18356787655502557, "step": 12910 }, { "epoch": 0.43066666666666664, "grad_norm": 30.25, "grad_norm_var": 2.8848307291666666, "learning_rate": 0.0001, "loss": 7.2956, "loss/crossentropy": 1.9563010394573213, "loss/hidden": 3.519921875, "loss/jsd": 0.0, "loss/logits": 0.19150800202041865, "step": 12920 }, { "epoch": 0.431, "grad_norm": 31.0, "grad_norm_var": 2.627083333333333, "learning_rate": 0.0001, "loss": 7.1595, "loss/crossentropy": 1.86679616458714, "loss/hidden": 3.4359375, "loss/jsd": 0.0, "loss/logits": 0.17930920696817337, "step": 12930 }, { "epoch": 0.43133333333333335, "grad_norm": 28.25, "grad_norm_var": 3.198372395833333, "learning_rate": 0.0001, "loss": 7.1396, "loss/crossentropy": 1.8882878959178924, "loss/hidden": 3.35390625, "loss/jsd": 0.0, "loss/logits": 0.17377492943778633, "step": 12940 }, { "epoch": 0.43166666666666664, "grad_norm": 30.875, "grad_norm_var": 1.721875, "learning_rate": 0.0001, "loss": 7.3663, "loss/crossentropy": 2.04179819971323, "loss/hidden": 3.38671875, "loss/jsd": 0.0, "loss/logits": 0.19386702179908752, "step": 12950 }, { "epoch": 0.432, "grad_norm": 28.75, "grad_norm_var": 2.778125, "learning_rate": 0.0001, "loss": 7.5441, "loss/crossentropy": 2.216225576400757, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.2197167096659541, "step": 12960 }, { "epoch": 0.43233333333333335, "grad_norm": 31.25, "grad_norm_var": 1.81015625, "learning_rate": 0.0001, "loss": 7.4852, "loss/crossentropy": 1.8792053952813148, "loss/hidden": 3.441015625, "loss/jsd": 0.0, "loss/logits": 0.1841729011386633, "step": 12970 }, { "epoch": 0.43266666666666664, "grad_norm": 29.25, "grad_norm_var": 11.1291015625, "learning_rate": 0.0001, "loss": 7.3372, "loss/crossentropy": 2.0644855096936228, "loss/hidden": 3.393359375, "loss/jsd": 0.0, "loss/logits": 0.19256247486919165, "step": 12980 }, { "epoch": 0.433, "grad_norm": 28.75, "grad_norm_var": 3.65804876695318e+18, "learning_rate": 0.0001, "loss": 7.2852, "loss/crossentropy": 1.9974554382264613, "loss/hidden": 3.347265625, "loss/jsd": 0.0, "loss/logits": 0.18375110551714896, "step": 12990 }, { "epoch": 0.43333333333333335, "grad_norm": 27.5, "grad_norm_var": 5.353580729166667, "learning_rate": 0.0001, "loss": 7.3061, "loss/crossentropy": 1.8111790284514426, "loss/hidden": 3.377734375, "loss/jsd": 0.0, "loss/logits": 0.17170110829174517, "step": 13000 }, { "epoch": 0.43366666666666664, "grad_norm": 28.75, "grad_norm_var": 2.7275390625, "learning_rate": 0.0001, "loss": 7.3489, "loss/crossentropy": 2.010530613362789, "loss/hidden": 3.414453125, "loss/jsd": 0.0, "loss/logits": 0.18404115671291948, "step": 13010 }, { "epoch": 0.434, "grad_norm": 29.625, "grad_norm_var": 2.2447916666666665, "learning_rate": 0.0001, "loss": 7.3043, "loss/crossentropy": 1.8605936087667942, "loss/hidden": 3.459375, "loss/jsd": 0.0, "loss/logits": 0.1884337780997157, "step": 13020 }, { "epoch": 0.43433333333333335, "grad_norm": 28.75, "grad_norm_var": 2.7822265625, "learning_rate": 0.0001, "loss": 7.3691, "loss/crossentropy": 2.109563418477774, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.18584556700661778, "step": 13030 }, { "epoch": 0.43466666666666665, "grad_norm": 28.875, "grad_norm_var": 2.2827473958333333, "learning_rate": 0.0001, "loss": 7.2526, "loss/crossentropy": 1.8590692810714244, "loss/hidden": 3.547265625, "loss/jsd": 0.0, "loss/logits": 0.19188887728378176, "step": 13040 }, { "epoch": 0.435, "grad_norm": 34.0, "grad_norm_var": 3.814583333333333, "learning_rate": 0.0001, "loss": 7.2242, "loss/crossentropy": 2.1324163652956485, "loss/hidden": 3.38125, "loss/jsd": 0.0, "loss/logits": 0.18878793474286795, "step": 13050 }, { "epoch": 0.43533333333333335, "grad_norm": 30.875, "grad_norm_var": 2.8848307291666666, "learning_rate": 0.0001, "loss": 7.1895, "loss/crossentropy": 1.9097092799842357, "loss/hidden": 3.444140625, "loss/jsd": 0.0, "loss/logits": 0.2003808870911598, "step": 13060 }, { "epoch": 0.43566666666666665, "grad_norm": 31.0, "grad_norm_var": 1.5291015625, "learning_rate": 0.0001, "loss": 7.3021, "loss/crossentropy": 1.9634363777935504, "loss/hidden": 3.450390625, "loss/jsd": 0.0, "loss/logits": 0.18962702061980963, "step": 13070 }, { "epoch": 0.436, "grad_norm": 28.75, "grad_norm_var": 1.5197916666666667, "learning_rate": 0.0001, "loss": 7.366, "loss/crossentropy": 2.0445257820189, "loss/hidden": 3.51640625, "loss/jsd": 0.0, "loss/logits": 0.19477153085172177, "step": 13080 }, { "epoch": 0.43633333333333335, "grad_norm": 30.75, "grad_norm_var": 1.77265625, "learning_rate": 0.0001, "loss": 7.2663, "loss/crossentropy": 1.740558636933565, "loss/hidden": 3.4015625, "loss/jsd": 0.0, "loss/logits": 0.1728643019683659, "step": 13090 }, { "epoch": 0.43666666666666665, "grad_norm": 28.75, "grad_norm_var": 2.226822916666667, "learning_rate": 0.0001, "loss": 7.2917, "loss/crossentropy": 2.001444526016712, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.1790529664605856, "step": 13100 }, { "epoch": 0.437, "grad_norm": 31.125, "grad_norm_var": 4.279166666666667, "learning_rate": 0.0001, "loss": 7.1773, "loss/crossentropy": 1.897903248667717, "loss/hidden": 3.3953125, "loss/jsd": 0.0, "loss/logits": 0.1821414889767766, "step": 13110 }, { "epoch": 0.43733333333333335, "grad_norm": 28.0, "grad_norm_var": 3.3431640625, "learning_rate": 0.0001, "loss": 7.3848, "loss/crossentropy": 2.0134089671075346, "loss/hidden": 3.353515625, "loss/jsd": 0.0, "loss/logits": 0.17652187384665013, "step": 13120 }, { "epoch": 0.43766666666666665, "grad_norm": 32.25, "grad_norm_var": 2.7874348958333335, "learning_rate": 0.0001, "loss": 7.2217, "loss/crossentropy": 1.967977014183998, "loss/hidden": 3.498828125, "loss/jsd": 0.0, "loss/logits": 0.19085451643913984, "step": 13130 }, { "epoch": 0.438, "grad_norm": 32.75, "grad_norm_var": 2.1806640625, "learning_rate": 0.0001, "loss": 7.4298, "loss/crossentropy": 2.015234684944153, "loss/hidden": 3.4359375, "loss/jsd": 0.0, "loss/logits": 0.19364932924509048, "step": 13140 }, { "epoch": 0.43833333333333335, "grad_norm": 33.25, "grad_norm_var": 2.4871128698300923e+18, "learning_rate": 0.0001, "loss": 7.3698, "loss/crossentropy": 1.93935536891222, "loss/hidden": 3.54453125, "loss/jsd": 0.0, "loss/logits": 0.16914492761716246, "step": 13150 }, { "epoch": 0.43866666666666665, "grad_norm": 29.625, "grad_norm_var": 2.487112869455541e+18, "learning_rate": 0.0001, "loss": 7.3236, "loss/crossentropy": 2.0412595510482787, "loss/hidden": 3.51328125, "loss/jsd": 0.0, "loss/logits": 0.20541063342243432, "step": 13160 }, { "epoch": 0.439, "grad_norm": 30.25, "grad_norm_var": 2.6166666666666667, "learning_rate": 0.0001, "loss": 7.2295, "loss/crossentropy": 1.745029440522194, "loss/hidden": 3.496484375, "loss/jsd": 0.0, "loss/logits": 0.17878907890990375, "step": 13170 }, { "epoch": 0.43933333333333335, "grad_norm": 29.25, "grad_norm_var": 7.464322916666666, "learning_rate": 0.0001, "loss": 7.2836, "loss/crossentropy": 2.0527849346399307, "loss/hidden": 3.393359375, "loss/jsd": 0.0, "loss/logits": 0.18305875286459922, "step": 13180 }, { "epoch": 0.43966666666666665, "grad_norm": 32.25, "grad_norm_var": 9.64140625, "learning_rate": 0.0001, "loss": 7.2802, "loss/crossentropy": 1.9123728275299072, "loss/hidden": 3.524609375, "loss/jsd": 0.0, "loss/logits": 0.2029513966292143, "step": 13190 }, { "epoch": 0.44, "grad_norm": 37.75, "grad_norm_var": 3.436879802834389e+18, "learning_rate": 0.0001, "loss": 7.3615, "loss/crossentropy": 2.049905315041542, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.18804401885718108, "step": 13200 }, { "epoch": 0.44033333333333335, "grad_norm": 33.0, "grad_norm_var": 3.4368798040394127e+18, "learning_rate": 0.0001, "loss": 7.1652, "loss/crossentropy": 1.9164471715688705, "loss/hidden": 3.428125, "loss/jsd": 0.0, "loss/logits": 0.18717883229255677, "step": 13210 }, { "epoch": 0.44066666666666665, "grad_norm": 29.25, "grad_norm_var": 56.357747395833336, "learning_rate": 0.0001, "loss": 7.2409, "loss/crossentropy": 2.1332522869110107, "loss/hidden": 3.418359375, "loss/jsd": 0.0, "loss/logits": 0.20648030024021863, "step": 13220 }, { "epoch": 0.441, "grad_norm": 28.625, "grad_norm_var": 37.78515625, "learning_rate": 0.0001, "loss": 7.3513, "loss/crossentropy": 2.0627794533967974, "loss/hidden": 3.3390625, "loss/jsd": 0.0, "loss/logits": 0.17905288450419904, "step": 13230 }, { "epoch": 0.44133333333333336, "grad_norm": 28.875, "grad_norm_var": 1.7457682291666667, "learning_rate": 0.0001, "loss": 7.1439, "loss/crossentropy": 1.8521274402737617, "loss/hidden": 3.480078125, "loss/jsd": 0.0, "loss/logits": 0.19447986232116818, "step": 13240 }, { "epoch": 0.44166666666666665, "grad_norm": 29.875, "grad_norm_var": 7.7837890625, "learning_rate": 0.0001, "loss": 7.3869, "loss/crossentropy": 1.888174219429493, "loss/hidden": 3.39765625, "loss/jsd": 0.0, "loss/logits": 0.18290935391560198, "step": 13250 }, { "epoch": 0.442, "grad_norm": 29.375, "grad_norm_var": 9.41015625, "learning_rate": 0.0001, "loss": 7.4288, "loss/crossentropy": 2.0676389575004577, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.19049588460475206, "step": 13260 }, { "epoch": 0.44233333333333336, "grad_norm": 38.25, "grad_norm_var": 15.146809895833334, "learning_rate": 0.0001, "loss": 7.3795, "loss/crossentropy": 2.0825484111905097, "loss/hidden": 3.466796875, "loss/jsd": 0.0, "loss/logits": 0.18994310293346645, "step": 13270 }, { "epoch": 0.44266666666666665, "grad_norm": 37.25, "grad_norm_var": 67.60305989583334, "learning_rate": 0.0001, "loss": 7.246, "loss/crossentropy": 1.849845191091299, "loss/hidden": 3.372265625, "loss/jsd": 0.0, "loss/logits": 0.1833963361568749, "step": 13280 }, { "epoch": 0.443, "grad_norm": 31.375, "grad_norm_var": 78.75305989583333, "learning_rate": 0.0001, "loss": 7.2561, "loss/crossentropy": 2.0070059105753897, "loss/hidden": 3.387109375, "loss/jsd": 0.0, "loss/logits": 0.1796229135245085, "step": 13290 }, { "epoch": 0.44333333333333336, "grad_norm": 39.5, "grad_norm_var": 127.7619140625, "learning_rate": 0.0001, "loss": 7.3287, "loss/crossentropy": 2.0115412943065167, "loss/hidden": 3.25703125, "loss/jsd": 0.0, "loss/logits": 0.18011142145842313, "step": 13300 }, { "epoch": 0.44366666666666665, "grad_norm": 31.25, "grad_norm_var": 22.731705729166666, "learning_rate": 0.0001, "loss": 7.3316, "loss/crossentropy": 1.9415480360388755, "loss/hidden": 3.5375, "loss/jsd": 0.0, "loss/logits": 0.20214050840586423, "step": 13310 }, { "epoch": 0.444, "grad_norm": 35.75, "grad_norm_var": 19.768489583333334, "learning_rate": 0.0001, "loss": 7.3178, "loss/crossentropy": 2.1167004860937597, "loss/hidden": 3.405859375, "loss/jsd": 0.0, "loss/logits": 0.18628034861758352, "step": 13320 }, { "epoch": 0.44433333333333336, "grad_norm": 44.0, "grad_norm_var": 26.074934895833334, "learning_rate": 0.0001, "loss": 7.2613, "loss/crossentropy": 2.028274582326412, "loss/hidden": 3.3546875, "loss/jsd": 0.0, "loss/logits": 0.17940252125263215, "step": 13330 }, { "epoch": 0.44466666666666665, "grad_norm": 28.0, "grad_norm_var": 23.348893229166666, "learning_rate": 0.0001, "loss": 7.2767, "loss/crossentropy": 1.9811547137796879, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.18217029944062232, "step": 13340 }, { "epoch": 0.445, "grad_norm": 30.0, "grad_norm_var": 24.830208333333335, "learning_rate": 0.0001, "loss": 7.1972, "loss/crossentropy": 1.9702550530433656, "loss/hidden": 3.432421875, "loss/jsd": 0.0, "loss/logits": 0.19615521375089884, "step": 13350 }, { "epoch": 0.44533333333333336, "grad_norm": 29.75, "grad_norm_var": 20.037239583333335, "learning_rate": 0.0001, "loss": 7.0846, "loss/crossentropy": 2.0430791325867177, "loss/hidden": 3.3015625, "loss/jsd": 0.0, "loss/logits": 0.17391135385259987, "step": 13360 }, { "epoch": 0.44566666666666666, "grad_norm": 30.25, "grad_norm_var": 16.8962890625, "learning_rate": 0.0001, "loss": 7.2202, "loss/crossentropy": 1.8699004381895066, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.17999291978776455, "step": 13370 }, { "epoch": 0.446, "grad_norm": 46.0, "grad_norm_var": 27.853580729166666, "learning_rate": 0.0001, "loss": 7.3052, "loss/crossentropy": 1.9940235575661063, "loss/hidden": 3.409765625, "loss/jsd": 0.0, "loss/logits": 0.1924258061219007, "step": 13380 }, { "epoch": 0.44633333333333336, "grad_norm": 29.625, "grad_norm_var": 24.49765625, "learning_rate": 0.0001, "loss": 7.3186, "loss/crossentropy": 1.9557200871407985, "loss/hidden": 3.312109375, "loss/jsd": 0.0, "loss/logits": 0.17764934049919248, "step": 13390 }, { "epoch": 0.44666666666666666, "grad_norm": 44.5, "grad_norm_var": 18.795833333333334, "learning_rate": 0.0001, "loss": 7.2921, "loss/crossentropy": 1.982000894844532, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.1895873911678791, "step": 13400 }, { "epoch": 0.447, "grad_norm": 37.0, "grad_norm_var": 24.075455729166666, "learning_rate": 0.0001, "loss": 7.2108, "loss/crossentropy": 1.9171406604349612, "loss/hidden": 3.30625, "loss/jsd": 0.0, "loss/logits": 0.18148462558165193, "step": 13410 }, { "epoch": 0.44733333333333336, "grad_norm": 29.25, "grad_norm_var": 11.3806640625, "learning_rate": 0.0001, "loss": 7.1321, "loss/crossentropy": 2.017513682693243, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.18248296082019805, "step": 13420 }, { "epoch": 0.44766666666666666, "grad_norm": 37.0, "grad_norm_var": 15.508333333333333, "learning_rate": 0.0001, "loss": 7.043, "loss/crossentropy": 2.0521533638238907, "loss/hidden": 3.325390625, "loss/jsd": 0.0, "loss/logits": 0.18131908401846886, "step": 13430 }, { "epoch": 0.448, "grad_norm": 26.125, "grad_norm_var": 12.25390625, "learning_rate": 0.0001, "loss": 7.1442, "loss/crossentropy": 2.0275149777531625, "loss/hidden": 3.32734375, "loss/jsd": 0.0, "loss/logits": 0.17429129611700772, "step": 13440 }, { "epoch": 0.4483333333333333, "grad_norm": 33.0, "grad_norm_var": 11.214322916666667, "learning_rate": 0.0001, "loss": 7.0361, "loss/crossentropy": 1.8195183783769608, "loss/hidden": 3.446484375, "loss/jsd": 0.0, "loss/logits": 0.19052925612777472, "step": 13450 }, { "epoch": 0.44866666666666666, "grad_norm": 27.125, "grad_norm_var": 12.353059895833333, "learning_rate": 0.0001, "loss": 7.0954, "loss/crossentropy": 1.8622570484876633, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.17434460930526258, "step": 13460 }, { "epoch": 0.449, "grad_norm": 34.25, "grad_norm_var": 7.6587890625, "learning_rate": 0.0001, "loss": 7.1898, "loss/crossentropy": 1.9870690882205964, "loss/hidden": 3.316796875, "loss/jsd": 0.0, "loss/logits": 0.17655272763222457, "step": 13470 }, { "epoch": 0.4493333333333333, "grad_norm": 29.75, "grad_norm_var": 9.496809895833334, "learning_rate": 0.0001, "loss": 7.1173, "loss/crossentropy": 2.003261724859476, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.17976139234378935, "step": 13480 }, { "epoch": 0.44966666666666666, "grad_norm": 38.0, "grad_norm_var": 14.865559895833334, "learning_rate": 0.0001, "loss": 7.136, "loss/crossentropy": 1.9371051900088787, "loss/hidden": 3.41015625, "loss/jsd": 0.0, "loss/logits": 0.1780864058993757, "step": 13490 }, { "epoch": 0.45, "grad_norm": 5737807872.0, "grad_norm_var": 2.0576524247469391e+18, "learning_rate": 0.0001, "loss": 7.3776, "loss/crossentropy": 2.017524632811546, "loss/hidden": 3.738671875, "loss/jsd": 0.0, "loss/logits": 0.2000157820060849, "step": 13500 }, { "epoch": 0.4503333333333333, "grad_norm": 35.0, "grad_norm_var": 2.057652422708822e+18, "learning_rate": 0.0001, "loss": 7.2621, "loss/crossentropy": 1.9558642476797103, "loss/hidden": 3.288671875, "loss/jsd": 0.0, "loss/logits": 0.17204577308148145, "step": 13510 }, { "epoch": 0.45066666666666666, "grad_norm": 29.125, "grad_norm_var": 12.001041666666667, "learning_rate": 0.0001, "loss": 7.1751, "loss/crossentropy": 1.8655981071293355, "loss/hidden": 3.3375, "loss/jsd": 0.0, "loss/logits": 0.18581798635423183, "step": 13520 }, { "epoch": 0.451, "grad_norm": 43.75, "grad_norm_var": 20.3869140625, "learning_rate": 0.0001, "loss": 7.0832, "loss/crossentropy": 1.9269152946770192, "loss/hidden": 3.349609375, "loss/jsd": 0.0, "loss/logits": 0.1797495674341917, "step": 13530 }, { "epoch": 0.4513333333333333, "grad_norm": 30.875, "grad_norm_var": 18.293489583333333, "learning_rate": 0.0001, "loss": 7.1542, "loss/crossentropy": 1.8559065647423267, "loss/hidden": 3.39609375, "loss/jsd": 0.0, "loss/logits": 0.1799021612852812, "step": 13540 }, { "epoch": 0.45166666666666666, "grad_norm": 33.25, "grad_norm_var": 31.49375, "learning_rate": 0.0001, "loss": 7.1085, "loss/crossentropy": 1.9238978043198585, "loss/hidden": 3.394921875, "loss/jsd": 0.0, "loss/logits": 0.18571034241467715, "step": 13550 }, { "epoch": 0.452, "grad_norm": 35.0, "grad_norm_var": 35.737239583333334, "learning_rate": 0.0001, "loss": 7.1775, "loss/crossentropy": 2.034532202780247, "loss/hidden": 3.2984375, "loss/jsd": 0.0, "loss/logits": 0.17688622865825893, "step": 13560 }, { "epoch": 0.4523333333333333, "grad_norm": 31.125, "grad_norm_var": 10.258268229166667, "learning_rate": 0.0001, "loss": 7.1764, "loss/crossentropy": 1.9171429850161075, "loss/hidden": 3.503515625, "loss/jsd": 0.0, "loss/logits": 0.19607337545603515, "step": 13570 }, { "epoch": 0.45266666666666666, "grad_norm": 29.25, "grad_norm_var": 3.834830729166667, "learning_rate": 0.0001, "loss": 7.0753, "loss/crossentropy": 1.9000761061906815, "loss/hidden": 3.455859375, "loss/jsd": 0.0, "loss/logits": 0.18145595137029885, "step": 13580 }, { "epoch": 0.453, "grad_norm": 36.25, "grad_norm_var": 5.373958333333333, "learning_rate": 0.0001, "loss": 7.0834, "loss/crossentropy": 1.9930811017751693, "loss/hidden": 3.35703125, "loss/jsd": 0.0, "loss/logits": 0.18698291517794133, "step": 13590 }, { "epoch": 0.4533333333333333, "grad_norm": 35.25, "grad_norm_var": 7.59375, "learning_rate": 0.0001, "loss": 7.2876, "loss/crossentropy": 2.075080066919327, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.20524356812238692, "step": 13600 }, { "epoch": 0.45366666666666666, "grad_norm": 29.625, "grad_norm_var": 9.158072916666667, "learning_rate": 0.0001, "loss": 7.2466, "loss/crossentropy": 2.051040123403072, "loss/hidden": 3.38125, "loss/jsd": 0.0, "loss/logits": 0.1915690576657653, "step": 13610 }, { "epoch": 0.454, "grad_norm": 35.75, "grad_norm_var": 11.01875, "learning_rate": 0.0001, "loss": 7.2376, "loss/crossentropy": 2.0414855420589446, "loss/hidden": 3.329296875, "loss/jsd": 0.0, "loss/logits": 0.1812042098492384, "step": 13620 }, { "epoch": 0.4543333333333333, "grad_norm": 30.125, "grad_norm_var": 9.2681640625, "learning_rate": 0.0001, "loss": 7.1839, "loss/crossentropy": 1.8989726789295673, "loss/hidden": 3.446484375, "loss/jsd": 0.0, "loss/logits": 0.1779768768697977, "step": 13630 }, { "epoch": 0.45466666666666666, "grad_norm": 35.75, "grad_norm_var": 13.395768229166666, "learning_rate": 0.0001, "loss": 7.3019, "loss/crossentropy": 2.2058630764484404, "loss/hidden": 3.3796875, "loss/jsd": 0.0, "loss/logits": 0.18832362797111274, "step": 13640 }, { "epoch": 0.455, "grad_norm": 32.25, "grad_norm_var": 11.412434895833334, "learning_rate": 0.0001, "loss": 7.2294, "loss/crossentropy": 1.9786066941916942, "loss/hidden": 3.44296875, "loss/jsd": 0.0, "loss/logits": 0.18877314804121853, "step": 13650 }, { "epoch": 0.4553333333333333, "grad_norm": 29.625, "grad_norm_var": 16.6478515625, "learning_rate": 0.0001, "loss": 7.0988, "loss/crossentropy": 2.034154790639877, "loss/hidden": 3.41328125, "loss/jsd": 0.0, "loss/logits": 0.20130378603935242, "step": 13660 }, { "epoch": 0.45566666666666666, "grad_norm": 32.25, "grad_norm_var": 15.199739583333333, "learning_rate": 0.0001, "loss": 7.1508, "loss/crossentropy": 2.1030883103609086, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.20327686462551356, "step": 13670 }, { "epoch": 0.456, "grad_norm": 35.0, "grad_norm_var": 8.778580729166666, "learning_rate": 0.0001, "loss": 7.3133, "loss/crossentropy": 1.9806414268910886, "loss/hidden": 3.282421875, "loss/jsd": 0.0, "loss/logits": 0.16928210109472275, "step": 13680 }, { "epoch": 0.4563333333333333, "grad_norm": 33.0, "grad_norm_var": 4.726497395833333, "learning_rate": 0.0001, "loss": 7.2696, "loss/crossentropy": 2.007010492682457, "loss/hidden": 3.464453125, "loss/jsd": 0.0, "loss/logits": 0.19361307416111231, "step": 13690 }, { "epoch": 0.45666666666666667, "grad_norm": 31.25, "grad_norm_var": 5.4837890625, "learning_rate": 0.0001, "loss": 7.2402, "loss/crossentropy": 1.8235751613974571, "loss/hidden": 3.415234375, "loss/jsd": 0.0, "loss/logits": 0.18109319936484097, "step": 13700 }, { "epoch": 0.457, "grad_norm": 28.75, "grad_norm_var": 3.503125, "learning_rate": 0.0001, "loss": 7.2435, "loss/crossentropy": 2.0571963757276537, "loss/hidden": 3.33515625, "loss/jsd": 0.0, "loss/logits": 0.18080200497061014, "step": 13710 }, { "epoch": 0.4573333333333333, "grad_norm": 36.75, "grad_norm_var": 10.729622395833333, "learning_rate": 0.0001, "loss": 7.2434, "loss/crossentropy": 1.9216804102063179, "loss/hidden": 3.39140625, "loss/jsd": 0.0, "loss/logits": 0.18269547251984478, "step": 13720 }, { "epoch": 0.45766666666666667, "grad_norm": 28.125, "grad_norm_var": 9.25, "learning_rate": 0.0001, "loss": 7.2081, "loss/crossentropy": 2.074676251411438, "loss/hidden": 3.412109375, "loss/jsd": 0.0, "loss/logits": 0.18605531882494689, "step": 13730 }, { "epoch": 0.458, "grad_norm": 26.0, "grad_norm_var": 8.945768229166667, "learning_rate": 0.0001, "loss": 7.1585, "loss/crossentropy": 2.024524886906147, "loss/hidden": 3.414453125, "loss/jsd": 0.0, "loss/logits": 0.1849360415711999, "step": 13740 }, { "epoch": 0.4583333333333333, "grad_norm": 29.25, "grad_norm_var": 22.959375, "learning_rate": 0.0001, "loss": 7.2552, "loss/crossentropy": 1.8464436307549477, "loss/hidden": 3.409765625, "loss/jsd": 0.0, "loss/logits": 0.18502899510785936, "step": 13750 }, { "epoch": 0.45866666666666667, "grad_norm": 38.25, "grad_norm_var": 22.241080729166665, "learning_rate": 0.0001, "loss": 7.2138, "loss/crossentropy": 2.1008409567177297, "loss/hidden": 3.408203125, "loss/jsd": 0.0, "loss/logits": 0.1840114699676633, "step": 13760 }, { "epoch": 0.459, "grad_norm": 27.5, "grad_norm_var": 8.494791666666666, "learning_rate": 0.0001, "loss": 7.161, "loss/crossentropy": 2.0837729245424272, "loss/hidden": 3.341796875, "loss/jsd": 0.0, "loss/logits": 0.18524854201823474, "step": 13770 }, { "epoch": 0.4593333333333333, "grad_norm": 31.75, "grad_norm_var": 8.678125, "learning_rate": 0.0001, "loss": 7.1876, "loss/crossentropy": 2.019195820391178, "loss/hidden": 3.365625, "loss/jsd": 0.0, "loss/logits": 0.18871064409613608, "step": 13780 }, { "epoch": 0.45966666666666667, "grad_norm": 26.375, "grad_norm_var": 6.366666666666666, "learning_rate": 0.0001, "loss": 7.127, "loss/crossentropy": 1.9793319948017598, "loss/hidden": 3.303515625, "loss/jsd": 0.0, "loss/logits": 0.17319776988588273, "step": 13790 }, { "epoch": 0.46, "grad_norm": 29.5, "grad_norm_var": 4.164322916666666, "learning_rate": 0.0001, "loss": 7.2287, "loss/crossentropy": 1.9127621680498124, "loss/hidden": 3.340625, "loss/jsd": 0.0, "loss/logits": 0.16671730391681194, "step": 13800 }, { "epoch": 0.4603333333333333, "grad_norm": 32.0, "grad_norm_var": 4.4009765625, "learning_rate": 0.0001, "loss": 7.2896, "loss/crossentropy": 1.919666799157858, "loss/hidden": 3.5125, "loss/jsd": 0.0, "loss/logits": 0.19349863342940807, "step": 13810 }, { "epoch": 0.46066666666666667, "grad_norm": 27.5, "grad_norm_var": 4.406705729166666, "learning_rate": 0.0001, "loss": 7.2532, "loss/crossentropy": 2.1008009389042854, "loss/hidden": 3.44921875, "loss/jsd": 0.0, "loss/logits": 0.2111218985170126, "step": 13820 }, { "epoch": 0.461, "grad_norm": 32.0, "grad_norm_var": 3.949934895833333, "learning_rate": 0.0001, "loss": 7.2596, "loss/crossentropy": 2.0670258209109305, "loss/hidden": 3.336328125, "loss/jsd": 0.0, "loss/logits": 0.18374508051201702, "step": 13830 }, { "epoch": 0.4613333333333333, "grad_norm": 30.0, "grad_norm_var": 5.022330729166667, "learning_rate": 0.0001, "loss": 7.3122, "loss/crossentropy": 2.0331405267119407, "loss/hidden": 3.459375, "loss/jsd": 0.0, "loss/logits": 0.18541072569787503, "step": 13840 }, { "epoch": 0.46166666666666667, "grad_norm": 30.375, "grad_norm_var": 2.7832682291666666, "learning_rate": 0.0001, "loss": 7.2023, "loss/crossentropy": 1.9869826652109623, "loss/hidden": 3.35390625, "loss/jsd": 0.0, "loss/logits": 0.16925314180552958, "step": 13850 }, { "epoch": 0.462, "grad_norm": 37.5, "grad_norm_var": 12.5650390625, "learning_rate": 0.0001, "loss": 7.2985, "loss/crossentropy": 2.060637920349836, "loss/hidden": 3.375390625, "loss/jsd": 0.0, "loss/logits": 0.18476733108982443, "step": 13860 }, { "epoch": 0.4623333333333333, "grad_norm": 33.0, "grad_norm_var": 27.5541015625, "learning_rate": 0.0001, "loss": 7.1753, "loss/crossentropy": 1.9634750574827193, "loss/hidden": 3.396484375, "loss/jsd": 0.0, "loss/logits": 0.1784058004617691, "step": 13870 }, { "epoch": 0.46266666666666667, "grad_norm": 30.125, "grad_norm_var": 12.693684895833334, "learning_rate": 0.0001, "loss": 7.291, "loss/crossentropy": 1.9411419093608857, "loss/hidden": 3.389453125, "loss/jsd": 0.0, "loss/logits": 0.19258206877857448, "step": 13880 }, { "epoch": 0.463, "grad_norm": 29.5, "grad_norm_var": 11.192708333333334, "learning_rate": 0.0001, "loss": 7.2104, "loss/crossentropy": 1.9191410437226295, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.1851293832063675, "step": 13890 }, { "epoch": 0.4633333333333333, "grad_norm": 32.5, "grad_norm_var": 5.1056640625, "learning_rate": 0.0001, "loss": 7.4042, "loss/crossentropy": 2.041092386841774, "loss/hidden": 3.451171875, "loss/jsd": 0.0, "loss/logits": 0.19642320200800895, "step": 13900 }, { "epoch": 0.46366666666666667, "grad_norm": 33.5, "grad_norm_var": 5.095247395833334, "learning_rate": 0.0001, "loss": 7.1782, "loss/crossentropy": 1.9209259033203125, "loss/hidden": 3.301171875, "loss/jsd": 0.0, "loss/logits": 0.172767742164433, "step": 13910 }, { "epoch": 0.464, "grad_norm": 32.0, "grad_norm_var": 7.739322916666667, "learning_rate": 0.0001, "loss": 7.21, "loss/crossentropy": 2.115266653895378, "loss/hidden": 3.389453125, "loss/jsd": 0.0, "loss/logits": 0.1856145763769746, "step": 13920 }, { "epoch": 0.4643333333333333, "grad_norm": 28.625, "grad_norm_var": 10.447916666666666, "learning_rate": 0.0001, "loss": 7.2492, "loss/crossentropy": 1.9952922590076922, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.1768399015069008, "step": 13930 }, { "epoch": 0.4646666666666667, "grad_norm": 32.75, "grad_norm_var": 7.995572916666666, "learning_rate": 0.0001, "loss": 7.2501, "loss/crossentropy": 1.9574853405356407, "loss/hidden": 3.373046875, "loss/jsd": 0.0, "loss/logits": 0.18205814994871616, "step": 13940 }, { "epoch": 0.465, "grad_norm": 31.5, "grad_norm_var": 15.183072916666667, "learning_rate": 0.0001, "loss": 7.2645, "loss/crossentropy": 2.0965874053537847, "loss/hidden": 3.394140625, "loss/jsd": 0.0, "loss/logits": 0.19973656507208942, "step": 13950 }, { "epoch": 0.4653333333333333, "grad_norm": 29.125, "grad_norm_var": 17.470247395833333, "learning_rate": 0.0001, "loss": 7.2782, "loss/crossentropy": 2.008360803127289, "loss/hidden": 3.35703125, "loss/jsd": 0.0, "loss/logits": 0.19034159034490586, "step": 13960 }, { "epoch": 0.4656666666666667, "grad_norm": 28.625, "grad_norm_var": 4.143684895833333, "learning_rate": 0.0001, "loss": 7.0826, "loss/crossentropy": 1.915960668027401, "loss/hidden": 3.28203125, "loss/jsd": 0.0, "loss/logits": 0.1710082046687603, "step": 13970 }, { "epoch": 0.466, "grad_norm": 29.75, "grad_norm_var": 2.8863932291666665, "learning_rate": 0.0001, "loss": 7.2426, "loss/crossentropy": 2.017311280965805, "loss/hidden": 3.46953125, "loss/jsd": 0.0, "loss/logits": 0.19221649710088967, "step": 13980 }, { "epoch": 0.4663333333333333, "grad_norm": 41.25, "grad_norm_var": 2.2295632672067333e+18, "learning_rate": 0.0001, "loss": 7.291, "loss/crossentropy": 1.9200906842947005, "loss/hidden": 3.341015625, "loss/jsd": 0.0, "loss/logits": 0.17767350990325212, "step": 13990 }, { "epoch": 0.4666666666666667, "grad_norm": 30.875, "grad_norm_var": 2.2295632663419377e+18, "learning_rate": 0.0001, "loss": 7.2845, "loss/crossentropy": 2.00697271078825, "loss/hidden": 3.4671875, "loss/jsd": 0.0, "loss/logits": 0.20020998753607272, "step": 14000 }, { "epoch": 0.467, "grad_norm": 30.375, "grad_norm_var": 2.871326210602898e+18, "learning_rate": 0.0001, "loss": 7.3348, "loss/crossentropy": 1.8769463069736958, "loss/hidden": 3.36328125, "loss/jsd": 0.0, "loss/logits": 0.17324418537318706, "step": 14010 }, { "epoch": 0.4673333333333333, "grad_norm": 30.75, "grad_norm_var": 14.837955729166667, "learning_rate": 0.0001, "loss": 7.1903, "loss/crossentropy": 2.014280915260315, "loss/hidden": 3.472265625, "loss/jsd": 0.0, "loss/logits": 0.195509172976017, "step": 14020 }, { "epoch": 0.4676666666666667, "grad_norm": 31.5, "grad_norm_var": 7.077083333333333, "learning_rate": 0.0001, "loss": 7.3075, "loss/crossentropy": 1.8367955565452576, "loss/hidden": 3.4625, "loss/jsd": 0.0, "loss/logits": 0.18978505972772836, "step": 14030 }, { "epoch": 0.468, "grad_norm": 37.5, "grad_norm_var": 14.403580729166666, "learning_rate": 0.0001, "loss": 7.2736, "loss/crossentropy": 1.78411867916584, "loss/hidden": 3.361328125, "loss/jsd": 0.0, "loss/logits": 0.16776023786514999, "step": 14040 }, { "epoch": 0.4683333333333333, "grad_norm": 32.25, "grad_norm_var": 11.570833333333333, "learning_rate": 0.0001, "loss": 7.1731, "loss/crossentropy": 1.9803957544267177, "loss/hidden": 3.40078125, "loss/jsd": 0.0, "loss/logits": 0.18811767078004776, "step": 14050 }, { "epoch": 0.4686666666666667, "grad_norm": 28.375, "grad_norm_var": 11.007747395833333, "learning_rate": 0.0001, "loss": 7.3203, "loss/crossentropy": 1.9736334435641765, "loss/hidden": 3.38515625, "loss/jsd": 0.0, "loss/logits": 0.1970407435670495, "step": 14060 }, { "epoch": 0.469, "grad_norm": 30.5, "grad_norm_var": 9.547916666666667, "learning_rate": 0.0001, "loss": 7.1927, "loss/crossentropy": 1.9340065911412239, "loss/hidden": 3.48828125, "loss/jsd": 0.0, "loss/logits": 0.18652195790782572, "step": 14070 }, { "epoch": 0.4693333333333333, "grad_norm": 28.5, "grad_norm_var": 3.854622395833333, "learning_rate": 0.0001, "loss": 7.1735, "loss/crossentropy": 2.111950046569109, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.18685067230835556, "step": 14080 }, { "epoch": 0.4696666666666667, "grad_norm": 29.625, "grad_norm_var": 4.346875, "learning_rate": 0.0001, "loss": 7.1972, "loss/crossentropy": 2.012338759750128, "loss/hidden": 3.35703125, "loss/jsd": 0.0, "loss/logits": 0.17915884824469686, "step": 14090 }, { "epoch": 0.47, "grad_norm": 27.875, "grad_norm_var": 6.642643229166667, "learning_rate": 0.0001, "loss": 7.1036, "loss/crossentropy": 1.793365080654621, "loss/hidden": 3.314453125, "loss/jsd": 0.0, "loss/logits": 0.1639671189710498, "step": 14100 }, { "epoch": 0.4703333333333333, "grad_norm": 35.5, "grad_norm_var": 7.004166666666666, "learning_rate": 0.0001, "loss": 7.1668, "loss/crossentropy": 1.9357743330299855, "loss/hidden": 3.358203125, "loss/jsd": 0.0, "loss/logits": 0.18634977592155338, "step": 14110 }, { "epoch": 0.4706666666666667, "grad_norm": 30.375, "grad_norm_var": 7.690625, "learning_rate": 0.0001, "loss": 7.3481, "loss/crossentropy": 2.083378294110298, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.1864456120878458, "step": 14120 }, { "epoch": 0.471, "grad_norm": 26.625, "grad_norm_var": 5.374739583333334, "learning_rate": 0.0001, "loss": 7.1287, "loss/crossentropy": 1.996493048220873, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.1742098187096417, "step": 14130 }, { "epoch": 0.4713333333333333, "grad_norm": 32.5, "grad_norm_var": 2.3135416666666666, "learning_rate": 0.0001, "loss": 7.345, "loss/crossentropy": 2.110854651033878, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.1971131669357419, "step": 14140 }, { "epoch": 0.4716666666666667, "grad_norm": 31.75, "grad_norm_var": 1.7858723958333333, "learning_rate": 0.0001, "loss": 7.4198, "loss/crossentropy": 2.208771276473999, "loss/hidden": 3.441796875, "loss/jsd": 0.0, "loss/logits": 0.20393431689590216, "step": 14150 }, { "epoch": 0.472, "grad_norm": 30.875, "grad_norm_var": 3.005989583333333, "learning_rate": 0.0001, "loss": 7.299, "loss/crossentropy": 1.9959791138768197, "loss/hidden": 3.403515625, "loss/jsd": 0.0, "loss/logits": 0.1865115189924836, "step": 14160 }, { "epoch": 0.4723333333333333, "grad_norm": 31.25, "grad_norm_var": 1.4497395833333333, "learning_rate": 0.0001, "loss": 7.3079, "loss/crossentropy": 2.062359070777893, "loss/hidden": 3.46796875, "loss/jsd": 0.0, "loss/logits": 0.2044944256544113, "step": 14170 }, { "epoch": 0.4726666666666667, "grad_norm": 32.25, "grad_norm_var": 4.857291666666667, "learning_rate": 0.0001, "loss": 7.3495, "loss/crossentropy": 2.040824881196022, "loss/hidden": 3.53984375, "loss/jsd": 0.0, "loss/logits": 0.20063987988978624, "step": 14180 }, { "epoch": 0.473, "grad_norm": 32.75, "grad_norm_var": 4.599934895833333, "learning_rate": 0.0001, "loss": 7.2478, "loss/crossentropy": 1.770907675474882, "loss/hidden": 3.49375, "loss/jsd": 0.0, "loss/logits": 0.17634920533746481, "step": 14190 }, { "epoch": 0.47333333333333333, "grad_norm": 32.5, "grad_norm_var": 3.6635416666666667, "learning_rate": 0.0001, "loss": 7.3193, "loss/crossentropy": 1.931127228960395, "loss/hidden": 3.468359375, "loss/jsd": 0.0, "loss/logits": 0.194002497009933, "step": 14200 }, { "epoch": 0.4736666666666667, "grad_norm": 30.125, "grad_norm_var": 3.207747395833333, "learning_rate": 0.0001, "loss": 7.2921, "loss/crossentropy": 1.9637531742453576, "loss/hidden": 3.3484375, "loss/jsd": 0.0, "loss/logits": 0.18768604155629873, "step": 14210 }, { "epoch": 0.474, "grad_norm": 29.75, "grad_norm_var": 27.9103515625, "learning_rate": 0.0001, "loss": 7.231, "loss/crossentropy": 1.9079122804105282, "loss/hidden": 3.35546875, "loss/jsd": 0.0, "loss/logits": 0.17479714369401336, "step": 14220 }, { "epoch": 0.47433333333333333, "grad_norm": 28.375, "grad_norm_var": 27.735872395833333, "learning_rate": 0.0001, "loss": 7.2021, "loss/crossentropy": 1.8754981122910976, "loss/hidden": 3.4015625, "loss/jsd": 0.0, "loss/logits": 0.17823023833334445, "step": 14230 }, { "epoch": 0.4746666666666667, "grad_norm": 31.25, "grad_norm_var": 1.87890625, "learning_rate": 0.0001, "loss": 7.2613, "loss/crossentropy": 1.758564366400242, "loss/hidden": 3.41796875, "loss/jsd": 0.0, "loss/logits": 0.1810397258028388, "step": 14240 }, { "epoch": 0.475, "grad_norm": 28.375, "grad_norm_var": 3.498893229166667, "learning_rate": 0.0001, "loss": 7.2265, "loss/crossentropy": 1.8654813591390849, "loss/hidden": 3.45390625, "loss/jsd": 0.0, "loss/logits": 0.18339749025180935, "step": 14250 }, { "epoch": 0.47533333333333333, "grad_norm": 33.5, "grad_norm_var": 4.649739583333333, "learning_rate": 0.0001, "loss": 7.3453, "loss/crossentropy": 2.089652580022812, "loss/hidden": 3.37578125, "loss/jsd": 0.0, "loss/logits": 0.1807380985468626, "step": 14260 }, { "epoch": 0.4756666666666667, "grad_norm": 32.25, "grad_norm_var": 14.39140625, "learning_rate": 0.0001, "loss": 7.2797, "loss/crossentropy": 2.096323397755623, "loss/hidden": 3.465625, "loss/jsd": 0.0, "loss/logits": 0.19416863657534122, "step": 14270 }, { "epoch": 0.476, "grad_norm": 30.375, "grad_norm_var": 4.297330729166666, "learning_rate": 0.0001, "loss": 7.3681, "loss/crossentropy": 1.9862741835415363, "loss/hidden": 3.3484375, "loss/jsd": 0.0, "loss/logits": 0.18516041403636335, "step": 14280 }, { "epoch": 0.47633333333333333, "grad_norm": 32.0, "grad_norm_var": 5.165625, "learning_rate": 0.0001, "loss": 7.2465, "loss/crossentropy": 2.070752876251936, "loss/hidden": 3.34140625, "loss/jsd": 0.0, "loss/logits": 0.17928477935492992, "step": 14290 }, { "epoch": 0.4766666666666667, "grad_norm": 28.875, "grad_norm_var": 2.9497395833333333, "learning_rate": 0.0001, "loss": 7.1593, "loss/crossentropy": 2.10032504722476, "loss/hidden": 3.290234375, "loss/jsd": 0.0, "loss/logits": 0.18202332993969322, "step": 14300 }, { "epoch": 0.477, "grad_norm": 30.875, "grad_norm_var": 0.8775390625, "learning_rate": 0.0001, "loss": 7.2774, "loss/crossentropy": 1.8922498770058156, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.17946081692352892, "step": 14310 }, { "epoch": 0.47733333333333333, "grad_norm": 28.5, "grad_norm_var": 3.1754557291666665, "learning_rate": 0.0001, "loss": 7.3143, "loss/crossentropy": 1.8936010167002677, "loss/hidden": 3.46484375, "loss/jsd": 0.0, "loss/logits": 0.1867059664800763, "step": 14320 }, { "epoch": 0.4776666666666667, "grad_norm": 31.5, "grad_norm_var": 4.528125, "learning_rate": 0.0001, "loss": 7.2277, "loss/crossentropy": 1.921551989018917, "loss/hidden": 3.42890625, "loss/jsd": 0.0, "loss/logits": 0.19147583292797207, "step": 14330 }, { "epoch": 0.478, "grad_norm": 31.875, "grad_norm_var": 3.7660807291666667, "learning_rate": 0.0001, "loss": 7.3654, "loss/crossentropy": 1.9913725525140762, "loss/hidden": 3.510546875, "loss/jsd": 0.0, "loss/logits": 0.24330975264310836, "step": 14340 }, { "epoch": 0.47833333333333333, "grad_norm": 28.75, "grad_norm_var": 4.665559895833334, "learning_rate": 0.0001, "loss": 7.1606, "loss/crossentropy": 1.9227544002234935, "loss/hidden": 3.33828125, "loss/jsd": 0.0, "loss/logits": 0.17083032745867968, "step": 14350 }, { "epoch": 0.4786666666666667, "grad_norm": 27.75, "grad_norm_var": 2.552018229166667, "learning_rate": 0.0001, "loss": 7.2825, "loss/crossentropy": 1.9543706424534322, "loss/hidden": 3.484765625, "loss/jsd": 0.0, "loss/logits": 0.1910249523818493, "step": 14360 }, { "epoch": 0.479, "grad_norm": 32.25, "grad_norm_var": 2.8395182291666665, "learning_rate": 0.0001, "loss": 7.3107, "loss/crossentropy": 2.0925781570374964, "loss/hidden": 3.33359375, "loss/jsd": 0.0, "loss/logits": 0.1845812678337097, "step": 14370 }, { "epoch": 0.47933333333333333, "grad_norm": 28.75, "grad_norm_var": 5.264322916666667, "learning_rate": 0.0001, "loss": 7.3043, "loss/crossentropy": 1.8956058517098426, "loss/hidden": 3.43125, "loss/jsd": 0.0, "loss/logits": 0.1832061681896448, "step": 14380 }, { "epoch": 0.4796666666666667, "grad_norm": 30.25, "grad_norm_var": 5.139583333333333, "learning_rate": 0.0001, "loss": 7.2023, "loss/crossentropy": 1.9859229177236557, "loss/hidden": 3.284375, "loss/jsd": 0.0, "loss/logits": 0.17791289957240225, "step": 14390 }, { "epoch": 0.48, "grad_norm": 31.375, "grad_norm_var": 2.6431640625, "learning_rate": 0.0001, "loss": 7.3538, "loss/crossentropy": 1.9583167258650065, "loss/hidden": 3.40703125, "loss/jsd": 0.0, "loss/logits": 0.18876892141997814, "step": 14400 }, { "epoch": 0.48033333333333333, "grad_norm": 32.75, "grad_norm_var": 2.7059895833333334, "learning_rate": 0.0001, "loss": 7.4363, "loss/crossentropy": 2.107966187596321, "loss/hidden": 3.48203125, "loss/jsd": 0.0, "loss/logits": 0.19107939191162587, "step": 14410 }, { "epoch": 0.4806666666666667, "grad_norm": 29.875, "grad_norm_var": 3.9322265625, "learning_rate": 0.0001, "loss": 7.3112, "loss/crossentropy": 1.8692240186035634, "loss/hidden": 3.528125, "loss/jsd": 0.0, "loss/logits": 0.1931241899728775, "step": 14420 }, { "epoch": 0.481, "grad_norm": 31.25, "grad_norm_var": 2.37890625, "learning_rate": 0.0001, "loss": 7.2615, "loss/crossentropy": 2.0040179274976255, "loss/hidden": 3.45234375, "loss/jsd": 0.0, "loss/logits": 0.20073560662567616, "step": 14430 }, { "epoch": 0.48133333333333334, "grad_norm": 29.125, "grad_norm_var": 2.598372395833333, "learning_rate": 0.0001, "loss": 7.2191, "loss/crossentropy": 1.9971159234642983, "loss/hidden": 3.359765625, "loss/jsd": 0.0, "loss/logits": 0.18026294596493245, "step": 14440 }, { "epoch": 0.4816666666666667, "grad_norm": 28.75, "grad_norm_var": 2.6186848958333333, "learning_rate": 0.0001, "loss": 7.294, "loss/crossentropy": 1.9147129327058792, "loss/hidden": 3.396875, "loss/jsd": 0.0, "loss/logits": 0.17570122852921485, "step": 14450 }, { "epoch": 0.482, "grad_norm": 31.25, "grad_norm_var": 2.021875, "learning_rate": 0.0001, "loss": 7.2644, "loss/crossentropy": 1.9100076489150524, "loss/hidden": 3.38359375, "loss/jsd": 0.0, "loss/logits": 0.1739309343509376, "step": 14460 }, { "epoch": 0.48233333333333334, "grad_norm": 28.125, "grad_norm_var": 3.2337890625, "learning_rate": 0.0001, "loss": 7.1954, "loss/crossentropy": 1.932772123813629, "loss/hidden": 3.270703125, "loss/jsd": 0.0, "loss/logits": 0.16541298273950816, "step": 14470 }, { "epoch": 0.4826666666666667, "grad_norm": 31.5, "grad_norm_var": 3.840559895833333, "learning_rate": 0.0001, "loss": 7.2881, "loss/crossentropy": 2.1337453827261923, "loss/hidden": 3.3390625, "loss/jsd": 0.0, "loss/logits": 0.19551763432100416, "step": 14480 }, { "epoch": 0.483, "grad_norm": 29.375, "grad_norm_var": 3.246809895833333, "learning_rate": 0.0001, "loss": 7.2192, "loss/crossentropy": 1.8751170352101325, "loss/hidden": 3.358203125, "loss/jsd": 0.0, "loss/logits": 0.17985125011764466, "step": 14490 }, { "epoch": 0.48333333333333334, "grad_norm": 33.0, "grad_norm_var": 4.066666666666666, "learning_rate": 0.0001, "loss": 7.3354, "loss/crossentropy": 2.095403380692005, "loss/hidden": 3.39921875, "loss/jsd": 0.0, "loss/logits": 0.18491684161126615, "step": 14500 }, { "epoch": 0.4836666666666667, "grad_norm": 30.5, "grad_norm_var": 4.2541015625, "learning_rate": 0.0001, "loss": 7.3243, "loss/crossentropy": 2.0919691421091557, "loss/hidden": 3.47578125, "loss/jsd": 0.0, "loss/logits": 0.18806700157001616, "step": 14510 }, { "epoch": 0.484, "grad_norm": 34.5, "grad_norm_var": 2.4494140625, "learning_rate": 0.0001, "loss": 7.2747, "loss/crossentropy": 1.9812095284461975, "loss/hidden": 3.356640625, "loss/jsd": 0.0, "loss/logits": 0.17819451428949834, "step": 14520 }, { "epoch": 0.48433333333333334, "grad_norm": 28.5, "grad_norm_var": 186.58743489583333, "learning_rate": 0.0001, "loss": 7.3561, "loss/crossentropy": 1.9783332735300063, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.20479958709329366, "step": 14530 }, { "epoch": 0.4846666666666667, "grad_norm": 30.0, "grad_norm_var": 1.3530598958333333, "learning_rate": 0.0001, "loss": 7.268, "loss/crossentropy": 1.9767938986420632, "loss/hidden": 3.358203125, "loss/jsd": 0.0, "loss/logits": 0.17882973086088896, "step": 14540 }, { "epoch": 0.485, "grad_norm": 30.0, "grad_norm_var": 1.0768229166666667, "learning_rate": 0.0001, "loss": 7.3649, "loss/crossentropy": 2.1269202053546907, "loss/hidden": 3.3734375, "loss/jsd": 0.0, "loss/logits": 0.1913153499364853, "step": 14550 }, { "epoch": 0.48533333333333334, "grad_norm": 29.5, "grad_norm_var": 1.7760416666666667, "learning_rate": 0.0001, "loss": 7.177, "loss/crossentropy": 2.0858373239636423, "loss/hidden": 3.407421875, "loss/jsd": 0.0, "loss/logits": 0.17888487838208675, "step": 14560 }, { "epoch": 0.4856666666666667, "grad_norm": 30.625, "grad_norm_var": 5.283333333333333, "learning_rate": 0.0001, "loss": 7.1911, "loss/crossentropy": 1.885525608062744, "loss/hidden": 3.357421875, "loss/jsd": 0.0, "loss/logits": 0.17344560530036687, "step": 14570 }, { "epoch": 0.486, "grad_norm": 31.5, "grad_norm_var": 2.4184895833333333, "learning_rate": 0.0001, "loss": 7.2973, "loss/crossentropy": 1.9727637752890588, "loss/hidden": 3.462109375, "loss/jsd": 0.0, "loss/logits": 0.19562495071440936, "step": 14580 }, { "epoch": 0.48633333333333334, "grad_norm": 30.75, "grad_norm_var": 3.6580487678935434e+18, "learning_rate": 0.0001, "loss": 7.2825, "loss/crossentropy": 1.9940708130598068, "loss/hidden": 3.35, "loss/jsd": 0.0, "loss/logits": 0.1821042774245143, "step": 14590 }, { "epoch": 0.4866666666666667, "grad_norm": 29.875, "grad_norm_var": 2.41640625, "learning_rate": 0.0001, "loss": 7.2825, "loss/crossentropy": 1.9921208329498767, "loss/hidden": 3.479296875, "loss/jsd": 0.0, "loss/logits": 0.19399751164019108, "step": 14600 }, { "epoch": 0.487, "grad_norm": 29.25, "grad_norm_var": 1.4785807291666666, "learning_rate": 0.0001, "loss": 7.3533, "loss/crossentropy": 1.991587370634079, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.2016286700963974, "step": 14610 }, { "epoch": 0.48733333333333334, "grad_norm": 28.5, "grad_norm_var": 2.5104166666666665, "learning_rate": 0.0001, "loss": 7.2231, "loss/crossentropy": 2.006051167845726, "loss/hidden": 3.384765625, "loss/jsd": 0.0, "loss/logits": 0.17960976548492907, "step": 14620 }, { "epoch": 0.4876666666666667, "grad_norm": 34.0, "grad_norm_var": 2.7145182291666665, "learning_rate": 0.0001, "loss": 7.3271, "loss/crossentropy": 2.0921467639505864, "loss/hidden": 3.36015625, "loss/jsd": 0.0, "loss/logits": 0.193321427796036, "step": 14630 }, { "epoch": 0.488, "grad_norm": 32.75, "grad_norm_var": 3.1184895833333335, "learning_rate": 0.0001, "loss": 7.2639, "loss/crossentropy": 1.8961512267589569, "loss/hidden": 3.521875, "loss/jsd": 0.0, "loss/logits": 0.18588051088154317, "step": 14640 }, { "epoch": 0.48833333333333334, "grad_norm": 31.375, "grad_norm_var": 2.3997395833333335, "learning_rate": 0.0001, "loss": 7.2498, "loss/crossentropy": 1.859757984429598, "loss/hidden": 3.419140625, "loss/jsd": 0.0, "loss/logits": 0.17947223857045175, "step": 14650 }, { "epoch": 0.4886666666666667, "grad_norm": 32.25, "grad_norm_var": 2.1285807291666665, "learning_rate": 0.0001, "loss": 7.4218, "loss/crossentropy": 1.9383841037750245, "loss/hidden": 3.51015625, "loss/jsd": 0.0, "loss/logits": 0.19589889533817767, "step": 14660 }, { "epoch": 0.489, "grad_norm": 29.5, "grad_norm_var": 2.3280598958333334, "learning_rate": 0.0001, "loss": 7.2391, "loss/crossentropy": 1.9927615858614445, "loss/hidden": 3.480078125, "loss/jsd": 0.0, "loss/logits": 0.19011715687811376, "step": 14670 }, { "epoch": 0.48933333333333334, "grad_norm": 31.375, "grad_norm_var": 144.79212239583333, "learning_rate": 0.0001, "loss": 7.2848, "loss/crossentropy": 1.9731652524322272, "loss/hidden": 3.468359375, "loss/jsd": 0.0, "loss/logits": 0.19373478470370173, "step": 14680 }, { "epoch": 0.48966666666666664, "grad_norm": 5704253440.0, "grad_norm_var": 4.424317461162658e+18, "learning_rate": 0.0001, "loss": 7.5245, "loss/crossentropy": 2.0233053267002106, "loss/hidden": 3.45625, "loss/jsd": 0.0, "loss/logits": 0.19612479992210866, "step": 14690 }, { "epoch": 0.49, "grad_norm": 28.25, "grad_norm_var": 2.033656684167037e+18, "learning_rate": 0.0001, "loss": 7.3467, "loss/crossentropy": 2.0942041859030724, "loss/hidden": 3.3921875, "loss/jsd": 0.0, "loss/logits": 0.19329715985804796, "step": 14700 }, { "epoch": 0.49033333333333334, "grad_norm": 30.75, "grad_norm_var": 3.1962890625, "learning_rate": 0.0001, "loss": 7.335, "loss/crossentropy": 1.9769064359366895, "loss/hidden": 3.505859375, "loss/jsd": 0.0, "loss/logits": 0.19277337044477463, "step": 14710 }, { "epoch": 0.49066666666666664, "grad_norm": 30.5, "grad_norm_var": 4.612434895833333, "learning_rate": 0.0001, "loss": 7.2284, "loss/crossentropy": 1.8806253343820571, "loss/hidden": 3.504296875, "loss/jsd": 0.0, "loss/logits": 0.18804354909807444, "step": 14720 }, { "epoch": 0.491, "grad_norm": 31.875, "grad_norm_var": 12.257747395833333, "learning_rate": 0.0001, "loss": 7.2938, "loss/crossentropy": 2.0535501569509504, "loss/hidden": 3.549609375, "loss/jsd": 0.0, "loss/logits": 0.19250110480934382, "step": 14730 }, { "epoch": 0.49133333333333334, "grad_norm": 30.75, "grad_norm_var": 13.468489583333334, "learning_rate": 0.0001, "loss": 7.251, "loss/crossentropy": 1.8963430039584637, "loss/hidden": 3.441796875, "loss/jsd": 0.0, "loss/logits": 0.18113610297441482, "step": 14740 }, { "epoch": 0.49166666666666664, "grad_norm": 30.875, "grad_norm_var": 3.200455729166667, "learning_rate": 0.0001, "loss": 7.2252, "loss/crossentropy": 2.061627131700516, "loss/hidden": 3.44453125, "loss/jsd": 0.0, "loss/logits": 0.19164848681539298, "step": 14750 }, { "epoch": 0.492, "grad_norm": 29.125, "grad_norm_var": 2.2666015625, "learning_rate": 0.0001, "loss": 7.183, "loss/crossentropy": 1.977836200594902, "loss/hidden": 3.30078125, "loss/jsd": 0.0, "loss/logits": 0.18077237708494068, "step": 14760 }, { "epoch": 0.49233333333333335, "grad_norm": 29.875, "grad_norm_var": 85.4400390625, "learning_rate": 0.0001, "loss": 7.3142, "loss/crossentropy": 2.124833844602108, "loss/hidden": 3.459375, "loss/jsd": 0.0, "loss/logits": 0.19227419793605804, "step": 14770 }, { "epoch": 0.49266666666666664, "grad_norm": 30.25, "grad_norm_var": 2.2947265625, "learning_rate": 0.0001, "loss": 7.3118, "loss/crossentropy": 2.0769446551799775, "loss/hidden": 3.388671875, "loss/jsd": 0.0, "loss/logits": 0.1867304440587759, "step": 14780 }, { "epoch": 0.493, "grad_norm": 32.5, "grad_norm_var": 2.070572916666667, "learning_rate": 0.0001, "loss": 7.4065, "loss/crossentropy": 1.9149356804788114, "loss/hidden": 3.48359375, "loss/jsd": 0.0, "loss/logits": 0.19583360571414232, "step": 14790 }, { "epoch": 0.49333333333333335, "grad_norm": 30.5, "grad_norm_var": 9.985872395833333, "learning_rate": 0.0001, "loss": 7.3685, "loss/crossentropy": 1.9120120756328105, "loss/hidden": 3.356640625, "loss/jsd": 0.0, "loss/logits": 0.17053407710045576, "step": 14800 }, { "epoch": 0.49366666666666664, "grad_norm": 32.75, "grad_norm_var": 12.080143229166667, "learning_rate": 0.0001, "loss": 7.2798, "loss/crossentropy": 1.8602485600858927, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.17466555023565888, "step": 14810 }, { "epoch": 0.494, "grad_norm": 28.375, "grad_norm_var": 4.017643229166667, "learning_rate": 0.0001, "loss": 7.2651, "loss/crossentropy": 2.084671050310135, "loss/hidden": 3.4796875, "loss/jsd": 0.0, "loss/logits": 0.19853878244757653, "step": 14820 }, { "epoch": 0.49433333333333335, "grad_norm": 28.125, "grad_norm_var": 9.098893229166666, "learning_rate": 0.0001, "loss": 7.2115, "loss/crossentropy": 1.833353342115879, "loss/hidden": 3.375390625, "loss/jsd": 0.0, "loss/logits": 0.17049445798620583, "step": 14830 }, { "epoch": 0.49466666666666664, "grad_norm": 30.0, "grad_norm_var": 8.006184895833334, "learning_rate": 0.0001, "loss": 7.4159, "loss/crossentropy": 2.1067821979522705, "loss/hidden": 3.55078125, "loss/jsd": 0.0, "loss/logits": 0.2068447494879365, "step": 14840 }, { "epoch": 0.495, "grad_norm": 29.875, "grad_norm_var": 4.899934895833334, "learning_rate": 0.0001, "loss": 7.1766, "loss/crossentropy": 1.910665026307106, "loss/hidden": 3.414453125, "loss/jsd": 0.0, "loss/logits": 0.18037952343001962, "step": 14850 }, { "epoch": 0.49533333333333335, "grad_norm": 31.75, "grad_norm_var": 23.553580729166665, "learning_rate": 0.0001, "loss": 7.4049, "loss/crossentropy": 2.08663332760334, "loss/hidden": 3.373046875, "loss/jsd": 0.0, "loss/logits": 0.18538948502391578, "step": 14860 }, { "epoch": 0.49566666666666664, "grad_norm": 27.25, "grad_norm_var": 26.8541015625, "learning_rate": 0.0001, "loss": 7.3146, "loss/crossentropy": 1.9519363686442375, "loss/hidden": 3.460546875, "loss/jsd": 0.0, "loss/logits": 0.18595816064625978, "step": 14870 }, { "epoch": 0.496, "grad_norm": 29.125, "grad_norm_var": 2.758736221287167e+18, "learning_rate": 0.0001, "loss": 7.4647, "loss/crossentropy": 1.85757380053401, "loss/hidden": 3.559375, "loss/jsd": 0.0, "loss/logits": 0.20015630628913642, "step": 14880 }, { "epoch": 0.49633333333333335, "grad_norm": 29.25, "grad_norm_var": 2.758736220733519e+18, "learning_rate": 0.0001, "loss": 7.2809, "loss/crossentropy": 1.9004122354090214, "loss/hidden": 3.400390625, "loss/jsd": 0.0, "loss/logits": 0.18271783469244837, "step": 14890 }, { "epoch": 0.49666666666666665, "grad_norm": 29.5, "grad_norm_var": 15.068489583333333, "learning_rate": 0.0001, "loss": 7.3271, "loss/crossentropy": 1.9542576663196087, "loss/hidden": 3.4015625, "loss/jsd": 0.0, "loss/logits": 0.1828365745022893, "step": 14900 }, { "epoch": 0.497, "grad_norm": 30.625, "grad_norm_var": 2.9284656293903073e+18, "learning_rate": 0.0001, "loss": 7.3389, "loss/crossentropy": 2.0319047056138517, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.1940848847851157, "step": 14910 }, { "epoch": 0.49733333333333335, "grad_norm": 27.625, "grad_norm_var": 16.395247395833334, "learning_rate": 0.0001, "loss": 7.1623, "loss/crossentropy": 2.0061253413558005, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.17438880652189254, "step": 14920 }, { "epoch": 0.49766666666666665, "grad_norm": 31.375, "grad_norm_var": 6.645572916666667, "learning_rate": 0.0001, "loss": 7.2828, "loss/crossentropy": 1.866012156754732, "loss/hidden": 3.476953125, "loss/jsd": 0.0, "loss/logits": 0.19834656277671456, "step": 14930 }, { "epoch": 0.498, "grad_norm": 35.75, "grad_norm_var": 3.5083333333333333, "learning_rate": 0.0001, "loss": 7.289, "loss/crossentropy": 1.976353394985199, "loss/hidden": 3.45078125, "loss/jsd": 0.0, "loss/logits": 0.17669850438833237, "step": 14940 }, { "epoch": 0.49833333333333335, "grad_norm": 30.0, "grad_norm_var": 5.053125, "learning_rate": 0.0001, "loss": 7.2594, "loss/crossentropy": 1.9223392769694327, "loss/hidden": 3.454296875, "loss/jsd": 0.0, "loss/logits": 0.19165921732783317, "step": 14950 }, { "epoch": 0.49866666666666665, "grad_norm": 30.25, "grad_norm_var": 4.651822916666666, "learning_rate": 0.0001, "loss": 7.4345, "loss/crossentropy": 1.9121432788670063, "loss/hidden": 3.352734375, "loss/jsd": 0.0, "loss/logits": 0.16835108688101172, "step": 14960 }, { "epoch": 0.499, "grad_norm": 30.625, "grad_norm_var": 1.9104166666666667, "learning_rate": 0.0001, "loss": 7.2463, "loss/crossentropy": 1.9629229187965394, "loss/hidden": 3.50703125, "loss/jsd": 0.0, "loss/logits": 0.19058237224817276, "step": 14970 }, { "epoch": 0.49933333333333335, "grad_norm": 29.75, "grad_norm_var": 2.84140625, "learning_rate": 0.0001, "loss": 7.135, "loss/crossentropy": 1.833661637455225, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.17307843612506985, "step": 14980 }, { "epoch": 0.49966666666666665, "grad_norm": 31.875, "grad_norm_var": 11.799739583333333, "learning_rate": 0.0001, "loss": 7.3944, "loss/crossentropy": 1.9382908195257187, "loss/hidden": 3.428125, "loss/jsd": 0.0, "loss/logits": 0.1857702426612377, "step": 14990 }, { "epoch": 0.5, "grad_norm": 30.375, "grad_norm_var": 12.815625, "learning_rate": 0.0001, "loss": 7.263, "loss/crossentropy": 1.9668457843363285, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.19115968160331248, "step": 15000 }, { "epoch": 0.5003333333333333, "grad_norm": 30.875, "grad_norm_var": 4.258072916666666, "learning_rate": 0.0001, "loss": 7.2642, "loss/crossentropy": 1.9341649785637856, "loss/hidden": 3.465234375, "loss/jsd": 0.0, "loss/logits": 0.19099596366286278, "step": 15010 }, { "epoch": 0.5006666666666667, "grad_norm": 28.25, "grad_norm_var": 2.637434895833333, "learning_rate": 0.0001, "loss": 7.212, "loss/crossentropy": 2.1158292412757875, "loss/hidden": 3.456640625, "loss/jsd": 0.0, "loss/logits": 0.19176767682656645, "step": 15020 }, { "epoch": 0.501, "grad_norm": 30.875, "grad_norm_var": 17.722916666666666, "learning_rate": 0.0001, "loss": 7.3206, "loss/crossentropy": 2.1577776059508325, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.184534977003932, "step": 15030 }, { "epoch": 0.5013333333333333, "grad_norm": 32.75, "grad_norm_var": 18.4337890625, "learning_rate": 0.0001, "loss": 7.1828, "loss/crossentropy": 1.8885925211012364, "loss/hidden": 3.496484375, "loss/jsd": 0.0, "loss/logits": 0.1956418387591839, "step": 15040 }, { "epoch": 0.5016666666666667, "grad_norm": 31.5, "grad_norm_var": 3.999739583333333, "learning_rate": 0.0001, "loss": 7.2953, "loss/crossentropy": 1.9687896713614463, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.18052080869674683, "step": 15050 }, { "epoch": 0.502, "grad_norm": 26.375, "grad_norm_var": 5.913541666666666, "learning_rate": 0.0001, "loss": 7.3405, "loss/crossentropy": 1.9616722613573074, "loss/hidden": 3.362890625, "loss/jsd": 0.0, "loss/logits": 0.18233999414369464, "step": 15060 }, { "epoch": 0.5023333333333333, "grad_norm": 34.75, "grad_norm_var": 7.9244140625, "learning_rate": 0.0001, "loss": 7.3171, "loss/crossentropy": 1.9626036584377289, "loss/hidden": 3.47890625, "loss/jsd": 0.0, "loss/logits": 0.1945879116654396, "step": 15070 }, { "epoch": 0.5026666666666667, "grad_norm": 29.75, "grad_norm_var": 6.7103515625, "learning_rate": 0.0001, "loss": 7.2608, "loss/crossentropy": 2.009495832026005, "loss/hidden": 3.353125, "loss/jsd": 0.0, "loss/logits": 0.18170829825103282, "step": 15080 }, { "epoch": 0.503, "grad_norm": 28.75, "grad_norm_var": 3.246875, "learning_rate": 0.0001, "loss": 7.2996, "loss/crossentropy": 1.9122354075312615, "loss/hidden": 3.422265625, "loss/jsd": 0.0, "loss/logits": 0.18928468553349376, "step": 15090 }, { "epoch": 0.5033333333333333, "grad_norm": 31.0, "grad_norm_var": 2.6572916666666666, "learning_rate": 0.0001, "loss": 7.3271, "loss/crossentropy": 1.9989626854658127, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.19386794976890087, "step": 15100 }, { "epoch": 0.5036666666666667, "grad_norm": 29.25, "grad_norm_var": 5.812239583333334, "learning_rate": 0.0001, "loss": 7.2061, "loss/crossentropy": 1.9253133445978166, "loss/hidden": 3.50625, "loss/jsd": 0.0, "loss/logits": 0.18773257322609424, "step": 15110 }, { "epoch": 0.504, "grad_norm": 30.0, "grad_norm_var": 6.076822916666667, "learning_rate": 0.0001, "loss": 7.2539, "loss/crossentropy": 1.9763063967227936, "loss/hidden": 3.424609375, "loss/jsd": 0.0, "loss/logits": 0.18864987064152955, "step": 15120 }, { "epoch": 0.5043333333333333, "grad_norm": 31.0, "grad_norm_var": 2.255989583333333, "learning_rate": 0.0001, "loss": 7.2739, "loss/crossentropy": 1.9393825478851796, "loss/hidden": 3.267578125, "loss/jsd": 0.0, "loss/logits": 0.16808282807469369, "step": 15130 }, { "epoch": 0.5046666666666667, "grad_norm": 32.25, "grad_norm_var": 13.85390625, "learning_rate": 0.0001, "loss": 7.3414, "loss/crossentropy": 2.0459727227687834, "loss/hidden": 3.376953125, "loss/jsd": 0.0, "loss/logits": 0.1919851318001747, "step": 15140 }, { "epoch": 0.505, "grad_norm": 30.125, "grad_norm_var": 3.419791666666667, "learning_rate": 0.0001, "loss": 7.2909, "loss/crossentropy": 2.063647876679897, "loss/hidden": 3.422265625, "loss/jsd": 0.0, "loss/logits": 0.1864373993128538, "step": 15150 }, { "epoch": 0.5053333333333333, "grad_norm": 28.125, "grad_norm_var": 3.122330729166667, "learning_rate": 0.0001, "loss": 7.2721, "loss/crossentropy": 2.0184415966272353, "loss/hidden": 3.45546875, "loss/jsd": 0.0, "loss/logits": 0.1886873163282871, "step": 15160 }, { "epoch": 0.5056666666666667, "grad_norm": 29.75, "grad_norm_var": 3.284309895833333, "learning_rate": 0.0001, "loss": 7.1857, "loss/crossentropy": 2.0687252331525086, "loss/hidden": 3.4390625, "loss/jsd": 0.0, "loss/logits": 0.19416886111721396, "step": 15170 }, { "epoch": 0.506, "grad_norm": 6912212992.0, "grad_norm_var": 2.986168002506566e+18, "learning_rate": 0.0001, "loss": 7.3252, "loss/crossentropy": 2.077074535191059, "loss/hidden": 3.32421875, "loss/jsd": 0.0, "loss/logits": 0.17499108193442225, "step": 15180 }, { "epoch": 0.5063333333333333, "grad_norm": 29.0, "grad_norm_var": 2.9861680018225454e+18, "learning_rate": 0.0001, "loss": 7.2512, "loss/crossentropy": 2.035786287486553, "loss/hidden": 3.30390625, "loss/jsd": 0.0, "loss/logits": 0.17119733337312937, "step": 15190 }, { "epoch": 0.5066666666666667, "grad_norm": 27.75, "grad_norm_var": 1.9238932291666666, "learning_rate": 0.0001, "loss": 7.2582, "loss/crossentropy": 2.045074371993542, "loss/hidden": 3.308984375, "loss/jsd": 0.0, "loss/logits": 0.17849700357764958, "step": 15200 }, { "epoch": 0.507, "grad_norm": 31.875, "grad_norm_var": 2.9010416666666665, "learning_rate": 0.0001, "loss": 7.2358, "loss/crossentropy": 2.039666683226824, "loss/hidden": 3.363671875, "loss/jsd": 0.0, "loss/logits": 0.1776250446215272, "step": 15210 }, { "epoch": 0.5073333333333333, "grad_norm": 34.75, "grad_norm_var": 11.348958333333334, "learning_rate": 0.0001, "loss": 7.3783, "loss/crossentropy": 2.0881707914173604, "loss/hidden": 3.346875, "loss/jsd": 0.0, "loss/logits": 0.1869637723080814, "step": 15220 }, { "epoch": 0.5076666666666667, "grad_norm": 30.125, "grad_norm_var": 5.097330729166667, "learning_rate": 0.0001, "loss": 7.2808, "loss/crossentropy": 1.8841201193630694, "loss/hidden": 3.48671875, "loss/jsd": 0.0, "loss/logits": 0.17460453044623137, "step": 15230 }, { "epoch": 0.508, "grad_norm": 30.25, "grad_norm_var": 5.2072265625, "learning_rate": 0.0001, "loss": 7.3502, "loss/crossentropy": 2.021357525140047, "loss/hidden": 3.4203125, "loss/jsd": 0.0, "loss/logits": 0.19254140760749577, "step": 15240 }, { "epoch": 0.5083333333333333, "grad_norm": 29.625, "grad_norm_var": 4.54375, "learning_rate": 0.0001, "loss": 7.3582, "loss/crossentropy": 2.184447726607323, "loss/hidden": 3.50859375, "loss/jsd": 0.0, "loss/logits": 0.20638411361724138, "step": 15250 }, { "epoch": 0.5086666666666667, "grad_norm": 30.5, "grad_norm_var": 2.0957682291666666, "learning_rate": 0.0001, "loss": 7.3238, "loss/crossentropy": 2.023256094753742, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.1798347791656852, "step": 15260 }, { "epoch": 0.509, "grad_norm": 30.75, "grad_norm_var": 1.6462890625, "learning_rate": 0.0001, "loss": 7.2913, "loss/crossentropy": 2.032896101474762, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.18790974766016005, "step": 15270 }, { "epoch": 0.5093333333333333, "grad_norm": 32.25, "grad_norm_var": 1.9166015625, "learning_rate": 0.0001, "loss": 7.3535, "loss/crossentropy": 2.0938129678368567, "loss/hidden": 3.495703125, "loss/jsd": 0.0, "loss/logits": 0.20121476016938686, "step": 15280 }, { "epoch": 0.5096666666666667, "grad_norm": 28.125, "grad_norm_var": 2.070572916666667, "learning_rate": 0.0001, "loss": 7.4863, "loss/crossentropy": 2.02777543887496, "loss/hidden": 3.353125, "loss/jsd": 0.0, "loss/logits": 0.18144775219261647, "step": 15290 }, { "epoch": 0.51, "grad_norm": 28.625, "grad_norm_var": 3.863997395833333, "learning_rate": 0.0001, "loss": 7.3333, "loss/crossentropy": 2.0419157058000565, "loss/hidden": 3.48984375, "loss/jsd": 0.0, "loss/logits": 0.19659815523773433, "step": 15300 }, { "epoch": 0.5103333333333333, "grad_norm": 28.125, "grad_norm_var": 3.6462890625, "learning_rate": 0.0001, "loss": 7.2508, "loss/crossentropy": 1.900747624784708, "loss/hidden": 3.419140625, "loss/jsd": 0.0, "loss/logits": 0.18975985096767545, "step": 15310 }, { "epoch": 0.5106666666666667, "grad_norm": 32.25, "grad_norm_var": 2.76015625, "learning_rate": 0.0001, "loss": 7.441, "loss/crossentropy": 1.9752637542784215, "loss/hidden": 3.50234375, "loss/jsd": 0.0, "loss/logits": 0.21422456055879593, "step": 15320 }, { "epoch": 0.511, "grad_norm": 32.0, "grad_norm_var": 7.16640625, "learning_rate": 0.0001, "loss": 7.2949, "loss/crossentropy": 1.8371703259646892, "loss/hidden": 3.51796875, "loss/jsd": 0.0, "loss/logits": 0.1878586201928556, "step": 15330 }, { "epoch": 0.5113333333333333, "grad_norm": 29.375, "grad_norm_var": 2.6853515625, "learning_rate": 0.0001, "loss": 7.3863, "loss/crossentropy": 2.193024069070816, "loss/hidden": 3.396484375, "loss/jsd": 0.0, "loss/logits": 0.20059159230440854, "step": 15340 }, { "epoch": 0.5116666666666667, "grad_norm": 29.0, "grad_norm_var": 2.522330729166667, "learning_rate": 0.0001, "loss": 7.2859, "loss/crossentropy": 1.852187267690897, "loss/hidden": 3.389453125, "loss/jsd": 0.0, "loss/logits": 0.1828073389828205, "step": 15350 }, { "epoch": 0.512, "grad_norm": 29.125, "grad_norm_var": 5.255989583333333, "learning_rate": 0.0001, "loss": 7.204, "loss/crossentropy": 1.75890439376235, "loss/hidden": 3.53984375, "loss/jsd": 0.0, "loss/logits": 0.19306068867444992, "step": 15360 }, { "epoch": 0.5123333333333333, "grad_norm": 32.5, "grad_norm_var": 3.5166015625, "learning_rate": 0.0001, "loss": 7.3449, "loss/crossentropy": 2.0067847207188607, "loss/hidden": 3.369921875, "loss/jsd": 0.0, "loss/logits": 0.18022690201178193, "step": 15370 }, { "epoch": 0.5126666666666667, "grad_norm": 29.5, "grad_norm_var": 2.803580729166667, "learning_rate": 0.0001, "loss": 7.2846, "loss/crossentropy": 1.9421979293227196, "loss/hidden": 3.353515625, "loss/jsd": 0.0, "loss/logits": 0.17875886093825102, "step": 15380 }, { "epoch": 0.513, "grad_norm": 28.875, "grad_norm_var": 2.3926432291666666, "learning_rate": 0.0001, "loss": 7.2595, "loss/crossentropy": 1.9621699415147305, "loss/hidden": 3.365625, "loss/jsd": 0.0, "loss/logits": 0.18496664315462114, "step": 15390 }, { "epoch": 0.5133333333333333, "grad_norm": 30.0, "grad_norm_var": 3.04140625, "learning_rate": 0.0001, "loss": 7.2837, "loss/crossentropy": 1.9308017980307342, "loss/hidden": 3.4125, "loss/jsd": 0.0, "loss/logits": 0.18070890130475165, "step": 15400 }, { "epoch": 0.5136666666666667, "grad_norm": 28.625, "grad_norm_var": 3.193489583333333, "learning_rate": 0.0001, "loss": 7.2838, "loss/crossentropy": 1.979032465815544, "loss/hidden": 3.365625, "loss/jsd": 0.0, "loss/logits": 0.1803389212116599, "step": 15410 }, { "epoch": 0.514, "grad_norm": 31.25, "grad_norm_var": 3.231184895833333, "learning_rate": 0.0001, "loss": 7.4329, "loss/crossentropy": 2.107999724149704, "loss/hidden": 3.493359375, "loss/jsd": 0.0, "loss/logits": 0.19837545417249203, "step": 15420 }, { "epoch": 0.5143333333333333, "grad_norm": 30.0, "grad_norm_var": 3.948958333333333, "learning_rate": 0.0001, "loss": 7.2881, "loss/crossentropy": 2.005809936672449, "loss/hidden": 3.334375, "loss/jsd": 0.0, "loss/logits": 0.1750430199317634, "step": 15430 }, { "epoch": 0.5146666666666667, "grad_norm": 28.5, "grad_norm_var": 2.441666666666667, "learning_rate": 0.0001, "loss": 7.3494, "loss/crossentropy": 2.0274360150098802, "loss/hidden": 3.41171875, "loss/jsd": 0.0, "loss/logits": 0.19108331259340047, "step": 15440 }, { "epoch": 0.515, "grad_norm": 29.375, "grad_norm_var": 147.0884765625, "learning_rate": 0.0001, "loss": 7.3261, "loss/crossentropy": 1.9835629053413868, "loss/hidden": 3.415625, "loss/jsd": 0.0, "loss/logits": 0.195416328497231, "step": 15450 }, { "epoch": 0.5153333333333333, "grad_norm": 27.25, "grad_norm_var": 2.3478515625, "learning_rate": 0.0001, "loss": 7.2624, "loss/crossentropy": 1.860911101102829, "loss/hidden": 3.442578125, "loss/jsd": 0.0, "loss/logits": 0.1809360329993069, "step": 15460 }, { "epoch": 0.5156666666666667, "grad_norm": 28.125, "grad_norm_var": 1.7202473958333333, "learning_rate": 0.0001, "loss": 7.292, "loss/crossentropy": 1.8568454101681708, "loss/hidden": 3.50625, "loss/jsd": 0.0, "loss/logits": 0.18934241253882647, "step": 15470 }, { "epoch": 0.516, "grad_norm": 41.75, "grad_norm_var": 30.945833333333333, "learning_rate": 0.0001, "loss": 7.2869, "loss/crossentropy": 1.8642399728298187, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.17804431319236755, "step": 15480 }, { "epoch": 0.5163333333333333, "grad_norm": 29.375, "grad_norm_var": 31.568684895833332, "learning_rate": 0.0001, "loss": 7.2002, "loss/crossentropy": 1.9845358788967133, "loss/hidden": 3.45234375, "loss/jsd": 0.0, "loss/logits": 0.1958889814093709, "step": 15490 }, { "epoch": 0.5166666666666667, "grad_norm": 29.75, "grad_norm_var": 4.387239583333334, "learning_rate": 0.0001, "loss": 7.3583, "loss/crossentropy": 1.8880870372056962, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.17626534216105938, "step": 15500 }, { "epoch": 0.517, "grad_norm": 27.0, "grad_norm_var": 3.4072265625, "learning_rate": 0.0001, "loss": 7.255, "loss/crossentropy": 2.0607613906264306, "loss/hidden": 3.378125, "loss/jsd": 0.0, "loss/logits": 0.18478121869266034, "step": 15510 }, { "epoch": 0.5173333333333333, "grad_norm": 27.375, "grad_norm_var": 16.258072916666666, "learning_rate": 0.0001, "loss": 7.0827, "loss/crossentropy": 1.9358821205794812, "loss/hidden": 3.415234375, "loss/jsd": 0.0, "loss/logits": 0.17726797852665185, "step": 15520 }, { "epoch": 0.5176666666666667, "grad_norm": 30.375, "grad_norm_var": 2.2822916666666666, "learning_rate": 0.0001, "loss": 7.3103, "loss/crossentropy": 1.6842635355889797, "loss/hidden": 3.35625, "loss/jsd": 0.0, "loss/logits": 0.16621644040569664, "step": 15530 }, { "epoch": 0.518, "grad_norm": 31.25, "grad_norm_var": 0.72890625, "learning_rate": 0.0001, "loss": 7.2956, "loss/crossentropy": 1.8851488582789897, "loss/hidden": 3.5296875, "loss/jsd": 0.0, "loss/logits": 0.18778872694820165, "step": 15540 }, { "epoch": 0.5183333333333333, "grad_norm": 28.0, "grad_norm_var": 12.558333333333334, "learning_rate": 0.0001, "loss": 7.2707, "loss/crossentropy": 1.9387175090610982, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.1825224945321679, "step": 15550 }, { "epoch": 0.5186666666666667, "grad_norm": 30.375, "grad_norm_var": 2.883072916666667, "learning_rate": 0.0001, "loss": 7.3642, "loss/crossentropy": 1.9614620946347714, "loss/hidden": 3.406640625, "loss/jsd": 0.0, "loss/logits": 0.19509832574985922, "step": 15560 }, { "epoch": 0.519, "grad_norm": 32.25, "grad_norm_var": 3.4905598958333335, "learning_rate": 0.0001, "loss": 7.2701, "loss/crossentropy": 2.2290361881256104, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.19939842727035284, "step": 15570 }, { "epoch": 0.5193333333333333, "grad_norm": 28.375, "grad_norm_var": 60.3869140625, "learning_rate": 0.0001, "loss": 7.3502, "loss/crossentropy": 2.0478545129299164, "loss/hidden": 3.52109375, "loss/jsd": 0.0, "loss/logits": 0.20770913884043693, "step": 15580 }, { "epoch": 0.5196666666666667, "grad_norm": 29.0, "grad_norm_var": 3.0962890625, "learning_rate": 0.0001, "loss": 7.3791, "loss/crossentropy": 2.1319813668727874, "loss/hidden": 3.424609375, "loss/jsd": 0.0, "loss/logits": 0.21386796981096268, "step": 15590 }, { "epoch": 0.52, "grad_norm": 28.375, "grad_norm_var": 2.5768229166666665, "learning_rate": 0.0001, "loss": 7.3218, "loss/crossentropy": 2.2474169731140137, "loss/hidden": 3.480078125, "loss/jsd": 0.0, "loss/logits": 0.20935069750994445, "step": 15600 }, { "epoch": 0.5203333333333333, "grad_norm": 28.75, "grad_norm_var": 2.905989583333333, "learning_rate": 0.0001, "loss": 7.2701, "loss/crossentropy": 1.8869876064360143, "loss/hidden": 3.385546875, "loss/jsd": 0.0, "loss/logits": 0.1681639501824975, "step": 15610 }, { "epoch": 0.5206666666666667, "grad_norm": 30.625, "grad_norm_var": 3.437239583333333, "learning_rate": 0.0001, "loss": 7.2415, "loss/crossentropy": 2.0793021127581595, "loss/hidden": 3.34140625, "loss/jsd": 0.0, "loss/logits": 0.19386467933654786, "step": 15620 }, { "epoch": 0.521, "grad_norm": 30.5, "grad_norm_var": 1.9843098958333334, "learning_rate": 0.0001, "loss": 7.4119, "loss/crossentropy": 2.015172242373228, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.1790133461356163, "step": 15630 }, { "epoch": 0.5213333333333333, "grad_norm": 27.75, "grad_norm_var": 1.1660807291666666, "learning_rate": 0.0001, "loss": 7.1894, "loss/crossentropy": 2.029386245831847, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.20761945452541114, "step": 15640 }, { "epoch": 0.5216666666666666, "grad_norm": 27.625, "grad_norm_var": 1.6559895833333333, "learning_rate": 0.0001, "loss": 7.2356, "loss/crossentropy": 1.8822843819856643, "loss/hidden": 3.387109375, "loss/jsd": 0.0, "loss/logits": 0.1762984933331609, "step": 15650 }, { "epoch": 0.522, "grad_norm": 28.375, "grad_norm_var": 1.3692057291666666, "learning_rate": 0.0001, "loss": 7.3525, "loss/crossentropy": 1.7902206793427466, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.17913363594561815, "step": 15660 }, { "epoch": 0.5223333333333333, "grad_norm": 28.875, "grad_norm_var": 2.341080729166667, "learning_rate": 0.0001, "loss": 7.2652, "loss/crossentropy": 2.1718143448233604, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.20259114131331443, "step": 15670 }, { "epoch": 0.5226666666666666, "grad_norm": 29.5, "grad_norm_var": 39.79837239583333, "learning_rate": 0.0001, "loss": 7.2664, "loss/crossentropy": 1.9856457620859147, "loss/hidden": 3.4625, "loss/jsd": 0.0, "loss/logits": 0.19220476876944304, "step": 15680 }, { "epoch": 0.523, "grad_norm": 37.75, "grad_norm_var": 42.53854166666667, "learning_rate": 0.0001, "loss": 7.2429, "loss/crossentropy": 1.889240536093712, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.18168366001918912, "step": 15690 }, { "epoch": 0.5233333333333333, "grad_norm": 25.25, "grad_norm_var": 13.817643229166666, "learning_rate": 0.0001, "loss": 7.081, "loss/crossentropy": 1.974376516789198, "loss/hidden": 3.33203125, "loss/jsd": 0.0, "loss/logits": 0.16669896487146615, "step": 15700 }, { "epoch": 0.5236666666666666, "grad_norm": 29.0, "grad_norm_var": 17.565625, "learning_rate": 0.0001, "loss": 7.3469, "loss/crossentropy": 1.9174628980457782, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.18095233985222875, "step": 15710 }, { "epoch": 0.524, "grad_norm": 36.5, "grad_norm_var": 12.3744140625, "learning_rate": 0.0001, "loss": 7.4004, "loss/crossentropy": 1.848051906377077, "loss/hidden": 3.415625, "loss/jsd": 0.0, "loss/logits": 0.17864528410136699, "step": 15720 }, { "epoch": 0.5243333333333333, "grad_norm": 30.125, "grad_norm_var": 6.037239583333333, "learning_rate": 0.0001, "loss": 7.3512, "loss/crossentropy": 1.9586548075079917, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.1987813390791416, "step": 15730 }, { "epoch": 0.5246666666666666, "grad_norm": 30.125, "grad_norm_var": 2.111458333333333, "learning_rate": 0.0001, "loss": 7.2689, "loss/crossentropy": 1.8848862916231155, "loss/hidden": 3.414453125, "loss/jsd": 0.0, "loss/logits": 0.1825363788753748, "step": 15740 }, { "epoch": 0.525, "grad_norm": 42.75, "grad_norm_var": 2.9861680004041016e+18, "learning_rate": 0.0001, "loss": 7.3173, "loss/crossentropy": 1.892388579249382, "loss/hidden": 3.465625, "loss/jsd": 0.0, "loss/logits": 0.18400060348212718, "step": 15750 }, { "epoch": 0.5253333333333333, "grad_norm": 29.75, "grad_norm_var": 2.9861679955583524e+18, "learning_rate": 0.0001, "loss": 7.3208, "loss/crossentropy": 2.008869943767786, "loss/hidden": 3.3171875, "loss/jsd": 0.0, "loss/logits": 0.17707110848277807, "step": 15760 }, { "epoch": 0.5256666666666666, "grad_norm": 33.5, "grad_norm_var": 127.91666666666667, "learning_rate": 0.0001, "loss": 7.267, "loss/crossentropy": 1.8148567341268063, "loss/hidden": 3.455078125, "loss/jsd": 0.0, "loss/logits": 0.182215722464025, "step": 15770 }, { "epoch": 0.526, "grad_norm": 27.625, "grad_norm_var": 6.817643229166666, "learning_rate": 0.0001, "loss": 7.197, "loss/crossentropy": 1.9587459720671176, "loss/hidden": 3.38046875, "loss/jsd": 0.0, "loss/logits": 0.17491123918443918, "step": 15780 }, { "epoch": 0.5263333333333333, "grad_norm": 28.5, "grad_norm_var": 2.9791015625, "learning_rate": 0.0001, "loss": 7.1918, "loss/crossentropy": 1.8334319584071637, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.1813056123908609, "step": 15790 }, { "epoch": 0.5266666666666666, "grad_norm": 29.875, "grad_norm_var": 2.5337890625, "learning_rate": 0.0001, "loss": 7.1142, "loss/crossentropy": 1.8174663729965688, "loss/hidden": 3.317578125, "loss/jsd": 0.0, "loss/logits": 0.16524186152964832, "step": 15800 }, { "epoch": 0.527, "grad_norm": 29.875, "grad_norm_var": 3.4604166666666667, "learning_rate": 0.0001, "loss": 7.2408, "loss/crossentropy": 2.009811994433403, "loss/hidden": 3.324609375, "loss/jsd": 0.0, "loss/logits": 0.17376181483268738, "step": 15810 }, { "epoch": 0.5273333333333333, "grad_norm": 29.75, "grad_norm_var": 4.412239583333333, "learning_rate": 0.0001, "loss": 7.2197, "loss/crossentropy": 2.012850250303745, "loss/hidden": 3.417578125, "loss/jsd": 0.0, "loss/logits": 0.1919804196804762, "step": 15820 }, { "epoch": 0.5276666666666666, "grad_norm": 28.625, "grad_norm_var": 3.4785807291666666, "learning_rate": 0.0001, "loss": 7.2357, "loss/crossentropy": 1.9697308398783206, "loss/hidden": 3.364453125, "loss/jsd": 0.0, "loss/logits": 0.1740401660092175, "step": 15830 }, { "epoch": 0.528, "grad_norm": 29.0, "grad_norm_var": 7.133072916666666, "learning_rate": 0.0001, "loss": 7.2711, "loss/crossentropy": 2.0772396624088287, "loss/hidden": 3.41484375, "loss/jsd": 0.0, "loss/logits": 0.19701300971210003, "step": 15840 }, { "epoch": 0.5283333333333333, "grad_norm": 31.125, "grad_norm_var": 15.258333333333333, "learning_rate": 0.0001, "loss": 7.2718, "loss/crossentropy": 1.8829277858138085, "loss/hidden": 3.491796875, "loss/jsd": 0.0, "loss/logits": 0.1870917933061719, "step": 15850 }, { "epoch": 0.5286666666666666, "grad_norm": 29.375, "grad_norm_var": 10.5353515625, "learning_rate": 0.0001, "loss": 7.1814, "loss/crossentropy": 1.9729617074131967, "loss/hidden": 3.359765625, "loss/jsd": 0.0, "loss/logits": 0.17556989770382642, "step": 15860 }, { "epoch": 0.529, "grad_norm": 32.75, "grad_norm_var": 9.6306640625, "learning_rate": 0.0001, "loss": 7.1859, "loss/crossentropy": 1.9059140399098395, "loss/hidden": 3.40859375, "loss/jsd": 0.0, "loss/logits": 0.18621025402098895, "step": 15870 }, { "epoch": 0.5293333333333333, "grad_norm": 28.0, "grad_norm_var": 6.521809895833333, "learning_rate": 0.0001, "loss": 7.2255, "loss/crossentropy": 2.0615137211978434, "loss/hidden": 3.43125, "loss/jsd": 0.0, "loss/logits": 0.1898872533813119, "step": 15880 }, { "epoch": 0.5296666666666666, "grad_norm": 30.75, "grad_norm_var": 2.664322916666667, "learning_rate": 0.0001, "loss": 7.3232, "loss/crossentropy": 1.9449198625981807, "loss/hidden": 3.39453125, "loss/jsd": 0.0, "loss/logits": 0.18167959284037352, "step": 15890 }, { "epoch": 0.53, "grad_norm": 29.125, "grad_norm_var": 3.4447265625, "learning_rate": 0.0001, "loss": 7.2626, "loss/crossentropy": 2.029908051341772, "loss/hidden": 3.315625, "loss/jsd": 0.0, "loss/logits": 0.1783326559700072, "step": 15900 }, { "epoch": 0.5303333333333333, "grad_norm": 38.75, "grad_norm_var": 8.472916666666666, "learning_rate": 0.0001, "loss": 7.223, "loss/crossentropy": 1.894991747289896, "loss/hidden": 3.540625, "loss/jsd": 0.0, "loss/logits": 0.19357452634721994, "step": 15910 }, { "epoch": 0.5306666666666666, "grad_norm": 29.375, "grad_norm_var": 11.3134765625, "learning_rate": 0.0001, "loss": 7.3312, "loss/crossentropy": 2.005254841595888, "loss/hidden": 3.421484375, "loss/jsd": 0.0, "loss/logits": 0.18138317447155713, "step": 15920 }, { "epoch": 0.531, "grad_norm": 29.625, "grad_norm_var": 7.569205729166667, "learning_rate": 0.0001, "loss": 7.2904, "loss/crossentropy": 1.9167429074645042, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.1959140334278345, "step": 15930 }, { "epoch": 0.5313333333333333, "grad_norm": 28.25, "grad_norm_var": 6.427083333333333, "learning_rate": 0.0001, "loss": 7.1713, "loss/crossentropy": 1.889963824301958, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.17270949203521013, "step": 15940 }, { "epoch": 0.5316666666666666, "grad_norm": 32.0, "grad_norm_var": 9.0775390625, "learning_rate": 0.0001, "loss": 7.312, "loss/crossentropy": 1.8937507048249245, "loss/hidden": 3.4125, "loss/jsd": 0.0, "loss/logits": 0.18164633121341467, "step": 15950 }, { "epoch": 0.532, "grad_norm": 31.25, "grad_norm_var": 26.280989583333334, "learning_rate": 0.0001, "loss": 7.2492, "loss/crossentropy": 1.962348347902298, "loss/hidden": 3.32265625, "loss/jsd": 0.0, "loss/logits": 0.18270321134477854, "step": 15960 }, { "epoch": 0.5323333333333333, "grad_norm": 29.875, "grad_norm_var": 20.961458333333333, "learning_rate": 0.0001, "loss": 7.1777, "loss/crossentropy": 1.9121162265539169, "loss/hidden": 3.3734375, "loss/jsd": 0.0, "loss/logits": 0.17723998394794763, "step": 15970 }, { "epoch": 0.5326666666666666, "grad_norm": 30.5, "grad_norm_var": 3.490664971409014e+18, "learning_rate": 0.0001, "loss": 7.3381, "loss/crossentropy": 1.9643965989351273, "loss/hidden": 3.405859375, "loss/jsd": 0.0, "loss/logits": 0.18122702687978745, "step": 15980 }, { "epoch": 0.533, "grad_norm": 28.0, "grad_norm_var": 2.2546849082841761e+18, "learning_rate": 0.0001, "loss": 7.2217, "loss/crossentropy": 2.03294402807951, "loss/hidden": 3.308984375, "loss/jsd": 0.0, "loss/logits": 0.17810682356357574, "step": 15990 }, { "epoch": 0.5333333333333333, "grad_norm": 29.625, "grad_norm_var": 47.76015625, "learning_rate": 0.0001, "loss": 7.2043, "loss/crossentropy": 2.006417116522789, "loss/hidden": 3.30234375, "loss/jsd": 0.0, "loss/logits": 0.1732567984610796, "step": 16000 }, { "epoch": 0.5336666666666666, "grad_norm": 32.5, "grad_norm_var": 15.559830729166666, "learning_rate": 0.0001, "loss": 7.2013, "loss/crossentropy": 1.7949382483959198, "loss/hidden": 3.443359375, "loss/jsd": 0.0, "loss/logits": 0.1753213692456484, "step": 16010 }, { "epoch": 0.534, "grad_norm": 29.375, "grad_norm_var": 16.995768229166668, "learning_rate": 0.0001, "loss": 7.288, "loss/crossentropy": 2.0027854360640047, "loss/hidden": 3.54453125, "loss/jsd": 0.0, "loss/logits": 0.22009344287216664, "step": 16020 }, { "epoch": 0.5343333333333333, "grad_norm": 28.125, "grad_norm_var": 20.637434895833334, "learning_rate": 0.0001, "loss": 7.1829, "loss/crossentropy": 1.938959789276123, "loss/hidden": 3.2375, "loss/jsd": 0.0, "loss/logits": 0.16864250786602497, "step": 16030 }, { "epoch": 0.5346666666666666, "grad_norm": 27.875, "grad_norm_var": 1.7789922002871519e+18, "learning_rate": 0.0001, "loss": 7.2872, "loss/crossentropy": 1.8849016308784485, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.19154972955584526, "step": 16040 }, { "epoch": 0.535, "grad_norm": 30.875, "grad_norm_var": 1.7789921999148024e+18, "learning_rate": 0.0001, "loss": 7.1288, "loss/crossentropy": 1.9222312852740289, "loss/hidden": 3.476953125, "loss/jsd": 0.0, "loss/logits": 0.19766846476122737, "step": 16050 }, { "epoch": 0.5353333333333333, "grad_norm": 28.25, "grad_norm_var": 16.878580729166668, "learning_rate": 0.0001, "loss": 7.1771, "loss/crossentropy": 1.973449818789959, "loss/hidden": 3.444921875, "loss/jsd": 0.0, "loss/logits": 0.18641741164028644, "step": 16060 }, { "epoch": 0.5356666666666666, "grad_norm": 30.125, "grad_norm_var": 8.946809895833333, "learning_rate": 0.0001, "loss": 7.2912, "loss/crossentropy": 2.0171415746212005, "loss/hidden": 3.44921875, "loss/jsd": 0.0, "loss/logits": 0.1957352278754115, "step": 16070 }, { "epoch": 0.536, "grad_norm": 38.25, "grad_norm_var": 2.4607245908016717e+18, "learning_rate": 0.0001, "loss": 7.2259, "loss/crossentropy": 1.8520954191684722, "loss/hidden": 3.39765625, "loss/jsd": 0.0, "loss/logits": 0.18087810203433036, "step": 16080 }, { "epoch": 0.5363333333333333, "grad_norm": 29.25, "grad_norm_var": 2.460724590710166e+18, "learning_rate": 0.0001, "loss": 7.3035, "loss/crossentropy": 2.116926434636116, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.2029705887660384, "step": 16090 }, { "epoch": 0.5366666666666666, "grad_norm": 34.25, "grad_norm_var": 24.480989583333333, "learning_rate": 0.0001, "loss": 7.2669, "loss/crossentropy": 2.1418012261390684, "loss/hidden": 3.399609375, "loss/jsd": 0.0, "loss/logits": 0.18830186761915685, "step": 16100 }, { "epoch": 0.537, "grad_norm": 29.125, "grad_norm_var": 29.923372395833333, "learning_rate": 0.0001, "loss": 7.2623, "loss/crossentropy": 2.0094053827226164, "loss/hidden": 3.3640625, "loss/jsd": 0.0, "loss/logits": 0.18487443430349232, "step": 16110 }, { "epoch": 0.5373333333333333, "grad_norm": 28.75, "grad_norm_var": 15.712239583333334, "learning_rate": 0.0001, "loss": 7.2743, "loss/crossentropy": 1.9135031297802925, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.18463148903101684, "step": 16120 }, { "epoch": 0.5376666666666666, "grad_norm": 28.125, "grad_norm_var": 7.37890625, "learning_rate": 0.0001, "loss": 7.1926, "loss/crossentropy": 1.9881214782595635, "loss/hidden": 3.481640625, "loss/jsd": 0.0, "loss/logits": 0.20244781635701656, "step": 16130 }, { "epoch": 0.538, "grad_norm": 38.25, "grad_norm_var": 20.937239583333334, "learning_rate": 0.0001, "loss": 7.1835, "loss/crossentropy": 1.810898581892252, "loss/hidden": 3.427734375, "loss/jsd": 0.0, "loss/logits": 0.18261138200759888, "step": 16140 }, { "epoch": 0.5383333333333333, "grad_norm": 27.25, "grad_norm_var": 9.263541666666667, "learning_rate": 0.0001, "loss": 7.2461, "loss/crossentropy": 1.9501622930169105, "loss/hidden": 3.405859375, "loss/jsd": 0.0, "loss/logits": 0.19361708909273148, "step": 16150 }, { "epoch": 0.5386666666666666, "grad_norm": 28.75, "grad_norm_var": 2.7343098958333334, "learning_rate": 0.0001, "loss": 7.2853, "loss/crossentropy": 1.9921826750040055, "loss/hidden": 3.45078125, "loss/jsd": 0.0, "loss/logits": 0.18805954568088054, "step": 16160 }, { "epoch": 0.539, "grad_norm": 31.625, "grad_norm_var": 2.0817889050633436e+18, "learning_rate": 0.0001, "loss": 7.2127, "loss/crossentropy": 1.9893563330173492, "loss/hidden": 3.372265625, "loss/jsd": 0.0, "loss/logits": 0.18842688668519258, "step": 16170 }, { "epoch": 0.5393333333333333, "grad_norm": 29.875, "grad_norm_var": 2.0817889035663967e+18, "learning_rate": 0.0001, "loss": 7.3896, "loss/crossentropy": 2.1408085778355597, "loss/hidden": 3.448046875, "loss/jsd": 0.0, "loss/logits": 0.20866199284791948, "step": 16180 }, { "epoch": 0.5396666666666666, "grad_norm": 33.0, "grad_norm_var": 2.4344770474032497e+18, "learning_rate": 0.0001, "loss": 7.1779, "loss/crossentropy": 2.0065360769629477, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.1817771451547742, "step": 16190 }, { "epoch": 0.54, "grad_norm": 29.5, "grad_norm_var": 4.083864095494697e+18, "learning_rate": 0.0001, "loss": 7.1862, "loss/crossentropy": 1.9171742260456086, "loss/hidden": 3.61640625, "loss/jsd": 0.0, "loss/logits": 0.18139733877032996, "step": 16200 }, { "epoch": 0.5403333333333333, "grad_norm": 29.125, "grad_norm_var": 8.357747395833334, "learning_rate": 0.0001, "loss": 7.1592, "loss/crossentropy": 2.0798111110925674, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.20720337936654687, "step": 16210 }, { "epoch": 0.5406666666666666, "grad_norm": 29.5, "grad_norm_var": 1.590625, "learning_rate": 0.0001, "loss": 7.2744, "loss/crossentropy": 2.040429861843586, "loss/hidden": 3.412109375, "loss/jsd": 0.0, "loss/logits": 0.18505549654364586, "step": 16220 }, { "epoch": 0.541, "grad_norm": 29.125, "grad_norm_var": 3.967643229166667, "learning_rate": 0.0001, "loss": 7.248, "loss/crossentropy": 1.9645962409675122, "loss/hidden": 3.437890625, "loss/jsd": 0.0, "loss/logits": 0.19012693529948593, "step": 16230 }, { "epoch": 0.5413333333333333, "grad_norm": 26.125, "grad_norm_var": 5.912955729166667, "learning_rate": 0.0001, "loss": 7.2399, "loss/crossentropy": 1.9644932925701142, "loss/hidden": 3.35078125, "loss/jsd": 0.0, "loss/logits": 0.17754013873636723, "step": 16240 }, { "epoch": 0.5416666666666666, "grad_norm": 28.125, "grad_norm_var": 7.765559895833333, "learning_rate": 0.0001, "loss": 7.1287, "loss/crossentropy": 1.7832012966275215, "loss/hidden": 3.430859375, "loss/jsd": 0.0, "loss/logits": 0.17449556589126586, "step": 16250 }, { "epoch": 0.542, "grad_norm": 28.125, "grad_norm_var": 2.1372395833333333, "learning_rate": 0.0001, "loss": 7.3698, "loss/crossentropy": 2.0634719483554362, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.19120339024811983, "step": 16260 }, { "epoch": 0.5423333333333333, "grad_norm": 26.125, "grad_norm_var": 7.352018229166666, "learning_rate": 0.0001, "loss": 7.3162, "loss/crossentropy": 2.1420525506138803, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.1886801295913756, "step": 16270 }, { "epoch": 0.5426666666666666, "grad_norm": 29.875, "grad_norm_var": 7.409830729166667, "learning_rate": 0.0001, "loss": 7.175, "loss/crossentropy": 2.002311950176954, "loss/hidden": 3.48671875, "loss/jsd": 0.0, "loss/logits": 0.19314789595082402, "step": 16280 }, { "epoch": 0.543, "grad_norm": 28.375, "grad_norm_var": 1.4535807291666667, "learning_rate": 0.0001, "loss": 7.2563, "loss/crossentropy": 2.0596153318881987, "loss/hidden": 3.395703125, "loss/jsd": 0.0, "loss/logits": 0.1911829488351941, "step": 16290 }, { "epoch": 0.5433333333333333, "grad_norm": 31.5, "grad_norm_var": 2.984375, "learning_rate": 0.0001, "loss": 7.2716, "loss/crossentropy": 2.0918624766170977, "loss/hidden": 3.36015625, "loss/jsd": 0.0, "loss/logits": 0.19311914443969727, "step": 16300 }, { "epoch": 0.5436666666666666, "grad_norm": 27.625, "grad_norm_var": 4.547330729166666, "learning_rate": 0.0001, "loss": 7.1056, "loss/crossentropy": 1.9934461861848831, "loss/hidden": 3.48828125, "loss/jsd": 0.0, "loss/logits": 0.19652517102658748, "step": 16310 }, { "epoch": 0.544, "grad_norm": 29.125, "grad_norm_var": 3.3228515625, "learning_rate": 0.0001, "loss": 7.245, "loss/crossentropy": 2.1007860742509363, "loss/hidden": 3.372265625, "loss/jsd": 0.0, "loss/logits": 0.18299960833974183, "step": 16320 }, { "epoch": 0.5443333333333333, "grad_norm": 27.625, "grad_norm_var": 4.3291015625, "learning_rate": 0.0001, "loss": 7.2893, "loss/crossentropy": 1.9577517256140708, "loss/hidden": 3.50390625, "loss/jsd": 0.0, "loss/logits": 0.19058505576103926, "step": 16330 }, { "epoch": 0.5446666666666666, "grad_norm": 30.125, "grad_norm_var": 9.969205729166667, "learning_rate": 0.0001, "loss": 7.3128, "loss/crossentropy": 1.868574035167694, "loss/hidden": 3.504296875, "loss/jsd": 0.0, "loss/logits": 0.18325023129582405, "step": 16340 }, { "epoch": 0.545, "grad_norm": 28.5, "grad_norm_var": 0.8442057291666667, "learning_rate": 0.0001, "loss": 7.2573, "loss/crossentropy": 1.9558767974376678, "loss/hidden": 3.357421875, "loss/jsd": 0.0, "loss/logits": 0.19122960995882748, "step": 16350 }, { "epoch": 0.5453333333333333, "grad_norm": 33.25, "grad_norm_var": 2.0530598958333335, "learning_rate": 0.0001, "loss": 7.2688, "loss/crossentropy": 1.9328763157129287, "loss/hidden": 3.46328125, "loss/jsd": 0.0, "loss/logits": 0.19347258768975734, "step": 16360 }, { "epoch": 0.5456666666666666, "grad_norm": 27.5, "grad_norm_var": 4.6978515625, "learning_rate": 0.0001, "loss": 7.2044, "loss/crossentropy": 1.8707745537161826, "loss/hidden": 3.4859375, "loss/jsd": 0.0, "loss/logits": 0.18527084738016128, "step": 16370 }, { "epoch": 0.546, "grad_norm": 28.375, "grad_norm_var": 3.8676432291666667, "learning_rate": 0.0001, "loss": 7.2894, "loss/crossentropy": 1.8961201742291451, "loss/hidden": 3.475390625, "loss/jsd": 0.0, "loss/logits": 0.18220544941723346, "step": 16380 }, { "epoch": 0.5463333333333333, "grad_norm": 30.375, "grad_norm_var": 1.3817057291666666, "learning_rate": 0.0001, "loss": 7.2754, "loss/crossentropy": 2.0292367458343508, "loss/hidden": 3.378515625, "loss/jsd": 0.0, "loss/logits": 0.1862015763297677, "step": 16390 }, { "epoch": 0.5466666666666666, "grad_norm": 29.625, "grad_norm_var": 3.04443332087583e+18, "learning_rate": 0.0001, "loss": 7.2521, "loss/crossentropy": 2.0008056700229644, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.1873819250613451, "step": 16400 }, { "epoch": 0.547, "grad_norm": 27.5, "grad_norm_var": 21.0744140625, "learning_rate": 0.0001, "loss": 7.2123, "loss/crossentropy": 2.0601282373070715, "loss/hidden": 3.43984375, "loss/jsd": 0.0, "loss/logits": 0.1910658609122038, "step": 16410 }, { "epoch": 0.5473333333333333, "grad_norm": 28.5, "grad_norm_var": 2.3643229166666666, "learning_rate": 0.0001, "loss": 7.2808, "loss/crossentropy": 1.9178543552756309, "loss/hidden": 3.5359375, "loss/jsd": 0.0, "loss/logits": 0.18812899701297284, "step": 16420 }, { "epoch": 0.5476666666666666, "grad_norm": 29.375, "grad_norm_var": 6.2650390625, "learning_rate": 0.0001, "loss": 7.1838, "loss/crossentropy": 2.0298706971108915, "loss/hidden": 3.36328125, "loss/jsd": 0.0, "loss/logits": 0.18241058252751827, "step": 16430 }, { "epoch": 0.548, "grad_norm": 31.875, "grad_norm_var": 15.2447265625, "learning_rate": 0.0001, "loss": 7.3378, "loss/crossentropy": 2.1674243420362473, "loss/hidden": 3.41484375, "loss/jsd": 0.0, "loss/logits": 0.19292237926274539, "step": 16440 }, { "epoch": 0.5483333333333333, "grad_norm": 30.75, "grad_norm_var": 20.005208333333332, "learning_rate": 0.0001, "loss": 7.2833, "loss/crossentropy": 1.8332827553153037, "loss/hidden": 3.37578125, "loss/jsd": 0.0, "loss/logits": 0.17381727732717991, "step": 16450 }, { "epoch": 0.5486666666666666, "grad_norm": 37.5, "grad_norm_var": 6.053580729166667, "learning_rate": 0.0001, "loss": 7.3236, "loss/crossentropy": 2.009462334215641, "loss/hidden": 3.46015625, "loss/jsd": 0.0, "loss/logits": 0.18866737317293883, "step": 16460 }, { "epoch": 0.549, "grad_norm": 30.0, "grad_norm_var": 189.65390625, "learning_rate": 0.0001, "loss": 7.3108, "loss/crossentropy": 2.00550163090229, "loss/hidden": 3.36015625, "loss/jsd": 0.0, "loss/logits": 0.1886333802714944, "step": 16470 }, { "epoch": 0.5493333333333333, "grad_norm": 30.5, "grad_norm_var": 185.39270833333333, "learning_rate": 0.0001, "loss": 7.4025, "loss/crossentropy": 1.921194638311863, "loss/hidden": 3.50703125, "loss/jsd": 0.0, "loss/logits": 0.19174772277474403, "step": 16480 }, { "epoch": 0.5496666666666666, "grad_norm": 29.5, "grad_norm_var": 1.4254557291666667, "learning_rate": 0.0001, "loss": 7.3165, "loss/crossentropy": 1.9243738889694213, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.19599060341715813, "step": 16490 }, { "epoch": 0.55, "grad_norm": 30.875, "grad_norm_var": 1.75, "learning_rate": 0.0001, "loss": 7.2834, "loss/crossentropy": 1.9776796583086251, "loss/hidden": 3.330859375, "loss/jsd": 0.0, "loss/logits": 0.17723584985360502, "step": 16500 }, { "epoch": 0.5503333333333333, "grad_norm": 31.5, "grad_norm_var": 1.9514973958333333, "learning_rate": 0.0001, "loss": 7.2342, "loss/crossentropy": 1.995065599679947, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.18019242864102125, "step": 16510 }, { "epoch": 0.5506666666666666, "grad_norm": 26.375, "grad_norm_var": 3.379166666666667, "learning_rate": 0.0001, "loss": 7.2974, "loss/crossentropy": 1.8949688427150249, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.18723348630592226, "step": 16520 }, { "epoch": 0.551, "grad_norm": 25.25, "grad_norm_var": 5.0119140625, "learning_rate": 0.0001, "loss": 7.0659, "loss/crossentropy": 2.0664942413568497, "loss/hidden": 3.480078125, "loss/jsd": 0.0, "loss/logits": 0.1888657508417964, "step": 16530 }, { "epoch": 0.5513333333333333, "grad_norm": 30.125, "grad_norm_var": 3.3135937666627405e+18, "learning_rate": 0.0001, "loss": 7.3024, "loss/crossentropy": 2.0736934199929236, "loss/hidden": 3.637109375, "loss/jsd": 0.0, "loss/logits": 0.18541921228170394, "step": 16540 }, { "epoch": 0.5516666666666666, "grad_norm": 30.875, "grad_norm_var": 1.3900390625, "learning_rate": 0.0001, "loss": 7.3501, "loss/crossentropy": 1.8276831768453121, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.17508197128772734, "step": 16550 }, { "epoch": 0.552, "grad_norm": 28.125, "grad_norm_var": 5.75, "learning_rate": 0.0001, "loss": 7.2229, "loss/crossentropy": 1.839534468203783, "loss/hidden": 3.4328125, "loss/jsd": 0.0, "loss/logits": 0.17636930653825403, "step": 16560 }, { "epoch": 0.5523333333333333, "grad_norm": 28.125, "grad_norm_var": 4.422916666666667, "learning_rate": 0.0001, "loss": 7.3421, "loss/crossentropy": 2.0372296035289765, "loss/hidden": 3.4359375, "loss/jsd": 0.0, "loss/logits": 0.20839525777846574, "step": 16570 }, { "epoch": 0.5526666666666666, "grad_norm": 27.375, "grad_norm_var": 2.6978515625, "learning_rate": 0.0001, "loss": 7.2214, "loss/crossentropy": 2.151124620437622, "loss/hidden": 3.36015625, "loss/jsd": 0.0, "loss/logits": 0.18638003729283809, "step": 16580 }, { "epoch": 0.553, "grad_norm": 29.125, "grad_norm_var": 1.6375, "learning_rate": 0.0001, "loss": 7.2037, "loss/crossentropy": 1.996071733534336, "loss/hidden": 3.38828125, "loss/jsd": 0.0, "loss/logits": 0.18973351493477822, "step": 16590 }, { "epoch": 0.5533333333333333, "grad_norm": 27.125, "grad_norm_var": 3.819205729166667, "learning_rate": 0.0001, "loss": 7.3483, "loss/crossentropy": 2.0941787257790567, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.19264436662197112, "step": 16600 }, { "epoch": 0.5536666666666666, "grad_norm": 31.375, "grad_norm_var": 5.737239583333333, "learning_rate": 0.0001, "loss": 7.2613, "loss/crossentropy": 2.0525165393948557, "loss/hidden": 3.43515625, "loss/jsd": 0.0, "loss/logits": 0.1884797804057598, "step": 16610 }, { "epoch": 0.554, "grad_norm": 30.0, "grad_norm_var": 6.287434895833333, "learning_rate": 0.0001, "loss": 7.3643, "loss/crossentropy": 2.038564084470272, "loss/hidden": 3.43671875, "loss/jsd": 0.0, "loss/logits": 0.1915274541825056, "step": 16620 }, { "epoch": 0.5543333333333333, "grad_norm": 29.5, "grad_norm_var": 1.81015625, "learning_rate": 0.0001, "loss": 7.4339, "loss/crossentropy": 2.0948300682008267, "loss/hidden": 3.409765625, "loss/jsd": 0.0, "loss/logits": 0.2008287174627185, "step": 16630 }, { "epoch": 0.5546666666666666, "grad_norm": 30.75, "grad_norm_var": 1.5747395833333333, "learning_rate": 0.0001, "loss": 7.278, "loss/crossentropy": 2.1151763960719108, "loss/hidden": 3.396875, "loss/jsd": 0.0, "loss/logits": 0.19994504395872353, "step": 16640 }, { "epoch": 0.555, "grad_norm": 28.125, "grad_norm_var": 1.2978515625, "learning_rate": 0.0001, "loss": 7.3108, "loss/crossentropy": 1.954684516787529, "loss/hidden": 3.490625, "loss/jsd": 0.0, "loss/logits": 0.1811052923090756, "step": 16650 }, { "epoch": 0.5553333333333333, "grad_norm": 31.5, "grad_norm_var": 2.6181640625, "learning_rate": 0.0001, "loss": 7.3313, "loss/crossentropy": 2.1238763615489007, "loss/hidden": 3.471875, "loss/jsd": 0.0, "loss/logits": 0.19945564046502112, "step": 16660 }, { "epoch": 0.5556666666666666, "grad_norm": 29.25, "grad_norm_var": 2.910872395833333, "learning_rate": 0.0001, "loss": 7.2585, "loss/crossentropy": 2.0093641102313997, "loss/hidden": 3.439453125, "loss/jsd": 0.0, "loss/logits": 0.18970496114343405, "step": 16670 }, { "epoch": 0.556, "grad_norm": 30.25, "grad_norm_var": 3.677018229166667, "learning_rate": 0.0001, "loss": 7.4061, "loss/crossentropy": 1.9827346831560135, "loss/hidden": 3.493359375, "loss/jsd": 0.0, "loss/logits": 0.20011012069880962, "step": 16680 }, { "epoch": 0.5563333333333333, "grad_norm": 30.375, "grad_norm_var": 2.246875, "learning_rate": 0.0001, "loss": 7.2568, "loss/crossentropy": 1.9675356656312943, "loss/hidden": 3.362109375, "loss/jsd": 0.0, "loss/logits": 0.1839764393866062, "step": 16690 }, { "epoch": 0.5566666666666666, "grad_norm": 26.875, "grad_norm_var": 4.296809895833333, "learning_rate": 0.0001, "loss": 7.2685, "loss/crossentropy": 1.9053036078810692, "loss/hidden": 3.34921875, "loss/jsd": 0.0, "loss/logits": 0.17970749661326407, "step": 16700 }, { "epoch": 0.557, "grad_norm": 30.875, "grad_norm_var": 38.437434895833334, "learning_rate": 0.0001, "loss": 7.2323, "loss/crossentropy": 1.967021069675684, "loss/hidden": 3.29921875, "loss/jsd": 0.0, "loss/logits": 0.17642341097816824, "step": 16710 }, { "epoch": 0.5573333333333333, "grad_norm": 29.625, "grad_norm_var": 14.90390625, "learning_rate": 0.0001, "loss": 7.2017, "loss/crossentropy": 1.9356500208377838, "loss/hidden": 3.39375, "loss/jsd": 0.0, "loss/logits": 0.17959797563962637, "step": 16720 }, { "epoch": 0.5576666666666666, "grad_norm": 27.75, "grad_norm_var": 6.598958333333333, "learning_rate": 0.0001, "loss": 7.2486, "loss/crossentropy": 1.9907978735864162, "loss/hidden": 3.354296875, "loss/jsd": 0.0, "loss/logits": 0.18325813449919223, "step": 16730 }, { "epoch": 0.558, "grad_norm": 27.25, "grad_norm_var": 3.628059895833333, "learning_rate": 0.0001, "loss": 7.2182, "loss/crossentropy": 2.0575726598501207, "loss/hidden": 3.360546875, "loss/jsd": 0.0, "loss/logits": 0.18049408327788113, "step": 16740 }, { "epoch": 0.5583333333333333, "grad_norm": 30.125, "grad_norm_var": 3.595572916666667, "learning_rate": 0.0001, "loss": 7.2255, "loss/crossentropy": 2.0005518101155757, "loss/hidden": 3.273828125, "loss/jsd": 0.0, "loss/logits": 0.17758453395217658, "step": 16750 }, { "epoch": 0.5586666666666666, "grad_norm": 29.25, "grad_norm_var": 2.3541666666666665, "learning_rate": 0.0001, "loss": 7.3313, "loss/crossentropy": 2.005421133339405, "loss/hidden": 3.37109375, "loss/jsd": 0.0, "loss/logits": 0.18893488664180041, "step": 16760 }, { "epoch": 0.559, "grad_norm": 28.75, "grad_norm_var": 2.18515625, "learning_rate": 0.0001, "loss": 7.1785, "loss/crossentropy": 2.03833954334259, "loss/hidden": 3.47109375, "loss/jsd": 0.0, "loss/logits": 0.18463380690664052, "step": 16770 }, { "epoch": 0.5593333333333333, "grad_norm": 30.875, "grad_norm_var": 6.939322916666667, "learning_rate": 0.0001, "loss": 7.3848, "loss/crossentropy": 2.001122633367777, "loss/hidden": 3.462109375, "loss/jsd": 0.0, "loss/logits": 0.19910238981246947, "step": 16780 }, { "epoch": 0.5596666666666666, "grad_norm": 31.625, "grad_norm_var": 11.178059895833334, "learning_rate": 0.0001, "loss": 7.3069, "loss/crossentropy": 1.9127840459346772, "loss/hidden": 3.381640625, "loss/jsd": 0.0, "loss/logits": 0.18504426423460246, "step": 16790 }, { "epoch": 0.56, "grad_norm": 29.875, "grad_norm_var": 8.467122395833334, "learning_rate": 0.0001, "loss": 7.3321, "loss/crossentropy": 2.057105243206024, "loss/hidden": 3.444140625, "loss/jsd": 0.0, "loss/logits": 0.19580034986138345, "step": 16800 }, { "epoch": 0.5603333333333333, "grad_norm": 34.75, "grad_norm_var": 3.258072916666667, "learning_rate": 0.0001, "loss": 7.2062, "loss/crossentropy": 1.7023715004324913, "loss/hidden": 3.426171875, "loss/jsd": 0.0, "loss/logits": 0.17715267231687903, "step": 16810 }, { "epoch": 0.5606666666666666, "grad_norm": 28.5, "grad_norm_var": 20.4994140625, "learning_rate": 0.0001, "loss": 7.3218, "loss/crossentropy": 1.9034311518073082, "loss/hidden": 3.415234375, "loss/jsd": 0.0, "loss/logits": 0.1847141981124878, "step": 16820 }, { "epoch": 0.561, "grad_norm": 28.625, "grad_norm_var": 5.739583333333333, "learning_rate": 0.0001, "loss": 7.2508, "loss/crossentropy": 1.9920265853405, "loss/hidden": 3.383984375, "loss/jsd": 0.0, "loss/logits": 0.1882050583139062, "step": 16830 }, { "epoch": 0.5613333333333334, "grad_norm": 31.875, "grad_norm_var": 3.897330729166667, "learning_rate": 0.0001, "loss": 7.2183, "loss/crossentropy": 2.0646637693047523, "loss/hidden": 3.345703125, "loss/jsd": 0.0, "loss/logits": 0.1809031156823039, "step": 16840 }, { "epoch": 0.5616666666666666, "grad_norm": 28.75, "grad_norm_var": 2.4759765625, "learning_rate": 0.0001, "loss": 7.1738, "loss/crossentropy": 2.0070127204060553, "loss/hidden": 3.439453125, "loss/jsd": 0.0, "loss/logits": 0.1883425075560808, "step": 16850 }, { "epoch": 0.562, "grad_norm": 30.0, "grad_norm_var": 2.112434895833333, "learning_rate": 0.0001, "loss": 7.3661, "loss/crossentropy": 1.9686364933848381, "loss/hidden": 3.447265625, "loss/jsd": 0.0, "loss/logits": 0.19056682977825404, "step": 16860 }, { "epoch": 0.5623333333333334, "grad_norm": 29.25, "grad_norm_var": 2.1395833333333334, "learning_rate": 0.0001, "loss": 7.3846, "loss/crossentropy": 1.927344098687172, "loss/hidden": 3.516796875, "loss/jsd": 0.0, "loss/logits": 0.20416761003434658, "step": 16870 }, { "epoch": 0.5626666666666666, "grad_norm": 29.125, "grad_norm_var": 3.09375, "learning_rate": 0.0001, "loss": 7.3285, "loss/crossentropy": 1.8983233183622361, "loss/hidden": 3.485546875, "loss/jsd": 0.0, "loss/logits": 0.19739714432507754, "step": 16880 }, { "epoch": 0.563, "grad_norm": 30.625, "grad_norm_var": 1.6822916666666667, "learning_rate": 0.0001, "loss": 7.2719, "loss/crossentropy": 1.757557562738657, "loss/hidden": 3.53046875, "loss/jsd": 0.0, "loss/logits": 0.1878972366452217, "step": 16890 }, { "epoch": 0.5633333333333334, "grad_norm": 28.75, "grad_norm_var": 1.053125, "learning_rate": 0.0001, "loss": 7.3044, "loss/crossentropy": 2.034758250415325, "loss/hidden": 3.373046875, "loss/jsd": 0.0, "loss/logits": 0.17609427832067012, "step": 16900 }, { "epoch": 0.5636666666666666, "grad_norm": 30.875, "grad_norm_var": 1.7108723958333334, "learning_rate": 0.0001, "loss": 7.2402, "loss/crossentropy": 1.975506929308176, "loss/hidden": 3.288671875, "loss/jsd": 0.0, "loss/logits": 0.17395650874823332, "step": 16910 }, { "epoch": 0.564, "grad_norm": 31.25, "grad_norm_var": 7.687434895833333, "learning_rate": 0.0001, "loss": 7.3049, "loss/crossentropy": 1.9966528728604316, "loss/hidden": 3.521875, "loss/jsd": 0.0, "loss/logits": 0.19653237238526344, "step": 16920 }, { "epoch": 0.5643333333333334, "grad_norm": 29.75, "grad_norm_var": 6.456184895833333, "learning_rate": 0.0001, "loss": 7.3647, "loss/crossentropy": 2.004221601039171, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.1901928609237075, "step": 16930 }, { "epoch": 0.5646666666666667, "grad_norm": 29.75, "grad_norm_var": 0.9936848958333333, "learning_rate": 0.0001, "loss": 7.3303, "loss/crossentropy": 1.92206571996212, "loss/hidden": 3.593359375, "loss/jsd": 0.0, "loss/logits": 0.1850942318327725, "step": 16940 }, { "epoch": 0.565, "grad_norm": 28.5, "grad_norm_var": 2.81015625, "learning_rate": 0.0001, "loss": 7.2708, "loss/crossentropy": 2.0296561308205128, "loss/hidden": 3.4125, "loss/jsd": 0.0, "loss/logits": 0.19217534381896256, "step": 16950 }, { "epoch": 0.5653333333333334, "grad_norm": 30.625, "grad_norm_var": 4.344791666666667, "learning_rate": 0.0001, "loss": 7.2973, "loss/crossentropy": 1.7922523364424705, "loss/hidden": 3.441796875, "loss/jsd": 0.0, "loss/logits": 0.18588219452649354, "step": 16960 }, { "epoch": 0.5656666666666667, "grad_norm": 30.5, "grad_norm_var": 3.065625, "learning_rate": 0.0001, "loss": 7.3068, "loss/crossentropy": 1.807192399352789, "loss/hidden": 3.49296875, "loss/jsd": 0.0, "loss/logits": 0.18796001579612492, "step": 16970 }, { "epoch": 0.566, "grad_norm": 29.625, "grad_norm_var": 1.3768229166666666, "learning_rate": 0.0001, "loss": 7.361, "loss/crossentropy": 2.1558587104082108, "loss/hidden": 3.494921875, "loss/jsd": 0.0, "loss/logits": 0.19713825285434722, "step": 16980 }, { "epoch": 0.5663333333333334, "grad_norm": 31.125, "grad_norm_var": 3.1455729166666666, "learning_rate": 0.0001, "loss": 7.2669, "loss/crossentropy": 1.8620438933372498, "loss/hidden": 3.461328125, "loss/jsd": 0.0, "loss/logits": 0.18754112347960472, "step": 16990 }, { "epoch": 0.5666666666666667, "grad_norm": 34.75, "grad_norm_var": 2.6757714717148447e+18, "learning_rate": 0.0001, "loss": 7.4241, "loss/crossentropy": 2.23966289460659, "loss/hidden": 3.430078125, "loss/jsd": 0.0, "loss/logits": 0.20008322596549988, "step": 17000 }, { "epoch": 0.567, "grad_norm": 29.5, "grad_norm_var": 2.675771470590247e+18, "learning_rate": 0.0001, "loss": 7.2155, "loss/crossentropy": 1.8322518840432167, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.19843745082616807, "step": 17010 }, { "epoch": 0.5673333333333334, "grad_norm": 33.25, "grad_norm_var": 2.229622395833333, "learning_rate": 0.0001, "loss": 7.4247, "loss/crossentropy": 2.031532459706068, "loss/hidden": 3.38359375, "loss/jsd": 0.0, "loss/logits": 0.20040796985849738, "step": 17020 }, { "epoch": 0.5676666666666667, "grad_norm": 27.5, "grad_norm_var": 2.283333333333333, "learning_rate": 0.0001, "loss": 7.2677, "loss/crossentropy": 1.9596425406634808, "loss/hidden": 3.570703125, "loss/jsd": 0.0, "loss/logits": 0.21733255330473183, "step": 17030 }, { "epoch": 0.568, "grad_norm": 32.75, "grad_norm_var": 2.5952473958333333, "learning_rate": 0.0001, "loss": 7.2953, "loss/crossentropy": 1.858614620566368, "loss/hidden": 3.533984375, "loss/jsd": 0.0, "loss/logits": 0.17945129349827765, "step": 17040 }, { "epoch": 0.5683333333333334, "grad_norm": 28.25, "grad_norm_var": 37.88333333333333, "learning_rate": 0.0001, "loss": 7.3247, "loss/crossentropy": 2.1007143922150133, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.19176965206861496, "step": 17050 }, { "epoch": 0.5686666666666667, "grad_norm": 31.25, "grad_norm_var": 3.1259765625, "learning_rate": 0.0001, "loss": 7.2894, "loss/crossentropy": 1.8990314483642579, "loss/hidden": 3.448828125, "loss/jsd": 0.0, "loss/logits": 0.18263417165726423, "step": 17060 }, { "epoch": 0.569, "grad_norm": 29.25, "grad_norm_var": 1.5244140625, "learning_rate": 0.0001, "loss": 7.1486, "loss/crossentropy": 1.9843677811324596, "loss/hidden": 3.317578125, "loss/jsd": 0.0, "loss/logits": 0.17821572832763194, "step": 17070 }, { "epoch": 0.5693333333333334, "grad_norm": 32.75, "grad_norm_var": 3.158072916666667, "learning_rate": 0.0001, "loss": 7.3464, "loss/crossentropy": 2.0689996108412743, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.19369205534458162, "step": 17080 }, { "epoch": 0.5696666666666667, "grad_norm": 30.0, "grad_norm_var": 2.7900390625, "learning_rate": 0.0001, "loss": 7.2443, "loss/crossentropy": 1.9969198986887933, "loss/hidden": 3.32421875, "loss/jsd": 0.0, "loss/logits": 0.17990297880023717, "step": 17090 }, { "epoch": 0.57, "grad_norm": 31.25, "grad_norm_var": 1.4249348958333334, "learning_rate": 0.0001, "loss": 7.2163, "loss/crossentropy": 1.9350959174335003, "loss/hidden": 3.346484375, "loss/jsd": 0.0, "loss/logits": 0.17001375257968904, "step": 17100 }, { "epoch": 0.5703333333333334, "grad_norm": 36.75, "grad_norm_var": 8.087239583333334, "learning_rate": 0.0001, "loss": 7.3, "loss/crossentropy": 2.001905527710915, "loss/hidden": 3.489453125, "loss/jsd": 0.0, "loss/logits": 0.1997325955890119, "step": 17110 }, { "epoch": 0.5706666666666667, "grad_norm": 27.5, "grad_norm_var": 9.404166666666667, "learning_rate": 0.0001, "loss": 7.2463, "loss/crossentropy": 2.1459966719150545, "loss/hidden": 3.354296875, "loss/jsd": 0.0, "loss/logits": 0.17998769599944353, "step": 17120 }, { "epoch": 0.571, "grad_norm": 30.0, "grad_norm_var": 10.045833333333333, "learning_rate": 0.0001, "loss": 7.2599, "loss/crossentropy": 2.0249556630849836, "loss/hidden": 3.370703125, "loss/jsd": 0.0, "loss/logits": 0.18499721717089415, "step": 17130 }, { "epoch": 0.5713333333333334, "grad_norm": 28.375, "grad_norm_var": 10.254166666666666, "learning_rate": 0.0001, "loss": 7.2004, "loss/crossentropy": 1.9608264788985252, "loss/hidden": 3.44375, "loss/jsd": 0.0, "loss/logits": 0.18652515541762113, "step": 17140 }, { "epoch": 0.5716666666666667, "grad_norm": 28.625, "grad_norm_var": 10.626497395833333, "learning_rate": 0.0001, "loss": 7.2865, "loss/crossentropy": 2.049925111234188, "loss/hidden": 3.44609375, "loss/jsd": 0.0, "loss/logits": 0.1949341731145978, "step": 17150 }, { "epoch": 0.572, "grad_norm": 30.0, "grad_norm_var": 15.303125, "learning_rate": 0.0001, "loss": 7.1853, "loss/crossentropy": 1.9070643194019794, "loss/hidden": 3.496875, "loss/jsd": 0.0, "loss/logits": 0.19253674633800982, "step": 17160 }, { "epoch": 0.5723333333333334, "grad_norm": 30.625, "grad_norm_var": 8.31640625, "learning_rate": 0.0001, "loss": 7.2139, "loss/crossentropy": 1.9311713933944703, "loss/hidden": 3.430078125, "loss/jsd": 0.0, "loss/logits": 0.18998315408825875, "step": 17170 }, { "epoch": 0.5726666666666667, "grad_norm": 28.875, "grad_norm_var": 14.683268229166666, "learning_rate": 0.0001, "loss": 7.1903, "loss/crossentropy": 1.8512010142207145, "loss/hidden": 3.35859375, "loss/jsd": 0.0, "loss/logits": 0.17788124224171042, "step": 17180 }, { "epoch": 0.573, "grad_norm": 29.125, "grad_norm_var": 13.214322916666667, "learning_rate": 0.0001, "loss": 7.3163, "loss/crossentropy": 2.1177233159542084, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.19376316517591477, "step": 17190 }, { "epoch": 0.5733333333333334, "grad_norm": 30.75, "grad_norm_var": 6.737955729166667, "learning_rate": 0.0001, "loss": 7.3298, "loss/crossentropy": 1.8638263367116452, "loss/hidden": 3.337890625, "loss/jsd": 0.0, "loss/logits": 0.17339903563261033, "step": 17200 }, { "epoch": 0.5736666666666667, "grad_norm": 28.0, "grad_norm_var": 8.087955729166667, "learning_rate": 0.0001, "loss": 7.2079, "loss/crossentropy": 1.8807823807001114, "loss/hidden": 3.2984375, "loss/jsd": 0.0, "loss/logits": 0.1689205912873149, "step": 17210 }, { "epoch": 0.574, "grad_norm": 27.875, "grad_norm_var": 7.6087890625, "learning_rate": 0.0001, "loss": 7.2492, "loss/crossentropy": 1.7798081412911415, "loss/hidden": 3.441796875, "loss/jsd": 0.0, "loss/logits": 0.17888787053525448, "step": 17220 }, { "epoch": 0.5743333333333334, "grad_norm": 39.5, "grad_norm_var": 3.2831241000943877e+18, "learning_rate": 0.0001, "loss": 7.1448, "loss/crossentropy": 1.9923778727650643, "loss/hidden": 3.371484375, "loss/jsd": 0.0, "loss/logits": 0.1799482261762023, "step": 17230 }, { "epoch": 0.5746666666666667, "grad_norm": 34.25, "grad_norm_var": 3.28312409912802e+18, "learning_rate": 0.0001, "loss": 7.2066, "loss/crossentropy": 1.921684445440769, "loss/hidden": 3.46328125, "loss/jsd": 0.0, "loss/logits": 0.2075881932862103, "step": 17240 }, { "epoch": 0.575, "grad_norm": 29.75, "grad_norm_var": 9.378580729166666, "learning_rate": 0.0001, "loss": 7.2227, "loss/crossentropy": 1.9521233439445496, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.18351897122338415, "step": 17250 }, { "epoch": 0.5753333333333334, "grad_norm": 36.5, "grad_norm_var": 11.667643229166666, "learning_rate": 0.0001, "loss": 7.122, "loss/crossentropy": 1.7369130730628968, "loss/hidden": 3.344140625, "loss/jsd": 0.0, "loss/logits": 0.1618085891008377, "step": 17260 }, { "epoch": 0.5756666666666667, "grad_norm": 26.125, "grad_norm_var": 2.899825551876258e+18, "learning_rate": 0.0001, "loss": 7.2961, "loss/crossentropy": 1.9375556394457818, "loss/hidden": 3.440234375, "loss/jsd": 0.0, "loss/logits": 0.18568181274458767, "step": 17270 }, { "epoch": 0.576, "grad_norm": 30.75, "grad_norm_var": 9.453059895833333, "learning_rate": 0.0001, "loss": 7.1755, "loss/crossentropy": 1.9473226353526116, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.1897781066596508, "step": 17280 }, { "epoch": 0.5763333333333334, "grad_norm": 29.0, "grad_norm_var": 1.6389973958333333, "learning_rate": 0.0001, "loss": 7.2094, "loss/crossentropy": 2.019539497792721, "loss/hidden": 3.48671875, "loss/jsd": 0.0, "loss/logits": 0.18947214037179946, "step": 17290 }, { "epoch": 0.5766666666666667, "grad_norm": 29.625, "grad_norm_var": 2.1802083333333333, "learning_rate": 0.0001, "loss": 7.1944, "loss/crossentropy": 1.8470656275749207, "loss/hidden": 3.45703125, "loss/jsd": 0.0, "loss/logits": 0.17752633318305017, "step": 17300 }, { "epoch": 0.577, "grad_norm": 31.5, "grad_norm_var": 3.16015625, "learning_rate": 0.0001, "loss": 7.2419, "loss/crossentropy": 2.042439620196819, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.17933249222114683, "step": 17310 }, { "epoch": 0.5773333333333334, "grad_norm": 31.875, "grad_norm_var": 5.976822916666666, "learning_rate": 0.0001, "loss": 7.3397, "loss/crossentropy": 2.0274734646081924, "loss/hidden": 3.392578125, "loss/jsd": 0.0, "loss/logits": 0.1818927289918065, "step": 17320 }, { "epoch": 0.5776666666666667, "grad_norm": 33.0, "grad_norm_var": 10.7697265625, "learning_rate": 0.0001, "loss": 7.2151, "loss/crossentropy": 1.880955833941698, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.17115770764648913, "step": 17330 }, { "epoch": 0.578, "grad_norm": 31.875, "grad_norm_var": 10.1337890625, "learning_rate": 0.0001, "loss": 7.2027, "loss/crossentropy": 1.9441435895860195, "loss/hidden": 3.3640625, "loss/jsd": 0.0, "loss/logits": 0.17402775613591076, "step": 17340 }, { "epoch": 0.5783333333333334, "grad_norm": 27.875, "grad_norm_var": 9.549739583333333, "learning_rate": 0.0001, "loss": 7.2457, "loss/crossentropy": 2.0440355040133, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.17811332028359175, "step": 17350 }, { "epoch": 0.5786666666666667, "grad_norm": 28.125, "grad_norm_var": 11.058333333333334, "learning_rate": 0.0001, "loss": 7.2628, "loss/crossentropy": 2.0546402662992476, "loss/hidden": 3.4515625, "loss/jsd": 0.0, "loss/logits": 0.1932734478265047, "step": 17360 }, { "epoch": 0.579, "grad_norm": 28.625, "grad_norm_var": 4.153059895833334, "learning_rate": 0.0001, "loss": 7.2812, "loss/crossentropy": 1.9771277114748955, "loss/hidden": 3.472265625, "loss/jsd": 0.0, "loss/logits": 0.1866312323138118, "step": 17370 }, { "epoch": 0.5793333333333334, "grad_norm": 27.875, "grad_norm_var": 2.1885416666666666, "learning_rate": 0.0001, "loss": 7.1751, "loss/crossentropy": 2.052058584243059, "loss/hidden": 3.38046875, "loss/jsd": 0.0, "loss/logits": 0.17928270678967237, "step": 17380 }, { "epoch": 0.5796666666666667, "grad_norm": 31.625, "grad_norm_var": 2.7916015625, "learning_rate": 0.0001, "loss": 7.3526, "loss/crossentropy": 2.0585345178842545, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.18788467887789012, "step": 17390 }, { "epoch": 0.58, "grad_norm": 28.0, "grad_norm_var": 1.6330729166666667, "learning_rate": 0.0001, "loss": 7.2385, "loss/crossentropy": 1.8620590336620808, "loss/hidden": 3.405078125, "loss/jsd": 0.0, "loss/logits": 0.18038669703528284, "step": 17400 }, { "epoch": 0.5803333333333334, "grad_norm": 29.375, "grad_norm_var": 5.790625, "learning_rate": 0.0001, "loss": 7.1927, "loss/crossentropy": 1.9602965280413627, "loss/hidden": 3.30546875, "loss/jsd": 0.0, "loss/logits": 0.17341553717851638, "step": 17410 }, { "epoch": 0.5806666666666667, "grad_norm": 37.0, "grad_norm_var": 4.902449621635939e+18, "learning_rate": 0.0001, "loss": 7.3188, "loss/crossentropy": 2.0419380933046343, "loss/hidden": 3.51484375, "loss/jsd": 0.0, "loss/logits": 0.19791702125221491, "step": 17420 }, { "epoch": 0.581, "grad_norm": 28.375, "grad_norm_var": 4.902449620772472e+18, "learning_rate": 0.0001, "loss": 7.165, "loss/crossentropy": 2.2736024111509323, "loss/hidden": 3.355859375, "loss/jsd": 0.0, "loss/logits": 0.19110151641070844, "step": 17430 }, { "epoch": 0.5813333333333334, "grad_norm": 32.5, "grad_norm_var": 1.9207682291666666, "learning_rate": 0.0001, "loss": 7.2659, "loss/crossentropy": 2.1284452833235266, "loss/hidden": 3.437109375, "loss/jsd": 0.0, "loss/logits": 0.1974963081534952, "step": 17440 }, { "epoch": 0.5816666666666667, "grad_norm": 28.0, "grad_norm_var": 3.3134765625, "learning_rate": 0.0001, "loss": 7.2644, "loss/crossentropy": 2.063309706747532, "loss/hidden": 3.332421875, "loss/jsd": 0.0, "loss/logits": 0.18200565353035927, "step": 17450 }, { "epoch": 0.582, "grad_norm": 31.25, "grad_norm_var": 3.9296223958333334, "learning_rate": 0.0001, "loss": 7.2031, "loss/crossentropy": 1.9794615499675274, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.18595365872606634, "step": 17460 }, { "epoch": 0.5823333333333334, "grad_norm": 28.75, "grad_norm_var": 2.2125, "learning_rate": 0.0001, "loss": 7.225, "loss/crossentropy": 1.9446428142488004, "loss/hidden": 3.305078125, "loss/jsd": 0.0, "loss/logits": 0.16845882274210452, "step": 17470 }, { "epoch": 0.5826666666666667, "grad_norm": 29.0, "grad_norm_var": 56.0009765625, "learning_rate": 0.0001, "loss": 7.2606, "loss/crossentropy": 1.8466723755002021, "loss/hidden": 3.4703125, "loss/jsd": 0.0, "loss/logits": 0.1862383043859154, "step": 17480 }, { "epoch": 0.583, "grad_norm": 28.875, "grad_norm_var": 4.72265625, "learning_rate": 0.0001, "loss": 7.2603, "loss/crossentropy": 1.98887038230896, "loss/hidden": 3.43203125, "loss/jsd": 0.0, "loss/logits": 0.1857691845856607, "step": 17490 }, { "epoch": 0.5833333333333334, "grad_norm": 32.5, "grad_norm_var": 21.951497395833332, "learning_rate": 0.0001, "loss": 7.44, "loss/crossentropy": 2.0265815660357474, "loss/hidden": 3.495703125, "loss/jsd": 0.0, "loss/logits": 0.20335600562393666, "step": 17500 }, { "epoch": 0.5836666666666667, "grad_norm": 32.0, "grad_norm_var": 21.875, "learning_rate": 0.0001, "loss": 7.3403, "loss/crossentropy": 2.1662577986717224, "loss/hidden": 3.3890625, "loss/jsd": 0.0, "loss/logits": 0.19854557421058416, "step": 17510 }, { "epoch": 0.584, "grad_norm": 36.5, "grad_norm_var": 9.458072916666667, "learning_rate": 0.0001, "loss": 7.212, "loss/crossentropy": 1.8523712366819383, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.16759339440613985, "step": 17520 }, { "epoch": 0.5843333333333334, "grad_norm": 29.125, "grad_norm_var": 4.8681640625, "learning_rate": 0.0001, "loss": 7.2613, "loss/crossentropy": 1.920142289251089, "loss/hidden": 3.35703125, "loss/jsd": 0.0, "loss/logits": 0.1755612438544631, "step": 17530 }, { "epoch": 0.5846666666666667, "grad_norm": 30.75, "grad_norm_var": 3.0455729166666665, "learning_rate": 0.0001, "loss": 7.2633, "loss/crossentropy": 2.158094495534897, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.19018125347793102, "step": 17540 }, { "epoch": 0.585, "grad_norm": 27.75, "grad_norm_var": 2.3556640625, "learning_rate": 0.0001, "loss": 7.1846, "loss/crossentropy": 1.951464493572712, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.1807758964598179, "step": 17550 }, { "epoch": 0.5853333333333334, "grad_norm": 30.625, "grad_norm_var": 1.5692057291666666, "learning_rate": 0.0001, "loss": 7.2541, "loss/crossentropy": 2.1018988613039253, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.19067187327891588, "step": 17560 }, { "epoch": 0.5856666666666667, "grad_norm": 28.0, "grad_norm_var": 1.965625, "learning_rate": 0.0001, "loss": 7.2077, "loss/crossentropy": 2.0292631089687347, "loss/hidden": 3.294140625, "loss/jsd": 0.0, "loss/logits": 0.18160678073763847, "step": 17570 }, { "epoch": 0.586, "grad_norm": 30.625, "grad_norm_var": 3.4337890625, "learning_rate": 0.0001, "loss": 7.2902, "loss/crossentropy": 2.017189198732376, "loss/hidden": 3.524609375, "loss/jsd": 0.0, "loss/logits": 0.19253381118178367, "step": 17580 }, { "epoch": 0.5863333333333334, "grad_norm": 30.125, "grad_norm_var": 1.8875, "learning_rate": 0.0001, "loss": 7.3351, "loss/crossentropy": 1.9146175347268581, "loss/hidden": 3.339453125, "loss/jsd": 0.0, "loss/logits": 0.18046838557347655, "step": 17590 }, { "epoch": 0.5866666666666667, "grad_norm": 26.875, "grad_norm_var": 2.6634765625, "learning_rate": 0.0001, "loss": 7.2362, "loss/crossentropy": 1.9982169926166535, "loss/hidden": 3.445703125, "loss/jsd": 0.0, "loss/logits": 0.1851495834067464, "step": 17600 }, { "epoch": 0.587, "grad_norm": 28.75, "grad_norm_var": 5.6931640625, "learning_rate": 0.0001, "loss": 7.2835, "loss/crossentropy": 1.9367871195077897, "loss/hidden": 3.422265625, "loss/jsd": 0.0, "loss/logits": 0.1763663213700056, "step": 17610 }, { "epoch": 0.5873333333333334, "grad_norm": 28.25, "grad_norm_var": 17.03515625, "learning_rate": 0.0001, "loss": 7.2378, "loss/crossentropy": 2.0663704581558706, "loss/hidden": 3.471875, "loss/jsd": 0.0, "loss/logits": 0.19266408700495957, "step": 17620 }, { "epoch": 0.5876666666666667, "grad_norm": 28.875, "grad_norm_var": 5.716666666666667, "learning_rate": 0.0001, "loss": 7.0719, "loss/crossentropy": 2.000533550977707, "loss/hidden": 3.308984375, "loss/jsd": 0.0, "loss/logits": 0.17328761713579297, "step": 17630 }, { "epoch": 0.588, "grad_norm": 32.0, "grad_norm_var": 1.8999348958333333, "learning_rate": 0.0001, "loss": 7.2611, "loss/crossentropy": 2.094344127178192, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.19264199137687682, "step": 17640 }, { "epoch": 0.5883333333333334, "grad_norm": 29.875, "grad_norm_var": 12.882291666666667, "learning_rate": 0.0001, "loss": 7.2057, "loss/crossentropy": 2.05139602124691, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.19437335543334483, "step": 17650 }, { "epoch": 0.5886666666666667, "grad_norm": 28.375, "grad_norm_var": 2.7552083333333335, "learning_rate": 0.0001, "loss": 7.2633, "loss/crossentropy": 2.1135535605251787, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.20155803207308054, "step": 17660 }, { "epoch": 0.589, "grad_norm": 28.75, "grad_norm_var": 2.2059895833333334, "learning_rate": 0.0001, "loss": 7.3064, "loss/crossentropy": 2.090160796046257, "loss/hidden": 3.425390625, "loss/jsd": 0.0, "loss/logits": 0.2027810573577881, "step": 17670 }, { "epoch": 0.5893333333333334, "grad_norm": 29.5, "grad_norm_var": 2.259375, "learning_rate": 0.0001, "loss": 7.1488, "loss/crossentropy": 2.0213358223438265, "loss/hidden": 3.26796875, "loss/jsd": 0.0, "loss/logits": 0.1669399242848158, "step": 17680 }, { "epoch": 0.5896666666666667, "grad_norm": 27.75, "grad_norm_var": 1.6893229166666666, "learning_rate": 0.0001, "loss": 7.2654, "loss/crossentropy": 1.967190508544445, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.17978248447179795, "step": 17690 }, { "epoch": 0.59, "grad_norm": 29.75, "grad_norm_var": 2.3655598958333335, "learning_rate": 0.0001, "loss": 7.2403, "loss/crossentropy": 1.898328522592783, "loss/hidden": 3.376953125, "loss/jsd": 0.0, "loss/logits": 0.1806940281763673, "step": 17700 }, { "epoch": 0.5903333333333334, "grad_norm": 29.75, "grad_norm_var": 2.2577473958333334, "learning_rate": 0.0001, "loss": 7.2388, "loss/crossentropy": 2.0621902495622635, "loss/hidden": 3.327734375, "loss/jsd": 0.0, "loss/logits": 0.18974024932831526, "step": 17710 }, { "epoch": 0.5906666666666667, "grad_norm": 27.625, "grad_norm_var": 1.7827473958333333, "learning_rate": 0.0001, "loss": 7.2003, "loss/crossentropy": 2.0286877915263175, "loss/hidden": 3.4265625, "loss/jsd": 0.0, "loss/logits": 0.19554231744259595, "step": 17720 }, { "epoch": 0.591, "grad_norm": 27.875, "grad_norm_var": 6.455208333333333, "learning_rate": 0.0001, "loss": 7.2889, "loss/crossentropy": 2.047126035392284, "loss/hidden": 3.558984375, "loss/jsd": 0.0, "loss/logits": 0.21331963799893855, "step": 17730 }, { "epoch": 0.5913333333333334, "grad_norm": 29.875, "grad_norm_var": 8.551041666666666, "learning_rate": 0.0001, "loss": 7.1257, "loss/crossentropy": 2.045316530764103, "loss/hidden": 3.3453125, "loss/jsd": 0.0, "loss/logits": 0.18156069591641427, "step": 17740 }, { "epoch": 0.5916666666666667, "grad_norm": 28.375, "grad_norm_var": 1.5947265625, "learning_rate": 0.0001, "loss": 7.2895, "loss/crossentropy": 2.0665383666753767, "loss/hidden": 3.438671875, "loss/jsd": 0.0, "loss/logits": 0.1922233860939741, "step": 17750 }, { "epoch": 0.592, "grad_norm": 29.25, "grad_norm_var": 1.1582682291666666, "learning_rate": 0.0001, "loss": 7.2933, "loss/crossentropy": 2.091674039512873, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.18688697535544635, "step": 17760 }, { "epoch": 0.5923333333333334, "grad_norm": 29.625, "grad_norm_var": 3.7030598958333334, "learning_rate": 0.0001, "loss": 7.2927, "loss/crossentropy": 1.9259259879589081, "loss/hidden": 3.511328125, "loss/jsd": 0.0, "loss/logits": 0.19638193324208258, "step": 17770 }, { "epoch": 0.5926666666666667, "grad_norm": 35.5, "grad_norm_var": 6.693489583333333, "learning_rate": 0.0001, "loss": 7.2655, "loss/crossentropy": 1.9216715686023236, "loss/hidden": 3.3734375, "loss/jsd": 0.0, "loss/logits": 0.18766624759882689, "step": 17780 }, { "epoch": 0.593, "grad_norm": 30.75, "grad_norm_var": 8.443489583333333, "learning_rate": 0.0001, "loss": 7.3507, "loss/crossentropy": 2.0846157282590867, "loss/hidden": 3.394140625, "loss/jsd": 0.0, "loss/logits": 0.18957795407623051, "step": 17790 }, { "epoch": 0.5933333333333334, "grad_norm": 29.5, "grad_norm_var": 6.170833333333333, "learning_rate": 0.0001, "loss": 7.2629, "loss/crossentropy": 2.0372542373836042, "loss/hidden": 3.4125, "loss/jsd": 0.0, "loss/logits": 0.19953991882503033, "step": 17800 }, { "epoch": 0.5936666666666667, "grad_norm": 31.875, "grad_norm_var": 1.7426432291666667, "learning_rate": 0.0001, "loss": 7.1925, "loss/crossentropy": 2.1525475442409516, "loss/hidden": 3.423046875, "loss/jsd": 0.0, "loss/logits": 0.19645317643880844, "step": 17810 }, { "epoch": 0.594, "grad_norm": 30.875, "grad_norm_var": 2.4872395833333334, "learning_rate": 0.0001, "loss": 7.2082, "loss/crossentropy": 1.972256100922823, "loss/hidden": 3.32109375, "loss/jsd": 0.0, "loss/logits": 0.19237237218767406, "step": 17820 }, { "epoch": 0.5943333333333334, "grad_norm": 35.0, "grad_norm_var": 3.9994140625, "learning_rate": 0.0001, "loss": 7.2987, "loss/crossentropy": 2.0836825877428056, "loss/hidden": 3.480078125, "loss/jsd": 0.0, "loss/logits": 0.187174391746521, "step": 17830 }, { "epoch": 0.5946666666666667, "grad_norm": 30.5, "grad_norm_var": 5.955143229166667, "learning_rate": 0.0001, "loss": 7.2407, "loss/crossentropy": 1.9825619161128998, "loss/hidden": 3.437109375, "loss/jsd": 0.0, "loss/logits": 0.1979378428310156, "step": 17840 }, { "epoch": 0.595, "grad_norm": 32.0, "grad_norm_var": 21.4197265625, "learning_rate": 0.0001, "loss": 7.2156, "loss/crossentropy": 1.911845586448908, "loss/hidden": 3.348828125, "loss/jsd": 0.0, "loss/logits": 0.17510547991842032, "step": 17850 }, { "epoch": 0.5953333333333334, "grad_norm": 29.5, "grad_norm_var": 20.6712890625, "learning_rate": 0.0001, "loss": 7.1642, "loss/crossentropy": 1.9271966315805913, "loss/hidden": 3.459765625, "loss/jsd": 0.0, "loss/logits": 0.18093658033758403, "step": 17860 }, { "epoch": 0.5956666666666667, "grad_norm": 28.0, "grad_norm_var": 1.4718098958333334, "learning_rate": 0.0001, "loss": 7.2286, "loss/crossentropy": 1.86384509652853, "loss/hidden": 3.407421875, "loss/jsd": 0.0, "loss/logits": 0.18292849976569414, "step": 17870 }, { "epoch": 0.596, "grad_norm": 28.375, "grad_norm_var": 1.8358723958333334, "learning_rate": 0.0001, "loss": 7.1288, "loss/crossentropy": 1.8128426790237426, "loss/hidden": 3.480078125, "loss/jsd": 0.0, "loss/logits": 0.18369462629780173, "step": 17880 }, { "epoch": 0.5963333333333334, "grad_norm": 29.25, "grad_norm_var": 17.42265625, "learning_rate": 0.0001, "loss": 7.3356, "loss/crossentropy": 2.048015257716179, "loss/hidden": 3.358203125, "loss/jsd": 0.0, "loss/logits": 0.18430774360895158, "step": 17890 }, { "epoch": 0.5966666666666667, "grad_norm": 30.625, "grad_norm_var": 16.071875, "learning_rate": 0.0001, "loss": 7.3394, "loss/crossentropy": 1.972554624080658, "loss/hidden": 3.304296875, "loss/jsd": 0.0, "loss/logits": 0.17892546746879817, "step": 17900 }, { "epoch": 0.597, "grad_norm": 31.0, "grad_norm_var": 1.5843098958333333, "learning_rate": 0.0001, "loss": 7.3165, "loss/crossentropy": 2.031166372448206, "loss/hidden": 3.381640625, "loss/jsd": 0.0, "loss/logits": 0.18816648367792368, "step": 17910 }, { "epoch": 0.5973333333333334, "grad_norm": 28.75, "grad_norm_var": 1.4067057291666667, "learning_rate": 0.0001, "loss": 7.3232, "loss/crossentropy": 2.0565346404910088, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.19050047229975461, "step": 17920 }, { "epoch": 0.5976666666666667, "grad_norm": 27.0, "grad_norm_var": 3.3218098958333333, "learning_rate": 0.0001, "loss": 7.1573, "loss/crossentropy": 1.8999904945492745, "loss/hidden": 3.433203125, "loss/jsd": 0.0, "loss/logits": 0.18193528316915036, "step": 17930 }, { "epoch": 0.598, "grad_norm": 29.5, "grad_norm_var": 4.2916015625, "learning_rate": 0.0001, "loss": 7.287, "loss/crossentropy": 2.0124031715095043, "loss/hidden": 3.4640625, "loss/jsd": 0.0, "loss/logits": 0.19219639096409083, "step": 17940 }, { "epoch": 0.5983333333333334, "grad_norm": 28.75, "grad_norm_var": 3.4879557291666665, "learning_rate": 0.0001, "loss": 7.2311, "loss/crossentropy": 2.0456438839435576, "loss/hidden": 3.33984375, "loss/jsd": 0.0, "loss/logits": 0.1854414351284504, "step": 17950 }, { "epoch": 0.5986666666666667, "grad_norm": 26.375, "grad_norm_var": 3.7768229166666667, "learning_rate": 0.0001, "loss": 7.0943, "loss/crossentropy": 2.0780529260635374, "loss/hidden": 3.4734375, "loss/jsd": 0.0, "loss/logits": 0.19596082717180252, "step": 17960 }, { "epoch": 0.599, "grad_norm": 28.875, "grad_norm_var": 4.284309895833333, "learning_rate": 0.0001, "loss": 7.2531, "loss/crossentropy": 2.0367307879030703, "loss/hidden": 3.43515625, "loss/jsd": 0.0, "loss/logits": 0.19683304838836194, "step": 17970 }, { "epoch": 0.5993333333333334, "grad_norm": 31.625, "grad_norm_var": 2635.5306640625, "learning_rate": 0.0001, "loss": 7.259, "loss/crossentropy": 2.09322746694088, "loss/hidden": 3.266015625, "loss/jsd": 0.0, "loss/logits": 0.179186581261456, "step": 17980 }, { "epoch": 0.5996666666666667, "grad_norm": 30.375, "grad_norm_var": 4429.205208333334, "learning_rate": 0.0001, "loss": 7.213, "loss/crossentropy": 1.9054803669452667, "loss/hidden": 3.540234375, "loss/jsd": 0.0, "loss/logits": 0.1968476613983512, "step": 17990 }, { "epoch": 0.6, "grad_norm": 26.125, "grad_norm_var": 2221.7921223958333, "learning_rate": 0.0001, "loss": 7.2227, "loss/crossentropy": 2.0074591264128685, "loss/hidden": 3.494921875, "loss/jsd": 0.0, "loss/logits": 0.1873547974973917, "step": 18000 }, { "epoch": 0.6003333333333334, "grad_norm": 29.375, "grad_norm_var": 3.6155598958333335, "learning_rate": 0.0001, "loss": 7.1801, "loss/crossentropy": 1.9968949660658837, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.18020131913945078, "step": 18010 }, { "epoch": 0.6006666666666667, "grad_norm": 32.75, "grad_norm_var": 3.2072265625, "learning_rate": 0.0001, "loss": 7.2596, "loss/crossentropy": 1.9853629019111394, "loss/hidden": 3.430859375, "loss/jsd": 0.0, "loss/logits": 0.18953896183520555, "step": 18020 }, { "epoch": 0.601, "grad_norm": 29.875, "grad_norm_var": 45.79765625, "learning_rate": 0.0001, "loss": 7.307, "loss/crossentropy": 1.9396917998790741, "loss/hidden": 3.49453125, "loss/jsd": 0.0, "loss/logits": 0.19391046669334172, "step": 18030 }, { "epoch": 0.6013333333333334, "grad_norm": 30.375, "grad_norm_var": 21.314518229166666, "learning_rate": 0.0001, "loss": 7.2079, "loss/crossentropy": 1.8975107453763485, "loss/hidden": 3.367578125, "loss/jsd": 0.0, "loss/logits": 0.18006250262260437, "step": 18040 }, { "epoch": 0.6016666666666667, "grad_norm": 29.75, "grad_norm_var": 16.888541666666665, "learning_rate": 0.0001, "loss": 7.2938, "loss/crossentropy": 2.1740293741226195, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.20504566319286824, "step": 18050 }, { "epoch": 0.602, "grad_norm": 28.875, "grad_norm_var": 8.641080729166667, "learning_rate": 0.0001, "loss": 7.1802, "loss/crossentropy": 1.876610678434372, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.17648143526166676, "step": 18060 }, { "epoch": 0.6023333333333334, "grad_norm": 32.5, "grad_norm_var": 8.122330729166666, "learning_rate": 0.0001, "loss": 7.1863, "loss/crossentropy": 1.7905786775052548, "loss/hidden": 3.412109375, "loss/jsd": 0.0, "loss/logits": 0.17692546313628554, "step": 18070 }, { "epoch": 0.6026666666666667, "grad_norm": 30.5, "grad_norm_var": 8.7431640625, "learning_rate": 0.0001, "loss": 7.2455, "loss/crossentropy": 1.9867556169629097, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.19072600714862348, "step": 18080 }, { "epoch": 0.603, "grad_norm": 30.25, "grad_norm_var": 7.245833333333334, "learning_rate": 0.0001, "loss": 7.2846, "loss/crossentropy": 1.9955212384462357, "loss/hidden": 3.569921875, "loss/jsd": 0.0, "loss/logits": 0.21169722471386193, "step": 18090 }, { "epoch": 0.6033333333333334, "grad_norm": 30.75, "grad_norm_var": 15.767708333333333, "learning_rate": 0.0001, "loss": 7.3287, "loss/crossentropy": 1.9649004712700844, "loss/hidden": 3.458984375, "loss/jsd": 0.0, "loss/logits": 0.19037200771272184, "step": 18100 }, { "epoch": 0.6036666666666667, "grad_norm": 29.625, "grad_norm_var": 3.1925595271396854e+18, "learning_rate": 0.0001, "loss": 7.2999, "loss/crossentropy": 1.8987395763397217, "loss/hidden": 3.459375, "loss/jsd": 0.0, "loss/logits": 0.18546657506376504, "step": 18110 }, { "epoch": 0.604, "grad_norm": 35.0, "grad_norm_var": 5.633834196659916e+18, "learning_rate": 0.0001, "loss": 7.4296, "loss/crossentropy": 1.9901069954037667, "loss/hidden": 3.4734375, "loss/jsd": 0.0, "loss/logits": 0.19916359782218934, "step": 18120 }, { "epoch": 0.6043333333333333, "grad_norm": 31.375, "grad_norm_var": 2.842967607632986e+18, "learning_rate": 0.0001, "loss": 7.3099, "loss/crossentropy": 2.0697161041200163, "loss/hidden": 3.37578125, "loss/jsd": 0.0, "loss/logits": 0.19085621517151594, "step": 18130 }, { "epoch": 0.6046666666666667, "grad_norm": 28.25, "grad_norm_var": 1.1697916666666666, "learning_rate": 0.0001, "loss": 7.2071, "loss/crossentropy": 1.950901211798191, "loss/hidden": 3.498828125, "loss/jsd": 0.0, "loss/logits": 0.18995508626103402, "step": 18140 }, { "epoch": 0.605, "grad_norm": 29.25, "grad_norm_var": 1.9296223958333334, "learning_rate": 0.0001, "loss": 7.1551, "loss/crossentropy": 1.8647797212004662, "loss/hidden": 3.36875, "loss/jsd": 0.0, "loss/logits": 0.1695878764614463, "step": 18150 }, { "epoch": 0.6053333333333333, "grad_norm": 38.5, "grad_norm_var": 6.716666666666667, "learning_rate": 0.0001, "loss": 7.2614, "loss/crossentropy": 1.891598542034626, "loss/hidden": 3.33359375, "loss/jsd": 0.0, "loss/logits": 0.17559167575091122, "step": 18160 }, { "epoch": 0.6056666666666667, "grad_norm": 28.0, "grad_norm_var": 14.01015625, "learning_rate": 0.0001, "loss": 7.29, "loss/crossentropy": 1.9643616311252117, "loss/hidden": 3.464453125, "loss/jsd": 0.0, "loss/logits": 0.19505905471742152, "step": 18170 }, { "epoch": 0.606, "grad_norm": 28.75, "grad_norm_var": 9.397916666666667, "learning_rate": 0.0001, "loss": 7.3056, "loss/crossentropy": 1.9337473601102828, "loss/hidden": 3.526171875, "loss/jsd": 0.0, "loss/logits": 0.19584217928349973, "step": 18180 }, { "epoch": 0.6063333333333333, "grad_norm": 33.0, "grad_norm_var": 7.353059895833334, "learning_rate": 0.0001, "loss": 7.2155, "loss/crossentropy": 2.0806949749588965, "loss/hidden": 3.47421875, "loss/jsd": 0.0, "loss/logits": 0.19349032249301673, "step": 18190 }, { "epoch": 0.6066666666666667, "grad_norm": 32.25, "grad_norm_var": 11.664322916666666, "learning_rate": 0.0001, "loss": 7.2511, "loss/crossentropy": 1.963471694663167, "loss/hidden": 3.42734375, "loss/jsd": 0.0, "loss/logits": 0.17991171199828387, "step": 18200 }, { "epoch": 0.607, "grad_norm": 30.75, "grad_norm_var": 7.90390625, "learning_rate": 0.0001, "loss": 7.3202, "loss/crossentropy": 1.9921027541160583, "loss/hidden": 3.3953125, "loss/jsd": 0.0, "loss/logits": 0.18385309586301446, "step": 18210 }, { "epoch": 0.6073333333333333, "grad_norm": 29.0, "grad_norm_var": 1.3488932291666667, "learning_rate": 0.0001, "loss": 7.2404, "loss/crossentropy": 2.0484732441604137, "loss/hidden": 3.335546875, "loss/jsd": 0.0, "loss/logits": 0.18387728687375784, "step": 18220 }, { "epoch": 0.6076666666666667, "grad_norm": 33.75, "grad_norm_var": 11.079622395833333, "learning_rate": 0.0001, "loss": 7.234, "loss/crossentropy": 1.9633229859173298, "loss/hidden": 3.329296875, "loss/jsd": 0.0, "loss/logits": 0.176436207909137, "step": 18230 }, { "epoch": 0.608, "grad_norm": 31.125, "grad_norm_var": 13.346809895833333, "learning_rate": 0.0001, "loss": 7.2391, "loss/crossentropy": 1.8430568292737006, "loss/hidden": 3.42890625, "loss/jsd": 0.0, "loss/logits": 0.18235661080107093, "step": 18240 }, { "epoch": 0.6083333333333333, "grad_norm": 29.625, "grad_norm_var": 2.7744140625, "learning_rate": 0.0001, "loss": 7.1006, "loss/crossentropy": 1.9133832603693008, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.17318002954125405, "step": 18250 }, { "epoch": 0.6086666666666667, "grad_norm": 29.25, "grad_norm_var": 2.0686848958333335, "learning_rate": 0.0001, "loss": 7.2186, "loss/crossentropy": 1.9376382641494274, "loss/hidden": 3.434765625, "loss/jsd": 0.0, "loss/logits": 0.18242966691032053, "step": 18260 }, { "epoch": 0.609, "grad_norm": 29.25, "grad_norm_var": 2.0478515625, "learning_rate": 0.0001, "loss": 7.2635, "loss/crossentropy": 2.115917232632637, "loss/hidden": 3.3484375, "loss/jsd": 0.0, "loss/logits": 0.17996171973645686, "step": 18270 }, { "epoch": 0.6093333333333333, "grad_norm": 30.375, "grad_norm_var": 1.42265625, "learning_rate": 0.0001, "loss": 7.2453, "loss/crossentropy": 1.9246600836515426, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.1961457096040249, "step": 18280 }, { "epoch": 0.6096666666666667, "grad_norm": 30.875, "grad_norm_var": 2.3291666666666666, "learning_rate": 0.0001, "loss": 7.3304, "loss/crossentropy": 2.0575062677264215, "loss/hidden": 3.452734375, "loss/jsd": 0.0, "loss/logits": 0.1961954228579998, "step": 18290 }, { "epoch": 0.61, "grad_norm": 30.75, "grad_norm_var": 13.333268229166666, "learning_rate": 0.0001, "loss": 7.1919, "loss/crossentropy": 2.108006913214922, "loss/hidden": 3.409765625, "loss/jsd": 0.0, "loss/logits": 0.17983820140361786, "step": 18300 }, { "epoch": 0.6103333333333333, "grad_norm": 34.25, "grad_norm_var": 4.865559895833333, "learning_rate": 0.0001, "loss": 7.1468, "loss/crossentropy": 1.962227436900139, "loss/hidden": 3.366015625, "loss/jsd": 0.0, "loss/logits": 0.1787870392203331, "step": 18310 }, { "epoch": 0.6106666666666667, "grad_norm": 27.75, "grad_norm_var": 5.229166666666667, "learning_rate": 0.0001, "loss": 7.2723, "loss/crossentropy": 2.0233663827180863, "loss/hidden": 3.45625, "loss/jsd": 0.0, "loss/logits": 0.1891197700984776, "step": 18320 }, { "epoch": 0.611, "grad_norm": 28.625, "grad_norm_var": 1.4139973958333334, "learning_rate": 0.0001, "loss": 7.261, "loss/crossentropy": 2.099140986800194, "loss/hidden": 3.45703125, "loss/jsd": 0.0, "loss/logits": 0.1941378552466631, "step": 18330 }, { "epoch": 0.6113333333333333, "grad_norm": 31.0, "grad_norm_var": 3.8462890625, "learning_rate": 0.0001, "loss": 7.3386, "loss/crossentropy": 1.9343717962503433, "loss/hidden": 3.423046875, "loss/jsd": 0.0, "loss/logits": 0.17795868963003159, "step": 18340 }, { "epoch": 0.6116666666666667, "grad_norm": 31.0, "grad_norm_var": 1.3393229166666667, "learning_rate": 0.0001, "loss": 7.2923, "loss/crossentropy": 1.9366346143186093, "loss/hidden": 3.427734375, "loss/jsd": 0.0, "loss/logits": 0.18902826150879265, "step": 18350 }, { "epoch": 0.612, "grad_norm": 29.375, "grad_norm_var": 1.2212890625, "learning_rate": 0.0001, "loss": 7.428, "loss/crossentropy": 2.085362437367439, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.19879386220127343, "step": 18360 }, { "epoch": 0.6123333333333333, "grad_norm": 30.875, "grad_norm_var": 1.7905598958333333, "learning_rate": 0.0001, "loss": 7.2307, "loss/crossentropy": 1.921523703634739, "loss/hidden": 3.473046875, "loss/jsd": 0.0, "loss/logits": 0.18595185689628124, "step": 18370 }, { "epoch": 0.6126666666666667, "grad_norm": 28.625, "grad_norm_var": 3.1572265625, "learning_rate": 0.0001, "loss": 7.2473, "loss/crossentropy": 1.8036039613187314, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.16718588918447494, "step": 18380 }, { "epoch": 0.613, "grad_norm": 29.75, "grad_norm_var": 4.3791015625, "learning_rate": 0.0001, "loss": 7.2935, "loss/crossentropy": 1.9395476639270783, "loss/hidden": 3.43125, "loss/jsd": 0.0, "loss/logits": 0.1946342770010233, "step": 18390 }, { "epoch": 0.6133333333333333, "grad_norm": 28.25, "grad_norm_var": 2.3385416666666665, "learning_rate": 0.0001, "loss": 7.1963, "loss/crossentropy": 2.077028851211071, "loss/hidden": 3.38359375, "loss/jsd": 0.0, "loss/logits": 0.1959906244650483, "step": 18400 }, { "epoch": 0.6136666666666667, "grad_norm": 29.625, "grad_norm_var": 3.220833333333333, "learning_rate": 0.0001, "loss": 7.2443, "loss/crossentropy": 2.001774525642395, "loss/hidden": 3.370703125, "loss/jsd": 0.0, "loss/logits": 0.17920596692711116, "step": 18410 }, { "epoch": 0.614, "grad_norm": 28.875, "grad_norm_var": 2.7718098958333335, "learning_rate": 0.0001, "loss": 7.3045, "loss/crossentropy": 2.0613322094082833, "loss/hidden": 3.4515625, "loss/jsd": 0.0, "loss/logits": 0.19724120739847423, "step": 18420 }, { "epoch": 0.6143333333333333, "grad_norm": 33.0, "grad_norm_var": 2.3643229166666666, "learning_rate": 0.0001, "loss": 7.2027, "loss/crossentropy": 1.9185424163937568, "loss/hidden": 3.425390625, "loss/jsd": 0.0, "loss/logits": 0.18797938544303178, "step": 18430 }, { "epoch": 0.6146666666666667, "grad_norm": 29.125, "grad_norm_var": 2.4559895833333334, "learning_rate": 0.0001, "loss": 7.2468, "loss/crossentropy": 1.813283124566078, "loss/hidden": 3.36640625, "loss/jsd": 0.0, "loss/logits": 0.18377352096140384, "step": 18440 }, { "epoch": 0.615, "grad_norm": 31.25, "grad_norm_var": 1.7728515625, "learning_rate": 0.0001, "loss": 7.3922, "loss/crossentropy": 2.309616395831108, "loss/hidden": 3.364453125, "loss/jsd": 0.0, "loss/logits": 0.205170863494277, "step": 18450 }, { "epoch": 0.6153333333333333, "grad_norm": 32.25, "grad_norm_var": 3.0072265625, "learning_rate": 0.0001, "loss": 7.3169, "loss/crossentropy": 2.1744403660297396, "loss/hidden": 3.325, "loss/jsd": 0.0, "loss/logits": 0.1856847619637847, "step": 18460 }, { "epoch": 0.6156666666666667, "grad_norm": 29.625, "grad_norm_var": 18.755208333333332, "learning_rate": 0.0001, "loss": 7.1684, "loss/crossentropy": 1.8449337378144264, "loss/hidden": 3.369921875, "loss/jsd": 0.0, "loss/logits": 0.17641404923051596, "step": 18470 }, { "epoch": 0.616, "grad_norm": 32.5, "grad_norm_var": 15.3056640625, "learning_rate": 0.0001, "loss": 7.3307, "loss/crossentropy": 2.247915732860565, "loss/hidden": 3.40859375, "loss/jsd": 0.0, "loss/logits": 0.20192933976650237, "step": 18480 }, { "epoch": 0.6163333333333333, "grad_norm": 29.625, "grad_norm_var": 1.640625, "learning_rate": 0.0001, "loss": 7.2853, "loss/crossentropy": 1.9379427522420882, "loss/hidden": 3.287890625, "loss/jsd": 0.0, "loss/logits": 0.16415343983098865, "step": 18490 }, { "epoch": 0.6166666666666667, "grad_norm": 27.625, "grad_norm_var": 3.0893229166666667, "learning_rate": 0.0001, "loss": 7.2162, "loss/crossentropy": 1.9569771081209182, "loss/hidden": 3.344140625, "loss/jsd": 0.0, "loss/logits": 0.1877719761803746, "step": 18500 }, { "epoch": 0.617, "grad_norm": 30.625, "grad_norm_var": 5.93515625, "learning_rate": 0.0001, "loss": 7.1706, "loss/crossentropy": 1.8208848126232624, "loss/hidden": 3.383203125, "loss/jsd": 0.0, "loss/logits": 0.1760378809645772, "step": 18510 }, { "epoch": 0.6173333333333333, "grad_norm": 28.875, "grad_norm_var": 2.8301432291666666, "learning_rate": 0.0001, "loss": 7.2006, "loss/crossentropy": 2.1221661023795604, "loss/hidden": 3.4109375, "loss/jsd": 0.0, "loss/logits": 0.19056989680975675, "step": 18520 }, { "epoch": 0.6176666666666667, "grad_norm": 36.75, "grad_norm_var": 5.8375, "learning_rate": 0.0001, "loss": 7.1924, "loss/crossentropy": 1.8749456778168678, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.18271313630975783, "step": 18530 }, { "epoch": 0.618, "grad_norm": 31.5, "grad_norm_var": 4.945247395833333, "learning_rate": 0.0001, "loss": 7.2164, "loss/crossentropy": 1.8286933556199074, "loss/hidden": 3.485546875, "loss/jsd": 0.0, "loss/logits": 0.1837154201231897, "step": 18540 }, { "epoch": 0.6183333333333333, "grad_norm": 28.5, "grad_norm_var": 3.2125, "learning_rate": 0.0001, "loss": 7.1801, "loss/crossentropy": 2.070425542443991, "loss/hidden": 3.440625, "loss/jsd": 0.0, "loss/logits": 0.19038289943709968, "step": 18550 }, { "epoch": 0.6186666666666667, "grad_norm": 28.0, "grad_norm_var": 4.301041666666666, "learning_rate": 0.0001, "loss": 7.2291, "loss/crossentropy": 1.7971087485551833, "loss/hidden": 3.437890625, "loss/jsd": 0.0, "loss/logits": 0.17366793248802423, "step": 18560 }, { "epoch": 0.619, "grad_norm": 32.5, "grad_norm_var": 2.262434895833333, "learning_rate": 0.0001, "loss": 7.3941, "loss/crossentropy": 2.0128779470920564, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.19434318216517568, "step": 18570 }, { "epoch": 0.6193333333333333, "grad_norm": 31.5, "grad_norm_var": 2.561393229166667, "learning_rate": 0.0001, "loss": 7.3291, "loss/crossentropy": 1.9434144131839275, "loss/hidden": 3.441796875, "loss/jsd": 0.0, "loss/logits": 0.20261412467807532, "step": 18580 }, { "epoch": 0.6196666666666667, "grad_norm": 29.25, "grad_norm_var": 3.544791666666667, "learning_rate": 0.0001, "loss": 7.3665, "loss/crossentropy": 2.100622844696045, "loss/hidden": 3.4046875, "loss/jsd": 0.0, "loss/logits": 0.1870765585452318, "step": 18590 }, { "epoch": 0.62, "grad_norm": 27.875, "grad_norm_var": 1.7677083333333334, "learning_rate": 0.0001, "loss": 7.2565, "loss/crossentropy": 1.9993086464703083, "loss/hidden": 3.3875, "loss/jsd": 0.0, "loss/logits": 0.17700003627687694, "step": 18600 }, { "epoch": 0.6203333333333333, "grad_norm": 36.0, "grad_norm_var": 7.925, "learning_rate": 0.0001, "loss": 7.2735, "loss/crossentropy": 2.009786033630371, "loss/hidden": 3.45390625, "loss/jsd": 0.0, "loss/logits": 0.18971458803862334, "step": 18610 }, { "epoch": 0.6206666666666667, "grad_norm": 30.625, "grad_norm_var": 7.782747395833334, "learning_rate": 0.0001, "loss": 7.2445, "loss/crossentropy": 1.9411951296031476, "loss/hidden": 3.44765625, "loss/jsd": 0.0, "loss/logits": 0.18929186034947634, "step": 18620 }, { "epoch": 0.621, "grad_norm": 29.25, "grad_norm_var": 1.6135416666666667, "learning_rate": 0.0001, "loss": 7.2858, "loss/crossentropy": 2.0704437777400018, "loss/hidden": 3.40859375, "loss/jsd": 0.0, "loss/logits": 0.19321441259235145, "step": 18630 }, { "epoch": 0.6213333333333333, "grad_norm": 29.125, "grad_norm_var": 4.93125, "learning_rate": 0.0001, "loss": 7.3834, "loss/crossentropy": 2.0620270401239393, "loss/hidden": 3.491796875, "loss/jsd": 0.0, "loss/logits": 0.20560388695448636, "step": 18640 }, { "epoch": 0.6216666666666667, "grad_norm": 28.375, "grad_norm_var": 5.256705729166667, "learning_rate": 0.0001, "loss": 7.3775, "loss/crossentropy": 2.028209035098553, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.1861821800470352, "step": 18650 }, { "epoch": 0.622, "grad_norm": 28.25, "grad_norm_var": 2.671875, "learning_rate": 0.0001, "loss": 7.3032, "loss/crossentropy": 2.118542260676622, "loss/hidden": 3.41328125, "loss/jsd": 0.0, "loss/logits": 0.19138288386166097, "step": 18660 }, { "epoch": 0.6223333333333333, "grad_norm": 29.25, "grad_norm_var": 8.783333333333333, "learning_rate": 0.0001, "loss": 7.2327, "loss/crossentropy": 2.133476397395134, "loss/hidden": 3.4671875, "loss/jsd": 0.0, "loss/logits": 0.18920790534466506, "step": 18670 }, { "epoch": 0.6226666666666667, "grad_norm": 30.5, "grad_norm_var": 2.904166666666667, "learning_rate": 0.0001, "loss": 7.2051, "loss/crossentropy": 1.9396512925624847, "loss/hidden": 3.427734375, "loss/jsd": 0.0, "loss/logits": 0.19620279967784882, "step": 18680 }, { "epoch": 0.623, "grad_norm": 32.25, "grad_norm_var": 2.780143229166667, "learning_rate": 0.0001, "loss": 7.3563, "loss/crossentropy": 1.921254688501358, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.1847329292446375, "step": 18690 }, { "epoch": 0.6233333333333333, "grad_norm": 32.0, "grad_norm_var": 2.455208333333333, "learning_rate": 0.0001, "loss": 7.3652, "loss/crossentropy": 2.0147503137588503, "loss/hidden": 3.559765625, "loss/jsd": 0.0, "loss/logits": 0.2057671697810292, "step": 18700 }, { "epoch": 0.6236666666666667, "grad_norm": 27.75, "grad_norm_var": 27.328059895833334, "learning_rate": 0.0001, "loss": 7.2372, "loss/crossentropy": 1.9983432859182357, "loss/hidden": 3.421484375, "loss/jsd": 0.0, "loss/logits": 0.18284386191517116, "step": 18710 }, { "epoch": 0.624, "grad_norm": 28.0, "grad_norm_var": 17.6134765625, "learning_rate": 0.0001, "loss": 7.345, "loss/crossentropy": 2.1294097542762755, "loss/hidden": 3.471875, "loss/jsd": 0.0, "loss/logits": 0.21455084756016732, "step": 18720 }, { "epoch": 0.6243333333333333, "grad_norm": 29.875, "grad_norm_var": 15.466666666666667, "learning_rate": 0.0001, "loss": 7.3531, "loss/crossentropy": 1.9481904812157154, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.18082798095420002, "step": 18730 }, { "epoch": 0.6246666666666667, "grad_norm": 29.5, "grad_norm_var": 2.513641885852395e+18, "learning_rate": 0.0001, "loss": 7.2922, "loss/crossentropy": 2.1236501812934874, "loss/hidden": 3.715234375, "loss/jsd": 0.0, "loss/logits": 0.1971730487421155, "step": 18740 }, { "epoch": 0.625, "grad_norm": 28.75, "grad_norm_var": 2.9284656313582746e+18, "learning_rate": 0.0001, "loss": 7.1611, "loss/crossentropy": 1.918633434176445, "loss/hidden": 3.4640625, "loss/jsd": 0.0, "loss/logits": 0.17696992810815573, "step": 18750 }, { "epoch": 0.6253333333333333, "grad_norm": 29.0, "grad_norm_var": 1.3598307291666667, "learning_rate": 0.0001, "loss": 7.2172, "loss/crossentropy": 1.834592828899622, "loss/hidden": 3.363671875, "loss/jsd": 0.0, "loss/logits": 0.1752721296623349, "step": 18760 }, { "epoch": 0.6256666666666667, "grad_norm": 29.375, "grad_norm_var": 2.380143229166667, "learning_rate": 0.0001, "loss": 7.3018, "loss/crossentropy": 1.9775620214641094, "loss/hidden": 3.4625, "loss/jsd": 0.0, "loss/logits": 0.18864405378699303, "step": 18770 }, { "epoch": 0.626, "grad_norm": 30.125, "grad_norm_var": 1.8983723958333334, "learning_rate": 0.0001, "loss": 7.2485, "loss/crossentropy": 2.181874208152294, "loss/hidden": 3.491015625, "loss/jsd": 0.0, "loss/logits": 0.20505423787981272, "step": 18780 }, { "epoch": 0.6263333333333333, "grad_norm": 31.625, "grad_norm_var": 1.2205729166666666, "learning_rate": 0.0001, "loss": 7.2625, "loss/crossentropy": 2.032845878601074, "loss/hidden": 3.45390625, "loss/jsd": 0.0, "loss/logits": 0.1912934934720397, "step": 18790 }, { "epoch": 0.6266666666666667, "grad_norm": 28.125, "grad_norm_var": 6.557747395833333, "learning_rate": 0.0001, "loss": 7.1859, "loss/crossentropy": 1.8757417768239975, "loss/hidden": 3.4359375, "loss/jsd": 0.0, "loss/logits": 0.17532084425911307, "step": 18800 }, { "epoch": 0.627, "grad_norm": 30.5, "grad_norm_var": 8.772330729166667, "learning_rate": 0.0001, "loss": 7.1583, "loss/crossentropy": 1.6718830045312643, "loss/hidden": 3.3765625, "loss/jsd": 0.0, "loss/logits": 0.16415114253759383, "step": 18810 }, { "epoch": 0.6273333333333333, "grad_norm": 30.375, "grad_norm_var": 29.733072916666668, "learning_rate": 0.0001, "loss": 7.2655, "loss/crossentropy": 1.927823992818594, "loss/hidden": 3.43671875, "loss/jsd": 0.0, "loss/logits": 0.18012394476681948, "step": 18820 }, { "epoch": 0.6276666666666667, "grad_norm": 29.625, "grad_norm_var": 13.2125, "learning_rate": 0.0001, "loss": 7.3041, "loss/crossentropy": 1.9741487562656403, "loss/hidden": 3.337109375, "loss/jsd": 0.0, "loss/logits": 0.1718524781987071, "step": 18830 }, { "epoch": 0.628, "grad_norm": 30.25, "grad_norm_var": 2.154166666666667, "learning_rate": 0.0001, "loss": 7.2255, "loss/crossentropy": 1.8713882826268673, "loss/hidden": 3.3484375, "loss/jsd": 0.0, "loss/logits": 0.17388084828853606, "step": 18840 }, { "epoch": 0.6283333333333333, "grad_norm": 29.375, "grad_norm_var": 1.92265625, "learning_rate": 0.0001, "loss": 7.1317, "loss/crossentropy": 1.827824203670025, "loss/hidden": 3.348828125, "loss/jsd": 0.0, "loss/logits": 0.17161936489865184, "step": 18850 }, { "epoch": 0.6286666666666667, "grad_norm": 31.0, "grad_norm_var": 6.584375, "learning_rate": 0.0001, "loss": 7.2858, "loss/crossentropy": 1.8326379302889109, "loss/hidden": 3.48203125, "loss/jsd": 0.0, "loss/logits": 0.1798766848631203, "step": 18860 }, { "epoch": 0.629, "grad_norm": 30.125, "grad_norm_var": 162.99680989583334, "learning_rate": 0.0001, "loss": 7.2863, "loss/crossentropy": 2.0633171133697035, "loss/hidden": 3.396484375, "loss/jsd": 0.0, "loss/logits": 0.1819724513217807, "step": 18870 }, { "epoch": 0.6293333333333333, "grad_norm": 28.5, "grad_norm_var": 15.788541666666667, "learning_rate": 0.0001, "loss": 7.2693, "loss/crossentropy": 2.0134837619960306, "loss/hidden": 3.507421875, "loss/jsd": 0.0, "loss/logits": 0.19326711129397153, "step": 18880 }, { "epoch": 0.6296666666666667, "grad_norm": 29.125, "grad_norm_var": 1.0931640625, "learning_rate": 0.0001, "loss": 7.1705, "loss/crossentropy": 1.956085129827261, "loss/hidden": 3.302734375, "loss/jsd": 0.0, "loss/logits": 0.17240228615701197, "step": 18890 }, { "epoch": 0.63, "grad_norm": 29.75, "grad_norm_var": 0.8551432291666666, "learning_rate": 0.0001, "loss": 7.3464, "loss/crossentropy": 2.0210869297385217, "loss/hidden": 3.4390625, "loss/jsd": 0.0, "loss/logits": 0.1942229462787509, "step": 18900 }, { "epoch": 0.6303333333333333, "grad_norm": 35.75, "grad_norm_var": 3.79765625, "learning_rate": 0.0001, "loss": 7.2465, "loss/crossentropy": 1.948809690773487, "loss/hidden": 3.43671875, "loss/jsd": 0.0, "loss/logits": 0.1928313620388508, "step": 18910 }, { "epoch": 0.6306666666666667, "grad_norm": 28.875, "grad_norm_var": 4.837434895833334, "learning_rate": 0.0001, "loss": 7.2156, "loss/crossentropy": 1.9722835347056389, "loss/hidden": 3.476171875, "loss/jsd": 0.0, "loss/logits": 0.1896892311051488, "step": 18920 }, { "epoch": 0.631, "grad_norm": 29.25, "grad_norm_var": 3.83515625, "learning_rate": 0.0001, "loss": 7.2979, "loss/crossentropy": 2.012093511223793, "loss/hidden": 3.478515625, "loss/jsd": 0.0, "loss/logits": 0.1906515618786216, "step": 18930 }, { "epoch": 0.6313333333333333, "grad_norm": 31.875, "grad_norm_var": 19.289583333333333, "learning_rate": 0.0001, "loss": 7.136, "loss/crossentropy": 1.880264090001583, "loss/hidden": 3.22421875, "loss/jsd": 0.0, "loss/logits": 0.15467084813863038, "step": 18940 }, { "epoch": 0.6316666666666667, "grad_norm": 27.75, "grad_norm_var": 1.2327473958333333, "learning_rate": 0.0001, "loss": 7.2192, "loss/crossentropy": 1.9712775982916355, "loss/hidden": 3.369921875, "loss/jsd": 0.0, "loss/logits": 0.17950046579353512, "step": 18950 }, { "epoch": 0.632, "grad_norm": 29.875, "grad_norm_var": 2.8473307291666665, "learning_rate": 0.0001, "loss": 7.1178, "loss/crossentropy": 1.9216239601373672, "loss/hidden": 3.45625, "loss/jsd": 0.0, "loss/logits": 0.1952439049258828, "step": 18960 }, { "epoch": 0.6323333333333333, "grad_norm": 29.875, "grad_norm_var": 3.701041666666667, "learning_rate": 0.0001, "loss": 7.2086, "loss/crossentropy": 2.0472032487392426, "loss/hidden": 3.440625, "loss/jsd": 0.0, "loss/logits": 0.20150937009602785, "step": 18970 }, { "epoch": 0.6326666666666667, "grad_norm": 29.0, "grad_norm_var": 1.0854166666666667, "learning_rate": 0.0001, "loss": 7.3013, "loss/crossentropy": 1.989279668033123, "loss/hidden": 3.3875, "loss/jsd": 0.0, "loss/logits": 0.18323071394115686, "step": 18980 }, { "epoch": 0.633, "grad_norm": 30.0, "grad_norm_var": 2.3681640625, "learning_rate": 0.0001, "loss": 7.3393, "loss/crossentropy": 2.287104348838329, "loss/hidden": 3.32578125, "loss/jsd": 0.0, "loss/logits": 0.1947536811232567, "step": 18990 }, { "epoch": 0.6333333333333333, "grad_norm": 31.875, "grad_norm_var": 3.4749348958333335, "learning_rate": 0.0001, "loss": 7.2099, "loss/crossentropy": 1.9214728213846684, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.17559929257258772, "step": 19000 }, { "epoch": 0.6336666666666667, "grad_norm": 31.125, "grad_norm_var": 3.5603515625, "learning_rate": 0.0001, "loss": 7.2661, "loss/crossentropy": 2.087320441007614, "loss/hidden": 3.32890625, "loss/jsd": 0.0, "loss/logits": 0.1870307233184576, "step": 19010 }, { "epoch": 0.634, "grad_norm": 29.25, "grad_norm_var": 4.784375, "learning_rate": 0.0001, "loss": 7.3193, "loss/crossentropy": 1.8164633564651012, "loss/hidden": 3.370703125, "loss/jsd": 0.0, "loss/logits": 0.1848822046071291, "step": 19020 }, { "epoch": 0.6343333333333333, "grad_norm": 27.625, "grad_norm_var": 3.1925595261346253e+18, "learning_rate": 0.0001, "loss": 7.2483, "loss/crossentropy": 1.9285401314496995, "loss/hidden": 3.428125, "loss/jsd": 0.0, "loss/logits": 0.19006774052977563, "step": 19030 }, { "epoch": 0.6346666666666667, "grad_norm": 27.625, "grad_norm_var": 3.222606980325852e+18, "learning_rate": 0.0001, "loss": 7.2282, "loss/crossentropy": 1.9070459872484207, "loss/hidden": 3.366015625, "loss/jsd": 0.0, "loss/logits": 0.18037862414494157, "step": 19040 }, { "epoch": 0.635, "grad_norm": 69.0, "grad_norm_var": 105.03515625, "learning_rate": 0.0001, "loss": 7.1474, "loss/crossentropy": 1.9003420658409595, "loss/hidden": 3.4328125, "loss/jsd": 0.0, "loss/logits": 0.1778818178921938, "step": 19050 }, { "epoch": 0.6353333333333333, "grad_norm": 27.875, "grad_norm_var": 97.965625, "learning_rate": 0.0001, "loss": 7.1927, "loss/crossentropy": 1.8204132683575154, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.1777735896408558, "step": 19060 }, { "epoch": 0.6356666666666667, "grad_norm": 34.0, "grad_norm_var": 3.316666666666667, "learning_rate": 0.0001, "loss": 7.1993, "loss/crossentropy": 1.9050064146518708, "loss/hidden": 3.388671875, "loss/jsd": 0.0, "loss/logits": 0.180423572473228, "step": 19070 }, { "epoch": 0.636, "grad_norm": 38.5, "grad_norm_var": 27.94765625, "learning_rate": 0.0001, "loss": 7.2812, "loss/crossentropy": 2.0620297893881796, "loss/hidden": 3.444140625, "loss/jsd": 0.0, "loss/logits": 0.19334815796464683, "step": 19080 }, { "epoch": 0.6363333333333333, "grad_norm": 32.0, "grad_norm_var": 105.84348958333334, "learning_rate": 0.0001, "loss": 7.1699, "loss/crossentropy": 1.925288773328066, "loss/hidden": 3.387109375, "loss/jsd": 0.0, "loss/logits": 0.17268607756122947, "step": 19090 }, { "epoch": 0.6366666666666667, "grad_norm": 30.0, "grad_norm_var": 89.19264322916666, "learning_rate": 0.0001, "loss": 7.2762, "loss/crossentropy": 1.787583889067173, "loss/hidden": 3.367578125, "loss/jsd": 0.0, "loss/logits": 0.177067665848881, "step": 19100 }, { "epoch": 0.637, "grad_norm": 37.5, "grad_norm_var": 7.090625, "learning_rate": 0.0001, "loss": 7.3227, "loss/crossentropy": 2.011252249777317, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.18080266881734133, "step": 19110 }, { "epoch": 0.6373333333333333, "grad_norm": 29.375, "grad_norm_var": 8.143489583333333, "learning_rate": 0.0001, "loss": 7.2473, "loss/crossentropy": 1.9220270276069642, "loss/hidden": 3.31015625, "loss/jsd": 0.0, "loss/logits": 0.17707198914140462, "step": 19120 }, { "epoch": 0.6376666666666667, "grad_norm": 29.375, "grad_norm_var": 3.9205729166666665, "learning_rate": 0.0001, "loss": 7.3549, "loss/crossentropy": 2.023925003409386, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.19612569622695447, "step": 19130 }, { "epoch": 0.638, "grad_norm": 28.125, "grad_norm_var": 3.76640625, "learning_rate": 0.0001, "loss": 7.1915, "loss/crossentropy": 1.915830608457327, "loss/hidden": 3.48984375, "loss/jsd": 0.0, "loss/logits": 0.1768260754644871, "step": 19140 }, { "epoch": 0.6383333333333333, "grad_norm": 28.875, "grad_norm_var": 1.9410807291666667, "learning_rate": 0.0001, "loss": 7.187, "loss/crossentropy": 1.999140276759863, "loss/hidden": 3.335546875, "loss/jsd": 0.0, "loss/logits": 0.17691278262063861, "step": 19150 }, { "epoch": 0.6386666666666667, "grad_norm": 32.25, "grad_norm_var": 2.4270833333333335, "learning_rate": 0.0001, "loss": 7.2192, "loss/crossentropy": 1.9270139768719674, "loss/hidden": 3.46953125, "loss/jsd": 0.0, "loss/logits": 0.1901017867028713, "step": 19160 }, { "epoch": 0.639, "grad_norm": 30.25, "grad_norm_var": 2.7405598958333335, "learning_rate": 0.0001, "loss": 7.1155, "loss/crossentropy": 1.7382242754101753, "loss/hidden": 3.305078125, "loss/jsd": 0.0, "loss/logits": 0.162950631743297, "step": 19170 }, { "epoch": 0.6393333333333333, "grad_norm": 32.0, "grad_norm_var": 2.5978515625, "learning_rate": 0.0001, "loss": 7.1777, "loss/crossentropy": 2.0843997061252595, "loss/hidden": 3.328515625, "loss/jsd": 0.0, "loss/logits": 0.17932662814855577, "step": 19180 }, { "epoch": 0.6396666666666667, "grad_norm": 30.25, "grad_norm_var": 2.9614583333333333, "learning_rate": 0.0001, "loss": 7.3943, "loss/crossentropy": 2.032052809000015, "loss/hidden": 3.49921875, "loss/jsd": 0.0, "loss/logits": 0.20184972528368234, "step": 19190 }, { "epoch": 0.64, "grad_norm": 29.125, "grad_norm_var": 1.7854166666666667, "learning_rate": 0.0001, "loss": 7.3384, "loss/crossentropy": 2.018916334211826, "loss/hidden": 3.396484375, "loss/jsd": 0.0, "loss/logits": 0.1918800650164485, "step": 19200 }, { "epoch": 0.6403333333333333, "grad_norm": 30.25, "grad_norm_var": 1552.3135416666667, "learning_rate": 0.0001, "loss": 7.3713, "loss/crossentropy": 1.9095537073910236, "loss/hidden": 3.518359375, "loss/jsd": 0.0, "loss/logits": 0.2061590164899826, "step": 19210 }, { "epoch": 0.6406666666666667, "grad_norm": 28.875, "grad_norm_var": 49.25182291666667, "learning_rate": 0.0001, "loss": 7.2151, "loss/crossentropy": 2.099197779595852, "loss/hidden": 3.44765625, "loss/jsd": 0.0, "loss/logits": 0.19360438380390405, "step": 19220 }, { "epoch": 0.641, "grad_norm": 30.25, "grad_norm_var": 2.5551432291666667, "learning_rate": 0.0001, "loss": 7.3194, "loss/crossentropy": 2.0210408866405487, "loss/hidden": 3.492578125, "loss/jsd": 0.0, "loss/logits": 0.20467441752552987, "step": 19230 }, { "epoch": 0.6413333333333333, "grad_norm": 30.0, "grad_norm_var": 1.3552083333333333, "learning_rate": 0.0001, "loss": 7.2032, "loss/crossentropy": 1.9335297152400017, "loss/hidden": 3.3140625, "loss/jsd": 0.0, "loss/logits": 0.1674502532929182, "step": 19240 }, { "epoch": 0.6416666666666667, "grad_norm": 30.875, "grad_norm_var": 2.1056640625, "learning_rate": 0.0001, "loss": 7.159, "loss/crossentropy": 2.065697592496872, "loss/hidden": 3.300390625, "loss/jsd": 0.0, "loss/logits": 0.18475553784519433, "step": 19250 }, { "epoch": 0.642, "grad_norm": 29.0, "grad_norm_var": 3.226822916666667, "learning_rate": 0.0001, "loss": 7.1542, "loss/crossentropy": 1.8511378549039363, "loss/hidden": 3.283203125, "loss/jsd": 0.0, "loss/logits": 0.1605996700003743, "step": 19260 }, { "epoch": 0.6423333333333333, "grad_norm": 30.25, "grad_norm_var": 3.4809895833333333, "learning_rate": 0.0001, "loss": 7.2327, "loss/crossentropy": 2.0499357663095, "loss/hidden": 3.498046875, "loss/jsd": 0.0, "loss/logits": 0.20136928129941226, "step": 19270 }, { "epoch": 0.6426666666666667, "grad_norm": 29.125, "grad_norm_var": 5.045572916666667, "learning_rate": 0.0001, "loss": 7.2198, "loss/crossentropy": 1.9678279034793378, "loss/hidden": 3.422265625, "loss/jsd": 0.0, "loss/logits": 0.18454033583402635, "step": 19280 }, { "epoch": 0.643, "grad_norm": 31.25, "grad_norm_var": 3.6729166666666666, "learning_rate": 0.0001, "loss": 7.2615, "loss/crossentropy": 2.1519006073474882, "loss/hidden": 3.477734375, "loss/jsd": 0.0, "loss/logits": 0.20591103062033653, "step": 19290 }, { "epoch": 0.6433333333333333, "grad_norm": 27.875, "grad_norm_var": 3.9145182291666667, "learning_rate": 0.0001, "loss": 7.2469, "loss/crossentropy": 2.1101471945643424, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.19394804071635008, "step": 19300 }, { "epoch": 0.6436666666666667, "grad_norm": 29.5, "grad_norm_var": 4.402018229166667, "learning_rate": 0.0001, "loss": 7.2472, "loss/crossentropy": 1.935300499200821, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.19340323600918055, "step": 19310 }, { "epoch": 0.644, "grad_norm": 32.25, "grad_norm_var": 1.8738932291666666, "learning_rate": 0.0001, "loss": 7.2989, "loss/crossentropy": 1.921600653976202, "loss/hidden": 3.483984375, "loss/jsd": 0.0, "loss/logits": 0.18406634870916605, "step": 19320 }, { "epoch": 0.6443333333333333, "grad_norm": 29.0, "grad_norm_var": 168.18958333333333, "learning_rate": 0.0001, "loss": 7.2885, "loss/crossentropy": 2.038621720671654, "loss/hidden": 3.4203125, "loss/jsd": 0.0, "loss/logits": 0.1950019085779786, "step": 19330 }, { "epoch": 0.6446666666666667, "grad_norm": 29.375, "grad_norm_var": 3.044433316753668e+18, "learning_rate": 0.0001, "loss": 7.345, "loss/crossentropy": 2.0226799666881563, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.18214802257716656, "step": 19340 }, { "epoch": 0.645, "grad_norm": 31.25, "grad_norm_var": 8.398372395833333, "learning_rate": 0.0001, "loss": 7.187, "loss/crossentropy": 1.8102751962840558, "loss/hidden": 3.36171875, "loss/jsd": 0.0, "loss/logits": 0.1700789694674313, "step": 19350 }, { "epoch": 0.6453333333333333, "grad_norm": 30.0, "grad_norm_var": 1.7872395833333334, "learning_rate": 0.0001, "loss": 7.2359, "loss/crossentropy": 1.9931891098618508, "loss/hidden": 3.4265625, "loss/jsd": 0.0, "loss/logits": 0.17683383747935294, "step": 19360 }, { "epoch": 0.6456666666666667, "grad_norm": 30.375, "grad_norm_var": 1.4358723958333333, "learning_rate": 0.0001, "loss": 7.1732, "loss/crossentropy": 2.0232337579131126, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.18284091874957084, "step": 19370 }, { "epoch": 0.646, "grad_norm": 29.5, "grad_norm_var": 0.6009765625, "learning_rate": 0.0001, "loss": 7.2253, "loss/crossentropy": 1.8738485649228096, "loss/hidden": 3.35078125, "loss/jsd": 0.0, "loss/logits": 0.1725960264913738, "step": 19380 }, { "epoch": 0.6463333333333333, "grad_norm": 31.875, "grad_norm_var": 0.6197265625, "learning_rate": 0.0001, "loss": 7.429, "loss/crossentropy": 1.9788135439157486, "loss/hidden": 3.4015625, "loss/jsd": 0.0, "loss/logits": 0.18516494389623403, "step": 19390 }, { "epoch": 0.6466666666666666, "grad_norm": 31.5, "grad_norm_var": 2.3869140625, "learning_rate": 0.0001, "loss": 7.3335, "loss/crossentropy": 1.9256047651171684, "loss/hidden": 3.351171875, "loss/jsd": 0.0, "loss/logits": 0.1791953630745411, "step": 19400 }, { "epoch": 0.647, "grad_norm": 28.75, "grad_norm_var": 2.1192057291666666, "learning_rate": 0.0001, "loss": 7.2155, "loss/crossentropy": 2.054460673779249, "loss/hidden": 3.464453125, "loss/jsd": 0.0, "loss/logits": 0.187739503569901, "step": 19410 }, { "epoch": 0.6473333333333333, "grad_norm": 29.0, "grad_norm_var": 1.61875, "learning_rate": 0.0001, "loss": 7.2894, "loss/crossentropy": 2.060872791707516, "loss/hidden": 3.343359375, "loss/jsd": 0.0, "loss/logits": 0.18351667653769255, "step": 19420 }, { "epoch": 0.6476666666666666, "grad_norm": 30.625, "grad_norm_var": 1.8587890625, "learning_rate": 0.0001, "loss": 7.192, "loss/crossentropy": 2.001495309919119, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.17514019403606654, "step": 19430 }, { "epoch": 0.648, "grad_norm": 27.75, "grad_norm_var": 1.8275390625, "learning_rate": 0.0001, "loss": 7.1692, "loss/crossentropy": 2.0528273679316045, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.17617126796394586, "step": 19440 }, { "epoch": 0.6483333333333333, "grad_norm": 31.5, "grad_norm_var": 43.69680989583333, "learning_rate": 0.0001, "loss": 7.2738, "loss/crossentropy": 2.047633108496666, "loss/hidden": 3.608984375, "loss/jsd": 0.0, "loss/logits": 0.2055029073730111, "step": 19450 }, { "epoch": 0.6486666666666666, "grad_norm": 30.125, "grad_norm_var": 38.22890625, "learning_rate": 0.0001, "loss": 7.3121, "loss/crossentropy": 2.028029265999794, "loss/hidden": 3.498046875, "loss/jsd": 0.0, "loss/logits": 0.20816227747127414, "step": 19460 }, { "epoch": 0.649, "grad_norm": 29.125, "grad_norm_var": 2.692122395833333, "learning_rate": 0.0001, "loss": 7.2317, "loss/crossentropy": 1.8550020724534988, "loss/hidden": 3.447265625, "loss/jsd": 0.0, "loss/logits": 0.18233551736921072, "step": 19470 }, { "epoch": 0.6493333333333333, "grad_norm": 29.25, "grad_norm_var": 1.4447916666666667, "learning_rate": 0.0001, "loss": 7.1568, "loss/crossentropy": 1.9527828395366669, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.1741065200418234, "step": 19480 }, { "epoch": 0.6496666666666666, "grad_norm": 28.125, "grad_norm_var": 4.705208333333333, "learning_rate": 0.0001, "loss": 7.2891, "loss/crossentropy": 1.9269748218357563, "loss/hidden": 3.336328125, "loss/jsd": 0.0, "loss/logits": 0.17439142554067075, "step": 19490 }, { "epoch": 0.65, "grad_norm": 31.375, "grad_norm_var": 3.3749553110757407e+18, "learning_rate": 0.0001, "loss": 7.2752, "loss/crossentropy": 2.059894037991762, "loss/hidden": 3.385546875, "loss/jsd": 0.0, "loss/logits": 0.17955098627135158, "step": 19500 }, { "epoch": 0.6503333333333333, "grad_norm": 29.0, "grad_norm_var": 3.37495531177231e+18, "learning_rate": 0.0001, "loss": 7.2252, "loss/crossentropy": 2.068094329535961, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.19912790888920426, "step": 19510 }, { "epoch": 0.6506666666666666, "grad_norm": 28.375, "grad_norm_var": 1.8473307291666667, "learning_rate": 0.0001, "loss": 7.2212, "loss/crossentropy": 1.9414153158664704, "loss/hidden": 3.45078125, "loss/jsd": 0.0, "loss/logits": 0.1951848266646266, "step": 19520 }, { "epoch": 0.651, "grad_norm": 29.125, "grad_norm_var": 2.3379557291666666, "learning_rate": 0.0001, "loss": 7.3036, "loss/crossentropy": 2.078878749907017, "loss/hidden": 3.4109375, "loss/jsd": 0.0, "loss/logits": 0.2018786547705531, "step": 19530 }, { "epoch": 0.6513333333333333, "grad_norm": 29.625, "grad_norm_var": 3.0004557291666667, "learning_rate": 0.0001, "loss": 7.2883, "loss/crossentropy": 1.9592467445880175, "loss/hidden": 3.35234375, "loss/jsd": 0.0, "loss/logits": 0.17653894629329442, "step": 19540 }, { "epoch": 0.6516666666666666, "grad_norm": 28.0, "grad_norm_var": 4.493684895833334, "learning_rate": 0.0001, "loss": 7.1953, "loss/crossentropy": 2.1314990133047105, "loss/hidden": 3.483203125, "loss/jsd": 0.0, "loss/logits": 0.19988764822483063, "step": 19550 }, { "epoch": 0.652, "grad_norm": 28.875, "grad_norm_var": 33.222330729166664, "learning_rate": 0.0001, "loss": 7.1644, "loss/crossentropy": 1.9124360397458076, "loss/hidden": 3.39609375, "loss/jsd": 0.0, "loss/logits": 0.17645901460200547, "step": 19560 }, { "epoch": 0.6523333333333333, "grad_norm": 31.375, "grad_norm_var": 3.252795169959139e+18, "learning_rate": 0.0001, "loss": 7.3304, "loss/crossentropy": 2.0815951839089393, "loss/hidden": 3.624609375, "loss/jsd": 0.0, "loss/logits": 0.18988058995455503, "step": 19570 }, { "epoch": 0.6526666666666666, "grad_norm": 30.125, "grad_norm_var": 3.252795171589849e+18, "learning_rate": 0.0001, "loss": 7.1942, "loss/crossentropy": 1.9187524601817132, "loss/hidden": 3.533984375, "loss/jsd": 0.0, "loss/logits": 0.20262869838625192, "step": 19580 }, { "epoch": 0.653, "grad_norm": 28.625, "grad_norm_var": 2.31015625, "learning_rate": 0.0001, "loss": 7.106, "loss/crossentropy": 1.8900284066796302, "loss/hidden": 3.35, "loss/jsd": 0.0, "loss/logits": 0.1668266888707876, "step": 19590 }, { "epoch": 0.6533333333333333, "grad_norm": 28.625, "grad_norm_var": 1.8934895833333334, "learning_rate": 0.0001, "loss": 7.2263, "loss/crossentropy": 2.046683336794376, "loss/hidden": 3.33828125, "loss/jsd": 0.0, "loss/logits": 0.19105331562459468, "step": 19600 }, { "epoch": 0.6536666666666666, "grad_norm": 34.0, "grad_norm_var": 5.115625, "learning_rate": 0.0001, "loss": 7.2001, "loss/crossentropy": 2.1162250876426696, "loss/hidden": 3.2640625, "loss/jsd": 0.0, "loss/logits": 0.18407556228339672, "step": 19610 }, { "epoch": 0.654, "grad_norm": 31.125, "grad_norm_var": 5.689518229166667, "learning_rate": 0.0001, "loss": 7.1559, "loss/crossentropy": 1.86045109257102, "loss/hidden": 3.34921875, "loss/jsd": 0.0, "loss/logits": 0.17549580400809645, "step": 19620 }, { "epoch": 0.6543333333333333, "grad_norm": 28.875, "grad_norm_var": 1.3082682291666667, "learning_rate": 0.0001, "loss": 7.2174, "loss/crossentropy": 2.0019285306334496, "loss/hidden": 3.3875, "loss/jsd": 0.0, "loss/logits": 0.1780899941921234, "step": 19630 }, { "epoch": 0.6546666666666666, "grad_norm": 29.875, "grad_norm_var": 2.942122395833333, "learning_rate": 0.0001, "loss": 7.4374, "loss/crossentropy": 2.0428459446877243, "loss/hidden": 3.458203125, "loss/jsd": 0.0, "loss/logits": 0.19818142866715788, "step": 19640 }, { "epoch": 0.655, "grad_norm": 27.5, "grad_norm_var": 2.528059895833333, "learning_rate": 0.0001, "loss": 7.2279, "loss/crossentropy": 1.9794667072594165, "loss/hidden": 3.378515625, "loss/jsd": 0.0, "loss/logits": 0.18814364075660706, "step": 19650 }, { "epoch": 0.6553333333333333, "grad_norm": 29.25, "grad_norm_var": 3.2223307291666665, "learning_rate": 0.0001, "loss": 7.2986, "loss/crossentropy": 1.961393266171217, "loss/hidden": 3.33203125, "loss/jsd": 0.0, "loss/logits": 0.1820337988436222, "step": 19660 }, { "epoch": 0.6556666666666666, "grad_norm": 30.125, "grad_norm_var": 5.751822916666667, "learning_rate": 0.0001, "loss": 7.2377, "loss/crossentropy": 1.9985428005456924, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.18256305437535048, "step": 19670 }, { "epoch": 0.656, "grad_norm": 27.125, "grad_norm_var": 6.39765625, "learning_rate": 0.0001, "loss": 7.3107, "loss/crossentropy": 2.0353022858500482, "loss/hidden": 3.459375, "loss/jsd": 0.0, "loss/logits": 0.19593825452029706, "step": 19680 }, { "epoch": 0.6563333333333333, "grad_norm": 27.25, "grad_norm_var": 9.867708333333333, "learning_rate": 0.0001, "loss": 7.204, "loss/crossentropy": 1.870980378240347, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.1852215439081192, "step": 19690 }, { "epoch": 0.6566666666666666, "grad_norm": 28.625, "grad_norm_var": 10.408072916666667, "learning_rate": 0.0001, "loss": 7.2527, "loss/crossentropy": 1.8724316775798797, "loss/hidden": 3.458203125, "loss/jsd": 0.0, "loss/logits": 0.18473451090976595, "step": 19700 }, { "epoch": 0.657, "grad_norm": 28.5, "grad_norm_var": 2.037239583333333, "learning_rate": 0.0001, "loss": 7.1134, "loss/crossentropy": 2.0168904647231103, "loss/hidden": 3.38515625, "loss/jsd": 0.0, "loss/logits": 0.18023486584424972, "step": 19710 }, { "epoch": 0.6573333333333333, "grad_norm": 30.5, "grad_norm_var": 1.4872395833333334, "learning_rate": 0.0001, "loss": 7.2431, "loss/crossentropy": 1.998997327685356, "loss/hidden": 3.383203125, "loss/jsd": 0.0, "loss/logits": 0.18659929204732179, "step": 19720 }, { "epoch": 0.6576666666666666, "grad_norm": 27.375, "grad_norm_var": 1.6223307291666667, "learning_rate": 0.0001, "loss": 7.1294, "loss/crossentropy": 1.8406663365662097, "loss/hidden": 3.511328125, "loss/jsd": 0.0, "loss/logits": 0.1851378358900547, "step": 19730 }, { "epoch": 0.658, "grad_norm": 28.125, "grad_norm_var": 1.61015625, "learning_rate": 0.0001, "loss": 7.2561, "loss/crossentropy": 2.0889442384243013, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.1965428687632084, "step": 19740 }, { "epoch": 0.6583333333333333, "grad_norm": 47.5, "grad_norm_var": 23.92890625, "learning_rate": 0.0001, "loss": 7.2192, "loss/crossentropy": 2.155116944760084, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.18333613704890012, "step": 19750 }, { "epoch": 0.6586666666666666, "grad_norm": 31.25, "grad_norm_var": 2.986167998884855e+18, "learning_rate": 0.0001, "loss": 7.1296, "loss/crossentropy": 1.8822063371539115, "loss/hidden": 3.42578125, "loss/jsd": 0.0, "loss/logits": 0.19306997573003173, "step": 19760 }, { "epoch": 0.659, "grad_norm": 29.5, "grad_norm_var": 8.470572916666667, "learning_rate": 0.0001, "loss": 7.1673, "loss/crossentropy": 1.7748362384736538, "loss/hidden": 3.282421875, "loss/jsd": 0.0, "loss/logits": 0.1609759763814509, "step": 19770 }, { "epoch": 0.6593333333333333, "grad_norm": 29.125, "grad_norm_var": 2.287955729166667, "learning_rate": 0.0001, "loss": 7.2061, "loss/crossentropy": 1.9508386358618737, "loss/hidden": 3.41171875, "loss/jsd": 0.0, "loss/logits": 0.17473984798416495, "step": 19780 }, { "epoch": 0.6596666666666666, "grad_norm": 30.125, "grad_norm_var": 4.895833333333333, "learning_rate": 0.0001, "loss": 7.1841, "loss/crossentropy": 1.9278103783726692, "loss/hidden": 3.478125, "loss/jsd": 0.0, "loss/logits": 0.1785666298121214, "step": 19790 }, { "epoch": 0.66, "grad_norm": 28.625, "grad_norm_var": 5.084375, "learning_rate": 0.0001, "loss": 7.1184, "loss/crossentropy": 1.9791350223124027, "loss/hidden": 3.324609375, "loss/jsd": 0.0, "loss/logits": 0.18257208168506622, "step": 19800 }, { "epoch": 0.6603333333333333, "grad_norm": 28.375, "grad_norm_var": 6.965559895833334, "learning_rate": 0.0001, "loss": 7.2522, "loss/crossentropy": 2.157275756448507, "loss/hidden": 3.4640625, "loss/jsd": 0.0, "loss/logits": 0.20591448061168194, "step": 19810 }, { "epoch": 0.6606666666666666, "grad_norm": 30.75, "grad_norm_var": 7.740559895833333, "learning_rate": 0.0001, "loss": 7.0853, "loss/crossentropy": 2.0464220918715, "loss/hidden": 3.36875, "loss/jsd": 0.0, "loss/logits": 0.18198793176561595, "step": 19820 }, { "epoch": 0.661, "grad_norm": 30.875, "grad_norm_var": 8.008072916666666, "learning_rate": 0.0001, "loss": 7.0669, "loss/crossentropy": 1.8443207003176212, "loss/hidden": 3.258984375, "loss/jsd": 0.0, "loss/logits": 0.15769521850161256, "step": 19830 }, { "epoch": 0.6613333333333333, "grad_norm": 28.375, "grad_norm_var": 18.4509765625, "learning_rate": 0.0001, "loss": 7.2976, "loss/crossentropy": 2.046570193767548, "loss/hidden": 3.35390625, "loss/jsd": 0.0, "loss/logits": 0.17662686351686716, "step": 19840 }, { "epoch": 0.6616666666666666, "grad_norm": 29.25, "grad_norm_var": 16.310872395833332, "learning_rate": 0.0001, "loss": 7.329, "loss/crossentropy": 1.952541184425354, "loss/hidden": 3.38125, "loss/jsd": 0.0, "loss/logits": 0.1775679487735033, "step": 19850 }, { "epoch": 0.662, "grad_norm": 27.0, "grad_norm_var": 2.4229166666666666, "learning_rate": 0.0001, "loss": 7.09, "loss/crossentropy": 1.8852140799164772, "loss/hidden": 3.331640625, "loss/jsd": 0.0, "loss/logits": 0.16374046923592686, "step": 19860 }, { "epoch": 0.6623333333333333, "grad_norm": 30.25, "grad_norm_var": 5.683072916666666, "learning_rate": 0.0001, "loss": 7.4347, "loss/crossentropy": 2.0232871949672697, "loss/hidden": 3.46015625, "loss/jsd": 0.0, "loss/logits": 0.19111645873636007, "step": 19870 }, { "epoch": 0.6626666666666666, "grad_norm": 29.0, "grad_norm_var": 2.3053504044132728e+18, "learning_rate": 0.0001, "loss": 7.3594, "loss/crossentropy": 1.878128642588854, "loss/hidden": 3.49453125, "loss/jsd": 0.0, "loss/logits": 0.19787954948842526, "step": 19880 }, { "epoch": 0.663, "grad_norm": 36.0, "grad_norm_var": 9.0962890625, "learning_rate": 0.0001, "loss": 7.0747, "loss/crossentropy": 2.044445390999317, "loss/hidden": 3.32734375, "loss/jsd": 0.0, "loss/logits": 0.17435509245842695, "step": 19890 }, { "epoch": 0.6633333333333333, "grad_norm": 29.0, "grad_norm_var": 10.116080729166667, "learning_rate": 0.0001, "loss": 7.1119, "loss/crossentropy": 1.9552796825766563, "loss/hidden": 3.323046875, "loss/jsd": 0.0, "loss/logits": 0.1702726454474032, "step": 19900 }, { "epoch": 0.6636666666666666, "grad_norm": 30.125, "grad_norm_var": 7.31015625, "learning_rate": 0.0001, "loss": 7.2149, "loss/crossentropy": 2.1259602226316927, "loss/hidden": 3.384765625, "loss/jsd": 0.0, "loss/logits": 0.1897933019325137, "step": 19910 }, { "epoch": 0.664, "grad_norm": 32.25, "grad_norm_var": 2.8744140625, "learning_rate": 0.0001, "loss": 7.2954, "loss/crossentropy": 2.0629692025482655, "loss/hidden": 3.366015625, "loss/jsd": 0.0, "loss/logits": 0.18350183349102736, "step": 19920 }, { "epoch": 0.6643333333333333, "grad_norm": 30.5, "grad_norm_var": 2.5083333333333333, "learning_rate": 0.0001, "loss": 7.2325, "loss/crossentropy": 2.0942607849836348, "loss/hidden": 3.30390625, "loss/jsd": 0.0, "loss/logits": 0.17349220057949424, "step": 19930 }, { "epoch": 0.6646666666666666, "grad_norm": 29.875, "grad_norm_var": 1.5551432291666667, "learning_rate": 0.0001, "loss": 7.262, "loss/crossentropy": 1.8072184488177299, "loss/hidden": 3.3953125, "loss/jsd": 0.0, "loss/logits": 0.17160013020038606, "step": 19940 }, { "epoch": 0.665, "grad_norm": 27.875, "grad_norm_var": 2.1666015625, "learning_rate": 0.0001, "loss": 7.3404, "loss/crossentropy": 2.0357220128178595, "loss/hidden": 3.450390625, "loss/jsd": 0.0, "loss/logits": 0.19628365561366082, "step": 19950 }, { "epoch": 0.6653333333333333, "grad_norm": 33.75, "grad_norm_var": 2.698958333333333, "learning_rate": 0.0001, "loss": 7.2394, "loss/crossentropy": 1.988581009209156, "loss/hidden": 3.430078125, "loss/jsd": 0.0, "loss/logits": 0.18054685965180398, "step": 19960 }, { "epoch": 0.6656666666666666, "grad_norm": 29.125, "grad_norm_var": 43.73854166666667, "learning_rate": 0.0001, "loss": 7.1868, "loss/crossentropy": 1.9765674099326134, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.1889990497380495, "step": 19970 }, { "epoch": 0.666, "grad_norm": 32.75, "grad_norm_var": 4.490625, "learning_rate": 0.0001, "loss": 7.268, "loss/crossentropy": 1.9738644078373908, "loss/hidden": 3.3703125, "loss/jsd": 0.0, "loss/logits": 0.18034758158028125, "step": 19980 }, { "epoch": 0.6663333333333333, "grad_norm": 32.75, "grad_norm_var": 5.283268229166667, "learning_rate": 0.0001, "loss": 7.26, "loss/crossentropy": 2.114920521527529, "loss/hidden": 3.341796875, "loss/jsd": 0.0, "loss/logits": 0.1922052690759301, "step": 19990 }, { "epoch": 0.6666666666666666, "grad_norm": 32.5, "grad_norm_var": 6.1322265625, "learning_rate": 0.0001, "loss": 7.0794, "loss/crossentropy": 1.852367566525936, "loss/hidden": 3.466015625, "loss/jsd": 0.0, "loss/logits": 0.184903896227479, "step": 20000 } ], "logging_steps": 10, "max_steps": 30000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.715020064017613e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }