diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,36033 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 2000, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003333333333333333, + "grad_norm": 5088.0, + "learning_rate": 1.9e-05, + "loss": 132.6959, + "loss/crossentropy": 12.028920578956605, + "loss/hidden": 18.7375, + "loss/jsd": 0.0, + "loss/logits": 10.20107181072235, + "step": 10 + }, + { + "epoch": 0.0006666666666666666, + "grad_norm": 428.0, + "grad_norm_var": 86465919.73333333, + "learning_rate": 2.8000000000000003e-05, + "loss": 97.5714, + "loss/crossentropy": 8.78247936964035, + "loss/hidden": 18.70625, + "loss/jsd": 0.0, + "loss/logits": 6.826563286781311, + "step": 20 + }, + { + "epoch": 0.001, + "grad_norm": 206.0, + "grad_norm_var": 183176.66666666666, + "learning_rate": 3.7e-05, + "loss": 87.3595, + "loss/crossentropy": 8.069220972061157, + "loss/hidden": 18.36875, + "loss/jsd": 0.0, + "loss/logits": 6.267633223533631, + "step": 30 + }, + { + "epoch": 0.0013333333333333333, + "grad_norm": 1064.0, + "grad_norm_var": 99002.91666666667, + "learning_rate": 4.600000000000001e-05, + "loss": 84.1061, + "loss/crossentropy": 7.728456330299378, + "loss/hidden": 17.69375, + "loss/jsd": 0.0, + "loss/logits": 5.882031416893005, + "step": 40 + }, + { + "epoch": 0.0016666666666666668, + "grad_norm": 474.0, + "grad_norm_var": 84834.06666666667, + "learning_rate": 5.500000000000001e-05, + "loss": 75.8277, + "loss/crossentropy": 6.95980271100998, + "loss/hidden": 17.3125, + "loss/jsd": 0.0, + "loss/logits": 5.054542422294617, + "step": 50 + }, + { + "epoch": 0.002, + "grad_norm": 616.0, + "grad_norm_var": 52564.2, + "learning_rate": 6.400000000000001e-05, + "loss": 60.9591, + "loss/crossentropy": 5.805091935396194, + "loss/hidden": 15.93125, + "loss/jsd": 0.0, + "loss/logits": 3.9220160007476808, + "step": 60 + }, + { + "epoch": 0.0023333333333333335, + "grad_norm": 384.0, + "grad_norm_var": 67375.4, + "learning_rate": 7.3e-05, + "loss": 41.3956, + "loss/crossentropy": 4.246163284778595, + "loss/hidden": 13.2265625, + "loss/jsd": 0.0, + "loss/logits": 2.3237137854099275, + "step": 70 + }, + { + "epoch": 0.0026666666666666666, + "grad_norm": 272.0, + "grad_norm_var": 11768.466666666667, + "learning_rate": 8.200000000000001e-05, + "loss": 28.7252, + "loss/crossentropy": 3.3240436017513275, + "loss/hidden": 11.096875, + "loss/jsd": 0.0, + "loss/logits": 1.4113391578197478, + "step": 80 + }, + { + "epoch": 0.003, + "grad_norm": 298.0, + "grad_norm_var": 2.1871038589218397e+17, + "learning_rate": 9.1e-05, + "loss": 24.0937, + "loss/crossentropy": 3.320331507921219, + "loss/hidden": 9.5125, + "loss/jsd": 0.0, + "loss/logits": 1.10816071331501, + "step": 90 + }, + { + "epoch": 0.0033333333333333335, + "grad_norm": 288.0, + "grad_norm_var": 5410.866666666667, + "learning_rate": 0.0001, + "loss": 21.4439, + "loss/crossentropy": 2.901010638475418, + "loss/hidden": 9.178125, + "loss/jsd": 0.0, + "loss/logits": 0.9687246754765511, + "step": 100 + }, + { + "epoch": 0.0036666666666666666, + "grad_norm": 280.0, + "grad_norm_var": 3854.6625, + "learning_rate": 0.0001, + "loss": 19.6349, + "loss/crossentropy": 2.818925604224205, + "loss/hidden": 8.39375, + "loss/jsd": 0.0, + "loss/logits": 0.8407707408070564, + "step": 110 + }, + { + "epoch": 0.004, + "grad_norm": 222.0, + "grad_norm_var": 1976.8958333333333, + "learning_rate": 0.0001, + "loss": 18.756, + "loss/crossentropy": 2.66967076510191, + "loss/hidden": 8.33125, + "loss/jsd": 0.0, + "loss/logits": 0.7849601306021213, + "step": 120 + }, + { + "epoch": 0.004333333333333333, + "grad_norm": 163.0, + "grad_norm_var": 1472.3833333333334, + "learning_rate": 0.0001, + "loss": 18.1448, + "loss/crossentropy": 2.513835993409157, + "loss/hidden": 8.1203125, + "loss/jsd": 0.0, + "loss/logits": 0.7554221481084824, + "step": 130 + }, + { + "epoch": 0.004666666666666667, + "grad_norm": 239.0, + "grad_norm_var": 1318.5958333333333, + "learning_rate": 0.0001, + "loss": 17.6846, + "loss/crossentropy": 2.591602721810341, + "loss/hidden": 7.6875, + "loss/jsd": 0.0, + "loss/logits": 0.7016006924211979, + "step": 140 + }, + { + "epoch": 0.005, + "grad_norm": 214.0, + "grad_norm_var": 11592.6625, + "learning_rate": 0.0001, + "loss": 17.3952, + "loss/crossentropy": 2.6045392960309983, + "loss/hidden": 7.6734375, + "loss/jsd": 0.0, + "loss/logits": 0.7216012105345726, + "step": 150 + }, + { + "epoch": 0.005333333333333333, + "grad_norm": 188.0, + "grad_norm_var": 1593.7625, + "learning_rate": 0.0001, + "loss": 16.5206, + "loss/crossentropy": 2.59020614027977, + "loss/hidden": 7.3015625, + "loss/jsd": 0.0, + "loss/logits": 0.6340146750211716, + "step": 160 + }, + { + "epoch": 0.005666666666666667, + "grad_norm": 174.0, + "grad_norm_var": 1288.140625, + "learning_rate": 0.0001, + "loss": 16.4628, + "loss/crossentropy": 2.5054407477378846, + "loss/hidden": 7.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.667105832695961, + "step": 170 + }, + { + "epoch": 0.006, + "grad_norm": 186.0, + "grad_norm_var": 1223.015625, + "learning_rate": 0.0001, + "loss": 15.796, + "loss/crossentropy": 2.3755379378795625, + "loss/hidden": 7.1609375, + "loss/jsd": 0.0, + "loss/logits": 0.5874520897865295, + "step": 180 + }, + { + "epoch": 0.006333333333333333, + "grad_norm": 107.5, + "grad_norm_var": 1354.0239583333334, + "learning_rate": 0.0001, + "loss": 15.5922, + "loss/crossentropy": 2.3762576043605805, + "loss/hidden": 7.26484375, + "loss/jsd": 0.0, + "loss/logits": 0.6503637298941612, + "step": 190 + }, + { + "epoch": 0.006666666666666667, + "grad_norm": 109.5, + "grad_norm_var": 3071.4, + "learning_rate": 0.0001, + "loss": 15.2694, + "loss/crossentropy": 2.2902157098054885, + "loss/hidden": 7.0265625, + "loss/jsd": 0.0, + "loss/logits": 0.6195260547101498, + "step": 200 + }, + { + "epoch": 0.007, + "grad_norm": 194.0, + "grad_norm_var": 2804.890625, + "learning_rate": 0.0001, + "loss": 15.2413, + "loss/crossentropy": 2.623681750893593, + "loss/hidden": 6.796875, + "loss/jsd": 0.0, + "loss/logits": 0.6029888540506363, + "step": 210 + }, + { + "epoch": 0.007333333333333333, + "grad_norm": 136.0, + "grad_norm_var": 624.8989583333333, + "learning_rate": 0.0001, + "loss": 14.771, + "loss/crossentropy": 2.319578641653061, + "loss/hidden": 6.809375, + "loss/jsd": 0.0, + "loss/logits": 0.5596946202218532, + "step": 220 + }, + { + "epoch": 0.007666666666666666, + "grad_norm": 96.0, + "grad_norm_var": 524.8989583333333, + "learning_rate": 0.0001, + "loss": 14.4901, + "loss/crossentropy": 2.2120961263775825, + "loss/hidden": 6.6609375, + "loss/jsd": 0.0, + "loss/logits": 0.5178675353527069, + "step": 230 + }, + { + "epoch": 0.008, + "grad_norm": 119.0, + "grad_norm_var": 801.0822916666667, + "learning_rate": 0.0001, + "loss": 14.4657, + "loss/crossentropy": 2.444407218694687, + "loss/hidden": 6.40546875, + "loss/jsd": 0.0, + "loss/logits": 0.543676958233118, + "step": 240 + }, + { + "epoch": 0.008333333333333333, + "grad_norm": 266.0, + "grad_norm_var": 1661.15, + "learning_rate": 0.0001, + "loss": 14.384, + "loss/crossentropy": 2.4827089831233025, + "loss/hidden": 6.49375, + "loss/jsd": 0.0, + "loss/logits": 0.5518345102667809, + "step": 250 + }, + { + "epoch": 0.008666666666666666, + "grad_norm": 171.0, + "grad_norm_var": 1844.040625, + "learning_rate": 0.0001, + "loss": 14.1886, + "loss/crossentropy": 2.3922463700175287, + "loss/hidden": 6.5015625, + "loss/jsd": 0.0, + "loss/logits": 0.5299135472625494, + "step": 260 + }, + { + "epoch": 0.009, + "grad_norm": 90.5, + "grad_norm_var": 1093.9625, + "learning_rate": 0.0001, + "loss": 14.2358, + "loss/crossentropy": 2.395447109639645, + "loss/hidden": 6.4796875, + "loss/jsd": 0.0, + "loss/logits": 0.5689245201647282, + "step": 270 + }, + { + "epoch": 0.009333333333333334, + "grad_norm": 123.0, + "grad_norm_var": 597.1166666666667, + "learning_rate": 0.0001, + "loss": 13.9794, + "loss/crossentropy": 2.240339662134647, + "loss/hidden": 6.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.5223498687148094, + "step": 280 + }, + { + "epoch": 0.009666666666666667, + "grad_norm": 144.0, + "grad_norm_var": 427.7, + "learning_rate": 0.0001, + "loss": 13.7849, + "loss/crossentropy": 2.214311620593071, + "loss/hidden": 6.26796875, + "loss/jsd": 0.0, + "loss/logits": 0.5229207530617714, + "step": 290 + }, + { + "epoch": 0.01, + "grad_norm": 91.0, + "grad_norm_var": 293.015625, + "learning_rate": 0.0001, + "loss": 13.5058, + "loss/crossentropy": 2.431586265563965, + "loss/hidden": 6.21015625, + "loss/jsd": 0.0, + "loss/logits": 0.5209170162677765, + "step": 300 + }, + { + "epoch": 0.010333333333333333, + "grad_norm": 94.0, + "grad_norm_var": 305.990625, + "learning_rate": 0.0001, + "loss": 13.5941, + "loss/crossentropy": 2.4835646122694017, + "loss/hidden": 5.96328125, + "loss/jsd": 0.0, + "loss/logits": 0.48731190487742426, + "step": 310 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 80.0, + "grad_norm_var": 289.59583333333336, + "learning_rate": 0.0001, + "loss": 13.3078, + "loss/crossentropy": 2.441184702515602, + "loss/hidden": 5.97421875, + "loss/jsd": 0.0, + "loss/logits": 0.46742800548672675, + "step": 320 + }, + { + "epoch": 0.011, + "grad_norm": 102.0, + "grad_norm_var": 257.42395833333336, + "learning_rate": 0.0001, + "loss": 13.0426, + "loss/crossentropy": 2.2604060992598534, + "loss/hidden": 6.03359375, + "loss/jsd": 0.0, + "loss/logits": 0.4528258040547371, + "step": 330 + }, + { + "epoch": 0.011333333333333334, + "grad_norm": 96.5, + "grad_norm_var": 2910.0958333333333, + "learning_rate": 0.0001, + "loss": 13.1213, + "loss/crossentropy": 2.4144359961152078, + "loss/hidden": 5.98203125, + "loss/jsd": 0.0, + "loss/logits": 0.488891564682126, + "step": 340 + }, + { + "epoch": 0.011666666666666667, + "grad_norm": 96.5, + "grad_norm_var": 3266.95, + "learning_rate": 0.0001, + "loss": 13.0347, + "loss/crossentropy": 2.3632063284516334, + "loss/hidden": 5.91640625, + "loss/jsd": 0.0, + "loss/logits": 0.4649609446525574, + "step": 350 + }, + { + "epoch": 0.012, + "grad_norm": 106.5, + "grad_norm_var": 655.115625, + "learning_rate": 0.0001, + "loss": 12.8798, + "loss/crossentropy": 2.2149820044636725, + "loss/hidden": 6.115625, + "loss/jsd": 0.0, + "loss/logits": 0.47544198893010614, + "step": 360 + }, + { + "epoch": 0.012333333333333333, + "grad_norm": 84.5, + "grad_norm_var": 172.790625, + "learning_rate": 0.0001, + "loss": 12.8471, + "loss/crossentropy": 2.5810438305139543, + "loss/hidden": 6.00078125, + "loss/jsd": 0.0, + "loss/logits": 0.48572778329253197, + "step": 370 + }, + { + "epoch": 0.012666666666666666, + "grad_norm": 95.0, + "grad_norm_var": 128.88333333333333, + "learning_rate": 0.0001, + "loss": 12.7666, + "loss/crossentropy": 2.2736202508211134, + "loss/hidden": 5.878125, + "loss/jsd": 0.0, + "loss/logits": 0.46869536861777306, + "step": 380 + }, + { + "epoch": 0.013, + "grad_norm": 124.5, + "grad_norm_var": 140.51666666666668, + "learning_rate": 0.0001, + "loss": 12.605, + "loss/crossentropy": 2.276585566997528, + "loss/hidden": 5.825, + "loss/jsd": 0.0, + "loss/logits": 0.45089508444070814, + "step": 390 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 94.5, + "grad_norm_var": 311.590625, + "learning_rate": 0.0001, + "loss": 12.5188, + "loss/crossentropy": 2.4131533786654473, + "loss/hidden": 5.66875, + "loss/jsd": 0.0, + "loss/logits": 0.4504088945686817, + "step": 400 + }, + { + "epoch": 0.013666666666666667, + "grad_norm": 83.5, + "grad_norm_var": 364.76666666666665, + "learning_rate": 0.0001, + "loss": 12.4151, + "loss/crossentropy": 2.1796241596341135, + "loss/hidden": 5.73671875, + "loss/jsd": 0.0, + "loss/logits": 0.435577293112874, + "step": 410 + }, + { + "epoch": 0.014, + "grad_norm": 93.5, + "grad_norm_var": 187.52395833333333, + "learning_rate": 0.0001, + "loss": 12.4676, + "loss/crossentropy": 2.1736431539058687, + "loss/hidden": 5.78359375, + "loss/jsd": 0.0, + "loss/logits": 0.4561117485165596, + "step": 420 + }, + { + "epoch": 0.014333333333333333, + "grad_norm": 90.5, + "grad_norm_var": 48.795833333333334, + "learning_rate": 0.0001, + "loss": 12.0711, + "loss/crossentropy": 2.283279325067997, + "loss/hidden": 5.45859375, + "loss/jsd": 0.0, + "loss/logits": 0.4215318731963634, + "step": 430 + }, + { + "epoch": 0.014666666666666666, + "grad_norm": 89.5, + "grad_norm_var": 49.42916666666667, + "learning_rate": 0.0001, + "loss": 12.1374, + "loss/crossentropy": 2.291328126192093, + "loss/hidden": 5.5125, + "loss/jsd": 0.0, + "loss/logits": 0.434352046251297, + "step": 440 + }, + { + "epoch": 0.015, + "grad_norm": 92.0, + "grad_norm_var": 156.45, + "learning_rate": 0.0001, + "loss": 12.1345, + "loss/crossentropy": 2.4191192060709, + "loss/hidden": 5.35546875, + "loss/jsd": 0.0, + "loss/logits": 0.4199687227606773, + "step": 450 + }, + { + "epoch": 0.015333333333333332, + "grad_norm": 88.0, + "grad_norm_var": 208.2625, + "learning_rate": 0.0001, + "loss": 12.1403, + "loss/crossentropy": 2.1956121422350408, + "loss/hidden": 5.4875, + "loss/jsd": 0.0, + "loss/logits": 0.4142579145729542, + "step": 460 + }, + { + "epoch": 0.015666666666666666, + "grad_norm": 89.0, + "grad_norm_var": 140.31666666666666, + "learning_rate": 0.0001, + "loss": 12.0963, + "loss/crossentropy": 2.2322947554290296, + "loss/hidden": 5.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.4137666640803218, + "step": 470 + }, + { + "epoch": 0.016, + "grad_norm": 76.0, + "grad_norm_var": 64.23333333333333, + "learning_rate": 0.0001, + "loss": 12.219, + "loss/crossentropy": 2.3545165613293646, + "loss/hidden": 5.5296875, + "loss/jsd": 0.0, + "loss/logits": 0.4294994674623013, + "step": 480 + }, + { + "epoch": 0.01633333333333333, + "grad_norm": 83.0, + "grad_norm_var": 58.915625, + "learning_rate": 0.0001, + "loss": 11.7744, + "loss/crossentropy": 2.3146368995308877, + "loss/hidden": 5.428125, + "loss/jsd": 0.0, + "loss/logits": 0.4236735228449106, + "step": 490 + }, + { + "epoch": 0.016666666666666666, + "grad_norm": 78.5, + "grad_norm_var": 64.83229166666666, + "learning_rate": 0.0001, + "loss": 11.8087, + "loss/crossentropy": 2.1017669927328826, + "loss/hidden": 5.6140625, + "loss/jsd": 0.0, + "loss/logits": 0.4007458407431841, + "step": 500 + }, + { + "epoch": 0.017, + "grad_norm": 79.0, + "grad_norm_var": 105.6625, + "learning_rate": 0.0001, + "loss": 11.9731, + "loss/crossentropy": 2.2921740829944612, + "loss/hidden": 5.35390625, + "loss/jsd": 0.0, + "loss/logits": 0.4179579775780439, + "step": 510 + }, + { + "epoch": 0.017333333333333333, + "grad_norm": 108.0, + "grad_norm_var": 187.390625, + "learning_rate": 0.0001, + "loss": 11.6599, + "loss/crossentropy": 2.2103257328271866, + "loss/hidden": 5.28671875, + "loss/jsd": 0.0, + "loss/logits": 0.3849416717886925, + "step": 520 + }, + { + "epoch": 0.017666666666666667, + "grad_norm": 81.5, + "grad_norm_var": 107.10729166666667, + "learning_rate": 0.0001, + "loss": 12.1065, + "loss/crossentropy": 2.397639387845993, + "loss/hidden": 5.52265625, + "loss/jsd": 0.0, + "loss/logits": 0.45107794776558874, + "step": 530 + }, + { + "epoch": 0.018, + "grad_norm": 129.0, + "grad_norm_var": 193.940625, + "learning_rate": 0.0001, + "loss": 11.7656, + "loss/crossentropy": 2.3040026426315308, + "loss/hidden": 5.2296875, + "loss/jsd": 0.0, + "loss/logits": 0.39434340633451936, + "step": 540 + }, + { + "epoch": 0.018333333333333333, + "grad_norm": 92.0, + "grad_norm_var": 172.44895833333334, + "learning_rate": 0.0001, + "loss": 11.8646, + "loss/crossentropy": 2.2055136799812316, + "loss/hidden": 5.30703125, + "loss/jsd": 0.0, + "loss/logits": 0.3980072047561407, + "step": 550 + }, + { + "epoch": 0.018666666666666668, + "grad_norm": 71.0, + "grad_norm_var": 129.69895833333334, + "learning_rate": 0.0001, + "loss": 11.9471, + "loss/crossentropy": 2.5020270466804506, + "loss/hidden": 5.4015625, + "loss/jsd": 0.0, + "loss/logits": 0.44894293025135995, + "step": 560 + }, + { + "epoch": 0.019, + "grad_norm": 84.0, + "grad_norm_var": 93.34895833333333, + "learning_rate": 0.0001, + "loss": 11.7947, + "loss/crossentropy": 2.094514015316963, + "loss/hidden": 5.4953125, + "loss/jsd": 0.0, + "loss/logits": 0.40853818207979203, + "step": 570 + }, + { + "epoch": 0.019333333333333334, + "grad_norm": 59.5, + "grad_norm_var": 270.1322916666667, + "learning_rate": 0.0001, + "loss": 11.6187, + "loss/crossentropy": 2.3205525130033493, + "loss/hidden": 5.3140625, + "loss/jsd": 0.0, + "loss/logits": 0.4034106068313122, + "step": 580 + }, + { + "epoch": 0.019666666666666666, + "grad_norm": 71.0, + "grad_norm_var": 190.77395833333333, + "learning_rate": 0.0001, + "loss": 11.591, + "loss/crossentropy": 2.110408242046833, + "loss/hidden": 5.26328125, + "loss/jsd": 0.0, + "loss/logits": 0.3818111319094896, + "step": 590 + }, + { + "epoch": 0.02, + "grad_norm": 78.5, + "grad_norm_var": 245.09765625, + "learning_rate": 0.0001, + "loss": 11.5567, + "loss/crossentropy": 2.301485204696655, + "loss/hidden": 5.28671875, + "loss/jsd": 0.0, + "loss/logits": 0.39956560730934143, + "step": 600 + }, + { + "epoch": 0.02033333333333333, + "grad_norm": 85.0, + "grad_norm_var": 210.44895833333334, + "learning_rate": 0.0001, + "loss": 11.4252, + "loss/crossentropy": 2.115160013735294, + "loss/hidden": 5.18671875, + "loss/jsd": 0.0, + "loss/logits": 0.3820294298231602, + "step": 610 + }, + { + "epoch": 0.020666666666666667, + "grad_norm": 76.5, + "grad_norm_var": 312.940625, + "learning_rate": 0.0001, + "loss": 11.7005, + "loss/crossentropy": 2.3295557737350463, + "loss/hidden": 5.3875, + "loss/jsd": 0.0, + "loss/logits": 0.40171602740883827, + "step": 620 + }, + { + "epoch": 0.021, + "grad_norm": 84.0, + "grad_norm_var": 189.29557291666666, + "learning_rate": 0.0001, + "loss": 11.3593, + "loss/crossentropy": 2.1354906648397445, + "loss/hidden": 5.2125, + "loss/jsd": 0.0, + "loss/logits": 0.3748495355248451, + "step": 630 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 109.0, + "grad_norm_var": 1600.7072916666666, + "learning_rate": 0.0001, + "loss": 11.5477, + "loss/crossentropy": 2.3208198621869087, + "loss/hidden": 5.1359375, + "loss/jsd": 0.0, + "loss/logits": 0.38336952701210975, + "step": 640 + }, + { + "epoch": 0.021666666666666667, + "grad_norm": 89.5, + "grad_norm_var": 1528.8166666666666, + "learning_rate": 0.0001, + "loss": 11.4408, + "loss/crossentropy": 2.3916768223047256, + "loss/hidden": 5.29375, + "loss/jsd": 0.0, + "loss/logits": 0.39794068187475207, + "step": 650 + }, + { + "epoch": 0.022, + "grad_norm": 4294967296.0, + "grad_norm_var": 1.1529214644399553e+18, + "learning_rate": 0.0001, + "loss": 11.5534, + "loss/crossentropy": 2.384619304537773, + "loss/hidden": 5.54375, + "loss/jsd": 0.0, + "loss/logits": 0.44498190060257914, + "step": 660 + }, + { + "epoch": 0.022333333333333334, + "grad_norm": 73.5, + "grad_norm_var": 1.152921464574173e+18, + "learning_rate": 0.0001, + "loss": 11.4083, + "loss/crossentropy": 2.305806961655617, + "loss/hidden": 5.1078125, + "loss/jsd": 0.0, + "loss/logits": 0.3943803641945124, + "step": 670 + }, + { + "epoch": 0.02266666666666667, + "grad_norm": 82.5, + "grad_norm_var": 56.72890625, + "learning_rate": 0.0001, + "loss": 11.2602, + "loss/crossentropy": 2.3092581436038015, + "loss/hidden": 5.09765625, + "loss/jsd": 0.0, + "loss/logits": 0.39187341518700125, + "step": 680 + }, + { + "epoch": 0.023, + "grad_norm": 65.0, + "grad_norm_var": 117.25833333333334, + "learning_rate": 0.0001, + "loss": 11.1613, + "loss/crossentropy": 2.324054108560085, + "loss/hidden": 5.12578125, + "loss/jsd": 0.0, + "loss/logits": 0.3896127313375473, + "step": 690 + }, + { + "epoch": 0.023333333333333334, + "grad_norm": 68.0, + "grad_norm_var": 89.20833333333333, + "learning_rate": 0.0001, + "loss": 11.1875, + "loss/crossentropy": 2.2932113975286486, + "loss/hidden": 5.19453125, + "loss/jsd": 0.0, + "loss/logits": 0.40212590657174585, + "step": 700 + }, + { + "epoch": 0.023666666666666666, + "grad_norm": 79.5, + "grad_norm_var": 90.28515625, + "learning_rate": 0.0001, + "loss": 11.3541, + "loss/crossentropy": 2.4318029940128327, + "loss/hidden": 5.1109375, + "loss/jsd": 0.0, + "loss/logits": 0.4051614128053188, + "step": 710 + }, + { + "epoch": 0.024, + "grad_norm": 73.0, + "grad_norm_var": 89.22473958333333, + "learning_rate": 0.0001, + "loss": 11.1475, + "loss/crossentropy": 2.167645823955536, + "loss/hidden": 5.29296875, + "loss/jsd": 0.0, + "loss/logits": 0.43062909580767156, + "step": 720 + }, + { + "epoch": 0.024333333333333332, + "grad_norm": 74.5, + "grad_norm_var": 49.701822916666664, + "learning_rate": 0.0001, + "loss": 11.1669, + "loss/crossentropy": 2.1876634269952775, + "loss/hidden": 5.13671875, + "loss/jsd": 0.0, + "loss/logits": 0.3738254923373461, + "step": 730 + }, + { + "epoch": 0.024666666666666667, + "grad_norm": 77.5, + "grad_norm_var": 30.557291666666668, + "learning_rate": 0.0001, + "loss": 11.0079, + "loss/crossentropy": 2.248200983554125, + "loss/hidden": 5.09140625, + "loss/jsd": 0.0, + "loss/logits": 0.3778664033859968, + "step": 740 + }, + { + "epoch": 0.025, + "grad_norm": 68.0, + "grad_norm_var": 46.8625, + "learning_rate": 0.0001, + "loss": 11.0535, + "loss/crossentropy": 2.162605920433998, + "loss/hidden": 5.04921875, + "loss/jsd": 0.0, + "loss/logits": 0.38323657512664794, + "step": 750 + }, + { + "epoch": 0.025333333333333333, + "grad_norm": 63.5, + "grad_norm_var": 83.825, + "learning_rate": 0.0001, + "loss": 11.126, + "loss/crossentropy": 2.2463886097073553, + "loss/hidden": 5.05390625, + "loss/jsd": 0.0, + "loss/logits": 0.3759196888655424, + "step": 760 + }, + { + "epoch": 0.025666666666666667, + "grad_norm": 67.5, + "grad_norm_var": 39.149739583333336, + "learning_rate": 0.0001, + "loss": 11.2542, + "loss/crossentropy": 2.368313530087471, + "loss/hidden": 5.025, + "loss/jsd": 0.0, + "loss/logits": 0.40871408879756926, + "step": 770 + }, + { + "epoch": 0.026, + "grad_norm": 76.0, + "grad_norm_var": 37.065625, + "learning_rate": 0.0001, + "loss": 11.2333, + "loss/crossentropy": 2.2050742127001284, + "loss/hidden": 5.03359375, + "loss/jsd": 0.0, + "loss/logits": 0.3826916288584471, + "step": 780 + }, + { + "epoch": 0.026333333333333334, + "grad_norm": 71.5, + "grad_norm_var": 50.54765625, + "learning_rate": 0.0001, + "loss": 10.8321, + "loss/crossentropy": 2.2104426354169844, + "loss/hidden": 4.78125, + "loss/jsd": 0.0, + "loss/logits": 0.34428741969168186, + "step": 790 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 62.0, + "grad_norm_var": 51.157291666666666, + "learning_rate": 0.0001, + "loss": 10.9576, + "loss/crossentropy": 2.054627813398838, + "loss/hidden": 5.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.37033863496035335, + "step": 800 + }, + { + "epoch": 0.027, + "grad_norm": 71.0, + "grad_norm_var": 30.090625, + "learning_rate": 0.0001, + "loss": 10.8745, + "loss/crossentropy": 2.171517415344715, + "loss/hidden": 5.04921875, + "loss/jsd": 0.0, + "loss/logits": 0.36901252083480357, + "step": 810 + }, + { + "epoch": 0.027333333333333334, + "grad_norm": 83.0, + "grad_norm_var": 66.25833333333334, + "learning_rate": 0.0001, + "loss": 10.9984, + "loss/crossentropy": 2.1388269782066347, + "loss/hidden": 4.98828125, + "loss/jsd": 0.0, + "loss/logits": 0.3639502193778753, + "step": 820 + }, + { + "epoch": 0.027666666666666666, + "grad_norm": 60.25, + "grad_norm_var": 65.29895833333333, + "learning_rate": 0.0001, + "loss": 10.9877, + "loss/crossentropy": 2.1383705154061317, + "loss/hidden": 5.08203125, + "loss/jsd": 0.0, + "loss/logits": 0.35995694771409037, + "step": 830 + }, + { + "epoch": 0.028, + "grad_norm": 60.0, + "grad_norm_var": 28.024739583333332, + "learning_rate": 0.0001, + "loss": 10.8345, + "loss/crossentropy": 2.2445564195513725, + "loss/hidden": 4.8421875, + "loss/jsd": 0.0, + "loss/logits": 0.35285502672195435, + "step": 840 + }, + { + "epoch": 0.028333333333333332, + "grad_norm": 106.0, + "grad_norm_var": 137.91432291666666, + "learning_rate": 0.0001, + "loss": 10.9112, + "loss/crossentropy": 2.26119641661644, + "loss/hidden": 4.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.3745645940303802, + "step": 850 + }, + { + "epoch": 0.028666666666666667, + "grad_norm": 61.5, + "grad_norm_var": 270.00807291666666, + "learning_rate": 0.0001, + "loss": 10.9954, + "loss/crossentropy": 2.2785057038068772, + "loss/hidden": 5.071875, + "loss/jsd": 0.0, + "loss/logits": 0.3852051142603159, + "step": 860 + }, + { + "epoch": 0.029, + "grad_norm": 114.5, + "grad_norm_var": 280.5541666666667, + "learning_rate": 0.0001, + "loss": 11.0732, + "loss/crossentropy": 2.1508523888885973, + "loss/hidden": 4.98984375, + "loss/jsd": 0.0, + "loss/logits": 0.36836351118981836, + "step": 870 + }, + { + "epoch": 0.029333333333333333, + "grad_norm": 84.5, + "grad_norm_var": 265.72265625, + "learning_rate": 0.0001, + "loss": 10.9948, + "loss/crossentropy": 2.407784271240234, + "loss/hidden": 5.109375, + "loss/jsd": 0.0, + "loss/logits": 0.4067291602492332, + "step": 880 + }, + { + "epoch": 0.029666666666666668, + "grad_norm": 68.5, + "grad_norm_var": 199.90182291666667, + "learning_rate": 0.0001, + "loss": 11.052, + "loss/crossentropy": 2.266117498278618, + "loss/hidden": 4.8125, + "loss/jsd": 0.0, + "loss/logits": 0.3621700868010521, + "step": 890 + }, + { + "epoch": 0.03, + "grad_norm": 66.0, + "grad_norm_var": 79.33307291666667, + "learning_rate": 0.0001, + "loss": 10.8697, + "loss/crossentropy": 2.2878661900758743, + "loss/hidden": 4.9578125, + "loss/jsd": 0.0, + "loss/logits": 0.38751605823636054, + "step": 900 + }, + { + "epoch": 0.030333333333333334, + "grad_norm": 71.5, + "grad_norm_var": 107.53333333333333, + "learning_rate": 0.0001, + "loss": 10.843, + "loss/crossentropy": 2.1773984007537366, + "loss/hidden": 4.98828125, + "loss/jsd": 0.0, + "loss/logits": 0.35708251409232616, + "step": 910 + }, + { + "epoch": 0.030666666666666665, + "grad_norm": 67.0, + "grad_norm_var": 161.60390625, + "learning_rate": 0.0001, + "loss": 10.7503, + "loss/crossentropy": 2.1578031152486803, + "loss/hidden": 4.833984375, + "loss/jsd": 0.0, + "loss/logits": 0.34460235945880413, + "step": 920 + }, + { + "epoch": 0.031, + "grad_norm": 84.0, + "grad_norm_var": 74.48307291666667, + "learning_rate": 0.0001, + "loss": 10.8098, + "loss/crossentropy": 2.30833805501461, + "loss/hidden": 4.9, + "loss/jsd": 0.0, + "loss/logits": 0.363369470089674, + "step": 930 + }, + { + "epoch": 0.03133333333333333, + "grad_norm": 64.0, + "grad_norm_var": 80.66848958333334, + "learning_rate": 0.0001, + "loss": 10.7164, + "loss/crossentropy": 2.270130616426468, + "loss/hidden": 4.9078125, + "loss/jsd": 0.0, + "loss/logits": 0.3668188262730837, + "step": 940 + }, + { + "epoch": 0.03166666666666667, + "grad_norm": 61.75, + "grad_norm_var": 31.870833333333334, + "learning_rate": 0.0001, + "loss": 10.7845, + "loss/crossentropy": 2.313658607006073, + "loss/hidden": 4.87734375, + "loss/jsd": 0.0, + "loss/logits": 0.38930617440491916, + "step": 950 + }, + { + "epoch": 0.032, + "grad_norm": 65.5, + "grad_norm_var": 29.598958333333332, + "learning_rate": 0.0001, + "loss": 10.6697, + "loss/crossentropy": 2.3520640432834625, + "loss/hidden": 4.84921875, + "loss/jsd": 0.0, + "loss/logits": 0.39863014221191406, + "step": 960 + }, + { + "epoch": 0.03233333333333333, + "grad_norm": 5771362304.0, + "grad_norm_var": 2.0817888831321674e+18, + "learning_rate": 0.0001, + "loss": 10.7484, + "loss/crossentropy": 2.192959766089916, + "loss/hidden": 4.798828125, + "loss/jsd": 0.0, + "loss/logits": 0.33885405771434307, + "step": 970 + }, + { + "epoch": 0.03266666666666666, + "grad_norm": 51.25, + "grad_norm_var": 2.081788882663244e+18, + "learning_rate": 0.0001, + "loss": 10.6531, + "loss/crossentropy": 2.0819257736206054, + "loss/hidden": 4.775, + "loss/jsd": 0.0, + "loss/logits": 0.3428235022351146, + "step": 980 + }, + { + "epoch": 0.033, + "grad_norm": 63.0, + "grad_norm_var": 35.618489583333336, + "learning_rate": 0.0001, + "loss": 10.8209, + "loss/crossentropy": 2.250964765995741, + "loss/hidden": 4.784375, + "loss/jsd": 0.0, + "loss/logits": 0.3942587487399578, + "step": 990 + }, + { + "epoch": 0.03333333333333333, + "grad_norm": 52.25, + "grad_norm_var": 62.21015625, + "learning_rate": 0.0001, + "loss": 10.6187, + "loss/crossentropy": 2.1501222252845764, + "loss/hidden": 4.93984375, + "loss/jsd": 0.0, + "loss/logits": 0.3645794134587049, + "step": 1000 + }, + { + "epoch": 0.033666666666666664, + "grad_norm": 147.0, + "grad_norm_var": 810.01640625, + "learning_rate": 0.0001, + "loss": 10.7409, + "loss/crossentropy": 2.265306806564331, + "loss/hidden": 4.85390625, + "loss/jsd": 0.0, + "loss/logits": 0.3604784071445465, + "step": 1010 + }, + { + "epoch": 0.034, + "grad_norm": 75.0, + "grad_norm_var": 562.55, + "learning_rate": 0.0001, + "loss": 10.6671, + "loss/crossentropy": 2.1804181709885597, + "loss/hidden": 4.74765625, + "loss/jsd": 0.0, + "loss/logits": 0.3643572922796011, + "step": 1020 + }, + { + "epoch": 0.034333333333333334, + "grad_norm": 72.0, + "grad_norm_var": 76.40598958333334, + "learning_rate": 0.0001, + "loss": 10.7713, + "loss/crossentropy": 2.3172024488449097, + "loss/hidden": 4.83671875, + "loss/jsd": 0.0, + "loss/logits": 0.37977495454251764, + "step": 1030 + }, + { + "epoch": 0.034666666666666665, + "grad_norm": 57.25, + "grad_norm_var": 179.57265625, + "learning_rate": 0.0001, + "loss": 10.7404, + "loss/crossentropy": 2.346377784013748, + "loss/hidden": 4.78046875, + "loss/jsd": 0.0, + "loss/logits": 0.36581903472542765, + "step": 1040 + }, + { + "epoch": 0.035, + "grad_norm": 50.0, + "grad_norm_var": 17.880989583333335, + "learning_rate": 0.0001, + "loss": 10.6162, + "loss/crossentropy": 2.27249199450016, + "loss/hidden": 4.8953125, + "loss/jsd": 0.0, + "loss/logits": 0.35631081983447077, + "step": 1050 + }, + { + "epoch": 0.035333333333333335, + "grad_norm": 58.25, + "grad_norm_var": 37.270572916666666, + "learning_rate": 0.0001, + "loss": 10.6205, + "loss/crossentropy": 2.3702693939208985, + "loss/hidden": 4.74921875, + "loss/jsd": 0.0, + "loss/logits": 0.3705620352178812, + "step": 1060 + }, + { + "epoch": 0.035666666666666666, + "grad_norm": 59.75, + "grad_norm_var": 36.85, + "learning_rate": 0.0001, + "loss": 10.6335, + "loss/crossentropy": 2.2196707010269163, + "loss/hidden": 4.776953125, + "loss/jsd": 0.0, + "loss/logits": 0.3813734740018845, + "step": 1070 + }, + { + "epoch": 0.036, + "grad_norm": 71.5, + "grad_norm_var": 35.77682291666667, + "learning_rate": 0.0001, + "loss": 10.6103, + "loss/crossentropy": 2.0884764015674593, + "loss/hidden": 4.862109375, + "loss/jsd": 0.0, + "loss/logits": 0.3436795238405466, + "step": 1080 + }, + { + "epoch": 0.036333333333333336, + "grad_norm": 57.5, + "grad_norm_var": 30.591666666666665, + "learning_rate": 0.0001, + "loss": 10.6457, + "loss/crossentropy": 2.2101589158177375, + "loss/hidden": 4.82109375, + "loss/jsd": 0.0, + "loss/logits": 0.3618529438972473, + "step": 1090 + }, + { + "epoch": 0.03666666666666667, + "grad_norm": 55.0, + "grad_norm_var": 23.695572916666666, + "learning_rate": 0.0001, + "loss": 10.6528, + "loss/crossentropy": 2.122966104745865, + "loss/hidden": 4.823828125, + "loss/jsd": 0.0, + "loss/logits": 0.3658104814589024, + "step": 1100 + }, + { + "epoch": 0.037, + "grad_norm": 81.0, + "grad_norm_var": 10727.665625, + "learning_rate": 0.0001, + "loss": 10.5432, + "loss/crossentropy": 2.0415431298315525, + "loss/hidden": 4.83515625, + "loss/jsd": 0.0, + "loss/logits": 0.3341557715088129, + "step": 1110 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 50.0, + "grad_norm_var": 98.640625, + "learning_rate": 0.0001, + "loss": 10.6684, + "loss/crossentropy": 2.3639739483594893, + "loss/hidden": 4.715234375, + "loss/jsd": 0.0, + "loss/logits": 0.3543642643839121, + "step": 1120 + }, + { + "epoch": 0.03766666666666667, + "grad_norm": 51.25, + "grad_norm_var": 404.8833333333333, + "learning_rate": 0.0001, + "loss": 10.5142, + "loss/crossentropy": 2.2790277168154716, + "loss/hidden": 4.78203125, + "loss/jsd": 0.0, + "loss/logits": 0.335112139955163, + "step": 1130 + }, + { + "epoch": 0.038, + "grad_norm": 66.0, + "grad_norm_var": 80.90729166666667, + "learning_rate": 0.0001, + "loss": 10.5643, + "loss/crossentropy": 2.3700348407030107, + "loss/hidden": 4.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.35619163103401663, + "step": 1140 + }, + { + "epoch": 0.03833333333333333, + "grad_norm": 52.0, + "grad_norm_var": 128.78932291666666, + "learning_rate": 0.0001, + "loss": 10.547, + "loss/crossentropy": 2.2056336015462876, + "loss/hidden": 4.8265625, + "loss/jsd": 0.0, + "loss/logits": 0.3479484971612692, + "step": 1150 + }, + { + "epoch": 0.03866666666666667, + "grad_norm": 57.75, + "grad_norm_var": 47.465625, + "learning_rate": 0.0001, + "loss": 10.459, + "loss/crossentropy": 2.33939877897501, + "loss/hidden": 4.743359375, + "loss/jsd": 0.0, + "loss/logits": 0.3586887318640947, + "step": 1160 + }, + { + "epoch": 0.039, + "grad_norm": 44.5, + "grad_norm_var": 43.43515625, + "learning_rate": 0.0001, + "loss": 10.3386, + "loss/crossentropy": 2.1996493458747866, + "loss/hidden": 4.859375, + "loss/jsd": 0.0, + "loss/logits": 0.35360681600868704, + "step": 1170 + }, + { + "epoch": 0.03933333333333333, + "grad_norm": 63.0, + "grad_norm_var": 40115.25390625, + "learning_rate": 0.0001, + "loss": 10.4938, + "loss/crossentropy": 2.345403802394867, + "loss/hidden": 4.74375, + "loss/jsd": 0.0, + "loss/logits": 0.3767122995108366, + "step": 1180 + }, + { + "epoch": 0.03966666666666667, + "grad_norm": 55.0, + "grad_norm_var": 39818.41223958333, + "learning_rate": 0.0001, + "loss": 10.633, + "loss/crossentropy": 2.179220561683178, + "loss/hidden": 4.80234375, + "loss/jsd": 0.0, + "loss/logits": 0.36714412793517115, + "step": 1190 + }, + { + "epoch": 0.04, + "grad_norm": 59.0, + "grad_norm_var": 87.50416666666666, + "learning_rate": 0.0001, + "loss": 10.464, + "loss/crossentropy": 2.324831709265709, + "loss/hidden": 4.62578125, + "loss/jsd": 0.0, + "loss/logits": 0.3588165879249573, + "step": 1200 + }, + { + "epoch": 0.04033333333333333, + "grad_norm": 52.5, + "grad_norm_var": 22.1625, + "learning_rate": 0.0001, + "loss": 10.4303, + "loss/crossentropy": 2.3655824601650237, + "loss/hidden": 4.5671875, + "loss/jsd": 0.0, + "loss/logits": 0.34746520072221754, + "step": 1210 + }, + { + "epoch": 0.04066666666666666, + "grad_norm": 59.0, + "grad_norm_var": 25.076822916666668, + "learning_rate": 0.0001, + "loss": 10.3775, + "loss/crossentropy": 2.3404723912477494, + "loss/hidden": 4.560546875, + "loss/jsd": 0.0, + "loss/logits": 0.35012954138219354, + "step": 1220 + }, + { + "epoch": 0.041, + "grad_norm": 55.75, + "grad_norm_var": 33.32057291666667, + "learning_rate": 0.0001, + "loss": 10.2275, + "loss/crossentropy": 2.3334707781672477, + "loss/hidden": 4.620703125, + "loss/jsd": 0.0, + "loss/logits": 0.33375904690474273, + "step": 1230 + }, + { + "epoch": 0.04133333333333333, + "grad_norm": 54.25, + "grad_norm_var": 56.390625, + "learning_rate": 0.0001, + "loss": 10.2067, + "loss/crossentropy": 2.2778391599655152, + "loss/hidden": 4.7, + "loss/jsd": 0.0, + "loss/logits": 0.36180679500102997, + "step": 1240 + }, + { + "epoch": 0.041666666666666664, + "grad_norm": 48.0, + "grad_norm_var": 58.657291666666666, + "learning_rate": 0.0001, + "loss": 10.3731, + "loss/crossentropy": 2.1620317712426185, + "loss/hidden": 4.6765625, + "loss/jsd": 0.0, + "loss/logits": 0.3339271958917379, + "step": 1250 + }, + { + "epoch": 0.042, + "grad_norm": 60.75, + "grad_norm_var": 38.33932291666667, + "learning_rate": 0.0001, + "loss": 10.356, + "loss/crossentropy": 2.1490842700004578, + "loss/hidden": 4.6578125, + "loss/jsd": 0.0, + "loss/logits": 0.3494755119085312, + "step": 1260 + }, + { + "epoch": 0.042333333333333334, + "grad_norm": 54.0, + "grad_norm_var": 30.624739583333334, + "learning_rate": 0.0001, + "loss": 10.3963, + "loss/crossentropy": 2.255769196152687, + "loss/hidden": 4.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.352866580337286, + "step": 1270 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 46.25, + "grad_norm_var": 41.215625, + "learning_rate": 0.0001, + "loss": 10.3482, + "loss/crossentropy": 2.188653063029051, + "loss/hidden": 4.7625, + "loss/jsd": 0.0, + "loss/logits": 0.35777630396187304, + "step": 1280 + }, + { + "epoch": 0.043, + "grad_norm": 49.5, + "grad_norm_var": 140.48333333333332, + "learning_rate": 0.0001, + "loss": 10.4993, + "loss/crossentropy": 2.2085324838757514, + "loss/hidden": 4.764453125, + "loss/jsd": 0.0, + "loss/logits": 0.35029419548809526, + "step": 1290 + }, + { + "epoch": 0.043333333333333335, + "grad_norm": 68.5, + "grad_norm_var": 87.10807291666667, + "learning_rate": 0.0001, + "loss": 10.387, + "loss/crossentropy": 2.0001343421638014, + "loss/hidden": 4.740625, + "loss/jsd": 0.0, + "loss/logits": 0.3210140850394964, + "step": 1300 + }, + { + "epoch": 0.043666666666666666, + "grad_norm": 53.0, + "grad_norm_var": 63.59348958333333, + "learning_rate": 0.0001, + "loss": 10.4462, + "loss/crossentropy": 2.261654701828957, + "loss/hidden": 4.64765625, + "loss/jsd": 0.0, + "loss/logits": 0.3344265431165695, + "step": 1310 + }, + { + "epoch": 0.044, + "grad_norm": 51.25, + "grad_norm_var": 141.92395833333333, + "learning_rate": 0.0001, + "loss": 10.3567, + "loss/crossentropy": 2.2908728308975697, + "loss/hidden": 4.7203125, + "loss/jsd": 0.0, + "loss/logits": 0.3544796362519264, + "step": 1320 + }, + { + "epoch": 0.044333333333333336, + "grad_norm": 55.75, + "grad_norm_var": 95.390625, + "learning_rate": 0.0001, + "loss": 10.2347, + "loss/crossentropy": 2.342694191634655, + "loss/hidden": 4.574609375, + "loss/jsd": 0.0, + "loss/logits": 0.33147499468177555, + "step": 1330 + }, + { + "epoch": 0.04466666666666667, + "grad_norm": 63.0, + "grad_norm_var": 143.86848958333334, + "learning_rate": 0.0001, + "loss": 10.2828, + "loss/crossentropy": 2.1463702358305454, + "loss/hidden": 4.641796875, + "loss/jsd": 0.0, + "loss/logits": 0.3379510557278991, + "step": 1340 + }, + { + "epoch": 0.045, + "grad_norm": 58.0, + "grad_norm_var": 145.73333333333332, + "learning_rate": 0.0001, + "loss": 10.4051, + "loss/crossentropy": 2.1320372141897677, + "loss/hidden": 4.6515625, + "loss/jsd": 0.0, + "loss/logits": 0.3248579815030098, + "step": 1350 + }, + { + "epoch": 0.04533333333333334, + "grad_norm": 50.5, + "grad_norm_var": 1564.9205729166667, + "learning_rate": 0.0001, + "loss": 10.3519, + "loss/crossentropy": 2.390837848186493, + "loss/hidden": 4.482421875, + "loss/jsd": 0.0, + "loss/logits": 0.33150205537676813, + "step": 1360 + }, + { + "epoch": 0.04566666666666667, + "grad_norm": 57.0, + "grad_norm_var": 86.76640625, + "learning_rate": 0.0001, + "loss": 10.2536, + "loss/crossentropy": 2.238551476597786, + "loss/hidden": 4.677734375, + "loss/jsd": 0.0, + "loss/logits": 0.32355707660317423, + "step": 1370 + }, + { + "epoch": 0.046, + "grad_norm": 54.0, + "grad_norm_var": 33.41848958333333, + "learning_rate": 0.0001, + "loss": 10.2308, + "loss/crossentropy": 2.172594637423754, + "loss/hidden": 4.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.31351518500596287, + "step": 1380 + }, + { + "epoch": 0.04633333333333333, + "grad_norm": 54.75, + "grad_norm_var": 37.065625, + "learning_rate": 0.0001, + "loss": 10.1326, + "loss/crossentropy": 2.299109023809433, + "loss/hidden": 4.53359375, + "loss/jsd": 0.0, + "loss/logits": 0.3222783401608467, + "step": 1390 + }, + { + "epoch": 0.04666666666666667, + "grad_norm": 55.5, + "grad_norm_var": 97.47916666666667, + "learning_rate": 0.0001, + "loss": 10.4755, + "loss/crossentropy": 2.1733594447374345, + "loss/hidden": 4.76171875, + "loss/jsd": 0.0, + "loss/logits": 0.3652454580180347, + "step": 1400 + }, + { + "epoch": 0.047, + "grad_norm": 54.25, + "grad_norm_var": 66.57395833333334, + "learning_rate": 0.0001, + "loss": 10.3266, + "loss/crossentropy": 2.2273528560996056, + "loss/hidden": 4.498828125, + "loss/jsd": 0.0, + "loss/logits": 0.30771929658949376, + "step": 1410 + }, + { + "epoch": 0.04733333333333333, + "grad_norm": 51.75, + "grad_norm_var": 83.07395833333334, + "learning_rate": 0.0001, + "loss": 10.2202, + "loss/crossentropy": 2.286280909180641, + "loss/hidden": 4.652734375, + "loss/jsd": 0.0, + "loss/logits": 0.34912221878767014, + "step": 1420 + }, + { + "epoch": 0.04766666666666667, + "grad_norm": 54.0, + "grad_norm_var": 236.29557291666666, + "learning_rate": 0.0001, + "loss": 10.2577, + "loss/crossentropy": 2.1812940359115602, + "loss/hidden": 4.534765625, + "loss/jsd": 0.0, + "loss/logits": 0.33046910390257833, + "step": 1430 + }, + { + "epoch": 0.048, + "grad_norm": 53.25, + "grad_norm_var": 62.09765625, + "learning_rate": 0.0001, + "loss": 10.0937, + "loss/crossentropy": 2.263388830423355, + "loss/hidden": 4.46640625, + "loss/jsd": 0.0, + "loss/logits": 0.3291785676032305, + "step": 1440 + }, + { + "epoch": 0.04833333333333333, + "grad_norm": 57.75, + "grad_norm_var": 30.408072916666665, + "learning_rate": 0.0001, + "loss": 10.1197, + "loss/crossentropy": 2.3318180561065676, + "loss/hidden": 4.530078125, + "loss/jsd": 0.0, + "loss/logits": 0.32444748654961586, + "step": 1450 + }, + { + "epoch": 0.048666666666666664, + "grad_norm": 46.25, + "grad_norm_var": 51.42265625, + "learning_rate": 0.0001, + "loss": 10.2713, + "loss/crossentropy": 2.0284368000924586, + "loss/hidden": 4.673828125, + "loss/jsd": 0.0, + "loss/logits": 0.3348676819354296, + "step": 1460 + }, + { + "epoch": 0.049, + "grad_norm": 47.5, + "grad_norm_var": 12.757291666666667, + "learning_rate": 0.0001, + "loss": 10.1257, + "loss/crossentropy": 2.2734193384647368, + "loss/hidden": 4.6890625, + "loss/jsd": 0.0, + "loss/logits": 0.3383447080850601, + "step": 1470 + }, + { + "epoch": 0.04933333333333333, + "grad_norm": 48.5, + "grad_norm_var": 10.7625, + "learning_rate": 0.0001, + "loss": 10.1081, + "loss/crossentropy": 2.1734183013439177, + "loss/hidden": 4.680078125, + "loss/jsd": 0.0, + "loss/logits": 0.3549729684367776, + "step": 1480 + }, + { + "epoch": 0.049666666666666665, + "grad_norm": 52.75, + "grad_norm_var": 25.864322916666666, + "learning_rate": 0.0001, + "loss": 10.0498, + "loss/crossentropy": 2.125520133972168, + "loss/hidden": 4.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.33357073105871676, + "step": 1490 + }, + { + "epoch": 0.05, + "grad_norm": 41.75, + "grad_norm_var": 39.50182291666667, + "learning_rate": 0.0001, + "loss": 10.0335, + "loss/crossentropy": 1.9868148412555455, + "loss/hidden": 4.60703125, + "loss/jsd": 0.0, + "loss/logits": 0.3306591158732772, + "step": 1500 + }, + { + "epoch": 0.050333333333333334, + "grad_norm": 54.0, + "grad_norm_var": 25.6875, + "learning_rate": 0.0001, + "loss": 10.2335, + "loss/crossentropy": 2.1235173836350443, + "loss/hidden": 4.740234375, + "loss/jsd": 0.0, + "loss/logits": 0.3614276949316263, + "step": 1510 + }, + { + "epoch": 0.050666666666666665, + "grad_norm": 51.75, + "grad_norm_var": 32.75416666666667, + "learning_rate": 0.0001, + "loss": 10.2879, + "loss/crossentropy": 2.2561273902654646, + "loss/hidden": 4.503125, + "loss/jsd": 0.0, + "loss/logits": 0.32090449519455433, + "step": 1520 + }, + { + "epoch": 0.051, + "grad_norm": 47.75, + "grad_norm_var": 37.532291666666666, + "learning_rate": 0.0001, + "loss": 10.0812, + "loss/crossentropy": 2.144378663599491, + "loss/hidden": 4.47890625, + "loss/jsd": 0.0, + "loss/logits": 0.29765736870467663, + "step": 1530 + }, + { + "epoch": 0.051333333333333335, + "grad_norm": 46.5, + "grad_norm_var": 24.870833333333334, + "learning_rate": 0.0001, + "loss": 10.0522, + "loss/crossentropy": 2.2584436416625975, + "loss/hidden": 4.6109375, + "loss/jsd": 0.0, + "loss/logits": 0.34671921730041505, + "step": 1540 + }, + { + "epoch": 0.051666666666666666, + "grad_norm": 61.75, + "grad_norm_var": 32.15807291666667, + "learning_rate": 0.0001, + "loss": 9.9727, + "loss/crossentropy": 2.290890319645405, + "loss/hidden": 4.437109375, + "loss/jsd": 0.0, + "loss/logits": 0.33176828771829603, + "step": 1550 + }, + { + "epoch": 0.052, + "grad_norm": 52.5, + "grad_norm_var": 23.875, + "learning_rate": 0.0001, + "loss": 10.0301, + "loss/crossentropy": 2.1280356660485267, + "loss/hidden": 4.634375, + "loss/jsd": 0.0, + "loss/logits": 0.30865246467292307, + "step": 1560 + }, + { + "epoch": 0.052333333333333336, + "grad_norm": 46.25, + "grad_norm_var": 23.5875, + "learning_rate": 0.0001, + "loss": 10.1737, + "loss/crossentropy": 2.032886290922761, + "loss/hidden": 4.7875, + "loss/jsd": 0.0, + "loss/logits": 0.3365877510979772, + "step": 1570 + }, + { + "epoch": 0.05266666666666667, + "grad_norm": 5100273664.0, + "grad_norm_var": 1.6257994331790162e+18, + "learning_rate": 0.0001, + "loss": 10.0954, + "loss/crossentropy": 2.1190722532570363, + "loss/hidden": 4.566015625, + "loss/jsd": 0.0, + "loss/logits": 0.3125073878094554, + "step": 1580 + }, + { + "epoch": 0.053, + "grad_norm": 48.0, + "grad_norm_var": 1.6257994343053266e+18, + "learning_rate": 0.0001, + "loss": 10.2018, + "loss/crossentropy": 2.222577328979969, + "loss/hidden": 4.548828125, + "loss/jsd": 0.0, + "loss/logits": 0.31913691386580467, + "step": 1590 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 53.5, + "grad_norm_var": 34.01015625, + "learning_rate": 0.0001, + "loss": 10.0954, + "loss/crossentropy": 2.093307490646839, + "loss/hidden": 4.612109375, + "loss/jsd": 0.0, + "loss/logits": 0.31212261263281105, + "step": 1600 + }, + { + "epoch": 0.05366666666666667, + "grad_norm": 48.75, + "grad_norm_var": 35.86015625, + "learning_rate": 0.0001, + "loss": 10.0662, + "loss/crossentropy": 2.234019846469164, + "loss/hidden": 4.5765625, + "loss/jsd": 0.0, + "loss/logits": 0.31956249102950096, + "step": 1610 + }, + { + "epoch": 0.054, + "grad_norm": 46.5, + "grad_norm_var": 8.089322916666667, + "learning_rate": 0.0001, + "loss": 10.1413, + "loss/crossentropy": 2.334869381785393, + "loss/hidden": 4.3875, + "loss/jsd": 0.0, + "loss/logits": 0.31581548042595387, + "step": 1620 + }, + { + "epoch": 0.05433333333333333, + "grad_norm": 47.75, + "grad_norm_var": 9.44140625, + "learning_rate": 0.0001, + "loss": 10.1052, + "loss/crossentropy": 2.3633588939905166, + "loss/hidden": 4.56875, + "loss/jsd": 0.0, + "loss/logits": 0.34331442005932333, + "step": 1630 + }, + { + "epoch": 0.05466666666666667, + "grad_norm": 53.0, + "grad_norm_var": 90.81640625, + "learning_rate": 0.0001, + "loss": 10.168, + "loss/crossentropy": 2.4216663956642153, + "loss/hidden": 4.409765625, + "loss/jsd": 0.0, + "loss/logits": 0.3401012416929007, + "step": 1640 + }, + { + "epoch": 0.055, + "grad_norm": 51.75, + "grad_norm_var": 39.29348958333333, + "learning_rate": 0.0001, + "loss": 10.0832, + "loss/crossentropy": 2.063167358934879, + "loss/hidden": 4.7140625, + "loss/jsd": 0.0, + "loss/logits": 0.3456306353211403, + "step": 1650 + }, + { + "epoch": 0.05533333333333333, + "grad_norm": 46.75, + "grad_norm_var": 26.27265625, + "learning_rate": 0.0001, + "loss": 10.072, + "loss/crossentropy": 2.212946060299873, + "loss/hidden": 4.4515625, + "loss/jsd": 0.0, + "loss/logits": 0.3171877060085535, + "step": 1660 + }, + { + "epoch": 0.05566666666666667, + "grad_norm": 55.0, + "grad_norm_var": 28.607291666666665, + "learning_rate": 0.0001, + "loss": 9.9379, + "loss/crossentropy": 2.141632245481014, + "loss/hidden": 4.5515625, + "loss/jsd": 0.0, + "loss/logits": 0.3336161907762289, + "step": 1670 + }, + { + "epoch": 0.056, + "grad_norm": 50.25, + "grad_norm_var": 24.514322916666668, + "learning_rate": 0.0001, + "loss": 10.1308, + "loss/crossentropy": 2.2568211957812307, + "loss/hidden": 4.49453125, + "loss/jsd": 0.0, + "loss/logits": 0.3477105274796486, + "step": 1680 + }, + { + "epoch": 0.05633333333333333, + "grad_norm": 48.75, + "grad_norm_var": 18.523958333333333, + "learning_rate": 0.0001, + "loss": 9.9396, + "loss/crossentropy": 2.2201522469520567, + "loss/hidden": 4.54453125, + "loss/jsd": 0.0, + "loss/logits": 0.34930348955094814, + "step": 1690 + }, + { + "epoch": 0.056666666666666664, + "grad_norm": 49.75, + "grad_norm_var": 18.798958333333335, + "learning_rate": 0.0001, + "loss": 9.8497, + "loss/crossentropy": 2.0173508882522584, + "loss/hidden": 4.617578125, + "loss/jsd": 0.0, + "loss/logits": 0.33352648206055163, + "step": 1700 + }, + { + "epoch": 0.057, + "grad_norm": 53.5, + "grad_norm_var": 53.06848958333333, + "learning_rate": 0.0001, + "loss": 10.0437, + "loss/crossentropy": 2.2075788587331773, + "loss/hidden": 4.520703125, + "loss/jsd": 0.0, + "loss/logits": 0.3259766954928637, + "step": 1710 + }, + { + "epoch": 0.05733333333333333, + "grad_norm": 47.5, + "grad_norm_var": 12.4875, + "learning_rate": 0.0001, + "loss": 10.0156, + "loss/crossentropy": 2.2491456001996992, + "loss/hidden": 4.46796875, + "loss/jsd": 0.0, + "loss/logits": 0.32154099717736245, + "step": 1720 + }, + { + "epoch": 0.057666666666666665, + "grad_norm": 51.5, + "grad_norm_var": 22.698958333333334, + "learning_rate": 0.0001, + "loss": 10.1127, + "loss/crossentropy": 2.2360637068748472, + "loss/hidden": 4.599609375, + "loss/jsd": 0.0, + "loss/logits": 0.34004257917404174, + "step": 1730 + }, + { + "epoch": 0.058, + "grad_norm": 46.5, + "grad_norm_var": 20.483072916666668, + "learning_rate": 0.0001, + "loss": 9.9202, + "loss/crossentropy": 2.169334437698126, + "loss/hidden": 4.485546875, + "loss/jsd": 0.0, + "loss/logits": 0.3213648945093155, + "step": 1740 + }, + { + "epoch": 0.058333333333333334, + "grad_norm": 47.0, + "grad_norm_var": 18.501822916666665, + "learning_rate": 0.0001, + "loss": 9.9663, + "loss/crossentropy": 2.0624472610652447, + "loss/hidden": 4.672265625, + "loss/jsd": 0.0, + "loss/logits": 0.33040957022458317, + "step": 1750 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 58.75, + "grad_norm_var": 20.214322916666667, + "learning_rate": 0.0001, + "loss": 9.903, + "loss/crossentropy": 2.3360592156648634, + "loss/hidden": 4.465625, + "loss/jsd": 0.0, + "loss/logits": 0.34421659298241136, + "step": 1760 + }, + { + "epoch": 0.059, + "grad_norm": 52.75, + "grad_norm_var": 22.5125, + "learning_rate": 0.0001, + "loss": 10.0066, + "loss/crossentropy": 2.2802910655736923, + "loss/hidden": 4.39609375, + "loss/jsd": 0.0, + "loss/logits": 0.32004439644515514, + "step": 1770 + }, + { + "epoch": 0.059333333333333335, + "grad_norm": 44.25, + "grad_norm_var": 27.908072916666665, + "learning_rate": 0.0001, + "loss": 9.8284, + "loss/crossentropy": 2.3136008724570276, + "loss/hidden": 4.3859375, + "loss/jsd": 0.0, + "loss/logits": 0.31898586712777616, + "step": 1780 + }, + { + "epoch": 0.059666666666666666, + "grad_norm": 45.75, + "grad_norm_var": 33.890625, + "learning_rate": 0.0001, + "loss": 9.9335, + "loss/crossentropy": 2.2768970370292663, + "loss/hidden": 4.32265625, + "loss/jsd": 0.0, + "loss/logits": 0.31204146817326545, + "step": 1790 + }, + { + "epoch": 0.06, + "grad_norm": 45.5, + "grad_norm_var": 22.015625, + "learning_rate": 0.0001, + "loss": 9.9846, + "loss/crossentropy": 2.2742267102003098, + "loss/hidden": 4.58046875, + "loss/jsd": 0.0, + "loss/logits": 0.3438062757253647, + "step": 1800 + }, + { + "epoch": 0.060333333333333336, + "grad_norm": 44.25, + "grad_norm_var": 409.8541666666667, + "learning_rate": 0.0001, + "loss": 9.9716, + "loss/crossentropy": 2.2834268152713775, + "loss/hidden": 4.514453125, + "loss/jsd": 0.0, + "loss/logits": 0.33578878715634347, + "step": 1810 + }, + { + "epoch": 0.06066666666666667, + "grad_norm": 65.0, + "grad_norm_var": 437.6489583333333, + "learning_rate": 0.0001, + "loss": 10.0204, + "loss/crossentropy": 2.2084247410297393, + "loss/hidden": 4.514453125, + "loss/jsd": 0.0, + "loss/logits": 0.3214238926768303, + "step": 1820 + }, + { + "epoch": 0.061, + "grad_norm": 47.25, + "grad_norm_var": 63.00390625, + "learning_rate": 0.0001, + "loss": 9.8694, + "loss/crossentropy": 2.4278147757053374, + "loss/hidden": 4.340625, + "loss/jsd": 0.0, + "loss/logits": 0.3172066226601601, + "step": 1830 + }, + { + "epoch": 0.06133333333333333, + "grad_norm": 40.0, + "grad_norm_var": 51.55807291666667, + "learning_rate": 0.0001, + "loss": 9.9545, + "loss/crossentropy": 2.2956237584352492, + "loss/hidden": 4.4421875, + "loss/jsd": 0.0, + "loss/logits": 0.33112434335052965, + "step": 1840 + }, + { + "epoch": 0.06166666666666667, + "grad_norm": 44.5, + "grad_norm_var": 62.907291666666666, + "learning_rate": 0.0001, + "loss": 9.9812, + "loss/crossentropy": 2.143566229194403, + "loss/hidden": 4.630078125, + "loss/jsd": 0.0, + "loss/logits": 0.3249357048422098, + "step": 1850 + }, + { + "epoch": 0.062, + "grad_norm": 66.0, + "grad_norm_var": 63.49166666666667, + "learning_rate": 0.0001, + "loss": 10.0621, + "loss/crossentropy": 2.112970842421055, + "loss/hidden": 4.344140625, + "loss/jsd": 0.0, + "loss/logits": 0.290663880482316, + "step": 1860 + }, + { + "epoch": 0.06233333333333333, + "grad_norm": 48.0, + "grad_norm_var": 91.80416666666666, + "learning_rate": 0.0001, + "loss": 9.9348, + "loss/crossentropy": 2.1757256247103216, + "loss/hidden": 4.416796875, + "loss/jsd": 0.0, + "loss/logits": 0.315375828742981, + "step": 1870 + }, + { + "epoch": 0.06266666666666666, + "grad_norm": 48.25, + "grad_norm_var": 2.0336566681475924e+18, + "learning_rate": 0.0001, + "loss": 10.1178, + "loss/crossentropy": 2.209875613451004, + "loss/hidden": 4.583203125, + "loss/jsd": 0.0, + "loss/logits": 0.3451205603778362, + "step": 1880 + }, + { + "epoch": 0.063, + "grad_norm": 38.25, + "grad_norm_var": 378.6205729166667, + "learning_rate": 0.0001, + "loss": 9.8686, + "loss/crossentropy": 2.234115143120289, + "loss/hidden": 4.44296875, + "loss/jsd": 0.0, + "loss/logits": 0.31297464594244956, + "step": 1890 + }, + { + "epoch": 0.06333333333333334, + "grad_norm": 52.0, + "grad_norm_var": 30.382291666666667, + "learning_rate": 0.0001, + "loss": 9.9681, + "loss/crossentropy": 2.314877039194107, + "loss/hidden": 4.561328125, + "loss/jsd": 0.0, + "loss/logits": 0.34244176670908927, + "step": 1900 + }, + { + "epoch": 0.06366666666666666, + "grad_norm": 59.0, + "grad_norm_var": 534.7708333333334, + "learning_rate": 0.0001, + "loss": 10.1051, + "loss/crossentropy": 2.148053403198719, + "loss/hidden": 4.525, + "loss/jsd": 0.0, + "loss/logits": 0.3273327838629484, + "step": 1910 + }, + { + "epoch": 0.064, + "grad_norm": 47.25, + "grad_norm_var": 553.11015625, + "learning_rate": 0.0001, + "loss": 9.9442, + "loss/crossentropy": 2.3466587856411936, + "loss/hidden": 4.334765625, + "loss/jsd": 0.0, + "loss/logits": 0.31107306741178037, + "step": 1920 + }, + { + "epoch": 0.06433333333333334, + "grad_norm": 50.0, + "grad_norm_var": 42.541666666666664, + "learning_rate": 0.0001, + "loss": 9.8719, + "loss/crossentropy": 2.252930277585983, + "loss/hidden": 4.3234375, + "loss/jsd": 0.0, + "loss/logits": 0.3164752185344696, + "step": 1930 + }, + { + "epoch": 0.06466666666666666, + "grad_norm": 46.25, + "grad_norm_var": 34.29348958333333, + "learning_rate": 0.0001, + "loss": 9.7802, + "loss/crossentropy": 2.1432655058801173, + "loss/hidden": 4.53671875, + "loss/jsd": 0.0, + "loss/logits": 0.30679955538362264, + "step": 1940 + }, + { + "epoch": 0.065, + "grad_norm": 40.0, + "grad_norm_var": 29.705989583333334, + "learning_rate": 0.0001, + "loss": 9.8725, + "loss/crossentropy": 2.2932953238487244, + "loss/hidden": 4.425, + "loss/jsd": 0.0, + "loss/logits": 0.311083947122097, + "step": 1950 + }, + { + "epoch": 0.06533333333333333, + "grad_norm": 47.25, + "grad_norm_var": 40.08515625, + "learning_rate": 0.0001, + "loss": 9.8434, + "loss/crossentropy": 2.1042870871722696, + "loss/hidden": 4.48125, + "loss/jsd": 0.0, + "loss/logits": 0.31923425998538735, + "step": 1960 + }, + { + "epoch": 0.06566666666666666, + "grad_norm": 47.0, + "grad_norm_var": 32.69583333333333, + "learning_rate": 0.0001, + "loss": 9.9911, + "loss/crossentropy": 2.228940861672163, + "loss/hidden": 4.470703125, + "loss/jsd": 0.0, + "loss/logits": 0.3330340197309852, + "step": 1970 + }, + { + "epoch": 0.066, + "grad_norm": 49.5, + "grad_norm_var": 19.858072916666668, + "learning_rate": 0.0001, + "loss": 9.8909, + "loss/crossentropy": 2.365998923778534, + "loss/hidden": 4.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.3426622122526169, + "step": 1980 + }, + { + "epoch": 0.06633333333333333, + "grad_norm": 70.0, + "grad_norm_var": 43.84166666666667, + "learning_rate": 0.0001, + "loss": 9.8246, + "loss/crossentropy": 2.21089443564415, + "loss/hidden": 4.508203125, + "loss/jsd": 0.0, + "loss/logits": 0.33040032908320427, + "step": 1990 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 46.0, + "grad_norm_var": 200.29140625, + "learning_rate": 0.0001, + "loss": 9.9707, + "loss/crossentropy": 2.161339648067951, + "loss/hidden": 4.59375, + "loss/jsd": 0.0, + "loss/logits": 0.33537587746977804, + "step": 2000 + }, + { + "epoch": 0.067, + "grad_norm": 41.0, + "grad_norm_var": 34.44166666666667, + "learning_rate": 0.0001, + "loss": 9.8169, + "loss/crossentropy": 2.162997691333294, + "loss/hidden": 4.3140625, + "loss/jsd": 0.0, + "loss/logits": 0.3058626361191273, + "step": 2010 + }, + { + "epoch": 0.06733333333333333, + "grad_norm": 48.75, + "grad_norm_var": 776.290625, + "learning_rate": 0.0001, + "loss": 9.8411, + "loss/crossentropy": 2.1134648233652116, + "loss/hidden": 4.4953125, + "loss/jsd": 0.0, + "loss/logits": 0.3073283813893795, + "step": 2020 + }, + { + "epoch": 0.06766666666666667, + "grad_norm": 44.0, + "grad_norm_var": 17.798958333333335, + "learning_rate": 0.0001, + "loss": 9.8272, + "loss/crossentropy": 2.1669696398079394, + "loss/hidden": 4.311328125, + "loss/jsd": 0.0, + "loss/logits": 0.3024018405005336, + "step": 2030 + }, + { + "epoch": 0.068, + "grad_norm": 52.75, + "grad_norm_var": 39.25, + "learning_rate": 0.0001, + "loss": 9.8041, + "loss/crossentropy": 2.1358415842056275, + "loss/hidden": 4.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.2974515471607447, + "step": 2040 + }, + { + "epoch": 0.06833333333333333, + "grad_norm": 50.25, + "grad_norm_var": 372.12682291666664, + "learning_rate": 0.0001, + "loss": 9.8634, + "loss/crossentropy": 2.179739834368229, + "loss/hidden": 4.478125, + "loss/jsd": 0.0, + "loss/logits": 0.31879689246416093, + "step": 2050 + }, + { + "epoch": 0.06866666666666667, + "grad_norm": 55.5, + "grad_norm_var": 1220.3247395833334, + "learning_rate": 0.0001, + "loss": 9.7749, + "loss/crossentropy": 2.196292628347874, + "loss/hidden": 4.4796875, + "loss/jsd": 0.0, + "loss/logits": 0.3256720818579197, + "step": 2060 + }, + { + "epoch": 0.069, + "grad_norm": 63.75, + "grad_norm_var": 930.6125, + "learning_rate": 0.0001, + "loss": 9.893, + "loss/crossentropy": 2.2043800972402097, + "loss/hidden": 4.47421875, + "loss/jsd": 0.0, + "loss/logits": 0.3335278692655265, + "step": 2070 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 44.75, + "grad_norm_var": 79.98723958333333, + "learning_rate": 0.0001, + "loss": 9.8999, + "loss/crossentropy": 2.3483674988150596, + "loss/hidden": 4.380078125, + "loss/jsd": 0.0, + "loss/logits": 0.33991905823349955, + "step": 2080 + }, + { + "epoch": 0.06966666666666667, + "grad_norm": 47.0, + "grad_norm_var": 29.093489583333334, + "learning_rate": 0.0001, + "loss": 9.9509, + "loss/crossentropy": 2.418730080127716, + "loss/hidden": 4.301953125, + "loss/jsd": 0.0, + "loss/logits": 0.31339589357376096, + "step": 2090 + }, + { + "epoch": 0.07, + "grad_norm": 42.0, + "grad_norm_var": 20.145572916666666, + "learning_rate": 0.0001, + "loss": 9.8714, + "loss/crossentropy": 2.3859033226966857, + "loss/hidden": 4.46953125, + "loss/jsd": 0.0, + "loss/logits": 0.33473276533186436, + "step": 2100 + }, + { + "epoch": 0.07033333333333333, + "grad_norm": 45.75, + "grad_norm_var": 18.748958333333334, + "learning_rate": 0.0001, + "loss": 9.6503, + "loss/crossentropy": 2.0000314809381963, + "loss/hidden": 4.35859375, + "loss/jsd": 0.0, + "loss/logits": 0.28535501156002285, + "step": 2110 + }, + { + "epoch": 0.07066666666666667, + "grad_norm": 50.5, + "grad_norm_var": 104.64348958333333, + "learning_rate": 0.0001, + "loss": 9.8653, + "loss/crossentropy": 2.3264140084385874, + "loss/hidden": 4.554296875, + "loss/jsd": 0.0, + "loss/logits": 0.3423323597759008, + "step": 2120 + }, + { + "epoch": 0.071, + "grad_norm": 49.25, + "grad_norm_var": 111.43932291666667, + "learning_rate": 0.0001, + "loss": 9.8327, + "loss/crossentropy": 2.2525949284434317, + "loss/hidden": 4.52578125, + "loss/jsd": 0.0, + "loss/logits": 0.3271168455481529, + "step": 2130 + }, + { + "epoch": 0.07133333333333333, + "grad_norm": 48.25, + "grad_norm_var": 34.56848958333333, + "learning_rate": 0.0001, + "loss": 9.7428, + "loss/crossentropy": 2.1882525816559792, + "loss/hidden": 4.42734375, + "loss/jsd": 0.0, + "loss/logits": 0.30069184843450786, + "step": 2140 + }, + { + "epoch": 0.07166666666666667, + "grad_norm": 38.75, + "grad_norm_var": 36.018489583333334, + "learning_rate": 0.0001, + "loss": 9.6677, + "loss/crossentropy": 2.149352750182152, + "loss/hidden": 4.32890625, + "loss/jsd": 0.0, + "loss/logits": 0.30152420345693826, + "step": 2150 + }, + { + "epoch": 0.072, + "grad_norm": 47.5, + "grad_norm_var": 41.665625, + "learning_rate": 0.0001, + "loss": 9.7964, + "loss/crossentropy": 2.191788887232542, + "loss/hidden": 4.412890625, + "loss/jsd": 0.0, + "loss/logits": 0.3204044759273529, + "step": 2160 + }, + { + "epoch": 0.07233333333333333, + "grad_norm": 43.75, + "grad_norm_var": 21.66015625, + "learning_rate": 0.0001, + "loss": 9.774, + "loss/crossentropy": 2.057549092173576, + "loss/hidden": 4.312890625, + "loss/jsd": 0.0, + "loss/logits": 0.29129388704895975, + "step": 2170 + }, + { + "epoch": 0.07266666666666667, + "grad_norm": 69.5, + "grad_norm_var": 1.4591662505790013e+18, + "learning_rate": 0.0001, + "loss": 9.8565, + "loss/crossentropy": 2.1569569408893585, + "loss/hidden": 4.54453125, + "loss/jsd": 0.0, + "loss/logits": 0.3287381026893854, + "step": 2180 + }, + { + "epoch": 0.073, + "grad_norm": 41.5, + "grad_norm_var": 1.459166249522037e+18, + "learning_rate": 0.0001, + "loss": 9.7733, + "loss/crossentropy": 2.166168002039194, + "loss/hidden": 4.42109375, + "loss/jsd": 0.0, + "loss/logits": 0.314485302567482, + "step": 2190 + }, + { + "epoch": 0.07333333333333333, + "grad_norm": 51.5, + "grad_norm_var": 15.843489583333334, + "learning_rate": 0.0001, + "loss": 10.0454, + "loss/crossentropy": 2.2209738835692407, + "loss/hidden": 4.4734375, + "loss/jsd": 0.0, + "loss/logits": 0.3463482726365328, + "step": 2200 + }, + { + "epoch": 0.07366666666666667, + "grad_norm": 47.25, + "grad_norm_var": 25.479166666666668, + "learning_rate": 0.0001, + "loss": 9.6539, + "loss/crossentropy": 2.225794421136379, + "loss/hidden": 4.46640625, + "loss/jsd": 0.0, + "loss/logits": 0.3246209166944027, + "step": 2210 + }, + { + "epoch": 0.074, + "grad_norm": 48.75, + "grad_norm_var": 23.154166666666665, + "learning_rate": 0.0001, + "loss": 9.9317, + "loss/crossentropy": 2.207696130871773, + "loss/hidden": 4.442578125, + "loss/jsd": 0.0, + "loss/logits": 0.33632578104734423, + "step": 2220 + }, + { + "epoch": 0.07433333333333333, + "grad_norm": 44.0, + "grad_norm_var": 17.530989583333334, + "learning_rate": 0.0001, + "loss": 9.8728, + "loss/crossentropy": 1.9358359836041927, + "loss/hidden": 4.57578125, + "loss/jsd": 0.0, + "loss/logits": 0.30893346965312957, + "step": 2230 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 41.25, + "grad_norm_var": 356.12682291666664, + "learning_rate": 0.0001, + "loss": 9.7122, + "loss/crossentropy": 2.1984036192297935, + "loss/hidden": 4.46171875, + "loss/jsd": 0.0, + "loss/logits": 0.30973851270973685, + "step": 2240 + }, + { + "epoch": 0.075, + "grad_norm": 48.25, + "grad_norm_var": 126.44166666666666, + "learning_rate": 0.0001, + "loss": 9.9046, + "loss/crossentropy": 2.2213550955057144, + "loss/hidden": 4.4125, + "loss/jsd": 0.0, + "loss/logits": 0.32458372712135314, + "step": 2250 + }, + { + "epoch": 0.07533333333333334, + "grad_norm": 42.75, + "grad_norm_var": 140.88515625, + "learning_rate": 0.0001, + "loss": 9.6263, + "loss/crossentropy": 2.2533512063324452, + "loss/hidden": 4.326953125, + "loss/jsd": 0.0, + "loss/logits": 0.2968948673456907, + "step": 2260 + }, + { + "epoch": 0.07566666666666666, + "grad_norm": 40.75, + "grad_norm_var": 30.940625, + "learning_rate": 0.0001, + "loss": 9.6899, + "loss/crossentropy": 2.157477790862322, + "loss/hidden": 4.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.31756968759000304, + "step": 2270 + }, + { + "epoch": 0.076, + "grad_norm": 40.25, + "grad_norm_var": 48.38098958333333, + "learning_rate": 0.0001, + "loss": 9.767, + "loss/crossentropy": 2.1325583457946777, + "loss/hidden": 4.4203125, + "loss/jsd": 0.0, + "loss/logits": 0.3174896206706762, + "step": 2280 + }, + { + "epoch": 0.07633333333333334, + "grad_norm": 44.0, + "grad_norm_var": 292.840625, + "learning_rate": 0.0001, + "loss": 9.6629, + "loss/crossentropy": 2.116827255487442, + "loss/hidden": 4.578125, + "loss/jsd": 0.0, + "loss/logits": 0.32861895225942134, + "step": 2290 + }, + { + "epoch": 0.07666666666666666, + "grad_norm": 45.75, + "grad_norm_var": 20.098958333333332, + "learning_rate": 0.0001, + "loss": 9.848, + "loss/crossentropy": 2.2478859812021255, + "loss/hidden": 4.432421875, + "loss/jsd": 0.0, + "loss/logits": 0.3231295388191938, + "step": 2300 + }, + { + "epoch": 0.077, + "grad_norm": 44.5, + "grad_norm_var": 21.03515625, + "learning_rate": 0.0001, + "loss": 9.7126, + "loss/crossentropy": 2.2487671941518785, + "loss/hidden": 4.370703125, + "loss/jsd": 0.0, + "loss/logits": 0.3190602418035269, + "step": 2310 + }, + { + "epoch": 0.07733333333333334, + "grad_norm": 48.0, + "grad_norm_var": 78.95, + "learning_rate": 0.0001, + "loss": 9.8802, + "loss/crossentropy": 2.0769578374922277, + "loss/hidden": 4.3765625, + "loss/jsd": 0.0, + "loss/logits": 0.2989305056631565, + "step": 2320 + }, + { + "epoch": 0.07766666666666666, + "grad_norm": 47.0, + "grad_norm_var": 88.0875, + "learning_rate": 0.0001, + "loss": 9.6776, + "loss/crossentropy": 2.2335385888814927, + "loss/hidden": 4.439453125, + "loss/jsd": 0.0, + "loss/logits": 0.3145127721130848, + "step": 2330 + }, + { + "epoch": 0.078, + "grad_norm": 45.0, + "grad_norm_var": 29.162239583333335, + "learning_rate": 0.0001, + "loss": 9.7538, + "loss/crossentropy": 2.0275358721613883, + "loss/hidden": 4.390234375, + "loss/jsd": 0.0, + "loss/logits": 0.3278283253312111, + "step": 2340 + }, + { + "epoch": 0.07833333333333334, + "grad_norm": 46.0, + "grad_norm_var": 15.825, + "learning_rate": 0.0001, + "loss": 9.7288, + "loss/crossentropy": 2.2636055946350098, + "loss/hidden": 4.3, + "loss/jsd": 0.0, + "loss/logits": 0.3025582984089851, + "step": 2350 + }, + { + "epoch": 0.07866666666666666, + "grad_norm": 44.25, + "grad_norm_var": 13.548958333333333, + "learning_rate": 0.0001, + "loss": 9.5923, + "loss/crossentropy": 2.0806369572877883, + "loss/hidden": 4.4328125, + "loss/jsd": 0.0, + "loss/logits": 0.3299530727788806, + "step": 2360 + }, + { + "epoch": 0.079, + "grad_norm": 48.75, + "grad_norm_var": 22.079166666666666, + "learning_rate": 0.0001, + "loss": 9.5852, + "loss/crossentropy": 2.3268432706594466, + "loss/hidden": 4.3140625, + "loss/jsd": 0.0, + "loss/logits": 0.30601568184792993, + "step": 2370 + }, + { + "epoch": 0.07933333333333334, + "grad_norm": 42.75, + "grad_norm_var": 20.670833333333334, + "learning_rate": 0.0001, + "loss": 9.5338, + "loss/crossentropy": 2.1361410327255728, + "loss/hidden": 4.1984375, + "loss/jsd": 0.0, + "loss/logits": 0.27998521625995637, + "step": 2380 + }, + { + "epoch": 0.07966666666666666, + "grad_norm": 49.25, + "grad_norm_var": 15.266666666666667, + "learning_rate": 0.0001, + "loss": 9.5332, + "loss/crossentropy": 2.1129361763596535, + "loss/hidden": 4.35390625, + "loss/jsd": 0.0, + "loss/logits": 0.29650157541036604, + "step": 2390 + }, + { + "epoch": 0.08, + "grad_norm": 41.5, + "grad_norm_var": 25.429166666666667, + "learning_rate": 0.0001, + "loss": 9.6695, + "loss/crossentropy": 2.1574720084667205, + "loss/hidden": 4.269921875, + "loss/jsd": 0.0, + "loss/logits": 0.2908360369503498, + "step": 2400 + }, + { + "epoch": 0.08033333333333334, + "grad_norm": 47.75, + "grad_norm_var": 27.645572916666666, + "learning_rate": 0.0001, + "loss": 9.6921, + "loss/crossentropy": 2.1829341441392898, + "loss/hidden": 4.381640625, + "loss/jsd": 0.0, + "loss/logits": 0.32286781407892706, + "step": 2410 + }, + { + "epoch": 0.08066666666666666, + "grad_norm": 39.25, + "grad_norm_var": 50.555989583333336, + "learning_rate": 0.0001, + "loss": 9.6315, + "loss/crossentropy": 2.202047623693943, + "loss/hidden": 4.3578125, + "loss/jsd": 0.0, + "loss/logits": 0.3005070973187685, + "step": 2420 + }, + { + "epoch": 0.081, + "grad_norm": 55.0, + "grad_norm_var": 59.32083333333333, + "learning_rate": 0.0001, + "loss": 9.6481, + "loss/crossentropy": 2.036600667051971, + "loss/hidden": 4.3484375, + "loss/jsd": 0.0, + "loss/logits": 0.29114639018662275, + "step": 2430 + }, + { + "epoch": 0.08133333333333333, + "grad_norm": 46.75, + "grad_norm_var": 23.999739583333334, + "learning_rate": 0.0001, + "loss": 9.6458, + "loss/crossentropy": 2.2270599991083144, + "loss/hidden": 4.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.29978666119277475, + "step": 2440 + }, + { + "epoch": 0.08166666666666667, + "grad_norm": 44.25, + "grad_norm_var": 12.179166666666667, + "learning_rate": 0.0001, + "loss": 9.5471, + "loss/crossentropy": 2.024305185675621, + "loss/hidden": 4.57265625, + "loss/jsd": 0.0, + "loss/logits": 0.3118948549032211, + "step": 2450 + }, + { + "epoch": 0.082, + "grad_norm": 44.75, + "grad_norm_var": 34.35729166666667, + "learning_rate": 0.0001, + "loss": 9.6578, + "loss/crossentropy": 2.293112243711948, + "loss/hidden": 4.36640625, + "loss/jsd": 0.0, + "loss/logits": 0.3087532136589289, + "step": 2460 + }, + { + "epoch": 0.08233333333333333, + "grad_norm": 45.5, + "grad_norm_var": 54.340625, + "learning_rate": 0.0001, + "loss": 9.8106, + "loss/crossentropy": 2.159098155796528, + "loss/hidden": 4.462109375, + "loss/jsd": 0.0, + "loss/logits": 0.33799122273921967, + "step": 2470 + }, + { + "epoch": 0.08266666666666667, + "grad_norm": 40.0, + "grad_norm_var": 48.490625, + "learning_rate": 0.0001, + "loss": 9.6198, + "loss/crossentropy": 2.1624063357710837, + "loss/hidden": 4.332421875, + "loss/jsd": 0.0, + "loss/logits": 0.2944056583568454, + "step": 2480 + }, + { + "epoch": 0.083, + "grad_norm": 40.75, + "grad_norm_var": 10.579166666666667, + "learning_rate": 0.0001, + "loss": 9.3685, + "loss/crossentropy": 1.9723315440118312, + "loss/hidden": 4.365234375, + "loss/jsd": 0.0, + "loss/logits": 0.2733167437836528, + "step": 2490 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 45.5, + "grad_norm_var": 13.723958333333334, + "learning_rate": 0.0001, + "loss": 9.7588, + "loss/crossentropy": 2.3331384271383286, + "loss/hidden": 4.4953125, + "loss/jsd": 0.0, + "loss/logits": 0.3479126874357462, + "step": 2500 + }, + { + "epoch": 0.08366666666666667, + "grad_norm": 86.5, + "grad_norm_var": 176.58098958333332, + "learning_rate": 0.0001, + "loss": 9.604, + "loss/crossentropy": 1.999950359016657, + "loss/hidden": 4.355078125, + "loss/jsd": 0.0, + "loss/logits": 0.2928037021309137, + "step": 2510 + }, + { + "epoch": 0.084, + "grad_norm": 49.75, + "grad_norm_var": 139.44140625, + "learning_rate": 0.0001, + "loss": 9.6797, + "loss/crossentropy": 2.1708203181624413, + "loss/hidden": 4.387109375, + "loss/jsd": 0.0, + "loss/logits": 0.32291998714208603, + "step": 2520 + }, + { + "epoch": 0.08433333333333333, + "grad_norm": 42.25, + "grad_norm_var": 8.774739583333334, + "learning_rate": 0.0001, + "loss": 9.5592, + "loss/crossentropy": 2.2025649711489677, + "loss/hidden": 4.34453125, + "loss/jsd": 0.0, + "loss/logits": 0.29643226135522127, + "step": 2530 + }, + { + "epoch": 0.08466666666666667, + "grad_norm": 46.25, + "grad_norm_var": 9.9875, + "learning_rate": 0.0001, + "loss": 9.5765, + "loss/crossentropy": 2.043403333425522, + "loss/hidden": 4.23671875, + "loss/jsd": 0.0, + "loss/logits": 0.2749296611174941, + "step": 2540 + }, + { + "epoch": 0.085, + "grad_norm": 48.75, + "grad_norm_var": 31.115625, + "learning_rate": 0.0001, + "loss": 9.7494, + "loss/crossentropy": 2.3262161046266554, + "loss/hidden": 4.371875, + "loss/jsd": 0.0, + "loss/logits": 0.32624533101916314, + "step": 2550 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 48.75, + "grad_norm_var": 57.76015625, + "learning_rate": 0.0001, + "loss": 9.7786, + "loss/crossentropy": 2.222296068072319, + "loss/hidden": 4.425390625, + "loss/jsd": 0.0, + "loss/logits": 0.3247109234333038, + "step": 2560 + }, + { + "epoch": 0.08566666666666667, + "grad_norm": 42.25, + "grad_norm_var": 79.12265625, + "learning_rate": 0.0001, + "loss": 9.5314, + "loss/crossentropy": 2.2209610506892203, + "loss/hidden": 4.214453125, + "loss/jsd": 0.0, + "loss/logits": 0.3056815842166543, + "step": 2570 + }, + { + "epoch": 0.086, + "grad_norm": 40.25, + "grad_norm_var": 11.74765625, + "learning_rate": 0.0001, + "loss": 9.6404, + "loss/crossentropy": 2.27297485768795, + "loss/hidden": 4.391015625, + "loss/jsd": 0.0, + "loss/logits": 0.3162984177470207, + "step": 2580 + }, + { + "epoch": 0.08633333333333333, + "grad_norm": 43.75, + "grad_norm_var": 49.432291666666664, + "learning_rate": 0.0001, + "loss": 9.7363, + "loss/crossentropy": 2.1449129566550256, + "loss/hidden": 4.16640625, + "loss/jsd": 0.0, + "loss/logits": 0.2817653050646186, + "step": 2590 + }, + { + "epoch": 0.08666666666666667, + "grad_norm": 50.25, + "grad_norm_var": 38.78307291666667, + "learning_rate": 0.0001, + "loss": 9.7758, + "loss/crossentropy": 2.1352466866374016, + "loss/hidden": 4.255078125, + "loss/jsd": 0.0, + "loss/logits": 0.2975259907543659, + "step": 2600 + }, + { + "epoch": 0.087, + "grad_norm": 44.0, + "grad_norm_var": 567.5809895833333, + "learning_rate": 0.0001, + "loss": 9.8575, + "loss/crossentropy": 2.139151658862829, + "loss/hidden": 4.4125, + "loss/jsd": 0.0, + "loss/logits": 0.2895995612256229, + "step": 2610 + }, + { + "epoch": 0.08733333333333333, + "grad_norm": 52.0, + "grad_norm_var": 273.78932291666666, + "learning_rate": 0.0001, + "loss": 9.6349, + "loss/crossentropy": 2.2437764003872873, + "loss/hidden": 4.2875, + "loss/jsd": 0.0, + "loss/logits": 0.32964606285095216, + "step": 2620 + }, + { + "epoch": 0.08766666666666667, + "grad_norm": 41.25, + "grad_norm_var": 14.832291666666666, + "learning_rate": 0.0001, + "loss": 9.8805, + "loss/crossentropy": 2.3176336243748663, + "loss/hidden": 4.34296875, + "loss/jsd": 0.0, + "loss/logits": 0.3469231605529785, + "step": 2630 + }, + { + "epoch": 0.088, + "grad_norm": 42.5, + "grad_norm_var": 63.924739583333334, + "learning_rate": 0.0001, + "loss": 9.6733, + "loss/crossentropy": 2.3317414090037345, + "loss/hidden": 4.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.3158239943906665, + "step": 2640 + }, + { + "epoch": 0.08833333333333333, + "grad_norm": 42.0, + "grad_norm_var": 47.2625, + "learning_rate": 0.0001, + "loss": 9.53, + "loss/crossentropy": 2.085783836245537, + "loss/hidden": 4.154296875, + "loss/jsd": 0.0, + "loss/logits": 0.30830717273056507, + "step": 2650 + }, + { + "epoch": 0.08866666666666667, + "grad_norm": 44.75, + "grad_norm_var": 10.5125, + "learning_rate": 0.0001, + "loss": 9.6251, + "loss/crossentropy": 2.1499151602387427, + "loss/hidden": 4.29765625, + "loss/jsd": 0.0, + "loss/logits": 0.2804956670850515, + "step": 2660 + }, + { + "epoch": 0.089, + "grad_norm": 38.5, + "grad_norm_var": 11.532291666666667, + "learning_rate": 0.0001, + "loss": 9.6324, + "loss/crossentropy": 2.209009498357773, + "loss/hidden": 4.35, + "loss/jsd": 0.0, + "loss/logits": 0.3170790944248438, + "step": 2670 + }, + { + "epoch": 0.08933333333333333, + "grad_norm": 39.75, + "grad_norm_var": 18.598958333333332, + "learning_rate": 0.0001, + "loss": 9.5224, + "loss/crossentropy": 2.2298269629478455, + "loss/hidden": 4.39765625, + "loss/jsd": 0.0, + "loss/logits": 0.32219739593565466, + "step": 2680 + }, + { + "epoch": 0.08966666666666667, + "grad_norm": 46.0, + "grad_norm_var": 15.04140625, + "learning_rate": 0.0001, + "loss": 9.6132, + "loss/crossentropy": 1.9786877676844596, + "loss/hidden": 4.453515625, + "loss/jsd": 0.0, + "loss/logits": 0.32063512736931443, + "step": 2690 + }, + { + "epoch": 0.09, + "grad_norm": 48.5, + "grad_norm_var": 16.370572916666667, + "learning_rate": 0.0001, + "loss": 9.5549, + "loss/crossentropy": 1.9581148944795133, + "loss/hidden": 4.3390625, + "loss/jsd": 0.0, + "loss/logits": 0.2741738385986537, + "step": 2700 + }, + { + "epoch": 0.09033333333333333, + "grad_norm": 39.25, + "grad_norm_var": 39.25390625, + "learning_rate": 0.0001, + "loss": 9.584, + "loss/crossentropy": 2.3024230673909187, + "loss/hidden": 4.284765625, + "loss/jsd": 0.0, + "loss/logits": 0.31776211857795716, + "step": 2710 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 49.0, + "grad_norm_var": 17.290625, + "learning_rate": 0.0001, + "loss": 9.5623, + "loss/crossentropy": 2.2617855593562126, + "loss/hidden": 4.403515625, + "loss/jsd": 0.0, + "loss/logits": 0.32258614907041194, + "step": 2720 + }, + { + "epoch": 0.091, + "grad_norm": 42.5, + "grad_norm_var": 13.182291666666666, + "learning_rate": 0.0001, + "loss": 9.6273, + "loss/crossentropy": 2.079073026776314, + "loss/hidden": 4.480859375, + "loss/jsd": 0.0, + "loss/logits": 0.3417033176869154, + "step": 2730 + }, + { + "epoch": 0.09133333333333334, + "grad_norm": 42.75, + "grad_norm_var": 13.691666666666666, + "learning_rate": 0.0001, + "loss": 9.5346, + "loss/crossentropy": 2.0505348153412344, + "loss/hidden": 4.206640625, + "loss/jsd": 0.0, + "loss/logits": 0.2786023462191224, + "step": 2740 + }, + { + "epoch": 0.09166666666666666, + "grad_norm": 41.0, + "grad_norm_var": 14.207291666666666, + "learning_rate": 0.0001, + "loss": 9.4132, + "loss/crossentropy": 2.0336243584752083, + "loss/hidden": 4.423828125, + "loss/jsd": 0.0, + "loss/logits": 0.3069038312882185, + "step": 2750 + }, + { + "epoch": 0.092, + "grad_norm": 42.25, + "grad_norm_var": 22.057291666666668, + "learning_rate": 0.0001, + "loss": 9.5053, + "loss/crossentropy": 2.140151581168175, + "loss/hidden": 4.319140625, + "loss/jsd": 0.0, + "loss/logits": 0.2916400883346796, + "step": 2760 + }, + { + "epoch": 0.09233333333333334, + "grad_norm": 39.75, + "grad_norm_var": 15.432291666666666, + "learning_rate": 0.0001, + "loss": 9.5438, + "loss/crossentropy": 2.1993202224373816, + "loss/hidden": 4.226953125, + "loss/jsd": 0.0, + "loss/logits": 0.28949977159500123, + "step": 2770 + }, + { + "epoch": 0.09266666666666666, + "grad_norm": 41.75, + "grad_norm_var": 12.848958333333334, + "learning_rate": 0.0001, + "loss": 9.3797, + "loss/crossentropy": 2.1902914479374886, + "loss/hidden": 4.280859375, + "loss/jsd": 0.0, + "loss/logits": 0.28133582808077334, + "step": 2780 + }, + { + "epoch": 0.093, + "grad_norm": 46.25, + "grad_norm_var": 19.35, + "learning_rate": 0.0001, + "loss": 9.5715, + "loss/crossentropy": 2.249551972001791, + "loss/hidden": 4.199609375, + "loss/jsd": 0.0, + "loss/logits": 0.2843886561691761, + "step": 2790 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 41.0, + "grad_norm_var": 20.740625, + "learning_rate": 0.0001, + "loss": 9.6114, + "loss/crossentropy": 2.228690019249916, + "loss/hidden": 4.262109375, + "loss/jsd": 0.0, + "loss/logits": 0.320504542812705, + "step": 2800 + }, + { + "epoch": 0.09366666666666666, + "grad_norm": 43.0, + "grad_norm_var": 8.529166666666667, + "learning_rate": 0.0001, + "loss": 9.5136, + "loss/crossentropy": 2.291595259308815, + "loss/hidden": 4.24375, + "loss/jsd": 0.0, + "loss/logits": 0.3068136487156153, + "step": 2810 + }, + { + "epoch": 0.094, + "grad_norm": 35.25, + "grad_norm_var": 74.32395833333334, + "learning_rate": 0.0001, + "loss": 9.51, + "loss/crossentropy": 2.192962332069874, + "loss/hidden": 4.22265625, + "loss/jsd": 0.0, + "loss/logits": 0.2983078990131617, + "step": 2820 + }, + { + "epoch": 0.09433333333333334, + "grad_norm": 39.75, + "grad_norm_var": 191.76640625, + "learning_rate": 0.0001, + "loss": 9.5953, + "loss/crossentropy": 2.2489975869655607, + "loss/hidden": 4.309375, + "loss/jsd": 0.0, + "loss/logits": 0.3156152920797467, + "step": 2830 + }, + { + "epoch": 0.09466666666666666, + "grad_norm": 33.75, + "grad_norm_var": 51.555989583333336, + "learning_rate": 0.0001, + "loss": 9.5037, + "loss/crossentropy": 2.1171005085110663, + "loss/hidden": 4.358984375, + "loss/jsd": 0.0, + "loss/logits": 0.29232311621308327, + "step": 2840 + }, + { + "epoch": 0.095, + "grad_norm": 41.25, + "grad_norm_var": 15.114322916666667, + "learning_rate": 0.0001, + "loss": 9.4713, + "loss/crossentropy": 2.1562278002500532, + "loss/hidden": 4.270703125, + "loss/jsd": 0.0, + "loss/logits": 0.32199123315513134, + "step": 2850 + }, + { + "epoch": 0.09533333333333334, + "grad_norm": 50.25, + "grad_norm_var": 15.640625, + "learning_rate": 0.0001, + "loss": 9.4132, + "loss/crossentropy": 2.1052547857165336, + "loss/hidden": 4.25390625, + "loss/jsd": 0.0, + "loss/logits": 0.28381253518164157, + "step": 2860 + }, + { + "epoch": 0.09566666666666666, + "grad_norm": 38.75, + "grad_norm_var": 13.95, + "learning_rate": 0.0001, + "loss": 9.5959, + "loss/crossentropy": 2.2764726355671883, + "loss/hidden": 4.232421875, + "loss/jsd": 0.0, + "loss/logits": 0.3202834574505687, + "step": 2870 + }, + { + "epoch": 0.096, + "grad_norm": 36.5, + "grad_norm_var": 6.601822916666666, + "learning_rate": 0.0001, + "loss": 9.4651, + "loss/crossentropy": 2.2641511857509613, + "loss/hidden": 4.339453125, + "loss/jsd": 0.0, + "loss/logits": 0.30131282322108743, + "step": 2880 + }, + { + "epoch": 0.09633333333333334, + "grad_norm": 39.75, + "grad_norm_var": 259.33229166666666, + "learning_rate": 0.0001, + "loss": 9.5955, + "loss/crossentropy": 2.1798644959926605, + "loss/hidden": 4.316015625, + "loss/jsd": 0.0, + "loss/logits": 0.321273997426033, + "step": 2890 + }, + { + "epoch": 0.09666666666666666, + "grad_norm": 42.25, + "grad_norm_var": 265.51015625, + "learning_rate": 0.0001, + "loss": 9.4789, + "loss/crossentropy": 2.2448938064277173, + "loss/hidden": 4.21953125, + "loss/jsd": 0.0, + "loss/logits": 0.295270549505949, + "step": 2900 + }, + { + "epoch": 0.097, + "grad_norm": 39.5, + "grad_norm_var": 22.404166666666665, + "learning_rate": 0.0001, + "loss": 9.4008, + "loss/crossentropy": 2.1135250240564347, + "loss/hidden": 4.376953125, + "loss/jsd": 0.0, + "loss/logits": 0.28801401853561404, + "step": 2910 + }, + { + "epoch": 0.09733333333333333, + "grad_norm": 40.0, + "grad_norm_var": 16.864322916666666, + "learning_rate": 0.0001, + "loss": 9.3394, + "loss/crossentropy": 2.2067424938082696, + "loss/hidden": 4.271484375, + "loss/jsd": 0.0, + "loss/logits": 0.28688893765211104, + "step": 2920 + }, + { + "epoch": 0.09766666666666667, + "grad_norm": 46.25, + "grad_norm_var": 16.148958333333333, + "learning_rate": 0.0001, + "loss": 9.5709, + "loss/crossentropy": 2.375057080388069, + "loss/hidden": 4.37734375, + "loss/jsd": 0.0, + "loss/logits": 0.3258885521441698, + "step": 2930 + }, + { + "epoch": 0.098, + "grad_norm": 32.25, + "grad_norm_var": 21.30390625, + "learning_rate": 0.0001, + "loss": 9.4697, + "loss/crossentropy": 2.2150216475129128, + "loss/hidden": 4.350390625, + "loss/jsd": 0.0, + "loss/logits": 0.301010762155056, + "step": 2940 + }, + { + "epoch": 0.09833333333333333, + "grad_norm": 40.75, + "grad_norm_var": 14.576822916666666, + "learning_rate": 0.0001, + "loss": 9.5142, + "loss/crossentropy": 2.166199879348278, + "loss/hidden": 4.278125, + "loss/jsd": 0.0, + "loss/logits": 0.30978226438164713, + "step": 2950 + }, + { + "epoch": 0.09866666666666667, + "grad_norm": 43.25, + "grad_norm_var": 6.07265625, + "learning_rate": 0.0001, + "loss": 9.4248, + "loss/crossentropy": 2.0727218955755236, + "loss/hidden": 4.240234375, + "loss/jsd": 0.0, + "loss/logits": 0.2816809505224228, + "step": 2960 + }, + { + "epoch": 0.099, + "grad_norm": 43.0, + "grad_norm_var": 9.79765625, + "learning_rate": 0.0001, + "loss": 9.296, + "loss/crossentropy": 2.194628655910492, + "loss/hidden": 4.23359375, + "loss/jsd": 0.0, + "loss/logits": 0.2876921635121107, + "step": 2970 + }, + { + "epoch": 0.09933333333333333, + "grad_norm": 42.25, + "grad_norm_var": 6.145572916666667, + "learning_rate": 0.0001, + "loss": 9.4016, + "loss/crossentropy": 2.1081935077905656, + "loss/hidden": 4.201171875, + "loss/jsd": 0.0, + "loss/logits": 0.2900088790804148, + "step": 2980 + }, + { + "epoch": 0.09966666666666667, + "grad_norm": 38.75, + "grad_norm_var": 5.890625, + "learning_rate": 0.0001, + "loss": 9.4648, + "loss/crossentropy": 2.121137388050556, + "loss/hidden": 4.300390625, + "loss/jsd": 0.0, + "loss/logits": 0.2918519277125597, + "step": 2990 + }, + { + "epoch": 0.1, + "grad_norm": 44.75, + "grad_norm_var": 328.47083333333336, + "learning_rate": 0.0001, + "loss": 9.6444, + "loss/crossentropy": 2.127225194871426, + "loss/hidden": 4.36328125, + "loss/jsd": 0.0, + "loss/logits": 0.30514415316283705, + "step": 3000 + }, + { + "epoch": 0.10033333333333333, + "grad_norm": 39.5, + "grad_norm_var": 307.89348958333335, + "learning_rate": 0.0001, + "loss": 9.6355, + "loss/crossentropy": 2.250686952471733, + "loss/hidden": 4.398828125, + "loss/jsd": 0.0, + "loss/logits": 0.30530925914645196, + "step": 3010 + }, + { + "epoch": 0.10066666666666667, + "grad_norm": 36.75, + "grad_norm_var": 20.974739583333335, + "learning_rate": 0.0001, + "loss": 9.5967, + "loss/crossentropy": 2.182039903104305, + "loss/hidden": 4.254296875, + "loss/jsd": 0.0, + "loss/logits": 0.29223496429622176, + "step": 3020 + }, + { + "epoch": 0.101, + "grad_norm": 36.25, + "grad_norm_var": 19.032291666666666, + "learning_rate": 0.0001, + "loss": 9.2889, + "loss/crossentropy": 2.193696314841509, + "loss/hidden": 4.347265625, + "loss/jsd": 0.0, + "loss/logits": 0.31729465052485467, + "step": 3030 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 41.25, + "grad_norm_var": 5.1625, + "learning_rate": 0.0001, + "loss": 9.2799, + "loss/crossentropy": 1.9293710552155972, + "loss/hidden": 4.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.29089682549238205, + "step": 3040 + }, + { + "epoch": 0.10166666666666667, + "grad_norm": 46.75, + "grad_norm_var": 94.43932291666667, + "learning_rate": 0.0001, + "loss": 9.5891, + "loss/crossentropy": 2.186572279036045, + "loss/hidden": 4.351171875, + "loss/jsd": 0.0, + "loss/logits": 0.31275569424033167, + "step": 3050 + }, + { + "epoch": 0.102, + "grad_norm": 39.5, + "grad_norm_var": 96.21015625, + "learning_rate": 0.0001, + "loss": 9.4484, + "loss/crossentropy": 2.1623737648129464, + "loss/hidden": 4.15390625, + "loss/jsd": 0.0, + "loss/logits": 0.27935802303254603, + "step": 3060 + }, + { + "epoch": 0.10233333333333333, + "grad_norm": 34.0, + "grad_norm_var": 23.951822916666668, + "learning_rate": 0.0001, + "loss": 9.3534, + "loss/crossentropy": 2.1772946141660214, + "loss/hidden": 4.130859375, + "loss/jsd": 0.0, + "loss/logits": 0.2741739235818386, + "step": 3070 + }, + { + "epoch": 0.10266666666666667, + "grad_norm": 40.5, + "grad_norm_var": 1.9625138775123822e+18, + "learning_rate": 0.0001, + "loss": 9.5206, + "loss/crossentropy": 2.2383458808064463, + "loss/hidden": 4.276171875, + "loss/jsd": 0.0, + "loss/logits": 0.28973059728741646, + "step": 3080 + }, + { + "epoch": 0.103, + "grad_norm": 40.25, + "grad_norm_var": 2.9817910414744924e+18, + "learning_rate": 0.0001, + "loss": 9.55, + "loss/crossentropy": 2.3438141733407973, + "loss/hidden": 4.469140625, + "loss/jsd": 0.0, + "loss/logits": 0.3148787975311279, + "step": 3090 + }, + { + "epoch": 0.10333333333333333, + "grad_norm": 40.25, + "grad_norm_var": 1.2261049715428168e+18, + "learning_rate": 0.0001, + "loss": 9.5073, + "loss/crossentropy": 2.3278582096099854, + "loss/hidden": 4.212109375, + "loss/jsd": 0.0, + "loss/logits": 0.29142517112195493, + "step": 3100 + }, + { + "epoch": 0.10366666666666667, + "grad_norm": 38.0, + "grad_norm_var": 291.4830729166667, + "learning_rate": 0.0001, + "loss": 9.424, + "loss/crossentropy": 2.1446138307452203, + "loss/hidden": 4.2796875, + "loss/jsd": 0.0, + "loss/logits": 0.3030157912522554, + "step": 3110 + }, + { + "epoch": 0.104, + "grad_norm": 42.5, + "grad_norm_var": 567.9125, + "learning_rate": 0.0001, + "loss": 9.5702, + "loss/crossentropy": 2.243132984638214, + "loss/hidden": 4.39296875, + "loss/jsd": 0.0, + "loss/logits": 0.35173722021281717, + "step": 3120 + }, + { + "epoch": 0.10433333333333333, + "grad_norm": 35.5, + "grad_norm_var": 327.62395833333335, + "learning_rate": 0.0001, + "loss": 9.5972, + "loss/crossentropy": 2.124360602349043, + "loss/hidden": 4.216015625, + "loss/jsd": 0.0, + "loss/logits": 0.2907536863349378, + "step": 3130 + }, + { + "epoch": 0.10466666666666667, + "grad_norm": 52.75, + "grad_norm_var": 22.407291666666666, + "learning_rate": 0.0001, + "loss": 9.437, + "loss/crossentropy": 2.2831270948052405, + "loss/hidden": 4.153515625, + "loss/jsd": 0.0, + "loss/logits": 0.291528220102191, + "step": 3140 + }, + { + "epoch": 0.105, + "grad_norm": 40.0, + "grad_norm_var": 1.8926377153833818e+18, + "learning_rate": 0.0001, + "loss": 9.4417, + "loss/crossentropy": 2.0570017248392105, + "loss/hidden": 4.233984375, + "loss/jsd": 0.0, + "loss/logits": 0.28441670089960097, + "step": 3150 + }, + { + "epoch": 0.10533333333333333, + "grad_norm": 39.75, + "grad_norm_var": 7.590625, + "learning_rate": 0.0001, + "loss": 9.184, + "loss/crossentropy": 2.362034395337105, + "loss/hidden": 4.20234375, + "loss/jsd": 0.0, + "loss/logits": 0.30373654775321485, + "step": 3160 + }, + { + "epoch": 0.10566666666666667, + "grad_norm": 40.75, + "grad_norm_var": 9.939322916666667, + "learning_rate": 0.0001, + "loss": 9.4179, + "loss/crossentropy": 2.137139005959034, + "loss/hidden": 4.36796875, + "loss/jsd": 0.0, + "loss/logits": 0.30007231421768665, + "step": 3170 + }, + { + "epoch": 0.106, + "grad_norm": 43.25, + "grad_norm_var": 168.83229166666666, + "learning_rate": 0.0001, + "loss": 9.5242, + "loss/crossentropy": 2.084735092520714, + "loss/hidden": 4.266015625, + "loss/jsd": 0.0, + "loss/logits": 0.29063573814928534, + "step": 3180 + }, + { + "epoch": 0.10633333333333334, + "grad_norm": 39.75, + "grad_norm_var": 13.895833333333334, + "learning_rate": 0.0001, + "loss": 9.3732, + "loss/crossentropy": 2.0884271055459975, + "loss/hidden": 4.301953125, + "loss/jsd": 0.0, + "loss/logits": 0.292556369304657, + "step": 3190 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 40.25, + "grad_norm_var": 10.104166666666666, + "learning_rate": 0.0001, + "loss": 9.409, + "loss/crossentropy": 2.1861984208226204, + "loss/hidden": 4.359375, + "loss/jsd": 0.0, + "loss/logits": 0.3046982977539301, + "step": 3200 + }, + { + "epoch": 0.107, + "grad_norm": 33.25, + "grad_norm_var": 19.754166666666666, + "learning_rate": 0.0001, + "loss": 9.333, + "loss/crossentropy": 2.1967382043600083, + "loss/hidden": 4.280859375, + "loss/jsd": 0.0, + "loss/logits": 0.3142522796988487, + "step": 3210 + }, + { + "epoch": 0.10733333333333334, + "grad_norm": 37.75, + "grad_norm_var": 15.83515625, + "learning_rate": 0.0001, + "loss": 9.4087, + "loss/crossentropy": 2.099241575598717, + "loss/hidden": 4.223046875, + "loss/jsd": 0.0, + "loss/logits": 0.3019026231020689, + "step": 3220 + }, + { + "epoch": 0.10766666666666666, + "grad_norm": 36.75, + "grad_norm_var": 12.68515625, + "learning_rate": 0.0001, + "loss": 9.3162, + "loss/crossentropy": 2.0056164607405664, + "loss/hidden": 4.308984375, + "loss/jsd": 0.0, + "loss/logits": 0.2895685002207756, + "step": 3230 + }, + { + "epoch": 0.108, + "grad_norm": 51.75, + "grad_norm_var": 25.812239583333334, + "learning_rate": 0.0001, + "loss": 9.4161, + "loss/crossentropy": 2.194224573671818, + "loss/hidden": 4.276953125, + "loss/jsd": 0.0, + "loss/logits": 0.2994342103600502, + "step": 3240 + }, + { + "epoch": 0.10833333333333334, + "grad_norm": 37.75, + "grad_norm_var": 30.448958333333334, + "learning_rate": 0.0001, + "loss": 9.3774, + "loss/crossentropy": 2.1500812068581583, + "loss/hidden": 4.316796875, + "loss/jsd": 0.0, + "loss/logits": 0.297881081327796, + "step": 3250 + }, + { + "epoch": 0.10866666666666666, + "grad_norm": 50.0, + "grad_norm_var": 15.873958333333333, + "learning_rate": 0.0001, + "loss": 9.3746, + "loss/crossentropy": 2.149769604206085, + "loss/hidden": 4.337890625, + "loss/jsd": 0.0, + "loss/logits": 0.2857384353876114, + "step": 3260 + }, + { + "epoch": 0.109, + "grad_norm": 49.25, + "grad_norm_var": 23.241666666666667, + "learning_rate": 0.0001, + "loss": 9.4298, + "loss/crossentropy": 2.1580600261688234, + "loss/hidden": 4.203125, + "loss/jsd": 0.0, + "loss/logits": 0.2893156711012125, + "step": 3270 + }, + { + "epoch": 0.10933333333333334, + "grad_norm": 42.0, + "grad_norm_var": 14.820572916666666, + "learning_rate": 0.0001, + "loss": 9.3562, + "loss/crossentropy": 2.285036212205887, + "loss/hidden": 4.28984375, + "loss/jsd": 0.0, + "loss/logits": 0.3082501322031021, + "step": 3280 + }, + { + "epoch": 0.10966666666666666, + "grad_norm": 37.0, + "grad_norm_var": 6.114322916666667, + "learning_rate": 0.0001, + "loss": 9.6223, + "loss/crossentropy": 2.2134475603699686, + "loss/hidden": 4.16015625, + "loss/jsd": 0.0, + "loss/logits": 0.28552069552242754, + "step": 3290 + }, + { + "epoch": 0.11, + "grad_norm": 33.75, + "grad_norm_var": 13.50390625, + "learning_rate": 0.0001, + "loss": 9.3289, + "loss/crossentropy": 2.1722410164773462, + "loss/hidden": 4.277734375, + "loss/jsd": 0.0, + "loss/logits": 0.2915722324512899, + "step": 3300 + }, + { + "epoch": 0.11033333333333334, + "grad_norm": 39.5, + "grad_norm_var": 12.824739583333333, + "learning_rate": 0.0001, + "loss": 9.3712, + "loss/crossentropy": 1.9897394858300685, + "loss/hidden": 4.20390625, + "loss/jsd": 0.0, + "loss/logits": 0.2664288356900215, + "step": 3310 + }, + { + "epoch": 0.11066666666666666, + "grad_norm": 37.25, + "grad_norm_var": 13.051822916666667, + "learning_rate": 0.0001, + "loss": 9.21, + "loss/crossentropy": 2.109373450279236, + "loss/hidden": 4.2296875, + "loss/jsd": 0.0, + "loss/logits": 0.2850793283432722, + "step": 3320 + }, + { + "epoch": 0.111, + "grad_norm": 42.5, + "grad_norm_var": 14.47890625, + "learning_rate": 0.0001, + "loss": 9.2866, + "loss/crossentropy": 2.2932792961597444, + "loss/hidden": 4.23203125, + "loss/jsd": 0.0, + "loss/logits": 0.30185060724616053, + "step": 3330 + }, + { + "epoch": 0.11133333333333334, + "grad_norm": 39.25, + "grad_norm_var": 8.583333333333334, + "learning_rate": 0.0001, + "loss": 9.427, + "loss/crossentropy": 2.1133791759610174, + "loss/hidden": 4.183984375, + "loss/jsd": 0.0, + "loss/logits": 0.29095215909183025, + "step": 3340 + }, + { + "epoch": 0.11166666666666666, + "grad_norm": 36.25, + "grad_norm_var": 12.520572916666667, + "learning_rate": 0.0001, + "loss": 9.3503, + "loss/crossentropy": 2.032514417171478, + "loss/hidden": 4.228515625, + "loss/jsd": 0.0, + "loss/logits": 0.2894793044775724, + "step": 3350 + }, + { + "epoch": 0.112, + "grad_norm": 52.5, + "grad_norm_var": 24.51640625, + "learning_rate": 0.0001, + "loss": 9.3076, + "loss/crossentropy": 1.9752195596694946, + "loss/hidden": 4.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.2814787019044161, + "step": 3360 + }, + { + "epoch": 0.11233333333333333, + "grad_norm": 34.75, + "grad_norm_var": 28.245833333333334, + "learning_rate": 0.0001, + "loss": 9.4876, + "loss/crossentropy": 2.3702859327197077, + "loss/hidden": 4.1859375, + "loss/jsd": 0.0, + "loss/logits": 0.3126111339777708, + "step": 3370 + }, + { + "epoch": 0.11266666666666666, + "grad_norm": 34.5, + "grad_norm_var": 20.070572916666666, + "learning_rate": 0.0001, + "loss": 9.2654, + "loss/crossentropy": 1.8932878598570824, + "loss/hidden": 4.261328125, + "loss/jsd": 0.0, + "loss/logits": 0.2764943749643862, + "step": 3380 + }, + { + "epoch": 0.113, + "grad_norm": 41.0, + "grad_norm_var": 11.174739583333333, + "learning_rate": 0.0001, + "loss": 9.4906, + "loss/crossentropy": 2.155991692841053, + "loss/hidden": 4.19921875, + "loss/jsd": 0.0, + "loss/logits": 0.2793617382645607, + "step": 3390 + }, + { + "epoch": 0.11333333333333333, + "grad_norm": 39.75, + "grad_norm_var": 7.01015625, + "learning_rate": 0.0001, + "loss": 9.3088, + "loss/crossentropy": 2.093905381858349, + "loss/hidden": 4.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.3261349702253938, + "step": 3400 + }, + { + "epoch": 0.11366666666666667, + "grad_norm": 39.75, + "grad_norm_var": 46.25, + "learning_rate": 0.0001, + "loss": 9.486, + "loss/crossentropy": 2.1121586173772813, + "loss/hidden": 4.215625, + "loss/jsd": 0.0, + "loss/logits": 0.2922355983406305, + "step": 3410 + }, + { + "epoch": 0.114, + "grad_norm": 35.5, + "grad_norm_var": 5.224739583333333, + "learning_rate": 0.0001, + "loss": 9.3474, + "loss/crossentropy": 2.177844299376011, + "loss/hidden": 4.23671875, + "loss/jsd": 0.0, + "loss/logits": 0.29271903187036513, + "step": 3420 + }, + { + "epoch": 0.11433333333333333, + "grad_norm": 49.75, + "grad_norm_var": 15.948958333333334, + "learning_rate": 0.0001, + "loss": 9.4016, + "loss/crossentropy": 2.2300315856933595, + "loss/hidden": 4.13828125, + "loss/jsd": 0.0, + "loss/logits": 0.29194765314459803, + "step": 3430 + }, + { + "epoch": 0.11466666666666667, + "grad_norm": 40.75, + "grad_norm_var": 20.995833333333334, + "learning_rate": 0.0001, + "loss": 9.2609, + "loss/crossentropy": 2.072411538660526, + "loss/hidden": 4.219140625, + "loss/jsd": 0.0, + "loss/logits": 0.2944797810167074, + "step": 3440 + }, + { + "epoch": 0.115, + "grad_norm": 35.25, + "grad_norm_var": 23.179166666666667, + "learning_rate": 0.0001, + "loss": 9.3101, + "loss/crossentropy": 2.106787271797657, + "loss/hidden": 4.253515625, + "loss/jsd": 0.0, + "loss/logits": 0.2934326458722353, + "step": 3450 + }, + { + "epoch": 0.11533333333333333, + "grad_norm": 35.0, + "grad_norm_var": 21.362239583333334, + "learning_rate": 0.0001, + "loss": 9.3648, + "loss/crossentropy": 2.1898166716098784, + "loss/hidden": 4.171484375, + "loss/jsd": 0.0, + "loss/logits": 0.29939313270151613, + "step": 3460 + }, + { + "epoch": 0.11566666666666667, + "grad_norm": 41.0, + "grad_norm_var": 10.940625, + "learning_rate": 0.0001, + "loss": 9.2795, + "loss/crossentropy": 2.3480966717004774, + "loss/hidden": 4.236328125, + "loss/jsd": 0.0, + "loss/logits": 0.304421117156744, + "step": 3470 + }, + { + "epoch": 0.116, + "grad_norm": 39.75, + "grad_norm_var": 5.11015625, + "learning_rate": 0.0001, + "loss": 9.361, + "loss/crossentropy": 2.061821439862251, + "loss/hidden": 4.258203125, + "loss/jsd": 0.0, + "loss/logits": 0.280943001806736, + "step": 3480 + }, + { + "epoch": 0.11633333333333333, + "grad_norm": 38.75, + "grad_norm_var": 5.082291666666666, + "learning_rate": 0.0001, + "loss": 9.2472, + "loss/crossentropy": 2.089048261940479, + "loss/hidden": 4.203515625, + "loss/jsd": 0.0, + "loss/logits": 0.27816532738506794, + "step": 3490 + }, + { + "epoch": 0.11666666666666667, + "grad_norm": 39.25, + "grad_norm_var": 5.648958333333334, + "learning_rate": 0.0001, + "loss": 9.3, + "loss/crossentropy": 2.0424555987119675, + "loss/hidden": 4.278515625, + "loss/jsd": 0.0, + "loss/logits": 0.2886748146265745, + "step": 3500 + }, + { + "epoch": 0.117, + "grad_norm": 41.25, + "grad_norm_var": 5.230989583333334, + "learning_rate": 0.0001, + "loss": 9.3444, + "loss/crossentropy": 2.2320514231920243, + "loss/hidden": 4.188671875, + "loss/jsd": 0.0, + "loss/logits": 0.29398479498922825, + "step": 3510 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 32.0, + "grad_norm_var": 14.45, + "learning_rate": 0.0001, + "loss": 9.352, + "loss/crossentropy": 2.0569834411144257, + "loss/hidden": 4.1828125, + "loss/jsd": 0.0, + "loss/logits": 0.28648389838635924, + "step": 3520 + }, + { + "epoch": 0.11766666666666667, + "grad_norm": 40.5, + "grad_norm_var": 15.245572916666667, + "learning_rate": 0.0001, + "loss": 9.2835, + "loss/crossentropy": 2.1193760722875594, + "loss/hidden": 4.11171875, + "loss/jsd": 0.0, + "loss/logits": 0.277429373562336, + "step": 3530 + }, + { + "epoch": 0.118, + "grad_norm": 40.25, + "grad_norm_var": 9.59140625, + "learning_rate": 0.0001, + "loss": 9.2573, + "loss/crossentropy": 2.1044032208621504, + "loss/hidden": 4.258984375, + "loss/jsd": 0.0, + "loss/logits": 0.2949466461315751, + "step": 3540 + }, + { + "epoch": 0.11833333333333333, + "grad_norm": 43.0, + "grad_norm_var": 15.15, + "learning_rate": 0.0001, + "loss": 9.1153, + "loss/crossentropy": 2.07734075859189, + "loss/hidden": 4.13203125, + "loss/jsd": 0.0, + "loss/logits": 0.2726396777667105, + "step": 3550 + }, + { + "epoch": 0.11866666666666667, + "grad_norm": 41.75, + "grad_norm_var": 9.605989583333333, + "learning_rate": 0.0001, + "loss": 9.189, + "loss/crossentropy": 2.196866528689861, + "loss/hidden": 4.25546875, + "loss/jsd": 0.0, + "loss/logits": 0.3004810862243176, + "step": 3560 + }, + { + "epoch": 0.119, + "grad_norm": 36.75, + "grad_norm_var": 47.27682291666667, + "learning_rate": 0.0001, + "loss": 9.089, + "loss/crossentropy": 2.297177466750145, + "loss/hidden": 4.2296875, + "loss/jsd": 0.0, + "loss/logits": 0.3052255652844906, + "step": 3570 + }, + { + "epoch": 0.11933333333333333, + "grad_norm": 37.75, + "grad_norm_var": 321.41432291666666, + "learning_rate": 0.0001, + "loss": 9.2827, + "loss/crossentropy": 2.129167598485947, + "loss/hidden": 4.31171875, + "loss/jsd": 0.0, + "loss/logits": 0.304591753706336, + "step": 3580 + }, + { + "epoch": 0.11966666666666667, + "grad_norm": 40.25, + "grad_norm_var": 345.54973958333335, + "learning_rate": 0.0001, + "loss": 9.3048, + "loss/crossentropy": 2.1689872413873674, + "loss/hidden": 4.12578125, + "loss/jsd": 0.0, + "loss/logits": 0.27596075274050236, + "step": 3590 + }, + { + "epoch": 0.12, + "grad_norm": 40.0, + "grad_norm_var": 17.140625, + "learning_rate": 0.0001, + "loss": 9.2268, + "loss/crossentropy": 1.9578171581029893, + "loss/hidden": 4.265625, + "loss/jsd": 0.0, + "loss/logits": 0.28636636175215247, + "step": 3600 + }, + { + "epoch": 0.12033333333333333, + "grad_norm": 41.0, + "grad_norm_var": 68.175, + "learning_rate": 0.0001, + "loss": 9.1787, + "loss/crossentropy": 2.067359810322523, + "loss/hidden": 4.250390625, + "loss/jsd": 0.0, + "loss/logits": 0.28545970730483533, + "step": 3610 + }, + { + "epoch": 0.12066666666666667, + "grad_norm": 32.75, + "grad_norm_var": 29.920572916666668, + "learning_rate": 0.0001, + "loss": 9.2359, + "loss/crossentropy": 2.2047273397445677, + "loss/hidden": 4.180078125, + "loss/jsd": 0.0, + "loss/logits": 0.28760180845856664, + "step": 3620 + }, + { + "epoch": 0.121, + "grad_norm": 38.25, + "grad_norm_var": 13.314322916666667, + "learning_rate": 0.0001, + "loss": 9.0908, + "loss/crossentropy": 2.1291835106909276, + "loss/hidden": 4.183984375, + "loss/jsd": 0.0, + "loss/logits": 0.2917652137577534, + "step": 3630 + }, + { + "epoch": 0.12133333333333333, + "grad_norm": 41.25, + "grad_norm_var": 11.356184895833334, + "learning_rate": 0.0001, + "loss": 9.2854, + "loss/crossentropy": 2.2793887823820116, + "loss/hidden": 4.24765625, + "loss/jsd": 0.0, + "loss/logits": 0.30008267536759375, + "step": 3640 + }, + { + "epoch": 0.12166666666666667, + "grad_norm": 36.5, + "grad_norm_var": 56.16920572916667, + "learning_rate": 0.0001, + "loss": 9.2821, + "loss/crossentropy": 2.2533405125141144, + "loss/hidden": 4.32109375, + "loss/jsd": 0.0, + "loss/logits": 0.30765612684190274, + "step": 3650 + }, + { + "epoch": 0.122, + "grad_norm": 42.5, + "grad_norm_var": 53.973958333333336, + "learning_rate": 0.0001, + "loss": 9.2816, + "loss/crossentropy": 2.294741216301918, + "loss/hidden": 4.243359375, + "loss/jsd": 0.0, + "loss/logits": 0.29128187969326974, + "step": 3660 + }, + { + "epoch": 0.12233333333333334, + "grad_norm": 37.0, + "grad_norm_var": 18.740625, + "learning_rate": 0.0001, + "loss": 9.2943, + "loss/crossentropy": 2.073512817919254, + "loss/hidden": 4.188671875, + "loss/jsd": 0.0, + "loss/logits": 0.2968620590865612, + "step": 3670 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 43.0, + "grad_norm_var": 13.92890625, + "learning_rate": 0.0001, + "loss": 9.1891, + "loss/crossentropy": 2.0943071067333223, + "loss/hidden": 4.151171875, + "loss/jsd": 0.0, + "loss/logits": 0.2801366148516536, + "step": 3680 + }, + { + "epoch": 0.123, + "grad_norm": 40.5, + "grad_norm_var": 11.00390625, + "learning_rate": 0.0001, + "loss": 9.1903, + "loss/crossentropy": 2.0213806182146072, + "loss/hidden": 4.13359375, + "loss/jsd": 0.0, + "loss/logits": 0.27214346677064893, + "step": 3690 + }, + { + "epoch": 0.12333333333333334, + "grad_norm": 50.0, + "grad_norm_var": 1.5413569484339108e+18, + "learning_rate": 0.0001, + "loss": 9.2765, + "loss/crossentropy": 2.103906211256981, + "loss/hidden": 4.205078125, + "loss/jsd": 0.0, + "loss/logits": 0.30408407263457776, + "step": 3700 + }, + { + "epoch": 0.12366666666666666, + "grad_norm": 34.5, + "grad_norm_var": 1.541356947316548e+18, + "learning_rate": 0.0001, + "loss": 9.2341, + "loss/crossentropy": 2.2875989854335783, + "loss/hidden": 4.12421875, + "loss/jsd": 0.0, + "loss/logits": 0.29328348860144615, + "step": 3710 + }, + { + "epoch": 0.124, + "grad_norm": 40.25, + "grad_norm_var": 20.280989583333334, + "learning_rate": 0.0001, + "loss": 9.3043, + "loss/crossentropy": 2.302832932770252, + "loss/hidden": 4.191796875, + "loss/jsd": 0.0, + "loss/logits": 0.30611949125304816, + "step": 3720 + }, + { + "epoch": 0.12433333333333334, + "grad_norm": 31.75, + "grad_norm_var": 43.416666666666664, + "learning_rate": 0.0001, + "loss": 9.1562, + "loss/crossentropy": 2.1152502104640005, + "loss/hidden": 4.129296875, + "loss/jsd": 0.0, + "loss/logits": 0.28165129497647284, + "step": 3730 + }, + { + "epoch": 0.12466666666666666, + "grad_norm": 38.5, + "grad_norm_var": 33.608333333333334, + "learning_rate": 0.0001, + "loss": 9.3006, + "loss/crossentropy": 2.320794602483511, + "loss/hidden": 4.10390625, + "loss/jsd": 0.0, + "loss/logits": 0.2863430541008711, + "step": 3740 + }, + { + "epoch": 0.125, + "grad_norm": 34.5, + "grad_norm_var": 6.52890625, + "learning_rate": 0.0001, + "loss": 9.275, + "loss/crossentropy": 2.0785109654068945, + "loss/hidden": 4.22265625, + "loss/jsd": 0.0, + "loss/logits": 0.3000879239290953, + "step": 3750 + }, + { + "epoch": 0.12533333333333332, + "grad_norm": 34.25, + "grad_norm_var": 22.823958333333334, + "learning_rate": 0.0001, + "loss": 9.3243, + "loss/crossentropy": 2.1262876391410828, + "loss/hidden": 4.30859375, + "loss/jsd": 0.0, + "loss/logits": 0.3010214529931545, + "step": 3760 + }, + { + "epoch": 0.12566666666666668, + "grad_norm": 37.75, + "grad_norm_var": 21.157291666666666, + "learning_rate": 0.0001, + "loss": 9.2353, + "loss/crossentropy": 2.2993004634976386, + "loss/hidden": 4.254296875, + "loss/jsd": 0.0, + "loss/logits": 0.3304103210568428, + "step": 3770 + }, + { + "epoch": 0.126, + "grad_norm": 33.0, + "grad_norm_var": 8.33515625, + "learning_rate": 0.0001, + "loss": 9.2457, + "loss/crossentropy": 2.1940866082906725, + "loss/hidden": 4.199609375, + "loss/jsd": 0.0, + "loss/logits": 0.294326201826334, + "step": 3780 + }, + { + "epoch": 0.12633333333333333, + "grad_norm": 38.5, + "grad_norm_var": 135.71015625, + "learning_rate": 0.0001, + "loss": 9.2567, + "loss/crossentropy": 2.13921734392643, + "loss/hidden": 4.177734375, + "loss/jsd": 0.0, + "loss/logits": 0.28696890603750946, + "step": 3790 + }, + { + "epoch": 0.12666666666666668, + "grad_norm": 36.0, + "grad_norm_var": 4.058333333333334, + "learning_rate": 0.0001, + "loss": 9.0874, + "loss/crossentropy": 2.2284460753202437, + "loss/hidden": 4.209375, + "loss/jsd": 0.0, + "loss/logits": 0.3025734366849065, + "step": 3800 + }, + { + "epoch": 0.127, + "grad_norm": 34.5, + "grad_norm_var": 9.889322916666666, + "learning_rate": 0.0001, + "loss": 9.2163, + "loss/crossentropy": 2.299591761827469, + "loss/hidden": 4.055859375, + "loss/jsd": 0.0, + "loss/logits": 0.296408361941576, + "step": 3810 + }, + { + "epoch": 0.12733333333333333, + "grad_norm": 37.75, + "grad_norm_var": 11.239322916666667, + "learning_rate": 0.0001, + "loss": 9.1761, + "loss/crossentropy": 2.2842276841402054, + "loss/hidden": 4.11875, + "loss/jsd": 0.0, + "loss/logits": 0.2857844788581133, + "step": 3820 + }, + { + "epoch": 0.12766666666666668, + "grad_norm": 42.0, + "grad_norm_var": 10.793489583333333, + "learning_rate": 0.0001, + "loss": 9.2579, + "loss/crossentropy": 2.2464538693428038, + "loss/hidden": 4.245703125, + "loss/jsd": 0.0, + "loss/logits": 0.2956688392907381, + "step": 3830 + }, + { + "epoch": 0.128, + "grad_norm": 35.0, + "grad_norm_var": 16.21015625, + "learning_rate": 0.0001, + "loss": 9.1472, + "loss/crossentropy": 2.228955328464508, + "loss/hidden": 4.26640625, + "loss/jsd": 0.0, + "loss/logits": 0.30142183881253004, + "step": 3840 + }, + { + "epoch": 0.12833333333333333, + "grad_norm": 41.0, + "grad_norm_var": 4.840625, + "learning_rate": 0.0001, + "loss": 9.2928, + "loss/crossentropy": 2.223462516069412, + "loss/hidden": 4.230078125, + "loss/jsd": 0.0, + "loss/logits": 0.29133280031383035, + "step": 3850 + }, + { + "epoch": 0.12866666666666668, + "grad_norm": 36.5, + "grad_norm_var": 20.939322916666665, + "learning_rate": 0.0001, + "loss": 9.0495, + "loss/crossentropy": 2.2790828943252563, + "loss/hidden": 4.126953125, + "loss/jsd": 0.0, + "loss/logits": 0.2985575716942549, + "step": 3860 + }, + { + "epoch": 0.129, + "grad_norm": 40.25, + "grad_norm_var": 20.11640625, + "learning_rate": 0.0001, + "loss": 9.1951, + "loss/crossentropy": 2.2769517719745638, + "loss/hidden": 4.061328125, + "loss/jsd": 0.0, + "loss/logits": 0.2779423680156469, + "step": 3870 + }, + { + "epoch": 0.12933333333333333, + "grad_norm": 32.0, + "grad_norm_var": 7.958072916666667, + "learning_rate": 0.0001, + "loss": 9.1347, + "loss/crossentropy": 2.097635033726692, + "loss/hidden": 4.102734375, + "loss/jsd": 0.0, + "loss/logits": 0.2780100252479315, + "step": 3880 + }, + { + "epoch": 0.12966666666666668, + "grad_norm": 33.25, + "grad_norm_var": 8.944205729166667, + "learning_rate": 0.0001, + "loss": 9.0803, + "loss/crossentropy": 2.1658859461545945, + "loss/hidden": 4.18359375, + "loss/jsd": 0.0, + "loss/logits": 0.27850373424589636, + "step": 3890 + }, + { + "epoch": 0.13, + "grad_norm": 33.25, + "grad_norm_var": 10.558268229166666, + "learning_rate": 0.0001, + "loss": 9.167, + "loss/crossentropy": 2.1380916953086855, + "loss/hidden": 4.279296875, + "loss/jsd": 0.0, + "loss/logits": 0.2888460006564856, + "step": 3900 + }, + { + "epoch": 0.13033333333333333, + "grad_norm": 41.75, + "grad_norm_var": 6.151822916666666, + "learning_rate": 0.0001, + "loss": 9.2188, + "loss/crossentropy": 2.246109126508236, + "loss/hidden": 4.090234375, + "loss/jsd": 0.0, + "loss/logits": 0.2838183153420687, + "step": 3910 + }, + { + "epoch": 0.13066666666666665, + "grad_norm": 38.0, + "grad_norm_var": 3.626822916666667, + "learning_rate": 0.0001, + "loss": 9.0951, + "loss/crossentropy": 1.9172726094722747, + "loss/hidden": 4.18359375, + "loss/jsd": 0.0, + "loss/logits": 0.27877611964941024, + "step": 3920 + }, + { + "epoch": 0.131, + "grad_norm": 36.25, + "grad_norm_var": 7.6875, + "learning_rate": 0.0001, + "loss": 9.123, + "loss/crossentropy": 2.216602808237076, + "loss/hidden": 4.17421875, + "loss/jsd": 0.0, + "loss/logits": 0.29801386743783953, + "step": 3930 + }, + { + "epoch": 0.13133333333333333, + "grad_norm": 37.75, + "grad_norm_var": 15.170247395833334, + "learning_rate": 0.0001, + "loss": 9.0372, + "loss/crossentropy": 2.0827317383140325, + "loss/hidden": 4.1515625, + "loss/jsd": 0.0, + "loss/logits": 0.25494101997464896, + "step": 3940 + }, + { + "epoch": 0.13166666666666665, + "grad_norm": 39.25, + "grad_norm_var": 16.780143229166665, + "learning_rate": 0.0001, + "loss": 9.1713, + "loss/crossentropy": 2.0472889255732296, + "loss/hidden": 4.125390625, + "loss/jsd": 0.0, + "loss/logits": 0.2605790663510561, + "step": 3950 + }, + { + "epoch": 0.132, + "grad_norm": 35.0, + "grad_norm_var": 24.373958333333334, + "learning_rate": 0.0001, + "loss": 9.1236, + "loss/crossentropy": 2.2061238437891006, + "loss/hidden": 4.06953125, + "loss/jsd": 0.0, + "loss/logits": 0.27440296970307826, + "step": 3960 + }, + { + "epoch": 0.13233333333333333, + "grad_norm": 37.75, + "grad_norm_var": 17.36015625, + "learning_rate": 0.0001, + "loss": 9.0615, + "loss/crossentropy": 2.067065991461277, + "loss/hidden": 4.19609375, + "loss/jsd": 0.0, + "loss/logits": 0.2724686389788985, + "step": 3970 + }, + { + "epoch": 0.13266666666666665, + "grad_norm": 35.5, + "grad_norm_var": 13.895833333333334, + "learning_rate": 0.0001, + "loss": 9.2604, + "loss/crossentropy": 2.3754075884819033, + "loss/hidden": 4.234765625, + "loss/jsd": 0.0, + "loss/logits": 0.3084482606500387, + "step": 3980 + }, + { + "epoch": 0.133, + "grad_norm": 36.5, + "grad_norm_var": 15.72265625, + "learning_rate": 0.0001, + "loss": 9.0579, + "loss/crossentropy": 2.0445886969566347, + "loss/hidden": 4.1140625, + "loss/jsd": 0.0, + "loss/logits": 0.28532980997115376, + "step": 3990 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 38.0, + "grad_norm_var": 4.351822916666666, + "learning_rate": 0.0001, + "loss": 8.9463, + "loss/crossentropy": 2.154660718142986, + "loss/hidden": 4.159375, + "loss/jsd": 0.0, + "loss/logits": 0.29097947776317595, + "step": 4000 + }, + { + "epoch": 0.13366666666666666, + "grad_norm": 39.25, + "grad_norm_var": 10.275, + "learning_rate": 0.0001, + "loss": 9.1226, + "loss/crossentropy": 2.0323362797498703, + "loss/hidden": 4.0609375, + "loss/jsd": 0.0, + "loss/logits": 0.25012299232184887, + "step": 4010 + }, + { + "epoch": 0.134, + "grad_norm": 35.75, + "grad_norm_var": 5.273958333333334, + "learning_rate": 0.0001, + "loss": 9.2453, + "loss/crossentropy": 2.1468474209308623, + "loss/hidden": 4.19765625, + "loss/jsd": 0.0, + "loss/logits": 0.2808349967002869, + "step": 4020 + }, + { + "epoch": 0.13433333333333333, + "grad_norm": 32.0, + "grad_norm_var": 21.4875, + "learning_rate": 0.0001, + "loss": 9.227, + "loss/crossentropy": 2.1914032608270646, + "loss/hidden": 4.169140625, + "loss/jsd": 0.0, + "loss/logits": 0.2951745491474867, + "step": 4030 + }, + { + "epoch": 0.13466666666666666, + "grad_norm": 38.25, + "grad_norm_var": 15.52265625, + "learning_rate": 0.0001, + "loss": 9.1906, + "loss/crossentropy": 2.374897816777229, + "loss/hidden": 4.23515625, + "loss/jsd": 0.0, + "loss/logits": 0.29634634144604205, + "step": 4040 + }, + { + "epoch": 0.135, + "grad_norm": 38.5, + "grad_norm_var": 8.176822916666667, + "learning_rate": 0.0001, + "loss": 9.2041, + "loss/crossentropy": 2.213481293618679, + "loss/hidden": 4.249609375, + "loss/jsd": 0.0, + "loss/logits": 0.30703250467777254, + "step": 4050 + }, + { + "epoch": 0.13533333333333333, + "grad_norm": 199.0, + "grad_norm_var": 1616.4239583333333, + "learning_rate": 0.0001, + "loss": 9.4101, + "loss/crossentropy": 2.3092150717973707, + "loss/hidden": 4.17890625, + "loss/jsd": 0.0, + "loss/logits": 0.298384091258049, + "step": 4060 + }, + { + "epoch": 0.13566666666666666, + "grad_norm": 37.25, + "grad_norm_var": 1616.6239583333333, + "learning_rate": 0.0001, + "loss": 9.3071, + "loss/crossentropy": 2.219760200381279, + "loss/hidden": 4.290625, + "loss/jsd": 0.0, + "loss/logits": 0.3202596869319677, + "step": 4070 + }, + { + "epoch": 0.136, + "grad_norm": 40.0, + "grad_norm_var": 9.732291666666667, + "learning_rate": 0.0001, + "loss": 9.3305, + "loss/crossentropy": 2.289233188331127, + "loss/hidden": 4.140625, + "loss/jsd": 0.0, + "loss/logits": 0.29089505709707736, + "step": 4080 + }, + { + "epoch": 0.13633333333333333, + "grad_norm": 38.0, + "grad_norm_var": 151.53932291666666, + "learning_rate": 0.0001, + "loss": 9.2719, + "loss/crossentropy": 2.3422865584492683, + "loss/hidden": 4.199609375, + "loss/jsd": 0.0, + "loss/logits": 0.2982410844415426, + "step": 4090 + }, + { + "epoch": 0.13666666666666666, + "grad_norm": 34.5, + "grad_norm_var": 8.22265625, + "learning_rate": 0.0001, + "loss": 9.154, + "loss/crossentropy": 2.0941394165158274, + "loss/hidden": 4.131640625, + "loss/jsd": 0.0, + "loss/logits": 0.28526539970189335, + "step": 4100 + }, + { + "epoch": 0.137, + "grad_norm": 42.5, + "grad_norm_var": 13.489322916666667, + "learning_rate": 0.0001, + "loss": 9.2834, + "loss/crossentropy": 2.336969590187073, + "loss/hidden": 4.314453125, + "loss/jsd": 0.0, + "loss/logits": 0.3301690086722374, + "step": 4110 + }, + { + "epoch": 0.13733333333333334, + "grad_norm": 38.75, + "grad_norm_var": 75.96640625, + "learning_rate": 0.0001, + "loss": 9.2545, + "loss/crossentropy": 2.354858273267746, + "loss/hidden": 4.16953125, + "loss/jsd": 0.0, + "loss/logits": 0.31785392127931117, + "step": 4120 + }, + { + "epoch": 0.13766666666666666, + "grad_norm": 32.25, + "grad_norm_var": 74.625, + "learning_rate": 0.0001, + "loss": 9.017, + "loss/crossentropy": 2.242242157459259, + "loss/hidden": 4.15703125, + "loss/jsd": 0.0, + "loss/logits": 0.27648379243910315, + "step": 4130 + }, + { + "epoch": 0.138, + "grad_norm": 38.25, + "grad_norm_var": 12.843489583333334, + "learning_rate": 0.0001, + "loss": 9.2173, + "loss/crossentropy": 2.240199755132198, + "loss/hidden": 4.201953125, + "loss/jsd": 0.0, + "loss/logits": 0.2927041232585907, + "step": 4140 + }, + { + "epoch": 0.13833333333333334, + "grad_norm": 33.75, + "grad_norm_var": 25.674739583333334, + "learning_rate": 0.0001, + "loss": 9.1547, + "loss/crossentropy": 2.1676330864429474, + "loss/hidden": 4.211328125, + "loss/jsd": 0.0, + "loss/logits": 0.2752385437488556, + "step": 4150 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 36.5, + "grad_norm_var": 4.523958333333334, + "learning_rate": 0.0001, + "loss": 9.08, + "loss/crossentropy": 1.9810823224484921, + "loss/hidden": 4.034765625, + "loss/jsd": 0.0, + "loss/logits": 0.2782739015296102, + "step": 4160 + }, + { + "epoch": 0.139, + "grad_norm": 37.75, + "grad_norm_var": 6.179166666666666, + "learning_rate": 0.0001, + "loss": 9.0211, + "loss/crossentropy": 2.088325909897685, + "loss/hidden": 4.081640625, + "loss/jsd": 0.0, + "loss/logits": 0.2718936084304005, + "step": 4170 + }, + { + "epoch": 0.13933333333333334, + "grad_norm": 36.25, + "grad_norm_var": 4.368489583333333, + "learning_rate": 0.0001, + "loss": 9.2223, + "loss/crossentropy": 2.345874647796154, + "loss/hidden": 4.141015625, + "loss/jsd": 0.0, + "loss/logits": 0.29183666668832303, + "step": 4180 + }, + { + "epoch": 0.13966666666666666, + "grad_norm": 35.25, + "grad_norm_var": 6.133333333333334, + "learning_rate": 0.0001, + "loss": 8.9914, + "loss/crossentropy": 2.076162505149841, + "loss/hidden": 4.159375, + "loss/jsd": 0.0, + "loss/logits": 0.29356912411749364, + "step": 4190 + }, + { + "epoch": 0.14, + "grad_norm": 34.25, + "grad_norm_var": 6.707291666666666, + "learning_rate": 0.0001, + "loss": 8.9754, + "loss/crossentropy": 2.1641511037945746, + "loss/hidden": 4.112109375, + "loss/jsd": 0.0, + "loss/logits": 0.2857985034584999, + "step": 4200 + }, + { + "epoch": 0.14033333333333334, + "grad_norm": 36.75, + "grad_norm_var": 4.530989583333334, + "learning_rate": 0.0001, + "loss": 9.1635, + "loss/crossentropy": 2.104039117693901, + "loss/hidden": 4.08671875, + "loss/jsd": 0.0, + "loss/logits": 0.26876664757728574, + "step": 4210 + }, + { + "epoch": 0.14066666666666666, + "grad_norm": 36.75, + "grad_norm_var": 4.38515625, + "learning_rate": 0.0001, + "loss": 8.998, + "loss/crossentropy": 2.1638469099998474, + "loss/hidden": 4.260546875, + "loss/jsd": 0.0, + "loss/logits": 0.29361540265381336, + "step": 4220 + }, + { + "epoch": 0.141, + "grad_norm": 37.25, + "grad_norm_var": 2.720572916666667, + "learning_rate": 0.0001, + "loss": 9.1357, + "loss/crossentropy": 2.1269990049302576, + "loss/hidden": 3.925, + "loss/jsd": 0.0, + "loss/logits": 0.24402263071388006, + "step": 4230 + }, + { + "epoch": 0.14133333333333334, + "grad_norm": 38.5, + "grad_norm_var": 9.082291666666666, + "learning_rate": 0.0001, + "loss": 9.0115, + "loss/crossentropy": 2.08816104978323, + "loss/hidden": 3.98203125, + "loss/jsd": 0.0, + "loss/logits": 0.2717157419770956, + "step": 4240 + }, + { + "epoch": 0.14166666666666666, + "grad_norm": 39.75, + "grad_norm_var": 10.212955729166667, + "learning_rate": 0.0001, + "loss": 9.0041, + "loss/crossentropy": 2.144673664495349, + "loss/hidden": 4.121875, + "loss/jsd": 0.0, + "loss/logits": 0.26794750997796657, + "step": 4250 + }, + { + "epoch": 0.142, + "grad_norm": 39.75, + "grad_norm_var": 17.005143229166666, + "learning_rate": 0.0001, + "loss": 9.0036, + "loss/crossentropy": 2.198061776161194, + "loss/hidden": 4.16171875, + "loss/jsd": 0.0, + "loss/logits": 0.28953395783901215, + "step": 4260 + }, + { + "epoch": 0.14233333333333334, + "grad_norm": 44.0, + "grad_norm_var": 11.192122395833334, + "learning_rate": 0.0001, + "loss": 8.9185, + "loss/crossentropy": 2.0459933675825597, + "loss/hidden": 4.1234375, + "loss/jsd": 0.0, + "loss/logits": 0.26404702849686146, + "step": 4270 + }, + { + "epoch": 0.14266666666666666, + "grad_norm": 34.0, + "grad_norm_var": 14.708333333333334, + "learning_rate": 0.0001, + "loss": 9.0252, + "loss/crossentropy": 2.142607557028532, + "loss/hidden": 4.148828125, + "loss/jsd": 0.0, + "loss/logits": 0.2864328293129802, + "step": 4280 + }, + { + "epoch": 0.143, + "grad_norm": 37.0, + "grad_norm_var": 1981.7268229166666, + "learning_rate": 0.0001, + "loss": 9.2393, + "loss/crossentropy": 2.346937409043312, + "loss/hidden": 4.025390625, + "loss/jsd": 0.0, + "loss/logits": 0.29519574213773014, + "step": 4290 + }, + { + "epoch": 0.14333333333333334, + "grad_norm": 33.75, + "grad_norm_var": 40.1125, + "learning_rate": 0.0001, + "loss": 9.0816, + "loss/crossentropy": 2.2757950969040395, + "loss/hidden": 4.086328125, + "loss/jsd": 0.0, + "loss/logits": 0.2885062342509627, + "step": 4300 + }, + { + "epoch": 0.14366666666666666, + "grad_norm": 36.5, + "grad_norm_var": 863.840625, + "learning_rate": 0.0001, + "loss": 9.1514, + "loss/crossentropy": 2.225219927728176, + "loss/hidden": 4.180078125, + "loss/jsd": 0.0, + "loss/logits": 0.29695004131644964, + "step": 4310 + }, + { + "epoch": 0.144, + "grad_norm": 33.75, + "grad_norm_var": 886.9458333333333, + "learning_rate": 0.0001, + "loss": 8.9425, + "loss/crossentropy": 2.143592892587185, + "loss/hidden": 4.075390625, + "loss/jsd": 0.0, + "loss/logits": 0.280602141469717, + "step": 4320 + }, + { + "epoch": 0.14433333333333334, + "grad_norm": 36.0, + "grad_norm_var": 14.0375, + "learning_rate": 0.0001, + "loss": 9.0048, + "loss/crossentropy": 2.196325662732124, + "loss/hidden": 4.073828125, + "loss/jsd": 0.0, + "loss/logits": 0.28205970898270605, + "step": 4330 + }, + { + "epoch": 0.14466666666666667, + "grad_norm": 34.0, + "grad_norm_var": 17.295833333333334, + "learning_rate": 0.0001, + "loss": 9.0125, + "loss/crossentropy": 2.2133986562490464, + "loss/hidden": 4.078125, + "loss/jsd": 0.0, + "loss/logits": 0.2758210487663746, + "step": 4340 + }, + { + "epoch": 0.145, + "grad_norm": 35.25, + "grad_norm_var": 17.939322916666665, + "learning_rate": 0.0001, + "loss": 8.9665, + "loss/crossentropy": 2.2748197615146637, + "loss/hidden": 4.061328125, + "loss/jsd": 0.0, + "loss/logits": 0.27680257745087145, + "step": 4350 + }, + { + "epoch": 0.14533333333333334, + "grad_norm": 32.5, + "grad_norm_var": 9.857291666666667, + "learning_rate": 0.0001, + "loss": 9.0182, + "loss/crossentropy": 2.108441038429737, + "loss/hidden": 4.170703125, + "loss/jsd": 0.0, + "loss/logits": 0.2624512787908316, + "step": 4360 + }, + { + "epoch": 0.14566666666666667, + "grad_norm": 44.25, + "grad_norm_var": 15.3947265625, + "learning_rate": 0.0001, + "loss": 8.9951, + "loss/crossentropy": 2.038861893117428, + "loss/hidden": 4.069140625, + "loss/jsd": 0.0, + "loss/logits": 0.26383627485483885, + "step": 4370 + }, + { + "epoch": 0.146, + "grad_norm": 38.25, + "grad_norm_var": 14.3884765625, + "learning_rate": 0.0001, + "loss": 9.0622, + "loss/crossentropy": 2.1965081602334977, + "loss/hidden": 4.124609375, + "loss/jsd": 0.0, + "loss/logits": 0.29782434441149236, + "step": 4380 + }, + { + "epoch": 0.14633333333333334, + "grad_norm": 32.5, + "grad_norm_var": 127.815625, + "learning_rate": 0.0001, + "loss": 9.0006, + "loss/crossentropy": 2.0395655959844587, + "loss/hidden": 4.130078125, + "loss/jsd": 0.0, + "loss/logits": 0.26886530220508575, + "step": 4390 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 37.0, + "grad_norm_var": 10.148958333333333, + "learning_rate": 0.0001, + "loss": 9.0164, + "loss/crossentropy": 2.228328402340412, + "loss/hidden": 4.174609375, + "loss/jsd": 0.0, + "loss/logits": 0.28383949398994446, + "step": 4400 + }, + { + "epoch": 0.147, + "grad_norm": 38.75, + "grad_norm_var": 4.183072916666666, + "learning_rate": 0.0001, + "loss": 8.9809, + "loss/crossentropy": 2.1509799271821977, + "loss/hidden": 4.066015625, + "loss/jsd": 0.0, + "loss/logits": 0.26196911465376616, + "step": 4410 + }, + { + "epoch": 0.14733333333333334, + "grad_norm": 37.25, + "grad_norm_var": 5.3353515625, + "learning_rate": 0.0001, + "loss": 9.056, + "loss/crossentropy": 2.217263324558735, + "loss/hidden": 4.2015625, + "loss/jsd": 0.0, + "loss/logits": 0.28669508136808874, + "step": 4420 + }, + { + "epoch": 0.14766666666666667, + "grad_norm": 35.5, + "grad_norm_var": 6.494205729166667, + "learning_rate": 0.0001, + "loss": 9.0301, + "loss/crossentropy": 2.15300203114748, + "loss/hidden": 4.10703125, + "loss/jsd": 0.0, + "loss/logits": 0.27612753622233865, + "step": 4430 + }, + { + "epoch": 0.148, + "grad_norm": 35.5, + "grad_norm_var": 136.375, + "learning_rate": 0.0001, + "loss": 9.0967, + "loss/crossentropy": 2.093575692176819, + "loss/hidden": 4.1453125, + "loss/jsd": 0.0, + "loss/logits": 0.29885905496776105, + "step": 4440 + }, + { + "epoch": 0.14833333333333334, + "grad_norm": 39.25, + "grad_norm_var": 13.1119140625, + "learning_rate": 0.0001, + "loss": 9.0634, + "loss/crossentropy": 2.216056075692177, + "loss/hidden": 4.144921875, + "loss/jsd": 0.0, + "loss/logits": 0.30221954099833964, + "step": 4450 + }, + { + "epoch": 0.14866666666666667, + "grad_norm": 33.5, + "grad_norm_var": 11.009830729166667, + "learning_rate": 0.0001, + "loss": 9.2091, + "loss/crossentropy": 2.2127518743276595, + "loss/hidden": 4.108984375, + "loss/jsd": 0.0, + "loss/logits": 0.2779125362634659, + "step": 4460 + }, + { + "epoch": 0.149, + "grad_norm": 39.5, + "grad_norm_var": 8.137239583333333, + "learning_rate": 0.0001, + "loss": 9.086, + "loss/crossentropy": 2.192157284915447, + "loss/hidden": 4.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.27739516645669937, + "step": 4470 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 40.0, + "grad_norm_var": 7.815625, + "learning_rate": 0.0001, + "loss": 9.0644, + "loss/crossentropy": 2.1084683328866958, + "loss/hidden": 4.044921875, + "loss/jsd": 0.0, + "loss/logits": 0.26986431013792755, + "step": 4480 + }, + { + "epoch": 0.14966666666666667, + "grad_norm": 35.75, + "grad_norm_var": 10.302018229166666, + "learning_rate": 0.0001, + "loss": 9.0806, + "loss/crossentropy": 2.0632939770817758, + "loss/hidden": 4.218359375, + "loss/jsd": 0.0, + "loss/logits": 0.3002682067453861, + "step": 4490 + }, + { + "epoch": 0.15, + "grad_norm": 39.5, + "grad_norm_var": 11.279166666666667, + "learning_rate": 0.0001, + "loss": 9.1169, + "loss/crossentropy": 2.2110743284225465, + "loss/hidden": 4.1859375, + "loss/jsd": 0.0, + "loss/logits": 0.2930508263409138, + "step": 4500 + }, + { + "epoch": 0.15033333333333335, + "grad_norm": 38.5, + "grad_norm_var": 10.308072916666667, + "learning_rate": 0.0001, + "loss": 9.1674, + "loss/crossentropy": 2.3797917008399962, + "loss/hidden": 4.17109375, + "loss/jsd": 0.0, + "loss/logits": 0.3082386564463377, + "step": 4510 + }, + { + "epoch": 0.15066666666666667, + "grad_norm": 33.75, + "grad_norm_var": 11.398958333333333, + "learning_rate": 0.0001, + "loss": 8.9182, + "loss/crossentropy": 2.099128992110491, + "loss/hidden": 4.08125, + "loss/jsd": 0.0, + "loss/logits": 0.26461148345842955, + "step": 4520 + }, + { + "epoch": 0.151, + "grad_norm": 30.25, + "grad_norm_var": 18.073893229166668, + "learning_rate": 0.0001, + "loss": 9.0073, + "loss/crossentropy": 2.121720698475838, + "loss/hidden": 4.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.26439094692468645, + "step": 4530 + }, + { + "epoch": 0.15133333333333332, + "grad_norm": 51.0, + "grad_norm_var": 40.3994140625, + "learning_rate": 0.0001, + "loss": 9.0847, + "loss/crossentropy": 2.1047763034701346, + "loss/hidden": 4.223828125, + "loss/jsd": 0.0, + "loss/logits": 0.30176166333258153, + "step": 4540 + }, + { + "epoch": 0.15166666666666667, + "grad_norm": 40.25, + "grad_norm_var": 30.270833333333332, + "learning_rate": 0.0001, + "loss": 8.9126, + "loss/crossentropy": 2.112935496866703, + "loss/hidden": 3.990234375, + "loss/jsd": 0.0, + "loss/logits": 0.24414603877812624, + "step": 4550 + }, + { + "epoch": 0.152, + "grad_norm": 34.5, + "grad_norm_var": 9.594205729166667, + "learning_rate": 0.0001, + "loss": 8.823, + "loss/crossentropy": 2.057783196866512, + "loss/hidden": 4.141796875, + "loss/jsd": 0.0, + "loss/logits": 0.27383373510092496, + "step": 4560 + }, + { + "epoch": 0.15233333333333332, + "grad_norm": 33.5, + "grad_norm_var": 7.947330729166667, + "learning_rate": 0.0001, + "loss": 8.9423, + "loss/crossentropy": 2.2479268461465836, + "loss/hidden": 3.940625, + "loss/jsd": 0.0, + "loss/logits": 0.26401854380965234, + "step": 4570 + }, + { + "epoch": 0.15266666666666667, + "grad_norm": 40.25, + "grad_norm_var": 8.190625, + "learning_rate": 0.0001, + "loss": 8.8901, + "loss/crossentropy": 2.228538802266121, + "loss/hidden": 4.050390625, + "loss/jsd": 0.0, + "loss/logits": 0.27664305865764616, + "step": 4580 + }, + { + "epoch": 0.153, + "grad_norm": 32.5, + "grad_norm_var": 21.790625, + "learning_rate": 0.0001, + "loss": 9.0083, + "loss/crossentropy": 2.119870799779892, + "loss/hidden": 4.17265625, + "loss/jsd": 0.0, + "loss/logits": 0.27753249146044257, + "step": 4590 + }, + { + "epoch": 0.15333333333333332, + "grad_norm": 56.5, + "grad_norm_var": 470.8247395833333, + "learning_rate": 0.0001, + "loss": 9.1533, + "loss/crossentropy": 2.0553093053400517, + "loss/hidden": 4.362109375, + "loss/jsd": 0.0, + "loss/logits": 0.3021026611328125, + "step": 4600 + }, + { + "epoch": 0.15366666666666667, + "grad_norm": 41.75, + "grad_norm_var": 463.7455729166667, + "learning_rate": 0.0001, + "loss": 8.9574, + "loss/crossentropy": 2.0937911093235018, + "loss/hidden": 4.11953125, + "loss/jsd": 0.0, + "loss/logits": 0.2679052516818047, + "step": 4610 + }, + { + "epoch": 0.154, + "grad_norm": 35.0, + "grad_norm_var": 6.95, + "learning_rate": 0.0001, + "loss": 9.0725, + "loss/crossentropy": 2.1097617581486703, + "loss/hidden": 4.151171875, + "loss/jsd": 0.0, + "loss/logits": 0.3005022447556257, + "step": 4620 + }, + { + "epoch": 0.15433333333333332, + "grad_norm": 34.0, + "grad_norm_var": 6.76875, + "learning_rate": 0.0001, + "loss": 8.8903, + "loss/crossentropy": 2.187983478605747, + "loss/hidden": 4.08125, + "loss/jsd": 0.0, + "loss/logits": 0.26959136240184306, + "step": 4630 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 36.75, + "grad_norm_var": 6.955143229166667, + "learning_rate": 0.0001, + "loss": 8.9611, + "loss/crossentropy": 2.096404644846916, + "loss/hidden": 4.02265625, + "loss/jsd": 0.0, + "loss/logits": 0.2623141951858997, + "step": 4640 + }, + { + "epoch": 0.155, + "grad_norm": 33.75, + "grad_norm_var": 8.134309895833333, + "learning_rate": 0.0001, + "loss": 8.8924, + "loss/crossentropy": 1.9974856853485108, + "loss/hidden": 4.131640625, + "loss/jsd": 0.0, + "loss/logits": 0.27121835108846426, + "step": 4650 + }, + { + "epoch": 0.15533333333333332, + "grad_norm": 36.75, + "grad_norm_var": 7.055989583333333, + "learning_rate": 0.0001, + "loss": 9.1254, + "loss/crossentropy": 2.255522921681404, + "loss/hidden": 4.053125, + "loss/jsd": 0.0, + "loss/logits": 0.2813701078295708, + "step": 4660 + }, + { + "epoch": 0.15566666666666668, + "grad_norm": 33.5, + "grad_norm_var": 15.758333333333333, + "learning_rate": 0.0001, + "loss": 8.9782, + "loss/crossentropy": 2.1178917974233626, + "loss/hidden": 4.116796875, + "loss/jsd": 0.0, + "loss/logits": 0.27898423206061124, + "step": 4670 + }, + { + "epoch": 0.156, + "grad_norm": 37.25, + "grad_norm_var": 8.62890625, + "learning_rate": 0.0001, + "loss": 8.9927, + "loss/crossentropy": 2.2914595365524293, + "loss/hidden": 4.09921875, + "loss/jsd": 0.0, + "loss/logits": 0.28754087798297406, + "step": 4680 + }, + { + "epoch": 0.15633333333333332, + "grad_norm": 37.5, + "grad_norm_var": 8.773958333333333, + "learning_rate": 0.0001, + "loss": 8.8989, + "loss/crossentropy": 2.075756361335516, + "loss/hidden": 4.016015625, + "loss/jsd": 0.0, + "loss/logits": 0.27427870500832796, + "step": 4690 + }, + { + "epoch": 0.15666666666666668, + "grad_norm": 43.5, + "grad_norm_var": 41.68307291666667, + "learning_rate": 0.0001, + "loss": 9.0313, + "loss/crossentropy": 2.114923672378063, + "loss/hidden": 4.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.2685729030519724, + "step": 4700 + }, + { + "epoch": 0.157, + "grad_norm": 32.25, + "grad_norm_var": 23.582291666666666, + "learning_rate": 0.0001, + "loss": 8.9486, + "loss/crossentropy": 2.1114466533064844, + "loss/hidden": 4.230859375, + "loss/jsd": 0.0, + "loss/logits": 0.28084823917597534, + "step": 4710 + }, + { + "epoch": 0.15733333333333333, + "grad_norm": 40.0, + "grad_norm_var": 17.48515625, + "learning_rate": 0.0001, + "loss": 8.993, + "loss/crossentropy": 2.1299724817276, + "loss/hidden": 4.07265625, + "loss/jsd": 0.0, + "loss/logits": 0.2675957553088665, + "step": 4720 + }, + { + "epoch": 0.15766666666666668, + "grad_norm": 45.75, + "grad_norm_var": 16.35, + "learning_rate": 0.0001, + "loss": 8.9449, + "loss/crossentropy": 2.253695184737444, + "loss/hidden": 4.016796875, + "loss/jsd": 0.0, + "loss/logits": 0.2566886018961668, + "step": 4730 + }, + { + "epoch": 0.158, + "grad_norm": 35.5, + "grad_norm_var": 25.812239583333334, + "learning_rate": 0.0001, + "loss": 9.1158, + "loss/crossentropy": 2.2754971385002136, + "loss/hidden": 4.198046875, + "loss/jsd": 0.0, + "loss/logits": 0.28829921074211595, + "step": 4740 + }, + { + "epoch": 0.15833333333333333, + "grad_norm": 40.0, + "grad_norm_var": 44.77682291666667, + "learning_rate": 0.0001, + "loss": 8.8164, + "loss/crossentropy": 2.0852062880992888, + "loss/hidden": 4.101171875, + "loss/jsd": 0.0, + "loss/logits": 0.2623613655567169, + "step": 4750 + }, + { + "epoch": 0.15866666666666668, + "grad_norm": 34.25, + "grad_norm_var": 18.587239583333332, + "learning_rate": 0.0001, + "loss": 8.8431, + "loss/crossentropy": 2.2237706407904625, + "loss/hidden": 4.155078125, + "loss/jsd": 0.0, + "loss/logits": 0.3078520778566599, + "step": 4760 + }, + { + "epoch": 0.159, + "grad_norm": 39.5, + "grad_norm_var": 10.715625, + "learning_rate": 0.0001, + "loss": 8.8808, + "loss/crossentropy": 2.139245317876339, + "loss/hidden": 4.1, + "loss/jsd": 0.0, + "loss/logits": 0.2770055137574673, + "step": 4770 + }, + { + "epoch": 0.15933333333333333, + "grad_norm": 38.5, + "grad_norm_var": 11.529166666666667, + "learning_rate": 0.0001, + "loss": 9.0282, + "loss/crossentropy": 2.153920599073172, + "loss/hidden": 4.14765625, + "loss/jsd": 0.0, + "loss/logits": 0.2666714245453477, + "step": 4780 + }, + { + "epoch": 0.15966666666666668, + "grad_norm": 37.25, + "grad_norm_var": 3.2831240942886323e+18, + "learning_rate": 0.0001, + "loss": 9.2047, + "loss/crossentropy": 2.181428015232086, + "loss/hidden": 4.17421875, + "loss/jsd": 0.0, + "loss/logits": 0.2813129939138889, + "step": 4790 + }, + { + "epoch": 0.16, + "grad_norm": 33.75, + "grad_norm_var": 6.765625, + "learning_rate": 0.0001, + "loss": 9.0916, + "loss/crossentropy": 2.148864021897316, + "loss/hidden": 4.162890625, + "loss/jsd": 0.0, + "loss/logits": 0.28246700279414655, + "step": 4800 + }, + { + "epoch": 0.16033333333333333, + "grad_norm": 32.0, + "grad_norm_var": 5.557291666666667, + "learning_rate": 0.0001, + "loss": 8.7833, + "loss/crossentropy": 2.20737906396389, + "loss/hidden": 4.041796875, + "loss/jsd": 0.0, + "loss/logits": 0.27596224322915075, + "step": 4810 + }, + { + "epoch": 0.16066666666666668, + "grad_norm": 35.0, + "grad_norm_var": 5.990625, + "learning_rate": 0.0001, + "loss": 8.9472, + "loss/crossentropy": 2.0124839752912522, + "loss/hidden": 4.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.2861068371683359, + "step": 4820 + }, + { + "epoch": 0.161, + "grad_norm": 33.25, + "grad_norm_var": 7.820247395833333, + "learning_rate": 0.0001, + "loss": 8.9053, + "loss/crossentropy": 2.158030679821968, + "loss/hidden": 3.937109375, + "loss/jsd": 0.0, + "loss/logits": 0.26078329905867575, + "step": 4830 + }, + { + "epoch": 0.16133333333333333, + "grad_norm": 34.5, + "grad_norm_var": 10.298372395833333, + "learning_rate": 0.0001, + "loss": 8.9212, + "loss/crossentropy": 2.046878896653652, + "loss/hidden": 4.169921875, + "loss/jsd": 0.0, + "loss/logits": 0.277280671428889, + "step": 4840 + }, + { + "epoch": 0.16166666666666665, + "grad_norm": 36.0, + "grad_norm_var": 8.980989583333333, + "learning_rate": 0.0001, + "loss": 8.8992, + "loss/crossentropy": 2.2036330491304397, + "loss/hidden": 3.940625, + "loss/jsd": 0.0, + "loss/logits": 0.25981649905443194, + "step": 4850 + }, + { + "epoch": 0.162, + "grad_norm": 37.5, + "grad_norm_var": 6.245833333333334, + "learning_rate": 0.0001, + "loss": 8.8779, + "loss/crossentropy": 2.001459051668644, + "loss/hidden": 4.107421875, + "loss/jsd": 0.0, + "loss/logits": 0.2751261981204152, + "step": 4860 + }, + { + "epoch": 0.16233333333333333, + "grad_norm": 34.0, + "grad_norm_var": 24.165625, + "learning_rate": 0.0001, + "loss": 9.0285, + "loss/crossentropy": 2.236599923670292, + "loss/hidden": 3.955078125, + "loss/jsd": 0.0, + "loss/logits": 0.2600332200527191, + "step": 4870 + }, + { + "epoch": 0.16266666666666665, + "grad_norm": 37.0, + "grad_norm_var": 6.662239583333333, + "learning_rate": 0.0001, + "loss": 8.9864, + "loss/crossentropy": 2.0701673187315466, + "loss/hidden": 4.109765625, + "loss/jsd": 0.0, + "loss/logits": 0.27043242640793325, + "step": 4880 + }, + { + "epoch": 0.163, + "grad_norm": 58.0, + "grad_norm_var": 1.8014398247254098e+18, + "learning_rate": 0.0001, + "loss": 8.9593, + "loss/crossentropy": 2.128389702364802, + "loss/hidden": 3.9984375, + "loss/jsd": 0.0, + "loss/logits": 0.2562539763748646, + "step": 4890 + }, + { + "epoch": 0.16333333333333333, + "grad_norm": 33.25, + "grad_norm_var": 1.801439823002949e+18, + "learning_rate": 0.0001, + "loss": 9.0252, + "loss/crossentropy": 2.0912427961826325, + "loss/hidden": 4.151171875, + "loss/jsd": 0.0, + "loss/logits": 0.2807155104354024, + "step": 4900 + }, + { + "epoch": 0.16366666666666665, + "grad_norm": 41.25, + "grad_norm_var": 27.223958333333332, + "learning_rate": 0.0001, + "loss": 8.7847, + "loss/crossentropy": 1.889319808036089, + "loss/hidden": 4.073046875, + "loss/jsd": 0.0, + "loss/logits": 0.2481829353608191, + "step": 4910 + }, + { + "epoch": 0.164, + "grad_norm": 32.75, + "grad_norm_var": 7.939322916666667, + "learning_rate": 0.0001, + "loss": 8.744, + "loss/crossentropy": 2.2621133089065553, + "loss/hidden": 4.037109375, + "loss/jsd": 0.0, + "loss/logits": 0.2557232953608036, + "step": 4920 + }, + { + "epoch": 0.16433333333333333, + "grad_norm": 35.75, + "grad_norm_var": 7.367122395833333, + "learning_rate": 0.0001, + "loss": 8.9305, + "loss/crossentropy": 2.1436791688203813, + "loss/hidden": 4.185546875, + "loss/jsd": 0.0, + "loss/logits": 0.28117387779057024, + "step": 4930 + }, + { + "epoch": 0.16466666666666666, + "grad_norm": 34.0, + "grad_norm_var": 6.180989583333333, + "learning_rate": 0.0001, + "loss": 8.8538, + "loss/crossentropy": 2.124467818439007, + "loss/hidden": 4.153125, + "loss/jsd": 0.0, + "loss/logits": 0.27935762144625187, + "step": 4940 + }, + { + "epoch": 0.165, + "grad_norm": 34.5, + "grad_norm_var": 3.9208333333333334, + "learning_rate": 0.0001, + "loss": 9.0238, + "loss/crossentropy": 2.1031661182641983, + "loss/hidden": 4.13359375, + "loss/jsd": 0.0, + "loss/logits": 0.2820569805800915, + "step": 4950 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 36.75, + "grad_norm_var": 7.434309895833334, + "learning_rate": 0.0001, + "loss": 8.9877, + "loss/crossentropy": 2.1274607971310617, + "loss/hidden": 4.0453125, + "loss/jsd": 0.0, + "loss/logits": 0.25978016797453163, + "step": 4960 + }, + { + "epoch": 0.16566666666666666, + "grad_norm": 35.75, + "grad_norm_var": 9.023372395833333, + "learning_rate": 0.0001, + "loss": 8.9786, + "loss/crossentropy": 2.265768840909004, + "loss/hidden": 3.933984375, + "loss/jsd": 0.0, + "loss/logits": 0.2593372922390699, + "step": 4970 + }, + { + "epoch": 0.166, + "grad_norm": 35.25, + "grad_norm_var": 2.4580729166666666, + "learning_rate": 0.0001, + "loss": 8.9312, + "loss/crossentropy": 2.077937413752079, + "loss/hidden": 4.08984375, + "loss/jsd": 0.0, + "loss/logits": 0.27119250893592833, + "step": 4980 + }, + { + "epoch": 0.16633333333333333, + "grad_norm": 34.25, + "grad_norm_var": 300.88743489583334, + "learning_rate": 0.0001, + "loss": 8.8076, + "loss/crossentropy": 2.2653584659099577, + "loss/hidden": 4.03515625, + "loss/jsd": 0.0, + "loss/logits": 0.27666972354054453, + "step": 4990 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 35.25, + "grad_norm_var": 6.1087890625, + "learning_rate": 0.0001, + "loss": 8.9492, + "loss/crossentropy": 2.1622142292559148, + "loss/hidden": 4.054296875, + "loss/jsd": 0.0, + "loss/logits": 0.2614580035209656, + "step": 5000 + }, + { + "epoch": 0.167, + "grad_norm": 36.5, + "grad_norm_var": 6.083072916666667, + "learning_rate": 0.0001, + "loss": 8.8944, + "loss/crossentropy": 2.125202904641628, + "loss/hidden": 4.095703125, + "loss/jsd": 0.0, + "loss/logits": 0.27341773808002473, + "step": 5010 + }, + { + "epoch": 0.16733333333333333, + "grad_norm": 35.25, + "grad_norm_var": 12.689322916666667, + "learning_rate": 0.0001, + "loss": 9.0034, + "loss/crossentropy": 2.2574457243084907, + "loss/hidden": 4.0109375, + "loss/jsd": 0.0, + "loss/logits": 0.27399955950677396, + "step": 5020 + }, + { + "epoch": 0.16766666666666666, + "grad_norm": 42.0, + "grad_norm_var": 12.624739583333334, + "learning_rate": 0.0001, + "loss": 9.0668, + "loss/crossentropy": 2.0945447117090223, + "loss/hidden": 4.031640625, + "loss/jsd": 0.0, + "loss/logits": 0.27295216396450994, + "step": 5030 + }, + { + "epoch": 0.168, + "grad_norm": 35.0, + "grad_norm_var": 20.599739583333335, + "learning_rate": 0.0001, + "loss": 8.9586, + "loss/crossentropy": 2.2430111899971963, + "loss/hidden": 4.026953125, + "loss/jsd": 0.0, + "loss/logits": 0.27031025942415, + "step": 5040 + }, + { + "epoch": 0.16833333333333333, + "grad_norm": 36.5, + "grad_norm_var": 17.99765625, + "learning_rate": 0.0001, + "loss": 9.1637, + "loss/crossentropy": 2.1593209132552147, + "loss/hidden": 4.162109375, + "loss/jsd": 0.0, + "loss/logits": 0.2839784752577543, + "step": 5050 + }, + { + "epoch": 0.16866666666666666, + "grad_norm": 32.25, + "grad_norm_var": 9.022330729166667, + "learning_rate": 0.0001, + "loss": 8.8245, + "loss/crossentropy": 2.1471748799085617, + "loss/hidden": 4.0046875, + "loss/jsd": 0.0, + "loss/logits": 0.26835247687995434, + "step": 5060 + }, + { + "epoch": 0.169, + "grad_norm": 36.5, + "grad_norm_var": 87.60390625, + "learning_rate": 0.0001, + "loss": 8.8161, + "loss/crossentropy": 2.0496731594204904, + "loss/hidden": 4.125390625, + "loss/jsd": 0.0, + "loss/logits": 0.2703436575829983, + "step": 5070 + }, + { + "epoch": 0.16933333333333334, + "grad_norm": 34.5, + "grad_norm_var": 100.34557291666667, + "learning_rate": 0.0001, + "loss": 9.0138, + "loss/crossentropy": 2.1915629282593727, + "loss/hidden": 4.15546875, + "loss/jsd": 0.0, + "loss/logits": 0.2765682227909565, + "step": 5080 + }, + { + "epoch": 0.16966666666666666, + "grad_norm": 37.0, + "grad_norm_var": 3.41640625, + "learning_rate": 0.0001, + "loss": 8.9229, + "loss/crossentropy": 2.257983461022377, + "loss/hidden": 4.04921875, + "loss/jsd": 0.0, + "loss/logits": 0.262128459662199, + "step": 5090 + }, + { + "epoch": 0.17, + "grad_norm": 35.75, + "grad_norm_var": 15.81640625, + "learning_rate": 0.0001, + "loss": 8.8926, + "loss/crossentropy": 2.187410834431648, + "loss/hidden": 4.071484375, + "loss/jsd": 0.0, + "loss/logits": 0.2799120504409075, + "step": 5100 + }, + { + "epoch": 0.17033333333333334, + "grad_norm": 35.75, + "grad_norm_var": 5.523958333333334, + "learning_rate": 0.0001, + "loss": 8.8704, + "loss/crossentropy": 2.0666673690080644, + "loss/hidden": 4.118359375, + "loss/jsd": 0.0, + "loss/logits": 0.2736505573615432, + "step": 5110 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 36.75, + "grad_norm_var": 16.540625, + "learning_rate": 0.0001, + "loss": 8.784, + "loss/crossentropy": 2.055477923154831, + "loss/hidden": 4.126953125, + "loss/jsd": 0.0, + "loss/logits": 0.27156198769807816, + "step": 5120 + }, + { + "epoch": 0.171, + "grad_norm": 41.75, + "grad_norm_var": 20.34765625, + "learning_rate": 0.0001, + "loss": 8.9327, + "loss/crossentropy": 2.037408724427223, + "loss/hidden": 4.025390625, + "loss/jsd": 0.0, + "loss/logits": 0.2697664858773351, + "step": 5130 + }, + { + "epoch": 0.17133333333333334, + "grad_norm": 36.75, + "grad_norm_var": 17.532747395833333, + "learning_rate": 0.0001, + "loss": 8.8144, + "loss/crossentropy": 2.1504238069057466, + "loss/hidden": 3.99453125, + "loss/jsd": 0.0, + "loss/logits": 0.2602078752592206, + "step": 5140 + }, + { + "epoch": 0.17166666666666666, + "grad_norm": 50.25, + "grad_norm_var": 22.230989583333333, + "learning_rate": 0.0001, + "loss": 8.7907, + "loss/crossentropy": 2.0753002099692823, + "loss/hidden": 3.951953125, + "loss/jsd": 0.0, + "loss/logits": 0.24998050797730684, + "step": 5150 + }, + { + "epoch": 0.172, + "grad_norm": 33.5, + "grad_norm_var": 18.791666666666668, + "learning_rate": 0.0001, + "loss": 8.8137, + "loss/crossentropy": 2.179745650291443, + "loss/hidden": 3.980078125, + "loss/jsd": 0.0, + "loss/logits": 0.259993402659893, + "step": 5160 + }, + { + "epoch": 0.17233333333333334, + "grad_norm": 33.25, + "grad_norm_var": 6.505989583333333, + "learning_rate": 0.0001, + "loss": 8.8, + "loss/crossentropy": 2.090472859144211, + "loss/hidden": 3.967578125, + "loss/jsd": 0.0, + "loss/logits": 0.26382889300584794, + "step": 5170 + }, + { + "epoch": 0.17266666666666666, + "grad_norm": 32.75, + "grad_norm_var": 9.782291666666667, + "learning_rate": 0.0001, + "loss": 8.8922, + "loss/crossentropy": 2.0885345190763474, + "loss/hidden": 4.10078125, + "loss/jsd": 0.0, + "loss/logits": 0.2721933271735907, + "step": 5180 + }, + { + "epoch": 0.173, + "grad_norm": 33.25, + "grad_norm_var": 12.4244140625, + "learning_rate": 0.0001, + "loss": 8.9584, + "loss/crossentropy": 2.141716684401035, + "loss/hidden": 4.02578125, + "loss/jsd": 0.0, + "loss/logits": 0.2675710514187813, + "step": 5190 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 32.5, + "grad_norm_var": 9.2931640625, + "learning_rate": 0.0001, + "loss": 8.9231, + "loss/crossentropy": 2.0917196184396745, + "loss/hidden": 4.080859375, + "loss/jsd": 0.0, + "loss/logits": 0.29290595967322586, + "step": 5200 + }, + { + "epoch": 0.17366666666666666, + "grad_norm": 48.25, + "grad_norm_var": 11236.739322916666, + "learning_rate": 0.0001, + "loss": 9.1762, + "loss/crossentropy": 2.257916547358036, + "loss/hidden": 4.270703125, + "loss/jsd": 0.0, + "loss/logits": 0.3100666496902704, + "step": 5210 + }, + { + "epoch": 0.174, + "grad_norm": 43.0, + "grad_norm_var": 11218.0125, + "learning_rate": 0.0001, + "loss": 9.0758, + "loss/crossentropy": 2.14205886721611, + "loss/hidden": 4.219140625, + "loss/jsd": 0.0, + "loss/logits": 0.3077359441667795, + "step": 5220 + }, + { + "epoch": 0.17433333333333334, + "grad_norm": 35.75, + "grad_norm_var": 17.448958333333334, + "learning_rate": 0.0001, + "loss": 8.8649, + "loss/crossentropy": 2.228261913359165, + "loss/hidden": 4.16484375, + "loss/jsd": 0.0, + "loss/logits": 0.31307811439037325, + "step": 5230 + }, + { + "epoch": 0.17466666666666666, + "grad_norm": 37.75, + "grad_norm_var": 10.398958333333333, + "learning_rate": 0.0001, + "loss": 8.8652, + "loss/crossentropy": 2.1450634144246576, + "loss/hidden": 4.140625, + "loss/jsd": 0.0, + "loss/logits": 0.28023948706686497, + "step": 5240 + }, + { + "epoch": 0.175, + "grad_norm": 35.0, + "grad_norm_var": 9.215559895833334, + "learning_rate": 0.0001, + "loss": 8.8716, + "loss/crossentropy": 2.114921988546848, + "loss/hidden": 4.04375, + "loss/jsd": 0.0, + "loss/logits": 0.27058052010834216, + "step": 5250 + }, + { + "epoch": 0.17533333333333334, + "grad_norm": 33.25, + "grad_norm_var": 7.984309895833333, + "learning_rate": 0.0001, + "loss": 8.9649, + "loss/crossentropy": 2.2500276297330855, + "loss/hidden": 3.951171875, + "loss/jsd": 0.0, + "loss/logits": 0.2647197004407644, + "step": 5260 + }, + { + "epoch": 0.17566666666666667, + "grad_norm": 33.75, + "grad_norm_var": 5.643489583333333, + "learning_rate": 0.0001, + "loss": 8.7341, + "loss/crossentropy": 2.175753255933523, + "loss/hidden": 4.11171875, + "loss/jsd": 0.0, + "loss/logits": 0.27517074095085264, + "step": 5270 + }, + { + "epoch": 0.176, + "grad_norm": 36.0, + "grad_norm_var": 10.556184895833333, + "learning_rate": 0.0001, + "loss": 8.9871, + "loss/crossentropy": 2.175162176787853, + "loss/hidden": 4.271484375, + "loss/jsd": 0.0, + "loss/logits": 0.3377554725855589, + "step": 5280 + }, + { + "epoch": 0.17633333333333334, + "grad_norm": 34.5, + "grad_norm_var": 10.546809895833333, + "learning_rate": 0.0001, + "loss": 8.922, + "loss/crossentropy": 2.143188028782606, + "loss/hidden": 4.023828125, + "loss/jsd": 0.0, + "loss/logits": 0.2618455559015274, + "step": 5290 + }, + { + "epoch": 0.17666666666666667, + "grad_norm": 31.625, + "grad_norm_var": 19.506705729166665, + "learning_rate": 0.0001, + "loss": 8.7904, + "loss/crossentropy": 2.0671297401189803, + "loss/hidden": 4.09765625, + "loss/jsd": 0.0, + "loss/logits": 0.25971175618469716, + "step": 5300 + }, + { + "epoch": 0.177, + "grad_norm": 42.75, + "grad_norm_var": 15.354622395833333, + "learning_rate": 0.0001, + "loss": 8.9056, + "loss/crossentropy": 2.221310918033123, + "loss/hidden": 4.06796875, + "loss/jsd": 0.0, + "loss/logits": 0.278064251691103, + "step": 5310 + }, + { + "epoch": 0.17733333333333334, + "grad_norm": 33.5, + "grad_norm_var": 9.375, + "learning_rate": 0.0001, + "loss": 8.7992, + "loss/crossentropy": 2.0719747349619864, + "loss/hidden": 4.105859375, + "loss/jsd": 0.0, + "loss/logits": 0.2616250865161419, + "step": 5320 + }, + { + "epoch": 0.17766666666666667, + "grad_norm": 32.0, + "grad_norm_var": 5.623958333333333, + "learning_rate": 0.0001, + "loss": 8.7475, + "loss/crossentropy": 2.0968958541750906, + "loss/hidden": 4.021484375, + "loss/jsd": 0.0, + "loss/logits": 0.2842200789600611, + "step": 5330 + }, + { + "epoch": 0.178, + "grad_norm": 38.5, + "grad_norm_var": 7.82265625, + "learning_rate": 0.0001, + "loss": 8.8437, + "loss/crossentropy": 2.1536238461732866, + "loss/hidden": 3.87421875, + "loss/jsd": 0.0, + "loss/logits": 0.24708390831947327, + "step": 5340 + }, + { + "epoch": 0.17833333333333334, + "grad_norm": 30.875, + "grad_norm_var": 11.901497395833333, + "learning_rate": 0.0001, + "loss": 8.7492, + "loss/crossentropy": 2.1353225603699686, + "loss/hidden": 3.94453125, + "loss/jsd": 0.0, + "loss/logits": 0.2444358326494694, + "step": 5350 + }, + { + "epoch": 0.17866666666666667, + "grad_norm": 34.25, + "grad_norm_var": 21.014518229166665, + "learning_rate": 0.0001, + "loss": 8.9323, + "loss/crossentropy": 2.148284465074539, + "loss/hidden": 4.20703125, + "loss/jsd": 0.0, + "loss/logits": 0.2887397948652506, + "step": 5360 + }, + { + "epoch": 0.179, + "grad_norm": 35.5, + "grad_norm_var": 7.264518229166667, + "learning_rate": 0.0001, + "loss": 8.7864, + "loss/crossentropy": 1.94754306524992, + "loss/hidden": 4.208984375, + "loss/jsd": 0.0, + "loss/logits": 0.2864205963909626, + "step": 5370 + }, + { + "epoch": 0.17933333333333334, + "grad_norm": 32.75, + "grad_norm_var": 17.825455729166666, + "learning_rate": 0.0001, + "loss": 8.7709, + "loss/crossentropy": 2.111784017086029, + "loss/hidden": 4.049609375, + "loss/jsd": 0.0, + "loss/logits": 0.2593661729246378, + "step": 5380 + }, + { + "epoch": 0.17966666666666667, + "grad_norm": 35.25, + "grad_norm_var": 19.040625, + "learning_rate": 0.0001, + "loss": 8.9434, + "loss/crossentropy": 2.1750704884529113, + "loss/hidden": 4.074609375, + "loss/jsd": 0.0, + "loss/logits": 0.27710040137171743, + "step": 5390 + }, + { + "epoch": 0.18, + "grad_norm": 30.375, + "grad_norm_var": 366.95520833333336, + "learning_rate": 0.0001, + "loss": 8.848, + "loss/crossentropy": 2.1083780497312548, + "loss/hidden": 3.966015625, + "loss/jsd": 0.0, + "loss/logits": 0.25344079583883283, + "step": 5400 + }, + { + "epoch": 0.18033333333333335, + "grad_norm": 34.75, + "grad_norm_var": 379.9759765625, + "learning_rate": 0.0001, + "loss": 9.0775, + "loss/crossentropy": 2.1748669266700746, + "loss/hidden": 4.206640625, + "loss/jsd": 0.0, + "loss/logits": 0.32070644982159136, + "step": 5410 + }, + { + "epoch": 0.18066666666666667, + "grad_norm": 43.0, + "grad_norm_var": 7.449739583333334, + "learning_rate": 0.0001, + "loss": 9.0152, + "loss/crossentropy": 2.113310632109642, + "loss/hidden": 4.053125, + "loss/jsd": 0.0, + "loss/logits": 0.25953211821615696, + "step": 5420 + }, + { + "epoch": 0.181, + "grad_norm": 34.5, + "grad_norm_var": 11.134375, + "learning_rate": 0.0001, + "loss": 8.7841, + "loss/crossentropy": 2.132595753669739, + "loss/hidden": 3.9265625, + "loss/jsd": 0.0, + "loss/logits": 0.2528179976157844, + "step": 5430 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 33.5, + "grad_norm_var": 3.84765625, + "learning_rate": 0.0001, + "loss": 8.8766, + "loss/crossentropy": 2.1760675475001334, + "loss/hidden": 4.0890625, + "loss/jsd": 0.0, + "loss/logits": 0.2604052824899554, + "step": 5440 + }, + { + "epoch": 0.18166666666666667, + "grad_norm": 33.75, + "grad_norm_var": 4.217643229166667, + "learning_rate": 0.0001, + "loss": 8.9479, + "loss/crossentropy": 2.10125227868557, + "loss/hidden": 4.119140625, + "loss/jsd": 0.0, + "loss/logits": 0.275428506731987, + "step": 5450 + }, + { + "epoch": 0.182, + "grad_norm": 36.75, + "grad_norm_var": 3.2603515625, + "learning_rate": 0.0001, + "loss": 8.8121, + "loss/crossentropy": 2.0651059225201607, + "loss/hidden": 3.9671875, + "loss/jsd": 0.0, + "loss/logits": 0.26424469240009785, + "step": 5460 + }, + { + "epoch": 0.18233333333333332, + "grad_norm": 41.5, + "grad_norm_var": 55.25807291666667, + "learning_rate": 0.0001, + "loss": 8.9347, + "loss/crossentropy": 2.1086655259132385, + "loss/hidden": 4.075, + "loss/jsd": 0.0, + "loss/logits": 0.2710310023277998, + "step": 5470 + }, + { + "epoch": 0.18266666666666667, + "grad_norm": 31.25, + "grad_norm_var": 128.47858072916668, + "learning_rate": 0.0001, + "loss": 9.0253, + "loss/crossentropy": 2.069367530941963, + "loss/hidden": 4.250390625, + "loss/jsd": 0.0, + "loss/logits": 0.26114317737519743, + "step": 5480 + }, + { + "epoch": 0.183, + "grad_norm": 34.25, + "grad_norm_var": 97.15670572916666, + "learning_rate": 0.0001, + "loss": 8.9139, + "loss/crossentropy": 2.1907971248030664, + "loss/hidden": 4.216015625, + "loss/jsd": 0.0, + "loss/logits": 0.301979548484087, + "step": 5490 + }, + { + "epoch": 0.18333333333333332, + "grad_norm": 33.0, + "grad_norm_var": 1.647261910309955e+18, + "learning_rate": 0.0001, + "loss": 8.9109, + "loss/crossentropy": 2.268464684486389, + "loss/hidden": 4.323828125, + "loss/jsd": 0.0, + "loss/logits": 0.3525669999420643, + "step": 5500 + }, + { + "epoch": 0.18366666666666667, + "grad_norm": 32.0, + "grad_norm_var": 1.6472619102618255e+18, + "learning_rate": 0.0001, + "loss": 8.7968, + "loss/crossentropy": 2.2150946646928786, + "loss/hidden": 3.966015625, + "loss/jsd": 0.0, + "loss/logits": 0.25991535745561123, + "step": 5510 + }, + { + "epoch": 0.184, + "grad_norm": 46.0, + "grad_norm_var": 40.7556640625, + "learning_rate": 0.0001, + "loss": 8.8022, + "loss/crossentropy": 1.9760248348116876, + "loss/hidden": 3.994140625, + "loss/jsd": 0.0, + "loss/logits": 0.26316012144088746, + "step": 5520 + }, + { + "epoch": 0.18433333333333332, + "grad_norm": 33.25, + "grad_norm_var": 38.213541666666664, + "learning_rate": 0.0001, + "loss": 8.9662, + "loss/crossentropy": 2.1715099826455115, + "loss/hidden": 4.0765625, + "loss/jsd": 0.0, + "loss/logits": 0.2730803471058607, + "step": 5530 + }, + { + "epoch": 0.18466666666666667, + "grad_norm": 37.0, + "grad_norm_var": 16.27265625, + "learning_rate": 0.0001, + "loss": 8.8223, + "loss/crossentropy": 2.039857251942158, + "loss/hidden": 3.984765625, + "loss/jsd": 0.0, + "loss/logits": 0.2515922043472528, + "step": 5540 + }, + { + "epoch": 0.185, + "grad_norm": 35.25, + "grad_norm_var": 25.537239583333335, + "learning_rate": 0.0001, + "loss": 8.9861, + "loss/crossentropy": 2.2859031215310095, + "loss/hidden": 4.180078125, + "loss/jsd": 0.0, + "loss/logits": 0.29738733656704425, + "step": 5550 + }, + { + "epoch": 0.18533333333333332, + "grad_norm": 36.75, + "grad_norm_var": 30.639322916666668, + "learning_rate": 0.0001, + "loss": 8.6376, + "loss/crossentropy": 2.193627268075943, + "loss/hidden": 4.059765625, + "loss/jsd": 0.0, + "loss/logits": 0.2650878496468067, + "step": 5560 + }, + { + "epoch": 0.18566666666666667, + "grad_norm": 45.5, + "grad_norm_var": 41.41223958333333, + "learning_rate": 0.0001, + "loss": 8.7447, + "loss/crossentropy": 2.255605274438858, + "loss/hidden": 4.147265625, + "loss/jsd": 0.0, + "loss/logits": 0.29602186791598795, + "step": 5570 + }, + { + "epoch": 0.186, + "grad_norm": 33.25, + "grad_norm_var": 23.320572916666666, + "learning_rate": 0.0001, + "loss": 8.7573, + "loss/crossentropy": 1.9119407512247562, + "loss/hidden": 4.146484375, + "loss/jsd": 0.0, + "loss/logits": 0.25584258073940874, + "step": 5580 + }, + { + "epoch": 0.18633333333333332, + "grad_norm": 33.0, + "grad_norm_var": 12.671875, + "learning_rate": 0.0001, + "loss": 8.8568, + "loss/crossentropy": 2.20727731436491, + "loss/hidden": 4.006640625, + "loss/jsd": 0.0, + "loss/logits": 0.2666482891887426, + "step": 5590 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 34.5, + "grad_norm_var": 17.099739583333335, + "learning_rate": 0.0001, + "loss": 8.7688, + "loss/crossentropy": 2.250939354300499, + "loss/hidden": 4.031640625, + "loss/jsd": 0.0, + "loss/logits": 0.27695418410003186, + "step": 5600 + }, + { + "epoch": 0.187, + "grad_norm": 34.25, + "grad_norm_var": 55.0259765625, + "learning_rate": 0.0001, + "loss": 8.7403, + "loss/crossentropy": 2.123698775470257, + "loss/hidden": 4.128515625, + "loss/jsd": 0.0, + "loss/logits": 0.266297522559762, + "step": 5610 + }, + { + "epoch": 0.18733333333333332, + "grad_norm": 37.5, + "grad_norm_var": 9.993684895833333, + "learning_rate": 0.0001, + "loss": 8.7692, + "loss/crossentropy": 1.985411663353443, + "loss/hidden": 4.025390625, + "loss/jsd": 0.0, + "loss/logits": 0.24248458091169595, + "step": 5620 + }, + { + "epoch": 0.18766666666666668, + "grad_norm": 33.25, + "grad_norm_var": 9.817122395833334, + "learning_rate": 0.0001, + "loss": 8.8696, + "loss/crossentropy": 2.1818307891488073, + "loss/hidden": 4.0328125, + "loss/jsd": 0.0, + "loss/logits": 0.26730893813073636, + "step": 5630 + }, + { + "epoch": 0.188, + "grad_norm": 34.75, + "grad_norm_var": 2.0837890625, + "learning_rate": 0.0001, + "loss": 8.6818, + "loss/crossentropy": 1.9562100693583488, + "loss/hidden": 3.86953125, + "loss/jsd": 0.0, + "loss/logits": 0.2361563365906477, + "step": 5640 + }, + { + "epoch": 0.18833333333333332, + "grad_norm": 35.0, + "grad_norm_var": 8.556705729166667, + "learning_rate": 0.0001, + "loss": 8.7822, + "loss/crossentropy": 2.2117529645562173, + "loss/hidden": 4.125390625, + "loss/jsd": 0.0, + "loss/logits": 0.27936047930270436, + "step": 5650 + }, + { + "epoch": 0.18866666666666668, + "grad_norm": 37.0, + "grad_norm_var": 18.499934895833334, + "learning_rate": 0.0001, + "loss": 8.8269, + "loss/crossentropy": 2.1983281478285788, + "loss/hidden": 4.016015625, + "loss/jsd": 0.0, + "loss/logits": 0.27177377715706824, + "step": 5660 + }, + { + "epoch": 0.189, + "grad_norm": 33.5, + "grad_norm_var": 53.44348958333333, + "learning_rate": 0.0001, + "loss": 8.9078, + "loss/crossentropy": 2.1987064227461817, + "loss/hidden": 4.033984375, + "loss/jsd": 0.0, + "loss/logits": 0.2688568111509085, + "step": 5670 + }, + { + "epoch": 0.18933333333333333, + "grad_norm": 39.5, + "grad_norm_var": 79.27682291666666, + "learning_rate": 0.0001, + "loss": 8.9224, + "loss/crossentropy": 2.0661555171012878, + "loss/hidden": 4.004296875, + "loss/jsd": 0.0, + "loss/logits": 0.26280424017459153, + "step": 5680 + }, + { + "epoch": 0.18966666666666668, + "grad_norm": 34.5, + "grad_norm_var": 76.48515625, + "learning_rate": 0.0001, + "loss": 8.8074, + "loss/crossentropy": 2.2006511926651, + "loss/hidden": 3.94453125, + "loss/jsd": 0.0, + "loss/logits": 0.2695758603513241, + "step": 5690 + }, + { + "epoch": 0.19, + "grad_norm": 29.625, + "grad_norm_var": 14.560872395833334, + "learning_rate": 0.0001, + "loss": 8.7091, + "loss/crossentropy": 2.1932696878910063, + "loss/hidden": 3.92578125, + "loss/jsd": 0.0, + "loss/logits": 0.2508305963128805, + "step": 5700 + }, + { + "epoch": 0.19033333333333333, + "grad_norm": 30.125, + "grad_norm_var": 29.80390625, + "learning_rate": 0.0001, + "loss": 8.8264, + "loss/crossentropy": 1.9624864727258682, + "loss/hidden": 4.17421875, + "loss/jsd": 0.0, + "loss/logits": 0.270268096216023, + "step": 5710 + }, + { + "epoch": 0.19066666666666668, + "grad_norm": 34.5, + "grad_norm_var": 10.429622395833333, + "learning_rate": 0.0001, + "loss": 8.7384, + "loss/crossentropy": 2.1305344730615614, + "loss/hidden": 3.949609375, + "loss/jsd": 0.0, + "loss/logits": 0.25091919098049403, + "step": 5720 + }, + { + "epoch": 0.191, + "grad_norm": 31.125, + "grad_norm_var": 10.763997395833334, + "learning_rate": 0.0001, + "loss": 8.7249, + "loss/crossentropy": 2.138452613353729, + "loss/hidden": 4.002734375, + "loss/jsd": 0.0, + "loss/logits": 0.2561824645847082, + "step": 5730 + }, + { + "epoch": 0.19133333333333333, + "grad_norm": 36.25, + "grad_norm_var": 10.508268229166667, + "learning_rate": 0.0001, + "loss": 8.7972, + "loss/crossentropy": 2.291805052757263, + "loss/hidden": 4.128125, + "loss/jsd": 0.0, + "loss/logits": 0.2823994573205709, + "step": 5740 + }, + { + "epoch": 0.19166666666666668, + "grad_norm": 33.75, + "grad_norm_var": 6.726822916666666, + "learning_rate": 0.0001, + "loss": 8.7028, + "loss/crossentropy": 2.0102537497878075, + "loss/hidden": 3.940234375, + "loss/jsd": 0.0, + "loss/logits": 0.2453432971611619, + "step": 5750 + }, + { + "epoch": 0.192, + "grad_norm": 33.5, + "grad_norm_var": 6.389518229166667, + "learning_rate": 0.0001, + "loss": 8.7465, + "loss/crossentropy": 2.1731634236872197, + "loss/hidden": 3.894921875, + "loss/jsd": 0.0, + "loss/logits": 0.24393599089235068, + "step": 5760 + }, + { + "epoch": 0.19233333333333333, + "grad_norm": 40.5, + "grad_norm_var": 9.5837890625, + "learning_rate": 0.0001, + "loss": 8.7436, + "loss/crossentropy": 1.9870410725474357, + "loss/hidden": 4.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.25361278727650644, + "step": 5770 + }, + { + "epoch": 0.19266666666666668, + "grad_norm": 42.0, + "grad_norm_var": 14.058333333333334, + "learning_rate": 0.0001, + "loss": 8.7938, + "loss/crossentropy": 1.954255884513259, + "loss/hidden": 4.16328125, + "loss/jsd": 0.0, + "loss/logits": 0.27968817451037464, + "step": 5780 + }, + { + "epoch": 0.193, + "grad_norm": 53.5, + "grad_norm_var": 26.7869140625, + "learning_rate": 0.0001, + "loss": 8.8003, + "loss/crossentropy": 2.2312229365110396, + "loss/hidden": 3.932421875, + "loss/jsd": 0.0, + "loss/logits": 0.25941078290343283, + "step": 5790 + }, + { + "epoch": 0.19333333333333333, + "grad_norm": 30.375, + "grad_norm_var": 26.9681640625, + "learning_rate": 0.0001, + "loss": 8.6889, + "loss/crossentropy": 2.2241889007389544, + "loss/hidden": 4.01640625, + "loss/jsd": 0.0, + "loss/logits": 0.25501362718641757, + "step": 5800 + }, + { + "epoch": 0.19366666666666665, + "grad_norm": 34.25, + "grad_norm_var": 6.756705729166667, + "learning_rate": 0.0001, + "loss": 8.6629, + "loss/crossentropy": 2.168029661476612, + "loss/hidden": 4.10390625, + "loss/jsd": 0.0, + "loss/logits": 0.2718012981116772, + "step": 5810 + }, + { + "epoch": 0.194, + "grad_norm": 32.75, + "grad_norm_var": 4.945833333333334, + "learning_rate": 0.0001, + "loss": 8.7136, + "loss/crossentropy": 2.1019906878471373, + "loss/hidden": 3.940234375, + "loss/jsd": 0.0, + "loss/logits": 0.25093156583607196, + "step": 5820 + }, + { + "epoch": 0.19433333333333333, + "grad_norm": 35.75, + "grad_norm_var": 9.555989583333334, + "learning_rate": 0.0001, + "loss": 8.6915, + "loss/crossentropy": 1.9436087012290955, + "loss/hidden": 4.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.25613378193229436, + "step": 5830 + }, + { + "epoch": 0.19466666666666665, + "grad_norm": 33.25, + "grad_norm_var": 28.499739583333334, + "learning_rate": 0.0001, + "loss": 8.8635, + "loss/crossentropy": 2.1651584833860396, + "loss/hidden": 4.078125, + "loss/jsd": 0.0, + "loss/logits": 0.2904574632644653, + "step": 5840 + }, + { + "epoch": 0.195, + "grad_norm": 36.0, + "grad_norm_var": 21.8806640625, + "learning_rate": 0.0001, + "loss": 8.8006, + "loss/crossentropy": 2.1804804012179373, + "loss/hidden": 3.935546875, + "loss/jsd": 0.0, + "loss/logits": 0.27036979123950006, + "step": 5850 + }, + { + "epoch": 0.19533333333333333, + "grad_norm": 34.25, + "grad_norm_var": 5.287434895833333, + "learning_rate": 0.0001, + "loss": 8.7782, + "loss/crossentropy": 2.218150386214256, + "loss/hidden": 4.070703125, + "loss/jsd": 0.0, + "loss/logits": 0.2726810619235039, + "step": 5860 + }, + { + "epoch": 0.19566666666666666, + "grad_norm": 34.0, + "grad_norm_var": 6.895572916666667, + "learning_rate": 0.0001, + "loss": 8.7639, + "loss/crossentropy": 1.987610936909914, + "loss/hidden": 3.981640625, + "loss/jsd": 0.0, + "loss/logits": 0.2631619516760111, + "step": 5870 + }, + { + "epoch": 0.196, + "grad_norm": 34.25, + "grad_norm_var": 7.1416015625, + "learning_rate": 0.0001, + "loss": 8.8358, + "loss/crossentropy": 2.0127531036734583, + "loss/hidden": 3.98828125, + "loss/jsd": 0.0, + "loss/logits": 0.24521693456918, + "step": 5880 + }, + { + "epoch": 0.19633333333333333, + "grad_norm": 37.0, + "grad_norm_var": 2.460724587808127e+18, + "learning_rate": 0.0001, + "loss": 8.7964, + "loss/crossentropy": 2.158496895432472, + "loss/hidden": 4.165625, + "loss/jsd": 0.0, + "loss/logits": 0.27088434621691704, + "step": 5890 + }, + { + "epoch": 0.19666666666666666, + "grad_norm": 37.5, + "grad_norm_var": 2.460724587657796e+18, + "learning_rate": 0.0001, + "loss": 8.7617, + "loss/crossentropy": 2.0347786456346513, + "loss/hidden": 3.994921875, + "loss/jsd": 0.0, + "loss/logits": 0.260778752155602, + "step": 5900 + }, + { + "epoch": 0.197, + "grad_norm": 34.75, + "grad_norm_var": 3.6489583333333333, + "learning_rate": 0.0001, + "loss": 8.6594, + "loss/crossentropy": 1.9388806536793708, + "loss/hidden": 4.1, + "loss/jsd": 0.0, + "loss/logits": 0.25395537763834, + "step": 5910 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 32.75, + "grad_norm_var": 6.189518229166667, + "learning_rate": 0.0001, + "loss": 8.7194, + "loss/crossentropy": 2.3467346012592314, + "loss/hidden": 3.984375, + "loss/jsd": 0.0, + "loss/logits": 0.27363166082650425, + "step": 5920 + }, + { + "epoch": 0.19766666666666666, + "grad_norm": 28.25, + "grad_norm_var": 6.940559895833333, + "learning_rate": 0.0001, + "loss": 8.7149, + "loss/crossentropy": 2.1685155972838404, + "loss/hidden": 4.040625, + "loss/jsd": 0.0, + "loss/logits": 0.2792276293039322, + "step": 5930 + }, + { + "epoch": 0.198, + "grad_norm": 44.0, + "grad_norm_var": 14.130143229166666, + "learning_rate": 0.0001, + "loss": 8.9314, + "loss/crossentropy": 2.12467120885849, + "loss/hidden": 4.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.2669179428368807, + "step": 5940 + }, + { + "epoch": 0.19833333333333333, + "grad_norm": 28.5, + "grad_norm_var": 16.2056640625, + "learning_rate": 0.0001, + "loss": 8.8839, + "loss/crossentropy": 2.1688647463917734, + "loss/hidden": 3.9515625, + "loss/jsd": 0.0, + "loss/logits": 0.26754092089831827, + "step": 5950 + }, + { + "epoch": 0.19866666666666666, + "grad_norm": 36.0, + "grad_norm_var": 20.826041666666665, + "learning_rate": 0.0001, + "loss": 8.8604, + "loss/crossentropy": 2.182598438858986, + "loss/hidden": 4.00234375, + "loss/jsd": 0.0, + "loss/logits": 0.2616792509332299, + "step": 5960 + }, + { + "epoch": 0.199, + "grad_norm": 34.0, + "grad_norm_var": 18.274934895833333, + "learning_rate": 0.0001, + "loss": 8.7711, + "loss/crossentropy": 2.132971841096878, + "loss/hidden": 4.112890625, + "loss/jsd": 0.0, + "loss/logits": 0.3083286764100194, + "step": 5970 + }, + { + "epoch": 0.19933333333333333, + "grad_norm": 29.75, + "grad_norm_var": 7.208333333333333, + "learning_rate": 0.0001, + "loss": 8.7614, + "loss/crossentropy": 2.1833253771066667, + "loss/hidden": 3.871484375, + "loss/jsd": 0.0, + "loss/logits": 0.24754995480179787, + "step": 5980 + }, + { + "epoch": 0.19966666666666666, + "grad_norm": 31.0, + "grad_norm_var": 30.2509765625, + "learning_rate": 0.0001, + "loss": 8.6977, + "loss/crossentropy": 2.2232595421373844, + "loss/hidden": 3.9734375, + "loss/jsd": 0.0, + "loss/logits": 0.2633102308958769, + "step": 5990 + }, + { + "epoch": 0.2, + "grad_norm": 38.25, + "grad_norm_var": 27.7150390625, + "learning_rate": 0.0001, + "loss": 8.8047, + "loss/crossentropy": 2.0881299793720247, + "loss/hidden": 4.12109375, + "loss/jsd": 0.0, + "loss/logits": 0.2889344684779644, + "step": 6000 + }, + { + "epoch": 0.20033333333333334, + "grad_norm": 30.375, + "grad_norm_var": 8.056705729166667, + "learning_rate": 0.0001, + "loss": 8.6569, + "loss/crossentropy": 2.0547440201044083, + "loss/hidden": 3.99921875, + "loss/jsd": 0.0, + "loss/logits": 0.24832881577312946, + "step": 6010 + }, + { + "epoch": 0.20066666666666666, + "grad_norm": 38.0, + "grad_norm_var": 7.056184895833334, + "learning_rate": 0.0001, + "loss": 8.7794, + "loss/crossentropy": 2.1193090736866, + "loss/hidden": 4.0, + "loss/jsd": 0.0, + "loss/logits": 0.2597900453954935, + "step": 6020 + }, + { + "epoch": 0.201, + "grad_norm": 32.25, + "grad_norm_var": 5.3025390625, + "learning_rate": 0.0001, + "loss": 8.6794, + "loss/crossentropy": 2.2494138766080143, + "loss/hidden": 3.8703125, + "loss/jsd": 0.0, + "loss/logits": 0.2641505628824234, + "step": 6030 + }, + { + "epoch": 0.20133333333333334, + "grad_norm": 4932501504.0, + "grad_norm_var": 1.5205981723933279e+18, + "learning_rate": 0.0001, + "loss": 8.9463, + "loss/crossentropy": 2.3256581157445906, + "loss/hidden": 3.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.26595249325037, + "step": 6040 + }, + { + "epoch": 0.20166666666666666, + "grad_norm": 37.0, + "grad_norm_var": 1.52059817108827e+18, + "learning_rate": 0.0001, + "loss": 8.8264, + "loss/crossentropy": 2.053881608694792, + "loss/hidden": 3.95546875, + "loss/jsd": 0.0, + "loss/logits": 0.24859177209436895, + "step": 6050 + }, + { + "epoch": 0.202, + "grad_norm": 31.125, + "grad_norm_var": 4.933268229166667, + "learning_rate": 0.0001, + "loss": 8.7041, + "loss/crossentropy": 2.1878477543592454, + "loss/hidden": 3.91875, + "loss/jsd": 0.0, + "loss/logits": 0.2409949317574501, + "step": 6060 + }, + { + "epoch": 0.20233333333333334, + "grad_norm": 33.0, + "grad_norm_var": 351.74140625, + "learning_rate": 0.0001, + "loss": 8.7973, + "loss/crossentropy": 2.1990287870168688, + "loss/hidden": 4.031640625, + "loss/jsd": 0.0, + "loss/logits": 0.2801033824682236, + "step": 6070 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 30.375, + "grad_norm_var": 63.73125, + "learning_rate": 0.0001, + "loss": 8.7133, + "loss/crossentropy": 2.2455021925270557, + "loss/hidden": 3.989453125, + "loss/jsd": 0.0, + "loss/logits": 0.27408372741192577, + "step": 6080 + }, + { + "epoch": 0.203, + "grad_norm": 41.5, + "grad_norm_var": 15.795572916666666, + "learning_rate": 0.0001, + "loss": 8.7392, + "loss/crossentropy": 2.005758151039481, + "loss/hidden": 3.888671875, + "loss/jsd": 0.0, + "loss/logits": 0.24863583762198688, + "step": 6090 + }, + { + "epoch": 0.20333333333333334, + "grad_norm": 31.125, + "grad_norm_var": 6.892122395833334, + "learning_rate": 0.0001, + "loss": 8.6557, + "loss/crossentropy": 2.0756575793027876, + "loss/hidden": 3.941796875, + "loss/jsd": 0.0, + "loss/logits": 0.25259528737515213, + "step": 6100 + }, + { + "epoch": 0.20366666666666666, + "grad_norm": 35.75, + "grad_norm_var": 3.395768229166667, + "learning_rate": 0.0001, + "loss": 8.8129, + "loss/crossentropy": 2.1710173338651657, + "loss/hidden": 4.084375, + "loss/jsd": 0.0, + "loss/logits": 0.24820818062871694, + "step": 6110 + }, + { + "epoch": 0.204, + "grad_norm": 33.75, + "grad_norm_var": 14.166080729166667, + "learning_rate": 0.0001, + "loss": 8.6818, + "loss/crossentropy": 2.2217075169086455, + "loss/hidden": 4.032421875, + "loss/jsd": 0.0, + "loss/logits": 0.2922105029225349, + "step": 6120 + }, + { + "epoch": 0.20433333333333334, + "grad_norm": 36.75, + "grad_norm_var": 15.454166666666667, + "learning_rate": 0.0001, + "loss": 8.7605, + "loss/crossentropy": 2.103864422440529, + "loss/hidden": 3.80625, + "loss/jsd": 0.0, + "loss/logits": 0.2378301707096398, + "step": 6130 + }, + { + "epoch": 0.20466666666666666, + "grad_norm": 34.25, + "grad_norm_var": 5.6337890625, + "learning_rate": 0.0001, + "loss": 8.5937, + "loss/crossentropy": 2.066901922225952, + "loss/hidden": 3.96796875, + "loss/jsd": 0.0, + "loss/logits": 0.24710494233295321, + "step": 6140 + }, + { + "epoch": 0.205, + "grad_norm": 39.0, + "grad_norm_var": 13.94765625, + "learning_rate": 0.0001, + "loss": 8.7255, + "loss/crossentropy": 2.068124470114708, + "loss/hidden": 3.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.25081984605640173, + "step": 6150 + }, + { + "epoch": 0.20533333333333334, + "grad_norm": 30.125, + "grad_norm_var": 23.54140625, + "learning_rate": 0.0001, + "loss": 8.759, + "loss/crossentropy": 2.2413846030831337, + "loss/hidden": 4.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.30093136746436355, + "step": 6160 + }, + { + "epoch": 0.20566666666666666, + "grad_norm": 41.0, + "grad_norm_var": 8.30390625, + "learning_rate": 0.0001, + "loss": 8.6887, + "loss/crossentropy": 2.060081334412098, + "loss/hidden": 3.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.24737574942409993, + "step": 6170 + }, + { + "epoch": 0.206, + "grad_norm": 33.5, + "grad_norm_var": 9.3087890625, + "learning_rate": 0.0001, + "loss": 8.7567, + "loss/crossentropy": 2.1451657354831695, + "loss/hidden": 4.12265625, + "loss/jsd": 0.0, + "loss/logits": 0.2983713150024414, + "step": 6180 + }, + { + "epoch": 0.20633333333333334, + "grad_norm": 40.75, + "grad_norm_var": 76.71399739583333, + "learning_rate": 0.0001, + "loss": 8.715, + "loss/crossentropy": 2.2015042565762997, + "loss/hidden": 4.01796875, + "loss/jsd": 0.0, + "loss/logits": 0.27279378157109024, + "step": 6190 + }, + { + "epoch": 0.20666666666666667, + "grad_norm": 34.75, + "grad_norm_var": 80.88958333333333, + "learning_rate": 0.0001, + "loss": 8.6619, + "loss/crossentropy": 2.127535600960255, + "loss/hidden": 3.954296875, + "loss/jsd": 0.0, + "loss/logits": 0.2594041220843792, + "step": 6200 + }, + { + "epoch": 0.207, + "grad_norm": 32.0, + "grad_norm_var": 7.032747395833334, + "learning_rate": 0.0001, + "loss": 8.7577, + "loss/crossentropy": 2.1385881870985033, + "loss/hidden": 3.964453125, + "loss/jsd": 0.0, + "loss/logits": 0.2612423226237297, + "step": 6210 + }, + { + "epoch": 0.20733333333333334, + "grad_norm": 32.5, + "grad_norm_var": 11.701041666666667, + "learning_rate": 0.0001, + "loss": 8.688, + "loss/crossentropy": 2.0760431602597236, + "loss/hidden": 3.905859375, + "loss/jsd": 0.0, + "loss/logits": 0.2630361717194319, + "step": 6220 + }, + { + "epoch": 0.20766666666666667, + "grad_norm": 37.25, + "grad_norm_var": 22.809375, + "learning_rate": 0.0001, + "loss": 8.5522, + "loss/crossentropy": 2.135862450301647, + "loss/hidden": 3.986328125, + "loss/jsd": 0.0, + "loss/logits": 0.2494693139567971, + "step": 6230 + }, + { + "epoch": 0.208, + "grad_norm": 34.5, + "grad_norm_var": 16.30390625, + "learning_rate": 0.0001, + "loss": 8.6282, + "loss/crossentropy": 2.0777343571186067, + "loss/hidden": 4.07109375, + "loss/jsd": 0.0, + "loss/logits": 0.2899452358484268, + "step": 6240 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 30.5, + "grad_norm_var": 5.9728515625, + "learning_rate": 0.0001, + "loss": 8.7862, + "loss/crossentropy": 2.2019074261188507, + "loss/hidden": 3.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.25109023675322534, + "step": 6250 + }, + { + "epoch": 0.20866666666666667, + "grad_norm": 31.5, + "grad_norm_var": 5.348372395833334, + "learning_rate": 0.0001, + "loss": 8.6853, + "loss/crossentropy": 2.2050928086042405, + "loss/hidden": 3.96953125, + "loss/jsd": 0.0, + "loss/logits": 0.2619493114762008, + "step": 6260 + }, + { + "epoch": 0.209, + "grad_norm": 31.875, + "grad_norm_var": 4.7322265625, + "learning_rate": 0.0001, + "loss": 8.5978, + "loss/crossentropy": 2.322963085025549, + "loss/hidden": 3.87578125, + "loss/jsd": 0.0, + "loss/logits": 0.26414704993367194, + "step": 6270 + }, + { + "epoch": 0.20933333333333334, + "grad_norm": 6845104128.0, + "grad_norm_var": 2.9284656289268367e+18, + "learning_rate": 0.0001, + "loss": 8.7111, + "loss/crossentropy": 1.9891023762524127, + "loss/hidden": 4.237890625, + "loss/jsd": 0.0, + "loss/logits": 0.24320064708590508, + "step": 6280 + }, + { + "epoch": 0.20966666666666667, + "grad_norm": 42.25, + "grad_norm_var": 2.928465627087215e+18, + "learning_rate": 0.0001, + "loss": 8.8478, + "loss/crossentropy": 2.1030918568372727, + "loss/hidden": 3.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.2577778071165085, + "step": 6290 + }, + { + "epoch": 0.21, + "grad_norm": 33.75, + "grad_norm_var": 9.833333333333334, + "learning_rate": 0.0001, + "loss": 8.7841, + "loss/crossentropy": 2.1194095268845556, + "loss/hidden": 3.93515625, + "loss/jsd": 0.0, + "loss/logits": 0.25438457299023864, + "step": 6300 + }, + { + "epoch": 0.21033333333333334, + "grad_norm": 38.0, + "grad_norm_var": 5.684830729166666, + "learning_rate": 0.0001, + "loss": 8.6073, + "loss/crossentropy": 2.1838410973548887, + "loss/hidden": 3.905078125, + "loss/jsd": 0.0, + "loss/logits": 0.2590842802077532, + "step": 6310 + }, + { + "epoch": 0.21066666666666667, + "grad_norm": 32.25, + "grad_norm_var": 14.130989583333333, + "learning_rate": 0.0001, + "loss": 8.6961, + "loss/crossentropy": 2.1097051329910754, + "loss/hidden": 3.87265625, + "loss/jsd": 0.0, + "loss/logits": 0.2509492003358901, + "step": 6320 + }, + { + "epoch": 0.211, + "grad_norm": 32.75, + "grad_norm_var": 7.970572916666667, + "learning_rate": 0.0001, + "loss": 8.758, + "loss/crossentropy": 2.1945007756352424, + "loss/hidden": 3.985546875, + "loss/jsd": 0.0, + "loss/logits": 0.25893947295844555, + "step": 6330 + }, + { + "epoch": 0.21133333333333335, + "grad_norm": 36.0, + "grad_norm_var": 1.7789921967526118e+18, + "learning_rate": 0.0001, + "loss": 8.905, + "loss/crossentropy": 2.058439862728119, + "loss/hidden": 4.17421875, + "loss/jsd": 0.0, + "loss/logits": 0.26833444014191626, + "step": 6340 + }, + { + "epoch": 0.21166666666666667, + "grad_norm": 32.0, + "grad_norm_var": 104.33229166666666, + "learning_rate": 0.0001, + "loss": 8.6526, + "loss/crossentropy": 1.890061966329813, + "loss/hidden": 4.045703125, + "loss/jsd": 0.0, + "loss/logits": 0.2450747612863779, + "step": 6350 + }, + { + "epoch": 0.212, + "grad_norm": 29.625, + "grad_norm_var": 22.006184895833332, + "learning_rate": 0.0001, + "loss": 8.7864, + "loss/crossentropy": 2.1768174074590205, + "loss/hidden": 4.052734375, + "loss/jsd": 0.0, + "loss/logits": 0.24862836562097074, + "step": 6360 + }, + { + "epoch": 0.21233333333333335, + "grad_norm": 34.75, + "grad_norm_var": 99.80598958333333, + "learning_rate": 0.0001, + "loss": 8.6528, + "loss/crossentropy": 2.2797497868537904, + "loss/hidden": 3.957421875, + "loss/jsd": 0.0, + "loss/logits": 0.26709459256380796, + "step": 6370 + }, + { + "epoch": 0.21266666666666667, + "grad_norm": 32.5, + "grad_norm_var": 86.98274739583333, + "learning_rate": 0.0001, + "loss": 8.6966, + "loss/crossentropy": 2.178654319047928, + "loss/hidden": 4.08046875, + "loss/jsd": 0.0, + "loss/logits": 0.2537883473560214, + "step": 6380 + }, + { + "epoch": 0.213, + "grad_norm": 29.625, + "grad_norm_var": 13.905208333333333, + "learning_rate": 0.0001, + "loss": 8.705, + "loss/crossentropy": 2.0725951939821243, + "loss/hidden": 3.972265625, + "loss/jsd": 0.0, + "loss/logits": 0.25900917164981363, + "step": 6390 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 34.75, + "grad_norm_var": 6.137239583333334, + "learning_rate": 0.0001, + "loss": 8.7598, + "loss/crossentropy": 2.3110455900430678, + "loss/hidden": 4.031640625, + "loss/jsd": 0.0, + "loss/logits": 0.28035171553492544, + "step": 6400 + }, + { + "epoch": 0.21366666666666667, + "grad_norm": 35.0, + "grad_norm_var": 10.5744140625, + "learning_rate": 0.0001, + "loss": 8.7293, + "loss/crossentropy": 2.2058258563280106, + "loss/hidden": 3.8546875, + "loss/jsd": 0.0, + "loss/logits": 0.2550595965236425, + "step": 6410 + }, + { + "epoch": 0.214, + "grad_norm": 35.25, + "grad_norm_var": 8.562239583333334, + "learning_rate": 0.0001, + "loss": 8.7376, + "loss/crossentropy": 2.150018022954464, + "loss/hidden": 3.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.2502748826518655, + "step": 6420 + }, + { + "epoch": 0.21433333333333332, + "grad_norm": 34.5, + "grad_norm_var": 1.9860874111488097e+18, + "learning_rate": 0.0001, + "loss": 8.7269, + "loss/crossentropy": 2.2726529754698275, + "loss/hidden": 3.92578125, + "loss/jsd": 0.0, + "loss/logits": 0.2701293595135212, + "step": 6430 + }, + { + "epoch": 0.21466666666666667, + "grad_norm": 34.0, + "grad_norm_var": 6.182747395833333, + "learning_rate": 0.0001, + "loss": 8.5577, + "loss/crossentropy": 2.217649821192026, + "loss/hidden": 3.965234375, + "loss/jsd": 0.0, + "loss/logits": 0.2551640780642629, + "step": 6440 + }, + { + "epoch": 0.215, + "grad_norm": 39.5, + "grad_norm_var": 8.399739583333334, + "learning_rate": 0.0001, + "loss": 8.5535, + "loss/crossentropy": 2.080750811100006, + "loss/hidden": 3.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.23864807337522506, + "step": 6450 + }, + { + "epoch": 0.21533333333333332, + "grad_norm": 31.125, + "grad_norm_var": 5.915559895833334, + "learning_rate": 0.0001, + "loss": 8.5244, + "loss/crossentropy": 2.1134666696190836, + "loss/hidden": 3.774609375, + "loss/jsd": 0.0, + "loss/logits": 0.23039107713848353, + "step": 6460 + }, + { + "epoch": 0.21566666666666667, + "grad_norm": 34.5, + "grad_norm_var": 12.49765625, + "learning_rate": 0.0001, + "loss": 8.6924, + "loss/crossentropy": 2.2306397944688796, + "loss/hidden": 4.040234375, + "loss/jsd": 0.0, + "loss/logits": 0.28511182554066183, + "step": 6470 + }, + { + "epoch": 0.216, + "grad_norm": 38.75, + "grad_norm_var": 8.626497395833333, + "learning_rate": 0.0001, + "loss": 8.7111, + "loss/crossentropy": 2.084462544322014, + "loss/hidden": 4.030859375, + "loss/jsd": 0.0, + "loss/logits": 0.2595676215365529, + "step": 6480 + }, + { + "epoch": 0.21633333333333332, + "grad_norm": 36.5, + "grad_norm_var": 12.863997395833334, + "learning_rate": 0.0001, + "loss": 8.6783, + "loss/crossentropy": 2.2381670981645585, + "loss/hidden": 3.99375, + "loss/jsd": 0.0, + "loss/logits": 0.272398603707552, + "step": 6490 + }, + { + "epoch": 0.21666666666666667, + "grad_norm": 34.75, + "grad_norm_var": 11.4994140625, + "learning_rate": 0.0001, + "loss": 8.575, + "loss/crossentropy": 2.169520039856434, + "loss/hidden": 3.975390625, + "loss/jsd": 0.0, + "loss/logits": 0.26597979068756106, + "step": 6500 + }, + { + "epoch": 0.217, + "grad_norm": 31.375, + "grad_norm_var": 14.201822916666666, + "learning_rate": 0.0001, + "loss": 8.7438, + "loss/crossentropy": 2.3124695271253586, + "loss/hidden": 3.96484375, + "loss/jsd": 0.0, + "loss/logits": 0.28047500401735304, + "step": 6510 + }, + { + "epoch": 0.21733333333333332, + "grad_norm": 30.625, + "grad_norm_var": 18.00625, + "learning_rate": 0.0001, + "loss": 8.6633, + "loss/crossentropy": 2.23331324160099, + "loss/hidden": 3.9703125, + "loss/jsd": 0.0, + "loss/logits": 0.2535081097856164, + "step": 6520 + }, + { + "epoch": 0.21766666666666667, + "grad_norm": 32.5, + "grad_norm_var": 13.676041666666666, + "learning_rate": 0.0001, + "loss": 8.6314, + "loss/crossentropy": 2.073501707613468, + "loss/hidden": 4.06171875, + "loss/jsd": 0.0, + "loss/logits": 0.2697511712089181, + "step": 6530 + }, + { + "epoch": 0.218, + "grad_norm": 36.0, + "grad_norm_var": 18.245572916666667, + "learning_rate": 0.0001, + "loss": 8.5764, + "loss/crossentropy": 2.1756048664450645, + "loss/hidden": 3.879296875, + "loss/jsd": 0.0, + "loss/logits": 0.24116889759898186, + "step": 6540 + }, + { + "epoch": 0.21833333333333332, + "grad_norm": 31.625, + "grad_norm_var": 8.464322916666667, + "learning_rate": 0.0001, + "loss": 8.6286, + "loss/crossentropy": 1.9958701081573964, + "loss/hidden": 3.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.24323785230517386, + "step": 6550 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 33.5, + "grad_norm_var": 6.9931640625, + "learning_rate": 0.0001, + "loss": 8.6222, + "loss/crossentropy": 2.1122898295521737, + "loss/hidden": 3.958984375, + "loss/jsd": 0.0, + "loss/logits": 0.27048107255250214, + "step": 6560 + }, + { + "epoch": 0.219, + "grad_norm": 31.5, + "grad_norm_var": 7.69765625, + "learning_rate": 0.0001, + "loss": 8.6798, + "loss/crossentropy": 2.167936125397682, + "loss/hidden": 3.95, + "loss/jsd": 0.0, + "loss/logits": 0.2612238049507141, + "step": 6570 + }, + { + "epoch": 0.21933333333333332, + "grad_norm": 30.75, + "grad_norm_var": 12.91015625, + "learning_rate": 0.0001, + "loss": 8.6542, + "loss/crossentropy": 2.060488347709179, + "loss/hidden": 3.866015625, + "loss/jsd": 0.0, + "loss/logits": 0.23970827981829643, + "step": 6580 + }, + { + "epoch": 0.21966666666666668, + "grad_norm": 34.5, + "grad_norm_var": 12.857747395833334, + "learning_rate": 0.0001, + "loss": 8.7149, + "loss/crossentropy": 2.2109495267271995, + "loss/hidden": 3.906640625, + "loss/jsd": 0.0, + "loss/logits": 0.2613677404820919, + "step": 6590 + }, + { + "epoch": 0.22, + "grad_norm": 34.5, + "grad_norm_var": 2.7032856480426143e+18, + "learning_rate": 0.0001, + "loss": 8.6825, + "loss/crossentropy": 2.1443465147167444, + "loss/hidden": 4.11484375, + "loss/jsd": 0.0, + "loss/logits": 0.2631368327885866, + "step": 6600 + }, + { + "epoch": 0.22033333333333333, + "grad_norm": 33.25, + "grad_norm_var": 37.95774739583333, + "learning_rate": 0.0001, + "loss": 8.6734, + "loss/crossentropy": 2.1459231124259532, + "loss/hidden": 3.95625, + "loss/jsd": 0.0, + "loss/logits": 0.2577024588827044, + "step": 6610 + }, + { + "epoch": 0.22066666666666668, + "grad_norm": 30.5, + "grad_norm_var": 5.280989583333334, + "learning_rate": 0.0001, + "loss": 8.716, + "loss/crossentropy": 2.4239099472761154, + "loss/hidden": 3.99921875, + "loss/jsd": 0.0, + "loss/logits": 0.28959855400025847, + "step": 6620 + }, + { + "epoch": 0.221, + "grad_norm": 31.625, + "grad_norm_var": 6.0494140625, + "learning_rate": 0.0001, + "loss": 8.5097, + "loss/crossentropy": 1.954162660241127, + "loss/hidden": 3.962890625, + "loss/jsd": 0.0, + "loss/logits": 0.2510778192430735, + "step": 6630 + }, + { + "epoch": 0.22133333333333333, + "grad_norm": 32.5, + "grad_norm_var": 4.501041666666667, + "learning_rate": 0.0001, + "loss": 8.5961, + "loss/crossentropy": 2.131689856946468, + "loss/hidden": 3.87421875, + "loss/jsd": 0.0, + "loss/logits": 0.24224275033921003, + "step": 6640 + }, + { + "epoch": 0.22166666666666668, + "grad_norm": 34.25, + "grad_norm_var": 10.9759765625, + "learning_rate": 0.0001, + "loss": 8.5582, + "loss/crossentropy": 2.0610960900783537, + "loss/hidden": 3.917578125, + "loss/jsd": 0.0, + "loss/logits": 0.24408777449280022, + "step": 6650 + }, + { + "epoch": 0.222, + "grad_norm": 34.75, + "grad_norm_var": 7.430989583333333, + "learning_rate": 0.0001, + "loss": 8.6865, + "loss/crossentropy": 2.12496095597744, + "loss/hidden": 3.811328125, + "loss/jsd": 0.0, + "loss/logits": 0.2433070670813322, + "step": 6660 + }, + { + "epoch": 0.22233333333333333, + "grad_norm": 32.25, + "grad_norm_var": 10.940559895833333, + "learning_rate": 0.0001, + "loss": 8.57, + "loss/crossentropy": 2.1832097455859185, + "loss/hidden": 3.923828125, + "loss/jsd": 0.0, + "loss/logits": 0.2531871374696493, + "step": 6670 + }, + { + "epoch": 0.22266666666666668, + "grad_norm": 31.0, + "grad_norm_var": 11.620572916666667, + "learning_rate": 0.0001, + "loss": 8.6083, + "loss/crossentropy": 2.2909538954496385, + "loss/hidden": 3.934765625, + "loss/jsd": 0.0, + "loss/logits": 0.26815793141722677, + "step": 6680 + }, + { + "epoch": 0.223, + "grad_norm": 32.5, + "grad_norm_var": 9.41875, + "learning_rate": 0.0001, + "loss": 8.6731, + "loss/crossentropy": 2.208388736844063, + "loss/hidden": 3.87265625, + "loss/jsd": 0.0, + "loss/logits": 0.25899204462766645, + "step": 6690 + }, + { + "epoch": 0.22333333333333333, + "grad_norm": 49.25, + "grad_norm_var": 24.110872395833333, + "learning_rate": 0.0001, + "loss": 8.7066, + "loss/crossentropy": 2.1818058155477047, + "loss/hidden": 4.038671875, + "loss/jsd": 0.0, + "loss/logits": 0.2769562091678381, + "step": 6700 + }, + { + "epoch": 0.22366666666666668, + "grad_norm": 31.125, + "grad_norm_var": 38.39375, + "learning_rate": 0.0001, + "loss": 8.7785, + "loss/crossentropy": 2.1179177343845366, + "loss/hidden": 4.037109375, + "loss/jsd": 0.0, + "loss/logits": 0.2616199808195233, + "step": 6710 + }, + { + "epoch": 0.224, + "grad_norm": 31.875, + "grad_norm_var": 5.8947265625, + "learning_rate": 0.0001, + "loss": 8.6151, + "loss/crossentropy": 2.0090429857373238, + "loss/hidden": 3.823046875, + "loss/jsd": 0.0, + "loss/logits": 0.2248888023197651, + "step": 6720 + }, + { + "epoch": 0.22433333333333333, + "grad_norm": 31.0, + "grad_norm_var": 55.145572916666666, + "learning_rate": 0.0001, + "loss": 8.7802, + "loss/crossentropy": 2.1233505457639694, + "loss/hidden": 3.835546875, + "loss/jsd": 0.0, + "loss/logits": 0.2540145181119442, + "step": 6730 + }, + { + "epoch": 0.22466666666666665, + "grad_norm": 32.75, + "grad_norm_var": 2.668684895833333, + "learning_rate": 0.0001, + "loss": 8.6411, + "loss/crossentropy": 2.240196964144707, + "loss/hidden": 4.01953125, + "loss/jsd": 0.0, + "loss/logits": 0.26511474009603264, + "step": 6740 + }, + { + "epoch": 0.225, + "grad_norm": 39.25, + "grad_norm_var": 7.196875, + "learning_rate": 0.0001, + "loss": 8.4853, + "loss/crossentropy": 2.1406675301492215, + "loss/hidden": 3.93125, + "loss/jsd": 0.0, + "loss/logits": 0.24782155379652976, + "step": 6750 + }, + { + "epoch": 0.22533333333333333, + "grad_norm": 31.0, + "grad_norm_var": 7.8353515625, + "learning_rate": 0.0001, + "loss": 8.6089, + "loss/crossentropy": 2.071791734546423, + "loss/hidden": 3.878515625, + "loss/jsd": 0.0, + "loss/logits": 0.25682480856776235, + "step": 6760 + }, + { + "epoch": 0.22566666666666665, + "grad_norm": 48.0, + "grad_norm_var": 1.824028194531967e+18, + "learning_rate": 0.0001, + "loss": 8.8247, + "loss/crossentropy": 2.240962551534176, + "loss/hidden": 3.891796875, + "loss/jsd": 0.0, + "loss/logits": 0.25645633824169634, + "step": 6770 + }, + { + "epoch": 0.226, + "grad_norm": 29.5, + "grad_norm_var": 3.556337769787687e+18, + "learning_rate": 0.0001, + "loss": 8.7193, + "loss/crossentropy": 1.969706627726555, + "loss/hidden": 4.058203125, + "loss/jsd": 0.0, + "loss/logits": 0.2760220758616924, + "step": 6780 + }, + { + "epoch": 0.22633333333333333, + "grad_norm": 37.5, + "grad_norm_var": 1.986087411876941e+18, + "learning_rate": 0.0001, + "loss": 8.718, + "loss/crossentropy": 2.0835460133850576, + "loss/hidden": 4.033203125, + "loss/jsd": 0.0, + "loss/logits": 0.27087474074214696, + "step": 6790 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 37.0, + "grad_norm_var": 12.514583333333333, + "learning_rate": 0.0001, + "loss": 8.5738, + "loss/crossentropy": 2.0835996329784394, + "loss/hidden": 4.028125, + "loss/jsd": 0.0, + "loss/logits": 0.26487845852971076, + "step": 6800 + }, + { + "epoch": 0.227, + "grad_norm": 33.75, + "grad_norm_var": 13.1212890625, + "learning_rate": 0.0001, + "loss": 8.6835, + "loss/crossentropy": 2.174676289409399, + "loss/hidden": 3.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.24940601829439402, + "step": 6810 + }, + { + "epoch": 0.22733333333333333, + "grad_norm": 30.0, + "grad_norm_var": 30.77890625, + "learning_rate": 0.0001, + "loss": 8.7017, + "loss/crossentropy": 1.9795321062207223, + "loss/hidden": 3.940625, + "loss/jsd": 0.0, + "loss/logits": 0.2277604851871729, + "step": 6820 + }, + { + "epoch": 0.22766666666666666, + "grad_norm": 36.0, + "grad_norm_var": 29.243489583333332, + "learning_rate": 0.0001, + "loss": 8.6398, + "loss/crossentropy": 2.0192734390497207, + "loss/hidden": 3.87421875, + "loss/jsd": 0.0, + "loss/logits": 0.27642418537288904, + "step": 6830 + }, + { + "epoch": 0.228, + "grad_norm": 31.875, + "grad_norm_var": 8.983268229166667, + "learning_rate": 0.0001, + "loss": 8.6125, + "loss/crossentropy": 2.0098339319229126, + "loss/hidden": 4.141015625, + "loss/jsd": 0.0, + "loss/logits": 0.2576570626348257, + "step": 6840 + }, + { + "epoch": 0.22833333333333333, + "grad_norm": 31.25, + "grad_norm_var": 16.34765625, + "learning_rate": 0.0001, + "loss": 8.5918, + "loss/crossentropy": 2.0969722121953964, + "loss/hidden": 4.009765625, + "loss/jsd": 0.0, + "loss/logits": 0.27101105730980635, + "step": 6850 + }, + { + "epoch": 0.22866666666666666, + "grad_norm": 30.375, + "grad_norm_var": 13.826822916666666, + "learning_rate": 0.0001, + "loss": 8.5323, + "loss/crossentropy": 2.1093384474515915, + "loss/hidden": 3.993359375, + "loss/jsd": 0.0, + "loss/logits": 0.26533141303807495, + "step": 6860 + }, + { + "epoch": 0.229, + "grad_norm": 38.75, + "grad_norm_var": 5.7994140625, + "learning_rate": 0.0001, + "loss": 8.6041, + "loss/crossentropy": 2.2051602229475975, + "loss/hidden": 4.044921875, + "loss/jsd": 0.0, + "loss/logits": 0.27079470865428446, + "step": 6870 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 36.25, + "grad_norm_var": 6.357291666666667, + "learning_rate": 0.0001, + "loss": 8.5977, + "loss/crossentropy": 2.13309033960104, + "loss/hidden": 3.709765625, + "loss/jsd": 0.0, + "loss/logits": 0.23595739863812923, + "step": 6880 + }, + { + "epoch": 0.22966666666666666, + "grad_norm": 32.0, + "grad_norm_var": 5.145833333333333, + "learning_rate": 0.0001, + "loss": 8.485, + "loss/crossentropy": 2.205070769786835, + "loss/hidden": 3.733984375, + "loss/jsd": 0.0, + "loss/logits": 0.23339474331587554, + "step": 6890 + }, + { + "epoch": 0.23, + "grad_norm": 32.25, + "grad_norm_var": 12.338541666666666, + "learning_rate": 0.0001, + "loss": 8.5976, + "loss/crossentropy": 2.162086985260248, + "loss/hidden": 3.76953125, + "loss/jsd": 0.0, + "loss/logits": 0.23039890434592963, + "step": 6900 + }, + { + "epoch": 0.23033333333333333, + "grad_norm": 41.25, + "grad_norm_var": 15.233268229166667, + "learning_rate": 0.0001, + "loss": 8.5384, + "loss/crossentropy": 2.0678157053887842, + "loss/hidden": 3.982421875, + "loss/jsd": 0.0, + "loss/logits": 0.2481289473362267, + "step": 6910 + }, + { + "epoch": 0.23066666666666666, + "grad_norm": 44.5, + "grad_norm_var": 12.778580729166666, + "learning_rate": 0.0001, + "loss": 8.6622, + "loss/crossentropy": 2.114139196276665, + "loss/hidden": 3.91328125, + "loss/jsd": 0.0, + "loss/logits": 0.25735178850591184, + "step": 6920 + }, + { + "epoch": 0.231, + "grad_norm": 34.25, + "grad_norm_var": 31.0625, + "learning_rate": 0.0001, + "loss": 8.607, + "loss/crossentropy": 2.0753188371658324, + "loss/hidden": 4.03359375, + "loss/jsd": 0.0, + "loss/logits": 0.24515043962746857, + "step": 6930 + }, + { + "epoch": 0.23133333333333334, + "grad_norm": 36.5, + "grad_norm_var": 3.3962890625, + "learning_rate": 0.0001, + "loss": 8.4551, + "loss/crossentropy": 2.1153680123388767, + "loss/hidden": 3.856640625, + "loss/jsd": 0.0, + "loss/logits": 0.2439242374151945, + "step": 6940 + }, + { + "epoch": 0.23166666666666666, + "grad_norm": 34.25, + "grad_norm_var": 7.212955729166667, + "learning_rate": 0.0001, + "loss": 8.5571, + "loss/crossentropy": 2.2401276588439942, + "loss/hidden": 3.83671875, + "loss/jsd": 0.0, + "loss/logits": 0.2469344925135374, + "step": 6950 + }, + { + "epoch": 0.232, + "grad_norm": 33.0, + "grad_norm_var": 14.067643229166666, + "learning_rate": 0.0001, + "loss": 8.5338, + "loss/crossentropy": 2.1172975957393647, + "loss/hidden": 4.00703125, + "loss/jsd": 0.0, + "loss/logits": 0.2541731720790267, + "step": 6960 + }, + { + "epoch": 0.23233333333333334, + "grad_norm": 40.75, + "grad_norm_var": 8.700455729166666, + "learning_rate": 0.0001, + "loss": 8.5392, + "loss/crossentropy": 2.144708326458931, + "loss/hidden": 3.851171875, + "loss/jsd": 0.0, + "loss/logits": 0.2414156835526228, + "step": 6970 + }, + { + "epoch": 0.23266666666666666, + "grad_norm": 37.75, + "grad_norm_var": 7.82265625, + "learning_rate": 0.0001, + "loss": 8.4426, + "loss/crossentropy": 2.066808733344078, + "loss/hidden": 3.741015625, + "loss/jsd": 0.0, + "loss/logits": 0.23481324184685945, + "step": 6980 + }, + { + "epoch": 0.233, + "grad_norm": 36.0, + "grad_norm_var": 6.967708333333333, + "learning_rate": 0.0001, + "loss": 8.6706, + "loss/crossentropy": 2.0547366201877595, + "loss/hidden": 3.919140625, + "loss/jsd": 0.0, + "loss/logits": 0.2535815857350826, + "step": 6990 + }, + { + "epoch": 0.23333333333333334, + "grad_norm": 35.75, + "grad_norm_var": 16.675455729166668, + "learning_rate": 0.0001, + "loss": 8.5713, + "loss/crossentropy": 2.146591657400131, + "loss/hidden": 3.892578125, + "loss/jsd": 0.0, + "loss/logits": 0.26058912873268125, + "step": 7000 + }, + { + "epoch": 0.23366666666666666, + "grad_norm": 35.0, + "grad_norm_var": 14.667122395833333, + "learning_rate": 0.0001, + "loss": 8.7385, + "loss/crossentropy": 2.028310924768448, + "loss/hidden": 3.966015625, + "loss/jsd": 0.0, + "loss/logits": 0.254487244784832, + "step": 7010 + }, + { + "epoch": 0.234, + "grad_norm": 33.5, + "grad_norm_var": 5.967643229166667, + "learning_rate": 0.0001, + "loss": 8.7735, + "loss/crossentropy": 2.1410273112356664, + "loss/hidden": 3.881640625, + "loss/jsd": 0.0, + "loss/logits": 0.24528108015656472, + "step": 7020 + }, + { + "epoch": 0.23433333333333334, + "grad_norm": 33.25, + "grad_norm_var": 3.5994140625, + "learning_rate": 0.0001, + "loss": 8.5423, + "loss/crossentropy": 2.0229434952139855, + "loss/hidden": 3.830078125, + "loss/jsd": 0.0, + "loss/logits": 0.23408238925039768, + "step": 7030 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 31.75, + "grad_norm_var": 2.78125, + "learning_rate": 0.0001, + "loss": 8.6283, + "loss/crossentropy": 2.1765091590583325, + "loss/hidden": 3.98203125, + "loss/jsd": 0.0, + "loss/logits": 0.26205482967197896, + "step": 7040 + }, + { + "epoch": 0.235, + "grad_norm": 31.75, + "grad_norm_var": 5.094205729166666, + "learning_rate": 0.0001, + "loss": 8.5875, + "loss/crossentropy": 2.1615469992160796, + "loss/hidden": 3.816015625, + "loss/jsd": 0.0, + "loss/logits": 0.2432116275653243, + "step": 7050 + }, + { + "epoch": 0.23533333333333334, + "grad_norm": 33.5, + "grad_norm_var": 8.434830729166666, + "learning_rate": 0.0001, + "loss": 8.6217, + "loss/crossentropy": 2.0332022219896317, + "loss/hidden": 3.891015625, + "loss/jsd": 0.0, + "loss/logits": 0.24271235838532448, + "step": 7060 + }, + { + "epoch": 0.23566666666666666, + "grad_norm": 33.0, + "grad_norm_var": 80.83541666666666, + "learning_rate": 0.0001, + "loss": 8.6128, + "loss/crossentropy": 2.094720220565796, + "loss/hidden": 3.84296875, + "loss/jsd": 0.0, + "loss/logits": 0.22856017146259547, + "step": 7070 + }, + { + "epoch": 0.236, + "grad_norm": 31.5, + "grad_norm_var": 1.5458333333333334, + "learning_rate": 0.0001, + "loss": 8.5501, + "loss/crossentropy": 2.1444082021713258, + "loss/hidden": 3.938671875, + "loss/jsd": 0.0, + "loss/logits": 0.25706543773412704, + "step": 7080 + }, + { + "epoch": 0.23633333333333334, + "grad_norm": 32.5, + "grad_norm_var": 5.824739583333334, + "learning_rate": 0.0001, + "loss": 8.7055, + "loss/crossentropy": 2.269250899553299, + "loss/hidden": 4.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.2765824764966965, + "step": 7090 + }, + { + "epoch": 0.23666666666666666, + "grad_norm": 36.75, + "grad_norm_var": 16.210416666666667, + "learning_rate": 0.0001, + "loss": 8.6437, + "loss/crossentropy": 2.0246976539492607, + "loss/hidden": 3.973828125, + "loss/jsd": 0.0, + "loss/logits": 0.24731771647930145, + "step": 7100 + }, + { + "epoch": 0.237, + "grad_norm": 30.125, + "grad_norm_var": 11.0166015625, + "learning_rate": 0.0001, + "loss": 8.5064, + "loss/crossentropy": 2.1460629656910895, + "loss/hidden": 3.94375, + "loss/jsd": 0.0, + "loss/logits": 0.24730791207402944, + "step": 7110 + }, + { + "epoch": 0.23733333333333334, + "grad_norm": 32.75, + "grad_norm_var": 15.567122395833334, + "learning_rate": 0.0001, + "loss": 8.6119, + "loss/crossentropy": 2.0084538377821444, + "loss/hidden": 3.947265625, + "loss/jsd": 0.0, + "loss/logits": 0.2514385598711669, + "step": 7120 + }, + { + "epoch": 0.23766666666666666, + "grad_norm": 32.75, + "grad_norm_var": 8.709830729166667, + "learning_rate": 0.0001, + "loss": 8.5552, + "loss/crossentropy": 2.188428722321987, + "loss/hidden": 3.987890625, + "loss/jsd": 0.0, + "loss/logits": 0.2734973944723606, + "step": 7130 + }, + { + "epoch": 0.238, + "grad_norm": 33.5, + "grad_norm_var": 9.23125, + "learning_rate": 0.0001, + "loss": 8.6669, + "loss/crossentropy": 2.163307761400938, + "loss/hidden": 3.998046875, + "loss/jsd": 0.0, + "loss/logits": 0.26316353445872664, + "step": 7140 + }, + { + "epoch": 0.23833333333333334, + "grad_norm": 32.5, + "grad_norm_var": 7.6744140625, + "learning_rate": 0.0001, + "loss": 8.6865, + "loss/crossentropy": 2.0473650604486466, + "loss/hidden": 3.98984375, + "loss/jsd": 0.0, + "loss/logits": 0.2502355322241783, + "step": 7150 + }, + { + "epoch": 0.23866666666666667, + "grad_norm": 37.25, + "grad_norm_var": 6.209375, + "learning_rate": 0.0001, + "loss": 8.5554, + "loss/crossentropy": 2.10597411096096, + "loss/hidden": 3.9953125, + "loss/jsd": 0.0, + "loss/logits": 0.2646968217566609, + "step": 7160 + }, + { + "epoch": 0.239, + "grad_norm": 32.5, + "grad_norm_var": 7.327083333333333, + "learning_rate": 0.0001, + "loss": 8.5166, + "loss/crossentropy": 2.2311090558767317, + "loss/hidden": 3.971875, + "loss/jsd": 0.0, + "loss/logits": 0.2651707552373409, + "step": 7170 + }, + { + "epoch": 0.23933333333333334, + "grad_norm": 33.75, + "grad_norm_var": 2.708268229166667, + "learning_rate": 0.0001, + "loss": 8.5943, + "loss/crossentropy": 2.1325844526290894, + "loss/hidden": 3.856640625, + "loss/jsd": 0.0, + "loss/logits": 0.25004746317863463, + "step": 7180 + }, + { + "epoch": 0.23966666666666667, + "grad_norm": 33.0, + "grad_norm_var": 5.72265625, + "learning_rate": 0.0001, + "loss": 8.5078, + "loss/crossentropy": 2.127976506203413, + "loss/hidden": 4.0015625, + "loss/jsd": 0.0, + "loss/logits": 0.25839042402803897, + "step": 7190 + }, + { + "epoch": 0.24, + "grad_norm": 34.75, + "grad_norm_var": 4.226822916666666, + "learning_rate": 0.0001, + "loss": 8.5393, + "loss/crossentropy": 2.1524706527590753, + "loss/hidden": 3.78359375, + "loss/jsd": 0.0, + "loss/logits": 0.24184909779578448, + "step": 7200 + }, + { + "epoch": 0.24033333333333334, + "grad_norm": 38.0, + "grad_norm_var": 24.615559895833332, + "learning_rate": 0.0001, + "loss": 8.7409, + "loss/crossentropy": 2.0844357013702393, + "loss/hidden": 4.113671875, + "loss/jsd": 0.0, + "loss/logits": 0.25061873607337476, + "step": 7210 + }, + { + "epoch": 0.24066666666666667, + "grad_norm": 34.5, + "grad_norm_var": 4.8791015625, + "learning_rate": 0.0001, + "loss": 8.5332, + "loss/crossentropy": 2.2334757328033445, + "loss/hidden": 3.786328125, + "loss/jsd": 0.0, + "loss/logits": 0.24229202494025232, + "step": 7220 + }, + { + "epoch": 0.241, + "grad_norm": 39.75, + "grad_norm_var": 5.398372395833333, + "learning_rate": 0.0001, + "loss": 8.6905, + "loss/crossentropy": 2.095822374522686, + "loss/hidden": 4.03359375, + "loss/jsd": 0.0, + "loss/logits": 0.276114359125495, + "step": 7230 + }, + { + "epoch": 0.24133333333333334, + "grad_norm": 33.5, + "grad_norm_var": 6.189322916666667, + "learning_rate": 0.0001, + "loss": 8.621, + "loss/crossentropy": 2.083872254192829, + "loss/hidden": 3.843359375, + "loss/jsd": 0.0, + "loss/logits": 0.25903829988092186, + "step": 7240 + }, + { + "epoch": 0.24166666666666667, + "grad_norm": 40.0, + "grad_norm_var": 9.3556640625, + "learning_rate": 0.0001, + "loss": 8.8226, + "loss/crossentropy": 2.1576643377542495, + "loss/hidden": 3.93046875, + "loss/jsd": 0.0, + "loss/logits": 0.277049720287323, + "step": 7250 + }, + { + "epoch": 0.242, + "grad_norm": 46.0, + "grad_norm_var": 42.713541666666664, + "learning_rate": 0.0001, + "loss": 8.5405, + "loss/crossentropy": 2.1249560177326203, + "loss/hidden": 3.844921875, + "loss/jsd": 0.0, + "loss/logits": 0.24358038194477558, + "step": 7260 + }, + { + "epoch": 0.24233333333333335, + "grad_norm": 32.0, + "grad_norm_var": 43.81666666666667, + "learning_rate": 0.0001, + "loss": 8.6632, + "loss/crossentropy": 2.271902731060982, + "loss/hidden": 3.877734375, + "loss/jsd": 0.0, + "loss/logits": 0.2662301120348275, + "step": 7270 + }, + { + "epoch": 0.24266666666666667, + "grad_norm": 29.25, + "grad_norm_var": 3.8811848958333335, + "learning_rate": 0.0001, + "loss": 8.4007, + "loss/crossentropy": 2.037639981508255, + "loss/hidden": 3.926953125, + "loss/jsd": 0.0, + "loss/logits": 0.2521992586553097, + "step": 7280 + }, + { + "epoch": 0.243, + "grad_norm": 37.75, + "grad_norm_var": 11.527018229166666, + "learning_rate": 0.0001, + "loss": 8.7671, + "loss/crossentropy": 2.0688220985233783, + "loss/hidden": 4.000390625, + "loss/jsd": 0.0, + "loss/logits": 0.2653772059828043, + "step": 7290 + }, + { + "epoch": 0.24333333333333335, + "grad_norm": 34.75, + "grad_norm_var": 9.0103515625, + "learning_rate": 0.0001, + "loss": 8.6682, + "loss/crossentropy": 1.9946186635643244, + "loss/hidden": 3.946875, + "loss/jsd": 0.0, + "loss/logits": 0.2328943044412881, + "step": 7300 + }, + { + "epoch": 0.24366666666666667, + "grad_norm": 31.5, + "grad_norm_var": 7.376497395833334, + "learning_rate": 0.0001, + "loss": 8.5699, + "loss/crossentropy": 2.139767034351826, + "loss/hidden": 3.88828125, + "loss/jsd": 0.0, + "loss/logits": 0.2668099632486701, + "step": 7310 + }, + { + "epoch": 0.244, + "grad_norm": 30.875, + "grad_norm_var": 5.427018229166666, + "learning_rate": 0.0001, + "loss": 8.6389, + "loss/crossentropy": 2.0648159228265284, + "loss/hidden": 3.88359375, + "loss/jsd": 0.0, + "loss/logits": 0.2502809874713421, + "step": 7320 + }, + { + "epoch": 0.24433333333333335, + "grad_norm": 31.25, + "grad_norm_var": 7.09140625, + "learning_rate": 0.0001, + "loss": 8.6659, + "loss/crossentropy": 2.106768397986889, + "loss/hidden": 3.928125, + "loss/jsd": 0.0, + "loss/logits": 0.2518464956432581, + "step": 7330 + }, + { + "epoch": 0.24466666666666667, + "grad_norm": 31.75, + "grad_norm_var": 11.3603515625, + "learning_rate": 0.0001, + "loss": 8.5392, + "loss/crossentropy": 2.1911858722567557, + "loss/hidden": 3.996484375, + "loss/jsd": 0.0, + "loss/logits": 0.26760652028024196, + "step": 7340 + }, + { + "epoch": 0.245, + "grad_norm": 33.75, + "grad_norm_var": 19.9259765625, + "learning_rate": 0.0001, + "loss": 8.5007, + "loss/crossentropy": 2.066247297823429, + "loss/hidden": 3.937109375, + "loss/jsd": 0.0, + "loss/logits": 0.2681329587474465, + "step": 7350 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 28.75, + "grad_norm_var": 3.015230290082031e+18, + "learning_rate": 0.0001, + "loss": 8.6882, + "loss/crossentropy": 2.286055992543697, + "loss/hidden": 3.84609375, + "loss/jsd": 0.0, + "loss/logits": 0.262431076541543, + "step": 7360 + }, + { + "epoch": 0.24566666666666667, + "grad_norm": 33.5, + "grad_norm_var": 15.270247395833334, + "learning_rate": 0.0001, + "loss": 8.4911, + "loss/crossentropy": 2.203142321109772, + "loss/hidden": 4.007421875, + "loss/jsd": 0.0, + "loss/logits": 0.25816880762577055, + "step": 7370 + }, + { + "epoch": 0.246, + "grad_norm": 31.25, + "grad_norm_var": 6.605208333333334, + "learning_rate": 0.0001, + "loss": 8.5749, + "loss/crossentropy": 2.168962088227272, + "loss/hidden": 3.844140625, + "loss/jsd": 0.0, + "loss/logits": 0.24683325868099928, + "step": 7380 + }, + { + "epoch": 0.24633333333333332, + "grad_norm": 29.125, + "grad_norm_var": 8.370572916666667, + "learning_rate": 0.0001, + "loss": 8.6485, + "loss/crossentropy": 2.201642544567585, + "loss/hidden": 3.88984375, + "loss/jsd": 0.0, + "loss/logits": 0.25863207802176474, + "step": 7390 + }, + { + "epoch": 0.24666666666666667, + "grad_norm": 31.5, + "grad_norm_var": 3.2416015625, + "learning_rate": 0.0001, + "loss": 8.4782, + "loss/crossentropy": 1.960936988890171, + "loss/hidden": 3.831640625, + "loss/jsd": 0.0, + "loss/logits": 0.22650879565626383, + "step": 7400 + }, + { + "epoch": 0.247, + "grad_norm": 38.25, + "grad_norm_var": 5.086393229166666, + "learning_rate": 0.0001, + "loss": 8.6146, + "loss/crossentropy": 2.148800623416901, + "loss/hidden": 3.9265625, + "loss/jsd": 0.0, + "loss/logits": 0.27121419459581375, + "step": 7410 + }, + { + "epoch": 0.24733333333333332, + "grad_norm": 32.25, + "grad_norm_var": 4.398372395833333, + "learning_rate": 0.0001, + "loss": 8.5397, + "loss/crossentropy": 2.1694126784801484, + "loss/hidden": 3.891796875, + "loss/jsd": 0.0, + "loss/logits": 0.24342003595083953, + "step": 7420 + }, + { + "epoch": 0.24766666666666667, + "grad_norm": 32.75, + "grad_norm_var": 9.516666666666667, + "learning_rate": 0.0001, + "loss": 8.6136, + "loss/crossentropy": 2.1182317078113555, + "loss/hidden": 3.844921875, + "loss/jsd": 0.0, + "loss/logits": 0.24156781807541847, + "step": 7430 + }, + { + "epoch": 0.248, + "grad_norm": 36.5, + "grad_norm_var": 7.377018229166667, + "learning_rate": 0.0001, + "loss": 8.5314, + "loss/crossentropy": 2.1088318437337876, + "loss/hidden": 3.822265625, + "loss/jsd": 0.0, + "loss/logits": 0.250948965921998, + "step": 7440 + }, + { + "epoch": 0.24833333333333332, + "grad_norm": 34.25, + "grad_norm_var": 3248.1872395833334, + "learning_rate": 0.0001, + "loss": 8.6694, + "loss/crossentropy": 2.2178388088941574, + "loss/hidden": 3.946875, + "loss/jsd": 0.0, + "loss/logits": 0.25901649333536625, + "step": 7450 + }, + { + "epoch": 0.24866666666666667, + "grad_norm": 31.125, + "grad_norm_var": 3290.9358723958335, + "learning_rate": 0.0001, + "loss": 8.5781, + "loss/crossentropy": 2.2698897421360016, + "loss/hidden": 3.926171875, + "loss/jsd": 0.0, + "loss/logits": 0.2536152143031359, + "step": 7460 + }, + { + "epoch": 0.249, + "grad_norm": 33.0, + "grad_norm_var": 1772.1875, + "learning_rate": 0.0001, + "loss": 8.5832, + "loss/crossentropy": 2.1621989846229552, + "loss/hidden": 3.84609375, + "loss/jsd": 0.0, + "loss/logits": 0.24833030719310045, + "step": 7470 + }, + { + "epoch": 0.24933333333333332, + "grad_norm": 39.25, + "grad_norm_var": 2.814749738438492e+18, + "learning_rate": 0.0001, + "loss": 8.6171, + "loss/crossentropy": 2.14983384013176, + "loss/hidden": 4.019921875, + "loss/jsd": 0.0, + "loss/logits": 0.27834896706044676, + "step": 7480 + }, + { + "epoch": 0.24966666666666668, + "grad_norm": 31.25, + "grad_norm_var": 2.81474973868316e+18, + "learning_rate": 0.0001, + "loss": 8.6016, + "loss/crossentropy": 2.168473194539547, + "loss/hidden": 4.10546875, + "loss/jsd": 0.0, + "loss/logits": 0.29676152374595405, + "step": 7490 + }, + { + "epoch": 0.25, + "grad_norm": 33.0, + "grad_norm_var": 3567.3869140625, + "learning_rate": 0.0001, + "loss": 8.6921, + "loss/crossentropy": 2.042189783602953, + "loss/hidden": 3.898828125, + "loss/jsd": 0.0, + "loss/logits": 0.2364582633599639, + "step": 7500 + }, + { + "epoch": 0.25033333333333335, + "grad_norm": 30.25, + "grad_norm_var": 5.686458333333333, + "learning_rate": 0.0001, + "loss": 8.6958, + "loss/crossentropy": 2.062736430764198, + "loss/hidden": 3.84609375, + "loss/jsd": 0.0, + "loss/logits": 0.240386860165745, + "step": 7510 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 31.375, + "grad_norm_var": 17.955989583333334, + "learning_rate": 0.0001, + "loss": 8.515, + "loss/crossentropy": 2.0180068641901014, + "loss/hidden": 3.877734375, + "loss/jsd": 0.0, + "loss/logits": 0.23197599165141583, + "step": 7520 + }, + { + "epoch": 0.251, + "grad_norm": 33.75, + "grad_norm_var": 2.0817889033199114e+18, + "learning_rate": 0.0001, + "loss": 8.5997, + "loss/crossentropy": 2.106365057826042, + "loss/hidden": 3.99921875, + "loss/jsd": 0.0, + "loss/logits": 0.24268077537417412, + "step": 7530 + }, + { + "epoch": 0.25133333333333335, + "grad_norm": 32.5, + "grad_norm_var": 8.407291666666667, + "learning_rate": 0.0001, + "loss": 8.4314, + "loss/crossentropy": 1.9716456033289433, + "loss/hidden": 3.909375, + "loss/jsd": 0.0, + "loss/logits": 0.2563434375450015, + "step": 7540 + }, + { + "epoch": 0.25166666666666665, + "grad_norm": 30.375, + "grad_norm_var": 9.754166666666666, + "learning_rate": 0.0001, + "loss": 8.4357, + "loss/crossentropy": 2.214118207991123, + "loss/hidden": 3.816796875, + "loss/jsd": 0.0, + "loss/logits": 0.2364793760702014, + "step": 7550 + }, + { + "epoch": 0.252, + "grad_norm": 32.0, + "grad_norm_var": 11.27265625, + "learning_rate": 0.0001, + "loss": 8.5187, + "loss/crossentropy": 2.0577072143554687, + "loss/hidden": 3.916015625, + "loss/jsd": 0.0, + "loss/logits": 0.2660173770040274, + "step": 7560 + }, + { + "epoch": 0.25233333333333335, + "grad_norm": 34.25, + "grad_norm_var": 6.864518229166666, + "learning_rate": 0.0001, + "loss": 8.5382, + "loss/crossentropy": 2.28112398609519, + "loss/hidden": 3.882421875, + "loss/jsd": 0.0, + "loss/logits": 0.2548609297722578, + "step": 7570 + }, + { + "epoch": 0.25266666666666665, + "grad_norm": 31.0, + "grad_norm_var": 7.536458333333333, + "learning_rate": 0.0001, + "loss": 8.5838, + "loss/crossentropy": 2.148515190184116, + "loss/hidden": 3.8375, + "loss/jsd": 0.0, + "loss/logits": 0.24244300834834576, + "step": 7580 + }, + { + "epoch": 0.253, + "grad_norm": 32.0, + "grad_norm_var": 74.91087239583334, + "learning_rate": 0.0001, + "loss": 8.5155, + "loss/crossentropy": 2.1178071200847626, + "loss/hidden": 3.883984375, + "loss/jsd": 0.0, + "loss/logits": 0.25539283007383345, + "step": 7590 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 34.75, + "grad_norm_var": 7.797330729166666, + "learning_rate": 0.0001, + "loss": 8.5274, + "loss/crossentropy": 2.0491638466715814, + "loss/hidden": 3.93828125, + "loss/jsd": 0.0, + "loss/logits": 0.24984916690737008, + "step": 7600 + }, + { + "epoch": 0.25366666666666665, + "grad_norm": 34.0, + "grad_norm_var": 4.676497395833334, + "learning_rate": 0.0001, + "loss": 8.4896, + "loss/crossentropy": 1.989129790663719, + "loss/hidden": 3.969921875, + "loss/jsd": 0.0, + "loss/logits": 0.2540574911981821, + "step": 7610 + }, + { + "epoch": 0.254, + "grad_norm": 35.0, + "grad_norm_var": 10.124934895833333, + "learning_rate": 0.0001, + "loss": 8.5499, + "loss/crossentropy": 2.1711443960666656, + "loss/hidden": 3.880859375, + "loss/jsd": 0.0, + "loss/logits": 0.2434135077521205, + "step": 7620 + }, + { + "epoch": 0.25433333333333336, + "grad_norm": 31.75, + "grad_norm_var": 10.613541666666666, + "learning_rate": 0.0001, + "loss": 8.4787, + "loss/crossentropy": 2.111149328947067, + "loss/hidden": 4.075, + "loss/jsd": 0.0, + "loss/logits": 0.28135603088885547, + "step": 7630 + }, + { + "epoch": 0.25466666666666665, + "grad_norm": 32.25, + "grad_norm_var": 4.476041666666666, + "learning_rate": 0.0001, + "loss": 8.5342, + "loss/crossentropy": 2.0875839471817015, + "loss/hidden": 3.80234375, + "loss/jsd": 0.0, + "loss/logits": 0.2278506338596344, + "step": 7640 + }, + { + "epoch": 0.255, + "grad_norm": 31.75, + "grad_norm_var": 2.718489583333333, + "learning_rate": 0.0001, + "loss": 8.5779, + "loss/crossentropy": 2.2961817413568495, + "loss/hidden": 3.91171875, + "loss/jsd": 0.0, + "loss/logits": 0.2761024903506041, + "step": 7650 + }, + { + "epoch": 0.25533333333333336, + "grad_norm": 30.875, + "grad_norm_var": 4.695572916666666, + "learning_rate": 0.0001, + "loss": 8.4936, + "loss/crossentropy": 2.064536126330495, + "loss/hidden": 4.03125, + "loss/jsd": 0.0, + "loss/logits": 0.2472051708959043, + "step": 7660 + }, + { + "epoch": 0.25566666666666665, + "grad_norm": 31.5, + "grad_norm_var": 31.70390625, + "learning_rate": 0.0001, + "loss": 8.5492, + "loss/crossentropy": 2.0761756777763365, + "loss/hidden": 3.91953125, + "loss/jsd": 0.0, + "loss/logits": 0.2548640869557858, + "step": 7670 + }, + { + "epoch": 0.256, + "grad_norm": 31.75, + "grad_norm_var": 11.212239583333334, + "learning_rate": 0.0001, + "loss": 8.5113, + "loss/crossentropy": 2.1705673079937697, + "loss/hidden": 3.92109375, + "loss/jsd": 0.0, + "loss/logits": 0.24495558133348821, + "step": 7680 + }, + { + "epoch": 0.25633333333333336, + "grad_norm": 35.0, + "grad_norm_var": 5.424739583333333, + "learning_rate": 0.0001, + "loss": 8.4703, + "loss/crossentropy": 2.2372745871543884, + "loss/hidden": 3.875390625, + "loss/jsd": 0.0, + "loss/logits": 0.24880910199135542, + "step": 7690 + }, + { + "epoch": 0.25666666666666665, + "grad_norm": 31.375, + "grad_norm_var": 4.406705729166666, + "learning_rate": 0.0001, + "loss": 8.4214, + "loss/crossentropy": 2.114253217726946, + "loss/hidden": 3.767578125, + "loss/jsd": 0.0, + "loss/logits": 0.23989613354206085, + "step": 7700 + }, + { + "epoch": 0.257, + "grad_norm": 32.5, + "grad_norm_var": 5.330989583333333, + "learning_rate": 0.0001, + "loss": 8.5299, + "loss/crossentropy": 2.1180673211812975, + "loss/hidden": 3.901171875, + "loss/jsd": 0.0, + "loss/logits": 0.26575036309659483, + "step": 7710 + }, + { + "epoch": 0.25733333333333336, + "grad_norm": 29.75, + "grad_norm_var": 7.863541666666666, + "learning_rate": 0.0001, + "loss": 8.5739, + "loss/crossentropy": 2.1078746899962426, + "loss/hidden": 3.819140625, + "loss/jsd": 0.0, + "loss/logits": 0.235656151548028, + "step": 7720 + }, + { + "epoch": 0.25766666666666665, + "grad_norm": 32.5, + "grad_norm_var": 8.870572916666667, + "learning_rate": 0.0001, + "loss": 8.5049, + "loss/crossentropy": 2.2084743842482566, + "loss/hidden": 3.92265625, + "loss/jsd": 0.0, + "loss/logits": 0.24384659044444562, + "step": 7730 + }, + { + "epoch": 0.258, + "grad_norm": 34.25, + "grad_norm_var": 2.2997395833333334, + "learning_rate": 0.0001, + "loss": 8.3794, + "loss/crossentropy": 2.1690520867705345, + "loss/hidden": 3.808203125, + "loss/jsd": 0.0, + "loss/logits": 0.2447241246700287, + "step": 7740 + }, + { + "epoch": 0.25833333333333336, + "grad_norm": 34.5, + "grad_norm_var": 7.211458333333334, + "learning_rate": 0.0001, + "loss": 8.566, + "loss/crossentropy": 2.237781625241041, + "loss/hidden": 3.851171875, + "loss/jsd": 0.0, + "loss/logits": 0.23708969578146935, + "step": 7750 + }, + { + "epoch": 0.25866666666666666, + "grad_norm": 31.875, + "grad_norm_var": 51.203125, + "learning_rate": 0.0001, + "loss": 8.4896, + "loss/crossentropy": 2.0946034505963325, + "loss/hidden": 3.991796875, + "loss/jsd": 0.0, + "loss/logits": 0.25754191167652607, + "step": 7760 + }, + { + "epoch": 0.259, + "grad_norm": 30.375, + "grad_norm_var": 5.221875, + "learning_rate": 0.0001, + "loss": 8.5675, + "loss/crossentropy": 2.2402331814169885, + "loss/hidden": 3.866015625, + "loss/jsd": 0.0, + "loss/logits": 0.2628116441890597, + "step": 7770 + }, + { + "epoch": 0.25933333333333336, + "grad_norm": 40.25, + "grad_norm_var": 17.0900390625, + "learning_rate": 0.0001, + "loss": 8.5823, + "loss/crossentropy": 2.1165684029459952, + "loss/hidden": 3.88125, + "loss/jsd": 0.0, + "loss/logits": 0.2519789934158325, + "step": 7780 + }, + { + "epoch": 0.25966666666666666, + "grad_norm": 33.75, + "grad_norm_var": 8.1728515625, + "learning_rate": 0.0001, + "loss": 8.5378, + "loss/crossentropy": 2.0959367021918296, + "loss/hidden": 3.931640625, + "loss/jsd": 0.0, + "loss/logits": 0.23852321952581407, + "step": 7790 + }, + { + "epoch": 0.26, + "grad_norm": 36.5, + "grad_norm_var": 4.790625, + "learning_rate": 0.0001, + "loss": 8.3399, + "loss/crossentropy": 1.9469283685088157, + "loss/hidden": 3.78046875, + "loss/jsd": 0.0, + "loss/logits": 0.22438753712922335, + "step": 7800 + }, + { + "epoch": 0.26033333333333336, + "grad_norm": 32.25, + "grad_norm_var": 4.2625, + "learning_rate": 0.0001, + "loss": 8.5059, + "loss/crossentropy": 2.1072940029203893, + "loss/hidden": 3.945703125, + "loss/jsd": 0.0, + "loss/logits": 0.2441211288794875, + "step": 7810 + }, + { + "epoch": 0.26066666666666666, + "grad_norm": 29.5, + "grad_norm_var": 17.897330729166665, + "learning_rate": 0.0001, + "loss": 8.4184, + "loss/crossentropy": 1.8887931071221828, + "loss/hidden": 3.984765625, + "loss/jsd": 0.0, + "loss/logits": 0.24000756442546844, + "step": 7820 + }, + { + "epoch": 0.261, + "grad_norm": 33.25, + "grad_norm_var": 21.8119140625, + "learning_rate": 0.0001, + "loss": 8.4814, + "loss/crossentropy": 2.1927064463496206, + "loss/hidden": 3.80546875, + "loss/jsd": 0.0, + "loss/logits": 0.23618043400347233, + "step": 7830 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 37.75, + "grad_norm_var": 12.579622395833333, + "learning_rate": 0.0001, + "loss": 8.5882, + "loss/crossentropy": 2.0585607342422008, + "loss/hidden": 3.861328125, + "loss/jsd": 0.0, + "loss/logits": 0.24375837668776512, + "step": 7840 + }, + { + "epoch": 0.26166666666666666, + "grad_norm": 33.0, + "grad_norm_var": 6.588997395833333, + "learning_rate": 0.0001, + "loss": 8.5555, + "loss/crossentropy": 2.0930290199816226, + "loss/hidden": 3.85234375, + "loss/jsd": 0.0, + "loss/logits": 0.2372116858139634, + "step": 7850 + }, + { + "epoch": 0.262, + "grad_norm": 32.75, + "grad_norm_var": 4.052018229166666, + "learning_rate": 0.0001, + "loss": 8.5284, + "loss/crossentropy": 2.2643882423639297, + "loss/hidden": 3.848046875, + "loss/jsd": 0.0, + "loss/logits": 0.2545361390337348, + "step": 7860 + }, + { + "epoch": 0.2623333333333333, + "grad_norm": 37.25, + "grad_norm_var": 32.57265625, + "learning_rate": 0.0001, + "loss": 8.4368, + "loss/crossentropy": 2.151304465532303, + "loss/hidden": 3.903515625, + "loss/jsd": 0.0, + "loss/logits": 0.2559200949966908, + "step": 7870 + }, + { + "epoch": 0.26266666666666666, + "grad_norm": 32.75, + "grad_norm_var": 9.16640625, + "learning_rate": 0.0001, + "loss": 8.6239, + "loss/crossentropy": 2.215903551876545, + "loss/hidden": 3.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2483787966892123, + "step": 7880 + }, + { + "epoch": 0.263, + "grad_norm": 31.375, + "grad_norm_var": 5.636393229166667, + "learning_rate": 0.0001, + "loss": 8.522, + "loss/crossentropy": 2.2514882028102874, + "loss/hidden": 3.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2960167687386274, + "step": 7890 + }, + { + "epoch": 0.2633333333333333, + "grad_norm": 30.75, + "grad_norm_var": 4.056184895833334, + "learning_rate": 0.0001, + "loss": 8.4697, + "loss/crossentropy": 2.1326026201248167, + "loss/hidden": 4.03125, + "loss/jsd": 0.0, + "loss/logits": 0.2860325377434492, + "step": 7900 + }, + { + "epoch": 0.26366666666666666, + "grad_norm": 31.75, + "grad_norm_var": 3.7244140625, + "learning_rate": 0.0001, + "loss": 8.6586, + "loss/crossentropy": 2.2379645466804505, + "loss/hidden": 3.812109375, + "loss/jsd": 0.0, + "loss/logits": 0.24608665630221366, + "step": 7910 + }, + { + "epoch": 0.264, + "grad_norm": 32.5, + "grad_norm_var": 6.681184895833334, + "learning_rate": 0.0001, + "loss": 8.5366, + "loss/crossentropy": 2.066775370389223, + "loss/hidden": 4.01796875, + "loss/jsd": 0.0, + "loss/logits": 0.2635978292673826, + "step": 7920 + }, + { + "epoch": 0.2643333333333333, + "grad_norm": 32.25, + "grad_norm_var": 6.824934895833334, + "learning_rate": 0.0001, + "loss": 8.5181, + "loss/crossentropy": 2.0588886007666587, + "loss/hidden": 3.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2531863532960415, + "step": 7930 + }, + { + "epoch": 0.26466666666666666, + "grad_norm": 29.25, + "grad_norm_var": 2.8147497390536566e+18, + "learning_rate": 0.0001, + "loss": 8.5683, + "loss/crossentropy": 2.1350580543279647, + "loss/hidden": 3.85703125, + "loss/jsd": 0.0, + "loss/logits": 0.24998535066843033, + "step": 7940 + }, + { + "epoch": 0.265, + "grad_norm": 34.25, + "grad_norm_var": 2.814749738836951e+18, + "learning_rate": 0.0001, + "loss": 8.5132, + "loss/crossentropy": 2.0762300439178945, + "loss/hidden": 3.84609375, + "loss/jsd": 0.0, + "loss/logits": 0.2441089889034629, + "step": 7950 + }, + { + "epoch": 0.2653333333333333, + "grad_norm": 31.875, + "grad_norm_var": 8.36640625, + "learning_rate": 0.0001, + "loss": 8.4256, + "loss/crossentropy": 2.1040103793144227, + "loss/hidden": 3.84296875, + "loss/jsd": 0.0, + "loss/logits": 0.2428693912923336, + "step": 7960 + }, + { + "epoch": 0.26566666666666666, + "grad_norm": 29.375, + "grad_norm_var": 8.192122395833334, + "learning_rate": 0.0001, + "loss": 8.6841, + "loss/crossentropy": 2.0118587724864483, + "loss/hidden": 3.854296875, + "loss/jsd": 0.0, + "loss/logits": 0.21796635556966065, + "step": 7970 + }, + { + "epoch": 0.266, + "grad_norm": 31.625, + "grad_norm_var": 7.3525390625, + "learning_rate": 0.0001, + "loss": 8.4692, + "loss/crossentropy": 2.134692121297121, + "loss/hidden": 3.940625, + "loss/jsd": 0.0, + "loss/logits": 0.24534992277622222, + "step": 7980 + }, + { + "epoch": 0.2663333333333333, + "grad_norm": 35.0, + "grad_norm_var": 4.893684895833333, + "learning_rate": 0.0001, + "loss": 8.5476, + "loss/crossentropy": 2.161470866203308, + "loss/hidden": 4.1234375, + "loss/jsd": 0.0, + "loss/logits": 0.2858205262571573, + "step": 7990 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 44.25, + "grad_norm_var": 24.559830729166666, + "learning_rate": 0.0001, + "loss": 8.5115, + "loss/crossentropy": 1.9794396072626115, + "loss/hidden": 3.883984375, + "loss/jsd": 0.0, + "loss/logits": 0.253316623903811, + "step": 8000 + }, + { + "epoch": 0.267, + "grad_norm": 34.0, + "grad_norm_var": 22.160416666666666, + "learning_rate": 0.0001, + "loss": 8.5737, + "loss/crossentropy": 2.203532671928406, + "loss/hidden": 3.905859375, + "loss/jsd": 0.0, + "loss/logits": 0.26553509533405306, + "step": 8010 + }, + { + "epoch": 0.2673333333333333, + "grad_norm": 33.0, + "grad_norm_var": 9.372916666666667, + "learning_rate": 0.0001, + "loss": 8.5596, + "loss/crossentropy": 2.116582728922367, + "loss/hidden": 3.955078125, + "loss/jsd": 0.0, + "loss/logits": 0.28444691337645056, + "step": 8020 + }, + { + "epoch": 0.26766666666666666, + "grad_norm": 33.0, + "grad_norm_var": 4.118489583333333, + "learning_rate": 0.0001, + "loss": 8.7085, + "loss/crossentropy": 2.136409956216812, + "loss/hidden": 3.94609375, + "loss/jsd": 0.0, + "loss/logits": 0.26452013887465, + "step": 8030 + }, + { + "epoch": 0.268, + "grad_norm": 34.25, + "grad_norm_var": 5.120572916666666, + "learning_rate": 0.0001, + "loss": 8.7193, + "loss/crossentropy": 2.17041220664978, + "loss/hidden": 3.9, + "loss/jsd": 0.0, + "loss/logits": 0.25961695313453675, + "step": 8040 + }, + { + "epoch": 0.2683333333333333, + "grad_norm": 38.75, + "grad_norm_var": 11.583072916666667, + "learning_rate": 0.0001, + "loss": 8.55, + "loss/crossentropy": 2.1484374403953552, + "loss/hidden": 3.784375, + "loss/jsd": 0.0, + "loss/logits": 0.2425002339296043, + "step": 8050 + }, + { + "epoch": 0.26866666666666666, + "grad_norm": 32.0, + "grad_norm_var": 10.539583333333333, + "learning_rate": 0.0001, + "loss": 8.4497, + "loss/crossentropy": 2.201520799845457, + "loss/hidden": 3.866796875, + "loss/jsd": 0.0, + "loss/logits": 0.23861566837877035, + "step": 8060 + }, + { + "epoch": 0.269, + "grad_norm": 33.75, + "grad_norm_var": 7.472916666666666, + "learning_rate": 0.0001, + "loss": 8.6457, + "loss/crossentropy": 2.260037848353386, + "loss/hidden": 3.873828125, + "loss/jsd": 0.0, + "loss/logits": 0.2488136703148484, + "step": 8070 + }, + { + "epoch": 0.2693333333333333, + "grad_norm": 29.25, + "grad_norm_var": 4.987239583333333, + "learning_rate": 0.0001, + "loss": 8.5122, + "loss/crossentropy": 2.239154724776745, + "loss/hidden": 3.805078125, + "loss/jsd": 0.0, + "loss/logits": 0.24010765701532363, + "step": 8080 + }, + { + "epoch": 0.26966666666666667, + "grad_norm": 38.0, + "grad_norm_var": 4.78515625, + "learning_rate": 0.0001, + "loss": 8.5795, + "loss/crossentropy": 2.123927664756775, + "loss/hidden": 4.057421875, + "loss/jsd": 0.0, + "loss/logits": 0.27356666754931214, + "step": 8090 + }, + { + "epoch": 0.27, + "grad_norm": 29.75, + "grad_norm_var": 2.6757714700654346e+18, + "learning_rate": 0.0001, + "loss": 8.5442, + "loss/crossentropy": 2.1999975204467774, + "loss/hidden": 3.831640625, + "loss/jsd": 0.0, + "loss/logits": 0.2607791792601347, + "step": 8100 + }, + { + "epoch": 0.2703333333333333, + "grad_norm": 52.25, + "grad_norm_var": 2.675771468504629e+18, + "learning_rate": 0.0001, + "loss": 8.5682, + "loss/crossentropy": 2.1693030931055546, + "loss/hidden": 3.843359375, + "loss/jsd": 0.0, + "loss/logits": 0.251168143004179, + "step": 8110 + }, + { + "epoch": 0.27066666666666667, + "grad_norm": 32.25, + "grad_norm_var": 31.667708333333334, + "learning_rate": 0.0001, + "loss": 8.3905, + "loss/crossentropy": 2.1287291169166567, + "loss/hidden": 3.902734375, + "loss/jsd": 0.0, + "loss/logits": 0.24765251912176608, + "step": 8120 + }, + { + "epoch": 0.271, + "grad_norm": 30.0, + "grad_norm_var": 8.573958333333334, + "learning_rate": 0.0001, + "loss": 8.4201, + "loss/crossentropy": 2.0576461493968963, + "loss/hidden": 3.87734375, + "loss/jsd": 0.0, + "loss/logits": 0.241207036934793, + "step": 8130 + }, + { + "epoch": 0.2713333333333333, + "grad_norm": 43.75, + "grad_norm_var": 2.7309405654549356e+18, + "learning_rate": 0.0001, + "loss": 8.4172, + "loss/crossentropy": 2.0151774257421495, + "loss/hidden": 3.844921875, + "loss/jsd": 0.0, + "loss/logits": 0.2428217800334096, + "step": 8140 + }, + { + "epoch": 0.27166666666666667, + "grad_norm": 32.75, + "grad_norm_var": 2.7309405653723075e+18, + "learning_rate": 0.0001, + "loss": 8.5564, + "loss/crossentropy": 2.226572999358177, + "loss/hidden": 3.91328125, + "loss/jsd": 0.0, + "loss/logits": 0.26639420036226513, + "step": 8150 + }, + { + "epoch": 0.272, + "grad_norm": 31.5, + "grad_norm_var": 1.8499348958333333, + "learning_rate": 0.0001, + "loss": 8.5586, + "loss/crossentropy": 2.336693507432938, + "loss/hidden": 4.00859375, + "loss/jsd": 0.0, + "loss/logits": 0.28926637172698977, + "step": 8160 + }, + { + "epoch": 0.2723333333333333, + "grad_norm": 31.75, + "grad_norm_var": 2.5447265625, + "learning_rate": 0.0001, + "loss": 8.4725, + "loss/crossentropy": 2.2132105618715285, + "loss/hidden": 3.820703125, + "loss/jsd": 0.0, + "loss/logits": 0.24436241313815116, + "step": 8170 + }, + { + "epoch": 0.27266666666666667, + "grad_norm": 30.375, + "grad_norm_var": 2.330894253998281e+18, + "learning_rate": 0.0001, + "loss": 8.5414, + "loss/crossentropy": 1.975562959909439, + "loss/hidden": 3.817578125, + "loss/jsd": 0.0, + "loss/logits": 0.22787413820624353, + "step": 8180 + }, + { + "epoch": 0.273, + "grad_norm": 31.875, + "grad_norm_var": 40.270833333333336, + "learning_rate": 0.0001, + "loss": 8.4846, + "loss/crossentropy": 2.0327411964535713, + "loss/hidden": 3.84453125, + "loss/jsd": 0.0, + "loss/logits": 0.242176865786314, + "step": 8190 + }, + { + "epoch": 0.2733333333333333, + "grad_norm": 29.0, + "grad_norm_var": 50.523372395833334, + "learning_rate": 0.0001, + "loss": 8.4559, + "loss/crossentropy": 2.1319633327424525, + "loss/hidden": 3.97421875, + "loss/jsd": 0.0, + "loss/logits": 0.2501418372616172, + "step": 8200 + }, + { + "epoch": 0.27366666666666667, + "grad_norm": 34.0, + "grad_norm_var": 18.219205729166667, + "learning_rate": 0.0001, + "loss": 8.5216, + "loss/crossentropy": 2.2018288552761076, + "loss/hidden": 3.9546875, + "loss/jsd": 0.0, + "loss/logits": 0.2552491381764412, + "step": 8210 + }, + { + "epoch": 0.274, + "grad_norm": 33.5, + "grad_norm_var": 4.381705729166667, + "learning_rate": 0.0001, + "loss": 8.4802, + "loss/crossentropy": 2.0432650595903397, + "loss/hidden": 3.86640625, + "loss/jsd": 0.0, + "loss/logits": 0.2392191395163536, + "step": 8220 + }, + { + "epoch": 0.2743333333333333, + "grad_norm": 30.25, + "grad_norm_var": 2.1, + "learning_rate": 0.0001, + "loss": 8.4893, + "loss/crossentropy": 2.1175171703100206, + "loss/hidden": 3.949609375, + "loss/jsd": 0.0, + "loss/logits": 0.26016080360859634, + "step": 8230 + }, + { + "epoch": 0.27466666666666667, + "grad_norm": 32.0, + "grad_norm_var": 4.23125, + "learning_rate": 0.0001, + "loss": 8.4556, + "loss/crossentropy": 2.103234487399459, + "loss/hidden": 3.947265625, + "loss/jsd": 0.0, + "loss/logits": 0.24362556543201208, + "step": 8240 + }, + { + "epoch": 0.275, + "grad_norm": 29.5, + "grad_norm_var": 1.3207509407140326e+18, + "learning_rate": 0.0001, + "loss": 8.3986, + "loss/crossentropy": 2.1267191670835017, + "loss/hidden": 3.93359375, + "loss/jsd": 0.0, + "loss/logits": 0.24572798358276488, + "step": 8250 + }, + { + "epoch": 0.2753333333333333, + "grad_norm": 29.0, + "grad_norm_var": 6.517643229166667, + "learning_rate": 0.0001, + "loss": 8.4968, + "loss/crossentropy": 2.0798249572515486, + "loss/hidden": 3.844140625, + "loss/jsd": 0.0, + "loss/logits": 0.23399676829576493, + "step": 8260 + }, + { + "epoch": 0.27566666666666667, + "grad_norm": 33.75, + "grad_norm_var": 5.4759765625, + "learning_rate": 0.0001, + "loss": 8.4268, + "loss/crossentropy": 2.0175932213664054, + "loss/hidden": 3.88203125, + "loss/jsd": 0.0, + "loss/logits": 0.2511680208146572, + "step": 8270 + }, + { + "epoch": 0.276, + "grad_norm": 33.25, + "grad_norm_var": 8.170247395833334, + "learning_rate": 0.0001, + "loss": 8.4686, + "loss/crossentropy": 2.050298312306404, + "loss/hidden": 3.818359375, + "loss/jsd": 0.0, + "loss/logits": 0.24749082382768392, + "step": 8280 + }, + { + "epoch": 0.2763333333333333, + "grad_norm": 46.5, + "grad_norm_var": 2.2546849066262026e+18, + "learning_rate": 0.0001, + "loss": 8.5837, + "loss/crossentropy": 2.107571153342724, + "loss/hidden": 3.908984375, + "loss/jsd": 0.0, + "loss/logits": 0.24962719343602657, + "step": 8290 + }, + { + "epoch": 0.27666666666666667, + "grad_norm": 31.625, + "grad_norm_var": 2.2546849069640538e+18, + "learning_rate": 0.0001, + "loss": 8.4953, + "loss/crossentropy": 2.1752505511045457, + "loss/hidden": 3.854296875, + "loss/jsd": 0.0, + "loss/logits": 0.24640031829476355, + "step": 8300 + }, + { + "epoch": 0.277, + "grad_norm": 31.25, + "grad_norm_var": 7.98515625, + "learning_rate": 0.0001, + "loss": 8.5105, + "loss/crossentropy": 2.1649673506617546, + "loss/hidden": 3.9171875, + "loss/jsd": 0.0, + "loss/logits": 0.24632014315575362, + "step": 8310 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 36.25, + "grad_norm_var": 5.1103515625, + "learning_rate": 0.0001, + "loss": 8.6269, + "loss/crossentropy": 2.1055419132113458, + "loss/hidden": 3.809375, + "loss/jsd": 0.0, + "loss/logits": 0.24666682127863168, + "step": 8320 + }, + { + "epoch": 0.2776666666666667, + "grad_norm": 35.25, + "grad_norm_var": 4.3712890625, + "learning_rate": 0.0001, + "loss": 8.5265, + "loss/crossentropy": 2.051154574751854, + "loss/hidden": 4.063671875, + "loss/jsd": 0.0, + "loss/logits": 0.2739197164773941, + "step": 8330 + }, + { + "epoch": 0.278, + "grad_norm": 27.375, + "grad_norm_var": 4.311458333333333, + "learning_rate": 0.0001, + "loss": 8.467, + "loss/crossentropy": 1.931150709837675, + "loss/hidden": 3.9625, + "loss/jsd": 0.0, + "loss/logits": 0.26600636430084706, + "step": 8340 + }, + { + "epoch": 0.2783333333333333, + "grad_norm": 41.25, + "grad_norm_var": 31.12265625, + "learning_rate": 0.0001, + "loss": 8.4761, + "loss/crossentropy": 2.0970492526888846, + "loss/hidden": 3.75, + "loss/jsd": 0.0, + "loss/logits": 0.2257708402350545, + "step": 8350 + }, + { + "epoch": 0.2786666666666667, + "grad_norm": 32.75, + "grad_norm_var": 3.313593760837691e+18, + "learning_rate": 0.0001, + "loss": 8.5959, + "loss/crossentropy": 2.107315970212221, + "loss/hidden": 3.95546875, + "loss/jsd": 0.0, + "loss/logits": 0.2386183949187398, + "step": 8360 + }, + { + "epoch": 0.279, + "grad_norm": 31.25, + "grad_norm_var": 3.313593762134675e+18, + "learning_rate": 0.0001, + "loss": 8.5496, + "loss/crossentropy": 2.1075058460235594, + "loss/hidden": 3.875390625, + "loss/jsd": 0.0, + "loss/logits": 0.25149064473807814, + "step": 8370 + }, + { + "epoch": 0.2793333333333333, + "grad_norm": 36.25, + "grad_norm_var": 20.053059895833332, + "learning_rate": 0.0001, + "loss": 8.5338, + "loss/crossentropy": 2.059878170490265, + "loss/hidden": 3.878515625, + "loss/jsd": 0.0, + "loss/logits": 0.23977196607738732, + "step": 8380 + }, + { + "epoch": 0.2796666666666667, + "grad_norm": 30.875, + "grad_norm_var": 19.945768229166667, + "learning_rate": 0.0001, + "loss": 8.5962, + "loss/crossentropy": 2.052942344546318, + "loss/hidden": 3.8734375, + "loss/jsd": 0.0, + "loss/logits": 0.24317781031131744, + "step": 8390 + }, + { + "epoch": 0.28, + "grad_norm": 30.25, + "grad_norm_var": 17.1150390625, + "learning_rate": 0.0001, + "loss": 8.4453, + "loss/crossentropy": 2.103861276805401, + "loss/hidden": 3.9421875, + "loss/jsd": 0.0, + "loss/logits": 0.2519053351134062, + "step": 8400 + }, + { + "epoch": 0.2803333333333333, + "grad_norm": 31.75, + "grad_norm_var": 16.060872395833332, + "learning_rate": 0.0001, + "loss": 8.4694, + "loss/crossentropy": 2.0992368295788766, + "loss/hidden": 3.91484375, + "loss/jsd": 0.0, + "loss/logits": 0.2527425540611148, + "step": 8410 + }, + { + "epoch": 0.2806666666666667, + "grad_norm": 30.625, + "grad_norm_var": 1.9884765625, + "learning_rate": 0.0001, + "loss": 8.4347, + "loss/crossentropy": 2.264920949935913, + "loss/hidden": 4.038671875, + "loss/jsd": 0.0, + "loss/logits": 0.27555460929870607, + "step": 8420 + }, + { + "epoch": 0.281, + "grad_norm": 32.25, + "grad_norm_var": 1.3729166666666666, + "learning_rate": 0.0001, + "loss": 8.4686, + "loss/crossentropy": 2.0547826454043387, + "loss/hidden": 3.818359375, + "loss/jsd": 0.0, + "loss/logits": 0.2389751397073269, + "step": 8430 + }, + { + "epoch": 0.2813333333333333, + "grad_norm": 33.25, + "grad_norm_var": 3.923958333333333, + "learning_rate": 0.0001, + "loss": 8.4972, + "loss/crossentropy": 2.141887503862381, + "loss/hidden": 3.865234375, + "loss/jsd": 0.0, + "loss/logits": 0.23841603249311447, + "step": 8440 + }, + { + "epoch": 0.2816666666666667, + "grad_norm": 29.375, + "grad_norm_var": 5.348958333333333, + "learning_rate": 0.0001, + "loss": 8.5568, + "loss/crossentropy": 2.070135848224163, + "loss/hidden": 3.938671875, + "loss/jsd": 0.0, + "loss/logits": 0.2530257642269135, + "step": 8450 + }, + { + "epoch": 0.282, + "grad_norm": 28.75, + "grad_norm_var": 7.098372395833334, + "learning_rate": 0.0001, + "loss": 8.4062, + "loss/crossentropy": 2.1277823865413668, + "loss/hidden": 3.954296875, + "loss/jsd": 0.0, + "loss/logits": 0.25204644426703454, + "step": 8460 + }, + { + "epoch": 0.2823333333333333, + "grad_norm": 33.25, + "grad_norm_var": 4.593489583333334, + "learning_rate": 0.0001, + "loss": 8.4317, + "loss/crossentropy": 2.169590988755226, + "loss/hidden": 3.957421875, + "loss/jsd": 0.0, + "loss/logits": 0.26035118848085403, + "step": 8470 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 30.0, + "grad_norm_var": 3.678580729166667, + "learning_rate": 0.0001, + "loss": 8.4946, + "loss/crossentropy": 2.287361499667168, + "loss/hidden": 3.77109375, + "loss/jsd": 0.0, + "loss/logits": 0.23725899122655392, + "step": 8480 + }, + { + "epoch": 0.283, + "grad_norm": 54.25, + "grad_norm_var": 35.12265625, + "learning_rate": 0.0001, + "loss": 8.5028, + "loss/crossentropy": 2.078622847050428, + "loss/hidden": 3.809375, + "loss/jsd": 0.0, + "loss/logits": 0.2394928558729589, + "step": 8490 + }, + { + "epoch": 0.2833333333333333, + "grad_norm": 30.375, + "grad_norm_var": 35.994791666666664, + "learning_rate": 0.0001, + "loss": 8.3967, + "loss/crossentropy": 1.993205615878105, + "loss/hidden": 3.875, + "loss/jsd": 0.0, + "loss/logits": 0.25120790507644414, + "step": 8500 + }, + { + "epoch": 0.2836666666666667, + "grad_norm": 30.375, + "grad_norm_var": 54.930989583333336, + "learning_rate": 0.0001, + "loss": 8.4862, + "loss/crossentropy": 2.1012352854013443, + "loss/hidden": 4.03984375, + "loss/jsd": 0.0, + "loss/logits": 0.26620072200894357, + "step": 8510 + }, + { + "epoch": 0.284, + "grad_norm": 32.25, + "grad_norm_var": 53.98098958333333, + "learning_rate": 0.0001, + "loss": 8.4943, + "loss/crossentropy": 2.0601254284381865, + "loss/hidden": 3.88046875, + "loss/jsd": 0.0, + "loss/logits": 0.24300396777689456, + "step": 8520 + }, + { + "epoch": 0.2843333333333333, + "grad_norm": 45.75, + "grad_norm_var": 19.379622395833334, + "learning_rate": 0.0001, + "loss": 8.5667, + "loss/crossentropy": 2.150593836605549, + "loss/hidden": 3.97578125, + "loss/jsd": 0.0, + "loss/logits": 0.2611148880794644, + "step": 8530 + }, + { + "epoch": 0.2846666666666667, + "grad_norm": 30.0, + "grad_norm_var": 17.92890625, + "learning_rate": 0.0001, + "loss": 8.5768, + "loss/crossentropy": 2.158891648054123, + "loss/hidden": 3.895703125, + "loss/jsd": 0.0, + "loss/logits": 0.2555501349270344, + "step": 8540 + }, + { + "epoch": 0.285, + "grad_norm": 30.75, + "grad_norm_var": 6.314583333333333, + "learning_rate": 0.0001, + "loss": 8.4415, + "loss/crossentropy": 2.1268053114414216, + "loss/hidden": 3.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2675272192806005, + "step": 8550 + }, + { + "epoch": 0.2853333333333333, + "grad_norm": 30.125, + "grad_norm_var": 5.924739583333333, + "learning_rate": 0.0001, + "loss": 8.3905, + "loss/crossentropy": 1.8700345799326896, + "loss/hidden": 3.9625, + "loss/jsd": 0.0, + "loss/logits": 0.22793399412184953, + "step": 8560 + }, + { + "epoch": 0.2856666666666667, + "grad_norm": 34.75, + "grad_norm_var": 28.47265625, + "learning_rate": 0.0001, + "loss": 8.5967, + "loss/crossentropy": 2.2868270367383956, + "loss/hidden": 3.873046875, + "loss/jsd": 0.0, + "loss/logits": 0.2460212778300047, + "step": 8570 + }, + { + "epoch": 0.286, + "grad_norm": 31.375, + "grad_norm_var": 25.4125, + "learning_rate": 0.0001, + "loss": 8.5052, + "loss/crossentropy": 1.9612272754311562, + "loss/hidden": 4.025, + "loss/jsd": 0.0, + "loss/logits": 0.24744862429797648, + "step": 8580 + }, + { + "epoch": 0.28633333333333333, + "grad_norm": 31.875, + "grad_norm_var": 19.161458333333332, + "learning_rate": 0.0001, + "loss": 8.5026, + "loss/crossentropy": 1.9540777966380118, + "loss/hidden": 3.840625, + "loss/jsd": 0.0, + "loss/logits": 0.22912114206701517, + "step": 8590 + }, + { + "epoch": 0.2866666666666667, + "grad_norm": 32.75, + "grad_norm_var": 1.7705729166666666, + "learning_rate": 0.0001, + "loss": 8.4321, + "loss/crossentropy": 2.183373187482357, + "loss/hidden": 3.820703125, + "loss/jsd": 0.0, + "loss/logits": 0.24152661189436914, + "step": 8600 + }, + { + "epoch": 0.287, + "grad_norm": 39.0, + "grad_norm_var": 14.377018229166667, + "learning_rate": 0.0001, + "loss": 8.6251, + "loss/crossentropy": 2.0541095778346063, + "loss/hidden": 4.001953125, + "loss/jsd": 0.0, + "loss/logits": 0.27750074546784165, + "step": 8610 + }, + { + "epoch": 0.28733333333333333, + "grad_norm": 30.625, + "grad_norm_var": 21.0462890625, + "learning_rate": 0.0001, + "loss": 8.4404, + "loss/crossentropy": 2.1556227087974547, + "loss/hidden": 3.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.23535772711038588, + "step": 8620 + }, + { + "epoch": 0.2876666666666667, + "grad_norm": 31.625, + "grad_norm_var": 6.3337890625, + "learning_rate": 0.0001, + "loss": 8.5323, + "loss/crossentropy": 2.2881136484444142, + "loss/hidden": 3.823046875, + "loss/jsd": 0.0, + "loss/logits": 0.26564789917320014, + "step": 8630 + }, + { + "epoch": 0.288, + "grad_norm": 29.875, + "grad_norm_var": 5.957747395833334, + "learning_rate": 0.0001, + "loss": 8.4478, + "loss/crossentropy": 2.1415953427553176, + "loss/hidden": 3.861328125, + "loss/jsd": 0.0, + "loss/logits": 0.24978371188044549, + "step": 8640 + }, + { + "epoch": 0.28833333333333333, + "grad_norm": 32.75, + "grad_norm_var": 8.3916015625, + "learning_rate": 0.0001, + "loss": 8.4509, + "loss/crossentropy": 2.0754496946930887, + "loss/hidden": 3.883203125, + "loss/jsd": 0.0, + "loss/logits": 0.2334285033866763, + "step": 8650 + }, + { + "epoch": 0.2886666666666667, + "grad_norm": 39.0, + "grad_norm_var": 12.662239583333333, + "learning_rate": 0.0001, + "loss": 8.5949, + "loss/crossentropy": 2.2709645599126818, + "loss/hidden": 3.901953125, + "loss/jsd": 0.0, + "loss/logits": 0.2653419800102711, + "step": 8660 + }, + { + "epoch": 0.289, + "grad_norm": 30.375, + "grad_norm_var": 11.949739583333333, + "learning_rate": 0.0001, + "loss": 8.4114, + "loss/crossentropy": 2.142335993051529, + "loss/hidden": 3.825390625, + "loss/jsd": 0.0, + "loss/logits": 0.2271432813256979, + "step": 8670 + }, + { + "epoch": 0.28933333333333333, + "grad_norm": 30.75, + "grad_norm_var": 13.236393229166667, + "learning_rate": 0.0001, + "loss": 8.4548, + "loss/crossentropy": 1.991011817008257, + "loss/hidden": 3.896875, + "loss/jsd": 0.0, + "loss/logits": 0.2274771448224783, + "step": 8680 + }, + { + "epoch": 0.2896666666666667, + "grad_norm": 31.875, + "grad_norm_var": 21.591666666666665, + "learning_rate": 0.0001, + "loss": 8.5036, + "loss/crossentropy": 2.067255499958992, + "loss/hidden": 3.975, + "loss/jsd": 0.0, + "loss/logits": 0.25000386498868465, + "step": 8690 + }, + { + "epoch": 0.29, + "grad_norm": 31.625, + "grad_norm_var": 13.7697265625, + "learning_rate": 0.0001, + "loss": 8.4065, + "loss/crossentropy": 2.032663035392761, + "loss/hidden": 3.9734375, + "loss/jsd": 0.0, + "loss/logits": 0.26009538136422633, + "step": 8700 + }, + { + "epoch": 0.29033333333333333, + "grad_norm": 28.625, + "grad_norm_var": 9.573372395833333, + "learning_rate": 0.0001, + "loss": 8.4264, + "loss/crossentropy": 2.129157376289368, + "loss/hidden": 3.781640625, + "loss/jsd": 0.0, + "loss/logits": 0.2426974017173052, + "step": 8710 + }, + { + "epoch": 0.2906666666666667, + "grad_norm": 31.125, + "grad_norm_var": 9.345247395833333, + "learning_rate": 0.0001, + "loss": 8.4556, + "loss/crossentropy": 2.1253123968839644, + "loss/hidden": 3.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.2316815422847867, + "step": 8720 + }, + { + "epoch": 0.291, + "grad_norm": 30.375, + "grad_norm_var": 288.3697265625, + "learning_rate": 0.0001, + "loss": 8.3447, + "loss/crossentropy": 2.0766125731170177, + "loss/hidden": 3.826171875, + "loss/jsd": 0.0, + "loss/logits": 0.2288608182221651, + "step": 8730 + }, + { + "epoch": 0.29133333333333333, + "grad_norm": 30.0, + "grad_norm_var": 291.34140625, + "learning_rate": 0.0001, + "loss": 8.4282, + "loss/crossentropy": 2.0344169199466706, + "loss/hidden": 3.881640625, + "loss/jsd": 0.0, + "loss/logits": 0.2401146437972784, + "step": 8740 + }, + { + "epoch": 0.2916666666666667, + "grad_norm": 31.375, + "grad_norm_var": 8.283333333333333, + "learning_rate": 0.0001, + "loss": 8.5577, + "loss/crossentropy": 2.0992552161216738, + "loss/hidden": 3.89296875, + "loss/jsd": 0.0, + "loss/logits": 0.2575317870825529, + "step": 8750 + }, + { + "epoch": 0.292, + "grad_norm": 27.75, + "grad_norm_var": 18.577083333333334, + "learning_rate": 0.0001, + "loss": 8.572, + "loss/crossentropy": 2.1781798616051673, + "loss/hidden": 3.837890625, + "loss/jsd": 0.0, + "loss/logits": 0.24388179033994675, + "step": 8760 + }, + { + "epoch": 0.29233333333333333, + "grad_norm": 31.125, + "grad_norm_var": 20.822330729166666, + "learning_rate": 0.0001, + "loss": 8.3706, + "loss/crossentropy": 2.0903132036328316, + "loss/hidden": 3.845703125, + "loss/jsd": 0.0, + "loss/logits": 0.22936930637806655, + "step": 8770 + }, + { + "epoch": 0.2926666666666667, + "grad_norm": 32.25, + "grad_norm_var": 12.4869140625, + "learning_rate": 0.0001, + "loss": 8.4817, + "loss/crossentropy": 2.1733203932642935, + "loss/hidden": 3.75, + "loss/jsd": 0.0, + "loss/logits": 0.23095921371132136, + "step": 8780 + }, + { + "epoch": 0.293, + "grad_norm": 33.25, + "grad_norm_var": 17.363997395833334, + "learning_rate": 0.0001, + "loss": 8.5659, + "loss/crossentropy": 2.088620986789465, + "loss/hidden": 3.82578125, + "loss/jsd": 0.0, + "loss/logits": 0.2334995089098811, + "step": 8790 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 33.75, + "grad_norm_var": 11.377018229166667, + "learning_rate": 0.0001, + "loss": 8.5218, + "loss/crossentropy": 1.9903348997235297, + "loss/hidden": 3.859375, + "loss/jsd": 0.0, + "loss/logits": 0.24103877376765012, + "step": 8800 + }, + { + "epoch": 0.2936666666666667, + "grad_norm": 31.625, + "grad_norm_var": 23.551041666666666, + "learning_rate": 0.0001, + "loss": 8.4445, + "loss/crossentropy": 2.0936522856354713, + "loss/hidden": 3.83125, + "loss/jsd": 0.0, + "loss/logits": 0.2551413768902421, + "step": 8810 + }, + { + "epoch": 0.294, + "grad_norm": 29.625, + "grad_norm_var": 9.57890625, + "learning_rate": 0.0001, + "loss": 8.4923, + "loss/crossentropy": 2.3439304143190385, + "loss/hidden": 3.803515625, + "loss/jsd": 0.0, + "loss/logits": 0.2454788561910391, + "step": 8820 + }, + { + "epoch": 0.29433333333333334, + "grad_norm": 35.5, + "grad_norm_var": 5.115559895833333, + "learning_rate": 0.0001, + "loss": 8.506, + "loss/crossentropy": 2.0696601763367655, + "loss/hidden": 3.872265625, + "loss/jsd": 0.0, + "loss/logits": 0.2583671987056732, + "step": 8830 + }, + { + "epoch": 0.2946666666666667, + "grad_norm": 30.625, + "grad_norm_var": 10.8509765625, + "learning_rate": 0.0001, + "loss": 8.4108, + "loss/crossentropy": 2.0794931963086127, + "loss/hidden": 3.76328125, + "loss/jsd": 0.0, + "loss/logits": 0.2228974211961031, + "step": 8840 + }, + { + "epoch": 0.295, + "grad_norm": 30.25, + "grad_norm_var": 10.792708333333334, + "learning_rate": 0.0001, + "loss": 8.4583, + "loss/crossentropy": 2.083049839735031, + "loss/hidden": 3.909765625, + "loss/jsd": 0.0, + "loss/logits": 0.243487436324358, + "step": 8850 + }, + { + "epoch": 0.29533333333333334, + "grad_norm": 33.25, + "grad_norm_var": 24.915625, + "learning_rate": 0.0001, + "loss": 8.5509, + "loss/crossentropy": 2.260637935996056, + "loss/hidden": 3.880859375, + "loss/jsd": 0.0, + "loss/logits": 0.24824294932186602, + "step": 8860 + }, + { + "epoch": 0.2956666666666667, + "grad_norm": 29.0, + "grad_norm_var": 29.515559895833334, + "learning_rate": 0.0001, + "loss": 8.3554, + "loss/crossentropy": 2.0667995259165766, + "loss/hidden": 3.91875, + "loss/jsd": 0.0, + "loss/logits": 0.24629948288202286, + "step": 8870 + }, + { + "epoch": 0.296, + "grad_norm": 35.25, + "grad_norm_var": 11.530208333333333, + "learning_rate": 0.0001, + "loss": 8.575, + "loss/crossentropy": 2.1920250236988066, + "loss/hidden": 3.991796875, + "loss/jsd": 0.0, + "loss/logits": 0.2532145943492651, + "step": 8880 + }, + { + "epoch": 0.29633333333333334, + "grad_norm": 32.75, + "grad_norm_var": 9.559375, + "learning_rate": 0.0001, + "loss": 8.3751, + "loss/crossentropy": 2.1635709404945374, + "loss/hidden": 3.77265625, + "loss/jsd": 0.0, + "loss/logits": 0.23346599154174327, + "step": 8890 + }, + { + "epoch": 0.2966666666666667, + "grad_norm": 33.25, + "grad_norm_var": 6.438997395833334, + "learning_rate": 0.0001, + "loss": 8.48, + "loss/crossentropy": 2.2036055833101273, + "loss/hidden": 3.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.22973880134522914, + "step": 8900 + }, + { + "epoch": 0.297, + "grad_norm": 32.5, + "grad_norm_var": 5.424739583333333, + "learning_rate": 0.0001, + "loss": 8.4154, + "loss/crossentropy": 2.319879895448685, + "loss/hidden": 3.826171875, + "loss/jsd": 0.0, + "loss/logits": 0.24667385257780552, + "step": 8910 + }, + { + "epoch": 0.29733333333333334, + "grad_norm": 30.25, + "grad_norm_var": 9.153059895833334, + "learning_rate": 0.0001, + "loss": 8.4004, + "loss/crossentropy": 2.2534718930721285, + "loss/hidden": 3.858984375, + "loss/jsd": 0.0, + "loss/logits": 0.25190291851758956, + "step": 8920 + }, + { + "epoch": 0.2976666666666667, + "grad_norm": 30.375, + "grad_norm_var": 6.745833333333334, + "learning_rate": 0.0001, + "loss": 8.4038, + "loss/crossentropy": 2.002136807143688, + "loss/hidden": 3.998828125, + "loss/jsd": 0.0, + "loss/logits": 0.2326902337372303, + "step": 8930 + }, + { + "epoch": 0.298, + "grad_norm": 30.75, + "grad_norm_var": 2.4567057291666665, + "learning_rate": 0.0001, + "loss": 8.4458, + "loss/crossentropy": 2.013306239247322, + "loss/hidden": 3.84921875, + "loss/jsd": 0.0, + "loss/logits": 0.22715070880949498, + "step": 8940 + }, + { + "epoch": 0.29833333333333334, + "grad_norm": 33.0, + "grad_norm_var": 4.373372395833333, + "learning_rate": 0.0001, + "loss": 8.447, + "loss/crossentropy": 2.2556902036070823, + "loss/hidden": 3.84921875, + "loss/jsd": 0.0, + "loss/logits": 0.2423699676990509, + "step": 8950 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 29.5, + "grad_norm_var": 43.209309895833336, + "learning_rate": 0.0001, + "loss": 8.4247, + "loss/crossentropy": 2.2286648035049437, + "loss/hidden": 3.874609375, + "loss/jsd": 0.0, + "loss/logits": 0.2592891216278076, + "step": 8960 + }, + { + "epoch": 0.299, + "grad_norm": 30.375, + "grad_norm_var": 7.9244140625, + "learning_rate": 0.0001, + "loss": 8.4795, + "loss/crossentropy": 2.003975507616997, + "loss/hidden": 3.9015625, + "loss/jsd": 0.0, + "loss/logits": 0.22656005583703517, + "step": 8970 + }, + { + "epoch": 0.29933333333333334, + "grad_norm": 29.875, + "grad_norm_var": 86.21979166666667, + "learning_rate": 0.0001, + "loss": 8.3252, + "loss/crossentropy": 2.1622410126030447, + "loss/hidden": 3.7859375, + "loss/jsd": 0.0, + "loss/logits": 0.25566081050783396, + "step": 8980 + }, + { + "epoch": 0.2996666666666667, + "grad_norm": 30.625, + "grad_norm_var": 28.685872395833332, + "learning_rate": 0.0001, + "loss": 8.4789, + "loss/crossentropy": 2.012830953299999, + "loss/hidden": 3.932421875, + "loss/jsd": 0.0, + "loss/logits": 0.22984218932688236, + "step": 8990 + }, + { + "epoch": 0.3, + "grad_norm": 34.5, + "grad_norm_var": 53.1634765625, + "learning_rate": 0.0001, + "loss": 8.5887, + "loss/crossentropy": 2.114059830456972, + "loss/hidden": 3.7703125, + "loss/jsd": 0.0, + "loss/logits": 0.23531355792656541, + "step": 9000 + }, + { + "epoch": 0.30033333333333334, + "grad_norm": 30.875, + "grad_norm_var": 16.183333333333334, + "learning_rate": 0.0001, + "loss": 8.433, + "loss/crossentropy": 2.0997436851263047, + "loss/hidden": 3.858984375, + "loss/jsd": 0.0, + "loss/logits": 0.2503551162779331, + "step": 9010 + }, + { + "epoch": 0.3006666666666667, + "grad_norm": 32.25, + "grad_norm_var": 6.585872395833333, + "learning_rate": 0.0001, + "loss": 8.4156, + "loss/crossentropy": 1.9693719133734704, + "loss/hidden": 3.89375, + "loss/jsd": 0.0, + "loss/logits": 0.24317112397402524, + "step": 9020 + }, + { + "epoch": 0.301, + "grad_norm": 32.75, + "grad_norm_var": 6.77265625, + "learning_rate": 0.0001, + "loss": 8.4748, + "loss/crossentropy": 2.2020839557051657, + "loss/hidden": 3.834375, + "loss/jsd": 0.0, + "loss/logits": 0.24702335204929113, + "step": 9030 + }, + { + "epoch": 0.30133333333333334, + "grad_norm": 32.75, + "grad_norm_var": 39.889322916666664, + "learning_rate": 0.0001, + "loss": 8.5507, + "loss/crossentropy": 2.1863808527588846, + "loss/hidden": 3.866015625, + "loss/jsd": 0.0, + "loss/logits": 0.2480563845485449, + "step": 9040 + }, + { + "epoch": 0.3016666666666667, + "grad_norm": 30.875, + "grad_norm_var": 3.921875, + "learning_rate": 0.0001, + "loss": 8.468, + "loss/crossentropy": 2.190934830904007, + "loss/hidden": 3.825390625, + "loss/jsd": 0.0, + "loss/logits": 0.23958997726440429, + "step": 9050 + }, + { + "epoch": 0.302, + "grad_norm": 32.0, + "grad_norm_var": 3.06875, + "learning_rate": 0.0001, + "loss": 8.6069, + "loss/crossentropy": 2.0262902580201625, + "loss/hidden": 3.857421875, + "loss/jsd": 0.0, + "loss/logits": 0.238032066822052, + "step": 9060 + }, + { + "epoch": 0.30233333333333334, + "grad_norm": 31.125, + "grad_norm_var": 10.905143229166667, + "learning_rate": 0.0001, + "loss": 8.5303, + "loss/crossentropy": 2.250582979619503, + "loss/hidden": 3.889453125, + "loss/jsd": 0.0, + "loss/logits": 0.2632708761841059, + "step": 9070 + }, + { + "epoch": 0.30266666666666664, + "grad_norm": 32.0, + "grad_norm_var": 7.757747395833333, + "learning_rate": 0.0001, + "loss": 8.3925, + "loss/crossentropy": 2.2143336325883864, + "loss/hidden": 3.739453125, + "loss/jsd": 0.0, + "loss/logits": 0.24901481308043003, + "step": 9080 + }, + { + "epoch": 0.303, + "grad_norm": 29.0, + "grad_norm_var": 4.017122395833334, + "learning_rate": 0.0001, + "loss": 8.2542, + "loss/crossentropy": 1.9929497942328454, + "loss/hidden": 3.869921875, + "loss/jsd": 0.0, + "loss/logits": 0.21925227269530295, + "step": 9090 + }, + { + "epoch": 0.30333333333333334, + "grad_norm": 35.25, + "grad_norm_var": 5.552083333333333, + "learning_rate": 0.0001, + "loss": 8.4873, + "loss/crossentropy": 2.0095451258122923, + "loss/hidden": 3.928515625, + "loss/jsd": 0.0, + "loss/logits": 0.2549102198332548, + "step": 9100 + }, + { + "epoch": 0.30366666666666664, + "grad_norm": 31.875, + "grad_norm_var": 382.7212890625, + "learning_rate": 0.0001, + "loss": 8.2141, + "loss/crossentropy": 2.1979493319988253, + "loss/hidden": 3.853515625, + "loss/jsd": 0.0, + "loss/logits": 0.24009186886250972, + "step": 9110 + }, + { + "epoch": 0.304, + "grad_norm": 27.375, + "grad_norm_var": 8.060416666666667, + "learning_rate": 0.0001, + "loss": 8.4104, + "loss/crossentropy": 1.9613987788558007, + "loss/hidden": 3.8875, + "loss/jsd": 0.0, + "loss/logits": 0.24363567791879176, + "step": 9120 + }, + { + "epoch": 0.30433333333333334, + "grad_norm": 31.75, + "grad_norm_var": 4.2900390625, + "learning_rate": 0.0001, + "loss": 8.2729, + "loss/crossentropy": 2.1863881021738054, + "loss/hidden": 3.91875, + "loss/jsd": 0.0, + "loss/logits": 0.2690675131976604, + "step": 9130 + }, + { + "epoch": 0.30466666666666664, + "grad_norm": 33.75, + "grad_norm_var": 6.741080729166667, + "learning_rate": 0.0001, + "loss": 8.5514, + "loss/crossentropy": 2.2427688628435134, + "loss/hidden": 3.885546875, + "loss/jsd": 0.0, + "loss/logits": 0.26169066652655604, + "step": 9140 + }, + { + "epoch": 0.305, + "grad_norm": 37.0, + "grad_norm_var": 285.70598958333335, + "learning_rate": 0.0001, + "loss": 8.6121, + "loss/crossentropy": 1.990150697529316, + "loss/hidden": 3.994921875, + "loss/jsd": 0.0, + "loss/logits": 0.24852563850581647, + "step": 9150 + }, + { + "epoch": 0.30533333333333335, + "grad_norm": 29.125, + "grad_norm_var": 306.2962890625, + "learning_rate": 0.0001, + "loss": 8.3866, + "loss/crossentropy": 2.1101179368793965, + "loss/hidden": 3.81328125, + "loss/jsd": 0.0, + "loss/logits": 0.22837954824790357, + "step": 9160 + }, + { + "epoch": 0.30566666666666664, + "grad_norm": 32.25, + "grad_norm_var": 2.093489583333333, + "learning_rate": 0.0001, + "loss": 8.4458, + "loss/crossentropy": 2.197181521356106, + "loss/hidden": 3.775390625, + "loss/jsd": 0.0, + "loss/logits": 0.22719864509999751, + "step": 9170 + }, + { + "epoch": 0.306, + "grad_norm": 33.5, + "grad_norm_var": 7.672916666666667, + "learning_rate": 0.0001, + "loss": 8.4434, + "loss/crossentropy": 2.16142196059227, + "loss/hidden": 3.771875, + "loss/jsd": 0.0, + "loss/logits": 0.23021659553050994, + "step": 9180 + }, + { + "epoch": 0.30633333333333335, + "grad_norm": 31.75, + "grad_norm_var": 9.355989583333333, + "learning_rate": 0.0001, + "loss": 8.3877, + "loss/crossentropy": 2.226871684193611, + "loss/hidden": 3.780859375, + "loss/jsd": 0.0, + "loss/logits": 0.24033361952751875, + "step": 9190 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 33.75, + "grad_norm_var": 9.868489583333334, + "learning_rate": 0.0001, + "loss": 8.3055, + "loss/crossentropy": 2.077723103761673, + "loss/hidden": 3.84375, + "loss/jsd": 0.0, + "loss/logits": 0.22728441171348096, + "step": 9200 + }, + { + "epoch": 0.307, + "grad_norm": 30.125, + "grad_norm_var": 2.35390625, + "learning_rate": 0.0001, + "loss": 8.3532, + "loss/crossentropy": 1.9681759729981423, + "loss/hidden": 3.755859375, + "loss/jsd": 0.0, + "loss/logits": 0.21074463604018093, + "step": 9210 + }, + { + "epoch": 0.30733333333333335, + "grad_norm": 29.0, + "grad_norm_var": 3.51015625, + "learning_rate": 0.0001, + "loss": 8.3401, + "loss/crossentropy": 2.04879729449749, + "loss/hidden": 3.869921875, + "loss/jsd": 0.0, + "loss/logits": 0.2396139794960618, + "step": 9220 + }, + { + "epoch": 0.30766666666666664, + "grad_norm": 30.0, + "grad_norm_var": 4.790625, + "learning_rate": 0.0001, + "loss": 8.4286, + "loss/crossentropy": 2.1882148049771786, + "loss/hidden": 3.808984375, + "loss/jsd": 0.0, + "loss/logits": 0.24543070700019598, + "step": 9230 + }, + { + "epoch": 0.308, + "grad_norm": 31.625, + "grad_norm_var": 2.678125, + "learning_rate": 0.0001, + "loss": 8.4211, + "loss/crossentropy": 2.2667997002601625, + "loss/hidden": 3.884375, + "loss/jsd": 0.0, + "loss/logits": 0.24577980488538742, + "step": 9240 + }, + { + "epoch": 0.30833333333333335, + "grad_norm": 29.75, + "grad_norm_var": 7.91875, + "learning_rate": 0.0001, + "loss": 8.219, + "loss/crossentropy": 1.994037589430809, + "loss/hidden": 3.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.24291819017380475, + "step": 9250 + }, + { + "epoch": 0.30866666666666664, + "grad_norm": 31.125, + "grad_norm_var": 8.796809895833333, + "learning_rate": 0.0001, + "loss": 8.3915, + "loss/crossentropy": 2.081589598953724, + "loss/hidden": 3.825390625, + "loss/jsd": 0.0, + "loss/logits": 0.2480682110413909, + "step": 9260 + }, + { + "epoch": 0.309, + "grad_norm": 32.75, + "grad_norm_var": 3.5931640625, + "learning_rate": 0.0001, + "loss": 8.3068, + "loss/crossentropy": 2.1044846177101135, + "loss/hidden": 3.815625, + "loss/jsd": 0.0, + "loss/logits": 0.24079927131533624, + "step": 9270 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 29.875, + "grad_norm_var": 2.27890625, + "learning_rate": 0.0001, + "loss": 8.2079, + "loss/crossentropy": 2.1612381815910338, + "loss/hidden": 3.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.24531424194574356, + "step": 9280 + }, + { + "epoch": 0.30966666666666665, + "grad_norm": 37.5, + "grad_norm_var": 5.51640625, + "learning_rate": 0.0001, + "loss": 8.4078, + "loss/crossentropy": 2.2455517396330835, + "loss/hidden": 3.787890625, + "loss/jsd": 0.0, + "loss/logits": 0.2453090760856867, + "step": 9290 + }, + { + "epoch": 0.31, + "grad_norm": 30.5, + "grad_norm_var": 6.86015625, + "learning_rate": 0.0001, + "loss": 8.3582, + "loss/crossentropy": 2.20194024592638, + "loss/hidden": 3.809375, + "loss/jsd": 0.0, + "loss/logits": 0.23695877343416213, + "step": 9300 + }, + { + "epoch": 0.31033333333333335, + "grad_norm": 30.0, + "grad_norm_var": 1.6389973958333333, + "learning_rate": 0.0001, + "loss": 8.3247, + "loss/crossentropy": 2.0889609307050705, + "loss/hidden": 3.86796875, + "loss/jsd": 0.0, + "loss/logits": 0.2596073430031538, + "step": 9310 + }, + { + "epoch": 0.31066666666666665, + "grad_norm": 30.5, + "grad_norm_var": 2.6884765625, + "learning_rate": 0.0001, + "loss": 8.352, + "loss/crossentropy": 2.1579457476735113, + "loss/hidden": 3.731640625, + "loss/jsd": 0.0, + "loss/logits": 0.2405384209007025, + "step": 9320 + }, + { + "epoch": 0.311, + "grad_norm": 33.25, + "grad_norm_var": 4.2119140625, + "learning_rate": 0.0001, + "loss": 8.4274, + "loss/crossentropy": 2.1878247916698457, + "loss/hidden": 3.809765625, + "loss/jsd": 0.0, + "loss/logits": 0.25121636800467967, + "step": 9330 + }, + { + "epoch": 0.31133333333333335, + "grad_norm": 31.75, + "grad_norm_var": 10.9603515625, + "learning_rate": 0.0001, + "loss": 8.4321, + "loss/crossentropy": 1.912748570740223, + "loss/hidden": 3.85703125, + "loss/jsd": 0.0, + "loss/logits": 0.23128144443035126, + "step": 9340 + }, + { + "epoch": 0.31166666666666665, + "grad_norm": 33.25, + "grad_norm_var": 2.4077473958333333, + "learning_rate": 0.0001, + "loss": 8.3519, + "loss/crossentropy": 2.1460745990276338, + "loss/hidden": 3.698046875, + "loss/jsd": 0.0, + "loss/logits": 0.21920422809198498, + "step": 9350 + }, + { + "epoch": 0.312, + "grad_norm": 33.25, + "grad_norm_var": 6.1134765625, + "learning_rate": 0.0001, + "loss": 8.464, + "loss/crossentropy": 2.0449389033019543, + "loss/hidden": 3.89140625, + "loss/jsd": 0.0, + "loss/logits": 0.23441595807671547, + "step": 9360 + }, + { + "epoch": 0.31233333333333335, + "grad_norm": 31.875, + "grad_norm_var": 9.36015625, + "learning_rate": 0.0001, + "loss": 8.4169, + "loss/crossentropy": 2.0924183890223502, + "loss/hidden": 3.936328125, + "loss/jsd": 0.0, + "loss/logits": 0.25249840430915355, + "step": 9370 + }, + { + "epoch": 0.31266666666666665, + "grad_norm": 30.625, + "grad_norm_var": 2.5635416666666666, + "learning_rate": 0.0001, + "loss": 8.3858, + "loss/crossentropy": 2.085674402490258, + "loss/hidden": 3.870703125, + "loss/jsd": 0.0, + "loss/logits": 0.2478548888117075, + "step": 9380 + }, + { + "epoch": 0.313, + "grad_norm": 31.625, + "grad_norm_var": 1.5947916666666666, + "learning_rate": 0.0001, + "loss": 8.3522, + "loss/crossentropy": 2.216167467832565, + "loss/hidden": 3.952734375, + "loss/jsd": 0.0, + "loss/logits": 0.2461514551192522, + "step": 9390 + }, + { + "epoch": 0.31333333333333335, + "grad_norm": 27.25, + "grad_norm_var": 4.6212890625, + "learning_rate": 0.0001, + "loss": 8.3455, + "loss/crossentropy": 2.0884574115276338, + "loss/hidden": 3.975, + "loss/jsd": 0.0, + "loss/logits": 0.23749772738665342, + "step": 9400 + }, + { + "epoch": 0.31366666666666665, + "grad_norm": 31.5, + "grad_norm_var": 4.935872395833333, + "learning_rate": 0.0001, + "loss": 8.4968, + "loss/crossentropy": 2.0512664087116717, + "loss/hidden": 3.9125, + "loss/jsd": 0.0, + "loss/logits": 0.24940967485308646, + "step": 9410 + }, + { + "epoch": 0.314, + "grad_norm": 32.75, + "grad_norm_var": 3.8374348958333333, + "learning_rate": 0.0001, + "loss": 8.507, + "loss/crossentropy": 2.1305520750582216, + "loss/hidden": 3.708203125, + "loss/jsd": 0.0, + "loss/logits": 0.22748439833521844, + "step": 9420 + }, + { + "epoch": 0.31433333333333335, + "grad_norm": 33.25, + "grad_norm_var": 3.1197265625, + "learning_rate": 0.0001, + "loss": 8.3505, + "loss/crossentropy": 2.1478428706526755, + "loss/hidden": 3.8171875, + "loss/jsd": 0.0, + "loss/logits": 0.26068285927176477, + "step": 9430 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 31.0, + "grad_norm_var": 10.255989583333333, + "learning_rate": 0.0001, + "loss": 8.4617, + "loss/crossentropy": 2.257087817788124, + "loss/hidden": 3.928515625, + "loss/jsd": 0.0, + "loss/logits": 0.24564366415143013, + "step": 9440 + }, + { + "epoch": 0.315, + "grad_norm": 30.625, + "grad_norm_var": 3.189322916666667, + "learning_rate": 0.0001, + "loss": 8.5139, + "loss/crossentropy": 2.2028581954538824, + "loss/hidden": 3.878125, + "loss/jsd": 0.0, + "loss/logits": 0.24985253605991603, + "step": 9450 + }, + { + "epoch": 0.31533333333333335, + "grad_norm": 32.25, + "grad_norm_var": 12.878059895833333, + "learning_rate": 0.0001, + "loss": 8.3581, + "loss/crossentropy": 2.2839835971593856, + "loss/hidden": 3.7609375, + "loss/jsd": 0.0, + "loss/logits": 0.24003779105842113, + "step": 9460 + }, + { + "epoch": 0.31566666666666665, + "grad_norm": 27.125, + "grad_norm_var": 13.624934895833333, + "learning_rate": 0.0001, + "loss": 8.3315, + "loss/crossentropy": 2.1846924126148224, + "loss/hidden": 3.828515625, + "loss/jsd": 0.0, + "loss/logits": 0.2577309591695666, + "step": 9470 + }, + { + "epoch": 0.316, + "grad_norm": 30.875, + "grad_norm_var": 11.371809895833334, + "learning_rate": 0.0001, + "loss": 8.3005, + "loss/crossentropy": 2.1467300802469254, + "loss/hidden": 3.784765625, + "loss/jsd": 0.0, + "loss/logits": 0.2385113213211298, + "step": 9480 + }, + { + "epoch": 0.31633333333333336, + "grad_norm": 32.25, + "grad_norm_var": 6.6697265625, + "learning_rate": 0.0001, + "loss": 8.4251, + "loss/crossentropy": 2.0814339205622674, + "loss/hidden": 3.93984375, + "loss/jsd": 0.0, + "loss/logits": 0.2647860247641802, + "step": 9490 + }, + { + "epoch": 0.31666666666666665, + "grad_norm": 40.0, + "grad_norm_var": 10.087955729166667, + "learning_rate": 0.0001, + "loss": 8.2889, + "loss/crossentropy": 2.0348707735538483, + "loss/hidden": 3.828125, + "loss/jsd": 0.0, + "loss/logits": 0.23109398372471332, + "step": 9500 + }, + { + "epoch": 0.317, + "grad_norm": 31.25, + "grad_norm_var": 8.975, + "learning_rate": 0.0001, + "loss": 8.3453, + "loss/crossentropy": 2.011518883705139, + "loss/hidden": 3.915234375, + "loss/jsd": 0.0, + "loss/logits": 0.24540937803685664, + "step": 9510 + }, + { + "epoch": 0.31733333333333336, + "grad_norm": 30.125, + "grad_norm_var": 2.7587890625, + "learning_rate": 0.0001, + "loss": 8.3162, + "loss/crossentropy": 2.1556458704173567, + "loss/hidden": 3.7796875, + "loss/jsd": 0.0, + "loss/logits": 0.2444867927581072, + "step": 9520 + }, + { + "epoch": 0.31766666666666665, + "grad_norm": 32.75, + "grad_norm_var": 3.5035807291666665, + "learning_rate": 0.0001, + "loss": 8.3472, + "loss/crossentropy": 2.081064721941948, + "loss/hidden": 3.833984375, + "loss/jsd": 0.0, + "loss/logits": 0.24423375371843575, + "step": 9530 + }, + { + "epoch": 0.318, + "grad_norm": 30.0, + "grad_norm_var": 9.223372395833334, + "learning_rate": 0.0001, + "loss": 8.3239, + "loss/crossentropy": 2.0272342920303346, + "loss/hidden": 4.003515625, + "loss/jsd": 0.0, + "loss/logits": 0.24455212838947774, + "step": 9540 + }, + { + "epoch": 0.31833333333333336, + "grad_norm": 34.5, + "grad_norm_var": 2.3824041777970545e+18, + "learning_rate": 0.0001, + "loss": 8.4159, + "loss/crossentropy": 2.2179324753582477, + "loss/hidden": 3.867578125, + "loss/jsd": 0.0, + "loss/logits": 0.23996016178280116, + "step": 9550 + }, + { + "epoch": 0.31866666666666665, + "grad_norm": 30.75, + "grad_norm_var": 2.382404178144343e+18, + "learning_rate": 0.0001, + "loss": 8.3233, + "loss/crossentropy": 2.08397556617856, + "loss/hidden": 3.85234375, + "loss/jsd": 0.0, + "loss/logits": 0.22639973247423767, + "step": 9560 + }, + { + "epoch": 0.319, + "grad_norm": 34.25, + "grad_norm_var": 99.1587890625, + "learning_rate": 0.0001, + "loss": 8.3621, + "loss/crossentropy": 2.1781677812337876, + "loss/hidden": 3.758984375, + "loss/jsd": 0.0, + "loss/logits": 0.23908968791365623, + "step": 9570 + }, + { + "epoch": 0.31933333333333336, + "grad_norm": 31.625, + "grad_norm_var": 3.3208333333333333, + "learning_rate": 0.0001, + "loss": 8.3194, + "loss/crossentropy": 2.165592886507511, + "loss/hidden": 3.739453125, + "loss/jsd": 0.0, + "loss/logits": 0.23889986649155617, + "step": 9580 + }, + { + "epoch": 0.31966666666666665, + "grad_norm": 31.75, + "grad_norm_var": 4.983072916666667, + "learning_rate": 0.0001, + "loss": 8.3029, + "loss/crossentropy": 2.0493919394910334, + "loss/hidden": 3.8265625, + "loss/jsd": 0.0, + "loss/logits": 0.21441856175661086, + "step": 9590 + }, + { + "epoch": 0.32, + "grad_norm": 32.25, + "grad_norm_var": 3.874739583333333, + "learning_rate": 0.0001, + "loss": 8.2748, + "loss/crossentropy": 2.035661220550537, + "loss/hidden": 3.841796875, + "loss/jsd": 0.0, + "loss/logits": 0.24448039066046476, + "step": 9600 + }, + { + "epoch": 0.32033333333333336, + "grad_norm": 28.375, + "grad_norm_var": 42.1072265625, + "learning_rate": 0.0001, + "loss": 8.3015, + "loss/crossentropy": 2.126982606202364, + "loss/hidden": 3.825390625, + "loss/jsd": 0.0, + "loss/logits": 0.2457258015871048, + "step": 9610 + }, + { + "epoch": 0.32066666666666666, + "grad_norm": 28.375, + "grad_norm_var": 28.330989583333334, + "learning_rate": 0.0001, + "loss": 8.2735, + "loss/crossentropy": 2.0983067765831946, + "loss/hidden": 3.830078125, + "loss/jsd": 0.0, + "loss/logits": 0.23017309829592705, + "step": 9620 + }, + { + "epoch": 0.321, + "grad_norm": 31.125, + "grad_norm_var": 40.84264322916667, + "learning_rate": 0.0001, + "loss": 8.3467, + "loss/crossentropy": 2.183681347966194, + "loss/hidden": 3.85546875, + "loss/jsd": 0.0, + "loss/logits": 0.24658725559711456, + "step": 9630 + }, + { + "epoch": 0.32133333333333336, + "grad_norm": 31.625, + "grad_norm_var": 27.297916666666666, + "learning_rate": 0.0001, + "loss": 8.2972, + "loss/crossentropy": 2.162729802727699, + "loss/hidden": 3.79609375, + "loss/jsd": 0.0, + "loss/logits": 0.23747125826776028, + "step": 9640 + }, + { + "epoch": 0.32166666666666666, + "grad_norm": 35.0, + "grad_norm_var": 5.288541666666666, + "learning_rate": 0.0001, + "loss": 8.434, + "loss/crossentropy": 2.1427035361528395, + "loss/hidden": 3.78671875, + "loss/jsd": 0.0, + "loss/logits": 0.249039919488132, + "step": 9650 + }, + { + "epoch": 0.322, + "grad_norm": 28.875, + "grad_norm_var": 6.216666666666667, + "learning_rate": 0.0001, + "loss": 8.2608, + "loss/crossentropy": 2.2181741327047346, + "loss/hidden": 3.7984375, + "loss/jsd": 0.0, + "loss/logits": 0.2580121297389269, + "step": 9660 + }, + { + "epoch": 0.32233333333333336, + "grad_norm": 32.0, + "grad_norm_var": 2.3535807291666666, + "learning_rate": 0.0001, + "loss": 8.3163, + "loss/crossentropy": 2.1240747086703777, + "loss/hidden": 3.805859375, + "loss/jsd": 0.0, + "loss/logits": 0.23749625347554684, + "step": 9670 + }, + { + "epoch": 0.32266666666666666, + "grad_norm": 31.0, + "grad_norm_var": 10.026041666666666, + "learning_rate": 0.0001, + "loss": 8.3716, + "loss/crossentropy": 2.178921973705292, + "loss/hidden": 3.826953125, + "loss/jsd": 0.0, + "loss/logits": 0.23335804082453251, + "step": 9680 + }, + { + "epoch": 0.323, + "grad_norm": 29.875, + "grad_norm_var": 3.7514973958333333, + "learning_rate": 0.0001, + "loss": 8.2053, + "loss/crossentropy": 1.9151308126747608, + "loss/hidden": 3.718359375, + "loss/jsd": 0.0, + "loss/logits": 0.22061082273721694, + "step": 9690 + }, + { + "epoch": 0.3233333333333333, + "grad_norm": 32.25, + "grad_norm_var": 6.75, + "learning_rate": 0.0001, + "loss": 8.2783, + "loss/crossentropy": 2.1763371601700783, + "loss/hidden": 3.832421875, + "loss/jsd": 0.0, + "loss/logits": 0.24064910151064395, + "step": 9700 + }, + { + "epoch": 0.32366666666666666, + "grad_norm": 35.5, + "grad_norm_var": 11.44765625, + "learning_rate": 0.0001, + "loss": 8.5377, + "loss/crossentropy": 2.058997410535812, + "loss/hidden": 3.8796875, + "loss/jsd": 0.0, + "loss/logits": 0.23803395926952362, + "step": 9710 + }, + { + "epoch": 0.324, + "grad_norm": 29.0, + "grad_norm_var": 4.698372395833333, + "learning_rate": 0.0001, + "loss": 8.1845, + "loss/crossentropy": 2.0421560525894167, + "loss/hidden": 3.766015625, + "loss/jsd": 0.0, + "loss/logits": 0.22774729747325181, + "step": 9720 + }, + { + "epoch": 0.3243333333333333, + "grad_norm": 29.5, + "grad_norm_var": 31.884309895833333, + "learning_rate": 0.0001, + "loss": 8.2768, + "loss/crossentropy": 2.1532857537269594, + "loss/hidden": 3.821875, + "loss/jsd": 0.0, + "loss/logits": 0.24113734550774096, + "step": 9730 + }, + { + "epoch": 0.32466666666666666, + "grad_norm": 31.0, + "grad_norm_var": 7.264518229166667, + "learning_rate": 0.0001, + "loss": 8.4747, + "loss/crossentropy": 2.0760410211980345, + "loss/hidden": 3.835546875, + "loss/jsd": 0.0, + "loss/logits": 0.23573338724672793, + "step": 9740 + }, + { + "epoch": 0.325, + "grad_norm": 29.375, + "grad_norm_var": 6.7337890625, + "learning_rate": 0.0001, + "loss": 8.3715, + "loss/crossentropy": 2.2192045249044896, + "loss/hidden": 3.683203125, + "loss/jsd": 0.0, + "loss/logits": 0.21446713693439962, + "step": 9750 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 31.25, + "grad_norm_var": 8.4291015625, + "learning_rate": 0.0001, + "loss": 8.3085, + "loss/crossentropy": 2.195969696342945, + "loss/hidden": 3.7796875, + "loss/jsd": 0.0, + "loss/logits": 0.24359578415751457, + "step": 9760 + }, + { + "epoch": 0.32566666666666666, + "grad_norm": 29.375, + "grad_norm_var": 19.993489583333332, + "learning_rate": 0.0001, + "loss": 8.3417, + "loss/crossentropy": 2.0906290262937546, + "loss/hidden": 3.74921875, + "loss/jsd": 0.0, + "loss/logits": 0.2237309933640063, + "step": 9770 + }, + { + "epoch": 0.326, + "grad_norm": 36.75, + "grad_norm_var": 21.134830729166666, + "learning_rate": 0.0001, + "loss": 8.5136, + "loss/crossentropy": 2.0349312365055083, + "loss/hidden": 4.02265625, + "loss/jsd": 0.0, + "loss/logits": 0.25023720134049654, + "step": 9780 + }, + { + "epoch": 0.3263333333333333, + "grad_norm": 33.5, + "grad_norm_var": 6.266666666666667, + "learning_rate": 0.0001, + "loss": 8.3713, + "loss/crossentropy": 2.0479774929583074, + "loss/hidden": 3.88203125, + "loss/jsd": 0.0, + "loss/logits": 0.2510897688567638, + "step": 9790 + }, + { + "epoch": 0.32666666666666666, + "grad_norm": 29.25, + "grad_norm_var": 3.5947916666666666, + "learning_rate": 0.0001, + "loss": 8.3595, + "loss/crossentropy": 2.1898303270339965, + "loss/hidden": 3.8953125, + "loss/jsd": 0.0, + "loss/logits": 0.25962252635508776, + "step": 9800 + }, + { + "epoch": 0.327, + "grad_norm": 36.5, + "grad_norm_var": 6.862434895833333, + "learning_rate": 0.0001, + "loss": 8.3717, + "loss/crossentropy": 2.0998566307127478, + "loss/hidden": 3.752734375, + "loss/jsd": 0.0, + "loss/logits": 0.2255854407325387, + "step": 9810 + }, + { + "epoch": 0.3273333333333333, + "grad_norm": 30.125, + "grad_norm_var": 7.448893229166667, + "learning_rate": 0.0001, + "loss": 8.3888, + "loss/crossentropy": 2.15216224193573, + "loss/hidden": 3.88203125, + "loss/jsd": 0.0, + "loss/logits": 0.24142319150269032, + "step": 9820 + }, + { + "epoch": 0.32766666666666666, + "grad_norm": 32.5, + "grad_norm_var": 5.168684895833334, + "learning_rate": 0.0001, + "loss": 8.2977, + "loss/crossentropy": 2.0397166281938555, + "loss/hidden": 3.85859375, + "loss/jsd": 0.0, + "loss/logits": 0.24393697939813136, + "step": 9830 + }, + { + "epoch": 0.328, + "grad_norm": 30.125, + "grad_norm_var": 3.12890625, + "learning_rate": 0.0001, + "loss": 8.3692, + "loss/crossentropy": 2.008080554753542, + "loss/hidden": 3.7828125, + "loss/jsd": 0.0, + "loss/logits": 0.22885626852512359, + "step": 9840 + }, + { + "epoch": 0.3283333333333333, + "grad_norm": 30.5, + "grad_norm_var": 2.624739583333333, + "learning_rate": 0.0001, + "loss": 8.3227, + "loss/crossentropy": 2.1162449195981026, + "loss/hidden": 3.86953125, + "loss/jsd": 0.0, + "loss/logits": 0.2397829968482256, + "step": 9850 + }, + { + "epoch": 0.32866666666666666, + "grad_norm": 29.5, + "grad_norm_var": 5.151041666666667, + "learning_rate": 0.0001, + "loss": 8.347, + "loss/crossentropy": 2.2286925226449967, + "loss/hidden": 3.859375, + "loss/jsd": 0.0, + "loss/logits": 0.24280091263353826, + "step": 9860 + }, + { + "epoch": 0.329, + "grad_norm": 30.375, + "grad_norm_var": 8.0072265625, + "learning_rate": 0.0001, + "loss": 8.3725, + "loss/crossentropy": 2.1761581540107726, + "loss/hidden": 3.762890625, + "loss/jsd": 0.0, + "loss/logits": 0.22693675048649312, + "step": 9870 + }, + { + "epoch": 0.3293333333333333, + "grad_norm": 29.125, + "grad_norm_var": 2.90390625, + "learning_rate": 0.0001, + "loss": 8.3481, + "loss/crossentropy": 2.080559401214123, + "loss/hidden": 3.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.23833436965942384, + "step": 9880 + }, + { + "epoch": 0.32966666666666666, + "grad_norm": 28.625, + "grad_norm_var": 10.793489583333333, + "learning_rate": 0.0001, + "loss": 8.2629, + "loss/crossentropy": 2.1615509897470475, + "loss/hidden": 3.900390625, + "loss/jsd": 0.0, + "loss/logits": 0.2443408088758588, + "step": 9890 + }, + { + "epoch": 0.33, + "grad_norm": 29.0, + "grad_norm_var": 13.769205729166666, + "learning_rate": 0.0001, + "loss": 8.4146, + "loss/crossentropy": 2.0900515958666803, + "loss/hidden": 3.8265625, + "loss/jsd": 0.0, + "loss/logits": 0.23338977247476578, + "step": 9900 + }, + { + "epoch": 0.3303333333333333, + "grad_norm": 29.875, + "grad_norm_var": 5.7009765625, + "learning_rate": 0.0001, + "loss": 8.3165, + "loss/crossentropy": 2.0612318471074103, + "loss/hidden": 3.8640625, + "loss/jsd": 0.0, + "loss/logits": 0.2572694033384323, + "step": 9910 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 38.5, + "grad_norm_var": 9.492122395833333, + "learning_rate": 0.0001, + "loss": 8.3621, + "loss/crossentropy": 2.1355748385190965, + "loss/hidden": 3.861328125, + "loss/jsd": 0.0, + "loss/logits": 0.2555574133992195, + "step": 9920 + }, + { + "epoch": 0.331, + "grad_norm": 31.875, + "grad_norm_var": 8.562434895833333, + "learning_rate": 0.0001, + "loss": 8.2093, + "loss/crossentropy": 2.032300639897585, + "loss/hidden": 3.8375, + "loss/jsd": 0.0, + "loss/logits": 0.25119177643209695, + "step": 9930 + }, + { + "epoch": 0.3313333333333333, + "grad_norm": 33.25, + "grad_norm_var": 1.31640625, + "learning_rate": 0.0001, + "loss": 8.3696, + "loss/crossentropy": 2.0900708585977554, + "loss/hidden": 3.874609375, + "loss/jsd": 0.0, + "loss/logits": 0.23378594107925893, + "step": 9940 + }, + { + "epoch": 0.33166666666666667, + "grad_norm": 34.5, + "grad_norm_var": 55.3228515625, + "learning_rate": 0.0001, + "loss": 8.3493, + "loss/crossentropy": 1.9557788401842118, + "loss/hidden": 3.908984375, + "loss/jsd": 0.0, + "loss/logits": 0.24859324246644973, + "step": 9950 + }, + { + "epoch": 0.332, + "grad_norm": 30.625, + "grad_norm_var": 58.469791666666666, + "learning_rate": 0.0001, + "loss": 8.3664, + "loss/crossentropy": 2.041897915303707, + "loss/hidden": 3.775, + "loss/jsd": 0.0, + "loss/logits": 0.229884634912014, + "step": 9960 + }, + { + "epoch": 0.3323333333333333, + "grad_norm": 33.5, + "grad_norm_var": 3.70390625, + "learning_rate": 0.0001, + "loss": 8.4056, + "loss/crossentropy": 2.10531293079257, + "loss/hidden": 3.849609375, + "loss/jsd": 0.0, + "loss/logits": 0.23604489639401435, + "step": 9970 + }, + { + "epoch": 0.33266666666666667, + "grad_norm": 33.0, + "grad_norm_var": 15.415559895833333, + "learning_rate": 0.0001, + "loss": 8.2762, + "loss/crossentropy": 2.3318694084882736, + "loss/hidden": 3.791015625, + "loss/jsd": 0.0, + "loss/logits": 0.24015157260000705, + "step": 9980 + }, + { + "epoch": 0.333, + "grad_norm": 30.625, + "grad_norm_var": 3.520768229166667, + "learning_rate": 0.0001, + "loss": 8.3135, + "loss/crossentropy": 2.2009642593562604, + "loss/hidden": 3.73671875, + "loss/jsd": 0.0, + "loss/logits": 0.22714318558573723, + "step": 9990 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 28.5, + "grad_norm_var": 3.31875, + "learning_rate": 0.0001, + "loss": 8.0996, + "loss/crossentropy": 1.9759045481681823, + "loss/hidden": 3.86015625, + "loss/jsd": 0.0, + "loss/logits": 0.2291174167767167, + "step": 10000 + }, + { + "epoch": 0.33366666666666667, + "grad_norm": 32.25, + "grad_norm_var": 7.280143229166667, + "learning_rate": 0.0001, + "loss": 8.3198, + "loss/crossentropy": 2.250378394126892, + "loss/hidden": 3.8390625, + "loss/jsd": 0.0, + "loss/logits": 0.24800184294581412, + "step": 10010 + }, + { + "epoch": 0.334, + "grad_norm": 37.5, + "grad_norm_var": 1.892637721373547e+18, + "learning_rate": 0.0001, + "loss": 8.5697, + "loss/crossentropy": 2.0878022998571395, + "loss/hidden": 3.894140625, + "loss/jsd": 0.0, + "loss/logits": 0.24692457877099513, + "step": 10020 + }, + { + "epoch": 0.3343333333333333, + "grad_norm": 34.25, + "grad_norm_var": 17.612239583333334, + "learning_rate": 0.0001, + "loss": 8.4287, + "loss/crossentropy": 2.1991532504558564, + "loss/hidden": 3.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.22239614240825176, + "step": 10030 + }, + { + "epoch": 0.33466666666666667, + "grad_norm": 33.75, + "grad_norm_var": 14.96015625, + "learning_rate": 0.0001, + "loss": 8.3611, + "loss/crossentropy": 2.1013733208179475, + "loss/hidden": 3.845703125, + "loss/jsd": 0.0, + "loss/logits": 0.2412415651604533, + "step": 10040 + }, + { + "epoch": 0.335, + "grad_norm": 31.625, + "grad_norm_var": 2.1184895833333335, + "learning_rate": 0.0001, + "loss": 8.3079, + "loss/crossentropy": 2.1323711395263674, + "loss/hidden": 3.775390625, + "loss/jsd": 0.0, + "loss/logits": 0.2319201186299324, + "step": 10050 + }, + { + "epoch": 0.3353333333333333, + "grad_norm": 28.5, + "grad_norm_var": 4.184309895833334, + "learning_rate": 0.0001, + "loss": 8.3487, + "loss/crossentropy": 2.1746664479374886, + "loss/hidden": 3.869921875, + "loss/jsd": 0.0, + "loss/logits": 0.24217009954154492, + "step": 10060 + }, + { + "epoch": 0.33566666666666667, + "grad_norm": 31.0, + "grad_norm_var": 4.6712890625, + "learning_rate": 0.0001, + "loss": 8.4495, + "loss/crossentropy": 2.1526307716965674, + "loss/hidden": 3.76875, + "loss/jsd": 0.0, + "loss/logits": 0.24897574950009585, + "step": 10070 + }, + { + "epoch": 0.336, + "grad_norm": 33.25, + "grad_norm_var": 3.486458333333333, + "learning_rate": 0.0001, + "loss": 8.3761, + "loss/crossentropy": 2.175819969177246, + "loss/hidden": 3.89453125, + "loss/jsd": 0.0, + "loss/logits": 0.2495252525433898, + "step": 10080 + }, + { + "epoch": 0.3363333333333333, + "grad_norm": 31.375, + "grad_norm_var": 3.7577473958333334, + "learning_rate": 0.0001, + "loss": 8.2976, + "loss/crossentropy": 2.1154340267181397, + "loss/hidden": 3.886328125, + "loss/jsd": 0.0, + "loss/logits": 0.2435309149324894, + "step": 10090 + }, + { + "epoch": 0.33666666666666667, + "grad_norm": 28.25, + "grad_norm_var": 5.289583333333334, + "learning_rate": 0.0001, + "loss": 8.2897, + "loss/crossentropy": 1.9862043529748916, + "loss/hidden": 3.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.22356732599437237, + "step": 10100 + }, + { + "epoch": 0.337, + "grad_norm": 30.125, + "grad_norm_var": 4.676497395833334, + "learning_rate": 0.0001, + "loss": 8.4647, + "loss/crossentropy": 2.149342668801546, + "loss/hidden": 3.8328125, + "loss/jsd": 0.0, + "loss/logits": 0.25235841386020186, + "step": 10110 + }, + { + "epoch": 0.3373333333333333, + "grad_norm": 41.75, + "grad_norm_var": 70.57057291666666, + "learning_rate": 0.0001, + "loss": 8.4197, + "loss/crossentropy": 2.3661131516098974, + "loss/hidden": 3.769140625, + "loss/jsd": 0.0, + "loss/logits": 0.25782175101339816, + "step": 10120 + }, + { + "epoch": 0.33766666666666667, + "grad_norm": 30.75, + "grad_norm_var": 295.07604166666664, + "learning_rate": 0.0001, + "loss": 8.2983, + "loss/crossentropy": 2.2677551925182344, + "loss/hidden": 3.787890625, + "loss/jsd": 0.0, + "loss/logits": 0.24740365371108056, + "step": 10130 + }, + { + "epoch": 0.338, + "grad_norm": 35.5, + "grad_norm_var": 284.6176432291667, + "learning_rate": 0.0001, + "loss": 8.4841, + "loss/crossentropy": 2.134307199716568, + "loss/hidden": 3.8296875, + "loss/jsd": 0.0, + "loss/logits": 0.23806515689939262, + "step": 10140 + }, + { + "epoch": 0.3383333333333333, + "grad_norm": 32.5, + "grad_norm_var": 4.538997395833333, + "learning_rate": 0.0001, + "loss": 8.3621, + "loss/crossentropy": 2.3126337975263596, + "loss/hidden": 3.85703125, + "loss/jsd": 0.0, + "loss/logits": 0.26148965023458004, + "step": 10150 + }, + { + "epoch": 0.33866666666666667, + "grad_norm": 37.0, + "grad_norm_var": 5.0353515625, + "learning_rate": 0.0001, + "loss": 8.2753, + "loss/crossentropy": 2.0247806657105683, + "loss/hidden": 3.89375, + "loss/jsd": 0.0, + "loss/logits": 0.23136391188018024, + "step": 10160 + }, + { + "epoch": 0.339, + "grad_norm": 27.75, + "grad_norm_var": 7.939518229166667, + "learning_rate": 0.0001, + "loss": 8.2568, + "loss/crossentropy": 2.1857830375432967, + "loss/hidden": 3.741796875, + "loss/jsd": 0.0, + "loss/logits": 0.23809754736721517, + "step": 10170 + }, + { + "epoch": 0.3393333333333333, + "grad_norm": 31.75, + "grad_norm_var": 6.262239583333334, + "learning_rate": 0.0001, + "loss": 8.321, + "loss/crossentropy": 2.045456614345312, + "loss/hidden": 3.776953125, + "loss/jsd": 0.0, + "loss/logits": 0.23576115854084492, + "step": 10180 + }, + { + "epoch": 0.3396666666666667, + "grad_norm": 30.5, + "grad_norm_var": 1.8822265625, + "learning_rate": 0.0001, + "loss": 8.2247, + "loss/crossentropy": 2.035719431936741, + "loss/hidden": 3.757421875, + "loss/jsd": 0.0, + "loss/logits": 0.2281944528222084, + "step": 10190 + }, + { + "epoch": 0.34, + "grad_norm": 31.375, + "grad_norm_var": 7.2744140625, + "learning_rate": 0.0001, + "loss": 8.2969, + "loss/crossentropy": 2.130661930143833, + "loss/hidden": 3.798046875, + "loss/jsd": 0.0, + "loss/logits": 0.23119777366518973, + "step": 10200 + }, + { + "epoch": 0.3403333333333333, + "grad_norm": 38.0, + "grad_norm_var": 9.555989583333334, + "learning_rate": 0.0001, + "loss": 8.3353, + "loss/crossentropy": 2.007842856645584, + "loss/hidden": 3.954296875, + "loss/jsd": 0.0, + "loss/logits": 0.2692394644021988, + "step": 10210 + }, + { + "epoch": 0.3406666666666667, + "grad_norm": 29.75, + "grad_norm_var": 6.705208333333333, + "learning_rate": 0.0001, + "loss": 8.3837, + "loss/crossentropy": 2.097426188737154, + "loss/hidden": 3.803515625, + "loss/jsd": 0.0, + "loss/logits": 0.23504080064594746, + "step": 10220 + }, + { + "epoch": 0.341, + "grad_norm": 34.75, + "grad_norm_var": 5.630143229166666, + "learning_rate": 0.0001, + "loss": 8.3845, + "loss/crossentropy": 2.030475867539644, + "loss/hidden": 3.895703125, + "loss/jsd": 0.0, + "loss/logits": 0.2574477320536971, + "step": 10230 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 29.75, + "grad_norm_var": 4.81640625, + "learning_rate": 0.0001, + "loss": 8.3345, + "loss/crossentropy": 2.1374482408165933, + "loss/hidden": 3.755859375, + "loss/jsd": 0.0, + "loss/logits": 0.23863786160945893, + "step": 10240 + }, + { + "epoch": 0.3416666666666667, + "grad_norm": 31.625, + "grad_norm_var": 3.5625, + "learning_rate": 0.0001, + "loss": 8.4214, + "loss/crossentropy": 1.949062729626894, + "loss/hidden": 3.873828125, + "loss/jsd": 0.0, + "loss/logits": 0.24437980260699987, + "step": 10250 + }, + { + "epoch": 0.342, + "grad_norm": 31.75, + "grad_norm_var": 3.8447265625, + "learning_rate": 0.0001, + "loss": 8.4116, + "loss/crossentropy": 2.06142435669899, + "loss/hidden": 3.733984375, + "loss/jsd": 0.0, + "loss/logits": 0.21934528928250074, + "step": 10260 + }, + { + "epoch": 0.3423333333333333, + "grad_norm": 29.25, + "grad_norm_var": 4.991080729166667, + "learning_rate": 0.0001, + "loss": 8.3459, + "loss/crossentropy": 2.0533831655979156, + "loss/hidden": 3.821484375, + "loss/jsd": 0.0, + "loss/logits": 0.237164250575006, + "step": 10270 + }, + { + "epoch": 0.3426666666666667, + "grad_norm": 29.875, + "grad_norm_var": 8.5125, + "learning_rate": 0.0001, + "loss": 8.4675, + "loss/crossentropy": 2.220197274535894, + "loss/hidden": 3.89140625, + "loss/jsd": 0.0, + "loss/logits": 0.253680607303977, + "step": 10280 + }, + { + "epoch": 0.343, + "grad_norm": 31.625, + "grad_norm_var": 4.987239583333333, + "learning_rate": 0.0001, + "loss": 8.2682, + "loss/crossentropy": 2.1430200926959513, + "loss/hidden": 3.75703125, + "loss/jsd": 0.0, + "loss/logits": 0.22555206064134836, + "step": 10290 + }, + { + "epoch": 0.3433333333333333, + "grad_norm": 28.375, + "grad_norm_var": 4.96640625, + "learning_rate": 0.0001, + "loss": 8.3617, + "loss/crossentropy": 2.2133467949926855, + "loss/hidden": 3.903125, + "loss/jsd": 0.0, + "loss/logits": 0.26156550645828247, + "step": 10300 + }, + { + "epoch": 0.3436666666666667, + "grad_norm": 29.375, + "grad_norm_var": 7.2212890625, + "learning_rate": 0.0001, + "loss": 8.2326, + "loss/crossentropy": 2.02518198415637, + "loss/hidden": 3.685546875, + "loss/jsd": 0.0, + "loss/logits": 0.21950181983411313, + "step": 10310 + }, + { + "epoch": 0.344, + "grad_norm": 29.875, + "grad_norm_var": 8.96015625, + "learning_rate": 0.0001, + "loss": 8.2243, + "loss/crossentropy": 2.1252332836389543, + "loss/hidden": 3.783203125, + "loss/jsd": 0.0, + "loss/logits": 0.23033270034939052, + "step": 10320 + }, + { + "epoch": 0.3443333333333333, + "grad_norm": 28.25, + "grad_norm_var": 7.640625, + "learning_rate": 0.0001, + "loss": 8.4185, + "loss/crossentropy": 2.0800456672906877, + "loss/hidden": 3.892578125, + "loss/jsd": 0.0, + "loss/logits": 0.25349009446799753, + "step": 10330 + }, + { + "epoch": 0.3446666666666667, + "grad_norm": 6408896512.0, + "grad_norm_var": 2.5671221312037934e+18, + "learning_rate": 0.0001, + "loss": 8.4544, + "loss/crossentropy": 2.2498678863048553, + "loss/hidden": 3.81015625, + "loss/jsd": 0.0, + "loss/logits": 0.24522239342331886, + "step": 10340 + }, + { + "epoch": 0.345, + "grad_norm": 29.75, + "grad_norm_var": 2.56712212992869e+18, + "learning_rate": 0.0001, + "loss": 8.332, + "loss/crossentropy": 2.067190906405449, + "loss/hidden": 3.79765625, + "loss/jsd": 0.0, + "loss/logits": 0.22377760540693997, + "step": 10350 + }, + { + "epoch": 0.3453333333333333, + "grad_norm": 30.875, + "grad_norm_var": 35.8744140625, + "learning_rate": 0.0001, + "loss": 8.3931, + "loss/crossentropy": 2.3343340516090394, + "loss/hidden": 3.671875, + "loss/jsd": 0.0, + "loss/logits": 0.23258175030350686, + "step": 10360 + }, + { + "epoch": 0.3456666666666667, + "grad_norm": 30.125, + "grad_norm_var": 5.64765625, + "learning_rate": 0.0001, + "loss": 8.2511, + "loss/crossentropy": 2.0171373963356016, + "loss/hidden": 3.93359375, + "loss/jsd": 0.0, + "loss/logits": 0.24750286806374788, + "step": 10370 + }, + { + "epoch": 0.346, + "grad_norm": 34.0, + "grad_norm_var": 2117.798372395833, + "learning_rate": 0.0001, + "loss": 8.4717, + "loss/crossentropy": 2.027893168479204, + "loss/hidden": 4.028125, + "loss/jsd": 0.0, + "loss/logits": 0.256832991912961, + "step": 10380 + }, + { + "epoch": 0.3463333333333333, + "grad_norm": 33.75, + "grad_norm_var": 2119.6337890625, + "learning_rate": 0.0001, + "loss": 8.4619, + "loss/crossentropy": 2.106752772629261, + "loss/hidden": 3.865625, + "loss/jsd": 0.0, + "loss/logits": 0.2447080912068486, + "step": 10390 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 32.5, + "grad_norm_var": 10.728059895833333, + "learning_rate": 0.0001, + "loss": 8.4674, + "loss/crossentropy": 2.150819255411625, + "loss/hidden": 3.839453125, + "loss/jsd": 0.0, + "loss/logits": 0.2522192716598511, + "step": 10400 + }, + { + "epoch": 0.347, + "grad_norm": 29.25, + "grad_norm_var": 19.118489583333332, + "learning_rate": 0.0001, + "loss": 8.3815, + "loss/crossentropy": 2.1638945795595648, + "loss/hidden": 3.760546875, + "loss/jsd": 0.0, + "loss/logits": 0.2254624404013157, + "step": 10410 + }, + { + "epoch": 0.3473333333333333, + "grad_norm": 27.625, + "grad_norm_var": 5.62890625, + "learning_rate": 0.0001, + "loss": 8.2479, + "loss/crossentropy": 2.0657031178474425, + "loss/hidden": 3.637890625, + "loss/jsd": 0.0, + "loss/logits": 0.21996993869543074, + "step": 10420 + }, + { + "epoch": 0.3476666666666667, + "grad_norm": 28.375, + "grad_norm_var": 5.833072916666667, + "learning_rate": 0.0001, + "loss": 8.3344, + "loss/crossentropy": 2.1299983762204646, + "loss/hidden": 3.780859375, + "loss/jsd": 0.0, + "loss/logits": 0.21561280181631445, + "step": 10430 + }, + { + "epoch": 0.348, + "grad_norm": 28.375, + "grad_norm_var": 8.4150390625, + "learning_rate": 0.0001, + "loss": 8.3951, + "loss/crossentropy": 2.1073717825114726, + "loss/hidden": 3.808203125, + "loss/jsd": 0.0, + "loss/logits": 0.21989304553717376, + "step": 10440 + }, + { + "epoch": 0.34833333333333333, + "grad_norm": 34.0, + "grad_norm_var": 3.081184895833333, + "learning_rate": 0.0001, + "loss": 8.2967, + "loss/crossentropy": 2.077288343012333, + "loss/hidden": 3.78359375, + "loss/jsd": 0.0, + "loss/logits": 0.23938167467713356, + "step": 10450 + }, + { + "epoch": 0.3486666666666667, + "grad_norm": 31.25, + "grad_norm_var": 2.089583333333333, + "learning_rate": 0.0001, + "loss": 8.3636, + "loss/crossentropy": 1.9518108278512956, + "loss/hidden": 3.88125, + "loss/jsd": 0.0, + "loss/logits": 0.2389603516086936, + "step": 10460 + }, + { + "epoch": 0.349, + "grad_norm": 28.625, + "grad_norm_var": 1.7738932291666667, + "learning_rate": 0.0001, + "loss": 8.4079, + "loss/crossentropy": 2.0541266784071923, + "loss/hidden": 3.709375, + "loss/jsd": 0.0, + "loss/logits": 0.21510923374444246, + "step": 10470 + }, + { + "epoch": 0.34933333333333333, + "grad_norm": 29.5, + "grad_norm_var": 3.5259765625, + "learning_rate": 0.0001, + "loss": 8.3548, + "loss/crossentropy": 1.9732642628252506, + "loss/hidden": 3.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.23084475416690112, + "step": 10480 + }, + { + "epoch": 0.3496666666666667, + "grad_norm": 31.25, + "grad_norm_var": 2.0927083333333334, + "learning_rate": 0.0001, + "loss": 8.245, + "loss/crossentropy": 2.0425802804529667, + "loss/hidden": 3.7640625, + "loss/jsd": 0.0, + "loss/logits": 0.2300992401316762, + "step": 10490 + }, + { + "epoch": 0.35, + "grad_norm": 27.375, + "grad_norm_var": 3.1681640625, + "learning_rate": 0.0001, + "loss": 8.3515, + "loss/crossentropy": 2.1685428470373154, + "loss/hidden": 3.887109375, + "loss/jsd": 0.0, + "loss/logits": 0.2667371932417154, + "step": 10500 + }, + { + "epoch": 0.35033333333333333, + "grad_norm": 30.375, + "grad_norm_var": 8.442708333333334, + "learning_rate": 0.0001, + "loss": 8.2839, + "loss/crossentropy": 2.0770496785640717, + "loss/hidden": 3.8609375, + "loss/jsd": 0.0, + "loss/logits": 0.23902000039815902, + "step": 10510 + }, + { + "epoch": 0.3506666666666667, + "grad_norm": 32.0, + "grad_norm_var": 8.070768229166667, + "learning_rate": 0.0001, + "loss": 8.2154, + "loss/crossentropy": 2.2012623459100724, + "loss/hidden": 3.900390625, + "loss/jsd": 0.0, + "loss/logits": 0.26947569735348226, + "step": 10520 + }, + { + "epoch": 0.351, + "grad_norm": 31.875, + "grad_norm_var": 6.551822916666667, + "learning_rate": 0.0001, + "loss": 8.5346, + "loss/crossentropy": 2.1797248646616936, + "loss/hidden": 3.939453125, + "loss/jsd": 0.0, + "loss/logits": 0.2714757215231657, + "step": 10530 + }, + { + "epoch": 0.35133333333333333, + "grad_norm": 28.875, + "grad_norm_var": 9.1759765625, + "learning_rate": 0.0001, + "loss": 8.271, + "loss/crossentropy": 2.2226035714149477, + "loss/hidden": 3.975, + "loss/jsd": 0.0, + "loss/logits": 0.2569460779428482, + "step": 10540 + }, + { + "epoch": 0.3516666666666667, + "grad_norm": 34.0, + "grad_norm_var": 9.7556640625, + "learning_rate": 0.0001, + "loss": 8.4072, + "loss/crossentropy": 2.0563524261116983, + "loss/hidden": 3.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.22892842460423707, + "step": 10550 + }, + { + "epoch": 0.352, + "grad_norm": 38.5, + "grad_norm_var": 13.52265625, + "learning_rate": 0.0001, + "loss": 8.3639, + "loss/crossentropy": 2.146382841467857, + "loss/hidden": 3.823046875, + "loss/jsd": 0.0, + "loss/logits": 0.2412761567160487, + "step": 10560 + }, + { + "epoch": 0.35233333333333333, + "grad_norm": 31.0, + "grad_norm_var": 14.380143229166666, + "learning_rate": 0.0001, + "loss": 8.4058, + "loss/crossentropy": 2.0560551561415195, + "loss/hidden": 3.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.22143340446054935, + "step": 10570 + }, + { + "epoch": 0.3526666666666667, + "grad_norm": 31.375, + "grad_norm_var": 6.331705729166667, + "learning_rate": 0.0001, + "loss": 8.4463, + "loss/crossentropy": 1.9261908829212189, + "loss/hidden": 3.851953125, + "loss/jsd": 0.0, + "loss/logits": 0.23094973117113113, + "step": 10580 + }, + { + "epoch": 0.353, + "grad_norm": 31.125, + "grad_norm_var": 765.9447916666667, + "learning_rate": 0.0001, + "loss": 8.561, + "loss/crossentropy": 2.198425108194351, + "loss/hidden": 3.90390625, + "loss/jsd": 0.0, + "loss/logits": 0.32166901491582395, + "step": 10590 + }, + { + "epoch": 0.35333333333333333, + "grad_norm": 34.0, + "grad_norm_var": 68.19557291666666, + "learning_rate": 0.0001, + "loss": 8.3153, + "loss/crossentropy": 2.0817312106490133, + "loss/hidden": 3.9421875, + "loss/jsd": 0.0, + "loss/logits": 0.24946041442453862, + "step": 10600 + }, + { + "epoch": 0.3536666666666667, + "grad_norm": 32.75, + "grad_norm_var": 3.442122395833333, + "learning_rate": 0.0001, + "loss": 8.2396, + "loss/crossentropy": 2.0021930016577243, + "loss/hidden": 3.927734375, + "loss/jsd": 0.0, + "loss/logits": 0.23456851877272128, + "step": 10610 + }, + { + "epoch": 0.354, + "grad_norm": 31.25, + "grad_norm_var": 2.9041015625, + "learning_rate": 0.0001, + "loss": 8.5516, + "loss/crossentropy": 2.3048987478017806, + "loss/hidden": 3.7765625, + "loss/jsd": 0.0, + "loss/logits": 0.2382359316572547, + "step": 10620 + }, + { + "epoch": 0.35433333333333333, + "grad_norm": 29.625, + "grad_norm_var": 2.2348307291666667, + "learning_rate": 0.0001, + "loss": 8.2786, + "loss/crossentropy": 2.1949386417865755, + "loss/hidden": 3.804296875, + "loss/jsd": 0.0, + "loss/logits": 0.2493376847356558, + "step": 10630 + }, + { + "epoch": 0.3546666666666667, + "grad_norm": 29.375, + "grad_norm_var": 3.965625, + "learning_rate": 0.0001, + "loss": 8.3273, + "loss/crossentropy": 2.0018108122050764, + "loss/hidden": 3.7875, + "loss/jsd": 0.0, + "loss/logits": 0.21743584834039212, + "step": 10640 + }, + { + "epoch": 0.355, + "grad_norm": 31.75, + "grad_norm_var": 44.281184895833334, + "learning_rate": 0.0001, + "loss": 8.2713, + "loss/crossentropy": 2.1437037006020545, + "loss/hidden": 3.738671875, + "loss/jsd": 0.0, + "loss/logits": 0.23090928047895432, + "step": 10650 + }, + { + "epoch": 0.35533333333333333, + "grad_norm": 30.5, + "grad_norm_var": 3.9192575434663747e+18, + "learning_rate": 0.0001, + "loss": 8.478, + "loss/crossentropy": 2.2309256963431836, + "loss/hidden": 3.974609375, + "loss/jsd": 0.0, + "loss/logits": 0.2605215635150671, + "step": 10660 + }, + { + "epoch": 0.3556666666666667, + "grad_norm": 29.375, + "grad_norm_var": 12.66015625, + "learning_rate": 0.0001, + "loss": 8.3008, + "loss/crossentropy": 2.087778661772609, + "loss/hidden": 3.8953125, + "loss/jsd": 0.0, + "loss/logits": 0.24346806921530514, + "step": 10670 + }, + { + "epoch": 0.356, + "grad_norm": 28.375, + "grad_norm_var": 3.5434895833333333, + "learning_rate": 0.0001, + "loss": 8.4067, + "loss/crossentropy": 2.0724117450416086, + "loss/hidden": 3.762890625, + "loss/jsd": 0.0, + "loss/logits": 0.23170766066759824, + "step": 10680 + }, + { + "epoch": 0.35633333333333334, + "grad_norm": 28.25, + "grad_norm_var": 3.880143229166667, + "learning_rate": 0.0001, + "loss": 8.3141, + "loss/crossentropy": 2.0804959647357464, + "loss/hidden": 3.932421875, + "loss/jsd": 0.0, + "loss/logits": 0.26033860705792905, + "step": 10690 + }, + { + "epoch": 0.3566666666666667, + "grad_norm": 31.375, + "grad_norm_var": 2.2768229166666667, + "learning_rate": 0.0001, + "loss": 8.2118, + "loss/crossentropy": 2.1898273028433324, + "loss/hidden": 3.808203125, + "loss/jsd": 0.0, + "loss/logits": 0.23994966400787235, + "step": 10700 + }, + { + "epoch": 0.357, + "grad_norm": 30.625, + "grad_norm_var": 38.139322916666664, + "learning_rate": 0.0001, + "loss": 8.3548, + "loss/crossentropy": 2.1299043744802475, + "loss/hidden": 3.891796875, + "loss/jsd": 0.0, + "loss/logits": 0.25432286225259304, + "step": 10710 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 30.5, + "grad_norm_var": 2.2309895833333333, + "learning_rate": 0.0001, + "loss": 8.3522, + "loss/crossentropy": 2.230180199444294, + "loss/hidden": 3.7171875, + "loss/jsd": 0.0, + "loss/logits": 0.22721530161798, + "step": 10720 + }, + { + "epoch": 0.3576666666666667, + "grad_norm": 30.375, + "grad_norm_var": 7.738541666666666, + "learning_rate": 0.0001, + "loss": 8.23, + "loss/crossentropy": 2.0474620938301085, + "loss/hidden": 3.76640625, + "loss/jsd": 0.0, + "loss/logits": 0.2256452445872128, + "step": 10730 + }, + { + "epoch": 0.358, + "grad_norm": 31.25, + "grad_norm_var": 3.0249348958333333, + "learning_rate": 0.0001, + "loss": 8.3739, + "loss/crossentropy": 2.152415704727173, + "loss/hidden": 3.884765625, + "loss/jsd": 0.0, + "loss/logits": 0.2414580164477229, + "step": 10740 + }, + { + "epoch": 0.35833333333333334, + "grad_norm": 35.5, + "grad_norm_var": 4.0625, + "learning_rate": 0.0001, + "loss": 8.205, + "loss/crossentropy": 2.3166534900665283, + "loss/hidden": 3.880859375, + "loss/jsd": 0.0, + "loss/logits": 0.2524943361058831, + "step": 10750 + }, + { + "epoch": 0.3586666666666667, + "grad_norm": 32.25, + "grad_norm_var": 3.746875, + "learning_rate": 0.0001, + "loss": 8.3312, + "loss/crossentropy": 2.152712790668011, + "loss/hidden": 3.825, + "loss/jsd": 0.0, + "loss/logits": 0.2455398654565215, + "step": 10760 + }, + { + "epoch": 0.359, + "grad_norm": 29.0, + "grad_norm_var": 2.4900390625, + "learning_rate": 0.0001, + "loss": 8.3316, + "loss/crossentropy": 2.1520379945635795, + "loss/hidden": 3.7921875, + "loss/jsd": 0.0, + "loss/logits": 0.23001325819641352, + "step": 10770 + }, + { + "epoch": 0.35933333333333334, + "grad_norm": 30.5, + "grad_norm_var": 2.9306640625, + "learning_rate": 0.0001, + "loss": 8.2616, + "loss/crossentropy": 1.945100226998329, + "loss/hidden": 3.8171875, + "loss/jsd": 0.0, + "loss/logits": 0.23169657299295068, + "step": 10780 + }, + { + "epoch": 0.3596666666666667, + "grad_norm": 59.0, + "grad_norm_var": 58.0666015625, + "learning_rate": 0.0001, + "loss": 8.3974, + "loss/crossentropy": 2.103043520450592, + "loss/hidden": 3.84921875, + "loss/jsd": 0.0, + "loss/logits": 0.24081441648304464, + "step": 10790 + }, + { + "epoch": 0.36, + "grad_norm": 29.0, + "grad_norm_var": 59.16087239583333, + "learning_rate": 0.0001, + "loss": 8.2643, + "loss/crossentropy": 1.9263419553637504, + "loss/hidden": 3.848046875, + "loss/jsd": 0.0, + "loss/logits": 0.22828295342624189, + "step": 10800 + }, + { + "epoch": 0.36033333333333334, + "grad_norm": 41.75, + "grad_norm_var": 14.025, + "learning_rate": 0.0001, + "loss": 8.3654, + "loss/crossentropy": 2.2317003183066846, + "loss/hidden": 3.78828125, + "loss/jsd": 0.0, + "loss/logits": 0.23634406868368388, + "step": 10810 + }, + { + "epoch": 0.3606666666666667, + "grad_norm": 28.75, + "grad_norm_var": 11.925, + "learning_rate": 0.0001, + "loss": 8.3209, + "loss/crossentropy": 2.0176270991563796, + "loss/hidden": 3.842578125, + "loss/jsd": 0.0, + "loss/logits": 0.2656209450215101, + "step": 10820 + }, + { + "epoch": 0.361, + "grad_norm": 29.875, + "grad_norm_var": 3.6561848958333334, + "learning_rate": 0.0001, + "loss": 8.3232, + "loss/crossentropy": 2.127125210314989, + "loss/hidden": 3.6890625, + "loss/jsd": 0.0, + "loss/logits": 0.23250760212540628, + "step": 10830 + }, + { + "epoch": 0.36133333333333334, + "grad_norm": 27.625, + "grad_norm_var": 14.4447265625, + "learning_rate": 0.0001, + "loss": 8.3106, + "loss/crossentropy": 2.065077592432499, + "loss/hidden": 3.804296875, + "loss/jsd": 0.0, + "loss/logits": 0.22122176755219697, + "step": 10840 + }, + { + "epoch": 0.3616666666666667, + "grad_norm": 32.75, + "grad_norm_var": 14.9212890625, + "learning_rate": 0.0001, + "loss": 8.1455, + "loss/crossentropy": 1.9944854885339738, + "loss/hidden": 3.878515625, + "loss/jsd": 0.0, + "loss/logits": 0.2446516625583172, + "step": 10850 + }, + { + "epoch": 0.362, + "grad_norm": 28.125, + "grad_norm_var": 16.002018229166666, + "learning_rate": 0.0001, + "loss": 8.1836, + "loss/crossentropy": 2.1445549219846725, + "loss/hidden": 3.805078125, + "loss/jsd": 0.0, + "loss/logits": 0.24592317193746566, + "step": 10860 + }, + { + "epoch": 0.36233333333333334, + "grad_norm": 32.0, + "grad_norm_var": 4.886393229166667, + "learning_rate": 0.0001, + "loss": 8.3139, + "loss/crossentropy": 2.0830292530357837, + "loss/hidden": 3.759765625, + "loss/jsd": 0.0, + "loss/logits": 0.2338992802426219, + "step": 10870 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 28.25, + "grad_norm_var": 8.9744140625, + "learning_rate": 0.0001, + "loss": 8.3703, + "loss/crossentropy": 2.074661585688591, + "loss/hidden": 3.763671875, + "loss/jsd": 0.0, + "loss/logits": 0.2267523631453514, + "step": 10880 + }, + { + "epoch": 0.363, + "grad_norm": 30.0, + "grad_norm_var": 6.71015625, + "learning_rate": 0.0001, + "loss": 8.2861, + "loss/crossentropy": 2.218615745007992, + "loss/hidden": 3.708984375, + "loss/jsd": 0.0, + "loss/logits": 0.2329158153384924, + "step": 10890 + }, + { + "epoch": 0.36333333333333334, + "grad_norm": 31.5, + "grad_norm_var": 2.10390625, + "learning_rate": 0.0001, + "loss": 8.4114, + "loss/crossentropy": 2.2118052423000334, + "loss/hidden": 3.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.24035598244518042, + "step": 10900 + }, + { + "epoch": 0.3636666666666667, + "grad_norm": 32.25, + "grad_norm_var": 6.5916015625, + "learning_rate": 0.0001, + "loss": 8.253, + "loss/crossentropy": 1.9713818281888962, + "loss/hidden": 3.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.21810249648988247, + "step": 10910 + }, + { + "epoch": 0.364, + "grad_norm": 28.125, + "grad_norm_var": 3.3604166666666666, + "learning_rate": 0.0001, + "loss": 8.402, + "loss/crossentropy": 2.087474272400141, + "loss/hidden": 3.903515625, + "loss/jsd": 0.0, + "loss/logits": 0.25297512784600257, + "step": 10920 + }, + { + "epoch": 0.36433333333333334, + "grad_norm": 29.0, + "grad_norm_var": 2.8676432291666667, + "learning_rate": 0.0001, + "loss": 8.3376, + "loss/crossentropy": 2.055064349621534, + "loss/hidden": 3.871484375, + "loss/jsd": 0.0, + "loss/logits": 0.23781403247267008, + "step": 10930 + }, + { + "epoch": 0.36466666666666664, + "grad_norm": 29.25, + "grad_norm_var": 3.84140625, + "learning_rate": 0.0001, + "loss": 8.3393, + "loss/crossentropy": 2.240050254762173, + "loss/hidden": 3.837890625, + "loss/jsd": 0.0, + "loss/logits": 0.26323652667924763, + "step": 10940 + }, + { + "epoch": 0.365, + "grad_norm": 31.875, + "grad_norm_var": 5.914322916666666, + "learning_rate": 0.0001, + "loss": 8.4046, + "loss/crossentropy": 2.0274403050541876, + "loss/hidden": 3.884765625, + "loss/jsd": 0.0, + "loss/logits": 0.255847645457834, + "step": 10950 + }, + { + "epoch": 0.36533333333333334, + "grad_norm": 31.625, + "grad_norm_var": 4.284375, + "learning_rate": 0.0001, + "loss": 8.2417, + "loss/crossentropy": 2.148882707953453, + "loss/hidden": 3.7375, + "loss/jsd": 0.0, + "loss/logits": 0.22276342548429967, + "step": 10960 + }, + { + "epoch": 0.36566666666666664, + "grad_norm": 34.75, + "grad_norm_var": 4.983072916666667, + "learning_rate": 0.0001, + "loss": 8.2975, + "loss/crossentropy": 2.1014214023947715, + "loss/hidden": 3.73359375, + "loss/jsd": 0.0, + "loss/logits": 0.2284602228552103, + "step": 10970 + }, + { + "epoch": 0.366, + "grad_norm": 32.0, + "grad_norm_var": 2.1797421953052575e+18, + "learning_rate": 0.0001, + "loss": 8.3736, + "loss/crossentropy": 2.01134799271822, + "loss/hidden": 3.832421875, + "loss/jsd": 0.0, + "loss/logits": 0.23655065074563025, + "step": 10980 + }, + { + "epoch": 0.36633333333333334, + "grad_norm": 29.75, + "grad_norm_var": 8.880989583333333, + "learning_rate": 0.0001, + "loss": 8.3184, + "loss/crossentropy": 2.019283553212881, + "loss/hidden": 3.891015625, + "loss/jsd": 0.0, + "loss/logits": 0.24222001079469918, + "step": 10990 + }, + { + "epoch": 0.36666666666666664, + "grad_norm": 34.75, + "grad_norm_var": 4.905989583333334, + "learning_rate": 0.0001, + "loss": 8.1939, + "loss/crossentropy": 2.0408636704087257, + "loss/hidden": 3.95625, + "loss/jsd": 0.0, + "loss/logits": 0.257934401743114, + "step": 11000 + }, + { + "epoch": 0.367, + "grad_norm": 31.125, + "grad_norm_var": 6.711393229166666, + "learning_rate": 0.0001, + "loss": 8.3125, + "loss/crossentropy": 1.987822836637497, + "loss/hidden": 3.8765625, + "loss/jsd": 0.0, + "loss/logits": 0.24592833667993547, + "step": 11010 + }, + { + "epoch": 0.36733333333333335, + "grad_norm": 28.0, + "grad_norm_var": 10.1884765625, + "learning_rate": 0.0001, + "loss": 8.2068, + "loss/crossentropy": 1.934238361567259, + "loss/hidden": 3.6390625, + "loss/jsd": 0.0, + "loss/logits": 0.20752198286354542, + "step": 11020 + }, + { + "epoch": 0.36766666666666664, + "grad_norm": 30.75, + "grad_norm_var": 14.797330729166667, + "learning_rate": 0.0001, + "loss": 8.3522, + "loss/crossentropy": 2.091546893119812, + "loss/hidden": 3.8109375, + "loss/jsd": 0.0, + "loss/logits": 0.22276817485690117, + "step": 11030 + }, + { + "epoch": 0.368, + "grad_norm": 32.25, + "grad_norm_var": 10.873372395833334, + "learning_rate": 0.0001, + "loss": 8.26, + "loss/crossentropy": 2.046231422573328, + "loss/hidden": 3.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.218076004460454, + "step": 11040 + }, + { + "epoch": 0.36833333333333335, + "grad_norm": 29.625, + "grad_norm_var": 9.6994140625, + "learning_rate": 0.0001, + "loss": 8.2113, + "loss/crossentropy": 2.1118013873696326, + "loss/hidden": 3.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.22814447209239005, + "step": 11050 + }, + { + "epoch": 0.36866666666666664, + "grad_norm": 32.75, + "grad_norm_var": 5.477018229166666, + "learning_rate": 0.0001, + "loss": 8.3915, + "loss/crossentropy": 2.138951501250267, + "loss/hidden": 3.796484375, + "loss/jsd": 0.0, + "loss/logits": 0.2464024931192398, + "step": 11060 + }, + { + "epoch": 0.369, + "grad_norm": 32.5, + "grad_norm_var": 4.268489583333333, + "learning_rate": 0.0001, + "loss": 8.2656, + "loss/crossentropy": 2.0544290356338024, + "loss/hidden": 3.797265625, + "loss/jsd": 0.0, + "loss/logits": 0.21736350897699594, + "step": 11070 + }, + { + "epoch": 0.36933333333333335, + "grad_norm": 30.625, + "grad_norm_var": 2.41640625, + "learning_rate": 0.0001, + "loss": 8.2015, + "loss/crossentropy": 2.0753560826182365, + "loss/hidden": 3.994921875, + "loss/jsd": 0.0, + "loss/logits": 0.25065676774829626, + "step": 11080 + }, + { + "epoch": 0.36966666666666664, + "grad_norm": 32.5, + "grad_norm_var": 2.3811848958333335, + "learning_rate": 0.0001, + "loss": 8.3318, + "loss/crossentropy": 2.3124043948948385, + "loss/hidden": 3.866015625, + "loss/jsd": 0.0, + "loss/logits": 0.25761293675750496, + "step": 11090 + }, + { + "epoch": 0.37, + "grad_norm": 29.375, + "grad_norm_var": 75.17057291666667, + "learning_rate": 0.0001, + "loss": 8.3787, + "loss/crossentropy": 2.0899481564760207, + "loss/hidden": 3.833984375, + "loss/jsd": 0.0, + "loss/logits": 0.24109712056815624, + "step": 11100 + }, + { + "epoch": 0.37033333333333335, + "grad_norm": 29.5, + "grad_norm_var": 87.39973958333333, + "learning_rate": 0.0001, + "loss": 8.2869, + "loss/crossentropy": 2.068728582933545, + "loss/hidden": 3.794921875, + "loss/jsd": 0.0, + "loss/logits": 0.21987394848838449, + "step": 11110 + }, + { + "epoch": 0.37066666666666664, + "grad_norm": 30.875, + "grad_norm_var": 20.74765625, + "learning_rate": 0.0001, + "loss": 8.3058, + "loss/crossentropy": 2.1063128843903542, + "loss/hidden": 3.7140625, + "loss/jsd": 0.0, + "loss/logits": 0.22465858031064273, + "step": 11120 + }, + { + "epoch": 0.371, + "grad_norm": 31.125, + "grad_norm_var": 34.37805989583333, + "learning_rate": 0.0001, + "loss": 8.2634, + "loss/crossentropy": 2.128019214421511, + "loss/hidden": 3.806640625, + "loss/jsd": 0.0, + "loss/logits": 0.2511159829795361, + "step": 11130 + }, + { + "epoch": 0.37133333333333335, + "grad_norm": 28.875, + "grad_norm_var": 216.6681640625, + "learning_rate": 0.0001, + "loss": 8.3359, + "loss/crossentropy": 2.0605734646320344, + "loss/hidden": 3.862109375, + "loss/jsd": 0.0, + "loss/logits": 0.23643396981060505, + "step": 11140 + }, + { + "epoch": 0.37166666666666665, + "grad_norm": 29.75, + "grad_norm_var": 218.5197265625, + "learning_rate": 0.0001, + "loss": 8.3266, + "loss/crossentropy": 1.8941866405308248, + "loss/hidden": 4.06171875, + "loss/jsd": 0.0, + "loss/logits": 0.2489516731351614, + "step": 11150 + }, + { + "epoch": 0.372, + "grad_norm": 31.75, + "grad_norm_var": 2.745247395833333, + "learning_rate": 0.0001, + "loss": 8.2646, + "loss/crossentropy": 2.023110543191433, + "loss/hidden": 3.8078125, + "loss/jsd": 0.0, + "loss/logits": 0.23438771143555642, + "step": 11160 + }, + { + "epoch": 0.37233333333333335, + "grad_norm": 34.5, + "grad_norm_var": 5.59140625, + "learning_rate": 0.0001, + "loss": 8.293, + "loss/crossentropy": 2.115256902575493, + "loss/hidden": 3.8171875, + "loss/jsd": 0.0, + "loss/logits": 0.23501987420022488, + "step": 11170 + }, + { + "epoch": 0.37266666666666665, + "grad_norm": 33.0, + "grad_norm_var": 5.412239583333333, + "learning_rate": 0.0001, + "loss": 8.2813, + "loss/crossentropy": 2.183938892185688, + "loss/hidden": 3.8171875, + "loss/jsd": 0.0, + "loss/logits": 0.23923480361700059, + "step": 11180 + }, + { + "epoch": 0.373, + "grad_norm": 29.5, + "grad_norm_var": 4.455989583333333, + "learning_rate": 0.0001, + "loss": 8.2506, + "loss/crossentropy": 2.09958486109972, + "loss/hidden": 3.836328125, + "loss/jsd": 0.0, + "loss/logits": 0.24175492376089097, + "step": 11190 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 29.375, + "grad_norm_var": 3.034375, + "learning_rate": 0.0001, + "loss": 8.1771, + "loss/crossentropy": 2.150550900399685, + "loss/hidden": 3.712109375, + "loss/jsd": 0.0, + "loss/logits": 0.2199710313230753, + "step": 11200 + }, + { + "epoch": 0.37366666666666665, + "grad_norm": 39.0, + "grad_norm_var": 7.083072916666667, + "learning_rate": 0.0001, + "loss": 8.3038, + "loss/crossentropy": 2.160426365584135, + "loss/hidden": 3.8125, + "loss/jsd": 0.0, + "loss/logits": 0.22729940414428712, + "step": 11210 + }, + { + "epoch": 0.374, + "grad_norm": 29.625, + "grad_norm_var": 8.17890625, + "learning_rate": 0.0001, + "loss": 8.3813, + "loss/crossentropy": 2.2687479466199876, + "loss/hidden": 3.866796875, + "loss/jsd": 0.0, + "loss/logits": 0.2643072698265314, + "step": 11220 + }, + { + "epoch": 0.37433333333333335, + "grad_norm": 33.25, + "grad_norm_var": 4.737434895833333, + "learning_rate": 0.0001, + "loss": 8.2897, + "loss/crossentropy": 1.9880081087350845, + "loss/hidden": 3.82265625, + "loss/jsd": 0.0, + "loss/logits": 0.23600170221179723, + "step": 11230 + }, + { + "epoch": 0.37466666666666665, + "grad_norm": 30.25, + "grad_norm_var": 2.0268229166666667, + "learning_rate": 0.0001, + "loss": 8.2243, + "loss/crossentropy": 2.157718874514103, + "loss/hidden": 3.725390625, + "loss/jsd": 0.0, + "loss/logits": 0.22776034101843834, + "step": 11240 + }, + { + "epoch": 0.375, + "grad_norm": 31.0, + "grad_norm_var": 1.9270182291666667, + "learning_rate": 0.0001, + "loss": 8.2751, + "loss/crossentropy": 2.1132646039128304, + "loss/hidden": 3.725, + "loss/jsd": 0.0, + "loss/logits": 0.22339010071009396, + "step": 11250 + }, + { + "epoch": 0.37533333333333335, + "grad_norm": 36.75, + "grad_norm_var": 7.060872395833333, + "learning_rate": 0.0001, + "loss": 8.4215, + "loss/crossentropy": 2.206932783126831, + "loss/hidden": 3.808203125, + "loss/jsd": 0.0, + "loss/logits": 0.2356038186699152, + "step": 11260 + }, + { + "epoch": 0.37566666666666665, + "grad_norm": 40.5, + "grad_norm_var": 14.249739583333334, + "learning_rate": 0.0001, + "loss": 8.3276, + "loss/crossentropy": 2.2165247052907944, + "loss/hidden": 3.765234375, + "loss/jsd": 0.0, + "loss/logits": 0.23299887999892235, + "step": 11270 + }, + { + "epoch": 0.376, + "grad_norm": 34.25, + "grad_norm_var": 23.989518229166666, + "learning_rate": 0.0001, + "loss": 8.202, + "loss/crossentropy": 2.1586028307676317, + "loss/hidden": 3.811328125, + "loss/jsd": 0.0, + "loss/logits": 0.23767958618700505, + "step": 11280 + }, + { + "epoch": 0.37633333333333335, + "grad_norm": 28.25, + "grad_norm_var": 6.742122395833333, + "learning_rate": 0.0001, + "loss": 8.2606, + "loss/crossentropy": 2.0374640226364136, + "loss/hidden": 3.73203125, + "loss/jsd": 0.0, + "loss/logits": 0.21241307370364665, + "step": 11290 + }, + { + "epoch": 0.37666666666666665, + "grad_norm": 32.25, + "grad_norm_var": 1.2348307291666667, + "learning_rate": 0.0001, + "loss": 8.2161, + "loss/crossentropy": 2.06580873131752, + "loss/hidden": 3.783984375, + "loss/jsd": 0.0, + "loss/logits": 0.25028328634798525, + "step": 11300 + }, + { + "epoch": 0.377, + "grad_norm": 28.875, + "grad_norm_var": 31.407747395833333, + "learning_rate": 0.0001, + "loss": 8.1998, + "loss/crossentropy": 2.120278796553612, + "loss/hidden": 3.77578125, + "loss/jsd": 0.0, + "loss/logits": 0.24368874300271273, + "step": 11310 + }, + { + "epoch": 0.37733333333333335, + "grad_norm": 31.125, + "grad_norm_var": 14.039322916666666, + "learning_rate": 0.0001, + "loss": 8.3848, + "loss/crossentropy": 2.0408181294798853, + "loss/hidden": 3.86796875, + "loss/jsd": 0.0, + "loss/logits": 0.23626107163727283, + "step": 11320 + }, + { + "epoch": 0.37766666666666665, + "grad_norm": 28.25, + "grad_norm_var": 8.001497395833333, + "learning_rate": 0.0001, + "loss": 8.1908, + "loss/crossentropy": 2.026593156158924, + "loss/hidden": 3.78359375, + "loss/jsd": 0.0, + "loss/logits": 0.21986434515565634, + "step": 11330 + }, + { + "epoch": 0.378, + "grad_norm": 28.375, + "grad_norm_var": 3.3416015625, + "learning_rate": 0.0001, + "loss": 8.1949, + "loss/crossentropy": 2.04437660574913, + "loss/hidden": 3.7546875, + "loss/jsd": 0.0, + "loss/logits": 0.2269467730075121, + "step": 11340 + }, + { + "epoch": 0.37833333333333335, + "grad_norm": 28.0, + "grad_norm_var": 3.505989583333333, + "learning_rate": 0.0001, + "loss": 8.2826, + "loss/crossentropy": 2.074940774589777, + "loss/hidden": 3.762109375, + "loss/jsd": 0.0, + "loss/logits": 0.2203363472595811, + "step": 11350 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 31.375, + "grad_norm_var": 3.77890625, + "learning_rate": 0.0001, + "loss": 8.0388, + "loss/crossentropy": 2.0113267697393895, + "loss/hidden": 3.8703125, + "loss/jsd": 0.0, + "loss/logits": 0.2317951550707221, + "step": 11360 + }, + { + "epoch": 0.379, + "grad_norm": 30.75, + "grad_norm_var": 3.2884765625, + "learning_rate": 0.0001, + "loss": 8.212, + "loss/crossentropy": 2.1366314753890037, + "loss/hidden": 3.79921875, + "loss/jsd": 0.0, + "loss/logits": 0.2496491651982069, + "step": 11370 + }, + { + "epoch": 0.37933333333333336, + "grad_norm": 31.75, + "grad_norm_var": 8.070572916666666, + "learning_rate": 0.0001, + "loss": 8.256, + "loss/crossentropy": 1.991414950788021, + "loss/hidden": 3.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.23439864348620176, + "step": 11380 + }, + { + "epoch": 0.37966666666666665, + "grad_norm": 30.625, + "grad_norm_var": 6.458268229166666, + "learning_rate": 0.0001, + "loss": 8.2754, + "loss/crossentropy": 2.055978857725859, + "loss/hidden": 3.683984375, + "loss/jsd": 0.0, + "loss/logits": 0.20942887850105762, + "step": 11390 + }, + { + "epoch": 0.38, + "grad_norm": 29.75, + "grad_norm_var": 2.795768229166667, + "learning_rate": 0.0001, + "loss": 8.2368, + "loss/crossentropy": 2.2677480787038804, + "loss/hidden": 3.74296875, + "loss/jsd": 0.0, + "loss/logits": 0.24482562113553286, + "step": 11400 + }, + { + "epoch": 0.38033333333333336, + "grad_norm": 29.875, + "grad_norm_var": 6.1806640625, + "learning_rate": 0.0001, + "loss": 8.3087, + "loss/crossentropy": 2.010858987271786, + "loss/hidden": 3.876171875, + "loss/jsd": 0.0, + "loss/logits": 0.2329118952155113, + "step": 11410 + }, + { + "epoch": 0.38066666666666665, + "grad_norm": 29.5, + "grad_norm_var": 3.107291666666667, + "learning_rate": 0.0001, + "loss": 8.3397, + "loss/crossentropy": 2.19465371966362, + "loss/hidden": 3.771484375, + "loss/jsd": 0.0, + "loss/logits": 0.24298481158912183, + "step": 11420 + }, + { + "epoch": 0.381, + "grad_norm": 29.625, + "grad_norm_var": 3.9291015625, + "learning_rate": 0.0001, + "loss": 8.3047, + "loss/crossentropy": 2.1756315886974336, + "loss/hidden": 3.7765625, + "loss/jsd": 0.0, + "loss/logits": 0.23532975129783154, + "step": 11430 + }, + { + "epoch": 0.38133333333333336, + "grad_norm": 32.75, + "grad_norm_var": 2.5580729166666667, + "learning_rate": 0.0001, + "loss": 8.3201, + "loss/crossentropy": 2.1976757258176804, + "loss/hidden": 3.82734375, + "loss/jsd": 0.0, + "loss/logits": 0.2476160578429699, + "step": 11440 + }, + { + "epoch": 0.38166666666666665, + "grad_norm": 5838471168.0, + "grad_norm_var": 2.130484076579337e+18, + "learning_rate": 0.0001, + "loss": 8.2311, + "loss/crossentropy": 2.0670726232230665, + "loss/hidden": 3.984765625, + "loss/jsd": 0.0, + "loss/logits": 0.22541351839900017, + "step": 11450 + }, + { + "epoch": 0.382, + "grad_norm": 29.0, + "grad_norm_var": 2.1304840749737574e+18, + "learning_rate": 0.0001, + "loss": 8.1004, + "loss/crossentropy": 2.2240239530801773, + "loss/hidden": 3.746484375, + "loss/jsd": 0.0, + "loss/logits": 0.23139581680297852, + "step": 11460 + }, + { + "epoch": 0.38233333333333336, + "grad_norm": 29.5, + "grad_norm_var": 35.31555989583333, + "learning_rate": 0.0001, + "loss": 8.2446, + "loss/crossentropy": 2.029728998243809, + "loss/hidden": 3.733203125, + "loss/jsd": 0.0, + "loss/logits": 0.22069137105718256, + "step": 11470 + }, + { + "epoch": 0.38266666666666665, + "grad_norm": 30.125, + "grad_norm_var": 2.24765625, + "learning_rate": 0.0001, + "loss": 8.1198, + "loss/crossentropy": 1.9788580626249312, + "loss/hidden": 3.730078125, + "loss/jsd": 0.0, + "loss/logits": 0.21759489141404628, + "step": 11480 + }, + { + "epoch": 0.383, + "grad_norm": 31.5, + "grad_norm_var": 2.2822916666666666, + "learning_rate": 0.0001, + "loss": 8.1322, + "loss/crossentropy": 2.1184426814317705, + "loss/hidden": 3.819140625, + "loss/jsd": 0.0, + "loss/logits": 0.23590471846982836, + "step": 11490 + }, + { + "epoch": 0.38333333333333336, + "grad_norm": 29.25, + "grad_norm_var": 2.4567057291666665, + "learning_rate": 0.0001, + "loss": 8.3504, + "loss/crossentropy": 2.023914510011673, + "loss/hidden": 3.8296875, + "loss/jsd": 0.0, + "loss/logits": 0.23768907226622105, + "step": 11500 + }, + { + "epoch": 0.38366666666666666, + "grad_norm": 30.5, + "grad_norm_var": 34.16875, + "learning_rate": 0.0001, + "loss": 8.3174, + "loss/crossentropy": 1.9882623553276062, + "loss/hidden": 3.70859375, + "loss/jsd": 0.0, + "loss/logits": 0.20902501344680785, + "step": 11510 + }, + { + "epoch": 0.384, + "grad_norm": 31.875, + "grad_norm_var": 37.2625, + "learning_rate": 0.0001, + "loss": 8.3973, + "loss/crossentropy": 1.9671563521027564, + "loss/hidden": 3.816796875, + "loss/jsd": 0.0, + "loss/logits": 0.21824515145272017, + "step": 11520 + }, + { + "epoch": 0.38433333333333336, + "grad_norm": 36.0, + "grad_norm_var": 9.521809895833334, + "learning_rate": 0.0001, + "loss": 8.2993, + "loss/crossentropy": 2.104415476322174, + "loss/hidden": 3.745703125, + "loss/jsd": 0.0, + "loss/logits": 0.24023280292749405, + "step": 11530 + }, + { + "epoch": 0.38466666666666666, + "grad_norm": 30.375, + "grad_norm_var": 6.6275390625, + "learning_rate": 0.0001, + "loss": 8.2753, + "loss/crossentropy": 2.1653599768877028, + "loss/hidden": 3.816796875, + "loss/jsd": 0.0, + "loss/logits": 0.22879305072128772, + "step": 11540 + }, + { + "epoch": 0.385, + "grad_norm": 30.125, + "grad_norm_var": 6.420572916666667, + "learning_rate": 0.0001, + "loss": 8.2014, + "loss/crossentropy": 2.0401563957333564, + "loss/hidden": 3.77890625, + "loss/jsd": 0.0, + "loss/logits": 0.23115058969706298, + "step": 11550 + }, + { + "epoch": 0.38533333333333336, + "grad_norm": 29.25, + "grad_norm_var": 10.065559895833333, + "learning_rate": 0.0001, + "loss": 8.3626, + "loss/crossentropy": 2.2447339951992036, + "loss/hidden": 3.770703125, + "loss/jsd": 0.0, + "loss/logits": 0.2418015170842409, + "step": 11560 + }, + { + "epoch": 0.38566666666666666, + "grad_norm": 32.75, + "grad_norm_var": 3.3020833333333335, + "learning_rate": 0.0001, + "loss": 8.4421, + "loss/crossentropy": 2.1326376996934413, + "loss/hidden": 3.803515625, + "loss/jsd": 0.0, + "loss/logits": 0.24543905295431614, + "step": 11570 + }, + { + "epoch": 0.386, + "grad_norm": 30.625, + "grad_norm_var": 3.28125, + "learning_rate": 0.0001, + "loss": 8.2912, + "loss/crossentropy": 2.099239933490753, + "loss/hidden": 3.776953125, + "loss/jsd": 0.0, + "loss/logits": 0.235281278192997, + "step": 11580 + }, + { + "epoch": 0.3863333333333333, + "grad_norm": 35.0, + "grad_norm_var": 12.728580729166667, + "learning_rate": 0.0001, + "loss": 8.3226, + "loss/crossentropy": 2.091626935452223, + "loss/hidden": 3.803515625, + "loss/jsd": 0.0, + "loss/logits": 0.22663789633661507, + "step": 11590 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 31.375, + "grad_norm_var": 4.16015625, + "learning_rate": 0.0001, + "loss": 8.2711, + "loss/crossentropy": 1.935661745071411, + "loss/hidden": 3.773046875, + "loss/jsd": 0.0, + "loss/logits": 0.22390243038535118, + "step": 11600 + }, + { + "epoch": 0.387, + "grad_norm": 42.25, + "grad_norm_var": 18.571809895833333, + "learning_rate": 0.0001, + "loss": 8.3944, + "loss/crossentropy": 2.2436830163002015, + "loss/hidden": 3.830078125, + "loss/jsd": 0.0, + "loss/logits": 0.26995023861527445, + "step": 11610 + }, + { + "epoch": 0.3873333333333333, + "grad_norm": 36.25, + "grad_norm_var": 19.981705729166666, + "learning_rate": 0.0001, + "loss": 8.2244, + "loss/crossentropy": 2.096161872893572, + "loss/hidden": 3.787109375, + "loss/jsd": 0.0, + "loss/logits": 0.25067653246223925, + "step": 11620 + }, + { + "epoch": 0.38766666666666666, + "grad_norm": 31.375, + "grad_norm_var": 7.373893229166667, + "learning_rate": 0.0001, + "loss": 8.3287, + "loss/crossentropy": 2.2269149988889696, + "loss/hidden": 3.855859375, + "loss/jsd": 0.0, + "loss/logits": 0.2346229925751686, + "step": 11630 + }, + { + "epoch": 0.388, + "grad_norm": 31.75, + "grad_norm_var": 4.336458333333334, + "learning_rate": 0.0001, + "loss": 8.3193, + "loss/crossentropy": 2.1625199913978577, + "loss/hidden": 3.838671875, + "loss/jsd": 0.0, + "loss/logits": 0.24539714939892293, + "step": 11640 + }, + { + "epoch": 0.3883333333333333, + "grad_norm": 33.5, + "grad_norm_var": 24.7009765625, + "learning_rate": 0.0001, + "loss": 8.4532, + "loss/crossentropy": 2.1474158462136983, + "loss/hidden": 3.900390625, + "loss/jsd": 0.0, + "loss/logits": 0.24705803375691177, + "step": 11650 + }, + { + "epoch": 0.38866666666666666, + "grad_norm": 30.0, + "grad_norm_var": 4.01875, + "learning_rate": 0.0001, + "loss": 8.1543, + "loss/crossentropy": 2.1757007278501987, + "loss/hidden": 3.7453125, + "loss/jsd": 0.0, + "loss/logits": 0.23021320514380933, + "step": 11660 + }, + { + "epoch": 0.389, + "grad_norm": 32.25, + "grad_norm_var": 13.9056640625, + "learning_rate": 0.0001, + "loss": 8.3819, + "loss/crossentropy": 2.0909801930189134, + "loss/hidden": 3.8921875, + "loss/jsd": 0.0, + "loss/logits": 0.25553538724780084, + "step": 11670 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 33.25, + "grad_norm_var": 13.77265625, + "learning_rate": 0.0001, + "loss": 8.342, + "loss/crossentropy": 2.209575629234314, + "loss/hidden": 3.7796875, + "loss/jsd": 0.0, + "loss/logits": 0.24838075898587703, + "step": 11680 + }, + { + "epoch": 0.38966666666666666, + "grad_norm": 32.0, + "grad_norm_var": 2.9744140625, + "learning_rate": 0.0001, + "loss": 8.249, + "loss/crossentropy": 2.150561396032572, + "loss/hidden": 3.758984375, + "loss/jsd": 0.0, + "loss/logits": 0.23041013162583113, + "step": 11690 + }, + { + "epoch": 0.39, + "grad_norm": 28.5, + "grad_norm_var": 2.283072916666667, + "learning_rate": 0.0001, + "loss": 8.3467, + "loss/crossentropy": 1.9362058877944945, + "loss/hidden": 3.861328125, + "loss/jsd": 0.0, + "loss/logits": 0.2275281075388193, + "step": 11700 + }, + { + "epoch": 0.3903333333333333, + "grad_norm": 30.25, + "grad_norm_var": 9.784375, + "learning_rate": 0.0001, + "loss": 8.2697, + "loss/crossentropy": 2.184156297147274, + "loss/hidden": 3.823046875, + "loss/jsd": 0.0, + "loss/logits": 0.227661694213748, + "step": 11710 + }, + { + "epoch": 0.39066666666666666, + "grad_norm": 30.0, + "grad_norm_var": 7.74140625, + "learning_rate": 0.0001, + "loss": 8.2916, + "loss/crossentropy": 2.005098359286785, + "loss/hidden": 3.771875, + "loss/jsd": 0.0, + "loss/logits": 0.22573864944279193, + "step": 11720 + }, + { + "epoch": 0.391, + "grad_norm": 39.0, + "grad_norm_var": 8.258333333333333, + "learning_rate": 0.0001, + "loss": 8.2232, + "loss/crossentropy": 1.9749820090830326, + "loss/hidden": 3.86328125, + "loss/jsd": 0.0, + "loss/logits": 0.2378404688090086, + "step": 11730 + }, + { + "epoch": 0.3913333333333333, + "grad_norm": 29.5, + "grad_norm_var": 9.064322916666667, + "learning_rate": 0.0001, + "loss": 8.2125, + "loss/crossentropy": 2.0756162479519844, + "loss/hidden": 3.665234375, + "loss/jsd": 0.0, + "loss/logits": 0.22730147559195757, + "step": 11740 + }, + { + "epoch": 0.39166666666666666, + "grad_norm": 30.75, + "grad_norm_var": 6.217708333333333, + "learning_rate": 0.0001, + "loss": 8.2565, + "loss/crossentropy": 2.1849037185311317, + "loss/hidden": 3.685546875, + "loss/jsd": 0.0, + "loss/logits": 0.2365235961973667, + "step": 11750 + }, + { + "epoch": 0.392, + "grad_norm": 30.25, + "grad_norm_var": 2.765625, + "learning_rate": 0.0001, + "loss": 8.2262, + "loss/crossentropy": 2.0965361006557943, + "loss/hidden": 3.73515625, + "loss/jsd": 0.0, + "loss/logits": 0.2350387828424573, + "step": 11760 + }, + { + "epoch": 0.3923333333333333, + "grad_norm": 29.625, + "grad_norm_var": 1.9030598958333333, + "learning_rate": 0.0001, + "loss": 8.2198, + "loss/crossentropy": 2.1148156762123107, + "loss/hidden": 3.747265625, + "loss/jsd": 0.0, + "loss/logits": 0.24504089988768102, + "step": 11770 + }, + { + "epoch": 0.39266666666666666, + "grad_norm": 30.75, + "grad_norm_var": 1.6375, + "learning_rate": 0.0001, + "loss": 8.1997, + "loss/crossentropy": 1.9177335992455482, + "loss/hidden": 3.763671875, + "loss/jsd": 0.0, + "loss/logits": 0.22061962708830835, + "step": 11780 + }, + { + "epoch": 0.393, + "grad_norm": 30.0, + "grad_norm_var": 2.4176432291666665, + "learning_rate": 0.0001, + "loss": 8.1247, + "loss/crossentropy": 1.9748799093067646, + "loss/hidden": 3.68125, + "loss/jsd": 0.0, + "loss/logits": 0.21441790107637643, + "step": 11790 + }, + { + "epoch": 0.3933333333333333, + "grad_norm": 28.25, + "grad_norm_var": 4.838997395833333, + "learning_rate": 0.0001, + "loss": 8.167, + "loss/crossentropy": 2.2271966516971586, + "loss/hidden": 3.799609375, + "loss/jsd": 0.0, + "loss/logits": 0.25528619475662706, + "step": 11800 + }, + { + "epoch": 0.39366666666666666, + "grad_norm": 33.75, + "grad_norm_var": 5.270768229166666, + "learning_rate": 0.0001, + "loss": 8.2939, + "loss/crossentropy": 2.261426217854023, + "loss/hidden": 3.82578125, + "loss/jsd": 0.0, + "loss/logits": 0.2428593784570694, + "step": 11810 + }, + { + "epoch": 0.394, + "grad_norm": 34.25, + "grad_norm_var": 10.937434895833333, + "learning_rate": 0.0001, + "loss": 8.4139, + "loss/crossentropy": 1.9368678316473962, + "loss/hidden": 3.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.22458538115024568, + "step": 11820 + }, + { + "epoch": 0.3943333333333333, + "grad_norm": 28.5, + "grad_norm_var": 44247710555649.48, + "learning_rate": 0.0001, + "loss": 8.368, + "loss/crossentropy": 2.1160020515322686, + "loss/hidden": 3.730078125, + "loss/jsd": 0.0, + "loss/logits": 0.2263868011534214, + "step": 11830 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 27.625, + "grad_norm_var": 96.88098958333333, + "learning_rate": 0.0001, + "loss": 8.2835, + "loss/crossentropy": 2.120930030941963, + "loss/hidden": 3.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.21769896019250154, + "step": 11840 + }, + { + "epoch": 0.395, + "grad_norm": 30.375, + "grad_norm_var": 4.537955729166667, + "learning_rate": 0.0001, + "loss": 8.0825, + "loss/crossentropy": 2.06858219653368, + "loss/hidden": 3.687890625, + "loss/jsd": 0.0, + "loss/logits": 0.21344130001962186, + "step": 11850 + }, + { + "epoch": 0.3953333333333333, + "grad_norm": 33.0, + "grad_norm_var": 12.7337890625, + "learning_rate": 0.0001, + "loss": 8.3254, + "loss/crossentropy": 2.1121141463518143, + "loss/hidden": 3.840234375, + "loss/jsd": 0.0, + "loss/logits": 0.23941405918449163, + "step": 11860 + }, + { + "epoch": 0.39566666666666667, + "grad_norm": 32.0, + "grad_norm_var": 2.2372395833333334, + "learning_rate": 0.0001, + "loss": 8.3672, + "loss/crossentropy": 2.1878841519355774, + "loss/hidden": 3.74921875, + "loss/jsd": 0.0, + "loss/logits": 0.22403320614248515, + "step": 11870 + }, + { + "epoch": 0.396, + "grad_norm": 30.25, + "grad_norm_var": 3.2372395833333334, + "learning_rate": 0.0001, + "loss": 8.4055, + "loss/crossentropy": 2.1782354429364204, + "loss/hidden": 3.84921875, + "loss/jsd": 0.0, + "loss/logits": 0.2412811905145645, + "step": 11880 + }, + { + "epoch": 0.3963333333333333, + "grad_norm": 29.0, + "grad_norm_var": 5.423958333333333, + "learning_rate": 0.0001, + "loss": 8.162, + "loss/crossentropy": 1.9386512018740176, + "loss/hidden": 3.79765625, + "loss/jsd": 0.0, + "loss/logits": 0.23406942784786225, + "step": 11890 + }, + { + "epoch": 0.39666666666666667, + "grad_norm": 29.625, + "grad_norm_var": 7.05, + "learning_rate": 0.0001, + "loss": 8.3352, + "loss/crossentropy": 2.165928477048874, + "loss/hidden": 3.92890625, + "loss/jsd": 0.0, + "loss/logits": 0.2521603927016258, + "step": 11900 + }, + { + "epoch": 0.397, + "grad_norm": 30.875, + "grad_norm_var": 5.3822265625, + "learning_rate": 0.0001, + "loss": 8.2937, + "loss/crossentropy": 2.1375706143677236, + "loss/hidden": 3.666796875, + "loss/jsd": 0.0, + "loss/logits": 0.22246642410755157, + "step": 11910 + }, + { + "epoch": 0.3973333333333333, + "grad_norm": 29.5, + "grad_norm_var": 2.851822916666667, + "learning_rate": 0.0001, + "loss": 8.2117, + "loss/crossentropy": 2.0515445560216903, + "loss/hidden": 3.798828125, + "loss/jsd": 0.0, + "loss/logits": 0.2385630363598466, + "step": 11920 + }, + { + "epoch": 0.39766666666666667, + "grad_norm": 32.0, + "grad_norm_var": 2.3583333333333334, + "learning_rate": 0.0001, + "loss": 8.1148, + "loss/crossentropy": 2.233632105588913, + "loss/hidden": 3.760546875, + "loss/jsd": 0.0, + "loss/logits": 0.24521742388606071, + "step": 11930 + }, + { + "epoch": 0.398, + "grad_norm": 31.75, + "grad_norm_var": 4.1587890625, + "learning_rate": 0.0001, + "loss": 8.3179, + "loss/crossentropy": 2.2155081748962404, + "loss/hidden": 3.841796875, + "loss/jsd": 0.0, + "loss/logits": 0.2516425810754299, + "step": 11940 + }, + { + "epoch": 0.3983333333333333, + "grad_norm": 31.875, + "grad_norm_var": 6.0322265625, + "learning_rate": 0.0001, + "loss": 8.3005, + "loss/crossentropy": 2.1274189479649066, + "loss/hidden": 3.746484375, + "loss/jsd": 0.0, + "loss/logits": 0.24990401780232788, + "step": 11950 + }, + { + "epoch": 0.39866666666666667, + "grad_norm": 53.75, + "grad_norm_var": 36.696875, + "learning_rate": 0.0001, + "loss": 8.0809, + "loss/crossentropy": 2.0530085660517217, + "loss/hidden": 3.639453125, + "loss/jsd": 0.0, + "loss/logits": 0.21376553494483233, + "step": 11960 + }, + { + "epoch": 0.399, + "grad_norm": 29.875, + "grad_norm_var": 36.781184895833334, + "learning_rate": 0.0001, + "loss": 8.1538, + "loss/crossentropy": 2.0867031171917914, + "loss/hidden": 3.821484375, + "loss/jsd": 0.0, + "loss/logits": 0.23751907888799906, + "step": 11970 + }, + { + "epoch": 0.3993333333333333, + "grad_norm": 28.25, + "grad_norm_var": 2.7080729166666666, + "learning_rate": 0.0001, + "loss": 8.0891, + "loss/crossentropy": 2.2460917532444, + "loss/hidden": 3.734765625, + "loss/jsd": 0.0, + "loss/logits": 0.22936972938477992, + "step": 11980 + }, + { + "epoch": 0.39966666666666667, + "grad_norm": 29.5, + "grad_norm_var": 14.363997395833334, + "learning_rate": 0.0001, + "loss": 8.2289, + "loss/crossentropy": 2.11966609954834, + "loss/hidden": 3.81796875, + "loss/jsd": 0.0, + "loss/logits": 0.25193934664130213, + "step": 11990 + }, + { + "epoch": 0.4, + "grad_norm": 31.25, + "grad_norm_var": 11.431705729166667, + "learning_rate": 0.0001, + "loss": 8.3615, + "loss/crossentropy": 1.9913646757602692, + "loss/hidden": 3.76875, + "loss/jsd": 0.0, + "loss/logits": 0.2379540206864476, + "step": 12000 + }, + { + "epoch": 0.4003333333333333, + "grad_norm": 34.25, + "grad_norm_var": 2.90625, + "learning_rate": 0.0001, + "loss": 8.3603, + "loss/crossentropy": 2.265652423352003, + "loss/hidden": 3.737890625, + "loss/jsd": 0.0, + "loss/logits": 0.23408753713592886, + "step": 12010 + }, + { + "epoch": 0.40066666666666667, + "grad_norm": 33.75, + "grad_norm_var": 5.2181640625, + "learning_rate": 0.0001, + "loss": 8.2743, + "loss/crossentropy": 2.255320507287979, + "loss/hidden": 3.74375, + "loss/jsd": 0.0, + "loss/logits": 0.23977783247828482, + "step": 12020 + }, + { + "epoch": 0.401, + "grad_norm": 33.25, + "grad_norm_var": 4.6869140625, + "learning_rate": 0.0001, + "loss": 8.2277, + "loss/crossentropy": 2.155760329961777, + "loss/hidden": 3.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.246426010876894, + "step": 12030 + }, + { + "epoch": 0.4013333333333333, + "grad_norm": 26.25, + "grad_norm_var": 7.045768229166667, + "learning_rate": 0.0001, + "loss": 8.1739, + "loss/crossentropy": 2.1292715579271317, + "loss/hidden": 3.857421875, + "loss/jsd": 0.0, + "loss/logits": 0.23238162267953158, + "step": 12040 + }, + { + "epoch": 0.40166666666666667, + "grad_norm": 32.0, + "grad_norm_var": 8.898958333333333, + "learning_rate": 0.0001, + "loss": 8.1614, + "loss/crossentropy": 2.148128533363342, + "loss/hidden": 3.73203125, + "loss/jsd": 0.0, + "loss/logits": 0.23012944478541614, + "step": 12050 + }, + { + "epoch": 0.402, + "grad_norm": 30.5, + "grad_norm_var": 3.3577473958333335, + "learning_rate": 0.0001, + "loss": 8.2708, + "loss/crossentropy": 1.872607284784317, + "loss/hidden": 3.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.2496652290225029, + "step": 12060 + }, + { + "epoch": 0.4023333333333333, + "grad_norm": 30.125, + "grad_norm_var": 2.1304840760927977e+18, + "learning_rate": 0.0001, + "loss": 8.3292, + "loss/crossentropy": 2.3182139307260514, + "loss/hidden": 3.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.238917101547122, + "step": 12070 + }, + { + "epoch": 0.4026666666666667, + "grad_norm": 29.375, + "grad_norm_var": 2.1304840765610918e+18, + "learning_rate": 0.0001, + "loss": 8.158, + "loss/crossentropy": 1.9857861787080764, + "loss/hidden": 3.871484375, + "loss/jsd": 0.0, + "loss/logits": 0.23468630164861679, + "step": 12080 + }, + { + "epoch": 0.403, + "grad_norm": 32.25, + "grad_norm_var": 14.543489583333333, + "learning_rate": 0.0001, + "loss": 8.1162, + "loss/crossentropy": 2.062743777036667, + "loss/hidden": 3.77421875, + "loss/jsd": 0.0, + "loss/logits": 0.23012079745531083, + "step": 12090 + }, + { + "epoch": 0.4033333333333333, + "grad_norm": 31.0, + "grad_norm_var": 14.9791015625, + "learning_rate": 0.0001, + "loss": 8.1531, + "loss/crossentropy": 2.027921313047409, + "loss/hidden": 3.718359375, + "loss/jsd": 0.0, + "loss/logits": 0.22394589530304074, + "step": 12100 + }, + { + "epoch": 0.4036666666666667, + "grad_norm": 40.0, + "grad_norm_var": 14.040559895833333, + "learning_rate": 0.0001, + "loss": 8.3209, + "loss/crossentropy": 2.006143531948328, + "loss/hidden": 3.65703125, + "loss/jsd": 0.0, + "loss/logits": 0.21074291467666625, + "step": 12110 + }, + { + "epoch": 0.404, + "grad_norm": 28.25, + "grad_norm_var": 16.646875, + "learning_rate": 0.0001, + "loss": 8.2199, + "loss/crossentropy": 2.064042943716049, + "loss/hidden": 3.8078125, + "loss/jsd": 0.0, + "loss/logits": 0.22763095535337924, + "step": 12120 + }, + { + "epoch": 0.4043333333333333, + "grad_norm": 30.5, + "grad_norm_var": 1.8330729166666666, + "learning_rate": 0.0001, + "loss": 8.1802, + "loss/crossentropy": 2.1694528847932815, + "loss/hidden": 3.924609375, + "loss/jsd": 0.0, + "loss/logits": 0.23536618407815696, + "step": 12130 + }, + { + "epoch": 0.4046666666666667, + "grad_norm": 31.625, + "grad_norm_var": 2.0556640625, + "learning_rate": 0.0001, + "loss": 8.1972, + "loss/crossentropy": 2.119773244857788, + "loss/hidden": 3.8078125, + "loss/jsd": 0.0, + "loss/logits": 0.24443610161542892, + "step": 12140 + }, + { + "epoch": 0.405, + "grad_norm": 29.625, + "grad_norm_var": 2.2260416666666667, + "learning_rate": 0.0001, + "loss": 8.2187, + "loss/crossentropy": 2.132966651767492, + "loss/hidden": 3.776171875, + "loss/jsd": 0.0, + "loss/logits": 0.23649816121906042, + "step": 12150 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 25.625, + "grad_norm_var": 21.917122395833335, + "learning_rate": 0.0001, + "loss": 8.2396, + "loss/crossentropy": 2.133436472713947, + "loss/hidden": 3.783203125, + "loss/jsd": 0.0, + "loss/logits": 0.2467921631410718, + "step": 12160 + }, + { + "epoch": 0.4056666666666667, + "grad_norm": 29.5, + "grad_norm_var": 87.078125, + "learning_rate": 0.0001, + "loss": 8.2219, + "loss/crossentropy": 2.2029434219002724, + "loss/hidden": 3.776171875, + "loss/jsd": 0.0, + "loss/logits": 0.2552025170996785, + "step": 12170 + }, + { + "epoch": 0.406, + "grad_norm": 33.25, + "grad_norm_var": 96.19270833333333, + "learning_rate": 0.0001, + "loss": 8.2495, + "loss/crossentropy": 2.253942059725523, + "loss/hidden": 3.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.22677662651985883, + "step": 12180 + }, + { + "epoch": 0.4063333333333333, + "grad_norm": 28.375, + "grad_norm_var": 4.829166666666667, + "learning_rate": 0.0001, + "loss": 8.2984, + "loss/crossentropy": 1.909020482003689, + "loss/hidden": 4.008984375, + "loss/jsd": 0.0, + "loss/logits": 0.22903512194752693, + "step": 12190 + }, + { + "epoch": 0.4066666666666667, + "grad_norm": 27.25, + "grad_norm_var": 43.521809895833336, + "learning_rate": 0.0001, + "loss": 8.0987, + "loss/crossentropy": 2.0227720350027085, + "loss/hidden": 3.803125, + "loss/jsd": 0.0, + "loss/logits": 0.2277982523664832, + "step": 12200 + }, + { + "epoch": 0.407, + "grad_norm": 54.0, + "grad_norm_var": 40.82962239583333, + "learning_rate": 0.0001, + "loss": 8.1799, + "loss/crossentropy": 2.0709748052060606, + "loss/hidden": 3.848828125, + "loss/jsd": 0.0, + "loss/logits": 0.22707768445834517, + "step": 12210 + }, + { + "epoch": 0.4073333333333333, + "grad_norm": 30.875, + "grad_norm_var": 35.93723958333333, + "learning_rate": 0.0001, + "loss": 8.2627, + "loss/crossentropy": 2.0857065066695215, + "loss/hidden": 3.755859375, + "loss/jsd": 0.0, + "loss/logits": 0.23003701977431773, + "step": 12220 + }, + { + "epoch": 0.4076666666666667, + "grad_norm": 29.625, + "grad_norm_var": 1.0643229166666666, + "learning_rate": 0.0001, + "loss": 8.2114, + "loss/crossentropy": 2.136391428112984, + "loss/hidden": 3.7546875, + "loss/jsd": 0.0, + "loss/logits": 0.23175083976238967, + "step": 12230 + }, + { + "epoch": 0.408, + "grad_norm": 31.375, + "grad_norm_var": 4.687239583333334, + "learning_rate": 0.0001, + "loss": 8.193, + "loss/crossentropy": 2.0665802858769893, + "loss/hidden": 3.741015625, + "loss/jsd": 0.0, + "loss/logits": 0.23225973546504974, + "step": 12240 + }, + { + "epoch": 0.4083333333333333, + "grad_norm": 32.0, + "grad_norm_var": 11.190559895833333, + "learning_rate": 0.0001, + "loss": 8.2347, + "loss/crossentropy": 2.0192012012004854, + "loss/hidden": 3.7859375, + "loss/jsd": 0.0, + "loss/logits": 0.2294670270755887, + "step": 12250 + }, + { + "epoch": 0.4086666666666667, + "grad_norm": 30.0, + "grad_norm_var": 5.26875, + "learning_rate": 0.0001, + "loss": 8.3742, + "loss/crossentropy": 2.1292088687419892, + "loss/hidden": 3.85234375, + "loss/jsd": 0.0, + "loss/logits": 0.24639325439929963, + "step": 12260 + }, + { + "epoch": 0.409, + "grad_norm": 29.75, + "grad_norm_var": 6.730989583333334, + "learning_rate": 0.0001, + "loss": 8.1186, + "loss/crossentropy": 2.134373862296343, + "loss/hidden": 3.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2399260677397251, + "step": 12270 + }, + { + "epoch": 0.4093333333333333, + "grad_norm": 31.25, + "grad_norm_var": 2.959830729166667, + "learning_rate": 0.0001, + "loss": 8.2645, + "loss/crossentropy": 2.1076954215765, + "loss/hidden": 3.67734375, + "loss/jsd": 0.0, + "loss/logits": 0.22277417313307524, + "step": 12280 + }, + { + "epoch": 0.4096666666666667, + "grad_norm": 30.75, + "grad_norm_var": 1.9390810930048315e+18, + "learning_rate": 0.0001, + "loss": 8.2145, + "loss/crossentropy": 1.9754585176706314, + "loss/hidden": 3.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.22273671329021455, + "step": 12290 + }, + { + "epoch": 0.41, + "grad_norm": 31.75, + "grad_norm_var": 1.9390810922215452e+18, + "learning_rate": 0.0001, + "loss": 8.255, + "loss/crossentropy": 2.21117245554924, + "loss/hidden": 3.76953125, + "loss/jsd": 0.0, + "loss/logits": 0.22954177036881446, + "step": 12300 + }, + { + "epoch": 0.4103333333333333, + "grad_norm": 28.75, + "grad_norm_var": 11.9619140625, + "learning_rate": 0.0001, + "loss": 8.3073, + "loss/crossentropy": 2.0862128123641015, + "loss/hidden": 3.717578125, + "loss/jsd": 0.0, + "loss/logits": 0.23974426425993442, + "step": 12310 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 30.875, + "grad_norm_var": 5.587239583333333, + "learning_rate": 0.0001, + "loss": 8.1389, + "loss/crossentropy": 1.9903224393725396, + "loss/hidden": 3.753515625, + "loss/jsd": 0.0, + "loss/logits": 0.22904759608209133, + "step": 12320 + }, + { + "epoch": 0.411, + "grad_norm": 31.75, + "grad_norm_var": 5.352083333333334, + "learning_rate": 0.0001, + "loss": 8.1885, + "loss/crossentropy": 2.1092441350221636, + "loss/hidden": 3.68671875, + "loss/jsd": 0.0, + "loss/logits": 0.2219138015061617, + "step": 12330 + }, + { + "epoch": 0.41133333333333333, + "grad_norm": 28.0, + "grad_norm_var": 4.24140625, + "learning_rate": 0.0001, + "loss": 8.2663, + "loss/crossentropy": 2.008776394277811, + "loss/hidden": 3.861328125, + "loss/jsd": 0.0, + "loss/logits": 0.23205508813261985, + "step": 12340 + }, + { + "epoch": 0.4116666666666667, + "grad_norm": 29.75, + "grad_norm_var": 2.79765625, + "learning_rate": 0.0001, + "loss": 8.2388, + "loss/crossentropy": 2.10398950278759, + "loss/hidden": 3.7484375, + "loss/jsd": 0.0, + "loss/logits": 0.2270292304456234, + "step": 12350 + }, + { + "epoch": 0.412, + "grad_norm": 30.5, + "grad_norm_var": 13.00625, + "learning_rate": 0.0001, + "loss": 8.1708, + "loss/crossentropy": 1.9869476959109307, + "loss/hidden": 3.760546875, + "loss/jsd": 0.0, + "loss/logits": 0.2278031835332513, + "step": 12360 + }, + { + "epoch": 0.41233333333333333, + "grad_norm": 28.875, + "grad_norm_var": 15.693684895833334, + "learning_rate": 0.0001, + "loss": 8.2568, + "loss/crossentropy": 2.1288417890667914, + "loss/hidden": 3.849609375, + "loss/jsd": 0.0, + "loss/logits": 0.2364706289023161, + "step": 12370 + }, + { + "epoch": 0.4126666666666667, + "grad_norm": 28.375, + "grad_norm_var": 13.906184895833333, + "learning_rate": 0.0001, + "loss": 8.203, + "loss/crossentropy": 2.0824377298355103, + "loss/hidden": 3.742578125, + "loss/jsd": 0.0, + "loss/logits": 0.219917696993798, + "step": 12380 + }, + { + "epoch": 0.413, + "grad_norm": 32.0, + "grad_norm_var": 13.921875, + "learning_rate": 0.0001, + "loss": 8.2617, + "loss/crossentropy": 2.155314549803734, + "loss/hidden": 3.882421875, + "loss/jsd": 0.0, + "loss/logits": 0.24771923571825027, + "step": 12390 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 36.0, + "grad_norm_var": 12.428125, + "learning_rate": 0.0001, + "loss": 8.092, + "loss/crossentropy": 1.9445300944149495, + "loss/hidden": 3.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.22118018846958876, + "step": 12400 + }, + { + "epoch": 0.4136666666666667, + "grad_norm": 31.5, + "grad_norm_var": 5.564518229166667, + "learning_rate": 0.0001, + "loss": 8.2348, + "loss/crossentropy": 2.07020313590765, + "loss/hidden": 3.790625, + "loss/jsd": 0.0, + "loss/logits": 0.22232855744659902, + "step": 12410 + }, + { + "epoch": 0.414, + "grad_norm": 35.25, + "grad_norm_var": 8.139322916666666, + "learning_rate": 0.0001, + "loss": 8.3157, + "loss/crossentropy": 2.108063217997551, + "loss/hidden": 3.845703125, + "loss/jsd": 0.0, + "loss/logits": 0.2458704814314842, + "step": 12420 + }, + { + "epoch": 0.41433333333333333, + "grad_norm": 29.5, + "grad_norm_var": 5.182291666666667, + "learning_rate": 0.0001, + "loss": 8.0417, + "loss/crossentropy": 2.181876909732819, + "loss/hidden": 3.773828125, + "loss/jsd": 0.0, + "loss/logits": 0.22876899931579828, + "step": 12430 + }, + { + "epoch": 0.4146666666666667, + "grad_norm": 28.375, + "grad_norm_var": 1.6083333333333334, + "learning_rate": 0.0001, + "loss": 8.1716, + "loss/crossentropy": 2.095130206644535, + "loss/hidden": 3.6875, + "loss/jsd": 0.0, + "loss/logits": 0.2197817573323846, + "step": 12440 + }, + { + "epoch": 0.415, + "grad_norm": 29.625, + "grad_norm_var": 5.591666666666667, + "learning_rate": 0.0001, + "loss": 8.0813, + "loss/crossentropy": 2.0588746845722197, + "loss/hidden": 3.812109375, + "loss/jsd": 0.0, + "loss/logits": 0.232548057846725, + "step": 12450 + }, + { + "epoch": 0.41533333333333333, + "grad_norm": 32.0, + "grad_norm_var": 4.77265625, + "learning_rate": 0.0001, + "loss": 8.3514, + "loss/crossentropy": 2.0703504741191865, + "loss/hidden": 3.870703125, + "loss/jsd": 0.0, + "loss/logits": 0.25821793731302023, + "step": 12460 + }, + { + "epoch": 0.4156666666666667, + "grad_norm": 27.75, + "grad_norm_var": 42.38854166666667, + "learning_rate": 0.0001, + "loss": 8.2507, + "loss/crossentropy": 2.1307108625769615, + "loss/hidden": 3.871484375, + "loss/jsd": 0.0, + "loss/logits": 0.24973033610731363, + "step": 12470 + }, + { + "epoch": 0.416, + "grad_norm": 30.875, + "grad_norm_var": 34.26640625, + "learning_rate": 0.0001, + "loss": 8.1867, + "loss/crossentropy": 2.117819709330797, + "loss/hidden": 3.648828125, + "loss/jsd": 0.0, + "loss/logits": 0.22720290068536997, + "step": 12480 + }, + { + "epoch": 0.41633333333333333, + "grad_norm": 30.25, + "grad_norm_var": 12.4416015625, + "learning_rate": 0.0001, + "loss": 8.2697, + "loss/crossentropy": 2.109950542449951, + "loss/hidden": 3.687890625, + "loss/jsd": 0.0, + "loss/logits": 0.21628530863672496, + "step": 12490 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 31.0, + "grad_norm_var": 10.474739583333333, + "learning_rate": 0.0001, + "loss": 8.105, + "loss/crossentropy": 1.9980547428131104, + "loss/hidden": 3.808984375, + "loss/jsd": 0.0, + "loss/logits": 0.21721296831965448, + "step": 12500 + }, + { + "epoch": 0.417, + "grad_norm": 29.625, + "grad_norm_var": 3.655143229166667, + "learning_rate": 0.0001, + "loss": 8.0715, + "loss/crossentropy": 2.131350800395012, + "loss/hidden": 3.78984375, + "loss/jsd": 0.0, + "loss/logits": 0.23274635933339596, + "step": 12510 + }, + { + "epoch": 0.41733333333333333, + "grad_norm": 35.5, + "grad_norm_var": 4.509830729166667, + "learning_rate": 0.0001, + "loss": 8.1491, + "loss/crossentropy": 2.1224256813526154, + "loss/hidden": 3.827734375, + "loss/jsd": 0.0, + "loss/logits": 0.25079987831413747, + "step": 12520 + }, + { + "epoch": 0.4176666666666667, + "grad_norm": 29.5, + "grad_norm_var": 4.074739583333334, + "learning_rate": 0.0001, + "loss": 8.2616, + "loss/crossentropy": 2.0152181297540666, + "loss/hidden": 3.7921875, + "loss/jsd": 0.0, + "loss/logits": 0.23770801294595004, + "step": 12530 + }, + { + "epoch": 0.418, + "grad_norm": 29.625, + "grad_norm_var": 1.6389973958333333, + "learning_rate": 0.0001, + "loss": 8.1426, + "loss/crossentropy": 1.915038924664259, + "loss/hidden": 3.79140625, + "loss/jsd": 0.0, + "loss/logits": 0.21815692326053976, + "step": 12540 + }, + { + "epoch": 0.41833333333333333, + "grad_norm": 32.5, + "grad_norm_var": 2.895247395833333, + "learning_rate": 0.0001, + "loss": 8.1342, + "loss/crossentropy": 2.038285069167614, + "loss/hidden": 3.685546875, + "loss/jsd": 0.0, + "loss/logits": 0.2054979182779789, + "step": 12550 + }, + { + "epoch": 0.4186666666666667, + "grad_norm": 28.75, + "grad_norm_var": 4.293489583333334, + "learning_rate": 0.0001, + "loss": 8.2244, + "loss/crossentropy": 2.0995118111371993, + "loss/hidden": 3.823046875, + "loss/jsd": 0.0, + "loss/logits": 0.22938680201768874, + "step": 12560 + }, + { + "epoch": 0.419, + "grad_norm": 31.75, + "grad_norm_var": 3.577018229166667, + "learning_rate": 0.0001, + "loss": 8.2214, + "loss/crossentropy": 2.1287095353007315, + "loss/hidden": 3.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.23023095317184925, + "step": 12570 + }, + { + "epoch": 0.41933333333333334, + "grad_norm": 27.25, + "grad_norm_var": 6.601497395833333, + "learning_rate": 0.0001, + "loss": 8.0257, + "loss/crossentropy": 2.163818618655205, + "loss/hidden": 3.804296875, + "loss/jsd": 0.0, + "loss/logits": 0.2438998742029071, + "step": 12580 + }, + { + "epoch": 0.4196666666666667, + "grad_norm": 30.75, + "grad_norm_var": 12.27265625, + "learning_rate": 0.0001, + "loss": 8.1848, + "loss/crossentropy": 2.00474643856287, + "loss/hidden": 3.778515625, + "loss/jsd": 0.0, + "loss/logits": 0.22157613541930915, + "step": 12590 + }, + { + "epoch": 0.42, + "grad_norm": 32.5, + "grad_norm_var": 3.70390625, + "learning_rate": 0.0001, + "loss": 8.1346, + "loss/crossentropy": 2.164738741517067, + "loss/hidden": 3.70546875, + "loss/jsd": 0.0, + "loss/logits": 0.23143419064581394, + "step": 12600 + }, + { + "epoch": 0.42033333333333334, + "grad_norm": 31.75, + "grad_norm_var": 1.8309895833333334, + "learning_rate": 0.0001, + "loss": 8.016, + "loss/crossentropy": 2.0151191845536234, + "loss/hidden": 3.86796875, + "loss/jsd": 0.0, + "loss/logits": 0.22970234788954258, + "step": 12610 + }, + { + "epoch": 0.4206666666666667, + "grad_norm": 30.5, + "grad_norm_var": 1.6122395833333334, + "learning_rate": 0.0001, + "loss": 8.2133, + "loss/crossentropy": 2.1432757824659348, + "loss/hidden": 3.746484375, + "loss/jsd": 0.0, + "loss/logits": 0.22702465616166592, + "step": 12620 + }, + { + "epoch": 0.421, + "grad_norm": 32.25, + "grad_norm_var": 4.889322916666667, + "learning_rate": 0.0001, + "loss": 8.0972, + "loss/crossentropy": 2.182205152511597, + "loss/hidden": 3.598046875, + "loss/jsd": 0.0, + "loss/logits": 0.20892607383430004, + "step": 12630 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 34.5, + "grad_norm_var": 8.5041015625, + "learning_rate": 0.0001, + "loss": 8.0624, + "loss/crossentropy": 2.151243197917938, + "loss/hidden": 3.69765625, + "loss/jsd": 0.0, + "loss/logits": 0.22152379900217056, + "step": 12640 + }, + { + "epoch": 0.4216666666666667, + "grad_norm": 31.625, + "grad_norm_var": 2.472916666666667, + "learning_rate": 0.0001, + "loss": 8.1911, + "loss/crossentropy": 2.1016953229904174, + "loss/hidden": 3.759765625, + "loss/jsd": 0.0, + "loss/logits": 0.23559323363006116, + "step": 12650 + }, + { + "epoch": 0.422, + "grad_norm": 30.75, + "grad_norm_var": 7.993684895833334, + "learning_rate": 0.0001, + "loss": 8.165, + "loss/crossentropy": 1.9417916133999824, + "loss/hidden": 3.772265625, + "loss/jsd": 0.0, + "loss/logits": 0.2225018298253417, + "step": 12660 + }, + { + "epoch": 0.42233333333333334, + "grad_norm": 30.0, + "grad_norm_var": 16.370572916666667, + "learning_rate": 0.0001, + "loss": 8.0033, + "loss/crossentropy": 1.9000740669667722, + "loss/hidden": 3.640234375, + "loss/jsd": 0.0, + "loss/logits": 0.20819500964134932, + "step": 12670 + }, + { + "epoch": 0.4226666666666667, + "grad_norm": 30.0, + "grad_norm_var": 3.5973307291666665, + "learning_rate": 0.0001, + "loss": 8.006, + "loss/crossentropy": 2.1980207815766333, + "loss/hidden": 3.790625, + "loss/jsd": 0.0, + "loss/logits": 0.22516860738396643, + "step": 12680 + }, + { + "epoch": 0.423, + "grad_norm": 30.625, + "grad_norm_var": 2.098893229166667, + "learning_rate": 0.0001, + "loss": 8.1445, + "loss/crossentropy": 2.3503684222698213, + "loss/hidden": 3.687109375, + "loss/jsd": 0.0, + "loss/logits": 0.236199764162302, + "step": 12690 + }, + { + "epoch": 0.42333333333333334, + "grad_norm": 32.75, + "grad_norm_var": 1.47890625, + "learning_rate": 0.0001, + "loss": 7.9247, + "loss/crossentropy": 2.023277834057808, + "loss/hidden": 3.72890625, + "loss/jsd": 0.0, + "loss/logits": 0.2181034479290247, + "step": 12700 + }, + { + "epoch": 0.4236666666666667, + "grad_norm": 31.0, + "grad_norm_var": 3.9884765625, + "learning_rate": 0.0001, + "loss": 8.1299, + "loss/crossentropy": 2.138180735707283, + "loss/hidden": 3.625, + "loss/jsd": 0.0, + "loss/logits": 0.2189898299984634, + "step": 12710 + }, + { + "epoch": 0.424, + "grad_norm": 30.75, + "grad_norm_var": 1.6708333333333334, + "learning_rate": 0.0001, + "loss": 7.9982, + "loss/crossentropy": 2.0362440764904024, + "loss/hidden": 3.639453125, + "loss/jsd": 0.0, + "loss/logits": 0.20388144720345736, + "step": 12720 + }, + { + "epoch": 0.42433333333333334, + "grad_norm": 30.5, + "grad_norm_var": 11.117708333333333, + "learning_rate": 0.0001, + "loss": 8.1727, + "loss/crossentropy": 2.1874516278505327, + "loss/hidden": 3.585546875, + "loss/jsd": 0.0, + "loss/logits": 0.21905291676521302, + "step": 12730 + }, + { + "epoch": 0.4246666666666667, + "grad_norm": 41.0, + "grad_norm_var": 20.251822916666665, + "learning_rate": 0.0001, + "loss": 8.0777, + "loss/crossentropy": 2.2824623227119445, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.22527546025812625, + "step": 12740 + }, + { + "epoch": 0.425, + "grad_norm": 29.5, + "grad_norm_var": 25.060416666666665, + "learning_rate": 0.0001, + "loss": 8.1515, + "loss/crossentropy": 2.0866646379232408, + "loss/hidden": 3.727734375, + "loss/jsd": 0.0, + "loss/logits": 0.2159626353532076, + "step": 12750 + }, + { + "epoch": 0.42533333333333334, + "grad_norm": 32.5, + "grad_norm_var": 25.107747395833332, + "learning_rate": 0.0001, + "loss": 7.8544, + "loss/crossentropy": 2.126218634843826, + "loss/hidden": 3.794140625, + "loss/jsd": 0.0, + "loss/logits": 0.23769555874168874, + "step": 12760 + }, + { + "epoch": 0.4256666666666667, + "grad_norm": 36.5, + "grad_norm_var": 39.00625, + "learning_rate": 0.0001, + "loss": 8.0843, + "loss/crossentropy": 2.1776451751589776, + "loss/hidden": 3.627734375, + "loss/jsd": 0.0, + "loss/logits": 0.22230409383773803, + "step": 12770 + }, + { + "epoch": 0.426, + "grad_norm": 31.375, + "grad_norm_var": 13.474934895833334, + "learning_rate": 0.0001, + "loss": 7.99, + "loss/crossentropy": 2.101397790014744, + "loss/hidden": 3.725390625, + "loss/jsd": 0.0, + "loss/logits": 0.231523541174829, + "step": 12780 + }, + { + "epoch": 0.42633333333333334, + "grad_norm": 35.25, + "grad_norm_var": 13.215625, + "learning_rate": 0.0001, + "loss": 7.9369, + "loss/crossentropy": 2.1546214550733565, + "loss/hidden": 3.74296875, + "loss/jsd": 0.0, + "loss/logits": 0.22521314695477485, + "step": 12790 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 28.875, + "grad_norm_var": 17.4791015625, + "learning_rate": 0.0001, + "loss": 8.0401, + "loss/crossentropy": 2.232549238950014, + "loss/hidden": 3.86640625, + "loss/jsd": 0.0, + "loss/logits": 0.2322084965184331, + "step": 12800 + }, + { + "epoch": 0.427, + "grad_norm": 30.25, + "grad_norm_var": 8.587239583333334, + "learning_rate": 0.0001, + "loss": 8.1028, + "loss/crossentropy": 2.1313828572630884, + "loss/hidden": 3.651171875, + "loss/jsd": 0.0, + "loss/logits": 0.21781184244900942, + "step": 12810 + }, + { + "epoch": 0.42733333333333334, + "grad_norm": 27.625, + "grad_norm_var": 10.428125, + "learning_rate": 0.0001, + "loss": 8.0773, + "loss/crossentropy": 2.243235859274864, + "loss/hidden": 3.6453125, + "loss/jsd": 0.0, + "loss/logits": 0.21965202130377293, + "step": 12820 + }, + { + "epoch": 0.42766666666666664, + "grad_norm": 37.0, + "grad_norm_var": 14.7791015625, + "learning_rate": 0.0001, + "loss": 8.1248, + "loss/crossentropy": 2.162339176237583, + "loss/hidden": 3.74296875, + "loss/jsd": 0.0, + "loss/logits": 0.23140791803598404, + "step": 12830 + }, + { + "epoch": 0.428, + "grad_norm": 29.625, + "grad_norm_var": 12.1447265625, + "learning_rate": 0.0001, + "loss": 8.1194, + "loss/crossentropy": 2.1178503066301344, + "loss/hidden": 3.759375, + "loss/jsd": 0.0, + "loss/logits": 0.243923881649971, + "step": 12840 + }, + { + "epoch": 0.42833333333333334, + "grad_norm": 49.0, + "grad_norm_var": 25.64140625, + "learning_rate": 0.0001, + "loss": 8.0362, + "loss/crossentropy": 2.1236428640782834, + "loss/hidden": 3.683984375, + "loss/jsd": 0.0, + "loss/logits": 0.21966639999300241, + "step": 12850 + }, + { + "epoch": 0.42866666666666664, + "grad_norm": 30.375, + "grad_norm_var": 23.0181640625, + "learning_rate": 0.0001, + "loss": 8.0254, + "loss/crossentropy": 2.1352688685059547, + "loss/hidden": 3.699609375, + "loss/jsd": 0.0, + "loss/logits": 0.2124914363026619, + "step": 12860 + }, + { + "epoch": 0.429, + "grad_norm": 29.875, + "grad_norm_var": 4.287239583333333, + "learning_rate": 0.0001, + "loss": 8.0469, + "loss/crossentropy": 2.0519951224327087, + "loss/hidden": 3.68828125, + "loss/jsd": 0.0, + "loss/logits": 0.22668614089488984, + "step": 12870 + }, + { + "epoch": 0.42933333333333334, + "grad_norm": 29.875, + "grad_norm_var": 2.6497395833333335, + "learning_rate": 0.0001, + "loss": 8.0736, + "loss/crossentropy": 2.089827132225037, + "loss/hidden": 3.68046875, + "loss/jsd": 0.0, + "loss/logits": 0.2167070461437106, + "step": 12880 + }, + { + "epoch": 0.42966666666666664, + "grad_norm": 27.125, + "grad_norm_var": 6.01875, + "learning_rate": 0.0001, + "loss": 7.9577, + "loss/crossentropy": 2.1272108972072603, + "loss/hidden": 3.645703125, + "loss/jsd": 0.0, + "loss/logits": 0.22290566843003035, + "step": 12890 + }, + { + "epoch": 0.43, + "grad_norm": 26.75, + "grad_norm_var": 7.3875, + "learning_rate": 0.0001, + "loss": 7.9521, + "loss/crossentropy": 2.169513902813196, + "loss/hidden": 3.675, + "loss/jsd": 0.0, + "loss/logits": 0.21037282003089786, + "step": 12900 + }, + { + "epoch": 0.43033333333333335, + "grad_norm": 32.25, + "grad_norm_var": 3.3, + "learning_rate": 0.0001, + "loss": 7.9838, + "loss/crossentropy": 2.093190697580576, + "loss/hidden": 3.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.21419555507600307, + "step": 12910 + }, + { + "epoch": 0.43066666666666664, + "grad_norm": 33.5, + "grad_norm_var": 4.025, + "learning_rate": 0.0001, + "loss": 8.1292, + "loss/crossentropy": 2.128591850399971, + "loss/hidden": 3.81328125, + "loss/jsd": 0.0, + "loss/logits": 0.2363378331065178, + "step": 12920 + }, + { + "epoch": 0.431, + "grad_norm": 33.0, + "grad_norm_var": 3.588997395833333, + "learning_rate": 0.0001, + "loss": 8.0753, + "loss/crossentropy": 2.07633658349514, + "loss/hidden": 3.728125, + "loss/jsd": 0.0, + "loss/logits": 0.23476697821170092, + "step": 12930 + }, + { + "epoch": 0.43133333333333335, + "grad_norm": 34.0, + "grad_norm_var": 4.872330729166666, + "learning_rate": 0.0001, + "loss": 8.0557, + "loss/crossentropy": 2.173259836435318, + "loss/hidden": 3.8171875, + "loss/jsd": 0.0, + "loss/logits": 0.23491751477122308, + "step": 12940 + }, + { + "epoch": 0.43166666666666664, + "grad_norm": 33.5, + "grad_norm_var": 4.478059895833334, + "learning_rate": 0.0001, + "loss": 8.0571, + "loss/crossentropy": 2.076581171154976, + "loss/hidden": 3.694140625, + "loss/jsd": 0.0, + "loss/logits": 0.21625892743468283, + "step": 12950 + }, + { + "epoch": 0.432, + "grad_norm": 41.5, + "grad_norm_var": 2.540311638953689e+18, + "learning_rate": 0.0001, + "loss": 8.2307, + "loss/crossentropy": 2.2042708441615106, + "loss/hidden": 3.641015625, + "loss/jsd": 0.0, + "loss/logits": 0.23953549321740866, + "step": 12960 + }, + { + "epoch": 0.43233333333333335, + "grad_norm": 33.5, + "grad_norm_var": 2.5403116395912233e+18, + "learning_rate": 0.0001, + "loss": 8.1052, + "loss/crossentropy": 2.1010278701782226, + "loss/hidden": 3.75625, + "loss/jsd": 0.0, + "loss/logits": 0.23639172241091727, + "step": 12970 + }, + { + "epoch": 0.43266666666666664, + "grad_norm": 33.25, + "grad_norm_var": 3.4994140625, + "learning_rate": 0.0001, + "loss": 8.0009, + "loss/crossentropy": 2.057431307435036, + "loss/hidden": 3.7015625, + "loss/jsd": 0.0, + "loss/logits": 0.23213282637298108, + "step": 12980 + }, + { + "epoch": 0.433, + "grad_norm": 29.0, + "grad_norm_var": 94.1931640625, + "learning_rate": 0.0001, + "loss": 8.0487, + "loss/crossentropy": 2.0685934379696844, + "loss/hidden": 3.766796875, + "loss/jsd": 0.0, + "loss/logits": 0.23443159088492393, + "step": 12990 + }, + { + "epoch": 0.43333333333333335, + "grad_norm": 31.0, + "grad_norm_var": 92.73326822916667, + "learning_rate": 0.0001, + "loss": 8.1327, + "loss/crossentropy": 2.1575643092393877, + "loss/hidden": 3.73984375, + "loss/jsd": 0.0, + "loss/logits": 0.2321011306717992, + "step": 13000 + }, + { + "epoch": 0.43366666666666664, + "grad_norm": 30.0, + "grad_norm_var": 6.570572916666666, + "learning_rate": 0.0001, + "loss": 8.0488, + "loss/crossentropy": 1.9470253214240074, + "loss/hidden": 3.684765625, + "loss/jsd": 0.0, + "loss/logits": 0.20168767049908637, + "step": 13010 + }, + { + "epoch": 0.434, + "grad_norm": 31.875, + "grad_norm_var": 6.384830729166667, + "learning_rate": 0.0001, + "loss": 7.9454, + "loss/crossentropy": 1.8895531304180622, + "loss/hidden": 3.55, + "loss/jsd": 0.0, + "loss/logits": 0.19299599220976232, + "step": 13020 + }, + { + "epoch": 0.43433333333333335, + "grad_norm": 30.875, + "grad_norm_var": 2.934375, + "learning_rate": 0.0001, + "loss": 8.0835, + "loss/crossentropy": 2.1535514682531356, + "loss/hidden": 3.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.22370851337909697, + "step": 13030 + }, + { + "epoch": 0.43466666666666665, + "grad_norm": 30.625, + "grad_norm_var": 5.705989583333333, + "learning_rate": 0.0001, + "loss": 7.979, + "loss/crossentropy": 2.1214609906077384, + "loss/hidden": 3.688671875, + "loss/jsd": 0.0, + "loss/logits": 0.21629442609846591, + "step": 13040 + }, + { + "epoch": 0.435, + "grad_norm": 31.75, + "grad_norm_var": 6.716666666666667, + "learning_rate": 0.0001, + "loss": 7.9544, + "loss/crossentropy": 1.9861764639616013, + "loss/hidden": 3.801171875, + "loss/jsd": 0.0, + "loss/logits": 0.23933750428259373, + "step": 13050 + }, + { + "epoch": 0.43533333333333335, + "grad_norm": 30.625, + "grad_norm_var": 4.792643229166667, + "learning_rate": 0.0001, + "loss": 8.1328, + "loss/crossentropy": 2.0345228269696234, + "loss/hidden": 3.725390625, + "loss/jsd": 0.0, + "loss/logits": 0.2294387150555849, + "step": 13060 + }, + { + "epoch": 0.43566666666666665, + "grad_norm": 31.25, + "grad_norm_var": 9.701822916666666, + "learning_rate": 0.0001, + "loss": 7.9392, + "loss/crossentropy": 2.0680396020412446, + "loss/hidden": 3.887109375, + "loss/jsd": 0.0, + "loss/logits": 0.23335713148117065, + "step": 13070 + }, + { + "epoch": 0.436, + "grad_norm": 34.75, + "grad_norm_var": 5.339322916666666, + "learning_rate": 0.0001, + "loss": 8.1304, + "loss/crossentropy": 2.0736231788992883, + "loss/hidden": 3.753125, + "loss/jsd": 0.0, + "loss/logits": 0.23877801094204187, + "step": 13080 + }, + { + "epoch": 0.43633333333333335, + "grad_norm": 30.75, + "grad_norm_var": 4.684025051839935e+18, + "learning_rate": 0.0001, + "loss": 8.1914, + "loss/crossentropy": 2.2048259407281874, + "loss/hidden": 3.637109375, + "loss/jsd": 0.0, + "loss/logits": 0.23145930115133523, + "step": 13090 + }, + { + "epoch": 0.43666666666666665, + "grad_norm": 31.625, + "grad_norm_var": 21.812955729166667, + "learning_rate": 0.0001, + "loss": 8.0019, + "loss/crossentropy": 2.1128788188099863, + "loss/hidden": 3.742578125, + "loss/jsd": 0.0, + "loss/logits": 0.2392245376482606, + "step": 13100 + }, + { + "epoch": 0.437, + "grad_norm": 32.25, + "grad_norm_var": 9.125, + "learning_rate": 0.0001, + "loss": 8.0213, + "loss/crossentropy": 2.0316510528326033, + "loss/hidden": 3.684375, + "loss/jsd": 0.0, + "loss/logits": 0.21703706961125135, + "step": 13110 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 30.625, + "grad_norm_var": 13.457747395833334, + "learning_rate": 0.0001, + "loss": 8.1833, + "loss/crossentropy": 1.937141789495945, + "loss/hidden": 3.655078125, + "loss/jsd": 0.0, + "loss/logits": 0.2118115139193833, + "step": 13120 + }, + { + "epoch": 0.43766666666666665, + "grad_norm": 32.75, + "grad_norm_var": 12.8369140625, + "learning_rate": 0.0001, + "loss": 8.0128, + "loss/crossentropy": 2.057337316870689, + "loss/hidden": 3.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.2210848169401288, + "step": 13130 + }, + { + "epoch": 0.438, + "grad_norm": 34.25, + "grad_norm_var": 8.7587890625, + "learning_rate": 0.0001, + "loss": 8.0243, + "loss/crossentropy": 2.086243689060211, + "loss/hidden": 3.696484375, + "loss/jsd": 0.0, + "loss/logits": 0.2156898221001029, + "step": 13140 + }, + { + "epoch": 0.43833333333333335, + "grad_norm": 31.875, + "grad_norm_var": 7.94765625, + "learning_rate": 0.0001, + "loss": 8.1641, + "loss/crossentropy": 2.2972807347774507, + "loss/hidden": 3.756640625, + "loss/jsd": 0.0, + "loss/logits": 0.24422766156494619, + "step": 13150 + }, + { + "epoch": 0.43866666666666665, + "grad_norm": 30.5, + "grad_norm_var": 5.2791015625, + "learning_rate": 0.0001, + "loss": 8.0809, + "loss/crossentropy": 2.2393217980861664, + "loss/hidden": 3.623828125, + "loss/jsd": 0.0, + "loss/logits": 0.22840084582567216, + "step": 13160 + }, + { + "epoch": 0.439, + "grad_norm": 32.0, + "grad_norm_var": 4.076497395833333, + "learning_rate": 0.0001, + "loss": 8.0278, + "loss/crossentropy": 2.149459010362625, + "loss/hidden": 3.697265625, + "loss/jsd": 0.0, + "loss/logits": 0.2299880154430866, + "step": 13170 + }, + { + "epoch": 0.43933333333333335, + "grad_norm": 29.0, + "grad_norm_var": 2.039322916666667, + "learning_rate": 0.0001, + "loss": 8.0435, + "loss/crossentropy": 2.022595777362585, + "loss/hidden": 3.65390625, + "loss/jsd": 0.0, + "loss/logits": 0.21843499960377813, + "step": 13180 + }, + { + "epoch": 0.43966666666666665, + "grad_norm": 32.5, + "grad_norm_var": 8.3072265625, + "learning_rate": 0.0001, + "loss": 8.0339, + "loss/crossentropy": 1.966893842816353, + "loss/hidden": 3.7140625, + "loss/jsd": 0.0, + "loss/logits": 0.23092244230210782, + "step": 13190 + }, + { + "epoch": 0.44, + "grad_norm": 30.125, + "grad_norm_var": 1.8580729166666667, + "learning_rate": 0.0001, + "loss": 8.1766, + "loss/crossentropy": 2.2937954008579253, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.24365076944231986, + "step": 13200 + }, + { + "epoch": 0.44033333333333335, + "grad_norm": 42.0, + "grad_norm_var": 8.8447265625, + "learning_rate": 0.0001, + "loss": 8.0685, + "loss/crossentropy": 2.203465947508812, + "loss/hidden": 3.625390625, + "loss/jsd": 0.0, + "loss/logits": 0.22517771869897843, + "step": 13210 + }, + { + "epoch": 0.44066666666666665, + "grad_norm": 32.25, + "grad_norm_var": 8.635872395833333, + "learning_rate": 0.0001, + "loss": 8.0298, + "loss/crossentropy": 2.14248249232769, + "loss/hidden": 3.504296875, + "loss/jsd": 0.0, + "loss/logits": 0.21284189969301223, + "step": 13220 + }, + { + "epoch": 0.441, + "grad_norm": 27.75, + "grad_norm_var": 4.1634765625, + "learning_rate": 0.0001, + "loss": 7.9522, + "loss/crossentropy": 2.112074154615402, + "loss/hidden": 3.856640625, + "loss/jsd": 0.0, + "loss/logits": 0.23715684618800878, + "step": 13230 + }, + { + "epoch": 0.44133333333333336, + "grad_norm": 32.25, + "grad_norm_var": 6.068489583333333, + "learning_rate": 0.0001, + "loss": 7.9992, + "loss/crossentropy": 2.0217273235321045, + "loss/hidden": 3.68984375, + "loss/jsd": 0.0, + "loss/logits": 0.22903131749480962, + "step": 13240 + }, + { + "epoch": 0.44166666666666665, + "grad_norm": 30.5, + "grad_norm_var": 11.684375, + "learning_rate": 0.0001, + "loss": 8.0119, + "loss/crossentropy": 2.080713841319084, + "loss/hidden": 3.68515625, + "loss/jsd": 0.0, + "loss/logits": 0.22152625005692245, + "step": 13250 + }, + { + "epoch": 0.442, + "grad_norm": 37.0, + "grad_norm_var": 1240.8749348958333, + "learning_rate": 0.0001, + "loss": 8.1225, + "loss/crossentropy": 2.0688235819339753, + "loss/hidden": 3.74296875, + "loss/jsd": 0.0, + "loss/logits": 0.22032655104994775, + "step": 13260 + }, + { + "epoch": 0.44233333333333336, + "grad_norm": 32.75, + "grad_norm_var": 1244.62890625, + "learning_rate": 0.0001, + "loss": 7.9617, + "loss/crossentropy": 2.0978364631533624, + "loss/hidden": 3.712109375, + "loss/jsd": 0.0, + "loss/logits": 0.2254788476973772, + "step": 13270 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 34.0, + "grad_norm_var": 5.268684895833333, + "learning_rate": 0.0001, + "loss": 8.0872, + "loss/crossentropy": 2.0726170748472215, + "loss/hidden": 3.562890625, + "loss/jsd": 0.0, + "loss/logits": 0.19704890344291925, + "step": 13280 + }, + { + "epoch": 0.443, + "grad_norm": 29.75, + "grad_norm_var": 6.375455729166666, + "learning_rate": 0.0001, + "loss": 7.9701, + "loss/crossentropy": 2.2288454949855803, + "loss/hidden": 3.67890625, + "loss/jsd": 0.0, + "loss/logits": 0.21981892809271814, + "step": 13290 + }, + { + "epoch": 0.44333333333333336, + "grad_norm": 32.5, + "grad_norm_var": 4.275, + "learning_rate": 0.0001, + "loss": 8.1481, + "loss/crossentropy": 2.1981609016656876, + "loss/hidden": 3.7671875, + "loss/jsd": 0.0, + "loss/logits": 0.2299773920327425, + "step": 13300 + }, + { + "epoch": 0.44366666666666665, + "grad_norm": 31.125, + "grad_norm_var": 5.218489583333334, + "learning_rate": 0.0001, + "loss": 8.1545, + "loss/crossentropy": 2.1825950175523756, + "loss/hidden": 3.631640625, + "loss/jsd": 0.0, + "loss/logits": 0.22280107364058493, + "step": 13310 + }, + { + "epoch": 0.444, + "grad_norm": 31.375, + "grad_norm_var": 5.693684895833333, + "learning_rate": 0.0001, + "loss": 8.0126, + "loss/crossentropy": 2.1816250920295714, + "loss/hidden": 3.725, + "loss/jsd": 0.0, + "loss/logits": 0.23028801158070564, + "step": 13320 + }, + { + "epoch": 0.44433333333333336, + "grad_norm": 35.25, + "grad_norm_var": 5.933072916666666, + "learning_rate": 0.0001, + "loss": 8.0725, + "loss/crossentropy": 2.0108284398913385, + "loss/hidden": 3.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.2190965536981821, + "step": 13330 + }, + { + "epoch": 0.44466666666666665, + "grad_norm": 30.5, + "grad_norm_var": 9.8134765625, + "learning_rate": 0.0001, + "loss": 7.9707, + "loss/crossentropy": 2.1558491311967374, + "loss/hidden": 3.773828125, + "loss/jsd": 0.0, + "loss/logits": 0.20952776670455933, + "step": 13340 + }, + { + "epoch": 0.445, + "grad_norm": 29.875, + "grad_norm_var": 10.82890625, + "learning_rate": 0.0001, + "loss": 8.1556, + "loss/crossentropy": 2.0820034801959992, + "loss/hidden": 3.894921875, + "loss/jsd": 0.0, + "loss/logits": 0.25748000014573336, + "step": 13350 + }, + { + "epoch": 0.44533333333333336, + "grad_norm": 32.0, + "grad_norm_var": 20.269791666666666, + "learning_rate": 0.0001, + "loss": 8.1191, + "loss/crossentropy": 2.0460492126643657, + "loss/hidden": 3.7515625, + "loss/jsd": 0.0, + "loss/logits": 0.22600278463214635, + "step": 13360 + }, + { + "epoch": 0.44566666666666666, + "grad_norm": 27.625, + "grad_norm_var": 14.958072916666667, + "learning_rate": 0.0001, + "loss": 8.042, + "loss/crossentropy": 2.1917740404605865, + "loss/hidden": 3.61875, + "loss/jsd": 0.0, + "loss/logits": 0.20980511526577175, + "step": 13370 + }, + { + "epoch": 0.446, + "grad_norm": 33.25, + "grad_norm_var": 16.076497395833332, + "learning_rate": 0.0001, + "loss": 8.0894, + "loss/crossentropy": 2.2024613440036775, + "loss/hidden": 3.649609375, + "loss/jsd": 0.0, + "loss/logits": 0.2355958294123411, + "step": 13380 + }, + { + "epoch": 0.44633333333333336, + "grad_norm": 33.5, + "grad_norm_var": 17.540625, + "learning_rate": 0.0001, + "loss": 7.9545, + "loss/crossentropy": 1.9565787248313427, + "loss/hidden": 3.79765625, + "loss/jsd": 0.0, + "loss/logits": 0.21949218455702066, + "step": 13390 + }, + { + "epoch": 0.44666666666666666, + "grad_norm": 33.75, + "grad_norm_var": 20.5400390625, + "learning_rate": 0.0001, + "loss": 7.9703, + "loss/crossentropy": 2.0982728376984596, + "loss/hidden": 3.64140625, + "loss/jsd": 0.0, + "loss/logits": 0.20750523190945386, + "step": 13400 + }, + { + "epoch": 0.447, + "grad_norm": 30.0, + "grad_norm_var": 5.753580729166667, + "learning_rate": 0.0001, + "loss": 7.9816, + "loss/crossentropy": 2.1122142657637597, + "loss/hidden": 3.613671875, + "loss/jsd": 0.0, + "loss/logits": 0.22008793614804745, + "step": 13410 + }, + { + "epoch": 0.44733333333333336, + "grad_norm": 35.25, + "grad_norm_var": 5.796875, + "learning_rate": 0.0001, + "loss": 8.0286, + "loss/crossentropy": 2.07668551504612, + "loss/hidden": 3.783203125, + "loss/jsd": 0.0, + "loss/logits": 0.21882363129407167, + "step": 13420 + }, + { + "epoch": 0.44766666666666666, + "grad_norm": 28.125, + "grad_norm_var": 7.4822265625, + "learning_rate": 0.0001, + "loss": 7.8724, + "loss/crossentropy": 2.0558082655072214, + "loss/hidden": 3.616796875, + "loss/jsd": 0.0, + "loss/logits": 0.20779120028018952, + "step": 13430 + }, + { + "epoch": 0.448, + "grad_norm": 31.25, + "grad_norm_var": 43.58743489583333, + "learning_rate": 0.0001, + "loss": 8.0085, + "loss/crossentropy": 2.0572322949767115, + "loss/hidden": 3.719140625, + "loss/jsd": 0.0, + "loss/logits": 0.22204263061285018, + "step": 13440 + }, + { + "epoch": 0.4483333333333333, + "grad_norm": 31.125, + "grad_norm_var": 23.96640625, + "learning_rate": 0.0001, + "loss": 7.9379, + "loss/crossentropy": 2.07759770154953, + "loss/hidden": 3.603515625, + "loss/jsd": 0.0, + "loss/logits": 0.21129580233246087, + "step": 13450 + }, + { + "epoch": 0.44866666666666666, + "grad_norm": 33.25, + "grad_norm_var": 18.164322916666666, + "learning_rate": 0.0001, + "loss": 8.0347, + "loss/crossentropy": 2.015417565405369, + "loss/hidden": 3.744140625, + "loss/jsd": 0.0, + "loss/logits": 0.22753252387046813, + "step": 13460 + }, + { + "epoch": 0.449, + "grad_norm": 29.25, + "grad_norm_var": 25.124739583333334, + "learning_rate": 0.0001, + "loss": 8.1289, + "loss/crossentropy": 2.05652796626091, + "loss/hidden": 3.781640625, + "loss/jsd": 0.0, + "loss/logits": 0.24078646618872881, + "step": 13470 + }, + { + "epoch": 0.4493333333333333, + "grad_norm": 29.625, + "grad_norm_var": 24.924934895833335, + "learning_rate": 0.0001, + "loss": 8.0484, + "loss/crossentropy": 2.1728298760950566, + "loss/hidden": 3.636328125, + "loss/jsd": 0.0, + "loss/logits": 0.2125161023810506, + "step": 13480 + }, + { + "epoch": 0.44966666666666666, + "grad_norm": 30.875, + "grad_norm_var": 17.8150390625, + "learning_rate": 0.0001, + "loss": 7.8773, + "loss/crossentropy": 2.021086546033621, + "loss/hidden": 3.686328125, + "loss/jsd": 0.0, + "loss/logits": 0.2219966731965542, + "step": 13490 + }, + { + "epoch": 0.45, + "grad_norm": 43.25, + "grad_norm_var": 20.687955729166667, + "learning_rate": 0.0001, + "loss": 8.0709, + "loss/crossentropy": 1.9942471355199813, + "loss/hidden": 3.686328125, + "loss/jsd": 0.0, + "loss/logits": 0.24962616860866546, + "step": 13500 + }, + { + "epoch": 0.4503333333333333, + "grad_norm": 30.375, + "grad_norm_var": 19.948372395833335, + "learning_rate": 0.0001, + "loss": 8.031, + "loss/crossentropy": 2.1690925747156142, + "loss/hidden": 3.634765625, + "loss/jsd": 0.0, + "loss/logits": 0.2215597040951252, + "step": 13510 + }, + { + "epoch": 0.45066666666666666, + "grad_norm": 32.25, + "grad_norm_var": 11.6369140625, + "learning_rate": 0.0001, + "loss": 7.948, + "loss/crossentropy": 2.134739102423191, + "loss/hidden": 3.713671875, + "loss/jsd": 0.0, + "loss/logits": 0.2251646015793085, + "step": 13520 + }, + { + "epoch": 0.451, + "grad_norm": 38.25, + "grad_norm_var": 11.8212890625, + "learning_rate": 0.0001, + "loss": 7.8974, + "loss/crossentropy": 2.251084867119789, + "loss/hidden": 3.6875, + "loss/jsd": 0.0, + "loss/logits": 0.214023519679904, + "step": 13530 + }, + { + "epoch": 0.4513333333333333, + "grad_norm": 26.5, + "grad_norm_var": 9.022916666666667, + "learning_rate": 0.0001, + "loss": 7.8569, + "loss/crossentropy": 2.065364643931389, + "loss/hidden": 3.59296875, + "loss/jsd": 0.0, + "loss/logits": 0.2112195746973157, + "step": 13540 + }, + { + "epoch": 0.45166666666666666, + "grad_norm": 31.875, + "grad_norm_var": 6.6087890625, + "learning_rate": 0.0001, + "loss": 8.094, + "loss/crossentropy": 2.207910177111626, + "loss/hidden": 3.665234375, + "loss/jsd": 0.0, + "loss/logits": 0.2230087785050273, + "step": 13550 + }, + { + "epoch": 0.452, + "grad_norm": 32.25, + "grad_norm_var": 2.4344770479298447e+18, + "learning_rate": 0.0001, + "loss": 8.0482, + "loss/crossentropy": 2.0523271694779397, + "loss/hidden": 3.74296875, + "loss/jsd": 0.0, + "loss/logits": 0.2225760780274868, + "step": 13560 + }, + { + "epoch": 0.4523333333333333, + "grad_norm": 29.125, + "grad_norm_var": 2.4344770485864627e+18, + "learning_rate": 0.0001, + "loss": 8.008, + "loss/crossentropy": 2.1910267025232315, + "loss/hidden": 3.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.2098704855889082, + "step": 13570 + }, + { + "epoch": 0.45266666666666666, + "grad_norm": 29.75, + "grad_norm_var": 10.664583333333333, + "learning_rate": 0.0001, + "loss": 8.1107, + "loss/crossentropy": 2.2485829517245293, + "loss/hidden": 3.705078125, + "loss/jsd": 0.0, + "loss/logits": 0.23670779224485158, + "step": 13580 + }, + { + "epoch": 0.453, + "grad_norm": 29.375, + "grad_norm_var": 11.153125, + "learning_rate": 0.0001, + "loss": 8.0307, + "loss/crossentropy": 2.176164289563894, + "loss/hidden": 3.590625, + "loss/jsd": 0.0, + "loss/logits": 0.216172288171947, + "step": 13590 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 28.5, + "grad_norm_var": 4.143489583333333, + "learning_rate": 0.0001, + "loss": 7.9631, + "loss/crossentropy": 2.0201455272734163, + "loss/hidden": 3.5984375, + "loss/jsd": 0.0, + "loss/logits": 0.20862020272761583, + "step": 13600 + }, + { + "epoch": 0.45366666666666666, + "grad_norm": 32.25, + "grad_norm_var": 13.143489583333333, + "learning_rate": 0.0001, + "loss": 8.0374, + "loss/crossentropy": 2.041334181651473, + "loss/hidden": 3.77578125, + "loss/jsd": 0.0, + "loss/logits": 0.22034290386363864, + "step": 13610 + }, + { + "epoch": 0.454, + "grad_norm": 29.75, + "grad_norm_var": 2.6327473958333334, + "learning_rate": 0.0001, + "loss": 8.0312, + "loss/crossentropy": 2.2032358795404434, + "loss/hidden": 3.691796875, + "loss/jsd": 0.0, + "loss/logits": 0.23516745883971452, + "step": 13620 + }, + { + "epoch": 0.4543333333333333, + "grad_norm": 34.0, + "grad_norm_var": 10.5134765625, + "learning_rate": 0.0001, + "loss": 8.0251, + "loss/crossentropy": 2.212277019023895, + "loss/hidden": 3.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.24109804332256318, + "step": 13630 + }, + { + "epoch": 0.45466666666666666, + "grad_norm": 29.375, + "grad_norm_var": 10.6587890625, + "learning_rate": 0.0001, + "loss": 7.9654, + "loss/crossentropy": 2.0316985830664636, + "loss/hidden": 3.719921875, + "loss/jsd": 0.0, + "loss/logits": 0.22388876546174288, + "step": 13640 + }, + { + "epoch": 0.455, + "grad_norm": 27.25, + "grad_norm_var": 15.3166015625, + "learning_rate": 0.0001, + "loss": 7.9874, + "loss/crossentropy": 1.9860161900520326, + "loss/hidden": 3.616796875, + "loss/jsd": 0.0, + "loss/logits": 0.2077900541946292, + "step": 13650 + }, + { + "epoch": 0.4553333333333333, + "grad_norm": 30.125, + "grad_norm_var": 22.734830729166667, + "learning_rate": 0.0001, + "loss": 8.0461, + "loss/crossentropy": 2.2462501987814902, + "loss/hidden": 3.64921875, + "loss/jsd": 0.0, + "loss/logits": 0.2214917227625847, + "step": 13660 + }, + { + "epoch": 0.45566666666666666, + "grad_norm": 30.875, + "grad_norm_var": 10.469205729166667, + "learning_rate": 0.0001, + "loss": 7.9203, + "loss/crossentropy": 1.9449552714824676, + "loss/hidden": 3.7953125, + "loss/jsd": 0.0, + "loss/logits": 0.21775004733353853, + "step": 13670 + }, + { + "epoch": 0.456, + "grad_norm": 32.5, + "grad_norm_var": 3.26640625, + "learning_rate": 0.0001, + "loss": 8.0556, + "loss/crossentropy": 2.0170522332191467, + "loss/hidden": 3.7125, + "loss/jsd": 0.0, + "loss/logits": 0.2343472855165601, + "step": 13680 + }, + { + "epoch": 0.4563333333333333, + "grad_norm": 32.75, + "grad_norm_var": 4.356184895833334, + "learning_rate": 0.0001, + "loss": 7.9995, + "loss/crossentropy": 2.104297934472561, + "loss/hidden": 3.646875, + "loss/jsd": 0.0, + "loss/logits": 0.21948921959847212, + "step": 13690 + }, + { + "epoch": 0.45666666666666667, + "grad_norm": 30.875, + "grad_norm_var": 3.4082682291666666, + "learning_rate": 0.0001, + "loss": 7.9706, + "loss/crossentropy": 2.0912899121642115, + "loss/hidden": 3.707421875, + "loss/jsd": 0.0, + "loss/logits": 0.21979312859475614, + "step": 13700 + }, + { + "epoch": 0.457, + "grad_norm": 30.75, + "grad_norm_var": 3.9270833333333335, + "learning_rate": 0.0001, + "loss": 7.9165, + "loss/crossentropy": 2.121154861152172, + "loss/hidden": 3.65546875, + "loss/jsd": 0.0, + "loss/logits": 0.2177841143682599, + "step": 13710 + }, + { + "epoch": 0.4573333333333333, + "grad_norm": 36.5, + "grad_norm_var": 5.361393229166667, + "learning_rate": 0.0001, + "loss": 8.0071, + "loss/crossentropy": 2.136076480150223, + "loss/hidden": 3.635546875, + "loss/jsd": 0.0, + "loss/logits": 0.23270511198788882, + "step": 13720 + }, + { + "epoch": 0.45766666666666667, + "grad_norm": 29.625, + "grad_norm_var": 7.4634765625, + "learning_rate": 0.0001, + "loss": 8.0277, + "loss/crossentropy": 2.091973701864481, + "loss/hidden": 3.5796875, + "loss/jsd": 0.0, + "loss/logits": 0.20484627303667366, + "step": 13730 + }, + { + "epoch": 0.458, + "grad_norm": 30.625, + "grad_norm_var": 3.123893229166667, + "learning_rate": 0.0001, + "loss": 7.9874, + "loss/crossentropy": 2.2389348953962327, + "loss/hidden": 3.6984375, + "loss/jsd": 0.0, + "loss/logits": 0.2297331139445305, + "step": 13740 + }, + { + "epoch": 0.4583333333333333, + "grad_norm": 30.875, + "grad_norm_var": 3.8634765625, + "learning_rate": 0.0001, + "loss": 8.2201, + "loss/crossentropy": 2.1332207426428793, + "loss/hidden": 3.782421875, + "loss/jsd": 0.0, + "loss/logits": 0.24505181200802326, + "step": 13750 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 31.125, + "grad_norm_var": 2.3067057291666666, + "learning_rate": 0.0001, + "loss": 7.8715, + "loss/crossentropy": 1.9607113853096962, + "loss/hidden": 3.636328125, + "loss/jsd": 0.0, + "loss/logits": 0.21614569872617723, + "step": 13760 + }, + { + "epoch": 0.459, + "grad_norm": 30.75, + "grad_norm_var": 3.914322916666667, + "learning_rate": 0.0001, + "loss": 8.024, + "loss/crossentropy": 2.0893411085009577, + "loss/hidden": 3.710546875, + "loss/jsd": 0.0, + "loss/logits": 0.22553631979972125, + "step": 13770 + }, + { + "epoch": 0.4593333333333333, + "grad_norm": 35.5, + "grad_norm_var": 31.7822265625, + "learning_rate": 0.0001, + "loss": 8.0481, + "loss/crossentropy": 2.132881796360016, + "loss/hidden": 3.697265625, + "loss/jsd": 0.0, + "loss/logits": 0.23121243454515933, + "step": 13780 + }, + { + "epoch": 0.45966666666666667, + "grad_norm": 30.875, + "grad_norm_var": 30.0931640625, + "learning_rate": 0.0001, + "loss": 8.2036, + "loss/crossentropy": 2.3052436083555223, + "loss/hidden": 3.640234375, + "loss/jsd": 0.0, + "loss/logits": 0.23779372237622737, + "step": 13790 + }, + { + "epoch": 0.46, + "grad_norm": 31.375, + "grad_norm_var": 3.6020182291666667, + "learning_rate": 0.0001, + "loss": 8.0591, + "loss/crossentropy": 2.032654400169849, + "loss/hidden": 3.706640625, + "loss/jsd": 0.0, + "loss/logits": 0.22621268928050994, + "step": 13800 + }, + { + "epoch": 0.4603333333333333, + "grad_norm": 30.125, + "grad_norm_var": 13.4212890625, + "learning_rate": 0.0001, + "loss": 8.158, + "loss/crossentropy": 2.011768199503422, + "loss/hidden": 3.775390625, + "loss/jsd": 0.0, + "loss/logits": 0.21962493509054185, + "step": 13810 + }, + { + "epoch": 0.46066666666666667, + "grad_norm": 33.5, + "grad_norm_var": 13.245247395833333, + "learning_rate": 0.0001, + "loss": 7.9673, + "loss/crossentropy": 2.0957317486405374, + "loss/hidden": 3.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.21697306856513024, + "step": 13820 + }, + { + "epoch": 0.461, + "grad_norm": 30.875, + "grad_norm_var": 5.01640625, + "learning_rate": 0.0001, + "loss": 8.0855, + "loss/crossentropy": 2.2701291263103487, + "loss/hidden": 3.691796875, + "loss/jsd": 0.0, + "loss/logits": 0.2501831637695432, + "step": 13830 + }, + { + "epoch": 0.4613333333333333, + "grad_norm": 38.5, + "grad_norm_var": 6.1462890625, + "learning_rate": 0.0001, + "loss": 7.9714, + "loss/crossentropy": 2.0974492438137533, + "loss/hidden": 3.861328125, + "loss/jsd": 0.0, + "loss/logits": 0.2554807654581964, + "step": 13840 + }, + { + "epoch": 0.46166666666666667, + "grad_norm": 31.75, + "grad_norm_var": 5.338997395833333, + "learning_rate": 0.0001, + "loss": 7.9723, + "loss/crossentropy": 2.0560192227363587, + "loss/hidden": 3.7453125, + "loss/jsd": 0.0, + "loss/logits": 0.22991488091647624, + "step": 13850 + }, + { + "epoch": 0.462, + "grad_norm": 33.75, + "grad_norm_var": 3.0452473958333335, + "learning_rate": 0.0001, + "loss": 8.1789, + "loss/crossentropy": 2.1576769910752773, + "loss/hidden": 3.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.23487880751490592, + "step": 13860 + }, + { + "epoch": 0.4623333333333333, + "grad_norm": 36.0, + "grad_norm_var": 5.680208333333334, + "learning_rate": 0.0001, + "loss": 7.9999, + "loss/crossentropy": 2.061297869682312, + "loss/hidden": 3.701171875, + "loss/jsd": 0.0, + "loss/logits": 0.24054675735533237, + "step": 13870 + }, + { + "epoch": 0.46266666666666667, + "grad_norm": 29.75, + "grad_norm_var": 2.9879557291666665, + "learning_rate": 0.0001, + "loss": 8.0911, + "loss/crossentropy": 2.0008441783487796, + "loss/hidden": 3.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.22590927435085179, + "step": 13880 + }, + { + "epoch": 0.463, + "grad_norm": 31.0, + "grad_norm_var": 4.104622395833333, + "learning_rate": 0.0001, + "loss": 7.9195, + "loss/crossentropy": 2.132499638199806, + "loss/hidden": 3.634765625, + "loss/jsd": 0.0, + "loss/logits": 0.2145272171124816, + "step": 13890 + }, + { + "epoch": 0.4633333333333333, + "grad_norm": 35.25, + "grad_norm_var": 5.206705729166667, + "learning_rate": 0.0001, + "loss": 8.0658, + "loss/crossentropy": 2.259749516099691, + "loss/hidden": 3.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.2152292856015265, + "step": 13900 + }, + { + "epoch": 0.46366666666666667, + "grad_norm": 31.75, + "grad_norm_var": 3.825455729166667, + "learning_rate": 0.0001, + "loss": 7.924, + "loss/crossentropy": 2.1031736478209497, + "loss/hidden": 3.691796875, + "loss/jsd": 0.0, + "loss/logits": 0.23380641918629408, + "step": 13910 + }, + { + "epoch": 0.464, + "grad_norm": 32.75, + "grad_norm_var": 3.6497395833333335, + "learning_rate": 0.0001, + "loss": 8.152, + "loss/crossentropy": 2.0122231051325796, + "loss/hidden": 3.760546875, + "loss/jsd": 0.0, + "loss/logits": 0.22742101289331912, + "step": 13920 + }, + { + "epoch": 0.4643333333333333, + "grad_norm": 33.25, + "grad_norm_var": 4.8625, + "learning_rate": 0.0001, + "loss": 8.0922, + "loss/crossentropy": 2.1263721615076063, + "loss/hidden": 3.7046875, + "loss/jsd": 0.0, + "loss/logits": 0.22606851048767568, + "step": 13930 + }, + { + "epoch": 0.4646666666666667, + "grad_norm": 30.5, + "grad_norm_var": 2.9322265625, + "learning_rate": 0.0001, + "loss": 8.1061, + "loss/crossentropy": 2.079650565981865, + "loss/hidden": 3.73203125, + "loss/jsd": 0.0, + "loss/logits": 0.2313553038984537, + "step": 13940 + }, + { + "epoch": 0.465, + "grad_norm": 30.125, + "grad_norm_var": 5.832747395833334, + "learning_rate": 0.0001, + "loss": 7.9564, + "loss/crossentropy": 2.284359359741211, + "loss/hidden": 3.587109375, + "loss/jsd": 0.0, + "loss/logits": 0.22488110959529878, + "step": 13950 + }, + { + "epoch": 0.4653333333333333, + "grad_norm": 30.75, + "grad_norm_var": 2.8223307291666666, + "learning_rate": 0.0001, + "loss": 8.0784, + "loss/crossentropy": 1.9363142460584641, + "loss/hidden": 3.709375, + "loss/jsd": 0.0, + "loss/logits": 0.22093999302014708, + "step": 13960 + }, + { + "epoch": 0.4656666666666667, + "grad_norm": 29.625, + "grad_norm_var": 16.656184895833334, + "learning_rate": 0.0001, + "loss": 8.1271, + "loss/crossentropy": 2.150686714053154, + "loss/hidden": 3.700390625, + "loss/jsd": 0.0, + "loss/logits": 0.2396129213273525, + "step": 13970 + }, + { + "epoch": 0.466, + "grad_norm": 30.5, + "grad_norm_var": 1.6785807291666666, + "learning_rate": 0.0001, + "loss": 8.0096, + "loss/crossentropy": 2.039792370796204, + "loss/hidden": 3.678515625, + "loss/jsd": 0.0, + "loss/logits": 0.23292775694280862, + "step": 13980 + }, + { + "epoch": 0.4663333333333333, + "grad_norm": 30.75, + "grad_norm_var": 3.801497395833333, + "learning_rate": 0.0001, + "loss": 7.9907, + "loss/crossentropy": 2.0111122995615007, + "loss/hidden": 3.690234375, + "loss/jsd": 0.0, + "loss/logits": 0.22928319536149502, + "step": 13990 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 38.0, + "grad_norm_var": 5.3822265625, + "learning_rate": 0.0001, + "loss": 7.9428, + "loss/crossentropy": 2.140259427577257, + "loss/hidden": 3.747265625, + "loss/jsd": 0.0, + "loss/logits": 0.23131428118795155, + "step": 14000 + }, + { + "epoch": 0.467, + "grad_norm": 36.25, + "grad_norm_var": 6.494791666666667, + "learning_rate": 0.0001, + "loss": 8.0925, + "loss/crossentropy": 2.0670676976442337, + "loss/hidden": 3.81953125, + "loss/jsd": 0.0, + "loss/logits": 0.2178192425519228, + "step": 14010 + }, + { + "epoch": 0.4673333333333333, + "grad_norm": 34.25, + "grad_norm_var": 4.870572916666666, + "learning_rate": 0.0001, + "loss": 8.002, + "loss/crossentropy": 2.1283276975154877, + "loss/hidden": 3.64140625, + "loss/jsd": 0.0, + "loss/logits": 0.2105252131819725, + "step": 14020 + }, + { + "epoch": 0.4676666666666667, + "grad_norm": 29.5, + "grad_norm_var": 2.8785807291666665, + "learning_rate": 0.0001, + "loss": 7.9375, + "loss/crossentropy": 2.189284147322178, + "loss/hidden": 3.700390625, + "loss/jsd": 0.0, + "loss/logits": 0.23593165911734104, + "step": 14030 + }, + { + "epoch": 0.468, + "grad_norm": 33.0, + "grad_norm_var": 1.7705729166666666, + "learning_rate": 0.0001, + "loss": 7.8595, + "loss/crossentropy": 2.0938302144408225, + "loss/hidden": 3.683203125, + "loss/jsd": 0.0, + "loss/logits": 0.2286398086696863, + "step": 14040 + }, + { + "epoch": 0.4683333333333333, + "grad_norm": 32.25, + "grad_norm_var": 7.680208333333334, + "learning_rate": 0.0001, + "loss": 7.8761, + "loss/crossentropy": 2.1059680595993995, + "loss/hidden": 3.79921875, + "loss/jsd": 0.0, + "loss/logits": 0.22907451894134284, + "step": 14050 + }, + { + "epoch": 0.4686666666666667, + "grad_norm": 31.375, + "grad_norm_var": 15.355208333333334, + "learning_rate": 0.0001, + "loss": 8.0853, + "loss/crossentropy": 2.011585946381092, + "loss/hidden": 3.675390625, + "loss/jsd": 0.0, + "loss/logits": 0.23615019097924234, + "step": 14060 + }, + { + "epoch": 0.469, + "grad_norm": 31.875, + "grad_norm_var": 10.226041666666667, + "learning_rate": 0.0001, + "loss": 8.0453, + "loss/crossentropy": 2.1759623274207116, + "loss/hidden": 3.6734375, + "loss/jsd": 0.0, + "loss/logits": 0.23897310364991425, + "step": 14070 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 28.875, + "grad_norm_var": 3.2926432291666665, + "learning_rate": 0.0001, + "loss": 7.9501, + "loss/crossentropy": 2.1067129537463187, + "loss/hidden": 3.684765625, + "loss/jsd": 0.0, + "loss/logits": 0.21687035337090493, + "step": 14080 + }, + { + "epoch": 0.4696666666666667, + "grad_norm": 32.75, + "grad_norm_var": 10.553125, + "learning_rate": 0.0001, + "loss": 7.8951, + "loss/crossentropy": 2.073711508512497, + "loss/hidden": 3.58359375, + "loss/jsd": 0.0, + "loss/logits": 0.20740561783313752, + "step": 14090 + }, + { + "epoch": 0.47, + "grad_norm": 42.75, + "grad_norm_var": 11.9791015625, + "learning_rate": 0.0001, + "loss": 8.0888, + "loss/crossentropy": 2.1998244017362594, + "loss/hidden": 3.66171875, + "loss/jsd": 0.0, + "loss/logits": 0.24048520512878896, + "step": 14100 + }, + { + "epoch": 0.4703333333333333, + "grad_norm": 35.75, + "grad_norm_var": 12.2572265625, + "learning_rate": 0.0001, + "loss": 7.9691, + "loss/crossentropy": 1.9739395514130593, + "loss/hidden": 3.6375, + "loss/jsd": 0.0, + "loss/logits": 0.20827601440250873, + "step": 14110 + }, + { + "epoch": 0.4706666666666667, + "grad_norm": 30.375, + "grad_norm_var": 7.487239583333333, + "learning_rate": 0.0001, + "loss": 8.0045, + "loss/crossentropy": 1.9031679958105088, + "loss/hidden": 3.577734375, + "loss/jsd": 0.0, + "loss/logits": 0.22086440566927196, + "step": 14120 + }, + { + "epoch": 0.471, + "grad_norm": 29.375, + "grad_norm_var": 2.3363932291666667, + "learning_rate": 0.0001, + "loss": 7.9829, + "loss/crossentropy": 2.1658167608082293, + "loss/hidden": 3.58125, + "loss/jsd": 0.0, + "loss/logits": 0.21573278903961182, + "step": 14130 + }, + { + "epoch": 0.4713333333333333, + "grad_norm": 31.125, + "grad_norm_var": 5.509375, + "learning_rate": 0.0001, + "loss": 8.0107, + "loss/crossentropy": 2.1494806349277495, + "loss/hidden": 3.708203125, + "loss/jsd": 0.0, + "loss/logits": 0.23198096118867398, + "step": 14140 + }, + { + "epoch": 0.4716666666666667, + "grad_norm": 32.75, + "grad_norm_var": 7.82890625, + "learning_rate": 0.0001, + "loss": 7.9608, + "loss/crossentropy": 2.176995001733303, + "loss/hidden": 3.669921875, + "loss/jsd": 0.0, + "loss/logits": 0.22184601295739412, + "step": 14150 + }, + { + "epoch": 0.472, + "grad_norm": 31.5, + "grad_norm_var": 7.715559895833334, + "learning_rate": 0.0001, + "loss": 7.9168, + "loss/crossentropy": 2.1046732500195504, + "loss/hidden": 3.767578125, + "loss/jsd": 0.0, + "loss/logits": 0.2493480734527111, + "step": 14160 + }, + { + "epoch": 0.4723333333333333, + "grad_norm": 34.5, + "grad_norm_var": 4.651822916666666, + "learning_rate": 0.0001, + "loss": 8.0195, + "loss/crossentropy": 2.0825782030820847, + "loss/hidden": 3.859765625, + "loss/jsd": 0.0, + "loss/logits": 0.21776366755366325, + "step": 14170 + }, + { + "epoch": 0.4726666666666667, + "grad_norm": 30.25, + "grad_norm_var": 2.594073359300323e+18, + "learning_rate": 0.0001, + "loss": 7.9941, + "loss/crossentropy": 2.0367950215935706, + "loss/hidden": 3.656640625, + "loss/jsd": 0.0, + "loss/logits": 0.21370016261935235, + "step": 14180 + }, + { + "epoch": 0.473, + "grad_norm": 31.375, + "grad_norm_var": 8.9119140625, + "learning_rate": 0.0001, + "loss": 7.9755, + "loss/crossentropy": 2.044953337311745, + "loss/hidden": 3.748828125, + "loss/jsd": 0.0, + "loss/logits": 0.23148317448794842, + "step": 14190 + }, + { + "epoch": 0.47333333333333333, + "grad_norm": 33.0, + "grad_norm_var": 3.4613932291666667, + "learning_rate": 0.0001, + "loss": 8.0085, + "loss/crossentropy": 2.0775508999824526, + "loss/hidden": 3.666796875, + "loss/jsd": 0.0, + "loss/logits": 0.21293406821787358, + "step": 14200 + }, + { + "epoch": 0.4736666666666667, + "grad_norm": 34.0, + "grad_norm_var": 3.6113932291666666, + "learning_rate": 0.0001, + "loss": 7.9466, + "loss/crossentropy": 2.060066529363394, + "loss/hidden": 3.63125, + "loss/jsd": 0.0, + "loss/logits": 0.21972225215286018, + "step": 14210 + }, + { + "epoch": 0.474, + "grad_norm": 29.625, + "grad_norm_var": 14.949739583333333, + "learning_rate": 0.0001, + "loss": 7.9651, + "loss/crossentropy": 2.1454847924411298, + "loss/hidden": 3.675390625, + "loss/jsd": 0.0, + "loss/logits": 0.23835664317011834, + "step": 14220 + }, + { + "epoch": 0.47433333333333333, + "grad_norm": 30.625, + "grad_norm_var": 23.971809895833335, + "learning_rate": 0.0001, + "loss": 7.9706, + "loss/crossentropy": 2.0558887153863905, + "loss/hidden": 3.663671875, + "loss/jsd": 0.0, + "loss/logits": 0.22023830823600293, + "step": 14230 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 30.75, + "grad_norm_var": 5.146875, + "learning_rate": 0.0001, + "loss": 7.9699, + "loss/crossentropy": 2.0701268397271635, + "loss/hidden": 3.599609375, + "loss/jsd": 0.0, + "loss/logits": 0.21326845940202474, + "step": 14240 + }, + { + "epoch": 0.475, + "grad_norm": 31.875, + "grad_norm_var": 6.955989583333333, + "learning_rate": 0.0001, + "loss": 7.911, + "loss/crossentropy": 2.0048009738326074, + "loss/hidden": 3.685546875, + "loss/jsd": 0.0, + "loss/logits": 0.20537266619503497, + "step": 14250 + }, + { + "epoch": 0.47533333333333333, + "grad_norm": 31.0, + "grad_norm_var": 1.5291015625, + "learning_rate": 0.0001, + "loss": 8.0106, + "loss/crossentropy": 1.9276894822716713, + "loss/hidden": 3.639453125, + "loss/jsd": 0.0, + "loss/logits": 0.19781601782888175, + "step": 14260 + }, + { + "epoch": 0.4756666666666667, + "grad_norm": 29.625, + "grad_norm_var": 4.012955729166666, + "learning_rate": 0.0001, + "loss": 7.7983, + "loss/crossentropy": 2.127088063955307, + "loss/hidden": 3.523828125, + "loss/jsd": 0.0, + "loss/logits": 0.19607614930719136, + "step": 14270 + }, + { + "epoch": 0.476, + "grad_norm": 32.25, + "grad_norm_var": 4.188997395833334, + "learning_rate": 0.0001, + "loss": 7.859, + "loss/crossentropy": 2.0276701495051386, + "loss/hidden": 3.63515625, + "loss/jsd": 0.0, + "loss/logits": 0.22934147100895644, + "step": 14280 + }, + { + "epoch": 0.47633333333333333, + "grad_norm": 28.875, + "grad_norm_var": 15.313997395833333, + "learning_rate": 0.0001, + "loss": 7.9605, + "loss/crossentropy": 2.1230327248573304, + "loss/hidden": 3.80703125, + "loss/jsd": 0.0, + "loss/logits": 0.24875117875635624, + "step": 14290 + }, + { + "epoch": 0.4766666666666667, + "grad_norm": 30.75, + "grad_norm_var": 19.182291666666668, + "learning_rate": 0.0001, + "loss": 7.9298, + "loss/crossentropy": 2.065612518787384, + "loss/hidden": 3.6734375, + "loss/jsd": 0.0, + "loss/logits": 0.22124754767864943, + "step": 14300 + }, + { + "epoch": 0.477, + "grad_norm": 30.75, + "grad_norm_var": 10.710872395833333, + "learning_rate": 0.0001, + "loss": 7.812, + "loss/crossentropy": 2.04611222743988, + "loss/hidden": 3.694921875, + "loss/jsd": 0.0, + "loss/logits": 0.20459628701210023, + "step": 14310 + }, + { + "epoch": 0.47733333333333333, + "grad_norm": 31.25, + "grad_norm_var": 4.3697265625, + "learning_rate": 0.0001, + "loss": 8.1206, + "loss/crossentropy": 2.121537686884403, + "loss/hidden": 3.793359375, + "loss/jsd": 0.0, + "loss/logits": 0.2447736568748951, + "step": 14320 + }, + { + "epoch": 0.4776666666666667, + "grad_norm": 35.0, + "grad_norm_var": 6.0625, + "learning_rate": 0.0001, + "loss": 7.918, + "loss/crossentropy": 1.9219128280878066, + "loss/hidden": 3.623828125, + "loss/jsd": 0.0, + "loss/logits": 0.2076597328297794, + "step": 14330 + }, + { + "epoch": 0.478, + "grad_norm": 33.5, + "grad_norm_var": 9.618489583333334, + "learning_rate": 0.0001, + "loss": 7.8741, + "loss/crossentropy": 2.039739317446947, + "loss/hidden": 3.623046875, + "loss/jsd": 0.0, + "loss/logits": 0.21342823561280966, + "step": 14340 + }, + { + "epoch": 0.47833333333333333, + "grad_norm": 30.0, + "grad_norm_var": 10.0306640625, + "learning_rate": 0.0001, + "loss": 7.9885, + "loss/crossentropy": 2.163815528154373, + "loss/hidden": 3.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20537124276161195, + "step": 14350 + }, + { + "epoch": 0.4786666666666667, + "grad_norm": 32.25, + "grad_norm_var": 3.658072916666667, + "learning_rate": 0.0001, + "loss": 8.0153, + "loss/crossentropy": 2.087558428943157, + "loss/hidden": 3.76640625, + "loss/jsd": 0.0, + "loss/logits": 0.2245228797197342, + "step": 14360 + }, + { + "epoch": 0.479, + "grad_norm": 31.0, + "grad_norm_var": 1.5530598958333333, + "learning_rate": 0.0001, + "loss": 7.9594, + "loss/crossentropy": 2.2264768585562704, + "loss/hidden": 3.705078125, + "loss/jsd": 0.0, + "loss/logits": 0.23378158863633872, + "step": 14370 + }, + { + "epoch": 0.47933333333333333, + "grad_norm": 30.25, + "grad_norm_var": 1.8177083333333333, + "learning_rate": 0.0001, + "loss": 7.9091, + "loss/crossentropy": 2.1510671019554137, + "loss/hidden": 3.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.21997169237583875, + "step": 14380 + }, + { + "epoch": 0.4796666666666667, + "grad_norm": 30.0, + "grad_norm_var": 4.183072916666666, + "learning_rate": 0.0001, + "loss": 8.1278, + "loss/crossentropy": 1.996177999675274, + "loss/hidden": 3.83203125, + "loss/jsd": 0.0, + "loss/logits": 0.22189988046884537, + "step": 14390 + }, + { + "epoch": 0.48, + "grad_norm": 33.5, + "grad_norm_var": 6.430143229166666, + "learning_rate": 0.0001, + "loss": 8.0033, + "loss/crossentropy": 2.106117682904005, + "loss/hidden": 3.78125, + "loss/jsd": 0.0, + "loss/logits": 0.20855927262455226, + "step": 14400 + }, + { + "epoch": 0.48033333333333333, + "grad_norm": 31.125, + "grad_norm_var": 10.13515625, + "learning_rate": 0.0001, + "loss": 7.9176, + "loss/crossentropy": 2.1830692276358605, + "loss/hidden": 3.616796875, + "loss/jsd": 0.0, + "loss/logits": 0.2149069756269455, + "step": 14410 + }, + { + "epoch": 0.4806666666666667, + "grad_norm": 31.875, + "grad_norm_var": 8.387434895833334, + "learning_rate": 0.0001, + "loss": 8.0137, + "loss/crossentropy": 2.2273536786437034, + "loss/hidden": 3.727734375, + "loss/jsd": 0.0, + "loss/logits": 0.23837392879649996, + "step": 14420 + }, + { + "epoch": 0.481, + "grad_norm": 29.375, + "grad_norm_var": 7.53125, + "learning_rate": 0.0001, + "loss": 7.8473, + "loss/crossentropy": 2.0939138531684875, + "loss/hidden": 3.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.20872913114726543, + "step": 14430 + }, + { + "epoch": 0.48133333333333334, + "grad_norm": 30.375, + "grad_norm_var": 5.299934895833333, + "learning_rate": 0.0001, + "loss": 7.9815, + "loss/crossentropy": 2.166853901743889, + "loss/hidden": 3.783203125, + "loss/jsd": 0.0, + "loss/logits": 0.24097004868090152, + "step": 14440 + }, + { + "epoch": 0.4816666666666667, + "grad_norm": 32.0, + "grad_norm_var": 3.1468098958333335, + "learning_rate": 0.0001, + "loss": 8.0419, + "loss/crossentropy": 2.1047842048108576, + "loss/hidden": 3.592578125, + "loss/jsd": 0.0, + "loss/logits": 0.20935575142502785, + "step": 14450 + }, + { + "epoch": 0.482, + "grad_norm": 30.5, + "grad_norm_var": 3.626030986454421e+18, + "learning_rate": 0.0001, + "loss": 7.9523, + "loss/crossentropy": 2.057080474495888, + "loss/hidden": 3.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.2086722683161497, + "step": 14460 + }, + { + "epoch": 0.48233333333333334, + "grad_norm": 30.0, + "grad_norm_var": 103.56223958333334, + "learning_rate": 0.0001, + "loss": 7.8493, + "loss/crossentropy": 2.046734869480133, + "loss/hidden": 3.613671875, + "loss/jsd": 0.0, + "loss/logits": 0.21412031259387732, + "step": 14470 + }, + { + "epoch": 0.4826666666666667, + "grad_norm": 31.875, + "grad_norm_var": 101.18307291666666, + "learning_rate": 0.0001, + "loss": 7.9051, + "loss/crossentropy": 2.065974645316601, + "loss/hidden": 3.724609375, + "loss/jsd": 0.0, + "loss/logits": 0.23375277630984784, + "step": 14480 + }, + { + "epoch": 0.483, + "grad_norm": 31.25, + "grad_norm_var": 5.0541015625, + "learning_rate": 0.0001, + "loss": 7.8899, + "loss/crossentropy": 2.0884762033820152, + "loss/hidden": 3.63203125, + "loss/jsd": 0.0, + "loss/logits": 0.20585475508123635, + "step": 14490 + }, + { + "epoch": 0.48333333333333334, + "grad_norm": 32.25, + "grad_norm_var": 2.6499348958333333, + "learning_rate": 0.0001, + "loss": 7.9974, + "loss/crossentropy": 2.2915369153022764, + "loss/hidden": 3.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.2142224058508873, + "step": 14500 + }, + { + "epoch": 0.4836666666666667, + "grad_norm": 32.25, + "grad_norm_var": 6.114322916666667, + "learning_rate": 0.0001, + "loss": 8.0747, + "loss/crossentropy": 2.240130066871643, + "loss/hidden": 3.709375, + "loss/jsd": 0.0, + "loss/logits": 0.23856233302503824, + "step": 14510 + }, + { + "epoch": 0.484, + "grad_norm": 30.125, + "grad_norm_var": 6.818684895833333, + "learning_rate": 0.0001, + "loss": 8.0631, + "loss/crossentropy": 2.109334260225296, + "loss/hidden": 3.775390625, + "loss/jsd": 0.0, + "loss/logits": 0.22316622659564017, + "step": 14520 + }, + { + "epoch": 0.48433333333333334, + "grad_norm": 29.75, + "grad_norm_var": 2.5233723958333334, + "learning_rate": 0.0001, + "loss": 7.9728, + "loss/crossentropy": 2.1873391047120094, + "loss/hidden": 3.580078125, + "loss/jsd": 0.0, + "loss/logits": 0.213110950961709, + "step": 14530 + }, + { + "epoch": 0.4846666666666667, + "grad_norm": 177.0, + "grad_norm_var": 1326.0431640625, + "learning_rate": 0.0001, + "loss": 7.9814, + "loss/crossentropy": 2.0492543891072272, + "loss/hidden": 3.687890625, + "loss/jsd": 0.0, + "loss/logits": 0.22612165659666061, + "step": 14540 + }, + { + "epoch": 0.485, + "grad_norm": 35.0, + "grad_norm_var": 1308.9384765625, + "learning_rate": 0.0001, + "loss": 7.9937, + "loss/crossentropy": 2.0765153512358667, + "loss/hidden": 3.716796875, + "loss/jsd": 0.0, + "loss/logits": 0.21763208881020546, + "step": 14550 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 35.25, + "grad_norm_var": 7.351822916666666, + "learning_rate": 0.0001, + "loss": 7.9428, + "loss/crossentropy": 2.065848244726658, + "loss/hidden": 3.709375, + "loss/jsd": 0.0, + "loss/logits": 0.21093793530017138, + "step": 14560 + }, + { + "epoch": 0.4856666666666667, + "grad_norm": 34.75, + "grad_norm_var": 4.43125, + "learning_rate": 0.0001, + "loss": 8.1182, + "loss/crossentropy": 2.021928811073303, + "loss/hidden": 3.63046875, + "loss/jsd": 0.0, + "loss/logits": 0.2118746515363455, + "step": 14570 + }, + { + "epoch": 0.486, + "grad_norm": 35.25, + "grad_norm_var": 6.9916015625, + "learning_rate": 0.0001, + "loss": 8.0468, + "loss/crossentropy": 1.9547654077410699, + "loss/hidden": 3.819140625, + "loss/jsd": 0.0, + "loss/logits": 0.23913611695170403, + "step": 14580 + }, + { + "epoch": 0.48633333333333334, + "grad_norm": 33.25, + "grad_norm_var": 833.4160807291667, + "learning_rate": 0.0001, + "loss": 8.0743, + "loss/crossentropy": 1.9865302249789238, + "loss/hidden": 3.90703125, + "loss/jsd": 0.0, + "loss/logits": 0.23207287043333052, + "step": 14590 + }, + { + "epoch": 0.4866666666666667, + "grad_norm": 32.75, + "grad_norm_var": 11.039322916666666, + "learning_rate": 0.0001, + "loss": 8.1016, + "loss/crossentropy": 2.2250148117542268, + "loss/hidden": 3.704296875, + "loss/jsd": 0.0, + "loss/logits": 0.21750411633402109, + "step": 14600 + }, + { + "epoch": 0.487, + "grad_norm": 33.0, + "grad_norm_var": 3.9218098958333334, + "learning_rate": 0.0001, + "loss": 7.9989, + "loss/crossentropy": 2.0343859881162643, + "loss/hidden": 3.581640625, + "loss/jsd": 0.0, + "loss/logits": 0.20473443409428, + "step": 14610 + }, + { + "epoch": 0.48733333333333334, + "grad_norm": 33.5, + "grad_norm_var": 102.97858072916667, + "learning_rate": 0.0001, + "loss": 8.0228, + "loss/crossentropy": 2.1920588284730913, + "loss/hidden": 3.72578125, + "loss/jsd": 0.0, + "loss/logits": 0.2238582916557789, + "step": 14620 + }, + { + "epoch": 0.4876666666666667, + "grad_norm": 32.75, + "grad_norm_var": 106.72057291666667, + "learning_rate": 0.0001, + "loss": 7.9914, + "loss/crossentropy": 2.131534478068352, + "loss/hidden": 3.618359375, + "loss/jsd": 0.0, + "loss/logits": 0.21192469485104085, + "step": 14630 + }, + { + "epoch": 0.488, + "grad_norm": 37.0, + "grad_norm_var": 130.6650390625, + "learning_rate": 0.0001, + "loss": 8.0006, + "loss/crossentropy": 1.9975043579936027, + "loss/hidden": 3.716796875, + "loss/jsd": 0.0, + "loss/logits": 0.20322852581739426, + "step": 14640 + }, + { + "epoch": 0.48833333333333334, + "grad_norm": 30.375, + "grad_norm_var": 125.80833333333334, + "learning_rate": 0.0001, + "loss": 7.9713, + "loss/crossentropy": 2.1270855344831943, + "loss/hidden": 3.663671875, + "loss/jsd": 0.0, + "loss/logits": 0.220531555917114, + "step": 14650 + }, + { + "epoch": 0.4886666666666667, + "grad_norm": 31.5, + "grad_norm_var": 3.64765625, + "learning_rate": 0.0001, + "loss": 8.0117, + "loss/crossentropy": 2.089583569765091, + "loss/hidden": 3.6625, + "loss/jsd": 0.0, + "loss/logits": 0.22326747328042984, + "step": 14660 + }, + { + "epoch": 0.489, + "grad_norm": 30.75, + "grad_norm_var": 4.487434895833333, + "learning_rate": 0.0001, + "loss": 7.9531, + "loss/crossentropy": 1.9965915471315383, + "loss/hidden": 3.665625, + "loss/jsd": 0.0, + "loss/logits": 0.21961635909974575, + "step": 14670 + }, + { + "epoch": 0.48933333333333334, + "grad_norm": 38.0, + "grad_norm_var": 24.405989583333334, + "learning_rate": 0.0001, + "loss": 8.0753, + "loss/crossentropy": 2.1705081194639204, + "loss/hidden": 3.66484375, + "loss/jsd": 0.0, + "loss/logits": 0.21931095998734235, + "step": 14680 + }, + { + "epoch": 0.48966666666666664, + "grad_norm": 36.0, + "grad_norm_var": 12.178125, + "learning_rate": 0.0001, + "loss": 7.943, + "loss/crossentropy": 2.050249530375004, + "loss/hidden": 3.609375, + "loss/jsd": 0.0, + "loss/logits": 0.2148410253226757, + "step": 14690 + }, + { + "epoch": 0.49, + "grad_norm": 30.125, + "grad_norm_var": 4.697916666666667, + "learning_rate": 0.0001, + "loss": 7.9524, + "loss/crossentropy": 2.0615778759121897, + "loss/hidden": 3.587890625, + "loss/jsd": 0.0, + "loss/logits": 0.21635166741907597, + "step": 14700 + }, + { + "epoch": 0.49033333333333334, + "grad_norm": 37.0, + "grad_norm_var": 4.417643229166667, + "learning_rate": 0.0001, + "loss": 8.0631, + "loss/crossentropy": 2.2596657291054725, + "loss/hidden": 3.695703125, + "loss/jsd": 0.0, + "loss/logits": 0.23083409201353788, + "step": 14710 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 39.5, + "grad_norm_var": 16.4875, + "learning_rate": 0.0001, + "loss": 7.9577, + "loss/crossentropy": 2.0465092822909354, + "loss/hidden": 3.67890625, + "loss/jsd": 0.0, + "loss/logits": 0.22049062736332417, + "step": 14720 + }, + { + "epoch": 0.491, + "grad_norm": 30.0, + "grad_norm_var": 8.873893229166667, + "learning_rate": 0.0001, + "loss": 7.995, + "loss/crossentropy": 2.16557691693306, + "loss/hidden": 3.664453125, + "loss/jsd": 0.0, + "loss/logits": 0.21666408702731133, + "step": 14730 + }, + { + "epoch": 0.49133333333333334, + "grad_norm": 33.75, + "grad_norm_var": 5.620247395833333, + "learning_rate": 0.0001, + "loss": 7.9573, + "loss/crossentropy": 2.158591315150261, + "loss/hidden": 3.630859375, + "loss/jsd": 0.0, + "loss/logits": 0.22307645455002784, + "step": 14740 + }, + { + "epoch": 0.49166666666666664, + "grad_norm": 31.5, + "grad_norm_var": 9.358268229166667, + "learning_rate": 0.0001, + "loss": 7.9671, + "loss/crossentropy": 2.0228962182998655, + "loss/hidden": 3.733203125, + "loss/jsd": 0.0, + "loss/logits": 0.21425336729735137, + "step": 14750 + }, + { + "epoch": 0.492, + "grad_norm": 44.75, + "grad_norm_var": 16.083333333333332, + "learning_rate": 0.0001, + "loss": 7.9912, + "loss/crossentropy": 2.1796020001173018, + "loss/hidden": 3.570703125, + "loss/jsd": 0.0, + "loss/logits": 0.2131647277623415, + "step": 14760 + }, + { + "epoch": 0.49233333333333335, + "grad_norm": 38.0, + "grad_norm_var": 18.555989583333332, + "learning_rate": 0.0001, + "loss": 8.0265, + "loss/crossentropy": 2.079108493030071, + "loss/hidden": 3.732421875, + "loss/jsd": 0.0, + "loss/logits": 0.21902708765119314, + "step": 14770 + }, + { + "epoch": 0.49266666666666664, + "grad_norm": 30.875, + "grad_norm_var": 5.920247395833333, + "learning_rate": 0.0001, + "loss": 8.1693, + "loss/crossentropy": 2.2386298209428785, + "loss/hidden": 3.790234375, + "loss/jsd": 0.0, + "loss/logits": 0.24830550476908683, + "step": 14780 + }, + { + "epoch": 0.493, + "grad_norm": 37.5, + "grad_norm_var": 14.933333333333334, + "learning_rate": 0.0001, + "loss": 7.9356, + "loss/crossentropy": 2.29894140958786, + "loss/hidden": 3.671484375, + "loss/jsd": 0.0, + "loss/logits": 0.21857519987970592, + "step": 14790 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 38.5, + "grad_norm_var": 19.596809895833335, + "learning_rate": 0.0001, + "loss": 7.993, + "loss/crossentropy": 2.003948637843132, + "loss/hidden": 3.584765625, + "loss/jsd": 0.0, + "loss/logits": 0.20476762484759092, + "step": 14800 + }, + { + "epoch": 0.49366666666666664, + "grad_norm": 31.625, + "grad_norm_var": 9.97890625, + "learning_rate": 0.0001, + "loss": 8.0814, + "loss/crossentropy": 2.151717406511307, + "loss/hidden": 3.7, + "loss/jsd": 0.0, + "loss/logits": 0.22604979574680328, + "step": 14810 + }, + { + "epoch": 0.494, + "grad_norm": 37.5, + "grad_norm_var": 8.230989583333333, + "learning_rate": 0.0001, + "loss": 8.1202, + "loss/crossentropy": 1.9646364904940128, + "loss/hidden": 3.797265625, + "loss/jsd": 0.0, + "loss/logits": 0.24005853114649653, + "step": 14820 + }, + { + "epoch": 0.49433333333333335, + "grad_norm": 29.875, + "grad_norm_var": 10.67890625, + "learning_rate": 0.0001, + "loss": 8.032, + "loss/crossentropy": 2.162761890888214, + "loss/hidden": 3.69375, + "loss/jsd": 0.0, + "loss/logits": 0.2281369637697935, + "step": 14830 + }, + { + "epoch": 0.49466666666666664, + "grad_norm": 29.25, + "grad_norm_var": 17.162955729166665, + "learning_rate": 0.0001, + "loss": 7.9308, + "loss/crossentropy": 1.809071047604084, + "loss/hidden": 3.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.20106423925608397, + "step": 14840 + }, + { + "epoch": 0.495, + "grad_norm": 31.25, + "grad_norm_var": 14.433268229166666, + "learning_rate": 0.0001, + "loss": 7.8974, + "loss/crossentropy": 2.1457689851522446, + "loss/hidden": 3.73984375, + "loss/jsd": 0.0, + "loss/logits": 0.24656938333064318, + "step": 14850 + }, + { + "epoch": 0.49533333333333335, + "grad_norm": 29.125, + "grad_norm_var": 12.724739583333333, + "learning_rate": 0.0001, + "loss": 8.0257, + "loss/crossentropy": 2.097585503757, + "loss/hidden": 3.728125, + "loss/jsd": 0.0, + "loss/logits": 0.2211755273863673, + "step": 14860 + }, + { + "epoch": 0.49566666666666664, + "grad_norm": 34.25, + "grad_norm_var": 11.1931640625, + "learning_rate": 0.0001, + "loss": 8.0109, + "loss/crossentropy": 2.1149248749017717, + "loss/hidden": 3.628515625, + "loss/jsd": 0.0, + "loss/logits": 0.20364532712846994, + "step": 14870 + }, + { + "epoch": 0.496, + "grad_norm": 29.625, + "grad_norm_var": 2.5940733597902177e+18, + "learning_rate": 0.0001, + "loss": 8.023, + "loss/crossentropy": 2.096835497021675, + "loss/hidden": 3.75546875, + "loss/jsd": 0.0, + "loss/logits": 0.21487231757491826, + "step": 14880 + }, + { + "epoch": 0.49633333333333335, + "grad_norm": 27.625, + "grad_norm_var": 7.139583333333333, + "learning_rate": 0.0001, + "loss": 7.9587, + "loss/crossentropy": 1.9992181949317456, + "loss/hidden": 3.6421875, + "loss/jsd": 0.0, + "loss/logits": 0.20439089126884938, + "step": 14890 + }, + { + "epoch": 0.49666666666666665, + "grad_norm": 31.0, + "grad_norm_var": 11.989583333333334, + "learning_rate": 0.0001, + "loss": 8.0151, + "loss/crossentropy": 1.997377061843872, + "loss/hidden": 3.794921875, + "loss/jsd": 0.0, + "loss/logits": 0.22013774681836368, + "step": 14900 + }, + { + "epoch": 0.497, + "grad_norm": 31.625, + "grad_norm_var": 6.09765625, + "learning_rate": 0.0001, + "loss": 8.0484, + "loss/crossentropy": 2.2603927135467528, + "loss/hidden": 3.6703125, + "loss/jsd": 0.0, + "loss/logits": 0.22995477840304374, + "step": 14910 + }, + { + "epoch": 0.49733333333333335, + "grad_norm": 28.25, + "grad_norm_var": 2.544791666666667, + "learning_rate": 0.0001, + "loss": 7.9737, + "loss/crossentropy": 2.167400282621384, + "loss/hidden": 3.546875, + "loss/jsd": 0.0, + "loss/logits": 0.20647088065743446, + "step": 14920 + }, + { + "epoch": 0.49766666666666665, + "grad_norm": 28.25, + "grad_norm_var": 17.2556640625, + "learning_rate": 0.0001, + "loss": 7.9713, + "loss/crossentropy": 1.9748560175299645, + "loss/hidden": 3.684765625, + "loss/jsd": 0.0, + "loss/logits": 0.21687341015785933, + "step": 14930 + }, + { + "epoch": 0.498, + "grad_norm": 34.75, + "grad_norm_var": 15.188997395833333, + "learning_rate": 0.0001, + "loss": 7.9405, + "loss/crossentropy": 2.1966336160898208, + "loss/hidden": 3.56796875, + "loss/jsd": 0.0, + "loss/logits": 0.20776706095784903, + "step": 14940 + }, + { + "epoch": 0.49833333333333335, + "grad_norm": 7079985152.0, + "grad_norm_var": 3.1328868311327135e+18, + "learning_rate": 0.0001, + "loss": 7.9715, + "loss/crossentropy": 2.108351056277752, + "loss/hidden": 3.567578125, + "loss/jsd": 0.0, + "loss/logits": 0.21284203305840493, + "step": 14950 + }, + { + "epoch": 0.49866666666666665, + "grad_norm": 32.5, + "grad_norm_var": 3.1328868312875884e+18, + "learning_rate": 0.0001, + "loss": 8.0864, + "loss/crossentropy": 2.0066813334822653, + "loss/hidden": 3.6734375, + "loss/jsd": 0.0, + "loss/logits": 0.226681593246758, + "step": 14960 + }, + { + "epoch": 0.499, + "grad_norm": 33.5, + "grad_norm_var": 2.80390625, + "learning_rate": 0.0001, + "loss": 8.0583, + "loss/crossentropy": 1.9094878628849983, + "loss/hidden": 3.566015625, + "loss/jsd": 0.0, + "loss/logits": 0.21297882869839668, + "step": 14970 + }, + { + "epoch": 0.49933333333333335, + "grad_norm": 31.375, + "grad_norm_var": 3.4322916666666665, + "learning_rate": 0.0001, + "loss": 7.9844, + "loss/crossentropy": 2.138418735563755, + "loss/hidden": 3.585546875, + "loss/jsd": 0.0, + "loss/logits": 0.218233341909945, + "step": 14980 + }, + { + "epoch": 0.49966666666666665, + "grad_norm": 32.25, + "grad_norm_var": 2.3619140625, + "learning_rate": 0.0001, + "loss": 7.9809, + "loss/crossentropy": 2.0114166542887686, + "loss/hidden": 3.661328125, + "loss/jsd": 0.0, + "loss/logits": 0.21623858716338873, + "step": 14990 + }, + { + "epoch": 0.5, + "grad_norm": 41.0, + "grad_norm_var": 246.77180989583334, + "learning_rate": 0.0001, + "loss": 8.0603, + "loss/crossentropy": 2.034164222329855, + "loss/hidden": 3.712109375, + "loss/jsd": 0.0, + "loss/logits": 0.2103666251525283, + "step": 15000 + }, + { + "epoch": 0.5003333333333333, + "grad_norm": 30.5, + "grad_norm_var": 248.26640625, + "learning_rate": 0.0001, + "loss": 8.1163, + "loss/crossentropy": 2.1978528052568436, + "loss/hidden": 3.641796875, + "loss/jsd": 0.0, + "loss/logits": 0.2373816639184952, + "step": 15010 + }, + { + "epoch": 0.5006666666666667, + "grad_norm": 31.375, + "grad_norm_var": 2.21015625, + "learning_rate": 0.0001, + "loss": 7.9552, + "loss/crossentropy": 2.03947726637125, + "loss/hidden": 3.6125, + "loss/jsd": 0.0, + "loss/logits": 0.2085746269673109, + "step": 15020 + }, + { + "epoch": 0.501, + "grad_norm": 32.75, + "grad_norm_var": 6.3572265625, + "learning_rate": 0.0001, + "loss": 8.1111, + "loss/crossentropy": 2.008555364608765, + "loss/hidden": 3.66171875, + "loss/jsd": 0.0, + "loss/logits": 0.20857672542333602, + "step": 15030 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 29.875, + "grad_norm_var": 4.277018229166667, + "learning_rate": 0.0001, + "loss": 7.942, + "loss/crossentropy": 2.0552697718143462, + "loss/hidden": 3.753125, + "loss/jsd": 0.0, + "loss/logits": 0.21989797037094833, + "step": 15040 + }, + { + "epoch": 0.5016666666666667, + "grad_norm": 169.0, + "grad_norm_var": 1190.8098307291666, + "learning_rate": 0.0001, + "loss": 8.0327, + "loss/crossentropy": 2.1051773697137834, + "loss/hidden": 3.775390625, + "loss/jsd": 0.0, + "loss/logits": 0.2577056746929884, + "step": 15050 + }, + { + "epoch": 0.502, + "grad_norm": 31.25, + "grad_norm_var": 1164.67890625, + "learning_rate": 0.0001, + "loss": 8.0819, + "loss/crossentropy": 2.186911401152611, + "loss/hidden": 3.72421875, + "loss/jsd": 0.0, + "loss/logits": 0.21980911456048488, + "step": 15060 + }, + { + "epoch": 0.5023333333333333, + "grad_norm": 28.0, + "grad_norm_var": 4.070833333333334, + "learning_rate": 0.0001, + "loss": 8.0812, + "loss/crossentropy": 2.112710100412369, + "loss/hidden": 3.774609375, + "loss/jsd": 0.0, + "loss/logits": 0.23789308052510022, + "step": 15070 + }, + { + "epoch": 0.5026666666666667, + "grad_norm": 29.0, + "grad_norm_var": 7.726497395833333, + "learning_rate": 0.0001, + "loss": 8.0101, + "loss/crossentropy": 2.1419308796525, + "loss/hidden": 3.6109375, + "loss/jsd": 0.0, + "loss/logits": 0.21457258183509112, + "step": 15080 + }, + { + "epoch": 0.503, + "grad_norm": 38.0, + "grad_norm_var": 12497.576041666667, + "learning_rate": 0.0001, + "loss": 8.1218, + "loss/crossentropy": 2.134933979809284, + "loss/hidden": 3.790234375, + "loss/jsd": 0.0, + "loss/logits": 0.23342457674443723, + "step": 15090 + }, + { + "epoch": 0.5033333333333333, + "grad_norm": 29.5, + "grad_norm_var": 20.352083333333333, + "learning_rate": 0.0001, + "loss": 7.9202, + "loss/crossentropy": 2.3430344820022584, + "loss/hidden": 3.519140625, + "loss/jsd": 0.0, + "loss/logits": 0.2118502750992775, + "step": 15100 + }, + { + "epoch": 0.5036666666666667, + "grad_norm": 35.25, + "grad_norm_var": 22.322330729166666, + "learning_rate": 0.0001, + "loss": 7.9041, + "loss/crossentropy": 1.9799678571522237, + "loss/hidden": 3.635546875, + "loss/jsd": 0.0, + "loss/logits": 0.21085582114756107, + "step": 15110 + }, + { + "epoch": 0.504, + "grad_norm": 30.625, + "grad_norm_var": 16.030143229166665, + "learning_rate": 0.0001, + "loss": 7.9714, + "loss/crossentropy": 1.9788647621870041, + "loss/hidden": 3.731640625, + "loss/jsd": 0.0, + "loss/logits": 0.20716245826333762, + "step": 15120 + }, + { + "epoch": 0.5043333333333333, + "grad_norm": 32.25, + "grad_norm_var": 14.680989583333334, + "learning_rate": 0.0001, + "loss": 7.8813, + "loss/crossentropy": 2.1622576892375944, + "loss/hidden": 3.61875, + "loss/jsd": 0.0, + "loss/logits": 0.21591525189578534, + "step": 15130 + }, + { + "epoch": 0.5046666666666667, + "grad_norm": 34.25, + "grad_norm_var": 12.263541666666667, + "learning_rate": 0.0001, + "loss": 8.0432, + "loss/crossentropy": 2.158085845410824, + "loss/hidden": 3.726953125, + "loss/jsd": 0.0, + "loss/logits": 0.22357744220644235, + "step": 15140 + }, + { + "epoch": 0.505, + "grad_norm": 37.0, + "grad_norm_var": 1032.2447265625, + "learning_rate": 0.0001, + "loss": 8.0755, + "loss/crossentropy": 2.2892831161618235, + "loss/hidden": 3.76328125, + "loss/jsd": 0.0, + "loss/logits": 0.23992609437555074, + "step": 15150 + }, + { + "epoch": 0.5053333333333333, + "grad_norm": 34.0, + "grad_norm_var": 1034.0916666666667, + "learning_rate": 0.0001, + "loss": 7.9403, + "loss/crossentropy": 2.1311425492167473, + "loss/hidden": 3.624609375, + "loss/jsd": 0.0, + "loss/logits": 0.21165461391210555, + "step": 15160 + }, + { + "epoch": 0.5056666666666667, + "grad_norm": 29.25, + "grad_norm_var": 3.64375, + "learning_rate": 0.0001, + "loss": 7.9553, + "loss/crossentropy": 2.171247933804989, + "loss/hidden": 3.805859375, + "loss/jsd": 0.0, + "loss/logits": 0.23857482858002185, + "step": 15170 + }, + { + "epoch": 0.506, + "grad_norm": 28.75, + "grad_norm_var": 2.582291666666667, + "learning_rate": 0.0001, + "loss": 7.9445, + "loss/crossentropy": 1.846039692312479, + "loss/hidden": 3.678515625, + "loss/jsd": 0.0, + "loss/logits": 0.19486570674926043, + "step": 15180 + }, + { + "epoch": 0.5063333333333333, + "grad_norm": 34.75, + "grad_norm_var": 6.375455729166666, + "learning_rate": 0.0001, + "loss": 8.0507, + "loss/crossentropy": 2.1973116233944894, + "loss/hidden": 3.7453125, + "loss/jsd": 0.0, + "loss/logits": 0.2503014124929905, + "step": 15190 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 30.0, + "grad_norm_var": 13.45625, + "learning_rate": 0.0001, + "loss": 8.0027, + "loss/crossentropy": 1.9380142621695995, + "loss/hidden": 3.798828125, + "loss/jsd": 0.0, + "loss/logits": 0.23612585961818694, + "step": 15200 + }, + { + "epoch": 0.507, + "grad_norm": 31.0, + "grad_norm_var": 4.994205729166667, + "learning_rate": 0.0001, + "loss": 7.9165, + "loss/crossentropy": 1.953425794839859, + "loss/hidden": 3.744921875, + "loss/jsd": 0.0, + "loss/logits": 0.24761936087161301, + "step": 15210 + }, + { + "epoch": 0.5073333333333333, + "grad_norm": 33.0, + "grad_norm_var": 6.151822916666666, + "learning_rate": 0.0001, + "loss": 7.947, + "loss/crossentropy": 2.066864788532257, + "loss/hidden": 3.654296875, + "loss/jsd": 0.0, + "loss/logits": 0.21229154095053673, + "step": 15220 + }, + { + "epoch": 0.5076666666666667, + "grad_norm": 33.0, + "grad_norm_var": 13.453059895833333, + "learning_rate": 0.0001, + "loss": 7.9878, + "loss/crossentropy": 2.2750732988119124, + "loss/hidden": 3.66640625, + "loss/jsd": 0.0, + "loss/logits": 0.226811171323061, + "step": 15230 + }, + { + "epoch": 0.508, + "grad_norm": 30.375, + "grad_norm_var": 9.880143229166666, + "learning_rate": 0.0001, + "loss": 7.8715, + "loss/crossentropy": 2.0473168551921845, + "loss/hidden": 3.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.20602242182940245, + "step": 15240 + }, + { + "epoch": 0.5083333333333333, + "grad_norm": 34.25, + "grad_norm_var": 7.327018229166667, + "learning_rate": 0.0001, + "loss": 8.1526, + "loss/crossentropy": 2.1135044425725935, + "loss/hidden": 3.73125, + "loss/jsd": 0.0, + "loss/logits": 0.25442443899810313, + "step": 15250 + }, + { + "epoch": 0.5086666666666667, + "grad_norm": 33.25, + "grad_norm_var": 5.2322265625, + "learning_rate": 0.0001, + "loss": 7.9502, + "loss/crossentropy": 2.002006813138723, + "loss/hidden": 3.751953125, + "loss/jsd": 0.0, + "loss/logits": 0.22036314904689788, + "step": 15260 + }, + { + "epoch": 0.509, + "grad_norm": 31.5, + "grad_norm_var": 5.1212890625, + "learning_rate": 0.0001, + "loss": 7.9084, + "loss/crossentropy": 2.3665783375501634, + "loss/hidden": 3.685546875, + "loss/jsd": 0.0, + "loss/logits": 0.23151662331074477, + "step": 15270 + }, + { + "epoch": 0.5093333333333333, + "grad_norm": 39.0, + "grad_norm_var": 7.746809895833334, + "learning_rate": 0.0001, + "loss": 7.9309, + "loss/crossentropy": 1.96308908239007, + "loss/hidden": 3.65703125, + "loss/jsd": 0.0, + "loss/logits": 0.21435827864333987, + "step": 15280 + }, + { + "epoch": 0.5096666666666667, + "grad_norm": 31.25, + "grad_norm_var": 12.887239583333333, + "learning_rate": 0.0001, + "loss": 7.8462, + "loss/crossentropy": 2.0475788712501526, + "loss/hidden": 3.5890625, + "loss/jsd": 0.0, + "loss/logits": 0.209853470697999, + "step": 15290 + }, + { + "epoch": 0.51, + "grad_norm": 34.5, + "grad_norm_var": 8.1666015625, + "learning_rate": 0.0001, + "loss": 7.9855, + "loss/crossentropy": 2.048559895157814, + "loss/hidden": 3.743359375, + "loss/jsd": 0.0, + "loss/logits": 0.22797312829643487, + "step": 15300 + }, + { + "epoch": 0.5103333333333333, + "grad_norm": 30.25, + "grad_norm_var": 4.605989583333334, + "learning_rate": 0.0001, + "loss": 7.9321, + "loss/crossentropy": 2.081040045619011, + "loss/hidden": 3.7390625, + "loss/jsd": 0.0, + "loss/logits": 0.22966741789132356, + "step": 15310 + }, + { + "epoch": 0.5106666666666667, + "grad_norm": 33.75, + "grad_norm_var": 5.2681640625, + "learning_rate": 0.0001, + "loss": 8.0778, + "loss/crossentropy": 2.1107777029275896, + "loss/hidden": 3.535546875, + "loss/jsd": 0.0, + "loss/logits": 0.20291287880390882, + "step": 15320 + }, + { + "epoch": 0.511, + "grad_norm": 29.5, + "grad_norm_var": 4.077083333333333, + "learning_rate": 0.0001, + "loss": 7.9746, + "loss/crossentropy": 2.1766668647527694, + "loss/hidden": 3.625390625, + "loss/jsd": 0.0, + "loss/logits": 0.21749209128320218, + "step": 15330 + }, + { + "epoch": 0.5113333333333333, + "grad_norm": 35.75, + "grad_norm_var": 6.962239583333333, + "learning_rate": 0.0001, + "loss": 8.039, + "loss/crossentropy": 2.0856088645756246, + "loss/hidden": 3.663671875, + "loss/jsd": 0.0, + "loss/logits": 0.22033254262059926, + "step": 15340 + }, + { + "epoch": 0.5116666666666667, + "grad_norm": 30.375, + "grad_norm_var": 6.554166666666666, + "learning_rate": 0.0001, + "loss": 7.9891, + "loss/crossentropy": 2.093947410583496, + "loss/hidden": 3.667578125, + "loss/jsd": 0.0, + "loss/logits": 0.22944947555661202, + "step": 15350 + }, + { + "epoch": 0.512, + "grad_norm": 41.25, + "grad_norm_var": 9.861393229166667, + "learning_rate": 0.0001, + "loss": 7.9193, + "loss/crossentropy": 2.1564169749617577, + "loss/hidden": 3.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.2197624057531357, + "step": 15360 + }, + { + "epoch": 0.5123333333333333, + "grad_norm": 33.25, + "grad_norm_var": 10.402018229166666, + "learning_rate": 0.0001, + "loss": 7.9731, + "loss/crossentropy": 2.135643947124481, + "loss/hidden": 3.63203125, + "loss/jsd": 0.0, + "loss/logits": 0.21202728524804115, + "step": 15370 + }, + { + "epoch": 0.5126666666666667, + "grad_norm": 30.0, + "grad_norm_var": 4.05323963237086e+18, + "learning_rate": 0.0001, + "loss": 7.9059, + "loss/crossentropy": 1.9712642952799797, + "loss/hidden": 3.63125, + "loss/jsd": 0.0, + "loss/logits": 0.19932207949459552, + "step": 15380 + }, + { + "epoch": 0.513, + "grad_norm": 32.0, + "grad_norm_var": 4.053239631984984e+18, + "learning_rate": 0.0001, + "loss": 8.0131, + "loss/crossentropy": 2.114562599360943, + "loss/hidden": 3.741796875, + "loss/jsd": 0.0, + "loss/logits": 0.24068702533841133, + "step": 15390 + }, + { + "epoch": 0.5133333333333333, + "grad_norm": 30.5, + "grad_norm_var": 15.04140625, + "learning_rate": 0.0001, + "loss": 7.9017, + "loss/crossentropy": 2.1701153457164764, + "loss/hidden": 3.631640625, + "loss/jsd": 0.0, + "loss/logits": 0.21901333723217248, + "step": 15400 + }, + { + "epoch": 0.5136666666666667, + "grad_norm": 29.25, + "grad_norm_var": 8.6369140625, + "learning_rate": 0.0001, + "loss": 7.9558, + "loss/crossentropy": 2.1302370369434356, + "loss/hidden": 3.73359375, + "loss/jsd": 0.0, + "loss/logits": 0.22752520255744457, + "step": 15410 + }, + { + "epoch": 0.514, + "grad_norm": 37.5, + "grad_norm_var": 10.702018229166667, + "learning_rate": 0.0001, + "loss": 8.0334, + "loss/crossentropy": 2.076037485152483, + "loss/hidden": 3.653125, + "loss/jsd": 0.0, + "loss/logits": 0.2252957560122013, + "step": 15420 + }, + { + "epoch": 0.5143333333333333, + "grad_norm": 33.0, + "grad_norm_var": 11.815625, + "learning_rate": 0.0001, + "loss": 7.9922, + "loss/crossentropy": 2.090342365950346, + "loss/hidden": 3.630859375, + "loss/jsd": 0.0, + "loss/logits": 0.2138144064694643, + "step": 15430 + }, + { + "epoch": 0.5146666666666667, + "grad_norm": 39.0, + "grad_norm_var": 28.45390625, + "learning_rate": 0.0001, + "loss": 8.0114, + "loss/crossentropy": 2.163317432999611, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.20781796853989362, + "step": 15440 + }, + { + "epoch": 0.515, + "grad_norm": 34.5, + "grad_norm_var": 10.485872395833333, + "learning_rate": 0.0001, + "loss": 7.9288, + "loss/crossentropy": 2.043715859204531, + "loss/hidden": 3.691015625, + "loss/jsd": 0.0, + "loss/logits": 0.2161100683733821, + "step": 15450 + }, + { + "epoch": 0.5153333333333333, + "grad_norm": 30.375, + "grad_norm_var": 5.3556640625, + "learning_rate": 0.0001, + "loss": 8.0563, + "loss/crossentropy": 2.294564101099968, + "loss/hidden": 3.6375, + "loss/jsd": 0.0, + "loss/logits": 0.23372058011591434, + "step": 15460 + }, + { + "epoch": 0.5156666666666667, + "grad_norm": 28.25, + "grad_norm_var": 7.530989583333334, + "learning_rate": 0.0001, + "loss": 7.8509, + "loss/crossentropy": 1.954894269257784, + "loss/hidden": 3.632421875, + "loss/jsd": 0.0, + "loss/logits": 0.2037217952311039, + "step": 15470 + }, + { + "epoch": 0.516, + "grad_norm": 31.375, + "grad_norm_var": 16.819791666666667, + "learning_rate": 0.0001, + "loss": 7.7803, + "loss/crossentropy": 2.1873670905828475, + "loss/hidden": 3.649609375, + "loss/jsd": 0.0, + "loss/logits": 0.22097154781222345, + "step": 15480 + }, + { + "epoch": 0.5163333333333333, + "grad_norm": 27.875, + "grad_norm_var": 16.633072916666666, + "learning_rate": 0.0001, + "loss": 7.9936, + "loss/crossentropy": 2.12020967900753, + "loss/hidden": 3.719140625, + "loss/jsd": 0.0, + "loss/logits": 0.2284359024837613, + "step": 15490 + }, + { + "epoch": 0.5166666666666667, + "grad_norm": 33.0, + "grad_norm_var": 13.01875, + "learning_rate": 0.0001, + "loss": 8.0295, + "loss/crossentropy": 2.144003964960575, + "loss/hidden": 3.5765625, + "loss/jsd": 0.0, + "loss/logits": 0.2127245606854558, + "step": 15500 + }, + { + "epoch": 0.517, + "grad_norm": 6140461056.0, + "grad_norm_var": 2.3565788491207936e+18, + "learning_rate": 0.0001, + "loss": 7.9846, + "loss/crossentropy": 2.0483295105397703, + "loss/hidden": 3.84296875, + "loss/jsd": 0.0, + "loss/logits": 0.21562441848218442, + "step": 15510 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 27.625, + "grad_norm_var": 2.3565788492039455e+18, + "learning_rate": 0.0001, + "loss": 7.8762, + "loss/crossentropy": 2.045905639976263, + "loss/hidden": 3.5796875, + "loss/jsd": 0.0, + "loss/logits": 0.21305822925642132, + "step": 15520 + }, + { + "epoch": 0.5176666666666667, + "grad_norm": 30.25, + "grad_norm_var": 7.0087890625, + "learning_rate": 0.0001, + "loss": 7.9002, + "loss/crossentropy": 2.0140881910920143, + "loss/hidden": 3.78203125, + "loss/jsd": 0.0, + "loss/logits": 0.2302077604457736, + "step": 15530 + }, + { + "epoch": 0.518, + "grad_norm": 31.375, + "grad_norm_var": 8.4869140625, + "learning_rate": 0.0001, + "loss": 7.9189, + "loss/crossentropy": 2.144743651151657, + "loss/hidden": 3.662890625, + "loss/jsd": 0.0, + "loss/logits": 0.22745619527995586, + "step": 15540 + }, + { + "epoch": 0.5183333333333333, + "grad_norm": 34.5, + "grad_norm_var": 16.46640625, + "learning_rate": 0.0001, + "loss": 8.0041, + "loss/crossentropy": 2.162237875163555, + "loss/hidden": 3.667578125, + "loss/jsd": 0.0, + "loss/logits": 0.22348958030343055, + "step": 15550 + }, + { + "epoch": 0.5186666666666667, + "grad_norm": 33.0, + "grad_norm_var": 10.215625, + "learning_rate": 0.0001, + "loss": 7.9595, + "loss/crossentropy": 2.1002556174993514, + "loss/hidden": 3.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.221992826461792, + "step": 15560 + }, + { + "epoch": 0.519, + "grad_norm": 28.5, + "grad_norm_var": 11.612239583333333, + "learning_rate": 0.0001, + "loss": 7.9435, + "loss/crossentropy": 2.1175016567111014, + "loss/hidden": 3.697265625, + "loss/jsd": 0.0, + "loss/logits": 0.2175672125071287, + "step": 15570 + }, + { + "epoch": 0.5193333333333333, + "grad_norm": 51.75, + "grad_norm_var": 32.14479166666667, + "learning_rate": 0.0001, + "loss": 7.8969, + "loss/crossentropy": 2.1000607915222647, + "loss/hidden": 3.578515625, + "loss/jsd": 0.0, + "loss/logits": 0.22374147176742554, + "step": 15580 + }, + { + "epoch": 0.5196666666666667, + "grad_norm": 35.25, + "grad_norm_var": 216.00826822916667, + "learning_rate": 0.0001, + "loss": 7.9352, + "loss/crossentropy": 2.062793227285147, + "loss/hidden": 3.69765625, + "loss/jsd": 0.0, + "loss/logits": 0.22876775842159985, + "step": 15590 + }, + { + "epoch": 0.52, + "grad_norm": 29.875, + "grad_norm_var": 209.046875, + "learning_rate": 0.0001, + "loss": 7.9366, + "loss/crossentropy": 2.120285242795944, + "loss/hidden": 3.648046875, + "loss/jsd": 0.0, + "loss/logits": 0.21306953616440297, + "step": 15600 + }, + { + "epoch": 0.5203333333333333, + "grad_norm": 31.125, + "grad_norm_var": 6.2775390625, + "learning_rate": 0.0001, + "loss": 7.8629, + "loss/crossentropy": 2.0886681511998177, + "loss/hidden": 3.725, + "loss/jsd": 0.0, + "loss/logits": 0.23063711524009706, + "step": 15610 + }, + { + "epoch": 0.5206666666666667, + "grad_norm": 42.75, + "grad_norm_var": 2.3565788485707105e+18, + "learning_rate": 0.0001, + "loss": 7.9075, + "loss/crossentropy": 2.059058104455471, + "loss/hidden": 3.790625, + "loss/jsd": 0.0, + "loss/logits": 0.2093208000063896, + "step": 15620 + }, + { + "epoch": 0.521, + "grad_norm": 29.0, + "grad_norm_var": 2.356578849024849e+18, + "learning_rate": 0.0001, + "loss": 8.0243, + "loss/crossentropy": 2.0345511339604854, + "loss/hidden": 3.615625, + "loss/jsd": 0.0, + "loss/logits": 0.20063564516603946, + "step": 15630 + }, + { + "epoch": 0.5213333333333333, + "grad_norm": 34.0, + "grad_norm_var": 994.1535807291667, + "learning_rate": 0.0001, + "loss": 7.9865, + "loss/crossentropy": 1.9753425560891629, + "loss/hidden": 3.65390625, + "loss/jsd": 0.0, + "loss/logits": 0.21483086440712212, + "step": 15640 + }, + { + "epoch": 0.5216666666666666, + "grad_norm": 30.125, + "grad_norm_var": 23.786393229166666, + "learning_rate": 0.0001, + "loss": 7.9327, + "loss/crossentropy": 2.0674639120697975, + "loss/hidden": 3.81171875, + "loss/jsd": 0.0, + "loss/logits": 0.2364231664687395, + "step": 15650 + }, + { + "epoch": 0.522, + "grad_norm": 28.875, + "grad_norm_var": 258.47083333333336, + "learning_rate": 0.0001, + "loss": 7.9931, + "loss/crossentropy": 1.968726746737957, + "loss/hidden": 3.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.21212349347770215, + "step": 15660 + }, + { + "epoch": 0.5223333333333333, + "grad_norm": 40.25, + "grad_norm_var": 11.349739583333333, + "learning_rate": 0.0001, + "loss": 7.7757, + "loss/crossentropy": 2.2387484058737757, + "loss/hidden": 3.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.21647185329347848, + "step": 15670 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 27.5, + "grad_norm_var": 13.4103515625, + "learning_rate": 0.0001, + "loss": 7.79, + "loss/crossentropy": 2.063428644835949, + "loss/hidden": 3.5640625, + "loss/jsd": 0.0, + "loss/logits": 0.21321867797523736, + "step": 15680 + }, + { + "epoch": 0.523, + "grad_norm": 31.875, + "grad_norm_var": 13.243489583333334, + "learning_rate": 0.0001, + "loss": 7.9336, + "loss/crossentropy": 2.1281570941209793, + "loss/hidden": 3.611328125, + "loss/jsd": 0.0, + "loss/logits": 0.22930383421480655, + "step": 15690 + }, + { + "epoch": 0.5233333333333333, + "grad_norm": 30.5, + "grad_norm_var": 13.87890625, + "learning_rate": 0.0001, + "loss": 7.9178, + "loss/crossentropy": 2.07712532132864, + "loss/hidden": 3.64375, + "loss/jsd": 0.0, + "loss/logits": 0.21440593730658292, + "step": 15700 + }, + { + "epoch": 0.5236666666666666, + "grad_norm": 33.0, + "grad_norm_var": 62.56848958333333, + "learning_rate": 0.0001, + "loss": 7.8597, + "loss/crossentropy": 2.111837570369244, + "loss/hidden": 3.63984375, + "loss/jsd": 0.0, + "loss/logits": 0.2352095700800419, + "step": 15710 + }, + { + "epoch": 0.524, + "grad_norm": 30.25, + "grad_norm_var": 59.56295572916667, + "learning_rate": 0.0001, + "loss": 7.8544, + "loss/crossentropy": 1.9604034937918187, + "loss/hidden": 3.65390625, + "loss/jsd": 0.0, + "loss/logits": 0.21035183649510145, + "step": 15720 + }, + { + "epoch": 0.5243333333333333, + "grad_norm": 30.25, + "grad_norm_var": 19.3744140625, + "learning_rate": 0.0001, + "loss": 8.0273, + "loss/crossentropy": 2.1459563463926314, + "loss/hidden": 3.573828125, + "loss/jsd": 0.0, + "loss/logits": 0.21267954409122466, + "step": 15730 + }, + { + "epoch": 0.5246666666666666, + "grad_norm": 30.0, + "grad_norm_var": 23.273372395833334, + "learning_rate": 0.0001, + "loss": 7.9939, + "loss/crossentropy": 2.1543472737073897, + "loss/hidden": 3.731640625, + "loss/jsd": 0.0, + "loss/logits": 0.2248241593129933, + "step": 15740 + }, + { + "epoch": 0.525, + "grad_norm": 36.25, + "grad_norm_var": 5.14765625, + "learning_rate": 0.0001, + "loss": 7.9856, + "loss/crossentropy": 2.0191838264465334, + "loss/hidden": 3.64140625, + "loss/jsd": 0.0, + "loss/logits": 0.22251855283975602, + "step": 15750 + }, + { + "epoch": 0.5253333333333333, + "grad_norm": 34.75, + "grad_norm_var": 6.240625, + "learning_rate": 0.0001, + "loss": 8.0418, + "loss/crossentropy": 2.22621650993824, + "loss/hidden": 3.591796875, + "loss/jsd": 0.0, + "loss/logits": 0.21183471530675888, + "step": 15760 + }, + { + "epoch": 0.5256666666666666, + "grad_norm": 34.0, + "grad_norm_var": 8.637434895833334, + "learning_rate": 0.0001, + "loss": 7.9608, + "loss/crossentropy": 2.0110961377620695, + "loss/hidden": 3.655078125, + "loss/jsd": 0.0, + "loss/logits": 0.22007959876209499, + "step": 15770 + }, + { + "epoch": 0.526, + "grad_norm": 34.75, + "grad_norm_var": 9.070572916666666, + "learning_rate": 0.0001, + "loss": 7.9034, + "loss/crossentropy": 1.9873220488429069, + "loss/hidden": 3.691015625, + "loss/jsd": 0.0, + "loss/logits": 0.2183900134637952, + "step": 15780 + }, + { + "epoch": 0.5263333333333333, + "grad_norm": 30.375, + "grad_norm_var": 5.1853515625, + "learning_rate": 0.0001, + "loss": 7.9344, + "loss/crossentropy": 1.9563458181917668, + "loss/hidden": 3.67890625, + "loss/jsd": 0.0, + "loss/logits": 0.20562523752450942, + "step": 15790 + }, + { + "epoch": 0.5266666666666666, + "grad_norm": 30.25, + "grad_norm_var": 7.5353515625, + "learning_rate": 0.0001, + "loss": 8.0172, + "loss/crossentropy": 2.1347015112638474, + "loss/hidden": 3.6546875, + "loss/jsd": 0.0, + "loss/logits": 0.22778294049203396, + "step": 15800 + }, + { + "epoch": 0.527, + "grad_norm": 29.625, + "grad_norm_var": 8.6978515625, + "learning_rate": 0.0001, + "loss": 7.9092, + "loss/crossentropy": 2.0619646534323692, + "loss/hidden": 3.64140625, + "loss/jsd": 0.0, + "loss/logits": 0.2228071277961135, + "step": 15810 + }, + { + "epoch": 0.5273333333333333, + "grad_norm": 30.5, + "grad_norm_var": 3.0259765625, + "learning_rate": 0.0001, + "loss": 7.9127, + "loss/crossentropy": 2.079516027867794, + "loss/hidden": 3.61953125, + "loss/jsd": 0.0, + "loss/logits": 0.20713965147733687, + "step": 15820 + }, + { + "epoch": 0.5276666666666666, + "grad_norm": 30.875, + "grad_norm_var": 4.55390625, + "learning_rate": 0.0001, + "loss": 7.9888, + "loss/crossentropy": 2.1745656400918962, + "loss/hidden": 3.707421875, + "loss/jsd": 0.0, + "loss/logits": 0.22695780582726002, + "step": 15830 + }, + { + "epoch": 0.528, + "grad_norm": 29.25, + "grad_norm_var": 5.205208333333333, + "learning_rate": 0.0001, + "loss": 7.9376, + "loss/crossentropy": 2.1931827545166014, + "loss/hidden": 3.61171875, + "loss/jsd": 0.0, + "loss/logits": 0.21837961710989476, + "step": 15840 + }, + { + "epoch": 0.5283333333333333, + "grad_norm": 37.0, + "grad_norm_var": 5.242708333333334, + "learning_rate": 0.0001, + "loss": 7.973, + "loss/crossentropy": 2.1738994657993316, + "loss/hidden": 3.61953125, + "loss/jsd": 0.0, + "loss/logits": 0.21239091642200947, + "step": 15850 + }, + { + "epoch": 0.5286666666666666, + "grad_norm": 32.0, + "grad_norm_var": 2.6757714712718213e+18, + "learning_rate": 0.0001, + "loss": 7.8871, + "loss/crossentropy": 2.017245587706566, + "loss/hidden": 3.933203125, + "loss/jsd": 0.0, + "loss/logits": 0.2155450826510787, + "step": 15860 + }, + { + "epoch": 0.529, + "grad_norm": 36.5, + "grad_norm_var": 5.8291015625, + "learning_rate": 0.0001, + "loss": 8.0041, + "loss/crossentropy": 2.173825052380562, + "loss/hidden": 3.65390625, + "loss/jsd": 0.0, + "loss/logits": 0.22582603432238102, + "step": 15870 + }, + { + "epoch": 0.5293333333333333, + "grad_norm": 30.375, + "grad_norm_var": 5.437434895833333, + "learning_rate": 0.0001, + "loss": 7.8263, + "loss/crossentropy": 1.9266249172389507, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.18464618194848298, + "step": 15880 + }, + { + "epoch": 0.5296666666666666, + "grad_norm": 32.5, + "grad_norm_var": 5.3228515625, + "learning_rate": 0.0001, + "loss": 7.8421, + "loss/crossentropy": 2.084079180657864, + "loss/hidden": 3.684375, + "loss/jsd": 0.0, + "loss/logits": 0.21603441275656224, + "step": 15890 + }, + { + "epoch": 0.53, + "grad_norm": 31.75, + "grad_norm_var": 145.74479166666666, + "learning_rate": 0.0001, + "loss": 7.9261, + "loss/crossentropy": 2.1224375024437903, + "loss/hidden": 3.7546875, + "loss/jsd": 0.0, + "loss/logits": 0.2454449266195297, + "step": 15900 + }, + { + "epoch": 0.5303333333333333, + "grad_norm": 32.75, + "grad_norm_var": 4.843684895833333, + "learning_rate": 0.0001, + "loss": 8.0083, + "loss/crossentropy": 2.198984383046627, + "loss/hidden": 3.601171875, + "loss/jsd": 0.0, + "loss/logits": 0.22642315719276668, + "step": 15910 + }, + { + "epoch": 0.5306666666666666, + "grad_norm": 32.0, + "grad_norm_var": 2.9103515625, + "learning_rate": 0.0001, + "loss": 8.0344, + "loss/crossentropy": 2.021199995279312, + "loss/hidden": 3.823046875, + "loss/jsd": 0.0, + "loss/logits": 0.23787404000759124, + "step": 15920 + }, + { + "epoch": 0.531, + "grad_norm": 34.0, + "grad_norm_var": 3.325455729166667, + "learning_rate": 0.0001, + "loss": 7.9977, + "loss/crossentropy": 2.1375532552599905, + "loss/hidden": 3.671875, + "loss/jsd": 0.0, + "loss/logits": 0.21390576139092446, + "step": 15930 + }, + { + "epoch": 0.5313333333333333, + "grad_norm": 30.875, + "grad_norm_var": 3.0072916666666667, + "learning_rate": 0.0001, + "loss": 7.8004, + "loss/crossentropy": 2.2248755604028703, + "loss/hidden": 3.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.21414327146485448, + "step": 15940 + }, + { + "epoch": 0.5316666666666666, + "grad_norm": 28.625, + "grad_norm_var": 7.127083333333333, + "learning_rate": 0.0001, + "loss": 7.8167, + "loss/crossentropy": 2.024421763420105, + "loss/hidden": 3.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.2099402707070112, + "step": 15950 + }, + { + "epoch": 0.532, + "grad_norm": 28.75, + "grad_norm_var": 7.255989583333333, + "learning_rate": 0.0001, + "loss": 7.9159, + "loss/crossentropy": 2.0035487972199917, + "loss/hidden": 3.780078125, + "loss/jsd": 0.0, + "loss/logits": 0.23340979432687164, + "step": 15960 + }, + { + "epoch": 0.5323333333333333, + "grad_norm": 31.75, + "grad_norm_var": 28.76640625, + "learning_rate": 0.0001, + "loss": 8.009, + "loss/crossentropy": 2.155665622651577, + "loss/hidden": 3.703515625, + "loss/jsd": 0.0, + "loss/logits": 0.23035227973014116, + "step": 15970 + }, + { + "epoch": 0.5326666666666666, + "grad_norm": 45.25, + "grad_norm_var": 19.4119140625, + "learning_rate": 0.0001, + "loss": 8.0717, + "loss/crossentropy": 2.0463739298284054, + "loss/hidden": 3.700390625, + "loss/jsd": 0.0, + "loss/logits": 0.24000506065785884, + "step": 15980 + }, + { + "epoch": 0.533, + "grad_norm": 32.75, + "grad_norm_var": 20.661458333333332, + "learning_rate": 0.0001, + "loss": 7.9986, + "loss/crossentropy": 2.107135473191738, + "loss/hidden": 3.735546875, + "loss/jsd": 0.0, + "loss/logits": 0.22552732955664395, + "step": 15990 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 28.625, + "grad_norm_var": 13.914583333333333, + "learning_rate": 0.0001, + "loss": 7.9784, + "loss/crossentropy": 2.142227107286453, + "loss/hidden": 3.600390625, + "loss/jsd": 0.0, + "loss/logits": 0.21058713737875223, + "step": 16000 + }, + { + "epoch": 0.5336666666666666, + "grad_norm": 33.0, + "grad_norm_var": 18.480208333333334, + "learning_rate": 0.0001, + "loss": 7.9253, + "loss/crossentropy": 2.209307189285755, + "loss/hidden": 3.671875, + "loss/jsd": 0.0, + "loss/logits": 0.22212357576936484, + "step": 16010 + }, + { + "epoch": 0.534, + "grad_norm": 27.625, + "grad_norm_var": 7.594791666666667, + "learning_rate": 0.0001, + "loss": 7.8662, + "loss/crossentropy": 2.009366624057293, + "loss/hidden": 3.6296875, + "loss/jsd": 0.0, + "loss/logits": 0.22330178935080766, + "step": 16020 + }, + { + "epoch": 0.5343333333333333, + "grad_norm": 29.625, + "grad_norm_var": 11.910872395833334, + "learning_rate": 0.0001, + "loss": 8.0186, + "loss/crossentropy": 2.1544925197958946, + "loss/hidden": 3.6453125, + "loss/jsd": 0.0, + "loss/logits": 0.22340870313346387, + "step": 16030 + }, + { + "epoch": 0.5346666666666666, + "grad_norm": 30.75, + "grad_norm_var": 14.717122395833334, + "learning_rate": 0.0001, + "loss": 7.8662, + "loss/crossentropy": 2.2394671350717545, + "loss/hidden": 3.53671875, + "loss/jsd": 0.0, + "loss/logits": 0.21254578419029713, + "step": 16040 + }, + { + "epoch": 0.535, + "grad_norm": 32.5, + "grad_norm_var": 4.167708333333334, + "learning_rate": 0.0001, + "loss": 7.9273, + "loss/crossentropy": 2.0182781517505646, + "loss/hidden": 3.722265625, + "loss/jsd": 0.0, + "loss/logits": 0.23184970542788505, + "step": 16050 + }, + { + "epoch": 0.5353333333333333, + "grad_norm": 34.75, + "grad_norm_var": 3.883072916666667, + "learning_rate": 0.0001, + "loss": 7.9456, + "loss/crossentropy": 2.200615034997463, + "loss/hidden": 3.752734375, + "loss/jsd": 0.0, + "loss/logits": 0.2355814663693309, + "step": 16060 + }, + { + "epoch": 0.5356666666666666, + "grad_norm": 31.0, + "grad_norm_var": 10.541666666666666, + "learning_rate": 0.0001, + "loss": 7.9085, + "loss/crossentropy": 1.7848753452301025, + "loss/hidden": 3.684375, + "loss/jsd": 0.0, + "loss/logits": 0.2178065821528435, + "step": 16070 + }, + { + "epoch": 0.536, + "grad_norm": 30.375, + "grad_norm_var": 9.554622395833333, + "learning_rate": 0.0001, + "loss": 7.9787, + "loss/crossentropy": 2.126433804631233, + "loss/hidden": 3.66640625, + "loss/jsd": 0.0, + "loss/logits": 0.2092861395329237, + "step": 16080 + }, + { + "epoch": 0.5363333333333333, + "grad_norm": 31.875, + "grad_norm_var": 2.609375, + "learning_rate": 0.0001, + "loss": 7.8776, + "loss/crossentropy": 2.0730943456292152, + "loss/hidden": 3.683984375, + "loss/jsd": 0.0, + "loss/logits": 0.22322714999318122, + "step": 16090 + }, + { + "epoch": 0.5366666666666666, + "grad_norm": 31.5, + "grad_norm_var": 4.9822265625, + "learning_rate": 0.0001, + "loss": 7.9642, + "loss/crossentropy": 2.1567077368497847, + "loss/hidden": 3.841015625, + "loss/jsd": 0.0, + "loss/logits": 0.24368763864040374, + "step": 16100 + }, + { + "epoch": 0.537, + "grad_norm": 27.875, + "grad_norm_var": 2.101822916666667, + "learning_rate": 0.0001, + "loss": 7.9671, + "loss/crossentropy": 2.0234420910477637, + "loss/hidden": 3.687109375, + "loss/jsd": 0.0, + "loss/logits": 0.20963803213089705, + "step": 16110 + }, + { + "epoch": 0.5373333333333333, + "grad_norm": 31.125, + "grad_norm_var": 2.290625, + "learning_rate": 0.0001, + "loss": 7.8891, + "loss/crossentropy": 2.162296248972416, + "loss/hidden": 3.69765625, + "loss/jsd": 0.0, + "loss/logits": 0.21969048418104647, + "step": 16120 + }, + { + "epoch": 0.5376666666666666, + "grad_norm": 28.125, + "grad_norm_var": 4.483072916666667, + "learning_rate": 0.0001, + "loss": 7.7938, + "loss/crossentropy": 2.0626881010830402, + "loss/hidden": 3.5109375, + "loss/jsd": 0.0, + "loss/logits": 0.20465954188257457, + "step": 16130 + }, + { + "epoch": 0.538, + "grad_norm": 31.25, + "grad_norm_var": 2.715625, + "learning_rate": 0.0001, + "loss": 7.9009, + "loss/crossentropy": 2.1950813859701155, + "loss/hidden": 3.492578125, + "loss/jsd": 0.0, + "loss/logits": 0.21106129735708237, + "step": 16140 + }, + { + "epoch": 0.5383333333333333, + "grad_norm": 33.75, + "grad_norm_var": 2.15, + "learning_rate": 0.0001, + "loss": 7.8827, + "loss/crossentropy": 2.0365090548992155, + "loss/hidden": 3.5890625, + "loss/jsd": 0.0, + "loss/logits": 0.19862626306712627, + "step": 16150 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 29.875, + "grad_norm_var": 2.46015625, + "learning_rate": 0.0001, + "loss": 8.0171, + "loss/crossentropy": 2.0772834539413454, + "loss/hidden": 3.652734375, + "loss/jsd": 0.0, + "loss/logits": 0.22482867874205112, + "step": 16160 + }, + { + "epoch": 0.539, + "grad_norm": 31.5, + "grad_norm_var": 13.727083333333333, + "learning_rate": 0.0001, + "loss": 7.9269, + "loss/crossentropy": 2.0035090267658235, + "loss/hidden": 3.639453125, + "loss/jsd": 0.0, + "loss/logits": 0.21533504147082566, + "step": 16170 + }, + { + "epoch": 0.5393333333333333, + "grad_norm": 29.625, + "grad_norm_var": 3.4058471876790845e+18, + "learning_rate": 0.0001, + "loss": 7.9526, + "loss/crossentropy": 2.0684109210968016, + "loss/hidden": 3.809765625, + "loss/jsd": 0.0, + "loss/logits": 0.24036269690841436, + "step": 16180 + }, + { + "epoch": 0.5396666666666666, + "grad_norm": 29.75, + "grad_norm_var": 1.0208333333333333, + "learning_rate": 0.0001, + "loss": 7.9665, + "loss/crossentropy": 2.0767479740083217, + "loss/hidden": 3.6546875, + "loss/jsd": 0.0, + "loss/logits": 0.22391563206911086, + "step": 16190 + }, + { + "epoch": 0.54, + "grad_norm": 32.75, + "grad_norm_var": 0.9249348958333333, + "learning_rate": 0.0001, + "loss": 7.8065, + "loss/crossentropy": 2.070406360924244, + "loss/hidden": 3.697265625, + "loss/jsd": 0.0, + "loss/logits": 0.22352562863379716, + "step": 16200 + }, + { + "epoch": 0.5403333333333333, + "grad_norm": 31.125, + "grad_norm_var": 2.8259765625, + "learning_rate": 0.0001, + "loss": 7.7915, + "loss/crossentropy": 2.135431842878461, + "loss/hidden": 3.585546875, + "loss/jsd": 0.0, + "loss/logits": 0.20899291220121086, + "step": 16210 + }, + { + "epoch": 0.5406666666666666, + "grad_norm": 27.75, + "grad_norm_var": 5.374934895833333, + "learning_rate": 0.0001, + "loss": 7.8726, + "loss/crossentropy": 1.9553619243204594, + "loss/hidden": 3.6859375, + "loss/jsd": 0.0, + "loss/logits": 0.21065005520358682, + "step": 16220 + }, + { + "epoch": 0.541, + "grad_norm": 32.25, + "grad_norm_var": 2.9205729166666665, + "learning_rate": 0.0001, + "loss": 7.9361, + "loss/crossentropy": 2.235548512637615, + "loss/hidden": 3.632421875, + "loss/jsd": 0.0, + "loss/logits": 0.23148855995386838, + "step": 16230 + }, + { + "epoch": 0.5413333333333333, + "grad_norm": 28.75, + "grad_norm_var": 5.443489583333333, + "learning_rate": 0.0001, + "loss": 7.9547, + "loss/crossentropy": 2.0689282923936845, + "loss/hidden": 3.73359375, + "loss/jsd": 0.0, + "loss/logits": 0.22652086950838565, + "step": 16240 + }, + { + "epoch": 0.5416666666666666, + "grad_norm": 31.875, + "grad_norm_var": 4.34765625, + "learning_rate": 0.0001, + "loss": 7.903, + "loss/crossentropy": 1.9587130278348923, + "loss/hidden": 3.82578125, + "loss/jsd": 0.0, + "loss/logits": 0.24109735526144505, + "step": 16250 + }, + { + "epoch": 0.542, + "grad_norm": 31.0, + "grad_norm_var": 2.956184895833333, + "learning_rate": 0.0001, + "loss": 7.9763, + "loss/crossentropy": 2.231505811214447, + "loss/hidden": 3.670703125, + "loss/jsd": 0.0, + "loss/logits": 0.22332519851624966, + "step": 16260 + }, + { + "epoch": 0.5423333333333333, + "grad_norm": 30.875, + "grad_norm_var": 2.87890625, + "learning_rate": 0.0001, + "loss": 7.9534, + "loss/crossentropy": 2.21473398655653, + "loss/hidden": 3.684765625, + "loss/jsd": 0.0, + "loss/logits": 0.2405968852341175, + "step": 16270 + }, + { + "epoch": 0.5426666666666666, + "grad_norm": 31.125, + "grad_norm_var": 2.044205729166667, + "learning_rate": 0.0001, + "loss": 7.8368, + "loss/crossentropy": 2.0223120510578156, + "loss/hidden": 3.600390625, + "loss/jsd": 0.0, + "loss/logits": 0.20706812832504512, + "step": 16280 + }, + { + "epoch": 0.543, + "grad_norm": 31.125, + "grad_norm_var": 3.6705729166666665, + "learning_rate": 0.0001, + "loss": 8.1192, + "loss/crossentropy": 2.2137394294142725, + "loss/hidden": 3.80234375, + "loss/jsd": 0.0, + "loss/logits": 0.24676046203821897, + "step": 16290 + }, + { + "epoch": 0.5433333333333333, + "grad_norm": 31.875, + "grad_norm_var": 2.3997395833333335, + "learning_rate": 0.0001, + "loss": 8.0137, + "loss/crossentropy": 2.1324679240584374, + "loss/hidden": 3.715234375, + "loss/jsd": 0.0, + "loss/logits": 0.21872996147722007, + "step": 16300 + }, + { + "epoch": 0.5436666666666666, + "grad_norm": 30.875, + "grad_norm_var": 6.243489583333333, + "learning_rate": 0.0001, + "loss": 8.0561, + "loss/crossentropy": 2.1422536253929136, + "loss/hidden": 3.705078125, + "loss/jsd": 0.0, + "loss/logits": 0.24038595696911216, + "step": 16310 + }, + { + "epoch": 0.544, + "grad_norm": 28.875, + "grad_norm_var": 15.54375, + "learning_rate": 0.0001, + "loss": 7.9935, + "loss/crossentropy": 1.994092260301113, + "loss/hidden": 3.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.20646399296820164, + "step": 16320 + }, + { + "epoch": 0.5443333333333333, + "grad_norm": 34.5, + "grad_norm_var": 8.833072916666667, + "learning_rate": 0.0001, + "loss": 7.8759, + "loss/crossentropy": 2.1263050198554994, + "loss/hidden": 3.73359375, + "loss/jsd": 0.0, + "loss/logits": 0.23247295394539833, + "step": 16330 + }, + { + "epoch": 0.5446666666666666, + "grad_norm": 32.25, + "grad_norm_var": 3.8926432291666666, + "learning_rate": 0.0001, + "loss": 7.8947, + "loss/crossentropy": 2.1508326224982737, + "loss/hidden": 3.615625, + "loss/jsd": 0.0, + "loss/logits": 0.22677485179156065, + "step": 16340 + }, + { + "epoch": 0.545, + "grad_norm": 33.75, + "grad_norm_var": 3.565625, + "learning_rate": 0.0001, + "loss": 7.9352, + "loss/crossentropy": 2.146810656785965, + "loss/hidden": 3.7203125, + "loss/jsd": 0.0, + "loss/logits": 0.22902542352676392, + "step": 16350 + }, + { + "epoch": 0.5453333333333333, + "grad_norm": 32.25, + "grad_norm_var": 2.5322916666666666, + "learning_rate": 0.0001, + "loss": 7.895, + "loss/crossentropy": 2.1730375468730925, + "loss/hidden": 3.6609375, + "loss/jsd": 0.0, + "loss/logits": 0.22302960231900215, + "step": 16360 + }, + { + "epoch": 0.5456666666666666, + "grad_norm": 28.875, + "grad_norm_var": 2.1747395833333334, + "learning_rate": 0.0001, + "loss": 7.965, + "loss/crossentropy": 2.1157036066055297, + "loss/hidden": 3.708203125, + "loss/jsd": 0.0, + "loss/logits": 0.24063412323594094, + "step": 16370 + }, + { + "epoch": 0.546, + "grad_norm": 29.75, + "grad_norm_var": 6.789322916666666, + "learning_rate": 0.0001, + "loss": 7.9686, + "loss/crossentropy": 2.04927616417408, + "loss/hidden": 3.698828125, + "loss/jsd": 0.0, + "loss/logits": 0.21224019005894662, + "step": 16380 + }, + { + "epoch": 0.5463333333333333, + "grad_norm": 31.125, + "grad_norm_var": 5.550455729166667, + "learning_rate": 0.0001, + "loss": 7.9747, + "loss/crossentropy": 2.068675779551268, + "loss/hidden": 3.584765625, + "loss/jsd": 0.0, + "loss/logits": 0.2084495802409947, + "step": 16390 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 30.875, + "grad_norm_var": 1.5639973958333333, + "learning_rate": 0.0001, + "loss": 7.988, + "loss/crossentropy": 2.0940013602375984, + "loss/hidden": 3.691796875, + "loss/jsd": 0.0, + "loss/logits": 0.22360229659825565, + "step": 16400 + }, + { + "epoch": 0.547, + "grad_norm": 31.625, + "grad_norm_var": 1.7455729166666667, + "learning_rate": 0.0001, + "loss": 8.0907, + "loss/crossentropy": 2.1774737536907196, + "loss/hidden": 3.6859375, + "loss/jsd": 0.0, + "loss/logits": 0.22327901497483255, + "step": 16410 + }, + { + "epoch": 0.5473333333333333, + "grad_norm": 29.875, + "grad_norm_var": 4.50625, + "learning_rate": 0.0001, + "loss": 8.0022, + "loss/crossentropy": 2.1608075901865957, + "loss/hidden": 3.700390625, + "loss/jsd": 0.0, + "loss/logits": 0.25244998149573805, + "step": 16420 + }, + { + "epoch": 0.5476666666666666, + "grad_norm": 31.875, + "grad_norm_var": 3.2671223958333333, + "learning_rate": 0.0001, + "loss": 8.1234, + "loss/crossentropy": 1.982553929835558, + "loss/hidden": 3.565625, + "loss/jsd": 0.0, + "loss/logits": 0.20166566986590623, + "step": 16430 + }, + { + "epoch": 0.548, + "grad_norm": 32.5, + "grad_norm_var": 1.7520833333333334, + "learning_rate": 0.0001, + "loss": 8.037, + "loss/crossentropy": 2.0213874965906142, + "loss/hidden": 3.787109375, + "loss/jsd": 0.0, + "loss/logits": 0.23120209593325852, + "step": 16440 + }, + { + "epoch": 0.5483333333333333, + "grad_norm": 32.5, + "grad_norm_var": 2.7625, + "learning_rate": 0.0001, + "loss": 7.9356, + "loss/crossentropy": 2.0151995003223417, + "loss/hidden": 3.6828125, + "loss/jsd": 0.0, + "loss/logits": 0.21384722124785185, + "step": 16450 + }, + { + "epoch": 0.5486666666666666, + "grad_norm": 30.75, + "grad_norm_var": 2.8580729166666665, + "learning_rate": 0.0001, + "loss": 8.0426, + "loss/crossentropy": 2.258426922559738, + "loss/hidden": 3.57109375, + "loss/jsd": 0.0, + "loss/logits": 0.21724292561411856, + "step": 16460 + }, + { + "epoch": 0.549, + "grad_norm": 30.625, + "grad_norm_var": 2.6211653255110676e+18, + "learning_rate": 0.0001, + "loss": 7.8979, + "loss/crossentropy": 2.1004390507936477, + "loss/hidden": 3.63125, + "loss/jsd": 0.0, + "loss/logits": 0.2168941769748926, + "step": 16470 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 32.0, + "grad_norm_var": 21.358072916666668, + "learning_rate": 0.0001, + "loss": 7.9368, + "loss/crossentropy": 2.234841299057007, + "loss/hidden": 3.671484375, + "loss/jsd": 0.0, + "loss/logits": 0.22458531036973, + "step": 16480 + }, + { + "epoch": 0.5496666666666666, + "grad_norm": 31.0, + "grad_norm_var": 19.398372395833334, + "learning_rate": 0.0001, + "loss": 7.7756, + "loss/crossentropy": 1.9220365844666958, + "loss/hidden": 3.650390625, + "loss/jsd": 0.0, + "loss/logits": 0.21966882292181253, + "step": 16490 + }, + { + "epoch": 0.55, + "grad_norm": 29.25, + "grad_norm_var": 10.578059895833333, + "learning_rate": 0.0001, + "loss": 7.9483, + "loss/crossentropy": 2.137505892664194, + "loss/hidden": 3.684375, + "loss/jsd": 0.0, + "loss/logits": 0.20756643787026405, + "step": 16500 + }, + { + "epoch": 0.5503333333333333, + "grad_norm": 30.5, + "grad_norm_var": 4.569791666666666, + "learning_rate": 0.0001, + "loss": 7.842, + "loss/crossentropy": 2.163301798701286, + "loss/hidden": 3.641015625, + "loss/jsd": 0.0, + "loss/logits": 0.21592859141528606, + "step": 16510 + }, + { + "epoch": 0.5506666666666666, + "grad_norm": 30.25, + "grad_norm_var": 1.9749348958333333, + "learning_rate": 0.0001, + "loss": 7.9053, + "loss/crossentropy": 1.8833866529166698, + "loss/hidden": 3.783203125, + "loss/jsd": 0.0, + "loss/logits": 0.22102598939090967, + "step": 16520 + }, + { + "epoch": 0.551, + "grad_norm": 30.5, + "grad_norm_var": 2.0176432291666666, + "learning_rate": 0.0001, + "loss": 7.8003, + "loss/crossentropy": 1.942694688588381, + "loss/hidden": 3.71875, + "loss/jsd": 0.0, + "loss/logits": 0.21318345218896867, + "step": 16530 + }, + { + "epoch": 0.5513333333333333, + "grad_norm": 33.0, + "grad_norm_var": 2.539322916666667, + "learning_rate": 0.0001, + "loss": 7.7828, + "loss/crossentropy": 2.0692317470908166, + "loss/hidden": 3.541015625, + "loss/jsd": 0.0, + "loss/logits": 0.20164419133216144, + "step": 16540 + }, + { + "epoch": 0.5516666666666666, + "grad_norm": 29.5, + "grad_norm_var": 2.595572916666667, + "learning_rate": 0.0001, + "loss": 7.9161, + "loss/crossentropy": 2.046563369035721, + "loss/hidden": 3.6890625, + "loss/jsd": 0.0, + "loss/logits": 0.22405224461108447, + "step": 16550 + }, + { + "epoch": 0.552, + "grad_norm": 34.75, + "grad_norm_var": 3.570572916666667, + "learning_rate": 0.0001, + "loss": 7.9911, + "loss/crossentropy": 2.018775662779808, + "loss/hidden": 3.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.21541824340820312, + "step": 16560 + }, + { + "epoch": 0.5523333333333333, + "grad_norm": 29.875, + "grad_norm_var": 3.6080729166666665, + "learning_rate": 0.0001, + "loss": 7.9031, + "loss/crossentropy": 2.25411321669817, + "loss/hidden": 3.666015625, + "loss/jsd": 0.0, + "loss/logits": 0.2227392230182886, + "step": 16570 + }, + { + "epoch": 0.5526666666666666, + "grad_norm": 30.25, + "grad_norm_var": 2.8603515625, + "learning_rate": 0.0001, + "loss": 7.8556, + "loss/crossentropy": 2.0392999947071075, + "loss/hidden": 3.684375, + "loss/jsd": 0.0, + "loss/logits": 0.2251502934843302, + "step": 16580 + }, + { + "epoch": 0.553, + "grad_norm": 30.75, + "grad_norm_var": 3.153125, + "learning_rate": 0.0001, + "loss": 7.7396, + "loss/crossentropy": 1.9471760131418705, + "loss/hidden": 3.534765625, + "loss/jsd": 0.0, + "loss/logits": 0.1881936783902347, + "step": 16590 + }, + { + "epoch": 0.5533333333333333, + "grad_norm": 32.25, + "grad_norm_var": 4.223883836303868e+18, + "learning_rate": 0.0001, + "loss": 7.9172, + "loss/crossentropy": 2.0934195905923842, + "loss/hidden": 3.65703125, + "loss/jsd": 0.0, + "loss/logits": 0.217781463265419, + "step": 16600 + }, + { + "epoch": 0.5536666666666666, + "grad_norm": 30.125, + "grad_norm_var": 4.223883835918516e+18, + "learning_rate": 0.0001, + "loss": 7.8719, + "loss/crossentropy": 2.053102213144302, + "loss/hidden": 3.6890625, + "loss/jsd": 0.0, + "loss/logits": 0.22234885692596434, + "step": 16610 + }, + { + "epoch": 0.554, + "grad_norm": 32.0, + "grad_norm_var": 4.234309895833333, + "learning_rate": 0.0001, + "loss": 7.9785, + "loss/crossentropy": 2.0579131454229356, + "loss/hidden": 3.679296875, + "loss/jsd": 0.0, + "loss/logits": 0.22978297639638184, + "step": 16620 + }, + { + "epoch": 0.5543333333333333, + "grad_norm": 29.0, + "grad_norm_var": 6.4478515625, + "learning_rate": 0.0001, + "loss": 7.9221, + "loss/crossentropy": 2.098356659710407, + "loss/hidden": 3.72421875, + "loss/jsd": 0.0, + "loss/logits": 0.22436762768775226, + "step": 16630 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 29.25, + "grad_norm_var": 3.4427083333333335, + "learning_rate": 0.0001, + "loss": 7.8154, + "loss/crossentropy": 1.9613312944769858, + "loss/hidden": 3.689453125, + "loss/jsd": 0.0, + "loss/logits": 0.21493730675429107, + "step": 16640 + }, + { + "epoch": 0.555, + "grad_norm": 29.5, + "grad_norm_var": 1.4729166666666667, + "learning_rate": 0.0001, + "loss": 7.9289, + "loss/crossentropy": 1.970650000870228, + "loss/hidden": 3.795703125, + "loss/jsd": 0.0, + "loss/logits": 0.22653428725898267, + "step": 16650 + }, + { + "epoch": 0.5553333333333333, + "grad_norm": 31.75, + "grad_norm_var": 2.425, + "learning_rate": 0.0001, + "loss": 7.8909, + "loss/crossentropy": 2.1313917048275473, + "loss/hidden": 3.67421875, + "loss/jsd": 0.0, + "loss/logits": 0.2265587305650115, + "step": 16660 + }, + { + "epoch": 0.5556666666666666, + "grad_norm": 32.5, + "grad_norm_var": 3.2223307291666665, + "learning_rate": 0.0001, + "loss": 7.8742, + "loss/crossentropy": 2.010394226014614, + "loss/hidden": 3.540625, + "loss/jsd": 0.0, + "loss/logits": 0.19476988408714532, + "step": 16670 + }, + { + "epoch": 0.556, + "grad_norm": 33.25, + "grad_norm_var": 2.989518229166667, + "learning_rate": 0.0001, + "loss": 7.9211, + "loss/crossentropy": 2.206305223703384, + "loss/hidden": 3.595703125, + "loss/jsd": 0.0, + "loss/logits": 0.22279220167547464, + "step": 16680 + }, + { + "epoch": 0.5563333333333333, + "grad_norm": 29.5, + "grad_norm_var": 7.698372395833333, + "learning_rate": 0.0001, + "loss": 7.8795, + "loss/crossentropy": 2.1127166926860808, + "loss/hidden": 3.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.2053209213539958, + "step": 16690 + }, + { + "epoch": 0.5566666666666666, + "grad_norm": 30.25, + "grad_norm_var": 3.4082682291666666, + "learning_rate": 0.0001, + "loss": 7.9118, + "loss/crossentropy": 1.980313377827406, + "loss/hidden": 3.665234375, + "loss/jsd": 0.0, + "loss/logits": 0.19750017933547498, + "step": 16700 + }, + { + "epoch": 0.557, + "grad_norm": 32.0, + "grad_norm_var": 4.745572916666666, + "learning_rate": 0.0001, + "loss": 7.8491, + "loss/crossentropy": 2.187881177663803, + "loss/hidden": 3.666796875, + "loss/jsd": 0.0, + "loss/logits": 0.2327125236392021, + "step": 16710 + }, + { + "epoch": 0.5573333333333333, + "grad_norm": 30.125, + "grad_norm_var": 3.0895182291666665, + "learning_rate": 0.0001, + "loss": 7.8578, + "loss/crossentropy": 2.0527888640761374, + "loss/hidden": 3.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.24014483857899904, + "step": 16720 + }, + { + "epoch": 0.5576666666666666, + "grad_norm": 29.125, + "grad_norm_var": 2.315625, + "learning_rate": 0.0001, + "loss": 7.7581, + "loss/crossentropy": 1.989125171303749, + "loss/hidden": 3.584765625, + "loss/jsd": 0.0, + "loss/logits": 0.20171179957687854, + "step": 16730 + }, + { + "epoch": 0.558, + "grad_norm": 34.0, + "grad_norm_var": 3.3030598958333335, + "learning_rate": 0.0001, + "loss": 7.8911, + "loss/crossentropy": 1.909910400211811, + "loss/hidden": 3.766015625, + "loss/jsd": 0.0, + "loss/logits": 0.2186163429170847, + "step": 16740 + }, + { + "epoch": 0.5583333333333333, + "grad_norm": 30.875, + "grad_norm_var": 3.330208333333333, + "learning_rate": 0.0001, + "loss": 7.8985, + "loss/crossentropy": 2.0753150559961795, + "loss/hidden": 3.61640625, + "loss/jsd": 0.0, + "loss/logits": 0.21899790465831756, + "step": 16750 + }, + { + "epoch": 0.5586666666666666, + "grad_norm": 30.875, + "grad_norm_var": 2.187239583333333, + "learning_rate": 0.0001, + "loss": 7.8861, + "loss/crossentropy": 2.1016260892152787, + "loss/hidden": 3.615625, + "loss/jsd": 0.0, + "loss/logits": 0.210488342307508, + "step": 16760 + }, + { + "epoch": 0.559, + "grad_norm": 31.0, + "grad_norm_var": 2.8421223958333335, + "learning_rate": 0.0001, + "loss": 8.0766, + "loss/crossentropy": 2.208987255394459, + "loss/hidden": 3.822265625, + "loss/jsd": 0.0, + "loss/logits": 0.245903043076396, + "step": 16770 + }, + { + "epoch": 0.5593333333333333, + "grad_norm": 36.25, + "grad_norm_var": 320.3322265625, + "learning_rate": 0.0001, + "loss": 7.9084, + "loss/crossentropy": 2.0669932678341865, + "loss/hidden": 3.662109375, + "loss/jsd": 0.0, + "loss/logits": 0.21224353071302177, + "step": 16780 + }, + { + "epoch": 0.5596666666666666, + "grad_norm": 43.5, + "grad_norm_var": 305.8958333333333, + "learning_rate": 0.0001, + "loss": 8.0134, + "loss/crossentropy": 2.2563454896211623, + "loss/hidden": 3.6125, + "loss/jsd": 0.0, + "loss/logits": 0.22311148904263972, + "step": 16790 + }, + { + "epoch": 0.56, + "grad_norm": 32.75, + "grad_norm_var": 20.851497395833334, + "learning_rate": 0.0001, + "loss": 8.0603, + "loss/crossentropy": 2.2054395377635956, + "loss/hidden": 3.583203125, + "loss/jsd": 0.0, + "loss/logits": 0.20811190865933896, + "step": 16800 + }, + { + "epoch": 0.5603333333333333, + "grad_norm": 28.25, + "grad_norm_var": 2.448958333333333, + "learning_rate": 0.0001, + "loss": 7.7593, + "loss/crossentropy": 2.236998660862446, + "loss/hidden": 3.611328125, + "loss/jsd": 0.0, + "loss/logits": 0.22802891582250595, + "step": 16810 + }, + { + "epoch": 0.5606666666666666, + "grad_norm": 31.5, + "grad_norm_var": 3.7239583333333335, + "learning_rate": 0.0001, + "loss": 7.7787, + "loss/crossentropy": 1.884455829113722, + "loss/hidden": 3.53828125, + "loss/jsd": 0.0, + "loss/logits": 0.19550493340939284, + "step": 16820 + }, + { + "epoch": 0.561, + "grad_norm": 32.25, + "grad_norm_var": 3.5822265625, + "learning_rate": 0.0001, + "loss": 7.9028, + "loss/crossentropy": 2.0985020123422147, + "loss/hidden": 3.604296875, + "loss/jsd": 0.0, + "loss/logits": 0.2114247432909906, + "step": 16830 + }, + { + "epoch": 0.5613333333333334, + "grad_norm": 29.75, + "grad_norm_var": 2.0468098958333334, + "learning_rate": 0.0001, + "loss": 7.9145, + "loss/crossentropy": 1.935218346118927, + "loss/hidden": 3.733203125, + "loss/jsd": 0.0, + "loss/logits": 0.22361029600724577, + "step": 16840 + }, + { + "epoch": 0.5616666666666666, + "grad_norm": 29.75, + "grad_norm_var": 2.8337890625, + "learning_rate": 0.0001, + "loss": 7.8986, + "loss/crossentropy": 1.9841939061880112, + "loss/hidden": 3.7, + "loss/jsd": 0.0, + "loss/logits": 0.20547962225973607, + "step": 16850 + }, + { + "epoch": 0.562, + "grad_norm": 33.0, + "grad_norm_var": 10.559830729166666, + "learning_rate": 0.0001, + "loss": 7.9661, + "loss/crossentropy": 2.242799472808838, + "loss/hidden": 3.653515625, + "loss/jsd": 0.0, + "loss/logits": 0.24114026986062526, + "step": 16860 + }, + { + "epoch": 0.5623333333333334, + "grad_norm": 30.75, + "grad_norm_var": 10.02265625, + "learning_rate": 0.0001, + "loss": 7.9501, + "loss/crossentropy": 2.272703301906586, + "loss/hidden": 3.5921875, + "loss/jsd": 0.0, + "loss/logits": 0.219810495339334, + "step": 16870 + }, + { + "epoch": 0.5626666666666666, + "grad_norm": 28.75, + "grad_norm_var": 8.073893229166666, + "learning_rate": 0.0001, + "loss": 7.8381, + "loss/crossentropy": 1.99768578261137, + "loss/hidden": 3.56796875, + "loss/jsd": 0.0, + "loss/logits": 0.229691500402987, + "step": 16880 + }, + { + "epoch": 0.563, + "grad_norm": 30.625, + "grad_norm_var": 3.9358723958333335, + "learning_rate": 0.0001, + "loss": 7.7505, + "loss/crossentropy": 1.958486919105053, + "loss/hidden": 3.501171875, + "loss/jsd": 0.0, + "loss/logits": 0.19877009009942412, + "step": 16890 + }, + { + "epoch": 0.5633333333333334, + "grad_norm": 31.625, + "grad_norm_var": 12.167643229166666, + "learning_rate": 0.0001, + "loss": 7.9282, + "loss/crossentropy": 2.2899662777781487, + "loss/hidden": 3.60703125, + "loss/jsd": 0.0, + "loss/logits": 0.22736097928136587, + "step": 16900 + }, + { + "epoch": 0.5636666666666666, + "grad_norm": 31.625, + "grad_norm_var": 1.9051432291666666, + "learning_rate": 0.0001, + "loss": 7.9337, + "loss/crossentropy": 2.046753417700529, + "loss/hidden": 3.581640625, + "loss/jsd": 0.0, + "loss/logits": 0.20713044237345457, + "step": 16910 + }, + { + "epoch": 0.564, + "grad_norm": 31.5, + "grad_norm_var": 12.576041666666667, + "learning_rate": 0.0001, + "loss": 7.8576, + "loss/crossentropy": 2.137194776535034, + "loss/hidden": 3.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.22013001535087823, + "step": 16920 + }, + { + "epoch": 0.5643333333333334, + "grad_norm": 31.25, + "grad_norm_var": 15.774934895833333, + "learning_rate": 0.0001, + "loss": 7.913, + "loss/crossentropy": 1.9944169409573078, + "loss/hidden": 3.628125, + "loss/jsd": 0.0, + "loss/logits": 0.21307808943092824, + "step": 16930 + }, + { + "epoch": 0.5646666666666667, + "grad_norm": 30.375, + "grad_norm_var": 5.337955729166667, + "learning_rate": 0.0001, + "loss": 7.886, + "loss/crossentropy": 2.018931347131729, + "loss/hidden": 3.6046875, + "loss/jsd": 0.0, + "loss/logits": 0.21659799050539733, + "step": 16940 + }, + { + "epoch": 0.565, + "grad_norm": 30.375, + "grad_norm_var": 7.773958333333334, + "learning_rate": 0.0001, + "loss": 7.8816, + "loss/crossentropy": 2.0590699821710587, + "loss/hidden": 3.665234375, + "loss/jsd": 0.0, + "loss/logits": 0.20541296005249024, + "step": 16950 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 31.5, + "grad_norm_var": 5.34140625, + "learning_rate": 0.0001, + "loss": 7.9332, + "loss/crossentropy": 2.048498646169901, + "loss/hidden": 3.7015625, + "loss/jsd": 0.0, + "loss/logits": 0.21477829590439795, + "step": 16960 + }, + { + "epoch": 0.5656666666666667, + "grad_norm": 28.875, + "grad_norm_var": 2.6676432291666665, + "learning_rate": 0.0001, + "loss": 7.864, + "loss/crossentropy": 2.0648021958768368, + "loss/hidden": 3.712109375, + "loss/jsd": 0.0, + "loss/logits": 0.2224846264347434, + "step": 16970 + }, + { + "epoch": 0.566, + "grad_norm": 33.5, + "grad_norm_var": 2.24765625, + "learning_rate": 0.0001, + "loss": 7.7163, + "loss/crossentropy": 2.107031860947609, + "loss/hidden": 3.53671875, + "loss/jsd": 0.0, + "loss/logits": 0.20737082306295634, + "step": 16980 + }, + { + "epoch": 0.5663333333333334, + "grad_norm": 32.0, + "grad_norm_var": 7.234830729166666, + "learning_rate": 0.0001, + "loss": 8.0486, + "loss/crossentropy": 2.139879457652569, + "loss/hidden": 3.6375, + "loss/jsd": 0.0, + "loss/logits": 0.23415213711559774, + "step": 16990 + }, + { + "epoch": 0.5666666666666667, + "grad_norm": 28.375, + "grad_norm_var": 26.10625, + "learning_rate": 0.0001, + "loss": 7.9511, + "loss/crossentropy": 2.0089096277952194, + "loss/hidden": 3.590234375, + "loss/jsd": 0.0, + "loss/logits": 0.20133000621572136, + "step": 17000 + }, + { + "epoch": 0.567, + "grad_norm": 32.5, + "grad_norm_var": 28.608072916666668, + "learning_rate": 0.0001, + "loss": 7.8054, + "loss/crossentropy": 1.9827151045203208, + "loss/hidden": 3.64296875, + "loss/jsd": 0.0, + "loss/logits": 0.2022013606503606, + "step": 17010 + }, + { + "epoch": 0.5673333333333334, + "grad_norm": 30.5, + "grad_norm_var": 519.3660807291667, + "learning_rate": 0.0001, + "loss": 8.0306, + "loss/crossentropy": 2.227891056239605, + "loss/hidden": 3.672265625, + "loss/jsd": 0.0, + "loss/logits": 0.23921767324209214, + "step": 17020 + }, + { + "epoch": 0.5676666666666667, + "grad_norm": 30.75, + "grad_norm_var": 517.3622395833333, + "learning_rate": 0.0001, + "loss": 7.7689, + "loss/crossentropy": 1.9673498637974263, + "loss/hidden": 3.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.20032266862690448, + "step": 17030 + }, + { + "epoch": 0.568, + "grad_norm": 35.0, + "grad_norm_var": 5.939583333333333, + "learning_rate": 0.0001, + "loss": 7.8396, + "loss/crossentropy": 2.0264181800186636, + "loss/hidden": 3.62734375, + "loss/jsd": 0.0, + "loss/logits": 0.22799574863165617, + "step": 17040 + }, + { + "epoch": 0.5683333333333334, + "grad_norm": 30.75, + "grad_norm_var": 4.541666666666667, + "learning_rate": 0.0001, + "loss": 7.8507, + "loss/crossentropy": 2.1397699415683746, + "loss/hidden": 3.669140625, + "loss/jsd": 0.0, + "loss/logits": 0.22351325545459985, + "step": 17050 + }, + { + "epoch": 0.5686666666666667, + "grad_norm": 33.0, + "grad_norm_var": 3.1244140625, + "learning_rate": 0.0001, + "loss": 7.9212, + "loss/crossentropy": 2.0302638575434684, + "loss/hidden": 3.717578125, + "loss/jsd": 0.0, + "loss/logits": 0.21876529976725578, + "step": 17060 + }, + { + "epoch": 0.569, + "grad_norm": 36.75, + "grad_norm_var": 6.9625, + "learning_rate": 0.0001, + "loss": 7.9896, + "loss/crossentropy": 2.150805290043354, + "loss/hidden": 3.660546875, + "loss/jsd": 0.0, + "loss/logits": 0.22219337709248066, + "step": 17070 + }, + { + "epoch": 0.5693333333333334, + "grad_norm": 32.5, + "grad_norm_var": 3.405847187879013e+18, + "learning_rate": 0.0001, + "loss": 8.0392, + "loss/crossentropy": 2.158428954333067, + "loss/hidden": 3.6515625, + "loss/jsd": 0.0, + "loss/logits": 0.22352358270436526, + "step": 17080 + }, + { + "epoch": 0.5696666666666667, + "grad_norm": 31.25, + "grad_norm_var": 3.405847188432661e+18, + "learning_rate": 0.0001, + "loss": 7.9175, + "loss/crossentropy": 2.0420360594987867, + "loss/hidden": 3.608203125, + "loss/jsd": 0.0, + "loss/logits": 0.20006783921271562, + "step": 17090 + }, + { + "epoch": 0.57, + "grad_norm": 28.75, + "grad_norm_var": 3.9212890625, + "learning_rate": 0.0001, + "loss": 7.9392, + "loss/crossentropy": 2.012987617403269, + "loss/hidden": 3.65625, + "loss/jsd": 0.0, + "loss/logits": 0.21622594874352216, + "step": 17100 + }, + { + "epoch": 0.5703333333333334, + "grad_norm": 33.75, + "grad_norm_var": 4.261393229166667, + "learning_rate": 0.0001, + "loss": 7.891, + "loss/crossentropy": 1.951448941975832, + "loss/hidden": 3.6390625, + "loss/jsd": 0.0, + "loss/logits": 0.22058595921844243, + "step": 17110 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 29.25, + "grad_norm_var": 1.7567057291666666, + "learning_rate": 0.0001, + "loss": 7.8458, + "loss/crossentropy": 2.106409525871277, + "loss/hidden": 3.5765625, + "loss/jsd": 0.0, + "loss/logits": 0.20967069901525975, + "step": 17120 + }, + { + "epoch": 0.571, + "grad_norm": 30.25, + "grad_norm_var": 1.96875, + "learning_rate": 0.0001, + "loss": 7.7126, + "loss/crossentropy": 2.0033893555402758, + "loss/hidden": 3.51640625, + "loss/jsd": 0.0, + "loss/logits": 0.1892759721726179, + "step": 17130 + }, + { + "epoch": 0.5713333333333334, + "grad_norm": 28.5, + "grad_norm_var": 2.662239583333333, + "learning_rate": 0.0001, + "loss": 7.8506, + "loss/crossentropy": 2.1353479593992235, + "loss/hidden": 3.612890625, + "loss/jsd": 0.0, + "loss/logits": 0.21052795574069022, + "step": 17140 + }, + { + "epoch": 0.5716666666666667, + "grad_norm": 32.5, + "grad_norm_var": 2.648398029737392e+18, + "learning_rate": 0.0001, + "loss": 7.9248, + "loss/crossentropy": 2.06460902094841, + "loss/hidden": 3.50859375, + "loss/jsd": 0.0, + "loss/logits": 0.20914022997021675, + "step": 17150 + }, + { + "epoch": 0.572, + "grad_norm": 30.5, + "grad_norm_var": 2.648398030388348e+18, + "learning_rate": 0.0001, + "loss": 7.9167, + "loss/crossentropy": 2.0927888706326483, + "loss/hidden": 3.646484375, + "loss/jsd": 0.0, + "loss/logits": 0.23212119033560158, + "step": 17160 + }, + { + "epoch": 0.5723333333333334, + "grad_norm": 29.375, + "grad_norm_var": 5.845833333333333, + "learning_rate": 0.0001, + "loss": 7.7342, + "loss/crossentropy": 1.9288011252880097, + "loss/hidden": 3.569140625, + "loss/jsd": 0.0, + "loss/logits": 0.19227788979187607, + "step": 17170 + }, + { + "epoch": 0.5726666666666667, + "grad_norm": 31.75, + "grad_norm_var": 5.4416015625, + "learning_rate": 0.0001, + "loss": 7.8135, + "loss/crossentropy": 2.0256782703101637, + "loss/hidden": 3.76328125, + "loss/jsd": 0.0, + "loss/logits": 0.22770339585840702, + "step": 17180 + }, + { + "epoch": 0.573, + "grad_norm": 30.75, + "grad_norm_var": 3.3447265625, + "learning_rate": 0.0001, + "loss": 7.8143, + "loss/crossentropy": 2.1895001590251923, + "loss/hidden": 3.615625, + "loss/jsd": 0.0, + "loss/logits": 0.21934428252279758, + "step": 17190 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 28.25, + "grad_norm_var": 3.7504557291666667, + "learning_rate": 0.0001, + "loss": 7.7612, + "loss/crossentropy": 2.141167312860489, + "loss/hidden": 3.6140625, + "loss/jsd": 0.0, + "loss/logits": 0.2146891826763749, + "step": 17200 + }, + { + "epoch": 0.5736666666666667, + "grad_norm": 33.25, + "grad_norm_var": 4.369791666666667, + "learning_rate": 0.0001, + "loss": 7.7998, + "loss/crossentropy": 2.129530963301659, + "loss/hidden": 3.57578125, + "loss/jsd": 0.0, + "loss/logits": 0.21841056421399116, + "step": 17210 + }, + { + "epoch": 0.574, + "grad_norm": 34.25, + "grad_norm_var": 5.690559895833333, + "learning_rate": 0.0001, + "loss": 7.7933, + "loss/crossentropy": 2.0486621774733065, + "loss/hidden": 3.641015625, + "loss/jsd": 0.0, + "loss/logits": 0.23156200405210256, + "step": 17220 + }, + { + "epoch": 0.5743333333333334, + "grad_norm": 32.5, + "grad_norm_var": 1.8551432291666667, + "learning_rate": 0.0001, + "loss": 7.7678, + "loss/crossentropy": 1.8859696760773659, + "loss/hidden": 3.709765625, + "loss/jsd": 0.0, + "loss/logits": 0.22125184228643774, + "step": 17230 + }, + { + "epoch": 0.5746666666666667, + "grad_norm": 30.75, + "grad_norm_var": 0.84140625, + "learning_rate": 0.0001, + "loss": 7.9437, + "loss/crossentropy": 2.071402122825384, + "loss/hidden": 3.684375, + "loss/jsd": 0.0, + "loss/logits": 0.21905634058639406, + "step": 17240 + }, + { + "epoch": 0.575, + "grad_norm": 30.375, + "grad_norm_var": 1.1559895833333333, + "learning_rate": 0.0001, + "loss": 7.9371, + "loss/crossentropy": 2.094747845083475, + "loss/hidden": 3.583984375, + "loss/jsd": 0.0, + "loss/logits": 0.21283579124137758, + "step": 17250 + }, + { + "epoch": 0.5753333333333334, + "grad_norm": 30.875, + "grad_norm_var": 1.7791015625, + "learning_rate": 0.0001, + "loss": 7.8497, + "loss/crossentropy": 2.0071739844977854, + "loss/hidden": 3.646875, + "loss/jsd": 0.0, + "loss/logits": 0.2293556292541325, + "step": 17260 + }, + { + "epoch": 0.5756666666666667, + "grad_norm": 36.75, + "grad_norm_var": 5.1978515625, + "learning_rate": 0.0001, + "loss": 7.8883, + "loss/crossentropy": 2.0315137624740602, + "loss/hidden": 3.611328125, + "loss/jsd": 0.0, + "loss/logits": 0.21106694657355546, + "step": 17270 + }, + { + "epoch": 0.576, + "grad_norm": 32.75, + "grad_norm_var": 4.042122395833333, + "learning_rate": 0.0001, + "loss": 7.7853, + "loss/crossentropy": 2.161001367866993, + "loss/hidden": 3.676953125, + "loss/jsd": 0.0, + "loss/logits": 0.22858329731971025, + "step": 17280 + }, + { + "epoch": 0.5763333333333334, + "grad_norm": 29.75, + "grad_norm_var": 2.7973307291666667, + "learning_rate": 0.0001, + "loss": 7.8583, + "loss/crossentropy": 2.130069175362587, + "loss/hidden": 3.63359375, + "loss/jsd": 0.0, + "loss/logits": 0.2083722459152341, + "step": 17290 + }, + { + "epoch": 0.5766666666666667, + "grad_norm": 31.25, + "grad_norm_var": 3.4238932291666666, + "learning_rate": 0.0001, + "loss": 7.8721, + "loss/crossentropy": 2.082015645503998, + "loss/hidden": 3.614453125, + "loss/jsd": 0.0, + "loss/logits": 0.20946309231221677, + "step": 17300 + }, + { + "epoch": 0.577, + "grad_norm": 30.75, + "grad_norm_var": 3.5875, + "learning_rate": 0.0001, + "loss": 7.9545, + "loss/crossentropy": 2.096890838444233, + "loss/hidden": 3.560546875, + "loss/jsd": 0.0, + "loss/logits": 0.20209384206682443, + "step": 17310 + }, + { + "epoch": 0.5773333333333334, + "grad_norm": 34.75, + "grad_norm_var": 4.995572916666666, + "learning_rate": 0.0001, + "loss": 7.8557, + "loss/crossentropy": 2.106398382782936, + "loss/hidden": 3.70546875, + "loss/jsd": 0.0, + "loss/logits": 0.21345611102879047, + "step": 17320 + }, + { + "epoch": 0.5776666666666667, + "grad_norm": 34.0, + "grad_norm_var": 2.546875, + "learning_rate": 0.0001, + "loss": 7.8012, + "loss/crossentropy": 2.096838581562042, + "loss/hidden": 3.58828125, + "loss/jsd": 0.0, + "loss/logits": 0.2053753226995468, + "step": 17330 + }, + { + "epoch": 0.578, + "grad_norm": 26.875, + "grad_norm_var": 2.9572464468024796e+18, + "learning_rate": 0.0001, + "loss": 7.8668, + "loss/crossentropy": 2.1166788838803767, + "loss/hidden": 3.55546875, + "loss/jsd": 0.0, + "loss/logits": 0.20833127275109292, + "step": 17340 + }, + { + "epoch": 0.5783333333333334, + "grad_norm": 29.625, + "grad_norm_var": 7.27265625, + "learning_rate": 0.0001, + "loss": 7.7414, + "loss/crossentropy": 1.9886778131127358, + "loss/hidden": 3.65390625, + "loss/jsd": 0.0, + "loss/logits": 0.21065135411918162, + "step": 17350 + }, + { + "epoch": 0.5786666666666667, + "grad_norm": 29.5, + "grad_norm_var": 11.183268229166666, + "learning_rate": 0.0001, + "loss": 7.9642, + "loss/crossentropy": 2.0476474441587924, + "loss/hidden": 3.547265625, + "loss/jsd": 0.0, + "loss/logits": 0.21347164940088986, + "step": 17360 + }, + { + "epoch": 0.579, + "grad_norm": 32.75, + "grad_norm_var": 10.703059895833333, + "learning_rate": 0.0001, + "loss": 7.8541, + "loss/crossentropy": 2.146274469792843, + "loss/hidden": 3.580859375, + "loss/jsd": 0.0, + "loss/logits": 0.24069792926311492, + "step": 17370 + }, + { + "epoch": 0.5793333333333334, + "grad_norm": 33.0, + "grad_norm_var": 3.1455729166666666, + "learning_rate": 0.0001, + "loss": 7.8856, + "loss/crossentropy": 2.0081627793610095, + "loss/hidden": 3.589453125, + "loss/jsd": 0.0, + "loss/logits": 0.20462132934480906, + "step": 17380 + }, + { + "epoch": 0.5796666666666667, + "grad_norm": 30.625, + "grad_norm_var": 4.042708333333334, + "learning_rate": 0.0001, + "loss": 7.8438, + "loss/crossentropy": 2.133964368700981, + "loss/hidden": 3.604296875, + "loss/jsd": 0.0, + "loss/logits": 0.2076242446899414, + "step": 17390 + }, + { + "epoch": 0.58, + "grad_norm": 33.25, + "grad_norm_var": 1.9681640625, + "learning_rate": 0.0001, + "loss": 7.883, + "loss/crossentropy": 1.9912319853901863, + "loss/hidden": 3.671875, + "loss/jsd": 0.0, + "loss/logits": 0.22393401358276604, + "step": 17400 + }, + { + "epoch": 0.5803333333333334, + "grad_norm": 28.625, + "grad_norm_var": 4.534309895833333, + "learning_rate": 0.0001, + "loss": 7.8925, + "loss/crossentropy": 2.1712723806500436, + "loss/hidden": 3.639453125, + "loss/jsd": 0.0, + "loss/logits": 0.2111651472747326, + "step": 17410 + }, + { + "epoch": 0.5806666666666667, + "grad_norm": 31.125, + "grad_norm_var": 2.540311639717402e+18, + "learning_rate": 0.0001, + "loss": 7.8283, + "loss/crossentropy": 1.96769128292799, + "loss/hidden": 3.621484375, + "loss/jsd": 0.0, + "loss/logits": 0.21129720862954854, + "step": 17420 + }, + { + "epoch": 0.581, + "grad_norm": 34.0, + "grad_norm_var": 6.0900390625, + "learning_rate": 0.0001, + "loss": 7.9127, + "loss/crossentropy": 2.0979452088475226, + "loss/hidden": 3.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.23280210494995118, + "step": 17430 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 31.125, + "grad_norm_var": 3.7671223958333333, + "learning_rate": 0.0001, + "loss": 7.9495, + "loss/crossentropy": 2.3015418693423273, + "loss/hidden": 3.496875, + "loss/jsd": 0.0, + "loss/logits": 0.2063061658293009, + "step": 17440 + }, + { + "epoch": 0.5816666666666667, + "grad_norm": 31.125, + "grad_norm_var": 8.673958333333333, + "learning_rate": 0.0001, + "loss": 7.977, + "loss/crossentropy": 2.1084218993782997, + "loss/hidden": 3.688671875, + "loss/jsd": 0.0, + "loss/logits": 0.2300797041505575, + "step": 17450 + }, + { + "epoch": 0.582, + "grad_norm": 29.875, + "grad_norm_var": 18.838997395833335, + "learning_rate": 0.0001, + "loss": 7.8748, + "loss/crossentropy": 2.125347241014242, + "loss/hidden": 3.650390625, + "loss/jsd": 0.0, + "loss/logits": 0.21882319189608096, + "step": 17460 + }, + { + "epoch": 0.5823333333333334, + "grad_norm": 31.625, + "grad_norm_var": 13.420247395833334, + "learning_rate": 0.0001, + "loss": 7.7885, + "loss/crossentropy": 1.9236932694911957, + "loss/hidden": 3.629296875, + "loss/jsd": 0.0, + "loss/logits": 0.20307795237749815, + "step": 17470 + }, + { + "epoch": 0.5826666666666667, + "grad_norm": 31.625, + "grad_norm_var": 6.923372395833334, + "learning_rate": 0.0001, + "loss": 7.9531, + "loss/crossentropy": 2.0547634214162827, + "loss/hidden": 3.689453125, + "loss/jsd": 0.0, + "loss/logits": 0.20383700206875802, + "step": 17480 + }, + { + "epoch": 0.583, + "grad_norm": 30.625, + "grad_norm_var": 8.875, + "learning_rate": 0.0001, + "loss": 7.8942, + "loss/crossentropy": 2.1729307577013968, + "loss/hidden": 3.663671875, + "loss/jsd": 0.0, + "loss/logits": 0.22479334622621536, + "step": 17490 + }, + { + "epoch": 0.5833333333333334, + "grad_norm": 30.25, + "grad_norm_var": 5.0056640625, + "learning_rate": 0.0001, + "loss": 7.9187, + "loss/crossentropy": 2.1915037110447884, + "loss/hidden": 3.710546875, + "loss/jsd": 0.0, + "loss/logits": 0.22519682794809343, + "step": 17500 + }, + { + "epoch": 0.5836666666666667, + "grad_norm": 35.0, + "grad_norm_var": 12.376822916666667, + "learning_rate": 0.0001, + "loss": 7.8727, + "loss/crossentropy": 2.2274638898670673, + "loss/hidden": 3.530078125, + "loss/jsd": 0.0, + "loss/logits": 0.20867663882672788, + "step": 17510 + }, + { + "epoch": 0.584, + "grad_norm": 31.375, + "grad_norm_var": 12.6072265625, + "learning_rate": 0.0001, + "loss": 7.8598, + "loss/crossentropy": 1.9983490526676178, + "loss/hidden": 3.81796875, + "loss/jsd": 0.0, + "loss/logits": 0.23317093290388585, + "step": 17520 + }, + { + "epoch": 0.5843333333333334, + "grad_norm": 28.625, + "grad_norm_var": 5.734309895833333, + "learning_rate": 0.0001, + "loss": 7.8956, + "loss/crossentropy": 2.036000092327595, + "loss/hidden": 3.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.2117432462051511, + "step": 17530 + }, + { + "epoch": 0.5846666666666667, + "grad_norm": 29.5, + "grad_norm_var": 3.980208333333333, + "learning_rate": 0.0001, + "loss": 7.8556, + "loss/crossentropy": 2.1683146730065346, + "loss/hidden": 3.63984375, + "loss/jsd": 0.0, + "loss/logits": 0.21113013792783022, + "step": 17540 + }, + { + "epoch": 0.585, + "grad_norm": 36.5, + "grad_norm_var": 65.62473958333334, + "learning_rate": 0.0001, + "loss": 7.8502, + "loss/crossentropy": 1.9271365851163864, + "loss/hidden": 3.673828125, + "loss/jsd": 0.0, + "loss/logits": 0.21881148554384708, + "step": 17550 + }, + { + "epoch": 0.5853333333333334, + "grad_norm": 30.125, + "grad_norm_var": 64.79264322916667, + "learning_rate": 0.0001, + "loss": 7.853, + "loss/crossentropy": 2.1277351498603823, + "loss/hidden": 3.63046875, + "loss/jsd": 0.0, + "loss/logits": 0.21536952927708625, + "step": 17560 + }, + { + "epoch": 0.5856666666666667, + "grad_norm": 34.25, + "grad_norm_var": 2.48125, + "learning_rate": 0.0001, + "loss": 7.8684, + "loss/crossentropy": 2.2192533940076826, + "loss/hidden": 3.625390625, + "loss/jsd": 0.0, + "loss/logits": 0.24672232531011104, + "step": 17570 + }, + { + "epoch": 0.586, + "grad_norm": 30.5, + "grad_norm_var": 3.376822916666667, + "learning_rate": 0.0001, + "loss": 7.825, + "loss/crossentropy": 1.9819466196000577, + "loss/hidden": 3.597265625, + "loss/jsd": 0.0, + "loss/logits": 0.2152025356888771, + "step": 17580 + }, + { + "epoch": 0.5863333333333334, + "grad_norm": 32.0, + "grad_norm_var": 3.51015625, + "learning_rate": 0.0001, + "loss": 7.7497, + "loss/crossentropy": 2.1631928592920304, + "loss/hidden": 3.626171875, + "loss/jsd": 0.0, + "loss/logits": 0.21683248728513718, + "step": 17590 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 29.75, + "grad_norm_var": 5.7322265625, + "learning_rate": 0.0001, + "loss": 7.7934, + "loss/crossentropy": 2.0797213554382323, + "loss/hidden": 3.641015625, + "loss/jsd": 0.0, + "loss/logits": 0.21361089181154966, + "step": 17600 + }, + { + "epoch": 0.587, + "grad_norm": 30.75, + "grad_norm_var": 4.33125, + "learning_rate": 0.0001, + "loss": 7.9096, + "loss/crossentropy": 2.1399587824940682, + "loss/hidden": 3.652734375, + "loss/jsd": 0.0, + "loss/logits": 0.22931497786194086, + "step": 17610 + }, + { + "epoch": 0.5873333333333334, + "grad_norm": 32.5, + "grad_norm_var": 2.1080729166666665, + "learning_rate": 0.0001, + "loss": 8.0199, + "loss/crossentropy": 2.082290044426918, + "loss/hidden": 3.6421875, + "loss/jsd": 0.0, + "loss/logits": 0.21377197336405515, + "step": 17620 + }, + { + "epoch": 0.5876666666666667, + "grad_norm": 29.0, + "grad_norm_var": 3.7447916666666665, + "learning_rate": 0.0001, + "loss": 7.8002, + "loss/crossentropy": 1.9901188783347608, + "loss/hidden": 3.667578125, + "loss/jsd": 0.0, + "loss/logits": 0.22527266927063466, + "step": 17630 + }, + { + "epoch": 0.588, + "grad_norm": 51.0, + "grad_norm_var": 32.34557291666667, + "learning_rate": 0.0001, + "loss": 7.9691, + "loss/crossentropy": 2.2055923312902452, + "loss/hidden": 3.60859375, + "loss/jsd": 0.0, + "loss/logits": 0.2215042721480131, + "step": 17640 + }, + { + "epoch": 0.5883333333333334, + "grad_norm": 33.0, + "grad_norm_var": 27.630143229166666, + "learning_rate": 0.0001, + "loss": 7.8568, + "loss/crossentropy": 1.9692361816763877, + "loss/hidden": 3.622265625, + "loss/jsd": 0.0, + "loss/logits": 0.20123363118618726, + "step": 17650 + }, + { + "epoch": 0.5886666666666667, + "grad_norm": 31.25, + "grad_norm_var": 2.424934895833333, + "learning_rate": 0.0001, + "loss": 7.9636, + "loss/crossentropy": 2.0752515137195586, + "loss/hidden": 3.627734375, + "loss/jsd": 0.0, + "loss/logits": 0.21257804036140443, + "step": 17660 + }, + { + "epoch": 0.589, + "grad_norm": 31.375, + "grad_norm_var": 3.3559895833333333, + "learning_rate": 0.0001, + "loss": 7.8761, + "loss/crossentropy": 2.0425384148955343, + "loss/hidden": 3.70078125, + "loss/jsd": 0.0, + "loss/logits": 0.21111119659617544, + "step": 17670 + }, + { + "epoch": 0.5893333333333334, + "grad_norm": 5301600256.0, + "grad_norm_var": 1.7566853087264507e+18, + "learning_rate": 0.0001, + "loss": 8.0848, + "loss/crossentropy": 2.2280178025364874, + "loss/hidden": 3.89609375, + "loss/jsd": 0.0, + "loss/logits": 0.29019313380122186, + "step": 17680 + }, + { + "epoch": 0.5896666666666667, + "grad_norm": 30.75, + "grad_norm_var": 1.756685307947778e+18, + "learning_rate": 0.0001, + "loss": 7.9978, + "loss/crossentropy": 2.1417148754000666, + "loss/hidden": 3.7203125, + "loss/jsd": 0.0, + "loss/logits": 0.2305316347628832, + "step": 17690 + }, + { + "epoch": 0.59, + "grad_norm": 29.0, + "grad_norm_var": 1.6520182291666667, + "learning_rate": 0.0001, + "loss": 7.732, + "loss/crossentropy": 1.9635935053229332, + "loss/hidden": 3.571875, + "loss/jsd": 0.0, + "loss/logits": 0.19552220217883587, + "step": 17700 + }, + { + "epoch": 0.5903333333333334, + "grad_norm": 31.125, + "grad_norm_var": 4.158072916666667, + "learning_rate": 0.0001, + "loss": 8.101, + "loss/crossentropy": 1.9116744890809059, + "loss/hidden": 3.78671875, + "loss/jsd": 0.0, + "loss/logits": 0.21419992987066508, + "step": 17710 + }, + { + "epoch": 0.5906666666666667, + "grad_norm": 32.25, + "grad_norm_var": 2.2223307291666665, + "learning_rate": 0.0001, + "loss": 7.75, + "loss/crossentropy": 2.117913420498371, + "loss/hidden": 3.62421875, + "loss/jsd": 0.0, + "loss/logits": 0.20440428592264653, + "step": 17720 + }, + { + "epoch": 0.591, + "grad_norm": 31.375, + "grad_norm_var": 4.03125, + "learning_rate": 0.0001, + "loss": 7.9153, + "loss/crossentropy": 2.0854917369782924, + "loss/hidden": 3.64609375, + "loss/jsd": 0.0, + "loss/logits": 0.21392401214689016, + "step": 17730 + }, + { + "epoch": 0.5913333333333334, + "grad_norm": 29.5, + "grad_norm_var": 3.948958333333333, + "learning_rate": 0.0001, + "loss": 7.7836, + "loss/crossentropy": 1.9979670539498329, + "loss/hidden": 3.59453125, + "loss/jsd": 0.0, + "loss/logits": 0.20232196077704429, + "step": 17740 + }, + { + "epoch": 0.5916666666666667, + "grad_norm": 28.0, + "grad_norm_var": 2.9452473958333334, + "learning_rate": 0.0001, + "loss": 7.7919, + "loss/crossentropy": 2.124793681502342, + "loss/hidden": 3.6234375, + "loss/jsd": 0.0, + "loss/logits": 0.2240230105817318, + "step": 17750 + }, + { + "epoch": 0.592, + "grad_norm": 30.5, + "grad_norm_var": 1.1768229166666666, + "learning_rate": 0.0001, + "loss": 7.8956, + "loss/crossentropy": 2.3084448873996735, + "loss/hidden": 3.575, + "loss/jsd": 0.0, + "loss/logits": 0.21302221789956094, + "step": 17760 + }, + { + "epoch": 0.5923333333333334, + "grad_norm": 32.5, + "grad_norm_var": 4.325, + "learning_rate": 0.0001, + "loss": 7.7268, + "loss/crossentropy": 2.060881958901882, + "loss/hidden": 3.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.20148423872888088, + "step": 17770 + }, + { + "epoch": 0.5926666666666667, + "grad_norm": 31.875, + "grad_norm_var": 6.21640625, + "learning_rate": 0.0001, + "loss": 7.881, + "loss/crossentropy": 2.123748776316643, + "loss/hidden": 3.608984375, + "loss/jsd": 0.0, + "loss/logits": 0.22054104711860417, + "step": 17780 + }, + { + "epoch": 0.593, + "grad_norm": 59.0, + "grad_norm_var": 53.02649739583333, + "learning_rate": 0.0001, + "loss": 7.8065, + "loss/crossentropy": 2.0521097406744957, + "loss/hidden": 3.47421875, + "loss/jsd": 0.0, + "loss/logits": 0.18986649625003338, + "step": 17790 + }, + { + "epoch": 0.5933333333333334, + "grad_norm": 30.125, + "grad_norm_var": 50.04479166666667, + "learning_rate": 0.0001, + "loss": 7.9947, + "loss/crossentropy": 1.9684382773935796, + "loss/hidden": 3.701953125, + "loss/jsd": 0.0, + "loss/logits": 0.2199052505195141, + "step": 17800 + }, + { + "epoch": 0.5936666666666667, + "grad_norm": 31.25, + "grad_norm_var": 2.84140625, + "learning_rate": 0.0001, + "loss": 7.8549, + "loss/crossentropy": 2.2951119184494018, + "loss/hidden": 3.68203125, + "loss/jsd": 0.0, + "loss/logits": 0.23184156119823457, + "step": 17810 + }, + { + "epoch": 0.594, + "grad_norm": 29.5, + "grad_norm_var": 1.8166015625, + "learning_rate": 0.0001, + "loss": 7.9599, + "loss/crossentropy": 2.1213736176490783, + "loss/hidden": 3.627734375, + "loss/jsd": 0.0, + "loss/logits": 0.22608150485903025, + "step": 17820 + }, + { + "epoch": 0.5943333333333334, + "grad_norm": 29.75, + "grad_norm_var": 2.29140625, + "learning_rate": 0.0001, + "loss": 7.9982, + "loss/crossentropy": 2.08011159747839, + "loss/hidden": 3.68203125, + "loss/jsd": 0.0, + "loss/logits": 0.21864136941730977, + "step": 17830 + }, + { + "epoch": 0.5946666666666667, + "grad_norm": 30.375, + "grad_norm_var": 2.092643229166667, + "learning_rate": 0.0001, + "loss": 7.9028, + "loss/crossentropy": 2.084545207023621, + "loss/hidden": 3.675390625, + "loss/jsd": 0.0, + "loss/logits": 0.2244249342009425, + "step": 17840 + }, + { + "epoch": 0.595, + "grad_norm": 31.375, + "grad_norm_var": 2.0020182291666666, + "learning_rate": 0.0001, + "loss": 7.9467, + "loss/crossentropy": 1.9666382275521754, + "loss/hidden": 3.605078125, + "loss/jsd": 0.0, + "loss/logits": 0.21182227209210397, + "step": 17850 + }, + { + "epoch": 0.5953333333333334, + "grad_norm": 29.25, + "grad_norm_var": 1.9979166666666666, + "learning_rate": 0.0001, + "loss": 7.7981, + "loss/crossentropy": 2.0992372572422027, + "loss/hidden": 3.65078125, + "loss/jsd": 0.0, + "loss/logits": 0.20616454482078553, + "step": 17860 + }, + { + "epoch": 0.5956666666666667, + "grad_norm": 31.5, + "grad_norm_var": 3.7270833333333333, + "learning_rate": 0.0001, + "loss": 7.8429, + "loss/crossentropy": 2.115238733589649, + "loss/hidden": 3.659765625, + "loss/jsd": 0.0, + "loss/logits": 0.22112161125987767, + "step": 17870 + }, + { + "epoch": 0.596, + "grad_norm": 38.5, + "grad_norm_var": 49.32076822916667, + "learning_rate": 0.0001, + "loss": 7.9198, + "loss/crossentropy": 2.004359558224678, + "loss/hidden": 3.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.20139172691851853, + "step": 17880 + }, + { + "epoch": 0.5963333333333334, + "grad_norm": 62.75, + "grad_norm_var": 542.1577473958333, + "learning_rate": 0.0001, + "loss": 8.0141, + "loss/crossentropy": 2.0738891914486883, + "loss/hidden": 3.722265625, + "loss/jsd": 0.0, + "loss/logits": 0.27466350272297857, + "step": 17890 + }, + { + "epoch": 0.5966666666666667, + "grad_norm": 28.875, + "grad_norm_var": 123.88170572916667, + "learning_rate": 0.0001, + "loss": 7.773, + "loss/crossentropy": 2.1576938211917875, + "loss/hidden": 3.61015625, + "loss/jsd": 0.0, + "loss/logits": 0.21197507828474044, + "step": 17900 + }, + { + "epoch": 0.597, + "grad_norm": 29.875, + "grad_norm_var": 4.25625, + "learning_rate": 0.0001, + "loss": 7.7789, + "loss/crossentropy": 2.0792277440428735, + "loss/hidden": 3.653125, + "loss/jsd": 0.0, + "loss/logits": 0.21021516602486373, + "step": 17910 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 28.5, + "grad_norm_var": 4.112955729166667, + "learning_rate": 0.0001, + "loss": 7.791, + "loss/crossentropy": 2.0338504150509835, + "loss/hidden": 3.58515625, + "loss/jsd": 0.0, + "loss/logits": 0.21274630688130855, + "step": 17920 + }, + { + "epoch": 0.5976666666666667, + "grad_norm": 34.0, + "grad_norm_var": 4.971875, + "learning_rate": 0.0001, + "loss": 7.7827, + "loss/crossentropy": 1.9480732060968875, + "loss/hidden": 3.644921875, + "loss/jsd": 0.0, + "loss/logits": 0.1969135446473956, + "step": 17930 + }, + { + "epoch": 0.598, + "grad_norm": 33.25, + "grad_norm_var": 2.4208333333333334, + "learning_rate": 0.0001, + "loss": 7.7746, + "loss/crossentropy": 2.045719124376774, + "loss/hidden": 3.585546875, + "loss/jsd": 0.0, + "loss/logits": 0.2005929106846452, + "step": 17940 + }, + { + "epoch": 0.5983333333333334, + "grad_norm": 33.75, + "grad_norm_var": 7.9587890625, + "learning_rate": 0.0001, + "loss": 7.8875, + "loss/crossentropy": 2.07333282828331, + "loss/hidden": 3.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20832881294190883, + "step": 17950 + }, + { + "epoch": 0.5986666666666667, + "grad_norm": 33.75, + "grad_norm_var": 8.305989583333334, + "learning_rate": 0.0001, + "loss": 7.7515, + "loss/crossentropy": 2.0196738630533217, + "loss/hidden": 3.61015625, + "loss/jsd": 0.0, + "loss/logits": 0.21534995995461942, + "step": 17960 + }, + { + "epoch": 0.599, + "grad_norm": 29.125, + "grad_norm_var": 10.612434895833333, + "learning_rate": 0.0001, + "loss": 7.8834, + "loss/crossentropy": 2.0255559869110584, + "loss/hidden": 3.601171875, + "loss/jsd": 0.0, + "loss/logits": 0.19980104472488164, + "step": 17970 + }, + { + "epoch": 0.5993333333333334, + "grad_norm": 33.0, + "grad_norm_var": 5.305989583333333, + "learning_rate": 0.0001, + "loss": 7.8593, + "loss/crossentropy": 2.0782414257526396, + "loss/hidden": 3.631640625, + "loss/jsd": 0.0, + "loss/logits": 0.2156868301331997, + "step": 17980 + }, + { + "epoch": 0.5996666666666667, + "grad_norm": 29.375, + "grad_norm_var": 3.23125, + "learning_rate": 0.0001, + "loss": 7.8168, + "loss/crossentropy": 2.1622331708669664, + "loss/hidden": 3.577734375, + "loss/jsd": 0.0, + "loss/logits": 0.2100482653826475, + "step": 17990 + }, + { + "epoch": 0.6, + "grad_norm": 30.375, + "grad_norm_var": 5.889322916666667, + "learning_rate": 0.0001, + "loss": 7.9181, + "loss/crossentropy": 2.0450053200125695, + "loss/hidden": 3.779296875, + "loss/jsd": 0.0, + "loss/logits": 0.22763790674507617, + "step": 18000 + }, + { + "epoch": 0.6003333333333334, + "grad_norm": 30.625, + "grad_norm_var": 4.785416666666666, + "learning_rate": 0.0001, + "loss": 7.7476, + "loss/crossentropy": 2.1718004338443277, + "loss/hidden": 3.62421875, + "loss/jsd": 0.0, + "loss/logits": 0.21642480613663792, + "step": 18010 + }, + { + "epoch": 0.6006666666666667, + "grad_norm": 38.0, + "grad_norm_var": 5.748372395833333, + "learning_rate": 0.0001, + "loss": 8.012, + "loss/crossentropy": 2.207152932882309, + "loss/hidden": 3.66328125, + "loss/jsd": 0.0, + "loss/logits": 0.24362048767507077, + "step": 18020 + }, + { + "epoch": 0.601, + "grad_norm": 33.0, + "grad_norm_var": 8.595768229166667, + "learning_rate": 0.0001, + "loss": 7.9312, + "loss/crossentropy": 2.0700930416584016, + "loss/hidden": 3.679296875, + "loss/jsd": 0.0, + "loss/logits": 0.20959441419690847, + "step": 18030 + }, + { + "epoch": 0.6013333333333334, + "grad_norm": 36.75, + "grad_norm_var": 6.439322916666667, + "learning_rate": 0.0001, + "loss": 7.8615, + "loss/crossentropy": 1.945305197685957, + "loss/hidden": 3.768359375, + "loss/jsd": 0.0, + "loss/logits": 0.23298689387738705, + "step": 18040 + }, + { + "epoch": 0.6016666666666667, + "grad_norm": 33.5, + "grad_norm_var": 4.405208333333333, + "learning_rate": 0.0001, + "loss": 7.7177, + "loss/crossentropy": 2.1207897052168847, + "loss/hidden": 3.72421875, + "loss/jsd": 0.0, + "loss/logits": 0.21883321572095155, + "step": 18050 + }, + { + "epoch": 0.602, + "grad_norm": 30.125, + "grad_norm_var": 3.4468098958333333, + "learning_rate": 0.0001, + "loss": 7.8522, + "loss/crossentropy": 1.9687151461839676, + "loss/hidden": 3.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.21371175833046435, + "step": 18060 + }, + { + "epoch": 0.6023333333333334, + "grad_norm": 30.375, + "grad_norm_var": 3.4989583333333334, + "learning_rate": 0.0001, + "loss": 7.7965, + "loss/crossentropy": 2.0616400502622128, + "loss/hidden": 3.56953125, + "loss/jsd": 0.0, + "loss/logits": 0.19918912472203373, + "step": 18070 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 30.375, + "grad_norm_var": 2.894791666666667, + "learning_rate": 0.0001, + "loss": 7.8959, + "loss/crossentropy": 2.110419492423534, + "loss/hidden": 3.580859375, + "loss/jsd": 0.0, + "loss/logits": 0.21065182648599148, + "step": 18080 + }, + { + "epoch": 0.603, + "grad_norm": 37.75, + "grad_norm_var": 30.475455729166665, + "learning_rate": 0.0001, + "loss": 7.8809, + "loss/crossentropy": 2.0502669103443623, + "loss/hidden": 3.59609375, + "loss/jsd": 0.0, + "loss/logits": 0.2201010322198272, + "step": 18090 + }, + { + "epoch": 0.6033333333333334, + "grad_norm": 31.75, + "grad_norm_var": 7.795572916666667, + "learning_rate": 0.0001, + "loss": 7.8007, + "loss/crossentropy": 1.998288343846798, + "loss/hidden": 3.690234375, + "loss/jsd": 0.0, + "loss/logits": 0.20705017000436782, + "step": 18100 + }, + { + "epoch": 0.6036666666666667, + "grad_norm": 29.125, + "grad_norm_var": 4.1125, + "learning_rate": 0.0001, + "loss": 7.919, + "loss/crossentropy": 2.1517566978931426, + "loss/hidden": 3.579296875, + "loss/jsd": 0.0, + "loss/logits": 0.22426084131002427, + "step": 18110 + }, + { + "epoch": 0.604, + "grad_norm": 30.0, + "grad_norm_var": 4.998893229166667, + "learning_rate": 0.0001, + "loss": 7.7191, + "loss/crossentropy": 2.1819751486182213, + "loss/hidden": 3.59296875, + "loss/jsd": 0.0, + "loss/logits": 0.2131780631840229, + "step": 18120 + }, + { + "epoch": 0.6043333333333333, + "grad_norm": 28.125, + "grad_norm_var": 3.2264973958333334, + "learning_rate": 0.0001, + "loss": 7.8286, + "loss/crossentropy": 2.0302142813801765, + "loss/hidden": 3.49140625, + "loss/jsd": 0.0, + "loss/logits": 0.21191238649189473, + "step": 18130 + }, + { + "epoch": 0.6046666666666667, + "grad_norm": 27.75, + "grad_norm_var": 4.076041666666667, + "learning_rate": 0.0001, + "loss": 7.8356, + "loss/crossentropy": 2.1582538709044456, + "loss/hidden": 3.681640625, + "loss/jsd": 0.0, + "loss/logits": 0.22549642771482467, + "step": 18140 + }, + { + "epoch": 0.605, + "grad_norm": 32.25, + "grad_norm_var": 3.0893229166666667, + "learning_rate": 0.0001, + "loss": 7.9474, + "loss/crossentropy": 1.9815901264548301, + "loss/hidden": 3.696875, + "loss/jsd": 0.0, + "loss/logits": 0.20591741409152747, + "step": 18150 + }, + { + "epoch": 0.6053333333333333, + "grad_norm": 29.75, + "grad_norm_var": 5.76015625, + "learning_rate": 0.0001, + "loss": 7.8622, + "loss/crossentropy": 2.011575572192669, + "loss/hidden": 3.656640625, + "loss/jsd": 0.0, + "loss/logits": 0.21709777507930994, + "step": 18160 + }, + { + "epoch": 0.6056666666666667, + "grad_norm": 31.125, + "grad_norm_var": 1.6330729166666667, + "learning_rate": 0.0001, + "loss": 7.8002, + "loss/crossentropy": 1.9985271662473678, + "loss/hidden": 3.662890625, + "loss/jsd": 0.0, + "loss/logits": 0.2164109718054533, + "step": 18170 + }, + { + "epoch": 0.606, + "grad_norm": 28.875, + "grad_norm_var": 2.1587890625, + "learning_rate": 0.0001, + "loss": 7.8748, + "loss/crossentropy": 1.9468450605869294, + "loss/hidden": 3.795703125, + "loss/jsd": 0.0, + "loss/logits": 0.22511965408921242, + "step": 18180 + }, + { + "epoch": 0.6063333333333333, + "grad_norm": 31.875, + "grad_norm_var": 8.801041666666666, + "learning_rate": 0.0001, + "loss": 7.7904, + "loss/crossentropy": 1.8904692113399506, + "loss/hidden": 3.530078125, + "loss/jsd": 0.0, + "loss/logits": 0.19247866850346326, + "step": 18190 + }, + { + "epoch": 0.6066666666666667, + "grad_norm": 30.125, + "grad_norm_var": 1.32265625, + "learning_rate": 0.0001, + "loss": 7.9876, + "loss/crossentropy": 2.1818495839834213, + "loss/hidden": 3.655859375, + "loss/jsd": 0.0, + "loss/logits": 0.2132304223254323, + "step": 18200 + }, + { + "epoch": 0.607, + "grad_norm": 29.125, + "grad_norm_var": 1.7712890625, + "learning_rate": 0.0001, + "loss": 7.7898, + "loss/crossentropy": 2.0314568877220154, + "loss/hidden": 3.555859375, + "loss/jsd": 0.0, + "loss/logits": 0.20160603299736976, + "step": 18210 + }, + { + "epoch": 0.6073333333333333, + "grad_norm": 29.0, + "grad_norm_var": 6.9947265625, + "learning_rate": 0.0001, + "loss": 7.7958, + "loss/crossentropy": 2.127374881505966, + "loss/hidden": 3.6359375, + "loss/jsd": 0.0, + "loss/logits": 0.2240319259464741, + "step": 18220 + }, + { + "epoch": 0.6076666666666667, + "grad_norm": 30.375, + "grad_norm_var": 7.166080729166667, + "learning_rate": 0.0001, + "loss": 7.8324, + "loss/crossentropy": 2.0251319468021394, + "loss/hidden": 3.702734375, + "loss/jsd": 0.0, + "loss/logits": 0.23137976359575987, + "step": 18230 + }, + { + "epoch": 0.608, + "grad_norm": 34.75, + "grad_norm_var": 3.998893229166667, + "learning_rate": 0.0001, + "loss": 7.9401, + "loss/crossentropy": 2.006920612603426, + "loss/hidden": 3.603125, + "loss/jsd": 0.0, + "loss/logits": 0.20304168649017812, + "step": 18240 + }, + { + "epoch": 0.6083333333333333, + "grad_norm": 32.5, + "grad_norm_var": 2.3916666666666666, + "learning_rate": 0.0001, + "loss": 7.7267, + "loss/crossentropy": 2.189460189640522, + "loss/hidden": 3.564453125, + "loss/jsd": 0.0, + "loss/logits": 0.20912937633693218, + "step": 18250 + }, + { + "epoch": 0.6086666666666667, + "grad_norm": 28.875, + "grad_norm_var": 2.653059895833333, + "learning_rate": 0.0001, + "loss": 7.8839, + "loss/crossentropy": 2.109463243186474, + "loss/hidden": 3.65, + "loss/jsd": 0.0, + "loss/logits": 0.22287122681736946, + "step": 18260 + }, + { + "epoch": 0.609, + "grad_norm": 29.625, + "grad_norm_var": 5.537434895833333, + "learning_rate": 0.0001, + "loss": 7.9143, + "loss/crossentropy": 1.9888273879885674, + "loss/hidden": 3.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19882734641432762, + "step": 18270 + }, + { + "epoch": 0.6093333333333333, + "grad_norm": 29.875, + "grad_norm_var": 6.3697265625, + "learning_rate": 0.0001, + "loss": 7.8695, + "loss/crossentropy": 2.1051917299628258, + "loss/hidden": 3.73203125, + "loss/jsd": 0.0, + "loss/logits": 0.22681960612535476, + "step": 18280 + }, + { + "epoch": 0.6096666666666667, + "grad_norm": 30.125, + "grad_norm_var": 4.312239583333334, + "learning_rate": 0.0001, + "loss": 7.7899, + "loss/crossentropy": 1.9936149284243583, + "loss/hidden": 3.71953125, + "loss/jsd": 0.0, + "loss/logits": 0.22636721413582564, + "step": 18290 + }, + { + "epoch": 0.61, + "grad_norm": 32.75, + "grad_norm_var": 3.8655598958333335, + "learning_rate": 0.0001, + "loss": 7.9113, + "loss/crossentropy": 2.1669924929738045, + "loss/hidden": 3.64375, + "loss/jsd": 0.0, + "loss/logits": 0.21989998165518046, + "step": 18300 + }, + { + "epoch": 0.6103333333333333, + "grad_norm": 33.25, + "grad_norm_var": 4.784830729166667, + "learning_rate": 0.0001, + "loss": 7.8107, + "loss/crossentropy": 2.1663272455334663, + "loss/hidden": 3.603125, + "loss/jsd": 0.0, + "loss/logits": 0.22850660514086485, + "step": 18310 + }, + { + "epoch": 0.6106666666666667, + "grad_norm": 29.5, + "grad_norm_var": 15.930989583333334, + "learning_rate": 0.0001, + "loss": 7.921, + "loss/crossentropy": 2.106405158340931, + "loss/hidden": 3.580859375, + "loss/jsd": 0.0, + "loss/logits": 0.22974986005574466, + "step": 18320 + }, + { + "epoch": 0.611, + "grad_norm": 31.25, + "grad_norm_var": 18.299739583333334, + "learning_rate": 0.0001, + "loss": 7.8336, + "loss/crossentropy": 2.130492168664932, + "loss/hidden": 3.606640625, + "loss/jsd": 0.0, + "loss/logits": 0.2170414287596941, + "step": 18330 + }, + { + "epoch": 0.6113333333333333, + "grad_norm": 29.875, + "grad_norm_var": 7.1650390625, + "learning_rate": 0.0001, + "loss": 7.8573, + "loss/crossentropy": 2.3027665317058563, + "loss/hidden": 3.6, + "loss/jsd": 0.0, + "loss/logits": 0.22886360697448255, + "step": 18340 + }, + { + "epoch": 0.6116666666666667, + "grad_norm": 31.875, + "grad_norm_var": 6.0697265625, + "learning_rate": 0.0001, + "loss": 7.7478, + "loss/crossentropy": 2.0985746681690216, + "loss/hidden": 3.57734375, + "loss/jsd": 0.0, + "loss/logits": 0.20908834878355265, + "step": 18350 + }, + { + "epoch": 0.612, + "grad_norm": 30.5, + "grad_norm_var": 6.792643229166667, + "learning_rate": 0.0001, + "loss": 7.8184, + "loss/crossentropy": 2.152451690286398, + "loss/hidden": 3.6, + "loss/jsd": 0.0, + "loss/logits": 0.21251746444031597, + "step": 18360 + }, + { + "epoch": 0.6123333333333333, + "grad_norm": 27.25, + "grad_norm_var": 2.3962890625, + "learning_rate": 0.0001, + "loss": 7.8537, + "loss/crossentropy": 2.0875338673591615, + "loss/hidden": 3.631640625, + "loss/jsd": 0.0, + "loss/logits": 0.23585428856313229, + "step": 18370 + }, + { + "epoch": 0.6126666666666667, + "grad_norm": 29.75, + "grad_norm_var": 4.625, + "learning_rate": 0.0001, + "loss": 7.9296, + "loss/crossentropy": 2.0466977350413798, + "loss/hidden": 3.60859375, + "loss/jsd": 0.0, + "loss/logits": 0.2116424733772874, + "step": 18380 + }, + { + "epoch": 0.613, + "grad_norm": 29.875, + "grad_norm_var": 2.3478515625, + "learning_rate": 0.0001, + "loss": 7.9076, + "loss/crossentropy": 2.0156113907694815, + "loss/hidden": 3.674609375, + "loss/jsd": 0.0, + "loss/logits": 0.21385385412722827, + "step": 18390 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 31.0, + "grad_norm_var": 3.7384765625, + "learning_rate": 0.0001, + "loss": 7.9007, + "loss/crossentropy": 2.1177249431610106, + "loss/hidden": 3.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.2171054555103183, + "step": 18400 + }, + { + "epoch": 0.6136666666666667, + "grad_norm": 30.875, + "grad_norm_var": 5.0056640625, + "learning_rate": 0.0001, + "loss": 7.8653, + "loss/crossentropy": 2.0109619170427324, + "loss/hidden": 3.734375, + "loss/jsd": 0.0, + "loss/logits": 0.24297788869589568, + "step": 18410 + }, + { + "epoch": 0.614, + "grad_norm": 29.875, + "grad_norm_var": 6.345247395833334, + "learning_rate": 0.0001, + "loss": 7.8787, + "loss/crossentropy": 2.008924402296543, + "loss/hidden": 3.815625, + "loss/jsd": 0.0, + "loss/logits": 0.23545185066759586, + "step": 18420 + }, + { + "epoch": 0.6143333333333333, + "grad_norm": 31.875, + "grad_norm_var": 1.9572265625, + "learning_rate": 0.0001, + "loss": 7.7924, + "loss/crossentropy": 2.1681634426116942, + "loss/hidden": 3.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.2179112009704113, + "step": 18430 + }, + { + "epoch": 0.6146666666666667, + "grad_norm": 29.5, + "grad_norm_var": 2.56015625, + "learning_rate": 0.0001, + "loss": 7.8609, + "loss/crossentropy": 2.100195789337158, + "loss/hidden": 3.582421875, + "loss/jsd": 0.0, + "loss/logits": 0.21292860489338636, + "step": 18440 + }, + { + "epoch": 0.615, + "grad_norm": 30.5, + "grad_norm_var": 1.2983723958333333, + "learning_rate": 0.0001, + "loss": 7.967, + "loss/crossentropy": 2.2022788748145103, + "loss/hidden": 3.609765625, + "loss/jsd": 0.0, + "loss/logits": 0.2205308698117733, + "step": 18450 + }, + { + "epoch": 0.6153333333333333, + "grad_norm": 39.25, + "grad_norm_var": 5.6375, + "learning_rate": 0.0001, + "loss": 7.914, + "loss/crossentropy": 2.1412271529436113, + "loss/hidden": 3.605859375, + "loss/jsd": 0.0, + "loss/logits": 0.21023106891661883, + "step": 18460 + }, + { + "epoch": 0.6156666666666667, + "grad_norm": 31.0, + "grad_norm_var": 3.313593764281145e+18, + "learning_rate": 0.0001, + "loss": 7.8255, + "loss/crossentropy": 2.116056500375271, + "loss/hidden": 3.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.22026809379458429, + "step": 18470 + }, + { + "epoch": 0.616, + "grad_norm": 34.5, + "grad_norm_var": 3.3135937641066967e+18, + "learning_rate": 0.0001, + "loss": 7.8251, + "loss/crossentropy": 2.1441701896488667, + "loss/hidden": 3.804296875, + "loss/jsd": 0.0, + "loss/logits": 0.23796399366110563, + "step": 18480 + }, + { + "epoch": 0.6163333333333333, + "grad_norm": 30.5, + "grad_norm_var": 3.0306640625, + "learning_rate": 0.0001, + "loss": 7.8743, + "loss/crossentropy": 2.05239050835371, + "loss/hidden": 3.64140625, + "loss/jsd": 0.0, + "loss/logits": 0.20417920276522636, + "step": 18490 + }, + { + "epoch": 0.6166666666666667, + "grad_norm": 29.75, + "grad_norm_var": 7.375, + "learning_rate": 0.0001, + "loss": 7.8488, + "loss/crossentropy": 1.98827982544899, + "loss/hidden": 3.65546875, + "loss/jsd": 0.0, + "loss/logits": 0.21382159925997257, + "step": 18500 + }, + { + "epoch": 0.617, + "grad_norm": 31.125, + "grad_norm_var": 9.1916015625, + "learning_rate": 0.0001, + "loss": 7.9309, + "loss/crossentropy": 2.108893929421902, + "loss/hidden": 3.49921875, + "loss/jsd": 0.0, + "loss/logits": 0.2015895338729024, + "step": 18510 + }, + { + "epoch": 0.6173333333333333, + "grad_norm": 30.375, + "grad_norm_var": 4.250455729166666, + "learning_rate": 0.0001, + "loss": 7.8394, + "loss/crossentropy": 2.2074019432067873, + "loss/hidden": 3.61875, + "loss/jsd": 0.0, + "loss/logits": 0.21143595930188894, + "step": 18520 + }, + { + "epoch": 0.6176666666666667, + "grad_norm": 30.5, + "grad_norm_var": 3.97265625, + "learning_rate": 0.0001, + "loss": 7.8296, + "loss/crossentropy": 2.0721921652555464, + "loss/hidden": 3.702734375, + "loss/jsd": 0.0, + "loss/logits": 0.23150747194886206, + "step": 18530 + }, + { + "epoch": 0.618, + "grad_norm": 32.5, + "grad_norm_var": 3.901497395833333, + "learning_rate": 0.0001, + "loss": 7.8179, + "loss/crossentropy": 2.12041699886322, + "loss/hidden": 3.6109375, + "loss/jsd": 0.0, + "loss/logits": 0.20933685936033725, + "step": 18540 + }, + { + "epoch": 0.6183333333333333, + "grad_norm": 32.25, + "grad_norm_var": 22.773893229166667, + "learning_rate": 0.0001, + "loss": 7.8798, + "loss/crossentropy": 2.073936428129673, + "loss/hidden": 3.55625, + "loss/jsd": 0.0, + "loss/logits": 0.21363019309937953, + "step": 18550 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 31.375, + "grad_norm_var": 21.6416015625, + "learning_rate": 0.0001, + "loss": 7.7396, + "loss/crossentropy": 1.969584984332323, + "loss/hidden": 3.607421875, + "loss/jsd": 0.0, + "loss/logits": 0.20250753909349442, + "step": 18560 + }, + { + "epoch": 0.619, + "grad_norm": 36.5, + "grad_norm_var": 3.5582682291666665, + "learning_rate": 0.0001, + "loss": 7.881, + "loss/crossentropy": 2.3890004098415374, + "loss/hidden": 3.58359375, + "loss/jsd": 0.0, + "loss/logits": 0.21717921365052462, + "step": 18570 + }, + { + "epoch": 0.6193333333333333, + "grad_norm": 31.875, + "grad_norm_var": 5.0962890625, + "learning_rate": 0.0001, + "loss": 7.8276, + "loss/crossentropy": 1.9156524941325188, + "loss/hidden": 3.602734375, + "loss/jsd": 0.0, + "loss/logits": 0.22258005812764167, + "step": 18580 + }, + { + "epoch": 0.6196666666666667, + "grad_norm": 34.25, + "grad_norm_var": 3.2916015625, + "learning_rate": 0.0001, + "loss": 7.8671, + "loss/crossentropy": 2.1052445240318773, + "loss/hidden": 3.541015625, + "loss/jsd": 0.0, + "loss/logits": 0.20677947774529457, + "step": 18590 + }, + { + "epoch": 0.62, + "grad_norm": 29.25, + "grad_norm_var": 4.991080729166667, + "learning_rate": 0.0001, + "loss": 7.8464, + "loss/crossentropy": 1.9568229861557485, + "loss/hidden": 3.6140625, + "loss/jsd": 0.0, + "loss/logits": 0.22466170443221928, + "step": 18600 + }, + { + "epoch": 0.6203333333333333, + "grad_norm": 29.875, + "grad_norm_var": 3.1832682291666665, + "learning_rate": 0.0001, + "loss": 7.8225, + "loss/crossentropy": 2.082220788300037, + "loss/hidden": 3.5328125, + "loss/jsd": 0.0, + "loss/logits": 0.20434877574443816, + "step": 18610 + }, + { + "epoch": 0.6206666666666667, + "grad_norm": 29.625, + "grad_norm_var": 11.320833333333333, + "learning_rate": 0.0001, + "loss": 7.9303, + "loss/crossentropy": 2.120234587043524, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.23275276124477387, + "step": 18620 + }, + { + "epoch": 0.621, + "grad_norm": 29.375, + "grad_norm_var": 2.448372395833333, + "learning_rate": 0.0001, + "loss": 7.8128, + "loss/crossentropy": 2.058797413110733, + "loss/hidden": 3.676171875, + "loss/jsd": 0.0, + "loss/logits": 0.2265950959175825, + "step": 18630 + }, + { + "epoch": 0.6213333333333333, + "grad_norm": 29.5, + "grad_norm_var": 9.9197265625, + "learning_rate": 0.0001, + "loss": 7.8673, + "loss/crossentropy": 1.9986446030437945, + "loss/hidden": 3.60078125, + "loss/jsd": 0.0, + "loss/logits": 0.19868890419602395, + "step": 18640 + }, + { + "epoch": 0.6216666666666667, + "grad_norm": 33.0, + "grad_norm_var": 2.082747395833333, + "learning_rate": 0.0001, + "loss": 7.9691, + "loss/crossentropy": 2.2711413890123366, + "loss/hidden": 3.621875, + "loss/jsd": 0.0, + "loss/logits": 0.22637809179723262, + "step": 18650 + }, + { + "epoch": 0.622, + "grad_norm": 29.0, + "grad_norm_var": 3.0853515625, + "learning_rate": 0.0001, + "loss": 7.9306, + "loss/crossentropy": 2.130834940075874, + "loss/hidden": 3.587109375, + "loss/jsd": 0.0, + "loss/logits": 0.21620727181434632, + "step": 18660 + }, + { + "epoch": 0.6223333333333333, + "grad_norm": 29.0, + "grad_norm_var": 4.4884765625, + "learning_rate": 0.0001, + "loss": 7.9184, + "loss/crossentropy": 2.132387759536505, + "loss/hidden": 3.719921875, + "loss/jsd": 0.0, + "loss/logits": 0.2213591465726495, + "step": 18670 + }, + { + "epoch": 0.6226666666666667, + "grad_norm": 37.5, + "grad_norm_var": 5.250455729166666, + "learning_rate": 0.0001, + "loss": 7.8612, + "loss/crossentropy": 2.172727197408676, + "loss/hidden": 3.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.21677472554147242, + "step": 18680 + }, + { + "epoch": 0.623, + "grad_norm": 32.75, + "grad_norm_var": 5.864583333333333, + "learning_rate": 0.0001, + "loss": 7.8139, + "loss/crossentropy": 2.1482373237609864, + "loss/hidden": 3.638671875, + "loss/jsd": 0.0, + "loss/logits": 0.22568456567823886, + "step": 18690 + }, + { + "epoch": 0.6233333333333333, + "grad_norm": 33.25, + "grad_norm_var": 9.995768229166666, + "learning_rate": 0.0001, + "loss": 7.667, + "loss/crossentropy": 2.1058941036462784, + "loss/hidden": 3.533203125, + "loss/jsd": 0.0, + "loss/logits": 0.2003629505634308, + "step": 18700 + }, + { + "epoch": 0.6236666666666667, + "grad_norm": 31.25, + "grad_norm_var": 3.0462890625, + "learning_rate": 0.0001, + "loss": 7.8472, + "loss/crossentropy": 2.0365977115929126, + "loss/hidden": 3.583984375, + "loss/jsd": 0.0, + "loss/logits": 0.21015565544366838, + "step": 18710 + }, + { + "epoch": 0.624, + "grad_norm": 31.125, + "grad_norm_var": 4.335416666666666, + "learning_rate": 0.0001, + "loss": 7.7206, + "loss/crossentropy": 2.099413389712572, + "loss/hidden": 3.494921875, + "loss/jsd": 0.0, + "loss/logits": 0.20410673916339875, + "step": 18720 + }, + { + "epoch": 0.6243333333333333, + "grad_norm": 27.875, + "grad_norm_var": 11.883072916666666, + "learning_rate": 0.0001, + "loss": 7.9741, + "loss/crossentropy": 2.0666474759578706, + "loss/hidden": 3.708984375, + "loss/jsd": 0.0, + "loss/logits": 0.21149700321257114, + "step": 18730 + }, + { + "epoch": 0.6246666666666667, + "grad_norm": 32.25, + "grad_norm_var": 8.562239583333334, + "learning_rate": 0.0001, + "loss": 7.8691, + "loss/crossentropy": 2.0636663090437652, + "loss/hidden": 3.5921875, + "loss/jsd": 0.0, + "loss/logits": 0.21715150568634273, + "step": 18740 + }, + { + "epoch": 0.625, + "grad_norm": 37.75, + "grad_norm_var": 8.626822916666667, + "learning_rate": 0.0001, + "loss": 7.7649, + "loss/crossentropy": 2.0067582026124002, + "loss/hidden": 3.5578125, + "loss/jsd": 0.0, + "loss/logits": 0.204201880376786, + "step": 18750 + }, + { + "epoch": 0.6253333333333333, + "grad_norm": 31.75, + "grad_norm_var": 9.3634765625, + "learning_rate": 0.0001, + "loss": 7.8405, + "loss/crossentropy": 2.0874268427491187, + "loss/hidden": 3.680859375, + "loss/jsd": 0.0, + "loss/logits": 0.2324770163744688, + "step": 18760 + }, + { + "epoch": 0.6256666666666667, + "grad_norm": 28.75, + "grad_norm_var": 2.8872395833333333, + "learning_rate": 0.0001, + "loss": 7.8715, + "loss/crossentropy": 1.892872792482376, + "loss/hidden": 3.663671875, + "loss/jsd": 0.0, + "loss/logits": 0.21375333461910487, + "step": 18770 + }, + { + "epoch": 0.626, + "grad_norm": 30.625, + "grad_norm_var": 3.6372395833333333, + "learning_rate": 0.0001, + "loss": 7.9126, + "loss/crossentropy": 2.181600275635719, + "loss/hidden": 3.6828125, + "loss/jsd": 0.0, + "loss/logits": 0.23229926731437445, + "step": 18780 + }, + { + "epoch": 0.6263333333333333, + "grad_norm": 31.375, + "grad_norm_var": 532.3809895833333, + "learning_rate": 0.0001, + "loss": 7.8422, + "loss/crossentropy": 2.1315030232071877, + "loss/hidden": 3.515234375, + "loss/jsd": 0.0, + "loss/logits": 0.20269421245902777, + "step": 18790 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 31.125, + "grad_norm_var": 24.878125, + "learning_rate": 0.0001, + "loss": 7.9348, + "loss/crossentropy": 2.173051218688488, + "loss/hidden": 3.5890625, + "loss/jsd": 0.0, + "loss/logits": 0.2082651512697339, + "step": 18800 + }, + { + "epoch": 0.627, + "grad_norm": 30.375, + "grad_norm_var": 3.22265625, + "learning_rate": 0.0001, + "loss": 7.9028, + "loss/crossentropy": 2.1568052619695663, + "loss/hidden": 3.58828125, + "loss/jsd": 0.0, + "loss/logits": 0.21820257026702167, + "step": 18810 + }, + { + "epoch": 0.6273333333333333, + "grad_norm": 31.25, + "grad_norm_var": 3.409309895833333, + "learning_rate": 0.0001, + "loss": 7.9375, + "loss/crossentropy": 1.8293808348476888, + "loss/hidden": 3.6078125, + "loss/jsd": 0.0, + "loss/logits": 0.19814650276675821, + "step": 18820 + }, + { + "epoch": 0.6276666666666667, + "grad_norm": 28.875, + "grad_norm_var": 21.13125, + "learning_rate": 0.0001, + "loss": 7.9111, + "loss/crossentropy": 2.19702318161726, + "loss/hidden": 3.63125, + "loss/jsd": 0.0, + "loss/logits": 0.21540935784578324, + "step": 18830 + }, + { + "epoch": 0.628, + "grad_norm": 32.25, + "grad_norm_var": 20.922916666666666, + "learning_rate": 0.0001, + "loss": 7.8251, + "loss/crossentropy": 2.1570638747885824, + "loss/hidden": 3.54609375, + "loss/jsd": 0.0, + "loss/logits": 0.20867985542863607, + "step": 18840 + }, + { + "epoch": 0.6283333333333333, + "grad_norm": 30.375, + "grad_norm_var": 4.72890625, + "learning_rate": 0.0001, + "loss": 7.8015, + "loss/crossentropy": 2.1077527910470963, + "loss/hidden": 3.674609375, + "loss/jsd": 0.0, + "loss/logits": 0.22238324768841267, + "step": 18850 + }, + { + "epoch": 0.6286666666666667, + "grad_norm": 32.75, + "grad_norm_var": 12.74765625, + "learning_rate": 0.0001, + "loss": 7.8461, + "loss/crossentropy": 2.0374200642108917, + "loss/hidden": 3.715234375, + "loss/jsd": 0.0, + "loss/logits": 0.21136217713356018, + "step": 18860 + }, + { + "epoch": 0.629, + "grad_norm": 30.625, + "grad_norm_var": 6.53515625, + "learning_rate": 0.0001, + "loss": 7.9069, + "loss/crossentropy": 2.0224805563688277, + "loss/hidden": 3.51015625, + "loss/jsd": 0.0, + "loss/logits": 0.2152063086628914, + "step": 18870 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 37.0, + "grad_norm_var": 10.820572916666666, + "learning_rate": 0.0001, + "loss": 7.9014, + "loss/crossentropy": 2.063296413421631, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.20050969813019037, + "step": 18880 + }, + { + "epoch": 0.6296666666666667, + "grad_norm": 31.625, + "grad_norm_var": 7.643684895833333, + "learning_rate": 0.0001, + "loss": 7.8825, + "loss/crossentropy": 2.2169769048690795, + "loss/hidden": 3.5921875, + "loss/jsd": 0.0, + "loss/logits": 0.21663882099092008, + "step": 18890 + }, + { + "epoch": 0.63, + "grad_norm": 29.5, + "grad_norm_var": 5.8775390625, + "learning_rate": 0.0001, + "loss": 7.8653, + "loss/crossentropy": 2.0760796412825586, + "loss/hidden": 3.532421875, + "loss/jsd": 0.0, + "loss/logits": 0.1870915897190571, + "step": 18900 + }, + { + "epoch": 0.6303333333333333, + "grad_norm": 29.75, + "grad_norm_var": 1.6059895833333333, + "learning_rate": 0.0001, + "loss": 7.9256, + "loss/crossentropy": 2.2268298670649527, + "loss/hidden": 3.6609375, + "loss/jsd": 0.0, + "loss/logits": 0.21423916518688202, + "step": 18910 + }, + { + "epoch": 0.6306666666666667, + "grad_norm": 33.75, + "grad_norm_var": 6.870572916666666, + "learning_rate": 0.0001, + "loss": 7.9247, + "loss/crossentropy": 2.192274183034897, + "loss/hidden": 3.730078125, + "loss/jsd": 0.0, + "loss/logits": 0.2382037065923214, + "step": 18920 + }, + { + "epoch": 0.631, + "grad_norm": 34.5, + "grad_norm_var": 4.92265625, + "learning_rate": 0.0001, + "loss": 7.9498, + "loss/crossentropy": 2.1766092889010906, + "loss/hidden": 3.615234375, + "loss/jsd": 0.0, + "loss/logits": 0.20994122456759215, + "step": 18930 + }, + { + "epoch": 0.6313333333333333, + "grad_norm": 31.25, + "grad_norm_var": 3.9643229166666667, + "learning_rate": 0.0001, + "loss": 7.8339, + "loss/crossentropy": 2.1794259466230868, + "loss/hidden": 3.58125, + "loss/jsd": 0.0, + "loss/logits": 0.21326899696141483, + "step": 18940 + }, + { + "epoch": 0.6316666666666667, + "grad_norm": 30.375, + "grad_norm_var": 4.389518229166667, + "learning_rate": 0.0001, + "loss": 7.8916, + "loss/crossentropy": 2.1451626420021057, + "loss/hidden": 3.74765625, + "loss/jsd": 0.0, + "loss/logits": 0.22022609002888202, + "step": 18950 + }, + { + "epoch": 0.632, + "grad_norm": 28.5, + "grad_norm_var": 5.9275390625, + "learning_rate": 0.0001, + "loss": 7.885, + "loss/crossentropy": 2.143310196697712, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.21481310818344354, + "step": 18960 + }, + { + "epoch": 0.6323333333333333, + "grad_norm": 33.5, + "grad_norm_var": 111.21041666666666, + "learning_rate": 0.0001, + "loss": 7.8308, + "loss/crossentropy": 2.0815651670098303, + "loss/hidden": 3.738671875, + "loss/jsd": 0.0, + "loss/logits": 0.2265876606106758, + "step": 18970 + }, + { + "epoch": 0.6326666666666667, + "grad_norm": 81.0, + "grad_norm_var": 249.68515625, + "learning_rate": 0.0001, + "loss": 7.8743, + "loss/crossentropy": 2.1671969212591646, + "loss/hidden": 3.51796875, + "loss/jsd": 0.0, + "loss/logits": 0.20081726741045713, + "step": 18980 + }, + { + "epoch": 0.633, + "grad_norm": 29.25, + "grad_norm_var": 167.84368489583332, + "learning_rate": 0.0001, + "loss": 7.7065, + "loss/crossentropy": 2.221551278233528, + "loss/hidden": 3.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.20150368921458722, + "step": 18990 + }, + { + "epoch": 0.6333333333333333, + "grad_norm": 31.75, + "grad_norm_var": 2.2393229166666666, + "learning_rate": 0.0001, + "loss": 7.8872, + "loss/crossentropy": 2.1742024421691895, + "loss/hidden": 3.70078125, + "loss/jsd": 0.0, + "loss/logits": 0.2297104850411415, + "step": 19000 + }, + { + "epoch": 0.6336666666666667, + "grad_norm": 30.125, + "grad_norm_var": 1.6155598958333333, + "learning_rate": 0.0001, + "loss": 7.8937, + "loss/crossentropy": 2.013886445760727, + "loss/hidden": 3.634765625, + "loss/jsd": 0.0, + "loss/logits": 0.2087485622614622, + "step": 19010 + }, + { + "epoch": 0.634, + "grad_norm": 29.25, + "grad_norm_var": 3.457747395833333, + "learning_rate": 0.0001, + "loss": 7.8267, + "loss/crossentropy": 2.125851184129715, + "loss/hidden": 3.680078125, + "loss/jsd": 0.0, + "loss/logits": 0.2237798146903515, + "step": 19020 + }, + { + "epoch": 0.6343333333333333, + "grad_norm": 30.625, + "grad_norm_var": 1.234375, + "learning_rate": 0.0001, + "loss": 7.8299, + "loss/crossentropy": 2.154933376610279, + "loss/hidden": 3.540625, + "loss/jsd": 0.0, + "loss/logits": 0.19935899265110493, + "step": 19030 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 28.875, + "grad_norm_var": 1.6129557291666667, + "learning_rate": 0.0001, + "loss": 7.8469, + "loss/crossentropy": 2.093220832943916, + "loss/hidden": 3.62734375, + "loss/jsd": 0.0, + "loss/logits": 0.2095857124775648, + "step": 19040 + }, + { + "epoch": 0.635, + "grad_norm": 32.5, + "grad_norm_var": 3.49765625, + "learning_rate": 0.0001, + "loss": 7.8074, + "loss/crossentropy": 2.1023634552955626, + "loss/hidden": 3.59140625, + "loss/jsd": 0.0, + "loss/logits": 0.1920078145340085, + "step": 19050 + }, + { + "epoch": 0.6353333333333333, + "grad_norm": 30.875, + "grad_norm_var": 2.9302083333333333, + "learning_rate": 0.0001, + "loss": 7.836, + "loss/crossentropy": 2.2064808174967765, + "loss/hidden": 3.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.20856849979609252, + "step": 19060 + }, + { + "epoch": 0.6356666666666667, + "grad_norm": 31.875, + "grad_norm_var": 14.134375, + "learning_rate": 0.0001, + "loss": 7.9529, + "loss/crossentropy": 2.022094827145338, + "loss/hidden": 3.588671875, + "loss/jsd": 0.0, + "loss/logits": 0.21211060788482428, + "step": 19070 + }, + { + "epoch": 0.636, + "grad_norm": 30.625, + "grad_norm_var": 7.253125, + "learning_rate": 0.0001, + "loss": 7.9353, + "loss/crossentropy": 2.1081229224801064, + "loss/hidden": 3.61875, + "loss/jsd": 0.0, + "loss/logits": 0.20318191722035409, + "step": 19080 + }, + { + "epoch": 0.6363333333333333, + "grad_norm": 31.0, + "grad_norm_var": 7.374934895833333, + "learning_rate": 0.0001, + "loss": 7.9194, + "loss/crossentropy": 2.179345028847456, + "loss/hidden": 3.671875, + "loss/jsd": 0.0, + "loss/logits": 0.2187561433762312, + "step": 19090 + }, + { + "epoch": 0.6366666666666667, + "grad_norm": 30.75, + "grad_norm_var": 1.6650390625, + "learning_rate": 0.0001, + "loss": 7.7918, + "loss/crossentropy": 2.073818951845169, + "loss/hidden": 3.637890625, + "loss/jsd": 0.0, + "loss/logits": 0.22620401345193386, + "step": 19100 + }, + { + "epoch": 0.637, + "grad_norm": 33.5, + "grad_norm_var": 13.125, + "learning_rate": 0.0001, + "loss": 7.8342, + "loss/crossentropy": 2.1005424194037916, + "loss/hidden": 3.555859375, + "loss/jsd": 0.0, + "loss/logits": 0.197703623957932, + "step": 19110 + }, + { + "epoch": 0.6373333333333333, + "grad_norm": 29.5, + "grad_norm_var": 13.416666666666666, + "learning_rate": 0.0001, + "loss": 7.712, + "loss/crossentropy": 1.9248533256351947, + "loss/hidden": 3.691015625, + "loss/jsd": 0.0, + "loss/logits": 0.22173443883657457, + "step": 19120 + }, + { + "epoch": 0.6376666666666667, + "grad_norm": 32.75, + "grad_norm_var": 7.163608467333465e+18, + "learning_rate": 0.0001, + "loss": 7.8861, + "loss/crossentropy": 2.133715681731701, + "loss/hidden": 3.56328125, + "loss/jsd": 0.0, + "loss/logits": 0.20394432581961155, + "step": 19130 + }, + { + "epoch": 0.638, + "grad_norm": 32.0, + "grad_norm_var": 3.9525419601292687e+18, + "learning_rate": 0.0001, + "loss": 7.9626, + "loss/crossentropy": 2.325319290161133, + "loss/hidden": 3.668359375, + "loss/jsd": 0.0, + "loss/logits": 0.23943714387714862, + "step": 19140 + }, + { + "epoch": 0.6383333333333333, + "grad_norm": 31.375, + "grad_norm_var": 1.5083333333333333, + "learning_rate": 0.0001, + "loss": 7.8766, + "loss/crossentropy": 2.0861736297607423, + "loss/hidden": 3.673046875, + "loss/jsd": 0.0, + "loss/logits": 0.2469935854896903, + "step": 19150 + }, + { + "epoch": 0.6386666666666667, + "grad_norm": 29.25, + "grad_norm_var": 4.2587890625, + "learning_rate": 0.0001, + "loss": 7.7649, + "loss/crossentropy": 2.025897032767534, + "loss/hidden": 3.508984375, + "loss/jsd": 0.0, + "loss/logits": 0.19602714721113443, + "step": 19160 + }, + { + "epoch": 0.639, + "grad_norm": 310.0, + "grad_norm_var": 4881.70390625, + "learning_rate": 0.0001, + "loss": 8.0063, + "loss/crossentropy": 2.2703626573085787, + "loss/hidden": 3.65078125, + "loss/jsd": 0.0, + "loss/logits": 0.22815561573952436, + "step": 19170 + }, + { + "epoch": 0.6393333333333333, + "grad_norm": 30.75, + "grad_norm_var": 4814.96875, + "learning_rate": 0.0001, + "loss": 7.9568, + "loss/crossentropy": 2.130793032050133, + "loss/hidden": 3.572265625, + "loss/jsd": 0.0, + "loss/logits": 0.21214349009096622, + "step": 19180 + }, + { + "epoch": 0.6396666666666667, + "grad_norm": 40.0, + "grad_norm_var": 18.782291666666666, + "learning_rate": 0.0001, + "loss": 7.8414, + "loss/crossentropy": 1.9671615742146968, + "loss/hidden": 3.69296875, + "loss/jsd": 0.0, + "loss/logits": 0.21655476819723846, + "step": 19190 + }, + { + "epoch": 0.64, + "grad_norm": 28.5, + "grad_norm_var": 7.598958333333333, + "learning_rate": 0.0001, + "loss": 7.6977, + "loss/crossentropy": 1.9905995845794677, + "loss/hidden": 3.551953125, + "loss/jsd": 0.0, + "loss/logits": 0.20198220126330851, + "step": 19200 + }, + { + "epoch": 0.6403333333333333, + "grad_norm": 29.875, + "grad_norm_var": 14.686393229166667, + "learning_rate": 0.0001, + "loss": 7.7702, + "loss/crossentropy": 2.1015154205262663, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1957567347213626, + "step": 19210 + }, + { + "epoch": 0.6406666666666667, + "grad_norm": 29.875, + "grad_norm_var": 13.4166015625, + "learning_rate": 0.0001, + "loss": 7.8531, + "loss/crossentropy": 2.102202596515417, + "loss/hidden": 3.497265625, + "loss/jsd": 0.0, + "loss/logits": 0.20049150586128234, + "step": 19220 + }, + { + "epoch": 0.641, + "grad_norm": 29.625, + "grad_norm_var": 4.143489583333333, + "learning_rate": 0.0001, + "loss": 7.7549, + "loss/crossentropy": 2.0605901539325715, + "loss/hidden": 3.601171875, + "loss/jsd": 0.0, + "loss/logits": 0.21827564649283887, + "step": 19230 + }, + { + "epoch": 0.6413333333333333, + "grad_norm": 28.875, + "grad_norm_var": 26.48515625, + "learning_rate": 0.0001, + "loss": 7.8123, + "loss/crossentropy": 2.0212767884135245, + "loss/hidden": 3.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.2223007800988853, + "step": 19240 + }, + { + "epoch": 0.6416666666666667, + "grad_norm": 31.375, + "grad_norm_var": 26.699739583333333, + "learning_rate": 0.0001, + "loss": 7.937, + "loss/crossentropy": 2.0694237641990183, + "loss/hidden": 3.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.20976187251508235, + "step": 19250 + }, + { + "epoch": 0.642, + "grad_norm": 29.625, + "grad_norm_var": 5.612239583333333, + "learning_rate": 0.0001, + "loss": 7.805, + "loss/crossentropy": 2.163316985964775, + "loss/hidden": 3.512109375, + "loss/jsd": 0.0, + "loss/logits": 0.20325905755162238, + "step": 19260 + }, + { + "epoch": 0.6423333333333333, + "grad_norm": 30.875, + "grad_norm_var": 4.861458333333333, + "learning_rate": 0.0001, + "loss": 7.7719, + "loss/crossentropy": 2.0881971672177313, + "loss/hidden": 3.61953125, + "loss/jsd": 0.0, + "loss/logits": 0.21606105621904134, + "step": 19270 + }, + { + "epoch": 0.6426666666666667, + "grad_norm": 29.0, + "grad_norm_var": 9.8556640625, + "learning_rate": 0.0001, + "loss": 7.8889, + "loss/crossentropy": 2.171047043800354, + "loss/hidden": 3.62421875, + "loss/jsd": 0.0, + "loss/logits": 0.22386016957461835, + "step": 19280 + }, + { + "epoch": 0.643, + "grad_norm": 30.5, + "grad_norm_var": 5.2375, + "learning_rate": 0.0001, + "loss": 7.7399, + "loss/crossentropy": 2.09329297542572, + "loss/hidden": 3.684765625, + "loss/jsd": 0.0, + "loss/logits": 0.218923170119524, + "step": 19290 + }, + { + "epoch": 0.6433333333333333, + "grad_norm": 27.25, + "grad_norm_var": 7.108072916666667, + "learning_rate": 0.0001, + "loss": 7.7758, + "loss/crossentropy": 2.0641403660178184, + "loss/hidden": 3.558984375, + "loss/jsd": 0.0, + "loss/logits": 0.20265798550099134, + "step": 19300 + }, + { + "epoch": 0.6436666666666667, + "grad_norm": 35.75, + "grad_norm_var": 4.120833333333334, + "learning_rate": 0.0001, + "loss": 7.8368, + "loss/crossentropy": 2.1241923332214356, + "loss/hidden": 3.4953125, + "loss/jsd": 0.0, + "loss/logits": 0.19217857848852873, + "step": 19310 + }, + { + "epoch": 0.644, + "grad_norm": 36.75, + "grad_norm_var": 27.8978515625, + "learning_rate": 0.0001, + "loss": 7.8321, + "loss/crossentropy": 2.0242124810814857, + "loss/hidden": 3.682421875, + "loss/jsd": 0.0, + "loss/logits": 0.19688315968960524, + "step": 19320 + }, + { + "epoch": 0.6443333333333333, + "grad_norm": 28.375, + "grad_norm_var": 7.702083333333333, + "learning_rate": 0.0001, + "loss": 7.7452, + "loss/crossentropy": 2.093150442838669, + "loss/hidden": 3.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1998631376773119, + "step": 19330 + }, + { + "epoch": 0.6446666666666667, + "grad_norm": 36.75, + "grad_norm_var": 9.519791666666666, + "learning_rate": 0.0001, + "loss": 7.8062, + "loss/crossentropy": 2.1420770615339277, + "loss/hidden": 3.5640625, + "loss/jsd": 0.0, + "loss/logits": 0.20922097396105527, + "step": 19340 + }, + { + "epoch": 0.645, + "grad_norm": 33.0, + "grad_norm_var": 9.233333333333333, + "learning_rate": 0.0001, + "loss": 7.8477, + "loss/crossentropy": 2.1200323194265365, + "loss/hidden": 3.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.23046185187995433, + "step": 19350 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 29.375, + "grad_norm_var": 6.540559895833334, + "learning_rate": 0.0001, + "loss": 7.7967, + "loss/crossentropy": 2.054234591126442, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.2208017086610198, + "step": 19360 + }, + { + "epoch": 0.6456666666666667, + "grad_norm": 30.25, + "grad_norm_var": 2.4139973958333334, + "learning_rate": 0.0001, + "loss": 7.8385, + "loss/crossentropy": 2.0128255039453506, + "loss/hidden": 3.786328125, + "loss/jsd": 0.0, + "loss/logits": 0.22819863110780716, + "step": 19370 + }, + { + "epoch": 0.646, + "grad_norm": 32.0, + "grad_norm_var": 1.6363932291666667, + "learning_rate": 0.0001, + "loss": 7.7009, + "loss/crossentropy": 2.024762587249279, + "loss/hidden": 3.61640625, + "loss/jsd": 0.0, + "loss/logits": 0.20730710867792368, + "step": 19380 + }, + { + "epoch": 0.6463333333333333, + "grad_norm": 36.0, + "grad_norm_var": 3.693489583333333, + "learning_rate": 0.0001, + "loss": 7.7849, + "loss/crossentropy": 1.9413291484117507, + "loss/hidden": 3.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.20006432849913836, + "step": 19390 + }, + { + "epoch": 0.6466666666666666, + "grad_norm": 28.0, + "grad_norm_var": 6.737239583333333, + "learning_rate": 0.0001, + "loss": 7.7068, + "loss/crossentropy": 2.219281970709562, + "loss/hidden": 3.59375, + "loss/jsd": 0.0, + "loss/logits": 0.2173599960282445, + "step": 19400 + }, + { + "epoch": 0.647, + "grad_norm": 29.5, + "grad_norm_var": 5.31640625, + "learning_rate": 0.0001, + "loss": 7.9205, + "loss/crossentropy": 2.0932188779115677, + "loss/hidden": 3.573046875, + "loss/jsd": 0.0, + "loss/logits": 0.2144004687666893, + "step": 19410 + }, + { + "epoch": 0.6473333333333333, + "grad_norm": 33.75, + "grad_norm_var": 2.167708333333333, + "learning_rate": 0.0001, + "loss": 7.9694, + "loss/crossentropy": 2.187512440979481, + "loss/hidden": 3.666015625, + "loss/jsd": 0.0, + "loss/logits": 0.21288955435156823, + "step": 19420 + }, + { + "epoch": 0.6476666666666666, + "grad_norm": 32.0, + "grad_norm_var": 19.3244140625, + "learning_rate": 0.0001, + "loss": 7.9391, + "loss/crossentropy": 2.156757637858391, + "loss/hidden": 3.61875, + "loss/jsd": 0.0, + "loss/logits": 0.21084595024585723, + "step": 19430 + }, + { + "epoch": 0.648, + "grad_norm": 29.875, + "grad_norm_var": 4.955143229166667, + "learning_rate": 0.0001, + "loss": 7.8517, + "loss/crossentropy": 2.218225100636482, + "loss/hidden": 3.56015625, + "loss/jsd": 0.0, + "loss/logits": 0.21003954205662012, + "step": 19440 + }, + { + "epoch": 0.6483333333333333, + "grad_norm": 32.25, + "grad_norm_var": 2.2905598958333333, + "learning_rate": 0.0001, + "loss": 7.8119, + "loss/crossentropy": 2.1156990200281145, + "loss/hidden": 3.6578125, + "loss/jsd": 0.0, + "loss/logits": 0.21486669424921273, + "step": 19450 + }, + { + "epoch": 0.6486666666666666, + "grad_norm": 30.875, + "grad_norm_var": 3.03515625, + "learning_rate": 0.0001, + "loss": 7.7407, + "loss/crossentropy": 2.0934729874134064, + "loss/hidden": 3.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.20924665350466967, + "step": 19460 + }, + { + "epoch": 0.649, + "grad_norm": 31.25, + "grad_norm_var": 1.3832682291666667, + "learning_rate": 0.0001, + "loss": 7.7772, + "loss/crossentropy": 2.167033377289772, + "loss/hidden": 3.596484375, + "loss/jsd": 0.0, + "loss/logits": 0.21081157084554433, + "step": 19470 + }, + { + "epoch": 0.6493333333333333, + "grad_norm": 30.25, + "grad_norm_var": 6.167643229166667, + "learning_rate": 0.0001, + "loss": 7.8026, + "loss/crossentropy": 2.0574812293052673, + "loss/hidden": 3.6078125, + "loss/jsd": 0.0, + "loss/logits": 0.21090691294521094, + "step": 19480 + }, + { + "epoch": 0.6496666666666666, + "grad_norm": 30.125, + "grad_norm_var": 131.05625, + "learning_rate": 0.0001, + "loss": 7.8775, + "loss/crossentropy": 2.037951024621725, + "loss/hidden": 3.607421875, + "loss/jsd": 0.0, + "loss/logits": 0.2193830787204206, + "step": 19490 + }, + { + "epoch": 0.65, + "grad_norm": 32.5, + "grad_norm_var": 1.7166666666666666, + "learning_rate": 0.0001, + "loss": 7.9099, + "loss/crossentropy": 2.1335637837648393, + "loss/hidden": 3.621484375, + "loss/jsd": 0.0, + "loss/logits": 0.22563568409532309, + "step": 19500 + }, + { + "epoch": 0.6503333333333333, + "grad_norm": 31.0, + "grad_norm_var": 7.759309895833334, + "learning_rate": 0.0001, + "loss": 7.6947, + "loss/crossentropy": 2.0860137730836867, + "loss/hidden": 3.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.20874691233038903, + "step": 19510 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 32.0, + "grad_norm_var": 20.720247395833333, + "learning_rate": 0.0001, + "loss": 7.9359, + "loss/crossentropy": 2.176954896748066, + "loss/hidden": 3.58359375, + "loss/jsd": 0.0, + "loss/logits": 0.21417912952601909, + "step": 19520 + }, + { + "epoch": 0.651, + "grad_norm": 29.25, + "grad_norm_var": 16.797916666666666, + "learning_rate": 0.0001, + "loss": 7.7905, + "loss/crossentropy": 2.112550212442875, + "loss/hidden": 3.535546875, + "loss/jsd": 0.0, + "loss/logits": 0.21057379432022572, + "step": 19530 + }, + { + "epoch": 0.6513333333333333, + "grad_norm": 35.25, + "grad_norm_var": 5.109375, + "learning_rate": 0.0001, + "loss": 7.7381, + "loss/crossentropy": 2.0441600404679776, + "loss/hidden": 3.759765625, + "loss/jsd": 0.0, + "loss/logits": 0.2118663378059864, + "step": 19540 + }, + { + "epoch": 0.6516666666666666, + "grad_norm": 29.25, + "grad_norm_var": 3.8622395833333334, + "learning_rate": 0.0001, + "loss": 7.8123, + "loss/crossentropy": 2.015673951804638, + "loss/hidden": 3.516015625, + "loss/jsd": 0.0, + "loss/logits": 0.2035064060240984, + "step": 19550 + }, + { + "epoch": 0.652, + "grad_norm": 30.5, + "grad_norm_var": 4.835416666666666, + "learning_rate": 0.0001, + "loss": 7.9088, + "loss/crossentropy": 2.1083447858691216, + "loss/hidden": 3.686328125, + "loss/jsd": 0.0, + "loss/logits": 0.22277338355779647, + "step": 19560 + }, + { + "epoch": 0.6523333333333333, + "grad_norm": 29.5, + "grad_norm_var": 7.986458333333333, + "learning_rate": 0.0001, + "loss": 7.7413, + "loss/crossentropy": 2.2140139706432818, + "loss/hidden": 3.536328125, + "loss/jsd": 0.0, + "loss/logits": 0.19835165999829768, + "step": 19570 + }, + { + "epoch": 0.6526666666666666, + "grad_norm": 46.25, + "grad_norm_var": 22.6853515625, + "learning_rate": 0.0001, + "loss": 7.8156, + "loss/crossentropy": 2.098857820034027, + "loss/hidden": 3.628125, + "loss/jsd": 0.0, + "loss/logits": 0.2128402628004551, + "step": 19580 + }, + { + "epoch": 0.653, + "grad_norm": 29.0, + "grad_norm_var": 3.4058471867640274e+18, + "learning_rate": 0.0001, + "loss": 7.941, + "loss/crossentropy": 2.161790570616722, + "loss/hidden": 3.558203125, + "loss/jsd": 0.0, + "loss/logits": 0.21066656708717346, + "step": 19590 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 30.5, + "grad_norm_var": 31.020572916666666, + "learning_rate": 0.0001, + "loss": 7.8532, + "loss/crossentropy": 2.1808153837919235, + "loss/hidden": 3.630859375, + "loss/jsd": 0.0, + "loss/logits": 0.21154428124427796, + "step": 19600 + }, + { + "epoch": 0.6536666666666666, + "grad_norm": 30.0, + "grad_norm_var": 2.294791666666667, + "learning_rate": 0.0001, + "loss": 7.7141, + "loss/crossentropy": 2.084941604733467, + "loss/hidden": 3.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.2059195751324296, + "step": 19610 + }, + { + "epoch": 0.654, + "grad_norm": 29.875, + "grad_norm_var": 2.474739583333333, + "learning_rate": 0.0001, + "loss": 7.7995, + "loss/crossentropy": 1.9402640502899886, + "loss/hidden": 3.602734375, + "loss/jsd": 0.0, + "loss/logits": 0.19613281125202775, + "step": 19620 + }, + { + "epoch": 0.6543333333333333, + "grad_norm": 30.125, + "grad_norm_var": 123.0462890625, + "learning_rate": 0.0001, + "loss": 7.8662, + "loss/crossentropy": 2.056548047810793, + "loss/hidden": 3.443359375, + "loss/jsd": 0.0, + "loss/logits": 0.19428260792046786, + "step": 19630 + }, + { + "epoch": 0.6546666666666666, + "grad_norm": 5469372416.0, + "grad_norm_var": 3.757034120500845e+18, + "learning_rate": 0.0001, + "loss": 7.8375, + "loss/crossentropy": 2.173163182288408, + "loss/hidden": 3.60703125, + "loss/jsd": 0.0, + "loss/logits": 0.21105932276695966, + "step": 19640 + }, + { + "epoch": 0.655, + "grad_norm": 30.125, + "grad_norm_var": 3.757034118776007e+18, + "learning_rate": 0.0001, + "loss": 7.7935, + "loss/crossentropy": 2.129123020917177, + "loss/hidden": 3.625390625, + "loss/jsd": 0.0, + "loss/logits": 0.21240208223462104, + "step": 19650 + }, + { + "epoch": 0.6553333333333333, + "grad_norm": 29.25, + "grad_norm_var": 4.189518229166667, + "learning_rate": 0.0001, + "loss": 7.6158, + "loss/crossentropy": 2.080010825395584, + "loss/hidden": 3.594921875, + "loss/jsd": 0.0, + "loss/logits": 0.20815913137048483, + "step": 19660 + }, + { + "epoch": 0.6556666666666666, + "grad_norm": 27.25, + "grad_norm_var": 6.65, + "learning_rate": 0.0001, + "loss": 7.6821, + "loss/crossentropy": 1.9834842666983605, + "loss/hidden": 3.469140625, + "loss/jsd": 0.0, + "loss/logits": 0.19239332657307387, + "step": 19670 + }, + { + "epoch": 0.656, + "grad_norm": 49.0, + "grad_norm_var": 26.429622395833334, + "learning_rate": 0.0001, + "loss": 7.7928, + "loss/crossentropy": 2.0769309490919112, + "loss/hidden": 3.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.2172299936413765, + "step": 19680 + }, + { + "epoch": 0.6563333333333333, + "grad_norm": 31.0, + "grad_norm_var": 24.833333333333332, + "learning_rate": 0.0001, + "loss": 7.7632, + "loss/crossentropy": 2.0378880076110364, + "loss/hidden": 3.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.21062599224969744, + "step": 19690 + }, + { + "epoch": 0.6566666666666666, + "grad_norm": 33.5, + "grad_norm_var": 3.2285807291666666, + "learning_rate": 0.0001, + "loss": 7.8179, + "loss/crossentropy": 2.0788474015891554, + "loss/hidden": 3.703515625, + "loss/jsd": 0.0, + "loss/logits": 0.20390022164210678, + "step": 19700 + }, + { + "epoch": 0.657, + "grad_norm": 32.0, + "grad_norm_var": 2.20625, + "learning_rate": 0.0001, + "loss": 7.796, + "loss/crossentropy": 1.9499784991145135, + "loss/hidden": 3.72890625, + "loss/jsd": 0.0, + "loss/logits": 0.21807905454188586, + "step": 19710 + }, + { + "epoch": 0.6573333333333333, + "grad_norm": 29.125, + "grad_norm_var": 2.9082682291666666, + "learning_rate": 0.0001, + "loss": 7.7345, + "loss/crossentropy": 2.05738410204649, + "loss/hidden": 3.614453125, + "loss/jsd": 0.0, + "loss/logits": 0.2033387843519449, + "step": 19720 + }, + { + "epoch": 0.6576666666666666, + "grad_norm": 31.75, + "grad_norm_var": 6.730989583333334, + "learning_rate": 0.0001, + "loss": 7.8437, + "loss/crossentropy": 2.076332356035709, + "loss/hidden": 3.60703125, + "loss/jsd": 0.0, + "loss/logits": 0.2070010544732213, + "step": 19730 + }, + { + "epoch": 0.658, + "grad_norm": 31.5, + "grad_norm_var": 1.5077473958333334, + "learning_rate": 0.0001, + "loss": 7.7077, + "loss/crossentropy": 2.165965069830418, + "loss/hidden": 3.56328125, + "loss/jsd": 0.0, + "loss/logits": 0.20955195166170598, + "step": 19740 + }, + { + "epoch": 0.6583333333333333, + "grad_norm": 29.875, + "grad_norm_var": 1.075, + "learning_rate": 0.0001, + "loss": 7.8811, + "loss/crossentropy": 2.1568801373243334, + "loss/hidden": 3.615625, + "loss/jsd": 0.0, + "loss/logits": 0.21716065630316733, + "step": 19750 + }, + { + "epoch": 0.6586666666666666, + "grad_norm": 28.625, + "grad_norm_var": 6.21015625, + "learning_rate": 0.0001, + "loss": 7.8508, + "loss/crossentropy": 2.0843460261821747, + "loss/hidden": 3.658984375, + "loss/jsd": 0.0, + "loss/logits": 0.23474157508462667, + "step": 19760 + }, + { + "epoch": 0.659, + "grad_norm": 29.75, + "grad_norm_var": 5.1494140625, + "learning_rate": 0.0001, + "loss": 7.792, + "loss/crossentropy": 2.084406663477421, + "loss/hidden": 3.51796875, + "loss/jsd": 0.0, + "loss/logits": 0.20483186282217503, + "step": 19770 + }, + { + "epoch": 0.6593333333333333, + "grad_norm": 29.25, + "grad_norm_var": 2.3197265625, + "learning_rate": 0.0001, + "loss": 7.7708, + "loss/crossentropy": 2.0922622852027417, + "loss/hidden": 3.5796875, + "loss/jsd": 0.0, + "loss/logits": 0.20298854364082217, + "step": 19780 + }, + { + "epoch": 0.6596666666666666, + "grad_norm": 31.375, + "grad_norm_var": 3.8041015625, + "learning_rate": 0.0001, + "loss": 7.9193, + "loss/crossentropy": 1.9548385262489318, + "loss/hidden": 3.68125, + "loss/jsd": 0.0, + "loss/logits": 0.2344514699652791, + "step": 19790 + }, + { + "epoch": 0.66, + "grad_norm": 31.875, + "grad_norm_var": 3.436393229166667, + "learning_rate": 0.0001, + "loss": 7.9176, + "loss/crossentropy": 2.099733465909958, + "loss/hidden": 3.55234375, + "loss/jsd": 0.0, + "loss/logits": 0.20575151350349188, + "step": 19800 + }, + { + "epoch": 0.6603333333333333, + "grad_norm": 29.125, + "grad_norm_var": 13.2375, + "learning_rate": 0.0001, + "loss": 7.7812, + "loss/crossentropy": 2.038871665298939, + "loss/hidden": 3.61484375, + "loss/jsd": 0.0, + "loss/logits": 0.2236450683325529, + "step": 19810 + }, + { + "epoch": 0.6606666666666666, + "grad_norm": 30.75, + "grad_norm_var": 13.39765625, + "learning_rate": 0.0001, + "loss": 7.7482, + "loss/crossentropy": 2.075606144964695, + "loss/hidden": 3.666796875, + "loss/jsd": 0.0, + "loss/logits": 0.20769685432314872, + "step": 19820 + }, + { + "epoch": 0.661, + "grad_norm": 37.0, + "grad_norm_var": 6.546875, + "learning_rate": 0.0001, + "loss": 7.7292, + "loss/crossentropy": 2.247103089094162, + "loss/hidden": 3.686328125, + "loss/jsd": 0.0, + "loss/logits": 0.2320896226912737, + "step": 19830 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 29.0, + "grad_norm_var": 18.645247395833334, + "learning_rate": 0.0001, + "loss": 7.782, + "loss/crossentropy": 2.1111979484558105, + "loss/hidden": 3.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.2169733637943864, + "step": 19840 + }, + { + "epoch": 0.6616666666666666, + "grad_norm": 34.25, + "grad_norm_var": 20.564518229166666, + "learning_rate": 0.0001, + "loss": 7.805, + "loss/crossentropy": 2.1517412811517715, + "loss/hidden": 3.656640625, + "loss/jsd": 0.0, + "loss/logits": 0.22375482488423587, + "step": 19850 + }, + { + "epoch": 0.662, + "grad_norm": 30.75, + "grad_norm_var": 6.059375, + "learning_rate": 0.0001, + "loss": 7.6481, + "loss/crossentropy": 1.8939931578934193, + "loss/hidden": 3.6609375, + "loss/jsd": 0.0, + "loss/logits": 0.2128385290503502, + "step": 19860 + }, + { + "epoch": 0.6623333333333333, + "grad_norm": 30.5, + "grad_norm_var": 7.92890625, + "learning_rate": 0.0001, + "loss": 7.7487, + "loss/crossentropy": 2.2018162116408346, + "loss/hidden": 3.61875, + "loss/jsd": 0.0, + "loss/logits": 0.22412298023700714, + "step": 19870 + }, + { + "epoch": 0.6626666666666666, + "grad_norm": 32.75, + "grad_norm_var": 9.864322916666667, + "learning_rate": 0.0001, + "loss": 7.7606, + "loss/crossentropy": 2.1517828926444054, + "loss/hidden": 3.54140625, + "loss/jsd": 0.0, + "loss/logits": 0.2149300893768668, + "step": 19880 + }, + { + "epoch": 0.663, + "grad_norm": 30.0, + "grad_norm_var": 3.3268229166666665, + "learning_rate": 0.0001, + "loss": 7.9545, + "loss/crossentropy": 2.1691304370760918, + "loss/hidden": 3.662890625, + "loss/jsd": 0.0, + "loss/logits": 0.21897413935512305, + "step": 19890 + }, + { + "epoch": 0.6633333333333333, + "grad_norm": 35.5, + "grad_norm_var": 11.280143229166667, + "learning_rate": 0.0001, + "loss": 7.8222, + "loss/crossentropy": 2.1076577827334404, + "loss/hidden": 3.65078125, + "loss/jsd": 0.0, + "loss/logits": 0.21274937596172094, + "step": 19900 + }, + { + "epoch": 0.6636666666666666, + "grad_norm": 29.625, + "grad_norm_var": 13.858072916666666, + "learning_rate": 0.0001, + "loss": 7.8676, + "loss/crossentropy": 2.1668387286365034, + "loss/hidden": 3.5890625, + "loss/jsd": 0.0, + "loss/logits": 0.21026035211980343, + "step": 19910 + }, + { + "epoch": 0.664, + "grad_norm": 30.125, + "grad_norm_var": 1.4061848958333334, + "learning_rate": 0.0001, + "loss": 7.7496, + "loss/crossentropy": 2.052803510427475, + "loss/hidden": 3.642578125, + "loss/jsd": 0.0, + "loss/logits": 0.20628087930381298, + "step": 19920 + }, + { + "epoch": 0.6643333333333333, + "grad_norm": 31.5, + "grad_norm_var": 5.010872395833333, + "learning_rate": 0.0001, + "loss": 7.8597, + "loss/crossentropy": 2.2376641765236855, + "loss/hidden": 3.6359375, + "loss/jsd": 0.0, + "loss/logits": 0.22267442829906942, + "step": 19930 + }, + { + "epoch": 0.6646666666666666, + "grad_norm": 30.875, + "grad_norm_var": 7.891666666666667, + "learning_rate": 0.0001, + "loss": 7.7664, + "loss/crossentropy": 2.1567267000675203, + "loss/hidden": 3.590625, + "loss/jsd": 0.0, + "loss/logits": 0.20887317396700383, + "step": 19940 + }, + { + "epoch": 0.665, + "grad_norm": 29.375, + "grad_norm_var": 33.853580729166666, + "learning_rate": 0.0001, + "loss": 7.7958, + "loss/crossentropy": 2.0834827691316606, + "loss/hidden": 3.5921875, + "loss/jsd": 0.0, + "loss/logits": 0.21954385321587325, + "step": 19950 + }, + { + "epoch": 0.6653333333333333, + "grad_norm": 32.75, + "grad_norm_var": 35.7337890625, + "learning_rate": 0.0001, + "loss": 7.855, + "loss/crossentropy": 2.0476011231541635, + "loss/hidden": 3.689453125, + "loss/jsd": 0.0, + "loss/logits": 0.23169058002531528, + "step": 19960 + }, + { + "epoch": 0.6656666666666666, + "grad_norm": 31.5, + "grad_norm_var": 8.638997395833334, + "learning_rate": 0.0001, + "loss": 7.741, + "loss/crossentropy": 2.2387500554323196, + "loss/hidden": 3.5625, + "loss/jsd": 0.0, + "loss/logits": 0.2047835446894169, + "step": 19970 + }, + { + "epoch": 0.666, + "grad_norm": 38.25, + "grad_norm_var": 11.805208333333333, + "learning_rate": 0.0001, + "loss": 7.7604, + "loss/crossentropy": 2.0881649285554884, + "loss/hidden": 3.614453125, + "loss/jsd": 0.0, + "loss/logits": 0.19943447094410657, + "step": 19980 + }, + { + "epoch": 0.6663333333333333, + "grad_norm": 38.0, + "grad_norm_var": 10.564518229166667, + "learning_rate": 0.0001, + "loss": 7.7988, + "loss/crossentropy": 2.0458758428692816, + "loss/hidden": 3.673046875, + "loss/jsd": 0.0, + "loss/logits": 0.22079507317394018, + "step": 19990 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 35.0, + "grad_norm_var": 8.631184895833334, + "learning_rate": 0.0001, + "loss": 7.7353, + "loss/crossentropy": 2.1835017532110212, + "loss/hidden": 3.6046875, + "loss/jsd": 0.0, + "loss/logits": 0.21447087433189155, + "step": 20000 + }, + { + "epoch": 0.667, + "grad_norm": 27.75, + "grad_norm_var": 2.8713262106311393e+18, + "learning_rate": 9.999977793408362e-05, + "loss": 7.7009, + "loss/crossentropy": 2.1651393327862025, + "loss/hidden": 3.6609375, + "loss/jsd": 0.0, + "loss/logits": 0.22053546169772745, + "step": 20010 + }, + { + "epoch": 0.6673333333333333, + "grad_norm": 29.0, + "grad_norm_var": 10.5337890625, + "learning_rate": 9.999911173852618e-05, + "loss": 7.7892, + "loss/crossentropy": 2.049077409505844, + "loss/hidden": 3.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.21919873766601086, + "step": 20020 + }, + { + "epoch": 0.6676666666666666, + "grad_norm": 33.25, + "grad_norm_var": 5.30390625, + "learning_rate": 9.999800141990274e-05, + "loss": 7.7849, + "loss/crossentropy": 2.100401471555233, + "loss/hidden": 3.543359375, + "loss/jsd": 0.0, + "loss/logits": 0.1976662116125226, + "step": 20030 + }, + { + "epoch": 0.668, + "grad_norm": 30.75, + "grad_norm_var": 4.901041666666667, + "learning_rate": 9.999644698917173e-05, + "loss": 7.7615, + "loss/crossentropy": 2.0303331464529037, + "loss/hidden": 3.64140625, + "loss/jsd": 0.0, + "loss/logits": 0.20121301785111428, + "step": 20040 + }, + { + "epoch": 0.6683333333333333, + "grad_norm": 6710886400.0, + "grad_norm_var": 6.481711869968061e+18, + "learning_rate": 9.999444846167473e-05, + "loss": 7.9027, + "loss/crossentropy": 2.11352252215147, + "loss/hidden": 3.676953125, + "loss/jsd": 0.0, + "loss/logits": 0.23072675410658122, + "step": 20050 + }, + { + "epoch": 0.6686666666666666, + "grad_norm": 29.875, + "grad_norm_var": 6.481711869365549e+18, + "learning_rate": 9.99920058571364e-05, + "loss": 7.8042, + "loss/crossentropy": 2.0778122201561926, + "loss/hidden": 3.620703125, + "loss/jsd": 0.0, + "loss/logits": 0.20652580186724662, + "step": 20060 + }, + { + "epoch": 0.669, + "grad_norm": 30.125, + "grad_norm_var": 6.4056640625, + "learning_rate": 9.99891191996643e-05, + "loss": 7.6094, + "loss/crossentropy": 2.1776298195123673, + "loss/hidden": 3.56796875, + "loss/jsd": 0.0, + "loss/logits": 0.20591478087008, + "step": 20070 + }, + { + "epoch": 0.6693333333333333, + "grad_norm": 28.75, + "grad_norm_var": 8.021875, + "learning_rate": 9.99857885177485e-05, + "loss": 7.7139, + "loss/crossentropy": 2.0494499459862707, + "loss/hidden": 3.546484375, + "loss/jsd": 0.0, + "loss/logits": 0.19295884054154158, + "step": 20080 + }, + { + "epoch": 0.6696666666666666, + "grad_norm": 32.5, + "grad_norm_var": 5.2994140625, + "learning_rate": 9.998201384426155e-05, + "loss": 7.7195, + "loss/crossentropy": 1.9814274668693543, + "loss/hidden": 3.580078125, + "loss/jsd": 0.0, + "loss/logits": 0.20319805517792702, + "step": 20090 + }, + { + "epoch": 0.67, + "grad_norm": 28.25, + "grad_norm_var": 7.0119140625, + "learning_rate": 9.997779521645793e-05, + "loss": 7.8642, + "loss/crossentropy": 2.0849661231040955, + "loss/hidden": 3.614453125, + "loss/jsd": 0.0, + "loss/logits": 0.21840104656293988, + "step": 20100 + }, + { + "epoch": 0.6703333333333333, + "grad_norm": 30.875, + "grad_norm_var": 6.142122395833334, + "learning_rate": 9.997313267597378e-05, + "loss": 7.8638, + "loss/crossentropy": 2.0836787208914758, + "loss/hidden": 3.576953125, + "loss/jsd": 0.0, + "loss/logits": 0.2186410004273057, + "step": 20110 + }, + { + "epoch": 0.6706666666666666, + "grad_norm": 30.375, + "grad_norm_var": 12.291080729166667, + "learning_rate": 9.996802626882653e-05, + "loss": 7.7777, + "loss/crossentropy": 2.03349449634552, + "loss/hidden": 3.59609375, + "loss/jsd": 0.0, + "loss/logits": 0.21252898061648012, + "step": 20120 + }, + { + "epoch": 0.671, + "grad_norm": 28.875, + "grad_norm_var": 12.816080729166666, + "learning_rate": 9.99624760454143e-05, + "loss": 7.7899, + "loss/crossentropy": 2.032351566851139, + "loss/hidden": 3.625, + "loss/jsd": 0.0, + "loss/logits": 0.21746116746217012, + "step": 20130 + }, + { + "epoch": 0.6713333333333333, + "grad_norm": 27.75, + "grad_norm_var": 4.3947265625, + "learning_rate": 9.995648206051563e-05, + "loss": 7.8777, + "loss/crossentropy": 2.148711860179901, + "loss/hidden": 3.753125, + "loss/jsd": 0.0, + "loss/logits": 0.22083626296371223, + "step": 20140 + }, + { + "epoch": 0.6716666666666666, + "grad_norm": 31.75, + "grad_norm_var": 2.348372395833333, + "learning_rate": 9.995004437328867e-05, + "loss": 7.7507, + "loss/crossentropy": 1.838768770545721, + "loss/hidden": 3.54765625, + "loss/jsd": 0.0, + "loss/logits": 0.18891090219840406, + "step": 20150 + }, + { + "epoch": 0.672, + "grad_norm": 30.75, + "grad_norm_var": 3.405989583333333, + "learning_rate": 9.99431630472708e-05, + "loss": 7.7609, + "loss/crossentropy": 1.9995342150330544, + "loss/hidden": 3.71796875, + "loss/jsd": 0.0, + "loss/logits": 0.22888598432764412, + "step": 20160 + }, + { + "epoch": 0.6723333333333333, + "grad_norm": 29.25, + "grad_norm_var": 115.0259765625, + "learning_rate": 9.993583815037793e-05, + "loss": 7.8473, + "loss/crossentropy": 2.0745826318860052, + "loss/hidden": 3.52421875, + "loss/jsd": 0.0, + "loss/logits": 0.1968067741021514, + "step": 20170 + }, + { + "epoch": 0.6726666666666666, + "grad_norm": 30.5, + "grad_norm_var": 119.87682291666667, + "learning_rate": 9.992806975490389e-05, + "loss": 7.6878, + "loss/crossentropy": 2.186102945357561, + "loss/hidden": 3.604296875, + "loss/jsd": 0.0, + "loss/logits": 0.21814998863264917, + "step": 20180 + }, + { + "epoch": 0.673, + "grad_norm": 30.375, + "grad_norm_var": 3.3150390625, + "learning_rate": 9.991985793751955e-05, + "loss": 7.869, + "loss/crossentropy": 2.1551688984036446, + "loss/hidden": 3.740234375, + "loss/jsd": 0.0, + "loss/logits": 0.2181480558589101, + "step": 20190 + }, + { + "epoch": 0.6733333333333333, + "grad_norm": 31.125, + "grad_norm_var": 2.3955729166666666, + "learning_rate": 9.991120277927223e-05, + "loss": 7.7436, + "loss/crossentropy": 2.1464362293481827, + "loss/hidden": 3.53671875, + "loss/jsd": 0.0, + "loss/logits": 0.20534380227327348, + "step": 20200 + }, + { + "epoch": 0.6736666666666666, + "grad_norm": 33.75, + "grad_norm_var": 2.9457682291666667, + "learning_rate": 9.990210436558488e-05, + "loss": 7.7295, + "loss/crossentropy": 2.1891664013266565, + "loss/hidden": 3.49296875, + "loss/jsd": 0.0, + "loss/logits": 0.20168747715651988, + "step": 20210 + }, + { + "epoch": 0.674, + "grad_norm": 30.25, + "grad_norm_var": 4.6306640625, + "learning_rate": 9.989256278625514e-05, + "loss": 7.6581, + "loss/crossentropy": 1.9678195029497147, + "loss/hidden": 3.469140625, + "loss/jsd": 0.0, + "loss/logits": 0.1858463604003191, + "step": 20220 + }, + { + "epoch": 0.6743333333333333, + "grad_norm": 33.0, + "grad_norm_var": 5.518684895833333, + "learning_rate": 9.988257813545458e-05, + "loss": 7.7895, + "loss/crossentropy": 2.106415245682001, + "loss/hidden": 3.66953125, + "loss/jsd": 0.0, + "loss/logits": 0.22159097539260983, + "step": 20230 + }, + { + "epoch": 0.6746666666666666, + "grad_norm": 30.75, + "grad_norm_var": 2.7979166666666666, + "learning_rate": 9.987215051172763e-05, + "loss": 7.8636, + "loss/crossentropy": 2.040626636892557, + "loss/hidden": 3.558984375, + "loss/jsd": 0.0, + "loss/logits": 0.21652155686169863, + "step": 20240 + }, + { + "epoch": 0.675, + "grad_norm": 31.0, + "grad_norm_var": 3.1832682291666665, + "learning_rate": 9.986128001799077e-05, + "loss": 7.7925, + "loss/crossentropy": 2.1772810891270638, + "loss/hidden": 3.684375, + "loss/jsd": 0.0, + "loss/logits": 0.2191918555647135, + "step": 20250 + }, + { + "epoch": 0.6753333333333333, + "grad_norm": 31.875, + "grad_norm_var": 3.9124348958333335, + "learning_rate": 9.984996676153134e-05, + "loss": 7.695, + "loss/crossentropy": 2.180080857872963, + "loss/hidden": 3.647265625, + "loss/jsd": 0.0, + "loss/logits": 0.22119200490415097, + "step": 20260 + }, + { + "epoch": 0.6756666666666666, + "grad_norm": 30.0, + "grad_norm_var": 4.589518229166667, + "learning_rate": 9.983821085400665e-05, + "loss": 7.9482, + "loss/crossentropy": 2.102330905199051, + "loss/hidden": 3.6609375, + "loss/jsd": 0.0, + "loss/logits": 0.2217390850186348, + "step": 20270 + }, + { + "epoch": 0.676, + "grad_norm": 31.0, + "grad_norm_var": 3.888541666666667, + "learning_rate": 9.982601241144277e-05, + "loss": 7.7288, + "loss/crossentropy": 1.9200996845960616, + "loss/hidden": 3.636328125, + "loss/jsd": 0.0, + "loss/logits": 0.19473480042070151, + "step": 20280 + }, + { + "epoch": 0.6763333333333333, + "grad_norm": 27.25, + "grad_norm_var": 13.608072916666666, + "learning_rate": 9.981337155423336e-05, + "loss": 7.876, + "loss/crossentropy": 2.160831370949745, + "loss/hidden": 3.55, + "loss/jsd": 0.0, + "loss/logits": 0.21856002546846867, + "step": 20290 + }, + { + "epoch": 0.6766666666666666, + "grad_norm": 29.0, + "grad_norm_var": 3.2994140625, + "learning_rate": 9.980028840713861e-05, + "loss": 7.7397, + "loss/crossentropy": 1.8465786181390285, + "loss/hidden": 3.592578125, + "loss/jsd": 0.0, + "loss/logits": 0.19492179118096828, + "step": 20300 + }, + { + "epoch": 0.677, + "grad_norm": 31.125, + "grad_norm_var": 2.7372395833333334, + "learning_rate": 9.978676309928389e-05, + "loss": 7.7692, + "loss/crossentropy": 2.069914309307933, + "loss/hidden": 3.591015625, + "loss/jsd": 0.0, + "loss/logits": 0.20638742656446993, + "step": 20310 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 32.25, + "grad_norm_var": 2.5479166666666666, + "learning_rate": 9.977279576415853e-05, + "loss": 8.0062, + "loss/crossentropy": 2.1518563136458395, + "loss/hidden": 3.65, + "loss/jsd": 0.0, + "loss/logits": 0.21888233721256256, + "step": 20320 + }, + { + "epoch": 0.6776666666666666, + "grad_norm": 32.5, + "grad_norm_var": 1.9254557291666667, + "learning_rate": 9.975838653961446e-05, + "loss": 7.7995, + "loss/crossentropy": 2.1415294885635374, + "loss/hidden": 3.61796875, + "loss/jsd": 0.0, + "loss/logits": 0.2294903416186571, + "step": 20330 + }, + { + "epoch": 0.678, + "grad_norm": 30.125, + "grad_norm_var": 3.7520182291666666, + "learning_rate": 9.974353556786496e-05, + "loss": 7.7562, + "loss/crossentropy": 2.060848282277584, + "loss/hidden": 3.518359375, + "loss/jsd": 0.0, + "loss/logits": 0.2027845649048686, + "step": 20340 + }, + { + "epoch": 0.6783333333333333, + "grad_norm": 33.25, + "grad_norm_var": 1.8582682291666666, + "learning_rate": 9.97282429954831e-05, + "loss": 7.8245, + "loss/crossentropy": 2.0787692457437514, + "loss/hidden": 3.639453125, + "loss/jsd": 0.0, + "loss/logits": 0.21248381081968545, + "step": 20350 + }, + { + "epoch": 0.6786666666666666, + "grad_norm": 29.25, + "grad_norm_var": 1.2686848958333334, + "learning_rate": 9.971250897340038e-05, + "loss": 7.7767, + "loss/crossentropy": 2.1000698655843735, + "loss/hidden": 3.746484375, + "loss/jsd": 0.0, + "loss/logits": 0.2223961053416133, + "step": 20360 + }, + { + "epoch": 0.679, + "grad_norm": 30.0, + "grad_norm_var": 2.798958333333333, + "learning_rate": 9.969633365690528e-05, + "loss": 7.7855, + "loss/crossentropy": 2.2210501074790954, + "loss/hidden": 3.634765625, + "loss/jsd": 0.0, + "loss/logits": 0.2190061157569289, + "step": 20370 + }, + { + "epoch": 0.6793333333333333, + "grad_norm": 31.5, + "grad_norm_var": 10.903580729166666, + "learning_rate": 9.967971720564162e-05, + "loss": 7.8671, + "loss/crossentropy": 2.25355578660965, + "loss/hidden": 3.623828125, + "loss/jsd": 0.0, + "loss/logits": 0.22110446617007257, + "step": 20380 + }, + { + "epoch": 0.6796666666666666, + "grad_norm": 32.25, + "grad_norm_var": 10.023958333333333, + "learning_rate": 9.966265978360708e-05, + "loss": 7.9619, + "loss/crossentropy": 2.2148331478238106, + "loss/hidden": 3.708984375, + "loss/jsd": 0.0, + "loss/logits": 0.2353464813902974, + "step": 20390 + }, + { + "epoch": 0.68, + "grad_norm": 29.25, + "grad_norm_var": 3.583268229166667, + "learning_rate": 9.964516155915151e-05, + "loss": 7.7277, + "loss/crossentropy": 2.0867557391524314, + "loss/hidden": 3.665234375, + "loss/jsd": 0.0, + "loss/logits": 0.217846536077559, + "step": 20400 + }, + { + "epoch": 0.6803333333333333, + "grad_norm": 29.625, + "grad_norm_var": 2.0872395833333335, + "learning_rate": 9.962722270497534e-05, + "loss": 7.81, + "loss/crossentropy": 2.1222174257040023, + "loss/hidden": 3.691796875, + "loss/jsd": 0.0, + "loss/logits": 0.2225545782595873, + "step": 20410 + }, + { + "epoch": 0.6806666666666666, + "grad_norm": 32.5, + "grad_norm_var": 3.3593098958333334, + "learning_rate": 9.960884339812781e-05, + "loss": 7.8604, + "loss/crossentropy": 2.0085777252912522, + "loss/hidden": 3.63359375, + "loss/jsd": 0.0, + "loss/logits": 0.2187149330973625, + "step": 20420 + }, + { + "epoch": 0.681, + "grad_norm": 29.0, + "grad_norm_var": 306.26875, + "learning_rate": 9.959002382000524e-05, + "loss": 7.7649, + "loss/crossentropy": 1.929932000488043, + "loss/hidden": 3.672265625, + "loss/jsd": 0.0, + "loss/logits": 0.20384540902450682, + "step": 20430 + }, + { + "epoch": 0.6813333333333333, + "grad_norm": 31.625, + "grad_norm_var": 299.79765625, + "learning_rate": 9.95707641563493e-05, + "loss": 7.8215, + "loss/crossentropy": 2.055840089917183, + "loss/hidden": 3.61953125, + "loss/jsd": 0.0, + "loss/logits": 0.19900662265717983, + "step": 20440 + }, + { + "epoch": 0.6816666666666666, + "grad_norm": 34.25, + "grad_norm_var": 22.658072916666665, + "learning_rate": 9.95510645972451e-05, + "loss": 7.7012, + "loss/crossentropy": 2.1488531097769736, + "loss/hidden": 3.63359375, + "loss/jsd": 0.0, + "loss/logits": 0.2117614457383752, + "step": 20450 + }, + { + "epoch": 0.682, + "grad_norm": 33.5, + "grad_norm_var": 26.0291015625, + "learning_rate": 9.95309253371193e-05, + "loss": 7.717, + "loss/crossentropy": 2.2348560094833374, + "loss/hidden": 3.512890625, + "loss/jsd": 0.0, + "loss/logits": 0.1998794011771679, + "step": 20460 + }, + { + "epoch": 0.6823333333333333, + "grad_norm": 29.125, + "grad_norm_var": 2.5136418842933724e+18, + "learning_rate": 9.951034657473828e-05, + "loss": 7.7369, + "loss/crossentropy": 2.160574886202812, + "loss/hidden": 3.598046875, + "loss/jsd": 0.0, + "loss/logits": 0.20629960373044015, + "step": 20470 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 28.5, + "grad_norm_var": 65.0134765625, + "learning_rate": 9.948932851320614e-05, + "loss": 7.7652, + "loss/crossentropy": 2.1478210985660553, + "loss/hidden": 3.62265625, + "loss/jsd": 0.0, + "loss/logits": 0.21312507782131435, + "step": 20480 + }, + { + "epoch": 0.683, + "grad_norm": 28.0, + "grad_norm_var": 73.778125, + "learning_rate": 9.946787135996263e-05, + "loss": 7.6988, + "loss/crossentropy": 1.978414911031723, + "loss/hidden": 3.618359375, + "loss/jsd": 0.0, + "loss/logits": 0.22136187348514796, + "step": 20490 + }, + { + "epoch": 0.6833333333333333, + "grad_norm": 30.25, + "grad_norm_var": 13.6369140625, + "learning_rate": 9.94459753267812e-05, + "loss": 7.9084, + "loss/crossentropy": 2.1078326418995856, + "loss/hidden": 3.59375, + "loss/jsd": 0.0, + "loss/logits": 0.20318037196993827, + "step": 20500 + }, + { + "epoch": 0.6836666666666666, + "grad_norm": 28.625, + "grad_norm_var": 15.780143229166667, + "learning_rate": 9.942364062976687e-05, + "loss": 7.8032, + "loss/crossentropy": 2.143668609857559, + "loss/hidden": 3.65703125, + "loss/jsd": 0.0, + "loss/logits": 0.21416857857257127, + "step": 20510 + }, + { + "epoch": 0.684, + "grad_norm": 33.0, + "grad_norm_var": 3.283124096840447e+18, + "learning_rate": 9.940086748935406e-05, + "loss": 7.8061, + "loss/crossentropy": 1.9484010234475135, + "loss/hidden": 3.91875, + "loss/jsd": 0.0, + "loss/logits": 0.22513604983687402, + "step": 20520 + }, + { + "epoch": 0.6843333333333333, + "grad_norm": 36.75, + "grad_norm_var": 3.2831240977690655e+18, + "learning_rate": 9.937765613030451e-05, + "loss": 7.7232, + "loss/crossentropy": 2.0105784103274345, + "loss/hidden": 3.689453125, + "loss/jsd": 0.0, + "loss/logits": 0.20260654278099538, + "step": 20530 + }, + { + "epoch": 0.6846666666666666, + "grad_norm": 34.75, + "grad_norm_var": 11.262239583333333, + "learning_rate": 9.935400678170492e-05, + "loss": 7.808, + "loss/crossentropy": 2.044108145684004, + "loss/hidden": 3.599609375, + "loss/jsd": 0.0, + "loss/logits": 0.20465944344177842, + "step": 20540 + }, + { + "epoch": 0.685, + "grad_norm": 32.0, + "grad_norm_var": 13.480143229166666, + "learning_rate": 9.932991967696483e-05, + "loss": 7.7901, + "loss/crossentropy": 2.155579614639282, + "loss/hidden": 3.554296875, + "loss/jsd": 0.0, + "loss/logits": 0.21164779588580132, + "step": 20550 + }, + { + "epoch": 0.6853333333333333, + "grad_norm": 49.25, + "grad_norm_var": 34.08274739583333, + "learning_rate": 9.930539505381426e-05, + "loss": 7.8068, + "loss/crossentropy": 2.2071971163153647, + "loss/hidden": 3.643359375, + "loss/jsd": 0.0, + "loss/logits": 0.21646333318203687, + "step": 20560 + }, + { + "epoch": 0.6856666666666666, + "grad_norm": 30.0, + "grad_norm_var": 25.3353515625, + "learning_rate": 9.928043315430128e-05, + "loss": 7.7163, + "loss/crossentropy": 2.0667000114917755, + "loss/hidden": 3.5875, + "loss/jsd": 0.0, + "loss/logits": 0.21009525340050458, + "step": 20570 + }, + { + "epoch": 0.686, + "grad_norm": 28.75, + "grad_norm_var": 8.79140625, + "learning_rate": 9.925503422478984e-05, + "loss": 7.7736, + "loss/crossentropy": 2.207004964351654, + "loss/hidden": 3.572265625, + "loss/jsd": 0.0, + "loss/logits": 0.21299835238605738, + "step": 20580 + }, + { + "epoch": 0.6863333333333334, + "grad_norm": 31.875, + "grad_norm_var": 18.999934895833334, + "learning_rate": 9.922919851595707e-05, + "loss": 7.864, + "loss/crossentropy": 2.0951112896203994, + "loss/hidden": 3.70546875, + "loss/jsd": 0.0, + "loss/logits": 0.2267284881323576, + "step": 20590 + }, + { + "epoch": 0.6866666666666666, + "grad_norm": 33.5, + "grad_norm_var": 20.902083333333334, + "learning_rate": 9.920292628279099e-05, + "loss": 7.7329, + "loss/crossentropy": 2.0619683027267457, + "loss/hidden": 3.67265625, + "loss/jsd": 0.0, + "loss/logits": 0.21196486745029688, + "step": 20600 + }, + { + "epoch": 0.687, + "grad_norm": 31.5, + "grad_norm_var": 9.529166666666667, + "learning_rate": 9.917621778458796e-05, + "loss": 7.7423, + "loss/crossentropy": 2.003011184930801, + "loss/hidden": 3.54765625, + "loss/jsd": 0.0, + "loss/logits": 0.19809650387614966, + "step": 20610 + }, + { + "epoch": 0.6873333333333334, + "grad_norm": 31.125, + "grad_norm_var": 2.4936848958333333, + "learning_rate": 9.914907328495003e-05, + "loss": 7.7587, + "loss/crossentropy": 2.047277623414993, + "loss/hidden": 3.625390625, + "loss/jsd": 0.0, + "loss/logits": 0.20881472658365965, + "step": 20620 + }, + { + "epoch": 0.6876666666666666, + "grad_norm": 38.75, + "grad_norm_var": 7.1369140625, + "learning_rate": 9.91214930517825e-05, + "loss": 7.8442, + "loss/crossentropy": 2.047130857408047, + "loss/hidden": 3.61953125, + "loss/jsd": 0.0, + "loss/logits": 0.2019510269165039, + "step": 20630 + }, + { + "epoch": 0.688, + "grad_norm": 28.0, + "grad_norm_var": 8.914518229166667, + "learning_rate": 9.909347735729111e-05, + "loss": 7.7182, + "loss/crossentropy": 2.101368544995785, + "loss/hidden": 3.518359375, + "loss/jsd": 0.0, + "loss/logits": 0.22426690720021725, + "step": 20640 + }, + { + "epoch": 0.6883333333333334, + "grad_norm": 30.25, + "grad_norm_var": 4.451497395833333, + "learning_rate": 9.906502647797946e-05, + "loss": 7.8233, + "loss/crossentropy": 1.987765783816576, + "loss/hidden": 3.694921875, + "loss/jsd": 0.0, + "loss/logits": 0.2175267556682229, + "step": 20650 + }, + { + "epoch": 0.6886666666666666, + "grad_norm": 27.375, + "grad_norm_var": 21.977083333333333, + "learning_rate": 9.903614069464625e-05, + "loss": 7.7182, + "loss/crossentropy": 2.124682963639498, + "loss/hidden": 3.592578125, + "loss/jsd": 0.0, + "loss/logits": 0.20821518804877998, + "step": 20660 + }, + { + "epoch": 0.689, + "grad_norm": 31.75, + "grad_norm_var": 7.99375, + "learning_rate": 9.900682029238249e-05, + "loss": 7.8463, + "loss/crossentropy": 2.0248075053095818, + "loss/hidden": 3.645703125, + "loss/jsd": 0.0, + "loss/logits": 0.22646026099100708, + "step": 20670 + }, + { + "epoch": 0.6893333333333334, + "grad_norm": 31.625, + "grad_norm_var": 5.612955729166667, + "learning_rate": 9.897706556056872e-05, + "loss": 7.9115, + "loss/crossentropy": 2.0372205063700677, + "loss/hidden": 3.763671875, + "loss/jsd": 0.0, + "loss/logits": 0.23688461929559707, + "step": 20680 + }, + { + "epoch": 0.6896666666666667, + "grad_norm": 30.0, + "grad_norm_var": 3.0962890625, + "learning_rate": 9.894687679287211e-05, + "loss": 7.7103, + "loss/crossentropy": 1.9503981947898865, + "loss/hidden": 3.5734375, + "loss/jsd": 0.0, + "loss/logits": 0.20171856675297023, + "step": 20690 + }, + { + "epoch": 0.69, + "grad_norm": 32.25, + "grad_norm_var": 1.8863932291666667, + "learning_rate": 9.891625428724363e-05, + "loss": 7.8373, + "loss/crossentropy": 1.9779121212661266, + "loss/hidden": 3.706640625, + "loss/jsd": 0.0, + "loss/logits": 0.20387490205466746, + "step": 20700 + }, + { + "epoch": 0.6903333333333334, + "grad_norm": 28.75, + "grad_norm_var": 2.567122395833333, + "learning_rate": 9.888519834591505e-05, + "loss": 7.8038, + "loss/crossentropy": 2.0854588031768797, + "loss/hidden": 3.56171875, + "loss/jsd": 0.0, + "loss/logits": 0.19318762868642808, + "step": 20710 + }, + { + "epoch": 0.6906666666666667, + "grad_norm": 28.625, + "grad_norm_var": 5.96640625, + "learning_rate": 9.885370927539598e-05, + "loss": 7.8213, + "loss/crossentropy": 2.0958469033241274, + "loss/hidden": 3.70078125, + "loss/jsd": 0.0, + "loss/logits": 0.21331228129565716, + "step": 20720 + }, + { + "epoch": 0.691, + "grad_norm": 27.75, + "grad_norm_var": 4.359375, + "learning_rate": 9.88217873864708e-05, + "loss": 7.7409, + "loss/crossentropy": 2.0466203540563583, + "loss/hidden": 3.627734375, + "loss/jsd": 0.0, + "loss/logits": 0.20199444703757763, + "step": 20730 + }, + { + "epoch": 0.6913333333333334, + "grad_norm": 36.5, + "grad_norm_var": 4.928059895833333, + "learning_rate": 9.878943299419571e-05, + "loss": 7.7546, + "loss/crossentropy": 1.9727531932294369, + "loss/hidden": 3.6375, + "loss/jsd": 0.0, + "loss/logits": 0.20264990702271463, + "step": 20740 + }, + { + "epoch": 0.6916666666666667, + "grad_norm": 30.125, + "grad_norm_var": 7.667643229166667, + "learning_rate": 9.875664641789545e-05, + "loss": 7.7905, + "loss/crossentropy": 1.9583543412387372, + "loss/hidden": 3.551953125, + "loss/jsd": 0.0, + "loss/logits": 0.21098938062787057, + "step": 20750 + }, + { + "epoch": 0.692, + "grad_norm": 37.25, + "grad_norm_var": 2.155042766665373e+18, + "learning_rate": 9.872342798116033e-05, + "loss": 7.9216, + "loss/crossentropy": 2.1403904750943186, + "loss/hidden": 3.613671875, + "loss/jsd": 0.0, + "loss/logits": 0.20952636245638132, + "step": 20760 + }, + { + "epoch": 0.6923333333333334, + "grad_norm": 8019509248.0, + "grad_norm_var": 5.782152745791608e+18, + "learning_rate": 9.86897780118429e-05, + "loss": 7.6639, + "loss/crossentropy": 2.111561615765095, + "loss/hidden": 3.617578125, + "loss/jsd": 0.0, + "loss/logits": 0.2071863466873765, + "step": 20770 + }, + { + "epoch": 0.6926666666666667, + "grad_norm": 27.375, + "grad_norm_var": 4.0195330041945523e+18, + "learning_rate": 9.865569684205477e-05, + "loss": 7.8907, + "loss/crossentropy": 2.1496111884713174, + "loss/hidden": 3.567578125, + "loss/jsd": 0.0, + "loss/logits": 0.2130032055079937, + "step": 20780 + }, + { + "epoch": 0.693, + "grad_norm": 30.75, + "grad_norm_var": 4.995572916666666, + "learning_rate": 9.862118480816331e-05, + "loss": 7.8467, + "loss/crossentropy": 2.1802233815193177, + "loss/hidden": 3.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.22255988270044327, + "step": 20790 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 28.25, + "grad_norm_var": 4.992643229166666, + "learning_rate": 9.858624225078841e-05, + "loss": 7.695, + "loss/crossentropy": 2.0942075878381727, + "loss/hidden": 3.585546875, + "loss/jsd": 0.0, + "loss/logits": 0.2092249434441328, + "step": 20800 + }, + { + "epoch": 0.6936666666666667, + "grad_norm": 31.5, + "grad_norm_var": 5.141666666666667, + "learning_rate": 9.855086951479894e-05, + "loss": 7.7499, + "loss/crossentropy": 2.094904583692551, + "loss/hidden": 3.546484375, + "loss/jsd": 0.0, + "loss/logits": 0.20001194700598718, + "step": 20810 + }, + { + "epoch": 0.694, + "grad_norm": 7616856064.0, + "grad_norm_var": 3.626030989921677e+18, + "learning_rate": 9.851506694930958e-05, + "loss": 7.8424, + "loss/crossentropy": 2.24151945784688, + "loss/hidden": 3.616796875, + "loss/jsd": 0.0, + "loss/logits": 0.21622586157172918, + "step": 20820 + }, + { + "epoch": 0.6943333333333334, + "grad_norm": 35.0, + "grad_norm_var": 3.6260309876366203e+18, + "learning_rate": 9.847883490767716e-05, + "loss": 7.7868, + "loss/crossentropy": 2.077386400103569, + "loss/hidden": 3.5921875, + "loss/jsd": 0.0, + "loss/logits": 0.21033733878284694, + "step": 20830 + }, + { + "epoch": 0.6946666666666667, + "grad_norm": 31.25, + "grad_norm_var": 25.595572916666665, + "learning_rate": 9.844217374749732e-05, + "loss": 7.7393, + "loss/crossentropy": 2.1060422867536546, + "loss/hidden": 3.43984375, + "loss/jsd": 0.0, + "loss/logits": 0.19694673204794527, + "step": 20840 + }, + { + "epoch": 0.695, + "grad_norm": 30.125, + "grad_norm_var": 2.786672612471944e+18, + "learning_rate": 9.840508383060093e-05, + "loss": 7.7705, + "loss/crossentropy": 2.167571856081486, + "loss/hidden": 3.524609375, + "loss/jsd": 0.0, + "loss/logits": 0.2048779834061861, + "step": 20850 + }, + { + "epoch": 0.6953333333333334, + "grad_norm": 31.5, + "grad_norm_var": 2.7866726123189217e+18, + "learning_rate": 9.836756552305044e-05, + "loss": 7.7599, + "loss/crossentropy": 2.1368554055690767, + "loss/hidden": 3.569921875, + "loss/jsd": 0.0, + "loss/logits": 0.20562508180737496, + "step": 20860 + }, + { + "epoch": 0.6956666666666667, + "grad_norm": 28.75, + "grad_norm_var": 3.823372395833333, + "learning_rate": 9.832961919513646e-05, + "loss": 7.7132, + "loss/crossentropy": 2.0987789154052736, + "loss/hidden": 3.653515625, + "loss/jsd": 0.0, + "loss/logits": 0.21177561171352863, + "step": 20870 + }, + { + "epoch": 0.696, + "grad_norm": 30.5, + "grad_norm_var": 8.401822916666667, + "learning_rate": 9.829124522137386e-05, + "loss": 7.7658, + "loss/crossentropy": 2.0699412554502485, + "loss/hidden": 3.548828125, + "loss/jsd": 0.0, + "loss/logits": 0.19380738902837039, + "step": 20880 + }, + { + "epoch": 0.6963333333333334, + "grad_norm": 28.125, + "grad_norm_var": 3.7056640625, + "learning_rate": 9.825244398049834e-05, + "loss": 7.8394, + "loss/crossentropy": 2.1802071809768675, + "loss/hidden": 3.55234375, + "loss/jsd": 0.0, + "loss/logits": 0.20943715162575244, + "step": 20890 + }, + { + "epoch": 0.6966666666666667, + "grad_norm": 30.75, + "grad_norm_var": 2.9302083333333333, + "learning_rate": 9.821321585546244e-05, + "loss": 7.801, + "loss/crossentropy": 2.187861883640289, + "loss/hidden": 3.58828125, + "loss/jsd": 0.0, + "loss/logits": 0.20320057701319455, + "step": 20900 + }, + { + "epoch": 0.697, + "grad_norm": 33.75, + "grad_norm_var": 2.439322916666667, + "learning_rate": 9.817356123343193e-05, + "loss": 7.8392, + "loss/crossentropy": 2.143737843632698, + "loss/hidden": 3.59609375, + "loss/jsd": 0.0, + "loss/logits": 0.21037317141890527, + "step": 20910 + }, + { + "epoch": 0.6973333333333334, + "grad_norm": 34.5, + "grad_norm_var": 5.75625, + "learning_rate": 9.813348050578191e-05, + "loss": 7.6046, + "loss/crossentropy": 2.01915595009923, + "loss/hidden": 3.496875, + "loss/jsd": 0.0, + "loss/logits": 0.19461457710713148, + "step": 20920 + }, + { + "epoch": 0.6976666666666667, + "grad_norm": 29.625, + "grad_norm_var": 10.854166666666666, + "learning_rate": 9.8092974068093e-05, + "loss": 7.6391, + "loss/crossentropy": 2.104771558940411, + "loss/hidden": 3.622265625, + "loss/jsd": 0.0, + "loss/logits": 0.20720904739573598, + "step": 20930 + }, + { + "epoch": 0.698, + "grad_norm": 31.625, + "grad_norm_var": 28.412434895833332, + "learning_rate": 9.805204232014738e-05, + "loss": 7.7715, + "loss/crossentropy": 2.0513715844601395, + "loss/hidden": 3.775, + "loss/jsd": 0.0, + "loss/logits": 0.20735556054860355, + "step": 20940 + }, + { + "epoch": 0.6983333333333334, + "grad_norm": 29.625, + "grad_norm_var": 28.428059895833332, + "learning_rate": 9.801068566592485e-05, + "loss": 7.7446, + "loss/crossentropy": 2.141914916783571, + "loss/hidden": 3.487109375, + "loss/jsd": 0.0, + "loss/logits": 0.20847442476078867, + "step": 20950 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 28.75, + "grad_norm_var": 4.476822916666666, + "learning_rate": 9.796890451359894e-05, + "loss": 7.8283, + "loss/crossentropy": 2.0166680470108984, + "loss/hidden": 3.70078125, + "loss/jsd": 0.0, + "loss/logits": 0.2056173078715801, + "step": 20960 + }, + { + "epoch": 0.699, + "grad_norm": 28.625, + "grad_norm_var": 4.481705729166666, + "learning_rate": 9.792669927553271e-05, + "loss": 7.6997, + "loss/crossentropy": 2.0283680982887744, + "loss/hidden": 3.576171875, + "loss/jsd": 0.0, + "loss/logits": 0.2040084034204483, + "step": 20970 + }, + { + "epoch": 0.6993333333333334, + "grad_norm": 32.25, + "grad_norm_var": 15.808072916666667, + "learning_rate": 9.788407036827486e-05, + "loss": 7.8985, + "loss/crossentropy": 2.110441932082176, + "loss/hidden": 3.62734375, + "loss/jsd": 0.0, + "loss/logits": 0.2083257043734193, + "step": 20980 + }, + { + "epoch": 0.6996666666666667, + "grad_norm": 31.0, + "grad_norm_var": 13.953125, + "learning_rate": 9.784101821255546e-05, + "loss": 7.8639, + "loss/crossentropy": 2.1061757408082484, + "loss/hidden": 3.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.2161664988845587, + "step": 20990 + }, + { + "epoch": 0.7, + "grad_norm": 30.25, + "grad_norm_var": 2.733268229166667, + "learning_rate": 9.779754323328192e-05, + "loss": 7.6809, + "loss/crossentropy": 1.968663776665926, + "loss/hidden": 3.59453125, + "loss/jsd": 0.0, + "loss/logits": 0.20843339152634144, + "step": 21000 + }, + { + "epoch": 0.7003333333333334, + "grad_norm": 33.75, + "grad_norm_var": 4.762239583333334, + "learning_rate": 9.775364585953473e-05, + "loss": 7.7791, + "loss/crossentropy": 2.0461377993226053, + "loss/hidden": 3.669140625, + "loss/jsd": 0.0, + "loss/logits": 0.20731903873384, + "step": 21010 + }, + { + "epoch": 0.7006666666666667, + "grad_norm": 28.75, + "grad_norm_var": 5.0916015625, + "learning_rate": 9.770932652456326e-05, + "loss": 7.7952, + "loss/crossentropy": 2.0643589437007903, + "loss/hidden": 3.619140625, + "loss/jsd": 0.0, + "loss/logits": 0.2156803973019123, + "step": 21020 + }, + { + "epoch": 0.701, + "grad_norm": 28.875, + "grad_norm_var": 5.80625, + "learning_rate": 9.766458566578143e-05, + "loss": 7.773, + "loss/crossentropy": 2.0716071873903275, + "loss/hidden": 3.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.2185486238449812, + "step": 21030 + }, + { + "epoch": 0.7013333333333334, + "grad_norm": 38.0, + "grad_norm_var": 6.7322265625, + "learning_rate": 9.76194237247635e-05, + "loss": 7.728, + "loss/crossentropy": 2.182681308686733, + "loss/hidden": 3.44375, + "loss/jsd": 0.0, + "loss/logits": 0.1979642266407609, + "step": 21040 + }, + { + "epoch": 0.7016666666666667, + "grad_norm": 30.25, + "grad_norm_var": 6.1759765625, + "learning_rate": 9.757384114723954e-05, + "loss": 7.7438, + "loss/crossentropy": 2.0533548787236215, + "loss/hidden": 3.613671875, + "loss/jsd": 0.0, + "loss/logits": 0.21022803112864494, + "step": 21050 + }, + { + "epoch": 0.702, + "grad_norm": 29.5, + "grad_norm_var": 3.6103515625, + "learning_rate": 9.752783838309123e-05, + "loss": 7.7947, + "loss/crossentropy": 2.148482698202133, + "loss/hidden": 3.63828125, + "loss/jsd": 0.0, + "loss/logits": 0.2157578205689788, + "step": 21060 + }, + { + "epoch": 0.7023333333333334, + "grad_norm": 27.875, + "grad_norm_var": 2.6416666666666666, + "learning_rate": 9.748141588634725e-05, + "loss": 7.7997, + "loss/crossentropy": 1.9508283972740172, + "loss/hidden": 3.637890625, + "loss/jsd": 0.0, + "loss/logits": 0.20378196705132723, + "step": 21070 + }, + { + "epoch": 0.7026666666666667, + "grad_norm": 31.75, + "grad_norm_var": 23.627083333333335, + "learning_rate": 9.743457411517892e-05, + "loss": 7.8502, + "loss/crossentropy": 2.063946034014225, + "loss/hidden": 3.628515625, + "loss/jsd": 0.0, + "loss/logits": 0.22184212561696767, + "step": 21080 + }, + { + "epoch": 0.703, + "grad_norm": 34.0, + "grad_norm_var": 2.7603515625, + "learning_rate": 9.738731353189558e-05, + "loss": 7.7532, + "loss/crossentropy": 1.9890851199626922, + "loss/hidden": 3.553125, + "loss/jsd": 0.0, + "loss/logits": 0.19615612691268325, + "step": 21090 + }, + { + "epoch": 0.7033333333333334, + "grad_norm": 30.0, + "grad_norm_var": 2.9686848958333334, + "learning_rate": 9.733963460294015e-05, + "loss": 7.8208, + "loss/crossentropy": 1.9296169601380826, + "loss/hidden": 3.633984375, + "loss/jsd": 0.0, + "loss/logits": 0.2059748636558652, + "step": 21100 + }, + { + "epoch": 0.7036666666666667, + "grad_norm": 31.25, + "grad_norm_var": 41.9806640625, + "learning_rate": 9.729153779888439e-05, + "loss": 7.7385, + "loss/crossentropy": 1.9927678152918815, + "loss/hidden": 3.566796875, + "loss/jsd": 0.0, + "loss/logits": 0.19540305892005563, + "step": 21110 + }, + { + "epoch": 0.704, + "grad_norm": 28.75, + "grad_norm_var": 2.295768229166667, + "learning_rate": 9.724302359442434e-05, + "loss": 7.8334, + "loss/crossentropy": 2.0166243493556975, + "loss/hidden": 3.549609375, + "loss/jsd": 0.0, + "loss/logits": 0.20116372499614954, + "step": 21120 + }, + { + "epoch": 0.7043333333333334, + "grad_norm": 29.125, + "grad_norm_var": 1.8184895833333334, + "learning_rate": 9.719409246837561e-05, + "loss": 7.8302, + "loss/crossentropy": 1.9066402643918992, + "loss/hidden": 3.54140625, + "loss/jsd": 0.0, + "loss/logits": 0.1955884052440524, + "step": 21130 + }, + { + "epoch": 0.7046666666666667, + "grad_norm": 29.625, + "grad_norm_var": 14619.393489583334, + "learning_rate": 9.714474490366866e-05, + "loss": 7.917, + "loss/crossentropy": 1.9063568994402886, + "loss/hidden": 3.60859375, + "loss/jsd": 0.0, + "loss/logits": 0.1982713321223855, + "step": 21140 + }, + { + "epoch": 0.705, + "grad_norm": 28.375, + "grad_norm_var": 14504.148958333333, + "learning_rate": 9.709498138734405e-05, + "loss": 7.7312, + "loss/crossentropy": 1.9017325207591056, + "loss/hidden": 3.636328125, + "loss/jsd": 0.0, + "loss/logits": 0.19890787806361915, + "step": 21150 + }, + { + "epoch": 0.7053333333333334, + "grad_norm": 28.875, + "grad_norm_var": 19.811458333333334, + "learning_rate": 9.704480241054755e-05, + "loss": 7.6471, + "loss/crossentropy": 1.9751385256648064, + "loss/hidden": 3.6046875, + "loss/jsd": 0.0, + "loss/logits": 0.20594419166445732, + "step": 21160 + }, + { + "epoch": 0.7056666666666667, + "grad_norm": 28.875, + "grad_norm_var": 0.9884765625, + "learning_rate": 9.699420846852544e-05, + "loss": 7.7681, + "loss/crossentropy": 2.0237425029277802, + "loss/hidden": 3.624609375, + "loss/jsd": 0.0, + "loss/logits": 0.2027163729071617, + "step": 21170 + }, + { + "epoch": 0.706, + "grad_norm": 29.5, + "grad_norm_var": 0.8358723958333333, + "learning_rate": 9.694320006061949e-05, + "loss": 7.706, + "loss/crossentropy": 1.9728805258870126, + "loss/hidden": 3.50625, + "loss/jsd": 0.0, + "loss/logits": 0.20415272628888487, + "step": 21180 + }, + { + "epoch": 0.7063333333333334, + "grad_norm": 31.75, + "grad_norm_var": 1.5285807291666667, + "learning_rate": 9.689177769026211e-05, + "loss": 7.7188, + "loss/crossentropy": 2.114629329741001, + "loss/hidden": 3.634375, + "loss/jsd": 0.0, + "loss/logits": 0.2147956196218729, + "step": 21190 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 31.625, + "grad_norm_var": 14.480989583333333, + "learning_rate": 9.683994186497132e-05, + "loss": 7.8607, + "loss/crossentropy": 2.210577738285065, + "loss/hidden": 3.62578125, + "loss/jsd": 0.0, + "loss/logits": 0.22065361309796572, + "step": 21200 + }, + { + "epoch": 0.707, + "grad_norm": 28.75, + "grad_norm_var": 15.8478515625, + "learning_rate": 9.678769309634579e-05, + "loss": 7.7987, + "loss/crossentropy": 2.1382463231682776, + "loss/hidden": 3.630078125, + "loss/jsd": 0.0, + "loss/logits": 0.20682547576725482, + "step": 21210 + }, + { + "epoch": 0.7073333333333334, + "grad_norm": 50.0, + "grad_norm_var": 27.705989583333334, + "learning_rate": 9.673503190005977e-05, + "loss": 7.7034, + "loss/crossentropy": 1.9322528079152108, + "loss/hidden": 3.531640625, + "loss/jsd": 0.0, + "loss/logits": 0.19704128410667182, + "step": 21220 + }, + { + "epoch": 0.7076666666666667, + "grad_norm": 30.0, + "grad_norm_var": 34.70358072916667, + "learning_rate": 9.6681958795858e-05, + "loss": 7.7049, + "loss/crossentropy": 2.050449796766043, + "loss/hidden": 3.621484375, + "loss/jsd": 0.0, + "loss/logits": 0.23184078792110085, + "step": 21230 + }, + { + "epoch": 0.708, + "grad_norm": 31.375, + "grad_norm_var": 2.04140625, + "learning_rate": 9.66284743075506e-05, + "loss": 7.8191, + "loss/crossentropy": 2.2278542831540107, + "loss/hidden": 3.62265625, + "loss/jsd": 0.0, + "loss/logits": 0.22254880461841822, + "step": 21240 + }, + { + "epoch": 0.7083333333333334, + "grad_norm": 31.375, + "grad_norm_var": 6.119791666666667, + "learning_rate": 9.657457896300791e-05, + "loss": 7.7321, + "loss/crossentropy": 2.144664096832275, + "loss/hidden": 3.49453125, + "loss/jsd": 0.0, + "loss/logits": 0.20551785565912722, + "step": 21250 + }, + { + "epoch": 0.7086666666666667, + "grad_norm": 31.0, + "grad_norm_var": 9.594791666666667, + "learning_rate": 9.652027329415517e-05, + "loss": 7.7328, + "loss/crossentropy": 1.999681544303894, + "loss/hidden": 3.6890625, + "loss/jsd": 0.0, + "loss/logits": 0.2114706352353096, + "step": 21260 + }, + { + "epoch": 0.709, + "grad_norm": 28.375, + "grad_norm_var": 268.0978515625, + "learning_rate": 9.646555783696743e-05, + "loss": 7.8, + "loss/crossentropy": 2.0811921998858454, + "loss/hidden": 3.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.21599450353533028, + "step": 21270 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 28.5, + "grad_norm_var": 303.81458333333336, + "learning_rate": 9.641043313146417e-05, + "loss": 7.9496, + "loss/crossentropy": 2.1750354193151, + "loss/hidden": 3.63046875, + "loss/jsd": 0.0, + "loss/logits": 0.2116277886554599, + "step": 21280 + }, + { + "epoch": 0.7096666666666667, + "grad_norm": 29.375, + "grad_norm_var": 169.38125, + "learning_rate": 9.635489972170397e-05, + "loss": 7.6372, + "loss/crossentropy": 2.006293947994709, + "loss/hidden": 3.588671875, + "loss/jsd": 0.0, + "loss/logits": 0.19880817979574203, + "step": 21290 + }, + { + "epoch": 0.71, + "grad_norm": 28.75, + "grad_norm_var": 14.073958333333334, + "learning_rate": 9.629895815577916e-05, + "loss": 7.7662, + "loss/crossentropy": 2.0567120373249055, + "loss/hidden": 3.612890625, + "loss/jsd": 0.0, + "loss/logits": 0.2118833553045988, + "step": 21300 + }, + { + "epoch": 0.7103333333333334, + "grad_norm": 28.75, + "grad_norm_var": 14.435416666666667, + "learning_rate": 9.62426089858104e-05, + "loss": 7.7359, + "loss/crossentropy": 2.0882872194051743, + "loss/hidden": 3.63359375, + "loss/jsd": 0.0, + "loss/logits": 0.19910078253597022, + "step": 21310 + }, + { + "epoch": 0.7106666666666667, + "grad_norm": 31.125, + "grad_norm_var": 8.761393229166666, + "learning_rate": 9.618585276794129e-05, + "loss": 7.7853, + "loss/crossentropy": 2.0896404944360256, + "loss/hidden": 3.64921875, + "loss/jsd": 0.0, + "loss/logits": 0.22682881793007253, + "step": 21320 + }, + { + "epoch": 0.711, + "grad_norm": 33.25, + "grad_norm_var": 23.874739583333334, + "learning_rate": 9.612869006233275e-05, + "loss": 7.9966, + "loss/crossentropy": 2.1692073047161102, + "loss/hidden": 3.61796875, + "loss/jsd": 0.0, + "loss/logits": 0.2034358810633421, + "step": 21330 + }, + { + "epoch": 0.7113333333333334, + "grad_norm": 32.25, + "grad_norm_var": 18.7791015625, + "learning_rate": 9.607112143315763e-05, + "loss": 7.8539, + "loss/crossentropy": 2.0817694112658502, + "loss/hidden": 3.56171875, + "loss/jsd": 0.0, + "loss/logits": 0.2055924255400896, + "step": 21340 + }, + { + "epoch": 0.7116666666666667, + "grad_norm": 31.625, + "grad_norm_var": 9.392643229166667, + "learning_rate": 9.601314744859504e-05, + "loss": 7.5976, + "loss/crossentropy": 2.005814277380705, + "loss/hidden": 3.581640625, + "loss/jsd": 0.0, + "loss/logits": 0.2026340899989009, + "step": 21350 + }, + { + "epoch": 0.712, + "grad_norm": 31.625, + "grad_norm_var": 3.4900390625, + "learning_rate": 9.595476868082481e-05, + "loss": 7.8636, + "loss/crossentropy": 1.9804232444614172, + "loss/hidden": 3.69453125, + "loss/jsd": 0.0, + "loss/logits": 0.21351769701577722, + "step": 21360 + }, + { + "epoch": 0.7123333333333334, + "grad_norm": 31.0, + "grad_norm_var": 2.987434895833333, + "learning_rate": 9.589598570602181e-05, + "loss": 7.858, + "loss/crossentropy": 2.1446668222546577, + "loss/hidden": 3.575390625, + "loss/jsd": 0.0, + "loss/logits": 0.20793895637616516, + "step": 21370 + }, + { + "epoch": 0.7126666666666667, + "grad_norm": 30.25, + "grad_norm_var": 120.17545572916667, + "learning_rate": 9.583679910435026e-05, + "loss": 7.8126, + "loss/crossentropy": 2.034381502121687, + "loss/hidden": 3.66328125, + "loss/jsd": 0.0, + "loss/logits": 0.20835008025169371, + "step": 21380 + }, + { + "epoch": 0.713, + "grad_norm": 34.25, + "grad_norm_var": 112.87708333333333, + "learning_rate": 9.577720945995803e-05, + "loss": 7.8897, + "loss/crossentropy": 2.081572139263153, + "loss/hidden": 3.615625, + "loss/jsd": 0.0, + "loss/logits": 0.21697744037956, + "step": 21390 + }, + { + "epoch": 0.7133333333333334, + "grad_norm": 28.875, + "grad_norm_var": 119.82493489583334, + "learning_rate": 9.571721736097089e-05, + "loss": 7.6045, + "loss/crossentropy": 2.171599693596363, + "loss/hidden": 3.445703125, + "loss/jsd": 0.0, + "loss/logits": 0.19405515491962433, + "step": 21400 + }, + { + "epoch": 0.7136666666666667, + "grad_norm": 30.0, + "grad_norm_var": 20.370247395833335, + "learning_rate": 9.565682339948657e-05, + "loss": 7.8137, + "loss/crossentropy": 2.099457284808159, + "loss/hidden": 3.64609375, + "loss/jsd": 0.0, + "loss/logits": 0.20623879097402095, + "step": 21410 + }, + { + "epoch": 0.714, + "grad_norm": 29.0, + "grad_norm_var": 5.495768229166667, + "learning_rate": 9.559602817156913e-05, + "loss": 7.6481, + "loss/crossentropy": 2.1219607055187226, + "loss/hidden": 3.60234375, + "loss/jsd": 0.0, + "loss/logits": 0.20431581195443868, + "step": 21420 + }, + { + "epoch": 0.7143333333333334, + "grad_norm": 29.25, + "grad_norm_var": 2.7, + "learning_rate": 9.553483227724292e-05, + "loss": 7.7096, + "loss/crossentropy": 2.0621854946017266, + "loss/hidden": 3.661328125, + "loss/jsd": 0.0, + "loss/logits": 0.21732638962566853, + "step": 21430 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 30.0, + "grad_norm_var": 2.5171223958333333, + "learning_rate": 9.54732363204867e-05, + "loss": 7.5899, + "loss/crossentropy": 2.1101118355989454, + "loss/hidden": 3.526171875, + "loss/jsd": 0.0, + "loss/logits": 0.20486003924161195, + "step": 21440 + }, + { + "epoch": 0.715, + "grad_norm": 29.0, + "grad_norm_var": 4.364518229166666, + "learning_rate": 9.54112409092277e-05, + "loss": 7.7143, + "loss/crossentropy": 2.039649748057127, + "loss/hidden": 3.55, + "loss/jsd": 0.0, + "loss/logits": 0.20175143275409937, + "step": 21450 + }, + { + "epoch": 0.7153333333333334, + "grad_norm": 30.75, + "grad_norm_var": 3.331184895833333, + "learning_rate": 9.534884665533563e-05, + "loss": 7.6551, + "loss/crossentropy": 2.087866473197937, + "loss/hidden": 3.572265625, + "loss/jsd": 0.0, + "loss/logits": 0.20887105632573366, + "step": 21460 + }, + { + "epoch": 0.7156666666666667, + "grad_norm": 31.375, + "grad_norm_var": 3.495247395833333, + "learning_rate": 9.528605417461653e-05, + "loss": 7.6824, + "loss/crossentropy": 1.9937364026904105, + "loss/hidden": 3.440625, + "loss/jsd": 0.0, + "loss/logits": 0.19099010657519103, + "step": 21470 + }, + { + "epoch": 0.716, + "grad_norm": 28.625, + "grad_norm_var": 2.4607245902591734e+18, + "learning_rate": 9.522286408680687e-05, + "loss": 7.7331, + "loss/crossentropy": 2.0533931560814382, + "loss/hidden": 3.495703125, + "loss/jsd": 0.0, + "loss/logits": 0.19130754433572292, + "step": 21480 + }, + { + "epoch": 0.7163333333333334, + "grad_norm": 28.875, + "grad_norm_var": 6.831430466374188e+18, + "learning_rate": 9.51592770155673e-05, + "loss": 7.6974, + "loss/crossentropy": 2.0633395805954935, + "loss/hidden": 3.654296875, + "loss/jsd": 0.0, + "loss/logits": 0.220349186565727, + "step": 21490 + }, + { + "epoch": 0.7166666666666667, + "grad_norm": 54.25, + "grad_norm_var": 38.76640625, + "learning_rate": 9.509529358847655e-05, + "loss": 7.6228, + "loss/crossentropy": 1.8947391845285892, + "loss/hidden": 3.4703125, + "loss/jsd": 0.0, + "loss/logits": 0.19983619190752505, + "step": 21500 + }, + { + "epoch": 0.717, + "grad_norm": 31.75, + "grad_norm_var": 38.83326822916667, + "learning_rate": 9.503091443702522e-05, + "loss": 7.7001, + "loss/crossentropy": 2.0377241536974906, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.19533433131873607, + "step": 21510 + }, + { + "epoch": 0.7173333333333334, + "grad_norm": 29.625, + "grad_norm_var": 3.3018229166666666, + "learning_rate": 9.496614019660951e-05, + "loss": 7.803, + "loss/crossentropy": 2.1326826021075247, + "loss/hidden": 3.651171875, + "loss/jsd": 0.0, + "loss/logits": 0.2108999377116561, + "step": 21520 + }, + { + "epoch": 0.7176666666666667, + "grad_norm": 33.75, + "grad_norm_var": 6.46640625, + "learning_rate": 9.490097150652505e-05, + "loss": 7.7089, + "loss/crossentropy": 1.8233733780682087, + "loss/hidden": 3.501171875, + "loss/jsd": 0.0, + "loss/logits": 0.17455916814506053, + "step": 21530 + }, + { + "epoch": 0.718, + "grad_norm": 29.125, + "grad_norm_var": 5.584309895833333, + "learning_rate": 9.483540900996049e-05, + "loss": 7.7816, + "loss/crossentropy": 2.222658357024193, + "loss/hidden": 3.5171875, + "loss/jsd": 0.0, + "loss/logits": 0.20714823082089423, + "step": 21540 + }, + { + "epoch": 0.7183333333333334, + "grad_norm": 28.125, + "grad_norm_var": 7.22890625, + "learning_rate": 9.476945335399122e-05, + "loss": 7.76, + "loss/crossentropy": 2.0847948037087916, + "loss/hidden": 3.44765625, + "loss/jsd": 0.0, + "loss/logits": 0.19452479667961597, + "step": 21550 + }, + { + "epoch": 0.7186666666666667, + "grad_norm": 59.0, + "grad_norm_var": 2.899825550166275e+18, + "learning_rate": 9.47031051895729e-05, + "loss": 7.7464, + "loss/crossentropy": 2.0613483414053917, + "loss/hidden": 3.583984375, + "loss/jsd": 0.0, + "loss/logits": 0.2091040827333927, + "step": 21560 + }, + { + "epoch": 0.719, + "grad_norm": 157.0, + "grad_norm_var": 1006.6166015625, + "learning_rate": 9.463636517153517e-05, + "loss": 7.8657, + "loss/crossentropy": 2.310398209095001, + "loss/hidden": 3.470703125, + "loss/jsd": 0.0, + "loss/logits": 0.2095829950645566, + "step": 21570 + }, + { + "epoch": 0.7193333333333334, + "grad_norm": 27.25, + "grad_norm_var": 999.0166015625, + "learning_rate": 9.456923395857503e-05, + "loss": 7.7325, + "loss/crossentropy": 1.9690666690468788, + "loss/hidden": 3.622265625, + "loss/jsd": 0.0, + "loss/logits": 0.2094871997833252, + "step": 21580 + }, + { + "epoch": 0.7196666666666667, + "grad_norm": 31.125, + "grad_norm_var": 12.08515625, + "learning_rate": 9.450171221325049e-05, + "loss": 7.6418, + "loss/crossentropy": 1.9917885288596153, + "loss/hidden": 3.512109375, + "loss/jsd": 0.0, + "loss/logits": 0.1998442027717829, + "step": 21590 + }, + { + "epoch": 0.72, + "grad_norm": 28.25, + "grad_norm_var": 5.362434895833333, + "learning_rate": 9.443380060197387e-05, + "loss": 7.7914, + "loss/crossentropy": 2.0604188472032545, + "loss/hidden": 3.528515625, + "loss/jsd": 0.0, + "loss/logits": 0.20265566650778055, + "step": 21600 + }, + { + "epoch": 0.7203333333333334, + "grad_norm": 114.5, + "grad_norm_var": 447.34973958333336, + "learning_rate": 9.436549979500539e-05, + "loss": 7.693, + "loss/crossentropy": 2.003858245909214, + "loss/hidden": 3.709375, + "loss/jsd": 0.0, + "loss/logits": 0.22414386905729772, + "step": 21610 + }, + { + "epoch": 0.7206666666666667, + "grad_norm": 26.625, + "grad_norm_var": 453.5893229166667, + "learning_rate": 9.42968104664464e-05, + "loss": 7.6233, + "loss/crossentropy": 2.0694996997714044, + "loss/hidden": 3.617578125, + "loss/jsd": 0.0, + "loss/logits": 0.205183663405478, + "step": 21620 + }, + { + "epoch": 0.721, + "grad_norm": 33.25, + "grad_norm_var": 13.485416666666667, + "learning_rate": 9.422773329423292e-05, + "loss": 7.7632, + "loss/crossentropy": 2.0694395408034323, + "loss/hidden": 3.654296875, + "loss/jsd": 0.0, + "loss/logits": 0.20871220286935568, + "step": 21630 + }, + { + "epoch": 0.7213333333333334, + "grad_norm": 32.5, + "grad_norm_var": 3.9759765625, + "learning_rate": 9.415826896012865e-05, + "loss": 7.6489, + "loss/crossentropy": 1.975140118598938, + "loss/hidden": 3.442578125, + "loss/jsd": 0.0, + "loss/logits": 0.20482220761477948, + "step": 21640 + }, + { + "epoch": 0.7216666666666667, + "grad_norm": 29.375, + "grad_norm_var": 3.7978515625, + "learning_rate": 9.408841814971861e-05, + "loss": 7.6203, + "loss/crossentropy": 2.039801698923111, + "loss/hidden": 3.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19393778946250678, + "step": 21650 + }, + { + "epoch": 0.722, + "grad_norm": 31.125, + "grad_norm_var": 15.802018229166666, + "learning_rate": 9.401818155240205e-05, + "loss": 7.7741, + "loss/crossentropy": 2.119242195785046, + "loss/hidden": 3.559765625, + "loss/jsd": 0.0, + "loss/logits": 0.20300054959952832, + "step": 21660 + }, + { + "epoch": 0.7223333333333334, + "grad_norm": 28.125, + "grad_norm_var": 15.137239583333333, + "learning_rate": 9.394755986138586e-05, + "loss": 7.7166, + "loss/crossentropy": 1.8970364406704903, + "loss/hidden": 3.5625, + "loss/jsd": 0.0, + "loss/logits": 0.20859413947910072, + "step": 21670 + }, + { + "epoch": 0.7226666666666667, + "grad_norm": 27.125, + "grad_norm_var": 2.8337890625, + "learning_rate": 9.387655377367758e-05, + "loss": 7.6903, + "loss/crossentropy": 2.132764849066734, + "loss/hidden": 3.533203125, + "loss/jsd": 0.0, + "loss/logits": 0.2067256074398756, + "step": 21680 + }, + { + "epoch": 0.723, + "grad_norm": 28.5, + "grad_norm_var": 21.7197265625, + "learning_rate": 9.380516399007868e-05, + "loss": 7.6587, + "loss/crossentropy": 1.9144255444407463, + "loss/hidden": 3.6828125, + "loss/jsd": 0.0, + "loss/logits": 0.2001453947275877, + "step": 21690 + }, + { + "epoch": 0.7233333333333334, + "grad_norm": 30.0, + "grad_norm_var": 5.901822916666666, + "learning_rate": 9.373339121517747e-05, + "loss": 7.7691, + "loss/crossentropy": 2.2400635674595835, + "loss/hidden": 3.60234375, + "loss/jsd": 0.0, + "loss/logits": 0.2141938941553235, + "step": 21700 + }, + { + "epoch": 0.7236666666666667, + "grad_norm": 28.125, + "grad_norm_var": 3.468053159442855e+18, + "learning_rate": 9.366123615734227e-05, + "loss": 7.7074, + "loss/crossentropy": 2.0811144724488257, + "loss/hidden": 3.775, + "loss/jsd": 0.0, + "loss/logits": 0.1989194665104151, + "step": 21710 + }, + { + "epoch": 0.724, + "grad_norm": 28.125, + "grad_norm_var": 5.074739583333334, + "learning_rate": 9.358869952871436e-05, + "loss": 7.6993, + "loss/crossentropy": 2.1098939001560213, + "loss/hidden": 3.491015625, + "loss/jsd": 0.0, + "loss/logits": 0.20545907951891423, + "step": 21720 + }, + { + "epoch": 0.7243333333333334, + "grad_norm": 30.875, + "grad_norm_var": 5.959375, + "learning_rate": 9.351578204520099e-05, + "loss": 7.5884, + "loss/crossentropy": 1.8791275858879088, + "loss/hidden": 3.540625, + "loss/jsd": 0.0, + "loss/logits": 0.20307840630412102, + "step": 21730 + }, + { + "epoch": 0.7246666666666667, + "grad_norm": 27.25, + "grad_norm_var": 3.1426432291666666, + "learning_rate": 9.344248442646829e-05, + "loss": 7.6224, + "loss/crossentropy": 2.0195631198585033, + "loss/hidden": 3.648828125, + "loss/jsd": 0.0, + "loss/logits": 0.19278614791110157, + "step": 21740 + }, + { + "epoch": 0.725, + "grad_norm": 28.25, + "grad_norm_var": 2.1442057291666665, + "learning_rate": 9.336880739593416e-05, + "loss": 7.535, + "loss/crossentropy": 2.1503761291503904, + "loss/hidden": 3.5609375, + "loss/jsd": 0.0, + "loss/logits": 0.2019724454730749, + "step": 21750 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 25.125, + "grad_norm_var": 5.632291666666666, + "learning_rate": 9.329475168076114e-05, + "loss": 7.6548, + "loss/crossentropy": 2.0659672379493714, + "loss/hidden": 3.661328125, + "loss/jsd": 0.0, + "loss/logits": 0.21330699287354946, + "step": 21760 + }, + { + "epoch": 0.7256666666666667, + "grad_norm": 30.5, + "grad_norm_var": 9.28125, + "learning_rate": 9.322031801184925e-05, + "loss": 7.6069, + "loss/crossentropy": 2.1512291483581065, + "loss/hidden": 3.644140625, + "loss/jsd": 0.0, + "loss/logits": 0.21503518372774125, + "step": 21770 + }, + { + "epoch": 0.726, + "grad_norm": 29.875, + "grad_norm_var": 7.08125, + "learning_rate": 9.314550712382875e-05, + "loss": 7.6104, + "loss/crossentropy": 2.112006691843271, + "loss/hidden": 3.452734375, + "loss/jsd": 0.0, + "loss/logits": 0.19404661422595382, + "step": 21780 + }, + { + "epoch": 0.7263333333333334, + "grad_norm": 28.75, + "grad_norm_var": 3.4056640625, + "learning_rate": 9.307031975505291e-05, + "loss": 7.672, + "loss/crossentropy": 2.076837729662657, + "loss/hidden": 3.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1984027072787285, + "step": 21790 + }, + { + "epoch": 0.7266666666666667, + "grad_norm": 28.75, + "grad_norm_var": 1.8900390625, + "learning_rate": 9.299475664759069e-05, + "loss": 7.6271, + "loss/crossentropy": 2.0399362690746785, + "loss/hidden": 3.519921875, + "loss/jsd": 0.0, + "loss/logits": 0.19836630206555128, + "step": 21800 + }, + { + "epoch": 0.727, + "grad_norm": 33.0, + "grad_norm_var": 1.9166666666666667, + "learning_rate": 9.291881854721946e-05, + "loss": 7.6745, + "loss/crossentropy": 2.128536182641983, + "loss/hidden": 3.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.21894673127681016, + "step": 21810 + }, + { + "epoch": 0.7273333333333334, + "grad_norm": 29.125, + "grad_norm_var": 1.7624348958333333, + "learning_rate": 9.28425062034176e-05, + "loss": 7.6159, + "loss/crossentropy": 2.0389281518757345, + "loss/hidden": 3.598828125, + "loss/jsd": 0.0, + "loss/logits": 0.21084144692867995, + "step": 21820 + }, + { + "epoch": 0.7276666666666667, + "grad_norm": 29.125, + "grad_norm_var": 2.925, + "learning_rate": 9.276582036935717e-05, + "loss": 7.6311, + "loss/crossentropy": 2.02659250497818, + "loss/hidden": 3.496484375, + "loss/jsd": 0.0, + "loss/logits": 0.21077424064278602, + "step": 21830 + }, + { + "epoch": 0.728, + "grad_norm": 29.375, + "grad_norm_var": 3.093489583333333, + "learning_rate": 9.268876180189639e-05, + "loss": 7.627, + "loss/crossentropy": 2.014843727648258, + "loss/hidden": 3.561328125, + "loss/jsd": 0.0, + "loss/logits": 0.21272912081331014, + "step": 21840 + }, + { + "epoch": 0.7283333333333334, + "grad_norm": 28.125, + "grad_norm_var": 5.7712890625, + "learning_rate": 9.261133126157218e-05, + "loss": 7.7058, + "loss/crossentropy": 2.11625085696578, + "loss/hidden": 3.67109375, + "loss/jsd": 0.0, + "loss/logits": 0.2178550474345684, + "step": 21850 + }, + { + "epoch": 0.7286666666666667, + "grad_norm": 29.125, + "grad_norm_var": 1.3955729166666666, + "learning_rate": 9.253352951259271e-05, + "loss": 7.7112, + "loss/crossentropy": 2.1387905418872832, + "loss/hidden": 3.59140625, + "loss/jsd": 0.0, + "loss/logits": 0.2076106144115329, + "step": 21860 + }, + { + "epoch": 0.729, + "grad_norm": 29.75, + "grad_norm_var": 1.2997395833333334, + "learning_rate": 9.245535732282986e-05, + "loss": 7.6483, + "loss/crossentropy": 2.1955421969294546, + "loss/hidden": 3.541796875, + "loss/jsd": 0.0, + "loss/logits": 0.21497300919145346, + "step": 21870 + }, + { + "epoch": 0.7293333333333333, + "grad_norm": 28.375, + "grad_norm_var": 2.2643229166666665, + "learning_rate": 9.237681546381157e-05, + "loss": 7.6649, + "loss/crossentropy": 2.0390606805682183, + "loss/hidden": 3.533203125, + "loss/jsd": 0.0, + "loss/logits": 0.1902286982163787, + "step": 21880 + }, + { + "epoch": 0.7296666666666667, + "grad_norm": 30.625, + "grad_norm_var": 2.988997395833333, + "learning_rate": 9.229790471071429e-05, + "loss": 7.7516, + "loss/crossentropy": 2.15336195230484, + "loss/hidden": 3.5890625, + "loss/jsd": 0.0, + "loss/logits": 0.2047728981822729, + "step": 21890 + }, + { + "epoch": 0.73, + "grad_norm": 30.0, + "grad_norm_var": 1.6747395833333334, + "learning_rate": 9.221862584235528e-05, + "loss": 7.7046, + "loss/crossentropy": 2.0997920632362366, + "loss/hidden": 3.599609375, + "loss/jsd": 0.0, + "loss/logits": 0.20689391866326332, + "step": 21900 + }, + { + "epoch": 0.7303333333333333, + "grad_norm": 31.375, + "grad_norm_var": 2.26640625, + "learning_rate": 9.213897964118499e-05, + "loss": 7.5344, + "loss/crossentropy": 2.067221947014332, + "loss/hidden": 3.539453125, + "loss/jsd": 0.0, + "loss/logits": 0.19843466561287643, + "step": 21910 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 29.125, + "grad_norm_var": 6.4853515625, + "learning_rate": 9.205896689327923e-05, + "loss": 7.737, + "loss/crossentropy": 2.1428965769708155, + "loss/hidden": 3.585546875, + "loss/jsd": 0.0, + "loss/logits": 0.2188850357197225, + "step": 21920 + }, + { + "epoch": 0.731, + "grad_norm": 31.5, + "grad_norm_var": 7.158333333333333, + "learning_rate": 9.197858838833157e-05, + "loss": 7.6879, + "loss/crossentropy": 1.9083328664302825, + "loss/hidden": 3.662109375, + "loss/jsd": 0.0, + "loss/logits": 0.20349522549659013, + "step": 21930 + }, + { + "epoch": 0.7313333333333333, + "grad_norm": 31.125, + "grad_norm_var": 2.8212890625, + "learning_rate": 9.189784491964536e-05, + "loss": 7.6476, + "loss/crossentropy": 1.9690759629011154, + "loss/hidden": 3.66640625, + "loss/jsd": 0.0, + "loss/logits": 0.21341784493997693, + "step": 21940 + }, + { + "epoch": 0.7316666666666667, + "grad_norm": 26.875, + "grad_norm_var": 3.5497395833333334, + "learning_rate": 9.181673728412605e-05, + "loss": 7.6774, + "loss/crossentropy": 2.0544747814536093, + "loss/hidden": 3.682421875, + "loss/jsd": 0.0, + "loss/logits": 0.21329349987208843, + "step": 21950 + }, + { + "epoch": 0.732, + "grad_norm": 27.875, + "grad_norm_var": 30.809830729166666, + "learning_rate": 9.173526628227329e-05, + "loss": 7.6327, + "loss/crossentropy": 2.000888040661812, + "loss/hidden": 3.57109375, + "loss/jsd": 0.0, + "loss/logits": 0.20834028851240874, + "step": 21960 + }, + { + "epoch": 0.7323333333333333, + "grad_norm": 30.5, + "grad_norm_var": 3.89765625, + "learning_rate": 9.165343271817292e-05, + "loss": 7.7272, + "loss/crossentropy": 2.2294601082801817, + "loss/hidden": 3.483203125, + "loss/jsd": 0.0, + "loss/logits": 0.20684596356004475, + "step": 21970 + }, + { + "epoch": 0.7326666666666667, + "grad_norm": 35.75, + "grad_norm_var": 3.42265625, + "learning_rate": 9.157123739948924e-05, + "loss": 7.6675, + "loss/crossentropy": 2.032812249660492, + "loss/hidden": 3.5453125, + "loss/jsd": 0.0, + "loss/logits": 0.21877431515604256, + "step": 21980 + }, + { + "epoch": 0.733, + "grad_norm": 31.625, + "grad_norm_var": 3.84140625, + "learning_rate": 9.148868113745681e-05, + "loss": 7.7739, + "loss/crossentropy": 1.9926821939647197, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.20471897087991237, + "step": 21990 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 29.375, + "grad_norm_var": 4.79140625, + "learning_rate": 9.140576474687264e-05, + "loss": 7.6612, + "loss/crossentropy": 2.0667272046208383, + "loss/hidden": 3.5640625, + "loss/jsd": 0.0, + "loss/logits": 0.2066561786457896, + "step": 22000 + }, + { + "epoch": 0.7336666666666667, + "grad_norm": 29.375, + "grad_norm_var": 5.839518229166667, + "learning_rate": 9.132248904608801e-05, + "loss": 7.678, + "loss/crossentropy": 2.0169097036123276, + "loss/hidden": 3.441015625, + "loss/jsd": 0.0, + "loss/logits": 0.1938495047390461, + "step": 22010 + }, + { + "epoch": 0.734, + "grad_norm": 28.25, + "grad_norm_var": 1.7587890625, + "learning_rate": 9.123885485700049e-05, + "loss": 7.5806, + "loss/crossentropy": 2.1955192267894743, + "loss/hidden": 3.57734375, + "loss/jsd": 0.0, + "loss/logits": 0.21994279995560645, + "step": 22020 + }, + { + "epoch": 0.7343333333333333, + "grad_norm": 29.625, + "grad_norm_var": 908.9122395833333, + "learning_rate": 9.115486300504575e-05, + "loss": 7.7442, + "loss/crossentropy": 2.1037651874125003, + "loss/hidden": 3.608984375, + "loss/jsd": 0.0, + "loss/logits": 0.20334150791168212, + "step": 22030 + }, + { + "epoch": 0.7346666666666667, + "grad_norm": 27.875, + "grad_norm_var": 9.395572916666667, + "learning_rate": 9.107051431918944e-05, + "loss": 7.7365, + "loss/crossentropy": 2.210801270604134, + "loss/hidden": 3.435546875, + "loss/jsd": 0.0, + "loss/logits": 0.19594964981079102, + "step": 22040 + }, + { + "epoch": 0.735, + "grad_norm": 28.25, + "grad_norm_var": 3.0942057291666667, + "learning_rate": 9.098580963191908e-05, + "loss": 7.7258, + "loss/crossentropy": 2.0567986249923704, + "loss/hidden": 3.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19552275333553554, + "step": 22050 + }, + { + "epoch": 0.7353333333333333, + "grad_norm": 28.0, + "grad_norm_var": 1.80390625, + "learning_rate": 9.09007497792357e-05, + "loss": 7.6263, + "loss/crossentropy": 2.0325539082288744, + "loss/hidden": 3.584375, + "loss/jsd": 0.0, + "loss/logits": 0.19655003491789103, + "step": 22060 + }, + { + "epoch": 0.7356666666666667, + "grad_norm": 29.375, + "grad_norm_var": 1.70390625, + "learning_rate": 9.08153356006457e-05, + "loss": 7.6428, + "loss/crossentropy": 1.9758184522390365, + "loss/hidden": 3.4390625, + "loss/jsd": 0.0, + "loss/logits": 0.180863216239959, + "step": 22070 + }, + { + "epoch": 0.736, + "grad_norm": 28.75, + "grad_norm_var": 3.6546223958333335, + "learning_rate": 9.07295679391526e-05, + "loss": 7.6479, + "loss/crossentropy": 2.0911218881607057, + "loss/hidden": 3.5734375, + "loss/jsd": 0.0, + "loss/logits": 0.21416922919452192, + "step": 22080 + }, + { + "epoch": 0.7363333333333333, + "grad_norm": 29.375, + "grad_norm_var": 2.2997395833333334, + "learning_rate": 9.064344764124852e-05, + "loss": 7.5806, + "loss/crossentropy": 1.9327972888946534, + "loss/hidden": 3.623828125, + "loss/jsd": 0.0, + "loss/logits": 0.19096189700067043, + "step": 22090 + }, + { + "epoch": 0.7366666666666667, + "grad_norm": 26.625, + "grad_norm_var": 2.6122395833333334, + "learning_rate": 9.055697555690608e-05, + "loss": 7.6489, + "loss/crossentropy": 2.1428242295980455, + "loss/hidden": 3.5375, + "loss/jsd": 0.0, + "loss/logits": 0.21426594704389573, + "step": 22100 + }, + { + "epoch": 0.737, + "grad_norm": 28.125, + "grad_norm_var": 2.536393229166667, + "learning_rate": 9.047015253956981e-05, + "loss": 7.6099, + "loss/crossentropy": 2.2596158146858216, + "loss/hidden": 3.55703125, + "loss/jsd": 0.0, + "loss/logits": 0.21648634187877178, + "step": 22110 + }, + { + "epoch": 0.7373333333333333, + "grad_norm": 28.375, + "grad_norm_var": 2.1442057291666665, + "learning_rate": 9.038297944614785e-05, + "loss": 7.7341, + "loss/crossentropy": 2.044772403687239, + "loss/hidden": 3.441015625, + "loss/jsd": 0.0, + "loss/logits": 0.18932544207200408, + "step": 22120 + }, + { + "epoch": 0.7376666666666667, + "grad_norm": 32.5, + "grad_norm_var": 3.3999348958333333, + "learning_rate": 9.029545713700346e-05, + "loss": 7.6648, + "loss/crossentropy": 1.961163030564785, + "loss/hidden": 3.614453125, + "loss/jsd": 0.0, + "loss/logits": 0.1884706408716738, + "step": 22130 + }, + { + "epoch": 0.738, + "grad_norm": 28.0, + "grad_norm_var": 3.909375, + "learning_rate": 9.020758647594646e-05, + "loss": 7.5784, + "loss/crossentropy": 1.8796802014112473, + "loss/hidden": 3.5046875, + "loss/jsd": 0.0, + "loss/logits": 0.18157810363918542, + "step": 22140 + }, + { + "epoch": 0.7383333333333333, + "grad_norm": 28.0, + "grad_norm_var": 3.0444333217627853e+18, + "learning_rate": 9.011936833022484e-05, + "loss": 7.7373, + "loss/crossentropy": 2.1361525490880013, + "loss/hidden": 3.5359375, + "loss/jsd": 0.0, + "loss/logits": 0.20192783158272504, + "step": 22150 + }, + { + "epoch": 0.7386666666666667, + "grad_norm": 32.0, + "grad_norm_var": 5.5259765625, + "learning_rate": 9.003080357051607e-05, + "loss": 7.5862, + "loss/crossentropy": 2.1341183722019195, + "loss/hidden": 3.521484375, + "loss/jsd": 0.0, + "loss/logits": 0.21967477165162563, + "step": 22160 + }, + { + "epoch": 0.739, + "grad_norm": 28.5, + "grad_norm_var": 1.7645182291666666, + "learning_rate": 8.994189307091854e-05, + "loss": 7.6545, + "loss/crossentropy": 2.0633115977048875, + "loss/hidden": 3.61796875, + "loss/jsd": 0.0, + "loss/logits": 0.193378933891654, + "step": 22170 + }, + { + "epoch": 0.7393333333333333, + "grad_norm": 32.25, + "grad_norm_var": 1.4718098958333334, + "learning_rate": 8.985263770894302e-05, + "loss": 7.7163, + "loss/crossentropy": 2.150431227684021, + "loss/hidden": 3.693359375, + "loss/jsd": 0.0, + "loss/logits": 0.21285873763263224, + "step": 22180 + }, + { + "epoch": 0.7396666666666667, + "grad_norm": 28.625, + "grad_norm_var": 16.570768229166667, + "learning_rate": 8.97630383655039e-05, + "loss": 7.6727, + "loss/crossentropy": 1.9807396337389946, + "loss/hidden": 3.448046875, + "loss/jsd": 0.0, + "loss/logits": 0.19658283134922386, + "step": 22190 + }, + { + "epoch": 0.74, + "grad_norm": 27.25, + "grad_norm_var": 19.762239583333333, + "learning_rate": 8.967309592491052e-05, + "loss": 7.5625, + "loss/crossentropy": 2.1757678367197513, + "loss/hidden": 3.458984375, + "loss/jsd": 0.0, + "loss/logits": 0.2128155424259603, + "step": 22200 + }, + { + "epoch": 0.7403333333333333, + "grad_norm": 30.375, + "grad_norm_var": 1.4488932291666667, + "learning_rate": 8.958281127485845e-05, + "loss": 7.5472, + "loss/crossentropy": 2.061194130033255, + "loss/hidden": 3.479296875, + "loss/jsd": 0.0, + "loss/logits": 0.1889759385958314, + "step": 22210 + }, + { + "epoch": 0.7406666666666667, + "grad_norm": 27.625, + "grad_norm_var": 2.7952473958333335, + "learning_rate": 8.949218530642075e-05, + "loss": 7.6151, + "loss/crossentropy": 2.0728229813277723, + "loss/hidden": 3.4953125, + "loss/jsd": 0.0, + "loss/logits": 0.1952021485194564, + "step": 22220 + }, + { + "epoch": 0.741, + "grad_norm": 29.75, + "grad_norm_var": 32.40807291666667, + "learning_rate": 8.940121891403912e-05, + "loss": 7.5998, + "loss/crossentropy": 2.088392072916031, + "loss/hidden": 3.605078125, + "loss/jsd": 0.0, + "loss/logits": 0.1989578979089856, + "step": 22230 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 29.125, + "grad_norm_var": 16.971809895833335, + "learning_rate": 8.930991299551515e-05, + "loss": 7.678, + "loss/crossentropy": 2.089249915629625, + "loss/hidden": 3.569140625, + "loss/jsd": 0.0, + "loss/logits": 0.20038176514208317, + "step": 22240 + }, + { + "epoch": 0.7416666666666667, + "grad_norm": 29.75, + "grad_norm_var": 3.230208333333333, + "learning_rate": 8.921826845200139e-05, + "loss": 7.5722, + "loss/crossentropy": 2.1239826932549475, + "loss/hidden": 3.61953125, + "loss/jsd": 0.0, + "loss/logits": 0.2005884603597224, + "step": 22250 + }, + { + "epoch": 0.742, + "grad_norm": 30.625, + "grad_norm_var": 2.9854166666666666, + "learning_rate": 8.91262861879925e-05, + "loss": 7.6025, + "loss/crossentropy": 2.14085738658905, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.2137705808505416, + "step": 22260 + }, + { + "epoch": 0.7423333333333333, + "grad_norm": 27.25, + "grad_norm_var": 2.343489583333333, + "learning_rate": 8.903396711131624e-05, + "loss": 7.6046, + "loss/crossentropy": 1.8913337871432305, + "loss/hidden": 3.537890625, + "loss/jsd": 0.0, + "loss/logits": 0.1923683611676097, + "step": 22270 + }, + { + "epoch": 0.7426666666666667, + "grad_norm": 29.5, + "grad_norm_var": 1.8832682291666667, + "learning_rate": 8.894131213312467e-05, + "loss": 7.5532, + "loss/crossentropy": 1.9501267954707147, + "loss/hidden": 3.5484375, + "loss/jsd": 0.0, + "loss/logits": 0.19819272067397833, + "step": 22280 + }, + { + "epoch": 0.743, + "grad_norm": 28.875, + "grad_norm_var": 2.819205729166667, + "learning_rate": 8.884832216788501e-05, + "loss": 7.6744, + "loss/crossentropy": 2.220875917375088, + "loss/hidden": 3.469921875, + "loss/jsd": 0.0, + "loss/logits": 0.19938996117562055, + "step": 22290 + }, + { + "epoch": 0.7433333333333333, + "grad_norm": 30.0, + "grad_norm_var": 1.6025390625, + "learning_rate": 8.875499813337069e-05, + "loss": 7.5482, + "loss/crossentropy": 2.1203695118427275, + "loss/hidden": 3.4828125, + "loss/jsd": 0.0, + "loss/logits": 0.19239178942516447, + "step": 22300 + }, + { + "epoch": 0.7436666666666667, + "grad_norm": 26.625, + "grad_norm_var": 8.838541666666666, + "learning_rate": 8.866134095065222e-05, + "loss": 7.6747, + "loss/crossentropy": 2.07410399466753, + "loss/hidden": 3.4828125, + "loss/jsd": 0.0, + "loss/logits": 0.1968332275748253, + "step": 22310 + }, + { + "epoch": 0.744, + "grad_norm": 29.625, + "grad_norm_var": 8.917122395833333, + "learning_rate": 8.85673515440882e-05, + "loss": 7.5404, + "loss/crossentropy": 2.104242541640997, + "loss/hidden": 3.562890625, + "loss/jsd": 0.0, + "loss/logits": 0.20138515261933207, + "step": 22320 + }, + { + "epoch": 0.7443333333333333, + "grad_norm": 27.25, + "grad_norm_var": 2.3087890625, + "learning_rate": 8.847303084131613e-05, + "loss": 7.679, + "loss/crossentropy": 2.076655426621437, + "loss/hidden": 3.521484375, + "loss/jsd": 0.0, + "loss/logits": 0.20225820317864418, + "step": 22330 + }, + { + "epoch": 0.7446666666666667, + "grad_norm": 28.25, + "grad_norm_var": 1.4270182291666667, + "learning_rate": 8.837837977324328e-05, + "loss": 7.5549, + "loss/crossentropy": 1.97199331484735, + "loss/hidden": 3.46015625, + "loss/jsd": 0.0, + "loss/logits": 0.18509145381394773, + "step": 22340 + }, + { + "epoch": 0.745, + "grad_norm": 29.625, + "grad_norm_var": 0.9, + "learning_rate": 8.828339927403745e-05, + "loss": 7.5815, + "loss/crossentropy": 2.1016619965434074, + "loss/hidden": 3.515234375, + "loss/jsd": 0.0, + "loss/logits": 0.2104162724688649, + "step": 22350 + }, + { + "epoch": 0.7453333333333333, + "grad_norm": 27.25, + "grad_norm_var": 1.4723307291666667, + "learning_rate": 8.818809028111783e-05, + "loss": 7.7285, + "loss/crossentropy": 2.034029767662287, + "loss/hidden": 3.473828125, + "loss/jsd": 0.0, + "loss/logits": 0.18854013606905937, + "step": 22360 + }, + { + "epoch": 0.7456666666666667, + "grad_norm": 28.5, + "grad_norm_var": 11.351822916666666, + "learning_rate": 8.809245373514572e-05, + "loss": 7.5918, + "loss/crossentropy": 2.0507765501737594, + "loss/hidden": 3.602734375, + "loss/jsd": 0.0, + "loss/logits": 0.21991799995303155, + "step": 22370 + }, + { + "epoch": 0.746, + "grad_norm": 27.875, + "grad_norm_var": 10.278059895833334, + "learning_rate": 8.799649058001521e-05, + "loss": 7.6398, + "loss/crossentropy": 2.0420378386974334, + "loss/hidden": 3.62265625, + "loss/jsd": 0.0, + "loss/logits": 0.19834638610482216, + "step": 22380 + }, + { + "epoch": 0.7463333333333333, + "grad_norm": 29.875, + "grad_norm_var": 139.56139322916667, + "learning_rate": 8.79002017628439e-05, + "loss": 7.6281, + "loss/crossentropy": 2.114726561307907, + "loss/hidden": 3.511328125, + "loss/jsd": 0.0, + "loss/logits": 0.1945993335917592, + "step": 22390 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 27.25, + "grad_norm_var": 143.01764322916668, + "learning_rate": 8.780358823396352e-05, + "loss": 7.6033, + "loss/crossentropy": 2.1299582980573177, + "loss/hidden": 3.465625, + "loss/jsd": 0.0, + "loss/logits": 0.21268012626096605, + "step": 22400 + }, + { + "epoch": 0.747, + "grad_norm": 28.5, + "grad_norm_var": 24.453059895833334, + "learning_rate": 8.770665094691064e-05, + "loss": 7.6004, + "loss/crossentropy": 2.09407639503479, + "loss/hidden": 3.528125, + "loss/jsd": 0.0, + "loss/logits": 0.20005517825484276, + "step": 22410 + }, + { + "epoch": 0.7473333333333333, + "grad_norm": 68.5, + "grad_norm_var": 109.85807291666667, + "learning_rate": 8.76093908584171e-05, + "loss": 7.6905, + "loss/crossentropy": 2.253283692896366, + "loss/hidden": 3.551171875, + "loss/jsd": 0.0, + "loss/logits": 0.21519504617899657, + "step": 22420 + }, + { + "epoch": 0.7476666666666667, + "grad_norm": 29.875, + "grad_norm_var": 100.96920572916666, + "learning_rate": 8.751180892840074e-05, + "loss": 7.53, + "loss/crossentropy": 1.9626074001193046, + "loss/hidden": 3.508203125, + "loss/jsd": 0.0, + "loss/logits": 0.18422856479883193, + "step": 22430 + }, + { + "epoch": 0.748, + "grad_norm": 27.25, + "grad_norm_var": 3.2129557291666666, + "learning_rate": 8.741390611995581e-05, + "loss": 7.4322, + "loss/crossentropy": 1.8774556368589401, + "loss/hidden": 3.54375, + "loss/jsd": 0.0, + "loss/logits": 0.21066911737434565, + "step": 22440 + }, + { + "epoch": 0.7483333333333333, + "grad_norm": 30.0, + "grad_norm_var": 3.0580729166666667, + "learning_rate": 8.731568339934349e-05, + "loss": 7.611, + "loss/crossentropy": 2.0886680990457536, + "loss/hidden": 3.53984375, + "loss/jsd": 0.0, + "loss/logits": 0.20858072843402625, + "step": 22450 + }, + { + "epoch": 0.7486666666666667, + "grad_norm": 26.625, + "grad_norm_var": 3.0931640625, + "learning_rate": 8.72171417359824e-05, + "loss": 7.4627, + "loss/crossentropy": 2.0577007859945295, + "loss/hidden": 3.546484375, + "loss/jsd": 0.0, + "loss/logits": 0.19628361649811268, + "step": 22460 + }, + { + "epoch": 0.749, + "grad_norm": 30.625, + "grad_norm_var": 4.778059895833334, + "learning_rate": 8.711828210243896e-05, + "loss": 7.5627, + "loss/crossentropy": 2.1247499108314516, + "loss/hidden": 3.594140625, + "loss/jsd": 0.0, + "loss/logits": 0.2208824411034584, + "step": 22470 + }, + { + "epoch": 0.7493333333333333, + "grad_norm": 28.75, + "grad_norm_var": 3.2431640625, + "learning_rate": 8.701910547441786e-05, + "loss": 7.5314, + "loss/crossentropy": 2.1249613009393213, + "loss/hidden": 3.519921875, + "loss/jsd": 0.0, + "loss/logits": 0.18236859384924173, + "step": 22480 + }, + { + "epoch": 0.7496666666666667, + "grad_norm": 28.875, + "grad_norm_var": 0.9884765625, + "learning_rate": 8.691961283075233e-05, + "loss": 7.7002, + "loss/crossentropy": 2.0473913952708243, + "loss/hidden": 3.4546875, + "loss/jsd": 0.0, + "loss/logits": 0.19657958708703518, + "step": 22490 + }, + { + "epoch": 0.75, + "grad_norm": 27.875, + "grad_norm_var": 1.3979166666666667, + "learning_rate": 8.681980515339464e-05, + "loss": 7.545, + "loss/crossentropy": 2.0645889565348625, + "loss/hidden": 3.58828125, + "loss/jsd": 0.0, + "loss/logits": 0.21940149031579495, + "step": 22500 + }, + { + "epoch": 0.7503333333333333, + "grad_norm": 29.625, + "grad_norm_var": 0.8614583333333333, + "learning_rate": 8.671968342740627e-05, + "loss": 7.5906, + "loss/crossentropy": 2.134204125404358, + "loss/hidden": 3.38984375, + "loss/jsd": 0.0, + "loss/logits": 0.1961730806156993, + "step": 22510 + }, + { + "epoch": 0.7506666666666667, + "grad_norm": 27.125, + "grad_norm_var": 2.3247395833333333, + "learning_rate": 8.661924864094822e-05, + "loss": 7.6235, + "loss/crossentropy": 1.9457140512764455, + "loss/hidden": 3.56953125, + "loss/jsd": 0.0, + "loss/logits": 0.1988594863563776, + "step": 22520 + }, + { + "epoch": 0.751, + "grad_norm": 31.0, + "grad_norm_var": 6.930208333333334, + "learning_rate": 8.65185017852713e-05, + "loss": 7.5477, + "loss/crossentropy": 2.175088369846344, + "loss/hidden": 3.48125, + "loss/jsd": 0.0, + "loss/logits": 0.19464628268033266, + "step": 22530 + }, + { + "epoch": 0.7513333333333333, + "grad_norm": 27.875, + "grad_norm_var": 6.657291666666667, + "learning_rate": 8.641744385470628e-05, + "loss": 7.5943, + "loss/crossentropy": 1.9843165129423141, + "loss/hidden": 3.610546875, + "loss/jsd": 0.0, + "loss/logits": 0.20345317116007208, + "step": 22540 + }, + { + "epoch": 0.7516666666666667, + "grad_norm": 30.25, + "grad_norm_var": 1.2, + "learning_rate": 8.631607584665414e-05, + "loss": 7.5538, + "loss/crossentropy": 2.139354394376278, + "loss/hidden": 3.590234375, + "loss/jsd": 0.0, + "loss/logits": 0.2131027102470398, + "step": 22550 + }, + { + "epoch": 0.752, + "grad_norm": 30.875, + "grad_norm_var": 3.14140625, + "learning_rate": 8.621439876157622e-05, + "loss": 7.5358, + "loss/crossentropy": 1.9600604437291622, + "loss/hidden": 3.341015625, + "loss/jsd": 0.0, + "loss/logits": 0.19278565216809512, + "step": 22560 + }, + { + "epoch": 0.7523333333333333, + "grad_norm": 31.75, + "grad_norm_var": 3.1697265625, + "learning_rate": 8.611241360298429e-05, + "loss": 7.601, + "loss/crossentropy": 2.1606590077281, + "loss/hidden": 3.551171875, + "loss/jsd": 0.0, + "loss/logits": 0.20311268288642167, + "step": 22570 + }, + { + "epoch": 0.7526666666666667, + "grad_norm": 27.75, + "grad_norm_var": 1.6947916666666667, + "learning_rate": 8.601012137743069e-05, + "loss": 7.4441, + "loss/crossentropy": 1.9806740552186965, + "loss/hidden": 3.47734375, + "loss/jsd": 0.0, + "loss/logits": 0.1898936064913869, + "step": 22580 + }, + { + "epoch": 0.753, + "grad_norm": 26.75, + "grad_norm_var": 1.121875, + "learning_rate": 8.590752309449837e-05, + "loss": 7.5579, + "loss/crossentropy": 2.1649864450097085, + "loss/hidden": 3.501953125, + "loss/jsd": 0.0, + "loss/logits": 0.1936817906796932, + "step": 22590 + }, + { + "epoch": 0.7533333333333333, + "grad_norm": 28.875, + "grad_norm_var": 1.2957682291666666, + "learning_rate": 8.5804619766791e-05, + "loss": 7.4623, + "loss/crossentropy": 1.9912237107753754, + "loss/hidden": 3.369921875, + "loss/jsd": 0.0, + "loss/logits": 0.17677724361419678, + "step": 22600 + }, + { + "epoch": 0.7536666666666667, + "grad_norm": 34.5, + "grad_norm_var": 4.583072916666667, + "learning_rate": 8.570141240992285e-05, + "loss": 7.5052, + "loss/crossentropy": 1.9258248887956142, + "loss/hidden": 3.41640625, + "loss/jsd": 0.0, + "loss/logits": 0.1718863126821816, + "step": 22610 + }, + { + "epoch": 0.754, + "grad_norm": 27.25, + "grad_norm_var": 4.39765625, + "learning_rate": 8.559790204250887e-05, + "loss": 7.6286, + "loss/crossentropy": 1.920270534604788, + "loss/hidden": 3.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.21592878960072995, + "step": 22620 + }, + { + "epoch": 0.7543333333333333, + "grad_norm": 28.375, + "grad_norm_var": 3.350455729166667, + "learning_rate": 8.549408968615461e-05, + "loss": 7.4976, + "loss/crossentropy": 1.9794688627123833, + "loss/hidden": 3.555859375, + "loss/jsd": 0.0, + "loss/logits": 0.20622843131422997, + "step": 22630 + }, + { + "epoch": 0.7546666666666667, + "grad_norm": 27.75, + "grad_norm_var": 3.265625, + "learning_rate": 8.53899763654461e-05, + "loss": 7.5832, + "loss/crossentropy": 2.1832057766616346, + "loss/hidden": 3.578515625, + "loss/jsd": 0.0, + "loss/logits": 0.20718610547482968, + "step": 22640 + }, + { + "epoch": 0.755, + "grad_norm": 30.625, + "grad_norm_var": 7.5572265625, + "learning_rate": 8.52855631079398e-05, + "loss": 7.5354, + "loss/crossentropy": 2.0273820132017137, + "loss/hidden": 3.46796875, + "loss/jsd": 0.0, + "loss/logits": 0.1982576385140419, + "step": 22650 + }, + { + "epoch": 0.7553333333333333, + "grad_norm": 28.0, + "grad_norm_var": 7.424739583333333, + "learning_rate": 8.51808509441524e-05, + "loss": 7.6259, + "loss/crossentropy": 2.224448761343956, + "loss/hidden": 3.496875, + "loss/jsd": 0.0, + "loss/logits": 0.21609959285706282, + "step": 22660 + }, + { + "epoch": 0.7556666666666667, + "grad_norm": 27.0, + "grad_norm_var": 1.3926432291666666, + "learning_rate": 8.507584090755069e-05, + "loss": 7.541, + "loss/crossentropy": 2.052679204940796, + "loss/hidden": 3.462109375, + "loss/jsd": 0.0, + "loss/logits": 0.2042101456783712, + "step": 22670 + }, + { + "epoch": 0.756, + "grad_norm": 29.25, + "grad_norm_var": 1.3041015625, + "learning_rate": 8.497053403454133e-05, + "loss": 7.4877, + "loss/crossentropy": 2.0474780216813087, + "loss/hidden": 3.46171875, + "loss/jsd": 0.0, + "loss/logits": 0.19255555672571062, + "step": 22680 + }, + { + "epoch": 0.7563333333333333, + "grad_norm": 27.375, + "grad_norm_var": 1.6796223958333334, + "learning_rate": 8.486493136446064e-05, + "loss": 7.6495, + "loss/crossentropy": 2.1654643058776855, + "loss/hidden": 3.448828125, + "loss/jsd": 0.0, + "loss/logits": 0.19368699844926596, + "step": 22690 + }, + { + "epoch": 0.7566666666666667, + "grad_norm": 35.75, + "grad_norm_var": 8.53515625, + "learning_rate": 8.475903393956434e-05, + "loss": 7.699, + "loss/crossentropy": 2.2046103149652483, + "loss/hidden": 3.51484375, + "loss/jsd": 0.0, + "loss/logits": 0.20576203987002373, + "step": 22700 + }, + { + "epoch": 0.757, + "grad_norm": 28.375, + "grad_norm_var": 4.378059895833333, + "learning_rate": 8.465284280501728e-05, + "loss": 7.673, + "loss/crossentropy": 2.0582919239997866, + "loss/hidden": 3.466796875, + "loss/jsd": 0.0, + "loss/logits": 0.19977389723062516, + "step": 22710 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 27.25, + "grad_norm_var": 1.103125, + "learning_rate": 8.454635900888305e-05, + "loss": 7.503, + "loss/crossentropy": 2.1073424354195596, + "loss/hidden": 3.462109375, + "loss/jsd": 0.0, + "loss/logits": 0.2041914898902178, + "step": 22720 + }, + { + "epoch": 0.7576666666666667, + "grad_norm": 28.5, + "grad_norm_var": 3.9934895833333335, + "learning_rate": 8.443958360211376e-05, + "loss": 7.5862, + "loss/crossentropy": 1.996552325040102, + "loss/hidden": 3.50234375, + "loss/jsd": 0.0, + "loss/logits": 0.19313392527401446, + "step": 22730 + }, + { + "epoch": 0.758, + "grad_norm": 25.375, + "grad_norm_var": 12.517643229166667, + "learning_rate": 8.433251763853955e-05, + "loss": 7.5433, + "loss/crossentropy": 2.0218321952968834, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.17924583591520787, + "step": 22740 + }, + { + "epoch": 0.7583333333333333, + "grad_norm": 30.0, + "grad_norm_var": 13.1822265625, + "learning_rate": 8.422516217485826e-05, + "loss": 7.4756, + "loss/crossentropy": 1.9910611510276794, + "loss/hidden": 3.531640625, + "loss/jsd": 0.0, + "loss/logits": 0.19828474670648574, + "step": 22750 + }, + { + "epoch": 0.7586666666666667, + "grad_norm": 30.5, + "grad_norm_var": 4.151497395833333, + "learning_rate": 8.4117518270625e-05, + "loss": 7.457, + "loss/crossentropy": 2.0528080210089685, + "loss/hidden": 3.537890625, + "loss/jsd": 0.0, + "loss/logits": 0.19192924145609142, + "step": 22760 + }, + { + "epoch": 0.759, + "grad_norm": 27.5, + "grad_norm_var": 3.5759765625, + "learning_rate": 8.400958698824161e-05, + "loss": 7.5715, + "loss/crossentropy": 1.9694609761238098, + "loss/hidden": 3.496875, + "loss/jsd": 0.0, + "loss/logits": 0.20183660499751568, + "step": 22770 + }, + { + "epoch": 0.7593333333333333, + "grad_norm": 27.875, + "grad_norm_var": 3.09375, + "learning_rate": 8.390136939294631e-05, + "loss": 7.526, + "loss/crossentropy": 2.0170742586255073, + "loss/hidden": 3.530078125, + "loss/jsd": 0.0, + "loss/logits": 0.21597461104393006, + "step": 22780 + }, + { + "epoch": 0.7596666666666667, + "grad_norm": 28.125, + "grad_norm_var": 5.36640625, + "learning_rate": 8.379286655280302e-05, + "loss": 7.6393, + "loss/crossentropy": 2.0312787666916847, + "loss/hidden": 3.508203125, + "loss/jsd": 0.0, + "loss/logits": 0.19151955414563418, + "step": 22790 + }, + { + "epoch": 0.76, + "grad_norm": 29.625, + "grad_norm_var": 3.6395833333333334, + "learning_rate": 8.368407953869104e-05, + "loss": 7.4895, + "loss/crossentropy": 2.09163758456707, + "loss/hidden": 3.4484375, + "loss/jsd": 0.0, + "loss/logits": 0.19382547289133073, + "step": 22800 + }, + { + "epoch": 0.7603333333333333, + "grad_norm": 30.5, + "grad_norm_var": 2.655143229166667, + "learning_rate": 8.357500942429424e-05, + "loss": 7.5593, + "loss/crossentropy": 2.095877369493246, + "loss/hidden": 3.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.20801592376083136, + "step": 22810 + }, + { + "epoch": 0.7606666666666667, + "grad_norm": 29.5, + "grad_norm_var": 1.4619140625, + "learning_rate": 8.34656572860906e-05, + "loss": 7.7253, + "loss/crossentropy": 2.164764193445444, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.20626930426806211, + "step": 22820 + }, + { + "epoch": 0.761, + "grad_norm": 28.5, + "grad_norm_var": 0.8291666666666667, + "learning_rate": 8.335602420334162e-05, + "loss": 7.5984, + "loss/crossentropy": 2.2208146676421165, + "loss/hidden": 3.469921875, + "loss/jsd": 0.0, + "loss/logits": 0.1930639874190092, + "step": 22830 + }, + { + "epoch": 0.7613333333333333, + "grad_norm": 27.375, + "grad_norm_var": 1.4035807291666667, + "learning_rate": 8.324611125808153e-05, + "loss": 7.323, + "loss/crossentropy": 2.047605223953724, + "loss/hidden": 3.500390625, + "loss/jsd": 0.0, + "loss/logits": 0.1907837161794305, + "step": 22840 + }, + { + "epoch": 0.7616666666666667, + "grad_norm": 26.0, + "grad_norm_var": 1.9572916666666667, + "learning_rate": 8.313591953510675e-05, + "loss": 7.4896, + "loss/crossentropy": 1.9626680858433248, + "loss/hidden": 3.481640625, + "loss/jsd": 0.0, + "loss/logits": 0.18827831279486418, + "step": 22850 + }, + { + "epoch": 0.762, + "grad_norm": 30.75, + "grad_norm_var": 1.478125, + "learning_rate": 8.302545012196506e-05, + "loss": 7.4469, + "loss/crossentropy": 2.024528782069683, + "loss/hidden": 3.434765625, + "loss/jsd": 0.0, + "loss/logits": 0.1860325404442847, + "step": 22860 + }, + { + "epoch": 0.7623333333333333, + "grad_norm": 27.5, + "grad_norm_var": 1.5614583333333334, + "learning_rate": 8.291470410894503e-05, + "loss": 7.4275, + "loss/crossentropy": 2.144756194204092, + "loss/hidden": 3.383984375, + "loss/jsd": 0.0, + "loss/logits": 0.18779438687488437, + "step": 22870 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 27.25, + "grad_norm_var": 0.7822916666666667, + "learning_rate": 8.280368258906505e-05, + "loss": 7.4683, + "loss/crossentropy": 1.9406724080443383, + "loss/hidden": 3.487890625, + "loss/jsd": 0.0, + "loss/logits": 0.1975240783765912, + "step": 22880 + }, + { + "epoch": 0.763, + "grad_norm": 25.875, + "grad_norm_var": 1.4681640625, + "learning_rate": 8.269238665806273e-05, + "loss": 7.5113, + "loss/crossentropy": 2.1385065048933027, + "loss/hidden": 3.37265625, + "loss/jsd": 0.0, + "loss/logits": 0.19636416397988796, + "step": 22890 + }, + { + "epoch": 0.7633333333333333, + "grad_norm": 27.625, + "grad_norm_var": 8.433268229166666, + "learning_rate": 8.258081741438395e-05, + "loss": 7.4728, + "loss/crossentropy": 2.0857705280184744, + "loss/hidden": 3.497265625, + "loss/jsd": 0.0, + "loss/logits": 0.2008900310844183, + "step": 22900 + }, + { + "epoch": 0.7636666666666667, + "grad_norm": 30.625, + "grad_norm_var": 1.9275390625, + "learning_rate": 8.246897595917212e-05, + "loss": 7.5127, + "loss/crossentropy": 2.101105071604252, + "loss/hidden": 3.49140625, + "loss/jsd": 0.0, + "loss/logits": 0.2026148896664381, + "step": 22910 + }, + { + "epoch": 0.764, + "grad_norm": 28.75, + "grad_norm_var": 1.1254557291666667, + "learning_rate": 8.235686339625725e-05, + "loss": 7.4431, + "loss/crossentropy": 2.023197513818741, + "loss/hidden": 3.453515625, + "loss/jsd": 0.0, + "loss/logits": 0.20429725479334593, + "step": 22920 + }, + { + "epoch": 0.7643333333333333, + "grad_norm": 29.625, + "grad_norm_var": 1.9676432291666666, + "learning_rate": 8.224448083214506e-05, + "loss": 7.4991, + "loss/crossentropy": 2.096772846579552, + "loss/hidden": 3.530078125, + "loss/jsd": 0.0, + "loss/logits": 0.19150063805282116, + "step": 22930 + }, + { + "epoch": 0.7646666666666667, + "grad_norm": 27.25, + "grad_norm_var": 1.6228515625, + "learning_rate": 8.213182937600612e-05, + "loss": 7.4012, + "loss/crossentropy": 1.9597499519586563, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.17938691582530736, + "step": 22940 + }, + { + "epoch": 0.765, + "grad_norm": 30.125, + "grad_norm_var": 2.763541666666667, + "learning_rate": 8.201891013966478e-05, + "loss": 7.4707, + "loss/crossentropy": 1.9817360505461692, + "loss/hidden": 3.523828125, + "loss/jsd": 0.0, + "loss/logits": 0.19509880822151898, + "step": 22950 + }, + { + "epoch": 0.7653333333333333, + "grad_norm": 35.25, + "grad_norm_var": 9.289518229166667, + "learning_rate": 8.190572423758835e-05, + "loss": 7.5923, + "loss/crossentropy": 2.0987179767340423, + "loss/hidden": 3.45625, + "loss/jsd": 0.0, + "loss/logits": 0.18913752851076424, + "step": 22960 + }, + { + "epoch": 0.7656666666666667, + "grad_norm": 27.0, + "grad_norm_var": 8.620247395833333, + "learning_rate": 8.179227278687598e-05, + "loss": 7.5594, + "loss/crossentropy": 2.007404398918152, + "loss/hidden": 3.62578125, + "loss/jsd": 0.0, + "loss/logits": 0.22985132094472646, + "step": 22970 + }, + { + "epoch": 0.766, + "grad_norm": 32.0, + "grad_norm_var": 2.2978515625, + "learning_rate": 8.167855690724767e-05, + "loss": 7.4219, + "loss/crossentropy": 1.8726444259285926, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.1758563483133912, + "step": 22980 + }, + { + "epoch": 0.7663333333333333, + "grad_norm": 26.625, + "grad_norm_var": 3.0171223958333333, + "learning_rate": 8.156457772103326e-05, + "loss": 7.3819, + "loss/crossentropy": 1.9945810578763485, + "loss/hidden": 3.42265625, + "loss/jsd": 0.0, + "loss/logits": 0.17547337915748357, + "step": 22990 + }, + { + "epoch": 0.7666666666666667, + "grad_norm": 28.0, + "grad_norm_var": 5.90390625, + "learning_rate": 8.14503363531613e-05, + "loss": 7.4977, + "loss/crossentropy": 1.9299149721860887, + "loss/hidden": 3.561328125, + "loss/jsd": 0.0, + "loss/logits": 0.19388887714594602, + "step": 23000 + }, + { + "epoch": 0.767, + "grad_norm": 27.625, + "grad_norm_var": 1.2747395833333333, + "learning_rate": 8.133583393114797e-05, + "loss": 7.5107, + "loss/crossentropy": 2.1192862302064897, + "loss/hidden": 3.494140625, + "loss/jsd": 0.0, + "loss/logits": 0.19927413761615753, + "step": 23010 + }, + { + "epoch": 0.7673333333333333, + "grad_norm": 28.25, + "grad_norm_var": 2.528580729166667, + "learning_rate": 8.122107158508592e-05, + "loss": 7.5196, + "loss/crossentropy": 2.044304075837135, + "loss/hidden": 3.53359375, + "loss/jsd": 0.0, + "loss/logits": 0.20039083026349544, + "step": 23020 + }, + { + "epoch": 0.7676666666666667, + "grad_norm": 27.25, + "grad_norm_var": 3.283333333333333, + "learning_rate": 8.110605044763323e-05, + "loss": 7.5047, + "loss/crossentropy": 2.1309088692069054, + "loss/hidden": 3.490234375, + "loss/jsd": 0.0, + "loss/logits": 0.19504800960421562, + "step": 23030 + }, + { + "epoch": 0.768, + "grad_norm": 28.125, + "grad_norm_var": 5.66640625, + "learning_rate": 8.099077165400204e-05, + "loss": 7.533, + "loss/crossentropy": 1.9686566561460495, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.18667809749022127, + "step": 23040 + }, + { + "epoch": 0.7683333333333333, + "grad_norm": 29.75, + "grad_norm_var": 5.320768229166666, + "learning_rate": 8.087523634194755e-05, + "loss": 7.4886, + "loss/crossentropy": 2.005880794674158, + "loss/hidden": 3.44296875, + "loss/jsd": 0.0, + "loss/logits": 0.19481752207502723, + "step": 23050 + }, + { + "epoch": 0.7686666666666667, + "grad_norm": 26.375, + "grad_norm_var": 1.2410807291666666, + "learning_rate": 8.075944565175659e-05, + "loss": 7.5662, + "loss/crossentropy": 2.192203278839588, + "loss/hidden": 3.458984375, + "loss/jsd": 0.0, + "loss/logits": 0.21429934445768595, + "step": 23060 + }, + { + "epoch": 0.769, + "grad_norm": 26.25, + "grad_norm_var": 1.6624348958333333, + "learning_rate": 8.064340072623657e-05, + "loss": 7.3836, + "loss/crossentropy": 2.021069821715355, + "loss/hidden": 3.543359375, + "loss/jsd": 0.0, + "loss/logits": 0.21521367449313403, + "step": 23070 + }, + { + "epoch": 0.7693333333333333, + "grad_norm": 28.625, + "grad_norm_var": 39.1353515625, + "learning_rate": 8.052710271070405e-05, + "loss": 7.4727, + "loss/crossentropy": 1.9759119272232055, + "loss/hidden": 3.487109375, + "loss/jsd": 0.0, + "loss/logits": 0.18273938745260238, + "step": 23080 + }, + { + "epoch": 0.7696666666666667, + "grad_norm": 26.125, + "grad_norm_var": 113.128125, + "learning_rate": 8.041055275297348e-05, + "loss": 7.467, + "loss/crossentropy": 1.9765710644423962, + "loss/hidden": 3.504296875, + "loss/jsd": 0.0, + "loss/logits": 0.19071924965828657, + "step": 23090 + }, + { + "epoch": 0.77, + "grad_norm": 25.25, + "grad_norm_var": 111.61399739583334, + "learning_rate": 8.029375200334588e-05, + "loss": 7.3353, + "loss/crossentropy": 2.1736841291189193, + "loss/hidden": 3.3328125, + "loss/jsd": 0.0, + "loss/logits": 0.17423349283635617, + "step": 23100 + }, + { + "epoch": 0.7703333333333333, + "grad_norm": 26.0, + "grad_norm_var": 1.8122395833333333, + "learning_rate": 8.017670161459752e-05, + "loss": 7.3313, + "loss/crossentropy": 2.2096520021557806, + "loss/hidden": 3.362890625, + "loss/jsd": 0.0, + "loss/logits": 0.19355475530028343, + "step": 23110 + }, + { + "epoch": 0.7706666666666667, + "grad_norm": 28.25, + "grad_norm_var": 2.364518229166667, + "learning_rate": 8.005940274196846e-05, + "loss": 7.4944, + "loss/crossentropy": 2.2522154793143274, + "loss/hidden": 3.3484375, + "loss/jsd": 0.0, + "loss/logits": 0.19156052991747857, + "step": 23120 + }, + { + "epoch": 0.771, + "grad_norm": 27.25, + "grad_norm_var": 3.044205729166667, + "learning_rate": 7.994185654315124e-05, + "loss": 7.3322, + "loss/crossentropy": 2.017530345916748, + "loss/hidden": 3.308984375, + "loss/jsd": 0.0, + "loss/logits": 0.17398178149014712, + "step": 23130 + }, + { + "epoch": 0.7713333333333333, + "grad_norm": 25.25, + "grad_norm_var": 6.49140625, + "learning_rate": 7.982406417827936e-05, + "loss": 7.4188, + "loss/crossentropy": 2.13538076877594, + "loss/hidden": 3.479296875, + "loss/jsd": 0.0, + "loss/logits": 0.19466998036950828, + "step": 23140 + }, + { + "epoch": 0.7716666666666666, + "grad_norm": 25.625, + "grad_norm_var": 7.2697265625, + "learning_rate": 7.970602680991594e-05, + "loss": 7.4274, + "loss/crossentropy": 1.9886194601655007, + "loss/hidden": 3.508984375, + "loss/jsd": 0.0, + "loss/logits": 0.187652344442904, + "step": 23150 + }, + { + "epoch": 0.772, + "grad_norm": 28.625, + "grad_norm_var": 2.081184895833333, + "learning_rate": 7.958774560304213e-05, + "loss": 7.4564, + "loss/crossentropy": 2.0018317684531213, + "loss/hidden": 3.51796875, + "loss/jsd": 0.0, + "loss/logits": 0.18953299205750226, + "step": 23160 + }, + { + "epoch": 0.7723333333333333, + "grad_norm": 29.875, + "grad_norm_var": 3.2228515625, + "learning_rate": 7.946922172504567e-05, + "loss": 7.6272, + "loss/crossentropy": 2.1628062181174754, + "loss/hidden": 3.5109375, + "loss/jsd": 0.0, + "loss/logits": 0.20114805568009614, + "step": 23170 + }, + { + "epoch": 0.7726666666666666, + "grad_norm": 29.25, + "grad_norm_var": 2.3030598958333335, + "learning_rate": 7.935045634570941e-05, + "loss": 7.454, + "loss/crossentropy": 1.9677003532648087, + "loss/hidden": 3.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.20842795697972177, + "step": 23180 + }, + { + "epoch": 0.773, + "grad_norm": 27.0, + "grad_norm_var": 2.3705729166666667, + "learning_rate": 7.923145063719972e-05, + "loss": 7.3973, + "loss/crossentropy": 1.9138947121798993, + "loss/hidden": 3.378125, + "loss/jsd": 0.0, + "loss/logits": 0.18487755134701728, + "step": 23190 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 27.5, + "grad_norm_var": 1.2983723958333333, + "learning_rate": 7.911220577405484e-05, + "loss": 7.4438, + "loss/crossentropy": 2.0332114972174167, + "loss/hidden": 3.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.19924609856680037, + "step": 23200 + }, + { + "epoch": 0.7736666666666666, + "grad_norm": 24.375, + "grad_norm_var": 7.162434895833333, + "learning_rate": 7.89927229331735e-05, + "loss": 7.4012, + "loss/crossentropy": 2.2687218472361566, + "loss/hidden": 3.284765625, + "loss/jsd": 0.0, + "loss/logits": 0.18496205024421214, + "step": 23210 + }, + { + "epoch": 0.774, + "grad_norm": 26.625, + "grad_norm_var": 2.1125, + "learning_rate": 7.887300329380304e-05, + "loss": 7.4174, + "loss/crossentropy": 2.0132935985922815, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.18549591191112996, + "step": 23220 + }, + { + "epoch": 0.7743333333333333, + "grad_norm": 26.125, + "grad_norm_var": 2.008072916666667, + "learning_rate": 7.8753048037528e-05, + "loss": 7.3578, + "loss/crossentropy": 2.038771292567253, + "loss/hidden": 3.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1932824071496725, + "step": 23230 + }, + { + "epoch": 0.7746666666666666, + "grad_norm": 32.5, + "grad_norm_var": 4.643489583333333, + "learning_rate": 7.863285834825832e-05, + "loss": 7.399, + "loss/crossentropy": 2.0289094880223275, + "loss/hidden": 3.521484375, + "loss/jsd": 0.0, + "loss/logits": 0.19404235538095235, + "step": 23240 + }, + { + "epoch": 0.775, + "grad_norm": 29.125, + "grad_norm_var": 8.295572916666666, + "learning_rate": 7.85124354122177e-05, + "loss": 7.4128, + "loss/crossentropy": 2.110806605219841, + "loss/hidden": 3.377734375, + "loss/jsd": 0.0, + "loss/logits": 0.1978354908525944, + "step": 23250 + }, + { + "epoch": 0.7753333333333333, + "grad_norm": 27.125, + "grad_norm_var": 2.4707682291666666, + "learning_rate": 7.839178041793193e-05, + "loss": 7.4052, + "loss/crossentropy": 2.0687429390847685, + "loss/hidden": 3.491796875, + "loss/jsd": 0.0, + "loss/logits": 0.2014083441346884, + "step": 23260 + }, + { + "epoch": 0.7756666666666666, + "grad_norm": 27.375, + "grad_norm_var": 1.3455729166666666, + "learning_rate": 7.827089455621707e-05, + "loss": 7.4279, + "loss/crossentropy": 2.0050750449299812, + "loss/hidden": 3.269921875, + "loss/jsd": 0.0, + "loss/logits": 0.1765454810112715, + "step": 23270 + }, + { + "epoch": 0.776, + "grad_norm": 28.0, + "grad_norm_var": 1.1393229166666667, + "learning_rate": 7.814977902016779e-05, + "loss": 7.4339, + "loss/crossentropy": 2.0278828397393225, + "loss/hidden": 3.414453125, + "loss/jsd": 0.0, + "loss/logits": 0.1700163958594203, + "step": 23280 + }, + { + "epoch": 0.7763333333333333, + "grad_norm": 26.5, + "grad_norm_var": 1.82890625, + "learning_rate": 7.802843500514553e-05, + "loss": 7.4093, + "loss/crossentropy": 1.9275280833244324, + "loss/hidden": 3.5328125, + "loss/jsd": 0.0, + "loss/logits": 0.19342173589393497, + "step": 23290 + }, + { + "epoch": 0.7766666666666666, + "grad_norm": 27.625, + "grad_norm_var": 1.4684895833333333, + "learning_rate": 7.790686370876671e-05, + "loss": 7.582, + "loss/crossentropy": 2.128317493200302, + "loss/hidden": 3.448046875, + "loss/jsd": 0.0, + "loss/logits": 0.1935270931571722, + "step": 23300 + }, + { + "epoch": 0.777, + "grad_norm": 26.5, + "grad_norm_var": 1.24140625, + "learning_rate": 7.778506633089096e-05, + "loss": 7.3099, + "loss/crossentropy": 2.1446583211421966, + "loss/hidden": 3.35546875, + "loss/jsd": 0.0, + "loss/logits": 0.1842292295768857, + "step": 23310 + }, + { + "epoch": 0.7773333333333333, + "grad_norm": 29.875, + "grad_norm_var": 1.647261914673709e+18, + "learning_rate": 7.766304407360924e-05, + "loss": 7.5152, + "loss/crossentropy": 2.117107591032982, + "loss/hidden": 3.361328125, + "loss/jsd": 0.0, + "loss/logits": 0.18693095725029707, + "step": 23320 + }, + { + "epoch": 0.7776666666666666, + "grad_norm": 27.25, + "grad_norm_var": 1.647261914636275e+18, + "learning_rate": 7.754079814123195e-05, + "loss": 7.3871, + "loss/crossentropy": 1.9594203799962997, + "loss/hidden": 3.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.18217467218637468, + "step": 23330 + }, + { + "epoch": 0.778, + "grad_norm": 25.625, + "grad_norm_var": 1.3916015625, + "learning_rate": 7.741832974027709e-05, + "loss": 7.3621, + "loss/crossentropy": 1.9576537497341633, + "loss/hidden": 3.344921875, + "loss/jsd": 0.0, + "loss/logits": 0.17676338767632843, + "step": 23340 + }, + { + "epoch": 0.7783333333333333, + "grad_norm": 26.0, + "grad_norm_var": 1.1270833333333334, + "learning_rate": 7.729564007945835e-05, + "loss": 7.345, + "loss/crossentropy": 1.9789281010627746, + "loss/hidden": 3.346484375, + "loss/jsd": 0.0, + "loss/logits": 0.1813190994784236, + "step": 23350 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 25.75, + "grad_norm_var": 3.4680531624457667e+18, + "learning_rate": 7.717273036967312e-05, + "loss": 7.3519, + "loss/crossentropy": 2.2810946226119997, + "loss/hidden": 3.4328125, + "loss/jsd": 0.0, + "loss/logits": 0.1951879994943738, + "step": 23360 + }, + { + "epoch": 0.779, + "grad_norm": 28.375, + "grad_norm_var": 1.7999348958333334, + "learning_rate": 7.704960182399065e-05, + "loss": 7.3035, + "loss/crossentropy": 2.106377599388361, + "loss/hidden": 3.356640625, + "loss/jsd": 0.0, + "loss/logits": 0.18016488589346408, + "step": 23370 + }, + { + "epoch": 0.7793333333333333, + "grad_norm": 27.125, + "grad_norm_var": 1.2702473958333333, + "learning_rate": 7.692625565763996e-05, + "loss": 7.3742, + "loss/crossentropy": 1.963225745409727, + "loss/hidden": 3.4859375, + "loss/jsd": 0.0, + "loss/logits": 0.201007841527462, + "step": 23380 + }, + { + "epoch": 0.7796666666666666, + "grad_norm": 27.25, + "grad_norm_var": 1.5384765625, + "learning_rate": 7.680269308799791e-05, + "loss": 7.279, + "loss/crossentropy": 2.0159963831305503, + "loss/hidden": 3.458984375, + "loss/jsd": 0.0, + "loss/logits": 0.187699238024652, + "step": 23390 + }, + { + "epoch": 0.78, + "grad_norm": 25.625, + "grad_norm_var": 0.9184895833333333, + "learning_rate": 7.667891533457719e-05, + "loss": 7.419, + "loss/crossentropy": 2.0797833621501924, + "loss/hidden": 3.42890625, + "loss/jsd": 0.0, + "loss/logits": 0.1819260410964489, + "step": 23400 + }, + { + "epoch": 0.7803333333333333, + "grad_norm": 27.375, + "grad_norm_var": 1.5462890625, + "learning_rate": 7.655492361901425e-05, + "loss": 7.4127, + "loss/crossentropy": 1.9969025284051896, + "loss/hidden": 3.566796875, + "loss/jsd": 0.0, + "loss/logits": 0.20141962189227341, + "step": 23410 + }, + { + "epoch": 0.7806666666666666, + "grad_norm": 24.625, + "grad_norm_var": 2.5885416666666665, + "learning_rate": 7.643071916505726e-05, + "loss": 7.2426, + "loss/crossentropy": 2.1777842193841934, + "loss/hidden": 3.383203125, + "loss/jsd": 0.0, + "loss/logits": 0.19313708059489726, + "step": 23420 + }, + { + "epoch": 0.781, + "grad_norm": 28.875, + "grad_norm_var": 2.540311642526537e+18, + "learning_rate": 7.630630319855406e-05, + "loss": 7.4692, + "loss/crossentropy": 1.98684598878026, + "loss/hidden": 3.345703125, + "loss/jsd": 0.0, + "loss/logits": 0.17943298360332846, + "step": 23430 + }, + { + "epoch": 0.7813333333333333, + "grad_norm": 25.875, + "grad_norm_var": 1.8872395833333333, + "learning_rate": 7.618167694743998e-05, + "loss": 7.3491, + "loss/crossentropy": 1.9924467638134957, + "loss/hidden": 3.455859375, + "loss/jsd": 0.0, + "loss/logits": 0.19452919848263264, + "step": 23440 + }, + { + "epoch": 0.7816666666666666, + "grad_norm": 26.375, + "grad_norm_var": 0.8955729166666667, + "learning_rate": 7.60568416417258e-05, + "loss": 7.4007, + "loss/crossentropy": 2.1785530865192415, + "loss/hidden": 3.355859375, + "loss/jsd": 0.0, + "loss/logits": 0.1901299361139536, + "step": 23450 + }, + { + "epoch": 0.782, + "grad_norm": 26.375, + "grad_norm_var": 1.4455729166666667, + "learning_rate": 7.593179851348563e-05, + "loss": 7.3419, + "loss/crossentropy": 2.048526135832071, + "loss/hidden": 3.457421875, + "loss/jsd": 0.0, + "loss/logits": 0.1861328760161996, + "step": 23460 + }, + { + "epoch": 0.7823333333333333, + "grad_norm": 26.75, + "grad_norm_var": 3.459375, + "learning_rate": 7.580654879684464e-05, + "loss": 7.4101, + "loss/crossentropy": 1.9338685415685177, + "loss/hidden": 3.4875, + "loss/jsd": 0.0, + "loss/logits": 0.19165972275659443, + "step": 23470 + }, + { + "epoch": 0.7826666666666666, + "grad_norm": 27.125, + "grad_norm_var": 4.140625, + "learning_rate": 7.568109372796697e-05, + "loss": 7.3926, + "loss/crossentropy": 2.115327002480626, + "loss/hidden": 3.335546875, + "loss/jsd": 0.0, + "loss/logits": 0.187891862122342, + "step": 23480 + }, + { + "epoch": 0.783, + "grad_norm": 28.25, + "grad_norm_var": 2.3053504071336287e+18, + "learning_rate": 7.555543454504348e-05, + "loss": 7.3786, + "loss/crossentropy": 2.1964672222733497, + "loss/hidden": 3.740234375, + "loss/jsd": 0.0, + "loss/logits": 0.1924813449382782, + "step": 23490 + }, + { + "epoch": 0.7833333333333333, + "grad_norm": 27.375, + "grad_norm_var": 1.5384765625, + "learning_rate": 7.542957248827961e-05, + "loss": 7.3937, + "loss/crossentropy": 2.072338564693928, + "loss/hidden": 3.492578125, + "loss/jsd": 0.0, + "loss/logits": 0.20393363032490014, + "step": 23500 + }, + { + "epoch": 0.7836666666666666, + "grad_norm": 26.0, + "grad_norm_var": 1.6676432291666667, + "learning_rate": 7.530350879988304e-05, + "loss": 7.2573, + "loss/crossentropy": 2.1300232261419296, + "loss/hidden": 3.4359375, + "loss/jsd": 0.0, + "loss/logits": 0.19609809312969445, + "step": 23510 + }, + { + "epoch": 0.784, + "grad_norm": 26.875, + "grad_norm_var": 1.9218098958333334, + "learning_rate": 7.517724472405146e-05, + "loss": 7.3829, + "loss/crossentropy": 2.1858866199851037, + "loss/hidden": 3.410546875, + "loss/jsd": 0.0, + "loss/logits": 0.18005712442100047, + "step": 23520 + }, + { + "epoch": 0.7843333333333333, + "grad_norm": 27.5, + "grad_norm_var": 2.6322265625, + "learning_rate": 7.505078150696035e-05, + "loss": 7.259, + "loss/crossentropy": 2.11143764257431, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.17099386416375636, + "step": 23530 + }, + { + "epoch": 0.7846666666666666, + "grad_norm": 27.875, + "grad_norm_var": 19.049739583333334, + "learning_rate": 7.492412039675058e-05, + "loss": 7.3895, + "loss/crossentropy": 2.0830544363707304, + "loss/hidden": 3.37421875, + "loss/jsd": 0.0, + "loss/logits": 0.184346787026152, + "step": 23540 + }, + { + "epoch": 0.785, + "grad_norm": 28.0, + "grad_norm_var": 19.7978515625, + "learning_rate": 7.479726264351618e-05, + "loss": 7.3045, + "loss/crossentropy": 2.0409920796751977, + "loss/hidden": 3.45234375, + "loss/jsd": 0.0, + "loss/logits": 0.190955501049757, + "step": 23550 + }, + { + "epoch": 0.7853333333333333, + "grad_norm": 27.0, + "grad_norm_var": 1.64140625, + "learning_rate": 7.4670209499292e-05, + "loss": 7.3454, + "loss/crossentropy": 1.9914120055735112, + "loss/hidden": 3.43671875, + "loss/jsd": 0.0, + "loss/logits": 0.1819184892810881, + "step": 23560 + }, + { + "epoch": 0.7856666666666666, + "grad_norm": 28.125, + "grad_norm_var": 1.2546223958333333, + "learning_rate": 7.454296221804121e-05, + "loss": 7.2387, + "loss/crossentropy": 2.0846676357090472, + "loss/hidden": 3.41875, + "loss/jsd": 0.0, + "loss/logits": 0.18154996410012245, + "step": 23570 + }, + { + "epoch": 0.786, + "grad_norm": 26.625, + "grad_norm_var": 2.4833333333333334, + "learning_rate": 7.441552205564317e-05, + "loss": 7.3623, + "loss/crossentropy": 2.1405218988656998, + "loss/hidden": 3.52890625, + "loss/jsd": 0.0, + "loss/logits": 0.21327570956200362, + "step": 23580 + }, + { + "epoch": 0.7863333333333333, + "grad_norm": 27.0, + "grad_norm_var": 1.6942057291666666, + "learning_rate": 7.428789026988078e-05, + "loss": 7.3275, + "loss/crossentropy": 2.189077128469944, + "loss/hidden": 3.36796875, + "loss/jsd": 0.0, + "loss/logits": 0.183055036701262, + "step": 23590 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 26.5, + "grad_norm_var": 1.03125, + "learning_rate": 7.416006812042828e-05, + "loss": 7.2771, + "loss/crossentropy": 2.1273641705513002, + "loss/hidden": 3.39921875, + "loss/jsd": 0.0, + "loss/logits": 0.1837273458018899, + "step": 23600 + }, + { + "epoch": 0.787, + "grad_norm": 25.25, + "grad_norm_var": 1.1322916666666667, + "learning_rate": 7.403205686883864e-05, + "loss": 7.1614, + "loss/crossentropy": 2.0558855824172495, + "loss/hidden": 3.311328125, + "loss/jsd": 0.0, + "loss/logits": 0.17631138348951936, + "step": 23610 + }, + { + "epoch": 0.7873333333333333, + "grad_norm": 26.75, + "grad_norm_var": 1.50390625, + "learning_rate": 7.39038577785313e-05, + "loss": 7.3046, + "loss/crossentropy": 1.9993584722280502, + "loss/hidden": 3.648828125, + "loss/jsd": 0.0, + "loss/logits": 0.21630566865205764, + "step": 23620 + }, + { + "epoch": 0.7876666666666666, + "grad_norm": 26.375, + "grad_norm_var": 1.0979166666666667, + "learning_rate": 7.377547211477946e-05, + "loss": 7.4785, + "loss/crossentropy": 2.142679235339165, + "loss/hidden": 3.372265625, + "loss/jsd": 0.0, + "loss/logits": 0.18648772593587637, + "step": 23630 + }, + { + "epoch": 0.788, + "grad_norm": 28.5, + "grad_norm_var": 2.067122395833333, + "learning_rate": 7.36469011446978e-05, + "loss": 7.4104, + "loss/crossentropy": 2.191077730059624, + "loss/hidden": 3.295703125, + "loss/jsd": 0.0, + "loss/logits": 0.17883399985730647, + "step": 23640 + }, + { + "epoch": 0.7883333333333333, + "grad_norm": 26.125, + "grad_norm_var": 1.7035807291666667, + "learning_rate": 7.35181461372299e-05, + "loss": 7.3389, + "loss/crossentropy": 2.063406619429588, + "loss/hidden": 3.396875, + "loss/jsd": 0.0, + "loss/logits": 0.20122182425111532, + "step": 23650 + }, + { + "epoch": 0.7886666666666666, + "grad_norm": 25.5, + "grad_norm_var": 18.208072916666666, + "learning_rate": 7.338920836313572e-05, + "loss": 7.2588, + "loss/crossentropy": 2.062808007001877, + "loss/hidden": 3.40390625, + "loss/jsd": 0.0, + "loss/logits": 0.19022527448832988, + "step": 23660 + }, + { + "epoch": 0.789, + "grad_norm": 27.375, + "grad_norm_var": 1.4186848958333333, + "learning_rate": 7.326008909497901e-05, + "loss": 7.2722, + "loss/crossentropy": 2.0678359627723695, + "loss/hidden": 3.456640625, + "loss/jsd": 0.0, + "loss/logits": 0.19251144528388978, + "step": 23670 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 27.5, + "grad_norm_var": 2.3308942610148623e+18, + "learning_rate": 7.313078960711483e-05, + "loss": 7.3772, + "loss/crossentropy": 2.3504543006420135, + "loss/hidden": 3.399609375, + "loss/jsd": 0.0, + "loss/logits": 0.1857314633205533, + "step": 23680 + }, + { + "epoch": 0.7896666666666666, + "grad_norm": 26.875, + "grad_norm_var": 2.330894261447435e+18, + "learning_rate": 7.300131117567692e-05, + "loss": 7.2372, + "loss/crossentropy": 2.0384044095873834, + "loss/hidden": 3.373828125, + "loss/jsd": 0.0, + "loss/logits": 0.1833371376618743, + "step": 23690 + }, + { + "epoch": 0.79, + "grad_norm": 26.375, + "grad_norm_var": 2.9447916666666667, + "learning_rate": 7.287165507856512e-05, + "loss": 7.3968, + "loss/crossentropy": 2.0169043824076653, + "loss/hidden": 3.4125, + "loss/jsd": 0.0, + "loss/logits": 0.17834869027137756, + "step": 23700 + }, + { + "epoch": 0.7903333333333333, + "grad_norm": 25.875, + "grad_norm_var": 2.8291015625, + "learning_rate": 7.27418225954328e-05, + "loss": 7.209, + "loss/crossentropy": 2.0671774983406066, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.1780660256743431, + "step": 23710 + }, + { + "epoch": 0.7906666666666666, + "grad_norm": 24.875, + "grad_norm_var": 0.7567057291666667, + "learning_rate": 7.261181500767413e-05, + "loss": 7.2326, + "loss/crossentropy": 1.840275975316763, + "loss/hidden": 3.2921875, + "loss/jsd": 0.0, + "loss/logits": 0.1688528038561344, + "step": 23720 + }, + { + "epoch": 0.791, + "grad_norm": 28.25, + "grad_norm_var": 1.8389973958333334, + "learning_rate": 7.248163359841148e-05, + "loss": 7.2749, + "loss/crossentropy": 1.863051414489746, + "loss/hidden": 3.254296875, + "loss/jsd": 0.0, + "loss/logits": 0.15819047279655934, + "step": 23730 + }, + { + "epoch": 0.7913333333333333, + "grad_norm": 25.875, + "grad_norm_var": 1.14140625, + "learning_rate": 7.235127965248285e-05, + "loss": 7.3715, + "loss/crossentropy": 2.033174179494381, + "loss/hidden": 3.48125, + "loss/jsd": 0.0, + "loss/logits": 0.18322906009852885, + "step": 23740 + }, + { + "epoch": 0.7916666666666666, + "grad_norm": 26.375, + "grad_norm_var": 0.790625, + "learning_rate": 7.222075445642904e-05, + "loss": 7.3671, + "loss/crossentropy": 2.1419862687587736, + "loss/hidden": 3.3890625, + "loss/jsd": 0.0, + "loss/logits": 0.1809433963149786, + "step": 23750 + }, + { + "epoch": 0.792, + "grad_norm": 25.0, + "grad_norm_var": 0.6431640625, + "learning_rate": 7.209005929848107e-05, + "loss": 7.3336, + "loss/crossentropy": 2.0215205609798432, + "loss/hidden": 3.341796875, + "loss/jsd": 0.0, + "loss/logits": 0.1783350331708789, + "step": 23760 + }, + { + "epoch": 0.7923333333333333, + "grad_norm": 26.25, + "grad_norm_var": 1.0122395833333333, + "learning_rate": 7.195919546854732e-05, + "loss": 7.2975, + "loss/crossentropy": 2.16986320912838, + "loss/hidden": 3.305078125, + "loss/jsd": 0.0, + "loss/logits": 0.17684876844286918, + "step": 23770 + }, + { + "epoch": 0.7926666666666666, + "grad_norm": 26.125, + "grad_norm_var": 6.220572916666667, + "learning_rate": 7.182816425820101e-05, + "loss": 7.286, + "loss/crossentropy": 2.0275315180420876, + "loss/hidden": 3.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.1788380341604352, + "step": 23780 + }, + { + "epoch": 0.793, + "grad_norm": 27.875, + "grad_norm_var": 1.3247395833333333, + "learning_rate": 7.16969669606673e-05, + "loss": 7.4463, + "loss/crossentropy": 2.0448300421237944, + "loss/hidden": 3.374609375, + "loss/jsd": 0.0, + "loss/logits": 0.1782231353223324, + "step": 23790 + }, + { + "epoch": 0.7933333333333333, + "grad_norm": 27.5, + "grad_norm_var": 0.9686848958333333, + "learning_rate": 7.156560487081053e-05, + "loss": 7.2929, + "loss/crossentropy": 2.0439211681485174, + "loss/hidden": 3.4046875, + "loss/jsd": 0.0, + "loss/logits": 0.18058539805933832, + "step": 23800 + }, + { + "epoch": 0.7936666666666666, + "grad_norm": 25.875, + "grad_norm_var": 1.1385416666666666, + "learning_rate": 7.143407928512146e-05, + "loss": 7.1752, + "loss/crossentropy": 2.0834071934223175, + "loss/hidden": 3.4390625, + "loss/jsd": 0.0, + "loss/logits": 0.1776286605745554, + "step": 23810 + }, + { + "epoch": 0.794, + "grad_norm": 29.0, + "grad_norm_var": 2.6458333333333335, + "learning_rate": 7.130239150170455e-05, + "loss": 7.3533, + "loss/crossentropy": 2.069356369972229, + "loss/hidden": 3.456640625, + "loss/jsd": 0.0, + "loss/logits": 0.18835821226239205, + "step": 23820 + }, + { + "epoch": 0.7943333333333333, + "grad_norm": 26.375, + "grad_norm_var": 2.9332682291666665, + "learning_rate": 7.117054282026508e-05, + "loss": 7.3786, + "loss/crossentropy": 2.15442588403821, + "loss/hidden": 3.30234375, + "loss/jsd": 0.0, + "loss/logits": 0.17487594103440643, + "step": 23830 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 26.0, + "grad_norm_var": 1.1457682291666667, + "learning_rate": 7.103853454209628e-05, + "loss": 7.2191, + "loss/crossentropy": 2.086764992028475, + "loss/hidden": 3.351171875, + "loss/jsd": 0.0, + "loss/logits": 0.17575874989852308, + "step": 23840 + }, + { + "epoch": 0.795, + "grad_norm": 25.625, + "grad_norm_var": 1.2385416666666667, + "learning_rate": 7.090636797006658e-05, + "loss": 7.3092, + "loss/crossentropy": 2.035817837715149, + "loss/hidden": 3.4421875, + "loss/jsd": 0.0, + "loss/logits": 0.1798616824671626, + "step": 23850 + }, + { + "epoch": 0.7953333333333333, + "grad_norm": 23.625, + "grad_norm_var": 2.1634765625, + "learning_rate": 7.077404440860666e-05, + "loss": 7.2022, + "loss/crossentropy": 2.0340130746364595, + "loss/hidden": 3.32421875, + "loss/jsd": 0.0, + "loss/logits": 0.18757070507854223, + "step": 23860 + }, + { + "epoch": 0.7956666666666666, + "grad_norm": 24.5, + "grad_norm_var": 2.4514973958333335, + "learning_rate": 7.064156516369666e-05, + "loss": 7.3237, + "loss/crossentropy": 2.046588622033596, + "loss/hidden": 3.430078125, + "loss/jsd": 0.0, + "loss/logits": 0.18935591662302614, + "step": 23870 + }, + { + "epoch": 0.796, + "grad_norm": 27.0, + "grad_norm_var": 1.1660807291666666, + "learning_rate": 7.050893154285327e-05, + "loss": 7.331, + "loss/crossentropy": 2.1309110179543493, + "loss/hidden": 3.371875, + "loss/jsd": 0.0, + "loss/logits": 0.17215617671608924, + "step": 23880 + }, + { + "epoch": 0.7963333333333333, + "grad_norm": 26.5, + "grad_norm_var": 0.9455729166666667, + "learning_rate": 7.037614485511676e-05, + "loss": 7.251, + "loss/crossentropy": 2.123047386109829, + "loss/hidden": 3.3, + "loss/jsd": 0.0, + "loss/logits": 0.1786116823554039, + "step": 23890 + }, + { + "epoch": 0.7966666666666666, + "grad_norm": 27.875, + "grad_norm_var": 1.2010416666666666, + "learning_rate": 7.024320641103812e-05, + "loss": 7.2897, + "loss/crossentropy": 2.1215869694948197, + "loss/hidden": 3.40546875, + "loss/jsd": 0.0, + "loss/logits": 0.1894577570259571, + "step": 23900 + }, + { + "epoch": 0.797, + "grad_norm": 25.375, + "grad_norm_var": 1.1927083333333333, + "learning_rate": 7.011011752266612e-05, + "loss": 7.112, + "loss/crossentropy": 2.1410455122590064, + "loss/hidden": 3.294921875, + "loss/jsd": 0.0, + "loss/logits": 0.18200043272227048, + "step": 23910 + }, + { + "epoch": 0.7973333333333333, + "grad_norm": 23.75, + "grad_norm_var": 0.8145833333333333, + "learning_rate": 6.99768795035344e-05, + "loss": 7.1817, + "loss/crossentropy": 1.9395751819014548, + "loss/hidden": 3.3234375, + "loss/jsd": 0.0, + "loss/logits": 0.17033605417236686, + "step": 23920 + }, + { + "epoch": 0.7976666666666666, + "grad_norm": 23.25, + "grad_norm_var": 2.395768229166667, + "learning_rate": 6.984349366864839e-05, + "loss": 7.1585, + "loss/crossentropy": 1.8782190293073655, + "loss/hidden": 3.371875, + "loss/jsd": 0.0, + "loss/logits": 0.1784798389300704, + "step": 23930 + }, + { + "epoch": 0.798, + "grad_norm": 25.125, + "grad_norm_var": 2.3053504077789222e+18, + "learning_rate": 6.97099613344724e-05, + "loss": 7.308, + "loss/crossentropy": 2.1117516651749613, + "loss/hidden": 3.307421875, + "loss/jsd": 0.0, + "loss/logits": 0.1816192871890962, + "step": 23940 + }, + { + "epoch": 0.7983333333333333, + "grad_norm": 24.625, + "grad_norm_var": 2.305350407854839e+18, + "learning_rate": 6.957628381891673e-05, + "loss": 7.3415, + "loss/crossentropy": 2.0898191846907137, + "loss/hidden": 3.33125, + "loss/jsd": 0.0, + "loss/logits": 0.17528143543750047, + "step": 23950 + }, + { + "epoch": 0.7986666666666666, + "grad_norm": 23.75, + "grad_norm_var": 2.019791666666667, + "learning_rate": 6.944246244132443e-05, + "loss": 7.1587, + "loss/crossentropy": 2.0945689618587493, + "loss/hidden": 3.31328125, + "loss/jsd": 0.0, + "loss/logits": 0.1783385954797268, + "step": 23960 + }, + { + "epoch": 0.799, + "grad_norm": 25.625, + "grad_norm_var": 2.0619140625, + "learning_rate": 6.930849852245848e-05, + "loss": 7.3629, + "loss/crossentropy": 2.0345364600419997, + "loss/hidden": 3.33984375, + "loss/jsd": 0.0, + "loss/logits": 0.18947122450917958, + "step": 23970 + }, + { + "epoch": 0.7993333333333333, + "grad_norm": 30.375, + "grad_norm_var": 2.7583333333333333, + "learning_rate": 6.917439338448872e-05, + "loss": 7.2008, + "loss/crossentropy": 2.0551148861646653, + "loss/hidden": 3.27734375, + "loss/jsd": 0.0, + "loss/logits": 0.17965832073241472, + "step": 23980 + }, + { + "epoch": 0.7996666666666666, + "grad_norm": 26.625, + "grad_norm_var": 21.099739583333335, + "learning_rate": 6.904014835097867e-05, + "loss": 7.2763, + "loss/crossentropy": 2.044001418352127, + "loss/hidden": 3.451953125, + "loss/jsd": 0.0, + "loss/logits": 0.18677353039383887, + "step": 23990 + }, + { + "epoch": 0.8, + "grad_norm": 25.375, + "grad_norm_var": 0.9718098958333333, + "learning_rate": 6.890576474687263e-05, + "loss": 7.211, + "loss/crossentropy": 2.2494696259498594, + "loss/hidden": 3.355078125, + "loss/jsd": 0.0, + "loss/logits": 0.18474820386618376, + "step": 24000 + }, + { + "epoch": 0.8003333333333333, + "grad_norm": 25.25, + "grad_norm_var": 1.2018229166666667, + "learning_rate": 6.877124389848254e-05, + "loss": 7.2794, + "loss/crossentropy": 2.300140696763992, + "loss/hidden": 3.36875, + "loss/jsd": 0.0, + "loss/logits": 0.19267369732260703, + "step": 24010 + }, + { + "epoch": 0.8006666666666666, + "grad_norm": 22.75, + "grad_norm_var": 2.919791666666667, + "learning_rate": 6.863658713347484e-05, + "loss": 7.2944, + "loss/crossentropy": 2.0973946295678614, + "loss/hidden": 3.3421875, + "loss/jsd": 0.0, + "loss/logits": 0.18365285200998188, + "step": 24020 + }, + { + "epoch": 0.801, + "grad_norm": 25.5, + "grad_norm_var": 2.533072916666667, + "learning_rate": 6.850179578085744e-05, + "loss": 7.226, + "loss/crossentropy": 1.9838631860911846, + "loss/hidden": 3.324609375, + "loss/jsd": 0.0, + "loss/logits": 0.17135070022195578, + "step": 24030 + }, + { + "epoch": 0.8013333333333333, + "grad_norm": 25.875, + "grad_norm_var": 0.7624348958333333, + "learning_rate": 6.836687117096657e-05, + "loss": 7.1629, + "loss/crossentropy": 2.1537132054567336, + "loss/hidden": 3.30390625, + "loss/jsd": 0.0, + "loss/logits": 0.17650238294154405, + "step": 24040 + }, + { + "epoch": 0.8016666666666666, + "grad_norm": 25.75, + "grad_norm_var": 0.9291666666666667, + "learning_rate": 6.823181463545368e-05, + "loss": 7.2053, + "loss/crossentropy": 2.0979036509990694, + "loss/hidden": 3.3296875, + "loss/jsd": 0.0, + "loss/logits": 0.17083593588322402, + "step": 24050 + }, + { + "epoch": 0.802, + "grad_norm": 26.75, + "grad_norm_var": 1.4358723958333333, + "learning_rate": 6.809662750727222e-05, + "loss": 7.3477, + "loss/crossentropy": 2.0106400445103647, + "loss/hidden": 3.440625, + "loss/jsd": 0.0, + "loss/logits": 0.17500849366188048, + "step": 24060 + }, + { + "epoch": 0.8023333333333333, + "grad_norm": 27.75, + "grad_norm_var": 3.159375, + "learning_rate": 6.796131112066461e-05, + "loss": 7.1892, + "loss/crossentropy": 2.011872109770775, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.16934235505759715, + "step": 24070 + }, + { + "epoch": 0.8026666666666666, + "grad_norm": 25.25, + "grad_norm_var": 3.3551432291666665, + "learning_rate": 6.782586681114894e-05, + "loss": 7.2536, + "loss/crossentropy": 1.9991176337003709, + "loss/hidden": 3.26875, + "loss/jsd": 0.0, + "loss/logits": 0.173070646263659, + "step": 24080 + }, + { + "epoch": 0.803, + "grad_norm": 25.125, + "grad_norm_var": 0.7145833333333333, + "learning_rate": 6.769029591550581e-05, + "loss": 7.2212, + "loss/crossentropy": 2.190063714981079, + "loss/hidden": 3.3859375, + "loss/jsd": 0.0, + "loss/logits": 0.19409491550177335, + "step": 24090 + }, + { + "epoch": 0.8033333333333333, + "grad_norm": 27.25, + "grad_norm_var": 1.0520833333333333, + "learning_rate": 6.755459977176533e-05, + "loss": 7.226, + "loss/crossentropy": 2.0515845850110055, + "loss/hidden": 3.2640625, + "loss/jsd": 0.0, + "loss/logits": 0.17549332650378346, + "step": 24100 + }, + { + "epoch": 0.8036666666666666, + "grad_norm": 23.875, + "grad_norm_var": 1.3291666666666666, + "learning_rate": 6.741877971919357e-05, + "loss": 7.3119, + "loss/crossentropy": 2.1062870398163795, + "loss/hidden": 3.355859375, + "loss/jsd": 0.0, + "loss/logits": 0.1822298699989915, + "step": 24110 + }, + { + "epoch": 0.804, + "grad_norm": 25.125, + "grad_norm_var": 2.460724593429193e+18, + "learning_rate": 6.728283709827963e-05, + "loss": 7.2493, + "loss/crossentropy": 2.1633963331580164, + "loss/hidden": 3.459375, + "loss/jsd": 0.0, + "loss/logits": 0.19363478161394596, + "step": 24120 + }, + { + "epoch": 0.8043333333333333, + "grad_norm": 24.375, + "grad_norm_var": 22.76640625, + "learning_rate": 6.714677325072235e-05, + "loss": 7.2432, + "loss/crossentropy": 2.1364134401082993, + "loss/hidden": 3.352734375, + "loss/jsd": 0.0, + "loss/logits": 0.1911363998427987, + "step": 24130 + }, + { + "epoch": 0.8046666666666666, + "grad_norm": 27.125, + "grad_norm_var": 1.0369140625, + "learning_rate": 6.701058951941691e-05, + "loss": 7.2875, + "loss/crossentropy": 2.144788406789303, + "loss/hidden": 3.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.19746586456894874, + "step": 24140 + }, + { + "epoch": 0.805, + "grad_norm": 28.5, + "grad_norm_var": 4.764322916666667, + "learning_rate": 6.687428724844179e-05, + "loss": 7.143, + "loss/crossentropy": 1.9672424003481865, + "loss/hidden": 3.464453125, + "loss/jsd": 0.0, + "loss/logits": 0.1883099837228656, + "step": 24150 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 26.125, + "grad_norm_var": 5.80390625, + "learning_rate": 6.673786778304537e-05, + "loss": 7.1432, + "loss/crossentropy": 1.940127792209387, + "loss/hidden": 3.219140625, + "loss/jsd": 0.0, + "loss/logits": 0.15666389614343643, + "step": 24160 + }, + { + "epoch": 0.8056666666666666, + "grad_norm": 25.0, + "grad_norm_var": 373.32890625, + "learning_rate": 6.66013324696327e-05, + "loss": 7.2992, + "loss/crossentropy": 2.203534686565399, + "loss/hidden": 3.34921875, + "loss/jsd": 0.0, + "loss/logits": 0.17815530616790057, + "step": 24170 + }, + { + "epoch": 0.806, + "grad_norm": 25.25, + "grad_norm_var": 0.9864583333333333, + "learning_rate": 6.646468265575219e-05, + "loss": 7.2117, + "loss/crossentropy": 2.0774502992630004, + "loss/hidden": 3.400390625, + "loss/jsd": 0.0, + "loss/logits": 0.17327735256403684, + "step": 24180 + }, + { + "epoch": 0.8063333333333333, + "grad_norm": 26.875, + "grad_norm_var": 0.8587890625, + "learning_rate": 6.632791969008237e-05, + "loss": 7.2638, + "loss/crossentropy": 2.0702610716223715, + "loss/hidden": 3.4234375, + "loss/jsd": 0.0, + "loss/logits": 0.18914096765220165, + "step": 24190 + }, + { + "epoch": 0.8066666666666666, + "grad_norm": 26.0, + "grad_norm_var": 2.709830729166667, + "learning_rate": 6.619104492241848e-05, + "loss": 7.2388, + "loss/crossentropy": 2.02001933157444, + "loss/hidden": 3.426953125, + "loss/jsd": 0.0, + "loss/logits": 0.200399025157094, + "step": 24200 + }, + { + "epoch": 0.807, + "grad_norm": 24.75, + "grad_norm_var": 1.3978515625, + "learning_rate": 6.60540597036592e-05, + "loss": 7.1491, + "loss/crossentropy": 2.1803820818662643, + "loss/hidden": 3.303125, + "loss/jsd": 0.0, + "loss/logits": 0.17881411854177715, + "step": 24210 + }, + { + "epoch": 0.8073333333333333, + "grad_norm": 23.125, + "grad_norm_var": 0.85390625, + "learning_rate": 6.591696538579334e-05, + "loss": 7.1335, + "loss/crossentropy": 2.097467389702797, + "loss/hidden": 3.31953125, + "loss/jsd": 0.0, + "loss/logits": 0.17912068534642459, + "step": 24220 + }, + { + "epoch": 0.8076666666666666, + "grad_norm": 23.75, + "grad_norm_var": 3.476822916666667, + "learning_rate": 6.577976332188649e-05, + "loss": 7.1903, + "loss/crossentropy": 2.0106175623834135, + "loss/hidden": 3.3859375, + "loss/jsd": 0.0, + "loss/logits": 0.19294406063854694, + "step": 24230 + }, + { + "epoch": 0.808, + "grad_norm": 25.875, + "grad_norm_var": 1.7233723958333333, + "learning_rate": 6.564245486606762e-05, + "loss": 7.123, + "loss/crossentropy": 2.056936872005463, + "loss/hidden": 3.296484375, + "loss/jsd": 0.0, + "loss/logits": 0.16803365563973785, + "step": 24240 + }, + { + "epoch": 0.8083333333333333, + "grad_norm": 22.875, + "grad_norm_var": 1.75, + "learning_rate": 6.550504137351576e-05, + "loss": 7.0223, + "loss/crossentropy": 2.026827494055033, + "loss/hidden": 3.321875, + "loss/jsd": 0.0, + "loss/logits": 0.18043983895331622, + "step": 24250 + }, + { + "epoch": 0.8086666666666666, + "grad_norm": 26.125, + "grad_norm_var": 52.36451822916667, + "learning_rate": 6.536752420044659e-05, + "loss": 7.0505, + "loss/crossentropy": 2.047990356385708, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.16095355469733477, + "step": 24260 + }, + { + "epoch": 0.809, + "grad_norm": 24.75, + "grad_norm_var": 51.09791666666667, + "learning_rate": 6.522990470409909e-05, + "loss": 7.1775, + "loss/crossentropy": 2.093078485131264, + "loss/hidden": 3.343359375, + "loss/jsd": 0.0, + "loss/logits": 0.17120972126722336, + "step": 24270 + }, + { + "epoch": 0.8093333333333333, + "grad_norm": 23.875, + "grad_norm_var": 1.6280598958333334, + "learning_rate": 6.509218424272216e-05, + "loss": 7.1735, + "loss/crossentropy": 2.125036987662315, + "loss/hidden": 3.366796875, + "loss/jsd": 0.0, + "loss/logits": 0.19020479824393988, + "step": 24280 + }, + { + "epoch": 0.8096666666666666, + "grad_norm": 23.375, + "grad_norm_var": 63.603125, + "learning_rate": 6.495436417556113e-05, + "loss": 7.1969, + "loss/crossentropy": 2.22238949239254, + "loss/hidden": 3.383984375, + "loss/jsd": 0.0, + "loss/logits": 0.19780659209936857, + "step": 24290 + }, + { + "epoch": 0.81, + "grad_norm": 24.25, + "grad_norm_var": 11.257291666666667, + "learning_rate": 6.481644586284442e-05, + "loss": 7.153, + "loss/crossentropy": 2.0770174629986284, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.18271327950060368, + "step": 24300 + }, + { + "epoch": 0.8103333333333333, + "grad_norm": 24.625, + "grad_norm_var": 0.8643229166666667, + "learning_rate": 6.46784306657701e-05, + "loss": 7.1539, + "loss/crossentropy": 1.8965822540223598, + "loss/hidden": 3.353515625, + "loss/jsd": 0.0, + "loss/logits": 0.17321830820292233, + "step": 24310 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 25.25, + "grad_norm_var": 1.559375, + "learning_rate": 6.454031994649247e-05, + "loss": 7.1674, + "loss/crossentropy": 2.2343272864818573, + "loss/hidden": 3.285546875, + "loss/jsd": 0.0, + "loss/logits": 0.17637284398078917, + "step": 24320 + }, + { + "epoch": 0.811, + "grad_norm": 196.0, + "grad_norm_var": 1813.99375, + "learning_rate": 6.440211506810852e-05, + "loss": 7.2401, + "loss/crossentropy": 2.118020176887512, + "loss/hidden": 3.42890625, + "loss/jsd": 0.0, + "loss/logits": 0.18733559399843216, + "step": 24330 + }, + { + "epoch": 0.8113333333333334, + "grad_norm": 24.375, + "grad_norm_var": 1817.9728515625, + "learning_rate": 6.426381739464466e-05, + "loss": 7.1611, + "loss/crossentropy": 2.0687691517174245, + "loss/hidden": 3.313671875, + "loss/jsd": 0.0, + "loss/logits": 0.18494962928816677, + "step": 24340 + }, + { + "epoch": 0.8116666666666666, + "grad_norm": 24.25, + "grad_norm_var": 2.7593098958333333, + "learning_rate": 6.412542829104307e-05, + "loss": 7.1438, + "loss/crossentropy": 2.1410858571529388, + "loss/hidden": 3.366796875, + "loss/jsd": 0.0, + "loss/logits": 0.1797313429415226, + "step": 24350 + }, + { + "epoch": 0.812, + "grad_norm": 23.875, + "grad_norm_var": 1.2186848958333334, + "learning_rate": 6.398694912314831e-05, + "loss": 7.1624, + "loss/crossentropy": 2.146064803004265, + "loss/hidden": 3.25234375, + "loss/jsd": 0.0, + "loss/logits": 0.17778887879103422, + "step": 24360 + }, + { + "epoch": 0.8123333333333334, + "grad_norm": 24.25, + "grad_norm_var": 0.9973307291666667, + "learning_rate": 6.38483812576939e-05, + "loss": 7.0275, + "loss/crossentropy": 2.1565047204494476, + "loss/hidden": 3.282421875, + "loss/jsd": 0.0, + "loss/logits": 0.17124725691974163, + "step": 24370 + }, + { + "epoch": 0.8126666666666666, + "grad_norm": 23.0, + "grad_norm_var": 0.5893229166666667, + "learning_rate": 6.370972606228872e-05, + "loss": 7.1234, + "loss/crossentropy": 2.0467451363801956, + "loss/hidden": 3.420703125, + "loss/jsd": 0.0, + "loss/logits": 0.1872189924120903, + "step": 24380 + }, + { + "epoch": 0.813, + "grad_norm": 24.75, + "grad_norm_var": 0.5145833333333333, + "learning_rate": 6.357098490540355e-05, + "loss": 7.1545, + "loss/crossentropy": 2.3109287858009337, + "loss/hidden": 3.271875, + "loss/jsd": 0.0, + "loss/logits": 0.1918891828507185, + "step": 24390 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 23.375, + "grad_norm_var": 0.9212890625, + "learning_rate": 6.343215915635762e-05, + "loss": 7.1035, + "loss/crossentropy": 1.956181785464287, + "loss/hidden": 3.404296875, + "loss/jsd": 0.0, + "loss/logits": 0.1630766457878053, + "step": 24400 + }, + { + "epoch": 0.8136666666666666, + "grad_norm": 25.0, + "grad_norm_var": 0.709375, + "learning_rate": 6.329325018530501e-05, + "loss": 7.0542, + "loss/crossentropy": 1.9934518307447433, + "loss/hidden": 3.31015625, + "loss/jsd": 0.0, + "loss/logits": 0.16185767110437155, + "step": 24410 + }, + { + "epoch": 0.814, + "grad_norm": 23.125, + "grad_norm_var": 5.9931640625, + "learning_rate": 6.315425936322118e-05, + "loss": 7.0992, + "loss/crossentropy": 2.089629125595093, + "loss/hidden": 3.338671875, + "loss/jsd": 0.0, + "loss/logits": 0.1817337304353714, + "step": 24420 + }, + { + "epoch": 0.8143333333333334, + "grad_norm": 24.5, + "grad_norm_var": 5.65, + "learning_rate": 6.301518806188946e-05, + "loss": 7.0823, + "loss/crossentropy": 2.0498588502407076, + "loss/hidden": 3.274609375, + "loss/jsd": 0.0, + "loss/logits": 0.17004711236804723, + "step": 24430 + }, + { + "epoch": 0.8146666666666667, + "grad_norm": 27.0, + "grad_norm_var": 1.5229166666666667, + "learning_rate": 6.287603765388743e-05, + "loss": 7.2639, + "loss/crossentropy": 2.05265491604805, + "loss/hidden": 3.2953125, + "loss/jsd": 0.0, + "loss/logits": 0.16803640704602002, + "step": 24440 + }, + { + "epoch": 0.815, + "grad_norm": 26.125, + "grad_norm_var": 2.348372395833333, + "learning_rate": 6.273680951257342e-05, + "loss": 7.1844, + "loss/crossentropy": 2.1695328533649443, + "loss/hidden": 3.35703125, + "loss/jsd": 0.0, + "loss/logits": 0.18274456169456244, + "step": 24450 + }, + { + "epoch": 0.8153333333333334, + "grad_norm": 26.625, + "grad_norm_var": 1.5375, + "learning_rate": 6.259750501207302e-05, + "loss": 7.18, + "loss/crossentropy": 2.132620003819466, + "loss/hidden": 3.33515625, + "loss/jsd": 0.0, + "loss/logits": 0.17444018907845021, + "step": 24460 + }, + { + "epoch": 0.8156666666666667, + "grad_norm": 23.875, + "grad_norm_var": 0.8046223958333333, + "learning_rate": 6.245812552726538e-05, + "loss": 7.1452, + "loss/crossentropy": 2.058624839782715, + "loss/hidden": 3.30390625, + "loss/jsd": 0.0, + "loss/logits": 0.1701804917305708, + "step": 24470 + }, + { + "epoch": 0.816, + "grad_norm": 25.5, + "grad_norm_var": 1.4427083333333333, + "learning_rate": 6.231867243376977e-05, + "loss": 7.0437, + "loss/crossentropy": 2.1087178610265256, + "loss/hidden": 3.196875, + "loss/jsd": 0.0, + "loss/logits": 0.16274321246892215, + "step": 24480 + }, + { + "epoch": 0.8163333333333334, + "grad_norm": 25.125, + "grad_norm_var": 1.4010416666666667, + "learning_rate": 6.217914710793189e-05, + "loss": 6.9728, + "loss/crossentropy": 1.9185438066720963, + "loss/hidden": 3.347265625, + "loss/jsd": 0.0, + "loss/logits": 0.1684217657893896, + "step": 24490 + }, + { + "epoch": 0.8166666666666667, + "grad_norm": 24.5, + "grad_norm_var": 0.509375, + "learning_rate": 6.203955092681039e-05, + "loss": 7.0605, + "loss/crossentropy": 2.115895939618349, + "loss/hidden": 3.401171875, + "loss/jsd": 0.0, + "loss/logits": 0.18307006321847438, + "step": 24500 + }, + { + "epoch": 0.817, + "grad_norm": 23.5, + "grad_norm_var": 1.1372395833333333, + "learning_rate": 6.189988526816323e-05, + "loss": 7.0337, + "loss/crossentropy": 2.096031680703163, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.1889460150152445, + "step": 24510 + }, + { + "epoch": 0.8173333333333334, + "grad_norm": 24.0, + "grad_norm_var": 1.6379557291666667, + "learning_rate": 6.176015151043407e-05, + "loss": 7.2482, + "loss/crossentropy": 2.0701269775629045, + "loss/hidden": 3.2984375, + "loss/jsd": 0.0, + "loss/logits": 0.1742158493027091, + "step": 24520 + }, + { + "epoch": 0.8176666666666667, + "grad_norm": 24.25, + "grad_norm_var": 2.842967612297891e+18, + "learning_rate": 6.16203510327387e-05, + "loss": 7.0644, + "loss/crossentropy": 2.0755894117057325, + "loss/hidden": 3.527734375, + "loss/jsd": 0.0, + "loss/logits": 0.16671819221228362, + "step": 24530 + }, + { + "epoch": 0.818, + "grad_norm": 25.125, + "grad_norm_var": 3.0311848958333334, + "learning_rate": 6.148048521485134e-05, + "loss": 7.0163, + "loss/crossentropy": 2.0209048211574556, + "loss/hidden": 3.2140625, + "loss/jsd": 0.0, + "loss/logits": 0.15955890230834485, + "step": 24540 + }, + { + "epoch": 0.8183333333333334, + "grad_norm": 24.125, + "grad_norm_var": 2.5462890625, + "learning_rate": 6.134055543719121e-05, + "loss": 7.0369, + "loss/crossentropy": 1.9814843587577342, + "loss/hidden": 3.271484375, + "loss/jsd": 0.0, + "loss/logits": 0.16640124581754207, + "step": 24550 + }, + { + "epoch": 0.8186666666666667, + "grad_norm": 23.5, + "grad_norm_var": 0.7041015625, + "learning_rate": 6.120056308080872e-05, + "loss": 6.9686, + "loss/crossentropy": 2.104407861828804, + "loss/hidden": 3.325390625, + "loss/jsd": 0.0, + "loss/logits": 0.18350574728101493, + "step": 24560 + }, + { + "epoch": 0.819, + "grad_norm": 24.0, + "grad_norm_var": 1.1535807291666667, + "learning_rate": 6.106050952737186e-05, + "loss": 6.955, + "loss/crossentropy": 2.0084666229784487, + "loss/hidden": 3.27734375, + "loss/jsd": 0.0, + "loss/logits": 0.1645617727190256, + "step": 24570 + }, + { + "epoch": 0.8193333333333334, + "grad_norm": 25.375, + "grad_norm_var": 1.9801432291666667, + "learning_rate": 6.0920396159152716e-05, + "loss": 7.0885, + "loss/crossentropy": 2.1339985907077788, + "loss/hidden": 3.247265625, + "loss/jsd": 0.0, + "loss/logits": 0.17009613076224922, + "step": 24580 + }, + { + "epoch": 0.8196666666666667, + "grad_norm": 24.5, + "grad_norm_var": 2.2291666666666665, + "learning_rate": 6.078022435901364e-05, + "loss": 7.0881, + "loss/crossentropy": 2.0973347425460815, + "loss/hidden": 3.33125, + "loss/jsd": 0.0, + "loss/logits": 0.18192722033709288, + "step": 24590 + }, + { + "epoch": 0.82, + "grad_norm": 24.125, + "grad_norm_var": 1.88125, + "learning_rate": 6.06399955103937e-05, + "loss": 7.169, + "loss/crossentropy": 1.9335032500326634, + "loss/hidden": 3.326171875, + "loss/jsd": 0.0, + "loss/logits": 0.17163177412003278, + "step": 24600 + }, + { + "epoch": 0.8203333333333334, + "grad_norm": 22.375, + "grad_norm_var": 1.2608723958333334, + "learning_rate": 6.049971099729502e-05, + "loss": 7.1524, + "loss/crossentropy": 2.197794906795025, + "loss/hidden": 3.253515625, + "loss/jsd": 0.0, + "loss/logits": 0.18091356940567493, + "step": 24610 + }, + { + "epoch": 0.8206666666666667, + "grad_norm": 25.0, + "grad_norm_var": 3.07377709315215e+18, + "learning_rate": 6.035937220426915e-05, + "loss": 7.0232, + "loss/crossentropy": 2.0041457399725915, + "loss/hidden": 3.27578125, + "loss/jsd": 0.0, + "loss/logits": 0.1686849119141698, + "step": 24620 + }, + { + "epoch": 0.821, + "grad_norm": 23.25, + "grad_norm_var": 3.073777092874557e+18, + "learning_rate": 6.0218980516403265e-05, + "loss": 7.1444, + "loss/crossentropy": 1.9922945663332938, + "loss/hidden": 3.185546875, + "loss/jsd": 0.0, + "loss/logits": 0.15499509871006012, + "step": 24630 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 25.0, + "grad_norm_var": 0.7931640625, + "learning_rate": 6.007853731930667e-05, + "loss": 7.0009, + "loss/crossentropy": 2.0803733453154565, + "loss/hidden": 3.4109375, + "loss/jsd": 0.0, + "loss/logits": 0.2030915966257453, + "step": 24640 + }, + { + "epoch": 0.8216666666666667, + "grad_norm": 24.375, + "grad_norm_var": 1.1309895833333334, + "learning_rate": 5.993804399909704e-05, + "loss": 7.0855, + "loss/crossentropy": 2.0983700484037398, + "loss/hidden": 3.289453125, + "loss/jsd": 0.0, + "loss/logits": 0.1855665436014533, + "step": 24650 + }, + { + "epoch": 0.822, + "grad_norm": 24.25, + "grad_norm_var": 0.6684895833333333, + "learning_rate": 5.97975019423867e-05, + "loss": 7.0473, + "loss/crossentropy": 2.035327473282814, + "loss/hidden": 3.296484375, + "loss/jsd": 0.0, + "loss/logits": 0.168466529622674, + "step": 24660 + }, + { + "epoch": 0.8223333333333334, + "grad_norm": 25.125, + "grad_norm_var": 0.9580729166666667, + "learning_rate": 5.9656912536269015e-05, + "loss": 7.1414, + "loss/crossentropy": 2.1208567664027216, + "loss/hidden": 3.266796875, + "loss/jsd": 0.0, + "loss/logits": 0.17649125978350638, + "step": 24670 + }, + { + "epoch": 0.8226666666666667, + "grad_norm": 22.875, + "grad_norm_var": 1.2436848958333333, + "learning_rate": 5.951627716830467e-05, + "loss": 7.0739, + "loss/crossentropy": 2.098179739713669, + "loss/hidden": 3.323046875, + "loss/jsd": 0.0, + "loss/logits": 0.16574389152228833, + "step": 24680 + }, + { + "epoch": 0.823, + "grad_norm": 24.75, + "grad_norm_var": 3.40625, + "learning_rate": 5.937559722650799e-05, + "loss": 7.0056, + "loss/crossentropy": 1.9866340532898903, + "loss/hidden": 3.21015625, + "loss/jsd": 0.0, + "loss/logits": 0.15324429739266635, + "step": 24690 + }, + { + "epoch": 0.8233333333333334, + "grad_norm": 23.75, + "grad_norm_var": 1.1768229166666666, + "learning_rate": 5.923487409933316e-05, + "loss": 7.0449, + "loss/crossentropy": 2.0189765483140945, + "loss/hidden": 3.226953125, + "loss/jsd": 0.0, + "loss/logits": 0.1633864961564541, + "step": 24700 + }, + { + "epoch": 0.8236666666666667, + "grad_norm": 24.75, + "grad_norm_var": 2.551041666666667, + "learning_rate": 5.909410917566066e-05, + "loss": 7.1172, + "loss/crossentropy": 1.9779780194163323, + "loss/hidden": 3.369921875, + "loss/jsd": 0.0, + "loss/logits": 0.16957656461745502, + "step": 24710 + }, + { + "epoch": 0.824, + "grad_norm": 23.0, + "grad_norm_var": 2.0374348958333335, + "learning_rate": 5.8953303844783456e-05, + "loss": 7.0346, + "loss/crossentropy": 2.046160864830017, + "loss/hidden": 3.26171875, + "loss/jsd": 0.0, + "loss/logits": 0.17642345037311316, + "step": 24720 + }, + { + "epoch": 0.8243333333333334, + "grad_norm": 25.625, + "grad_norm_var": 3.0152302976861993e+18, + "learning_rate": 5.881245949639331e-05, + "loss": 7.1386, + "loss/crossentropy": 2.0644598096609115, + "loss/hidden": 3.26953125, + "loss/jsd": 0.0, + "loss/logits": 0.16755576469004155, + "step": 24730 + }, + { + "epoch": 0.8246666666666667, + "grad_norm": 22.625, + "grad_norm_var": 1.5059895833333334, + "learning_rate": 5.86715775205671e-05, + "loss": 6.9054, + "loss/crossentropy": 1.9000537507236004, + "loss/hidden": 3.348828125, + "loss/jsd": 0.0, + "loss/logits": 0.16331597846001386, + "step": 24740 + }, + { + "epoch": 0.825, + "grad_norm": 23.375, + "grad_norm_var": 1.2119140625, + "learning_rate": 5.8530659307753036e-05, + "loss": 7.0357, + "loss/crossentropy": 1.8462681017816067, + "loss/hidden": 3.323046875, + "loss/jsd": 0.0, + "loss/logits": 0.1659678179770708, + "step": 24750 + }, + { + "epoch": 0.8253333333333334, + "grad_norm": 23.125, + "grad_norm_var": 1.2677083333333334, + "learning_rate": 5.838970624875698e-05, + "loss": 7.0251, + "loss/crossentropy": 2.1098924592137336, + "loss/hidden": 3.2328125, + "loss/jsd": 0.0, + "loss/logits": 0.16376893278211355, + "step": 24760 + }, + { + "epoch": 0.8256666666666667, + "grad_norm": 26.25, + "grad_norm_var": 1.5848307291666666, + "learning_rate": 5.824871973472874e-05, + "loss": 7.0887, + "loss/crossentropy": 1.9424061939120292, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.16495383866131305, + "step": 24770 + }, + { + "epoch": 0.826, + "grad_norm": 4898947072.0, + "grad_norm_var": 1.4999801359186788e+18, + "learning_rate": 5.8107701157148277e-05, + "loss": 7.0898, + "loss/crossentropy": 2.0248693346977236, + "loss/hidden": 3.4078125, + "loss/jsd": 0.0, + "loss/logits": 0.17345572579652072, + "step": 24780 + }, + { + "epoch": 0.8263333333333334, + "grad_norm": 25.625, + "grad_norm_var": 1.4999801358166175e+18, + "learning_rate": 5.796665190781201e-05, + "loss": 6.9785, + "loss/crossentropy": 1.9292976334691048, + "loss/hidden": 3.349609375, + "loss/jsd": 0.0, + "loss/logits": 0.17304837796837091, + "step": 24790 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 23.5, + "grad_norm_var": 32.87024739583333, + "learning_rate": 5.782557337881911e-05, + "loss": 7.1747, + "loss/crossentropy": 1.8966447107493878, + "loss/hidden": 3.32578125, + "loss/jsd": 0.0, + "loss/logits": 0.16593249971047044, + "step": 24800 + }, + { + "epoch": 0.827, + "grad_norm": 5066719232.0, + "grad_norm_var": 1.6044777186466813e+18, + "learning_rate": 5.768446696255769e-05, + "loss": 7.1285, + "loss/crossentropy": 1.9656455472111702, + "loss/hidden": 3.52421875, + "loss/jsd": 0.0, + "loss/logits": 0.17714616544544698, + "step": 24810 + }, + { + "epoch": 0.8273333333333334, + "grad_norm": 22.375, + "grad_norm_var": 1.6044777203778104e+18, + "learning_rate": 5.754333405169111e-05, + "loss": 6.9896, + "loss/crossentropy": 2.0341189607977865, + "loss/hidden": 3.303515625, + "loss/jsd": 0.0, + "loss/logits": 0.17659138450399042, + "step": 24820 + }, + { + "epoch": 0.8276666666666667, + "grad_norm": 25.25, + "grad_norm_var": 2.066666666666667, + "learning_rate": 5.740217603914423e-05, + "loss": 7.0219, + "loss/crossentropy": 1.9185968987643718, + "loss/hidden": 3.372265625, + "loss/jsd": 0.0, + "loss/logits": 0.1869355977512896, + "step": 24830 + }, + { + "epoch": 0.828, + "grad_norm": 24.625, + "grad_norm_var": 0.9889973958333333, + "learning_rate": 5.726099431808963e-05, + "loss": 7.1146, + "loss/crossentropy": 1.8310720384120942, + "loss/hidden": 3.32109375, + "loss/jsd": 0.0, + "loss/logits": 0.16051160339266063, + "step": 24840 + }, + { + "epoch": 0.8283333333333334, + "grad_norm": 24.875, + "grad_norm_var": 0.5910807291666667, + "learning_rate": 5.7119790281933914e-05, + "loss": 7.0388, + "loss/crossentropy": 1.9527421653270722, + "loss/hidden": 3.23203125, + "loss/jsd": 0.0, + "loss/logits": 0.16238325983285903, + "step": 24850 + }, + { + "epoch": 0.8286666666666667, + "grad_norm": 25.25, + "grad_norm_var": 89.24583333333334, + "learning_rate": 5.6978565324303926e-05, + "loss": 7.0605, + "loss/crossentropy": 2.168795867264271, + "loss/hidden": 3.24453125, + "loss/jsd": 0.0, + "loss/logits": 0.16646072771400214, + "step": 24860 + }, + { + "epoch": 0.829, + "grad_norm": 22.875, + "grad_norm_var": 89.57057291666666, + "learning_rate": 5.683732083903296e-05, + "loss": 7.0862, + "loss/crossentropy": 1.9611302673816682, + "loss/hidden": 3.26796875, + "loss/jsd": 0.0, + "loss/logits": 0.1669299216940999, + "step": 24870 + }, + { + "epoch": 0.8293333333333334, + "grad_norm": 23.25, + "grad_norm_var": 1.1806640625, + "learning_rate": 5.669605822014706e-05, + "loss": 7.0333, + "loss/crossentropy": 1.7856550820171833, + "loss/hidden": 3.243359375, + "loss/jsd": 0.0, + "loss/logits": 0.15327755445614458, + "step": 24880 + }, + { + "epoch": 0.8296666666666667, + "grad_norm": 24.75, + "grad_norm_var": 0.8072916666666666, + "learning_rate": 5.655477886185126e-05, + "loss": 7.0163, + "loss/crossentropy": 2.0259492844343185, + "loss/hidden": 3.324609375, + "loss/jsd": 0.0, + "loss/logits": 0.16790905371308326, + "step": 24890 + }, + { + "epoch": 0.83, + "grad_norm": 24.5, + "grad_norm_var": 0.8129557291666667, + "learning_rate": 5.641348415851577e-05, + "loss": 6.97, + "loss/crossentropy": 1.9602381430566311, + "loss/hidden": 3.240625, + "loss/jsd": 0.0, + "loss/logits": 0.1515656548552215, + "step": 24900 + }, + { + "epoch": 0.8303333333333334, + "grad_norm": 23.875, + "grad_norm_var": 0.9968098958333333, + "learning_rate": 5.62721755046623e-05, + "loss": 6.9703, + "loss/crossentropy": 2.0206805035471915, + "loss/hidden": 3.434765625, + "loss/jsd": 0.0, + "loss/logits": 0.18460522294044496, + "step": 24910 + }, + { + "epoch": 0.8306666666666667, + "grad_norm": 23.875, + "grad_norm_var": 0.8238932291666666, + "learning_rate": 5.61308542949502e-05, + "loss": 7.0159, + "loss/crossentropy": 1.9096168451011182, + "loss/hidden": 3.27421875, + "loss/jsd": 0.0, + "loss/logits": 0.16171670304611324, + "step": 24920 + }, + { + "epoch": 0.831, + "grad_norm": 24.75, + "grad_norm_var": 0.5723307291666667, + "learning_rate": 5.598952192416274e-05, + "loss": 7.0482, + "loss/crossentropy": 1.9502201959490777, + "loss/hidden": 3.251953125, + "loss/jsd": 0.0, + "loss/logits": 0.1745383620262146, + "step": 24930 + }, + { + "epoch": 0.8313333333333334, + "grad_norm": 26.875, + "grad_norm_var": 1.8233723958333334, + "learning_rate": 5.584817978719338e-05, + "loss": 7.0207, + "loss/crossentropy": 2.020615467429161, + "loss/hidden": 3.23203125, + "loss/jsd": 0.0, + "loss/logits": 0.1621037432923913, + "step": 24940 + }, + { + "epoch": 0.8316666666666667, + "grad_norm": 23.5, + "grad_norm_var": 0.9497395833333333, + "learning_rate": 5.570682927903194e-05, + "loss": 7.0464, + "loss/crossentropy": 1.9485878251492976, + "loss/hidden": 3.305078125, + "loss/jsd": 0.0, + "loss/logits": 0.18183569833636284, + "step": 24950 + }, + { + "epoch": 0.832, + "grad_norm": 22.625, + "grad_norm_var": 0.57265625, + "learning_rate": 5.556547179475088e-05, + "loss": 6.9525, + "loss/crossentropy": 2.003249977529049, + "loss/hidden": 3.172265625, + "loss/jsd": 0.0, + "loss/logits": 0.15923047866672277, + "step": 24960 + }, + { + "epoch": 0.8323333333333334, + "grad_norm": 24.375, + "grad_norm_var": 0.4988932291666667, + "learning_rate": 5.54241087294915e-05, + "loss": 7.0322, + "loss/crossentropy": 1.9297384425997735, + "loss/hidden": 3.25859375, + "loss/jsd": 0.0, + "loss/logits": 0.1763775937259197, + "step": 24970 + }, + { + "epoch": 0.8326666666666667, + "grad_norm": 21.75, + "grad_norm_var": 1.0848307291666666, + "learning_rate": 5.528274147845016e-05, + "loss": 6.994, + "loss/crossentropy": 2.1030173070728777, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.17390758330002426, + "step": 24980 + }, + { + "epoch": 0.833, + "grad_norm": 23.25, + "grad_norm_var": 1.94765625, + "learning_rate": 5.514137143686459e-05, + "loss": 7.0266, + "loss/crossentropy": 2.0625434547662733, + "loss/hidden": 3.228125, + "loss/jsd": 0.0, + "loss/logits": 0.1578001905232668, + "step": 24990 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 22.625, + "grad_norm_var": 0.7910807291666667, + "learning_rate": 5.500000000000001e-05, + "loss": 6.9959, + "loss/crossentropy": 2.1564994513988496, + "loss/hidden": 3.234765625, + "loss/jsd": 0.0, + "loss/logits": 0.16419483684003353, + "step": 25000 + }, + { + "epoch": 0.8336666666666667, + "grad_norm": 22.125, + "grad_norm_var": 1.3046223958333334, + "learning_rate": 5.485862856313543e-05, + "loss": 6.9643, + "loss/crossentropy": 1.9692196190357207, + "loss/hidden": 3.23046875, + "loss/jsd": 0.0, + "loss/logits": 0.17642919681966304, + "step": 25010 + }, + { + "epoch": 0.834, + "grad_norm": 25.0, + "grad_norm_var": 1.1301432291666667, + "learning_rate": 5.4717258521549855e-05, + "loss": 7.0045, + "loss/crossentropy": 1.896971306949854, + "loss/hidden": 3.316015625, + "loss/jsd": 0.0, + "loss/logits": 0.16555657889693975, + "step": 25020 + }, + { + "epoch": 0.8343333333333334, + "grad_norm": 23.0, + "grad_norm_var": 5.253125, + "learning_rate": 5.4575891270508526e-05, + "loss": 7.0076, + "loss/crossentropy": 1.9228805772960187, + "loss/hidden": 3.23671875, + "loss/jsd": 0.0, + "loss/logits": 0.1604012963362038, + "step": 25030 + }, + { + "epoch": 0.8346666666666667, + "grad_norm": 23.875, + "grad_norm_var": 4.692708333333333, + "learning_rate": 5.443452820524913e-05, + "loss": 7.1268, + "loss/crossentropy": 1.9976628370583058, + "loss/hidden": 3.33828125, + "loss/jsd": 0.0, + "loss/logits": 0.18949662614613771, + "step": 25040 + }, + { + "epoch": 0.835, + "grad_norm": 23.25, + "grad_norm_var": 0.9931640625, + "learning_rate": 5.429317072096808e-05, + "loss": 6.9171, + "loss/crossentropy": 1.95694150775671, + "loss/hidden": 3.30703125, + "loss/jsd": 0.0, + "loss/logits": 0.17416810244321823, + "step": 25050 + }, + { + "epoch": 0.8353333333333334, + "grad_norm": 23.0, + "grad_norm_var": 1.4254557291666667, + "learning_rate": 5.4151820212806633e-05, + "loss": 6.9901, + "loss/crossentropy": 1.967911347746849, + "loss/hidden": 3.346875, + "loss/jsd": 0.0, + "loss/logits": 0.16883484926074743, + "step": 25060 + }, + { + "epoch": 0.8356666666666667, + "grad_norm": 24.5, + "grad_norm_var": 1.0718098958333333, + "learning_rate": 5.401047807583728e-05, + "loss": 7.0916, + "loss/crossentropy": 1.9769475132226944, + "loss/hidden": 3.325390625, + "loss/jsd": 0.0, + "loss/logits": 0.1776350039988756, + "step": 25070 + }, + { + "epoch": 0.836, + "grad_norm": 24.0, + "grad_norm_var": 0.6205729166666667, + "learning_rate": 5.3869145705049814e-05, + "loss": 7.1016, + "loss/crossentropy": 2.0979168742895125, + "loss/hidden": 3.24453125, + "loss/jsd": 0.0, + "loss/logits": 0.17412376143038272, + "step": 25080 + }, + { + "epoch": 0.8363333333333334, + "grad_norm": 22.625, + "grad_norm_var": 1.0655598958333334, + "learning_rate": 5.372782449533771e-05, + "loss": 7.0239, + "loss/crossentropy": 2.2201668590307237, + "loss/hidden": 3.15703125, + "loss/jsd": 0.0, + "loss/logits": 0.16662366669625045, + "step": 25090 + }, + { + "epoch": 0.8366666666666667, + "grad_norm": 24.5, + "grad_norm_var": 0.7947916666666667, + "learning_rate": 5.358651584148423e-05, + "loss": 7.0296, + "loss/crossentropy": 1.9929725021123885, + "loss/hidden": 3.25859375, + "loss/jsd": 0.0, + "loss/logits": 0.16217339746654033, + "step": 25100 + }, + { + "epoch": 0.837, + "grad_norm": 23.5, + "grad_norm_var": 0.9895182291666667, + "learning_rate": 5.344522113814875e-05, + "loss": 6.9488, + "loss/crossentropy": 1.940374694764614, + "loss/hidden": 3.068359375, + "loss/jsd": 0.0, + "loss/logits": 0.15328829986974596, + "step": 25110 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 24.625, + "grad_norm_var": 1.365625, + "learning_rate": 5.330394177985295e-05, + "loss": 7.0492, + "loss/crossentropy": 1.9210307955741883, + "loss/hidden": 3.316796875, + "loss/jsd": 0.0, + "loss/logits": 0.16172695737332105, + "step": 25120 + }, + { + "epoch": 0.8376666666666667, + "grad_norm": 23.5, + "grad_norm_var": 1.39140625, + "learning_rate": 5.316267916096705e-05, + "loss": 6.838, + "loss/crossentropy": 2.1268305659294127, + "loss/hidden": 3.221484375, + "loss/jsd": 0.0, + "loss/logits": 0.1686540162190795, + "step": 25130 + }, + { + "epoch": 0.838, + "grad_norm": 23.125, + "grad_norm_var": 1.3035807291666666, + "learning_rate": 5.302143467569609e-05, + "loss": 7.0213, + "loss/crossentropy": 2.2016376689076425, + "loss/hidden": 3.239453125, + "loss/jsd": 0.0, + "loss/logits": 0.185344104655087, + "step": 25140 + }, + { + "epoch": 0.8383333333333334, + "grad_norm": 23.625, + "grad_norm_var": 0.7619140625, + "learning_rate": 5.288020971806609e-05, + "loss": 7.0272, + "loss/crossentropy": 1.9791180558502675, + "loss/hidden": 3.294140625, + "loss/jsd": 0.0, + "loss/logits": 0.16557303946465254, + "step": 25150 + }, + { + "epoch": 0.8386666666666667, + "grad_norm": 21.75, + "grad_norm_var": 0.9837890625, + "learning_rate": 5.273900568191038e-05, + "loss": 6.871, + "loss/crossentropy": 2.078375779092312, + "loss/hidden": 3.248046875, + "loss/jsd": 0.0, + "loss/logits": 0.1593981696292758, + "step": 25160 + }, + { + "epoch": 0.839, + "grad_norm": 22.375, + "grad_norm_var": 1.7968098958333334, + "learning_rate": 5.259782396085579e-05, + "loss": 6.963, + "loss/crossentropy": 1.9973531074821949, + "loss/hidden": 3.166796875, + "loss/jsd": 0.0, + "loss/logits": 0.15650712680071593, + "step": 25170 + }, + { + "epoch": 0.8393333333333334, + "grad_norm": 22.375, + "grad_norm_var": 1.8041666666666667, + "learning_rate": 5.24566659483089e-05, + "loss": 6.8987, + "loss/crossentropy": 1.9296558193862439, + "loss/hidden": 3.121875, + "loss/jsd": 0.0, + "loss/logits": 0.1516895718872547, + "step": 25180 + }, + { + "epoch": 0.8396666666666667, + "grad_norm": 21.75, + "grad_norm_var": 0.7635416666666667, + "learning_rate": 5.231553303744232e-05, + "loss": 6.9936, + "loss/crossentropy": 2.114676037430763, + "loss/hidden": 3.329296875, + "loss/jsd": 0.0, + "loss/logits": 0.17353012934327125, + "step": 25190 + }, + { + "epoch": 0.84, + "grad_norm": 22.5, + "grad_norm_var": 0.4205729166666667, + "learning_rate": 5.2174426621180906e-05, + "loss": 6.9546, + "loss/crossentropy": 2.0760419577360154, + "loss/hidden": 3.226171875, + "loss/jsd": 0.0, + "loss/logits": 0.16167073398828508, + "step": 25200 + }, + { + "epoch": 0.8403333333333334, + "grad_norm": 22.125, + "grad_norm_var": 2.5940733667158523e+18, + "learning_rate": 5.2033348092187996e-05, + "loss": 6.8677, + "loss/crossentropy": 1.9620779484510422, + "loss/hidden": 3.191796875, + "loss/jsd": 0.0, + "loss/logits": 0.1550289398059249, + "step": 25210 + }, + { + "epoch": 0.8406666666666667, + "grad_norm": 22.125, + "grad_norm_var": 2.594073366910468e+18, + "learning_rate": 5.189229884285174e-05, + "loss": 6.9043, + "loss/crossentropy": 1.9976623475551605, + "loss/hidden": 3.2375, + "loss/jsd": 0.0, + "loss/logits": 0.16729694679379464, + "step": 25220 + }, + { + "epoch": 0.841, + "grad_norm": 25.25, + "grad_norm_var": 2.8238932291666665, + "learning_rate": 5.175128026527128e-05, + "loss": 7.0103, + "loss/crossentropy": 2.192066043615341, + "loss/hidden": 3.353515625, + "loss/jsd": 0.0, + "loss/logits": 0.20732564926147462, + "step": 25230 + }, + { + "epoch": 0.8413333333333334, + "grad_norm": 23.375, + "grad_norm_var": 2.5020182291666666, + "learning_rate": 5.161029375124303e-05, + "loss": 6.9139, + "loss/crossentropy": 1.877561804652214, + "loss/hidden": 3.178515625, + "loss/jsd": 0.0, + "loss/logits": 0.15400900933891534, + "step": 25240 + }, + { + "epoch": 0.8416666666666667, + "grad_norm": 21.25, + "grad_norm_var": 1.5604166666666666, + "learning_rate": 5.1469340692246995e-05, + "loss": 6.9029, + "loss/crossentropy": 2.0022835403680803, + "loss/hidden": 3.1765625, + "loss/jsd": 0.0, + "loss/logits": 0.16735202725976706, + "step": 25250 + }, + { + "epoch": 0.842, + "grad_norm": 22.125, + "grad_norm_var": 4.509375, + "learning_rate": 5.1328422479432915e-05, + "loss": 6.9624, + "loss/crossentropy": 1.8773959062993526, + "loss/hidden": 3.25546875, + "loss/jsd": 0.0, + "loss/logits": 0.15194975724443793, + "step": 25260 + }, + { + "epoch": 0.8423333333333334, + "grad_norm": 21.75, + "grad_norm_var": 1.1247395833333333, + "learning_rate": 5.11875405036067e-05, + "loss": 6.9817, + "loss/crossentropy": 2.165113839507103, + "loss/hidden": 3.247265625, + "loss/jsd": 0.0, + "loss/logits": 0.16993321236222983, + "step": 25270 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 22.125, + "grad_norm_var": 0.9184895833333333, + "learning_rate": 5.104669615521657e-05, + "loss": 6.923, + "loss/crossentropy": 2.084018699079752, + "loss/hidden": 3.15546875, + "loss/jsd": 0.0, + "loss/logits": 0.16481912517920136, + "step": 25280 + }, + { + "epoch": 0.843, + "grad_norm": 22.375, + "grad_norm_var": 1.2830729166666666, + "learning_rate": 5.090589082433935e-05, + "loss": 6.9785, + "loss/crossentropy": 2.2565275222063064, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.17223725598305464, + "step": 25290 + }, + { + "epoch": 0.8433333333333334, + "grad_norm": 23.0, + "grad_norm_var": 1.3186848958333333, + "learning_rate": 5.076512590066685e-05, + "loss": 7.0165, + "loss/crossentropy": 2.0910873889923094, + "loss/hidden": 3.279296875, + "loss/jsd": 0.0, + "loss/logits": 0.16965348087251186, + "step": 25300 + }, + { + "epoch": 0.8436666666666667, + "grad_norm": 22.25, + "grad_norm_var": 1.065625, + "learning_rate": 5.062440277349203e-05, + "loss": 6.9454, + "loss/crossentropy": 2.2037932582199575, + "loss/hidden": 3.130078125, + "loss/jsd": 0.0, + "loss/logits": 0.16220169235020876, + "step": 25310 + }, + { + "epoch": 0.844, + "grad_norm": 22.75, + "grad_norm_var": 2.582747395833333, + "learning_rate": 5.048372283169532e-05, + "loss": 6.9965, + "loss/crossentropy": 2.1661527663469315, + "loss/hidden": 3.18984375, + "loss/jsd": 0.0, + "loss/logits": 0.16337131895124912, + "step": 25320 + }, + { + "epoch": 0.8443333333333334, + "grad_norm": 23.0, + "grad_norm_var": 1.9525390625, + "learning_rate": 5.0343087463730996e-05, + "loss": 6.9872, + "loss/crossentropy": 2.1112076193094254, + "loss/hidden": 3.278125, + "loss/jsd": 0.0, + "loss/logits": 0.16945380419492723, + "step": 25330 + }, + { + "epoch": 0.8446666666666667, + "grad_norm": 23.0, + "grad_norm_var": 0.6872395833333333, + "learning_rate": 5.020249805761331e-05, + "loss": 7.0327, + "loss/crossentropy": 1.9474051117897033, + "loss/hidden": 3.345703125, + "loss/jsd": 0.0, + "loss/logits": 0.1742366042919457, + "step": 25340 + }, + { + "epoch": 0.845, + "grad_norm": 22.25, + "grad_norm_var": 0.9494140625, + "learning_rate": 5.006195600090297e-05, + "loss": 7.0176, + "loss/crossentropy": 2.0254900440573693, + "loss/hidden": 3.362109375, + "loss/jsd": 0.0, + "loss/logits": 0.1812945833429694, + "step": 25350 + }, + { + "epoch": 0.8453333333333334, + "grad_norm": 22.5, + "grad_norm_var": 1.2004557291666667, + "learning_rate": 4.992146268069333e-05, + "loss": 6.899, + "loss/crossentropy": 1.9700914964079856, + "loss/hidden": 3.333984375, + "loss/jsd": 0.0, + "loss/logits": 0.17236013878136874, + "step": 25360 + }, + { + "epoch": 0.8456666666666667, + "grad_norm": 21.375, + "grad_norm_var": 1.1629557291666666, + "learning_rate": 4.9781019483596746e-05, + "loss": 6.9598, + "loss/crossentropy": 2.0217429384589196, + "loss/hidden": 3.219921875, + "loss/jsd": 0.0, + "loss/logits": 0.1671298835426569, + "step": 25370 + }, + { + "epoch": 0.846, + "grad_norm": 23.25, + "grad_norm_var": 1.7785807291666667, + "learning_rate": 4.9640627795730866e-05, + "loss": 6.88, + "loss/crossentropy": 1.943567543849349, + "loss/hidden": 3.249609375, + "loss/jsd": 0.0, + "loss/logits": 0.1804880647920072, + "step": 25380 + }, + { + "epoch": 0.8463333333333334, + "grad_norm": 24.875, + "grad_norm_var": 2.3009765625, + "learning_rate": 4.9500289002704984e-05, + "loss": 6.8981, + "loss/crossentropy": 2.006009988486767, + "loss/hidden": 3.3328125, + "loss/jsd": 0.0, + "loss/logits": 0.17958016656339168, + "step": 25390 + }, + { + "epoch": 0.8466666666666667, + "grad_norm": 22.5, + "grad_norm_var": 4.074934895833334, + "learning_rate": 4.936000448960631e-05, + "loss": 7.0513, + "loss/crossentropy": 2.2360173970460893, + "loss/hidden": 3.230078125, + "loss/jsd": 0.0, + "loss/logits": 0.18158553242683412, + "step": 25400 + }, + { + "epoch": 0.847, + "grad_norm": 24.0, + "grad_norm_var": 2.7462890625, + "learning_rate": 4.9219775640986366e-05, + "loss": 6.924, + "loss/crossentropy": 1.8591318547725677, + "loss/hidden": 3.469140625, + "loss/jsd": 0.0, + "loss/logits": 0.1813932742923498, + "step": 25410 + }, + { + "epoch": 0.8473333333333334, + "grad_norm": 22.75, + "grad_norm_var": 1.3337890625, + "learning_rate": 4.907960384084729e-05, + "loss": 6.9502, + "loss/crossentropy": 2.1037135615944864, + "loss/hidden": 3.21953125, + "loss/jsd": 0.0, + "loss/logits": 0.16334721986204387, + "step": 25420 + }, + { + "epoch": 0.8476666666666667, + "grad_norm": 22.375, + "grad_norm_var": 1.86015625, + "learning_rate": 4.8939490472628136e-05, + "loss": 6.8949, + "loss/crossentropy": 1.9349641531705857, + "loss/hidden": 3.32734375, + "loss/jsd": 0.0, + "loss/logits": 0.16478215027600526, + "step": 25430 + }, + { + "epoch": 0.848, + "grad_norm": 22.75, + "grad_norm_var": 0.5947265625, + "learning_rate": 4.87994369191913e-05, + "loss": 6.9594, + "loss/crossentropy": 2.1374287590384484, + "loss/hidden": 3.138671875, + "loss/jsd": 0.0, + "loss/logits": 0.1534802021458745, + "step": 25440 + }, + { + "epoch": 0.8483333333333334, + "grad_norm": 22.75, + "grad_norm_var": 0.9608723958333333, + "learning_rate": 4.865944456280879e-05, + "loss": 6.876, + "loss/crossentropy": 1.952788008749485, + "loss/hidden": 3.270703125, + "loss/jsd": 0.0, + "loss/logits": 0.16467729359865188, + "step": 25450 + }, + { + "epoch": 0.8486666666666667, + "grad_norm": 23.375, + "grad_norm_var": 11.014518229166667, + "learning_rate": 4.851951478514866e-05, + "loss": 7.014, + "loss/crossentropy": 2.1284357413649557, + "loss/hidden": 3.238671875, + "loss/jsd": 0.0, + "loss/logits": 0.16996914581395686, + "step": 25460 + }, + { + "epoch": 0.849, + "grad_norm": 23.25, + "grad_norm_var": 11.498958333333333, + "learning_rate": 4.837964896726132e-05, + "loss": 6.9063, + "loss/crossentropy": 2.096763235330582, + "loss/hidden": 3.18984375, + "loss/jsd": 0.0, + "loss/logits": 0.162611080147326, + "step": 25470 + }, + { + "epoch": 0.8493333333333334, + "grad_norm": 20.125, + "grad_norm_var": 1.0518229166666666, + "learning_rate": 4.823984848956593e-05, + "loss": 6.8635, + "loss/crossentropy": 2.0680422112345695, + "loss/hidden": 3.23984375, + "loss/jsd": 0.0, + "loss/logits": 0.15710455570369958, + "step": 25480 + }, + { + "epoch": 0.8496666666666667, + "grad_norm": 20.75, + "grad_norm_var": 1.9317057291666666, + "learning_rate": 4.810011473183677e-05, + "loss": 6.8994, + "loss/crossentropy": 2.0600294291973116, + "loss/hidden": 3.29375, + "loss/jsd": 0.0, + "loss/logits": 0.16484030187129975, + "step": 25490 + }, + { + "epoch": 0.85, + "grad_norm": 23.25, + "grad_norm_var": 8.0212890625, + "learning_rate": 4.7960449073189606e-05, + "loss": 6.9659, + "loss/crossentropy": 1.971318671107292, + "loss/hidden": 3.330859375, + "loss/jsd": 0.0, + "loss/logits": 0.18275586236268282, + "step": 25500 + }, + { + "epoch": 0.8503333333333334, + "grad_norm": 21.875, + "grad_norm_var": 0.8061848958333333, + "learning_rate": 4.7820852892068114e-05, + "loss": 6.9837, + "loss/crossentropy": 1.809413194656372, + "loss/hidden": 3.2796875, + "loss/jsd": 0.0, + "loss/logits": 0.15677141044288873, + "step": 25510 + }, + { + "epoch": 0.8506666666666667, + "grad_norm": 22.5, + "grad_norm_var": 0.7009765625, + "learning_rate": 4.768132756623024e-05, + "loss": 6.8624, + "loss/crossentropy": 1.8316463023424148, + "loss/hidden": 3.30859375, + "loss/jsd": 0.0, + "loss/logits": 0.1573034648783505, + "step": 25520 + }, + { + "epoch": 0.851, + "grad_norm": 21.0, + "grad_norm_var": 0.953125, + "learning_rate": 4.754187447273461e-05, + "loss": 6.8507, + "loss/crossentropy": 1.9352269530296327, + "loss/hidden": 3.27578125, + "loss/jsd": 0.0, + "loss/logits": 0.15878485683351756, + "step": 25530 + }, + { + "epoch": 0.8513333333333334, + "grad_norm": 22.25, + "grad_norm_var": 1.61015625, + "learning_rate": 4.740249498792698e-05, + "loss": 6.8568, + "loss/crossentropy": 2.010533457994461, + "loss/hidden": 3.3015625, + "loss/jsd": 0.0, + "loss/logits": 0.17276135310530663, + "step": 25540 + }, + { + "epoch": 0.8516666666666667, + "grad_norm": 21.375, + "grad_norm_var": 0.8395833333333333, + "learning_rate": 4.7263190487426564e-05, + "loss": 6.9387, + "loss/crossentropy": 2.134304754436016, + "loss/hidden": 3.274609375, + "loss/jsd": 0.0, + "loss/logits": 0.1872491927817464, + "step": 25550 + }, + { + "epoch": 0.852, + "grad_norm": 21.75, + "grad_norm_var": 0.9747395833333333, + "learning_rate": 4.7123962346112584e-05, + "loss": 6.886, + "loss/crossentropy": 2.072511524707079, + "loss/hidden": 3.144921875, + "loss/jsd": 0.0, + "loss/logits": 0.1534841218031943, + "step": 25560 + }, + { + "epoch": 0.8523333333333334, + "grad_norm": 23.375, + "grad_norm_var": 0.7905598958333333, + "learning_rate": 4.698481193811054e-05, + "loss": 6.8584, + "loss/crossentropy": 2.1463774725794793, + "loss/hidden": 3.184375, + "loss/jsd": 0.0, + "loss/logits": 0.16341485381126403, + "step": 25570 + }, + { + "epoch": 0.8526666666666667, + "grad_norm": 21.125, + "grad_norm_var": 2.0010416666666666, + "learning_rate": 4.684574063677881e-05, + "loss": 6.9106, + "loss/crossentropy": 2.065951754152775, + "loss/hidden": 3.2859375, + "loss/jsd": 0.0, + "loss/logits": 0.16066975481808185, + "step": 25580 + }, + { + "epoch": 0.853, + "grad_norm": 29.5, + "grad_norm_var": 5.664518229166666, + "learning_rate": 4.6706749814694997e-05, + "loss": 6.7972, + "loss/crossentropy": 2.0351175434887407, + "loss/hidden": 3.193359375, + "loss/jsd": 0.0, + "loss/logits": 0.16005632225424052, + "step": 25590 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 21.75, + "grad_norm_var": 6.993684895833334, + "learning_rate": 4.6567840843642384e-05, + "loss": 6.9496, + "loss/crossentropy": 2.0523830361664297, + "loss/hidden": 3.146875, + "loss/jsd": 0.0, + "loss/logits": 0.15730505622923374, + "step": 25600 + }, + { + "epoch": 0.8536666666666667, + "grad_norm": 23.0, + "grad_norm_var": 7.039518229166666, + "learning_rate": 4.642901509459646e-05, + "loss": 6.7785, + "loss/crossentropy": 2.1218873113393784, + "loss/hidden": 3.20703125, + "loss/jsd": 0.0, + "loss/logits": 0.16152856182307004, + "step": 25610 + }, + { + "epoch": 0.854, + "grad_norm": 21.625, + "grad_norm_var": 6.968489583333334, + "learning_rate": 4.629027393771129e-05, + "loss": 6.8866, + "loss/crossentropy": 1.9210114896297454, + "loss/hidden": 3.155078125, + "loss/jsd": 0.0, + "loss/logits": 0.14758066833019257, + "step": 25620 + }, + { + "epoch": 0.8543333333333333, + "grad_norm": 23.5, + "grad_norm_var": 1.2405598958333333, + "learning_rate": 4.61516187423061e-05, + "loss": 6.899, + "loss/crossentropy": 2.0598912209272386, + "loss/hidden": 3.251953125, + "loss/jsd": 0.0, + "loss/logits": 0.1691096406430006, + "step": 25630 + }, + { + "epoch": 0.8546666666666667, + "grad_norm": 23.875, + "grad_norm_var": 1.0854166666666667, + "learning_rate": 4.601305087685169e-05, + "loss": 6.9173, + "loss/crossentropy": 2.205300694704056, + "loss/hidden": 3.23671875, + "loss/jsd": 0.0, + "loss/logits": 0.17890902925282717, + "step": 25640 + }, + { + "epoch": 0.855, + "grad_norm": 22.0, + "grad_norm_var": 1.8083333333333333, + "learning_rate": 4.587457170895696e-05, + "loss": 6.868, + "loss/crossentropy": 2.148914474248886, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.17795586232095956, + "step": 25650 + }, + { + "epoch": 0.8553333333333333, + "grad_norm": 21.25, + "grad_norm_var": 2.0822265625, + "learning_rate": 4.573618260535536e-05, + "loss": 6.8924, + "loss/crossentropy": 1.9687716513872147, + "loss/hidden": 3.313671875, + "loss/jsd": 0.0, + "loss/logits": 0.18561361059546472, + "step": 25660 + }, + { + "epoch": 0.8556666666666667, + "grad_norm": 21.5, + "grad_norm_var": 284.1080729166667, + "learning_rate": 4.559788493189149e-05, + "loss": 6.8702, + "loss/crossentropy": 2.0562238790094853, + "loss/hidden": 3.18359375, + "loss/jsd": 0.0, + "loss/logits": 0.1623332142829895, + "step": 25670 + }, + { + "epoch": 0.856, + "grad_norm": 20.5, + "grad_norm_var": 282.97682291666666, + "learning_rate": 4.545968005350756e-05, + "loss": 6.8716, + "loss/crossentropy": 2.110888344049454, + "loss/hidden": 3.22578125, + "loss/jsd": 0.0, + "loss/logits": 0.16720662415027618, + "step": 25680 + }, + { + "epoch": 0.8563333333333333, + "grad_norm": 21.25, + "grad_norm_var": 0.49765625, + "learning_rate": 4.5321569334229916e-05, + "loss": 6.8537, + "loss/crossentropy": 1.988807225972414, + "loss/hidden": 3.259375, + "loss/jsd": 0.0, + "loss/logits": 0.16363519094884396, + "step": 25690 + }, + { + "epoch": 0.8566666666666667, + "grad_norm": 21.5, + "grad_norm_var": 4.025, + "learning_rate": 4.5183554137155606e-05, + "loss": 6.84, + "loss/crossentropy": 1.9043106943368913, + "loss/hidden": 3.165625, + "loss/jsd": 0.0, + "loss/logits": 0.14644915759563445, + "step": 25700 + }, + { + "epoch": 0.857, + "grad_norm": 24.0, + "grad_norm_var": 2.8580729166666665, + "learning_rate": 4.504563582443889e-05, + "loss": 6.964, + "loss/crossentropy": 2.051722328364849, + "loss/hidden": 3.206640625, + "loss/jsd": 0.0, + "loss/logits": 0.17551100347191095, + "step": 25710 + }, + { + "epoch": 0.8573333333333333, + "grad_norm": 21.625, + "grad_norm_var": 0.96640625, + "learning_rate": 4.490781575727786e-05, + "loss": 6.8005, + "loss/crossentropy": 2.009612035751343, + "loss/hidden": 3.187109375, + "loss/jsd": 0.0, + "loss/logits": 0.14919841345399618, + "step": 25720 + }, + { + "epoch": 0.8576666666666667, + "grad_norm": 22.125, + "grad_norm_var": 0.5322916666666667, + "learning_rate": 4.4770095295900924e-05, + "loss": 6.8377, + "loss/crossentropy": 2.0377252414822578, + "loss/hidden": 3.158203125, + "loss/jsd": 0.0, + "loss/logits": 0.18529058247804642, + "step": 25730 + }, + { + "epoch": 0.858, + "grad_norm": 5771362304.0, + "grad_norm_var": 2.0817889116463037e+18, + "learning_rate": 4.463247579955344e-05, + "loss": 7.0199, + "loss/crossentropy": 1.9959793724119663, + "loss/hidden": 3.283203125, + "loss/jsd": 0.0, + "loss/logits": 0.16647218465805053, + "step": 25740 + }, + { + "epoch": 0.8583333333333333, + "grad_norm": 21.125, + "grad_norm_var": 2.081788911934872e+18, + "learning_rate": 4.4494958626484276e-05, + "loss": 6.8499, + "loss/crossentropy": 2.0159687541425226, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.15637603402137756, + "step": 25750 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 22.125, + "grad_norm_var": 1.3205729166666667, + "learning_rate": 4.43575451339324e-05, + "loss": 6.9875, + "loss/crossentropy": 2.023764471709728, + "loss/hidden": 3.19609375, + "loss/jsd": 0.0, + "loss/logits": 0.15618936270475386, + "step": 25760 + }, + { + "epoch": 0.859, + "grad_norm": 20.75, + "grad_norm_var": 4.481184895833334, + "learning_rate": 4.4220236678113536e-05, + "loss": 6.8386, + "loss/crossentropy": 2.0717529535293577, + "loss/hidden": 3.173828125, + "loss/jsd": 0.0, + "loss/logits": 0.1514882566407323, + "step": 25770 + }, + { + "epoch": 0.8593333333333333, + "grad_norm": 22.875, + "grad_norm_var": 4.6025390625, + "learning_rate": 4.4083034614206674e-05, + "loss": 6.9052, + "loss/crossentropy": 2.154834459722042, + "loss/hidden": 3.181640625, + "loss/jsd": 0.0, + "loss/logits": 0.15938506573438643, + "step": 25780 + }, + { + "epoch": 0.8596666666666667, + "grad_norm": 22.5, + "grad_norm_var": 1.3811848958333333, + "learning_rate": 4.3945940296340824e-05, + "loss": 6.9774, + "loss/crossentropy": 2.141025458276272, + "loss/hidden": 3.224609375, + "loss/jsd": 0.0, + "loss/logits": 0.1737861094996333, + "step": 25790 + }, + { + "epoch": 0.86, + "grad_norm": 21.125, + "grad_norm_var": 1.7874348958333333, + "learning_rate": 4.380895507758155e-05, + "loss": 6.8555, + "loss/crossentropy": 1.9630529195070268, + "loss/hidden": 3.222265625, + "loss/jsd": 0.0, + "loss/logits": 0.16939648147672415, + "step": 25800 + }, + { + "epoch": 0.8603333333333333, + "grad_norm": 20.375, + "grad_norm_var": 1.5978515625, + "learning_rate": 4.367208030991764e-05, + "loss": 6.8227, + "loss/crossentropy": 1.893832840025425, + "loss/hidden": 3.196484375, + "loss/jsd": 0.0, + "loss/logits": 0.15906968284398318, + "step": 25810 + }, + { + "epoch": 0.8606666666666667, + "grad_norm": 22.25, + "grad_norm_var": 1.7520182291666666, + "learning_rate": 4.353531734424782e-05, + "loss": 6.9535, + "loss/crossentropy": 1.9903650164604187, + "loss/hidden": 3.324609375, + "loss/jsd": 0.0, + "loss/logits": 0.17482327315956353, + "step": 25820 + }, + { + "epoch": 0.861, + "grad_norm": 22.375, + "grad_norm_var": 0.61640625, + "learning_rate": 4.3398667530367306e-05, + "loss": 6.8628, + "loss/crossentropy": 2.053640615940094, + "loss/hidden": 3.285546875, + "loss/jsd": 0.0, + "loss/logits": 0.19147922191768885, + "step": 25830 + }, + { + "epoch": 0.8613333333333333, + "grad_norm": 24.625, + "grad_norm_var": 56.209309895833336, + "learning_rate": 4.3262132216954656e-05, + "loss": 6.9087, + "loss/crossentropy": 2.0847674936056135, + "loss/hidden": 3.31015625, + "loss/jsd": 0.0, + "loss/logits": 0.17146144881844522, + "step": 25840 + }, + { + "epoch": 0.8616666666666667, + "grad_norm": 20.75, + "grad_norm_var": 3.0233723958333334, + "learning_rate": 4.312571275155823e-05, + "loss": 6.8788, + "loss/crossentropy": 2.0731761664152146, + "loss/hidden": 3.144921875, + "loss/jsd": 0.0, + "loss/logits": 0.16782324127852916, + "step": 25850 + }, + { + "epoch": 0.862, + "grad_norm": 22.875, + "grad_norm_var": 2.468489583333333, + "learning_rate": 4.2989410480583116e-05, + "loss": 6.8898, + "loss/crossentropy": 1.9479005321860314, + "loss/hidden": 3.233984375, + "loss/jsd": 0.0, + "loss/logits": 0.15610639620572328, + "step": 25860 + }, + { + "epoch": 0.8623333333333333, + "grad_norm": 26.0, + "grad_norm_var": 489.34583333333336, + "learning_rate": 4.285322674927768e-05, + "loss": 6.858, + "loss/crossentropy": 1.9504839967936278, + "loss/hidden": 3.195703125, + "loss/jsd": 0.0, + "loss/logits": 0.16356785856187345, + "step": 25870 + }, + { + "epoch": 0.8626666666666667, + "grad_norm": 20.375, + "grad_norm_var": 2.1858723958333335, + "learning_rate": 4.271716290172038e-05, + "loss": 6.9462, + "loss/crossentropy": 2.123225097358227, + "loss/hidden": 3.27109375, + "loss/jsd": 0.0, + "loss/logits": 0.16322279013693333, + "step": 25880 + }, + { + "epoch": 0.863, + "grad_norm": 21.5, + "grad_norm_var": 1.52265625, + "learning_rate": 4.258122028080646e-05, + "loss": 6.8839, + "loss/crossentropy": 2.133731837570667, + "loss/hidden": 3.16640625, + "loss/jsd": 0.0, + "loss/logits": 0.16982710380107163, + "step": 25890 + }, + { + "epoch": 0.8633333333333333, + "grad_norm": 21.375, + "grad_norm_var": 1.2479166666666666, + "learning_rate": 4.2445400228234686e-05, + "loss": 6.8131, + "loss/crossentropy": 2.0845893740653993, + "loss/hidden": 3.18203125, + "loss/jsd": 0.0, + "loss/logits": 0.17494960688054562, + "step": 25900 + }, + { + "epoch": 0.8636666666666667, + "grad_norm": 24.625, + "grad_norm_var": 1.909375, + "learning_rate": 4.230970408449418e-05, + "loss": 6.8778, + "loss/crossentropy": 2.051340754330158, + "loss/hidden": 3.279296875, + "loss/jsd": 0.0, + "loss/logits": 0.17781901303678752, + "step": 25910 + }, + { + "epoch": 0.864, + "grad_norm": 20.875, + "grad_norm_var": 2.33515625, + "learning_rate": 4.217413318885108e-05, + "loss": 6.8758, + "loss/crossentropy": 2.0643552422523497, + "loss/hidden": 3.170703125, + "loss/jsd": 0.0, + "loss/logits": 0.1577897410839796, + "step": 25920 + }, + { + "epoch": 0.8643333333333333, + "grad_norm": 24.875, + "grad_norm_var": 1.6145182291666667, + "learning_rate": 4.203868887933541e-05, + "loss": 6.8634, + "loss/crossentropy": 2.0819766454398634, + "loss/hidden": 3.286328125, + "loss/jsd": 0.0, + "loss/logits": 0.16645964570343494, + "step": 25930 + }, + { + "epoch": 0.8646666666666667, + "grad_norm": 22.125, + "grad_norm_var": 1.2541666666666667, + "learning_rate": 4.190337249272778e-05, + "loss": 6.7763, + "loss/crossentropy": 2.0638196393847466, + "loss/hidden": 3.2515625, + "loss/jsd": 0.0, + "loss/logits": 0.1723222305998206, + "step": 25940 + }, + { + "epoch": 0.865, + "grad_norm": 21.375, + "grad_norm_var": 0.8973307291666667, + "learning_rate": 4.176818536454633e-05, + "loss": 6.9368, + "loss/crossentropy": 1.9603420421481133, + "loss/hidden": 3.14609375, + "loss/jsd": 0.0, + "loss/logits": 0.14999181237071751, + "step": 25950 + }, + { + "epoch": 0.8653333333333333, + "grad_norm": 21.625, + "grad_norm_var": 0.8802083333333334, + "learning_rate": 4.163312882903344e-05, + "loss": 6.7771, + "loss/crossentropy": 2.026094362139702, + "loss/hidden": 3.26875, + "loss/jsd": 0.0, + "loss/logits": 0.15913072023540736, + "step": 25960 + }, + { + "epoch": 0.8656666666666667, + "grad_norm": 24.5, + "grad_norm_var": 1.1759765625, + "learning_rate": 4.1498204219142575e-05, + "loss": 6.8418, + "loss/crossentropy": 2.0223079532384873, + "loss/hidden": 3.153125, + "loss/jsd": 0.0, + "loss/logits": 0.1593662802129984, + "step": 25970 + }, + { + "epoch": 0.866, + "grad_norm": 21.25, + "grad_norm_var": 1.265625, + "learning_rate": 4.1363412866525185e-05, + "loss": 6.8294, + "loss/crossentropy": 2.1484047800302504, + "loss/hidden": 3.16875, + "loss/jsd": 0.0, + "loss/logits": 0.15932908514514565, + "step": 25980 + }, + { + "epoch": 0.8663333333333333, + "grad_norm": 21.75, + "grad_norm_var": 0.91640625, + "learning_rate": 4.1228756101517475e-05, + "loss": 6.7557, + "loss/crossentropy": 1.8646988950669765, + "loss/hidden": 3.26953125, + "loss/jsd": 0.0, + "loss/logits": 0.17402277877554298, + "step": 25990 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 21.875, + "grad_norm_var": 0.76640625, + "learning_rate": 4.109423525312738e-05, + "loss": 6.9098, + "loss/crossentropy": 2.212962034344673, + "loss/hidden": 3.174609375, + "loss/jsd": 0.0, + "loss/logits": 0.16355629544705153, + "step": 26000 + }, + { + "epoch": 0.867, + "grad_norm": 23.0, + "grad_norm_var": 0.5978515625, + "learning_rate": 4.0959851649021344e-05, + "loss": 6.9753, + "loss/crossentropy": 1.9954494401812553, + "loss/hidden": 3.270703125, + "loss/jsd": 0.0, + "loss/logits": 0.17405376564711333, + "step": 26010 + }, + { + "epoch": 0.8673333333333333, + "grad_norm": 22.375, + "grad_norm_var": 0.75390625, + "learning_rate": 4.0825606615511305e-05, + "loss": 7.0159, + "loss/crossentropy": 2.1162398613989355, + "loss/hidden": 3.087890625, + "loss/jsd": 0.0, + "loss/logits": 0.14784672670066357, + "step": 26020 + }, + { + "epoch": 0.8676666666666667, + "grad_norm": 21.0, + "grad_norm_var": 2.594073367574846e+18, + "learning_rate": 4.069150147754151e-05, + "loss": 6.9345, + "loss/crossentropy": 1.9555442228913307, + "loss/hidden": 3.191796875, + "loss/jsd": 0.0, + "loss/logits": 0.157724441960454, + "step": 26030 + }, + { + "epoch": 0.868, + "grad_norm": 22.25, + "grad_norm_var": 0.5895833333333333, + "learning_rate": 4.0557537558675583e-05, + "loss": 7.0644, + "loss/crossentropy": 2.1746340721845625, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.1773978678509593, + "step": 26040 + }, + { + "epoch": 0.8683333333333333, + "grad_norm": 24.25, + "grad_norm_var": 1.3176432291666667, + "learning_rate": 4.042371618108329e-05, + "loss": 6.7896, + "loss/crossentropy": 1.9868990987539292, + "loss/hidden": 3.216796875, + "loss/jsd": 0.0, + "loss/logits": 0.15773731619119644, + "step": 26050 + }, + { + "epoch": 0.8686666666666667, + "grad_norm": 20.875, + "grad_norm_var": 1.96015625, + "learning_rate": 4.0290038665527596e-05, + "loss": 6.8032, + "loss/crossentropy": 2.1249034658074377, + "loss/hidden": 3.162109375, + "loss/jsd": 0.0, + "loss/logits": 0.1534424176439643, + "step": 26060 + }, + { + "epoch": 0.869, + "grad_norm": 24.125, + "grad_norm_var": 2.8893229166666665, + "learning_rate": 4.015650633135163e-05, + "loss": 6.861, + "loss/crossentropy": 2.0854105949401855, + "loss/hidden": 3.12265625, + "loss/jsd": 0.0, + "loss/logits": 0.16376592293381692, + "step": 26070 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 20.375, + "grad_norm_var": 1.4309895833333333, + "learning_rate": 4.00231204964656e-05, + "loss": 6.8508, + "loss/crossentropy": 2.0845695704221727, + "loss/hidden": 3.164453125, + "loss/jsd": 0.0, + "loss/logits": 0.15720440819859505, + "step": 26080 + }, + { + "epoch": 0.8696666666666667, + "grad_norm": 23.375, + "grad_norm_var": 1.2332682291666666, + "learning_rate": 3.9889882477333874e-05, + "loss": 6.7856, + "loss/crossentropy": 1.9149738550186157, + "loss/hidden": 3.256640625, + "loss/jsd": 0.0, + "loss/logits": 0.15268718972802162, + "step": 26090 + }, + { + "epoch": 0.87, + "grad_norm": 22.625, + "grad_norm_var": 2.383072916666667, + "learning_rate": 3.9756793588961896e-05, + "loss": 6.895, + "loss/crossentropy": 2.0855982795357706, + "loss/hidden": 3.18046875, + "loss/jsd": 0.0, + "loss/logits": 0.15859134048223494, + "step": 26100 + }, + { + "epoch": 0.8703333333333333, + "grad_norm": 23.875, + "grad_norm_var": 1.2754557291666666, + "learning_rate": 3.962385514488326e-05, + "loss": 6.8528, + "loss/crossentropy": 2.095623345673084, + "loss/hidden": 3.217578125, + "loss/jsd": 0.0, + "loss/logits": 0.1637012053281069, + "step": 26110 + }, + { + "epoch": 0.8706666666666667, + "grad_norm": 21.375, + "grad_norm_var": 1.7705729166666666, + "learning_rate": 3.949106845714674e-05, + "loss": 6.8451, + "loss/crossentropy": 1.8034477911889553, + "loss/hidden": 3.085546875, + "loss/jsd": 0.0, + "loss/logits": 0.14811227219179274, + "step": 26120 + }, + { + "epoch": 0.871, + "grad_norm": 22.375, + "grad_norm_var": 14.495247395833333, + "learning_rate": 3.9358434836303336e-05, + "loss": 6.8776, + "loss/crossentropy": 1.918418012559414, + "loss/hidden": 3.282421875, + "loss/jsd": 0.0, + "loss/logits": 0.16268355417996644, + "step": 26130 + }, + { + "epoch": 0.8713333333333333, + "grad_norm": 21.0, + "grad_norm_var": 1.4895833333333333, + "learning_rate": 3.922595559139336e-05, + "loss": 6.8094, + "loss/crossentropy": 2.0094055980443954, + "loss/hidden": 3.13984375, + "loss/jsd": 0.0, + "loss/logits": 0.14857212770730258, + "step": 26140 + }, + { + "epoch": 0.8716666666666667, + "grad_norm": 21.75, + "grad_norm_var": 1.16640625, + "learning_rate": 3.9093632029933435e-05, + "loss": 6.8686, + "loss/crossentropy": 1.9957379199564458, + "loss/hidden": 3.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.14984978251159192, + "step": 26150 + }, + { + "epoch": 0.872, + "grad_norm": 20.75, + "grad_norm_var": 0.4947916666666667, + "learning_rate": 3.896146545790372e-05, + "loss": 6.7922, + "loss/crossentropy": 2.0107031047344206, + "loss/hidden": 3.17421875, + "loss/jsd": 0.0, + "loss/logits": 0.15825871471315622, + "step": 26160 + }, + { + "epoch": 0.8723333333333333, + "grad_norm": 22.375, + "grad_norm_var": 0.7457682291666666, + "learning_rate": 3.882945717973493e-05, + "loss": 6.873, + "loss/crossentropy": 1.9550271481275558, + "loss/hidden": 3.1921875, + "loss/jsd": 0.0, + "loss/logits": 0.15566041497513652, + "step": 26170 + }, + { + "epoch": 0.8726666666666667, + "grad_norm": 21.75, + "grad_norm_var": 0.5197916666666667, + "learning_rate": 3.8697608498295445e-05, + "loss": 6.8371, + "loss/crossentropy": 2.016065427660942, + "loss/hidden": 3.176953125, + "loss/jsd": 0.0, + "loss/logits": 0.1619328921660781, + "step": 26180 + }, + { + "epoch": 0.873, + "grad_norm": 20.625, + "grad_norm_var": 0.97890625, + "learning_rate": 3.856592071487856e-05, + "loss": 6.8235, + "loss/crossentropy": 2.003811553120613, + "loss/hidden": 3.15234375, + "loss/jsd": 0.0, + "loss/logits": 0.1609561923891306, + "step": 26190 + }, + { + "epoch": 0.8733333333333333, + "grad_norm": 23.0, + "grad_norm_var": 1.50390625, + "learning_rate": 3.843439512918949e-05, + "loss": 6.8469, + "loss/crossentropy": 2.0764830335974693, + "loss/hidden": 3.256640625, + "loss/jsd": 0.0, + "loss/logits": 0.16151853874325753, + "step": 26200 + }, + { + "epoch": 0.8736666666666667, + "grad_norm": 21.5, + "grad_norm_var": 1.196875, + "learning_rate": 3.830303303933271e-05, + "loss": 6.7814, + "loss/crossentropy": 2.1526197090744974, + "loss/hidden": 3.108984375, + "loss/jsd": 0.0, + "loss/logits": 0.1596878958866, + "step": 26210 + }, + { + "epoch": 0.874, + "grad_norm": 20.75, + "grad_norm_var": 2.0893229166666667, + "learning_rate": 3.817183574179899e-05, + "loss": 6.9767, + "loss/crossentropy": 2.1804853290319444, + "loss/hidden": 3.18671875, + "loss/jsd": 0.0, + "loss/logits": 0.16332378438673914, + "step": 26220 + }, + { + "epoch": 0.8743333333333333, + "grad_norm": 22.75, + "grad_norm_var": 1.4337890625, + "learning_rate": 3.804080453145269e-05, + "loss": 6.8338, + "loss/crossentropy": 2.0908204093575478, + "loss/hidden": 3.300390625, + "loss/jsd": 0.0, + "loss/logits": 0.16485908310860395, + "step": 26230 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 21.875, + "grad_norm_var": 0.5223307291666667, + "learning_rate": 3.790994070151895e-05, + "loss": 6.8804, + "loss/crossentropy": 2.14748295545578, + "loss/hidden": 3.276953125, + "loss/jsd": 0.0, + "loss/logits": 0.17930770702660084, + "step": 26240 + }, + { + "epoch": 0.875, + "grad_norm": 20.25, + "grad_norm_var": 0.7072916666666667, + "learning_rate": 3.777924554357096e-05, + "loss": 6.7229, + "loss/crossentropy": 2.000728341937065, + "loss/hidden": 3.21328125, + "loss/jsd": 0.0, + "loss/logits": 0.1487280648201704, + "step": 26250 + }, + { + "epoch": 0.8753333333333333, + "grad_norm": 21.125, + "grad_norm_var": 2.13515625, + "learning_rate": 3.7648720347517166e-05, + "loss": 6.7624, + "loss/crossentropy": 1.9981106102466584, + "loss/hidden": 3.25703125, + "loss/jsd": 0.0, + "loss/logits": 0.15397127764299512, + "step": 26260 + }, + { + "epoch": 0.8756666666666667, + "grad_norm": 20.375, + "grad_norm_var": 1.5372395833333334, + "learning_rate": 3.7518366401588536e-05, + "loss": 6.8671, + "loss/crossentropy": 2.229478067159653, + "loss/hidden": 3.126171875, + "loss/jsd": 0.0, + "loss/logits": 0.15968595184385775, + "step": 26270 + }, + { + "epoch": 0.876, + "grad_norm": 21.625, + "grad_norm_var": 1.2525390625, + "learning_rate": 3.738818499232589e-05, + "loss": 6.786, + "loss/crossentropy": 2.017266020178795, + "loss/hidden": 3.18828125, + "loss/jsd": 0.0, + "loss/logits": 0.16191368382424115, + "step": 26280 + }, + { + "epoch": 0.8763333333333333, + "grad_norm": 21.75, + "grad_norm_var": 1.1780598958333333, + "learning_rate": 3.725817740456721e-05, + "loss": 7.0003, + "loss/crossentropy": 2.014402036368847, + "loss/hidden": 3.177734375, + "loss/jsd": 0.0, + "loss/logits": 0.15464963000267745, + "step": 26290 + }, + { + "epoch": 0.8766666666666667, + "grad_norm": 20.5, + "grad_norm_var": 1.271875, + "learning_rate": 3.712834492143488e-05, + "loss": 6.856, + "loss/crossentropy": 1.9306327871978284, + "loss/hidden": 3.261328125, + "loss/jsd": 0.0, + "loss/logits": 0.15594524987973274, + "step": 26300 + }, + { + "epoch": 0.877, + "grad_norm": 21.0, + "grad_norm_var": 0.88515625, + "learning_rate": 3.699868882432309e-05, + "loss": 6.8967, + "loss/crossentropy": 1.9130300246179104, + "loss/hidden": 3.09765625, + "loss/jsd": 0.0, + "loss/logits": 0.13649473995901645, + "step": 26310 + }, + { + "epoch": 0.8773333333333333, + "grad_norm": 21.5, + "grad_norm_var": 2.0336566906021478e+18, + "learning_rate": 3.686921039288519e-05, + "loss": 7.0264, + "loss/crossentropy": 2.181557595729828, + "loss/hidden": 3.279296875, + "loss/jsd": 0.0, + "loss/logits": 0.16082917023450136, + "step": 26320 + }, + { + "epoch": 0.8776666666666667, + "grad_norm": 22.0, + "grad_norm_var": 2.0336566913151795e+18, + "learning_rate": 3.673991090502101e-05, + "loss": 6.8328, + "loss/crossentropy": 1.9753350079059602, + "loss/hidden": 3.13828125, + "loss/jsd": 0.0, + "loss/logits": 0.1500071782618761, + "step": 26330 + }, + { + "epoch": 0.878, + "grad_norm": 22.5, + "grad_norm_var": 1.1697916666666666, + "learning_rate": 3.661079163686431e-05, + "loss": 6.8732, + "loss/crossentropy": 1.9919006183743477, + "loss/hidden": 3.105859375, + "loss/jsd": 0.0, + "loss/logits": 0.1555755365639925, + "step": 26340 + }, + { + "epoch": 0.8783333333333333, + "grad_norm": 23.25, + "grad_norm_var": 1.1916015625, + "learning_rate": 3.648185386277011e-05, + "loss": 6.8612, + "loss/crossentropy": 2.0109338417649267, + "loss/hidden": 3.1859375, + "loss/jsd": 0.0, + "loss/logits": 0.15537302363663913, + "step": 26350 + }, + { + "epoch": 0.8786666666666667, + "grad_norm": 21.25, + "grad_norm_var": 1.0379557291666666, + "learning_rate": 3.6353098855302215e-05, + "loss": 6.7091, + "loss/crossentropy": 1.9878887504339218, + "loss/hidden": 3.0875, + "loss/jsd": 0.0, + "loss/logits": 0.1486053698696196, + "step": 26360 + }, + { + "epoch": 0.879, + "grad_norm": 21.5, + "grad_norm_var": 0.5676432291666667, + "learning_rate": 3.622452788522057e-05, + "loss": 6.8374, + "loss/crossentropy": 1.9497860811650753, + "loss/hidden": 3.23046875, + "loss/jsd": 0.0, + "loss/logits": 0.1838926389813423, + "step": 26370 + }, + { + "epoch": 0.8793333333333333, + "grad_norm": 21.5, + "grad_norm_var": 0.9098307291666666, + "learning_rate": 3.609614222146872e-05, + "loss": 6.8293, + "loss/crossentropy": 2.1610096618533134, + "loss/hidden": 3.177734375, + "loss/jsd": 0.0, + "loss/logits": 0.1660961801186204, + "step": 26380 + }, + { + "epoch": 0.8796666666666667, + "grad_norm": 20.375, + "grad_norm_var": 0.99140625, + "learning_rate": 3.596794313116136e-05, + "loss": 6.8184, + "loss/crossentropy": 2.0240518391132354, + "loss/hidden": 3.149609375, + "loss/jsd": 0.0, + "loss/logits": 0.16600796654820443, + "step": 26390 + }, + { + "epoch": 0.88, + "grad_norm": 24.25, + "grad_norm_var": 1.4854166666666666, + "learning_rate": 3.583993187957173e-05, + "loss": 6.8498, + "loss/crossentropy": 1.9888987004756928, + "loss/hidden": 3.249609375, + "loss/jsd": 0.0, + "loss/logits": 0.1616065276786685, + "step": 26400 + }, + { + "epoch": 0.8803333333333333, + "grad_norm": 20.625, + "grad_norm_var": 1.8791666666666667, + "learning_rate": 3.571210973011924e-05, + "loss": 6.8116, + "loss/crossentropy": 1.8744437299668788, + "loss/hidden": 3.220703125, + "loss/jsd": 0.0, + "loss/logits": 0.15525809191167356, + "step": 26410 + }, + { + "epoch": 0.8806666666666667, + "grad_norm": 22.0, + "grad_norm_var": 98.81608072916667, + "learning_rate": 3.5584477944356845e-05, + "loss": 6.9597, + "loss/crossentropy": 2.1749876379966735, + "loss/hidden": 3.2375, + "loss/jsd": 0.0, + "loss/logits": 0.16630711518228053, + "step": 26420 + }, + { + "epoch": 0.881, + "grad_norm": 21.5, + "grad_norm_var": 98.72265625, + "learning_rate": 3.5457037781958805e-05, + "loss": 6.9383, + "loss/crossentropy": 2.053211937844753, + "loss/hidden": 3.176171875, + "loss/jsd": 0.0, + "loss/logits": 0.15977289276197554, + "step": 26430 + }, + { + "epoch": 0.8813333333333333, + "grad_norm": 23.125, + "grad_norm_var": 1.06015625, + "learning_rate": 3.532979050070804e-05, + "loss": 6.8057, + "loss/crossentropy": 1.9794712126255036, + "loss/hidden": 3.29609375, + "loss/jsd": 0.0, + "loss/logits": 0.16630720421671868, + "step": 26440 + }, + { + "epoch": 0.8816666666666667, + "grad_norm": 20.5, + "grad_norm_var": 4.742122395833333, + "learning_rate": 3.520273735648382e-05, + "loss": 6.7564, + "loss/crossentropy": 2.0068790689110756, + "loss/hidden": 3.21328125, + "loss/jsd": 0.0, + "loss/logits": 0.15001734271645545, + "step": 26450 + }, + { + "epoch": 0.882, + "grad_norm": 20.875, + "grad_norm_var": 0.8375, + "learning_rate": 3.507587960324944e-05, + "loss": 6.9896, + "loss/crossentropy": 1.996173518896103, + "loss/hidden": 3.202734375, + "loss/jsd": 0.0, + "loss/logits": 0.16484985016286374, + "step": 26460 + }, + { + "epoch": 0.8823333333333333, + "grad_norm": 22.375, + "grad_norm_var": 0.9186848958333333, + "learning_rate": 3.494921849303967e-05, + "loss": 6.9035, + "loss/crossentropy": 2.044399265944958, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.17791436351835727, + "step": 26470 + }, + { + "epoch": 0.8826666666666667, + "grad_norm": 20.0, + "grad_norm_var": 1.3457682291666666, + "learning_rate": 3.482275527594856e-05, + "loss": 6.7077, + "loss/crossentropy": 1.9672799199819564, + "loss/hidden": 3.09609375, + "loss/jsd": 0.0, + "loss/logits": 0.14841998741030693, + "step": 26480 + }, + { + "epoch": 0.883, + "grad_norm": 23.625, + "grad_norm_var": 1.25390625, + "learning_rate": 3.469649120011697e-05, + "loss": 6.714, + "loss/crossentropy": 2.0286851942539217, + "loss/hidden": 3.160546875, + "loss/jsd": 0.0, + "loss/logits": 0.15806122818030416, + "step": 26490 + }, + { + "epoch": 0.8833333333333333, + "grad_norm": 20.375, + "grad_norm_var": 1.3455729166666666, + "learning_rate": 3.45704275117204e-05, + "loss": 6.8575, + "loss/crossentropy": 2.0830163829028607, + "loss/hidden": 3.194140625, + "loss/jsd": 0.0, + "loss/logits": 0.15631103357300163, + "step": 26500 + }, + { + "epoch": 0.8836666666666667, + "grad_norm": 20.875, + "grad_norm_var": 1.9254557291666667, + "learning_rate": 3.444456545495652e-05, + "loss": 6.8168, + "loss/crossentropy": 1.9875051081180573, + "loss/hidden": 3.215234375, + "loss/jsd": 0.0, + "loss/logits": 0.15791778452694416, + "step": 26510 + }, + { + "epoch": 0.884, + "grad_norm": 21.125, + "grad_norm_var": 1.2979166666666666, + "learning_rate": 3.431890627203305e-05, + "loss": 6.8636, + "loss/crossentropy": 2.145845976471901, + "loss/hidden": 3.14921875, + "loss/jsd": 0.0, + "loss/logits": 0.16650803480297327, + "step": 26520 + }, + { + "epoch": 0.8843333333333333, + "grad_norm": 23.75, + "grad_norm_var": 1.3427083333333334, + "learning_rate": 3.419345120315538e-05, + "loss": 6.8361, + "loss/crossentropy": 2.108425536751747, + "loss/hidden": 3.209765625, + "loss/jsd": 0.0, + "loss/logits": 0.16783894039690495, + "step": 26530 + }, + { + "epoch": 0.8846666666666667, + "grad_norm": 21.5, + "grad_norm_var": 1.2098307291666666, + "learning_rate": 3.4068201486514376e-05, + "loss": 6.9219, + "loss/crossentropy": 2.0194236926734446, + "loss/hidden": 3.217578125, + "loss/jsd": 0.0, + "loss/logits": 0.16996841207146646, + "step": 26540 + }, + { + "epoch": 0.885, + "grad_norm": 21.375, + "grad_norm_var": 4.703580729166666, + "learning_rate": 3.394315835827421e-05, + "loss": 6.9036, + "loss/crossentropy": 2.0967435270547865, + "loss/hidden": 3.17265625, + "loss/jsd": 0.0, + "loss/logits": 0.15975359827280045, + "step": 26550 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 27.875, + "grad_norm_var": 3.9785807291666666, + "learning_rate": 3.381832305256004e-05, + "loss": 6.9396, + "loss/crossentropy": 2.10302966684103, + "loss/hidden": 3.077734375, + "loss/jsd": 0.0, + "loss/logits": 0.17214497793465852, + "step": 26560 + }, + { + "epoch": 0.8856666666666667, + "grad_norm": 20.5, + "grad_norm_var": 1.9625138908734423e+18, + "learning_rate": 3.3693696801445954e-05, + "loss": 6.9578, + "loss/crossentropy": 2.2149000599980355, + "loss/hidden": 3.213671875, + "loss/jsd": 0.0, + "loss/logits": 0.17492201793938875, + "step": 26570 + }, + { + "epoch": 0.886, + "grad_norm": 22.125, + "grad_norm_var": 1.9625138910719027e+18, + "learning_rate": 3.356928083494274e-05, + "loss": 6.9037, + "loss/crossentropy": 2.0742943078279494, + "loss/hidden": 3.119140625, + "loss/jsd": 0.0, + "loss/logits": 0.15730843115597964, + "step": 26580 + }, + { + "epoch": 0.8863333333333333, + "grad_norm": 20.75, + "grad_norm_var": 2.6059895833333333, + "learning_rate": 3.344507638098576e-05, + "loss": 6.7874, + "loss/crossentropy": 1.9956744939088822, + "loss/hidden": 3.201953125, + "loss/jsd": 0.0, + "loss/logits": 0.15899997791275383, + "step": 26590 + }, + { + "epoch": 0.8866666666666667, + "grad_norm": 20.375, + "grad_norm_var": 0.6603515625, + "learning_rate": 3.3321084665422807e-05, + "loss": 6.8337, + "loss/crossentropy": 1.9816527277231217, + "loss/hidden": 3.175, + "loss/jsd": 0.0, + "loss/logits": 0.16425166334956884, + "step": 26600 + }, + { + "epoch": 0.887, + "grad_norm": 20.5, + "grad_norm_var": 1.3455729166666666, + "learning_rate": 3.319730691200209e-05, + "loss": 6.8578, + "loss/crossentropy": 1.8931610018014908, + "loss/hidden": 3.1734375, + "loss/jsd": 0.0, + "loss/logits": 0.15017597610130906, + "step": 26610 + }, + { + "epoch": 0.8873333333333333, + "grad_norm": 23.625, + "grad_norm_var": 2.134375, + "learning_rate": 3.307374434236003e-05, + "loss": 6.7593, + "loss/crossentropy": 2.0294124722480773, + "loss/hidden": 3.161328125, + "loss/jsd": 0.0, + "loss/logits": 0.15351739330217243, + "step": 26620 + }, + { + "epoch": 0.8876666666666667, + "grad_norm": 22.625, + "grad_norm_var": 19.565625, + "learning_rate": 3.295039817600936e-05, + "loss": 6.8753, + "loss/crossentropy": 2.105466166138649, + "loss/hidden": 3.216796875, + "loss/jsd": 0.0, + "loss/logits": 0.18532855240628124, + "step": 26630 + }, + { + "epoch": 0.888, + "grad_norm": 21.25, + "grad_norm_var": 15.985416666666667, + "learning_rate": 3.2827269630326885e-05, + "loss": 6.7157, + "loss/crossentropy": 2.0061062544584276, + "loss/hidden": 3.219921875, + "loss/jsd": 0.0, + "loss/logits": 0.15896273953840137, + "step": 26640 + }, + { + "epoch": 0.8883333333333333, + "grad_norm": 21.75, + "grad_norm_var": 9.217122395833334, + "learning_rate": 3.270435992054166e-05, + "loss": 6.9179, + "loss/crossentropy": 2.0659444093704225, + "loss/hidden": 3.2515625, + "loss/jsd": 0.0, + "loss/logits": 0.16416719797998666, + "step": 26650 + }, + { + "epoch": 0.8886666666666667, + "grad_norm": 26.375, + "grad_norm_var": 3.4905598958333335, + "learning_rate": 3.258167025972292e-05, + "loss": 6.851, + "loss/crossentropy": 2.0915834248065948, + "loss/hidden": 3.230078125, + "loss/jsd": 0.0, + "loss/logits": 0.15449760612100363, + "step": 26660 + }, + { + "epoch": 0.889, + "grad_norm": 29.75, + "grad_norm_var": 10.816080729166666, + "learning_rate": 3.245920185876805e-05, + "loss": 6.9643, + "loss/crossentropy": 1.9756429754197598, + "loss/hidden": 3.17578125, + "loss/jsd": 0.0, + "loss/logits": 0.15356689458712935, + "step": 26670 + }, + { + "epoch": 0.8893333333333333, + "grad_norm": 26.75, + "grad_norm_var": 11.518489583333333, + "learning_rate": 3.233695592639077e-05, + "loss": 6.9679, + "loss/crossentropy": 1.9231618136167525, + "loss/hidden": 3.290625, + "loss/jsd": 0.0, + "loss/logits": 0.15716882031410934, + "step": 26680 + }, + { + "epoch": 0.8896666666666667, + "grad_norm": 22.25, + "grad_norm_var": 13.483072916666666, + "learning_rate": 3.221493366910903e-05, + "loss": 6.8899, + "loss/crossentropy": 1.9402207165956498, + "loss/hidden": 3.172265625, + "loss/jsd": 0.0, + "loss/logits": 0.15298937689512968, + "step": 26690 + }, + { + "epoch": 0.89, + "grad_norm": 23.25, + "grad_norm_var": 12.5056640625, + "learning_rate": 3.2093136291233296e-05, + "loss": 6.8965, + "loss/crossentropy": 1.9652688920497894, + "loss/hidden": 3.266015625, + "loss/jsd": 0.0, + "loss/logits": 0.15768850333988665, + "step": 26700 + }, + { + "epoch": 0.8903333333333333, + "grad_norm": 21.5, + "grad_norm_var": 4.339518229166667, + "learning_rate": 3.197156499485447e-05, + "loss": 6.8229, + "loss/crossentropy": 2.013945384323597, + "loss/hidden": 3.150390625, + "loss/jsd": 0.0, + "loss/logits": 0.15189841520041228, + "step": 26710 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 25.25, + "grad_norm_var": 2.01640625, + "learning_rate": 3.185022097983221e-05, + "loss": 6.8226, + "loss/crossentropy": 2.015190437436104, + "loss/hidden": 3.262109375, + "loss/jsd": 0.0, + "loss/logits": 0.16249268716201187, + "step": 26720 + }, + { + "epoch": 0.891, + "grad_norm": 26.875, + "grad_norm_var": 4.39375, + "learning_rate": 3.172910544378294e-05, + "loss": 6.9557, + "loss/crossentropy": 2.177249902486801, + "loss/hidden": 3.132421875, + "loss/jsd": 0.0, + "loss/logits": 0.15406437516212462, + "step": 26730 + }, + { + "epoch": 0.8913333333333333, + "grad_norm": 24.375, + "grad_norm_var": 4.626497395833334, + "learning_rate": 3.160821958206807e-05, + "loss": 6.9043, + "loss/crossentropy": 2.0702683687210084, + "loss/hidden": 3.2078125, + "loss/jsd": 0.0, + "loss/logits": 0.1650611654855311, + "step": 26740 + }, + { + "epoch": 0.8916666666666667, + "grad_norm": 23.0, + "grad_norm_var": 3.2301432291666665, + "learning_rate": 3.1487564587782306e-05, + "loss": 6.9284, + "loss/crossentropy": 2.2026931807398795, + "loss/hidden": 3.140234375, + "loss/jsd": 0.0, + "loss/logits": 0.17769969888031484, + "step": 26750 + }, + { + "epoch": 0.892, + "grad_norm": 22.875, + "grad_norm_var": 3.2577473958333334, + "learning_rate": 3.1367141651741694e-05, + "loss": 6.838, + "loss/crossentropy": 1.979924051463604, + "loss/hidden": 3.16875, + "loss/jsd": 0.0, + "loss/logits": 0.16592128686606883, + "step": 26760 + }, + { + "epoch": 0.8923333333333333, + "grad_norm": 7348420608.0, + "grad_norm_var": 3.374955317819448e+18, + "learning_rate": 3.124695196247202e-05, + "loss": 6.9768, + "loss/crossentropy": 2.240220108628273, + "loss/hidden": 3.144140625, + "loss/jsd": 0.0, + "loss/logits": 0.16037558643147348, + "step": 26770 + }, + { + "epoch": 0.8926666666666667, + "grad_norm": 22.75, + "grad_norm_var": 3.374955318079704e+18, + "learning_rate": 3.112699670619696e-05, + "loss": 6.8575, + "loss/crossentropy": 2.125392961502075, + "loss/hidden": 3.223046875, + "loss/jsd": 0.0, + "loss/logits": 0.16047360915690662, + "step": 26780 + }, + { + "epoch": 0.893, + "grad_norm": 24.625, + "grad_norm_var": 9.473958333333334, + "learning_rate": 3.100727706682651e-05, + "loss": 6.954, + "loss/crossentropy": 2.044107362627983, + "loss/hidden": 3.1625, + "loss/jsd": 0.0, + "loss/logits": 0.16029341490939258, + "step": 26790 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 24.125, + "grad_norm_var": 8.245572916666667, + "learning_rate": 3.088779422594514e-05, + "loss": 6.9426, + "loss/crossentropy": 1.9640724688768387, + "loss/hidden": 3.24453125, + "loss/jsd": 0.0, + "loss/logits": 0.15900588724762202, + "step": 26800 + }, + { + "epoch": 0.8936666666666667, + "grad_norm": 20.875, + "grad_norm_var": 4.214322916666666, + "learning_rate": 3.0768549362800294e-05, + "loss": 6.9375, + "loss/crossentropy": 2.050625918060541, + "loss/hidden": 3.1703125, + "loss/jsd": 0.0, + "loss/logits": 0.16425505680963398, + "step": 26810 + }, + { + "epoch": 0.894, + "grad_norm": 23.5, + "grad_norm_var": 3.2604166666666665, + "learning_rate": 3.064954365429059e-05, + "loss": 6.9182, + "loss/crossentropy": 1.9688321188092233, + "loss/hidden": 3.227734375, + "loss/jsd": 0.0, + "loss/logits": 0.17398671787232162, + "step": 26820 + }, + { + "epoch": 0.8943333333333333, + "grad_norm": 22.25, + "grad_norm_var": 3.302018229166667, + "learning_rate": 3.053077827495433e-05, + "loss": 6.8169, + "loss/crossentropy": 2.088005256652832, + "loss/hidden": 3.047265625, + "loss/jsd": 0.0, + "loss/logits": 0.14643877744674683, + "step": 26830 + }, + { + "epoch": 0.8946666666666667, + "grad_norm": 21.625, + "grad_norm_var": 2.1304840813595853e+18, + "learning_rate": 3.0412254396957896e-05, + "loss": 6.8132, + "loss/crossentropy": 2.170038291811943, + "loss/hidden": 3.19765625, + "loss/jsd": 0.0, + "loss/logits": 0.16484488490968943, + "step": 26840 + }, + { + "epoch": 0.895, + "grad_norm": 26.0, + "grad_norm_var": 2.1304840811710513e+18, + "learning_rate": 3.0293973190084068e-05, + "loss": 6.7694, + "loss/crossentropy": 1.9106760919094086, + "loss/hidden": 3.17421875, + "loss/jsd": 0.0, + "loss/logits": 0.14406620375812054, + "step": 26850 + }, + { + "epoch": 0.8953333333333333, + "grad_norm": 26.0, + "grad_norm_var": 2.5884765625, + "learning_rate": 3.0175935821720648e-05, + "loss": 6.8457, + "loss/crossentropy": 2.27715582549572, + "loss/hidden": 3.127734375, + "loss/jsd": 0.0, + "loss/logits": 0.158475461602211, + "step": 26860 + }, + { + "epoch": 0.8956666666666667, + "grad_norm": 26.375, + "grad_norm_var": 3.62890625, + "learning_rate": 3.0058143456848765e-05, + "loss": 6.7258, + "loss/crossentropy": 1.9482488855719566, + "loss/hidden": 3.14921875, + "loss/jsd": 0.0, + "loss/logits": 0.1498094605281949, + "step": 26870 + }, + { + "epoch": 0.896, + "grad_norm": 21.125, + "grad_norm_var": 1.4389704356790623e+18, + "learning_rate": 2.994059725803156e-05, + "loss": 6.7777, + "loss/crossentropy": 2.0152445122599603, + "loss/hidden": 3.187109375, + "loss/jsd": 0.0, + "loss/logits": 0.15422911364585162, + "step": 26880 + }, + { + "epoch": 0.8963333333333333, + "grad_norm": 23.5, + "grad_norm_var": 1.4389704356690657e+18, + "learning_rate": 2.9823298385402492e-05, + "loss": 6.8206, + "loss/crossentropy": 2.0900501251220702, + "loss/hidden": 3.166015625, + "loss/jsd": 0.0, + "loss/logits": 0.16731840167194606, + "step": 26890 + }, + { + "epoch": 0.8966666666666666, + "grad_norm": 22.5, + "grad_norm_var": 2.513641891262733e+18, + "learning_rate": 2.9706247996654137e-05, + "loss": 6.876, + "loss/crossentropy": 1.9369783684611321, + "loss/hidden": 3.293359375, + "loss/jsd": 0.0, + "loss/logits": 0.16292606424540282, + "step": 26900 + }, + { + "epoch": 0.897, + "grad_norm": 26.5, + "grad_norm_var": 6.4962890625, + "learning_rate": 2.958944724702654e-05, + "loss": 6.7905, + "loss/crossentropy": 1.9939923129975796, + "loss/hidden": 3.150390625, + "loss/jsd": 0.0, + "loss/logits": 0.15159244257956744, + "step": 26910 + }, + { + "epoch": 0.8973333333333333, + "grad_norm": 26.75, + "grad_norm_var": 2.792122395833333, + "learning_rate": 2.947289728929597e-05, + "loss": 6.8971, + "loss/crossentropy": 2.078751567006111, + "loss/hidden": 3.190234375, + "loss/jsd": 0.0, + "loss/logits": 0.17754473332315684, + "step": 26920 + }, + { + "epoch": 0.8976666666666666, + "grad_norm": 25.0, + "grad_norm_var": 1.6082682291666666, + "learning_rate": 2.935659927376343e-05, + "loss": 6.8012, + "loss/crossentropy": 2.0356945395469666, + "loss/hidden": 3.208984375, + "loss/jsd": 0.0, + "loss/logits": 0.16562622915953398, + "step": 26930 + }, + { + "epoch": 0.898, + "grad_norm": 26.625, + "grad_norm_var": 3.0184895833333334, + "learning_rate": 2.924055434824342e-05, + "loss": 6.7869, + "loss/crossentropy": 2.1356831192970276, + "loss/hidden": 3.251953125, + "loss/jsd": 0.0, + "loss/logits": 0.17853607889264822, + "step": 26940 + }, + { + "epoch": 0.8983333333333333, + "grad_norm": 22.125, + "grad_norm_var": 11.651822916666667, + "learning_rate": 2.9124763658052478e-05, + "loss": 6.8149, + "loss/crossentropy": 1.9252381205558777, + "loss/hidden": 3.224609375, + "loss/jsd": 0.0, + "loss/logits": 0.16029497589915992, + "step": 26950 + }, + { + "epoch": 0.8986666666666666, + "grad_norm": 22.75, + "grad_norm_var": 3.1830729166666667, + "learning_rate": 2.900922834599797e-05, + "loss": 6.9226, + "loss/crossentropy": 2.205397879332304, + "loss/hidden": 3.209765625, + "loss/jsd": 0.0, + "loss/logits": 0.15344799063168466, + "step": 26960 + }, + { + "epoch": 0.899, + "grad_norm": 23.875, + "grad_norm_var": 2.919205729166667, + "learning_rate": 2.8893949552366796e-05, + "loss": 6.8206, + "loss/crossentropy": 2.09553968757391, + "loss/hidden": 3.194140625, + "loss/jsd": 0.0, + "loss/logits": 0.16381179327145218, + "step": 26970 + }, + { + "epoch": 0.8993333333333333, + "grad_norm": 25.875, + "grad_norm_var": 3.3749348958333334, + "learning_rate": 2.8778928414914085e-05, + "loss": 6.8139, + "loss/crossentropy": 2.029978536069393, + "loss/hidden": 3.2390625, + "loss/jsd": 0.0, + "loss/logits": 0.1680966019630432, + "step": 26980 + }, + { + "epoch": 0.8996666666666666, + "grad_norm": 24.875, + "grad_norm_var": 2.4541015625, + "learning_rate": 2.8664166068852062e-05, + "loss": 6.8405, + "loss/crossentropy": 1.9429209612309932, + "loss/hidden": 3.27578125, + "loss/jsd": 0.0, + "loss/logits": 0.16744533190503716, + "step": 26990 + }, + { + "epoch": 0.9, + "grad_norm": 21.75, + "grad_norm_var": 3.388541666666667, + "learning_rate": 2.854966364683872e-05, + "loss": 6.8216, + "loss/crossentropy": 1.9241836979985236, + "loss/hidden": 3.226953125, + "loss/jsd": 0.0, + "loss/logits": 0.16613443605601788, + "step": 27000 + }, + { + "epoch": 0.9003333333333333, + "grad_norm": 26.125, + "grad_norm_var": 2.4205729166666665, + "learning_rate": 2.843542227896676e-05, + "loss": 6.8825, + "loss/crossentropy": 2.012719841301441, + "loss/hidden": 3.23515625, + "loss/jsd": 0.0, + "loss/logits": 0.15762700429186224, + "step": 27010 + }, + { + "epoch": 0.9006666666666666, + "grad_norm": 22.625, + "grad_norm_var": 2.0228515625, + "learning_rate": 2.8321443092752338e-05, + "loss": 6.7563, + "loss/crossentropy": 1.9895868554711342, + "loss/hidden": 3.196484375, + "loss/jsd": 0.0, + "loss/logits": 0.17497619222849609, + "step": 27020 + }, + { + "epoch": 0.901, + "grad_norm": 24.0, + "grad_norm_var": 3.278059895833333, + "learning_rate": 2.8207727213124035e-05, + "loss": 6.7559, + "loss/crossentropy": 1.9740510500967503, + "loss/hidden": 3.12265625, + "loss/jsd": 0.0, + "loss/logits": 0.1432420744560659, + "step": 27030 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 24.125, + "grad_norm_var": 2.713997395833333, + "learning_rate": 2.809427576241167e-05, + "loss": 6.8997, + "loss/crossentropy": 2.1658532321453094, + "loss/hidden": 3.162109375, + "loss/jsd": 0.0, + "loss/logits": 0.1637007687240839, + "step": 27040 + }, + { + "epoch": 0.9016666666666666, + "grad_norm": 21.125, + "grad_norm_var": 3.376822916666667, + "learning_rate": 2.798108986033523e-05, + "loss": 6.9438, + "loss/crossentropy": 2.1903593868017195, + "loss/hidden": 3.13359375, + "loss/jsd": 0.0, + "loss/logits": 0.1633994322270155, + "step": 27050 + }, + { + "epoch": 0.902, + "grad_norm": 21.875, + "grad_norm_var": 5.095572916666667, + "learning_rate": 2.7868170623993905e-05, + "loss": 7.0145, + "loss/crossentropy": 2.0305363297462464, + "loss/hidden": 3.247265625, + "loss/jsd": 0.0, + "loss/logits": 0.17958665620535613, + "step": 27060 + }, + { + "epoch": 0.9023333333333333, + "grad_norm": 25.625, + "grad_norm_var": 3.6702473958333335, + "learning_rate": 2.7755519167854944e-05, + "loss": 6.7408, + "loss/crossentropy": 1.86053267121315, + "loss/hidden": 3.155078125, + "loss/jsd": 0.0, + "loss/logits": 0.14872891837731003, + "step": 27070 + }, + { + "epoch": 0.9026666666666666, + "grad_norm": 21.0, + "grad_norm_var": 4.24375, + "learning_rate": 2.764313660374277e-05, + "loss": 6.8407, + "loss/crossentropy": 2.007214891910553, + "loss/hidden": 3.1125, + "loss/jsd": 0.0, + "loss/logits": 0.15470210947096347, + "step": 27080 + }, + { + "epoch": 0.903, + "grad_norm": 23.375, + "grad_norm_var": 6.474739583333333, + "learning_rate": 2.753102404082789e-05, + "loss": 6.9169, + "loss/crossentropy": 2.1241619139909744, + "loss/hidden": 3.139453125, + "loss/jsd": 0.0, + "loss/logits": 0.15222108382731675, + "step": 27090 + }, + { + "epoch": 0.9033333333333333, + "grad_norm": 23.625, + "grad_norm_var": 2.134375, + "learning_rate": 2.741918258561607e-05, + "loss": 6.7749, + "loss/crossentropy": 1.9174664333462714, + "loss/hidden": 3.1, + "loss/jsd": 0.0, + "loss/logits": 0.14461091123521327, + "step": 27100 + }, + { + "epoch": 0.9036666666666666, + "grad_norm": 25.625, + "grad_norm_var": 1.6541015625, + "learning_rate": 2.7307613341937282e-05, + "loss": 6.8602, + "loss/crossentropy": 2.0042121566832067, + "loss/hidden": 3.255859375, + "loss/jsd": 0.0, + "loss/logits": 0.1733078501187265, + "step": 27110 + }, + { + "epoch": 0.904, + "grad_norm": 25.875, + "grad_norm_var": 3.5332682291666666, + "learning_rate": 2.7196317410934964e-05, + "loss": 6.886, + "loss/crossentropy": 2.019241477549076, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.19402055349200964, + "step": 27120 + }, + { + "epoch": 0.9043333333333333, + "grad_norm": 22.875, + "grad_norm_var": 2.234375, + "learning_rate": 2.7085295891054997e-05, + "loss": 6.9037, + "loss/crossentropy": 2.0694904938340186, + "loss/hidden": 3.2640625, + "loss/jsd": 0.0, + "loss/logits": 0.16915742177516221, + "step": 27130 + }, + { + "epoch": 0.9046666666666666, + "grad_norm": 28.375, + "grad_norm_var": 8.305143229166667, + "learning_rate": 2.697454987803495e-05, + "loss": 6.8822, + "loss/crossentropy": 2.0155827552080154, + "loss/hidden": 3.23671875, + "loss/jsd": 0.0, + "loss/logits": 0.16962535195052625, + "step": 27140 + }, + { + "epoch": 0.905, + "grad_norm": 22.0, + "grad_norm_var": 10.370768229166666, + "learning_rate": 2.6864080464893282e-05, + "loss": 6.7997, + "loss/crossentropy": 2.08170278519392, + "loss/hidden": 3.153515625, + "loss/jsd": 0.0, + "loss/logits": 0.1523496536538005, + "step": 27150 + }, + { + "epoch": 0.9053333333333333, + "grad_norm": 20.125, + "grad_norm_var": 1.3744140625, + "learning_rate": 2.6753888741918488e-05, + "loss": 6.9908, + "loss/crossentropy": 2.0863103806972503, + "loss/hidden": 3.188671875, + "loss/jsd": 0.0, + "loss/logits": 0.15311094475910067, + "step": 27160 + }, + { + "epoch": 0.9056666666666666, + "grad_norm": 22.0, + "grad_norm_var": 1.1864583333333334, + "learning_rate": 2.6643975796658406e-05, + "loss": 6.8451, + "loss/crossentropy": 2.150225210189819, + "loss/hidden": 3.04375, + "loss/jsd": 0.0, + "loss/logits": 0.14159671682864428, + "step": 27170 + }, + { + "epoch": 0.906, + "grad_norm": 22.75, + "grad_norm_var": 0.8052083333333333, + "learning_rate": 2.65343427139094e-05, + "loss": 6.8341, + "loss/crossentropy": 2.0567213878035546, + "loss/hidden": 3.133984375, + "loss/jsd": 0.0, + "loss/logits": 0.15704135950654746, + "step": 27180 + }, + { + "epoch": 0.9063333333333333, + "grad_norm": 21.25, + "grad_norm_var": 0.8254557291666667, + "learning_rate": 2.642499057570578e-05, + "loss": 6.8178, + "loss/crossentropy": 2.0047308802604675, + "loss/hidden": 3.1453125, + "loss/jsd": 0.0, + "loss/logits": 0.15476641841232777, + "step": 27190 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 20.625, + "grad_norm_var": 0.9639973958333333, + "learning_rate": 2.6315920461308964e-05, + "loss": 6.8925, + "loss/crossentropy": 2.0975175350904465, + "loss/hidden": 3.23984375, + "loss/jsd": 0.0, + "loss/logits": 0.17165146991610528, + "step": 27200 + }, + { + "epoch": 0.907, + "grad_norm": 20.75, + "grad_norm_var": 1.4910807291666666, + "learning_rate": 2.620713344719698e-05, + "loss": 6.9033, + "loss/crossentropy": 2.116207906603813, + "loss/hidden": 3.19609375, + "loss/jsd": 0.0, + "loss/logits": 0.17307401802390815, + "step": 27210 + }, + { + "epoch": 0.9073333333333333, + "grad_norm": 19.75, + "grad_norm_var": 1.5059895833333334, + "learning_rate": 2.6098630607053704e-05, + "loss": 6.8249, + "loss/crossentropy": 2.1922834485769274, + "loss/hidden": 3.1828125, + "loss/jsd": 0.0, + "loss/logits": 0.16770193502306938, + "step": 27220 + }, + { + "epoch": 0.9076666666666666, + "grad_norm": 21.75, + "grad_norm_var": 0.8363932291666667, + "learning_rate": 2.5990413011758396e-05, + "loss": 6.9028, + "loss/crossentropy": 2.190079639852047, + "loss/hidden": 3.20546875, + "loss/jsd": 0.0, + "loss/logits": 0.17454652497544884, + "step": 27230 + }, + { + "epoch": 0.908, + "grad_norm": 21.625, + "grad_norm_var": 0.7738932291666667, + "learning_rate": 2.588248172937502e-05, + "loss": 6.7325, + "loss/crossentropy": 1.9603225111961364, + "loss/hidden": 3.145703125, + "loss/jsd": 0.0, + "loss/logits": 0.14061546474695205, + "step": 27240 + }, + { + "epoch": 0.9083333333333333, + "grad_norm": 21.0, + "grad_norm_var": 0.3322265625, + "learning_rate": 2.577483782514174e-05, + "loss": 6.8603, + "loss/crossentropy": 2.165058287978172, + "loss/hidden": 3.1921875, + "loss/jsd": 0.0, + "loss/logits": 0.16146605722606183, + "step": 27250 + }, + { + "epoch": 0.9086666666666666, + "grad_norm": 21.5, + "grad_norm_var": 0.8113932291666667, + "learning_rate": 2.5667482361460467e-05, + "loss": 6.8768, + "loss/crossentropy": 2.0848546117544173, + "loss/hidden": 3.243359375, + "loss/jsd": 0.0, + "loss/logits": 0.16068812049925327, + "step": 27260 + }, + { + "epoch": 0.909, + "grad_norm": 21.125, + "grad_norm_var": 0.6427083333333333, + "learning_rate": 2.5560416397886257e-05, + "loss": 6.9293, + "loss/crossentropy": 1.8594784066081047, + "loss/hidden": 3.20703125, + "loss/jsd": 0.0, + "loss/logits": 0.1557474084198475, + "step": 27270 + }, + { + "epoch": 0.9093333333333333, + "grad_norm": 22.375, + "grad_norm_var": 0.4705729166666667, + "learning_rate": 2.5453640991116967e-05, + "loss": 6.8813, + "loss/crossentropy": 2.048447531461716, + "loss/hidden": 3.159375, + "loss/jsd": 0.0, + "loss/logits": 0.1582455337047577, + "step": 27280 + }, + { + "epoch": 0.9096666666666666, + "grad_norm": 22.375, + "grad_norm_var": 6.491666666666666, + "learning_rate": 2.5347157194982742e-05, + "loss": 6.8006, + "loss/crossentropy": 2.028676262497902, + "loss/hidden": 3.18125, + "loss/jsd": 0.0, + "loss/logits": 0.15210597179830074, + "step": 27290 + }, + { + "epoch": 0.91, + "grad_norm": 20.5, + "grad_norm_var": 1.2197916666666666, + "learning_rate": 2.5240966060435677e-05, + "loss": 6.908, + "loss/crossentropy": 2.1295453563332556, + "loss/hidden": 3.267578125, + "loss/jsd": 0.0, + "loss/logits": 0.18300293069332838, + "step": 27300 + }, + { + "epoch": 0.9103333333333333, + "grad_norm": 20.5, + "grad_norm_var": 0.5468098958333333, + "learning_rate": 2.5135068635539366e-05, + "loss": 6.7928, + "loss/crossentropy": 2.176609678566456, + "loss/hidden": 3.1765625, + "loss/jsd": 0.0, + "loss/logits": 0.1436314729042351, + "step": 27310 + }, + { + "epoch": 0.9106666666666666, + "grad_norm": 21.0, + "grad_norm_var": 1.2052083333333334, + "learning_rate": 2.5029465965458683e-05, + "loss": 6.8852, + "loss/crossentropy": 1.9842437624931335, + "loss/hidden": 3.241015625, + "loss/jsd": 0.0, + "loss/logits": 0.15974466726183892, + "step": 27320 + }, + { + "epoch": 0.911, + "grad_norm": 22.0, + "grad_norm_var": 1.2155598958333333, + "learning_rate": 2.4924159092449325e-05, + "loss": 6.8875, + "loss/crossentropy": 1.9043216429650784, + "loss/hidden": 3.2078125, + "loss/jsd": 0.0, + "loss/logits": 0.17278967509046197, + "step": 27330 + }, + { + "epoch": 0.9113333333333333, + "grad_norm": 21.75, + "grad_norm_var": 5.60625, + "learning_rate": 2.48191490558476e-05, + "loss": 6.8304, + "loss/crossentropy": 2.0150970712304117, + "loss/hidden": 3.262109375, + "loss/jsd": 0.0, + "loss/logits": 0.16625587958842517, + "step": 27340 + }, + { + "epoch": 0.9116666666666666, + "grad_norm": 21.5, + "grad_norm_var": 8.317708333333334, + "learning_rate": 2.4714436892060213e-05, + "loss": 6.8042, + "loss/crossentropy": 2.058341934531927, + "loss/hidden": 3.1359375, + "loss/jsd": 0.0, + "loss/logits": 0.154982496984303, + "step": 27350 + }, + { + "epoch": 0.912, + "grad_norm": 20.75, + "grad_norm_var": 3.8749348958333334, + "learning_rate": 2.46100236345539e-05, + "loss": 6.7786, + "loss/crossentropy": 1.8252541318535804, + "loss/hidden": 3.261328125, + "loss/jsd": 0.0, + "loss/logits": 0.1562123046256602, + "step": 27360 + }, + { + "epoch": 0.9123333333333333, + "grad_norm": 21.875, + "grad_norm_var": 0.9858723958333333, + "learning_rate": 2.4505910313845408e-05, + "loss": 6.8645, + "loss/crossentropy": 1.9364535629749298, + "loss/hidden": 3.12578125, + "loss/jsd": 0.0, + "loss/logits": 0.1512385666370392, + "step": 27370 + }, + { + "epoch": 0.9126666666666666, + "grad_norm": 21.75, + "grad_norm_var": 1.0983723958333333, + "learning_rate": 2.440209795749114e-05, + "loss": 6.8863, + "loss/crossentropy": 1.9228644296526909, + "loss/hidden": 3.23984375, + "loss/jsd": 0.0, + "loss/logits": 0.16667801439762114, + "step": 27380 + }, + { + "epoch": 0.913, + "grad_norm": 20.875, + "grad_norm_var": 0.9931640625, + "learning_rate": 2.4298587590077164e-05, + "loss": 6.9802, + "loss/crossentropy": 1.9965920761227607, + "loss/hidden": 3.22109375, + "loss/jsd": 0.0, + "loss/logits": 0.16229025460779667, + "step": 27390 + }, + { + "epoch": 0.9133333333333333, + "grad_norm": 20.25, + "grad_norm_var": 0.5082682291666667, + "learning_rate": 2.4195380233209008e-05, + "loss": 6.6642, + "loss/crossentropy": 1.916854026913643, + "loss/hidden": 3.20078125, + "loss/jsd": 0.0, + "loss/logits": 0.15667275432497263, + "step": 27400 + }, + { + "epoch": 0.9136666666666666, + "grad_norm": 25.25, + "grad_norm_var": 2.068489583333333, + "learning_rate": 2.4092476905501634e-05, + "loss": 6.9134, + "loss/crossentropy": 2.0995171763002873, + "loss/hidden": 3.16640625, + "loss/jsd": 0.0, + "loss/logits": 0.15996734565123916, + "step": 27410 + }, + { + "epoch": 0.914, + "grad_norm": 23.0, + "grad_norm_var": 2.5400390625, + "learning_rate": 2.398987862256933e-05, + "loss": 6.979, + "loss/crossentropy": 2.112964731827378, + "loss/hidden": 3.340234375, + "loss/jsd": 0.0, + "loss/logits": 0.1689059093594551, + "step": 27420 + }, + { + "epoch": 0.9143333333333333, + "grad_norm": 21.75, + "grad_norm_var": 1.621875, + "learning_rate": 2.3887586397015716e-05, + "loss": 6.9236, + "loss/crossentropy": 2.0072560638189314, + "loss/hidden": 3.2296875, + "loss/jsd": 0.0, + "loss/logits": 0.1832346895709634, + "step": 27430 + }, + { + "epoch": 0.9146666666666666, + "grad_norm": 26.375, + "grad_norm_var": 2.6416015625, + "learning_rate": 2.3785601238423787e-05, + "loss": 6.8881, + "loss/crossentropy": 2.0381537839770316, + "loss/hidden": 3.209765625, + "loss/jsd": 0.0, + "loss/logits": 0.1642130235210061, + "step": 27440 + }, + { + "epoch": 0.915, + "grad_norm": 21.875, + "grad_norm_var": 2.71015625, + "learning_rate": 2.3683924153345856e-05, + "loss": 6.898, + "loss/crossentropy": 2.050332149863243, + "loss/hidden": 3.220703125, + "loss/jsd": 0.0, + "loss/logits": 0.15957360472530127, + "step": 27450 + }, + { + "epoch": 0.9153333333333333, + "grad_norm": 20.75, + "grad_norm_var": 1.1238932291666666, + "learning_rate": 2.358255614529374e-05, + "loss": 6.7788, + "loss/crossentropy": 1.9567649722099305, + "loss/hidden": 3.21640625, + "loss/jsd": 0.0, + "loss/logits": 0.1502897882834077, + "step": 27460 + }, + { + "epoch": 0.9156666666666666, + "grad_norm": 20.875, + "grad_norm_var": 0.890625, + "learning_rate": 2.3481498214728717e-05, + "loss": 6.7887, + "loss/crossentropy": 1.9293017938733101, + "loss/hidden": 3.262890625, + "loss/jsd": 0.0, + "loss/logits": 0.16474825162440537, + "step": 27470 + }, + { + "epoch": 0.916, + "grad_norm": 20.25, + "grad_norm_var": 0.6587890625, + "learning_rate": 2.3380751359051795e-05, + "loss": 6.8496, + "loss/crossentropy": 2.057722179591656, + "loss/hidden": 3.166015625, + "loss/jsd": 0.0, + "loss/logits": 0.1477236093953252, + "step": 27480 + }, + { + "epoch": 0.9163333333333333, + "grad_norm": 21.0, + "grad_norm_var": 0.6978515625, + "learning_rate": 2.3280316572593735e-05, + "loss": 6.851, + "loss/crossentropy": 2.0436879307031632, + "loss/hidden": 3.100390625, + "loss/jsd": 0.0, + "loss/logits": 0.15924023166298867, + "step": 27490 + }, + { + "epoch": 0.9166666666666666, + "grad_norm": 21.5, + "grad_norm_var": 0.82890625, + "learning_rate": 2.3180194846605367e-05, + "loss": 6.8103, + "loss/crossentropy": 2.0646505132317543, + "loss/hidden": 3.11796875, + "loss/jsd": 0.0, + "loss/logits": 0.14654937675222754, + "step": 27500 + }, + { + "epoch": 0.917, + "grad_norm": 21.0, + "grad_norm_var": 0.8551432291666666, + "learning_rate": 2.3080387169247687e-05, + "loss": 6.8039, + "loss/crossentropy": 2.101382979750633, + "loss/hidden": 3.2671875, + "loss/jsd": 0.0, + "loss/logits": 0.16098164729773998, + "step": 27510 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 21.375, + "grad_norm_var": 1.1291015625, + "learning_rate": 2.298089452558216e-05, + "loss": 6.7319, + "loss/crossentropy": 1.9046258434653283, + "loss/hidden": 3.130078125, + "loss/jsd": 0.0, + "loss/logits": 0.15497801061719657, + "step": 27520 + }, + { + "epoch": 0.9176666666666666, + "grad_norm": 22.0, + "grad_norm_var": 0.8749348958333333, + "learning_rate": 2.288171789756105e-05, + "loss": 6.8369, + "loss/crossentropy": 2.108339750766754, + "loss/hidden": 3.21171875, + "loss/jsd": 0.0, + "loss/logits": 0.17621326725929976, + "step": 27530 + }, + { + "epoch": 0.918, + "grad_norm": 20.875, + "grad_norm_var": 1.2478515625, + "learning_rate": 2.2782858264017598e-05, + "loss": 6.8024, + "loss/crossentropy": 2.042202705144882, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.15201376751065254, + "step": 27540 + }, + { + "epoch": 0.9183333333333333, + "grad_norm": 21.125, + "grad_norm_var": 1.3363932291666667, + "learning_rate": 2.268431660065651e-05, + "loss": 6.7997, + "loss/crossentropy": 1.8682068414986133, + "loss/hidden": 3.153125, + "loss/jsd": 0.0, + "loss/logits": 0.15471092467196285, + "step": 27550 + }, + { + "epoch": 0.9186666666666666, + "grad_norm": 21.5, + "grad_norm_var": 0.43723958333333335, + "learning_rate": 2.258609388004419e-05, + "loss": 6.7734, + "loss/crossentropy": 1.9644837513566018, + "loss/hidden": 3.20390625, + "loss/jsd": 0.0, + "loss/logits": 0.16138502229005097, + "step": 27560 + }, + { + "epoch": 0.919, + "grad_norm": 21.75, + "grad_norm_var": 0.6681640625, + "learning_rate": 2.2488191071599263e-05, + "loss": 6.799, + "loss/crossentropy": 2.0926445186138154, + "loss/hidden": 3.29765625, + "loss/jsd": 0.0, + "loss/logits": 0.19344071615487338, + "step": 27570 + }, + { + "epoch": 0.9193333333333333, + "grad_norm": 21.0, + "grad_norm_var": 0.7291015625, + "learning_rate": 2.2390609141582902e-05, + "loss": 6.7563, + "loss/crossentropy": 2.0497403740882874, + "loss/hidden": 3.1421875, + "loss/jsd": 0.0, + "loss/logits": 0.15796293318271637, + "step": 27580 + }, + { + "epoch": 0.9196666666666666, + "grad_norm": 20.5, + "grad_norm_var": 8.709375, + "learning_rate": 2.229334905308938e-05, + "loss": 6.679, + "loss/crossentropy": 1.8748921178281308, + "loss/hidden": 3.208984375, + "loss/jsd": 0.0, + "loss/logits": 0.14360655695199967, + "step": 27590 + }, + { + "epoch": 0.92, + "grad_norm": 21.75, + "grad_norm_var": 7.952018229166667, + "learning_rate": 2.219641176603649e-05, + "loss": 6.8976, + "loss/crossentropy": 1.9149666860699655, + "loss/hidden": 3.216796875, + "loss/jsd": 0.0, + "loss/logits": 0.1568117355927825, + "step": 27600 + }, + { + "epoch": 0.9203333333333333, + "grad_norm": 24.0, + "grad_norm_var": 3.1035807291666666, + "learning_rate": 2.2099798237156116e-05, + "loss": 6.8551, + "loss/crossentropy": 2.1548288121819494, + "loss/hidden": 3.271875, + "loss/jsd": 0.0, + "loss/logits": 0.1809004159644246, + "step": 27610 + }, + { + "epoch": 0.9206666666666666, + "grad_norm": 21.5, + "grad_norm_var": 3.595833333333333, + "learning_rate": 2.200350941998481e-05, + "loss": 6.8465, + "loss/crossentropy": 2.0366897195577622, + "loss/hidden": 3.103515625, + "loss/jsd": 0.0, + "loss/logits": 0.15638676267117263, + "step": 27620 + }, + { + "epoch": 0.921, + "grad_norm": 20.125, + "grad_norm_var": 0.8080729166666667, + "learning_rate": 2.1907546264854283e-05, + "loss": 6.9391, + "loss/crossentropy": 1.9020028218626976, + "loss/hidden": 3.107421875, + "loss/jsd": 0.0, + "loss/logits": 0.14446177333593369, + "step": 27630 + }, + { + "epoch": 0.9213333333333333, + "grad_norm": 21.0, + "grad_norm_var": 0.878125, + "learning_rate": 2.181190971888218e-05, + "loss": 6.8741, + "loss/crossentropy": 2.1242057621479034, + "loss/hidden": 3.25078125, + "loss/jsd": 0.0, + "loss/logits": 0.16890477053821087, + "step": 27640 + }, + { + "epoch": 0.9216666666666666, + "grad_norm": 21.5, + "grad_norm_var": 0.5712890625, + "learning_rate": 2.1716600725962562e-05, + "loss": 6.854, + "loss/crossentropy": 2.003288094699383, + "loss/hidden": 3.13203125, + "loss/jsd": 0.0, + "loss/logits": 0.15149989314377307, + "step": 27650 + }, + { + "epoch": 0.922, + "grad_norm": 21.75, + "grad_norm_var": 1.2197265625, + "learning_rate": 2.1621620226756745e-05, + "loss": 6.8234, + "loss/crossentropy": 1.8800167009234428, + "loss/hidden": 3.2140625, + "loss/jsd": 0.0, + "loss/logits": 0.154136617295444, + "step": 27660 + }, + { + "epoch": 0.9223333333333333, + "grad_norm": 20.75, + "grad_norm_var": 0.8014973958333333, + "learning_rate": 2.1526969158683875e-05, + "loss": 6.7598, + "loss/crossentropy": 2.1428465634584426, + "loss/hidden": 3.107421875, + "loss/jsd": 0.0, + "loss/logits": 0.1558793431147933, + "step": 27670 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 20.75, + "grad_norm_var": 1.2302083333333333, + "learning_rate": 2.1432648455911808e-05, + "loss": 6.8209, + "loss/crossentropy": 1.9693931117653847, + "loss/hidden": 3.183984375, + "loss/jsd": 0.0, + "loss/logits": 0.16253619380295276, + "step": 27680 + }, + { + "epoch": 0.923, + "grad_norm": 23.0, + "grad_norm_var": 1.6561848958333334, + "learning_rate": 2.1338659049347798e-05, + "loss": 6.871, + "loss/crossentropy": 2.2309056654572488, + "loss/hidden": 3.18046875, + "loss/jsd": 0.0, + "loss/logits": 0.16542233377695084, + "step": 27690 + }, + { + "epoch": 0.9233333333333333, + "grad_norm": 22.375, + "grad_norm_var": 0.6228515625, + "learning_rate": 2.1245001866629322e-05, + "loss": 6.8937, + "loss/crossentropy": 2.0580638118088244, + "loss/hidden": 3.295703125, + "loss/jsd": 0.0, + "loss/logits": 0.1742462942842394, + "step": 27700 + }, + { + "epoch": 0.9236666666666666, + "grad_norm": 23.375, + "grad_norm_var": 2.3160807291666665, + "learning_rate": 2.1151677832114996e-05, + "loss": 6.9408, + "loss/crossentropy": 1.997247189283371, + "loss/hidden": 3.149609375, + "loss/jsd": 0.0, + "loss/logits": 0.14960271613672377, + "step": 27710 + }, + { + "epoch": 0.924, + "grad_norm": 23.625, + "grad_norm_var": 2.661393229166667, + "learning_rate": 2.1058687866875328e-05, + "loss": 6.8154, + "loss/crossentropy": 1.9769588127732276, + "loss/hidden": 3.16875, + "loss/jsd": 0.0, + "loss/logits": 0.15400861240923405, + "step": 27720 + }, + { + "epoch": 0.9243333333333333, + "grad_norm": 21.875, + "grad_norm_var": 10.42265625, + "learning_rate": 2.0966032888683773e-05, + "loss": 6.8957, + "loss/crossentropy": 1.9895775854587554, + "loss/hidden": 3.198046875, + "loss/jsd": 0.0, + "loss/logits": 0.15623829020187258, + "step": 27730 + }, + { + "epoch": 0.9246666666666666, + "grad_norm": 22.25, + "grad_norm_var": 1.3666666666666667, + "learning_rate": 2.0873713812007517e-05, + "loss": 6.9308, + "loss/crossentropy": 2.0563116490840914, + "loss/hidden": 3.275, + "loss/jsd": 0.0, + "loss/logits": 0.16867623366415502, + "step": 27740 + }, + { + "epoch": 0.925, + "grad_norm": 20.875, + "grad_norm_var": 0.6372395833333333, + "learning_rate": 2.0781731547998614e-05, + "loss": 6.8815, + "loss/crossentropy": 1.8822642505168914, + "loss/hidden": 3.246875, + "loss/jsd": 0.0, + "loss/logits": 0.16076278118416668, + "step": 27750 + }, + { + "epoch": 0.9253333333333333, + "grad_norm": 19.875, + "grad_norm_var": 0.9455729166666667, + "learning_rate": 2.0690087004484844e-05, + "loss": 6.801, + "loss/crossentropy": 2.086822558939457, + "loss/hidden": 3.212109375, + "loss/jsd": 0.0, + "loss/logits": 0.16068840138614177, + "step": 27760 + }, + { + "epoch": 0.9256666666666666, + "grad_norm": 23.75, + "grad_norm_var": 1.9806640625, + "learning_rate": 2.0598781085960883e-05, + "loss": 6.8743, + "loss/crossentropy": 2.00646168962121, + "loss/hidden": 3.18203125, + "loss/jsd": 0.0, + "loss/logits": 0.15596114667132496, + "step": 27770 + }, + { + "epoch": 0.926, + "grad_norm": 21.5, + "grad_norm_var": 1.0457682291666666, + "learning_rate": 2.0507814693579263e-05, + "loss": 6.8621, + "loss/crossentropy": 1.9436368495225906, + "loss/hidden": 3.1109375, + "loss/jsd": 0.0, + "loss/logits": 0.14514606250450016, + "step": 27780 + }, + { + "epoch": 0.9263333333333333, + "grad_norm": 21.125, + "grad_norm_var": 1.446875, + "learning_rate": 2.0417188725141557e-05, + "loss": 6.8546, + "loss/crossentropy": 2.0124169424176217, + "loss/hidden": 3.271484375, + "loss/jsd": 0.0, + "loss/logits": 0.16959600700065494, + "step": 27790 + }, + { + "epoch": 0.9266666666666666, + "grad_norm": 21.125, + "grad_norm_var": 1.4510416666666666, + "learning_rate": 2.0326904075089492e-05, + "loss": 6.8477, + "loss/crossentropy": 2.062483602762222, + "loss/hidden": 3.134375, + "loss/jsd": 0.0, + "loss/logits": 0.1529495507478714, + "step": 27800 + }, + { + "epoch": 0.927, + "grad_norm": 20.25, + "grad_norm_var": 1.040625, + "learning_rate": 2.02369616344961e-05, + "loss": 6.8063, + "loss/crossentropy": 2.018744045495987, + "loss/hidden": 3.1296875, + "loss/jsd": 0.0, + "loss/logits": 0.15262581091374158, + "step": 27810 + }, + { + "epoch": 0.9273333333333333, + "grad_norm": 22.5, + "grad_norm_var": 0.7434895833333334, + "learning_rate": 2.0147362291056983e-05, + "loss": 6.8214, + "loss/crossentropy": 1.9554542362689973, + "loss/hidden": 3.255078125, + "loss/jsd": 0.0, + "loss/logits": 0.15628779772669077, + "step": 27820 + }, + { + "epoch": 0.9276666666666666, + "grad_norm": 20.375, + "grad_norm_var": 0.6375, + "learning_rate": 2.005810692908146e-05, + "loss": 6.773, + "loss/crossentropy": 1.8790936447679996, + "loss/hidden": 3.12265625, + "loss/jsd": 0.0, + "loss/logits": 0.1430271876975894, + "step": 27830 + }, + { + "epoch": 0.928, + "grad_norm": 22.75, + "grad_norm_var": 0.7018229166666666, + "learning_rate": 1.996919642948395e-05, + "loss": 6.9103, + "loss/crossentropy": 1.9114558339118957, + "loss/hidden": 3.26953125, + "loss/jsd": 0.0, + "loss/logits": 0.16306452695280313, + "step": 27840 + }, + { + "epoch": 0.9283333333333333, + "grad_norm": 22.875, + "grad_norm_var": 2.474739583333333, + "learning_rate": 1.9880631669775164e-05, + "loss": 6.9135, + "loss/crossentropy": 1.938335907459259, + "loss/hidden": 3.210546875, + "loss/jsd": 0.0, + "loss/logits": 0.15777956116944553, + "step": 27850 + }, + { + "epoch": 0.9286666666666666, + "grad_norm": 21.25, + "grad_norm_var": 0.9306640625, + "learning_rate": 1.9792413524053538e-05, + "loss": 6.8582, + "loss/crossentropy": 2.0509339734911918, + "loss/hidden": 3.16640625, + "loss/jsd": 0.0, + "loss/logits": 0.17118664290755986, + "step": 27860 + }, + { + "epoch": 0.929, + "grad_norm": 23.125, + "grad_norm_var": 1.2134765625, + "learning_rate": 1.970454286299654e-05, + "loss": 6.8609, + "loss/crossentropy": 2.035054676234722, + "loss/hidden": 3.24453125, + "loss/jsd": 0.0, + "loss/logits": 0.17292858399450778, + "step": 27870 + }, + { + "epoch": 0.9293333333333333, + "grad_norm": 21.75, + "grad_norm_var": 0.65, + "learning_rate": 1.961702055385215e-05, + "loss": 6.9531, + "loss/crossentropy": 2.035239374637604, + "loss/hidden": 3.10625, + "loss/jsd": 0.0, + "loss/logits": 0.15701537095010282, + "step": 27880 + }, + { + "epoch": 0.9296666666666666, + "grad_norm": 21.5, + "grad_norm_var": 0.8754557291666667, + "learning_rate": 1.9529847460430206e-05, + "loss": 6.7717, + "loss/crossentropy": 2.0914264246821404, + "loss/hidden": 3.19921875, + "loss/jsd": 0.0, + "loss/logits": 0.1649886442348361, + "step": 27890 + }, + { + "epoch": 0.93, + "grad_norm": 20.75, + "grad_norm_var": 2.5403116477197844e+18, + "learning_rate": 1.944302444309393e-05, + "loss": 6.8185, + "loss/crossentropy": 1.963211180269718, + "loss/hidden": 3.149609375, + "loss/jsd": 0.0, + "loss/logits": 0.15954519156366587, + "step": 27900 + }, + { + "epoch": 0.9303333333333333, + "grad_norm": 19.875, + "grad_norm_var": 2.540311647507273e+18, + "learning_rate": 1.9356552358751486e-05, + "loss": 6.8701, + "loss/crossentropy": 1.9134356677532196, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.16030828636139632, + "step": 27910 + }, + { + "epoch": 0.9306666666666666, + "grad_norm": 20.75, + "grad_norm_var": 1.9268229166666666, + "learning_rate": 1.927043206084741e-05, + "loss": 6.8608, + "loss/crossentropy": 2.0048422425985337, + "loss/hidden": 3.26015625, + "loss/jsd": 0.0, + "loss/logits": 0.16555739659816027, + "step": 27920 + }, + { + "epoch": 0.931, + "grad_norm": 21.75, + "grad_norm_var": 1.1997395833333333, + "learning_rate": 1.918466439935429e-05, + "loss": 6.8752, + "loss/crossentropy": 2.1238563142716886, + "loss/hidden": 3.2640625, + "loss/jsd": 0.0, + "loss/logits": 0.16962270541116595, + "step": 27930 + }, + { + "epoch": 0.9313333333333333, + "grad_norm": 21.875, + "grad_norm_var": 1.2989583333333334, + "learning_rate": 1.9099250220764303e-05, + "loss": 6.8435, + "loss/crossentropy": 2.0242248825728892, + "loss/hidden": 3.190234375, + "loss/jsd": 0.0, + "loss/logits": 0.17032607905566693, + "step": 27940 + }, + { + "epoch": 0.9316666666666666, + "grad_norm": 22.0, + "grad_norm_var": 0.5895182291666666, + "learning_rate": 1.9014190368080926e-05, + "loss": 7.0145, + "loss/crossentropy": 2.1218235939741135, + "loss/hidden": 3.275390625, + "loss/jsd": 0.0, + "loss/logits": 0.17224793788045645, + "step": 27950 + }, + { + "epoch": 0.932, + "grad_norm": 22.375, + "grad_norm_var": 0.7541666666666667, + "learning_rate": 1.892948568081055e-05, + "loss": 6.8804, + "loss/crossentropy": 2.163966727256775, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.1737080292776227, + "step": 27960 + }, + { + "epoch": 0.9323333333333333, + "grad_norm": 22.5, + "grad_norm_var": 0.7546223958333333, + "learning_rate": 1.884513699495426e-05, + "loss": 6.8348, + "loss/crossentropy": 2.0167593225836753, + "loss/hidden": 3.251953125, + "loss/jsd": 0.0, + "loss/logits": 0.15947899948805572, + "step": 27970 + }, + { + "epoch": 0.9326666666666666, + "grad_norm": 21.875, + "grad_norm_var": 0.39837239583333334, + "learning_rate": 1.8761145142999516e-05, + "loss": 6.8929, + "loss/crossentropy": 1.9491732098162173, + "loss/hidden": 3.207421875, + "loss/jsd": 0.0, + "loss/logits": 0.17612145710736513, + "step": 27980 + }, + { + "epoch": 0.933, + "grad_norm": 26.0, + "grad_norm_var": 1.6577473958333333, + "learning_rate": 1.8677510953911987e-05, + "loss": 6.9866, + "loss/crossentropy": 2.0607218489050867, + "loss/hidden": 3.302734375, + "loss/jsd": 0.0, + "loss/logits": 0.17759426180273294, + "step": 27990 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 21.875, + "grad_norm_var": 1.38515625, + "learning_rate": 1.8594235253127375e-05, + "loss": 7.0145, + "loss/crossentropy": 2.211387987434864, + "loss/hidden": 3.19296875, + "loss/jsd": 0.0, + "loss/logits": 0.1805992743000388, + "step": 28000 + }, + { + "epoch": 0.9336666666666666, + "grad_norm": 22.875, + "grad_norm_var": 0.5228515625, + "learning_rate": 1.851131886254319e-05, + "loss": 6.8231, + "loss/crossentropy": 2.094671034812927, + "loss/hidden": 3.12109375, + "loss/jsd": 0.0, + "loss/logits": 0.15865894313901663, + "step": 28010 + }, + { + "epoch": 0.934, + "grad_norm": 20.25, + "grad_norm_var": 0.6889973958333333, + "learning_rate": 1.8428762600510772e-05, + "loss": 6.8635, + "loss/crossentropy": 2.0300868436694146, + "loss/hidden": 3.245703125, + "loss/jsd": 0.0, + "loss/logits": 0.1639298925176263, + "step": 28020 + }, + { + "epoch": 0.9343333333333333, + "grad_norm": 25.125, + "grad_norm_var": 2.202018229166667, + "learning_rate": 1.8346567281827077e-05, + "loss": 6.7595, + "loss/crossentropy": 1.9703581586480141, + "loss/hidden": 3.2296875, + "loss/jsd": 0.0, + "loss/logits": 0.17280979938805102, + "step": 28030 + }, + { + "epoch": 0.9346666666666666, + "grad_norm": 21.625, + "grad_norm_var": 2.2697265625, + "learning_rate": 1.8264733717726722e-05, + "loss": 6.864, + "loss/crossentropy": 1.8735784053802491, + "loss/hidden": 3.290625, + "loss/jsd": 0.0, + "loss/logits": 0.17170735779218377, + "step": 28040 + }, + { + "epoch": 0.935, + "grad_norm": 21.0, + "grad_norm_var": 0.7077473958333333, + "learning_rate": 1.818326271587394e-05, + "loss": 6.871, + "loss/crossentropy": 1.9842637002468109, + "loss/hidden": 3.215234375, + "loss/jsd": 0.0, + "loss/logits": 0.16598169598728418, + "step": 28050 + }, + { + "epoch": 0.9353333333333333, + "grad_norm": 22.125, + "grad_norm_var": 1.0958333333333334, + "learning_rate": 1.8102155080354642e-05, + "loss": 6.8726, + "loss/crossentropy": 2.063341203331947, + "loss/hidden": 3.079296875, + "loss/jsd": 0.0, + "loss/logits": 0.15235913041979074, + "step": 28060 + }, + { + "epoch": 0.9356666666666666, + "grad_norm": 23.5, + "grad_norm_var": 1.3186848958333333, + "learning_rate": 1.8021411611668444e-05, + "loss": 6.8173, + "loss/crossentropy": 2.0359160229563713, + "loss/hidden": 3.2375, + "loss/jsd": 0.0, + "loss/logits": 0.16284253299236298, + "step": 28070 + }, + { + "epoch": 0.936, + "grad_norm": 21.75, + "grad_norm_var": 1.9625138911536218e+18, + "learning_rate": 1.7941033106720768e-05, + "loss": 6.8429, + "loss/crossentropy": 1.9865235716104508, + "loss/hidden": 3.175, + "loss/jsd": 0.0, + "loss/logits": 0.15688623264431953, + "step": 28080 + }, + { + "epoch": 0.9363333333333334, + "grad_norm": 21.625, + "grad_norm_var": 1.962513891247015e+18, + "learning_rate": 1.7861020358815024e-05, + "loss": 6.9292, + "loss/crossentropy": 2.052123652398586, + "loss/hidden": 3.27734375, + "loss/jsd": 0.0, + "loss/logits": 0.1626156263053417, + "step": 28090 + }, + { + "epoch": 0.9366666666666666, + "grad_norm": 21.625, + "grad_norm_var": 0.5916015625, + "learning_rate": 1.7781374157644715e-05, + "loss": 6.8609, + "loss/crossentropy": 2.0096867479383946, + "loss/hidden": 3.196484375, + "loss/jsd": 0.0, + "loss/logits": 0.15360062830150129, + "step": 28100 + }, + { + "epoch": 0.937, + "grad_norm": 21.875, + "grad_norm_var": 0.3650390625, + "learning_rate": 1.7702095289285717e-05, + "loss": 6.8755, + "loss/crossentropy": 2.258693332970142, + "loss/hidden": 3.0671875, + "loss/jsd": 0.0, + "loss/logits": 0.15729560470208526, + "step": 28110 + }, + { + "epoch": 0.9373333333333334, + "grad_norm": 21.25, + "grad_norm_var": 0.6020182291666667, + "learning_rate": 1.7623184536188424e-05, + "loss": 6.926, + "loss/crossentropy": 1.9955579489469528, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.16989528406411408, + "step": 28120 + }, + { + "epoch": 0.9376666666666666, + "grad_norm": 21.375, + "grad_norm_var": 0.6176432291666667, + "learning_rate": 1.7544642677170152e-05, + "loss": 6.9319, + "loss/crossentropy": 2.0671664133667944, + "loss/hidden": 3.248046875, + "loss/jsd": 0.0, + "loss/logits": 0.1739020137116313, + "step": 28130 + }, + { + "epoch": 0.938, + "grad_norm": 20.5, + "grad_norm_var": 0.6077473958333334, + "learning_rate": 1.74664704874073e-05, + "loss": 6.8389, + "loss/crossentropy": 1.8632956266403198, + "loss/hidden": 3.19453125, + "loss/jsd": 0.0, + "loss/logits": 0.1709655337035656, + "step": 28140 + }, + { + "epoch": 0.9383333333333334, + "grad_norm": 22.625, + "grad_norm_var": 0.9457682291666667, + "learning_rate": 1.738866873842785e-05, + "loss": 6.8875, + "loss/crossentropy": 2.1864455230534077, + "loss/hidden": 3.13828125, + "loss/jsd": 0.0, + "loss/logits": 0.1634229407645762, + "step": 28150 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 21.5, + "grad_norm_var": 0.7875, + "learning_rate": 1.7311238198103627e-05, + "loss": 6.8575, + "loss/crossentropy": 1.8006783843040466, + "loss/hidden": 3.20859375, + "loss/jsd": 0.0, + "loss/logits": 0.14259467422962188, + "step": 28160 + }, + { + "epoch": 0.939, + "grad_norm": 21.5, + "grad_norm_var": 0.4369140625, + "learning_rate": 1.7234179630642834e-05, + "loss": 6.7653, + "loss/crossentropy": 2.0104843035340307, + "loss/hidden": 3.162109375, + "loss/jsd": 0.0, + "loss/logits": 0.15113328117877245, + "step": 28170 + }, + { + "epoch": 0.9393333333333334, + "grad_norm": 20.25, + "grad_norm_var": 1709.6525390625, + "learning_rate": 1.7157493796582398e-05, + "loss": 6.8128, + "loss/crossentropy": 1.9046835117042065, + "loss/hidden": 3.2140625, + "loss/jsd": 0.0, + "loss/logits": 0.15331623200327157, + "step": 28180 + }, + { + "epoch": 0.9396666666666667, + "grad_norm": 21.75, + "grad_norm_var": 1700.040625, + "learning_rate": 1.708118145278056e-05, + "loss": 6.8447, + "loss/crossentropy": 1.9132866755127906, + "loss/hidden": 3.25859375, + "loss/jsd": 0.0, + "loss/logits": 0.16736448789015412, + "step": 28190 + }, + { + "epoch": 0.94, + "grad_norm": 21.5, + "grad_norm_var": 2.4009765625, + "learning_rate": 1.7005243352409334e-05, + "loss": 6.8374, + "loss/crossentropy": 2.1414462864398955, + "loss/hidden": 3.1203125, + "loss/jsd": 0.0, + "loss/logits": 0.157612294703722, + "step": 28200 + }, + { + "epoch": 0.9403333333333334, + "grad_norm": 21.125, + "grad_norm_var": 2.1567057291666667, + "learning_rate": 1.692968024494711e-05, + "loss": 6.7885, + "loss/crossentropy": 1.9627116709947585, + "loss/hidden": 3.141015625, + "loss/jsd": 0.0, + "loss/logits": 0.14800271224230527, + "step": 28210 + }, + { + "epoch": 0.9406666666666667, + "grad_norm": 22.25, + "grad_norm_var": 0.9718098958333333, + "learning_rate": 1.6854492876171264e-05, + "loss": 6.8826, + "loss/crossentropy": 1.9164805084466934, + "loss/hidden": 3.2640625, + "loss/jsd": 0.0, + "loss/logits": 0.1629214364103973, + "step": 28220 + }, + { + "epoch": 0.941, + "grad_norm": 21.375, + "grad_norm_var": 1.3614583333333334, + "learning_rate": 1.677968198815076e-05, + "loss": 6.731, + "loss/crossentropy": 1.940926407277584, + "loss/hidden": 3.167578125, + "loss/jsd": 0.0, + "loss/logits": 0.1472564697265625, + "step": 28230 + }, + { + "epoch": 0.9413333333333334, + "grad_norm": 21.25, + "grad_norm_var": 1.4962890625, + "learning_rate": 1.6705248319238876e-05, + "loss": 6.9204, + "loss/crossentropy": 2.1460791036486624, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.15511545334011317, + "step": 28240 + }, + { + "epoch": 0.9416666666666667, + "grad_norm": 20.625, + "grad_norm_var": 1.2593098958333333, + "learning_rate": 1.6631192604065855e-05, + "loss": 6.7155, + "loss/crossentropy": 2.0769747786223887, + "loss/hidden": 3.1859375, + "loss/jsd": 0.0, + "loss/logits": 0.1538231515791267, + "step": 28250 + }, + { + "epoch": 0.942, + "grad_norm": 22.75, + "grad_norm_var": 1.3264973958333333, + "learning_rate": 1.6557515573531724e-05, + "loss": 6.913, + "loss/crossentropy": 2.01955421641469, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.1581470502540469, + "step": 28260 + }, + { + "epoch": 0.9423333333333334, + "grad_norm": 21.0, + "grad_norm_var": 1.2729166666666667, + "learning_rate": 1.6484217954799018e-05, + "loss": 6.7413, + "loss/crossentropy": 1.9801385045051574, + "loss/hidden": 3.24609375, + "loss/jsd": 0.0, + "loss/logits": 0.16784953828901053, + "step": 28270 + }, + { + "epoch": 0.9426666666666667, + "grad_norm": 20.5, + "grad_norm_var": 0.51015625, + "learning_rate": 1.6411300471285656e-05, + "loss": 6.8198, + "loss/crossentropy": 2.2034427911043166, + "loss/hidden": 3.079296875, + "loss/jsd": 0.0, + "loss/logits": 0.15067932959645985, + "step": 28280 + }, + { + "epoch": 0.943, + "grad_norm": 23.875, + "grad_norm_var": 1.01015625, + "learning_rate": 1.6338763842657757e-05, + "loss": 6.9264, + "loss/crossentropy": 1.979648907482624, + "loss/hidden": 3.31171875, + "loss/jsd": 0.0, + "loss/logits": 0.1757309900596738, + "step": 28290 + }, + { + "epoch": 0.9433333333333334, + "grad_norm": 22.375, + "grad_norm_var": 2.1809895833333335, + "learning_rate": 1.6266608784822544e-05, + "loss": 6.7541, + "loss/crossentropy": 2.0895672395825384, + "loss/hidden": 3.17265625, + "loss/jsd": 0.0, + "loss/logits": 0.16868185754865408, + "step": 28300 + }, + { + "epoch": 0.9436666666666667, + "grad_norm": 20.625, + "grad_norm_var": 0.7218098958333333, + "learning_rate": 1.6194836009921332e-05, + "loss": 6.7872, + "loss/crossentropy": 1.9959282279014587, + "loss/hidden": 3.15625, + "loss/jsd": 0.0, + "loss/logits": 0.1535317923873663, + "step": 28310 + }, + { + "epoch": 0.944, + "grad_norm": 21.5, + "grad_norm_var": 0.9622395833333334, + "learning_rate": 1.6123446226322414e-05, + "loss": 6.7534, + "loss/crossentropy": 2.1295602142810823, + "loss/hidden": 3.19921875, + "loss/jsd": 0.0, + "loss/logits": 0.16768959537148476, + "step": 28320 + }, + { + "epoch": 0.9443333333333334, + "grad_norm": 21.125, + "grad_norm_var": 1.1333333333333333, + "learning_rate": 1.6052440138614155e-05, + "loss": 6.9399, + "loss/crossentropy": 2.0156208984553814, + "loss/hidden": 3.18828125, + "loss/jsd": 0.0, + "loss/logits": 0.15371856791898608, + "step": 28330 + }, + { + "epoch": 0.9446666666666667, + "grad_norm": 21.0, + "grad_norm_var": 1.05, + "learning_rate": 1.598181844759795e-05, + "loss": 6.8335, + "loss/crossentropy": 2.0985936269164087, + "loss/hidden": 3.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.1626562364399433, + "step": 28340 + }, + { + "epoch": 0.945, + "grad_norm": 20.75, + "grad_norm_var": 1.15, + "learning_rate": 1.5911581850281403e-05, + "loss": 6.7963, + "loss/crossentropy": 1.8603245675563813, + "loss/hidden": 3.183203125, + "loss/jsd": 0.0, + "loss/logits": 0.14134703744202853, + "step": 28350 + }, + { + "epoch": 0.9453333333333334, + "grad_norm": 21.5, + "grad_norm_var": 0.72890625, + "learning_rate": 1.5841731039871348e-05, + "loss": 6.7295, + "loss/crossentropy": 2.0212074637413027, + "loss/hidden": 3.219921875, + "loss/jsd": 0.0, + "loss/logits": 0.15525523126125335, + "step": 28360 + }, + { + "epoch": 0.9456666666666667, + "grad_norm": 23.625, + "grad_norm_var": 1.4372395833333333, + "learning_rate": 1.5772266705767108e-05, + "loss": 6.8022, + "loss/crossentropy": 2.0861593782901764, + "loss/hidden": 3.192578125, + "loss/jsd": 0.0, + "loss/logits": 0.17841291818767785, + "step": 28370 + }, + { + "epoch": 0.946, + "grad_norm": 22.375, + "grad_norm_var": 2.3707682291666665, + "learning_rate": 1.5703189533553605e-05, + "loss": 6.9794, + "loss/crossentropy": 2.1092930763959883, + "loss/hidden": 3.28515625, + "loss/jsd": 0.0, + "loss/logits": 0.18949546683579682, + "step": 28380 + }, + { + "epoch": 0.9463333333333334, + "grad_norm": 21.375, + "grad_norm_var": 0.9843098958333333, + "learning_rate": 1.563450020499463e-05, + "loss": 6.8461, + "loss/crossentropy": 2.025676953792572, + "loss/hidden": 3.21015625, + "loss/jsd": 0.0, + "loss/logits": 0.17395553570240735, + "step": 28390 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 21.25, + "grad_norm_var": 3.162652818595678e+18, + "learning_rate": 1.556619939802615e-05, + "loss": 6.8821, + "loss/crossentropy": 2.098123352229595, + "loss/hidden": 3.2015625, + "loss/jsd": 0.0, + "loss/logits": 0.1534987824037671, + "step": 28400 + }, + { + "epoch": 0.947, + "grad_norm": 22.0, + "grad_norm_var": 0.74140625, + "learning_rate": 1.549828778674953e-05, + "loss": 6.9398, + "loss/crossentropy": 2.09067225754261, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.17208856642246245, + "step": 28410 + }, + { + "epoch": 0.9473333333333334, + "grad_norm": 22.75, + "grad_norm_var": 0.55390625, + "learning_rate": 1.5430766041424978e-05, + "loss": 6.8388, + "loss/crossentropy": 1.8563894510269165, + "loss/hidden": 3.2703125, + "loss/jsd": 0.0, + "loss/logits": 0.15164101766422391, + "step": 28420 + }, + { + "epoch": 0.9476666666666667, + "grad_norm": 23.125, + "grad_norm_var": 0.8442057291666667, + "learning_rate": 1.536363482846484e-05, + "loss": 6.8602, + "loss/crossentropy": 1.9244048327207566, + "loss/hidden": 3.348828125, + "loss/jsd": 0.0, + "loss/logits": 0.15772407911717892, + "step": 28430 + }, + { + "epoch": 0.948, + "grad_norm": 20.25, + "grad_norm_var": 94.33743489583334, + "learning_rate": 1.529689481042711e-05, + "loss": 6.8611, + "loss/crossentropy": 1.9303564444184302, + "loss/hidden": 3.19921875, + "loss/jsd": 0.0, + "loss/logits": 0.1556209173053503, + "step": 28440 + }, + { + "epoch": 0.9483333333333334, + "grad_norm": 22.125, + "grad_norm_var": 94.33483072916667, + "learning_rate": 1.5230546646008795e-05, + "loss": 6.8918, + "loss/crossentropy": 2.0158408626914026, + "loss/hidden": 3.240234375, + "loss/jsd": 0.0, + "loss/logits": 0.15476850140839815, + "step": 28450 + }, + { + "epoch": 0.9486666666666667, + "grad_norm": 21.875, + "grad_norm_var": 1.2551432291666667, + "learning_rate": 1.516459099003952e-05, + "loss": 6.802, + "loss/crossentropy": 2.104232335090637, + "loss/hidden": 3.19375, + "loss/jsd": 0.0, + "loss/logits": 0.16216706801205874, + "step": 28460 + }, + { + "epoch": 0.949, + "grad_norm": 23.375, + "grad_norm_var": 2.8147497485817175e+18, + "learning_rate": 1.5099028493474956e-05, + "loss": 6.8888, + "loss/crossentropy": 2.0458613131195307, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.17042651497758926, + "step": 28470 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 21.5, + "grad_norm_var": 2.814749748588708e+18, + "learning_rate": 1.50338598033905e-05, + "loss": 6.8804, + "loss/crossentropy": 2.069540320336819, + "loss/hidden": 3.151953125, + "loss/jsd": 0.0, + "loss/logits": 0.15929017215967178, + "step": 28480 + }, + { + "epoch": 0.9496666666666667, + "grad_norm": 21.75, + "grad_norm_var": 2.448958333333333, + "learning_rate": 1.49690855629748e-05, + "loss": 6.8573, + "loss/crossentropy": 2.0447287276387214, + "loss/hidden": 3.244921875, + "loss/jsd": 0.0, + "loss/logits": 0.17378965076059102, + "step": 28490 + }, + { + "epoch": 0.95, + "grad_norm": 23.125, + "grad_norm_var": 4.262239583333334, + "learning_rate": 1.490470641152345e-05, + "loss": 6.8061, + "loss/crossentropy": 2.057085025310516, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.15816838014870882, + "step": 28500 + }, + { + "epoch": 0.9503333333333334, + "grad_norm": 22.0, + "grad_norm_var": 5.236458333333333, + "learning_rate": 1.4840722984432701e-05, + "loss": 6.8545, + "loss/crossentropy": 1.9297899812459947, + "loss/hidden": 3.16015625, + "loss/jsd": 0.0, + "loss/logits": 0.15209535052999854, + "step": 28510 + }, + { + "epoch": 0.9506666666666667, + "grad_norm": 21.75, + "grad_norm_var": 2.0322916666666666, + "learning_rate": 1.4777135913193132e-05, + "loss": 6.8245, + "loss/crossentropy": 2.107157987356186, + "loss/hidden": 3.212890625, + "loss/jsd": 0.0, + "loss/logits": 0.16366199534386397, + "step": 28520 + }, + { + "epoch": 0.951, + "grad_norm": 22.0, + "grad_norm_var": 0.25833333333333336, + "learning_rate": 1.471394582538348e-05, + "loss": 6.8325, + "loss/crossentropy": 2.0403877660632133, + "loss/hidden": 3.17265625, + "loss/jsd": 0.0, + "loss/logits": 0.16406202521175145, + "step": 28530 + }, + { + "epoch": 0.9513333333333334, + "grad_norm": 24.125, + "grad_norm_var": 3.139518229166667, + "learning_rate": 1.4651153344664387e-05, + "loss": 6.9738, + "loss/crossentropy": 2.258484014868736, + "loss/hidden": 3.1625, + "loss/jsd": 0.0, + "loss/logits": 0.15992612596601247, + "step": 28540 + }, + { + "epoch": 0.9516666666666667, + "grad_norm": 20.375, + "grad_norm_var": 1.6613932291666667, + "learning_rate": 1.4588759090772302e-05, + "loss": 6.8308, + "loss/crossentropy": 2.0535311087965966, + "loss/hidden": 3.146484375, + "loss/jsd": 0.0, + "loss/logits": 0.167077792994678, + "step": 28550 + }, + { + "epoch": 0.952, + "grad_norm": 20.75, + "grad_norm_var": 2.121809895833333, + "learning_rate": 1.4526763679513303e-05, + "loss": 6.9378, + "loss/crossentropy": 2.1362095795571805, + "loss/hidden": 3.16875, + "loss/jsd": 0.0, + "loss/logits": 0.15281093278899788, + "step": 28560 + }, + { + "epoch": 0.9523333333333334, + "grad_norm": 21.875, + "grad_norm_var": 1.9955729166666667, + "learning_rate": 1.446516772275709e-05, + "loss": 6.8711, + "loss/crossentropy": 1.9070044673979283, + "loss/hidden": 3.177734375, + "loss/jsd": 0.0, + "loss/logits": 0.16114689372479915, + "step": 28570 + }, + { + "epoch": 0.9526666666666667, + "grad_norm": 22.25, + "grad_norm_var": 0.5455729166666666, + "learning_rate": 1.440397182843088e-05, + "loss": 6.8352, + "loss/crossentropy": 1.896916215121746, + "loss/hidden": 3.27734375, + "loss/jsd": 0.0, + "loss/logits": 0.16120940092951058, + "step": 28580 + }, + { + "epoch": 0.953, + "grad_norm": 22.75, + "grad_norm_var": 0.5768229166666666, + "learning_rate": 1.4343176600513433e-05, + "loss": 6.9722, + "loss/crossentropy": 2.0961402654647827, + "loss/hidden": 3.2859375, + "loss/jsd": 0.0, + "loss/logits": 0.18284041043370963, + "step": 28590 + }, + { + "epoch": 0.9533333333333334, + "grad_norm": 21.875, + "grad_norm_var": 0.6259765625, + "learning_rate": 1.428278263902913e-05, + "loss": 6.8938, + "loss/crossentropy": 1.9410855919122696, + "loss/hidden": 3.192578125, + "loss/jsd": 0.0, + "loss/logits": 0.15987232998013495, + "step": 28600 + }, + { + "epoch": 0.9536666666666667, + "grad_norm": 22.625, + "grad_norm_var": 0.6947265625, + "learning_rate": 1.422279054004196e-05, + "loss": 6.7791, + "loss/crossentropy": 2.0120147198438643, + "loss/hidden": 3.147265625, + "loss/jsd": 0.0, + "loss/logits": 0.15940133705735207, + "step": 28610 + }, + { + "epoch": 0.954, + "grad_norm": 24.875, + "grad_norm_var": 1.1875, + "learning_rate": 1.4163200895649742e-05, + "loss": 6.9824, + "loss/crossentropy": 1.899172729998827, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.14629402589052914, + "step": 28620 + }, + { + "epoch": 0.9543333333333334, + "grad_norm": 22.625, + "grad_norm_var": 2.0936848958333334, + "learning_rate": 1.4104014293978196e-05, + "loss": 6.8647, + "loss/crossentropy": 2.0003262996673583, + "loss/hidden": 3.188671875, + "loss/jsd": 0.0, + "loss/logits": 0.1569441094994545, + "step": 28630 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 22.625, + "grad_norm_var": 1.5327473958333333, + "learning_rate": 1.4045231319175198e-05, + "loss": 6.9435, + "loss/crossentropy": 2.0988379955291747, + "loss/hidden": 3.31328125, + "loss/jsd": 0.0, + "loss/logits": 0.17967260386794806, + "step": 28640 + }, + { + "epoch": 0.955, + "grad_norm": 21.375, + "grad_norm_var": 1.1369140625, + "learning_rate": 1.3986852551404964e-05, + "loss": 6.8599, + "loss/crossentropy": 2.1141707748174667, + "loss/hidden": 3.1359375, + "loss/jsd": 0.0, + "loss/logits": 0.15787138119339944, + "step": 28650 + }, + { + "epoch": 0.9553333333333334, + "grad_norm": 23.75, + "grad_norm_var": 0.8989583333333333, + "learning_rate": 1.3928878566842376e-05, + "loss": 6.8908, + "loss/crossentropy": 2.0345154732465742, + "loss/hidden": 3.225, + "loss/jsd": 0.0, + "loss/logits": 0.18289269097149372, + "step": 28660 + }, + { + "epoch": 0.9556666666666667, + "grad_norm": 22.75, + "grad_norm_var": 0.9889973958333333, + "learning_rate": 1.3871309937667253e-05, + "loss": 6.9933, + "loss/crossentropy": 2.1303988128900526, + "loss/hidden": 3.21171875, + "loss/jsd": 0.0, + "loss/logits": 0.17532578259706497, + "step": 28670 + }, + { + "epoch": 0.956, + "grad_norm": 22.125, + "grad_norm_var": 1.1150390625, + "learning_rate": 1.3814147232058714e-05, + "loss": 6.6818, + "loss/crossentropy": 1.7408723145723344, + "loss/hidden": 3.17265625, + "loss/jsd": 0.0, + "loss/logits": 0.15916957296431064, + "step": 28680 + }, + { + "epoch": 0.9563333333333334, + "grad_norm": 22.5, + "grad_norm_var": 1.0160807291666667, + "learning_rate": 1.3757391014189596e-05, + "loss": 6.9554, + "loss/crossentropy": 1.9530368164181708, + "loss/hidden": 3.181640625, + "loss/jsd": 0.0, + "loss/logits": 0.18459425549954175, + "step": 28690 + }, + { + "epoch": 0.9566666666666667, + "grad_norm": 21.5, + "grad_norm_var": 1.0666015625, + "learning_rate": 1.3701041844220849e-05, + "loss": 6.9349, + "loss/crossentropy": 1.9662514954805375, + "loss/hidden": 3.18671875, + "loss/jsd": 0.0, + "loss/logits": 0.17320307586342096, + "step": 28700 + }, + { + "epoch": 0.957, + "grad_norm": 22.125, + "grad_norm_var": 1.0697265625, + "learning_rate": 1.3645100278296047e-05, + "loss": 6.937, + "loss/crossentropy": 2.046416383981705, + "loss/hidden": 3.233984375, + "loss/jsd": 0.0, + "loss/logits": 0.19815693870186807, + "step": 28710 + }, + { + "epoch": 0.9573333333333334, + "grad_norm": 21.5, + "grad_norm_var": 1.4759765625, + "learning_rate": 1.3589566868535836e-05, + "loss": 6.8148, + "loss/crossentropy": 2.0860094636678697, + "loss/hidden": 3.12734375, + "loss/jsd": 0.0, + "loss/logits": 0.15798233803361655, + "step": 28720 + }, + { + "epoch": 0.9576666666666667, + "grad_norm": 20.75, + "grad_norm_var": 1.5302083333333334, + "learning_rate": 1.3534442163032574e-05, + "loss": 6.8177, + "loss/crossentropy": 2.1211801931262015, + "loss/hidden": 3.222265625, + "loss/jsd": 0.0, + "loss/logits": 0.1808505615219474, + "step": 28730 + }, + { + "epoch": 0.958, + "grad_norm": 22.375, + "grad_norm_var": 0.9270833333333334, + "learning_rate": 1.347972670584483e-05, + "loss": 6.8425, + "loss/crossentropy": 1.9814658090472221, + "loss/hidden": 3.116796875, + "loss/jsd": 0.0, + "loss/logits": 0.15671081114560365, + "step": 28740 + }, + { + "epoch": 0.9583333333333334, + "grad_norm": 20.75, + "grad_norm_var": 0.8885416666666667, + "learning_rate": 1.3425421036992098e-05, + "loss": 6.7837, + "loss/crossentropy": 1.9576505310833454, + "loss/hidden": 3.153515625, + "loss/jsd": 0.0, + "loss/logits": 0.1521891091018915, + "step": 28750 + }, + { + "epoch": 0.9586666666666667, + "grad_norm": 20.625, + "grad_norm_var": 0.6791666666666667, + "learning_rate": 1.3371525692449394e-05, + "loss": 6.9583, + "loss/crossentropy": 2.1901199877262116, + "loss/hidden": 3.2046875, + "loss/jsd": 0.0, + "loss/logits": 0.16546592973172664, + "step": 28760 + }, + { + "epoch": 0.959, + "grad_norm": 22.25, + "grad_norm_var": 2.3824041859454684e+18, + "learning_rate": 1.3318041204142004e-05, + "loss": 6.9222, + "loss/crossentropy": 1.9837070412933826, + "loss/hidden": 3.20234375, + "loss/jsd": 0.0, + "loss/logits": 0.16084651360288263, + "step": 28770 + }, + { + "epoch": 0.9593333333333334, + "grad_norm": 20.375, + "grad_norm_var": 1.9385416666666666, + "learning_rate": 1.3264968099940245e-05, + "loss": 6.9218, + "loss/crossentropy": 2.1181742370128633, + "loss/hidden": 3.244140625, + "loss/jsd": 0.0, + "loss/logits": 0.18821782916784285, + "step": 28780 + }, + { + "epoch": 0.9596666666666667, + "grad_norm": 24.0, + "grad_norm_var": 2.1869140625, + "learning_rate": 1.321230690365422e-05, + "loss": 6.8798, + "loss/crossentropy": 2.0393978893756866, + "loss/hidden": 3.191796875, + "loss/jsd": 0.0, + "loss/logits": 0.15889321286231278, + "step": 28790 + }, + { + "epoch": 0.96, + "grad_norm": 22.25, + "grad_norm_var": 1.2372395833333334, + "learning_rate": 1.3160058135028691e-05, + "loss": 6.8816, + "loss/crossentropy": 1.9314091876149178, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.154315375816077, + "step": 28800 + }, + { + "epoch": 0.9603333333333334, + "grad_norm": 21.125, + "grad_norm_var": 1.59765625, + "learning_rate": 1.3108222309737892e-05, + "loss": 6.9788, + "loss/crossentropy": 1.9973760724067688, + "loss/hidden": 3.198828125, + "loss/jsd": 0.0, + "loss/logits": 0.1759620831348002, + "step": 28810 + }, + { + "epoch": 0.9606666666666667, + "grad_norm": 22.375, + "grad_norm_var": 1.0561848958333333, + "learning_rate": 1.305679993938051e-05, + "loss": 6.8864, + "loss/crossentropy": 2.043069842457771, + "loss/hidden": 3.166796875, + "loss/jsd": 0.0, + "loss/logits": 0.15617891773581505, + "step": 28820 + }, + { + "epoch": 0.961, + "grad_norm": 22.125, + "grad_norm_var": 12.6650390625, + "learning_rate": 1.3005791531474562e-05, + "loss": 6.8934, + "loss/crossentropy": 2.0857333853840827, + "loss/hidden": 3.164453125, + "loss/jsd": 0.0, + "loss/logits": 0.1577781980857253, + "step": 28830 + }, + { + "epoch": 0.9613333333333334, + "grad_norm": 21.625, + "grad_norm_var": 12.689322916666667, + "learning_rate": 1.2955197589452462e-05, + "loss": 6.8934, + "loss/crossentropy": 1.9508272759616374, + "loss/hidden": 3.271484375, + "loss/jsd": 0.0, + "loss/logits": 0.16099842144176363, + "step": 28840 + }, + { + "epoch": 0.9616666666666667, + "grad_norm": 23.25, + "grad_norm_var": 0.4393229166666667, + "learning_rate": 1.2905018612655975e-05, + "loss": 6.86, + "loss/crossentropy": 2.013827832788229, + "loss/hidden": 3.207421875, + "loss/jsd": 0.0, + "loss/logits": 0.16024797260761262, + "step": 28850 + }, + { + "epoch": 0.962, + "grad_norm": 21.5, + "grad_norm_var": 0.6760416666666667, + "learning_rate": 1.2855255096331348e-05, + "loss": 6.9056, + "loss/crossentropy": 2.1667084366083147, + "loss/hidden": 3.098046875, + "loss/jsd": 0.0, + "loss/logits": 0.15538214575499296, + "step": 28860 + }, + { + "epoch": 0.9623333333333334, + "grad_norm": 22.75, + "grad_norm_var": 0.5228515625, + "learning_rate": 1.2805907531624403e-05, + "loss": 6.7589, + "loss/crossentropy": 1.8827613063156605, + "loss/hidden": 3.068359375, + "loss/jsd": 0.0, + "loss/logits": 0.14002714012749493, + "step": 28870 + }, + { + "epoch": 0.9626666666666667, + "grad_norm": 23.125, + "grad_norm_var": 1.7014973958333333, + "learning_rate": 1.2756976405575668e-05, + "loss": 6.8948, + "loss/crossentropy": 1.9622853726148606, + "loss/hidden": 3.303515625, + "loss/jsd": 0.0, + "loss/logits": 0.1584384061396122, + "step": 28880 + }, + { + "epoch": 0.963, + "grad_norm": 21.875, + "grad_norm_var": 1.6854166666666666, + "learning_rate": 1.2708462201115617e-05, + "loss": 6.7792, + "loss/crossentropy": 1.8909013763070106, + "loss/hidden": 3.235546875, + "loss/jsd": 0.0, + "loss/logits": 0.16017456604167818, + "step": 28890 + }, + { + "epoch": 0.9633333333333334, + "grad_norm": 22.375, + "grad_norm_var": 0.5988932291666667, + "learning_rate": 1.2660365397059856e-05, + "loss": 6.7184, + "loss/crossentropy": 1.9674251511693002, + "loss/hidden": 3.17890625, + "loss/jsd": 0.0, + "loss/logits": 0.1554909586906433, + "step": 28900 + }, + { + "epoch": 0.9636666666666667, + "grad_norm": 23.125, + "grad_norm_var": 0.99140625, + "learning_rate": 1.2612686468104426e-05, + "loss": 6.8514, + "loss/crossentropy": 2.0045790046453478, + "loss/hidden": 3.18046875, + "loss/jsd": 0.0, + "loss/logits": 0.16668143030256033, + "step": 28910 + }, + { + "epoch": 0.964, + "grad_norm": 22.125, + "grad_norm_var": 1.246875, + "learning_rate": 1.2565425884821096e-05, + "loss": 6.9451, + "loss/crossentropy": 2.1058658018708227, + "loss/hidden": 3.23671875, + "loss/jsd": 0.0, + "loss/logits": 0.17369681475684046, + "step": 28920 + }, + { + "epoch": 0.9643333333333334, + "grad_norm": 22.125, + "grad_norm_var": 0.9249348958333333, + "learning_rate": 1.2518584113652767e-05, + "loss": 6.92, + "loss/crossentropy": 2.0629913471639156, + "loss/hidden": 3.19453125, + "loss/jsd": 0.0, + "loss/logits": 0.16003647521138192, + "step": 28930 + }, + { + "epoch": 0.9646666666666667, + "grad_norm": 23.75, + "grad_norm_var": 0.7143229166666667, + "learning_rate": 1.247216161690879e-05, + "loss": 6.9623, + "loss/crossentropy": 2.127345842123032, + "loss/hidden": 3.251171875, + "loss/jsd": 0.0, + "loss/logits": 0.17870207615196704, + "step": 28940 + }, + { + "epoch": 0.965, + "grad_norm": 22.0, + "grad_norm_var": 0.7660807291666667, + "learning_rate": 1.2426158852760462e-05, + "loss": 6.7875, + "loss/crossentropy": 1.906770334392786, + "loss/hidden": 3.098828125, + "loss/jsd": 0.0, + "loss/logits": 0.14323475370183586, + "step": 28950 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 22.75, + "grad_norm_var": 2.2660807291666667, + "learning_rate": 1.2380576275236511e-05, + "loss": 6.8731, + "loss/crossentropy": 2.055904617905617, + "loss/hidden": 3.18046875, + "loss/jsd": 0.0, + "loss/logits": 0.15869035460054876, + "step": 28960 + }, + { + "epoch": 0.9656666666666667, + "grad_norm": 24.125, + "grad_norm_var": 1.1556640625, + "learning_rate": 1.2335414334218561e-05, + "loss": 7.0247, + "loss/crossentropy": 2.0497659265995027, + "loss/hidden": 3.180859375, + "loss/jsd": 0.0, + "loss/logits": 0.1614781607873738, + "step": 28970 + }, + { + "epoch": 0.966, + "grad_norm": 22.25, + "grad_norm_var": 1.1497395833333333, + "learning_rate": 1.229067347543675e-05, + "loss": 6.8011, + "loss/crossentropy": 2.0531945556402205, + "loss/hidden": 3.221875, + "loss/jsd": 0.0, + "loss/logits": 0.16420614402741193, + "step": 28980 + }, + { + "epoch": 0.9663333333333334, + "grad_norm": 21.625, + "grad_norm_var": 0.8541666666666666, + "learning_rate": 1.224635414046527e-05, + "loss": 6.8427, + "loss/crossentropy": 1.9432912215590477, + "loss/hidden": 3.16328125, + "loss/jsd": 0.0, + "loss/logits": 0.16180085185915233, + "step": 28990 + }, + { + "epoch": 0.9666666666666667, + "grad_norm": 20.875, + "grad_norm_var": 0.6416666666666667, + "learning_rate": 1.2202456766718093e-05, + "loss": 6.7551, + "loss/crossentropy": 2.1089743584394456, + "loss/hidden": 3.159375, + "loss/jsd": 0.0, + "loss/logits": 0.1651729150209576, + "step": 29000 + }, + { + "epoch": 0.967, + "grad_norm": 21.0, + "grad_norm_var": 1.4041666666666666, + "learning_rate": 1.2158981787444552e-05, + "loss": 6.8471, + "loss/crossentropy": 1.9657625079154968, + "loss/hidden": 3.24453125, + "loss/jsd": 0.0, + "loss/logits": 0.16529466435313225, + "step": 29010 + }, + { + "epoch": 0.9673333333333334, + "grad_norm": 22.25, + "grad_norm_var": 0.9593098958333334, + "learning_rate": 1.2115929631725158e-05, + "loss": 6.8562, + "loss/crossentropy": 1.982128444686532, + "loss/hidden": 3.14296875, + "loss/jsd": 0.0, + "loss/logits": 0.15026735952123998, + "step": 29020 + }, + { + "epoch": 0.9676666666666667, + "grad_norm": 24.0, + "grad_norm_var": 1.4122395833333334, + "learning_rate": 1.2073300724467295e-05, + "loss": 6.7686, + "loss/crossentropy": 2.0765829384326935, + "loss/hidden": 3.253125, + "loss/jsd": 0.0, + "loss/logits": 0.16304893530905246, + "step": 29030 + }, + { + "epoch": 0.968, + "grad_norm": 22.375, + "grad_norm_var": 2.3053504105878477e+18, + "learning_rate": 1.2031095486401069e-05, + "loss": 6.9941, + "loss/crossentropy": 2.093338930606842, + "loss/hidden": 3.43125, + "loss/jsd": 0.0, + "loss/logits": 0.16548265926539898, + "step": 29040 + }, + { + "epoch": 0.9683333333333334, + "grad_norm": 26.5, + "grad_norm_var": 2.305350410347444e+18, + "learning_rate": 1.1989314334075145e-05, + "loss": 6.9207, + "loss/crossentropy": 2.0706306278705595, + "loss/hidden": 3.175, + "loss/jsd": 0.0, + "loss/logits": 0.1563433837145567, + "step": 29050 + }, + { + "epoch": 0.9686666666666667, + "grad_norm": 23.5, + "grad_norm_var": 2.209830729166667, + "learning_rate": 1.1947957679852627e-05, + "loss": 6.9284, + "loss/crossentropy": 1.939845222979784, + "loss/hidden": 3.170703125, + "loss/jsd": 0.0, + "loss/logits": 0.15329579524695874, + "step": 29060 + }, + { + "epoch": 0.969, + "grad_norm": 22.5, + "grad_norm_var": 0.6143229166666667, + "learning_rate": 1.1907025931907e-05, + "loss": 6.8166, + "loss/crossentropy": 2.0235880702733993, + "loss/hidden": 3.131640625, + "loss/jsd": 0.0, + "loss/logits": 0.15207564570009707, + "step": 29070 + }, + { + "epoch": 0.9693333333333334, + "grad_norm": 23.125, + "grad_norm_var": 1.1895182291666666, + "learning_rate": 1.1866519494218084e-05, + "loss": 6.9347, + "loss/crossentropy": 2.039486038684845, + "loss/hidden": 3.239453125, + "loss/jsd": 0.0, + "loss/logits": 0.18337175534106792, + "step": 29080 + }, + { + "epoch": 0.9696666666666667, + "grad_norm": 22.75, + "grad_norm_var": 1.1238932291666666, + "learning_rate": 1.1826438766568076e-05, + "loss": 6.8713, + "loss/crossentropy": 2.1116551235318184, + "loss/hidden": 3.141015625, + "loss/jsd": 0.0, + "loss/logits": 0.16442451104521752, + "step": 29090 + }, + { + "epoch": 0.97, + "grad_norm": 22.375, + "grad_norm_var": 1.9671223958333333, + "learning_rate": 1.1786784144537563e-05, + "loss": 6.8258, + "loss/crossentropy": 2.092792363464832, + "loss/hidden": 3.242578125, + "loss/jsd": 0.0, + "loss/logits": 0.1660682398825884, + "step": 29100 + }, + { + "epoch": 0.9703333333333334, + "grad_norm": 21.75, + "grad_norm_var": 1.4775390625, + "learning_rate": 1.1747556019501665e-05, + "loss": 6.8014, + "loss/crossentropy": 2.1302355214953423, + "loss/hidden": 3.139453125, + "loss/jsd": 0.0, + "loss/logits": 0.1670895716175437, + "step": 29110 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 21.5, + "grad_norm_var": 1.8885416666666666, + "learning_rate": 1.1708754778626134e-05, + "loss": 6.9092, + "loss/crossentropy": 2.0474965393543245, + "loss/hidden": 3.373046875, + "loss/jsd": 0.0, + "loss/logits": 0.19082598555833102, + "step": 29120 + }, + { + "epoch": 0.971, + "grad_norm": 24.375, + "grad_norm_var": 2.9284656384458097e+18, + "learning_rate": 1.1670380804863557e-05, + "loss": 7.0317, + "loss/crossentropy": 2.060644108057022, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.18089290745556355, + "step": 29130 + }, + { + "epoch": 0.9713333333333334, + "grad_norm": 22.5, + "grad_norm_var": 0.46868489583333334, + "learning_rate": 1.1632434476949564e-05, + "loss": 6.9555, + "loss/crossentropy": 2.125787417590618, + "loss/hidden": 3.22421875, + "loss/jsd": 0.0, + "loss/logits": 0.17629719469696284, + "step": 29140 + }, + { + "epoch": 0.9716666666666667, + "grad_norm": 22.5, + "grad_norm_var": 0.6518229166666667, + "learning_rate": 1.1594916169399088e-05, + "loss": 6.8684, + "loss/crossentropy": 2.202189776301384, + "loss/hidden": 3.119921875, + "loss/jsd": 0.0, + "loss/logits": 0.15515435487031937, + "step": 29150 + }, + { + "epoch": 0.972, + "grad_norm": 22.25, + "grad_norm_var": 0.8455729166666667, + "learning_rate": 1.1557826252502677e-05, + "loss": 6.8047, + "loss/crossentropy": 2.0502734132111073, + "loss/hidden": 3.1578125, + "loss/jsd": 0.0, + "loss/logits": 0.1607513885013759, + "step": 29160 + }, + { + "epoch": 0.9723333333333334, + "grad_norm": 22.0, + "grad_norm_var": 0.8205729166666667, + "learning_rate": 1.1521165092322836e-05, + "loss": 6.8834, + "loss/crossentropy": 2.01381069123745, + "loss/hidden": 3.333984375, + "loss/jsd": 0.0, + "loss/logits": 0.16493042316287757, + "step": 29170 + }, + { + "epoch": 0.9726666666666667, + "grad_norm": 25.0, + "grad_norm_var": 0.646875, + "learning_rate": 1.1484933050690425e-05, + "loss": 6.9093, + "loss/crossentropy": 2.077186991274357, + "loss/hidden": 3.162890625, + "loss/jsd": 0.0, + "loss/logits": 0.1712075762450695, + "step": 29180 + }, + { + "epoch": 0.973, + "grad_norm": 22.375, + "grad_norm_var": 0.759375, + "learning_rate": 1.1449130485201056e-05, + "loss": 6.8158, + "loss/crossentropy": 1.9148377593606711, + "loss/hidden": 3.103125, + "loss/jsd": 0.0, + "loss/logits": 0.14505248717032374, + "step": 29190 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 23.25, + "grad_norm_var": 0.3634765625, + "learning_rate": 1.1413757749211602e-05, + "loss": 6.8682, + "loss/crossentropy": 2.138897517323494, + "loss/hidden": 3.0640625, + "loss/jsd": 0.0, + "loss/logits": 0.1604563297703862, + "step": 29200 + }, + { + "epoch": 0.9736666666666667, + "grad_norm": 23.375, + "grad_norm_var": 0.4837890625, + "learning_rate": 1.1378815191836679e-05, + "loss": 6.8282, + "loss/crossentropy": 2.014554353058338, + "loss/hidden": 3.26796875, + "loss/jsd": 0.0, + "loss/logits": 0.16188111137598754, + "step": 29210 + }, + { + "epoch": 0.974, + "grad_norm": 22.0, + "grad_norm_var": 2894.9393229166667, + "learning_rate": 1.1344303157945242e-05, + "loss": 6.9728, + "loss/crossentropy": 2.0439544051885603, + "loss/hidden": 3.188671875, + "loss/jsd": 0.0, + "loss/logits": 0.1644747108221054, + "step": 29220 + }, + { + "epoch": 0.9743333333333334, + "grad_norm": 21.625, + "grad_norm_var": 2892.2497395833334, + "learning_rate": 1.1310221988157106e-05, + "loss": 6.8651, + "loss/crossentropy": 2.073014111816883, + "loss/hidden": 3.30546875, + "loss/jsd": 0.0, + "loss/logits": 0.1810118304565549, + "step": 29230 + }, + { + "epoch": 0.9746666666666667, + "grad_norm": 23.25, + "grad_norm_var": 0.915625, + "learning_rate": 1.1276572018839673e-05, + "loss": 6.9769, + "loss/crossentropy": 2.0511143311858175, + "loss/hidden": 3.195703125, + "loss/jsd": 0.0, + "loss/logits": 0.15696678645908832, + "step": 29240 + }, + { + "epoch": 0.975, + "grad_norm": 22.75, + "grad_norm_var": 0.38014322916666665, + "learning_rate": 1.1243353582104556e-05, + "loss": 7.0323, + "loss/crossentropy": 2.084176428616047, + "loss/hidden": 3.24765625, + "loss/jsd": 0.0, + "loss/logits": 0.17536051329225302, + "step": 29250 + }, + { + "epoch": 0.9753333333333334, + "grad_norm": 25.5, + "grad_norm_var": 1.859375, + "learning_rate": 1.1210567005804302e-05, + "loss": 6.8833, + "loss/crossentropy": 2.0693555802106856, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.1687312951311469, + "step": 29260 + }, + { + "epoch": 0.9756666666666667, + "grad_norm": 22.375, + "grad_norm_var": 2.265625, + "learning_rate": 1.1178212613529202e-05, + "loss": 6.7959, + "loss/crossentropy": 2.032899996638298, + "loss/hidden": 3.148828125, + "loss/jsd": 0.0, + "loss/logits": 0.15500539531931282, + "step": 29270 + }, + { + "epoch": 0.976, + "grad_norm": 22.5, + "grad_norm_var": 3.958268229166667, + "learning_rate": 1.1146290724604024e-05, + "loss": 6.9032, + "loss/crossentropy": 2.0004256799817086, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.16235917941667138, + "step": 29280 + }, + { + "epoch": 0.9763333333333334, + "grad_norm": 21.875, + "grad_norm_var": 3.7122395833333335, + "learning_rate": 1.1114801654084949e-05, + "loss": 6.8216, + "loss/crossentropy": 1.98628860861063, + "loss/hidden": 3.196875, + "loss/jsd": 0.0, + "loss/logits": 0.16457067504525186, + "step": 29290 + }, + { + "epoch": 0.9766666666666667, + "grad_norm": 23.0, + "grad_norm_var": 0.884375, + "learning_rate": 1.1083745712756367e-05, + "loss": 6.9611, + "loss/crossentropy": 2.2105645328760146, + "loss/hidden": 3.204296875, + "loss/jsd": 0.0, + "loss/logits": 0.17289726454764603, + "step": 29300 + }, + { + "epoch": 0.977, + "grad_norm": 23.375, + "grad_norm_var": 0.46041666666666664, + "learning_rate": 1.1053123207127896e-05, + "loss": 6.9226, + "loss/crossentropy": 2.0838935345411302, + "loss/hidden": 3.16015625, + "loss/jsd": 0.0, + "loss/logits": 0.15632101874798537, + "step": 29310 + }, + { + "epoch": 0.9773333333333334, + "grad_norm": 24.25, + "grad_norm_var": 0.7247395833333333, + "learning_rate": 1.1022934439431295e-05, + "loss": 6.8949, + "loss/crossentropy": 1.9940695136785507, + "loss/hidden": 3.185546875, + "loss/jsd": 0.0, + "loss/logits": 0.1685311601497233, + "step": 29320 + }, + { + "epoch": 0.9776666666666667, + "grad_norm": 22.25, + "grad_norm_var": 0.5684895833333333, + "learning_rate": 1.0993179707617519e-05, + "loss": 6.8931, + "loss/crossentropy": 2.195150835812092, + "loss/hidden": 3.161328125, + "loss/jsd": 0.0, + "loss/logits": 0.16006924994289876, + "step": 29330 + }, + { + "epoch": 0.978, + "grad_norm": 22.375, + "grad_norm_var": 0.4952473958333333, + "learning_rate": 1.0963859305353758e-05, + "loss": 6.954, + "loss/crossentropy": 2.0411001086235045, + "loss/hidden": 3.287890625, + "loss/jsd": 0.0, + "loss/logits": 0.17475655488669872, + "step": 29340 + }, + { + "epoch": 0.9783333333333334, + "grad_norm": 23.375, + "grad_norm_var": 6.3931640625, + "learning_rate": 1.0934973522020538e-05, + "loss": 6.9526, + "loss/crossentropy": 2.057637444138527, + "loss/hidden": 3.314453125, + "loss/jsd": 0.0, + "loss/logits": 0.1770678885281086, + "step": 29350 + }, + { + "epoch": 0.9786666666666667, + "grad_norm": 21.125, + "grad_norm_var": 2.4233723958333333, + "learning_rate": 1.0906522642708893e-05, + "loss": 6.897, + "loss/crossentropy": 2.0793089002370833, + "loss/hidden": 3.180859375, + "loss/jsd": 0.0, + "loss/logits": 0.16001901477575303, + "step": 29360 + }, + { + "epoch": 0.979, + "grad_norm": 23.375, + "grad_norm_var": 2.24765625, + "learning_rate": 1.0878506948217503e-05, + "loss": 6.9443, + "loss/crossentropy": 2.04323640614748, + "loss/hidden": 3.2484375, + "loss/jsd": 0.0, + "loss/logits": 0.16657722741365433, + "step": 29370 + }, + { + "epoch": 0.9793333333333333, + "grad_norm": 23.0, + "grad_norm_var": 0.4520182291666667, + "learning_rate": 1.0850926715049972e-05, + "loss": 6.8765, + "loss/crossentropy": 1.9868069365620613, + "loss/hidden": 3.137109375, + "loss/jsd": 0.0, + "loss/logits": 0.1584441527724266, + "step": 29380 + }, + { + "epoch": 0.9796666666666667, + "grad_norm": 22.25, + "grad_norm_var": 0.7582682291666667, + "learning_rate": 1.0823782215412054e-05, + "loss": 6.9489, + "loss/crossentropy": 1.9985451444983482, + "loss/hidden": 3.283984375, + "loss/jsd": 0.0, + "loss/logits": 0.15724884811788797, + "step": 29390 + }, + { + "epoch": 0.98, + "grad_norm": 22.5, + "grad_norm_var": 0.6583333333333333, + "learning_rate": 1.0797073717209014e-05, + "loss": 6.8264, + "loss/crossentropy": 2.087764638662338, + "loss/hidden": 3.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.1608564306050539, + "step": 29400 + }, + { + "epoch": 0.9803333333333333, + "grad_norm": 23.75, + "grad_norm_var": 1.7789922062113964e+18, + "learning_rate": 1.0770801484042939e-05, + "loss": 6.8378, + "loss/crossentropy": 1.9793974101543426, + "loss/hidden": 3.416015625, + "loss/jsd": 0.0, + "loss/logits": 0.1507678169757128, + "step": 29410 + }, + { + "epoch": 0.9806666666666667, + "grad_norm": 23.5, + "grad_norm_var": 0.8333333333333334, + "learning_rate": 1.0744965775210168e-05, + "loss": 6.9311, + "loss/crossentropy": 2.0206541672348974, + "loss/hidden": 3.29609375, + "loss/jsd": 0.0, + "loss/logits": 0.18002614229917527, + "step": 29420 + }, + { + "epoch": 0.981, + "grad_norm": 21.625, + "grad_norm_var": 1.0957682291666666, + "learning_rate": 1.0719566845698715e-05, + "loss": 6.9871, + "loss/crossentropy": 1.9728245690464974, + "loss/hidden": 3.217578125, + "loss/jsd": 0.0, + "loss/logits": 0.16548432894051074, + "step": 29430 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 28.5, + "grad_norm_var": 2.74375, + "learning_rate": 1.0694604946185762e-05, + "loss": 6.9848, + "loss/crossentropy": 2.042478208243847, + "loss/hidden": 3.17890625, + "loss/jsd": 0.0, + "loss/logits": 0.17351799383759497, + "step": 29440 + }, + { + "epoch": 0.9816666666666667, + "grad_norm": 23.75, + "grad_norm_var": 2.24140625, + "learning_rate": 1.0670080323035176e-05, + "loss": 6.9812, + "loss/crossentropy": 2.0702026799321174, + "loss/hidden": 3.1515625, + "loss/jsd": 0.0, + "loss/logits": 0.15049307681620122, + "step": 29450 + }, + { + "epoch": 0.982, + "grad_norm": 23.125, + "grad_norm_var": 0.77265625, + "learning_rate": 1.0645993218295088e-05, + "loss": 6.8362, + "loss/crossentropy": 1.9153663486242294, + "loss/hidden": 3.1890625, + "loss/jsd": 0.0, + "loss/logits": 0.16654033735394477, + "step": 29460 + }, + { + "epoch": 0.9823333333333333, + "grad_norm": 24.125, + "grad_norm_var": 0.9309895833333334, + "learning_rate": 1.0622343869695508e-05, + "loss": 6.8102, + "loss/crossentropy": 1.9985662505030632, + "loss/hidden": 3.265234375, + "loss/jsd": 0.0, + "loss/logits": 0.16704850597307086, + "step": 29470 + }, + { + "epoch": 0.9826666666666667, + "grad_norm": 23.25, + "grad_norm_var": 1.1309895833333334, + "learning_rate": 1.0599132510645939e-05, + "loss": 6.8195, + "loss/crossentropy": 2.0518441289663314, + "loss/hidden": 3.111328125, + "loss/jsd": 0.0, + "loss/logits": 0.14787574112415314, + "step": 29480 + }, + { + "epoch": 0.983, + "grad_norm": 25.625, + "grad_norm_var": 2.0879557291666666, + "learning_rate": 1.057635937023314e-05, + "loss": 6.9664, + "loss/crossentropy": 2.0693943217396735, + "loss/hidden": 3.16015625, + "loss/jsd": 0.0, + "loss/logits": 0.15590767320245505, + "step": 29490 + }, + { + "epoch": 0.9833333333333333, + "grad_norm": 23.875, + "grad_norm_var": 0.9830729166666666, + "learning_rate": 1.0554024673218807e-05, + "loss": 6.9185, + "loss/crossentropy": 1.8844229593873023, + "loss/hidden": 3.323046875, + "loss/jsd": 0.0, + "loss/logits": 0.17540524620562792, + "step": 29500 + }, + { + "epoch": 0.9836666666666667, + "grad_norm": 23.25, + "grad_norm_var": 1.0747395833333333, + "learning_rate": 1.053212864003738e-05, + "loss": 6.869, + "loss/crossentropy": 1.8106282196938992, + "loss/hidden": 3.278515625, + "loss/jsd": 0.0, + "loss/logits": 0.15563625153154134, + "step": 29510 + }, + { + "epoch": 0.984, + "grad_norm": 22.5, + "grad_norm_var": 0.5796223958333333, + "learning_rate": 1.0510671486793873e-05, + "loss": 6.9721, + "loss/crossentropy": 2.120162781327963, + "loss/hidden": 3.1421875, + "loss/jsd": 0.0, + "loss/logits": 0.15654509966261684, + "step": 29520 + }, + { + "epoch": 0.9843333333333333, + "grad_norm": 24.875, + "grad_norm_var": 1.2080729166666666, + "learning_rate": 1.0489653425261721e-05, + "loss": 6.9396, + "loss/crossentropy": 2.093233019858599, + "loss/hidden": 3.149609375, + "loss/jsd": 0.0, + "loss/logits": 0.15795425418764353, + "step": 29530 + }, + { + "epoch": 0.9846666666666667, + "grad_norm": 22.25, + "grad_norm_var": 1.6087890625, + "learning_rate": 1.046907466288071e-05, + "loss": 6.7796, + "loss/crossentropy": 2.213361156731844, + "loss/hidden": 3.19765625, + "loss/jsd": 0.0, + "loss/logits": 0.16810417706146835, + "step": 29540 + }, + { + "epoch": 0.985, + "grad_norm": 22.125, + "grad_norm_var": 1.1510416666666667, + "learning_rate": 1.0448935402754912e-05, + "loss": 6.8736, + "loss/crossentropy": 2.0324639290571214, + "loss/hidden": 3.151171875, + "loss/jsd": 0.0, + "loss/logits": 0.16171343475580216, + "step": 29550 + }, + { + "epoch": 0.9853333333333333, + "grad_norm": 38.25, + "grad_norm_var": 33.201822916666664, + "learning_rate": 1.0429235843650698e-05, + "loss": 6.9544, + "loss/crossentropy": 2.1379873633384703, + "loss/hidden": 3.222265625, + "loss/jsd": 0.0, + "loss/logits": 0.16771480459719895, + "step": 29560 + }, + { + "epoch": 0.9856666666666667, + "grad_norm": 22.0, + "grad_norm_var": 33.7134765625, + "learning_rate": 1.0409976179994762e-05, + "loss": 6.8317, + "loss/crossentropy": 1.8709135249257087, + "loss/hidden": 3.3171875, + "loss/jsd": 0.0, + "loss/logits": 0.15978736570104957, + "step": 29570 + }, + { + "epoch": 0.986, + "grad_norm": 23.625, + "grad_norm_var": 0.9660807291666667, + "learning_rate": 1.039115660187221e-05, + "loss": 6.8523, + "loss/crossentropy": 1.9445186778903008, + "loss/hidden": 3.201953125, + "loss/jsd": 0.0, + "loss/logits": 0.15582914650440216, + "step": 29580 + }, + { + "epoch": 0.9863333333333333, + "grad_norm": 25.5, + "grad_norm_var": 2.6393229166666665, + "learning_rate": 1.0372777295024676e-05, + "loss": 6.8136, + "loss/crossentropy": 1.7506938025355339, + "loss/hidden": 3.274609375, + "loss/jsd": 0.0, + "loss/logits": 0.16162756085395813, + "step": 29590 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 22.375, + "grad_norm_var": 3.03125, + "learning_rate": 1.0354838440848503e-05, + "loss": 6.848, + "loss/crossentropy": 2.032399223744869, + "loss/hidden": 3.1625, + "loss/jsd": 0.0, + "loss/logits": 0.16586268395185472, + "step": 29600 + }, + { + "epoch": 0.987, + "grad_norm": 22.625, + "grad_norm_var": 0.3900390625, + "learning_rate": 1.0337340216392933e-05, + "loss": 6.8192, + "loss/crossentropy": 1.934443362057209, + "loss/hidden": 3.3578125, + "loss/jsd": 0.0, + "loss/logits": 0.180020921677351, + "step": 29610 + }, + { + "epoch": 0.9873333333333333, + "grad_norm": 25.25, + "grad_norm_var": 1.2455729166666667, + "learning_rate": 1.032028279435839e-05, + "loss": 6.9186, + "loss/crossentropy": 2.2124100014567376, + "loss/hidden": 3.185546875, + "loss/jsd": 0.0, + "loss/logits": 0.18372708857059478, + "step": 29620 + }, + { + "epoch": 0.9876666666666667, + "grad_norm": 22.5, + "grad_norm_var": 1.2145833333333333, + "learning_rate": 1.030366634309473e-05, + "loss": 6.8804, + "loss/crossentropy": 2.0187501519918443, + "loss/hidden": 3.20703125, + "loss/jsd": 0.0, + "loss/logits": 0.17060858262702822, + "step": 29630 + }, + { + "epoch": 0.988, + "grad_norm": 23.125, + "grad_norm_var": 1.0729166666666667, + "learning_rate": 1.0287491026599623e-05, + "loss": 6.892, + "loss/crossentropy": 2.02512718886137, + "loss/hidden": 3.1984375, + "loss/jsd": 0.0, + "loss/logits": 0.17388947010040284, + "step": 29640 + }, + { + "epoch": 0.9883333333333333, + "grad_norm": 21.125, + "grad_norm_var": 1.2705729166666666, + "learning_rate": 1.0271757004516918e-05, + "loss": 6.8015, + "loss/crossentropy": 2.1398928314447403, + "loss/hidden": 3.289453125, + "loss/jsd": 0.0, + "loss/logits": 0.1810132971033454, + "step": 29650 + }, + { + "epoch": 0.9886666666666667, + "grad_norm": 23.875, + "grad_norm_var": 2.3364583333333333, + "learning_rate": 1.0256464432135048e-05, + "loss": 7.0092, + "loss/crossentropy": 2.1282053992152212, + "loss/hidden": 3.175390625, + "loss/jsd": 0.0, + "loss/logits": 0.16205706167966127, + "step": 29660 + }, + { + "epoch": 0.989, + "grad_norm": 25.75, + "grad_norm_var": 1.3434895833333333, + "learning_rate": 1.0241613460385547e-05, + "loss": 6.9536, + "loss/crossentropy": 2.082467722892761, + "loss/hidden": 3.23515625, + "loss/jsd": 0.0, + "loss/logits": 0.17200557347387074, + "step": 29670 + }, + { + "epoch": 0.9893333333333333, + "grad_norm": 22.75, + "grad_norm_var": 1.3639973958333333, + "learning_rate": 1.0227204235841493e-05, + "loss": 7.0005, + "loss/crossentropy": 2.0898186802864074, + "loss/hidden": 3.2125, + "loss/jsd": 0.0, + "loss/logits": 0.18023983463644982, + "step": 29680 + }, + { + "epoch": 0.9896666666666667, + "grad_norm": 23.0, + "grad_norm_var": 1.6541015625, + "learning_rate": 1.0213236900716126e-05, + "loss": 6.853, + "loss/crossentropy": 2.187703275680542, + "loss/hidden": 3.0921875, + "loss/jsd": 0.0, + "loss/logits": 0.16371893137693405, + "step": 29690 + }, + { + "epoch": 0.99, + "grad_norm": 22.875, + "grad_norm_var": 0.7145182291666666, + "learning_rate": 1.01997115928614e-05, + "loss": 6.8488, + "loss/crossentropy": 2.0195549950003624, + "loss/hidden": 3.162109375, + "loss/jsd": 0.0, + "loss/logits": 0.16155789233744144, + "step": 29700 + }, + { + "epoch": 0.9903333333333333, + "grad_norm": 24.5, + "grad_norm_var": 0.5082682291666667, + "learning_rate": 1.0186628445766647e-05, + "loss": 6.8828, + "loss/crossentropy": 2.007741495221853, + "loss/hidden": 3.296484375, + "loss/jsd": 0.0, + "loss/logits": 0.17947645513340832, + "step": 29710 + }, + { + "epoch": 0.9906666666666667, + "grad_norm": 22.25, + "grad_norm_var": 0.8624348958333333, + "learning_rate": 1.0173987588557237e-05, + "loss": 6.9354, + "loss/crossentropy": 2.135732203722, + "loss/hidden": 3.221875, + "loss/jsd": 0.0, + "loss/logits": 0.16441609486937522, + "step": 29720 + }, + { + "epoch": 0.991, + "grad_norm": 22.75, + "grad_norm_var": 0.9004557291666667, + "learning_rate": 1.0161789145993343e-05, + "loss": 6.9097, + "loss/crossentropy": 1.8655649699270724, + "loss/hidden": 3.1390625, + "loss/jsd": 0.0, + "loss/logits": 0.15107152182608843, + "step": 29730 + }, + { + "epoch": 0.9913333333333333, + "grad_norm": 22.0, + "grad_norm_var": 1.1739583333333334, + "learning_rate": 1.0150033238468656e-05, + "loss": 6.9158, + "loss/crossentropy": 1.901015117764473, + "loss/hidden": 3.223828125, + "loss/jsd": 0.0, + "loss/logits": 0.1663993639871478, + "step": 29740 + }, + { + "epoch": 0.9916666666666667, + "grad_norm": 23.75, + "grad_norm_var": 0.9447916666666667, + "learning_rate": 1.0138719982009242e-05, + "loss": 6.8547, + "loss/crossentropy": 2.1790758818387985, + "loss/hidden": 3.03671875, + "loss/jsd": 0.0, + "loss/logits": 0.15662378910928965, + "step": 29750 + }, + { + "epoch": 0.992, + "grad_norm": 23.875, + "grad_norm_var": 0.5333333333333333, + "learning_rate": 1.0127849488272375e-05, + "loss": 6.9303, + "loss/crossentropy": 1.960058543086052, + "loss/hidden": 3.29765625, + "loss/jsd": 0.0, + "loss/logits": 0.16203988939523697, + "step": 29760 + }, + { + "epoch": 0.9923333333333333, + "grad_norm": 23.25, + "grad_norm_var": 1.4504557291666667, + "learning_rate": 1.0117421864545435e-05, + "loss": 6.9141, + "loss/crossentropy": 2.013607097789645, + "loss/hidden": 3.1875, + "loss/jsd": 0.0, + "loss/logits": 0.1687733193859458, + "step": 29770 + }, + { + "epoch": 0.9926666666666667, + "grad_norm": 25.0, + "grad_norm_var": 0.8330729166666667, + "learning_rate": 1.0107437213744867e-05, + "loss": 6.8548, + "loss/crossentropy": 1.9799937024712562, + "loss/hidden": 3.115234375, + "loss/jsd": 0.0, + "loss/logits": 0.1619122765958309, + "step": 29780 + }, + { + "epoch": 0.993, + "grad_norm": 22.625, + "grad_norm_var": 0.8077473958333333, + "learning_rate": 1.0097895634415135e-05, + "loss": 6.8267, + "loss/crossentropy": 2.14156903848052, + "loss/hidden": 3.17109375, + "loss/jsd": 0.0, + "loss/logits": 0.16840279754251242, + "step": 29790 + }, + { + "epoch": 0.9933333333333333, + "grad_norm": 24.25, + "grad_norm_var": 2.7143229166666667, + "learning_rate": 1.008879722072778e-05, + "loss": 6.9818, + "loss/crossentropy": 2.0121023267507554, + "loss/hidden": 3.331640625, + "loss/jsd": 0.0, + "loss/logits": 0.17791486158967018, + "step": 29800 + }, + { + "epoch": 0.9936666666666667, + "grad_norm": 23.375, + "grad_norm_var": 1.2275390625, + "learning_rate": 1.008014206248047e-05, + "loss": 6.9668, + "loss/crossentropy": 2.126334875077009, + "loss/hidden": 3.2703125, + "loss/jsd": 0.0, + "loss/logits": 0.17075007781386375, + "step": 29810 + }, + { + "epoch": 0.994, + "grad_norm": 22.5, + "grad_norm_var": 0.6520182291666666, + "learning_rate": 1.0071930245096125e-05, + "loss": 6.9318, + "loss/crossentropy": 2.3043752014636993, + "loss/hidden": 3.238671875, + "loss/jsd": 0.0, + "loss/logits": 0.17486888822168112, + "step": 29820 + }, + { + "epoch": 0.9943333333333333, + "grad_norm": 23.25, + "grad_norm_var": 0.6082682291666667, + "learning_rate": 1.0064161849622065e-05, + "loss": 6.8321, + "loss/crossentropy": 2.0536348327994345, + "loss/hidden": 3.244921875, + "loss/jsd": 0.0, + "loss/logits": 0.16649605836719275, + "step": 29830 + }, + { + "epoch": 0.9946666666666667, + "grad_norm": 22.125, + "grad_norm_var": 1.2979166666666666, + "learning_rate": 1.0056836952729215e-05, + "loss": 6.9919, + "loss/crossentropy": 1.991570144891739, + "loss/hidden": 3.15234375, + "loss/jsd": 0.0, + "loss/logits": 0.19409504476934672, + "step": 29840 + }, + { + "epoch": 0.995, + "grad_norm": 23.625, + "grad_norm_var": 1.1014973958333334, + "learning_rate": 1.0049955626711355e-05, + "loss": 6.9519, + "loss/crossentropy": 2.031588687002659, + "loss/hidden": 3.28984375, + "loss/jsd": 0.0, + "loss/logits": 0.1794443614780903, + "step": 29850 + }, + { + "epoch": 0.9953333333333333, + "grad_norm": 23.5, + "grad_norm_var": 0.8025390625, + "learning_rate": 1.004351793948439e-05, + "loss": 6.8755, + "loss/crossentropy": 2.027684749662876, + "loss/hidden": 3.226953125, + "loss/jsd": 0.0, + "loss/logits": 0.17099270056933163, + "step": 29860 + }, + { + "epoch": 0.9956666666666667, + "grad_norm": 23.75, + "grad_norm_var": 0.9811848958333333, + "learning_rate": 1.0037523954585697e-05, + "loss": 6.8869, + "loss/crossentropy": 1.9910648241639137, + "loss/hidden": 3.235546875, + "loss/jsd": 0.0, + "loss/logits": 0.156367249134928, + "step": 29870 + }, + { + "epoch": 0.996, + "grad_norm": 23.25, + "grad_norm_var": 1.2125, + "learning_rate": 1.0031973731173486e-05, + "loss": 6.716, + "loss/crossentropy": 1.9186164811253548, + "loss/hidden": 3.153125, + "loss/jsd": 0.0, + "loss/logits": 0.15821984894573687, + "step": 29880 + }, + { + "epoch": 0.9963333333333333, + "grad_norm": 24.625, + "grad_norm_var": 0.98125, + "learning_rate": 1.002686732402622e-05, + "loss": 6.9111, + "loss/crossentropy": 2.014424833655357, + "loss/hidden": 3.20234375, + "loss/jsd": 0.0, + "loss/logits": 0.16852001175284387, + "step": 29890 + }, + { + "epoch": 0.9966666666666667, + "grad_norm": 24.125, + "grad_norm_var": 1.0712890625, + "learning_rate": 1.002220478354208e-05, + "loss": 6.9157, + "loss/crossentropy": 2.054695198684931, + "loss/hidden": 3.137109375, + "loss/jsd": 0.0, + "loss/logits": 0.15729085877537727, + "step": 29900 + }, + { + "epoch": 0.997, + "grad_norm": 23.0, + "grad_norm_var": 0.97265625, + "learning_rate": 1.0017986155738457e-05, + "loss": 6.8971, + "loss/crossentropy": 1.8903593212366103, + "loss/hidden": 3.093359375, + "loss/jsd": 0.0, + "loss/logits": 0.14642833340913058, + "step": 29910 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 22.75, + "grad_norm_var": 1.9572916666666667, + "learning_rate": 1.0014211482251503e-05, + "loss": 7.0028, + "loss/crossentropy": 2.0199310213327406, + "loss/hidden": 3.270703125, + "loss/jsd": 0.0, + "loss/logits": 0.17333223409950732, + "step": 29920 + }, + { + "epoch": 0.9976666666666667, + "grad_norm": 27.5, + "grad_norm_var": 2.383072916666667, + "learning_rate": 1.0010880800335719e-05, + "loss": 6.9141, + "loss/crossentropy": 2.015287238359451, + "loss/hidden": 3.228515625, + "loss/jsd": 0.0, + "loss/logits": 0.17535847648978234, + "step": 29930 + }, + { + "epoch": 0.998, + "grad_norm": 23.375, + "grad_norm_var": 2.2910807291666666, + "learning_rate": 1.0007994142863597e-05, + "loss": 6.8943, + "loss/crossentropy": 2.1094014227390288, + "loss/hidden": 3.202734375, + "loss/jsd": 0.0, + "loss/logits": 0.16332617327570914, + "step": 29940 + }, + { + "epoch": 0.9983333333333333, + "grad_norm": 21.75, + "grad_norm_var": 1.0893229166666667, + "learning_rate": 1.0005551538325275e-05, + "loss": 6.7901, + "loss/crossentropy": 1.9851688370108604, + "loss/hidden": 3.212890625, + "loss/jsd": 0.0, + "loss/logits": 0.1485843539237976, + "step": 29950 + }, + { + "epoch": 0.9986666666666667, + "grad_norm": 24.25, + "grad_norm_var": 0.675, + "learning_rate": 1.0003553010828276e-05, + "loss": 6.8926, + "loss/crossentropy": 2.0727066323161125, + "loss/hidden": 3.213671875, + "loss/jsd": 0.0, + "loss/logits": 0.16851818263530732, + "step": 29960 + }, + { + "epoch": 0.999, + "grad_norm": 23.5, + "grad_norm_var": 0.8747395833333333, + "learning_rate": 1.000199858009726e-05, + "loss": 6.8944, + "loss/crossentropy": 2.1489027053117753, + "loss/hidden": 3.341015625, + "loss/jsd": 0.0, + "loss/logits": 0.1805833499878645, + "step": 29970 + }, + { + "epoch": 0.9993333333333333, + "grad_norm": 22.625, + "grad_norm_var": 0.4759765625, + "learning_rate": 1.0000888261473831e-05, + "loss": 6.957, + "loss/crossentropy": 2.0706566661596297, + "loss/hidden": 3.34453125, + "loss/jsd": 0.0, + "loss/logits": 0.18294469746761025, + "step": 29980 + }, + { + "epoch": 0.9996666666666667, + "grad_norm": 23.5, + "grad_norm_var": 2.99140625, + "learning_rate": 1.0000222065916382e-05, + "loss": 6.9522, + "loss/crossentropy": 2.0081850692629812, + "loss/hidden": 3.33984375, + "loss/jsd": 0.0, + "loss/logits": 0.177884781640023, + "step": 29990 + }, + { + "epoch": 1.0, + "grad_norm": 21.125, + "grad_norm_var": 1.1768229166666666, + "learning_rate": 1e-05, + "loss": 6.8942, + "loss/crossentropy": 2.0051390439271928, + "loss/hidden": 3.146875, + "loss/jsd": 0.0, + "loss/logits": 0.15861098784953356, + "step": 30000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.57253009602642e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}