| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.2, |
| "eval_steps": 2000, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.001, |
| "grad_norm": 4736.0, |
| "learning_rate": 1.9e-05, |
| "loss": 132.1055, |
| "loss/crossentropy": 12.246079635620116, |
| "loss/hidden": 18.7125, |
| "loss/jsd": 0.0, |
| "loss/logits": 10.372939014434815, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 330.0, |
| "grad_norm_var": 91640269.18333334, |
| "learning_rate": 2.8000000000000003e-05, |
| "loss": 95.9731, |
| "loss/crossentropy": 8.862393474578857, |
| "loss/hidden": 18.675, |
| "loss/jsd": 0.0, |
| "loss/logits": 6.677179157733917, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.003, |
| "grad_norm": 394.0, |
| "grad_norm_var": 237715.45, |
| "learning_rate": 3.7e-05, |
| "loss": 86.3778, |
| "loss/crossentropy": 8.083840227127075, |
| "loss/hidden": 18.259375, |
| "loss/jsd": 0.0, |
| "loss/logits": 6.130921971797943, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.004, |
| "grad_norm": 924.0, |
| "grad_norm_var": 2.6757682503402172e+16, |
| "learning_rate": 4.600000000000001e-05, |
| "loss": 82.5914, |
| "loss/crossentropy": 7.802511918544769, |
| "loss/hidden": 17.440625, |
| "loss/jsd": 0.0, |
| "loss/logits": 5.772503018379211, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 516.0, |
| "grad_norm_var": 38597.583333333336, |
| "learning_rate": 5.500000000000001e-05, |
| "loss": 75.3397, |
| "loss/crossentropy": 7.156700026988983, |
| "loss/hidden": 17.253125, |
| "loss/jsd": 0.0, |
| "loss/logits": 5.156575608253479, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.006, |
| "grad_norm": 1232.0, |
| "grad_norm_var": 68241.45, |
| "learning_rate": 6.400000000000001e-05, |
| "loss": 61.2745, |
| "loss/crossentropy": 6.0138510942459105, |
| "loss/hidden": 15.80625, |
| "loss/jsd": 0.0, |
| "loss/logits": 3.8037488579750063, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.007, |
| "grad_norm": 376.0, |
| "grad_norm_var": 626103.4, |
| "learning_rate": 7.3e-05, |
| "loss": 41.3695, |
| "loss/crossentropy": 4.422797441482544, |
| "loss/hidden": 13.1125, |
| "loss/jsd": 0.0, |
| "loss/logits": 2.4006322652101515, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 272.0, |
| "grad_norm_var": 674923.45, |
| "learning_rate": 8.200000000000001e-05, |
| "loss": 27.4755, |
| "loss/crossentropy": 3.3576226443052293, |
| "loss/hidden": 10.7359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 1.3968962401151657, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.009, |
| "grad_norm": 296.0, |
| "grad_norm_var": 15426.383333333333, |
| "learning_rate": 9.1e-05, |
| "loss": 22.6607, |
| "loss/crossentropy": 3.217679074406624, |
| "loss/hidden": 9.2140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 1.055714099109173, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 328.0, |
| "grad_norm_var": 9349.666666666666, |
| "learning_rate": 0.0001, |
| "loss": 20.3108, |
| "loss/crossentropy": 2.934060016274452, |
| "loss/hidden": 8.40703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.8702833190560341, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.011, |
| "grad_norm": 194.0, |
| "grad_norm_var": 5992.866666666667, |
| "learning_rate": 0.0001, |
| "loss": 18.8852, |
| "loss/crossentropy": 2.8450062334537507, |
| "loss/hidden": 8.221875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.8380498677492142, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 244.0, |
| "grad_norm_var": 1176.5333333333333, |
| "learning_rate": 0.0001, |
| "loss": 17.97, |
| "loss/crossentropy": 2.612249107658863, |
| "loss/hidden": 7.578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.686215291172266, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.013, |
| "grad_norm": 242.0, |
| "grad_norm_var": 1168.8958333333333, |
| "learning_rate": 0.0001, |
| "loss": 17.2904, |
| "loss/crossentropy": 2.8242316216230394, |
| "loss/hidden": 7.7390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.7805894792079926, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.014, |
| "grad_norm": 179.0, |
| "grad_norm_var": 1465.1333333333334, |
| "learning_rate": 0.0001, |
| "loss": 16.5581, |
| "loss/crossentropy": 2.737143725156784, |
| "loss/hidden": 7.3421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.6888546235859394, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 175.0, |
| "grad_norm_var": 1119.8625, |
| "learning_rate": 0.0001, |
| "loss": 16.0501, |
| "loss/crossentropy": 2.7599751561880113, |
| "loss/hidden": 7.05703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.6640767879784107, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 186.0, |
| "grad_norm_var": 1044.5166666666667, |
| "learning_rate": 0.0001, |
| "loss": 15.4631, |
| "loss/crossentropy": 2.6100075274705885, |
| "loss/hidden": 6.8203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.5824844464659691, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.017, |
| "grad_norm": 179.0, |
| "grad_norm_var": 1082.8, |
| "learning_rate": 0.0001, |
| "loss": 15.2201, |
| "loss/crossentropy": 2.4276285111904143, |
| "loss/hidden": 6.8203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.5915141828358174, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.018, |
| "grad_norm": 153.0, |
| "grad_norm_var": 622.6625, |
| "learning_rate": 0.0001, |
| "loss": 14.9606, |
| "loss/crossentropy": 2.630460512638092, |
| "loss/hidden": 6.52578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.5396774187684059, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.019, |
| "grad_norm": 176.0, |
| "grad_norm_var": 1093.2, |
| "learning_rate": 0.0001, |
| "loss": 14.6255, |
| "loss/crossentropy": 2.3158223152160646, |
| "loss/hidden": 6.50390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4905257746577263, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 112.0, |
| "grad_norm_var": 695.7291666666666, |
| "learning_rate": 0.0001, |
| "loss": 14.3647, |
| "loss/crossentropy": 2.586851382255554, |
| "loss/hidden": 6.42265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.5586091712117195, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.021, |
| "grad_norm": 118.5, |
| "grad_norm_var": 574.3072916666666, |
| "learning_rate": 0.0001, |
| "loss": 14.0867, |
| "loss/crossentropy": 2.5010055124759676, |
| "loss/hidden": 6.34453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4965482771396637, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.022, |
| "grad_norm": 88.5, |
| "grad_norm_var": 662.65, |
| "learning_rate": 0.0001, |
| "loss": 13.6551, |
| "loss/crossentropy": 2.573444625735283, |
| "loss/hidden": 6.33125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.5534068010747433, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.023, |
| "grad_norm": 118.0, |
| "grad_norm_var": 412.1958333333333, |
| "learning_rate": 0.0001, |
| "loss": 13.4715, |
| "loss/crossentropy": 2.4142292886972427, |
| "loss/hidden": 5.96640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.44360905699431896, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 134.0, |
| "grad_norm_var": 242.9, |
| "learning_rate": 0.0001, |
| "loss": 13.3289, |
| "loss/crossentropy": 2.4670142769813537, |
| "loss/hidden": 5.98671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.47392544001340864, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 137.0, |
| "grad_norm_var": 158.4625, |
| "learning_rate": 0.0001, |
| "loss": 13.0031, |
| "loss/crossentropy": 2.416000656783581, |
| "loss/hidden": 5.7859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.44607544504106045, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.026, |
| "grad_norm": 109.0, |
| "grad_norm_var": 279.990625, |
| "learning_rate": 0.0001, |
| "loss": 13.0076, |
| "loss/crossentropy": 2.370332670211792, |
| "loss/hidden": 5.9984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.5006627842783928, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.027, |
| "grad_norm": 129.0, |
| "grad_norm_var": 427.37395833333335, |
| "learning_rate": 0.0001, |
| "loss": 12.8809, |
| "loss/crossentropy": 2.281908763945103, |
| "loss/hidden": 5.98671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.45061586182564495, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.028, |
| "grad_norm": 98.0, |
| "grad_norm_var": 278.1489583333333, |
| "learning_rate": 0.0001, |
| "loss": 12.8942, |
| "loss/crossentropy": 2.3922384053468706, |
| "loss/hidden": 5.6984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.44376694336533545, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.029, |
| "grad_norm": 99.5, |
| "grad_norm_var": 303.55, |
| "learning_rate": 0.0001, |
| "loss": 12.7122, |
| "loss/crossentropy": 2.730095013976097, |
| "loss/hidden": 5.49140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4411045670509338, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 112.5, |
| "grad_norm_var": 359.56666666666666, |
| "learning_rate": 0.0001, |
| "loss": 12.5618, |
| "loss/crossentropy": 2.3741705983877184, |
| "loss/hidden": 5.43203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.40091707594692705, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.031, |
| "grad_norm": 84.5, |
| "grad_norm_var": 245.25729166666667, |
| "learning_rate": 0.0001, |
| "loss": 12.2525, |
| "loss/crossentropy": 2.2781229317188263, |
| "loss/hidden": 5.53515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4274128321558237, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 108.5, |
| "grad_norm_var": 140.59583333333333, |
| "learning_rate": 0.0001, |
| "loss": 12.2935, |
| "loss/crossentropy": 2.5757294684648513, |
| "loss/hidden": 5.4609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.42916890494525434, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.033, |
| "grad_norm": 108.0, |
| "grad_norm_var": 70.89895833333334, |
| "learning_rate": 0.0001, |
| "loss": 12.1545, |
| "loss/crossentropy": 2.527638339996338, |
| "loss/hidden": 5.378125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4032053742557764, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.034, |
| "grad_norm": 210.0, |
| "grad_norm_var": 1272.465625, |
| "learning_rate": 0.0001, |
| "loss": 12.2482, |
| "loss/crossentropy": 2.5401821002364158, |
| "loss/hidden": 5.390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4444709587842226, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 79.5, |
| "grad_norm_var": 1376.5958333333333, |
| "learning_rate": 0.0001, |
| "loss": 12.08, |
| "loss/crossentropy": 2.514840933680534, |
| "loss/hidden": 5.2640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4077944982796907, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.036, |
| "grad_norm": 87.0, |
| "grad_norm_var": 418.83229166666666, |
| "learning_rate": 0.0001, |
| "loss": 12.0245, |
| "loss/crossentropy": 2.420889538526535, |
| "loss/hidden": 5.34921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.44222328886389733, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.037, |
| "grad_norm": 76.5, |
| "grad_norm_var": 138.5625, |
| "learning_rate": 0.0001, |
| "loss": 11.7097, |
| "loss/crossentropy": 2.2826619133353234, |
| "loss/hidden": 5.3296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3849468305706978, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.038, |
| "grad_norm": 96.5, |
| "grad_norm_var": 184.93229166666666, |
| "learning_rate": 0.0001, |
| "loss": 11.465, |
| "loss/crossentropy": 2.4052042722702027, |
| "loss/hidden": 5.16796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.40173302926123144, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.039, |
| "grad_norm": 125.5, |
| "grad_norm_var": 183.09583333333333, |
| "learning_rate": 0.0001, |
| "loss": 11.6273, |
| "loss/crossentropy": 2.540145033597946, |
| "loss/hidden": 5.215625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.41224894523620603, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 83.5, |
| "grad_norm_var": 258.315625, |
| "learning_rate": 0.0001, |
| "loss": 11.397, |
| "loss/crossentropy": 2.207468980550766, |
| "loss/hidden": 5.09296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3590874429792166, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.041, |
| "grad_norm": 94.5, |
| "grad_norm_var": 184.5625, |
| "learning_rate": 0.0001, |
| "loss": 11.443, |
| "loss/crossentropy": 2.4378984421491623, |
| "loss/hidden": 5.21171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.40493359677493573, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.042, |
| "grad_norm": 106.5, |
| "grad_norm_var": 125.590625, |
| "learning_rate": 0.0001, |
| "loss": 11.5678, |
| "loss/crossentropy": 2.518555220961571, |
| "loss/hidden": 5.07265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.4297170080244541, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.043, |
| "grad_norm": 87.5, |
| "grad_norm_var": 115.765625, |
| "learning_rate": 0.0001, |
| "loss": 11.3132, |
| "loss/crossentropy": 2.490597203373909, |
| "loss/hidden": 5.11171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.403754598274827, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.044, |
| "grad_norm": 92.5, |
| "grad_norm_var": 156.35729166666667, |
| "learning_rate": 0.0001, |
| "loss": 11.1476, |
| "loss/crossentropy": 2.037529316544533, |
| "loss/hidden": 5.07421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.35246654506772757, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 80.5, |
| "grad_norm_var": 210.66666666666666, |
| "learning_rate": 0.0001, |
| "loss": 11.3038, |
| "loss/crossentropy": 2.3201738983392715, |
| "loss/hidden": 5.0828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.38196625709533694, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.046, |
| "grad_norm": 107.5, |
| "grad_norm_var": 284.1666666666667, |
| "learning_rate": 0.0001, |
| "loss": 11.3625, |
| "loss/crossentropy": 2.4791718110442162, |
| "loss/hidden": 4.95546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.36495909169316293, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.047, |
| "grad_norm": 91.5, |
| "grad_norm_var": 247.39895833333333, |
| "learning_rate": 0.0001, |
| "loss": 11.0542, |
| "loss/crossentropy": 2.3155667960643767, |
| "loss/hidden": 4.93828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.362844867631793, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 95.0, |
| "grad_norm_var": 194.79895833333333, |
| "learning_rate": 0.0001, |
| "loss": 11.2413, |
| "loss/crossentropy": 2.496318203210831, |
| "loss/hidden": 4.840625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3887303464114666, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.049, |
| "grad_norm": 74.5, |
| "grad_norm_var": 243.840625, |
| "learning_rate": 0.0001, |
| "loss": 10.9416, |
| "loss/crossentropy": 2.385223904252052, |
| "loss/hidden": 4.85234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3598880790174007, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 79.0, |
| "grad_norm_var": 105.990625, |
| "learning_rate": 0.0001, |
| "loss": 10.9114, |
| "loss/crossentropy": 2.2462552055716514, |
| "loss/hidden": 4.80859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3265662036836147, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.051, |
| "grad_norm": 96.5, |
| "grad_norm_var": 138.43229166666666, |
| "learning_rate": 0.0001, |
| "loss": 10.8821, |
| "loss/crossentropy": 2.297148121893406, |
| "loss/hidden": 4.8609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3467547960579395, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.052, |
| "grad_norm": 97.5, |
| "grad_norm_var": 129.365625, |
| "learning_rate": 0.0001, |
| "loss": 10.9299, |
| "loss/crossentropy": 2.4197026968002318, |
| "loss/hidden": 4.7921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3632193084806204, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.053, |
| "grad_norm": 81.5, |
| "grad_norm_var": 99.47395833333333, |
| "learning_rate": 0.0001, |
| "loss": 10.787, |
| "loss/crossentropy": 2.36982424557209, |
| "loss/hidden": 4.825, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3405680742114782, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.054, |
| "grad_norm": 85.5, |
| "grad_norm_var": 48.340625, |
| "learning_rate": 0.0001, |
| "loss": 10.8675, |
| "loss/crossentropy": 2.4611779801547526, |
| "loss/hidden": 4.8625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.36872007288038733, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 93.5, |
| "grad_norm_var": 84.24895833333333, |
| "learning_rate": 0.0001, |
| "loss": 10.64, |
| "loss/crossentropy": 2.1758567959070207, |
| "loss/hidden": 4.7484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3336840860545635, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.056, |
| "grad_norm": 114.0, |
| "grad_norm_var": 129.53098958333334, |
| "learning_rate": 0.0001, |
| "loss": 10.5615, |
| "loss/crossentropy": 2.3970536097884176, |
| "loss/hidden": 4.7625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.34276723079383375, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.057, |
| "grad_norm": 80.0, |
| "grad_norm_var": 579.57890625, |
| "learning_rate": 0.0001, |
| "loss": 10.8999, |
| "loss/crossentropy": 2.4695185527205465, |
| "loss/hidden": 4.9453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.42829814068973066, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.058, |
| "grad_norm": 85.0, |
| "grad_norm_var": 596.9572916666667, |
| "learning_rate": 0.0001, |
| "loss": 10.8802, |
| "loss/crossentropy": 2.3520184576511385, |
| "loss/hidden": 4.790625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3662864986807108, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.059, |
| "grad_norm": 73.0, |
| "grad_norm_var": 181.69583333333333, |
| "learning_rate": 0.0001, |
| "loss": 10.6744, |
| "loss/crossentropy": 2.2842736929655074, |
| "loss/hidden": 4.71484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3500846643000841, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 97.0, |
| "grad_norm_var": 160.58307291666668, |
| "learning_rate": 0.0001, |
| "loss": 10.6987, |
| "loss/crossentropy": 2.29906165599823, |
| "loss/hidden": 4.602734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.334361494705081, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.061, |
| "grad_norm": 89.0, |
| "grad_norm_var": 162.67682291666668, |
| "learning_rate": 0.0001, |
| "loss": 10.6143, |
| "loss/crossentropy": 2.3032930195331573, |
| "loss/hidden": 4.6703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3258141163736582, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.062, |
| "grad_norm": 77.5, |
| "grad_norm_var": 97.12916666666666, |
| "learning_rate": 0.0001, |
| "loss": 10.5946, |
| "loss/crossentropy": 2.452244046330452, |
| "loss/hidden": 4.7109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3432691916823387, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.063, |
| "grad_norm": 75.5, |
| "grad_norm_var": 227.69973958333333, |
| "learning_rate": 0.0001, |
| "loss": 10.6287, |
| "loss/crossentropy": 2.2894835874438284, |
| "loss/hidden": 4.74609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.35672005768865345, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.064, |
| "grad_norm": 70.0, |
| "grad_norm_var": 541.2322916666667, |
| "learning_rate": 0.0001, |
| "loss": 10.6195, |
| "loss/crossentropy": 2.4114772886037827, |
| "loss/hidden": 4.70546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.35591375902295114, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.065, |
| "grad_norm": 77.0, |
| "grad_norm_var": 435.15390625, |
| "learning_rate": 0.0001, |
| "loss": 10.4142, |
| "loss/crossentropy": 2.332440134882927, |
| "loss/hidden": 4.634375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.339809150993824, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.066, |
| "grad_norm": 71.5, |
| "grad_norm_var": 118.03307291666667, |
| "learning_rate": 0.0001, |
| "loss": 10.4602, |
| "loss/crossentropy": 2.154422373324633, |
| "loss/hidden": 4.54140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3334257358685136, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.067, |
| "grad_norm": 73.5, |
| "grad_norm_var": 144.94166666666666, |
| "learning_rate": 0.0001, |
| "loss": 10.5185, |
| "loss/crossentropy": 2.3223402693867685, |
| "loss/hidden": 4.795703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.37188967503607273, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.068, |
| "grad_norm": 61.5, |
| "grad_norm_var": 169.65598958333334, |
| "learning_rate": 0.0001, |
| "loss": 10.5323, |
| "loss/crossentropy": 2.332353001832962, |
| "loss/hidden": 4.50625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.31948004066944125, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.069, |
| "grad_norm": 74.0, |
| "grad_norm_var": 155.94140625, |
| "learning_rate": 0.0001, |
| "loss": 10.4359, |
| "loss/crossentropy": 2.4077556908130644, |
| "loss/hidden": 4.623828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.339173823595047, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 82.5, |
| "grad_norm_var": 125.55416666666666, |
| "learning_rate": 0.0001, |
| "loss": 10.4493, |
| "loss/crossentropy": 2.292634981870651, |
| "loss/hidden": 4.571875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3477486100047827, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.071, |
| "grad_norm": 88.0, |
| "grad_norm_var": 155.84166666666667, |
| "learning_rate": 0.0001, |
| "loss": 10.2041, |
| "loss/crossentropy": 2.4034020826220512, |
| "loss/hidden": 4.53046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3406600248068571, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.072, |
| "grad_norm": 124.0, |
| "grad_norm_var": 230.83307291666668, |
| "learning_rate": 0.0001, |
| "loss": 10.3489, |
| "loss/crossentropy": 2.333241228759289, |
| "loss/hidden": 4.6015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3285223826766014, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.073, |
| "grad_norm": 71.0, |
| "grad_norm_var": 278.95390625, |
| "learning_rate": 0.0001, |
| "loss": 10.1548, |
| "loss/crossentropy": 2.4066421508789064, |
| "loss/hidden": 4.682421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.338771004602313, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.074, |
| "grad_norm": 84.5, |
| "grad_norm_var": 166.85729166666667, |
| "learning_rate": 0.0001, |
| "loss": 10.2647, |
| "loss/crossentropy": 2.2724754482507707, |
| "loss/hidden": 4.567578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3267147310078144, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.075, |
| "grad_norm": 67.5, |
| "grad_norm_var": 343.5247395833333, |
| "learning_rate": 0.0001, |
| "loss": 10.2815, |
| "loss/crossentropy": 2.3046080738306047, |
| "loss/hidden": 4.473828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.33236319161951544, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.076, |
| "grad_norm": 68.5, |
| "grad_norm_var": 306.540625, |
| "learning_rate": 0.0001, |
| "loss": 10.2479, |
| "loss/crossentropy": 2.2831736013293265, |
| "loss/hidden": 4.62734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3329113606363535, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.077, |
| "grad_norm": 88.5, |
| "grad_norm_var": 111.57473958333334, |
| "learning_rate": 0.0001, |
| "loss": 10.2161, |
| "loss/crossentropy": 2.3853780582547186, |
| "loss/hidden": 4.541015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.31959532871842383, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.078, |
| "grad_norm": 80.5, |
| "grad_norm_var": 110.65729166666667, |
| "learning_rate": 0.0001, |
| "loss": 10.2076, |
| "loss/crossentropy": 2.3982744574546815, |
| "loss/hidden": 4.55859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3542841043323278, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.079, |
| "grad_norm": 66.0, |
| "grad_norm_var": 275.6322916666667, |
| "learning_rate": 0.0001, |
| "loss": 10.1697, |
| "loss/crossentropy": 2.4292824655771255, |
| "loss/hidden": 4.632421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.36711033545434474, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 57.25, |
| "grad_norm_var": 290.9291666666667, |
| "learning_rate": 0.0001, |
| "loss": 10.2176, |
| "loss/crossentropy": 2.380542576313019, |
| "loss/hidden": 4.509375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3368827097117901, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.081, |
| "grad_norm": 60.75, |
| "grad_norm_var": 52.67916666666667, |
| "learning_rate": 0.0001, |
| "loss": 10.2311, |
| "loss/crossentropy": 2.4212940514087675, |
| "loss/hidden": 4.55546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.355662290379405, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.082, |
| "grad_norm": 60.75, |
| "grad_norm_var": 65.81666666666666, |
| "learning_rate": 0.0001, |
| "loss": 10.1866, |
| "loss/crossentropy": 2.4809795886278154, |
| "loss/hidden": 4.49140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3553234666585922, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.083, |
| "grad_norm": 56.5, |
| "grad_norm_var": 98.2875, |
| "learning_rate": 0.0001, |
| "loss": 9.9805, |
| "loss/crossentropy": 2.306653854250908, |
| "loss/hidden": 4.40078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3146494958549738, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.084, |
| "grad_norm": 68.0, |
| "grad_norm_var": 38.51015625, |
| "learning_rate": 0.0001, |
| "loss": 10.1087, |
| "loss/crossentropy": 2.250006601214409, |
| "loss/hidden": 4.422265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.30200174674391744, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.085, |
| "grad_norm": 70.5, |
| "grad_norm_var": 43.483072916666664, |
| "learning_rate": 0.0001, |
| "loss": 10.0526, |
| "loss/crossentropy": 2.211633677780628, |
| "loss/hidden": 4.47421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.31178686060011385, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.086, |
| "grad_norm": 61.0, |
| "grad_norm_var": 41.545572916666664, |
| "learning_rate": 0.0001, |
| "loss": 10.1915, |
| "loss/crossentropy": 2.5281356513500213, |
| "loss/hidden": 4.389453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.34625968635082244, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.087, |
| "grad_norm": 72.5, |
| "grad_norm_var": 54.475, |
| "learning_rate": 0.0001, |
| "loss": 10.0007, |
| "loss/crossentropy": 2.4020907685160635, |
| "loss/hidden": 4.326171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.32252500094473363, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.088, |
| "grad_norm": 142.0, |
| "grad_norm_var": 499.1375, |
| "learning_rate": 0.0001, |
| "loss": 9.99, |
| "loss/crossentropy": 2.384984764456749, |
| "loss/hidden": 4.38515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3189360786229372, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.089, |
| "grad_norm": 57.75, |
| "grad_norm_var": 527.23515625, |
| "learning_rate": 0.0001, |
| "loss": 9.9879, |
| "loss/crossentropy": 2.3401281625032424, |
| "loss/hidden": 4.46328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3382201848551631, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 71.5, |
| "grad_norm_var": 95.97265625, |
| "learning_rate": 0.0001, |
| "loss": 9.9352, |
| "loss/crossentropy": 2.3969784706830977, |
| "loss/hidden": 4.384375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.336395762488246, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.091, |
| "grad_norm": 79.0, |
| "grad_norm_var": 144.96666666666667, |
| "learning_rate": 0.0001, |
| "loss": 10.149, |
| "loss/crossentropy": 2.4599110893905163, |
| "loss/hidden": 4.30703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3240171395242214, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.092, |
| "grad_norm": 65.5, |
| "grad_norm_var": 119.2375, |
| "learning_rate": 0.0001, |
| "loss": 9.9634, |
| "loss/crossentropy": 2.4210876494646074, |
| "loss/hidden": 4.30390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.32166178375482557, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.093, |
| "grad_norm": 63.0, |
| "grad_norm_var": 41.47083333333333, |
| "learning_rate": 0.0001, |
| "loss": 9.744, |
| "loss/crossentropy": 2.2256636448204516, |
| "loss/hidden": 4.284765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.29795306362211704, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.094, |
| "grad_norm": 53.5, |
| "grad_norm_var": 192.55807291666667, |
| "learning_rate": 0.0001, |
| "loss": 9.8636, |
| "loss/crossentropy": 2.297808923572302, |
| "loss/hidden": 4.31640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.30742434673011304, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.095, |
| "grad_norm": 61.0, |
| "grad_norm_var": 81.95729166666666, |
| "learning_rate": 0.0001, |
| "loss": 9.798, |
| "loss/crossentropy": 2.3219059616327287, |
| "loss/hidden": 4.211328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.30037002861499784, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.096, |
| "grad_norm": 56.75, |
| "grad_norm_var": 61.55807291666667, |
| "learning_rate": 0.0001, |
| "loss": 9.7449, |
| "loss/crossentropy": 2.3104363679885864, |
| "loss/hidden": 4.388671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.327311984449625, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.097, |
| "grad_norm": 60.0, |
| "grad_norm_var": 56.18932291666667, |
| "learning_rate": 0.0001, |
| "loss": 9.9668, |
| "loss/crossentropy": 2.308886554837227, |
| "loss/hidden": 4.407421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3183224782347679, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.098, |
| "grad_norm": 66.5, |
| "grad_norm_var": 42.05416666666667, |
| "learning_rate": 0.0001, |
| "loss": 9.7807, |
| "loss/crossentropy": 2.3363482102751734, |
| "loss/hidden": 4.2921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3384779039770365, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.099, |
| "grad_norm": 57.25, |
| "grad_norm_var": 56.891666666666666, |
| "learning_rate": 0.0001, |
| "loss": 9.7501, |
| "loss/crossentropy": 2.1767295479774473, |
| "loss/hidden": 4.466015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.31410733237862587, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 50.25, |
| "grad_norm_var": 75.85598958333334, |
| "learning_rate": 0.0001, |
| "loss": 9.9273, |
| "loss/crossentropy": 2.505411845445633, |
| "loss/hidden": 4.36015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.33212706074118614, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.101, |
| "grad_norm": 77.5, |
| "grad_norm_var": 196.21848958333334, |
| "learning_rate": 0.0001, |
| "loss": 9.9237, |
| "loss/crossentropy": 2.3281257838010787, |
| "loss/hidden": 4.35546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.32293859515339135, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.102, |
| "grad_norm": 63.25, |
| "grad_norm_var": 167.42395833333333, |
| "learning_rate": 0.0001, |
| "loss": 9.7592, |
| "loss/crossentropy": 2.3165650010108947, |
| "loss/hidden": 4.32890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.31759811006486416, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.103, |
| "grad_norm": 60.0, |
| "grad_norm_var": 153.80833333333334, |
| "learning_rate": 0.0001, |
| "loss": 9.7366, |
| "loss/crossentropy": 2.3203016728162766, |
| "loss/hidden": 4.28515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.31944827549159527, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.104, |
| "grad_norm": 66.5, |
| "grad_norm_var": 3319.3958333333335, |
| "learning_rate": 0.0001, |
| "loss": 10.0035, |
| "loss/crossentropy": 2.4188640087842943, |
| "loss/hidden": 4.38828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3581279247999191, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.105, |
| "grad_norm": 60.25, |
| "grad_norm_var": 3338.31640625, |
| "learning_rate": 0.0001, |
| "loss": 9.6837, |
| "loss/crossentropy": 2.2860016629099844, |
| "loss/hidden": 4.325390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.318701284006238, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.106, |
| "grad_norm": 74.0, |
| "grad_norm_var": 112.4, |
| "learning_rate": 0.0001, |
| "loss": 9.517, |
| "loss/crossentropy": 2.4143033266067504, |
| "loss/hidden": 4.319140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3072842717170715, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.107, |
| "grad_norm": 71.5, |
| "grad_norm_var": 68.60598958333334, |
| "learning_rate": 0.0001, |
| "loss": 9.8549, |
| "loss/crossentropy": 2.351083371043205, |
| "loss/hidden": 4.398046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.33429058492183683, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.108, |
| "grad_norm": 53.25, |
| "grad_norm_var": 43.83229166666667, |
| "learning_rate": 0.0001, |
| "loss": 9.7738, |
| "loss/crossentropy": 2.4011227190494537, |
| "loss/hidden": 4.29453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3128178097307682, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.109, |
| "grad_norm": 72.0, |
| "grad_norm_var": 34.82890625, |
| "learning_rate": 0.0001, |
| "loss": 9.7432, |
| "loss/crossentropy": 2.310031126439571, |
| "loss/hidden": 4.38984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3273486144840717, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 66.5, |
| "grad_norm_var": 111.64895833333334, |
| "learning_rate": 0.0001, |
| "loss": 9.6743, |
| "loss/crossentropy": 2.3055127263069153, |
| "loss/hidden": 4.21796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.32233874313533306, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.111, |
| "grad_norm": 51.5, |
| "grad_norm_var": 46.70729166666667, |
| "learning_rate": 0.0001, |
| "loss": 9.8026, |
| "loss/crossentropy": 2.314373381435871, |
| "loss/hidden": 4.256640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3083756107836962, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.112, |
| "grad_norm": 57.75, |
| "grad_norm_var": 7292.4375, |
| "learning_rate": 0.0001, |
| "loss": 9.7291, |
| "loss/crossentropy": 2.5138203650712967, |
| "loss/hidden": 4.19921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.30809955932199956, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.113, |
| "grad_norm": 56.5, |
| "grad_norm_var": 29.190625, |
| "learning_rate": 0.0001, |
| "loss": 9.6823, |
| "loss/crossentropy": 2.2719234466552733, |
| "loss/hidden": 4.294140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3143883816897869, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.114, |
| "grad_norm": 60.5, |
| "grad_norm_var": 45.925, |
| "learning_rate": 0.0001, |
| "loss": 9.7564, |
| "loss/crossentropy": 2.4254489660263063, |
| "loss/hidden": 4.261328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3154076419770718, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.115, |
| "grad_norm": 56.0, |
| "grad_norm_var": 71.74583333333334, |
| "learning_rate": 0.0001, |
| "loss": 9.7001, |
| "loss/crossentropy": 2.28252642005682, |
| "loss/hidden": 4.323046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3203336976468563, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.116, |
| "grad_norm": 67.0, |
| "grad_norm_var": 46.040625, |
| "learning_rate": 0.0001, |
| "loss": 9.7436, |
| "loss/crossentropy": 2.391976150870323, |
| "loss/hidden": 4.225390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.31455044373869895, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.117, |
| "grad_norm": 46.0, |
| "grad_norm_var": 47.06640625, |
| "learning_rate": 0.0001, |
| "loss": 9.5622, |
| "loss/crossentropy": 2.3361207604408265, |
| "loss/hidden": 4.19296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.30060703232884406, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.118, |
| "grad_norm": 56.25, |
| "grad_norm_var": 49.264322916666664, |
| "learning_rate": 0.0001, |
| "loss": 9.6834, |
| "loss/crossentropy": 2.297483670711517, |
| "loss/hidden": 4.2890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2907493541017175, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.119, |
| "grad_norm": 52.5, |
| "grad_norm_var": 12.27890625, |
| "learning_rate": 0.0001, |
| "loss": 9.6207, |
| "loss/crossentropy": 2.2364058643579483, |
| "loss/hidden": 4.277734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3097097765654325, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 68.5, |
| "grad_norm_var": 35.055989583333336, |
| "learning_rate": 0.0001, |
| "loss": 9.6018, |
| "loss/crossentropy": 2.2412969201803206, |
| "loss/hidden": 4.287109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.31851550191640854, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.121, |
| "grad_norm": 60.5, |
| "grad_norm_var": 25.774739583333332, |
| "learning_rate": 0.0001, |
| "loss": 9.6979, |
| "loss/crossentropy": 2.3062032952904703, |
| "loss/hidden": 4.258984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3124631106853485, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.122, |
| "grad_norm": 59.25, |
| "grad_norm_var": 20.026822916666667, |
| "learning_rate": 0.0001, |
| "loss": 9.7129, |
| "loss/crossentropy": 2.4036868065595627, |
| "loss/hidden": 4.20234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.31101155243813994, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.123, |
| "grad_norm": 53.25, |
| "grad_norm_var": 75.30833333333334, |
| "learning_rate": 0.0001, |
| "loss": 9.7047, |
| "loss/crossentropy": 2.3730016142129897, |
| "loss/hidden": 4.193359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3105484452098608, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.124, |
| "grad_norm": 62.25, |
| "grad_norm_var": 33.27682291666667, |
| "learning_rate": 0.0001, |
| "loss": 9.6313, |
| "loss/crossentropy": 2.2872567594051363, |
| "loss/hidden": 4.319140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3244694545865059, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 61.25, |
| "grad_norm_var": 25.673958333333335, |
| "learning_rate": 0.0001, |
| "loss": 9.6217, |
| "loss/crossentropy": 2.3013710603117943, |
| "loss/hidden": 4.29140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.32178852558135984, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.126, |
| "grad_norm": 49.5, |
| "grad_norm_var": 46.97265625, |
| "learning_rate": 0.0001, |
| "loss": 9.6151, |
| "loss/crossentropy": 2.2743802405893803, |
| "loss/hidden": 4.19921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3104738780297339, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.127, |
| "grad_norm": 52.25, |
| "grad_norm_var": 318.94557291666666, |
| "learning_rate": 0.0001, |
| "loss": 9.635, |
| "loss/crossentropy": 2.2751111879944803, |
| "loss/hidden": 4.17578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2947248375043273, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.128, |
| "grad_norm": 59.5, |
| "grad_norm_var": 201.26848958333332, |
| "learning_rate": 0.0001, |
| "loss": 9.605, |
| "loss/crossentropy": 2.3590754181146623, |
| "loss/hidden": 4.116015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.29773430675268175, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.129, |
| "grad_norm": 50.0, |
| "grad_norm_var": 25.795833333333334, |
| "learning_rate": 0.0001, |
| "loss": 9.4314, |
| "loss/crossentropy": 2.165515697002411, |
| "loss/hidden": 4.148046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2729496695101261, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 51.5, |
| "grad_norm_var": 65.69557291666666, |
| "learning_rate": 0.0001, |
| "loss": 9.4579, |
| "loss/crossentropy": 2.425456903874874, |
| "loss/hidden": 4.1140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3002984166145325, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.131, |
| "grad_norm": 55.75, |
| "grad_norm_var": 74.63515625, |
| "learning_rate": 0.0001, |
| "loss": 9.562, |
| "loss/crossentropy": 2.3212677478790282, |
| "loss/hidden": 4.209765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.28645528480410576, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.132, |
| "grad_norm": 44.75, |
| "grad_norm_var": 39.139322916666664, |
| "learning_rate": 0.0001, |
| "loss": 9.305, |
| "loss/crossentropy": 2.2911602184176445, |
| "loss/hidden": 4.133984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.28404638059437276, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.133, |
| "grad_norm": 52.25, |
| "grad_norm_var": 76.19583333333334, |
| "learning_rate": 0.0001, |
| "loss": 9.3122, |
| "loss/crossentropy": 2.3109163105487824, |
| "loss/hidden": 4.137109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2864396806806326, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.134, |
| "grad_norm": 47.0, |
| "grad_norm_var": 41.66015625, |
| "learning_rate": 0.0001, |
| "loss": 9.4629, |
| "loss/crossentropy": 2.353537403047085, |
| "loss/hidden": 4.08203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2971150416880846, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.135, |
| "grad_norm": 46.25, |
| "grad_norm_var": 45.31848958333333, |
| "learning_rate": 0.0001, |
| "loss": 9.365, |
| "loss/crossentropy": 2.3774181246757506, |
| "loss/hidden": 4.09296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2799839396029711, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.136, |
| "grad_norm": 51.0, |
| "grad_norm_var": 17.93515625, |
| "learning_rate": 0.0001, |
| "loss": 9.3498, |
| "loss/crossentropy": 2.246833881735802, |
| "loss/hidden": 4.18828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2903384942561388, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.137, |
| "grad_norm": 51.25, |
| "grad_norm_var": 12.420833333333333, |
| "learning_rate": 0.0001, |
| "loss": 9.4976, |
| "loss/crossentropy": 2.453240838646889, |
| "loss/hidden": 4.173828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3144164770841599, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.138, |
| "grad_norm": 70.0, |
| "grad_norm_var": 2011.0322916666667, |
| "learning_rate": 0.0001, |
| "loss": 9.5884, |
| "loss/crossentropy": 2.174116183817387, |
| "loss/hidden": 4.24921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2923248626291752, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.139, |
| "grad_norm": 53.75, |
| "grad_norm_var": 1988.8833333333334, |
| "learning_rate": 0.0001, |
| "loss": 9.5249, |
| "loss/crossentropy": 2.3638354018330574, |
| "loss/hidden": 4.184765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3066251628100872, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 55.5, |
| "grad_norm_var": 22.779166666666665, |
| "learning_rate": 0.0001, |
| "loss": 9.3528, |
| "loss/crossentropy": 2.4166768550872804, |
| "loss/hidden": 4.123828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.29636494982987643, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.141, |
| "grad_norm": 60.5, |
| "grad_norm_var": 66.59348958333334, |
| "learning_rate": 0.0001, |
| "loss": 9.5339, |
| "loss/crossentropy": 2.3475931867957116, |
| "loss/hidden": 4.1953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.30608872696757317, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.142, |
| "grad_norm": 51.25, |
| "grad_norm_var": 62.49140625, |
| "learning_rate": 0.0001, |
| "loss": 9.3342, |
| "loss/crossentropy": 2.1785849004983904, |
| "loss/hidden": 4.161328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.27641028352081776, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.143, |
| "grad_norm": 54.25, |
| "grad_norm_var": 30.154166666666665, |
| "learning_rate": 0.0001, |
| "loss": 9.3898, |
| "loss/crossentropy": 2.3990818440914152, |
| "loss/hidden": 4.18359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2944341886788607, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.144, |
| "grad_norm": 51.75, |
| "grad_norm_var": 38.93932291666667, |
| "learning_rate": 0.0001, |
| "loss": 9.4628, |
| "loss/crossentropy": 2.4946817860007284, |
| "loss/hidden": 4.198828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.31867978498339655, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.145, |
| "grad_norm": 53.75, |
| "grad_norm_var": 33.9, |
| "learning_rate": 0.0001, |
| "loss": 9.3416, |
| "loss/crossentropy": 2.2067521095275877, |
| "loss/hidden": 4.235546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2976540043950081, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.146, |
| "grad_norm": 60.75, |
| "grad_norm_var": 142.08229166666666, |
| "learning_rate": 0.0001, |
| "loss": 9.4716, |
| "loss/crossentropy": 2.4361192852258684, |
| "loss/hidden": 4.1140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2877715673297644, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.147, |
| "grad_norm": 58.75, |
| "grad_norm_var": 44.35, |
| "learning_rate": 0.0001, |
| "loss": 9.4006, |
| "loss/crossentropy": 2.239429622516036, |
| "loss/hidden": 4.026171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.27844256814569235, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.148, |
| "grad_norm": 45.0, |
| "grad_norm_var": 33.95390625, |
| "learning_rate": 0.0001, |
| "loss": 9.3993, |
| "loss/crossentropy": 2.0759536787867545, |
| "loss/hidden": 4.068359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2688772227615118, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.149, |
| "grad_norm": 51.75, |
| "grad_norm_var": 25.795833333333334, |
| "learning_rate": 0.0001, |
| "loss": 9.3786, |
| "loss/crossentropy": 2.286362998187542, |
| "loss/hidden": 4.15078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2942257083952427, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 46.75, |
| "grad_norm_var": 20.520833333333332, |
| "learning_rate": 0.0001, |
| "loss": 9.2903, |
| "loss/crossentropy": 2.312733788788319, |
| "loss/hidden": 3.971484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2691910218447447, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.151, |
| "grad_norm": 50.25, |
| "grad_norm_var": 28.290625, |
| "learning_rate": 0.0001, |
| "loss": 9.3076, |
| "loss/crossentropy": 2.2467628076672552, |
| "loss/hidden": 4.105078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2887777745723724, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.152, |
| "grad_norm": 63.5, |
| "grad_norm_var": 33.73098958333333, |
| "learning_rate": 0.0001, |
| "loss": 9.4203, |
| "loss/crossentropy": 2.372379180788994, |
| "loss/hidden": 4.07578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3087839350104332, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.153, |
| "grad_norm": 45.5, |
| "grad_norm_var": 40.108333333333334, |
| "learning_rate": 0.0001, |
| "loss": 9.3215, |
| "loss/crossentropy": 2.3452367037534714, |
| "loss/hidden": 4.210546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3159611392766237, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.154, |
| "grad_norm": 58.25, |
| "grad_norm_var": 27.539322916666666, |
| "learning_rate": 0.0001, |
| "loss": 9.3755, |
| "loss/crossentropy": 2.3029753446578978, |
| "loss/hidden": 3.999609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2623455457389355, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.155, |
| "grad_norm": 51.75, |
| "grad_norm_var": 26.9, |
| "learning_rate": 0.0001, |
| "loss": 9.3578, |
| "loss/crossentropy": 2.3988554388284684, |
| "loss/hidden": 4.08828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2846154376864433, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.156, |
| "grad_norm": 91.0, |
| "grad_norm_var": 1307.8372395833333, |
| "learning_rate": 0.0001, |
| "loss": 9.432, |
| "loss/crossentropy": 2.343544365465641, |
| "loss/hidden": 4.021875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2907770898193121, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.157, |
| "grad_norm": 52.0, |
| "grad_norm_var": 170.62890625, |
| "learning_rate": 0.0001, |
| "loss": 9.3432, |
| "loss/crossentropy": 2.173108433187008, |
| "loss/hidden": 4.10859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.28518917988985776, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.158, |
| "grad_norm": 42.0, |
| "grad_norm_var": 47.56015625, |
| "learning_rate": 0.0001, |
| "loss": 9.367, |
| "loss/crossentropy": 2.2230691239237785, |
| "loss/hidden": 4.23515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.30039387457072736, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.159, |
| "grad_norm": 72.0, |
| "grad_norm_var": 1.226104970407838e+18, |
| "learning_rate": 0.0001, |
| "loss": 9.3564, |
| "loss/crossentropy": 2.263391149044037, |
| "loss/hidden": 4.10625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2923804897814989, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 52.5, |
| "grad_norm_var": 1.2261049681378806e+18, |
| "learning_rate": 0.0001, |
| "loss": 9.4959, |
| "loss/crossentropy": 2.113241518288851, |
| "loss/hidden": 4.087109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2759646028280258, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.161, |
| "grad_norm": 66.0, |
| "grad_norm_var": 734.1489583333333, |
| "learning_rate": 0.0001, |
| "loss": 9.4743, |
| "loss/crossentropy": 2.3895165085792542, |
| "loss/hidden": 4.059765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2987998936325312, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.162, |
| "grad_norm": 44.75, |
| "grad_norm_var": 50.00182291666667, |
| "learning_rate": 0.0001, |
| "loss": 9.1919, |
| "loss/crossentropy": 2.251766300201416, |
| "loss/hidden": 4.04375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2781111396849155, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.163, |
| "grad_norm": 52.75, |
| "grad_norm_var": 437.49583333333334, |
| "learning_rate": 0.0001, |
| "loss": 9.4572, |
| "loss/crossentropy": 2.382322034239769, |
| "loss/hidden": 4.07734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.31318275928497313, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.164, |
| "grad_norm": 61.0, |
| "grad_norm_var": 40.301822916666666, |
| "learning_rate": 0.0001, |
| "loss": 9.2668, |
| "loss/crossentropy": 2.1683703124523164, |
| "loss/hidden": 4.07578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.283413190767169, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.165, |
| "grad_norm": 42.75, |
| "grad_norm_var": 57.307291666666664, |
| "learning_rate": 0.0001, |
| "loss": 9.339, |
| "loss/crossentropy": 2.3430400043725967, |
| "loss/hidden": 4.036328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.287694800645113, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.166, |
| "grad_norm": 46.75, |
| "grad_norm_var": 65.52395833333334, |
| "learning_rate": 0.0001, |
| "loss": 9.3768, |
| "loss/crossentropy": 2.2867416352033616, |
| "loss/hidden": 4.017578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.29683431759476664, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.167, |
| "grad_norm": 52.5, |
| "grad_norm_var": 61.18932291666667, |
| "learning_rate": 0.0001, |
| "loss": 9.2451, |
| "loss/crossentropy": 2.3707614041864873, |
| "loss/hidden": 4.06015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.29184688804671166, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.168, |
| "grad_norm": 51.75, |
| "grad_norm_var": 20.895572916666666, |
| "learning_rate": 0.0001, |
| "loss": 9.3601, |
| "loss/crossentropy": 2.3268392831087112, |
| "loss/hidden": 4.12734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.29570323824882505, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.169, |
| "grad_norm": 44.0, |
| "grad_norm_var": 10.290625, |
| "learning_rate": 0.0001, |
| "loss": 9.4214, |
| "loss/crossentropy": 2.324131193757057, |
| "loss/hidden": 4.1984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3133995305746794, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 58.25, |
| "grad_norm_var": 19.124739583333334, |
| "learning_rate": 0.0001, |
| "loss": 9.2465, |
| "loss/crossentropy": 2.35849623978138, |
| "loss/hidden": 4.00703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2762619823217392, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.171, |
| "grad_norm": 45.75, |
| "grad_norm_var": 53.89895833333333, |
| "learning_rate": 0.0001, |
| "loss": 9.1951, |
| "loss/crossentropy": 2.3914038598537446, |
| "loss/hidden": 3.9984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2871177852153778, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.172, |
| "grad_norm": 43.25, |
| "grad_norm_var": 16.479166666666668, |
| "learning_rate": 0.0001, |
| "loss": 9.1669, |
| "loss/crossentropy": 2.152750685811043, |
| "loss/hidden": 4.100390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.28708020225167274, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.173, |
| "grad_norm": 49.25, |
| "grad_norm_var": 13.45390625, |
| "learning_rate": 0.0001, |
| "loss": 9.1015, |
| "loss/crossentropy": 2.2946193665266037, |
| "loss/hidden": 4.085546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3062314610928297, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.174, |
| "grad_norm": 46.5, |
| "grad_norm_var": 22.473958333333332, |
| "learning_rate": 0.0001, |
| "loss": 9.1287, |
| "loss/crossentropy": 2.1538643553853034, |
| "loss/hidden": 3.9421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2666194221004844, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.175, |
| "grad_norm": 47.0, |
| "grad_norm_var": 32.62057291666667, |
| "learning_rate": 0.0001, |
| "loss": 9.411, |
| "loss/crossentropy": 2.387891933321953, |
| "loss/hidden": 4.14921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.29542505368590355, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.176, |
| "grad_norm": 45.25, |
| "grad_norm_var": 26.92265625, |
| "learning_rate": 0.0001, |
| "loss": 9.2833, |
| "loss/crossentropy": 2.3024097591638566, |
| "loss/hidden": 4.03984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.29066667445003985, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.177, |
| "grad_norm": 53.5, |
| "grad_norm_var": 17.832291666666666, |
| "learning_rate": 0.0001, |
| "loss": 9.2665, |
| "loss/crossentropy": 2.4454205125570296, |
| "loss/hidden": 3.955859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2910691563040018, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.178, |
| "grad_norm": 42.25, |
| "grad_norm_var": 29.865625, |
| "learning_rate": 0.0001, |
| "loss": 9.1701, |
| "loss/crossentropy": 2.2966391056776048, |
| "loss/hidden": 4.027734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2789210833609104, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.179, |
| "grad_norm": 48.25, |
| "grad_norm_var": 17.548958333333335, |
| "learning_rate": 0.0001, |
| "loss": 9.1992, |
| "loss/crossentropy": 2.395502945780754, |
| "loss/hidden": 3.934765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2776679117232561, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 40.75, |
| "grad_norm_var": 13.282291666666667, |
| "learning_rate": 0.0001, |
| "loss": 9.1046, |
| "loss/crossentropy": 2.22285817861557, |
| "loss/hidden": 3.9046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.26667180880904195, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.181, |
| "grad_norm": 36.25, |
| "grad_norm_var": 34.90807291666667, |
| "learning_rate": 0.0001, |
| "loss": 9.3204, |
| "loss/crossentropy": 2.3842350512743, |
| "loss/hidden": 4.009375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.300260554254055, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.182, |
| "grad_norm": 46.75, |
| "grad_norm_var": 27.77890625, |
| "learning_rate": 0.0001, |
| "loss": 9.0943, |
| "loss/crossentropy": 2.274762773513794, |
| "loss/hidden": 4.01328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.28360783979296683, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.183, |
| "grad_norm": 55.5, |
| "grad_norm_var": 27.298958333333335, |
| "learning_rate": 0.0001, |
| "loss": 9.1699, |
| "loss/crossentropy": 2.1643219627439976, |
| "loss/hidden": 3.9921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.267458438500762, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.184, |
| "grad_norm": 49.75, |
| "grad_norm_var": 43.94583333333333, |
| "learning_rate": 0.0001, |
| "loss": 9.3022, |
| "loss/crossentropy": 2.464679929614067, |
| "loss/hidden": 3.9546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.29758369028568266, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.185, |
| "grad_norm": 51.0, |
| "grad_norm_var": 37.90807291666667, |
| "learning_rate": 0.0001, |
| "loss": 9.1863, |
| "loss/crossentropy": 2.3199010998010636, |
| "loss/hidden": 3.99453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.27702242247760295, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.186, |
| "grad_norm": 46.5, |
| "grad_norm_var": 40.920833333333334, |
| "learning_rate": 0.0001, |
| "loss": 9.2872, |
| "loss/crossentropy": 2.4041683062911035, |
| "loss/hidden": 4.09140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.30005627647042277, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.187, |
| "grad_norm": 39.75, |
| "grad_norm_var": 40.723958333333336, |
| "learning_rate": 0.0001, |
| "loss": 9.1081, |
| "loss/crossentropy": 2.273802790045738, |
| "loss/hidden": 4.175, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.3045934235677123, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.188, |
| "grad_norm": 43.25, |
| "grad_norm_var": 33.35729166666667, |
| "learning_rate": 0.0001, |
| "loss": 9.106, |
| "loss/crossentropy": 2.3607766672968866, |
| "loss/hidden": 3.9765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.28193066976964476, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.189, |
| "grad_norm": 48.25, |
| "grad_norm_var": 14.915625, |
| "learning_rate": 0.0001, |
| "loss": 9.1437, |
| "loss/crossentropy": 2.2798361241817475, |
| "loss/hidden": 3.9890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2721746701747179, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 44.0, |
| "grad_norm_var": 21.223958333333332, |
| "learning_rate": 0.0001, |
| "loss": 9.0972, |
| "loss/crossentropy": 2.21695294380188, |
| "loss/hidden": 4.0015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2832322970032692, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.191, |
| "grad_norm": 39.5, |
| "grad_norm_var": 27.808333333333334, |
| "learning_rate": 0.0001, |
| "loss": 9.1587, |
| "loss/crossentropy": 2.1728454776108266, |
| "loss/hidden": 3.98828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.27170457877218723, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.192, |
| "grad_norm": 41.75, |
| "grad_norm_var": 13.315625, |
| "learning_rate": 0.0001, |
| "loss": 9.1326, |
| "loss/crossentropy": 2.154237084835768, |
| "loss/hidden": 4.063671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.27950075305998323, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.193, |
| "grad_norm": 43.0, |
| "grad_norm_var": 25.240625, |
| "learning_rate": 0.0001, |
| "loss": 9.1013, |
| "loss/crossentropy": 2.2507698431611063, |
| "loss/hidden": 4.01171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2808088269084692, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.194, |
| "grad_norm": 49.25, |
| "grad_norm_var": 22.832291666666666, |
| "learning_rate": 0.0001, |
| "loss": 9.2429, |
| "loss/crossentropy": 2.288056728243828, |
| "loss/hidden": 4.1546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.31668607220053674, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.195, |
| "grad_norm": 48.5, |
| "grad_norm_var": 58.09557291666667, |
| "learning_rate": 0.0001, |
| "loss": 9.1742, |
| "loss/crossentropy": 2.2107961744070055, |
| "loss/hidden": 4.05078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2858551822602749, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.196, |
| "grad_norm": 39.25, |
| "grad_norm_var": 49.50390625, |
| "learning_rate": 0.0001, |
| "loss": 9.1293, |
| "loss/crossentropy": 2.224529256671667, |
| "loss/hidden": 3.9703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.27973891496658326, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.197, |
| "grad_norm": 39.25, |
| "grad_norm_var": 13.890625, |
| "learning_rate": 0.0001, |
| "loss": 9.0689, |
| "loss/crossentropy": 2.363737019896507, |
| "loss/hidden": 4.027734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2919711694121361, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.198, |
| "grad_norm": 55.75, |
| "grad_norm_var": 26.655989583333334, |
| "learning_rate": 0.0001, |
| "loss": 9.2228, |
| "loss/crossentropy": 2.3380469545722007, |
| "loss/hidden": 4.0171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.28581551983952524, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.199, |
| "grad_norm": 45.0, |
| "grad_norm_var": 27.357291666666665, |
| "learning_rate": 0.0001, |
| "loss": 9.173, |
| "loss/crossentropy": 2.43135461807251, |
| "loss/hidden": 3.970703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.28222124874591825, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 44.5, |
| "grad_norm_var": 16.04140625, |
| "learning_rate": 0.0001, |
| "loss": 9.1554, |
| "loss/crossentropy": 2.4415812104940415, |
| "loss/hidden": 4.064453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2984179027378559, |
| "step": 2000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.715020064017613e+18, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|