{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2, "eval_steps": 2000, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001, "grad_norm": 4736.0, "learning_rate": 1.9e-05, "loss": 132.1055, "loss/crossentropy": 12.246079635620116, "loss/hidden": 18.7125, "loss/jsd": 0.0, "loss/logits": 10.372939014434815, "step": 10 }, { "epoch": 0.002, "grad_norm": 330.0, "grad_norm_var": 91640269.18333334, "learning_rate": 2.8000000000000003e-05, "loss": 95.9731, "loss/crossentropy": 8.862393474578857, "loss/hidden": 18.675, "loss/jsd": 0.0, "loss/logits": 6.677179157733917, "step": 20 }, { "epoch": 0.003, "grad_norm": 394.0, "grad_norm_var": 237715.45, "learning_rate": 3.7e-05, "loss": 86.3778, "loss/crossentropy": 8.083840227127075, "loss/hidden": 18.259375, "loss/jsd": 0.0, "loss/logits": 6.130921971797943, "step": 30 }, { "epoch": 0.004, "grad_norm": 924.0, "grad_norm_var": 2.6757682503402172e+16, "learning_rate": 4.600000000000001e-05, "loss": 82.5914, "loss/crossentropy": 7.802511918544769, "loss/hidden": 17.440625, "loss/jsd": 0.0, "loss/logits": 5.772503018379211, "step": 40 }, { "epoch": 0.005, "grad_norm": 516.0, "grad_norm_var": 38597.583333333336, "learning_rate": 5.500000000000001e-05, "loss": 75.3397, "loss/crossentropy": 7.156700026988983, "loss/hidden": 17.253125, "loss/jsd": 0.0, "loss/logits": 5.156575608253479, "step": 50 }, { "epoch": 0.006, "grad_norm": 1232.0, "grad_norm_var": 68241.45, "learning_rate": 6.400000000000001e-05, "loss": 61.2745, "loss/crossentropy": 6.0138510942459105, "loss/hidden": 15.80625, "loss/jsd": 0.0, "loss/logits": 3.8037488579750063, "step": 60 }, { "epoch": 0.007, "grad_norm": 376.0, "grad_norm_var": 626103.4, "learning_rate": 7.3e-05, "loss": 41.3695, "loss/crossentropy": 4.422797441482544, "loss/hidden": 13.1125, "loss/jsd": 0.0, "loss/logits": 2.4006322652101515, "step": 70 }, { "epoch": 0.008, "grad_norm": 272.0, "grad_norm_var": 674923.45, "learning_rate": 8.200000000000001e-05, "loss": 27.4755, "loss/crossentropy": 3.3576226443052293, "loss/hidden": 10.7359375, "loss/jsd": 0.0, "loss/logits": 1.3968962401151657, "step": 80 }, { "epoch": 0.009, "grad_norm": 296.0, "grad_norm_var": 15426.383333333333, "learning_rate": 9.1e-05, "loss": 22.6607, "loss/crossentropy": 3.217679074406624, "loss/hidden": 9.2140625, "loss/jsd": 0.0, "loss/logits": 1.055714099109173, "step": 90 }, { "epoch": 0.01, "grad_norm": 328.0, "grad_norm_var": 9349.666666666666, "learning_rate": 0.0001, "loss": 20.3108, "loss/crossentropy": 2.934060016274452, "loss/hidden": 8.40703125, "loss/jsd": 0.0, "loss/logits": 0.8702833190560341, "step": 100 }, { "epoch": 0.011, "grad_norm": 194.0, "grad_norm_var": 5992.866666666667, "learning_rate": 0.0001, "loss": 18.8852, "loss/crossentropy": 2.8450062334537507, "loss/hidden": 8.221875, "loss/jsd": 0.0, "loss/logits": 0.8380498677492142, "step": 110 }, { "epoch": 0.012, "grad_norm": 244.0, "grad_norm_var": 1176.5333333333333, "learning_rate": 0.0001, "loss": 17.97, "loss/crossentropy": 2.612249107658863, "loss/hidden": 7.578125, "loss/jsd": 0.0, "loss/logits": 0.686215291172266, "step": 120 }, { "epoch": 0.013, "grad_norm": 242.0, "grad_norm_var": 1168.8958333333333, "learning_rate": 0.0001, "loss": 17.2904, "loss/crossentropy": 2.8242316216230394, "loss/hidden": 7.7390625, "loss/jsd": 0.0, "loss/logits": 0.7805894792079926, "step": 130 }, { "epoch": 0.014, "grad_norm": 179.0, "grad_norm_var": 1465.1333333333334, "learning_rate": 0.0001, "loss": 16.5581, "loss/crossentropy": 2.737143725156784, "loss/hidden": 7.3421875, "loss/jsd": 0.0, "loss/logits": 0.6888546235859394, "step": 140 }, { "epoch": 0.015, "grad_norm": 175.0, "grad_norm_var": 1119.8625, "learning_rate": 0.0001, "loss": 16.0501, "loss/crossentropy": 2.7599751561880113, "loss/hidden": 7.05703125, "loss/jsd": 0.0, "loss/logits": 0.6640767879784107, "step": 150 }, { "epoch": 0.016, "grad_norm": 186.0, "grad_norm_var": 1044.5166666666667, "learning_rate": 0.0001, "loss": 15.4631, "loss/crossentropy": 2.6100075274705885, "loss/hidden": 6.8203125, "loss/jsd": 0.0, "loss/logits": 0.5824844464659691, "step": 160 }, { "epoch": 0.017, "grad_norm": 179.0, "grad_norm_var": 1082.8, "learning_rate": 0.0001, "loss": 15.2201, "loss/crossentropy": 2.4276285111904143, "loss/hidden": 6.8203125, "loss/jsd": 0.0, "loss/logits": 0.5915141828358174, "step": 170 }, { "epoch": 0.018, "grad_norm": 153.0, "grad_norm_var": 622.6625, "learning_rate": 0.0001, "loss": 14.9606, "loss/crossentropy": 2.630460512638092, "loss/hidden": 6.52578125, "loss/jsd": 0.0, "loss/logits": 0.5396774187684059, "step": 180 }, { "epoch": 0.019, "grad_norm": 176.0, "grad_norm_var": 1093.2, "learning_rate": 0.0001, "loss": 14.6255, "loss/crossentropy": 2.3158223152160646, "loss/hidden": 6.50390625, "loss/jsd": 0.0, "loss/logits": 0.4905257746577263, "step": 190 }, { "epoch": 0.02, "grad_norm": 112.0, "grad_norm_var": 695.7291666666666, "learning_rate": 0.0001, "loss": 14.3647, "loss/crossentropy": 2.586851382255554, "loss/hidden": 6.42265625, "loss/jsd": 0.0, "loss/logits": 0.5586091712117195, "step": 200 }, { "epoch": 0.021, "grad_norm": 118.5, "grad_norm_var": 574.3072916666666, "learning_rate": 0.0001, "loss": 14.0867, "loss/crossentropy": 2.5010055124759676, "loss/hidden": 6.34453125, "loss/jsd": 0.0, "loss/logits": 0.4965482771396637, "step": 210 }, { "epoch": 0.022, "grad_norm": 88.5, "grad_norm_var": 662.65, "learning_rate": 0.0001, "loss": 13.6551, "loss/crossentropy": 2.573444625735283, "loss/hidden": 6.33125, "loss/jsd": 0.0, "loss/logits": 0.5534068010747433, "step": 220 }, { "epoch": 0.023, "grad_norm": 118.0, "grad_norm_var": 412.1958333333333, "learning_rate": 0.0001, "loss": 13.4715, "loss/crossentropy": 2.4142292886972427, "loss/hidden": 5.96640625, "loss/jsd": 0.0, "loss/logits": 0.44360905699431896, "step": 230 }, { "epoch": 0.024, "grad_norm": 134.0, "grad_norm_var": 242.9, "learning_rate": 0.0001, "loss": 13.3289, "loss/crossentropy": 2.4670142769813537, "loss/hidden": 5.98671875, "loss/jsd": 0.0, "loss/logits": 0.47392544001340864, "step": 240 }, { "epoch": 0.025, "grad_norm": 137.0, "grad_norm_var": 158.4625, "learning_rate": 0.0001, "loss": 13.0031, "loss/crossentropy": 2.416000656783581, "loss/hidden": 5.7859375, "loss/jsd": 0.0, "loss/logits": 0.44607544504106045, "step": 250 }, { "epoch": 0.026, "grad_norm": 109.0, "grad_norm_var": 279.990625, "learning_rate": 0.0001, "loss": 13.0076, "loss/crossentropy": 2.370332670211792, "loss/hidden": 5.9984375, "loss/jsd": 0.0, "loss/logits": 0.5006627842783928, "step": 260 }, { "epoch": 0.027, "grad_norm": 129.0, "grad_norm_var": 427.37395833333335, "learning_rate": 0.0001, "loss": 12.8809, "loss/crossentropy": 2.281908763945103, "loss/hidden": 5.98671875, "loss/jsd": 0.0, "loss/logits": 0.45061586182564495, "step": 270 }, { "epoch": 0.028, "grad_norm": 98.0, "grad_norm_var": 278.1489583333333, "learning_rate": 0.0001, "loss": 12.8942, "loss/crossentropy": 2.3922384053468706, "loss/hidden": 5.6984375, "loss/jsd": 0.0, "loss/logits": 0.44376694336533545, "step": 280 }, { "epoch": 0.029, "grad_norm": 99.5, "grad_norm_var": 303.55, "learning_rate": 0.0001, "loss": 12.7122, "loss/crossentropy": 2.730095013976097, "loss/hidden": 5.49140625, "loss/jsd": 0.0, "loss/logits": 0.4411045670509338, "step": 290 }, { "epoch": 0.03, "grad_norm": 112.5, "grad_norm_var": 359.56666666666666, "learning_rate": 0.0001, "loss": 12.5618, "loss/crossentropy": 2.3741705983877184, "loss/hidden": 5.43203125, "loss/jsd": 0.0, "loss/logits": 0.40091707594692705, "step": 300 }, { "epoch": 0.031, "grad_norm": 84.5, "grad_norm_var": 245.25729166666667, "learning_rate": 0.0001, "loss": 12.2525, "loss/crossentropy": 2.2781229317188263, "loss/hidden": 5.53515625, "loss/jsd": 0.0, "loss/logits": 0.4274128321558237, "step": 310 }, { "epoch": 0.032, "grad_norm": 108.5, "grad_norm_var": 140.59583333333333, "learning_rate": 0.0001, "loss": 12.2935, "loss/crossentropy": 2.5757294684648513, "loss/hidden": 5.4609375, "loss/jsd": 0.0, "loss/logits": 0.42916890494525434, "step": 320 }, { "epoch": 0.033, "grad_norm": 108.0, "grad_norm_var": 70.89895833333334, "learning_rate": 0.0001, "loss": 12.1545, "loss/crossentropy": 2.527638339996338, "loss/hidden": 5.378125, "loss/jsd": 0.0, "loss/logits": 0.4032053742557764, "step": 330 }, { "epoch": 0.034, "grad_norm": 210.0, "grad_norm_var": 1272.465625, "learning_rate": 0.0001, "loss": 12.2482, "loss/crossentropy": 2.5401821002364158, "loss/hidden": 5.390625, "loss/jsd": 0.0, "loss/logits": 0.4444709587842226, "step": 340 }, { "epoch": 0.035, "grad_norm": 79.5, "grad_norm_var": 1376.5958333333333, "learning_rate": 0.0001, "loss": 12.08, "loss/crossentropy": 2.514840933680534, "loss/hidden": 5.2640625, "loss/jsd": 0.0, "loss/logits": 0.4077944982796907, "step": 350 }, { "epoch": 0.036, "grad_norm": 87.0, "grad_norm_var": 418.83229166666666, "learning_rate": 0.0001, "loss": 12.0245, "loss/crossentropy": 2.420889538526535, "loss/hidden": 5.34921875, "loss/jsd": 0.0, "loss/logits": 0.44222328886389733, "step": 360 }, { "epoch": 0.037, "grad_norm": 76.5, "grad_norm_var": 138.5625, "learning_rate": 0.0001, "loss": 11.7097, "loss/crossentropy": 2.2826619133353234, "loss/hidden": 5.3296875, "loss/jsd": 0.0, "loss/logits": 0.3849468305706978, "step": 370 }, { "epoch": 0.038, "grad_norm": 96.5, "grad_norm_var": 184.93229166666666, "learning_rate": 0.0001, "loss": 11.465, "loss/crossentropy": 2.4052042722702027, "loss/hidden": 5.16796875, "loss/jsd": 0.0, "loss/logits": 0.40173302926123144, "step": 380 }, { "epoch": 0.039, "grad_norm": 125.5, "grad_norm_var": 183.09583333333333, "learning_rate": 0.0001, "loss": 11.6273, "loss/crossentropy": 2.540145033597946, "loss/hidden": 5.215625, "loss/jsd": 0.0, "loss/logits": 0.41224894523620603, "step": 390 }, { "epoch": 0.04, "grad_norm": 83.5, "grad_norm_var": 258.315625, "learning_rate": 0.0001, "loss": 11.397, "loss/crossentropy": 2.207468980550766, "loss/hidden": 5.09296875, "loss/jsd": 0.0, "loss/logits": 0.3590874429792166, "step": 400 }, { "epoch": 0.041, "grad_norm": 94.5, "grad_norm_var": 184.5625, "learning_rate": 0.0001, "loss": 11.443, "loss/crossentropy": 2.4378984421491623, "loss/hidden": 5.21171875, "loss/jsd": 0.0, "loss/logits": 0.40493359677493573, "step": 410 }, { "epoch": 0.042, "grad_norm": 106.5, "grad_norm_var": 125.590625, "learning_rate": 0.0001, "loss": 11.5678, "loss/crossentropy": 2.518555220961571, "loss/hidden": 5.07265625, "loss/jsd": 0.0, "loss/logits": 0.4297170080244541, "step": 420 }, { "epoch": 0.043, "grad_norm": 87.5, "grad_norm_var": 115.765625, "learning_rate": 0.0001, "loss": 11.3132, "loss/crossentropy": 2.490597203373909, "loss/hidden": 5.11171875, "loss/jsd": 0.0, "loss/logits": 0.403754598274827, "step": 430 }, { "epoch": 0.044, "grad_norm": 92.5, "grad_norm_var": 156.35729166666667, "learning_rate": 0.0001, "loss": 11.1476, "loss/crossentropy": 2.037529316544533, "loss/hidden": 5.07421875, "loss/jsd": 0.0, "loss/logits": 0.35246654506772757, "step": 440 }, { "epoch": 0.045, "grad_norm": 80.5, "grad_norm_var": 210.66666666666666, "learning_rate": 0.0001, "loss": 11.3038, "loss/crossentropy": 2.3201738983392715, "loss/hidden": 5.0828125, "loss/jsd": 0.0, "loss/logits": 0.38196625709533694, "step": 450 }, { "epoch": 0.046, "grad_norm": 107.5, "grad_norm_var": 284.1666666666667, "learning_rate": 0.0001, "loss": 11.3625, "loss/crossentropy": 2.4791718110442162, "loss/hidden": 4.95546875, "loss/jsd": 0.0, "loss/logits": 0.36495909169316293, "step": 460 }, { "epoch": 0.047, "grad_norm": 91.5, "grad_norm_var": 247.39895833333333, "learning_rate": 0.0001, "loss": 11.0542, "loss/crossentropy": 2.3155667960643767, "loss/hidden": 4.93828125, "loss/jsd": 0.0, "loss/logits": 0.362844867631793, "step": 470 }, { "epoch": 0.048, "grad_norm": 95.0, "grad_norm_var": 194.79895833333333, "learning_rate": 0.0001, "loss": 11.2413, "loss/crossentropy": 2.496318203210831, "loss/hidden": 4.840625, "loss/jsd": 0.0, "loss/logits": 0.3887303464114666, "step": 480 }, { "epoch": 0.049, "grad_norm": 74.5, "grad_norm_var": 243.840625, "learning_rate": 0.0001, "loss": 10.9416, "loss/crossentropy": 2.385223904252052, "loss/hidden": 4.85234375, "loss/jsd": 0.0, "loss/logits": 0.3598880790174007, "step": 490 }, { "epoch": 0.05, "grad_norm": 79.0, "grad_norm_var": 105.990625, "learning_rate": 0.0001, "loss": 10.9114, "loss/crossentropy": 2.2462552055716514, "loss/hidden": 4.80859375, "loss/jsd": 0.0, "loss/logits": 0.3265662036836147, "step": 500 }, { "epoch": 0.051, "grad_norm": 96.5, "grad_norm_var": 138.43229166666666, "learning_rate": 0.0001, "loss": 10.8821, "loss/crossentropy": 2.297148121893406, "loss/hidden": 4.8609375, "loss/jsd": 0.0, "loss/logits": 0.3467547960579395, "step": 510 }, { "epoch": 0.052, "grad_norm": 97.5, "grad_norm_var": 129.365625, "learning_rate": 0.0001, "loss": 10.9299, "loss/crossentropy": 2.4197026968002318, "loss/hidden": 4.7921875, "loss/jsd": 0.0, "loss/logits": 0.3632193084806204, "step": 520 }, { "epoch": 0.053, "grad_norm": 81.5, "grad_norm_var": 99.47395833333333, "learning_rate": 0.0001, "loss": 10.787, "loss/crossentropy": 2.36982424557209, "loss/hidden": 4.825, "loss/jsd": 0.0, "loss/logits": 0.3405680742114782, "step": 530 }, { "epoch": 0.054, "grad_norm": 85.5, "grad_norm_var": 48.340625, "learning_rate": 0.0001, "loss": 10.8675, "loss/crossentropy": 2.4611779801547526, "loss/hidden": 4.8625, "loss/jsd": 0.0, "loss/logits": 0.36872007288038733, "step": 540 }, { "epoch": 0.055, "grad_norm": 93.5, "grad_norm_var": 84.24895833333333, "learning_rate": 0.0001, "loss": 10.64, "loss/crossentropy": 2.1758567959070207, "loss/hidden": 4.7484375, "loss/jsd": 0.0, "loss/logits": 0.3336840860545635, "step": 550 }, { "epoch": 0.056, "grad_norm": 114.0, "grad_norm_var": 129.53098958333334, "learning_rate": 0.0001, "loss": 10.5615, "loss/crossentropy": 2.3970536097884176, "loss/hidden": 4.7625, "loss/jsd": 0.0, "loss/logits": 0.34276723079383375, "step": 560 }, { "epoch": 0.057, "grad_norm": 80.0, "grad_norm_var": 579.57890625, "learning_rate": 0.0001, "loss": 10.8999, "loss/crossentropy": 2.4695185527205465, "loss/hidden": 4.9453125, "loss/jsd": 0.0, "loss/logits": 0.42829814068973066, "step": 570 }, { "epoch": 0.058, "grad_norm": 85.0, "grad_norm_var": 596.9572916666667, "learning_rate": 0.0001, "loss": 10.8802, "loss/crossentropy": 2.3520184576511385, "loss/hidden": 4.790625, "loss/jsd": 0.0, "loss/logits": 0.3662864986807108, "step": 580 }, { "epoch": 0.059, "grad_norm": 73.0, "grad_norm_var": 181.69583333333333, "learning_rate": 0.0001, "loss": 10.6744, "loss/crossentropy": 2.2842736929655074, "loss/hidden": 4.71484375, "loss/jsd": 0.0, "loss/logits": 0.3500846643000841, "step": 590 }, { "epoch": 0.06, "grad_norm": 97.0, "grad_norm_var": 160.58307291666668, "learning_rate": 0.0001, "loss": 10.6987, "loss/crossentropy": 2.29906165599823, "loss/hidden": 4.602734375, "loss/jsd": 0.0, "loss/logits": 0.334361494705081, "step": 600 }, { "epoch": 0.061, "grad_norm": 89.0, "grad_norm_var": 162.67682291666668, "learning_rate": 0.0001, "loss": 10.6143, "loss/crossentropy": 2.3032930195331573, "loss/hidden": 4.6703125, "loss/jsd": 0.0, "loss/logits": 0.3258141163736582, "step": 610 }, { "epoch": 0.062, "grad_norm": 77.5, "grad_norm_var": 97.12916666666666, "learning_rate": 0.0001, "loss": 10.5946, "loss/crossentropy": 2.452244046330452, "loss/hidden": 4.7109375, "loss/jsd": 0.0, "loss/logits": 0.3432691916823387, "step": 620 }, { "epoch": 0.063, "grad_norm": 75.5, "grad_norm_var": 227.69973958333333, "learning_rate": 0.0001, "loss": 10.6287, "loss/crossentropy": 2.2894835874438284, "loss/hidden": 4.74609375, "loss/jsd": 0.0, "loss/logits": 0.35672005768865345, "step": 630 }, { "epoch": 0.064, "grad_norm": 70.0, "grad_norm_var": 541.2322916666667, "learning_rate": 0.0001, "loss": 10.6195, "loss/crossentropy": 2.4114772886037827, "loss/hidden": 4.70546875, "loss/jsd": 0.0, "loss/logits": 0.35591375902295114, "step": 640 }, { "epoch": 0.065, "grad_norm": 77.0, "grad_norm_var": 435.15390625, "learning_rate": 0.0001, "loss": 10.4142, "loss/crossentropy": 2.332440134882927, "loss/hidden": 4.634375, "loss/jsd": 0.0, "loss/logits": 0.339809150993824, "step": 650 }, { "epoch": 0.066, "grad_norm": 71.5, "grad_norm_var": 118.03307291666667, "learning_rate": 0.0001, "loss": 10.4602, "loss/crossentropy": 2.154422373324633, "loss/hidden": 4.54140625, "loss/jsd": 0.0, "loss/logits": 0.3334257358685136, "step": 660 }, { "epoch": 0.067, "grad_norm": 73.5, "grad_norm_var": 144.94166666666666, "learning_rate": 0.0001, "loss": 10.5185, "loss/crossentropy": 2.3223402693867685, "loss/hidden": 4.795703125, "loss/jsd": 0.0, "loss/logits": 0.37188967503607273, "step": 670 }, { "epoch": 0.068, "grad_norm": 61.5, "grad_norm_var": 169.65598958333334, "learning_rate": 0.0001, "loss": 10.5323, "loss/crossentropy": 2.332353001832962, "loss/hidden": 4.50625, "loss/jsd": 0.0, "loss/logits": 0.31948004066944125, "step": 680 }, { "epoch": 0.069, "grad_norm": 74.0, "grad_norm_var": 155.94140625, "learning_rate": 0.0001, "loss": 10.4359, "loss/crossentropy": 2.4077556908130644, "loss/hidden": 4.623828125, "loss/jsd": 0.0, "loss/logits": 0.339173823595047, "step": 690 }, { "epoch": 0.07, "grad_norm": 82.5, "grad_norm_var": 125.55416666666666, "learning_rate": 0.0001, "loss": 10.4493, "loss/crossentropy": 2.292634981870651, "loss/hidden": 4.571875, "loss/jsd": 0.0, "loss/logits": 0.3477486100047827, "step": 700 }, { "epoch": 0.071, "grad_norm": 88.0, "grad_norm_var": 155.84166666666667, "learning_rate": 0.0001, "loss": 10.2041, "loss/crossentropy": 2.4034020826220512, "loss/hidden": 4.53046875, "loss/jsd": 0.0, "loss/logits": 0.3406600248068571, "step": 710 }, { "epoch": 0.072, "grad_norm": 124.0, "grad_norm_var": 230.83307291666668, "learning_rate": 0.0001, "loss": 10.3489, "loss/crossentropy": 2.333241228759289, "loss/hidden": 4.6015625, "loss/jsd": 0.0, "loss/logits": 0.3285223826766014, "step": 720 }, { "epoch": 0.073, "grad_norm": 71.0, "grad_norm_var": 278.95390625, "learning_rate": 0.0001, "loss": 10.1548, "loss/crossentropy": 2.4066421508789064, "loss/hidden": 4.682421875, "loss/jsd": 0.0, "loss/logits": 0.338771004602313, "step": 730 }, { "epoch": 0.074, "grad_norm": 84.5, "grad_norm_var": 166.85729166666667, "learning_rate": 0.0001, "loss": 10.2647, "loss/crossentropy": 2.2724754482507707, "loss/hidden": 4.567578125, "loss/jsd": 0.0, "loss/logits": 0.3267147310078144, "step": 740 }, { "epoch": 0.075, "grad_norm": 67.5, "grad_norm_var": 343.5247395833333, "learning_rate": 0.0001, "loss": 10.2815, "loss/crossentropy": 2.3046080738306047, "loss/hidden": 4.473828125, "loss/jsd": 0.0, "loss/logits": 0.33236319161951544, "step": 750 }, { "epoch": 0.076, "grad_norm": 68.5, "grad_norm_var": 306.540625, "learning_rate": 0.0001, "loss": 10.2479, "loss/crossentropy": 2.2831736013293265, "loss/hidden": 4.62734375, "loss/jsd": 0.0, "loss/logits": 0.3329113606363535, "step": 760 }, { "epoch": 0.077, "grad_norm": 88.5, "grad_norm_var": 111.57473958333334, "learning_rate": 0.0001, "loss": 10.2161, "loss/crossentropy": 2.3853780582547186, "loss/hidden": 4.541015625, "loss/jsd": 0.0, "loss/logits": 0.31959532871842383, "step": 770 }, { "epoch": 0.078, "grad_norm": 80.5, "grad_norm_var": 110.65729166666667, "learning_rate": 0.0001, "loss": 10.2076, "loss/crossentropy": 2.3982744574546815, "loss/hidden": 4.55859375, "loss/jsd": 0.0, "loss/logits": 0.3542841043323278, "step": 780 }, { "epoch": 0.079, "grad_norm": 66.0, "grad_norm_var": 275.6322916666667, "learning_rate": 0.0001, "loss": 10.1697, "loss/crossentropy": 2.4292824655771255, "loss/hidden": 4.632421875, "loss/jsd": 0.0, "loss/logits": 0.36711033545434474, "step": 790 }, { "epoch": 0.08, "grad_norm": 57.25, "grad_norm_var": 290.9291666666667, "learning_rate": 0.0001, "loss": 10.2176, "loss/crossentropy": 2.380542576313019, "loss/hidden": 4.509375, "loss/jsd": 0.0, "loss/logits": 0.3368827097117901, "step": 800 }, { "epoch": 0.081, "grad_norm": 60.75, "grad_norm_var": 52.67916666666667, "learning_rate": 0.0001, "loss": 10.2311, "loss/crossentropy": 2.4212940514087675, "loss/hidden": 4.55546875, "loss/jsd": 0.0, "loss/logits": 0.355662290379405, "step": 810 }, { "epoch": 0.082, "grad_norm": 60.75, "grad_norm_var": 65.81666666666666, "learning_rate": 0.0001, "loss": 10.1866, "loss/crossentropy": 2.4809795886278154, "loss/hidden": 4.49140625, "loss/jsd": 0.0, "loss/logits": 0.3553234666585922, "step": 820 }, { "epoch": 0.083, "grad_norm": 56.5, "grad_norm_var": 98.2875, "learning_rate": 0.0001, "loss": 9.9805, "loss/crossentropy": 2.306653854250908, "loss/hidden": 4.40078125, "loss/jsd": 0.0, "loss/logits": 0.3146494958549738, "step": 830 }, { "epoch": 0.084, "grad_norm": 68.0, "grad_norm_var": 38.51015625, "learning_rate": 0.0001, "loss": 10.1087, "loss/crossentropy": 2.250006601214409, "loss/hidden": 4.422265625, "loss/jsd": 0.0, "loss/logits": 0.30200174674391744, "step": 840 }, { "epoch": 0.085, "grad_norm": 70.5, "grad_norm_var": 43.483072916666664, "learning_rate": 0.0001, "loss": 10.0526, "loss/crossentropy": 2.211633677780628, "loss/hidden": 4.47421875, "loss/jsd": 0.0, "loss/logits": 0.31178686060011385, "step": 850 }, { "epoch": 0.086, "grad_norm": 61.0, "grad_norm_var": 41.545572916666664, "learning_rate": 0.0001, "loss": 10.1915, "loss/crossentropy": 2.5281356513500213, "loss/hidden": 4.389453125, "loss/jsd": 0.0, "loss/logits": 0.34625968635082244, "step": 860 }, { "epoch": 0.087, "grad_norm": 72.5, "grad_norm_var": 54.475, "learning_rate": 0.0001, "loss": 10.0007, "loss/crossentropy": 2.4020907685160635, "loss/hidden": 4.326171875, "loss/jsd": 0.0, "loss/logits": 0.32252500094473363, "step": 870 }, { "epoch": 0.088, "grad_norm": 142.0, "grad_norm_var": 499.1375, "learning_rate": 0.0001, "loss": 9.99, "loss/crossentropy": 2.384984764456749, "loss/hidden": 4.38515625, "loss/jsd": 0.0, "loss/logits": 0.3189360786229372, "step": 880 }, { "epoch": 0.089, "grad_norm": 57.75, "grad_norm_var": 527.23515625, "learning_rate": 0.0001, "loss": 9.9879, "loss/crossentropy": 2.3401281625032424, "loss/hidden": 4.46328125, "loss/jsd": 0.0, "loss/logits": 0.3382201848551631, "step": 890 }, { "epoch": 0.09, "grad_norm": 71.5, "grad_norm_var": 95.97265625, "learning_rate": 0.0001, "loss": 9.9352, "loss/crossentropy": 2.3969784706830977, "loss/hidden": 4.384375, "loss/jsd": 0.0, "loss/logits": 0.336395762488246, "step": 900 }, { "epoch": 0.091, "grad_norm": 79.0, "grad_norm_var": 144.96666666666667, "learning_rate": 0.0001, "loss": 10.149, "loss/crossentropy": 2.4599110893905163, "loss/hidden": 4.30703125, "loss/jsd": 0.0, "loss/logits": 0.3240171395242214, "step": 910 }, { "epoch": 0.092, "grad_norm": 65.5, "grad_norm_var": 119.2375, "learning_rate": 0.0001, "loss": 9.9634, "loss/crossentropy": 2.4210876494646074, "loss/hidden": 4.30390625, "loss/jsd": 0.0, "loss/logits": 0.32166178375482557, "step": 920 }, { "epoch": 0.093, "grad_norm": 63.0, "grad_norm_var": 41.47083333333333, "learning_rate": 0.0001, "loss": 9.744, "loss/crossentropy": 2.2256636448204516, "loss/hidden": 4.284765625, "loss/jsd": 0.0, "loss/logits": 0.29795306362211704, "step": 930 }, { "epoch": 0.094, "grad_norm": 53.5, "grad_norm_var": 192.55807291666667, "learning_rate": 0.0001, "loss": 9.8636, "loss/crossentropy": 2.297808923572302, "loss/hidden": 4.31640625, "loss/jsd": 0.0, "loss/logits": 0.30742434673011304, "step": 940 }, { "epoch": 0.095, "grad_norm": 61.0, "grad_norm_var": 81.95729166666666, "learning_rate": 0.0001, "loss": 9.798, "loss/crossentropy": 2.3219059616327287, "loss/hidden": 4.211328125, "loss/jsd": 0.0, "loss/logits": 0.30037002861499784, "step": 950 }, { "epoch": 0.096, "grad_norm": 56.75, "grad_norm_var": 61.55807291666667, "learning_rate": 0.0001, "loss": 9.7449, "loss/crossentropy": 2.3104363679885864, "loss/hidden": 4.388671875, "loss/jsd": 0.0, "loss/logits": 0.327311984449625, "step": 960 }, { "epoch": 0.097, "grad_norm": 60.0, "grad_norm_var": 56.18932291666667, "learning_rate": 0.0001, "loss": 9.9668, "loss/crossentropy": 2.308886554837227, "loss/hidden": 4.407421875, "loss/jsd": 0.0, "loss/logits": 0.3183224782347679, "step": 970 }, { "epoch": 0.098, "grad_norm": 66.5, "grad_norm_var": 42.05416666666667, "learning_rate": 0.0001, "loss": 9.7807, "loss/crossentropy": 2.3363482102751734, "loss/hidden": 4.2921875, "loss/jsd": 0.0, "loss/logits": 0.3384779039770365, "step": 980 }, { "epoch": 0.099, "grad_norm": 57.25, "grad_norm_var": 56.891666666666666, "learning_rate": 0.0001, "loss": 9.7501, "loss/crossentropy": 2.1767295479774473, "loss/hidden": 4.466015625, "loss/jsd": 0.0, "loss/logits": 0.31410733237862587, "step": 990 }, { "epoch": 0.1, "grad_norm": 50.25, "grad_norm_var": 75.85598958333334, "learning_rate": 0.0001, "loss": 9.9273, "loss/crossentropy": 2.505411845445633, "loss/hidden": 4.36015625, "loss/jsd": 0.0, "loss/logits": 0.33212706074118614, "step": 1000 }, { "epoch": 0.101, "grad_norm": 77.5, "grad_norm_var": 196.21848958333334, "learning_rate": 0.0001, "loss": 9.9237, "loss/crossentropy": 2.3281257838010787, "loss/hidden": 4.35546875, "loss/jsd": 0.0, "loss/logits": 0.32293859515339135, "step": 1010 }, { "epoch": 0.102, "grad_norm": 63.25, "grad_norm_var": 167.42395833333333, "learning_rate": 0.0001, "loss": 9.7592, "loss/crossentropy": 2.3165650010108947, "loss/hidden": 4.32890625, "loss/jsd": 0.0, "loss/logits": 0.31759811006486416, "step": 1020 }, { "epoch": 0.103, "grad_norm": 60.0, "grad_norm_var": 153.80833333333334, "learning_rate": 0.0001, "loss": 9.7366, "loss/crossentropy": 2.3203016728162766, "loss/hidden": 4.28515625, "loss/jsd": 0.0, "loss/logits": 0.31944827549159527, "step": 1030 }, { "epoch": 0.104, "grad_norm": 66.5, "grad_norm_var": 3319.3958333333335, "learning_rate": 0.0001, "loss": 10.0035, "loss/crossentropy": 2.4188640087842943, "loss/hidden": 4.38828125, "loss/jsd": 0.0, "loss/logits": 0.3581279247999191, "step": 1040 }, { "epoch": 0.105, "grad_norm": 60.25, "grad_norm_var": 3338.31640625, "learning_rate": 0.0001, "loss": 9.6837, "loss/crossentropy": 2.2860016629099844, "loss/hidden": 4.325390625, "loss/jsd": 0.0, "loss/logits": 0.318701284006238, "step": 1050 }, { "epoch": 0.106, "grad_norm": 74.0, "grad_norm_var": 112.4, "learning_rate": 0.0001, "loss": 9.517, "loss/crossentropy": 2.4143033266067504, "loss/hidden": 4.319140625, "loss/jsd": 0.0, "loss/logits": 0.3072842717170715, "step": 1060 }, { "epoch": 0.107, "grad_norm": 71.5, "grad_norm_var": 68.60598958333334, "learning_rate": 0.0001, "loss": 9.8549, "loss/crossentropy": 2.351083371043205, "loss/hidden": 4.398046875, "loss/jsd": 0.0, "loss/logits": 0.33429058492183683, "step": 1070 }, { "epoch": 0.108, "grad_norm": 53.25, "grad_norm_var": 43.83229166666667, "learning_rate": 0.0001, "loss": 9.7738, "loss/crossentropy": 2.4011227190494537, "loss/hidden": 4.29453125, "loss/jsd": 0.0, "loss/logits": 0.3128178097307682, "step": 1080 }, { "epoch": 0.109, "grad_norm": 72.0, "grad_norm_var": 34.82890625, "learning_rate": 0.0001, "loss": 9.7432, "loss/crossentropy": 2.310031126439571, "loss/hidden": 4.38984375, "loss/jsd": 0.0, "loss/logits": 0.3273486144840717, "step": 1090 }, { "epoch": 0.11, "grad_norm": 66.5, "grad_norm_var": 111.64895833333334, "learning_rate": 0.0001, "loss": 9.6743, "loss/crossentropy": 2.3055127263069153, "loss/hidden": 4.21796875, "loss/jsd": 0.0, "loss/logits": 0.32233874313533306, "step": 1100 }, { "epoch": 0.111, "grad_norm": 51.5, "grad_norm_var": 46.70729166666667, "learning_rate": 0.0001, "loss": 9.8026, "loss/crossentropy": 2.314373381435871, "loss/hidden": 4.256640625, "loss/jsd": 0.0, "loss/logits": 0.3083756107836962, "step": 1110 }, { "epoch": 0.112, "grad_norm": 57.75, "grad_norm_var": 7292.4375, "learning_rate": 0.0001, "loss": 9.7291, "loss/crossentropy": 2.5138203650712967, "loss/hidden": 4.19921875, "loss/jsd": 0.0, "loss/logits": 0.30809955932199956, "step": 1120 }, { "epoch": 0.113, "grad_norm": 56.5, "grad_norm_var": 29.190625, "learning_rate": 0.0001, "loss": 9.6823, "loss/crossentropy": 2.2719234466552733, "loss/hidden": 4.294140625, "loss/jsd": 0.0, "loss/logits": 0.3143883816897869, "step": 1130 }, { "epoch": 0.114, "grad_norm": 60.5, "grad_norm_var": 45.925, "learning_rate": 0.0001, "loss": 9.7564, "loss/crossentropy": 2.4254489660263063, "loss/hidden": 4.261328125, "loss/jsd": 0.0, "loss/logits": 0.3154076419770718, "step": 1140 }, { "epoch": 0.115, "grad_norm": 56.0, "grad_norm_var": 71.74583333333334, "learning_rate": 0.0001, "loss": 9.7001, "loss/crossentropy": 2.28252642005682, "loss/hidden": 4.323046875, "loss/jsd": 0.0, "loss/logits": 0.3203336976468563, "step": 1150 }, { "epoch": 0.116, "grad_norm": 67.0, "grad_norm_var": 46.040625, "learning_rate": 0.0001, "loss": 9.7436, "loss/crossentropy": 2.391976150870323, "loss/hidden": 4.225390625, "loss/jsd": 0.0, "loss/logits": 0.31455044373869895, "step": 1160 }, { "epoch": 0.117, "grad_norm": 46.0, "grad_norm_var": 47.06640625, "learning_rate": 0.0001, "loss": 9.5622, "loss/crossentropy": 2.3361207604408265, "loss/hidden": 4.19296875, "loss/jsd": 0.0, "loss/logits": 0.30060703232884406, "step": 1170 }, { "epoch": 0.118, "grad_norm": 56.25, "grad_norm_var": 49.264322916666664, "learning_rate": 0.0001, "loss": 9.6834, "loss/crossentropy": 2.297483670711517, "loss/hidden": 4.2890625, "loss/jsd": 0.0, "loss/logits": 0.2907493541017175, "step": 1180 }, { "epoch": 0.119, "grad_norm": 52.5, "grad_norm_var": 12.27890625, "learning_rate": 0.0001, "loss": 9.6207, "loss/crossentropy": 2.2364058643579483, "loss/hidden": 4.277734375, "loss/jsd": 0.0, "loss/logits": 0.3097097765654325, "step": 1190 }, { "epoch": 0.12, "grad_norm": 68.5, "grad_norm_var": 35.055989583333336, "learning_rate": 0.0001, "loss": 9.6018, "loss/crossentropy": 2.2412969201803206, "loss/hidden": 4.287109375, "loss/jsd": 0.0, "loss/logits": 0.31851550191640854, "step": 1200 }, { "epoch": 0.121, "grad_norm": 60.5, "grad_norm_var": 25.774739583333332, "learning_rate": 0.0001, "loss": 9.6979, "loss/crossentropy": 2.3062032952904703, "loss/hidden": 4.258984375, "loss/jsd": 0.0, "loss/logits": 0.3124631106853485, "step": 1210 }, { "epoch": 0.122, "grad_norm": 59.25, "grad_norm_var": 20.026822916666667, "learning_rate": 0.0001, "loss": 9.7129, "loss/crossentropy": 2.4036868065595627, "loss/hidden": 4.20234375, "loss/jsd": 0.0, "loss/logits": 0.31101155243813994, "step": 1220 }, { "epoch": 0.123, "grad_norm": 53.25, "grad_norm_var": 75.30833333333334, "learning_rate": 0.0001, "loss": 9.7047, "loss/crossentropy": 2.3730016142129897, "loss/hidden": 4.193359375, "loss/jsd": 0.0, "loss/logits": 0.3105484452098608, "step": 1230 }, { "epoch": 0.124, "grad_norm": 62.25, "grad_norm_var": 33.27682291666667, "learning_rate": 0.0001, "loss": 9.6313, "loss/crossentropy": 2.2872567594051363, "loss/hidden": 4.319140625, "loss/jsd": 0.0, "loss/logits": 0.3244694545865059, "step": 1240 }, { "epoch": 0.125, "grad_norm": 61.25, "grad_norm_var": 25.673958333333335, "learning_rate": 0.0001, "loss": 9.6217, "loss/crossentropy": 2.3013710603117943, "loss/hidden": 4.29140625, "loss/jsd": 0.0, "loss/logits": 0.32178852558135984, "step": 1250 }, { "epoch": 0.126, "grad_norm": 49.5, "grad_norm_var": 46.97265625, "learning_rate": 0.0001, "loss": 9.6151, "loss/crossentropy": 2.2743802405893803, "loss/hidden": 4.19921875, "loss/jsd": 0.0, "loss/logits": 0.3104738780297339, "step": 1260 }, { "epoch": 0.127, "grad_norm": 52.25, "grad_norm_var": 318.94557291666666, "learning_rate": 0.0001, "loss": 9.635, "loss/crossentropy": 2.2751111879944803, "loss/hidden": 4.17578125, "loss/jsd": 0.0, "loss/logits": 0.2947248375043273, "step": 1270 }, { "epoch": 0.128, "grad_norm": 59.5, "grad_norm_var": 201.26848958333332, "learning_rate": 0.0001, "loss": 9.605, "loss/crossentropy": 2.3590754181146623, "loss/hidden": 4.116015625, "loss/jsd": 0.0, "loss/logits": 0.29773430675268175, "step": 1280 }, { "epoch": 0.129, "grad_norm": 50.0, "grad_norm_var": 25.795833333333334, "learning_rate": 0.0001, "loss": 9.4314, "loss/crossentropy": 2.165515697002411, "loss/hidden": 4.148046875, "loss/jsd": 0.0, "loss/logits": 0.2729496695101261, "step": 1290 }, { "epoch": 0.13, "grad_norm": 51.5, "grad_norm_var": 65.69557291666666, "learning_rate": 0.0001, "loss": 9.4579, "loss/crossentropy": 2.425456903874874, "loss/hidden": 4.1140625, "loss/jsd": 0.0, "loss/logits": 0.3002984166145325, "step": 1300 }, { "epoch": 0.131, "grad_norm": 55.75, "grad_norm_var": 74.63515625, "learning_rate": 0.0001, "loss": 9.562, "loss/crossentropy": 2.3212677478790282, "loss/hidden": 4.209765625, "loss/jsd": 0.0, "loss/logits": 0.28645528480410576, "step": 1310 }, { "epoch": 0.132, "grad_norm": 44.75, "grad_norm_var": 39.139322916666664, "learning_rate": 0.0001, "loss": 9.305, "loss/crossentropy": 2.2911602184176445, "loss/hidden": 4.133984375, "loss/jsd": 0.0, "loss/logits": 0.28404638059437276, "step": 1320 }, { "epoch": 0.133, "grad_norm": 52.25, "grad_norm_var": 76.19583333333334, "learning_rate": 0.0001, "loss": 9.3122, "loss/crossentropy": 2.3109163105487824, "loss/hidden": 4.137109375, "loss/jsd": 0.0, "loss/logits": 0.2864396806806326, "step": 1330 }, { "epoch": 0.134, "grad_norm": 47.0, "grad_norm_var": 41.66015625, "learning_rate": 0.0001, "loss": 9.4629, "loss/crossentropy": 2.353537403047085, "loss/hidden": 4.08203125, "loss/jsd": 0.0, "loss/logits": 0.2971150416880846, "step": 1340 }, { "epoch": 0.135, "grad_norm": 46.25, "grad_norm_var": 45.31848958333333, "learning_rate": 0.0001, "loss": 9.365, "loss/crossentropy": 2.3774181246757506, "loss/hidden": 4.09296875, "loss/jsd": 0.0, "loss/logits": 0.2799839396029711, "step": 1350 }, { "epoch": 0.136, "grad_norm": 51.0, "grad_norm_var": 17.93515625, "learning_rate": 0.0001, "loss": 9.3498, "loss/crossentropy": 2.246833881735802, "loss/hidden": 4.18828125, "loss/jsd": 0.0, "loss/logits": 0.2903384942561388, "step": 1360 }, { "epoch": 0.137, "grad_norm": 51.25, "grad_norm_var": 12.420833333333333, "learning_rate": 0.0001, "loss": 9.4976, "loss/crossentropy": 2.453240838646889, "loss/hidden": 4.173828125, "loss/jsd": 0.0, "loss/logits": 0.3144164770841599, "step": 1370 }, { "epoch": 0.138, "grad_norm": 70.0, "grad_norm_var": 2011.0322916666667, "learning_rate": 0.0001, "loss": 9.5884, "loss/crossentropy": 2.174116183817387, "loss/hidden": 4.24921875, "loss/jsd": 0.0, "loss/logits": 0.2923248626291752, "step": 1380 }, { "epoch": 0.139, "grad_norm": 53.75, "grad_norm_var": 1988.8833333333334, "learning_rate": 0.0001, "loss": 9.5249, "loss/crossentropy": 2.3638354018330574, "loss/hidden": 4.184765625, "loss/jsd": 0.0, "loss/logits": 0.3066251628100872, "step": 1390 }, { "epoch": 0.14, "grad_norm": 55.5, "grad_norm_var": 22.779166666666665, "learning_rate": 0.0001, "loss": 9.3528, "loss/crossentropy": 2.4166768550872804, "loss/hidden": 4.123828125, "loss/jsd": 0.0, "loss/logits": 0.29636494982987643, "step": 1400 }, { "epoch": 0.141, "grad_norm": 60.5, "grad_norm_var": 66.59348958333334, "learning_rate": 0.0001, "loss": 9.5339, "loss/crossentropy": 2.3475931867957116, "loss/hidden": 4.1953125, "loss/jsd": 0.0, "loss/logits": 0.30608872696757317, "step": 1410 }, { "epoch": 0.142, "grad_norm": 51.25, "grad_norm_var": 62.49140625, "learning_rate": 0.0001, "loss": 9.3342, "loss/crossentropy": 2.1785849004983904, "loss/hidden": 4.161328125, "loss/jsd": 0.0, "loss/logits": 0.27641028352081776, "step": 1420 }, { "epoch": 0.143, "grad_norm": 54.25, "grad_norm_var": 30.154166666666665, "learning_rate": 0.0001, "loss": 9.3898, "loss/crossentropy": 2.3990818440914152, "loss/hidden": 4.18359375, "loss/jsd": 0.0, "loss/logits": 0.2944341886788607, "step": 1430 }, { "epoch": 0.144, "grad_norm": 51.75, "grad_norm_var": 38.93932291666667, "learning_rate": 0.0001, "loss": 9.4628, "loss/crossentropy": 2.4946817860007284, "loss/hidden": 4.198828125, "loss/jsd": 0.0, "loss/logits": 0.31867978498339655, "step": 1440 }, { "epoch": 0.145, "grad_norm": 53.75, "grad_norm_var": 33.9, "learning_rate": 0.0001, "loss": 9.3416, "loss/crossentropy": 2.2067521095275877, "loss/hidden": 4.235546875, "loss/jsd": 0.0, "loss/logits": 0.2976540043950081, "step": 1450 }, { "epoch": 0.146, "grad_norm": 60.75, "grad_norm_var": 142.08229166666666, "learning_rate": 0.0001, "loss": 9.4716, "loss/crossentropy": 2.4361192852258684, "loss/hidden": 4.1140625, "loss/jsd": 0.0, "loss/logits": 0.2877715673297644, "step": 1460 }, { "epoch": 0.147, "grad_norm": 58.75, "grad_norm_var": 44.35, "learning_rate": 0.0001, "loss": 9.4006, "loss/crossentropy": 2.239429622516036, "loss/hidden": 4.026171875, "loss/jsd": 0.0, "loss/logits": 0.27844256814569235, "step": 1470 }, { "epoch": 0.148, "grad_norm": 45.0, "grad_norm_var": 33.95390625, "learning_rate": 0.0001, "loss": 9.3993, "loss/crossentropy": 2.0759536787867545, "loss/hidden": 4.068359375, "loss/jsd": 0.0, "loss/logits": 0.2688772227615118, "step": 1480 }, { "epoch": 0.149, "grad_norm": 51.75, "grad_norm_var": 25.795833333333334, "learning_rate": 0.0001, "loss": 9.3786, "loss/crossentropy": 2.286362998187542, "loss/hidden": 4.15078125, "loss/jsd": 0.0, "loss/logits": 0.2942257083952427, "step": 1490 }, { "epoch": 0.15, "grad_norm": 46.75, "grad_norm_var": 20.520833333333332, "learning_rate": 0.0001, "loss": 9.2903, "loss/crossentropy": 2.312733788788319, "loss/hidden": 3.971484375, "loss/jsd": 0.0, "loss/logits": 0.2691910218447447, "step": 1500 }, { "epoch": 0.151, "grad_norm": 50.25, "grad_norm_var": 28.290625, "learning_rate": 0.0001, "loss": 9.3076, "loss/crossentropy": 2.2467628076672552, "loss/hidden": 4.105078125, "loss/jsd": 0.0, "loss/logits": 0.2887777745723724, "step": 1510 }, { "epoch": 0.152, "grad_norm": 63.5, "grad_norm_var": 33.73098958333333, "learning_rate": 0.0001, "loss": 9.4203, "loss/crossentropy": 2.372379180788994, "loss/hidden": 4.07578125, "loss/jsd": 0.0, "loss/logits": 0.3087839350104332, "step": 1520 }, { "epoch": 0.153, "grad_norm": 45.5, "grad_norm_var": 40.108333333333334, "learning_rate": 0.0001, "loss": 9.3215, "loss/crossentropy": 2.3452367037534714, "loss/hidden": 4.210546875, "loss/jsd": 0.0, "loss/logits": 0.3159611392766237, "step": 1530 }, { "epoch": 0.154, "grad_norm": 58.25, "grad_norm_var": 27.539322916666666, "learning_rate": 0.0001, "loss": 9.3755, "loss/crossentropy": 2.3029753446578978, "loss/hidden": 3.999609375, "loss/jsd": 0.0, "loss/logits": 0.2623455457389355, "step": 1540 }, { "epoch": 0.155, "grad_norm": 51.75, "grad_norm_var": 26.9, "learning_rate": 0.0001, "loss": 9.3578, "loss/crossentropy": 2.3988554388284684, "loss/hidden": 4.08828125, "loss/jsd": 0.0, "loss/logits": 0.2846154376864433, "step": 1550 }, { "epoch": 0.156, "grad_norm": 91.0, "grad_norm_var": 1307.8372395833333, "learning_rate": 0.0001, "loss": 9.432, "loss/crossentropy": 2.343544365465641, "loss/hidden": 4.021875, "loss/jsd": 0.0, "loss/logits": 0.2907770898193121, "step": 1560 }, { "epoch": 0.157, "grad_norm": 52.0, "grad_norm_var": 170.62890625, "learning_rate": 0.0001, "loss": 9.3432, "loss/crossentropy": 2.173108433187008, "loss/hidden": 4.10859375, "loss/jsd": 0.0, "loss/logits": 0.28518917988985776, "step": 1570 }, { "epoch": 0.158, "grad_norm": 42.0, "grad_norm_var": 47.56015625, "learning_rate": 0.0001, "loss": 9.367, "loss/crossentropy": 2.2230691239237785, "loss/hidden": 4.23515625, "loss/jsd": 0.0, "loss/logits": 0.30039387457072736, "step": 1580 }, { "epoch": 0.159, "grad_norm": 72.0, "grad_norm_var": 1.226104970407838e+18, "learning_rate": 0.0001, "loss": 9.3564, "loss/crossentropy": 2.263391149044037, "loss/hidden": 4.10625, "loss/jsd": 0.0, "loss/logits": 0.2923804897814989, "step": 1590 }, { "epoch": 0.16, "grad_norm": 52.5, "grad_norm_var": 1.2261049681378806e+18, "learning_rate": 0.0001, "loss": 9.4959, "loss/crossentropy": 2.113241518288851, "loss/hidden": 4.087109375, "loss/jsd": 0.0, "loss/logits": 0.2759646028280258, "step": 1600 }, { "epoch": 0.161, "grad_norm": 66.0, "grad_norm_var": 734.1489583333333, "learning_rate": 0.0001, "loss": 9.4743, "loss/crossentropy": 2.3895165085792542, "loss/hidden": 4.059765625, "loss/jsd": 0.0, "loss/logits": 0.2987998936325312, "step": 1610 }, { "epoch": 0.162, "grad_norm": 44.75, "grad_norm_var": 50.00182291666667, "learning_rate": 0.0001, "loss": 9.1919, "loss/crossentropy": 2.251766300201416, "loss/hidden": 4.04375, "loss/jsd": 0.0, "loss/logits": 0.2781111396849155, "step": 1620 }, { "epoch": 0.163, "grad_norm": 52.75, "grad_norm_var": 437.49583333333334, "learning_rate": 0.0001, "loss": 9.4572, "loss/crossentropy": 2.382322034239769, "loss/hidden": 4.07734375, "loss/jsd": 0.0, "loss/logits": 0.31318275928497313, "step": 1630 }, { "epoch": 0.164, "grad_norm": 61.0, "grad_norm_var": 40.301822916666666, "learning_rate": 0.0001, "loss": 9.2668, "loss/crossentropy": 2.1683703124523164, "loss/hidden": 4.07578125, "loss/jsd": 0.0, "loss/logits": 0.283413190767169, "step": 1640 }, { "epoch": 0.165, "grad_norm": 42.75, "grad_norm_var": 57.307291666666664, "learning_rate": 0.0001, "loss": 9.339, "loss/crossentropy": 2.3430400043725967, "loss/hidden": 4.036328125, "loss/jsd": 0.0, "loss/logits": 0.287694800645113, "step": 1650 }, { "epoch": 0.166, "grad_norm": 46.75, "grad_norm_var": 65.52395833333334, "learning_rate": 0.0001, "loss": 9.3768, "loss/crossentropy": 2.2867416352033616, "loss/hidden": 4.017578125, "loss/jsd": 0.0, "loss/logits": 0.29683431759476664, "step": 1660 }, { "epoch": 0.167, "grad_norm": 52.5, "grad_norm_var": 61.18932291666667, "learning_rate": 0.0001, "loss": 9.2451, "loss/crossentropy": 2.3707614041864873, "loss/hidden": 4.06015625, "loss/jsd": 0.0, "loss/logits": 0.29184688804671166, "step": 1670 }, { "epoch": 0.168, "grad_norm": 51.75, "grad_norm_var": 20.895572916666666, "learning_rate": 0.0001, "loss": 9.3601, "loss/crossentropy": 2.3268392831087112, "loss/hidden": 4.12734375, "loss/jsd": 0.0, "loss/logits": 0.29570323824882505, "step": 1680 }, { "epoch": 0.169, "grad_norm": 44.0, "grad_norm_var": 10.290625, "learning_rate": 0.0001, "loss": 9.4214, "loss/crossentropy": 2.324131193757057, "loss/hidden": 4.1984375, "loss/jsd": 0.0, "loss/logits": 0.3133995305746794, "step": 1690 }, { "epoch": 0.17, "grad_norm": 58.25, "grad_norm_var": 19.124739583333334, "learning_rate": 0.0001, "loss": 9.2465, "loss/crossentropy": 2.35849623978138, "loss/hidden": 4.00703125, "loss/jsd": 0.0, "loss/logits": 0.2762619823217392, "step": 1700 }, { "epoch": 0.171, "grad_norm": 45.75, "grad_norm_var": 53.89895833333333, "learning_rate": 0.0001, "loss": 9.1951, "loss/crossentropy": 2.3914038598537446, "loss/hidden": 3.9984375, "loss/jsd": 0.0, "loss/logits": 0.2871177852153778, "step": 1710 }, { "epoch": 0.172, "grad_norm": 43.25, "grad_norm_var": 16.479166666666668, "learning_rate": 0.0001, "loss": 9.1669, "loss/crossentropy": 2.152750685811043, "loss/hidden": 4.100390625, "loss/jsd": 0.0, "loss/logits": 0.28708020225167274, "step": 1720 }, { "epoch": 0.173, "grad_norm": 49.25, "grad_norm_var": 13.45390625, "learning_rate": 0.0001, "loss": 9.1015, "loss/crossentropy": 2.2946193665266037, "loss/hidden": 4.085546875, "loss/jsd": 0.0, "loss/logits": 0.3062314610928297, "step": 1730 }, { "epoch": 0.174, "grad_norm": 46.5, "grad_norm_var": 22.473958333333332, "learning_rate": 0.0001, "loss": 9.1287, "loss/crossentropy": 2.1538643553853034, "loss/hidden": 3.9421875, "loss/jsd": 0.0, "loss/logits": 0.2666194221004844, "step": 1740 }, { "epoch": 0.175, "grad_norm": 47.0, "grad_norm_var": 32.62057291666667, "learning_rate": 0.0001, "loss": 9.411, "loss/crossentropy": 2.387891933321953, "loss/hidden": 4.14921875, "loss/jsd": 0.0, "loss/logits": 0.29542505368590355, "step": 1750 }, { "epoch": 0.176, "grad_norm": 45.25, "grad_norm_var": 26.92265625, "learning_rate": 0.0001, "loss": 9.2833, "loss/crossentropy": 2.3024097591638566, "loss/hidden": 4.03984375, "loss/jsd": 0.0, "loss/logits": 0.29066667445003985, "step": 1760 }, { "epoch": 0.177, "grad_norm": 53.5, "grad_norm_var": 17.832291666666666, "learning_rate": 0.0001, "loss": 9.2665, "loss/crossentropy": 2.4454205125570296, "loss/hidden": 3.955859375, "loss/jsd": 0.0, "loss/logits": 0.2910691563040018, "step": 1770 }, { "epoch": 0.178, "grad_norm": 42.25, "grad_norm_var": 29.865625, "learning_rate": 0.0001, "loss": 9.1701, "loss/crossentropy": 2.2966391056776048, "loss/hidden": 4.027734375, "loss/jsd": 0.0, "loss/logits": 0.2789210833609104, "step": 1780 }, { "epoch": 0.179, "grad_norm": 48.25, "grad_norm_var": 17.548958333333335, "learning_rate": 0.0001, "loss": 9.1992, "loss/crossentropy": 2.395502945780754, "loss/hidden": 3.934765625, "loss/jsd": 0.0, "loss/logits": 0.2776679117232561, "step": 1790 }, { "epoch": 0.18, "grad_norm": 40.75, "grad_norm_var": 13.282291666666667, "learning_rate": 0.0001, "loss": 9.1046, "loss/crossentropy": 2.22285817861557, "loss/hidden": 3.9046875, "loss/jsd": 0.0, "loss/logits": 0.26667180880904195, "step": 1800 }, { "epoch": 0.181, "grad_norm": 36.25, "grad_norm_var": 34.90807291666667, "learning_rate": 0.0001, "loss": 9.3204, "loss/crossentropy": 2.3842350512743, "loss/hidden": 4.009375, "loss/jsd": 0.0, "loss/logits": 0.300260554254055, "step": 1810 }, { "epoch": 0.182, "grad_norm": 46.75, "grad_norm_var": 27.77890625, "learning_rate": 0.0001, "loss": 9.0943, "loss/crossentropy": 2.274762773513794, "loss/hidden": 4.01328125, "loss/jsd": 0.0, "loss/logits": 0.28360783979296683, "step": 1820 }, { "epoch": 0.183, "grad_norm": 55.5, "grad_norm_var": 27.298958333333335, "learning_rate": 0.0001, "loss": 9.1699, "loss/crossentropy": 2.1643219627439976, "loss/hidden": 3.9921875, "loss/jsd": 0.0, "loss/logits": 0.267458438500762, "step": 1830 }, { "epoch": 0.184, "grad_norm": 49.75, "grad_norm_var": 43.94583333333333, "learning_rate": 0.0001, "loss": 9.3022, "loss/crossentropy": 2.464679929614067, "loss/hidden": 3.9546875, "loss/jsd": 0.0, "loss/logits": 0.29758369028568266, "step": 1840 }, { "epoch": 0.185, "grad_norm": 51.0, "grad_norm_var": 37.90807291666667, "learning_rate": 0.0001, "loss": 9.1863, "loss/crossentropy": 2.3199010998010636, "loss/hidden": 3.99453125, "loss/jsd": 0.0, "loss/logits": 0.27702242247760295, "step": 1850 }, { "epoch": 0.186, "grad_norm": 46.5, "grad_norm_var": 40.920833333333334, "learning_rate": 0.0001, "loss": 9.2872, "loss/crossentropy": 2.4041683062911035, "loss/hidden": 4.09140625, "loss/jsd": 0.0, "loss/logits": 0.30005627647042277, "step": 1860 }, { "epoch": 0.187, "grad_norm": 39.75, "grad_norm_var": 40.723958333333336, "learning_rate": 0.0001, "loss": 9.1081, "loss/crossentropy": 2.273802790045738, "loss/hidden": 4.175, "loss/jsd": 0.0, "loss/logits": 0.3045934235677123, "step": 1870 }, { "epoch": 0.188, "grad_norm": 43.25, "grad_norm_var": 33.35729166666667, "learning_rate": 0.0001, "loss": 9.106, "loss/crossentropy": 2.3607766672968866, "loss/hidden": 3.9765625, "loss/jsd": 0.0, "loss/logits": 0.28193066976964476, "step": 1880 }, { "epoch": 0.189, "grad_norm": 48.25, "grad_norm_var": 14.915625, "learning_rate": 0.0001, "loss": 9.1437, "loss/crossentropy": 2.2798361241817475, "loss/hidden": 3.9890625, "loss/jsd": 0.0, "loss/logits": 0.2721746701747179, "step": 1890 }, { "epoch": 0.19, "grad_norm": 44.0, "grad_norm_var": 21.223958333333332, "learning_rate": 0.0001, "loss": 9.0972, "loss/crossentropy": 2.21695294380188, "loss/hidden": 4.0015625, "loss/jsd": 0.0, "loss/logits": 0.2832322970032692, "step": 1900 }, { "epoch": 0.191, "grad_norm": 39.5, "grad_norm_var": 27.808333333333334, "learning_rate": 0.0001, "loss": 9.1587, "loss/crossentropy": 2.1728454776108266, "loss/hidden": 3.98828125, "loss/jsd": 0.0, "loss/logits": 0.27170457877218723, "step": 1910 }, { "epoch": 0.192, "grad_norm": 41.75, "grad_norm_var": 13.315625, "learning_rate": 0.0001, "loss": 9.1326, "loss/crossentropy": 2.154237084835768, "loss/hidden": 4.063671875, "loss/jsd": 0.0, "loss/logits": 0.27950075305998323, "step": 1920 }, { "epoch": 0.193, "grad_norm": 43.0, "grad_norm_var": 25.240625, "learning_rate": 0.0001, "loss": 9.1013, "loss/crossentropy": 2.2507698431611063, "loss/hidden": 4.01171875, "loss/jsd": 0.0, "loss/logits": 0.2808088269084692, "step": 1930 }, { "epoch": 0.194, "grad_norm": 49.25, "grad_norm_var": 22.832291666666666, "learning_rate": 0.0001, "loss": 9.2429, "loss/crossentropy": 2.288056728243828, "loss/hidden": 4.1546875, "loss/jsd": 0.0, "loss/logits": 0.31668607220053674, "step": 1940 }, { "epoch": 0.195, "grad_norm": 48.5, "grad_norm_var": 58.09557291666667, "learning_rate": 0.0001, "loss": 9.1742, "loss/crossentropy": 2.2107961744070055, "loss/hidden": 4.05078125, "loss/jsd": 0.0, "loss/logits": 0.2858551822602749, "step": 1950 }, { "epoch": 0.196, "grad_norm": 39.25, "grad_norm_var": 49.50390625, "learning_rate": 0.0001, "loss": 9.1293, "loss/crossentropy": 2.224529256671667, "loss/hidden": 3.9703125, "loss/jsd": 0.0, "loss/logits": 0.27973891496658326, "step": 1960 }, { "epoch": 0.197, "grad_norm": 39.25, "grad_norm_var": 13.890625, "learning_rate": 0.0001, "loss": 9.0689, "loss/crossentropy": 2.363737019896507, "loss/hidden": 4.027734375, "loss/jsd": 0.0, "loss/logits": 0.2919711694121361, "step": 1970 }, { "epoch": 0.198, "grad_norm": 55.75, "grad_norm_var": 26.655989583333334, "learning_rate": 0.0001, "loss": 9.2228, "loss/crossentropy": 2.3380469545722007, "loss/hidden": 4.0171875, "loss/jsd": 0.0, "loss/logits": 0.28581551983952524, "step": 1980 }, { "epoch": 0.199, "grad_norm": 45.0, "grad_norm_var": 27.357291666666665, "learning_rate": 0.0001, "loss": 9.173, "loss/crossentropy": 2.43135461807251, "loss/hidden": 3.970703125, "loss/jsd": 0.0, "loss/logits": 0.28222124874591825, "step": 1990 }, { "epoch": 0.2, "grad_norm": 44.5, "grad_norm_var": 16.04140625, "learning_rate": 0.0001, "loss": 9.1554, "loss/crossentropy": 2.4415812104940415, "loss/hidden": 4.064453125, "loss/jsd": 0.0, "loss/logits": 0.2984179027378559, "step": 2000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.715020064017613e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }