{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2, "eval_steps": 2000, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001, "grad_norm": 10752.0, "learning_rate": 1.9e-05, "loss": 158.0638, "loss/crossentropy": 14.456178283691406, "loss/hidden": 18.91875, "loss/jsd": 0.0, "loss/logits": 12.539741969108581, "step": 10 }, { "epoch": 0.002, "grad_norm": 3264.0, "grad_norm_var": 13568954.666666666, "learning_rate": 2.8000000000000003e-05, "loss": 129.8883, "loss/crossentropy": 11.943150734901428, "loss/hidden": 19.128125, "loss/jsd": 0.0, "loss/logits": 10.032073307037354, "step": 20 }, { "epoch": 0.003, "grad_norm": 1824.0, "grad_norm_var": 3372859.7333333334, "learning_rate": 3.7e-05, "loss": 100.0245, "loss/crossentropy": 9.159896969795227, "loss/hidden": 18.609375, "loss/jsd": 0.0, "loss/logits": 7.277156031131744, "step": 30 }, { "epoch": 0.004, "grad_norm": 604.0, "grad_norm_var": 331110.3333333333, "learning_rate": 4.600000000000001e-05, "loss": 90.5579, "loss/crossentropy": 8.28059525489807, "loss/hidden": 18.39375, "loss/jsd": 0.0, "loss/logits": 6.247069478034973, "step": 40 }, { "epoch": 0.005, "grad_norm": 1128.0, "grad_norm_var": 60515.2, "learning_rate": 5.500000000000001e-05, "loss": 86.1966, "loss/crossentropy": 8.01256047487259, "loss/hidden": 18.175, "loss/jsd": 0.0, "loss/logits": 6.1038679599761965, "step": 50 }, { "epoch": 0.006, "grad_norm": 1360.0, "grad_norm_var": 67713.86666666667, "learning_rate": 6.400000000000001e-05, "loss": 82.9348, "loss/crossentropy": 7.731317961215973, "loss/hidden": 17.959375, "loss/jsd": 0.0, "loss/logits": 5.726186037063599, "step": 60 }, { "epoch": 0.007, "grad_norm": 1016.0, "grad_norm_var": 35902.933333333334, "learning_rate": 7.3e-05, "loss": 78.6625, "loss/crossentropy": 7.318132603168488, "loss/hidden": 17.8375, "loss/jsd": 0.0, "loss/logits": 5.322234338521957, "step": 70 }, { "epoch": 0.008, "grad_norm": 836.0, "grad_norm_var": 12856.466666666667, "learning_rate": 8.200000000000001e-05, "loss": 74.6, "loss/crossentropy": 6.863537752628327, "loss/hidden": 17.325, "loss/jsd": 0.0, "loss/logits": 4.851147556304932, "step": 80 }, { "epoch": 0.009, "grad_norm": 1168.0, "grad_norm_var": 38569.0, "learning_rate": 9.1e-05, "loss": 69.2648, "loss/crossentropy": 6.536011290550232, "loss/hidden": 16.871875, "loss/jsd": 0.0, "loss/logits": 4.729572284221649, "step": 90 }, { "epoch": 0.01, "grad_norm": 956.0, "grad_norm_var": 54132.26666666667, "learning_rate": 0.0001, "loss": 61.5492, "loss/crossentropy": 5.978731215000153, "loss/hidden": 15.9046875, "loss/jsd": 0.0, "loss/logits": 4.037681633234024, "step": 100 }, { "epoch": 0.011, "grad_norm": 494.0, "grad_norm_var": 60329.066666666666, "learning_rate": 0.0001, "loss": 50.5696, "loss/crossentropy": 5.069290089607239, "loss/hidden": 13.9625, "loss/jsd": 0.0, "loss/logits": 3.04628010392189, "step": 110 }, { "epoch": 0.012, "grad_norm": 242.0, "grad_norm_var": 33342.59583333333, "learning_rate": 0.0001, "loss": 38.8513, "loss/crossentropy": 4.116593188047409, "loss/hidden": 12.21875, "loss/jsd": 0.0, "loss/logits": 2.207608225941658, "step": 120 }, { "epoch": 0.013, "grad_norm": 189.0, "grad_norm_var": 2268.9625, "learning_rate": 0.0001, "loss": 30.2934, "loss/crossentropy": 3.6065172433853148, "loss/hidden": 10.4703125, "loss/jsd": 0.0, "loss/logits": 1.553831559419632, "step": 130 }, { "epoch": 0.014, "grad_norm": 129.0, "grad_norm_var": 428.78333333333336, "learning_rate": 0.0001, "loss": 25.4075, "loss/crossentropy": 3.238997083902359, "loss/hidden": 9.36875, "loss/jsd": 0.0, "loss/logits": 1.2455815717577934, "step": 140 }, { "epoch": 0.015, "grad_norm": 147.0, "grad_norm_var": 884.8666666666667, "learning_rate": 0.0001, "loss": 21.889, "loss/crossentropy": 3.104075390100479, "loss/hidden": 8.19296875, "loss/jsd": 0.0, "loss/logits": 0.981781056523323, "step": 150 }, { "epoch": 0.016, "grad_norm": 242.0, "grad_norm_var": 1127.890625, "learning_rate": 0.0001, "loss": 19.3636, "loss/crossentropy": 2.6487351998686792, "loss/hidden": 7.96328125, "loss/jsd": 0.0, "loss/logits": 0.862408060580492, "step": 160 }, { "epoch": 0.017, "grad_norm": 139.0, "grad_norm_var": 1720.690625, "learning_rate": 0.0001, "loss": 17.9103, "loss/crossentropy": 2.944036450982094, "loss/hidden": 7.28671875, "loss/jsd": 0.0, "loss/logits": 0.7954695858061314, "step": 170 }, { "epoch": 0.018, "grad_norm": 127.0, "grad_norm_var": 1522.8958333333333, "learning_rate": 0.0001, "loss": 17.1787, "loss/crossentropy": 2.7259451180696486, "loss/hidden": 7.03046875, "loss/jsd": 0.0, "loss/logits": 0.7603268466889859, "step": 180 }, { "epoch": 0.019, "grad_norm": 155.0, "grad_norm_var": 1390.2666666666667, "learning_rate": 0.0001, "loss": 16.3546, "loss/crossentropy": 2.745239295065403, "loss/hidden": 6.74765625, "loss/jsd": 0.0, "loss/logits": 0.6926519803702831, "step": 190 }, { "epoch": 0.02, "grad_norm": 164.0, "grad_norm_var": 902.2666666666667, "learning_rate": 0.0001, "loss": 15.7972, "loss/crossentropy": 2.6587735950946807, "loss/hidden": 6.6234375, "loss/jsd": 0.0, "loss/logits": 0.6642795346677304, "step": 200 }, { "epoch": 0.021, "grad_norm": 173.0, "grad_norm_var": 1056.5166666666667, "learning_rate": 0.0001, "loss": 15.4154, "loss/crossentropy": 2.67086471170187, "loss/hidden": 6.30390625, "loss/jsd": 0.0, "loss/logits": 0.6120679222047329, "step": 210 }, { "epoch": 0.022, "grad_norm": 168.0, "grad_norm_var": 446.2291666666667, "learning_rate": 0.0001, "loss": 14.9164, "loss/crossentropy": 2.8284773945808412, "loss/hidden": 6.15546875, "loss/jsd": 0.0, "loss/logits": 0.6234366297721863, "step": 220 }, { "epoch": 0.023, "grad_norm": 187.0, "grad_norm_var": 7334.5625, "learning_rate": 0.0001, "loss": 14.9531, "loss/crossentropy": 2.716707041859627, "loss/hidden": 6.196875, "loss/jsd": 0.0, "loss/logits": 0.6206937313079834, "step": 230 }, { "epoch": 0.024, "grad_norm": 172.0, "grad_norm_var": 6329.6625, "learning_rate": 0.0001, "loss": 14.4769, "loss/crossentropy": 2.4854482382535936, "loss/hidden": 6.05859375, "loss/jsd": 0.0, "loss/logits": 0.5394440380856395, "step": 240 }, { "epoch": 0.025, "grad_norm": 1149239296.0, "grad_norm_var": 8.254690576187568e+16, "learning_rate": 0.0001, "loss": 14.4045, "loss/crossentropy": 2.717127138376236, "loss/hidden": 5.9890625, "loss/jsd": 0.0, "loss/logits": 0.5883205510675907, "step": 250 }, { "epoch": 0.026, "grad_norm": 149.0, "grad_norm_var": 8.254691009067347e+16, "learning_rate": 0.0001, "loss": 13.9101, "loss/crossentropy": 2.478851719200611, "loss/hidden": 5.8765625, "loss/jsd": 0.0, "loss/logits": 0.5184394292533397, "step": 260 }, { "epoch": 0.027, "grad_norm": 186.0, "grad_norm_var": 930.0625, "learning_rate": 0.0001, "loss": 13.6027, "loss/crossentropy": 2.553143638372421, "loss/hidden": 5.75234375, "loss/jsd": 0.0, "loss/logits": 0.5368255846202373, "step": 270 }, { "epoch": 0.028, "grad_norm": 172.0, "grad_norm_var": 3805.0666666666666, "learning_rate": 0.0001, "loss": 13.9542, "loss/crossentropy": 2.7657821238040925, "loss/hidden": 5.8421875, "loss/jsd": 0.0, "loss/logits": 0.5587639883160591, "step": 280 }, { "epoch": 0.029, "grad_norm": 135.0, "grad_norm_var": 3837.616666666667, "learning_rate": 0.0001, "loss": 13.3979, "loss/crossentropy": 2.4579825714230537, "loss/hidden": 5.62421875, "loss/jsd": 0.0, "loss/logits": 0.5003356814384461, "step": 290 }, { "epoch": 0.03, "grad_norm": 119.5, "grad_norm_var": 4336.095833333334, "learning_rate": 0.0001, "loss": 13.372, "loss/crossentropy": 2.4825384080410005, "loss/hidden": 5.77734375, "loss/jsd": 0.0, "loss/logits": 0.5297574065625668, "step": 300 }, { "epoch": 0.031, "grad_norm": 144.0, "grad_norm_var": 2114.4333333333334, "learning_rate": 0.0001, "loss": 13.2199, "loss/crossentropy": 2.6365180641412733, "loss/hidden": 5.5484375, "loss/jsd": 0.0, "loss/logits": 0.5377178646624088, "step": 310 }, { "epoch": 0.032, "grad_norm": 126.5, "grad_norm_var": 885.75, "learning_rate": 0.0001, "loss": 13.022, "loss/crossentropy": 2.41667592599988, "loss/hidden": 5.5296875, "loss/jsd": 0.0, "loss/logits": 0.4934091318398714, "step": 320 }, { "epoch": 0.033, "grad_norm": 113.0, "grad_norm_var": 4216.623958333334, "learning_rate": 0.0001, "loss": 12.6825, "loss/crossentropy": 2.6186458706855773, "loss/hidden": 5.3796875, "loss/jsd": 0.0, "loss/logits": 0.48778619766235354, "step": 330 }, { "epoch": 0.034, "grad_norm": 154.0, "grad_norm_var": 637.090625, "learning_rate": 0.0001, "loss": 12.6415, "loss/crossentropy": 2.6686057686805724, "loss/hidden": 5.384375, "loss/jsd": 0.0, "loss/logits": 0.4940062865614891, "step": 340 }, { "epoch": 0.035, "grad_norm": 144.0, "grad_norm_var": 2633.765625, "learning_rate": 0.0001, "loss": 12.6064, "loss/crossentropy": 2.52793410718441, "loss/hidden": 5.21171875, "loss/jsd": 0.0, "loss/logits": 0.45680325478315353, "step": 350 }, { "epoch": 0.036, "grad_norm": 141.0, "grad_norm_var": 2513.148958333333, "learning_rate": 0.0001, "loss": 12.508, "loss/crossentropy": 2.445630243420601, "loss/hidden": 5.31171875, "loss/jsd": 0.0, "loss/logits": 0.4673466898500919, "step": 360 }, { "epoch": 0.037, "grad_norm": 146.0, "grad_norm_var": 161.95729166666666, "learning_rate": 0.0001, "loss": 12.3383, "loss/crossentropy": 2.432392257452011, "loss/hidden": 5.2109375, "loss/jsd": 0.0, "loss/logits": 0.4600852273404598, "step": 370 }, { "epoch": 0.038, "grad_norm": 122.5, "grad_norm_var": 1555.340625, "learning_rate": 0.0001, "loss": 12.2486, "loss/crossentropy": 2.448658475279808, "loss/hidden": 5.29765625, "loss/jsd": 0.0, "loss/logits": 0.47797103337943553, "step": 380 }, { "epoch": 0.039, "grad_norm": 110.5, "grad_norm_var": 159.92916666666667, "learning_rate": 0.0001, "loss": 11.9006, "loss/crossentropy": 2.4291503965854644, "loss/hidden": 5.01328125, "loss/jsd": 0.0, "loss/logits": 0.43006020598113537, "step": 390 }, { "epoch": 0.04, "grad_norm": 136.0, "grad_norm_var": 175.37395833333332, "learning_rate": 0.0001, "loss": 11.9938, "loss/crossentropy": 2.604290932416916, "loss/hidden": 4.9828125, "loss/jsd": 0.0, "loss/logits": 0.4612982179969549, "step": 400 }, { "epoch": 0.041, "grad_norm": 109.0, "grad_norm_var": 170.09583333333333, "learning_rate": 0.0001, "loss": 11.8251, "loss/crossentropy": 2.3994911506772043, "loss/hidden": 5.03984375, "loss/jsd": 0.0, "loss/logits": 0.4143600896000862, "step": 410 }, { "epoch": 0.042, "grad_norm": 122.0, "grad_norm_var": 150.45729166666666, "learning_rate": 0.0001, "loss": 11.6797, "loss/crossentropy": 2.428033410012722, "loss/hidden": 4.96171875, "loss/jsd": 0.0, "loss/logits": 0.41778192222118377, "step": 420 }, { "epoch": 0.043, "grad_norm": 119.0, "grad_norm_var": 125.55729166666667, "learning_rate": 0.0001, "loss": 11.7055, "loss/crossentropy": 2.569334480166435, "loss/hidden": 4.9921875, "loss/jsd": 0.0, "loss/logits": 0.4176106728613377, "step": 430 }, { "epoch": 0.044, "grad_norm": 120.0, "grad_norm_var": 186.67395833333333, "learning_rate": 0.0001, "loss": 11.5608, "loss/crossentropy": 2.5353519901633264, "loss/hidden": 4.82578125, "loss/jsd": 0.0, "loss/logits": 0.4004150029271841, "step": 440 }, { "epoch": 0.045, "grad_norm": 111.5, "grad_norm_var": 157.52395833333333, "learning_rate": 0.0001, "loss": 11.6926, "loss/crossentropy": 2.539342051744461, "loss/hidden": 4.9390625, "loss/jsd": 0.0, "loss/logits": 0.4505396105349064, "step": 450 }, { "epoch": 0.046, "grad_norm": 126.0, "grad_norm_var": 329.32916666666665, "learning_rate": 0.0001, "loss": 11.3179, "loss/crossentropy": 2.4947912380099297, "loss/hidden": 4.70703125, "loss/jsd": 0.0, "loss/logits": 0.39311613626778125, "step": 460 }, { "epoch": 0.047, "grad_norm": 130.0, "grad_norm_var": 482.1958333333333, "learning_rate": 0.0001, "loss": 11.2995, "loss/crossentropy": 2.522867926955223, "loss/hidden": 4.778125, "loss/jsd": 0.0, "loss/logits": 0.39878650680184363, "step": 470 }, { "epoch": 0.048, "grad_norm": 114.5, "grad_norm_var": 159.2, "learning_rate": 0.0001, "loss": 11.1298, "loss/crossentropy": 2.503119890391827, "loss/hidden": 4.6859375, "loss/jsd": 0.0, "loss/logits": 0.4145892545580864, "step": 480 }, { "epoch": 0.049, "grad_norm": 123.5, "grad_norm_var": 2113.4625, "learning_rate": 0.0001, "loss": 11.0383, "loss/crossentropy": 2.4039885073900225, "loss/hidden": 4.658203125, "loss/jsd": 0.0, "loss/logits": 0.37959295585751535, "step": 490 }, { "epoch": 0.05, "grad_norm": 110.0, "grad_norm_var": 1545.5291666666667, "learning_rate": 0.0001, "loss": 10.9564, "loss/crossentropy": 2.3160028889775277, "loss/hidden": 4.78359375, "loss/jsd": 0.0, "loss/logits": 0.4041217315942049, "step": 500 }, { "epoch": 0.051, "grad_norm": 118.5, "grad_norm_var": 1485.1572916666667, "learning_rate": 0.0001, "loss": 11.0273, "loss/crossentropy": 2.3481629095971583, "loss/hidden": 4.76796875, "loss/jsd": 0.0, "loss/logits": 0.388704277202487, "step": 510 }, { "epoch": 0.052, "grad_norm": 302.0, "grad_norm_var": 4060.695833333333, "learning_rate": 0.0001, "loss": 10.8826, "loss/crossentropy": 2.432570169866085, "loss/hidden": 4.647265625, "loss/jsd": 0.0, "loss/logits": 0.4005543690174818, "step": 520 }, { "epoch": 0.053, "grad_norm": 262.0, "grad_norm_var": 5144.929166666667, "learning_rate": 0.0001, "loss": 10.9255, "loss/crossentropy": 2.4078257739543916, "loss/hidden": 4.51953125, "loss/jsd": 0.0, "loss/logits": 0.3619723778218031, "step": 530 }, { "epoch": 0.054, "grad_norm": 111.5, "grad_norm_var": 3058.195833333333, "learning_rate": 0.0001, "loss": 10.8513, "loss/crossentropy": 2.1905623614788055, "loss/hidden": 4.54921875, "loss/jsd": 0.0, "loss/logits": 0.3489991918206215, "step": 540 }, { "epoch": 0.055, "grad_norm": 98.0, "grad_norm_var": 2313.990625, "learning_rate": 0.0001, "loss": 10.8386, "loss/crossentropy": 2.4719990983605387, "loss/hidden": 4.63984375, "loss/jsd": 0.0, "loss/logits": 0.4116944268345833, "step": 550 }, { "epoch": 0.056, "grad_norm": 105.0, "grad_norm_var": 1808.315625, "learning_rate": 0.0001, "loss": 10.7797, "loss/crossentropy": 2.381363682448864, "loss/hidden": 4.58828125, "loss/jsd": 0.0, "loss/logits": 0.38398357704281805, "step": 560 }, { "epoch": 0.057, "grad_norm": 206.0, "grad_norm_var": 1395.3, "learning_rate": 0.0001, "loss": 10.6643, "loss/crossentropy": 2.531977267563343, "loss/hidden": 4.6171875, "loss/jsd": 0.0, "loss/logits": 0.37524734511971475, "step": 570 }, { "epoch": 0.058, "grad_norm": 150.0, "grad_norm_var": 1246.6333333333334, "learning_rate": 0.0001, "loss": 10.5081, "loss/crossentropy": 2.391422814875841, "loss/hidden": 4.46484375, "loss/jsd": 0.0, "loss/logits": 0.3587542846798897, "step": 580 }, { "epoch": 0.059, "grad_norm": 110.5, "grad_norm_var": 678.5333333333333, "learning_rate": 0.0001, "loss": 10.4338, "loss/crossentropy": 2.267008524388075, "loss/hidden": 4.356640625, "loss/jsd": 0.0, "loss/logits": 0.3174692545086145, "step": 590 }, { "epoch": 0.06, "grad_norm": 135.0, "grad_norm_var": 914.0489583333333, "learning_rate": 0.0001, "loss": 10.5236, "loss/crossentropy": 2.3517861902713775, "loss/hidden": 4.42265625, "loss/jsd": 0.0, "loss/logits": 0.3542962525039911, "step": 600 }, { "epoch": 0.061, "grad_norm": 103.0, "grad_norm_var": 904.1989583333333, "learning_rate": 0.0001, "loss": 10.345, "loss/crossentropy": 2.3741147622466086, "loss/hidden": 4.4171875, "loss/jsd": 0.0, "loss/logits": 0.3606201378628612, "step": 610 }, { "epoch": 0.062, "grad_norm": 86.0, "grad_norm_var": 624.25, "learning_rate": 0.0001, "loss": 10.4494, "loss/crossentropy": 2.3786921083927153, "loss/hidden": 4.291796875, "loss/jsd": 0.0, "loss/logits": 0.33345147483050824, "step": 620 }, { "epoch": 0.063, "grad_norm": 109.0, "grad_norm_var": 580.0333333333333, "learning_rate": 0.0001, "loss": 10.1494, "loss/crossentropy": 2.3835427895188332, "loss/hidden": 4.328515625, "loss/jsd": 0.0, "loss/logits": 0.33732542097568513, "step": 630 }, { "epoch": 0.064, "grad_norm": 106.5, "grad_norm_var": 407.5625, "learning_rate": 0.0001, "loss": 10.334, "loss/crossentropy": 2.3970961540937425, "loss/hidden": 4.486328125, "loss/jsd": 0.0, "loss/logits": 0.3739761531352997, "step": 640 }, { "epoch": 0.065, "grad_norm": 127.0, "grad_norm_var": 8.827054751968406e+17, "learning_rate": 0.0001, "loss": 10.3909, "loss/crossentropy": 2.603018820285797, "loss/hidden": 4.336328125, "loss/jsd": 0.0, "loss/logits": 0.35302893407642844, "step": 650 }, { "epoch": 0.066, "grad_norm": 103.0, "grad_norm_var": 8.827054748993245e+17, "learning_rate": 0.0001, "loss": 10.3448, "loss/crossentropy": 2.209125077724457, "loss/hidden": 4.298046875, "loss/jsd": 0.0, "loss/logits": 0.3420632269233465, "step": 660 }, { "epoch": 0.067, "grad_norm": 111.5, "grad_norm_var": 190.75, "learning_rate": 0.0001, "loss": 10.1599, "loss/crossentropy": 2.1904555816203355, "loss/hidden": 4.413671875, "loss/jsd": 0.0, "loss/logits": 0.3357353564351797, "step": 670 }, { "epoch": 0.068, "grad_norm": 88.5, "grad_norm_var": 215.29895833333333, "learning_rate": 0.0001, "loss": 9.9371, "loss/crossentropy": 2.3618984460830688, "loss/hidden": 4.186328125, "loss/jsd": 0.0, "loss/logits": 0.33678749240934847, "step": 680 }, { "epoch": 0.069, "grad_norm": 95.0, "grad_norm_var": 228.140625, "learning_rate": 0.0001, "loss": 10.0861, "loss/crossentropy": 2.372377243638039, "loss/hidden": 4.2109375, "loss/jsd": 0.0, "loss/logits": 0.3243491280823946, "step": 690 }, { "epoch": 0.07, "grad_norm": 84.0, "grad_norm_var": 520.1666666666666, "learning_rate": 0.0001, "loss": 10.2116, "loss/crossentropy": 2.235209721326828, "loss/hidden": 4.303515625, "loss/jsd": 0.0, "loss/logits": 0.34188132397830484, "step": 700 }, { "epoch": 0.071, "grad_norm": 103.0, "grad_norm_var": 553.9291666666667, "learning_rate": 0.0001, "loss": 9.9575, "loss/crossentropy": 2.3372152552008627, "loss/hidden": 4.1390625, "loss/jsd": 0.0, "loss/logits": 0.3105729196220636, "step": 710 }, { "epoch": 0.072, "grad_norm": 107.0, "grad_norm_var": 538.540625, "learning_rate": 0.0001, "loss": 9.978, "loss/crossentropy": 2.510573136806488, "loss/hidden": 4.14453125, "loss/jsd": 0.0, "loss/logits": 0.3471809647977352, "step": 720 }, { "epoch": 0.073, "grad_norm": 139.0, "grad_norm_var": 493.49583333333334, "learning_rate": 0.0001, "loss": 9.9677, "loss/crossentropy": 2.3755437433719635, "loss/hidden": 4.123828125, "loss/jsd": 0.0, "loss/logits": 0.3338810380548239, "step": 730 }, { "epoch": 0.074, "grad_norm": 99.0, "grad_norm_var": 286.8625, "learning_rate": 0.0001, "loss": 9.8714, "loss/crossentropy": 2.3226330026984217, "loss/hidden": 4.11015625, "loss/jsd": 0.0, "loss/logits": 0.32580162063241, "step": 740 }, { "epoch": 0.075, "grad_norm": 85.5, "grad_norm_var": 425.8625, "learning_rate": 0.0001, "loss": 9.7891, "loss/crossentropy": 2.3768628584221005, "loss/hidden": 4.124609375, "loss/jsd": 0.0, "loss/logits": 0.31062583327293397, "step": 750 }, { "epoch": 0.076, "grad_norm": 120.0, "grad_norm_var": 373.765625, "learning_rate": 0.0001, "loss": 9.8455, "loss/crossentropy": 2.4248126417398455, "loss/hidden": 4.2359375, "loss/jsd": 0.0, "loss/logits": 0.3379279874265194, "step": 760 }, { "epoch": 0.077, "grad_norm": 115.5, "grad_norm_var": 366.765625, "learning_rate": 0.0001, "loss": 9.7894, "loss/crossentropy": 2.2128719061613085, "loss/hidden": 4.18515625, "loss/jsd": 0.0, "loss/logits": 0.3335044614970684, "step": 770 }, { "epoch": 0.078, "grad_norm": 82.0, "grad_norm_var": 207.05, "learning_rate": 0.0001, "loss": 9.7323, "loss/crossentropy": 2.321111184358597, "loss/hidden": 4.112109375, "loss/jsd": 0.0, "loss/logits": 0.30921670254319905, "step": 780 }, { "epoch": 0.079, "grad_norm": 90.0, "grad_norm_var": 321.65729166666665, "learning_rate": 0.0001, "loss": 9.7419, "loss/crossentropy": 2.3887290723621843, "loss/hidden": 4.17421875, "loss/jsd": 0.0, "loss/logits": 0.34963752441108226, "step": 790 }, { "epoch": 0.08, "grad_norm": 90.0, "grad_norm_var": 1653.9958333333334, "learning_rate": 0.0001, "loss": 9.6443, "loss/crossentropy": 2.34355805516243, "loss/hidden": 4.119140625, "loss/jsd": 0.0, "loss/logits": 0.3216205321252346, "step": 800 }, { "epoch": 0.081, "grad_norm": 111.0, "grad_norm_var": 1760.865625, "learning_rate": 0.0001, "loss": 9.7151, "loss/crossentropy": 2.26568204164505, "loss/hidden": 4.0734375, "loss/jsd": 0.0, "loss/logits": 0.3119744971394539, "step": 810 }, { "epoch": 0.082, "grad_norm": 100.0, "grad_norm_var": 365.0, "learning_rate": 0.0001, "loss": 9.6335, "loss/crossentropy": 2.363439542800188, "loss/hidden": 4.0421875, "loss/jsd": 0.0, "loss/logits": 0.3196489207446575, "step": 820 }, { "epoch": 0.083, "grad_norm": 105.0, "grad_norm_var": 725.840625, "learning_rate": 0.0001, "loss": 9.5683, "loss/crossentropy": 2.25376470759511, "loss/hidden": 4.040625, "loss/jsd": 0.0, "loss/logits": 0.3137321826070547, "step": 830 }, { "epoch": 0.084, "grad_norm": 91.0, "grad_norm_var": 243.115625, "learning_rate": 0.0001, "loss": 9.6059, "loss/crossentropy": 2.402809253334999, "loss/hidden": 4.08359375, "loss/jsd": 0.0, "loss/logits": 0.3079391553997993, "step": 840 }, { "epoch": 0.085, "grad_norm": 115.5, "grad_norm_var": 52.3625, "learning_rate": 0.0001, "loss": 9.4809, "loss/crossentropy": 2.3521162420511246, "loss/hidden": 3.929296875, "loss/jsd": 0.0, "loss/logits": 0.3063440557569265, "step": 850 }, { "epoch": 0.086, "grad_norm": 91.0, "grad_norm_var": 109.71666666666667, "learning_rate": 0.0001, "loss": 9.6562, "loss/crossentropy": 2.443948082625866, "loss/hidden": 4.025390625, "loss/jsd": 0.0, "loss/logits": 0.33005591817200186, "step": 860 }, { "epoch": 0.087, "grad_norm": 99.0, "grad_norm_var": 8.906043697083199e+17, "learning_rate": 0.0001, "loss": 9.6756, "loss/crossentropy": 2.2569786101579665, "loss/hidden": 4.169140625, "loss/jsd": 0.0, "loss/logits": 0.32912670746445655, "step": 870 }, { "epoch": 0.088, "grad_norm": 87.5, "grad_norm_var": 8.90604369488119e+17, "learning_rate": 0.0001, "loss": 9.6822, "loss/crossentropy": 2.542811484634876, "loss/hidden": 3.961328125, "loss/jsd": 0.0, "loss/logits": 0.3259673956781626, "step": 880 }, { "epoch": 0.089, "grad_norm": 117.0, "grad_norm_var": 227.42395833333333, "learning_rate": 0.0001, "loss": 9.44, "loss/crossentropy": 2.3939336955547335, "loss/hidden": 3.878515625, "loss/jsd": 0.0, "loss/logits": 0.29817260801792145, "step": 890 }, { "epoch": 0.09, "grad_norm": 79.0, "grad_norm_var": 200.97395833333334, "learning_rate": 0.0001, "loss": 9.3573, "loss/crossentropy": 2.496935114264488, "loss/hidden": 4.0015625, "loss/jsd": 0.0, "loss/logits": 0.3248747974634171, "step": 900 }, { "epoch": 0.091, "grad_norm": 97.5, "grad_norm_var": 517.7, "learning_rate": 0.0001, "loss": 9.4559, "loss/crossentropy": 2.245865948498249, "loss/hidden": 3.951953125, "loss/jsd": 0.0, "loss/logits": 0.30880712568759916, "step": 910 }, { "epoch": 0.092, "grad_norm": 93.0, "grad_norm_var": 475.07395833333334, "learning_rate": 0.0001, "loss": 9.3572, "loss/crossentropy": 2.3004986569285393, "loss/hidden": 3.912890625, "loss/jsd": 0.0, "loss/logits": 0.2959143763408065, "step": 920 }, { "epoch": 0.093, "grad_norm": 94.5, "grad_norm_var": 139.9, "learning_rate": 0.0001, "loss": 9.461, "loss/crossentropy": 2.360969065129757, "loss/hidden": 3.9828125, "loss/jsd": 0.0, "loss/logits": 0.3106645856052637, "step": 930 }, { "epoch": 0.094, "grad_norm": 102.5, "grad_norm_var": 82.290625, "learning_rate": 0.0001, "loss": 9.3725, "loss/crossentropy": 2.442077124118805, "loss/hidden": 3.887109375, "loss/jsd": 0.0, "loss/logits": 0.30320504680275917, "step": 940 }, { "epoch": 0.095, "grad_norm": 81.0, "grad_norm_var": 283.8989583333333, "learning_rate": 0.0001, "loss": 9.215, "loss/crossentropy": 2.2990706115961075, "loss/hidden": 3.908203125, "loss/jsd": 0.0, "loss/logits": 0.2945917289704084, "step": 950 }, { "epoch": 0.096, "grad_norm": 85.5, "grad_norm_var": 935.5291666666667, "learning_rate": 0.0001, "loss": 9.3148, "loss/crossentropy": 2.405318558216095, "loss/hidden": 3.8734375, "loss/jsd": 0.0, "loss/logits": 0.29377752766013143, "step": 960 }, { "epoch": 0.097, "grad_norm": 90.5, "grad_norm_var": 745.3291666666667, "learning_rate": 0.0001, "loss": 9.2675, "loss/crossentropy": 2.313190388679504, "loss/hidden": 3.908203125, "loss/jsd": 0.0, "loss/logits": 0.3074024930596352, "step": 970 }, { "epoch": 0.098, "grad_norm": 91.5, "grad_norm_var": 74.38333333333334, "learning_rate": 0.0001, "loss": 9.3473, "loss/crossentropy": 2.4643412232398987, "loss/hidden": 3.9109375, "loss/jsd": 0.0, "loss/logits": 0.31328765451908114, "step": 980 }, { "epoch": 0.099, "grad_norm": 83.0, "grad_norm_var": 77.24895833333333, "learning_rate": 0.0001, "loss": 9.1591, "loss/crossentropy": 2.3321994699537756, "loss/hidden": 3.794140625, "loss/jsd": 0.0, "loss/logits": 0.2842238027602434, "step": 990 }, { "epoch": 0.1, "grad_norm": 2919235584.0, "grad_norm_var": 5.3262099304352365e+17, "learning_rate": 0.0001, "loss": 9.2499, "loss/crossentropy": 2.24974425137043, "loss/hidden": 3.69921875, "loss/jsd": 0.0, "loss/logits": 0.2656703107059002, "step": 1000 }, { "epoch": 0.101, "grad_norm": 83.0, "grad_norm_var": 5.3262099137712704e+17, "learning_rate": 0.0001, "loss": 9.1036, "loss/crossentropy": 2.248470115661621, "loss/hidden": 3.834375, "loss/jsd": 0.0, "loss/logits": 0.28389163631945846, "step": 1010 }, { "epoch": 0.102, "grad_norm": 99.5, "grad_norm_var": 260.3958333333333, "learning_rate": 0.0001, "loss": 9.1529, "loss/crossentropy": 2.177551028132439, "loss/hidden": 3.85625, "loss/jsd": 0.0, "loss/logits": 0.2901096811518073, "step": 1020 }, { "epoch": 0.103, "grad_norm": 107.0, "grad_norm_var": 126.18333333333334, "learning_rate": 0.0001, "loss": 9.2276, "loss/crossentropy": 2.4588360369205473, "loss/hidden": 3.81953125, "loss/jsd": 0.0, "loss/logits": 0.3054195210337639, "step": 1030 }, { "epoch": 0.104, "grad_norm": 92.5, "grad_norm_var": 773.6822916666666, "learning_rate": 0.0001, "loss": 9.2522, "loss/crossentropy": 2.36704108864069, "loss/hidden": 3.98984375, "loss/jsd": 0.0, "loss/logits": 0.32540309652686117, "step": 1040 }, { "epoch": 0.105, "grad_norm": 94.0, "grad_norm_var": 747.9958333333333, "learning_rate": 0.0001, "loss": 9.1546, "loss/crossentropy": 2.2803470581769942, "loss/hidden": 3.805078125, "loss/jsd": 0.0, "loss/logits": 0.3206649195402861, "step": 1050 }, { "epoch": 0.106, "grad_norm": 74.0, "grad_norm_var": 118.565625, "learning_rate": 0.0001, "loss": 9.1738, "loss/crossentropy": 2.468463772535324, "loss/hidden": 3.775390625, "loss/jsd": 0.0, "loss/logits": 0.3029760651290417, "step": 1060 }, { "epoch": 0.107, "grad_norm": 76.0, "grad_norm_var": 112.42395833333333, "learning_rate": 0.0001, "loss": 9.0442, "loss/crossentropy": 2.3093275628983974, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.30387087166309357, "step": 1070 }, { "epoch": 0.108, "grad_norm": 92.5, "grad_norm_var": 47.71666666666667, "learning_rate": 0.0001, "loss": 9.0691, "loss/crossentropy": 2.3587117075920103, "loss/hidden": 3.7921875, "loss/jsd": 0.0, "loss/logits": 0.2959397092461586, "step": 1080 }, { "epoch": 0.109, "grad_norm": 82.0, "grad_norm_var": 82.57395833333334, "learning_rate": 0.0001, "loss": 9.0681, "loss/crossentropy": 2.3668819189071657, "loss/hidden": 3.873828125, "loss/jsd": 0.0, "loss/logits": 0.30734706819057467, "step": 1090 }, { "epoch": 0.11, "grad_norm": 98.0, "grad_norm_var": 130.02916666666667, "learning_rate": 0.0001, "loss": 9.1895, "loss/crossentropy": 2.4498503282666206, "loss/hidden": 3.838671875, "loss/jsd": 0.0, "loss/logits": 0.30784521605819465, "step": 1100 }, { "epoch": 0.111, "grad_norm": 88.5, "grad_norm_var": 92.05729166666667, "learning_rate": 0.0001, "loss": 9.2165, "loss/crossentropy": 2.37082399725914, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.29330057725310327, "step": 1110 }, { "epoch": 0.112, "grad_norm": 89.5, "grad_norm_var": 160.39895833333333, "learning_rate": 0.0001, "loss": 9.0963, "loss/crossentropy": 2.245619586110115, "loss/hidden": 3.839453125, "loss/jsd": 0.0, "loss/logits": 0.3090781785547733, "step": 1120 }, { "epoch": 0.113, "grad_norm": 89.0, "grad_norm_var": 153.57395833333334, "learning_rate": 0.0001, "loss": 9.1747, "loss/crossentropy": 2.254079730808735, "loss/hidden": 3.863671875, "loss/jsd": 0.0, "loss/logits": 0.29664100557565687, "step": 1130 }, { "epoch": 0.114, "grad_norm": 85.5, "grad_norm_var": 177.3625, "learning_rate": 0.0001, "loss": 8.9365, "loss/crossentropy": 2.3813750982284545, "loss/hidden": 3.841796875, "loss/jsd": 0.0, "loss/logits": 0.2955601759254932, "step": 1140 }, { "epoch": 0.115, "grad_norm": 97.0, "grad_norm_var": 177.75, "learning_rate": 0.0001, "loss": 9.0288, "loss/crossentropy": 2.317107746005058, "loss/hidden": 3.7171875, "loss/jsd": 0.0, "loss/logits": 0.2740287099033594, "step": 1150 }, { "epoch": 0.116, "grad_norm": 84.0, "grad_norm_var": 192.15, "learning_rate": 0.0001, "loss": 8.9149, "loss/crossentropy": 2.2348272860050202, "loss/hidden": 3.748828125, "loss/jsd": 0.0, "loss/logits": 0.26385229676961897, "step": 1160 }, { "epoch": 0.117, "grad_norm": 78.0, "grad_norm_var": 139.8625, "learning_rate": 0.0001, "loss": 8.9416, "loss/crossentropy": 2.186076807975769, "loss/hidden": 3.68046875, "loss/jsd": 0.0, "loss/logits": 0.2610600605607033, "step": 1170 }, { "epoch": 0.118, "grad_norm": 80.5, "grad_norm_var": 175.85, "learning_rate": 0.0001, "loss": 8.9542, "loss/crossentropy": 2.258153685927391, "loss/hidden": 3.740234375, "loss/jsd": 0.0, "loss/logits": 0.27120565343648195, "step": 1180 }, { "epoch": 0.119, "grad_norm": 79.0, "grad_norm_var": 164.89583333333334, "learning_rate": 0.0001, "loss": 8.8167, "loss/crossentropy": 2.4536369144916534, "loss/hidden": 3.7125, "loss/jsd": 0.0, "loss/logits": 0.28769057895988226, "step": 1190 }, { "epoch": 0.12, "grad_norm": 63.0, "grad_norm_var": 103.565625, "learning_rate": 0.0001, "loss": 8.7058, "loss/crossentropy": 2.2031524434685705, "loss/hidden": 3.709375, "loss/jsd": 0.0, "loss/logits": 0.2841499318368733, "step": 1200 }, { "epoch": 0.121, "grad_norm": 74.0, "grad_norm_var": 117.23229166666667, "learning_rate": 0.0001, "loss": 8.8823, "loss/crossentropy": 2.2541019685566424, "loss/hidden": 3.725, "loss/jsd": 0.0, "loss/logits": 0.2822803447023034, "step": 1210 }, { "epoch": 0.122, "grad_norm": 75.5, "grad_norm_var": 163.3625, "learning_rate": 0.0001, "loss": 8.7654, "loss/crossentropy": 2.4589641630649566, "loss/hidden": 3.77265625, "loss/jsd": 0.0, "loss/logits": 0.28896796628832816, "step": 1220 }, { "epoch": 0.123, "grad_norm": 83.0, "grad_norm_var": 68.25, "learning_rate": 0.0001, "loss": 8.9438, "loss/crossentropy": 2.2707848742604257, "loss/hidden": 3.685546875, "loss/jsd": 0.0, "loss/logits": 0.2688711144030094, "step": 1230 }, { "epoch": 0.124, "grad_norm": 97.5, "grad_norm_var": 75.89895833333334, "learning_rate": 0.0001, "loss": 8.8432, "loss/crossentropy": 2.5097223311662673, "loss/hidden": 3.656640625, "loss/jsd": 0.0, "loss/logits": 0.29047914147377013, "step": 1240 }, { "epoch": 0.125, "grad_norm": 89.0, "grad_norm_var": 1450.2989583333333, "learning_rate": 0.0001, "loss": 8.8377, "loss/crossentropy": 2.3170286387205126, "loss/hidden": 3.7, "loss/jsd": 0.0, "loss/logits": 0.2755675740540028, "step": 1250 }, { "epoch": 0.126, "grad_norm": 65.0, "grad_norm_var": 1693.0291666666667, "learning_rate": 0.0001, "loss": 8.6604, "loss/crossentropy": 2.1438958957791328, "loss/hidden": 3.639453125, "loss/jsd": 0.0, "loss/logits": 0.2576067052781582, "step": 1260 }, { "epoch": 0.127, "grad_norm": 74.0, "grad_norm_var": 126.190625, "learning_rate": 0.0001, "loss": 8.8333, "loss/crossentropy": 2.3025652706623077, "loss/hidden": 3.691015625, "loss/jsd": 0.0, "loss/logits": 0.2775576956570148, "step": 1270 }, { "epoch": 0.128, "grad_norm": 71.5, "grad_norm_var": 87.23229166666667, "learning_rate": 0.0001, "loss": 8.7094, "loss/crossentropy": 2.13181097432971, "loss/hidden": 3.702734375, "loss/jsd": 0.0, "loss/logits": 0.27296230792999265, "step": 1280 }, { "epoch": 0.129, "grad_norm": 113.0, "grad_norm_var": 149.48229166666667, "learning_rate": 0.0001, "loss": 8.6782, "loss/crossentropy": 2.1315632432699205, "loss/hidden": 3.625390625, "loss/jsd": 0.0, "loss/logits": 0.2651492517441511, "step": 1290 }, { "epoch": 0.13, "grad_norm": 85.0, "grad_norm_var": 111.440625, "learning_rate": 0.0001, "loss": 8.742, "loss/crossentropy": 2.339846658706665, "loss/hidden": 3.623828125, "loss/jsd": 0.0, "loss/logits": 0.2743611980229616, "step": 1300 }, { "epoch": 0.131, "grad_norm": 86.0, "grad_norm_var": 122.965625, "learning_rate": 0.0001, "loss": 8.6397, "loss/crossentropy": 2.2031438082456587, "loss/hidden": 3.5578125, "loss/jsd": 0.0, "loss/logits": 0.2621523380279541, "step": 1310 }, { "epoch": 0.132, "grad_norm": 71.5, "grad_norm_var": 132.10729166666667, "learning_rate": 0.0001, "loss": 8.7931, "loss/crossentropy": 2.465841978788376, "loss/hidden": 3.657421875, "loss/jsd": 0.0, "loss/logits": 0.29582356065511706, "step": 1320 }, { "epoch": 0.133, "grad_norm": 98.5, "grad_norm_var": 136.34973958333333, "learning_rate": 0.0001, "loss": 8.7755, "loss/crossentropy": 2.3093322798609734, "loss/hidden": 3.675390625, "loss/jsd": 0.0, "loss/logits": 0.28201375566422937, "step": 1330 }, { "epoch": 0.134, "grad_norm": 87.0, "grad_norm_var": 45.1625, "learning_rate": 0.0001, "loss": 8.8767, "loss/crossentropy": 2.3267322540283204, "loss/hidden": 3.687109375, "loss/jsd": 0.0, "loss/logits": 0.27597835548222066, "step": 1340 }, { "epoch": 0.135, "grad_norm": 78.5, "grad_norm_var": 52.19583333333333, "learning_rate": 0.0001, "loss": 8.7636, "loss/crossentropy": 2.250748935341835, "loss/hidden": 3.722265625, "loss/jsd": 0.0, "loss/logits": 0.275000686571002, "step": 1350 }, { "epoch": 0.136, "grad_norm": 74.0, "grad_norm_var": 77.68229166666667, "learning_rate": 0.0001, "loss": 8.8309, "loss/crossentropy": 2.294243222475052, "loss/hidden": 3.781640625, "loss/jsd": 0.0, "loss/logits": 0.29517283104360104, "step": 1360 }, { "epoch": 0.137, "grad_norm": 73.0, "grad_norm_var": 70.565625, "learning_rate": 0.0001, "loss": 8.6486, "loss/crossentropy": 2.4063815265893935, "loss/hidden": 3.561328125, "loss/jsd": 0.0, "loss/logits": 0.27084620147943494, "step": 1370 }, { "epoch": 0.138, "grad_norm": 72.0, "grad_norm_var": 160.365625, "learning_rate": 0.0001, "loss": 8.6319, "loss/crossentropy": 2.0357704624533652, "loss/hidden": 3.55390625, "loss/jsd": 0.0, "loss/logits": 0.24529488924890758, "step": 1380 }, { "epoch": 0.139, "grad_norm": 92.0, "grad_norm_var": 159.2625, "learning_rate": 0.0001, "loss": 8.6773, "loss/crossentropy": 2.207934172451496, "loss/hidden": 3.626953125, "loss/jsd": 0.0, "loss/logits": 0.26206000819802283, "step": 1390 }, { "epoch": 0.14, "grad_norm": 92.0, "grad_norm_var": 75.85, "learning_rate": 0.0001, "loss": 8.6142, "loss/crossentropy": 2.2258728444576263, "loss/hidden": 3.694140625, "loss/jsd": 0.0, "loss/logits": 0.2842423222959042, "step": 1400 }, { "epoch": 0.141, "grad_norm": 75.5, "grad_norm_var": 69.590625, "learning_rate": 0.0001, "loss": 8.7049, "loss/crossentropy": 2.405027574300766, "loss/hidden": 3.594140625, "loss/jsd": 0.0, "loss/logits": 0.2595718756318092, "step": 1410 }, { "epoch": 0.142, "grad_norm": 175.0, "grad_norm_var": 622.85, "learning_rate": 0.0001, "loss": 8.5144, "loss/crossentropy": 2.3508727669715883, "loss/hidden": 3.6578125, "loss/jsd": 0.0, "loss/logits": 0.2513396417722106, "step": 1420 }, { "epoch": 0.143, "grad_norm": 144.0, "grad_norm_var": 827.8739583333333, "learning_rate": 0.0001, "loss": 8.64, "loss/crossentropy": 2.158524568378925, "loss/hidden": 3.666015625, "loss/jsd": 0.0, "loss/logits": 0.25669998563826085, "step": 1430 }, { "epoch": 0.144, "grad_norm": 90.5, "grad_norm_var": 339.35729166666664, "learning_rate": 0.0001, "loss": 8.5076, "loss/crossentropy": 2.1952589228749275, "loss/hidden": 3.490625, "loss/jsd": 0.0, "loss/logits": 0.246895507350564, "step": 1440 }, { "epoch": 0.145, "grad_norm": 65.5, "grad_norm_var": 314.6333333333333, "learning_rate": 0.0001, "loss": 8.6159, "loss/crossentropy": 2.3050056755542756, "loss/hidden": 3.553515625, "loss/jsd": 0.0, "loss/logits": 0.2641986530274153, "step": 1450 }, { "epoch": 0.146, "grad_norm": 76.5, "grad_norm_var": 426.1166666666667, "learning_rate": 0.0001, "loss": 8.527, "loss/crossentropy": 2.281977441906929, "loss/hidden": 3.49296875, "loss/jsd": 0.0, "loss/logits": 0.2622336186468601, "step": 1460 }, { "epoch": 0.147, "grad_norm": 74.0, "grad_norm_var": 278.69348958333336, "learning_rate": 0.0001, "loss": 8.6149, "loss/crossentropy": 2.303273032605648, "loss/hidden": 3.5671875, "loss/jsd": 0.0, "loss/logits": 0.2778003554791212, "step": 1470 }, { "epoch": 0.148, "grad_norm": 102.0, "grad_norm_var": 134.70729166666666, "learning_rate": 0.0001, "loss": 8.4927, "loss/crossentropy": 2.40097414329648, "loss/hidden": 3.536328125, "loss/jsd": 0.0, "loss/logits": 0.27044865442439914, "step": 1480 }, { "epoch": 0.149, "grad_norm": 72.0, "grad_norm_var": 87.8, "learning_rate": 0.0001, "loss": 8.4056, "loss/crossentropy": 2.186897784471512, "loss/hidden": 3.532421875, "loss/jsd": 0.0, "loss/logits": 0.24866797383874656, "step": 1490 }, { "epoch": 0.15, "grad_norm": 75.5, "grad_norm_var": 133.09583333333333, "learning_rate": 0.0001, "loss": 8.5426, "loss/crossentropy": 2.311472164094448, "loss/hidden": 3.53359375, "loss/jsd": 0.0, "loss/logits": 0.25585599690675737, "step": 1500 }, { "epoch": 0.151, "grad_norm": 136.0, "grad_norm_var": 258.7625, "learning_rate": 0.0001, "loss": 8.3875, "loss/crossentropy": 2.2983651250600814, "loss/hidden": 3.562890625, "loss/jsd": 0.0, "loss/logits": 0.2763795707374811, "step": 1510 }, { "epoch": 0.152, "grad_norm": 94.5, "grad_norm_var": 292.75598958333336, "learning_rate": 0.0001, "loss": 8.5971, "loss/crossentropy": 2.3549255669116973, "loss/hidden": 3.55703125, "loss/jsd": 0.0, "loss/logits": 0.268990096822381, "step": 1520 }, { "epoch": 0.153, "grad_norm": 83.0, "grad_norm_var": 1.4189153071319926e+18, "learning_rate": 0.0001, "loss": 8.7383, "loss/crossentropy": 2.267159214615822, "loss/hidden": 3.5671875, "loss/jsd": 0.0, "loss/logits": 0.27373309470713136, "step": 1530 }, { "epoch": 0.154, "grad_norm": 77.5, "grad_norm_var": 63.916666666666664, "learning_rate": 0.0001, "loss": 8.5718, "loss/crossentropy": 2.259125065803528, "loss/hidden": 3.67265625, "loss/jsd": 0.0, "loss/logits": 0.28385352455079554, "step": 1540 }, { "epoch": 0.155, "grad_norm": 73.5, "grad_norm_var": 39.78723958333333, "learning_rate": 0.0001, "loss": 8.4993, "loss/crossentropy": 2.3606351226568223, "loss/hidden": 3.558984375, "loss/jsd": 0.0, "loss/logits": 0.272869897633791, "step": 1550 }, { "epoch": 0.156, "grad_norm": 71.5, "grad_norm_var": 247.2625, "learning_rate": 0.0001, "loss": 8.6188, "loss/crossentropy": 2.394289918243885, "loss/hidden": 3.519921875, "loss/jsd": 0.0, "loss/logits": 0.269069866463542, "step": 1560 }, { "epoch": 0.157, "grad_norm": 71.0, "grad_norm_var": 265.7, "learning_rate": 0.0001, "loss": 8.4936, "loss/crossentropy": 2.2599784307181836, "loss/hidden": 3.533203125, "loss/jsd": 0.0, "loss/logits": 0.26600994151085616, "step": 1570 }, { "epoch": 0.158, "grad_norm": 83.0, "grad_norm_var": 42.88333333333333, "learning_rate": 0.0001, "loss": 8.5015, "loss/crossentropy": 2.3098704159259795, "loss/hidden": 3.628515625, "loss/jsd": 0.0, "loss/logits": 0.285567194968462, "step": 1580 }, { "epoch": 0.159, "grad_norm": 67.0, "grad_norm_var": 111.665625, "learning_rate": 0.0001, "loss": 8.4128, "loss/crossentropy": 2.1794722147285936, "loss/hidden": 3.526953125, "loss/jsd": 0.0, "loss/logits": 0.2647275095805526, "step": 1590 }, { "epoch": 0.16, "grad_norm": 91.0, "grad_norm_var": 149.97890625, "learning_rate": 0.0001, "loss": 8.4745, "loss/crossentropy": 2.2243838563561438, "loss/hidden": 3.550390625, "loss/jsd": 0.0, "loss/logits": 0.2563688028603792, "step": 1600 }, { "epoch": 0.161, "grad_norm": 90.0, "grad_norm_var": 157.87395833333332, "learning_rate": 0.0001, "loss": 8.4168, "loss/crossentropy": 2.3965038657188416, "loss/hidden": 3.52265625, "loss/jsd": 0.0, "loss/logits": 0.27364722844213246, "step": 1610 }, { "epoch": 0.162, "grad_norm": 96.0, "grad_norm_var": 380.89348958333335, "learning_rate": 0.0001, "loss": 8.6256, "loss/crossentropy": 2.519009140133858, "loss/hidden": 3.536328125, "loss/jsd": 0.0, "loss/logits": 0.29145103991031646, "step": 1620 }, { "epoch": 0.163, "grad_norm": 80.0, "grad_norm_var": 331.05, "learning_rate": 0.0001, "loss": 8.2011, "loss/crossentropy": 2.1994084089994432, "loss/hidden": 3.530078125, "loss/jsd": 0.0, "loss/logits": 0.2542119387537241, "step": 1630 }, { "epoch": 0.164, "grad_norm": 72.0, "grad_norm_var": 41.19583333333333, "learning_rate": 0.0001, "loss": 8.3636, "loss/crossentropy": 2.4333469703793527, "loss/hidden": 3.4828125, "loss/jsd": 0.0, "loss/logits": 0.25861090533435344, "step": 1640 }, { "epoch": 0.165, "grad_norm": 79.0, "grad_norm_var": 226.29583333333332, "learning_rate": 0.0001, "loss": 8.5285, "loss/crossentropy": 2.468096488714218, "loss/hidden": 3.478515625, "loss/jsd": 0.0, "loss/logits": 0.26285996809601786, "step": 1650 }, { "epoch": 0.166, "grad_norm": 84.5, "grad_norm_var": 218.12916666666666, "learning_rate": 0.0001, "loss": 8.4346, "loss/crossentropy": 2.2107077345252035, "loss/hidden": 3.591015625, "loss/jsd": 0.0, "loss/logits": 0.2654247496277094, "step": 1660 }, { "epoch": 0.167, "grad_norm": 68.0, "grad_norm_var": 47.329166666666666, "learning_rate": 0.0001, "loss": 8.4021, "loss/crossentropy": 2.188153588026762, "loss/hidden": 3.481640625, "loss/jsd": 0.0, "loss/logits": 0.24756914153695106, "step": 1670 }, { "epoch": 0.168, "grad_norm": 68.0, "grad_norm_var": 232.240625, "learning_rate": 0.0001, "loss": 8.4491, "loss/crossentropy": 2.3357387453317644, "loss/hidden": 3.552734375, "loss/jsd": 0.0, "loss/logits": 0.28453084602952006, "step": 1680 }, { "epoch": 0.169, "grad_norm": 63.5, "grad_norm_var": 179.80729166666666, "learning_rate": 0.0001, "loss": 8.4439, "loss/crossentropy": 2.3677712947130205, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.2788976304233074, "step": 1690 }, { "epoch": 0.17, "grad_norm": 119.5, "grad_norm_var": 398.665625, "learning_rate": 0.0001, "loss": 8.3827, "loss/crossentropy": 2.4275426417589188, "loss/hidden": 3.425390625, "loss/jsd": 0.0, "loss/logits": 0.26100732628256085, "step": 1700 }, { "epoch": 0.171, "grad_norm": 66.5, "grad_norm_var": 209.37395833333332, "learning_rate": 0.0001, "loss": 8.3197, "loss/crossentropy": 2.237619758397341, "loss/hidden": 3.508203125, "loss/jsd": 0.0, "loss/logits": 0.2523366323672235, "step": 1710 }, { "epoch": 0.172, "grad_norm": 171.0, "grad_norm_var": 636.4833333333333, "learning_rate": 0.0001, "loss": 8.2648, "loss/crossentropy": 2.169030448794365, "loss/hidden": 3.505078125, "loss/jsd": 0.0, "loss/logits": 0.24771953662857413, "step": 1720 }, { "epoch": 0.173, "grad_norm": 68.0, "grad_norm_var": 861.1247395833333, "learning_rate": 0.0001, "loss": 8.2948, "loss/crossentropy": 2.197067990899086, "loss/hidden": 3.424609375, "loss/jsd": 0.0, "loss/logits": 0.2451560577377677, "step": 1730 }, { "epoch": 0.174, "grad_norm": 63.5, "grad_norm_var": 524.7833333333333, "learning_rate": 0.0001, "loss": 8.2316, "loss/crossentropy": 2.2412655726075172, "loss/hidden": 3.498046875, "loss/jsd": 0.0, "loss/logits": 0.26945888753980396, "step": 1740 }, { "epoch": 0.175, "grad_norm": 90.5, "grad_norm_var": 496.12395833333335, "learning_rate": 0.0001, "loss": 8.3094, "loss/crossentropy": 2.314925655722618, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.27251414209604263, "step": 1750 }, { "epoch": 0.176, "grad_norm": 70.0, "grad_norm_var": 484.890625, "learning_rate": 0.0001, "loss": 8.3807, "loss/crossentropy": 2.3074424833059313, "loss/hidden": 3.4640625, "loss/jsd": 0.0, "loss/logits": 0.2574224047362804, "step": 1760 }, { "epoch": 0.177, "grad_norm": 69.0, "grad_norm_var": 88.83932291666666, "learning_rate": 0.0001, "loss": 8.3403, "loss/crossentropy": 2.2954701989889146, "loss/hidden": 3.46484375, "loss/jsd": 0.0, "loss/logits": 0.25794004313647745, "step": 1770 }, { "epoch": 0.178, "grad_norm": 71.5, "grad_norm_var": 86.65598958333334, "learning_rate": 0.0001, "loss": 8.1745, "loss/crossentropy": 2.2755073979496956, "loss/hidden": 3.521875, "loss/jsd": 0.0, "loss/logits": 0.26081139910966156, "step": 1780 }, { "epoch": 0.179, "grad_norm": 73.5, "grad_norm_var": 46.40390625, "learning_rate": 0.0001, "loss": 8.2619, "loss/crossentropy": 2.2126931130886076, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.27995246797800066, "step": 1790 }, { "epoch": 0.18, "grad_norm": 70.0, "grad_norm_var": 44.215625, "learning_rate": 0.0001, "loss": 8.3462, "loss/crossentropy": 2.3120188415050507, "loss/hidden": 3.482421875, "loss/jsd": 0.0, "loss/logits": 0.25568581037223337, "step": 1800 }, { "epoch": 0.181, "grad_norm": 79.0, "grad_norm_var": 250.965625, "learning_rate": 0.0001, "loss": 8.3991, "loss/crossentropy": 2.2807445406913756, "loss/hidden": 3.4328125, "loss/jsd": 0.0, "loss/logits": 0.2566069485619664, "step": 1810 }, { "epoch": 0.182, "grad_norm": 72.5, "grad_norm_var": 287.98333333333335, "learning_rate": 0.0001, "loss": 8.2019, "loss/crossentropy": 2.3523808985948564, "loss/hidden": 3.369921875, "loss/jsd": 0.0, "loss/logits": 0.2528150577098131, "step": 1820 }, { "epoch": 0.183, "grad_norm": 85.0, "grad_norm_var": 37.733072916666664, "learning_rate": 0.0001, "loss": 8.1958, "loss/crossentropy": 2.0805646784603598, "loss/hidden": 3.38359375, "loss/jsd": 0.0, "loss/logits": 0.22354185171425342, "step": 1830 }, { "epoch": 0.184, "grad_norm": 67.5, "grad_norm_var": 72.85729166666667, "learning_rate": 0.0001, "loss": 8.0768, "loss/crossentropy": 2.3133904695510865, "loss/hidden": 3.401171875, "loss/jsd": 0.0, "loss/logits": 0.24347416013479234, "step": 1840 }, { "epoch": 0.185, "grad_norm": 79.0, "grad_norm_var": 169.65833333333333, "learning_rate": 0.0001, "loss": 8.2994, "loss/crossentropy": 2.3512276649475097, "loss/hidden": 3.444921875, "loss/jsd": 0.0, "loss/logits": 0.26196608748286965, "step": 1850 }, { "epoch": 0.186, "grad_norm": 69.5, "grad_norm_var": 2388.08515625, "learning_rate": 0.0001, "loss": 8.3424, "loss/crossentropy": 2.3174356922507284, "loss/hidden": 3.454296875, "loss/jsd": 0.0, "loss/logits": 0.2508500372990966, "step": 1860 }, { "epoch": 0.187, "grad_norm": 60.0, "grad_norm_var": 196.15833333333333, "learning_rate": 0.0001, "loss": 8.256, "loss/crossentropy": 2.280574831366539, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2615037776529789, "step": 1870 }, { "epoch": 0.188, "grad_norm": 73.0, "grad_norm_var": 116.42890625, "learning_rate": 0.0001, "loss": 8.2108, "loss/crossentropy": 2.275608576834202, "loss/hidden": 3.449609375, "loss/jsd": 0.0, "loss/logits": 0.2556317184120417, "step": 1880 }, { "epoch": 0.189, "grad_norm": 66.5, "grad_norm_var": 38.723958333333336, "learning_rate": 0.0001, "loss": 8.3584, "loss/crossentropy": 2.356363560259342, "loss/hidden": 3.490625, "loss/jsd": 0.0, "loss/logits": 0.26850553378462794, "step": 1890 }, { "epoch": 0.19, "grad_norm": 70.0, "grad_norm_var": 90.62916666666666, "learning_rate": 0.0001, "loss": 8.1875, "loss/crossentropy": 2.282008448243141, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.25158569142222403, "step": 1900 }, { "epoch": 0.191, "grad_norm": 69.0, "grad_norm_var": 26.895833333333332, "learning_rate": 0.0001, "loss": 8.1676, "loss/crossentropy": 2.3583726406097414, "loss/hidden": 3.475, "loss/jsd": 0.0, "loss/logits": 0.2574294516816735, "step": 1910 }, { "epoch": 0.192, "grad_norm": 62.25, "grad_norm_var": 34.430989583333336, "learning_rate": 0.0001, "loss": 8.2457, "loss/crossentropy": 2.310526317358017, "loss/hidden": 3.407421875, "loss/jsd": 0.0, "loss/logits": 0.25894895792007444, "step": 1920 }, { "epoch": 0.193, "grad_norm": 84.0, "grad_norm_var": 65.58307291666667, "learning_rate": 0.0001, "loss": 8.2176, "loss/crossentropy": 2.0871855318546295, "loss/hidden": 3.3796875, "loss/jsd": 0.0, "loss/logits": 0.24589193761348724, "step": 1930 }, { "epoch": 0.194, "grad_norm": 65.5, "grad_norm_var": 39.07473958333333, "learning_rate": 0.0001, "loss": 8.1842, "loss/crossentropy": 2.261622406542301, "loss/hidden": 3.3609375, "loss/jsd": 0.0, "loss/logits": 0.24441927969455718, "step": 1940 }, { "epoch": 0.195, "grad_norm": 64.0, "grad_norm_var": 57.848958333333336, "learning_rate": 0.0001, "loss": 8.1485, "loss/crossentropy": 2.386093820631504, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2539959207177162, "step": 1950 }, { "epoch": 0.196, "grad_norm": 72.5, "grad_norm_var": 37.01223958333333, "learning_rate": 0.0001, "loss": 8.3277, "loss/crossentropy": 2.2825982570648193, "loss/hidden": 3.433203125, "loss/jsd": 0.0, "loss/logits": 0.2809562737122178, "step": 1960 }, { "epoch": 0.197, "grad_norm": 69.0, "grad_norm_var": 15.633333333333333, "learning_rate": 0.0001, "loss": 8.1367, "loss/crossentropy": 2.181477516889572, "loss/hidden": 3.494140625, "loss/jsd": 0.0, "loss/logits": 0.26897694952785967, "step": 1970 }, { "epoch": 0.198, "grad_norm": 68.5, "grad_norm_var": 13.966666666666667, "learning_rate": 0.0001, "loss": 8.1232, "loss/crossentropy": 2.292652648687363, "loss/hidden": 3.409765625, "loss/jsd": 0.0, "loss/logits": 0.24958589412271975, "step": 1980 }, { "epoch": 0.199, "grad_norm": 68.0, "grad_norm_var": 88.34765625, "learning_rate": 0.0001, "loss": 8.09, "loss/crossentropy": 2.367698776721954, "loss/hidden": 3.387109375, "loss/jsd": 0.0, "loss/logits": 0.2641737159341574, "step": 1990 }, { "epoch": 0.2, "grad_norm": 91.5, "grad_norm_var": 120.825, "learning_rate": 0.0001, "loss": 8.1587, "loss/crossentropy": 2.3354921892285345, "loss/hidden": 3.42109375, "loss/jsd": 0.0, "loss/logits": 0.24634175039827824, "step": 2000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.715020064017613e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }