{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7272727272727273, "eval_steps": 2000, "global_step": 16000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 9.09090909090909e-05, "grad_norm": 1360.0, "learning_rate": 2.1600000000000003e-05, "loss": 90.3949, "loss/crossentropy": 9.176323413848877, "loss/hidden": 10.28125, "loss/jsd": 0.0, "loss/logits": 7.093728303909302, "step": 2 }, { "epoch": 0.0001818181818181818, "grad_norm": 1552.0, "learning_rate": 2.32e-05, "loss": 90.1365, "loss/crossentropy": 9.369012594223022, "loss/hidden": 10.15625, "loss/jsd": 0.0, "loss/logits": 7.061126232147217, "step": 4 }, { "epoch": 0.00027272727272727274, "grad_norm": 1264.0, "learning_rate": 2.48e-05, "loss": 90.528, "loss/crossentropy": 8.99296236038208, "loss/hidden": 10.34375, "loss/jsd": 0.0, "loss/logits": 7.119132399559021, "step": 6 }, { "epoch": 0.0003636363636363636, "grad_norm": 1016.0, "learning_rate": 2.64e-05, "loss": 88.9458, "loss/crossentropy": 9.097182750701904, "loss/hidden": 10.109375, "loss/jsd": 0.0, "loss/logits": 6.973919987678528, "step": 8 }, { "epoch": 0.00045454545454545455, "grad_norm": 680.0, "learning_rate": 2.8000000000000003e-05, "loss": 80.8151, "loss/crossentropy": 8.566726207733154, "loss/hidden": 9.96875, "loss/jsd": 0.0, "loss/logits": 6.227962374687195, "step": 10 }, { "epoch": 0.0005454545454545455, "grad_norm": 406.0, "learning_rate": 2.9600000000000005e-05, "loss": 74.951, "loss/crossentropy": 7.72035026550293, "loss/hidden": 10.234375, "loss/jsd": 0.0, "loss/logits": 5.699627995491028, "step": 12 }, { "epoch": 0.0006363636363636364, "grad_norm": 195.0, "learning_rate": 3.1200000000000006e-05, "loss": 63.7058, "loss/crossentropy": 7.1191651821136475, "loss/hidden": 9.796875, "loss/jsd": 0.0, "loss/logits": 4.678979516029358, "step": 14 }, { "epoch": 0.0007272727272727272, "grad_norm": 224.0, "grad_norm_var": 257504.93333333332, "learning_rate": 3.2800000000000004e-05, "loss": 59.1982, "loss/crossentropy": 6.597274303436279, "loss/hidden": 9.53125, "loss/jsd": 0.0, "loss/logits": 4.306970596313477, "step": 16 }, { "epoch": 0.0008181818181818182, "grad_norm": 142.0, "grad_norm_var": 262114.73333333334, "learning_rate": 3.4399999999999996e-05, "loss": 56.8138, "loss/crossentropy": 5.990436792373657, "loss/hidden": 9.375, "loss/jsd": 0.0, "loss/logits": 4.144834399223328, "step": 18 }, { "epoch": 0.0009090909090909091, "grad_norm": 146.0, "grad_norm_var": 188436.38333333333, "learning_rate": 3.600000000000001e-05, "loss": 50.9131, "loss/crossentropy": 5.8276907205581665, "loss/hidden": 9.0, "loss/jsd": 0.0, "loss/logits": 3.608542263507843, "step": 20 }, { "epoch": 0.001, "grad_norm": 124.0, "grad_norm_var": 115294.8, "learning_rate": 3.76e-05, "loss": 45.0112, "loss/crossentropy": 5.350461959838867, "loss/hidden": 8.59375, "loss/jsd": 0.0, "loss/logits": 3.106696605682373, "step": 22 }, { "epoch": 0.001090909090909091, "grad_norm": 106.5, "grad_norm_var": 36484.198958333334, "learning_rate": 3.9200000000000004e-05, "loss": 42.591, "loss/crossentropy": 5.153482675552368, "loss/hidden": 8.265625, "loss/jsd": 0.0, "loss/logits": 2.917190909385681, "step": 24 }, { "epoch": 0.0011818181818181819, "grad_norm": 113.5, "grad_norm_var": 13437.966666666667, "learning_rate": 4.08e-05, "loss": 37.0835, "loss/crossentropy": 4.608655571937561, "loss/hidden": 7.859375, "loss/jsd": 0.0, "loss/logits": 2.4615488052368164, "step": 26 }, { "epoch": 0.0012727272727272728, "grad_norm": 81.5, "grad_norm_var": 2558.498958333333, "learning_rate": 4.240000000000001e-05, "loss": 33.6668, "loss/crossentropy": 4.5027313232421875, "loss/hidden": 7.359375, "loss/jsd": 0.0, "loss/logits": 2.180470585823059, "step": 28 }, { "epoch": 0.0013636363636363637, "grad_norm": 95.0, "grad_norm_var": 2151.4625, "learning_rate": 4.4000000000000006e-05, "loss": 29.2165, "loss/crossentropy": 3.8934799432754517, "loss/hidden": 7.03125, "loss/jsd": 0.0, "loss/logits": 1.8291802406311035, "step": 30 }, { "epoch": 0.0014545454545454545, "grad_norm": 103.0, "grad_norm_var": 543.5291666666667, "learning_rate": 4.5600000000000004e-05, "loss": 27.2888, "loss/crossentropy": 4.079542279243469, "loss/hidden": 6.5, "loss/jsd": 0.0, "loss/logits": 1.6709230244159698, "step": 32 }, { "epoch": 0.0015454545454545454, "grad_norm": 56.25, "grad_norm_var": 595.7018229166666, "learning_rate": 4.72e-05, "loss": 25.9051, "loss/crossentropy": 3.854455292224884, "loss/hidden": 6.2578125, "loss/jsd": 0.0, "loss/logits": 1.579281985759735, "step": 34 }, { "epoch": 0.0016363636363636363, "grad_norm": 53.0, "grad_norm_var": 584.4041666666667, "learning_rate": 4.88e-05, "loss": 23.429, "loss/crossentropy": 3.724231719970703, "loss/hidden": 5.84375, "loss/jsd": 0.0, "loss/logits": 1.3860992789268494, "step": 36 }, { "epoch": 0.0017272727272727272, "grad_norm": 43.25, "grad_norm_var": 560.3822916666667, "learning_rate": 5.0400000000000005e-05, "loss": 21.9289, "loss/crossentropy": 3.573738753795624, "loss/hidden": 5.6484375, "loss/jsd": 0.0, "loss/logits": 1.2706688344478607, "step": 38 }, { "epoch": 0.0018181818181818182, "grad_norm": 64.5, "grad_norm_var": 539.2291666666666, "learning_rate": 5.2000000000000004e-05, "loss": 21.4012, "loss/crossentropy": 3.6262060403823853, "loss/hidden": 5.4375, "loss/jsd": 0.0, "loss/logits": 1.2337479889392853, "step": 40 }, { "epoch": 0.0019090909090909091, "grad_norm": 38.75, "grad_norm_var": 498.765625, "learning_rate": 5.360000000000001e-05, "loss": 19.7598, "loss/crossentropy": 3.448512375354767, "loss/hidden": 5.1875, "loss/jsd": 0.0, "loss/logits": 1.1123754382133484, "step": 42 }, { "epoch": 0.002, "grad_norm": 34.0, "grad_norm_var": 460.3041666666667, "learning_rate": 5.520000000000001e-05, "loss": 19.3785, "loss/crossentropy": 3.2990421056747437, "loss/hidden": 5.1328125, "loss/jsd": 0.0, "loss/logits": 1.0946654677391052, "step": 44 }, { "epoch": 0.0020909090909090908, "grad_norm": 39.75, "grad_norm_var": 371.12473958333334, "learning_rate": 5.680000000000001e-05, "loss": 17.9111, "loss/crossentropy": 3.202954888343811, "loss/hidden": 4.921875, "loss/jsd": 0.0, "loss/logits": 0.9786234647035599, "step": 46 }, { "epoch": 0.002181818181818182, "grad_norm": 32.75, "grad_norm_var": 201.20729166666666, "learning_rate": 5.840000000000001e-05, "loss": 16.5927, "loss/crossentropy": 2.978914201259613, "loss/hidden": 4.78125, "loss/jsd": 0.0, "loss/logits": 0.8832564502954483, "step": 48 }, { "epoch": 0.0022727272727272726, "grad_norm": 33.25, "grad_norm_var": 101.68515625, "learning_rate": 6.0000000000000015e-05, "loss": 17.0952, "loss/crossentropy": 3.123198628425598, "loss/hidden": 4.6875, "loss/jsd": 0.0, "loss/logits": 0.9284461736679077, "step": 50 }, { "epoch": 0.0023636363636363638, "grad_norm": 27.0, "grad_norm_var": 88.95390625, "learning_rate": 6.16e-05, "loss": 15.8285, "loss/crossentropy": 2.889090836048126, "loss/hidden": 4.5703125, "loss/jsd": 0.0, "loss/logits": 0.8369109928607941, "step": 52 }, { "epoch": 0.0024545454545454545, "grad_norm": 31.5, "grad_norm_var": 79.23723958333333, "learning_rate": 6.320000000000002e-05, "loss": 16.1042, "loss/crossentropy": 3.2919073700904846, "loss/hidden": 4.3828125, "loss/jsd": 0.0, "loss/logits": 0.8429520279169083, "step": 54 }, { "epoch": 0.0025454545454545456, "grad_norm": 32.5, "grad_norm_var": 17.640559895833334, "learning_rate": 6.480000000000002e-05, "loss": 15.0155, "loss/crossentropy": 2.880679130554199, "loss/hidden": 4.4375, "loss/jsd": 0.0, "loss/logits": 0.7697282284498215, "step": 56 }, { "epoch": 0.0026363636363636363, "grad_norm": 37.75, "grad_norm_var": 9.8587890625, "learning_rate": 6.64e-05, "loss": 16.0118, "loss/crossentropy": 3.455830991268158, "loss/hidden": 4.2890625, "loss/jsd": 0.0, "loss/logits": 0.8266867697238922, "step": 58 }, { "epoch": 0.0027272727272727275, "grad_norm": 29.25, "grad_norm_var": 13.4666015625, "learning_rate": 6.8e-05, "loss": 14.5926, "loss/crossentropy": 2.9022597670555115, "loss/hidden": 4.2421875, "loss/jsd": 0.0, "loss/logits": 0.7448104918003082, "step": 60 }, { "epoch": 0.002818181818181818, "grad_norm": 23.0, "grad_norm_var": 14.4775390625, "learning_rate": 6.96e-05, "loss": 14.6715, "loss/crossentropy": 3.121829330921173, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.7377803772687912, "step": 62 }, { "epoch": 0.002909090909090909, "grad_norm": 30.0, "grad_norm_var": 17.617708333333333, "learning_rate": 7.12e-05, "loss": 14.9055, "loss/crossentropy": 3.0429744124412537, "loss/hidden": 4.12109375, "loss/jsd": 0.0, "loss/logits": 0.7741448879241943, "step": 64 }, { "epoch": 0.003, "grad_norm": 29.25, "grad_norm_var": 15.928059895833334, "learning_rate": 7.280000000000001e-05, "loss": 14.1943, "loss/crossentropy": 2.950484037399292, "loss/hidden": 4.0703125, "loss/jsd": 0.0, "loss/logits": 0.717346727848053, "step": 66 }, { "epoch": 0.0030909090909090908, "grad_norm": 24.5, "grad_norm_var": 16.054166666666667, "learning_rate": 7.44e-05, "loss": 14.7314, "loss/crossentropy": 3.1602155566215515, "loss/hidden": 4.03515625, "loss/jsd": 0.0, "loss/logits": 0.7536057233810425, "step": 68 }, { "epoch": 0.003181818181818182, "grad_norm": 24.875, "grad_norm_var": 15.701822916666666, "learning_rate": 7.6e-05, "loss": 14.2663, "loss/crossentropy": 3.1877995133399963, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.7164454311132431, "step": 70 }, { "epoch": 0.0032727272727272726, "grad_norm": 26.25, "grad_norm_var": 15.691080729166666, "learning_rate": 7.76e-05, "loss": 13.6663, "loss/crossentropy": 3.0498266220092773, "loss/hidden": 3.875, "loss/jsd": 0.0, "loss/logits": 0.6741430163383484, "step": 72 }, { "epoch": 0.003363636363636364, "grad_norm": 25.25, "grad_norm_var": 6.430143229166666, "learning_rate": 7.920000000000001e-05, "loss": 13.2456, "loss/crossentropy": 2.9760413765907288, "loss/hidden": 3.828125, "loss/jsd": 0.0, "loss/logits": 0.6441413909196854, "step": 74 }, { "epoch": 0.0034545454545454545, "grad_norm": 28.25, "grad_norm_var": 10.0712890625, "learning_rate": 8.080000000000001e-05, "loss": 14.5251, "loss/crossentropy": 3.2674867510795593, "loss/hidden": 3.87109375, "loss/jsd": 0.0, "loss/logits": 0.7386563122272491, "step": 76 }, { "epoch": 0.0035454545454545456, "grad_norm": 27.125, "grad_norm_var": 9.1259765625, "learning_rate": 8.240000000000001e-05, "loss": 13.8005, "loss/crossentropy": 2.954371929168701, "loss/hidden": 3.8125, "loss/jsd": 0.0, "loss/logits": 0.7033596634864807, "step": 78 }, { "epoch": 0.0036363636363636364, "grad_norm": 23.0, "grad_norm_var": 9.031184895833333, "learning_rate": 8.400000000000001e-05, "loss": 12.6488, "loss/crossentropy": 2.833801567554474, "loss/hidden": 3.72265625, "loss/jsd": 0.0, "loss/logits": 0.6092382520437241, "step": 80 }, { "epoch": 0.003727272727272727, "grad_norm": 21.0, "grad_norm_var": 9.297916666666667, "learning_rate": 8.560000000000001e-05, "loss": 12.8727, "loss/crossentropy": 2.897067129611969, "loss/hidden": 3.76953125, "loss/jsd": 0.0, "loss/logits": 0.6206130385398865, "step": 82 }, { "epoch": 0.0038181818181818182, "grad_norm": 22.0, "grad_norm_var": 9.426041666666666, "learning_rate": 8.720000000000002e-05, "loss": 12.882, "loss/crossentropy": 2.973766028881073, "loss/hidden": 3.66015625, "loss/jsd": 0.0, "loss/logits": 0.6248069703578949, "step": 84 }, { "epoch": 0.003909090909090909, "grad_norm": 30.5, "grad_norm_var": 11.759830729166667, "learning_rate": 8.880000000000002e-05, "loss": 12.9551, "loss/crossentropy": 3.0301239490509033, "loss/hidden": 3.58984375, "loss/jsd": 0.0, "loss/logits": 0.633510634303093, "step": 86 }, { "epoch": 0.004, "grad_norm": 18.5, "grad_norm_var": 15.156184895833333, "learning_rate": 9.040000000000002e-05, "loss": 12.6876, "loss/crossentropy": 2.96478807926178, "loss/hidden": 3.58203125, "loss/jsd": 0.0, "loss/logits": 0.6140733659267426, "step": 88 }, { "epoch": 0.004090909090909091, "grad_norm": 22.0, "grad_norm_var": 16.297916666666666, "learning_rate": 9.200000000000001e-05, "loss": 12.7216, "loss/crossentropy": 3.128583014011383, "loss/hidden": 3.54296875, "loss/jsd": 0.0, "loss/logits": 0.605008602142334, "step": 90 }, { "epoch": 0.0041818181818181815, "grad_norm": 19.75, "grad_norm_var": 11.3134765625, "learning_rate": 9.360000000000003e-05, "loss": 11.8172, "loss/crossentropy": 2.6920205950737, "loss/hidden": 3.50390625, "loss/jsd": 0.0, "loss/logits": 0.5621242374181747, "step": 92 }, { "epoch": 0.004272727272727273, "grad_norm": 25.25, "grad_norm_var": 75.8197265625, "learning_rate": 9.52e-05, "loss": 13.5439, "loss/crossentropy": 3.2377907037734985, "loss/hidden": 3.48828125, "loss/jsd": 0.0, "loss/logits": 0.6817845404148102, "step": 94 }, { "epoch": 0.004363636363636364, "grad_norm": 28.375, "grad_norm_var": 75.66451822916666, "learning_rate": 9.680000000000001e-05, "loss": 12.3032, "loss/crossentropy": 2.8024024963378906, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.5961685478687286, "step": 96 }, { "epoch": 0.004454545454545455, "grad_norm": 21.125, "grad_norm_var": 76.52682291666666, "learning_rate": 9.84e-05, "loss": 11.6879, "loss/crossentropy": 2.671006679534912, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.5446560382843018, "step": 98 }, { "epoch": 0.004545454545454545, "grad_norm": 23.875, "grad_norm_var": 77.74791666666667, "learning_rate": 0.0001, "loss": 11.3669, "loss/crossentropy": 2.5788058042526245, "loss/hidden": 3.45703125, "loss/jsd": 0.0, "loss/logits": 0.533108577132225, "step": 100 }, { "epoch": 0.004636363636363636, "grad_norm": 26.25, "grad_norm_var": 75.88430989583334, "learning_rate": 0.0001, "loss": 12.1702, "loss/crossentropy": 2.9276909828186035, "loss/hidden": 3.41015625, "loss/jsd": 0.0, "loss/logits": 0.5832359790802002, "step": 102 }, { "epoch": 0.0047272727272727275, "grad_norm": 20.25, "grad_norm_var": 76.33932291666666, "learning_rate": 0.0001, "loss": 11.9113, "loss/crossentropy": 2.7861939072608948, "loss/hidden": 3.42578125, "loss/jsd": 0.0, "loss/logits": 0.5699332356452942, "step": 104 }, { "epoch": 0.004818181818181818, "grad_norm": 22.5, "grad_norm_var": 76.52858072916666, "learning_rate": 0.0001, "loss": 11.0636, "loss/crossentropy": 2.458002597093582, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.51641945540905, "step": 106 }, { "epoch": 0.004909090909090909, "grad_norm": 19.75, "grad_norm_var": 76.46145833333334, "learning_rate": 0.0001, "loss": 11.7127, "loss/crossentropy": 2.7719802260398865, "loss/hidden": 3.35546875, "loss/jsd": 0.0, "loss/logits": 0.558525025844574, "step": 108 }, { "epoch": 0.005, "grad_norm": 20.5, "grad_norm_var": 7.7431640625, "learning_rate": 0.0001, "loss": 12.1741, "loss/crossentropy": 3.082732379436493, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.5763235092163086, "step": 110 }, { "epoch": 0.005090909090909091, "grad_norm": 20.75, "grad_norm_var": 4.109309895833333, "learning_rate": 0.0001, "loss": 11.2932, "loss/crossentropy": 2.7725719809532166, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.5231538042426109, "step": 112 }, { "epoch": 0.0051818181818181815, "grad_norm": 26.25, "grad_norm_var": 6.2775390625, "learning_rate": 0.0001, "loss": 11.0881, "loss/crossentropy": 2.7906739115715027, "loss/hidden": 3.20703125, "loss/jsd": 0.0, "loss/logits": 0.5090426653623581, "step": 114 }, { "epoch": 0.005272727272727273, "grad_norm": 17.875, "grad_norm_var": 10.571875, "learning_rate": 0.0001, "loss": 11.3643, "loss/crossentropy": 2.790172278881073, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.5269483178853989, "step": 116 }, { "epoch": 0.005363636363636364, "grad_norm": 19.75, "grad_norm_var": 9.723893229166666, "learning_rate": 0.0001, "loss": 11.8852, "loss/crossentropy": 2.946044921875, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.567350909113884, "step": 118 }, { "epoch": 0.005454545454545455, "grad_norm": 18.375, "grad_norm_var": 10.24140625, "learning_rate": 0.0001, "loss": 11.7239, "loss/crossentropy": 2.8700475096702576, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.561168447136879, "step": 120 }, { "epoch": 0.005545454545454545, "grad_norm": 17.375, "grad_norm_var": 11.387239583333333, "learning_rate": 0.0001, "loss": 11.1598, "loss/crossentropy": 2.7673959136009216, "loss/hidden": 3.24609375, "loss/jsd": 0.0, "loss/logits": 0.5146333426237106, "step": 122 }, { "epoch": 0.005636363636363636, "grad_norm": 17.0, "grad_norm_var": 12.383072916666666, "learning_rate": 0.0001, "loss": 11.0667, "loss/crossentropy": 2.7340869903564453, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.5121653378009796, "step": 124 }, { "epoch": 0.0057272727272727275, "grad_norm": 18.75, "grad_norm_var": 12.5712890625, "learning_rate": 0.0001, "loss": 10.5497, "loss/crossentropy": 2.6885997354984283, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.47126661241054535, "step": 126 }, { "epoch": 0.005818181818181818, "grad_norm": 20.875, "grad_norm_var": 12.439518229166667, "learning_rate": 0.0001, "loss": 11.6341, "loss/crossentropy": 2.9975308775901794, "loss/hidden": 3.16015625, "loss/jsd": 0.0, "loss/logits": 0.5476436764001846, "step": 128 }, { "epoch": 0.005909090909090909, "grad_norm": 24.0, "grad_norm_var": 10.752018229166667, "learning_rate": 0.0001, "loss": 10.693, "loss/crossentropy": 2.624659776687622, "loss/hidden": 3.17578125, "loss/jsd": 0.0, "loss/logits": 0.4892517775297165, "step": 130 }, { "epoch": 0.006, "grad_norm": 17.75, "grad_norm_var": 6.49140625, "learning_rate": 0.0001, "loss": 10.4226, "loss/crossentropy": 2.5996333956718445, "loss/hidden": 3.16796875, "loss/jsd": 0.0, "loss/logits": 0.4655040204524994, "step": 132 }, { "epoch": 0.006090909090909091, "grad_norm": 18.125, "grad_norm_var": 5.289518229166666, "learning_rate": 0.0001, "loss": 11.1863, "loss/crossentropy": 2.90828537940979, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.5199909061193466, "step": 134 }, { "epoch": 0.0061818181818181816, "grad_norm": 18.5, "grad_norm_var": 3.5580729166666667, "learning_rate": 0.0001, "loss": 10.169, "loss/crossentropy": 2.619682729244232, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.44712138175964355, "step": 136 }, { "epoch": 0.006272727272727273, "grad_norm": 22.875, "grad_norm_var": 4.398893229166666, "learning_rate": 0.0001, "loss": 11.332, "loss/crossentropy": 3.037371039390564, "loss/hidden": 3.05078125, "loss/jsd": 0.0, "loss/logits": 0.5243876725435257, "step": 138 }, { "epoch": 0.006363636363636364, "grad_norm": 16.75, "grad_norm_var": 5.875, "learning_rate": 0.0001, "loss": 10.7355, "loss/crossentropy": 2.981704592704773, "loss/hidden": 3.02734375, "loss/jsd": 0.0, "loss/logits": 0.47264309227466583, "step": 140 }, { "epoch": 0.006454545454545454, "grad_norm": 18.875, "grad_norm_var": 5.864322916666667, "learning_rate": 0.0001, "loss": 10.5426, "loss/crossentropy": 2.586820363998413, "loss/hidden": 3.08984375, "loss/jsd": 0.0, "loss/logits": 0.48659584671258926, "step": 142 }, { "epoch": 0.006545454545454545, "grad_norm": 23.0, "grad_norm_var": 6.71875, "learning_rate": 0.0001, "loss": 11.7386, "loss/crossentropy": 3.298483431339264, "loss/hidden": 3.07421875, "loss/jsd": 0.0, "loss/logits": 0.5365912169218063, "step": 144 }, { "epoch": 0.006636363636363636, "grad_norm": 14.5625, "grad_norm_var": 5.843733723958334, "learning_rate": 0.0001, "loss": 10.8236, "loss/crossentropy": 2.9122031927108765, "loss/hidden": 3.01171875, "loss/jsd": 0.0, "loss/logits": 0.489966943860054, "step": 146 }, { "epoch": 0.006727272727272728, "grad_norm": 17.75, "grad_norm_var": 5.474593098958334, "learning_rate": 0.0001, "loss": 10.3807, "loss/crossentropy": 2.733789712190628, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.4600061699748039, "step": 148 }, { "epoch": 0.006818181818181818, "grad_norm": 17.625, "grad_norm_var": 5.888134765625, "learning_rate": 0.0001, "loss": 10.6615, "loss/crossentropy": 2.896587073802948, "loss/hidden": 2.99609375, "loss/jsd": 0.0, "loss/logits": 0.47687888890504837, "step": 150 }, { "epoch": 0.006909090909090909, "grad_norm": 18.0, "grad_norm_var": 5.942431640625, "learning_rate": 0.0001, "loss": 10.4981, "loss/crossentropy": 2.9280700087547302, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.46325472742319107, "step": 152 }, { "epoch": 0.007, "grad_norm": 19.375, "grad_norm_var": 4.960400390625, "learning_rate": 0.0001, "loss": 10.3229, "loss/crossentropy": 2.7962088882923126, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.4542351961135864, "step": 154 }, { "epoch": 0.007090909090909091, "grad_norm": 15.25, "grad_norm_var": 4.8384765625, "learning_rate": 0.0001, "loss": 10.5132, "loss/crossentropy": 2.8652373552322388, "loss/hidden": 2.9375, "loss/jsd": 0.0, "loss/logits": 0.4710420146584511, "step": 156 }, { "epoch": 0.007181818181818182, "grad_norm": 17.875, "grad_norm_var": 4.8150390625, "learning_rate": 0.0001, "loss": 10.4484, "loss/crossentropy": 2.914491832256317, "loss/hidden": 2.92578125, "loss/jsd": 0.0, "loss/logits": 0.4608093202114105, "step": 158 }, { "epoch": 0.007272727272727273, "grad_norm": 18.0, "grad_norm_var": 2.3598307291666667, "learning_rate": 0.0001, "loss": 10.4659, "loss/crossentropy": 2.919116258621216, "loss/hidden": 2.9296875, "loss/jsd": 0.0, "loss/logits": 0.4617074206471443, "step": 160 }, { "epoch": 0.007363636363636364, "grad_norm": 15.875, "grad_norm_var": 2.234619140625, "learning_rate": 0.0001, "loss": 10.4598, "loss/crossentropy": 2.8868841528892517, "loss/hidden": 2.9140625, "loss/jsd": 0.0, "loss/logits": 0.46588635444641113, "step": 162 }, { "epoch": 0.007454545454545454, "grad_norm": 19.125, "grad_norm_var": 2.377327473958333, "learning_rate": 0.0001, "loss": 10.4338, "loss/crossentropy": 2.674839496612549, "loss/hidden": 2.96484375, "loss/jsd": 0.0, "loss/logits": 0.47941194474697113, "step": 164 }, { "epoch": 0.007545454545454545, "grad_norm": 21.5, "grad_norm_var": 3.4983723958333335, "learning_rate": 0.0001, "loss": 9.8643, "loss/crossentropy": 2.656435251235962, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.4286011680960655, "step": 166 }, { "epoch": 0.0076363636363636364, "grad_norm": 18.125, "grad_norm_var": 3.3244140625, "learning_rate": 0.0001, "loss": 10.2057, "loss/crossentropy": 2.6137644052505493, "loss/hidden": 2.9453125, "loss/jsd": 0.0, "loss/logits": 0.46466120332479477, "step": 168 }, { "epoch": 0.007727272727272728, "grad_norm": 15.9375, "grad_norm_var": 2.9206868489583333, "learning_rate": 0.0001, "loss": 10.3836, "loss/crossentropy": 2.7845226526260376, "loss/hidden": 2.91796875, "loss/jsd": 0.0, "loss/logits": 0.4681082144379616, "step": 170 }, { "epoch": 0.007818181818181818, "grad_norm": 17.875, "grad_norm_var": 2.588997395833333, "learning_rate": 0.0001, "loss": 10.8833, "loss/crossentropy": 3.017181158065796, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.4944266676902771, "step": 172 }, { "epoch": 0.007909090909090909, "grad_norm": 15.5, "grad_norm_var": 2.972395833333333, "learning_rate": 0.0001, "loss": 9.8174, "loss/crossentropy": 2.776689827442169, "loss/hidden": 2.87890625, "loss/jsd": 0.0, "loss/logits": 0.4161801263689995, "step": 174 }, { "epoch": 0.008, "grad_norm": 13.3125, "grad_norm_var": 3.952978515625, "learning_rate": 0.0001, "loss": 9.2752, "loss/crossentropy": 2.333541750907898, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.4035414010286331, "step": 176 }, { "epoch": 0.008090909090909091, "grad_norm": 17.625, "grad_norm_var": 3.504931640625, "learning_rate": 0.0001, "loss": 9.9024, "loss/crossentropy": 2.8116763830184937, "loss/hidden": 2.859375, "loss/jsd": 0.0, "loss/logits": 0.42313288152217865, "step": 178 }, { "epoch": 0.008181818181818182, "grad_norm": 15.125, "grad_norm_var": 4.189322916666667, "learning_rate": 0.0001, "loss": 9.7316, "loss/crossentropy": 2.689953565597534, "loss/hidden": 2.83203125, "loss/jsd": 0.0, "loss/logits": 0.42096173018217087, "step": 180 }, { "epoch": 0.008272727272727272, "grad_norm": 17.75, "grad_norm_var": 2.833447265625, "learning_rate": 0.0001, "loss": 10.2971, "loss/crossentropy": 2.8897904753684998, "loss/hidden": 2.83203125, "loss/jsd": 0.0, "loss/logits": 0.45752543210983276, "step": 182 }, { "epoch": 0.008363636363636363, "grad_norm": 13.75, "grad_norm_var": 3.5320149739583333, "learning_rate": 0.0001, "loss": 9.6538, "loss/crossentropy": 2.5922796726226807, "loss/hidden": 2.78125, "loss/jsd": 0.0, "loss/logits": 0.4280288740992546, "step": 184 }, { "epoch": 0.008454545454545454, "grad_norm": 15.8125, "grad_norm_var": 3.536181640625, "learning_rate": 0.0001, "loss": 9.9893, "loss/crossentropy": 2.7334728837013245, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.44120776653289795, "step": 186 }, { "epoch": 0.008545454545454545, "grad_norm": 16.375, "grad_norm_var": 2.728125, "learning_rate": 0.0001, "loss": 9.8911, "loss/crossentropy": 2.848461151123047, "loss/hidden": 2.74609375, "loss/jsd": 0.0, "loss/logits": 0.4296583905816078, "step": 188 }, { "epoch": 0.008636363636363636, "grad_norm": 17.375, "grad_norm_var": 7.061458333333333, "learning_rate": 0.0001, "loss": 10.5802, "loss/crossentropy": 2.905729293823242, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.48072902858257294, "step": 190 }, { "epoch": 0.008727272727272728, "grad_norm": 16.0, "grad_norm_var": 6.444124348958334, "learning_rate": 0.0001, "loss": 9.9674, "loss/crossentropy": 2.834249794483185, "loss/hidden": 2.828125, "loss/jsd": 0.0, "loss/logits": 0.4305073842406273, "step": 192 }, { "epoch": 0.008818181818181819, "grad_norm": 15.5, "grad_norm_var": 6.712239583333333, "learning_rate": 0.0001, "loss": 9.6361, "loss/crossentropy": 2.7647129893302917, "loss/hidden": 2.76953125, "loss/jsd": 0.0, "loss/logits": 0.4101871997117996, "step": 194 }, { "epoch": 0.00890909090909091, "grad_norm": 14.0625, "grad_norm_var": 6.514583333333333, "learning_rate": 0.0001, "loss": 9.6583, "loss/crossentropy": 2.630301356315613, "loss/hidden": 2.76171875, "loss/jsd": 0.0, "loss/logits": 0.42662937194108963, "step": 196 }, { "epoch": 0.009, "grad_norm": 14.1875, "grad_norm_var": 6.610139973958334, "learning_rate": 0.0001, "loss": 9.527, "loss/crossentropy": 2.605971574783325, "loss/hidden": 2.77734375, "loss/jsd": 0.0, "loss/logits": 0.4143667370080948, "step": 198 }, { "epoch": 0.00909090909090909, "grad_norm": 34.5, "grad_norm_var": 26.2712890625, "learning_rate": 0.0001, "loss": 10.0793, "loss/crossentropy": 2.6595062017440796, "loss/hidden": 2.85546875, "loss/jsd": 0.0, "loss/logits": 0.4564330130815506, "step": 200 }, { "epoch": 0.009181818181818182, "grad_norm": 13.8125, "grad_norm_var": 27.052604166666665, "learning_rate": 0.0001, "loss": 9.5614, "loss/crossentropy": 2.5215924680233, "loss/hidden": 2.81640625, "loss/jsd": 0.0, "loss/logits": 0.42234230786561966, "step": 202 }, { "epoch": 0.009272727272727273, "grad_norm": 15.0, "grad_norm_var": 27.356884765625, "learning_rate": 0.0001, "loss": 9.9632, "loss/crossentropy": 2.9082372188568115, "loss/hidden": 2.703125, "loss/jsd": 0.0, "loss/logits": 0.43518833816051483, "step": 204 }, { "epoch": 0.009363636363636364, "grad_norm": 13.5, "grad_norm_var": 25.1822265625, "learning_rate": 0.0001, "loss": 9.5785, "loss/crossentropy": 2.8254444003105164, "loss/hidden": 2.69140625, "loss/jsd": 0.0, "loss/logits": 0.4061614125967026, "step": 206 }, { "epoch": 0.009454545454545455, "grad_norm": 15.1875, "grad_norm_var": 25.627718098958333, "learning_rate": 0.0001, "loss": 8.7696, "loss/crossentropy": 2.29874524474144, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.3713030517101288, "step": 208 }, { "epoch": 0.009545454545454546, "grad_norm": 23.5, "grad_norm_var": 28.11484375, "learning_rate": 0.0001, "loss": 10.515, "loss/crossentropy": 3.057756543159485, "loss/hidden": 2.77734375, "loss/jsd": 0.0, "loss/logits": 0.46799224615097046, "step": 210 }, { "epoch": 0.009636363636363636, "grad_norm": 14.4375, "grad_norm_var": 28.67421875, "learning_rate": 0.0001, "loss": 9.0239, "loss/crossentropy": 2.5103819966316223, "loss/hidden": 2.68359375, "loss/jsd": 0.0, "loss/logits": 0.3829887807369232, "step": 212 }, { "epoch": 0.009727272727272727, "grad_norm": 14.6875, "grad_norm_var": 28.4400390625, "learning_rate": 0.0001, "loss": 9.5994, "loss/crossentropy": 2.7546178698539734, "loss/hidden": 2.69140625, "loss/jsd": 0.0, "loss/logits": 0.4153418242931366, "step": 214 }, { "epoch": 0.009818181818181818, "grad_norm": 16.75, "grad_norm_var": 6.455143229166667, "learning_rate": 0.0001, "loss": 9.4972, "loss/crossentropy": 2.6819902062416077, "loss/hidden": 2.73046875, "loss/jsd": 0.0, "loss/logits": 0.40847061574459076, "step": 216 }, { "epoch": 0.009909090909090909, "grad_norm": 14.0, "grad_norm_var": 5.999853515625, "learning_rate": 0.0001, "loss": 9.3628, "loss/crossentropy": 2.7075770497322083, "loss/hidden": 2.69921875, "loss/jsd": 0.0, "loss/logits": 0.3956048712134361, "step": 218 }, { "epoch": 0.01, "grad_norm": 14.6875, "grad_norm_var": 6.166129557291667, "learning_rate": 0.0001, "loss": 9.1279, "loss/crossentropy": 2.551356852054596, "loss/hidden": 2.6875, "loss/jsd": 0.0, "loss/logits": 0.3889072462916374, "step": 220 }, { "epoch": 0.010090909090909091, "grad_norm": 13.5, "grad_norm_var": 6.169124348958333, "learning_rate": 0.0001, "loss": 9.3371, "loss/crossentropy": 2.6227335929870605, "loss/hidden": 2.73828125, "loss/jsd": 0.0, "loss/logits": 0.39760615676641464, "step": 222 }, { "epoch": 0.010181818181818183, "grad_norm": 13.875, "grad_norm_var": 6.494514973958333, "learning_rate": 0.0001, "loss": 9.2778, "loss/crossentropy": 2.6055907011032104, "loss/hidden": 2.6875, "loss/jsd": 0.0, "loss/logits": 0.39847317337989807, "step": 224 }, { "epoch": 0.010272727272727272, "grad_norm": 15.4375, "grad_norm_var": 1.147119140625, "learning_rate": 0.0001, "loss": 9.8481, "loss/crossentropy": 2.9287002086639404, "loss/hidden": 2.65234375, "loss/jsd": 0.0, "loss/logits": 0.426703080534935, "step": 226 }, { "epoch": 0.010363636363636363, "grad_norm": 15.5, "grad_norm_var": 1.1598307291666667, "learning_rate": 0.0001, "loss": 9.7062, "loss/crossentropy": 2.8765456080436707, "loss/hidden": 2.7109375, "loss/jsd": 0.0, "loss/logits": 0.4118741601705551, "step": 228 }, { "epoch": 0.010454545454545454, "grad_norm": 17.5, "grad_norm_var": 1.6874837239583333, "learning_rate": 0.0001, "loss": 9.5256, "loss/crossentropy": 2.782268702983856, "loss/hidden": 2.62890625, "loss/jsd": 0.0, "loss/logits": 0.41144131124019623, "step": 230 }, { "epoch": 0.010545454545454545, "grad_norm": 14.3125, "grad_norm_var": 1.7559895833333334, "learning_rate": 0.0001, "loss": 9.0658, "loss/crossentropy": 2.495389223098755, "loss/hidden": 2.671875, "loss/jsd": 0.0, "loss/logits": 0.3898563012480736, "step": 232 }, { "epoch": 0.010636363636363637, "grad_norm": 15.0625, "grad_norm_var": 1.7140625, "learning_rate": 0.0001, "loss": 9.0809, "loss/crossentropy": 2.5901657342910767, "loss/hidden": 2.67578125, "loss/jsd": 0.0, "loss/logits": 0.38149479776620865, "step": 234 }, { "epoch": 0.010727272727272728, "grad_norm": 13.9375, "grad_norm_var": 1.6908854166666667, "learning_rate": 0.0001, "loss": 9.4984, "loss/crossentropy": 2.7978809475898743, "loss/hidden": 2.66015625, "loss/jsd": 0.0, "loss/logits": 0.4040401205420494, "step": 236 }, { "epoch": 0.010818181818181819, "grad_norm": 14.5625, "grad_norm_var": 1.7769368489583333, "learning_rate": 0.0001, "loss": 9.3311, "loss/crossentropy": 2.760347902774811, "loss/hidden": 2.59765625, "loss/jsd": 0.0, "loss/logits": 0.39731085300445557, "step": 238 }, { "epoch": 0.01090909090909091, "grad_norm": 16.75, "grad_norm_var": 2.518684895833333, "learning_rate": 0.0001, "loss": 9.2608, "loss/crossentropy": 2.492695450782776, "loss/hidden": 2.69921875, "loss/jsd": 0.0, "loss/logits": 0.4068887531757355, "step": 240 }, { "epoch": 0.011, "grad_norm": 13.5, "grad_norm_var": 2.8421223958333335, "learning_rate": 0.0001, "loss": 9.4469, "loss/crossentropy": 2.7908156514167786, "loss/hidden": 2.61328125, "loss/jsd": 0.0, "loss/logits": 0.40427956730127335, "step": 242 }, { "epoch": 0.01109090909090909, "grad_norm": 15.4375, "grad_norm_var": 3.3954264322916665, "learning_rate": 0.0001, "loss": 9.534, "loss/crossentropy": 2.698845863342285, "loss/hidden": 2.67578125, "loss/jsd": 0.0, "loss/logits": 0.41593826562166214, "step": 244 }, { "epoch": 0.011181818181818182, "grad_norm": 14.75, "grad_norm_var": 3.221728515625, "learning_rate": 0.0001, "loss": 9.3478, "loss/crossentropy": 2.618387281894684, "loss/hidden": 2.6328125, "loss/jsd": 0.0, "loss/logits": 0.40965674072504044, "step": 246 }, { "epoch": 0.011272727272727273, "grad_norm": 13.625, "grad_norm_var": 3.0785807291666667, "learning_rate": 0.0001, "loss": 9.1122, "loss/crossentropy": 2.6165446043014526, "loss/hidden": 2.65625, "loss/jsd": 0.0, "loss/logits": 0.3839371129870415, "step": 248 }, { "epoch": 0.011363636363636364, "grad_norm": 13.75, "grad_norm_var": 3.006705729166667, "learning_rate": 0.0001, "loss": 8.9681, "loss/crossentropy": 2.5924696922302246, "loss/hidden": 2.66015625, "loss/jsd": 0.0, "loss/logits": 0.3715454265475273, "step": 250 }, { "epoch": 0.011454545454545455, "grad_norm": 14.375, "grad_norm_var": 3.519905598958333, "learning_rate": 0.0001, "loss": 8.5098, "loss/crossentropy": 2.41774845123291, "loss/hidden": 2.58203125, "loss/jsd": 0.0, "loss/logits": 0.3510003983974457, "step": 252 }, { "epoch": 0.011545454545454546, "grad_norm": 18.625, "grad_norm_var": 4.377587890625, "learning_rate": 0.0001, "loss": 9.4537, "loss/crossentropy": 2.7227142453193665, "loss/hidden": 2.62890625, "loss/jsd": 0.0, "loss/logits": 0.41020624339580536, "step": 254 }, { "epoch": 0.011636363636363636, "grad_norm": 23.125, "grad_norm_var": 8.021468098958334, "learning_rate": 0.0001, "loss": 10.1169, "loss/crossentropy": 3.064366936683655, "loss/hidden": 2.62109375, "loss/jsd": 0.0, "loss/logits": 0.44314325600862503, "step": 256 }, { "epoch": 0.011727272727272727, "grad_norm": 14.625, "grad_norm_var": 7.563655598958333, "learning_rate": 0.0001, "loss": 9.3507, "loss/crossentropy": 2.758595883846283, "loss/hidden": 2.59765625, "loss/jsd": 0.0, "loss/logits": 0.3994433060288429, "step": 258 }, { "epoch": 0.011818181818181818, "grad_norm": 13.5, "grad_norm_var": 7.050455729166667, "learning_rate": 0.0001, "loss": 9.588, "loss/crossentropy": 2.8884157538414, "loss/hidden": 2.609375, "loss/jsd": 0.0, "loss/logits": 0.40901610255241394, "step": 260 }, { "epoch": 0.011909090909090909, "grad_norm": 13.5625, "grad_norm_var": 6.827083333333333, "learning_rate": 0.0001, "loss": 9.2293, "loss/crossentropy": 2.7340936958789825, "loss/hidden": 2.58984375, "loss/jsd": 0.0, "loss/logits": 0.3905390128493309, "step": 262 }, { "epoch": 0.012, "grad_norm": 12.0, "grad_norm_var": 7.653125, "learning_rate": 0.0001, "loss": 9.0084, "loss/crossentropy": 2.6723862290382385, "loss/hidden": 2.5625, "loss/jsd": 0.0, "loss/logits": 0.37734799832105637, "step": 264 }, { "epoch": 0.012090909090909091, "grad_norm": 13.375, "grad_norm_var": 8.046988932291667, "learning_rate": 0.0001, "loss": 9.085, "loss/crossentropy": 2.672752320766449, "loss/hidden": 2.5625, "loss/jsd": 0.0, "loss/logits": 0.38497258722782135, "step": 266 }, { "epoch": 0.012181818181818183, "grad_norm": 19.75, "grad_norm_var": 8.901676432291667, "learning_rate": 0.0001, "loss": 9.2223, "loss/crossentropy": 2.5398565530776978, "loss/hidden": 2.61328125, "loss/jsd": 0.0, "loss/logits": 0.4069190099835396, "step": 268 }, { "epoch": 0.012272727272727272, "grad_norm": 14.6875, "grad_norm_var": 12.03046875, "learning_rate": 0.0001, "loss": 9.7148, "loss/crossentropy": 2.8930506110191345, "loss/hidden": 2.63671875, "loss/jsd": 0.0, "loss/logits": 0.41850143671035767, "step": 270 }, { "epoch": 0.012363636363636363, "grad_norm": 13.9375, "grad_norm_var": 7.841780598958334, "learning_rate": 0.0001, "loss": 9.8508, "loss/crossentropy": 2.9170458912849426, "loss/hidden": 2.60546875, "loss/jsd": 0.0, "loss/logits": 0.4328300356864929, "step": 272 }, { "epoch": 0.012454545454545454, "grad_norm": 13.125, "grad_norm_var": 8.309358723958333, "learning_rate": 0.0001, "loss": 8.9089, "loss/crossentropy": 2.558280646800995, "loss/hidden": 2.62890625, "loss/jsd": 0.0, "loss/logits": 0.37217412143945694, "step": 274 }, { "epoch": 0.012545454545454545, "grad_norm": 11.875, "grad_norm_var": 9.112239583333333, "learning_rate": 0.0001, "loss": 8.6441, "loss/crossentropy": 2.544630229473114, "loss/hidden": 2.5703125, "loss/jsd": 0.0, "loss/logits": 0.3529123440384865, "step": 276 }, { "epoch": 0.012636363636363637, "grad_norm": 14.125, "grad_norm_var": 9.032552083333334, "learning_rate": 0.0001, "loss": 9.1869, "loss/crossentropy": 2.820072293281555, "loss/hidden": 2.55859375, "loss/jsd": 0.0, "loss/logits": 0.3808233141899109, "step": 278 }, { "epoch": 0.012727272727272728, "grad_norm": 13.3125, "grad_norm_var": 8.459358723958333, "learning_rate": 0.0001, "loss": 9.1011, "loss/crossentropy": 2.7540424466133118, "loss/hidden": 2.55078125, "loss/jsd": 0.0, "loss/logits": 0.3796296864748001, "step": 280 }, { "epoch": 0.012818181818181819, "grad_norm": 15.1875, "grad_norm_var": 10.494645182291666, "learning_rate": 0.0001, "loss": 9.1534, "loss/crossentropy": 2.7221486568450928, "loss/hidden": 2.6171875, "loss/jsd": 0.0, "loss/logits": 0.38140640407800674, "step": 282 }, { "epoch": 0.012909090909090908, "grad_norm": 13.0625, "grad_norm_var": 9.551806640625, "learning_rate": 0.0001, "loss": 8.7025, "loss/crossentropy": 2.41121643781662, "loss/hidden": 2.53515625, "loss/jsd": 0.0, "loss/logits": 0.3756142780184746, "step": 284 }, { "epoch": 0.013, "grad_norm": 12.5625, "grad_norm_var": 5.731494140625, "learning_rate": 0.0001, "loss": 9.3659, "loss/crossentropy": 2.8278268575668335, "loss/hidden": 2.5703125, "loss/jsd": 0.0, "loss/logits": 0.3967735692858696, "step": 286 }, { "epoch": 0.01309090909090909, "grad_norm": 12.8125, "grad_norm_var": 5.968684895833333, "learning_rate": 0.0001, "loss": 9.3949, "loss/crossentropy": 2.8807522654533386, "loss/hidden": 2.52734375, "loss/jsd": 0.0, "loss/logits": 0.39867910742759705, "step": 288 }, { "epoch": 0.013181818181818182, "grad_norm": 12.1875, "grad_norm_var": 6.524072265625, "learning_rate": 0.0001, "loss": 9.6917, "loss/crossentropy": 2.8897382616996765, "loss/hidden": 2.53515625, "loss/jsd": 0.0, "loss/logits": 0.42668189853429794, "step": 290 }, { "epoch": 0.013272727272727273, "grad_norm": 14.4375, "grad_norm_var": 6.54296875, "learning_rate": 0.0001, "loss": 8.9844, "loss/crossentropy": 2.8252031803131104, "loss/hidden": 2.52734375, "loss/jsd": 0.0, "loss/logits": 0.36318885535001755, "step": 292 }, { "epoch": 0.013363636363636364, "grad_norm": 12.4375, "grad_norm_var": 6.718994140625, "learning_rate": 0.0001, "loss": 9.6619, "loss/crossentropy": 3.105584681034088, "loss/hidden": 2.4765625, "loss/jsd": 0.0, "loss/logits": 0.4079744666814804, "step": 294 }, { "epoch": 0.013454545454545455, "grad_norm": 13.8125, "grad_norm_var": 7.507747395833333, "learning_rate": 0.0001, "loss": 9.4414, "loss/crossentropy": 2.8170496821403503, "loss/hidden": 2.56640625, "loss/jsd": 0.0, "loss/logits": 0.40579015761613846, "step": 296 }, { "epoch": 0.013545454545454546, "grad_norm": 13.5625, "grad_norm_var": 3.870166015625, "learning_rate": 0.0001, "loss": 8.9055, "loss/crossentropy": 2.605405867099762, "loss/hidden": 2.55859375, "loss/jsd": 0.0, "loss/logits": 0.3741501048207283, "step": 298 }, { "epoch": 0.013636363636363636, "grad_norm": 13.75, "grad_norm_var": 3.445296223958333, "learning_rate": 0.0001, "loss": 9.2485, "loss/crossentropy": 2.8597434163093567, "loss/hidden": 2.48046875, "loss/jsd": 0.0, "loss/logits": 0.39082615822553635, "step": 300 }, { "epoch": 0.013727272727272727, "grad_norm": 11.75, "grad_norm_var": 3.449462890625, "learning_rate": 0.0001, "loss": 8.947, "loss/crossentropy": 2.7397614121437073, "loss/hidden": 2.48046875, "loss/jsd": 0.0, "loss/logits": 0.37267347425222397, "step": 302 }, { "epoch": 0.013818181818181818, "grad_norm": 11.4375, "grad_norm_var": 3.6916015625, "learning_rate": 0.0001, "loss": 9.1641, "loss/crossentropy": 2.7920069098472595, "loss/hidden": 2.54296875, "loss/jsd": 0.0, "loss/logits": 0.3829163387417793, "step": 304 }, { "epoch": 0.01390909090909091, "grad_norm": 13.0, "grad_norm_var": 2.435400390625, "learning_rate": 0.0001, "loss": 8.7156, "loss/crossentropy": 2.531908631324768, "loss/hidden": 2.515625, "loss/jsd": 0.0, "loss/logits": 0.36680569499731064, "step": 306 }, { "epoch": 0.014, "grad_norm": 12.6875, "grad_norm_var": 2.235400390625, "learning_rate": 0.0001, "loss": 9.2996, "loss/crossentropy": 3.001237988471985, "loss/hidden": 2.45703125, "loss/jsd": 0.0, "loss/logits": 0.3841311112046242, "step": 308 }, { "epoch": 0.014090909090909091, "grad_norm": 14.6875, "grad_norm_var": 2.56796875, "learning_rate": 0.0001, "loss": 8.7311, "loss/crossentropy": 2.6722575426101685, "loss/hidden": 2.53125, "loss/jsd": 0.0, "loss/logits": 0.3527621477842331, "step": 310 }, { "epoch": 0.014181818181818183, "grad_norm": 11.3125, "grad_norm_var": 1.308837890625, "learning_rate": 0.0001, "loss": 8.8622, "loss/crossentropy": 2.782529652118683, "loss/hidden": 2.45703125, "loss/jsd": 0.0, "loss/logits": 0.3622671514749527, "step": 312 }, { "epoch": 0.014272727272727272, "grad_norm": 21.0, "grad_norm_var": 5.386181640625, "learning_rate": 0.0001, "loss": 8.5863, "loss/crossentropy": 2.4452203512191772, "loss/hidden": 2.52734375, "loss/jsd": 0.0, "loss/logits": 0.36137835681438446, "step": 314 }, { "epoch": 0.014363636363636363, "grad_norm": 12.75, "grad_norm_var": 5.548551432291666, "learning_rate": 0.0001, "loss": 9.0814, "loss/crossentropy": 2.677475154399872, "loss/hidden": 2.5546875, "loss/jsd": 0.0, "loss/logits": 0.38492750376462936, "step": 316 }, { "epoch": 0.014454545454545454, "grad_norm": 14.0, "grad_norm_var": 5.764322916666667, "learning_rate": 0.0001, "loss": 8.7152, "loss/crossentropy": 2.5013190507888794, "loss/hidden": 2.51171875, "loss/jsd": 0.0, "loss/logits": 0.37022029608488083, "step": 318 }, { "epoch": 0.014545454545454545, "grad_norm": 12.25, "grad_norm_var": 5.815364583333333, "learning_rate": 0.0001, "loss": 8.567, "loss/crossentropy": 2.5590061247348785, "loss/hidden": 2.48828125, "loss/jsd": 0.0, "loss/logits": 0.3519688919186592, "step": 320 }, { "epoch": 0.014636363636363637, "grad_norm": 12.6875, "grad_norm_var": 6.220247395833334, "learning_rate": 0.0001, "loss": 8.2759, "loss/crossentropy": 2.4992421865463257, "loss/hidden": 2.4375, "loss/jsd": 0.0, "loss/logits": 0.33391469717025757, "step": 322 }, { "epoch": 0.014727272727272728, "grad_norm": 12.875, "grad_norm_var": 6.141129557291666, "learning_rate": 0.0001, "loss": 8.9395, "loss/crossentropy": 2.716381251811981, "loss/hidden": 2.49609375, "loss/jsd": 0.0, "loss/logits": 0.37269916385412216, "step": 324 }, { "epoch": 0.014818181818181819, "grad_norm": 11.625, "grad_norm_var": 5.943343098958334, "learning_rate": 0.0001, "loss": 9.1225, "loss/crossentropy": 2.8756073117256165, "loss/hidden": 2.41796875, "loss/jsd": 0.0, "loss/logits": 0.3828911632299423, "step": 326 }, { "epoch": 0.014909090909090908, "grad_norm": 12.1875, "grad_norm_var": 5.814567057291667, "learning_rate": 0.0001, "loss": 8.8982, "loss/crossentropy": 2.6761738061904907, "loss/hidden": 2.46484375, "loss/jsd": 0.0, "loss/logits": 0.37572193890810013, "step": 328 }, { "epoch": 0.015, "grad_norm": 13.125, "grad_norm_var": 1.4239420572916666, "learning_rate": 0.0001, "loss": 8.6842, "loss/crossentropy": 2.636396884918213, "loss/hidden": 2.52734375, "loss/jsd": 0.0, "loss/logits": 0.3520495444536209, "step": 330 }, { "epoch": 0.01509090909090909, "grad_norm": 13.4375, "grad_norm_var": 1.046728515625, "learning_rate": 0.0001, "loss": 8.9284, "loss/crossentropy": 2.851913332939148, "loss/hidden": 2.40234375, "loss/jsd": 0.0, "loss/logits": 0.3674178719520569, "step": 332 }, { "epoch": 0.015181818181818182, "grad_norm": 14.0625, "grad_norm_var": 1.049462890625, "learning_rate": 0.0001, "loss": 9.0419, "loss/crossentropy": 2.604884445667267, "loss/hidden": 2.51953125, "loss/jsd": 0.0, "loss/logits": 0.39174456149339676, "step": 334 }, { "epoch": 0.015272727272727273, "grad_norm": 11.625, "grad_norm_var": 1.146337890625, "learning_rate": 0.0001, "loss": 8.7998, "loss/crossentropy": 2.688425123691559, "loss/hidden": 2.46875, "loss/jsd": 0.0, "loss/logits": 0.36425749212503433, "step": 336 }, { "epoch": 0.015363636363636364, "grad_norm": 11.375, "grad_norm_var": 1.0736979166666667, "learning_rate": 0.0001, "loss": 8.7189, "loss/crossentropy": 2.6869476437568665, "loss/hidden": 2.4375, "loss/jsd": 0.0, "loss/logits": 0.3594486191868782, "step": 338 }, { "epoch": 0.015454545454545455, "grad_norm": 11.0625, "grad_norm_var": 0.991259765625, "learning_rate": 0.0001, "loss": 8.6851, "loss/crossentropy": 2.6936079263687134, "loss/hidden": 2.4140625, "loss/jsd": 0.0, "loss/logits": 0.3577471375465393, "step": 340 }, { "epoch": 0.015545454545454545, "grad_norm": 12.0625, "grad_norm_var": 1.1031087239583333, "learning_rate": 0.0001, "loss": 8.7423, "loss/crossentropy": 2.735546052455902, "loss/hidden": 2.4609375, "loss/jsd": 0.0, "loss/logits": 0.3545772135257721, "step": 342 }, { "epoch": 0.015636363636363636, "grad_norm": 11.375, "grad_norm_var": 1.2324055989583333, "learning_rate": 0.0001, "loss": 8.5214, "loss/crossentropy": 2.7685824036598206, "loss/hidden": 2.3984375, "loss/jsd": 0.0, "loss/logits": 0.3354339599609375, "step": 344 }, { "epoch": 0.01572727272727273, "grad_norm": 11.25, "grad_norm_var": 1.4828125, "learning_rate": 0.0001, "loss": 8.5284, "loss/crossentropy": 2.582988679409027, "loss/hidden": 2.4453125, "loss/jsd": 0.0, "loss/logits": 0.35001372545957565, "step": 346 }, { "epoch": 0.015818181818181818, "grad_norm": 13.4375, "grad_norm_var": 1.50859375, "learning_rate": 0.0001, "loss": 8.526, "loss/crossentropy": 2.7004936933517456, "loss/hidden": 2.359375, "loss/jsd": 0.0, "loss/logits": 0.3466150835156441, "step": 348 }, { "epoch": 0.015909090909090907, "grad_norm": 11.375, "grad_norm_var": 1.1788899739583334, "learning_rate": 0.0001, "loss": 8.8773, "loss/crossentropy": 2.786874234676361, "loss/hidden": 2.453125, "loss/jsd": 0.0, "loss/logits": 0.3637252077460289, "step": 350 }, { "epoch": 0.016, "grad_norm": 12.3125, "grad_norm_var": 3.801416015625, "learning_rate": 0.0001, "loss": 9.1101, "loss/crossentropy": 2.899165987968445, "loss/hidden": 2.44140625, "loss/jsd": 0.0, "loss/logits": 0.37695759534835815, "step": 352 }, { "epoch": 0.01609090909090909, "grad_norm": 12.0625, "grad_norm_var": 3.7973307291666667, "learning_rate": 0.0001, "loss": 8.1759, "loss/crossentropy": 2.530939280986786, "loss/hidden": 2.4375, "loss/jsd": 0.0, "loss/logits": 0.3207484185695648, "step": 354 }, { "epoch": 0.016181818181818183, "grad_norm": 11.8125, "grad_norm_var": 3.693863932291667, "learning_rate": 0.0001, "loss": 8.828, "loss/crossentropy": 2.8835015892982483, "loss/hidden": 2.37890625, "loss/jsd": 0.0, "loss/logits": 0.35655752569437027, "step": 356 }, { "epoch": 0.016272727272727272, "grad_norm": 10.5625, "grad_norm_var": 3.8604166666666666, "learning_rate": 0.0001, "loss": 8.5141, "loss/crossentropy": 2.6444206833839417, "loss/hidden": 2.390625, "loss/jsd": 0.0, "loss/logits": 0.3479056656360626, "step": 358 }, { "epoch": 0.016363636363636365, "grad_norm": 9.8125, "grad_norm_var": 4.1228515625, "learning_rate": 0.0001, "loss": 8.3345, "loss/crossentropy": 2.562632441520691, "loss/hidden": 2.3828125, "loss/jsd": 0.0, "loss/logits": 0.33890556544065475, "step": 360 }, { "epoch": 0.016454545454545454, "grad_norm": 10.9375, "grad_norm_var": 4.045768229166667, "learning_rate": 0.0001, "loss": 8.4674, "loss/crossentropy": 2.703312635421753, "loss/hidden": 2.375, "loss/jsd": 0.0, "loss/logits": 0.33890827000141144, "step": 362 }, { "epoch": 0.016545454545454544, "grad_norm": 11.1875, "grad_norm_var": 3.980452473958333, "learning_rate": 0.0001, "loss": 8.095, "loss/crossentropy": 2.4543430507183075, "loss/hidden": 2.37109375, "loss/jsd": 0.0, "loss/logits": 0.3269590809941292, "step": 364 }, { "epoch": 0.016636363636363637, "grad_norm": 10.3125, "grad_norm_var": 3.898697916666667, "learning_rate": 0.0001, "loss": 8.4241, "loss/crossentropy": 2.6735291481018066, "loss/hidden": 2.34375, "loss/jsd": 0.0, "loss/logits": 0.3406801074743271, "step": 366 }, { "epoch": 0.016727272727272726, "grad_norm": 13.375, "grad_norm_var": 0.991259765625, "learning_rate": 0.0001, "loss": 8.5529, "loss/crossentropy": 2.730522871017456, "loss/hidden": 2.38671875, "loss/jsd": 0.0, "loss/logits": 0.3435642421245575, "step": 368 }, { "epoch": 0.01681818181818182, "grad_norm": 11.25, "grad_norm_var": 1.2101399739583334, "learning_rate": 0.0001, "loss": 8.4523, "loss/crossentropy": 2.586120307445526, "loss/hidden": 2.41796875, "loss/jsd": 0.0, "loss/logits": 0.3448205813765526, "step": 370 }, { "epoch": 0.01690909090909091, "grad_norm": 10.875, "grad_norm_var": 1.177587890625, "learning_rate": 0.0001, "loss": 8.5475, "loss/crossentropy": 2.7360384464263916, "loss/hidden": 2.40234375, "loss/jsd": 0.0, "loss/logits": 0.3409164547920227, "step": 372 }, { "epoch": 0.017, "grad_norm": 11.125, "grad_norm_var": 1.1478515625, "learning_rate": 0.0001, "loss": 7.821, "loss/crossentropy": 2.4194540977478027, "loss/hidden": 2.36328125, "loss/jsd": 0.0, "loss/logits": 0.303824283182621, "step": 374 }, { "epoch": 0.01709090909090909, "grad_norm": 10.125, "grad_norm_var": 1.0817708333333333, "learning_rate": 0.0001, "loss": 7.7646, "loss/crossentropy": 2.3265328407287598, "loss/hidden": 2.34375, "loss/jsd": 0.0, "loss/logits": 0.309429794549942, "step": 376 }, { "epoch": 0.017181818181818184, "grad_norm": 11.3125, "grad_norm_var": 1.0749348958333333, "learning_rate": 0.0001, "loss": 8.4002, "loss/crossentropy": 2.776569664478302, "loss/hidden": 2.328125, "loss/jsd": 0.0, "loss/logits": 0.32955221831798553, "step": 378 }, { "epoch": 0.017272727272727273, "grad_norm": 13.0, "grad_norm_var": 1.7091145833333334, "learning_rate": 0.0001, "loss": 8.7814, "loss/crossentropy": 2.83564954996109, "loss/hidden": 2.40625, "loss/jsd": 0.0, "loss/logits": 0.3539498969912529, "step": 380 }, { "epoch": 0.017363636363636362, "grad_norm": 12.125, "grad_norm_var": 1.4356770833333334, "learning_rate": 0.0001, "loss": 8.5308, "loss/crossentropy": 2.587800443172455, "loss/hidden": 2.4140625, "loss/jsd": 0.0, "loss/logits": 0.3528904318809509, "step": 382 }, { "epoch": 0.017454545454545455, "grad_norm": 11.25, "grad_norm_var": 1.2106770833333333, "learning_rate": 0.0001, "loss": 8.6375, "loss/crossentropy": 2.8026124238967896, "loss/hidden": 2.35546875, "loss/jsd": 0.0, "loss/logits": 0.34793730080127716, "step": 384 }, { "epoch": 0.017545454545454545, "grad_norm": 11.625, "grad_norm_var": 1.2531087239583334, "learning_rate": 0.0001, "loss": 8.042, "loss/crossentropy": 2.5102298259735107, "loss/hidden": 2.328125, "loss/jsd": 0.0, "loss/logits": 0.32036813348531723, "step": 386 }, { "epoch": 0.017636363636363638, "grad_norm": 10.4375, "grad_norm_var": 1.4895182291666667, "learning_rate": 0.0001, "loss": 8.3852, "loss/crossentropy": 2.646978437900543, "loss/hidden": 2.34765625, "loss/jsd": 0.0, "loss/logits": 0.33905282616615295, "step": 388 }, { "epoch": 0.017727272727272727, "grad_norm": 12.125, "grad_norm_var": 1.5426920572916667, "learning_rate": 0.0001, "loss": 8.3731, "loss/crossentropy": 2.5710819959640503, "loss/hidden": 2.37109375, "loss/jsd": 0.0, "loss/logits": 0.34309565275907516, "step": 390 }, { "epoch": 0.01781818181818182, "grad_norm": 11.1875, "grad_norm_var": 1.4559733072916667, "learning_rate": 0.0001, "loss": 8.8706, "loss/crossentropy": 2.889687418937683, "loss/hidden": 2.33984375, "loss/jsd": 0.0, "loss/logits": 0.3641056716442108, "step": 392 }, { "epoch": 0.01790909090909091, "grad_norm": 11.125, "grad_norm_var": 1.519775390625, "learning_rate": 0.0001, "loss": 8.7537, "loss/crossentropy": 2.7842275500297546, "loss/hidden": 2.375, "loss/jsd": 0.0, "loss/logits": 0.35944826900959015, "step": 394 }, { "epoch": 0.018, "grad_norm": 15.125, "grad_norm_var": 1.717431640625, "learning_rate": 0.0001, "loss": 8.5649, "loss/crossentropy": 2.783021628856659, "loss/hidden": 2.359375, "loss/jsd": 0.0, "loss/logits": 0.3422454819083214, "step": 396 }, { "epoch": 0.01809090909090909, "grad_norm": 11.25, "grad_norm_var": 1.8356608072916667, "learning_rate": 0.0001, "loss": 8.3145, "loss/crossentropy": 2.513590395450592, "loss/hidden": 2.42578125, "loss/jsd": 0.0, "loss/logits": 0.3375101760029793, "step": 398 }, { "epoch": 0.01818181818181818, "grad_norm": 11.625, "grad_norm_var": 1.8501139322916667, "learning_rate": 0.0001, "loss": 8.3681, "loss/crossentropy": 2.6710524559020996, "loss/hidden": 2.34765625, "loss/jsd": 0.0, "loss/logits": 0.33494116365909576, "step": 400 }, { "epoch": 0.018272727272727274, "grad_norm": 12.0625, "grad_norm_var": 1.6458333333333333, "learning_rate": 0.0001, "loss": 8.8343, "loss/crossentropy": 2.8294795155525208, "loss/hidden": 2.37109375, "loss/jsd": 0.0, "loss/logits": 0.36336972564458847, "step": 402 }, { "epoch": 0.018363636363636363, "grad_norm": 10.5, "grad_norm_var": 1.5026041666666667, "learning_rate": 0.0001, "loss": 8.4655, "loss/crossentropy": 2.6821552515029907, "loss/hidden": 2.3671875, "loss/jsd": 0.0, "loss/logits": 0.3416147381067276, "step": 404 }, { "epoch": 0.018454545454545456, "grad_norm": 13.25, "grad_norm_var": 1.5629557291666667, "learning_rate": 0.0001, "loss": 8.0844, "loss/crossentropy": 2.552999794483185, "loss/hidden": 2.36328125, "loss/jsd": 0.0, "loss/logits": 0.31680778414011, "step": 406 }, { "epoch": 0.018545454545454546, "grad_norm": 11.0, "grad_norm_var": 1.535791015625, "learning_rate": 0.0001, "loss": 8.502, "loss/crossentropy": 2.7151389718055725, "loss/hidden": 2.3359375, "loss/jsd": 0.0, "loss/logits": 0.34509019553661346, "step": 408 }, { "epoch": 0.018636363636363635, "grad_norm": 11.5, "grad_norm_var": 1.5960774739583334, "learning_rate": 0.0001, "loss": 7.7881, "loss/crossentropy": 2.396757632493973, "loss/hidden": 2.34375, "loss/jsd": 0.0, "loss/logits": 0.3047561049461365, "step": 410 }, { "epoch": 0.018727272727272728, "grad_norm": 13.5625, "grad_norm_var": 1.0754557291666667, "learning_rate": 0.0001, "loss": 8.5818, "loss/crossentropy": 2.670750916004181, "loss/hidden": 2.39453125, "loss/jsd": 0.0, "loss/logits": 0.3516511395573616, "step": 412 }, { "epoch": 0.018818181818181817, "grad_norm": 13.0, "grad_norm_var": 0.9837890625, "learning_rate": 0.0001, "loss": 8.6154, "loss/crossentropy": 2.797249972820282, "loss/hidden": 2.35546875, "loss/jsd": 0.0, "loss/logits": 0.346266970038414, "step": 414 }, { "epoch": 0.01890909090909091, "grad_norm": 11.6875, "grad_norm_var": 1.1238932291666666, "learning_rate": 0.0001, "loss": 8.2706, "loss/crossentropy": 2.709167778491974, "loss/hidden": 2.328125, "loss/jsd": 0.0, "loss/logits": 0.3233323395252228, "step": 416 }, { "epoch": 0.019, "grad_norm": 11.9375, "grad_norm_var": 1.211181640625, "learning_rate": 0.0001, "loss": 8.1562, "loss/crossentropy": 2.614866018295288, "loss/hidden": 2.33984375, "loss/jsd": 0.0, "loss/logits": 0.320144958794117, "step": 418 }, { "epoch": 0.019090909090909092, "grad_norm": 11.0, "grad_norm_var": 1.302587890625, "learning_rate": 0.0001, "loss": 8.1919, "loss/crossentropy": 2.6071656346321106, "loss/hidden": 2.32421875, "loss/jsd": 0.0, "loss/logits": 0.326055821031332, "step": 420 }, { "epoch": 0.019181818181818182, "grad_norm": 13.625, "grad_norm_var": 1.7093098958333333, "learning_rate": 0.0001, "loss": 8.3695, "loss/crossentropy": 2.6707395911216736, "loss/hidden": 2.30078125, "loss/jsd": 0.0, "loss/logits": 0.33979544788599014, "step": 422 }, { "epoch": 0.01927272727272727, "grad_norm": 13.5625, "grad_norm_var": 1.9546223958333333, "learning_rate": 0.0001, "loss": 8.2709, "loss/crossentropy": 2.641660451889038, "loss/hidden": 2.26953125, "loss/jsd": 0.0, "loss/logits": 0.33597543835639954, "step": 424 }, { "epoch": 0.019363636363636364, "grad_norm": 13.875, "grad_norm_var": 2.1296875, "learning_rate": 0.0001, "loss": 8.2129, "loss/crossentropy": 2.54421603679657, "loss/hidden": 2.36328125, "loss/jsd": 0.0, "loss/logits": 0.33053911477327347, "step": 426 }, { "epoch": 0.019454545454545454, "grad_norm": 11.0625, "grad_norm_var": 1.9926920572916667, "learning_rate": 0.0001, "loss": 8.6249, "loss/crossentropy": 2.7779430150985718, "loss/hidden": 2.33203125, "loss/jsd": 0.0, "loss/logits": 0.3514908254146576, "step": 428 }, { "epoch": 0.019545454545454546, "grad_norm": 9.3125, "grad_norm_var": 2.3869140625, "learning_rate": 0.0001, "loss": 7.8996, "loss/crossentropy": 2.5229170620441437, "loss/hidden": 2.26953125, "loss/jsd": 0.0, "loss/logits": 0.31071098893880844, "step": 430 }, { "epoch": 0.019636363636363636, "grad_norm": 13.5, "grad_norm_var": 2.420556640625, "learning_rate": 0.0001, "loss": 8.7139, "loss/crossentropy": 2.653527021408081, "loss/hidden": 2.41796875, "loss/jsd": 0.0, "loss/logits": 0.3642421290278435, "step": 432 }, { "epoch": 0.01972727272727273, "grad_norm": 10.0625, "grad_norm_var": 2.5775390625, "learning_rate": 0.0001, "loss": 7.6068, "loss/crossentropy": 2.2957631945610046, "loss/hidden": 2.32421875, "loss/jsd": 0.0, "loss/logits": 0.29868000000715256, "step": 434 }, { "epoch": 0.019818181818181818, "grad_norm": 10.5625, "grad_norm_var": 2.86484375, "learning_rate": 0.0001, "loss": 8.112, "loss/crossentropy": 2.6191462874412537, "loss/hidden": 2.28125, "loss/jsd": 0.0, "loss/logits": 0.3211618736386299, "step": 436 }, { "epoch": 0.019909090909090908, "grad_norm": 9.875, "grad_norm_var": 2.755729166666667, "learning_rate": 0.0001, "loss": 8.3941, "loss/crossentropy": 2.616967797279358, "loss/hidden": 2.3515625, "loss/jsd": 0.0, "loss/logits": 0.34255898743867874, "step": 438 }, { "epoch": 0.02, "grad_norm": 9.8125, "grad_norm_var": 2.598160807291667, "learning_rate": 0.0001, "loss": 7.9619, "loss/crossentropy": 2.554704785346985, "loss/hidden": 2.26953125, "loss/jsd": 0.0, "loss/logits": 0.31376778334379196, "step": 440 }, { "epoch": 0.02009090909090909, "grad_norm": 9.9375, "grad_norm_var": 2.2606608072916665, "learning_rate": 0.0001, "loss": 8.0164, "loss/crossentropy": 2.353427827358246, "loss/hidden": 2.37109375, "loss/jsd": 0.0, "loss/logits": 0.32918737083673477, "step": 442 }, { "epoch": 0.020181818181818183, "grad_norm": 10.25, "grad_norm_var": 2.3843098958333333, "learning_rate": 0.0001, "loss": 8.0806, "loss/crossentropy": 2.506446123123169, "loss/hidden": 2.30078125, "loss/jsd": 0.0, "loss/logits": 0.32733579725027084, "step": 444 }, { "epoch": 0.020272727272727272, "grad_norm": 10.625, "grad_norm_var": 1.9190104166666666, "learning_rate": 0.0001, "loss": 7.698, "loss/crossentropy": 2.33599516749382, "loss/hidden": 2.34765625, "loss/jsd": 0.0, "loss/logits": 0.3014376536011696, "step": 446 }, { "epoch": 0.020363636363636365, "grad_norm": 10.375, "grad_norm_var": 1.9676920572916667, "learning_rate": 0.0001, "loss": 7.8728, "loss/crossentropy": 2.411829501390457, "loss/hidden": 2.3046875, "loss/jsd": 0.0, "loss/logits": 0.3156287297606468, "step": 448 }, { "epoch": 0.020454545454545454, "grad_norm": 10.625, "grad_norm_var": 1.9169108072916667, "learning_rate": 0.0001, "loss": 7.9374, "loss/crossentropy": 2.4188692569732666, "loss/hidden": 2.28125, "loss/jsd": 0.0, "loss/logits": 0.3237292766571045, "step": 450 }, { "epoch": 0.020545454545454544, "grad_norm": 13.5625, "grad_norm_var": 1.5669108072916667, "learning_rate": 0.0001, "loss": 8.7536, "loss/crossentropy": 2.893232226371765, "loss/hidden": 2.30859375, "loss/jsd": 0.0, "loss/logits": 0.35517967492341995, "step": 452 }, { "epoch": 0.020636363636363637, "grad_norm": 11.0625, "grad_norm_var": 1.374462890625, "learning_rate": 0.0001, "loss": 7.792, "loss/crossentropy": 2.4036229848861694, "loss/hidden": 2.3046875, "loss/jsd": 0.0, "loss/logits": 0.308364138007164, "step": 454 }, { "epoch": 0.020727272727272726, "grad_norm": 10.9375, "grad_norm_var": 1.339306640625, "learning_rate": 0.0001, "loss": 8.5055, "loss/crossentropy": 2.768629550933838, "loss/hidden": 2.26953125, "loss/jsd": 0.0, "loss/logits": 0.3467291072010994, "step": 456 }, { "epoch": 0.02081818181818182, "grad_norm": 12.4375, "grad_norm_var": 1.3377604166666666, "learning_rate": 0.0001, "loss": 8.3282, "loss/crossentropy": 2.7623966932296753, "loss/hidden": 2.26953125, "loss/jsd": 0.0, "loss/logits": 0.32962774485349655, "step": 458 }, { "epoch": 0.02090909090909091, "grad_norm": 12.1875, "grad_norm_var": 3.3535807291666666, "learning_rate": 0.0001, "loss": 8.5929, "loss/crossentropy": 2.8169997930526733, "loss/hidden": 2.30859375, "loss/jsd": 0.0, "loss/logits": 0.3467351570725441, "step": 460 }, { "epoch": 0.021, "grad_norm": 12.0, "grad_norm_var": 4.480843098958333, "learning_rate": 0.0001, "loss": 8.7597, "loss/crossentropy": 2.7858665585517883, "loss/hidden": 2.3203125, "loss/jsd": 0.0, "loss/logits": 0.3653511703014374, "step": 462 }, { "epoch": 0.02109090909090909, "grad_norm": 11.625, "grad_norm_var": 4.153108723958334, "learning_rate": 0.0001, "loss": 8.6191, "loss/crossentropy": 2.7461845874786377, "loss/hidden": 2.33203125, "loss/jsd": 0.0, "loss/logits": 0.3540932387113571, "step": 464 }, { "epoch": 0.02118181818181818, "grad_norm": 12.6875, "grad_norm_var": 4.174853515625, "learning_rate": 0.0001, "loss": 8.3249, "loss/crossentropy": 2.6806713342666626, "loss/hidden": 2.33984375, "loss/jsd": 0.0, "loss/logits": 0.33043457567691803, "step": 466 }, { "epoch": 0.021272727272727273, "grad_norm": 11.3125, "grad_norm_var": 4.267041015625, "learning_rate": 0.0001, "loss": 8.3501, "loss/crossentropy": 2.659107208251953, "loss/hidden": 2.2578125, "loss/jsd": 0.0, "loss/logits": 0.34332192689180374, "step": 468 }, { "epoch": 0.021363636363636362, "grad_norm": 10.6875, "grad_norm_var": 4.308317057291666, "learning_rate": 0.0001, "loss": 7.7764, "loss/crossentropy": 2.496876984834671, "loss/hidden": 2.2734375, "loss/jsd": 0.0, "loss/logits": 0.30061157047748566, "step": 470 }, { "epoch": 0.021454545454545455, "grad_norm": 11.75, "grad_norm_var": 4.2322265625, "learning_rate": 0.0001, "loss": 8.3674, "loss/crossentropy": 2.6399036645889282, "loss/hidden": 2.3125, "loss/jsd": 0.0, "loss/logits": 0.34149952232837677, "step": 472 }, { "epoch": 0.021545454545454545, "grad_norm": 11.125, "grad_norm_var": 4.517041015625, "learning_rate": 0.0001, "loss": 8.1385, "loss/crossentropy": 2.554620921611786, "loss/hidden": 2.30078125, "loss/jsd": 0.0, "loss/logits": 0.3283117413520813, "step": 474 }, { "epoch": 0.021636363636363638, "grad_norm": 10.375, "grad_norm_var": 2.7411295572916665, "learning_rate": 0.0001, "loss": 8.0673, "loss/crossentropy": 2.6156252026557922, "loss/hidden": 2.2265625, "loss/jsd": 0.0, "loss/logits": 0.32250892370939255, "step": 476 }, { "epoch": 0.021727272727272727, "grad_norm": 10.25, "grad_norm_var": 1.0794270833333333, "learning_rate": 0.0001, "loss": 7.8934, "loss/crossentropy": 2.5864129662513733, "loss/hidden": 2.23046875, "loss/jsd": 0.0, "loss/logits": 0.3076532110571861, "step": 478 }, { "epoch": 0.02181818181818182, "grad_norm": 11.5, "grad_norm_var": 1.0071451822916666, "learning_rate": 0.0001, "loss": 8.0505, "loss/crossentropy": 2.5434500575065613, "loss/hidden": 2.2890625, "loss/jsd": 0.0, "loss/logits": 0.32180205732584, "step": 480 }, { "epoch": 0.02190909090909091, "grad_norm": 11.0625, "grad_norm_var": 0.745166015625, "learning_rate": 0.0001, "loss": 8.183, "loss/crossentropy": 2.7674933671951294, "loss/hidden": 2.2109375, "loss/jsd": 0.0, "loss/logits": 0.3204602673649788, "step": 482 }, { "epoch": 0.022, "grad_norm": 10.4375, "grad_norm_var": 0.7457682291666666, "learning_rate": 0.0001, "loss": 8.6468, "loss/crossentropy": 2.942021131515503, "loss/hidden": 2.25, "loss/jsd": 0.0, "loss/logits": 0.3454737290740013, "step": 484 }, { "epoch": 0.02209090909090909, "grad_norm": 10.0, "grad_norm_var": 0.79375, "learning_rate": 0.0001, "loss": 7.8768, "loss/crossentropy": 2.484807789325714, "loss/hidden": 2.26171875, "loss/jsd": 0.0, "loss/logits": 0.31302617490291595, "step": 486 }, { "epoch": 0.02218181818181818, "grad_norm": 10.1875, "grad_norm_var": 0.35428059895833336, "learning_rate": 0.0001, "loss": 7.943, "loss/crossentropy": 2.550123691558838, "loss/hidden": 2.2421875, "loss/jsd": 0.0, "loss/logits": 0.31506694108247757, "step": 488 }, { "epoch": 0.022272727272727274, "grad_norm": 9.25, "grad_norm_var": 0.35494791666666664, "learning_rate": 0.0001, "loss": 8.1233, "loss/crossentropy": 2.5635900497436523, "loss/hidden": 2.27734375, "loss/jsd": 0.0, "loss/logits": 0.3282403200864792, "step": 490 }, { "epoch": 0.022363636363636363, "grad_norm": 10.0, "grad_norm_var": 1.1723795572916667, "learning_rate": 0.0001, "loss": 7.9939, "loss/crossentropy": 2.5055437684059143, "loss/hidden": 2.2890625, "loss/jsd": 0.0, "loss/logits": 0.31993044912815094, "step": 492 }, { "epoch": 0.022454545454545456, "grad_norm": 12.125, "grad_norm_var": 1.3438639322916666, "learning_rate": 0.0001, "loss": 8.2175, "loss/crossentropy": 2.652112066745758, "loss/hidden": 2.29296875, "loss/jsd": 0.0, "loss/logits": 0.3272392079234123, "step": 494 }, { "epoch": 0.022545454545454546, "grad_norm": 8.875, "grad_norm_var": 1.5690104166666667, "learning_rate": 0.0001, "loss": 8.1116, "loss/crossentropy": 2.6405823826789856, "loss/hidden": 2.2578125, "loss/jsd": 0.0, "loss/logits": 0.32132217288017273, "step": 496 }, { "epoch": 0.022636363636363635, "grad_norm": 10.5625, "grad_norm_var": 1.7269368489583334, "learning_rate": 0.0001, "loss": 8.1178, "loss/crossentropy": 2.667576551437378, "loss/hidden": 2.203125, "loss/jsd": 0.0, "loss/logits": 0.32470641285181046, "step": 498 }, { "epoch": 0.022727272727272728, "grad_norm": 10.75, "grad_norm_var": 1.741259765625, "learning_rate": 0.0001, "loss": 8.3204, "loss/crossentropy": 2.8687479496002197, "loss/hidden": 2.21484375, "loss/jsd": 0.0, "loss/logits": 0.3236808553338051, "step": 500 }, { "epoch": 0.022818181818181817, "grad_norm": 13.1875, "grad_norm_var": 2.022119140625, "learning_rate": 0.0001, "loss": 8.1201, "loss/crossentropy": 2.6463162302970886, "loss/hidden": 2.296875, "loss/jsd": 0.0, "loss/logits": 0.3176872879266739, "step": 502 }, { "epoch": 0.02290909090909091, "grad_norm": 13.4375, "grad_norm_var": 2.353385416666667, "learning_rate": 0.0001, "loss": 8.1764, "loss/crossentropy": 2.609562575817108, "loss/hidden": 2.25390625, "loss/jsd": 0.0, "loss/logits": 0.33129340410232544, "step": 504 }, { "epoch": 0.023, "grad_norm": 12.75, "grad_norm_var": 2.424934895833333, "learning_rate": 0.0001, "loss": 8.2057, "loss/crossentropy": 2.6960158348083496, "loss/hidden": 2.25, "loss/jsd": 0.0, "loss/logits": 0.3259701207280159, "step": 506 }, { "epoch": 0.023090909090909092, "grad_norm": 18.0, "grad_norm_var": 4.7296875, "learning_rate": 0.0001, "loss": 8.4436, "loss/crossentropy": 2.698199152946472, "loss/hidden": 2.2890625, "loss/jsd": 0.0, "loss/logits": 0.3456302210688591, "step": 508 }, { "epoch": 0.023181818181818182, "grad_norm": 12.0, "grad_norm_var": 5.015608723958334, "learning_rate": 0.0001, "loss": 8.0129, "loss/crossentropy": 2.550926625728607, "loss/hidden": 2.26953125, "loss/jsd": 0.0, "loss/logits": 0.31924838572740555, "step": 510 }, { "epoch": 0.02327272727272727, "grad_norm": 11.375, "grad_norm_var": 4.459830729166667, "learning_rate": 0.0001, "loss": 8.776, "loss/crossentropy": 3.0650025606155396, "loss/hidden": 2.26171875, "loss/jsd": 0.0, "loss/logits": 0.34492994099855423, "step": 512 }, { "epoch": 0.023363636363636364, "grad_norm": 10.25, "grad_norm_var": 4.067643229166666, "learning_rate": 0.0001, "loss": 7.8818, "loss/crossentropy": 2.4906539916992188, "loss/hidden": 2.26171875, "loss/jsd": 0.0, "loss/logits": 0.3129442036151886, "step": 514 }, { "epoch": 0.023454545454545454, "grad_norm": 14.375, "grad_norm_var": 4.483577473958333, "learning_rate": 0.0001, "loss": 8.142, "loss/crossentropy": 2.601547062397003, "loss/hidden": 2.2578125, "loss/jsd": 0.0, "loss/logits": 0.3282606452703476, "step": 516 }, { "epoch": 0.023545454545454546, "grad_norm": 9.375, "grad_norm_var": 4.9119140625, "learning_rate": 0.0001, "loss": 8.111, "loss/crossentropy": 2.596492826938629, "loss/hidden": 2.2578125, "loss/jsd": 0.0, "loss/logits": 0.3256702572107315, "step": 518 }, { "epoch": 0.023636363636363636, "grad_norm": 11.75, "grad_norm_var": 4.78125, "learning_rate": 0.0001, "loss": 8.3867, "loss/crossentropy": 2.790030539035797, "loss/hidden": 2.26171875, "loss/jsd": 0.0, "loss/logits": 0.33349260687828064, "step": 520 }, { "epoch": 0.02372727272727273, "grad_norm": 11.625, "grad_norm_var": 4.3166015625, "learning_rate": 0.0001, "loss": 8.2415, "loss/crossentropy": 2.652206540107727, "loss/hidden": 2.26171875, "loss/jsd": 0.0, "loss/logits": 0.3327581584453583, "step": 522 }, { "epoch": 0.023818181818181818, "grad_norm": 11.875, "grad_norm_var": 1.8856608072916667, "learning_rate": 0.0001, "loss": 8.0812, "loss/crossentropy": 2.704369068145752, "loss/hidden": 2.234375, "loss/jsd": 0.0, "loss/logits": 0.3142460435628891, "step": 524 }, { "epoch": 0.023909090909090908, "grad_norm": 10.6875, "grad_norm_var": 1.6207682291666667, "learning_rate": 0.0001, "loss": 8.4386, "loss/crossentropy": 2.885480046272278, "loss/hidden": 2.234375, "loss/jsd": 0.0, "loss/logits": 0.33187486231327057, "step": 526 }, { "epoch": 0.024, "grad_norm": 10.125, "grad_norm_var": 1.8825358072916667, "learning_rate": 0.0001, "loss": 8.1963, "loss/crossentropy": 2.7648484110832214, "loss/hidden": 2.17578125, "loss/jsd": 0.0, "loss/logits": 0.32556314766407013, "step": 528 }, { "epoch": 0.02409090909090909, "grad_norm": 8.9375, "grad_norm_var": 2.1669270833333334, "learning_rate": 0.0001, "loss": 7.5272, "loss/crossentropy": 2.4097719192504883, "loss/hidden": 2.22265625, "loss/jsd": 0.0, "loss/logits": 0.2894775718450546, "step": 530 }, { "epoch": 0.024181818181818183, "grad_norm": 10.4375, "grad_norm_var": 1.3554524739583333, "learning_rate": 0.0001, "loss": 7.896, "loss/crossentropy": 2.485668897628784, "loss/hidden": 2.24609375, "loss/jsd": 0.0, "loss/logits": 0.3164222612977028, "step": 532 }, { "epoch": 0.024272727272727272, "grad_norm": 11.375, "grad_norm_var": 0.7230305989583333, "learning_rate": 0.0001, "loss": 7.9694, "loss/crossentropy": 2.5782090425491333, "loss/hidden": 2.28125, "loss/jsd": 0.0, "loss/logits": 0.3109891787171364, "step": 534 }, { "epoch": 0.024363636363636365, "grad_norm": 10.4375, "grad_norm_var": 0.6688639322916666, "learning_rate": 0.0001, "loss": 7.9608, "loss/crossentropy": 2.5204774737358093, "loss/hidden": 2.23828125, "loss/jsd": 0.0, "loss/logits": 0.3202071040868759, "step": 536 }, { "epoch": 0.024454545454545455, "grad_norm": 10.5625, "grad_norm_var": 1.171337890625, "learning_rate": 0.0001, "loss": 8.8223, "loss/crossentropy": 2.8830824494361877, "loss/hidden": 2.24609375, "loss/jsd": 0.0, "loss/logits": 0.3693086728453636, "step": 538 }, { "epoch": 0.024545454545454544, "grad_norm": 10.75, "grad_norm_var": 1.321337890625, "learning_rate": 0.0001, "loss": 8.1052, "loss/crossentropy": 2.647070825099945, "loss/hidden": 2.26953125, "loss/jsd": 0.0, "loss/logits": 0.3188588097691536, "step": 540 }, { "epoch": 0.024636363636363637, "grad_norm": 16.25, "grad_norm_var": 3.1989420572916667, "learning_rate": 0.0001, "loss": 8.3473, "loss/crossentropy": 2.780426323413849, "loss/hidden": 2.24609375, "loss/jsd": 0.0, "loss/logits": 0.3320828005671501, "step": 542 }, { "epoch": 0.024727272727272726, "grad_norm": 13.1875, "grad_norm_var": 3.184309895833333, "learning_rate": 0.0001, "loss": 8.8009, "loss/crossentropy": 2.9271947741508484, "loss/hidden": 2.2734375, "loss/jsd": 0.0, "loss/logits": 0.36002635955810547, "step": 544 }, { "epoch": 0.02481818181818182, "grad_norm": 10.5625, "grad_norm_var": 3.256233723958333, "learning_rate": 0.0001, "loss": 8.2605, "loss/crossentropy": 2.7026455402374268, "loss/hidden": 2.21484375, "loss/jsd": 0.0, "loss/logits": 0.33430326730012894, "step": 546 }, { "epoch": 0.02490909090909091, "grad_norm": 11.0, "grad_norm_var": 3.420768229166667, "learning_rate": 0.0001, "loss": 7.8616, "loss/crossentropy": 2.5264296531677246, "loss/hidden": 2.23828125, "loss/jsd": 0.0, "loss/logits": 0.309689961373806, "step": 548 }, { "epoch": 0.025, "grad_norm": 10.75, "grad_norm_var": 3.6395670572916665, "learning_rate": 0.0001, "loss": 8.1425, "loss/crossentropy": 2.5754901468753815, "loss/hidden": 2.25390625, "loss/jsd": 0.0, "loss/logits": 0.33131010830402374, "step": 550 }, { "epoch": 0.02509090909090909, "grad_norm": 9.5625, "grad_norm_var": 3.7244140625, "learning_rate": 0.0001, "loss": 7.8923, "loss/crossentropy": 2.5499551594257355, "loss/hidden": 2.1953125, "loss/jsd": 0.0, "loss/logits": 0.314707949757576, "step": 552 }, { "epoch": 0.02518181818181818, "grad_norm": 9.9375, "grad_norm_var": 3.8551920572916667, "learning_rate": 0.0001, "loss": 7.9249, "loss/crossentropy": 2.554664134979248, "loss/hidden": 2.19921875, "loss/jsd": 0.0, "loss/logits": 0.3171003982424736, "step": 554 }, { "epoch": 0.025272727272727273, "grad_norm": 10.1875, "grad_norm_var": 3.8572265625, "learning_rate": 0.0001, "loss": 7.9685, "loss/crossentropy": 2.627944827079773, "loss/hidden": 2.22265625, "loss/jsd": 0.0, "loss/logits": 0.3117859438061714, "step": 556 }, { "epoch": 0.025363636363636363, "grad_norm": 10.75, "grad_norm_var": 2.089957682291667, "learning_rate": 0.0001, "loss": 7.8205, "loss/crossentropy": 2.5410854816436768, "loss/hidden": 2.22265625, "loss/jsd": 0.0, "loss/logits": 0.305672787129879, "step": 558 }, { "epoch": 0.025454545454545455, "grad_norm": 9.5625, "grad_norm_var": 1.6811848958333333, "learning_rate": 0.0001, "loss": 7.7811, "loss/crossentropy": 2.4522191882133484, "loss/hidden": 2.21875, "loss/jsd": 0.0, "loss/logits": 0.3110092058777809, "step": 560 }, { "epoch": 0.025545454545454545, "grad_norm": 10.625, "grad_norm_var": 0.6411295572916667, "learning_rate": 0.0001, "loss": 7.3377, "loss/crossentropy": 2.3402442634105682, "loss/hidden": 2.21875, "loss/jsd": 0.0, "loss/logits": 0.2778668701648712, "step": 562 }, { "epoch": 0.025636363636363638, "grad_norm": 9.6875, "grad_norm_var": 0.715869140625, "learning_rate": 0.0001, "loss": 7.6525, "loss/crossentropy": 2.4776196479797363, "loss/hidden": 2.15234375, "loss/jsd": 0.0, "loss/logits": 0.3022525757551193, "step": 564 }, { "epoch": 0.025727272727272727, "grad_norm": 15.9375, "grad_norm_var": 2.6844889322916665, "learning_rate": 0.0001, "loss": 7.8636, "loss/crossentropy": 2.5621307492256165, "loss/hidden": 2.1796875, "loss/jsd": 0.0, "loss/logits": 0.31217939406633377, "step": 566 }, { "epoch": 0.025818181818181817, "grad_norm": 10.375, "grad_norm_var": 2.5504557291666665, "learning_rate": 0.0001, "loss": 8.0997, "loss/crossentropy": 2.7002248764038086, "loss/hidden": 2.1796875, "loss/jsd": 0.0, "loss/logits": 0.32197698950767517, "step": 568 }, { "epoch": 0.02590909090909091, "grad_norm": 10.5, "grad_norm_var": 2.441650390625, "learning_rate": 0.0001, "loss": 8.0861, "loss/crossentropy": 2.607551157474518, "loss/hidden": 2.21484375, "loss/jsd": 0.0, "loss/logits": 0.32636839896440506, "step": 570 }, { "epoch": 0.026, "grad_norm": 10.9375, "grad_norm_var": 2.445556640625, "learning_rate": 0.0001, "loss": 7.9349, "loss/crossentropy": 2.5587512254714966, "loss/hidden": 2.21875, "loss/jsd": 0.0, "loss/logits": 0.31574322283267975, "step": 572 }, { "epoch": 0.02609090909090909, "grad_norm": 9.5625, "grad_norm_var": 2.949934895833333, "learning_rate": 0.0001, "loss": 7.842, "loss/crossentropy": 2.4658702611923218, "loss/hidden": 2.22265625, "loss/jsd": 0.0, "loss/logits": 0.31534551084041595, "step": 574 }, { "epoch": 0.02618181818181818, "grad_norm": 8.375, "grad_norm_var": 3.316389973958333, "learning_rate": 0.0001, "loss": 7.5601, "loss/crossentropy": 2.5368428826332092, "loss/hidden": 2.16796875, "loss/jsd": 0.0, "loss/logits": 0.285533145070076, "step": 576 }, { "epoch": 0.026272727272727274, "grad_norm": 10.5, "grad_norm_var": 3.1617024739583335, "learning_rate": 0.0001, "loss": 7.4771, "loss/crossentropy": 2.34205424785614, "loss/hidden": 2.2109375, "loss/jsd": 0.0, "loss/logits": 0.2924086004495621, "step": 578 }, { "epoch": 0.026363636363636363, "grad_norm": 9.6875, "grad_norm_var": 3.1083170572916665, "learning_rate": 0.0001, "loss": 8.4057, "loss/crossentropy": 2.9491363763809204, "loss/hidden": 2.2265625, "loss/jsd": 0.0, "loss/logits": 0.3230031058192253, "step": 580 }, { "epoch": 0.026454545454545456, "grad_norm": 10.1875, "grad_norm_var": 1.084375, "learning_rate": 0.0001, "loss": 7.6038, "loss/crossentropy": 2.477908134460449, "loss/hidden": 2.1796875, "loss/jsd": 0.0, "loss/logits": 0.2946201488375664, "step": 582 }, { "epoch": 0.026545454545454546, "grad_norm": 36.25, "grad_norm_var": 43.303125, "learning_rate": 0.0001, "loss": 8.5974, "loss/crossentropy": 2.454878717660904, "loss/hidden": 2.2890625, "loss/jsd": 0.0, "loss/logits": 0.38534318655729294, "step": 584 }, { "epoch": 0.026636363636363635, "grad_norm": 10.8125, "grad_norm_var": 43.08802083333333, "learning_rate": 0.0001, "loss": 8.2631, "loss/crossentropy": 2.753837764263153, "loss/hidden": 2.21484375, "loss/jsd": 0.0, "loss/logits": 0.32944564521312714, "step": 586 }, { "epoch": 0.026727272727272728, "grad_norm": 10.3125, "grad_norm_var": 43.13318684895833, "learning_rate": 0.0001, "loss": 7.8017, "loss/crossentropy": 2.5285814702510834, "loss/hidden": 2.20703125, "loss/jsd": 0.0, "loss/logits": 0.30660726875066757, "step": 588 }, { "epoch": 0.026818181818181817, "grad_norm": 10.9375, "grad_norm_var": 43.1984375, "learning_rate": 0.0001, "loss": 7.4015, "loss/crossentropy": 2.4034638702869415, "loss/hidden": 2.171875, "loss/jsd": 0.0, "loss/logits": 0.2826210930943489, "step": 590 }, { "epoch": 0.02690909090909091, "grad_norm": 11.4375, "grad_norm_var": 42.2697265625, "learning_rate": 0.0001, "loss": 7.9889, "loss/crossentropy": 2.75188148021698, "loss/hidden": 2.125, "loss/jsd": 0.0, "loss/logits": 0.31120583415031433, "step": 592 }, { "epoch": 0.027, "grad_norm": 10.25, "grad_norm_var": 42.279541015625, "learning_rate": 0.0001, "loss": 8.1385, "loss/crossentropy": 2.7894468307495117, "loss/hidden": 2.16796875, "loss/jsd": 0.0, "loss/logits": 0.318103663623333, "step": 594 }, { "epoch": 0.027090909090909093, "grad_norm": 11.3125, "grad_norm_var": 42.86484375, "learning_rate": 0.0001, "loss": 8.331, "loss/crossentropy": 2.7955238819122314, "loss/hidden": 2.203125, "loss/jsd": 0.0, "loss/logits": 0.33323612809181213, "step": 596 }, { "epoch": 0.027181818181818182, "grad_norm": 11.25, "grad_norm_var": 42.78795572916667, "learning_rate": 0.0001, "loss": 7.6709, "loss/crossentropy": 2.41246235370636, "loss/hidden": 2.2578125, "loss/jsd": 0.0, "loss/logits": 0.30006297677755356, "step": 598 }, { "epoch": 0.02727272727272727, "grad_norm": 9.75, "grad_norm_var": 2.7202962239583335, "learning_rate": 0.0001, "loss": 8.0377, "loss/crossentropy": 2.7179266810417175, "loss/hidden": 2.18359375, "loss/jsd": 0.0, "loss/logits": 0.31362052261829376, "step": 600 }, { "epoch": 0.027363636363636364, "grad_norm": 9.875, "grad_norm_var": 2.6809895833333335, "learning_rate": 0.0001, "loss": 7.9193, "loss/crossentropy": 2.531787395477295, "loss/hidden": 2.203125, "loss/jsd": 0.0, "loss/logits": 0.3184369429945946, "step": 602 }, { "epoch": 0.027454545454545454, "grad_norm": 10.25, "grad_norm_var": 2.6973307291666666, "learning_rate": 0.0001, "loss": 8.3628, "loss/crossentropy": 2.9673649668693542, "loss/hidden": 2.1640625, "loss/jsd": 0.0, "loss/logits": 0.32313745468854904, "step": 604 }, { "epoch": 0.027545454545454547, "grad_norm": 8.875, "grad_norm_var": 2.8258951822916667, "learning_rate": 0.0001, "loss": 7.4802, "loss/crossentropy": 2.3611857891082764, "loss/hidden": 2.15625, "loss/jsd": 0.0, "loss/logits": 0.2962745800614357, "step": 606 }, { "epoch": 0.027636363636363636, "grad_norm": 11.3125, "grad_norm_var": 2.7749348958333333, "learning_rate": 0.0001, "loss": 8.1291, "loss/crossentropy": 2.809281885623932, "loss/hidden": 2.1484375, "loss/jsd": 0.0, "loss/logits": 0.3171345889568329, "step": 608 }, { "epoch": 0.02772727272727273, "grad_norm": 10.0, "grad_norm_var": 2.9520833333333334, "learning_rate": 0.0001, "loss": 8.2911, "loss/crossentropy": 2.8731058835983276, "loss/hidden": 2.1640625, "loss/jsd": 0.0, "loss/logits": 0.3253922685980797, "step": 610 }, { "epoch": 0.02781818181818182, "grad_norm": 9.375, "grad_norm_var": 0.7218098958333333, "learning_rate": 0.0001, "loss": 8.1653, "loss/crossentropy": 2.776527464389801, "loss/hidden": 2.140625, "loss/jsd": 0.0, "loss/logits": 0.324818417429924, "step": 612 }, { "epoch": 0.027909090909090908, "grad_norm": 9.1875, "grad_norm_var": 0.6809895833333334, "learning_rate": 0.0001, "loss": 7.8243, "loss/crossentropy": 2.640493869781494, "loss/hidden": 2.18359375, "loss/jsd": 0.0, "loss/logits": 0.3000243678689003, "step": 614 }, { "epoch": 0.028, "grad_norm": 8.3125, "grad_norm_var": 0.8723958333333334, "learning_rate": 0.0001, "loss": 7.7585, "loss/crossentropy": 2.6909168362617493, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.29738467931747437, "step": 616 }, { "epoch": 0.02809090909090909, "grad_norm": 9.625, "grad_norm_var": 0.9001139322916667, "learning_rate": 0.0001, "loss": 8.0147, "loss/crossentropy": 2.7001615166664124, "loss/hidden": 2.1953125, "loss/jsd": 0.0, "loss/logits": 0.3119227960705757, "step": 618 }, { "epoch": 0.028181818181818183, "grad_norm": 9.0, "grad_norm_var": 0.9379557291666667, "learning_rate": 0.0001, "loss": 8.2076, "loss/crossentropy": 2.793542444705963, "loss/hidden": 2.16796875, "loss/jsd": 0.0, "loss/logits": 0.3246040418744087, "step": 620 }, { "epoch": 0.028272727272727272, "grad_norm": 8.5625, "grad_norm_var": 1.080322265625, "learning_rate": 0.0001, "loss": 7.5741, "loss/crossentropy": 2.5615251660346985, "loss/hidden": 2.1015625, "loss/jsd": 0.0, "loss/logits": 0.2910986915230751, "step": 622 }, { "epoch": 0.028363636363636365, "grad_norm": 9.0625, "grad_norm_var": 1.0322265625, "learning_rate": 0.0001, "loss": 7.8794, "loss/crossentropy": 2.72199022769928, "loss/hidden": 2.171875, "loss/jsd": 0.0, "loss/logits": 0.2985563427209854, "step": 624 }, { "epoch": 0.028454545454545455, "grad_norm": 10.75, "grad_norm_var": 0.6469889322916667, "learning_rate": 0.0001, "loss": 8.1671, "loss/crossentropy": 2.833717405796051, "loss/hidden": 2.13671875, "loss/jsd": 0.0, "loss/logits": 0.31966758519411087, "step": 626 }, { "epoch": 0.028545454545454544, "grad_norm": 9.3125, "grad_norm_var": 0.5358723958333333, "learning_rate": 0.0001, "loss": 7.5222, "loss/crossentropy": 2.452955663204193, "loss/hidden": 2.171875, "loss/jsd": 0.0, "loss/logits": 0.2897372171282768, "step": 628 }, { "epoch": 0.028636363636363637, "grad_norm": 9.9375, "grad_norm_var": 0.5601399739583334, "learning_rate": 0.0001, "loss": 7.6215, "loss/crossentropy": 2.5359931588172913, "loss/hidden": 2.14453125, "loss/jsd": 0.0, "loss/logits": 0.2941012308001518, "step": 630 }, { "epoch": 0.028727272727272726, "grad_norm": 10.25, "grad_norm_var": 0.5344889322916667, "learning_rate": 0.0001, "loss": 8.0759, "loss/crossentropy": 2.8469958901405334, "loss/hidden": 2.12890625, "loss/jsd": 0.0, "loss/logits": 0.3099951893091202, "step": 632 }, { "epoch": 0.02881818181818182, "grad_norm": 8.5625, "grad_norm_var": 0.5770833333333333, "learning_rate": 0.0001, "loss": 7.3189, "loss/crossentropy": 2.409875512123108, "loss/hidden": 2.15625, "loss/jsd": 0.0, "loss/logits": 0.27527838945388794, "step": 634 }, { "epoch": 0.02890909090909091, "grad_norm": 10.6875, "grad_norm_var": 3.0629557291666667, "learning_rate": 0.0001, "loss": 8.0163, "loss/crossentropy": 2.6671429872512817, "loss/hidden": 2.171875, "loss/jsd": 0.0, "loss/logits": 0.31773262470960617, "step": 636 }, { "epoch": 0.029, "grad_norm": 9.875, "grad_norm_var": 2.7745930989583334, "learning_rate": 0.0001, "loss": 8.0368, "loss/crossentropy": 2.698503255844116, "loss/hidden": 2.203125, "loss/jsd": 0.0, "loss/logits": 0.31352080404758453, "step": 638 }, { "epoch": 0.02909090909090909, "grad_norm": 10.1875, "grad_norm_var": 2.8098958333333335, "learning_rate": 0.0001, "loss": 8.1273, "loss/crossentropy": 2.802562892436981, "loss/hidden": 2.16015625, "loss/jsd": 0.0, "loss/logits": 0.316461943089962, "step": 640 }, { "epoch": 0.02918181818181818, "grad_norm": 9.1875, "grad_norm_var": 2.8916015625, "learning_rate": 0.0001, "loss": 7.4003, "loss/crossentropy": 2.49009370803833, "loss/hidden": 2.125, "loss/jsd": 0.0, "loss/logits": 0.27852414920926094, "step": 642 }, { "epoch": 0.029272727272727273, "grad_norm": 10.4375, "grad_norm_var": 2.767822265625, "learning_rate": 0.0001, "loss": 7.6419, "loss/crossentropy": 2.635101616382599, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.2913009002804756, "step": 644 }, { "epoch": 0.029363636363636363, "grad_norm": 10.4375, "grad_norm_var": 2.7952473958333335, "learning_rate": 0.0001, "loss": 7.7213, "loss/crossentropy": 2.619856894016266, "loss/hidden": 2.10546875, "loss/jsd": 0.0, "loss/logits": 0.299600325524807, "step": 646 }, { "epoch": 0.029454545454545455, "grad_norm": 10.0625, "grad_norm_var": 2.8997395833333335, "learning_rate": 0.0001, "loss": 7.5925, "loss/crossentropy": 2.5130884647369385, "loss/hidden": 2.14453125, "loss/jsd": 0.0, "loss/logits": 0.29348304122686386, "step": 648 }, { "epoch": 0.029545454545454545, "grad_norm": 10.375, "grad_norm_var": 2.7726399739583334, "learning_rate": 0.0001, "loss": 8.1776, "loss/crossentropy": 2.823518395423889, "loss/hidden": 2.16015625, "loss/jsd": 0.0, "loss/logits": 0.31939148157835007, "step": 650 }, { "epoch": 0.029636363636363638, "grad_norm": 9.4375, "grad_norm_var": 0.7644368489583333, "learning_rate": 0.0001, "loss": 7.4596, "loss/crossentropy": 2.5506816506385803, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.2830839157104492, "step": 652 }, { "epoch": 0.029727272727272727, "grad_norm": 10.0625, "grad_norm_var": 4.125764973958334, "learning_rate": 0.0001, "loss": 8.0667, "loss/crossentropy": 2.6991006731987, "loss/hidden": 2.1328125, "loss/jsd": 0.0, "loss/logits": 0.32347649335861206, "step": 654 }, { "epoch": 0.029818181818181817, "grad_norm": 11.875, "grad_norm_var": 4.1859375, "learning_rate": 0.0001, "loss": 7.7135, "loss/crossentropy": 2.6317290663719177, "loss/hidden": 2.11328125, "loss/jsd": 0.0, "loss/logits": 0.2968512699007988, "step": 656 }, { "epoch": 0.02990909090909091, "grad_norm": 8.5625, "grad_norm_var": 4.144254557291666, "learning_rate": 0.0001, "loss": 7.9103, "loss/crossentropy": 2.6656662225723267, "loss/hidden": 2.109375, "loss/jsd": 0.0, "loss/logits": 0.313529334962368, "step": 658 }, { "epoch": 0.03, "grad_norm": 9.25, "grad_norm_var": 4.246614583333334, "learning_rate": 0.0001, "loss": 7.6871, "loss/crossentropy": 2.611989140510559, "loss/hidden": 2.1015625, "loss/jsd": 0.0, "loss/logits": 0.29735246300697327, "step": 660 }, { "epoch": 0.030090909090909092, "grad_norm": 10.1875, "grad_norm_var": 4.289176432291667, "learning_rate": 0.0001, "loss": 7.4345, "loss/crossentropy": 2.5160269737243652, "loss/hidden": 2.11328125, "loss/jsd": 0.0, "loss/logits": 0.280514195561409, "step": 662 }, { "epoch": 0.03018181818181818, "grad_norm": 10.25, "grad_norm_var": 4.098893229166666, "learning_rate": 0.0001, "loss": 8.1254, "loss/crossentropy": 2.7559604048728943, "loss/hidden": 2.16015625, "loss/jsd": 0.0, "loss/logits": 0.32093099504709244, "step": 664 }, { "epoch": 0.030272727272727274, "grad_norm": 10.0, "grad_norm_var": 4.301497395833334, "learning_rate": 0.0001, "loss": 7.4294, "loss/crossentropy": 2.280423253774643, "loss/hidden": 2.12109375, "loss/jsd": 0.0, "loss/logits": 0.3027914687991142, "step": 666 }, { "epoch": 0.030363636363636363, "grad_norm": 9.875, "grad_norm_var": 4.259309895833334, "learning_rate": 0.0001, "loss": 8.0548, "loss/crossentropy": 2.7498602867126465, "loss/hidden": 2.1015625, "loss/jsd": 0.0, "loss/logits": 0.32033535093069077, "step": 668 }, { "epoch": 0.030454545454545453, "grad_norm": 9.375, "grad_norm_var": 1.0155598958333334, "learning_rate": 0.0001, "loss": 7.6655, "loss/crossentropy": 2.5924076437950134, "loss/hidden": 2.109375, "loss/jsd": 0.0, "loss/logits": 0.2963766008615494, "step": 670 }, { "epoch": 0.030545454545454546, "grad_norm": 10.8125, "grad_norm_var": 0.6900390625, "learning_rate": 0.0001, "loss": 7.4532, "loss/crossentropy": 2.4079315662384033, "loss/hidden": 2.13671875, "loss/jsd": 0.0, "loss/logits": 0.29085320234298706, "step": 672 }, { "epoch": 0.030636363636363635, "grad_norm": 10.5, "grad_norm_var": 0.6484212239583333, "learning_rate": 0.0001, "loss": 7.9982, "loss/crossentropy": 2.825055956840515, "loss/hidden": 2.1796875, "loss/jsd": 0.0, "loss/logits": 0.2993457019329071, "step": 674 }, { "epoch": 0.030727272727272728, "grad_norm": 10.25, "grad_norm_var": 0.657275390625, "learning_rate": 0.0001, "loss": 8.002, "loss/crossentropy": 2.788451671600342, "loss/hidden": 2.11328125, "loss/jsd": 0.0, "loss/logits": 0.3100292459130287, "step": 676 }, { "epoch": 0.030818181818181817, "grad_norm": 9.0625, "grad_norm_var": 0.693603515625, "learning_rate": 0.0001, "loss": 7.7803, "loss/crossentropy": 2.6578460931777954, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.3028682470321655, "step": 678 }, { "epoch": 0.03090909090909091, "grad_norm": 9.9375, "grad_norm_var": 0.70859375, "learning_rate": 0.0001, "loss": 7.8692, "loss/crossentropy": 2.759332537651062, "loss/hidden": 2.11328125, "loss/jsd": 0.0, "loss/logits": 0.2996634691953659, "step": 680 }, { "epoch": 0.031, "grad_norm": 9.125, "grad_norm_var": 0.611962890625, "learning_rate": 0.0001, "loss": 7.6867, "loss/crossentropy": 2.6121044158935547, "loss/hidden": 2.12890625, "loss/jsd": 0.0, "loss/logits": 0.29457370936870575, "step": 682 }, { "epoch": 0.03109090909090909, "grad_norm": 10.5, "grad_norm_var": 0.5301432291666667, "learning_rate": 0.0001, "loss": 7.9708, "loss/crossentropy": 2.8594168424606323, "loss/hidden": 2.1328125, "loss/jsd": 0.0, "loss/logits": 0.2978571951389313, "step": 684 }, { "epoch": 0.031181818181818182, "grad_norm": 9.875, "grad_norm_var": 0.5416666666666666, "learning_rate": 0.0001, "loss": 7.4286, "loss/crossentropy": 2.434048056602478, "loss/hidden": 2.08203125, "loss/jsd": 0.0, "loss/logits": 0.2912544161081314, "step": 686 }, { "epoch": 0.03127272727272727, "grad_norm": 9.375, "grad_norm_var": 0.48020833333333335, "learning_rate": 0.0001, "loss": 7.9005, "loss/crossentropy": 2.760048270225525, "loss/hidden": 2.12890625, "loss/jsd": 0.0, "loss/logits": 0.3011501580476761, "step": 688 }, { "epoch": 0.031363636363636364, "grad_norm": 8.4375, "grad_norm_var": 0.4351399739583333, "learning_rate": 0.0001, "loss": 7.6395, "loss/crossentropy": 2.559808075428009, "loss/hidden": 2.1015625, "loss/jsd": 0.0, "loss/logits": 0.2978133335709572, "step": 690 }, { "epoch": 0.03145454545454546, "grad_norm": 9.4375, "grad_norm_var": 0.47237955729166664, "learning_rate": 0.0001, "loss": 7.7654, "loss/crossentropy": 2.6951447129249573, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.29843124747276306, "step": 692 }, { "epoch": 0.03154545454545454, "grad_norm": 9.125, "grad_norm_var": 0.3431640625, "learning_rate": 0.0001, "loss": 7.6742, "loss/crossentropy": 2.6581901907920837, "loss/hidden": 2.09765625, "loss/jsd": 0.0, "loss/logits": 0.2918402776122093, "step": 694 }, { "epoch": 0.031636363636363636, "grad_norm": 31.125, "grad_norm_var": 30.069384765625, "learning_rate": 0.0001, "loss": 8.7986, "loss/crossentropy": 2.8296613097190857, "loss/hidden": 2.1171875, "loss/jsd": 0.0, "loss/logits": 0.38517018407583237, "step": 696 }, { "epoch": 0.03172727272727273, "grad_norm": 11.75, "grad_norm_var": 30.053059895833332, "learning_rate": 0.0001, "loss": 8.4319, "loss/crossentropy": 2.956857204437256, "loss/hidden": 2.171875, "loss/jsd": 0.0, "loss/logits": 0.3303192928433418, "step": 698 }, { "epoch": 0.031818181818181815, "grad_norm": 8.6875, "grad_norm_var": 30.252197265625, "learning_rate": 0.0001, "loss": 7.8927, "loss/crossentropy": 2.6550886631011963, "loss/hidden": 2.16796875, "loss/jsd": 0.0, "loss/logits": 0.3069688081741333, "step": 700 }, { "epoch": 0.03190909090909091, "grad_norm": 9.4375, "grad_norm_var": 30.004150390625, "learning_rate": 0.0001, "loss": 7.5494, "loss/crossentropy": 2.6117730140686035, "loss/hidden": 2.09765625, "loss/jsd": 0.0, "loss/logits": 0.2839971333742142, "step": 702 }, { "epoch": 0.032, "grad_norm": 8.8125, "grad_norm_var": 30.1984375, "learning_rate": 0.0001, "loss": 7.6628, "loss/crossentropy": 2.5494920015335083, "loss/hidden": 2.138671875, "loss/jsd": 0.0, "loss/logits": 0.29746201634407043, "step": 704 }, { "epoch": 0.032090909090909094, "grad_norm": 9.0625, "grad_norm_var": 30.282014973958333, "learning_rate": 0.0001, "loss": 7.0155, "loss/crossentropy": 2.286539524793625, "loss/hidden": 2.10546875, "loss/jsd": 0.0, "loss/logits": 0.2623445764183998, "step": 706 }, { "epoch": 0.03218181818181818, "grad_norm": 10.0625, "grad_norm_var": 29.827604166666667, "learning_rate": 0.0001, "loss": 7.6099, "loss/crossentropy": 2.567818760871887, "loss/hidden": 2.109375, "loss/jsd": 0.0, "loss/logits": 0.293275348842144, "step": 708 }, { "epoch": 0.03227272727272727, "grad_norm": 10.375, "grad_norm_var": 29.682014973958335, "learning_rate": 0.0001, "loss": 7.973, "loss/crossentropy": 2.750872492790222, "loss/hidden": 2.10546875, "loss/jsd": 0.0, "loss/logits": 0.31166839599609375, "step": 710 }, { "epoch": 0.032363636363636365, "grad_norm": 8.4375, "grad_norm_var": 1.4715983072916667, "learning_rate": 0.0001, "loss": 7.366, "loss/crossentropy": 2.446383059024811, "loss/hidden": 2.125, "loss/jsd": 0.0, "loss/logits": 0.2794596739113331, "step": 712 }, { "epoch": 0.03245454545454545, "grad_norm": 9.5625, "grad_norm_var": 0.6077473958333334, "learning_rate": 0.0001, "loss": 7.4051, "loss/crossentropy": 2.508723944425583, "loss/hidden": 2.07421875, "loss/jsd": 0.0, "loss/logits": 0.28221270814538, "step": 714 }, { "epoch": 0.032545454545454544, "grad_norm": 8.8125, "grad_norm_var": 0.5763020833333333, "learning_rate": 0.0001, "loss": 7.628, "loss/crossentropy": 2.620191812515259, "loss/hidden": 2.08203125, "loss/jsd": 0.0, "loss/logits": 0.29257481172680855, "step": 716 }, { "epoch": 0.03263636363636364, "grad_norm": 11.5625, "grad_norm_var": 0.79609375, "learning_rate": 0.0001, "loss": 7.833, "loss/crossentropy": 2.6508530974388123, "loss/hidden": 2.171875, "loss/jsd": 0.0, "loss/logits": 0.30102600902318954, "step": 718 }, { "epoch": 0.03272727272727273, "grad_norm": 9.9375, "grad_norm_var": 0.7728515625, "learning_rate": 0.0001, "loss": 7.877, "loss/crossentropy": 2.6941930055618286, "loss/hidden": 2.1484375, "loss/jsd": 0.0, "loss/logits": 0.3034411072731018, "step": 720 }, { "epoch": 0.032818181818181816, "grad_norm": 8.875, "grad_norm_var": 0.7275390625, "learning_rate": 0.0001, "loss": 7.3625, "loss/crossentropy": 2.4118319153785706, "loss/hidden": 2.08984375, "loss/jsd": 0.0, "loss/logits": 0.2860780730843544, "step": 722 }, { "epoch": 0.03290909090909091, "grad_norm": 10.25, "grad_norm_var": 0.743212890625, "learning_rate": 0.0001, "loss": 8.11, "loss/crossentropy": 2.8327419757843018, "loss/hidden": 2.11328125, "loss/jsd": 0.0, "loss/logits": 0.3163973242044449, "step": 724 }, { "epoch": 0.033, "grad_norm": 11.8125, "grad_norm_var": 0.9941243489583333, "learning_rate": 0.0001, "loss": 7.7506, "loss/crossentropy": 2.6242750883102417, "loss/hidden": 2.08984375, "loss/jsd": 0.0, "loss/logits": 0.3036531060934067, "step": 726 }, { "epoch": 0.03309090909090909, "grad_norm": 10.25, "grad_norm_var": 0.8972493489583333, "learning_rate": 0.0001, "loss": 7.7159, "loss/crossentropy": 2.4613832235336304, "loss/hidden": 2.1640625, "loss/jsd": 0.0, "loss/logits": 0.3090450093150139, "step": 728 }, { "epoch": 0.03318181818181818, "grad_norm": 9.3125, "grad_norm_var": 0.7070149739583333, "learning_rate": 0.0001, "loss": 7.3051, "loss/crossentropy": 2.4003431499004364, "loss/hidden": 2.1015625, "loss/jsd": 0.0, "loss/logits": 0.2803229093551636, "step": 730 }, { "epoch": 0.03327272727272727, "grad_norm": 9.6875, "grad_norm_var": 0.5968098958333333, "learning_rate": 0.0001, "loss": 7.7914, "loss/crossentropy": 2.6524071097373962, "loss/hidden": 2.1171875, "loss/jsd": 0.0, "loss/logits": 0.30218031257390976, "step": 732 }, { "epoch": 0.033363636363636366, "grad_norm": 8.5, "grad_norm_var": 0.5676432291666667, "learning_rate": 0.0001, "loss": 7.5049, "loss/crossentropy": 2.5877906680107117, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.28312113881111145, "step": 734 }, { "epoch": 0.03345454545454545, "grad_norm": 11.0, "grad_norm_var": 0.7359375, "learning_rate": 0.0001, "loss": 7.6016, "loss/crossentropy": 2.6137605905532837, "loss/hidden": 2.1328125, "loss/jsd": 0.0, "loss/logits": 0.2855057418346405, "step": 736 }, { "epoch": 0.033545454545454545, "grad_norm": 9.75, "grad_norm_var": 0.6754557291666666, "learning_rate": 0.0001, "loss": 7.6005, "loss/crossentropy": 2.636506140232086, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.2870270311832428, "step": 738 }, { "epoch": 0.03363636363636364, "grad_norm": 8.6875, "grad_norm_var": 0.7973795572916667, "learning_rate": 0.0001, "loss": 7.638, "loss/crossentropy": 2.660768508911133, "loss/hidden": 2.109375, "loss/jsd": 0.0, "loss/logits": 0.2867831662297249, "step": 740 }, { "epoch": 0.03372727272727273, "grad_norm": 8.75, "grad_norm_var": 0.522119140625, "learning_rate": 0.0001, "loss": 7.6617, "loss/crossentropy": 2.6612327098846436, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.2922295778989792, "step": 742 }, { "epoch": 0.03381818181818182, "grad_norm": 10.0, "grad_norm_var": 0.5153483072916667, "learning_rate": 0.0001, "loss": 7.6685, "loss/crossentropy": 2.6454962491989136, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.2929238900542259, "step": 744 }, { "epoch": 0.03390909090909091, "grad_norm": 9.0625, "grad_norm_var": 0.4847493489583333, "learning_rate": 0.0001, "loss": 7.6078, "loss/crossentropy": 2.6077443957328796, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.2906327396631241, "step": 746 }, { "epoch": 0.034, "grad_norm": 8.5, "grad_norm_var": 0.5083170572916667, "learning_rate": 0.0001, "loss": 7.5787, "loss/crossentropy": 2.4560895562171936, "loss/hidden": 2.16015625, "loss/jsd": 0.0, "loss/logits": 0.29624994844198227, "step": 748 }, { "epoch": 0.03409090909090909, "grad_norm": 8.25, "grad_norm_var": 0.5360514322916666, "learning_rate": 0.0001, "loss": 7.639, "loss/crossentropy": 2.604844570159912, "loss/hidden": 2.052734375, "loss/jsd": 0.0, "loss/logits": 0.29814238101243973, "step": 750 }, { "epoch": 0.03418181818181818, "grad_norm": 9.125, "grad_norm_var": 0.4088541666666667, "learning_rate": 0.0001, "loss": 7.8594, "loss/crossentropy": 2.6660702228546143, "loss/hidden": 2.1015625, "loss/jsd": 0.0, "loss/logits": 0.3091736435890198, "step": 752 }, { "epoch": 0.034272727272727274, "grad_norm": 8.6875, "grad_norm_var": 0.37161458333333336, "learning_rate": 0.0001, "loss": 7.668, "loss/crossentropy": 2.632961630821228, "loss/hidden": 2.1171875, "loss/jsd": 0.0, "loss/logits": 0.291782446205616, "step": 754 }, { "epoch": 0.03436363636363637, "grad_norm": 8.75, "grad_norm_var": 0.37161458333333336, "learning_rate": 0.0001, "loss": 7.6285, "loss/crossentropy": 2.6047069430351257, "loss/hidden": 2.109375, "loss/jsd": 0.0, "loss/logits": 0.29144028574228287, "step": 756 }, { "epoch": 0.03445454545454545, "grad_norm": 9.8125, "grad_norm_var": 0.34837239583333335, "learning_rate": 0.0001, "loss": 7.1707, "loss/crossentropy": 2.357417196035385, "loss/hidden": 2.12890625, "loss/jsd": 0.0, "loss/logits": 0.26843687146902084, "step": 758 }, { "epoch": 0.034545454545454546, "grad_norm": 8.75, "grad_norm_var": 0.511572265625, "learning_rate": 0.0001, "loss": 7.6921, "loss/crossentropy": 2.5781309604644775, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.3028004616498947, "step": 760 }, { "epoch": 0.03463636363636364, "grad_norm": 9.4375, "grad_norm_var": 0.49138997395833334, "learning_rate": 0.0001, "loss": 7.8158, "loss/crossentropy": 2.7118375301361084, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.3025844097137451, "step": 762 }, { "epoch": 0.034727272727272725, "grad_norm": 9.375, "grad_norm_var": 0.45636393229166666, "learning_rate": 0.0001, "loss": 7.7222, "loss/crossentropy": 2.6527684926986694, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.2975689247250557, "step": 764 }, { "epoch": 0.03481818181818182, "grad_norm": 9.5, "grad_norm_var": 0.38800455729166666, "learning_rate": 0.0001, "loss": 7.9662, "loss/crossentropy": 2.7728688716888428, "loss/hidden": 2.1015625, "loss/jsd": 0.0, "loss/logits": 0.309178963303566, "step": 766 }, { "epoch": 0.03490909090909091, "grad_norm": 10.0625, "grad_norm_var": 0.34010416666666665, "learning_rate": 0.0001, "loss": 7.8562, "loss/crossentropy": 2.7317837476730347, "loss/hidden": 2.08203125, "loss/jsd": 0.0, "loss/logits": 0.3042432814836502, "step": 768 }, { "epoch": 0.035, "grad_norm": 9.5625, "grad_norm_var": 0.30349934895833336, "learning_rate": 0.0001, "loss": 7.9356, "loss/crossentropy": 2.7856013774871826, "loss/hidden": 2.05859375, "loss/jsd": 0.0, "loss/logits": 0.3091363161802292, "step": 770 }, { "epoch": 0.03509090909090909, "grad_norm": 8.8125, "grad_norm_var": 0.27024739583333335, "learning_rate": 0.0001, "loss": 7.8484, "loss/crossentropy": 2.7463120818138123, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.30083074420690536, "step": 772 }, { "epoch": 0.03518181818181818, "grad_norm": 9.125, "grad_norm_var": 0.39479166666666665, "learning_rate": 0.0001, "loss": 7.2576, "loss/crossentropy": 2.493532121181488, "loss/hidden": 2.0546875, "loss/jsd": 0.0, "loss/logits": 0.2709430046379566, "step": 774 }, { "epoch": 0.035272727272727275, "grad_norm": 8.9375, "grad_norm_var": 0.21886393229166667, "learning_rate": 0.0001, "loss": 7.6198, "loss/crossentropy": 2.761355400085449, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.2811591401696205, "step": 776 }, { "epoch": 0.03536363636363636, "grad_norm": 8.8125, "grad_norm_var": 0.27760416666666665, "learning_rate": 0.0001, "loss": 7.4, "loss/crossentropy": 2.477522075176239, "loss/hidden": 2.0234375, "loss/jsd": 0.0, "loss/logits": 0.289900965988636, "step": 778 }, { "epoch": 0.035454545454545454, "grad_norm": 8.25, "grad_norm_var": 0.42418212890625, "learning_rate": 0.0001, "loss": 6.4104, "loss/crossentropy": 2.102365493774414, "loss/hidden": 2.001953125, "loss/jsd": 0.0, "loss/logits": 0.23060761764645576, "step": 780 }, { "epoch": 0.03554545454545455, "grad_norm": 11.4375, "grad_norm_var": 0.79898681640625, "learning_rate": 0.0001, "loss": 8.3121, "loss/crossentropy": 2.978004515171051, "loss/hidden": 2.12109375, "loss/jsd": 0.0, "loss/logits": 0.3213004618883133, "step": 782 }, { "epoch": 0.03563636363636364, "grad_norm": 7.65625, "grad_norm_var": 1.293994140625, "learning_rate": 0.0001, "loss": 6.9867, "loss/crossentropy": 2.2722569704055786, "loss/hidden": 2.0546875, "loss/jsd": 0.0, "loss/logits": 0.2659756615757942, "step": 784 }, { "epoch": 0.035727272727272726, "grad_norm": 8.6875, "grad_norm_var": 1.281103515625, "learning_rate": 0.0001, "loss": 7.5367, "loss/crossentropy": 2.616827666759491, "loss/hidden": 2.037109375, "loss/jsd": 0.0, "loss/logits": 0.2882799804210663, "step": 786 }, { "epoch": 0.03581818181818182, "grad_norm": 8.75, "grad_norm_var": 1.3108072916666667, "learning_rate": 0.0001, "loss": 7.585, "loss/crossentropy": 2.6705965399742126, "loss/hidden": 2.04296875, "loss/jsd": 0.0, "loss/logits": 0.2871450111269951, "step": 788 }, { "epoch": 0.03590909090909091, "grad_norm": 12.625, "grad_norm_var": 1.9671712239583334, "learning_rate": 0.0001, "loss": 7.2014, "loss/crossentropy": 2.393044650554657, "loss/hidden": 2.05078125, "loss/jsd": 0.0, "loss/logits": 0.2757565267384052, "step": 790 }, { "epoch": 0.036, "grad_norm": 8.8125, "grad_norm_var": 1.975244140625, "learning_rate": 0.0001, "loss": 7.7424, "loss/crossentropy": 2.6991345286369324, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.296518050134182, "step": 792 }, { "epoch": 0.03609090909090909, "grad_norm": 9.25, "grad_norm_var": 1.8998046875, "learning_rate": 0.0001, "loss": 7.7302, "loss/crossentropy": 2.775338888168335, "loss/hidden": 2.05078125, "loss/jsd": 0.0, "loss/logits": 0.29041290283203125, "step": 794 }, { "epoch": 0.03618181818181818, "grad_norm": 9.0, "grad_norm_var": 1.6304646809895833, "learning_rate": 0.0001, "loss": 7.7456, "loss/crossentropy": 2.775674045085907, "loss/hidden": 2.03515625, "loss/jsd": 0.0, "loss/logits": 0.2934732586145401, "step": 796 }, { "epoch": 0.036272727272727276, "grad_norm": 9.5, "grad_norm_var": 1.4157185872395834, "learning_rate": 0.0001, "loss": 7.3621, "loss/crossentropy": 2.520434081554413, "loss/hidden": 2.07421875, "loss/jsd": 0.0, "loss/logits": 0.2767455130815506, "step": 798 }, { "epoch": 0.03636363636363636, "grad_norm": 8.9375, "grad_norm_var": 0.8916015625, "learning_rate": 0.0001, "loss": 7.4572, "loss/crossentropy": 2.4732616543769836, "loss/hidden": 2.09765625, "loss/jsd": 0.0, "loss/logits": 0.2886253595352173, "step": 800 }, { "epoch": 0.036454545454545455, "grad_norm": 9.4375, "grad_norm_var": 1.825244140625, "learning_rate": 0.0001, "loss": 7.795, "loss/crossentropy": 2.7347466945648193, "loss/hidden": 2.08203125, "loss/jsd": 0.0, "loss/logits": 0.29782192409038544, "step": 802 }, { "epoch": 0.03654545454545455, "grad_norm": 9.9375, "grad_norm_var": 1.8113932291666666, "learning_rate": 0.0001, "loss": 7.3194, "loss/crossentropy": 2.46749609708786, "loss/hidden": 2.0625, "loss/jsd": 0.0, "loss/logits": 0.2789382115006447, "step": 804 }, { "epoch": 0.036636363636363634, "grad_norm": 9.3125, "grad_norm_var": 1.260009765625, "learning_rate": 0.0001, "loss": 7.6572, "loss/crossentropy": 2.7170470356941223, "loss/hidden": 2.05078125, "loss/jsd": 0.0, "loss/logits": 0.2889394611120224, "step": 806 }, { "epoch": 0.036727272727272726, "grad_norm": 10.0, "grad_norm_var": 1.2639973958333333, "learning_rate": 0.0001, "loss": 7.7452, "loss/crossentropy": 2.7828078866004944, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.2884252220392227, "step": 808 }, { "epoch": 0.03681818181818182, "grad_norm": 8.5625, "grad_norm_var": 1.3497233072916666, "learning_rate": 0.0001, "loss": 7.3396, "loss/crossentropy": 2.5699315667152405, "loss/hidden": 2.0390625, "loss/jsd": 0.0, "loss/logits": 0.2730615958571434, "step": 810 }, { "epoch": 0.03690909090909091, "grad_norm": 9.4375, "grad_norm_var": 1.3433430989583333, "learning_rate": 0.0001, "loss": 7.7904, "loss/crossentropy": 2.668477475643158, "loss/hidden": 2.11328125, "loss/jsd": 0.0, "loss/logits": 0.3008626401424408, "step": 812 }, { "epoch": 0.037, "grad_norm": 8.8125, "grad_norm_var": 1.3282389322916666, "learning_rate": 0.0001, "loss": 7.538, "loss/crossentropy": 2.611567497253418, "loss/hidden": 2.08203125, "loss/jsd": 0.0, "loss/logits": 0.2844388261437416, "step": 814 }, { "epoch": 0.03709090909090909, "grad_norm": 8.8125, "grad_norm_var": 1.3430826822916666, "learning_rate": 0.0001, "loss": 7.4602, "loss/crossentropy": 2.5771912932395935, "loss/hidden": 2.05859375, "loss/jsd": 0.0, "loss/logits": 0.2824435755610466, "step": 816 }, { "epoch": 0.037181818181818184, "grad_norm": 9.875, "grad_norm_var": 0.371728515625, "learning_rate": 0.0001, "loss": 7.1784, "loss/crossentropy": 2.360037326812744, "loss/hidden": 2.08203125, "loss/jsd": 0.0, "loss/logits": 0.2736290991306305, "step": 818 }, { "epoch": 0.03727272727272727, "grad_norm": 10.1875, "grad_norm_var": 28.090885416666666, "learning_rate": 0.0001, "loss": 7.7807, "loss/crossentropy": 2.520742416381836, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.31740423291921616, "step": 820 }, { "epoch": 0.03736363636363636, "grad_norm": 10.0, "grad_norm_var": 27.668343098958335, "learning_rate": 0.0001, "loss": 7.389, "loss/crossentropy": 2.4648948907852173, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.28381309285759926, "step": 822 }, { "epoch": 0.037454545454545456, "grad_norm": 14.0625, "grad_norm_var": 28.451025390625, "learning_rate": 0.0001, "loss": 7.7298, "loss/crossentropy": 2.6694108843803406, "loss/hidden": 2.11328125, "loss/jsd": 0.0, "loss/logits": 0.29471351206302643, "step": 824 }, { "epoch": 0.03754545454545455, "grad_norm": 8.5625, "grad_norm_var": 28.020035807291666, "learning_rate": 0.0001, "loss": 7.511, "loss/crossentropy": 2.6032878756523132, "loss/hidden": 2.0546875, "loss/jsd": 0.0, "loss/logits": 0.2853025272488594, "step": 826 }, { "epoch": 0.037636363636363634, "grad_norm": 8.625, "grad_norm_var": 28.3916015625, "learning_rate": 0.0001, "loss": 7.5678, "loss/crossentropy": 2.617487132549286, "loss/hidden": 2.0703125, "loss/jsd": 0.0, "loss/logits": 0.28800226002931595, "step": 828 }, { "epoch": 0.03772727272727273, "grad_norm": 9.75, "grad_norm_var": 28.837744140625, "learning_rate": 0.0001, "loss": 7.4856, "loss/crossentropy": 2.5793757140636444, "loss/hidden": 2.083984375, "loss/jsd": 0.0, "loss/logits": 0.2822200283408165, "step": 830 }, { "epoch": 0.03781818181818182, "grad_norm": 8.0625, "grad_norm_var": 29.361311848958334, "learning_rate": 0.0001, "loss": 7.4782, "loss/crossentropy": 2.633286237716675, "loss/hidden": 2.05078125, "loss/jsd": 0.0, "loss/logits": 0.27941077947616577, "step": 832 }, { "epoch": 0.037909090909090906, "grad_norm": 8.0, "grad_norm_var": 29.710677083333334, "learning_rate": 0.0001, "loss": 7.2055, "loss/crossentropy": 2.4763447642326355, "loss/hidden": 2.04296875, "loss/jsd": 0.0, "loss/logits": 0.26861533522605896, "step": 834 }, { "epoch": 0.038, "grad_norm": 8.9375, "grad_norm_var": 2.4410807291666665, "learning_rate": 0.0001, "loss": 7.5229, "loss/crossentropy": 2.6458545923233032, "loss/hidden": 2.05859375, "loss/jsd": 0.0, "loss/logits": 0.2818458005785942, "step": 836 }, { "epoch": 0.03809090909090909, "grad_norm": 8.8125, "grad_norm_var": 2.5609375, "learning_rate": 0.0001, "loss": 7.6764, "loss/crossentropy": 2.728838324546814, "loss/hidden": 2.02734375, "loss/jsd": 0.0, "loss/logits": 0.2920207530260086, "step": 838 }, { "epoch": 0.038181818181818185, "grad_norm": 7.84375, "grad_norm_var": 0.96314697265625, "learning_rate": 0.0001, "loss": 7.5012, "loss/crossentropy": 2.640512466430664, "loss/hidden": 2.0625, "loss/jsd": 0.0, "loss/logits": 0.2798164337873459, "step": 840 }, { "epoch": 0.03827272727272727, "grad_norm": 9.25, "grad_norm_var": 0.72144775390625, "learning_rate": 0.0001, "loss": 7.7626, "loss/crossentropy": 2.632368743419647, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.3044291138648987, "step": 842 }, { "epoch": 0.038363636363636364, "grad_norm": 9.3125, "grad_norm_var": 0.7610636393229167, "learning_rate": 0.0001, "loss": 7.4662, "loss/crossentropy": 2.5239705443382263, "loss/hidden": 2.05859375, "loss/jsd": 0.0, "loss/logits": 0.2883678898215294, "step": 844 }, { "epoch": 0.038454545454545457, "grad_norm": 9.75, "grad_norm_var": 0.7610636393229167, "learning_rate": 0.0001, "loss": 7.4806, "loss/crossentropy": 2.6211798787117004, "loss/hidden": 2.0546875, "loss/jsd": 0.0, "loss/logits": 0.2804770842194557, "step": 846 }, { "epoch": 0.03854545454545454, "grad_norm": 9.75, "grad_norm_var": 0.7162394205729167, "learning_rate": 0.0001, "loss": 7.8909, "loss/crossentropy": 2.845347285270691, "loss/hidden": 2.060546875, "loss/jsd": 0.0, "loss/logits": 0.29850221425294876, "step": 848 }, { "epoch": 0.038636363636363635, "grad_norm": 8.125, "grad_norm_var": 0.6849568684895834, "learning_rate": 0.0001, "loss": 7.2797, "loss/crossentropy": 2.5021623969078064, "loss/hidden": 2.03515625, "loss/jsd": 0.0, "loss/logits": 0.2742362916469574, "step": 850 }, { "epoch": 0.03872727272727273, "grad_norm": 9.875, "grad_norm_var": 0.7201131184895834, "learning_rate": 0.0001, "loss": 7.5846, "loss/crossentropy": 2.6470109820365906, "loss/hidden": 2.0703125, "loss/jsd": 0.0, "loss/logits": 0.28672561049461365, "step": 852 }, { "epoch": 0.03881818181818182, "grad_norm": 9.9375, "grad_norm_var": 0.5217081705729166, "learning_rate": 0.0001, "loss": 7.5236, "loss/crossentropy": 2.4747504591941833, "loss/hidden": 2.12109375, "loss/jsd": 0.0, "loss/logits": 0.2927774488925934, "step": 854 }, { "epoch": 0.03890909090909091, "grad_norm": 8.0, "grad_norm_var": 0.4856770833333333, "learning_rate": 0.0001, "loss": 7.6691, "loss/crossentropy": 2.6846747398376465, "loss/hidden": 2.04296875, "loss/jsd": 0.0, "loss/logits": 0.2941449508070946, "step": 856 }, { "epoch": 0.039, "grad_norm": 8.875, "grad_norm_var": 0.5155598958333333, "learning_rate": 0.0001, "loss": 7.081, "loss/crossentropy": 2.4637567698955536, "loss/hidden": 2.0078125, "loss/jsd": 0.0, "loss/logits": 0.26093944534659386, "step": 858 }, { "epoch": 0.03909090909090909, "grad_norm": 8.75, "grad_norm_var": 0.5136555989583333, "learning_rate": 0.0001, "loss": 7.8913, "loss/crossentropy": 2.7479973435401917, "loss/hidden": 2.04296875, "loss/jsd": 0.0, "loss/logits": 0.31003710627555847, "step": 860 }, { "epoch": 0.03918181818181818, "grad_norm": 8.3125, "grad_norm_var": 0.40011393229166664, "learning_rate": 0.0001, "loss": 6.853, "loss/crossentropy": 2.3689213693141937, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.24918761476874352, "step": 862 }, { "epoch": 0.03927272727272727, "grad_norm": 9.1875, "grad_norm_var": 0.4166666666666667, "learning_rate": 0.0001, "loss": 7.0673, "loss/crossentropy": 2.3253394961357117, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.2695126533508301, "step": 864 }, { "epoch": 0.039363636363636365, "grad_norm": 7.8125, "grad_norm_var": 0.44993489583333335, "learning_rate": 0.0001, "loss": 7.0816, "loss/crossentropy": 2.4387176632881165, "loss/hidden": 2.021484375, "loss/jsd": 0.0, "loss/logits": 0.26213637739419937, "step": 866 }, { "epoch": 0.03945454545454546, "grad_norm": 7.875, "grad_norm_var": 0.41796875, "learning_rate": 0.0001, "loss": 7.1459, "loss/crossentropy": 2.475292384624481, "loss/hidden": 2.0390625, "loss/jsd": 0.0, "loss/logits": 0.26315581426024437, "step": 868 }, { "epoch": 0.03954545454545454, "grad_norm": 8.875, "grad_norm_var": 0.29583333333333334, "learning_rate": 0.0001, "loss": 7.6756, "loss/crossentropy": 2.838263511657715, "loss/hidden": 2.015625, "loss/jsd": 0.0, "loss/logits": 0.28217408061027527, "step": 870 }, { "epoch": 0.039636363636363636, "grad_norm": 8.75, "grad_norm_var": 0.30388997395833334, "learning_rate": 0.0001, "loss": 7.5317, "loss/crossentropy": 2.7052788734436035, "loss/hidden": 2.01953125, "loss/jsd": 0.0, "loss/logits": 0.28069108724594116, "step": 872 }, { "epoch": 0.03972727272727273, "grad_norm": 9.0, "grad_norm_var": 0.31378580729166666, "learning_rate": 0.0001, "loss": 7.5304, "loss/crossentropy": 2.6983466744422913, "loss/hidden": 2.001953125, "loss/jsd": 0.0, "loss/logits": 0.2830134779214859, "step": 874 }, { "epoch": 0.039818181818181815, "grad_norm": 9.75, "grad_norm_var": 0.3578125, "learning_rate": 0.0001, "loss": 7.9129, "loss/crossentropy": 2.8246246576309204, "loss/hidden": 2.05078125, "loss/jsd": 0.0, "loss/logits": 0.30374450981616974, "step": 876 }, { "epoch": 0.03990909090909091, "grad_norm": 9.6875, "grad_norm_var": 0.5119791666666667, "learning_rate": 0.0001, "loss": 7.7946, "loss/crossentropy": 2.7534927129745483, "loss/hidden": 2.041015625, "loss/jsd": 0.0, "loss/logits": 0.30000482499599457, "step": 878 }, { "epoch": 0.04, "grad_norm": 10.375, "grad_norm_var": 0.6171712239583333, "learning_rate": 0.0001, "loss": 7.4436, "loss/crossentropy": 2.595871150493622, "loss/hidden": 2.021484375, "loss/jsd": 0.0, "loss/logits": 0.2826279513537884, "step": 880 }, { "epoch": 0.040090909090909094, "grad_norm": 8.9375, "grad_norm_var": 0.5174479166666667, "learning_rate": 0.0001, "loss": 7.9696, "loss/crossentropy": 2.8875725269317627, "loss/hidden": 2.033203125, "loss/jsd": 0.0, "loss/logits": 0.3048801124095917, "step": 882 }, { "epoch": 0.04018181818181818, "grad_norm": 10.3125, "grad_norm_var": 0.43748372395833335, "learning_rate": 0.0001, "loss": 7.3805, "loss/crossentropy": 2.4742724895477295, "loss/hidden": 2.03515625, "loss/jsd": 0.0, "loss/logits": 0.2871021553874016, "step": 884 }, { "epoch": 0.04027272727272727, "grad_norm": 10.4375, "grad_norm_var": 0.5173014322916667, "learning_rate": 0.0001, "loss": 7.9782, "loss/crossentropy": 2.7337942719459534, "loss/hidden": 2.076171875, "loss/jsd": 0.0, "loss/logits": 0.3168240562081337, "step": 886 }, { "epoch": 0.040363636363636365, "grad_norm": 8.3125, "grad_norm_var": 0.6765462239583333, "learning_rate": 0.0001, "loss": 7.2684, "loss/crossentropy": 2.5466448068618774, "loss/hidden": 2.029296875, "loss/jsd": 0.0, "loss/logits": 0.26924166083335876, "step": 888 }, { "epoch": 0.04045454545454545, "grad_norm": 9.375, "grad_norm_var": 0.7433430989583333, "learning_rate": 0.0001, "loss": 7.8814, "loss/crossentropy": 2.7235788106918335, "loss/hidden": 2.06640625, "loss/jsd": 0.0, "loss/logits": 0.3091403990983963, "step": 890 }, { "epoch": 0.040545454545454544, "grad_norm": 10.0, "grad_norm_var": 0.990869140625, "learning_rate": 0.0001, "loss": 7.8062, "loss/crossentropy": 2.8233283162117004, "loss/hidden": 2.03515625, "loss/jsd": 0.0, "loss/logits": 0.2947686240077019, "step": 892 }, { "epoch": 0.04063636363636364, "grad_norm": 8.125, "grad_norm_var": 1.1306640625, "learning_rate": 0.0001, "loss": 7.3965, "loss/crossentropy": 2.4787421226501465, "loss/hidden": 2.03125, "loss/jsd": 0.0, "loss/logits": 0.28865480422973633, "step": 894 }, { "epoch": 0.04072727272727273, "grad_norm": 10.5625, "grad_norm_var": 1.41324462890625, "learning_rate": 0.0001, "loss": 7.3275, "loss/crossentropy": 2.5281259417533875, "loss/hidden": 2.02734375, "loss/jsd": 0.0, "loss/logits": 0.2771988920867443, "step": 896 }, { "epoch": 0.040818181818181816, "grad_norm": 11.125, "grad_norm_var": 1.66763916015625, "learning_rate": 0.0001, "loss": 7.552, "loss/crossentropy": 2.632531225681305, "loss/hidden": 2.0390625, "loss/jsd": 0.0, "loss/logits": 0.2880382090806961, "step": 898 }, { "epoch": 0.04090909090909091, "grad_norm": 9.5, "grad_norm_var": 1.6762003580729166, "learning_rate": 0.0001, "loss": 7.3747, "loss/crossentropy": 2.609145402908325, "loss/hidden": 2.009765625, "loss/jsd": 0.0, "loss/logits": 0.2755746468901634, "step": 900 }, { "epoch": 0.041, "grad_norm": 8.25, "grad_norm_var": 1.5867146809895833, "learning_rate": 0.0001, "loss": 7.365, "loss/crossentropy": 2.614375650882721, "loss/hidden": 2.03125, "loss/jsd": 0.0, "loss/logits": 0.27193911373615265, "step": 902 }, { "epoch": 0.04109090909090909, "grad_norm": 13.1875, "grad_norm_var": 2.375907389322917, "learning_rate": 0.0001, "loss": 7.5152, "loss/crossentropy": 2.519492030143738, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.29175354540348053, "step": 904 }, { "epoch": 0.04118181818181818, "grad_norm": 8.75, "grad_norm_var": 3.7359334309895833, "learning_rate": 0.0001, "loss": 7.2981, "loss/crossentropy": 2.415618658065796, "loss/hidden": 2.08984375, "loss/jsd": 0.0, "loss/logits": 0.2792598828673363, "step": 906 }, { "epoch": 0.04127272727272727, "grad_norm": 9.5, "grad_norm_var": 3.64605712890625, "learning_rate": 0.0001, "loss": 7.8546, "loss/crossentropy": 2.777782380580902, "loss/hidden": 2.0625, "loss/jsd": 0.0, "loss/logits": 0.3014319986104965, "step": 908 }, { "epoch": 0.041363636363636366, "grad_norm": 8.75, "grad_norm_var": 3.8423136393229167, "learning_rate": 0.0001, "loss": 7.2885, "loss/crossentropy": 2.52365505695343, "loss/hidden": 2.01953125, "loss/jsd": 0.0, "loss/logits": 0.27452773228287697, "step": 910 }, { "epoch": 0.04145454545454545, "grad_norm": 7.875, "grad_norm_var": 3.779671223958333, "learning_rate": 0.0001, "loss": 7.2517, "loss/crossentropy": 2.4605175852775574, "loss/hidden": 2.0234375, "loss/jsd": 0.0, "loss/logits": 0.2767706960439682, "step": 912 }, { "epoch": 0.041545454545454545, "grad_norm": 9.125, "grad_norm_var": 3.503369140625, "learning_rate": 0.0001, "loss": 7.2914, "loss/crossentropy": 2.551686108112335, "loss/hidden": 1.998046875, "loss/jsd": 0.0, "loss/logits": 0.2741653472185135, "step": 914 }, { "epoch": 0.04163636363636364, "grad_norm": 8.5, "grad_norm_var": 3.486458333333333, "learning_rate": 0.0001, "loss": 7.2785, "loss/crossentropy": 2.555837631225586, "loss/hidden": 2.02734375, "loss/jsd": 0.0, "loss/logits": 0.2695278041064739, "step": 916 }, { "epoch": 0.041727272727272724, "grad_norm": 12.75, "grad_norm_var": 27.019791666666666, "learning_rate": 0.0001, "loss": 8.5471, "loss/crossentropy": 2.772084414958954, "loss/hidden": 2.1484375, "loss/jsd": 0.0, "loss/logits": 0.362660177052021, "step": 918 }, { "epoch": 0.04181818181818182, "grad_norm": 8.75, "grad_norm_var": 26.751041666666666, "learning_rate": 0.0001, "loss": 7.7592, "loss/crossentropy": 2.7441869378089905, "loss/hidden": 2.04296875, "loss/jsd": 0.0, "loss/logits": 0.29720689356327057, "step": 920 }, { "epoch": 0.04190909090909091, "grad_norm": 8.4375, "grad_norm_var": 25.940738932291666, "learning_rate": 0.0001, "loss": 7.6385, "loss/crossentropy": 2.7667773962020874, "loss/hidden": 2.03125, "loss/jsd": 0.0, "loss/logits": 0.28405076265335083, "step": 922 }, { "epoch": 0.042, "grad_norm": 8.5625, "grad_norm_var": 26.020817057291666, "learning_rate": 0.0001, "loss": 7.8172, "loss/crossentropy": 2.8558149933815002, "loss/hidden": 2.0078125, "loss/jsd": 0.0, "loss/logits": 0.2953573539853096, "step": 924 }, { "epoch": 0.04209090909090909, "grad_norm": 8.3125, "grad_norm_var": 25.814322916666665, "learning_rate": 0.0001, "loss": 7.7781, "loss/crossentropy": 2.888501524925232, "loss/hidden": 2.0234375, "loss/jsd": 0.0, "loss/logits": 0.28661684691905975, "step": 926 }, { "epoch": 0.04218181818181818, "grad_norm": 8.375, "grad_norm_var": 25.5587890625, "learning_rate": 0.0001, "loss": 7.5371, "loss/crossentropy": 2.6759337782859802, "loss/hidden": 2.03125, "loss/jsd": 0.0, "loss/logits": 0.28299514204263687, "step": 928 }, { "epoch": 0.042272727272727274, "grad_norm": 10.1875, "grad_norm_var": 25.389176432291666, "learning_rate": 0.0001, "loss": 7.8876, "loss/crossentropy": 2.8753767609596252, "loss/hidden": 2.0078125, "loss/jsd": 0.0, "loss/logits": 0.30044400691986084, "step": 930 }, { "epoch": 0.04236363636363636, "grad_norm": 8.125, "grad_norm_var": 25.681184895833333, "learning_rate": 0.0001, "loss": 7.1725, "loss/crossentropy": 2.47357714176178, "loss/hidden": 2.02734375, "loss/jsd": 0.0, "loss/logits": 0.26715561375021935, "step": 932 }, { "epoch": 0.04245454545454545, "grad_norm": 8.3125, "grad_norm_var": 0.2994140625, "learning_rate": 0.0001, "loss": 6.9849, "loss/crossentropy": 2.31432044506073, "loss/hidden": 2.015625, "loss/jsd": 0.0, "loss/logits": 0.2654954567551613, "step": 934 }, { "epoch": 0.042545454545454546, "grad_norm": 8.625, "grad_norm_var": 0.36666666666666664, "learning_rate": 0.0001, "loss": 7.9262, "loss/crossentropy": 2.7791746258735657, "loss/hidden": 2.05859375, "loss/jsd": 0.0, "loss/logits": 0.3088426813483238, "step": 936 }, { "epoch": 0.04263636363636364, "grad_norm": 10.25, "grad_norm_var": 0.5328125, "learning_rate": 0.0001, "loss": 7.1368, "loss/crossentropy": 2.517613172531128, "loss/hidden": 1.98828125, "loss/jsd": 0.0, "loss/logits": 0.2630890905857086, "step": 938 }, { "epoch": 0.042727272727272725, "grad_norm": 8.1875, "grad_norm_var": 0.5505208333333333, "learning_rate": 0.0001, "loss": 7.0142, "loss/crossentropy": 2.3666262328624725, "loss/hidden": 2.0390625, "loss/jsd": 0.0, "loss/logits": 0.260855995118618, "step": 940 }, { "epoch": 0.04281818181818182, "grad_norm": 7.90625, "grad_norm_var": 0.6647135416666666, "learning_rate": 0.0001, "loss": 7.5186, "loss/crossentropy": 2.7445489168167114, "loss/hidden": 1.998046875, "loss/jsd": 0.0, "loss/logits": 0.277604840695858, "step": 942 }, { "epoch": 0.04290909090909091, "grad_norm": 11.125, "grad_norm_var": 20.034488932291666, "learning_rate": 0.0001, "loss": 7.6486, "loss/crossentropy": 2.599597990512848, "loss/hidden": 2.013671875, "loss/jsd": 0.0, "loss/logits": 0.3035336136817932, "step": 944 }, { "epoch": 0.043, "grad_norm": 8.75, "grad_norm_var": 20.049072265625, "learning_rate": 0.0001, "loss": 7.4333, "loss/crossentropy": 2.5221100449562073, "loss/hidden": 2.025390625, "loss/jsd": 0.0, "loss/logits": 0.2885800376534462, "step": 946 }, { "epoch": 0.04309090909090909, "grad_norm": 7.53125, "grad_norm_var": 20.16597900390625, "learning_rate": 0.0001, "loss": 6.7024, "loss/crossentropy": 2.2175972759723663, "loss/hidden": 2.01171875, "loss/jsd": 0.0, "loss/logits": 0.2473069652915001, "step": 948 }, { "epoch": 0.04318181818181818, "grad_norm": 7.84375, "grad_norm_var": 20.14140625, "learning_rate": 0.0001, "loss": 7.3393, "loss/crossentropy": 2.610602915287018, "loss/hidden": 1.96484375, "loss/jsd": 0.0, "loss/logits": 0.2763842046260834, "step": 950 }, { "epoch": 0.043272727272727275, "grad_norm": 11.75, "grad_norm_var": 155.11756184895833, "learning_rate": 0.0001, "loss": 8.6763, "loss/crossentropy": 2.682557702064514, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.38999687880277634, "step": 952 }, { "epoch": 0.04336363636363636, "grad_norm": 12.0625, "grad_norm_var": 153.853369140625, "learning_rate": 0.0001, "loss": 7.1708, "loss/crossentropy": 2.3510724008083344, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.277289979159832, "step": 954 }, { "epoch": 0.043454545454545454, "grad_norm": 10.625, "grad_norm_var": 152.3953125, "learning_rate": 0.0001, "loss": 7.6829, "loss/crossentropy": 2.578477382659912, "loss/hidden": 2.06640625, "loss/jsd": 0.0, "loss/logits": 0.3038061708211899, "step": 956 }, { "epoch": 0.04354545454545455, "grad_norm": 8.375, "grad_norm_var": 150.7037109375, "learning_rate": 0.0001, "loss": 7.5263, "loss/crossentropy": 2.621874213218689, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.28575151413679123, "step": 958 }, { "epoch": 0.04363636363636364, "grad_norm": 9.125, "grad_norm_var": 140.818603515625, "learning_rate": 0.0001, "loss": 6.9496, "loss/crossentropy": 2.369056522846222, "loss/hidden": 2.00390625, "loss/jsd": 0.0, "loss/logits": 0.25766705721616745, "step": 960 }, { "epoch": 0.043727272727272726, "grad_norm": 8.8125, "grad_norm_var": 141.14998372395834, "learning_rate": 0.0001, "loss": 7.5879, "loss/crossentropy": 2.7704147696495056, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.2825275808572769, "step": 962 }, { "epoch": 0.04381818181818182, "grad_norm": 11.0, "grad_norm_var": 139.0453084309896, "learning_rate": 0.0001, "loss": 7.3775, "loss/crossentropy": 2.40766704082489, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.28839318454265594, "step": 964 }, { "epoch": 0.04390909090909091, "grad_norm": 8.5, "grad_norm_var": 138.07447916666666, "learning_rate": 0.0001, "loss": 7.6047, "loss/crossentropy": 2.635489761829376, "loss/hidden": 2.033203125, "loss/jsd": 0.0, "loss/logits": 0.29360445588827133, "step": 966 }, { "epoch": 0.044, "grad_norm": 9.875, "grad_norm_var": 28.3728515625, "learning_rate": 0.0001, "loss": 8.1445, "loss/crossentropy": 2.564156651496887, "loss/hidden": 2.12109375, "loss/jsd": 0.0, "loss/logits": 0.34592773020267487, "step": 968 }, { "epoch": 0.04409090909090909, "grad_norm": 9.4375, "grad_norm_var": 28.539957682291668, "learning_rate": 0.0001, "loss": 7.2303, "loss/crossentropy": 2.4342291951179504, "loss/hidden": 2.05859375, "loss/jsd": 0.0, "loss/logits": 0.2737446203827858, "step": 970 }, { "epoch": 0.04418181818181818, "grad_norm": 8.0, "grad_norm_var": 28.8728515625, "learning_rate": 0.0001, "loss": 7.0432, "loss/crossentropy": 2.3951300382614136, "loss/hidden": 2.05859375, "loss/jsd": 0.0, "loss/logits": 0.2589489221572876, "step": 972 }, { "epoch": 0.044272727272727276, "grad_norm": 8.0, "grad_norm_var": 29.303125, "learning_rate": 0.0001, "loss": 7.4079, "loss/crossentropy": 2.7032630443573, "loss/hidden": 2.01953125, "loss/jsd": 0.0, "loss/logits": 0.268513698130846, "step": 974 }, { "epoch": 0.04436363636363636, "grad_norm": 7.625, "grad_norm_var": 29.806884765625, "learning_rate": 0.0001, "loss": 7.2601, "loss/crossentropy": 2.5813175439834595, "loss/hidden": 2.01171875, "loss/jsd": 0.0, "loss/logits": 0.26671136915683746, "step": 976 }, { "epoch": 0.044454545454545455, "grad_norm": 13.1875, "grad_norm_var": 30.300809733072917, "learning_rate": 0.0001, "loss": 6.7919, "loss/crossentropy": 2.18803808093071, "loss/hidden": 2.1015625, "loss/jsd": 0.0, "loss/logits": 0.2502276822924614, "step": 978 }, { "epoch": 0.04454545454545455, "grad_norm": 18.25, "grad_norm_var": 33.861714680989586, "learning_rate": 0.0001, "loss": 7.8457, "loss/crossentropy": 2.7356996536254883, "loss/hidden": 2.02734375, "loss/jsd": 0.0, "loss/logits": 0.30826691538095474, "step": 980 }, { "epoch": 0.044636363636363634, "grad_norm": 10.75, "grad_norm_var": 33.76119384765625, "learning_rate": 0.0001, "loss": 7.3738, "loss/crossentropy": 2.5090240836143494, "loss/hidden": 2.08203125, "loss/jsd": 0.0, "loss/logits": 0.2782706692814827, "step": 982 }, { "epoch": 0.04472727272727273, "grad_norm": 9.0, "grad_norm_var": 7.538765462239583, "learning_rate": 0.0001, "loss": 7.0928, "loss/crossentropy": 2.4226272106170654, "loss/hidden": 1.994140625, "loss/jsd": 0.0, "loss/logits": 0.26760101318359375, "step": 984 }, { "epoch": 0.04481818181818182, "grad_norm": 7.875, "grad_norm_var": 7.838407389322916, "learning_rate": 0.0001, "loss": 7.3808, "loss/crossentropy": 2.656381368637085, "loss/hidden": 1.96875, "loss/jsd": 0.0, "loss/logits": 0.27556952834129333, "step": 986 }, { "epoch": 0.04490909090909091, "grad_norm": 9.3125, "grad_norm_var": 7.73804931640625, "learning_rate": 0.0001, "loss": 7.748, "loss/crossentropy": 2.6568104028701782, "loss/hidden": 2.05859375, "loss/jsd": 0.0, "loss/logits": 0.30325889587402344, "step": 988 }, { "epoch": 0.045, "grad_norm": 8.3125, "grad_norm_var": 7.700516764322916, "learning_rate": 0.0001, "loss": 7.072, "loss/crossentropy": 2.4898024201393127, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.26056449115276337, "step": 990 }, { "epoch": 0.04509090909090909, "grad_norm": 9.375, "grad_norm_var": 7.689253743489584, "learning_rate": 0.0001, "loss": 7.8681, "loss/crossentropy": 2.685258686542511, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.30890756845474243, "step": 992 }, { "epoch": 0.045181818181818184, "grad_norm": 9.3125, "grad_norm_var": 6.795247395833333, "learning_rate": 0.0001, "loss": 7.4771, "loss/crossentropy": 2.654419720172882, "loss/hidden": 2.044921875, "loss/jsd": 0.0, "loss/logits": 0.27777642756700516, "step": 994 }, { "epoch": 0.04527272727272727, "grad_norm": 10.6875, "grad_norm_var": 1.4587076822916667, "learning_rate": 0.0001, "loss": 7.1371, "loss/crossentropy": 2.5182244777679443, "loss/hidden": 1.98828125, "loss/jsd": 0.0, "loss/logits": 0.2630576714873314, "step": 996 }, { "epoch": 0.04536363636363636, "grad_norm": 7.71875, "grad_norm_var": 1.4315388997395833, "learning_rate": 0.0001, "loss": 7.2782, "loss/crossentropy": 2.562164068222046, "loss/hidden": 1.99609375, "loss/jsd": 0.0, "loss/logits": 0.2719992324709892, "step": 998 }, { "epoch": 0.045454545454545456, "grad_norm": 8.875, "grad_norm_var": 1.5065388997395834, "learning_rate": 0.0001, "loss": 7.229, "loss/crossentropy": 2.5800214409828186, "loss/hidden": 1.984375, "loss/jsd": 0.0, "loss/logits": 0.26645852997899055, "step": 1000 }, { "epoch": 0.04554545454545455, "grad_norm": 8.4375, "grad_norm_var": 1.6356404622395833, "learning_rate": 0.0001, "loss": 7.6184, "loss/crossentropy": 2.6712105870246887, "loss/hidden": 2.07421875, "loss/jsd": 0.0, "loss/logits": 0.2872982397675514, "step": 1002 }, { "epoch": 0.045636363636363635, "grad_norm": 8.25, "grad_norm_var": 1.6618123372395834, "learning_rate": 0.0001, "loss": 7.8281, "loss/crossentropy": 2.839757263660431, "loss/hidden": 2.03515625, "loss/jsd": 0.0, "loss/logits": 0.29531770199537277, "step": 1004 }, { "epoch": 0.04572727272727273, "grad_norm": 11.5625, "grad_norm_var": 1.89244384765625, "learning_rate": 0.0001, "loss": 7.6174, "loss/crossentropy": 2.720873534679413, "loss/hidden": 2.009765625, "loss/jsd": 0.0, "loss/logits": 0.28867417573928833, "step": 1006 }, { "epoch": 0.04581818181818182, "grad_norm": 9.75, "grad_norm_var": 1.20338134765625, "learning_rate": 0.0001, "loss": 7.6107, "loss/crossentropy": 2.779583513736725, "loss/hidden": 2.009765625, "loss/jsd": 0.0, "loss/logits": 0.28213293105363846, "step": 1008 }, { "epoch": 0.045909090909090906, "grad_norm": 8.8125, "grad_norm_var": 1.2412394205729167, "learning_rate": 0.0001, "loss": 7.7305, "loss/crossentropy": 2.8188117146492004, "loss/hidden": 2.01953125, "loss/jsd": 0.0, "loss/logits": 0.2892199456691742, "step": 1010 }, { "epoch": 0.046, "grad_norm": 8.5625, "grad_norm_var": 1.1019816080729166, "learning_rate": 0.0001, "loss": 7.5843, "loss/crossentropy": 2.8148451447486877, "loss/hidden": 1.955078125, "loss/jsd": 0.0, "loss/logits": 0.28143714368343353, "step": 1012 }, { "epoch": 0.04609090909090909, "grad_norm": 8.75, "grad_norm_var": 4.287434895833333, "learning_rate": 0.0001, "loss": 7.4705, "loss/crossentropy": 2.5636560320854187, "loss/hidden": 2.03515625, "loss/jsd": 0.0, "loss/logits": 0.2871640995144844, "step": 1014 }, { "epoch": 0.046181818181818185, "grad_norm": 8.6875, "grad_norm_var": 4.154020182291666, "learning_rate": 0.0001, "loss": 7.3815, "loss/crossentropy": 2.6296050548553467, "loss/hidden": 1.970703125, "loss/jsd": 0.0, "loss/logits": 0.27811431884765625, "step": 1016 }, { "epoch": 0.04627272727272727, "grad_norm": 9.0625, "grad_norm_var": 4.038525390625, "learning_rate": 0.0001, "loss": 7.3135, "loss/crossentropy": 2.5775824189186096, "loss/hidden": 2.01953125, "loss/jsd": 0.0, "loss/logits": 0.271636538207531, "step": 1018 }, { "epoch": 0.046363636363636364, "grad_norm": 8.375, "grad_norm_var": 4.250972493489583, "learning_rate": 0.0001, "loss": 6.9496, "loss/crossentropy": 2.3780550956726074, "loss/hidden": 2.03125, "loss/jsd": 0.0, "loss/logits": 0.2540319114923477, "step": 1020 }, { "epoch": 0.04645454545454546, "grad_norm": 9.0, "grad_norm_var": 3.92769775390625, "learning_rate": 0.0001, "loss": 6.9589, "loss/crossentropy": 2.3662286400794983, "loss/hidden": 2.03515625, "loss/jsd": 0.0, "loss/logits": 0.25575150176882744, "step": 1022 }, { "epoch": 0.04654545454545454, "grad_norm": 7.71875, "grad_norm_var": 4.026171875, "learning_rate": 0.0001, "loss": 7.279, "loss/crossentropy": 2.652387320995331, "loss/hidden": 1.958984375, "loss/jsd": 0.0, "loss/logits": 0.26676736027002335, "step": 1024 }, { "epoch": 0.046636363636363636, "grad_norm": 8.4375, "grad_norm_var": 4.148942057291666, "learning_rate": 0.0001, "loss": 6.871, "loss/crossentropy": 2.4122619032859802, "loss/hidden": 2.015625, "loss/jsd": 0.0, "loss/logits": 0.24431489780545235, "step": 1026 }, { "epoch": 0.04672727272727273, "grad_norm": 7.96875, "grad_norm_var": 4.190816243489583, "learning_rate": 0.0001, "loss": 7.2708, "loss/crossentropy": 2.607126474380493, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.26870811730623245, "step": 1028 }, { "epoch": 0.04681818181818182, "grad_norm": 8.0625, "grad_norm_var": 0.2738566080729167, "learning_rate": 0.0001, "loss": 7.3169, "loss/crossentropy": 2.5835671424865723, "loss/hidden": 2.001953125, "loss/jsd": 0.0, "loss/logits": 0.27314095944166183, "step": 1030 }, { "epoch": 0.04690909090909091, "grad_norm": 27.0, "grad_norm_var": 21.97349853515625, "learning_rate": 0.0001, "loss": 7.8977, "loss/crossentropy": 2.5888620615005493, "loss/hidden": 2.021484375, "loss/jsd": 0.0, "loss/logits": 0.32873429358005524, "step": 1032 }, { "epoch": 0.047, "grad_norm": 8.4375, "grad_norm_var": 21.963212076822916, "learning_rate": 0.0001, "loss": 6.9084, "loss/crossentropy": 2.2471708953380585, "loss/hidden": 2.03125, "loss/jsd": 0.0, "loss/logits": 0.262998778373003, "step": 1034 }, { "epoch": 0.04709090909090909, "grad_norm": 8.875, "grad_norm_var": 21.561328125, "learning_rate": 0.0001, "loss": 7.1595, "loss/crossentropy": 2.3758057355880737, "loss/hidden": 2.05078125, "loss/jsd": 0.0, "loss/logits": 0.2732931636273861, "step": 1036 }, { "epoch": 0.04718181818181818, "grad_norm": 12.5, "grad_norm_var": 22.007145182291666, "learning_rate": 0.0001, "loss": 7.5147, "loss/crossentropy": 2.548030436038971, "loss/hidden": 2.015625, "loss/jsd": 0.0, "loss/logits": 0.2951078340411186, "step": 1038 }, { "epoch": 0.04727272727272727, "grad_norm": 8.375, "grad_norm_var": 21.69439697265625, "learning_rate": 0.0001, "loss": 7.8578, "loss/crossentropy": 2.881781220436096, "loss/hidden": 2.013671875, "loss/jsd": 0.0, "loss/logits": 0.296232208609581, "step": 1040 }, { "epoch": 0.047363636363636365, "grad_norm": 8.375, "grad_norm_var": 21.7201171875, "learning_rate": 0.0001, "loss": 7.1442, "loss/crossentropy": 2.5386404991149902, "loss/hidden": 1.962890625, "loss/jsd": 0.0, "loss/logits": 0.26426322758197784, "step": 1042 }, { "epoch": 0.04745454545454546, "grad_norm": 8.4375, "grad_norm_var": 22.08541259765625, "learning_rate": 0.0001, "loss": 7.7434, "loss/crossentropy": 2.6462042927742004, "loss/hidden": 2.05859375, "loss/jsd": 0.0, "loss/logits": 0.3038581982254982, "step": 1044 }, { "epoch": 0.047545454545454544, "grad_norm": 8.375, "grad_norm_var": 22.40621337890625, "learning_rate": 0.0001, "loss": 7.0371, "loss/crossentropy": 2.2035286724567413, "loss/hidden": 2.08984375, "loss/jsd": 0.0, "loss/logits": 0.27437590807676315, "step": 1046 }, { "epoch": 0.047636363636363636, "grad_norm": 7.875, "grad_norm_var": 4.522786458333333, "learning_rate": 0.0001, "loss": 7.1754, "loss/crossentropy": 2.605966627597809, "loss/hidden": 1.978515625, "loss/jsd": 0.0, "loss/logits": 0.25908737257122993, "step": 1048 }, { "epoch": 0.04772727272727273, "grad_norm": 8.125, "grad_norm_var": 4.673307291666666, "learning_rate": 0.0001, "loss": 7.5399, "loss/crossentropy": 2.8035109639167786, "loss/hidden": 1.94140625, "loss/jsd": 0.0, "loss/logits": 0.27950040251016617, "step": 1050 }, { "epoch": 0.047818181818181815, "grad_norm": 16.0, "grad_norm_var": 7.414827473958334, "learning_rate": 0.0001, "loss": 7.4942, "loss/crossentropy": 2.723147451877594, "loss/hidden": 2.015625, "loss/jsd": 0.0, "loss/logits": 0.27554241567850113, "step": 1052 }, { "epoch": 0.04790909090909091, "grad_norm": 9.0625, "grad_norm_var": 6.899283854166667, "learning_rate": 0.0001, "loss": 7.8025, "loss/crossentropy": 2.86588454246521, "loss/hidden": 1.978515625, "loss/jsd": 0.0, "loss/logits": 0.29580553621053696, "step": 1054 }, { "epoch": 0.048, "grad_norm": 8.5625, "grad_norm_var": 6.8513671875, "learning_rate": 0.0001, "loss": 7.2894, "loss/crossentropy": 2.4514788687229156, "loss/hidden": 2.0390625, "loss/jsd": 0.0, "loss/logits": 0.2798847258090973, "step": 1056 }, { "epoch": 0.048090909090909094, "grad_norm": 8.1875, "grad_norm_var": 6.74136962890625, "learning_rate": 0.0001, "loss": 7.3084, "loss/crossentropy": 2.6378528475761414, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.267832450568676, "step": 1058 }, { "epoch": 0.04818181818181818, "grad_norm": 8.75, "grad_norm_var": 5.619462076822916, "learning_rate": 0.0001, "loss": 7.8683, "loss/crossentropy": 2.810686945915222, "loss/hidden": 2.029296875, "loss/jsd": 0.0, "loss/logits": 0.30283160507678986, "step": 1060 }, { "epoch": 0.04827272727272727, "grad_norm": 8.1875, "grad_norm_var": 3.7253214518229165, "learning_rate": 0.0001, "loss": 7.1779, "loss/crossentropy": 2.5254364609718323, "loss/hidden": 2.0078125, "loss/jsd": 0.0, "loss/logits": 0.2644653394818306, "step": 1062 }, { "epoch": 0.048363636363636366, "grad_norm": 22.0, "grad_norm_var": 14.346875, "learning_rate": 0.0001, "loss": 6.965, "loss/crossentropy": 2.3269357681274414, "loss/hidden": 1.978515625, "loss/jsd": 0.0, "loss/logits": 0.2659508213400841, "step": 1064 }, { "epoch": 0.04845454545454545, "grad_norm": 8.0625, "grad_norm_var": 14.286393229166666, "learning_rate": 0.0001, "loss": 7.6718, "loss/crossentropy": 2.7643895149230957, "loss/hidden": 2.01953125, "loss/jsd": 0.0, "loss/logits": 0.28879255428910255, "step": 1066 }, { "epoch": 0.048545454545454544, "grad_norm": 7.59375, "grad_norm_var": 12.032275390625, "learning_rate": 0.0001, "loss": 7.0394, "loss/crossentropy": 2.4835625290870667, "loss/hidden": 2.0078125, "loss/jsd": 0.0, "loss/logits": 0.2547999769449234, "step": 1068 }, { "epoch": 0.04863636363636364, "grad_norm": 7.5625, "grad_norm_var": 12.379557291666666, "learning_rate": 0.0001, "loss": 7.0989, "loss/crossentropy": 2.5719141960144043, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.25895189121365547, "step": 1070 }, { "epoch": 0.04872727272727273, "grad_norm": 9.0, "grad_norm_var": 12.352994791666667, "learning_rate": 0.0001, "loss": 7.7677, "loss/crossentropy": 2.8079739212989807, "loss/hidden": 2.0078125, "loss/jsd": 0.0, "loss/logits": 0.29519277811050415, "step": 1072 }, { "epoch": 0.048818181818181816, "grad_norm": 7.9375, "grad_norm_var": 12.398160807291667, "learning_rate": 0.0001, "loss": 7.5114, "loss/crossentropy": 2.700074255466461, "loss/hidden": 1.990234375, "loss/jsd": 0.0, "loss/logits": 0.28211110830307007, "step": 1074 }, { "epoch": 0.04890909090909091, "grad_norm": 7.53125, "grad_norm_var": 12.558915201822916, "learning_rate": 0.0001, "loss": 7.5738, "loss/crossentropy": 2.809931457042694, "loss/hidden": 1.953125, "loss/jsd": 0.0, "loss/logits": 0.28107593208551407, "step": 1076 }, { "epoch": 0.049, "grad_norm": 7.96875, "grad_norm_var": 12.569775390625, "learning_rate": 0.0001, "loss": 7.3025, "loss/crossentropy": 2.548322379589081, "loss/hidden": 2.04296875, "loss/jsd": 0.0, "loss/logits": 0.2711233124136925, "step": 1078 }, { "epoch": 0.04909090909090909, "grad_norm": 8.3125, "grad_norm_var": 0.739306640625, "learning_rate": 0.0001, "loss": 7.8143, "loss/crossentropy": 2.905961275100708, "loss/hidden": 2.009765625, "loss/jsd": 0.0, "loss/logits": 0.28986065089702606, "step": 1080 }, { "epoch": 0.04918181818181818, "grad_norm": 7.875, "grad_norm_var": 0.31555582682291666, "learning_rate": 0.0001, "loss": 7.0247, "loss/crossentropy": 2.45279461145401, "loss/hidden": 1.97265625, "loss/jsd": 0.0, "loss/logits": 0.2599208652973175, "step": 1082 }, { "epoch": 0.049272727272727274, "grad_norm": 7.6875, "grad_norm_var": 0.28297119140625, "learning_rate": 0.0001, "loss": 7.0251, "loss/crossentropy": 2.5124455094337463, "loss/hidden": 1.95703125, "loss/jsd": 0.0, "loss/logits": 0.2555593438446522, "step": 1084 }, { "epoch": 0.049363636363636366, "grad_norm": 8.75, "grad_norm_var": 0.2875284830729167, "learning_rate": 0.0001, "loss": 7.3728, "loss/crossentropy": 2.7131059765815735, "loss/hidden": 1.953125, "loss/jsd": 0.0, "loss/logits": 0.27065833657979965, "step": 1086 }, { "epoch": 0.04945454545454545, "grad_norm": 8.1875, "grad_norm_var": 0.24397379557291668, "learning_rate": 0.0001, "loss": 6.9742, "loss/crossentropy": 2.4336851835250854, "loss/hidden": 2.005859375, "loss/jsd": 0.0, "loss/logits": 0.25346213579177856, "step": 1088 }, { "epoch": 0.049545454545454545, "grad_norm": 7.34375, "grad_norm_var": 0.2947916666666667, "learning_rate": 0.0001, "loss": 6.8968, "loss/crossentropy": 2.4370792508125305, "loss/hidden": 1.919921875, "loss/jsd": 0.0, "loss/logits": 0.25397682189941406, "step": 1090 }, { "epoch": 0.04963636363636364, "grad_norm": 7.0625, "grad_norm_var": 0.33264567057291666, "learning_rate": 0.0001, "loss": 7.0417, "loss/crossentropy": 2.568955361843109, "loss/hidden": 1.9609375, "loss/jsd": 0.0, "loss/logits": 0.25117628648877144, "step": 1092 }, { "epoch": 0.049727272727272724, "grad_norm": 7.8125, "grad_norm_var": 0.33893229166666666, "learning_rate": 0.0001, "loss": 6.8873, "loss/crossentropy": 2.4283714294433594, "loss/hidden": 1.96875, "loss/jsd": 0.0, "loss/logits": 0.2490144670009613, "step": 1094 }, { "epoch": 0.04981818181818182, "grad_norm": 7.625, "grad_norm_var": 0.3265462239583333, "learning_rate": 0.0001, "loss": 6.7582, "loss/crossentropy": 2.329250931739807, "loss/hidden": 1.96484375, "loss/jsd": 0.0, "loss/logits": 0.24641036987304688, "step": 1096 }, { "epoch": 0.04990909090909091, "grad_norm": 8.625, "grad_norm_var": 0.3817342122395833, "learning_rate": 0.0001, "loss": 7.2666, "loss/crossentropy": 2.5970011949539185, "loss/hidden": 1.98828125, "loss/jsd": 0.0, "loss/logits": 0.26813556253910065, "step": 1098 }, { "epoch": 0.05, "grad_norm": 7.46875, "grad_norm_var": 0.37330322265625, "learning_rate": 0.0001, "loss": 7.1421, "loss/crossentropy": 2.543682813644409, "loss/hidden": 1.955078125, "loss/jsd": 0.0, "loss/logits": 0.26433587074279785, "step": 1100 }, { "epoch": 0.05009090909090909, "grad_norm": 9.25, "grad_norm_var": 0.6111612955729167, "learning_rate": 0.0001, "loss": 7.6046, "loss/crossentropy": 2.6838017106056213, "loss/hidden": 2.0078125, "loss/jsd": 0.0, "loss/logits": 0.2912943586707115, "step": 1102 }, { "epoch": 0.05018181818181818, "grad_norm": 9.0, "grad_norm_var": 0.7199178059895833, "learning_rate": 0.0001, "loss": 6.9191, "loss/crossentropy": 2.4252968430519104, "loss/hidden": 1.978515625, "loss/jsd": 0.0, "loss/logits": 0.2515271008014679, "step": 1104 }, { "epoch": 0.050272727272727274, "grad_norm": 93.5, "grad_norm_var": 455.0230305989583, "learning_rate": 0.0001, "loss": 7.806, "loss/crossentropy": 2.6990936398506165, "loss/hidden": 2.103515625, "loss/jsd": 0.0, "loss/logits": 0.30033597350120544, "step": 1106 }, { "epoch": 0.05036363636363636, "grad_norm": 8.375, "grad_norm_var": 453.15533854166665, "learning_rate": 0.0001, "loss": 7.4831, "loss/crossentropy": 2.66216242313385, "loss/hidden": 1.986328125, "loss/jsd": 0.0, "loss/logits": 0.28346090763807297, "step": 1108 }, { "epoch": 0.05045454545454545, "grad_norm": 8.625, "grad_norm_var": 452.42356770833334, "learning_rate": 0.0001, "loss": 6.8696, "loss/crossentropy": 2.4224236607551575, "loss/hidden": 2.01171875, "loss/jsd": 0.0, "loss/logits": 0.2435428872704506, "step": 1110 }, { "epoch": 0.050545454545454546, "grad_norm": 8.5625, "grad_norm_var": 450.660009765625, "learning_rate": 0.0001, "loss": 7.0951, "loss/crossentropy": 2.4986310601234436, "loss/hidden": 2.01171875, "loss/jsd": 0.0, "loss/logits": 0.25847018137574196, "step": 1112 }, { "epoch": 0.05063636363636364, "grad_norm": 8.6875, "grad_norm_var": 449.54099934895834, "learning_rate": 0.0001, "loss": 7.3582, "loss/crossentropy": 2.549586594104767, "loss/hidden": 2.04296875, "loss/jsd": 0.0, "loss/logits": 0.276569165289402, "step": 1114 }, { "epoch": 0.050727272727272725, "grad_norm": 8.875, "grad_norm_var": 447.92395833333336, "learning_rate": 0.0001, "loss": 7.1084, "loss/crossentropy": 2.449738323688507, "loss/hidden": 1.99609375, "loss/jsd": 0.0, "loss/logits": 0.26625825464725494, "step": 1116 }, { "epoch": 0.05081818181818182, "grad_norm": 8.25, "grad_norm_var": 449.24895833333335, "learning_rate": 0.0001, "loss": 7.1738, "loss/crossentropy": 2.629892349243164, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.26181699708104134, "step": 1118 }, { "epoch": 0.05090909090909091, "grad_norm": 7.78125, "grad_norm_var": 450.9629842122396, "learning_rate": 0.0001, "loss": 7.1883, "loss/crossentropy": 2.590371787548065, "loss/hidden": 1.970703125, "loss/jsd": 0.0, "loss/logits": 0.2627180181443691, "step": 1120 }, { "epoch": 0.051, "grad_norm": 7.78125, "grad_norm_var": 0.24361979166666667, "learning_rate": 0.0001, "loss": 6.8613, "loss/crossentropy": 2.327029824256897, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.2557664141058922, "step": 1122 }, { "epoch": 0.05109090909090909, "grad_norm": 8.625, "grad_norm_var": 0.162109375, "learning_rate": 0.0001, "loss": 7.3537, "loss/crossentropy": 2.67605984210968, "loss/hidden": 1.96875, "loss/jsd": 0.0, "loss/logits": 0.27089108526706696, "step": 1124 }, { "epoch": 0.05118181818181818, "grad_norm": 8.4375, "grad_norm_var": 0.263916015625, "learning_rate": 0.0001, "loss": 7.263, "loss/crossentropy": 2.5770376324653625, "loss/hidden": 1.966796875, "loss/jsd": 0.0, "loss/logits": 0.2719154618680477, "step": 1126 }, { "epoch": 0.051272727272727275, "grad_norm": 8.5625, "grad_norm_var": 0.806640625, "learning_rate": 0.0001, "loss": 7.3102, "loss/crossentropy": 2.5811874866485596, "loss/hidden": 1.978515625, "loss/jsd": 0.0, "loss/logits": 0.2750508300960064, "step": 1128 }, { "epoch": 0.05136363636363636, "grad_norm": 8.125, "grad_norm_var": 0.822265625, "learning_rate": 0.0001, "loss": 6.9273, "loss/crossentropy": 2.4703776240348816, "loss/hidden": 1.9296875, "loss/jsd": 0.0, "loss/logits": 0.25272640585899353, "step": 1130 }, { "epoch": 0.051454545454545454, "grad_norm": 8.8125, "grad_norm_var": 0.8242838541666667, "learning_rate": 0.0001, "loss": 7.1974, "loss/crossentropy": 2.613615930080414, "loss/hidden": 1.96875, "loss/jsd": 0.0, "loss/logits": 0.26150811836123466, "step": 1132 }, { "epoch": 0.05154545454545455, "grad_norm": 7.78125, "grad_norm_var": 0.81900634765625, "learning_rate": 0.0001, "loss": 7.5199, "loss/crossentropy": 2.8864769339561462, "loss/hidden": 1.939453125, "loss/jsd": 0.0, "loss/logits": 0.2693956345319748, "step": 1134 }, { "epoch": 0.05163636363636363, "grad_norm": 8.125, "grad_norm_var": 0.8345052083333333, "learning_rate": 0.0001, "loss": 7.0275, "loss/crossentropy": 2.409299373626709, "loss/hidden": 2.0390625, "loss/jsd": 0.0, "loss/logits": 0.25791673362255096, "step": 1136 }, { "epoch": 0.051727272727272726, "grad_norm": 8.0625, "grad_norm_var": 0.8821573893229167, "learning_rate": 0.0001, "loss": 7.074, "loss/crossentropy": 2.5273653268814087, "loss/hidden": 1.943359375, "loss/jsd": 0.0, "loss/logits": 0.26032573729753494, "step": 1138 }, { "epoch": 0.05181818181818182, "grad_norm": 7.28125, "grad_norm_var": 0.9413899739583333, "learning_rate": 0.0001, "loss": 7.2, "loss/crossentropy": 2.5874040126800537, "loss/hidden": 1.962890625, "loss/jsd": 0.0, "loss/logits": 0.26496652513742447, "step": 1140 }, { "epoch": 0.05190909090909091, "grad_norm": 8.125, "grad_norm_var": 0.8661092122395834, "learning_rate": 0.0001, "loss": 7.2164, "loss/crossentropy": 2.6226788759231567, "loss/hidden": 1.9140625, "loss/jsd": 0.0, "loss/logits": 0.26797088980674744, "step": 1142 }, { "epoch": 0.052, "grad_norm": 7.53125, "grad_norm_var": 0.28665364583333336, "learning_rate": 0.0001, "loss": 6.9083, "loss/crossentropy": 2.541751265525818, "loss/hidden": 1.927734375, "loss/jsd": 0.0, "loss/logits": 0.24388077855110168, "step": 1144 }, { "epoch": 0.05209090909090909, "grad_norm": 9.75, "grad_norm_var": 0.447900390625, "learning_rate": 0.0001, "loss": 7.3472, "loss/crossentropy": 2.746846914291382, "loss/hidden": 1.9453125, "loss/jsd": 0.0, "loss/logits": 0.26549940556287766, "step": 1146 }, { "epoch": 0.05218181818181818, "grad_norm": 7.5625, "grad_norm_var": 0.4505208333333333, "learning_rate": 0.0001, "loss": 7.4515, "loss/crossentropy": 2.8127639293670654, "loss/hidden": 1.94921875, "loss/jsd": 0.0, "loss/logits": 0.26895536482334137, "step": 1148 }, { "epoch": 0.05227272727272727, "grad_norm": 8.125, "grad_norm_var": 0.5622233072916667, "learning_rate": 0.0001, "loss": 7.2417, "loss/crossentropy": 2.7256194949150085, "loss/hidden": 1.935546875, "loss/jsd": 0.0, "loss/logits": 0.2580496035516262, "step": 1150 }, { "epoch": 0.05236363636363636, "grad_norm": 8.0, "grad_norm_var": 0.49179280598958336, "learning_rate": 0.0001, "loss": 7.2844, "loss/crossentropy": 2.7250009775161743, "loss/hidden": 1.908203125, "loss/jsd": 0.0, "loss/logits": 0.26511870324611664, "step": 1152 }, { "epoch": 0.052454545454545455, "grad_norm": 7.0, "grad_norm_var": 0.5304646809895833, "learning_rate": 0.0001, "loss": 6.7007, "loss/crossentropy": 2.3156438767910004, "loss/hidden": 1.908203125, "loss/jsd": 0.0, "loss/logits": 0.24768873676657677, "step": 1154 }, { "epoch": 0.05254545454545455, "grad_norm": 7.78125, "grad_norm_var": 0.483984375, "learning_rate": 0.0001, "loss": 7.2939, "loss/crossentropy": 2.6349948048591614, "loss/hidden": 1.970703125, "loss/jsd": 0.0, "loss/logits": 0.26881861686706543, "step": 1156 }, { "epoch": 0.052636363636363634, "grad_norm": 8.3125, "grad_norm_var": 0.5800089518229167, "learning_rate": 0.0001, "loss": 7.0235, "loss/crossentropy": 2.4151678383350372, "loss/hidden": 1.982421875, "loss/jsd": 0.0, "loss/logits": 0.26258858293294907, "step": 1158 }, { "epoch": 0.05272727272727273, "grad_norm": 8.1875, "grad_norm_var": 0.5236328125, "learning_rate": 0.0001, "loss": 7.5599, "loss/crossentropy": 2.8179251551628113, "loss/hidden": 1.9296875, "loss/jsd": 0.0, "loss/logits": 0.28122442960739136, "step": 1160 }, { "epoch": 0.05281818181818182, "grad_norm": 8.0, "grad_norm_var": 0.31197916666666664, "learning_rate": 0.0001, "loss": 7.4412, "loss/crossentropy": 2.686148703098297, "loss/hidden": 1.9609375, "loss/jsd": 0.0, "loss/logits": 0.2794131822884083, "step": 1162 }, { "epoch": 0.05290909090909091, "grad_norm": 7.625, "grad_norm_var": 0.3078125, "learning_rate": 0.0001, "loss": 6.9202, "loss/crossentropy": 2.4211618304252625, "loss/hidden": 1.962890625, "loss/jsd": 0.0, "loss/logits": 0.2536177709698677, "step": 1164 }, { "epoch": 0.053, "grad_norm": 8.625, "grad_norm_var": 0.24869384765625, "learning_rate": 0.0001, "loss": 7.3721, "loss/crossentropy": 2.732385754585266, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.264754094183445, "step": 1166 }, { "epoch": 0.05309090909090909, "grad_norm": 7.90625, "grad_norm_var": 0.24970296223958333, "learning_rate": 0.0001, "loss": 7.4533, "loss/crossentropy": 2.7934324145317078, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.2734098732471466, "step": 1168 }, { "epoch": 0.053181818181818184, "grad_norm": 8.0, "grad_norm_var": 0.3024698893229167, "learning_rate": 0.0001, "loss": 7.2325, "loss/crossentropy": 2.538682460784912, "loss/hidden": 1.998046875, "loss/jsd": 0.0, "loss/logits": 0.26958150416612625, "step": 1170 }, { "epoch": 0.05327272727272727, "grad_norm": 7.0625, "grad_norm_var": 0.41731363932291665, "learning_rate": 0.0001, "loss": 6.7343, "loss/crossentropy": 2.349640130996704, "loss/hidden": 1.919921875, "loss/jsd": 0.0, "loss/logits": 0.24647089838981628, "step": 1172 }, { "epoch": 0.05336363636363636, "grad_norm": 9.1875, "grad_norm_var": 0.4779256184895833, "learning_rate": 0.0001, "loss": 7.4632, "loss/crossentropy": 2.7870468497276306, "loss/hidden": 1.935546875, "loss/jsd": 0.0, "loss/logits": 0.2740621343255043, "step": 1174 }, { "epoch": 0.053454545454545456, "grad_norm": 8.6875, "grad_norm_var": 10.27838134765625, "learning_rate": 0.0001, "loss": 7.2373, "loss/crossentropy": 2.6096192598342896, "loss/hidden": 2.02734375, "loss/jsd": 0.0, "loss/logits": 0.2600362226366997, "step": 1176 }, { "epoch": 0.05354545454545455, "grad_norm": 7.40625, "grad_norm_var": 10.436962890625, "learning_rate": 0.0001, "loss": 7.1269, "loss/crossentropy": 2.6223524808883667, "loss/hidden": 1.939453125, "loss/jsd": 0.0, "loss/logits": 0.2565115988254547, "step": 1178 }, { "epoch": 0.053636363636363635, "grad_norm": 9.125, "grad_norm_var": 10.393733723958333, "learning_rate": 0.0001, "loss": 6.4248, "loss/crossentropy": 2.126849442720413, "loss/hidden": 1.951171875, "loss/jsd": 0.0, "loss/logits": 0.23467664048075676, "step": 1180 }, { "epoch": 0.05372727272727273, "grad_norm": 7.625, "grad_norm_var": 10.433138020833333, "learning_rate": 0.0001, "loss": 7.0188, "loss/crossentropy": 2.5513610243797302, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.25299783423542976, "step": 1182 }, { "epoch": 0.05381818181818182, "grad_norm": 8.875, "grad_norm_var": 10.331245930989583, "learning_rate": 0.0001, "loss": 7.5648, "loss/crossentropy": 2.8133671283721924, "loss/hidden": 1.96484375, "loss/jsd": 0.0, "loss/logits": 0.27865782380104065, "step": 1184 }, { "epoch": 0.053909090909090907, "grad_norm": 8.1875, "grad_norm_var": 10.34586181640625, "learning_rate": 0.0001, "loss": 7.267, "loss/crossentropy": 2.636054754257202, "loss/hidden": 1.9296875, "loss/jsd": 0.0, "loss/logits": 0.270121194422245, "step": 1186 }, { "epoch": 0.054, "grad_norm": 8.8125, "grad_norm_var": 9.724702962239583, "learning_rate": 0.0001, "loss": 7.4639, "loss/crossentropy": 2.7734354734420776, "loss/hidden": 1.970703125, "loss/jsd": 0.0, "loss/logits": 0.2719772234559059, "step": 1188 }, { "epoch": 0.05409090909090909, "grad_norm": 8.25, "grad_norm_var": 9.887300618489583, "learning_rate": 0.0001, "loss": 7.4543, "loss/crossentropy": 2.755846858024597, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.277263842523098, "step": 1190 }, { "epoch": 0.054181818181818185, "grad_norm": 7.90625, "grad_norm_var": 0.5656087239583333, "learning_rate": 0.0001, "loss": 7.2412, "loss/crossentropy": 2.491327702999115, "loss/hidden": 1.984375, "loss/jsd": 0.0, "loss/logits": 0.2765457257628441, "step": 1192 }, { "epoch": 0.05427272727272727, "grad_norm": 6.4375, "grad_norm_var": 0.7748697916666667, "learning_rate": 0.0001, "loss": 6.7082, "loss/crossentropy": 2.4197921752929688, "loss/hidden": 1.9140625, "loss/jsd": 0.0, "loss/logits": 0.2374388910830021, "step": 1194 }, { "epoch": 0.054363636363636364, "grad_norm": 8.625, "grad_norm_var": 0.713525390625, "learning_rate": 0.0001, "loss": 7.2952, "loss/crossentropy": 2.7264302372932434, "loss/hidden": 1.943359375, "loss/jsd": 0.0, "loss/logits": 0.26253872737288475, "step": 1196 }, { "epoch": 0.05445454545454546, "grad_norm": 8.6875, "grad_norm_var": 0.77828369140625, "learning_rate": 0.0001, "loss": 7.1958, "loss/crossentropy": 2.626085937023163, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.2651774138212204, "step": 1198 }, { "epoch": 0.05454545454545454, "grad_norm": 6.96875, "grad_norm_var": 0.83541259765625, "learning_rate": 0.0001, "loss": 6.5731, "loss/crossentropy": 2.384039342403412, "loss/hidden": 1.904296875, "loss/jsd": 0.0, "loss/logits": 0.22847458347678185, "step": 1200 }, { "epoch": 0.054636363636363636, "grad_norm": 8.0625, "grad_norm_var": 0.5917277018229167, "learning_rate": 0.0001, "loss": 7.1627, "loss/crossentropy": 2.547714650630951, "loss/hidden": 1.966796875, "loss/jsd": 0.0, "loss/logits": 0.26481954008340836, "step": 1202 }, { "epoch": 0.05472727272727273, "grad_norm": 8.4375, "grad_norm_var": 0.44855143229166666, "learning_rate": 0.0001, "loss": 7.1103, "loss/crossentropy": 2.5330899357795715, "loss/hidden": 1.96875, "loss/jsd": 0.0, "loss/logits": 0.26084573194384575, "step": 1204 }, { "epoch": 0.05481818181818182, "grad_norm": 7.59375, "grad_norm_var": 0.43878580729166666, "learning_rate": 0.0001, "loss": 6.7698, "loss/crossentropy": 2.4256432950496674, "loss/hidden": 1.912109375, "loss/jsd": 0.0, "loss/logits": 0.24320050328969955, "step": 1206 }, { "epoch": 0.05490909090909091, "grad_norm": 7.65625, "grad_norm_var": 1.723291015625, "learning_rate": 0.0001, "loss": 6.8614, "loss/crossentropy": 2.391255557537079, "loss/hidden": 1.9140625, "loss/jsd": 0.0, "loss/logits": 0.25560425966978073, "step": 1208 }, { "epoch": 0.055, "grad_norm": 8.0, "grad_norm_var": 1.52564697265625, "learning_rate": 0.0001, "loss": 6.5985, "loss/crossentropy": 2.2560774087905884, "loss/hidden": 1.94140625, "loss/jsd": 0.0, "loss/logits": 0.24009989574551582, "step": 1210 }, { "epoch": 0.05509090909090909, "grad_norm": 8.4375, "grad_norm_var": 1.49586181640625, "learning_rate": 0.0001, "loss": 7.2594, "loss/crossentropy": 2.7265680730342865, "loss/hidden": 1.93359375, "loss/jsd": 0.0, "loss/logits": 0.2599274218082428, "step": 1212 }, { "epoch": 0.05518181818181818, "grad_norm": 7.90625, "grad_norm_var": 1.4149698893229166, "learning_rate": 0.0001, "loss": 7.2606, "loss/crossentropy": 2.6719335317611694, "loss/hidden": 1.95703125, "loss/jsd": 0.0, "loss/logits": 0.26316704601049423, "step": 1214 }, { "epoch": 0.05527272727272727, "grad_norm": 7.90625, "grad_norm_var": 1.30201416015625, "learning_rate": 0.0001, "loss": 7.1627, "loss/crossentropy": 2.539786636829376, "loss/hidden": 1.99609375, "loss/jsd": 0.0, "loss/logits": 0.2626775801181793, "step": 1216 }, { "epoch": 0.055363636363636365, "grad_norm": 8.1875, "grad_norm_var": 1.3214803059895834, "learning_rate": 0.0001, "loss": 6.9042, "loss/crossentropy": 2.4450316429138184, "loss/hidden": 1.94921875, "loss/jsd": 0.0, "loss/logits": 0.2509952262043953, "step": 1218 }, { "epoch": 0.05545454545454546, "grad_norm": 6.96875, "grad_norm_var": 1.404150390625, "learning_rate": 0.0001, "loss": 7.1042, "loss/crossentropy": 2.639298677444458, "loss/hidden": 1.919921875, "loss/jsd": 0.0, "loss/logits": 0.25449733436107635, "step": 1220 }, { "epoch": 0.055545454545454544, "grad_norm": 7.75, "grad_norm_var": 1.3927734375, "learning_rate": 0.0001, "loss": 7.3515, "loss/crossentropy": 2.857682764530182, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.25758032873272896, "step": 1222 }, { "epoch": 0.05563636363636364, "grad_norm": 8.375, "grad_norm_var": 0.14084879557291666, "learning_rate": 0.0001, "loss": 7.2311, "loss/crossentropy": 2.582948327064514, "loss/hidden": 1.921875, "loss/jsd": 0.0, "loss/logits": 0.2726235091686249, "step": 1224 }, { "epoch": 0.05572727272727273, "grad_norm": 7.21875, "grad_norm_var": 0.20950520833333333, "learning_rate": 0.0001, "loss": 6.7606, "loss/crossentropy": 2.34921070933342, "loss/hidden": 1.96484375, "loss/jsd": 0.0, "loss/logits": 0.24465255439281464, "step": 1226 }, { "epoch": 0.055818181818181815, "grad_norm": 8.4375, "grad_norm_var": 0.24729410807291666, "learning_rate": 0.0001, "loss": 6.996, "loss/crossentropy": 2.5638477206230164, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.25063879787921906, "step": 1228 }, { "epoch": 0.05590909090909091, "grad_norm": 7.71875, "grad_norm_var": 0.34680582682291666, "learning_rate": 0.0001, "loss": 6.9277, "loss/crossentropy": 2.446713149547577, "loss/hidden": 1.935546875, "loss/jsd": 0.0, "loss/logits": 0.25454553589224815, "step": 1230 }, { "epoch": 0.056, "grad_norm": 8.3125, "grad_norm_var": 0.36018473307291665, "learning_rate": 0.0001, "loss": 7.179, "loss/crossentropy": 2.6133479475975037, "loss/hidden": 1.93359375, "loss/jsd": 0.0, "loss/logits": 0.2632070481777191, "step": 1232 }, { "epoch": 0.056090909090909094, "grad_norm": 7.875, "grad_norm_var": 0.37467041015625, "learning_rate": 0.0001, "loss": 7.0371, "loss/crossentropy": 2.6027392148971558, "loss/hidden": 1.9140625, "loss/jsd": 0.0, "loss/logits": 0.2520269900560379, "step": 1234 }, { "epoch": 0.05618181818181818, "grad_norm": 8.8125, "grad_norm_var": 0.35621337890625, "learning_rate": 0.0001, "loss": 6.8887, "loss/crossentropy": 2.4919639229774475, "loss/hidden": 1.923828125, "loss/jsd": 0.0, "loss/logits": 0.24728695303201675, "step": 1236 }, { "epoch": 0.05627272727272727, "grad_norm": 8.375, "grad_norm_var": 0.36630452473958336, "learning_rate": 0.0001, "loss": 7.1467, "loss/crossentropy": 2.586303472518921, "loss/hidden": 1.939453125, "loss/jsd": 0.0, "loss/logits": 0.2620953470468521, "step": 1238 }, { "epoch": 0.056363636363636366, "grad_norm": 9.0, "grad_norm_var": 0.42170817057291665, "learning_rate": 0.0001, "loss": 7.1099, "loss/crossentropy": 2.585188716650009, "loss/hidden": 1.943359375, "loss/jsd": 0.0, "loss/logits": 0.25813957303762436, "step": 1240 }, { "epoch": 0.05645454545454545, "grad_norm": 7.8125, "grad_norm_var": 0.36627604166666666, "learning_rate": 0.0001, "loss": 6.8879, "loss/crossentropy": 2.4935426712036133, "loss/hidden": 1.94140625, "loss/jsd": 0.0, "loss/logits": 0.24529778584837914, "step": 1242 }, { "epoch": 0.056545454545454545, "grad_norm": 6.625, "grad_norm_var": 0.44153238932291666, "learning_rate": 0.0001, "loss": 6.852, "loss/crossentropy": 2.556459605693817, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.24048806354403496, "step": 1244 }, { "epoch": 0.05663636363636364, "grad_norm": 7.9375, "grad_norm_var": 0.443994140625, "learning_rate": 0.0001, "loss": 6.7599, "loss/crossentropy": 2.355483114719391, "loss/hidden": 1.966796875, "loss/jsd": 0.0, "loss/logits": 0.24376581609249115, "step": 1246 }, { "epoch": 0.05672727272727273, "grad_norm": 8.75, "grad_norm_var": 0.46578369140625, "learning_rate": 0.0001, "loss": 7.2721, "loss/crossentropy": 2.654756784439087, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.26798122376203537, "step": 1248 }, { "epoch": 0.056818181818181816, "grad_norm": 7.84375, "grad_norm_var": 0.43357747395833335, "learning_rate": 0.0001, "loss": 7.4275, "loss/crossentropy": 2.743292510509491, "loss/hidden": 1.96875, "loss/jsd": 0.0, "loss/logits": 0.271550215780735, "step": 1250 }, { "epoch": 0.05690909090909091, "grad_norm": 7.40625, "grad_norm_var": 0.4420857747395833, "learning_rate": 0.0001, "loss": 6.7641, "loss/crossentropy": 2.408964514732361, "loss/hidden": 1.9296875, "loss/jsd": 0.0, "loss/logits": 0.24254048615694046, "step": 1252 }, { "epoch": 0.057, "grad_norm": 8.0625, "grad_norm_var": 0.42001546223958336, "learning_rate": 0.0001, "loss": 6.7934, "loss/crossentropy": 2.4587342739105225, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.24244710057973862, "step": 1254 }, { "epoch": 0.05709090909090909, "grad_norm": 6.875, "grad_norm_var": 0.43995768229166665, "learning_rate": 0.0001, "loss": 6.6442, "loss/crossentropy": 2.3592546582221985, "loss/hidden": 1.921875, "loss/jsd": 0.0, "loss/logits": 0.23630603402853012, "step": 1256 }, { "epoch": 0.05718181818181818, "grad_norm": 8.0, "grad_norm_var": 0.4515462239583333, "learning_rate": 0.0001, "loss": 7.0211, "loss/crossentropy": 2.4988872706890106, "loss/hidden": 1.97265625, "loss/jsd": 0.0, "loss/logits": 0.25495246797800064, "step": 1258 }, { "epoch": 0.057272727272727274, "grad_norm": 8.9375, "grad_norm_var": 0.346728515625, "learning_rate": 0.0001, "loss": 7.3398, "loss/crossentropy": 2.8623791337013245, "loss/hidden": 1.900390625, "loss/jsd": 0.0, "loss/logits": 0.2577030174434185, "step": 1260 }, { "epoch": 0.05736363636363637, "grad_norm": 8.125, "grad_norm_var": 0.2734375, "learning_rate": 0.0001, "loss": 7.2349, "loss/crossentropy": 2.6641156673431396, "loss/hidden": 1.939453125, "loss/jsd": 0.0, "loss/logits": 0.26313356310129166, "step": 1262 }, { "epoch": 0.05745454545454545, "grad_norm": 7.0, "grad_norm_var": 0.4032389322916667, "learning_rate": 0.0001, "loss": 6.4445, "loss/crossentropy": 2.2592975199222565, "loss/hidden": 1.87109375, "loss/jsd": 0.0, "loss/logits": 0.23141195625066757, "step": 1264 }, { "epoch": 0.057545454545454545, "grad_norm": 7.625, "grad_norm_var": 0.39401041666666664, "learning_rate": 0.0001, "loss": 6.7344, "loss/crossentropy": 2.4119213223457336, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.24123146012425423, "step": 1266 }, { "epoch": 0.05763636363636364, "grad_norm": 8.375, "grad_norm_var": 0.6456339518229167, "learning_rate": 0.0001, "loss": 7.3359, "loss/crossentropy": 2.7948678135871887, "loss/hidden": 1.908203125, "loss/jsd": 0.0, "loss/logits": 0.2632821723818779, "step": 1268 }, { "epoch": 0.057727272727272724, "grad_norm": 8.1875, "grad_norm_var": 0.73902587890625, "learning_rate": 0.0001, "loss": 6.8326, "loss/crossentropy": 2.372406542301178, "loss/hidden": 1.9453125, "loss/jsd": 0.0, "loss/logits": 0.2514873594045639, "step": 1270 }, { "epoch": 0.05781818181818182, "grad_norm": 7.5625, "grad_norm_var": 0.6695963541666666, "learning_rate": 0.0001, "loss": 7.127, "loss/crossentropy": 2.556018114089966, "loss/hidden": 1.94921875, "loss/jsd": 0.0, "loss/logits": 0.2621726207435131, "step": 1272 }, { "epoch": 0.05790909090909091, "grad_norm": 8.9375, "grad_norm_var": 0.6959635416666666, "learning_rate": 0.0001, "loss": 7.28, "loss/crossentropy": 2.713866114616394, "loss/hidden": 1.958984375, "loss/jsd": 0.0, "loss/logits": 0.2607143074274063, "step": 1274 }, { "epoch": 0.058, "grad_norm": 7.6875, "grad_norm_var": 0.67926025390625, "learning_rate": 0.0001, "loss": 7.0771, "loss/crossentropy": 2.576918363571167, "loss/hidden": 1.923828125, "loss/jsd": 0.0, "loss/logits": 0.257634773850441, "step": 1276 }, { "epoch": 0.05809090909090909, "grad_norm": 8.3125, "grad_norm_var": 0.6855753580729167, "learning_rate": 0.0001, "loss": 7.44, "loss/crossentropy": 2.7353321313858032, "loss/hidden": 1.921875, "loss/jsd": 0.0, "loss/logits": 0.2782762683928013, "step": 1278 }, { "epoch": 0.05818181818181818, "grad_norm": 8.3125, "grad_norm_var": 0.5083170572916667, "learning_rate": 0.0001, "loss": 7.1675, "loss/crossentropy": 2.547225832939148, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.27023012191057205, "step": 1280 }, { "epoch": 0.058272727272727275, "grad_norm": 9.1875, "grad_norm_var": 0.5866170247395833, "learning_rate": 0.0001, "loss": 6.9273, "loss/crossentropy": 2.5039344429969788, "loss/hidden": 1.931640625, "loss/jsd": 0.0, "loss/logits": 0.24916840717196465, "step": 1282 }, { "epoch": 0.05836363636363636, "grad_norm": 8.5, "grad_norm_var": 7.212984212239584, "learning_rate": 0.0001, "loss": 7.4949, "loss/crossentropy": 2.630077540874481, "loss/hidden": 1.970703125, "loss/jsd": 0.0, "loss/logits": 0.28941285610198975, "step": 1284 }, { "epoch": 0.058454545454545453, "grad_norm": 7.5, "grad_norm_var": 7.363765462239583, "learning_rate": 0.0001, "loss": 6.6026, "loss/crossentropy": 2.338949680328369, "loss/hidden": 1.921875, "loss/jsd": 0.0, "loss/logits": 0.23417918384075165, "step": 1286 }, { "epoch": 0.058545454545454546, "grad_norm": 7.90625, "grad_norm_var": 7.404813639322916, "learning_rate": 0.0001, "loss": 6.8047, "loss/crossentropy": 2.4980801343917847, "loss/hidden": 1.87109375, "loss/jsd": 0.0, "loss/logits": 0.24355736374855042, "step": 1288 }, { "epoch": 0.05863636363636364, "grad_norm": 8.8125, "grad_norm_var": 7.515869140625, "learning_rate": 0.0001, "loss": 6.9623, "loss/crossentropy": 2.5256548523902893, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.2518656402826309, "step": 1290 }, { "epoch": 0.058727272727272725, "grad_norm": 7.46875, "grad_norm_var": 7.553902180989583, "learning_rate": 0.0001, "loss": 6.6995, "loss/crossentropy": 2.41861754655838, "loss/hidden": 1.892578125, "loss/jsd": 0.0, "loss/logits": 0.23882880434393883, "step": 1292 }, { "epoch": 0.05881818181818182, "grad_norm": 15.4375, "grad_norm_var": 10.522456868489583, "learning_rate": 0.0001, "loss": 7.6122, "loss/crossentropy": 2.6197566986083984, "loss/hidden": 1.96875, "loss/jsd": 0.0, "loss/logits": 0.30236879363656044, "step": 1294 }, { "epoch": 0.05890909090909091, "grad_norm": 10.5, "grad_norm_var": 10.682645670572917, "learning_rate": 0.0001, "loss": 7.6911, "loss/crossentropy": 2.8123474717140198, "loss/hidden": 1.98828125, "loss/jsd": 0.0, "loss/logits": 0.2890446111559868, "step": 1296 }, { "epoch": 0.059, "grad_norm": 7.78125, "grad_norm_var": 10.696598307291667, "learning_rate": 0.0001, "loss": 7.034, "loss/crossentropy": 2.5357277393341064, "loss/hidden": 1.958984375, "loss/jsd": 0.0, "loss/logits": 0.2539264000952244, "step": 1298 }, { "epoch": 0.05909090909090909, "grad_norm": 7.8125, "grad_norm_var": 4.690608723958333, "learning_rate": 0.0001, "loss": 7.099, "loss/crossentropy": 2.6270817518234253, "loss/hidden": 1.900390625, "loss/jsd": 0.0, "loss/logits": 0.2571498937904835, "step": 1300 }, { "epoch": 0.05918181818181818, "grad_norm": 8.375, "grad_norm_var": 4.658199055989583, "learning_rate": 0.0001, "loss": 6.9156, "loss/crossentropy": 2.5047536492347717, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.25006451085209846, "step": 1302 }, { "epoch": 0.059272727272727276, "grad_norm": 7.46875, "grad_norm_var": 4.579671223958333, "learning_rate": 0.0001, "loss": 6.9875, "loss/crossentropy": 2.6032822728157043, "loss/hidden": 1.892578125, "loss/jsd": 0.0, "loss/logits": 0.24916110932826996, "step": 1304 }, { "epoch": 0.05936363636363636, "grad_norm": 8.1875, "grad_norm_var": 4.490946451822917, "learning_rate": 0.0001, "loss": 6.9892, "loss/crossentropy": 2.569200098514557, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.25020037591457367, "step": 1306 }, { "epoch": 0.059454545454545454, "grad_norm": 8.5, "grad_norm_var": 4.303999837239584, "learning_rate": 0.0001, "loss": 7.475, "loss/crossentropy": 2.817148804664612, "loss/hidden": 1.916015625, "loss/jsd": 0.0, "loss/logits": 0.27417904138565063, "step": 1308 }, { "epoch": 0.05954545454545455, "grad_norm": 9.5, "grad_norm_var": 1.2173136393229167, "learning_rate": 0.0001, "loss": 7.4892, "loss/crossentropy": 2.779682695865631, "loss/hidden": 1.931640625, "loss/jsd": 0.0, "loss/logits": 0.2777864784002304, "step": 1310 }, { "epoch": 0.05963636363636363, "grad_norm": 7.34375, "grad_norm_var": 0.34295247395833334, "learning_rate": 0.0001, "loss": 7.5065, "loss/crossentropy": 2.8603720664978027, "loss/hidden": 1.912109375, "loss/jsd": 0.0, "loss/logits": 0.2733999118208885, "step": 1312 }, { "epoch": 0.059727272727272726, "grad_norm": 8.125, "grad_norm_var": 0.6210896809895833, "learning_rate": 0.0001, "loss": 7.5656, "loss/crossentropy": 2.7774816155433655, "loss/hidden": 2.0, "loss/jsd": 0.0, "loss/logits": 0.27880871295928955, "step": 1314 }, { "epoch": 0.05981818181818182, "grad_norm": 7.90625, "grad_norm_var": 0.5480305989583333, "learning_rate": 0.0001, "loss": 6.8849, "loss/crossentropy": 2.376310706138611, "loss/hidden": 1.958984375, "loss/jsd": 0.0, "loss/logits": 0.2549576014280319, "step": 1316 }, { "epoch": 0.05990909090909091, "grad_norm": 6.71875, "grad_norm_var": 0.7473917643229167, "learning_rate": 0.0001, "loss": 6.2902, "loss/crossentropy": 2.177956074476242, "loss/hidden": 1.921875, "loss/jsd": 0.0, "loss/logits": 0.21903900057077408, "step": 1318 }, { "epoch": 0.06, "grad_norm": 7.71875, "grad_norm_var": 0.734765625, "learning_rate": 0.0001, "loss": 6.8511, "loss/crossentropy": 2.3824944496154785, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.2531082257628441, "step": 1320 }, { "epoch": 0.06009090909090909, "grad_norm": 10.9375, "grad_norm_var": 1.2394368489583334, "learning_rate": 0.0001, "loss": 7.7072, "loss/crossentropy": 2.7157546281814575, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.30149130150675774, "step": 1322 }, { "epoch": 0.060181818181818184, "grad_norm": 9.1875, "grad_norm_var": 1.3110026041666667, "learning_rate": 0.0001, "loss": 7.8565, "loss/crossentropy": 2.8779205083847046, "loss/hidden": 1.9609375, "loss/jsd": 0.0, "loss/logits": 0.3017624393105507, "step": 1324 }, { "epoch": 0.06027272727272727, "grad_norm": 8.375, "grad_norm_var": 1.2305623372395833, "learning_rate": 0.0001, "loss": 6.7515, "loss/crossentropy": 2.3685740530490875, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.24571321532130241, "step": 1326 }, { "epoch": 0.06036363636363636, "grad_norm": 8.3125, "grad_norm_var": 1.1987263997395834, "learning_rate": 0.0001, "loss": 7.0405, "loss/crossentropy": 2.544330596923828, "loss/hidden": 1.927734375, "loss/jsd": 0.0, "loss/logits": 0.256846159696579, "step": 1328 }, { "epoch": 0.060454545454545455, "grad_norm": 8.625, "grad_norm_var": 1.0003865559895833, "learning_rate": 0.0001, "loss": 7.5511, "loss/crossentropy": 2.7415046095848083, "loss/hidden": 1.9296875, "loss/jsd": 0.0, "loss/logits": 0.2879941761493683, "step": 1330 }, { "epoch": 0.06054545454545455, "grad_norm": 7.5, "grad_norm_var": 1.0813639322916666, "learning_rate": 0.0001, "loss": 6.8602, "loss/crossentropy": 2.4283607602119446, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.25139082595705986, "step": 1332 }, { "epoch": 0.060636363636363634, "grad_norm": 7.28125, "grad_norm_var": 0.878515625, "learning_rate": 0.0001, "loss": 7.2665, "loss/crossentropy": 2.7426722645759583, "loss/hidden": 1.90625, "loss/jsd": 0.0, "loss/logits": 0.2617620415985584, "step": 1334 }, { "epoch": 0.06072727272727273, "grad_norm": 8.3125, "grad_norm_var": 0.8356608072916667, "learning_rate": 0.0001, "loss": 7.2759, "loss/crossentropy": 2.610305905342102, "loss/hidden": 1.947265625, "loss/jsd": 0.0, "loss/logits": 0.27183690667152405, "step": 1336 }, { "epoch": 0.06081818181818182, "grad_norm": 7.5625, "grad_norm_var": 0.4869791666666667, "learning_rate": 0.0001, "loss": 7.0985, "loss/crossentropy": 2.557736873626709, "loss/hidden": 1.93359375, "loss/jsd": 0.0, "loss/logits": 0.2607147991657257, "step": 1338 }, { "epoch": 0.060909090909090906, "grad_norm": 8.75, "grad_norm_var": 0.39016520182291664, "learning_rate": 0.0001, "loss": 7.3848, "loss/crossentropy": 2.689080238342285, "loss/hidden": 1.916015625, "loss/jsd": 0.0, "loss/logits": 0.27796898782253265, "step": 1340 }, { "epoch": 0.061, "grad_norm": 8.625, "grad_norm_var": 0.45904541015625, "learning_rate": 0.0001, "loss": 7.2086, "loss/crossentropy": 2.7037513852119446, "loss/hidden": 1.912109375, "loss/jsd": 0.0, "loss/logits": 0.25927815586328506, "step": 1342 }, { "epoch": 0.06109090909090909, "grad_norm": 8.375, "grad_norm_var": 0.499072265625, "learning_rate": 0.0001, "loss": 6.7542, "loss/crossentropy": 2.3869938254356384, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.24922240525484085, "step": 1344 }, { "epoch": 0.061181818181818184, "grad_norm": 7.90625, "grad_norm_var": 0.45705973307291664, "learning_rate": 0.0001, "loss": 7.1034, "loss/crossentropy": 2.5577882528305054, "loss/hidden": 1.9140625, "loss/jsd": 0.0, "loss/logits": 0.26315247267484665, "step": 1346 }, { "epoch": 0.06127272727272727, "grad_norm": 7.71875, "grad_norm_var": 0.41028238932291666, "learning_rate": 0.0001, "loss": 6.9612, "loss/crossentropy": 2.521449863910675, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.25139546766877174, "step": 1348 }, { "epoch": 0.06136363636363636, "grad_norm": 7.25, "grad_norm_var": 0.40976155598958336, "learning_rate": 0.0001, "loss": 6.833, "loss/crossentropy": 2.4304393529891968, "loss/hidden": 1.931640625, "loss/jsd": 0.0, "loss/logits": 0.24709315598011017, "step": 1350 }, { "epoch": 0.061454545454545456, "grad_norm": 7.75, "grad_norm_var": 0.42587483723958336, "learning_rate": 0.0001, "loss": 7.1871, "loss/crossentropy": 2.66780424118042, "loss/hidden": 1.892578125, "loss/jsd": 0.0, "loss/logits": 0.2626674622297287, "step": 1352 }, { "epoch": 0.06154545454545454, "grad_norm": 9.5625, "grad_norm_var": 0.4281209309895833, "learning_rate": 0.0001, "loss": 6.9704, "loss/crossentropy": 2.456390678882599, "loss/hidden": 1.927734375, "loss/jsd": 0.0, "loss/logits": 0.2586263567209244, "step": 1354 }, { "epoch": 0.061636363636363635, "grad_norm": 7.46875, "grad_norm_var": 0.42838134765625, "learning_rate": 0.0001, "loss": 7.1684, "loss/crossentropy": 2.6644539833068848, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.26133305951952934, "step": 1356 }, { "epoch": 0.06172727272727273, "grad_norm": 8.6875, "grad_norm_var": 0.4244791666666667, "learning_rate": 0.0001, "loss": 7.0253, "loss/crossentropy": 2.508890211582184, "loss/hidden": 1.94921875, "loss/jsd": 0.0, "loss/logits": 0.2567192539572716, "step": 1358 }, { "epoch": 0.06181818181818182, "grad_norm": 8.1875, "grad_norm_var": 0.36061197916666665, "learning_rate": 0.0001, "loss": 6.6788, "loss/crossentropy": 2.3293236196041107, "loss/hidden": 1.89453125, "loss/jsd": 0.0, "loss/logits": 0.24549274519085884, "step": 1360 }, { "epoch": 0.06190909090909091, "grad_norm": 7.59375, "grad_norm_var": 0.38583577473958336, "learning_rate": 0.0001, "loss": 6.8848, "loss/crossentropy": 2.533506691455841, "loss/hidden": 1.888671875, "loss/jsd": 0.0, "loss/logits": 0.24626604095101357, "step": 1362 }, { "epoch": 0.062, "grad_norm": 6.9375, "grad_norm_var": 0.47628580729166664, "learning_rate": 0.0001, "loss": 6.6502, "loss/crossentropy": 2.4291718006134033, "loss/hidden": 1.857421875, "loss/jsd": 0.0, "loss/logits": 0.2363595962524414, "step": 1364 }, { "epoch": 0.06209090909090909, "grad_norm": 7.4375, "grad_norm_var": 0.47642822265625, "learning_rate": 0.0001, "loss": 6.5442, "loss/crossentropy": 2.2908631563186646, "loss/hidden": 1.89453125, "loss/jsd": 0.0, "loss/logits": 0.2358803004026413, "step": 1366 }, { "epoch": 0.06218181818181818, "grad_norm": 8.0, "grad_norm_var": 0.5037068684895833, "learning_rate": 0.0001, "loss": 6.7034, "loss/crossentropy": 2.4458579421043396, "loss/hidden": 1.90625, "loss/jsd": 0.0, "loss/logits": 0.23512868955731392, "step": 1368 }, { "epoch": 0.06227272727272727, "grad_norm": 8.375, "grad_norm_var": 0.33186442057291665, "learning_rate": 0.0001, "loss": 7.1096, "loss/crossentropy": 2.5760458111763, "loss/hidden": 1.908203125, "loss/jsd": 0.0, "loss/logits": 0.26253964751958847, "step": 1370 }, { "epoch": 0.062363636363636364, "grad_norm": 7.0625, "grad_norm_var": 0.36718343098958334, "learning_rate": 0.0001, "loss": 6.9324, "loss/crossentropy": 2.6016812324523926, "loss/hidden": 1.876953125, "loss/jsd": 0.0, "loss/logits": 0.24537566676735878, "step": 1372 }, { "epoch": 0.06245454545454546, "grad_norm": 7.46875, "grad_norm_var": 0.18271077473958333, "learning_rate": 0.0001, "loss": 7.0406, "loss/crossentropy": 2.6739256381988525, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.24643409997224808, "step": 1374 }, { "epoch": 0.06254545454545454, "grad_norm": 7.96875, "grad_norm_var": 0.14508056640625, "learning_rate": 0.0001, "loss": 6.9797, "loss/crossentropy": 2.529656708240509, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.2559405565261841, "step": 1376 }, { "epoch": 0.06263636363636364, "grad_norm": 7.90625, "grad_norm_var": 0.17630208333333333, "learning_rate": 0.0001, "loss": 6.8985, "loss/crossentropy": 2.5282881259918213, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.2460016943514347, "step": 1378 }, { "epoch": 0.06272727272727273, "grad_norm": 7.0, "grad_norm_var": 0.21070556640625, "learning_rate": 0.0001, "loss": 6.7443, "loss/crossentropy": 2.4323253631591797, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.24213355407118797, "step": 1380 }, { "epoch": 0.06281818181818181, "grad_norm": 7.5625, "grad_norm_var": 0.2232421875, "learning_rate": 0.0001, "loss": 7.1879, "loss/crossentropy": 2.8065810799598694, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.24906540662050247, "step": 1382 }, { "epoch": 0.06290909090909091, "grad_norm": 6.78125, "grad_norm_var": 0.27206624348958336, "learning_rate": 0.0001, "loss": 7.1183, "loss/crossentropy": 2.6850576400756836, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.2542582005262375, "step": 1384 }, { "epoch": 0.063, "grad_norm": 8.0, "grad_norm_var": 0.2570271809895833, "learning_rate": 0.0001, "loss": 7.0894, "loss/crossentropy": 2.554163694381714, "loss/hidden": 1.892578125, "loss/jsd": 0.0, "loss/logits": 0.2642667293548584, "step": 1386 }, { "epoch": 0.06309090909090909, "grad_norm": 8.5625, "grad_norm_var": 0.24192301432291666, "learning_rate": 0.0001, "loss": 7.0705, "loss/crossentropy": 2.668620526790619, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.2511293701827526, "step": 1388 }, { "epoch": 0.06318181818181819, "grad_norm": 8.125, "grad_norm_var": 0.24205322265625, "learning_rate": 0.0001, "loss": 7.5389, "loss/crossentropy": 3.0003084540367126, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.2647942155599594, "step": 1390 }, { "epoch": 0.06327272727272727, "grad_norm": 6.6875, "grad_norm_var": 0.33375244140625, "learning_rate": 0.0001, "loss": 7.1291, "loss/crossentropy": 2.7177412509918213, "loss/hidden": 1.880859375, "loss/jsd": 0.0, "loss/logits": 0.25305069983005524, "step": 1392 }, { "epoch": 0.06336363636363636, "grad_norm": 8.5625, "grad_norm_var": 0.4826171875, "learning_rate": 0.0001, "loss": 7.1178, "loss/crossentropy": 2.6340973377227783, "loss/hidden": 1.923828125, "loss/jsd": 0.0, "loss/logits": 0.25598250329494476, "step": 1394 }, { "epoch": 0.06345454545454546, "grad_norm": 8.125, "grad_norm_var": 0.4263671875, "learning_rate": 0.0001, "loss": 6.9016, "loss/crossentropy": 2.5182031989097595, "loss/hidden": 1.896484375, "loss/jsd": 0.0, "loss/logits": 0.24868660047650337, "step": 1396 }, { "epoch": 0.06354545454545454, "grad_norm": 8.3125, "grad_norm_var": 0.42316080729166666, "learning_rate": 0.0001, "loss": 7.3397, "loss/crossentropy": 2.727614939212799, "loss/hidden": 1.900390625, "loss/jsd": 0.0, "loss/logits": 0.2711666598916054, "step": 1398 }, { "epoch": 0.06363636363636363, "grad_norm": 8.0625, "grad_norm_var": 0.60367431640625, "learning_rate": 0.0001, "loss": 7.3274, "loss/crossentropy": 2.7438981533050537, "loss/hidden": 1.8984375, "loss/jsd": 0.0, "loss/logits": 0.2685081511735916, "step": 1400 }, { "epoch": 0.06372727272727273, "grad_norm": 7.34375, "grad_norm_var": 0.65347900390625, "learning_rate": 0.0001, "loss": 6.9159, "loss/crossentropy": 2.56310498714447, "loss/hidden": 1.884765625, "loss/jsd": 0.0, "loss/logits": 0.24679841846227646, "step": 1402 }, { "epoch": 0.06381818181818182, "grad_norm": 7.0625, "grad_norm_var": 0.7196451822916666, "learning_rate": 0.0001, "loss": 6.7822, "loss/crossentropy": 2.4160358607769012, "loss/hidden": 1.90625, "loss/jsd": 0.0, "loss/logits": 0.2459934651851654, "step": 1404 }, { "epoch": 0.06390909090909092, "grad_norm": 7.28125, "grad_norm_var": 0.8582316080729167, "learning_rate": 0.0001, "loss": 6.8784, "loss/crossentropy": 2.530008375644684, "loss/hidden": 1.908203125, "loss/jsd": 0.0, "loss/logits": 0.24401811137795448, "step": 1406 }, { "epoch": 0.064, "grad_norm": 6.59375, "grad_norm_var": 0.8747029622395833, "learning_rate": 0.0001, "loss": 6.7431, "loss/crossentropy": 2.4738081097602844, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.23904188722372055, "step": 1408 }, { "epoch": 0.06409090909090909, "grad_norm": 7.1875, "grad_norm_var": 0.7227701822916667, "learning_rate": 0.0001, "loss": 6.7258, "loss/crossentropy": 2.4280710220336914, "loss/hidden": 1.89453125, "loss/jsd": 0.0, "loss/logits": 0.24031923711299896, "step": 1410 }, { "epoch": 0.06418181818181819, "grad_norm": 8.25, "grad_norm_var": 0.7248697916666667, "learning_rate": 0.0001, "loss": 7.0319, "loss/crossentropy": 2.63577663898468, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.25211508572101593, "step": 1412 }, { "epoch": 0.06427272727272727, "grad_norm": 7.03125, "grad_norm_var": 0.7398274739583334, "learning_rate": 0.0001, "loss": 6.7375, "loss/crossentropy": 2.4471907019615173, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.2415277138352394, "step": 1414 }, { "epoch": 0.06436363636363636, "grad_norm": 7.28125, "grad_norm_var": 0.31936442057291664, "learning_rate": 0.0001, "loss": 6.8651, "loss/crossentropy": 2.5159440636634827, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.24624311551451683, "step": 1416 }, { "epoch": 0.06445454545454546, "grad_norm": 7.53125, "grad_norm_var": 0.48162434895833334, "learning_rate": 0.0001, "loss": 7.2982, "loss/crossentropy": 2.675741672515869, "loss/hidden": 1.892578125, "loss/jsd": 0.0, "loss/logits": 0.2729852758347988, "step": 1418 }, { "epoch": 0.06454545454545454, "grad_norm": 7.21875, "grad_norm_var": 0.41321207682291666, "learning_rate": 0.0001, "loss": 6.7269, "loss/crossentropy": 2.5306320190429688, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.23329950869083405, "step": 1420 }, { "epoch": 0.06463636363636363, "grad_norm": 7.8125, "grad_norm_var": 0.3537068684895833, "learning_rate": 0.0001, "loss": 7.0549, "loss/crossentropy": 2.5926889777183533, "loss/hidden": 1.912109375, "loss/jsd": 0.0, "loss/logits": 0.25501344352960587, "step": 1422 }, { "epoch": 0.06472727272727273, "grad_norm": 7.28125, "grad_norm_var": 0.3, "learning_rate": 0.0001, "loss": 6.4306, "loss/crossentropy": 2.289490729570389, "loss/hidden": 1.857421875, "loss/jsd": 0.0, "loss/logits": 0.22836683690547943, "step": 1424 }, { "epoch": 0.06481818181818182, "grad_norm": 7.25, "grad_norm_var": 0.3151326497395833, "learning_rate": 0.0001, "loss": 6.7935, "loss/crossentropy": 2.515723943710327, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.24301636964082718, "step": 1426 }, { "epoch": 0.0649090909090909, "grad_norm": 7.6875, "grad_norm_var": 0.28489583333333335, "learning_rate": 0.0001, "loss": 7.0186, "loss/crossentropy": 2.6152376532554626, "loss/hidden": 1.923828125, "loss/jsd": 0.0, "loss/logits": 0.24795560911297798, "step": 1428 }, { "epoch": 0.065, "grad_norm": 7.9375, "grad_norm_var": 0.31569010416666665, "learning_rate": 0.0001, "loss": 6.923, "loss/crossentropy": 2.5338815450668335, "loss/hidden": 1.884765625, "loss/jsd": 0.0, "loss/logits": 0.25043222680687904, "step": 1430 }, { "epoch": 0.06509090909090909, "grad_norm": 6.8125, "grad_norm_var": 0.34934895833333335, "learning_rate": 0.0001, "loss": 6.5682, "loss/crossentropy": 2.361299365758896, "loss/hidden": 1.89453125, "loss/jsd": 0.0, "loss/logits": 0.2312367707490921, "step": 1432 }, { "epoch": 0.06518181818181819, "grad_norm": 6.875, "grad_norm_var": 0.21653645833333332, "learning_rate": 0.0001, "loss": 6.6673, "loss/crossentropy": 2.5243313312530518, "loss/hidden": 1.873046875, "loss/jsd": 0.0, "loss/logits": 0.22698874026536942, "step": 1434 }, { "epoch": 0.06527272727272727, "grad_norm": 9.25, "grad_norm_var": 0.41923421223958335, "learning_rate": 0.0001, "loss": 7.1873, "loss/crossentropy": 2.60907906293869, "loss/hidden": 1.892578125, "loss/jsd": 0.0, "loss/logits": 0.2685687467455864, "step": 1436 }, { "epoch": 0.06536363636363636, "grad_norm": 9.0, "grad_norm_var": 0.5580078125, "learning_rate": 0.0001, "loss": 7.461, "loss/crossentropy": 2.8055449724197388, "loss/hidden": 1.90625, "loss/jsd": 0.0, "loss/logits": 0.2749202139675617, "step": 1438 }, { "epoch": 0.06545454545454546, "grad_norm": 9.5625, "grad_norm_var": 0.74781494140625, "learning_rate": 0.0001, "loss": 7.2246, "loss/crossentropy": 2.69939649105072, "loss/hidden": 1.90625, "loss/jsd": 0.0, "loss/logits": 0.26189372316002846, "step": 1440 }, { "epoch": 0.06554545454545455, "grad_norm": 8.5, "grad_norm_var": 0.7002237955729167, "learning_rate": 0.0001, "loss": 7.1895, "loss/crossentropy": 2.672913074493408, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.2625926658511162, "step": 1442 }, { "epoch": 0.06563636363636363, "grad_norm": 7.90625, "grad_norm_var": 0.8353515625, "learning_rate": 0.0001, "loss": 6.7181, "loss/crossentropy": 2.479158490896225, "loss/hidden": 1.873046875, "loss/jsd": 0.0, "loss/logits": 0.23658467084169388, "step": 1444 }, { "epoch": 0.06572727272727273, "grad_norm": 7.1875, "grad_norm_var": 0.8524698893229167, "learning_rate": 0.0001, "loss": 6.8387, "loss/crossentropy": 2.5292187333106995, "loss/hidden": 1.90625, "loss/jsd": 0.0, "loss/logits": 0.2403181567788124, "step": 1446 }, { "epoch": 0.06581818181818182, "grad_norm": 8.375, "grad_norm_var": 0.7800089518229166, "learning_rate": 0.0001, "loss": 6.9854, "loss/crossentropy": 2.6136640906333923, "loss/hidden": 1.861328125, "loss/jsd": 0.0, "loss/logits": 0.25104276090860367, "step": 1448 }, { "epoch": 0.0659090909090909, "grad_norm": 8.0625, "grad_norm_var": 0.7548136393229167, "learning_rate": 0.0001, "loss": 6.8459, "loss/crossentropy": 2.545633912086487, "loss/hidden": 1.869140625, "loss/jsd": 0.0, "loss/logits": 0.2431112378835678, "step": 1450 }, { "epoch": 0.066, "grad_norm": 7.34375, "grad_norm_var": 0.794775390625, "learning_rate": 0.0001, "loss": 6.6902, "loss/crossentropy": 2.4649100303649902, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.23464320600032806, "step": 1452 }, { "epoch": 0.06609090909090909, "grad_norm": 6.875, "grad_norm_var": 0.7045857747395833, "learning_rate": 0.0001, "loss": 6.7587, "loss/crossentropy": 2.489704668521881, "loss/hidden": 1.865234375, "loss/jsd": 0.0, "loss/logits": 0.2403787188231945, "step": 1454 }, { "epoch": 0.06618181818181817, "grad_norm": 10.4375, "grad_norm_var": 0.9981730143229167, "learning_rate": 0.0001, "loss": 6.9835, "loss/crossentropy": 2.5466808676719666, "loss/hidden": 1.8828125, "loss/jsd": 0.0, "loss/logits": 0.25539863482117653, "step": 1456 }, { "epoch": 0.06627272727272727, "grad_norm": 7.0625, "grad_norm_var": 1.01510009765625, "learning_rate": 0.0001, "loss": 7.1481, "loss/crossentropy": 2.5897440910339355, "loss/hidden": 1.916015625, "loss/jsd": 0.0, "loss/logits": 0.26423129439353943, "step": 1458 }, { "epoch": 0.06636363636363636, "grad_norm": 7.1875, "grad_norm_var": 0.946337890625, "learning_rate": 0.0001, "loss": 7.0225, "loss/crossentropy": 2.541971504688263, "loss/hidden": 1.908203125, "loss/jsd": 0.0, "loss/logits": 0.2572375386953354, "step": 1460 }, { "epoch": 0.06645454545454546, "grad_norm": 8.375, "grad_norm_var": 1.0680338541666667, "learning_rate": 0.0001, "loss": 6.3014, "loss/crossentropy": 2.1714923679828644, "loss/hidden": 1.880859375, "loss/jsd": 0.0, "loss/logits": 0.22490673139691353, "step": 1462 }, { "epoch": 0.06654545454545455, "grad_norm": 7.53125, "grad_norm_var": 1.0236612955729167, "learning_rate": 0.0001, "loss": 7.2421, "loss/crossentropy": 2.736813247203827, "loss/hidden": 1.892578125, "loss/jsd": 0.0, "loss/logits": 0.26127511262893677, "step": 1464 }, { "epoch": 0.06663636363636363, "grad_norm": 8.1875, "grad_norm_var": 1.0266560872395833, "learning_rate": 0.0001, "loss": 6.899, "loss/crossentropy": 2.466466248035431, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.25419094413518906, "step": 1466 }, { "epoch": 0.06672727272727273, "grad_norm": 7.21875, "grad_norm_var": 0.93671875, "learning_rate": 0.0001, "loss": 6.7423, "loss/crossentropy": 2.5028017163276672, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.23917942866683006, "step": 1468 }, { "epoch": 0.06681818181818182, "grad_norm": 7.625, "grad_norm_var": 0.90367431640625, "learning_rate": 0.0001, "loss": 6.8196, "loss/crossentropy": 2.5423922538757324, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.24022432044148445, "step": 1470 }, { "epoch": 0.0669090909090909, "grad_norm": 8.8125, "grad_norm_var": 0.44427083333333334, "learning_rate": 0.0001, "loss": 7.2623, "loss/crossentropy": 2.733770430088043, "loss/hidden": 1.9140625, "loss/jsd": 0.0, "loss/logits": 0.26144368574023247, "step": 1472 }, { "epoch": 0.067, "grad_norm": 7.6875, "grad_norm_var": 0.337744140625, "learning_rate": 0.0001, "loss": 6.8873, "loss/crossentropy": 2.4893890619277954, "loss/hidden": 1.888671875, "loss/jsd": 0.0, "loss/logits": 0.25092754513025284, "step": 1474 }, { "epoch": 0.06709090909090909, "grad_norm": 7.28125, "grad_norm_var": 0.354541015625, "learning_rate": 0.0001, "loss": 6.9134, "loss/crossentropy": 2.595797836780548, "loss/hidden": 1.841796875, "loss/jsd": 0.0, "loss/logits": 0.24758381769061089, "step": 1476 }, { "epoch": 0.06718181818181818, "grad_norm": 7.03125, "grad_norm_var": 0.24256184895833333, "learning_rate": 0.0001, "loss": 7.0658, "loss/crossentropy": 2.699470102787018, "loss/hidden": 1.869140625, "loss/jsd": 0.0, "loss/logits": 0.24971921369433403, "step": 1478 }, { "epoch": 0.06727272727272728, "grad_norm": 9.0625, "grad_norm_var": 0.4162109375, "learning_rate": 0.0001, "loss": 6.9067, "loss/crossentropy": 2.535395562648773, "loss/hidden": 1.884765625, "loss/jsd": 0.0, "loss/logits": 0.24865521863102913, "step": 1480 }, { "epoch": 0.06736363636363636, "grad_norm": 7.75, "grad_norm_var": 0.3949055989583333, "learning_rate": 0.0001, "loss": 6.5743, "loss/crossentropy": 2.3474051356315613, "loss/hidden": 1.873046875, "loss/jsd": 0.0, "loss/logits": 0.23538202047348022, "step": 1482 }, { "epoch": 0.06745454545454546, "grad_norm": 7.875, "grad_norm_var": 0.38577067057291664, "learning_rate": 0.0001, "loss": 6.9793, "loss/crossentropy": 2.6185059547424316, "loss/hidden": 1.8671875, "loss/jsd": 0.0, "loss/logits": 0.24936328828334808, "step": 1484 }, { "epoch": 0.06754545454545455, "grad_norm": 7.0, "grad_norm_var": 0.401171875, "learning_rate": 0.0001, "loss": 6.5002, "loss/crossentropy": 2.3471375703811646, "loss/hidden": 1.84375, "loss/jsd": 0.0, "loss/logits": 0.23092833906412125, "step": 1486 }, { "epoch": 0.06763636363636363, "grad_norm": 7.4375, "grad_norm_var": 0.30139567057291666, "learning_rate": 0.0001, "loss": 6.7288, "loss/crossentropy": 2.4720747768878937, "loss/hidden": 1.873046875, "loss/jsd": 0.0, "loss/logits": 0.238368172198534, "step": 1488 }, { "epoch": 0.06772727272727273, "grad_norm": 7.25, "grad_norm_var": 0.30481363932291666, "learning_rate": 0.0001, "loss": 6.7146, "loss/crossentropy": 2.4383236169815063, "loss/hidden": 1.873046875, "loss/jsd": 0.0, "loss/logits": 0.24032779783010483, "step": 1490 }, { "epoch": 0.06781818181818182, "grad_norm": 6.53125, "grad_norm_var": 0.34553629557291665, "learning_rate": 0.0001, "loss": 6.4367, "loss/crossentropy": 2.281952679157257, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.22289547324180603, "step": 1492 }, { "epoch": 0.0679090909090909, "grad_norm": 7.4375, "grad_norm_var": 0.33007405598958334, "learning_rate": 0.0001, "loss": 6.6617, "loss/crossentropy": 2.3452593982219696, "loss/hidden": 1.8984375, "loss/jsd": 0.0, "loss/logits": 0.24179702997207642, "step": 1494 }, { "epoch": 0.068, "grad_norm": 7.40625, "grad_norm_var": 0.15377604166666667, "learning_rate": 0.0001, "loss": 7.1526, "loss/crossentropy": 2.6715307235717773, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.25787443667650223, "step": 1496 }, { "epoch": 0.06809090909090909, "grad_norm": 7.03125, "grad_norm_var": 0.14329427083333332, "learning_rate": 0.0001, "loss": 7.0637, "loss/crossentropy": 2.6338878870010376, "loss/hidden": 1.861328125, "loss/jsd": 0.0, "loss/logits": 0.25684987008571625, "step": 1498 }, { "epoch": 0.06818181818181818, "grad_norm": 7.3125, "grad_norm_var": 0.1466796875, "learning_rate": 0.0001, "loss": 7.2322, "loss/crossentropy": 2.7907318472862244, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.2562597244977951, "step": 1500 }, { "epoch": 0.06827272727272728, "grad_norm": 9.9375, "grad_norm_var": 0.5437459309895833, "learning_rate": 0.0001, "loss": 6.8863, "loss/crossentropy": 2.4150056540966034, "loss/hidden": 1.912109375, "loss/jsd": 0.0, "loss/logits": 0.25591933354735374, "step": 1502 }, { "epoch": 0.06836363636363636, "grad_norm": 7.375, "grad_norm_var": 0.5507649739583333, "learning_rate": 0.0001, "loss": 7.062, "loss/crossentropy": 2.57680082321167, "loss/hidden": 1.892578125, "loss/jsd": 0.0, "loss/logits": 0.2592657431960106, "step": 1504 }, { "epoch": 0.06845454545454545, "grad_norm": 7.28125, "grad_norm_var": 0.63668212890625, "learning_rate": 0.0001, "loss": 6.5493, "loss/crossentropy": 2.2138187885284424, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.2409692257642746, "step": 1506 }, { "epoch": 0.06854545454545455, "grad_norm": 7.90625, "grad_norm_var": 0.5208292643229167, "learning_rate": 0.0001, "loss": 7.3879, "loss/crossentropy": 2.825879991054535, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.26596643030643463, "step": 1508 }, { "epoch": 0.06863636363636363, "grad_norm": 7.5625, "grad_norm_var": 0.5111287434895834, "learning_rate": 0.0001, "loss": 7.0543, "loss/crossentropy": 2.609448790550232, "loss/hidden": 1.8828125, "loss/jsd": 0.0, "loss/logits": 0.25620361417531967, "step": 1510 }, { "epoch": 0.06872727272727273, "grad_norm": 7.5, "grad_norm_var": 0.5159993489583333, "learning_rate": 0.0001, "loss": 6.9299, "loss/crossentropy": 2.575264573097229, "loss/hidden": 1.896484375, "loss/jsd": 0.0, "loss/logits": 0.24581725150346756, "step": 1512 }, { "epoch": 0.06881818181818182, "grad_norm": 7.53125, "grad_norm_var": 0.5766927083333333, "learning_rate": 0.0001, "loss": 6.4317, "loss/crossentropy": 2.25967276096344, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.22931160032749176, "step": 1514 }, { "epoch": 0.0689090909090909, "grad_norm": 7.78125, "grad_norm_var": 0.5542317708333333, "learning_rate": 0.0001, "loss": 7.2111, "loss/crossentropy": 2.707684338092804, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.2624484710395336, "step": 1516 }, { "epoch": 0.069, "grad_norm": 7.625, "grad_norm_var": 0.2970052083333333, "learning_rate": 0.0001, "loss": 7.336, "loss/crossentropy": 2.7603180408477783, "loss/hidden": 1.859375, "loss/jsd": 0.0, "loss/logits": 0.27162686735391617, "step": 1518 }, { "epoch": 0.06909090909090909, "grad_norm": 7.21875, "grad_norm_var": 0.30735677083333335, "learning_rate": 0.0001, "loss": 7.0416, "loss/crossentropy": 2.6365042328834534, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.24949249625205994, "step": 1520 }, { "epoch": 0.06918181818181818, "grad_norm": 7.90625, "grad_norm_var": 0.20506184895833332, "learning_rate": 0.0001, "loss": 6.8027, "loss/crossentropy": 2.53031188249588, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.24247222393751144, "step": 1522 }, { "epoch": 0.06927272727272728, "grad_norm": 13.6875, "grad_norm_var": 2.46109619140625, "learning_rate": 0.0001, "loss": 7.3982, "loss/crossentropy": 2.806537389755249, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.2673695385456085, "step": 1524 }, { "epoch": 0.06936363636363636, "grad_norm": 7.21875, "grad_norm_var": 2.5054972330729166, "learning_rate": 0.0001, "loss": 7.0096, "loss/crossentropy": 2.6383551359176636, "loss/hidden": 1.869140625, "loss/jsd": 0.0, "loss/logits": 0.2502100020647049, "step": 1526 }, { "epoch": 0.06945454545454545, "grad_norm": 7.5, "grad_norm_var": 2.4981730143229166, "learning_rate": 0.0001, "loss": 6.8447, "loss/crossentropy": 2.5157331228256226, "loss/hidden": 1.880859375, "loss/jsd": 0.0, "loss/logits": 0.24480869248509407, "step": 1528 }, { "epoch": 0.06954545454545455, "grad_norm": 6.84375, "grad_norm_var": 2.52965087890625, "learning_rate": 0.0001, "loss": 6.5559, "loss/crossentropy": 2.4624500274658203, "loss/hidden": 1.833984375, "loss/jsd": 0.0, "loss/logits": 0.22595097869634628, "step": 1530 }, { "epoch": 0.06963636363636364, "grad_norm": 7.09375, "grad_norm_var": 2.691015625, "learning_rate": 0.0001, "loss": 6.9288, "loss/crossentropy": 2.6127743124961853, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.24293527379631996, "step": 1532 }, { "epoch": 0.06972727272727272, "grad_norm": 6.59375, "grad_norm_var": 2.76314697265625, "learning_rate": 0.0001, "loss": 6.5719, "loss/crossentropy": 2.4315163493156433, "loss/hidden": 1.833984375, "loss/jsd": 0.0, "loss/logits": 0.23063985258340836, "step": 1534 }, { "epoch": 0.06981818181818182, "grad_norm": 7.78125, "grad_norm_var": 2.76480712890625, "learning_rate": 0.0001, "loss": 7.5427, "loss/crossentropy": 2.9227262139320374, "loss/hidden": 1.912109375, "loss/jsd": 0.0, "loss/logits": 0.2707821726799011, "step": 1536 }, { "epoch": 0.0699090909090909, "grad_norm": 6.8125, "grad_norm_var": 2.8550130208333333, "learning_rate": 0.0001, "loss": 6.4482, "loss/crossentropy": 2.3686240911483765, "loss/hidden": 1.84375, "loss/jsd": 0.0, "loss/logits": 0.22358181327581406, "step": 1538 }, { "epoch": 0.07, "grad_norm": 8.8125, "grad_norm_var": 0.4181599934895833, "learning_rate": 0.0001, "loss": 7.0069, "loss/crossentropy": 2.590732991695404, "loss/hidden": 1.912109375, "loss/jsd": 0.0, "loss/logits": 0.25040291249752045, "step": 1540 }, { "epoch": 0.07009090909090909, "grad_norm": 7.4375, "grad_norm_var": 0.544140625, "learning_rate": 0.0001, "loss": 6.8588, "loss/crossentropy": 2.4408555924892426, "loss/hidden": 1.884765625, "loss/jsd": 0.0, "loss/logits": 0.2533219940960407, "step": 1542 }, { "epoch": 0.07018181818181818, "grad_norm": 6.90625, "grad_norm_var": 0.6686482747395833, "learning_rate": 0.0001, "loss": 6.7762, "loss/crossentropy": 2.5244152545928955, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.238847516477108, "step": 1544 }, { "epoch": 0.07027272727272728, "grad_norm": 8.125, "grad_norm_var": 0.8110514322916667, "learning_rate": 0.0001, "loss": 7.1728, "loss/crossentropy": 2.6816635131835938, "loss/hidden": 1.873046875, "loss/jsd": 0.0, "loss/logits": 0.26180846989154816, "step": 1546 }, { "epoch": 0.07036363636363636, "grad_norm": 7.125, "grad_norm_var": 0.76226806640625, "learning_rate": 0.0001, "loss": 6.2892, "loss/crossentropy": 2.1636123061180115, "loss/hidden": 1.8984375, "loss/jsd": 0.0, "loss/logits": 0.2227126695215702, "step": 1548 }, { "epoch": 0.07045454545454545, "grad_norm": 7.65625, "grad_norm_var": 0.703759765625, "learning_rate": 0.0001, "loss": 6.5766, "loss/crossentropy": 2.303733915090561, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.23705003410577774, "step": 1550 }, { "epoch": 0.07054545454545455, "grad_norm": 8.25, "grad_norm_var": 0.698681640625, "learning_rate": 0.0001, "loss": 6.8807, "loss/crossentropy": 2.6148383021354675, "loss/hidden": 1.869140625, "loss/jsd": 0.0, "loss/logits": 0.23967039212584496, "step": 1552 }, { "epoch": 0.07063636363636364, "grad_norm": 7.625, "grad_norm_var": 0.5867024739583333, "learning_rate": 0.0001, "loss": 7.0053, "loss/crossentropy": 2.6673110723495483, "loss/hidden": 1.869140625, "loss/jsd": 0.0, "loss/logits": 0.24688457697629929, "step": 1554 }, { "epoch": 0.07072727272727272, "grad_norm": 7.5625, "grad_norm_var": 0.58892822265625, "learning_rate": 0.0001, "loss": 6.644, "loss/crossentropy": 2.422544777393341, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.23308398574590683, "step": 1556 }, { "epoch": 0.07081818181818182, "grad_norm": 7.71875, "grad_norm_var": 0.5321614583333333, "learning_rate": 0.0001, "loss": 7.1146, "loss/crossentropy": 2.6794604063034058, "loss/hidden": 1.896484375, "loss/jsd": 0.0, "loss/logits": 0.2538612186908722, "step": 1558 }, { "epoch": 0.07090909090909091, "grad_norm": 7.625, "grad_norm_var": 0.4139322916666667, "learning_rate": 0.0001, "loss": 6.9783, "loss/crossentropy": 2.599258840084076, "loss/hidden": 1.87109375, "loss/jsd": 0.0, "loss/logits": 0.25079814344644547, "step": 1560 }, { "epoch": 0.071, "grad_norm": 7.625, "grad_norm_var": 0.19127604166666667, "learning_rate": 0.0001, "loss": 6.9434, "loss/crossentropy": 2.660662293434143, "loss/hidden": 1.8828125, "loss/jsd": 0.0, "loss/logits": 0.23999016359448433, "step": 1562 }, { "epoch": 0.0710909090909091, "grad_norm": 7.34375, "grad_norm_var": 0.15331624348958334, "learning_rate": 0.0001, "loss": 7.1112, "loss/crossentropy": 2.7306941151618958, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.25328245013952255, "step": 1564 }, { "epoch": 0.07118181818181818, "grad_norm": 6.90625, "grad_norm_var": 0.168994140625, "learning_rate": 0.0001, "loss": 7.1433, "loss/crossentropy": 2.7004584670066833, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.2563983052968979, "step": 1566 }, { "epoch": 0.07127272727272728, "grad_norm": 8.8125, "grad_norm_var": 0.24856770833333333, "learning_rate": 0.0001, "loss": 7.2088, "loss/crossentropy": 2.675542712211609, "loss/hidden": 1.921875, "loss/jsd": 0.0, "loss/logits": 0.26113374531269073, "step": 1568 }, { "epoch": 0.07136363636363637, "grad_norm": 7.96875, "grad_norm_var": 0.23697509765625, "learning_rate": 0.0001, "loss": 7.3665, "loss/crossentropy": 2.8957841992378235, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.2607404664158821, "step": 1570 }, { "epoch": 0.07145454545454545, "grad_norm": 6.625, "grad_norm_var": 0.2721964518229167, "learning_rate": 0.0001, "loss": 6.9308, "loss/crossentropy": 2.6478306651115417, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.2419702559709549, "step": 1572 }, { "epoch": 0.07154545454545455, "grad_norm": 7.4375, "grad_norm_var": 0.25178629557291665, "learning_rate": 0.0001, "loss": 7.2748, "loss/crossentropy": 2.8382381796836853, "loss/hidden": 1.876953125, "loss/jsd": 0.0, "loss/logits": 0.2559586763381958, "step": 1574 }, { "epoch": 0.07163636363636364, "grad_norm": 8.25, "grad_norm_var": 0.30663655598958334, "learning_rate": 0.0001, "loss": 6.9493, "loss/crossentropy": 2.6121036410331726, "loss/hidden": 1.830078125, "loss/jsd": 0.0, "loss/logits": 0.25071410089731216, "step": 1576 }, { "epoch": 0.07172727272727272, "grad_norm": 8.5625, "grad_norm_var": 0.36353759765625, "learning_rate": 0.0001, "loss": 7.3162, "loss/crossentropy": 2.83389675617218, "loss/hidden": 1.884765625, "loss/jsd": 0.0, "loss/logits": 0.25974903628230095, "step": 1578 }, { "epoch": 0.07181818181818182, "grad_norm": 7.40625, "grad_norm_var": 0.36584879557291666, "learning_rate": 0.0001, "loss": 6.4779, "loss/crossentropy": 2.332092583179474, "loss/hidden": 1.85546875, "loss/jsd": 0.0, "loss/logits": 0.22902904450893402, "step": 1580 }, { "epoch": 0.07190909090909091, "grad_norm": 6.4375, "grad_norm_var": 0.42678629557291664, "learning_rate": 0.0001, "loss": 6.1445, "loss/crossentropy": 2.1901482343673706, "loss/hidden": 1.818359375, "loss/jsd": 0.0, "loss/logits": 0.21359983086585999, "step": 1582 }, { "epoch": 0.072, "grad_norm": 7.8125, "grad_norm_var": 0.3419921875, "learning_rate": 0.0001, "loss": 6.7345, "loss/crossentropy": 2.4857473969459534, "loss/hidden": 1.818359375, "loss/jsd": 0.0, "loss/logits": 0.24303866922855377, "step": 1584 }, { "epoch": 0.0720909090909091, "grad_norm": 7.59375, "grad_norm_var": 0.33482666015625, "learning_rate": 0.0001, "loss": 6.8235, "loss/crossentropy": 2.5346600115299225, "loss/hidden": 1.849609375, "loss/jsd": 0.0, "loss/logits": 0.24392474070191383, "step": 1586 }, { "epoch": 0.07218181818181818, "grad_norm": 7.0625, "grad_norm_var": 0.28013916015625, "learning_rate": 0.0001, "loss": 7.1606, "loss/crossentropy": 2.7894734740257263, "loss/hidden": 1.830078125, "loss/jsd": 0.0, "loss/logits": 0.25410664826631546, "step": 1588 }, { "epoch": 0.07227272727272727, "grad_norm": 7.375, "grad_norm_var": 0.279150390625, "learning_rate": 0.0001, "loss": 6.6827, "loss/crossentropy": 2.4341554641723633, "loss/hidden": 1.873046875, "loss/jsd": 0.0, "loss/logits": 0.23754534870386124, "step": 1590 }, { "epoch": 0.07236363636363637, "grad_norm": 10.3125, "grad_norm_var": 0.7708292643229167, "learning_rate": 0.0001, "loss": 7.1791, "loss/crossentropy": 2.720008373260498, "loss/hidden": 1.904296875, "loss/jsd": 0.0, "loss/logits": 0.2554788812994957, "step": 1592 }, { "epoch": 0.07245454545454545, "grad_norm": 7.75, "grad_norm_var": 0.7141560872395833, "learning_rate": 0.0001, "loss": 7.2828, "loss/crossentropy": 2.8618932962417603, "loss/hidden": 1.859375, "loss/jsd": 0.0, "loss/logits": 0.25615669041872025, "step": 1594 }, { "epoch": 0.07254545454545455, "grad_norm": 7.78125, "grad_norm_var": 0.7159138997395833, "learning_rate": 0.0001, "loss": 7.2644, "loss/crossentropy": 2.747575581073761, "loss/hidden": 1.87109375, "loss/jsd": 0.0, "loss/logits": 0.26457472145557404, "step": 1596 }, { "epoch": 0.07263636363636364, "grad_norm": 8.5625, "grad_norm_var": 0.6630045572916666, "learning_rate": 0.0001, "loss": 7.0351, "loss/crossentropy": 2.6593527793884277, "loss/hidden": 1.876953125, "loss/jsd": 0.0, "loss/logits": 0.24987700954079628, "step": 1598 }, { "epoch": 0.07272727272727272, "grad_norm": 7.40625, "grad_norm_var": 0.6543904622395833, "learning_rate": 0.0001, "loss": 6.6829, "loss/crossentropy": 2.4581469893455505, "loss/hidden": 1.884765625, "loss/jsd": 0.0, "loss/logits": 0.2340012714266777, "step": 1600 }, { "epoch": 0.07281818181818182, "grad_norm": 8.3125, "grad_norm_var": 0.6345703125, "learning_rate": 0.0001, "loss": 6.9293, "loss/crossentropy": 2.6615978479385376, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.24160955846309662, "step": 1602 }, { "epoch": 0.07290909090909091, "grad_norm": 7.71875, "grad_norm_var": 0.6313761393229167, "learning_rate": 0.0001, "loss": 6.6719, "loss/crossentropy": 2.4078759849071503, "loss/hidden": 1.841796875, "loss/jsd": 0.0, "loss/logits": 0.24222470447421074, "step": 1604 }, { "epoch": 0.073, "grad_norm": 11.4375, "grad_norm_var": 1.44556884765625, "learning_rate": 0.0001, "loss": 6.9495, "loss/crossentropy": 2.444189190864563, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.2603014372289181, "step": 1606 }, { "epoch": 0.0730909090909091, "grad_norm": 8.125, "grad_norm_var": 1.06939697265625, "learning_rate": 0.0001, "loss": 6.8246, "loss/crossentropy": 2.541041672229767, "loss/hidden": 1.884765625, "loss/jsd": 0.0, "loss/logits": 0.23988081887364388, "step": 1608 }, { "epoch": 0.07318181818181818, "grad_norm": 11.0, "grad_norm_var": 1.6885050455729167, "learning_rate": 0.0001, "loss": 7.1949, "loss/crossentropy": 2.6087493300437927, "loss/hidden": 1.923828125, "loss/jsd": 0.0, "loss/logits": 0.2662286013364792, "step": 1610 }, { "epoch": 0.07327272727272727, "grad_norm": 7.65625, "grad_norm_var": 1.6841145833333333, "learning_rate": 0.0001, "loss": 7.3869, "loss/crossentropy": 2.87174791097641, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.263624906539917, "step": 1612 }, { "epoch": 0.07336363636363637, "grad_norm": 7.5, "grad_norm_var": 1.6862263997395834, "learning_rate": 0.0001, "loss": 6.8245, "loss/crossentropy": 2.5777947902679443, "loss/hidden": 1.869140625, "loss/jsd": 0.0, "loss/logits": 0.237760778516531, "step": 1614 }, { "epoch": 0.07345454545454545, "grad_norm": 7.53125, "grad_norm_var": 1.6751261393229167, "learning_rate": 0.0001, "loss": 7.0125, "loss/crossentropy": 2.6196248531341553, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.25140033662319183, "step": 1616 }, { "epoch": 0.07354545454545454, "grad_norm": 11.25, "grad_norm_var": 2.1798828125, "learning_rate": 0.0001, "loss": 7.058, "loss/crossentropy": 2.650411009788513, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.2505294010043144, "step": 1618 }, { "epoch": 0.07363636363636364, "grad_norm": 7.84375, "grad_norm_var": 2.12760009765625, "learning_rate": 0.0001, "loss": 6.78, "loss/crossentropy": 2.553132951259613, "loss/hidden": 1.841796875, "loss/jsd": 0.0, "loss/logits": 0.23850496485829353, "step": 1620 }, { "epoch": 0.07372727272727272, "grad_norm": 8.6875, "grad_norm_var": 1.5699178059895833, "learning_rate": 0.0001, "loss": 7.2981, "loss/crossentropy": 2.8623653650283813, "loss/hidden": 1.900390625, "loss/jsd": 0.0, "loss/logits": 0.25353358685970306, "step": 1622 }, { "epoch": 0.07381818181818182, "grad_norm": 8.1875, "grad_norm_var": 6.83121337890625, "learning_rate": 0.0001, "loss": 6.3109, "loss/crossentropy": 2.1133968234062195, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.22951976209878922, "step": 1624 }, { "epoch": 0.07390909090909091, "grad_norm": 7.6875, "grad_norm_var": 6.792952473958334, "learning_rate": 0.0001, "loss": 6.6896, "loss/crossentropy": 2.5015568137168884, "loss/hidden": 1.826171875, "loss/jsd": 0.0, "loss/logits": 0.2361832559108734, "step": 1626 }, { "epoch": 0.074, "grad_norm": 7.71875, "grad_norm_var": 6.995638020833334, "learning_rate": 0.0001, "loss": 6.4947, "loss/crossentropy": 2.3154841363430023, "loss/hidden": 1.87109375, "loss/jsd": 0.0, "loss/logits": 0.23080967366695404, "step": 1628 }, { "epoch": 0.0740909090909091, "grad_norm": 6.90625, "grad_norm_var": 7.180757649739584, "learning_rate": 0.0001, "loss": 6.2633, "loss/crossentropy": 2.1264026761054993, "loss/hidden": 1.869140625, "loss/jsd": 0.0, "loss/logits": 0.22677898406982422, "step": 1630 }, { "epoch": 0.07418181818181818, "grad_norm": 7.125, "grad_norm_var": 7.270210774739583, "learning_rate": 0.0001, "loss": 6.465, "loss/crossentropy": 2.4152557849884033, "loss/hidden": 1.845703125, "loss/jsd": 0.0, "loss/logits": 0.2204054482281208, "step": 1632 }, { "epoch": 0.07427272727272727, "grad_norm": 7.5, "grad_norm_var": 6.707405598958333, "learning_rate": 0.0001, "loss": 6.9938, "loss/crossentropy": 2.749488651752472, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.24044233933091164, "step": 1634 }, { "epoch": 0.07436363636363637, "grad_norm": 8.0625, "grad_norm_var": 6.678739420572916, "learning_rate": 0.0001, "loss": 6.7723, "loss/crossentropy": 2.530972182750702, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.23623905703425407, "step": 1636 }, { "epoch": 0.07445454545454545, "grad_norm": 7.46875, "grad_norm_var": 6.626822916666667, "learning_rate": 0.0001, "loss": 6.9872, "loss/crossentropy": 2.699167490005493, "loss/hidden": 1.841796875, "loss/jsd": 0.0, "loss/logits": 0.2446269914507866, "step": 1638 }, { "epoch": 0.07454545454545454, "grad_norm": 8.1875, "grad_norm_var": 0.513134765625, "learning_rate": 0.0001, "loss": 6.8853, "loss/crossentropy": 2.54313862323761, "loss/hidden": 1.853515625, "loss/jsd": 0.0, "loss/logits": 0.24886756390333176, "step": 1640 }, { "epoch": 0.07463636363636364, "grad_norm": 9.1875, "grad_norm_var": 1.10552978515625, "learning_rate": 0.0001, "loss": 6.9636, "loss/crossentropy": 2.52490571141243, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.2536342777311802, "step": 1642 }, { "epoch": 0.07472727272727273, "grad_norm": 9.0, "grad_norm_var": 1.0691243489583333, "learning_rate": 0.0001, "loss": 7.1463, "loss/crossentropy": 2.7912575006484985, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.2526901811361313, "step": 1644 }, { "epoch": 0.07481818181818181, "grad_norm": 7.28125, "grad_norm_var": 0.9999308268229167, "learning_rate": 0.0001, "loss": 6.4371, "loss/crossentropy": 2.297102451324463, "loss/hidden": 1.845703125, "loss/jsd": 0.0, "loss/logits": 0.22943313419818878, "step": 1646 }, { "epoch": 0.07490909090909091, "grad_norm": 8.8125, "grad_norm_var": 1.0933878580729166, "learning_rate": 0.0001, "loss": 6.2048, "loss/crossentropy": 2.147425264120102, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.21550292894244194, "step": 1648 }, { "epoch": 0.075, "grad_norm": 9.8125, "grad_norm_var": 1.2439412434895833, "learning_rate": 0.0001, "loss": 7.1231, "loss/crossentropy": 2.671559154987335, "loss/hidden": 1.873046875, "loss/jsd": 0.0, "loss/logits": 0.2578492611646652, "step": 1650 }, { "epoch": 0.0750909090909091, "grad_norm": 10.25, "grad_norm_var": 1.4515462239583334, "learning_rate": 0.0001, "loss": 6.938, "loss/crossentropy": 2.5494303703308105, "loss/hidden": 1.888671875, "loss/jsd": 0.0, "loss/logits": 0.24998994916677475, "step": 1652 }, { "epoch": 0.07518181818181818, "grad_norm": 7.53125, "grad_norm_var": 1.421875, "learning_rate": 0.0001, "loss": 7.0729, "loss/crossentropy": 2.5355628728866577, "loss/hidden": 1.87109375, "loss/jsd": 0.0, "loss/logits": 0.2666223794221878, "step": 1654 }, { "epoch": 0.07527272727272727, "grad_norm": 7.53125, "grad_norm_var": 1.3662760416666666, "learning_rate": 0.0001, "loss": 6.6783, "loss/crossentropy": 2.450907289981842, "loss/hidden": 1.857421875, "loss/jsd": 0.0, "loss/logits": 0.23699767515063286, "step": 1656 }, { "epoch": 0.07536363636363637, "grad_norm": 7.375, "grad_norm_var": 1.0501139322916666, "learning_rate": 0.0001, "loss": 6.9476, "loss/crossentropy": 2.5768070816993713, "loss/hidden": 1.884765625, "loss/jsd": 0.0, "loss/logits": 0.24860437959432602, "step": 1658 }, { "epoch": 0.07545454545454545, "grad_norm": 7.90625, "grad_norm_var": 0.9932576497395833, "learning_rate": 0.0001, "loss": 7.1839, "loss/crossentropy": 2.6630491614341736, "loss/hidden": 1.923828125, "loss/jsd": 0.0, "loss/logits": 0.25970544666051865, "step": 1660 }, { "epoch": 0.07554545454545454, "grad_norm": 6.65625, "grad_norm_var": 1.0710774739583333, "learning_rate": 0.0001, "loss": 6.6026, "loss/crossentropy": 2.4616634845733643, "loss/hidden": 1.859375, "loss/jsd": 0.0, "loss/logits": 0.22815901041030884, "step": 1662 }, { "epoch": 0.07563636363636364, "grad_norm": 7.875, "grad_norm_var": 0.8829060872395833, "learning_rate": 0.0001, "loss": 6.7282, "loss/crossentropy": 2.412792444229126, "loss/hidden": 1.8984375, "loss/jsd": 0.0, "loss/logits": 0.2417001985013485, "step": 1664 }, { "epoch": 0.07572727272727273, "grad_norm": 7.15625, "grad_norm_var": 0.7396443684895834, "learning_rate": 0.0001, "loss": 7.0068, "loss/crossentropy": 2.6026028394699097, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.2529170364141464, "step": 1666 }, { "epoch": 0.07581818181818181, "grad_norm": 7.28125, "grad_norm_var": 0.39568684895833334, "learning_rate": 0.0001, "loss": 6.3167, "loss/crossentropy": 2.285330832004547, "loss/hidden": 1.841796875, "loss/jsd": 0.0, "loss/logits": 0.21895631030201912, "step": 1668 }, { "epoch": 0.07590909090909091, "grad_norm": 7.09375, "grad_norm_var": 0.2916015625, "learning_rate": 0.0001, "loss": 6.8886, "loss/crossentropy": 2.6279414296150208, "loss/hidden": 1.849609375, "loss/jsd": 0.0, "loss/logits": 0.2411026991903782, "step": 1670 }, { "epoch": 0.076, "grad_norm": 8.5, "grad_norm_var": 0.372265625, "learning_rate": 0.0001, "loss": 6.8257, "loss/crossentropy": 2.5685067176818848, "loss/hidden": 1.837890625, "loss/jsd": 0.0, "loss/logits": 0.2419259510934353, "step": 1672 }, { "epoch": 0.07609090909090908, "grad_norm": 7.65625, "grad_norm_var": 0.38993733723958335, "learning_rate": 0.0001, "loss": 6.7451, "loss/crossentropy": 2.5199586153030396, "loss/hidden": 1.84375, "loss/jsd": 0.0, "loss/logits": 0.23814209178090096, "step": 1674 }, { "epoch": 0.07618181818181818, "grad_norm": 7.59375, "grad_norm_var": 0.3626953125, "learning_rate": 0.0001, "loss": 6.8813, "loss/crossentropy": 2.642638385295868, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.24027236551046371, "step": 1676 }, { "epoch": 0.07627272727272727, "grad_norm": 7.125, "grad_norm_var": 0.325244140625, "learning_rate": 0.0001, "loss": 6.9785, "loss/crossentropy": 2.626296639442444, "loss/hidden": 1.837890625, "loss/jsd": 0.0, "loss/logits": 0.251429732888937, "step": 1678 }, { "epoch": 0.07636363636363637, "grad_norm": 6.96875, "grad_norm_var": 0.29963785807291665, "learning_rate": 0.0001, "loss": 7.0794, "loss/crossentropy": 2.7618534564971924, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.24776997417211533, "step": 1680 }, { "epoch": 0.07645454545454546, "grad_norm": 7.46875, "grad_norm_var": 0.25924072265625, "learning_rate": 0.0001, "loss": 6.5256, "loss/crossentropy": 2.464977264404297, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.22442244365811348, "step": 1682 }, { "epoch": 0.07654545454545454, "grad_norm": 7.90625, "grad_norm_var": 0.3921712239583333, "learning_rate": 0.0001, "loss": 6.7936, "loss/crossentropy": 2.633847177028656, "loss/hidden": 1.783203125, "loss/jsd": 0.0, "loss/logits": 0.23765048384666443, "step": 1684 }, { "epoch": 0.07663636363636364, "grad_norm": 7.46875, "grad_norm_var": 0.362353515625, "learning_rate": 0.0001, "loss": 6.8662, "loss/crossentropy": 2.5090121030807495, "loss/hidden": 1.900390625, "loss/jsd": 0.0, "loss/logits": 0.24568375945091248, "step": 1686 }, { "epoch": 0.07672727272727273, "grad_norm": 7.875, "grad_norm_var": 0.36451822916666665, "learning_rate": 0.0001, "loss": 7.2262, "loss/crossentropy": 2.8023496866226196, "loss/hidden": 1.888671875, "loss/jsd": 0.0, "loss/logits": 0.2535206228494644, "step": 1688 }, { "epoch": 0.07681818181818181, "grad_norm": 9.625, "grad_norm_var": 1.4346964518229166, "learning_rate": 0.0001, "loss": 7.2334, "loss/crossentropy": 2.690771222114563, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.2691079080104828, "step": 1690 }, { "epoch": 0.07690909090909091, "grad_norm": 8.4375, "grad_norm_var": 1.6913370768229166, "learning_rate": 0.0001, "loss": 7.2982, "loss/crossentropy": 2.710816979408264, "loss/hidden": 1.94140625, "loss/jsd": 0.0, "loss/logits": 0.264601644128561, "step": 1692 }, { "epoch": 0.077, "grad_norm": 7.46875, "grad_norm_var": 1.669775390625, "learning_rate": 0.0001, "loss": 6.8583, "loss/crossentropy": 2.5497897267341614, "loss/hidden": 1.861328125, "loss/jsd": 0.0, "loss/logits": 0.24471817165613174, "step": 1694 }, { "epoch": 0.07709090909090908, "grad_norm": 8.0, "grad_norm_var": 2.8931925455729166, "learning_rate": 0.0001, "loss": 6.8235, "loss/crossentropy": 2.439873993396759, "loss/hidden": 1.896484375, "loss/jsd": 0.0, "loss/logits": 0.24871685728430748, "step": 1696 }, { "epoch": 0.07718181818181818, "grad_norm": 7.5625, "grad_norm_var": 2.7041015625, "learning_rate": 0.0001, "loss": 6.8099, "loss/crossentropy": 2.4587424397468567, "loss/hidden": 1.8828125, "loss/jsd": 0.0, "loss/logits": 0.24683381617069244, "step": 1698 }, { "epoch": 0.07727272727272727, "grad_norm": 9.5625, "grad_norm_var": 2.342476399739583, "learning_rate": 0.0001, "loss": 6.6914, "loss/crossentropy": 2.4138737320899963, "loss/hidden": 1.884765625, "loss/jsd": 0.0, "loss/logits": 0.23927268013358116, "step": 1700 }, { "epoch": 0.07736363636363637, "grad_norm": 7.4375, "grad_norm_var": 2.27066650390625, "learning_rate": 0.0001, "loss": 6.3169, "loss/crossentropy": 2.2237395644187927, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.22025851160287857, "step": 1702 }, { "epoch": 0.07745454545454546, "grad_norm": 11.25, "grad_norm_var": 2.59527587890625, "learning_rate": 0.0001, "loss": 7.5731, "loss/crossentropy": 2.9181792736053467, "loss/hidden": 1.8671875, "loss/jsd": 0.0, "loss/logits": 0.2787778750061989, "step": 1704 }, { "epoch": 0.07754545454545454, "grad_norm": 7.125, "grad_norm_var": 2.588212076822917, "learning_rate": 0.0001, "loss": 6.859, "loss/crossentropy": 2.68680876493454, "loss/hidden": 1.861328125, "loss/jsd": 0.0, "loss/logits": 0.23109132796525955, "step": 1706 }, { "epoch": 0.07763636363636364, "grad_norm": 6.75, "grad_norm_var": 2.678544108072917, "learning_rate": 0.0001, "loss": 6.6189, "loss/crossentropy": 2.3912535309791565, "loss/hidden": 1.845703125, "loss/jsd": 0.0, "loss/logits": 0.23819401487708092, "step": 1708 }, { "epoch": 0.07772727272727273, "grad_norm": 8.0625, "grad_norm_var": 2.6753743489583335, "learning_rate": 0.0001, "loss": 6.8971, "loss/crossentropy": 2.4760853946208954, "loss/hidden": 1.89453125, "loss/jsd": 0.0, "loss/logits": 0.25265225023031235, "step": 1710 }, { "epoch": 0.07781818181818181, "grad_norm": 7.5, "grad_norm_var": 1.3913899739583333, "learning_rate": 0.0001, "loss": 6.6369, "loss/crossentropy": 2.4537559747695923, "loss/hidden": 1.861328125, "loss/jsd": 0.0, "loss/logits": 0.23217959702014923, "step": 1712 }, { "epoch": 0.07790909090909091, "grad_norm": 7.0, "grad_norm_var": 1.2726399739583334, "learning_rate": 0.0001, "loss": 6.7968, "loss/crossentropy": 2.5932639241218567, "loss/hidden": 1.833984375, "loss/jsd": 0.0, "loss/logits": 0.23695803433656693, "step": 1714 }, { "epoch": 0.078, "grad_norm": 7.5, "grad_norm_var": 1.0644368489583333, "learning_rate": 0.0001, "loss": 7.1149, "loss/crossentropy": 2.7047353386878967, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.25702811405062675, "step": 1716 }, { "epoch": 0.07809090909090909, "grad_norm": 6.65625, "grad_norm_var": 1.1341796875, "learning_rate": 0.0001, "loss": 6.694, "loss/crossentropy": 2.3806830644607544, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.24265509843826294, "step": 1718 }, { "epoch": 0.07818181818181819, "grad_norm": 7.53125, "grad_norm_var": 0.21612955729166666, "learning_rate": 0.0001, "loss": 7.1553, "loss/crossentropy": 2.7085254192352295, "loss/hidden": 1.87109375, "loss/jsd": 0.0, "loss/logits": 0.2575667202472687, "step": 1720 }, { "epoch": 0.07827272727272727, "grad_norm": 8.9375, "grad_norm_var": 0.3870930989583333, "learning_rate": 0.0001, "loss": 6.6474, "loss/crossentropy": 2.4799830317497253, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.23470698297023773, "step": 1722 }, { "epoch": 0.07836363636363636, "grad_norm": 7.28125, "grad_norm_var": 0.29737955729166665, "learning_rate": 0.0001, "loss": 6.6584, "loss/crossentropy": 2.4955409169197083, "loss/hidden": 1.8671875, "loss/jsd": 0.0, "loss/logits": 0.2295696884393692, "step": 1724 }, { "epoch": 0.07845454545454546, "grad_norm": 8.5, "grad_norm_var": 0.3502888997395833, "learning_rate": 0.0001, "loss": 7.105, "loss/crossentropy": 2.7244325280189514, "loss/hidden": 1.857421875, "loss/jsd": 0.0, "loss/logits": 0.2523157373070717, "step": 1726 }, { "epoch": 0.07854545454545454, "grad_norm": 6.96875, "grad_norm_var": 0.4166015625, "learning_rate": 0.0001, "loss": 6.7896, "loss/crossentropy": 2.519038736820221, "loss/hidden": 1.853515625, "loss/jsd": 0.0, "loss/logits": 0.24170900508761406, "step": 1728 }, { "epoch": 0.07863636363636364, "grad_norm": 6.90625, "grad_norm_var": 0.41861572265625, "learning_rate": 0.0001, "loss": 6.7086, "loss/crossentropy": 2.502685546875, "loss/hidden": 1.830078125, "loss/jsd": 0.0, "loss/logits": 0.2375829704105854, "step": 1730 }, { "epoch": 0.07872727272727273, "grad_norm": 7.15625, "grad_norm_var": 0.4310831705729167, "learning_rate": 0.0001, "loss": 6.9012, "loss/crossentropy": 2.589648127555847, "loss/hidden": 1.869140625, "loss/jsd": 0.0, "loss/logits": 0.24423645436763763, "step": 1732 }, { "epoch": 0.07881818181818182, "grad_norm": 7.90625, "grad_norm_var": 0.39553629557291664, "learning_rate": 0.0001, "loss": 7.2179, "loss/crossentropy": 2.7893800735473633, "loss/hidden": 1.861328125, "loss/jsd": 0.0, "loss/logits": 0.25671692937612534, "step": 1734 }, { "epoch": 0.07890909090909091, "grad_norm": 6.9375, "grad_norm_var": 0.4306640625, "learning_rate": 0.0001, "loss": 7.1782, "loss/crossentropy": 2.744024395942688, "loss/hidden": 1.873046875, "loss/jsd": 0.0, "loss/logits": 0.25611041113734245, "step": 1736 }, { "epoch": 0.079, "grad_norm": 6.84375, "grad_norm_var": 0.33170166015625, "learning_rate": 0.0001, "loss": 6.4248, "loss/crossentropy": 2.3771746158599854, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.22273360937833786, "step": 1738 }, { "epoch": 0.07909090909090909, "grad_norm": 6.9375, "grad_norm_var": 0.36087239583333336, "learning_rate": 0.0001, "loss": 6.9895, "loss/crossentropy": 2.7351529598236084, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.24144663661718369, "step": 1740 }, { "epoch": 0.07918181818181819, "grad_norm": 6.96875, "grad_norm_var": 0.3238932291666667, "learning_rate": 0.0001, "loss": 6.754, "loss/crossentropy": 2.593050539493561, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.2352374531328678, "step": 1742 }, { "epoch": 0.07927272727272727, "grad_norm": 7.78125, "grad_norm_var": 0.2719034830729167, "learning_rate": 0.0001, "loss": 6.8061, "loss/crossentropy": 2.5595776438713074, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.23989097028970718, "step": 1744 }, { "epoch": 0.07936363636363636, "grad_norm": 7.03125, "grad_norm_var": 0.33544514973958334, "learning_rate": 0.0001, "loss": 6.8511, "loss/crossentropy": 2.6031071543693542, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.2400287203490734, "step": 1746 }, { "epoch": 0.07945454545454546, "grad_norm": 7.3125, "grad_norm_var": 0.33592122395833335, "learning_rate": 0.0001, "loss": 6.7166, "loss/crossentropy": 2.5219178199768066, "loss/hidden": 1.84375, "loss/jsd": 0.0, "loss/logits": 0.23509012907743454, "step": 1748 }, { "epoch": 0.07954545454545454, "grad_norm": 6.59375, "grad_norm_var": 0.29375, "learning_rate": 0.0001, "loss": 6.6411, "loss/crossentropy": 2.4645248651504517, "loss/hidden": 1.826171875, "loss/jsd": 0.0, "loss/logits": 0.23503892868757248, "step": 1750 }, { "epoch": 0.07963636363636363, "grad_norm": 8.125, "grad_norm_var": 0.33717447916666665, "learning_rate": 0.0001, "loss": 6.6138, "loss/crossentropy": 2.4561954736709595, "loss/hidden": 1.845703125, "loss/jsd": 0.0, "loss/logits": 0.23118843510746956, "step": 1752 }, { "epoch": 0.07972727272727273, "grad_norm": 7.21875, "grad_norm_var": 0.3124308268229167, "learning_rate": 0.0001, "loss": 6.7796, "loss/crossentropy": 2.578744113445282, "loss/hidden": 1.833984375, "loss/jsd": 0.0, "loss/logits": 0.23668743669986725, "step": 1754 }, { "epoch": 0.07981818181818182, "grad_norm": 7.90625, "grad_norm_var": 0.34254150390625, "learning_rate": 0.0001, "loss": 6.9967, "loss/crossentropy": 2.6556901931762695, "loss/hidden": 1.857421875, "loss/jsd": 0.0, "loss/logits": 0.2483621910214424, "step": 1756 }, { "epoch": 0.07990909090909092, "grad_norm": 7.25, "grad_norm_var": 0.29195556640625, "learning_rate": 0.0001, "loss": 6.8454, "loss/crossentropy": 2.551648199558258, "loss/hidden": 1.857421875, "loss/jsd": 0.0, "loss/logits": 0.24363691732287407, "step": 1758 }, { "epoch": 0.08, "grad_norm": 7.53125, "grad_norm_var": 0.28544514973958335, "learning_rate": 0.0001, "loss": 6.8159, "loss/crossentropy": 2.6017324924468994, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.2378222458064556, "step": 1760 }, { "epoch": 0.08009090909090909, "grad_norm": 6.65625, "grad_norm_var": 0.22342122395833333, "learning_rate": 0.0001, "loss": 6.7256, "loss/crossentropy": 2.6202617287635803, "loss/hidden": 1.837890625, "loss/jsd": 0.0, "loss/logits": 0.22674521803855896, "step": 1762 }, { "epoch": 0.08018181818181819, "grad_norm": 7.09375, "grad_norm_var": 0.24581705729166667, "learning_rate": 0.0001, "loss": 6.7522, "loss/crossentropy": 2.57942134141922, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.2344692088663578, "step": 1764 }, { "epoch": 0.08027272727272727, "grad_norm": 6.96875, "grad_norm_var": 0.23424072265625, "learning_rate": 0.0001, "loss": 6.6252, "loss/crossentropy": 2.5035269260406494, "loss/hidden": 1.833984375, "loss/jsd": 0.0, "loss/logits": 0.22877006977796555, "step": 1766 }, { "epoch": 0.08036363636363636, "grad_norm": 6.625, "grad_norm_var": 0.16875, "learning_rate": 0.0001, "loss": 6.4124, "loss/crossentropy": 2.3987413346767426, "loss/hidden": 1.791015625, "loss/jsd": 0.0, "loss/logits": 0.22225938737392426, "step": 1768 }, { "epoch": 0.08045454545454546, "grad_norm": 7.59375, "grad_norm_var": 0.18170166015625, "learning_rate": 0.0001, "loss": 7.0378, "loss/crossentropy": 2.6865763068199158, "loss/hidden": 1.861328125, "loss/jsd": 0.0, "loss/logits": 0.24899458140134811, "step": 1770 }, { "epoch": 0.08054545454545455, "grad_norm": 7.0, "grad_norm_var": 0.4620442708333333, "learning_rate": 0.0001, "loss": 7.0005, "loss/crossentropy": 2.6896318197250366, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.24632279947400093, "step": 1772 }, { "epoch": 0.08063636363636363, "grad_norm": 8.3125, "grad_norm_var": 0.5154256184895833, "learning_rate": 0.0001, "loss": 7.2316, "loss/crossentropy": 2.829870820045471, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.25501782447099686, "step": 1774 }, { "epoch": 0.08072727272727273, "grad_norm": 6.84375, "grad_norm_var": 0.5098917643229167, "learning_rate": 0.0001, "loss": 6.353, "loss/crossentropy": 2.1893248558044434, "loss/hidden": 1.85546875, "loss/jsd": 0.0, "loss/logits": 0.23082420229911804, "step": 1776 }, { "epoch": 0.08081818181818182, "grad_norm": 7.3125, "grad_norm_var": 0.48566080729166666, "learning_rate": 0.0001, "loss": 6.7472, "loss/crossentropy": 2.5265368223190308, "loss/hidden": 1.837890625, "loss/jsd": 0.0, "loss/logits": 0.23827573657035828, "step": 1778 }, { "epoch": 0.0809090909090909, "grad_norm": 6.3125, "grad_norm_var": 0.5333292643229167, "learning_rate": 0.0001, "loss": 6.1151, "loss/crossentropy": 2.1737406849861145, "loss/hidden": 1.822265625, "loss/jsd": 0.0, "loss/logits": 0.21190829202532768, "step": 1780 }, { "epoch": 0.081, "grad_norm": 7.84375, "grad_norm_var": 0.530078125, "learning_rate": 0.0001, "loss": 6.8851, "loss/crossentropy": 2.5884510278701782, "loss/hidden": 1.853515625, "loss/jsd": 0.0, "loss/logits": 0.24431560561060905, "step": 1782 }, { "epoch": 0.08109090909090909, "grad_norm": 7.25, "grad_norm_var": 0.5367024739583334, "learning_rate": 0.0001, "loss": 6.6905, "loss/crossentropy": 2.4561506509780884, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.23984266445040703, "step": 1784 }, { "epoch": 0.08118181818181819, "grad_norm": 7.53125, "grad_norm_var": 0.5302734375, "learning_rate": 0.0001, "loss": 7.2455, "loss/crossentropy": 2.8783578276634216, "loss/hidden": 1.859375, "loss/jsd": 0.0, "loss/logits": 0.25077391043305397, "step": 1786 }, { "epoch": 0.08127272727272727, "grad_norm": 6.8125, "grad_norm_var": 0.25552978515625, "learning_rate": 0.0001, "loss": 6.5074, "loss/crossentropy": 2.403487503528595, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.22913889214396477, "step": 1788 }, { "epoch": 0.08136363636363636, "grad_norm": 7.25, "grad_norm_var": 0.23782145182291667, "learning_rate": 0.0001, "loss": 6.8632, "loss/crossentropy": 2.5346306562423706, "loss/hidden": 1.857421875, "loss/jsd": 0.0, "loss/logits": 0.24711811542510986, "step": 1790 }, { "epoch": 0.08145454545454546, "grad_norm": 6.9375, "grad_norm_var": 0.23498942057291666, "learning_rate": 0.0001, "loss": 6.751, "loss/crossentropy": 2.5604066848754883, "loss/hidden": 1.814453125, "loss/jsd": 0.0, "loss/logits": 0.23761043697595596, "step": 1792 }, { "epoch": 0.08154545454545455, "grad_norm": 6.59375, "grad_norm_var": 0.25950520833333335, "learning_rate": 0.0001, "loss": 6.5896, "loss/crossentropy": 2.430568218231201, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.23426047340035439, "step": 1794 }, { "epoch": 0.08163636363636363, "grad_norm": 7.0625, "grad_norm_var": 0.24325764973958333, "learning_rate": 0.0001, "loss": 6.4883, "loss/crossentropy": 2.4597220420837402, "loss/hidden": 1.771484375, "loss/jsd": 0.0, "loss/logits": 0.22571437433362007, "step": 1796 }, { "epoch": 0.08172727272727273, "grad_norm": 7.875, "grad_norm_var": 0.5583333333333333, "learning_rate": 0.0001, "loss": 6.2649, "loss/crossentropy": 2.200390875339508, "loss/hidden": 1.85546875, "loss/jsd": 0.0, "loss/logits": 0.2209002934396267, "step": 1798 }, { "epoch": 0.08181818181818182, "grad_norm": 7.3125, "grad_norm_var": 0.5322550455729167, "learning_rate": 0.0001, "loss": 6.8447, "loss/crossentropy": 2.6542215943336487, "loss/hidden": 1.837890625, "loss/jsd": 0.0, "loss/logits": 0.23525648191571236, "step": 1800 }, { "epoch": 0.0819090909090909, "grad_norm": 7.03125, "grad_norm_var": 0.5313639322916667, "learning_rate": 0.0001, "loss": 6.7756, "loss/crossentropy": 2.5809763073921204, "loss/hidden": 1.841796875, "loss/jsd": 0.0, "loss/logits": 0.23528442531824112, "step": 1802 }, { "epoch": 0.082, "grad_norm": 7.78125, "grad_norm_var": 0.5123046875, "learning_rate": 0.0001, "loss": 6.9691, "loss/crossentropy": 2.7052100896835327, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.2416185364127159, "step": 1804 }, { "epoch": 0.08209090909090909, "grad_norm": 7.1875, "grad_norm_var": 0.48043212890625, "learning_rate": 0.0001, "loss": 6.5503, "loss/crossentropy": 2.4275742173194885, "loss/hidden": 1.810546875, "loss/jsd": 0.0, "loss/logits": 0.23121452704071999, "step": 1806 }, { "epoch": 0.08218181818181818, "grad_norm": 7.65625, "grad_norm_var": 0.49947916666666664, "learning_rate": 0.0001, "loss": 6.2292, "loss/crossentropy": 2.224739283323288, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.21997912600636482, "step": 1808 }, { "epoch": 0.08227272727272728, "grad_norm": 6.78125, "grad_norm_var": 0.46510416666666665, "learning_rate": 0.0001, "loss": 6.604, "loss/crossentropy": 2.4319871068000793, "loss/hidden": 1.822265625, "loss/jsd": 0.0, "loss/logits": 0.23497751727700233, "step": 1810 }, { "epoch": 0.08236363636363636, "grad_norm": 8.25, "grad_norm_var": 0.43668212890625, "learning_rate": 0.0001, "loss": 6.9317, "loss/crossentropy": 2.587947905063629, "loss/hidden": 1.857421875, "loss/jsd": 0.0, "loss/logits": 0.2486337423324585, "step": 1812 }, { "epoch": 0.08245454545454546, "grad_norm": 7.25, "grad_norm_var": 0.18865559895833334, "learning_rate": 0.0001, "loss": 6.3159, "loss/crossentropy": 2.274057686328888, "loss/hidden": 1.85546875, "loss/jsd": 0.0, "loss/logits": 0.21863626316189766, "step": 1814 }, { "epoch": 0.08254545454545455, "grad_norm": 6.5, "grad_norm_var": 0.22851155598958334, "learning_rate": 0.0001, "loss": 6.9337, "loss/crossentropy": 2.62152898311615, "loss/hidden": 1.833984375, "loss/jsd": 0.0, "loss/logits": 0.24782249704003334, "step": 1816 }, { "epoch": 0.08263636363636363, "grad_norm": 6.21875, "grad_norm_var": 0.3028645833333333, "learning_rate": 0.0001, "loss": 6.6366, "loss/crossentropy": 2.4882009029388428, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.2300722636282444, "step": 1818 }, { "epoch": 0.08272727272727273, "grad_norm": 6.53125, "grad_norm_var": 0.27395833333333336, "learning_rate": 0.0001, "loss": 6.2636, "loss/crossentropy": 2.275485008955002, "loss/hidden": 1.822265625, "loss/jsd": 0.0, "loss/logits": 0.2165808118879795, "step": 1820 }, { "epoch": 0.08281818181818182, "grad_norm": 6.5625, "grad_norm_var": 0.6070271809895833, "learning_rate": 0.0001, "loss": 7.1079, "loss/crossentropy": 2.712400197982788, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.2504880651831627, "step": 1822 }, { "epoch": 0.0829090909090909, "grad_norm": 10.4375, "grad_norm_var": 1.2379842122395834, "learning_rate": 0.0001, "loss": 6.9937, "loss/crossentropy": 2.6617042422294617, "loss/hidden": 1.865234375, "loss/jsd": 0.0, "loss/logits": 0.2466791272163391, "step": 1824 }, { "epoch": 0.083, "grad_norm": 9.9375, "grad_norm_var": 1.5926066080729167, "learning_rate": 0.0001, "loss": 6.9655, "loss/crossentropy": 2.673843801021576, "loss/hidden": 1.8828125, "loss/jsd": 0.0, "loss/logits": 0.24088496714830399, "step": 1826 }, { "epoch": 0.08309090909090909, "grad_norm": 7.125, "grad_norm_var": 1.5702433268229166, "learning_rate": 0.0001, "loss": 6.7954, "loss/crossentropy": 2.5501341819763184, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.24171824753284454, "step": 1828 }, { "epoch": 0.08318181818181818, "grad_norm": 6.59375, "grad_norm_var": 1.5923136393229167, "learning_rate": 0.0001, "loss": 6.9447, "loss/crossentropy": 2.6975836753845215, "loss/hidden": 1.798828125, "loss/jsd": 0.0, "loss/logits": 0.24482956528663635, "step": 1830 }, { "epoch": 0.08327272727272728, "grad_norm": 6.71875, "grad_norm_var": 1.61802978515625, "learning_rate": 0.0001, "loss": 6.501, "loss/crossentropy": 2.4281011819839478, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.2248641662299633, "step": 1832 }, { "epoch": 0.08336363636363636, "grad_norm": 6.34375, "grad_norm_var": 1.6444661458333334, "learning_rate": 0.0001, "loss": 6.3954, "loss/crossentropy": 2.2675143778324127, "loss/hidden": 1.865234375, "loss/jsd": 0.0, "loss/logits": 0.2262681983411312, "step": 1834 }, { "epoch": 0.08345454545454545, "grad_norm": 7.34375, "grad_norm_var": 1.594384765625, "learning_rate": 0.0001, "loss": 6.643, "loss/crossentropy": 2.4178337454795837, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.23892372474074364, "step": 1836 }, { "epoch": 0.08354545454545455, "grad_norm": 7.1875, "grad_norm_var": 1.3265584309895833, "learning_rate": 0.0001, "loss": 7.2708, "loss/crossentropy": 2.849451720714569, "loss/hidden": 1.841796875, "loss/jsd": 0.0, "loss/logits": 0.25795839726924896, "step": 1838 }, { "epoch": 0.08363636363636363, "grad_norm": 8.375, "grad_norm_var": 0.7861979166666667, "learning_rate": 0.0001, "loss": 7.1188, "loss/crossentropy": 2.646511673927307, "loss/hidden": 1.859375, "loss/jsd": 0.0, "loss/logits": 0.26129525527358055, "step": 1840 }, { "epoch": 0.08372727272727273, "grad_norm": 6.875, "grad_norm_var": 0.27862955729166666, "learning_rate": 0.0001, "loss": 6.7451, "loss/crossentropy": 2.577880322933197, "loss/hidden": 1.818359375, "loss/jsd": 0.0, "loss/logits": 0.23488979414105415, "step": 1842 }, { "epoch": 0.08381818181818182, "grad_norm": 6.5, "grad_norm_var": 0.30220947265625, "learning_rate": 0.0001, "loss": 6.5185, "loss/crossentropy": 2.399432599544525, "loss/hidden": 1.826171875, "loss/jsd": 0.0, "loss/logits": 0.22928844392299652, "step": 1844 }, { "epoch": 0.0839090909090909, "grad_norm": 9.3125, "grad_norm_var": 0.6288899739583333, "learning_rate": 0.0001, "loss": 6.6537, "loss/crossentropy": 2.5166958570480347, "loss/hidden": 1.806640625, "loss/jsd": 0.0, "loss/logits": 0.23303226754069328, "step": 1846 }, { "epoch": 0.084, "grad_norm": 7.65625, "grad_norm_var": 0.6116170247395833, "learning_rate": 0.0001, "loss": 6.456, "loss/crossentropy": 2.4283064007759094, "loss/hidden": 1.830078125, "loss/jsd": 0.0, "loss/logits": 0.21976109221577644, "step": 1848 }, { "epoch": 0.08409090909090909, "grad_norm": 6.71875, "grad_norm_var": 0.547509765625, "learning_rate": 0.0001, "loss": 6.704, "loss/crossentropy": 2.5783458948135376, "loss/hidden": 1.814453125, "loss/jsd": 0.0, "loss/logits": 0.23111592233181, "step": 1850 }, { "epoch": 0.08418181818181818, "grad_norm": 6.71875, "grad_norm_var": 0.5895833333333333, "learning_rate": 0.0001, "loss": 6.6461, "loss/crossentropy": 2.4720367789268494, "loss/hidden": 1.853515625, "loss/jsd": 0.0, "loss/logits": 0.2320513054728508, "step": 1852 }, { "epoch": 0.08427272727272728, "grad_norm": 7.34375, "grad_norm_var": 0.58804931640625, "learning_rate": 0.0001, "loss": 7.2191, "loss/crossentropy": 2.7939889430999756, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.2577446959912777, "step": 1854 }, { "epoch": 0.08436363636363636, "grad_norm": 7.0625, "grad_norm_var": 0.46737874348958336, "learning_rate": 0.0001, "loss": 6.5604, "loss/crossentropy": 2.445186138153076, "loss/hidden": 1.841796875, "loss/jsd": 0.0, "loss/logits": 0.22734183818101883, "step": 1856 }, { "epoch": 0.08445454545454545, "grad_norm": 7.0, "grad_norm_var": 0.5382771809895833, "learning_rate": 0.0001, "loss": 6.8872, "loss/crossentropy": 2.624586582183838, "loss/hidden": 1.853515625, "loss/jsd": 0.0, "loss/logits": 0.24090798571705818, "step": 1858 }, { "epoch": 0.08454545454545455, "grad_norm": 7.78125, "grad_norm_var": 0.5469034830729167, "learning_rate": 0.0001, "loss": 6.4844, "loss/crossentropy": 2.3496578335762024, "loss/hidden": 1.814453125, "loss/jsd": 0.0, "loss/logits": 0.23202476277947426, "step": 1860 }, { "epoch": 0.08463636363636363, "grad_norm": 7.1875, "grad_norm_var": 0.24130452473958333, "learning_rate": 0.0001, "loss": 7.1915, "loss/crossentropy": 2.8135461807250977, "loss/hidden": 1.818359375, "loss/jsd": 0.0, "loss/logits": 0.25595611333847046, "step": 1862 }, { "epoch": 0.08472727272727272, "grad_norm": 6.90625, "grad_norm_var": 0.22655843098958334, "learning_rate": 0.0001, "loss": 6.6345, "loss/crossentropy": 2.49606192111969, "loss/hidden": 1.826171875, "loss/jsd": 0.0, "loss/logits": 0.23122742772102356, "step": 1864 }, { "epoch": 0.08481818181818182, "grad_norm": 7.40625, "grad_norm_var": 0.21998697916666668, "learning_rate": 0.0001, "loss": 6.7618, "loss/crossentropy": 2.5801833271980286, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.23847507312893867, "step": 1866 }, { "epoch": 0.0849090909090909, "grad_norm": 7.28125, "grad_norm_var": 0.19218343098958332, "learning_rate": 0.0001, "loss": 6.8397, "loss/crossentropy": 2.6182567477226257, "loss/hidden": 1.822265625, "loss/jsd": 0.0, "loss/logits": 0.23991582542657852, "step": 1868 }, { "epoch": 0.085, "grad_norm": 7.53125, "grad_norm_var": 0.18192952473958332, "learning_rate": 0.0001, "loss": 6.7819, "loss/crossentropy": 2.585779368877411, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.23484187945723534, "step": 1870 }, { "epoch": 0.08509090909090909, "grad_norm": 7.03125, "grad_norm_var": 0.17355143229166667, "learning_rate": 0.0001, "loss": 6.7042, "loss/crossentropy": 2.521089732646942, "loss/hidden": 1.822265625, "loss/jsd": 0.0, "loss/logits": 0.23608194291591644, "step": 1872 }, { "epoch": 0.08518181818181818, "grad_norm": 7.90625, "grad_norm_var": 0.13964436848958334, "learning_rate": 0.0001, "loss": 6.863, "loss/crossentropy": 2.6070526838302612, "loss/hidden": 1.822265625, "loss/jsd": 0.0, "loss/logits": 0.24336443096399307, "step": 1874 }, { "epoch": 0.08527272727272728, "grad_norm": 7.40625, "grad_norm_var": 0.10115559895833333, "learning_rate": 0.0001, "loss": 6.6737, "loss/crossentropy": 2.505201280117035, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.23326070234179497, "step": 1876 }, { "epoch": 0.08536363636363636, "grad_norm": 7.46875, "grad_norm_var": 0.07928059895833334, "learning_rate": 0.0001, "loss": 7.0919, "loss/crossentropy": 2.729393720626831, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.2565630227327347, "step": 1878 }, { "epoch": 0.08545454545454545, "grad_norm": 6.8125, "grad_norm_var": 0.08201497395833333, "learning_rate": 0.0001, "loss": 6.8057, "loss/crossentropy": 2.659099519252777, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.23379917070269585, "step": 1880 }, { "epoch": 0.08554545454545455, "grad_norm": 6.71875, "grad_norm_var": 0.10849202473958333, "learning_rate": 0.0001, "loss": 6.48, "loss/crossentropy": 2.465826392173767, "loss/hidden": 1.779296875, "loss/jsd": 0.0, "loss/logits": 0.22348376363515854, "step": 1882 }, { "epoch": 0.08563636363636364, "grad_norm": 7.09375, "grad_norm_var": 0.11317952473958333, "learning_rate": 0.0001, "loss": 6.4054, "loss/crossentropy": 2.3234405517578125, "loss/hidden": 1.806640625, "loss/jsd": 0.0, "loss/logits": 0.22753292694687843, "step": 1884 }, { "epoch": 0.08572727272727272, "grad_norm": 6.4375, "grad_norm_var": 0.14263916015625, "learning_rate": 0.0001, "loss": 6.2316, "loss/crossentropy": 2.19297593832016, "loss/hidden": 1.84375, "loss/jsd": 0.0, "loss/logits": 0.219484094530344, "step": 1886 }, { "epoch": 0.08581818181818182, "grad_norm": 7.1875, "grad_norm_var": 0.14468994140625, "learning_rate": 0.0001, "loss": 6.4059, "loss/crossentropy": 2.416292667388916, "loss/hidden": 1.802734375, "loss/jsd": 0.0, "loss/logits": 0.2186913602054119, "step": 1888 }, { "epoch": 0.08590909090909091, "grad_norm": 7.4375, "grad_norm_var": 0.11623942057291667, "learning_rate": 0.0001, "loss": 7.0472, "loss/crossentropy": 2.769022524356842, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.24539456143975258, "step": 1890 }, { "epoch": 0.086, "grad_norm": 6.9375, "grad_norm_var": 0.11822916666666666, "learning_rate": 0.0001, "loss": 6.546, "loss/crossentropy": 2.4132516980171204, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.22851350530982018, "step": 1892 }, { "epoch": 0.08609090909090909, "grad_norm": 7.0625, "grad_norm_var": 0.10162760416666666, "learning_rate": 0.0001, "loss": 6.5138, "loss/crossentropy": 2.416671335697174, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.22729145735502243, "step": 1894 }, { "epoch": 0.08618181818181818, "grad_norm": 6.78125, "grad_norm_var": 0.11027018229166667, "learning_rate": 0.0001, "loss": 5.8265, "loss/crossentropy": 2.021403878927231, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.1988644376397133, "step": 1896 }, { "epoch": 0.08627272727272728, "grad_norm": 7.96875, "grad_norm_var": 0.17662760416666667, "learning_rate": 0.0001, "loss": 6.9194, "loss/crossentropy": 2.625897526741028, "loss/hidden": 1.833984375, "loss/jsd": 0.0, "loss/logits": 0.24594946950674057, "step": 1898 }, { "epoch": 0.08636363636363636, "grad_norm": 7.5625, "grad_norm_var": 0.18411051432291667, "learning_rate": 0.0001, "loss": 6.303, "loss/crossentropy": 2.2932853996753693, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.22049856558442116, "step": 1900 }, { "epoch": 0.08645454545454545, "grad_norm": 6.59375, "grad_norm_var": 0.18332926432291666, "learning_rate": 0.0001, "loss": 6.9465, "loss/crossentropy": 2.759499728679657, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.23510509729385376, "step": 1902 }, { "epoch": 0.08654545454545455, "grad_norm": 6.6875, "grad_norm_var": 0.19293212890625, "learning_rate": 0.0001, "loss": 6.5584, "loss/crossentropy": 2.4784180223941803, "loss/hidden": 1.841796875, "loss/jsd": 0.0, "loss/logits": 0.22382117807865143, "step": 1904 }, { "epoch": 0.08663636363636364, "grad_norm": 7.4375, "grad_norm_var": 0.18004150390625, "learning_rate": 0.0001, "loss": 6.5252, "loss/crossentropy": 2.397749960422516, "loss/hidden": 1.837890625, "loss/jsd": 0.0, "loss/logits": 0.22895295545458794, "step": 1906 }, { "epoch": 0.08672727272727272, "grad_norm": 6.84375, "grad_norm_var": 0.22154541015625, "learning_rate": 0.0001, "loss": 6.5368, "loss/crossentropy": 2.550812840461731, "loss/hidden": 1.783203125, "loss/jsd": 0.0, "loss/logits": 0.22027435898780823, "step": 1908 }, { "epoch": 0.08681818181818182, "grad_norm": 8.125, "grad_norm_var": 0.31575113932291665, "learning_rate": 0.0001, "loss": 6.3522, "loss/crossentropy": 2.2117621302604675, "loss/hidden": 1.912109375, "loss/jsd": 0.0, "loss/logits": 0.22283722832798958, "step": 1910 }, { "epoch": 0.08690909090909091, "grad_norm": 7.21875, "grad_norm_var": 0.31210530598958336, "learning_rate": 0.0001, "loss": 6.8043, "loss/crossentropy": 2.5939536094665527, "loss/hidden": 1.853515625, "loss/jsd": 0.0, "loss/logits": 0.23568161576986313, "step": 1912 }, { "epoch": 0.087, "grad_norm": 9.125, "grad_norm_var": 1.0426432291666667, "learning_rate": 0.0001, "loss": 6.8127, "loss/crossentropy": 2.512093424797058, "loss/hidden": 1.83203125, "loss/jsd": 0.0, "loss/logits": 0.24686157703399658, "step": 1914 }, { "epoch": 0.0870909090909091, "grad_norm": 6.84375, "grad_norm_var": 1.0457967122395833, "learning_rate": 0.0001, "loss": 6.6585, "loss/crossentropy": 2.565581738948822, "loss/hidden": 1.775390625, "loss/jsd": 0.0, "loss/logits": 0.23175561055541039, "step": 1916 }, { "epoch": 0.08718181818181818, "grad_norm": 7.34375, "grad_norm_var": 0.9909993489583333, "learning_rate": 0.0001, "loss": 6.9316, "loss/crossentropy": 2.664746105670929, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.2450474351644516, "step": 1918 }, { "epoch": 0.08727272727272728, "grad_norm": 14.0, "grad_norm_var": 3.61285400390625, "learning_rate": 0.0001, "loss": 7.1397, "loss/crossentropy": 2.635696291923523, "loss/hidden": 1.849609375, "loss/jsd": 0.0, "loss/logits": 0.26544202491641045, "step": 1920 }, { "epoch": 0.08736363636363637, "grad_norm": 7.375, "grad_norm_var": 3.58082275390625, "learning_rate": 0.0001, "loss": 6.5916, "loss/crossentropy": 2.455788493156433, "loss/hidden": 1.810546875, "loss/jsd": 0.0, "loss/logits": 0.232529666274786, "step": 1922 }, { "epoch": 0.08745454545454545, "grad_norm": 8.125, "grad_norm_var": 3.3742472330729165, "learning_rate": 0.0001, "loss": 6.6729, "loss/crossentropy": 2.5104852318763733, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.23420657590031624, "step": 1924 }, { "epoch": 0.08754545454545455, "grad_norm": 7.5, "grad_norm_var": 3.26441650390625, "learning_rate": 0.0001, "loss": 6.8218, "loss/crossentropy": 2.5784717798233032, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.24191565811634064, "step": 1926 }, { "epoch": 0.08763636363636364, "grad_norm": 6.90625, "grad_norm_var": 3.299149576822917, "learning_rate": 0.0001, "loss": 6.758, "loss/crossentropy": 2.5808131098747253, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.23764004558324814, "step": 1928 }, { "epoch": 0.08772727272727272, "grad_norm": 7.25, "grad_norm_var": 3.048661295572917, "learning_rate": 0.0001, "loss": 6.536, "loss/crossentropy": 2.506673038005829, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.22519706934690475, "step": 1930 }, { "epoch": 0.08781818181818182, "grad_norm": 7.59375, "grad_norm_var": 2.972489420572917, "learning_rate": 0.0001, "loss": 6.9204, "loss/crossentropy": 2.632648527622223, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.2459583468735218, "step": 1932 }, { "epoch": 0.08790909090909091, "grad_norm": 7.75, "grad_norm_var": 2.9680826822916666, "learning_rate": 0.0001, "loss": 6.799, "loss/crossentropy": 2.5793023109436035, "loss/hidden": 1.826171875, "loss/jsd": 0.0, "loss/logits": 0.23935018479824066, "step": 1934 }, { "epoch": 0.088, "grad_norm": 6.65625, "grad_norm_var": 0.3058553059895833, "learning_rate": 0.0001, "loss": 6.4657, "loss/crossentropy": 2.4727900326251984, "loss/hidden": 1.810546875, "loss/jsd": 0.0, "loss/logits": 0.21824008971452713, "step": 1936 }, { "epoch": 0.0880909090909091, "grad_norm": 6.9375, "grad_norm_var": 0.34664306640625, "learning_rate": 0.0001, "loss": 6.6498, "loss/crossentropy": 2.516536921262741, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.23246772959828377, "step": 1938 }, { "epoch": 0.08818181818181818, "grad_norm": 6.875, "grad_norm_var": 0.2928995768229167, "learning_rate": 0.0001, "loss": 6.5862, "loss/crossentropy": 2.5277639031410217, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.2285030037164688, "step": 1940 }, { "epoch": 0.08827272727272727, "grad_norm": 6.3125, "grad_norm_var": 0.27405192057291666, "learning_rate": 0.0001, "loss": 6.3599, "loss/crossentropy": 2.438523232936859, "loss/hidden": 1.810546875, "loss/jsd": 0.0, "loss/logits": 0.21108761057257652, "step": 1942 }, { "epoch": 0.08836363636363637, "grad_norm": 6.875, "grad_norm_var": 0.6462890625, "learning_rate": 0.0001, "loss": 6.7727, "loss/crossentropy": 2.6130451560020447, "loss/hidden": 1.822265625, "loss/jsd": 0.0, "loss/logits": 0.23374062031507492, "step": 1944 }, { "epoch": 0.08845454545454545, "grad_norm": 8.25, "grad_norm_var": 1.0860514322916666, "learning_rate": 0.0001, "loss": 6.7496, "loss/crossentropy": 2.523411273956299, "loss/hidden": 1.83203125, "loss/jsd": 0.0, "loss/logits": 0.23941748216748238, "step": 1946 }, { "epoch": 0.08854545454545455, "grad_norm": 7.25, "grad_norm_var": 1.06656494140625, "learning_rate": 0.0001, "loss": 7.029, "loss/crossentropy": 2.793712556362152, "loss/hidden": 1.759765625, "loss/jsd": 0.0, "loss/logits": 0.24755492433905602, "step": 1948 }, { "epoch": 0.08863636363636364, "grad_norm": 7.0, "grad_norm_var": 1.0564453125, "learning_rate": 0.0001, "loss": 6.5819, "loss/crossentropy": 2.4561043977737427, "loss/hidden": 1.826171875, "loss/jsd": 0.0, "loss/logits": 0.22995807975530624, "step": 1950 }, { "epoch": 0.08872727272727272, "grad_norm": 12.3125, "grad_norm_var": 2.511063639322917, "learning_rate": 0.0001, "loss": 7.03, "loss/crossentropy": 2.635041296482086, "loss/hidden": 1.857421875, "loss/jsd": 0.0, "loss/logits": 0.2537502460181713, "step": 1952 }, { "epoch": 0.08881818181818182, "grad_norm": 8.0, "grad_norm_var": 2.3677734375, "learning_rate": 0.0001, "loss": 6.8001, "loss/crossentropy": 2.5431306660175323, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.24366723746061325, "step": 1954 }, { "epoch": 0.08890909090909091, "grad_norm": 9.25, "grad_norm_var": 2.4597493489583333, "learning_rate": 0.0001, "loss": 6.3352, "loss/crossentropy": 2.421297162771225, "loss/hidden": 1.810546875, "loss/jsd": 0.0, "loss/logits": 0.21033195033669472, "step": 1956 }, { "epoch": 0.089, "grad_norm": 6.53125, "grad_norm_var": 2.48619384765625, "learning_rate": 0.0001, "loss": 6.1973, "loss/crossentropy": 2.272256016731262, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.21008434519171715, "step": 1958 }, { "epoch": 0.0890909090909091, "grad_norm": 6.9375, "grad_norm_var": 2.361832682291667, "learning_rate": 0.0001, "loss": 6.8525, "loss/crossentropy": 2.595970332622528, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.241666492074728, "step": 1960 }, { "epoch": 0.08918181818181818, "grad_norm": 7.21875, "grad_norm_var": 2.06246337890625, "learning_rate": 0.0001, "loss": 6.7049, "loss/crossentropy": 2.5051854252815247, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.23755338788032532, "step": 1962 }, { "epoch": 0.08927272727272727, "grad_norm": 7.0, "grad_norm_var": 2.1494791666666666, "learning_rate": 0.0001, "loss": 7.1041, "loss/crossentropy": 2.672653615474701, "loss/hidden": 1.859375, "loss/jsd": 0.0, "loss/logits": 0.2572050541639328, "step": 1964 }, { "epoch": 0.08936363636363637, "grad_norm": 7.28125, "grad_norm_var": 2.0965983072916665, "learning_rate": 0.0001, "loss": 6.7578, "loss/crossentropy": 2.544907569885254, "loss/hidden": 1.873046875, "loss/jsd": 0.0, "loss/logits": 0.23398613184690475, "step": 1966 }, { "epoch": 0.08945454545454545, "grad_norm": 7.09375, "grad_norm_var": 0.7023274739583333, "learning_rate": 0.0001, "loss": 6.2503, "loss/crossentropy": 2.2844992578029633, "loss/hidden": 1.794921875, "loss/jsd": 0.0, "loss/logits": 0.21708396822214127, "step": 1968 }, { "epoch": 0.08954545454545454, "grad_norm": 6.8125, "grad_norm_var": 0.690869140625, "learning_rate": 0.0001, "loss": 6.6676, "loss/crossentropy": 2.497009664773941, "loss/hidden": 1.822265625, "loss/jsd": 0.0, "loss/logits": 0.23483608290553093, "step": 1970 }, { "epoch": 0.08963636363636364, "grad_norm": 6.34375, "grad_norm_var": 0.4052042643229167, "learning_rate": 0.0001, "loss": 6.4819, "loss/crossentropy": 2.45990389585495, "loss/hidden": 1.783203125, "loss/jsd": 0.0, "loss/logits": 0.2238803580403328, "step": 1972 }, { "epoch": 0.08972727272727272, "grad_norm": 7.65625, "grad_norm_var": 0.424072265625, "learning_rate": 0.0001, "loss": 7.0289, "loss/crossentropy": 2.8154333233833313, "loss/hidden": 1.806640625, "loss/jsd": 0.0, "loss/logits": 0.24068227037787437, "step": 1974 }, { "epoch": 0.08981818181818182, "grad_norm": 6.09375, "grad_norm_var": 0.50670166015625, "learning_rate": 0.0001, "loss": 6.1155, "loss/crossentropy": 2.2867461144924164, "loss/hidden": 1.771484375, "loss/jsd": 0.0, "loss/logits": 0.20572412759065628, "step": 1976 }, { "epoch": 0.08990909090909091, "grad_norm": 5.84375, "grad_norm_var": 0.5849609375, "learning_rate": 0.0001, "loss": 6.0081, "loss/crossentropy": 2.190065324306488, "loss/hidden": 1.748046875, "loss/jsd": 0.0, "loss/logits": 0.2069966048002243, "step": 1978 }, { "epoch": 0.09, "grad_norm": 7.5625, "grad_norm_var": 0.6279947916666667, "learning_rate": 0.0001, "loss": 7.0575, "loss/crossentropy": 2.6584523916244507, "loss/hidden": 1.826171875, "loss/jsd": 0.0, "loss/logits": 0.2572856433689594, "step": 1980 }, { "epoch": 0.0900909090909091, "grad_norm": 7.65625, "grad_norm_var": 0.5506510416666667, "learning_rate": 0.0001, "loss": 6.8796, "loss/crossentropy": 2.6703200936317444, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.24046391993761063, "step": 1982 }, { "epoch": 0.09018181818181818, "grad_norm": 6.875, "grad_norm_var": 0.7567545572916666, "learning_rate": 0.0001, "loss": 7.0599, "loss/crossentropy": 2.6706767678260803, "loss/hidden": 1.806640625, "loss/jsd": 0.0, "loss/logits": 0.25825590267777443, "step": 1984 }, { "epoch": 0.09027272727272727, "grad_norm": 8.0, "grad_norm_var": 0.8185506184895833, "learning_rate": 0.0001, "loss": 7.3092, "loss/crossentropy": 2.8618393540382385, "loss/hidden": 1.826171875, "loss/jsd": 0.0, "loss/logits": 0.2621229737997055, "step": 1986 }, { "epoch": 0.09036363636363637, "grad_norm": 6.4375, "grad_norm_var": 0.8114583333333333, "learning_rate": 0.0001, "loss": 6.9204, "loss/crossentropy": 2.6150144934654236, "loss/hidden": 1.818359375, "loss/jsd": 0.0, "loss/logits": 0.2487044334411621, "step": 1988 }, { "epoch": 0.09045454545454545, "grad_norm": 8.375, "grad_norm_var": 0.8708170572916667, "learning_rate": 0.0001, "loss": 7.0034, "loss/crossentropy": 2.6524434089660645, "loss/hidden": 1.8828125, "loss/jsd": 0.0, "loss/logits": 0.24681596085429192, "step": 1990 }, { "epoch": 0.09054545454545454, "grad_norm": 8.25, "grad_norm_var": 1.7554036458333333, "learning_rate": 0.0001, "loss": 6.8203, "loss/crossentropy": 2.4934303760528564, "loss/hidden": 1.876953125, "loss/jsd": 0.0, "loss/logits": 0.24499179422855377, "step": 1992 }, { "epoch": 0.09063636363636364, "grad_norm": 7.21875, "grad_norm_var": 2.1679646809895834, "learning_rate": 0.0001, "loss": 7.0019, "loss/crossentropy": 2.69158935546875, "loss/hidden": 1.830078125, "loss/jsd": 0.0, "loss/logits": 0.2480246238410473, "step": 1994 }, { "epoch": 0.09072727272727273, "grad_norm": 6.4375, "grad_norm_var": 2.35709228515625, "learning_rate": 0.0001, "loss": 6.5155, "loss/crossentropy": 2.475242018699646, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.2258969247341156, "step": 1996 }, { "epoch": 0.09081818181818181, "grad_norm": 6.375, "grad_norm_var": 2.391259765625, "learning_rate": 0.0001, "loss": 6.5569, "loss/crossentropy": 2.4350878596305847, "loss/hidden": 1.830078125, "loss/jsd": 0.0, "loss/logits": 0.22917022556066513, "step": 1998 }, { "epoch": 0.09090909090909091, "grad_norm": 7.375, "grad_norm_var": 2.31158447265625, "learning_rate": 0.0001, "loss": 6.3599, "loss/crossentropy": 2.241253614425659, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.2243676483631134, "step": 2000 }, { "epoch": 0.091, "grad_norm": 8.125, "grad_norm_var": 2.9446614583333335, "learning_rate": 0.0001, "loss": 6.892, "loss/crossentropy": 2.5300811529159546, "loss/hidden": 1.857421875, "loss/jsd": 0.0, "loss/logits": 0.2504505552351475, "step": 2002 }, { "epoch": 0.0910909090909091, "grad_norm": 6.84375, "grad_norm_var": 2.9583333333333335, "learning_rate": 0.0001, "loss": 6.6919, "loss/crossentropy": 2.5454285740852356, "loss/hidden": 1.802734375, "loss/jsd": 0.0, "loss/logits": 0.2343730702996254, "step": 2004 }, { "epoch": 0.09118181818181818, "grad_norm": 7.34375, "grad_norm_var": 2.996223958333333, "learning_rate": 0.0001, "loss": 6.7879, "loss/crossentropy": 2.553304374217987, "loss/hidden": 1.84375, "loss/jsd": 0.0, "loss/logits": 0.2390880547463894, "step": 2006 }, { "epoch": 0.09127272727272727, "grad_norm": 6.9375, "grad_norm_var": 2.169661458333333, "learning_rate": 0.0001, "loss": 6.9267, "loss/crossentropy": 2.630187749862671, "loss/hidden": 1.845703125, "loss/jsd": 0.0, "loss/logits": 0.24507642164826393, "step": 2008 }, { "epoch": 0.09136363636363637, "grad_norm": 6.375, "grad_norm_var": 1.2680989583333333, "learning_rate": 0.0001, "loss": 6.2925, "loss/crossentropy": 2.3485819697380066, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.21626833453774452, "step": 2010 }, { "epoch": 0.09145454545454546, "grad_norm": 6.6875, "grad_norm_var": 1.242822265625, "learning_rate": 0.0001, "loss": 6.6705, "loss/crossentropy": 2.6109142303466797, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.22822590917348862, "step": 2012 }, { "epoch": 0.09154545454545454, "grad_norm": 7.875, "grad_norm_var": 1.1735026041666667, "learning_rate": 0.0001, "loss": 7.0765, "loss/crossentropy": 2.8401160836219788, "loss/hidden": 1.818359375, "loss/jsd": 0.0, "loss/logits": 0.24179907515645027, "step": 2014 }, { "epoch": 0.09163636363636364, "grad_norm": 6.9375, "grad_norm_var": 1.284619140625, "learning_rate": 0.0001, "loss": 6.7872, "loss/crossentropy": 2.6181931495666504, "loss/hidden": 1.802734375, "loss/jsd": 0.0, "loss/logits": 0.23662761226296425, "step": 2016 }, { "epoch": 0.09172727272727273, "grad_norm": 7.03125, "grad_norm_var": 0.26275634765625, "learning_rate": 0.0001, "loss": 6.6924, "loss/crossentropy": 2.5986210107803345, "loss/hidden": 1.787109375, "loss/jsd": 0.0, "loss/logits": 0.2306664139032364, "step": 2018 }, { "epoch": 0.09181818181818181, "grad_norm": 7.40625, "grad_norm_var": 0.2684733072916667, "learning_rate": 0.0001, "loss": 6.9786, "loss/crossentropy": 2.734652101993561, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.24158405140042305, "step": 2020 }, { "epoch": 0.09190909090909091, "grad_norm": 7.5, "grad_norm_var": 0.2512003580729167, "learning_rate": 0.0001, "loss": 6.9365, "loss/crossentropy": 2.7616981863975525, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.23505959659814835, "step": 2022 }, { "epoch": 0.092, "grad_norm": 6.8125, "grad_norm_var": 0.21248372395833334, "learning_rate": 0.0001, "loss": 6.8725, "loss/crossentropy": 2.701369285583496, "loss/hidden": 1.798828125, "loss/jsd": 0.0, "loss/logits": 0.23722998052835464, "step": 2024 }, { "epoch": 0.09209090909090908, "grad_norm": 6.875, "grad_norm_var": 0.20115559895833332, "learning_rate": 0.0001, "loss": 7.0029, "loss/crossentropy": 2.7848921418190002, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.24328401684761047, "step": 2026 }, { "epoch": 0.09218181818181818, "grad_norm": 6.90625, "grad_norm_var": 0.19737955729166667, "learning_rate": 0.0001, "loss": 6.5328, "loss/crossentropy": 2.492722511291504, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.22002365067601204, "step": 2028 }, { "epoch": 0.09227272727272727, "grad_norm": 7.34375, "grad_norm_var": 0.14856363932291666, "learning_rate": 0.0001, "loss": 6.4891, "loss/crossentropy": 2.444334864616394, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.225958663970232, "step": 2030 }, { "epoch": 0.09236363636363637, "grad_norm": 6.9375, "grad_norm_var": 0.19573160807291667, "learning_rate": 0.0001, "loss": 6.3163, "loss/crossentropy": 2.3582222759723663, "loss/hidden": 1.767578125, "loss/jsd": 0.0, "loss/logits": 0.2190546803176403, "step": 2032 }, { "epoch": 0.09245454545454546, "grad_norm": 6.90625, "grad_norm_var": 0.24804280598958334, "learning_rate": 0.0001, "loss": 6.6199, "loss/crossentropy": 2.4163666367530823, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.23792828619480133, "step": 2034 }, { "epoch": 0.09254545454545454, "grad_norm": 6.84375, "grad_norm_var": 0.2589192708333333, "learning_rate": 0.0001, "loss": 6.3589, "loss/crossentropy": 2.3645960092544556, "loss/hidden": 1.783203125, "loss/jsd": 0.0, "loss/logits": 0.22110597044229507, "step": 2036 }, { "epoch": 0.09263636363636364, "grad_norm": 6.46875, "grad_norm_var": 0.2620930989583333, "learning_rate": 0.0001, "loss": 6.7549, "loss/crossentropy": 2.613785892724991, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.23247382417321205, "step": 2038 }, { "epoch": 0.09272727272727273, "grad_norm": 7.1875, "grad_norm_var": 0.2628255208333333, "learning_rate": 0.0001, "loss": 7.0604, "loss/crossentropy": 2.8575403094291687, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.24294546991586685, "step": 2040 }, { "epoch": 0.09281818181818181, "grad_norm": 7.40625, "grad_norm_var": 0.26744384765625, "learning_rate": 0.0001, "loss": 6.6875, "loss/crossentropy": 2.5763700008392334, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.23103249073028564, "step": 2042 }, { "epoch": 0.09290909090909091, "grad_norm": 6.9375, "grad_norm_var": 0.32303059895833336, "learning_rate": 0.0001, "loss": 6.3532, "loss/crossentropy": 2.398654580116272, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.2157638594508171, "step": 2044 }, { "epoch": 0.093, "grad_norm": 6.53125, "grad_norm_var": 0.308837890625, "learning_rate": 0.0001, "loss": 6.53, "loss/crossentropy": 2.566009759902954, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.21944578364491463, "step": 2046 }, { "epoch": 0.09309090909090909, "grad_norm": 6.90625, "grad_norm_var": 0.336328125, "learning_rate": 0.0001, "loss": 6.7425, "loss/crossentropy": 2.585025370121002, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.23449861258268356, "step": 2048 }, { "epoch": 0.09318181818181819, "grad_norm": 8.1875, "grad_norm_var": 0.33590087890625, "learning_rate": 0.0001, "loss": 7.0826, "loss/crossentropy": 2.8642430305480957, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.24293267354369164, "step": 2050 }, { "epoch": 0.09327272727272727, "grad_norm": 6.375, "grad_norm_var": 0.3421223958333333, "learning_rate": 0.0001, "loss": 6.453, "loss/crossentropy": 2.513999283313751, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.2196781374514103, "step": 2052 }, { "epoch": 0.09336363636363636, "grad_norm": 7.375, "grad_norm_var": 0.32888997395833336, "learning_rate": 0.0001, "loss": 6.7995, "loss/crossentropy": 2.663069188594818, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.23473485559225082, "step": 2054 }, { "epoch": 0.09345454545454546, "grad_norm": 6.90625, "grad_norm_var": 0.3259765625, "learning_rate": 0.0001, "loss": 6.232, "loss/crossentropy": 2.226692169904709, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.2165484018623829, "step": 2056 }, { "epoch": 0.09354545454545454, "grad_norm": 6.15625, "grad_norm_var": 0.3715128580729167, "learning_rate": 0.0001, "loss": 6.4596, "loss/crossentropy": 2.5282496213912964, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.21734946593642235, "step": 2058 }, { "epoch": 0.09363636363636364, "grad_norm": 6.9375, "grad_norm_var": 0.3231608072916667, "learning_rate": 0.0001, "loss": 6.518, "loss/crossentropy": 2.569755345582962, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.217091616243124, "step": 2060 }, { "epoch": 0.09372727272727273, "grad_norm": 6.59375, "grad_norm_var": 0.31861979166666665, "learning_rate": 0.0001, "loss": 6.6617, "loss/crossentropy": 2.5537375807762146, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.23188824206590652, "step": 2062 }, { "epoch": 0.09381818181818181, "grad_norm": 8.0625, "grad_norm_var": 0.7538370768229167, "learning_rate": 0.0001, "loss": 6.604, "loss/crossentropy": 2.386505275964737, "loss/hidden": 1.857421875, "loss/jsd": 0.0, "loss/logits": 0.2360030673444271, "step": 2064 }, { "epoch": 0.09390909090909091, "grad_norm": 7.3125, "grad_norm_var": 0.6999308268229166, "learning_rate": 0.0001, "loss": 6.6574, "loss/crossentropy": 2.587347686290741, "loss/hidden": 1.783203125, "loss/jsd": 0.0, "loss/logits": 0.22868825867772102, "step": 2066 }, { "epoch": 0.094, "grad_norm": 6.59375, "grad_norm_var": 0.7378743489583334, "learning_rate": 0.0001, "loss": 6.2055, "loss/crossentropy": 2.3297134041786194, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.20827943459153175, "step": 2068 }, { "epoch": 0.09409090909090909, "grad_norm": 6.75, "grad_norm_var": 0.767041015625, "learning_rate": 0.0001, "loss": 6.7153, "loss/crossentropy": 2.5760812163352966, "loss/hidden": 1.810546875, "loss/jsd": 0.0, "loss/logits": 0.23286651447415352, "step": 2070 }, { "epoch": 0.09418181818181819, "grad_norm": 6.125, "grad_norm_var": 0.8524088541666667, "learning_rate": 0.0001, "loss": 6.2865, "loss/crossentropy": 2.418145090341568, "loss/hidden": 1.767578125, "loss/jsd": 0.0, "loss/logits": 0.21007786318659782, "step": 2072 }, { "epoch": 0.09427272727272727, "grad_norm": 7.1875, "grad_norm_var": 0.80074462890625, "learning_rate": 0.0001, "loss": 6.754, "loss/crossentropy": 2.641834318637848, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.23465442657470703, "step": 2074 }, { "epoch": 0.09436363636363636, "grad_norm": 6.125, "grad_norm_var": 0.8399576822916667, "learning_rate": 0.0001, "loss": 6.5192, "loss/crossentropy": 2.545854091644287, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.2192121110856533, "step": 2076 }, { "epoch": 0.09445454545454546, "grad_norm": 7.0625, "grad_norm_var": 0.8243123372395833, "learning_rate": 0.0001, "loss": 6.7364, "loss/crossentropy": 2.656651198863983, "loss/hidden": 1.791015625, "loss/jsd": 0.0, "loss/logits": 0.22887061908841133, "step": 2078 }, { "epoch": 0.09454545454545454, "grad_norm": 6.71875, "grad_norm_var": 0.21080322265625, "learning_rate": 0.0001, "loss": 6.427, "loss/crossentropy": 2.4592573046684265, "loss/hidden": 1.767578125, "loss/jsd": 0.0, "loss/logits": 0.22001325711607933, "step": 2080 }, { "epoch": 0.09463636363636363, "grad_norm": 7.75, "grad_norm_var": 0.7753214518229167, "learning_rate": 0.0001, "loss": 7.0811, "loss/crossentropy": 2.744954526424408, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.25080181658267975, "step": 2082 }, { "epoch": 0.09472727272727273, "grad_norm": 6.40625, "grad_norm_var": 0.7706339518229167, "learning_rate": 0.0001, "loss": 6.5371, "loss/crossentropy": 2.5192238092422485, "loss/hidden": 1.787109375, "loss/jsd": 0.0, "loss/logits": 0.22307667508721352, "step": 2084 }, { "epoch": 0.09481818181818182, "grad_norm": 6.46875, "grad_norm_var": 0.7598307291666667, "learning_rate": 0.0001, "loss": 6.5472, "loss/crossentropy": 2.5241917967796326, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.2249603345990181, "step": 2086 }, { "epoch": 0.09490909090909092, "grad_norm": 6.75, "grad_norm_var": 0.73922119140625, "learning_rate": 0.0001, "loss": 6.6305, "loss/crossentropy": 2.552960515022278, "loss/hidden": 1.806640625, "loss/jsd": 0.0, "loss/logits": 0.227085392922163, "step": 2088 }, { "epoch": 0.095, "grad_norm": 6.5625, "grad_norm_var": 0.7425618489583333, "learning_rate": 0.0001, "loss": 6.3944, "loss/crossentropy": 2.371616393327713, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.2241552509367466, "step": 2090 }, { "epoch": 0.09509090909090909, "grad_norm": 8.3125, "grad_norm_var": 0.7725545247395833, "learning_rate": 0.0001, "loss": 6.8683, "loss/crossentropy": 2.7457125782966614, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.23256758227944374, "step": 2092 }, { "epoch": 0.09518181818181819, "grad_norm": 7.59375, "grad_norm_var": 0.7800618489583333, "learning_rate": 0.0001, "loss": 7.0352, "loss/crossentropy": 2.7208399772644043, "loss/hidden": 1.810546875, "loss/jsd": 0.0, "loss/logits": 0.25037631392478943, "step": 2094 }, { "epoch": 0.09527272727272727, "grad_norm": 7.1875, "grad_norm_var": 0.75523681640625, "learning_rate": 0.0001, "loss": 6.9473, "loss/crossentropy": 2.700271487236023, "loss/hidden": 1.822265625, "loss/jsd": 0.0, "loss/logits": 0.2424807846546173, "step": 2096 }, { "epoch": 0.09536363636363636, "grad_norm": 8.75, "grad_norm_var": 0.49191080729166664, "learning_rate": 0.0001, "loss": 6.3716, "loss/crossentropy": 2.4448878169059753, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.21923352032899857, "step": 2098 }, { "epoch": 0.09545454545454546, "grad_norm": 7.15625, "grad_norm_var": 0.4098917643229167, "learning_rate": 0.0001, "loss": 6.6535, "loss/crossentropy": 2.5035911798477173, "loss/hidden": 1.841796875, "loss/jsd": 0.0, "loss/logits": 0.23081107437610626, "step": 2100 }, { "epoch": 0.09554545454545454, "grad_norm": 6.875, "grad_norm_var": 0.45702718098958334, "learning_rate": 0.0001, "loss": 6.4391, "loss/crossentropy": 2.4602227807044983, "loss/hidden": 1.806640625, "loss/jsd": 0.0, "loss/logits": 0.2172190472483635, "step": 2102 }, { "epoch": 0.09563636363636363, "grad_norm": 7.71875, "grad_norm_var": 0.44000244140625, "learning_rate": 0.0001, "loss": 6.3561, "loss/crossentropy": 2.2470219135284424, "loss/hidden": 1.826171875, "loss/jsd": 0.0, "loss/logits": 0.22828913107514381, "step": 2104 }, { "epoch": 0.09572727272727273, "grad_norm": 6.8125, "grad_norm_var": 0.40396728515625, "learning_rate": 0.0001, "loss": 6.7415, "loss/crossentropy": 2.540314257144928, "loss/hidden": 1.853515625, "loss/jsd": 0.0, "loss/logits": 0.23477141559123993, "step": 2106 }, { "epoch": 0.09581818181818182, "grad_norm": 7.84375, "grad_norm_var": 0.6978515625, "learning_rate": 0.0001, "loss": 6.6761, "loss/crossentropy": 2.5202813148498535, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.23277314007282257, "step": 2108 }, { "epoch": 0.0959090909090909, "grad_norm": 7.125, "grad_norm_var": 0.7522745768229167, "learning_rate": 0.0001, "loss": 6.8031, "loss/crossentropy": 2.671639621257782, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.23424390703439713, "step": 2110 }, { "epoch": 0.096, "grad_norm": 6.90625, "grad_norm_var": 0.7814737955729166, "learning_rate": 0.0001, "loss": 6.4607, "loss/crossentropy": 2.396448493003845, "loss/hidden": 1.845703125, "loss/jsd": 0.0, "loss/logits": 0.2218533381819725, "step": 2112 }, { "epoch": 0.09609090909090909, "grad_norm": 6.53125, "grad_norm_var": 0.7148274739583333, "learning_rate": 0.0001, "loss": 6.5096, "loss/crossentropy": 2.4954736828804016, "loss/hidden": 1.798828125, "loss/jsd": 0.0, "loss/logits": 0.22153464332222939, "step": 2114 }, { "epoch": 0.09618181818181819, "grad_norm": 6.75, "grad_norm_var": 0.7447916666666666, "learning_rate": 0.0001, "loss": 7.1388, "loss/crossentropy": 2.788123309612274, "loss/hidden": 1.802734375, "loss/jsd": 0.0, "loss/logits": 0.25479064881801605, "step": 2116 }, { "epoch": 0.09627272727272727, "grad_norm": 7.5, "grad_norm_var": 0.6679646809895833, "learning_rate": 0.0001, "loss": 6.6064, "loss/crossentropy": 2.509108930826187, "loss/hidden": 1.83203125, "loss/jsd": 0.0, "loss/logits": 0.2265244536101818, "step": 2118 }, { "epoch": 0.09636363636363636, "grad_norm": 6.4375, "grad_norm_var": 0.68277587890625, "learning_rate": 0.0001, "loss": 6.6837, "loss/crossentropy": 2.591402232646942, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.23071609809994698, "step": 2120 }, { "epoch": 0.09645454545454546, "grad_norm": 8.4375, "grad_norm_var": 0.7980305989583333, "learning_rate": 0.0001, "loss": 6.5772, "loss/crossentropy": 2.499972105026245, "loss/hidden": 1.775390625, "loss/jsd": 0.0, "loss/logits": 0.23018122091889381, "step": 2122 }, { "epoch": 0.09654545454545455, "grad_norm": 6.40625, "grad_norm_var": 0.28720296223958336, "learning_rate": 0.0001, "loss": 6.6528, "loss/crossentropy": 2.625283181667328, "loss/hidden": 1.767578125, "loss/jsd": 0.0, "loss/logits": 0.22599781304597855, "step": 2124 }, { "epoch": 0.09663636363636363, "grad_norm": 7.34375, "grad_norm_var": 0.35917561848958335, "learning_rate": 0.0001, "loss": 6.7389, "loss/crossentropy": 2.6761947870254517, "loss/hidden": 1.791015625, "loss/jsd": 0.0, "loss/logits": 0.2271675169467926, "step": 2126 }, { "epoch": 0.09672727272727273, "grad_norm": 7.25, "grad_norm_var": 0.3910115559895833, "learning_rate": 0.0001, "loss": 6.6808, "loss/crossentropy": 2.411289691925049, "loss/hidden": 1.841796875, "loss/jsd": 0.0, "loss/logits": 0.24277274310588837, "step": 2128 }, { "epoch": 0.09681818181818182, "grad_norm": 6.8125, "grad_norm_var": 0.3890625, "learning_rate": 0.0001, "loss": 6.7916, "loss/crossentropy": 2.6266147792339325, "loss/hidden": 1.798828125, "loss/jsd": 0.0, "loss/logits": 0.23661914840340614, "step": 2130 }, { "epoch": 0.0969090909090909, "grad_norm": 8.25, "grad_norm_var": 0.4632161458333333, "learning_rate": 0.0001, "loss": 6.4221, "loss/crossentropy": 2.4310688972473145, "loss/hidden": 1.830078125, "loss/jsd": 0.0, "loss/logits": 0.2160940058529377, "step": 2132 }, { "epoch": 0.097, "grad_norm": 6.84375, "grad_norm_var": 0.5608723958333334, "learning_rate": 0.0001, "loss": 6.5807, "loss/crossentropy": 2.4189382791519165, "loss/hidden": 1.830078125, "loss/jsd": 0.0, "loss/logits": 0.23317326605319977, "step": 2134 }, { "epoch": 0.09709090909090909, "grad_norm": 6.46875, "grad_norm_var": 0.55650634765625, "learning_rate": 0.0001, "loss": 6.4657, "loss/crossentropy": 2.4395387172698975, "loss/hidden": 1.791015625, "loss/jsd": 0.0, "loss/logits": 0.22351185977458954, "step": 2136 }, { "epoch": 0.09718181818181819, "grad_norm": 6.96875, "grad_norm_var": 0.41646728515625, "learning_rate": 0.0001, "loss": 6.7802, "loss/crossentropy": 2.700277268886566, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.22595683485269547, "step": 2138 }, { "epoch": 0.09727272727272727, "grad_norm": 7.0625, "grad_norm_var": 0.4166951497395833, "learning_rate": 0.0001, "loss": 7.1203, "loss/crossentropy": 2.882878839969635, "loss/hidden": 1.802734375, "loss/jsd": 0.0, "loss/logits": 0.24346697330474854, "step": 2140 }, { "epoch": 0.09736363636363636, "grad_norm": 7.4375, "grad_norm_var": 0.3345703125, "learning_rate": 0.0001, "loss": 6.9478, "loss/crossentropy": 2.7755722403526306, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.2375350184738636, "step": 2142 }, { "epoch": 0.09745454545454546, "grad_norm": 7.3125, "grad_norm_var": 0.32467041015625, "learning_rate": 0.0001, "loss": 6.8898, "loss/crossentropy": 2.739086151123047, "loss/hidden": 1.818359375, "loss/jsd": 0.0, "loss/logits": 0.2332373633980751, "step": 2144 }, { "epoch": 0.09754545454545455, "grad_norm": 6.59375, "grad_norm_var": 0.35540364583333334, "learning_rate": 0.0001, "loss": 6.309, "loss/crossentropy": 2.414136290550232, "loss/hidden": 1.76171875, "loss/jsd": 0.0, "loss/logits": 0.21331625059247017, "step": 2146 }, { "epoch": 0.09763636363636363, "grad_norm": 6.40625, "grad_norm_var": 0.392822265625, "learning_rate": 0.0001, "loss": 6.2145, "loss/crossentropy": 2.2327563166618347, "loss/hidden": 1.798828125, "loss/jsd": 0.0, "loss/logits": 0.21829446777701378, "step": 2148 }, { "epoch": 0.09772727272727273, "grad_norm": 6.40625, "grad_norm_var": 0.33941650390625, "learning_rate": 0.0001, "loss": 5.837, "loss/crossentropy": 2.0580545365810394, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.1997738629579544, "step": 2150 }, { "epoch": 0.09781818181818182, "grad_norm": 6.59375, "grad_norm_var": 0.3317342122395833, "learning_rate": 0.0001, "loss": 6.3243, "loss/crossentropy": 2.350900173187256, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.21764787659049034, "step": 2152 }, { "epoch": 0.0979090909090909, "grad_norm": 6.09375, "grad_norm_var": 0.386181640625, "learning_rate": 0.0001, "loss": 6.5675, "loss/crossentropy": 2.5784101486206055, "loss/hidden": 1.763671875, "loss/jsd": 0.0, "loss/logits": 0.222539484500885, "step": 2154 }, { "epoch": 0.098, "grad_norm": 6.9375, "grad_norm_var": 0.3607381184895833, "learning_rate": 0.0001, "loss": 6.5092, "loss/crossentropy": 2.41951847076416, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.22967032715678215, "step": 2156 }, { "epoch": 0.09809090909090909, "grad_norm": 7.3125, "grad_norm_var": 0.3225911458333333, "learning_rate": 0.0001, "loss": 6.5006, "loss/crossentropy": 2.5102153420448303, "loss/hidden": 1.771484375, "loss/jsd": 0.0, "loss/logits": 0.22188511118292809, "step": 2158 }, { "epoch": 0.09818181818181818, "grad_norm": 7.1875, "grad_norm_var": 0.3173014322916667, "learning_rate": 0.0001, "loss": 7.0453, "loss/crossentropy": 2.7568907737731934, "loss/hidden": 1.810546875, "loss/jsd": 0.0, "loss/logits": 0.24778494238853455, "step": 2160 }, { "epoch": 0.09827272727272728, "grad_norm": 6.84375, "grad_norm_var": 0.3002237955729167, "learning_rate": 0.0001, "loss": 6.6804, "loss/crossentropy": 2.5721816420555115, "loss/hidden": 1.759765625, "loss/jsd": 0.0, "loss/logits": 0.234842237085104, "step": 2162 }, { "epoch": 0.09836363636363636, "grad_norm": 6.875, "grad_norm_var": 0.18834635416666667, "learning_rate": 0.0001, "loss": 6.5505, "loss/crossentropy": 2.5367932319641113, "loss/hidden": 1.767578125, "loss/jsd": 0.0, "loss/logits": 0.22461143881082535, "step": 2164 }, { "epoch": 0.09845454545454546, "grad_norm": 7.5625, "grad_norm_var": 0.20295817057291668, "learning_rate": 0.0001, "loss": 7.0234, "loss/crossentropy": 2.8240426778793335, "loss/hidden": 1.794921875, "loss/jsd": 0.0, "loss/logits": 0.24044154584407806, "step": 2166 }, { "epoch": 0.09854545454545455, "grad_norm": 7.03125, "grad_norm_var": 0.2540364583333333, "learning_rate": 0.0001, "loss": 6.3853, "loss/crossentropy": 2.407252550125122, "loss/hidden": 1.76171875, "loss/jsd": 0.0, "loss/logits": 0.2216377668082714, "step": 2168 }, { "epoch": 0.09863636363636363, "grad_norm": 6.1875, "grad_norm_var": 0.2975545247395833, "learning_rate": 0.0001, "loss": 6.6616, "loss/crossentropy": 2.518974244594574, "loss/hidden": 1.814453125, "loss/jsd": 0.0, "loss/logits": 0.2328217215836048, "step": 2170 }, { "epoch": 0.09872727272727273, "grad_norm": 7.25, "grad_norm_var": 0.2884724934895833, "learning_rate": 0.0001, "loss": 6.9184, "loss/crossentropy": 2.7335253953933716, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.23997018858790398, "step": 2172 }, { "epoch": 0.09881818181818182, "grad_norm": 6.03125, "grad_norm_var": 0.3552734375, "learning_rate": 0.0001, "loss": 6.5442, "loss/crossentropy": 2.522932231426239, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.2267369106411934, "step": 2174 }, { "epoch": 0.0989090909090909, "grad_norm": 7.0, "grad_norm_var": 0.38599853515625, "learning_rate": 0.0001, "loss": 6.4606, "loss/crossentropy": 2.445783317089081, "loss/hidden": 1.787109375, "loss/jsd": 0.0, "loss/logits": 0.22276851534843445, "step": 2176 }, { "epoch": 0.099, "grad_norm": 7.9375, "grad_norm_var": 0.4577433268229167, "learning_rate": 0.0001, "loss": 6.6168, "loss/crossentropy": 2.5790339708328247, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.2248680219054222, "step": 2178 }, { "epoch": 0.09909090909090909, "grad_norm": 6.96875, "grad_norm_var": 0.42564697265625, "learning_rate": 0.0001, "loss": 6.7725, "loss/crossentropy": 2.5257643461227417, "loss/hidden": 1.837890625, "loss/jsd": 0.0, "loss/logits": 0.2408846616744995, "step": 2180 }, { "epoch": 0.09918181818181818, "grad_norm": 6.65625, "grad_norm_var": 0.4162068684895833, "learning_rate": 0.0001, "loss": 6.7386, "loss/crossentropy": 2.682630240917206, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.22434347867965698, "step": 2182 }, { "epoch": 0.09927272727272728, "grad_norm": 6.8125, "grad_norm_var": 0.35383707682291665, "learning_rate": 0.0001, "loss": 6.7678, "loss/crossentropy": 2.6592246294021606, "loss/hidden": 1.767578125, "loss/jsd": 0.0, "loss/logits": 0.2340964786708355, "step": 2184 }, { "epoch": 0.09936363636363636, "grad_norm": 7.0625, "grad_norm_var": 0.24993489583333334, "learning_rate": 0.0001, "loss": 7.0078, "loss/crossentropy": 2.7653265595436096, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.24494673311710358, "step": 2186 }, { "epoch": 0.09945454545454545, "grad_norm": 7.0625, "grad_norm_var": 0.25362955729166664, "learning_rate": 0.0001, "loss": 6.7173, "loss/crossentropy": 2.687639355659485, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.22718212008476257, "step": 2188 }, { "epoch": 0.09954545454545455, "grad_norm": 8.1875, "grad_norm_var": 0.28956705729166665, "learning_rate": 0.0001, "loss": 6.4868, "loss/crossentropy": 2.4370796382427216, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.2229364961385727, "step": 2190 }, { "epoch": 0.09963636363636363, "grad_norm": 6.625, "grad_norm_var": 0.2638956705729167, "learning_rate": 0.0001, "loss": 6.2177, "loss/crossentropy": 2.352575182914734, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.20917226374149323, "step": 2192 }, { "epoch": 0.09972727272727273, "grad_norm": 6.84375, "grad_norm_var": 0.2087890625, "learning_rate": 0.0001, "loss": 6.4927, "loss/crossentropy": 2.4823288321495056, "loss/hidden": 1.783203125, "loss/jsd": 0.0, "loss/logits": 0.2227180041372776, "step": 2194 }, { "epoch": 0.09981818181818182, "grad_norm": 7.25, "grad_norm_var": 0.20738525390625, "learning_rate": 0.0001, "loss": 6.5974, "loss/crossentropy": 2.488089621067047, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.2324165664613247, "step": 2196 }, { "epoch": 0.0999090909090909, "grad_norm": 6.90625, "grad_norm_var": 0.20362955729166668, "learning_rate": 0.0001, "loss": 6.4152, "loss/crossentropy": 2.41095232963562, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.21917683631181717, "step": 2198 }, { "epoch": 0.1, "grad_norm": 6.90625, "grad_norm_var": 0.21780192057291667, "learning_rate": 0.0001, "loss": 6.4278, "loss/crossentropy": 2.4413758516311646, "loss/hidden": 1.791015625, "loss/jsd": 0.0, "loss/logits": 0.2195441648364067, "step": 2200 }, { "epoch": 0.10009090909090909, "grad_norm": 6.1875, "grad_norm_var": 0.23279622395833333, "learning_rate": 0.0001, "loss": 6.5211, "loss/crossentropy": 2.5226413011550903, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.22406179457902908, "step": 2202 }, { "epoch": 0.10018181818181818, "grad_norm": 8.4375, "grad_norm_var": 0.39263916015625, "learning_rate": 0.0001, "loss": 6.3532, "loss/crossentropy": 2.324337989091873, "loss/hidden": 1.837890625, "loss/jsd": 0.0, "loss/logits": 0.2191016599535942, "step": 2204 }, { "epoch": 0.10027272727272728, "grad_norm": 6.6875, "grad_norm_var": 0.31051025390625, "learning_rate": 0.0001, "loss": 6.8847, "loss/crossentropy": 2.6668474674224854, "loss/hidden": 1.806640625, "loss/jsd": 0.0, "loss/logits": 0.24112073332071304, "step": 2206 }, { "epoch": 0.10036363636363636, "grad_norm": 8.3125, "grad_norm_var": 0.4899373372395833, "learning_rate": 0.0001, "loss": 6.163, "loss/crossentropy": 2.237955093383789, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.21398672834038734, "step": 2208 }, { "epoch": 0.10045454545454545, "grad_norm": 7.59375, "grad_norm_var": 0.5064412434895833, "learning_rate": 0.0001, "loss": 6.8958, "loss/crossentropy": 2.7114394307136536, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.24109404534101486, "step": 2210 }, { "epoch": 0.10054545454545455, "grad_norm": 6.6875, "grad_norm_var": 0.46903889973958335, "learning_rate": 0.0001, "loss": 6.1174, "loss/crossentropy": 2.2371974289417267, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.2087279036641121, "step": 2212 }, { "epoch": 0.10063636363636363, "grad_norm": 7.09375, "grad_norm_var": 0.460546875, "learning_rate": 0.0001, "loss": 6.5584, "loss/crossentropy": 2.4349502325057983, "loss/hidden": 1.802734375, "loss/jsd": 0.0, "loss/logits": 0.23207452148199081, "step": 2214 }, { "epoch": 0.10072727272727272, "grad_norm": 7.6875, "grad_norm_var": 0.49703369140625, "learning_rate": 0.0001, "loss": 6.5691, "loss/crossentropy": 2.5476068258285522, "loss/hidden": 1.775390625, "loss/jsd": 0.0, "loss/logits": 0.22461135312914848, "step": 2216 }, { "epoch": 0.10081818181818182, "grad_norm": 6.25, "grad_norm_var": 0.48190104166666664, "learning_rate": 0.0001, "loss": 6.6147, "loss/crossentropy": 2.596009314060211, "loss/hidden": 1.755859375, "loss/jsd": 0.0, "loss/logits": 0.22628504410386086, "step": 2218 }, { "epoch": 0.1009090909090909, "grad_norm": 8.375, "grad_norm_var": 522.5930989583334, "learning_rate": 0.0001, "loss": 6.9675, "loss/crossentropy": 2.4377543926239014, "loss/hidden": 1.880859375, "loss/jsd": 0.0, "loss/logits": 0.26489150151610374, "step": 2220 }, { "epoch": 0.101, "grad_norm": 6.78125, "grad_norm_var": 522.6274576822917, "learning_rate": 0.0001, "loss": 6.7362, "loss/crossentropy": 2.647149980068207, "loss/hidden": 1.794921875, "loss/jsd": 0.0, "loss/logits": 0.22941048815846443, "step": 2222 }, { "epoch": 0.10109090909090909, "grad_norm": 7.375, "grad_norm_var": 523.0505167643229, "learning_rate": 0.0001, "loss": 6.3639, "loss/crossentropy": 2.3641132712364197, "loss/hidden": 1.771484375, "loss/jsd": 0.0, "loss/logits": 0.2228301428258419, "step": 2224 }, { "epoch": 0.10118181818181818, "grad_norm": 6.4375, "grad_norm_var": 524.2480102539063, "learning_rate": 0.0001, "loss": 6.3997, "loss/crossentropy": 2.423394203186035, "loss/hidden": 1.779296875, "loss/jsd": 0.0, "loss/logits": 0.21969591826200485, "step": 2226 }, { "epoch": 0.10127272727272728, "grad_norm": 6.875, "grad_norm_var": 524.2173014322917, "learning_rate": 0.0001, "loss": 6.879, "loss/crossentropy": 2.7657694816589355, "loss/hidden": 1.759765625, "loss/jsd": 0.0, "loss/logits": 0.23534775152802467, "step": 2228 }, { "epoch": 0.10136363636363636, "grad_norm": 6.09375, "grad_norm_var": 524.648291015625, "learning_rate": 0.0001, "loss": 6.6195, "loss/crossentropy": 2.561352461576462, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.22846758365631104, "step": 2230 }, { "epoch": 0.10145454545454545, "grad_norm": 6.40625, "grad_norm_var": 525.1431599934896, "learning_rate": 0.0001, "loss": 6.9073, "loss/crossentropy": 2.8422860503196716, "loss/hidden": 1.755859375, "loss/jsd": 0.0, "loss/logits": 0.2309170439839363, "step": 2232 }, { "epoch": 0.10154545454545455, "grad_norm": 6.84375, "grad_norm_var": 525.0016276041666, "learning_rate": 0.0001, "loss": 6.621, "loss/crossentropy": 2.6193114519119263, "loss/hidden": 1.76171875, "loss/jsd": 0.0, "loss/logits": 0.22399931028485298, "step": 2234 }, { "epoch": 0.10163636363636364, "grad_norm": 7.53125, "grad_norm_var": 0.18253580729166666, "learning_rate": 0.0001, "loss": 6.5713, "loss/crossentropy": 2.550194561481476, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.22633255645632744, "step": 2236 }, { "epoch": 0.10172727272727272, "grad_norm": 7.96875, "grad_norm_var": 0.26903889973958334, "learning_rate": 0.0001, "loss": 6.5654, "loss/crossentropy": 2.550317645072937, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.2249424159526825, "step": 2238 }, { "epoch": 0.10181818181818182, "grad_norm": 6.875, "grad_norm_var": 0.4161458333333333, "learning_rate": 0.0001, "loss": 7.0694, "loss/crossentropy": 2.8484217524528503, "loss/hidden": 1.771484375, "loss/jsd": 0.0, "loss/logits": 0.24495243281126022, "step": 2240 }, { "epoch": 0.10190909090909091, "grad_norm": 6.75, "grad_norm_var": 0.3954264322916667, "learning_rate": 0.0001, "loss": 6.9043, "loss/crossentropy": 2.7802500128746033, "loss/hidden": 1.802734375, "loss/jsd": 0.0, "loss/logits": 0.2321292981505394, "step": 2242 }, { "epoch": 0.102, "grad_norm": 6.46875, "grad_norm_var": 0.4181599934895833, "learning_rate": 0.0001, "loss": 6.3306, "loss/crossentropy": 2.4664966464042664, "loss/hidden": 1.779296875, "loss/jsd": 0.0, "loss/logits": 0.2084815464913845, "step": 2244 }, { "epoch": 0.1020909090909091, "grad_norm": 6.8125, "grad_norm_var": 0.37157796223958334, "learning_rate": 0.0001, "loss": 6.8956, "loss/crossentropy": 2.6814700961112976, "loss/hidden": 1.806640625, "loss/jsd": 0.0, "loss/logits": 0.24074804782867432, "step": 2246 }, { "epoch": 0.10218181818181818, "grad_norm": 6.53125, "grad_norm_var": 0.363134765625, "learning_rate": 0.0001, "loss": 6.3947, "loss/crossentropy": 2.508235812187195, "loss/hidden": 1.759765625, "loss/jsd": 0.0, "loss/logits": 0.21267321333289146, "step": 2248 }, { "epoch": 0.10227272727272728, "grad_norm": 6.8125, "grad_norm_var": 0.42782796223958336, "learning_rate": 0.0001, "loss": 6.0685, "loss/crossentropy": 2.191565215587616, "loss/hidden": 1.779296875, "loss/jsd": 0.0, "loss/logits": 0.20976407825946808, "step": 2250 }, { "epoch": 0.10236363636363636, "grad_norm": 6.6875, "grad_norm_var": 0.40103759765625, "learning_rate": 0.0001, "loss": 6.2114, "loss/crossentropy": 2.347328245639801, "loss/hidden": 1.740234375, "loss/jsd": 0.0, "loss/logits": 0.21238329634070396, "step": 2252 }, { "epoch": 0.10245454545454545, "grad_norm": 6.5, "grad_norm_var": 0.46099853515625, "learning_rate": 0.0001, "loss": 6.5581, "loss/crossentropy": 2.5097222328186035, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.22553923353552818, "step": 2254 }, { "epoch": 0.10254545454545455, "grad_norm": 7.40625, "grad_norm_var": 0.31669514973958335, "learning_rate": 0.0001, "loss": 6.4353, "loss/crossentropy": 2.447897434234619, "loss/hidden": 1.787109375, "loss/jsd": 0.0, "loss/logits": 0.22002848610281944, "step": 2256 }, { "epoch": 0.10263636363636364, "grad_norm": 6.90625, "grad_norm_var": 0.33990478515625, "learning_rate": 0.0001, "loss": 6.557, "loss/crossentropy": 2.501563847064972, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.2274167314171791, "step": 2258 }, { "epoch": 0.10272727272727272, "grad_norm": 6.25, "grad_norm_var": 0.3546875, "learning_rate": 0.0001, "loss": 6.7581, "loss/crossentropy": 2.6757776141166687, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.22932901605963707, "step": 2260 }, { "epoch": 0.10281818181818182, "grad_norm": 6.5, "grad_norm_var": 0.3541015625, "learning_rate": 0.0001, "loss": 6.7718, "loss/crossentropy": 2.7472312450408936, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.22472640126943588, "step": 2262 }, { "epoch": 0.10290909090909091, "grad_norm": 6.375, "grad_norm_var": 0.35638020833333334, "learning_rate": 0.0001, "loss": 6.4199, "loss/crossentropy": 2.5249348878860474, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.21371610835194588, "step": 2264 }, { "epoch": 0.103, "grad_norm": 7.3125, "grad_norm_var": 0.33513997395833334, "learning_rate": 0.0001, "loss": 6.3546, "loss/crossentropy": 2.439451038837433, "loss/hidden": 1.736328125, "loss/jsd": 0.0, "loss/logits": 0.21787745878100395, "step": 2266 }, { "epoch": 0.1030909090909091, "grad_norm": 7.6875, "grad_norm_var": 0.4225870768229167, "learning_rate": 0.0001, "loss": 7.0985, "loss/crossentropy": 2.780232310295105, "loss/hidden": 1.822265625, "loss/jsd": 0.0, "loss/logits": 0.2496005967259407, "step": 2268 }, { "epoch": 0.10318181818181818, "grad_norm": 8.0, "grad_norm_var": 0.3700480143229167, "learning_rate": 0.0001, "loss": 7.1162, "loss/crossentropy": 2.700486898422241, "loss/hidden": 1.802734375, "loss/jsd": 0.0, "loss/logits": 0.2612946406006813, "step": 2270 }, { "epoch": 0.10327272727272727, "grad_norm": 7.1875, "grad_norm_var": 0.3553670247395833, "learning_rate": 0.0001, "loss": 6.596, "loss/crossentropy": 2.4818791151046753, "loss/hidden": 1.779296875, "loss/jsd": 0.0, "loss/logits": 0.233485896140337, "step": 2272 }, { "epoch": 0.10336363636363637, "grad_norm": 6.03125, "grad_norm_var": 0.38599853515625, "learning_rate": 0.0001, "loss": 6.2088, "loss/crossentropy": 2.3670301735401154, "loss/hidden": 1.763671875, "loss/jsd": 0.0, "loss/logits": 0.20781396329402924, "step": 2274 }, { "epoch": 0.10345454545454545, "grad_norm": 6.78125, "grad_norm_var": 0.36799723307291665, "learning_rate": 0.0001, "loss": 6.8662, "loss/crossentropy": 2.7041377425193787, "loss/hidden": 1.771484375, "loss/jsd": 0.0, "loss/logits": 0.2390565648674965, "step": 2276 }, { "epoch": 0.10354545454545455, "grad_norm": 7.84375, "grad_norm_var": 0.4105428059895833, "learning_rate": 0.0001, "loss": 6.9249, "loss/crossentropy": 2.6992441415786743, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.24248278141021729, "step": 2278 }, { "epoch": 0.10363636363636364, "grad_norm": 7.65625, "grad_norm_var": 0.40078125, "learning_rate": 0.0001, "loss": 7.0277, "loss/crossentropy": 2.7696458101272583, "loss/hidden": 1.779296875, "loss/jsd": 0.0, "loss/logits": 0.24787752330303192, "step": 2280 }, { "epoch": 0.10372727272727272, "grad_norm": 6.5625, "grad_norm_var": 0.4434895833333333, "learning_rate": 0.0001, "loss": 6.1919, "loss/crossentropy": 2.3074000775814056, "loss/hidden": 1.76171875, "loss/jsd": 0.0, "loss/logits": 0.2122797667980194, "step": 2282 }, { "epoch": 0.10381818181818182, "grad_norm": 7.5, "grad_norm_var": 0.41695556640625, "learning_rate": 0.0001, "loss": 7.1129, "loss/crossentropy": 2.833618938922882, "loss/hidden": 1.775390625, "loss/jsd": 0.0, "loss/logits": 0.25038741156458855, "step": 2284 }, { "epoch": 0.10390909090909091, "grad_norm": 6.125, "grad_norm_var": 0.36431884765625, "learning_rate": 0.0001, "loss": 6.4495, "loss/crossentropy": 2.513339042663574, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.22096241265535355, "step": 2286 }, { "epoch": 0.104, "grad_norm": 7.21875, "grad_norm_var": 0.36139322916666666, "learning_rate": 0.0001, "loss": 6.8389, "loss/crossentropy": 2.758551836013794, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.2334234081208706, "step": 2288 }, { "epoch": 0.1040909090909091, "grad_norm": 6.65625, "grad_norm_var": 0.32994384765625, "learning_rate": 0.0001, "loss": 6.6669, "loss/crossentropy": 2.6375975012779236, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.22792958840727806, "step": 2290 }, { "epoch": 0.10418181818181818, "grad_norm": 6.34375, "grad_norm_var": 0.3448567708333333, "learning_rate": 0.0001, "loss": 6.4786, "loss/crossentropy": 2.5383864641189575, "loss/hidden": 1.798828125, "loss/jsd": 0.0, "loss/logits": 0.21413367614150047, "step": 2292 }, { "epoch": 0.10427272727272727, "grad_norm": 6.875, "grad_norm_var": 0.273828125, "learning_rate": 0.0001, "loss": 6.7324, "loss/crossentropy": 2.5394334197044373, "loss/hidden": 1.798828125, "loss/jsd": 0.0, "loss/logits": 0.23941514641046524, "step": 2294 }, { "epoch": 0.10436363636363637, "grad_norm": 6.4375, "grad_norm_var": 0.213525390625, "learning_rate": 0.0001, "loss": 6.3908, "loss/crossentropy": 2.5319733917713165, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.21009671688079834, "step": 2296 }, { "epoch": 0.10445454545454545, "grad_norm": 7.90625, "grad_norm_var": 7.989436848958333, "learning_rate": 0.0001, "loss": 7.1575, "loss/crossentropy": 2.806516468524933, "loss/hidden": 1.791015625, "loss/jsd": 0.0, "loss/logits": 0.25599434971809387, "step": 2298 }, { "epoch": 0.10454545454545454, "grad_norm": 8.8125, "grad_norm_var": 8.139774576822917, "learning_rate": 0.0001, "loss": 6.4675, "loss/crossentropy": 2.537616491317749, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.21642249077558517, "step": 2300 }, { "epoch": 0.10463636363636364, "grad_norm": 6.40625, "grad_norm_var": 8.000764973958333, "learning_rate": 0.0001, "loss": 6.4188, "loss/crossentropy": 2.4912667274475098, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.21736649796366692, "step": 2302 }, { "epoch": 0.10472727272727272, "grad_norm": 6.25, "grad_norm_var": 8.103645833333333, "learning_rate": 0.0001, "loss": 6.2991, "loss/crossentropy": 2.38142991065979, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.21637261286377907, "step": 2304 }, { "epoch": 0.10481818181818182, "grad_norm": 7.125, "grad_norm_var": 8.0087890625, "learning_rate": 0.0001, "loss": 6.502, "loss/crossentropy": 2.495523691177368, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.22525475546717644, "step": 2306 }, { "epoch": 0.10490909090909091, "grad_norm": 6.8125, "grad_norm_var": 7.864306640625, "learning_rate": 0.0001, "loss": 6.5917, "loss/crossentropy": 2.4820730686187744, "loss/hidden": 1.783203125, "loss/jsd": 0.0, "loss/logits": 0.23264006152749062, "step": 2308 }, { "epoch": 0.105, "grad_norm": 6.1875, "grad_norm_var": 7.971468098958334, "learning_rate": 0.0001, "loss": 6.6432, "loss/crossentropy": 2.6104800701141357, "loss/hidden": 1.732421875, "loss/jsd": 0.0, "loss/logits": 0.23003048077225685, "step": 2310 }, { "epoch": 0.1050909090909091, "grad_norm": 5.625, "grad_norm_var": 8.031766764322917, "learning_rate": 0.0001, "loss": 6.3435, "loss/crossentropy": 2.515109062194824, "loss/hidden": 1.748046875, "loss/jsd": 0.0, "loss/logits": 0.2080327309668064, "step": 2312 }, { "epoch": 0.10518181818181818, "grad_norm": 6.0, "grad_norm_var": 0.5309529622395833, "learning_rate": 0.0001, "loss": 6.4076, "loss/crossentropy": 2.50371652841568, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.2177334800362587, "step": 2314 }, { "epoch": 0.10527272727272727, "grad_norm": 23.625, "grad_norm_var": 18.319364420572917, "learning_rate": 0.0001, "loss": 6.874, "loss/crossentropy": 2.5739458203315735, "loss/hidden": 1.783203125, "loss/jsd": 0.0, "loss/logits": 0.2516830749809742, "step": 2316 }, { "epoch": 0.10536363636363637, "grad_norm": 7.3125, "grad_norm_var": 19.038134765625, "learning_rate": 0.0001, "loss": 6.7286, "loss/crossentropy": 2.4843453764915466, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.24200522527098656, "step": 2318 }, { "epoch": 0.10545454545454545, "grad_norm": 7.09375, "grad_norm_var": 18.757535807291667, "learning_rate": 0.0001, "loss": 6.9164, "loss/crossentropy": 2.7339001297950745, "loss/hidden": 1.751953125, "loss/jsd": 0.0, "loss/logits": 0.24305235594511032, "step": 2320 }, { "epoch": 0.10554545454545454, "grad_norm": 6.28125, "grad_norm_var": 18.984696451822916, "learning_rate": 0.0001, "loss": 6.3392, "loss/crossentropy": 2.4116148352622986, "loss/hidden": 1.767578125, "loss/jsd": 0.0, "loss/logits": 0.2160031534731388, "step": 2322 }, { "epoch": 0.10563636363636364, "grad_norm": 6.78125, "grad_norm_var": 19.153645833333332, "learning_rate": 0.0001, "loss": 6.6964, "loss/crossentropy": 2.656450569629669, "loss/hidden": 1.767578125, "loss/jsd": 0.0, "loss/logits": 0.22723528742790222, "step": 2324 }, { "epoch": 0.10572727272727273, "grad_norm": 7.375, "grad_norm_var": 18.955074055989584, "learning_rate": 0.0001, "loss": 6.7244, "loss/crossentropy": 2.641882300376892, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.22856083884835243, "step": 2326 }, { "epoch": 0.10581818181818183, "grad_norm": 7.03125, "grad_norm_var": 18.640425618489584, "learning_rate": 0.0001, "loss": 6.6449, "loss/crossentropy": 2.585801362991333, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.22700664028525352, "step": 2328 }, { "epoch": 0.10590909090909091, "grad_norm": 6.5625, "grad_norm_var": 18.328316243489585, "learning_rate": 0.0001, "loss": 6.84, "loss/crossentropy": 2.6874037981033325, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.2355700172483921, "step": 2330 }, { "epoch": 0.106, "grad_norm": 7.1875, "grad_norm_var": 1.4430948893229167, "learning_rate": 0.0001, "loss": 6.465, "loss/crossentropy": 2.468120723962784, "loss/hidden": 1.763671875, "loss/jsd": 0.0, "loss/logits": 0.2233189046382904, "step": 2332 }, { "epoch": 0.1060909090909091, "grad_norm": 7.03125, "grad_norm_var": 0.8180338541666666, "learning_rate": 0.0001, "loss": 6.6092, "loss/crossentropy": 2.5729154348373413, "loss/hidden": 1.775390625, "loss/jsd": 0.0, "loss/logits": 0.22608917951583862, "step": 2334 }, { "epoch": 0.10618181818181818, "grad_norm": 7.78125, "grad_norm_var": 0.8157552083333334, "learning_rate": 0.0001, "loss": 6.6808, "loss/crossentropy": 2.5952518582344055, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.2319917008280754, "step": 2336 }, { "epoch": 0.10627272727272727, "grad_norm": 6.03125, "grad_norm_var": 0.8251139322916666, "learning_rate": 0.0001, "loss": 6.1954, "loss/crossentropy": 2.384052574634552, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.20770030841231346, "step": 2338 }, { "epoch": 0.10636363636363637, "grad_norm": 6.5625, "grad_norm_var": 0.8265462239583333, "learning_rate": 0.0001, "loss": 6.7296, "loss/crossentropy": 2.6678497195243835, "loss/hidden": 1.76171875, "loss/jsd": 0.0, "loss/logits": 0.23000411689281464, "step": 2340 }, { "epoch": 0.10645454545454545, "grad_norm": 6.3125, "grad_norm_var": 0.8520670572916667, "learning_rate": 0.0001, "loss": 6.4786, "loss/crossentropy": 2.5180256962776184, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.22261911258101463, "step": 2342 }, { "epoch": 0.10654545454545454, "grad_norm": 6.8125, "grad_norm_var": 0.8540323893229167, "learning_rate": 0.0001, "loss": 6.6608, "loss/crossentropy": 2.5603407621383667, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.2319168671965599, "step": 2344 }, { "epoch": 0.10663636363636364, "grad_norm": 6.09375, "grad_norm_var": 0.8877604166666667, "learning_rate": 0.0001, "loss": 6.3297, "loss/crossentropy": 2.4164589643478394, "loss/hidden": 1.798828125, "loss/jsd": 0.0, "loss/logits": 0.2114396058022976, "step": 2346 }, { "epoch": 0.10672727272727273, "grad_norm": 6.125, "grad_norm_var": 0.91875, "learning_rate": 0.0001, "loss": 6.2578, "loss/crossentropy": 2.3306872844696045, "loss/hidden": 1.763671875, "loss/jsd": 0.0, "loss/logits": 0.21634380519390106, "step": 2348 }, { "epoch": 0.10681818181818181, "grad_norm": 6.6875, "grad_norm_var": 0.35725504557291665, "learning_rate": 0.0001, "loss": 7.0967, "loss/crossentropy": 2.8986279368400574, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.24050820618867874, "step": 2350 }, { "epoch": 0.10690909090909091, "grad_norm": 6.78125, "grad_norm_var": 0.2981770833333333, "learning_rate": 0.0001, "loss": 7.0559, "loss/crossentropy": 2.810436248779297, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.24603186920285225, "step": 2352 }, { "epoch": 0.107, "grad_norm": 6.78125, "grad_norm_var": 0.2699178059895833, "learning_rate": 0.0001, "loss": 6.521, "loss/crossentropy": 2.508292257785797, "loss/hidden": 1.759765625, "loss/jsd": 0.0, "loss/logits": 0.22529098391532898, "step": 2354 }, { "epoch": 0.1070909090909091, "grad_norm": 6.625, "grad_norm_var": 0.26177978515625, "learning_rate": 0.0001, "loss": 6.2275, "loss/crossentropy": 2.4387702345848083, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.2058219127357006, "step": 2356 }, { "epoch": 0.10718181818181818, "grad_norm": 7.71875, "grad_norm_var": 0.2992024739583333, "learning_rate": 0.0001, "loss": 6.9017, "loss/crossentropy": 2.7574825286865234, "loss/hidden": 1.787109375, "loss/jsd": 0.0, "loss/logits": 0.23570888862013817, "step": 2358 }, { "epoch": 0.10727272727272727, "grad_norm": 6.40625, "grad_norm_var": 0.32157796223958335, "learning_rate": 0.0001, "loss": 6.37, "loss/crossentropy": 2.458302915096283, "loss/hidden": 1.744140625, "loss/jsd": 0.0, "loss/logits": 0.21675719320774078, "step": 2360 }, { "epoch": 0.10736363636363637, "grad_norm": 8.125, "grad_norm_var": 0.43052978515625, "learning_rate": 0.0001, "loss": 6.9483, "loss/crossentropy": 2.7472158074378967, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.24002933874726295, "step": 2362 }, { "epoch": 0.10745454545454546, "grad_norm": 6.75, "grad_norm_var": 0.38785400390625, "learning_rate": 0.0001, "loss": 6.6896, "loss/crossentropy": 2.642614960670471, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.22969825938344002, "step": 2364 }, { "epoch": 0.10754545454545454, "grad_norm": 7.5625, "grad_norm_var": 0.29400634765625, "learning_rate": 0.0001, "loss": 6.4178, "loss/crossentropy": 2.4161985516548157, "loss/hidden": 1.861328125, "loss/jsd": 0.0, "loss/logits": 0.2140268050134182, "step": 2366 }, { "epoch": 0.10763636363636364, "grad_norm": 6.4375, "grad_norm_var": 0.3298136393229167, "learning_rate": 0.0001, "loss": 6.0239, "loss/crossentropy": 2.212704300880432, "loss/hidden": 1.767578125, "loss/jsd": 0.0, "loss/logits": 0.20435743406414986, "step": 2368 }, { "epoch": 0.10772727272727273, "grad_norm": 6.625, "grad_norm_var": 0.3419108072916667, "learning_rate": 0.0001, "loss": 6.143, "loss/crossentropy": 2.3108986616134644, "loss/hidden": 1.771484375, "loss/jsd": 0.0, "loss/logits": 0.2060643509030342, "step": 2370 }, { "epoch": 0.10781818181818181, "grad_norm": 7.65625, "grad_norm_var": 0.41209309895833335, "learning_rate": 0.0001, "loss": 6.7205, "loss/crossentropy": 2.57393342256546, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.23301346972584724, "step": 2372 }, { "epoch": 0.10790909090909091, "grad_norm": 7.46875, "grad_norm_var": 0.3888956705729167, "learning_rate": 0.0001, "loss": 6.6332, "loss/crossentropy": 2.5461731553077698, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.2297971546649933, "step": 2374 }, { "epoch": 0.108, "grad_norm": 7.78125, "grad_norm_var": 0.437353515625, "learning_rate": 0.0001, "loss": 7.0331, "loss/crossentropy": 2.774714231491089, "loss/hidden": 1.826171875, "loss/jsd": 0.0, "loss/logits": 0.2432178445160389, "step": 2376 }, { "epoch": 0.10809090909090908, "grad_norm": 6.46875, "grad_norm_var": 0.31365559895833334, "learning_rate": 0.0001, "loss": 6.3444, "loss/crossentropy": 2.401026666164398, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.21543539687991142, "step": 2378 }, { "epoch": 0.10818181818181818, "grad_norm": 7.09375, "grad_norm_var": 0.3439453125, "learning_rate": 0.0001, "loss": 6.7734, "loss/crossentropy": 2.6420504450798035, "loss/hidden": 1.794921875, "loss/jsd": 0.0, "loss/logits": 0.2336462363600731, "step": 2380 }, { "epoch": 0.10827272727272727, "grad_norm": 7.25, "grad_norm_var": 0.310791015625, "learning_rate": 0.0001, "loss": 6.9735, "loss/crossentropy": 2.821815311908722, "loss/hidden": 1.767578125, "loss/jsd": 0.0, "loss/logits": 0.23840929195284843, "step": 2382 }, { "epoch": 0.10836363636363637, "grad_norm": 6.90625, "grad_norm_var": 0.23111572265625, "learning_rate": 0.0001, "loss": 6.7571, "loss/crossentropy": 2.683608114719391, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.22609443962574005, "step": 2384 }, { "epoch": 0.10845454545454546, "grad_norm": 6.40625, "grad_norm_var": 0.23948160807291666, "learning_rate": 0.0001, "loss": 6.1506, "loss/crossentropy": 2.31675922870636, "loss/hidden": 1.775390625, "loss/jsd": 0.0, "loss/logits": 0.2058471329510212, "step": 2386 }, { "epoch": 0.10854545454545454, "grad_norm": 7.0625, "grad_norm_var": 0.21378580729166666, "learning_rate": 0.0001, "loss": 6.7365, "loss/crossentropy": 2.6603968143463135, "loss/hidden": 1.771484375, "loss/jsd": 0.0, "loss/logits": 0.2304641455411911, "step": 2388 }, { "epoch": 0.10863636363636364, "grad_norm": 9.8125, "grad_norm_var": 0.6903645833333333, "learning_rate": 0.0001, "loss": 6.5957, "loss/crossentropy": 2.4994541704654694, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.2291543260216713, "step": 2390 }, { "epoch": 0.10872727272727273, "grad_norm": 7.4375, "grad_norm_var": 0.7127237955729167, "learning_rate": 0.0001, "loss": 6.6283, "loss/crossentropy": 2.6117952466011047, "loss/hidden": 1.779296875, "loss/jsd": 0.0, "loss/logits": 0.22372106462717056, "step": 2392 }, { "epoch": 0.10881818181818181, "grad_norm": 6.59375, "grad_norm_var": 0.7064737955729167, "learning_rate": 0.0001, "loss": 6.656, "loss/crossentropy": 2.6537616848945618, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.22756672650575638, "step": 2394 }, { "epoch": 0.10890909090909091, "grad_norm": 6.0625, "grad_norm_var": 0.7579386393229167, "learning_rate": 0.0001, "loss": 6.6375, "loss/crossentropy": 2.633244037628174, "loss/hidden": 1.775390625, "loss/jsd": 0.0, "loss/logits": 0.22288207709789276, "step": 2396 }, { "epoch": 0.109, "grad_norm": 6.375, "grad_norm_var": 0.7874348958333334, "learning_rate": 0.0001, "loss": 6.8877, "loss/crossentropy": 2.8278791904449463, "loss/hidden": 1.748046875, "loss/jsd": 0.0, "loss/logits": 0.23117608577013016, "step": 2398 }, { "epoch": 0.10909090909090909, "grad_norm": 6.34375, "grad_norm_var": 0.8210245768229166, "learning_rate": 0.0001, "loss": 6.07, "loss/crossentropy": 2.2872989177703857, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.2040541209280491, "step": 2400 }, { "epoch": 0.10918181818181819, "grad_norm": 7.5, "grad_norm_var": 0.7891764322916667, "learning_rate": 0.0001, "loss": 6.522, "loss/crossentropy": 2.588916301727295, "loss/hidden": 1.771484375, "loss/jsd": 0.0, "loss/logits": 0.2161567434668541, "step": 2402 }, { "epoch": 0.10927272727272727, "grad_norm": 7.96875, "grad_norm_var": 0.8206868489583333, "learning_rate": 0.0001, "loss": 6.6483, "loss/crossentropy": 2.5265321731567383, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.23327065631747246, "step": 2404 }, { "epoch": 0.10936363636363636, "grad_norm": 6.0625, "grad_norm_var": 0.39042561848958335, "learning_rate": 0.0001, "loss": 6.0876, "loss/crossentropy": 2.2138105034828186, "loss/hidden": 1.775390625, "loss/jsd": 0.0, "loss/logits": 0.2098434902727604, "step": 2406 }, { "epoch": 0.10945454545454546, "grad_norm": 6.59375, "grad_norm_var": 0.4495930989583333, "learning_rate": 0.0001, "loss": 6.2636, "loss/crossentropy": 2.307199627161026, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.2147839181125164, "step": 2408 }, { "epoch": 0.10954545454545454, "grad_norm": 7.28125, "grad_norm_var": 0.46002197265625, "learning_rate": 0.0001, "loss": 6.7549, "loss/crossentropy": 2.6363006830215454, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.2325584851205349, "step": 2410 }, { "epoch": 0.10963636363636364, "grad_norm": 8.8125, "grad_norm_var": 0.6316243489583333, "learning_rate": 0.0001, "loss": 6.0646, "loss/crossentropy": 2.2430023550987244, "loss/hidden": 1.759765625, "loss/jsd": 0.0, "loss/logits": 0.20618164911866188, "step": 2412 }, { "epoch": 0.10972727272727273, "grad_norm": 5.90625, "grad_norm_var": 0.6942057291666667, "learning_rate": 0.0001, "loss": 6.5986, "loss/crossentropy": 2.6157103776931763, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.22406740486621857, "step": 2414 }, { "epoch": 0.10981818181818181, "grad_norm": 7.0, "grad_norm_var": 0.6419921875, "learning_rate": 0.0001, "loss": 7.0021, "loss/crossentropy": 2.8694024682044983, "loss/hidden": 1.794921875, "loss/jsd": 0.0, "loss/logits": 0.23377934470772743, "step": 2416 }, { "epoch": 0.10990909090909091, "grad_norm": 6.78125, "grad_norm_var": 0.6224609375, "learning_rate": 0.0001, "loss": 6.2676, "loss/crossentropy": 2.3947401642799377, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.21150412037968636, "step": 2418 }, { "epoch": 0.11, "grad_norm": 5.8125, "grad_norm_var": 0.6781087239583333, "learning_rate": 0.0001, "loss": 5.7896, "loss/crossentropy": 2.1457395553588867, "loss/hidden": 1.716796875, "loss/jsd": 0.0, "loss/logits": 0.19270236417651176, "step": 2420 }, { "epoch": 0.11009090909090909, "grad_norm": 6.25, "grad_norm_var": 0.7445963541666667, "learning_rate": 0.0001, "loss": 5.8853, "loss/crossentropy": 2.253238558769226, "loss/hidden": 1.716796875, "loss/jsd": 0.0, "loss/logits": 0.1915307492017746, "step": 2422 }, { "epoch": 0.11018181818181819, "grad_norm": 6.46875, "grad_norm_var": 0.6175618489583333, "learning_rate": 0.0001, "loss": 6.4978, "loss/crossentropy": 2.4759638905525208, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.2193702720105648, "step": 2424 }, { "epoch": 0.11027272727272727, "grad_norm": 6.28125, "grad_norm_var": 0.6026652018229167, "learning_rate": 0.0001, "loss": 6.5964, "loss/crossentropy": 2.6187002062797546, "loss/hidden": 1.763671875, "loss/jsd": 0.0, "loss/logits": 0.2214038074016571, "step": 2426 }, { "epoch": 0.11036363636363636, "grad_norm": 7.21875, "grad_norm_var": 0.31769205729166666, "learning_rate": 0.0001, "loss": 6.6542, "loss/crossentropy": 2.6415520906448364, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.2223547287285328, "step": 2428 }, { "epoch": 0.11045454545454546, "grad_norm": 6.84375, "grad_norm_var": 0.2903645833333333, "learning_rate": 0.0001, "loss": 6.8786, "loss/crossentropy": 2.73384165763855, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.2390863336622715, "step": 2430 }, { "epoch": 0.11054545454545454, "grad_norm": 6.40625, "grad_norm_var": 0.28522135416666666, "learning_rate": 0.0001, "loss": 6.5863, "loss/crossentropy": 2.6261104941368103, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.22180498391389847, "step": 2432 }, { "epoch": 0.11063636363636363, "grad_norm": 5.875, "grad_norm_var": 0.30500895182291665, "learning_rate": 0.0001, "loss": 5.9394, "loss/crossentropy": 2.191485285758972, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.200958963483572, "step": 2434 }, { "epoch": 0.11072727272727273, "grad_norm": 6.90625, "grad_norm_var": 0.29495035807291664, "learning_rate": 0.0001, "loss": 6.2725, "loss/crossentropy": 2.340693771839142, "loss/hidden": 1.759765625, "loss/jsd": 0.0, "loss/logits": 0.21720218285918236, "step": 2436 }, { "epoch": 0.11081818181818182, "grad_norm": 6.28125, "grad_norm_var": 0.251806640625, "learning_rate": 0.0001, "loss": 6.5147, "loss/crossentropy": 2.549993574619293, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.21990703791379929, "step": 2438 }, { "epoch": 0.11090909090909092, "grad_norm": 7.90625, "grad_norm_var": 0.3089680989583333, "learning_rate": 0.0001, "loss": 6.6393, "loss/crossentropy": 2.532857298851013, "loss/hidden": 1.771484375, "loss/jsd": 0.0, "loss/logits": 0.23349517583847046, "step": 2440 }, { "epoch": 0.111, "grad_norm": 6.40625, "grad_norm_var": 0.2945149739583333, "learning_rate": 0.0001, "loss": 6.1538, "loss/crossentropy": 2.2528491020202637, "loss/hidden": 1.755859375, "loss/jsd": 0.0, "loss/logits": 0.2145116776227951, "step": 2442 }, { "epoch": 0.11109090909090909, "grad_norm": 7.5625, "grad_norm_var": 0.3283487955729167, "learning_rate": 0.0001, "loss": 6.4613, "loss/crossentropy": 2.559779107570648, "loss/hidden": 1.759765625, "loss/jsd": 0.0, "loss/logits": 0.21417205035686493, "step": 2444 }, { "epoch": 0.11118181818181819, "grad_norm": 10.75, "grad_norm_var": 1.3200520833333333, "learning_rate": 0.0001, "loss": 6.7036, "loss/crossentropy": 2.6314034461975098, "loss/hidden": 1.771484375, "loss/jsd": 0.0, "loss/logits": 0.230075191706419, "step": 2446 }, { "epoch": 0.11127272727272727, "grad_norm": 6.3125, "grad_norm_var": 1.3111612955729166, "learning_rate": 0.0001, "loss": 6.8237, "loss/crossentropy": 2.7418288588523865, "loss/hidden": 1.76171875, "loss/jsd": 0.0, "loss/logits": 0.2320193387567997, "step": 2448 }, { "epoch": 0.11136363636363636, "grad_norm": 7.40625, "grad_norm_var": 1.2512858072916666, "learning_rate": 0.0001, "loss": 6.6093, "loss/crossentropy": 2.5658533573150635, "loss/hidden": 1.787109375, "loss/jsd": 0.0, "loss/logits": 0.22562891989946365, "step": 2450 }, { "epoch": 0.11145454545454546, "grad_norm": 6.96875, "grad_norm_var": 1.181494140625, "learning_rate": 0.0001, "loss": 7.2208, "loss/crossentropy": 2.995599329471588, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.24557049944996834, "step": 2452 }, { "epoch": 0.11154545454545454, "grad_norm": 7.0625, "grad_norm_var": 1.142041015625, "learning_rate": 0.0001, "loss": 6.3917, "loss/crossentropy": 2.4208266139030457, "loss/hidden": 1.763671875, "loss/jsd": 0.0, "loss/logits": 0.22071952372789383, "step": 2454 }, { "epoch": 0.11163636363636363, "grad_norm": 6.34375, "grad_norm_var": 1.630078125, "learning_rate": 0.0001, "loss": 6.4965, "loss/crossentropy": 2.5142208337783813, "loss/hidden": 1.794921875, "loss/jsd": 0.0, "loss/logits": 0.21873761340975761, "step": 2456 }, { "epoch": 0.11172727272727273, "grad_norm": 7.0, "grad_norm_var": 1.5637858072916666, "learning_rate": 0.0001, "loss": 6.8481, "loss/crossentropy": 2.717682898044586, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.2356971986591816, "step": 2458 }, { "epoch": 0.11181818181818182, "grad_norm": 6.5, "grad_norm_var": 1.55269775390625, "learning_rate": 0.0001, "loss": 6.6887, "loss/crossentropy": 2.6149967312812805, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.23080312460660934, "step": 2460 }, { "epoch": 0.1119090909090909, "grad_norm": 6.3125, "grad_norm_var": 0.7403279622395833, "learning_rate": 0.0001, "loss": 6.1939, "loss/crossentropy": 2.306995987892151, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.2140858918428421, "step": 2462 }, { "epoch": 0.112, "grad_norm": 6.375, "grad_norm_var": 0.7555826822916667, "learning_rate": 0.0001, "loss": 6.2512, "loss/crossentropy": 2.471362829208374, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.204546507447958, "step": 2464 }, { "epoch": 0.11209090909090909, "grad_norm": 7.59375, "grad_norm_var": 0.7738118489583333, "learning_rate": 0.0001, "loss": 5.9184, "loss/crossentropy": 2.1482625007629395, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.19732723012566566, "step": 2466 }, { "epoch": 0.11218181818181819, "grad_norm": 6.75, "grad_norm_var": 0.75494384765625, "learning_rate": 0.0001, "loss": 6.3883, "loss/crossentropy": 2.4170810878276825, "loss/hidden": 1.783203125, "loss/jsd": 0.0, "loss/logits": 0.2187996692955494, "step": 2468 }, { "epoch": 0.11227272727272727, "grad_norm": 7.75, "grad_norm_var": 565.0273396809896, "learning_rate": 0.0001, "loss": 7.6553, "loss/crossentropy": 2.4276530742645264, "loss/hidden": 2.0234375, "loss/jsd": 0.0, "loss/logits": 0.32041651383042336, "step": 2470 }, { "epoch": 0.11236363636363636, "grad_norm": 10.5, "grad_norm_var": 564.449462890625, "learning_rate": 0.0001, "loss": 6.5901, "loss/crossentropy": 2.514492928981781, "loss/hidden": 1.837890625, "loss/jsd": 0.0, "loss/logits": 0.2237676866352558, "step": 2472 }, { "epoch": 0.11245454545454546, "grad_norm": 6.5625, "grad_norm_var": 565.3046834309896, "learning_rate": 0.0001, "loss": 6.1255, "loss/crossentropy": 2.2130234837532043, "loss/hidden": 1.818359375, "loss/jsd": 0.0, "loss/logits": 0.20940903574228287, "step": 2474 }, { "epoch": 0.11254545454545455, "grad_norm": 6.25, "grad_norm_var": 566.427197265625, "learning_rate": 0.0001, "loss": 5.9036, "loss/crossentropy": 2.2081708908081055, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.19649718329310417, "step": 2476 }, { "epoch": 0.11263636363636363, "grad_norm": 7.03125, "grad_norm_var": 565.501806640625, "learning_rate": 0.0001, "loss": 6.5641, "loss/crossentropy": 2.513413190841675, "loss/hidden": 1.810546875, "loss/jsd": 0.0, "loss/logits": 0.22401876375079155, "step": 2478 }, { "epoch": 0.11272727272727273, "grad_norm": 8.0625, "grad_norm_var": 563.6607055664062, "learning_rate": 0.0001, "loss": 6.6141, "loss/crossentropy": 2.5385472774505615, "loss/hidden": 1.775390625, "loss/jsd": 0.0, "loss/logits": 0.23001927509903908, "step": 2480 }, { "epoch": 0.11281818181818182, "grad_norm": 6.625, "grad_norm_var": 564.1792928059896, "learning_rate": 0.0001, "loss": 7.04, "loss/crossentropy": 2.9088745713233948, "loss/hidden": 1.755859375, "loss/jsd": 0.0, "loss/logits": 0.23752764612436295, "step": 2482 }, { "epoch": 0.1129090909090909, "grad_norm": 6.9375, "grad_norm_var": 564.3474568684895, "learning_rate": 0.0001, "loss": 6.3944, "loss/crossentropy": 2.574869215488434, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.20968597009778023, "step": 2484 }, { "epoch": 0.113, "grad_norm": 7.125, "grad_norm_var": 1.11353759765625, "learning_rate": 0.0001, "loss": 6.4396, "loss/crossentropy": 2.3805389404296875, "loss/hidden": 1.787109375, "loss/jsd": 0.0, "loss/logits": 0.22719699516892433, "step": 2486 }, { "epoch": 0.11309090909090909, "grad_norm": 6.0, "grad_norm_var": 0.3297526041666667, "learning_rate": 0.0001, "loss": 6.595, "loss/crossentropy": 2.6121524572372437, "loss/hidden": 1.767578125, "loss/jsd": 0.0, "loss/logits": 0.22152896970510483, "step": 2488 }, { "epoch": 0.11318181818181818, "grad_norm": 6.78125, "grad_norm_var": 0.32307535807291665, "learning_rate": 0.0001, "loss": 6.5105, "loss/crossentropy": 2.5186230540275574, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.21910609304904938, "step": 2490 }, { "epoch": 0.11327272727272727, "grad_norm": 6.625, "grad_norm_var": 0.26875, "learning_rate": 0.0001, "loss": 6.4964, "loss/crossentropy": 2.443294942378998, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.22601742669939995, "step": 2492 }, { "epoch": 0.11336363636363636, "grad_norm": 6.8125, "grad_norm_var": 0.3055338541666667, "learning_rate": 0.0001, "loss": 6.1518, "loss/crossentropy": 2.2836464643478394, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.21220244094729424, "step": 2494 }, { "epoch": 0.11345454545454546, "grad_norm": 7.6875, "grad_norm_var": 0.80308837890625, "learning_rate": 0.0001, "loss": 7.0239, "loss/crossentropy": 2.728638172149658, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.25061947479844093, "step": 2496 }, { "epoch": 0.11354545454545455, "grad_norm": 6.59375, "grad_norm_var": 0.87047119140625, "learning_rate": 0.0001, "loss": 6.4866, "loss/crossentropy": 2.5901816487312317, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.21542534604668617, "step": 2498 }, { "epoch": 0.11363636363636363, "grad_norm": 6.375, "grad_norm_var": 0.8845987955729167, "learning_rate": 0.0001, "loss": 6.243, "loss/crossentropy": 2.3652487993240356, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.21394584700465202, "step": 2500 }, { "epoch": 0.11372727272727273, "grad_norm": 6.25, "grad_norm_var": 0.88521728515625, "learning_rate": 0.0001, "loss": 6.4213, "loss/crossentropy": 2.5122419595718384, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.21825385838747025, "step": 2502 }, { "epoch": 0.11381818181818182, "grad_norm": 6.71875, "grad_norm_var": 0.8425618489583333, "learning_rate": 0.0001, "loss": 6.6692, "loss/crossentropy": 2.632053792476654, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.22519776597619057, "step": 2504 }, { "epoch": 0.1139090909090909, "grad_norm": 6.4375, "grad_norm_var": 0.84609375, "learning_rate": 0.0001, "loss": 6.2444, "loss/crossentropy": 2.413341760635376, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.2100612185895443, "step": 2506 }, { "epoch": 0.114, "grad_norm": 6.78125, "grad_norm_var": 0.8447916666666667, "learning_rate": 0.0001, "loss": 6.4317, "loss/crossentropy": 2.4959405064582825, "loss/hidden": 1.775390625, "loss/jsd": 0.0, "loss/logits": 0.21603361889719963, "step": 2508 }, { "epoch": 0.11409090909090909, "grad_norm": 7.375, "grad_norm_var": 0.8343709309895834, "learning_rate": 0.0001, "loss": 6.5798, "loss/crossentropy": 2.5516855716705322, "loss/hidden": 1.763671875, "loss/jsd": 0.0, "loss/logits": 0.2264426313340664, "step": 2510 }, { "epoch": 0.11418181818181818, "grad_norm": 6.34375, "grad_norm_var": 0.16848551432291667, "learning_rate": 0.0001, "loss": 6.4443, "loss/crossentropy": 2.584889590740204, "loss/hidden": 1.728515625, "loss/jsd": 0.0, "loss/logits": 0.21309100836515427, "step": 2512 }, { "epoch": 0.11427272727272728, "grad_norm": 6.25, "grad_norm_var": 0.15558268229166666, "learning_rate": 0.0001, "loss": 6.4088, "loss/crossentropy": 2.5074948370456696, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.2155197449028492, "step": 2514 }, { "epoch": 0.11436363636363636, "grad_norm": 7.09375, "grad_norm_var": 1.3237263997395834, "learning_rate": 0.0001, "loss": 6.9112, "loss/crossentropy": 2.604119837284088, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.25375060364603996, "step": 2516 }, { "epoch": 0.11445454545454545, "grad_norm": 6.5, "grad_norm_var": 1.2942708333333333, "learning_rate": 0.0001, "loss": 6.0528, "loss/crossentropy": 2.2617377638816833, "loss/hidden": 1.736328125, "loss/jsd": 0.0, "loss/logits": 0.20547639578580856, "step": 2518 }, { "epoch": 0.11454545454545455, "grad_norm": 6.4375, "grad_norm_var": 1.3078084309895834, "learning_rate": 0.0001, "loss": 6.5979, "loss/crossentropy": 2.565395414829254, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.22941900044679642, "step": 2520 }, { "epoch": 0.11463636363636363, "grad_norm": 6.5625, "grad_norm_var": 1.3006795247395833, "learning_rate": 0.0001, "loss": 6.4554, "loss/crossentropy": 2.5820701718330383, "loss/hidden": 1.751953125, "loss/jsd": 0.0, "loss/logits": 0.21213679015636444, "step": 2522 }, { "epoch": 0.11472727272727273, "grad_norm": 6.4375, "grad_norm_var": 1.3140909830729166, "learning_rate": 0.0001, "loss": 6.553, "loss/crossentropy": 2.572339713573456, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.22462748363614082, "step": 2524 }, { "epoch": 0.11481818181818182, "grad_norm": 7.28125, "grad_norm_var": 1.30709228515625, "learning_rate": 0.0001, "loss": 6.5524, "loss/crossentropy": 2.562723398208618, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.22240900248289108, "step": 2526 }, { "epoch": 0.1149090909090909, "grad_norm": 7.09375, "grad_norm_var": 1.2948567708333334, "learning_rate": 0.0001, "loss": 6.8908, "loss/crossentropy": 2.83914315700531, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.23017014935612679, "step": 2528 }, { "epoch": 0.115, "grad_norm": 6.09375, "grad_norm_var": 1.314306640625, "learning_rate": 0.0001, "loss": 6.1321, "loss/crossentropy": 2.349180579185486, "loss/hidden": 1.748046875, "loss/jsd": 0.0, "loss/logits": 0.2034899704158306, "step": 2530 }, { "epoch": 0.11509090909090909, "grad_norm": 8.3125, "grad_norm_var": 0.29146728515625, "learning_rate": 0.0001, "loss": 6.9759, "loss/crossentropy": 2.7543601393699646, "loss/hidden": 1.810546875, "loss/jsd": 0.0, "loss/logits": 0.24109667167067528, "step": 2532 }, { "epoch": 0.11518181818181818, "grad_norm": 7.59375, "grad_norm_var": 0.30331624348958336, "learning_rate": 0.0001, "loss": 6.283, "loss/crossentropy": 2.3895630836486816, "loss/hidden": 1.775390625, "loss/jsd": 0.0, "loss/logits": 0.21180372685194016, "step": 2534 }, { "epoch": 0.11527272727272728, "grad_norm": 6.53125, "grad_norm_var": 0.3224894205729167, "learning_rate": 0.0001, "loss": 6.5639, "loss/crossentropy": 2.575703263282776, "loss/hidden": 1.751953125, "loss/jsd": 0.0, "loss/logits": 0.22362713888287544, "step": 2536 }, { "epoch": 0.11536363636363636, "grad_norm": 7.0, "grad_norm_var": 0.32919514973958336, "learning_rate": 0.0001, "loss": 6.5083, "loss/crossentropy": 2.566877007484436, "loss/hidden": 1.736328125, "loss/jsd": 0.0, "loss/logits": 0.220506701618433, "step": 2538 }, { "epoch": 0.11545454545454545, "grad_norm": 6.8125, "grad_norm_var": 0.32303059895833336, "learning_rate": 0.0001, "loss": 6.8432, "loss/crossentropy": 2.7093961238861084, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.23486962169408798, "step": 2540 }, { "epoch": 0.11554545454545455, "grad_norm": 6.21875, "grad_norm_var": 0.3236612955729167, "learning_rate": 0.0001, "loss": 6.536, "loss/crossentropy": 2.546757400035858, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.22587992995977402, "step": 2542 }, { "epoch": 0.11563636363636363, "grad_norm": 7.375, "grad_norm_var": 0.3422159830729167, "learning_rate": 0.0001, "loss": 6.6864, "loss/crossentropy": 2.719393789768219, "loss/hidden": 1.732421875, "loss/jsd": 0.0, "loss/logits": 0.22345400601625443, "step": 2544 }, { "epoch": 0.11572727272727273, "grad_norm": 7.84375, "grad_norm_var": 0.36881103515625, "learning_rate": 0.0001, "loss": 6.7115, "loss/crossentropy": 2.6691285371780396, "loss/hidden": 1.783203125, "loss/jsd": 0.0, "loss/logits": 0.22591513767838478, "step": 2546 }, { "epoch": 0.11581818181818182, "grad_norm": 6.625, "grad_norm_var": 0.25104166666666666, "learning_rate": 0.0001, "loss": 6.1085, "loss/crossentropy": 2.2659255266189575, "loss/hidden": 1.728515625, "loss/jsd": 0.0, "loss/logits": 0.2114081121981144, "step": 2548 }, { "epoch": 0.1159090909090909, "grad_norm": 6.34375, "grad_norm_var": 0.26666666666666666, "learning_rate": 0.0001, "loss": 6.4806, "loss/crossentropy": 2.4979478120803833, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.22053129225969315, "step": 2550 }, { "epoch": 0.116, "grad_norm": 6.28125, "grad_norm_var": 0.32825113932291666, "learning_rate": 0.0001, "loss": 6.5554, "loss/crossentropy": 2.530072510242462, "loss/hidden": 1.759765625, "loss/jsd": 0.0, "loss/logits": 0.22655491158366203, "step": 2552 }, { "epoch": 0.11609090909090909, "grad_norm": 6.21875, "grad_norm_var": 0.385791015625, "learning_rate": 0.0001, "loss": 6.3623, "loss/crossentropy": 2.464530885219574, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.21594997867941856, "step": 2554 }, { "epoch": 0.11618181818181818, "grad_norm": 7.0, "grad_norm_var": 0.3863118489583333, "learning_rate": 0.0001, "loss": 6.5658, "loss/crossentropy": 2.510302782058716, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.22859620302915573, "step": 2556 }, { "epoch": 0.11627272727272728, "grad_norm": 6.625, "grad_norm_var": 0.36636962890625, "learning_rate": 0.0001, "loss": 6.5713, "loss/crossentropy": 2.6154470443725586, "loss/hidden": 1.755859375, "loss/jsd": 0.0, "loss/logits": 0.21999704092741013, "step": 2558 }, { "epoch": 0.11636363636363636, "grad_norm": 6.9375, "grad_norm_var": 0.33528238932291665, "learning_rate": 0.0001, "loss": 6.3061, "loss/crossentropy": 2.4088079929351807, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.21394739672541618, "step": 2560 }, { "epoch": 0.11645454545454545, "grad_norm": 6.71875, "grad_norm_var": 0.25832926432291664, "learning_rate": 0.0001, "loss": 6.5392, "loss/crossentropy": 2.5944758653640747, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.2186911143362522, "step": 2562 }, { "epoch": 0.11654545454545455, "grad_norm": 6.09375, "grad_norm_var": 0.27812093098958335, "learning_rate": 0.0001, "loss": 6.2887, "loss/crossentropy": 2.420532763004303, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.2145509123802185, "step": 2564 }, { "epoch": 0.11663636363636364, "grad_norm": 6.46875, "grad_norm_var": 0.20753580729166668, "learning_rate": 0.0001, "loss": 6.6933, "loss/crossentropy": 2.645323157310486, "loss/hidden": 1.744140625, "loss/jsd": 0.0, "loss/logits": 0.2303863950073719, "step": 2566 }, { "epoch": 0.11672727272727272, "grad_norm": 6.0625, "grad_norm_var": 0.10428059895833333, "learning_rate": 0.0001, "loss": 6.6093, "loss/crossentropy": 2.670214354991913, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.22008198127150536, "step": 2568 }, { "epoch": 0.11681818181818182, "grad_norm": 6.46875, "grad_norm_var": 0.078125, "learning_rate": 0.0001, "loss": 6.6841, "loss/crossentropy": 2.6642268896102905, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.22776445373892784, "step": 2570 }, { "epoch": 0.11690909090909091, "grad_norm": 6.1875, "grad_norm_var": 0.07779947916666667, "learning_rate": 0.0001, "loss": 6.2733, "loss/crossentropy": 2.4844674468040466, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.20544597879052162, "step": 2572 }, { "epoch": 0.117, "grad_norm": 6.46875, "grad_norm_var": 0.08472900390625, "learning_rate": 0.0001, "loss": 6.0647, "loss/crossentropy": 2.2637850642204285, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.20626651123166084, "step": 2574 }, { "epoch": 0.11709090909090909, "grad_norm": 6.59375, "grad_norm_var": 0.11168212890625, "learning_rate": 0.0001, "loss": 6.4174, "loss/crossentropy": 2.448845624923706, "loss/hidden": 1.779296875, "loss/jsd": 0.0, "loss/logits": 0.21892286464571953, "step": 2576 }, { "epoch": 0.11718181818181818, "grad_norm": 6.375, "grad_norm_var": 0.099072265625, "learning_rate": 0.0001, "loss": 6.2248, "loss/crossentropy": 2.3379208147525787, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.2140834741294384, "step": 2578 }, { "epoch": 0.11727272727272728, "grad_norm": 6.78125, "grad_norm_var": 0.09534098307291666, "learning_rate": 0.0001, "loss": 6.1221, "loss/crossentropy": 2.328279495239258, "loss/hidden": 1.744140625, "loss/jsd": 0.0, "loss/logits": 0.2049710750579834, "step": 2580 }, { "epoch": 0.11736363636363636, "grad_norm": 7.375, "grad_norm_var": 0.16246337890625, "learning_rate": 0.0001, "loss": 6.5345, "loss/crossentropy": 2.5897342562675476, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.21947617828845978, "step": 2582 }, { "epoch": 0.11745454545454545, "grad_norm": 10.125, "grad_norm_var": 0.95, "learning_rate": 0.0001, "loss": 6.7823, "loss/crossentropy": 2.6896771788597107, "loss/hidden": 1.771484375, "loss/jsd": 0.0, "loss/logits": 0.23211681097745895, "step": 2584 }, { "epoch": 0.11754545454545455, "grad_norm": 6.4375, "grad_norm_var": 0.95357666015625, "learning_rate": 0.0001, "loss": 6.2237, "loss/crossentropy": 2.329060196876526, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.2140689603984356, "step": 2586 }, { "epoch": 0.11763636363636364, "grad_norm": 6.5, "grad_norm_var": 0.9254842122395833, "learning_rate": 0.0001, "loss": 6.2894, "loss/crossentropy": 2.410509705543518, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.21015429869294167, "step": 2588 }, { "epoch": 0.11772727272727272, "grad_norm": 6.875, "grad_norm_var": 0.8897420247395833, "learning_rate": 0.0001, "loss": 6.2491, "loss/crossentropy": 2.41566401720047, "loss/hidden": 1.724609375, "loss/jsd": 0.0, "loss/logits": 0.2108834683895111, "step": 2590 }, { "epoch": 0.11781818181818182, "grad_norm": 7.125, "grad_norm_var": 0.8917277018229167, "learning_rate": 0.0001, "loss": 6.394, "loss/crossentropy": 2.46917861700058, "loss/hidden": 1.76171875, "loss/jsd": 0.0, "loss/logits": 0.21631211787462234, "step": 2592 }, { "epoch": 0.11790909090909091, "grad_norm": 6.3125, "grad_norm_var": 0.8968098958333334, "learning_rate": 0.0001, "loss": 6.5625, "loss/crossentropy": 2.6358630657196045, "loss/hidden": 1.728515625, "loss/jsd": 0.0, "loss/logits": 0.2198099121451378, "step": 2594 }, { "epoch": 0.118, "grad_norm": 5.875, "grad_norm_var": 0.9799763997395833, "learning_rate": 0.0001, "loss": 6.2125, "loss/crossentropy": 2.4763870239257812, "loss/hidden": 1.708984375, "loss/jsd": 0.0, "loss/logits": 0.20271041244268417, "step": 2596 }, { "epoch": 0.1180909090909091, "grad_norm": 5.96875, "grad_norm_var": 0.9611979166666667, "learning_rate": 0.0001, "loss": 6.4037, "loss/crossentropy": 2.5260677337646484, "loss/hidden": 1.740234375, "loss/jsd": 0.0, "loss/logits": 0.2137417159974575, "step": 2598 }, { "epoch": 0.11818181818181818, "grad_norm": 6.65625, "grad_norm_var": 0.10909830729166667, "learning_rate": 0.0001, "loss": 6.5005, "loss/crossentropy": 2.484919935464859, "loss/hidden": 1.822265625, "loss/jsd": 0.0, "loss/logits": 0.21933411434292793, "step": 2600 }, { "epoch": 0.11827272727272728, "grad_norm": 6.03125, "grad_norm_var": 0.14251302083333334, "learning_rate": 0.0001, "loss": 6.6086, "loss/crossentropy": 2.6665404438972473, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.22311491146683693, "step": 2602 }, { "epoch": 0.11836363636363637, "grad_norm": 7.125, "grad_norm_var": 0.19856770833333334, "learning_rate": 0.0001, "loss": 6.1944, "loss/crossentropy": 2.31499707698822, "loss/hidden": 1.771484375, "loss/jsd": 0.0, "loss/logits": 0.21079124510288239, "step": 2604 }, { "epoch": 0.11845454545454545, "grad_norm": 6.4375, "grad_norm_var": 0.19547119140625, "learning_rate": 0.0001, "loss": 6.2324, "loss/crossentropy": 2.4041150212287903, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.2113424800336361, "step": 2606 }, { "epoch": 0.11854545454545455, "grad_norm": 6.75, "grad_norm_var": 0.20441080729166666, "learning_rate": 0.0001, "loss": 5.896, "loss/crossentropy": 2.171609044075012, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.19548209011554718, "step": 2608 }, { "epoch": 0.11863636363636364, "grad_norm": 19.125, "grad_norm_var": 10.29312744140625, "learning_rate": 0.0001, "loss": 6.5371, "loss/crossentropy": 2.4281857013702393, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.22925247624516487, "step": 2610 }, { "epoch": 0.11872727272727272, "grad_norm": 7.03125, "grad_norm_var": 10.033186848958334, "learning_rate": 0.0001, "loss": 6.3355, "loss/crossentropy": 2.4132575690746307, "loss/hidden": 1.884765625, "loss/jsd": 0.0, "loss/logits": 0.20374682918190956, "step": 2612 }, { "epoch": 0.11881818181818182, "grad_norm": 6.78125, "grad_norm_var": 9.93170166015625, "learning_rate": 0.0001, "loss": 6.1034, "loss/crossentropy": 2.242507964372635, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.2079687863588333, "step": 2614 }, { "epoch": 0.11890909090909091, "grad_norm": 5.6875, "grad_norm_var": 10.116044108072916, "learning_rate": 0.0001, "loss": 5.8453, "loss/crossentropy": 2.155485063791275, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.19359133392572403, "step": 2616 }, { "epoch": 0.119, "grad_norm": 7.0, "grad_norm_var": 10.053902180989583, "learning_rate": 0.0001, "loss": 6.5734, "loss/crossentropy": 2.5493127703666687, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.22663185372948647, "step": 2618 }, { "epoch": 0.1190909090909091, "grad_norm": 6.71875, "grad_norm_var": 9.912239583333333, "learning_rate": 0.0001, "loss": 6.4792, "loss/crossentropy": 2.512988030910492, "loss/hidden": 1.748046875, "loss/jsd": 0.0, "loss/logits": 0.22181367129087448, "step": 2620 }, { "epoch": 0.11918181818181818, "grad_norm": 10.125, "grad_norm_var": 10.117171223958334, "learning_rate": 0.0001, "loss": 6.2286, "loss/crossentropy": 2.3042044043540955, "loss/hidden": 1.806640625, "loss/jsd": 0.0, "loss/logits": 0.21177547052502632, "step": 2622 }, { "epoch": 0.11927272727272727, "grad_norm": 7.40625, "grad_norm_var": 10.604781087239584, "learning_rate": 0.0001, "loss": 6.3032, "loss/crossentropy": 2.2941298484802246, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.22161490842700005, "step": 2624 }, { "epoch": 0.11936363636363637, "grad_norm": 6.75, "grad_norm_var": 1.8525390625, "learning_rate": 0.0001, "loss": 6.6602, "loss/crossentropy": 2.6576865315437317, "loss/hidden": 1.759765625, "loss/jsd": 0.0, "loss/logits": 0.22427790984511375, "step": 2626 }, { "epoch": 0.11945454545454545, "grad_norm": 6.21875, "grad_norm_var": 1.93515625, "learning_rate": 0.0001, "loss": 6.0481, "loss/crossentropy": 2.2764945328235626, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.20450608804821968, "step": 2628 }, { "epoch": 0.11954545454545455, "grad_norm": 6.4375, "grad_norm_var": 1.9147135416666667, "learning_rate": 0.0001, "loss": 6.8788, "loss/crossentropy": 2.7071139812469482, "loss/hidden": 1.767578125, "loss/jsd": 0.0, "loss/logits": 0.24040622264146805, "step": 2630 }, { "epoch": 0.11963636363636364, "grad_norm": 11.1875, "grad_norm_var": 2.65728759765625, "learning_rate": 0.0001, "loss": 6.3271, "loss/crossentropy": 2.4413177371025085, "loss/hidden": 1.751953125, "loss/jsd": 0.0, "loss/logits": 0.21338729560375214, "step": 2632 }, { "epoch": 0.11972727272727272, "grad_norm": 6.59375, "grad_norm_var": 2.697119140625, "learning_rate": 0.0001, "loss": 6.7835, "loss/crossentropy": 2.6644367575645447, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.236122515052557, "step": 2634 }, { "epoch": 0.11981818181818182, "grad_norm": 7.03125, "grad_norm_var": 2.6698567708333334, "learning_rate": 0.0001, "loss": 6.5316, "loss/crossentropy": 2.6052330136299133, "loss/hidden": 1.771484375, "loss/jsd": 0.0, "loss/logits": 0.21548626199364662, "step": 2636 }, { "epoch": 0.11990909090909091, "grad_norm": 6.15625, "grad_norm_var": 2.2773396809895834, "learning_rate": 0.0001, "loss": 6.7246, "loss/crossentropy": 2.6357845067977905, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.23192665353417397, "step": 2638 }, { "epoch": 0.12, "grad_norm": 7.53125, "grad_norm_var": 1.4546183268229167, "learning_rate": 0.0001, "loss": 6.6335, "loss/crossentropy": 2.676109552383423, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.2203461416065693, "step": 2640 }, { "epoch": 0.1200909090909091, "grad_norm": 7.28125, "grad_norm_var": 1.47008056640625, "learning_rate": 0.0001, "loss": 6.5342, "loss/crossentropy": 2.5392077565193176, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.22528530284762383, "step": 2642 }, { "epoch": 0.12018181818181818, "grad_norm": 6.65625, "grad_norm_var": 1.47271728515625, "learning_rate": 0.0001, "loss": 6.1995, "loss/crossentropy": 2.3487519025802612, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.21085338294506073, "step": 2644 }, { "epoch": 0.12027272727272727, "grad_norm": 6.5625, "grad_norm_var": 1.5212849934895833, "learning_rate": 0.0001, "loss": 6.5963, "loss/crossentropy": 2.6368995308876038, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.22133442759513855, "step": 2646 }, { "epoch": 0.12036363636363637, "grad_norm": 6.40625, "grad_norm_var": 0.26011962890625, "learning_rate": 0.0001, "loss": 6.7612, "loss/crossentropy": 2.733081638813019, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.2254696674644947, "step": 2648 }, { "epoch": 0.12045454545454545, "grad_norm": 6.9375, "grad_norm_var": 0.2619099934895833, "learning_rate": 0.0001, "loss": 6.8376, "loss/crossentropy": 2.7540119886398315, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.2314092330634594, "step": 2650 }, { "epoch": 0.12054545454545454, "grad_norm": 7.25, "grad_norm_var": 0.32457275390625, "learning_rate": 0.0001, "loss": 6.5921, "loss/crossentropy": 2.6549574732780457, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.21949981153011322, "step": 2652 }, { "epoch": 0.12063636363636364, "grad_norm": 5.84375, "grad_norm_var": 0.5301920572916666, "learning_rate": 0.0001, "loss": 6.5879, "loss/crossentropy": 2.604270577430725, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.22336705401539803, "step": 2654 }, { "epoch": 0.12072727272727272, "grad_norm": 7.03125, "grad_norm_var": 13.866015625, "learning_rate": 0.0001, "loss": 6.503, "loss/crossentropy": 2.5181546211242676, "loss/hidden": 1.775390625, "loss/jsd": 0.0, "loss/logits": 0.22094090282917023, "step": 2656 }, { "epoch": 0.12081818181818182, "grad_norm": 6.46875, "grad_norm_var": 14.098551432291666, "learning_rate": 0.0001, "loss": 6.5268, "loss/crossentropy": 2.6139676570892334, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.21393496170639992, "step": 2658 }, { "epoch": 0.12090909090909091, "grad_norm": 6.1875, "grad_norm_var": 14.113667805989584, "learning_rate": 0.0001, "loss": 5.9479, "loss/crossentropy": 2.2899185121059418, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.19391950964927673, "step": 2660 }, { "epoch": 0.121, "grad_norm": 7.3125, "grad_norm_var": 13.935807291666666, "learning_rate": 0.0001, "loss": 6.6967, "loss/crossentropy": 2.607537031173706, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.23195886239409447, "step": 2662 }, { "epoch": 0.1210909090909091, "grad_norm": 12.875, "grad_norm_var": 15.580399576822916, "learning_rate": 0.0001, "loss": 6.2798, "loss/crossentropy": 2.3522263169288635, "loss/hidden": 1.759765625, "loss/jsd": 0.0, "loss/logits": 0.21677620336413383, "step": 2664 }, { "epoch": 0.12118181818181818, "grad_norm": 6.34375, "grad_norm_var": 15.649723307291667, "learning_rate": 0.0001, "loss": 6.1116, "loss/crossentropy": 2.365808218717575, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.20153110846877098, "step": 2666 }, { "epoch": 0.12127272727272727, "grad_norm": 5.90625, "grad_norm_var": 15.65435791015625, "learning_rate": 0.0001, "loss": 6.0982, "loss/crossentropy": 2.409467577934265, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.19836759567260742, "step": 2668 }, { "epoch": 0.12136363636363637, "grad_norm": 6.59375, "grad_norm_var": 15.642313639322916, "learning_rate": 0.0001, "loss": 6.164, "loss/crossentropy": 2.3281174302101135, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.20859138295054436, "step": 2670 }, { "epoch": 0.12145454545454545, "grad_norm": 6.75, "grad_norm_var": 2.675065104166667, "learning_rate": 0.0001, "loss": 6.1034, "loss/crossentropy": 2.302488625049591, "loss/hidden": 1.736328125, "loss/jsd": 0.0, "loss/logits": 0.20645618811249733, "step": 2672 }, { "epoch": 0.12154545454545454, "grad_norm": 6.96875, "grad_norm_var": 2.611588541666667, "learning_rate": 0.0001, "loss": 6.9944, "loss/crossentropy": 2.852058470249176, "loss/hidden": 1.783203125, "loss/jsd": 0.0, "loss/logits": 0.2359095960855484, "step": 2674 }, { "epoch": 0.12163636363636364, "grad_norm": 6.5, "grad_norm_var": 2.653251139322917, "learning_rate": 0.0001, "loss": 6.2374, "loss/crossentropy": 2.451550990343094, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.205147884786129, "step": 2676 }, { "epoch": 0.12172727272727273, "grad_norm": 6.15625, "grad_norm_var": 2.677734375, "learning_rate": 0.0001, "loss": 6.4308, "loss/crossentropy": 2.592244863510132, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.21119875460863113, "step": 2678 }, { "epoch": 0.12181818181818181, "grad_norm": 7.0625, "grad_norm_var": 0.148291015625, "learning_rate": 0.0001, "loss": 6.5117, "loss/crossentropy": 2.587056577205658, "loss/hidden": 1.744140625, "loss/jsd": 0.0, "loss/logits": 0.218045674264431, "step": 2680 }, { "epoch": 0.12190909090909091, "grad_norm": 6.78125, "grad_norm_var": 0.16861979166666666, "learning_rate": 0.0001, "loss": 6.4234, "loss/crossentropy": 2.591376543045044, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.21093855798244476, "step": 2682 }, { "epoch": 0.122, "grad_norm": 5.90625, "grad_norm_var": 0.15611979166666667, "learning_rate": 0.0001, "loss": 6.4937, "loss/crossentropy": 2.624010145664215, "loss/hidden": 1.724609375, "loss/jsd": 0.0, "loss/logits": 0.21450642123818398, "step": 2684 }, { "epoch": 0.1220909090909091, "grad_norm": 6.5625, "grad_norm_var": 0.22786051432291668, "learning_rate": 0.0001, "loss": 6.179, "loss/crossentropy": 2.4566783905029297, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.20231297239661217, "step": 2686 }, { "epoch": 0.12218181818181818, "grad_norm": 6.5625, "grad_norm_var": 0.22079671223958333, "learning_rate": 0.0001, "loss": 6.3425, "loss/crossentropy": 2.450541466474533, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.2161453552544117, "step": 2688 }, { "epoch": 0.12227272727272727, "grad_norm": 6.90625, "grad_norm_var": 0.210791015625, "learning_rate": 0.0001, "loss": 6.6211, "loss/crossentropy": 2.567799687385559, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.22603528574109077, "step": 2690 }, { "epoch": 0.12236363636363637, "grad_norm": 6.5625, "grad_norm_var": 0.19254150390625, "learning_rate": 0.0001, "loss": 6.5605, "loss/crossentropy": 2.537658989429474, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.22377340868115425, "step": 2692 }, { "epoch": 0.12245454545454545, "grad_norm": 6.53125, "grad_norm_var": 0.18258056640625, "learning_rate": 0.0001, "loss": 6.4431, "loss/crossentropy": 2.472044289112091, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.2182031385600567, "step": 2694 }, { "epoch": 0.12254545454545454, "grad_norm": 7.0, "grad_norm_var": 0.21649983723958333, "learning_rate": 0.0001, "loss": 6.5047, "loss/crossentropy": 2.5727121829986572, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.21781232208013535, "step": 2696 }, { "epoch": 0.12263636363636364, "grad_norm": 6.59375, "grad_norm_var": 0.17773030598958334, "learning_rate": 0.0001, "loss": 6.4771, "loss/crossentropy": 2.5305890440940857, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.21886763721704483, "step": 2698 }, { "epoch": 0.12272727272727273, "grad_norm": 6.75, "grad_norm_var": 0.15243733723958333, "learning_rate": 0.0001, "loss": 6.9707, "loss/crossentropy": 2.9221672415733337, "loss/hidden": 1.751953125, "loss/jsd": 0.0, "loss/logits": 0.22966111078858376, "step": 2700 }, { "epoch": 0.12281818181818181, "grad_norm": 6.65625, "grad_norm_var": 0.07701822916666666, "learning_rate": 0.0001, "loss": 6.3203, "loss/crossentropy": 2.4978410601615906, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.2117410972714424, "step": 2702 }, { "epoch": 0.12290909090909091, "grad_norm": 6.09375, "grad_norm_var": 0.09104410807291667, "learning_rate": 0.0001, "loss": 6.0955, "loss/crossentropy": 2.322937309741974, "loss/hidden": 1.701171875, "loss/jsd": 0.0, "loss/logits": 0.20713720470666885, "step": 2704 }, { "epoch": 0.123, "grad_norm": 7.03125, "grad_norm_var": 0.09553629557291667, "learning_rate": 0.0001, "loss": 6.6383, "loss/crossentropy": 2.533981144428253, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.23113510757684708, "step": 2706 }, { "epoch": 0.12309090909090908, "grad_norm": 6.8125, "grad_norm_var": 0.09781494140625, "learning_rate": 0.0001, "loss": 6.7548, "loss/crossentropy": 2.7529000639915466, "loss/hidden": 1.744140625, "loss/jsd": 0.0, "loss/logits": 0.22577852755784988, "step": 2708 }, { "epoch": 0.12318181818181818, "grad_norm": 8.5625, "grad_norm_var": 0.33837483723958334, "learning_rate": 0.0001, "loss": 6.2856, "loss/crossentropy": 2.4838244318962097, "loss/hidden": 1.708984375, "loss/jsd": 0.0, "loss/logits": 0.20928234979510307, "step": 2710 }, { "epoch": 0.12327272727272727, "grad_norm": 7.5, "grad_norm_var": 0.36034749348958334, "learning_rate": 0.0001, "loss": 6.5616, "loss/crossentropy": 2.559080183506012, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.22564257681369781, "step": 2712 }, { "epoch": 0.12336363636363637, "grad_norm": 7.78125, "grad_norm_var": 0.42187093098958334, "learning_rate": 0.0001, "loss": 6.5518, "loss/crossentropy": 2.6169363856315613, "loss/hidden": 1.744140625, "loss/jsd": 0.0, "loss/logits": 0.21907563880085945, "step": 2714 }, { "epoch": 0.12345454545454546, "grad_norm": 6.25, "grad_norm_var": 0.463134765625, "learning_rate": 0.0001, "loss": 6.8954, "loss/crossentropy": 2.814181327819824, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.23312346264719963, "step": 2716 }, { "epoch": 0.12354545454545454, "grad_norm": 6.625, "grad_norm_var": 0.413134765625, "learning_rate": 0.0001, "loss": 6.3332, "loss/crossentropy": 2.4192804098129272, "loss/hidden": 1.767578125, "loss/jsd": 0.0, "loss/logits": 0.2146334946155548, "step": 2718 }, { "epoch": 0.12363636363636364, "grad_norm": 6.71875, "grad_norm_var": 0.35792643229166665, "learning_rate": 0.0001, "loss": 6.3717, "loss/crossentropy": 2.502548396587372, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.21191545203328133, "step": 2720 }, { "epoch": 0.12372727272727273, "grad_norm": 6.03125, "grad_norm_var": 0.42382405598958334, "learning_rate": 0.0001, "loss": 6.1773, "loss/crossentropy": 2.405661880970001, "loss/hidden": 1.763671875, "loss/jsd": 0.0, "loss/logits": 0.2007928602397442, "step": 2722 }, { "epoch": 0.12381818181818181, "grad_norm": 6.15625, "grad_norm_var": 0.5257649739583333, "learning_rate": 0.0001, "loss": 5.7827, "loss/crossentropy": 2.14049232006073, "loss/hidden": 1.716796875, "loss/jsd": 0.0, "loss/logits": 0.1925441287457943, "step": 2724 }, { "epoch": 0.12390909090909091, "grad_norm": 6.1875, "grad_norm_var": 0.438916015625, "learning_rate": 0.0001, "loss": 5.8825, "loss/crossentropy": 2.254271537065506, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.19231774285435677, "step": 2726 }, { "epoch": 0.124, "grad_norm": 6.5, "grad_norm_var": 1.0610636393229167, "learning_rate": 0.0001, "loss": 6.4587, "loss/crossentropy": 2.4758510887622833, "loss/hidden": 1.806640625, "loss/jsd": 0.0, "loss/logits": 0.21761886775493622, "step": 2728 }, { "epoch": 0.12409090909090909, "grad_norm": 6.125, "grad_norm_var": 1.05758056640625, "learning_rate": 0.0001, "loss": 6.372, "loss/crossentropy": 2.5025835633277893, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.21389122307300568, "step": 2730 }, { "epoch": 0.12418181818181818, "grad_norm": 6.3125, "grad_norm_var": 1.0114217122395834, "learning_rate": 0.0001, "loss": 6.4518, "loss/crossentropy": 2.5330556631088257, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.21883085742592812, "step": 2732 }, { "epoch": 0.12427272727272727, "grad_norm": 6.15625, "grad_norm_var": 1.05006103515625, "learning_rate": 0.0001, "loss": 6.214, "loss/crossentropy": 2.4774256348609924, "loss/hidden": 1.697265625, "loss/jsd": 0.0, "loss/logits": 0.2039293460547924, "step": 2734 }, { "epoch": 0.12436363636363636, "grad_norm": 6.0, "grad_norm_var": 1.0735677083333333, "learning_rate": 0.0001, "loss": 5.9388, "loss/crossentropy": 2.298799842596054, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.19563748314976692, "step": 2736 }, { "epoch": 0.12445454545454546, "grad_norm": 6.375, "grad_norm_var": 1.0087198893229166, "learning_rate": 0.0001, "loss": 6.6201, "loss/crossentropy": 2.702800989151001, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.21868425235152245, "step": 2738 }, { "epoch": 0.12454545454545454, "grad_norm": 7.0, "grad_norm_var": 1.0071573893229167, "learning_rate": 0.0001, "loss": 6.3918, "loss/crossentropy": 2.4516066312789917, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.22057821974158287, "step": 2740 }, { "epoch": 0.12463636363636364, "grad_norm": 6.09375, "grad_norm_var": 0.9399576822916667, "learning_rate": 0.0001, "loss": 6.3818, "loss/crossentropy": 2.553639054298401, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.21094055473804474, "step": 2742 }, { "epoch": 0.12472727272727273, "grad_norm": 6.09375, "grad_norm_var": 0.15533447265625, "learning_rate": 0.0001, "loss": 5.6633, "loss/crossentropy": 2.0678513646125793, "loss/hidden": 1.716796875, "loss/jsd": 0.0, "loss/logits": 0.18786770477890968, "step": 2744 }, { "epoch": 0.12481818181818181, "grad_norm": 6.28125, "grad_norm_var": 0.14542643229166666, "learning_rate": 0.0001, "loss": 6.7196, "loss/crossentropy": 2.782585024833679, "loss/hidden": 1.708984375, "loss/jsd": 0.0, "loss/logits": 0.22279980778694153, "step": 2746 }, { "epoch": 0.12490909090909091, "grad_norm": 6.84375, "grad_norm_var": 0.1849609375, "learning_rate": 0.0001, "loss": 6.6464, "loss/crossentropy": 2.711301267147064, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.22202341258525848, "step": 2748 }, { "epoch": 0.125, "grad_norm": 6.4375, "grad_norm_var": 0.16500244140625, "learning_rate": 0.0001, "loss": 6.052, "loss/crossentropy": 2.2654501497745514, "loss/hidden": 1.755859375, "loss/jsd": 0.0, "loss/logits": 0.2030654102563858, "step": 2750 }, { "epoch": 0.12509090909090909, "grad_norm": 7.65625, "grad_norm_var": 0.23201497395833334, "learning_rate": 0.0001, "loss": 6.4408, "loss/crossentropy": 2.4252228140830994, "loss/hidden": 1.736328125, "loss/jsd": 0.0, "loss/logits": 0.22792898863554, "step": 2752 }, { "epoch": 0.12518181818181817, "grad_norm": 6.84375, "grad_norm_var": 0.4652180989583333, "learning_rate": 0.0001, "loss": 6.7395, "loss/crossentropy": 2.5835251212120056, "loss/hidden": 1.783203125, "loss/jsd": 0.0, "loss/logits": 0.23728077486157417, "step": 2754 }, { "epoch": 0.12527272727272729, "grad_norm": 6.8125, "grad_norm_var": 0.4652628580729167, "learning_rate": 0.0001, "loss": 6.5819, "loss/crossentropy": 2.6075581312179565, "loss/hidden": 1.720703125, "loss/jsd": 0.0, "loss/logits": 0.22535939514636993, "step": 2756 }, { "epoch": 0.12536363636363637, "grad_norm": 6.71875, "grad_norm_var": 0.4850870768229167, "learning_rate": 0.0001, "loss": 6.5434, "loss/crossentropy": 2.522481322288513, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.22709285840392113, "step": 2758 }, { "epoch": 0.12545454545454546, "grad_norm": 7.375, "grad_norm_var": 0.4261067708333333, "learning_rate": 0.0001, "loss": 7.3582, "loss/crossentropy": 2.9610356092453003, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.26159265264868736, "step": 2760 }, { "epoch": 0.12554545454545454, "grad_norm": 6.375, "grad_norm_var": 0.4173177083333333, "learning_rate": 0.0001, "loss": 6.6236, "loss/crossentropy": 2.567100942134857, "loss/hidden": 1.783203125, "loss/jsd": 0.0, "loss/logits": 0.22733162716031075, "step": 2762 }, { "epoch": 0.12563636363636363, "grad_norm": 18.25, "grad_norm_var": 8.4505859375, "learning_rate": 0.0001, "loss": 7.0222, "loss/crossentropy": 2.890468180179596, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.23543576896190643, "step": 2764 }, { "epoch": 0.12572727272727272, "grad_norm": 6.90625, "grad_norm_var": 8.415653483072917, "learning_rate": 0.0001, "loss": 6.6104, "loss/crossentropy": 2.4458926618099213, "loss/hidden": 1.818359375, "loss/jsd": 0.0, "loss/logits": 0.23461904376745224, "step": 2766 }, { "epoch": 0.12581818181818183, "grad_norm": 28.0, "grad_norm_var": 33.772509765625, "learning_rate": 0.0001, "loss": 7.2751, "loss/crossentropy": 2.8137935996055603, "loss/hidden": 1.794921875, "loss/jsd": 0.0, "loss/logits": 0.2666341960430145, "step": 2768 }, { "epoch": 0.12590909090909091, "grad_norm": 7.4375, "grad_norm_var": 33.94468994140625, "learning_rate": 0.0001, "loss": 6.2243, "loss/crossentropy": 2.3576794862747192, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.21126847714185715, "step": 2770 }, { "epoch": 0.126, "grad_norm": 6.875, "grad_norm_var": 33.66327718098958, "learning_rate": 0.0001, "loss": 6.6738, "loss/crossentropy": 2.6724790930747986, "loss/hidden": 1.728515625, "loss/jsd": 0.0, "loss/logits": 0.2272774912416935, "step": 2772 }, { "epoch": 0.1260909090909091, "grad_norm": 6.5625, "grad_norm_var": 33.987788899739584, "learning_rate": 0.0001, "loss": 6.3739, "loss/crossentropy": 2.4990451335906982, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.21248618140816689, "step": 2774 }, { "epoch": 0.12618181818181817, "grad_norm": 6.71875, "grad_norm_var": 34.067867024739584, "learning_rate": 0.0001, "loss": 6.7399, "loss/crossentropy": 2.616873860359192, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.23534774407744408, "step": 2776 }, { "epoch": 0.1262727272727273, "grad_norm": 6.03125, "grad_norm_var": 34.35696614583333, "learning_rate": 0.0001, "loss": 6.4922, "loss/crossentropy": 2.624195158481598, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.21453159302473068, "step": 2778 }, { "epoch": 0.12636363636363637, "grad_norm": 6.90625, "grad_norm_var": 28.364937337239585, "learning_rate": 0.0001, "loss": 6.4374, "loss/crossentropy": 2.568828284740448, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.20873061567544937, "step": 2780 }, { "epoch": 0.12645454545454546, "grad_norm": 6.1875, "grad_norm_var": 28.663264973958334, "learning_rate": 0.0001, "loss": 6.3936, "loss/crossentropy": 2.4242590069770813, "loss/hidden": 1.775390625, "loss/jsd": 0.0, "loss/logits": 0.21939367428421974, "step": 2782 }, { "epoch": 0.12654545454545454, "grad_norm": 7.25, "grad_norm_var": 0.29230143229166666, "learning_rate": 0.0001, "loss": 6.7148, "loss/crossentropy": 2.695539653301239, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.22692862898111343, "step": 2784 }, { "epoch": 0.12663636363636363, "grad_norm": 6.3125, "grad_norm_var": 0.28817952473958336, "learning_rate": 0.0001, "loss": 6.059, "loss/crossentropy": 2.291350841522217, "loss/hidden": 1.732421875, "loss/jsd": 0.0, "loss/logits": 0.2035183347761631, "step": 2786 }, { "epoch": 0.12672727272727272, "grad_norm": 6.71875, "grad_norm_var": 0.31516927083333335, "learning_rate": 0.0001, "loss": 6.7356, "loss/crossentropy": 2.771942615509033, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.22097574919462204, "step": 2788 }, { "epoch": 0.12681818181818183, "grad_norm": 6.34375, "grad_norm_var": 0.336572265625, "learning_rate": 0.0001, "loss": 6.5182, "loss/crossentropy": 2.654086709022522, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.21375445276498795, "step": 2790 }, { "epoch": 0.12690909090909092, "grad_norm": 7.15625, "grad_norm_var": 0.2567545572916667, "learning_rate": 0.0001, "loss": 6.4449, "loss/crossentropy": 2.5360644459724426, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.21119898930191994, "step": 2792 }, { "epoch": 0.127, "grad_norm": 7.6875, "grad_norm_var": 0.292431640625, "learning_rate": 0.0001, "loss": 6.5333, "loss/crossentropy": 2.489084243774414, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.22747059538960457, "step": 2794 }, { "epoch": 0.1270909090909091, "grad_norm": 5.46875, "grad_norm_var": 0.33795166015625, "learning_rate": 0.0001, "loss": 6.424, "loss/crossentropy": 2.5977739691734314, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.21191470697522163, "step": 2796 }, { "epoch": 0.12718181818181817, "grad_norm": 6.34375, "grad_norm_var": 0.378125, "learning_rate": 0.0001, "loss": 6.525, "loss/crossentropy": 2.7206323742866516, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.2089548297226429, "step": 2798 }, { "epoch": 0.12727272727272726, "grad_norm": 6.71875, "grad_norm_var": 0.3833984375, "learning_rate": 0.0001, "loss": 6.6301, "loss/crossentropy": 2.587091326713562, "loss/hidden": 1.791015625, "loss/jsd": 0.0, "loss/logits": 0.225199855864048, "step": 2800 }, { "epoch": 0.12736363636363637, "grad_norm": 5.9375, "grad_norm_var": 0.38853759765625, "learning_rate": 0.0001, "loss": 6.0888, "loss/crossentropy": 2.36591175198555, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.19963311031460762, "step": 2802 }, { "epoch": 0.12745454545454546, "grad_norm": 7.0, "grad_norm_var": 0.35234375, "learning_rate": 0.0001, "loss": 6.1213, "loss/crossentropy": 2.337364763021469, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.20573704317212105, "step": 2804 }, { "epoch": 0.12754545454545455, "grad_norm": 6.03125, "grad_norm_var": 0.39010009765625, "learning_rate": 0.0001, "loss": 5.9341, "loss/crossentropy": 2.245442897081375, "loss/hidden": 1.748046875, "loss/jsd": 0.0, "loss/logits": 0.19406035915017128, "step": 2806 }, { "epoch": 0.12763636363636363, "grad_norm": 6.53125, "grad_norm_var": 0.38821614583333336, "learning_rate": 0.0001, "loss": 6.584, "loss/crossentropy": 2.673758566379547, "loss/hidden": 1.732421875, "loss/jsd": 0.0, "loss/logits": 0.21778207644820213, "step": 2808 }, { "epoch": 0.12772727272727272, "grad_norm": 6.0, "grad_norm_var": 0.31964518229166666, "learning_rate": 0.0001, "loss": 6.1873, "loss/crossentropy": 2.409298598766327, "loss/hidden": 1.751953125, "loss/jsd": 0.0, "loss/logits": 0.20260517671704292, "step": 2810 }, { "epoch": 0.12781818181818183, "grad_norm": 6.84375, "grad_norm_var": 0.2526041666666667, "learning_rate": 0.0001, "loss": 6.3532, "loss/crossentropy": 2.476793050765991, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.21420633047819138, "step": 2812 }, { "epoch": 0.12790909090909092, "grad_norm": 6.84375, "grad_norm_var": 0.2546875, "learning_rate": 0.0001, "loss": 5.8456, "loss/crossentropy": 2.1754502058029175, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.19318808242678642, "step": 2814 }, { "epoch": 0.128, "grad_norm": 6.34375, "grad_norm_var": 0.20540364583333334, "learning_rate": 0.0001, "loss": 6.5366, "loss/crossentropy": 2.5734543204307556, "loss/hidden": 1.740234375, "loss/jsd": 0.0, "loss/logits": 0.22228792309761047, "step": 2816 }, { "epoch": 0.1280909090909091, "grad_norm": 6.125, "grad_norm_var": 0.18931884765625, "learning_rate": 0.0001, "loss": 6.483, "loss/crossentropy": 2.5888549089431763, "loss/hidden": 1.712890625, "loss/jsd": 0.0, "loss/logits": 0.21812808886170387, "step": 2818 }, { "epoch": 0.12818181818181817, "grad_norm": 7.03125, "grad_norm_var": 0.3659138997395833, "learning_rate": 0.0001, "loss": 6.8632, "loss/crossentropy": 2.7279325127601624, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.23540248721837997, "step": 2820 }, { "epoch": 0.12827272727272726, "grad_norm": 6.65625, "grad_norm_var": 0.3268229166666667, "learning_rate": 0.0001, "loss": 6.5206, "loss/crossentropy": 2.6393321752548218, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.21761618927121162, "step": 2822 }, { "epoch": 0.12836363636363637, "grad_norm": 6.53125, "grad_norm_var": 0.3148396809895833, "learning_rate": 0.0001, "loss": 6.4806, "loss/crossentropy": 2.589099705219269, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.2188328467309475, "step": 2824 }, { "epoch": 0.12845454545454546, "grad_norm": 6.28125, "grad_norm_var": 0.3008951822916667, "learning_rate": 0.0001, "loss": 6.4589, "loss/crossentropy": 2.6123695373535156, "loss/hidden": 1.751953125, "loss/jsd": 0.0, "loss/logits": 0.20945368707180023, "step": 2826 }, { "epoch": 0.12854545454545455, "grad_norm": 6.1875, "grad_norm_var": 0.30416259765625, "learning_rate": 0.0001, "loss": 6.4811, "loss/crossentropy": 2.5592007637023926, "loss/hidden": 1.763671875, "loss/jsd": 0.0, "loss/logits": 0.215823195874691, "step": 2828 }, { "epoch": 0.12863636363636363, "grad_norm": 6.1875, "grad_norm_var": 0.26529541015625, "learning_rate": 0.0001, "loss": 6.4588, "loss/crossentropy": 2.5830045342445374, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.21218618005514145, "step": 2830 }, { "epoch": 0.12872727272727272, "grad_norm": 6.3125, "grad_norm_var": 0.273681640625, "learning_rate": 0.0001, "loss": 6.3321, "loss/crossentropy": 2.5342363715171814, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.2063468173146248, "step": 2832 }, { "epoch": 0.12881818181818183, "grad_norm": 11.75, "grad_norm_var": 1.9124308268229167, "learning_rate": 0.0001, "loss": 6.7872, "loss/crossentropy": 2.610603451728821, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.23563092947006226, "step": 2834 }, { "epoch": 0.12890909090909092, "grad_norm": 7.28125, "grad_norm_var": 1.8128865559895833, "learning_rate": 0.0001, "loss": 6.4108, "loss/crossentropy": 2.625970244407654, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.20855971053242683, "step": 2836 }, { "epoch": 0.129, "grad_norm": 6.3125, "grad_norm_var": 1.791259765625, "learning_rate": 0.0001, "loss": 6.2644, "loss/crossentropy": 2.4382246136665344, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.20878949016332626, "step": 2838 }, { "epoch": 0.1290909090909091, "grad_norm": 6.6875, "grad_norm_var": 1.80601806640625, "learning_rate": 0.0001, "loss": 6.3122, "loss/crossentropy": 2.5073341131210327, "loss/hidden": 1.748046875, "loss/jsd": 0.0, "loss/logits": 0.2056833803653717, "step": 2840 }, { "epoch": 0.12918181818181818, "grad_norm": 7.125, "grad_norm_var": 1.77886962890625, "learning_rate": 0.0001, "loss": 7.1249, "loss/crossentropy": 2.9490395188331604, "loss/hidden": 1.748046875, "loss/jsd": 0.0, "loss/logits": 0.24278023838996887, "step": 2842 }, { "epoch": 0.12927272727272726, "grad_norm": 9.625, "grad_norm_var": 2.23004150390625, "learning_rate": 0.0001, "loss": 5.9961, "loss/crossentropy": 2.205965608358383, "loss/hidden": 1.744140625, "loss/jsd": 0.0, "loss/logits": 0.20459461584687233, "step": 2844 }, { "epoch": 0.12936363636363638, "grad_norm": 5.875, "grad_norm_var": 2.263655598958333, "learning_rate": 0.0001, "loss": 6.5628, "loss/crossentropy": 2.5690550208091736, "loss/hidden": 1.775390625, "loss/jsd": 0.0, "loss/logits": 0.2218395695090294, "step": 2846 }, { "epoch": 0.12945454545454546, "grad_norm": 5.90625, "grad_norm_var": 2.386181640625, "learning_rate": 0.0001, "loss": 5.9093, "loss/crossentropy": 2.181858003139496, "loss/hidden": 1.755859375, "loss/jsd": 0.0, "loss/logits": 0.1971598006784916, "step": 2848 }, { "epoch": 0.12954545454545455, "grad_norm": 5.8125, "grad_norm_var": 0.9195149739583334, "learning_rate": 0.0001, "loss": 6.1129, "loss/crossentropy": 2.4312663972377777, "loss/hidden": 1.697265625, "loss/jsd": 0.0, "loss/logits": 0.19843613728880882, "step": 2850 }, { "epoch": 0.12963636363636363, "grad_norm": 11.0625, "grad_norm_var": 2.32320556640625, "learning_rate": 0.0001, "loss": 6.7012, "loss/crossentropy": 2.5397114753723145, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.2380264401435852, "step": 2852 }, { "epoch": 0.12972727272727272, "grad_norm": 6.53125, "grad_norm_var": 2.31431884765625, "learning_rate": 0.0001, "loss": 6.5962, "loss/crossentropy": 2.5832064151763916, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.22590523213148117, "step": 2854 }, { "epoch": 0.1298181818181818, "grad_norm": 6.59375, "grad_norm_var": 2.239697265625, "learning_rate": 0.0001, "loss": 6.8385, "loss/crossentropy": 2.74837189912796, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.23440046608448029, "step": 2856 }, { "epoch": 0.12990909090909092, "grad_norm": 7.0625, "grad_norm_var": 2.2741495768229165, "learning_rate": 0.0001, "loss": 6.0829, "loss/crossentropy": 2.3190388083457947, "loss/hidden": 1.724609375, "loss/jsd": 0.0, "loss/logits": 0.20392365381121635, "step": 2858 }, { "epoch": 0.13, "grad_norm": 6.84375, "grad_norm_var": 1.78345947265625, "learning_rate": 0.0001, "loss": 6.7526, "loss/crossentropy": 2.719459354877472, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.2279239408671856, "step": 2860 }, { "epoch": 0.1300909090909091, "grad_norm": 6.15625, "grad_norm_var": 1.7434733072916666, "learning_rate": 0.0001, "loss": 6.585, "loss/crossentropy": 2.7192612290382385, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.21391548216342926, "step": 2862 }, { "epoch": 0.13018181818181818, "grad_norm": 7.125, "grad_norm_var": 1.65103759765625, "learning_rate": 0.0001, "loss": 6.5207, "loss/crossentropy": 2.598050117492676, "loss/hidden": 1.744140625, "loss/jsd": 0.0, "loss/logits": 0.2178530991077423, "step": 2864 }, { "epoch": 0.13027272727272726, "grad_norm": 6.28125, "grad_norm_var": 1.470947265625, "learning_rate": 0.0001, "loss": 6.5189, "loss/crossentropy": 2.670633316040039, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.21138625219464302, "step": 2866 }, { "epoch": 0.13036363636363638, "grad_norm": 6.53125, "grad_norm_var": 2.1198527018229165, "learning_rate": 0.0001, "loss": 6.6276, "loss/crossentropy": 2.585620939731598, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.23114820569753647, "step": 2868 }, { "epoch": 0.13045454545454546, "grad_norm": 7.09375, "grad_norm_var": 2.085282389322917, "learning_rate": 0.0001, "loss": 6.7901, "loss/crossentropy": 2.7814583778381348, "loss/hidden": 1.740234375, "loss/jsd": 0.0, "loss/logits": 0.22684207186102867, "step": 2870 }, { "epoch": 0.13054545454545455, "grad_norm": 6.15625, "grad_norm_var": 2.1270182291666666, "learning_rate": 0.0001, "loss": 6.5888, "loss/crossentropy": 2.6469817757606506, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.2207445427775383, "step": 2872 }, { "epoch": 0.13063636363636363, "grad_norm": 6.84375, "grad_norm_var": 2.190234375, "learning_rate": 0.0001, "loss": 6.499, "loss/crossentropy": 2.602073609828949, "loss/hidden": 1.740234375, "loss/jsd": 0.0, "loss/logits": 0.21567289903759956, "step": 2874 }, { "epoch": 0.13072727272727272, "grad_norm": 7.1875, "grad_norm_var": 2.2875, "learning_rate": 0.0001, "loss": 6.4778, "loss/crossentropy": 2.67367947101593, "loss/hidden": 1.708984375, "loss/jsd": 0.0, "loss/logits": 0.20951592549681664, "step": 2876 }, { "epoch": 0.1308181818181818, "grad_norm": 6.8125, "grad_norm_var": 2.24713134765625, "learning_rate": 0.0001, "loss": 6.6416, "loss/crossentropy": 2.5526316165924072, "loss/hidden": 1.763671875, "loss/jsd": 0.0, "loss/logits": 0.2325294092297554, "step": 2878 }, { "epoch": 0.13090909090909092, "grad_norm": 7.375, "grad_norm_var": 2.20859375, "learning_rate": 0.0001, "loss": 6.4361, "loss/crossentropy": 2.557086944580078, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.2109529711306095, "step": 2880 }, { "epoch": 0.131, "grad_norm": 6.21875, "grad_norm_var": 2.234175618489583, "learning_rate": 0.0001, "loss": 6.5783, "loss/crossentropy": 2.599289357662201, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.22289641946554184, "step": 2882 }, { "epoch": 0.1310909090909091, "grad_norm": 6.625, "grad_norm_var": 0.27489827473958334, "learning_rate": 0.0001, "loss": 6.3694, "loss/crossentropy": 2.5522918105125427, "loss/hidden": 1.708984375, "loss/jsd": 0.0, "loss/logits": 0.21080785989761353, "step": 2884 }, { "epoch": 0.13118181818181818, "grad_norm": 6.25, "grad_norm_var": 0.2628865559895833, "learning_rate": 0.0001, "loss": 6.6919, "loss/crossentropy": 2.678319811820984, "loss/hidden": 1.744140625, "loss/jsd": 0.0, "loss/logits": 0.22694333642721176, "step": 2886 }, { "epoch": 0.13127272727272726, "grad_norm": 7.28125, "grad_norm_var": 0.27081705729166666, "learning_rate": 0.0001, "loss": 6.4051, "loss/crossentropy": 2.5170613527297974, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.2161475345492363, "step": 2888 }, { "epoch": 0.13136363636363638, "grad_norm": 6.84375, "grad_norm_var": 0.217431640625, "learning_rate": 0.0001, "loss": 6.282, "loss/crossentropy": 2.4377209842205048, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.20903877168893814, "step": 2890 }, { "epoch": 0.13145454545454546, "grad_norm": 6.21875, "grad_norm_var": 0.231640625, "learning_rate": 0.0001, "loss": 6.014, "loss/crossentropy": 2.329648196697235, "loss/hidden": 1.720703125, "loss/jsd": 0.0, "loss/logits": 0.19636791571974754, "step": 2892 }, { "epoch": 0.13154545454545455, "grad_norm": 6.0625, "grad_norm_var": 0.20536702473958332, "learning_rate": 0.0001, "loss": 5.9782, "loss/crossentropy": 2.2970349490642548, "loss/hidden": 1.724609375, "loss/jsd": 0.0, "loss/logits": 0.19565346091985703, "step": 2894 }, { "epoch": 0.13163636363636363, "grad_norm": 6.6875, "grad_norm_var": 0.18841145833333334, "learning_rate": 0.0001, "loss": 6.8316, "loss/crossentropy": 2.7423255443573, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.2323623113334179, "step": 2896 }, { "epoch": 0.13172727272727272, "grad_norm": 7.03125, "grad_norm_var": 1.75836181640625, "learning_rate": 0.0001, "loss": 6.255, "loss/crossentropy": 2.2824171781539917, "loss/hidden": 1.720703125, "loss/jsd": 0.0, "loss/logits": 0.22519123926758766, "step": 2898 }, { "epoch": 0.1318181818181818, "grad_norm": 6.34375, "grad_norm_var": 1.7811197916666666, "learning_rate": 0.0001, "loss": 6.2705, "loss/crossentropy": 2.425143003463745, "loss/hidden": 1.701171875, "loss/jsd": 0.0, "loss/logits": 0.21442236378788948, "step": 2900 }, { "epoch": 0.13190909090909092, "grad_norm": 6.25, "grad_norm_var": 1.7815755208333333, "learning_rate": 0.0001, "loss": 6.1933, "loss/crossentropy": 2.3914489150047302, "loss/hidden": 1.751953125, "loss/jsd": 0.0, "loss/logits": 0.20499225333333015, "step": 2902 }, { "epoch": 0.132, "grad_norm": 7.03125, "grad_norm_var": 1.8279296875, "learning_rate": 0.0001, "loss": 6.0052, "loss/crossentropy": 2.3184721767902374, "loss/hidden": 1.712890625, "loss/jsd": 0.0, "loss/logits": 0.1973840929567814, "step": 2904 }, { "epoch": 0.1320909090909091, "grad_norm": 6.0625, "grad_norm_var": 1.8706990559895833, "learning_rate": 0.0001, "loss": 6.1095, "loss/crossentropy": 2.3929895758628845, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.20094340667128563, "step": 2906 }, { "epoch": 0.13218181818181818, "grad_norm": 6.0625, "grad_norm_var": 1.8138956705729166, "learning_rate": 0.0001, "loss": 6.4851, "loss/crossentropy": 2.54459685087204, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.22354253381490707, "step": 2908 }, { "epoch": 0.13227272727272726, "grad_norm": 6.40625, "grad_norm_var": 1.78941650390625, "learning_rate": 0.0001, "loss": 6.428, "loss/crossentropy": 2.548600912094116, "loss/hidden": 1.701171875, "loss/jsd": 0.0, "loss/logits": 0.21782363951206207, "step": 2910 }, { "epoch": 0.13236363636363635, "grad_norm": 6.0, "grad_norm_var": 1.91744384765625, "learning_rate": 0.0001, "loss": 6.025, "loss/crossentropy": 2.3776528239250183, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.19559425115585327, "step": 2912 }, { "epoch": 0.13245454545454546, "grad_norm": 7.90625, "grad_norm_var": 0.30227864583333336, "learning_rate": 0.0001, "loss": 6.2616, "loss/crossentropy": 2.3928467631340027, "loss/hidden": 1.759765625, "loss/jsd": 0.0, "loss/logits": 0.2108992040157318, "step": 2914 }, { "epoch": 0.13254545454545455, "grad_norm": 6.15625, "grad_norm_var": 0.3792805989583333, "learning_rate": 0.0001, "loss": 6.5894, "loss/crossentropy": 2.6370485424995422, "loss/hidden": 1.716796875, "loss/jsd": 0.0, "loss/logits": 0.2235574573278427, "step": 2916 }, { "epoch": 0.13263636363636364, "grad_norm": 10.25, "grad_norm_var": 1.2790201822916667, "learning_rate": 0.0001, "loss": 6.8602, "loss/crossentropy": 2.697630763053894, "loss/hidden": 1.736328125, "loss/jsd": 0.0, "loss/logits": 0.24262113124132156, "step": 2918 }, { "epoch": 0.13272727272727272, "grad_norm": 5.59375, "grad_norm_var": 1.310400390625, "learning_rate": 0.0001, "loss": 6.2768, "loss/crossentropy": 2.5529052913188934, "loss/hidden": 1.732421875, "loss/jsd": 0.0, "loss/logits": 0.1991482637822628, "step": 2920 }, { "epoch": 0.1328181818181818, "grad_norm": 7.8125, "grad_norm_var": 1.3585286458333334, "learning_rate": 0.0001, "loss": 6.6578, "loss/crossentropy": 2.6724501848220825, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.22509828582406044, "step": 2922 }, { "epoch": 0.13290909090909092, "grad_norm": 6.78125, "grad_norm_var": 1.3349609375, "learning_rate": 0.0001, "loss": 6.0845, "loss/crossentropy": 2.3252860605716705, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.20365220308303833, "step": 2924 }, { "epoch": 0.133, "grad_norm": 6.65625, "grad_norm_var": 1.3255859375, "learning_rate": 0.0001, "loss": 6.6821, "loss/crossentropy": 2.704566478729248, "loss/hidden": 1.728515625, "loss/jsd": 0.0, "loss/logits": 0.2248992621898651, "step": 2926 }, { "epoch": 0.1330909090909091, "grad_norm": 6.09375, "grad_norm_var": 1.23619384765625, "learning_rate": 0.0001, "loss": 6.5856, "loss/crossentropy": 2.7079612612724304, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.21589115634560585, "step": 2928 }, { "epoch": 0.13318181818181818, "grad_norm": 7.03125, "grad_norm_var": 1.2048014322916667, "learning_rate": 0.0001, "loss": 6.3962, "loss/crossentropy": 2.507194697856903, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.21507299691438675, "step": 2930 }, { "epoch": 0.13327272727272726, "grad_norm": 6.28125, "grad_norm_var": 1.1827107747395833, "learning_rate": 0.0001, "loss": 6.7405, "loss/crossentropy": 2.7197722792625427, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.22942136228084564, "step": 2932 }, { "epoch": 0.13336363636363635, "grad_norm": 6.3125, "grad_norm_var": 0.306884765625, "learning_rate": 0.0001, "loss": 6.4239, "loss/crossentropy": 2.582553744316101, "loss/hidden": 1.744140625, "loss/jsd": 0.0, "loss/logits": 0.2097177691757679, "step": 2934 }, { "epoch": 0.13345454545454546, "grad_norm": 7.03125, "grad_norm_var": 0.26851806640625, "learning_rate": 0.0001, "loss": 6.4468, "loss/crossentropy": 2.543601632118225, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.21727268025279045, "step": 2936 }, { "epoch": 0.13354545454545455, "grad_norm": 6.25, "grad_norm_var": 0.16144205729166666, "learning_rate": 0.0001, "loss": 6.1361, "loss/crossentropy": 2.4297807216644287, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.19993258267641068, "step": 2938 }, { "epoch": 0.13363636363636364, "grad_norm": 6.84375, "grad_norm_var": 0.20071614583333333, "learning_rate": 0.0001, "loss": 6.5691, "loss/crossentropy": 2.6419342756271362, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.22084128856658936, "step": 2940 }, { "epoch": 0.13372727272727272, "grad_norm": 6.40625, "grad_norm_var": 0.21689046223958333, "learning_rate": 0.0001, "loss": 6.4035, "loss/crossentropy": 2.5518693923950195, "loss/hidden": 1.716796875, "loss/jsd": 0.0, "loss/logits": 0.213485699146986, "step": 2942 }, { "epoch": 0.1338181818181818, "grad_norm": 6.03125, "grad_norm_var": 0.20338134765625, "learning_rate": 0.0001, "loss": 6.6066, "loss/crossentropy": 2.7411420345306396, "loss/hidden": 1.716796875, "loss/jsd": 0.0, "loss/logits": 0.21486646309494972, "step": 2944 }, { "epoch": 0.13390909090909092, "grad_norm": 6.46875, "grad_norm_var": 0.21851806640625, "learning_rate": 0.0001, "loss": 6.3619, "loss/crossentropy": 2.626721143722534, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.20438141748309135, "step": 2946 }, { "epoch": 0.134, "grad_norm": 6.8125, "grad_norm_var": 0.24332275390625, "learning_rate": 0.0001, "loss": 6.2181, "loss/crossentropy": 2.5348673462867737, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.2019125185906887, "step": 2948 }, { "epoch": 0.1340909090909091, "grad_norm": 5.84375, "grad_norm_var": 0.28313802083333334, "learning_rate": 0.0001, "loss": 6.4822, "loss/crossentropy": 2.7032231092453003, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.20738939940929413, "step": 2950 }, { "epoch": 0.13418181818181818, "grad_norm": 6.8125, "grad_norm_var": 0.64683837890625, "learning_rate": 0.0001, "loss": 6.8571, "loss/crossentropy": 2.785077154636383, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.23181423544883728, "step": 2952 }, { "epoch": 0.13427272727272727, "grad_norm": 6.25, "grad_norm_var": 0.6564412434895833, "learning_rate": 0.0001, "loss": 6.6762, "loss/crossentropy": 2.636157125234604, "loss/hidden": 1.767578125, "loss/jsd": 0.0, "loss/logits": 0.22724641487002373, "step": 2954 }, { "epoch": 0.13436363636363635, "grad_norm": 6.59375, "grad_norm_var": 0.6238932291666667, "learning_rate": 0.0001, "loss": 6.2746, "loss/crossentropy": 2.4967474937438965, "loss/hidden": 1.701171875, "loss/jsd": 0.0, "loss/logits": 0.20767099037766457, "step": 2956 }, { "epoch": 0.13445454545454547, "grad_norm": 6.21875, "grad_norm_var": 0.6141886393229167, "learning_rate": 0.0001, "loss": 6.4261, "loss/crossentropy": 2.6710670590400696, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.20440874993801117, "step": 2958 }, { "epoch": 0.13454545454545455, "grad_norm": 7.625, "grad_norm_var": 0.6807902018229167, "learning_rate": 0.0001, "loss": 7.0527, "loss/crossentropy": 2.9837154746055603, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.23189881816506386, "step": 2960 }, { "epoch": 0.13463636363636364, "grad_norm": 6.625, "grad_norm_var": 0.6302734375, "learning_rate": 0.0001, "loss": 6.7023, "loss/crossentropy": 2.6672862768173218, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.2304532267153263, "step": 2962 }, { "epoch": 0.13472727272727272, "grad_norm": 5.8125, "grad_norm_var": 0.5975870768229167, "learning_rate": 0.0001, "loss": 6.5195, "loss/crossentropy": 2.590092420578003, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.2198900692164898, "step": 2964 }, { "epoch": 0.1348181818181818, "grad_norm": 6.09375, "grad_norm_var": 0.5345011393229167, "learning_rate": 0.0001, "loss": 5.9676, "loss/crossentropy": 2.300375521183014, "loss/hidden": 1.720703125, "loss/jsd": 0.0, "loss/logits": 0.1946493797004223, "step": 2966 }, { "epoch": 0.13490909090909092, "grad_norm": 5.9375, "grad_norm_var": 0.235400390625, "learning_rate": 0.0001, "loss": 6.3057, "loss/crossentropy": 2.5289664268493652, "loss/hidden": 1.701171875, "loss/jsd": 0.0, "loss/logits": 0.2075541540980339, "step": 2968 }, { "epoch": 0.135, "grad_norm": 10.25, "grad_norm_var": 1.1328084309895834, "learning_rate": 0.0001, "loss": 5.9035, "loss/crossentropy": 2.1778028309345245, "loss/hidden": 1.720703125, "loss/jsd": 0.0, "loss/logits": 0.2004951387643814, "step": 2970 }, { "epoch": 0.1350909090909091, "grad_norm": 6.0, "grad_norm_var": 1.1579386393229167, "learning_rate": 0.0001, "loss": 6.4924, "loss/crossentropy": 2.6960668563842773, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.21009943261742592, "step": 2972 }, { "epoch": 0.13518181818181818, "grad_norm": 8.4375, "grad_norm_var": 1.35806884765625, "learning_rate": 0.0001, "loss": 6.5227, "loss/crossentropy": 2.5694674253463745, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.22345015779137611, "step": 2974 }, { "epoch": 0.13527272727272727, "grad_norm": 7.03125, "grad_norm_var": 1.65426025390625, "learning_rate": 0.0001, "loss": 6.4015, "loss/crossentropy": 2.4976794719696045, "loss/hidden": 1.724609375, "loss/jsd": 0.0, "loss/logits": 0.2179187871515751, "step": 2976 }, { "epoch": 0.13536363636363635, "grad_norm": 5.75, "grad_norm_var": 1.701025390625, "learning_rate": 0.0001, "loss": 6.1429, "loss/crossentropy": 2.434891104698181, "loss/hidden": 1.693359375, "loss/jsd": 0.0, "loss/logits": 0.20146595686674118, "step": 2978 }, { "epoch": 0.13545454545454547, "grad_norm": 5.125, "grad_norm_var": 1.8534138997395833, "learning_rate": 0.0001, "loss": 6.1844, "loss/crossentropy": 2.4685265123844147, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.20342189073562622, "step": 2980 }, { "epoch": 0.13554545454545455, "grad_norm": 6.15625, "grad_norm_var": 1.9327473958333334, "learning_rate": 0.0001, "loss": 6.072, "loss/crossentropy": 2.302230268716812, "loss/hidden": 1.744140625, "loss/jsd": 0.0, "loss/logits": 0.2025594599545002, "step": 2982 }, { "epoch": 0.13563636363636364, "grad_norm": 6.53125, "grad_norm_var": 1.9030598958333333, "learning_rate": 0.0001, "loss": 6.4615, "loss/crossentropy": 2.657979369163513, "loss/hidden": 1.693359375, "loss/jsd": 0.0, "loss/logits": 0.2110113985836506, "step": 2984 }, { "epoch": 0.13572727272727272, "grad_norm": 10.625, "grad_norm_var": 2.051041666666667, "learning_rate": 0.0001, "loss": 6.323, "loss/crossentropy": 2.3187204003334045, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.22660143673419952, "step": 2986 }, { "epoch": 0.1358181818181818, "grad_norm": 6.375, "grad_norm_var": 2.0082682291666667, "learning_rate": 0.0001, "loss": 6.5374, "loss/crossentropy": 2.6201655864715576, "loss/hidden": 1.744140625, "loss/jsd": 0.0, "loss/logits": 0.21731050685048103, "step": 2988 }, { "epoch": 0.1359090909090909, "grad_norm": 7.5625, "grad_norm_var": 1.8510416666666667, "learning_rate": 0.0001, "loss": 6.5798, "loss/crossentropy": 2.72028911113739, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.21544167399406433, "step": 2990 }, { "epoch": 0.136, "grad_norm": 6.15625, "grad_norm_var": 1.49244384765625, "learning_rate": 0.0001, "loss": 6.2934, "loss/crossentropy": 2.5245619416236877, "loss/hidden": 1.693359375, "loss/jsd": 0.0, "loss/logits": 0.2075505517423153, "step": 2992 }, { "epoch": 0.1360909090909091, "grad_norm": 6.15625, "grad_norm_var": 1.4613118489583334, "learning_rate": 0.0001, "loss": 6.4544, "loss/crossentropy": 2.5792028307914734, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.21330077946186066, "step": 2994 }, { "epoch": 0.13618181818181818, "grad_norm": 6.125, "grad_norm_var": 1.32662353515625, "learning_rate": 0.0001, "loss": 6.4918, "loss/crossentropy": 2.589960813522339, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.21908588334918022, "step": 2996 }, { "epoch": 0.13627272727272727, "grad_norm": 6.4375, "grad_norm_var": 1.2379557291666667, "learning_rate": 0.0001, "loss": 6.2827, "loss/crossentropy": 2.492607593536377, "loss/hidden": 1.708984375, "loss/jsd": 0.0, "loss/logits": 0.20810669288039207, "step": 2998 }, { "epoch": 0.13636363636363635, "grad_norm": 7.28125, "grad_norm_var": 1.2567057291666666, "learning_rate": 0.0001, "loss": 6.2497, "loss/crossentropy": 2.428509294986725, "loss/hidden": 1.732421875, "loss/jsd": 0.0, "loss/logits": 0.20887553319334984, "step": 3000 }, { "epoch": 0.13645454545454547, "grad_norm": 6.53125, "grad_norm_var": 0.20467122395833334, "learning_rate": 0.0001, "loss": 6.7005, "loss/crossentropy": 2.7743768095970154, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.22268862649798393, "step": 3002 }, { "epoch": 0.13654545454545455, "grad_norm": 7.15625, "grad_norm_var": 1.0128743489583334, "learning_rate": 0.0001, "loss": 6.7218, "loss/crossentropy": 2.661856949329376, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.2294272482395172, "step": 3004 }, { "epoch": 0.13663636363636364, "grad_norm": 5.90625, "grad_norm_var": 1.1458170572916666, "learning_rate": 0.0001, "loss": 6.1908, "loss/crossentropy": 2.385139584541321, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.2094724029302597, "step": 3006 }, { "epoch": 0.13672727272727273, "grad_norm": 6.34375, "grad_norm_var": 1.1073527018229166, "learning_rate": 0.0001, "loss": 6.5748, "loss/crossentropy": 2.632906496524811, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.2215326949954033, "step": 3008 }, { "epoch": 0.1368181818181818, "grad_norm": 6.46875, "grad_norm_var": 1.1199055989583333, "learning_rate": 0.0001, "loss": 6.7257, "loss/crossentropy": 2.808994948863983, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.2190180905163288, "step": 3010 }, { "epoch": 0.1369090909090909, "grad_norm": 6.21875, "grad_norm_var": 1.13736572265625, "learning_rate": 0.0001, "loss": 6.457, "loss/crossentropy": 2.594053030014038, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.21636832132935524, "step": 3012 }, { "epoch": 0.137, "grad_norm": 6.0, "grad_norm_var": 1.19921875, "learning_rate": 0.0001, "loss": 6.0814, "loss/crossentropy": 2.314297318458557, "loss/hidden": 1.701171875, "loss/jsd": 0.0, "loss/logits": 0.20659391582012177, "step": 3014 }, { "epoch": 0.1370909090909091, "grad_norm": 6.4375, "grad_norm_var": 1.16666259765625, "learning_rate": 0.0001, "loss": 6.2903, "loss/crossentropy": 2.400680363178253, "loss/hidden": 1.744140625, "loss/jsd": 0.0, "loss/logits": 0.21454422548413277, "step": 3016 }, { "epoch": 0.13718181818181818, "grad_norm": 6.375, "grad_norm_var": 1.1742838541666667, "learning_rate": 0.0001, "loss": 6.4296, "loss/crossentropy": 2.5784913301467896, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.21323423087596893, "step": 3018 }, { "epoch": 0.13727272727272727, "grad_norm": 6.5625, "grad_norm_var": 0.3458170572916667, "learning_rate": 0.0001, "loss": 6.771, "loss/crossentropy": 2.6916770339012146, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.23527918010950089, "step": 3020 }, { "epoch": 0.13736363636363635, "grad_norm": 7.15625, "grad_norm_var": 0.180859375, "learning_rate": 0.0001, "loss": 6.0421, "loss/crossentropy": 2.3130958676338196, "loss/hidden": 1.736328125, "loss/jsd": 0.0, "loss/logits": 0.19926705956459045, "step": 3022 }, { "epoch": 0.13745454545454547, "grad_norm": 6.1875, "grad_norm_var": 0.15709635416666667, "learning_rate": 0.0001, "loss": 6.1118, "loss/crossentropy": 2.407963901758194, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.19987719506025314, "step": 3024 }, { "epoch": 0.13754545454545455, "grad_norm": 6.625, "grad_norm_var": 0.15738525390625, "learning_rate": 0.0001, "loss": 6.4052, "loss/crossentropy": 2.570662796497345, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.2131451591849327, "step": 3026 }, { "epoch": 0.13763636363636364, "grad_norm": 6.40625, "grad_norm_var": 0.14039306640625, "learning_rate": 0.0001, "loss": 6.2262, "loss/crossentropy": 2.4395031332969666, "loss/hidden": 1.724609375, "loss/jsd": 0.0, "loss/logits": 0.2062043584883213, "step": 3028 }, { "epoch": 0.13772727272727273, "grad_norm": 5.75, "grad_norm_var": 0.1484375, "learning_rate": 0.0001, "loss": 6.2883, "loss/crossentropy": 2.4541375637054443, "loss/hidden": 1.728515625, "loss/jsd": 0.0, "loss/logits": 0.21056897938251495, "step": 3030 }, { "epoch": 0.1378181818181818, "grad_norm": 5.1875, "grad_norm_var": 0.24055989583333334, "learning_rate": 0.0001, "loss": 6.3988, "loss/crossentropy": 2.6006484627723694, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.2093096598982811, "step": 3032 }, { "epoch": 0.1379090909090909, "grad_norm": 6.96875, "grad_norm_var": 0.2688802083333333, "learning_rate": 0.0001, "loss": 6.7346, "loss/crossentropy": 2.6771397590637207, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.23386963456869125, "step": 3034 }, { "epoch": 0.138, "grad_norm": 5.46875, "grad_norm_var": 0.2819295247395833, "learning_rate": 0.0001, "loss": 6.0893, "loss/crossentropy": 2.425393968820572, "loss/hidden": 1.697265625, "loss/jsd": 0.0, "loss/logits": 0.196660365909338, "step": 3036 }, { "epoch": 0.1380909090909091, "grad_norm": 6.1875, "grad_norm_var": 0.22121988932291667, "learning_rate": 0.0001, "loss": 6.4291, "loss/crossentropy": 2.561570405960083, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.2152658775448799, "step": 3038 }, { "epoch": 0.13818181818181818, "grad_norm": 6.1875, "grad_norm_var": 0.21415608723958332, "learning_rate": 0.0001, "loss": 6.1424, "loss/crossentropy": 2.2494788765907288, "loss/hidden": 1.763671875, "loss/jsd": 0.0, "loss/logits": 0.2129259668290615, "step": 3040 }, { "epoch": 0.13827272727272727, "grad_norm": 6.4375, "grad_norm_var": 0.21308186848958333, "learning_rate": 0.0001, "loss": 6.4271, "loss/crossentropy": 2.6930670738220215, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.20660628378391266, "step": 3042 }, { "epoch": 0.13836363636363636, "grad_norm": 6.71875, "grad_norm_var": 0.28761393229166665, "learning_rate": 0.0001, "loss": 5.9235, "loss/crossentropy": 2.1991688311100006, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.19899709150195122, "step": 3044 }, { "epoch": 0.13845454545454544, "grad_norm": 5.96875, "grad_norm_var": 0.2745442708333333, "learning_rate": 0.0001, "loss": 5.9725, "loss/crossentropy": 2.2899543046951294, "loss/hidden": 1.685546875, "loss/jsd": 0.0, "loss/logits": 0.19970311224460602, "step": 3046 }, { "epoch": 0.13854545454545455, "grad_norm": 5.84375, "grad_norm_var": 0.21105143229166667, "learning_rate": 0.0001, "loss": 6.1456, "loss/crossentropy": 2.4467918276786804, "loss/hidden": 1.689453125, "loss/jsd": 0.0, "loss/logits": 0.20093349739909172, "step": 3048 }, { "epoch": 0.13863636363636364, "grad_norm": 6.9375, "grad_norm_var": 0.20562744140625, "learning_rate": 0.0001, "loss": 6.3057, "loss/crossentropy": 2.4102434515953064, "loss/hidden": 1.771484375, "loss/jsd": 0.0, "loss/logits": 0.21239475905895233, "step": 3050 }, { "epoch": 0.13872727272727273, "grad_norm": 6.1875, "grad_norm_var": 0.152587890625, "learning_rate": 0.0001, "loss": 6.2345, "loss/crossentropy": 2.430996596813202, "loss/hidden": 1.685546875, "loss/jsd": 0.0, "loss/logits": 0.21179084852337837, "step": 3052 }, { "epoch": 0.1388181818181818, "grad_norm": 6.375, "grad_norm_var": 1.0397420247395834, "learning_rate": 0.0001, "loss": 6.1998, "loss/crossentropy": 2.4269062280654907, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.2034621797502041, "step": 3054 }, { "epoch": 0.1389090909090909, "grad_norm": 6.0, "grad_norm_var": 1.0402303059895834, "learning_rate": 0.0001, "loss": 6.3154, "loss/crossentropy": 2.4813053607940674, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.2119206078350544, "step": 3056 }, { "epoch": 0.139, "grad_norm": 6.75, "grad_norm_var": 1.0055826822916667, "learning_rate": 0.0001, "loss": 6.5347, "loss/crossentropy": 2.6199982166290283, "loss/hidden": 1.720703125, "loss/jsd": 0.0, "loss/logits": 0.21939776465296745, "step": 3058 }, { "epoch": 0.1390909090909091, "grad_norm": 6.5625, "grad_norm_var": 0.9855753580729166, "learning_rate": 0.0001, "loss": 6.4977, "loss/crossentropy": 2.691272020339966, "loss/hidden": 1.720703125, "loss/jsd": 0.0, "loss/logits": 0.2085680179297924, "step": 3060 }, { "epoch": 0.13918181818181818, "grad_norm": 6.25, "grad_norm_var": 0.9615519205729167, "learning_rate": 0.0001, "loss": 6.416, "loss/crossentropy": 2.6311779618263245, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.20817000791430473, "step": 3062 }, { "epoch": 0.13927272727272727, "grad_norm": 6.625, "grad_norm_var": 0.926416015625, "learning_rate": 0.0001, "loss": 6.2448, "loss/crossentropy": 2.373628556728363, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.2148534208536148, "step": 3064 }, { "epoch": 0.13936363636363636, "grad_norm": 6.59375, "grad_norm_var": 0.90445556640625, "learning_rate": 0.0001, "loss": 6.4739, "loss/crossentropy": 2.6215447783470154, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.21452797204256058, "step": 3066 }, { "epoch": 0.13945454545454544, "grad_norm": 7.0625, "grad_norm_var": 0.8839152018229167, "learning_rate": 0.0001, "loss": 6.4285, "loss/crossentropy": 2.5278608798980713, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.21779390797019005, "step": 3068 }, { "epoch": 0.13954545454545456, "grad_norm": 6.25, "grad_norm_var": 0.063134765625, "learning_rate": 0.0001, "loss": 6.3719, "loss/crossentropy": 2.5908570885658264, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.20622896030545235, "step": 3070 }, { "epoch": 0.13963636363636364, "grad_norm": 6.5625, "grad_norm_var": 0.04608968098958333, "learning_rate": 0.0001, "loss": 6.3121, "loss/crossentropy": 2.4452916979789734, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.21480408310890198, "step": 3072 }, { "epoch": 0.13972727272727273, "grad_norm": 5.9375, "grad_norm_var": 0.07115885416666666, "learning_rate": 0.0001, "loss": 6.12, "loss/crossentropy": 2.4010531902313232, "loss/hidden": 1.724609375, "loss/jsd": 0.0, "loss/logits": 0.19943542033433914, "step": 3074 }, { "epoch": 0.1398181818181818, "grad_norm": 6.53125, "grad_norm_var": 0.07174479166666667, "learning_rate": 0.0001, "loss": 6.3373, "loss/crossentropy": 2.4282415211200714, "loss/hidden": 1.732421875, "loss/jsd": 0.0, "loss/logits": 0.21766271069645882, "step": 3076 }, { "epoch": 0.1399090909090909, "grad_norm": 6.5625, "grad_norm_var": 0.08839518229166667, "learning_rate": 0.0001, "loss": 6.3363, "loss/crossentropy": 2.5349613428115845, "loss/hidden": 1.685546875, "loss/jsd": 0.0, "loss/logits": 0.2115759439766407, "step": 3078 }, { "epoch": 0.14, "grad_norm": 6.09375, "grad_norm_var": 0.20690104166666667, "learning_rate": 0.0001, "loss": 6.6875, "loss/crossentropy": 2.732331335544586, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.22207534685730934, "step": 3080 }, { "epoch": 0.1400909090909091, "grad_norm": 6.71875, "grad_norm_var": 0.212109375, "learning_rate": 0.0001, "loss": 6.4911, "loss/crossentropy": 2.642714560031891, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.2145271599292755, "step": 3082 }, { "epoch": 0.14018181818181819, "grad_norm": 6.1875, "grad_norm_var": 0.20142822265625, "learning_rate": 0.0001, "loss": 6.5059, "loss/crossentropy": 2.6583288311958313, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.21523040533065796, "step": 3084 }, { "epoch": 0.14027272727272727, "grad_norm": 6.5, "grad_norm_var": 0.21432291666666667, "learning_rate": 0.0001, "loss": 6.5197, "loss/crossentropy": 2.7320054173469543, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.21138197928667068, "step": 3086 }, { "epoch": 0.14036363636363636, "grad_norm": 8.5625, "grad_norm_var": 0.5174479166666667, "learning_rate": 0.0001, "loss": 6.4175, "loss/crossentropy": 2.4788646697998047, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.2215995192527771, "step": 3088 }, { "epoch": 0.14045454545454544, "grad_norm": 6.03125, "grad_norm_var": 0.5737630208333333, "learning_rate": 0.0001, "loss": 6.6982, "loss/crossentropy": 2.770599365234375, "loss/hidden": 1.708984375, "loss/jsd": 0.0, "loss/logits": 0.22185972332954407, "step": 3090 }, { "epoch": 0.14054545454545456, "grad_norm": 6.96875, "grad_norm_var": 0.5783162434895833, "learning_rate": 0.0001, "loss": 6.1807, "loss/crossentropy": 2.4286603331565857, "loss/hidden": 1.728515625, "loss/jsd": 0.0, "loss/logits": 0.20235063880681992, "step": 3092 }, { "epoch": 0.14063636363636364, "grad_norm": 6.78125, "grad_norm_var": 0.61304931640625, "learning_rate": 0.0001, "loss": 6.0822, "loss/crossentropy": 2.3441578447818756, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.1999749168753624, "step": 3094 }, { "epoch": 0.14072727272727273, "grad_norm": 6.53125, "grad_norm_var": 0.5364217122395833, "learning_rate": 0.0001, "loss": 6.292, "loss/crossentropy": 2.438682198524475, "loss/hidden": 1.751953125, "loss/jsd": 0.0, "loss/logits": 0.21014123037457466, "step": 3096 }, { "epoch": 0.14081818181818181, "grad_norm": 5.90625, "grad_norm_var": 0.5619099934895834, "learning_rate": 0.0001, "loss": 6.401, "loss/crossentropy": 2.523850977420807, "loss/hidden": 1.720703125, "loss/jsd": 0.0, "loss/logits": 0.2156413272023201, "step": 3098 }, { "epoch": 0.1409090909090909, "grad_norm": 6.375, "grad_norm_var": 0.5282185872395834, "learning_rate": 0.0001, "loss": 6.4411, "loss/crossentropy": 2.584400177001953, "loss/hidden": 1.697265625, "loss/jsd": 0.0, "loss/logits": 0.2159481830894947, "step": 3100 }, { "epoch": 0.141, "grad_norm": 10.8125, "grad_norm_var": 1.6191243489583333, "learning_rate": 0.0001, "loss": 6.5161, "loss/crossentropy": 2.5830821990966797, "loss/hidden": 1.763671875, "loss/jsd": 0.0, "loss/logits": 0.21693133935332298, "step": 3102 }, { "epoch": 0.1410909090909091, "grad_norm": 5.78125, "grad_norm_var": 1.5120402018229167, "learning_rate": 0.0001, "loss": 6.0273, "loss/crossentropy": 2.4222531616687775, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.19292956963181496, "step": 3104 }, { "epoch": 0.14118181818181819, "grad_norm": 6.0625, "grad_norm_var": 1.46851806640625, "learning_rate": 0.0001, "loss": 6.3956, "loss/crossentropy": 2.5347251892089844, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.21304012835025787, "step": 3106 }, { "epoch": 0.14127272727272727, "grad_norm": 6.4375, "grad_norm_var": 1.47379150390625, "learning_rate": 0.0001, "loss": 6.6491, "loss/crossentropy": 2.64521187543869, "loss/hidden": 1.740234375, "loss/jsd": 0.0, "loss/logits": 0.2263670563697815, "step": 3108 }, { "epoch": 0.14136363636363636, "grad_norm": 6.03125, "grad_norm_var": 1.43658447265625, "learning_rate": 0.0001, "loss": 6.4133, "loss/crossentropy": 2.566640794277191, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.21630220487713814, "step": 3110 }, { "epoch": 0.14145454545454544, "grad_norm": 6.34375, "grad_norm_var": 1.427978515625, "learning_rate": 0.0001, "loss": 6.2292, "loss/crossentropy": 2.3913604617118835, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.20644504204392433, "step": 3112 }, { "epoch": 0.14154545454545456, "grad_norm": 5.78125, "grad_norm_var": 1.458837890625, "learning_rate": 0.0001, "loss": 6.0109, "loss/crossentropy": 2.405590236186981, "loss/hidden": 1.677734375, "loss/jsd": 0.0, "loss/logits": 0.19275681301951408, "step": 3114 }, { "epoch": 0.14163636363636364, "grad_norm": 7.15625, "grad_norm_var": 1.47379150390625, "learning_rate": 0.0001, "loss": 6.408, "loss/crossentropy": 2.5116565227508545, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.21775518730282784, "step": 3116 }, { "epoch": 0.14172727272727273, "grad_norm": 6.5625, "grad_norm_var": 0.22928059895833333, "learning_rate": 0.0001, "loss": 6.2935, "loss/crossentropy": 2.5012866258621216, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.20852204412221909, "step": 3118 }, { "epoch": 0.14181818181818182, "grad_norm": 5.65625, "grad_norm_var": 0.22646077473958334, "learning_rate": 0.0001, "loss": 6.023, "loss/crossentropy": 2.341329425573349, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.1982453428208828, "step": 3120 }, { "epoch": 0.1419090909090909, "grad_norm": 9.3125, "grad_norm_var": 0.717822265625, "learning_rate": 0.0001, "loss": 6.7331, "loss/crossentropy": 2.6950594782829285, "loss/hidden": 1.720703125, "loss/jsd": 0.0, "loss/logits": 0.23173213005065918, "step": 3122 }, { "epoch": 0.142, "grad_norm": 6.28125, "grad_norm_var": 0.7101521809895833, "learning_rate": 0.0001, "loss": 6.1096, "loss/crossentropy": 2.437426447868347, "loss/hidden": 1.689453125, "loss/jsd": 0.0, "loss/logits": 0.1982693299651146, "step": 3124 }, { "epoch": 0.1420909090909091, "grad_norm": 6.03125, "grad_norm_var": 0.7148274739583333, "learning_rate": 0.0001, "loss": 6.0396, "loss/crossentropy": 2.332288771867752, "loss/hidden": 1.685546875, "loss/jsd": 0.0, "loss/logits": 0.20217252522706985, "step": 3126 }, { "epoch": 0.1421818181818182, "grad_norm": 6.78125, "grad_norm_var": 0.7779947916666666, "learning_rate": 0.0001, "loss": 6.464, "loss/crossentropy": 2.5025664269924164, "loss/hidden": 1.806640625, "loss/jsd": 0.0, "loss/logits": 0.21548134833574295, "step": 3128 }, { "epoch": 0.14227272727272727, "grad_norm": 6.6875, "grad_norm_var": 0.7397745768229167, "learning_rate": 0.0001, "loss": 5.9311, "loss/crossentropy": 2.273518890142441, "loss/hidden": 1.708984375, "loss/jsd": 0.0, "loss/logits": 0.19485770165920258, "step": 3130 }, { "epoch": 0.14236363636363636, "grad_norm": 7.53125, "grad_norm_var": 0.8291300455729167, "learning_rate": 0.0001, "loss": 6.5108, "loss/crossentropy": 2.6496514678001404, "loss/hidden": 1.732421875, "loss/jsd": 0.0, "loss/logits": 0.21287404373288155, "step": 3132 }, { "epoch": 0.14245454545454544, "grad_norm": 6.28125, "grad_norm_var": 0.8141927083333333, "learning_rate": 0.0001, "loss": 6.5124, "loss/crossentropy": 2.6471863985061646, "loss/hidden": 1.712890625, "loss/jsd": 0.0, "loss/logits": 0.21523334830999374, "step": 3134 }, { "epoch": 0.14254545454545456, "grad_norm": 6.53125, "grad_norm_var": 0.797265625, "learning_rate": 0.0001, "loss": 5.8947, "loss/crossentropy": 2.269842505455017, "loss/hidden": 1.689453125, "loss/jsd": 0.0, "loss/logits": 0.1935427486896515, "step": 3136 }, { "epoch": 0.14263636363636364, "grad_norm": 6.6875, "grad_norm_var": 0.34049072265625, "learning_rate": 0.0001, "loss": 5.8291, "loss/crossentropy": 2.28103169798851, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.18292763829231262, "step": 3138 }, { "epoch": 0.14272727272727273, "grad_norm": 5.71875, "grad_norm_var": 0.38228759765625, "learning_rate": 0.0001, "loss": 6.3732, "loss/crossentropy": 2.5278252959251404, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.21383924037218094, "step": 3140 }, { "epoch": 0.14281818181818182, "grad_norm": 6.4375, "grad_norm_var": 0.35833333333333334, "learning_rate": 0.0001, "loss": 6.2072, "loss/crossentropy": 2.43711119890213, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.20474699512124062, "step": 3142 }, { "epoch": 0.1429090909090909, "grad_norm": 6.0625, "grad_norm_var": 0.2841796875, "learning_rate": 0.0001, "loss": 6.3299, "loss/crossentropy": 2.4830440878868103, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.21417544037103653, "step": 3144 }, { "epoch": 0.143, "grad_norm": 5.84375, "grad_norm_var": 0.2967732747395833, "learning_rate": 0.0001, "loss": 6.6702, "loss/crossentropy": 2.736515522003174, "loss/hidden": 1.771484375, "loss/jsd": 0.0, "loss/logits": 0.21621844917535782, "step": 3146 }, { "epoch": 0.1430909090909091, "grad_norm": 6.40625, "grad_norm_var": 0.11545817057291667, "learning_rate": 0.0001, "loss": 6.6507, "loss/crossentropy": 2.8378498554229736, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.2129288949072361, "step": 3148 }, { "epoch": 0.1431818181818182, "grad_norm": 5.71875, "grad_norm_var": 0.14270426432291666, "learning_rate": 0.0001, "loss": 6.3259, "loss/crossentropy": 2.5644309520721436, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.20700686052441597, "step": 3150 }, { "epoch": 0.14327272727272727, "grad_norm": 6.5, "grad_norm_var": 0.13232014973958334, "learning_rate": 0.0001, "loss": 6.5251, "loss/crossentropy": 2.6771376729011536, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.21487358957529068, "step": 3152 }, { "epoch": 0.14336363636363636, "grad_norm": 5.875, "grad_norm_var": 0.14416910807291666, "learning_rate": 0.0001, "loss": 6.1875, "loss/crossentropy": 2.4625123739242554, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.201800424605608, "step": 3154 }, { "epoch": 0.14345454545454545, "grad_norm": 6.03125, "grad_norm_var": 0.14244384765625, "learning_rate": 0.0001, "loss": 5.9903, "loss/crossentropy": 2.34772264957428, "loss/hidden": 1.689453125, "loss/jsd": 0.0, "loss/logits": 0.19531260803341866, "step": 3156 }, { "epoch": 0.14354545454545453, "grad_norm": 6.21875, "grad_norm_var": 0.13489176432291666, "learning_rate": 0.0001, "loss": 6.3432, "loss/crossentropy": 2.4786987900733948, "loss/hidden": 1.728515625, "loss/jsd": 0.0, "loss/logits": 0.21360202133655548, "step": 3158 }, { "epoch": 0.14363636363636365, "grad_norm": 6.0625, "grad_norm_var": 0.13326822916666667, "learning_rate": 0.0001, "loss": 6.5437, "loss/crossentropy": 2.6473403573036194, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.21912996470928192, "step": 3160 }, { "epoch": 0.14372727272727273, "grad_norm": 6.5625, "grad_norm_var": 0.131494140625, "learning_rate": 0.0001, "loss": 6.6534, "loss/crossentropy": 2.7432695031166077, "loss/hidden": 1.720703125, "loss/jsd": 0.0, "loss/logits": 0.21894089877605438, "step": 3162 }, { "epoch": 0.14381818181818182, "grad_norm": 6.375, "grad_norm_var": 0.168994140625, "learning_rate": 0.0001, "loss": 6.4093, "loss/crossentropy": 2.6641159057617188, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.2069423608481884, "step": 3164 }, { "epoch": 0.1439090909090909, "grad_norm": 7.21875, "grad_norm_var": 0.2208984375, "learning_rate": 0.0001, "loss": 6.2503, "loss/crossentropy": 2.4263925552368164, "loss/hidden": 1.728515625, "loss/jsd": 0.0, "loss/logits": 0.20953703299164772, "step": 3166 }, { "epoch": 0.144, "grad_norm": 6.71875, "grad_norm_var": 0.22760009765625, "learning_rate": 0.0001, "loss": 6.5146, "loss/crossentropy": 2.6620755791664124, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.216500885784626, "step": 3168 }, { "epoch": 0.1440909090909091, "grad_norm": 6.0625, "grad_norm_var": 0.19000244140625, "learning_rate": 0.0001, "loss": 5.9938, "loss/crossentropy": 2.3297001719474792, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.19492904841899872, "step": 3170 }, { "epoch": 0.1441818181818182, "grad_norm": 6.25, "grad_norm_var": 0.15575764973958334, "learning_rate": 0.0001, "loss": 6.1598, "loss/crossentropy": 2.3784215450286865, "loss/hidden": 1.720703125, "loss/jsd": 0.0, "loss/logits": 0.2060680091381073, "step": 3172 }, { "epoch": 0.14427272727272727, "grad_norm": 6.03125, "grad_norm_var": 0.2130859375, "learning_rate": 0.0001, "loss": 6.2867, "loss/crossentropy": 2.499049484729767, "loss/hidden": 1.708984375, "loss/jsd": 0.0, "loss/logits": 0.2078666016459465, "step": 3174 }, { "epoch": 0.14436363636363636, "grad_norm": 6.28125, "grad_norm_var": 0.196337890625, "learning_rate": 0.0001, "loss": 6.4087, "loss/crossentropy": 2.6098875999450684, "loss/hidden": 1.685546875, "loss/jsd": 0.0, "loss/logits": 0.21132546663284302, "step": 3176 }, { "epoch": 0.14445454545454545, "grad_norm": 6.125, "grad_norm_var": 0.22316080729166668, "learning_rate": 0.0001, "loss": 6.509, "loss/crossentropy": 2.64523446559906, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.2158697210252285, "step": 3178 }, { "epoch": 0.14454545454545453, "grad_norm": 5.84375, "grad_norm_var": 0.352734375, "learning_rate": 0.0001, "loss": 6.169, "loss/crossentropy": 2.4140483140945435, "loss/hidden": 1.759765625, "loss/jsd": 0.0, "loss/logits": 0.19952339306473732, "step": 3180 }, { "epoch": 0.14463636363636365, "grad_norm": 5.9375, "grad_norm_var": 0.30455322265625, "learning_rate": 0.0001, "loss": 6.2608, "loss/crossentropy": 2.5134955048561096, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.2073458544909954, "step": 3182 }, { "epoch": 0.14472727272727273, "grad_norm": 5.5, "grad_norm_var": 0.3997395833333333, "learning_rate": 0.0001, "loss": 5.9785, "loss/crossentropy": 2.3909292221069336, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.19410915672779083, "step": 3184 }, { "epoch": 0.14481818181818182, "grad_norm": 6.0625, "grad_norm_var": 0.4002888997395833, "learning_rate": 0.0001, "loss": 6.1802, "loss/crossentropy": 2.436787962913513, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.20324410125613213, "step": 3186 }, { "epoch": 0.1449090909090909, "grad_norm": 5.90625, "grad_norm_var": 0.40015869140625, "learning_rate": 0.0001, "loss": 6.2505, "loss/crossentropy": 2.4683521389961243, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.20750975981354713, "step": 3188 }, { "epoch": 0.145, "grad_norm": 5.6875, "grad_norm_var": 0.3304036458333333, "learning_rate": 0.0001, "loss": 5.7761, "loss/crossentropy": 2.272940218448639, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.18214993551373482, "step": 3190 }, { "epoch": 0.1450909090909091, "grad_norm": 6.84375, "grad_norm_var": 0.3655598958333333, "learning_rate": 0.0001, "loss": 6.6165, "loss/crossentropy": 2.7340283393859863, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.2198871709406376, "step": 3192 }, { "epoch": 0.1451818181818182, "grad_norm": 7.5, "grad_norm_var": 0.51090087890625, "learning_rate": 0.0001, "loss": 6.7998, "loss/crossentropy": 2.7752973437309265, "loss/hidden": 1.748046875, "loss/jsd": 0.0, "loss/logits": 0.22764497622847557, "step": 3194 }, { "epoch": 0.14527272727272728, "grad_norm": 5.90625, "grad_norm_var": 0.3478474934895833, "learning_rate": 0.0001, "loss": 6.3451, "loss/crossentropy": 2.621429204940796, "loss/hidden": 1.685546875, "loss/jsd": 0.0, "loss/logits": 0.20381306111812592, "step": 3196 }, { "epoch": 0.14536363636363636, "grad_norm": 6.25, "grad_norm_var": 0.34231363932291664, "learning_rate": 0.0001, "loss": 6.6629, "loss/crossentropy": 2.795845866203308, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.21444140747189522, "step": 3198 }, { "epoch": 0.14545454545454545, "grad_norm": 6.53125, "grad_norm_var": 0.2626139322916667, "learning_rate": 0.0001, "loss": 6.3478, "loss/crossentropy": 2.5634111166000366, "loss/hidden": 1.689453125, "loss/jsd": 0.0, "loss/logits": 0.20949756726622581, "step": 3200 }, { "epoch": 0.14554545454545453, "grad_norm": 6.46875, "grad_norm_var": 0.2559733072916667, "learning_rate": 0.0001, "loss": 6.1754, "loss/crossentropy": 2.470259130001068, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.1998089700937271, "step": 3202 }, { "epoch": 0.14563636363636365, "grad_norm": 6.59375, "grad_norm_var": 0.24147135416666668, "learning_rate": 0.0001, "loss": 6.4194, "loss/crossentropy": 2.436974048614502, "loss/hidden": 1.763671875, "loss/jsd": 0.0, "loss/logits": 0.2218710593879223, "step": 3204 }, { "epoch": 0.14572727272727273, "grad_norm": 6.84375, "grad_norm_var": 0.17398681640625, "learning_rate": 0.0001, "loss": 6.6927, "loss/crossentropy": 2.7827082872390747, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.22147186845541, "step": 3206 }, { "epoch": 0.14581818181818182, "grad_norm": 5.875, "grad_norm_var": 0.162109375, "learning_rate": 0.0001, "loss": 6.2079, "loss/crossentropy": 2.420740306377411, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.20840130001306534, "step": 3208 }, { "epoch": 0.1459090909090909, "grad_norm": 5.84375, "grad_norm_var": 0.09052327473958334, "learning_rate": 0.0001, "loss": 6.5313, "loss/crossentropy": 2.677858352661133, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.21581730619072914, "step": 3210 }, { "epoch": 0.146, "grad_norm": 6.4375, "grad_norm_var": 0.0927734375, "learning_rate": 0.0001, "loss": 6.1268, "loss/crossentropy": 2.4205955266952515, "loss/hidden": 1.724609375, "loss/jsd": 0.0, "loss/logits": 0.1981612965464592, "step": 3212 }, { "epoch": 0.1460909090909091, "grad_norm": 6.15625, "grad_norm_var": 0.10123697916666667, "learning_rate": 0.0001, "loss": 6.4647, "loss/crossentropy": 2.6974762678146362, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.20621944218873978, "step": 3214 }, { "epoch": 0.1461818181818182, "grad_norm": 5.625, "grad_norm_var": 0.138134765625, "learning_rate": 0.0001, "loss": 5.8174, "loss/crossentropy": 2.205770790576935, "loss/hidden": 1.689453125, "loss/jsd": 0.0, "loss/logits": 0.1922134943306446, "step": 3216 }, { "epoch": 0.14627272727272728, "grad_norm": 6.46875, "grad_norm_var": 0.19097900390625, "learning_rate": 0.0001, "loss": 5.8382, "loss/crossentropy": 2.2847235798835754, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.18933698162436485, "step": 3218 }, { "epoch": 0.14636363636363636, "grad_norm": 6.9375, "grad_norm_var": 0.20621337890625, "learning_rate": 0.0001, "loss": 5.9726, "loss/crossentropy": 2.3346540927886963, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.19270380586385727, "step": 3220 }, { "epoch": 0.14645454545454545, "grad_norm": 9.25, "grad_norm_var": 0.8003743489583334, "learning_rate": 0.0001, "loss": 6.2109, "loss/crossentropy": 2.460857331752777, "loss/hidden": 1.724609375, "loss/jsd": 0.0, "loss/logits": 0.20254746451973915, "step": 3222 }, { "epoch": 0.14654545454545453, "grad_norm": 6.0, "grad_norm_var": 0.8043619791666666, "learning_rate": 0.0001, "loss": 6.0801, "loss/crossentropy": 2.4283517003059387, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.19759635999798775, "step": 3224 }, { "epoch": 0.14663636363636365, "grad_norm": 7.65625, "grad_norm_var": 0.9048828125, "learning_rate": 0.0001, "loss": 6.692, "loss/crossentropy": 2.823076069355011, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.21735847368836403, "step": 3226 }, { "epoch": 0.14672727272727273, "grad_norm": 6.6875, "grad_norm_var": 0.92379150390625, "learning_rate": 0.0001, "loss": 6.5225, "loss/crossentropy": 2.632829010486603, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.21826696768403053, "step": 3228 }, { "epoch": 0.14681818181818182, "grad_norm": 5.375, "grad_norm_var": 0.971337890625, "learning_rate": 0.0001, "loss": 6.1095, "loss/crossentropy": 2.4285095930099487, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.20169299095869064, "step": 3230 }, { "epoch": 0.1469090909090909, "grad_norm": 6.15625, "grad_norm_var": 0.9289347330729166, "learning_rate": 0.0001, "loss": 5.9908, "loss/crossentropy": 2.340480387210846, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.19432570785284042, "step": 3232 }, { "epoch": 0.147, "grad_norm": 6.1875, "grad_norm_var": 0.8321451822916667, "learning_rate": 0.0001, "loss": 6.0799, "loss/crossentropy": 2.359663665294647, "loss/hidden": 1.708984375, "loss/jsd": 0.0, "loss/logits": 0.20112543180584908, "step": 3234 }, { "epoch": 0.14709090909090908, "grad_norm": 6.0625, "grad_norm_var": 0.81402587890625, "learning_rate": 0.0001, "loss": 6.2199, "loss/crossentropy": 2.4710846543312073, "loss/hidden": 1.693359375, "loss/jsd": 0.0, "loss/logits": 0.20554166287183762, "step": 3236 }, { "epoch": 0.1471818181818182, "grad_norm": 6.40625, "grad_norm_var": 0.29010416666666666, "learning_rate": 0.0001, "loss": 6.4058, "loss/crossentropy": 2.581398367881775, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.21174193918704987, "step": 3238 }, { "epoch": 0.14727272727272728, "grad_norm": 5.875, "grad_norm_var": 0.31324462890625, "learning_rate": 0.0001, "loss": 6.0885, "loss/crossentropy": 2.4488173723220825, "loss/hidden": 1.685546875, "loss/jsd": 0.0, "loss/logits": 0.19541488215327263, "step": 3240 }, { "epoch": 0.14736363636363636, "grad_norm": 5.6875, "grad_norm_var": 0.23023681640625, "learning_rate": 0.0001, "loss": 6.1259, "loss/crossentropy": 2.4630167484283447, "loss/hidden": 1.693359375, "loss/jsd": 0.0, "loss/logits": 0.19695354625582695, "step": 3242 }, { "epoch": 0.14745454545454545, "grad_norm": 6.0625, "grad_norm_var": 0.14934895833333334, "learning_rate": 0.0001, "loss": 6.5853, "loss/crossentropy": 2.7549890279769897, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.21486708894371986, "step": 3244 }, { "epoch": 0.14754545454545454, "grad_norm": 5.8125, "grad_norm_var": 0.11222330729166667, "learning_rate": 0.0001, "loss": 6.3714, "loss/crossentropy": 2.562860906124115, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.21328014507889748, "step": 3246 }, { "epoch": 0.14763636363636365, "grad_norm": 5.65625, "grad_norm_var": 0.13253580729166667, "learning_rate": 0.0001, "loss": 5.9784, "loss/crossentropy": 2.4264267683029175, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.1915271282196045, "step": 3248 }, { "epoch": 0.14772727272727273, "grad_norm": 6.03125, "grad_norm_var": 0.30924072265625, "learning_rate": 0.0001, "loss": 6.3057, "loss/crossentropy": 2.513372927904129, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.21008849889039993, "step": 3250 }, { "epoch": 0.14781818181818182, "grad_norm": 6.125, "grad_norm_var": 0.310400390625, "learning_rate": 0.0001, "loss": 6.716, "loss/crossentropy": 2.696326196193695, "loss/hidden": 1.736328125, "loss/jsd": 0.0, "loss/logits": 0.22833311185240746, "step": 3252 }, { "epoch": 0.1479090909090909, "grad_norm": 6.96875, "grad_norm_var": 0.3664347330729167, "learning_rate": 0.0001, "loss": 6.1506, "loss/crossentropy": 2.4923298358917236, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.20058954879641533, "step": 3254 }, { "epoch": 0.148, "grad_norm": 6.90625, "grad_norm_var": 0.5633748372395834, "learning_rate": 0.0001, "loss": 6.8374, "loss/crossentropy": 2.7663440704345703, "loss/hidden": 1.720703125, "loss/jsd": 0.0, "loss/logits": 0.23503388091921806, "step": 3256 }, { "epoch": 0.14809090909090908, "grad_norm": 6.21875, "grad_norm_var": 0.5183430989583333, "learning_rate": 0.0001, "loss": 6.1427, "loss/crossentropy": 2.4006778597831726, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.20604054257273674, "step": 3258 }, { "epoch": 0.1481818181818182, "grad_norm": 6.0625, "grad_norm_var": 0.5278279622395833, "learning_rate": 0.0001, "loss": 5.7928, "loss/crossentropy": 2.188796639442444, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.18891680240631104, "step": 3260 }, { "epoch": 0.14827272727272728, "grad_norm": 5.96875, "grad_norm_var": 0.59195556640625, "learning_rate": 0.0001, "loss": 5.813, "loss/crossentropy": 2.257271885871887, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.18721157312393188, "step": 3262 }, { "epoch": 0.14836363636363636, "grad_norm": 6.53125, "grad_norm_var": 0.5692545572916666, "learning_rate": 0.0001, "loss": 6.0815, "loss/crossentropy": 2.4430121779441833, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.1956843063235283, "step": 3264 }, { "epoch": 0.14845454545454545, "grad_norm": 6.125, "grad_norm_var": 0.41562093098958336, "learning_rate": 0.0001, "loss": 6.6197, "loss/crossentropy": 2.6902194023132324, "loss/hidden": 1.693359375, "loss/jsd": 0.0, "loss/logits": 0.22361619770526886, "step": 3266 }, { "epoch": 0.14854545454545454, "grad_norm": 5.75, "grad_norm_var": 0.45826822916666665, "learning_rate": 0.0001, "loss": 6.1923, "loss/crossentropy": 2.528076171875, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.19884108752012253, "step": 3268 }, { "epoch": 0.14863636363636365, "grad_norm": 5.84375, "grad_norm_var": 0.41041259765625, "learning_rate": 0.0001, "loss": 6.3671, "loss/crossentropy": 2.51224684715271, "loss/hidden": 1.689453125, "loss/jsd": 0.0, "loss/logits": 0.21654174849390984, "step": 3270 }, { "epoch": 0.14872727272727274, "grad_norm": 7.5, "grad_norm_var": 0.2640462239583333, "learning_rate": 0.0001, "loss": 6.3784, "loss/crossentropy": 2.628266394138336, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.2035244032740593, "step": 3272 }, { "epoch": 0.14881818181818182, "grad_norm": 5.96875, "grad_norm_var": 0.2896484375, "learning_rate": 0.0001, "loss": 6.2468, "loss/crossentropy": 2.516316294670105, "loss/hidden": 1.697265625, "loss/jsd": 0.0, "loss/logits": 0.20331741124391556, "step": 3274 }, { "epoch": 0.1489090909090909, "grad_norm": 5.96875, "grad_norm_var": 0.3046183268229167, "learning_rate": 0.0001, "loss": 6.065, "loss/crossentropy": 2.3304448425769806, "loss/hidden": 1.701171875, "loss/jsd": 0.0, "loss/logits": 0.20333748683333397, "step": 3276 }, { "epoch": 0.149, "grad_norm": 5.75, "grad_norm_var": 0.27545166015625, "learning_rate": 0.0001, "loss": 6.3784, "loss/crossentropy": 2.4830278754234314, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.21414393931627274, "step": 3278 }, { "epoch": 0.14909090909090908, "grad_norm": 5.71875, "grad_norm_var": 0.2673828125, "learning_rate": 0.0001, "loss": 5.992, "loss/crossentropy": 2.349488377571106, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.19393620640039444, "step": 3280 }, { "epoch": 0.1491818181818182, "grad_norm": 6.25, "grad_norm_var": 0.27928059895833335, "learning_rate": 0.0001, "loss": 6.6005, "loss/crossentropy": 2.6587527990341187, "loss/hidden": 1.736328125, "loss/jsd": 0.0, "loss/logits": 0.22053823620080948, "step": 3282 }, { "epoch": 0.14927272727272728, "grad_norm": 6.3125, "grad_norm_var": 0.5417805989583333, "learning_rate": 0.0001, "loss": 6.4318, "loss/crossentropy": 2.5062549114227295, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.21950970217585564, "step": 3284 }, { "epoch": 0.14936363636363637, "grad_norm": 6.0625, "grad_norm_var": 0.529931640625, "learning_rate": 0.0001, "loss": 6.4054, "loss/crossentropy": 2.618837356567383, "loss/hidden": 1.685546875, "loss/jsd": 0.0, "loss/logits": 0.21009937301278114, "step": 3286 }, { "epoch": 0.14945454545454545, "grad_norm": 6.0, "grad_norm_var": 0.44830322265625, "learning_rate": 0.0001, "loss": 6.525, "loss/crossentropy": 2.6555290818214417, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.21624113246798515, "step": 3288 }, { "epoch": 0.14954545454545454, "grad_norm": 5.84375, "grad_norm_var": 0.43437093098958335, "learning_rate": 0.0001, "loss": 6.4725, "loss/crossentropy": 2.644383728504181, "loss/hidden": 1.701171875, "loss/jsd": 0.0, "loss/logits": 0.21269257366657257, "step": 3290 }, { "epoch": 0.14963636363636362, "grad_norm": 6.34375, "grad_norm_var": 0.42083333333333334, "learning_rate": 0.0001, "loss": 6.2482, "loss/crossentropy": 2.5786006450653076, "loss/hidden": 1.666015625, "loss/jsd": 0.0, "loss/logits": 0.20036261901259422, "step": 3292 }, { "epoch": 0.14972727272727274, "grad_norm": 5.59375, "grad_norm_var": 0.44449462890625, "learning_rate": 0.0001, "loss": 5.9316, "loss/crossentropy": 2.2730345129966736, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.19847765192389488, "step": 3294 }, { "epoch": 0.14981818181818182, "grad_norm": 7.09375, "grad_norm_var": 0.4786295572916667, "learning_rate": 0.0001, "loss": 6.4701, "loss/crossentropy": 2.664171040058136, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.21105869114398956, "step": 3296 }, { "epoch": 0.1499090909090909, "grad_norm": 6.46875, "grad_norm_var": 0.8653645833333333, "learning_rate": 0.0001, "loss": 6.4266, "loss/crossentropy": 2.4923182129859924, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.22076866775751114, "step": 3298 }, { "epoch": 0.15, "grad_norm": 6.09375, "grad_norm_var": 0.60484619140625, "learning_rate": 0.0001, "loss": 5.7301, "loss/crossentropy": 2.135237008333206, "loss/hidden": 1.720703125, "loss/jsd": 0.0, "loss/logits": 0.18741197884082794, "step": 3300 }, { "epoch": 0.15009090909090908, "grad_norm": 6.34375, "grad_norm_var": 0.687109375, "learning_rate": 0.0001, "loss": 5.8718, "loss/crossentropy": 2.287621259689331, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.19044627249240875, "step": 3302 }, { "epoch": 0.1501818181818182, "grad_norm": 6.28125, "grad_norm_var": 0.68150634765625, "learning_rate": 0.0001, "loss": 6.3852, "loss/crossentropy": 2.576646625995636, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.21093649044632912, "step": 3304 }, { "epoch": 0.15027272727272728, "grad_norm": 6.125, "grad_norm_var": 0.6669921875, "learning_rate": 0.0001, "loss": 6.4919, "loss/crossentropy": 2.6300162076950073, "loss/hidden": 1.701171875, "loss/jsd": 0.0, "loss/logits": 0.21607114002108574, "step": 3306 }, { "epoch": 0.15036363636363637, "grad_norm": 5.78125, "grad_norm_var": 0.6654947916666667, "learning_rate": 0.0001, "loss": 6.0515, "loss/crossentropy": 2.3984909653663635, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.19479462131857872, "step": 3308 }, { "epoch": 0.15045454545454545, "grad_norm": 6.03125, "grad_norm_var": 0.6173014322916667, "learning_rate": 0.0001, "loss": 5.7735, "loss/crossentropy": 2.1211978793144226, "loss/hidden": 1.693359375, "loss/jsd": 0.0, "loss/logits": 0.19589859619736671, "step": 3310 }, { "epoch": 0.15054545454545454, "grad_norm": 6.78125, "grad_norm_var": 0.5712198893229167, "learning_rate": 0.0001, "loss": 6.4506, "loss/crossentropy": 2.554212510585785, "loss/hidden": 1.732421875, "loss/jsd": 0.0, "loss/logits": 0.21640130504965782, "step": 3312 }, { "epoch": 0.15063636363636362, "grad_norm": 6.21875, "grad_norm_var": 0.174462890625, "learning_rate": 0.0001, "loss": 6.4219, "loss/crossentropy": 2.664043962955475, "loss/hidden": 1.677734375, "loss/jsd": 0.0, "loss/logits": 0.20801245421171188, "step": 3314 }, { "epoch": 0.15072727272727274, "grad_norm": 5.25, "grad_norm_var": 0.2109375, "learning_rate": 0.0001, "loss": 5.857, "loss/crossentropy": 2.2977276742458344, "loss/hidden": 1.669921875, "loss/jsd": 0.0, "loss/logits": 0.1889313943684101, "step": 3316 }, { "epoch": 0.15081818181818182, "grad_norm": 5.96875, "grad_norm_var": 0.14855143229166667, "learning_rate": 0.0001, "loss": 6.2952, "loss/crossentropy": 2.531409800052643, "loss/hidden": 1.669921875, "loss/jsd": 0.0, "loss/logits": 0.20938252285122871, "step": 3318 }, { "epoch": 0.1509090909090909, "grad_norm": 6.34375, "grad_norm_var": 0.15494384765625, "learning_rate": 0.0001, "loss": 6.3162, "loss/crossentropy": 2.5554425716400146, "loss/hidden": 1.689453125, "loss/jsd": 0.0, "loss/logits": 0.2071310691535473, "step": 3320 }, { "epoch": 0.151, "grad_norm": 6.46875, "grad_norm_var": 0.16087239583333332, "learning_rate": 0.0001, "loss": 6.3236, "loss/crossentropy": 2.5806429386138916, "loss/hidden": 1.685546875, "loss/jsd": 0.0, "loss/logits": 0.20574238896369934, "step": 3322 }, { "epoch": 0.15109090909090908, "grad_norm": 6.34375, "grad_norm_var": 0.15624593098958334, "learning_rate": 0.0001, "loss": 6.1963, "loss/crossentropy": 2.4920397996902466, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.2020631544291973, "step": 3324 }, { "epoch": 0.1511818181818182, "grad_norm": 6.15625, "grad_norm_var": 0.14657796223958333, "learning_rate": 0.0001, "loss": 6.2051, "loss/crossentropy": 2.5175286531448364, "loss/hidden": 1.658203125, "loss/jsd": 0.0, "loss/logits": 0.20294076204299927, "step": 3326 }, { "epoch": 0.15127272727272728, "grad_norm": 7.21875, "grad_norm_var": 0.22628580729166667, "learning_rate": 0.0001, "loss": 6.9032, "loss/crossentropy": 2.7104408740997314, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.24036958441138268, "step": 3328 }, { "epoch": 0.15136363636363637, "grad_norm": 5.53125, "grad_norm_var": 0.24589436848958332, "learning_rate": 0.0001, "loss": 6.072, "loss/crossentropy": 2.3943982422351837, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.19725417345762253, "step": 3330 }, { "epoch": 0.15145454545454545, "grad_norm": 5.71875, "grad_norm_var": 0.24039306640625, "learning_rate": 0.0001, "loss": 6.0963, "loss/crossentropy": 2.430791884660721, "loss/hidden": 1.693359375, "loss/jsd": 0.0, "loss/logits": 0.19721932336688042, "step": 3332 }, { "epoch": 0.15154545454545454, "grad_norm": 6.1875, "grad_norm_var": 0.237109375, "learning_rate": 0.0001, "loss": 6.0317, "loss/crossentropy": 2.385330080986023, "loss/hidden": 1.669921875, "loss/jsd": 0.0, "loss/logits": 0.19764567911624908, "step": 3334 }, { "epoch": 0.15163636363636362, "grad_norm": 6.53125, "grad_norm_var": 0.23668212890625, "learning_rate": 0.0001, "loss": 6.0816, "loss/crossentropy": 2.3560237288475037, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.20420218631625175, "step": 3336 }, { "epoch": 0.15172727272727274, "grad_norm": 7.3125, "grad_norm_var": 0.30992431640625, "learning_rate": 0.0001, "loss": 6.1924, "loss/crossentropy": 2.3727870285511017, "loss/hidden": 1.728515625, "loss/jsd": 0.0, "loss/logits": 0.20911434292793274, "step": 3338 }, { "epoch": 0.15181818181818182, "grad_norm": 6.125, "grad_norm_var": 0.31031494140625, "learning_rate": 0.0001, "loss": 5.7902, "loss/crossentropy": 2.2270340621471405, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.18951964750885963, "step": 3340 }, { "epoch": 0.1519090909090909, "grad_norm": 6.25, "grad_norm_var": 0.32766520182291664, "learning_rate": 0.0001, "loss": 6.8679, "loss/crossentropy": 2.8863967061042786, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.22549711912870407, "step": 3342 }, { "epoch": 0.152, "grad_norm": 5.9375, "grad_norm_var": 0.24739176432291668, "learning_rate": 0.0001, "loss": 6.3483, "loss/crossentropy": 2.581421911716461, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.20833102613687515, "step": 3344 }, { "epoch": 0.15209090909090908, "grad_norm": 6.53125, "grad_norm_var": 0.28355712890625, "learning_rate": 0.0001, "loss": 6.4308, "loss/crossentropy": 2.7585192918777466, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.20120932534337044, "step": 3346 }, { "epoch": 0.15218181818181817, "grad_norm": 6.46875, "grad_norm_var": 0.21373697916666667, "learning_rate": 0.0001, "loss": 6.5105, "loss/crossentropy": 2.651751220226288, "loss/hidden": 1.693359375, "loss/jsd": 0.0, "loss/logits": 0.2165391594171524, "step": 3348 }, { "epoch": 0.15227272727272728, "grad_norm": 6.21875, "grad_norm_var": 0.23828125, "learning_rate": 0.0001, "loss": 6.1288, "loss/crossentropy": 2.453589975833893, "loss/hidden": 1.669921875, "loss/jsd": 0.0, "loss/logits": 0.20053325220942497, "step": 3350 }, { "epoch": 0.15236363636363637, "grad_norm": 5.46875, "grad_norm_var": 0.27342122395833335, "learning_rate": 0.0001, "loss": 6.1295, "loss/crossentropy": 2.490999162197113, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.20115668699145317, "step": 3352 }, { "epoch": 0.15245454545454545, "grad_norm": 6.59375, "grad_norm_var": 0.19698893229166667, "learning_rate": 0.0001, "loss": 6.1125, "loss/crossentropy": 2.479306697845459, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.19652187824249268, "step": 3354 }, { "epoch": 0.15254545454545454, "grad_norm": 6.25, "grad_norm_var": 0.20100504557291668, "learning_rate": 0.0001, "loss": 6.4917, "loss/crossentropy": 2.6221222281455994, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.2197704203426838, "step": 3356 }, { "epoch": 0.15263636363636363, "grad_norm": 6.4375, "grad_norm_var": 0.46926676432291664, "learning_rate": 0.0001, "loss": 6.3938, "loss/crossentropy": 2.5660024881362915, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.21227608621120453, "step": 3358 }, { "epoch": 0.15272727272727274, "grad_norm": 6.15625, "grad_norm_var": 0.45193684895833336, "learning_rate": 0.0001, "loss": 6.1954, "loss/crossentropy": 2.5452521443367004, "loss/hidden": 1.689453125, "loss/jsd": 0.0, "loss/logits": 0.19607427343726158, "step": 3360 }, { "epoch": 0.15281818181818183, "grad_norm": 6.375, "grad_norm_var": 0.39138997395833336, "learning_rate": 0.0001, "loss": 6.7575, "loss/crossentropy": 2.8674705624580383, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.2167394459247589, "step": 3362 }, { "epoch": 0.1529090909090909, "grad_norm": 6.0625, "grad_norm_var": 0.40552978515625, "learning_rate": 0.0001, "loss": 6.0046, "loss/crossentropy": 2.3734301030635834, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.19358256086707115, "step": 3364 }, { "epoch": 0.153, "grad_norm": 9.3125, "grad_norm_var": 1.0428019205729167, "learning_rate": 0.0001, "loss": 6.0279, "loss/crossentropy": 2.3765516579151154, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.19716497883200645, "step": 3366 }, { "epoch": 0.15309090909090908, "grad_norm": 6.34375, "grad_norm_var": 0.96226806640625, "learning_rate": 0.0001, "loss": 6.4016, "loss/crossentropy": 2.6598156690597534, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.20660297200083733, "step": 3368 }, { "epoch": 0.15318181818181817, "grad_norm": 6.34375, "grad_norm_var": 0.9415201822916667, "learning_rate": 0.0001, "loss": 6.3954, "loss/crossentropy": 2.6097107529640198, "loss/hidden": 1.685546875, "loss/jsd": 0.0, "loss/logits": 0.2100110836327076, "step": 3370 }, { "epoch": 0.15327272727272728, "grad_norm": 5.75, "grad_norm_var": 0.96099853515625, "learning_rate": 0.0001, "loss": 6.6848, "loss/crossentropy": 2.7827059626579285, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.21969978511333466, "step": 3372 }, { "epoch": 0.15336363636363637, "grad_norm": 6.40625, "grad_norm_var": 0.777587890625, "learning_rate": 0.0001, "loss": 6.4108, "loss/crossentropy": 2.5804526805877686, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.2125299721956253, "step": 3374 }, { "epoch": 0.15345454545454545, "grad_norm": 6.4375, "grad_norm_var": 0.8090128580729167, "learning_rate": 0.0001, "loss": 5.9936, "loss/crossentropy": 2.363675057888031, "loss/hidden": 1.642578125, "loss/jsd": 0.0, "loss/logits": 0.19873088970780373, "step": 3376 }, { "epoch": 0.15354545454545454, "grad_norm": 6.09375, "grad_norm_var": 0.8477213541666667, "learning_rate": 0.0001, "loss": 5.7347, "loss/crossentropy": 2.1771908700466156, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.18700241297483444, "step": 3378 }, { "epoch": 0.15363636363636363, "grad_norm": 5.9375, "grad_norm_var": 0.83638916015625, "learning_rate": 0.0001, "loss": 6.1676, "loss/crossentropy": 2.4380911588668823, "loss/hidden": 1.689453125, "loss/jsd": 0.0, "loss/logits": 0.20400532335042953, "step": 3380 }, { "epoch": 0.15372727272727274, "grad_norm": 6.8125, "grad_norm_var": 0.19670817057291667, "learning_rate": 0.0001, "loss": 6.6704, "loss/crossentropy": 2.7362945675849915, "loss/hidden": 1.693359375, "loss/jsd": 0.0, "loss/logits": 0.22407226637005806, "step": 3382 }, { "epoch": 0.15381818181818183, "grad_norm": 6.6875, "grad_norm_var": 0.20234375, "learning_rate": 0.0001, "loss": 6.2585, "loss/crossentropy": 2.4484063386917114, "loss/hidden": 1.724609375, "loss/jsd": 0.0, "loss/logits": 0.20854933932423592, "step": 3384 }, { "epoch": 0.1539090909090909, "grad_norm": 5.875, "grad_norm_var": 0.22242431640625, "learning_rate": 0.0001, "loss": 5.7716, "loss/crossentropy": 2.294054388999939, "loss/hidden": 1.650390625, "loss/jsd": 0.0, "loss/logits": 0.18271559104323387, "step": 3386 }, { "epoch": 0.154, "grad_norm": 6.96875, "grad_norm_var": 0.21285400390625, "learning_rate": 0.0001, "loss": 6.0064, "loss/crossentropy": 2.310445189476013, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.20143362879753113, "step": 3388 }, { "epoch": 0.15409090909090908, "grad_norm": 6.09375, "grad_norm_var": 0.22203369140625, "learning_rate": 0.0001, "loss": 6.5529, "loss/crossentropy": 2.6720749735832214, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.21972159296274185, "step": 3390 }, { "epoch": 0.15418181818181817, "grad_norm": 5.5, "grad_norm_var": 0.24609375, "learning_rate": 0.0001, "loss": 6.1966, "loss/crossentropy": 2.5708730220794678, "loss/hidden": 1.658203125, "loss/jsd": 0.0, "loss/logits": 0.19675535336136818, "step": 3392 }, { "epoch": 0.15427272727272728, "grad_norm": 7.78125, "grad_norm_var": 0.39732666015625, "learning_rate": 0.0001, "loss": 6.1244, "loss/crossentropy": 2.4404448866844177, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.200815137475729, "step": 3394 }, { "epoch": 0.15436363636363637, "grad_norm": 6.53125, "grad_norm_var": 0.37584228515625, "learning_rate": 0.0001, "loss": 6.2488, "loss/crossentropy": 2.4661887884140015, "loss/hidden": 1.693359375, "loss/jsd": 0.0, "loss/logits": 0.2089260220527649, "step": 3396 }, { "epoch": 0.15445454545454546, "grad_norm": 6.5625, "grad_norm_var": 0.37121988932291666, "learning_rate": 0.0001, "loss": 6.5154, "loss/crossentropy": 2.5970450043678284, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.22152512520551682, "step": 3398 }, { "epoch": 0.15454545454545454, "grad_norm": 7.21875, "grad_norm_var": 0.4100260416666667, "learning_rate": 0.0001, "loss": 6.4863, "loss/crossentropy": 2.6096429228782654, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.21735722571611404, "step": 3400 }, { "epoch": 0.15463636363636363, "grad_norm": 5.96875, "grad_norm_var": 0.39205729166666664, "learning_rate": 0.0001, "loss": 6.2322, "loss/crossentropy": 2.445048540830612, "loss/hidden": 1.728515625, "loss/jsd": 0.0, "loss/logits": 0.205864567309618, "step": 3402 }, { "epoch": 0.15472727272727274, "grad_norm": 6.0, "grad_norm_var": 0.3830078125, "learning_rate": 0.0001, "loss": 6.354, "loss/crossentropy": 2.5555624961853027, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.20914025604724884, "step": 3404 }, { "epoch": 0.15481818181818183, "grad_norm": 6.5, "grad_norm_var": 0.33179931640625, "learning_rate": 0.0001, "loss": 6.3128, "loss/crossentropy": 2.4997612833976746, "loss/hidden": 1.720703125, "loss/jsd": 0.0, "loss/logits": 0.20922859758138657, "step": 3406 }, { "epoch": 0.1549090909090909, "grad_norm": 5.59375, "grad_norm_var": 0.3486328125, "learning_rate": 0.0001, "loss": 5.9126, "loss/crossentropy": 2.262055605649948, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.1945503205060959, "step": 3408 }, { "epoch": 0.155, "grad_norm": 6.65625, "grad_norm_var": 0.18316650390625, "learning_rate": 0.0001, "loss": 6.2121, "loss/crossentropy": 2.490810751914978, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.2047473005950451, "step": 3410 }, { "epoch": 0.15509090909090908, "grad_norm": 6.40625, "grad_norm_var": 0.82890625, "learning_rate": 0.0001, "loss": 6.4574, "loss/crossentropy": 2.5416743457317352, "loss/hidden": 1.736328125, "loss/jsd": 0.0, "loss/logits": 0.21794236451387405, "step": 3412 }, { "epoch": 0.15518181818181817, "grad_norm": 5.59375, "grad_norm_var": 0.8790201822916667, "learning_rate": 0.0001, "loss": 6.1656, "loss/crossentropy": 2.5008689761161804, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.19811807572841644, "step": 3414 }, { "epoch": 0.15527272727272728, "grad_norm": 6.90625, "grad_norm_var": 0.86187744140625, "learning_rate": 0.0001, "loss": 6.3308, "loss/crossentropy": 2.483784556388855, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.2089226208627224, "step": 3416 }, { "epoch": 0.15536363636363637, "grad_norm": 6.71875, "grad_norm_var": 0.8486287434895833, "learning_rate": 0.0001, "loss": 6.4548, "loss/crossentropy": 2.630200147628784, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.21429434046149254, "step": 3418 }, { "epoch": 0.15545454545454546, "grad_norm": 6.15625, "grad_norm_var": 0.8237263997395833, "learning_rate": 0.0001, "loss": 6.4952, "loss/crossentropy": 2.6211345195770264, "loss/hidden": 1.689453125, "loss/jsd": 0.0, "loss/logits": 0.21846313029527664, "step": 3420 }, { "epoch": 0.15554545454545454, "grad_norm": 6.8125, "grad_norm_var": 0.84127197265625, "learning_rate": 0.0001, "loss": 6.3103, "loss/crossentropy": 2.496788889169693, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.20908556133508682, "step": 3422 }, { "epoch": 0.15563636363636363, "grad_norm": 6.5625, "grad_norm_var": 0.7551432291666667, "learning_rate": 0.0001, "loss": 6.4398, "loss/crossentropy": 2.5983047485351562, "loss/hidden": 1.701171875, "loss/jsd": 0.0, "loss/logits": 0.21403002366423607, "step": 3424 }, { "epoch": 0.15572727272727271, "grad_norm": 5.78125, "grad_norm_var": 0.83004150390625, "learning_rate": 0.0001, "loss": 6.198, "loss/crossentropy": 2.536584794521332, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20129283890128136, "step": 3426 }, { "epoch": 0.15581818181818183, "grad_norm": 6.5625, "grad_norm_var": 0.23863525390625, "learning_rate": 0.0001, "loss": 6.8426, "loss/crossentropy": 2.8369085788726807, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.22869372740387917, "step": 3428 }, { "epoch": 0.1559090909090909, "grad_norm": 6.375, "grad_norm_var": 0.21770426432291667, "learning_rate": 0.0001, "loss": 6.2862, "loss/crossentropy": 2.5466989874839783, "loss/hidden": 1.669921875, "loss/jsd": 0.0, "loss/logits": 0.2069564200937748, "step": 3430 }, { "epoch": 0.156, "grad_norm": 6.09375, "grad_norm_var": 0.22164306640625, "learning_rate": 0.0001, "loss": 6.3712, "loss/crossentropy": 2.5627682209014893, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.21345893293619156, "step": 3432 }, { "epoch": 0.15609090909090909, "grad_norm": 6.28125, "grad_norm_var": 0.2244140625, "learning_rate": 0.0001, "loss": 6.3845, "loss/crossentropy": 2.53183114528656, "loss/hidden": 1.716796875, "loss/jsd": 0.0, "loss/logits": 0.21358761563897133, "step": 3434 }, { "epoch": 0.15618181818181817, "grad_norm": 6.34375, "grad_norm_var": 0.222900390625, "learning_rate": 0.0001, "loss": 6.1131, "loss/crossentropy": 2.501641035079956, "loss/hidden": 1.677734375, "loss/jsd": 0.0, "loss/logits": 0.1933768056333065, "step": 3436 }, { "epoch": 0.15627272727272729, "grad_norm": 6.5, "grad_norm_var": 0.225634765625, "learning_rate": 0.0001, "loss": 6.5498, "loss/crossentropy": 2.6839720606803894, "loss/hidden": 1.720703125, "loss/jsd": 0.0, "loss/logits": 0.2145145907998085, "step": 3438 }, { "epoch": 0.15636363636363637, "grad_norm": 6.34375, "grad_norm_var": 0.22042643229166667, "learning_rate": 0.0001, "loss": 6.4023, "loss/crossentropy": 2.7017518281936646, "loss/hidden": 1.697265625, "loss/jsd": 0.0, "loss/logits": 0.20032459124922752, "step": 3440 }, { "epoch": 0.15645454545454546, "grad_norm": 6.375, "grad_norm_var": 0.44905192057291665, "learning_rate": 0.0001, "loss": 6.2378, "loss/crossentropy": 2.4411545395851135, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.20739738270640373, "step": 3442 }, { "epoch": 0.15654545454545454, "grad_norm": 5.875, "grad_norm_var": 0.41378580729166664, "learning_rate": 0.0001, "loss": 6.0059, "loss/crossentropy": 2.4015129804611206, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.19403648003935814, "step": 3444 }, { "epoch": 0.15663636363636363, "grad_norm": 5.78125, "grad_norm_var": 0.41484375, "learning_rate": 0.0001, "loss": 6.1378, "loss/crossentropy": 2.546124756336212, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.19314969703555107, "step": 3446 }, { "epoch": 0.15672727272727272, "grad_norm": 6.21875, "grad_norm_var": 0.40390625, "learning_rate": 0.0001, "loss": 6.4455, "loss/crossentropy": 2.640919864177704, "loss/hidden": 1.701171875, "loss/jsd": 0.0, "loss/logits": 0.21034513786435127, "step": 3448 }, { "epoch": 0.15681818181818183, "grad_norm": 6.28125, "grad_norm_var": 0.39254150390625, "learning_rate": 0.0001, "loss": 6.3571, "loss/crossentropy": 2.623012959957123, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.20524929836392403, "step": 3450 }, { "epoch": 0.15690909090909091, "grad_norm": 5.53125, "grad_norm_var": 0.41822509765625, "learning_rate": 0.0001, "loss": 6.2068, "loss/crossentropy": 2.5624905824661255, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20037024468183517, "step": 3452 }, { "epoch": 0.157, "grad_norm": 5.96875, "grad_norm_var": 0.40901285807291665, "learning_rate": 0.0001, "loss": 5.9799, "loss/crossentropy": 2.4055966436862946, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.19258391484618187, "step": 3454 }, { "epoch": 0.1570909090909091, "grad_norm": 5.9375, "grad_norm_var": 0.42073160807291665, "learning_rate": 0.0001, "loss": 6.5399, "loss/crossentropy": 2.7488685846328735, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.21386495232582092, "step": 3456 }, { "epoch": 0.15718181818181817, "grad_norm": 6.09375, "grad_norm_var": 0.06884358723958334, "learning_rate": 0.0001, "loss": 6.0579, "loss/crossentropy": 2.3984326124191284, "loss/hidden": 1.669921875, "loss/jsd": 0.0, "loss/logits": 0.19895801693201065, "step": 3458 }, { "epoch": 0.1572727272727273, "grad_norm": 7.8125, "grad_norm_var": 0.315234375, "learning_rate": 0.0001, "loss": 6.6338, "loss/crossentropy": 2.7609057426452637, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.2177535854279995, "step": 3460 }, { "epoch": 0.15736363636363637, "grad_norm": 6.875, "grad_norm_var": 0.330859375, "learning_rate": 0.0001, "loss": 6.2951, "loss/crossentropy": 2.49477881193161, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.20855162292718887, "step": 3462 }, { "epoch": 0.15745454545454546, "grad_norm": 6.03125, "grad_norm_var": 0.34683837890625, "learning_rate": 0.0001, "loss": 6.034, "loss/crossentropy": 2.477426767349243, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.19042237102985382, "step": 3464 }, { "epoch": 0.15754545454545454, "grad_norm": 6.40625, "grad_norm_var": 0.3462890625, "learning_rate": 0.0001, "loss": 6.213, "loss/crossentropy": 2.418627768754959, "loss/hidden": 1.740234375, "loss/jsd": 0.0, "loss/logits": 0.2054162509739399, "step": 3466 }, { "epoch": 0.15763636363636363, "grad_norm": 5.84375, "grad_norm_var": 0.3192057291666667, "learning_rate": 0.0001, "loss": 6.2019, "loss/crossentropy": 2.5056048929691315, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.20166431739926338, "step": 3468 }, { "epoch": 0.15772727272727272, "grad_norm": 7.0, "grad_norm_var": 0.37320556640625, "learning_rate": 0.0001, "loss": 6.0774, "loss/crossentropy": 2.3574554324150085, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.20245954394340515, "step": 3470 }, { "epoch": 0.15781818181818183, "grad_norm": 5.78125, "grad_norm_var": 0.35845947265625, "learning_rate": 0.0001, "loss": 6.3363, "loss/crossentropy": 2.64945787191391, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.20188936591148376, "step": 3472 }, { "epoch": 0.15790909090909092, "grad_norm": 6.5625, "grad_norm_var": 0.359375, "learning_rate": 0.0001, "loss": 6.5702, "loss/crossentropy": 2.7030083537101746, "loss/hidden": 1.697265625, "loss/jsd": 0.0, "loss/logits": 0.2169942520558834, "step": 3474 }, { "epoch": 0.158, "grad_norm": 6.15625, "grad_norm_var": 0.24273681640625, "learning_rate": 0.0001, "loss": 6.5385, "loss/crossentropy": 2.631165385246277, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.2212069146335125, "step": 3476 }, { "epoch": 0.1580909090909091, "grad_norm": 5.96875, "grad_norm_var": 0.24202067057291668, "learning_rate": 0.0001, "loss": 6.3021, "loss/crossentropy": 2.668975830078125, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.19886402413249016, "step": 3478 }, { "epoch": 0.15818181818181817, "grad_norm": 5.53125, "grad_norm_var": 0.26278889973958336, "learning_rate": 0.0001, "loss": 6.0981, "loss/crossentropy": 2.4687342643737793, "loss/hidden": 1.662109375, "loss/jsd": 0.0, "loss/logits": 0.19672385975718498, "step": 3480 }, { "epoch": 0.15827272727272726, "grad_norm": 6.15625, "grad_norm_var": 0.279931640625, "learning_rate": 0.0001, "loss": 6.3775, "loss/crossentropy": 2.5685237646102905, "loss/hidden": 1.654296875, "loss/jsd": 0.0, "loss/logits": 0.21547050401568413, "step": 3482 }, { "epoch": 0.15836363636363637, "grad_norm": 9.75, "grad_norm_var": 1.13111572265625, "learning_rate": 0.0001, "loss": 6.312, "loss/crossentropy": 2.422010898590088, "loss/hidden": 1.740234375, "loss/jsd": 0.0, "loss/logits": 0.21497058868408203, "step": 3484 }, { "epoch": 0.15845454545454546, "grad_norm": 7.8125, "grad_norm_var": 1.1801717122395834, "learning_rate": 0.0001, "loss": 6.8338, "loss/crossentropy": 2.847530424594879, "loss/hidden": 1.701171875, "loss/jsd": 0.0, "loss/logits": 0.22851385176181793, "step": 3486 }, { "epoch": 0.15854545454545454, "grad_norm": 5.75, "grad_norm_var": 1.1790364583333333, "learning_rate": 0.0001, "loss": 6.3926, "loss/crossentropy": 2.6173436641693115, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.20525959134101868, "step": 3488 }, { "epoch": 0.15863636363636363, "grad_norm": 6.4375, "grad_norm_var": 1.2333170572916667, "learning_rate": 0.0001, "loss": 5.94, "loss/crossentropy": 2.349908709526062, "loss/hidden": 1.666015625, "loss/jsd": 0.0, "loss/logits": 0.19240494444966316, "step": 3490 }, { "epoch": 0.15872727272727272, "grad_norm": 8.5625, "grad_norm_var": 1.4755045572916667, "learning_rate": 0.0001, "loss": 6.2126, "loss/crossentropy": 2.392519533634186, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.21169129386544228, "step": 3492 }, { "epoch": 0.15881818181818183, "grad_norm": 6.78125, "grad_norm_var": 1.3832316080729166, "learning_rate": 0.0001, "loss": 6.4874, "loss/crossentropy": 2.700030207633972, "loss/hidden": 1.720703125, "loss/jsd": 0.0, "loss/logits": 0.20666765794157982, "step": 3494 }, { "epoch": 0.15890909090909092, "grad_norm": 6.03125, "grad_norm_var": 1.3208170572916667, "learning_rate": 0.0001, "loss": 6.0683, "loss/crossentropy": 2.364784359931946, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.20199178531765938, "step": 3496 }, { "epoch": 0.159, "grad_norm": 5.9375, "grad_norm_var": 1.25546875, "learning_rate": 0.0001, "loss": 6.5362, "loss/crossentropy": 2.723261296749115, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.21254485845565796, "step": 3498 }, { "epoch": 0.1590909090909091, "grad_norm": 5.6875, "grad_norm_var": 0.6358357747395833, "learning_rate": 0.0001, "loss": 6.3188, "loss/crossentropy": 2.6181763410568237, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.2018997259438038, "step": 3500 }, { "epoch": 0.15918181818181817, "grad_norm": 6.53125, "grad_norm_var": 0.49947916666666664, "learning_rate": 0.0001, "loss": 6.3997, "loss/crossentropy": 2.5634013414382935, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.2137114517390728, "step": 3502 }, { "epoch": 0.15927272727272726, "grad_norm": 6.09375, "grad_norm_var": 0.47623291015625, "learning_rate": 0.0001, "loss": 6.475, "loss/crossentropy": 2.7077895402908325, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.21031823009252548, "step": 3504 }, { "epoch": 0.15936363636363637, "grad_norm": 5.84375, "grad_norm_var": 0.4428019205729167, "learning_rate": 0.0001, "loss": 5.8511, "loss/crossentropy": 2.2453423738479614, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.19533690810203552, "step": 3506 }, { "epoch": 0.15945454545454546, "grad_norm": 8.3125, "grad_norm_var": 5779307751890947.0, "learning_rate": 0.0001, "loss": 6.627, "loss/crossentropy": 2.441849112510681, "loss/hidden": 2.1171875, "loss/jsd": 0.0, "loss/logits": 0.20679871737957, "step": 3508 }, { "epoch": 0.15954545454545455, "grad_norm": 6.84375, "grad_norm_var": 5779307753316355.0, "learning_rate": 0.0001, "loss": 6.3032, "loss/crossentropy": 2.5258585810661316, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.20859327912330627, "step": 3510 }, { "epoch": 0.15963636363636363, "grad_norm": 6.34375, "grad_norm_var": 5779307752049326.0, "learning_rate": 0.0001, "loss": 6.4333, "loss/crossentropy": 2.5876155495643616, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.2142590507864952, "step": 3512 }, { "epoch": 0.15972727272727272, "grad_norm": 6.40625, "grad_norm_var": 5779307752841219.0, "learning_rate": 0.0001, "loss": 6.0805, "loss/crossentropy": 2.377358376979828, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.199997927993536, "step": 3514 }, { "epoch": 0.15981818181818183, "grad_norm": 6.03125, "grad_norm_var": 5779307752999598.0, "learning_rate": 0.0001, "loss": 5.7717, "loss/crossentropy": 2.187394291162491, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.18967711552977562, "step": 3516 }, { "epoch": 0.15990909090909092, "grad_norm": 5.5, "grad_norm_var": 5779307757275822.0, "learning_rate": 0.0001, "loss": 5.743, "loss/crossentropy": 2.2927498817443848, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.18174166232347488, "step": 3518 }, { "epoch": 0.16, "grad_norm": 5.9375, "grad_norm_var": 5779307758542851.0, "learning_rate": 0.0001, "loss": 6.0548, "loss/crossentropy": 2.420728027820587, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.1989535391330719, "step": 3520 }, { "epoch": 0.1600909090909091, "grad_norm": 6.0, "grad_norm_var": 5779307759097176.0, "learning_rate": 0.0001, "loss": 6.339, "loss/crossentropy": 2.5667598247528076, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.20886226370930672, "step": 3522 }, { "epoch": 0.16018181818181818, "grad_norm": 5.25, "grad_norm_var": 0.14833577473958334, "learning_rate": 0.0001, "loss": 5.6902, "loss/crossentropy": 2.2545958757400513, "loss/hidden": 1.693359375, "loss/jsd": 0.0, "loss/logits": 0.17422913387417793, "step": 3524 }, { "epoch": 0.16027272727272726, "grad_norm": 6.6875, "grad_norm_var": 0.15256754557291666, "learning_rate": 0.0001, "loss": 5.9663, "loss/crossentropy": 2.39420622587204, "loss/hidden": 1.650390625, "loss/jsd": 0.0, "loss/logits": 0.19216598942875862, "step": 3526 }, { "epoch": 0.16036363636363637, "grad_norm": 6.78125, "grad_norm_var": 0.19347330729166667, "learning_rate": 0.0001, "loss": 6.2839, "loss/crossentropy": 2.6006454825401306, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20270410925149918, "step": 3528 }, { "epoch": 0.16045454545454546, "grad_norm": 5.8125, "grad_norm_var": 0.20403645833333334, "learning_rate": 0.0001, "loss": 6.1227, "loss/crossentropy": 2.3693277835845947, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.20229175686836243, "step": 3530 }, { "epoch": 0.16054545454545455, "grad_norm": 5.75, "grad_norm_var": 0.21695556640625, "learning_rate": 0.0001, "loss": 6.2413, "loss/crossentropy": 2.5012152791023254, "loss/hidden": 1.677734375, "loss/jsd": 0.0, "loss/logits": 0.20623257756233215, "step": 3532 }, { "epoch": 0.16063636363636363, "grad_norm": 6.28125, "grad_norm_var": 0.45243733723958335, "learning_rate": 0.0001, "loss": 6.6111, "loss/crossentropy": 2.6832721829414368, "loss/hidden": 1.697265625, "loss/jsd": 0.0, "loss/logits": 0.2230548933148384, "step": 3534 }, { "epoch": 0.16072727272727272, "grad_norm": 5.9375, "grad_norm_var": 0.48515218098958335, "learning_rate": 0.0001, "loss": 6.0046, "loss/crossentropy": 2.3398528695106506, "loss/hidden": 1.732421875, "loss/jsd": 0.0, "loss/logits": 0.19323104992508888, "step": 3536 }, { "epoch": 0.1608181818181818, "grad_norm": 6.6875, "grad_norm_var": 0.48878580729166665, "learning_rate": 0.0001, "loss": 6.5695, "loss/crossentropy": 2.682500958442688, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.21878045052289963, "step": 3538 }, { "epoch": 0.16090909090909092, "grad_norm": 6.25, "grad_norm_var": 0.3868326822916667, "learning_rate": 0.0001, "loss": 6.1488, "loss/crossentropy": 2.5009196996688843, "loss/hidden": 1.662109375, "loss/jsd": 0.0, "loss/logits": 0.198578879237175, "step": 3540 }, { "epoch": 0.161, "grad_norm": 5.375, "grad_norm_var": 0.3738932291666667, "learning_rate": 0.0001, "loss": 5.8567, "loss/crossentropy": 2.333570182323456, "loss/hidden": 1.638671875, "loss/jsd": 0.0, "loss/logits": 0.18844840303063393, "step": 3542 }, { "epoch": 0.1610909090909091, "grad_norm": 5.84375, "grad_norm_var": 0.3504557291666667, "learning_rate": 0.0001, "loss": 6.1509, "loss/crossentropy": 2.481253921985626, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.20016399025917053, "step": 3544 }, { "epoch": 0.16118181818181818, "grad_norm": 6.0, "grad_norm_var": 0.38440348307291666, "learning_rate": 0.0001, "loss": 5.7873, "loss/crossentropy": 2.254674017429352, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.18568046763539314, "step": 3546 }, { "epoch": 0.16127272727272726, "grad_norm": 5.34375, "grad_norm_var": 0.42849934895833336, "learning_rate": 0.0001, "loss": 5.9164, "loss/crossentropy": 2.328031599521637, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.18969857692718506, "step": 3548 }, { "epoch": 0.16136363636363638, "grad_norm": 5.9375, "grad_norm_var": 0.21612955729166666, "learning_rate": 0.0001, "loss": 6.2711, "loss/crossentropy": 2.5015710592269897, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.2062496356666088, "step": 3550 }, { "epoch": 0.16145454545454546, "grad_norm": 6.25, "grad_norm_var": 0.19205322265625, "learning_rate": 0.0001, "loss": 6.1446, "loss/crossentropy": 2.5024638175964355, "loss/hidden": 1.685546875, "loss/jsd": 0.0, "loss/logits": 0.19566143676638603, "step": 3552 }, { "epoch": 0.16154545454545455, "grad_norm": 6.3125, "grad_norm_var": 0.164697265625, "learning_rate": 0.0001, "loss": 6.2209, "loss/crossentropy": 2.4695937633514404, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.2028619796037674, "step": 3554 }, { "epoch": 0.16163636363636363, "grad_norm": 5.84375, "grad_norm_var": 0.17327067057291667, "learning_rate": 0.0001, "loss": 5.9809, "loss/crossentropy": 2.289512038230896, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.2003883272409439, "step": 3556 }, { "epoch": 0.16172727272727272, "grad_norm": 6.4375, "grad_norm_var": 0.13443603515625, "learning_rate": 0.0001, "loss": 6.69, "loss/crossentropy": 2.725961744785309, "loss/hidden": 1.740234375, "loss/jsd": 0.0, "loss/logits": 0.22237983345985413, "step": 3558 }, { "epoch": 0.1618181818181818, "grad_norm": 6.0, "grad_norm_var": 0.12515869140625, "learning_rate": 0.0001, "loss": 6.0638, "loss/crossentropy": 2.3939421474933624, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.19667073339223862, "step": 3560 }, { "epoch": 0.16190909090909092, "grad_norm": 5.71875, "grad_norm_var": 0.110400390625, "learning_rate": 0.0001, "loss": 6.008, "loss/crossentropy": 2.3520345091819763, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.19450132176280022, "step": 3562 }, { "epoch": 0.162, "grad_norm": 6.28125, "grad_norm_var": 0.06285400390625, "learning_rate": 0.0001, "loss": 6.2729, "loss/crossentropy": 2.5557450652122498, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.20140201970934868, "step": 3564 }, { "epoch": 0.1620909090909091, "grad_norm": 5.6875, "grad_norm_var": 0.10221354166666667, "learning_rate": 0.0001, "loss": 6.1569, "loss/crossentropy": 2.452498883008957, "loss/hidden": 1.666015625, "loss/jsd": 0.0, "loss/logits": 0.2038404867053032, "step": 3566 }, { "epoch": 0.16218181818181818, "grad_norm": 5.71875, "grad_norm_var": 0.11209309895833333, "learning_rate": 0.0001, "loss": 6.2808, "loss/crossentropy": 2.5685258507728577, "loss/hidden": 1.662109375, "loss/jsd": 0.0, "loss/logits": 0.20501847192645073, "step": 3568 }, { "epoch": 0.16227272727272726, "grad_norm": 5.9375, "grad_norm_var": 0.11483968098958333, "learning_rate": 0.0001, "loss": 6.2549, "loss/crossentropy": 2.4990068078041077, "loss/hidden": 1.685546875, "loss/jsd": 0.0, "loss/logits": 0.20703687146306038, "step": 3570 }, { "epoch": 0.16236363636363638, "grad_norm": 5.5625, "grad_norm_var": 0.13775634765625, "learning_rate": 0.0001, "loss": 5.7173, "loss/crossentropy": 2.2113667130470276, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.1802796870470047, "step": 3572 }, { "epoch": 0.16245454545454546, "grad_norm": 6.53125, "grad_norm_var": 0.156640625, "learning_rate": 0.0001, "loss": 6.5744, "loss/crossentropy": 2.750638961791992, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.21518649533391, "step": 3574 }, { "epoch": 0.16254545454545455, "grad_norm": 5.59375, "grad_norm_var": 0.15832926432291666, "learning_rate": 0.0001, "loss": 5.8152, "loss/crossentropy": 2.304818272590637, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.1877555288374424, "step": 3576 }, { "epoch": 0.16263636363636363, "grad_norm": 6.375, "grad_norm_var": 0.14927978515625, "learning_rate": 0.0001, "loss": 6.1579, "loss/crossentropy": 2.450581431388855, "loss/hidden": 1.708984375, "loss/jsd": 0.0, "loss/logits": 0.1998363882303238, "step": 3578 }, { "epoch": 0.16272727272727272, "grad_norm": 7.9375, "grad_norm_var": 0.40084635416666664, "learning_rate": 0.0001, "loss": 6.1811, "loss/crossentropy": 2.450781285762787, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20897255465388298, "step": 3580 }, { "epoch": 0.1628181818181818, "grad_norm": 6.0, "grad_norm_var": 0.35256754557291664, "learning_rate": 0.0001, "loss": 6.0449, "loss/crossentropy": 2.4290542006492615, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.1951821781694889, "step": 3582 }, { "epoch": 0.16290909090909092, "grad_norm": 6.125, "grad_norm_var": 0.3356404622395833, "learning_rate": 0.0001, "loss": 6.185, "loss/crossentropy": 2.398352563381195, "loss/hidden": 1.689453125, "loss/jsd": 0.0, "loss/logits": 0.20971805602312088, "step": 3584 }, { "epoch": 0.163, "grad_norm": 6.46875, "grad_norm_var": 0.3836588541666667, "learning_rate": 0.0001, "loss": 6.3841, "loss/crossentropy": 2.694360375404358, "loss/hidden": 1.669921875, "loss/jsd": 0.0, "loss/logits": 0.2019820213317871, "step": 3586 }, { "epoch": 0.1630909090909091, "grad_norm": 7.0, "grad_norm_var": 0.38479410807291664, "learning_rate": 0.0001, "loss": 5.8729, "loss/crossentropy": 2.2553427815437317, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.19300296530127525, "step": 3588 }, { "epoch": 0.16318181818181818, "grad_norm": 5.5625, "grad_norm_var": 0.39143473307291665, "learning_rate": 0.0001, "loss": 5.8045, "loss/crossentropy": 2.2423582673072815, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.18941550329327583, "step": 3590 }, { "epoch": 0.16327272727272726, "grad_norm": 5.84375, "grad_norm_var": 0.3929036458333333, "learning_rate": 0.0001, "loss": 6.2114, "loss/crossentropy": 2.5638272166252136, "loss/hidden": 1.662109375, "loss/jsd": 0.0, "loss/logits": 0.19854536652565002, "step": 3592 }, { "epoch": 0.16336363636363635, "grad_norm": 6.40625, "grad_norm_var": 0.47649739583333334, "learning_rate": 0.0001, "loss": 5.7819, "loss/crossentropy": 2.2297229170799255, "loss/hidden": 1.677734375, "loss/jsd": 0.0, "loss/logits": 0.1874437779188156, "step": 3594 }, { "epoch": 0.16345454545454546, "grad_norm": 7.1875, "grad_norm_var": 0.31256103515625, "learning_rate": 0.0001, "loss": 6.099, "loss/crossentropy": 2.414579302072525, "loss/hidden": 1.701171875, "loss/jsd": 0.0, "loss/logits": 0.19832630082964897, "step": 3596 }, { "epoch": 0.16354545454545455, "grad_norm": 6.75, "grad_norm_var": 0.35982666015625, "learning_rate": 0.0001, "loss": 6.4498, "loss/crossentropy": 2.6965479850769043, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.20657382532954216, "step": 3598 }, { "epoch": 0.16363636363636364, "grad_norm": 6.0625, "grad_norm_var": 0.36259358723958335, "learning_rate": 0.0001, "loss": 6.4179, "loss/crossentropy": 2.6895810961723328, "loss/hidden": 1.654296875, "loss/jsd": 0.0, "loss/logits": 0.2073989473283291, "step": 3600 }, { "epoch": 0.16372727272727272, "grad_norm": 7.0, "grad_norm_var": 0.3729777018229167, "learning_rate": 0.0001, "loss": 6.6604, "loss/crossentropy": 2.6980590224266052, "loss/hidden": 1.712890625, "loss/jsd": 0.0, "loss/logits": 0.22494655847549438, "step": 3602 }, { "epoch": 0.1638181818181818, "grad_norm": 6.59375, "grad_norm_var": 0.338525390625, "learning_rate": 0.0001, "loss": 6.1586, "loss/crossentropy": 2.404732882976532, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.20722679793834686, "step": 3604 }, { "epoch": 0.16390909090909092, "grad_norm": 8.4375, "grad_norm_var": 0.5885416666666666, "learning_rate": 0.0001, "loss": 6.2065, "loss/crossentropy": 2.44076931476593, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.20782802253961563, "step": 3606 }, { "epoch": 0.164, "grad_norm": 5.78125, "grad_norm_var": 0.5493326822916667, "learning_rate": 0.0001, "loss": 6.3411, "loss/crossentropy": 2.5951849222183228, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.20818618312478065, "step": 3608 }, { "epoch": 0.1640909090909091, "grad_norm": 5.6875, "grad_norm_var": 0.460009765625, "learning_rate": 0.0001, "loss": 6.1086, "loss/crossentropy": 2.457001268863678, "loss/hidden": 1.685546875, "loss/jsd": 0.0, "loss/logits": 0.1966077834367752, "step": 3610 }, { "epoch": 0.16418181818181818, "grad_norm": 6.34375, "grad_norm_var": 0.4150390625, "learning_rate": 0.0001, "loss": 6.0574, "loss/crossentropy": 2.348346471786499, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.20039797946810722, "step": 3612 }, { "epoch": 0.16427272727272726, "grad_norm": 6.15625, "grad_norm_var": 0.446484375, "learning_rate": 0.0001, "loss": 5.9306, "loss/crossentropy": 2.2824460566043854, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.19606948643922806, "step": 3614 }, { "epoch": 0.16436363636363635, "grad_norm": 5.84375, "grad_norm_var": 0.4832967122395833, "learning_rate": 0.0001, "loss": 6.2149, "loss/crossentropy": 2.5595412850379944, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.1983461119234562, "step": 3616 }, { "epoch": 0.16445454545454546, "grad_norm": 5.0, "grad_norm_var": 0.5622029622395833, "learning_rate": 0.0001, "loss": 5.5517, "loss/crossentropy": 2.2205335795879364, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.17101026698946953, "step": 3618 }, { "epoch": 0.16454545454545455, "grad_norm": 6.375, "grad_norm_var": 0.546337890625, "learning_rate": 0.0001, "loss": 5.9479, "loss/crossentropy": 2.33952134847641, "loss/hidden": 1.666015625, "loss/jsd": 0.0, "loss/logits": 0.19423821568489075, "step": 3620 }, { "epoch": 0.16463636363636364, "grad_norm": 6.90625, "grad_norm_var": 1.8719889322916667, "learning_rate": 0.0001, "loss": 6.5186, "loss/crossentropy": 2.532242178916931, "loss/hidden": 1.736328125, "loss/jsd": 0.0, "loss/logits": 0.22499863058328629, "step": 3622 }, { "epoch": 0.16472727272727272, "grad_norm": 5.9375, "grad_norm_var": 1.86480712890625, "learning_rate": 0.0001, "loss": 6.422, "loss/crossentropy": 2.586313784122467, "loss/hidden": 1.716796875, "loss/jsd": 0.0, "loss/logits": 0.21188803017139435, "step": 3624 }, { "epoch": 0.1648181818181818, "grad_norm": 7.5, "grad_norm_var": 1.943603515625, "learning_rate": 0.0001, "loss": 6.3903, "loss/crossentropy": 2.673857092857361, "loss/hidden": 1.685546875, "loss/jsd": 0.0, "loss/logits": 0.20308662578463554, "step": 3626 }, { "epoch": 0.16490909090909092, "grad_norm": 7.3125, "grad_norm_var": 1.9831339518229167, "learning_rate": 0.0001, "loss": 6.3645, "loss/crossentropy": 2.5581252574920654, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.21305760368704796, "step": 3628 }, { "epoch": 0.165, "grad_norm": 6.21875, "grad_norm_var": 1.9475545247395833, "learning_rate": 0.0001, "loss": 6.1766, "loss/crossentropy": 2.514763057231903, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.1986050270497799, "step": 3630 }, { "epoch": 0.1650909090909091, "grad_norm": 5.34375, "grad_norm_var": 1.96422119140625, "learning_rate": 0.0001, "loss": 6.1298, "loss/crossentropy": 2.5304266810417175, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.19040555506944656, "step": 3632 }, { "epoch": 0.16518181818181818, "grad_norm": 5.625, "grad_norm_var": 1.830322265625, "learning_rate": 0.0001, "loss": 5.836, "loss/crossentropy": 2.3642649352550507, "loss/hidden": 1.638671875, "loss/jsd": 0.0, "loss/logits": 0.1833045668900013, "step": 3634 }, { "epoch": 0.16527272727272727, "grad_norm": 5.96875, "grad_norm_var": 1.8453125, "learning_rate": 0.0001, "loss": 6.4293, "loss/crossentropy": 2.5802618861198425, "loss/hidden": 1.685546875, "loss/jsd": 0.0, "loss/logits": 0.21635359525680542, "step": 3636 }, { "epoch": 0.16536363636363635, "grad_norm": 6.0625, "grad_norm_var": 0.313525390625, "learning_rate": 0.0001, "loss": 6.2286, "loss/crossentropy": 2.568045914173126, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.19730418175458908, "step": 3638 }, { "epoch": 0.16545454545454547, "grad_norm": 6.71875, "grad_norm_var": 0.3791015625, "learning_rate": 0.0001, "loss": 5.5493, "loss/crossentropy": 2.1270083487033844, "loss/hidden": 1.666015625, "loss/jsd": 0.0, "loss/logits": 0.17562789097428322, "step": 3640 }, { "epoch": 0.16554545454545455, "grad_norm": 6.3125, "grad_norm_var": 0.24911702473958333, "learning_rate": 0.0001, "loss": 6.6635, "loss/crossentropy": 2.8285049200057983, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.21533431112766266, "step": 3642 }, { "epoch": 0.16563636363636364, "grad_norm": 5.28125, "grad_norm_var": 0.17069905598958332, "learning_rate": 0.0001, "loss": 6.3317, "loss/crossentropy": 2.627326250076294, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20637206733226776, "step": 3644 }, { "epoch": 0.16572727272727272, "grad_norm": 6.125, "grad_norm_var": 0.16845296223958334, "learning_rate": 0.0001, "loss": 6.1073, "loss/crossentropy": 2.3808946013450623, "loss/hidden": 1.669921875, "loss/jsd": 0.0, "loss/logits": 0.2056499905884266, "step": 3646 }, { "epoch": 0.1658181818181818, "grad_norm": 5.71875, "grad_norm_var": 0.14687093098958334, "learning_rate": 0.0001, "loss": 5.9481, "loss/crossentropy": 2.358168452978134, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.19414477795362473, "step": 3648 }, { "epoch": 0.16590909090909092, "grad_norm": 6.46875, "grad_norm_var": 0.1865234375, "learning_rate": 0.0001, "loss": 6.5267, "loss/crossentropy": 2.704894244670868, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.2149958685040474, "step": 3650 }, { "epoch": 0.166, "grad_norm": 5.71875, "grad_norm_var": 0.19810791015625, "learning_rate": 0.0001, "loss": 6.3022, "loss/crossentropy": 2.5468077659606934, "loss/hidden": 1.685546875, "loss/jsd": 0.0, "loss/logits": 0.20698357373476028, "step": 3652 }, { "epoch": 0.1660909090909091, "grad_norm": 6.46875, "grad_norm_var": 0.22180582682291666, "learning_rate": 0.0001, "loss": 6.3251, "loss/crossentropy": 2.6877756118774414, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.1984953060746193, "step": 3654 }, { "epoch": 0.16618181818181818, "grad_norm": 5.1875, "grad_norm_var": 0.19924723307291667, "learning_rate": 0.0001, "loss": 6.2411, "loss/crossentropy": 2.5778526663780212, "loss/hidden": 1.642578125, "loss/jsd": 0.0, "loss/logits": 0.202065858989954, "step": 3656 }, { "epoch": 0.16627272727272727, "grad_norm": 6.28125, "grad_norm_var": 0.19358317057291666, "learning_rate": 0.0001, "loss": 6.4997, "loss/crossentropy": 2.6306233406066895, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.21659595891833305, "step": 3658 }, { "epoch": 0.16636363636363635, "grad_norm": 6.375, "grad_norm_var": 0.17102457682291666, "learning_rate": 0.0001, "loss": 6.3541, "loss/crossentropy": 2.6229442954063416, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.20241138339042664, "step": 3660 }, { "epoch": 0.16645454545454547, "grad_norm": 5.375, "grad_norm_var": 0.20504150390625, "learning_rate": 0.0001, "loss": 5.9982, "loss/crossentropy": 2.4416307508945465, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.1900324560701847, "step": 3662 }, { "epoch": 0.16654545454545455, "grad_norm": 6.59375, "grad_norm_var": 0.21871337890625, "learning_rate": 0.0001, "loss": 6.072, "loss/crossentropy": 2.381405472755432, "loss/hidden": 1.697265625, "loss/jsd": 0.0, "loss/logits": 0.1993291787803173, "step": 3664 }, { "epoch": 0.16663636363636364, "grad_norm": 5.75, "grad_norm_var": 0.1962890625, "learning_rate": 0.0001, "loss": 6.1814, "loss/crossentropy": 2.496928036212921, "loss/hidden": 1.677734375, "loss/jsd": 0.0, "loss/logits": 0.20067767053842545, "step": 3666 }, { "epoch": 0.16672727272727272, "grad_norm": 6.1875, "grad_norm_var": 0.18709309895833334, "learning_rate": 0.0001, "loss": 6.217, "loss/crossentropy": 2.5323992371559143, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.20166011527180672, "step": 3668 }, { "epoch": 0.1668181818181818, "grad_norm": 5.375, "grad_norm_var": 0.19846598307291666, "learning_rate": 0.0001, "loss": 6.3644, "loss/crossentropy": 2.6240394711494446, "loss/hidden": 1.669921875, "loss/jsd": 0.0, "loss/logits": 0.20703883469104767, "step": 3670 }, { "epoch": 0.1669090909090909, "grad_norm": 6.59375, "grad_norm_var": 0.15745035807291666, "learning_rate": 0.0001, "loss": 6.2506, "loss/crossentropy": 2.6075815558433533, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.19906987249851227, "step": 3672 }, { "epoch": 0.167, "grad_norm": 5.46875, "grad_norm_var": 0.188916015625, "learning_rate": 0.0001, "loss": 6.283, "loss/crossentropy": 2.630330741405487, "loss/hidden": 1.642578125, "loss/jsd": 0.0, "loss/logits": 0.2010105401277542, "step": 3674 }, { "epoch": 0.1670909090909091, "grad_norm": 5.65625, "grad_norm_var": 0.18531494140625, "learning_rate": 0.0001, "loss": 6.0763, "loss/crossentropy": 2.479452908039093, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.19249730929732323, "step": 3676 }, { "epoch": 0.16718181818181818, "grad_norm": 6.09375, "grad_norm_var": 0.15227864583333334, "learning_rate": 0.0001, "loss": 6.3699, "loss/crossentropy": 2.5904005765914917, "loss/hidden": 1.697265625, "loss/jsd": 0.0, "loss/logits": 0.2082209214568138, "step": 3678 }, { "epoch": 0.16727272727272727, "grad_norm": 6.4375, "grad_norm_var": 0.14000244140625, "learning_rate": 0.0001, "loss": 6.4067, "loss/crossentropy": 2.6290996074676514, "loss/hidden": 1.693359375, "loss/jsd": 0.0, "loss/logits": 0.20842456817626953, "step": 3680 }, { "epoch": 0.16736363636363635, "grad_norm": 6.21875, "grad_norm_var": 0.12389322916666666, "learning_rate": 0.0001, "loss": 6.03, "loss/crossentropy": 2.4414528012275696, "loss/hidden": 1.658203125, "loss/jsd": 0.0, "loss/logits": 0.19303787499666214, "step": 3682 }, { "epoch": 0.16745454545454547, "grad_norm": 6.03125, "grad_norm_var": 0.216796875, "learning_rate": 0.0001, "loss": 6.242, "loss/crossentropy": 2.484056830406189, "loss/hidden": 1.697265625, "loss/jsd": 0.0, "loss/logits": 0.2060634344816208, "step": 3684 }, { "epoch": 0.16754545454545455, "grad_norm": 6.28125, "grad_norm_var": 0.17467447916666667, "learning_rate": 0.0001, "loss": 6.0774, "loss/crossentropy": 2.484617233276367, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.19364922866225243, "step": 3686 }, { "epoch": 0.16763636363636364, "grad_norm": 6.25, "grad_norm_var": 0.20487874348958332, "learning_rate": 0.0001, "loss": 6.175, "loss/crossentropy": 2.4751541018486023, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.2027941346168518, "step": 3688 }, { "epoch": 0.16772727272727272, "grad_norm": 6.4375, "grad_norm_var": 0.16330973307291666, "learning_rate": 0.0001, "loss": 6.6273, "loss/crossentropy": 2.857986629009247, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.20857425779104233, "step": 3690 }, { "epoch": 0.1678181818181818, "grad_norm": 6.4375, "grad_norm_var": 0.16842447916666667, "learning_rate": 0.0001, "loss": 6.4663, "loss/crossentropy": 2.75096595287323, "loss/hidden": 1.630859375, "loss/jsd": 0.0, "loss/logits": 0.20845039188861847, "step": 3692 }, { "epoch": 0.1679090909090909, "grad_norm": 6.5625, "grad_norm_var": 0.16643473307291667, "learning_rate": 0.0001, "loss": 6.5604, "loss/crossentropy": 2.7411015033721924, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.21356967836618423, "step": 3694 }, { "epoch": 0.168, "grad_norm": 5.8125, "grad_norm_var": 0.19563802083333334, "learning_rate": 0.0001, "loss": 6.4602, "loss/crossentropy": 2.702766180038452, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.2069941684603691, "step": 3696 }, { "epoch": 0.1680909090909091, "grad_norm": 5.78125, "grad_norm_var": 0.22356770833333334, "learning_rate": 0.0001, "loss": 6.5327, "loss/crossentropy": 2.7407975792884827, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.21005331724882126, "step": 3698 }, { "epoch": 0.16818181818181818, "grad_norm": 6.21875, "grad_norm_var": 0.15271809895833333, "learning_rate": 0.0001, "loss": 6.2646, "loss/crossentropy": 2.4914658069610596, "loss/hidden": 1.689453125, "loss/jsd": 0.0, "loss/logits": 0.20836932957172394, "step": 3700 }, { "epoch": 0.16827272727272727, "grad_norm": 6.46875, "grad_norm_var": 0.15247395833333333, "learning_rate": 0.0001, "loss": 6.5824, "loss/crossentropy": 2.697091728448868, "loss/hidden": 1.708984375, "loss/jsd": 0.0, "loss/logits": 0.2176305688917637, "step": 3702 }, { "epoch": 0.16836363636363635, "grad_norm": 6.34375, "grad_norm_var": 0.11927083333333334, "learning_rate": 0.0001, "loss": 6.1784, "loss/crossentropy": 2.512207329273224, "loss/hidden": 1.697265625, "loss/jsd": 0.0, "loss/logits": 0.19689130038022995, "step": 3704 }, { "epoch": 0.16845454545454547, "grad_norm": 6.40625, "grad_norm_var": 0.11379801432291667, "learning_rate": 0.0001, "loss": 6.5044, "loss/crossentropy": 2.7836809158325195, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20800549909472466, "step": 3706 }, { "epoch": 0.16854545454545455, "grad_norm": 6.46875, "grad_norm_var": 0.20780843098958332, "learning_rate": 0.0001, "loss": 6.4058, "loss/crossentropy": 2.599448263645172, "loss/hidden": 1.720703125, "loss/jsd": 0.0, "loss/logits": 0.20856009796261787, "step": 3708 }, { "epoch": 0.16863636363636364, "grad_norm": 5.46875, "grad_norm_var": 0.24420572916666666, "learning_rate": 0.0001, "loss": 6.1726, "loss/crossentropy": 2.5678030252456665, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.19582778215408325, "step": 3710 }, { "epoch": 0.16872727272727273, "grad_norm": 7.25, "grad_norm_var": 0.30552978515625, "learning_rate": 0.0001, "loss": 6.5949, "loss/crossentropy": 2.82963627576828, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.2083670049905777, "step": 3712 }, { "epoch": 0.1688181818181818, "grad_norm": 6.09375, "grad_norm_var": 0.28123372395833335, "learning_rate": 0.0001, "loss": 6.1443, "loss/crossentropy": 2.4835479259490967, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.1986972987651825, "step": 3714 }, { "epoch": 0.1689090909090909, "grad_norm": 6.25, "grad_norm_var": 0.27923177083333334, "learning_rate": 0.0001, "loss": 5.9813, "loss/crossentropy": 2.2979612946510315, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.19450850412249565, "step": 3716 }, { "epoch": 0.169, "grad_norm": 6.15625, "grad_norm_var": 0.30575764973958336, "learning_rate": 0.0001, "loss": 6.7419, "loss/crossentropy": 2.80462247133255, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.22380616143345833, "step": 3718 }, { "epoch": 0.1690909090909091, "grad_norm": 5.53125, "grad_norm_var": 0.3429972330729167, "learning_rate": 0.0001, "loss": 6.2043, "loss/crossentropy": 2.5130286812782288, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.20096806436777115, "step": 3720 }, { "epoch": 0.16918181818181818, "grad_norm": 6.40625, "grad_norm_var": 0.34918212890625, "learning_rate": 0.0001, "loss": 6.1095, "loss/crossentropy": 2.4732043743133545, "loss/hidden": 1.666015625, "loss/jsd": 0.0, "loss/logits": 0.19702335447072983, "step": 3722 }, { "epoch": 0.16927272727272727, "grad_norm": 5.625, "grad_norm_var": 0.23765869140625, "learning_rate": 0.0001, "loss": 5.6496, "loss/crossentropy": 2.2372738122940063, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.17717018723487854, "step": 3724 }, { "epoch": 0.16936363636363636, "grad_norm": 5.6875, "grad_norm_var": 0.22272135416666666, "learning_rate": 0.0001, "loss": 6.3601, "loss/crossentropy": 2.5752373337745667, "loss/hidden": 1.697265625, "loss/jsd": 0.0, "loss/logits": 0.20875998586416245, "step": 3726 }, { "epoch": 0.16945454545454544, "grad_norm": 6.28125, "grad_norm_var": 0.12018229166666666, "learning_rate": 0.0001, "loss": 6.181, "loss/crossentropy": 2.4376107454299927, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.20754503458738327, "step": 3728 }, { "epoch": 0.16954545454545455, "grad_norm": 6.03125, "grad_norm_var": 0.17174479166666667, "learning_rate": 0.0001, "loss": 6.2232, "loss/crossentropy": 2.4200029373168945, "loss/hidden": 1.748046875, "loss/jsd": 0.0, "loss/logits": 0.20551645755767822, "step": 3730 }, { "epoch": 0.16963636363636364, "grad_norm": 5.46875, "grad_norm_var": 0.20738525390625, "learning_rate": 0.0001, "loss": 6.2626, "loss/crossentropy": 2.5481541752815247, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.20465228706598282, "step": 3732 }, { "epoch": 0.16972727272727273, "grad_norm": 5.78125, "grad_norm_var": 0.15826416015625, "learning_rate": 0.0001, "loss": 6.089, "loss/crossentropy": 2.518933892250061, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.19469990953803062, "step": 3734 }, { "epoch": 0.1698181818181818, "grad_norm": 5.96875, "grad_norm_var": 0.14178059895833334, "learning_rate": 0.0001, "loss": 6.206, "loss/crossentropy": 2.4689792096614838, "loss/hidden": 1.677734375, "loss/jsd": 0.0, "loss/logits": 0.20592928305268288, "step": 3736 }, { "epoch": 0.1699090909090909, "grad_norm": 6.34375, "grad_norm_var": 0.14416910807291666, "learning_rate": 0.0001, "loss": 6.1449, "loss/crossentropy": 2.5279879570007324, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.1941157467663288, "step": 3738 }, { "epoch": 0.17, "grad_norm": 5.5625, "grad_norm_var": 0.21103108723958333, "learning_rate": 0.0001, "loss": 5.5425, "loss/crossentropy": 2.204057037830353, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.17134227976202965, "step": 3740 }, { "epoch": 0.1700909090909091, "grad_norm": 5.5, "grad_norm_var": 0.22512613932291667, "learning_rate": 0.0001, "loss": 5.8357, "loss/crossentropy": 2.3184778094291687, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.18687989190220833, "step": 3742 }, { "epoch": 0.17018181818181818, "grad_norm": 5.90625, "grad_norm_var": 0.23527018229166666, "learning_rate": 0.0001, "loss": 6.3754, "loss/crossentropy": 2.614904284477234, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.20769395306706429, "step": 3744 }, { "epoch": 0.17027272727272727, "grad_norm": 6.3125, "grad_norm_var": 0.19944254557291666, "learning_rate": 0.0001, "loss": 6.1479, "loss/crossentropy": 2.4149129986763, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.20415351539850235, "step": 3746 }, { "epoch": 0.17036363636363636, "grad_norm": 5.96875, "grad_norm_var": 0.16145833333333334, "learning_rate": 0.0001, "loss": 6.3725, "loss/crossentropy": 2.6258928179740906, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.20669222250580788, "step": 3748 }, { "epoch": 0.17045454545454544, "grad_norm": 6.5, "grad_norm_var": 0.17265625, "learning_rate": 0.0001, "loss": 6.4519, "loss/crossentropy": 2.660762846469879, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.21387861669063568, "step": 3750 }, { "epoch": 0.17054545454545456, "grad_norm": 5.46875, "grad_norm_var": 0.19215087890625, "learning_rate": 0.0001, "loss": 5.9333, "loss/crossentropy": 2.3097579181194305, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.19438285380601883, "step": 3752 }, { "epoch": 0.17063636363636364, "grad_norm": 6.84375, "grad_norm_var": 0.23958333333333334, "learning_rate": 0.0001, "loss": 6.1632, "loss/crossentropy": 2.534003645181656, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.19475888833403587, "step": 3754 }, { "epoch": 0.17072727272727273, "grad_norm": 5.71875, "grad_norm_var": 0.17721354166666667, "learning_rate": 0.0001, "loss": 6.2426, "loss/crossentropy": 2.5580177903175354, "loss/hidden": 1.666015625, "loss/jsd": 0.0, "loss/logits": 0.20185711979866028, "step": 3756 }, { "epoch": 0.1708181818181818, "grad_norm": 6.375, "grad_norm_var": 0.15787353515625, "learning_rate": 0.0001, "loss": 6.4032, "loss/crossentropy": 2.599544882774353, "loss/hidden": 1.677734375, "loss/jsd": 0.0, "loss/logits": 0.2125873602926731, "step": 3758 }, { "epoch": 0.1709090909090909, "grad_norm": 5.90625, "grad_norm_var": 0.15500895182291666, "learning_rate": 0.0001, "loss": 6.1252, "loss/crossentropy": 2.4329859614372253, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20437712967395782, "step": 3760 }, { "epoch": 0.171, "grad_norm": 5.78125, "grad_norm_var": 0.14416910807291666, "learning_rate": 0.0001, "loss": 5.8258, "loss/crossentropy": 2.3118117451667786, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.18616171553730965, "step": 3762 }, { "epoch": 0.1710909090909091, "grad_norm": 6.59375, "grad_norm_var": 0.16886393229166666, "learning_rate": 0.0001, "loss": 6.3528, "loss/crossentropy": 2.6653149724006653, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.20039040595293045, "step": 3764 }, { "epoch": 0.17118181818181818, "grad_norm": 6.84375, "grad_norm_var": 0.19794514973958333, "learning_rate": 0.0001, "loss": 5.9974, "loss/crossentropy": 2.4238707423210144, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.19172781333327293, "step": 3766 }, { "epoch": 0.17127272727272727, "grad_norm": 5.59375, "grad_norm_var": 0.19156494140625, "learning_rate": 0.0001, "loss": 6.2598, "loss/crossentropy": 2.5361987948417664, "loss/hidden": 1.650390625, "loss/jsd": 0.0, "loss/logits": 0.2073173075914383, "step": 3768 }, { "epoch": 0.17136363636363636, "grad_norm": 6.25, "grad_norm_var": 0.1322265625, "learning_rate": 0.0001, "loss": 6.3119, "loss/crossentropy": 2.648592233657837, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.1979677975177765, "step": 3770 }, { "epoch": 0.17145454545454544, "grad_norm": 6.28125, "grad_norm_var": 0.13160400390625, "learning_rate": 0.0001, "loss": 6.483, "loss/crossentropy": 2.6991185545921326, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.20964059233665466, "step": 3772 }, { "epoch": 0.17154545454545456, "grad_norm": 5.625, "grad_norm_var": 0.14986572265625, "learning_rate": 0.0001, "loss": 6.0354, "loss/crossentropy": 2.4405998289585114, "loss/hidden": 1.654296875, "loss/jsd": 0.0, "loss/logits": 0.19404618814587593, "step": 3774 }, { "epoch": 0.17163636363636364, "grad_norm": 5.1875, "grad_norm_var": 0.19804280598958332, "learning_rate": 0.0001, "loss": 5.9689, "loss/crossentropy": 2.4565613865852356, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.1902933344244957, "step": 3776 }, { "epoch": 0.17172727272727273, "grad_norm": 5.4375, "grad_norm_var": 0.22489827473958332, "learning_rate": 0.0001, "loss": 6.3708, "loss/crossentropy": 2.6991333961486816, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20232197642326355, "step": 3778 }, { "epoch": 0.17181818181818181, "grad_norm": 6.15625, "grad_norm_var": 0.21404622395833334, "learning_rate": 0.0001, "loss": 6.3229, "loss/crossentropy": 2.55633944272995, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.21025178581476212, "step": 3780 }, { "epoch": 0.1719090909090909, "grad_norm": 5.71875, "grad_norm_var": 0.18125, "learning_rate": 0.0001, "loss": 5.7885, "loss/crossentropy": 2.301587998867035, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.18384933099150658, "step": 3782 }, { "epoch": 0.172, "grad_norm": 6.09375, "grad_norm_var": 0.16490478515625, "learning_rate": 0.0001, "loss": 6.0778, "loss/crossentropy": 2.4533385932445526, "loss/hidden": 1.677734375, "loss/jsd": 0.0, "loss/logits": 0.19467723742127419, "step": 3784 }, { "epoch": 0.1720909090909091, "grad_norm": 6.09375, "grad_norm_var": 0.163525390625, "learning_rate": 0.0001, "loss": 6.3471, "loss/crossentropy": 2.6403600573539734, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.20231449231505394, "step": 3786 }, { "epoch": 0.17218181818181819, "grad_norm": 6.1875, "grad_norm_var": 0.11646728515625, "learning_rate": 0.0001, "loss": 6.1061, "loss/crossentropy": 2.4506136775016785, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.19523466750979424, "step": 3788 }, { "epoch": 0.17227272727272727, "grad_norm": 6.15625, "grad_norm_var": 0.11575113932291667, "learning_rate": 0.0001, "loss": 6.1992, "loss/crossentropy": 2.585551083087921, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.1969149224460125, "step": 3790 }, { "epoch": 0.17236363636363636, "grad_norm": 6.34375, "grad_norm_var": 0.07760416666666667, "learning_rate": 0.0001, "loss": 6.4131, "loss/crossentropy": 2.5720545053482056, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.2145768441259861, "step": 3792 }, { "epoch": 0.17245454545454544, "grad_norm": 5.96875, "grad_norm_var": 0.04804280598958333, "learning_rate": 0.0001, "loss": 5.9384, "loss/crossentropy": 2.3995402455329895, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.19118967652320862, "step": 3794 }, { "epoch": 0.17254545454545456, "grad_norm": 6.125, "grad_norm_var": 0.03899739583333333, "learning_rate": 0.0001, "loss": 6.1285, "loss/crossentropy": 2.500584304332733, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.19482606649398804, "step": 3796 }, { "epoch": 0.17263636363636364, "grad_norm": 5.59375, "grad_norm_var": 0.050244140625, "learning_rate": 0.0001, "loss": 6.0487, "loss/crossentropy": 2.4606496691703796, "loss/hidden": 1.650390625, "loss/jsd": 0.0, "loss/logits": 0.1937658190727234, "step": 3798 }, { "epoch": 0.17272727272727273, "grad_norm": 6.0625, "grad_norm_var": 0.05468343098958333, "learning_rate": 0.0001, "loss": 6.353, "loss/crossentropy": 2.6022858023643494, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.2076842002570629, "step": 3800 }, { "epoch": 0.17281818181818182, "grad_norm": 6.78125, "grad_norm_var": 0.09230143229166667, "learning_rate": 0.0001, "loss": 6.3255, "loss/crossentropy": 2.594381630420685, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20826809480786324, "step": 3802 }, { "epoch": 0.1729090909090909, "grad_norm": 6.0, "grad_norm_var": 0.090478515625, "learning_rate": 0.0001, "loss": 6.3822, "loss/crossentropy": 2.690230071544647, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.20161523669958115, "step": 3804 }, { "epoch": 0.173, "grad_norm": 6.3125, "grad_norm_var": 0.08743489583333333, "learning_rate": 0.0001, "loss": 6.0421, "loss/crossentropy": 2.383758455514908, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.19766532257199287, "step": 3806 }, { "epoch": 0.1730909090909091, "grad_norm": 5.65625, "grad_norm_var": 0.10253499348958334, "learning_rate": 0.0001, "loss": 6.0261, "loss/crossentropy": 2.464147210121155, "loss/hidden": 1.662109375, "loss/jsd": 0.0, "loss/logits": 0.18998238071799278, "step": 3808 }, { "epoch": 0.1731818181818182, "grad_norm": 5.40625, "grad_norm_var": 0.12706705729166667, "learning_rate": 0.0001, "loss": 6.0371, "loss/crossentropy": 2.540132224559784, "loss/hidden": 1.630859375, "loss/jsd": 0.0, "loss/logits": 0.18660715967416763, "step": 3810 }, { "epoch": 0.17327272727272727, "grad_norm": 7.875, "grad_norm_var": 0.34722900390625, "learning_rate": 0.0001, "loss": 6.6315, "loss/crossentropy": 2.7894975543022156, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.21603599935770035, "step": 3812 }, { "epoch": 0.17336363636363636, "grad_norm": 5.8125, "grad_norm_var": 0.325, "learning_rate": 0.0001, "loss": 6.0418, "loss/crossentropy": 2.4187511801719666, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.1990196406841278, "step": 3814 }, { "epoch": 0.17345454545454544, "grad_norm": 5.34375, "grad_norm_var": 0.36886393229166664, "learning_rate": 0.0001, "loss": 6.1418, "loss/crossentropy": 2.5335395336151123, "loss/hidden": 1.630859375, "loss/jsd": 0.0, "loss/logits": 0.19774062559008598, "step": 3816 }, { "epoch": 0.17354545454545456, "grad_norm": 6.15625, "grad_norm_var": 0.33391927083333334, "learning_rate": 0.0001, "loss": 5.8609, "loss/crossentropy": 2.3783416152000427, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.18302004784345627, "step": 3818 }, { "epoch": 0.17363636363636364, "grad_norm": 8.6875, "grad_norm_var": 0.7867146809895833, "learning_rate": 0.0001, "loss": 6.0711, "loss/crossentropy": 2.3235272765159607, "loss/hidden": 1.708984375, "loss/jsd": 0.0, "loss/logits": 0.20385896787047386, "step": 3820 }, { "epoch": 0.17372727272727273, "grad_norm": 5.8125, "grad_norm_var": 0.7996378580729167, "learning_rate": 0.0001, "loss": 6.0543, "loss/crossentropy": 2.403712213039398, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.19826089218258858, "step": 3822 }, { "epoch": 0.17381818181818182, "grad_norm": 5.65625, "grad_norm_var": 0.7851399739583333, "learning_rate": 0.0001, "loss": 6.2974, "loss/crossentropy": 2.5889362692832947, "loss/hidden": 1.634765625, "loss/jsd": 0.0, "loss/logits": 0.2073681391775608, "step": 3824 }, { "epoch": 0.1739090909090909, "grad_norm": 6.1875, "grad_norm_var": 0.7568644205729167, "learning_rate": 0.0001, "loss": 6.1122, "loss/crossentropy": 2.432122230529785, "loss/hidden": 1.666015625, "loss/jsd": 0.0, "loss/logits": 0.20140565186738968, "step": 3826 }, { "epoch": 0.174, "grad_norm": 6.09375, "grad_norm_var": 0.5483072916666667, "learning_rate": 0.0001, "loss": 6.1256, "loss/crossentropy": 2.4939897060394287, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.19577790424227715, "step": 3828 }, { "epoch": 0.1740909090909091, "grad_norm": 6.40625, "grad_norm_var": 0.5571614583333333, "learning_rate": 0.0001, "loss": 6.3842, "loss/crossentropy": 2.4642977118492126, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.22207055613398552, "step": 3830 }, { "epoch": 0.1741818181818182, "grad_norm": 5.53125, "grad_norm_var": 0.5323567708333333, "learning_rate": 0.0001, "loss": 6.4126, "loss/crossentropy": 2.713141083717346, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.2035430260002613, "step": 3832 }, { "epoch": 0.17427272727272727, "grad_norm": 5.9375, "grad_norm_var": 0.6641886393229167, "learning_rate": 0.0001, "loss": 5.9153, "loss/crossentropy": 2.2444231808185577, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.19970815256237984, "step": 3834 }, { "epoch": 0.17436363636363636, "grad_norm": 6.21875, "grad_norm_var": 0.2633748372395833, "learning_rate": 0.0001, "loss": 6.5721, "loss/crossentropy": 2.705324411392212, "loss/hidden": 1.666015625, "loss/jsd": 0.0, "loss/logits": 0.22007619217038155, "step": 3836 }, { "epoch": 0.17445454545454545, "grad_norm": 6.40625, "grad_norm_var": 0.24856770833333333, "learning_rate": 0.0001, "loss": 6.3661, "loss/crossentropy": 2.6076979637145996, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.20786932483315468, "step": 3838 }, { "epoch": 0.17454545454545456, "grad_norm": 6.96875, "grad_norm_var": 0.2601521809895833, "learning_rate": 0.0001, "loss": 6.1983, "loss/crossentropy": 2.5239198207855225, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.20142635703086853, "step": 3840 }, { "epoch": 0.17463636363636365, "grad_norm": 5.46875, "grad_norm_var": 0.3012003580729167, "learning_rate": 0.0001, "loss": 5.6437, "loss/crossentropy": 2.217732846736908, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.17853335663676262, "step": 3842 }, { "epoch": 0.17472727272727273, "grad_norm": 5.78125, "grad_norm_var": 0.31217041015625, "learning_rate": 0.0001, "loss": 6.4198, "loss/crossentropy": 2.7282195687294006, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.208415649831295, "step": 3844 }, { "epoch": 0.17481818181818182, "grad_norm": 7.0625, "grad_norm_var": 0.35247395833333334, "learning_rate": 0.0001, "loss": 6.0351, "loss/crossentropy": 2.409081310033798, "loss/hidden": 1.654296875, "loss/jsd": 0.0, "loss/logits": 0.19717654213309288, "step": 3846 }, { "epoch": 0.1749090909090909, "grad_norm": 6.28125, "grad_norm_var": 0.31497395833333336, "learning_rate": 0.0001, "loss": 6.0636, "loss/crossentropy": 2.456543803215027, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.19351642578840256, "step": 3848 }, { "epoch": 0.175, "grad_norm": 5.96875, "grad_norm_var": 0.21383056640625, "learning_rate": 0.0001, "loss": 6.2142, "loss/crossentropy": 2.556404232978821, "loss/hidden": 1.658203125, "loss/jsd": 0.0, "loss/logits": 0.1999613717198372, "step": 3850 }, { "epoch": 0.1750909090909091, "grad_norm": 6.59375, "grad_norm_var": 0.21209309895833334, "learning_rate": 0.0001, "loss": 6.2512, "loss/crossentropy": 2.5756537318229675, "loss/hidden": 1.654296875, "loss/jsd": 0.0, "loss/logits": 0.20212148874998093, "step": 3852 }, { "epoch": 0.1751818181818182, "grad_norm": 5.71875, "grad_norm_var": 0.2517578125, "learning_rate": 0.0001, "loss": 6.0882, "loss/crossentropy": 2.4581545889377594, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.19347311928868294, "step": 3854 }, { "epoch": 0.17527272727272727, "grad_norm": 7.09375, "grad_norm_var": 0.24661051432291667, "learning_rate": 0.0001, "loss": 5.5826, "loss/crossentropy": 2.023097813129425, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.18524499982595444, "step": 3856 }, { "epoch": 0.17536363636363636, "grad_norm": 5.71875, "grad_norm_var": 0.22060139973958334, "learning_rate": 0.0001, "loss": 6.3882, "loss/crossentropy": 2.5616695284843445, "loss/hidden": 1.689453125, "loss/jsd": 0.0, "loss/logits": 0.21371031180024147, "step": 3858 }, { "epoch": 0.17545454545454545, "grad_norm": 5.8125, "grad_norm_var": 0.250390625, "learning_rate": 0.0001, "loss": 6.0369, "loss/crossentropy": 2.4233603477478027, "loss/hidden": 1.650390625, "loss/jsd": 0.0, "loss/logits": 0.1963135413825512, "step": 3860 }, { "epoch": 0.17554545454545453, "grad_norm": 5.65625, "grad_norm_var": 0.23043212890625, "learning_rate": 0.0001, "loss": 6.1384, "loss/crossentropy": 2.4840198755264282, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.19785922020673752, "step": 3862 }, { "epoch": 0.17563636363636365, "grad_norm": 6.03125, "grad_norm_var": 0.23006184895833334, "learning_rate": 0.0001, "loss": 6.2469, "loss/crossentropy": 2.572755455970764, "loss/hidden": 1.666015625, "loss/jsd": 0.0, "loss/logits": 0.2008083201944828, "step": 3864 }, { "epoch": 0.17572727272727273, "grad_norm": 6.03125, "grad_norm_var": 0.20987955729166666, "learning_rate": 0.0001, "loss": 5.9916, "loss/crossentropy": 2.4578575491905212, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.191655695438385, "step": 3866 }, { "epoch": 0.17581818181818182, "grad_norm": 5.40625, "grad_norm_var": 0.21925455729166668, "learning_rate": 0.0001, "loss": 6.1684, "loss/crossentropy": 2.5671547651290894, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.19860413670539856, "step": 3868 }, { "epoch": 0.1759090909090909, "grad_norm": 6.03125, "grad_norm_var": 0.169140625, "learning_rate": 0.0001, "loss": 6.4368, "loss/crossentropy": 2.6695435643196106, "loss/hidden": 1.669921875, "loss/jsd": 0.0, "loss/logits": 0.20973380282521248, "step": 3870 }, { "epoch": 0.176, "grad_norm": 5.625, "grad_norm_var": 0.0888671875, "learning_rate": 0.0001, "loss": 6.4254, "loss/crossentropy": 2.696273922920227, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.2055278904736042, "step": 3872 }, { "epoch": 0.1760909090909091, "grad_norm": 5.46875, "grad_norm_var": 0.050679524739583336, "learning_rate": 0.0001, "loss": 6.0163, "loss/crossentropy": 2.4287679195404053, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.1964471973478794, "step": 3874 }, { "epoch": 0.1761818181818182, "grad_norm": 12.3125, "grad_norm_var": 2.6757771809895834, "learning_rate": 0.0001, "loss": 6.4308, "loss/crossentropy": 2.454353630542755, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.22498587891459465, "step": 3876 }, { "epoch": 0.17627272727272728, "grad_norm": 6.25, "grad_norm_var": 2.6515462239583334, "learning_rate": 0.0001, "loss": 6.5182, "loss/crossentropy": 2.7081337571144104, "loss/hidden": 1.669921875, "loss/jsd": 0.0, "loss/logits": 0.21401526778936386, "step": 3878 }, { "epoch": 0.17636363636363636, "grad_norm": 5.78125, "grad_norm_var": 2.6587198893229167, "learning_rate": 0.0001, "loss": 6.2835, "loss/crossentropy": 2.5771660208702087, "loss/hidden": 1.654296875, "loss/jsd": 0.0, "loss/logits": 0.2052028328180313, "step": 3880 }, { "epoch": 0.17645454545454545, "grad_norm": 11.625, "grad_norm_var": 4.394950358072917, "learning_rate": 0.0001, "loss": 6.3469, "loss/crossentropy": 2.476637125015259, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.21632175147533417, "step": 3882 }, { "epoch": 0.17654545454545453, "grad_norm": 5.34375, "grad_norm_var": 4.387093098958333, "learning_rate": 0.0001, "loss": 6.0247, "loss/crossentropy": 2.513404905796051, "loss/hidden": 1.630859375, "loss/jsd": 0.0, "loss/logits": 0.18804632499814034, "step": 3884 }, { "epoch": 0.17663636363636365, "grad_norm": 6.125, "grad_norm_var": 4.360400390625, "learning_rate": 0.0001, "loss": 6.4559, "loss/crossentropy": 2.637160837650299, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.21351467445492744, "step": 3886 }, { "epoch": 0.17672727272727273, "grad_norm": 5.5625, "grad_norm_var": 4.3701171875, "learning_rate": 0.0001, "loss": 5.567, "loss/crossentropy": 2.1600129902362823, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.17663514614105225, "step": 3888 }, { "epoch": 0.17681818181818182, "grad_norm": 6.125, "grad_norm_var": 4.222261555989584, "learning_rate": 0.0001, "loss": 6.0639, "loss/crossentropy": 2.419843077659607, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.19701973721385002, "step": 3890 }, { "epoch": 0.1769090909090909, "grad_norm": 6.6875, "grad_norm_var": 2.4014607747395833, "learning_rate": 0.0001, "loss": 6.7217, "loss/crossentropy": 2.8327451944351196, "loss/hidden": 1.693359375, "loss/jsd": 0.0, "loss/logits": 0.21956396102905273, "step": 3892 }, { "epoch": 0.177, "grad_norm": 6.1875, "grad_norm_var": 2.42115478515625, "learning_rate": 0.0001, "loss": 6.1994, "loss/crossentropy": 2.4718754291534424, "loss/hidden": 1.666015625, "loss/jsd": 0.0, "loss/logits": 0.20615523681044579, "step": 3894 }, { "epoch": 0.1770909090909091, "grad_norm": 6.09375, "grad_norm_var": 2.3899576822916666, "learning_rate": 0.0001, "loss": 6.3576, "loss/crossentropy": 2.693653404712677, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20194115862250328, "step": 3896 }, { "epoch": 0.1771818181818182, "grad_norm": 5.65625, "grad_norm_var": 0.656494140625, "learning_rate": 0.0001, "loss": 6.152, "loss/crossentropy": 2.473484843969345, "loss/hidden": 1.693359375, "loss/jsd": 0.0, "loss/logits": 0.19851670786738396, "step": 3898 }, { "epoch": 0.17727272727272728, "grad_norm": 5.6875, "grad_norm_var": 0.6206339518229167, "learning_rate": 0.0001, "loss": 6.3407, "loss/crossentropy": 2.5911760926246643, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.2073749229311943, "step": 3900 }, { "epoch": 0.17736363636363636, "grad_norm": 5.71875, "grad_norm_var": 0.5809855143229167, "learning_rate": 0.0001, "loss": 6.1562, "loss/crossentropy": 2.558847486972809, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.1950853168964386, "step": 3902 }, { "epoch": 0.17745454545454545, "grad_norm": 6.03125, "grad_norm_var": 0.5682291666666667, "learning_rate": 0.0001, "loss": 5.9675, "loss/crossentropy": 2.4488719701766968, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.1872129887342453, "step": 3904 }, { "epoch": 0.17754545454545453, "grad_norm": 5.78125, "grad_norm_var": 0.6090494791666666, "learning_rate": 0.0001, "loss": 6.1057, "loss/crossentropy": 2.439751148223877, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.19784095510840416, "step": 3906 }, { "epoch": 0.17763636363636365, "grad_norm": 5.71875, "grad_norm_var": 0.126025390625, "learning_rate": 0.0001, "loss": 6.323, "loss/crossentropy": 2.6481664776802063, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.20107900351285934, "step": 3908 }, { "epoch": 0.17772727272727273, "grad_norm": 5.375, "grad_norm_var": 0.14908854166666666, "learning_rate": 0.0001, "loss": 6.0769, "loss/crossentropy": 2.4709521532058716, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.194965697824955, "step": 3910 }, { "epoch": 0.17781818181818182, "grad_norm": 5.375, "grad_norm_var": 0.18886311848958334, "learning_rate": 0.0001, "loss": 5.89, "loss/crossentropy": 2.455429971218109, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.18154604732990265, "step": 3912 }, { "epoch": 0.1779090909090909, "grad_norm": 5.125, "grad_norm_var": 0.19472249348958334, "learning_rate": 0.0001, "loss": 5.8865, "loss/crossentropy": 2.3508248925209045, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.18911901116371155, "step": 3914 }, { "epoch": 0.178, "grad_norm": 6.15625, "grad_norm_var": 0.192822265625, "learning_rate": 0.0001, "loss": 6.1216, "loss/crossentropy": 2.5482152104377747, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.194057896733284, "step": 3916 }, { "epoch": 0.17809090909090908, "grad_norm": 6.15625, "grad_norm_var": 0.20787353515625, "learning_rate": 0.0001, "loss": 6.3422, "loss/crossentropy": 2.552854299545288, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.20745313167572021, "step": 3918 }, { "epoch": 0.1781818181818182, "grad_norm": 6.59375, "grad_norm_var": 0.311328125, "learning_rate": 0.0001, "loss": 6.6177, "loss/crossentropy": 2.801006257534027, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.21486841887235641, "step": 3920 }, { "epoch": 0.17827272727272728, "grad_norm": 6.0625, "grad_norm_var": 0.33521728515625, "learning_rate": 0.0001, "loss": 6.5422, "loss/crossentropy": 2.7375234365463257, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.21054201945662498, "step": 3922 }, { "epoch": 0.17836363636363636, "grad_norm": 5.84375, "grad_norm_var": 0.33609619140625, "learning_rate": 0.0001, "loss": 6.3814, "loss/crossentropy": 2.681509494781494, "loss/hidden": 1.666015625, "loss/jsd": 0.0, "loss/logits": 0.2033889926970005, "step": 3924 }, { "epoch": 0.17845454545454545, "grad_norm": 5.96875, "grad_norm_var": 0.3078084309895833, "learning_rate": 0.0001, "loss": 5.9368, "loss/crossentropy": 2.37211811542511, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.18733183667063713, "step": 3926 }, { "epoch": 0.17854545454545453, "grad_norm": 5.96875, "grad_norm_var": 0.24189046223958333, "learning_rate": 0.0001, "loss": 6.3029, "loss/crossentropy": 2.610478311777115, "loss/hidden": 1.662109375, "loss/jsd": 0.0, "loss/logits": 0.20303580164909363, "step": 3928 }, { "epoch": 0.17863636363636365, "grad_norm": 6.0625, "grad_norm_var": 0.17320556640625, "learning_rate": 0.0001, "loss": 6.2292, "loss/crossentropy": 2.5011950731277466, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.20561693236231804, "step": 3930 }, { "epoch": 0.17872727272727273, "grad_norm": 6.125, "grad_norm_var": 0.1478515625, "learning_rate": 0.0001, "loss": 6.4891, "loss/crossentropy": 2.764755666255951, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.20407165959477425, "step": 3932 }, { "epoch": 0.17881818181818182, "grad_norm": 6.25, "grad_norm_var": 0.1515625, "learning_rate": 0.0001, "loss": 6.4835, "loss/crossentropy": 2.7410109639167786, "loss/hidden": 1.677734375, "loss/jsd": 0.0, "loss/logits": 0.20647718012332916, "step": 3934 }, { "epoch": 0.1789090909090909, "grad_norm": 6.1875, "grad_norm_var": 0.13917643229166668, "learning_rate": 0.0001, "loss": 5.9189, "loss/crossentropy": 2.355079025030136, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.19173608347773552, "step": 3936 }, { "epoch": 0.179, "grad_norm": 6.75, "grad_norm_var": 0.12561442057291666, "learning_rate": 0.0001, "loss": 6.0476, "loss/crossentropy": 2.432106375694275, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.19552959874272346, "step": 3938 }, { "epoch": 0.17909090909090908, "grad_norm": 8.9375, "grad_norm_var": 0.6491495768229166, "learning_rate": 0.0001, "loss": 6.5833, "loss/crossentropy": 2.697195827960968, "loss/hidden": 1.720703125, "loss/jsd": 0.0, "loss/logits": 0.21653834357857704, "step": 3940 }, { "epoch": 0.1791818181818182, "grad_norm": 5.625, "grad_norm_var": 0.6722493489583333, "learning_rate": 0.0001, "loss": 6.2105, "loss/crossentropy": 2.590141534805298, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.19739048182964325, "step": 3942 }, { "epoch": 0.17927272727272728, "grad_norm": 6.53125, "grad_norm_var": 0.67115478515625, "learning_rate": 0.0001, "loss": 6.1642, "loss/crossentropy": 2.46735417842865, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.20367174595594406, "step": 3944 }, { "epoch": 0.17936363636363636, "grad_norm": 5.96875, "grad_norm_var": 0.7531087239583333, "learning_rate": 0.0001, "loss": 5.4609, "loss/crossentropy": 2.107422888278961, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.1716727763414383, "step": 3946 }, { "epoch": 0.17945454545454545, "grad_norm": 6.0625, "grad_norm_var": 0.75728759765625, "learning_rate": 0.0001, "loss": 6.2558, "loss/crossentropy": 2.52687406539917, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.20414042100310326, "step": 3948 }, { "epoch": 0.17954545454545454, "grad_norm": 6.1875, "grad_norm_var": 0.7662760416666666, "learning_rate": 0.0001, "loss": 6.2552, "loss/crossentropy": 2.5508036017417908, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.2052101269364357, "step": 3950 }, { "epoch": 0.17963636363636365, "grad_norm": 6.1875, "grad_norm_var": 0.71470947265625, "learning_rate": 0.0001, "loss": 6.087, "loss/crossentropy": 2.4224958419799805, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.19965773448348045, "step": 3952 }, { "epoch": 0.17972727272727274, "grad_norm": 6.375, "grad_norm_var": 0.6643229166666667, "learning_rate": 0.0001, "loss": 6.0207, "loss/crossentropy": 2.408890962600708, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.19380001351237297, "step": 3954 }, { "epoch": 0.17981818181818182, "grad_norm": 5.5625, "grad_norm_var": 0.16087239583333332, "learning_rate": 0.0001, "loss": 6.1513, "loss/crossentropy": 2.492328107357025, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.1985146887600422, "step": 3956 }, { "epoch": 0.1799090909090909, "grad_norm": 6.21875, "grad_norm_var": 0.17336832682291667, "learning_rate": 0.0001, "loss": 6.4268, "loss/crossentropy": 2.708676040172577, "loss/hidden": 1.658203125, "loss/jsd": 0.0, "loss/logits": 0.2059900313615799, "step": 3958 }, { "epoch": 0.18, "grad_norm": 6.40625, "grad_norm_var": 0.16448160807291667, "learning_rate": 0.0001, "loss": 6.1906, "loss/crossentropy": 2.484995484352112, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.2029808945953846, "step": 3960 }, { "epoch": 0.18009090909090908, "grad_norm": 5.28125, "grad_norm_var": 0.153515625, "learning_rate": 0.0001, "loss": 6.0179, "loss/crossentropy": 2.4029815196990967, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.1933290958404541, "step": 3962 }, { "epoch": 0.1801818181818182, "grad_norm": 6.0625, "grad_norm_var": 0.153515625, "learning_rate": 0.0001, "loss": 6.3366, "loss/crossentropy": 2.5812178254127502, "loss/hidden": 1.658203125, "loss/jsd": 0.0, "loss/logits": 0.2097191885113716, "step": 3964 }, { "epoch": 0.18027272727272728, "grad_norm": 6.03125, "grad_norm_var": 0.33352864583333336, "learning_rate": 0.0001, "loss": 6.4738, "loss/crossentropy": 2.671384632587433, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.212073415517807, "step": 3966 }, { "epoch": 0.18036363636363636, "grad_norm": 5.6875, "grad_norm_var": 0.357275390625, "learning_rate": 0.0001, "loss": 5.9149, "loss/crossentropy": 2.3833834528923035, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.19064940884709358, "step": 3968 }, { "epoch": 0.18045454545454545, "grad_norm": 9.25, "grad_norm_var": 0.9663045247395833, "learning_rate": 0.0001, "loss": 6.4475, "loss/crossentropy": 2.5339179039001465, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.2257356010377407, "step": 3970 }, { "epoch": 0.18054545454545454, "grad_norm": 6.8125, "grad_norm_var": 0.9422526041666667, "learning_rate": 0.0001, "loss": 6.2892, "loss/crossentropy": 2.564932107925415, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.20485171675682068, "step": 3972 }, { "epoch": 0.18063636363636362, "grad_norm": 5.75, "grad_norm_var": 0.9723307291666666, "learning_rate": 0.0001, "loss": 6.3254, "loss/crossentropy": 2.6666144132614136, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20181823894381523, "step": 3974 }, { "epoch": 0.18072727272727274, "grad_norm": 5.96875, "grad_norm_var": 1.0370076497395833, "learning_rate": 0.0001, "loss": 6.3536, "loss/crossentropy": 2.609151065349579, "loss/hidden": 1.662109375, "loss/jsd": 0.0, "loss/logits": 0.20823120325803757, "step": 3976 }, { "epoch": 0.18081818181818182, "grad_norm": 5.59375, "grad_norm_var": 1.0394490559895833, "learning_rate": 0.0001, "loss": 5.9027, "loss/crossentropy": 2.3191885948181152, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.19350386783480644, "step": 3978 }, { "epoch": 0.1809090909090909, "grad_norm": 6.8125, "grad_norm_var": 1.0537068684895834, "learning_rate": 0.0001, "loss": 6.3769, "loss/crossentropy": 2.642591118812561, "loss/hidden": 1.666015625, "loss/jsd": 0.0, "loss/logits": 0.20682790130376816, "step": 3980 }, { "epoch": 0.181, "grad_norm": 6.21875, "grad_norm_var": 1.0344034830729167, "learning_rate": 0.0001, "loss": 6.5974, "loss/crossentropy": 2.708809792995453, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.21971874311566353, "step": 3982 }, { "epoch": 0.18109090909090908, "grad_norm": 5.84375, "grad_norm_var": 0.9964192708333334, "learning_rate": 0.0001, "loss": 6.2386, "loss/crossentropy": 2.5317405462265015, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.20251823961734772, "step": 3984 }, { "epoch": 0.1811818181818182, "grad_norm": 5.71875, "grad_norm_var": 0.4669881184895833, "learning_rate": 0.0001, "loss": 6.3942, "loss/crossentropy": 2.6952701210975647, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.20114055648446083, "step": 3986 }, { "epoch": 0.18127272727272728, "grad_norm": 5.53125, "grad_norm_var": 0.45774739583333335, "learning_rate": 0.0001, "loss": 6.3691, "loss/crossentropy": 2.6939985752105713, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20461470261216164, "step": 3988 }, { "epoch": 0.18136363636363637, "grad_norm": 6.8125, "grad_norm_var": 0.48045247395833335, "learning_rate": 0.0001, "loss": 6.2283, "loss/crossentropy": 2.482782483100891, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.20775504782795906, "step": 3990 }, { "epoch": 0.18145454545454545, "grad_norm": 5.75, "grad_norm_var": 0.41287434895833336, "learning_rate": 0.0001, "loss": 6.1464, "loss/crossentropy": 2.55173659324646, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.19423054531216621, "step": 3992 }, { "epoch": 0.18154545454545454, "grad_norm": 6.25, "grad_norm_var": 0.36061197916666665, "learning_rate": 0.0001, "loss": 6.6024, "loss/crossentropy": 2.786294162273407, "loss/hidden": 1.666015625, "loss/jsd": 0.0, "loss/logits": 0.21500681340694427, "step": 3994 }, { "epoch": 0.18163636363636362, "grad_norm": 5.71875, "grad_norm_var": 0.40797119140625, "learning_rate": 0.0001, "loss": 6.0178, "loss/crossentropy": 2.4357628226280212, "loss/hidden": 1.658203125, "loss/jsd": 0.0, "loss/logits": 0.19238543137907982, "step": 3996 }, { "epoch": 0.18172727272727274, "grad_norm": 6.03125, "grad_norm_var": 0.2528483072916667, "learning_rate": 0.0001, "loss": 6.1009, "loss/crossentropy": 2.553075611591339, "loss/hidden": 1.654296875, "loss/jsd": 0.0, "loss/logits": 0.18934806063771248, "step": 3998 }, { "epoch": 0.18181818181818182, "grad_norm": 6.09375, "grad_norm_var": 0.2559895833333333, "learning_rate": 0.0001, "loss": 6.4075, "loss/crossentropy": 2.667267382144928, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.2066437117755413, "step": 4000 }, { "epoch": 0.1819090909090909, "grad_norm": 5.6875, "grad_norm_var": 0.3304972330729167, "learning_rate": 0.0001, "loss": 6.1018, "loss/crossentropy": 2.5782812237739563, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19063392654061317, "step": 4002 }, { "epoch": 0.182, "grad_norm": 6.1875, "grad_norm_var": 0.30481363932291666, "learning_rate": 0.0001, "loss": 5.7968, "loss/crossentropy": 2.3341749906539917, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1845429614186287, "step": 4004 }, { "epoch": 0.18209090909090908, "grad_norm": 6.40625, "grad_norm_var": 0.20442301432291668, "learning_rate": 0.0001, "loss": 6.2793, "loss/crossentropy": 2.5755722522735596, "loss/hidden": 1.638671875, "loss/jsd": 0.0, "loss/logits": 0.20650076493620872, "step": 4006 }, { "epoch": 0.1821818181818182, "grad_norm": 5.84375, "grad_norm_var": 0.20338541666666668, "learning_rate": 0.0001, "loss": 5.8469, "loss/crossentropy": 2.3462856113910675, "loss/hidden": 1.638671875, "loss/jsd": 0.0, "loss/logits": 0.18619055300951004, "step": 4008 }, { "epoch": 0.18227272727272728, "grad_norm": 5.96875, "grad_norm_var": 0.2068359375, "learning_rate": 0.0001, "loss": 6.6238, "loss/crossentropy": 2.8250837922096252, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.2130710557103157, "step": 4010 }, { "epoch": 0.18236363636363637, "grad_norm": 6.9375, "grad_norm_var": 0.24055989583333334, "learning_rate": 0.0001, "loss": 5.9404, "loss/crossentropy": 2.3749420940876007, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.19189269095659256, "step": 4012 }, { "epoch": 0.18245454545454545, "grad_norm": 6.1875, "grad_norm_var": 0.21027018229166666, "learning_rate": 0.0001, "loss": 6.1707, "loss/crossentropy": 2.496341645717621, "loss/hidden": 1.650390625, "loss/jsd": 0.0, "loss/logits": 0.2023933231830597, "step": 4014 }, { "epoch": 0.18254545454545454, "grad_norm": 5.84375, "grad_norm_var": 0.222509765625, "learning_rate": 0.0001, "loss": 6.1059, "loss/crossentropy": 2.521575629711151, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.19612868502736092, "step": 4016 }, { "epoch": 0.18263636363636362, "grad_norm": 6.28125, "grad_norm_var": 0.15123697916666667, "learning_rate": 0.0001, "loss": 6.2418, "loss/crossentropy": 2.6182876229286194, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.1976982243359089, "step": 4018 }, { "epoch": 0.18272727272727274, "grad_norm": 6.15625, "grad_norm_var": 0.16190999348958332, "learning_rate": 0.0001, "loss": 6.3951, "loss/crossentropy": 2.609127938747406, "loss/hidden": 1.701171875, "loss/jsd": 0.0, "loss/logits": 0.20848459750413895, "step": 4020 }, { "epoch": 0.18281818181818182, "grad_norm": 6.0625, "grad_norm_var": 0.15188395182291667, "learning_rate": 0.0001, "loss": 6.2134, "loss/crossentropy": 2.5820690989494324, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19945910945534706, "step": 4022 }, { "epoch": 0.1829090909090909, "grad_norm": 5.6875, "grad_norm_var": 0.16448160807291667, "learning_rate": 0.0001, "loss": 5.9246, "loss/crossentropy": 2.427072584629059, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1880320906639099, "step": 4024 }, { "epoch": 0.183, "grad_norm": 6.65625, "grad_norm_var": 0.17252197265625, "learning_rate": 0.0001, "loss": 6.3036, "loss/crossentropy": 2.5972780585289, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.2030511163175106, "step": 4026 }, { "epoch": 0.18309090909090908, "grad_norm": 5.25, "grad_norm_var": 0.163134765625, "learning_rate": 0.0001, "loss": 5.8009, "loss/crossentropy": 2.2896396815776825, "loss/hidden": 1.658203125, "loss/jsd": 0.0, "loss/logits": 0.18530816212296486, "step": 4028 }, { "epoch": 0.1831818181818182, "grad_norm": 6.03125, "grad_norm_var": 0.15904947916666667, "learning_rate": 0.0001, "loss": 6.1523, "loss/crossentropy": 2.5529805123806, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.19918957352638245, "step": 4030 }, { "epoch": 0.18327272727272728, "grad_norm": 6.125, "grad_norm_var": 0.15167643229166666, "learning_rate": 0.0001, "loss": 6.2811, "loss/crossentropy": 2.594207465648651, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20306550338864326, "step": 4032 }, { "epoch": 0.18336363636363637, "grad_norm": 5.5, "grad_norm_var": 0.17642822265625, "learning_rate": 0.0001, "loss": 5.7609, "loss/crossentropy": 2.238765150308609, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.18659057840704918, "step": 4034 }, { "epoch": 0.18345454545454545, "grad_norm": 5.875, "grad_norm_var": 0.15188802083333333, "learning_rate": 0.0001, "loss": 5.8338, "loss/crossentropy": 2.3245806097984314, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.1872471198439598, "step": 4036 }, { "epoch": 0.18354545454545454, "grad_norm": 5.9375, "grad_norm_var": 0.14615885416666666, "learning_rate": 0.0001, "loss": 6.2142, "loss/crossentropy": 2.6170145869255066, "loss/hidden": 1.638671875, "loss/jsd": 0.0, "loss/logits": 0.19585063681006432, "step": 4038 }, { "epoch": 0.18363636363636363, "grad_norm": 6.0625, "grad_norm_var": 0.14518229166666666, "learning_rate": 0.0001, "loss": 6.0713, "loss/crossentropy": 2.5075170397758484, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.19368676468729973, "step": 4040 }, { "epoch": 0.18372727272727274, "grad_norm": 6.0625, "grad_norm_var": 0.12459309895833333, "learning_rate": 0.0001, "loss": 5.8212, "loss/crossentropy": 2.3854914605617523, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18536532297730446, "step": 4042 }, { "epoch": 0.18381818181818183, "grad_norm": 5.71875, "grad_norm_var": 0.10243733723958333, "learning_rate": 0.0001, "loss": 6.0337, "loss/crossentropy": 2.522558033466339, "loss/hidden": 1.638671875, "loss/jsd": 0.0, "loss/logits": 0.18724750354886055, "step": 4044 }, { "epoch": 0.1839090909090909, "grad_norm": 6.125, "grad_norm_var": 0.106103515625, "learning_rate": 0.0001, "loss": 6.1894, "loss/crossentropy": 2.487956464290619, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.20178886875510216, "step": 4046 }, { "epoch": 0.184, "grad_norm": 5.75, "grad_norm_var": 0.09293212890625, "learning_rate": 0.0001, "loss": 5.9878, "loss/crossentropy": 2.4085124135017395, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.19308879226446152, "step": 4048 }, { "epoch": 0.18409090909090908, "grad_norm": 5.6875, "grad_norm_var": 0.07493082682291667, "learning_rate": 0.0001, "loss": 6.4791, "loss/crossentropy": 2.7623943090438843, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.2097601406276226, "step": 4050 }, { "epoch": 0.18418181818181817, "grad_norm": 6.21875, "grad_norm_var": 0.07473551432291667, "learning_rate": 0.0001, "loss": 6.4906, "loss/crossentropy": 2.7390636205673218, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.20776843652129173, "step": 4052 }, { "epoch": 0.18427272727272728, "grad_norm": 6.59375, "grad_norm_var": 0.13625895182291667, "learning_rate": 0.0001, "loss": 5.9856, "loss/crossentropy": 2.4669116735458374, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.18819943815469742, "step": 4054 }, { "epoch": 0.18436363636363637, "grad_norm": 6.59375, "grad_norm_var": 0.18176676432291666, "learning_rate": 0.0001, "loss": 6.2683, "loss/crossentropy": 2.4944888949394226, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.20785382390022278, "step": 4056 }, { "epoch": 0.18445454545454545, "grad_norm": 5.5, "grad_norm_var": 0.16711832682291666, "learning_rate": 0.0001, "loss": 6.1581, "loss/crossentropy": 2.5209715962409973, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.19574791938066483, "step": 4058 }, { "epoch": 0.18454545454545454, "grad_norm": 7.65625, "grad_norm_var": 0.31497395833333336, "learning_rate": 0.0001, "loss": 5.8805, "loss/crossentropy": 2.340275138616562, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.18410297855734825, "step": 4060 }, { "epoch": 0.18463636363636363, "grad_norm": 5.6875, "grad_norm_var": 0.32760009765625, "learning_rate": 0.0001, "loss": 5.8819, "loss/crossentropy": 2.2897616028785706, "loss/hidden": 1.669921875, "loss/jsd": 0.0, "loss/logits": 0.19221877306699753, "step": 4062 }, { "epoch": 0.18472727272727274, "grad_norm": 5.40625, "grad_norm_var": 0.34260660807291665, "learning_rate": 0.0001, "loss": 5.8957, "loss/crossentropy": 2.3606884479522705, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.1859210580587387, "step": 4064 }, { "epoch": 0.18481818181818183, "grad_norm": 6.53125, "grad_norm_var": 0.34911702473958334, "learning_rate": 0.0001, "loss": 5.9915, "loss/crossentropy": 2.4346149563789368, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.1916307620704174, "step": 4066 }, { "epoch": 0.1849090909090909, "grad_norm": 5.25, "grad_norm_var": 0.414306640625, "learning_rate": 0.0001, "loss": 6.0061, "loss/crossentropy": 2.5593570470809937, "loss/hidden": 1.650390625, "loss/jsd": 0.0, "loss/logits": 0.179633941501379, "step": 4068 }, { "epoch": 0.185, "grad_norm": 5.8125, "grad_norm_var": 0.373291015625, "learning_rate": 0.0001, "loss": 6.3052, "loss/crossentropy": 2.655909240245819, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.20223430916666985, "step": 4070 }, { "epoch": 0.18509090909090908, "grad_norm": 6.0, "grad_norm_var": 0.34967447916666666, "learning_rate": 0.0001, "loss": 6.6053, "loss/crossentropy": 2.809503495693207, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.2112242616713047, "step": 4072 }, { "epoch": 0.18518181818181817, "grad_norm": 5.625, "grad_norm_var": 0.36041259765625, "learning_rate": 0.0001, "loss": 6.1287, "loss/crossentropy": 2.4760541319847107, "loss/hidden": 1.666015625, "loss/jsd": 0.0, "loss/logits": 0.1986585482954979, "step": 4074 }, { "epoch": 0.18527272727272728, "grad_norm": 6.5, "grad_norm_var": 0.19817301432291667, "learning_rate": 0.0001, "loss": 6.1768, "loss/crossentropy": 2.5888670682907104, "loss/hidden": 1.662109375, "loss/jsd": 0.0, "loss/logits": 0.19257883355021477, "step": 4076 }, { "epoch": 0.18536363636363637, "grad_norm": 5.59375, "grad_norm_var": 0.20491129557291668, "learning_rate": 0.0001, "loss": 6.1974, "loss/crossentropy": 2.6425928473472595, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19337135180830956, "step": 4078 }, { "epoch": 0.18545454545454546, "grad_norm": 6.59375, "grad_norm_var": 0.20911458333333333, "learning_rate": 0.0001, "loss": 6.0906, "loss/crossentropy": 2.5161978006362915, "loss/hidden": 1.638671875, "loss/jsd": 0.0, "loss/logits": 0.19356858730316162, "step": 4080 }, { "epoch": 0.18554545454545454, "grad_norm": 5.34375, "grad_norm_var": 0.2840494791666667, "learning_rate": 0.0001, "loss": 6.1946, "loss/crossentropy": 2.5922016501426697, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.1991085596382618, "step": 4082 }, { "epoch": 0.18563636363636363, "grad_norm": 5.5625, "grad_norm_var": 0.237744140625, "learning_rate": 0.0001, "loss": 5.9381, "loss/crossentropy": 2.43224173784256, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.1902342028915882, "step": 4084 }, { "epoch": 0.18572727272727274, "grad_norm": 5.34375, "grad_norm_var": 0.24680582682291666, "learning_rate": 0.0001, "loss": 5.9787, "loss/crossentropy": 2.478111684322357, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.18951410055160522, "step": 4086 }, { "epoch": 0.18581818181818183, "grad_norm": 5.5, "grad_norm_var": 0.25347900390625, "learning_rate": 0.0001, "loss": 5.8063, "loss/crossentropy": 2.3012473583221436, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.1878049671649933, "step": 4088 }, { "epoch": 0.1859090909090909, "grad_norm": 5.21875, "grad_norm_var": 0.37662353515625, "learning_rate": 0.0001, "loss": 6.1302, "loss/crossentropy": 2.5270552039146423, "loss/hidden": 1.650390625, "loss/jsd": 0.0, "loss/logits": 0.19527805969119072, "step": 4090 }, { "epoch": 0.186, "grad_norm": 6.59375, "grad_norm_var": 0.38599853515625, "learning_rate": 0.0001, "loss": 6.1724, "loss/crossentropy": 2.5560285449028015, "loss/hidden": 1.642578125, "loss/jsd": 0.0, "loss/logits": 0.19738012924790382, "step": 4092 }, { "epoch": 0.18609090909090908, "grad_norm": 6.0, "grad_norm_var": 0.3676066080729167, "learning_rate": 0.0001, "loss": 6.3755, "loss/crossentropy": 2.644045412540436, "loss/hidden": 1.650390625, "loss/jsd": 0.0, "loss/logits": 0.20810477808117867, "step": 4094 }, { "epoch": 0.18618181818181817, "grad_norm": 6.03125, "grad_norm_var": 0.34329427083333336, "learning_rate": 0.0001, "loss": 6.2601, "loss/crossentropy": 2.65691477060318, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.1970413289964199, "step": 4096 }, { "epoch": 0.18627272727272728, "grad_norm": 5.96875, "grad_norm_var": 0.24976806640625, "learning_rate": 0.0001, "loss": 6.1241, "loss/crossentropy": 2.5261834859848022, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.1961178183555603, "step": 4098 }, { "epoch": 0.18636363636363637, "grad_norm": 6.03125, "grad_norm_var": 0.2482421875, "learning_rate": 0.0001, "loss": 5.9686, "loss/crossentropy": 2.4354472756385803, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19042688608169556, "step": 4100 }, { "epoch": 0.18645454545454546, "grad_norm": 5.9375, "grad_norm_var": 0.22128499348958333, "learning_rate": 0.0001, "loss": 6.525, "loss/crossentropy": 2.7637118101119995, "loss/hidden": 1.650390625, "loss/jsd": 0.0, "loss/logits": 0.21109408885240555, "step": 4102 }, { "epoch": 0.18654545454545454, "grad_norm": 6.03125, "grad_norm_var": 0.21521809895833333, "learning_rate": 0.0001, "loss": 6.216, "loss/crossentropy": 2.6583393812179565, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.1938474290072918, "step": 4104 }, { "epoch": 0.18663636363636363, "grad_norm": 5.6875, "grad_norm_var": 0.06291910807291666, "learning_rate": 0.0001, "loss": 6.3414, "loss/crossentropy": 2.665970742702484, "loss/hidden": 1.654296875, "loss/jsd": 0.0, "loss/logits": 0.2021171934902668, "step": 4106 }, { "epoch": 0.18672727272727271, "grad_norm": 5.71875, "grad_norm_var": 0.032450358072916664, "learning_rate": 0.0001, "loss": 6.0533, "loss/crossentropy": 2.492405354976654, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.1939772330224514, "step": 4108 }, { "epoch": 0.18681818181818183, "grad_norm": 5.6875, "grad_norm_var": 0.12408447265625, "learning_rate": 0.0001, "loss": 6.217, "loss/crossentropy": 2.526017278432846, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.20269539579749107, "step": 4110 }, { "epoch": 0.1869090909090909, "grad_norm": 5.71875, "grad_norm_var": 0.12498372395833333, "learning_rate": 0.0001, "loss": 6.2737, "loss/crossentropy": 2.6644479632377625, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.1976439505815506, "step": 4112 }, { "epoch": 0.187, "grad_norm": 5.8125, "grad_norm_var": 0.12919514973958332, "learning_rate": 0.0001, "loss": 6.1223, "loss/crossentropy": 2.5164064168930054, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.19789652153849602, "step": 4114 }, { "epoch": 0.18709090909090909, "grad_norm": 6.25, "grad_norm_var": 0.13267822265625, "learning_rate": 0.0001, "loss": 6.1979, "loss/crossentropy": 2.5181760787963867, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.20195366442203522, "step": 4116 }, { "epoch": 0.18718181818181817, "grad_norm": 6.125, "grad_norm_var": 0.17532552083333333, "learning_rate": 0.0001, "loss": 6.0342, "loss/crossentropy": 2.441206693649292, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.19250360503792763, "step": 4118 }, { "epoch": 0.18727272727272729, "grad_norm": 6.65625, "grad_norm_var": 0.21106363932291666, "learning_rate": 0.0001, "loss": 6.337, "loss/crossentropy": 2.6568750143051147, "loss/hidden": 1.658203125, "loss/jsd": 0.0, "loss/logits": 0.20218796655535698, "step": 4120 }, { "epoch": 0.18736363636363637, "grad_norm": 5.28125, "grad_norm_var": 0.235400390625, "learning_rate": 0.0001, "loss": 5.777, "loss/crossentropy": 2.296217381954193, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.18401556462049484, "step": 4122 }, { "epoch": 0.18745454545454546, "grad_norm": 5.28125, "grad_norm_var": 0.29466145833333335, "learning_rate": 0.0001, "loss": 6.0533, "loss/crossentropy": 2.559040516614914, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1888827346265316, "step": 4124 }, { "epoch": 0.18754545454545454, "grad_norm": 7.0, "grad_norm_var": 0.28709309895833335, "learning_rate": 0.0001, "loss": 6.1271, "loss/crossentropy": 2.528501868247986, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.1961832270026207, "step": 4126 }, { "epoch": 0.18763636363636363, "grad_norm": 5.90625, "grad_norm_var": 0.2913899739583333, "learning_rate": 0.0001, "loss": 6.2146, "loss/crossentropy": 2.6316065788269043, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.1979430876672268, "step": 4128 }, { "epoch": 0.18772727272727271, "grad_norm": 8.4375, "grad_norm_var": 0.6803670247395833, "learning_rate": 0.0001, "loss": 6.129, "loss/crossentropy": 2.408836454153061, "loss/hidden": 1.685546875, "loss/jsd": 0.0, "loss/logits": 0.20345842093229294, "step": 4130 }, { "epoch": 0.18781818181818183, "grad_norm": 5.125, "grad_norm_var": 0.7495930989583334, "learning_rate": 0.0001, "loss": 5.7632, "loss/crossentropy": 2.290218234062195, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.18050542101264, "step": 4132 }, { "epoch": 0.18790909090909091, "grad_norm": 5.8125, "grad_norm_var": 0.7030232747395834, "learning_rate": 0.0001, "loss": 6.3544, "loss/crossentropy": 2.6521276235580444, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20616335794329643, "step": 4134 }, { "epoch": 0.188, "grad_norm": 5.8125, "grad_norm_var": 0.6884114583333333, "learning_rate": 0.0001, "loss": 6.1119, "loss/crossentropy": 2.467079997062683, "loss/hidden": 1.634765625, "loss/jsd": 0.0, "loss/logits": 0.20100632682442665, "step": 4136 }, { "epoch": 0.1880909090909091, "grad_norm": 5.65625, "grad_norm_var": 0.6569010416666666, "learning_rate": 0.0001, "loss": 6.4131, "loss/crossentropy": 2.721685826778412, "loss/hidden": 1.666015625, "loss/jsd": 0.0, "loss/logits": 0.2025376297533512, "step": 4138 }, { "epoch": 0.18818181818181817, "grad_norm": 5.8125, "grad_norm_var": 0.5940388997395833, "learning_rate": 0.0001, "loss": 5.8998, "loss/crossentropy": 2.3572500348091125, "loss/hidden": 1.669921875, "loss/jsd": 0.0, "loss/logits": 0.18726632371544838, "step": 4140 }, { "epoch": 0.18827272727272729, "grad_norm": 5.96875, "grad_norm_var": 0.5331868489583333, "learning_rate": 0.0001, "loss": 6.6748, "loss/crossentropy": 2.8653038144111633, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.2149326428771019, "step": 4142 }, { "epoch": 0.18836363636363637, "grad_norm": 5.78125, "grad_norm_var": 0.5363118489583333, "learning_rate": 0.0001, "loss": 6.216, "loss/crossentropy": 2.6307814717292786, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.19172293692827225, "step": 4144 }, { "epoch": 0.18845454545454546, "grad_norm": 6.46875, "grad_norm_var": 0.10077718098958334, "learning_rate": 0.0001, "loss": 6.5048, "loss/crossentropy": 2.7400301694869995, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.2112421989440918, "step": 4146 }, { "epoch": 0.18854545454545454, "grad_norm": 5.8125, "grad_norm_var": 0.06730143229166667, "learning_rate": 0.0001, "loss": 6.4436, "loss/crossentropy": 2.685651659965515, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.207433819770813, "step": 4148 }, { "epoch": 0.18863636363636363, "grad_norm": 5.90625, "grad_norm_var": 0.0673828125, "learning_rate": 0.0001, "loss": 6.3903, "loss/crossentropy": 2.682944893836975, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20784952864050865, "step": 4150 }, { "epoch": 0.18872727272727272, "grad_norm": 6.53125, "grad_norm_var": 0.083203125, "learning_rate": 0.0001, "loss": 6.1904, "loss/crossentropy": 2.600552797317505, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.19453564658761024, "step": 4152 }, { "epoch": 0.18881818181818183, "grad_norm": 6.34375, "grad_norm_var": 0.09814046223958334, "learning_rate": 0.0001, "loss": 6.0493, "loss/crossentropy": 2.535381257534027, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.18732647597789764, "step": 4154 }, { "epoch": 0.18890909090909092, "grad_norm": 6.15625, "grad_norm_var": 0.09576822916666666, "learning_rate": 0.0001, "loss": 6.4083, "loss/crossentropy": 2.6400386095046997, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.21003349497914314, "step": 4156 }, { "epoch": 0.189, "grad_norm": 6.40625, "grad_norm_var": 0.11599934895833333, "learning_rate": 0.0001, "loss": 6.0888, "loss/crossentropy": 2.4658954739570618, "loss/hidden": 1.642578125, "loss/jsd": 0.0, "loss/logits": 0.1980346441268921, "step": 4158 }, { "epoch": 0.1890909090909091, "grad_norm": 6.03125, "grad_norm_var": 0.116650390625, "learning_rate": 0.0001, "loss": 5.9736, "loss/crossentropy": 2.431057393550873, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.18960446119308472, "step": 4160 }, { "epoch": 0.18918181818181817, "grad_norm": 6.125, "grad_norm_var": 0.16090087890625, "learning_rate": 0.0001, "loss": 5.6443, "loss/crossentropy": 2.214470684528351, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.1802891083061695, "step": 4162 }, { "epoch": 0.18927272727272726, "grad_norm": 5.59375, "grad_norm_var": 0.16417643229166667, "learning_rate": 0.0001, "loss": 6.1661, "loss/crossentropy": 2.574301838874817, "loss/hidden": 1.630859375, "loss/jsd": 0.0, "loss/logits": 0.19608961790800095, "step": 4164 }, { "epoch": 0.18936363636363637, "grad_norm": 5.34375, "grad_norm_var": 0.19729410807291667, "learning_rate": 0.0001, "loss": 6.3301, "loss/crossentropy": 2.6354116201400757, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.20110563561320305, "step": 4166 }, { "epoch": 0.18945454545454546, "grad_norm": 5.875, "grad_norm_var": 0.17014567057291666, "learning_rate": 0.0001, "loss": 6.2724, "loss/crossentropy": 2.5884240865707397, "loss/hidden": 1.634765625, "loss/jsd": 0.0, "loss/logits": 0.20492419227957726, "step": 4168 }, { "epoch": 0.18954545454545454, "grad_norm": 6.28125, "grad_norm_var": 0.16521809895833334, "learning_rate": 0.0001, "loss": 6.1504, "loss/crossentropy": 2.597361385822296, "loss/hidden": 1.630859375, "loss/jsd": 0.0, "loss/logits": 0.19222233444452286, "step": 4170 }, { "epoch": 0.18963636363636363, "grad_norm": 5.90625, "grad_norm_var": 0.179931640625, "learning_rate": 0.0001, "loss": 6.2952, "loss/crossentropy": 2.5720279216766357, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.20708265155553818, "step": 4172 }, { "epoch": 0.18972727272727272, "grad_norm": 5.875, "grad_norm_var": 0.15441080729166667, "learning_rate": 0.0001, "loss": 6.4355, "loss/crossentropy": 2.7345727682113647, "loss/hidden": 1.654296875, "loss/jsd": 0.0, "loss/logits": 0.20466114580631256, "step": 4174 }, { "epoch": 0.18981818181818183, "grad_norm": 5.71875, "grad_norm_var": 0.13632405598958333, "learning_rate": 0.0001, "loss": 6.3382, "loss/crossentropy": 2.6576154828071594, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20438257232308388, "step": 4176 }, { "epoch": 0.18990909090909092, "grad_norm": 5.9375, "grad_norm_var": 0.09993082682291667, "learning_rate": 0.0001, "loss": 5.913, "loss/crossentropy": 2.4575578570365906, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.185197114944458, "step": 4178 }, { "epoch": 0.19, "grad_norm": 5.5, "grad_norm_var": 0.11959228515625, "learning_rate": 0.0001, "loss": 5.7697, "loss/crossentropy": 2.2832245230674744, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.18458619713783264, "step": 4180 }, { "epoch": 0.1900909090909091, "grad_norm": 5.75, "grad_norm_var": 0.08938802083333333, "learning_rate": 0.0001, "loss": 6.3625, "loss/crossentropy": 2.6514329314231873, "loss/hidden": 1.642578125, "loss/jsd": 0.0, "loss/logits": 0.20684565231204033, "step": 4182 }, { "epoch": 0.19018181818181817, "grad_norm": 5.90625, "grad_norm_var": 0.10662434895833334, "learning_rate": 0.0001, "loss": 6.4631, "loss/crossentropy": 2.796268343925476, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20339706540107727, "step": 4184 }, { "epoch": 0.19027272727272726, "grad_norm": 5.9375, "grad_norm_var": 0.09390869140625, "learning_rate": 0.0001, "loss": 6.4284, "loss/crossentropy": 2.7172377705574036, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.2078324891626835, "step": 4186 }, { "epoch": 0.19036363636363637, "grad_norm": 6.25, "grad_norm_var": 0.09101155598958334, "learning_rate": 0.0001, "loss": 5.6561, "loss/crossentropy": 2.2221826910972595, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.1826501414179802, "step": 4188 }, { "epoch": 0.19045454545454546, "grad_norm": 5.5625, "grad_norm_var": 0.09547119140625, "learning_rate": 0.0001, "loss": 6.2143, "loss/crossentropy": 2.641390383243561, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.19498570263385773, "step": 4190 }, { "epoch": 0.19054545454545455, "grad_norm": 5.84375, "grad_norm_var": 0.141259765625, "learning_rate": 0.0001, "loss": 6.2072, "loss/crossentropy": 2.566817879676819, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20153476297855377, "step": 4192 }, { "epoch": 0.19063636363636363, "grad_norm": 6.09375, "grad_norm_var": 0.13709309895833333, "learning_rate": 0.0001, "loss": 5.8109, "loss/crossentropy": 2.2602714002132416, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.18689971789717674, "step": 4194 }, { "epoch": 0.19072727272727272, "grad_norm": 5.625, "grad_norm_var": 0.14732666015625, "learning_rate": 0.0001, "loss": 6.1046, "loss/crossentropy": 2.553285598754883, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.1904796101152897, "step": 4196 }, { "epoch": 0.19081818181818183, "grad_norm": 5.8125, "grad_norm_var": 0.16861572265625, "learning_rate": 0.0001, "loss": 6.7574, "loss/crossentropy": 2.9394150376319885, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.21304897591471672, "step": 4198 }, { "epoch": 0.19090909090909092, "grad_norm": 5.78125, "grad_norm_var": 0.1671875, "learning_rate": 0.0001, "loss": 6.204, "loss/crossentropy": 2.6457638144493103, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19214873388409615, "step": 4200 }, { "epoch": 0.191, "grad_norm": 5.84375, "grad_norm_var": 0.17086181640625, "learning_rate": 0.0001, "loss": 6.0983, "loss/crossentropy": 2.4780426025390625, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.19757385924458504, "step": 4202 }, { "epoch": 0.1910909090909091, "grad_norm": 5.84375, "grad_norm_var": 0.15115559895833333, "learning_rate": 0.0001, "loss": 6.3745, "loss/crossentropy": 2.5902099609375, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.21553903073072433, "step": 4204 }, { "epoch": 0.19118181818181817, "grad_norm": 5.84375, "grad_norm_var": 0.1384765625, "learning_rate": 0.0001, "loss": 6.4, "loss/crossentropy": 2.7156633734703064, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20359260588884354, "step": 4206 }, { "epoch": 0.19127272727272726, "grad_norm": 6.71875, "grad_norm_var": 0.16536458333333334, "learning_rate": 0.0001, "loss": 5.9871, "loss/crossentropy": 2.4380687475204468, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.1881016455590725, "step": 4208 }, { "epoch": 0.19136363636363637, "grad_norm": 5.71875, "grad_norm_var": 0.17252197265625, "learning_rate": 0.0001, "loss": 6.218, "loss/crossentropy": 2.571974515914917, "loss/hidden": 1.666015625, "loss/jsd": 0.0, "loss/logits": 0.198001466691494, "step": 4210 }, { "epoch": 0.19145454545454546, "grad_norm": 6.0, "grad_norm_var": 0.166650390625, "learning_rate": 0.0001, "loss": 6.4405, "loss/crossentropy": 2.7593639492988586, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20366114005446434, "step": 4212 }, { "epoch": 0.19154545454545455, "grad_norm": 5.4375, "grad_norm_var": 0.166015625, "learning_rate": 0.0001, "loss": 6.3932, "loss/crossentropy": 2.7019941210746765, "loss/hidden": 1.654296875, "loss/jsd": 0.0, "loss/logits": 0.2036937028169632, "step": 4214 }, { "epoch": 0.19163636363636363, "grad_norm": 6.375, "grad_norm_var": 0.20162353515625, "learning_rate": 0.0001, "loss": 5.8902, "loss/crossentropy": 2.395720899105072, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.18616831302642822, "step": 4216 }, { "epoch": 0.19172727272727272, "grad_norm": 5.71875, "grad_norm_var": 0.21678059895833332, "learning_rate": 0.0001, "loss": 6.1022, "loss/crossentropy": 2.560490071773529, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.18932271003723145, "step": 4218 }, { "epoch": 0.1918181818181818, "grad_norm": 6.25, "grad_norm_var": 0.21887613932291666, "learning_rate": 0.0001, "loss": 6.298, "loss/crossentropy": 2.531761407852173, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.2063147984445095, "step": 4220 }, { "epoch": 0.19190909090909092, "grad_norm": 5.6875, "grad_norm_var": 0.22076822916666666, "learning_rate": 0.0001, "loss": 6.2262, "loss/crossentropy": 2.5550498962402344, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20305253192782402, "step": 4222 }, { "epoch": 0.192, "grad_norm": 5.625, "grad_norm_var": 0.16711832682291666, "learning_rate": 0.0001, "loss": 6.0897, "loss/crossentropy": 2.510464310646057, "loss/hidden": 1.638671875, "loss/jsd": 0.0, "loss/logits": 0.1940581426024437, "step": 4224 }, { "epoch": 0.1920909090909091, "grad_norm": 6.75, "grad_norm_var": 0.33215738932291666, "learning_rate": 0.0001, "loss": 6.0562, "loss/crossentropy": 2.403315842151642, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.1965426690876484, "step": 4226 }, { "epoch": 0.19218181818181818, "grad_norm": 6.84375, "grad_norm_var": 0.3680826822916667, "learning_rate": 0.0001, "loss": 5.9223, "loss/crossentropy": 2.427201211452484, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.18739888072013855, "step": 4228 }, { "epoch": 0.19227272727272726, "grad_norm": 6.59375, "grad_norm_var": 0.357275390625, "learning_rate": 0.0001, "loss": 6.0536, "loss/crossentropy": 2.488649547100067, "loss/hidden": 1.650390625, "loss/jsd": 0.0, "loss/logits": 0.1914595365524292, "step": 4230 }, { "epoch": 0.19236363636363638, "grad_norm": 5.96875, "grad_norm_var": 0.31636962890625, "learning_rate": 0.0001, "loss": 6.2797, "loss/crossentropy": 2.6231146454811096, "loss/hidden": 1.634765625, "loss/jsd": 0.0, "loss/logits": 0.20218368247151375, "step": 4232 }, { "epoch": 0.19245454545454546, "grad_norm": 5.375, "grad_norm_var": 0.34306233723958335, "learning_rate": 0.0001, "loss": 5.7734, "loss/crossentropy": 2.298375904560089, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.18520119041204453, "step": 4234 }, { "epoch": 0.19254545454545455, "grad_norm": 5.40625, "grad_norm_var": 0.38160400390625, "learning_rate": 0.0001, "loss": 6.0671, "loss/crossentropy": 2.4879594445228577, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.19600151479244232, "step": 4236 }, { "epoch": 0.19263636363636363, "grad_norm": 5.4375, "grad_norm_var": 0.400390625, "learning_rate": 0.0001, "loss": 5.9156, "loss/crossentropy": 2.3659111857414246, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.19168555364012718, "step": 4238 }, { "epoch": 0.19272727272727272, "grad_norm": 8.1875, "grad_norm_var": 0.6826171875, "learning_rate": 0.0001, "loss": 6.0705, "loss/crossentropy": 2.49290531873703, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19643311947584152, "step": 4240 }, { "epoch": 0.1928181818181818, "grad_norm": 5.90625, "grad_norm_var": 0.54752197265625, "learning_rate": 0.0001, "loss": 6.0137, "loss/crossentropy": 2.415317714214325, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.19538962095975876, "step": 4242 }, { "epoch": 0.19290909090909092, "grad_norm": 6.125, "grad_norm_var": 0.61041259765625, "learning_rate": 0.0001, "loss": 6.1204, "loss/crossentropy": 2.432610422372818, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.20140080899000168, "step": 4244 }, { "epoch": 0.193, "grad_norm": 5.75, "grad_norm_var": 0.6007120768229167, "learning_rate": 0.0001, "loss": 6.3443, "loss/crossentropy": 2.7415496110916138, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.1969904527068138, "step": 4246 }, { "epoch": 0.1930909090909091, "grad_norm": 6.125, "grad_norm_var": 0.5984212239583333, "learning_rate": 0.0001, "loss": 6.4456, "loss/crossentropy": 2.7079776525497437, "loss/hidden": 1.654296875, "loss/jsd": 0.0, "loss/logits": 0.20833633095026016, "step": 4248 }, { "epoch": 0.19318181818181818, "grad_norm": 6.1875, "grad_norm_var": 0.5520670572916667, "learning_rate": 0.0001, "loss": 6.0264, "loss/crossentropy": 2.375360608100891, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.19772014021873474, "step": 4250 }, { "epoch": 0.19327272727272726, "grad_norm": 6.3125, "grad_norm_var": 0.511328125, "learning_rate": 0.0001, "loss": 6.4821, "loss/crossentropy": 2.590950608253479, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.2207537777721882, "step": 4252 }, { "epoch": 0.19336363636363638, "grad_norm": 6.59375, "grad_norm_var": 0.5206868489583333, "learning_rate": 0.0001, "loss": 6.1583, "loss/crossentropy": 2.509304463863373, "loss/hidden": 1.669921875, "loss/jsd": 0.0, "loss/logits": 0.19791222736239433, "step": 4254 }, { "epoch": 0.19345454545454546, "grad_norm": 5.21875, "grad_norm_var": 0.2841796875, "learning_rate": 0.0001, "loss": 5.8923, "loss/crossentropy": 2.4570559561252594, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.18317419290542603, "step": 4256 }, { "epoch": 0.19354545454545455, "grad_norm": 5.875, "grad_norm_var": 0.290478515625, "learning_rate": 0.0001, "loss": 5.6852, "loss/crossentropy": 2.269787847995758, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1809917874634266, "step": 4258 }, { "epoch": 0.19363636363636363, "grad_norm": 5.875, "grad_norm_var": 0.15271809895833333, "learning_rate": 0.0001, "loss": 6.167, "loss/crossentropy": 2.4809215664863586, "loss/hidden": 1.712890625, "loss/jsd": 0.0, "loss/logits": 0.19731810688972473, "step": 4260 }, { "epoch": 0.19372727272727272, "grad_norm": 6.875, "grad_norm_var": 0.21516927083333334, "learning_rate": 0.0001, "loss": 6.0751, "loss/crossentropy": 2.363742858171463, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.19925590977072716, "step": 4262 }, { "epoch": 0.1938181818181818, "grad_norm": 5.75, "grad_norm_var": 0.21985270182291666, "learning_rate": 0.0001, "loss": 6.358, "loss/crossentropy": 2.6845974922180176, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.20269474387168884, "step": 4264 }, { "epoch": 0.19390909090909092, "grad_norm": 6.375, "grad_norm_var": 0.23964436848958334, "learning_rate": 0.0001, "loss": 5.9309, "loss/crossentropy": 2.3986001014709473, "loss/hidden": 1.669921875, "loss/jsd": 0.0, "loss/logits": 0.18623963370919228, "step": 4266 }, { "epoch": 0.194, "grad_norm": 5.59375, "grad_norm_var": 0.27698160807291666, "learning_rate": 0.0001, "loss": 5.5247, "loss/crossentropy": 2.1538237631320953, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.1724427491426468, "step": 4268 }, { "epoch": 0.1940909090909091, "grad_norm": 5.34375, "grad_norm_var": 0.251171875, "learning_rate": 0.0001, "loss": 6.0555, "loss/crossentropy": 2.55034738779068, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.19016480818390846, "step": 4270 }, { "epoch": 0.19418181818181818, "grad_norm": 5.1875, "grad_norm_var": 0.28062744140625, "learning_rate": 0.0001, "loss": 5.704, "loss/crossentropy": 2.35320246219635, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.17589672282338142, "step": 4272 }, { "epoch": 0.19427272727272726, "grad_norm": 5.4375, "grad_norm_var": 0.29010416666666666, "learning_rate": 0.0001, "loss": 5.9298, "loss/crossentropy": 2.4862906336784363, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.18516972288489342, "step": 4274 }, { "epoch": 0.19436363636363638, "grad_norm": 5.46875, "grad_norm_var": 0.28453369140625, "learning_rate": 0.0001, "loss": 6.2438, "loss/crossentropy": 2.6229540705680847, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.20056232064962387, "step": 4276 }, { "epoch": 0.19445454545454546, "grad_norm": 5.59375, "grad_norm_var": 0.11057535807291667, "learning_rate": 0.0001, "loss": 5.7842, "loss/crossentropy": 2.332363784313202, "loss/hidden": 1.638671875, "loss/jsd": 0.0, "loss/logits": 0.18131427094340324, "step": 4278 }, { "epoch": 0.19454545454545455, "grad_norm": 6.34375, "grad_norm_var": 0.17831624348958333, "learning_rate": 0.0001, "loss": 6.4662, "loss/crossentropy": 2.7372747659683228, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.21000108495354652, "step": 4280 }, { "epoch": 0.19463636363636364, "grad_norm": 5.78125, "grad_norm_var": 0.17083333333333334, "learning_rate": 0.0001, "loss": 6.2393, "loss/crossentropy": 2.620232045650482, "loss/hidden": 1.630859375, "loss/jsd": 0.0, "loss/logits": 0.19882166385650635, "step": 4282 }, { "epoch": 0.19472727272727272, "grad_norm": 5.71875, "grad_norm_var": 0.16328125, "learning_rate": 0.0001, "loss": 5.5549, "loss/crossentropy": 2.130387991666794, "loss/hidden": 1.681640625, "loss/jsd": 0.0, "loss/logits": 0.1742863766849041, "step": 4284 }, { "epoch": 0.1948181818181818, "grad_norm": 5.40625, "grad_norm_var": 0.16087239583333332, "learning_rate": 0.0001, "loss": 5.9193, "loss/crossentropy": 2.442248284816742, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.18794139102101326, "step": 4286 }, { "epoch": 0.19490909090909092, "grad_norm": 5.65625, "grad_norm_var": 0.15362955729166666, "learning_rate": 0.0001, "loss": 5.4892, "loss/crossentropy": 2.1811380982398987, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.1683029681444168, "step": 4288 }, { "epoch": 0.195, "grad_norm": 5.71875, "grad_norm_var": 0.15510660807291668, "learning_rate": 0.0001, "loss": 6.0365, "loss/crossentropy": 2.504754424095154, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.19086829200387, "step": 4290 }, { "epoch": 0.1950909090909091, "grad_norm": 5.5625, "grad_norm_var": 0.14607747395833334, "learning_rate": 0.0001, "loss": 5.8517, "loss/crossentropy": 2.3606632351875305, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.18542743474245071, "step": 4292 }, { "epoch": 0.19518181818181818, "grad_norm": 5.5625, "grad_norm_var": 0.14452718098958334, "learning_rate": 0.0001, "loss": 5.9185, "loss/crossentropy": 2.4360159039497375, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.18457433953881264, "step": 4294 }, { "epoch": 0.19527272727272726, "grad_norm": 5.8125, "grad_norm_var": 0.09876302083333334, "learning_rate": 0.0001, "loss": 5.893, "loss/crossentropy": 2.35750475525856, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.18558265268802643, "step": 4296 }, { "epoch": 0.19536363636363635, "grad_norm": 6.375, "grad_norm_var": 0.1068359375, "learning_rate": 0.0001, "loss": 5.9583, "loss/crossentropy": 2.3313439190387726, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.1982380412518978, "step": 4298 }, { "epoch": 0.19545454545454546, "grad_norm": 6.375, "grad_norm_var": 0.11770426432291667, "learning_rate": 0.0001, "loss": 6.0953, "loss/crossentropy": 2.4416235983371735, "loss/hidden": 1.755859375, "loss/jsd": 0.0, "loss/logits": 0.1897774524986744, "step": 4300 }, { "epoch": 0.19554545454545455, "grad_norm": 5.875, "grad_norm_var": 0.10813395182291667, "learning_rate": 0.0001, "loss": 6.2474, "loss/crossentropy": 2.5543683767318726, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.20660914108157158, "step": 4302 }, { "epoch": 0.19563636363636364, "grad_norm": 5.96875, "grad_norm_var": 0.05813395182291667, "learning_rate": 0.0001, "loss": 6.0915, "loss/crossentropy": 2.545836865901947, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.18991868942975998, "step": 4304 }, { "epoch": 0.19572727272727272, "grad_norm": 5.6875, "grad_norm_var": 0.060139973958333336, "learning_rate": 0.0001, "loss": 6.1141, "loss/crossentropy": 2.5710269808769226, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.19200648739933968, "step": 4306 }, { "epoch": 0.1958181818181818, "grad_norm": 6.15625, "grad_norm_var": 0.06412760416666667, "learning_rate": 0.0001, "loss": 6.0211, "loss/crossentropy": 2.512058675289154, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.18762147426605225, "step": 4308 }, { "epoch": 0.19590909090909092, "grad_norm": 5.8125, "grad_norm_var": 0.05963134765625, "learning_rate": 0.0001, "loss": 6.0611, "loss/crossentropy": 2.51479572057724, "loss/hidden": 1.630859375, "loss/jsd": 0.0, "loss/logits": 0.19154006242752075, "step": 4310 }, { "epoch": 0.196, "grad_norm": 5.3125, "grad_norm_var": 0.1021484375, "learning_rate": 0.0001, "loss": 6.0557, "loss/crossentropy": 2.495525896549225, "loss/hidden": 1.654296875, "loss/jsd": 0.0, "loss/logits": 0.19058460742235184, "step": 4312 }, { "epoch": 0.1960909090909091, "grad_norm": 5.71875, "grad_norm_var": 0.09130452473958334, "learning_rate": 0.0001, "loss": 6.3991, "loss/crossentropy": 2.749830722808838, "loss/hidden": 1.630859375, "loss/jsd": 0.0, "loss/logits": 0.20183977112174034, "step": 4314 }, { "epoch": 0.19618181818181818, "grad_norm": 5.28125, "grad_norm_var": 0.09661051432291666, "learning_rate": 0.0001, "loss": 5.6473, "loss/crossentropy": 2.2322949171066284, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.17587679624557495, "step": 4316 }, { "epoch": 0.19627272727272727, "grad_norm": 5.875, "grad_norm_var": 0.11926676432291666, "learning_rate": 0.0001, "loss": 5.9602, "loss/crossentropy": 2.4922374486923218, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.187034472823143, "step": 4318 }, { "epoch": 0.19636363636363635, "grad_norm": 5.71875, "grad_norm_var": 0.55758056640625, "learning_rate": 0.0001, "loss": 6.686, "loss/crossentropy": 2.8471357226371765, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.21435264497995377, "step": 4320 }, { "epoch": 0.19645454545454547, "grad_norm": 6.125, "grad_norm_var": 0.56148681640625, "learning_rate": 0.0001, "loss": 6.1353, "loss/crossentropy": 2.4138529896736145, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.2026178538799286, "step": 4322 }, { "epoch": 0.19654545454545455, "grad_norm": 5.59375, "grad_norm_var": 0.556103515625, "learning_rate": 0.0001, "loss": 6.1913, "loss/crossentropy": 2.585480272769928, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19964591041207314, "step": 4324 }, { "epoch": 0.19663636363636364, "grad_norm": 5.375, "grad_norm_var": 0.5838175455729167, "learning_rate": 0.0001, "loss": 5.6992, "loss/crossentropy": 2.2962841987609863, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1785726323723793, "step": 4326 }, { "epoch": 0.19672727272727272, "grad_norm": 5.46875, "grad_norm_var": 0.5581868489583334, "learning_rate": 0.0001, "loss": 6.3409, "loss/crossentropy": 2.6665703654289246, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.20278627052903175, "step": 4328 }, { "epoch": 0.1968181818181818, "grad_norm": 5.59375, "grad_norm_var": 0.5692667643229167, "learning_rate": 0.0001, "loss": 6.16, "loss/crossentropy": 2.585988700389862, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.19588002189993858, "step": 4330 }, { "epoch": 0.19690909090909092, "grad_norm": 5.9375, "grad_norm_var": 0.5554646809895833, "learning_rate": 0.0001, "loss": 6.3668, "loss/crossentropy": 2.7173324823379517, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20205366238951683, "step": 4332 }, { "epoch": 0.197, "grad_norm": 5.0, "grad_norm_var": 0.5753743489583333, "learning_rate": 0.0001, "loss": 5.836, "loss/crossentropy": 2.274884343147278, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.18736330792307854, "step": 4334 }, { "epoch": 0.1970909090909091, "grad_norm": 5.375, "grad_norm_var": 0.15930582682291666, "learning_rate": 0.0001, "loss": 5.3057, "loss/crossentropy": 2.0664561688899994, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.1631852351129055, "step": 4336 }, { "epoch": 0.19718181818181818, "grad_norm": 6.46875, "grad_norm_var": 0.17450764973958333, "learning_rate": 0.0001, "loss": 6.1171, "loss/crossentropy": 2.4647712111473083, "loss/hidden": 1.634765625, "loss/jsd": 0.0, "loss/logits": 0.20175394788384438, "step": 4338 }, { "epoch": 0.19727272727272727, "grad_norm": 5.53125, "grad_norm_var": 0.18605143229166668, "learning_rate": 0.0001, "loss": 5.9654, "loss/crossentropy": 2.382912188768387, "loss/hidden": 1.654296875, "loss/jsd": 0.0, "loss/logits": 0.1928221695125103, "step": 4340 }, { "epoch": 0.19736363636363635, "grad_norm": 5.5625, "grad_norm_var": 0.17746988932291666, "learning_rate": 0.0001, "loss": 6.0757, "loss/crossentropy": 2.5458348989486694, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19088003784418106, "step": 4342 }, { "epoch": 0.19745454545454547, "grad_norm": 6.03125, "grad_norm_var": 0.16324462890625, "learning_rate": 0.0001, "loss": 6.1534, "loss/crossentropy": 2.5967233777046204, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.19121187180280685, "step": 4344 }, { "epoch": 0.19754545454545455, "grad_norm": 6.03125, "grad_norm_var": 0.16321207682291666, "learning_rate": 0.0001, "loss": 6.1494, "loss/crossentropy": 2.5164793729782104, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.19610871374607086, "step": 4346 }, { "epoch": 0.19763636363636364, "grad_norm": 6.78125, "grad_norm_var": 0.3410441080729167, "learning_rate": 0.0001, "loss": 6.0887, "loss/crossentropy": 2.5497613549232483, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.1892409548163414, "step": 4348 }, { "epoch": 0.19772727272727272, "grad_norm": 5.34375, "grad_norm_var": 0.3136067708333333, "learning_rate": 0.0001, "loss": 5.3821, "loss/crossentropy": 2.085080564022064, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.16642099991440773, "step": 4350 }, { "epoch": 0.1978181818181818, "grad_norm": 5.5625, "grad_norm_var": 0.2613118489583333, "learning_rate": 0.0001, "loss": 5.938, "loss/crossentropy": 2.472169518470764, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18720387667417526, "step": 4352 }, { "epoch": 0.1979090909090909, "grad_norm": 5.65625, "grad_norm_var": 0.24466145833333333, "learning_rate": 0.0001, "loss": 6.2081, "loss/crossentropy": 2.670970916748047, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.19101829454302788, "step": 4354 }, { "epoch": 0.198, "grad_norm": 5.875, "grad_norm_var": 0.23370768229166666, "learning_rate": 0.0001, "loss": 5.9831, "loss/crossentropy": 2.40706604719162, "loss/hidden": 1.642578125, "loss/jsd": 0.0, "loss/logits": 0.1933434158563614, "step": 4356 }, { "epoch": 0.1980909090909091, "grad_norm": 5.875, "grad_norm_var": 0.22489827473958332, "learning_rate": 0.0001, "loss": 6.238, "loss/crossentropy": 2.5505582690238953, "loss/hidden": 1.658203125, "loss/jsd": 0.0, "loss/logits": 0.20292799174785614, "step": 4358 }, { "epoch": 0.19818181818181818, "grad_norm": 6.03125, "grad_norm_var": 0.22675374348958333, "learning_rate": 0.0001, "loss": 5.8159, "loss/crossentropy": 2.2980202734470367, "loss/hidden": 1.642578125, "loss/jsd": 0.0, "loss/logits": 0.1875329166650772, "step": 4360 }, { "epoch": 0.19827272727272727, "grad_norm": 5.84375, "grad_norm_var": 0.22766520182291666, "learning_rate": 0.0001, "loss": 6.0808, "loss/crossentropy": 2.5353772044181824, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.1953665092587471, "step": 4362 }, { "epoch": 0.19836363636363635, "grad_norm": 6.46875, "grad_norm_var": 0.09244791666666667, "learning_rate": 0.0001, "loss": 6.3266, "loss/crossentropy": 2.6478540301322937, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.20049558207392693, "step": 4364 }, { "epoch": 0.19845454545454547, "grad_norm": 5.5, "grad_norm_var": 0.07001546223958334, "learning_rate": 0.0001, "loss": 5.7189, "loss/crossentropy": 2.2566206455230713, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.18216098099946976, "step": 4366 }, { "epoch": 0.19854545454545455, "grad_norm": 5.96875, "grad_norm_var": 0.06458333333333334, "learning_rate": 0.0001, "loss": 6.0137, "loss/crossentropy": 2.346487045288086, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.20031726732850075, "step": 4368 }, { "epoch": 0.19863636363636364, "grad_norm": 5.71875, "grad_norm_var": 0.06868489583333333, "learning_rate": 0.0001, "loss": 5.9681, "loss/crossentropy": 2.4566863775253296, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.19000902399420738, "step": 4370 }, { "epoch": 0.19872727272727272, "grad_norm": 5.5, "grad_norm_var": 0.08036702473958333, "learning_rate": 0.0001, "loss": 6.1765, "loss/crossentropy": 2.5801004767417908, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.19479438289999962, "step": 4372 }, { "epoch": 0.1988181818181818, "grad_norm": 5.53125, "grad_norm_var": 0.10904541015625, "learning_rate": 0.0001, "loss": 6.1165, "loss/crossentropy": 2.5382774472236633, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.1951260343194008, "step": 4374 }, { "epoch": 0.1989090909090909, "grad_norm": 5.78125, "grad_norm_var": 0.11482747395833333, "learning_rate": 0.0001, "loss": 5.9328, "loss/crossentropy": 2.412575989961624, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.1873694472014904, "step": 4376 }, { "epoch": 0.199, "grad_norm": 6.28125, "grad_norm_var": 0.13062744140625, "learning_rate": 0.0001, "loss": 5.9135, "loss/crossentropy": 2.467609852552414, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.18228929117321968, "step": 4378 }, { "epoch": 0.1990909090909091, "grad_norm": 5.90625, "grad_norm_var": 0.09485270182291666, "learning_rate": 0.0001, "loss": 6.3924, "loss/crossentropy": 2.7011473178863525, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.2076004110276699, "step": 4380 }, { "epoch": 0.19918181818181818, "grad_norm": 5.78125, "grad_norm_var": 0.108837890625, "learning_rate": 0.0001, "loss": 6.1799, "loss/crossentropy": 2.52748304605484, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.1978565789759159, "step": 4382 }, { "epoch": 0.19927272727272727, "grad_norm": 6.28125, "grad_norm_var": 0.4930338541666667, "learning_rate": 0.0001, "loss": 5.9535, "loss/crossentropy": 2.3540825247764587, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19627335667610168, "step": 4384 }, { "epoch": 0.19936363636363635, "grad_norm": 5.96875, "grad_norm_var": 0.479931640625, "learning_rate": 0.0001, "loss": 6.2267, "loss/crossentropy": 2.51715886592865, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.20376838743686676, "step": 4386 }, { "epoch": 0.19945454545454547, "grad_norm": 6.6875, "grad_norm_var": 0.4984212239583333, "learning_rate": 0.0001, "loss": 6.3344, "loss/crossentropy": 2.6651312708854675, "loss/hidden": 1.689453125, "loss/jsd": 0.0, "loss/logits": 0.1979827806353569, "step": 4388 }, { "epoch": 0.19954545454545455, "grad_norm": 6.03125, "grad_norm_var": 0.5132120768229167, "learning_rate": 0.0001, "loss": 6.1854, "loss/crossentropy": 2.649400532245636, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19265877828001976, "step": 4390 }, { "epoch": 0.19963636363636364, "grad_norm": 5.375, "grad_norm_var": 0.5340983072916666, "learning_rate": 0.0001, "loss": 5.8609, "loss/crossentropy": 2.439736008644104, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.1807878538966179, "step": 4392 }, { "epoch": 0.19972727272727273, "grad_norm": 5.21875, "grad_norm_var": 0.55211181640625, "learning_rate": 0.0001, "loss": 6.0335, "loss/crossentropy": 2.4905953407287598, "loss/hidden": 1.638671875, "loss/jsd": 0.0, "loss/logits": 0.19042008742690086, "step": 4394 }, { "epoch": 0.1998181818181818, "grad_norm": 5.65625, "grad_norm_var": 0.562109375, "learning_rate": 0.0001, "loss": 6.089, "loss/crossentropy": 2.5648422837257385, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19108513742685318, "step": 4396 }, { "epoch": 0.1999090909090909, "grad_norm": 6.5, "grad_norm_var": 0.5733683268229167, "learning_rate": 0.0001, "loss": 6.1761, "loss/crossentropy": 2.5530128479003906, "loss/hidden": 1.638671875, "loss/jsd": 0.0, "loss/logits": 0.1984456554055214, "step": 4398 }, { "epoch": 0.2, "grad_norm": 6.125, "grad_norm_var": 0.22847900390625, "learning_rate": 0.0001, "loss": 6.3899, "loss/crossentropy": 2.7272539138793945, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.2037619687616825, "step": 4400 }, { "epoch": 0.2000909090909091, "grad_norm": 6.3125, "grad_norm_var": 0.24724934895833334, "learning_rate": 0.0001, "loss": 6.384, "loss/crossentropy": 2.661967933177948, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.20618478581309319, "step": 4402 }, { "epoch": 0.20018181818181818, "grad_norm": 5.84375, "grad_norm_var": 0.20035400390625, "learning_rate": 0.0001, "loss": 6.1117, "loss/crossentropy": 2.549590528011322, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.19019721448421478, "step": 4404 }, { "epoch": 0.20027272727272727, "grad_norm": 6.4375, "grad_norm_var": 0.1697265625, "learning_rate": 0.0001, "loss": 6.2147, "loss/crossentropy": 2.556674003601074, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.19900329038500786, "step": 4406 }, { "epoch": 0.20036363636363635, "grad_norm": 5.375, "grad_norm_var": 0.17157796223958333, "learning_rate": 0.0001, "loss": 5.704, "loss/crossentropy": 2.287501722574234, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.1728997342288494, "step": 4408 }, { "epoch": 0.20045454545454544, "grad_norm": 5.34375, "grad_norm_var": 0.16516927083333333, "learning_rate": 0.0001, "loss": 5.5734, "loss/crossentropy": 2.2333319783210754, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.17111177369952202, "step": 4410 }, { "epoch": 0.20054545454545455, "grad_norm": 5.6875, "grad_norm_var": 0.15636393229166667, "learning_rate": 0.0001, "loss": 6.2686, "loss/crossentropy": 2.6522738933563232, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.20225460454821587, "step": 4412 }, { "epoch": 0.20063636363636364, "grad_norm": 6.09375, "grad_norm_var": 0.14933268229166666, "learning_rate": 0.0001, "loss": 6.4214, "loss/crossentropy": 2.737513840198517, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.2037407085299492, "step": 4414 }, { "epoch": 0.20072727272727273, "grad_norm": 5.6875, "grad_norm_var": 0.13238525390625, "learning_rate": 0.0001, "loss": 6.4066, "loss/crossentropy": 2.7320311665534973, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20300797000527382, "step": 4416 }, { "epoch": 0.2008181818181818, "grad_norm": 6.0, "grad_norm_var": 0.11399332682291667, "learning_rate": 0.0001, "loss": 6.0019, "loss/crossentropy": 2.3904895186424255, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.19473758339881897, "step": 4418 }, { "epoch": 0.2009090909090909, "grad_norm": 5.96875, "grad_norm_var": 0.11456705729166666, "learning_rate": 0.0001, "loss": 6.2456, "loss/crossentropy": 2.651884973049164, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19569531828165054, "step": 4420 }, { "epoch": 0.201, "grad_norm": 5.65625, "grad_norm_var": 0.09612223307291666, "learning_rate": 0.0001, "loss": 6.3823, "loss/crossentropy": 2.753440022468567, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.20214134827256203, "step": 4422 }, { "epoch": 0.2010909090909091, "grad_norm": 5.9375, "grad_norm_var": 0.09739583333333333, "learning_rate": 0.0001, "loss": 6.283, "loss/crossentropy": 2.640276074409485, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.20431145653128624, "step": 4424 }, { "epoch": 0.20118181818181818, "grad_norm": 5.34375, "grad_norm_var": 0.13943684895833333, "learning_rate": 0.0001, "loss": 5.4951, "loss/crossentropy": 2.1797913908958435, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.1719626821577549, "step": 4426 }, { "epoch": 0.20127272727272727, "grad_norm": 5.71875, "grad_norm_var": 0.14225260416666666, "learning_rate": 0.0001, "loss": 6.2378, "loss/crossentropy": 2.5985162258148193, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.20279290527105331, "step": 4428 }, { "epoch": 0.20136363636363636, "grad_norm": 5.375, "grad_norm_var": 0.13072916666666667, "learning_rate": 0.0001, "loss": 5.7657, "loss/crossentropy": 2.43401175737381, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.1735936999320984, "step": 4430 }, { "epoch": 0.20145454545454544, "grad_norm": 5.65625, "grad_norm_var": 0.10831705729166667, "learning_rate": 0.0001, "loss": 5.963, "loss/crossentropy": 2.4338278472423553, "loss/hidden": 1.642578125, "loss/jsd": 0.0, "loss/logits": 0.1886623054742813, "step": 4432 }, { "epoch": 0.20154545454545456, "grad_norm": 5.125, "grad_norm_var": 0.114306640625, "learning_rate": 0.0001, "loss": 5.72, "loss/crossentropy": 2.3729305267333984, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.17611102759838104, "step": 4434 }, { "epoch": 0.20163636363636364, "grad_norm": 5.59375, "grad_norm_var": 0.10162353515625, "learning_rate": 0.0001, "loss": 5.7968, "loss/crossentropy": 2.3809931576251984, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1833774484694004, "step": 4436 }, { "epoch": 0.20172727272727273, "grad_norm": 5.46875, "grad_norm_var": 0.078369140625, "learning_rate": 0.0001, "loss": 5.6092, "loss/crossentropy": 2.2571921646595, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.1740698181092739, "step": 4438 }, { "epoch": 0.2018181818181818, "grad_norm": 6.3125, "grad_norm_var": 0.10500895182291667, "learning_rate": 0.0001, "loss": 6.0608, "loss/crossentropy": 2.4435415267944336, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.19610470160841942, "step": 4440 }, { "epoch": 0.2019090909090909, "grad_norm": 6.0625, "grad_norm_var": 0.093212890625, "learning_rate": 0.0001, "loss": 5.5681, "loss/crossentropy": 2.184315860271454, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.17568789795041084, "step": 4442 }, { "epoch": 0.202, "grad_norm": 5.71875, "grad_norm_var": 0.09068603515625, "learning_rate": 0.0001, "loss": 6.1146, "loss/crossentropy": 2.5933516025543213, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.19177807122468948, "step": 4444 }, { "epoch": 0.2020909090909091, "grad_norm": 5.625, "grad_norm_var": 0.08313802083333334, "learning_rate": 0.0001, "loss": 5.6709, "loss/crossentropy": 2.2699460089206696, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.18013375625014305, "step": 4446 }, { "epoch": 0.20218181818181818, "grad_norm": 5.625, "grad_norm_var": 0.09543863932291667, "learning_rate": 0.0001, "loss": 6.3293, "loss/crossentropy": 2.6872466802597046, "loss/hidden": 1.654296875, "loss/jsd": 0.0, "loss/logits": 0.19877508282661438, "step": 4448 }, { "epoch": 0.20227272727272727, "grad_norm": 5.6875, "grad_norm_var": 0.07675374348958333, "learning_rate": 0.0001, "loss": 5.8328, "loss/crossentropy": 2.3448285162448883, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.18786123394966125, "step": 4450 }, { "epoch": 0.20236363636363636, "grad_norm": 5.84375, "grad_norm_var": 0.07668863932291667, "learning_rate": 0.0001, "loss": 5.6541, "loss/crossentropy": 2.205860137939453, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.18388479948043823, "step": 4452 }, { "epoch": 0.20245454545454544, "grad_norm": 5.625, "grad_norm_var": 0.07980143229166667, "learning_rate": 0.0001, "loss": 6.2467, "loss/crossentropy": 2.5529826283454895, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.20101585239171982, "step": 4454 }, { "epoch": 0.20254545454545456, "grad_norm": 6.34375, "grad_norm_var": 0.23443603515625, "learning_rate": 0.0001, "loss": 6.5563, "loss/crossentropy": 2.77260422706604, "loss/hidden": 1.666015625, "loss/jsd": 0.0, "loss/logits": 0.2117663137614727, "step": 4456 }, { "epoch": 0.20263636363636364, "grad_norm": 5.71875, "grad_norm_var": 0.22942301432291667, "learning_rate": 0.0001, "loss": 5.8447, "loss/crossentropy": 2.3067293763160706, "loss/hidden": 1.642578125, "loss/jsd": 0.0, "loss/logits": 0.1895441599190235, "step": 4458 }, { "epoch": 0.20272727272727273, "grad_norm": 5.84375, "grad_norm_var": 0.21783854166666666, "learning_rate": 0.0001, "loss": 6.1077, "loss/crossentropy": 2.4955081939697266, "loss/hidden": 1.634765625, "loss/jsd": 0.0, "loss/logits": 0.1977463848888874, "step": 4460 }, { "epoch": 0.20281818181818181, "grad_norm": 5.5, "grad_norm_var": 0.219384765625, "learning_rate": 0.0001, "loss": 5.9148, "loss/crossentropy": 2.4538360238075256, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.18359950184822083, "step": 4462 }, { "epoch": 0.2029090909090909, "grad_norm": 5.25, "grad_norm_var": 0.24582926432291666, "learning_rate": 0.0001, "loss": 5.8706, "loss/crossentropy": 2.442430257797241, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.18070486560463905, "step": 4464 }, { "epoch": 0.203, "grad_norm": 5.59375, "grad_norm_var": 0.26534830729166664, "learning_rate": 0.0001, "loss": 5.9934, "loss/crossentropy": 2.5100361108779907, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.1875985600054264, "step": 4466 }, { "epoch": 0.2030909090909091, "grad_norm": 5.78125, "grad_norm_var": 0.26155192057291665, "learning_rate": 0.0001, "loss": 6.273, "loss/crossentropy": 2.716595768928528, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.19568435847759247, "step": 4468 }, { "epoch": 0.20318181818181819, "grad_norm": 6.09375, "grad_norm_var": 0.26417643229166665, "learning_rate": 0.0001, "loss": 5.7698, "loss/crossentropy": 2.3536485731601715, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18341590464115143, "step": 4470 }, { "epoch": 0.20327272727272727, "grad_norm": 6.03125, "grad_norm_var": 0.106103515625, "learning_rate": 0.0001, "loss": 6.319, "loss/crossentropy": 2.6986995935440063, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.19562029466032982, "step": 4472 }, { "epoch": 0.20336363636363636, "grad_norm": 5.90625, "grad_norm_var": 0.13534749348958333, "learning_rate": 0.0001, "loss": 6.2242, "loss/crossentropy": 2.5849894881248474, "loss/hidden": 1.642578125, "loss/jsd": 0.0, "loss/logits": 0.19966596737504005, "step": 4474 }, { "epoch": 0.20345454545454544, "grad_norm": 5.84375, "grad_norm_var": 0.1169921875, "learning_rate": 0.0001, "loss": 6.1292, "loss/crossentropy": 2.5737810730934143, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.19285017251968384, "step": 4476 }, { "epoch": 0.20354545454545456, "grad_norm": 5.46875, "grad_norm_var": 0.14659830729166667, "learning_rate": 0.0001, "loss": 5.8993, "loss/crossentropy": 2.3821602761745453, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.18764927610754967, "step": 4478 }, { "epoch": 0.20363636363636364, "grad_norm": 5.59375, "grad_norm_var": 0.13238525390625, "learning_rate": 0.0001, "loss": 5.7594, "loss/crossentropy": 2.2908918857574463, "loss/hidden": 1.638671875, "loss/jsd": 0.0, "loss/logits": 0.18298261985182762, "step": 4480 }, { "epoch": 0.20372727272727273, "grad_norm": 5.40625, "grad_norm_var": 0.11724853515625, "learning_rate": 0.0001, "loss": 5.8095, "loss/crossentropy": 2.3198224306106567, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.1880258210003376, "step": 4482 }, { "epoch": 0.20381818181818182, "grad_norm": 5.8125, "grad_norm_var": 0.10909830729166667, "learning_rate": 0.0001, "loss": 5.9959, "loss/crossentropy": 2.5053275525569916, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.18694881349802017, "step": 4484 }, { "epoch": 0.2039090909090909, "grad_norm": 5.53125, "grad_norm_var": 0.12014567057291667, "learning_rate": 0.0001, "loss": 6.1643, "loss/crossentropy": 2.6370161175727844, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19140247255563736, "step": 4486 }, { "epoch": 0.204, "grad_norm": 5.625, "grad_norm_var": 0.11916910807291667, "learning_rate": 0.0001, "loss": 6.2575, "loss/crossentropy": 2.7239381074905396, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19242360442876816, "step": 4488 }, { "epoch": 0.2040909090909091, "grad_norm": 8.25, "grad_norm_var": 0.4718709309895833, "learning_rate": 0.0001, "loss": 6.1649, "loss/crossentropy": 2.4463285207748413, "loss/hidden": 1.705078125, "loss/jsd": 0.0, "loss/logits": 0.2013503797352314, "step": 4490 }, { "epoch": 0.2041818181818182, "grad_norm": 5.5, "grad_norm_var": 0.48580729166666664, "learning_rate": 0.0001, "loss": 5.8829, "loss/crossentropy": 2.3740429878234863, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.18936089798808098, "step": 4492 }, { "epoch": 0.20427272727272727, "grad_norm": 5.625, "grad_norm_var": 0.4613566080729167, "learning_rate": 0.0001, "loss": 6.2502, "loss/crossentropy": 2.6500184535980225, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19634953886270523, "step": 4494 }, { "epoch": 0.20436363636363636, "grad_norm": 5.875, "grad_norm_var": 0.44464518229166666, "learning_rate": 0.0001, "loss": 6.0084, "loss/crossentropy": 2.4522774815559387, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1950610987842083, "step": 4496 }, { "epoch": 0.20445454545454544, "grad_norm": 6.40625, "grad_norm_var": 0.4324869791666667, "learning_rate": 0.0001, "loss": 6.1987, "loss/crossentropy": 2.5394410490989685, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.20069589465856552, "step": 4498 }, { "epoch": 0.20454545454545456, "grad_norm": 6.5625, "grad_norm_var": 0.48245035807291664, "learning_rate": 0.0001, "loss": 5.9578, "loss/crossentropy": 2.4233248233795166, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.18977637961506844, "step": 4500 }, { "epoch": 0.20463636363636364, "grad_norm": 6.1875, "grad_norm_var": 0.44451497395833334, "learning_rate": 0.0001, "loss": 6.0328, "loss/crossentropy": 2.404525011777878, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.1987668201327324, "step": 4502 }, { "epoch": 0.20472727272727273, "grad_norm": 6.15625, "grad_norm_var": 0.45709228515625, "learning_rate": 0.0001, "loss": 6.0234, "loss/crossentropy": 2.4940614104270935, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.1908205822110176, "step": 4504 }, { "epoch": 0.20481818181818182, "grad_norm": 5.84375, "grad_norm_var": 0.14283447265625, "learning_rate": 0.0001, "loss": 6.1364, "loss/crossentropy": 2.649518847465515, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.1883370541036129, "step": 4506 }, { "epoch": 0.2049090909090909, "grad_norm": 5.6875, "grad_norm_var": 0.1359375, "learning_rate": 0.0001, "loss": 5.7585, "loss/crossentropy": 2.277000367641449, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.18408457934856415, "step": 4508 }, { "epoch": 0.205, "grad_norm": 5.34375, "grad_norm_var": 0.141796875, "learning_rate": 0.0001, "loss": 5.8155, "loss/crossentropy": 2.3043004274368286, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.18745040521025658, "step": 4510 }, { "epoch": 0.2050909090909091, "grad_norm": 5.84375, "grad_norm_var": 0.14599202473958334, "learning_rate": 0.0001, "loss": 6.3206, "loss/crossentropy": 2.6371302604675293, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.2058470919728279, "step": 4512 }, { "epoch": 0.2051818181818182, "grad_norm": 6.1875, "grad_norm_var": 0.13316650390625, "learning_rate": 0.0001, "loss": 6.2851, "loss/crossentropy": 2.6434935927391052, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.19736270979046822, "step": 4514 }, { "epoch": 0.20527272727272727, "grad_norm": 5.84375, "grad_norm_var": 0.081640625, "learning_rate": 0.0001, "loss": 5.9062, "loss/crossentropy": 2.3967719078063965, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19195592030882835, "step": 4516 }, { "epoch": 0.20536363636363636, "grad_norm": 5.5, "grad_norm_var": 0.07121988932291666, "learning_rate": 0.0001, "loss": 5.8001, "loss/crossentropy": 2.384543001651764, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1833532378077507, "step": 4518 }, { "epoch": 0.20545454545454545, "grad_norm": 5.46875, "grad_norm_var": 0.077978515625, "learning_rate": 0.0001, "loss": 5.8853, "loss/crossentropy": 2.3783714175224304, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.1881878823041916, "step": 4520 }, { "epoch": 0.20554545454545456, "grad_norm": 8.125, "grad_norm_var": 0.42107747395833334, "learning_rate": 0.0001, "loss": 6.0236, "loss/crossentropy": 2.4620060324668884, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.19580597802996635, "step": 4522 }, { "epoch": 0.20563636363636364, "grad_norm": 5.96875, "grad_norm_var": 0.41588134765625, "learning_rate": 0.0001, "loss": 6.1514, "loss/crossentropy": 2.6063619256019592, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.19337321817874908, "step": 4524 }, { "epoch": 0.20572727272727273, "grad_norm": 6.1875, "grad_norm_var": 0.395556640625, "learning_rate": 0.0001, "loss": 5.747, "loss/crossentropy": 2.218610644340515, "loss/hidden": 1.642578125, "loss/jsd": 0.0, "loss/logits": 0.188583854585886, "step": 4526 }, { "epoch": 0.20581818181818182, "grad_norm": 5.8125, "grad_norm_var": 0.39000244140625, "learning_rate": 0.0001, "loss": 6.2962, "loss/crossentropy": 2.7111703753471375, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.1963944025337696, "step": 4528 }, { "epoch": 0.2059090909090909, "grad_norm": 6.5625, "grad_norm_var": 0.40787353515625, "learning_rate": 0.0001, "loss": 6.4456, "loss/crossentropy": 2.806709349155426, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.201580960303545, "step": 4530 }, { "epoch": 0.206, "grad_norm": 5.875, "grad_norm_var": 0.42063802083333335, "learning_rate": 0.0001, "loss": 6.357, "loss/crossentropy": 2.582755923271179, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.21004552394151688, "step": 4532 }, { "epoch": 0.2060909090909091, "grad_norm": 5.4375, "grad_norm_var": 0.4181925455729167, "learning_rate": 0.0001, "loss": 6.1469, "loss/crossentropy": 2.5810599327087402, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.19545258954167366, "step": 4534 }, { "epoch": 0.2061818181818182, "grad_norm": 5.09375, "grad_norm_var": 0.51500244140625, "learning_rate": 0.0001, "loss": 5.3847, "loss/crossentropy": 2.1293925642967224, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.16380960121750832, "step": 4536 }, { "epoch": 0.20627272727272727, "grad_norm": 5.65625, "grad_norm_var": 0.20022379557291667, "learning_rate": 0.0001, "loss": 6.1981, "loss/crossentropy": 2.626780927181244, "loss/hidden": 1.630859375, "loss/jsd": 0.0, "loss/logits": 0.19404346868395805, "step": 4538 }, { "epoch": 0.20636363636363636, "grad_norm": 5.84375, "grad_norm_var": 0.2017578125, "learning_rate": 0.0001, "loss": 5.8031, "loss/crossentropy": 2.2828280925750732, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.1871824935078621, "step": 4540 }, { "epoch": 0.20645454545454545, "grad_norm": 5.4375, "grad_norm_var": 0.2017578125, "learning_rate": 0.0001, "loss": 5.9135, "loss/crossentropy": 2.41127210855484, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1885046474635601, "step": 4542 }, { "epoch": 0.20654545454545453, "grad_norm": 5.96875, "grad_norm_var": 0.19303385416666666, "learning_rate": 0.0001, "loss": 6.0506, "loss/crossentropy": 2.502337336540222, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1931041143834591, "step": 4544 }, { "epoch": 0.20663636363636365, "grad_norm": 5.78125, "grad_norm_var": 0.14967447916666668, "learning_rate": 0.0001, "loss": 6.1663, "loss/crossentropy": 2.568474769592285, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.19748208299279213, "step": 4546 }, { "epoch": 0.20672727272727273, "grad_norm": 6.0, "grad_norm_var": 0.084228515625, "learning_rate": 0.0001, "loss": 5.921, "loss/crossentropy": 2.404313862323761, "loss/hidden": 1.654296875, "loss/jsd": 0.0, "loss/logits": 0.18623997271060944, "step": 4548 }, { "epoch": 0.20681818181818182, "grad_norm": 5.65625, "grad_norm_var": 0.21873372395833332, "learning_rate": 0.0001, "loss": 6.1088, "loss/crossentropy": 2.454119384288788, "loss/hidden": 1.654296875, "loss/jsd": 0.0, "loss/logits": 0.20004121959209442, "step": 4550 }, { "epoch": 0.2069090909090909, "grad_norm": 5.625, "grad_norm_var": 0.15037434895833332, "learning_rate": 0.0001, "loss": 6.1542, "loss/crossentropy": 2.6090381145477295, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.19123512133955956, "step": 4552 }, { "epoch": 0.207, "grad_norm": 5.59375, "grad_norm_var": 0.15128580729166666, "learning_rate": 0.0001, "loss": 6.0349, "loss/crossentropy": 2.5033894181251526, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.18909135088324547, "step": 4554 }, { "epoch": 0.2070909090909091, "grad_norm": 6.4375, "grad_norm_var": 0.16638997395833333, "learning_rate": 0.0001, "loss": 6.1172, "loss/crossentropy": 2.486943542957306, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.1977870874106884, "step": 4556 }, { "epoch": 0.2071818181818182, "grad_norm": 5.0, "grad_norm_var": 0.20813802083333333, "learning_rate": 0.0001, "loss": 5.4416, "loss/crossentropy": 2.0269999504089355, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.17427396774291992, "step": 4558 }, { "epoch": 0.20727272727272728, "grad_norm": 5.9375, "grad_norm_var": 0.23776041666666667, "learning_rate": 0.0001, "loss": 6.0278, "loss/crossentropy": 2.4834102988243103, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19349832832813263, "step": 4560 }, { "epoch": 0.20736363636363636, "grad_norm": 5.65625, "grad_norm_var": 0.24763997395833334, "learning_rate": 0.0001, "loss": 5.8286, "loss/crossentropy": 2.415743261575699, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.1793760508298874, "step": 4562 }, { "epoch": 0.20745454545454545, "grad_norm": 6.09375, "grad_norm_var": 0.28573811848958336, "learning_rate": 0.0001, "loss": 6.2898, "loss/crossentropy": 2.5383092761039734, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.2105003111064434, "step": 4564 }, { "epoch": 0.20754545454545453, "grad_norm": 5.9375, "grad_norm_var": 0.16571858723958333, "learning_rate": 0.0001, "loss": 5.8548, "loss/crossentropy": 2.3919833600521088, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.18553526699543, "step": 4566 }, { "epoch": 0.20763636363636365, "grad_norm": 5.9375, "grad_norm_var": 0.16647135416666667, "learning_rate": 0.0001, "loss": 6.2589, "loss/crossentropy": 2.6629912853240967, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.19962633773684502, "step": 4568 }, { "epoch": 0.20772727272727273, "grad_norm": 6.0625, "grad_norm_var": 0.16638997395833333, "learning_rate": 0.0001, "loss": 6.2008, "loss/crossentropy": 2.5786516666412354, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.19697590172290802, "step": 4570 }, { "epoch": 0.20781818181818182, "grad_norm": 6.15625, "grad_norm_var": 0.8979166666666667, "learning_rate": 0.0001, "loss": 6.3282, "loss/crossentropy": 2.5381128191947937, "loss/hidden": 1.677734375, "loss/jsd": 0.0, "loss/logits": 0.21123846620321274, "step": 4572 }, { "epoch": 0.2079090909090909, "grad_norm": 6.34375, "grad_norm_var": 0.83160400390625, "learning_rate": 0.0001, "loss": 6.0541, "loss/crossentropy": 2.465854525566101, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19514964893460274, "step": 4574 }, { "epoch": 0.208, "grad_norm": 6.09375, "grad_norm_var": 0.7803670247395833, "learning_rate": 0.0001, "loss": 5.9624, "loss/crossentropy": 2.4113656878471375, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.18987130373716354, "step": 4576 }, { "epoch": 0.2080909090909091, "grad_norm": 5.4375, "grad_norm_var": 0.771337890625, "learning_rate": 0.0001, "loss": 5.9342, "loss/crossentropy": 2.422668933868408, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.1886579543352127, "step": 4578 }, { "epoch": 0.2081818181818182, "grad_norm": 6.15625, "grad_norm_var": 0.7676920572916667, "learning_rate": 0.0001, "loss": 6.4872, "loss/crossentropy": 2.761171817779541, "loss/hidden": 1.634765625, "loss/jsd": 0.0, "loss/logits": 0.20912478864192963, "step": 4580 }, { "epoch": 0.20827272727272728, "grad_norm": 6.90625, "grad_norm_var": 0.7925130208333333, "learning_rate": 0.0001, "loss": 5.8136, "loss/crossentropy": 2.296721875667572, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.186846312135458, "step": 4582 }, { "epoch": 0.20836363636363636, "grad_norm": 6.1875, "grad_norm_var": 0.767041015625, "learning_rate": 0.0001, "loss": 5.9247, "loss/crossentropy": 2.328207641839981, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.19519297406077385, "step": 4584 }, { "epoch": 0.20845454545454545, "grad_norm": 5.84375, "grad_norm_var": 0.7841145833333333, "learning_rate": 0.0001, "loss": 6.5259, "loss/crossentropy": 2.71491676568985, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.21508588269352913, "step": 4586 }, { "epoch": 0.20854545454545453, "grad_norm": 5.3125, "grad_norm_var": 0.22259114583333334, "learning_rate": 0.0001, "loss": 5.7499, "loss/crossentropy": 2.3319883048534393, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1800704523921013, "step": 4588 }, { "epoch": 0.20863636363636365, "grad_norm": 5.25, "grad_norm_var": 0.24745686848958334, "learning_rate": 0.0001, "loss": 6.3327, "loss/crossentropy": 2.7266002893447876, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.19986297190189362, "step": 4590 }, { "epoch": 0.20872727272727273, "grad_norm": 6.3125, "grad_norm_var": 1.04732666015625, "learning_rate": 0.0001, "loss": 5.9755, "loss/crossentropy": 2.277674913406372, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.20259835571050644, "step": 4592 }, { "epoch": 0.20881818181818182, "grad_norm": 6.71875, "grad_norm_var": 1.0311808268229166, "learning_rate": 0.0001, "loss": 5.9984, "loss/crossentropy": 2.4623554348945618, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.18876329436898232, "step": 4594 }, { "epoch": 0.2089090909090909, "grad_norm": 5.90625, "grad_norm_var": 1.054150390625, "learning_rate": 0.0001, "loss": 6.3858, "loss/crossentropy": 2.6521191000938416, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.2100824937224388, "step": 4596 }, { "epoch": 0.209, "grad_norm": 5.53125, "grad_norm_var": 1.0494791666666667, "learning_rate": 0.0001, "loss": 6.1158, "loss/crossentropy": 2.616398274898529, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.18861224502325058, "step": 4598 }, { "epoch": 0.20909090909090908, "grad_norm": 5.84375, "grad_norm_var": 1.0644816080729167, "learning_rate": 0.0001, "loss": 6.1878, "loss/crossentropy": 2.6354967951774597, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.19409603625535965, "step": 4600 }, { "epoch": 0.2091818181818182, "grad_norm": 5.46875, "grad_norm_var": 1.0634073893229166, "learning_rate": 0.0001, "loss": 5.7127, "loss/crossentropy": 2.3584187626838684, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1737099140882492, "step": 4602 }, { "epoch": 0.20927272727272728, "grad_norm": 5.09375, "grad_norm_var": 1.1106608072916666, "learning_rate": 0.0001, "loss": 5.6054, "loss/crossentropy": 2.271976888179779, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.17240141704678535, "step": 4604 }, { "epoch": 0.20936363636363636, "grad_norm": 5.5, "grad_norm_var": 1.0964680989583333, "learning_rate": 0.0001, "loss": 5.8002, "loss/crossentropy": 2.3647013306617737, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.18378885462880135, "step": 4606 }, { "epoch": 0.20945454545454545, "grad_norm": 6.21875, "grad_norm_var": 0.19081624348958334, "learning_rate": 0.0001, "loss": 6.1764, "loss/crossentropy": 2.654026687145233, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19247223064303398, "step": 4608 }, { "epoch": 0.20954545454545453, "grad_norm": 6.1875, "grad_norm_var": 0.16415608723958333, "learning_rate": 0.0001, "loss": 6.0026, "loss/crossentropy": 2.4470973312854767, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.1932430863380432, "step": 4610 }, { "epoch": 0.20963636363636365, "grad_norm": 5.8125, "grad_norm_var": 0.17245686848958333, "learning_rate": 0.0001, "loss": 6.1611, "loss/crossentropy": 2.4992928504943848, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.20153073593974113, "step": 4612 }, { "epoch": 0.20972727272727273, "grad_norm": 5.53125, "grad_norm_var": 0.17138264973958334, "learning_rate": 0.0001, "loss": 5.7653, "loss/crossentropy": 2.3488845229148865, "loss/hidden": 1.638671875, "loss/jsd": 0.0, "loss/logits": 0.17777461931109428, "step": 4614 }, { "epoch": 0.20981818181818182, "grad_norm": 5.90625, "grad_norm_var": 0.19425455729166666, "learning_rate": 0.0001, "loss": 6.0807, "loss/crossentropy": 2.5641199350357056, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.1895497627556324, "step": 4616 }, { "epoch": 0.2099090909090909, "grad_norm": 5.75, "grad_norm_var": 0.14211832682291667, "learning_rate": 0.0001, "loss": 6.0532, "loss/crossentropy": 2.4758607745170593, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.1921125277876854, "step": 4618 }, { "epoch": 0.21, "grad_norm": 6.4375, "grad_norm_var": 0.15402018229166667, "learning_rate": 0.0001, "loss": 5.8335, "loss/crossentropy": 2.387103885412216, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.1835113763809204, "step": 4620 }, { "epoch": 0.21009090909090908, "grad_norm": 12.0625, "grad_norm_var": 2.570015462239583, "learning_rate": 0.0001, "loss": 6.1369, "loss/crossentropy": 2.5087013840675354, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.19837168604135513, "step": 4622 }, { "epoch": 0.2101818181818182, "grad_norm": 6.15625, "grad_norm_var": 2.587744140625, "learning_rate": 0.0001, "loss": 6.222, "loss/crossentropy": 2.534659743309021, "loss/hidden": 1.666015625, "loss/jsd": 0.0, "loss/logits": 0.2021297812461853, "step": 4624 }, { "epoch": 0.21027272727272728, "grad_norm": 5.875, "grad_norm_var": 2.6197265625, "learning_rate": 0.0001, "loss": 6.293, "loss/crossentropy": 2.694242000579834, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.19737646728754044, "step": 4626 }, { "epoch": 0.21036363636363636, "grad_norm": 6.09375, "grad_norm_var": 2.61314697265625, "learning_rate": 0.0001, "loss": 6.4466, "loss/crossentropy": 2.7291194796562195, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.20709915831685066, "step": 4628 }, { "epoch": 0.21045454545454545, "grad_norm": 5.875, "grad_norm_var": 3.434619140625, "learning_rate": 0.0001, "loss": 6.2441, "loss/crossentropy": 2.604355573654175, "loss/hidden": 1.634765625, "loss/jsd": 0.0, "loss/logits": 0.20049339905381203, "step": 4630 }, { "epoch": 0.21054545454545454, "grad_norm": 5.5625, "grad_norm_var": 3.380729166666667, "learning_rate": 0.0001, "loss": 5.734, "loss/crossentropy": 2.2966145873069763, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.18182281032204628, "step": 4632 }, { "epoch": 0.21063636363636365, "grad_norm": 6.90625, "grad_norm_var": 3.3704264322916666, "learning_rate": 0.0001, "loss": 6.2248, "loss/crossentropy": 2.5736573338508606, "loss/hidden": 1.658203125, "loss/jsd": 0.0, "loss/logits": 0.19929535314440727, "step": 4634 }, { "epoch": 0.21072727272727274, "grad_norm": 7.25, "grad_norm_var": 3.2204386393229165, "learning_rate": 0.0001, "loss": 6.3608, "loss/crossentropy": 2.701739013195038, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.20438171550631523, "step": 4636 }, { "epoch": 0.21081818181818182, "grad_norm": 5.71875, "grad_norm_var": 1.263134765625, "learning_rate": 0.0001, "loss": 6.2173, "loss/crossentropy": 2.5553082823753357, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20135819911956787, "step": 4638 }, { "epoch": 0.2109090909090909, "grad_norm": 6.96875, "grad_norm_var": 1.3771443684895834, "learning_rate": 0.0001, "loss": 5.7859, "loss/crossentropy": 2.3317081332206726, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.18389102071523666, "step": 4640 }, { "epoch": 0.211, "grad_norm": 7.4375, "grad_norm_var": 1.4676920572916667, "learning_rate": 0.0001, "loss": 6.2471, "loss/crossentropy": 2.625073552131653, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.20106522738933563, "step": 4642 }, { "epoch": 0.21109090909090908, "grad_norm": 5.625, "grad_norm_var": 1.5253214518229166, "learning_rate": 0.0001, "loss": 5.8861, "loss/crossentropy": 2.437596917152405, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.18215858936309814, "step": 4644 }, { "epoch": 0.2111818181818182, "grad_norm": 5.84375, "grad_norm_var": 0.5598795572916667, "learning_rate": 0.0001, "loss": 5.9109, "loss/crossentropy": 2.4546636939048767, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.18449357151985168, "step": 4646 }, { "epoch": 0.21127272727272728, "grad_norm": 9.375, "grad_norm_var": 1.275634765625, "learning_rate": 0.0001, "loss": 6.0841, "loss/crossentropy": 2.440632611513138, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.19969618320465088, "step": 4648 }, { "epoch": 0.21136363636363636, "grad_norm": 6.125, "grad_norm_var": 1.1973795572916666, "learning_rate": 0.0001, "loss": 6.2315, "loss/crossentropy": 2.5803001523017883, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.19793058931827545, "step": 4650 }, { "epoch": 0.21145454545454545, "grad_norm": 5.78125, "grad_norm_var": 1.1404256184895833, "learning_rate": 0.0001, "loss": 6.0485, "loss/crossentropy": 2.634104549884796, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.18011298775672913, "step": 4652 }, { "epoch": 0.21154545454545454, "grad_norm": 5.65625, "grad_norm_var": 1.142431640625, "learning_rate": 0.0001, "loss": 6.1961, "loss/crossentropy": 2.6161237955093384, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.19608177617192268, "step": 4654 }, { "epoch": 0.21163636363636365, "grad_norm": 6.09375, "grad_norm_var": 1.0132649739583333, "learning_rate": 0.0001, "loss": 6.0034, "loss/crossentropy": 2.45170921087265, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.19364505261182785, "step": 4656 }, { "epoch": 0.21172727272727274, "grad_norm": 5.5625, "grad_norm_var": 0.8807902018229167, "learning_rate": 0.0001, "loss": 6.2673, "loss/crossentropy": 2.6537999510765076, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.2013901025056839, "step": 4658 }, { "epoch": 0.21181818181818182, "grad_norm": 5.8125, "grad_norm_var": 0.8624308268229167, "learning_rate": 0.0001, "loss": 6.2166, "loss/crossentropy": 2.705284893512726, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.19273525476455688, "step": 4660 }, { "epoch": 0.2119090909090909, "grad_norm": 6.09375, "grad_norm_var": 0.8571614583333333, "learning_rate": 0.0001, "loss": 5.9163, "loss/crossentropy": 2.416284888982773, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.1878916174173355, "step": 4662 }, { "epoch": 0.212, "grad_norm": 5.75, "grad_norm_var": 0.057666015625, "learning_rate": 0.0001, "loss": 5.9422, "loss/crossentropy": 2.4301450848579407, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.18675613403320312, "step": 4664 }, { "epoch": 0.21209090909090908, "grad_norm": 6.28125, "grad_norm_var": 0.06031494140625, "learning_rate": 0.0001, "loss": 6.1074, "loss/crossentropy": 2.519901216030121, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.19762147963047028, "step": 4666 }, { "epoch": 0.2121818181818182, "grad_norm": 5.4375, "grad_norm_var": 0.18456624348958334, "learning_rate": 0.0001, "loss": 6.0379, "loss/crossentropy": 2.506149262189865, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.18989015743136406, "step": 4668 }, { "epoch": 0.21227272727272728, "grad_norm": 5.40625, "grad_norm_var": 0.19542643229166667, "learning_rate": 0.0001, "loss": 5.8134, "loss/crossentropy": 2.419213116168976, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18160250037908554, "step": 4670 }, { "epoch": 0.21236363636363637, "grad_norm": 10.6875, "grad_norm_var": 1.6805338541666666, "learning_rate": 0.0001, "loss": 6.0061, "loss/crossentropy": 2.484161138534546, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.18891028314828873, "step": 4672 }, { "epoch": 0.21245454545454545, "grad_norm": 5.5, "grad_norm_var": 1.6962076822916667, "learning_rate": 0.0001, "loss": 5.9332, "loss/crossentropy": 2.3969732522964478, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19151683896780014, "step": 4674 }, { "epoch": 0.21254545454545454, "grad_norm": 5.6875, "grad_norm_var": 1.7385050455729167, "learning_rate": 0.0001, "loss": 5.8067, "loss/crossentropy": 2.3641874194145203, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.1846815012395382, "step": 4676 }, { "epoch": 0.21263636363636362, "grad_norm": 6.0625, "grad_norm_var": 1.7059529622395833, "learning_rate": 0.0001, "loss": 6.5442, "loss/crossentropy": 2.7788187861442566, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.21188656613230705, "step": 4678 }, { "epoch": 0.21272727272727274, "grad_norm": 5.5, "grad_norm_var": 1.7410115559895833, "learning_rate": 0.0001, "loss": 6.0816, "loss/crossentropy": 2.5314322113990784, "loss/hidden": 1.634765625, "loss/jsd": 0.0, "loss/logits": 0.1915433555841446, "step": 4680 }, { "epoch": 0.21281818181818182, "grad_norm": 6.375, "grad_norm_var": 1.7276326497395833, "learning_rate": 0.0001, "loss": 6.1303, "loss/crossentropy": 2.5915746092796326, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.1919567584991455, "step": 4682 }, { "epoch": 0.2129090909090909, "grad_norm": 5.34375, "grad_norm_var": 1.651025390625, "learning_rate": 0.0001, "loss": 5.4432, "loss/crossentropy": 2.0801709294319153, "loss/hidden": 1.658203125, "loss/jsd": 0.0, "loss/logits": 0.17048702016472816, "step": 4684 }, { "epoch": 0.213, "grad_norm": 5.71875, "grad_norm_var": 1.6391927083333333, "learning_rate": 0.0001, "loss": 6.117, "loss/crossentropy": 2.600594699382782, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19187498465180397, "step": 4686 }, { "epoch": 0.21309090909090908, "grad_norm": 5.25, "grad_norm_var": 0.08817952473958333, "learning_rate": 0.0001, "loss": 5.9649, "loss/crossentropy": 2.519342064857483, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.18478530272841454, "step": 4688 }, { "epoch": 0.2131818181818182, "grad_norm": 5.625, "grad_norm_var": 0.09933268229166667, "learning_rate": 0.0001, "loss": 5.9977, "loss/crossentropy": 2.4627619087696075, "loss/hidden": 1.634765625, "loss/jsd": 0.0, "loss/logits": 0.19002041220664978, "step": 4690 }, { "epoch": 0.21327272727272728, "grad_norm": 5.6875, "grad_norm_var": 0.09869791666666666, "learning_rate": 0.0001, "loss": 6.1735, "loss/crossentropy": 2.6620839834213257, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.19157173112034798, "step": 4692 }, { "epoch": 0.21336363636363637, "grad_norm": 5.84375, "grad_norm_var": 0.18756510416666666, "learning_rate": 0.0001, "loss": 6.4972, "loss/crossentropy": 2.709022104740143, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.21006471291184425, "step": 4694 }, { "epoch": 0.21345454545454545, "grad_norm": 5.625, "grad_norm_var": 0.246728515625, "learning_rate": 0.0001, "loss": 5.8262, "loss/crossentropy": 2.356051743030548, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.1833433099091053, "step": 4696 }, { "epoch": 0.21354545454545454, "grad_norm": 10.9375, "grad_norm_var": 1.8627237955729166, "learning_rate": 0.0001, "loss": 6.2864, "loss/crossentropy": 2.698318600654602, "loss/hidden": 1.638671875, "loss/jsd": 0.0, "loss/logits": 0.19494185596704483, "step": 4698 }, { "epoch": 0.21363636363636362, "grad_norm": 5.84375, "grad_norm_var": 1.8437459309895834, "learning_rate": 0.0001, "loss": 5.6492, "loss/crossentropy": 2.3070772290229797, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.17385901510715485, "step": 4700 }, { "epoch": 0.21372727272727274, "grad_norm": 6.09375, "grad_norm_var": 1.83531494140625, "learning_rate": 0.0001, "loss": 6.2458, "loss/crossentropy": 2.665278375148773, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.19340625032782555, "step": 4702 }, { "epoch": 0.21381818181818182, "grad_norm": 5.875, "grad_norm_var": 1.7609375, "learning_rate": 0.0001, "loss": 6.326, "loss/crossentropy": 2.63917076587677, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20422593131661415, "step": 4704 }, { "epoch": 0.2139090909090909, "grad_norm": 5.46875, "grad_norm_var": 1.7818359375, "learning_rate": 0.0001, "loss": 6.2924, "loss/crossentropy": 2.7308883666992188, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.19345589354634285, "step": 4706 }, { "epoch": 0.214, "grad_norm": 5.625, "grad_norm_var": 1.8087076822916666, "learning_rate": 0.0001, "loss": 6.0232, "loss/crossentropy": 2.5147290229797363, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.18853742629289627, "step": 4708 }, { "epoch": 0.21409090909090908, "grad_norm": 5.34375, "grad_norm_var": 1.80787353515625, "learning_rate": 0.0001, "loss": 6.1087, "loss/crossentropy": 2.5763426423072815, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19151876121759415, "step": 4710 }, { "epoch": 0.2141818181818182, "grad_norm": 5.8125, "grad_norm_var": 1.76724853515625, "learning_rate": 0.0001, "loss": 6.1826, "loss/crossentropy": 2.652659058570862, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.19146888330578804, "step": 4712 }, { "epoch": 0.21427272727272728, "grad_norm": 5.625, "grad_norm_var": 0.07024739583333334, "learning_rate": 0.0001, "loss": 6.1882, "loss/crossentropy": 2.649291455745697, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.19354193657636642, "step": 4714 }, { "epoch": 0.21436363636363637, "grad_norm": 10.625, "grad_norm_var": 1.5547810872395833, "learning_rate": 0.0001, "loss": 5.9302, "loss/crossentropy": 2.4074273109436035, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.18997179716825485, "step": 4716 }, { "epoch": 0.21445454545454545, "grad_norm": 5.65625, "grad_norm_var": 1.54765625, "learning_rate": 0.0001, "loss": 6.0022, "loss/crossentropy": 2.422228693962097, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.1956934817135334, "step": 4718 }, { "epoch": 0.21454545454545454, "grad_norm": 5.75, "grad_norm_var": 1.5585774739583333, "learning_rate": 0.0001, "loss": 6.0768, "loss/crossentropy": 2.5125085711479187, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.19197820127010345, "step": 4720 }, { "epoch": 0.21463636363636363, "grad_norm": 5.53125, "grad_norm_var": 1.595166015625, "learning_rate": 0.0001, "loss": 5.9594, "loss/crossentropy": 2.450977623462677, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.18990011885762215, "step": 4722 }, { "epoch": 0.21472727272727274, "grad_norm": 6.71875, "grad_norm_var": 1.6069295247395834, "learning_rate": 0.0001, "loss": 6.3508, "loss/crossentropy": 2.6103617548942566, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.20802495628595352, "step": 4724 }, { "epoch": 0.21481818181818182, "grad_norm": 5.46875, "grad_norm_var": 1.5953084309895833, "learning_rate": 0.0001, "loss": 6.3017, "loss/crossentropy": 2.6341435313224792, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.20483721047639847, "step": 4726 }, { "epoch": 0.2149090909090909, "grad_norm": 5.59375, "grad_norm_var": 1.6594889322916666, "learning_rate": 0.0001, "loss": 6.013, "loss/crossentropy": 2.514268159866333, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.18932294845581055, "step": 4728 }, { "epoch": 0.215, "grad_norm": 5.84375, "grad_norm_var": 1.6538045247395834, "learning_rate": 0.0001, "loss": 6.158, "loss/crossentropy": 2.6183154582977295, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.19400463998317719, "step": 4730 }, { "epoch": 0.21509090909090908, "grad_norm": 7.28125, "grad_norm_var": 0.3209920247395833, "learning_rate": 0.0001, "loss": 6.4886, "loss/crossentropy": 2.7681784629821777, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.2060268670320511, "step": 4732 }, { "epoch": 0.21518181818181817, "grad_norm": 5.78125, "grad_norm_var": 0.35608317057291666, "learning_rate": 0.0001, "loss": 6.2315, "loss/crossentropy": 2.6260945796966553, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19921046122908592, "step": 4734 }, { "epoch": 0.21527272727272728, "grad_norm": 5.8125, "grad_norm_var": 0.954931640625, "learning_rate": 0.0001, "loss": 6.0755, "loss/crossentropy": 2.4885366559028625, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.19423916935920715, "step": 4736 }, { "epoch": 0.21536363636363637, "grad_norm": 6.34375, "grad_norm_var": 0.8897135416666667, "learning_rate": 0.0001, "loss": 6.189, "loss/crossentropy": 2.5969467759132385, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19787875190377235, "step": 4738 }, { "epoch": 0.21545454545454545, "grad_norm": 7.28125, "grad_norm_var": 0.9762858072916667, "learning_rate": 0.0001, "loss": 6.2062, "loss/crossentropy": 2.6487796902656555, "loss/hidden": 1.673828125, "loss/jsd": 0.0, "loss/logits": 0.18835725635290146, "step": 4740 }, { "epoch": 0.21554545454545454, "grad_norm": 5.65625, "grad_norm_var": 0.9799112955729167, "learning_rate": 0.0001, "loss": 6.2122, "loss/crossentropy": 2.662469983100891, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19325551018118858, "step": 4742 }, { "epoch": 0.21563636363636363, "grad_norm": 6.1875, "grad_norm_var": 0.9350545247395833, "learning_rate": 0.0001, "loss": 6.3114, "loss/crossentropy": 2.7531410455703735, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.19546981528401375, "step": 4744 }, { "epoch": 0.21572727272727274, "grad_norm": 5.96875, "grad_norm_var": 0.9057576497395833, "learning_rate": 0.0001, "loss": 6.3887, "loss/crossentropy": 2.7703250646591187, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.1977788247168064, "step": 4746 }, { "epoch": 0.21581818181818183, "grad_norm": 5.71875, "grad_norm_var": 0.8777303059895833, "learning_rate": 0.0001, "loss": 6.0309, "loss/crossentropy": 2.565134346485138, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.18329393863677979, "step": 4748 }, { "epoch": 0.2159090909090909, "grad_norm": 5.8125, "grad_norm_var": 0.857275390625, "learning_rate": 0.0001, "loss": 6.0695, "loss/crossentropy": 2.5733712911605835, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1890621967613697, "step": 4750 }, { "epoch": 0.216, "grad_norm": 6.3125, "grad_norm_var": 0.23370768229166666, "learning_rate": 0.0001, "loss": 5.7219, "loss/crossentropy": 2.330030769109726, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.1780538260936737, "step": 4752 }, { "epoch": 0.21609090909090908, "grad_norm": 6.75, "grad_norm_var": 0.26444905598958335, "learning_rate": 0.0001, "loss": 5.499, "loss/crossentropy": 2.1404694616794586, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.17218079045414925, "step": 4754 }, { "epoch": 0.21618181818181817, "grad_norm": 6.0625, "grad_norm_var": 0.13294270833333333, "learning_rate": 0.0001, "loss": 6.2527, "loss/crossentropy": 2.617996037006378, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19979356229305267, "step": 4756 }, { "epoch": 0.21627272727272728, "grad_norm": 5.03125, "grad_norm_var": 0.18961181640625, "learning_rate": 0.0001, "loss": 5.982, "loss/crossentropy": 2.484579622745514, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.1888076364994049, "step": 4758 }, { "epoch": 0.21636363636363637, "grad_norm": 5.46875, "grad_norm_var": 0.19361572265625, "learning_rate": 0.0001, "loss": 5.5912, "loss/crossentropy": 2.19041845202446, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.18030979856848717, "step": 4760 }, { "epoch": 0.21645454545454546, "grad_norm": 6.0625, "grad_norm_var": 0.20777587890625, "learning_rate": 0.0001, "loss": 6.224, "loss/crossentropy": 2.6810399889945984, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.19316530227661133, "step": 4762 }, { "epoch": 0.21654545454545454, "grad_norm": 5.25, "grad_norm_var": 0.21910400390625, "learning_rate": 0.0001, "loss": 6.324, "loss/crossentropy": 2.7852479815483093, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1972389593720436, "step": 4764 }, { "epoch": 0.21663636363636363, "grad_norm": 5.5625, "grad_norm_var": 0.259619140625, "learning_rate": 0.0001, "loss": 5.8357, "loss/crossentropy": 2.4334307610988617, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.18182388693094254, "step": 4766 }, { "epoch": 0.21672727272727274, "grad_norm": 5.8125, "grad_norm_var": 0.23821207682291667, "learning_rate": 0.0001, "loss": 6.0811, "loss/crossentropy": 2.6096954941749573, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.1862000711262226, "step": 4768 }, { "epoch": 0.21681818181818183, "grad_norm": 5.71875, "grad_norm_var": 0.15930582682291666, "learning_rate": 0.0001, "loss": 5.9527, "loss/crossentropy": 2.4253385066986084, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.1908184476196766, "step": 4770 }, { "epoch": 0.2169090909090909, "grad_norm": 5.875, "grad_norm_var": 0.21230061848958334, "learning_rate": 0.0001, "loss": 6.5458, "loss/crossentropy": 2.771722137928009, "loss/hidden": 1.634765625, "loss/jsd": 0.0, "loss/logits": 0.21392735093832016, "step": 4772 }, { "epoch": 0.217, "grad_norm": 5.25, "grad_norm_var": 0.19322509765625, "learning_rate": 0.0001, "loss": 6.0897, "loss/crossentropy": 2.607056677341461, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.18850333988666534, "step": 4774 }, { "epoch": 0.21709090909090908, "grad_norm": 5.03125, "grad_norm_var": 0.20530192057291666, "learning_rate": 0.0001, "loss": 5.9297, "loss/crossentropy": 2.4722664952278137, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.18421650305390358, "step": 4776 }, { "epoch": 0.21718181818181817, "grad_norm": 8.4375, "grad_norm_var": 0.6743448893229167, "learning_rate": 0.0001, "loss": 6.1392, "loss/crossentropy": 2.6599085927009583, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18933751434087753, "step": 4778 }, { "epoch": 0.21727272727272728, "grad_norm": 5.78125, "grad_norm_var": 1.02115478515625, "learning_rate": 0.0001, "loss": 5.8775, "loss/crossentropy": 2.417559117078781, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.18368485942482948, "step": 4780 }, { "epoch": 0.21736363636363637, "grad_norm": 5.46875, "grad_norm_var": 0.96314697265625, "learning_rate": 0.0001, "loss": 5.8435, "loss/crossentropy": 2.381256580352783, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.18528784438967705, "step": 4782 }, { "epoch": 0.21745454545454546, "grad_norm": 5.6875, "grad_norm_var": 0.9538045247395833, "learning_rate": 0.0001, "loss": 5.8147, "loss/crossentropy": 2.3648226857185364, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.18327056989073753, "step": 4784 }, { "epoch": 0.21754545454545454, "grad_norm": 6.0, "grad_norm_var": 0.9421223958333333, "learning_rate": 0.0001, "loss": 6.2661, "loss/crossentropy": 2.5808921456336975, "loss/hidden": 1.677734375, "loss/jsd": 0.0, "loss/logits": 0.20074767619371414, "step": 4786 }, { "epoch": 0.21763636363636363, "grad_norm": 5.15625, "grad_norm_var": 0.96197509765625, "learning_rate": 0.0001, "loss": 5.9062, "loss/crossentropy": 2.4479114413261414, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.1839195415377617, "step": 4788 }, { "epoch": 0.2177272727272727, "grad_norm": 5.625, "grad_norm_var": 0.9196451822916667, "learning_rate": 0.0001, "loss": 5.8645, "loss/crossentropy": 2.4669910073280334, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.18252258747816086, "step": 4790 }, { "epoch": 0.21781818181818183, "grad_norm": 5.96875, "grad_norm_var": 0.86343994140625, "learning_rate": 0.0001, "loss": 6.0381, "loss/crossentropy": 2.4884058833122253, "loss/hidden": 1.634765625, "loss/jsd": 0.0, "loss/logits": 0.19149469956755638, "step": 4792 }, { "epoch": 0.2179090909090909, "grad_norm": 5.21875, "grad_norm_var": 0.474853515625, "learning_rate": 0.0001, "loss": 6.1316, "loss/crossentropy": 2.5568495988845825, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.19556130841374397, "step": 4794 }, { "epoch": 0.218, "grad_norm": 6.75, "grad_norm_var": 0.15074462890625, "learning_rate": 0.0001, "loss": 6.2327, "loss/crossentropy": 2.631493091583252, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.19684035703539848, "step": 4796 }, { "epoch": 0.21809090909090909, "grad_norm": 5.78125, "grad_norm_var": 0.14433186848958332, "learning_rate": 0.0001, "loss": 5.8062, "loss/crossentropy": 2.347288489341736, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.17948533594608307, "step": 4798 }, { "epoch": 0.21818181818181817, "grad_norm": 5.78125, "grad_norm_var": 0.15103759765625, "learning_rate": 0.0001, "loss": 5.8907, "loss/crossentropy": 2.396562337875366, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.1871093511581421, "step": 4800 }, { "epoch": 0.21827272727272728, "grad_norm": 6.78125, "grad_norm_var": 0.20409749348958334, "learning_rate": 0.0001, "loss": 6.071, "loss/crossentropy": 2.573709547519684, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.1882014125585556, "step": 4802 }, { "epoch": 0.21836363636363637, "grad_norm": 6.0625, "grad_norm_var": 0.4895833333333333, "learning_rate": 0.0001, "loss": 5.7366, "loss/crossentropy": 2.29565966129303, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.18237412720918655, "step": 4804 }, { "epoch": 0.21845454545454546, "grad_norm": 5.5625, "grad_norm_var": 0.5033854166666667, "learning_rate": 0.0001, "loss": 5.8181, "loss/crossentropy": 2.394889712333679, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.18040324375033379, "step": 4806 }, { "epoch": 0.21854545454545454, "grad_norm": 5.9375, "grad_norm_var": 0.51373291015625, "learning_rate": 0.0001, "loss": 6.3255, "loss/crossentropy": 2.748706042766571, "loss/hidden": 1.630859375, "loss/jsd": 0.0, "loss/logits": 0.1945958100259304, "step": 4808 }, { "epoch": 0.21863636363636363, "grad_norm": 6.0, "grad_norm_var": 0.48267822265625, "learning_rate": 0.0001, "loss": 6.2698, "loss/crossentropy": 2.68618643283844, "loss/hidden": 1.650390625, "loss/jsd": 0.0, "loss/logits": 0.19332293421030045, "step": 4810 }, { "epoch": 0.21872727272727271, "grad_norm": 6.03125, "grad_norm_var": 0.425244140625, "learning_rate": 0.0001, "loss": 6.3872, "loss/crossentropy": 2.780047655105591, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.1988021656870842, "step": 4812 }, { "epoch": 0.21881818181818183, "grad_norm": 5.96875, "grad_norm_var": 0.417578125, "learning_rate": 0.0001, "loss": 5.8042, "loss/crossentropy": 2.313770055770874, "loss/hidden": 1.662109375, "loss/jsd": 0.0, "loss/logits": 0.1828295737504959, "step": 4814 }, { "epoch": 0.21890909090909091, "grad_norm": 4.90625, "grad_norm_var": 0.47511393229166665, "learning_rate": 0.0001, "loss": 5.8886, "loss/crossentropy": 2.4828322529792786, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.18022475764155388, "step": 4816 }, { "epoch": 0.219, "grad_norm": 6.40625, "grad_norm_var": 0.45201416015625, "learning_rate": 0.0001, "loss": 6.2301, "loss/crossentropy": 2.6824368238449097, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.19675559923052788, "step": 4818 }, { "epoch": 0.2190909090909091, "grad_norm": 6.0, "grad_norm_var": 0.15793863932291666, "learning_rate": 0.0001, "loss": 6.319, "loss/crossentropy": 2.622422456741333, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.20695773512125015, "step": 4820 }, { "epoch": 0.21918181818181817, "grad_norm": 6.6875, "grad_norm_var": 0.17862955729166666, "learning_rate": 0.0001, "loss": 6.3078, "loss/crossentropy": 2.638982832431793, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20125709474086761, "step": 4822 }, { "epoch": 0.21927272727272729, "grad_norm": 5.375, "grad_norm_var": 0.230078125, "learning_rate": 0.0001, "loss": 5.8282, "loss/crossentropy": 2.2640033662319183, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.19001267105340958, "step": 4824 }, { "epoch": 0.21936363636363637, "grad_norm": 5.90625, "grad_norm_var": 0.22652587890625, "learning_rate": 0.0001, "loss": 6.1251, "loss/crossentropy": 2.578502893447876, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.189814705401659, "step": 4826 }, { "epoch": 0.21945454545454546, "grad_norm": 5.875, "grad_norm_var": 0.22440999348958332, "learning_rate": 0.0001, "loss": 6.1577, "loss/crossentropy": 2.6506312489509583, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19055328518152237, "step": 4828 }, { "epoch": 0.21954545454545454, "grad_norm": 5.28125, "grad_norm_var": 0.2511067708333333, "learning_rate": 0.0001, "loss": 5.8207, "loss/crossentropy": 2.380418062210083, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.17878986150026321, "step": 4830 }, { "epoch": 0.21963636363636363, "grad_norm": 5.84375, "grad_norm_var": 0.23573811848958334, "learning_rate": 0.0001, "loss": 5.8931, "loss/crossentropy": 2.4703086018562317, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.18075443804264069, "step": 4832 }, { "epoch": 0.21972727272727272, "grad_norm": 5.09375, "grad_norm_var": 0.2625, "learning_rate": 0.0001, "loss": 5.7949, "loss/crossentropy": 2.386338233947754, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.1801094301044941, "step": 4834 }, { "epoch": 0.21981818181818183, "grad_norm": 6.1875, "grad_norm_var": 0.27681884765625, "learning_rate": 0.0001, "loss": 5.9372, "loss/crossentropy": 2.3972066938877106, "loss/hidden": 1.654296875, "loss/jsd": 0.0, "loss/logits": 0.1885695904493332, "step": 4836 }, { "epoch": 0.21990909090909092, "grad_norm": 7.09375, "grad_norm_var": 0.353515625, "learning_rate": 0.0001, "loss": 6.2215, "loss/crossentropy": 2.690002202987671, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.19279613345861435, "step": 4838 }, { "epoch": 0.22, "grad_norm": 6.3125, "grad_norm_var": 0.30181884765625, "learning_rate": 0.0001, "loss": 5.9277, "loss/crossentropy": 2.47845596075058, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.18301071971654892, "step": 4840 }, { "epoch": 0.2200909090909091, "grad_norm": 6.0625, "grad_norm_var": 0.3068644205729167, "learning_rate": 0.0001, "loss": 5.6663, "loss/crossentropy": 2.3415171802043915, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1734950877726078, "step": 4842 }, { "epoch": 0.22018181818181817, "grad_norm": 6.09375, "grad_norm_var": 0.38170166015625, "learning_rate": 0.0001, "loss": 6.2825, "loss/crossentropy": 2.6106679439544678, "loss/hidden": 1.642578125, "loss/jsd": 0.0, "loss/logits": 0.2029261812567711, "step": 4844 }, { "epoch": 0.22027272727272726, "grad_norm": 5.6875, "grad_norm_var": 0.3646484375, "learning_rate": 0.0001, "loss": 6.1745, "loss/crossentropy": 2.4815784692764282, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.2052343674004078, "step": 4846 }, { "epoch": 0.22036363636363637, "grad_norm": 5.34375, "grad_norm_var": 0.33557535807291666, "learning_rate": 0.0001, "loss": 5.6544, "loss/crossentropy": 2.21720752120018, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.18415364250540733, "step": 4848 }, { "epoch": 0.22045454545454546, "grad_norm": 5.53125, "grad_norm_var": 0.3020833333333333, "learning_rate": 0.0001, "loss": 5.8486, "loss/crossentropy": 2.4414351284503937, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.1793907769024372, "step": 4850 }, { "epoch": 0.22054545454545454, "grad_norm": 6.75, "grad_norm_var": 0.32405192057291665, "learning_rate": 0.0001, "loss": 6.0683, "loss/crossentropy": 2.5654351115226746, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.19110943377017975, "step": 4852 }, { "epoch": 0.22063636363636363, "grad_norm": 5.34375, "grad_norm_var": 0.23470052083333334, "learning_rate": 0.0001, "loss": 5.7032, "loss/crossentropy": 2.325701951980591, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.17368509247899055, "step": 4854 }, { "epoch": 0.22072727272727272, "grad_norm": 11.3125, "grad_norm_var": 2.11451416015625, "learning_rate": 0.0001, "loss": 6.3704, "loss/crossentropy": 2.674340784549713, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.20984004065394402, "step": 4856 }, { "epoch": 0.22081818181818183, "grad_norm": 5.875, "grad_norm_var": 2.111197916666667, "learning_rate": 0.0001, "loss": 6.3269, "loss/crossentropy": 2.7417932748794556, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.19386393204331398, "step": 4858 }, { "epoch": 0.22090909090909092, "grad_norm": 10.8125, "grad_norm_var": 3.47574462890625, "learning_rate": 0.0001, "loss": 6.1443, "loss/crossentropy": 2.489362120628357, "loss/hidden": 1.685546875, "loss/jsd": 0.0, "loss/logits": 0.19693514332175255, "step": 4860 }, { "epoch": 0.221, "grad_norm": 5.53125, "grad_norm_var": 3.514176432291667, "learning_rate": 0.0001, "loss": 5.8366, "loss/crossentropy": 2.380613088607788, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.1823217049241066, "step": 4862 }, { "epoch": 0.2210909090909091, "grad_norm": 6.125, "grad_norm_var": 3.5644816080729167, "learning_rate": 0.0001, "loss": 5.6576, "loss/crossentropy": 2.2133682668209076, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.18035933375358582, "step": 4864 }, { "epoch": 0.22118181818181817, "grad_norm": 5.375, "grad_norm_var": 3.6012858072916667, "learning_rate": 0.0001, "loss": 5.5852, "loss/crossentropy": 2.3028062880039215, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.1702341064810753, "step": 4866 }, { "epoch": 0.22127272727272726, "grad_norm": 6.09375, "grad_norm_var": 3.604931640625, "learning_rate": 0.0001, "loss": 6.2072, "loss/crossentropy": 2.5905571579933167, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.1979910023510456, "step": 4868 }, { "epoch": 0.22136363636363637, "grad_norm": 5.84375, "grad_norm_var": 3.582275390625, "learning_rate": 0.0001, "loss": 6.4145, "loss/crossentropy": 2.7222476601600647, "loss/hidden": 1.630859375, "loss/jsd": 0.0, "loss/logits": 0.20613934844732285, "step": 4870 }, { "epoch": 0.22145454545454546, "grad_norm": 5.625, "grad_norm_var": 1.8950154622395834, "learning_rate": 0.0001, "loss": 6.1539, "loss/crossentropy": 2.596329689025879, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19443176686763763, "step": 4872 }, { "epoch": 0.22154545454545455, "grad_norm": 5.875, "grad_norm_var": 1.8964803059895834, "learning_rate": 0.0001, "loss": 5.8936, "loss/crossentropy": 2.3603259921073914, "loss/hidden": 1.634765625, "loss/jsd": 0.0, "loss/logits": 0.18984685093164444, "step": 4874 }, { "epoch": 0.22163636363636363, "grad_norm": 5.3125, "grad_norm_var": 0.31222330729166664, "learning_rate": 0.0001, "loss": 5.9855, "loss/crossentropy": 2.494263470172882, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.18740437552332878, "step": 4876 }, { "epoch": 0.22172727272727272, "grad_norm": 5.71875, "grad_norm_var": 0.427734375, "learning_rate": 0.0001, "loss": 6.1968, "loss/crossentropy": 2.5374634861946106, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.1983545422554016, "step": 4878 }, { "epoch": 0.22181818181818183, "grad_norm": 6.0625, "grad_norm_var": 0.38084309895833335, "learning_rate": 0.0001, "loss": 6.1084, "loss/crossentropy": 2.4300894141197205, "loss/hidden": 1.662109375, "loss/jsd": 0.0, "loss/logits": 0.2016192376613617, "step": 4880 }, { "epoch": 0.22190909090909092, "grad_norm": 6.21875, "grad_norm_var": 0.3253865559895833, "learning_rate": 0.0001, "loss": 6.1705, "loss/crossentropy": 2.6861830055713654, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18944524601101875, "step": 4882 }, { "epoch": 0.222, "grad_norm": 6.03125, "grad_norm_var": 0.33570556640625, "learning_rate": 0.0001, "loss": 6.0457, "loss/crossentropy": 2.5568532943725586, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.18599044159054756, "step": 4884 }, { "epoch": 0.2220909090909091, "grad_norm": 5.6875, "grad_norm_var": 0.1896484375, "learning_rate": 0.0001, "loss": 5.998, "loss/crossentropy": 2.523762583732605, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.18218698352575302, "step": 4886 }, { "epoch": 0.22218181818181817, "grad_norm": 5.4375, "grad_norm_var": 0.19989827473958333, "learning_rate": 0.0001, "loss": 6.129, "loss/crossentropy": 2.5663060545921326, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19337721168994904, "step": 4888 }, { "epoch": 0.22227272727272726, "grad_norm": 5.65625, "grad_norm_var": 0.19876302083333333, "learning_rate": 0.0001, "loss": 6.4714, "loss/crossentropy": 2.770940124988556, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.20851919054985046, "step": 4890 }, { "epoch": 0.22236363636363637, "grad_norm": 5.90625, "grad_norm_var": 0.20233968098958333, "learning_rate": 0.0001, "loss": 5.9851, "loss/crossentropy": 2.473245859146118, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.18868959695100784, "step": 4892 }, { "epoch": 0.22245454545454546, "grad_norm": 5.5625, "grad_norm_var": 0.09283854166666666, "learning_rate": 0.0001, "loss": 6.2037, "loss/crossentropy": 2.6706987023353577, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.191973727196455, "step": 4894 }, { "epoch": 0.22254545454545455, "grad_norm": 5.125, "grad_norm_var": 0.10539957682291666, "learning_rate": 0.0001, "loss": 5.8991, "loss/crossentropy": 2.4816490411758423, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18275706842541695, "step": 4896 }, { "epoch": 0.22263636363636363, "grad_norm": 5.65625, "grad_norm_var": 0.07498372395833333, "learning_rate": 0.0001, "loss": 5.7908, "loss/crossentropy": 2.411046117544174, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.17821436747908592, "step": 4898 }, { "epoch": 0.22272727272727272, "grad_norm": 5.28125, "grad_norm_var": 0.09862874348958334, "learning_rate": 0.0001, "loss": 6.2962, "loss/crossentropy": 2.616926074028015, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.2038627713918686, "step": 4900 }, { "epoch": 0.22281818181818183, "grad_norm": 5.46875, "grad_norm_var": 0.1080078125, "learning_rate": 0.0001, "loss": 6.2246, "loss/crossentropy": 2.684308171272278, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1942644976079464, "step": 4902 }, { "epoch": 0.22290909090909092, "grad_norm": 4.90625, "grad_norm_var": 0.12867431640625, "learning_rate": 0.0001, "loss": 5.539, "loss/crossentropy": 2.245702385902405, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.17210476472973824, "step": 4904 }, { "epoch": 0.223, "grad_norm": 5.40625, "grad_norm_var": 0.13202718098958333, "learning_rate": 0.0001, "loss": 5.7984, "loss/crossentropy": 2.3572540879249573, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.18298125267028809, "step": 4906 }, { "epoch": 0.2230909090909091, "grad_norm": 5.5, "grad_norm_var": 0.122119140625, "learning_rate": 0.0001, "loss": 5.7506, "loss/crossentropy": 2.3660637736320496, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.18044201284646988, "step": 4908 }, { "epoch": 0.22318181818181818, "grad_norm": 5.6875, "grad_norm_var": 0.18694254557291667, "learning_rate": 0.0001, "loss": 5.8123, "loss/crossentropy": 2.385489284992218, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.1803799793124199, "step": 4910 }, { "epoch": 0.22327272727272726, "grad_norm": 5.65625, "grad_norm_var": 0.17463785807291668, "learning_rate": 0.0001, "loss": 5.5411, "loss/crossentropy": 2.1967233419418335, "loss/hidden": 1.630859375, "loss/jsd": 0.0, "loss/logits": 0.17134756967425346, "step": 4912 }, { "epoch": 0.22336363636363638, "grad_norm": 6.40625, "grad_norm_var": 0.21534830729166668, "learning_rate": 0.0001, "loss": 6.1993, "loss/crossentropy": 2.6713815331459045, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.19009153172373772, "step": 4914 }, { "epoch": 0.22345454545454546, "grad_norm": 5.6875, "grad_norm_var": 0.23411458333333332, "learning_rate": 0.0001, "loss": 6.2813, "loss/crossentropy": 2.6814457774162292, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19943510740995407, "step": 4916 }, { "epoch": 0.22354545454545455, "grad_norm": 5.5625, "grad_norm_var": 0.22626546223958333, "learning_rate": 0.0001, "loss": 5.7544, "loss/crossentropy": 2.371589332818985, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.18144457414746284, "step": 4918 }, { "epoch": 0.22363636363636363, "grad_norm": 5.5, "grad_norm_var": 0.19000244140625, "learning_rate": 0.0001, "loss": 6.1362, "loss/crossentropy": 2.6364822387695312, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.18961920216679573, "step": 4920 }, { "epoch": 0.22372727272727272, "grad_norm": 5.1875, "grad_norm_var": 0.21734619140625, "learning_rate": 0.0001, "loss": 6.0362, "loss/crossentropy": 2.461503714323044, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.19301627203822136, "step": 4922 }, { "epoch": 0.2238181818181818, "grad_norm": 5.6875, "grad_norm_var": 0.23199462890625, "learning_rate": 0.0001, "loss": 5.9599, "loss/crossentropy": 2.444436877965927, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.19002115726470947, "step": 4924 }, { "epoch": 0.22390909090909092, "grad_norm": 4.8125, "grad_norm_var": 0.267822265625, "learning_rate": 0.0001, "loss": 5.2127, "loss/crossentropy": 2.0854178965091705, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.15627839416265488, "step": 4926 }, { "epoch": 0.224, "grad_norm": 6.0, "grad_norm_var": 0.2540201822916667, "learning_rate": 0.0001, "loss": 6.129, "loss/crossentropy": 2.5456132292747498, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19662300497293472, "step": 4928 }, { "epoch": 0.2240909090909091, "grad_norm": 5.8125, "grad_norm_var": 0.22577718098958333, "learning_rate": 0.0001, "loss": 5.7544, "loss/crossentropy": 2.366196483373642, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.17671297118067741, "step": 4930 }, { "epoch": 0.22418181818181818, "grad_norm": 5.71875, "grad_norm_var": 0.178369140625, "learning_rate": 0.0001, "loss": 6.1969, "loss/crossentropy": 2.6246054768562317, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.19238891080021858, "step": 4932 }, { "epoch": 0.22427272727272726, "grad_norm": 5.65625, "grad_norm_var": 0.191650390625, "learning_rate": 0.0001, "loss": 6.2454, "loss/crossentropy": 2.63258296251297, "loss/hidden": 1.630859375, "loss/jsd": 0.0, "loss/logits": 0.19820033758878708, "step": 4934 }, { "epoch": 0.22436363636363638, "grad_norm": 5.40625, "grad_norm_var": 0.21770833333333334, "learning_rate": 0.0001, "loss": 5.7725, "loss/crossentropy": 2.4150819778442383, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.17519601434469223, "step": 4936 }, { "epoch": 0.22445454545454546, "grad_norm": 5.875, "grad_norm_var": 0.181494140625, "learning_rate": 0.0001, "loss": 6.4044, "loss/crossentropy": 2.760706067085266, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20225578173995018, "step": 4938 }, { "epoch": 0.22454545454545455, "grad_norm": 6.21875, "grad_norm_var": 0.15292561848958333, "learning_rate": 0.0001, "loss": 5.8414, "loss/crossentropy": 2.387132942676544, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.1844932623207569, "step": 4940 }, { "epoch": 0.22463636363636363, "grad_norm": 6.09375, "grad_norm_var": 0.14596354166666667, "learning_rate": 0.0001, "loss": 6.4124, "loss/crossentropy": 2.813187301158905, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.2026955969631672, "step": 4942 }, { "epoch": 0.22472727272727272, "grad_norm": 5.5625, "grad_norm_var": 0.21952718098958332, "learning_rate": 0.0001, "loss": 5.5454, "loss/crossentropy": 2.2503121495246887, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.17209018394351006, "step": 4944 }, { "epoch": 0.2248181818181818, "grad_norm": 6.28125, "grad_norm_var": 0.24091389973958333, "learning_rate": 0.0001, "loss": 6.4771, "loss/crossentropy": 2.9071980714797974, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19566133990883827, "step": 4946 }, { "epoch": 0.22490909090909092, "grad_norm": 6.53125, "grad_norm_var": 0.27486979166666664, "learning_rate": 0.0001, "loss": 6.0468, "loss/crossentropy": 2.4622368812561035, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.19732244685292244, "step": 4948 }, { "epoch": 0.225, "grad_norm": 5.65625, "grad_norm_var": 0.27571207682291665, "learning_rate": 0.0001, "loss": 6.4757, "loss/crossentropy": 2.8340529799461365, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.20342683047056198, "step": 4950 }, { "epoch": 0.2250909090909091, "grad_norm": 6.09375, "grad_norm_var": 0.400244140625, "learning_rate": 0.0001, "loss": 6.4794, "loss/crossentropy": 2.760801136493683, "loss/hidden": 1.642578125, "loss/jsd": 0.0, "loss/logits": 0.20760051161050797, "step": 4952 }, { "epoch": 0.22518181818181818, "grad_norm": 5.78125, "grad_norm_var": 0.402734375, "learning_rate": 0.0001, "loss": 5.6567, "loss/crossentropy": 2.2679705023765564, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.17617538571357727, "step": 4954 }, { "epoch": 0.22527272727272726, "grad_norm": 5.8125, "grad_norm_var": 0.38720296223958334, "learning_rate": 0.0001, "loss": 6.4096, "loss/crossentropy": 2.7792826294898987, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.20287635549902916, "step": 4956 }, { "epoch": 0.22536363636363638, "grad_norm": 5.34375, "grad_norm_var": 0.379150390625, "learning_rate": 0.0001, "loss": 6.0848, "loss/crossentropy": 2.5742223858833313, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.1910979561507702, "step": 4958 }, { "epoch": 0.22545454545454546, "grad_norm": 5.125, "grad_norm_var": 0.3115234375, "learning_rate": 0.0001, "loss": 5.8904, "loss/crossentropy": 2.445654034614563, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.18393051996827126, "step": 4960 }, { "epoch": 0.22554545454545455, "grad_norm": 5.09375, "grad_norm_var": 0.38232014973958334, "learning_rate": 0.0001, "loss": 5.6818, "loss/crossentropy": 2.3693887591362, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17498857527971268, "step": 4962 }, { "epoch": 0.22563636363636363, "grad_norm": 5.5, "grad_norm_var": 0.3519490559895833, "learning_rate": 0.0001, "loss": 5.8768, "loss/crossentropy": 2.383093237876892, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.1899917982518673, "step": 4964 }, { "epoch": 0.22572727272727272, "grad_norm": 6.15625, "grad_norm_var": 0.38209228515625, "learning_rate": 0.0001, "loss": 6.334, "loss/crossentropy": 2.7062861919403076, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.20300188288092613, "step": 4966 }, { "epoch": 0.2258181818181818, "grad_norm": 5.78125, "grad_norm_var": 0.18896077473958334, "learning_rate": 0.0001, "loss": 6.2442, "loss/crossentropy": 2.652328610420227, "loss/hidden": 1.642578125, "loss/jsd": 0.0, "loss/logits": 0.19492708891630173, "step": 4968 }, { "epoch": 0.22590909090909092, "grad_norm": 5.65625, "grad_norm_var": 0.1857421875, "learning_rate": 0.0001, "loss": 6.1903, "loss/crossentropy": 2.6292330026626587, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19750796630978584, "step": 4970 }, { "epoch": 0.226, "grad_norm": 5.6875, "grad_norm_var": 0.16343994140625, "learning_rate": 0.0001, "loss": 6.255, "loss/crossentropy": 2.6869053840637207, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.19919318333268166, "step": 4972 }, { "epoch": 0.2260909090909091, "grad_norm": 5.4375, "grad_norm_var": 0.18479410807291666, "learning_rate": 0.0001, "loss": 5.9341, "loss/crossentropy": 2.4876169562339783, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18527605012059212, "step": 4974 }, { "epoch": 0.22618181818181818, "grad_norm": 5.25, "grad_norm_var": 0.17317708333333334, "learning_rate": 0.0001, "loss": 5.8457, "loss/crossentropy": 2.467233180999756, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.17729681357741356, "step": 4976 }, { "epoch": 0.22627272727272726, "grad_norm": 5.0, "grad_norm_var": 0.16259358723958334, "learning_rate": 0.0001, "loss": 5.7053, "loss/crossentropy": 2.3153087198734283, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.17708748579025269, "step": 4978 }, { "epoch": 0.22636363636363635, "grad_norm": 5.8125, "grad_norm_var": 0.16177978515625, "learning_rate": 0.0001, "loss": 6.097, "loss/crossentropy": 2.5837907195091248, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.18882109224796295, "step": 4980 }, { "epoch": 0.22645454545454546, "grad_norm": 6.0625, "grad_norm_var": 0.14127197265625, "learning_rate": 0.0001, "loss": 6.171, "loss/crossentropy": 2.545957922935486, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.1988370716571808, "step": 4982 }, { "epoch": 0.22654545454545455, "grad_norm": 5.71875, "grad_norm_var": 0.099462890625, "learning_rate": 0.0001, "loss": 5.8452, "loss/crossentropy": 2.39152592420578, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18716459721326828, "step": 4984 }, { "epoch": 0.22663636363636364, "grad_norm": 7.75, "grad_norm_var": 0.383837890625, "learning_rate": 0.0001, "loss": 5.9081, "loss/crossentropy": 2.39241886138916, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.1890704184770584, "step": 4986 }, { "epoch": 0.22672727272727272, "grad_norm": 5.3125, "grad_norm_var": 0.38756103515625, "learning_rate": 0.0001, "loss": 6.0442, "loss/crossentropy": 2.559696078300476, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.18731897696852684, "step": 4988 }, { "epoch": 0.2268181818181818, "grad_norm": 5.75, "grad_norm_var": 0.36604410807291665, "learning_rate": 0.0001, "loss": 5.7358, "loss/crossentropy": 2.3791548013687134, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.17629197239875793, "step": 4990 }, { "epoch": 0.22690909090909092, "grad_norm": 5.9375, "grad_norm_var": 0.34475504557291664, "learning_rate": 0.0001, "loss": 6.0202, "loss/crossentropy": 2.560320019721985, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.1840725988149643, "step": 4992 }, { "epoch": 0.227, "grad_norm": 5.8125, "grad_norm_var": 0.2997233072916667, "learning_rate": 0.0001, "loss": 5.9439, "loss/crossentropy": 2.425273299217224, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.1885850541293621, "step": 4994 }, { "epoch": 0.2270909090909091, "grad_norm": 6.96875, "grad_norm_var": 0.39283447265625, "learning_rate": 0.0001, "loss": 5.8494, "loss/crossentropy": 2.198267489671707, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.1959741711616516, "step": 4996 }, { "epoch": 0.22718181818181818, "grad_norm": 6.34375, "grad_norm_var": 0.94312744140625, "learning_rate": 0.0001, "loss": 6.2124, "loss/crossentropy": 2.467073082923889, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.20734605938196182, "step": 4998 }, { "epoch": 0.22727272727272727, "grad_norm": 5.28125, "grad_norm_var": 1.04635009765625, "learning_rate": 0.0001, "loss": 5.5938, "loss/crossentropy": 2.246084839105606, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.17540044337511063, "step": 5000 }, { "epoch": 0.22736363636363635, "grad_norm": 5.34375, "grad_norm_var": 0.8850911458333334, "learning_rate": 0.0001, "loss": 5.9753, "loss/crossentropy": 2.417951762676239, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19479568302631378, "step": 5002 }, { "epoch": 0.22745454545454546, "grad_norm": 5.375, "grad_norm_var": 0.8827473958333333, "learning_rate": 0.0001, "loss": 5.695, "loss/crossentropy": 2.3760430216789246, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.17193235456943512, "step": 5004 }, { "epoch": 0.22754545454545455, "grad_norm": 5.21875, "grad_norm_var": 0.9507120768229167, "learning_rate": 0.0001, "loss": 5.5952, "loss/crossentropy": 2.3154635429382324, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.17055213078856468, "step": 5006 }, { "epoch": 0.22763636363636364, "grad_norm": 5.65625, "grad_norm_var": 1.0162394205729166, "learning_rate": 0.0001, "loss": 5.5496, "loss/crossentropy": 2.2785319685935974, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.16948822513222694, "step": 5008 }, { "epoch": 0.22772727272727272, "grad_norm": 5.71875, "grad_norm_var": 1.0495930989583333, "learning_rate": 0.0001, "loss": 5.9267, "loss/crossentropy": 2.4180286526679993, "loss/hidden": 1.630859375, "loss/jsd": 0.0, "loss/logits": 0.18777703121304512, "step": 5010 }, { "epoch": 0.2278181818181818, "grad_norm": 6.625, "grad_norm_var": 0.97838134765625, "learning_rate": 0.0001, "loss": 6.0061, "loss/crossentropy": 2.4825897216796875, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.18965794891119003, "step": 5012 }, { "epoch": 0.22790909090909092, "grad_norm": 5.625, "grad_norm_var": 0.4642578125, "learning_rate": 0.0001, "loss": 5.8912, "loss/crossentropy": 2.424448072910309, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.18339833244681358, "step": 5014 }, { "epoch": 0.228, "grad_norm": 5.78125, "grad_norm_var": 0.3117472330729167, "learning_rate": 0.0001, "loss": 6.1916, "loss/crossentropy": 2.714663505554199, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18831907957792282, "step": 5016 }, { "epoch": 0.2280909090909091, "grad_norm": 5.625, "grad_norm_var": 0.33039957682291665, "learning_rate": 0.0001, "loss": 5.7179, "loss/crossentropy": 2.407412827014923, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1720632091164589, "step": 5018 }, { "epoch": 0.22818181818181818, "grad_norm": 6.375, "grad_norm_var": 0.38785400390625, "learning_rate": 0.0001, "loss": 6.4253, "loss/crossentropy": 2.6901135444641113, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20945283770561218, "step": 5020 }, { "epoch": 0.22827272727272727, "grad_norm": 5.5625, "grad_norm_var": 0.3363566080729167, "learning_rate": 0.0001, "loss": 6.0422, "loss/crossentropy": 2.5382027626037598, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.18926643952727318, "step": 5022 }, { "epoch": 0.22836363636363635, "grad_norm": 5.71875, "grad_norm_var": 0.2769368489583333, "learning_rate": 0.0001, "loss": 5.8781, "loss/crossentropy": 2.5011588633060455, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1787128485739231, "step": 5024 }, { "epoch": 0.22845454545454547, "grad_norm": 6.53125, "grad_norm_var": 0.273681640625, "learning_rate": 0.0001, "loss": 6.4078, "loss/crossentropy": 2.7735294699668884, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.2028796747326851, "step": 5026 }, { "epoch": 0.22854545454545455, "grad_norm": 6.40625, "grad_norm_var": 0.28157145182291665, "learning_rate": 0.0001, "loss": 5.5442, "loss/crossentropy": 2.1651386618614197, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.17541010677814484, "step": 5028 }, { "epoch": 0.22863636363636364, "grad_norm": 5.59375, "grad_norm_var": 0.2542805989583333, "learning_rate": 0.0001, "loss": 5.7508, "loss/crossentropy": 2.418203055858612, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.17291300371289253, "step": 5030 }, { "epoch": 0.22872727272727272, "grad_norm": 5.5625, "grad_norm_var": 0.24576416015625, "learning_rate": 0.0001, "loss": 5.9165, "loss/crossentropy": 2.495219349861145, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18313901871442795, "step": 5032 }, { "epoch": 0.2288181818181818, "grad_norm": 5.15625, "grad_norm_var": 0.23787434895833334, "learning_rate": 0.0001, "loss": 6.0052, "loss/crossentropy": 2.5639955401420593, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.18377280607819557, "step": 5034 }, { "epoch": 0.2289090909090909, "grad_norm": 6.6875, "grad_norm_var": 0.24410400390625, "learning_rate": 0.0001, "loss": 5.4281, "loss/crossentropy": 2.1530635058879852, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.16930460929870605, "step": 5036 }, { "epoch": 0.229, "grad_norm": 5.6875, "grad_norm_var": 0.24631754557291666, "learning_rate": 0.0001, "loss": 5.8689, "loss/crossentropy": 2.3886294662952423, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.1843525916337967, "step": 5038 }, { "epoch": 0.2290909090909091, "grad_norm": 6.40625, "grad_norm_var": 0.26304931640625, "learning_rate": 0.0001, "loss": 6.2994, "loss/crossentropy": 2.683770716190338, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.20043354853987694, "step": 5040 }, { "epoch": 0.22918181818181818, "grad_norm": 5.75, "grad_norm_var": 0.22603759765625, "learning_rate": 0.0001, "loss": 5.8096, "loss/crossentropy": 2.356851041316986, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.18296677619218826, "step": 5042 }, { "epoch": 0.22927272727272727, "grad_norm": 5.78125, "grad_norm_var": 0.21236572265625, "learning_rate": 0.0001, "loss": 5.443, "loss/crossentropy": 2.1469419598579407, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.16769473254680634, "step": 5044 }, { "epoch": 0.22936363636363635, "grad_norm": 5.71875, "grad_norm_var": 0.19735921223958333, "learning_rate": 0.0001, "loss": 6.3651, "loss/crossentropy": 2.7590052485466003, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.20044903829693794, "step": 5046 }, { "epoch": 0.22945454545454547, "grad_norm": 6.375, "grad_norm_var": 13.2845703125, "learning_rate": 0.0001, "loss": 5.9781, "loss/crossentropy": 2.291284918785095, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.20598673075437546, "step": 5048 }, { "epoch": 0.22954545454545455, "grad_norm": 5.90625, "grad_norm_var": 13.23179931640625, "learning_rate": 0.0001, "loss": 6.0357, "loss/crossentropy": 2.530905783176422, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.189545638859272, "step": 5050 }, { "epoch": 0.22963636363636364, "grad_norm": 6.4375, "grad_norm_var": 13.04039306640625, "learning_rate": 0.0001, "loss": 6.1189, "loss/crossentropy": 2.580171048641205, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.19352232292294502, "step": 5052 }, { "epoch": 0.22972727272727272, "grad_norm": 5.59375, "grad_norm_var": 13.177197265625, "learning_rate": 0.0001, "loss": 5.9617, "loss/crossentropy": 2.462615132331848, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.18995089456439018, "step": 5054 }, { "epoch": 0.2298181818181818, "grad_norm": 6.09375, "grad_norm_var": 13.205322265625, "learning_rate": 0.0001, "loss": 6.4351, "loss/crossentropy": 2.7486367225646973, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.20672940835356712, "step": 5056 }, { "epoch": 0.2299090909090909, "grad_norm": 5.59375, "grad_norm_var": 13.221858723958333, "learning_rate": 0.0001, "loss": 6.2211, "loss/crossentropy": 2.6313478350639343, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19842814654111862, "step": 5058 }, { "epoch": 0.23, "grad_norm": 5.53125, "grad_norm_var": 13.105452473958334, "learning_rate": 0.0001, "loss": 5.6724, "loss/crossentropy": 2.1663262248039246, "loss/hidden": 1.666015625, "loss/jsd": 0.0, "loss/logits": 0.1840081550180912, "step": 5060 }, { "epoch": 0.2300909090909091, "grad_norm": 5.6875, "grad_norm_var": 13.143880208333334, "learning_rate": 0.0001, "loss": 6.0238, "loss/crossentropy": 2.488701581954956, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.19277264550328255, "step": 5062 }, { "epoch": 0.23018181818181818, "grad_norm": 6.21875, "grad_norm_var": 0.2994791666666667, "learning_rate": 0.0001, "loss": 6.0433, "loss/crossentropy": 2.477411448955536, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19526266306638718, "step": 5064 }, { "epoch": 0.23027272727272727, "grad_norm": 5.40625, "grad_norm_var": 0.3126953125, "learning_rate": 0.0001, "loss": 6.2268, "loss/crossentropy": 2.673436403274536, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19362256675958633, "step": 5066 }, { "epoch": 0.23036363636363635, "grad_norm": 5.25, "grad_norm_var": 0.2938435872395833, "learning_rate": 0.0001, "loss": 6.0015, "loss/crossentropy": 2.5071781873703003, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1877165585756302, "step": 5068 }, { "epoch": 0.23045454545454547, "grad_norm": 5.21875, "grad_norm_var": 0.29110921223958336, "learning_rate": 0.0001, "loss": 5.6775, "loss/crossentropy": 2.331337034702301, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.1766052283346653, "step": 5070 }, { "epoch": 0.23054545454545455, "grad_norm": 6.09375, "grad_norm_var": 0.29055989583333336, "learning_rate": 0.0001, "loss": 5.831, "loss/crossentropy": 2.4398198425769806, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18013563007116318, "step": 5072 }, { "epoch": 0.23063636363636364, "grad_norm": 5.59375, "grad_norm_var": 0.28566080729166665, "learning_rate": 0.0001, "loss": 6.04, "loss/crossentropy": 2.5898200273513794, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.18447521328926086, "step": 5074 }, { "epoch": 0.23072727272727273, "grad_norm": 10.625, "grad_norm_var": 1.576171875, "learning_rate": 0.0001, "loss": 6.2593, "loss/crossentropy": 2.665124535560608, "loss/hidden": 1.669921875, "loss/jsd": 0.0, "loss/logits": 0.19242161139845848, "step": 5076 }, { "epoch": 0.2308181818181818, "grad_norm": 5.1875, "grad_norm_var": 1.6130859375, "learning_rate": 0.0001, "loss": 5.7119, "loss/crossentropy": 2.322984218597412, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.18048953637480736, "step": 5078 }, { "epoch": 0.2309090909090909, "grad_norm": 6.71875, "grad_norm_var": 1.6998697916666667, "learning_rate": 0.0001, "loss": 5.9327, "loss/crossentropy": 2.391903817653656, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.18337391316890717, "step": 5080 }, { "epoch": 0.231, "grad_norm": 6.15625, "grad_norm_var": 1.6932942708333334, "learning_rate": 0.0001, "loss": 6.3646, "loss/crossentropy": 2.7243769764900208, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.20386986806988716, "step": 5082 }, { "epoch": 0.2310909090909091, "grad_norm": 6.03125, "grad_norm_var": 1.6484212239583333, "learning_rate": 0.0001, "loss": 5.7144, "loss/crossentropy": 2.37843781709671, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.17441312223672867, "step": 5084 }, { "epoch": 0.23118181818181818, "grad_norm": 5.75, "grad_norm_var": 1.679931640625, "learning_rate": 0.0001, "loss": 6.0961, "loss/crossentropy": 2.5312663316726685, "loss/hidden": 1.638671875, "loss/jsd": 0.0, "loss/logits": 0.1926199607551098, "step": 5086 }, { "epoch": 0.23127272727272727, "grad_norm": 5.125, "grad_norm_var": 1.7846638997395834, "learning_rate": 0.0001, "loss": 5.9052, "loss/crossentropy": 2.5101293325424194, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.1811077743768692, "step": 5088 }, { "epoch": 0.23136363636363635, "grad_norm": 5.375, "grad_norm_var": 1.8001261393229167, "learning_rate": 0.0001, "loss": 5.9496, "loss/crossentropy": 2.4533360600471497, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.19005801901221275, "step": 5090 }, { "epoch": 0.23145454545454547, "grad_norm": 5.15625, "grad_norm_var": 0.32942301432291665, "learning_rate": 0.0001, "loss": 5.8841, "loss/crossentropy": 2.500189244747162, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.17628274857997894, "step": 5092 }, { "epoch": 0.23154545454545455, "grad_norm": 6.0625, "grad_norm_var": 0.30709228515625, "learning_rate": 0.0001, "loss": 6.3059, "loss/crossentropy": 2.6797110438346863, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.2018727958202362, "step": 5094 }, { "epoch": 0.23163636363636364, "grad_norm": 5.96875, "grad_norm_var": 0.21360677083333332, "learning_rate": 0.0001, "loss": 6.1663, "loss/crossentropy": 2.6047465801239014, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19600235670804977, "step": 5096 }, { "epoch": 0.23172727272727273, "grad_norm": 5.3125, "grad_norm_var": 0.20491129557291668, "learning_rate": 0.0001, "loss": 5.8262, "loss/crossentropy": 2.3934826850891113, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.18136228248476982, "step": 5098 }, { "epoch": 0.2318181818181818, "grad_norm": 6.5, "grad_norm_var": 0.18873697916666668, "learning_rate": 0.0001, "loss": 5.7695, "loss/crossentropy": 2.355048179626465, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.17894456163048744, "step": 5100 }, { "epoch": 0.2319090909090909, "grad_norm": 6.0, "grad_norm_var": 0.16510416666666666, "learning_rate": 0.0001, "loss": 5.9722, "loss/crossentropy": 2.461318850517273, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.18858394771814346, "step": 5102 }, { "epoch": 0.232, "grad_norm": 5.875, "grad_norm_var": 0.13674723307291667, "learning_rate": 0.0001, "loss": 6.062, "loss/crossentropy": 2.5160343050956726, "loss/hidden": 1.634765625, "loss/jsd": 0.0, "loss/logits": 0.1911240592598915, "step": 5104 }, { "epoch": 0.2320909090909091, "grad_norm": 5.53125, "grad_norm_var": 0.12740885416666667, "learning_rate": 0.0001, "loss": 5.7995, "loss/crossentropy": 2.400128126144409, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.1799730733036995, "step": 5106 }, { "epoch": 0.23218181818181818, "grad_norm": 5.96875, "grad_norm_var": 0.12537434895833333, "learning_rate": 0.0001, "loss": 6.4989, "loss/crossentropy": 2.7603710889816284, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.2105734944343567, "step": 5108 }, { "epoch": 0.23227272727272727, "grad_norm": 5.625, "grad_norm_var": 0.15636393229166667, "learning_rate": 0.0001, "loss": 5.6953, "loss/crossentropy": 2.3086855113506317, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.1775287240743637, "step": 5110 }, { "epoch": 0.23236363636363636, "grad_norm": 6.375, "grad_norm_var": 0.17941080729166667, "learning_rate": 0.0001, "loss": 6.0729, "loss/crossentropy": 2.5913595855236053, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18877826631069183, "step": 5112 }, { "epoch": 0.23245454545454544, "grad_norm": 5.5, "grad_norm_var": 0.170166015625, "learning_rate": 0.0001, "loss": 6.1678, "loss/crossentropy": 2.636508822441101, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.19394824653863907, "step": 5114 }, { "epoch": 0.23254545454545456, "grad_norm": 6.15625, "grad_norm_var": 0.14351806640625, "learning_rate": 0.0001, "loss": 6.1937, "loss/crossentropy": 2.632579982280731, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.19361691921949387, "step": 5116 }, { "epoch": 0.23263636363636364, "grad_norm": 6.125, "grad_norm_var": 0.16493733723958334, "learning_rate": 0.0001, "loss": 5.9017, "loss/crossentropy": 2.4772740602493286, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.18404768779873848, "step": 5118 }, { "epoch": 0.23272727272727273, "grad_norm": 5.84375, "grad_norm_var": 0.15377197265625, "learning_rate": 0.0001, "loss": 6.0378, "loss/crossentropy": 2.5719524025917053, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18721095100045204, "step": 5120 }, { "epoch": 0.2328181818181818, "grad_norm": 6.4375, "grad_norm_var": 0.193359375, "learning_rate": 0.0001, "loss": 5.6185, "loss/crossentropy": 2.2263936400413513, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.17553826794028282, "step": 5122 }, { "epoch": 0.2329090909090909, "grad_norm": 5.625, "grad_norm_var": 0.16770426432291666, "learning_rate": 0.0001, "loss": 5.7847, "loss/crossentropy": 2.3112049102783203, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.18446330353617668, "step": 5124 }, { "epoch": 0.233, "grad_norm": 5.375, "grad_norm_var": 0.14607747395833334, "learning_rate": 0.0001, "loss": 5.7664, "loss/crossentropy": 2.3372780084609985, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.1819714605808258, "step": 5126 }, { "epoch": 0.2330909090909091, "grad_norm": 5.59375, "grad_norm_var": 0.12141520182291667, "learning_rate": 0.0001, "loss": 5.6071, "loss/crossentropy": 2.2973757684230804, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.17081446386873722, "step": 5128 }, { "epoch": 0.23318181818181818, "grad_norm": 5.90625, "grad_norm_var": 0.12079671223958334, "learning_rate": 0.0001, "loss": 5.7955, "loss/crossentropy": 2.3945239782333374, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.17739750444889069, "step": 5130 }, { "epoch": 0.23327272727272727, "grad_norm": 5.96875, "grad_norm_var": 0.11301676432291667, "learning_rate": 0.0001, "loss": 6.3254, "loss/crossentropy": 2.6872552037239075, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.2028791829943657, "step": 5132 }, { "epoch": 0.23336363636363636, "grad_norm": 5.03125, "grad_norm_var": 0.11952718098958333, "learning_rate": 0.0001, "loss": 5.6891, "loss/crossentropy": 2.3307848274707794, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.17587035708129406, "step": 5134 }, { "epoch": 0.23345454545454544, "grad_norm": 5.9375, "grad_norm_var": 0.11669514973958334, "learning_rate": 0.0001, "loss": 6.0866, "loss/crossentropy": 2.570270359516144, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.18991660326719284, "step": 5136 }, { "epoch": 0.23354545454545456, "grad_norm": 6.0625, "grad_norm_var": 0.07610677083333334, "learning_rate": 0.0001, "loss": 5.8399, "loss/crossentropy": 2.455160081386566, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18027036637067795, "step": 5138 }, { "epoch": 0.23363636363636364, "grad_norm": 5.5, "grad_norm_var": 0.07980143229166667, "learning_rate": 0.0001, "loss": 6.0734, "loss/crossentropy": 2.5290788412094116, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.1934896931052208, "step": 5140 }, { "epoch": 0.23372727272727273, "grad_norm": 5.46875, "grad_norm_var": 0.07629801432291666, "learning_rate": 0.0001, "loss": 5.764, "loss/crossentropy": 2.3864232897758484, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.17857299000024796, "step": 5142 }, { "epoch": 0.23381818181818181, "grad_norm": 5.6875, "grad_norm_var": 0.07506510416666666, "learning_rate": 0.0001, "loss": 6.0864, "loss/crossentropy": 2.62265545129776, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.1870003677904606, "step": 5144 }, { "epoch": 0.2339090909090909, "grad_norm": 5.25, "grad_norm_var": 0.07858072916666667, "learning_rate": 0.0001, "loss": 5.6548, "loss/crossentropy": 2.288139045238495, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.17377522587776184, "step": 5146 }, { "epoch": 0.234, "grad_norm": 5.8125, "grad_norm_var": 0.06808268229166667, "learning_rate": 0.0001, "loss": 6.257, "loss/crossentropy": 2.650491237640381, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19854618981480598, "step": 5148 }, { "epoch": 0.2340909090909091, "grad_norm": 5.46875, "grad_norm_var": 0.05245768229166667, "learning_rate": 0.0001, "loss": 6.1101, "loss/crossentropy": 2.5938669443130493, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.1908797100186348, "step": 5150 }, { "epoch": 0.23418181818181819, "grad_norm": 5.5625, "grad_norm_var": 0.06061197916666667, "learning_rate": 0.0001, "loss": 5.8873, "loss/crossentropy": 2.411479562520981, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.18390553444623947, "step": 5152 }, { "epoch": 0.23427272727272727, "grad_norm": 5.28125, "grad_norm_var": 0.0658203125, "learning_rate": 0.0001, "loss": 5.7977, "loss/crossentropy": 2.4325217604637146, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17949067801237106, "step": 5154 }, { "epoch": 0.23436363636363636, "grad_norm": 5.84375, "grad_norm_var": 0.065625, "learning_rate": 0.0001, "loss": 6.4752, "loss/crossentropy": 2.7148830890655518, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.21118442341685295, "step": 5156 }, { "epoch": 0.23445454545454544, "grad_norm": 5.5625, "grad_norm_var": 0.09335530598958333, "learning_rate": 0.0001, "loss": 6.0218, "loss/crossentropy": 2.5188227891921997, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1897510327398777, "step": 5158 }, { "epoch": 0.23454545454545456, "grad_norm": 5.6875, "grad_norm_var": 0.09842122395833333, "learning_rate": 0.0001, "loss": 6.6169, "loss/crossentropy": 2.911488652229309, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.20959893614053726, "step": 5160 }, { "epoch": 0.23463636363636364, "grad_norm": 6.03125, "grad_norm_var": 0.30696207682291665, "learning_rate": 0.0001, "loss": 6.4037, "loss/crossentropy": 2.7943153381347656, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.19844141229987144, "step": 5162 }, { "epoch": 0.23472727272727273, "grad_norm": 5.75, "grad_norm_var": 0.30670166015625, "learning_rate": 0.0001, "loss": 6.4147, "loss/crossentropy": 2.7694013714790344, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.20300504192709923, "step": 5164 }, { "epoch": 0.23481818181818181, "grad_norm": 5.5625, "grad_norm_var": 0.3309855143229167, "learning_rate": 0.0001, "loss": 6.0446, "loss/crossentropy": 2.5685391426086426, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.1907748058438301, "step": 5166 }, { "epoch": 0.2349090909090909, "grad_norm": 6.0625, "grad_norm_var": 0.35465087890625, "learning_rate": 0.0001, "loss": 5.5644, "loss/crossentropy": 2.253501534461975, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1736655943095684, "step": 5168 }, { "epoch": 0.235, "grad_norm": 5.59375, "grad_norm_var": 0.33318684895833334, "learning_rate": 0.0001, "loss": 5.9825, "loss/crossentropy": 2.4649749398231506, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.186908058822155, "step": 5170 }, { "epoch": 0.2350909090909091, "grad_norm": 5.1875, "grad_norm_var": 0.39112955729166665, "learning_rate": 0.0001, "loss": 5.7046, "loss/crossentropy": 2.3741918802261353, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.17464375868439674, "step": 5172 }, { "epoch": 0.2351818181818182, "grad_norm": 5.5625, "grad_norm_var": 0.38069254557291665, "learning_rate": 0.0001, "loss": 6.0591, "loss/crossentropy": 2.6336739659309387, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1827770657837391, "step": 5174 }, { "epoch": 0.23527272727272727, "grad_norm": 5.84375, "grad_norm_var": 0.37745768229166665, "learning_rate": 0.0001, "loss": 6.1163, "loss/crossentropy": 2.5699557662010193, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.19545649364590645, "step": 5176 }, { "epoch": 0.23536363636363636, "grad_norm": 5.875, "grad_norm_var": 0.13007405598958333, "learning_rate": 0.0001, "loss": 6.0957, "loss/crossentropy": 2.4882991313934326, "loss/hidden": 1.642578125, "loss/jsd": 0.0, "loss/logits": 0.19648586213588715, "step": 5178 }, { "epoch": 0.23545454545454544, "grad_norm": 6.09375, "grad_norm_var": 0.1423828125, "learning_rate": 0.0001, "loss": 6.0316, "loss/crossentropy": 2.543359100818634, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1906256079673767, "step": 5180 }, { "epoch": 0.23554545454545456, "grad_norm": 5.59375, "grad_norm_var": 0.13420817057291667, "learning_rate": 0.0001, "loss": 5.9059, "loss/crossentropy": 2.434583067893982, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.18443574383854866, "step": 5182 }, { "epoch": 0.23563636363636364, "grad_norm": 5.78125, "grad_norm_var": 0.11222330729166667, "learning_rate": 0.0001, "loss": 5.4762, "loss/crossentropy": 2.123971462249756, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.1742822825908661, "step": 5184 }, { "epoch": 0.23572727272727273, "grad_norm": 5.65625, "grad_norm_var": 0.07040608723958333, "learning_rate": 0.0001, "loss": 6.3501, "loss/crossentropy": 2.7084320783615112, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.20225486159324646, "step": 5186 }, { "epoch": 0.23581818181818182, "grad_norm": 6.21875, "grad_norm_var": 0.058186848958333336, "learning_rate": 0.0001, "loss": 6.3749, "loss/crossentropy": 2.717116892337799, "loss/hidden": 1.634765625, "loss/jsd": 0.0, "loss/logits": 0.20230627059936523, "step": 5188 }, { "epoch": 0.2359090909090909, "grad_norm": 6.25, "grad_norm_var": 0.06663004557291667, "learning_rate": 0.0001, "loss": 5.6934, "loss/crossentropy": 2.2847179770469666, "loss/hidden": 1.638671875, "loss/jsd": 0.0, "loss/logits": 0.17699947580695152, "step": 5190 }, { "epoch": 0.236, "grad_norm": 5.8125, "grad_norm_var": 0.18448893229166666, "learning_rate": 0.0001, "loss": 6.1751, "loss/crossentropy": 2.5283588767051697, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.19748730212450027, "step": 5192 }, { "epoch": 0.2360909090909091, "grad_norm": 5.0, "grad_norm_var": 0.21539306640625, "learning_rate": 0.0001, "loss": 6.127, "loss/crossentropy": 2.638889789581299, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18943119421601295, "step": 5194 }, { "epoch": 0.2361818181818182, "grad_norm": 6.125, "grad_norm_var": 0.21519775390625, "learning_rate": 0.0001, "loss": 6.5217, "loss/crossentropy": 2.856294333934784, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.20657844096422195, "step": 5196 }, { "epoch": 0.23627272727272727, "grad_norm": 5.65625, "grad_norm_var": 0.21936442057291666, "learning_rate": 0.0001, "loss": 5.6501, "loss/crossentropy": 2.2674087584018707, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.17420554533600807, "step": 5198 }, { "epoch": 0.23636363636363636, "grad_norm": 5.0, "grad_norm_var": 0.282666015625, "learning_rate": 0.0001, "loss": 5.5125, "loss/crossentropy": 2.2660940289497375, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.16526731848716736, "step": 5200 }, { "epoch": 0.23645454545454545, "grad_norm": 5.3125, "grad_norm_var": 0.29550374348958336, "learning_rate": 0.0001, "loss": 6.3056, "loss/crossentropy": 2.784971535205841, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19151382893323898, "step": 5202 }, { "epoch": 0.23654545454545456, "grad_norm": 6.1875, "grad_norm_var": 0.29423421223958335, "learning_rate": 0.0001, "loss": 6.2065, "loss/crossentropy": 2.5687966346740723, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.20380695909261703, "step": 5204 }, { "epoch": 0.23663636363636364, "grad_norm": 5.84375, "grad_norm_var": 0.2777180989583333, "learning_rate": 0.0001, "loss": 6.228, "loss/crossentropy": 2.6819822192192078, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19484072923660278, "step": 5206 }, { "epoch": 0.23672727272727273, "grad_norm": 5.25, "grad_norm_var": 0.15940348307291666, "learning_rate": 0.0001, "loss": 5.6489, "loss/crossentropy": 2.277124524116516, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.17761225253343582, "step": 5208 }, { "epoch": 0.23681818181818182, "grad_norm": 5.03125, "grad_norm_var": 0.16105143229166666, "learning_rate": 0.0001, "loss": 5.9146, "loss/crossentropy": 2.548310697078705, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.1794060431420803, "step": 5210 }, { "epoch": 0.2369090909090909, "grad_norm": 5.5625, "grad_norm_var": 0.15779622395833334, "learning_rate": 0.0001, "loss": 5.5443, "loss/crossentropy": 2.1790881156921387, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.17733775451779366, "step": 5212 }, { "epoch": 0.237, "grad_norm": 5.125, "grad_norm_var": 0.18138020833333332, "learning_rate": 0.0001, "loss": 5.4496, "loss/crossentropy": 2.2258917093276978, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.1663142368197441, "step": 5214 }, { "epoch": 0.2370909090909091, "grad_norm": 5.28125, "grad_norm_var": 0.12743733723958334, "learning_rate": 0.0001, "loss": 6.059, "loss/crossentropy": 2.6309894323349, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1853836104273796, "step": 5216 }, { "epoch": 0.2371818181818182, "grad_norm": 4.875, "grad_norm_var": 0.13787434895833334, "learning_rate": 0.0001, "loss": 5.6894, "loss/crossentropy": 2.3828718066215515, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.1742075216025114, "step": 5218 }, { "epoch": 0.23727272727272727, "grad_norm": 5.375, "grad_norm_var": 0.09972330729166666, "learning_rate": 0.0001, "loss": 5.797, "loss/crossentropy": 2.4273175597190857, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.17895976454019547, "step": 5220 }, { "epoch": 0.23736363636363636, "grad_norm": 5.21875, "grad_norm_var": 0.07301025390625, "learning_rate": 0.0001, "loss": 5.9549, "loss/crossentropy": 2.546494722366333, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18420402333140373, "step": 5222 }, { "epoch": 0.23745454545454545, "grad_norm": 5.46875, "grad_norm_var": 0.08346354166666667, "learning_rate": 0.0001, "loss": 6.3185, "loss/crossentropy": 2.7423513531684875, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.19433604553341866, "step": 5224 }, { "epoch": 0.23754545454545453, "grad_norm": 6.25, "grad_norm_var": 0.12545572916666667, "learning_rate": 0.0001, "loss": 5.7114, "loss/crossentropy": 2.3536400496959686, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.17777219787240028, "step": 5226 }, { "epoch": 0.23763636363636365, "grad_norm": 5.84375, "grad_norm_var": 1.1558878580729166, "learning_rate": 0.0001, "loss": 5.9185, "loss/crossentropy": 2.5061742663383484, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.17990071326494217, "step": 5228 }, { "epoch": 0.23772727272727273, "grad_norm": 5.5625, "grad_norm_var": 1.1367472330729167, "learning_rate": 0.0001, "loss": 5.9849, "loss/crossentropy": 2.4634615182876587, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.18944432958960533, "step": 5230 }, { "epoch": 0.23781818181818182, "grad_norm": 5.40625, "grad_norm_var": 1.1243326822916666, "learning_rate": 0.0001, "loss": 6.0397, "loss/crossentropy": 2.5353768467903137, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19066617265343666, "step": 5232 }, { "epoch": 0.2379090909090909, "grad_norm": 5.625, "grad_norm_var": 1.0606730143229166, "learning_rate": 0.0001, "loss": 5.9671, "loss/crossentropy": 2.5042720437049866, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.188468337059021, "step": 5234 }, { "epoch": 0.238, "grad_norm": 5.625, "grad_norm_var": 1.0269368489583333, "learning_rate": 0.0001, "loss": 6.1421, "loss/crossentropy": 2.5985350012779236, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.19244717434048653, "step": 5236 }, { "epoch": 0.2380909090909091, "grad_norm": 14.625, "grad_norm_var": 5.690230305989584, "learning_rate": 0.0001, "loss": 6.1595, "loss/crossentropy": 2.596712112426758, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.19475920870900154, "step": 5238 }, { "epoch": 0.2381818181818182, "grad_norm": 6.03125, "grad_norm_var": 5.623726399739583, "learning_rate": 0.0001, "loss": 6.0426, "loss/crossentropy": 2.5592262744903564, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18935208022594452, "step": 5240 }, { "epoch": 0.23827272727272727, "grad_norm": 5.3125, "grad_norm_var": 5.646858723958333, "learning_rate": 0.0001, "loss": 5.5766, "loss/crossentropy": 2.247683823108673, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.17293399944901466, "step": 5242 }, { "epoch": 0.23836363636363636, "grad_norm": 6.0, "grad_norm_var": 5.044462076822916, "learning_rate": 0.0001, "loss": 5.8726, "loss/crossentropy": 2.4405230283737183, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.18363793566823006, "step": 5244 }, { "epoch": 0.23845454545454545, "grad_norm": 5.6875, "grad_norm_var": 5.035009765625, "learning_rate": 0.0001, "loss": 5.9114, "loss/crossentropy": 2.379866451025009, "loss/hidden": 1.658203125, "loss/jsd": 0.0, "loss/logits": 0.1873374916613102, "step": 5246 }, { "epoch": 0.23854545454545453, "grad_norm": 4.875, "grad_norm_var": 5.144856770833333, "learning_rate": 0.0001, "loss": 5.6548, "loss/crossentropy": 2.348492443561554, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.17281513661146164, "step": 5248 }, { "epoch": 0.23863636363636365, "grad_norm": 5.40625, "grad_norm_var": 5.1462890625, "learning_rate": 0.0001, "loss": 5.8666, "loss/crossentropy": 2.3634331822395325, "loss/hidden": 1.642578125, "loss/jsd": 0.0, "loss/logits": 0.1860595978796482, "step": 5250 }, { "epoch": 0.23872727272727273, "grad_norm": 6.0, "grad_norm_var": 5.160921223958334, "learning_rate": 0.0001, "loss": 5.8234, "loss/crossentropy": 2.3800913989543915, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.18319829180836678, "step": 5252 }, { "epoch": 0.23881818181818182, "grad_norm": 5.28125, "grad_norm_var": 0.18709309895833334, "learning_rate": 0.0001, "loss": 5.8768, "loss/crossentropy": 2.4205907583236694, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.1807760763913393, "step": 5254 }, { "epoch": 0.2389090909090909, "grad_norm": 4.96875, "grad_norm_var": 0.19875895182291667, "learning_rate": 0.0001, "loss": 5.6414, "loss/crossentropy": 2.3452791571617126, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17336363717913628, "step": 5256 }, { "epoch": 0.239, "grad_norm": 5.53125, "grad_norm_var": 0.25881754557291664, "learning_rate": 0.0001, "loss": 5.9737, "loss/crossentropy": 2.489455461502075, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.1908075511455536, "step": 5258 }, { "epoch": 0.2390909090909091, "grad_norm": 5.40625, "grad_norm_var": 0.25441080729166665, "learning_rate": 0.0001, "loss": 5.7932, "loss/crossentropy": 2.463314950466156, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.17614973336458206, "step": 5260 }, { "epoch": 0.2391818181818182, "grad_norm": 5.4375, "grad_norm_var": 0.2791666666666667, "learning_rate": 0.0001, "loss": 6.0732, "loss/crossentropy": 2.567935883998871, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.189397931098938, "step": 5262 }, { "epoch": 0.23927272727272728, "grad_norm": 6.25, "grad_norm_var": 0.24914957682291666, "learning_rate": 0.0001, "loss": 6.1352, "loss/crossentropy": 2.5476366877555847, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.195865660905838, "step": 5264 }, { "epoch": 0.23936363636363636, "grad_norm": 5.65625, "grad_norm_var": 0.23144124348958334, "learning_rate": 0.0001, "loss": 6.0378, "loss/crossentropy": 2.5391570925712585, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.18541599810123444, "step": 5266 }, { "epoch": 0.23945454545454545, "grad_norm": 6.03125, "grad_norm_var": 0.21276041666666667, "learning_rate": 0.0001, "loss": 6.0239, "loss/crossentropy": 2.4897753596305847, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19325829297304153, "step": 5268 }, { "epoch": 0.23954545454545453, "grad_norm": 5.875, "grad_norm_var": 0.19230143229166666, "learning_rate": 0.0001, "loss": 6.1359, "loss/crossentropy": 2.577280580997467, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19297057762742043, "step": 5270 }, { "epoch": 0.23963636363636365, "grad_norm": 6.21875, "grad_norm_var": 2.3939453125, "learning_rate": 0.0001, "loss": 6.5337, "loss/crossentropy": 2.669033944606781, "loss/hidden": 1.642578125, "loss/jsd": 0.0, "loss/logits": 0.22220828756690025, "step": 5272 }, { "epoch": 0.23972727272727273, "grad_norm": 5.21875, "grad_norm_var": 2.42340087890625, "learning_rate": 0.0001, "loss": 6.0341, "loss/crossentropy": 2.5090062022209167, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.18922512233257294, "step": 5274 }, { "epoch": 0.23981818181818182, "grad_norm": 5.59375, "grad_norm_var": 2.4196573893229165, "learning_rate": 0.0001, "loss": 5.9334, "loss/crossentropy": 2.4336878061294556, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19020845741033554, "step": 5276 }, { "epoch": 0.2399090909090909, "grad_norm": 5.6875, "grad_norm_var": 2.4187337239583333, "learning_rate": 0.0001, "loss": 5.6786, "loss/crossentropy": 2.317914664745331, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.17825524136424065, "step": 5278 }, { "epoch": 0.24, "grad_norm": 6.1875, "grad_norm_var": 2.432666015625, "learning_rate": 0.0001, "loss": 5.9993, "loss/crossentropy": 2.502106010913849, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.18644026666879654, "step": 5280 }, { "epoch": 0.24009090909090908, "grad_norm": 6.15625, "grad_norm_var": 2.418648274739583, "learning_rate": 0.0001, "loss": 5.919, "loss/crossentropy": 2.4139450788497925, "loss/hidden": 1.654296875, "loss/jsd": 0.0, "loss/logits": 0.18507454171776772, "step": 5282 }, { "epoch": 0.2401818181818182, "grad_norm": 5.46875, "grad_norm_var": 2.472916666666667, "learning_rate": 0.0001, "loss": 5.7872, "loss/crossentropy": 2.433010995388031, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.17818934842944145, "step": 5284 }, { "epoch": 0.24027272727272728, "grad_norm": 5.5, "grad_norm_var": 2.5322550455729167, "learning_rate": 0.0001, "loss": 5.7118, "loss/crossentropy": 2.3821598887443542, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.17476462200284004, "step": 5286 }, { "epoch": 0.24036363636363636, "grad_norm": 5.5625, "grad_norm_var": 0.08954671223958334, "learning_rate": 0.0001, "loss": 6.3059, "loss/crossentropy": 2.7343008518218994, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.1968081183731556, "step": 5288 }, { "epoch": 0.24045454545454545, "grad_norm": 5.09375, "grad_norm_var": 0.10126546223958334, "learning_rate": 0.0001, "loss": 5.9643, "loss/crossentropy": 2.6072933077812195, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1814018040895462, "step": 5290 }, { "epoch": 0.24054545454545453, "grad_norm": 9.625, "grad_norm_var": 1.1499959309895833, "learning_rate": 0.0001, "loss": 6.1166, "loss/crossentropy": 2.6437975764274597, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1898568980395794, "step": 5292 }, { "epoch": 0.24063636363636365, "grad_norm": 7.3125, "grad_norm_var": 1.2705362955729167, "learning_rate": 0.0001, "loss": 6.2873, "loss/crossentropy": 2.7236010432243347, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19738080725073814, "step": 5294 }, { "epoch": 0.24072727272727273, "grad_norm": 5.3125, "grad_norm_var": 1.3015625, "learning_rate": 0.0001, "loss": 6.0669, "loss/crossentropy": 2.534763813018799, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19266269356012344, "step": 5296 }, { "epoch": 0.24081818181818182, "grad_norm": 6.96875, "grad_norm_var": 1.3591756184895833, "learning_rate": 0.0001, "loss": 5.9419, "loss/crossentropy": 2.447086453437805, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.18775959685444832, "step": 5298 }, { "epoch": 0.2409090909090909, "grad_norm": 5.6875, "grad_norm_var": 1.3098795572916666, "learning_rate": 0.0001, "loss": 5.7846, "loss/crossentropy": 2.3713358640670776, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.18077636882662773, "step": 5300 }, { "epoch": 0.241, "grad_norm": 5.9375, "grad_norm_var": 1.2234375, "learning_rate": 0.0001, "loss": 5.3782, "loss/crossentropy": 2.159586489200592, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.16268053650856018, "step": 5302 }, { "epoch": 0.24109090909090908, "grad_norm": 5.90625, "grad_norm_var": 1.1959269205729166, "learning_rate": 0.0001, "loss": 6.0852, "loss/crossentropy": 2.5758556723594666, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.18725790083408356, "step": 5304 }, { "epoch": 0.2411818181818182, "grad_norm": 5.375, "grad_norm_var": 1.1000284830729166, "learning_rate": 0.0001, "loss": 5.9722, "loss/crossentropy": 2.5312931537628174, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.182961106300354, "step": 5306 }, { "epoch": 0.24127272727272728, "grad_norm": 5.5625, "grad_norm_var": 0.30690104166666665, "learning_rate": 0.0001, "loss": 6.0362, "loss/crossentropy": 2.5301268696784973, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.19064684584736824, "step": 5308 }, { "epoch": 0.24136363636363636, "grad_norm": 5.625, "grad_norm_var": 0.18053385416666667, "learning_rate": 0.0001, "loss": 5.6804, "loss/crossentropy": 2.3248682022094727, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.17812839895486832, "step": 5310 }, { "epoch": 0.24145454545454545, "grad_norm": 8.1875, "grad_norm_var": 0.54683837890625, "learning_rate": 0.0001, "loss": 6.1871, "loss/crossentropy": 2.4946274161338806, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.2071414366364479, "step": 5312 }, { "epoch": 0.24154545454545454, "grad_norm": 5.5, "grad_norm_var": 0.4786295572916667, "learning_rate": 0.0001, "loss": 6.1648, "loss/crossentropy": 2.6241593956947327, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19195623695850372, "step": 5314 }, { "epoch": 0.24163636363636365, "grad_norm": 5.4375, "grad_norm_var": 0.48368733723958335, "learning_rate": 0.0001, "loss": 6.2318, "loss/crossentropy": 2.67513769865036, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.19414417818188667, "step": 5316 }, { "epoch": 0.24172727272727274, "grad_norm": 5.65625, "grad_norm_var": 0.48192952473958334, "learning_rate": 0.0001, "loss": 5.9716, "loss/crossentropy": 2.5408445596694946, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.18155302852392197, "step": 5318 }, { "epoch": 0.24181818181818182, "grad_norm": 5.09375, "grad_norm_var": 0.50689697265625, "learning_rate": 0.0001, "loss": 6.0949, "loss/crossentropy": 2.6046436429023743, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.18984968960285187, "step": 5320 }, { "epoch": 0.2419090909090909, "grad_norm": 5.28125, "grad_norm_var": 0.51383056640625, "learning_rate": 0.0001, "loss": 5.6759, "loss/crossentropy": 2.289393424987793, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.17947372794151306, "step": 5322 }, { "epoch": 0.242, "grad_norm": 5.34375, "grad_norm_var": 0.5112263997395833, "learning_rate": 0.0001, "loss": 5.7292, "loss/crossentropy": 2.44137305021286, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.17311672121286392, "step": 5324 }, { "epoch": 0.24209090909090908, "grad_norm": 6.59375, "grad_norm_var": 0.5622395833333333, "learning_rate": 0.0001, "loss": 6.2831, "loss/crossentropy": 2.6836938858032227, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1982182003557682, "step": 5326 }, { "epoch": 0.2421818181818182, "grad_norm": 5.875, "grad_norm_var": 0.12342122395833334, "learning_rate": 0.0001, "loss": 5.7784, "loss/crossentropy": 2.301431328058243, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.18715344369411469, "step": 5328 }, { "epoch": 0.24227272727272728, "grad_norm": 5.53125, "grad_norm_var": 0.11496988932291667, "learning_rate": 0.0001, "loss": 5.6502, "loss/crossentropy": 2.3039159178733826, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.17173577472567558, "step": 5330 }, { "epoch": 0.24236363636363636, "grad_norm": 6.15625, "grad_norm_var": 0.36209309895833336, "learning_rate": 0.0001, "loss": 6.1233, "loss/crossentropy": 2.47801411151886, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.20515811070799828, "step": 5332 }, { "epoch": 0.24245454545454545, "grad_norm": 6.21875, "grad_norm_var": 0.45565999348958336, "learning_rate": 0.0001, "loss": 6.0805, "loss/crossentropy": 2.506300449371338, "loss/hidden": 1.658203125, "loss/jsd": 0.0, "loss/logits": 0.19159870967268944, "step": 5334 }, { "epoch": 0.24254545454545454, "grad_norm": 5.5, "grad_norm_var": 0.42877197265625, "learning_rate": 0.0001, "loss": 5.9481, "loss/crossentropy": 2.485770583152771, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.1860775649547577, "step": 5336 }, { "epoch": 0.24263636363636365, "grad_norm": 5.78125, "grad_norm_var": 0.3878255208333333, "learning_rate": 0.0001, "loss": 5.8544, "loss/crossentropy": 2.4502851963043213, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.18064381927251816, "step": 5338 }, { "epoch": 0.24272727272727274, "grad_norm": 5.09375, "grad_norm_var": 0.4222493489583333, "learning_rate": 0.0001, "loss": 5.6648, "loss/crossentropy": 2.3447375893592834, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1730247475206852, "step": 5340 }, { "epoch": 0.24281818181818182, "grad_norm": 6.96875, "grad_norm_var": 0.47737223307291665, "learning_rate": 0.0001, "loss": 6.3082, "loss/crossentropy": 2.685643196105957, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.19897766411304474, "step": 5342 }, { "epoch": 0.2429090909090909, "grad_norm": 6.09375, "grad_norm_var": 0.45292561848958335, "learning_rate": 0.0001, "loss": 6.3311, "loss/crossentropy": 2.7183467745780945, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.20072459429502487, "step": 5344 }, { "epoch": 0.243, "grad_norm": 6.15625, "grad_norm_var": 0.4279296875, "learning_rate": 0.0001, "loss": 6.331, "loss/crossentropy": 2.715075969696045, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.20007193833589554, "step": 5346 }, { "epoch": 0.24309090909090908, "grad_norm": 5.6875, "grad_norm_var": 0.292822265625, "learning_rate": 0.0001, "loss": 5.8109, "loss/crossentropy": 2.440055698156357, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1792716234922409, "step": 5348 }, { "epoch": 0.2431818181818182, "grad_norm": 5.03125, "grad_norm_var": 0.2548014322916667, "learning_rate": 0.0001, "loss": 5.7038, "loss/crossentropy": 2.386445701122284, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17704646661877632, "step": 5350 }, { "epoch": 0.24327272727272728, "grad_norm": 6.1875, "grad_norm_var": 0.2570597330729167, "learning_rate": 0.0001, "loss": 6.2077, "loss/crossentropy": 2.6149675846099854, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.19853255525231361, "step": 5352 }, { "epoch": 0.24336363636363637, "grad_norm": 7.28125, "grad_norm_var": 0.38157552083333335, "learning_rate": 0.0001, "loss": 6.0805, "loss/crossentropy": 2.5628356337547302, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.1960996612906456, "step": 5354 }, { "epoch": 0.24345454545454545, "grad_norm": 5.15625, "grad_norm_var": 0.390869140625, "learning_rate": 0.0001, "loss": 5.5773, "loss/crossentropy": 2.289062023162842, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.174531240016222, "step": 5356 }, { "epoch": 0.24354545454545454, "grad_norm": 5.375, "grad_norm_var": 0.3485310872395833, "learning_rate": 0.0001, "loss": 5.6073, "loss/crossentropy": 2.4494484066963196, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.16031761839985847, "step": 5358 }, { "epoch": 0.24363636363636362, "grad_norm": 6.09375, "grad_norm_var": 0.3773396809895833, "learning_rate": 0.0001, "loss": 6.0409, "loss/crossentropy": 2.544720947742462, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.19239409640431404, "step": 5360 }, { "epoch": 0.24372727272727274, "grad_norm": 5.90625, "grad_norm_var": 0.34429931640625, "learning_rate": 0.0001, "loss": 6.1239, "loss/crossentropy": 2.654368221759796, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18874580040574074, "step": 5362 }, { "epoch": 0.24381818181818182, "grad_norm": 5.3125, "grad_norm_var": 0.35690104166666664, "learning_rate": 0.0001, "loss": 6.091, "loss/crossentropy": 2.6623011231422424, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.18408234789967537, "step": 5364 }, { "epoch": 0.2439090909090909, "grad_norm": 5.28125, "grad_norm_var": 0.33228759765625, "learning_rate": 0.0001, "loss": 5.9969, "loss/crossentropy": 2.576606512069702, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18499404564499855, "step": 5366 }, { "epoch": 0.244, "grad_norm": 5.5, "grad_norm_var": 0.27120768229166664, "learning_rate": 0.0001, "loss": 5.6127, "loss/crossentropy": 2.2588785886764526, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.17620032280683517, "step": 5368 }, { "epoch": 0.24409090909090908, "grad_norm": 5.6875, "grad_norm_var": 0.07571614583333333, "learning_rate": 0.0001, "loss": 5.9414, "loss/crossentropy": 2.4331492483615875, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.18832852318882942, "step": 5370 }, { "epoch": 0.2441818181818182, "grad_norm": 6.0, "grad_norm_var": 0.07923177083333334, "learning_rate": 0.0001, "loss": 6.2551, "loss/crossentropy": 2.7635170817375183, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.19193029776215553, "step": 5372 }, { "epoch": 0.24427272727272728, "grad_norm": 5.375, "grad_norm_var": 0.09950764973958333, "learning_rate": 0.0001, "loss": 6.1605, "loss/crossentropy": 2.6599742770195007, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1922404281795025, "step": 5374 }, { "epoch": 0.24436363636363637, "grad_norm": 5.28125, "grad_norm_var": 0.087353515625, "learning_rate": 0.0001, "loss": 6.1828, "loss/crossentropy": 2.5899658203125, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.19697966799139977, "step": 5376 }, { "epoch": 0.24445454545454545, "grad_norm": 5.6875, "grad_norm_var": 0.09286702473958333, "learning_rate": 0.0001, "loss": 5.6176, "loss/crossentropy": 2.2818716764450073, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.17283151671290398, "step": 5378 }, { "epoch": 0.24454545454545454, "grad_norm": 5.90625, "grad_norm_var": 0.09019775390625, "learning_rate": 0.0001, "loss": 6.0599, "loss/crossentropy": 2.5388757586479187, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.1921464167535305, "step": 5380 }, { "epoch": 0.24463636363636362, "grad_norm": 6.03125, "grad_norm_var": 0.09814046223958334, "learning_rate": 0.0001, "loss": 5.7048, "loss/crossentropy": 2.2812033593654633, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.18180834501981735, "step": 5382 }, { "epoch": 0.24472727272727274, "grad_norm": 6.96875, "grad_norm_var": 0.21105143229166667, "learning_rate": 0.0001, "loss": 6.0435, "loss/crossentropy": 2.58781099319458, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.18521960452198982, "step": 5384 }, { "epoch": 0.24481818181818182, "grad_norm": 5.5, "grad_norm_var": 0.23199462890625, "learning_rate": 0.0001, "loss": 5.9934, "loss/crossentropy": 2.5542834997177124, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1841483972966671, "step": 5386 }, { "epoch": 0.2449090909090909, "grad_norm": 5.84375, "grad_norm_var": 0.24342447916666668, "learning_rate": 0.0001, "loss": 5.9721, "loss/crossentropy": 2.556259274482727, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18494447693228722, "step": 5388 }, { "epoch": 0.245, "grad_norm": 5.0625, "grad_norm_var": 0.23292643229166668, "learning_rate": 0.0001, "loss": 5.9361, "loss/crossentropy": 2.591788113117218, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18091527000069618, "step": 5390 }, { "epoch": 0.24509090909090908, "grad_norm": 5.125, "grad_norm_var": 0.23474934895833333, "learning_rate": 0.0001, "loss": 5.5952, "loss/crossentropy": 2.3374852538108826, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.16952333226799965, "step": 5392 }, { "epoch": 0.2451818181818182, "grad_norm": 5.6875, "grad_norm_var": 0.23983968098958333, "learning_rate": 0.0001, "loss": 5.6856, "loss/crossentropy": 2.331016778945923, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1768607571721077, "step": 5394 }, { "epoch": 0.24527272727272728, "grad_norm": 5.375, "grad_norm_var": 0.2431640625, "learning_rate": 0.0001, "loss": 5.75, "loss/crossentropy": 2.4123058319091797, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.1730245240032673, "step": 5396 }, { "epoch": 0.24536363636363637, "grad_norm": 5.65625, "grad_norm_var": 0.24685872395833333, "learning_rate": 0.0001, "loss": 6.2942, "loss/crossentropy": 2.7165818214416504, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.19545671716332436, "step": 5398 }, { "epoch": 0.24545454545454545, "grad_norm": 5.8125, "grad_norm_var": 0.13977864583333333, "learning_rate": 0.0001, "loss": 5.9533, "loss/crossentropy": 2.447149693965912, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.18811912089586258, "step": 5400 }, { "epoch": 0.24554545454545454, "grad_norm": 5.96875, "grad_norm_var": 0.16443684895833333, "learning_rate": 0.0001, "loss": 6.5188, "loss/crossentropy": 2.7712361216545105, "loss/hidden": 1.654296875, "loss/jsd": 0.0, "loss/logits": 0.20932698994874954, "step": 5402 }, { "epoch": 0.24563636363636362, "grad_norm": 5.5625, "grad_norm_var": 0.16223958333333333, "learning_rate": 0.0001, "loss": 6.0296, "loss/crossentropy": 2.5046054124832153, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.19137123227119446, "step": 5404 }, { "epoch": 0.24572727272727274, "grad_norm": 6.125, "grad_norm_var": 0.16099853515625, "learning_rate": 0.0001, "loss": 6.2161, "loss/crossentropy": 2.648628532886505, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.19346147775650024, "step": 5406 }, { "epoch": 0.24581818181818182, "grad_norm": 5.46875, "grad_norm_var": 0.14566650390625, "learning_rate": 0.0001, "loss": 5.0274, "loss/crossentropy": 1.8938754796981812, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.15163414180278778, "step": 5408 }, { "epoch": 0.2459090909090909, "grad_norm": 5.65625, "grad_norm_var": 0.2689412434895833, "learning_rate": 0.0001, "loss": 5.5234, "loss/crossentropy": 2.108154773712158, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.17629196867346764, "step": 5410 }, { "epoch": 0.246, "grad_norm": 5.0, "grad_norm_var": 0.2668253580729167, "learning_rate": 0.0001, "loss": 5.6713, "loss/crossentropy": 2.377124547958374, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.16906404867768288, "step": 5412 }, { "epoch": 0.24609090909090908, "grad_norm": 5.5625, "grad_norm_var": 0.26495768229166666, "learning_rate": 0.0001, "loss": 5.7823, "loss/crossentropy": 2.3927263617515564, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.1778211072087288, "step": 5414 }, { "epoch": 0.24618181818181817, "grad_norm": 5.28125, "grad_norm_var": 0.268212890625, "learning_rate": 0.0001, "loss": 5.5944, "loss/crossentropy": 2.2704212069511414, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.17419474199414253, "step": 5416 }, { "epoch": 0.24627272727272728, "grad_norm": 6.21875, "grad_norm_var": 0.28710530598958334, "learning_rate": 0.0001, "loss": 5.6056, "loss/crossentropy": 2.246330350637436, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.17478998005390167, "step": 5418 }, { "epoch": 0.24636363636363637, "grad_norm": 6.15625, "grad_norm_var": 0.31308186848958336, "learning_rate": 0.0001, "loss": 5.9206, "loss/crossentropy": 2.468049705028534, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1862700618803501, "step": 5420 }, { "epoch": 0.24645454545454545, "grad_norm": 6.0625, "grad_norm_var": 0.3043619791666667, "learning_rate": 0.0001, "loss": 5.6383, "loss/crossentropy": 2.2345202565193176, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.18022558093070984, "step": 5422 }, { "epoch": 0.24654545454545454, "grad_norm": 5.375, "grad_norm_var": 0.299853515625, "learning_rate": 0.0001, "loss": 6.033, "loss/crossentropy": 2.5687504410743713, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.1854885146021843, "step": 5424 }, { "epoch": 0.24663636363636363, "grad_norm": 4.71875, "grad_norm_var": 0.20093994140625, "learning_rate": 0.0001, "loss": 5.2152, "loss/crossentropy": 2.0507818460464478, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.16136005334556103, "step": 5426 }, { "epoch": 0.24672727272727274, "grad_norm": 5.90625, "grad_norm_var": 0.21155192057291666, "learning_rate": 0.0001, "loss": 5.984, "loss/crossentropy": 2.5026867389678955, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.186993557959795, "step": 5428 }, { "epoch": 0.24681818181818183, "grad_norm": 5.09375, "grad_norm_var": 0.229931640625, "learning_rate": 0.0001, "loss": 5.6742, "loss/crossentropy": 2.3894774317741394, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17397554591298103, "step": 5430 }, { "epoch": 0.2469090909090909, "grad_norm": 5.4375, "grad_norm_var": 0.22472330729166667, "learning_rate": 0.0001, "loss": 6.1857, "loss/crossentropy": 2.624625027179718, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.19497551396489143, "step": 5432 }, { "epoch": 0.247, "grad_norm": 5.8125, "grad_norm_var": 0.20308837890625, "learning_rate": 0.0001, "loss": 5.7163, "loss/crossentropy": 2.38016539812088, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1754056252539158, "step": 5434 }, { "epoch": 0.24709090909090908, "grad_norm": 5.375, "grad_norm_var": 0.16365559895833334, "learning_rate": 0.0001, "loss": 5.5057, "loss/crossentropy": 2.281724274158478, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.16672980040311813, "step": 5436 }, { "epoch": 0.24718181818181817, "grad_norm": 5.5, "grad_norm_var": 0.14685872395833333, "learning_rate": 0.0001, "loss": 6.0049, "loss/crossentropy": 2.5817041099071503, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.1858750581741333, "step": 5438 }, { "epoch": 0.24727272727272728, "grad_norm": 5.5, "grad_norm_var": 0.15638020833333333, "learning_rate": 0.0001, "loss": 5.6573, "loss/crossentropy": 2.2957158982753754, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.17697428911924362, "step": 5440 }, { "epoch": 0.24736363636363637, "grad_norm": 5.625, "grad_norm_var": 0.11330973307291667, "learning_rate": 0.0001, "loss": 6.019, "loss/crossentropy": 2.604142904281616, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.1846546083688736, "step": 5442 }, { "epoch": 0.24745454545454545, "grad_norm": 6.03125, "grad_norm_var": 0.08967692057291667, "learning_rate": 0.0001, "loss": 6.1997, "loss/crossentropy": 2.5116732120513916, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.203570444136858, "step": 5444 }, { "epoch": 0.24754545454545454, "grad_norm": 5.84375, "grad_norm_var": 0.08332926432291667, "learning_rate": 0.0001, "loss": 6.1423, "loss/crossentropy": 2.637167811393738, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.19328797236084938, "step": 5446 }, { "epoch": 0.24763636363636363, "grad_norm": 5.65625, "grad_norm_var": 0.0810546875, "learning_rate": 0.0001, "loss": 6.0132, "loss/crossentropy": 2.530698776245117, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1877000629901886, "step": 5448 }, { "epoch": 0.24772727272727274, "grad_norm": 7.3125, "grad_norm_var": 0.27584228515625, "learning_rate": 0.0001, "loss": 5.6231, "loss/crossentropy": 2.242911696434021, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18059493973851204, "step": 5450 }, { "epoch": 0.24781818181818183, "grad_norm": 5.0625, "grad_norm_var": 0.45441080729166666, "learning_rate": 0.0001, "loss": 5.944, "loss/crossentropy": 2.4274357557296753, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.19091897457838058, "step": 5452 }, { "epoch": 0.2479090909090909, "grad_norm": 8.5625, "grad_norm_var": 0.9440755208333333, "learning_rate": 0.0001, "loss": 6.1946, "loss/crossentropy": 2.6108630299568176, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.19763021171092987, "step": 5454 }, { "epoch": 0.248, "grad_norm": 6.21875, "grad_norm_var": 0.90426025390625, "learning_rate": 0.0001, "loss": 6.167, "loss/crossentropy": 2.605988025665283, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.1916465237736702, "step": 5456 }, { "epoch": 0.24809090909090908, "grad_norm": 6.5625, "grad_norm_var": 0.83218994140625, "learning_rate": 0.0001, "loss": 6.0502, "loss/crossentropy": 2.4573455154895782, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19561165571212769, "step": 5458 }, { "epoch": 0.24818181818181817, "grad_norm": 5.40625, "grad_norm_var": 0.8620442708333333, "learning_rate": 0.0001, "loss": 5.8024, "loss/crossentropy": 2.418425440788269, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.1799968183040619, "step": 5460 }, { "epoch": 0.24827272727272728, "grad_norm": 5.84375, "grad_norm_var": 0.8749837239583333, "learning_rate": 0.0001, "loss": 5.6787, "loss/crossentropy": 2.3230491280555725, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.1756085492670536, "step": 5462 }, { "epoch": 0.24836363636363637, "grad_norm": 5.625, "grad_norm_var": 0.8673014322916667, "learning_rate": 0.0001, "loss": 6.1539, "loss/crossentropy": 2.657992899417877, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.19002189487218857, "step": 5464 }, { "epoch": 0.24845454545454546, "grad_norm": 5.59375, "grad_norm_var": 0.7606608072916666, "learning_rate": 0.0001, "loss": 6.1693, "loss/crossentropy": 2.6028157472610474, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.19433945789933205, "step": 5466 }, { "epoch": 0.24854545454545454, "grad_norm": 5.90625, "grad_norm_var": 0.6024576822916666, "learning_rate": 0.0001, "loss": 5.9036, "loss/crossentropy": 2.418645143508911, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.18521326780319214, "step": 5468 }, { "epoch": 0.24863636363636363, "grad_norm": 4.9375, "grad_norm_var": 0.21301676432291666, "learning_rate": 0.0001, "loss": 5.6456, "loss/crossentropy": 2.3324727416038513, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.17369108647108078, "step": 5470 }, { "epoch": 0.2487272727272727, "grad_norm": 5.4375, "grad_norm_var": 0.14322916666666666, "learning_rate": 0.0001, "loss": 5.9638, "loss/crossentropy": 2.543529987335205, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1853884942829609, "step": 5472 }, { "epoch": 0.24881818181818183, "grad_norm": 5.96875, "grad_norm_var": 0.0814453125, "learning_rate": 0.0001, "loss": 6.15, "loss/crossentropy": 2.5999763011932373, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.19387077167630196, "step": 5474 }, { "epoch": 0.2489090909090909, "grad_norm": 5.21875, "grad_norm_var": 0.08943684895833333, "learning_rate": 0.0001, "loss": 5.8549, "loss/crossentropy": 2.464403808116913, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1804525926709175, "step": 5476 }, { "epoch": 0.249, "grad_norm": 4.84375, "grad_norm_var": 0.108447265625, "learning_rate": 0.0001, "loss": 5.6678, "loss/crossentropy": 2.424163281917572, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.16850892826914787, "step": 5478 }, { "epoch": 0.24909090909090909, "grad_norm": 5.75, "grad_norm_var": 0.10777587890625, "learning_rate": 0.0001, "loss": 6.1004, "loss/crossentropy": 2.5805919766426086, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.18987076729536057, "step": 5480 }, { "epoch": 0.24918181818181817, "grad_norm": 5.375, "grad_norm_var": 0.10692952473958334, "learning_rate": 0.0001, "loss": 6.0516, "loss/crossentropy": 2.594587802886963, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18672002479434013, "step": 5482 }, { "epoch": 0.24927272727272728, "grad_norm": 5.21875, "grad_norm_var": 0.09569905598958334, "learning_rate": 0.0001, "loss": 5.7342, "loss/crossentropy": 2.3581422567367554, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18057045340538025, "step": 5484 }, { "epoch": 0.24936363636363637, "grad_norm": 5.0625, "grad_norm_var": 0.10666910807291667, "learning_rate": 0.0001, "loss": 5.3662, "loss/crossentropy": 2.144623726606369, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.1657106988132, "step": 5486 }, { "epoch": 0.24945454545454546, "grad_norm": 5.6875, "grad_norm_var": 0.11047770182291666, "learning_rate": 0.0001, "loss": 5.8697, "loss/crossentropy": 2.440901279449463, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.1836998537182808, "step": 5488 }, { "epoch": 0.24954545454545454, "grad_norm": 5.25, "grad_norm_var": 0.08941650390625, "learning_rate": 0.0001, "loss": 6.1447, "loss/crossentropy": 2.644778072834015, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.19080809503793716, "step": 5490 }, { "epoch": 0.24963636363636363, "grad_norm": 5.40625, "grad_norm_var": 0.08170572916666667, "learning_rate": 0.0001, "loss": 6.0351, "loss/crossentropy": 2.5821409821510315, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.18572163581848145, "step": 5492 }, { "epoch": 0.24972727272727271, "grad_norm": 5.4375, "grad_norm_var": 0.06378580729166666, "learning_rate": 0.0001, "loss": 5.83, "loss/crossentropy": 2.508774995803833, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1766526885330677, "step": 5494 }, { "epoch": 0.24981818181818183, "grad_norm": 5.84375, "grad_norm_var": 0.06627604166666666, "learning_rate": 0.0001, "loss": 6.0031, "loss/crossentropy": 2.444332480430603, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19220781326293945, "step": 5496 }, { "epoch": 0.24990909090909091, "grad_norm": 5.90625, "grad_norm_var": 0.09998372395833334, "learning_rate": 0.0001, "loss": 6.3226, "loss/crossentropy": 2.719578206539154, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.20268452912569046, "step": 5498 }, { "epoch": 0.25, "grad_norm": 5.4375, "grad_norm_var": 0.09295247395833334, "learning_rate": 0.0001, "loss": 5.7878, "loss/crossentropy": 2.45141339302063, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.17562933266162872, "step": 5500 }, { "epoch": 0.2500909090909091, "grad_norm": 5.53125, "grad_norm_var": 0.058426920572916666, "learning_rate": 0.0001, "loss": 5.3318, "loss/crossentropy": 2.169805109500885, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.1548670120537281, "step": 5502 }, { "epoch": 0.25018181818181817, "grad_norm": 5.78125, "grad_norm_var": 0.060347493489583334, "learning_rate": 0.0001, "loss": 5.8786, "loss/crossentropy": 2.4128172993659973, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.1866133213043213, "step": 5504 }, { "epoch": 0.25027272727272726, "grad_norm": 5.6875, "grad_norm_var": 0.059375, "learning_rate": 0.0001, "loss": 5.9687, "loss/crossentropy": 2.415063261985779, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19325630739331245, "step": 5506 }, { "epoch": 0.25036363636363634, "grad_norm": 5.25, "grad_norm_var": 0.06471354166666667, "learning_rate": 0.0001, "loss": 5.8791, "loss/crossentropy": 2.4326528906822205, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.18428919464349747, "step": 5508 }, { "epoch": 0.25045454545454543, "grad_norm": 5.34375, "grad_norm_var": 0.06682535807291666, "learning_rate": 0.0001, "loss": 5.7253, "loss/crossentropy": 2.3943344354629517, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1764531023800373, "step": 5510 }, { "epoch": 0.25054545454545457, "grad_norm": 5.90625, "grad_norm_var": 0.07311197916666666, "learning_rate": 0.0001, "loss": 5.7931, "loss/crossentropy": 2.4150326251983643, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.17804372683167458, "step": 5512 }, { "epoch": 0.25063636363636366, "grad_norm": 5.5625, "grad_norm_var": 0.057906087239583334, "learning_rate": 0.0001, "loss": 5.93, "loss/crossentropy": 2.469946026802063, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.18683026731014252, "step": 5514 }, { "epoch": 0.25072727272727274, "grad_norm": 6.625, "grad_norm_var": 0.133837890625, "learning_rate": 0.0001, "loss": 5.9248, "loss/crossentropy": 2.5529792308807373, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.17995600402355194, "step": 5516 }, { "epoch": 0.25081818181818183, "grad_norm": 6.03125, "grad_norm_var": 0.17913004557291667, "learning_rate": 0.0001, "loss": 6.1995, "loss/crossentropy": 2.597653329372406, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.19787822291254997, "step": 5518 }, { "epoch": 0.2509090909090909, "grad_norm": 6.5, "grad_norm_var": 0.21204020182291666, "learning_rate": 0.0001, "loss": 5.6408, "loss/crossentropy": 2.2740082144737244, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.17262009903788567, "step": 5520 }, { "epoch": 0.251, "grad_norm": 6.09375, "grad_norm_var": 0.34244384765625, "learning_rate": 0.0001, "loss": 6.6698, "loss/crossentropy": 2.909782826900482, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.2135024219751358, "step": 5522 }, { "epoch": 0.2510909090909091, "grad_norm": 5.71875, "grad_norm_var": 0.34918212890625, "learning_rate": 0.0001, "loss": 5.7698, "loss/crossentropy": 2.4422265887260437, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1757289655506611, "step": 5524 }, { "epoch": 0.2511818181818182, "grad_norm": 5.90625, "grad_norm_var": 0.30520426432291664, "learning_rate": 0.0001, "loss": 5.828, "loss/crossentropy": 2.441142976284027, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.18146229162812233, "step": 5526 }, { "epoch": 0.25127272727272726, "grad_norm": 5.21875, "grad_norm_var": 0.33865559895833336, "learning_rate": 0.0001, "loss": 5.7602, "loss/crossentropy": 2.424628883600235, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.17789068445563316, "step": 5528 }, { "epoch": 0.25136363636363634, "grad_norm": 5.21875, "grad_norm_var": 0.36573893229166665, "learning_rate": 0.0001, "loss": 5.7573, "loss/crossentropy": 2.368686854839325, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.17929479107260704, "step": 5530 }, { "epoch": 0.25145454545454543, "grad_norm": 5.9375, "grad_norm_var": 0.3365885416666667, "learning_rate": 0.0001, "loss": 5.8684, "loss/crossentropy": 2.45298433303833, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.18510109931230545, "step": 5532 }, { "epoch": 0.25154545454545457, "grad_norm": 5.8125, "grad_norm_var": 0.35064697265625, "learning_rate": 0.0001, "loss": 5.7882, "loss/crossentropy": 2.471168637275696, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17721160873770714, "step": 5534 }, { "epoch": 0.25163636363636366, "grad_norm": 5.59375, "grad_norm_var": 0.31698811848958336, "learning_rate": 0.0001, "loss": 5.8644, "loss/crossentropy": 2.4927144646644592, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.17740661650896072, "step": 5536 }, { "epoch": 0.25172727272727274, "grad_norm": 5.6875, "grad_norm_var": 0.11608072916666666, "learning_rate": 0.0001, "loss": 6.1146, "loss/crossentropy": 2.6247618198394775, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.18843841180205345, "step": 5538 }, { "epoch": 0.25181818181818183, "grad_norm": 5.5, "grad_norm_var": 0.10354410807291667, "learning_rate": 0.0001, "loss": 6.0087, "loss/crossentropy": 2.516871213912964, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.18785270303487778, "step": 5540 }, { "epoch": 0.2519090909090909, "grad_norm": 5.8125, "grad_norm_var": 0.09894205729166666, "learning_rate": 0.0001, "loss": 6.1602, "loss/crossentropy": 2.6798150539398193, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18905555829405785, "step": 5542 }, { "epoch": 0.252, "grad_norm": 5.9375, "grad_norm_var": 0.10240885416666666, "learning_rate": 0.0001, "loss": 5.8611, "loss/crossentropy": 2.416464775800705, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.1843029260635376, "step": 5544 }, { "epoch": 0.2520909090909091, "grad_norm": 5.46875, "grad_norm_var": 0.09646809895833333, "learning_rate": 0.0001, "loss": 6.2581, "loss/crossentropy": 2.7282729148864746, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19438473507761955, "step": 5546 }, { "epoch": 0.2521818181818182, "grad_norm": 5.28125, "grad_norm_var": 0.0798828125, "learning_rate": 0.0001, "loss": 5.9722, "loss/crossentropy": 2.433922588825226, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19406605139374733, "step": 5548 }, { "epoch": 0.25227272727272726, "grad_norm": 5.34375, "grad_norm_var": 0.05097249348958333, "learning_rate": 0.0001, "loss": 5.6352, "loss/crossentropy": 2.3381550014019012, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17306331172585487, "step": 5550 }, { "epoch": 0.25236363636363635, "grad_norm": 5.75, "grad_norm_var": 0.523291015625, "learning_rate": 0.0001, "loss": 6.2523, "loss/crossentropy": 2.7281041741371155, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.19362905621528625, "step": 5552 }, { "epoch": 0.25245454545454543, "grad_norm": 5.40625, "grad_norm_var": 0.5265625, "learning_rate": 0.0001, "loss": 6.2012, "loss/crossentropy": 2.688320517539978, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.1919160969555378, "step": 5554 }, { "epoch": 0.2525454545454546, "grad_norm": 5.8125, "grad_norm_var": 0.5217081705729166, "learning_rate": 0.0001, "loss": 6.1159, "loss/crossentropy": 2.6733813285827637, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.1862419992685318, "step": 5556 }, { "epoch": 0.25263636363636366, "grad_norm": 6.53125, "grad_norm_var": 0.5636555989583333, "learning_rate": 0.0001, "loss": 6.1409, "loss/crossentropy": 2.605445444583893, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19495071843266487, "step": 5558 }, { "epoch": 0.25272727272727274, "grad_norm": 5.84375, "grad_norm_var": 0.542822265625, "learning_rate": 0.0001, "loss": 6.1827, "loss/crossentropy": 2.58667129278183, "loss/hidden": 1.669921875, "loss/jsd": 0.0, "loss/logits": 0.19260944053530693, "step": 5560 }, { "epoch": 0.25281818181818183, "grad_norm": 5.59375, "grad_norm_var": 0.5341756184895833, "learning_rate": 0.0001, "loss": 5.7683, "loss/crossentropy": 2.3552408814430237, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.18251241371035576, "step": 5562 }, { "epoch": 0.2529090909090909, "grad_norm": 5.8125, "grad_norm_var": 0.5359334309895833, "learning_rate": 0.0001, "loss": 5.7991, "loss/crossentropy": 2.4319608211517334, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.178508672863245, "step": 5564 }, { "epoch": 0.253, "grad_norm": 5.65625, "grad_norm_var": 0.4818359375, "learning_rate": 0.0001, "loss": 6.2756, "loss/crossentropy": 2.75134015083313, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19617345929145813, "step": 5566 }, { "epoch": 0.2530909090909091, "grad_norm": 5.4375, "grad_norm_var": 0.0958984375, "learning_rate": 0.0001, "loss": 5.4483, "loss/crossentropy": 2.1901616156101227, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.1682014986872673, "step": 5568 }, { "epoch": 0.2531818181818182, "grad_norm": 5.3125, "grad_norm_var": 0.11848551432291667, "learning_rate": 0.0001, "loss": 5.7476, "loss/crossentropy": 2.2940013110637665, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.18129675835371017, "step": 5570 }, { "epoch": 0.25327272727272726, "grad_norm": 5.375, "grad_norm_var": 0.12076416015625, "learning_rate": 0.0001, "loss": 5.6555, "loss/crossentropy": 2.2844072580337524, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.17733904346823692, "step": 5572 }, { "epoch": 0.25336363636363635, "grad_norm": 5.625, "grad_norm_var": 0.07415364583333334, "learning_rate": 0.0001, "loss": 5.8942, "loss/crossentropy": 2.469766616821289, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18346331641077995, "step": 5574 }, { "epoch": 0.25345454545454543, "grad_norm": 5.03125, "grad_norm_var": 0.08995768229166666, "learning_rate": 0.0001, "loss": 5.7791, "loss/crossentropy": 2.3861950635910034, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.17912978306412697, "step": 5576 }, { "epoch": 0.2535454545454545, "grad_norm": 5.8125, "grad_norm_var": 0.09319254557291666, "learning_rate": 0.0001, "loss": 6.3076, "loss/crossentropy": 2.7590107321739197, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19509190693497658, "step": 5578 }, { "epoch": 0.25363636363636366, "grad_norm": 5.375, "grad_norm_var": 0.10598958333333333, "learning_rate": 0.0001, "loss": 5.7683, "loss/crossentropy": 2.3720486164093018, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18063966929912567, "step": 5580 }, { "epoch": 0.25372727272727275, "grad_norm": 5.0625, "grad_norm_var": 0.15823160807291667, "learning_rate": 0.0001, "loss": 5.9357, "loss/crossentropy": 2.5038710832595825, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.1822444126009941, "step": 5582 }, { "epoch": 0.25381818181818183, "grad_norm": 5.5, "grad_norm_var": 0.15831705729166667, "learning_rate": 0.0001, "loss": 5.8745, "loss/crossentropy": 2.451191544532776, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1841241978108883, "step": 5584 }, { "epoch": 0.2539090909090909, "grad_norm": 5.90625, "grad_norm_var": 0.14654541015625, "learning_rate": 0.0001, "loss": 6.1285, "loss/crossentropy": 2.5469164848327637, "loss/hidden": 1.630859375, "loss/jsd": 0.0, "loss/logits": 0.19507109373807907, "step": 5586 }, { "epoch": 0.254, "grad_norm": 5.96875, "grad_norm_var": 0.147265625, "learning_rate": 0.0001, "loss": 5.898, "loss/crossentropy": 2.440480053424835, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.18617907166481018, "step": 5588 }, { "epoch": 0.2540909090909091, "grad_norm": 5.65625, "grad_norm_var": 0.16822509765625, "learning_rate": 0.0001, "loss": 5.8007, "loss/crossentropy": 2.498827815055847, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1723771095275879, "step": 5590 }, { "epoch": 0.2541818181818182, "grad_norm": 5.65625, "grad_norm_var": 0.13932291666666666, "learning_rate": 0.0001, "loss": 6.1354, "loss/crossentropy": 2.5798068046569824, "loss/hidden": 1.646484375, "loss/jsd": 0.0, "loss/logits": 0.1909101866185665, "step": 5592 }, { "epoch": 0.25427272727272726, "grad_norm": 5.71875, "grad_norm_var": 0.16378580729166667, "learning_rate": 0.0001, "loss": 5.944, "loss/crossentropy": 2.4900847673416138, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18757658451795578, "step": 5594 }, { "epoch": 0.25436363636363635, "grad_norm": 5.28125, "grad_norm_var": 0.15675455729166668, "learning_rate": 0.0001, "loss": 5.9656, "loss/crossentropy": 2.5626303255558014, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.18013693764805794, "step": 5596 }, { "epoch": 0.25445454545454543, "grad_norm": 5.6875, "grad_norm_var": 0.10208333333333333, "learning_rate": 0.0001, "loss": 5.9794, "loss/crossentropy": 2.5559308528900146, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.18512073159217834, "step": 5598 }, { "epoch": 0.2545454545454545, "grad_norm": 5.3125, "grad_norm_var": 0.11248372395833334, "learning_rate": 0.0001, "loss": 5.8391, "loss/crossentropy": 2.4928591549396515, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.18208377808332443, "step": 5600 }, { "epoch": 0.25463636363636366, "grad_norm": 5.71875, "grad_norm_var": 0.08004150390625, "learning_rate": 0.0001, "loss": 6.0721, "loss/crossentropy": 2.635941207408905, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1854078769683838, "step": 5602 }, { "epoch": 0.25472727272727275, "grad_norm": 5.875, "grad_norm_var": 0.0875, "learning_rate": 0.0001, "loss": 5.6652, "loss/crossentropy": 2.3502303659915924, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.173882856965065, "step": 5604 }, { "epoch": 0.25481818181818183, "grad_norm": 6.21875, "grad_norm_var": 0.11171875, "learning_rate": 0.0001, "loss": 5.7242, "loss/crossentropy": 2.2552677392959595, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1851724348962307, "step": 5606 }, { "epoch": 0.2549090909090909, "grad_norm": 5.4375, "grad_norm_var": 0.10136311848958333, "learning_rate": 0.0001, "loss": 5.8751, "loss/crossentropy": 2.4727302193641663, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.18183637410402298, "step": 5608 }, { "epoch": 0.255, "grad_norm": 5.75, "grad_norm_var": 0.094775390625, "learning_rate": 0.0001, "loss": 6.148, "loss/crossentropy": 2.580852687358856, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19850869476795197, "step": 5610 }, { "epoch": 0.2550909090909091, "grad_norm": 5.09375, "grad_norm_var": 0.0958984375, "learning_rate": 0.0001, "loss": 5.4834, "loss/crossentropy": 2.3054490089416504, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.16350267454981804, "step": 5612 }, { "epoch": 0.2551818181818182, "grad_norm": 4.8125, "grad_norm_var": 0.21890869140625, "learning_rate": 0.0001, "loss": 5.5423, "loss/crossentropy": 2.2180136144161224, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.17324518412351608, "step": 5614 }, { "epoch": 0.25527272727272726, "grad_norm": 5.75, "grad_norm_var": 0.20896809895833332, "learning_rate": 0.0001, "loss": 5.8746, "loss/crossentropy": 2.4710484743118286, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18489113822579384, "step": 5616 }, { "epoch": 0.25536363636363635, "grad_norm": 6.34375, "grad_norm_var": 0.24373372395833334, "learning_rate": 0.0001, "loss": 5.8781, "loss/crossentropy": 2.398476719856262, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.18663563579320908, "step": 5618 }, { "epoch": 0.25545454545454543, "grad_norm": 5.59375, "grad_norm_var": 0.21131184895833333, "learning_rate": 0.0001, "loss": 5.8308, "loss/crossentropy": 2.4484116435050964, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.1790611557662487, "step": 5620 }, { "epoch": 0.2555454545454545, "grad_norm": 5.96875, "grad_norm_var": 0.19687093098958333, "learning_rate": 0.0001, "loss": 6.0103, "loss/crossentropy": 2.579889476299286, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.18229443207383156, "step": 5622 }, { "epoch": 0.25563636363636366, "grad_norm": 11.0625, "grad_norm_var": 1.98345947265625, "learning_rate": 0.0001, "loss": 6.3111, "loss/crossentropy": 2.618342161178589, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.20814233273267746, "step": 5624 }, { "epoch": 0.25572727272727275, "grad_norm": 5.34375, "grad_norm_var": 2.005322265625, "learning_rate": 0.0001, "loss": 5.7871, "loss/crossentropy": 2.434260904788971, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.1739519014954567, "step": 5626 }, { "epoch": 0.25581818181818183, "grad_norm": 4.9375, "grad_norm_var": 2.0135701497395835, "learning_rate": 0.0001, "loss": 5.8625, "loss/crossentropy": 2.4456624388694763, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.18406951054930687, "step": 5628 }, { "epoch": 0.2559090909090909, "grad_norm": 5.53125, "grad_norm_var": 1.9063802083333334, "learning_rate": 0.0001, "loss": 5.9064, "loss/crossentropy": 2.4677192866802216, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18644771352410316, "step": 5630 }, { "epoch": 0.256, "grad_norm": 7.375, "grad_norm_var": 2.0436197916666665, "learning_rate": 0.0001, "loss": 5.7712, "loss/crossentropy": 2.330005466938019, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.1818121336400509, "step": 5632 }, { "epoch": 0.2560909090909091, "grad_norm": 5.1875, "grad_norm_var": 2.098030598958333, "learning_rate": 0.0001, "loss": 5.5286, "loss/crossentropy": 2.3409968614578247, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.16387873142957687, "step": 5634 }, { "epoch": 0.2561818181818182, "grad_norm": 6.5625, "grad_norm_var": 2.0798014322916667, "learning_rate": 0.0001, "loss": 5.7164, "loss/crossentropy": 2.3823477625846863, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.17598765715956688, "step": 5636 }, { "epoch": 0.25627272727272726, "grad_norm": 8.25, "grad_norm_var": 2.321610514322917, "learning_rate": 0.0001, "loss": 6.0736, "loss/crossentropy": 2.4820277094841003, "loss/hidden": 1.638671875, "loss/jsd": 0.0, "loss/logits": 0.19529039785265923, "step": 5638 }, { "epoch": 0.25636363636363635, "grad_norm": 6.5625, "grad_norm_var": 0.7353800455729167, "learning_rate": 0.0001, "loss": 6.2357, "loss/crossentropy": 2.690007448196411, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.1922646500170231, "step": 5640 }, { "epoch": 0.25645454545454544, "grad_norm": 5.75, "grad_norm_var": 0.7333170572916666, "learning_rate": 0.0001, "loss": 5.6039, "loss/crossentropy": 2.1963719725608826, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.17922711744904518, "step": 5642 }, { "epoch": 0.2565454545454545, "grad_norm": 5.40625, "grad_norm_var": 0.6854777018229167, "learning_rate": 0.0001, "loss": 6.2353, "loss/crossentropy": 2.7393441200256348, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19177977740764618, "step": 5644 }, { "epoch": 0.25663636363636366, "grad_norm": 6.65625, "grad_norm_var": 0.68248291015625, "learning_rate": 0.0001, "loss": 6.1188, "loss/crossentropy": 2.5100232362747192, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.19837657362222672, "step": 5646 }, { "epoch": 0.25672727272727275, "grad_norm": 5.96875, "grad_norm_var": 0.6883748372395834, "learning_rate": 0.0001, "loss": 6.1544, "loss/crossentropy": 2.5245312452316284, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.19970984011888504, "step": 5648 }, { "epoch": 0.25681818181818183, "grad_norm": 5.09375, "grad_norm_var": 0.7385701497395833, "learning_rate": 0.0001, "loss": 5.3857, "loss/crossentropy": 2.195163607597351, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.16104578413069248, "step": 5650 }, { "epoch": 0.2569090909090909, "grad_norm": 5.59375, "grad_norm_var": 0.7557942708333333, "learning_rate": 0.0001, "loss": 5.7288, "loss/crossentropy": 2.335300236940384, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.17841604351997375, "step": 5652 }, { "epoch": 0.257, "grad_norm": 5.4375, "grad_norm_var": 0.5055948893229166, "learning_rate": 0.0001, "loss": 5.981, "loss/crossentropy": 2.590072274208069, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.18225755915045738, "step": 5654 }, { "epoch": 0.2570909090909091, "grad_norm": 5.5, "grad_norm_var": 0.53131103515625, "learning_rate": 0.0001, "loss": 5.6074, "loss/crossentropy": 2.2967518866062164, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.1754044145345688, "step": 5656 }, { "epoch": 0.2571818181818182, "grad_norm": 5.4375, "grad_norm_var": 0.5348917643229166, "learning_rate": 0.0001, "loss": 5.7554, "loss/crossentropy": 2.385137379169464, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.17589306831359863, "step": 5658 }, { "epoch": 0.25727272727272726, "grad_norm": 5.125, "grad_norm_var": 0.5554972330729167, "learning_rate": 0.0001, "loss": 5.6227, "loss/crossentropy": 2.283957600593567, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.17391127347946167, "step": 5660 }, { "epoch": 0.25736363636363635, "grad_norm": 5.09375, "grad_norm_var": 0.490478515625, "learning_rate": 0.0001, "loss": 5.596, "loss/crossentropy": 2.251600980758667, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.17408691719174385, "step": 5662 }, { "epoch": 0.25745454545454544, "grad_norm": 5.65625, "grad_norm_var": 0.08752848307291666, "learning_rate": 0.0001, "loss": 6.1614, "loss/crossentropy": 2.585922062397003, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.19602638110518456, "step": 5664 }, { "epoch": 0.2575454545454545, "grad_norm": 5.65625, "grad_norm_var": 0.08821207682291667, "learning_rate": 0.0001, "loss": 5.8587, "loss/crossentropy": 2.423900604248047, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.18391115590929985, "step": 5666 }, { "epoch": 0.25763636363636366, "grad_norm": 5.6875, "grad_norm_var": 0.08020426432291666, "learning_rate": 0.0001, "loss": 6.1887, "loss/crossentropy": 2.6827602982521057, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.19219738617539406, "step": 5668 }, { "epoch": 0.25772727272727275, "grad_norm": 6.0625, "grad_norm_var": 0.10089518229166666, "learning_rate": 0.0001, "loss": 5.6213, "loss/crossentropy": 2.214400291442871, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.18072684109210968, "step": 5670 }, { "epoch": 0.25781818181818184, "grad_norm": 7.3125, "grad_norm_var": 0.2606608072916667, "learning_rate": 0.0001, "loss": 5.8066, "loss/crossentropy": 2.4384723007678986, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1790046989917755, "step": 5672 }, { "epoch": 0.2579090909090909, "grad_norm": 5.875, "grad_norm_var": 0.3187459309895833, "learning_rate": 0.0001, "loss": 6.2297, "loss/crossentropy": 2.6709960103034973, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.19708527997136116, "step": 5674 }, { "epoch": 0.258, "grad_norm": 6.125, "grad_norm_var": 0.33160400390625, "learning_rate": 0.0001, "loss": 6.0761, "loss/crossentropy": 2.6640848517417908, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18182456865906715, "step": 5676 }, { "epoch": 0.2580909090909091, "grad_norm": 7.375, "grad_norm_var": 0.4423136393229167, "learning_rate": 0.0001, "loss": 5.8436, "loss/crossentropy": 2.4837626814842224, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.1736793853342533, "step": 5678 }, { "epoch": 0.2581818181818182, "grad_norm": 5.8125, "grad_norm_var": 0.4416015625, "learning_rate": 0.0001, "loss": 5.7544, "loss/crossentropy": 2.3217352628707886, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.18154532462358475, "step": 5680 }, { "epoch": 0.25827272727272726, "grad_norm": 5.5, "grad_norm_var": 0.4214680989583333, "learning_rate": 0.0001, "loss": 6.0011, "loss/crossentropy": 2.485438287258148, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.18945660814642906, "step": 5682 }, { "epoch": 0.25836363636363635, "grad_norm": 6.5, "grad_norm_var": 0.43482666015625, "learning_rate": 0.0001, "loss": 5.8989, "loss/crossentropy": 2.427987217903137, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.18459248170256615, "step": 5684 }, { "epoch": 0.25845454545454544, "grad_norm": 6.25, "grad_norm_var": 0.45050455729166666, "learning_rate": 0.0001, "loss": 5.6453, "loss/crossentropy": 2.3183189034461975, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.17449074238538742, "step": 5686 }, { "epoch": 0.2585454545454545, "grad_norm": 6.1875, "grad_norm_var": 0.33987223307291664, "learning_rate": 0.0001, "loss": 6.2536, "loss/crossentropy": 2.683152198791504, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19649500399827957, "step": 5688 }, { "epoch": 0.2586363636363636, "grad_norm": 5.40625, "grad_norm_var": 0.41799723307291664, "learning_rate": 0.0001, "loss": 6.1563, "loss/crossentropy": 2.605736255645752, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.19177787378430367, "step": 5690 }, { "epoch": 0.25872727272727275, "grad_norm": 5.3125, "grad_norm_var": 0.38450520833333335, "learning_rate": 0.0001, "loss": 5.9082, "loss/crossentropy": 2.4680967330932617, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.18229404464364052, "step": 5692 }, { "epoch": 0.25881818181818184, "grad_norm": 5.875, "grad_norm_var": 0.2521484375, "learning_rate": 0.0001, "loss": 5.8209, "loss/crossentropy": 2.3264302909374237, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.1867547146975994, "step": 5694 }, { "epoch": 0.2589090909090909, "grad_norm": 7.75, "grad_norm_var": 0.43345947265625, "learning_rate": 0.0001, "loss": 5.7167, "loss/crossentropy": 2.333291172981262, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.17994579672813416, "step": 5696 }, { "epoch": 0.259, "grad_norm": 5.71875, "grad_norm_var": 0.42382405598958334, "learning_rate": 0.0001, "loss": 5.9523, "loss/crossentropy": 2.5320907831192017, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.18361755833029747, "step": 5698 }, { "epoch": 0.2590909090909091, "grad_norm": 5.21875, "grad_norm_var": 0.443212890625, "learning_rate": 0.0001, "loss": 6.1663, "loss/crossentropy": 2.6570820212364197, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.18881703168153763, "step": 5700 }, { "epoch": 0.2591818181818182, "grad_norm": 6.5625, "grad_norm_var": 0.737109375, "learning_rate": 0.0001, "loss": 6.2586, "loss/crossentropy": 2.628726363182068, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.2022484503686428, "step": 5702 }, { "epoch": 0.25927272727272727, "grad_norm": 5.5, "grad_norm_var": 0.7696614583333333, "learning_rate": 0.0001, "loss": 5.5778, "loss/crossentropy": 2.239816278219223, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.1728624440729618, "step": 5704 }, { "epoch": 0.25936363636363635, "grad_norm": 5.90625, "grad_norm_var": 0.6737630208333333, "learning_rate": 0.0001, "loss": 6.3433, "loss/crossentropy": 2.711871922016144, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.20142460241913795, "step": 5706 }, { "epoch": 0.25945454545454544, "grad_norm": 5.375, "grad_norm_var": 0.6691691080729166, "learning_rate": 0.0001, "loss": 5.5855, "loss/crossentropy": 2.2702017426490784, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.17390839010477066, "step": 5708 }, { "epoch": 0.2595454545454545, "grad_norm": 5.53125, "grad_norm_var": 0.6872029622395833, "learning_rate": 0.0001, "loss": 5.8416, "loss/crossentropy": 2.416866958141327, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18388225883245468, "step": 5710 }, { "epoch": 0.2596363636363636, "grad_norm": 5.46875, "grad_norm_var": 0.5160807291666667, "learning_rate": 0.0001, "loss": 6.1989, "loss/crossentropy": 2.634715974330902, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19821325689554214, "step": 5712 }, { "epoch": 0.25972727272727275, "grad_norm": 8.5, "grad_norm_var": 0.8951131184895833, "learning_rate": 0.0001, "loss": 6.0298, "loss/crossentropy": 2.51674884557724, "loss/hidden": 1.650390625, "loss/jsd": 0.0, "loss/logits": 0.18626506254076958, "step": 5714 }, { "epoch": 0.25981818181818184, "grad_norm": 6.0625, "grad_norm_var": 0.826025390625, "learning_rate": 0.0001, "loss": 5.7869, "loss/crossentropy": 2.396436482667923, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.17967600002884865, "step": 5716 }, { "epoch": 0.2599090909090909, "grad_norm": 5.34375, "grad_norm_var": 0.5712076822916666, "learning_rate": 0.0001, "loss": 6.118, "loss/crossentropy": 2.6426037549972534, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.18992367014288902, "step": 5718 }, { "epoch": 0.26, "grad_norm": 5.90625, "grad_norm_var": 0.55533447265625, "learning_rate": 0.0001, "loss": 5.8882, "loss/crossentropy": 2.4155641198158264, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.18515240028500557, "step": 5720 }, { "epoch": 0.2600909090909091, "grad_norm": 6.09375, "grad_norm_var": 0.5553019205729167, "learning_rate": 0.0001, "loss": 5.8499, "loss/crossentropy": 2.430284321308136, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.1800507754087448, "step": 5722 }, { "epoch": 0.2601818181818182, "grad_norm": 6.875, "grad_norm_var": 0.5794270833333334, "learning_rate": 0.0001, "loss": 6.3038, "loss/crossentropy": 2.667774260044098, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.20168713480234146, "step": 5724 }, { "epoch": 0.26027272727272727, "grad_norm": 4.9375, "grad_norm_var": 0.6676920572916667, "learning_rate": 0.0001, "loss": 5.792, "loss/crossentropy": 2.4467819929122925, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.17631936445832253, "step": 5726 }, { "epoch": 0.26036363636363635, "grad_norm": 5.3125, "grad_norm_var": 0.6745402018229166, "learning_rate": 0.0001, "loss": 6.0369, "loss/crossentropy": 2.547436863183975, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.18898308649659157, "step": 5728 }, { "epoch": 0.26045454545454544, "grad_norm": 4.96875, "grad_norm_var": 0.32369791666666664, "learning_rate": 0.0001, "loss": 5.7243, "loss/crossentropy": 2.3699090480804443, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.1760638691484928, "step": 5730 }, { "epoch": 0.2605454545454545, "grad_norm": 5.125, "grad_norm_var": 0.33606363932291666, "learning_rate": 0.0001, "loss": 6.2248, "loss/crossentropy": 2.7249796390533447, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.19236671924591064, "step": 5732 }, { "epoch": 0.2606363636363636, "grad_norm": 12.125, "grad_norm_var": 2.8551920572916667, "learning_rate": 0.0001, "loss": 6.42, "loss/crossentropy": 2.735238552093506, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20519918203353882, "step": 5734 }, { "epoch": 0.26072727272727275, "grad_norm": 5.53125, "grad_norm_var": 2.8817545572916665, "learning_rate": 0.0001, "loss": 6.1914, "loss/crossentropy": 2.721900224685669, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.19050655514001846, "step": 5736 }, { "epoch": 0.26081818181818184, "grad_norm": 6.25, "grad_norm_var": 2.8916666666666666, "learning_rate": 0.0001, "loss": 6.3532, "loss/crossentropy": 2.7541086077690125, "loss/hidden": 1.634765625, "loss/jsd": 0.0, "loss/logits": 0.19643491879105568, "step": 5738 }, { "epoch": 0.2609090909090909, "grad_norm": 6.34375, "grad_norm_var": 2.858203125, "learning_rate": 0.0001, "loss": 6.2542, "loss/crossentropy": 2.6684443950653076, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1980263516306877, "step": 5740 }, { "epoch": 0.261, "grad_norm": 5.65625, "grad_norm_var": 2.7739542643229167, "learning_rate": 0.0001, "loss": 6.1601, "loss/crossentropy": 2.6390480399131775, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.18921254202723503, "step": 5742 }, { "epoch": 0.2610909090909091, "grad_norm": 7.03125, "grad_norm_var": 2.7784138997395833, "learning_rate": 0.0001, "loss": 5.7977, "loss/crossentropy": 2.3102816939353943, "loss/hidden": 1.634765625, "loss/jsd": 0.0, "loss/logits": 0.18526291474699974, "step": 5744 }, { "epoch": 0.2611818181818182, "grad_norm": 5.78125, "grad_norm_var": 2.632645670572917, "learning_rate": 0.0001, "loss": 6.3707, "loss/crossentropy": 2.796078324317932, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19808895885944366, "step": 5746 }, { "epoch": 0.26127272727272727, "grad_norm": 7.3125, "grad_norm_var": 2.5756510416666667, "learning_rate": 0.0001, "loss": 6.042, "loss/crossentropy": 2.488944947719574, "loss/hidden": 1.630859375, "loss/jsd": 0.0, "loss/logits": 0.19222060218453407, "step": 5748 }, { "epoch": 0.26136363636363635, "grad_norm": 5.3125, "grad_norm_var": 0.34685872395833334, "learning_rate": 0.0001, "loss": 5.874, "loss/crossentropy": 2.5575886368751526, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17539190873503685, "step": 5750 }, { "epoch": 0.26145454545454544, "grad_norm": 5.46875, "grad_norm_var": 0.3505167643229167, "learning_rate": 0.0001, "loss": 5.964, "loss/crossentropy": 2.5272799730300903, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18429899588227272, "step": 5752 }, { "epoch": 0.2615454545454545, "grad_norm": 5.125, "grad_norm_var": 0.39581705729166666, "learning_rate": 0.0001, "loss": 5.6795, "loss/crossentropy": 2.3628312051296234, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.17521803453564644, "step": 5754 }, { "epoch": 0.2616363636363636, "grad_norm": 5.34375, "grad_norm_var": 0.37952067057291666, "learning_rate": 0.0001, "loss": 5.9044, "loss/crossentropy": 2.45076584815979, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.18404023721814156, "step": 5756 }, { "epoch": 0.26172727272727275, "grad_norm": 5.625, "grad_norm_var": 0.370166015625, "learning_rate": 0.0001, "loss": 6.0226, "loss/crossentropy": 2.5538487434387207, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.18652309477329254, "step": 5758 }, { "epoch": 0.26181818181818184, "grad_norm": 5.53125, "grad_norm_var": 0.264697265625, "learning_rate": 0.0001, "loss": 6.1029, "loss/crossentropy": 2.651432156562805, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18850862607359886, "step": 5760 }, { "epoch": 0.2619090909090909, "grad_norm": 5.40625, "grad_norm_var": 0.48912353515625, "learning_rate": 0.0001, "loss": 5.418, "loss/crossentropy": 2.1558308005332947, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17113477736711502, "step": 5762 }, { "epoch": 0.262, "grad_norm": 5.34375, "grad_norm_var": 0.31484375, "learning_rate": 0.0001, "loss": 5.4567, "loss/crossentropy": 2.19326975941658, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.16911393404006958, "step": 5764 }, { "epoch": 0.2620909090909091, "grad_norm": 4.90625, "grad_norm_var": 0.33329671223958335, "learning_rate": 0.0001, "loss": 5.7784, "loss/crossentropy": 2.4634231328964233, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.17289985343813896, "step": 5766 }, { "epoch": 0.2621818181818182, "grad_norm": 5.28125, "grad_norm_var": 0.3512369791666667, "learning_rate": 0.0001, "loss": 5.7975, "loss/crossentropy": 2.3635264933109283, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.18323871493339539, "step": 5768 }, { "epoch": 0.26227272727272727, "grad_norm": 5.3125, "grad_norm_var": 0.33932291666666664, "learning_rate": 0.0001, "loss": 5.704, "loss/crossentropy": 2.412585437297821, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.17132912576198578, "step": 5770 }, { "epoch": 0.26236363636363635, "grad_norm": 5.28125, "grad_norm_var": 0.35230712890625, "learning_rate": 0.0001, "loss": 5.7991, "loss/crossentropy": 2.4060285091400146, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.17524230480194092, "step": 5772 }, { "epoch": 0.26245454545454544, "grad_norm": 5.65625, "grad_norm_var": 0.35349934895833335, "learning_rate": 0.0001, "loss": 5.814, "loss/crossentropy": 2.390952944755554, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.18058301508426666, "step": 5774 }, { "epoch": 0.2625454545454545, "grad_norm": 6.9375, "grad_norm_var": 0.4641927083333333, "learning_rate": 0.0001, "loss": 6.3749, "loss/crossentropy": 2.824941575527191, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.19699156656861305, "step": 5776 }, { "epoch": 0.2626363636363636, "grad_norm": 6.28125, "grad_norm_var": 0.24830729166666668, "learning_rate": 0.0001, "loss": 6.1039, "loss/crossentropy": 2.6176664233207703, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.18749132007360458, "step": 5778 }, { "epoch": 0.26272727272727275, "grad_norm": 5.21875, "grad_norm_var": 0.2560546875, "learning_rate": 0.0001, "loss": 6.2115, "loss/crossentropy": 2.764811635017395, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18842295184731483, "step": 5780 }, { "epoch": 0.26281818181818184, "grad_norm": 5.5625, "grad_norm_var": 0.22509358723958334, "learning_rate": 0.0001, "loss": 5.9011, "loss/crossentropy": 2.4715009331703186, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.18143512681126595, "step": 5782 }, { "epoch": 0.2629090909090909, "grad_norm": 5.4375, "grad_norm_var": 0.22779947916666668, "learning_rate": 0.0001, "loss": 5.7129, "loss/crossentropy": 2.4068915247917175, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.17493240162730217, "step": 5784 }, { "epoch": 0.263, "grad_norm": 5.375, "grad_norm_var": 0.2601521809895833, "learning_rate": 0.0001, "loss": 5.161, "loss/crossentropy": 1.9983368217945099, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.15844989567995071, "step": 5786 }, { "epoch": 0.2630909090909091, "grad_norm": 5.21875, "grad_norm_var": 0.24803059895833332, "learning_rate": 0.0001, "loss": 6.0075, "loss/crossentropy": 2.5710058212280273, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18739524111151695, "step": 5788 }, { "epoch": 0.2631818181818182, "grad_norm": 4.96875, "grad_norm_var": 0.27112223307291666, "learning_rate": 0.0001, "loss": 5.7953, "loss/crossentropy": 2.478832483291626, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17461451515555382, "step": 5790 }, { "epoch": 0.26327272727272727, "grad_norm": 5.21875, "grad_norm_var": 0.15208333333333332, "learning_rate": 0.0001, "loss": 5.8217, "loss/crossentropy": 2.4186983704566956, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.1791691668331623, "step": 5792 }, { "epoch": 0.26336363636363636, "grad_norm": 6.15625, "grad_norm_var": 0.123681640625, "learning_rate": 0.0001, "loss": 6.2129, "loss/crossentropy": 2.6742125153541565, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19449345767498016, "step": 5794 }, { "epoch": 0.26345454545454544, "grad_norm": 5.1875, "grad_norm_var": 0.15558268229166666, "learning_rate": 0.0001, "loss": 5.1689, "loss/crossentropy": 2.072534531354904, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.15358630940318108, "step": 5796 }, { "epoch": 0.2635454545454545, "grad_norm": 10.1875, "grad_norm_var": 1.6173828125, "learning_rate": 0.0001, "loss": 6.0269, "loss/crossentropy": 2.548341929912567, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.18652277067303658, "step": 5798 }, { "epoch": 0.2636363636363636, "grad_norm": 5.9375, "grad_norm_var": 1.5870402018229166, "learning_rate": 0.0001, "loss": 6.0013, "loss/crossentropy": 2.626779079437256, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.18178488686680794, "step": 5800 }, { "epoch": 0.26372727272727275, "grad_norm": 6.71875, "grad_norm_var": 10.536551920572917, "learning_rate": 0.0001, "loss": 6.5022, "loss/crossentropy": 2.6404505372047424, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.2252328135073185, "step": 5802 }, { "epoch": 0.26381818181818184, "grad_norm": 5.6875, "grad_norm_var": 10.568973795572917, "learning_rate": 0.0001, "loss": 5.8231, "loss/crossentropy": 2.4577096700668335, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17990122735500336, "step": 5804 }, { "epoch": 0.2639090909090909, "grad_norm": 5.78125, "grad_norm_var": 10.342431640625, "learning_rate": 0.0001, "loss": 6.273, "loss/crossentropy": 2.6336666345596313, "loss/hidden": 1.642578125, "loss/jsd": 0.0, "loss/logits": 0.19967635348439217, "step": 5806 }, { "epoch": 0.264, "grad_norm": 5.6875, "grad_norm_var": 10.287744140625, "learning_rate": 0.0001, "loss": 6.3399, "loss/crossentropy": 2.7321213483810425, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.20081447809934616, "step": 5808 }, { "epoch": 0.2640909090909091, "grad_norm": 5.5625, "grad_norm_var": 10.312202962239583, "learning_rate": 0.0001, "loss": 6.0088, "loss/crossentropy": 2.477350950241089, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.19200847670435905, "step": 5810 }, { "epoch": 0.2641818181818182, "grad_norm": 6.25, "grad_norm_var": 9.96724853515625, "learning_rate": 0.0001, "loss": 5.6345, "loss/crossentropy": 2.3010896146297455, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.17220701277256012, "step": 5812 }, { "epoch": 0.26427272727272727, "grad_norm": 5.28125, "grad_norm_var": 9.407291666666667, "learning_rate": 0.0001, "loss": 5.6825, "loss/crossentropy": 2.369584619998932, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.17250709235668182, "step": 5814 }, { "epoch": 0.26436363636363636, "grad_norm": 5.46875, "grad_norm_var": 9.496337890625, "learning_rate": 0.0001, "loss": 5.8418, "loss/crossentropy": 2.444036841392517, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18040172010660172, "step": 5816 }, { "epoch": 0.26445454545454544, "grad_norm": 5.0, "grad_norm_var": 0.13177083333333334, "learning_rate": 0.0001, "loss": 5.6135, "loss/crossentropy": 2.267820715904236, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.17226262763142586, "step": 5818 }, { "epoch": 0.26454545454545453, "grad_norm": 5.15625, "grad_norm_var": 0.12851155598958333, "learning_rate": 0.0001, "loss": 5.8861, "loss/crossentropy": 2.5458221435546875, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.1744585521519184, "step": 5820 }, { "epoch": 0.2646363636363636, "grad_norm": 6.71875, "grad_norm_var": 0.19347330729166667, "learning_rate": 0.0001, "loss": 6.167, "loss/crossentropy": 2.5960874557495117, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19850032776594162, "step": 5822 }, { "epoch": 0.2647272727272727, "grad_norm": 5.625, "grad_norm_var": 0.31721598307291665, "learning_rate": 0.0001, "loss": 6.3283, "loss/crossentropy": 2.676890552043915, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20263750478625298, "step": 5824 }, { "epoch": 0.26481818181818184, "grad_norm": 5.09375, "grad_norm_var": 0.34729410807291666, "learning_rate": 0.0001, "loss": 5.2575, "loss/crossentropy": 2.1405998170375824, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.15738866105675697, "step": 5826 }, { "epoch": 0.2649090909090909, "grad_norm": 5.5, "grad_norm_var": 0.31417643229166664, "learning_rate": 0.0001, "loss": 5.9687, "loss/crossentropy": 2.520846962928772, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.18756136670708656, "step": 5828 }, { "epoch": 0.265, "grad_norm": 4.84375, "grad_norm_var": 0.3263956705729167, "learning_rate": 0.0001, "loss": 5.2197, "loss/crossentropy": 2.1049848794937134, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.1550256870687008, "step": 5830 }, { "epoch": 0.2650909090909091, "grad_norm": 5.59375, "grad_norm_var": 0.52125244140625, "learning_rate": 0.0001, "loss": 6.0331, "loss/crossentropy": 2.5238559246063232, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19037621468305588, "step": 5832 }, { "epoch": 0.2651818181818182, "grad_norm": 5.75, "grad_norm_var": 0.492041015625, "learning_rate": 0.0001, "loss": 6.2292, "loss/crossentropy": 2.6549994945526123, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.1958950236439705, "step": 5834 }, { "epoch": 0.26527272727272727, "grad_norm": 5.875, "grad_norm_var": 0.48697509765625, "learning_rate": 0.0001, "loss": 6.1966, "loss/crossentropy": 2.7137425541877747, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.1889095902442932, "step": 5836 }, { "epoch": 0.26536363636363636, "grad_norm": 5.09375, "grad_norm_var": 0.44488525390625, "learning_rate": 0.0001, "loss": 5.8856, "loss/crossentropy": 2.490302085876465, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.178792055696249, "step": 5838 }, { "epoch": 0.26545454545454544, "grad_norm": 5.3125, "grad_norm_var": 0.3548177083333333, "learning_rate": 0.0001, "loss": 6.0812, "loss/crossentropy": 2.5732027888298035, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.19200579822063446, "step": 5840 }, { "epoch": 0.26554545454545453, "grad_norm": 5.4375, "grad_norm_var": 0.32890625, "learning_rate": 0.0001, "loss": 5.6914, "loss/crossentropy": 2.416718006134033, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17239023745059967, "step": 5842 }, { "epoch": 0.2656363636363636, "grad_norm": 5.21875, "grad_norm_var": 0.340234375, "learning_rate": 0.0001, "loss": 5.6898, "loss/crossentropy": 2.340934455394745, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17785639315843582, "step": 5844 }, { "epoch": 0.2657272727272727, "grad_norm": 5.6875, "grad_norm_var": 0.29302978515625, "learning_rate": 0.0001, "loss": 6.3885, "loss/crossentropy": 2.8066951036453247, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.19665642827749252, "step": 5846 }, { "epoch": 0.26581818181818184, "grad_norm": 5.53125, "grad_norm_var": 1.32984619140625, "learning_rate": 0.0001, "loss": 5.6597, "loss/crossentropy": 2.199356436729431, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.18275120854377747, "step": 5848 }, { "epoch": 0.26590909090909093, "grad_norm": 5.53125, "grad_norm_var": 1.40543212890625, "learning_rate": 0.0001, "loss": 5.3466, "loss/crossentropy": 2.120250165462494, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1652148813009262, "step": 5850 }, { "epoch": 0.266, "grad_norm": 6.0, "grad_norm_var": 1.3825358072916667, "learning_rate": 0.0001, "loss": 6.181, "loss/crossentropy": 2.593084454536438, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.19648809731006622, "step": 5852 }, { "epoch": 0.2660909090909091, "grad_norm": 5.875, "grad_norm_var": 1.3548014322916666, "learning_rate": 0.0001, "loss": 6.303, "loss/crossentropy": 2.831651508808136, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.18990708515048027, "step": 5854 }, { "epoch": 0.2661818181818182, "grad_norm": 5.0, "grad_norm_var": 1.3919108072916666, "learning_rate": 0.0001, "loss": 5.6069, "loss/crossentropy": 2.321613371372223, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.17169733718037605, "step": 5856 }, { "epoch": 0.26627272727272727, "grad_norm": 5.5, "grad_norm_var": 1.374072265625, "learning_rate": 0.0001, "loss": 5.92, "loss/crossentropy": 2.454506993293762, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19069154933094978, "step": 5858 }, { "epoch": 0.26636363636363636, "grad_norm": 5.5, "grad_norm_var": 1.35546875, "learning_rate": 0.0001, "loss": 5.753, "loss/crossentropy": 2.3365607261657715, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18304569646716118, "step": 5860 }, { "epoch": 0.26645454545454544, "grad_norm": 7.03125, "grad_norm_var": 1.4485026041666667, "learning_rate": 0.0001, "loss": 6.2206, "loss/crossentropy": 2.7206313610076904, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.19160063192248344, "step": 5862 }, { "epoch": 0.26654545454545453, "grad_norm": 5.8125, "grad_norm_var": 0.27068684895833334, "learning_rate": 0.0001, "loss": 6.2225, "loss/crossentropy": 2.632763624191284, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.2001807540655136, "step": 5864 }, { "epoch": 0.2666363636363636, "grad_norm": 6.34375, "grad_norm_var": 0.267578125, "learning_rate": 0.0001, "loss": 5.9499, "loss/crossentropy": 2.4622994661331177, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19172877818346024, "step": 5866 }, { "epoch": 0.2667272727272727, "grad_norm": 6.15625, "grad_norm_var": 0.9077107747395833, "learning_rate": 0.0001, "loss": 6.2957, "loss/crossentropy": 2.71923291683197, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.19846243038773537, "step": 5868 }, { "epoch": 0.26681818181818184, "grad_norm": 5.3125, "grad_norm_var": 0.91978759765625, "learning_rate": 0.0001, "loss": 5.784, "loss/crossentropy": 2.37562358379364, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.17891882732510567, "step": 5870 }, { "epoch": 0.26690909090909093, "grad_norm": 5.71875, "grad_norm_var": 0.8268229166666666, "learning_rate": 0.0001, "loss": 6.0256, "loss/crossentropy": 2.5164310932159424, "loss/hidden": 1.634765625, "loss/jsd": 0.0, "loss/logits": 0.18744471296668053, "step": 5872 }, { "epoch": 0.267, "grad_norm": 6.0, "grad_norm_var": 0.83765869140625, "learning_rate": 0.0001, "loss": 5.6314, "loss/crossentropy": 2.3278926014900208, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.17156046628952026, "step": 5874 }, { "epoch": 0.2670909090909091, "grad_norm": 5.78125, "grad_norm_var": 0.832421875, "learning_rate": 0.0001, "loss": 6.2818, "loss/crossentropy": 2.644488036632538, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.20436039194464684, "step": 5876 }, { "epoch": 0.2671818181818182, "grad_norm": 5.625, "grad_norm_var": 0.7625, "learning_rate": 0.0001, "loss": 6.1997, "loss/crossentropy": 2.7073062658309937, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.19084522128105164, "step": 5878 }, { "epoch": 0.2672727272727273, "grad_norm": 5.53125, "grad_norm_var": 0.7884724934895834, "learning_rate": 0.0001, "loss": 5.9458, "loss/crossentropy": 2.5175701379776, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.18402954190969467, "step": 5880 }, { "epoch": 0.26736363636363636, "grad_norm": 5.0625, "grad_norm_var": 0.7809855143229166, "learning_rate": 0.0001, "loss": 5.7678, "loss/crossentropy": 2.4323927760124207, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.17397336661815643, "step": 5882 }, { "epoch": 0.26745454545454544, "grad_norm": 5.25, "grad_norm_var": 0.13528645833333333, "learning_rate": 0.0001, "loss": 5.9834, "loss/crossentropy": 2.5477211475372314, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1861441545188427, "step": 5884 }, { "epoch": 0.26754545454545453, "grad_norm": 5.875, "grad_norm_var": 0.13157552083333332, "learning_rate": 0.0001, "loss": 5.8578, "loss/crossentropy": 2.4138625264167786, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.18267244845628738, "step": 5886 }, { "epoch": 0.2676363636363636, "grad_norm": 4.5, "grad_norm_var": 0.24257405598958334, "learning_rate": 0.0001, "loss": 5.3223, "loss/crossentropy": 2.2209398448467255, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1585710532963276, "step": 5888 }, { "epoch": 0.2677272727272727, "grad_norm": 5.15625, "grad_norm_var": 0.23123372395833333, "learning_rate": 0.0001, "loss": 5.8441, "loss/crossentropy": 2.4801344871520996, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.17878170683979988, "step": 5890 }, { "epoch": 0.26781818181818184, "grad_norm": 6.71875, "grad_norm_var": 0.27615559895833336, "learning_rate": 0.0001, "loss": 6.2088, "loss/crossentropy": 2.6457996368408203, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19575565680861473, "step": 5892 }, { "epoch": 0.26790909090909093, "grad_norm": 4.96875, "grad_norm_var": 0.25517171223958335, "learning_rate": 0.0001, "loss": 5.6157, "loss/crossentropy": 2.326825201511383, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.17166398465633392, "step": 5894 }, { "epoch": 0.268, "grad_norm": 6.5, "grad_norm_var": 0.3374837239583333, "learning_rate": 0.0001, "loss": 5.3193, "loss/crossentropy": 2.1064546704292297, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1646413803100586, "step": 5896 }, { "epoch": 0.2680909090909091, "grad_norm": 5.59375, "grad_norm_var": 0.32633056640625, "learning_rate": 0.0001, "loss": 5.9896, "loss/crossentropy": 2.5876590609550476, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18315789476037025, "step": 5898 }, { "epoch": 0.2681818181818182, "grad_norm": 5.40625, "grad_norm_var": 0.31571858723958335, "learning_rate": 0.0001, "loss": 5.6009, "loss/crossentropy": 2.2862594425678253, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.17228864133358002, "step": 5900 }, { "epoch": 0.2682727272727273, "grad_norm": 5.125, "grad_norm_var": 0.31053059895833335, "learning_rate": 0.0001, "loss": 5.7134, "loss/crossentropy": 2.378711462020874, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17683258280158043, "step": 5902 }, { "epoch": 0.26836363636363636, "grad_norm": 5.53125, "grad_norm_var": 0.23528238932291667, "learning_rate": 0.0001, "loss": 5.9033, "loss/crossentropy": 2.45073664188385, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.18608016520738602, "step": 5904 }, { "epoch": 0.26845454545454545, "grad_norm": 5.25, "grad_norm_var": 0.24166666666666667, "learning_rate": 0.0001, "loss": 6.3023, "loss/crossentropy": 2.7994688153266907, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19246752932667732, "step": 5906 }, { "epoch": 0.26854545454545453, "grad_norm": 5.90625, "grad_norm_var": 0.15061442057291666, "learning_rate": 0.0001, "loss": 5.8682, "loss/crossentropy": 2.470064640045166, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18277748301625252, "step": 5908 }, { "epoch": 0.2686363636363636, "grad_norm": 5.5, "grad_norm_var": 0.12812093098958333, "learning_rate": 0.0001, "loss": 5.9132, "loss/crossentropy": 2.4935377836227417, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1833757348358631, "step": 5910 }, { "epoch": 0.2687272727272727, "grad_norm": 5.625, "grad_norm_var": 0.06614176432291667, "learning_rate": 0.0001, "loss": 6.0899, "loss/crossentropy": 2.626103639602661, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.18544312939047813, "step": 5912 }, { "epoch": 0.26881818181818184, "grad_norm": 6.875, "grad_norm_var": 0.17727864583333333, "learning_rate": 0.0001, "loss": 6.0356, "loss/crossentropy": 2.564587712287903, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.18753557279706, "step": 5914 }, { "epoch": 0.26890909090909093, "grad_norm": 5.375, "grad_norm_var": 0.20325113932291666, "learning_rate": 0.0001, "loss": 5.8136, "loss/crossentropy": 2.4435157477855682, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.17782456055283546, "step": 5916 }, { "epoch": 0.269, "grad_norm": 5.75, "grad_norm_var": 0.18409830729166668, "learning_rate": 0.0001, "loss": 5.9958, "loss/crossentropy": 2.51665061712265, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.18560577183961868, "step": 5918 }, { "epoch": 0.2690909090909091, "grad_norm": 5.09375, "grad_norm_var": 0.21584879557291667, "learning_rate": 0.0001, "loss": 6.1898, "loss/crossentropy": 2.687604248523712, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19319000095129013, "step": 5920 }, { "epoch": 0.2691818181818182, "grad_norm": 5.0625, "grad_norm_var": 0.2609375, "learning_rate": 0.0001, "loss": 5.2928, "loss/crossentropy": 2.1473434567451477, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.15633806586265564, "step": 5922 }, { "epoch": 0.2692727272727273, "grad_norm": 5.375, "grad_norm_var": 0.2575154622395833, "learning_rate": 0.0001, "loss": 5.8808, "loss/crossentropy": 2.5464372038841248, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.17738382518291473, "step": 5924 }, { "epoch": 0.26936363636363636, "grad_norm": 5.1875, "grad_norm_var": 0.2706990559895833, "learning_rate": 0.0001, "loss": 5.9785, "loss/crossentropy": 2.555889666080475, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18601392582058907, "step": 5926 }, { "epoch": 0.26945454545454545, "grad_norm": 5.0, "grad_norm_var": 0.29263916015625, "learning_rate": 0.0001, "loss": 5.6962, "loss/crossentropy": 2.4745389819145203, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.16669804230332375, "step": 5928 }, { "epoch": 0.26954545454545453, "grad_norm": 5.6875, "grad_norm_var": 0.17779947916666666, "learning_rate": 0.0001, "loss": 5.9497, "loss/crossentropy": 2.5573790669441223, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.18200130015611649, "step": 5930 }, { "epoch": 0.2696363636363636, "grad_norm": 5.09375, "grad_norm_var": 0.13088785807291667, "learning_rate": 0.0001, "loss": 5.8402, "loss/crossentropy": 2.4676170349121094, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.17905192449688911, "step": 5932 }, { "epoch": 0.2697272727272727, "grad_norm": 6.21875, "grad_norm_var": 0.149462890625, "learning_rate": 0.0001, "loss": 5.7358, "loss/crossentropy": 2.3896324932575226, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.1762148141860962, "step": 5934 }, { "epoch": 0.26981818181818185, "grad_norm": 4.8125, "grad_norm_var": 0.14752197265625, "learning_rate": 0.0001, "loss": 5.6757, "loss/crossentropy": 2.3692748844623566, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.17283453047275543, "step": 5936 }, { "epoch": 0.26990909090909093, "grad_norm": 5.46875, "grad_norm_var": 0.1435546875, "learning_rate": 0.0001, "loss": 5.7312, "loss/crossentropy": 2.404398560523987, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.1727181188762188, "step": 5938 }, { "epoch": 0.27, "grad_norm": 5.03125, "grad_norm_var": 0.14803059895833334, "learning_rate": 0.0001, "loss": 5.5637, "loss/crossentropy": 2.314653307199478, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.16885317116975784, "step": 5940 }, { "epoch": 0.2700909090909091, "grad_norm": 5.375, "grad_norm_var": 0.14191080729166666, "learning_rate": 0.0001, "loss": 6.0232, "loss/crossentropy": 2.627137839794159, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.18277360498905182, "step": 5942 }, { "epoch": 0.2701818181818182, "grad_norm": 5.34375, "grad_norm_var": 0.181884765625, "learning_rate": 0.0001, "loss": 6.2408, "loss/crossentropy": 2.7366049885749817, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19338400289416313, "step": 5944 }, { "epoch": 0.2702727272727273, "grad_norm": 6.03125, "grad_norm_var": 0.36847330729166666, "learning_rate": 0.0001, "loss": 6.8269, "loss/crossentropy": 2.8966423869132996, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.23013928532600403, "step": 5946 }, { "epoch": 0.27036363636363636, "grad_norm": 5.28125, "grad_norm_var": 0.3630859375, "learning_rate": 0.0001, "loss": 5.6833, "loss/crossentropy": 2.3163431882858276, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.17849260196089745, "step": 5948 }, { "epoch": 0.27045454545454545, "grad_norm": 4.90625, "grad_norm_var": 0.34293212890625, "learning_rate": 0.0001, "loss": 5.3974, "loss/crossentropy": 2.2641043663024902, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.15844785422086716, "step": 5950 }, { "epoch": 0.27054545454545453, "grad_norm": 5.59375, "grad_norm_var": 0.30299479166666665, "learning_rate": 0.0001, "loss": 5.9307, "loss/crossentropy": 2.5523602962493896, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.18060670420527458, "step": 5952 }, { "epoch": 0.2706363636363636, "grad_norm": 5.90625, "grad_norm_var": 0.31737874348958334, "learning_rate": 0.0001, "loss": 5.7396, "loss/crossentropy": 2.4214794635772705, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17868415266275406, "step": 5954 }, { "epoch": 0.2707272727272727, "grad_norm": 5.40625, "grad_norm_var": 0.28854166666666664, "learning_rate": 0.0001, "loss": 6.0601, "loss/crossentropy": 2.6639710068702698, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1829753741621971, "step": 5956 }, { "epoch": 0.2708181818181818, "grad_norm": 5.15625, "grad_norm_var": 0.29498697916666666, "learning_rate": 0.0001, "loss": 5.9412, "loss/crossentropy": 2.5470606684684753, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.182970330119133, "step": 5958 }, { "epoch": 0.27090909090909093, "grad_norm": 5.59375, "grad_norm_var": 0.2671875, "learning_rate": 0.0001, "loss": 6.3356, "loss/crossentropy": 2.8177576661109924, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.19338176399469376, "step": 5960 }, { "epoch": 0.271, "grad_norm": 5.53125, "grad_norm_var": 0.12967122395833333, "learning_rate": 0.0001, "loss": 6.3229, "loss/crossentropy": 2.8112851977348328, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.19158675149083138, "step": 5962 }, { "epoch": 0.2710909090909091, "grad_norm": 4.90625, "grad_norm_var": 0.14661458333333333, "learning_rate": 0.0001, "loss": 5.8103, "loss/crossentropy": 2.5013681650161743, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1750335767865181, "step": 5964 }, { "epoch": 0.2711818181818182, "grad_norm": 5.65625, "grad_norm_var": 0.15128580729166666, "learning_rate": 0.0001, "loss": 5.8165, "loss/crossentropy": 2.376968801021576, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.18262919411063194, "step": 5966 }, { "epoch": 0.2712727272727273, "grad_norm": 5.28125, "grad_norm_var": 0.18557535807291667, "learning_rate": 0.0001, "loss": 5.7816, "loss/crossentropy": 2.5214271545410156, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1742626167833805, "step": 5968 }, { "epoch": 0.27136363636363636, "grad_norm": 5.28125, "grad_norm_var": 0.16495768229166666, "learning_rate": 0.0001, "loss": 5.743, "loss/crossentropy": 2.4436203241348267, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.174269437789917, "step": 5970 }, { "epoch": 0.27145454545454545, "grad_norm": 6.75, "grad_norm_var": 0.7399576822916667, "learning_rate": 0.0001, "loss": 5.8334, "loss/crossentropy": 2.3972115218639374, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.18131515756249428, "step": 5972 }, { "epoch": 0.27154545454545453, "grad_norm": 6.03125, "grad_norm_var": 0.7332682291666667, "learning_rate": 0.0001, "loss": 6.2011, "loss/crossentropy": 2.6347209215164185, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19726430997252464, "step": 5974 }, { "epoch": 0.2716363636363636, "grad_norm": 5.4375, "grad_norm_var": 0.7372233072916666, "learning_rate": 0.0001, "loss": 5.5598, "loss/crossentropy": 2.2042457461357117, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.17227914556860924, "step": 5976 }, { "epoch": 0.2717272727272727, "grad_norm": 5.53125, "grad_norm_var": 0.7321451822916667, "learning_rate": 0.0001, "loss": 5.8911, "loss/crossentropy": 2.48200660943985, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.1825074665248394, "step": 5978 }, { "epoch": 0.2718181818181818, "grad_norm": 5.21875, "grad_norm_var": 0.69224853515625, "learning_rate": 0.0001, "loss": 5.7312, "loss/crossentropy": 2.2654300928115845, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1860276274383068, "step": 5980 }, { "epoch": 0.27190909090909093, "grad_norm": 5.8125, "grad_norm_var": 0.7006510416666667, "learning_rate": 0.0001, "loss": 5.869, "loss/crossentropy": 2.444200873374939, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18389015272259712, "step": 5982 }, { "epoch": 0.272, "grad_norm": 5.5, "grad_norm_var": 0.7587239583333333, "learning_rate": 0.0001, "loss": 5.8628, "loss/crossentropy": 2.4450870156288147, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.18571989610791206, "step": 5984 }, { "epoch": 0.2720909090909091, "grad_norm": 5.59375, "grad_norm_var": 0.7892862955729166, "learning_rate": 0.0001, "loss": 5.2634, "loss/crossentropy": 2.132382094860077, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.15528910979628563, "step": 5986 }, { "epoch": 0.2721818181818182, "grad_norm": 5.28125, "grad_norm_var": 0.32076416015625, "learning_rate": 0.0001, "loss": 5.8947, "loss/crossentropy": 2.4580007791519165, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1819470152258873, "step": 5988 }, { "epoch": 0.2722727272727273, "grad_norm": 5.65625, "grad_norm_var": 0.283056640625, "learning_rate": 0.0001, "loss": 5.7228, "loss/crossentropy": 2.435745060443878, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17284729704260826, "step": 5990 }, { "epoch": 0.27236363636363636, "grad_norm": 5.4375, "grad_norm_var": 0.2911295572916667, "learning_rate": 0.0001, "loss": 6.1904, "loss/crossentropy": 2.719989240169525, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.1866879090666771, "step": 5992 }, { "epoch": 0.27245454545454545, "grad_norm": 5.6875, "grad_norm_var": 1.19058837890625, "learning_rate": 0.0001, "loss": 6.1143, "loss/crossentropy": 2.510596215724945, "loss/hidden": 1.634765625, "loss/jsd": 0.0, "loss/logits": 0.19689403101801872, "step": 5994 }, { "epoch": 0.27254545454545454, "grad_norm": 5.46875, "grad_norm_var": 1.1821451822916667, "learning_rate": 0.0001, "loss": 5.683, "loss/crossentropy": 2.3895859718322754, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1715291105210781, "step": 5996 }, { "epoch": 0.2726363636363636, "grad_norm": 4.96875, "grad_norm_var": 1.1926920572916666, "learning_rate": 0.0001, "loss": 5.6952, "loss/crossentropy": 2.40893018245697, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17198546230793, "step": 5998 }, { "epoch": 0.2727272727272727, "grad_norm": 4.9375, "grad_norm_var": 1.0717081705729166, "learning_rate": 0.0001, "loss": 5.5485, "loss/crossentropy": 2.2892919778823853, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.16732675209641457, "step": 6000 }, { "epoch": 0.2728181818181818, "grad_norm": 5.3125, "grad_norm_var": 1.0678019205729166, "learning_rate": 0.0001, "loss": 5.7401, "loss/crossentropy": 2.391613006591797, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.17645404115319252, "step": 6002 }, { "epoch": 0.27290909090909093, "grad_norm": 6.15625, "grad_norm_var": 1.0794108072916666, "learning_rate": 0.0001, "loss": 6.207, "loss/crossentropy": 2.723789691925049, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.1907070353627205, "step": 6004 }, { "epoch": 0.273, "grad_norm": 5.84375, "grad_norm_var": 1.0662394205729167, "learning_rate": 0.0001, "loss": 6.1563, "loss/crossentropy": 2.7030221819877625, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18869176879525185, "step": 6006 }, { "epoch": 0.2730909090909091, "grad_norm": 5.71875, "grad_norm_var": 1.0423014322916666, "learning_rate": 0.0001, "loss": 5.9921, "loss/crossentropy": 2.548328459262848, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.18402071297168732, "step": 6008 }, { "epoch": 0.2731818181818182, "grad_norm": 7.375, "grad_norm_var": 0.35657145182291666, "learning_rate": 0.0001, "loss": 6.4012, "loss/crossentropy": 2.8169460892677307, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.19592910259962082, "step": 6010 }, { "epoch": 0.2732727272727273, "grad_norm": 5.40625, "grad_norm_var": 0.35689697265625, "learning_rate": 0.0001, "loss": 6.1368, "loss/crossentropy": 2.657585918903351, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1912849210202694, "step": 6012 }, { "epoch": 0.27336363636363636, "grad_norm": 5.9375, "grad_norm_var": 0.32623291015625, "learning_rate": 0.0001, "loss": 6.4059, "loss/crossentropy": 2.770575523376465, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.2047477699816227, "step": 6014 }, { "epoch": 0.27345454545454545, "grad_norm": 5.21875, "grad_norm_var": 0.3087890625, "learning_rate": 0.0001, "loss": 5.6833, "loss/crossentropy": 2.3847475051879883, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.17458340525627136, "step": 6016 }, { "epoch": 0.27354545454545454, "grad_norm": 6.125, "grad_norm_var": 0.2548014322916667, "learning_rate": 0.0001, "loss": 6.2725, "loss/crossentropy": 2.739912748336792, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.19524770602583885, "step": 6018 }, { "epoch": 0.2736363636363636, "grad_norm": 5.09375, "grad_norm_var": 0.2798828125, "learning_rate": 0.0001, "loss": 5.8217, "loss/crossentropy": 2.4833412170410156, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.17621878162026405, "step": 6020 }, { "epoch": 0.2737272727272727, "grad_norm": 5.90625, "grad_norm_var": 0.27916259765625, "learning_rate": 0.0001, "loss": 5.8523, "loss/crossentropy": 2.4926061034202576, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.17503278329968452, "step": 6022 }, { "epoch": 0.2738181818181818, "grad_norm": 5.6875, "grad_norm_var": 0.29185791015625, "learning_rate": 0.0001, "loss": 6.1069, "loss/crossentropy": 2.6132779717445374, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19076722115278244, "step": 6024 }, { "epoch": 0.27390909090909094, "grad_norm": 5.65625, "grad_norm_var": 0.10406494140625, "learning_rate": 0.0001, "loss": 6.076, "loss/crossentropy": 2.568857967853546, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19133853539824486, "step": 6026 }, { "epoch": 0.274, "grad_norm": 6.6875, "grad_norm_var": 0.17271728515625, "learning_rate": 0.0001, "loss": 5.9159, "loss/crossentropy": 2.5186655521392822, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.18015291169285774, "step": 6028 }, { "epoch": 0.2740909090909091, "grad_norm": 5.3125, "grad_norm_var": 0.19205729166666666, "learning_rate": 0.0001, "loss": 6.0153, "loss/crossentropy": 2.5747812390327454, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18584664911031723, "step": 6030 }, { "epoch": 0.2741818181818182, "grad_norm": 4.875, "grad_norm_var": 0.22450764973958334, "learning_rate": 0.0001, "loss": 5.9817, "loss/crossentropy": 2.530528038740158, "loss/hidden": 1.630859375, "loss/jsd": 0.0, "loss/logits": 0.18203015252947807, "step": 6032 }, { "epoch": 0.2742727272727273, "grad_norm": 5.78125, "grad_norm_var": 0.21300455729166667, "learning_rate": 0.0001, "loss": 6.2069, "loss/crossentropy": 2.5912572145462036, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.2000434286892414, "step": 6034 }, { "epoch": 0.27436363636363637, "grad_norm": 5.21875, "grad_norm_var": 0.20562744140625, "learning_rate": 0.0001, "loss": 6.1055, "loss/crossentropy": 2.7393038272857666, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.18134642019867897, "step": 6036 }, { "epoch": 0.27445454545454545, "grad_norm": 5.875, "grad_norm_var": 0.19387613932291667, "learning_rate": 0.0001, "loss": 5.8155, "loss/crossentropy": 2.433807820081711, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.1801576167345047, "step": 6038 }, { "epoch": 0.27454545454545454, "grad_norm": 6.8125, "grad_norm_var": 0.2850260416666667, "learning_rate": 0.0001, "loss": 6.1926, "loss/crossentropy": 2.7182093262672424, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.18942921236157417, "step": 6040 }, { "epoch": 0.2746363636363636, "grad_norm": 5.34375, "grad_norm_var": 0.3094401041666667, "learning_rate": 0.0001, "loss": 5.9674, "loss/crossentropy": 2.594225525856018, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18107236921787262, "step": 6042 }, { "epoch": 0.2747272727272727, "grad_norm": 5.0625, "grad_norm_var": 0.2545572916666667, "learning_rate": 0.0001, "loss": 5.7716, "loss/crossentropy": 2.461612284183502, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.17220667377114296, "step": 6044 }, { "epoch": 0.2748181818181818, "grad_norm": 4.875, "grad_norm_var": 0.257275390625, "learning_rate": 0.0001, "loss": 5.8756, "loss/crossentropy": 2.525066316127777, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.1778286062180996, "step": 6046 }, { "epoch": 0.27490909090909094, "grad_norm": 5.3125, "grad_norm_var": 0.229150390625, "learning_rate": 0.0001, "loss": 5.6714, "loss/crossentropy": 2.423026889562607, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.17113003507256508, "step": 6048 }, { "epoch": 0.275, "grad_norm": 5.625, "grad_norm_var": 0.21627197265625, "learning_rate": 0.0001, "loss": 6.2112, "loss/crossentropy": 2.6878910660743713, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1952950917184353, "step": 6050 }, { "epoch": 0.2750909090909091, "grad_norm": 6.53125, "grad_norm_var": 0.28072509765625, "learning_rate": 0.0001, "loss": 6.1352, "loss/crossentropy": 2.577985167503357, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.19322508201003075, "step": 6052 }, { "epoch": 0.2751818181818182, "grad_norm": 5.71875, "grad_norm_var": 0.277734375, "learning_rate": 0.0001, "loss": 5.718, "loss/crossentropy": 2.4269115924835205, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.1714891754090786, "step": 6054 }, { "epoch": 0.2752727272727273, "grad_norm": 6.4375, "grad_norm_var": 0.235791015625, "learning_rate": 0.0001, "loss": 6.5385, "loss/crossentropy": 2.756730854511261, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.2164541892707348, "step": 6056 }, { "epoch": 0.27536363636363637, "grad_norm": 4.65625, "grad_norm_var": 0.28201497395833336, "learning_rate": 0.0001, "loss": 5.7073, "loss/crossentropy": 2.44374418258667, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17186209186911583, "step": 6058 }, { "epoch": 0.27545454545454545, "grad_norm": 6.125, "grad_norm_var": 0.2860310872395833, "learning_rate": 0.0001, "loss": 6.2244, "loss/crossentropy": 2.712331533432007, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19105390459299088, "step": 6060 }, { "epoch": 0.27554545454545454, "grad_norm": 5.8125, "grad_norm_var": 0.26428629557291666, "learning_rate": 0.0001, "loss": 6.2568, "loss/crossentropy": 2.7552881836891174, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1923416629433632, "step": 6062 }, { "epoch": 0.2756363636363636, "grad_norm": 5.90625, "grad_norm_var": 0.25146077473958334, "learning_rate": 0.0001, "loss": 5.6535, "loss/crossentropy": 2.318437695503235, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17686838656663895, "step": 6064 }, { "epoch": 0.2757272727272727, "grad_norm": 6.5, "grad_norm_var": 0.29488525390625, "learning_rate": 0.0001, "loss": 6.0573, "loss/crossentropy": 2.633773982524872, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18298063054680824, "step": 6066 }, { "epoch": 0.2758181818181818, "grad_norm": 6.0625, "grad_norm_var": 0.28240559895833334, "learning_rate": 0.0001, "loss": 6.3486, "loss/crossentropy": 2.7400839924812317, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.19932401180267334, "step": 6068 }, { "epoch": 0.2759090909090909, "grad_norm": 5.75, "grad_norm_var": 0.2703125, "learning_rate": 0.0001, "loss": 6.1544, "loss/crossentropy": 2.6082104444503784, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.1886054277420044, "step": 6070 }, { "epoch": 0.276, "grad_norm": 5.71875, "grad_norm_var": 0.25198160807291664, "learning_rate": 0.0001, "loss": 6.1121, "loss/crossentropy": 2.6512269377708435, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.18886274471879005, "step": 6072 }, { "epoch": 0.2760909090909091, "grad_norm": 5.34375, "grad_norm_var": 0.17265218098958332, "learning_rate": 0.0001, "loss": 6.3655, "loss/crossentropy": 2.8783657550811768, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.19148647412657738, "step": 6074 }, { "epoch": 0.2761818181818182, "grad_norm": 5.9375, "grad_norm_var": 0.17525634765625, "learning_rate": 0.0001, "loss": 6.0463, "loss/crossentropy": 2.514492630958557, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19263773411512375, "step": 6076 }, { "epoch": 0.2762727272727273, "grad_norm": 5.78125, "grad_norm_var": 0.17486979166666666, "learning_rate": 0.0001, "loss": 6.0681, "loss/crossentropy": 2.596966862678528, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.18910795077681541, "step": 6078 }, { "epoch": 0.27636363636363637, "grad_norm": 5.25, "grad_norm_var": 0.17105712890625, "learning_rate": 0.0001, "loss": 5.8967, "loss/crossentropy": 2.5254527926445007, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.18028998002409935, "step": 6080 }, { "epoch": 0.27645454545454545, "grad_norm": 5.65625, "grad_norm_var": 0.14685872395833333, "learning_rate": 0.0001, "loss": 6.1113, "loss/crossentropy": 2.6530045866966248, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.18938035890460014, "step": 6082 }, { "epoch": 0.27654545454545454, "grad_norm": 5.0, "grad_norm_var": 0.12659098307291666, "learning_rate": 0.0001, "loss": 5.7883, "loss/crossentropy": 2.4064712524414062, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.18017257004976273, "step": 6084 }, { "epoch": 0.2766363636363636, "grad_norm": 10.25, "grad_norm_var": 1.5012858072916666, "learning_rate": 0.0001, "loss": 6.3569, "loss/crossentropy": 2.812319815158844, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.1956687979400158, "step": 6086 }, { "epoch": 0.2767272727272727, "grad_norm": 5.90625, "grad_norm_var": 1.56295166015625, "learning_rate": 0.0001, "loss": 6.1747, "loss/crossentropy": 2.5682713985443115, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.19989952072501183, "step": 6088 }, { "epoch": 0.2768181818181818, "grad_norm": 5.9375, "grad_norm_var": 1.5362630208333334, "learning_rate": 0.0001, "loss": 6.2891, "loss/crossentropy": 2.658204674720764, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.19980484247207642, "step": 6090 }, { "epoch": 0.2769090909090909, "grad_norm": 5.40625, "grad_norm_var": 1.5379191080729167, "learning_rate": 0.0001, "loss": 5.6636, "loss/crossentropy": 2.4608142971992493, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.16696316003799438, "step": 6092 }, { "epoch": 0.277, "grad_norm": 5.3125, "grad_norm_var": 1.5279947916666667, "learning_rate": 0.0001, "loss": 5.8237, "loss/crossentropy": 2.4023757576942444, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18354317545890808, "step": 6094 }, { "epoch": 0.2770909090909091, "grad_norm": 5.71875, "grad_norm_var": 1.4739420572916666, "learning_rate": 0.0001, "loss": 6.241, "loss/crossentropy": 2.719673454761505, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19392774254083633, "step": 6096 }, { "epoch": 0.2771818181818182, "grad_norm": 5.34375, "grad_norm_var": 1.5003743489583334, "learning_rate": 0.0001, "loss": 5.8878, "loss/crossentropy": 2.4498439729213715, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.18383906781673431, "step": 6098 }, { "epoch": 0.2772727272727273, "grad_norm": 5.375, "grad_norm_var": 1.4664347330729166, "learning_rate": 0.0001, "loss": 5.8867, "loss/crossentropy": 2.5980716943740845, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17495695501565933, "step": 6100 }, { "epoch": 0.27736363636363637, "grad_norm": 6.0625, "grad_norm_var": 0.27161051432291666, "learning_rate": 0.0001, "loss": 6.2285, "loss/crossentropy": 2.6834587454795837, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19591142609715462, "step": 6102 }, { "epoch": 0.27745454545454545, "grad_norm": 4.96875, "grad_norm_var": 0.15357666015625, "learning_rate": 0.0001, "loss": 5.7634, "loss/crossentropy": 2.396860182285309, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.18060123175382614, "step": 6104 }, { "epoch": 0.27754545454545454, "grad_norm": 4.9375, "grad_norm_var": 0.12828369140625, "learning_rate": 0.0001, "loss": 5.9247, "loss/crossentropy": 2.60214626789093, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17757237702608109, "step": 6106 }, { "epoch": 0.2776363636363636, "grad_norm": 6.34375, "grad_norm_var": 0.174609375, "learning_rate": 0.0001, "loss": 5.6916, "loss/crossentropy": 2.3209517002105713, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.17749344184994698, "step": 6108 }, { "epoch": 0.2777272727272727, "grad_norm": 5.96875, "grad_norm_var": 0.18804931640625, "learning_rate": 0.0001, "loss": 6.1263, "loss/crossentropy": 2.594844162464142, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.19318616390228271, "step": 6110 }, { "epoch": 0.2778181818181818, "grad_norm": 5.40625, "grad_norm_var": 0.18977864583333334, "learning_rate": 0.0001, "loss": 5.6478, "loss/crossentropy": 2.3444472551345825, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1736970879137516, "step": 6112 }, { "epoch": 0.2779090909090909, "grad_norm": 5.25, "grad_norm_var": 0.15519205729166666, "learning_rate": 0.0001, "loss": 6.1346, "loss/crossentropy": 2.7090682983398438, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.1845444180071354, "step": 6114 }, { "epoch": 0.278, "grad_norm": 4.84375, "grad_norm_var": 0.18176676432291666, "learning_rate": 0.0001, "loss": 5.5221, "loss/crossentropy": 2.349319577217102, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16356723010540009, "step": 6116 }, { "epoch": 0.2780909090909091, "grad_norm": 5.15625, "grad_norm_var": 0.15794270833333332, "learning_rate": 0.0001, "loss": 5.6311, "loss/crossentropy": 2.3619819581508636, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.1685164012014866, "step": 6118 }, { "epoch": 0.2781818181818182, "grad_norm": 5.21875, "grad_norm_var": 0.14894205729166668, "learning_rate": 0.0001, "loss": 5.8197, "loss/crossentropy": 2.520208179950714, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1744765006005764, "step": 6120 }, { "epoch": 0.2782727272727273, "grad_norm": 5.625, "grad_norm_var": 0.13580322265625, "learning_rate": 0.0001, "loss": 5.8804, "loss/crossentropy": 2.4582772850990295, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.18537615612149239, "step": 6122 }, { "epoch": 0.27836363636363637, "grad_norm": 5.875, "grad_norm_var": 0.932666015625, "learning_rate": 0.0001, "loss": 5.9147, "loss/crossentropy": 2.49913489818573, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.18101033940911293, "step": 6124 }, { "epoch": 0.27845454545454545, "grad_norm": 5.96875, "grad_norm_var": 0.9279296875, "learning_rate": 0.0001, "loss": 5.9702, "loss/crossentropy": 2.5169333815574646, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.18849504739046097, "step": 6126 }, { "epoch": 0.27854545454545454, "grad_norm": 9.25, "grad_norm_var": 1.73013916015625, "learning_rate": 0.0001, "loss": 5.7969, "loss/crossentropy": 2.3692610263824463, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.1833891198039055, "step": 6128 }, { "epoch": 0.2786363636363636, "grad_norm": 5.1875, "grad_norm_var": 1.7167928059895834, "learning_rate": 0.0001, "loss": 5.5125, "loss/crossentropy": 2.2305076122283936, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.16667266190052032, "step": 6130 }, { "epoch": 0.2787272727272727, "grad_norm": 5.59375, "grad_norm_var": 1.6202962239583334, "learning_rate": 0.0001, "loss": 6.2232, "loss/crossentropy": 2.69929438829422, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19028090685606003, "step": 6132 }, { "epoch": 0.2788181818181818, "grad_norm": 5.78125, "grad_norm_var": 1.68316650390625, "learning_rate": 0.0001, "loss": 5.6643, "loss/crossentropy": 2.4340811669826508, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.16793977469205856, "step": 6134 }, { "epoch": 0.2789090909090909, "grad_norm": 4.59375, "grad_norm_var": 1.7482706705729167, "learning_rate": 0.0001, "loss": 5.8288, "loss/crossentropy": 2.4233900904655457, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18546582758426666, "step": 6136 }, { "epoch": 0.279, "grad_norm": 5.1875, "grad_norm_var": 1.7761027018229167, "learning_rate": 0.0001, "loss": 5.6914, "loss/crossentropy": 2.3561409413814545, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.17415138706564903, "step": 6138 }, { "epoch": 0.2790909090909091, "grad_norm": 5.875, "grad_norm_var": 1.1239420572916667, "learning_rate": 0.0001, "loss": 5.8831, "loss/crossentropy": 2.5121955275535583, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.17966564372181892, "step": 6140 }, { "epoch": 0.2791818181818182, "grad_norm": 4.90625, "grad_norm_var": 1.2110677083333334, "learning_rate": 0.0001, "loss": 5.5896, "loss/crossentropy": 2.3619688749313354, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.17100363969802856, "step": 6142 }, { "epoch": 0.2792727272727273, "grad_norm": 4.90625, "grad_norm_var": 0.32081705729166665, "learning_rate": 0.0001, "loss": 6.0562, "loss/crossentropy": 2.631604492664337, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18816117569804192, "step": 6144 }, { "epoch": 0.27936363636363637, "grad_norm": 4.875, "grad_norm_var": 0.335546875, "learning_rate": 0.0001, "loss": 5.4616, "loss/crossentropy": 2.262197196483612, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.16545017436146736, "step": 6146 }, { "epoch": 0.27945454545454546, "grad_norm": 5.4375, "grad_norm_var": 0.40167643229166666, "learning_rate": 0.0001, "loss": 6.269, "loss/crossentropy": 2.6935577392578125, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19895176589488983, "step": 6148 }, { "epoch": 0.27954545454545454, "grad_norm": 5.375, "grad_norm_var": 0.35519205729166664, "learning_rate": 0.0001, "loss": 5.9606, "loss/crossentropy": 2.5077319145202637, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18591045960783958, "step": 6150 }, { "epoch": 0.2796363636363636, "grad_norm": 5.96875, "grad_norm_var": 0.3350911458333333, "learning_rate": 0.0001, "loss": 5.7146, "loss/crossentropy": 2.406820625066757, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1725730448961258, "step": 6152 }, { "epoch": 0.2797272727272727, "grad_norm": 6.03125, "grad_norm_var": 0.3997395833333333, "learning_rate": 0.0001, "loss": 5.7859, "loss/crossentropy": 2.4849827885627747, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17422958835959435, "step": 6154 }, { "epoch": 0.2798181818181818, "grad_norm": 5.84375, "grad_norm_var": 0.28118489583333334, "learning_rate": 0.0001, "loss": 6.042, "loss/crossentropy": 2.5945178270339966, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18654528260231018, "step": 6156 }, { "epoch": 0.2799090909090909, "grad_norm": 5.53125, "grad_norm_var": 0.24455973307291667, "learning_rate": 0.0001, "loss": 5.9812, "loss/crossentropy": 2.551450192928314, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18438109010457993, "step": 6158 }, { "epoch": 0.28, "grad_norm": 5.75, "grad_norm_var": 0.22349853515625, "learning_rate": 0.0001, "loss": 5.6779, "loss/crossentropy": 2.3795100450515747, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.1741701029241085, "step": 6160 }, { "epoch": 0.2800909090909091, "grad_norm": 5.5625, "grad_norm_var": 0.19373372395833333, "learning_rate": 0.0001, "loss": 6.1542, "loss/crossentropy": 2.6914498805999756, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18728972971439362, "step": 6162 }, { "epoch": 0.2801818181818182, "grad_norm": 7.375, "grad_norm_var": 0.43391520182291665, "learning_rate": 0.0001, "loss": 6.5844, "loss/crossentropy": 2.825493037700653, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.2165176086127758, "step": 6164 }, { "epoch": 0.2802727272727273, "grad_norm": 5.5625, "grad_norm_var": 0.47050374348958335, "learning_rate": 0.0001, "loss": 5.655, "loss/crossentropy": 2.3630964159965515, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17703814804553986, "step": 6166 }, { "epoch": 0.28036363636363637, "grad_norm": 5.46875, "grad_norm_var": 0.6128255208333333, "learning_rate": 0.0001, "loss": 6.1951, "loss/crossentropy": 2.6521946787834167, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.19178754091262817, "step": 6168 }, { "epoch": 0.28045454545454546, "grad_norm": 5.46875, "grad_norm_var": 0.56676025390625, "learning_rate": 0.0001, "loss": 5.6405, "loss/crossentropy": 2.35545614361763, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.17479611560702324, "step": 6170 }, { "epoch": 0.28054545454545454, "grad_norm": 5.8125, "grad_norm_var": 0.5770792643229167, "learning_rate": 0.0001, "loss": 5.7852, "loss/crossentropy": 2.381085455417633, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.18006043136119843, "step": 6172 }, { "epoch": 0.28063636363636363, "grad_norm": 5.4375, "grad_norm_var": 0.5801920572916667, "learning_rate": 0.0001, "loss": 5.5982, "loss/crossentropy": 2.263257712125778, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.17294495552778244, "step": 6174 }, { "epoch": 0.2807272727272727, "grad_norm": 5.21875, "grad_norm_var": 0.5884073893229167, "learning_rate": 0.0001, "loss": 5.6343, "loss/crossentropy": 2.35051092505455, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.1711498349905014, "step": 6176 }, { "epoch": 0.2808181818181818, "grad_norm": 5.46875, "grad_norm_var": 0.58551025390625, "learning_rate": 0.0001, "loss": 5.6903, "loss/crossentropy": 2.456808865070343, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17003324255347252, "step": 6178 }, { "epoch": 0.2809090909090909, "grad_norm": 6.09375, "grad_norm_var": 0.33677978515625, "learning_rate": 0.0001, "loss": 6.134, "loss/crossentropy": 2.5953491926193237, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.19058740511536598, "step": 6180 }, { "epoch": 0.281, "grad_norm": 10.875, "grad_norm_var": 2.00582275390625, "learning_rate": 0.0001, "loss": 5.6633, "loss/crossentropy": 2.2980777621269226, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1775391846895218, "step": 6182 }, { "epoch": 0.2810909090909091, "grad_norm": 5.8125, "grad_norm_var": 1.8687337239583333, "learning_rate": 0.0001, "loss": 5.8902, "loss/crossentropy": 2.3970591127872467, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.1913064680993557, "step": 6184 }, { "epoch": 0.2811818181818182, "grad_norm": 6.09375, "grad_norm_var": 1.7820963541666666, "learning_rate": 0.0001, "loss": 5.669, "loss/crossentropy": 2.382421910762787, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.16986753419041634, "step": 6186 }, { "epoch": 0.2812727272727273, "grad_norm": 6.125, "grad_norm_var": 1.758837890625, "learning_rate": 0.0001, "loss": 6.0706, "loss/crossentropy": 2.5313831567764282, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.1910293586552143, "step": 6188 }, { "epoch": 0.28136363636363637, "grad_norm": 5.6875, "grad_norm_var": 1.787890625, "learning_rate": 0.0001, "loss": 5.8527, "loss/crossentropy": 2.4434171319007874, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.1805764026939869, "step": 6190 }, { "epoch": 0.28145454545454546, "grad_norm": 5.65625, "grad_norm_var": 1.72095947265625, "learning_rate": 0.0001, "loss": 5.8143, "loss/crossentropy": 2.425303339958191, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1791347749531269, "step": 6192 }, { "epoch": 0.28154545454545454, "grad_norm": 5.90625, "grad_norm_var": 1.6906209309895834, "learning_rate": 0.0001, "loss": 6.1145, "loss/crossentropy": 2.635828137397766, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.18907805532217026, "step": 6194 }, { "epoch": 0.28163636363636363, "grad_norm": 6.09375, "grad_norm_var": 1.7908854166666666, "learning_rate": 0.0001, "loss": 5.5757, "loss/crossentropy": 2.287829279899597, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.1715599149465561, "step": 6196 }, { "epoch": 0.2817272727272727, "grad_norm": 5.875, "grad_norm_var": 0.17535400390625, "learning_rate": 0.0001, "loss": 5.908, "loss/crossentropy": 2.4511019587516785, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.18689802661538124, "step": 6198 }, { "epoch": 0.2818181818181818, "grad_norm": 6.03125, "grad_norm_var": 0.18409830729166668, "learning_rate": 0.0001, "loss": 6.261, "loss/crossentropy": 2.764111638069153, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19265975430607796, "step": 6200 }, { "epoch": 0.2819090909090909, "grad_norm": 5.3125, "grad_norm_var": 0.19947916666666668, "learning_rate": 0.0001, "loss": 5.6538, "loss/crossentropy": 2.371579587459564, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.17020975053310394, "step": 6202 }, { "epoch": 0.282, "grad_norm": 5.125, "grad_norm_var": 0.20979410807291668, "learning_rate": 0.0001, "loss": 5.4397, "loss/crossentropy": 2.2341582775115967, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.16371500492095947, "step": 6204 }, { "epoch": 0.2820909090909091, "grad_norm": 5.40625, "grad_norm_var": 0.19348551432291666, "learning_rate": 0.0001, "loss": 6.0633, "loss/crossentropy": 2.5840483903884888, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18893950060009956, "step": 6206 }, { "epoch": 0.2821818181818182, "grad_norm": 5.03125, "grad_norm_var": 0.23830973307291667, "learning_rate": 0.0001, "loss": 5.41, "loss/crossentropy": 2.275637686252594, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.15777696296572685, "step": 6208 }, { "epoch": 0.2822727272727273, "grad_norm": 5.625, "grad_norm_var": 0.22902018229166668, "learning_rate": 0.0001, "loss": 5.783, "loss/crossentropy": 2.485249400138855, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.17117827013134956, "step": 6210 }, { "epoch": 0.28236363636363637, "grad_norm": 5.21875, "grad_norm_var": 0.188916015625, "learning_rate": 0.0001, "loss": 5.734, "loss/crossentropy": 2.373093694448471, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.17789212614297867, "step": 6212 }, { "epoch": 0.28245454545454546, "grad_norm": 5.96875, "grad_norm_var": 0.10914306640625, "learning_rate": 0.0001, "loss": 5.9199, "loss/crossentropy": 2.5639971494674683, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.17718806490302086, "step": 6214 }, { "epoch": 0.28254545454545454, "grad_norm": 5.125, "grad_norm_var": 0.08045247395833334, "learning_rate": 0.0001, "loss": 5.7376, "loss/crossentropy": 2.4567716121673584, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17222067341208458, "step": 6216 }, { "epoch": 0.28263636363636363, "grad_norm": 6.125, "grad_norm_var": 0.11927083333333334, "learning_rate": 0.0001, "loss": 5.7954, "loss/crossentropy": 2.518618106842041, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1737738884985447, "step": 6218 }, { "epoch": 0.2827272727272727, "grad_norm": 6.4375, "grad_norm_var": 0.18036702473958333, "learning_rate": 0.0001, "loss": 5.8065, "loss/crossentropy": 2.4018567204475403, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.18401523679494858, "step": 6220 }, { "epoch": 0.2828181818181818, "grad_norm": 5.34375, "grad_norm_var": 0.27102457682291664, "learning_rate": 0.0001, "loss": 5.7664, "loss/crossentropy": 2.446796864271164, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1764921061694622, "step": 6222 }, { "epoch": 0.2829090909090909, "grad_norm": 5.53125, "grad_norm_var": 0.22795817057291667, "learning_rate": 0.0001, "loss": 5.9196, "loss/crossentropy": 2.543501317501068, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.1799934096634388, "step": 6224 }, { "epoch": 0.283, "grad_norm": 5.625, "grad_norm_var": 0.22714436848958333, "learning_rate": 0.0001, "loss": 5.8409, "loss/crossentropy": 2.3732824325561523, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.18504595011472702, "step": 6226 }, { "epoch": 0.2830909090909091, "grad_norm": 5.5, "grad_norm_var": 0.22073160807291667, "learning_rate": 0.0001, "loss": 5.7461, "loss/crossentropy": 2.3975793719291687, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.17645684257149696, "step": 6228 }, { "epoch": 0.2831818181818182, "grad_norm": 5.0625, "grad_norm_var": 0.21806233723958332, "learning_rate": 0.0001, "loss": 5.8693, "loss/crossentropy": 2.5679317116737366, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1746697574853897, "step": 6230 }, { "epoch": 0.2832727272727273, "grad_norm": 5.46875, "grad_norm_var": 0.20181884765625, "learning_rate": 0.0001, "loss": 6.2061, "loss/crossentropy": 2.722753942012787, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19013643637299538, "step": 6232 }, { "epoch": 0.2833636363636364, "grad_norm": 4.875, "grad_norm_var": 0.23553059895833334, "learning_rate": 0.0001, "loss": 5.5567, "loss/crossentropy": 2.340935707092285, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16845062747597694, "step": 6234 }, { "epoch": 0.28345454545454546, "grad_norm": 5.28125, "grad_norm_var": 0.19511311848958332, "learning_rate": 0.0001, "loss": 5.5242, "loss/crossentropy": 2.2898285388946533, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.16718914359807968, "step": 6236 }, { "epoch": 0.28354545454545454, "grad_norm": 5.71875, "grad_norm_var": 0.10927327473958333, "learning_rate": 0.0001, "loss": 6.0531, "loss/crossentropy": 2.6070430874824524, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.184253491461277, "step": 6238 }, { "epoch": 0.28363636363636363, "grad_norm": 5.46875, "grad_norm_var": 0.18879801432291668, "learning_rate": 0.0001, "loss": 6.0155, "loss/crossentropy": 2.610632300376892, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.1820836141705513, "step": 6240 }, { "epoch": 0.2837272727272727, "grad_norm": 5.125, "grad_norm_var": 0.189306640625, "learning_rate": 0.0001, "loss": 5.8117, "loss/crossentropy": 2.4375378489494324, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.1797959990799427, "step": 6242 }, { "epoch": 0.2838181818181818, "grad_norm": 6.34375, "grad_norm_var": 0.22603759765625, "learning_rate": 0.0001, "loss": 6.2137, "loss/crossentropy": 2.7206252813339233, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.18914983794093132, "step": 6244 }, { "epoch": 0.2839090909090909, "grad_norm": 6.0625, "grad_norm_var": 0.22802327473958334, "learning_rate": 0.0001, "loss": 6.1668, "loss/crossentropy": 2.5976945757865906, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.1965608447790146, "step": 6246 }, { "epoch": 0.284, "grad_norm": 5.21875, "grad_norm_var": 0.22841389973958334, "learning_rate": 0.0001, "loss": 5.965, "loss/crossentropy": 2.60948246717453, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.17988574504852295, "step": 6248 }, { "epoch": 0.2840909090909091, "grad_norm": 5.625, "grad_norm_var": 0.18489583333333334, "learning_rate": 0.0001, "loss": 5.8429, "loss/crossentropy": 2.4682503938674927, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1796569563448429, "step": 6250 }, { "epoch": 0.2841818181818182, "grad_norm": 5.21875, "grad_norm_var": 0.16975504557291668, "learning_rate": 0.0001, "loss": 5.7726, "loss/crossentropy": 2.4322637915611267, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1750539280474186, "step": 6252 }, { "epoch": 0.2842727272727273, "grad_norm": 5.21875, "grad_norm_var": 0.18053385416666667, "learning_rate": 0.0001, "loss": 5.7645, "loss/crossentropy": 2.433059811592102, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.173179030418396, "step": 6254 }, { "epoch": 0.2843636363636364, "grad_norm": 5.28125, "grad_norm_var": 0.11516927083333334, "learning_rate": 0.0001, "loss": 5.9596, "loss/crossentropy": 2.525149941444397, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.18426718935370445, "step": 6256 }, { "epoch": 0.28445454545454546, "grad_norm": 5.5, "grad_norm_var": 0.13023681640625, "learning_rate": 0.0001, "loss": 5.9011, "loss/crossentropy": 2.546357810497284, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17883851379156113, "step": 6258 }, { "epoch": 0.28454545454545455, "grad_norm": 5.5625, "grad_norm_var": 0.11363525390625, "learning_rate": 0.0001, "loss": 5.7425, "loss/crossentropy": 2.4826707243919373, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.16738468036055565, "step": 6260 }, { "epoch": 0.28463636363636363, "grad_norm": 4.96875, "grad_norm_var": 0.10832926432291666, "learning_rate": 0.0001, "loss": 5.8651, "loss/crossentropy": 2.5535390973091125, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17607520893216133, "step": 6262 }, { "epoch": 0.2847272727272727, "grad_norm": 5.71875, "grad_norm_var": 0.111572265625, "learning_rate": 0.0001, "loss": 5.8865, "loss/crossentropy": 2.516864240169525, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.17622029781341553, "step": 6264 }, { "epoch": 0.2848181818181818, "grad_norm": 5.03125, "grad_norm_var": 0.12278238932291667, "learning_rate": 0.0001, "loss": 5.6515, "loss/crossentropy": 2.3771833181381226, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.1721564121544361, "step": 6266 }, { "epoch": 0.2849090909090909, "grad_norm": 25.5, "grad_norm_var": 25.32086181640625, "learning_rate": 0.0001, "loss": 5.7675, "loss/crossentropy": 2.1719510555267334, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.19841954112052917, "step": 6268 }, { "epoch": 0.285, "grad_norm": 5.5, "grad_norm_var": 25.18052978515625, "learning_rate": 0.0001, "loss": 5.799, "loss/crossentropy": 2.4536860287189484, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.1765250414609909, "step": 6270 }, { "epoch": 0.2850909090909091, "grad_norm": 5.46875, "grad_norm_var": 25.162788899739585, "learning_rate": 0.0001, "loss": 5.7757, "loss/crossentropy": 2.422710508108139, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.17534128203988075, "step": 6272 }, { "epoch": 0.2851818181818182, "grad_norm": 5.4375, "grad_norm_var": 25.326025390625, "learning_rate": 0.0001, "loss": 5.7449, "loss/crossentropy": 2.4527793526649475, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.17237956449389458, "step": 6274 }, { "epoch": 0.2852727272727273, "grad_norm": 7.0, "grad_norm_var": 25.09664306640625, "learning_rate": 0.0001, "loss": 5.9049, "loss/crossentropy": 2.4089139699935913, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.18866291642189026, "step": 6276 }, { "epoch": 0.2853636363636364, "grad_norm": 5.3125, "grad_norm_var": 25.028841145833333, "learning_rate": 0.0001, "loss": 5.934, "loss/crossentropy": 2.541024684906006, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1803099513053894, "step": 6278 }, { "epoch": 0.28545454545454546, "grad_norm": 9.0625, "grad_norm_var": 25.209468587239584, "learning_rate": 0.0001, "loss": 6.0028, "loss/crossentropy": 2.4775821566581726, "loss/hidden": 1.642578125, "loss/jsd": 0.0, "loss/logits": 0.18826641887426376, "step": 6280 }, { "epoch": 0.28554545454545455, "grad_norm": 5.9375, "grad_norm_var": 24.875634765625, "learning_rate": 0.0001, "loss": 6.0363, "loss/crossentropy": 2.553365409374237, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.18657007068395615, "step": 6282 }, { "epoch": 0.28563636363636363, "grad_norm": 6.03125, "grad_norm_var": 0.8663899739583333, "learning_rate": 0.0001, "loss": 6.2333, "loss/crossentropy": 2.672857642173767, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.19725053757429123, "step": 6284 }, { "epoch": 0.2857272727272727, "grad_norm": 6.53125, "grad_norm_var": 0.9039713541666666, "learning_rate": 0.0001, "loss": 6.2118, "loss/crossentropy": 2.7326711416244507, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.18736499175429344, "step": 6286 }, { "epoch": 0.2858181818181818, "grad_norm": 5.5625, "grad_norm_var": 0.8868326822916667, "learning_rate": 0.0001, "loss": 6.101, "loss/crossentropy": 2.63644540309906, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.18845225498080254, "step": 6288 }, { "epoch": 0.2859090909090909, "grad_norm": 5.125, "grad_norm_var": 0.8625284830729166, "learning_rate": 0.0001, "loss": 6.1402, "loss/crossentropy": 2.668896198272705, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18893208727240562, "step": 6290 }, { "epoch": 0.286, "grad_norm": 4.8125, "grad_norm_var": 0.9188151041666667, "learning_rate": 0.0001, "loss": 5.7975, "loss/crossentropy": 2.4730613827705383, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.173854261636734, "step": 6292 }, { "epoch": 0.2860909090909091, "grad_norm": 5.6875, "grad_norm_var": 0.9000651041666666, "learning_rate": 0.0001, "loss": 6.0499, "loss/crossentropy": 2.5877519249916077, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.18625811487436295, "step": 6294 }, { "epoch": 0.2861818181818182, "grad_norm": 5.40625, "grad_norm_var": 0.25787760416666666, "learning_rate": 0.0001, "loss": 5.5756, "loss/crossentropy": 2.3273880779743195, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.16407456621527672, "step": 6296 }, { "epoch": 0.2862727272727273, "grad_norm": 5.375, "grad_norm_var": 0.28487955729166664, "learning_rate": 0.0001, "loss": 6.2051, "loss/crossentropy": 2.70187109708786, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19368651881814003, "step": 6298 }, { "epoch": 0.2863636363636364, "grad_norm": 5.375, "grad_norm_var": 0.29983317057291664, "learning_rate": 0.0001, "loss": 5.7931, "loss/crossentropy": 2.3979705572128296, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.17936168238520622, "step": 6300 }, { "epoch": 0.28645454545454546, "grad_norm": 5.21875, "grad_norm_var": 0.16064046223958334, "learning_rate": 0.0001, "loss": 5.7675, "loss/crossentropy": 2.426335096359253, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.1764983870089054, "step": 6302 }, { "epoch": 0.28654545454545455, "grad_norm": 6.125, "grad_norm_var": 0.17180989583333334, "learning_rate": 0.0001, "loss": 5.746, "loss/crossentropy": 2.3893784880638123, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.1731640063226223, "step": 6304 }, { "epoch": 0.28663636363636363, "grad_norm": 6.8125, "grad_norm_var": 0.26365559895833335, "learning_rate": 0.0001, "loss": 5.5822, "loss/crossentropy": 2.279855251312256, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.17281443998217583, "step": 6306 }, { "epoch": 0.2867272727272727, "grad_norm": 10.0, "grad_norm_var": 1.3641886393229166, "learning_rate": 0.0001, "loss": 6.2649, "loss/crossentropy": 2.598421633243561, "loss/hidden": 1.626953125, "loss/jsd": 0.0, "loss/logits": 0.20395628735423088, "step": 6308 }, { "epoch": 0.2868181818181818, "grad_norm": 4.84375, "grad_norm_var": 1.43853759765625, "learning_rate": 0.0001, "loss": 5.7955, "loss/crossentropy": 2.4730805158615112, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.1769649237394333, "step": 6310 }, { "epoch": 0.2869090909090909, "grad_norm": 5.125, "grad_norm_var": 1.5046183268229167, "learning_rate": 0.0001, "loss": 5.6841, "loss/crossentropy": 2.4120708107948303, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17329534888267517, "step": 6312 }, { "epoch": 0.287, "grad_norm": 5.8125, "grad_norm_var": 1.4756510416666666, "learning_rate": 0.0001, "loss": 6.2234, "loss/crossentropy": 2.771201193332672, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18936152383685112, "step": 6314 }, { "epoch": 0.28709090909090906, "grad_norm": 5.21875, "grad_norm_var": 1.490478515625, "learning_rate": 0.0001, "loss": 5.954, "loss/crossentropy": 2.550211489200592, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.1827579326927662, "step": 6316 }, { "epoch": 0.2871818181818182, "grad_norm": 5.3125, "grad_norm_var": 1.5123982747395834, "learning_rate": 0.0001, "loss": 6.0034, "loss/crossentropy": 2.634208619594574, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.1820397526025772, "step": 6318 }, { "epoch": 0.2872727272727273, "grad_norm": 5.875, "grad_norm_var": 1.5003865559895833, "learning_rate": 0.0001, "loss": 6.2548, "loss/crossentropy": 2.7846121788024902, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.19135894253849983, "step": 6320 }, { "epoch": 0.2873636363636364, "grad_norm": 4.84375, "grad_norm_var": 1.5020182291666666, "learning_rate": 0.0001, "loss": 5.9476, "loss/crossentropy": 2.602706491947174, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.17628280073404312, "step": 6322 }, { "epoch": 0.28745454545454546, "grad_norm": 5.8125, "grad_norm_var": 0.2992146809895833, "learning_rate": 0.0001, "loss": 5.7445, "loss/crossentropy": 2.3759241104125977, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.17690129205584526, "step": 6324 }, { "epoch": 0.28754545454545455, "grad_norm": 4.90625, "grad_norm_var": 0.24895833333333334, "learning_rate": 0.0001, "loss": 5.5789, "loss/crossentropy": 2.342465341091156, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.16817041113972664, "step": 6326 }, { "epoch": 0.28763636363636363, "grad_norm": 5.46875, "grad_norm_var": 0.23892822265625, "learning_rate": 0.0001, "loss": 6.1735, "loss/crossentropy": 2.703784227371216, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1879875808954239, "step": 6328 }, { "epoch": 0.2877272727272727, "grad_norm": 5.28125, "grad_norm_var": 0.249072265625, "learning_rate": 0.0001, "loss": 5.8984, "loss/crossentropy": 2.433645784854889, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1874902844429016, "step": 6330 }, { "epoch": 0.2878181818181818, "grad_norm": 5.59375, "grad_norm_var": 0.6885701497395833, "learning_rate": 0.0001, "loss": 5.9997, "loss/crossentropy": 2.5178195238113403, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1892038956284523, "step": 6332 }, { "epoch": 0.2879090909090909, "grad_norm": 5.59375, "grad_norm_var": 0.6722493489583333, "learning_rate": 0.0001, "loss": 5.7982, "loss/crossentropy": 2.4828706085681915, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.17391812801361084, "step": 6334 }, { "epoch": 0.288, "grad_norm": 5.53125, "grad_norm_var": 0.6455078125, "learning_rate": 0.0001, "loss": 5.7055, "loss/crossentropy": 2.405693531036377, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.17256105691194534, "step": 6336 }, { "epoch": 0.28809090909090906, "grad_norm": 5.65625, "grad_norm_var": 0.58560791015625, "learning_rate": 0.0001, "loss": 5.9928, "loss/crossentropy": 2.474815011024475, "loss/hidden": 1.615234375, "loss/jsd": 0.0, "loss/logits": 0.19027239829301834, "step": 6338 }, { "epoch": 0.2881818181818182, "grad_norm": 5.125, "grad_norm_var": 0.5338541666666666, "learning_rate": 0.0001, "loss": 5.7101, "loss/crossentropy": 2.4250869154930115, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17263760790228844, "step": 6340 }, { "epoch": 0.2882727272727273, "grad_norm": 5.09375, "grad_norm_var": 0.50699462890625, "learning_rate": 0.0001, "loss": 5.7798, "loss/crossentropy": 2.5251635313034058, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17214010655879974, "step": 6342 }, { "epoch": 0.2883636363636364, "grad_norm": 5.40625, "grad_norm_var": 0.5001139322916667, "learning_rate": 0.0001, "loss": 5.9844, "loss/crossentropy": 2.576678454875946, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18413127586245537, "step": 6344 }, { "epoch": 0.28845454545454546, "grad_norm": 5.625, "grad_norm_var": 0.47899983723958334, "learning_rate": 0.0001, "loss": 5.8428, "loss/crossentropy": 2.477466106414795, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.17540239170193672, "step": 6346 }, { "epoch": 0.28854545454545455, "grad_norm": 5.3125, "grad_norm_var": 0.056233723958333336, "learning_rate": 0.0001, "loss": 5.6484, "loss/crossentropy": 2.3888925909996033, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17009150236845016, "step": 6348 }, { "epoch": 0.28863636363636364, "grad_norm": 5.71875, "grad_norm_var": 0.06373697916666667, "learning_rate": 0.0001, "loss": 5.6013, "loss/crossentropy": 2.307472884654999, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.17332836985588074, "step": 6350 }, { "epoch": 0.2887272727272727, "grad_norm": 5.25, "grad_norm_var": 0.07336832682291666, "learning_rate": 0.0001, "loss": 6.0492, "loss/crossentropy": 2.671185314655304, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.18330973759293556, "step": 6352 }, { "epoch": 0.2888181818181818, "grad_norm": 5.0, "grad_norm_var": 0.07277018229166667, "learning_rate": 0.0001, "loss": 5.6674, "loss/crossentropy": 2.379883885383606, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.1719113029539585, "step": 6354 }, { "epoch": 0.2889090909090909, "grad_norm": 4.78125, "grad_norm_var": 0.090478515625, "learning_rate": 0.0001, "loss": 5.7071, "loss/crossentropy": 2.399138867855072, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.1747443713247776, "step": 6356 }, { "epoch": 0.289, "grad_norm": 5.53125, "grad_norm_var": 0.084619140625, "learning_rate": 0.0001, "loss": 5.5288, "loss/crossentropy": 2.253431111574173, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.1664043292403221, "step": 6358 }, { "epoch": 0.28909090909090907, "grad_norm": 5.125, "grad_norm_var": 0.08279622395833333, "learning_rate": 0.0001, "loss": 5.9713, "loss/crossentropy": 2.61785888671875, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.1792871206998825, "step": 6360 }, { "epoch": 0.2891818181818182, "grad_norm": 5.53125, "grad_norm_var": 0.101025390625, "learning_rate": 0.0001, "loss": 5.5212, "loss/crossentropy": 2.2918064296245575, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.16805771738290787, "step": 6362 }, { "epoch": 0.2892727272727273, "grad_norm": 5.4375, "grad_norm_var": 0.103369140625, "learning_rate": 0.0001, "loss": 5.7524, "loss/crossentropy": 2.4292851090431213, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17645365372300148, "step": 6364 }, { "epoch": 0.2893636363636364, "grad_norm": 6.09375, "grad_norm_var": 0.148681640625, "learning_rate": 0.0001, "loss": 5.9101, "loss/crossentropy": 2.5565357208251953, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18223585188388824, "step": 6366 }, { "epoch": 0.28945454545454546, "grad_norm": 5.125, "grad_norm_var": 0.62008056640625, "learning_rate": 0.0001, "loss": 5.6748, "loss/crossentropy": 2.314288765192032, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.17667290940880775, "step": 6368 }, { "epoch": 0.28954545454545455, "grad_norm": 5.09375, "grad_norm_var": 0.61011962890625, "learning_rate": 0.0001, "loss": 5.9121, "loss/crossentropy": 2.559453547000885, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.181550782173872, "step": 6370 }, { "epoch": 0.28963636363636364, "grad_norm": 4.9375, "grad_norm_var": 0.596484375, "learning_rate": 0.0001, "loss": 5.7537, "loss/crossentropy": 2.4324541687965393, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1762661673128605, "step": 6372 }, { "epoch": 0.2897272727272727, "grad_norm": 5.21875, "grad_norm_var": 0.603369140625, "learning_rate": 0.0001, "loss": 5.8807, "loss/crossentropy": 2.5067326426506042, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.18056003376841545, "step": 6374 }, { "epoch": 0.2898181818181818, "grad_norm": 4.9375, "grad_norm_var": 0.6106404622395833, "learning_rate": 0.0001, "loss": 5.413, "loss/crossentropy": 2.2404470443725586, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.16275963932275772, "step": 6376 }, { "epoch": 0.2899090909090909, "grad_norm": 5.03125, "grad_norm_var": 0.57672119140625, "learning_rate": 0.0001, "loss": 5.9924, "loss/crossentropy": 2.620850682258606, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.182079765945673, "step": 6378 }, { "epoch": 0.29, "grad_norm": 6.03125, "grad_norm_var": 0.59888916015625, "learning_rate": 0.0001, "loss": 5.6764, "loss/crossentropy": 2.362844169139862, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.17608178779482841, "step": 6380 }, { "epoch": 0.29009090909090907, "grad_norm": 5.15625, "grad_norm_var": 0.55142822265625, "learning_rate": 0.0001, "loss": 5.8571, "loss/crossentropy": 2.496740758419037, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.18154755607247353, "step": 6382 }, { "epoch": 0.2901818181818182, "grad_norm": 5.1875, "grad_norm_var": 0.10025634765625, "learning_rate": 0.0001, "loss": 5.9089, "loss/crossentropy": 2.5661333203315735, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18037055432796478, "step": 6384 }, { "epoch": 0.2902727272727273, "grad_norm": 4.84375, "grad_norm_var": 0.10712483723958334, "learning_rate": 0.0001, "loss": 5.5186, "loss/crossentropy": 2.256395697593689, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.16820861771702766, "step": 6386 }, { "epoch": 0.2903636363636364, "grad_norm": 5.5625, "grad_norm_var": 0.11373697916666667, "learning_rate": 0.0001, "loss": 5.5923, "loss/crossentropy": 2.3543186485767365, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.16950234025716782, "step": 6388 }, { "epoch": 0.29045454545454547, "grad_norm": 5.0, "grad_norm_var": 0.10670166015625, "learning_rate": 0.0001, "loss": 5.3708, "loss/crossentropy": 2.174628734588623, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.16473028808832169, "step": 6390 }, { "epoch": 0.29054545454545455, "grad_norm": 5.21875, "grad_norm_var": 0.09933268229166667, "learning_rate": 0.0001, "loss": 5.7867, "loss/crossentropy": 2.435766816139221, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.17787155136466026, "step": 6392 }, { "epoch": 0.29063636363636364, "grad_norm": 5.40625, "grad_norm_var": 0.10089518229166666, "learning_rate": 0.0001, "loss": 5.7728, "loss/crossentropy": 2.460361123085022, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.17557813227176666, "step": 6394 }, { "epoch": 0.2907272727272727, "grad_norm": 5.03125, "grad_norm_var": 0.07649739583333333, "learning_rate": 0.0001, "loss": 5.461, "loss/crossentropy": 2.292537808418274, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16411224752664566, "step": 6396 }, { "epoch": 0.2908181818181818, "grad_norm": 4.96875, "grad_norm_var": 0.07795817057291667, "learning_rate": 0.0001, "loss": 5.5345, "loss/crossentropy": 2.2857338786125183, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1697985753417015, "step": 6398 }, { "epoch": 0.2909090909090909, "grad_norm": 5.5, "grad_norm_var": 0.08583577473958333, "learning_rate": 0.0001, "loss": 5.4739, "loss/crossentropy": 2.2212458848953247, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1686296984553337, "step": 6400 }, { "epoch": 0.291, "grad_norm": 5.21875, "grad_norm_var": 0.08251546223958334, "learning_rate": 0.0001, "loss": 5.4142, "loss/crossentropy": 2.2028337717056274, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.16449121572077274, "step": 6402 }, { "epoch": 0.29109090909090907, "grad_norm": 5.15625, "grad_norm_var": 0.07654622395833334, "learning_rate": 0.0001, "loss": 5.7345, "loss/crossentropy": 2.3936077058315277, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.1756908856332302, "step": 6404 }, { "epoch": 0.2911818181818182, "grad_norm": 5.71875, "grad_norm_var": 0.09920247395833333, "learning_rate": 0.0001, "loss": 5.6829, "loss/crossentropy": 2.3977487981319427, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.1697305329144001, "step": 6406 }, { "epoch": 0.2912727272727273, "grad_norm": 5.5, "grad_norm_var": 0.10526936848958333, "learning_rate": 0.0001, "loss": 5.8253, "loss/crossentropy": 2.4930441975593567, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.17951051518321037, "step": 6408 }, { "epoch": 0.2913636363636364, "grad_norm": 4.9375, "grad_norm_var": 0.11939697265625, "learning_rate": 0.0001, "loss": 5.4823, "loss/crossentropy": 2.178756058216095, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.17547029629349709, "step": 6410 }, { "epoch": 0.29145454545454547, "grad_norm": 5.59375, "grad_norm_var": 0.087109375, "learning_rate": 0.0001, "loss": 6.0334, "loss/crossentropy": 2.602608621120453, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18448660895228386, "step": 6412 }, { "epoch": 0.29154545454545455, "grad_norm": 5.53125, "grad_norm_var": 0.0771484375, "learning_rate": 0.0001, "loss": 6.0442, "loss/crossentropy": 2.5804009437561035, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.18954746052622795, "step": 6414 }, { "epoch": 0.29163636363636364, "grad_norm": 5.09375, "grad_norm_var": 0.065869140625, "learning_rate": 0.0001, "loss": 5.8092, "loss/crossentropy": 2.4676977396011353, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.17770282551646233, "step": 6416 }, { "epoch": 0.2917272727272727, "grad_norm": 6.75, "grad_norm_var": 0.16825764973958332, "learning_rate": 0.0001, "loss": 6.0519, "loss/crossentropy": 2.654853045940399, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.18716738000512123, "step": 6418 }, { "epoch": 0.2918181818181818, "grad_norm": 5.84375, "grad_norm_var": 11.239827473958334, "learning_rate": 0.0001, "loss": 5.8987, "loss/crossentropy": 2.3913660645484924, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.19272873178124428, "step": 6420 }, { "epoch": 0.2919090909090909, "grad_norm": 6.03125, "grad_norm_var": 11.269136555989583, "learning_rate": 0.0001, "loss": 6.0119, "loss/crossentropy": 2.52312171459198, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18989095464348793, "step": 6422 }, { "epoch": 0.292, "grad_norm": 10.0625, "grad_norm_var": 12.110791015625, "learning_rate": 0.0001, "loss": 5.5255, "loss/crossentropy": 2.2766585052013397, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.16687772050499916, "step": 6424 }, { "epoch": 0.29209090909090907, "grad_norm": 5.65625, "grad_norm_var": 11.963407389322917, "learning_rate": 0.0001, "loss": 5.974, "loss/crossentropy": 2.5498313307762146, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.1820608302950859, "step": 6426 }, { "epoch": 0.2921818181818182, "grad_norm": 6.0, "grad_norm_var": 11.79361572265625, "learning_rate": 0.0001, "loss": 6.1126, "loss/crossentropy": 2.6533778309822083, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18694061785936356, "step": 6428 }, { "epoch": 0.2922727272727273, "grad_norm": 5.8125, "grad_norm_var": 11.785400390625, "learning_rate": 0.0001, "loss": 6.0212, "loss/crossentropy": 2.545608878135681, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.19032984972000122, "step": 6430 }, { "epoch": 0.2923636363636364, "grad_norm": 4.8125, "grad_norm_var": 11.83551025390625, "learning_rate": 0.0001, "loss": 4.7536, "loss/crossentropy": 1.7930310666561127, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.14098310470581055, "step": 6432 }, { "epoch": 0.29245454545454547, "grad_norm": 5.28125, "grad_norm_var": 11.99351806640625, "learning_rate": 0.0001, "loss": 5.7582, "loss/crossentropy": 2.484983891248703, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17028889805078506, "step": 6434 }, { "epoch": 0.29254545454545455, "grad_norm": 5.40625, "grad_norm_var": 1.4623006184895833, "learning_rate": 0.0001, "loss": 5.9461, "loss/crossentropy": 2.5085741877555847, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.18359917029738426, "step": 6436 }, { "epoch": 0.29263636363636364, "grad_norm": 5.78125, "grad_norm_var": 1.51060791015625, "learning_rate": 0.0001, "loss": 6.2435, "loss/crossentropy": 2.652120888233185, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.19800803810358047, "step": 6438 }, { "epoch": 0.2927272727272727, "grad_norm": 5.6875, "grad_norm_var": 0.25050455729166665, "learning_rate": 0.0001, "loss": 5.9115, "loss/crossentropy": 2.437718093395233, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.1878112033009529, "step": 6440 }, { "epoch": 0.2928181818181818, "grad_norm": 5.15625, "grad_norm_var": 0.27786458333333336, "learning_rate": 0.0001, "loss": 5.964, "loss/crossentropy": 2.5635862350463867, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1833968721330166, "step": 6442 }, { "epoch": 0.2929090909090909, "grad_norm": 5.1875, "grad_norm_var": 0.27844645182291666, "learning_rate": 0.0001, "loss": 5.5321, "loss/crossentropy": 2.28199702501297, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.16524932906031609, "step": 6444 }, { "epoch": 0.293, "grad_norm": 5.125, "grad_norm_var": 0.28560791015625, "learning_rate": 0.0001, "loss": 5.9689, "loss/crossentropy": 2.5819453597068787, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.1853792890906334, "step": 6446 }, { "epoch": 0.29309090909090907, "grad_norm": 5.34375, "grad_norm_var": 0.24980061848958332, "learning_rate": 0.0001, "loss": 5.9399, "loss/crossentropy": 2.518230438232422, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.18454816192388535, "step": 6448 }, { "epoch": 0.29318181818181815, "grad_norm": 5.8125, "grad_norm_var": 0.27928059895833335, "learning_rate": 0.0001, "loss": 5.6357, "loss/crossentropy": 2.395445376634598, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.16660762578248978, "step": 6450 }, { "epoch": 0.2932727272727273, "grad_norm": 5.46875, "grad_norm_var": 0.42610270182291665, "learning_rate": 0.0001, "loss": 5.762, "loss/crossentropy": 2.4128954708576202, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.17670660465955734, "step": 6452 }, { "epoch": 0.2933636363636364, "grad_norm": 4.8125, "grad_norm_var": 0.33212483723958336, "learning_rate": 0.0001, "loss": 5.4298, "loss/crossentropy": 2.1671144366264343, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.1694289930164814, "step": 6454 }, { "epoch": 0.29345454545454547, "grad_norm": 5.78125, "grad_norm_var": 0.3329264322916667, "learning_rate": 0.0001, "loss": 6.2093, "loss/crossentropy": 2.697747588157654, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1921689584851265, "step": 6456 }, { "epoch": 0.29354545454545455, "grad_norm": 5.46875, "grad_norm_var": 0.30858968098958334, "learning_rate": 0.0001, "loss": 5.6781, "loss/crossentropy": 2.332130491733551, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.1736597754061222, "step": 6458 }, { "epoch": 0.29363636363636364, "grad_norm": 6.03125, "grad_norm_var": 0.3322224934895833, "learning_rate": 0.0001, "loss": 5.6878, "loss/crossentropy": 2.368040770292282, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.17475193366408348, "step": 6460 }, { "epoch": 0.2937272727272727, "grad_norm": 5.46875, "grad_norm_var": 0.32107747395833336, "learning_rate": 0.0001, "loss": 5.922, "loss/crossentropy": 2.5045376420021057, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18393009155988693, "step": 6462 }, { "epoch": 0.2938181818181818, "grad_norm": 4.96875, "grad_norm_var": 0.3478800455729167, "learning_rate": 0.0001, "loss": 5.9022, "loss/crossentropy": 2.5498309433460236, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1789899319410324, "step": 6464 }, { "epoch": 0.2939090909090909, "grad_norm": 5.46875, "grad_norm_var": 0.315478515625, "learning_rate": 0.0001, "loss": 5.8967, "loss/crossentropy": 2.522626757621765, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.18213747441768646, "step": 6466 }, { "epoch": 0.294, "grad_norm": 5.25, "grad_norm_var": 0.122265625, "learning_rate": 0.0001, "loss": 5.8191, "loss/crossentropy": 2.493261158466339, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17809418588876724, "step": 6468 }, { "epoch": 0.29409090909090907, "grad_norm": 5.84375, "grad_norm_var": 0.10487874348958333, "learning_rate": 0.0001, "loss": 5.9176, "loss/crossentropy": 2.5297817289829254, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18488026410341263, "step": 6470 }, { "epoch": 0.29418181818181816, "grad_norm": 5.90625, "grad_norm_var": 0.11328125, "learning_rate": 0.0001, "loss": 6.3333, "loss/crossentropy": 2.729183614253998, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.2024078667163849, "step": 6472 }, { "epoch": 0.2942727272727273, "grad_norm": 5.96875, "grad_norm_var": 0.12808837890625, "learning_rate": 0.0001, "loss": 6.1177, "loss/crossentropy": 2.570357084274292, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19106128439307213, "step": 6474 }, { "epoch": 0.2943636363636364, "grad_norm": 5.1875, "grad_norm_var": 0.09078369140625, "learning_rate": 0.0001, "loss": 5.9447, "loss/crossentropy": 2.5954015254974365, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.17848378792405128, "step": 6476 }, { "epoch": 0.29445454545454547, "grad_norm": 5.28125, "grad_norm_var": 0.1013671875, "learning_rate": 0.0001, "loss": 6.1484, "loss/crossentropy": 2.7360446453094482, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18576863780617714, "step": 6478 }, { "epoch": 0.29454545454545455, "grad_norm": 5.75, "grad_norm_var": 0.08700764973958333, "learning_rate": 0.0001, "loss": 5.7179, "loss/crossentropy": 2.4017537236213684, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1734119914472103, "step": 6480 }, { "epoch": 0.29463636363636364, "grad_norm": 5.15625, "grad_norm_var": 0.10013020833333333, "learning_rate": 0.0001, "loss": 6.0308, "loss/crossentropy": 2.6180224418640137, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18502632156014442, "step": 6482 }, { "epoch": 0.2947272727272727, "grad_norm": 6.96875, "grad_norm_var": 0.22316080729166668, "learning_rate": 0.0001, "loss": 5.983, "loss/crossentropy": 2.572745203971863, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18438100442290306, "step": 6484 }, { "epoch": 0.2948181818181818, "grad_norm": 6.15625, "grad_norm_var": 0.23020833333333332, "learning_rate": 0.0001, "loss": 5.8888, "loss/crossentropy": 2.532371401786804, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17978516593575478, "step": 6486 }, { "epoch": 0.2949090909090909, "grad_norm": 5.75, "grad_norm_var": 0.25002848307291664, "learning_rate": 0.0001, "loss": 6.2124, "loss/crossentropy": 2.6982439160346985, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19477874785661697, "step": 6488 }, { "epoch": 0.295, "grad_norm": 5.40625, "grad_norm_var": 0.25787760416666666, "learning_rate": 0.0001, "loss": 6.1607, "loss/crossentropy": 2.7382612228393555, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18364539369940758, "step": 6490 }, { "epoch": 0.29509090909090907, "grad_norm": 6.125, "grad_norm_var": 0.25519205729166666, "learning_rate": 0.0001, "loss": 5.7694, "loss/crossentropy": 2.4049428701400757, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.17648246139287949, "step": 6492 }, { "epoch": 0.29518181818181816, "grad_norm": 6.90625, "grad_norm_var": 0.33043212890625, "learning_rate": 0.0001, "loss": 5.8262, "loss/crossentropy": 2.3929060101509094, "loss/hidden": 1.630859375, "loss/jsd": 0.0, "loss/logits": 0.18024563044309616, "step": 6494 }, { "epoch": 0.2952727272727273, "grad_norm": 5.875, "grad_norm_var": 0.2958984375, "learning_rate": 0.0001, "loss": 5.6423, "loss/crossentropy": 2.296562075614929, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.17559007182717323, "step": 6496 }, { "epoch": 0.2953636363636364, "grad_norm": 5.21875, "grad_norm_var": 0.2904581705729167, "learning_rate": 0.0001, "loss": 5.8541, "loss/crossentropy": 2.4249702990055084, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.18197200447320938, "step": 6498 }, { "epoch": 0.29545454545454547, "grad_norm": 5.34375, "grad_norm_var": 0.21386311848958334, "learning_rate": 0.0001, "loss": 5.7383, "loss/crossentropy": 2.4196815490722656, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17677923664450645, "step": 6500 }, { "epoch": 0.29554545454545456, "grad_norm": 6.3125, "grad_norm_var": 0.241650390625, "learning_rate": 0.0001, "loss": 6.0796, "loss/crossentropy": 2.643429696559906, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18814954534173012, "step": 6502 }, { "epoch": 0.29563636363636364, "grad_norm": 5.15625, "grad_norm_var": 0.24677327473958333, "learning_rate": 0.0001, "loss": 5.6954, "loss/crossentropy": 2.375609517097473, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.1763158030807972, "step": 6504 }, { "epoch": 0.2957272727272727, "grad_norm": 4.96875, "grad_norm_var": 0.27740885416666666, "learning_rate": 0.0001, "loss": 5.6682, "loss/crossentropy": 2.4114389419555664, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17293699458241463, "step": 6506 }, { "epoch": 0.2958181818181818, "grad_norm": 5.6875, "grad_norm_var": 0.2660441080729167, "learning_rate": 0.0001, "loss": 6.0424, "loss/crossentropy": 2.6865822672843933, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.1818743534386158, "step": 6508 }, { "epoch": 0.2959090909090909, "grad_norm": 5.46875, "grad_norm_var": 0.13554280598958332, "learning_rate": 0.0001, "loss": 5.7346, "loss/crossentropy": 2.406400144100189, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1761789470911026, "step": 6510 }, { "epoch": 0.296, "grad_norm": 5.625, "grad_norm_var": 0.11171875, "learning_rate": 0.0001, "loss": 5.7382, "loss/crossentropy": 2.4822378158569336, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17110326886177063, "step": 6512 }, { "epoch": 0.29609090909090907, "grad_norm": 5.65625, "grad_norm_var": 0.130712890625, "learning_rate": 0.0001, "loss": 6.1711, "loss/crossentropy": 2.749013364315033, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.18849612772464752, "step": 6514 }, { "epoch": 0.29618181818181816, "grad_norm": 5.5625, "grad_norm_var": 0.12766520182291666, "learning_rate": 0.0001, "loss": 6.1323, "loss/crossentropy": 2.7050217986106873, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1860848292708397, "step": 6516 }, { "epoch": 0.2962727272727273, "grad_norm": 5.5, "grad_norm_var": 0.073828125, "learning_rate": 0.0001, "loss": 5.9879, "loss/crossentropy": 2.609997570514679, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18271423876285553, "step": 6518 }, { "epoch": 0.2963636363636364, "grad_norm": 5.46875, "grad_norm_var": 0.06933186848958334, "learning_rate": 0.0001, "loss": 6.1268, "loss/crossentropy": 2.7027615308761597, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18497726321220398, "step": 6520 }, { "epoch": 0.29645454545454547, "grad_norm": 5.34375, "grad_norm_var": 0.051493326822916664, "learning_rate": 0.0001, "loss": 6.0715, "loss/crossentropy": 2.599531412124634, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.1899654008448124, "step": 6522 }, { "epoch": 0.29654545454545456, "grad_norm": 5.65625, "grad_norm_var": 0.04967041015625, "learning_rate": 0.0001, "loss": 5.5499, "loss/crossentropy": 2.2718420028686523, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.17174772918224335, "step": 6524 }, { "epoch": 0.29663636363636364, "grad_norm": 5.59375, "grad_norm_var": 0.05211181640625, "learning_rate": 0.0001, "loss": 5.6099, "loss/crossentropy": 2.341661810874939, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.16881554573774338, "step": 6526 }, { "epoch": 0.29672727272727273, "grad_norm": 6.09375, "grad_norm_var": 0.09542643229166667, "learning_rate": 0.0001, "loss": 5.5971, "loss/crossentropy": 2.3563195765018463, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17016953974962234, "step": 6528 }, { "epoch": 0.2968181818181818, "grad_norm": 4.84375, "grad_norm_var": 0.09465738932291666, "learning_rate": 0.0001, "loss": 6.046, "loss/crossentropy": 2.6402681469917297, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18510755524039268, "step": 6530 }, { "epoch": 0.2969090909090909, "grad_norm": 5.59375, "grad_norm_var": 0.09582926432291666, "learning_rate": 0.0001, "loss": 6.2747, "loss/crossentropy": 2.736659586429596, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.1944325901567936, "step": 6532 }, { "epoch": 0.297, "grad_norm": 5.0625, "grad_norm_var": 0.108837890625, "learning_rate": 0.0001, "loss": 5.7997, "loss/crossentropy": 2.514099806547165, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1750393733382225, "step": 6534 }, { "epoch": 0.29709090909090907, "grad_norm": 5.125, "grad_norm_var": 0.1142578125, "learning_rate": 0.0001, "loss": 6.0707, "loss/crossentropy": 2.6434158086776733, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.18823403865098953, "step": 6536 }, { "epoch": 0.29718181818181816, "grad_norm": 5.625, "grad_norm_var": 0.12633056640625, "learning_rate": 0.0001, "loss": 5.8988, "loss/crossentropy": 2.5051318407058716, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18350643292069435, "step": 6538 }, { "epoch": 0.2972727272727273, "grad_norm": 5.4375, "grad_norm_var": 0.13271077473958334, "learning_rate": 0.0001, "loss": 5.4948, "loss/crossentropy": 2.1939810514450073, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.17129762843251228, "step": 6540 }, { "epoch": 0.2973636363636364, "grad_norm": 5.6875, "grad_norm_var": 0.13240559895833334, "learning_rate": 0.0001, "loss": 6.2164, "loss/crossentropy": 2.7436951994895935, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18906325101852417, "step": 6542 }, { "epoch": 0.29745454545454547, "grad_norm": 5.4375, "grad_norm_var": 0.08800455729166666, "learning_rate": 0.0001, "loss": 5.6128, "loss/crossentropy": 2.3744664192199707, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.16953874379396439, "step": 6544 }, { "epoch": 0.29754545454545456, "grad_norm": 5.53125, "grad_norm_var": 0.08157145182291667, "learning_rate": 0.0001, "loss": 6.3762, "loss/crossentropy": 2.840195953845978, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.1944228745996952, "step": 6546 }, { "epoch": 0.29763636363636364, "grad_norm": 5.3125, "grad_norm_var": 0.07795817057291667, "learning_rate": 0.0001, "loss": 5.8485, "loss/crossentropy": 2.44404274225235, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.18165262416005135, "step": 6548 }, { "epoch": 0.29772727272727273, "grad_norm": 5.0, "grad_norm_var": 0.08577067057291667, "learning_rate": 0.0001, "loss": 5.0797, "loss/crossentropy": 1.9967524111270905, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.15380455553531647, "step": 6550 }, { "epoch": 0.2978181818181818, "grad_norm": 6.1875, "grad_norm_var": 0.131640625, "learning_rate": 0.0001, "loss": 5.6361, "loss/crossentropy": 2.294685423374176, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.17613400518894196, "step": 6552 }, { "epoch": 0.2979090909090909, "grad_norm": 5.4375, "grad_norm_var": 0.11910400390625, "learning_rate": 0.0001, "loss": 6.1208, "loss/crossentropy": 2.6203067898750305, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1918504200875759, "step": 6554 }, { "epoch": 0.298, "grad_norm": 5.15625, "grad_norm_var": 0.11011962890625, "learning_rate": 0.0001, "loss": 5.8084, "loss/crossentropy": 2.4648477435112, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1788887418806553, "step": 6556 }, { "epoch": 0.2980909090909091, "grad_norm": 5.28125, "grad_norm_var": 0.14152018229166666, "learning_rate": 0.0001, "loss": 6.0071, "loss/crossentropy": 2.5683117508888245, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.1889933943748474, "step": 6558 }, { "epoch": 0.29818181818181816, "grad_norm": 5.5, "grad_norm_var": 0.12792561848958334, "learning_rate": 0.0001, "loss": 5.9834, "loss/crossentropy": 2.542698919773102, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1878208890557289, "step": 6560 }, { "epoch": 0.2982727272727273, "grad_norm": 5.5, "grad_norm_var": 0.13444010416666666, "learning_rate": 0.0001, "loss": 6.3299, "loss/crossentropy": 2.7866373658180237, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19222021475434303, "step": 6562 }, { "epoch": 0.2983636363636364, "grad_norm": 4.875, "grad_norm_var": 0.17499593098958333, "learning_rate": 0.0001, "loss": 6.032, "loss/crossentropy": 2.6425121426582336, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18113483116030693, "step": 6564 }, { "epoch": 0.2984545454545455, "grad_norm": 5.0, "grad_norm_var": 0.14833577473958334, "learning_rate": 0.0001, "loss": 5.3721, "loss/crossentropy": 2.243730306625366, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16088581830263138, "step": 6566 }, { "epoch": 0.29854545454545456, "grad_norm": 5.3125, "grad_norm_var": 0.12047119140625, "learning_rate": 0.0001, "loss": 5.735, "loss/crossentropy": 2.39736670255661, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17672723531723022, "step": 6568 }, { "epoch": 0.29863636363636364, "grad_norm": 5.40625, "grad_norm_var": 0.12613525390625, "learning_rate": 0.0001, "loss": 5.6848, "loss/crossentropy": 2.4619824290275574, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16856960207223892, "step": 6570 }, { "epoch": 0.29872727272727273, "grad_norm": 5.28125, "grad_norm_var": 0.12224934895833334, "learning_rate": 0.0001, "loss": 5.982, "loss/crossentropy": 2.505266308784485, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19103313237428665, "step": 6572 }, { "epoch": 0.2988181818181818, "grad_norm": 5.5625, "grad_norm_var": 0.08765869140625, "learning_rate": 0.0001, "loss": 5.9305, "loss/crossentropy": 2.505149781703949, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1843288093805313, "step": 6574 }, { "epoch": 0.2989090909090909, "grad_norm": 5.03125, "grad_norm_var": 0.10468343098958334, "learning_rate": 0.0001, "loss": 5.7429, "loss/crossentropy": 2.5155736804008484, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16902107745409012, "step": 6576 }, { "epoch": 0.299, "grad_norm": 6.03125, "grad_norm_var": 0.12812093098958333, "learning_rate": 0.0001, "loss": 6.0092, "loss/crossentropy": 2.623918890953064, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.1840352825820446, "step": 6578 }, { "epoch": 0.2990909090909091, "grad_norm": 4.75, "grad_norm_var": 0.11575113932291667, "learning_rate": 0.0001, "loss": 5.6656, "loss/crossentropy": 2.3682748079299927, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.17250679060816765, "step": 6580 }, { "epoch": 0.29918181818181816, "grad_norm": 5.3125, "grad_norm_var": 0.10050455729166667, "learning_rate": 0.0001, "loss": 5.8168, "loss/crossentropy": 2.525462806224823, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1740555614233017, "step": 6582 }, { "epoch": 0.29927272727272725, "grad_norm": 4.875, "grad_norm_var": 0.11461181640625, "learning_rate": 0.0001, "loss": 5.656, "loss/crossentropy": 2.415970206260681, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1689259484410286, "step": 6584 }, { "epoch": 0.2993636363636364, "grad_norm": 5.28125, "grad_norm_var": 0.11482747395833333, "learning_rate": 0.0001, "loss": 5.8722, "loss/crossentropy": 2.4952322840690613, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.18242036551237106, "step": 6586 }, { "epoch": 0.2994545454545455, "grad_norm": 5.0625, "grad_norm_var": 0.11646728515625, "learning_rate": 0.0001, "loss": 5.4681, "loss/crossentropy": 2.3172805309295654, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1635218746960163, "step": 6588 }, { "epoch": 0.29954545454545456, "grad_norm": 5.0, "grad_norm_var": 0.13059488932291666, "learning_rate": 0.0001, "loss": 5.8159, "loss/crossentropy": 2.433593988418579, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18198344111442566, "step": 6590 }, { "epoch": 0.29963636363636365, "grad_norm": 6.46875, "grad_norm_var": 0.21287434895833332, "learning_rate": 0.0001, "loss": 6.0559, "loss/crossentropy": 2.5927539467811584, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.18948018550872803, "step": 6592 }, { "epoch": 0.29972727272727273, "grad_norm": 5.8125, "grad_norm_var": 0.20439046223958332, "learning_rate": 0.0001, "loss": 5.8122, "loss/crossentropy": 2.55493625998497, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.17084529623389244, "step": 6594 }, { "epoch": 0.2998181818181818, "grad_norm": 5.84375, "grad_norm_var": 0.19840087890625, "learning_rate": 0.0001, "loss": 5.6195, "loss/crossentropy": 2.285454958677292, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.1757846586406231, "step": 6596 }, { "epoch": 0.2999090909090909, "grad_norm": 4.875, "grad_norm_var": 0.340234375, "learning_rate": 0.0001, "loss": 5.7751, "loss/crossentropy": 2.4246646761894226, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.17586781457066536, "step": 6598 }, { "epoch": 0.3, "grad_norm": 5.625, "grad_norm_var": 0.326416015625, "learning_rate": 0.0001, "loss": 5.5987, "loss/crossentropy": 2.3004738986492157, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.17337920889258385, "step": 6600 }, { "epoch": 0.3000909090909091, "grad_norm": 5.40625, "grad_norm_var": 0.32242431640625, "learning_rate": 0.0001, "loss": 6.2292, "loss/crossentropy": 2.687479019165039, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.19772286340594292, "step": 6602 }, { "epoch": 0.30018181818181816, "grad_norm": 5.625, "grad_norm_var": 0.3031534830729167, "learning_rate": 0.0001, "loss": 6.1452, "loss/crossentropy": 2.651427149772644, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1888299062848091, "step": 6604 }, { "epoch": 0.30027272727272725, "grad_norm": 5.375, "grad_norm_var": 0.35871988932291665, "learning_rate": 0.0001, "loss": 5.7098, "loss/crossentropy": 2.433163672685623, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.17512352019548416, "step": 6606 }, { "epoch": 0.3003636363636364, "grad_norm": 5.4375, "grad_norm_var": 0.32454020182291665, "learning_rate": 0.0001, "loss": 5.7421, "loss/crossentropy": 2.492726147174835, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17142273858189583, "step": 6608 }, { "epoch": 0.3004545454545455, "grad_norm": 6.15625, "grad_norm_var": 0.31573893229166666, "learning_rate": 0.0001, "loss": 6.0188, "loss/crossentropy": 2.572873830795288, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18600069358944893, "step": 6610 }, { "epoch": 0.30054545454545456, "grad_norm": 5.15625, "grad_norm_var": 0.3626912434895833, "learning_rate": 0.0001, "loss": 5.6168, "loss/crossentropy": 2.4497491121292114, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16396822035312653, "step": 6612 }, { "epoch": 0.30063636363636365, "grad_norm": 5.71875, "grad_norm_var": 0.23553059895833334, "learning_rate": 0.0001, "loss": 5.813, "loss/crossentropy": 2.4744390845298767, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.17897716164588928, "step": 6614 }, { "epoch": 0.30072727272727273, "grad_norm": 5.5625, "grad_norm_var": 0.22576497395833334, "learning_rate": 0.0001, "loss": 6.0436, "loss/crossentropy": 2.604487180709839, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18765868991613388, "step": 6616 }, { "epoch": 0.3008181818181818, "grad_norm": 5.0625, "grad_norm_var": 2.7483723958333335, "learning_rate": 0.0001, "loss": 6.0666, "loss/crossentropy": 2.6116737723350525, "loss/hidden": 1.654296875, "loss/jsd": 0.0, "loss/logits": 0.18006427958607674, "step": 6618 }, { "epoch": 0.3009090909090909, "grad_norm": 5.46875, "grad_norm_var": 2.8042277018229167, "learning_rate": 0.0001, "loss": 5.4866, "loss/crossentropy": 2.28801953792572, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1663440763950348, "step": 6620 }, { "epoch": 0.301, "grad_norm": 7.4375, "grad_norm_var": 2.909049479166667, "learning_rate": 0.0001, "loss": 6.2574, "loss/crossentropy": 2.7142525911331177, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19767284765839577, "step": 6622 }, { "epoch": 0.3010909090909091, "grad_norm": 5.59375, "grad_norm_var": 2.9029947916666665, "learning_rate": 0.0001, "loss": 5.9568, "loss/crossentropy": 2.5691938400268555, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18173380568623543, "step": 6624 }, { "epoch": 0.30118181818181816, "grad_norm": 5.65625, "grad_norm_var": 2.8587849934895835, "learning_rate": 0.0001, "loss": 5.8961, "loss/crossentropy": 2.4423163533210754, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18795661255717278, "step": 6626 }, { "epoch": 0.30127272727272725, "grad_norm": 5.1875, "grad_norm_var": 2.7727701822916666, "learning_rate": 0.0001, "loss": 5.5023, "loss/crossentropy": 2.2815914154052734, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.16601505875587463, "step": 6628 }, { "epoch": 0.3013636363636364, "grad_norm": 6.21875, "grad_norm_var": 2.785416666666667, "learning_rate": 0.0001, "loss": 6.0243, "loss/crossentropy": 2.5783817768096924, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18599765747785568, "step": 6630 }, { "epoch": 0.3014545454545455, "grad_norm": 5.1875, "grad_norm_var": 2.8307291666666665, "learning_rate": 0.0001, "loss": 5.7105, "loss/crossentropy": 2.4382545351982117, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.17195503041148186, "step": 6632 }, { "epoch": 0.30154545454545456, "grad_norm": 5.46875, "grad_norm_var": 0.3597005208333333, "learning_rate": 0.0001, "loss": 6.0098, "loss/crossentropy": 2.500635027885437, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19037288054823875, "step": 6634 }, { "epoch": 0.30163636363636365, "grad_norm": 4.9375, "grad_norm_var": 0.3706868489583333, "learning_rate": 0.0001, "loss": 5.6041, "loss/crossentropy": 2.355778753757477, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.16936635598540306, "step": 6636 }, { "epoch": 0.30172727272727273, "grad_norm": 5.15625, "grad_norm_var": 0.1529296875, "learning_rate": 0.0001, "loss": 5.8771, "loss/crossentropy": 2.585618495941162, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.17621426284313202, "step": 6638 }, { "epoch": 0.3018181818181818, "grad_norm": 5.34375, "grad_norm_var": 0.16422119140625, "learning_rate": 0.0001, "loss": 5.6246, "loss/crossentropy": 2.3690313696861267, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1696997657418251, "step": 6640 }, { "epoch": 0.3019090909090909, "grad_norm": 5.875, "grad_norm_var": 0.1220703125, "learning_rate": 0.0001, "loss": 6.13, "loss/crossentropy": 2.6394307613372803, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1885085590183735, "step": 6642 }, { "epoch": 0.302, "grad_norm": 5.09375, "grad_norm_var": 0.12688802083333334, "learning_rate": 0.0001, "loss": 5.9484, "loss/crossentropy": 2.6296958923339844, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17874616011977196, "step": 6644 }, { "epoch": 0.3020909090909091, "grad_norm": 5.03125, "grad_norm_var": 0.07717692057291667, "learning_rate": 0.0001, "loss": 5.7803, "loss/crossentropy": 2.4806795716285706, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17332225292921066, "step": 6646 }, { "epoch": 0.30218181818181816, "grad_norm": 4.9375, "grad_norm_var": 0.08567708333333333, "learning_rate": 0.0001, "loss": 5.9112, "loss/crossentropy": 2.5275455117225647, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18093906715512276, "step": 6648 }, { "epoch": 0.30227272727272725, "grad_norm": 5.625, "grad_norm_var": 0.13778889973958333, "learning_rate": 0.0001, "loss": 5.8348, "loss/crossentropy": 2.5052413940429688, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.1722174882888794, "step": 6650 }, { "epoch": 0.3023636363636364, "grad_norm": 5.5, "grad_norm_var": 0.17498372395833334, "learning_rate": 0.0001, "loss": 5.8649, "loss/crossentropy": 2.5031628012657166, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1779722422361374, "step": 6652 }, { "epoch": 0.3024545454545455, "grad_norm": 5.0625, "grad_norm_var": 0.17766927083333334, "learning_rate": 0.0001, "loss": 5.5734, "loss/crossentropy": 2.278750777244568, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.1714612916111946, "step": 6654 }, { "epoch": 0.30254545454545456, "grad_norm": 5.21875, "grad_norm_var": 0.16669514973958333, "learning_rate": 0.0001, "loss": 5.4997, "loss/crossentropy": 2.299560308456421, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.16474349424242973, "step": 6656 }, { "epoch": 0.30263636363636365, "grad_norm": 5.65625, "grad_norm_var": 0.16226806640625, "learning_rate": 0.0001, "loss": 6.1364, "loss/crossentropy": 2.678773820400238, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.19009432196617126, "step": 6658 }, { "epoch": 0.30272727272727273, "grad_norm": 5.3125, "grad_norm_var": 0.6963826497395833, "learning_rate": 0.0001, "loss": 6.0307, "loss/crossentropy": 2.5744833946228027, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18898599222302437, "step": 6660 }, { "epoch": 0.3028181818181818, "grad_norm": 5.4375, "grad_norm_var": 0.6725260416666666, "learning_rate": 0.0001, "loss": 5.8462, "loss/crossentropy": 2.502289831638336, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17852694913744926, "step": 6662 }, { "epoch": 0.3029090909090909, "grad_norm": 5.15625, "grad_norm_var": 0.6462076822916667, "learning_rate": 0.0001, "loss": 5.7408, "loss/crossentropy": 2.481033504009247, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17050732299685478, "step": 6664 }, { "epoch": 0.303, "grad_norm": 4.90625, "grad_norm_var": 0.65914306640625, "learning_rate": 0.0001, "loss": 5.1489, "loss/crossentropy": 2.0397094786167145, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1566176638007164, "step": 6666 }, { "epoch": 0.3030909090909091, "grad_norm": 5.40625, "grad_norm_var": 0.6567667643229167, "learning_rate": 0.0001, "loss": 5.7048, "loss/crossentropy": 2.4634463787078857, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.16945136338472366, "step": 6668 }, { "epoch": 0.30318181818181816, "grad_norm": 5.28125, "grad_norm_var": 0.6652180989583333, "learning_rate": 0.0001, "loss": 5.8777, "loss/crossentropy": 2.4693605303764343, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.18399635329842567, "step": 6670 }, { "epoch": 0.30327272727272725, "grad_norm": 5.34375, "grad_norm_var": 0.66295166015625, "learning_rate": 0.0001, "loss": 5.5334, "loss/crossentropy": 2.2958070635795593, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.16711437702178955, "step": 6672 }, { "epoch": 0.3033636363636364, "grad_norm": 4.875, "grad_norm_var": 0.6914021809895833, "learning_rate": 0.0001, "loss": 5.5465, "loss/crossentropy": 2.2785834074020386, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17210310325026512, "step": 6674 }, { "epoch": 0.3034545454545455, "grad_norm": 5.09375, "grad_norm_var": 0.08730061848958333, "learning_rate": 0.0001, "loss": 6.056, "loss/crossentropy": 2.6750616431236267, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18301242217421532, "step": 6676 }, { "epoch": 0.30354545454545456, "grad_norm": 5.1875, "grad_norm_var": 0.08560791015625, "learning_rate": 0.0001, "loss": 5.721, "loss/crossentropy": 2.472710371017456, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.16896959394216537, "step": 6678 }, { "epoch": 0.30363636363636365, "grad_norm": 5.25, "grad_norm_var": 0.090087890625, "learning_rate": 0.0001, "loss": 5.4067, "loss/crossentropy": 2.277940511703491, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.15877820551395416, "step": 6680 }, { "epoch": 0.30372727272727273, "grad_norm": 5.59375, "grad_norm_var": 0.08040364583333333, "learning_rate": 0.0001, "loss": 5.8778, "loss/crossentropy": 2.540711522102356, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17668123543262482, "step": 6682 }, { "epoch": 0.3038181818181818, "grad_norm": 5.84375, "grad_norm_var": 0.14375, "learning_rate": 0.0001, "loss": 5.8872, "loss/crossentropy": 2.539237529039383, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17776435613632202, "step": 6684 }, { "epoch": 0.3039090909090909, "grad_norm": 4.875, "grad_norm_var": 0.12395833333333334, "learning_rate": 0.0001, "loss": 6.1393, "loss/crossentropy": 2.753306210041046, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.1829376146197319, "step": 6686 }, { "epoch": 0.304, "grad_norm": 5.21875, "grad_norm_var": 0.12526041666666668, "learning_rate": 0.0001, "loss": 5.7321, "loss/crossentropy": 2.4185855388641357, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17471306025981903, "step": 6688 }, { "epoch": 0.3040909090909091, "grad_norm": 5.28125, "grad_norm_var": 0.11170247395833334, "learning_rate": 0.0001, "loss": 5.677, "loss/crossentropy": 2.3336669206619263, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.17867084220051765, "step": 6690 }, { "epoch": 0.30418181818181816, "grad_norm": 5.21875, "grad_norm_var": 0.18808186848958333, "learning_rate": 0.0001, "loss": 5.9849, "loss/crossentropy": 2.563067138195038, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.18495334684848785, "step": 6692 }, { "epoch": 0.30427272727272725, "grad_norm": 4.875, "grad_norm_var": 0.21028645833333334, "learning_rate": 0.0001, "loss": 5.3265, "loss/crossentropy": 2.1315263509750366, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.16266366094350815, "step": 6694 }, { "epoch": 0.30436363636363634, "grad_norm": 4.96875, "grad_norm_var": 0.20735270182291668, "learning_rate": 0.0001, "loss": 5.3131, "loss/crossentropy": 2.0882128179073334, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.16291940584778786, "step": 6696 }, { "epoch": 0.3044545454545455, "grad_norm": 5.25, "grad_norm_var": 0.20855712890625, "learning_rate": 0.0001, "loss": 5.9743, "loss/crossentropy": 2.6105095744132996, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18208226934075356, "step": 6698 }, { "epoch": 0.30454545454545456, "grad_norm": 5.65625, "grad_norm_var": 0.15777587890625, "learning_rate": 0.0001, "loss": 5.7741, "loss/crossentropy": 2.498719274997711, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.17226507142186165, "step": 6700 }, { "epoch": 0.30463636363636365, "grad_norm": 5.6875, "grad_norm_var": 0.14498291015625, "learning_rate": 0.0001, "loss": 5.7598, "loss/crossentropy": 2.4472256302833557, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17422191053628922, "step": 6702 }, { "epoch": 0.30472727272727274, "grad_norm": 5.0625, "grad_norm_var": 0.1494140625, "learning_rate": 0.0001, "loss": 5.5286, "loss/crossentropy": 2.240762233734131, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.1719437651336193, "step": 6704 }, { "epoch": 0.3048181818181818, "grad_norm": 5.78125, "grad_norm_var": 0.15623372395833332, "learning_rate": 0.0001, "loss": 5.731, "loss/crossentropy": 2.3897168934345245, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.17339036986231804, "step": 6706 }, { "epoch": 0.3049090909090909, "grad_norm": 5.5625, "grad_norm_var": 0.08821614583333333, "learning_rate": 0.0001, "loss": 5.8585, "loss/crossentropy": 2.5632822513580322, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.17268243804574013, "step": 6708 }, { "epoch": 0.305, "grad_norm": 5.34375, "grad_norm_var": 0.06116129557291667, "learning_rate": 0.0001, "loss": 5.5092, "loss/crossentropy": 2.268141031265259, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.16883350908756256, "step": 6710 }, { "epoch": 0.3050909090909091, "grad_norm": 4.5625, "grad_norm_var": 0.11717122395833333, "learning_rate": 0.0001, "loss": 5.3201, "loss/crossentropy": 2.195032089948654, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1570342518389225, "step": 6712 }, { "epoch": 0.30518181818181817, "grad_norm": 5.9375, "grad_norm_var": 0.13821614583333333, "learning_rate": 0.0001, "loss": 5.5204, "loss/crossentropy": 2.2466814815998077, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.1685827486217022, "step": 6714 }, { "epoch": 0.30527272727272725, "grad_norm": 5.71875, "grad_norm_var": 0.13396809895833334, "learning_rate": 0.0001, "loss": 5.9243, "loss/crossentropy": 2.6436794996261597, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.16985806450247765, "step": 6716 }, { "epoch": 0.30536363636363634, "grad_norm": 4.90625, "grad_norm_var": 0.14836832682291667, "learning_rate": 0.0001, "loss": 5.5658, "loss/crossentropy": 2.3271549940109253, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.16761675104498863, "step": 6718 }, { "epoch": 0.3054545454545455, "grad_norm": 5.78125, "grad_norm_var": 0.17327067057291667, "learning_rate": 0.0001, "loss": 6.2505, "loss/crossentropy": 2.7384812235832214, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.1947612129151821, "step": 6720 }, { "epoch": 0.30554545454545456, "grad_norm": 4.90625, "grad_norm_var": 0.1880859375, "learning_rate": 0.0001, "loss": 5.8111, "loss/crossentropy": 2.5117342472076416, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17407281696796417, "step": 6722 }, { "epoch": 0.30563636363636365, "grad_norm": 6.9375, "grad_norm_var": 0.335400390625, "learning_rate": 0.0001, "loss": 5.7052, "loss/crossentropy": 2.3668654561042786, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.17602579668164253, "step": 6724 }, { "epoch": 0.30572727272727274, "grad_norm": 5.09375, "grad_norm_var": 0.36067301432291665, "learning_rate": 0.0001, "loss": 5.8842, "loss/crossentropy": 2.4975239038467407, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.1802656203508377, "step": 6726 }, { "epoch": 0.3058181818181818, "grad_norm": 6.09375, "grad_norm_var": 0.320556640625, "learning_rate": 0.0001, "loss": 5.7054, "loss/crossentropy": 2.4592906832695007, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1718798242509365, "step": 6728 }, { "epoch": 0.3059090909090909, "grad_norm": 6.125, "grad_norm_var": 0.3337198893229167, "learning_rate": 0.0001, "loss": 6.0317, "loss/crossentropy": 2.637117385864258, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1812579110264778, "step": 6730 }, { "epoch": 0.306, "grad_norm": 5.65625, "grad_norm_var": 0.35911458333333335, "learning_rate": 0.0001, "loss": 5.708, "loss/crossentropy": 2.509102165699005, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.1650029979646206, "step": 6732 }, { "epoch": 0.3060909090909091, "grad_norm": 4.90625, "grad_norm_var": 0.36868489583333336, "learning_rate": 0.0001, "loss": 5.9932, "loss/crossentropy": 2.655020833015442, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.18089309707283974, "step": 6734 }, { "epoch": 0.30618181818181817, "grad_norm": 5.375, "grad_norm_var": 0.36031494140625, "learning_rate": 0.0001, "loss": 5.7009, "loss/crossentropy": 2.3674222230911255, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.17534280940890312, "step": 6736 }, { "epoch": 0.30627272727272725, "grad_norm": 5.28125, "grad_norm_var": 0.3611328125, "learning_rate": 0.0001, "loss": 5.8314, "loss/crossentropy": 2.5281983613967896, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.17504702508449554, "step": 6738 }, { "epoch": 0.30636363636363634, "grad_norm": 5.375, "grad_norm_var": 0.20396728515625, "learning_rate": 0.0001, "loss": 5.7394, "loss/crossentropy": 2.4190593957901, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.17266285046935081, "step": 6740 }, { "epoch": 0.3064545454545455, "grad_norm": 5.4375, "grad_norm_var": 0.17472330729166666, "learning_rate": 0.0001, "loss": 5.7911, "loss/crossentropy": 2.5496371388435364, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.16965192928910255, "step": 6742 }, { "epoch": 0.30654545454545457, "grad_norm": 5.53125, "grad_norm_var": 0.13632405598958333, "learning_rate": 0.0001, "loss": 6.034, "loss/crossentropy": 2.5889362394809723, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1878686137497425, "step": 6744 }, { "epoch": 0.30663636363636365, "grad_norm": 6.0625, "grad_norm_var": 0.13020426432291668, "learning_rate": 0.0001, "loss": 5.7687, "loss/crossentropy": 2.397672802209854, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.179484061896801, "step": 6746 }, { "epoch": 0.30672727272727274, "grad_norm": 5.28125, "grad_norm_var": 0.12233072916666667, "learning_rate": 0.0001, "loss": 5.68, "loss/crossentropy": 2.3882066011428833, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1752701997756958, "step": 6748 }, { "epoch": 0.3068181818181818, "grad_norm": 4.9375, "grad_norm_var": 0.12235921223958333, "learning_rate": 0.0001, "loss": 5.7554, "loss/crossentropy": 2.505259394645691, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.17090748250484467, "step": 6750 }, { "epoch": 0.3069090909090909, "grad_norm": 6.78125, "grad_norm_var": 0.24745686848958334, "learning_rate": 0.0001, "loss": 5.7131, "loss/crossentropy": 2.2914520502090454, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.18298565223813057, "step": 6752 }, { "epoch": 0.307, "grad_norm": 6.09375, "grad_norm_var": 0.4578776041666667, "learning_rate": 0.0001, "loss": 5.4962, "loss/crossentropy": 2.1311601102352142, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.17810554429888725, "step": 6754 }, { "epoch": 0.3070909090909091, "grad_norm": 5.84375, "grad_norm_var": 0.4509073893229167, "learning_rate": 0.0001, "loss": 6.1645, "loss/crossentropy": 2.6232816576957703, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19553092122077942, "step": 6756 }, { "epoch": 0.30718181818181817, "grad_norm": 5.625, "grad_norm_var": 0.43255208333333334, "learning_rate": 0.0001, "loss": 5.9953, "loss/crossentropy": 2.524265766143799, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.18753265962004662, "step": 6758 }, { "epoch": 0.30727272727272725, "grad_norm": 6.78125, "grad_norm_var": 0.5102701822916667, "learning_rate": 0.0001, "loss": 5.8984, "loss/crossentropy": 2.5082197189331055, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.17768927663564682, "step": 6760 }, { "epoch": 0.30736363636363634, "grad_norm": 5.375, "grad_norm_var": 0.518603515625, "learning_rate": 0.0001, "loss": 5.5982, "loss/crossentropy": 2.340929925441742, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1683022528886795, "step": 6762 }, { "epoch": 0.3074545454545455, "grad_norm": 5.59375, "grad_norm_var": 0.5148274739583333, "learning_rate": 0.0001, "loss": 5.3494, "loss/crossentropy": 2.150405764579773, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.16208837553858757, "step": 6764 }, { "epoch": 0.30754545454545457, "grad_norm": 5.75, "grad_norm_var": 0.46925455729166665, "learning_rate": 0.0001, "loss": 6.0472, "loss/crossentropy": 2.6315059065818787, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.18316753581166267, "step": 6766 }, { "epoch": 0.30763636363636365, "grad_norm": 4.90625, "grad_norm_var": 0.38424072265625, "learning_rate": 0.0001, "loss": 5.624, "loss/crossentropy": 2.3608171343803406, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1692887581884861, "step": 6768 }, { "epoch": 0.30772727272727274, "grad_norm": 6.90625, "grad_norm_var": 0.32037760416666666, "learning_rate": 0.0001, "loss": 6.0668, "loss/crossentropy": 2.630467414855957, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18699396774172783, "step": 6770 }, { "epoch": 0.3078181818181818, "grad_norm": 5.1875, "grad_norm_var": 0.335791015625, "learning_rate": 0.0001, "loss": 5.7226, "loss/crossentropy": 2.467108726501465, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.16754180565476418, "step": 6772 }, { "epoch": 0.3079090909090909, "grad_norm": 5.34375, "grad_norm_var": 0.332421875, "learning_rate": 0.0001, "loss": 5.9626, "loss/crossentropy": 2.586606740951538, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.18349330499768257, "step": 6774 }, { "epoch": 0.308, "grad_norm": 5.96875, "grad_norm_var": 0.25243733723958334, "learning_rate": 0.0001, "loss": 5.7406, "loss/crossentropy": 2.477500319480896, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.17220743373036385, "step": 6776 }, { "epoch": 0.3080909090909091, "grad_norm": 5.03125, "grad_norm_var": 0.26223958333333336, "learning_rate": 0.0001, "loss": 5.0507, "loss/crossentropy": 2.011750966310501, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.1497909277677536, "step": 6778 }, { "epoch": 0.30818181818181817, "grad_norm": 5.375, "grad_norm_var": 0.24999593098958334, "learning_rate": 0.0001, "loss": 6.1763, "loss/crossentropy": 2.669061064720154, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.19037719443440437, "step": 6780 }, { "epoch": 0.30827272727272725, "grad_norm": 5.28125, "grad_norm_var": 0.2416015625, "learning_rate": 0.0001, "loss": 5.486, "loss/crossentropy": 2.2485924065113068, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.16807199269533157, "step": 6782 }, { "epoch": 0.30836363636363634, "grad_norm": 5.03125, "grad_norm_var": 0.23463541666666668, "learning_rate": 0.0001, "loss": 5.6918, "loss/crossentropy": 2.396824389696121, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17403024435043335, "step": 6784 }, { "epoch": 0.3084545454545455, "grad_norm": 5.1875, "grad_norm_var": 0.09882405598958334, "learning_rate": 0.0001, "loss": 5.4491, "loss/crossentropy": 2.300634950399399, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.1603504866361618, "step": 6786 }, { "epoch": 0.30854545454545457, "grad_norm": 5.5, "grad_norm_var": 0.09934488932291667, "learning_rate": 0.0001, "loss": 6.1538, "loss/crossentropy": 2.6931806206703186, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.18961849063634872, "step": 6788 }, { "epoch": 0.30863636363636365, "grad_norm": 5.03125, "grad_norm_var": 0.10467122395833334, "learning_rate": 0.0001, "loss": 5.6708, "loss/crossentropy": 2.3487836122512817, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.1720479242503643, "step": 6790 }, { "epoch": 0.30872727272727274, "grad_norm": 6.875, "grad_norm_var": 0.2577107747395833, "learning_rate": 0.0001, "loss": 5.6818, "loss/crossentropy": 2.4103517830371857, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17323649674654007, "step": 6792 }, { "epoch": 0.3088181818181818, "grad_norm": 5.40625, "grad_norm_var": 0.2633748372395833, "learning_rate": 0.0001, "loss": 5.6986, "loss/crossentropy": 2.4377059936523438, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.17238206416368484, "step": 6794 }, { "epoch": 0.3089090909090909, "grad_norm": 7.5625, "grad_norm_var": 0.57496337890625, "learning_rate": 0.0001, "loss": 5.3347, "loss/crossentropy": 2.1768576204776764, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16188236698508263, "step": 6796 }, { "epoch": 0.309, "grad_norm": 5.09375, "grad_norm_var": 0.582275390625, "learning_rate": 0.0001, "loss": 5.9346, "loss/crossentropy": 2.590120792388916, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.17760981991887093, "step": 6798 }, { "epoch": 0.3090909090909091, "grad_norm": 5.8125, "grad_norm_var": 0.7708170572916667, "learning_rate": 0.0001, "loss": 6.0087, "loss/crossentropy": 2.551900327205658, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.1835717372596264, "step": 6800 }, { "epoch": 0.30918181818181817, "grad_norm": 4.875, "grad_norm_var": 0.7747233072916667, "learning_rate": 0.0001, "loss": 5.6353, "loss/crossentropy": 2.396121710538864, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1704035848379135, "step": 6802 }, { "epoch": 0.30927272727272725, "grad_norm": 5.40625, "grad_norm_var": 0.7703125, "learning_rate": 0.0001, "loss": 5.832, "loss/crossentropy": 2.532931089401245, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1752237193286419, "step": 6804 }, { "epoch": 0.30936363636363634, "grad_norm": 6.21875, "grad_norm_var": 0.8184529622395833, "learning_rate": 0.0001, "loss": 5.8602, "loss/crossentropy": 2.4500394463539124, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1839882992208004, "step": 6806 }, { "epoch": 0.3094545454545455, "grad_norm": 5.40625, "grad_norm_var": 0.66041259765625, "learning_rate": 0.0001, "loss": 6.1298, "loss/crossentropy": 2.6541372537612915, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.190341517329216, "step": 6808 }, { "epoch": 0.30954545454545457, "grad_norm": 6.03125, "grad_norm_var": 5.833919270833333, "learning_rate": 0.0001, "loss": 6.0229, "loss/crossentropy": 2.488401770591736, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.1950531266629696, "step": 6810 }, { "epoch": 0.30963636363636365, "grad_norm": 5.40625, "grad_norm_var": 5.737483723958333, "learning_rate": 0.0001, "loss": 5.4229, "loss/crossentropy": 2.2493740916252136, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.16325125843286514, "step": 6812 }, { "epoch": 0.30972727272727274, "grad_norm": 5.96875, "grad_norm_var": 5.6845703125, "learning_rate": 0.0001, "loss": 5.4299, "loss/crossentropy": 2.1968085765838623, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.16686851158738136, "step": 6814 }, { "epoch": 0.3098181818181818, "grad_norm": 5.25, "grad_norm_var": 5.689306640625, "learning_rate": 0.0001, "loss": 5.872, "loss/crossentropy": 2.4977970719337463, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.1809723973274231, "step": 6816 }, { "epoch": 0.3099090909090909, "grad_norm": 5.8125, "grad_norm_var": 5.4892578125, "learning_rate": 0.0001, "loss": 6.19, "loss/crossentropy": 2.692014694213867, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19120166450738907, "step": 6818 }, { "epoch": 0.31, "grad_norm": 4.9375, "grad_norm_var": 5.529150390625, "learning_rate": 0.0001, "loss": 5.6592, "loss/crossentropy": 2.376803696155548, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1719900593161583, "step": 6820 }, { "epoch": 0.3100909090909091, "grad_norm": 5.21875, "grad_norm_var": 5.617561848958333, "learning_rate": 0.0001, "loss": 6.0138, "loss/crossentropy": 2.6132513284683228, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.1808798983693123, "step": 6822 }, { "epoch": 0.31018181818181817, "grad_norm": 5.03125, "grad_norm_var": 5.681734212239584, "learning_rate": 0.0001, "loss": 5.6691, "loss/crossentropy": 2.427042007446289, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17225748300552368, "step": 6824 }, { "epoch": 0.31027272727272726, "grad_norm": 5.46875, "grad_norm_var": 0.08201497395833333, "learning_rate": 0.0001, "loss": 5.7422, "loss/crossentropy": 2.4376882910728455, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17459092289209366, "step": 6826 }, { "epoch": 0.31036363636363634, "grad_norm": 5.375, "grad_norm_var": 0.08958333333333333, "learning_rate": 0.0001, "loss": 5.8799, "loss/crossentropy": 2.5064918398857117, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.18011458590626717, "step": 6828 }, { "epoch": 0.3104545454545454, "grad_norm": 5.1875, "grad_norm_var": 0.06750895182291666, "learning_rate": 0.0001, "loss": 6.3544, "loss/crossentropy": 2.848910331726074, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.19547269120812416, "step": 6830 }, { "epoch": 0.31054545454545457, "grad_norm": 5.0, "grad_norm_var": 0.0728515625, "learning_rate": 0.0001, "loss": 5.7988, "loss/crossentropy": 2.508919596672058, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.17488804087042809, "step": 6832 }, { "epoch": 0.31063636363636365, "grad_norm": 5.5625, "grad_norm_var": 0.055074055989583336, "learning_rate": 0.0001, "loss": 6.1118, "loss/crossentropy": 2.666120409965515, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18597598001360893, "step": 6834 }, { "epoch": 0.31072727272727274, "grad_norm": 5.25, "grad_norm_var": 0.05074462890625, "learning_rate": 0.0001, "loss": 6.0625, "loss/crossentropy": 2.631662368774414, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18644659221172333, "step": 6836 }, { "epoch": 0.3108181818181818, "grad_norm": 5.15625, "grad_norm_var": 0.05142822265625, "learning_rate": 0.0001, "loss": 5.5229, "loss/crossentropy": 2.3026795089244843, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.1683108899742365, "step": 6838 }, { "epoch": 0.3109090909090909, "grad_norm": 5.5625, "grad_norm_var": 0.07044270833333334, "learning_rate": 0.0001, "loss": 5.7272, "loss/crossentropy": 2.4265310168266296, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.17206371575593948, "step": 6840 }, { "epoch": 0.311, "grad_norm": 5.09375, "grad_norm_var": 0.07942301432291667, "learning_rate": 0.0001, "loss": 5.4799, "loss/crossentropy": 2.2978036403656006, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16449791565537453, "step": 6842 }, { "epoch": 0.3110909090909091, "grad_norm": 5.46875, "grad_norm_var": 0.08287760416666666, "learning_rate": 0.0001, "loss": 6.1686, "loss/crossentropy": 2.7183183431625366, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1872176118195057, "step": 6844 }, { "epoch": 0.31118181818181817, "grad_norm": 5.09375, "grad_norm_var": 0.10823160807291667, "learning_rate": 0.0001, "loss": 5.5577, "loss/crossentropy": 2.3741630613803864, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.16132068634033203, "step": 6846 }, { "epoch": 0.31127272727272726, "grad_norm": 5.0, "grad_norm_var": 0.10357666015625, "learning_rate": 0.0001, "loss": 5.628, "loss/crossentropy": 2.368086576461792, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17052729800343513, "step": 6848 }, { "epoch": 0.31136363636363634, "grad_norm": 5.25, "grad_norm_var": 0.10115559895833333, "learning_rate": 0.0001, "loss": 5.7539, "loss/crossentropy": 2.43812096118927, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17532476410269737, "step": 6850 }, { "epoch": 0.31145454545454543, "grad_norm": 4.8125, "grad_norm_var": 0.10621337890625, "learning_rate": 0.0001, "loss": 5.3542, "loss/crossentropy": 2.1917944252490997, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.15960419178009033, "step": 6852 }, { "epoch": 0.31154545454545457, "grad_norm": 5.5625, "grad_norm_var": 0.10963134765625, "learning_rate": 0.0001, "loss": 5.747, "loss/crossentropy": 2.462857961654663, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17411481589078903, "step": 6854 }, { "epoch": 0.31163636363636366, "grad_norm": 5.25, "grad_norm_var": 0.09127604166666667, "learning_rate": 0.0001, "loss": 5.6941, "loss/crossentropy": 2.4614263772964478, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.16896935924887657, "step": 6856 }, { "epoch": 0.31172727272727274, "grad_norm": 5.46875, "grad_norm_var": 0.080322265625, "learning_rate": 0.0001, "loss": 5.4846, "loss/crossentropy": 2.270065128803253, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.1657896563410759, "step": 6858 }, { "epoch": 0.3118181818181818, "grad_norm": 4.84375, "grad_norm_var": 0.08401285807291667, "learning_rate": 0.0001, "loss": 5.7071, "loss/crossentropy": 2.4529592394828796, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.1701386161148548, "step": 6860 }, { "epoch": 0.3119090909090909, "grad_norm": 5.59375, "grad_norm_var": 0.0587890625, "learning_rate": 0.0001, "loss": 5.8805, "loss/crossentropy": 2.4907527565956116, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1835065819323063, "step": 6862 }, { "epoch": 0.312, "grad_norm": 5.5, "grad_norm_var": 0.057535807291666664, "learning_rate": 0.0001, "loss": 5.8378, "loss/crossentropy": 2.510494649410248, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17725712060928345, "step": 6864 }, { "epoch": 0.3120909090909091, "grad_norm": 5.34375, "grad_norm_var": 0.061181640625, "learning_rate": 0.0001, "loss": 5.7512, "loss/crossentropy": 2.3311347663402557, "loss/hidden": 1.619140625, "loss/jsd": 0.0, "loss/logits": 0.18009237572550774, "step": 6866 }, { "epoch": 0.31218181818181817, "grad_norm": 5.75, "grad_norm_var": 0.0541015625, "learning_rate": 0.0001, "loss": 5.6227, "loss/crossentropy": 2.3962608575820923, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.16619378700852394, "step": 6868 }, { "epoch": 0.31227272727272726, "grad_norm": 5.40625, "grad_norm_var": 0.054947916666666666, "learning_rate": 0.0001, "loss": 6.0457, "loss/crossentropy": 2.531989336013794, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19355924054980278, "step": 6870 }, { "epoch": 0.31236363636363634, "grad_norm": 5.09375, "grad_norm_var": 0.06829427083333334, "learning_rate": 0.0001, "loss": 5.3745, "loss/crossentropy": 2.1863067746162415, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.1635504774749279, "step": 6872 }, { "epoch": 0.31245454545454543, "grad_norm": 4.875, "grad_norm_var": 0.08127848307291667, "learning_rate": 0.0001, "loss": 5.7731, "loss/crossentropy": 2.480404853820801, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.17438506335020065, "step": 6874 }, { "epoch": 0.31254545454545457, "grad_norm": 5.65625, "grad_norm_var": 0.07148030598958334, "learning_rate": 0.0001, "loss": 5.9974, "loss/crossentropy": 2.5743629932403564, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18488332629203796, "step": 6876 }, { "epoch": 0.31263636363636366, "grad_norm": 6.15625, "grad_norm_var": 0.10670166015625, "learning_rate": 0.0001, "loss": 6.065, "loss/crossentropy": 2.6272627115249634, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.1877221316099167, "step": 6878 }, { "epoch": 0.31272727272727274, "grad_norm": 4.5, "grad_norm_var": 0.48778889973958334, "learning_rate": 0.0001, "loss": 5.7646, "loss/crossentropy": 2.5111544132232666, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17261124216020107, "step": 6880 }, { "epoch": 0.31281818181818183, "grad_norm": 6.09375, "grad_norm_var": 0.56754150390625, "learning_rate": 0.0001, "loss": 5.6688, "loss/crossentropy": 2.391104578971863, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17347493767738342, "step": 6882 }, { "epoch": 0.3129090909090909, "grad_norm": 5.8125, "grad_norm_var": 0.56412353515625, "learning_rate": 0.0001, "loss": 6.1315, "loss/crossentropy": 2.6653889417648315, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.18782251700758934, "step": 6884 }, { "epoch": 0.313, "grad_norm": 5.4375, "grad_norm_var": 0.5648274739583333, "learning_rate": 0.0001, "loss": 5.7029, "loss/crossentropy": 2.4322752952575684, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1727668046951294, "step": 6886 }, { "epoch": 0.3130909090909091, "grad_norm": 5.6875, "grad_norm_var": 0.5520792643229167, "learning_rate": 0.0001, "loss": 5.6053, "loss/crossentropy": 2.3281836807727814, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.16716356948018074, "step": 6888 }, { "epoch": 0.3131818181818182, "grad_norm": 5.4375, "grad_norm_var": 0.5671875, "learning_rate": 0.0001, "loss": 5.7275, "loss/crossentropy": 2.4659522473812103, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.17244413681328297, "step": 6890 }, { "epoch": 0.31327272727272726, "grad_norm": 5.34375, "grad_norm_var": 0.5691243489583333, "learning_rate": 0.0001, "loss": 6.1304, "loss/crossentropy": 2.6410909295082092, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19228537008166313, "step": 6892 }, { "epoch": 0.31336363636363634, "grad_norm": 5.34375, "grad_norm_var": 0.5401326497395833, "learning_rate": 0.0001, "loss": 6.0347, "loss/crossentropy": 2.573694169521332, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18828653916716576, "step": 6894 }, { "epoch": 0.31345454545454543, "grad_norm": 4.84375, "grad_norm_var": 0.171875, "learning_rate": 0.0001, "loss": 5.8404, "loss/crossentropy": 2.524019181728363, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.1751917228102684, "step": 6896 }, { "epoch": 0.31354545454545457, "grad_norm": 5.28125, "grad_norm_var": 0.09836832682291667, "learning_rate": 0.0001, "loss": 5.9303, "loss/crossentropy": 2.5433506965637207, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.18225103244185448, "step": 6898 }, { "epoch": 0.31363636363636366, "grad_norm": 5.0625, "grad_norm_var": 0.08811442057291667, "learning_rate": 0.0001, "loss": 5.7036, "loss/crossentropy": 2.3973066806793213, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.1757492981851101, "step": 6900 }, { "epoch": 0.31372727272727274, "grad_norm": 9.75, "grad_norm_var": 1.35953369140625, "learning_rate": 0.0001, "loss": 5.8652, "loss/crossentropy": 2.477041095495224, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18217642977833748, "step": 6902 }, { "epoch": 0.31381818181818183, "grad_norm": 5.25, "grad_norm_var": 1.3421875, "learning_rate": 0.0001, "loss": 5.8809, "loss/crossentropy": 2.529063045978546, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.17873459681868553, "step": 6904 }, { "epoch": 0.3139090909090909, "grad_norm": 5.375, "grad_norm_var": 1.32261962890625, "learning_rate": 0.0001, "loss": 5.5521, "loss/crossentropy": 2.4050039052963257, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16158539429306984, "step": 6906 }, { "epoch": 0.314, "grad_norm": 6.53125, "grad_norm_var": 1.3868448893229166, "learning_rate": 0.0001, "loss": 5.9513, "loss/crossentropy": 2.5326806902885437, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18482979387044907, "step": 6908 }, { "epoch": 0.3140909090909091, "grad_norm": 5.21875, "grad_norm_var": 1.3919230143229167, "learning_rate": 0.0001, "loss": 5.8562, "loss/crossentropy": 2.4851385354995728, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.17831915244460106, "step": 6910 }, { "epoch": 0.3141818181818182, "grad_norm": 5.875, "grad_norm_var": 1.3996378580729167, "learning_rate": 0.0001, "loss": 5.228, "loss/crossentropy": 2.093440920114517, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.15408452600240707, "step": 6912 }, { "epoch": 0.31427272727272726, "grad_norm": 5.6875, "grad_norm_var": 1.4200358072916666, "learning_rate": 0.0001, "loss": 6.1865, "loss/crossentropy": 2.734572947025299, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18582099303603172, "step": 6914 }, { "epoch": 0.31436363636363635, "grad_norm": 5.84375, "grad_norm_var": 1.3671183268229166, "learning_rate": 0.0001, "loss": 6.0056, "loss/crossentropy": 2.5714350938796997, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.18736224621534348, "step": 6916 }, { "epoch": 0.31445454545454543, "grad_norm": 5.65625, "grad_norm_var": 0.22961832682291666, "learning_rate": 0.0001, "loss": 6.0443, "loss/crossentropy": 2.568240761756897, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.18920406699180603, "step": 6918 }, { "epoch": 0.3145454545454546, "grad_norm": 5.4375, "grad_norm_var": 0.229150390625, "learning_rate": 0.0001, "loss": 5.715, "loss/crossentropy": 2.453384280204773, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.16971703991293907, "step": 6920 }, { "epoch": 0.31463636363636366, "grad_norm": 5.34375, "grad_norm_var": 0.22008056640625, "learning_rate": 0.0001, "loss": 6.0455, "loss/crossentropy": 2.6235231161117554, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.18731101229786873, "step": 6922 }, { "epoch": 0.31472727272727274, "grad_norm": 5.4375, "grad_norm_var": 0.15559488932291668, "learning_rate": 0.0001, "loss": 6.0538, "loss/crossentropy": 2.596672832965851, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.18965576216578484, "step": 6924 }, { "epoch": 0.31481818181818183, "grad_norm": 5.15625, "grad_norm_var": 0.15911458333333334, "learning_rate": 0.0001, "loss": 5.6533, "loss/crossentropy": 2.3752098083496094, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.17292916029691696, "step": 6926 }, { "epoch": 0.3149090909090909, "grad_norm": 4.78125, "grad_norm_var": 0.158837890625, "learning_rate": 0.0001, "loss": 5.6207, "loss/crossentropy": 2.3843217492103577, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17285306379199028, "step": 6928 }, { "epoch": 0.315, "grad_norm": 5.125, "grad_norm_var": 0.1037109375, "learning_rate": 0.0001, "loss": 5.5891, "loss/crossentropy": 2.4095190465450287, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.16209552437067032, "step": 6930 }, { "epoch": 0.3150909090909091, "grad_norm": 5.65625, "grad_norm_var": 0.09735921223958334, "learning_rate": 0.0001, "loss": 5.8056, "loss/crossentropy": 2.5009660720825195, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.1755843460559845, "step": 6932 }, { "epoch": 0.3151818181818182, "grad_norm": 5.5, "grad_norm_var": 0.10428059895833333, "learning_rate": 0.0001, "loss": 5.5777, "loss/crossentropy": 2.3057241439819336, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17056113854050636, "step": 6934 }, { "epoch": 0.31527272727272726, "grad_norm": 5.4375, "grad_norm_var": 0.10370686848958334, "learning_rate": 0.0001, "loss": 5.9282, "loss/crossentropy": 2.54399237036705, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.1815808117389679, "step": 6936 }, { "epoch": 0.31536363636363635, "grad_norm": 5.375, "grad_norm_var": 0.06744791666666666, "learning_rate": 0.0001, "loss": 6.3027, "loss/crossentropy": 2.801161766052246, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1942930705845356, "step": 6938 }, { "epoch": 0.31545454545454543, "grad_norm": 5.34375, "grad_norm_var": 0.09325764973958334, "learning_rate": 0.0001, "loss": 5.7893, "loss/crossentropy": 2.483893573284149, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.1705833375453949, "step": 6940 }, { "epoch": 0.3155454545454545, "grad_norm": 5.53125, "grad_norm_var": 0.08865559895833333, "learning_rate": 0.0001, "loss": 6.1517, "loss/crossentropy": 2.7756999731063843, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18330569937825203, "step": 6942 }, { "epoch": 0.31563636363636366, "grad_norm": 4.71875, "grad_norm_var": 0.09810791015625, "learning_rate": 0.0001, "loss": 5.7419, "loss/crossentropy": 2.448312222957611, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1750575564801693, "step": 6944 }, { "epoch": 0.31572727272727275, "grad_norm": 4.9375, "grad_norm_var": 0.10660400390625, "learning_rate": 0.0001, "loss": 5.6312, "loss/crossentropy": 2.43033367395401, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.16715722158551216, "step": 6946 }, { "epoch": 0.31581818181818183, "grad_norm": 5.28125, "grad_norm_var": 0.09810791015625, "learning_rate": 0.0001, "loss": 5.9718, "loss/crossentropy": 2.614811360836029, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.18159940466284752, "step": 6948 }, { "epoch": 0.3159090909090909, "grad_norm": 5.4375, "grad_norm_var": 0.09308268229166666, "learning_rate": 0.0001, "loss": 5.4662, "loss/crossentropy": 2.2368339896202087, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.16570613905787468, "step": 6950 }, { "epoch": 0.316, "grad_norm": 5.4375, "grad_norm_var": 0.09582926432291666, "learning_rate": 0.0001, "loss": 5.7596, "loss/crossentropy": 2.4721075296401978, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.17152682691812515, "step": 6952 }, { "epoch": 0.3160909090909091, "grad_norm": 6.0, "grad_norm_var": 0.1234375, "learning_rate": 0.0001, "loss": 5.9056, "loss/crossentropy": 2.5313527584075928, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.17980629578232765, "step": 6954 }, { "epoch": 0.3161818181818182, "grad_norm": 5.6875, "grad_norm_var": 0.10662434895833334, "learning_rate": 0.0001, "loss": 5.7755, "loss/crossentropy": 2.397016763687134, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18198533356189728, "step": 6956 }, { "epoch": 0.31627272727272726, "grad_norm": 5.21875, "grad_norm_var": 0.11685791015625, "learning_rate": 0.0001, "loss": 5.5346, "loss/crossentropy": 2.3137795329093933, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.16622278094291687, "step": 6958 }, { "epoch": 0.31636363636363635, "grad_norm": 4.59375, "grad_norm_var": 0.1326171875, "learning_rate": 0.0001, "loss": 5.4686, "loss/crossentropy": 2.361387312412262, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.15934975817799568, "step": 6960 }, { "epoch": 0.31645454545454543, "grad_norm": 5.34375, "grad_norm_var": 0.13131510416666667, "learning_rate": 0.0001, "loss": 5.8614, "loss/crossentropy": 2.5366820096969604, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17583338916301727, "step": 6962 }, { "epoch": 0.3165454545454545, "grad_norm": 8.125, "grad_norm_var": 0.624072265625, "learning_rate": 0.0001, "loss": 5.8977, "loss/crossentropy": 2.514443099498749, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18012001737952232, "step": 6964 }, { "epoch": 0.31663636363636366, "grad_norm": 5.65625, "grad_norm_var": 0.6104817708333333, "learning_rate": 0.0001, "loss": 6.0513, "loss/crossentropy": 2.6633766889572144, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1852782927453518, "step": 6966 }, { "epoch": 0.31672727272727275, "grad_norm": 6.09375, "grad_norm_var": 0.61431884765625, "learning_rate": 0.0001, "loss": 6.064, "loss/crossentropy": 2.6434390544891357, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.18443681672215462, "step": 6968 }, { "epoch": 0.31681818181818183, "grad_norm": 5.0, "grad_norm_var": 0.6240519205729167, "learning_rate": 0.0001, "loss": 5.8129, "loss/crossentropy": 2.4749990701675415, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.17734725028276443, "step": 6970 }, { "epoch": 0.3169090909090909, "grad_norm": 5.1875, "grad_norm_var": 0.63033447265625, "learning_rate": 0.0001, "loss": 5.8205, "loss/crossentropy": 2.4769230484962463, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17810724675655365, "step": 6972 }, { "epoch": 0.317, "grad_norm": 5.8125, "grad_norm_var": 0.6374837239583333, "learning_rate": 0.0001, "loss": 5.8674, "loss/crossentropy": 2.582548439502716, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1761452481150627, "step": 6974 }, { "epoch": 0.3170909090909091, "grad_norm": 6.34375, "grad_norm_var": 0.58521728515625, "learning_rate": 0.0001, "loss": 6.0749, "loss/crossentropy": 2.6578428149223328, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.1821386069059372, "step": 6976 }, { "epoch": 0.3171818181818182, "grad_norm": 5.15625, "grad_norm_var": 0.57320556640625, "learning_rate": 0.0001, "loss": 5.6585, "loss/crossentropy": 2.40144944190979, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1714111715555191, "step": 6978 }, { "epoch": 0.31727272727272726, "grad_norm": 18.375, "grad_norm_var": 10.521858723958333, "learning_rate": 0.0001, "loss": 6.0399, "loss/crossentropy": 2.45738822221756, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.20199887827038765, "step": 6980 }, { "epoch": 0.31736363636363635, "grad_norm": 5.5, "grad_norm_var": 10.502718098958333, "learning_rate": 0.0001, "loss": 5.8402, "loss/crossentropy": 2.500304937362671, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.17480658367276192, "step": 6982 }, { "epoch": 0.31745454545454543, "grad_norm": 5.3125, "grad_norm_var": 10.552197265625, "learning_rate": 0.0001, "loss": 5.549, "loss/crossentropy": 2.2742612659931183, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.16907131299376488, "step": 6984 }, { "epoch": 0.3175454545454545, "grad_norm": 5.3125, "grad_norm_var": 10.522391764322917, "learning_rate": 0.0001, "loss": 5.8253, "loss/crossentropy": 2.4981988072395325, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17880507186055183, "step": 6986 }, { "epoch": 0.31763636363636366, "grad_norm": 5.625, "grad_norm_var": 10.53863525390625, "learning_rate": 0.0001, "loss": 5.8594, "loss/crossentropy": 2.5356303453445435, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17729951068758965, "step": 6988 }, { "epoch": 0.31772727272727275, "grad_norm": 5.84375, "grad_norm_var": 10.570182291666667, "learning_rate": 0.0001, "loss": 5.8715, "loss/crossentropy": 2.5765095353126526, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17734819278120995, "step": 6990 }, { "epoch": 0.31781818181818183, "grad_norm": 5.4375, "grad_norm_var": 10.738151041666667, "learning_rate": 0.0001, "loss": 5.9367, "loss/crossentropy": 2.5563809871673584, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.18353776633739471, "step": 6992 }, { "epoch": 0.3179090909090909, "grad_norm": 5.28125, "grad_norm_var": 10.773421223958334, "learning_rate": 0.0001, "loss": 5.5221, "loss/crossentropy": 2.336158037185669, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.16371019929647446, "step": 6994 }, { "epoch": 0.318, "grad_norm": 4.84375, "grad_norm_var": 0.26829427083333335, "learning_rate": 0.0001, "loss": 5.5201, "loss/crossentropy": 2.3283283412456512, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16546158492565155, "step": 6996 }, { "epoch": 0.3180909090909091, "grad_norm": 5.9375, "grad_norm_var": 1.474853515625, "learning_rate": 0.0001, "loss": 5.7768, "loss/crossentropy": 2.4176366925239563, "loss/hidden": 1.623046875, "loss/jsd": 0.0, "loss/logits": 0.1736104004085064, "step": 6998 }, { "epoch": 0.3181818181818182, "grad_norm": 4.90625, "grad_norm_var": 1.5194661458333334, "learning_rate": 0.0001, "loss": 5.4723, "loss/crossentropy": 2.245038002729416, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.16706379503011703, "step": 7000 }, { "epoch": 0.31827272727272726, "grad_norm": 5.75, "grad_norm_var": 1.5166666666666666, "learning_rate": 0.0001, "loss": 6.044, "loss/crossentropy": 2.591178596019745, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18590456619858742, "step": 7002 }, { "epoch": 0.31836363636363635, "grad_norm": 5.53125, "grad_norm_var": 1.5020670572916666, "learning_rate": 0.0001, "loss": 6.0404, "loss/crossentropy": 2.565687656402588, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18965914100408554, "step": 7004 }, { "epoch": 0.31845454545454543, "grad_norm": 5.125, "grad_norm_var": 1.486572265625, "learning_rate": 0.0001, "loss": 5.8501, "loss/crossentropy": 2.5242879986763, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17828521504998207, "step": 7006 }, { "epoch": 0.3185454545454545, "grad_norm": 7.3125, "grad_norm_var": 1.6631144205729167, "learning_rate": 0.0001, "loss": 5.8521, "loss/crossentropy": 2.484105408191681, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.17566681653261185, "step": 7008 }, { "epoch": 0.31863636363636366, "grad_norm": 5.75, "grad_norm_var": 4.3890625, "learning_rate": 0.0001, "loss": 6.2039, "loss/crossentropy": 2.618151009082794, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19958674907684326, "step": 7010 }, { "epoch": 0.31872727272727275, "grad_norm": 5.65625, "grad_norm_var": 4.194254557291667, "learning_rate": 0.0001, "loss": 5.858, "loss/crossentropy": 2.468145489692688, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18078552931547165, "step": 7012 }, { "epoch": 0.31881818181818183, "grad_norm": 5.8125, "grad_norm_var": 3.249312337239583, "learning_rate": 0.0001, "loss": 5.8258, "loss/crossentropy": 2.400465428829193, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18433326110243797, "step": 7014 }, { "epoch": 0.3189090909090909, "grad_norm": 5.59375, "grad_norm_var": 3.1109659830729166, "learning_rate": 0.0001, "loss": 6.1844, "loss/crossentropy": 2.7516914010047913, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.186827190220356, "step": 7016 }, { "epoch": 0.319, "grad_norm": 5.1875, "grad_norm_var": 3.236442057291667, "learning_rate": 0.0001, "loss": 5.3709, "loss/crossentropy": 2.2371483743190765, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16083870083093643, "step": 7018 }, { "epoch": 0.3190909090909091, "grad_norm": 6.46875, "grad_norm_var": 3.1986287434895835, "learning_rate": 0.0001, "loss": 5.3685, "loss/crossentropy": 2.179025948047638, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.16601282730698586, "step": 7020 }, { "epoch": 0.3191818181818182, "grad_norm": 5.75, "grad_norm_var": 3.0807902018229165, "learning_rate": 0.0001, "loss": 5.7699, "loss/crossentropy": 2.4097092151641846, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.17625688016414642, "step": 7022 }, { "epoch": 0.31927272727272726, "grad_norm": 5.40625, "grad_norm_var": 3.016304524739583, "learning_rate": 0.0001, "loss": 6.2798, "loss/crossentropy": 2.7455169558525085, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19678691029548645, "step": 7024 }, { "epoch": 0.31936363636363635, "grad_norm": 5.375, "grad_norm_var": 0.299072265625, "learning_rate": 0.0001, "loss": 5.5877, "loss/crossentropy": 2.3292073607444763, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1680338978767395, "step": 7026 }, { "epoch": 0.31945454545454544, "grad_norm": 5.4375, "grad_norm_var": 0.3063151041666667, "learning_rate": 0.0001, "loss": 5.7714, "loss/crossentropy": 2.3928236961364746, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18083027377724648, "step": 7028 }, { "epoch": 0.3195454545454545, "grad_norm": 5.46875, "grad_norm_var": 0.31506754557291666, "learning_rate": 0.0001, "loss": 5.7899, "loss/crossentropy": 2.4924952387809753, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1750537045300007, "step": 7030 }, { "epoch": 0.31963636363636366, "grad_norm": 5.65625, "grad_norm_var": 0.3204427083333333, "learning_rate": 0.0001, "loss": 6.1151, "loss/crossentropy": 2.603553831577301, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19412655010819435, "step": 7032 }, { "epoch": 0.31972727272727275, "grad_norm": 4.84375, "grad_norm_var": 0.31881103515625, "learning_rate": 0.0001, "loss": 5.6439, "loss/crossentropy": 2.4534780979156494, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16650530695915222, "step": 7034 }, { "epoch": 0.31981818181818183, "grad_norm": 5.34375, "grad_norm_var": 0.26695556640625, "learning_rate": 0.0001, "loss": 5.486, "loss/crossentropy": 2.2575650215148926, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.16424795612692833, "step": 7036 }, { "epoch": 0.3199090909090909, "grad_norm": 4.9375, "grad_norm_var": 0.26901041666666664, "learning_rate": 0.0001, "loss": 5.3206, "loss/crossentropy": 2.169728010892868, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.15942662209272385, "step": 7038 }, { "epoch": 0.32, "grad_norm": 5.53125, "grad_norm_var": 0.065869140625, "learning_rate": 0.0001, "loss": 5.7496, "loss/crossentropy": 2.4100629687309265, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.17790015414357185, "step": 7040 }, { "epoch": 0.3200909090909091, "grad_norm": 5.53125, "grad_norm_var": 0.08043212890625, "learning_rate": 0.0001, "loss": 6.3016, "loss/crossentropy": 2.885983943939209, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.18472466617822647, "step": 7042 }, { "epoch": 0.3201818181818182, "grad_norm": 5.09375, "grad_norm_var": 0.12845052083333333, "learning_rate": 0.0001, "loss": 6.0529, "loss/crossentropy": 2.5708983540534973, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.18726325035095215, "step": 7044 }, { "epoch": 0.32027272727272726, "grad_norm": 6.46875, "grad_norm_var": 0.222900390625, "learning_rate": 0.0001, "loss": 6.2245, "loss/crossentropy": 2.6765209436416626, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19541937112808228, "step": 7046 }, { "epoch": 0.32036363636363635, "grad_norm": 4.9375, "grad_norm_var": 0.247900390625, "learning_rate": 0.0001, "loss": 5.3117, "loss/crossentropy": 2.1980584263801575, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.15530913695693016, "step": 7048 }, { "epoch": 0.32045454545454544, "grad_norm": 6.34375, "grad_norm_var": 0.271484375, "learning_rate": 0.0001, "loss": 6.088, "loss/crossentropy": 2.6892707347869873, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.18147936090826988, "step": 7050 }, { "epoch": 0.3205454545454545, "grad_norm": 5.34375, "grad_norm_var": 0.27981770833333336, "learning_rate": 0.0001, "loss": 5.6035, "loss/crossentropy": 2.3366349935531616, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1716048400849104, "step": 7052 }, { "epoch": 0.32063636363636366, "grad_norm": 6.0625, "grad_norm_var": 0.70211181640625, "learning_rate": 0.0001, "loss": 5.7336, "loss/crossentropy": 2.4455073475837708, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.17040590569376945, "step": 7054 }, { "epoch": 0.32072727272727275, "grad_norm": 5.25, "grad_norm_var": 0.7294881184895833, "learning_rate": 0.0001, "loss": 5.8332, "loss/crossentropy": 2.5102930665016174, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.1774124875664711, "step": 7056 }, { "epoch": 0.32081818181818184, "grad_norm": 5.375, "grad_norm_var": 0.76754150390625, "learning_rate": 0.0001, "loss": 5.7986, "loss/crossentropy": 2.5259247422218323, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1745343618094921, "step": 7058 }, { "epoch": 0.3209090909090909, "grad_norm": 4.90625, "grad_norm_var": 0.77340087890625, "learning_rate": 0.0001, "loss": 5.6697, "loss/crossentropy": 2.369995355606079, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17411361634731293, "step": 7060 }, { "epoch": 0.321, "grad_norm": 6.90625, "grad_norm_var": 0.832275390625, "learning_rate": 0.0001, "loss": 5.9345, "loss/crossentropy": 2.523563504219055, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.1826905757188797, "step": 7062 }, { "epoch": 0.3210909090909091, "grad_norm": 4.78125, "grad_norm_var": 0.8629557291666666, "learning_rate": 0.0001, "loss": 5.2502, "loss/crossentropy": 2.2130128145217896, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.15156623348593712, "step": 7064 }, { "epoch": 0.3211818181818182, "grad_norm": 4.90625, "grad_norm_var": 0.8546834309895833, "learning_rate": 0.0001, "loss": 5.7887, "loss/crossentropy": 2.521957039833069, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.17492078617215157, "step": 7066 }, { "epoch": 0.32127272727272727, "grad_norm": 5.9375, "grad_norm_var": 0.8708170572916667, "learning_rate": 0.0001, "loss": 5.4524, "loss/crossentropy": 2.231541395187378, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.1648595593869686, "step": 7068 }, { "epoch": 0.32136363636363635, "grad_norm": 5.40625, "grad_norm_var": 0.26686197916666665, "learning_rate": 0.0001, "loss": 5.7189, "loss/crossentropy": 2.389087736606598, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17673583701252937, "step": 7070 }, { "epoch": 0.32145454545454544, "grad_norm": 5.5, "grad_norm_var": 0.26760660807291664, "learning_rate": 0.0001, "loss": 5.8322, "loss/crossentropy": 2.481712520122528, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1811404600739479, "step": 7072 }, { "epoch": 0.3215454545454545, "grad_norm": 5.4375, "grad_norm_var": 0.27209879557291666, "learning_rate": 0.0001, "loss": 6.2067, "loss/crossentropy": 2.7442203164100647, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.1897992230951786, "step": 7074 }, { "epoch": 0.3216363636363636, "grad_norm": 5.15625, "grad_norm_var": 0.2600911458333333, "learning_rate": 0.0001, "loss": 5.5674, "loss/crossentropy": 2.3262118697166443, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.1696268431842327, "step": 7076 }, { "epoch": 0.32172727272727275, "grad_norm": 5.375, "grad_norm_var": 0.09687093098958334, "learning_rate": 0.0001, "loss": 6.0794, "loss/crossentropy": 2.699090361595154, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.18119221553206444, "step": 7078 }, { "epoch": 0.32181818181818184, "grad_norm": 6.40625, "grad_norm_var": 0.13019205729166666, "learning_rate": 0.0001, "loss": 5.7551, "loss/crossentropy": 2.3586134910583496, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.18242153152823448, "step": 7080 }, { "epoch": 0.3219090909090909, "grad_norm": 8.625, "grad_norm_var": 0.7316243489583333, "learning_rate": 0.0001, "loss": 5.5892, "loss/crossentropy": 2.1906994581222534, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.17910584062337875, "step": 7082 }, { "epoch": 0.322, "grad_norm": 5.46875, "grad_norm_var": 0.7155232747395833, "learning_rate": 0.0001, "loss": 5.7593, "loss/crossentropy": 2.471660017967224, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17173630744218826, "step": 7084 }, { "epoch": 0.3220909090909091, "grad_norm": 7.5, "grad_norm_var": 0.9258097330729167, "learning_rate": 0.0001, "loss": 6.121, "loss/crossentropy": 2.5820266604423523, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.19823526218533516, "step": 7086 }, { "epoch": 0.3221818181818182, "grad_norm": 5.09375, "grad_norm_var": 0.95357666015625, "learning_rate": 0.0001, "loss": 5.7576, "loss/crossentropy": 2.4711132049560547, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17396187782287598, "step": 7088 }, { "epoch": 0.32227272727272727, "grad_norm": 4.78125, "grad_norm_var": 1.01871337890625, "learning_rate": 0.0001, "loss": 5.7407, "loss/crossentropy": 2.4834749698638916, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1727905347943306, "step": 7090 }, { "epoch": 0.32236363636363635, "grad_norm": 5.34375, "grad_norm_var": 0.99713134765625, "learning_rate": 0.0001, "loss": 5.7593, "loss/crossentropy": 2.3623180389404297, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1799338012933731, "step": 7092 }, { "epoch": 0.32245454545454544, "grad_norm": 5.46875, "grad_norm_var": 1.04644775390625, "learning_rate": 0.0001, "loss": 5.1679, "loss/crossentropy": 2.060254782438278, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.15646303445100784, "step": 7094 }, { "epoch": 0.3225454545454545, "grad_norm": 4.8125, "grad_norm_var": 1.06666259765625, "learning_rate": 0.0001, "loss": 5.4007, "loss/crossentropy": 2.2231662273406982, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.1605258397758007, "step": 7096 }, { "epoch": 0.3226363636363636, "grad_norm": 6.09375, "grad_norm_var": 0.4722493489583333, "learning_rate": 0.0001, "loss": 6.2372, "loss/crossentropy": 2.6941157579421997, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1945468783378601, "step": 7098 }, { "epoch": 0.32272727272727275, "grad_norm": 5.21875, "grad_norm_var": 0.5130859375, "learning_rate": 0.0001, "loss": 6.1395, "loss/crossentropy": 2.6767420768737793, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.18631698563694954, "step": 7100 }, { "epoch": 0.32281818181818184, "grad_norm": 5.78125, "grad_norm_var": 0.2631144205729167, "learning_rate": 0.0001, "loss": 5.7232, "loss/crossentropy": 2.470178246498108, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.16807988658547401, "step": 7102 }, { "epoch": 0.3229090909090909, "grad_norm": 5.125, "grad_norm_var": 0.270947265625, "learning_rate": 0.0001, "loss": 6.0832, "loss/crossentropy": 2.6431024074554443, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1834583692252636, "step": 7104 }, { "epoch": 0.323, "grad_norm": 5.28125, "grad_norm_var": 0.241796875, "learning_rate": 0.0001, "loss": 5.7343, "loss/crossentropy": 2.453684151172638, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.17395784333348274, "step": 7106 }, { "epoch": 0.3230909090909091, "grad_norm": 5.15625, "grad_norm_var": 0.230712890625, "learning_rate": 0.0001, "loss": 5.8198, "loss/crossentropy": 2.507817506790161, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17573313042521477, "step": 7108 }, { "epoch": 0.3231818181818182, "grad_norm": 4.9375, "grad_norm_var": 0.23222249348958332, "learning_rate": 0.0001, "loss": 5.7831, "loss/crossentropy": 2.5569621324539185, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1722281128168106, "step": 7110 }, { "epoch": 0.32327272727272727, "grad_norm": 5.09375, "grad_norm_var": 0.22662353515625, "learning_rate": 0.0001, "loss": 5.4424, "loss/crossentropy": 2.27615088224411, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.16232329234480858, "step": 7112 }, { "epoch": 0.32336363636363635, "grad_norm": 5.90625, "grad_norm_var": 0.17278238932291667, "learning_rate": 0.0001, "loss": 6.0567, "loss/crossentropy": 2.6443105936050415, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.18440154194831848, "step": 7114 }, { "epoch": 0.32345454545454544, "grad_norm": 5.21875, "grad_norm_var": 0.1201171875, "learning_rate": 0.0001, "loss": 5.5608, "loss/crossentropy": 2.3896672129631042, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16476848348975182, "step": 7116 }, { "epoch": 0.3235454545454545, "grad_norm": 5.5, "grad_norm_var": 0.10943603515625, "learning_rate": 0.0001, "loss": 5.8277, "loss/crossentropy": 2.4713273644447327, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.18036096543073654, "step": 7118 }, { "epoch": 0.3236363636363636, "grad_norm": 5.21875, "grad_norm_var": 0.09049479166666667, "learning_rate": 0.0001, "loss": 5.7467, "loss/crossentropy": 2.46297687292099, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.17094623669981956, "step": 7120 }, { "epoch": 0.32372727272727275, "grad_norm": 4.9375, "grad_norm_var": 0.11261393229166666, "learning_rate": 0.0001, "loss": 5.7474, "loss/crossentropy": 2.5294829607009888, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.16886335611343384, "step": 7122 }, { "epoch": 0.32381818181818184, "grad_norm": 5.6875, "grad_norm_var": 0.11962483723958334, "learning_rate": 0.0001, "loss": 5.8185, "loss/crossentropy": 2.518710732460022, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1737307645380497, "step": 7124 }, { "epoch": 0.3239090909090909, "grad_norm": 5.5, "grad_norm_var": 0.11222330729166667, "learning_rate": 0.0001, "loss": 5.669, "loss/crossentropy": 2.379661738872528, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.17209362611174583, "step": 7126 }, { "epoch": 0.324, "grad_norm": 5.8125, "grad_norm_var": 0.10266520182291666, "learning_rate": 0.0001, "loss": 5.9222, "loss/crossentropy": 2.5294875502586365, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1818465031683445, "step": 7128 }, { "epoch": 0.3240909090909091, "grad_norm": 5.65625, "grad_norm_var": 0.09377848307291667, "learning_rate": 0.0001, "loss": 5.7873, "loss/crossentropy": 2.4336524605751038, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.17540483176708221, "step": 7130 }, { "epoch": 0.3241818181818182, "grad_norm": 5.21875, "grad_norm_var": 0.078515625, "learning_rate": 0.0001, "loss": 5.7609, "loss/crossentropy": 2.4897045493125916, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.172824177891016, "step": 7132 }, { "epoch": 0.32427272727272727, "grad_norm": 5.0, "grad_norm_var": 0.08336181640625, "learning_rate": 0.0001, "loss": 5.5036, "loss/crossentropy": 2.3286211490631104, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.1622285097837448, "step": 7134 }, { "epoch": 0.32436363636363635, "grad_norm": 15.5625, "grad_norm_var": 6.503999837239584, "learning_rate": 0.0001, "loss": 5.9933, "loss/crossentropy": 2.4222405552864075, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.19831980764865875, "step": 7136 }, { "epoch": 0.32445454545454544, "grad_norm": 4.53125, "grad_norm_var": 6.605061848958333, "learning_rate": 0.0001, "loss": 5.2759, "loss/crossentropy": 2.205240160226822, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1547212041914463, "step": 7138 }, { "epoch": 0.3245454545454545, "grad_norm": 5.40625, "grad_norm_var": 6.616434733072917, "learning_rate": 0.0001, "loss": 5.6813, "loss/crossentropy": 2.4018945693969727, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.17227481305599213, "step": 7140 }, { "epoch": 0.3246363636363636, "grad_norm": 5.75, "grad_norm_var": 6.531884765625, "learning_rate": 0.0001, "loss": 6.2338, "loss/crossentropy": 2.7189735174179077, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1909397579729557, "step": 7142 }, { "epoch": 0.32472727272727275, "grad_norm": 5.78125, "grad_norm_var": 6.557942708333333, "learning_rate": 0.0001, "loss": 5.7351, "loss/crossentropy": 2.4960139989852905, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.17097560316324234, "step": 7144 }, { "epoch": 0.32481818181818184, "grad_norm": 5.25, "grad_norm_var": 6.605497233072916, "learning_rate": 0.0001, "loss": 5.4041, "loss/crossentropy": 2.2360042333602905, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16407747939229012, "step": 7146 }, { "epoch": 0.3249090909090909, "grad_norm": 5.34375, "grad_norm_var": 6.596468098958334, "learning_rate": 0.0001, "loss": 5.7517, "loss/crossentropy": 2.485721707344055, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.17092932015657425, "step": 7148 }, { "epoch": 0.325, "grad_norm": 5.875, "grad_norm_var": 6.497900390625, "learning_rate": 0.0001, "loss": 5.9356, "loss/crossentropy": 2.5220232009887695, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18432796001434326, "step": 7150 }, { "epoch": 0.3250909090909091, "grad_norm": 5.5625, "grad_norm_var": 0.1111328125, "learning_rate": 0.0001, "loss": 5.4978, "loss/crossentropy": 2.26737043261528, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.16581613570451736, "step": 7152 }, { "epoch": 0.3251818181818182, "grad_norm": 5.03125, "grad_norm_var": 0.06519775390625, "learning_rate": 0.0001, "loss": 5.7205, "loss/crossentropy": 2.519988000392914, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16693059727549553, "step": 7154 }, { "epoch": 0.32527272727272727, "grad_norm": 5.21875, "grad_norm_var": 0.08365885416666667, "learning_rate": 0.0001, "loss": 5.2284, "loss/crossentropy": 2.219617635011673, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.14950662665069103, "step": 7156 }, { "epoch": 0.32536363636363635, "grad_norm": 10.0625, "grad_norm_var": 1.4478800455729166, "learning_rate": 0.0001, "loss": 5.6752, "loss/crossentropy": 2.3602048754692078, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.17622388899326324, "step": 7158 }, { "epoch": 0.32545454545454544, "grad_norm": 5.09375, "grad_norm_var": 1.4622029622395833, "learning_rate": 0.0001, "loss": 5.859, "loss/crossentropy": 2.486173093318939, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.17986207082867622, "step": 7160 }, { "epoch": 0.3255454545454545, "grad_norm": 5.34375, "grad_norm_var": 1.4630045572916666, "learning_rate": 0.0001, "loss": 5.8347, "loss/crossentropy": 2.500016689300537, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.17663278803229332, "step": 7162 }, { "epoch": 0.3256363636363636, "grad_norm": 5.03125, "grad_norm_var": 1.5031209309895834, "learning_rate": 0.0001, "loss": 5.7315, "loss/crossentropy": 2.490819275379181, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1705547273159027, "step": 7164 }, { "epoch": 0.32572727272727275, "grad_norm": 5.40625, "grad_norm_var": 1.4958943684895833, "learning_rate": 0.0001, "loss": 6.0294, "loss/crossentropy": 2.606768846511841, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.18386515974998474, "step": 7166 }, { "epoch": 0.32581818181818184, "grad_norm": 5.59375, "grad_norm_var": 1.5077473958333334, "learning_rate": 0.0001, "loss": 5.7815, "loss/crossentropy": 2.4702301025390625, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17487720772624016, "step": 7168 }, { "epoch": 0.3259090909090909, "grad_norm": 5.78125, "grad_norm_var": 1.49453125, "learning_rate": 0.0001, "loss": 5.3092, "loss/crossentropy": 2.2105672359466553, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.15282873809337616, "step": 7170 }, { "epoch": 0.326, "grad_norm": 4.96875, "grad_norm_var": 1.4929646809895833, "learning_rate": 0.0001, "loss": 5.4526, "loss/crossentropy": 2.292171597480774, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.16115840896964073, "step": 7172 }, { "epoch": 0.3260909090909091, "grad_norm": 6.25, "grad_norm_var": 0.1322265625, "learning_rate": 0.0001, "loss": 5.7874, "loss/crossentropy": 2.47031769156456, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17467618733644485, "step": 7174 }, { "epoch": 0.3261818181818182, "grad_norm": 5.40625, "grad_norm_var": 0.10920817057291667, "learning_rate": 0.0001, "loss": 6.2511, "loss/crossentropy": 2.749321401119232, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.193338755518198, "step": 7176 }, { "epoch": 0.32627272727272727, "grad_norm": 4.96875, "grad_norm_var": 0.124072265625, "learning_rate": 0.0001, "loss": 5.6739, "loss/crossentropy": 2.4037073850631714, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17252695560455322, "step": 7178 }, { "epoch": 0.32636363636363636, "grad_norm": 5.59375, "grad_norm_var": 0.10625, "learning_rate": 0.0001, "loss": 5.8149, "loss/crossentropy": 2.513899028301239, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.17755692079663277, "step": 7180 }, { "epoch": 0.32645454545454544, "grad_norm": 5.9375, "grad_norm_var": 0.24875895182291666, "learning_rate": 0.0001, "loss": 6.0604, "loss/crossentropy": 2.6589261889457703, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.18526463210582733, "step": 7182 }, { "epoch": 0.3265454545454545, "grad_norm": 5.65625, "grad_norm_var": 0.24529622395833334, "learning_rate": 0.0001, "loss": 5.6261, "loss/crossentropy": 2.315260589122772, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.17112064734101295, "step": 7184 }, { "epoch": 0.3266363636363636, "grad_norm": 5.75, "grad_norm_var": 0.24729410807291666, "learning_rate": 0.0001, "loss": 5.7634, "loss/crossentropy": 2.4522212743759155, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17564550414681435, "step": 7186 }, { "epoch": 0.3267272727272727, "grad_norm": 5.03125, "grad_norm_var": 0.24862874348958333, "learning_rate": 0.0001, "loss": 5.4309, "loss/crossentropy": 2.278888761997223, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16285409033298492, "step": 7188 }, { "epoch": 0.32681818181818184, "grad_norm": 5.0625, "grad_norm_var": 0.22522379557291666, "learning_rate": 0.0001, "loss": 5.8089, "loss/crossentropy": 2.4775696992874146, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17610615491867065, "step": 7190 }, { "epoch": 0.3269090909090909, "grad_norm": 5.3125, "grad_norm_var": 0.24397379557291668, "learning_rate": 0.0001, "loss": 5.6984, "loss/crossentropy": 2.372829258441925, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.17161497846245766, "step": 7192 }, { "epoch": 0.327, "grad_norm": 4.5625, "grad_norm_var": 0.27493082682291664, "learning_rate": 0.0001, "loss": 5.4161, "loss/crossentropy": 2.2026017606258392, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1662694737315178, "step": 7194 }, { "epoch": 0.3270909090909091, "grad_norm": 5.09375, "grad_norm_var": 0.3006795247395833, "learning_rate": 0.0001, "loss": 5.5529, "loss/crossentropy": 2.308093100786209, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1697890479117632, "step": 7196 }, { "epoch": 0.3271818181818182, "grad_norm": 5.5625, "grad_norm_var": 0.10198160807291666, "learning_rate": 0.0001, "loss": 5.8587, "loss/crossentropy": 2.5191215872764587, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17809917032718658, "step": 7198 }, { "epoch": 0.32727272727272727, "grad_norm": 5.03125, "grad_norm_var": 0.09959309895833333, "learning_rate": 0.0001, "loss": 5.5952, "loss/crossentropy": 2.3573147654533386, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.1669568084180355, "step": 7200 }, { "epoch": 0.32736363636363636, "grad_norm": 5.40625, "grad_norm_var": 0.0984375, "learning_rate": 0.0001, "loss": 6.2078, "loss/crossentropy": 2.7344619035720825, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1910831294953823, "step": 7202 }, { "epoch": 0.32745454545454544, "grad_norm": 5.375, "grad_norm_var": 0.10536702473958333, "learning_rate": 0.0001, "loss": 5.7235, "loss/crossentropy": 2.452792167663574, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1751149445772171, "step": 7204 }, { "epoch": 0.32754545454545453, "grad_norm": 5.09375, "grad_norm_var": 0.1048828125, "learning_rate": 0.0001, "loss": 5.6787, "loss/crossentropy": 2.430859684944153, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.16970521584153175, "step": 7206 }, { "epoch": 0.3276363636363636, "grad_norm": 5.5625, "grad_norm_var": 0.14140625, "learning_rate": 0.0001, "loss": 6.0907, "loss/crossentropy": 2.644229769706726, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.1844896525144577, "step": 7208 }, { "epoch": 0.3277272727272727, "grad_norm": 5.375, "grad_norm_var": 0.10480143229166666, "learning_rate": 0.0001, "loss": 5.8071, "loss/crossentropy": 2.4944929480552673, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.17599067836999893, "step": 7210 }, { "epoch": 0.32781818181818184, "grad_norm": 5.34375, "grad_norm_var": 0.07277018229166667, "learning_rate": 0.0001, "loss": 5.7834, "loss/crossentropy": 2.4763445258140564, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17523768916726112, "step": 7212 }, { "epoch": 0.32790909090909093, "grad_norm": 5.3125, "grad_norm_var": 0.07115478515625, "learning_rate": 0.0001, "loss": 5.3823, "loss/crossentropy": 2.1951240599155426, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.16246528550982475, "step": 7214 }, { "epoch": 0.328, "grad_norm": 5.90625, "grad_norm_var": 0.07733968098958334, "learning_rate": 0.0001, "loss": 6.3319, "loss/crossentropy": 2.8045809268951416, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19452543556690216, "step": 7216 }, { "epoch": 0.3280909090909091, "grad_norm": 4.90625, "grad_norm_var": 0.09518229166666667, "learning_rate": 0.0001, "loss": 5.7128, "loss/crossentropy": 2.3561104834079742, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1751183532178402, "step": 7218 }, { "epoch": 0.3281818181818182, "grad_norm": 5.34375, "grad_norm_var": 0.079150390625, "learning_rate": 0.0001, "loss": 5.8258, "loss/crossentropy": 2.533752977848053, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17217185348272324, "step": 7220 }, { "epoch": 0.32827272727272727, "grad_norm": 5.1875, "grad_norm_var": 0.076953125, "learning_rate": 0.0001, "loss": 5.9471, "loss/crossentropy": 2.578039050102234, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.1831905134022236, "step": 7222 }, { "epoch": 0.32836363636363636, "grad_norm": 5.0, "grad_norm_var": 0.07496337890625, "learning_rate": 0.0001, "loss": 5.7485, "loss/crossentropy": 2.478627026081085, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.1748342625796795, "step": 7224 }, { "epoch": 0.32845454545454544, "grad_norm": 5.46875, "grad_norm_var": 0.078759765625, "learning_rate": 0.0001, "loss": 6.256, "loss/crossentropy": 2.835538923740387, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.184622872620821, "step": 7226 }, { "epoch": 0.32854545454545453, "grad_norm": 5.21875, "grad_norm_var": 0.08544514973958334, "learning_rate": 0.0001, "loss": 5.61, "loss/crossentropy": 2.4204914569854736, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.16270465403795242, "step": 7228 }, { "epoch": 0.3286363636363636, "grad_norm": 5.78125, "grad_norm_var": 0.09811197916666667, "learning_rate": 0.0001, "loss": 6.0058, "loss/crossentropy": 2.6840550899505615, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17787479609251022, "step": 7230 }, { "epoch": 0.3287272727272727, "grad_norm": 5.125, "grad_norm_var": 0.07825520833333334, "learning_rate": 0.0001, "loss": 5.7292, "loss/crossentropy": 2.4762925505638123, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17080016806721687, "step": 7232 }, { "epoch": 0.32881818181818184, "grad_norm": 6.46875, "grad_norm_var": 0.14700520833333333, "learning_rate": 0.0001, "loss": 5.8259, "loss/crossentropy": 2.5828452706336975, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.17020480334758759, "step": 7234 }, { "epoch": 0.32890909090909093, "grad_norm": 6.96875, "grad_norm_var": 0.316796875, "learning_rate": 0.0001, "loss": 5.7057, "loss/crossentropy": 2.4989709854125977, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.1673545241355896, "step": 7236 }, { "epoch": 0.329, "grad_norm": 5.84375, "grad_norm_var": 0.332275390625, "learning_rate": 0.0001, "loss": 5.8854, "loss/crossentropy": 2.5783753991127014, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17445121333003044, "step": 7238 }, { "epoch": 0.3290909090909091, "grad_norm": 5.125, "grad_norm_var": 0.30076497395833335, "learning_rate": 0.0001, "loss": 5.7171, "loss/crossentropy": 2.4647990465164185, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.1719125285744667, "step": 7240 }, { "epoch": 0.3291818181818182, "grad_norm": 5.1875, "grad_norm_var": 0.30779622395833334, "learning_rate": 0.0001, "loss": 5.8406, "loss/crossentropy": 2.475413143634796, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.17597432434558868, "step": 7242 }, { "epoch": 0.3292727272727273, "grad_norm": 5.75, "grad_norm_var": 0.28935139973958335, "learning_rate": 0.0001, "loss": 6.2447, "loss/crossentropy": 2.811923384666443, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.18761711567640305, "step": 7244 }, { "epoch": 0.32936363636363636, "grad_norm": 5.21875, "grad_norm_var": 0.288525390625, "learning_rate": 0.0001, "loss": 5.7212, "loss/crossentropy": 2.386250674724579, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.17373109236359596, "step": 7246 }, { "epoch": 0.32945454545454544, "grad_norm": 6.09375, "grad_norm_var": 0.2751261393229167, "learning_rate": 0.0001, "loss": 5.6895, "loss/crossentropy": 2.35316064953804, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.17386828921735287, "step": 7248 }, { "epoch": 0.32954545454545453, "grad_norm": 5.3125, "grad_norm_var": 0.235400390625, "learning_rate": 0.0001, "loss": 5.8771, "loss/crossentropy": 2.518888533115387, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1795669160783291, "step": 7250 }, { "epoch": 0.3296363636363636, "grad_norm": 5.375, "grad_norm_var": 0.103515625, "learning_rate": 0.0001, "loss": 5.7045, "loss/crossentropy": 2.4561302065849304, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17034007981419563, "step": 7252 }, { "epoch": 0.3297272727272727, "grad_norm": 7.0, "grad_norm_var": 0.22980143229166666, "learning_rate": 0.0001, "loss": 5.9643, "loss/crossentropy": 2.5596101880073547, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1822708323597908, "step": 7254 }, { "epoch": 0.32981818181818184, "grad_norm": 5.3125, "grad_norm_var": 0.21652018229166667, "learning_rate": 0.0001, "loss": 6.2193, "loss/crossentropy": 2.8194814324378967, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18451207503676414, "step": 7256 }, { "epoch": 0.32990909090909093, "grad_norm": 10.375, "grad_norm_var": 1.6162068684895834, "learning_rate": 0.0001, "loss": 5.6879, "loss/crossentropy": 2.291295111179352, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.17970095574855804, "step": 7258 }, { "epoch": 0.33, "grad_norm": 5.4375, "grad_norm_var": 1.6282552083333333, "learning_rate": 0.0001, "loss": 6.1204, "loss/crossentropy": 2.7084208726882935, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1841617412865162, "step": 7260 }, { "epoch": 0.3300909090909091, "grad_norm": 5.25, "grad_norm_var": 1.6281087239583334, "learning_rate": 0.0001, "loss": 6.1749, "loss/crossentropy": 2.6910151839256287, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19135326519608498, "step": 7262 }, { "epoch": 0.3301818181818182, "grad_norm": 5.6875, "grad_norm_var": 1.6547810872395834, "learning_rate": 0.0001, "loss": 5.7777, "loss/crossentropy": 2.4737480878829956, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.17512431740760803, "step": 7264 }, { "epoch": 0.3302727272727273, "grad_norm": 6.40625, "grad_norm_var": 1.648828125, "learning_rate": 0.0001, "loss": 5.9485, "loss/crossentropy": 2.5351127982139587, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.18040360137820244, "step": 7266 }, { "epoch": 0.33036363636363636, "grad_norm": 5.6875, "grad_norm_var": 1.6228474934895833, "learning_rate": 0.0001, "loss": 6.107, "loss/crossentropy": 2.7146058082580566, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18415868654847145, "step": 7268 }, { "epoch": 0.33045454545454545, "grad_norm": 5.34375, "grad_norm_var": 1.5773722330729167, "learning_rate": 0.0001, "loss": 5.2361, "loss/crossentropy": 2.1708699762821198, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1533951722085476, "step": 7270 }, { "epoch": 0.33054545454545453, "grad_norm": 5.65625, "grad_norm_var": 1.56519775390625, "learning_rate": 0.0001, "loss": 6.0784, "loss/crossentropy": 2.6086220145225525, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18877608701586723, "step": 7272 }, { "epoch": 0.3306363636363636, "grad_norm": 4.8125, "grad_norm_var": 0.135791015625, "learning_rate": 0.0001, "loss": 5.298, "loss/crossentropy": 2.2393347024917603, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.15312795713543892, "step": 7274 }, { "epoch": 0.3307272727272727, "grad_norm": 6.40625, "grad_norm_var": 0.18974202473958332, "learning_rate": 0.0001, "loss": 5.7491, "loss/crossentropy": 2.4172052145004272, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1757657788693905, "step": 7276 }, { "epoch": 0.33081818181818184, "grad_norm": 4.5, "grad_norm_var": 0.2536092122395833, "learning_rate": 0.0001, "loss": 5.2317, "loss/crossentropy": 2.120537042617798, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1544717401266098, "step": 7278 }, { "epoch": 0.33090909090909093, "grad_norm": 6.4375, "grad_norm_var": 0.3029256184895833, "learning_rate": 0.0001, "loss": 6.0498, "loss/crossentropy": 2.5488283038139343, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.19013390317559242, "step": 7280 }, { "epoch": 0.331, "grad_norm": 5.3125, "grad_norm_var": 0.24856363932291667, "learning_rate": 0.0001, "loss": 6.0134, "loss/crossentropy": 2.574483096599579, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18490593880414963, "step": 7282 }, { "epoch": 0.3310909090909091, "grad_norm": 5.4375, "grad_norm_var": 0.24905192057291667, "learning_rate": 0.0001, "loss": 5.7406, "loss/crossentropy": 2.5130444169044495, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.17021175101399422, "step": 7284 }, { "epoch": 0.3311818181818182, "grad_norm": 5.46875, "grad_norm_var": 0.25735677083333336, "learning_rate": 0.0001, "loss": 5.7086, "loss/crossentropy": 2.4319599270820618, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1733633428812027, "step": 7286 }, { "epoch": 0.3312727272727273, "grad_norm": 7.84375, "grad_norm_var": 0.6245930989583334, "learning_rate": 0.0001, "loss": 5.7653, "loss/crossentropy": 2.351571500301361, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18355701863765717, "step": 7288 }, { "epoch": 0.33136363636363636, "grad_norm": 5.21875, "grad_norm_var": 0.5770833333333333, "learning_rate": 0.0001, "loss": 6.0569, "loss/crossentropy": 2.6738284826278687, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18128003552556038, "step": 7290 }, { "epoch": 0.33145454545454545, "grad_norm": 6.4375, "grad_norm_var": 0.58033447265625, "learning_rate": 0.0001, "loss": 6.0537, "loss/crossentropy": 2.5762927532196045, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.18934165686368942, "step": 7292 }, { "epoch": 0.33154545454545453, "grad_norm": 5.3125, "grad_norm_var": 0.50093994140625, "learning_rate": 0.0001, "loss": 5.9934, "loss/crossentropy": 2.702389180660248, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1748085580766201, "step": 7294 }, { "epoch": 0.3316363636363636, "grad_norm": 5.4375, "grad_norm_var": 0.49166259765625, "learning_rate": 0.0001, "loss": 5.625, "loss/crossentropy": 2.423635184764862, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1646638587117195, "step": 7296 }, { "epoch": 0.3317272727272727, "grad_norm": 4.71875, "grad_norm_var": 0.5330037434895833, "learning_rate": 0.0001, "loss": 5.309, "loss/crossentropy": 2.1849833726882935, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.15634725615382195, "step": 7298 }, { "epoch": 0.33181818181818185, "grad_norm": 5.90625, "grad_norm_var": 0.5328084309895833, "learning_rate": 0.0001, "loss": 5.9103, "loss/crossentropy": 2.544940710067749, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1779419742524624, "step": 7300 }, { "epoch": 0.33190909090909093, "grad_norm": 5.625, "grad_norm_var": 0.56715087890625, "learning_rate": 0.0001, "loss": 6.1822, "loss/crossentropy": 2.7353574633598328, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18687187507748604, "step": 7302 }, { "epoch": 0.332, "grad_norm": 5.6875, "grad_norm_var": 0.24508056640625, "learning_rate": 0.0001, "loss": 5.9798, "loss/crossentropy": 2.6397745609283447, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18048293516039848, "step": 7304 }, { "epoch": 0.3320909090909091, "grad_norm": 5.46875, "grad_norm_var": 0.23385009765625, "learning_rate": 0.0001, "loss": 5.8501, "loss/crossentropy": 2.6102797389030457, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.16948660090565681, "step": 7306 }, { "epoch": 0.3321818181818182, "grad_norm": 5.625, "grad_norm_var": 0.17939046223958333, "learning_rate": 0.0001, "loss": 5.9929, "loss/crossentropy": 2.616094321012497, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.18084952980279922, "step": 7308 }, { "epoch": 0.3322727272727273, "grad_norm": 4.90625, "grad_norm_var": 0.28684895833333335, "learning_rate": 0.0001, "loss": 4.9678, "loss/crossentropy": 1.9980733692646027, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.14658349752426147, "step": 7310 }, { "epoch": 0.33236363636363636, "grad_norm": 5.28125, "grad_norm_var": 0.31197509765625, "learning_rate": 0.0001, "loss": 5.7963, "loss/crossentropy": 2.521453082561493, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17436440661549568, "step": 7312 }, { "epoch": 0.33245454545454545, "grad_norm": 6.34375, "grad_norm_var": 0.33941650390625, "learning_rate": 0.0001, "loss": 5.8991, "loss/crossentropy": 2.5301079750061035, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.178890161216259, "step": 7314 }, { "epoch": 0.33254545454545453, "grad_norm": 5.53125, "grad_norm_var": 0.3296712239583333, "learning_rate": 0.0001, "loss": 6.1243, "loss/crossentropy": 2.677391231060028, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18766102939844131, "step": 7316 }, { "epoch": 0.3326363636363636, "grad_norm": 5.3125, "grad_norm_var": 0.21936442057291666, "learning_rate": 0.0001, "loss": 5.3863, "loss/crossentropy": 2.227075606584549, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.15713811665773392, "step": 7318 }, { "epoch": 0.3327272727272727, "grad_norm": 4.9375, "grad_norm_var": 0.21751302083333332, "learning_rate": 0.0001, "loss": 5.2604, "loss/crossentropy": 2.19342839717865, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.15356791764497757, "step": 7320 }, { "epoch": 0.3328181818181818, "grad_norm": 5.0, "grad_norm_var": 0.21848958333333332, "learning_rate": 0.0001, "loss": 5.5164, "loss/crossentropy": 2.296057641506195, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.16304896399378777, "step": 7322 }, { "epoch": 0.33290909090909093, "grad_norm": 7.125, "grad_norm_var": 0.43564046223958336, "learning_rate": 0.0001, "loss": 5.7755, "loss/crossentropy": 2.455841064453125, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17844605818390846, "step": 7324 }, { "epoch": 0.333, "grad_norm": 6.0625, "grad_norm_var": 0.36864827473958334, "learning_rate": 0.0001, "loss": 5.8266, "loss/crossentropy": 2.551252603530884, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.16796036064624786, "step": 7326 }, { "epoch": 0.3330909090909091, "grad_norm": 5.0, "grad_norm_var": 0.33993733723958336, "learning_rate": 0.0001, "loss": 5.7893, "loss/crossentropy": 2.54526025056839, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.16736898198723793, "step": 7328 }, { "epoch": 0.3331818181818182, "grad_norm": 4.6875, "grad_norm_var": 0.324853515625, "learning_rate": 0.0001, "loss": 5.6422, "loss/crossentropy": 2.428013861179352, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16887570917606354, "step": 7330 }, { "epoch": 0.3332727272727273, "grad_norm": 4.75, "grad_norm_var": 0.36028645833333334, "learning_rate": 0.0001, "loss": 5.6317, "loss/crossentropy": 2.4058090150356293, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.16731810197234154, "step": 7332 }, { "epoch": 0.33336363636363636, "grad_norm": 5.84375, "grad_norm_var": 0.37771809895833336, "learning_rate": 0.0001, "loss": 6.1458, "loss/crossentropy": 2.691236436367035, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18998732417821884, "step": 7334 }, { "epoch": 0.33345454545454545, "grad_norm": 5.1875, "grad_norm_var": 0.38622639973958334, "learning_rate": 0.0001, "loss": 5.775, "loss/crossentropy": 2.5030179619789124, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1729017086327076, "step": 7336 }, { "epoch": 0.33354545454545453, "grad_norm": 5.75, "grad_norm_var": 0.39390869140625, "learning_rate": 0.0001, "loss": 5.9109, "loss/crossentropy": 2.561486840248108, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.1773243397474289, "step": 7338 }, { "epoch": 0.3336363636363636, "grad_norm": 7.5625, "grad_norm_var": 0.5024576822916667, "learning_rate": 0.0001, "loss": 5.9644, "loss/crossentropy": 2.6145442128181458, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.17658264935016632, "step": 7340 }, { "epoch": 0.3337272727272727, "grad_norm": 5.34375, "grad_norm_var": 0.47522379557291666, "learning_rate": 0.0001, "loss": 5.7818, "loss/crossentropy": 2.469043552875519, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.1752163991332054, "step": 7342 }, { "epoch": 0.3338181818181818, "grad_norm": 5.6875, "grad_norm_var": 0.46630452473958334, "learning_rate": 0.0001, "loss": 6.1319, "loss/crossentropy": 2.664930522441864, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.1925937570631504, "step": 7344 }, { "epoch": 0.33390909090909093, "grad_norm": 5.8125, "grad_norm_var": 0.4212239583333333, "learning_rate": 0.0001, "loss": 6.101, "loss/crossentropy": 2.64811110496521, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.18572291359305382, "step": 7346 }, { "epoch": 0.334, "grad_norm": 5.34375, "grad_norm_var": 0.510009765625, "learning_rate": 0.0001, "loss": 5.9, "loss/crossentropy": 2.570264518260956, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17672446370124817, "step": 7348 }, { "epoch": 0.3340909090909091, "grad_norm": 5.8125, "grad_norm_var": 0.5102701822916667, "learning_rate": 0.0001, "loss": 6.3053, "loss/crossentropy": 2.823966681957245, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19188128039240837, "step": 7350 }, { "epoch": 0.3341818181818182, "grad_norm": 4.9375, "grad_norm_var": 0.48814697265625, "learning_rate": 0.0001, "loss": 5.8792, "loss/crossentropy": 2.524182915687561, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.177104402333498, "step": 7352 }, { "epoch": 0.3342727272727273, "grad_norm": 5.03125, "grad_norm_var": 0.48626302083333334, "learning_rate": 0.0001, "loss": 5.7384, "loss/crossentropy": 2.4712822437286377, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17163782939314842, "step": 7354 }, { "epoch": 0.33436363636363636, "grad_norm": 5.875, "grad_norm_var": 0.24325764973958333, "learning_rate": 0.0001, "loss": 6.081, "loss/crossentropy": 2.6486698985099792, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18503377214074135, "step": 7356 }, { "epoch": 0.33445454545454545, "grad_norm": 5.21875, "grad_norm_var": 0.23811442057291668, "learning_rate": 0.0001, "loss": 6.0352, "loss/crossentropy": 2.6335780024528503, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1831297166645527, "step": 7358 }, { "epoch": 0.33454545454545453, "grad_norm": 6.09375, "grad_norm_var": 0.24677327473958333, "learning_rate": 0.0001, "loss": 6.1213, "loss/crossentropy": 2.619325578212738, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.19101770594716072, "step": 7360 }, { "epoch": 0.3346363636363636, "grad_norm": 5.6875, "grad_norm_var": 0.25048421223958334, "learning_rate": 0.0001, "loss": 5.9266, "loss/crossentropy": 2.510174036026001, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.1824614368379116, "step": 7362 }, { "epoch": 0.3347272727272727, "grad_norm": 5.75, "grad_norm_var": 0.11370035807291666, "learning_rate": 0.0001, "loss": 6.0311, "loss/crossentropy": 2.6678205728530884, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.17948973923921585, "step": 7364 }, { "epoch": 0.3348181818181818, "grad_norm": 5.125, "grad_norm_var": 0.120166015625, "learning_rate": 0.0001, "loss": 5.6115, "loss/crossentropy": 2.432698607444763, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16475483030080795, "step": 7366 }, { "epoch": 0.33490909090909093, "grad_norm": 5.59375, "grad_norm_var": 0.108056640625, "learning_rate": 0.0001, "loss": 5.8872, "loss/crossentropy": 2.5251446962356567, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.18015021458268166, "step": 7368 }, { "epoch": 0.335, "grad_norm": 5.84375, "grad_norm_var": 0.13557535807291668, "learning_rate": 0.0001, "loss": 5.3944, "loss/crossentropy": 2.245284378528595, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16452119871973991, "step": 7370 }, { "epoch": 0.3350909090909091, "grad_norm": 5.8125, "grad_norm_var": 0.13424072265625, "learning_rate": 0.0001, "loss": 5.9335, "loss/crossentropy": 2.5237144827842712, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18199345096945763, "step": 7372 }, { "epoch": 0.3351818181818182, "grad_norm": 5.125, "grad_norm_var": 0.138525390625, "learning_rate": 0.0001, "loss": 5.6142, "loss/crossentropy": 2.3593953251838684, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.17021091282367706, "step": 7374 }, { "epoch": 0.3352727272727273, "grad_norm": 5.625, "grad_norm_var": 0.109228515625, "learning_rate": 0.0001, "loss": 5.4336, "loss/crossentropy": 2.2437329292297363, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16702980548143387, "step": 7376 }, { "epoch": 0.33536363636363636, "grad_norm": 5.46875, "grad_norm_var": 0.10595296223958334, "learning_rate": 0.0001, "loss": 5.951, "loss/crossentropy": 2.6051706671714783, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17950277030467987, "step": 7378 }, { "epoch": 0.33545454545454545, "grad_norm": 5.21875, "grad_norm_var": 0.10289306640625, "learning_rate": 0.0001, "loss": 6.1777, "loss/crossentropy": 2.7396645545959473, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.18462365493178368, "step": 7380 }, { "epoch": 0.33554545454545454, "grad_norm": 5.6875, "grad_norm_var": 0.10780843098958333, "learning_rate": 0.0001, "loss": 5.8939, "loss/crossentropy": 2.4325972199440002, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.18616559728980064, "step": 7382 }, { "epoch": 0.3356363636363636, "grad_norm": 5.59375, "grad_norm_var": 0.10271809895833334, "learning_rate": 0.0001, "loss": 5.836, "loss/crossentropy": 2.5405735969543457, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17602701857686043, "step": 7384 }, { "epoch": 0.3357272727272727, "grad_norm": 5.21875, "grad_norm_var": 0.05963541666666667, "learning_rate": 0.0001, "loss": 6.2295, "loss/crossentropy": 2.7410935163497925, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1937653198838234, "step": 7386 }, { "epoch": 0.3358181818181818, "grad_norm": 5.25, "grad_norm_var": 0.05028889973958333, "learning_rate": 0.0001, "loss": 5.7967, "loss/crossentropy": 2.4375804662704468, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17888424918055534, "step": 7388 }, { "epoch": 0.33590909090909093, "grad_norm": 5.34375, "grad_norm_var": 0.04771728515625, "learning_rate": 0.0001, "loss": 5.9135, "loss/crossentropy": 2.6140822172164917, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.173885740339756, "step": 7390 }, { "epoch": 0.336, "grad_norm": 5.28125, "grad_norm_var": 0.049479166666666664, "learning_rate": 0.0001, "loss": 5.7155, "loss/crossentropy": 2.4005414247512817, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17485232651233673, "step": 7392 }, { "epoch": 0.3360909090909091, "grad_norm": 5.3125, "grad_norm_var": 0.048421223958333336, "learning_rate": 0.0001, "loss": 5.8606, "loss/crossentropy": 2.546834349632263, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17688657715916634, "step": 7394 }, { "epoch": 0.3361818181818182, "grad_norm": 4.9375, "grad_norm_var": 0.08448893229166667, "learning_rate": 0.0001, "loss": 5.4168, "loss/crossentropy": 2.336260497570038, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1572684682905674, "step": 7396 }, { "epoch": 0.3362727272727273, "grad_norm": 5.125, "grad_norm_var": 0.06545817057291667, "learning_rate": 0.0001, "loss": 6.1942, "loss/crossentropy": 2.834761083126068, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.18105817213654518, "step": 7398 }, { "epoch": 0.33636363636363636, "grad_norm": 5.46875, "grad_norm_var": 0.2912109375, "learning_rate": 0.0001, "loss": 5.5427, "loss/crossentropy": 2.3323437571525574, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.16596175357699394, "step": 7400 }, { "epoch": 0.33645454545454545, "grad_norm": 5.0, "grad_norm_var": 0.30422770182291664, "learning_rate": 0.0001, "loss": 5.5212, "loss/crossentropy": 2.2608346939086914, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.16939297318458557, "step": 7402 }, { "epoch": 0.33654545454545454, "grad_norm": 5.53125, "grad_norm_var": 0.31256510416666666, "learning_rate": 0.0001, "loss": 5.7299, "loss/crossentropy": 2.4681469202041626, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.1673874445259571, "step": 7404 }, { "epoch": 0.3366363636363636, "grad_norm": 5.1875, "grad_norm_var": 0.4095703125, "learning_rate": 0.0001, "loss": 5.916, "loss/crossentropy": 2.599897086620331, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17770465090870857, "step": 7406 }, { "epoch": 0.3367272727272727, "grad_norm": 5.59375, "grad_norm_var": 0.40526936848958334, "learning_rate": 0.0001, "loss": 5.8448, "loss/crossentropy": 2.510252892971039, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.17603203654289246, "step": 7408 }, { "epoch": 0.3368181818181818, "grad_norm": 5.0, "grad_norm_var": 0.417578125, "learning_rate": 0.0001, "loss": 5.7205, "loss/crossentropy": 2.451657712459564, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1729804314672947, "step": 7410 }, { "epoch": 0.33690909090909094, "grad_norm": 4.75, "grad_norm_var": 0.40552978515625, "learning_rate": 0.0001, "loss": 5.6031, "loss/crossentropy": 2.4476343989372253, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16378731653094292, "step": 7412 }, { "epoch": 0.337, "grad_norm": 5.125, "grad_norm_var": 0.418603515625, "learning_rate": 0.0001, "loss": 5.6941, "loss/crossentropy": 2.495083510875702, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.16560716554522514, "step": 7414 }, { "epoch": 0.3370909090909091, "grad_norm": 4.84375, "grad_norm_var": 0.20331624348958333, "learning_rate": 0.0001, "loss": 5.4931, "loss/crossentropy": 2.409870743751526, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.1557803489267826, "step": 7416 }, { "epoch": 0.3371818181818182, "grad_norm": 5.5625, "grad_norm_var": 0.20692952473958334, "learning_rate": 0.0001, "loss": 5.8484, "loss/crossentropy": 2.517484128475189, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.17469745501875877, "step": 7418 }, { "epoch": 0.3372727272727273, "grad_norm": 5.28125, "grad_norm_var": 0.36184895833333336, "learning_rate": 0.0001, "loss": 6.0429, "loss/crossentropy": 2.5917909145355225, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.18905501440167427, "step": 7420 }, { "epoch": 0.33736363636363637, "grad_norm": 5.59375, "grad_norm_var": 0.5046223958333333, "learning_rate": 0.0001, "loss": 5.4856, "loss/crossentropy": 2.2892252802848816, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.16495148092508316, "step": 7422 }, { "epoch": 0.33745454545454545, "grad_norm": 5.65625, "grad_norm_var": 0.5056599934895833, "learning_rate": 0.0001, "loss": 6.0055, "loss/crossentropy": 2.5863245129585266, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.1846875250339508, "step": 7424 }, { "epoch": 0.33754545454545454, "grad_norm": 6.46875, "grad_norm_var": 0.5501139322916667, "learning_rate": 0.0001, "loss": 6.3047, "loss/crossentropy": 2.8601332902908325, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18820984289050102, "step": 7426 }, { "epoch": 0.3376363636363636, "grad_norm": 5.125, "grad_norm_var": 0.50904541015625, "learning_rate": 0.0001, "loss": 5.5293, "loss/crossentropy": 2.307663172483444, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.16552451997995377, "step": 7428 }, { "epoch": 0.3377272727272727, "grad_norm": 5.28125, "grad_norm_var": 0.47394205729166666, "learning_rate": 0.0001, "loss": 5.8127, "loss/crossentropy": 2.4916341304779053, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.17917665094137192, "step": 7430 }, { "epoch": 0.3378181818181818, "grad_norm": 5.40625, "grad_norm_var": 0.4395670572916667, "learning_rate": 0.0001, "loss": 5.4876, "loss/crossentropy": 2.344219744205475, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1614132933318615, "step": 7432 }, { "epoch": 0.33790909090909094, "grad_norm": 5.34375, "grad_norm_var": 0.44830729166666666, "learning_rate": 0.0001, "loss": 6.3408, "loss/crossentropy": 2.8669037222862244, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.1921132169663906, "step": 7434 }, { "epoch": 0.338, "grad_norm": 5.40625, "grad_norm_var": 0.33411051432291666, "learning_rate": 0.0001, "loss": 6.378, "loss/crossentropy": 2.9287912845611572, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19023333862423897, "step": 7436 }, { "epoch": 0.3380909090909091, "grad_norm": 5.34375, "grad_norm_var": 0.1240234375, "learning_rate": 0.0001, "loss": 5.8771, "loss/crossentropy": 2.461836576461792, "loss/hidden": 1.607421875, "loss/jsd": 0.0, "loss/logits": 0.180780827999115, "step": 7438 }, { "epoch": 0.3381818181818182, "grad_norm": 5.28125, "grad_norm_var": 0.126416015625, "learning_rate": 0.0001, "loss": 5.9066, "loss/crossentropy": 2.5799349546432495, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17817208170890808, "step": 7440 }, { "epoch": 0.3382727272727273, "grad_norm": 4.75, "grad_norm_var": 0.08761393229166667, "learning_rate": 0.0001, "loss": 5.715, "loss/crossentropy": 2.4714918732643127, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17083991691470146, "step": 7442 }, { "epoch": 0.33836363636363637, "grad_norm": 5.34375, "grad_norm_var": 0.08430989583333333, "learning_rate": 0.0001, "loss": 6.082, "loss/crossentropy": 2.7265704870224, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17890483886003494, "step": 7444 }, { "epoch": 0.33845454545454545, "grad_norm": 4.8125, "grad_norm_var": 0.11287434895833333, "learning_rate": 0.0001, "loss": 5.5344, "loss/crossentropy": 2.362063944339752, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16488901525735855, "step": 7446 }, { "epoch": 0.33854545454545454, "grad_norm": 5.03125, "grad_norm_var": 0.12434895833333333, "learning_rate": 0.0001, "loss": 5.8238, "loss/crossentropy": 2.600530982017517, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16880978271365166, "step": 7448 }, { "epoch": 0.3386363636363636, "grad_norm": 5.75, "grad_norm_var": 0.12220052083333334, "learning_rate": 0.0001, "loss": 6.3454, "loss/crossentropy": 2.8697733283042908, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.18994362279772758, "step": 7450 }, { "epoch": 0.3387272727272727, "grad_norm": 5.0625, "grad_norm_var": 0.096728515625, "learning_rate": 0.0001, "loss": 5.8326, "loss/crossentropy": 2.5923022031784058, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17090797424316406, "step": 7452 }, { "epoch": 0.3388181818181818, "grad_norm": 5.5, "grad_norm_var": 0.11105143229166667, "learning_rate": 0.0001, "loss": 5.8485, "loss/crossentropy": 2.5514355897903442, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17462852597236633, "step": 7454 }, { "epoch": 0.3389090909090909, "grad_norm": 5.375, "grad_norm_var": 0.22375895182291666, "learning_rate": 0.0001, "loss": 5.8839, "loss/crossentropy": 2.543789267539978, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.17717386782169342, "step": 7456 }, { "epoch": 0.339, "grad_norm": 6.0, "grad_norm_var": 0.2518513997395833, "learning_rate": 0.0001, "loss": 5.7592, "loss/crossentropy": 2.4683006405830383, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17694422230124474, "step": 7458 }, { "epoch": 0.3390909090909091, "grad_norm": 5.40625, "grad_norm_var": 0.25948893229166664, "learning_rate": 0.0001, "loss": 6.0639, "loss/crossentropy": 2.6798218488693237, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18216022104024887, "step": 7460 }, { "epoch": 0.3391818181818182, "grad_norm": 5.46875, "grad_norm_var": 0.22805582682291667, "learning_rate": 0.0001, "loss": 5.5224, "loss/crossentropy": 2.2871183156967163, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.16513438522815704, "step": 7462 }, { "epoch": 0.3392727272727273, "grad_norm": 4.78125, "grad_norm_var": 0.22576497395833334, "learning_rate": 0.0001, "loss": 5.8157, "loss/crossentropy": 2.5748071670532227, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.16959283500909805, "step": 7464 }, { "epoch": 0.33936363636363637, "grad_norm": 5.03125, "grad_norm_var": 0.21796875, "learning_rate": 0.0001, "loss": 5.7356, "loss/crossentropy": 2.5501828789711, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16698135435581207, "step": 7466 }, { "epoch": 0.33945454545454545, "grad_norm": 5.03125, "grad_norm_var": 0.22029622395833334, "learning_rate": 0.0001, "loss": 5.7276, "loss/crossentropy": 2.481203019618988, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.16897498071193695, "step": 7468 }, { "epoch": 0.33954545454545454, "grad_norm": 4.46875, "grad_norm_var": 0.22629801432291666, "learning_rate": 0.0001, "loss": 5.2849, "loss/crossentropy": 2.213577061891556, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.15420495718717575, "step": 7470 }, { "epoch": 0.3396363636363636, "grad_norm": 5.5, "grad_norm_var": 0.13365478515625, "learning_rate": 0.0001, "loss": 5.7, "loss/crossentropy": 2.454045534133911, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17068738117814064, "step": 7472 }, { "epoch": 0.3397272727272727, "grad_norm": 5.46875, "grad_norm_var": 0.09293212890625, "learning_rate": 0.0001, "loss": 5.9937, "loss/crossentropy": 2.658147871494293, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.18140656128525734, "step": 7474 }, { "epoch": 0.3398181818181818, "grad_norm": 5.25, "grad_norm_var": 0.08189697265625, "learning_rate": 0.0001, "loss": 5.8344, "loss/crossentropy": 2.5336162447929382, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17577774077653885, "step": 7476 }, { "epoch": 0.3399090909090909, "grad_norm": 4.90625, "grad_norm_var": 0.08722330729166666, "learning_rate": 0.0001, "loss": 5.65, "loss/crossentropy": 2.4052090644836426, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17056892067193985, "step": 7478 }, { "epoch": 0.34, "grad_norm": 5.21875, "grad_norm_var": 0.07928059895833334, "learning_rate": 0.0001, "loss": 5.8633, "loss/crossentropy": 2.5454994440078735, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.176900252699852, "step": 7480 }, { "epoch": 0.3400909090909091, "grad_norm": 5.59375, "grad_norm_var": 0.13590087890625, "learning_rate": 0.0001, "loss": 5.0897, "loss/crossentropy": 2.0792024433612823, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.14987558871507645, "step": 7482 }, { "epoch": 0.3401818181818182, "grad_norm": 5.1875, "grad_norm_var": 0.1353515625, "learning_rate": 0.0001, "loss": 5.4032, "loss/crossentropy": 2.2760419845581055, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.15881304070353508, "step": 7484 }, { "epoch": 0.3402727272727273, "grad_norm": 5.5625, "grad_norm_var": 0.11457926432291667, "learning_rate": 0.0001, "loss": 5.7478, "loss/crossentropy": 2.453454911708832, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17435337975621223, "step": 7486 }, { "epoch": 0.34036363636363637, "grad_norm": 5.40625, "grad_norm_var": 0.13331705729166668, "learning_rate": 0.0001, "loss": 6.0868, "loss/crossentropy": 2.705544114112854, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18538892269134521, "step": 7488 }, { "epoch": 0.34045454545454545, "grad_norm": 5.6875, "grad_norm_var": 0.14108072916666667, "learning_rate": 0.0001, "loss": 5.7363, "loss/crossentropy": 2.4662355184555054, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1715364009141922, "step": 7490 }, { "epoch": 0.34054545454545454, "grad_norm": 5.65625, "grad_norm_var": 0.15155843098958333, "learning_rate": 0.0001, "loss": 5.2519, "loss/crossentropy": 2.087756484746933, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.15802082791924477, "step": 7492 }, { "epoch": 0.3406363636363636, "grad_norm": 5.25, "grad_norm_var": 0.15358072916666668, "learning_rate": 0.0001, "loss": 6.1863, "loss/crossentropy": 2.737539291381836, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19018686562776566, "step": 7494 }, { "epoch": 0.3407272727272727, "grad_norm": 4.96875, "grad_norm_var": 0.15754801432291668, "learning_rate": 0.0001, "loss": 5.5142, "loss/crossentropy": 2.360860586166382, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16318939626216888, "step": 7496 }, { "epoch": 0.3408181818181818, "grad_norm": 5.25, "grad_norm_var": 0.09286702473958333, "learning_rate": 0.0001, "loss": 5.596, "loss/crossentropy": 2.3816097378730774, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.16577406227588654, "step": 7498 }, { "epoch": 0.3409090909090909, "grad_norm": 5.6875, "grad_norm_var": 1.8549479166666667, "learning_rate": 0.0001, "loss": 5.7471, "loss/crossentropy": 2.266848385334015, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1874791570007801, "step": 7500 }, { "epoch": 0.341, "grad_norm": 5.28125, "grad_norm_var": 1.8693644205729167, "learning_rate": 0.0001, "loss": 5.7543, "loss/crossentropy": 2.4737756848335266, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.17199822515249252, "step": 7502 }, { "epoch": 0.3410909090909091, "grad_norm": 5.25, "grad_norm_var": 1.8856770833333334, "learning_rate": 0.0001, "loss": 6.1344, "loss/crossentropy": 2.750528335571289, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.1862426921725273, "step": 7504 }, { "epoch": 0.3411818181818182, "grad_norm": 8.4375, "grad_norm_var": 2.3411458333333335, "learning_rate": 0.0001, "loss": 5.9567, "loss/crossentropy": 2.3638781309127808, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.18232634291052818, "step": 7506 }, { "epoch": 0.3412727272727273, "grad_norm": 5.375, "grad_norm_var": 2.346077473958333, "learning_rate": 0.0001, "loss": 6.0046, "loss/crossentropy": 2.725671708583832, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17574190348386765, "step": 7508 }, { "epoch": 0.34136363636363637, "grad_norm": 5.6875, "grad_norm_var": 2.3306599934895833, "learning_rate": 0.0001, "loss": 5.7353, "loss/crossentropy": 2.414307564496994, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1754600927233696, "step": 7510 }, { "epoch": 0.34145454545454546, "grad_norm": 5.375, "grad_norm_var": 2.2481119791666666, "learning_rate": 0.0001, "loss": 5.6139, "loss/crossentropy": 2.3657326102256775, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.17071516439318657, "step": 7512 }, { "epoch": 0.34154545454545454, "grad_norm": 6.40625, "grad_norm_var": 2.203369140625, "learning_rate": 0.0001, "loss": 5.7858, "loss/crossentropy": 2.448819637298584, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.17491088807582855, "step": 7514 }, { "epoch": 0.3416363636363636, "grad_norm": 5.5, "grad_norm_var": 0.6360677083333334, "learning_rate": 0.0001, "loss": 5.9331, "loss/crossentropy": 2.507777452468872, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.18296552449464798, "step": 7516 }, { "epoch": 0.3417272727272727, "grad_norm": 6.09375, "grad_norm_var": 0.6226847330729167, "learning_rate": 0.0001, "loss": 5.6477, "loss/crossentropy": 2.312514007091522, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.175511185079813, "step": 7518 }, { "epoch": 0.3418181818181818, "grad_norm": 4.96875, "grad_norm_var": 0.6512980143229167, "learning_rate": 0.0001, "loss": 5.8776, "loss/crossentropy": 2.5318419337272644, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.1796925961971283, "step": 7520 }, { "epoch": 0.3419090909090909, "grad_norm": 5.125, "grad_norm_var": 0.18316650390625, "learning_rate": 0.0001, "loss": 5.6212, "loss/crossentropy": 2.4496193528175354, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16481401771306992, "step": 7522 }, { "epoch": 0.342, "grad_norm": 5.03125, "grad_norm_var": 0.25579427083333334, "learning_rate": 0.0001, "loss": 6.0197, "loss/crossentropy": 2.6105279326438904, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18623282760381699, "step": 7524 }, { "epoch": 0.3420909090909091, "grad_norm": 5.125, "grad_norm_var": 0.27792561848958336, "learning_rate": 0.0001, "loss": 5.9702, "loss/crossentropy": 2.5585920214653015, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.18549181893467903, "step": 7526 }, { "epoch": 0.3421818181818182, "grad_norm": 7.09375, "grad_norm_var": 0.4356608072916667, "learning_rate": 0.0001, "loss": 6.1816, "loss/crossentropy": 2.692238986492157, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18956440314650536, "step": 7528 }, { "epoch": 0.3422727272727273, "grad_norm": 5.21875, "grad_norm_var": 0.4167277018229167, "learning_rate": 0.0001, "loss": 5.5705, "loss/crossentropy": 2.325917601585388, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1701608970761299, "step": 7530 }, { "epoch": 0.34236363636363637, "grad_norm": 4.9375, "grad_norm_var": 0.4241536458333333, "learning_rate": 0.0001, "loss": 5.8091, "loss/crossentropy": 2.5049081444740295, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.17788486927747726, "step": 7532 }, { "epoch": 0.34245454545454546, "grad_norm": 5.5625, "grad_norm_var": 0.39729410807291665, "learning_rate": 0.0001, "loss": 6.04, "loss/crossentropy": 2.6289483308792114, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1856338456273079, "step": 7534 }, { "epoch": 0.34254545454545454, "grad_norm": 15.5, "grad_norm_var": 6.763411458333334, "learning_rate": 0.0001, "loss": 6.3564, "loss/crossentropy": 2.712366223335266, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.20580780506134033, "step": 7536 }, { "epoch": 0.34263636363636363, "grad_norm": 6.4375, "grad_norm_var": 6.723140462239583, "learning_rate": 0.0001, "loss": 6.0329, "loss/crossentropy": 2.464118778705597, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19399037584662437, "step": 7538 }, { "epoch": 0.3427272727272727, "grad_norm": 5.59375, "grad_norm_var": 6.682222493489584, "learning_rate": 0.0001, "loss": 5.8222, "loss/crossentropy": 2.478808879852295, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17926260083913803, "step": 7540 }, { "epoch": 0.3428181818181818, "grad_norm": 5.0625, "grad_norm_var": 6.716206868489583, "learning_rate": 0.0001, "loss": 5.6208, "loss/crossentropy": 2.4489314556121826, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16542737185955048, "step": 7542 }, { "epoch": 0.3429090909090909, "grad_norm": 5.4375, "grad_norm_var": 6.644466145833333, "learning_rate": 0.0001, "loss": 6.0608, "loss/crossentropy": 2.6189202070236206, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18559002503752708, "step": 7544 }, { "epoch": 0.343, "grad_norm": 5.3125, "grad_norm_var": 6.569950358072917, "learning_rate": 0.0001, "loss": 5.4886, "loss/crossentropy": 2.2805938720703125, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1657264567911625, "step": 7546 }, { "epoch": 0.3430909090909091, "grad_norm": 5.15625, "grad_norm_var": 6.607405598958334, "learning_rate": 0.0001, "loss": 5.4475, "loss/crossentropy": 2.294269472360611, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.16004830598831177, "step": 7548 }, { "epoch": 0.3431818181818182, "grad_norm": 6.21875, "grad_norm_var": 6.517952473958333, "learning_rate": 0.0001, "loss": 5.8934, "loss/crossentropy": 2.608040928840637, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17267774045467377, "step": 7550 }, { "epoch": 0.3432727272727273, "grad_norm": 6.34375, "grad_norm_var": 0.5062459309895834, "learning_rate": 0.0001, "loss": 5.9442, "loss/crossentropy": 2.622525453567505, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1747441478073597, "step": 7552 }, { "epoch": 0.34336363636363637, "grad_norm": 6.0, "grad_norm_var": 0.29182535807291665, "learning_rate": 0.0001, "loss": 5.8672, "loss/crossentropy": 2.458222448825836, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18464716523885727, "step": 7554 }, { "epoch": 0.34345454545454546, "grad_norm": 5.28125, "grad_norm_var": 0.3010050455729167, "learning_rate": 0.0001, "loss": 5.7367, "loss/crossentropy": 2.3981602787971497, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1768268272280693, "step": 7556 }, { "epoch": 0.34354545454545454, "grad_norm": 5.34375, "grad_norm_var": 0.2833170572916667, "learning_rate": 0.0001, "loss": 5.897, "loss/crossentropy": 2.564927339553833, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.176368560642004, "step": 7558 }, { "epoch": 0.34363636363636363, "grad_norm": 4.875, "grad_norm_var": 0.31236572265625, "learning_rate": 0.0001, "loss": 5.4577, "loss/crossentropy": 2.3150975108146667, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16308392584323883, "step": 7560 }, { "epoch": 0.3437272727272727, "grad_norm": 6.28125, "grad_norm_var": 0.3381144205729167, "learning_rate": 0.0001, "loss": 6.2344, "loss/crossentropy": 2.817262053489685, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.18722542375326157, "step": 7562 }, { "epoch": 0.3438181818181818, "grad_norm": 4.6875, "grad_norm_var": 0.33853759765625, "learning_rate": 0.0001, "loss": 5.7075, "loss/crossentropy": 2.463107109069824, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.17034076899290085, "step": 7564 }, { "epoch": 0.3439090909090909, "grad_norm": 4.78125, "grad_norm_var": 0.37604166666666666, "learning_rate": 0.0001, "loss": 5.3485, "loss/crossentropy": 2.2457701563835144, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.15695305541157722, "step": 7566 }, { "epoch": 0.344, "grad_norm": 5.53125, "grad_norm_var": 0.32265625, "learning_rate": 0.0001, "loss": 5.8609, "loss/crossentropy": 2.5197285413742065, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17903994768857956, "step": 7568 }, { "epoch": 0.3440909090909091, "grad_norm": 5.78125, "grad_norm_var": 0.16705729166666666, "learning_rate": 0.0001, "loss": 5.9035, "loss/crossentropy": 2.514367312192917, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.1824645884335041, "step": 7570 }, { "epoch": 0.3441818181818182, "grad_norm": 5.0, "grad_norm_var": 0.18938395182291667, "learning_rate": 0.0001, "loss": 5.6058, "loss/crossentropy": 2.43924880027771, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1643136739730835, "step": 7572 }, { "epoch": 0.3442727272727273, "grad_norm": 8.0, "grad_norm_var": 0.6680623372395833, "learning_rate": 0.0001, "loss": 5.9685, "loss/crossentropy": 2.5483257174491882, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.18596284836530685, "step": 7574 }, { "epoch": 0.34436363636363637, "grad_norm": 4.78125, "grad_norm_var": 0.6700520833333333, "learning_rate": 0.0001, "loss": 5.369, "loss/crossentropy": 2.225980818271637, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16059526801109314, "step": 7576 }, { "epoch": 0.34445454545454546, "grad_norm": 4.65625, "grad_norm_var": 0.6619791666666667, "learning_rate": 0.0001, "loss": 5.4457, "loss/crossentropy": 2.327445387840271, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.159285556524992, "step": 7578 }, { "epoch": 0.34454545454545454, "grad_norm": 5.0, "grad_norm_var": 0.6445597330729167, "learning_rate": 0.0001, "loss": 5.8822, "loss/crossentropy": 2.5718175768852234, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17752453684806824, "step": 7580 }, { "epoch": 0.34463636363636363, "grad_norm": 4.78125, "grad_norm_var": 0.641015625, "learning_rate": 0.0001, "loss": 5.8734, "loss/crossentropy": 2.560241997241974, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17701539769768715, "step": 7582 }, { "epoch": 0.3447272727272727, "grad_norm": 4.96875, "grad_norm_var": 0.6521484375, "learning_rate": 0.0001, "loss": 5.9193, "loss/crossentropy": 2.6087165474891663, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1775454767048359, "step": 7584 }, { "epoch": 0.3448181818181818, "grad_norm": 5.0, "grad_norm_var": 0.6357706705729167, "learning_rate": 0.0001, "loss": 5.7876, "loss/crossentropy": 2.55258047580719, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17115382850170135, "step": 7586 }, { "epoch": 0.3449090909090909, "grad_norm": 6.125, "grad_norm_var": 0.6587076822916667, "learning_rate": 0.0001, "loss": 5.6013, "loss/crossentropy": 2.3683518171310425, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.16802287474274635, "step": 7588 }, { "epoch": 0.345, "grad_norm": 5.46875, "grad_norm_var": 0.15416259765625, "learning_rate": 0.0001, "loss": 5.7725, "loss/crossentropy": 2.477569043636322, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.17421583086252213, "step": 7590 }, { "epoch": 0.3450909090909091, "grad_norm": 4.6875, "grad_norm_var": 0.15930582682291666, "learning_rate": 0.0001, "loss": 5.2895, "loss/crossentropy": 2.22902050614357, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1550750806927681, "step": 7592 }, { "epoch": 0.3451818181818182, "grad_norm": 5.84375, "grad_norm_var": 0.19329020182291667, "learning_rate": 0.0001, "loss": 5.7304, "loss/crossentropy": 2.3601896166801453, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.17608311027288437, "step": 7594 }, { "epoch": 0.3452727272727273, "grad_norm": 5.34375, "grad_norm_var": 0.21873372395833332, "learning_rate": 0.0001, "loss": 5.9713, "loss/crossentropy": 2.5656110048294067, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.1825632005929947, "step": 7596 }, { "epoch": 0.3453636363636364, "grad_norm": 5.625, "grad_norm_var": 0.19423421223958334, "learning_rate": 0.0001, "loss": 6.0281, "loss/crossentropy": 2.604975938796997, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.1862541139125824, "step": 7598 }, { "epoch": 0.34545454545454546, "grad_norm": 5.3125, "grad_norm_var": 0.1744140625, "learning_rate": 0.0001, "loss": 5.5301, "loss/crossentropy": 2.3345640301704407, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.16486188769340515, "step": 7600 }, { "epoch": 0.34554545454545454, "grad_norm": 4.75, "grad_norm_var": 0.18515625, "learning_rate": 0.0001, "loss": 5.8052, "loss/crossentropy": 2.5512454509735107, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17344561591744423, "step": 7602 }, { "epoch": 0.34563636363636363, "grad_norm": 4.90625, "grad_norm_var": 0.17519124348958334, "learning_rate": 0.0001, "loss": 5.6606, "loss/crossentropy": 2.3839803636074066, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.16984569281339645, "step": 7604 }, { "epoch": 0.3457272727272727, "grad_norm": 4.875, "grad_norm_var": 0.19250895182291666, "learning_rate": 0.0001, "loss": 5.531, "loss/crossentropy": 2.389645516872406, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16159548610448837, "step": 7606 }, { "epoch": 0.3458181818181818, "grad_norm": 4.96875, "grad_norm_var": 0.16376546223958333, "learning_rate": 0.0001, "loss": 5.843, "loss/crossentropy": 2.574206292629242, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.17316913977265358, "step": 7608 }, { "epoch": 0.3459090909090909, "grad_norm": 5.1875, "grad_norm_var": 0.116650390625, "learning_rate": 0.0001, "loss": 5.3826, "loss/crossentropy": 2.2234974205493927, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16239111870527267, "step": 7610 }, { "epoch": 0.346, "grad_norm": 5.46875, "grad_norm_var": 0.08163655598958333, "learning_rate": 0.0001, "loss": 5.7897, "loss/crossentropy": 2.495492458343506, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.1757146306335926, "step": 7612 }, { "epoch": 0.3460909090909091, "grad_norm": 4.84375, "grad_norm_var": 0.079931640625, "learning_rate": 0.0001, "loss": 5.7368, "loss/crossentropy": 2.466887354850769, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.17210714146494865, "step": 7614 }, { "epoch": 0.3461818181818182, "grad_norm": 5.09375, "grad_norm_var": 0.07769775390625, "learning_rate": 0.0001, "loss": 5.8056, "loss/crossentropy": 2.538061022758484, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.16913577541708946, "step": 7616 }, { "epoch": 0.3462727272727273, "grad_norm": 5.46875, "grad_norm_var": 0.0650390625, "learning_rate": 0.0001, "loss": 5.7783, "loss/crossentropy": 2.4227917194366455, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1785191111266613, "step": 7618 }, { "epoch": 0.3463636363636364, "grad_norm": 5.40625, "grad_norm_var": 0.0603515625, "learning_rate": 0.0001, "loss": 5.8703, "loss/crossentropy": 2.563526690006256, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.17658046633005142, "step": 7620 }, { "epoch": 0.34645454545454546, "grad_norm": 5.21875, "grad_norm_var": 0.07745768229166666, "learning_rate": 0.0001, "loss": 6.1774, "loss/crossentropy": 2.71617990732193, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.1896805539727211, "step": 7622 }, { "epoch": 0.34654545454545455, "grad_norm": 5.09375, "grad_norm_var": 0.07235921223958333, "learning_rate": 0.0001, "loss": 5.7939, "loss/crossentropy": 2.48145067691803, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.1744108609855175, "step": 7624 }, { "epoch": 0.34663636363636363, "grad_norm": 5.375, "grad_norm_var": 0.07659098307291666, "learning_rate": 0.0001, "loss": 5.7168, "loss/crossentropy": 2.4435293078422546, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17283335700631142, "step": 7626 }, { "epoch": 0.3467272727272727, "grad_norm": 5.84375, "grad_norm_var": 0.12760416666666666, "learning_rate": 0.0001, "loss": 5.7708, "loss/crossentropy": 2.4802876710891724, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.17026277631521225, "step": 7628 }, { "epoch": 0.3468181818181818, "grad_norm": 5.03125, "grad_norm_var": 0.11926676432291666, "learning_rate": 0.0001, "loss": 5.7298, "loss/crossentropy": 2.4432942271232605, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.17220642417669296, "step": 7630 }, { "epoch": 0.3469090909090909, "grad_norm": 5.25, "grad_norm_var": 0.13919270833333333, "learning_rate": 0.0001, "loss": 5.0271, "loss/crossentropy": 2.024901568889618, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.14807253703474998, "step": 7632 }, { "epoch": 0.347, "grad_norm": 8.875, "grad_norm_var": 0.9624348958333333, "learning_rate": 0.0001, "loss": 5.8321, "loss/crossentropy": 2.5336541533470154, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.17574751004576683, "step": 7634 }, { "epoch": 0.3470909090909091, "grad_norm": 5.5, "grad_norm_var": 0.9683430989583334, "learning_rate": 0.0001, "loss": 5.9166, "loss/crossentropy": 2.533679723739624, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1816539280116558, "step": 7636 }, { "epoch": 0.3471818181818182, "grad_norm": 4.9375, "grad_norm_var": 1.1511027018229167, "learning_rate": 0.0001, "loss": 5.4043, "loss/crossentropy": 2.261030852794647, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16296308115124702, "step": 7638 }, { "epoch": 0.3472727272727273, "grad_norm": 5.25, "grad_norm_var": 1.1528483072916667, "learning_rate": 0.0001, "loss": 5.71, "loss/crossentropy": 2.4508626461029053, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.16985653340816498, "step": 7640 }, { "epoch": 0.3473636363636364, "grad_norm": 6.25, "grad_norm_var": 1.1744099934895833, "learning_rate": 0.0001, "loss": 6.2922, "loss/crossentropy": 2.8364033699035645, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1877700500190258, "step": 7642 }, { "epoch": 0.34745454545454546, "grad_norm": 4.9375, "grad_norm_var": 1.1367146809895834, "learning_rate": 0.0001, "loss": 5.3941, "loss/crossentropy": 2.254979431629181, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16117724403738976, "step": 7644 }, { "epoch": 0.34754545454545455, "grad_norm": 5.28125, "grad_norm_var": 1.1223795572916666, "learning_rate": 0.0001, "loss": 5.7834, "loss/crossentropy": 2.459070533514023, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17618151754140854, "step": 7646 }, { "epoch": 0.34763636363636363, "grad_norm": 7.125, "grad_norm_var": 1.1930826822916667, "learning_rate": 0.0001, "loss": 5.7279, "loss/crossentropy": 2.4611469507217407, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.16925305500626564, "step": 7648 }, { "epoch": 0.3477272727272727, "grad_norm": 5.5625, "grad_norm_var": 0.47877604166666665, "learning_rate": 0.0001, "loss": 6.0186, "loss/crossentropy": 2.6605945229530334, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1811118982732296, "step": 7650 }, { "epoch": 0.3478181818181818, "grad_norm": 5.375, "grad_norm_var": 0.49529622395833334, "learning_rate": 0.0001, "loss": 5.7363, "loss/crossentropy": 2.5228059887886047, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16861902177333832, "step": 7652 }, { "epoch": 0.3479090909090909, "grad_norm": 5.1875, "grad_norm_var": 0.304931640625, "learning_rate": 0.0001, "loss": 5.9415, "loss/crossentropy": 2.6654938459396362, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17427891492843628, "step": 7654 }, { "epoch": 0.348, "grad_norm": 4.65625, "grad_norm_var": 0.3458943684895833, "learning_rate": 0.0001, "loss": 5.3848, "loss/crossentropy": 2.349590003490448, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15450213849544525, "step": 7656 }, { "epoch": 0.3480909090909091, "grad_norm": 5.375, "grad_norm_var": 0.2981770833333333, "learning_rate": 0.0001, "loss": 6.0129, "loss/crossentropy": 2.6638248562812805, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.1792413778603077, "step": 7658 }, { "epoch": 0.3481818181818182, "grad_norm": 5.03125, "grad_norm_var": 0.29225260416666665, "learning_rate": 0.0001, "loss": 5.5601, "loss/crossentropy": 2.3794214725494385, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16494561731815338, "step": 7660 }, { "epoch": 0.3482727272727273, "grad_norm": 5.4375, "grad_norm_var": 0.28513997395833335, "learning_rate": 0.0001, "loss": 5.7782, "loss/crossentropy": 2.4439194202423096, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.17776622623205185, "step": 7662 }, { "epoch": 0.3483636363636364, "grad_norm": 4.84375, "grad_norm_var": 0.07701416015625, "learning_rate": 0.0001, "loss": 5.7573, "loss/crossentropy": 2.545481562614441, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16942528635263443, "step": 7664 }, { "epoch": 0.34845454545454546, "grad_norm": 5.28125, "grad_norm_var": 0.067041015625, "learning_rate": 0.0001, "loss": 5.6659, "loss/crossentropy": 2.453036308288574, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.16561991348862648, "step": 7666 }, { "epoch": 0.34854545454545455, "grad_norm": 5.15625, "grad_norm_var": 0.06555989583333334, "learning_rate": 0.0001, "loss": 5.9563, "loss/crossentropy": 2.626330256462097, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.17654701694846153, "step": 7668 }, { "epoch": 0.34863636363636363, "grad_norm": 4.59375, "grad_norm_var": 0.08684895833333334, "learning_rate": 0.0001, "loss": 5.1009, "loss/crossentropy": 2.081808567047119, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.14742188155651093, "step": 7670 }, { "epoch": 0.3487272727272727, "grad_norm": 4.59375, "grad_norm_var": 0.099609375, "learning_rate": 0.0001, "loss": 5.3889, "loss/crossentropy": 2.198450654745102, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.16201245784759521, "step": 7672 }, { "epoch": 0.3488181818181818, "grad_norm": 5.3125, "grad_norm_var": 0.09664306640625, "learning_rate": 0.0001, "loss": 5.8669, "loss/crossentropy": 2.4734710454940796, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18036143854260445, "step": 7674 }, { "epoch": 0.3489090909090909, "grad_norm": 5.4375, "grad_norm_var": 0.09908447265625, "learning_rate": 0.0001, "loss": 5.7125, "loss/crossentropy": 2.5081340074539185, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16848578676581383, "step": 7676 }, { "epoch": 0.349, "grad_norm": 5.5, "grad_norm_var": 0.09576822916666666, "learning_rate": 0.0001, "loss": 5.586, "loss/crossentropy": 2.368090033531189, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.16651922836899757, "step": 7678 }, { "epoch": 0.3490909090909091, "grad_norm": 5.21875, "grad_norm_var": 0.08655192057291666, "learning_rate": 0.0001, "loss": 5.6588, "loss/crossentropy": 2.4728167057037354, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.16430116444826126, "step": 7680 }, { "epoch": 0.3491818181818182, "grad_norm": 5.125, "grad_norm_var": 0.10792643229166667, "learning_rate": 0.0001, "loss": 6.0289, "loss/crossentropy": 2.682537794113159, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.18092236667871475, "step": 7682 }, { "epoch": 0.3492727272727273, "grad_norm": 5.1875, "grad_norm_var": 0.7391276041666667, "learning_rate": 0.0001, "loss": 5.4539, "loss/crossentropy": 2.2742587327957153, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.16327635943889618, "step": 7684 }, { "epoch": 0.3493636363636364, "grad_norm": 4.90625, "grad_norm_var": 0.70191650390625, "learning_rate": 0.0001, "loss": 5.6001, "loss/crossentropy": 2.4227518439292908, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16499808803200722, "step": 7686 }, { "epoch": 0.34945454545454546, "grad_norm": 5.15625, "grad_norm_var": 0.6883951822916666, "learning_rate": 0.0001, "loss": 5.4248, "loss/crossentropy": 2.2995139360427856, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.15725506842136383, "step": 7688 }, { "epoch": 0.34954545454545455, "grad_norm": 5.46875, "grad_norm_var": 0.6862589518229166, "learning_rate": 0.0001, "loss": 5.8548, "loss/crossentropy": 2.601768672466278, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17412984371185303, "step": 7690 }, { "epoch": 0.34963636363636363, "grad_norm": 4.53125, "grad_norm_var": 0.7460896809895833, "learning_rate": 0.0001, "loss": 5.3086, "loss/crossentropy": 2.198653668165207, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.1576768048107624, "step": 7692 }, { "epoch": 0.3497272727272727, "grad_norm": 4.875, "grad_norm_var": 0.766259765625, "learning_rate": 0.0001, "loss": 5.9628, "loss/crossentropy": 2.6434834599494934, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.17939582094550133, "step": 7694 }, { "epoch": 0.3498181818181818, "grad_norm": 6.5, "grad_norm_var": 2.933984375, "learning_rate": 0.0001, "loss": 5.5609, "loss/crossentropy": 2.1851328909397125, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.17996232956647873, "step": 7696 }, { "epoch": 0.3499090909090909, "grad_norm": 4.84375, "grad_norm_var": 2.9669230143229166, "learning_rate": 0.0001, "loss": 5.716, "loss/crossentropy": 2.468975305557251, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17137747630476952, "step": 7698 }, { "epoch": 0.35, "grad_norm": 5.125, "grad_norm_var": 2.5112589518229167, "learning_rate": 0.0001, "loss": 5.8336, "loss/crossentropy": 2.515192747116089, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17598500475287437, "step": 7700 }, { "epoch": 0.35009090909090906, "grad_norm": 5.1875, "grad_norm_var": 2.5015909830729166, "learning_rate": 0.0001, "loss": 5.3969, "loss/crossentropy": 2.2778880894184113, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.15624162927269936, "step": 7702 }, { "epoch": 0.3501818181818182, "grad_norm": 5.5, "grad_norm_var": 2.45689697265625, "learning_rate": 0.0001, "loss": 5.8454, "loss/crossentropy": 2.5474531650543213, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.17335091531276703, "step": 7704 }, { "epoch": 0.3502727272727273, "grad_norm": 5.125, "grad_norm_var": 2.495833333333333, "learning_rate": 0.0001, "loss": 5.7673, "loss/crossentropy": 2.5052571296691895, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1738591007888317, "step": 7706 }, { "epoch": 0.3503636363636364, "grad_norm": 5.90625, "grad_norm_var": 2.630171712239583, "learning_rate": 0.0001, "loss": 6.0411, "loss/crossentropy": 2.6756651401519775, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.18009864166378975, "step": 7708 }, { "epoch": 0.35045454545454546, "grad_norm": 4.9375, "grad_norm_var": 2.6080037434895833, "learning_rate": 0.0001, "loss": 5.5298, "loss/crossentropy": 2.3278309106826782, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1672675423324108, "step": 7710 }, { "epoch": 0.35054545454545455, "grad_norm": 5.5, "grad_norm_var": 0.44192708333333336, "learning_rate": 0.0001, "loss": 5.9402, "loss/crossentropy": 2.596254885196686, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1820557340979576, "step": 7712 }, { "epoch": 0.35063636363636363, "grad_norm": 5.15625, "grad_norm_var": 0.4122233072916667, "learning_rate": 0.0001, "loss": 5.8187, "loss/crossentropy": 2.5493358373641968, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17537562176585197, "step": 7714 }, { "epoch": 0.3507272727272727, "grad_norm": 5.09375, "grad_norm_var": 0.4149373372395833, "learning_rate": 0.0001, "loss": 5.571, "loss/crossentropy": 2.3604654669761658, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16890520974993706, "step": 7716 }, { "epoch": 0.3508181818181818, "grad_norm": 5.375, "grad_norm_var": 0.42864583333333334, "learning_rate": 0.0001, "loss": 5.4947, "loss/crossentropy": 2.302914798259735, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.16585992276668549, "step": 7718 }, { "epoch": 0.3509090909090909, "grad_norm": 5.0, "grad_norm_var": 0.43079020182291666, "learning_rate": 0.0001, "loss": 5.927, "loss/crossentropy": 2.6124993562698364, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.17852437123656273, "step": 7720 }, { "epoch": 0.351, "grad_norm": 4.96875, "grad_norm_var": 0.4313151041666667, "learning_rate": 0.0001, "loss": 5.577, "loss/crossentropy": 2.410922944545746, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16289322450757027, "step": 7722 }, { "epoch": 0.35109090909090906, "grad_norm": 5.125, "grad_norm_var": 0.04169514973958333, "learning_rate": 0.0001, "loss": 5.6823, "loss/crossentropy": 2.4720667004585266, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1680888496339321, "step": 7724 }, { "epoch": 0.3511818181818182, "grad_norm": 5.6875, "grad_norm_var": 0.04485677083333333, "learning_rate": 0.0001, "loss": 5.5932, "loss/crossentropy": 2.3905377984046936, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16811295598745346, "step": 7726 }, { "epoch": 0.3512727272727273, "grad_norm": 5.3125, "grad_norm_var": 0.03915608723958333, "learning_rate": 0.0001, "loss": 5.9546, "loss/crossentropy": 2.64103764295578, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.17725537717342377, "step": 7728 }, { "epoch": 0.3513636363636364, "grad_norm": 4.96875, "grad_norm_var": 0.046187337239583334, "learning_rate": 0.0001, "loss": 5.7549, "loss/crossentropy": 2.4947838187217712, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17151512205600739, "step": 7730 }, { "epoch": 0.35145454545454546, "grad_norm": 5.9375, "grad_norm_var": 0.38039957682291664, "learning_rate": 0.0001, "loss": 6.1336, "loss/crossentropy": 2.6973701119422913, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.188937209546566, "step": 7732 }, { "epoch": 0.35154545454545455, "grad_norm": 4.90625, "grad_norm_var": 0.3734659830729167, "learning_rate": 0.0001, "loss": 5.8759, "loss/crossentropy": 2.6108214259147644, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17338700965046883, "step": 7734 }, { "epoch": 0.35163636363636364, "grad_norm": 6.0, "grad_norm_var": 0.38162434895833336, "learning_rate": 0.0001, "loss": 5.9687, "loss/crossentropy": 2.6048476099967957, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18170206621289253, "step": 7736 }, { "epoch": 0.3517272727272727, "grad_norm": 5.1875, "grad_norm_var": 0.3896484375, "learning_rate": 0.0001, "loss": 5.5018, "loss/crossentropy": 2.343757927417755, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16365829855203629, "step": 7738 }, { "epoch": 0.3518181818181818, "grad_norm": 5.8125, "grad_norm_var": 0.38600260416666665, "learning_rate": 0.0001, "loss": 5.901, "loss/crossentropy": 2.644598603248596, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17290647700428963, "step": 7740 }, { "epoch": 0.3519090909090909, "grad_norm": 5.15625, "grad_norm_var": 0.3938802083333333, "learning_rate": 0.0001, "loss": 5.6768, "loss/crossentropy": 2.4566701650619507, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.17025909945368767, "step": 7742 }, { "epoch": 0.352, "grad_norm": 5.21875, "grad_norm_var": 0.3934733072916667, "learning_rate": 0.0001, "loss": 6.0332, "loss/crossentropy": 2.6583099961280823, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18163412809371948, "step": 7744 }, { "epoch": 0.35209090909090907, "grad_norm": 4.84375, "grad_norm_var": 0.40380452473958334, "learning_rate": 0.0001, "loss": 5.6732, "loss/crossentropy": 2.4059143662452698, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17243172973394394, "step": 7746 }, { "epoch": 0.3521818181818182, "grad_norm": 5.34375, "grad_norm_var": 0.5507120768229167, "learning_rate": 0.0001, "loss": 5.8261, "loss/crossentropy": 2.4886814057826996, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18100247904658318, "step": 7748 }, { "epoch": 0.3522727272727273, "grad_norm": 5.0, "grad_norm_var": 0.5465128580729167, "learning_rate": 0.0001, "loss": 5.5574, "loss/crossentropy": 2.3584718704223633, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16617706790566444, "step": 7750 }, { "epoch": 0.3523636363636364, "grad_norm": 5.125, "grad_norm_var": 0.5434855143229167, "learning_rate": 0.0001, "loss": 5.8297, "loss/crossentropy": 2.5329294204711914, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17772850394248962, "step": 7752 }, { "epoch": 0.35245454545454546, "grad_norm": 5.1875, "grad_norm_var": 0.5313802083333333, "learning_rate": 0.0001, "loss": 5.795, "loss/crossentropy": 2.50874263048172, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.17334900796413422, "step": 7754 }, { "epoch": 0.35254545454545455, "grad_norm": 5.96875, "grad_norm_var": 0.5463541666666667, "learning_rate": 0.0001, "loss": 5.7379, "loss/crossentropy": 2.4488591849803925, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.1724574938416481, "step": 7756 }, { "epoch": 0.35263636363636364, "grad_norm": 5.65625, "grad_norm_var": 0.5379557291666667, "learning_rate": 0.0001, "loss": 5.8338, "loss/crossentropy": 2.508488416671753, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17823480442166328, "step": 7758 }, { "epoch": 0.3527272727272727, "grad_norm": 5.8125, "grad_norm_var": 0.547900390625, "learning_rate": 0.0001, "loss": 5.7651, "loss/crossentropy": 2.4911176562309265, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1715414598584175, "step": 7760 }, { "epoch": 0.3528181818181818, "grad_norm": 4.96875, "grad_norm_var": 0.5484212239583334, "learning_rate": 0.0001, "loss": 5.7531, "loss/crossentropy": 2.5231072902679443, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1712436079978943, "step": 7762 }, { "epoch": 0.3529090909090909, "grad_norm": 5.125, "grad_norm_var": 0.12144775390625, "learning_rate": 0.0001, "loss": 5.3067, "loss/crossentropy": 2.2210998833179474, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1550440639257431, "step": 7764 }, { "epoch": 0.353, "grad_norm": 4.75, "grad_norm_var": 0.12437744140625, "learning_rate": 0.0001, "loss": 5.4431, "loss/crossentropy": 2.336355060338974, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.1608695462346077, "step": 7766 }, { "epoch": 0.35309090909090907, "grad_norm": 5.25, "grad_norm_var": 0.122119140625, "learning_rate": 0.0001, "loss": 5.4548, "loss/crossentropy": 2.276310682296753, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16648321598768234, "step": 7768 }, { "epoch": 0.3531818181818182, "grad_norm": 4.78125, "grad_norm_var": 0.13448893229166667, "learning_rate": 0.0001, "loss": 5.668, "loss/crossentropy": 2.471729815006256, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1633724384009838, "step": 7770 }, { "epoch": 0.3532727272727273, "grad_norm": 4.71875, "grad_norm_var": 0.11222330729166667, "learning_rate": 0.0001, "loss": 5.5674, "loss/crossentropy": 2.3917447328567505, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.1626783236861229, "step": 7772 }, { "epoch": 0.3533636363636364, "grad_norm": 5.53125, "grad_norm_var": 0.10660400390625, "learning_rate": 0.0001, "loss": 6.095, "loss/crossentropy": 2.6389830112457275, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.18993613868951797, "step": 7774 }, { "epoch": 0.35345454545454547, "grad_norm": 4.8125, "grad_norm_var": 0.08033854166666667, "learning_rate": 0.0001, "loss": 5.6096, "loss/crossentropy": 2.481003701686859, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16266915947198868, "step": 7776 }, { "epoch": 0.35354545454545455, "grad_norm": 5.34375, "grad_norm_var": 0.09876302083333334, "learning_rate": 0.0001, "loss": 6.0743, "loss/crossentropy": 2.6710383892059326, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.18427389860153198, "step": 7778 }, { "epoch": 0.35363636363636364, "grad_norm": 5.375, "grad_norm_var": 0.09289957682291666, "learning_rate": 0.0001, "loss": 6.2356, "loss/crossentropy": 2.800097703933716, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18730300664901733, "step": 7780 }, { "epoch": 0.3537272727272727, "grad_norm": 5.28125, "grad_norm_var": 0.07815348307291667, "learning_rate": 0.0001, "loss": 5.7804, "loss/crossentropy": 2.5447680950164795, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1708291508257389, "step": 7782 }, { "epoch": 0.3538181818181818, "grad_norm": 5.21875, "grad_norm_var": 0.07980143229166667, "learning_rate": 0.0001, "loss": 6.1148, "loss/crossentropy": 2.778086543083191, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1801510788500309, "step": 7784 }, { "epoch": 0.3539090909090909, "grad_norm": 5.15625, "grad_norm_var": 0.07789306640625, "learning_rate": 0.0001, "loss": 5.7247, "loss/crossentropy": 2.5080051720142365, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16893015056848526, "step": 7786 }, { "epoch": 0.354, "grad_norm": 5.25, "grad_norm_var": 0.05663655598958333, "learning_rate": 0.0001, "loss": 5.7322, "loss/crossentropy": 2.466847777366638, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1745796725153923, "step": 7788 }, { "epoch": 0.35409090909090907, "grad_norm": 5.28125, "grad_norm_var": 0.050390625, "learning_rate": 0.0001, "loss": 6.0555, "loss/crossentropy": 2.6928465962409973, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18040503934025764, "step": 7790 }, { "epoch": 0.3541818181818182, "grad_norm": 5.21875, "grad_norm_var": 0.045308430989583336, "learning_rate": 0.0001, "loss": 5.4953, "loss/crossentropy": 2.361535906791687, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1610308587551117, "step": 7792 }, { "epoch": 0.3542727272727273, "grad_norm": 5.125, "grad_norm_var": 0.049072265625, "learning_rate": 0.0001, "loss": 5.8585, "loss/crossentropy": 2.533300518989563, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17919712141156197, "step": 7794 }, { "epoch": 0.3543636363636364, "grad_norm": 7.09375, "grad_norm_var": 0.2740885416666667, "learning_rate": 0.0001, "loss": 5.6089, "loss/crossentropy": 2.3496662974357605, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.16987039148807526, "step": 7796 }, { "epoch": 0.35445454545454547, "grad_norm": 5.34375, "grad_norm_var": 0.29638264973958334, "learning_rate": 0.0001, "loss": 6.3208, "loss/crossentropy": 2.83771675825119, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.19186154380440712, "step": 7798 }, { "epoch": 0.35454545454545455, "grad_norm": 5.5625, "grad_norm_var": 0.29455973307291666, "learning_rate": 0.0001, "loss": 5.9546, "loss/crossentropy": 2.614160269498825, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.18033010885119438, "step": 7800 }, { "epoch": 0.35463636363636364, "grad_norm": 5.25, "grad_norm_var": 0.27034098307291665, "learning_rate": 0.0001, "loss": 5.9335, "loss/crossentropy": 2.5784170031547546, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.17828616127371788, "step": 7802 }, { "epoch": 0.3547272727272727, "grad_norm": 4.875, "grad_norm_var": 0.2891764322916667, "learning_rate": 0.0001, "loss": 5.4687, "loss/crossentropy": 2.363520860671997, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.158371914178133, "step": 7804 }, { "epoch": 0.3548181818181818, "grad_norm": 5.125, "grad_norm_var": 0.33736572265625, "learning_rate": 0.0001, "loss": 5.506, "loss/crossentropy": 2.3522729575634003, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1618546098470688, "step": 7806 }, { "epoch": 0.3549090909090909, "grad_norm": 5.9375, "grad_norm_var": 0.3167317708333333, "learning_rate": 0.0001, "loss": 5.884, "loss/crossentropy": 2.4916412830352783, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.1843547560274601, "step": 7808 }, { "epoch": 0.355, "grad_norm": 5.3125, "grad_norm_var": 0.3649373372395833, "learning_rate": 0.0001, "loss": 5.6313, "loss/crossentropy": 2.333017647266388, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1720196157693863, "step": 7810 }, { "epoch": 0.35509090909090907, "grad_norm": 5.4375, "grad_norm_var": 0.20362955729166668, "learning_rate": 0.0001, "loss": 5.8191, "loss/crossentropy": 2.5467440485954285, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.1731303073465824, "step": 7812 }, { "epoch": 0.35518181818181815, "grad_norm": 5.28125, "grad_norm_var": 0.233447265625, "learning_rate": 0.0001, "loss": 5.6999, "loss/crossentropy": 2.3979623913764954, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.1764838583767414, "step": 7814 }, { "epoch": 0.3552727272727273, "grad_norm": 5.28125, "grad_norm_var": 0.23912353515625, "learning_rate": 0.0001, "loss": 5.3502, "loss/crossentropy": 2.2313580214977264, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1587543785572052, "step": 7816 }, { "epoch": 0.3553636363636364, "grad_norm": 5.1875, "grad_norm_var": 0.22519124348958333, "learning_rate": 0.0001, "loss": 5.7926, "loss/crossentropy": 2.5216299891471863, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.1733892783522606, "step": 7818 }, { "epoch": 0.35545454545454547, "grad_norm": 5.84375, "grad_norm_var": 0.21451416015625, "learning_rate": 0.0001, "loss": 5.6813, "loss/crossentropy": 2.4221996665000916, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.1718045249581337, "step": 7820 }, { "epoch": 0.35554545454545455, "grad_norm": 6.875, "grad_norm_var": 0.27003580729166665, "learning_rate": 0.0001, "loss": 5.6746, "loss/crossentropy": 2.4448662400245667, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.16769737377762794, "step": 7822 }, { "epoch": 0.35563636363636364, "grad_norm": 5.21875, "grad_norm_var": 0.2771484375, "learning_rate": 0.0001, "loss": 5.7899, "loss/crossentropy": 2.4796582460403442, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1732156127691269, "step": 7824 }, { "epoch": 0.3557272727272727, "grad_norm": 5.6875, "grad_norm_var": 0.22350260416666667, "learning_rate": 0.0001, "loss": 6.2283, "loss/crossentropy": 2.7858164310455322, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.18976017832756042, "step": 7826 }, { "epoch": 0.3558181818181818, "grad_norm": 5.8125, "grad_norm_var": 0.23131510416666667, "learning_rate": 0.0001, "loss": 6.1217, "loss/crossentropy": 2.7215453386306763, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.1843544952571392, "step": 7828 }, { "epoch": 0.3559090909090909, "grad_norm": 5.09375, "grad_norm_var": 0.2123046875, "learning_rate": 0.0001, "loss": 5.553, "loss/crossentropy": 2.40500146150589, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1630457416176796, "step": 7830 }, { "epoch": 0.356, "grad_norm": 5.4375, "grad_norm_var": 0.20032145182291666, "learning_rate": 0.0001, "loss": 5.5086, "loss/crossentropy": 2.2957004606723785, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.16563056409358978, "step": 7832 }, { "epoch": 0.35609090909090907, "grad_norm": 5.65625, "grad_norm_var": 0.23053385416666666, "learning_rate": 0.0001, "loss": 5.8985, "loss/crossentropy": 2.5949078798294067, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.1735246740281582, "step": 7834 }, { "epoch": 0.35618181818181816, "grad_norm": 5.3125, "grad_norm_var": 0.22823893229166667, "learning_rate": 0.0001, "loss": 5.9981, "loss/crossentropy": 2.6608660221099854, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18137970194220543, "step": 7836 }, { "epoch": 0.3562727272727273, "grad_norm": 5.28125, "grad_norm_var": 0.12063802083333333, "learning_rate": 0.0001, "loss": 5.7759, "loss/crossentropy": 2.4544079303741455, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.17609456926584244, "step": 7838 }, { "epoch": 0.3563636363636364, "grad_norm": 4.71875, "grad_norm_var": 0.14807535807291666, "learning_rate": 0.0001, "loss": 5.6028, "loss/crossentropy": 2.364047408103943, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17114441469311714, "step": 7840 }, { "epoch": 0.35645454545454547, "grad_norm": 5.125, "grad_norm_var": 0.11526285807291667, "learning_rate": 0.0001, "loss": 5.8958, "loss/crossentropy": 2.5797959566116333, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1773015409708023, "step": 7842 }, { "epoch": 0.35654545454545455, "grad_norm": 5.0, "grad_norm_var": 0.09529622395833333, "learning_rate": 0.0001, "loss": 5.6535, "loss/crossentropy": 2.465357780456543, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16627483069896698, "step": 7844 }, { "epoch": 0.35663636363636364, "grad_norm": 5.5625, "grad_norm_var": 0.10677083333333333, "learning_rate": 0.0001, "loss": 5.537, "loss/crossentropy": 2.3186313211917877, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16812725365161896, "step": 7846 }, { "epoch": 0.3567272727272727, "grad_norm": 6.0, "grad_norm_var": 0.14218343098958333, "learning_rate": 0.0001, "loss": 5.5886, "loss/crossentropy": 2.3248310685157776, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.16934509202837944, "step": 7848 }, { "epoch": 0.3568181818181818, "grad_norm": 5.59375, "grad_norm_var": 0.1283203125, "learning_rate": 0.0001, "loss": 6.0478, "loss/crossentropy": 2.686325192451477, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.18048494681715965, "step": 7850 }, { "epoch": 0.3569090909090909, "grad_norm": 5.34375, "grad_norm_var": 0.10558268229166666, "learning_rate": 0.0001, "loss": 6.0016, "loss/crossentropy": 2.685814619064331, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17962440103292465, "step": 7852 }, { "epoch": 0.357, "grad_norm": 4.90625, "grad_norm_var": 0.10689697265625, "learning_rate": 0.0001, "loss": 5.554, "loss/crossentropy": 2.4177092909812927, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16382554173469543, "step": 7854 }, { "epoch": 0.35709090909090907, "grad_norm": 4.875, "grad_norm_var": 0.09078369140625, "learning_rate": 0.0001, "loss": 5.5504, "loss/crossentropy": 2.419389843940735, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16134413704276085, "step": 7856 }, { "epoch": 0.35718181818181816, "grad_norm": 4.5, "grad_norm_var": 0.12890218098958334, "learning_rate": 0.0001, "loss": 4.9554, "loss/crossentropy": 1.9339612424373627, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.14980263262987137, "step": 7858 }, { "epoch": 0.3572727272727273, "grad_norm": 5.25, "grad_norm_var": 0.12590738932291667, "learning_rate": 0.0001, "loss": 5.7757, "loss/crossentropy": 2.504632830619812, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17125209048390388, "step": 7860 }, { "epoch": 0.3573636363636364, "grad_norm": 5.28125, "grad_norm_var": 0.11847330729166666, "learning_rate": 0.0001, "loss": 5.9261, "loss/crossentropy": 2.6501417756080627, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17427204176783562, "step": 7862 }, { "epoch": 0.35745454545454547, "grad_norm": 4.875, "grad_norm_var": 0.08108317057291667, "learning_rate": 0.0001, "loss": 5.4634, "loss/crossentropy": 2.3523204922676086, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.15876540541648865, "step": 7864 }, { "epoch": 0.35754545454545456, "grad_norm": 5.71875, "grad_norm_var": 0.08052978515625, "learning_rate": 0.0001, "loss": 5.9525, "loss/crossentropy": 2.6223827600479126, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.17890818789601326, "step": 7866 }, { "epoch": 0.35763636363636364, "grad_norm": 5.65625, "grad_norm_var": 0.09579671223958333, "learning_rate": 0.0001, "loss": 6.1545, "loss/crossentropy": 2.6835198402404785, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.18948491662740707, "step": 7868 }, { "epoch": 0.3577272727272727, "grad_norm": 7.84375, "grad_norm_var": 0.5654947916666667, "learning_rate": 0.0001, "loss": 6.1549, "loss/crossentropy": 2.7074958086013794, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.19025316089391708, "step": 7870 }, { "epoch": 0.3578181818181818, "grad_norm": 5.0625, "grad_norm_var": 0.552587890625, "learning_rate": 0.0001, "loss": 5.9002, "loss/crossentropy": 2.6103041768074036, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.1768382340669632, "step": 7872 }, { "epoch": 0.3579090909090909, "grad_norm": 5.15625, "grad_norm_var": 0.50758056640625, "learning_rate": 0.0001, "loss": 5.5807, "loss/crossentropy": 2.3186309933662415, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.172883041203022, "step": 7874 }, { "epoch": 0.358, "grad_norm": 5.1875, "grad_norm_var": 0.5112630208333333, "learning_rate": 0.0001, "loss": 5.7817, "loss/crossentropy": 2.4782856702804565, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17721965163946152, "step": 7876 }, { "epoch": 0.35809090909090907, "grad_norm": 5.3125, "grad_norm_var": 0.5907185872395834, "learning_rate": 0.0001, "loss": 5.2793, "loss/crossentropy": 2.170366257429123, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.15952465310692787, "step": 7878 }, { "epoch": 0.35818181818181816, "grad_norm": 5.59375, "grad_norm_var": 0.5773396809895833, "learning_rate": 0.0001, "loss": 5.85, "loss/crossentropy": 2.5645402669906616, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17464350908994675, "step": 7880 }, { "epoch": 0.3582727272727273, "grad_norm": 5.25, "grad_norm_var": 0.564306640625, "learning_rate": 0.0001, "loss": 5.6876, "loss/crossentropy": 2.4109575152397156, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17180068418383598, "step": 7882 }, { "epoch": 0.3583636363636364, "grad_norm": 5.28125, "grad_norm_var": 0.553759765625, "learning_rate": 0.0001, "loss": 5.6928, "loss/crossentropy": 2.4577219486236572, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.17096976935863495, "step": 7884 }, { "epoch": 0.35845454545454547, "grad_norm": 5.15625, "grad_norm_var": 0.09114176432291667, "learning_rate": 0.0001, "loss": 5.4812, "loss/crossentropy": 2.3788026571273804, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16199884563684464, "step": 7886 }, { "epoch": 0.35854545454545456, "grad_norm": 5.375, "grad_norm_var": 0.09427083333333333, "learning_rate": 0.0001, "loss": 6.0057, "loss/crossentropy": 2.6464380621910095, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1808464601635933, "step": 7888 }, { "epoch": 0.35863636363636364, "grad_norm": 4.90625, "grad_norm_var": 0.09709879557291666, "learning_rate": 0.0001, "loss": 5.4214, "loss/crossentropy": 2.3203569650650024, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.1587420552968979, "step": 7890 }, { "epoch": 0.35872727272727273, "grad_norm": 5.375, "grad_norm_var": 0.102587890625, "learning_rate": 0.0001, "loss": 5.5574, "loss/crossentropy": 2.3470487892627716, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1663484163582325, "step": 7892 }, { "epoch": 0.3588181818181818, "grad_norm": 6.125, "grad_norm_var": 0.094921875, "learning_rate": 0.0001, "loss": 5.7822, "loss/crossentropy": 2.4780914783477783, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17591619491577148, "step": 7894 }, { "epoch": 0.3589090909090909, "grad_norm": 5.1875, "grad_norm_var": 0.09221598307291666, "learning_rate": 0.0001, "loss": 5.9233, "loss/crossentropy": 2.623812675476074, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17565586045384407, "step": 7896 }, { "epoch": 0.359, "grad_norm": 4.8125, "grad_norm_var": 0.1021484375, "learning_rate": 0.0001, "loss": 5.5038, "loss/crossentropy": 2.409674733877182, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.15804775059223175, "step": 7898 }, { "epoch": 0.35909090909090907, "grad_norm": 5.375, "grad_norm_var": 0.106884765625, "learning_rate": 0.0001, "loss": 6.0441, "loss/crossentropy": 2.6736207604408264, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18236283212900162, "step": 7900 }, { "epoch": 0.35918181818181816, "grad_norm": 5.09375, "grad_norm_var": 0.10787760416666667, "learning_rate": 0.0001, "loss": 5.6127, "loss/crossentropy": 2.4520338773727417, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16489767283201218, "step": 7902 }, { "epoch": 0.3592727272727273, "grad_norm": 5.15625, "grad_norm_var": 0.11106770833333333, "learning_rate": 0.0001, "loss": 5.7819, "loss/crossentropy": 2.5044506192207336, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17305255681276321, "step": 7904 }, { "epoch": 0.3593636363636364, "grad_norm": 5.125, "grad_norm_var": 0.11151936848958334, "learning_rate": 0.0001, "loss": 5.7443, "loss/crossentropy": 2.5406017303466797, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16665644943714142, "step": 7906 }, { "epoch": 0.35945454545454547, "grad_norm": 4.84375, "grad_norm_var": 0.11995035807291667, "learning_rate": 0.0001, "loss": 5.4778, "loss/crossentropy": 2.317347973585129, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1633138656616211, "step": 7908 }, { "epoch": 0.35954545454545456, "grad_norm": 5.125, "grad_norm_var": 0.05774332682291667, "learning_rate": 0.0001, "loss": 5.6903, "loss/crossentropy": 2.4585703015327454, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17356118187308311, "step": 7910 }, { "epoch": 0.35963636363636364, "grad_norm": 4.8125, "grad_norm_var": 0.05025634765625, "learning_rate": 0.0001, "loss": 5.894, "loss/crossentropy": 2.5899163484573364, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1774809993803501, "step": 7912 }, { "epoch": 0.35972727272727273, "grad_norm": 6.3125, "grad_norm_var": 0.192822265625, "learning_rate": 0.0001, "loss": 6.0417, "loss/crossentropy": 2.6521077156066895, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.17880279198288918, "step": 7914 }, { "epoch": 0.3598181818181818, "grad_norm": 5.25, "grad_norm_var": 0.18879801432291668, "learning_rate": 0.0001, "loss": 5.6121, "loss/crossentropy": 2.430845320224762, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.16343820467591286, "step": 7916 }, { "epoch": 0.3599090909090909, "grad_norm": 5.34375, "grad_norm_var": 0.18479410807291666, "learning_rate": 0.0001, "loss": 5.5651, "loss/crossentropy": 2.39590722322464, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16652848571538925, "step": 7918 }, { "epoch": 0.36, "grad_norm": 5.28125, "grad_norm_var": 0.19034830729166666, "learning_rate": 0.0001, "loss": 5.9372, "loss/crossentropy": 2.613874912261963, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17901606857776642, "step": 7920 }, { "epoch": 0.3600909090909091, "grad_norm": 4.96875, "grad_norm_var": 0.18759358723958333, "learning_rate": 0.0001, "loss": 5.2111, "loss/crossentropy": 2.162766218185425, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.15268812328577042, "step": 7922 }, { "epoch": 0.36018181818181816, "grad_norm": 5.03125, "grad_norm_var": 0.18131103515625, "learning_rate": 0.0001, "loss": 5.4613, "loss/crossentropy": 2.3620288372039795, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1591472327709198, "step": 7924 }, { "epoch": 0.3602727272727273, "grad_norm": 4.75, "grad_norm_var": 0.19488525390625, "learning_rate": 0.0001, "loss": 5.4307, "loss/crossentropy": 2.3350034952163696, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1593748889863491, "step": 7926 }, { "epoch": 0.3603636363636364, "grad_norm": 5.03125, "grad_norm_var": 0.22626546223958333, "learning_rate": 0.0001, "loss": 5.0917, "loss/crossentropy": 2.1979233622550964, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1387905515730381, "step": 7928 }, { "epoch": 0.36045454545454547, "grad_norm": 6.53125, "grad_norm_var": 0.19724934895833332, "learning_rate": 0.0001, "loss": 5.8802, "loss/crossentropy": 2.554271936416626, "loss/hidden": 1.583984375, "loss/jsd": 0.0, "loss/logits": 0.1741935834288597, "step": 7930 }, { "epoch": 0.36054545454545456, "grad_norm": 4.8125, "grad_norm_var": 0.25115559895833334, "learning_rate": 0.0001, "loss": 5.6855, "loss/crossentropy": 2.4528009593486786, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17210300266742706, "step": 7932 }, { "epoch": 0.36063636363636364, "grad_norm": 5.6875, "grad_norm_var": 0.26015625, "learning_rate": 0.0001, "loss": 6.0018, "loss/crossentropy": 2.6721755266189575, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.17886392027139664, "step": 7934 }, { "epoch": 0.36072727272727273, "grad_norm": 5.875, "grad_norm_var": 0.28690999348958335, "learning_rate": 0.0001, "loss": 5.7632, "loss/crossentropy": 2.495077133178711, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1738862469792366, "step": 7936 }, { "epoch": 0.3608181818181818, "grad_norm": 5.375, "grad_norm_var": 0.28398030598958335, "learning_rate": 0.0001, "loss": 5.6878, "loss/crossentropy": 2.500083088874817, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16915837302803993, "step": 7938 }, { "epoch": 0.3609090909090909, "grad_norm": 6.09375, "grad_norm_var": 0.5208984375, "learning_rate": 0.0001, "loss": 5.9666, "loss/crossentropy": 2.5162710547447205, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18722181022167206, "step": 7940 }, { "epoch": 0.361, "grad_norm": 4.78125, "grad_norm_var": 0.5092081705729167, "learning_rate": 0.0001, "loss": 5.5846, "loss/crossentropy": 2.345380336046219, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.16942943260073662, "step": 7942 }, { "epoch": 0.3610909090909091, "grad_norm": 5.4375, "grad_norm_var": 0.39898681640625, "learning_rate": 0.0001, "loss": 5.9176, "loss/crossentropy": 2.5336471796035767, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18214120343327522, "step": 7944 }, { "epoch": 0.36118181818181816, "grad_norm": 8.6875, "grad_norm_var": 0.972119140625, "learning_rate": 0.0001, "loss": 5.7888, "loss/crossentropy": 2.5298617482185364, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17315791547298431, "step": 7946 }, { "epoch": 0.36127272727272725, "grad_norm": 5.5625, "grad_norm_var": 0.909765625, "learning_rate": 0.0001, "loss": 5.9469, "loss/crossentropy": 2.5914506316184998, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17890141531825066, "step": 7948 }, { "epoch": 0.3613636363636364, "grad_norm": 5.5625, "grad_norm_var": 0.907275390625, "learning_rate": 0.0001, "loss": 5.9487, "loss/crossentropy": 2.6261304020881653, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17952725291252136, "step": 7950 }, { "epoch": 0.3614545454545455, "grad_norm": 5.1875, "grad_norm_var": 0.9645792643229166, "learning_rate": 0.0001, "loss": 5.767, "loss/crossentropy": 2.567139148712158, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.17018048465251923, "step": 7952 }, { "epoch": 0.36154545454545456, "grad_norm": 5.25, "grad_norm_var": 0.97711181640625, "learning_rate": 0.0001, "loss": 5.5358, "loss/crossentropy": 2.432581126689911, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.15934742987155914, "step": 7954 }, { "epoch": 0.36163636363636364, "grad_norm": 5.46875, "grad_norm_var": 0.8175618489583333, "learning_rate": 0.0001, "loss": 5.7923, "loss/crossentropy": 2.4779586791992188, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17557238787412643, "step": 7956 }, { "epoch": 0.36172727272727273, "grad_norm": 6.03125, "grad_norm_var": 0.8244791666666667, "learning_rate": 0.0001, "loss": 5.644, "loss/crossentropy": 2.3800307512283325, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17131876572966576, "step": 7958 }, { "epoch": 0.3618181818181818, "grad_norm": 5.1875, "grad_norm_var": 0.8312459309895833, "learning_rate": 0.0001, "loss": 5.5297, "loss/crossentropy": 2.3600886464118958, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.16227812692523003, "step": 7960 }, { "epoch": 0.3619090909090909, "grad_norm": 6.90625, "grad_norm_var": 0.2674763997395833, "learning_rate": 0.0001, "loss": 6.0568, "loss/crossentropy": 2.6243278980255127, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18621912971138954, "step": 7962 }, { "epoch": 0.362, "grad_norm": 5.625, "grad_norm_var": 0.25924072265625, "learning_rate": 0.0001, "loss": 5.8013, "loss/crossentropy": 2.503600388765335, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17664625123143196, "step": 7964 }, { "epoch": 0.3620909090909091, "grad_norm": 5.34375, "grad_norm_var": 0.25467122395833336, "learning_rate": 0.0001, "loss": 5.7736, "loss/crossentropy": 2.50965815782547, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17190155014395714, "step": 7966 }, { "epoch": 0.36218181818181816, "grad_norm": 5.625, "grad_norm_var": 0.302734375, "learning_rate": 0.0001, "loss": 6.1262, "loss/crossentropy": 2.6712759733200073, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.18982425704598427, "step": 7968 }, { "epoch": 0.36227272727272725, "grad_norm": 6.59375, "grad_norm_var": 0.32864583333333336, "learning_rate": 0.0001, "loss": 5.7035, "loss/crossentropy": 2.3640856742858887, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17886130511760712, "step": 7970 }, { "epoch": 0.3623636363636364, "grad_norm": 5.1875, "grad_norm_var": 0.3050618489583333, "learning_rate": 0.0001, "loss": 5.884, "loss/crossentropy": 2.5676207542419434, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17578187957406044, "step": 7972 }, { "epoch": 0.3624545454545455, "grad_norm": 4.96875, "grad_norm_var": 0.28560791015625, "learning_rate": 0.0001, "loss": 5.4727, "loss/crossentropy": 2.3202966451644897, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16387535631656647, "step": 7974 }, { "epoch": 0.36254545454545456, "grad_norm": 5.375, "grad_norm_var": 0.3140625, "learning_rate": 0.0001, "loss": 5.9555, "loss/crossentropy": 2.5968295335769653, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1796206347644329, "step": 7976 }, { "epoch": 0.36263636363636365, "grad_norm": 5.25, "grad_norm_var": 0.22148030598958332, "learning_rate": 0.0001, "loss": 5.6281, "loss/crossentropy": 2.465562582015991, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16625773534178734, "step": 7978 }, { "epoch": 0.36272727272727273, "grad_norm": 5.125, "grad_norm_var": 0.23671875, "learning_rate": 0.0001, "loss": 5.5393, "loss/crossentropy": 2.3973783254623413, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1626308150589466, "step": 7980 }, { "epoch": 0.3628181818181818, "grad_norm": 4.78125, "grad_norm_var": 0.2696451822916667, "learning_rate": 0.0001, "loss": 5.2691, "loss/crossentropy": 2.181470423936844, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.15642398037016392, "step": 7982 }, { "epoch": 0.3629090909090909, "grad_norm": 4.90625, "grad_norm_var": 0.19855143229166666, "learning_rate": 0.0001, "loss": 5.6735, "loss/crossentropy": 2.5026443004608154, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16435014829039574, "step": 7984 }, { "epoch": 0.363, "grad_norm": 5.1875, "grad_norm_var": 0.09654947916666666, "learning_rate": 0.0001, "loss": 5.3903, "loss/crossentropy": 2.304395377635956, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15780559554696083, "step": 7986 }, { "epoch": 0.3630909090909091, "grad_norm": 6.625, "grad_norm_var": 0.21652018229166667, "learning_rate": 0.0001, "loss": 5.7796, "loss/crossentropy": 2.499683201313019, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.17311019077897072, "step": 7988 }, { "epoch": 0.36318181818181816, "grad_norm": 5.21875, "grad_norm_var": 0.20732014973958332, "learning_rate": 0.0001, "loss": 5.8217, "loss/crossentropy": 2.5114020109176636, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17653273418545723, "step": 7990 }, { "epoch": 0.36327272727272725, "grad_norm": 5.40625, "grad_norm_var": 0.21731770833333333, "learning_rate": 0.0001, "loss": 5.6463, "loss/crossentropy": 2.4859555065631866, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16583651304244995, "step": 7992 }, { "epoch": 0.3633636363636364, "grad_norm": 5.4375, "grad_norm_var": 0.22576497395833334, "learning_rate": 0.0001, "loss": 5.8514, "loss/crossentropy": 2.597142994403839, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.17367186769843102, "step": 7994 }, { "epoch": 0.3634545454545455, "grad_norm": 5.125, "grad_norm_var": 0.22576497395833334, "learning_rate": 0.0001, "loss": 5.507, "loss/crossentropy": 2.4147456288337708, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1584419161081314, "step": 7996 }, { "epoch": 0.36354545454545456, "grad_norm": 5.0625, "grad_norm_var": 0.223828125, "learning_rate": 0.0001, "loss": 5.7365, "loss/crossentropy": 2.5141067504882812, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1675497591495514, "step": 7998 }, { "epoch": 0.36363636363636365, "grad_norm": 4.84375, "grad_norm_var": 0.22421468098958333, "learning_rate": 0.0001, "loss": 5.6441, "loss/crossentropy": 2.452590048313141, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16681186854839325, "step": 8000 }, { "epoch": 0.36372727272727273, "grad_norm": 8.0625, "grad_norm_var": 0.6735514322916667, "learning_rate": 0.0001, "loss": 5.7361, "loss/crossentropy": 2.5165863633155823, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1690262034535408, "step": 8002 }, { "epoch": 0.3638181818181818, "grad_norm": 5.40625, "grad_norm_var": 0.5727213541666667, "learning_rate": 0.0001, "loss": 5.7533, "loss/crossentropy": 2.523823916912079, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1729508973658085, "step": 8004 }, { "epoch": 0.3639090909090909, "grad_norm": 5.96875, "grad_norm_var": 0.5972005208333333, "learning_rate": 0.0001, "loss": 6.0157, "loss/crossentropy": 2.6408029198646545, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18162554875016212, "step": 8006 }, { "epoch": 0.364, "grad_norm": 5.71875, "grad_norm_var": 0.5709635416666666, "learning_rate": 0.0001, "loss": 6.1008, "loss/crossentropy": 2.7455297708511353, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1855306625366211, "step": 8008 }, { "epoch": 0.3640909090909091, "grad_norm": 4.96875, "grad_norm_var": 0.5873982747395833, "learning_rate": 0.0001, "loss": 5.7099, "loss/crossentropy": 2.4510951042175293, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.17294611781835556, "step": 8010 }, { "epoch": 0.36418181818181816, "grad_norm": 6.03125, "grad_norm_var": 0.6063761393229167, "learning_rate": 0.0001, "loss": 5.657, "loss/crossentropy": 2.4094552993774414, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1700674369931221, "step": 8012 }, { "epoch": 0.36427272727272725, "grad_norm": 5.375, "grad_norm_var": 0.5641886393229166, "learning_rate": 0.0001, "loss": 6.0498, "loss/crossentropy": 2.6996167302131653, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.18130897358059883, "step": 8014 }, { "epoch": 0.3643636363636364, "grad_norm": 5.4375, "grad_norm_var": 0.5128865559895833, "learning_rate": 0.0001, "loss": 5.7445, "loss/crossentropy": 2.4011253714561462, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.1741826869547367, "step": 8016 }, { "epoch": 0.3644545454545455, "grad_norm": 5.4375, "grad_norm_var": 0.08183186848958333, "learning_rate": 0.0001, "loss": 5.7525, "loss/crossentropy": 2.510754317045212, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.17124631628394127, "step": 8018 }, { "epoch": 0.36454545454545456, "grad_norm": 5.5625, "grad_norm_var": 0.082666015625, "learning_rate": 0.0001, "loss": 6.1689, "loss/crossentropy": 2.695366382598877, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1895432248711586, "step": 8020 }, { "epoch": 0.36463636363636365, "grad_norm": 5.0, "grad_norm_var": 0.07623697916666666, "learning_rate": 0.0001, "loss": 5.5649, "loss/crossentropy": 2.379688322544098, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16597814857959747, "step": 8022 }, { "epoch": 0.36472727272727273, "grad_norm": 5.6875, "grad_norm_var": 0.076416015625, "learning_rate": 0.0001, "loss": 5.7581, "loss/crossentropy": 2.46904718875885, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17441261932253838, "step": 8024 }, { "epoch": 0.3648181818181818, "grad_norm": 5.5, "grad_norm_var": 0.06692708333333333, "learning_rate": 0.0001, "loss": 5.4981, "loss/crossentropy": 2.339071273803711, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16414179280400276, "step": 8026 }, { "epoch": 0.3649090909090909, "grad_norm": 5.375, "grad_norm_var": 0.034077962239583336, "learning_rate": 0.0001, "loss": 5.6433, "loss/crossentropy": 2.382120668888092, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.168302521109581, "step": 8028 }, { "epoch": 0.365, "grad_norm": 5.28125, "grad_norm_var": 0.04269205729166667, "learning_rate": 0.0001, "loss": 5.4984, "loss/crossentropy": 2.319732964038849, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.1633703000843525, "step": 8030 }, { "epoch": 0.3650909090909091, "grad_norm": 5.59375, "grad_norm_var": 0.09375, "learning_rate": 0.0001, "loss": 5.7067, "loss/crossentropy": 2.500293016433716, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16810392588377, "step": 8032 }, { "epoch": 0.36518181818181816, "grad_norm": 5.75, "grad_norm_var": 0.10051676432291666, "learning_rate": 0.0001, "loss": 5.7894, "loss/crossentropy": 2.5325969457626343, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.17001591250300407, "step": 8034 }, { "epoch": 0.36527272727272725, "grad_norm": 5.46875, "grad_norm_var": 0.10193684895833334, "learning_rate": 0.0001, "loss": 5.8645, "loss/crossentropy": 2.5773720741271973, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17558399587869644, "step": 8036 }, { "epoch": 0.3653636363636364, "grad_norm": 5.375, "grad_norm_var": 0.14807535807291666, "learning_rate": 0.0001, "loss": 5.3843, "loss/crossentropy": 2.277116119861603, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.15857001766562462, "step": 8038 }, { "epoch": 0.3654545454545455, "grad_norm": 5.40625, "grad_norm_var": 0.12810872395833334, "learning_rate": 0.0001, "loss": 5.8239, "loss/crossentropy": 2.5503156781196594, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.17325256764888763, "step": 8040 }, { "epoch": 0.36554545454545456, "grad_norm": 5.28125, "grad_norm_var": 0.12551676432291667, "learning_rate": 0.0001, "loss": 5.9072, "loss/crossentropy": 2.606301009654999, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17422770336270332, "step": 8042 }, { "epoch": 0.36563636363636365, "grad_norm": 6.28125, "grad_norm_var": 0.224853515625, "learning_rate": 0.0001, "loss": 6.2686, "loss/crossentropy": 2.735263466835022, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.19571838155388832, "step": 8044 }, { "epoch": 0.36572727272727273, "grad_norm": 5.21875, "grad_norm_var": 0.21985270182291666, "learning_rate": 0.0001, "loss": 5.8033, "loss/crossentropy": 2.5726553201675415, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1699444204568863, "step": 8046 }, { "epoch": 0.3658181818181818, "grad_norm": 4.90625, "grad_norm_var": 0.18414306640625, "learning_rate": 0.0001, "loss": 5.5364, "loss/crossentropy": 2.3820493817329407, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16387122124433517, "step": 8048 }, { "epoch": 0.3659090909090909, "grad_norm": 5.09375, "grad_norm_var": 0.178369140625, "learning_rate": 0.0001, "loss": 5.5725, "loss/crossentropy": 2.379706621170044, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.16458696499466896, "step": 8050 }, { "epoch": 0.366, "grad_norm": 7.09375, "grad_norm_var": 0.40818684895833335, "learning_rate": 0.0001, "loss": 5.5681, "loss/crossentropy": 2.365069657564163, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.16463553160429, "step": 8052 }, { "epoch": 0.3660909090909091, "grad_norm": 4.9375, "grad_norm_var": 0.360400390625, "learning_rate": 0.0001, "loss": 5.7207, "loss/crossentropy": 2.499022603034973, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16982489079236984, "step": 8054 }, { "epoch": 0.36618181818181816, "grad_norm": 5.375, "grad_norm_var": 3.8863240559895833, "learning_rate": 0.0001, "loss": 5.8193, "loss/crossentropy": 2.389000743627548, "loss/hidden": 1.685546875, "loss/jsd": 0.0, "loss/logits": 0.17447037249803543, "step": 8056 }, { "epoch": 0.36627272727272725, "grad_norm": 5.375, "grad_norm_var": 3.9604817708333333, "learning_rate": 0.0001, "loss": 5.6315, "loss/crossentropy": 2.4287036657333374, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16832483932375908, "step": 8058 }, { "epoch": 0.3663636363636364, "grad_norm": 5.375, "grad_norm_var": 3.967215983072917, "learning_rate": 0.0001, "loss": 5.7347, "loss/crossentropy": 2.436898171901703, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17470045015215874, "step": 8060 }, { "epoch": 0.3664545454545455, "grad_norm": 5.53125, "grad_norm_var": 3.9755859375, "learning_rate": 0.0001, "loss": 6.036, "loss/crossentropy": 2.712597906589508, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.17628547176718712, "step": 8062 }, { "epoch": 0.36654545454545456, "grad_norm": 5.25, "grad_norm_var": 3.934012858072917, "learning_rate": 0.0001, "loss": 5.5807, "loss/crossentropy": 2.4027384221553802, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16467326879501343, "step": 8064 }, { "epoch": 0.36663636363636365, "grad_norm": 4.71875, "grad_norm_var": 3.974983723958333, "learning_rate": 0.0001, "loss": 5.6228, "loss/crossentropy": 2.3908682465553284, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16947799921035767, "step": 8066 }, { "epoch": 0.36672727272727274, "grad_norm": 4.90625, "grad_norm_var": 3.83726806640625, "learning_rate": 0.0001, "loss": 5.8886, "loss/crossentropy": 2.641898512840271, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17076721787452698, "step": 8068 }, { "epoch": 0.3668181818181818, "grad_norm": 5.34375, "grad_norm_var": 3.8665201822916666, "learning_rate": 0.0001, "loss": 5.5888, "loss/crossentropy": 2.3851481676101685, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.16391753405332565, "step": 8070 }, { "epoch": 0.3669090909090909, "grad_norm": 5.5625, "grad_norm_var": 0.211181640625, "learning_rate": 0.0001, "loss": 5.9953, "loss/crossentropy": 2.6671950817108154, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18046338856220245, "step": 8072 }, { "epoch": 0.367, "grad_norm": 5.71875, "grad_norm_var": 0.19550374348958333, "learning_rate": 0.0001, "loss": 5.6358, "loss/crossentropy": 2.3240102529525757, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17648938670754433, "step": 8074 }, { "epoch": 0.3670909090909091, "grad_norm": 5.46875, "grad_norm_var": 0.19667561848958334, "learning_rate": 0.0001, "loss": 5.7328, "loss/crossentropy": 2.4125033020973206, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.17480114847421646, "step": 8076 }, { "epoch": 0.36718181818181816, "grad_norm": 5.03125, "grad_norm_var": 0.19377848307291667, "learning_rate": 0.0001, "loss": 5.9586, "loss/crossentropy": 2.616635739803314, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18029145523905754, "step": 8078 }, { "epoch": 0.36727272727272725, "grad_norm": 5.09375, "grad_norm_var": 0.19781494140625, "learning_rate": 0.0001, "loss": 5.4727, "loss/crossentropy": 2.258391499519348, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.1669364757835865, "step": 8080 }, { "epoch": 0.36736363636363634, "grad_norm": 5.125, "grad_norm_var": 0.08769124348958333, "learning_rate": 0.0001, "loss": 6.0235, "loss/crossentropy": 2.6877623200416565, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18084242194890976, "step": 8082 }, { "epoch": 0.3674545454545455, "grad_norm": 5.34375, "grad_norm_var": 0.0623046875, "learning_rate": 0.0001, "loss": 5.6898, "loss/crossentropy": 2.466749429702759, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.1697646863758564, "step": 8084 }, { "epoch": 0.36754545454545456, "grad_norm": 5.0, "grad_norm_var": 0.050581868489583334, "learning_rate": 0.0001, "loss": 6.0329, "loss/crossentropy": 2.7345253229141235, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.18022598326206207, "step": 8086 }, { "epoch": 0.36763636363636365, "grad_norm": 5.65625, "grad_norm_var": 0.06640218098958334, "learning_rate": 0.0001, "loss": 5.7743, "loss/crossentropy": 2.515002429485321, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17436616122722626, "step": 8088 }, { "epoch": 0.36772727272727274, "grad_norm": 5.40625, "grad_norm_var": 0.05543212890625, "learning_rate": 0.0001, "loss": 5.5662, "loss/crossentropy": 2.337611734867096, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.1703171581029892, "step": 8090 }, { "epoch": 0.3678181818181818, "grad_norm": 5.125, "grad_norm_var": 0.05396728515625, "learning_rate": 0.0001, "loss": 5.7836, "loss/crossentropy": 2.50414776802063, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17286524921655655, "step": 8092 }, { "epoch": 0.3679090909090909, "grad_norm": 4.78125, "grad_norm_var": 0.08329671223958333, "learning_rate": 0.0001, "loss": 5.3688, "loss/crossentropy": 2.29214870929718, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1588401459157467, "step": 8094 }, { "epoch": 0.368, "grad_norm": 5.21875, "grad_norm_var": 0.08683268229166667, "learning_rate": 0.0001, "loss": 5.835, "loss/crossentropy": 2.5572436451911926, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.1744585931301117, "step": 8096 }, { "epoch": 0.3680909090909091, "grad_norm": 5.53125, "grad_norm_var": 0.07818603515625, "learning_rate": 0.0001, "loss": 5.4367, "loss/crossentropy": 2.2426282167434692, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.16237959265708923, "step": 8098 }, { "epoch": 0.36818181818181817, "grad_norm": 5.46875, "grad_norm_var": 0.08739827473958334, "learning_rate": 0.0001, "loss": 5.9565, "loss/crossentropy": 2.678491771221161, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1748736947774887, "step": 8100 }, { "epoch": 0.36827272727272725, "grad_norm": 5.25, "grad_norm_var": 0.08487955729166667, "learning_rate": 0.0001, "loss": 5.6042, "loss/crossentropy": 2.4420596957206726, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.164655189961195, "step": 8102 }, { "epoch": 0.36836363636363634, "grad_norm": 5.65625, "grad_norm_var": 0.08336181640625, "learning_rate": 0.0001, "loss": 6.1538, "loss/crossentropy": 2.7385355830192566, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18684256821870804, "step": 8104 }, { "epoch": 0.3684545454545455, "grad_norm": 6.375, "grad_norm_var": 0.16653645833333333, "learning_rate": 0.0001, "loss": 5.2386, "loss/crossentropy": 2.126623719930649, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1592477336525917, "step": 8106 }, { "epoch": 0.36854545454545456, "grad_norm": 5.625, "grad_norm_var": 0.17565104166666667, "learning_rate": 0.0001, "loss": 5.9623, "loss/crossentropy": 2.620829224586487, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17867521569132805, "step": 8108 }, { "epoch": 0.36863636363636365, "grad_norm": 5.5, "grad_norm_var": 0.1236328125, "learning_rate": 0.0001, "loss": 5.7665, "loss/crossentropy": 2.4458479285240173, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.1740558221936226, "step": 8110 }, { "epoch": 0.36872727272727274, "grad_norm": 5.125, "grad_norm_var": 0.12667643229166667, "learning_rate": 0.0001, "loss": 5.8164, "loss/crossentropy": 2.500159204006195, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.17635483667254448, "step": 8112 }, { "epoch": 0.3688181818181818, "grad_norm": 5.46875, "grad_norm_var": 0.14534098307291668, "learning_rate": 0.0001, "loss": 5.7755, "loss/crossentropy": 2.5479100346565247, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.170217152684927, "step": 8114 }, { "epoch": 0.3689090909090909, "grad_norm": 4.71875, "grad_norm_var": 0.16855061848958333, "learning_rate": 0.0001, "loss": 5.8401, "loss/crossentropy": 2.581813156604767, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17388012632727623, "step": 8116 }, { "epoch": 0.369, "grad_norm": 5.40625, "grad_norm_var": 0.16698811848958334, "learning_rate": 0.0001, "loss": 5.8127, "loss/crossentropy": 2.558173358440399, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.1690112240612507, "step": 8118 }, { "epoch": 0.3690909090909091, "grad_norm": 5.53125, "grad_norm_var": 0.15755208333333334, "learning_rate": 0.0001, "loss": 5.8517, "loss/crossentropy": 2.573505222797394, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17567594349384308, "step": 8120 }, { "epoch": 0.36918181818181817, "grad_norm": 5.375, "grad_norm_var": 0.09335530598958333, "learning_rate": 0.0001, "loss": 5.5782, "loss/crossentropy": 2.37308731675148, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.16581884026527405, "step": 8122 }, { "epoch": 0.36927272727272725, "grad_norm": 5.0625, "grad_norm_var": 0.08001302083333334, "learning_rate": 0.0001, "loss": 5.4059, "loss/crossentropy": 2.2192187905311584, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.16203181073069572, "step": 8124 }, { "epoch": 0.36936363636363634, "grad_norm": 5.5, "grad_norm_var": 0.09563802083333334, "learning_rate": 0.0001, "loss": 5.9304, "loss/crossentropy": 2.602893054485321, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.1786499246954918, "step": 8126 }, { "epoch": 0.3694545454545455, "grad_norm": 5.0625, "grad_norm_var": 0.09358317057291667, "learning_rate": 0.0001, "loss": 5.9418, "loss/crossentropy": 2.656128168106079, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17544302344322205, "step": 8128 }, { "epoch": 0.36954545454545457, "grad_norm": 5.34375, "grad_norm_var": 0.07682291666666667, "learning_rate": 0.0001, "loss": 5.8193, "loss/crossentropy": 2.5149887204170227, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.17301039770245552, "step": 8130 }, { "epoch": 0.36963636363636365, "grad_norm": 5.28125, "grad_norm_var": 0.05712483723958333, "learning_rate": 0.0001, "loss": 5.9324, "loss/crossentropy": 2.5917213559150696, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.18114031106233597, "step": 8132 }, { "epoch": 0.36972727272727274, "grad_norm": 4.6875, "grad_norm_var": 0.08072916666666667, "learning_rate": 0.0001, "loss": 5.239, "loss/crossentropy": 2.2345215678215027, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15044613927602768, "step": 8134 }, { "epoch": 0.3698181818181818, "grad_norm": 4.78125, "grad_norm_var": 0.080322265625, "learning_rate": 0.0001, "loss": 5.5564, "loss/crossentropy": 2.458915889263153, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16052642092108727, "step": 8136 }, { "epoch": 0.3699090909090909, "grad_norm": 5.21875, "grad_norm_var": 0.07589518229166667, "learning_rate": 0.0001, "loss": 6.0798, "loss/crossentropy": 2.7681482434272766, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.18018972128629684, "step": 8138 }, { "epoch": 0.37, "grad_norm": 5.03125, "grad_norm_var": 0.07564697265625, "learning_rate": 0.0001, "loss": 5.7362, "loss/crossentropy": 2.5182217955589294, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.16886547580361366, "step": 8140 }, { "epoch": 0.3700909090909091, "grad_norm": 5.09375, "grad_norm_var": 0.040327962239583334, "learning_rate": 0.0001, "loss": 5.6237, "loss/crossentropy": 2.4056636095046997, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.16750460118055344, "step": 8142 }, { "epoch": 0.37018181818181817, "grad_norm": 5.46875, "grad_norm_var": 0.12535400390625, "learning_rate": 0.0001, "loss": 5.6906, "loss/crossentropy": 2.378077745437622, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.17480545118451118, "step": 8144 }, { "epoch": 0.37027272727272725, "grad_norm": 4.875, "grad_norm_var": 0.13605143229166666, "learning_rate": 0.0001, "loss": 5.2427, "loss/crossentropy": 2.1908567547798157, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.15401426330208778, "step": 8146 }, { "epoch": 0.37036363636363634, "grad_norm": 5.03125, "grad_norm_var": 0.13424072265625, "learning_rate": 0.0001, "loss": 5.7956, "loss/crossentropy": 2.544190466403961, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17064503952860832, "step": 8148 }, { "epoch": 0.3704545454545455, "grad_norm": 4.53125, "grad_norm_var": 0.14804280598958333, "learning_rate": 0.0001, "loss": 5.3916, "loss/crossentropy": 2.30328232049942, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.15707724913954735, "step": 8150 }, { "epoch": 0.37054545454545457, "grad_norm": 5.5625, "grad_norm_var": 0.15349934895833334, "learning_rate": 0.0001, "loss": 5.3614, "loss/crossentropy": 2.242383122444153, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.15975547581911087, "step": 8152 }, { "epoch": 0.37063636363636365, "grad_norm": 4.90625, "grad_norm_var": 0.15445556640625, "learning_rate": 0.0001, "loss": 5.882, "loss/crossentropy": 2.5952069759368896, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.1749710515141487, "step": 8154 }, { "epoch": 0.37072727272727274, "grad_norm": 5.15625, "grad_norm_var": 0.15284830729166668, "learning_rate": 0.0001, "loss": 5.9186, "loss/crossentropy": 2.60797518491745, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17715873569250107, "step": 8156 }, { "epoch": 0.3708181818181818, "grad_norm": 5.53125, "grad_norm_var": 0.20974934895833333, "learning_rate": 0.0001, "loss": 5.7129, "loss/crossentropy": 2.4027822017669678, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.17320308834314346, "step": 8158 }, { "epoch": 0.3709090909090909, "grad_norm": 5.375, "grad_norm_var": 0.14464518229166667, "learning_rate": 0.0001, "loss": 5.8002, "loss/crossentropy": 2.4999378323554993, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.1731879934668541, "step": 8160 }, { "epoch": 0.371, "grad_norm": 5.125, "grad_norm_var": 0.12245686848958333, "learning_rate": 0.0001, "loss": 5.9508, "loss/crossentropy": 2.6251959204673767, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.18041475117206573, "step": 8162 }, { "epoch": 0.3710909090909091, "grad_norm": 5.4375, "grad_norm_var": 0.35540364583333334, "learning_rate": 0.0001, "loss": 6.0083, "loss/crossentropy": 2.5810970067977905, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.18744781613349915, "step": 8164 }, { "epoch": 0.37118181818181817, "grad_norm": 5.78125, "grad_norm_var": 0.30579020182291666, "learning_rate": 0.0001, "loss": 6.0298, "loss/crossentropy": 2.6758949160575867, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17914079874753952, "step": 8166 }, { "epoch": 0.37127272727272725, "grad_norm": 4.9375, "grad_norm_var": 0.33151041666666664, "learning_rate": 0.0001, "loss": 5.4563, "loss/crossentropy": 2.3540584444999695, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16120271757245064, "step": 8168 }, { "epoch": 0.37136363636363634, "grad_norm": 5.1875, "grad_norm_var": 0.3175130208333333, "learning_rate": 0.0001, "loss": 5.7677, "loss/crossentropy": 2.4671287834644318, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.17517229542136192, "step": 8170 }, { "epoch": 0.3714545454545455, "grad_norm": 4.96875, "grad_norm_var": 0.33232014973958335, "learning_rate": 0.0001, "loss": 5.2699, "loss/crossentropy": 2.2155524492263794, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.15055323019623756, "step": 8172 }, { "epoch": 0.37154545454545457, "grad_norm": 5.125, "grad_norm_var": 0.32766520182291664, "learning_rate": 0.0001, "loss": 5.5106, "loss/crossentropy": 2.375662475824356, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16407577320933342, "step": 8174 }, { "epoch": 0.37163636363636365, "grad_norm": 5.3125, "grad_norm_var": 0.4369425455729167, "learning_rate": 0.0001, "loss": 5.7425, "loss/crossentropy": 2.4489214420318604, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17583931237459183, "step": 8176 }, { "epoch": 0.37172727272727274, "grad_norm": 4.9375, "grad_norm_var": 0.5149576822916667, "learning_rate": 0.0001, "loss": 5.7586, "loss/crossentropy": 2.4719176292419434, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.17457114905118942, "step": 8178 }, { "epoch": 0.3718181818181818, "grad_norm": 4.75, "grad_norm_var": 0.32597249348958335, "learning_rate": 0.0001, "loss": 5.5812, "loss/crossentropy": 2.4368614554405212, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16267117485404015, "step": 8180 }, { "epoch": 0.3719090909090909, "grad_norm": 9.6875, "grad_norm_var": 1.506494140625, "learning_rate": 0.0001, "loss": 5.8862, "loss/crossentropy": 2.409911274909973, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1890355348587036, "step": 8182 }, { "epoch": 0.372, "grad_norm": 5.3125, "grad_norm_var": 1.4439453125, "learning_rate": 0.0001, "loss": 5.9113, "loss/crossentropy": 2.647143542766571, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.1738797016441822, "step": 8184 }, { "epoch": 0.3720909090909091, "grad_norm": 5.375, "grad_norm_var": 1.45533447265625, "learning_rate": 0.0001, "loss": 5.6557, "loss/crossentropy": 2.4334771037101746, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.16695237532258034, "step": 8186 }, { "epoch": 0.37218181818181817, "grad_norm": 5.0, "grad_norm_var": 1.4359212239583334, "learning_rate": 0.0001, "loss": 5.6356, "loss/crossentropy": 2.3878210186958313, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17086929082870483, "step": 8188 }, { "epoch": 0.37227272727272726, "grad_norm": 5.09375, "grad_norm_var": 1.406103515625, "learning_rate": 0.0001, "loss": 5.8535, "loss/crossentropy": 2.595495820045471, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17189622670412064, "step": 8190 }, { "epoch": 0.37236363636363634, "grad_norm": 5.3125, "grad_norm_var": 1.3606730143229167, "learning_rate": 0.0001, "loss": 5.849, "loss/crossentropy": 2.588889718055725, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17269137129187584, "step": 8192 }, { "epoch": 0.3724545454545454, "grad_norm": 5.03125, "grad_norm_var": 1.3047159830729167, "learning_rate": 0.0001, "loss": 5.7746, "loss/crossentropy": 2.5345299243927, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.1726415604352951, "step": 8194 }, { "epoch": 0.37254545454545457, "grad_norm": 6.09375, "grad_norm_var": 1.3003214518229167, "learning_rate": 0.0001, "loss": 5.709, "loss/crossentropy": 2.4733389616012573, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.166922889649868, "step": 8196 }, { "epoch": 0.37263636363636365, "grad_norm": 5.46875, "grad_norm_var": 0.08834635416666667, "learning_rate": 0.0001, "loss": 5.2099, "loss/crossentropy": 2.120604395866394, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.1536605954170227, "step": 8198 }, { "epoch": 0.37272727272727274, "grad_norm": 5.09375, "grad_norm_var": 0.085400390625, "learning_rate": 0.0001, "loss": 6.1178, "loss/crossentropy": 2.7098426818847656, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18806355446577072, "step": 8200 }, { "epoch": 0.3728181818181818, "grad_norm": 5.15625, "grad_norm_var": 0.08553059895833333, "learning_rate": 0.0001, "loss": 5.58, "loss/crossentropy": 2.334290027618408, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.1704697273671627, "step": 8202 }, { "epoch": 0.3729090909090909, "grad_norm": 5.65625, "grad_norm_var": 0.10299072265625, "learning_rate": 0.0001, "loss": 6.1955, "loss/crossentropy": 2.725205957889557, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18960342556238174, "step": 8204 }, { "epoch": 0.373, "grad_norm": 5.09375, "grad_norm_var": 0.11392822265625, "learning_rate": 0.0001, "loss": 6.0222, "loss/crossentropy": 2.644789755344391, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18500644341111183, "step": 8206 }, { "epoch": 0.3730909090909091, "grad_norm": 5.21875, "grad_norm_var": 0.12789306640625, "learning_rate": 0.0001, "loss": 5.489, "loss/crossentropy": 2.3353055715560913, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.15990034490823746, "step": 8208 }, { "epoch": 0.37318181818181817, "grad_norm": 5.28125, "grad_norm_var": 0.12525634765625, "learning_rate": 0.0001, "loss": 5.6134, "loss/crossentropy": 2.386352002620697, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.1662573181092739, "step": 8210 }, { "epoch": 0.37327272727272726, "grad_norm": 5.4375, "grad_norm_var": 0.093994140625, "learning_rate": 0.0001, "loss": 6.0406, "loss/crossentropy": 2.6626161336898804, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.18213532492518425, "step": 8212 }, { "epoch": 0.37336363636363634, "grad_norm": 5.03125, "grad_norm_var": 0.11692301432291667, "learning_rate": 0.0001, "loss": 5.687, "loss/crossentropy": 2.529313623905182, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16635769978165627, "step": 8214 }, { "epoch": 0.37345454545454543, "grad_norm": 5.40625, "grad_norm_var": 0.110791015625, "learning_rate": 0.0001, "loss": 6.1636, "loss/crossentropy": 2.755027115345001, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18733739480376244, "step": 8216 }, { "epoch": 0.37354545454545457, "grad_norm": 5.84375, "grad_norm_var": 0.12107747395833333, "learning_rate": 0.0001, "loss": 5.5801, "loss/crossentropy": 2.3423517644405365, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17045382410287857, "step": 8218 }, { "epoch": 0.37363636363636366, "grad_norm": 5.15625, "grad_norm_var": 0.10937093098958334, "learning_rate": 0.0001, "loss": 5.7939, "loss/crossentropy": 2.5692931413650513, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16933632269501686, "step": 8220 }, { "epoch": 0.37372727272727274, "grad_norm": 5.6875, "grad_norm_var": 0.10543212890625, "learning_rate": 0.0001, "loss": 5.6536, "loss/crossentropy": 2.472529888153076, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16830483451485634, "step": 8222 }, { "epoch": 0.3738181818181818, "grad_norm": 4.84375, "grad_norm_var": 0.09855143229166667, "learning_rate": 0.0001, "loss": 5.9786, "loss/crossentropy": 2.6696455478668213, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.1783588044345379, "step": 8224 }, { "epoch": 0.3739090909090909, "grad_norm": 5.3125, "grad_norm_var": 0.092578125, "learning_rate": 0.0001, "loss": 6.0043, "loss/crossentropy": 2.656103253364563, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17935003712773323, "step": 8226 }, { "epoch": 0.374, "grad_norm": 5.84375, "grad_norm_var": 0.11979166666666667, "learning_rate": 0.0001, "loss": 5.3684, "loss/crossentropy": 2.2316839694976807, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.1599595658481121, "step": 8228 }, { "epoch": 0.3740909090909091, "grad_norm": 5.40625, "grad_norm_var": 0.10129801432291667, "learning_rate": 0.0001, "loss": 5.7504, "loss/crossentropy": 2.517499625682831, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1709471084177494, "step": 8230 }, { "epoch": 0.37418181818181817, "grad_norm": 5.125, "grad_norm_var": 0.11979166666666667, "learning_rate": 0.0001, "loss": 5.5088, "loss/crossentropy": 2.3159122467041016, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1657705269753933, "step": 8232 }, { "epoch": 0.37427272727272726, "grad_norm": 5.1875, "grad_norm_var": 0.11389567057291666, "learning_rate": 0.0001, "loss": 5.5817, "loss/crossentropy": 2.4306276440620422, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16530436277389526, "step": 8234 }, { "epoch": 0.37436363636363634, "grad_norm": 5.09375, "grad_norm_var": 0.1150390625, "learning_rate": 0.0001, "loss": 5.7722, "loss/crossentropy": 2.537528872489929, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1707340106368065, "step": 8236 }, { "epoch": 0.37445454545454543, "grad_norm": 5.1875, "grad_norm_var": 0.13238525390625, "learning_rate": 0.0001, "loss": 5.754, "loss/crossentropy": 2.480078876018524, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17465969175100327, "step": 8238 }, { "epoch": 0.37454545454545457, "grad_norm": 5.09375, "grad_norm_var": 0.12120768229166666, "learning_rate": 0.0001, "loss": 5.5829, "loss/crossentropy": 2.423685133457184, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16376811265945435, "step": 8240 }, { "epoch": 0.37463636363636366, "grad_norm": 4.78125, "grad_norm_var": 0.13261311848958332, "learning_rate": 0.0001, "loss": 5.234, "loss/crossentropy": 2.213123381137848, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.15149985253810883, "step": 8242 }, { "epoch": 0.37472727272727274, "grad_norm": 5.09375, "grad_norm_var": 0.08713785807291667, "learning_rate": 0.0001, "loss": 5.4811, "loss/crossentropy": 2.338470906019211, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.15937510877847672, "step": 8244 }, { "epoch": 0.37481818181818183, "grad_norm": 5.75, "grad_norm_var": 0.1298828125, "learning_rate": 0.0001, "loss": 6.1424, "loss/crossentropy": 2.7031256556510925, "loss/hidden": 1.580078125, "loss/jsd": 0.0, "loss/logits": 0.18592341989278793, "step": 8246 }, { "epoch": 0.3749090909090909, "grad_norm": 4.71875, "grad_norm_var": 0.132421875, "learning_rate": 0.0001, "loss": 5.7215, "loss/crossentropy": 2.494376540184021, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17075713723897934, "step": 8248 }, { "epoch": 0.375, "grad_norm": 5.125, "grad_norm_var": 0.12042643229166666, "learning_rate": 0.0001, "loss": 5.6858, "loss/crossentropy": 2.4531930685043335, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16955258324742317, "step": 8250 }, { "epoch": 0.3750909090909091, "grad_norm": 4.96875, "grad_norm_var": 0.13079020182291667, "learning_rate": 0.0001, "loss": 5.7007, "loss/crossentropy": 2.4465829730033875, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1746290624141693, "step": 8252 }, { "epoch": 0.37518181818181817, "grad_norm": 4.84375, "grad_norm_var": 0.14804280598958333, "learning_rate": 0.0001, "loss": 5.5926, "loss/crossentropy": 2.3572625517845154, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.17060556635260582, "step": 8254 }, { "epoch": 0.37527272727272726, "grad_norm": 5.09375, "grad_norm_var": 0.14778645833333334, "learning_rate": 0.0001, "loss": 5.6602, "loss/crossentropy": 2.456266164779663, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16863707080483437, "step": 8256 }, { "epoch": 0.37536363636363634, "grad_norm": 4.90625, "grad_norm_var": 0.13644205729166667, "learning_rate": 0.0001, "loss": 5.916, "loss/crossentropy": 2.7114774584770203, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16967536509037018, "step": 8258 }, { "epoch": 0.37545454545454543, "grad_norm": 5.09375, "grad_norm_var": 0.13339436848958333, "learning_rate": 0.0001, "loss": 5.5811, "loss/crossentropy": 2.384709358215332, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1645606942474842, "step": 8260 }, { "epoch": 0.37554545454545457, "grad_norm": 5.0, "grad_norm_var": 0.10338541666666666, "learning_rate": 0.0001, "loss": 5.8218, "loss/crossentropy": 2.565527081489563, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.17464859783649445, "step": 8262 }, { "epoch": 0.37563636363636366, "grad_norm": 5.5, "grad_norm_var": 0.09888916015625, "learning_rate": 0.0001, "loss": 5.5627, "loss/crossentropy": 2.3133752942085266, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1702437624335289, "step": 8264 }, { "epoch": 0.37572727272727274, "grad_norm": 7.71875, "grad_norm_var": 0.5108723958333333, "learning_rate": 0.0001, "loss": 5.5071, "loss/crossentropy": 2.401216149330139, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16000284627079964, "step": 8266 }, { "epoch": 0.37581818181818183, "grad_norm": 5.46875, "grad_norm_var": 0.49931233723958335, "learning_rate": 0.0001, "loss": 5.4897, "loss/crossentropy": 2.293437361717224, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.16357644274830818, "step": 8268 }, { "epoch": 0.3759090909090909, "grad_norm": 5.375, "grad_norm_var": 0.52437744140625, "learning_rate": 0.0001, "loss": 5.3381, "loss/crossentropy": 2.1990128457546234, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16136569529771805, "step": 8270 }, { "epoch": 0.376, "grad_norm": 5.65625, "grad_norm_var": 0.5395833333333333, "learning_rate": 0.0001, "loss": 5.3646, "loss/crossentropy": 2.2943168580532074, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.15683170408010483, "step": 8272 }, { "epoch": 0.3760909090909091, "grad_norm": 5.25, "grad_norm_var": 0.5233357747395834, "learning_rate": 0.0001, "loss": 5.9171, "loss/crossentropy": 2.5994438529014587, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1798134408891201, "step": 8274 }, { "epoch": 0.3761818181818182, "grad_norm": 5.15625, "grad_norm_var": 0.5330078125, "learning_rate": 0.0001, "loss": 5.596, "loss/crossentropy": 2.406097948551178, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.16605643928050995, "step": 8276 }, { "epoch": 0.37627272727272726, "grad_norm": 5.625, "grad_norm_var": 0.517431640625, "learning_rate": 0.0001, "loss": 5.9439, "loss/crossentropy": 2.5951586961746216, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18136289343237877, "step": 8278 }, { "epoch": 0.37636363636363634, "grad_norm": 5.15625, "grad_norm_var": 0.5151326497395833, "learning_rate": 0.0001, "loss": 5.9014, "loss/crossentropy": 2.6029189825057983, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.17613568902015686, "step": 8280 }, { "epoch": 0.37645454545454543, "grad_norm": 5.15625, "grad_norm_var": 0.14466145833333333, "learning_rate": 0.0001, "loss": 5.6109, "loss/crossentropy": 2.4476775527000427, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16436461359262466, "step": 8282 }, { "epoch": 0.37654545454545457, "grad_norm": 5.53125, "grad_norm_var": 0.17420247395833333, "learning_rate": 0.0001, "loss": 5.5803, "loss/crossentropy": 2.395129084587097, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16597681865096092, "step": 8284 }, { "epoch": 0.37663636363636366, "grad_norm": 5.9375, "grad_norm_var": 0.12277018229166667, "learning_rate": 0.0001, "loss": 5.936, "loss/crossentropy": 2.6297133564949036, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17593741789460182, "step": 8286 }, { "epoch": 0.37672727272727274, "grad_norm": 4.53125, "grad_norm_var": 0.17724202473958334, "learning_rate": 0.0001, "loss": 5.4574, "loss/crossentropy": 2.2807492315769196, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16571568697690964, "step": 8288 }, { "epoch": 0.37681818181818183, "grad_norm": 5.0625, "grad_norm_var": 0.17884114583333333, "learning_rate": 0.0001, "loss": 5.7622, "loss/crossentropy": 2.547470986843109, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17147643119096756, "step": 8290 }, { "epoch": 0.3769090909090909, "grad_norm": 5.21875, "grad_norm_var": 0.19986572265625, "learning_rate": 0.0001, "loss": 5.2666, "loss/crossentropy": 2.1774691343307495, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.15520313382148743, "step": 8292 }, { "epoch": 0.377, "grad_norm": 4.9375, "grad_norm_var": 0.199853515625, "learning_rate": 0.0001, "loss": 5.4004, "loss/crossentropy": 2.352587878704071, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15478188917040825, "step": 8294 }, { "epoch": 0.3770909090909091, "grad_norm": 5.34375, "grad_norm_var": 0.18603108723958334, "learning_rate": 0.0001, "loss": 5.7521, "loss/crossentropy": 2.508893072605133, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17080892249941826, "step": 8296 }, { "epoch": 0.3771818181818182, "grad_norm": 7.4375, "grad_norm_var": 0.5590983072916667, "learning_rate": 0.0001, "loss": 5.9198, "loss/crossentropy": 2.4795799255371094, "loss/hidden": 1.603515625, "loss/jsd": 0.0, "loss/logits": 0.18367064371705055, "step": 8298 }, { "epoch": 0.37727272727272726, "grad_norm": 4.59375, "grad_norm_var": 0.5766886393229167, "learning_rate": 0.0001, "loss": 5.5248, "loss/crossentropy": 2.3441748917102814, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.1647471934556961, "step": 8300 }, { "epoch": 0.37736363636363635, "grad_norm": 5.65625, "grad_norm_var": 0.5658162434895834, "learning_rate": 0.0001, "loss": 5.9994, "loss/crossentropy": 2.6634527444839478, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17929809913039207, "step": 8302 }, { "epoch": 0.37745454545454543, "grad_norm": 4.4375, "grad_norm_var": 0.5693318684895833, "learning_rate": 0.0001, "loss": 5.3871, "loss/crossentropy": 2.3219440281391144, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1545669212937355, "step": 8304 }, { "epoch": 0.3775454545454546, "grad_norm": 4.96875, "grad_norm_var": 0.5836873372395833, "learning_rate": 0.0001, "loss": 5.7762, "loss/crossentropy": 2.5134026408195496, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.17335481196641922, "step": 8306 }, { "epoch": 0.37763636363636366, "grad_norm": 5.375, "grad_norm_var": 0.5646484375, "learning_rate": 0.0001, "loss": 5.4716, "loss/crossentropy": 2.2978808283805847, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.16131756454706192, "step": 8308 }, { "epoch": 0.37772727272727274, "grad_norm": 4.90625, "grad_norm_var": 0.5719401041666666, "learning_rate": 0.0001, "loss": 5.938, "loss/crossentropy": 2.623380184173584, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17794320732355118, "step": 8310 }, { "epoch": 0.37781818181818183, "grad_norm": 4.9375, "grad_norm_var": 0.5921712239583333, "learning_rate": 0.0001, "loss": 5.967, "loss/crossentropy": 2.654853641986847, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17671894282102585, "step": 8312 }, { "epoch": 0.3779090909090909, "grad_norm": 5.46875, "grad_norm_var": 0.189306640625, "learning_rate": 0.0001, "loss": 5.7461, "loss/crossentropy": 2.5182356238365173, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.16848581284284592, "step": 8314 }, { "epoch": 0.378, "grad_norm": 5.4375, "grad_norm_var": 0.16308186848958334, "learning_rate": 0.0001, "loss": 5.3953, "loss/crossentropy": 2.299838960170746, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1585710532963276, "step": 8316 }, { "epoch": 0.3780909090909091, "grad_norm": 4.8125, "grad_norm_var": 0.16327718098958333, "learning_rate": 0.0001, "loss": 5.6711, "loss/crossentropy": 2.498646318912506, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16743874177336693, "step": 8318 }, { "epoch": 0.3781818181818182, "grad_norm": 5.21875, "grad_norm_var": 0.12030843098958334, "learning_rate": 0.0001, "loss": 5.343, "loss/crossentropy": 2.2000731825828552, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16156090423464775, "step": 8320 }, { "epoch": 0.37827272727272726, "grad_norm": 4.875, "grad_norm_var": 0.09685872395833334, "learning_rate": 0.0001, "loss": 5.4495, "loss/crossentropy": 2.30002224445343, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16318758949637413, "step": 8322 }, { "epoch": 0.37836363636363635, "grad_norm": 5.5, "grad_norm_var": 0.10022379557291666, "learning_rate": 0.0001, "loss": 5.8745, "loss/crossentropy": 2.6313043236732483, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17314831912517548, "step": 8324 }, { "epoch": 0.37845454545454543, "grad_norm": 5.0625, "grad_norm_var": 0.8516276041666667, "learning_rate": 0.0001, "loss": 5.8111, "loss/crossentropy": 2.517442047595978, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.17448030039668083, "step": 8326 }, { "epoch": 0.3785454545454545, "grad_norm": 5.4375, "grad_norm_var": 0.8347615559895833, "learning_rate": 0.0001, "loss": 6.0719, "loss/crossentropy": 2.717946410179138, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.1805122271180153, "step": 8328 }, { "epoch": 0.37863636363636366, "grad_norm": 5.40625, "grad_norm_var": 3.6026326497395833, "learning_rate": 0.0001, "loss": 5.7044, "loss/crossentropy": 2.3147695064544678, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1823260374367237, "step": 8330 }, { "epoch": 0.37872727272727275, "grad_norm": 5.0625, "grad_norm_var": 3.5540364583333335, "learning_rate": 0.0001, "loss": 5.8433, "loss/crossentropy": 2.5096236169338226, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.17477700114250183, "step": 8332 }, { "epoch": 0.37881818181818183, "grad_norm": 5.375, "grad_norm_var": 3.521223958333333, "learning_rate": 0.0001, "loss": 5.8494, "loss/crossentropy": 2.6232800483703613, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17105373740196228, "step": 8334 }, { "epoch": 0.3789090909090909, "grad_norm": 5.40625, "grad_norm_var": 3.5983072916666665, "learning_rate": 0.0001, "loss": 5.4846, "loss/crossentropy": 2.3373343348503113, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16199643164873123, "step": 8336 }, { "epoch": 0.379, "grad_norm": 5.6875, "grad_norm_var": 3.5287760416666667, "learning_rate": 0.0001, "loss": 5.7742, "loss/crossentropy": 2.4232208728790283, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.17943555116653442, "step": 8338 }, { "epoch": 0.3790909090909091, "grad_norm": 5.125, "grad_norm_var": 3.515755208333333, "learning_rate": 0.0001, "loss": 5.5278, "loss/crossentropy": 2.3061121702194214, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17099732160568237, "step": 8340 }, { "epoch": 0.3791818181818182, "grad_norm": 4.8125, "grad_norm_var": 3.027632649739583, "learning_rate": 0.0001, "loss": 5.2137, "loss/crossentropy": 2.1467910706996918, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.15278365835547447, "step": 8342 }, { "epoch": 0.37927272727272726, "grad_norm": 5.53125, "grad_norm_var": 3.01793212890625, "learning_rate": 0.0001, "loss": 6.0798, "loss/crossentropy": 2.6948848962783813, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18536989018321037, "step": 8344 }, { "epoch": 0.37936363636363635, "grad_norm": 7.1875, "grad_norm_var": 0.37896728515625, "learning_rate": 0.0001, "loss": 5.9218, "loss/crossentropy": 2.537634313106537, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.18079646304249763, "step": 8346 }, { "epoch": 0.37945454545454543, "grad_norm": 6.65625, "grad_norm_var": 0.478759765625, "learning_rate": 0.0001, "loss": 5.8874, "loss/crossentropy": 2.403881072998047, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.18838820978999138, "step": 8348 }, { "epoch": 0.3795454545454545, "grad_norm": 5.28125, "grad_norm_var": 0.46028238932291665, "learning_rate": 0.0001, "loss": 6.0016, "loss/crossentropy": 2.661629617214203, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17930816859006882, "step": 8350 }, { "epoch": 0.37963636363636366, "grad_norm": 5.125, "grad_norm_var": 0.4144816080729167, "learning_rate": 0.0001, "loss": 5.5739, "loss/crossentropy": 2.4119803309440613, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1659940518438816, "step": 8352 }, { "epoch": 0.37972727272727275, "grad_norm": 5.875, "grad_norm_var": 0.41558837890625, "learning_rate": 0.0001, "loss": 5.6561, "loss/crossentropy": 2.289001226425171, "loss/hidden": 1.650390625, "loss/jsd": 0.0, "loss/logits": 0.17167416214942932, "step": 8354 }, { "epoch": 0.37981818181818183, "grad_norm": 5.5, "grad_norm_var": 0.4127604166666667, "learning_rate": 0.0001, "loss": 5.8259, "loss/crossentropy": 2.553327739238739, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.17002886533737183, "step": 8356 }, { "epoch": 0.3799090909090909, "grad_norm": 4.84375, "grad_norm_var": 0.4032185872395833, "learning_rate": 0.0001, "loss": 5.9285, "loss/crossentropy": 2.6243273615837097, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17455332353711128, "step": 8358 }, { "epoch": 0.38, "grad_norm": 5.09375, "grad_norm_var": 0.45388997395833336, "learning_rate": 0.0001, "loss": 5.4975, "loss/crossentropy": 2.3675565123558044, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16494872979819775, "step": 8360 }, { "epoch": 0.3800909090909091, "grad_norm": 5.125, "grad_norm_var": 0.28736572265625, "learning_rate": 0.0001, "loss": 5.3964, "loss/crossentropy": 2.301552802324295, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15869933739304543, "step": 8362 }, { "epoch": 0.3801818181818182, "grad_norm": 5.84375, "grad_norm_var": 0.10716145833333333, "learning_rate": 0.0001, "loss": 5.9978, "loss/crossentropy": 2.6082149147987366, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18310096114873886, "step": 8364 }, { "epoch": 0.38027272727272726, "grad_norm": 5.28125, "grad_norm_var": 0.11248372395833334, "learning_rate": 0.0001, "loss": 5.9042, "loss/crossentropy": 2.631616324186325, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17569394037127495, "step": 8366 }, { "epoch": 0.38036363636363635, "grad_norm": 4.9375, "grad_norm_var": 0.12294514973958333, "learning_rate": 0.0001, "loss": 5.4878, "loss/crossentropy": 2.315430372953415, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16333090886473656, "step": 8368 }, { "epoch": 0.38045454545454543, "grad_norm": 4.78125, "grad_norm_var": 0.11620686848958334, "learning_rate": 0.0001, "loss": 5.6507, "loss/crossentropy": 2.485536575317383, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16553892940282822, "step": 8370 }, { "epoch": 0.3805454545454545, "grad_norm": 5.625, "grad_norm_var": 0.11873372395833333, "learning_rate": 0.0001, "loss": 5.8619, "loss/crossentropy": 2.592230260372162, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1734473966062069, "step": 8372 }, { "epoch": 0.38063636363636366, "grad_norm": 4.90625, "grad_norm_var": 0.10718994140625, "learning_rate": 0.0001, "loss": 5.7721, "loss/crossentropy": 2.511164665222168, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.1727733351290226, "step": 8374 }, { "epoch": 0.38072727272727275, "grad_norm": 4.25, "grad_norm_var": 0.1712890625, "learning_rate": 0.0001, "loss": 5.0384, "loss/crossentropy": 2.0726614892482758, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1469679195433855, "step": 8376 }, { "epoch": 0.38081818181818183, "grad_norm": 6.0625, "grad_norm_var": 0.1994140625, "learning_rate": 0.0001, "loss": 5.8005, "loss/crossentropy": 2.4279830157756805, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.17982980981469154, "step": 8378 }, { "epoch": 0.3809090909090909, "grad_norm": 7.84375, "grad_norm_var": 0.6077473958333334, "learning_rate": 0.0001, "loss": 5.8103, "loss/crossentropy": 2.4725541472434998, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17870071530342102, "step": 8380 }, { "epoch": 0.381, "grad_norm": 5.71875, "grad_norm_var": 0.8950358072916667, "learning_rate": 0.0001, "loss": 5.8085, "loss/crossentropy": 2.4390189349651337, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.1781585030257702, "step": 8382 }, { "epoch": 0.3810909090909091, "grad_norm": 5.34375, "grad_norm_var": 0.8603800455729167, "learning_rate": 0.0001, "loss": 5.8805, "loss/crossentropy": 2.6172209978103638, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1724209003150463, "step": 8384 }, { "epoch": 0.3811818181818182, "grad_norm": 6.03125, "grad_norm_var": 0.81363525390625, "learning_rate": 0.0001, "loss": 5.6637, "loss/crossentropy": 2.4345000982284546, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17057298496365547, "step": 8386 }, { "epoch": 0.38127272727272726, "grad_norm": 5.65625, "grad_norm_var": 0.80318603515625, "learning_rate": 0.0001, "loss": 5.2719, "loss/crossentropy": 2.126842677593231, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1578647457063198, "step": 8388 }, { "epoch": 0.38136363636363635, "grad_norm": 5.3125, "grad_norm_var": 0.74283447265625, "learning_rate": 0.0001, "loss": 5.8833, "loss/crossentropy": 2.63182932138443, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.17338494583964348, "step": 8390 }, { "epoch": 0.38145454545454544, "grad_norm": 4.84375, "grad_norm_var": 0.6581380208333333, "learning_rate": 0.0001, "loss": 5.8946, "loss/crossentropy": 2.5810691714286804, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.18037337437272072, "step": 8392 }, { "epoch": 0.3815454545454545, "grad_norm": 5.25, "grad_norm_var": 0.6620076497395834, "learning_rate": 0.0001, "loss": 5.9237, "loss/crossentropy": 2.5990009903907776, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.1760287657380104, "step": 8394 }, { "epoch": 0.38163636363636366, "grad_norm": 9.375, "grad_norm_var": 1.2362589518229166, "learning_rate": 0.0001, "loss": 5.9588, "loss/crossentropy": 2.511056125164032, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.1883287988603115, "step": 8396 }, { "epoch": 0.38172727272727275, "grad_norm": 4.65625, "grad_norm_var": 1.1199055989583333, "learning_rate": 0.0001, "loss": 5.8341, "loss/crossentropy": 2.6457808017730713, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16531788930296898, "step": 8398 }, { "epoch": 0.38181818181818183, "grad_norm": 5.78125, "grad_norm_var": 1.1144368489583334, "learning_rate": 0.0001, "loss": 5.9022, "loss/crossentropy": 2.5401413440704346, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.17937396839261055, "step": 8400 }, { "epoch": 0.3819090909090909, "grad_norm": 5.1875, "grad_norm_var": 1.1233357747395833, "learning_rate": 0.0001, "loss": 5.9396, "loss/crossentropy": 2.637661874294281, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.177065622061491, "step": 8402 }, { "epoch": 0.382, "grad_norm": 4.90625, "grad_norm_var": 1.166015625, "learning_rate": 0.0001, "loss": 5.7837, "loss/crossentropy": 2.5297065675258636, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.17364533990621567, "step": 8404 }, { "epoch": 0.3820909090909091, "grad_norm": 5.15625, "grad_norm_var": 1.1747233072916667, "learning_rate": 0.0001, "loss": 5.6988, "loss/crossentropy": 2.46262127161026, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.16814491152763367, "step": 8406 }, { "epoch": 0.3821818181818182, "grad_norm": 4.96875, "grad_norm_var": 1.1706990559895833, "learning_rate": 0.0001, "loss": 5.6659, "loss/crossentropy": 2.4818076491355896, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1652812361717224, "step": 8408 }, { "epoch": 0.38227272727272726, "grad_norm": 6.40625, "grad_norm_var": 1.38101806640625, "learning_rate": 0.0001, "loss": 5.5884, "loss/crossentropy": 2.361419439315796, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.16820595040917397, "step": 8410 }, { "epoch": 0.38236363636363635, "grad_norm": 4.84375, "grad_norm_var": 0.46920572916666664, "learning_rate": 0.0001, "loss": 5.3075, "loss/crossentropy": 2.218160957098007, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.15659499168395996, "step": 8412 }, { "epoch": 0.38245454545454544, "grad_norm": 5.0, "grad_norm_var": 0.4725870768229167, "learning_rate": 0.0001, "loss": 5.8989, "loss/crossentropy": 2.5301853716373444, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18140285834670067, "step": 8414 }, { "epoch": 0.3825454545454545, "grad_norm": 5.5, "grad_norm_var": 0.4727701822916667, "learning_rate": 0.0001, "loss": 5.7839, "loss/crossentropy": 2.517665207386017, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.17173795774579048, "step": 8416 }, { "epoch": 0.38263636363636366, "grad_norm": 5.6875, "grad_norm_var": 0.49394124348958335, "learning_rate": 0.0001, "loss": 5.7866, "loss/crossentropy": 2.5493377447128296, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1706039048731327, "step": 8418 }, { "epoch": 0.38272727272727275, "grad_norm": 5.25, "grad_norm_var": 0.48639322916666666, "learning_rate": 0.0001, "loss": 5.9006, "loss/crossentropy": 2.684768259525299, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17002328485250473, "step": 8420 }, { "epoch": 0.38281818181818184, "grad_norm": 5.28125, "grad_norm_var": 0.48306884765625, "learning_rate": 0.0001, "loss": 5.8734, "loss/crossentropy": 2.591350495815277, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17586291208863258, "step": 8422 }, { "epoch": 0.3829090909090909, "grad_norm": 5.28125, "grad_norm_var": 0.47649739583333334, "learning_rate": 0.0001, "loss": 5.7598, "loss/crossentropy": 2.5121834874153137, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.17105427756905556, "step": 8424 }, { "epoch": 0.383, "grad_norm": 4.96875, "grad_norm_var": 0.18509114583333333, "learning_rate": 0.0001, "loss": 5.3571, "loss/crossentropy": 2.2290828227996826, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16143294051289558, "step": 8426 }, { "epoch": 0.3830909090909091, "grad_norm": 5.03125, "grad_norm_var": 0.17552083333333332, "learning_rate": 0.0001, "loss": 5.6483, "loss/crossentropy": 2.4121710658073425, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17029278352856636, "step": 8428 }, { "epoch": 0.3831818181818182, "grad_norm": 4.71875, "grad_norm_var": 0.08899332682291666, "learning_rate": 0.0001, "loss": 5.6493, "loss/crossentropy": 2.4458039700984955, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17015575245022774, "step": 8430 }, { "epoch": 0.38327272727272726, "grad_norm": 4.8125, "grad_norm_var": 0.09016520182291667, "learning_rate": 0.0001, "loss": 5.682, "loss/crossentropy": 2.5042206048965454, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16699527949094772, "step": 8432 }, { "epoch": 0.38336363636363635, "grad_norm": 4.84375, "grad_norm_var": 0.06959228515625, "learning_rate": 0.0001, "loss": 5.108, "loss/crossentropy": 2.0410006046295166, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.15143056958913803, "step": 8434 }, { "epoch": 0.38345454545454544, "grad_norm": 4.90625, "grad_norm_var": 0.070947265625, "learning_rate": 0.0001, "loss": 5.6766, "loss/crossentropy": 2.460633635520935, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.16867170482873917, "step": 8436 }, { "epoch": 0.3835454545454545, "grad_norm": 4.59375, "grad_norm_var": 0.08097330729166667, "learning_rate": 0.0001, "loss": 6.0186, "loss/crossentropy": 2.7822524309158325, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1740286573767662, "step": 8438 }, { "epoch": 0.3836363636363636, "grad_norm": 4.875, "grad_norm_var": 0.0796875, "learning_rate": 0.0001, "loss": 5.6564, "loss/crossentropy": 2.487436592578888, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.1635809987783432, "step": 8440 }, { "epoch": 0.38372727272727275, "grad_norm": 5.59375, "grad_norm_var": 0.14680989583333334, "learning_rate": 0.0001, "loss": 5.9118, "loss/crossentropy": 2.5641437768936157, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17929362133145332, "step": 8442 }, { "epoch": 0.38381818181818184, "grad_norm": 4.96875, "grad_norm_var": 0.140478515625, "learning_rate": 0.0001, "loss": 5.7036, "loss/crossentropy": 2.5107139945030212, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1675320826470852, "step": 8444 }, { "epoch": 0.3839090909090909, "grad_norm": 5.5, "grad_norm_var": 0.13873697916666666, "learning_rate": 0.0001, "loss": 5.5528, "loss/crossentropy": 2.3540187776088715, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.16499492526054382, "step": 8446 }, { "epoch": 0.384, "grad_norm": 5.1875, "grad_norm_var": 0.13391520182291666, "learning_rate": 0.0001, "loss": 5.4875, "loss/crossentropy": 2.2930628657341003, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1665184535086155, "step": 8448 }, { "epoch": 0.3840909090909091, "grad_norm": 5.15625, "grad_norm_var": 0.13365478515625, "learning_rate": 0.0001, "loss": 5.6794, "loss/crossentropy": 2.526404023170471, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16549064964056015, "step": 8450 }, { "epoch": 0.3841818181818182, "grad_norm": 5.8125, "grad_norm_var": 0.15510660807291668, "learning_rate": 0.0001, "loss": 5.829, "loss/crossentropy": 2.4704532623291016, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1819518618285656, "step": 8452 }, { "epoch": 0.38427272727272727, "grad_norm": 5.65625, "grad_norm_var": 0.14453125, "learning_rate": 0.0001, "loss": 5.8545, "loss/crossentropy": 2.5661814212799072, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17375653982162476, "step": 8454 }, { "epoch": 0.38436363636363635, "grad_norm": 6.3125, "grad_norm_var": 0.18095296223958332, "learning_rate": 0.0001, "loss": 5.4894, "loss/crossentropy": 2.3264094591140747, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16258541867136955, "step": 8456 }, { "epoch": 0.38445454545454544, "grad_norm": 4.78125, "grad_norm_var": 0.2103515625, "learning_rate": 0.0001, "loss": 5.671, "loss/crossentropy": 2.4899386763572693, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16536922380328178, "step": 8458 }, { "epoch": 0.3845454545454545, "grad_norm": 5.125, "grad_norm_var": 0.21419270833333334, "learning_rate": 0.0001, "loss": 5.7831, "loss/crossentropy": 2.5306739807128906, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17290205880999565, "step": 8460 }, { "epoch": 0.3846363636363636, "grad_norm": 5.25, "grad_norm_var": 0.21236572265625, "learning_rate": 0.0001, "loss": 5.8348, "loss/crossentropy": 2.534018039703369, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17793341726064682, "step": 8462 }, { "epoch": 0.38472727272727275, "grad_norm": 4.78125, "grad_norm_var": 0.22174479166666666, "learning_rate": 0.0001, "loss": 5.7905, "loss/crossentropy": 2.547767996788025, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.17368892580270767, "step": 8464 }, { "epoch": 0.38481818181818184, "grad_norm": 5.0, "grad_norm_var": 0.211572265625, "learning_rate": 0.0001, "loss": 6.0711, "loss/crossentropy": 2.718814432621002, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.1803455725312233, "step": 8466 }, { "epoch": 0.3849090909090909, "grad_norm": 5.34375, "grad_norm_var": 0.19595947265625, "learning_rate": 0.0001, "loss": 5.8305, "loss/crossentropy": 2.569219708442688, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17260993644595146, "step": 8468 }, { "epoch": 0.385, "grad_norm": 5.09375, "grad_norm_var": 0.18995768229166668, "learning_rate": 0.0001, "loss": 5.9319, "loss/crossentropy": 2.572605311870575, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18241384997963905, "step": 8470 }, { "epoch": 0.3850909090909091, "grad_norm": 4.6875, "grad_norm_var": 0.13632405598958333, "learning_rate": 0.0001, "loss": 5.8272, "loss/crossentropy": 2.579208552837372, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1736292876303196, "step": 8472 }, { "epoch": 0.3851818181818182, "grad_norm": 5.09375, "grad_norm_var": 0.06539306640625, "learning_rate": 0.0001, "loss": 5.8661, "loss/crossentropy": 2.6428810358047485, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.17290344834327698, "step": 8474 }, { "epoch": 0.38527272727272727, "grad_norm": 5.21875, "grad_norm_var": 0.068994140625, "learning_rate": 0.0001, "loss": 5.705, "loss/crossentropy": 2.4443294405937195, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.1739169806241989, "step": 8476 }, { "epoch": 0.38536363636363635, "grad_norm": 5.40625, "grad_norm_var": 0.07646077473958333, "learning_rate": 0.0001, "loss": 5.7868, "loss/crossentropy": 2.539016008377075, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17477479577064514, "step": 8478 }, { "epoch": 0.38545454545454544, "grad_norm": 5.4375, "grad_norm_var": 0.06884358723958334, "learning_rate": 0.0001, "loss": 5.7441, "loss/crossentropy": 2.523398458957672, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.17148634046316147, "step": 8480 }, { "epoch": 0.3855454545454545, "grad_norm": 5.3125, "grad_norm_var": 0.09299723307291667, "learning_rate": 0.0001, "loss": 5.5717, "loss/crossentropy": 2.428604304790497, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16430781036615372, "step": 8482 }, { "epoch": 0.3856363636363636, "grad_norm": 5.0, "grad_norm_var": 0.09446614583333333, "learning_rate": 0.0001, "loss": 5.341, "loss/crossentropy": 2.2287389934062958, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.15732057765126228, "step": 8484 }, { "epoch": 0.38572727272727275, "grad_norm": 5.21875, "grad_norm_var": 0.076806640625, "learning_rate": 0.0001, "loss": 5.5175, "loss/crossentropy": 2.351514458656311, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16503284871578217, "step": 8486 }, { "epoch": 0.38581818181818184, "grad_norm": 5.5, "grad_norm_var": 0.065869140625, "learning_rate": 0.0001, "loss": 5.5945, "loss/crossentropy": 2.3431498408317566, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.16810211911797523, "step": 8488 }, { "epoch": 0.3859090909090909, "grad_norm": 4.90625, "grad_norm_var": 0.06747639973958333, "learning_rate": 0.0001, "loss": 5.7548, "loss/crossentropy": 2.511168956756592, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17045852914452553, "step": 8490 }, { "epoch": 0.386, "grad_norm": 5.09375, "grad_norm_var": 0.059488932291666664, "learning_rate": 0.0001, "loss": 5.8391, "loss/crossentropy": 2.5642592906951904, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17534056678414345, "step": 8492 }, { "epoch": 0.3860909090909091, "grad_norm": 4.6875, "grad_norm_var": 0.060530598958333334, "learning_rate": 0.0001, "loss": 5.4748, "loss/crossentropy": 2.369950771331787, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.15912266820669174, "step": 8494 }, { "epoch": 0.3861818181818182, "grad_norm": 4.8125, "grad_norm_var": 0.06975504557291666, "learning_rate": 0.0001, "loss": 5.2498, "loss/crossentropy": 2.2084809243679047, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1558869183063507, "step": 8496 }, { "epoch": 0.38627272727272727, "grad_norm": 8.75, "grad_norm_var": 0.93873291015625, "learning_rate": 0.0001, "loss": 5.1591, "loss/crossentropy": 2.0826890766620636, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15686029940843582, "step": 8498 }, { "epoch": 0.38636363636363635, "grad_norm": 6.375, "grad_norm_var": 1.0029581705729167, "learning_rate": 0.0001, "loss": 6.1737, "loss/crossentropy": 2.6253671646118164, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.1960403509438038, "step": 8500 }, { "epoch": 0.38645454545454544, "grad_norm": 5.25, "grad_norm_var": 0.9998046875, "learning_rate": 0.0001, "loss": 5.6218, "loss/crossentropy": 2.387474238872528, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17030929401516914, "step": 8502 }, { "epoch": 0.3865454545454545, "grad_norm": 5.46875, "grad_norm_var": 1.0014322916666667, "learning_rate": 0.0001, "loss": 5.9621, "loss/crossentropy": 2.6296172738075256, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1801183633506298, "step": 8504 }, { "epoch": 0.3866363636363636, "grad_norm": 5.625, "grad_norm_var": 0.986328125, "learning_rate": 0.0001, "loss": 6.1824, "loss/crossentropy": 2.79385906457901, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18338261172175407, "step": 8506 }, { "epoch": 0.38672727272727275, "grad_norm": 5.28125, "grad_norm_var": 1.00806884765625, "learning_rate": 0.0001, "loss": 5.8927, "loss/crossentropy": 2.620570182800293, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1744769886136055, "step": 8508 }, { "epoch": 0.38681818181818184, "grad_norm": 5.28125, "grad_norm_var": 0.9799112955729167, "learning_rate": 0.0001, "loss": 5.7062, "loss/crossentropy": 2.471191108226776, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17018308863043785, "step": 8510 }, { "epoch": 0.3869090909090909, "grad_norm": 5.375, "grad_norm_var": 1.0447550455729167, "learning_rate": 0.0001, "loss": 5.5554, "loss/crossentropy": 2.371455430984497, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16487492620944977, "step": 8512 }, { "epoch": 0.387, "grad_norm": 5.625, "grad_norm_var": 0.294384765625, "learning_rate": 0.0001, "loss": 5.8788, "loss/crossentropy": 2.556325376033783, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17775101959705353, "step": 8514 }, { "epoch": 0.3870909090909091, "grad_norm": 5.59375, "grad_norm_var": 0.2562459309895833, "learning_rate": 0.0001, "loss": 5.3155, "loss/crossentropy": 2.2773972153663635, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.15264017134904861, "step": 8516 }, { "epoch": 0.3871818181818182, "grad_norm": 5.0625, "grad_norm_var": 0.27883707682291664, "learning_rate": 0.0001, "loss": 5.2529, "loss/crossentropy": 2.1602098643779755, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.1571166068315506, "step": 8518 }, { "epoch": 0.38727272727272727, "grad_norm": 8.125, "grad_norm_var": 0.7352701822916666, "learning_rate": 0.0001, "loss": 5.5968, "loss/crossentropy": 2.3491055369377136, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17047670111060143, "step": 8520 }, { "epoch": 0.38736363636363635, "grad_norm": 5.1875, "grad_norm_var": 0.7369099934895833, "learning_rate": 0.0001, "loss": 5.9036, "loss/crossentropy": 2.5615010261535645, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.1808910258114338, "step": 8522 }, { "epoch": 0.38745454545454544, "grad_norm": 20.25, "grad_norm_var": 14.167708333333334, "learning_rate": 0.0001, "loss": 5.5638, "loss/crossentropy": 2.369775265455246, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.16295326873660088, "step": 8524 }, { "epoch": 0.3875454545454545, "grad_norm": 4.90625, "grad_norm_var": 14.163134765625, "learning_rate": 0.0001, "loss": 5.9278, "loss/crossentropy": 2.661176860332489, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1743139997124672, "step": 8526 }, { "epoch": 0.3876363636363636, "grad_norm": 5.59375, "grad_norm_var": 14.134700520833333, "learning_rate": 0.0001, "loss": 6.2265, "loss/crossentropy": 2.77306991815567, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.18968238309025764, "step": 8528 }, { "epoch": 0.38772727272727275, "grad_norm": 5.25, "grad_norm_var": 14.21470947265625, "learning_rate": 0.0001, "loss": 5.6005, "loss/crossentropy": 2.3534892201423645, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17079954594373703, "step": 8530 }, { "epoch": 0.38781818181818184, "grad_norm": 4.75, "grad_norm_var": 14.397119140625, "learning_rate": 0.0001, "loss": 5.6614, "loss/crossentropy": 2.396019160747528, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17145859822630882, "step": 8532 }, { "epoch": 0.3879090909090909, "grad_norm": 4.375, "grad_norm_var": 14.555061848958333, "learning_rate": 0.0001, "loss": 4.93, "loss/crossentropy": 2.028966635465622, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.14010196179151535, "step": 8534 }, { "epoch": 0.388, "grad_norm": 4.34375, "grad_norm_var": 14.555192057291666, "learning_rate": 0.0001, "loss": 5.3523, "loss/crossentropy": 2.3275738656520844, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1548168770968914, "step": 8536 }, { "epoch": 0.3880909090909091, "grad_norm": 4.96875, "grad_norm_var": 14.62232666015625, "learning_rate": 0.0001, "loss": 6.0702, "loss/crossentropy": 2.7322608828544617, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18262220919132233, "step": 8538 }, { "epoch": 0.3881818181818182, "grad_norm": 5.1875, "grad_norm_var": 0.19420572916666667, "learning_rate": 0.0001, "loss": 5.9277, "loss/crossentropy": 2.548810124397278, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.18105677142739296, "step": 8540 }, { "epoch": 0.38827272727272727, "grad_norm": 5.21875, "grad_norm_var": 0.18677978515625, "learning_rate": 0.0001, "loss": 5.7768, "loss/crossentropy": 2.576912581920624, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16842154413461685, "step": 8542 }, { "epoch": 0.38836363636363636, "grad_norm": 5.125, "grad_norm_var": 0.12691650390625, "learning_rate": 0.0001, "loss": 5.9097, "loss/crossentropy": 2.6609081029891968, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.16862806305289268, "step": 8544 }, { "epoch": 0.38845454545454544, "grad_norm": 5.21875, "grad_norm_var": 0.12229410807291667, "learning_rate": 0.0001, "loss": 5.6153, "loss/crossentropy": 2.4762638807296753, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1656644567847252, "step": 8546 }, { "epoch": 0.3885454545454545, "grad_norm": 5.125, "grad_norm_var": 0.131640625, "learning_rate": 0.0001, "loss": 5.7944, "loss/crossentropy": 2.5422770380973816, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17247328534722328, "step": 8548 }, { "epoch": 0.3886363636363636, "grad_norm": 5.09375, "grad_norm_var": 0.09827067057291666, "learning_rate": 0.0001, "loss": 5.5864, "loss/crossentropy": 2.364854723215103, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.1676587574183941, "step": 8550 }, { "epoch": 0.38872727272727275, "grad_norm": 4.9375, "grad_norm_var": 0.05423177083333333, "learning_rate": 0.0001, "loss": 5.8208, "loss/crossentropy": 2.5822423696517944, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1720951609313488, "step": 8552 }, { "epoch": 0.38881818181818184, "grad_norm": 5.25, "grad_norm_var": 0.05090738932291667, "learning_rate": 0.0001, "loss": 5.7533, "loss/crossentropy": 2.48781681060791, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.17596668004989624, "step": 8554 }, { "epoch": 0.3889090909090909, "grad_norm": 5.28125, "grad_norm_var": 0.037109375, "learning_rate": 0.0001, "loss": 5.5022, "loss/crossentropy": 2.305183321237564, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16618390381336212, "step": 8556 }, { "epoch": 0.389, "grad_norm": 4.96875, "grad_norm_var": 0.0427734375, "learning_rate": 0.0001, "loss": 5.6356, "loss/crossentropy": 2.4439812898635864, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16916486993432045, "step": 8558 }, { "epoch": 0.3890909090909091, "grad_norm": 5.0625, "grad_norm_var": 0.042041015625, "learning_rate": 0.0001, "loss": 5.6949, "loss/crossentropy": 2.4499581456184387, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1701982133090496, "step": 8560 }, { "epoch": 0.3891818181818182, "grad_norm": 5.59375, "grad_norm_var": 0.06959228515625, "learning_rate": 0.0001, "loss": 5.7045, "loss/crossentropy": 2.5090646743774414, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1675867848098278, "step": 8562 }, { "epoch": 0.38927272727272727, "grad_norm": 4.90625, "grad_norm_var": 0.05181884765625, "learning_rate": 0.0001, "loss": 5.9473, "loss/crossentropy": 2.6363126635551453, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1760166548192501, "step": 8564 }, { "epoch": 0.38936363636363636, "grad_norm": 5.1875, "grad_norm_var": 0.047900390625, "learning_rate": 0.0001, "loss": 5.7734, "loss/crossentropy": 2.5533541440963745, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1669272631406784, "step": 8566 }, { "epoch": 0.38945454545454544, "grad_norm": 4.0625, "grad_norm_var": 0.11109619140625, "learning_rate": 0.0001, "loss": 5.2265, "loss/crossentropy": 2.2378977835178375, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.14925310015678406, "step": 8568 }, { "epoch": 0.38954545454545453, "grad_norm": 5.09375, "grad_norm_var": 0.117822265625, "learning_rate": 0.0001, "loss": 5.8195, "loss/crossentropy": 2.530672311782837, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1749768741428852, "step": 8570 }, { "epoch": 0.3896363636363636, "grad_norm": 5.09375, "grad_norm_var": 0.11676025390625, "learning_rate": 0.0001, "loss": 5.4945, "loss/crossentropy": 2.2978906631469727, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16712472587823868, "step": 8572 }, { "epoch": 0.3897272727272727, "grad_norm": 5.46875, "grad_norm_var": 0.187109375, "learning_rate": 0.0001, "loss": 5.9862, "loss/crossentropy": 2.5924649834632874, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18233873695135117, "step": 8574 }, { "epoch": 0.38981818181818184, "grad_norm": 5.8125, "grad_norm_var": 0.22252604166666667, "learning_rate": 0.0001, "loss": 5.7245, "loss/crossentropy": 2.5495764017105103, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16632025316357613, "step": 8576 }, { "epoch": 0.3899090909090909, "grad_norm": 5.0, "grad_norm_var": 0.240869140625, "learning_rate": 0.0001, "loss": 5.7139, "loss/crossentropy": 2.4467561542987823, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1735864281654358, "step": 8578 }, { "epoch": 0.39, "grad_norm": 5.375, "grad_norm_var": 0.26392822265625, "learning_rate": 0.0001, "loss": 5.6317, "loss/crossentropy": 2.4428809881210327, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.1675182469189167, "step": 8580 }, { "epoch": 0.3900909090909091, "grad_norm": 5.125, "grad_norm_var": 0.2650390625, "learning_rate": 0.0001, "loss": 5.9076, "loss/crossentropy": 2.6517862677574158, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1716712862253189, "step": 8582 }, { "epoch": 0.3901818181818182, "grad_norm": 5.1875, "grad_norm_var": 0.17421875, "learning_rate": 0.0001, "loss": 5.9277, "loss/crossentropy": 2.6809502840042114, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17350316047668457, "step": 8584 }, { "epoch": 0.39027272727272727, "grad_norm": 4.90625, "grad_norm_var": 0.18821207682291666, "learning_rate": 0.0001, "loss": 5.3515, "loss/crossentropy": 2.303350269794464, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.15364303812384605, "step": 8586 }, { "epoch": 0.39036363636363636, "grad_norm": 5.4375, "grad_norm_var": 0.19205322265625, "learning_rate": 0.0001, "loss": 5.1567, "loss/crossentropy": 2.033982992172241, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.1554395779967308, "step": 8588 }, { "epoch": 0.39045454545454544, "grad_norm": 5.09375, "grad_norm_var": 0.15110677083333332, "learning_rate": 0.0001, "loss": 6.1022, "loss/crossentropy": 2.80961149930954, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17887217923998833, "step": 8590 }, { "epoch": 0.39054545454545453, "grad_norm": 5.625, "grad_norm_var": 0.13469645182291667, "learning_rate": 0.0001, "loss": 5.9132, "loss/crossentropy": 2.597468912601471, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17922846972942352, "step": 8592 }, { "epoch": 0.3906363636363636, "grad_norm": 4.53125, "grad_norm_var": 0.12434488932291667, "learning_rate": 0.0001, "loss": 5.5634, "loss/crossentropy": 2.4377925395965576, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1635354794561863, "step": 8594 }, { "epoch": 0.3907272727272727, "grad_norm": 5.3125, "grad_norm_var": 0.1830078125, "learning_rate": 0.0001, "loss": 6.0146, "loss/crossentropy": 2.6199668645858765, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18555810675024986, "step": 8596 }, { "epoch": 0.39081818181818184, "grad_norm": 5.6875, "grad_norm_var": 0.21998291015625, "learning_rate": 0.0001, "loss": 5.5019, "loss/crossentropy": 2.375937342643738, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16240069456398487, "step": 8598 }, { "epoch": 0.39090909090909093, "grad_norm": 5.15625, "grad_norm_var": 0.21467692057291668, "learning_rate": 0.0001, "loss": 5.4732, "loss/crossentropy": 2.2657060027122498, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16723091900348663, "step": 8600 }, { "epoch": 0.391, "grad_norm": 5.0, "grad_norm_var": 0.20193684895833333, "learning_rate": 0.0001, "loss": 5.5567, "loss/crossentropy": 2.3332691192626953, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1676596775650978, "step": 8602 }, { "epoch": 0.3910909090909091, "grad_norm": 5.1875, "grad_norm_var": 0.18391927083333334, "learning_rate": 0.0001, "loss": 5.6863, "loss/crossentropy": 2.448861539363861, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16984162107110023, "step": 8604 }, { "epoch": 0.3911818181818182, "grad_norm": 5.375, "grad_norm_var": 0.18644205729166666, "learning_rate": 0.0001, "loss": 5.9712, "loss/crossentropy": 2.6001265048980713, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.18300579488277435, "step": 8606 }, { "epoch": 0.39127272727272727, "grad_norm": 5.0625, "grad_norm_var": 0.2527994791666667, "learning_rate": 0.0001, "loss": 5.7652, "loss/crossentropy": 2.5141494274139404, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17392858117818832, "step": 8608 }, { "epoch": 0.39136363636363636, "grad_norm": 4.84375, "grad_norm_var": 0.22823893229166667, "learning_rate": 0.0001, "loss": 5.3256, "loss/crossentropy": 2.2992203533649445, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.15283094719052315, "step": 8610 }, { "epoch": 0.39145454545454544, "grad_norm": 4.71875, "grad_norm_var": 0.18098958333333334, "learning_rate": 0.0001, "loss": 5.2821, "loss/crossentropy": 2.2583947479724884, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.15022173151373863, "step": 8612 }, { "epoch": 0.39154545454545453, "grad_norm": 5.875, "grad_norm_var": 0.16653238932291667, "learning_rate": 0.0001, "loss": 5.6421, "loss/crossentropy": 2.379972606897354, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1703493483364582, "step": 8614 }, { "epoch": 0.3916363636363636, "grad_norm": 5.3125, "grad_norm_var": 0.358056640625, "learning_rate": 0.0001, "loss": 5.7007, "loss/crossentropy": 2.417026996612549, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1760275587439537, "step": 8616 }, { "epoch": 0.3917272727272727, "grad_norm": 4.6875, "grad_norm_var": 0.3923136393229167, "learning_rate": 0.0001, "loss": 5.7231, "loss/crossentropy": 2.4402520656585693, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17554941773414612, "step": 8618 }, { "epoch": 0.39181818181818184, "grad_norm": 5.0, "grad_norm_var": 0.4010416666666667, "learning_rate": 0.0001, "loss": 5.4079, "loss/crossentropy": 2.330580711364746, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15694952383637428, "step": 8620 }, { "epoch": 0.39190909090909093, "grad_norm": 5.0625, "grad_norm_var": 0.4172159830729167, "learning_rate": 0.0001, "loss": 5.5175, "loss/crossentropy": 2.449957311153412, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1573357954621315, "step": 8622 }, { "epoch": 0.392, "grad_norm": 5.84375, "grad_norm_var": 0.39781494140625, "learning_rate": 0.0001, "loss": 5.4864, "loss/crossentropy": 2.302748382091522, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.16426056995987892, "step": 8624 }, { "epoch": 0.3920909090909091, "grad_norm": 5.09375, "grad_norm_var": 0.3743489583333333, "learning_rate": 0.0001, "loss": 5.7808, "loss/crossentropy": 2.5469435155391693, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16947469115257263, "step": 8626 }, { "epoch": 0.3921818181818182, "grad_norm": 6.625, "grad_norm_var": 0.43209635416666664, "learning_rate": 0.0001, "loss": 6.0555, "loss/crossentropy": 2.6596115231513977, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.18666056171059608, "step": 8628 }, { "epoch": 0.3922727272727273, "grad_norm": 5.5, "grad_norm_var": 0.41653645833333336, "learning_rate": 0.0001, "loss": 5.4767, "loss/crossentropy": 2.286514788866043, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.16491955891251564, "step": 8630 }, { "epoch": 0.39236363636363636, "grad_norm": 5.625, "grad_norm_var": 0.26024983723958334, "learning_rate": 0.0001, "loss": 5.8441, "loss/crossentropy": 2.505117565393448, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.17784709110856056, "step": 8632 }, { "epoch": 0.39245454545454544, "grad_norm": 5.21875, "grad_norm_var": 0.21730143229166668, "learning_rate": 0.0001, "loss": 5.5752, "loss/crossentropy": 2.368400812149048, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.16540661826729774, "step": 8634 }, { "epoch": 0.39254545454545453, "grad_norm": 5.375, "grad_norm_var": 0.21170247395833333, "learning_rate": 0.0001, "loss": 6.081, "loss/crossentropy": 2.6851744055747986, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18372588604688644, "step": 8636 }, { "epoch": 0.3926363636363636, "grad_norm": 5.03125, "grad_norm_var": 0.19147135416666666, "learning_rate": 0.0001, "loss": 5.7603, "loss/crossentropy": 2.5581244826316833, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1690489761531353, "step": 8638 }, { "epoch": 0.3927272727272727, "grad_norm": 6.21875, "grad_norm_var": 0.2091796875, "learning_rate": 0.0001, "loss": 5.9368, "loss/crossentropy": 2.5585341453552246, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.18098896369338036, "step": 8640 }, { "epoch": 0.39281818181818184, "grad_norm": 4.65625, "grad_norm_var": 0.24615885416666666, "learning_rate": 0.0001, "loss": 5.5758, "loss/crossentropy": 2.4537095427513123, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.16474735736846924, "step": 8642 }, { "epoch": 0.39290909090909093, "grad_norm": 5.0625, "grad_norm_var": 0.16769205729166667, "learning_rate": 0.0001, "loss": 5.746, "loss/crossentropy": 2.5158822536468506, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.16969172656536102, "step": 8644 }, { "epoch": 0.393, "grad_norm": 5.03125, "grad_norm_var": 0.17392171223958333, "learning_rate": 0.0001, "loss": 5.6646, "loss/crossentropy": 2.42098867893219, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17201566323637962, "step": 8646 }, { "epoch": 0.3930909090909091, "grad_norm": 5.1875, "grad_norm_var": 0.16901041666666666, "learning_rate": 0.0001, "loss": 5.5082, "loss/crossentropy": 2.39504936337471, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.162489403039217, "step": 8648 }, { "epoch": 0.3931818181818182, "grad_norm": 5.0, "grad_norm_var": 0.192431640625, "learning_rate": 0.0001, "loss": 5.5822, "loss/crossentropy": 2.4822739362716675, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16018864139914513, "step": 8650 }, { "epoch": 0.3932727272727273, "grad_norm": 4.96875, "grad_norm_var": 0.16106770833333334, "learning_rate": 0.0001, "loss": 5.6094, "loss/crossentropy": 2.4351716339588165, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1658569909632206, "step": 8652 }, { "epoch": 0.39336363636363636, "grad_norm": 5.15625, "grad_norm_var": 0.19537760416666666, "learning_rate": 0.0001, "loss": 5.7242, "loss/crossentropy": 2.5543596148490906, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16522467881441116, "step": 8654 }, { "epoch": 0.39345454545454545, "grad_norm": 5.21875, "grad_norm_var": 0.06744384765625, "learning_rate": 0.0001, "loss": 5.6878, "loss/crossentropy": 2.4547367095947266, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17252590879797935, "step": 8656 }, { "epoch": 0.39354545454545453, "grad_norm": 5.34375, "grad_norm_var": 0.055192057291666666, "learning_rate": 0.0001, "loss": 5.6082, "loss/crossentropy": 2.40598326921463, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.16729385778307915, "step": 8658 }, { "epoch": 0.3936363636363636, "grad_norm": 5.09375, "grad_norm_var": 0.06809488932291667, "learning_rate": 0.0001, "loss": 5.5547, "loss/crossentropy": 2.4233295917510986, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16254713386297226, "step": 8660 }, { "epoch": 0.3937272727272727, "grad_norm": 5.0625, "grad_norm_var": 0.279931640625, "learning_rate": 0.0001, "loss": 5.9889, "loss/crossentropy": 2.6221883296966553, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.18413108587265015, "step": 8662 }, { "epoch": 0.39381818181818184, "grad_norm": 5.4375, "grad_norm_var": 0.2865193684895833, "learning_rate": 0.0001, "loss": 5.9022, "loss/crossentropy": 2.6098684072494507, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17805872857570648, "step": 8664 }, { "epoch": 0.39390909090909093, "grad_norm": 5.9375, "grad_norm_var": 0.3355305989583333, "learning_rate": 0.0001, "loss": 5.8158, "loss/crossentropy": 2.5440520644187927, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1728760451078415, "step": 8666 }, { "epoch": 0.394, "grad_norm": 4.65625, "grad_norm_var": 0.3502604166666667, "learning_rate": 0.0001, "loss": 5.5235, "loss/crossentropy": 2.383163422346115, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1634455770254135, "step": 8668 }, { "epoch": 0.3940909090909091, "grad_norm": 4.90625, "grad_norm_var": 0.3263956705729167, "learning_rate": 0.0001, "loss": 5.7978, "loss/crossentropy": 2.560909330844879, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.1723252572119236, "step": 8670 }, { "epoch": 0.3941818181818182, "grad_norm": 4.875, "grad_norm_var": 0.321875, "learning_rate": 0.0001, "loss": 5.912, "loss/crossentropy": 2.5785775780677795, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.17924316227436066, "step": 8672 }, { "epoch": 0.3942727272727273, "grad_norm": 5.28125, "grad_norm_var": 0.32333577473958336, "learning_rate": 0.0001, "loss": 5.9812, "loss/crossentropy": 2.6334659457206726, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17969239503145218, "step": 8674 }, { "epoch": 0.39436363636363636, "grad_norm": 4.875, "grad_norm_var": 0.303125, "learning_rate": 0.0001, "loss": 5.598, "loss/crossentropy": 2.454313278198242, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.15987465158104897, "step": 8676 }, { "epoch": 0.39445454545454545, "grad_norm": 4.75, "grad_norm_var": 0.13331705729166668, "learning_rate": 0.0001, "loss": 5.4537, "loss/crossentropy": 2.3714802265167236, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.15529310889542103, "step": 8678 }, { "epoch": 0.39454545454545453, "grad_norm": 5.34375, "grad_norm_var": 0.13450113932291666, "learning_rate": 0.0001, "loss": 5.3443, "loss/crossentropy": 2.2196803092956543, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.15992069616913795, "step": 8680 }, { "epoch": 0.3946363636363636, "grad_norm": 5.4375, "grad_norm_var": 0.08098551432291666, "learning_rate": 0.0001, "loss": 5.5597, "loss/crossentropy": 2.401606321334839, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1658048965036869, "step": 8682 }, { "epoch": 0.3947272727272727, "grad_norm": 4.59375, "grad_norm_var": 0.08752848307291666, "learning_rate": 0.0001, "loss": 5.7653, "loss/crossentropy": 2.585154891014099, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17035535350441933, "step": 8684 }, { "epoch": 0.39481818181818185, "grad_norm": 5.125, "grad_norm_var": 0.10467122395833334, "learning_rate": 0.0001, "loss": 6.1428, "loss/crossentropy": 2.79320365190506, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18300728127360344, "step": 8686 }, { "epoch": 0.39490909090909093, "grad_norm": 4.625, "grad_norm_var": 0.11790364583333333, "learning_rate": 0.0001, "loss": 5.497, "loss/crossentropy": 2.3402962386608124, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16313530877232552, "step": 8688 }, { "epoch": 0.395, "grad_norm": 4.75, "grad_norm_var": 0.14778238932291668, "learning_rate": 0.0001, "loss": 4.9498, "loss/crossentropy": 2.048710972070694, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.14128276892006397, "step": 8690 }, { "epoch": 0.3950909090909091, "grad_norm": 4.625, "grad_norm_var": 0.15076497395833333, "learning_rate": 0.0001, "loss": 5.6777, "loss/crossentropy": 2.4741881489753723, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16917477548122406, "step": 8692 }, { "epoch": 0.3951818181818182, "grad_norm": 5.3125, "grad_norm_var": 0.16542561848958334, "learning_rate": 0.0001, "loss": 5.9191, "loss/crossentropy": 2.6837294697761536, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17158759385347366, "step": 8694 }, { "epoch": 0.3952727272727273, "grad_norm": 5.8125, "grad_norm_var": 0.20013020833333334, "learning_rate": 0.0001, "loss": 5.434, "loss/crossentropy": 2.294377624988556, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16337984055280685, "step": 8696 }, { "epoch": 0.39536363636363636, "grad_norm": 5.25, "grad_norm_var": 0.18941650390625, "learning_rate": 0.0001, "loss": 5.859, "loss/crossentropy": 2.5258371829986572, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.1784369796514511, "step": 8698 }, { "epoch": 0.39545454545454545, "grad_norm": 4.78125, "grad_norm_var": 0.1865234375, "learning_rate": 0.0001, "loss": 5.2468, "loss/crossentropy": 2.2490848898887634, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15172713994979858, "step": 8700 }, { "epoch": 0.39554545454545453, "grad_norm": 5.71875, "grad_norm_var": 0.18019205729166668, "learning_rate": 0.0001, "loss": 5.96, "loss/crossentropy": 2.666135013103485, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17742958292365074, "step": 8702 }, { "epoch": 0.3956363636363636, "grad_norm": 5.96875, "grad_norm_var": 0.22317301432291667, "learning_rate": 0.0001, "loss": 5.6426, "loss/crossentropy": 2.454384446144104, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16706020385026932, "step": 8704 }, { "epoch": 0.3957272727272727, "grad_norm": 5.125, "grad_norm_var": 0.17746988932291666, "learning_rate": 0.0001, "loss": 5.4607, "loss/crossentropy": 2.2578559517860413, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1667681373655796, "step": 8706 }, { "epoch": 0.3958181818181818, "grad_norm": 5.8125, "grad_norm_var": 0.24244384765625, "learning_rate": 0.0001, "loss": 5.8354, "loss/crossentropy": 2.487589567899704, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.17833438515663147, "step": 8708 }, { "epoch": 0.39590909090909093, "grad_norm": 4.90625, "grad_norm_var": 0.21458333333333332, "learning_rate": 0.0001, "loss": 5.5627, "loss/crossentropy": 2.402708649635315, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16443490236997604, "step": 8710 }, { "epoch": 0.396, "grad_norm": 5.40625, "grad_norm_var": 0.19423421223958334, "learning_rate": 0.0001, "loss": 5.8656, "loss/crossentropy": 2.5184354186058044, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1819862611591816, "step": 8712 }, { "epoch": 0.3960909090909091, "grad_norm": 5.21875, "grad_norm_var": 0.1919921875, "learning_rate": 0.0001, "loss": 5.9784, "loss/crossentropy": 2.6582050919532776, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.17830432951450348, "step": 8714 }, { "epoch": 0.3961818181818182, "grad_norm": 5.46875, "grad_norm_var": 0.16132405598958333, "learning_rate": 0.0001, "loss": 5.4809, "loss/crossentropy": 2.281703442335129, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.1685475967824459, "step": 8716 }, { "epoch": 0.3962727272727273, "grad_norm": 5.09375, "grad_norm_var": 0.15562744140625, "learning_rate": 0.0001, "loss": 6.0586, "loss/crossentropy": 2.7429264783859253, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.18019796162843704, "step": 8718 }, { "epoch": 0.39636363636363636, "grad_norm": 5.25, "grad_norm_var": 0.12232666015625, "learning_rate": 0.0001, "loss": 5.525, "loss/crossentropy": 2.4053398966789246, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16098502278327942, "step": 8720 }, { "epoch": 0.39645454545454545, "grad_norm": 5.125, "grad_norm_var": 0.14724934895833333, "learning_rate": 0.0001, "loss": 5.7113, "loss/crossentropy": 2.5208852291107178, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16904151067137718, "step": 8722 }, { "epoch": 0.39654545454545453, "grad_norm": 5.0625, "grad_norm_var": 0.05429280598958333, "learning_rate": 0.0001, "loss": 5.9485, "loss/crossentropy": 2.612912893295288, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18316591531038284, "step": 8724 }, { "epoch": 0.3966363636363636, "grad_norm": 7.28125, "grad_norm_var": 0.34308268229166666, "learning_rate": 0.0001, "loss": 5.2118, "loss/crossentropy": 2.1567561626434326, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.15003694966435432, "step": 8726 }, { "epoch": 0.3967272727272727, "grad_norm": 4.84375, "grad_norm_var": 0.35545247395833335, "learning_rate": 0.0001, "loss": 5.3061, "loss/crossentropy": 2.2747166752815247, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.15333054959774017, "step": 8728 }, { "epoch": 0.3968181818181818, "grad_norm": 4.78125, "grad_norm_var": 0.3678995768229167, "learning_rate": 0.0001, "loss": 5.2892, "loss/crossentropy": 2.162221312522888, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.15898411348462105, "step": 8730 }, { "epoch": 0.39690909090909093, "grad_norm": 5.0625, "grad_norm_var": 0.3831868489583333, "learning_rate": 0.0001, "loss": 5.719, "loss/crossentropy": 2.4392701983451843, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.17231208086013794, "step": 8732 }, { "epoch": 0.397, "grad_norm": 4.9375, "grad_norm_var": 0.3878255208333333, "learning_rate": 0.0001, "loss": 5.7934, "loss/crossentropy": 2.5967493653297424, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16848834604024887, "step": 8734 }, { "epoch": 0.3970909090909091, "grad_norm": 5.84375, "grad_norm_var": 0.40933837890625, "learning_rate": 0.0001, "loss": 5.7283, "loss/crossentropy": 2.504656732082367, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16885270923376083, "step": 8736 }, { "epoch": 0.3971818181818182, "grad_norm": 5.15625, "grad_norm_var": 0.5430826822916667, "learning_rate": 0.0001, "loss": 5.6102, "loss/crossentropy": 2.4009179770946503, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16701855137944221, "step": 8738 }, { "epoch": 0.3972727272727273, "grad_norm": 5.15625, "grad_norm_var": 0.54921875, "learning_rate": 0.0001, "loss": 5.5081, "loss/crossentropy": 2.3340596556663513, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.16290823742747307, "step": 8740 }, { "epoch": 0.39736363636363636, "grad_norm": 5.03125, "grad_norm_var": 0.3056640625, "learning_rate": 0.0001, "loss": 5.6964, "loss/crossentropy": 2.53637832403183, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1624894216656685, "step": 8742 }, { "epoch": 0.39745454545454545, "grad_norm": 5.125, "grad_norm_var": 0.29566650390625, "learning_rate": 0.0001, "loss": 5.5604, "loss/crossentropy": 2.3618770241737366, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16711723804473877, "step": 8744 }, { "epoch": 0.39754545454545454, "grad_norm": 4.875, "grad_norm_var": 0.30513916015625, "learning_rate": 0.0001, "loss": 5.4328, "loss/crossentropy": 2.328159213066101, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.15948671847581863, "step": 8746 }, { "epoch": 0.3976363636363636, "grad_norm": 5.3125, "grad_norm_var": 0.29055582682291664, "learning_rate": 0.0001, "loss": 5.9359, "loss/crossentropy": 2.633341133594513, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.1777195818722248, "step": 8748 }, { "epoch": 0.3977272727272727, "grad_norm": 5.34375, "grad_norm_var": 0.289306640625, "learning_rate": 0.0001, "loss": 5.5022, "loss/crossentropy": 2.3679912090301514, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16088345646858215, "step": 8750 }, { "epoch": 0.3978181818181818, "grad_norm": 5.40625, "grad_norm_var": 0.26708577473958334, "learning_rate": 0.0001, "loss": 6.0487, "loss/crossentropy": 2.6975097060203552, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1815999485552311, "step": 8752 }, { "epoch": 0.39790909090909093, "grad_norm": 5.5, "grad_norm_var": 0.06926676432291666, "learning_rate": 0.0001, "loss": 5.8837, "loss/crossentropy": 2.623634397983551, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1732749491930008, "step": 8754 }, { "epoch": 0.398, "grad_norm": 4.65625, "grad_norm_var": 0.12185872395833333, "learning_rate": 0.0001, "loss": 5.858, "loss/crossentropy": 2.6089565753936768, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.1708020195364952, "step": 8756 }, { "epoch": 0.3980909090909091, "grad_norm": 5.71875, "grad_norm_var": 0.12545166015625, "learning_rate": 0.0001, "loss": 6.0313, "loss/crossentropy": 2.668986976146698, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18506158143281937, "step": 8758 }, { "epoch": 0.3981818181818182, "grad_norm": 5.28125, "grad_norm_var": 0.14371337890625, "learning_rate": 0.0001, "loss": 5.61, "loss/crossentropy": 2.443591892719269, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1660530734807253, "step": 8760 }, { "epoch": 0.3982727272727273, "grad_norm": 5.09375, "grad_norm_var": 0.12343343098958333, "learning_rate": 0.0001, "loss": 5.8237, "loss/crossentropy": 2.5916784405708313, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17125118896365166, "step": 8762 }, { "epoch": 0.39836363636363636, "grad_norm": 5.0625, "grad_norm_var": 0.132275390625, "learning_rate": 0.0001, "loss": 5.5778, "loss/crossentropy": 2.4659520387649536, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16138077154755592, "step": 8764 }, { "epoch": 0.39845454545454545, "grad_norm": 4.8125, "grad_norm_var": 0.149462890625, "learning_rate": 0.0001, "loss": 5.6225, "loss/crossentropy": 2.417519837617874, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16913550347089767, "step": 8766 }, { "epoch": 0.39854545454545454, "grad_norm": 4.75, "grad_norm_var": 0.18284098307291666, "learning_rate": 0.0001, "loss": 5.0729, "loss/crossentropy": 2.135969638824463, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1450572945177555, "step": 8768 }, { "epoch": 0.3986363636363636, "grad_norm": 4.84375, "grad_norm_var": 0.17498372395833334, "learning_rate": 0.0001, "loss": 5.7948, "loss/crossentropy": 2.50168639421463, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17618310451507568, "step": 8770 }, { "epoch": 0.3987272727272727, "grad_norm": 5.34375, "grad_norm_var": 0.122509765625, "learning_rate": 0.0001, "loss": 5.8317, "loss/crossentropy": 2.65065735578537, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16712292283773422, "step": 8772 }, { "epoch": 0.3988181818181818, "grad_norm": 5.3125, "grad_norm_var": 0.098681640625, "learning_rate": 0.0001, "loss": 5.7109, "loss/crossentropy": 2.574806809425354, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1620486043393612, "step": 8774 }, { "epoch": 0.39890909090909094, "grad_norm": 6.4375, "grad_norm_var": 0.2341796875, "learning_rate": 0.0001, "loss": 5.5888, "loss/crossentropy": 2.464730203151703, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16221624612808228, "step": 8776 }, { "epoch": 0.399, "grad_norm": 4.6875, "grad_norm_var": 0.25549723307291666, "learning_rate": 0.0001, "loss": 5.5378, "loss/crossentropy": 2.379082590341568, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16489366441965103, "step": 8778 }, { "epoch": 0.3990909090909091, "grad_norm": 5.625, "grad_norm_var": 0.244775390625, "learning_rate": 0.0001, "loss": 5.9936, "loss/crossentropy": 2.7001824378967285, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.17875192314386368, "step": 8780 }, { "epoch": 0.3991818181818182, "grad_norm": 5.53125, "grad_norm_var": 0.22730712890625, "learning_rate": 0.0001, "loss": 5.9914, "loss/crossentropy": 2.6456461548805237, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.18086020275950432, "step": 8782 }, { "epoch": 0.3992727272727273, "grad_norm": 5.1875, "grad_norm_var": 0.16373291015625, "learning_rate": 0.0001, "loss": 5.9112, "loss/crossentropy": 2.5492976903915405, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18150485679507256, "step": 8784 }, { "epoch": 0.39936363636363637, "grad_norm": 4.5, "grad_norm_var": 0.19426676432291667, "learning_rate": 0.0001, "loss": 5.7185, "loss/crossentropy": 2.5152339935302734, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16876790672540665, "step": 8786 }, { "epoch": 0.39945454545454545, "grad_norm": 5.125, "grad_norm_var": 0.231494140625, "learning_rate": 0.0001, "loss": 5.4181, "loss/crossentropy": 2.344560205936432, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.1548115760087967, "step": 8788 }, { "epoch": 0.39954545454545454, "grad_norm": 4.78125, "grad_norm_var": 0.248828125, "learning_rate": 0.0001, "loss": 5.7328, "loss/crossentropy": 2.5573912858963013, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16831912100315094, "step": 8790 }, { "epoch": 0.3996363636363636, "grad_norm": 5.03125, "grad_norm_var": 0.12844645182291667, "learning_rate": 0.0001, "loss": 5.8807, "loss/crossentropy": 2.648989498615265, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17317141592502594, "step": 8792 }, { "epoch": 0.3997272727272727, "grad_norm": 5.71875, "grad_norm_var": 0.34010416666666665, "learning_rate": 0.0001, "loss": 5.8857, "loss/crossentropy": 2.5946831107139587, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1763676553964615, "step": 8794 }, { "epoch": 0.3998181818181818, "grad_norm": 5.1875, "grad_norm_var": 0.3739420572916667, "learning_rate": 0.0001, "loss": 5.944, "loss/crossentropy": 2.585190773010254, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1815820075571537, "step": 8796 }, { "epoch": 0.39990909090909094, "grad_norm": 4.59375, "grad_norm_var": 0.41769205729166664, "learning_rate": 0.0001, "loss": 5.4997, "loss/crossentropy": 2.401069760322571, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1625974401831627, "step": 8798 }, { "epoch": 0.4, "grad_norm": 4.90625, "grad_norm_var": 0.42359619140625, "learning_rate": 0.0001, "loss": 5.5392, "loss/crossentropy": 2.3808037638664246, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16427456215023994, "step": 8800 }, { "epoch": 0.4000909090909091, "grad_norm": 5.3125, "grad_norm_var": 0.387109375, "learning_rate": 0.0001, "loss": 5.7364, "loss/crossentropy": 2.5231423377990723, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1689816638827324, "step": 8802 }, { "epoch": 0.4001818181818182, "grad_norm": 5.78125, "grad_norm_var": 0.360791015625, "learning_rate": 0.0001, "loss": 5.7678, "loss/crossentropy": 2.4799236059188843, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.1746816597878933, "step": 8804 }, { "epoch": 0.4002727272727273, "grad_norm": 5.1875, "grad_norm_var": 0.34894205729166666, "learning_rate": 0.0001, "loss": 5.7253, "loss/crossentropy": 2.4665740728378296, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17040317878127098, "step": 8806 }, { "epoch": 0.40036363636363637, "grad_norm": 5.25, "grad_norm_var": 0.34674072265625, "learning_rate": 0.0001, "loss": 5.3585, "loss/crossentropy": 2.20720973610878, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16259033232927322, "step": 8808 }, { "epoch": 0.40045454545454545, "grad_norm": 5.03125, "grad_norm_var": 0.14329020182291666, "learning_rate": 0.0001, "loss": 5.4883, "loss/crossentropy": 2.4065417647361755, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15993230789899826, "step": 8810 }, { "epoch": 0.40054545454545454, "grad_norm": 4.96875, "grad_norm_var": 0.17159830729166667, "learning_rate": 0.0001, "loss": 5.6698, "loss/crossentropy": 2.36074161529541, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17817608639597893, "step": 8812 }, { "epoch": 0.4006363636363636, "grad_norm": 4.65625, "grad_norm_var": 0.15637613932291666, "learning_rate": 0.0001, "loss": 5.6768, "loss/crossentropy": 2.481663763523102, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1685386821627617, "step": 8814 }, { "epoch": 0.4007272727272727, "grad_norm": 5.28125, "grad_norm_var": 0.15325113932291667, "learning_rate": 0.0001, "loss": 5.5517, "loss/crossentropy": 2.4188016951084137, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.1588018573820591, "step": 8816 }, { "epoch": 0.4008181818181818, "grad_norm": 5.3125, "grad_norm_var": 0.183056640625, "learning_rate": 0.0001, "loss": 5.773, "loss/crossentropy": 2.3927231431007385, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1821655109524727, "step": 8818 }, { "epoch": 0.4009090909090909, "grad_norm": 5.21875, "grad_norm_var": 0.16534830729166666, "learning_rate": 0.0001, "loss": 5.9425, "loss/crossentropy": 2.630377948284149, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1776922307908535, "step": 8820 }, { "epoch": 0.401, "grad_norm": 4.875, "grad_norm_var": 0.18062744140625, "learning_rate": 0.0001, "loss": 5.7677, "loss/crossentropy": 2.565136730670929, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16850248724222183, "step": 8822 }, { "epoch": 0.4010909090909091, "grad_norm": 5.1875, "grad_norm_var": 0.16965738932291666, "learning_rate": 0.0001, "loss": 5.6138, "loss/crossentropy": 2.430623173713684, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1653866432607174, "step": 8824 }, { "epoch": 0.4011818181818182, "grad_norm": 5.3125, "grad_norm_var": 0.16691080729166666, "learning_rate": 0.0001, "loss": 5.69, "loss/crossentropy": 2.5011755228042603, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16712353006005287, "step": 8826 }, { "epoch": 0.4012727272727273, "grad_norm": 4.65625, "grad_norm_var": 0.11770426432291667, "learning_rate": 0.0001, "loss": 5.0758, "loss/crossentropy": 2.0592753291130066, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.14774437993764877, "step": 8828 }, { "epoch": 0.40136363636363637, "grad_norm": 4.875, "grad_norm_var": 0.12011311848958334, "learning_rate": 0.0001, "loss": 5.4529, "loss/crossentropy": 2.3727807700634003, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.15566834807395935, "step": 8830 }, { "epoch": 0.40145454545454545, "grad_norm": 4.8125, "grad_norm_var": 0.12431233723958333, "learning_rate": 0.0001, "loss": 5.3243, "loss/crossentropy": 2.2841823399066925, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15518254414200783, "step": 8832 }, { "epoch": 0.40154545454545454, "grad_norm": 4.9375, "grad_norm_var": 0.21222330729166666, "learning_rate": 0.0001, "loss": 5.6343, "loss/crossentropy": 2.445761203765869, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16787956655025482, "step": 8834 }, { "epoch": 0.4016363636363636, "grad_norm": 4.8125, "grad_norm_var": 0.21669514973958334, "learning_rate": 0.0001, "loss": 5.7765, "loss/crossentropy": 2.6401950120925903, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.1638273224234581, "step": 8836 }, { "epoch": 0.4017272727272727, "grad_norm": 5.0625, "grad_norm_var": 0.19568684895833333, "learning_rate": 0.0001, "loss": 5.7414, "loss/crossentropy": 2.520653188228607, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1691407896578312, "step": 8838 }, { "epoch": 0.4018181818181818, "grad_norm": 4.59375, "grad_norm_var": 0.20963541666666666, "learning_rate": 0.0001, "loss": 5.4942, "loss/crossentropy": 2.383426547050476, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16166220977902412, "step": 8840 }, { "epoch": 0.4019090909090909, "grad_norm": 5.375, "grad_norm_var": 0.21222330729166666, "learning_rate": 0.0001, "loss": 5.9918, "loss/crossentropy": 2.6649479269981384, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.18053880333900452, "step": 8842 }, { "epoch": 0.402, "grad_norm": 4.875, "grad_norm_var": 0.22174072265625, "learning_rate": 0.0001, "loss": 5.4828, "loss/crossentropy": 2.33236226439476, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.15918513387441635, "step": 8844 }, { "epoch": 0.4020909090909091, "grad_norm": 4.46875, "grad_norm_var": 0.23342692057291667, "learning_rate": 0.0001, "loss": 5.3354, "loss/crossentropy": 2.26948082447052, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1575694978237152, "step": 8846 }, { "epoch": 0.4021818181818182, "grad_norm": 5.34375, "grad_norm_var": 0.23834635416666666, "learning_rate": 0.0001, "loss": 5.0656, "loss/crossentropy": 2.078822046518326, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.14750730991363525, "step": 8848 }, { "epoch": 0.4022727272727273, "grad_norm": 5.1875, "grad_norm_var": 0.13943684895833333, "learning_rate": 0.0001, "loss": 5.3165, "loss/crossentropy": 2.2141923904418945, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.15710552781820297, "step": 8850 }, { "epoch": 0.40236363636363637, "grad_norm": 4.875, "grad_norm_var": 0.137109375, "learning_rate": 0.0001, "loss": 5.7614, "loss/crossentropy": 2.571683347225189, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.16604218631982803, "step": 8852 }, { "epoch": 0.40245454545454545, "grad_norm": 5.03125, "grad_norm_var": 0.13489176432291666, "learning_rate": 0.0001, "loss": 5.929, "loss/crossentropy": 2.653332769870758, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17444322258234024, "step": 8854 }, { "epoch": 0.40254545454545454, "grad_norm": 5.125, "grad_norm_var": 0.11656494140625, "learning_rate": 0.0001, "loss": 5.972, "loss/crossentropy": 2.6538676023483276, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.18239905685186386, "step": 8856 }, { "epoch": 0.4026363636363636, "grad_norm": 5.0, "grad_norm_var": 0.11497395833333333, "learning_rate": 0.0001, "loss": 5.8787, "loss/crossentropy": 2.5718274116516113, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17873810231685638, "step": 8858 }, { "epoch": 0.4027272727272727, "grad_norm": 5.3125, "grad_norm_var": 0.08739827473958334, "learning_rate": 0.0001, "loss": 6.1488, "loss/crossentropy": 2.7645881175994873, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18490706756711006, "step": 8860 }, { "epoch": 0.4028181818181818, "grad_norm": 4.59375, "grad_norm_var": 0.07849934895833334, "learning_rate": 0.0001, "loss": 5.1966, "loss/crossentropy": 2.1014079451560974, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.15444349497556686, "step": 8862 }, { "epoch": 0.4029090909090909, "grad_norm": 6.71875, "grad_norm_var": 0.22928059895833333, "learning_rate": 0.0001, "loss": 6.2254, "loss/crossentropy": 2.7925270199775696, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18977273628115654, "step": 8864 }, { "epoch": 0.403, "grad_norm": 5.34375, "grad_norm_var": 0.20349934895833333, "learning_rate": 0.0001, "loss": 5.7127, "loss/crossentropy": 2.509160280227661, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16762417927384377, "step": 8866 }, { "epoch": 0.4030909090909091, "grad_norm": 5.21875, "grad_norm_var": 0.19661458333333334, "learning_rate": 0.0001, "loss": 5.8139, "loss/crossentropy": 2.5456225872039795, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17467620596289635, "step": 8868 }, { "epoch": 0.4031818181818182, "grad_norm": 8.875, "grad_norm_var": 1.0585286458333334, "learning_rate": 0.0001, "loss": 5.5345, "loss/crossentropy": 2.258188545703888, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17372506856918335, "step": 8870 }, { "epoch": 0.4032727272727273, "grad_norm": 5.6875, "grad_norm_var": 1.0548787434895834, "learning_rate": 0.0001, "loss": 6.0996, "loss/crossentropy": 2.686727821826935, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1877710558474064, "step": 8872 }, { "epoch": 0.40336363636363637, "grad_norm": 4.78125, "grad_norm_var": 1.067578125, "learning_rate": 0.0001, "loss": 5.4877, "loss/crossentropy": 2.3328015208244324, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.1602177731692791, "step": 8874 }, { "epoch": 0.40345454545454545, "grad_norm": 5.625, "grad_norm_var": 1.0635701497395833, "learning_rate": 0.0001, "loss": 5.3369, "loss/crossentropy": 2.172898769378662, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.16093587689101696, "step": 8876 }, { "epoch": 0.40354545454545454, "grad_norm": 5.28125, "grad_norm_var": 1.00260009765625, "learning_rate": 0.0001, "loss": 5.567, "loss/crossentropy": 2.3489055037498474, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16868778318166733, "step": 8878 }, { "epoch": 0.4036363636363636, "grad_norm": 5.09375, "grad_norm_var": 0.8997233072916667, "learning_rate": 0.0001, "loss": 5.5189, "loss/crossentropy": 2.3410913348197937, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16543545201420784, "step": 8880 }, { "epoch": 0.4037272727272727, "grad_norm": 4.96875, "grad_norm_var": 0.9141764322916667, "learning_rate": 0.0001, "loss": 5.7325, "loss/crossentropy": 2.540696382522583, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16878538206219673, "step": 8882 }, { "epoch": 0.4038181818181818, "grad_norm": 5.09375, "grad_norm_var": 0.9247233072916666, "learning_rate": 0.0001, "loss": 5.8296, "loss/crossentropy": 2.5500020384788513, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17639652267098427, "step": 8884 }, { "epoch": 0.4039090909090909, "grad_norm": 5.53125, "grad_norm_var": 0.08326416015625, "learning_rate": 0.0001, "loss": 5.6824, "loss/crossentropy": 2.3366130590438843, "loss/hidden": 1.595703125, "loss/jsd": 0.0, "loss/logits": 0.1750091314315796, "step": 8886 }, { "epoch": 0.404, "grad_norm": 6.0, "grad_norm_var": 0.11795247395833333, "learning_rate": 0.0001, "loss": 5.6104, "loss/crossentropy": 2.4165660738945007, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1664537452161312, "step": 8888 }, { "epoch": 0.4040909090909091, "grad_norm": 5.4375, "grad_norm_var": 0.10022379557291666, "learning_rate": 0.0001, "loss": 5.7468, "loss/crossentropy": 2.550030291080475, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16850100085139275, "step": 8890 }, { "epoch": 0.4041818181818182, "grad_norm": 5.21875, "grad_norm_var": 0.094140625, "learning_rate": 0.0001, "loss": 6.0676, "loss/crossentropy": 2.7361207604408264, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18079981580376625, "step": 8892 }, { "epoch": 0.4042727272727273, "grad_norm": 5.25, "grad_norm_var": 0.11155192057291667, "learning_rate": 0.0001, "loss": 5.9381, "loss/crossentropy": 2.683428108692169, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1725338138639927, "step": 8894 }, { "epoch": 0.40436363636363637, "grad_norm": 4.59375, "grad_norm_var": 0.14781494140625, "learning_rate": 0.0001, "loss": 5.4955, "loss/crossentropy": 2.4291954040527344, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15701619535684586, "step": 8896 }, { "epoch": 0.40445454545454546, "grad_norm": 5.125, "grad_norm_var": 0.14455973307291667, "learning_rate": 0.0001, "loss": 5.8017, "loss/crossentropy": 2.517822027206421, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17624251917004585, "step": 8898 }, { "epoch": 0.40454545454545454, "grad_norm": 4.84375, "grad_norm_var": 0.14573160807291666, "learning_rate": 0.0001, "loss": 5.6466, "loss/crossentropy": 2.4903072118759155, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16504108905792236, "step": 8900 }, { "epoch": 0.4046363636363636, "grad_norm": 5.3125, "grad_norm_var": 0.1197265625, "learning_rate": 0.0001, "loss": 6.0862, "loss/crossentropy": 2.737510025501251, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1833062469959259, "step": 8902 }, { "epoch": 0.4047272727272727, "grad_norm": 5.25, "grad_norm_var": 0.07823893229166666, "learning_rate": 0.0001, "loss": 5.9165, "loss/crossentropy": 2.627279818058014, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.1740439124405384, "step": 8904 }, { "epoch": 0.4048181818181818, "grad_norm": 5.03125, "grad_norm_var": 0.0759765625, "learning_rate": 0.0001, "loss": 5.8524, "loss/crossentropy": 2.5215047001838684, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1772315800189972, "step": 8906 }, { "epoch": 0.4049090909090909, "grad_norm": 5.15625, "grad_norm_var": 0.07421875, "learning_rate": 0.0001, "loss": 5.7105, "loss/crossentropy": 2.4936496019363403, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.169339120388031, "step": 8908 }, { "epoch": 0.405, "grad_norm": 5.375, "grad_norm_var": 0.06998697916666667, "learning_rate": 0.0001, "loss": 5.5296, "loss/crossentropy": 2.3382963240146637, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16835078038275242, "step": 8910 }, { "epoch": 0.4050909090909091, "grad_norm": 5.125, "grad_norm_var": 0.0451171875, "learning_rate": 0.0001, "loss": 5.5798, "loss/crossentropy": 2.3958439230918884, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.16292845830321312, "step": 8912 }, { "epoch": 0.4051818181818182, "grad_norm": 5.09375, "grad_norm_var": 0.04589436848958333, "learning_rate": 0.0001, "loss": 5.843, "loss/crossentropy": 2.5511268377304077, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17704112455248833, "step": 8914 }, { "epoch": 0.4052727272727273, "grad_norm": 5.5625, "grad_norm_var": 0.03271077473958333, "learning_rate": 0.0001, "loss": 5.7028, "loss/crossentropy": 2.4834336936473846, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17115415632724762, "step": 8916 }, { "epoch": 0.40536363636363637, "grad_norm": 5.09375, "grad_norm_var": 0.03772379557291667, "learning_rate": 0.0001, "loss": 5.455, "loss/crossentropy": 2.3504709601402283, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.15791605040431023, "step": 8918 }, { "epoch": 0.40545454545454546, "grad_norm": 5.46875, "grad_norm_var": 0.033524576822916666, "learning_rate": 0.0001, "loss": 5.6429, "loss/crossentropy": 2.437079131603241, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17038289457559586, "step": 8920 }, { "epoch": 0.40554545454545454, "grad_norm": 5.4375, "grad_norm_var": 0.04147135416666667, "learning_rate": 0.0001, "loss": 6.3865, "loss/crossentropy": 2.9025104641914368, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.19449738785624504, "step": 8922 }, { "epoch": 0.40563636363636363, "grad_norm": 4.96875, "grad_norm_var": 0.04537353515625, "learning_rate": 0.0001, "loss": 5.7114, "loss/crossentropy": 2.54541277885437, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16640324890613556, "step": 8924 }, { "epoch": 0.4057272727272727, "grad_norm": 5.28125, "grad_norm_var": 0.04185791015625, "learning_rate": 0.0001, "loss": 5.7965, "loss/crossentropy": 2.5697356462478638, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.17092004045844078, "step": 8926 }, { "epoch": 0.4058181818181818, "grad_norm": 5.1875, "grad_norm_var": 0.04361572265625, "learning_rate": 0.0001, "loss": 5.6852, "loss/crossentropy": 2.4800503253936768, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16836456954479218, "step": 8928 }, { "epoch": 0.4059090909090909, "grad_norm": 4.8125, "grad_norm_var": 0.05388997395833333, "learning_rate": 0.0001, "loss": 5.6947, "loss/crossentropy": 2.4835768938064575, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.17130989208817482, "step": 8930 }, { "epoch": 0.406, "grad_norm": 6.15625, "grad_norm_var": 0.10246988932291666, "learning_rate": 0.0001, "loss": 6.0194, "loss/crossentropy": 2.6376644372940063, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18309902772307396, "step": 8932 }, { "epoch": 0.4060909090909091, "grad_norm": 5.09375, "grad_norm_var": 0.09999593098958333, "learning_rate": 0.0001, "loss": 5.697, "loss/crossentropy": 2.486423999071121, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16988839581608772, "step": 8934 }, { "epoch": 0.4061818181818182, "grad_norm": 4.90625, "grad_norm_var": 0.106494140625, "learning_rate": 0.0001, "loss": 5.46, "loss/crossentropy": 2.392189860343933, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15756597742438316, "step": 8936 }, { "epoch": 0.4062727272727273, "grad_norm": 4.71875, "grad_norm_var": 0.11077067057291666, "learning_rate": 0.0001, "loss": 5.6326, "loss/crossentropy": 2.505305528640747, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1635102443397045, "step": 8938 }, { "epoch": 0.40636363636363637, "grad_norm": 5.9375, "grad_norm_var": 0.14563395182291666, "learning_rate": 0.0001, "loss": 5.5566, "loss/crossentropy": 2.3882293105125427, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1641060933470726, "step": 8940 }, { "epoch": 0.40645454545454546, "grad_norm": 4.9375, "grad_norm_var": 0.14947509765625, "learning_rate": 0.0001, "loss": 5.6355, "loss/crossentropy": 2.4054128527641296, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16910289600491524, "step": 8942 }, { "epoch": 0.40654545454545454, "grad_norm": 6.40625, "grad_norm_var": 0.250244140625, "learning_rate": 0.0001, "loss": 6.1413, "loss/crossentropy": 2.6664934754371643, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19279323518276215, "step": 8944 }, { "epoch": 0.40663636363636363, "grad_norm": 4.875, "grad_norm_var": 0.246728515625, "learning_rate": 0.0001, "loss": 5.5503, "loss/crossentropy": 2.4029968976974487, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16278139129281044, "step": 8946 }, { "epoch": 0.4067272727272727, "grad_norm": 6.84375, "grad_norm_var": 0.38943684895833336, "learning_rate": 0.0001, "loss": 5.9667, "loss/crossentropy": 2.6026939153671265, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.18074028193950653, "step": 8948 }, { "epoch": 0.4068181818181818, "grad_norm": 4.90625, "grad_norm_var": 0.38841145833333335, "learning_rate": 0.0001, "loss": 5.8109, "loss/crossentropy": 2.627146542072296, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16935429349541664, "step": 8950 }, { "epoch": 0.4069090909090909, "grad_norm": 5.15625, "grad_norm_var": 0.36516927083333334, "learning_rate": 0.0001, "loss": 5.8119, "loss/crossentropy": 2.571002185344696, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1717461347579956, "step": 8952 }, { "epoch": 0.407, "grad_norm": 4.90625, "grad_norm_var": 0.36067301432291665, "learning_rate": 0.0001, "loss": 5.5082, "loss/crossentropy": 2.4101059436798096, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.15883377566933632, "step": 8954 }, { "epoch": 0.4070909090909091, "grad_norm": 5.6875, "grad_norm_var": 0.33892822265625, "learning_rate": 0.0001, "loss": 5.8972, "loss/crossentropy": 2.6135876178741455, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17523830756545067, "step": 8956 }, { "epoch": 0.4071818181818182, "grad_norm": 4.96875, "grad_norm_var": 0.3495402018229167, "learning_rate": 0.0001, "loss": 5.7818, "loss/crossentropy": 2.608945608139038, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16787534579634666, "step": 8958 }, { "epoch": 0.4072727272727273, "grad_norm": 5.09375, "grad_norm_var": 0.287890625, "learning_rate": 0.0001, "loss": 5.8516, "loss/crossentropy": 2.605623424053192, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17264677211642265, "step": 8960 }, { "epoch": 0.40736363636363637, "grad_norm": 5.15625, "grad_norm_var": 0.2771484375, "learning_rate": 0.0001, "loss": 5.7177, "loss/crossentropy": 2.567055583000183, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16505950316786766, "step": 8962 }, { "epoch": 0.40745454545454546, "grad_norm": 5.125, "grad_norm_var": 0.06100260416666667, "learning_rate": 0.0001, "loss": 5.843, "loss/crossentropy": 2.6129695177078247, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16909530758857727, "step": 8964 }, { "epoch": 0.40754545454545454, "grad_norm": 5.03125, "grad_norm_var": 0.05758056640625, "learning_rate": 0.0001, "loss": 5.483, "loss/crossentropy": 2.352656990289688, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16244445368647575, "step": 8966 }, { "epoch": 0.40763636363636363, "grad_norm": 5.03125, "grad_norm_var": 0.07421875, "learning_rate": 0.0001, "loss": 6.0614, "loss/crossentropy": 2.732121229171753, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.17921554297208786, "step": 8968 }, { "epoch": 0.4077272727272727, "grad_norm": 5.09375, "grad_norm_var": 0.086181640625, "learning_rate": 0.0001, "loss": 5.9508, "loss/crossentropy": 2.683270812034607, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17460177838802338, "step": 8970 }, { "epoch": 0.4078181818181818, "grad_norm": 5.375, "grad_norm_var": 0.0673828125, "learning_rate": 0.0001, "loss": 5.9785, "loss/crossentropy": 2.6720091104507446, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1767408773303032, "step": 8972 }, { "epoch": 0.4079090909090909, "grad_norm": 5.46875, "grad_norm_var": 0.077197265625, "learning_rate": 0.0001, "loss": 5.6535, "loss/crossentropy": 2.379905939102173, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17188699543476105, "step": 8974 }, { "epoch": 0.408, "grad_norm": 5.125, "grad_norm_var": 0.10347900390625, "learning_rate": 0.0001, "loss": 5.4144, "loss/crossentropy": 2.4290412068367004, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.14989914372563362, "step": 8976 }, { "epoch": 0.4080909090909091, "grad_norm": 5.875, "grad_norm_var": 0.12860921223958333, "learning_rate": 0.0001, "loss": 5.7625, "loss/crossentropy": 2.4892901182174683, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17146185785531998, "step": 8978 }, { "epoch": 0.4081818181818182, "grad_norm": 5.4375, "grad_norm_var": 0.13331705729166668, "learning_rate": 0.0001, "loss": 5.7115, "loss/crossentropy": 2.4494027495384216, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1730833798646927, "step": 8980 }, { "epoch": 0.4082727272727273, "grad_norm": 7.15625, "grad_norm_var": 0.36734619140625, "learning_rate": 0.0001, "loss": 5.7364, "loss/crossentropy": 2.4939403533935547, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.16955946758389473, "step": 8982 }, { "epoch": 0.4083636363636364, "grad_norm": 4.78125, "grad_norm_var": 0.3736979166666667, "learning_rate": 0.0001, "loss": 5.4433, "loss/crossentropy": 2.3790727853775024, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15779190510511398, "step": 8984 }, { "epoch": 0.40845454545454546, "grad_norm": 4.9375, "grad_norm_var": 0.3544108072916667, "learning_rate": 0.0001, "loss": 5.7775, "loss/crossentropy": 2.5247498750686646, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17488602548837662, "step": 8986 }, { "epoch": 0.40854545454545454, "grad_norm": 5.0625, "grad_norm_var": 0.3712076822916667, "learning_rate": 0.0001, "loss": 5.8164, "loss/crossentropy": 2.6134940087795258, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16814350709319115, "step": 8988 }, { "epoch": 0.40863636363636363, "grad_norm": 8.875, "grad_norm_var": 1.2342732747395833, "learning_rate": 0.0001, "loss": 5.7487, "loss/crossentropy": 2.5349494218826294, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16942545399069786, "step": 8990 }, { "epoch": 0.4087272727272727, "grad_norm": 5.59375, "grad_norm_var": 1.1641927083333334, "learning_rate": 0.0001, "loss": 5.7913, "loss/crossentropy": 2.5126869678497314, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.1725909262895584, "step": 8992 }, { "epoch": 0.4088181818181818, "grad_norm": 5.5, "grad_norm_var": 1.1539021809895833, "learning_rate": 0.0001, "loss": 6.0274, "loss/crossentropy": 2.7201231718063354, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.17818718403577805, "step": 8994 }, { "epoch": 0.4089090909090909, "grad_norm": 5.6875, "grad_norm_var": 1.135009765625, "learning_rate": 0.0001, "loss": 5.8686, "loss/crossentropy": 2.607642710208893, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.17238519713282585, "step": 8996 }, { "epoch": 0.409, "grad_norm": 5.0, "grad_norm_var": 0.9512003580729167, "learning_rate": 0.0001, "loss": 5.8082, "loss/crossentropy": 2.561551809310913, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17153503000736237, "step": 8998 }, { "epoch": 0.4090909090909091, "grad_norm": 5.34375, "grad_norm_var": 0.9411295572916667, "learning_rate": 0.0001, "loss": 5.5119, "loss/crossentropy": 2.382863700389862, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16114775836467743, "step": 9000 }, { "epoch": 0.4091818181818182, "grad_norm": 5.5, "grad_norm_var": 0.92984619140625, "learning_rate": 0.0001, "loss": 5.9522, "loss/crossentropy": 2.6426321268081665, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.17529460787773132, "step": 9002 }, { "epoch": 0.4092727272727273, "grad_norm": 5.6875, "grad_norm_var": 0.8865519205729167, "learning_rate": 0.0001, "loss": 5.8235, "loss/crossentropy": 2.551420569419861, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.1738901436328888, "step": 9004 }, { "epoch": 0.4093636363636364, "grad_norm": 5.375, "grad_norm_var": 0.079150390625, "learning_rate": 0.0001, "loss": 5.5914, "loss/crossentropy": 2.3574270606040955, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16987674683332443, "step": 9006 }, { "epoch": 0.40945454545454546, "grad_norm": 5.5625, "grad_norm_var": 0.08300374348958334, "learning_rate": 0.0001, "loss": 5.8953, "loss/crossentropy": 2.6493223309516907, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1736244261264801, "step": 9008 }, { "epoch": 0.40954545454545455, "grad_norm": 7.21875, "grad_norm_var": 0.3036295572916667, "learning_rate": 0.0001, "loss": 5.1874, "loss/crossentropy": 2.0653102695941925, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.15693525597453117, "step": 9010 }, { "epoch": 0.40963636363636363, "grad_norm": 4.53125, "grad_norm_var": 0.34351806640625, "learning_rate": 0.0001, "loss": 5.527, "loss/crossentropy": 2.4255195260047913, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1574089229106903, "step": 9012 }, { "epoch": 0.4097272727272727, "grad_norm": 5.3125, "grad_norm_var": 0.38046468098958336, "learning_rate": 0.0001, "loss": 5.293, "loss/crossentropy": 2.2480961978435516, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.15468523651361465, "step": 9014 }, { "epoch": 0.4098181818181818, "grad_norm": 5.09375, "grad_norm_var": 0.36656494140625, "learning_rate": 0.0001, "loss": 5.8154, "loss/crossentropy": 2.5773361921310425, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17341860011219978, "step": 9016 }, { "epoch": 0.4099090909090909, "grad_norm": 5.1875, "grad_norm_var": 0.37277018229166664, "learning_rate": 0.0001, "loss": 5.5047, "loss/crossentropy": 2.404334604740143, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16101688519120216, "step": 9018 }, { "epoch": 0.41, "grad_norm": 5.3125, "grad_norm_var": 0.37693684895833335, "learning_rate": 0.0001, "loss": 5.9459, "loss/crossentropy": 2.6789684295654297, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1747385859489441, "step": 9020 }, { "epoch": 0.4100909090909091, "grad_norm": 4.78125, "grad_norm_var": 0.39088134765625, "learning_rate": 0.0001, "loss": 5.685, "loss/crossentropy": 2.464866876602173, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16947044804692268, "step": 9022 }, { "epoch": 0.4101818181818182, "grad_norm": 4.96875, "grad_norm_var": 0.39993489583333336, "learning_rate": 0.0001, "loss": 5.8355, "loss/crossentropy": 2.550879120826721, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17494191229343414, "step": 9024 }, { "epoch": 0.4102727272727273, "grad_norm": 5.125, "grad_norm_var": 0.12089436848958333, "learning_rate": 0.0001, "loss": 6.2402, "loss/crossentropy": 2.8676096200942993, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1833503246307373, "step": 9026 }, { "epoch": 0.4103636363636364, "grad_norm": 4.8125, "grad_norm_var": 0.10155843098958334, "learning_rate": 0.0001, "loss": 5.4548, "loss/crossentropy": 2.3980191946029663, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.15275097265839577, "step": 9028 }, { "epoch": 0.41045454545454546, "grad_norm": 4.8125, "grad_norm_var": 0.08072509765625, "learning_rate": 0.0001, "loss": 5.3126, "loss/crossentropy": 2.234447807073593, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.15801222249865532, "step": 9030 }, { "epoch": 0.41054545454545455, "grad_norm": 4.6875, "grad_norm_var": 0.096337890625, "learning_rate": 0.0001, "loss": 5.0035, "loss/crossentropy": 2.088921159505844, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.1416519582271576, "step": 9032 }, { "epoch": 0.41063636363636363, "grad_norm": 5.03125, "grad_norm_var": 0.13938802083333332, "learning_rate": 0.0001, "loss": 5.9069, "loss/crossentropy": 2.582159101963043, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18051723390817642, "step": 9034 }, { "epoch": 0.4107272727272727, "grad_norm": 5.25, "grad_norm_var": 0.151171875, "learning_rate": 0.0001, "loss": 6.146, "loss/crossentropy": 2.712544083595276, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.188852246850729, "step": 9036 }, { "epoch": 0.4108181818181818, "grad_norm": 13.1875, "grad_norm_var": 4.172847493489583, "learning_rate": 0.0001, "loss": 5.6832, "loss/crossentropy": 2.439292848110199, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17107019573450089, "step": 9038 }, { "epoch": 0.4109090909090909, "grad_norm": 5.4375, "grad_norm_var": 4.146468098958334, "learning_rate": 0.0001, "loss": 5.998, "loss/crossentropy": 2.656339168548584, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1802579201757908, "step": 9040 }, { "epoch": 0.411, "grad_norm": 5.625, "grad_norm_var": 4.187223307291666, "learning_rate": 0.0001, "loss": 5.4534, "loss/crossentropy": 2.3540740609169006, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15993372350931168, "step": 9042 }, { "epoch": 0.4110909090909091, "grad_norm": 5.125, "grad_norm_var": 4.122261555989583, "learning_rate": 0.0001, "loss": 5.902, "loss/crossentropy": 2.6144962906837463, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1752324253320694, "step": 9044 }, { "epoch": 0.4111818181818182, "grad_norm": 5.28125, "grad_norm_var": 4.139839680989583, "learning_rate": 0.0001, "loss": 5.4495, "loss/crossentropy": 2.367364317178726, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.15665150992572308, "step": 9046 }, { "epoch": 0.4112727272727273, "grad_norm": 4.8125, "grad_norm_var": 4.119856770833334, "learning_rate": 0.0001, "loss": 5.5152, "loss/crossentropy": 2.3833484649658203, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1631830595433712, "step": 9048 }, { "epoch": 0.4113636363636364, "grad_norm": 5.1875, "grad_norm_var": 4.107486979166667, "learning_rate": 0.0001, "loss": 5.7762, "loss/crossentropy": 2.503614664077759, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.17549626156687737, "step": 9050 }, { "epoch": 0.41145454545454546, "grad_norm": 5.125, "grad_norm_var": 4.126200358072917, "learning_rate": 0.0001, "loss": 5.8805, "loss/crossentropy": 2.579093635082245, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.1776055544614792, "step": 9052 }, { "epoch": 0.41154545454545455, "grad_norm": 5.125, "grad_norm_var": 0.09959309895833333, "learning_rate": 0.0001, "loss": 5.9691, "loss/crossentropy": 2.657065689563751, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1737806797027588, "step": 9054 }, { "epoch": 0.41163636363636363, "grad_norm": 5.46875, "grad_norm_var": 0.09758707682291666, "learning_rate": 0.0001, "loss": 5.5115, "loss/crossentropy": 2.3070300221443176, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.16712896153330803, "step": 9056 }, { "epoch": 0.4117272727272727, "grad_norm": 4.71875, "grad_norm_var": 0.07857666015625, "learning_rate": 0.0001, "loss": 5.49, "loss/crossentropy": 2.4021334052085876, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.158589418977499, "step": 9058 }, { "epoch": 0.4118181818181818, "grad_norm": 5.65625, "grad_norm_var": 0.12346598307291666, "learning_rate": 0.0001, "loss": 5.6594, "loss/crossentropy": 2.511763870716095, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16320053860545158, "step": 9060 }, { "epoch": 0.4119090909090909, "grad_norm": 4.65625, "grad_norm_var": 0.11256103515625, "learning_rate": 0.0001, "loss": 5.5345, "loss/crossentropy": 2.393501400947571, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16254166513681412, "step": 9062 }, { "epoch": 0.412, "grad_norm": 18.5, "grad_norm_var": 11.429931640625, "learning_rate": 0.0001, "loss": 5.3201, "loss/crossentropy": 2.185466378927231, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.15486694872379303, "step": 9064 }, { "epoch": 0.41209090909090906, "grad_norm": 5.5625, "grad_norm_var": 11.390950520833334, "learning_rate": 0.0001, "loss": 6.0078, "loss/crossentropy": 2.6186943650245667, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.1840289682149887, "step": 9066 }, { "epoch": 0.4121818181818182, "grad_norm": 4.75, "grad_norm_var": 11.413016764322917, "learning_rate": 0.0001, "loss": 5.6745, "loss/crossentropy": 2.4865230321884155, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16762402281165123, "step": 9068 }, { "epoch": 0.4122727272727273, "grad_norm": 5.21875, "grad_norm_var": 11.468973795572916, "learning_rate": 0.0001, "loss": 5.857, "loss/crossentropy": 2.563770353794098, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17619356140494347, "step": 9070 }, { "epoch": 0.4123636363636364, "grad_norm": 7.46875, "grad_norm_var": 11.554911295572916, "learning_rate": 0.0001, "loss": 6.1243, "loss/crossentropy": 2.780922830104828, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.18102141842246056, "step": 9072 }, { "epoch": 0.41245454545454546, "grad_norm": 5.03125, "grad_norm_var": 11.513700358072917, "learning_rate": 0.0001, "loss": 5.491, "loss/crossentropy": 2.3940993547439575, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1585160195827484, "step": 9074 }, { "epoch": 0.41254545454545455, "grad_norm": 5.125, "grad_norm_var": 11.42447509765625, "learning_rate": 0.0001, "loss": 5.9713, "loss/crossentropy": 2.6025314331054688, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1810135394334793, "step": 9076 }, { "epoch": 0.41263636363636363, "grad_norm": 5.375, "grad_norm_var": 11.270426432291666, "learning_rate": 0.0001, "loss": 6.2842, "loss/crossentropy": 2.8669283390045166, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1887979917228222, "step": 9078 }, { "epoch": 0.4127272727272727, "grad_norm": 5.0625, "grad_norm_var": 0.41812744140625, "learning_rate": 0.0001, "loss": 5.6829, "loss/crossentropy": 2.46230286359787, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1697131134569645, "step": 9080 }, { "epoch": 0.4128181818181818, "grad_norm": 5.25, "grad_norm_var": 0.39308268229166665, "learning_rate": 0.0001, "loss": 5.7605, "loss/crossentropy": 2.5798415541648865, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16591888666152954, "step": 9082 }, { "epoch": 0.4129090909090909, "grad_norm": 4.875, "grad_norm_var": 0.3829264322916667, "learning_rate": 0.0001, "loss": 5.3431, "loss/crossentropy": 2.3128527402877808, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15361405909061432, "step": 9084 }, { "epoch": 0.413, "grad_norm": 4.96875, "grad_norm_var": 0.371875, "learning_rate": 0.0001, "loss": 5.8367, "loss/crossentropy": 2.5808262825012207, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1740257851779461, "step": 9086 }, { "epoch": 0.41309090909090906, "grad_norm": 5.25, "grad_norm_var": 0.09195556640625, "learning_rate": 0.0001, "loss": 5.6768, "loss/crossentropy": 2.4346776008605957, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1695224680006504, "step": 9088 }, { "epoch": 0.4131818181818182, "grad_norm": 5.09375, "grad_norm_var": 0.08817952473958333, "learning_rate": 0.0001, "loss": 5.6401, "loss/crossentropy": 2.4688965678215027, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16633570939302444, "step": 9090 }, { "epoch": 0.4132727272727273, "grad_norm": 5.1875, "grad_norm_var": 0.17151285807291666, "learning_rate": 0.0001, "loss": 5.6813, "loss/crossentropy": 2.4310200810432434, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.1689762994647026, "step": 9092 }, { "epoch": 0.4133636363636364, "grad_norm": 5.15625, "grad_norm_var": 0.18880208333333334, "learning_rate": 0.0001, "loss": 5.8149, "loss/crossentropy": 2.625262200832367, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1674058996140957, "step": 9094 }, { "epoch": 0.41345454545454546, "grad_norm": 5.0, "grad_norm_var": 0.1947265625, "learning_rate": 0.0001, "loss": 5.8458, "loss/crossentropy": 2.6006959080696106, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17294878140091896, "step": 9096 }, { "epoch": 0.41354545454545455, "grad_norm": 5.375, "grad_norm_var": 0.19607747395833333, "learning_rate": 0.0001, "loss": 5.9989, "loss/crossentropy": 2.7025999426841736, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17767725512385368, "step": 9098 }, { "epoch": 0.41363636363636364, "grad_norm": 4.90625, "grad_norm_var": 0.19646809895833334, "learning_rate": 0.0001, "loss": 5.3403, "loss/crossentropy": 2.2704809308052063, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.1544407606124878, "step": 9100 }, { "epoch": 0.4137272727272727, "grad_norm": 4.75, "grad_norm_var": 0.23033447265625, "learning_rate": 0.0001, "loss": 5.7359, "loss/crossentropy": 2.5991823971271515, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16308758780360222, "step": 9102 }, { "epoch": 0.4138181818181818, "grad_norm": 4.6875, "grad_norm_var": 0.20836181640625, "learning_rate": 0.0001, "loss": 5.9503, "loss/crossentropy": 2.666278064250946, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17528161033988, "step": 9104 }, { "epoch": 0.4139090909090909, "grad_norm": 5.0625, "grad_norm_var": 0.2369140625, "learning_rate": 0.0001, "loss": 5.486, "loss/crossentropy": 2.3298065066337585, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16327154263854027, "step": 9106 }, { "epoch": 0.414, "grad_norm": 5.0, "grad_norm_var": 0.12057291666666667, "learning_rate": 0.0001, "loss": 5.4144, "loss/crossentropy": 2.315724164247513, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.15928634628653526, "step": 9108 }, { "epoch": 0.41409090909090907, "grad_norm": 5.15625, "grad_norm_var": 0.13694254557291666, "learning_rate": 0.0001, "loss": 5.8825, "loss/crossentropy": 2.655086636543274, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1725483313202858, "step": 9110 }, { "epoch": 0.4141818181818182, "grad_norm": 5.1875, "grad_norm_var": 0.122119140625, "learning_rate": 0.0001, "loss": 5.7637, "loss/crossentropy": 2.516017258167267, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17320551350712776, "step": 9112 }, { "epoch": 0.4142727272727273, "grad_norm": 4.125, "grad_norm_var": 0.165087890625, "learning_rate": 0.0001, "loss": 5.1413, "loss/crossentropy": 2.2068383991718292, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.14676929637789726, "step": 9114 }, { "epoch": 0.4143636363636364, "grad_norm": 4.78125, "grad_norm_var": 0.16542561848958334, "learning_rate": 0.0001, "loss": 5.3787, "loss/crossentropy": 2.326017737388611, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1566387452185154, "step": 9116 }, { "epoch": 0.41445454545454546, "grad_norm": 5.15625, "grad_norm_var": 0.11764322916666667, "learning_rate": 0.0001, "loss": 5.9327, "loss/crossentropy": 2.6719177961349487, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.17432047799229622, "step": 9118 }, { "epoch": 0.41454545454545455, "grad_norm": 5.4375, "grad_norm_var": 0.14032796223958333, "learning_rate": 0.0001, "loss": 5.9717, "loss/crossentropy": 2.6645943224430084, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17641238868236542, "step": 9120 }, { "epoch": 0.41463636363636364, "grad_norm": 4.75, "grad_norm_var": 0.12525634765625, "learning_rate": 0.0001, "loss": 5.4537, "loss/crossentropy": 2.316910147666931, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16153296828269958, "step": 9122 }, { "epoch": 0.4147272727272727, "grad_norm": 5.46875, "grad_norm_var": 0.13518473307291667, "learning_rate": 0.0001, "loss": 6.1159, "loss/crossentropy": 2.7986136078834534, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.17566942796111107, "step": 9124 }, { "epoch": 0.4148181818181818, "grad_norm": 4.6875, "grad_norm_var": 0.126416015625, "learning_rate": 0.0001, "loss": 5.3492, "loss/crossentropy": 2.3036744594573975, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.1524076983332634, "step": 9126 }, { "epoch": 0.4149090909090909, "grad_norm": 4.875, "grad_norm_var": 0.12831624348958334, "learning_rate": 0.0001, "loss": 5.4851, "loss/crossentropy": 2.3725348114967346, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16184523329138756, "step": 9128 }, { "epoch": 0.415, "grad_norm": 5.625, "grad_norm_var": 0.12206624348958334, "learning_rate": 0.0001, "loss": 5.5096, "loss/crossentropy": 2.366301119327545, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16237761825323105, "step": 9130 }, { "epoch": 0.41509090909090907, "grad_norm": 5.21875, "grad_norm_var": 0.16783854166666667, "learning_rate": 0.0001, "loss": 5.9628, "loss/crossentropy": 2.6381117701530457, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17895422130823135, "step": 9132 }, { "epoch": 0.4151818181818182, "grad_norm": 4.90625, "grad_norm_var": 0.16871337890625, "learning_rate": 0.0001, "loss": 5.492, "loss/crossentropy": 2.328566074371338, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16438555344939232, "step": 9134 }, { "epoch": 0.4152727272727273, "grad_norm": 5.15625, "grad_norm_var": 0.15904541015625, "learning_rate": 0.0001, "loss": 5.6248, "loss/crossentropy": 2.449077695608139, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.167768694460392, "step": 9136 }, { "epoch": 0.4153636363636364, "grad_norm": 4.875, "grad_norm_var": 0.15471598307291667, "learning_rate": 0.0001, "loss": 5.9983, "loss/crossentropy": 2.675701379776001, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.18050165474414825, "step": 9138 }, { "epoch": 0.41545454545454547, "grad_norm": 5.34375, "grad_norm_var": 0.14882405598958334, "learning_rate": 0.0001, "loss": 6.212, "loss/crossentropy": 2.8853262662887573, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1799359954893589, "step": 9140 }, { "epoch": 0.41554545454545455, "grad_norm": 5.125, "grad_norm_var": 0.13318684895833333, "learning_rate": 0.0001, "loss": 5.6777, "loss/crossentropy": 2.4998286366462708, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16681552305817604, "step": 9142 }, { "epoch": 0.41563636363636364, "grad_norm": 5.375, "grad_norm_var": 0.12551676432291667, "learning_rate": 0.0001, "loss": 5.7206, "loss/crossentropy": 2.4968748688697815, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.16749312728643417, "step": 9144 }, { "epoch": 0.4157272727272727, "grad_norm": 6.15625, "grad_norm_var": 0.4003214518229167, "learning_rate": 0.0001, "loss": 5.7027, "loss/crossentropy": 2.453124463558197, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17281226441264153, "step": 9146 }, { "epoch": 0.4158181818181818, "grad_norm": 4.15625, "grad_norm_var": 0.46060791015625, "learning_rate": 0.0001, "loss": 5.4336, "loss/crossentropy": 2.2417061924934387, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.166262187063694, "step": 9148 }, { "epoch": 0.4159090909090909, "grad_norm": 5.25, "grad_norm_var": 0.5073567708333333, "learning_rate": 0.0001, "loss": 5.44, "loss/crossentropy": 2.297153800725937, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1603812538087368, "step": 9150 }, { "epoch": 0.416, "grad_norm": 5.34375, "grad_norm_var": 0.47356770833333334, "learning_rate": 0.0001, "loss": 5.8297, "loss/crossentropy": 2.5459973216056824, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1721172295510769, "step": 9152 }, { "epoch": 0.41609090909090907, "grad_norm": 5.375, "grad_norm_var": 0.45167643229166665, "learning_rate": 0.0001, "loss": 6.044, "loss/crossentropy": 2.7271419763565063, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17856375128030777, "step": 9154 }, { "epoch": 0.4161818181818182, "grad_norm": 5.6875, "grad_norm_var": 0.4593587239583333, "learning_rate": 0.0001, "loss": 5.8653, "loss/crossentropy": 2.5541568994522095, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.17857569456100464, "step": 9156 }, { "epoch": 0.4162727272727273, "grad_norm": 4.9375, "grad_norm_var": 0.456103515625, "learning_rate": 0.0001, "loss": 5.5479, "loss/crossentropy": 2.3796509504318237, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16506541520357132, "step": 9158 }, { "epoch": 0.4163636363636364, "grad_norm": 5.46875, "grad_norm_var": 0.504296875, "learning_rate": 0.0001, "loss": 5.2807, "loss/crossentropy": 2.281962215900421, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.14811598509550095, "step": 9160 }, { "epoch": 0.41645454545454547, "grad_norm": 5.34375, "grad_norm_var": 0.22414957682291667, "learning_rate": 0.0001, "loss": 5.8215, "loss/crossentropy": 2.610002636909485, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.17056229338049889, "step": 9162 }, { "epoch": 0.41654545454545455, "grad_norm": 5.28125, "grad_norm_var": 0.14329020182291666, "learning_rate": 0.0001, "loss": 5.9731, "loss/crossentropy": 2.6596428751945496, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.17841405421495438, "step": 9164 }, { "epoch": 0.41663636363636364, "grad_norm": 4.84375, "grad_norm_var": 0.09208577473958333, "learning_rate": 0.0001, "loss": 5.5734, "loss/crossentropy": 2.425945460796356, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16416407003998756, "step": 9166 }, { "epoch": 0.4167272727272727, "grad_norm": 5.75, "grad_norm_var": 0.10142822265625, "learning_rate": 0.0001, "loss": 5.4515, "loss/crossentropy": 2.323750853538513, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16140834242105484, "step": 9168 }, { "epoch": 0.4168181818181818, "grad_norm": 5.03125, "grad_norm_var": 0.09959309895833333, "learning_rate": 0.0001, "loss": 5.5494, "loss/crossentropy": 2.446066439151764, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1613115556538105, "step": 9170 }, { "epoch": 0.4169090909090909, "grad_norm": 5.3125, "grad_norm_var": 0.08453369140625, "learning_rate": 0.0001, "loss": 5.694, "loss/crossentropy": 2.4895222187042236, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16908247396349907, "step": 9172 }, { "epoch": 0.417, "grad_norm": 4.8125, "grad_norm_var": 0.0876953125, "learning_rate": 0.0001, "loss": 5.5052, "loss/crossentropy": 2.350357323884964, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.162947665899992, "step": 9174 }, { "epoch": 0.41709090909090907, "grad_norm": 4.65625, "grad_norm_var": 0.08020833333333334, "learning_rate": 0.0001, "loss": 5.5228, "loss/crossentropy": 2.4907233715057373, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15477178245782852, "step": 9176 }, { "epoch": 0.4171818181818182, "grad_norm": 5.03125, "grad_norm_var": 0.07649739583333333, "learning_rate": 0.0001, "loss": 5.8872, "loss/crossentropy": 2.6560429334640503, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17623760551214218, "step": 9178 }, { "epoch": 0.4172727272727273, "grad_norm": 5.40625, "grad_norm_var": 0.08079020182291667, "learning_rate": 0.0001, "loss": 5.9984, "loss/crossentropy": 2.62273770570755, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18053903803229332, "step": 9180 }, { "epoch": 0.4173636363636364, "grad_norm": 4.71875, "grad_norm_var": 0.08547770182291667, "learning_rate": 0.0001, "loss": 5.6603, "loss/crossentropy": 2.4802220165729523, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16956531628966331, "step": 9182 }, { "epoch": 0.41745454545454547, "grad_norm": 5.5, "grad_norm_var": 0.072900390625, "learning_rate": 0.0001, "loss": 5.6954, "loss/crossentropy": 2.491903007030487, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1668320633471012, "step": 9184 }, { "epoch": 0.41754545454545455, "grad_norm": 5.40625, "grad_norm_var": 0.08984375, "learning_rate": 0.0001, "loss": 5.4167, "loss/crossentropy": 2.345475524663925, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.15595422312617302, "step": 9186 }, { "epoch": 0.41763636363636364, "grad_norm": 4.96875, "grad_norm_var": 0.15084635416666667, "learning_rate": 0.0001, "loss": 5.9579, "loss/crossentropy": 2.6626169085502625, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1791405975818634, "step": 9188 }, { "epoch": 0.4177272727272727, "grad_norm": 4.5625, "grad_norm_var": 0.16301676432291667, "learning_rate": 0.0001, "loss": 5.4886, "loss/crossentropy": 2.398391008377075, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15960882604122162, "step": 9190 }, { "epoch": 0.4178181818181818, "grad_norm": 5.0625, "grad_norm_var": 0.14425455729166667, "learning_rate": 0.0001, "loss": 5.7358, "loss/crossentropy": 2.461854875087738, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17309290170669556, "step": 9192 }, { "epoch": 0.4179090909090909, "grad_norm": 5.125, "grad_norm_var": 0.14267171223958333, "learning_rate": 0.0001, "loss": 5.6949, "loss/crossentropy": 2.495146721601486, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16840959340333939, "step": 9194 }, { "epoch": 0.418, "grad_norm": 4.75, "grad_norm_var": 0.14568684895833334, "learning_rate": 0.0001, "loss": 5.3487, "loss/crossentropy": 2.2665308117866516, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.1548941433429718, "step": 9196 }, { "epoch": 0.41809090909090907, "grad_norm": 5.40625, "grad_norm_var": 0.14345296223958334, "learning_rate": 0.0001, "loss": 5.8106, "loss/crossentropy": 2.527641177177429, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17322077602148056, "step": 9198 }, { "epoch": 0.41818181818181815, "grad_norm": 5.21875, "grad_norm_var": 0.13056233723958333, "learning_rate": 0.0001, "loss": 5.6211, "loss/crossentropy": 2.418681710958481, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16750449314713478, "step": 9200 }, { "epoch": 0.4182727272727273, "grad_norm": 5.90625, "grad_norm_var": 0.44631754557291664, "learning_rate": 0.0001, "loss": 6.2024, "loss/crossentropy": 2.7218493223190308, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.19415192306041718, "step": 9202 }, { "epoch": 0.4183636363636364, "grad_norm": 6.625, "grad_norm_var": 0.5184244791666667, "learning_rate": 0.0001, "loss": 5.7915, "loss/crossentropy": 2.522812008857727, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17413848266005516, "step": 9204 }, { "epoch": 0.41845454545454547, "grad_norm": 5.34375, "grad_norm_var": 0.5064412434895833, "learning_rate": 0.0001, "loss": 5.3929, "loss/crossentropy": 2.313233345746994, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.161290992051363, "step": 9206 }, { "epoch": 0.41854545454545455, "grad_norm": 5.34375, "grad_norm_var": 0.5227864583333334, "learning_rate": 0.0001, "loss": 5.9573, "loss/crossentropy": 2.602978765964508, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18309054151177406, "step": 9208 }, { "epoch": 0.41863636363636364, "grad_norm": 4.96875, "grad_norm_var": 0.5574503580729167, "learning_rate": 0.0001, "loss": 5.4771, "loss/crossentropy": 2.3822867572307587, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16143333539366722, "step": 9210 }, { "epoch": 0.4187272727272727, "grad_norm": 5.40625, "grad_norm_var": 0.55377197265625, "learning_rate": 0.0001, "loss": 5.8811, "loss/crossentropy": 2.6522496938705444, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17366251721978188, "step": 9212 }, { "epoch": 0.4188181818181818, "grad_norm": 4.875, "grad_norm_var": 0.57164306640625, "learning_rate": 0.0001, "loss": 5.7607, "loss/crossentropy": 2.5031381845474243, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.17282232642173767, "step": 9214 }, { "epoch": 0.4189090909090909, "grad_norm": 5.21875, "grad_norm_var": 0.5495402018229166, "learning_rate": 0.0001, "loss": 5.9242, "loss/crossentropy": 2.6646138429641724, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1753705032169819, "step": 9216 }, { "epoch": 0.419, "grad_norm": 5.46875, "grad_norm_var": 0.29933268229166665, "learning_rate": 0.0001, "loss": 5.7791, "loss/crossentropy": 2.4565961956977844, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17834273353219032, "step": 9218 }, { "epoch": 0.41909090909090907, "grad_norm": 5.03125, "grad_norm_var": 0.20364176432291667, "learning_rate": 0.0001, "loss": 5.4032, "loss/crossentropy": 2.331638813018799, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.15695712715387344, "step": 9220 }, { "epoch": 0.41918181818181816, "grad_norm": 5.4375, "grad_norm_var": 0.20380452473958333, "learning_rate": 0.0001, "loss": 5.9715, "loss/crossentropy": 2.5973238945007324, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1831190511584282, "step": 9222 }, { "epoch": 0.4192727272727273, "grad_norm": 4.9375, "grad_norm_var": 0.16678059895833333, "learning_rate": 0.0001, "loss": 5.6772, "loss/crossentropy": 2.4942683577537537, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16829026862978935, "step": 9224 }, { "epoch": 0.4193636363636364, "grad_norm": 5.75, "grad_norm_var": 0.16691080729166666, "learning_rate": 0.0001, "loss": 5.9373, "loss/crossentropy": 2.565104067325592, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.1799880675971508, "step": 9226 }, { "epoch": 0.41945454545454547, "grad_norm": 4.6875, "grad_norm_var": 0.17760009765625, "learning_rate": 0.0001, "loss": 5.7165, "loss/crossentropy": 2.5695464611053467, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16684740036725998, "step": 9228 }, { "epoch": 0.41954545454545455, "grad_norm": 4.75, "grad_norm_var": 0.162109375, "learning_rate": 0.0001, "loss": 5.5924, "loss/crossentropy": 2.4197437167167664, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1662859097123146, "step": 9230 }, { "epoch": 0.41963636363636364, "grad_norm": 4.53125, "grad_norm_var": 0.19273681640625, "learning_rate": 0.0001, "loss": 5.4912, "loss/crossentropy": 2.3519493639469147, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16314839199185371, "step": 9232 }, { "epoch": 0.4197272727272727, "grad_norm": 5.25, "grad_norm_var": 0.18642171223958334, "learning_rate": 0.0001, "loss": 5.3095, "loss/crossentropy": 2.2212584614753723, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.15570181608200073, "step": 9234 }, { "epoch": 0.4198181818181818, "grad_norm": 5.15625, "grad_norm_var": 0.22128499348958333, "learning_rate": 0.0001, "loss": 5.9036, "loss/crossentropy": 2.650990843772888, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.17115668579936028, "step": 9236 }, { "epoch": 0.4199090909090909, "grad_norm": 5.21875, "grad_norm_var": 0.1900390625, "learning_rate": 0.0001, "loss": 5.6935, "loss/crossentropy": 2.477017045021057, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.16735556721687317, "step": 9238 }, { "epoch": 0.42, "grad_norm": 5.21875, "grad_norm_var": 0.1958984375, "learning_rate": 0.0001, "loss": 5.7351, "loss/crossentropy": 2.5111956000328064, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17160794511437416, "step": 9240 }, { "epoch": 0.42009090909090907, "grad_norm": 5.25, "grad_norm_var": 0.17975260416666666, "learning_rate": 0.0001, "loss": 5.6501, "loss/crossentropy": 2.4572083353996277, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16655640676617622, "step": 9242 }, { "epoch": 0.42018181818181816, "grad_norm": 5.03125, "grad_norm_var": 0.15592041015625, "learning_rate": 0.0001, "loss": 5.8394, "loss/crossentropy": 2.53106552362442, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.17829518020153046, "step": 9244 }, { "epoch": 0.4202727272727273, "grad_norm": 4.90625, "grad_norm_var": 0.14045817057291668, "learning_rate": 0.0001, "loss": 5.6812, "loss/crossentropy": 2.533610165119171, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1637783907353878, "step": 9246 }, { "epoch": 0.4203636363636364, "grad_norm": 4.9375, "grad_norm_var": 0.09368082682291666, "learning_rate": 0.0001, "loss": 5.413, "loss/crossentropy": 2.315670073032379, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.15445518121123314, "step": 9248 }, { "epoch": 0.42045454545454547, "grad_norm": 4.625, "grad_norm_var": 0.12590738932291667, "learning_rate": 0.0001, "loss": 5.2497, "loss/crossentropy": 2.2819572389125824, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.14716040343046188, "step": 9250 }, { "epoch": 0.42054545454545456, "grad_norm": 4.78125, "grad_norm_var": 0.0572265625, "learning_rate": 0.0001, "loss": 5.7832, "loss/crossentropy": 2.5722416043281555, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17109886556863785, "step": 9252 }, { "epoch": 0.42063636363636364, "grad_norm": 5.03125, "grad_norm_var": 0.06627197265625, "learning_rate": 0.0001, "loss": 5.8014, "loss/crossentropy": 2.5823814868927, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16838641837239265, "step": 9254 }, { "epoch": 0.4207272727272727, "grad_norm": 4.4375, "grad_norm_var": 0.1150390625, "learning_rate": 0.0001, "loss": 5.5875, "loss/crossentropy": 2.4293660819530487, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16698583960533142, "step": 9256 }, { "epoch": 0.4208181818181818, "grad_norm": 5.125, "grad_norm_var": 0.11599934895833333, "learning_rate": 0.0001, "loss": 5.8738, "loss/crossentropy": 2.540212571620941, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.1788622997701168, "step": 9258 }, { "epoch": 0.4209090909090909, "grad_norm": 4.96875, "grad_norm_var": 0.11717122395833333, "learning_rate": 0.0001, "loss": 5.5155, "loss/crossentropy": 2.408302068710327, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16169316321611404, "step": 9260 }, { "epoch": 0.421, "grad_norm": 5.0, "grad_norm_var": 0.5376912434895833, "learning_rate": 0.0001, "loss": 5.5283, "loss/crossentropy": 2.335671156644821, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.16593917831778526, "step": 9262 }, { "epoch": 0.42109090909090907, "grad_norm": 4.40625, "grad_norm_var": 0.59615478515625, "learning_rate": 0.0001, "loss": 5.3195, "loss/crossentropy": 2.2875491976737976, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15651235356926918, "step": 9264 }, { "epoch": 0.42118181818181816, "grad_norm": 5.34375, "grad_norm_var": 0.559765625, "learning_rate": 0.0001, "loss": 5.7795, "loss/crossentropy": 2.5101967453956604, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.175559613853693, "step": 9266 }, { "epoch": 0.4212727272727273, "grad_norm": 5.09375, "grad_norm_var": 0.5823567708333334, "learning_rate": 0.0001, "loss": 5.8218, "loss/crossentropy": 2.600131928920746, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17001835629343987, "step": 9268 }, { "epoch": 0.4213636363636364, "grad_norm": 5.21875, "grad_norm_var": 0.5831339518229167, "learning_rate": 0.0001, "loss": 5.6293, "loss/crossentropy": 2.4280980825424194, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1701212041079998, "step": 9270 }, { "epoch": 0.42145454545454547, "grad_norm": 4.96875, "grad_norm_var": 0.5373697916666667, "learning_rate": 0.0001, "loss": 5.3183, "loss/crossentropy": 2.301127314567566, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15425146371126175, "step": 9272 }, { "epoch": 0.42154545454545456, "grad_norm": 4.5, "grad_norm_var": 0.573828125, "learning_rate": 0.0001, "loss": 5.4134, "loss/crossentropy": 2.396771192550659, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1534184031188488, "step": 9274 }, { "epoch": 0.42163636363636364, "grad_norm": 5.28125, "grad_norm_var": 0.5755818684895834, "learning_rate": 0.0001, "loss": 5.5977, "loss/crossentropy": 2.474961280822754, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1587599366903305, "step": 9276 }, { "epoch": 0.42172727272727273, "grad_norm": 5.0, "grad_norm_var": 0.15480143229166668, "learning_rate": 0.0001, "loss": 5.8572, "loss/crossentropy": 2.5738940238952637, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.177155539393425, "step": 9278 }, { "epoch": 0.4218181818181818, "grad_norm": 5.375, "grad_norm_var": 0.12571614583333332, "learning_rate": 0.0001, "loss": 5.4044, "loss/crossentropy": 2.2707647681236267, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.1608281396329403, "step": 9280 }, { "epoch": 0.4219090909090909, "grad_norm": 5.84375, "grad_norm_var": 0.15911458333333334, "learning_rate": 0.0001, "loss": 5.0916, "loss/crossentropy": 2.1173766553401947, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.14780916459858418, "step": 9282 }, { "epoch": 0.422, "grad_norm": 5.25, "grad_norm_var": 0.11034749348958334, "learning_rate": 0.0001, "loss": 5.9566, "loss/crossentropy": 2.7089367508888245, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.17378858476877213, "step": 9284 }, { "epoch": 0.42209090909090907, "grad_norm": 5.03125, "grad_norm_var": 0.11521809895833333, "learning_rate": 0.0001, "loss": 5.6692, "loss/crossentropy": 2.539555609226227, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16296860203146935, "step": 9286 }, { "epoch": 0.42218181818181816, "grad_norm": 5.21875, "grad_norm_var": 0.13136393229166668, "learning_rate": 0.0001, "loss": 5.7122, "loss/crossentropy": 2.5520118474960327, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1675841175019741, "step": 9288 }, { "epoch": 0.4222727272727273, "grad_norm": 4.4375, "grad_norm_var": 0.13409830729166666, "learning_rate": 0.0001, "loss": 5.3704, "loss/crossentropy": 2.3718971014022827, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.1535562351346016, "step": 9290 }, { "epoch": 0.4223636363636364, "grad_norm": 4.875, "grad_norm_var": 0.13743082682291666, "learning_rate": 0.0001, "loss": 5.4567, "loss/crossentropy": 2.350755274295807, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.15727785602211952, "step": 9292 }, { "epoch": 0.42245454545454547, "grad_norm": 5.0625, "grad_norm_var": 0.13606770833333334, "learning_rate": 0.0001, "loss": 5.6504, "loss/crossentropy": 2.4940322637557983, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16524187102913857, "step": 9294 }, { "epoch": 0.42254545454545456, "grad_norm": 5.625, "grad_norm_var": 0.13019205729166666, "learning_rate": 0.0001, "loss": 5.7672, "loss/crossentropy": 2.498849630355835, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1709795780479908, "step": 9296 }, { "epoch": 0.42263636363636364, "grad_norm": 5.09375, "grad_norm_var": 0.09084879557291667, "learning_rate": 0.0001, "loss": 6.1123, "loss/crossentropy": 2.760930061340332, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1839623525738716, "step": 9298 }, { "epoch": 0.42272727272727273, "grad_norm": 5.5, "grad_norm_var": 0.105859375, "learning_rate": 0.0001, "loss": 5.7531, "loss/crossentropy": 2.5485756397247314, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1686921939253807, "step": 9300 }, { "epoch": 0.4228181818181818, "grad_norm": 5.0625, "grad_norm_var": 0.11691080729166667, "learning_rate": 0.0001, "loss": 5.4653, "loss/crossentropy": 2.3796326220035553, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15954803675413132, "step": 9302 }, { "epoch": 0.4229090909090909, "grad_norm": 5.125, "grad_norm_var": 0.12649332682291667, "learning_rate": 0.0001, "loss": 5.7162, "loss/crossentropy": 2.468282163143158, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1728346049785614, "step": 9304 }, { "epoch": 0.423, "grad_norm": 4.96875, "grad_norm_var": 0.10480143229166666, "learning_rate": 0.0001, "loss": 5.591, "loss/crossentropy": 2.4440702199935913, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16234859079122543, "step": 9306 }, { "epoch": 0.4230909090909091, "grad_norm": 5.0, "grad_norm_var": 0.12472330729166667, "learning_rate": 0.0001, "loss": 5.0986, "loss/crossentropy": 2.144126147031784, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.1483786553144455, "step": 9308 }, { "epoch": 0.42318181818181816, "grad_norm": 5.03125, "grad_norm_var": 0.16057535807291667, "learning_rate": 0.0001, "loss": 5.3787, "loss/crossentropy": 2.2782144844532013, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.15692294016480446, "step": 9310 }, { "epoch": 0.4232727272727273, "grad_norm": 5.21875, "grad_norm_var": 0.165869140625, "learning_rate": 0.0001, "loss": 5.7439, "loss/crossentropy": 2.4561809301376343, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17408230900764465, "step": 9312 }, { "epoch": 0.4233636363636364, "grad_norm": 5.4375, "grad_norm_var": 0.17107747395833334, "learning_rate": 0.0001, "loss": 5.8845, "loss/crossentropy": 2.623834788799286, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1737251579761505, "step": 9314 }, { "epoch": 0.4234545454545455, "grad_norm": 5.3125, "grad_norm_var": 0.15240885416666666, "learning_rate": 0.0001, "loss": 5.6331, "loss/crossentropy": 2.4322144985198975, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1691153608262539, "step": 9316 }, { "epoch": 0.42354545454545456, "grad_norm": 5.0, "grad_norm_var": 0.16747639973958334, "learning_rate": 0.0001, "loss": 4.9967, "loss/crossentropy": 2.0337994992733, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.14746482856571674, "step": 9318 }, { "epoch": 0.42363636363636364, "grad_norm": 4.75, "grad_norm_var": 0.18941650390625, "learning_rate": 0.0001, "loss": 5.7644, "loss/crossentropy": 2.5406110286712646, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17315759137272835, "step": 9320 }, { "epoch": 0.42372727272727273, "grad_norm": 5.84375, "grad_norm_var": 0.22515869140625, "learning_rate": 0.0001, "loss": 5.7689, "loss/crossentropy": 2.5310853719711304, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17046039178967476, "step": 9322 }, { "epoch": 0.4238181818181818, "grad_norm": 5.25, "grad_norm_var": 0.18222249348958333, "learning_rate": 0.0001, "loss": 5.7663, "loss/crossentropy": 2.4729230403900146, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17504549399018288, "step": 9324 }, { "epoch": 0.4239090909090909, "grad_norm": 4.65625, "grad_norm_var": 0.17805582682291668, "learning_rate": 0.0001, "loss": 5.2521, "loss/crossentropy": 2.1616547107696533, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1584630236029625, "step": 9326 }, { "epoch": 0.424, "grad_norm": 5.03125, "grad_norm_var": 0.16080322265625, "learning_rate": 0.0001, "loss": 5.8001, "loss/crossentropy": 2.5377457439899445, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17389535158872604, "step": 9328 }, { "epoch": 0.4240909090909091, "grad_norm": 4.8125, "grad_norm_var": 0.16560872395833334, "learning_rate": 0.0001, "loss": 5.7202, "loss/crossentropy": 2.609678089618683, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1630026400089264, "step": 9330 }, { "epoch": 0.42418181818181816, "grad_norm": 5.40625, "grad_norm_var": 0.16968994140625, "learning_rate": 0.0001, "loss": 5.8525, "loss/crossentropy": 2.673624038696289, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16769396141171455, "step": 9332 }, { "epoch": 0.42427272727272725, "grad_norm": 4.96875, "grad_norm_var": 0.14244791666666667, "learning_rate": 0.0001, "loss": 5.2753, "loss/crossentropy": 2.1865815818309784, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15926618501544, "step": 9334 }, { "epoch": 0.4243636363636364, "grad_norm": 4.625, "grad_norm_var": 0.11027018229166667, "learning_rate": 0.0001, "loss": 5.3883, "loss/crossentropy": 2.2927918434143066, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.15466882288455963, "step": 9336 }, { "epoch": 0.4244545454545455, "grad_norm": 9.5, "grad_norm_var": 1.2912068684895834, "learning_rate": 0.0001, "loss": 5.8577, "loss/crossentropy": 2.591914176940918, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1734531633555889, "step": 9338 }, { "epoch": 0.42454545454545456, "grad_norm": 5.03125, "grad_norm_var": 1.29761962890625, "learning_rate": 0.0001, "loss": 5.6987, "loss/crossentropy": 2.520301580429077, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16745257377624512, "step": 9340 }, { "epoch": 0.42463636363636365, "grad_norm": 5.875, "grad_norm_var": 1.3021443684895833, "learning_rate": 0.0001, "loss": 5.8105, "loss/crossentropy": 2.5053142309188843, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.17446022108197212, "step": 9342 }, { "epoch": 0.42472727272727273, "grad_norm": 6.25, "grad_norm_var": 2.3484212239583333, "learning_rate": 0.0001, "loss": 5.99, "loss/crossentropy": 2.630111038684845, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1842326745390892, "step": 9344 }, { "epoch": 0.4248181818181818, "grad_norm": 5.28125, "grad_norm_var": 2.251395670572917, "learning_rate": 0.0001, "loss": 5.5588, "loss/crossentropy": 2.3886689245700836, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.16408231109380722, "step": 9346 }, { "epoch": 0.4249090909090909, "grad_norm": 4.8125, "grad_norm_var": 2.2682902018229165, "learning_rate": 0.0001, "loss": 5.4431, "loss/crossentropy": 2.333577036857605, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.1564580611884594, "step": 9348 }, { "epoch": 0.425, "grad_norm": 5.25, "grad_norm_var": 2.256363932291667, "learning_rate": 0.0001, "loss": 5.7692, "loss/crossentropy": 2.5053413808345795, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17306499183177948, "step": 9350 }, { "epoch": 0.4250909090909091, "grad_norm": 5.375, "grad_norm_var": 2.168343098958333, "learning_rate": 0.0001, "loss": 5.6993, "loss/crossentropy": 2.53690242767334, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16682422533631325, "step": 9352 }, { "epoch": 0.42518181818181816, "grad_norm": 4.9375, "grad_norm_var": 1.235791015625, "learning_rate": 0.0001, "loss": 5.5336, "loss/crossentropy": 2.365312159061432, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16409216821193695, "step": 9354 }, { "epoch": 0.42527272727272725, "grad_norm": 8.0, "grad_norm_var": 1.5597493489583334, "learning_rate": 0.0001, "loss": 5.5956, "loss/crossentropy": 2.2627598643302917, "loss/hidden": 1.591796875, "loss/jsd": 0.0, "loss/logits": 0.17410097271203995, "step": 9356 }, { "epoch": 0.4253636363636364, "grad_norm": 4.875, "grad_norm_var": 1.5683553059895834, "learning_rate": 0.0001, "loss": 5.433, "loss/crossentropy": 2.3536015152931213, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1552007645368576, "step": 9358 }, { "epoch": 0.4254545454545455, "grad_norm": 7.9375, "grad_norm_var": 0.9210774739583333, "learning_rate": 0.0001, "loss": 6.0402, "loss/crossentropy": 2.653425455093384, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18438392877578735, "step": 9360 }, { "epoch": 0.42554545454545456, "grad_norm": 5.0625, "grad_norm_var": 0.93541259765625, "learning_rate": 0.0001, "loss": 5.5998, "loss/crossentropy": 2.424165964126587, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.16424736008048058, "step": 9362 }, { "epoch": 0.42563636363636365, "grad_norm": 5.1875, "grad_norm_var": 0.93472900390625, "learning_rate": 0.0001, "loss": 6.0899, "loss/crossentropy": 2.700291097164154, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18544501438736916, "step": 9364 }, { "epoch": 0.42572727272727273, "grad_norm": 8.125, "grad_norm_var": 1.3303385416666667, "learning_rate": 0.0001, "loss": 5.3189, "loss/crossentropy": 2.197687566280365, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.15841319784522057, "step": 9366 }, { "epoch": 0.4258181818181818, "grad_norm": 6.53125, "grad_norm_var": 1.369384765625, "learning_rate": 0.0001, "loss": 5.7111, "loss/crossentropy": 2.465893268585205, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1735471487045288, "step": 9368 }, { "epoch": 0.4259090909090909, "grad_norm": 11.125, "grad_norm_var": 3.0201171875, "learning_rate": 0.0001, "loss": 5.8487, "loss/crossentropy": 2.508749544620514, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.17715445533394814, "step": 9370 }, { "epoch": 0.426, "grad_norm": 5.1875, "grad_norm_var": 2.8331339518229166, "learning_rate": 0.0001, "loss": 5.7698, "loss/crossentropy": 2.5501515865325928, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17118331789970398, "step": 9372 }, { "epoch": 0.4260909090909091, "grad_norm": 5.1875, "grad_norm_var": 2.764306640625, "learning_rate": 0.0001, "loss": 5.4099, "loss/crossentropy": 2.267017662525177, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.16096726432442665, "step": 9374 }, { "epoch": 0.42618181818181816, "grad_norm": 5.28125, "grad_norm_var": 2.5753743489583334, "learning_rate": 0.0001, "loss": 5.6555, "loss/crossentropy": 2.460238128900528, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.166401669383049, "step": 9376 }, { "epoch": 0.42627272727272725, "grad_norm": 5.21875, "grad_norm_var": 2.595536295572917, "learning_rate": 0.0001, "loss": 5.7137, "loss/crossentropy": 2.4746329188346863, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17352043092250824, "step": 9378 }, { "epoch": 0.4263636363636364, "grad_norm": 5.15625, "grad_norm_var": 2.56422119140625, "learning_rate": 0.0001, "loss": 5.5645, "loss/crossentropy": 2.396647274494171, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.162490326911211, "step": 9380 }, { "epoch": 0.4264545454545455, "grad_norm": 5.0, "grad_norm_var": 2.2630208333333335, "learning_rate": 0.0001, "loss": 5.7656, "loss/crossentropy": 2.536457061767578, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17017871513962746, "step": 9382 }, { "epoch": 0.42654545454545456, "grad_norm": 4.65625, "grad_norm_var": 2.290869140625, "learning_rate": 0.0001, "loss": 5.7792, "loss/crossentropy": 2.6052631735801697, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16836974397301674, "step": 9384 }, { "epoch": 0.42663636363636365, "grad_norm": 5.375, "grad_norm_var": 0.13839518229166667, "learning_rate": 0.0001, "loss": 5.3455, "loss/crossentropy": 2.223507195711136, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1586848609149456, "step": 9386 }, { "epoch": 0.42672727272727273, "grad_norm": 4.84375, "grad_norm_var": 0.035872395833333334, "learning_rate": 0.0001, "loss": 5.8941, "loss/crossentropy": 2.6808685064315796, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17132215201854706, "step": 9388 }, { "epoch": 0.4268181818181818, "grad_norm": 5.40625, "grad_norm_var": 0.03616129557291667, "learning_rate": 0.0001, "loss": 5.4151, "loss/crossentropy": 2.336798667907715, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.15607047453522682, "step": 9390 }, { "epoch": 0.4269090909090909, "grad_norm": 5.6875, "grad_norm_var": 0.05455322265625, "learning_rate": 0.0001, "loss": 5.7374, "loss/crossentropy": 2.47582870721817, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.16952073574066162, "step": 9392 }, { "epoch": 0.427, "grad_norm": 4.6875, "grad_norm_var": 0.06964518229166666, "learning_rate": 0.0001, "loss": 5.7653, "loss/crossentropy": 2.606525421142578, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16666308045387268, "step": 9394 }, { "epoch": 0.4270909090909091, "grad_norm": 5.25, "grad_norm_var": 0.08033854166666667, "learning_rate": 0.0001, "loss": 5.5585, "loss/crossentropy": 2.4157012701034546, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.15940043330192566, "step": 9396 }, { "epoch": 0.42718181818181816, "grad_norm": 5.625, "grad_norm_var": 0.09798177083333333, "learning_rate": 0.0001, "loss": 6.1099, "loss/crossentropy": 2.769762396812439, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.18264954537153244, "step": 9398 }, { "epoch": 0.42727272727272725, "grad_norm": 5.53125, "grad_norm_var": 0.11578369140625, "learning_rate": 0.0001, "loss": 5.1947, "loss/crossentropy": 2.1881246268749237, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.14967748895287514, "step": 9400 }, { "epoch": 0.4273636363636364, "grad_norm": 5.5, "grad_norm_var": 0.11698811848958333, "learning_rate": 0.0001, "loss": 5.9733, "loss/crossentropy": 2.54227876663208, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.18899596109986305, "step": 9402 }, { "epoch": 0.4274545454545455, "grad_norm": 5.0625, "grad_norm_var": 0.10940348307291667, "learning_rate": 0.0001, "loss": 5.4639, "loss/crossentropy": 2.347598612308502, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16162975877523422, "step": 9404 }, { "epoch": 0.42754545454545456, "grad_norm": 4.84375, "grad_norm_var": 0.11573893229166667, "learning_rate": 0.0001, "loss": 5.7098, "loss/crossentropy": 2.5088696479797363, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16912134736776352, "step": 9406 }, { "epoch": 0.42763636363636365, "grad_norm": 4.84375, "grad_norm_var": 0.105712890625, "learning_rate": 0.0001, "loss": 5.8355, "loss/crossentropy": 2.630889654159546, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1698785237967968, "step": 9408 }, { "epoch": 0.42772727272727273, "grad_norm": 5.125, "grad_norm_var": 0.09842122395833333, "learning_rate": 0.0001, "loss": 5.8322, "loss/crossentropy": 2.5789555311203003, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17376244440674782, "step": 9410 }, { "epoch": 0.4278181818181818, "grad_norm": 4.71875, "grad_norm_var": 0.10909830729166667, "learning_rate": 0.0001, "loss": 5.8547, "loss/crossentropy": 2.57583224773407, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17280520498752594, "step": 9412 }, { "epoch": 0.4279090909090909, "grad_norm": 4.90625, "grad_norm_var": 0.0923828125, "learning_rate": 0.0001, "loss": 5.8726, "loss/crossentropy": 2.660696268081665, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16884420812129974, "step": 9414 }, { "epoch": 0.428, "grad_norm": 4.4375, "grad_norm_var": 0.108837890625, "learning_rate": 0.0001, "loss": 5.2792, "loss/crossentropy": 2.308299243450165, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15021934732794762, "step": 9416 }, { "epoch": 0.4280909090909091, "grad_norm": 5.4375, "grad_norm_var": 0.10129801432291667, "learning_rate": 0.0001, "loss": 5.4768, "loss/crossentropy": 2.3345141410827637, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1630578450858593, "step": 9418 }, { "epoch": 0.42818181818181816, "grad_norm": 5.375, "grad_norm_var": 0.11653238932291667, "learning_rate": 0.0001, "loss": 5.6678, "loss/crossentropy": 2.4846096634864807, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16518981754779816, "step": 9420 }, { "epoch": 0.42827272727272725, "grad_norm": 5.375, "grad_norm_var": 0.12591145833333334, "learning_rate": 0.0001, "loss": 6.0976, "loss/crossentropy": 2.7326200008392334, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18180741742253304, "step": 9422 }, { "epoch": 0.4283636363636364, "grad_norm": 4.625, "grad_norm_var": 0.13915608723958334, "learning_rate": 0.0001, "loss": 5.4487, "loss/crossentropy": 2.387654185295105, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1580572985112667, "step": 9424 }, { "epoch": 0.4284545454545455, "grad_norm": 5.34375, "grad_norm_var": 0.1560546875, "learning_rate": 0.0001, "loss": 5.4865, "loss/crossentropy": 2.3490604758262634, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1633523367345333, "step": 9426 }, { "epoch": 0.42854545454545456, "grad_norm": 5.125, "grad_norm_var": 0.13723958333333333, "learning_rate": 0.0001, "loss": 6.0712, "loss/crossentropy": 2.746830940246582, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.18028756603598595, "step": 9428 }, { "epoch": 0.42863636363636365, "grad_norm": 5.125, "grad_norm_var": 0.14973958333333334, "learning_rate": 0.0001, "loss": 5.7372, "loss/crossentropy": 2.5505874156951904, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16709518805146217, "step": 9430 }, { "epoch": 0.42872727272727273, "grad_norm": 5.65625, "grad_norm_var": 0.11015218098958333, "learning_rate": 0.0001, "loss": 6.2259, "loss/crossentropy": 2.840523838996887, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18307022377848625, "step": 9432 }, { "epoch": 0.4288181818181818, "grad_norm": 4.5, "grad_norm_var": 0.12919514973958332, "learning_rate": 0.0001, "loss": 5.3841, "loss/crossentropy": 2.3776785135269165, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15103180333971977, "step": 9434 }, { "epoch": 0.4289090909090909, "grad_norm": 10.8125, "grad_norm_var": 2.171858723958333, "learning_rate": 0.0001, "loss": 5.7474, "loss/crossentropy": 2.363158941268921, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.1835363805294037, "step": 9436 }, { "epoch": 0.429, "grad_norm": 5.15625, "grad_norm_var": 2.176070149739583, "learning_rate": 0.0001, "loss": 5.9061, "loss/crossentropy": 2.6599615812301636, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17500338703393936, "step": 9438 }, { "epoch": 0.4290909090909091, "grad_norm": 4.78125, "grad_norm_var": 2.13365478515625, "learning_rate": 0.0001, "loss": 5.8491, "loss/crossentropy": 2.6223285496234894, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17268149554729462, "step": 9440 }, { "epoch": 0.42918181818181816, "grad_norm": 4.90625, "grad_norm_var": 2.09742431640625, "learning_rate": 0.0001, "loss": 6.1632, "loss/crossentropy": 2.879313826560974, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.17585180699825287, "step": 9442 }, { "epoch": 0.42927272727272725, "grad_norm": 7.25, "grad_norm_var": 2.334794108072917, "learning_rate": 0.0001, "loss": 5.865, "loss/crossentropy": 2.61386239528656, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17491448670625687, "step": 9444 }, { "epoch": 0.42936363636363634, "grad_norm": 5.03125, "grad_norm_var": 2.377799479166667, "learning_rate": 0.0001, "loss": 5.2678, "loss/crossentropy": 2.2488425374031067, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.1481861062347889, "step": 9446 }, { "epoch": 0.4294545454545455, "grad_norm": 5.0625, "grad_norm_var": 2.386962890625, "learning_rate": 0.0001, "loss": 5.6931, "loss/crossentropy": 2.469967007637024, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1715327985584736, "step": 9448 }, { "epoch": 0.42954545454545456, "grad_norm": 5.125, "grad_norm_var": 2.30078125, "learning_rate": 0.0001, "loss": 5.9214, "loss/crossentropy": 2.61772221326828, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1768522970378399, "step": 9450 }, { "epoch": 0.42963636363636365, "grad_norm": 5.09375, "grad_norm_var": 0.4534505208333333, "learning_rate": 0.0001, "loss": 6.0112, "loss/crossentropy": 2.6571194529533386, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18110715597867966, "step": 9452 }, { "epoch": 0.42972727272727274, "grad_norm": 5.90625, "grad_norm_var": 0.49518229166666666, "learning_rate": 0.0001, "loss": 5.8704, "loss/crossentropy": 2.650025963783264, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.17067304253578186, "step": 9454 }, { "epoch": 0.4298181818181818, "grad_norm": 5.84375, "grad_norm_var": 0.4950358072916667, "learning_rate": 0.0001, "loss": 5.8429, "loss/crossentropy": 2.625036358833313, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1712004393339157, "step": 9456 }, { "epoch": 0.4299090909090909, "grad_norm": 5.125, "grad_norm_var": 0.47652587890625, "learning_rate": 0.0001, "loss": 5.539, "loss/crossentropy": 2.415967881679535, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1619114987552166, "step": 9458 }, { "epoch": 0.43, "grad_norm": 5.03125, "grad_norm_var": 0.26360677083333334, "learning_rate": 0.0001, "loss": 5.6032, "loss/crossentropy": 2.456184506416321, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.161768838763237, "step": 9460 }, { "epoch": 0.4300909090909091, "grad_norm": 5.53125, "grad_norm_var": 0.19455973307291666, "learning_rate": 0.0001, "loss": 5.7517, "loss/crossentropy": 2.4702277779579163, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17268286272883415, "step": 9462 }, { "epoch": 0.43018181818181817, "grad_norm": 6.03125, "grad_norm_var": 0.22771809895833334, "learning_rate": 0.0001, "loss": 5.7619, "loss/crossentropy": 2.571035146713257, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16537529975175858, "step": 9464 }, { "epoch": 0.43027272727272725, "grad_norm": 5.09375, "grad_norm_var": 0.22974853515625, "learning_rate": 0.0001, "loss": 5.9288, "loss/crossentropy": 2.6428269147872925, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17937494069337845, "step": 9466 }, { "epoch": 0.43036363636363634, "grad_norm": 5.625, "grad_norm_var": 0.22018229166666667, "learning_rate": 0.0001, "loss": 5.5445, "loss/crossentropy": 2.43057844042778, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.15885576233267784, "step": 9468 }, { "epoch": 0.4304545454545455, "grad_norm": 4.9375, "grad_norm_var": 0.18336181640625, "learning_rate": 0.0001, "loss": 5.7493, "loss/crossentropy": 2.4530009031295776, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17631122469902039, "step": 9470 }, { "epoch": 0.43054545454545456, "grad_norm": 6.03125, "grad_norm_var": 0.20054931640625, "learning_rate": 0.0001, "loss": 5.5135, "loss/crossentropy": 2.281982570886612, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1700298897922039, "step": 9472 }, { "epoch": 0.43063636363636365, "grad_norm": 4.84375, "grad_norm_var": 0.191259765625, "learning_rate": 0.0001, "loss": 5.4156, "loss/crossentropy": 2.326467275619507, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.15695633739233017, "step": 9474 }, { "epoch": 0.43072727272727274, "grad_norm": 4.75, "grad_norm_var": 0.1818359375, "learning_rate": 0.0001, "loss": 5.5473, "loss/crossentropy": 2.4194228053092957, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16317948698997498, "step": 9476 }, { "epoch": 0.4308181818181818, "grad_norm": 4.875, "grad_norm_var": 0.18668212890625, "learning_rate": 0.0001, "loss": 5.2636, "loss/crossentropy": 2.214362919330597, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.15179990231990814, "step": 9478 }, { "epoch": 0.4309090909090909, "grad_norm": 5.03125, "grad_norm_var": 0.1361328125, "learning_rate": 0.0001, "loss": 5.7053, "loss/crossentropy": 2.5424755215644836, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16550591588020325, "step": 9480 }, { "epoch": 0.431, "grad_norm": 5.0625, "grad_norm_var": 0.13007405598958333, "learning_rate": 0.0001, "loss": 5.5995, "loss/crossentropy": 2.3986103534698486, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16852675005793571, "step": 9482 }, { "epoch": 0.4310909090909091, "grad_norm": 4.8125, "grad_norm_var": 0.10637613932291666, "learning_rate": 0.0001, "loss": 5.7229, "loss/crossentropy": 2.498221695423126, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.1703166849911213, "step": 9484 }, { "epoch": 0.43118181818181817, "grad_norm": 4.5, "grad_norm_var": 0.12574462890625, "learning_rate": 0.0001, "loss": 5.344, "loss/crossentropy": 2.28619521856308, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15715161710977554, "step": 9486 }, { "epoch": 0.43127272727272725, "grad_norm": 5.21875, "grad_norm_var": 0.058394368489583334, "learning_rate": 0.0001, "loss": 5.6248, "loss/crossentropy": 2.4217090010643005, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16894202306866646, "step": 9488 }, { "epoch": 0.43136363636363634, "grad_norm": 5.1875, "grad_norm_var": 0.061962890625, "learning_rate": 0.0001, "loss": 5.5984, "loss/crossentropy": 2.4397332072257996, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16391151770949364, "step": 9490 }, { "epoch": 0.4314545454545455, "grad_norm": 5.21875, "grad_norm_var": 0.07131754557291667, "learning_rate": 0.0001, "loss": 5.5844, "loss/crossentropy": 2.467136800289154, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16231400147080421, "step": 9492 }, { "epoch": 0.43154545454545457, "grad_norm": 5.03125, "grad_norm_var": 0.06903889973958334, "learning_rate": 0.0001, "loss": 5.844, "loss/crossentropy": 2.570401608943939, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17541120201349258, "step": 9494 }, { "epoch": 0.43163636363636365, "grad_norm": 4.90625, "grad_norm_var": 0.08850504557291666, "learning_rate": 0.0001, "loss": 5.5374, "loss/crossentropy": 2.401064932346344, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16402899846434593, "step": 9496 }, { "epoch": 0.43172727272727274, "grad_norm": 4.9375, "grad_norm_var": 0.08385009765625, "learning_rate": 0.0001, "loss": 5.6913, "loss/crossentropy": 2.4919530153274536, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.1685711070895195, "step": 9498 }, { "epoch": 0.4318181818181818, "grad_norm": 5.0625, "grad_norm_var": 0.07467447916666667, "learning_rate": 0.0001, "loss": 5.8496, "loss/crossentropy": 2.6179198622703552, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16964831575751305, "step": 9500 }, { "epoch": 0.4319090909090909, "grad_norm": 5.4375, "grad_norm_var": 0.08450113932291667, "learning_rate": 0.0001, "loss": 6.194, "loss/crossentropy": 2.7930731177330017, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18813833594322205, "step": 9502 }, { "epoch": 0.432, "grad_norm": 4.78125, "grad_norm_var": 0.08843994140625, "learning_rate": 0.0001, "loss": 5.6198, "loss/crossentropy": 2.496462821960449, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.1578432247042656, "step": 9504 }, { "epoch": 0.4320909090909091, "grad_norm": 4.84375, "grad_norm_var": 0.10165608723958333, "learning_rate": 0.0001, "loss": 5.0447, "loss/crossentropy": 2.096149653196335, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.14621901884675026, "step": 9506 }, { "epoch": 0.43218181818181817, "grad_norm": 7.34375, "grad_norm_var": 0.46773681640625, "learning_rate": 0.0001, "loss": 5.4757, "loss/crossentropy": 2.3780962228775024, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1554652601480484, "step": 9508 }, { "epoch": 0.43227272727272725, "grad_norm": 5.375, "grad_norm_var": 0.46663004557291665, "learning_rate": 0.0001, "loss": 5.4395, "loss/crossentropy": 2.341642677783966, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15939155593514442, "step": 9510 }, { "epoch": 0.43236363636363634, "grad_norm": 5.46875, "grad_norm_var": 0.4511027018229167, "learning_rate": 0.0001, "loss": 5.6782, "loss/crossentropy": 2.4553165435791016, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16916094347834587, "step": 9512 }, { "epoch": 0.4324545454545455, "grad_norm": 5.3125, "grad_norm_var": 0.4376912434895833, "learning_rate": 0.0001, "loss": 5.5285, "loss/crossentropy": 2.357685685157776, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.16141431778669357, "step": 9514 }, { "epoch": 0.43254545454545457, "grad_norm": 5.34375, "grad_norm_var": 0.4563802083333333, "learning_rate": 0.0001, "loss": 6.1304, "loss/crossentropy": 2.8219074010849, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18006975576281548, "step": 9516 }, { "epoch": 0.43263636363636365, "grad_norm": 5.34375, "grad_norm_var": 0.45510660807291664, "learning_rate": 0.0001, "loss": 5.3103, "loss/crossentropy": 2.158522605895996, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1645917072892189, "step": 9518 }, { "epoch": 0.43272727272727274, "grad_norm": 5.03125, "grad_norm_var": 0.43453369140625, "learning_rate": 0.0001, "loss": 5.4969, "loss/crossentropy": 2.3196456730365753, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16401061043143272, "step": 9520 }, { "epoch": 0.4328181818181818, "grad_norm": 4.96875, "grad_norm_var": 0.37967122395833336, "learning_rate": 0.0001, "loss": 5.7384, "loss/crossentropy": 2.493756115436554, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.16997667774558067, "step": 9522 }, { "epoch": 0.4329090909090909, "grad_norm": 4.9375, "grad_norm_var": 0.09709879557291666, "learning_rate": 0.0001, "loss": 5.3683, "loss/crossentropy": 2.221048891544342, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16277290135622025, "step": 9524 }, { "epoch": 0.433, "grad_norm": 4.78125, "grad_norm_var": 0.10689697265625, "learning_rate": 0.0001, "loss": 5.3245, "loss/crossentropy": 2.248598098754883, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.15700503066182137, "step": 9526 }, { "epoch": 0.4330909090909091, "grad_norm": 5.09375, "grad_norm_var": 0.10386962890625, "learning_rate": 0.0001, "loss": 5.6513, "loss/crossentropy": 2.4488733410835266, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.1645764485001564, "step": 9528 }, { "epoch": 0.43318181818181817, "grad_norm": 5.53125, "grad_norm_var": 0.1265625, "learning_rate": 0.0001, "loss": 5.7823, "loss/crossentropy": 2.571406275033951, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16855034604668617, "step": 9530 }, { "epoch": 0.43327272727272725, "grad_norm": 4.46875, "grad_norm_var": 0.10670166015625, "learning_rate": 0.0001, "loss": 5.5159, "loss/crossentropy": 2.4114567637443542, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16004907712340355, "step": 9532 }, { "epoch": 0.43336363636363634, "grad_norm": 5.53125, "grad_norm_var": 0.14358317057291667, "learning_rate": 0.0001, "loss": 5.9712, "loss/crossentropy": 2.6108365654945374, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1809600442647934, "step": 9534 }, { "epoch": 0.4334545454545455, "grad_norm": 5.25, "grad_norm_var": 0.117822265625, "learning_rate": 0.0001, "loss": 5.6451, "loss/crossentropy": 2.4699226021766663, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1657620370388031, "step": 9536 }, { "epoch": 0.43354545454545457, "grad_norm": 6.34375, "grad_norm_var": 0.20675455729166667, "learning_rate": 0.0001, "loss": 5.8445, "loss/crossentropy": 2.546162962913513, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17671144753694534, "step": 9538 }, { "epoch": 0.43363636363636365, "grad_norm": 5.0, "grad_norm_var": 0.20026041666666666, "learning_rate": 0.0001, "loss": 5.6892, "loss/crossentropy": 2.4998494386672974, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16874190047383308, "step": 9540 }, { "epoch": 0.43372727272727274, "grad_norm": 4.84375, "grad_norm_var": 0.19830322265625, "learning_rate": 0.0001, "loss": 5.3917, "loss/crossentropy": 2.30352059006691, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.15569359436631203, "step": 9542 }, { "epoch": 0.4338181818181818, "grad_norm": 5.15625, "grad_norm_var": 0.197509765625, "learning_rate": 0.0001, "loss": 5.8598, "loss/crossentropy": 2.643075406551361, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17010513693094254, "step": 9544 }, { "epoch": 0.4339090909090909, "grad_norm": 4.8125, "grad_norm_var": 0.18121337890625, "learning_rate": 0.0001, "loss": 5.1954, "loss/crossentropy": 2.143805652856827, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.15203788876533508, "step": 9546 }, { "epoch": 0.434, "grad_norm": 5.25, "grad_norm_var": 0.143212890625, "learning_rate": 0.0001, "loss": 5.8134, "loss/crossentropy": 2.608206629753113, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1699323207139969, "step": 9548 }, { "epoch": 0.4340909090909091, "grad_norm": 5.53125, "grad_norm_var": 0.27394205729166665, "learning_rate": 0.0001, "loss": 5.2665, "loss/crossentropy": 2.202328681945801, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.15485251881182194, "step": 9550 }, { "epoch": 0.43418181818181817, "grad_norm": 5.21875, "grad_norm_var": 0.27301025390625, "learning_rate": 0.0001, "loss": 5.8495, "loss/crossentropy": 2.59857976436615, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17294810339808464, "step": 9552 }, { "epoch": 0.43427272727272725, "grad_norm": 5.84375, "grad_norm_var": 0.21730143229166668, "learning_rate": 0.0001, "loss": 5.7992, "loss/crossentropy": 2.5837690234184265, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17076434195041656, "step": 9554 }, { "epoch": 0.43436363636363634, "grad_norm": 4.53125, "grad_norm_var": 0.25051676432291664, "learning_rate": 0.0001, "loss": 5.6312, "loss/crossentropy": 2.5165997743606567, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.15989487245678902, "step": 9556 }, { "epoch": 0.4344545454545455, "grad_norm": 5.46875, "grad_norm_var": 0.24622395833333333, "learning_rate": 0.0001, "loss": 5.684, "loss/crossentropy": 2.493964374065399, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1688125617802143, "step": 9558 }, { "epoch": 0.43454545454545457, "grad_norm": 5.75, "grad_norm_var": 0.26327718098958336, "learning_rate": 0.0001, "loss": 5.7998, "loss/crossentropy": 2.480831563472748, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.1746712625026703, "step": 9560 }, { "epoch": 0.43463636363636365, "grad_norm": 5.0625, "grad_norm_var": 0.24797770182291667, "learning_rate": 0.0001, "loss": 5.6124, "loss/crossentropy": 2.384292960166931, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.16773243248462677, "step": 9562 }, { "epoch": 0.43472727272727274, "grad_norm": 5.28125, "grad_norm_var": 0.2506510416666667, "learning_rate": 0.0001, "loss": 6.041, "loss/crossentropy": 2.721573054790497, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.18057656660676003, "step": 9564 }, { "epoch": 0.4348181818181818, "grad_norm": 5.03125, "grad_norm_var": 0.12076416015625, "learning_rate": 0.0001, "loss": 5.5328, "loss/crossentropy": 2.3497208952903748, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16752298921346664, "step": 9566 }, { "epoch": 0.4349090909090909, "grad_norm": 6.15625, "grad_norm_var": 0.1923828125, "learning_rate": 0.0001, "loss": 5.1491, "loss/crossentropy": 2.0675994753837585, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.15639013051986694, "step": 9568 }, { "epoch": 0.435, "grad_norm": 5.25, "grad_norm_var": 0.22721354166666666, "learning_rate": 0.0001, "loss": 5.4352, "loss/crossentropy": 2.2472200989723206, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.16371814534068108, "step": 9570 }, { "epoch": 0.4350909090909091, "grad_norm": 4.90625, "grad_norm_var": 0.200634765625, "learning_rate": 0.0001, "loss": 5.8179, "loss/crossentropy": 2.579411566257477, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1699388101696968, "step": 9572 }, { "epoch": 0.43518181818181817, "grad_norm": 5.21875, "grad_norm_var": 0.19842122395833334, "learning_rate": 0.0001, "loss": 5.5767, "loss/crossentropy": 2.412045121192932, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16665753722190857, "step": 9574 }, { "epoch": 0.43527272727272726, "grad_norm": 5.6875, "grad_norm_var": 0.18381754557291666, "learning_rate": 0.0001, "loss": 5.8961, "loss/crossentropy": 2.6190112233161926, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.1732134036719799, "step": 9576 }, { "epoch": 0.43536363636363634, "grad_norm": 4.84375, "grad_norm_var": 0.23917643229166666, "learning_rate": 0.0001, "loss": 5.7899, "loss/crossentropy": 2.4932940006256104, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17849094048142433, "step": 9578 }, { "epoch": 0.4354545454545454, "grad_norm": 5.0625, "grad_norm_var": 0.25774739583333334, "learning_rate": 0.0001, "loss": 6.0412, "loss/crossentropy": 2.650846242904663, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18512507155537605, "step": 9580 }, { "epoch": 0.43554545454545457, "grad_norm": 5.125, "grad_norm_var": 0.26165364583333334, "learning_rate": 0.0001, "loss": 5.6455, "loss/crossentropy": 2.4770203232765198, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16586796939373016, "step": 9582 }, { "epoch": 0.43563636363636365, "grad_norm": 4.84375, "grad_norm_var": 0.20935872395833333, "learning_rate": 0.0001, "loss": 5.6857, "loss/crossentropy": 2.4276500940322876, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17346571758389473, "step": 9584 }, { "epoch": 0.43572727272727274, "grad_norm": 4.84375, "grad_norm_var": 0.16151936848958334, "learning_rate": 0.0001, "loss": 5.6231, "loss/crossentropy": 2.455300450325012, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.16345801949501038, "step": 9586 }, { "epoch": 0.4358181818181818, "grad_norm": 4.75, "grad_norm_var": 0.168994140625, "learning_rate": 0.0001, "loss": 5.44, "loss/crossentropy": 2.2760313749313354, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16463636606931686, "step": 9588 }, { "epoch": 0.4359090909090909, "grad_norm": 5.125, "grad_norm_var": 0.163671875, "learning_rate": 0.0001, "loss": 5.6598, "loss/crossentropy": 2.4891793727874756, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16608456894755363, "step": 9590 }, { "epoch": 0.436, "grad_norm": 5.0625, "grad_norm_var": 0.16682535807291668, "learning_rate": 0.0001, "loss": 5.6505, "loss/crossentropy": 2.4318345189094543, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.1704983115196228, "step": 9592 }, { "epoch": 0.4360909090909091, "grad_norm": 5.1875, "grad_norm_var": 0.12528889973958332, "learning_rate": 0.0001, "loss": 5.7258, "loss/crossentropy": 2.541464328765869, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16706737875938416, "step": 9594 }, { "epoch": 0.43618181818181817, "grad_norm": 5.53125, "grad_norm_var": 0.09794514973958333, "learning_rate": 0.0001, "loss": 6.0087, "loss/crossentropy": 2.7182486057281494, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17396695911884308, "step": 9596 }, { "epoch": 0.43627272727272726, "grad_norm": 5.75, "grad_norm_var": 0.11194254557291666, "learning_rate": 0.0001, "loss": 5.5341, "loss/crossentropy": 2.4003376364707947, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16337702795863152, "step": 9598 }, { "epoch": 0.43636363636363634, "grad_norm": 5.25, "grad_norm_var": 0.100634765625, "learning_rate": 0.0001, "loss": 5.8476, "loss/crossentropy": 2.5948551297187805, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.17586364224553108, "step": 9600 }, { "epoch": 0.43645454545454543, "grad_norm": 5.25, "grad_norm_var": 0.120556640625, "learning_rate": 0.0001, "loss": 5.0133, "loss/crossentropy": 2.0626597702503204, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.14487284049391747, "step": 9602 }, { "epoch": 0.43654545454545457, "grad_norm": 5.0, "grad_norm_var": 0.11571858723958334, "learning_rate": 0.0001, "loss": 5.683, "loss/crossentropy": 2.519709527492523, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1628178469836712, "step": 9604 }, { "epoch": 0.43663636363636366, "grad_norm": 5.25, "grad_norm_var": 0.11612955729166667, "learning_rate": 0.0001, "loss": 5.942, "loss/crossentropy": 2.6474741101264954, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17613377794623375, "step": 9606 }, { "epoch": 0.43672727272727274, "grad_norm": 5.125, "grad_norm_var": 0.09605712890625, "learning_rate": 0.0001, "loss": 5.8376, "loss/crossentropy": 2.5513583421707153, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1786275915801525, "step": 9608 }, { "epoch": 0.4368181818181818, "grad_norm": 5.53125, "grad_norm_var": 0.687890625, "learning_rate": 0.0001, "loss": 5.7239, "loss/crossentropy": 2.4741432666778564, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17106809094548225, "step": 9610 }, { "epoch": 0.4369090909090909, "grad_norm": 5.03125, "grad_norm_var": 0.689697265625, "learning_rate": 0.0001, "loss": 5.7318, "loss/crossentropy": 2.5162060260772705, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.1690186709165573, "step": 9612 }, { "epoch": 0.437, "grad_norm": 5.3125, "grad_norm_var": 0.6770670572916667, "learning_rate": 0.0001, "loss": 5.9053, "loss/crossentropy": 2.6415266394615173, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.17461784929037094, "step": 9614 }, { "epoch": 0.4370909090909091, "grad_norm": 6.21875, "grad_norm_var": 0.7364542643229167, "learning_rate": 0.0001, "loss": 5.7363, "loss/crossentropy": 2.483099639415741, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1723945066332817, "step": 9616 }, { "epoch": 0.43718181818181817, "grad_norm": 5.15625, "grad_norm_var": 0.6876953125, "learning_rate": 0.0001, "loss": 5.3764, "loss/crossentropy": 2.2349845468997955, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16023627296090126, "step": 9618 }, { "epoch": 0.43727272727272726, "grad_norm": 5.1875, "grad_norm_var": 0.6632649739583333, "learning_rate": 0.0001, "loss": 5.7325, "loss/crossentropy": 2.5174434781074524, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16955554857850075, "step": 9620 }, { "epoch": 0.43736363636363634, "grad_norm": 4.78125, "grad_norm_var": 0.7275349934895833, "learning_rate": 0.0001, "loss": 5.3633, "loss/crossentropy": 2.221191108226776, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.15971453860402107, "step": 9622 }, { "epoch": 0.43745454545454543, "grad_norm": 5.625, "grad_norm_var": 0.716650390625, "learning_rate": 0.0001, "loss": 6.1715, "loss/crossentropy": 2.739165723323822, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.18717597052454948, "step": 9624 }, { "epoch": 0.43754545454545457, "grad_norm": 4.6875, "grad_norm_var": 0.167822265625, "learning_rate": 0.0001, "loss": 5.8116, "loss/crossentropy": 2.5650784969329834, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.17211562767624855, "step": 9626 }, { "epoch": 0.43763636363636366, "grad_norm": 5.21875, "grad_norm_var": 0.16461181640625, "learning_rate": 0.0001, "loss": 5.5758, "loss/crossentropy": 2.4355886578559875, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1610911749303341, "step": 9628 }, { "epoch": 0.43772727272727274, "grad_norm": 4.75, "grad_norm_var": 0.1890625, "learning_rate": 0.0001, "loss": 5.6855, "loss/crossentropy": 2.537950098514557, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1670982800424099, "step": 9630 }, { "epoch": 0.43781818181818183, "grad_norm": 5.5625, "grad_norm_var": 0.13162434895833333, "learning_rate": 0.0001, "loss": 5.5059, "loss/crossentropy": 2.3598171174526215, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16186905279755592, "step": 9632 }, { "epoch": 0.4379090909090909, "grad_norm": 5.25, "grad_norm_var": 0.123681640625, "learning_rate": 0.0001, "loss": 5.9324, "loss/crossentropy": 2.67038893699646, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1750280037522316, "step": 9634 }, { "epoch": 0.438, "grad_norm": 5.03125, "grad_norm_var": 0.12496337890625, "learning_rate": 0.0001, "loss": 5.7907, "loss/crossentropy": 2.4938968420028687, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17616066709160805, "step": 9636 }, { "epoch": 0.4380909090909091, "grad_norm": 4.84375, "grad_norm_var": 0.117431640625, "learning_rate": 0.0001, "loss": 5.6832, "loss/crossentropy": 2.5409653186798096, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16461192071437836, "step": 9638 }, { "epoch": 0.4381818181818182, "grad_norm": 6.125, "grad_norm_var": 0.1419921875, "learning_rate": 0.0001, "loss": 5.54, "loss/crossentropy": 2.3533395528793335, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1651514247059822, "step": 9640 }, { "epoch": 0.43827272727272726, "grad_norm": 5.3125, "grad_norm_var": 0.12967122395833333, "learning_rate": 0.0001, "loss": 5.8607, "loss/crossentropy": 2.6573344469070435, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16759736090898514, "step": 9642 }, { "epoch": 0.43836363636363634, "grad_norm": 5.1875, "grad_norm_var": 0.12860921223958333, "learning_rate": 0.0001, "loss": 5.4946, "loss/crossentropy": 2.3155439496040344, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1643924079835415, "step": 9644 }, { "epoch": 0.43845454545454543, "grad_norm": 4.90625, "grad_norm_var": 0.11428629557291667, "learning_rate": 0.0001, "loss": 5.461, "loss/crossentropy": 2.361731141805649, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1591457687318325, "step": 9646 }, { "epoch": 0.43854545454545457, "grad_norm": 9.6875, "grad_norm_var": 1.3724894205729166, "learning_rate": 0.0001, "loss": 5.2045, "loss/crossentropy": 2.1812667548656464, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15231901034712791, "step": 9648 }, { "epoch": 0.43863636363636366, "grad_norm": 5.21875, "grad_norm_var": 1.3550618489583333, "learning_rate": 0.0001, "loss": 5.913, "loss/crossentropy": 2.630346417427063, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.17728877812623978, "step": 9650 }, { "epoch": 0.43872727272727274, "grad_norm": 4.84375, "grad_norm_var": 1.38590087890625, "learning_rate": 0.0001, "loss": 5.6141, "loss/crossentropy": 2.43847393989563, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1652217134833336, "step": 9652 }, { "epoch": 0.43881818181818183, "grad_norm": 5.46875, "grad_norm_var": 1.36002197265625, "learning_rate": 0.0001, "loss": 5.7912, "loss/crossentropy": 2.522170126438141, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1718214638531208, "step": 9654 }, { "epoch": 0.4389090909090909, "grad_norm": 4.96875, "grad_norm_var": 1.34547119140625, "learning_rate": 0.0001, "loss": 5.8863, "loss/crossentropy": 2.625618100166321, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17411759868264198, "step": 9656 }, { "epoch": 0.439, "grad_norm": 5.0, "grad_norm_var": 1.388134765625, "learning_rate": 0.0001, "loss": 5.4414, "loss/crossentropy": 2.378608465194702, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15666572004556656, "step": 9658 }, { "epoch": 0.4390909090909091, "grad_norm": 4.40625, "grad_norm_var": 1.44459228515625, "learning_rate": 0.0001, "loss": 5.1644, "loss/crossentropy": 2.191171020269394, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.14888828247785568, "step": 9660 }, { "epoch": 0.4391818181818182, "grad_norm": 4.96875, "grad_norm_var": 1.4676920572916667, "learning_rate": 0.0001, "loss": 5.8612, "loss/crossentropy": 2.603557825088501, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17459533736109734, "step": 9662 }, { "epoch": 0.43927272727272726, "grad_norm": 5.09375, "grad_norm_var": 0.146875, "learning_rate": 0.0001, "loss": 5.6216, "loss/crossentropy": 2.399798274040222, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.17120468616485596, "step": 9664 }, { "epoch": 0.43936363636363635, "grad_norm": 4.75, "grad_norm_var": 0.13629150390625, "learning_rate": 0.0001, "loss": 5.4003, "loss/crossentropy": 2.2965644001960754, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.15763698518276215, "step": 9666 }, { "epoch": 0.43945454545454543, "grad_norm": 5.09375, "grad_norm_var": 0.14550374348958334, "learning_rate": 0.0001, "loss": 6.1497, "loss/crossentropy": 2.7819154858589172, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.18307092785835266, "step": 9668 }, { "epoch": 0.4395454545454546, "grad_norm": 4.96875, "grad_norm_var": 0.12851155598958333, "learning_rate": 0.0001, "loss": 5.1548, "loss/crossentropy": 2.1297472417354584, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.14743154495954514, "step": 9670 }, { "epoch": 0.43963636363636366, "grad_norm": 5.03125, "grad_norm_var": 0.12928059895833333, "learning_rate": 0.0001, "loss": 5.4083, "loss/crossentropy": 2.2279835641384125, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16529446840286255, "step": 9672 }, { "epoch": 0.43972727272727274, "grad_norm": 5.125, "grad_norm_var": 0.12512613932291666, "learning_rate": 0.0001, "loss": 5.8404, "loss/crossentropy": 2.5725335478782654, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.17385567724704742, "step": 9674 }, { "epoch": 0.43981818181818183, "grad_norm": 5.53125, "grad_norm_var": 0.101171875, "learning_rate": 0.0001, "loss": 5.5638, "loss/crossentropy": 2.4309073388576508, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1632910706102848, "step": 9676 }, { "epoch": 0.4399090909090909, "grad_norm": 5.125, "grad_norm_var": 0.051070149739583334, "learning_rate": 0.0001, "loss": 5.5392, "loss/crossentropy": 2.3624666333198547, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.162397138774395, "step": 9678 }, { "epoch": 0.44, "grad_norm": 5.25, "grad_norm_var": 0.04698893229166667, "learning_rate": 0.0001, "loss": 5.6736, "loss/crossentropy": 2.484287977218628, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1665872260928154, "step": 9680 }, { "epoch": 0.4400909090909091, "grad_norm": 5.4375, "grad_norm_var": 0.06328125, "learning_rate": 0.0001, "loss": 5.6257, "loss/crossentropy": 2.400183916091919, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1674778312444687, "step": 9682 }, { "epoch": 0.4401818181818182, "grad_norm": 5.40625, "grad_norm_var": 0.06259358723958333, "learning_rate": 0.0001, "loss": 5.7515, "loss/crossentropy": 2.4905447363853455, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17394719272851944, "step": 9684 }, { "epoch": 0.44027272727272726, "grad_norm": 5.5625, "grad_norm_var": 0.056929524739583334, "learning_rate": 0.0001, "loss": 6.1001, "loss/crossentropy": 2.768555521965027, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.18139224126935005, "step": 9686 }, { "epoch": 0.44036363636363635, "grad_norm": 5.375, "grad_norm_var": 0.051806640625, "learning_rate": 0.0001, "loss": 6.1359, "loss/crossentropy": 2.7748007774353027, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18337879329919815, "step": 9688 }, { "epoch": 0.44045454545454543, "grad_norm": 5.34375, "grad_norm_var": 0.04778645833333333, "learning_rate": 0.0001, "loss": 5.5992, "loss/crossentropy": 2.429988145828247, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16652629896998405, "step": 9690 }, { "epoch": 0.4405454545454545, "grad_norm": 4.625, "grad_norm_var": 0.06887613932291667, "learning_rate": 0.0001, "loss": 5.8612, "loss/crossentropy": 2.697149932384491, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16699177399277687, "step": 9692 }, { "epoch": 0.44063636363636366, "grad_norm": 4.5625, "grad_norm_var": 0.10393473307291666, "learning_rate": 0.0001, "loss": 5.5011, "loss/crossentropy": 2.3980730772018433, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1595175936818123, "step": 9694 }, { "epoch": 0.44072727272727275, "grad_norm": 5.40625, "grad_norm_var": 0.11523030598958334, "learning_rate": 0.0001, "loss": 5.7302, "loss/crossentropy": 2.5300907492637634, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17079024389386177, "step": 9696 }, { "epoch": 0.44081818181818183, "grad_norm": 5.34375, "grad_norm_var": 0.08840738932291667, "learning_rate": 0.0001, "loss": 5.9606, "loss/crossentropy": 2.672847092151642, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17721064016222954, "step": 9698 }, { "epoch": 0.4409090909090909, "grad_norm": 9.75, "grad_norm_var": 1.4157389322916667, "learning_rate": 0.0001, "loss": 5.4523, "loss/crossentropy": 2.295397698879242, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16666745021939278, "step": 9700 }, { "epoch": 0.441, "grad_norm": 4.75, "grad_norm_var": 1.4410115559895833, "learning_rate": 0.0001, "loss": 5.7262, "loss/crossentropy": 2.538903295993805, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16912450268864632, "step": 9702 }, { "epoch": 0.4410909090909091, "grad_norm": 5.125, "grad_norm_var": 1.4505045572916666, "learning_rate": 0.0001, "loss": 5.3523, "loss/crossentropy": 2.2656816840171814, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15963681414723396, "step": 9704 }, { "epoch": 0.4411818181818182, "grad_norm": 5.03125, "grad_norm_var": 1.4807902018229167, "learning_rate": 0.0001, "loss": 5.424, "loss/crossentropy": 2.3594393730163574, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.157632227987051, "step": 9706 }, { "epoch": 0.44127272727272726, "grad_norm": 5.1875, "grad_norm_var": 1.4570597330729167, "learning_rate": 0.0001, "loss": 5.76, "loss/crossentropy": 2.545968234539032, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17023028805851936, "step": 9708 }, { "epoch": 0.44136363636363635, "grad_norm": 4.9375, "grad_norm_var": 1.422509765625, "learning_rate": 0.0001, "loss": 5.7435, "loss/crossentropy": 2.513465642929077, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1702674850821495, "step": 9710 }, { "epoch": 0.44145454545454543, "grad_norm": 5.1875, "grad_norm_var": 1.4337239583333334, "learning_rate": 0.0001, "loss": 5.4606, "loss/crossentropy": 2.3436743319034576, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.15915169566869736, "step": 9712 }, { "epoch": 0.4415454545454545, "grad_norm": 5.78125, "grad_norm_var": 1.44400634765625, "learning_rate": 0.0001, "loss": 5.9561, "loss/crossentropy": 2.615219235420227, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1813586726784706, "step": 9714 }, { "epoch": 0.44163636363636366, "grad_norm": 5.15625, "grad_norm_var": 0.10035400390625, "learning_rate": 0.0001, "loss": 5.5765, "loss/crossentropy": 2.3655056953430176, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1679762303829193, "step": 9716 }, { "epoch": 0.44172727272727275, "grad_norm": 6.0, "grad_norm_var": 0.12493082682291666, "learning_rate": 0.0001, "loss": 5.9664, "loss/crossentropy": 2.6577983498573303, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18008147925138474, "step": 9718 }, { "epoch": 0.44181818181818183, "grad_norm": 4.65625, "grad_norm_var": 0.15677083333333333, "learning_rate": 0.0001, "loss": 4.9333, "loss/crossentropy": 2.0454886853694916, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.14073007181286812, "step": 9720 }, { "epoch": 0.4419090909090909, "grad_norm": 5.875, "grad_norm_var": 0.16599934895833332, "learning_rate": 0.0001, "loss": 5.8905, "loss/crossentropy": 2.5579426288604736, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.17915795370936394, "step": 9722 }, { "epoch": 0.442, "grad_norm": 5.1875, "grad_norm_var": 0.15074462890625, "learning_rate": 0.0001, "loss": 5.3311, "loss/crossentropy": 2.2774737775325775, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.15184297412633896, "step": 9724 }, { "epoch": 0.4420909090909091, "grad_norm": 4.9375, "grad_norm_var": 0.15220947265625, "learning_rate": 0.0001, "loss": 5.6447, "loss/crossentropy": 2.4354504346847534, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.1687791794538498, "step": 9726 }, { "epoch": 0.4421818181818182, "grad_norm": 5.4375, "grad_norm_var": 0.16647135416666667, "learning_rate": 0.0001, "loss": 5.5346, "loss/crossentropy": 2.35850065946579, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16565274819731712, "step": 9728 }, { "epoch": 0.44227272727272726, "grad_norm": 5.09375, "grad_norm_var": 0.14687093098958334, "learning_rate": 0.0001, "loss": 5.7664, "loss/crossentropy": 2.6164668798446655, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16519130766391754, "step": 9730 }, { "epoch": 0.44236363636363635, "grad_norm": 5.03125, "grad_norm_var": 0.276025390625, "learning_rate": 0.0001, "loss": 6.1626, "loss/crossentropy": 2.826776385307312, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.17948492616415024, "step": 9732 }, { "epoch": 0.44245454545454543, "grad_norm": 5.09375, "grad_norm_var": 0.23746337890625, "learning_rate": 0.0001, "loss": 5.8711, "loss/crossentropy": 2.657096207141876, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1706194020807743, "step": 9734 }, { "epoch": 0.4425454545454545, "grad_norm": 5.25, "grad_norm_var": 0.21760660807291668, "learning_rate": 0.0001, "loss": 5.8534, "loss/crossentropy": 2.5670372247695923, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.17492425069212914, "step": 9736 }, { "epoch": 0.44263636363636366, "grad_norm": 5.25, "grad_norm_var": 0.19052327473958333, "learning_rate": 0.0001, "loss": 6.0101, "loss/crossentropy": 2.7039040327072144, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17846780270338058, "step": 9738 }, { "epoch": 0.44272727272727275, "grad_norm": 5.25, "grad_norm_var": 0.19373372395833333, "learning_rate": 0.0001, "loss": 5.785, "loss/crossentropy": 2.5570037364959717, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1720135472714901, "step": 9740 }, { "epoch": 0.44281818181818183, "grad_norm": 5.4375, "grad_norm_var": 0.18932291666666667, "learning_rate": 0.0001, "loss": 5.5825, "loss/crossentropy": 2.466224431991577, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16163238883018494, "step": 9742 }, { "epoch": 0.4429090909090909, "grad_norm": 5.1875, "grad_norm_var": 0.15740559895833334, "learning_rate": 0.0001, "loss": 5.5581, "loss/crossentropy": 2.421896457672119, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.15991152450442314, "step": 9744 }, { "epoch": 0.443, "grad_norm": 5.0625, "grad_norm_var": 0.19230143229166666, "learning_rate": 0.0001, "loss": 5.1675, "loss/crossentropy": 2.1247904002666473, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15681365877389908, "step": 9746 }, { "epoch": 0.4430909090909091, "grad_norm": 4.65625, "grad_norm_var": 0.07511393229166667, "learning_rate": 0.0001, "loss": 5.6443, "loss/crossentropy": 2.46167653799057, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16689978167414665, "step": 9748 }, { "epoch": 0.4431818181818182, "grad_norm": 5.4375, "grad_norm_var": 0.09781494140625, "learning_rate": 0.0001, "loss": 5.4489, "loss/crossentropy": 2.3749427795410156, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15836966037750244, "step": 9750 }, { "epoch": 0.44327272727272726, "grad_norm": 4.875, "grad_norm_var": 0.09390869140625, "learning_rate": 0.0001, "loss": 5.9646, "loss/crossentropy": 2.7500336170196533, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1712619885802269, "step": 9752 }, { "epoch": 0.44336363636363635, "grad_norm": 6.0625, "grad_norm_var": 0.15917561848958334, "learning_rate": 0.0001, "loss": 6.1023, "loss/crossentropy": 2.8048510551452637, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17739736288785934, "step": 9754 }, { "epoch": 0.44345454545454543, "grad_norm": 5.5625, "grad_norm_var": 0.163916015625, "learning_rate": 0.0001, "loss": 5.8312, "loss/crossentropy": 2.5163082778453827, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17875056341290474, "step": 9756 }, { "epoch": 0.4435454545454545, "grad_norm": 5.1875, "grad_norm_var": 0.15050455729166667, "learning_rate": 0.0001, "loss": 5.6538, "loss/crossentropy": 2.422090172767639, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.17140892148017883, "step": 9758 }, { "epoch": 0.44363636363636366, "grad_norm": 4.65625, "grad_norm_var": 0.18092447916666668, "learning_rate": 0.0001, "loss": 5.8786, "loss/crossentropy": 2.632476568222046, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.17597533017396927, "step": 9760 }, { "epoch": 0.44372727272727275, "grad_norm": 4.4375, "grad_norm_var": 0.19563395182291668, "learning_rate": 0.0001, "loss": 5.7413, "loss/crossentropy": 2.5605103373527527, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16729551926255226, "step": 9762 }, { "epoch": 0.44381818181818183, "grad_norm": 5.125, "grad_norm_var": 0.19846598307291666, "learning_rate": 0.0001, "loss": 5.8878, "loss/crossentropy": 2.5967541933059692, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.17617468908429146, "step": 9764 }, { "epoch": 0.4439090909090909, "grad_norm": 4.8125, "grad_norm_var": 0.18487955729166666, "learning_rate": 0.0001, "loss": 5.4239, "loss/crossentropy": 2.345575749874115, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.157837375998497, "step": 9766 }, { "epoch": 0.444, "grad_norm": 5.3125, "grad_norm_var": 0.19449462890625, "learning_rate": 0.0001, "loss": 5.7666, "loss/crossentropy": 2.580723285675049, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16936636343598366, "step": 9768 }, { "epoch": 0.4440909090909091, "grad_norm": 4.78125, "grad_norm_var": 0.13190104166666666, "learning_rate": 0.0001, "loss": 5.3956, "loss/crossentropy": 2.2619318068027496, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1594599112868309, "step": 9770 }, { "epoch": 0.4441818181818182, "grad_norm": 4.875, "grad_norm_var": 0.11926676432291666, "learning_rate": 0.0001, "loss": 5.5742, "loss/crossentropy": 2.4393643140792847, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16582528874278069, "step": 9772 }, { "epoch": 0.44427272727272726, "grad_norm": 5.28125, "grad_norm_var": 0.13625895182291667, "learning_rate": 0.0001, "loss": 5.7114, "loss/crossentropy": 2.4723166823387146, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17156483605504036, "step": 9774 }, { "epoch": 0.44436363636363635, "grad_norm": 4.71875, "grad_norm_var": 0.10904541015625, "learning_rate": 0.0001, "loss": 5.5875, "loss/crossentropy": 2.45275217294693, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16504021733999252, "step": 9776 }, { "epoch": 0.44445454545454544, "grad_norm": 5.0625, "grad_norm_var": 0.08105061848958334, "learning_rate": 0.0001, "loss": 5.1151, "loss/crossentropy": 2.1416975557804108, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1494864746928215, "step": 9778 }, { "epoch": 0.4445454545454545, "grad_norm": 5.375, "grad_norm_var": 0.07138264973958333, "learning_rate": 0.0001, "loss": 5.7623, "loss/crossentropy": 2.552247941493988, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.17198267206549644, "step": 9780 }, { "epoch": 0.44463636363636366, "grad_norm": 5.3125, "grad_norm_var": 0.0677734375, "learning_rate": 0.0001, "loss": 5.7057, "loss/crossentropy": 2.5572712421417236, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1658211387693882, "step": 9782 }, { "epoch": 0.44472727272727275, "grad_norm": 4.6875, "grad_norm_var": 0.06884358723958334, "learning_rate": 0.0001, "loss": 5.3235, "loss/crossentropy": 2.2516128420829773, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1585538610816002, "step": 9784 }, { "epoch": 0.44481818181818183, "grad_norm": 4.3125, "grad_norm_var": 0.10245768229166667, "learning_rate": 0.0001, "loss": 5.3198, "loss/crossentropy": 2.3194624185562134, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1512073017656803, "step": 9786 }, { "epoch": 0.4449090909090909, "grad_norm": 5.15625, "grad_norm_var": 0.09361979166666666, "learning_rate": 0.0001, "loss": 5.6263, "loss/crossentropy": 2.4793052673339844, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16528090089559555, "step": 9788 }, { "epoch": 0.445, "grad_norm": 5.375, "grad_norm_var": 0.08440348307291666, "learning_rate": 0.0001, "loss": 5.5911, "loss/crossentropy": 2.3982726335525513, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16967139020562172, "step": 9790 }, { "epoch": 0.4450909090909091, "grad_norm": 4.875, "grad_norm_var": 0.09895833333333333, "learning_rate": 0.0001, "loss": 5.3237, "loss/crossentropy": 2.3895062804222107, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.14713513478636742, "step": 9792 }, { "epoch": 0.4451818181818182, "grad_norm": 4.875, "grad_norm_var": 0.10362955729166666, "learning_rate": 0.0001, "loss": 5.9104, "loss/crossentropy": 2.645791172981262, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.17587169632315636, "step": 9794 }, { "epoch": 0.44527272727272726, "grad_norm": 4.90625, "grad_norm_var": 0.09026285807291666, "learning_rate": 0.0001, "loss": 5.3366, "loss/crossentropy": 2.2615470588207245, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.15769824013113976, "step": 9796 }, { "epoch": 0.44536363636363635, "grad_norm": 5.3125, "grad_norm_var": 0.09332275390625, "learning_rate": 0.0001, "loss": 5.2954, "loss/crossentropy": 2.2048584818840027, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.15749341994524002, "step": 9798 }, { "epoch": 0.44545454545454544, "grad_norm": 4.6875, "grad_norm_var": 0.08567301432291667, "learning_rate": 0.0001, "loss": 5.2244, "loss/crossentropy": 2.2594746947288513, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.14805316552519798, "step": 9800 }, { "epoch": 0.4455454545454545, "grad_norm": 5.125, "grad_norm_var": 0.060933430989583336, "learning_rate": 0.0001, "loss": 5.631, "loss/crossentropy": 2.4467058181762695, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16530022770166397, "step": 9802 }, { "epoch": 0.44563636363636366, "grad_norm": 5.09375, "grad_norm_var": 0.058447265625, "learning_rate": 0.0001, "loss": 5.0628, "loss/crossentropy": 2.0785293877124786, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.14921098574995995, "step": 9804 }, { "epoch": 0.44572727272727275, "grad_norm": 4.875, "grad_norm_var": 0.39075113932291666, "learning_rate": 0.0001, "loss": 5.7143, "loss/crossentropy": 2.5108291506767273, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16878465935587883, "step": 9806 }, { "epoch": 0.44581818181818184, "grad_norm": 4.9375, "grad_norm_var": 0.3869140625, "learning_rate": 0.0001, "loss": 5.6242, "loss/crossentropy": 2.4320813417434692, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16960430145263672, "step": 9808 }, { "epoch": 0.4459090909090909, "grad_norm": 4.8125, "grad_norm_var": 0.38717041015625, "learning_rate": 0.0001, "loss": 5.2876, "loss/crossentropy": 2.298626482486725, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15104326233267784, "step": 9810 }, { "epoch": 0.446, "grad_norm": 5.03125, "grad_norm_var": 0.394775390625, "learning_rate": 0.0001, "loss": 5.9435, "loss/crossentropy": 2.644850730895996, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.17849688977003098, "step": 9812 }, { "epoch": 0.4460909090909091, "grad_norm": 5.15625, "grad_norm_var": 0.3875935872395833, "learning_rate": 0.0001, "loss": 5.7221, "loss/crossentropy": 2.535998225212097, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16587752103805542, "step": 9814 }, { "epoch": 0.4461818181818182, "grad_norm": 4.96875, "grad_norm_var": 0.36327718098958334, "learning_rate": 0.0001, "loss": 5.7886, "loss/crossentropy": 2.58622670173645, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16828200966119766, "step": 9816 }, { "epoch": 0.44627272727272727, "grad_norm": 5.1875, "grad_norm_var": 0.36601155598958335, "learning_rate": 0.0001, "loss": 5.8545, "loss/crossentropy": 2.6418243646621704, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.17029434069991112, "step": 9818 }, { "epoch": 0.44636363636363635, "grad_norm": 5.1875, "grad_norm_var": 0.36569010416666664, "learning_rate": 0.0001, "loss": 5.291, "loss/crossentropy": 2.205466240644455, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.15738235786557198, "step": 9820 }, { "epoch": 0.44645454545454544, "grad_norm": 5.1875, "grad_norm_var": 0.054931640625, "learning_rate": 0.0001, "loss": 5.8204, "loss/crossentropy": 2.556040108203888, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1709691546857357, "step": 9822 }, { "epoch": 0.4465454545454545, "grad_norm": 5.15625, "grad_norm_var": 0.032145182291666664, "learning_rate": 0.0001, "loss": 5.755, "loss/crossentropy": 2.538231372833252, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16991936787962914, "step": 9824 }, { "epoch": 0.4466363636363636, "grad_norm": 5.09375, "grad_norm_var": 0.022916666666666665, "learning_rate": 0.0001, "loss": 6.0419, "loss/crossentropy": 2.684324026107788, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.18322231993079185, "step": 9826 }, { "epoch": 0.44672727272727275, "grad_norm": 5.0, "grad_norm_var": 0.02265625, "learning_rate": 0.0001, "loss": 5.538, "loss/crossentropy": 2.3872306644916534, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16292943246662617, "step": 9828 }, { "epoch": 0.44681818181818184, "grad_norm": 5.1875, "grad_norm_var": 0.01802978515625, "learning_rate": 0.0001, "loss": 5.9277, "loss/crossentropy": 2.61668199300766, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17719197273254395, "step": 9830 }, { "epoch": 0.4469090909090909, "grad_norm": 4.96875, "grad_norm_var": 0.030143229166666667, "learning_rate": 0.0001, "loss": 5.0301, "loss/crossentropy": 2.049421638250351, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1451379433274269, "step": 9832 }, { "epoch": 0.447, "grad_norm": 4.65625, "grad_norm_var": 0.03918863932291667, "learning_rate": 0.0001, "loss": 5.2469, "loss/crossentropy": 2.2071838676929474, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.15240572206676006, "step": 9834 }, { "epoch": 0.4470909090909091, "grad_norm": 4.71875, "grad_norm_var": 0.06103108723958333, "learning_rate": 0.0001, "loss": 4.8976, "loss/crossentropy": 2.016099363565445, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.1410798542201519, "step": 9836 }, { "epoch": 0.4471818181818182, "grad_norm": 5.34375, "grad_norm_var": 0.07923177083333334, "learning_rate": 0.0001, "loss": 5.6651, "loss/crossentropy": 2.4608944058418274, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16768815368413925, "step": 9838 }, { "epoch": 0.44727272727272727, "grad_norm": 4.75, "grad_norm_var": 0.08919270833333333, "learning_rate": 0.0001, "loss": 5.7202, "loss/crossentropy": 2.5684866309165955, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16419682279229164, "step": 9840 }, { "epoch": 0.44736363636363635, "grad_norm": 4.78125, "grad_norm_var": 0.08616129557291667, "learning_rate": 0.0001, "loss": 5.2383, "loss/crossentropy": 2.192286968231201, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1546006202697754, "step": 9842 }, { "epoch": 0.44745454545454544, "grad_norm": 12.0, "grad_norm_var": 3.1816691080729167, "learning_rate": 0.0001, "loss": 6.2597, "loss/crossentropy": 2.8295286893844604, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.19105955585837364, "step": 9844 }, { "epoch": 0.4475454545454545, "grad_norm": 4.71875, "grad_norm_var": 3.2233072916666665, "learning_rate": 0.0001, "loss": 5.687, "loss/crossentropy": 2.4706928730010986, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1714394986629486, "step": 9846 }, { "epoch": 0.4476363636363636, "grad_norm": 4.78125, "grad_norm_var": 3.19918212890625, "learning_rate": 0.0001, "loss": 5.6051, "loss/crossentropy": 2.4472239017486572, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1642216257750988, "step": 9848 }, { "epoch": 0.44772727272727275, "grad_norm": 5.0, "grad_norm_var": 3.1934529622395833, "learning_rate": 0.0001, "loss": 5.8091, "loss/crossentropy": 2.6402162313461304, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16766948625445366, "step": 9850 }, { "epoch": 0.44781818181818184, "grad_norm": 5.0625, "grad_norm_var": 3.0835245768229167, "learning_rate": 0.0001, "loss": 5.9208, "loss/crossentropy": 2.673665702342987, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17452159151434898, "step": 9852 }, { "epoch": 0.4479090909090909, "grad_norm": 5.125, "grad_norm_var": 3.131966145833333, "learning_rate": 0.0001, "loss": 5.4355, "loss/crossentropy": 2.337144136428833, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16198408231139183, "step": 9854 }, { "epoch": 0.448, "grad_norm": 5.46875, "grad_norm_var": 3.09635009765625, "learning_rate": 0.0001, "loss": 5.8629, "loss/crossentropy": 2.5591869950294495, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17607257142663002, "step": 9856 }, { "epoch": 0.4480909090909091, "grad_norm": 5.1875, "grad_norm_var": 3.042041015625, "learning_rate": 0.0001, "loss": 5.9477, "loss/crossentropy": 2.657531499862671, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17589613795280457, "step": 9858 }, { "epoch": 0.4481818181818182, "grad_norm": 5.1875, "grad_norm_var": 0.10885416666666667, "learning_rate": 0.0001, "loss": 5.8376, "loss/crossentropy": 2.613641142845154, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17200323194265366, "step": 9860 }, { "epoch": 0.44827272727272727, "grad_norm": 5.3125, "grad_norm_var": 0.056441243489583334, "learning_rate": 0.0001, "loss": 5.9864, "loss/crossentropy": 2.6862305402755737, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17982565239071846, "step": 9862 }, { "epoch": 0.44836363636363635, "grad_norm": 4.71875, "grad_norm_var": 0.05858968098958333, "learning_rate": 0.0001, "loss": 5.4658, "loss/crossentropy": 2.4184882640838623, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15687678009271622, "step": 9864 }, { "epoch": 0.44845454545454544, "grad_norm": 5.34375, "grad_norm_var": 0.060872395833333336, "learning_rate": 0.0001, "loss": 5.5667, "loss/crossentropy": 2.41924449801445, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16416184976696968, "step": 9866 }, { "epoch": 0.4485454545454545, "grad_norm": 5.34375, "grad_norm_var": 0.06842041015625, "learning_rate": 0.0001, "loss": 6.095, "loss/crossentropy": 2.7992125153541565, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.1782093159854412, "step": 9868 }, { "epoch": 0.4486363636363636, "grad_norm": 5.40625, "grad_norm_var": 0.06177978515625, "learning_rate": 0.0001, "loss": 5.8154, "loss/crossentropy": 2.607474625110626, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1723511628806591, "step": 9870 }, { "epoch": 0.44872727272727275, "grad_norm": 4.75, "grad_norm_var": 0.07021077473958333, "learning_rate": 0.0001, "loss": 5.1933, "loss/crossentropy": 2.148149073123932, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.15275714546442032, "step": 9872 }, { "epoch": 0.44881818181818184, "grad_norm": 4.5625, "grad_norm_var": 0.080322265625, "learning_rate": 0.0001, "loss": 5.5021, "loss/crossentropy": 2.412787616252899, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15853769332170486, "step": 9874 }, { "epoch": 0.4489090909090909, "grad_norm": 4.40625, "grad_norm_var": 0.10403238932291667, "learning_rate": 0.0001, "loss": 5.6321, "loss/crossentropy": 2.501882016658783, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16340779140591621, "step": 9876 }, { "epoch": 0.449, "grad_norm": 5.4375, "grad_norm_var": 0.11183268229166667, "learning_rate": 0.0001, "loss": 5.7014, "loss/crossentropy": 2.537947177886963, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.168888121843338, "step": 9878 }, { "epoch": 0.4490909090909091, "grad_norm": 4.625, "grad_norm_var": 0.12405192057291667, "learning_rate": 0.0001, "loss": 5.674, "loss/crossentropy": 2.548644781112671, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1631189025938511, "step": 9880 }, { "epoch": 0.4491818181818182, "grad_norm": 5.3125, "grad_norm_var": 0.119775390625, "learning_rate": 0.0001, "loss": 6.023, "loss/crossentropy": 2.6879788637161255, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.18174024298787117, "step": 9882 }, { "epoch": 0.44927272727272727, "grad_norm": 4.59375, "grad_norm_var": 0.10858968098958334, "learning_rate": 0.0001, "loss": 5.472, "loss/crossentropy": 2.440395772457123, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.153939351439476, "step": 9884 }, { "epoch": 0.44936363636363635, "grad_norm": 5.46875, "grad_norm_var": 0.12936197916666667, "learning_rate": 0.0001, "loss": 5.754, "loss/crossentropy": 2.5565463304519653, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16876451671123505, "step": 9886 }, { "epoch": 0.44945454545454544, "grad_norm": 5.0625, "grad_norm_var": 0.12922770182291668, "learning_rate": 0.0001, "loss": 5.7704, "loss/crossentropy": 2.5437657833099365, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.17129985243082047, "step": 9888 }, { "epoch": 0.4495454545454545, "grad_norm": 5.375, "grad_norm_var": 0.12237955729166666, "learning_rate": 0.0001, "loss": 5.8715, "loss/crossentropy": 2.6137222051620483, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1755792424082756, "step": 9890 }, { "epoch": 0.4496363636363636, "grad_norm": 4.78125, "grad_norm_var": 0.10393473307291666, "learning_rate": 0.0001, "loss": 5.598, "loss/crossentropy": 2.491656571626663, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1633707657456398, "step": 9892 }, { "epoch": 0.44972727272727275, "grad_norm": 4.1875, "grad_norm_var": 0.14527587890625, "learning_rate": 0.0001, "loss": 5.2922, "loss/crossentropy": 2.271613657474518, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15127765014767647, "step": 9894 }, { "epoch": 0.44981818181818184, "grad_norm": 5.375, "grad_norm_var": 0.13756103515625, "learning_rate": 0.0001, "loss": 5.745, "loss/crossentropy": 2.591968059539795, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16628039255738258, "step": 9896 }, { "epoch": 0.4499090909090909, "grad_norm": 5.5625, "grad_norm_var": 0.16209309895833332, "learning_rate": 0.0001, "loss": 5.9157, "loss/crossentropy": 2.582744777202606, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.1776285283267498, "step": 9898 }, { "epoch": 0.45, "grad_norm": 5.15625, "grad_norm_var": 0.14755452473958333, "learning_rate": 0.0001, "loss": 5.8163, "loss/crossentropy": 2.5589338541030884, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17299866303801537, "step": 9900 }, { "epoch": 0.4500909090909091, "grad_norm": 5.34375, "grad_norm_var": 0.12519124348958333, "learning_rate": 0.0001, "loss": 5.6978, "loss/crossentropy": 2.503752112388611, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17018358409404755, "step": 9902 }, { "epoch": 0.4501818181818182, "grad_norm": 4.75, "grad_norm_var": 0.13313802083333334, "learning_rate": 0.0001, "loss": 5.4637, "loss/crossentropy": 2.3597838282585144, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16156302765011787, "step": 9904 }, { "epoch": 0.45027272727272727, "grad_norm": 5.3125, "grad_norm_var": 0.1341796875, "learning_rate": 0.0001, "loss": 5.4749, "loss/crossentropy": 2.304222911596298, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16511930152773857, "step": 9906 }, { "epoch": 0.45036363636363635, "grad_norm": 5.46875, "grad_norm_var": 0.13189697265625, "learning_rate": 0.0001, "loss": 5.4003, "loss/crossentropy": 2.2681483328342438, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.15754931792616844, "step": 9908 }, { "epoch": 0.45045454545454544, "grad_norm": 5.625, "grad_norm_var": 0.08709309895833334, "learning_rate": 0.0001, "loss": 5.7866, "loss/crossentropy": 2.4977128505706787, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.17635075375437737, "step": 9910 }, { "epoch": 0.4505454545454545, "grad_norm": 4.71875, "grad_norm_var": 0.09726155598958333, "learning_rate": 0.0001, "loss": 5.4118, "loss/crossentropy": 2.3607118725776672, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15784311294555664, "step": 9912 }, { "epoch": 0.4506363636363636, "grad_norm": 5.5, "grad_norm_var": 0.08586832682291666, "learning_rate": 0.0001, "loss": 5.7642, "loss/crossentropy": 2.531279504299164, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.17231113836169243, "step": 9914 }, { "epoch": 0.45072727272727275, "grad_norm": 4.8125, "grad_norm_var": 0.09104410807291667, "learning_rate": 0.0001, "loss": 5.7685, "loss/crossentropy": 2.5299524664878845, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.17209995537996292, "step": 9916 }, { "epoch": 0.45081818181818184, "grad_norm": 4.75, "grad_norm_var": 0.09685872395833334, "learning_rate": 0.0001, "loss": 5.1209, "loss/crossentropy": 2.1836409866809845, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.14391806349158287, "step": 9918 }, { "epoch": 0.4509090909090909, "grad_norm": 4.84375, "grad_norm_var": 0.09543863932291667, "learning_rate": 0.0001, "loss": 5.3717, "loss/crossentropy": 2.3096715211868286, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.15600623562932014, "step": 9920 }, { "epoch": 0.451, "grad_norm": 5.1875, "grad_norm_var": 0.09537760416666667, "learning_rate": 0.0001, "loss": 5.8308, "loss/crossentropy": 2.48009330034256, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18233538791537285, "step": 9922 }, { "epoch": 0.4510909090909091, "grad_norm": 5.3125, "grad_norm_var": 0.09006754557291667, "learning_rate": 0.0001, "loss": 5.8419, "loss/crossentropy": 2.588304817676544, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1726243607699871, "step": 9924 }, { "epoch": 0.4511818181818182, "grad_norm": 5.0, "grad_norm_var": 0.06995035807291666, "learning_rate": 0.0001, "loss": 5.4831, "loss/crossentropy": 2.392348349094391, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15985437110066414, "step": 9926 }, { "epoch": 0.45127272727272727, "grad_norm": 5.0625, "grad_norm_var": 0.06676025390625, "learning_rate": 0.0001, "loss": 5.9819, "loss/crossentropy": 2.6854889392852783, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.17827484384179115, "step": 9928 }, { "epoch": 0.45136363636363636, "grad_norm": 4.875, "grad_norm_var": 0.057421875, "learning_rate": 0.0001, "loss": 5.7227, "loss/crossentropy": 2.5546412467956543, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.16387531906366348, "step": 9930 }, { "epoch": 0.45145454545454544, "grad_norm": 4.96875, "grad_norm_var": 0.059619140625, "learning_rate": 0.0001, "loss": 5.7924, "loss/crossentropy": 2.5371205806732178, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1735745444893837, "step": 9932 }, { "epoch": 0.4515454545454545, "grad_norm": 5.4375, "grad_norm_var": 0.09879150390625, "learning_rate": 0.0001, "loss": 6.1128, "loss/crossentropy": 2.73148113489151, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.18364343792200089, "step": 9934 }, { "epoch": 0.4516363636363636, "grad_norm": 5.15625, "grad_norm_var": 0.13801676432291668, "learning_rate": 0.0001, "loss": 5.7875, "loss/crossentropy": 2.4987555146217346, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17555230855941772, "step": 9936 }, { "epoch": 0.4517272727272727, "grad_norm": 5.46875, "grad_norm_var": 0.15651041666666668, "learning_rate": 0.0001, "loss": 5.3852, "loss/crossentropy": 2.255656898021698, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16256717965006828, "step": 9938 }, { "epoch": 0.45181818181818184, "grad_norm": 4.90625, "grad_norm_var": 0.16780192057291668, "learning_rate": 0.0001, "loss": 5.5456, "loss/crossentropy": 2.3715029656887054, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16584819927811623, "step": 9940 }, { "epoch": 0.4519090909090909, "grad_norm": 4.90625, "grad_norm_var": 0.16873372395833333, "learning_rate": 0.0001, "loss": 5.7079, "loss/crossentropy": 2.471422553062439, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1709126941859722, "step": 9942 }, { "epoch": 0.452, "grad_norm": 4.96875, "grad_norm_var": 0.19276936848958334, "learning_rate": 0.0001, "loss": 5.6616, "loss/crossentropy": 2.541171371936798, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1640000008046627, "step": 9944 }, { "epoch": 0.4520909090909091, "grad_norm": 4.90625, "grad_norm_var": 0.19972330729166668, "learning_rate": 0.0001, "loss": 5.992, "loss/crossentropy": 2.7501670718193054, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1745714247226715, "step": 9946 }, { "epoch": 0.4521818181818182, "grad_norm": 4.875, "grad_norm_var": 0.20627848307291666, "learning_rate": 0.0001, "loss": 5.2756, "loss/crossentropy": 2.2078288197517395, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15756015107035637, "step": 9948 }, { "epoch": 0.45227272727272727, "grad_norm": 5.28125, "grad_norm_var": 0.15087483723958334, "learning_rate": 0.0001, "loss": 5.6775, "loss/crossentropy": 2.4221372604370117, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.17182153463363647, "step": 9950 }, { "epoch": 0.45236363636363636, "grad_norm": 5.0625, "grad_norm_var": 0.10506184895833333, "learning_rate": 0.0001, "loss": 5.6558, "loss/crossentropy": 2.4332515597343445, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.1673712059855461, "step": 9952 }, { "epoch": 0.45245454545454544, "grad_norm": 6.09375, "grad_norm_var": 0.15696207682291666, "learning_rate": 0.0001, "loss": 5.7044, "loss/crossentropy": 2.520835220813751, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16659484058618546, "step": 9954 }, { "epoch": 0.45254545454545453, "grad_norm": 4.8125, "grad_norm_var": 0.15362955729166666, "learning_rate": 0.0001, "loss": 5.4556, "loss/crossentropy": 2.3503502011299133, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.15993623062968254, "step": 9956 }, { "epoch": 0.4526363636363636, "grad_norm": 5.09375, "grad_norm_var": 0.15898030598958332, "learning_rate": 0.0001, "loss": 5.4494, "loss/crossentropy": 2.2998453974723816, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16241898760199547, "step": 9958 }, { "epoch": 0.4527272727272727, "grad_norm": 6.0, "grad_norm_var": 0.20403645833333334, "learning_rate": 0.0001, "loss": 5.5705, "loss/crossentropy": 2.3943233489990234, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16390305757522583, "step": 9960 }, { "epoch": 0.45281818181818184, "grad_norm": 5.96875, "grad_norm_var": 0.22001546223958332, "learning_rate": 0.0001, "loss": 5.7612, "loss/crossentropy": 2.5597667694091797, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16955279931426048, "step": 9962 }, { "epoch": 0.45290909090909093, "grad_norm": 5.375, "grad_norm_var": 0.2244140625, "learning_rate": 0.0001, "loss": 5.4989, "loss/crossentropy": 2.3733577132225037, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16255344823002815, "step": 9964 }, { "epoch": 0.453, "grad_norm": 5.15625, "grad_norm_var": 0.225634765625, "learning_rate": 0.0001, "loss": 6.0847, "loss/crossentropy": 2.814182758331299, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17861182615160942, "step": 9966 }, { "epoch": 0.4530909090909091, "grad_norm": 6.25, "grad_norm_var": 17.098811848958334, "learning_rate": 0.0001, "loss": 5.9994, "loss/crossentropy": 2.597605049610138, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.18256178870797157, "step": 9968 }, { "epoch": 0.4531818181818182, "grad_norm": 5.40625, "grad_norm_var": 17.080367024739584, "learning_rate": 0.0001, "loss": 5.7546, "loss/crossentropy": 2.526964545249939, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1704181768000126, "step": 9970 }, { "epoch": 0.45327272727272727, "grad_norm": 4.96875, "grad_norm_var": 17.068583170572918, "learning_rate": 0.0001, "loss": 5.9384, "loss/crossentropy": 2.711436629295349, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.17328055202960968, "step": 9972 }, { "epoch": 0.45336363636363636, "grad_norm": 4.5, "grad_norm_var": 17.18375244140625, "learning_rate": 0.0001, "loss": 5.3979, "loss/crossentropy": 2.3201522827148438, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1583620011806488, "step": 9974 }, { "epoch": 0.45345454545454544, "grad_norm": 5.1875, "grad_norm_var": 17.350809733072918, "learning_rate": 0.0001, "loss": 5.9129, "loss/crossentropy": 2.7034268379211426, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17172909900546074, "step": 9976 }, { "epoch": 0.45354545454545453, "grad_norm": 6.0, "grad_norm_var": 17.300484212239585, "learning_rate": 0.0001, "loss": 5.8089, "loss/crossentropy": 2.5669616758823395, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17106683924794197, "step": 9978 }, { "epoch": 0.4536363636363636, "grad_norm": 5.4375, "grad_norm_var": 17.175764973958334, "learning_rate": 0.0001, "loss": 5.8309, "loss/crossentropy": 2.5724830627441406, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17388436198234558, "step": 9980 }, { "epoch": 0.4537272727272727, "grad_norm": 5.28125, "grad_norm_var": 17.084488932291666, "learning_rate": 0.0001, "loss": 6.0411, "loss/crossentropy": 2.6853926181793213, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18010633811354637, "step": 9982 }, { "epoch": 0.45381818181818184, "grad_norm": 5.4375, "grad_norm_var": 0.19133707682291667, "learning_rate": 0.0001, "loss": 6.0976, "loss/crossentropy": 2.8070791959762573, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.18002795800566673, "step": 9984 }, { "epoch": 0.45390909090909093, "grad_norm": 5.625, "grad_norm_var": 0.19999593098958332, "learning_rate": 0.0001, "loss": 5.8383, "loss/crossentropy": 2.5200236439704895, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.17772898450493813, "step": 9986 }, { "epoch": 0.454, "grad_norm": 4.625, "grad_norm_var": 0.23905843098958332, "learning_rate": 0.0001, "loss": 5.1938, "loss/crossentropy": 2.2395099401474, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1475762017071247, "step": 9988 }, { "epoch": 0.4540909090909091, "grad_norm": 5.0, "grad_norm_var": 0.1892578125, "learning_rate": 0.0001, "loss": 6.0299, "loss/crossentropy": 2.7253392338752747, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17732637375593185, "step": 9990 }, { "epoch": 0.4541818181818182, "grad_norm": 4.875, "grad_norm_var": 0.19117431640625, "learning_rate": 0.0001, "loss": 5.0877, "loss/crossentropy": 2.160137802362442, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.14509542472660542, "step": 9992 }, { "epoch": 0.4542727272727273, "grad_norm": 4.5625, "grad_norm_var": 0.157275390625, "learning_rate": 0.0001, "loss": 5.5588, "loss/crossentropy": 2.4046831130981445, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16150183603167534, "step": 9994 }, { "epoch": 0.45436363636363636, "grad_norm": 5.1875, "grad_norm_var": 0.15920817057291667, "learning_rate": 0.0001, "loss": 6.0676, "loss/crossentropy": 2.731833517551422, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18280025571584702, "step": 9996 }, { "epoch": 0.45445454545454544, "grad_norm": 5.0625, "grad_norm_var": 0.14479166666666668, "learning_rate": 0.0001, "loss": 5.7003, "loss/crossentropy": 2.5079164505004883, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.1670922301709652, "step": 9998 }, { "epoch": 0.45454545454545453, "grad_norm": 5.0625, "grad_norm_var": 0.15260416666666668, "learning_rate": 0.0001, "loss": 5.5075, "loss/crossentropy": 2.352709263563156, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1648980863392353, "step": 10000 }, { "epoch": 0.4546363636363636, "grad_norm": 4.875, "grad_norm_var": 0.13199462890625, "learning_rate": 0.0001, "loss": 5.4044, "loss/crossentropy": 2.346976339817047, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.15593292191624641, "step": 10002 }, { "epoch": 0.4547272727272727, "grad_norm": 5.46875, "grad_norm_var": 0.11013997395833333, "learning_rate": 0.0001, "loss": 5.7854, "loss/crossentropy": 2.5840137600898743, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16779443249106407, "step": 10004 }, { "epoch": 0.45481818181818184, "grad_norm": 4.9375, "grad_norm_var": 0.09269205729166667, "learning_rate": 0.0001, "loss": 5.7356, "loss/crossentropy": 2.5986169576644897, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1637025624513626, "step": 10006 }, { "epoch": 0.45490909090909093, "grad_norm": 4.9375, "grad_norm_var": 0.09254150390625, "learning_rate": 0.0001, "loss": 6.0005, "loss/crossentropy": 2.7729349732398987, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17197975516319275, "step": 10008 }, { "epoch": 0.455, "grad_norm": 4.59375, "grad_norm_var": 0.10240478515625, "learning_rate": 0.0001, "loss": 5.6295, "loss/crossentropy": 2.444638282060623, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16809239611029625, "step": 10010 }, { "epoch": 0.4550909090909091, "grad_norm": 4.90625, "grad_norm_var": 0.09694010416666667, "learning_rate": 0.0001, "loss": 5.6365, "loss/crossentropy": 2.5522406101226807, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16057441383600235, "step": 10012 }, { "epoch": 0.4551818181818182, "grad_norm": 4.71875, "grad_norm_var": 0.10266927083333334, "learning_rate": 0.0001, "loss": 5.7271, "loss/crossentropy": 2.5584079027175903, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1658891960978508, "step": 10014 }, { "epoch": 0.4552727272727273, "grad_norm": 5.21875, "grad_norm_var": 0.08840738932291667, "learning_rate": 0.0001, "loss": 5.7194, "loss/crossentropy": 2.438162475824356, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.16953328996896744, "step": 10016 }, { "epoch": 0.45536363636363636, "grad_norm": 4.78125, "grad_norm_var": 0.13258056640625, "learning_rate": 0.0001, "loss": 5.7745, "loss/crossentropy": 2.568824827671051, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1699860654771328, "step": 10018 }, { "epoch": 0.45545454545454545, "grad_norm": 4.8125, "grad_norm_var": 0.130712890625, "learning_rate": 0.0001, "loss": 5.4516, "loss/crossentropy": 2.3246760964393616, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16191348806023598, "step": 10020 }, { "epoch": 0.45554545454545453, "grad_norm": 5.28125, "grad_norm_var": 0.13811442057291667, "learning_rate": 0.0001, "loss": 5.9677, "loss/crossentropy": 2.717298746109009, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.17367255315184593, "step": 10022 }, { "epoch": 0.4556363636363636, "grad_norm": 4.90625, "grad_norm_var": 0.13492431640625, "learning_rate": 0.0001, "loss": 5.6377, "loss/crossentropy": 2.470965802669525, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1676451712846756, "step": 10024 }, { "epoch": 0.4557272727272727, "grad_norm": 5.3125, "grad_norm_var": 0.11230061848958334, "learning_rate": 0.0001, "loss": 5.468, "loss/crossentropy": 2.3317726254463196, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1636262834072113, "step": 10026 }, { "epoch": 0.45581818181818184, "grad_norm": 6.15625, "grad_norm_var": 0.19322509765625, "learning_rate": 0.0001, "loss": 5.6046, "loss/crossentropy": 2.4077145755290985, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1665637567639351, "step": 10028 }, { "epoch": 0.45590909090909093, "grad_norm": 5.6875, "grad_norm_var": 0.22652587890625, "learning_rate": 0.0001, "loss": 5.6489, "loss/crossentropy": 2.4442853927612305, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.16596796363592148, "step": 10030 }, { "epoch": 0.456, "grad_norm": 4.8125, "grad_norm_var": 0.23045247395833332, "learning_rate": 0.0001, "loss": 5.5531, "loss/crossentropy": 2.446883976459503, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16062652319669724, "step": 10032 }, { "epoch": 0.4560909090909091, "grad_norm": 4.84375, "grad_norm_var": 0.18409830729166668, "learning_rate": 0.0001, "loss": 5.8439, "loss/crossentropy": 2.634361445903778, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16899850592017174, "step": 10034 }, { "epoch": 0.4561818181818182, "grad_norm": 5.125, "grad_norm_var": 0.2640584309895833, "learning_rate": 0.0001, "loss": 5.4809, "loss/crossentropy": 2.288202166557312, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16575796902179718, "step": 10036 }, { "epoch": 0.4562727272727273, "grad_norm": 4.8125, "grad_norm_var": 0.2827962239583333, "learning_rate": 0.0001, "loss": 5.5252, "loss/crossentropy": 2.404763102531433, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16184397041797638, "step": 10038 }, { "epoch": 0.45636363636363636, "grad_norm": 4.84375, "grad_norm_var": 0.35312093098958336, "learning_rate": 0.0001, "loss": 5.7545, "loss/crossentropy": 2.529928982257843, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17010902985930443, "step": 10040 }, { "epoch": 0.45645454545454545, "grad_norm": 5.6875, "grad_norm_var": 0.3185831705729167, "learning_rate": 0.0001, "loss": 5.8623, "loss/crossentropy": 2.5492815375328064, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.17485880106687546, "step": 10042 }, { "epoch": 0.45654545454545453, "grad_norm": 5.03125, "grad_norm_var": 0.3798136393229167, "learning_rate": 0.0001, "loss": 5.7027, "loss/crossentropy": 2.4489444494247437, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.17244458198547363, "step": 10044 }, { "epoch": 0.4566363636363636, "grad_norm": 4.71875, "grad_norm_var": 0.3541951497395833, "learning_rate": 0.0001, "loss": 5.3438, "loss/crossentropy": 2.2852503955364227, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.15566114336252213, "step": 10046 }, { "epoch": 0.4567272727272727, "grad_norm": 5.34375, "grad_norm_var": 0.33088785807291665, "learning_rate": 0.0001, "loss": 6.1217, "loss/crossentropy": 2.6838021874427795, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18988371640443802, "step": 10048 }, { "epoch": 0.45681818181818185, "grad_norm": 4.90625, "grad_norm_var": 0.32278238932291664, "learning_rate": 0.0001, "loss": 5.5752, "loss/crossentropy": 2.432948112487793, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.1628535334020853, "step": 10050 }, { "epoch": 0.45690909090909093, "grad_norm": 5.375, "grad_norm_var": 0.28218994140625, "learning_rate": 0.0001, "loss": 6.2022, "loss/crossentropy": 2.8422064185142517, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18404511734843254, "step": 10052 }, { "epoch": 0.457, "grad_norm": 5.03125, "grad_norm_var": 0.28648681640625, "learning_rate": 0.0001, "loss": 5.3507, "loss/crossentropy": 2.3239948749542236, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1540401577949524, "step": 10054 }, { "epoch": 0.4570909090909091, "grad_norm": 5.03125, "grad_norm_var": 0.22923177083333332, "learning_rate": 0.0001, "loss": 5.6871, "loss/crossentropy": 2.4895747303962708, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1674066036939621, "step": 10056 }, { "epoch": 0.4571818181818182, "grad_norm": 4.8125, "grad_norm_var": 0.236962890625, "learning_rate": 0.0001, "loss": 5.1667, "loss/crossentropy": 2.181908518075943, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.14964619278907776, "step": 10058 }, { "epoch": 0.4572727272727273, "grad_norm": 5.21875, "grad_norm_var": 0.11090087890625, "learning_rate": 0.0001, "loss": 5.8678, "loss/crossentropy": 2.5781367421150208, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17427734285593033, "step": 10060 }, { "epoch": 0.45736363636363636, "grad_norm": 4.90625, "grad_norm_var": 0.115478515625, "learning_rate": 0.0001, "loss": 5.785, "loss/crossentropy": 2.5754446983337402, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.17153740674257278, "step": 10062 }, { "epoch": 0.45745454545454545, "grad_norm": 5.1875, "grad_norm_var": 0.08248697916666667, "learning_rate": 0.0001, "loss": 5.6148, "loss/crossentropy": 2.4987303614616394, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16180389747023582, "step": 10064 }, { "epoch": 0.45754545454545453, "grad_norm": 4.5625, "grad_norm_var": 0.09898681640625, "learning_rate": 0.0001, "loss": 5.4887, "loss/crossentropy": 2.4485074281692505, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1555791962891817, "step": 10066 }, { "epoch": 0.4576363636363636, "grad_norm": 5.34375, "grad_norm_var": 0.06138916015625, "learning_rate": 0.0001, "loss": 5.8524, "loss/crossentropy": 2.592992067337036, "loss/hidden": 1.587890625, "loss/jsd": 0.0, "loss/logits": 0.16715390980243683, "step": 10068 }, { "epoch": 0.4577272727272727, "grad_norm": 5.0625, "grad_norm_var": 0.06474202473958333, "learning_rate": 0.0001, "loss": 5.6565, "loss/crossentropy": 2.5196773409843445, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16484957933425903, "step": 10070 }, { "epoch": 0.4578181818181818, "grad_norm": 4.84375, "grad_norm_var": 0.0701171875, "learning_rate": 0.0001, "loss": 5.4088, "loss/crossentropy": 2.3494051694869995, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15847743675112724, "step": 10072 }, { "epoch": 0.45790909090909093, "grad_norm": 5.28125, "grad_norm_var": 0.06519775390625, "learning_rate": 0.0001, "loss": 5.5563, "loss/crossentropy": 2.4030725955963135, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.16239717602729797, "step": 10074 }, { "epoch": 0.458, "grad_norm": 6.21875, "grad_norm_var": 0.154296875, "learning_rate": 0.0001, "loss": 5.2975, "loss/crossentropy": 2.2262721061706543, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.15614472702145576, "step": 10076 }, { "epoch": 0.4580909090909091, "grad_norm": 5.15625, "grad_norm_var": 0.147119140625, "learning_rate": 0.0001, "loss": 5.6061, "loss/crossentropy": 2.5055068731307983, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16083704680204391, "step": 10078 }, { "epoch": 0.4581818181818182, "grad_norm": 4.84375, "grad_norm_var": 0.15110677083333332, "learning_rate": 0.0001, "loss": 5.6855, "loss/crossentropy": 2.521155059337616, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16779913008213043, "step": 10080 }, { "epoch": 0.4582727272727273, "grad_norm": 5.25, "grad_norm_var": 0.13214518229166666, "learning_rate": 0.0001, "loss": 5.5162, "loss/crossentropy": 2.350587785243988, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.16128567978739738, "step": 10082 }, { "epoch": 0.45836363636363636, "grad_norm": 5.4375, "grad_norm_var": 0.13274332682291667, "learning_rate": 0.0001, "loss": 5.5485, "loss/crossentropy": 2.37705260515213, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.1646038144826889, "step": 10084 }, { "epoch": 0.45845454545454545, "grad_norm": 5.28125, "grad_norm_var": 0.11848551432291667, "learning_rate": 0.0001, "loss": 6.0129, "loss/crossentropy": 2.6729613542556763, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1837969310581684, "step": 10086 }, { "epoch": 0.45854545454545453, "grad_norm": 5.1875, "grad_norm_var": 0.091650390625, "learning_rate": 0.0001, "loss": 5.9093, "loss/crossentropy": 2.673964023590088, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.17099866271018982, "step": 10088 }, { "epoch": 0.4586363636363636, "grad_norm": 6.25, "grad_norm_var": 0.154150390625, "learning_rate": 0.0001, "loss": 5.8152, "loss/crossentropy": 2.5524577498435974, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17120065540075302, "step": 10090 }, { "epoch": 0.4587272727272727, "grad_norm": 4.78125, "grad_norm_var": 0.11295572916666667, "learning_rate": 0.0001, "loss": 5.3151, "loss/crossentropy": 2.273399144411087, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1547576766461134, "step": 10092 }, { "epoch": 0.4588181818181818, "grad_norm": 5.125, "grad_norm_var": 0.10966389973958333, "learning_rate": 0.0001, "loss": 5.4469, "loss/crossentropy": 2.32647106051445, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.15754761546850204, "step": 10094 }, { "epoch": 0.45890909090909093, "grad_norm": 5.09375, "grad_norm_var": 0.10089518229166666, "learning_rate": 0.0001, "loss": 5.7782, "loss/crossentropy": 2.5891076922416687, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16930406540632248, "step": 10096 }, { "epoch": 0.459, "grad_norm": 5.03125, "grad_norm_var": 0.10513916015625, "learning_rate": 0.0001, "loss": 6.0809, "loss/crossentropy": 2.813577950000763, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.17732226476073265, "step": 10098 }, { "epoch": 0.4590909090909091, "grad_norm": 5.09375, "grad_norm_var": 0.11966145833333333, "learning_rate": 0.0001, "loss": 5.6009, "loss/crossentropy": 2.4874807596206665, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.160752821713686, "step": 10100 }, { "epoch": 0.4591818181818182, "grad_norm": 4.625, "grad_norm_var": 0.14700520833333333, "learning_rate": 0.0001, "loss": 5.1431, "loss/crossentropy": 2.190559983253479, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1468144841492176, "step": 10102 }, { "epoch": 0.4592727272727273, "grad_norm": 5.15625, "grad_norm_var": 0.16939697265625, "learning_rate": 0.0001, "loss": 5.0406, "loss/crossentropy": 2.1303173303604126, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1416107527911663, "step": 10104 }, { "epoch": 0.45936363636363636, "grad_norm": 4.6875, "grad_norm_var": 0.07069905598958333, "learning_rate": 0.0001, "loss": 5.4999, "loss/crossentropy": 2.469534695148468, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15636150911450386, "step": 10106 }, { "epoch": 0.45945454545454545, "grad_norm": 4.9375, "grad_norm_var": 0.071728515625, "learning_rate": 0.0001, "loss": 5.708, "loss/crossentropy": 2.507957696914673, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16785526648163795, "step": 10108 }, { "epoch": 0.45954545454545453, "grad_norm": 5.46875, "grad_norm_var": 0.07459309895833334, "learning_rate": 0.0001, "loss": 5.7098, "loss/crossentropy": 2.5127435326576233, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16814586520195007, "step": 10110 }, { "epoch": 0.4596363636363636, "grad_norm": 5.125, "grad_norm_var": 0.08837483723958334, "learning_rate": 0.0001, "loss": 5.6874, "loss/crossentropy": 2.5159651041030884, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.1657714769244194, "step": 10112 }, { "epoch": 0.4597272727272727, "grad_norm": 4.96875, "grad_norm_var": 0.08801676432291666, "learning_rate": 0.0001, "loss": 5.7296, "loss/crossentropy": 2.5522586703300476, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1659788265824318, "step": 10114 }, { "epoch": 0.4598181818181818, "grad_norm": 5.28125, "grad_norm_var": 0.08684488932291666, "learning_rate": 0.0001, "loss": 5.927, "loss/crossentropy": 2.66098952293396, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1754264086484909, "step": 10116 }, { "epoch": 0.45990909090909093, "grad_norm": 5.15625, "grad_norm_var": 0.07303059895833333, "learning_rate": 0.0001, "loss": 5.9709, "loss/crossentropy": 2.6837974190711975, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.1753871887922287, "step": 10118 }, { "epoch": 0.46, "grad_norm": 5.0625, "grad_norm_var": 0.053125, "learning_rate": 0.0001, "loss": 5.8412, "loss/crossentropy": 2.6355390548706055, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.17193510755896568, "step": 10120 }, { "epoch": 0.4600909090909091, "grad_norm": 4.875, "grad_norm_var": 0.034375, "learning_rate": 0.0001, "loss": 5.7772, "loss/crossentropy": 2.582201838493347, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.1669638305902481, "step": 10122 }, { "epoch": 0.4601818181818182, "grad_norm": 5.15625, "grad_norm_var": 0.033447265625, "learning_rate": 0.0001, "loss": 5.6942, "loss/crossentropy": 2.528245061635971, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16561562567949295, "step": 10124 }, { "epoch": 0.4602727272727273, "grad_norm": 4.71875, "grad_norm_var": 0.05178629557291667, "learning_rate": 0.0001, "loss": 5.6568, "loss/crossentropy": 2.438015580177307, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16836068034172058, "step": 10126 }, { "epoch": 0.46036363636363636, "grad_norm": 6.125, "grad_norm_var": 0.11066080729166666, "learning_rate": 0.0001, "loss": 5.8822, "loss/crossentropy": 2.6333528757095337, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.17117619514465332, "step": 10128 }, { "epoch": 0.46045454545454545, "grad_norm": 5.0, "grad_norm_var": 0.10992431640625, "learning_rate": 0.0001, "loss": 5.5461, "loss/crossentropy": 2.4341570138931274, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.15884821116924286, "step": 10130 }, { "epoch": 0.46054545454545454, "grad_norm": 4.5, "grad_norm_var": 0.15435791015625, "learning_rate": 0.0001, "loss": 5.222, "loss/crossentropy": 2.2636961340904236, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.1499356273561716, "step": 10132 }, { "epoch": 0.4606363636363636, "grad_norm": 5.625, "grad_norm_var": 0.233056640625, "learning_rate": 0.0001, "loss": 5.5808, "loss/crossentropy": 2.4303122758865356, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16426336765289307, "step": 10134 }, { "epoch": 0.4607272727272727, "grad_norm": 5.0625, "grad_norm_var": 0.23748372395833334, "learning_rate": 0.0001, "loss": 5.813, "loss/crossentropy": 2.642366349697113, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16725772991776466, "step": 10136 }, { "epoch": 0.4608181818181818, "grad_norm": 5.0, "grad_norm_var": 0.23277587890625, "learning_rate": 0.0001, "loss": 5.4996, "loss/crossentropy": 2.348834991455078, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16468404605984688, "step": 10138 }, { "epoch": 0.46090909090909093, "grad_norm": 5.65625, "grad_norm_var": 0.24451497395833333, "learning_rate": 0.0001, "loss": 5.4233, "loss/crossentropy": 2.3629530668258667, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.15505998209118843, "step": 10140 }, { "epoch": 0.461, "grad_norm": 5.1875, "grad_norm_var": 0.23039957682291667, "learning_rate": 0.0001, "loss": 5.8546, "loss/crossentropy": 2.648069143295288, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1671368069946766, "step": 10142 }, { "epoch": 0.4610909090909091, "grad_norm": 5.09375, "grad_norm_var": 0.17200113932291666, "learning_rate": 0.0001, "loss": 5.8836, "loss/crossentropy": 2.645288646221161, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17421811446547508, "step": 10144 }, { "epoch": 0.4611818181818182, "grad_norm": 8.8125, "grad_norm_var": 0.9895792643229167, "learning_rate": 0.0001, "loss": 5.6889, "loss/crossentropy": 2.4696746468544006, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17231778800487518, "step": 10146 }, { "epoch": 0.4612727272727273, "grad_norm": 4.8125, "grad_norm_var": 0.9122355143229167, "learning_rate": 0.0001, "loss": 5.4625, "loss/crossentropy": 2.3247100710868835, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.15947851911187172, "step": 10148 }, { "epoch": 0.46136363636363636, "grad_norm": 5.03125, "grad_norm_var": 0.8884073893229166, "learning_rate": 0.0001, "loss": 5.7755, "loss/crossentropy": 2.547317385673523, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17165100201964378, "step": 10150 }, { "epoch": 0.46145454545454545, "grad_norm": 5.6875, "grad_norm_var": 0.8880045572916667, "learning_rate": 0.0001, "loss": 5.4492, "loss/crossentropy": 2.360816776752472, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16001111641526222, "step": 10152 }, { "epoch": 0.46154545454545454, "grad_norm": 4.9375, "grad_norm_var": 0.89088134765625, "learning_rate": 0.0001, "loss": 5.5907, "loss/crossentropy": 2.4356706738471985, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16530462354421616, "step": 10154 }, { "epoch": 0.4616363636363636, "grad_norm": 5.15625, "grad_norm_var": 0.884375, "learning_rate": 0.0001, "loss": 5.9815, "loss/crossentropy": 2.6721596717834473, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17858954146504402, "step": 10156 }, { "epoch": 0.4617272727272727, "grad_norm": 5.40625, "grad_norm_var": 0.88570556640625, "learning_rate": 0.0001, "loss": 5.5053, "loss/crossentropy": 2.350754141807556, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1638893522322178, "step": 10158 }, { "epoch": 0.4618181818181818, "grad_norm": 4.875, "grad_norm_var": 0.9043904622395833, "learning_rate": 0.0001, "loss": 5.3561, "loss/crossentropy": 2.3107032775878906, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15648816153407097, "step": 10160 }, { "epoch": 0.46190909090909094, "grad_norm": 4.59375, "grad_norm_var": 0.08625895182291667, "learning_rate": 0.0001, "loss": 5.2003, "loss/crossentropy": 2.262757420539856, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.14316556602716446, "step": 10162 }, { "epoch": 0.462, "grad_norm": 5.9375, "grad_norm_var": 0.12626546223958332, "learning_rate": 0.0001, "loss": 5.9179, "loss/crossentropy": 2.6317758560180664, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.17881131172180176, "step": 10164 }, { "epoch": 0.4620909090909091, "grad_norm": 5.34375, "grad_norm_var": 0.13189697265625, "learning_rate": 0.0001, "loss": 5.5969, "loss/crossentropy": 2.471145808696747, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16120385378599167, "step": 10166 }, { "epoch": 0.4621818181818182, "grad_norm": 4.90625, "grad_norm_var": 0.12991129557291667, "learning_rate": 0.0001, "loss": 5.9735, "loss/crossentropy": 2.633733034133911, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17889398708939552, "step": 10168 }, { "epoch": 0.4622727272727273, "grad_norm": 5.1875, "grad_norm_var": 0.14498291015625, "learning_rate": 0.0001, "loss": 5.6705, "loss/crossentropy": 2.5365323424339294, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16339827701449394, "step": 10170 }, { "epoch": 0.46236363636363637, "grad_norm": 4.8125, "grad_norm_var": 0.146875, "learning_rate": 0.0001, "loss": 5.7597, "loss/crossentropy": 2.5665743350982666, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16969982534646988, "step": 10172 }, { "epoch": 0.46245454545454545, "grad_norm": 5.21875, "grad_norm_var": 0.14159749348958334, "learning_rate": 0.0001, "loss": 5.8245, "loss/crossentropy": 2.6248472929000854, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.16703861206769943, "step": 10174 }, { "epoch": 0.46254545454545454, "grad_norm": 7.53125, "grad_norm_var": 0.5184529622395834, "learning_rate": 0.0001, "loss": 5.6952, "loss/crossentropy": 2.487411081790924, "loss/hidden": 1.564453125, "loss/jsd": 0.0, "loss/logits": 0.16433263570070267, "step": 10176 }, { "epoch": 0.4626363636363636, "grad_norm": 4.75, "grad_norm_var": 0.47362874348958334, "learning_rate": 0.0001, "loss": 5.4024, "loss/crossentropy": 2.3721718788146973, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15263645723462105, "step": 10178 }, { "epoch": 0.4627272727272727, "grad_norm": 4.9375, "grad_norm_var": 0.4486612955729167, "learning_rate": 0.0001, "loss": 5.6552, "loss/crossentropy": 2.474564164876938, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1663099005818367, "step": 10180 }, { "epoch": 0.4628181818181818, "grad_norm": 5.15625, "grad_norm_var": 0.43902587890625, "learning_rate": 0.0001, "loss": 5.7234, "loss/crossentropy": 2.514283239841461, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1716976761817932, "step": 10182 }, { "epoch": 0.46290909090909094, "grad_norm": 4.5625, "grad_norm_var": 0.45865478515625, "learning_rate": 0.0001, "loss": 5.5406, "loss/crossentropy": 2.454796254634857, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1603374257683754, "step": 10184 }, { "epoch": 0.463, "grad_norm": 4.875, "grad_norm_var": 0.4772420247395833, "learning_rate": 0.0001, "loss": 5.5807, "loss/crossentropy": 2.499754846096039, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1616072617471218, "step": 10186 }, { "epoch": 0.4630909090909091, "grad_norm": 5.21875, "grad_norm_var": 0.475634765625, "learning_rate": 0.0001, "loss": 5.2616, "loss/crossentropy": 2.2287834882736206, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1546516828238964, "step": 10188 }, { "epoch": 0.4631818181818182, "grad_norm": 4.75, "grad_norm_var": 0.48814697265625, "learning_rate": 0.0001, "loss": 5.2765, "loss/crossentropy": 2.235899329185486, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15503547340631485, "step": 10190 }, { "epoch": 0.4632727272727273, "grad_norm": 4.84375, "grad_norm_var": 0.0740234375, "learning_rate": 0.0001, "loss": 5.8881, "loss/crossentropy": 2.6763737201690674, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.17175846546888351, "step": 10192 }, { "epoch": 0.46336363636363637, "grad_norm": 5.125, "grad_norm_var": 0.06461181640625, "learning_rate": 0.0001, "loss": 5.7411, "loss/crossentropy": 2.4610151648521423, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17527318373322487, "step": 10194 }, { "epoch": 0.46345454545454545, "grad_norm": 5.3125, "grad_norm_var": 0.065478515625, "learning_rate": 0.0001, "loss": 5.7151, "loss/crossentropy": 2.463517665863037, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.170863077044487, "step": 10196 }, { "epoch": 0.46354545454545454, "grad_norm": 4.4375, "grad_norm_var": 0.0640625, "learning_rate": 0.0001, "loss": 5.4836, "loss/crossentropy": 2.397216945886612, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15941618010401726, "step": 10198 }, { "epoch": 0.4636363636363636, "grad_norm": 4.90625, "grad_norm_var": 0.057938639322916666, "learning_rate": 0.0001, "loss": 5.9626, "loss/crossentropy": 2.7211652398109436, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.17590255662798882, "step": 10200 }, { "epoch": 0.4637272727272727, "grad_norm": 4.75, "grad_norm_var": 0.05575764973958333, "learning_rate": 0.0001, "loss": 5.294, "loss/crossentropy": 2.2421064376831055, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15519215911626816, "step": 10202 }, { "epoch": 0.4638181818181818, "grad_norm": 5.125, "grad_norm_var": 0.05227864583333333, "learning_rate": 0.0001, "loss": 5.8727, "loss/crossentropy": 2.625628352165222, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17549137398600578, "step": 10204 }, { "epoch": 0.4639090909090909, "grad_norm": 5.9375, "grad_norm_var": 0.11278889973958334, "learning_rate": 0.0001, "loss": 6.0481, "loss/crossentropy": 2.7325512766838074, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18038010597229004, "step": 10206 }, { "epoch": 0.464, "grad_norm": 5.40625, "grad_norm_var": 0.12089436848958333, "learning_rate": 0.0001, "loss": 6.1284, "loss/crossentropy": 2.804191827774048, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17929257079958916, "step": 10208 }, { "epoch": 0.4640909090909091, "grad_norm": 5.25, "grad_norm_var": 0.12888997395833332, "learning_rate": 0.0001, "loss": 5.7527, "loss/crossentropy": 2.552089273929596, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.168109692633152, "step": 10210 }, { "epoch": 0.4641818181818182, "grad_norm": 5.0, "grad_norm_var": 0.12237955729166666, "learning_rate": 0.0001, "loss": 5.5425, "loss/crossentropy": 2.394629567861557, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.16849565878510475, "step": 10212 }, { "epoch": 0.4642727272727273, "grad_norm": 5.875, "grad_norm_var": 0.12472330729166667, "learning_rate": 0.0001, "loss": 5.915, "loss/crossentropy": 2.6668078303337097, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1732548177242279, "step": 10214 }, { "epoch": 0.46436363636363637, "grad_norm": 5.53125, "grad_norm_var": 0.11441650390625, "learning_rate": 0.0001, "loss": 5.9009, "loss/crossentropy": 2.7019988298416138, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.17008152231574059, "step": 10216 }, { "epoch": 0.46445454545454545, "grad_norm": 4.78125, "grad_norm_var": 0.12545572916666667, "learning_rate": 0.0001, "loss": 5.8138, "loss/crossentropy": 2.686108887195587, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1645258516073227, "step": 10218 }, { "epoch": 0.46454545454545454, "grad_norm": 5.125, "grad_norm_var": 0.11955973307291666, "learning_rate": 0.0001, "loss": 6.0676, "loss/crossentropy": 2.821801006793976, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1736070215702057, "step": 10220 }, { "epoch": 0.4646363636363636, "grad_norm": 5.03125, "grad_norm_var": 0.08521728515625, "learning_rate": 0.0001, "loss": 5.6358, "loss/crossentropy": 2.481220781803131, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16604836657643318, "step": 10222 }, { "epoch": 0.4647272727272727, "grad_norm": 5.21875, "grad_norm_var": 0.09152018229166667, "learning_rate": 0.0001, "loss": 5.4285, "loss/crossentropy": 2.3580016493797302, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15782896056771278, "step": 10224 }, { "epoch": 0.4648181818181818, "grad_norm": 5.125, "grad_norm_var": 0.08527018229166666, "learning_rate": 0.0001, "loss": 5.6063, "loss/crossentropy": 2.3915364742279053, "loss/hidden": 1.572265625, "loss/jsd": 0.0, "loss/logits": 0.16425182670354843, "step": 10226 }, { "epoch": 0.4649090909090909, "grad_norm": 5.28125, "grad_norm_var": 0.089697265625, "learning_rate": 0.0001, "loss": 5.9133, "loss/crossentropy": 2.716290533542633, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16755032911896706, "step": 10228 }, { "epoch": 0.465, "grad_norm": 5.0, "grad_norm_var": 0.08192952473958333, "learning_rate": 0.0001, "loss": 5.7135, "loss/crossentropy": 2.4960232973098755, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.1695951335132122, "step": 10230 }, { "epoch": 0.4650909090909091, "grad_norm": 5.375, "grad_norm_var": 0.07512613932291666, "learning_rate": 0.0001, "loss": 5.903, "loss/crossentropy": 2.6503379940986633, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.17389726266264915, "step": 10232 }, { "epoch": 0.4651818181818182, "grad_norm": 4.78125, "grad_norm_var": 0.10022379557291666, "learning_rate": 0.0001, "loss": 5.2011, "loss/crossentropy": 2.2310989797115326, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1489558331668377, "step": 10234 }, { "epoch": 0.4652727272727273, "grad_norm": 4.5625, "grad_norm_var": 0.11663004557291666, "learning_rate": 0.0001, "loss": 5.1288, "loss/crossentropy": 2.165414035320282, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.1488744243979454, "step": 10236 }, { "epoch": 0.46536363636363637, "grad_norm": 5.03125, "grad_norm_var": 0.12984619140625, "learning_rate": 0.0001, "loss": 5.6305, "loss/crossentropy": 2.3956871032714844, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17035681754350662, "step": 10238 }, { "epoch": 0.46545454545454545, "grad_norm": 5.5, "grad_norm_var": 0.13085530598958334, "learning_rate": 0.0001, "loss": 5.8668, "loss/crossentropy": 2.6057299375534058, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17454735934734344, "step": 10240 }, { "epoch": 0.46554545454545454, "grad_norm": 5.0625, "grad_norm_var": 0.13332926432291667, "learning_rate": 0.0001, "loss": 5.3972, "loss/crossentropy": 2.2675250470638275, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16120528057217598, "step": 10242 }, { "epoch": 0.4656363636363636, "grad_norm": 5.625, "grad_norm_var": 0.1439453125, "learning_rate": 0.0001, "loss": 5.6698, "loss/crossentropy": 2.470262289047241, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1693725623190403, "step": 10244 }, { "epoch": 0.4657272727272727, "grad_norm": 5.125, "grad_norm_var": 0.11197916666666667, "learning_rate": 0.0001, "loss": 5.6954, "loss/crossentropy": 2.5352267026901245, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16582264378666878, "step": 10246 }, { "epoch": 0.4658181818181818, "grad_norm": 5.53125, "grad_norm_var": 0.11873372395833333, "learning_rate": 0.0001, "loss": 5.6346, "loss/crossentropy": 2.4787798523902893, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.15932821854948997, "step": 10248 }, { "epoch": 0.4659090909090909, "grad_norm": 5.25, "grad_norm_var": 0.09251302083333333, "learning_rate": 0.0001, "loss": 5.4991, "loss/crossentropy": 2.402052164077759, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16048173606395721, "step": 10250 }, { "epoch": 0.466, "grad_norm": 5.0625, "grad_norm_var": 0.09855143229166667, "learning_rate": 0.0001, "loss": 5.95, "loss/crossentropy": 2.6814972162246704, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.17392338439822197, "step": 10252 }, { "epoch": 0.4660909090909091, "grad_norm": 4.96875, "grad_norm_var": 0.096337890625, "learning_rate": 0.0001, "loss": 5.7428, "loss/crossentropy": 2.5705888867378235, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1660449393093586, "step": 10254 }, { "epoch": 0.4661818181818182, "grad_norm": 4.75, "grad_norm_var": 0.1130859375, "learning_rate": 0.0001, "loss": 5.699, "loss/crossentropy": 2.606991708278656, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15979165583848953, "step": 10256 }, { "epoch": 0.4662727272727273, "grad_norm": 5.21875, "grad_norm_var": 0.11249593098958334, "learning_rate": 0.0001, "loss": 6.0228, "loss/crossentropy": 2.7264586687088013, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17748289182782173, "step": 10258 }, { "epoch": 0.46636363636363637, "grad_norm": 5.28125, "grad_norm_var": 0.10402018229166667, "learning_rate": 0.0001, "loss": 5.9147, "loss/crossentropy": 2.619687497615814, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17676228657364845, "step": 10260 }, { "epoch": 0.46645454545454546, "grad_norm": 5.3125, "grad_norm_var": 0.11142171223958333, "learning_rate": 0.0001, "loss": 5.6013, "loss/crossentropy": 2.503659963607788, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1587851643562317, "step": 10262 }, { "epoch": 0.46654545454545454, "grad_norm": 5.40625, "grad_norm_var": 0.11780192057291666, "learning_rate": 0.0001, "loss": 5.5974, "loss/crossentropy": 2.4649731516838074, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16285210102796555, "step": 10264 }, { "epoch": 0.4666363636363636, "grad_norm": 5.15625, "grad_norm_var": 0.10167643229166666, "learning_rate": 0.0001, "loss": 5.809, "loss/crossentropy": 2.6400445699691772, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16728178411722183, "step": 10266 }, { "epoch": 0.4667272727272727, "grad_norm": 4.96875, "grad_norm_var": 0.0564453125, "learning_rate": 0.0001, "loss": 5.5922, "loss/crossentropy": 2.47862446308136, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1603844277560711, "step": 10268 }, { "epoch": 0.4668181818181818, "grad_norm": 5.3125, "grad_norm_var": 0.08229166666666667, "learning_rate": 0.0001, "loss": 5.7376, "loss/crossentropy": 2.5751465559005737, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1672227419912815, "step": 10270 }, { "epoch": 0.4669090909090909, "grad_norm": 4.34375, "grad_norm_var": 0.10565999348958334, "learning_rate": 0.0001, "loss": 5.3833, "loss/crossentropy": 2.3502790927886963, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15329759009182453, "step": 10272 }, { "epoch": 0.467, "grad_norm": 4.8125, "grad_norm_var": 0.10480143229166666, "learning_rate": 0.0001, "loss": 5.6135, "loss/crossentropy": 2.4944411516189575, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16131922230124474, "step": 10274 }, { "epoch": 0.4670909090909091, "grad_norm": 4.875, "grad_norm_var": 0.086962890625, "learning_rate": 0.0001, "loss": 5.4955, "loss/crossentropy": 2.458085298538208, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15491009876132011, "step": 10276 }, { "epoch": 0.4671818181818182, "grad_norm": 5.5, "grad_norm_var": 0.1197265625, "learning_rate": 0.0001, "loss": 5.6246, "loss/crossentropy": 2.460436522960663, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16388041526079178, "step": 10278 }, { "epoch": 0.4672727272727273, "grad_norm": 4.9375, "grad_norm_var": 0.10467122395833334, "learning_rate": 0.0001, "loss": 5.6272, "loss/crossentropy": 2.4376583099365234, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16777895018458366, "step": 10280 }, { "epoch": 0.46736363636363637, "grad_norm": 5.0, "grad_norm_var": 0.10310872395833333, "learning_rate": 0.0001, "loss": 5.6147, "loss/crossentropy": 2.475985825061798, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1621105745434761, "step": 10282 }, { "epoch": 0.46745454545454546, "grad_norm": 5.1875, "grad_norm_var": 0.11808268229166667, "learning_rate": 0.0001, "loss": 5.5724, "loss/crossentropy": 2.4327688217163086, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.15986617282032967, "step": 10284 }, { "epoch": 0.46754545454545454, "grad_norm": 4.53125, "grad_norm_var": 0.13599853515625, "learning_rate": 0.0001, "loss": 5.071, "loss/crossentropy": 2.1515549421310425, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.14604653418064117, "step": 10286 }, { "epoch": 0.46763636363636363, "grad_norm": 5.125, "grad_norm_var": 0.11077067057291666, "learning_rate": 0.0001, "loss": 5.4015, "loss/crossentropy": 2.3547644317150116, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15565212815999985, "step": 10288 }, { "epoch": 0.4677272727272727, "grad_norm": 4.65625, "grad_norm_var": 0.11900634765625, "learning_rate": 0.0001, "loss": 5.2888, "loss/crossentropy": 2.260964572429657, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1518026925623417, "step": 10290 }, { "epoch": 0.4678181818181818, "grad_norm": 5.53125, "grad_norm_var": 0.15167643229166666, "learning_rate": 0.0001, "loss": 5.59, "loss/crossentropy": 2.416129767894745, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16738584637641907, "step": 10292 }, { "epoch": 0.4679090909090909, "grad_norm": 4.5625, "grad_norm_var": 0.11340738932291666, "learning_rate": 0.0001, "loss": 5.4806, "loss/crossentropy": 2.424233078956604, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1595420427620411, "step": 10294 }, { "epoch": 0.468, "grad_norm": 5.125, "grad_norm_var": 0.14368489583333333, "learning_rate": 0.0001, "loss": 5.6309, "loss/crossentropy": 2.449775516986847, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1651780605316162, "step": 10296 }, { "epoch": 0.4680909090909091, "grad_norm": 5.375, "grad_norm_var": 0.41386311848958335, "learning_rate": 0.0001, "loss": 6.0578, "loss/crossentropy": 2.7274314761161804, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17971158027648926, "step": 10298 }, { "epoch": 0.4681818181818182, "grad_norm": 5.03125, "grad_norm_var": 0.42720947265625, "learning_rate": 0.0001, "loss": 5.1115, "loss/crossentropy": 2.1059387028217316, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15211839228868484, "step": 10300 }, { "epoch": 0.4682727272727273, "grad_norm": 5.3125, "grad_norm_var": 0.3827433268229167, "learning_rate": 0.0001, "loss": 5.4711, "loss/crossentropy": 2.3175462186336517, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1626206375658512, "step": 10302 }, { "epoch": 0.46836363636363637, "grad_norm": 5.0, "grad_norm_var": 0.38013916015625, "learning_rate": 0.0001, "loss": 5.722, "loss/crossentropy": 2.5750012397766113, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16489386186003685, "step": 10304 }, { "epoch": 0.46845454545454546, "grad_norm": 5.0, "grad_norm_var": 0.36595052083333335, "learning_rate": 0.0001, "loss": 5.7887, "loss/crossentropy": 2.628276824951172, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16603873670101166, "step": 10306 }, { "epoch": 0.46854545454545454, "grad_norm": 4.65625, "grad_norm_var": 0.346728515625, "learning_rate": 0.0001, "loss": 5.8213, "loss/crossentropy": 2.7035282254219055, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16451570764183998, "step": 10308 }, { "epoch": 0.46863636363636363, "grad_norm": 4.65625, "grad_norm_var": 0.338916015625, "learning_rate": 0.0001, "loss": 5.4852, "loss/crossentropy": 2.387792646884918, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.15758850425481796, "step": 10310 }, { "epoch": 0.4687272727272727, "grad_norm": 5.1875, "grad_norm_var": 0.327587890625, "learning_rate": 0.0001, "loss": 5.647, "loss/crossentropy": 2.451628863811493, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16738490760326385, "step": 10312 }, { "epoch": 0.4688181818181818, "grad_norm": 4.90625, "grad_norm_var": 0.07662353515625, "learning_rate": 0.0001, "loss": 5.4999, "loss/crossentropy": 2.381097972393036, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16168687492609024, "step": 10314 }, { "epoch": 0.4689090909090909, "grad_norm": 5.21875, "grad_norm_var": 0.065625, "learning_rate": 0.0001, "loss": 5.6299, "loss/crossentropy": 2.519278585910797, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16242564469575882, "step": 10316 }, { "epoch": 0.469, "grad_norm": 5.1875, "grad_norm_var": 0.05245768229166667, "learning_rate": 0.0001, "loss": 5.52, "loss/crossentropy": 2.4072290658950806, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16088226810097694, "step": 10318 }, { "epoch": 0.4690909090909091, "grad_norm": 4.6875, "grad_norm_var": 0.05670572916666667, "learning_rate": 0.0001, "loss": 5.6058, "loss/crossentropy": 2.564880907535553, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1574162170290947, "step": 10320 }, { "epoch": 0.4691818181818182, "grad_norm": 5.53125, "grad_norm_var": 0.5912394205729167, "learning_rate": 0.0001, "loss": 6.1794, "loss/crossentropy": 2.8071731328964233, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1856592260301113, "step": 10322 }, { "epoch": 0.4692727272727273, "grad_norm": 5.5625, "grad_norm_var": 0.5934529622395833, "learning_rate": 0.0001, "loss": 5.8092, "loss/crossentropy": 2.6045714616775513, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17027004435658455, "step": 10324 }, { "epoch": 0.46936363636363637, "grad_norm": 5.125, "grad_norm_var": 0.6058430989583333, "learning_rate": 0.0001, "loss": 5.6659, "loss/crossentropy": 2.4195780754089355, "loss/hidden": 1.560546875, "loss/jsd": 0.0, "loss/logits": 0.16857796162366867, "step": 10326 }, { "epoch": 0.46945454545454546, "grad_norm": 5.46875, "grad_norm_var": 0.6098307291666667, "learning_rate": 0.0001, "loss": 6.172, "loss/crossentropy": 2.8652427196502686, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.17931154370307922, "step": 10328 }, { "epoch": 0.46954545454545454, "grad_norm": 4.59375, "grad_norm_var": 0.6028645833333334, "learning_rate": 0.0001, "loss": 5.453, "loss/crossentropy": 2.3860133290290833, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15787334740161896, "step": 10330 }, { "epoch": 0.46963636363636363, "grad_norm": 4.90625, "grad_norm_var": 0.64527587890625, "learning_rate": 0.0001, "loss": 5.1509, "loss/crossentropy": 2.16838476061821, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15020570531487465, "step": 10332 }, { "epoch": 0.4697272727272727, "grad_norm": 4.9375, "grad_norm_var": 0.6307576497395834, "learning_rate": 0.0001, "loss": 5.5674, "loss/crossentropy": 2.464353561401367, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16069798171520233, "step": 10334 }, { "epoch": 0.4698181818181818, "grad_norm": 4.78125, "grad_norm_var": 0.60670166015625, "learning_rate": 0.0001, "loss": 5.9769, "loss/crossentropy": 2.725364327430725, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17671694606542587, "step": 10336 }, { "epoch": 0.4699090909090909, "grad_norm": 4.78125, "grad_norm_var": 0.15753580729166666, "learning_rate": 0.0001, "loss": 5.8292, "loss/crossentropy": 2.676965117454529, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16678671538829803, "step": 10338 }, { "epoch": 0.47, "grad_norm": 4.5625, "grad_norm_var": 0.15689697265625, "learning_rate": 0.0001, "loss": 5.4297, "loss/crossentropy": 2.404458522796631, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15604405105113983, "step": 10340 }, { "epoch": 0.4700909090909091, "grad_norm": 4.9375, "grad_norm_var": 0.09136962890625, "learning_rate": 0.0001, "loss": 6.1882, "loss/crossentropy": 2.8405264019966125, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.18222477287054062, "step": 10342 }, { "epoch": 0.4701818181818182, "grad_norm": 5.03125, "grad_norm_var": 0.054032389322916666, "learning_rate": 0.0001, "loss": 5.6054, "loss/crossentropy": 2.476129949092865, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16409720852971077, "step": 10344 }, { "epoch": 0.4702727272727273, "grad_norm": 4.71875, "grad_norm_var": 0.05054931640625, "learning_rate": 0.0001, "loss": 5.453, "loss/crossentropy": 2.367178499698639, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15779682993888855, "step": 10346 }, { "epoch": 0.4703636363636364, "grad_norm": 5.0625, "grad_norm_var": 0.044905598958333334, "learning_rate": 0.0001, "loss": 5.2266, "loss/crossentropy": 2.189123213291168, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15647954121232033, "step": 10348 }, { "epoch": 0.47045454545454546, "grad_norm": 5.125, "grad_norm_var": 0.045308430989583336, "learning_rate": 0.0001, "loss": 5.8512, "loss/crossentropy": 2.628066599369049, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.17133944109082222, "step": 10350 }, { "epoch": 0.47054545454545454, "grad_norm": 5.1875, "grad_norm_var": 0.05831705729166667, "learning_rate": 0.0001, "loss": 6.1039, "loss/crossentropy": 2.7674103379249573, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18013695627450943, "step": 10352 }, { "epoch": 0.47063636363636363, "grad_norm": 5.25, "grad_norm_var": 0.07146809895833334, "learning_rate": 0.0001, "loss": 5.7099, "loss/crossentropy": 2.4922373294830322, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17157429084181786, "step": 10354 }, { "epoch": 0.4707272727272727, "grad_norm": 4.78125, "grad_norm_var": 0.05677083333333333, "learning_rate": 0.0001, "loss": 5.2921, "loss/crossentropy": 2.318471670150757, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1461876779794693, "step": 10356 }, { "epoch": 0.4708181818181818, "grad_norm": 4.8125, "grad_norm_var": 0.057535807291666664, "learning_rate": 0.0001, "loss": 5.9016, "loss/crossentropy": 2.7132039070129395, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16962099820375443, "step": 10358 }, { "epoch": 0.4709090909090909, "grad_norm": 4.84375, "grad_norm_var": 0.05618082682291667, "learning_rate": 0.0001, "loss": 5.8832, "loss/crossentropy": 2.7303494811058044, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16606241464614868, "step": 10360 }, { "epoch": 0.471, "grad_norm": 4.59375, "grad_norm_var": 0.072509765625, "learning_rate": 0.0001, "loss": 5.4018, "loss/crossentropy": 2.3479784727096558, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.15479470044374466, "step": 10362 }, { "epoch": 0.4710909090909091, "grad_norm": 4.625, "grad_norm_var": 0.07258707682291667, "learning_rate": 0.0001, "loss": 5.175, "loss/crossentropy": 2.212315022945404, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.14646058157086372, "step": 10364 }, { "epoch": 0.4711818181818182, "grad_norm": 4.84375, "grad_norm_var": 0.07375895182291667, "learning_rate": 0.0001, "loss": 5.4812, "loss/crossentropy": 2.408233106136322, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.15514393523335457, "step": 10366 }, { "epoch": 0.4712727272727273, "grad_norm": 5.1875, "grad_norm_var": 0.07589518229166667, "learning_rate": 0.0001, "loss": 5.2837, "loss/crossentropy": 2.265029489994049, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15186350792646408, "step": 10368 }, { "epoch": 0.4713636363636364, "grad_norm": 4.78125, "grad_norm_var": 0.054280598958333336, "learning_rate": 0.0001, "loss": 5.4238, "loss/crossentropy": 2.3651607632637024, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.15566496178507805, "step": 10370 }, { "epoch": 0.47145454545454546, "grad_norm": 4.90625, "grad_norm_var": 0.055497233072916666, "learning_rate": 0.0001, "loss": 5.2974, "loss/crossentropy": 2.241815507411957, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.1541912481188774, "step": 10372 }, { "epoch": 0.47154545454545455, "grad_norm": 4.96875, "grad_norm_var": 0.05162760416666667, "learning_rate": 0.0001, "loss": 5.4756, "loss/crossentropy": 2.3846670985221863, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1583143100142479, "step": 10374 }, { "epoch": 0.47163636363636363, "grad_norm": 5.875, "grad_norm_var": 0.12277018229166667, "learning_rate": 0.0001, "loss": 5.5922, "loss/crossentropy": 2.421465516090393, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16316719725728035, "step": 10376 }, { "epoch": 0.4717272727272727, "grad_norm": 4.8125, "grad_norm_var": 0.1009765625, "learning_rate": 0.0001, "loss": 5.3615, "loss/crossentropy": 2.3337239027023315, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.15219657123088837, "step": 10378 }, { "epoch": 0.4718181818181818, "grad_norm": 4.71875, "grad_norm_var": 0.09195556640625, "learning_rate": 0.0001, "loss": 5.8249, "loss/crossentropy": 2.6515893936157227, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.16986994445323944, "step": 10380 }, { "epoch": 0.4719090909090909, "grad_norm": 5.03125, "grad_norm_var": 0.12213134765625, "learning_rate": 0.0001, "loss": 5.1523, "loss/crossentropy": 2.162407875061035, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.14937683567404747, "step": 10382 }, { "epoch": 0.472, "grad_norm": 4.875, "grad_norm_var": 0.11302083333333333, "learning_rate": 0.0001, "loss": 5.5028, "loss/crossentropy": 2.4539568424224854, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.15351307019591331, "step": 10384 }, { "epoch": 0.4720909090909091, "grad_norm": 4.8125, "grad_norm_var": 0.13567708333333334, "learning_rate": 0.0001, "loss": 5.9246, "loss/crossentropy": 2.6984254121780396, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1718318685889244, "step": 10386 }, { "epoch": 0.4721818181818182, "grad_norm": 5.15625, "grad_norm_var": 0.13795572916666668, "learning_rate": 0.0001, "loss": 5.6197, "loss/crossentropy": 2.48600572347641, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16629966348409653, "step": 10388 }, { "epoch": 0.4722727272727273, "grad_norm": 5.03125, "grad_norm_var": 0.13474934895833332, "learning_rate": 0.0001, "loss": 5.4374, "loss/crossentropy": 2.3833118081092834, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1565813012421131, "step": 10390 }, { "epoch": 0.4723636363636364, "grad_norm": 5.4375, "grad_norm_var": 0.15198160807291666, "learning_rate": 0.0001, "loss": 5.6012, "loss/crossentropy": 2.430551290512085, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16687048226594925, "step": 10392 }, { "epoch": 0.47245454545454546, "grad_norm": 5.03125, "grad_norm_var": 0.37545572916666664, "learning_rate": 0.0001, "loss": 5.9638, "loss/crossentropy": 2.6363439559936523, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18196310475468636, "step": 10394 }, { "epoch": 0.47254545454545455, "grad_norm": 4.90625, "grad_norm_var": 0.3684895833333333, "learning_rate": 0.0001, "loss": 5.6301, "loss/crossentropy": 2.514181911945343, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1604190729558468, "step": 10396 }, { "epoch": 0.47263636363636363, "grad_norm": 4.84375, "grad_norm_var": 0.32808837890625, "learning_rate": 0.0001, "loss": 5.2865, "loss/crossentropy": 2.2711414098739624, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15211760625243187, "step": 10398 }, { "epoch": 0.4727272727272727, "grad_norm": 5.0, "grad_norm_var": 0.305712890625, "learning_rate": 0.0001, "loss": 5.5023, "loss/crossentropy": 2.366205930709839, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16380318254232407, "step": 10400 }, { "epoch": 0.4728181818181818, "grad_norm": 5.1875, "grad_norm_var": 0.28837483723958335, "learning_rate": 0.0001, "loss": 5.85, "loss/crossentropy": 2.5750760436058044, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17515313625335693, "step": 10402 }, { "epoch": 0.4729090909090909, "grad_norm": 4.65625, "grad_norm_var": 0.31301676432291664, "learning_rate": 0.0001, "loss": 5.528, "loss/crossentropy": 2.418864071369171, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16169051080942154, "step": 10404 }, { "epoch": 0.473, "grad_norm": 5.03125, "grad_norm_var": 0.3182291666666667, "learning_rate": 0.0001, "loss": 5.7058, "loss/crossentropy": 2.564753234386444, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1658579409122467, "step": 10406 }, { "epoch": 0.4730909090909091, "grad_norm": 4.8125, "grad_norm_var": 0.2723307291666667, "learning_rate": 0.0001, "loss": 5.5219, "loss/crossentropy": 2.468438148498535, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15612875670194626, "step": 10408 }, { "epoch": 0.4731818181818182, "grad_norm": 5.21875, "grad_norm_var": 0.05154622395833333, "learning_rate": 0.0001, "loss": 5.744, "loss/crossentropy": 2.539944112300873, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1700100339949131, "step": 10410 }, { "epoch": 0.4732727272727273, "grad_norm": 5.03125, "grad_norm_var": 0.07107747395833333, "learning_rate": 0.0001, "loss": 5.4063, "loss/crossentropy": 2.376885175704956, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15372546017169952, "step": 10412 }, { "epoch": 0.4733636363636364, "grad_norm": 6.0625, "grad_norm_var": 0.14075113932291666, "learning_rate": 0.0001, "loss": 5.9117, "loss/crossentropy": 2.579428732395172, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.1806904412806034, "step": 10414 }, { "epoch": 0.47345454545454546, "grad_norm": 5.03125, "grad_norm_var": 0.14855143229166667, "learning_rate": 0.0001, "loss": 5.6404, "loss/crossentropy": 2.4083632826805115, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17046483978629112, "step": 10416 }, { "epoch": 0.47354545454545455, "grad_norm": 5.0625, "grad_norm_var": 0.15152587890625, "learning_rate": 0.0001, "loss": 5.4428, "loss/crossentropy": 2.316746562719345, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16103798151016235, "step": 10418 }, { "epoch": 0.47363636363636363, "grad_norm": 5.21875, "grad_norm_var": 0.12720947265625, "learning_rate": 0.0001, "loss": 5.558, "loss/crossentropy": 2.334693670272827, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16921007633209229, "step": 10420 }, { "epoch": 0.4737272727272727, "grad_norm": 5.125, "grad_norm_var": 0.1400390625, "learning_rate": 0.0001, "loss": 5.6864, "loss/crossentropy": 2.601039707660675, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1597081460058689, "step": 10422 }, { "epoch": 0.4738181818181818, "grad_norm": 4.71875, "grad_norm_var": 0.164697265625, "learning_rate": 0.0001, "loss": 5.3233, "loss/crossentropy": 2.3194199800491333, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.14882060885429382, "step": 10424 }, { "epoch": 0.4739090909090909, "grad_norm": 4.96875, "grad_norm_var": 0.19049479166666666, "learning_rate": 0.0001, "loss": 5.4581, "loss/crossentropy": 2.282997637987137, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16712283715605736, "step": 10426 }, { "epoch": 0.474, "grad_norm": 5.5, "grad_norm_var": 0.16730143229166666, "learning_rate": 0.0001, "loss": 5.9154, "loss/crossentropy": 2.6853572726249695, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1735936850309372, "step": 10428 }, { "epoch": 0.4740909090909091, "grad_norm": 4.8125, "grad_norm_var": 0.11926676432291666, "learning_rate": 0.0001, "loss": 5.8136, "loss/crossentropy": 2.6626201272010803, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16822362318634987, "step": 10430 }, { "epoch": 0.4741818181818182, "grad_norm": 5.40625, "grad_norm_var": 0.11627197265625, "learning_rate": 0.0001, "loss": 5.6702, "loss/crossentropy": 2.5026819109916687, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1675366908311844, "step": 10432 }, { "epoch": 0.4742727272727273, "grad_norm": 5.46875, "grad_norm_var": 0.12278645833333333, "learning_rate": 0.0001, "loss": 5.7498, "loss/crossentropy": 2.5802228450775146, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16793661937117577, "step": 10434 }, { "epoch": 0.4743636363636364, "grad_norm": 5.0, "grad_norm_var": 0.12506103515625, "learning_rate": 0.0001, "loss": 5.752, "loss/crossentropy": 2.5887632369995117, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16378363221883774, "step": 10436 }, { "epoch": 0.47445454545454546, "grad_norm": 5.125, "grad_norm_var": 0.11560872395833334, "learning_rate": 0.0001, "loss": 5.6735, "loss/crossentropy": 2.4264530539512634, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17236346751451492, "step": 10438 }, { "epoch": 0.47454545454545455, "grad_norm": 5.0625, "grad_norm_var": 0.09915364583333333, "learning_rate": 0.0001, "loss": 5.6143, "loss/crossentropy": 2.451512098312378, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16451674327254295, "step": 10440 }, { "epoch": 0.47463636363636363, "grad_norm": 4.875, "grad_norm_var": 0.06542561848958334, "learning_rate": 0.0001, "loss": 5.7013, "loss/crossentropy": 2.461239755153656, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.17030007764697075, "step": 10442 }, { "epoch": 0.4747272727272727, "grad_norm": 5.03125, "grad_norm_var": 0.17919514973958334, "learning_rate": 0.0001, "loss": 5.7901, "loss/crossentropy": 2.554618716239929, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17315731570124626, "step": 10444 }, { "epoch": 0.4748181818181818, "grad_norm": 5.125, "grad_norm_var": 0.16951497395833334, "learning_rate": 0.0001, "loss": 5.8664, "loss/crossentropy": 2.6347113847732544, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1720011718571186, "step": 10446 }, { "epoch": 0.4749090909090909, "grad_norm": 5.34375, "grad_norm_var": 0.16638997395833333, "learning_rate": 0.0001, "loss": 5.8089, "loss/crossentropy": 2.6028707027435303, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17060595378279686, "step": 10448 }, { "epoch": 0.475, "grad_norm": 5.25, "grad_norm_var": 0.225244140625, "learning_rate": 0.0001, "loss": 5.6683, "loss/crossentropy": 2.430481255054474, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17143911868333817, "step": 10450 }, { "epoch": 0.47509090909090906, "grad_norm": 4.8125, "grad_norm_var": 0.22454427083333334, "learning_rate": 0.0001, "loss": 5.85, "loss/crossentropy": 2.671310245990753, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16884593293070793, "step": 10452 }, { "epoch": 0.4751818181818182, "grad_norm": 5.09375, "grad_norm_var": 0.235400390625, "learning_rate": 0.0001, "loss": 5.6332, "loss/crossentropy": 2.492436647415161, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16251663118600845, "step": 10454 }, { "epoch": 0.4752727272727273, "grad_norm": 5.5625, "grad_norm_var": 0.22069905598958334, "learning_rate": 0.0001, "loss": 5.2753, "loss/crossentropy": 2.2284893691539764, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15780295431613922, "step": 10456 }, { "epoch": 0.4753636363636364, "grad_norm": 5.28125, "grad_norm_var": 0.21086832682291667, "learning_rate": 0.0001, "loss": 5.5216, "loss/crossentropy": 2.443158507347107, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.1603798009455204, "step": 10458 }, { "epoch": 0.47545454545454546, "grad_norm": 4.5, "grad_norm_var": 0.13567708333333334, "learning_rate": 0.0001, "loss": 5.5806, "loss/crossentropy": 2.5180488228797913, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15781552344560623, "step": 10460 }, { "epoch": 0.47554545454545455, "grad_norm": 4.59375, "grad_norm_var": 0.15428059895833332, "learning_rate": 0.0001, "loss": 5.4022, "loss/crossentropy": 2.3884583711624146, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1531331092119217, "step": 10462 }, { "epoch": 0.47563636363636363, "grad_norm": 4.78125, "grad_norm_var": 0.15829671223958333, "learning_rate": 0.0001, "loss": 5.3988, "loss/crossentropy": 2.290177881717682, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.15792762860655785, "step": 10464 }, { "epoch": 0.4757272727272727, "grad_norm": 5.1875, "grad_norm_var": 0.097900390625, "learning_rate": 0.0001, "loss": 6.0115, "loss/crossentropy": 2.698906421661377, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1783280335366726, "step": 10466 }, { "epoch": 0.4758181818181818, "grad_norm": 5.71875, "grad_norm_var": 0.12459309895833333, "learning_rate": 0.0001, "loss": 5.4965, "loss/crossentropy": 2.4116637110710144, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15906695649027824, "step": 10468 }, { "epoch": 0.4759090909090909, "grad_norm": 5.15625, "grad_norm_var": 0.12356363932291667, "learning_rate": 0.0001, "loss": 5.4872, "loss/crossentropy": 2.445721447467804, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1562940999865532, "step": 10470 }, { "epoch": 0.476, "grad_norm": 4.71875, "grad_norm_var": 0.11705729166666666, "learning_rate": 0.0001, "loss": 5.5028, "loss/crossentropy": 2.454220175743103, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15680831670761108, "step": 10472 }, { "epoch": 0.47609090909090906, "grad_norm": 5.4375, "grad_norm_var": 0.12526041666666668, "learning_rate": 0.0001, "loss": 5.9266, "loss/crossentropy": 2.641008496284485, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.17563126981258392, "step": 10474 }, { "epoch": 0.4761818181818182, "grad_norm": 4.59375, "grad_norm_var": 0.13893229166666668, "learning_rate": 0.0001, "loss": 5.8268, "loss/crossentropy": 2.647970587015152, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16768623143434525, "step": 10476 }, { "epoch": 0.4762727272727273, "grad_norm": 5.53125, "grad_norm_var": 0.13111979166666668, "learning_rate": 0.0001, "loss": 5.6233, "loss/crossentropy": 2.415182948112488, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16866430267691612, "step": 10478 }, { "epoch": 0.4763636363636364, "grad_norm": 5.21875, "grad_norm_var": 0.12066650390625, "learning_rate": 0.0001, "loss": 5.6444, "loss/crossentropy": 2.473461866378784, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1659240797162056, "step": 10480 }, { "epoch": 0.47645454545454546, "grad_norm": 5.34375, "grad_norm_var": 0.10780843098958333, "learning_rate": 0.0001, "loss": 5.5543, "loss/crossentropy": 2.370418220758438, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16858629137277603, "step": 10482 }, { "epoch": 0.47654545454545455, "grad_norm": 5.25, "grad_norm_var": 0.07965087890625, "learning_rate": 0.0001, "loss": 5.7792, "loss/crossentropy": 2.5074959993362427, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.1746332086622715, "step": 10484 }, { "epoch": 0.47663636363636364, "grad_norm": 5.6875, "grad_norm_var": 0.08821207682291667, "learning_rate": 0.0001, "loss": 5.835, "loss/crossentropy": 2.497186541557312, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.18045833334326744, "step": 10486 }, { "epoch": 0.4767272727272727, "grad_norm": 5.03125, "grad_norm_var": 0.072509765625, "learning_rate": 0.0001, "loss": 5.985, "loss/crossentropy": 2.814320147037506, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16921289637684822, "step": 10488 }, { "epoch": 0.4768181818181818, "grad_norm": 5.3125, "grad_norm_var": 0.07437744140625, "learning_rate": 0.0001, "loss": 5.8009, "loss/crossentropy": 2.593185245990753, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16900929063558578, "step": 10490 }, { "epoch": 0.4769090909090909, "grad_norm": 5.1875, "grad_norm_var": 0.03795572916666667, "learning_rate": 0.0001, "loss": 5.7051, "loss/crossentropy": 2.503829300403595, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1677846424281597, "step": 10492 }, { "epoch": 0.477, "grad_norm": 5.21875, "grad_norm_var": 0.2359375, "learning_rate": 0.0001, "loss": 5.862, "loss/crossentropy": 2.5549212098121643, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1775796115398407, "step": 10494 }, { "epoch": 0.47709090909090907, "grad_norm": 4.625, "grad_norm_var": 0.27359619140625, "learning_rate": 0.0001, "loss": 5.1059, "loss/crossentropy": 2.1496995091438293, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.14620696753263474, "step": 10496 }, { "epoch": 0.4771818181818182, "grad_norm": 6.375, "grad_norm_var": 0.3522135416666667, "learning_rate": 0.0001, "loss": 5.4959, "loss/crossentropy": 2.30396431684494, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1645035445690155, "step": 10498 }, { "epoch": 0.4772727272727273, "grad_norm": 4.875, "grad_norm_var": 0.3754191080729167, "learning_rate": 0.0001, "loss": 5.5362, "loss/crossentropy": 2.420139253139496, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16082272306084633, "step": 10500 }, { "epoch": 0.4773636363636364, "grad_norm": 4.84375, "grad_norm_var": 0.38726806640625, "learning_rate": 0.0001, "loss": 5.2345, "loss/crossentropy": 2.1815671026706696, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.15275486931204796, "step": 10502 }, { "epoch": 0.47745454545454546, "grad_norm": 6.375, "grad_norm_var": 0.4615885416666667, "learning_rate": 0.0001, "loss": 5.7219, "loss/crossentropy": 2.482323944568634, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17121999338269234, "step": 10504 }, { "epoch": 0.47754545454545455, "grad_norm": 5.40625, "grad_norm_var": 0.45846354166666664, "learning_rate": 0.0001, "loss": 6.1583, "loss/crossentropy": 2.854656398296356, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.17938358336687088, "step": 10506 }, { "epoch": 0.47763636363636364, "grad_norm": 5.21875, "grad_norm_var": 0.4542317708333333, "learning_rate": 0.0001, "loss": 5.6718, "loss/crossentropy": 2.403244972229004, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17177685722708702, "step": 10508 }, { "epoch": 0.4777272727272727, "grad_norm": 4.8125, "grad_norm_var": 0.3000284830729167, "learning_rate": 0.0001, "loss": 5.4739, "loss/crossentropy": 2.363788604736328, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16101226583123207, "step": 10510 }, { "epoch": 0.4778181818181818, "grad_norm": 4.8125, "grad_norm_var": 0.2840494791666667, "learning_rate": 0.0001, "loss": 5.4376, "loss/crossentropy": 2.3726806044578552, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15805833414196968, "step": 10512 }, { "epoch": 0.4779090909090909, "grad_norm": 5.1875, "grad_norm_var": 0.18209228515625, "learning_rate": 0.0001, "loss": 5.9405, "loss/crossentropy": 2.7407442927360535, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1693895123898983, "step": 10514 }, { "epoch": 0.478, "grad_norm": 4.59375, "grad_norm_var": 0.19256184895833334, "learning_rate": 0.0001, "loss": 5.6212, "loss/crossentropy": 2.542519211769104, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1559142991900444, "step": 10516 }, { "epoch": 0.47809090909090907, "grad_norm": 5.28125, "grad_norm_var": 0.19348551432291666, "learning_rate": 0.0001, "loss": 5.7058, "loss/crossentropy": 2.547797441482544, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1640392430126667, "step": 10518 }, { "epoch": 0.4781818181818182, "grad_norm": 4.40625, "grad_norm_var": 0.10989176432291667, "learning_rate": 0.0001, "loss": 5.5914, "loss/crossentropy": 2.4544748663902283, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1654486171901226, "step": 10520 }, { "epoch": 0.4782727272727273, "grad_norm": 4.53125, "grad_norm_var": 0.08043212890625, "learning_rate": 0.0001, "loss": 5.1351, "loss/crossentropy": 2.1834129095077515, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.14536695554852486, "step": 10522 }, { "epoch": 0.4783636363636364, "grad_norm": 5.21875, "grad_norm_var": 0.08136393229166666, "learning_rate": 0.0001, "loss": 5.7126, "loss/crossentropy": 2.520366132259369, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.16590604558587074, "step": 10524 }, { "epoch": 0.47845454545454547, "grad_norm": 4.65625, "grad_norm_var": 0.08006184895833333, "learning_rate": 0.0001, "loss": 5.5213, "loss/crossentropy": 2.4076969027519226, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.15979614108800888, "step": 10526 }, { "epoch": 0.47854545454545455, "grad_norm": 5.0625, "grad_norm_var": 0.08062744140625, "learning_rate": 0.0001, "loss": 5.8165, "loss/crossentropy": 2.63871169090271, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16641145944595337, "step": 10528 }, { "epoch": 0.47863636363636364, "grad_norm": 6.0, "grad_norm_var": 0.15126546223958334, "learning_rate": 0.0001, "loss": 5.4484, "loss/crossentropy": 2.4241693019866943, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.153206342831254, "step": 10530 }, { "epoch": 0.4787272727272727, "grad_norm": 5.0, "grad_norm_var": 0.147509765625, "learning_rate": 0.0001, "loss": 6.0306, "loss/crossentropy": 2.73368376493454, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.18027793988585472, "step": 10532 }, { "epoch": 0.4788181818181818, "grad_norm": 4.65625, "grad_norm_var": 0.14670817057291666, "learning_rate": 0.0001, "loss": 5.7, "loss/crossentropy": 2.4924342036247253, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.1693868488073349, "step": 10534 }, { "epoch": 0.4789090909090909, "grad_norm": 5.125, "grad_norm_var": 0.12248942057291666, "learning_rate": 0.0001, "loss": 5.3919, "loss/crossentropy": 2.2811232805252075, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16146419197320938, "step": 10536 }, { "epoch": 0.479, "grad_norm": 5.0, "grad_norm_var": 0.10976155598958333, "learning_rate": 0.0001, "loss": 5.8811, "loss/crossentropy": 2.623072385787964, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.17482542246580124, "step": 10538 }, { "epoch": 0.47909090909090907, "grad_norm": 4.71875, "grad_norm_var": 0.128369140625, "learning_rate": 0.0001, "loss": 5.2791, "loss/crossentropy": 2.2237344086170197, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.15241431072354317, "step": 10540 }, { "epoch": 0.4791818181818182, "grad_norm": 6.21875, "grad_norm_var": 0.19501546223958333, "learning_rate": 0.0001, "loss": 5.5194, "loss/crossentropy": 2.3724443316459656, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.163131944835186, "step": 10542 }, { "epoch": 0.4792727272727273, "grad_norm": 5.40625, "grad_norm_var": 0.18943684895833332, "learning_rate": 0.0001, "loss": 5.7271, "loss/crossentropy": 2.5489786863327026, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16820641607046127, "step": 10544 }, { "epoch": 0.4793636363636364, "grad_norm": 4.40625, "grad_norm_var": 0.16555989583333333, "learning_rate": 0.0001, "loss": 5.4071, "loss/crossentropy": 2.4028687179088593, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15178730338811874, "step": 10546 }, { "epoch": 0.47945454545454547, "grad_norm": 4.9375, "grad_norm_var": 0.16451822916666667, "learning_rate": 0.0001, "loss": 5.5703, "loss/crossentropy": 2.3919546604156494, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16588222980499268, "step": 10548 }, { "epoch": 0.47954545454545455, "grad_norm": 17.375, "grad_norm_var": 9.555106608072917, "learning_rate": 0.0001, "loss": 5.7783, "loss/crossentropy": 2.518182933330536, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17288857698440552, "step": 10550 }, { "epoch": 0.47963636363636364, "grad_norm": 6.25, "grad_norm_var": 9.512626139322917, "learning_rate": 0.0001, "loss": 5.4559, "loss/crossentropy": 2.3195879459381104, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.15972746908664703, "step": 10552 }, { "epoch": 0.4797272727272727, "grad_norm": 5.1875, "grad_norm_var": 9.488411458333333, "learning_rate": 0.0001, "loss": 5.7339, "loss/crossentropy": 2.5535968542099, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16705119982361794, "step": 10554 }, { "epoch": 0.4798181818181818, "grad_norm": 5.0625, "grad_norm_var": 9.5177734375, "learning_rate": 0.0001, "loss": 5.266, "loss/crossentropy": 2.207124650478363, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1547161564230919, "step": 10556 }, { "epoch": 0.4799090909090909, "grad_norm": 4.84375, "grad_norm_var": 9.573502604166666, "learning_rate": 0.0001, "loss": 5.4978, "loss/crossentropy": 2.3346753120422363, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.1637725979089737, "step": 10558 }, { "epoch": 0.48, "grad_norm": 5.34375, "grad_norm_var": 9.601200358072917, "learning_rate": 0.0001, "loss": 5.4088, "loss/crossentropy": 2.387103110551834, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15451094880700111, "step": 10560 }, { "epoch": 0.48009090909090907, "grad_norm": 4.6875, "grad_norm_var": 9.591650390625, "learning_rate": 0.0001, "loss": 5.177, "loss/crossentropy": 2.2631341218948364, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.1439240537583828, "step": 10562 }, { "epoch": 0.48018181818181815, "grad_norm": 4.84375, "grad_norm_var": 9.595166015625, "learning_rate": 0.0001, "loss": 5.5732, "loss/crossentropy": 2.4649627208709717, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16179580241441727, "step": 10564 }, { "epoch": 0.4802727272727273, "grad_norm": 8.5, "grad_norm_var": 0.8572224934895833, "learning_rate": 0.0001, "loss": 6.0375, "loss/crossentropy": 2.5697981119155884, "loss/hidden": 1.611328125, "loss/jsd": 0.0, "loss/logits": 0.18563295900821686, "step": 10566 }, { "epoch": 0.4803636363636364, "grad_norm": 6.34375, "grad_norm_var": 0.8676717122395833, "learning_rate": 0.0001, "loss": 5.831, "loss/crossentropy": 2.5785105228424072, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.17154184356331825, "step": 10568 }, { "epoch": 0.48045454545454547, "grad_norm": 5.5, "grad_norm_var": 0.8961222330729167, "learning_rate": 0.0001, "loss": 6.0748, "loss/crossentropy": 2.650905430316925, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18731451779603958, "step": 10570 }, { "epoch": 0.48054545454545455, "grad_norm": 4.84375, "grad_norm_var": 0.88756103515625, "learning_rate": 0.0001, "loss": 5.7773, "loss/crossentropy": 2.595990538597107, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16851690784096718, "step": 10572 }, { "epoch": 0.48063636363636364, "grad_norm": 5.3125, "grad_norm_var": 0.8701456705729167, "learning_rate": 0.0001, "loss": 5.9479, "loss/crossentropy": 2.7230416536331177, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1718970499932766, "step": 10574 }, { "epoch": 0.4807272727272727, "grad_norm": 6.6875, "grad_norm_var": 0.929931640625, "learning_rate": 0.0001, "loss": 5.9839, "loss/crossentropy": 2.6605352759361267, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17959774658083916, "step": 10576 }, { "epoch": 0.4808181818181818, "grad_norm": 6.25, "grad_norm_var": 2.264176432291667, "learning_rate": 0.0001, "loss": 5.8016, "loss/crossentropy": 2.492332875728607, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.17838799953460693, "step": 10578 }, { "epoch": 0.4809090909090909, "grad_norm": 5.15625, "grad_norm_var": 2.261572265625, "learning_rate": 0.0001, "loss": 4.9165, "loss/crossentropy": 2.035343676805496, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1394832879304886, "step": 10580 }, { "epoch": 0.481, "grad_norm": 5.34375, "grad_norm_var": 1.8567708333333333, "learning_rate": 0.0001, "loss": 5.5117, "loss/crossentropy": 2.4105653762817383, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.15815550833940506, "step": 10582 }, { "epoch": 0.48109090909090907, "grad_norm": 4.75, "grad_norm_var": 1.9229817708333334, "learning_rate": 0.0001, "loss": 5.642, "loss/crossentropy": 2.505485087633133, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16326356306672096, "step": 10584 }, { "epoch": 0.48118181818181816, "grad_norm": 4.75, "grad_norm_var": 2.0459269205729167, "learning_rate": 0.0001, "loss": 5.2229, "loss/crossentropy": 2.2507587373256683, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.14955417066812515, "step": 10586 }, { "epoch": 0.4812727272727273, "grad_norm": 4.96875, "grad_norm_var": 2.0515625, "learning_rate": 0.0001, "loss": 5.857, "loss/crossentropy": 2.6008426547050476, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17444349825382233, "step": 10588 }, { "epoch": 0.4813636363636364, "grad_norm": 5.46875, "grad_norm_var": 2.120947265625, "learning_rate": 0.0001, "loss": 5.4057, "loss/crossentropy": 2.3330383896827698, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.15316111221909523, "step": 10590 }, { "epoch": 0.48145454545454547, "grad_norm": 4.4375, "grad_norm_var": 2.1609375, "learning_rate": 0.0001, "loss": 5.3085, "loss/crossentropy": 2.301218032836914, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15248991549015045, "step": 10592 }, { "epoch": 0.48154545454545455, "grad_norm": 9.125, "grad_norm_var": 1.2794108072916666, "learning_rate": 0.0001, "loss": 5.4757, "loss/crossentropy": 2.3039655685424805, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.1673714891076088, "step": 10594 }, { "epoch": 0.48163636363636364, "grad_norm": 5.1875, "grad_norm_var": 1.2676920572916666, "learning_rate": 0.0001, "loss": 5.607, "loss/crossentropy": 2.479051411151886, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1612282320857048, "step": 10596 }, { "epoch": 0.4817272727272727, "grad_norm": 8.8125, "grad_norm_var": 1.9986287434895833, "learning_rate": 0.0001, "loss": 5.6906, "loss/crossentropy": 2.4898672103881836, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16636193543672562, "step": 10598 }, { "epoch": 0.4818181818181818, "grad_norm": 5.6875, "grad_norm_var": 1.9462849934895834, "learning_rate": 0.0001, "loss": 5.7596, "loss/crossentropy": 2.5595630407333374, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.16492348536849022, "step": 10600 }, { "epoch": 0.4819090909090909, "grad_norm": 4.96875, "grad_norm_var": 1.8665364583333333, "learning_rate": 0.0001, "loss": 5.6047, "loss/crossentropy": 2.4978244304656982, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1589253768324852, "step": 10602 }, { "epoch": 0.482, "grad_norm": 4.375, "grad_norm_var": 1.9209920247395833, "learning_rate": 0.0001, "loss": 5.5609, "loss/crossentropy": 2.499027192592621, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1579442210495472, "step": 10604 }, { "epoch": 0.48209090909090907, "grad_norm": 4.96875, "grad_norm_var": 1.8829264322916666, "learning_rate": 0.0001, "loss": 5.2192, "loss/crossentropy": 2.2193018794059753, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.14940261468291283, "step": 10606 }, { "epoch": 0.48218181818181816, "grad_norm": 5.90625, "grad_norm_var": 2.51783447265625, "learning_rate": 0.0001, "loss": 5.6922, "loss/crossentropy": 2.4584478735923767, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16966064274311066, "step": 10608 }, { "epoch": 0.4822727272727273, "grad_norm": 5.25, "grad_norm_var": 1.8527994791666667, "learning_rate": 0.0001, "loss": 5.8367, "loss/crossentropy": 2.619767129421234, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.1695423200726509, "step": 10610 }, { "epoch": 0.4823636363636364, "grad_norm": 5.0625, "grad_norm_var": 1.90865478515625, "learning_rate": 0.0001, "loss": 5.4736, "loss/crossentropy": 2.3886604607105255, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15986577793955803, "step": 10612 }, { "epoch": 0.48245454545454547, "grad_norm": 5.59375, "grad_norm_var": 1.19097900390625, "learning_rate": 0.0001, "loss": 5.7593, "loss/crossentropy": 2.495206832885742, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.17269376292824745, "step": 10614 }, { "epoch": 0.48254545454545456, "grad_norm": 5.46875, "grad_norm_var": 1.2089803059895834, "learning_rate": 0.0001, "loss": 5.617, "loss/crossentropy": 2.5011597871780396, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1615833416581154, "step": 10616 }, { "epoch": 0.48263636363636364, "grad_norm": 5.0, "grad_norm_var": 1.2201822916666667, "learning_rate": 0.0001, "loss": 5.4951, "loss/crossentropy": 2.3502395153045654, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.15901820734143257, "step": 10618 }, { "epoch": 0.4827272727272727, "grad_norm": 5.9375, "grad_norm_var": 1.16519775390625, "learning_rate": 0.0001, "loss": 5.7214, "loss/crossentropy": 2.548953801393509, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16412286460399628, "step": 10620 }, { "epoch": 0.4828181818181818, "grad_norm": 5.0625, "grad_norm_var": 1.1421712239583333, "learning_rate": 0.0001, "loss": 5.5722, "loss/crossentropy": 2.4781614542007446, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1607741340994835, "step": 10622 }, { "epoch": 0.4829090909090909, "grad_norm": 5.34375, "grad_norm_var": 0.13151041666666666, "learning_rate": 0.0001, "loss": 5.1237, "loss/crossentropy": 2.230949342250824, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.142013031989336, "step": 10624 }, { "epoch": 0.483, "grad_norm": 5.21875, "grad_norm_var": 0.14185791015625, "learning_rate": 0.0001, "loss": 5.6093, "loss/crossentropy": 2.4959736466407776, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16113939508795738, "step": 10626 }, { "epoch": 0.48309090909090907, "grad_norm": 4.5, "grad_norm_var": 0.16236572265625, "learning_rate": 0.0001, "loss": 5.646, "loss/crossentropy": 2.536493420600891, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16172923147678375, "step": 10628 }, { "epoch": 0.48318181818181816, "grad_norm": 4.6875, "grad_norm_var": 0.16246337890625, "learning_rate": 0.0001, "loss": 5.6111, "loss/crossentropy": 2.504235565662384, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16107238084077835, "step": 10630 }, { "epoch": 0.4832727272727273, "grad_norm": 4.90625, "grad_norm_var": 0.151025390625, "learning_rate": 0.0001, "loss": 5.6181, "loss/crossentropy": 2.479309856891632, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16407165676355362, "step": 10632 }, { "epoch": 0.4833636363636364, "grad_norm": 5.0, "grad_norm_var": 0.18365478515625, "learning_rate": 0.0001, "loss": 5.7973, "loss/crossentropy": 2.585134446620941, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16847887635231018, "step": 10634 }, { "epoch": 0.48345454545454547, "grad_norm": 5.4375, "grad_norm_var": 0.159228515625, "learning_rate": 0.0001, "loss": 5.7854, "loss/crossentropy": 2.5298823714256287, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1704738698899746, "step": 10636 }, { "epoch": 0.48354545454545456, "grad_norm": 5.28125, "grad_norm_var": 0.15292561848958333, "learning_rate": 0.0001, "loss": 5.6208, "loss/crossentropy": 2.4832499027252197, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16473186016082764, "step": 10638 }, { "epoch": 0.48363636363636364, "grad_norm": 5.1875, "grad_norm_var": 0.13982747395833334, "learning_rate": 0.0001, "loss": 5.4038, "loss/crossentropy": 2.3883787393569946, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.15134631842374802, "step": 10640 }, { "epoch": 0.48372727272727273, "grad_norm": 5.15625, "grad_norm_var": 0.1296875, "learning_rate": 0.0001, "loss": 5.6216, "loss/crossentropy": 2.501989781856537, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16254552453756332, "step": 10642 }, { "epoch": 0.4838181818181818, "grad_norm": 5.46875, "grad_norm_var": 0.09954020182291666, "learning_rate": 0.0001, "loss": 5.7925, "loss/crossentropy": 2.4865781664848328, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1759035959839821, "step": 10644 }, { "epoch": 0.4839090909090909, "grad_norm": 5.34375, "grad_norm_var": 0.13528238932291667, "learning_rate": 0.0001, "loss": 5.2374, "loss/crossentropy": 2.273634195327759, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.14930746890604496, "step": 10646 }, { "epoch": 0.484, "grad_norm": 5.34375, "grad_norm_var": 0.14042561848958332, "learning_rate": 0.0001, "loss": 5.4787, "loss/crossentropy": 2.325417459011078, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16415511444211006, "step": 10648 }, { "epoch": 0.48409090909090907, "grad_norm": 5.03125, "grad_norm_var": 0.11998291015625, "learning_rate": 0.0001, "loss": 5.7144, "loss/crossentropy": 2.5863649249076843, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16338953748345375, "step": 10650 }, { "epoch": 0.48418181818181816, "grad_norm": 4.8125, "grad_norm_var": 0.12174072265625, "learning_rate": 0.0001, "loss": 5.5397, "loss/crossentropy": 2.4316972494125366, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16118723899126053, "step": 10652 }, { "epoch": 0.4842727272727273, "grad_norm": 5.875, "grad_norm_var": 0.17688802083333333, "learning_rate": 0.0001, "loss": 5.2396, "loss/crossentropy": 2.303374767303467, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.1473376266658306, "step": 10654 }, { "epoch": 0.4843636363636364, "grad_norm": 5.375, "grad_norm_var": 0.182275390625, "learning_rate": 0.0001, "loss": 6.0056, "loss/crossentropy": 2.7666945457458496, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.17291227728128433, "step": 10656 }, { "epoch": 0.48445454545454547, "grad_norm": 5.03125, "grad_norm_var": 0.18778889973958332, "learning_rate": 0.0001, "loss": 5.5872, "loss/crossentropy": 2.459073781967163, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16379105299711227, "step": 10658 }, { "epoch": 0.48454545454545456, "grad_norm": 4.6875, "grad_norm_var": 0.19348551432291666, "learning_rate": 0.0001, "loss": 5.6411, "loss/crossentropy": 2.4973978400230408, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16378621384501457, "step": 10660 }, { "epoch": 0.48463636363636364, "grad_norm": 5.5625, "grad_norm_var": 0.16471354166666666, "learning_rate": 0.0001, "loss": 5.6031, "loss/crossentropy": 2.367778927087784, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.1659134142100811, "step": 10662 }, { "epoch": 0.48472727272727273, "grad_norm": 5.375, "grad_norm_var": 0.15948893229166666, "learning_rate": 0.0001, "loss": 5.9179, "loss/crossentropy": 2.6133381724357605, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17889774218201637, "step": 10664 }, { "epoch": 0.4848181818181818, "grad_norm": 5.09375, "grad_norm_var": 0.15188802083333333, "learning_rate": 0.0001, "loss": 5.7256, "loss/crossentropy": 2.5556012988090515, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1660231463611126, "step": 10666 }, { "epoch": 0.4849090909090909, "grad_norm": 4.6875, "grad_norm_var": 0.13658447265625, "learning_rate": 0.0001, "loss": 5.7914, "loss/crossentropy": 2.6549882292747498, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16422607749700546, "step": 10668 }, { "epoch": 0.485, "grad_norm": 4.75, "grad_norm_var": 0.09542643229166667, "learning_rate": 0.0001, "loss": 5.7646, "loss/crossentropy": 2.5408324599266052, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.16906099393963814, "step": 10670 }, { "epoch": 0.4850909090909091, "grad_norm": 6.46875, "grad_norm_var": 0.19993082682291666, "learning_rate": 0.0001, "loss": 5.7547, "loss/crossentropy": 2.5716329216957092, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16752781346440315, "step": 10672 }, { "epoch": 0.48518181818181816, "grad_norm": 4.84375, "grad_norm_var": 0.20240885416666668, "learning_rate": 0.0001, "loss": 5.5307, "loss/crossentropy": 2.4123998880386353, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16202812269330025, "step": 10674 }, { "epoch": 0.4852727272727273, "grad_norm": 5.15625, "grad_norm_var": 0.18984375, "learning_rate": 0.0001, "loss": 5.7167, "loss/crossentropy": 2.5704286098480225, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1650169976055622, "step": 10676 }, { "epoch": 0.4853636363636364, "grad_norm": 4.84375, "grad_norm_var": 0.19563802083333334, "learning_rate": 0.0001, "loss": 5.6275, "loss/crossentropy": 2.512344717979431, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16308175027370453, "step": 10678 }, { "epoch": 0.48545454545454547, "grad_norm": 5.09375, "grad_norm_var": 0.188916015625, "learning_rate": 0.0001, "loss": 5.7298, "loss/crossentropy": 2.4982463717460632, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17100633680820465, "step": 10680 }, { "epoch": 0.48554545454545456, "grad_norm": 4.6875, "grad_norm_var": 0.21769205729166666, "learning_rate": 0.0001, "loss": 5.7881, "loss/crossentropy": 2.5488837361335754, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17196856811642647, "step": 10682 }, { "epoch": 0.48563636363636364, "grad_norm": 4.8125, "grad_norm_var": 0.21425374348958334, "learning_rate": 0.0001, "loss": 5.5937, "loss/crossentropy": 2.441530227661133, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16443265601992607, "step": 10684 }, { "epoch": 0.48572727272727273, "grad_norm": 5.46875, "grad_norm_var": 0.23788655598958333, "learning_rate": 0.0001, "loss": 5.663, "loss/crossentropy": 2.4392586648464203, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.1690574511885643, "step": 10686 }, { "epoch": 0.4858181818181818, "grad_norm": 5.1875, "grad_norm_var": 0.12550455729166668, "learning_rate": 0.0001, "loss": 5.743, "loss/crossentropy": 2.5801252722740173, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16550932824611664, "step": 10688 }, { "epoch": 0.4859090909090909, "grad_norm": 5.3125, "grad_norm_var": 0.12224934895833334, "learning_rate": 0.0001, "loss": 5.5938, "loss/crossentropy": 2.4495307207107544, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16403233259916306, "step": 10690 }, { "epoch": 0.486, "grad_norm": 4.9375, "grad_norm_var": 0.11978759765625, "learning_rate": 0.0001, "loss": 5.5287, "loss/crossentropy": 2.432726174592972, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16252781823277473, "step": 10692 }, { "epoch": 0.4860909090909091, "grad_norm": 5.34375, "grad_norm_var": 0.10481770833333333, "learning_rate": 0.0001, "loss": 5.8322, "loss/crossentropy": 2.586852490901947, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.17356014251708984, "step": 10694 }, { "epoch": 0.48618181818181816, "grad_norm": 4.9375, "grad_norm_var": 0.10797119140625, "learning_rate": 0.0001, "loss": 6.0558, "loss/crossentropy": 2.689504325389862, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18584374338388443, "step": 10696 }, { "epoch": 0.48627272727272725, "grad_norm": 5.1875, "grad_norm_var": 0.07935791015625, "learning_rate": 0.0001, "loss": 5.5874, "loss/crossentropy": 2.4344339966773987, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16744956746697426, "step": 10698 }, { "epoch": 0.4863636363636364, "grad_norm": 5.03125, "grad_norm_var": 0.07327067057291667, "learning_rate": 0.0001, "loss": 5.9062, "loss/crossentropy": 2.7034459114074707, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1708620935678482, "step": 10700 }, { "epoch": 0.4864545454545455, "grad_norm": 4.8125, "grad_norm_var": 0.05172119140625, "learning_rate": 0.0001, "loss": 5.7528, "loss/crossentropy": 2.5641178488731384, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16906622424721718, "step": 10702 }, { "epoch": 0.48654545454545456, "grad_norm": 4.875, "grad_norm_var": 0.05115559895833333, "learning_rate": 0.0001, "loss": 5.2589, "loss/crossentropy": 2.239676982164383, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15290343016386032, "step": 10704 }, { "epoch": 0.48663636363636364, "grad_norm": 6.4375, "grad_norm_var": 0.20208333333333334, "learning_rate": 0.0001, "loss": 5.8698, "loss/crossentropy": 2.624631345272064, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.17197617515921593, "step": 10706 }, { "epoch": 0.48672727272727273, "grad_norm": 5.5, "grad_norm_var": 0.20276285807291666, "learning_rate": 0.0001, "loss": 5.6285, "loss/crossentropy": 2.403379499912262, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.1631409116089344, "step": 10708 }, { "epoch": 0.4868181818181818, "grad_norm": 5.4375, "grad_norm_var": 0.21184895833333334, "learning_rate": 0.0001, "loss": 5.9947, "loss/crossentropy": 2.693888545036316, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17676038295030594, "step": 10710 }, { "epoch": 0.4869090909090909, "grad_norm": 4.84375, "grad_norm_var": 0.21168212890625, "learning_rate": 0.0001, "loss": 5.7864, "loss/crossentropy": 2.59978586435318, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16865741088986397, "step": 10712 }, { "epoch": 0.487, "grad_norm": 5.4375, "grad_norm_var": 0.21053059895833334, "learning_rate": 0.0001, "loss": 5.6879, "loss/crossentropy": 2.5107547640800476, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16927902027964592, "step": 10714 }, { "epoch": 0.4870909090909091, "grad_norm": 4.84375, "grad_norm_var": 0.21601155598958333, "learning_rate": 0.0001, "loss": 5.9438, "loss/crossentropy": 2.6902623176574707, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.17398985102772713, "step": 10716 }, { "epoch": 0.48718181818181816, "grad_norm": 4.375, "grad_norm_var": 0.26417643229166665, "learning_rate": 0.0001, "loss": 5.6249, "loss/crossentropy": 2.521565794944763, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.1628735400736332, "step": 10718 }, { "epoch": 0.48727272727272725, "grad_norm": 5.03125, "grad_norm_var": 0.26808268229166665, "learning_rate": 0.0001, "loss": 5.5184, "loss/crossentropy": 2.414401799440384, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16313378140330315, "step": 10720 }, { "epoch": 0.4873636363636364, "grad_norm": 5.25, "grad_norm_var": 0.11256103515625, "learning_rate": 0.0001, "loss": 5.8823, "loss/crossentropy": 2.679884672164917, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1694575734436512, "step": 10722 }, { "epoch": 0.4874545454545455, "grad_norm": 5.0, "grad_norm_var": 0.09895833333333333, "learning_rate": 0.0001, "loss": 5.9078, "loss/crossentropy": 2.708454191684723, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16934408992528915, "step": 10724 }, { "epoch": 0.48754545454545456, "grad_norm": 5.34375, "grad_norm_var": 0.09482014973958333, "learning_rate": 0.0001, "loss": 5.9796, "loss/crossentropy": 2.7370084524154663, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.175041563808918, "step": 10726 }, { "epoch": 0.48763636363636365, "grad_norm": 5.09375, "grad_norm_var": 0.08990885416666666, "learning_rate": 0.0001, "loss": 5.5179, "loss/crossentropy": 2.406443238258362, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.1597748100757599, "step": 10728 }, { "epoch": 0.48772727272727273, "grad_norm": 4.78125, "grad_norm_var": 0.08332926432291667, "learning_rate": 0.0001, "loss": 5.7396, "loss/crossentropy": 2.6261452436447144, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16037221252918243, "step": 10730 }, { "epoch": 0.4878181818181818, "grad_norm": 5.21875, "grad_norm_var": 0.12643229166666667, "learning_rate": 0.0001, "loss": 5.8193, "loss/crossentropy": 2.5953370928764343, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.17181404307484627, "step": 10732 }, { "epoch": 0.4879090909090909, "grad_norm": 4.40625, "grad_norm_var": 0.13245035807291666, "learning_rate": 0.0001, "loss": 5.3615, "loss/crossentropy": 2.2693382501602173, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15882214158773422, "step": 10734 }, { "epoch": 0.488, "grad_norm": 5.125, "grad_norm_var": 0.12164306640625, "learning_rate": 0.0001, "loss": 5.7069, "loss/crossentropy": 2.597281336784363, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1640859991312027, "step": 10736 }, { "epoch": 0.4880909090909091, "grad_norm": 4.71875, "grad_norm_var": 0.135400390625, "learning_rate": 0.0001, "loss": 5.5022, "loss/crossentropy": 2.4004065096378326, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1617414951324463, "step": 10738 }, { "epoch": 0.48818181818181816, "grad_norm": 5.1875, "grad_norm_var": 0.12102864583333334, "learning_rate": 0.0001, "loss": 5.288, "loss/crossentropy": 2.2687742710113525, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.14997407793998718, "step": 10740 }, { "epoch": 0.48827272727272725, "grad_norm": 5.59375, "grad_norm_var": 0.13121337890625, "learning_rate": 0.0001, "loss": 5.4182, "loss/crossentropy": 2.317117363214493, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1601048968732357, "step": 10742 }, { "epoch": 0.4883636363636364, "grad_norm": 5.3125, "grad_norm_var": 0.130712890625, "learning_rate": 0.0001, "loss": 5.0263, "loss/crossentropy": 2.0507777631282806, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.14676843956112862, "step": 10744 }, { "epoch": 0.4884545454545455, "grad_norm": 4.625, "grad_norm_var": 0.14060872395833332, "learning_rate": 0.0001, "loss": 5.6929, "loss/crossentropy": 2.625057876110077, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1606900840997696, "step": 10746 }, { "epoch": 0.48854545454545456, "grad_norm": 5.28125, "grad_norm_var": 0.10627848307291667, "learning_rate": 0.0001, "loss": 5.7771, "loss/crossentropy": 2.581186592578888, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16920016705989838, "step": 10748 }, { "epoch": 0.48863636363636365, "grad_norm": 5.21875, "grad_norm_var": 0.06334228515625, "learning_rate": 0.0001, "loss": 5.9704, "loss/crossentropy": 2.6737701296806335, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1786913387477398, "step": 10750 }, { "epoch": 0.48872727272727273, "grad_norm": 5.03125, "grad_norm_var": 0.06428629557291667, "learning_rate": 0.0001, "loss": 5.4096, "loss/crossentropy": 2.3156943917274475, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15977784991264343, "step": 10752 }, { "epoch": 0.4888181818181818, "grad_norm": 4.78125, "grad_norm_var": 0.06100260416666667, "learning_rate": 0.0001, "loss": 5.3345, "loss/crossentropy": 2.2634385228157043, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15769709646701813, "step": 10754 }, { "epoch": 0.4889090909090909, "grad_norm": 5.09375, "grad_norm_var": 0.059891764322916666, "learning_rate": 0.0001, "loss": 5.6002, "loss/crossentropy": 2.4799935817718506, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16416729614138603, "step": 10756 }, { "epoch": 0.489, "grad_norm": 4.71875, "grad_norm_var": 0.04058837890625, "learning_rate": 0.0001, "loss": 5.749, "loss/crossentropy": 2.586726129055023, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16954628378152847, "step": 10758 }, { "epoch": 0.4890909090909091, "grad_norm": 4.5625, "grad_norm_var": 0.04527587890625, "learning_rate": 0.0001, "loss": 4.9446, "loss/crossentropy": 2.0568801760673523, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1393573172390461, "step": 10760 }, { "epoch": 0.48918181818181816, "grad_norm": 4.90625, "grad_norm_var": 0.03899739583333333, "learning_rate": 0.0001, "loss": 5.4439, "loss/crossentropy": 2.3644532561302185, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1587209403514862, "step": 10762 }, { "epoch": 0.48927272727272725, "grad_norm": 4.5, "grad_norm_var": 0.03860677083333333, "learning_rate": 0.0001, "loss": 5.3372, "loss/crossentropy": 2.2813178300857544, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15812994167208672, "step": 10764 }, { "epoch": 0.4893636363636364, "grad_norm": 4.84375, "grad_norm_var": 0.028759765625, "learning_rate": 0.0001, "loss": 5.4843, "loss/crossentropy": 2.4297089874744415, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15780021622776985, "step": 10766 }, { "epoch": 0.4894545454545455, "grad_norm": 5.03125, "grad_norm_var": 0.048291015625, "learning_rate": 0.0001, "loss": 5.632, "loss/crossentropy": 2.5307722687721252, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16168908029794693, "step": 10768 }, { "epoch": 0.48954545454545456, "grad_norm": 6.15625, "grad_norm_var": 0.153515625, "learning_rate": 0.0001, "loss": 5.6302, "loss/crossentropy": 2.447494328022003, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16729223728179932, "step": 10770 }, { "epoch": 0.48963636363636365, "grad_norm": 4.875, "grad_norm_var": 0.15194905598958333, "learning_rate": 0.0001, "loss": 5.5282, "loss/crossentropy": 2.4391397833824158, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.15870711579918861, "step": 10772 }, { "epoch": 0.48972727272727273, "grad_norm": 4.59375, "grad_norm_var": 0.15819905598958334, "learning_rate": 0.0001, "loss": 5.6779, "loss/crossentropy": 2.563166558742523, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1630404032766819, "step": 10774 }, { "epoch": 0.4898181818181818, "grad_norm": 5.25, "grad_norm_var": 0.14641520182291667, "learning_rate": 0.0001, "loss": 5.9502, "loss/crossentropy": 2.740999460220337, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1718961000442505, "step": 10776 }, { "epoch": 0.4899090909090909, "grad_norm": 4.65625, "grad_norm_var": 0.15520426432291667, "learning_rate": 0.0001, "loss": 5.4348, "loss/crossentropy": 2.3376699686050415, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1593216061592102, "step": 10778 }, { "epoch": 0.49, "grad_norm": 5.5, "grad_norm_var": 0.151416015625, "learning_rate": 0.0001, "loss": 6.0109, "loss/crossentropy": 2.7022724747657776, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.17949627339839935, "step": 10780 }, { "epoch": 0.4900909090909091, "grad_norm": 5.40625, "grad_norm_var": 0.152734375, "learning_rate": 0.0001, "loss": 5.8288, "loss/crossentropy": 2.5575220584869385, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17673573270440102, "step": 10782 }, { "epoch": 0.49018181818181816, "grad_norm": 5.15625, "grad_norm_var": 0.148291015625, "learning_rate": 0.0001, "loss": 5.7526, "loss/crossentropy": 2.570279538631439, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16900886595249176, "step": 10784 }, { "epoch": 0.49027272727272725, "grad_norm": 5.4375, "grad_norm_var": 0.08079427083333333, "learning_rate": 0.0001, "loss": 5.9433, "loss/crossentropy": 2.681649923324585, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.17558325827121735, "step": 10786 }, { "epoch": 0.4903636363636364, "grad_norm": 4.6875, "grad_norm_var": 0.12845052083333333, "learning_rate": 0.0001, "loss": 5.481, "loss/crossentropy": 2.464482218027115, "loss/hidden": 1.439453125, "loss/jsd": 0.0, "loss/logits": 0.15770559012889862, "step": 10788 }, { "epoch": 0.4904545454545455, "grad_norm": 4.96875, "grad_norm_var": 0.13267822265625, "learning_rate": 0.0001, "loss": 5.4862, "loss/crossentropy": 2.378043234348297, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16120942682027817, "step": 10790 }, { "epoch": 0.49054545454545456, "grad_norm": 4.65625, "grad_norm_var": 0.15966389973958334, "learning_rate": 0.0001, "loss": 5.0068, "loss/crossentropy": 2.0770281851291656, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1423942670226097, "step": 10792 }, { "epoch": 0.49063636363636365, "grad_norm": 5.09375, "grad_norm_var": 0.15432535807291667, "learning_rate": 0.0001, "loss": 5.5687, "loss/crossentropy": 2.5381816625595093, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15637622401118279, "step": 10794 }, { "epoch": 0.49072727272727273, "grad_norm": 4.90625, "grad_norm_var": 0.14069010416666666, "learning_rate": 0.0001, "loss": 5.6196, "loss/crossentropy": 2.506982207298279, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16243836656212807, "step": 10796 }, { "epoch": 0.4908181818181818, "grad_norm": 4.53125, "grad_norm_var": 0.14308268229166668, "learning_rate": 0.0001, "loss": 5.6867, "loss/crossentropy": 2.538111686706543, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16759352385997772, "step": 10798 }, { "epoch": 0.4909090909090909, "grad_norm": 5.1875, "grad_norm_var": 0.14602457682291667, "learning_rate": 0.0001, "loss": 5.9695, "loss/crossentropy": 2.7409645915031433, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1724609062075615, "step": 10800 }, { "epoch": 0.491, "grad_norm": 4.96875, "grad_norm_var": 0.12239176432291667, "learning_rate": 0.0001, "loss": 5.9268, "loss/crossentropy": 2.714695155620575, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17160402610898018, "step": 10802 }, { "epoch": 0.4910909090909091, "grad_norm": 5.125, "grad_norm_var": 0.05543212890625, "learning_rate": 0.0001, "loss": 5.9136, "loss/crossentropy": 2.6807863116264343, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17172123491764069, "step": 10804 }, { "epoch": 0.49118181818181816, "grad_norm": 4.75, "grad_norm_var": 0.051025390625, "learning_rate": 0.0001, "loss": 5.6541, "loss/crossentropy": 2.5424426198005676, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16214480251073837, "step": 10806 }, { "epoch": 0.49127272727272725, "grad_norm": 5.1875, "grad_norm_var": 0.04478759765625, "learning_rate": 0.0001, "loss": 5.5005, "loss/crossentropy": 2.422061800956726, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1594049595296383, "step": 10808 }, { "epoch": 0.4913636363636364, "grad_norm": 5.59375, "grad_norm_var": 0.06092122395833333, "learning_rate": 0.0001, "loss": 5.6673, "loss/crossentropy": 2.4684593081474304, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1691046506166458, "step": 10810 }, { "epoch": 0.4914545454545455, "grad_norm": 5.1875, "grad_norm_var": 0.057421875, "learning_rate": 0.0001, "loss": 5.4327, "loss/crossentropy": 2.355461299419403, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15772302448749542, "step": 10812 }, { "epoch": 0.49154545454545456, "grad_norm": 5.0625, "grad_norm_var": 0.03964436848958333, "learning_rate": 0.0001, "loss": 5.8388, "loss/crossentropy": 2.6806819438934326, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16756871715188026, "step": 10814 }, { "epoch": 0.49163636363636365, "grad_norm": 5.0, "grad_norm_var": 0.03629150390625, "learning_rate": 0.0001, "loss": 5.478, "loss/crossentropy": 2.409215211868286, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16117557138204575, "step": 10816 }, { "epoch": 0.49172727272727274, "grad_norm": 5.21875, "grad_norm_var": 0.03982747395833333, "learning_rate": 0.0001, "loss": 5.873, "loss/crossentropy": 2.7511683106422424, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.1651151441037655, "step": 10818 }, { "epoch": 0.4918181818181818, "grad_norm": 4.75, "grad_norm_var": 0.04696858723958333, "learning_rate": 0.0001, "loss": 5.5998, "loss/crossentropy": 2.472767412662506, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1636800318956375, "step": 10820 }, { "epoch": 0.4919090909090909, "grad_norm": 4.625, "grad_norm_var": 0.053629557291666664, "learning_rate": 0.0001, "loss": 5.809, "loss/crossentropy": 2.5731563568115234, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17436370253562927, "step": 10822 }, { "epoch": 0.492, "grad_norm": 5.25, "grad_norm_var": 0.05767822265625, "learning_rate": 0.0001, "loss": 5.4019, "loss/crossentropy": 2.2795416116714478, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16027802601456642, "step": 10824 }, { "epoch": 0.4920909090909091, "grad_norm": 6.90625, "grad_norm_var": 0.26571858723958336, "learning_rate": 0.0001, "loss": 5.8111, "loss/crossentropy": 2.5209105610847473, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17588969692587852, "step": 10826 }, { "epoch": 0.49218181818181816, "grad_norm": 5.3125, "grad_norm_var": 0.26428629557291666, "learning_rate": 0.0001, "loss": 5.4856, "loss/crossentropy": 2.3604410588741302, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16134053841233253, "step": 10828 }, { "epoch": 0.49227272727272725, "grad_norm": 5.09375, "grad_norm_var": 0.2850545247395833, "learning_rate": 0.0001, "loss": 5.3667, "loss/crossentropy": 2.3977065086364746, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.1517818719148636, "step": 10830 }, { "epoch": 0.49236363636363634, "grad_norm": 5.15625, "grad_norm_var": 0.28697509765625, "learning_rate": 0.0001, "loss": 5.6708, "loss/crossentropy": 2.4754103422164917, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.17012640461325645, "step": 10832 }, { "epoch": 0.4924545454545455, "grad_norm": 4.8125, "grad_norm_var": 0.29055989583333336, "learning_rate": 0.0001, "loss": 5.7786, "loss/crossentropy": 2.5582902431488037, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.17066743224859238, "step": 10834 }, { "epoch": 0.49254545454545456, "grad_norm": 4.9375, "grad_norm_var": 0.28046875, "learning_rate": 0.0001, "loss": 5.8931, "loss/crossentropy": 2.6801414489746094, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.17071210220456123, "step": 10836 }, { "epoch": 0.49263636363636365, "grad_norm": 4.625, "grad_norm_var": 0.288525390625, "learning_rate": 0.0001, "loss": 5.036, "loss/crossentropy": 2.093076705932617, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.14585148356854916, "step": 10838 }, { "epoch": 0.49272727272727274, "grad_norm": 5.0625, "grad_norm_var": 0.2827962239583333, "learning_rate": 0.0001, "loss": 5.389, "loss/crossentropy": 2.309242308139801, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1573922224342823, "step": 10840 }, { "epoch": 0.4928181818181818, "grad_norm": 5.0625, "grad_norm_var": 0.08346354166666667, "learning_rate": 0.0001, "loss": 5.8226, "loss/crossentropy": 2.6018259525299072, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16953404620289803, "step": 10842 }, { "epoch": 0.4929090909090909, "grad_norm": 5.3125, "grad_norm_var": 0.08268229166666667, "learning_rate": 0.0001, "loss": 5.5883, "loss/crossentropy": 2.466057240962982, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16339678317308426, "step": 10844 }, { "epoch": 0.493, "grad_norm": 5.25, "grad_norm_var": 0.06341145833333334, "learning_rate": 0.0001, "loss": 5.6046, "loss/crossentropy": 2.462187945842743, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1640472300350666, "step": 10846 }, { "epoch": 0.4930909090909091, "grad_norm": 5.09375, "grad_norm_var": 0.06341145833333334, "learning_rate": 0.0001, "loss": 5.995, "loss/crossentropy": 2.724848508834839, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17584481462836266, "step": 10848 }, { "epoch": 0.49318181818181817, "grad_norm": 5.25, "grad_norm_var": 0.07926025390625, "learning_rate": 0.0001, "loss": 5.1462, "loss/crossentropy": 2.06176033616066, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.15746628865599632, "step": 10850 }, { "epoch": 0.49327272727272725, "grad_norm": 4.65625, "grad_norm_var": 0.08004150390625, "learning_rate": 0.0001, "loss": 5.3063, "loss/crossentropy": 2.25309956073761, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15786121785640717, "step": 10852 }, { "epoch": 0.49336363636363634, "grad_norm": 5.125, "grad_norm_var": 0.06715087890625, "learning_rate": 0.0001, "loss": 5.3137, "loss/crossentropy": 2.2484490275382996, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1534048393368721, "step": 10854 }, { "epoch": 0.4934545454545455, "grad_norm": 5.09375, "grad_norm_var": 0.0572265625, "learning_rate": 0.0001, "loss": 5.3774, "loss/crossentropy": 2.3621148467063904, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15445398166775703, "step": 10856 }, { "epoch": 0.49354545454545456, "grad_norm": 5.03125, "grad_norm_var": 0.05084635416666667, "learning_rate": 0.0001, "loss": 5.5365, "loss/crossentropy": 2.415403425693512, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16503634676337242, "step": 10858 }, { "epoch": 0.49363636363636365, "grad_norm": 4.9375, "grad_norm_var": 0.048173014322916666, "learning_rate": 0.0001, "loss": 5.5293, "loss/crossentropy": 2.4645752906799316, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1574479416012764, "step": 10860 }, { "epoch": 0.49372727272727274, "grad_norm": 5.21875, "grad_norm_var": 0.04664306640625, "learning_rate": 0.0001, "loss": 5.744, "loss/crossentropy": 2.614540219306946, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16236461326479912, "step": 10862 }, { "epoch": 0.4938181818181818, "grad_norm": 7.3125, "grad_norm_var": 0.39683837890625, "learning_rate": 0.0001, "loss": 5.9682, "loss/crossentropy": 2.7100764513015747, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.17444606870412827, "step": 10864 }, { "epoch": 0.4939090909090909, "grad_norm": 5.09375, "grad_norm_var": 0.3714680989583333, "learning_rate": 0.0001, "loss": 5.5222, "loss/crossentropy": 2.3758395314216614, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16404855251312256, "step": 10866 }, { "epoch": 0.494, "grad_norm": 4.96875, "grad_norm_var": 0.3592732747395833, "learning_rate": 0.0001, "loss": 5.6178, "loss/crossentropy": 2.479239583015442, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16385576874017715, "step": 10868 }, { "epoch": 0.4940909090909091, "grad_norm": 4.8125, "grad_norm_var": 0.3739420572916667, "learning_rate": 0.0001, "loss": 5.4589, "loss/crossentropy": 2.415366232395172, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15474606305360794, "step": 10870 }, { "epoch": 0.49418181818181817, "grad_norm": 5.09375, "grad_norm_var": 0.37336832682291665, "learning_rate": 0.0001, "loss": 5.9553, "loss/crossentropy": 2.6974491477012634, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.17714907601475716, "step": 10872 }, { "epoch": 0.49427272727272725, "grad_norm": 4.96875, "grad_norm_var": 0.38189697265625, "learning_rate": 0.0001, "loss": 5.5288, "loss/crossentropy": 2.3937082290649414, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16096536815166473, "step": 10874 }, { "epoch": 0.49436363636363634, "grad_norm": 5.40625, "grad_norm_var": 0.38776041666666666, "learning_rate": 0.0001, "loss": 6.0251, "loss/crossentropy": 2.667108476161957, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1811138093471527, "step": 10876 }, { "epoch": 0.4944545454545455, "grad_norm": 5.1875, "grad_norm_var": 0.3805623372395833, "learning_rate": 0.0001, "loss": 5.8314, "loss/crossentropy": 2.610751748085022, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1718660183250904, "step": 10878 }, { "epoch": 0.49454545454545457, "grad_norm": 5.28125, "grad_norm_var": 0.056103515625, "learning_rate": 0.0001, "loss": 5.8514, "loss/crossentropy": 2.6907901763916016, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16762103512883186, "step": 10880 }, { "epoch": 0.49463636363636365, "grad_norm": 5.53125, "grad_norm_var": 0.07936197916666667, "learning_rate": 0.0001, "loss": 5.29, "loss/crossentropy": 2.238453060388565, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.15203220769762993, "step": 10882 }, { "epoch": 0.49472727272727274, "grad_norm": 5.0, "grad_norm_var": 0.08990478515625, "learning_rate": 0.0001, "loss": 5.7858, "loss/crossentropy": 2.6010043025016785, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1694541685283184, "step": 10884 }, { "epoch": 0.4948181818181818, "grad_norm": 4.6875, "grad_norm_var": 0.084375, "learning_rate": 0.0001, "loss": 5.4238, "loss/crossentropy": 2.3883197903633118, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15433157980442047, "step": 10886 }, { "epoch": 0.4949090909090909, "grad_norm": 4.875, "grad_norm_var": 0.08886311848958334, "learning_rate": 0.0001, "loss": 5.3886, "loss/crossentropy": 2.3221689462661743, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1564459688961506, "step": 10888 }, { "epoch": 0.495, "grad_norm": 4.96875, "grad_norm_var": 0.0826171875, "learning_rate": 0.0001, "loss": 5.5228, "loss/crossentropy": 2.4147925972938538, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.15865307301282883, "step": 10890 }, { "epoch": 0.4950909090909091, "grad_norm": 4.4375, "grad_norm_var": 0.08079020182291667, "learning_rate": 0.0001, "loss": 5.0689, "loss/crossentropy": 2.1393822133541107, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.14393137954175472, "step": 10892 }, { "epoch": 0.49518181818181817, "grad_norm": 4.8125, "grad_norm_var": 0.081103515625, "learning_rate": 0.0001, "loss": 5.5796, "loss/crossentropy": 2.5198482871055603, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1598830744624138, "step": 10894 }, { "epoch": 0.49527272727272725, "grad_norm": 4.90625, "grad_norm_var": 0.07681884765625, "learning_rate": 0.0001, "loss": 5.7197, "loss/crossentropy": 2.5777830481529236, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16633697971701622, "step": 10896 }, { "epoch": 0.49536363636363634, "grad_norm": 5.0, "grad_norm_var": 0.060347493489583334, "learning_rate": 0.0001, "loss": 5.2149, "loss/crossentropy": 2.2297541797161102, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.14773714542388916, "step": 10898 }, { "epoch": 0.4954545454545455, "grad_norm": 5.125, "grad_norm_var": 0.038655598958333336, "learning_rate": 0.0001, "loss": 5.4663, "loss/crossentropy": 2.3657853603363037, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1598525084555149, "step": 10900 }, { "epoch": 0.49554545454545457, "grad_norm": 5.46875, "grad_norm_var": 0.35128580729166664, "learning_rate": 0.0001, "loss": 5.5285, "loss/crossentropy": 2.398997962474823, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.16001613438129425, "step": 10902 }, { "epoch": 0.49563636363636365, "grad_norm": 5.0625, "grad_norm_var": 0.35562744140625, "learning_rate": 0.0001, "loss": 5.6424, "loss/crossentropy": 2.462488055229187, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16857336089015007, "step": 10904 }, { "epoch": 0.49572727272727274, "grad_norm": 4.78125, "grad_norm_var": 0.36064046223958335, "learning_rate": 0.0001, "loss": 5.4677, "loss/crossentropy": 2.386452317237854, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15890280529856682, "step": 10906 }, { "epoch": 0.4958181818181818, "grad_norm": 5.40625, "grad_norm_var": 0.34425455729166665, "learning_rate": 0.0001, "loss": 5.781, "loss/crossentropy": 2.5583813786506653, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.1701137274503708, "step": 10908 }, { "epoch": 0.4959090909090909, "grad_norm": 4.5, "grad_norm_var": 0.39384358723958335, "learning_rate": 0.0001, "loss": 5.0386, "loss/crossentropy": 2.1515157520771027, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1410566009581089, "step": 10910 }, { "epoch": 0.496, "grad_norm": 5.21875, "grad_norm_var": 0.39386393229166666, "learning_rate": 0.0001, "loss": 5.5835, "loss/crossentropy": 2.422006130218506, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16556769981980324, "step": 10912 }, { "epoch": 0.4960909090909091, "grad_norm": 5.125, "grad_norm_var": 0.3876139322916667, "learning_rate": 0.0001, "loss": 5.7625, "loss/crossentropy": 2.547901153564453, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16774405911564827, "step": 10914 }, { "epoch": 0.49618181818181817, "grad_norm": 5.03125, "grad_norm_var": 0.41061197916666664, "learning_rate": 0.0001, "loss": 5.526, "loss/crossentropy": 2.4198916256427765, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1604190282523632, "step": 10916 }, { "epoch": 0.49627272727272725, "grad_norm": 4.78125, "grad_norm_var": 0.13622639973958334, "learning_rate": 0.0001, "loss": 5.5981, "loss/crossentropy": 2.5029335021972656, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16146522387862206, "step": 10918 }, { "epoch": 0.49636363636363634, "grad_norm": 5.34375, "grad_norm_var": 0.14468994140625, "learning_rate": 0.0001, "loss": 5.7047, "loss/crossentropy": 2.5370948910713196, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16636958345770836, "step": 10920 }, { "epoch": 0.4964545454545455, "grad_norm": 4.71875, "grad_norm_var": 0.14791259765625, "learning_rate": 0.0001, "loss": 5.5471, "loss/crossentropy": 2.3956594467163086, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16338590160012245, "step": 10922 }, { "epoch": 0.49654545454545457, "grad_norm": 5.15625, "grad_norm_var": 0.14498697916666667, "learning_rate": 0.0001, "loss": 5.6496, "loss/crossentropy": 2.562291443347931, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16185245662927628, "step": 10924 }, { "epoch": 0.49663636363636365, "grad_norm": 5.0625, "grad_norm_var": 0.12604166666666666, "learning_rate": 0.0001, "loss": 5.4376, "loss/crossentropy": 2.3794270753860474, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15757986530661583, "step": 10926 }, { "epoch": 0.49672727272727274, "grad_norm": 4.75, "grad_norm_var": 0.11529947916666666, "learning_rate": 0.0001, "loss": 5.5402, "loss/crossentropy": 2.4494882225990295, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16297278180718422, "step": 10928 }, { "epoch": 0.4968181818181818, "grad_norm": 5.03125, "grad_norm_var": 0.06692708333333333, "learning_rate": 0.0001, "loss": 5.6483, "loss/crossentropy": 2.47570937871933, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1670680046081543, "step": 10930 }, { "epoch": 0.4969090909090909, "grad_norm": 4.84375, "grad_norm_var": 0.06321614583333333, "learning_rate": 0.0001, "loss": 5.7583, "loss/crossentropy": 2.5578030943870544, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16868607699871063, "step": 10932 }, { "epoch": 0.497, "grad_norm": 5.21875, "grad_norm_var": 0.06920166015625, "learning_rate": 0.0001, "loss": 5.5068, "loss/crossentropy": 2.4265429377555847, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.15685855969786644, "step": 10934 }, { "epoch": 0.4970909090909091, "grad_norm": 5.34375, "grad_norm_var": 0.07245686848958334, "learning_rate": 0.0001, "loss": 5.7413, "loss/crossentropy": 2.5365001559257507, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17009112611413002, "step": 10936 }, { "epoch": 0.49718181818181817, "grad_norm": 5.5, "grad_norm_var": 0.2941691080729167, "learning_rate": 0.0001, "loss": 5.8966, "loss/crossentropy": 2.685452461242676, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.174433134496212, "step": 10938 }, { "epoch": 0.49727272727272726, "grad_norm": 5.03125, "grad_norm_var": 0.27786458333333336, "learning_rate": 0.0001, "loss": 5.6618, "loss/crossentropy": 2.5269235968589783, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16426395624876022, "step": 10940 }, { "epoch": 0.49736363636363634, "grad_norm": 5.15625, "grad_norm_var": 0.24295247395833333, "learning_rate": 0.0001, "loss": 5.8866, "loss/crossentropy": 2.697558104991913, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16812114045023918, "step": 10942 }, { "epoch": 0.4974545454545454, "grad_norm": 4.875, "grad_norm_var": 0.23683268229166668, "learning_rate": 0.0001, "loss": 5.2018, "loss/crossentropy": 2.250657171010971, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.14472565054893494, "step": 10944 }, { "epoch": 0.49754545454545457, "grad_norm": 5.34375, "grad_norm_var": 0.25614827473958335, "learning_rate": 0.0001, "loss": 5.5703, "loss/crossentropy": 2.4259127974510193, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16424178704619408, "step": 10946 }, { "epoch": 0.49763636363636365, "grad_norm": 4.875, "grad_norm_var": 0.2595011393229167, "learning_rate": 0.0001, "loss": 5.6218, "loss/crossentropy": 2.5228089094161987, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16341428831219673, "step": 10948 }, { "epoch": 0.49772727272727274, "grad_norm": 5.75, "grad_norm_var": 0.2828776041666667, "learning_rate": 0.0001, "loss": 5.8211, "loss/crossentropy": 2.574313759803772, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.17370188236236572, "step": 10950 }, { "epoch": 0.4978181818181818, "grad_norm": 5.1875, "grad_norm_var": 0.28472900390625, "learning_rate": 0.0001, "loss": 5.6401, "loss/crossentropy": 2.5040605664253235, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16399193555116653, "step": 10952 }, { "epoch": 0.4979090909090909, "grad_norm": 5.28125, "grad_norm_var": 0.07978108723958334, "learning_rate": 0.0001, "loss": 5.6235, "loss/crossentropy": 2.5325503945350647, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15870382264256477, "step": 10954 }, { "epoch": 0.498, "grad_norm": 5.0625, "grad_norm_var": 0.07994384765625, "learning_rate": 0.0001, "loss": 5.5989, "loss/crossentropy": 2.4421003460884094, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.164901252835989, "step": 10956 }, { "epoch": 0.4980909090909091, "grad_norm": 5.375, "grad_norm_var": 0.11093343098958333, "learning_rate": 0.0001, "loss": 5.6668, "loss/crossentropy": 2.5180968046188354, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16369818896055222, "step": 10958 }, { "epoch": 0.49818181818181817, "grad_norm": 4.25, "grad_norm_var": 0.14804280598958333, "learning_rate": 0.0001, "loss": 5.1659, "loss/crossentropy": 2.23604553937912, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.1470888964831829, "step": 10960 }, { "epoch": 0.49827272727272726, "grad_norm": 5.46875, "grad_norm_var": 0.13899332682291668, "learning_rate": 0.0001, "loss": 6.0139, "loss/crossentropy": 2.672422766685486, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.18121861666440964, "step": 10962 }, { "epoch": 0.49836363636363634, "grad_norm": 5.1875, "grad_norm_var": 0.14186197916666668, "learning_rate": 0.0001, "loss": 5.5223, "loss/crossentropy": 2.433311700820923, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16182735934853554, "step": 10964 }, { "epoch": 0.49845454545454543, "grad_norm": 5.125, "grad_norm_var": 0.11822916666666666, "learning_rate": 0.0001, "loss": 5.7373, "loss/crossentropy": 2.565512716770172, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16815189272165298, "step": 10966 }, { "epoch": 0.49854545454545457, "grad_norm": 4.96875, "grad_norm_var": 0.12565104166666666, "learning_rate": 0.0001, "loss": 5.6411, "loss/crossentropy": 2.5305636525154114, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1620299443602562, "step": 10968 }, { "epoch": 0.49863636363636366, "grad_norm": 5.0, "grad_norm_var": 0.13487955729166667, "learning_rate": 0.0001, "loss": 5.5242, "loss/crossentropy": 2.4317968487739563, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1625640206038952, "step": 10970 }, { "epoch": 0.49872727272727274, "grad_norm": 5.1875, "grad_norm_var": 0.13769124348958334, "learning_rate": 0.0001, "loss": 5.8947, "loss/crossentropy": 2.65418404340744, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1744435876607895, "step": 10972 }, { "epoch": 0.4988181818181818, "grad_norm": 5.6875, "grad_norm_var": 0.12862955729166667, "learning_rate": 0.0001, "loss": 5.831, "loss/crossentropy": 2.595550000667572, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1715943068265915, "step": 10974 }, { "epoch": 0.4989090909090909, "grad_norm": 4.9375, "grad_norm_var": 0.09264322916666666, "learning_rate": 0.0001, "loss": 5.8332, "loss/crossentropy": 2.6079735159873962, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17408530786633492, "step": 10976 }, { "epoch": 0.499, "grad_norm": 4.78125, "grad_norm_var": 0.08839518229166667, "learning_rate": 0.0001, "loss": 5.54, "loss/crossentropy": 2.4626612067222595, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15969150885939598, "step": 10978 }, { "epoch": 0.4990909090909091, "grad_norm": 4.9375, "grad_norm_var": 0.08515625, "learning_rate": 0.0001, "loss": 5.9768, "loss/crossentropy": 2.7779561281204224, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1702732890844345, "step": 10980 }, { "epoch": 0.49918181818181817, "grad_norm": 5.0625, "grad_norm_var": 0.08474934895833333, "learning_rate": 0.0001, "loss": 5.86, "loss/crossentropy": 2.657391607761383, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16830569133162498, "step": 10982 }, { "epoch": 0.49927272727272726, "grad_norm": 5.0, "grad_norm_var": 0.08553059895833333, "learning_rate": 0.0001, "loss": 6.1512, "loss/crossentropy": 2.8165456652641296, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18307215720415115, "step": 10984 }, { "epoch": 0.49936363636363634, "grad_norm": 6.34375, "grad_norm_var": 0.18079427083333333, "learning_rate": 0.0001, "loss": 5.794, "loss/crossentropy": 2.5949292182922363, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16893112286925316, "step": 10986 }, { "epoch": 0.49945454545454543, "grad_norm": 4.9375, "grad_norm_var": 0.18644205729166666, "learning_rate": 0.0001, "loss": 5.4163, "loss/crossentropy": 2.3787726163864136, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15668699145317078, "step": 10988 }, { "epoch": 0.49954545454545457, "grad_norm": 5.03125, "grad_norm_var": 0.36337483723958336, "learning_rate": 0.0001, "loss": 5.8775, "loss/crossentropy": 2.6363367438316345, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17176952958106995, "step": 10990 }, { "epoch": 0.49963636363636366, "grad_norm": 4.84375, "grad_norm_var": 0.36171875, "learning_rate": 0.0001, "loss": 5.6477, "loss/crossentropy": 2.501773416996002, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.1647852584719658, "step": 10992 }, { "epoch": 0.49972727272727274, "grad_norm": 5.0625, "grad_norm_var": 0.33053385416666664, "learning_rate": 0.0001, "loss": 5.2023, "loss/crossentropy": 2.236946791410446, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.14829321950674057, "step": 10994 }, { "epoch": 0.49981818181818183, "grad_norm": 6.1875, "grad_norm_var": 0.38058268229166664, "learning_rate": 0.0001, "loss": 5.67, "loss/crossentropy": 2.42450150847435, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1706419587135315, "step": 10996 }, { "epoch": 0.4999090909090909, "grad_norm": 5.3125, "grad_norm_var": 0.3712239583333333, "learning_rate": 0.0001, "loss": 5.6672, "loss/crossentropy": 2.4718552231788635, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16875776275992393, "step": 10998 }, { "epoch": 0.5, "grad_norm": 5.0, "grad_norm_var": 0.3951822916666667, "learning_rate": 0.0001, "loss": 5.4933, "loss/crossentropy": 2.464068830013275, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15565872937440872, "step": 11000 }, { "epoch": 0.5000909090909091, "grad_norm": 4.8125, "grad_norm_var": 0.39892171223958334, "learning_rate": 0.0001, "loss": 5.1562, "loss/crossentropy": 2.2090771794319153, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.14763860777020454, "step": 11002 }, { "epoch": 0.5001818181818182, "grad_norm": 4.71875, "grad_norm_var": 0.4383951822916667, "learning_rate": 0.0001, "loss": 5.491, "loss/crossentropy": 2.454963803291321, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.1580977402627468, "step": 11004 }, { "epoch": 0.5002727272727273, "grad_norm": 4.78125, "grad_norm_var": 0.24426676432291666, "learning_rate": 0.0001, "loss": 5.6549, "loss/crossentropy": 2.4506796002388, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16730193048715591, "step": 11006 }, { "epoch": 0.5003636363636363, "grad_norm": 5.34375, "grad_norm_var": 0.2538045247395833, "learning_rate": 0.0001, "loss": 5.9237, "loss/crossentropy": 2.6433420181274414, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17588981613516808, "step": 11008 }, { "epoch": 0.5004545454545455, "grad_norm": 4.65625, "grad_norm_var": 0.2633463541666667, "learning_rate": 0.0001, "loss": 5.6294, "loss/crossentropy": 2.5414459109306335, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1584046594798565, "step": 11010 }, { "epoch": 0.5005454545454545, "grad_norm": 4.6875, "grad_norm_var": 0.15178629557291667, "learning_rate": 0.0001, "loss": 5.6825, "loss/crossentropy": 2.5356642603874207, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.165274478495121, "step": 11012 }, { "epoch": 0.5006363636363637, "grad_norm": 4.96875, "grad_norm_var": 0.14459228515625, "learning_rate": 0.0001, "loss": 5.6367, "loss/crossentropy": 2.515668511390686, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1636650115251541, "step": 11014 }, { "epoch": 0.5007272727272727, "grad_norm": 5.28125, "grad_norm_var": 0.15260009765625, "learning_rate": 0.0001, "loss": 5.6845, "loss/crossentropy": 2.4985058903694153, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16937601193785667, "step": 11016 }, { "epoch": 0.5008181818181818, "grad_norm": 5.125, "grad_norm_var": 0.13105061848958333, "learning_rate": 0.0001, "loss": 5.6515, "loss/crossentropy": 2.56623375415802, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1579420492053032, "step": 11018 }, { "epoch": 0.5009090909090909, "grad_norm": 5.25, "grad_norm_var": 0.40605061848958335, "learning_rate": 0.0001, "loss": 5.7429, "loss/crossentropy": 2.4942219853401184, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17330998554825783, "step": 11020 }, { "epoch": 0.501, "grad_norm": 5.125, "grad_norm_var": 0.40396728515625, "learning_rate": 0.0001, "loss": 5.4781, "loss/crossentropy": 2.377518594264984, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1614220030605793, "step": 11022 }, { "epoch": 0.5010909090909091, "grad_norm": 6.375, "grad_norm_var": 0.484375, "learning_rate": 0.0001, "loss": 5.8463, "loss/crossentropy": 2.626004695892334, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1704719066619873, "step": 11024 }, { "epoch": 0.5011818181818182, "grad_norm": 5.21875, "grad_norm_var": 0.4426066080729167, "learning_rate": 0.0001, "loss": 6.1245, "loss/crossentropy": 2.809302806854248, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.18054577708244324, "step": 11026 }, { "epoch": 0.5012727272727273, "grad_norm": 4.71875, "grad_norm_var": 0.46789957682291666, "learning_rate": 0.0001, "loss": 5.4799, "loss/crossentropy": 2.4478771686553955, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15749863162636757, "step": 11028 }, { "epoch": 0.5013636363636363, "grad_norm": 4.8125, "grad_norm_var": 0.483056640625, "learning_rate": 0.0001, "loss": 5.8071, "loss/crossentropy": 2.651841878890991, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1645522341132164, "step": 11030 }, { "epoch": 0.5014545454545455, "grad_norm": 5.71875, "grad_norm_var": 0.498291015625, "learning_rate": 0.0001, "loss": 5.7312, "loss/crossentropy": 2.5654813647270203, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16383983939886093, "step": 11032 }, { "epoch": 0.5015454545454545, "grad_norm": 5.03125, "grad_norm_var": 0.4951171875, "learning_rate": 0.0001, "loss": 5.7842, "loss/crossentropy": 2.5566521286964417, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17157834023237228, "step": 11034 }, { "epoch": 0.5016363636363637, "grad_norm": 5.03125, "grad_norm_var": 0.17224934895833333, "learning_rate": 0.0001, "loss": 5.452, "loss/crossentropy": 2.3629947304725647, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16065558046102524, "step": 11036 }, { "epoch": 0.5017272727272727, "grad_norm": 4.8125, "grad_norm_var": 0.17389322916666666, "learning_rate": 0.0001, "loss": 5.761, "loss/crossentropy": 2.5836288928985596, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16832385957241058, "step": 11038 }, { "epoch": 0.5018181818181818, "grad_norm": 5.0, "grad_norm_var": 0.0697265625, "learning_rate": 0.0001, "loss": 5.8176, "loss/crossentropy": 2.678365111351013, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1656765080988407, "step": 11040 }, { "epoch": 0.5019090909090909, "grad_norm": 5.125, "grad_norm_var": 0.07565104166666667, "learning_rate": 0.0001, "loss": 5.2901, "loss/crossentropy": 2.301580846309662, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15041900426149368, "step": 11042 }, { "epoch": 0.502, "grad_norm": 5.21875, "grad_norm_var": 0.0802734375, "learning_rate": 0.0001, "loss": 5.7046, "loss/crossentropy": 2.448770225048065, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1726495325565338, "step": 11044 }, { "epoch": 0.5020909090909091, "grad_norm": 4.90625, "grad_norm_var": 0.08756510416666667, "learning_rate": 0.0001, "loss": 5.7893, "loss/crossentropy": 2.6119261384010315, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17125630378723145, "step": 11046 }, { "epoch": 0.5021818181818182, "grad_norm": 4.5625, "grad_norm_var": 0.07115478515625, "learning_rate": 0.0001, "loss": 5.2633, "loss/crossentropy": 2.246310740709305, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.152482058852911, "step": 11048 }, { "epoch": 0.5022727272727273, "grad_norm": 5.25, "grad_norm_var": 0.07102457682291667, "learning_rate": 0.0001, "loss": 5.8865, "loss/crossentropy": 2.7020627856254578, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16688114032149315, "step": 11050 }, { "epoch": 0.5023636363636363, "grad_norm": 5.21875, "grad_norm_var": 0.08860677083333333, "learning_rate": 0.0001, "loss": 5.6862, "loss/crossentropy": 2.4726778864860535, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.17076730728149414, "step": 11052 }, { "epoch": 0.5024545454545455, "grad_norm": 4.34375, "grad_norm_var": 0.11717122395833333, "learning_rate": 0.0001, "loss": 5.1468, "loss/crossentropy": 2.174849420785904, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.1497354507446289, "step": 11054 }, { "epoch": 0.5025454545454545, "grad_norm": 4.6875, "grad_norm_var": 0.12301025390625, "learning_rate": 0.0001, "loss": 5.6288, "loss/crossentropy": 2.5369701981544495, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1603550724685192, "step": 11056 }, { "epoch": 0.5026363636363637, "grad_norm": 5.28125, "grad_norm_var": 0.12974853515625, "learning_rate": 0.0001, "loss": 5.6021, "loss/crossentropy": 2.5009655356407166, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1599222905933857, "step": 11058 }, { "epoch": 0.5027272727272727, "grad_norm": 4.78125, "grad_norm_var": 0.101806640625, "learning_rate": 0.0001, "loss": 5.7152, "loss/crossentropy": 2.577699601650238, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16569849103689194, "step": 11060 }, { "epoch": 0.5028181818181818, "grad_norm": 5.03125, "grad_norm_var": 0.10204671223958334, "learning_rate": 0.0001, "loss": 5.5274, "loss/crossentropy": 2.4273369312286377, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1594165861606598, "step": 11062 }, { "epoch": 0.5029090909090909, "grad_norm": 4.71875, "grad_norm_var": 0.09592692057291667, "learning_rate": 0.0001, "loss": 5.8315, "loss/crossentropy": 2.652952194213867, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16746317595243454, "step": 11064 }, { "epoch": 0.503, "grad_norm": 5.15625, "grad_norm_var": 0.09073893229166667, "learning_rate": 0.0001, "loss": 5.8764, "loss/crossentropy": 2.642425060272217, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17300963401794434, "step": 11066 }, { "epoch": 0.5030909090909091, "grad_norm": 4.6875, "grad_norm_var": 0.06617431640625, "learning_rate": 0.0001, "loss": 5.6674, "loss/crossentropy": 2.5564351081848145, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16421950235962868, "step": 11068 }, { "epoch": 0.5031818181818182, "grad_norm": 4.875, "grad_norm_var": 0.044266764322916666, "learning_rate": 0.0001, "loss": 5.7552, "loss/crossentropy": 2.61171555519104, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16532539576292038, "step": 11070 }, { "epoch": 0.5032727272727273, "grad_norm": 5.25, "grad_norm_var": 0.04920247395833333, "learning_rate": 0.0001, "loss": 5.4326, "loss/crossentropy": 2.359228014945984, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15695058926939964, "step": 11072 }, { "epoch": 0.5033636363636363, "grad_norm": 4.625, "grad_norm_var": 0.042822265625, "learning_rate": 0.0001, "loss": 5.4894, "loss/crossentropy": 2.4413366317749023, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15870894491672516, "step": 11074 }, { "epoch": 0.5034545454545455, "grad_norm": 4.9375, "grad_norm_var": 0.07649332682291667, "learning_rate": 0.0001, "loss": 5.94, "loss/crossentropy": 2.627800464630127, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17789551988244057, "step": 11076 }, { "epoch": 0.5035454545454545, "grad_norm": 5.25, "grad_norm_var": 0.08190104166666666, "learning_rate": 0.0001, "loss": 5.9224, "loss/crossentropy": 2.6932069659233093, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1735062412917614, "step": 11078 }, { "epoch": 0.5036363636363637, "grad_norm": 4.78125, "grad_norm_var": 0.083984375, "learning_rate": 0.0001, "loss": 5.6476, "loss/crossentropy": 2.5309282541275024, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.1645975299179554, "step": 11080 }, { "epoch": 0.5037272727272727, "grad_norm": 5.0, "grad_norm_var": 0.08214518229166666, "learning_rate": 0.0001, "loss": 5.6365, "loss/crossentropy": 2.5065829157829285, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16240839287638664, "step": 11082 }, { "epoch": 0.5038181818181818, "grad_norm": 4.59375, "grad_norm_var": 0.090625, "learning_rate": 0.0001, "loss": 5.4396, "loss/crossentropy": 2.428074896335602, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.15095443278551102, "step": 11084 }, { "epoch": 0.5039090909090909, "grad_norm": 4.84375, "grad_norm_var": 0.1, "learning_rate": 0.0001, "loss": 5.5453, "loss/crossentropy": 2.485000401735306, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15720606595277786, "step": 11086 }, { "epoch": 0.504, "grad_norm": 5.125, "grad_norm_var": 0.09680989583333334, "learning_rate": 0.0001, "loss": 5.7377, "loss/crossentropy": 2.5851664543151855, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1648613102734089, "step": 11088 }, { "epoch": 0.5040909090909091, "grad_norm": 5.34375, "grad_norm_var": 0.08528645833333333, "learning_rate": 0.0001, "loss": 5.5076, "loss/crossentropy": 2.416034758090973, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15974314883351326, "step": 11090 }, { "epoch": 0.5041818181818182, "grad_norm": 5.09375, "grad_norm_var": 0.06067301432291667, "learning_rate": 0.0001, "loss": 6.0223, "loss/crossentropy": 2.7305678725242615, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.17936691641807556, "step": 11092 }, { "epoch": 0.5042727272727273, "grad_norm": 5.5, "grad_norm_var": 0.07662353515625, "learning_rate": 0.0001, "loss": 5.7143, "loss/crossentropy": 2.5433385968208313, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16807431355118752, "step": 11094 }, { "epoch": 0.5043636363636363, "grad_norm": 4.75, "grad_norm_var": 0.0833984375, "learning_rate": 0.0001, "loss": 4.9159, "loss/crossentropy": 2.050717204809189, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.14277169853448868, "step": 11096 }, { "epoch": 0.5044545454545455, "grad_norm": 4.96875, "grad_norm_var": 0.08487955729166667, "learning_rate": 0.0001, "loss": 5.6565, "loss/crossentropy": 2.5093231201171875, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1647147759795189, "step": 11098 }, { "epoch": 0.5045454545454545, "grad_norm": 4.78125, "grad_norm_var": 0.07971598307291666, "learning_rate": 0.0001, "loss": 5.8997, "loss/crossentropy": 2.666722297668457, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1721251755952835, "step": 11100 }, { "epoch": 0.5046363636363637, "grad_norm": 5.0625, "grad_norm_var": 0.08378499348958333, "learning_rate": 0.0001, "loss": 5.5347, "loss/crossentropy": 2.423344135284424, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16113387793302536, "step": 11102 }, { "epoch": 0.5047272727272727, "grad_norm": 5.53125, "grad_norm_var": 0.10673421223958333, "learning_rate": 0.0001, "loss": 6.1611, "loss/crossentropy": 2.7474916577339172, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1874540075659752, "step": 11104 }, { "epoch": 0.5048181818181818, "grad_norm": 5.84375, "grad_norm_var": 0.14468994140625, "learning_rate": 0.0001, "loss": 5.645, "loss/crossentropy": 2.458938717842102, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1686040684580803, "step": 11106 }, { "epoch": 0.5049090909090909, "grad_norm": 5.46875, "grad_norm_var": 0.16955973307291666, "learning_rate": 0.0001, "loss": 6.058, "loss/crossentropy": 2.719900667667389, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1806843802332878, "step": 11108 }, { "epoch": 0.505, "grad_norm": 6.1875, "grad_norm_var": 0.22265625, "learning_rate": 0.0001, "loss": 5.5642, "loss/crossentropy": 2.4015421271324158, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16567708551883698, "step": 11110 }, { "epoch": 0.5050909090909091, "grad_norm": 5.59375, "grad_norm_var": 0.191259765625, "learning_rate": 0.0001, "loss": 6.1408, "loss/crossentropy": 2.8739614486694336, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17629747092723846, "step": 11112 }, { "epoch": 0.5051818181818182, "grad_norm": 6.4375, "grad_norm_var": 0.24620768229166667, "learning_rate": 0.0001, "loss": 5.3555, "loss/crossentropy": 2.2307965755462646, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16091181337833405, "step": 11114 }, { "epoch": 0.5052727272727273, "grad_norm": 4.96875, "grad_norm_var": 0.23821207682291667, "learning_rate": 0.0001, "loss": 5.7208, "loss/crossentropy": 2.542577028274536, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17016569152474403, "step": 11116 }, { "epoch": 0.5053636363636363, "grad_norm": 4.8125, "grad_norm_var": 0.2240234375, "learning_rate": 0.0001, "loss": 5.2597, "loss/crossentropy": 2.2364233136177063, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15311229974031448, "step": 11118 }, { "epoch": 0.5054545454545455, "grad_norm": 5.03125, "grad_norm_var": 0.24798177083333334, "learning_rate": 0.0001, "loss": 5.8975, "loss/crossentropy": 2.688439965248108, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17071175575256348, "step": 11120 }, { "epoch": 0.5055454545454545, "grad_norm": 5.0, "grad_norm_var": 0.22395833333333334, "learning_rate": 0.0001, "loss": 5.6642, "loss/crossentropy": 2.517568200826645, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16388559713959694, "step": 11122 }, { "epoch": 0.5056363636363637, "grad_norm": 5.25, "grad_norm_var": 0.21691080729166667, "learning_rate": 0.0001, "loss": 5.5277, "loss/crossentropy": 2.4245994091033936, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.15932874009013176, "step": 11124 }, { "epoch": 0.5057272727272727, "grad_norm": 4.9375, "grad_norm_var": 0.16652018229166668, "learning_rate": 0.0001, "loss": 5.8672, "loss/crossentropy": 2.644101858139038, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17211729660630226, "step": 11126 }, { "epoch": 0.5058181818181818, "grad_norm": 5.0, "grad_norm_var": 0.17678629557291667, "learning_rate": 0.0001, "loss": 5.4076, "loss/crossentropy": 2.33242928981781, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.15459024161100388, "step": 11128 }, { "epoch": 0.5059090909090909, "grad_norm": 5.21875, "grad_norm_var": 0.07792561848958333, "learning_rate": 0.0001, "loss": 5.2578, "loss/crossentropy": 2.284926116466522, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.14904413744807243, "step": 11130 }, { "epoch": 0.506, "grad_norm": 6.125, "grad_norm_var": 0.13609619140625, "learning_rate": 0.0001, "loss": 5.491, "loss/crossentropy": 2.352058470249176, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16388997435569763, "step": 11132 }, { "epoch": 0.506090909090909, "grad_norm": 4.8125, "grad_norm_var": 0.137744140625, "learning_rate": 0.0001, "loss": 5.6648, "loss/crossentropy": 2.4969218373298645, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1683473214507103, "step": 11134 }, { "epoch": 0.5061818181818182, "grad_norm": 5.09375, "grad_norm_var": 0.19293212890625, "learning_rate": 0.0001, "loss": 5.7872, "loss/crossentropy": 2.5439926385879517, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17198166251182556, "step": 11136 }, { "epoch": 0.5062727272727273, "grad_norm": 5.25, "grad_norm_var": 0.19257405598958333, "learning_rate": 0.0001, "loss": 5.803, "loss/crossentropy": 2.6109648048877716, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16881321370601654, "step": 11138 }, { "epoch": 0.5063636363636363, "grad_norm": 5.5, "grad_norm_var": 0.271728515625, "learning_rate": 0.0001, "loss": 5.8723, "loss/crossentropy": 2.620957672595978, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17317646369338036, "step": 11140 }, { "epoch": 0.5064545454545455, "grad_norm": 4.5625, "grad_norm_var": 0.29596354166666666, "learning_rate": 0.0001, "loss": 5.5834, "loss/crossentropy": 2.431982159614563, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1667044274508953, "step": 11142 }, { "epoch": 0.5065454545454545, "grad_norm": 5.6875, "grad_norm_var": 0.2892537434895833, "learning_rate": 0.0001, "loss": 5.8621, "loss/crossentropy": 2.5642775297164917, "loss/hidden": 1.552734375, "loss/jsd": 0.0, "loss/logits": 0.17450788989663124, "step": 11144 }, { "epoch": 0.5066363636363637, "grad_norm": 4.65625, "grad_norm_var": 0.30810139973958334, "learning_rate": 0.0001, "loss": 5.5323, "loss/crossentropy": 2.474263846874237, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1577616184949875, "step": 11146 }, { "epoch": 0.5067272727272727, "grad_norm": 4.875, "grad_norm_var": 0.28254801432291665, "learning_rate": 0.0001, "loss": 5.5971, "loss/crossentropy": 2.5313867926597595, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1593083254992962, "step": 11148 }, { "epoch": 0.5068181818181818, "grad_norm": 4.5, "grad_norm_var": 0.34667561848958334, "learning_rate": 0.0001, "loss": 5.4721, "loss/crossentropy": 2.3337009251117706, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1628588642925024, "step": 11150 }, { "epoch": 0.5069090909090909, "grad_norm": 4.6875, "grad_norm_var": 0.31184488932291665, "learning_rate": 0.0001, "loss": 5.2848, "loss/crossentropy": 2.317204922437668, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1498812809586525, "step": 11152 }, { "epoch": 0.507, "grad_norm": 4.75, "grad_norm_var": 0.32030843098958334, "learning_rate": 0.0001, "loss": 5.3526, "loss/crossentropy": 2.2784281373023987, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.15507083013653755, "step": 11154 }, { "epoch": 0.507090909090909, "grad_norm": 4.8125, "grad_norm_var": 0.19892171223958333, "learning_rate": 0.0001, "loss": 5.5874, "loss/crossentropy": 2.501277446746826, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1582179069519043, "step": 11156 }, { "epoch": 0.5071818181818182, "grad_norm": 5.25, "grad_norm_var": 0.20136311848958333, "learning_rate": 0.0001, "loss": 5.7342, "loss/crossentropy": 2.560066521167755, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16624519973993301, "step": 11158 }, { "epoch": 0.5072727272727273, "grad_norm": 4.9375, "grad_norm_var": 0.15579020182291667, "learning_rate": 0.0001, "loss": 5.6005, "loss/crossentropy": 2.479751408100128, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1616814099252224, "step": 11160 }, { "epoch": 0.5073636363636363, "grad_norm": 4.59375, "grad_norm_var": 0.15852864583333334, "learning_rate": 0.0001, "loss": 5.6182, "loss/crossentropy": 2.5396382808685303, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16176409274339676, "step": 11162 }, { "epoch": 0.5074545454545455, "grad_norm": 5.40625, "grad_norm_var": 0.15865885416666667, "learning_rate": 0.0001, "loss": 5.8599, "loss/crossentropy": 2.6319398880004883, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1726037748157978, "step": 11164 }, { "epoch": 0.5075454545454545, "grad_norm": 5.0, "grad_norm_var": 0.07420247395833333, "learning_rate": 0.0001, "loss": 5.6186, "loss/crossentropy": 2.4823029935359955, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16499990969896317, "step": 11166 }, { "epoch": 0.5076363636363637, "grad_norm": 5.125, "grad_norm_var": 0.06575520833333333, "learning_rate": 0.0001, "loss": 5.7768, "loss/crossentropy": 2.5248705446720123, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17500018700957298, "step": 11168 }, { "epoch": 0.5077272727272727, "grad_norm": 5.4375, "grad_norm_var": 0.09468994140625, "learning_rate": 0.0001, "loss": 5.9241, "loss/crossentropy": 2.662128210067749, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.17482608929276466, "step": 11170 }, { "epoch": 0.5078181818181818, "grad_norm": 4.6875, "grad_norm_var": 0.09931233723958334, "learning_rate": 0.0001, "loss": 5.261, "loss/crossentropy": 2.2694543600082397, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15208425372838974, "step": 11172 }, { "epoch": 0.5079090909090909, "grad_norm": 4.625, "grad_norm_var": 0.09685872395833334, "learning_rate": 0.0001, "loss": 5.4117, "loss/crossentropy": 2.3230114579200745, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16081815212965012, "step": 11174 }, { "epoch": 0.508, "grad_norm": 5.09375, "grad_norm_var": 0.09970296223958333, "learning_rate": 0.0001, "loss": 5.7763, "loss/crossentropy": 2.5682618021965027, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17197350412607193, "step": 11176 }, { "epoch": 0.508090909090909, "grad_norm": 4.75, "grad_norm_var": 0.10012613932291667, "learning_rate": 0.0001, "loss": 5.5594, "loss/crossentropy": 2.502389371395111, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15687404572963715, "step": 11178 }, { "epoch": 0.5081818181818182, "grad_norm": 4.96875, "grad_norm_var": 0.08876546223958333, "learning_rate": 0.0001, "loss": 5.6436, "loss/crossentropy": 2.475421190261841, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16740529239177704, "step": 11180 }, { "epoch": 0.5082727272727273, "grad_norm": 5.0, "grad_norm_var": 0.12496337890625, "learning_rate": 0.0001, "loss": 5.2529, "loss/crossentropy": 2.267862468957901, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.14987235888838768, "step": 11182 }, { "epoch": 0.5083636363636364, "grad_norm": 5.65625, "grad_norm_var": 0.15592447916666666, "learning_rate": 0.0001, "loss": 5.8897, "loss/crossentropy": 2.652738332748413, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17174392938613892, "step": 11184 }, { "epoch": 0.5084545454545455, "grad_norm": 5.40625, "grad_norm_var": 0.115478515625, "learning_rate": 0.0001, "loss": 5.8375, "loss/crossentropy": 2.6300214529037476, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17074764147400856, "step": 11186 }, { "epoch": 0.5085454545454545, "grad_norm": 4.84375, "grad_norm_var": 0.11868489583333333, "learning_rate": 0.0001, "loss": 5.5449, "loss/crossentropy": 2.407879054546356, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16018923372030258, "step": 11188 }, { "epoch": 0.5086363636363637, "grad_norm": 5.34375, "grad_norm_var": 0.12457275390625, "learning_rate": 0.0001, "loss": 5.5376, "loss/crossentropy": 2.4082972407341003, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16312871500849724, "step": 11190 }, { "epoch": 0.5087272727272727, "grad_norm": 4.65625, "grad_norm_var": 0.12476806640625, "learning_rate": 0.0001, "loss": 5.3197, "loss/crossentropy": 2.25258332490921, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15729264169931412, "step": 11192 }, { "epoch": 0.5088181818181818, "grad_norm": 5.3125, "grad_norm_var": 0.15937093098958333, "learning_rate": 0.0001, "loss": 5.6378, "loss/crossentropy": 2.4291324615478516, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1679399088025093, "step": 11194 }, { "epoch": 0.5089090909090909, "grad_norm": 5.28125, "grad_norm_var": 0.17288004557291667, "learning_rate": 0.0001, "loss": 5.9474, "loss/crossentropy": 2.6826807856559753, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17256944254040718, "step": 11196 }, { "epoch": 0.509, "grad_norm": 4.90625, "grad_norm_var": 0.11916910807291667, "learning_rate": 0.0001, "loss": 5.542, "loss/crossentropy": 2.458086669445038, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15897703543305397, "step": 11198 }, { "epoch": 0.509090909090909, "grad_norm": 5.59375, "grad_norm_var": 0.10699462890625, "learning_rate": 0.0001, "loss": 5.6644, "loss/crossentropy": 2.5044764280319214, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1636507660150528, "step": 11200 }, { "epoch": 0.5091818181818182, "grad_norm": 5.1875, "grad_norm_var": 0.12681884765625, "learning_rate": 0.0001, "loss": 5.6711, "loss/crossentropy": 2.4547852277755737, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.16831351071596146, "step": 11202 }, { "epoch": 0.5092727272727273, "grad_norm": 5.15625, "grad_norm_var": 0.11672770182291667, "learning_rate": 0.0001, "loss": 5.3913, "loss/crossentropy": 2.2388022541999817, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16564138233661652, "step": 11204 }, { "epoch": 0.5093636363636364, "grad_norm": 5.09375, "grad_norm_var": 0.10026041666666667, "learning_rate": 0.0001, "loss": 5.6152, "loss/crossentropy": 2.4977492690086365, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16174833849072456, "step": 11206 }, { "epoch": 0.5094545454545455, "grad_norm": 4.84375, "grad_norm_var": 0.08782145182291666, "learning_rate": 0.0001, "loss": 5.8366, "loss/crossentropy": 2.622305691242218, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1726054660975933, "step": 11208 }, { "epoch": 0.5095454545454545, "grad_norm": 6.0, "grad_norm_var": 0.10675455729166666, "learning_rate": 0.0001, "loss": 5.5119, "loss/crossentropy": 2.359890341758728, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1663772501051426, "step": 11210 }, { "epoch": 0.5096363636363637, "grad_norm": 5.09375, "grad_norm_var": 0.10558268229166666, "learning_rate": 0.0001, "loss": 5.6925, "loss/crossentropy": 2.5712180733680725, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16349238902330399, "step": 11212 }, { "epoch": 0.5097272727272727, "grad_norm": 4.78125, "grad_norm_var": 0.11233317057291667, "learning_rate": 0.0001, "loss": 5.5701, "loss/crossentropy": 2.4420080184936523, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16339293122291565, "step": 11214 }, { "epoch": 0.5098181818181818, "grad_norm": 4.90625, "grad_norm_var": 0.1251953125, "learning_rate": 0.0001, "loss": 5.4777, "loss/crossentropy": 2.39072185754776, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.15596674755215645, "step": 11216 }, { "epoch": 0.5099090909090909, "grad_norm": 5.28125, "grad_norm_var": 0.09685872395833334, "learning_rate": 0.0001, "loss": 5.7872, "loss/crossentropy": 2.621726155281067, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16498788073658943, "step": 11218 }, { "epoch": 0.51, "grad_norm": 4.84375, "grad_norm_var": 0.10644124348958334, "learning_rate": 0.0001, "loss": 5.6702, "loss/crossentropy": 2.574012279510498, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1602066233754158, "step": 11220 }, { "epoch": 0.510090909090909, "grad_norm": 5.15625, "grad_norm_var": 0.10732014973958333, "learning_rate": 0.0001, "loss": 5.5247, "loss/crossentropy": 2.4388739466667175, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.15623850375413895, "step": 11222 }, { "epoch": 0.5101818181818182, "grad_norm": 4.84375, "grad_norm_var": 0.09915364583333333, "learning_rate": 0.0001, "loss": 5.1003, "loss/crossentropy": 2.131421059370041, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.14688862673938274, "step": 11224 }, { "epoch": 0.5102727272727273, "grad_norm": 5.59375, "grad_norm_var": 0.054423014322916664, "learning_rate": 0.0001, "loss": 5.8166, "loss/crossentropy": 2.6077255606651306, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1685430444777012, "step": 11226 }, { "epoch": 0.5103636363636364, "grad_norm": 5.0625, "grad_norm_var": 0.10130208333333333, "learning_rate": 0.0001, "loss": 5.7567, "loss/crossentropy": 2.5751078128814697, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16777273267507553, "step": 11228 }, { "epoch": 0.5104545454545455, "grad_norm": 5.15625, "grad_norm_var": 0.09589436848958334, "learning_rate": 0.0001, "loss": 5.7444, "loss/crossentropy": 2.6053077578544617, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.16645139828324318, "step": 11230 }, { "epoch": 0.5105454545454545, "grad_norm": 5.125, "grad_norm_var": 0.09231363932291667, "learning_rate": 0.0001, "loss": 5.5945, "loss/crossentropy": 2.4977442026138306, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1600637473165989, "step": 11232 }, { "epoch": 0.5106363636363637, "grad_norm": 4.625, "grad_norm_var": 0.10753580729166666, "learning_rate": 0.0001, "loss": 5.6295, "loss/crossentropy": 2.47802472114563, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1647525653243065, "step": 11234 }, { "epoch": 0.5107272727272727, "grad_norm": 5.03125, "grad_norm_var": 0.11510009765625, "learning_rate": 0.0001, "loss": 5.8591, "loss/crossentropy": 2.6492409706115723, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17079529538750648, "step": 11236 }, { "epoch": 0.5108181818181818, "grad_norm": 6.3125, "grad_norm_var": 0.667041015625, "learning_rate": 0.0001, "loss": 5.8356, "loss/crossentropy": 2.4996486008167267, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17909801378846169, "step": 11238 }, { "epoch": 0.5109090909090909, "grad_norm": 5.125, "grad_norm_var": 0.64898681640625, "learning_rate": 0.0001, "loss": 5.9512, "loss/crossentropy": 2.71554434299469, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17317792773246765, "step": 11240 }, { "epoch": 0.511, "grad_norm": 5.09375, "grad_norm_var": 0.6873006184895833, "learning_rate": 0.0001, "loss": 5.2995, "loss/crossentropy": 2.3109885454177856, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.1513907015323639, "step": 11242 }, { "epoch": 0.511090909090909, "grad_norm": 4.25, "grad_norm_var": 0.7361979166666667, "learning_rate": 0.0001, "loss": 5.382, "loss/crossentropy": 2.365031599998474, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1532638669013977, "step": 11244 }, { "epoch": 0.5111818181818182, "grad_norm": 4.78125, "grad_norm_var": 0.7478474934895833, "learning_rate": 0.0001, "loss": 5.4329, "loss/crossentropy": 2.34383761882782, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1585179567337036, "step": 11246 }, { "epoch": 0.5112727272727273, "grad_norm": 5.28125, "grad_norm_var": 0.73375244140625, "learning_rate": 0.0001, "loss": 5.3836, "loss/crossentropy": 2.2885391116142273, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1577443741261959, "step": 11248 }, { "epoch": 0.5113636363636364, "grad_norm": 5.0625, "grad_norm_var": 0.7057902018229166, "learning_rate": 0.0001, "loss": 5.5833, "loss/crossentropy": 2.4429712891578674, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16325156763195992, "step": 11250 }, { "epoch": 0.5114545454545455, "grad_norm": 5.21875, "grad_norm_var": 0.7166015625, "learning_rate": 0.0001, "loss": 5.771, "loss/crossentropy": 2.544038712978363, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.17171870917081833, "step": 11252 }, { "epoch": 0.5115454545454545, "grad_norm": 4.78125, "grad_norm_var": 0.10354410807291667, "learning_rate": 0.0001, "loss": 5.5268, "loss/crossentropy": 2.45944082736969, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15771473571658134, "step": 11254 }, { "epoch": 0.5116363636363637, "grad_norm": 5.0, "grad_norm_var": 0.11086832682291667, "learning_rate": 0.0001, "loss": 5.8442, "loss/crossentropy": 2.627876341342926, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17084677889943123, "step": 11256 }, { "epoch": 0.5117272727272727, "grad_norm": 4.84375, "grad_norm_var": 0.09609375, "learning_rate": 0.0001, "loss": 5.3808, "loss/crossentropy": 2.368621349334717, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15239059180021286, "step": 11258 }, { "epoch": 0.5118181818181818, "grad_norm": 4.625, "grad_norm_var": 0.06444905598958334, "learning_rate": 0.0001, "loss": 5.506, "loss/crossentropy": 2.454044759273529, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1559784822165966, "step": 11260 }, { "epoch": 0.5119090909090909, "grad_norm": 4.90625, "grad_norm_var": 0.07707926432291666, "learning_rate": 0.0001, "loss": 5.75, "loss/crossentropy": 2.561949133872986, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16489893198013306, "step": 11262 }, { "epoch": 0.512, "grad_norm": 5.5, "grad_norm_var": 0.08883056640625, "learning_rate": 0.0001, "loss": 5.404, "loss/crossentropy": 2.350621819496155, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15592044219374657, "step": 11264 }, { "epoch": 0.512090909090909, "grad_norm": 5.5, "grad_norm_var": 0.11170247395833334, "learning_rate": 0.0001, "loss": 5.8682, "loss/crossentropy": 2.5941357612609863, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.1733049862086773, "step": 11266 }, { "epoch": 0.5121818181818182, "grad_norm": 5.0625, "grad_norm_var": 0.11259358723958333, "learning_rate": 0.0001, "loss": 5.5211, "loss/crossentropy": 2.4446956515312195, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15998605638742447, "step": 11268 }, { "epoch": 0.5122727272727273, "grad_norm": 4.90625, "grad_norm_var": 0.11005452473958334, "learning_rate": 0.0001, "loss": 5.906, "loss/crossentropy": 2.6314417123794556, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17667684331536293, "step": 11270 }, { "epoch": 0.5123636363636364, "grad_norm": 4.8125, "grad_norm_var": 0.10364583333333334, "learning_rate": 0.0001, "loss": 5.7884, "loss/crossentropy": 2.67705637216568, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16406847536563873, "step": 11272 }, { "epoch": 0.5124545454545455, "grad_norm": 4.84375, "grad_norm_var": 0.11378580729166667, "learning_rate": 0.0001, "loss": 5.3722, "loss/crossentropy": 2.3938241600990295, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15213844925165176, "step": 11274 }, { "epoch": 0.5125454545454545, "grad_norm": 4.84375, "grad_norm_var": 0.13570556640625, "learning_rate": 0.0001, "loss": 4.8612, "loss/crossentropy": 2.0314290523529053, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.13415092043578625, "step": 11276 }, { "epoch": 0.5126363636363637, "grad_norm": 5.625, "grad_norm_var": 0.14234619140625, "learning_rate": 0.0001, "loss": 6.0807, "loss/crossentropy": 2.776512920856476, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17826558649539948, "step": 11278 }, { "epoch": 0.5127272727272727, "grad_norm": 5.53125, "grad_norm_var": 0.13982747395833334, "learning_rate": 0.0001, "loss": 5.8726, "loss/crossentropy": 2.625965714454651, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17270758002996445, "step": 11280 }, { "epoch": 0.5128181818181818, "grad_norm": 5.0, "grad_norm_var": 0.11093343098958333, "learning_rate": 0.0001, "loss": 5.5931, "loss/crossentropy": 2.541961431503296, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15648197010159492, "step": 11282 }, { "epoch": 0.5129090909090909, "grad_norm": 5.40625, "grad_norm_var": 0.11886393229166667, "learning_rate": 0.0001, "loss": 5.9437, "loss/crossentropy": 2.681400179862976, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1734994687139988, "step": 11284 }, { "epoch": 0.513, "grad_norm": 5.375, "grad_norm_var": 0.13772379557291667, "learning_rate": 0.0001, "loss": 5.3147, "loss/crossentropy": 2.251597225666046, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1563120186328888, "step": 11286 }, { "epoch": 0.513090909090909, "grad_norm": 5.28125, "grad_norm_var": 0.14244791666666667, "learning_rate": 0.0001, "loss": 5.9, "loss/crossentropy": 2.625343441963196, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17512031644582748, "step": 11288 }, { "epoch": 0.5131818181818182, "grad_norm": 5.875, "grad_norm_var": 0.17401936848958333, "learning_rate": 0.0001, "loss": 5.4307, "loss/crossentropy": 2.3568709194660187, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.156207874417305, "step": 11290 }, { "epoch": 0.5132727272727273, "grad_norm": 4.5625, "grad_norm_var": 0.140478515625, "learning_rate": 0.0001, "loss": 5.4737, "loss/crossentropy": 2.390196204185486, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15952178835868835, "step": 11292 }, { "epoch": 0.5133636363636364, "grad_norm": 4.75, "grad_norm_var": 0.1326171875, "learning_rate": 0.0001, "loss": 5.0384, "loss/crossentropy": 2.0992167592048645, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.14469658955931664, "step": 11294 }, { "epoch": 0.5134545454545455, "grad_norm": 4.84375, "grad_norm_var": 0.12336832682291667, "learning_rate": 0.0001, "loss": 5.7897, "loss/crossentropy": 2.5832952857017517, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16966495662927628, "step": 11296 }, { "epoch": 0.5135454545454545, "grad_norm": 4.6875, "grad_norm_var": 0.12649739583333333, "learning_rate": 0.0001, "loss": 5.4852, "loss/crossentropy": 2.4032829999923706, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15995153039693832, "step": 11298 }, { "epoch": 0.5136363636363637, "grad_norm": 5.0625, "grad_norm_var": 0.129541015625, "learning_rate": 0.0001, "loss": 5.1859, "loss/crossentropy": 2.2157799899578094, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.14251796528697014, "step": 11300 }, { "epoch": 0.5137272727272727, "grad_norm": 4.8125, "grad_norm_var": 0.11144205729166666, "learning_rate": 0.0001, "loss": 5.5094, "loss/crossentropy": 2.4428184032440186, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15860801190137863, "step": 11302 }, { "epoch": 0.5138181818181818, "grad_norm": 5.0625, "grad_norm_var": 0.10670572916666667, "learning_rate": 0.0001, "loss": 5.5092, "loss/crossentropy": 2.3933839201927185, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16334276273846626, "step": 11304 }, { "epoch": 0.5139090909090909, "grad_norm": 5.125, "grad_norm_var": 0.9923014322916667, "learning_rate": 0.0001, "loss": 5.7004, "loss/crossentropy": 2.5066319704055786, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16839583218097687, "step": 11306 }, { "epoch": 0.514, "grad_norm": 4.8125, "grad_norm_var": 0.9744425455729167, "learning_rate": 0.0001, "loss": 5.73, "loss/crossentropy": 2.563461422920227, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16587593406438828, "step": 11308 }, { "epoch": 0.514090909090909, "grad_norm": 5.28125, "grad_norm_var": 0.9762654622395833, "learning_rate": 0.0001, "loss": 5.2693, "loss/crossentropy": 2.2280170023441315, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.15158990398049355, "step": 11310 }, { "epoch": 0.5141818181818182, "grad_norm": 4.71875, "grad_norm_var": 1.003759765625, "learning_rate": 0.0001, "loss": 5.7664, "loss/crossentropy": 2.5529858469963074, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16762560978531837, "step": 11312 }, { "epoch": 0.5142727272727273, "grad_norm": 5.25, "grad_norm_var": 0.9845703125, "learning_rate": 0.0001, "loss": 5.6749, "loss/crossentropy": 2.5201463103294373, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1645008623600006, "step": 11314 }, { "epoch": 0.5143636363636364, "grad_norm": 5.875, "grad_norm_var": 0.9862589518229167, "learning_rate": 0.0001, "loss": 5.6073, "loss/crossentropy": 2.3899563252925873, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.16744020953774452, "step": 11316 }, { "epoch": 0.5144545454545455, "grad_norm": 5.28125, "grad_norm_var": 0.965234375, "learning_rate": 0.0001, "loss": 5.6867, "loss/crossentropy": 2.521957039833069, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16628087684512138, "step": 11318 }, { "epoch": 0.5145454545454545, "grad_norm": 4.59375, "grad_norm_var": 0.9792277018229166, "learning_rate": 0.0001, "loss": 5.7318, "loss/crossentropy": 2.572475552558899, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16651428490877151, "step": 11320 }, { "epoch": 0.5146363636363637, "grad_norm": 4.8125, "grad_norm_var": 0.13098551432291666, "learning_rate": 0.0001, "loss": 5.5138, "loss/crossentropy": 2.4212434887886047, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15925529599189758, "step": 11322 }, { "epoch": 0.5147272727272727, "grad_norm": 4.90625, "grad_norm_var": 0.13625895182291667, "learning_rate": 0.0001, "loss": 5.7193, "loss/crossentropy": 2.5840595960617065, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.16605975851416588, "step": 11324 }, { "epoch": 0.5148181818181818, "grad_norm": 4.6875, "grad_norm_var": 0.14247639973958334, "learning_rate": 0.0001, "loss": 5.4408, "loss/crossentropy": 2.384169638156891, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1570272110402584, "step": 11326 }, { "epoch": 0.5149090909090909, "grad_norm": 4.84375, "grad_norm_var": 0.10194905598958333, "learning_rate": 0.0001, "loss": 5.8198, "loss/crossentropy": 2.659976601600647, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16656580194830894, "step": 11328 }, { "epoch": 0.515, "grad_norm": 4.84375, "grad_norm_var": 0.09905192057291666, "learning_rate": 0.0001, "loss": 5.4415, "loss/crossentropy": 2.416977822780609, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15206300094723701, "step": 11330 }, { "epoch": 0.515090909090909, "grad_norm": 4.84375, "grad_norm_var": 0.036356608072916664, "learning_rate": 0.0001, "loss": 5.5099, "loss/crossentropy": 2.418288618326187, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15955467149615288, "step": 11332 }, { "epoch": 0.5151818181818182, "grad_norm": 5.5, "grad_norm_var": 0.061258951822916664, "learning_rate": 0.0001, "loss": 5.5936, "loss/crossentropy": 2.434962213039398, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16644559055566788, "step": 11334 }, { "epoch": 0.5152727272727273, "grad_norm": 5.03125, "grad_norm_var": 0.05, "learning_rate": 0.0001, "loss": 5.5001, "loss/crossentropy": 2.3804167807102203, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16079607233405113, "step": 11336 }, { "epoch": 0.5153636363636364, "grad_norm": 5.65625, "grad_norm_var": 0.08150634765625, "learning_rate": 0.0001, "loss": 5.3627, "loss/crossentropy": 2.2772040367126465, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.15698848478496075, "step": 11338 }, { "epoch": 0.5154545454545455, "grad_norm": 7.375, "grad_norm_var": 0.4273274739583333, "learning_rate": 0.0001, "loss": 5.4102, "loss/crossentropy": 2.2757627069950104, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16071410104632378, "step": 11340 }, { "epoch": 0.5155454545454545, "grad_norm": 4.4375, "grad_norm_var": 0.44021809895833336, "learning_rate": 0.0001, "loss": 5.3302, "loss/crossentropy": 2.3474289178848267, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.15238146856427193, "step": 11342 }, { "epoch": 0.5156363636363637, "grad_norm": 4.9375, "grad_norm_var": 0.45198160807291665, "learning_rate": 0.0001, "loss": 5.862, "loss/crossentropy": 2.7192708253860474, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16602632403373718, "step": 11344 }, { "epoch": 0.5157272727272727, "grad_norm": 4.9375, "grad_norm_var": 0.44894205729166664, "learning_rate": 0.0001, "loss": 5.156, "loss/crossentropy": 2.211173266172409, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1456499621272087, "step": 11346 }, { "epoch": 0.5158181818181818, "grad_norm": 4.6875, "grad_norm_var": 0.4571614583333333, "learning_rate": 0.0001, "loss": 5.7173, "loss/crossentropy": 2.572934925556183, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1648251675069332, "step": 11348 }, { "epoch": 0.5159090909090909, "grad_norm": 4.84375, "grad_norm_var": 0.45050455729166666, "learning_rate": 0.0001, "loss": 5.4838, "loss/crossentropy": 2.3811818063259125, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.1581113263964653, "step": 11350 }, { "epoch": 0.516, "grad_norm": 5.625, "grad_norm_var": 0.47857666015625, "learning_rate": 0.0001, "loss": 5.6055, "loss/crossentropy": 2.480678379535675, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16169650852680206, "step": 11352 }, { "epoch": 0.516090909090909, "grad_norm": 6.3125, "grad_norm_var": 0.56461181640625, "learning_rate": 0.0001, "loss": 5.6702, "loss/crossentropy": 2.4091030955314636, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.17434975504875183, "step": 11354 }, { "epoch": 0.5161818181818182, "grad_norm": 5.15625, "grad_norm_var": 0.214306640625, "learning_rate": 0.0001, "loss": 5.3644, "loss/crossentropy": 2.3537175059318542, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15321481600403786, "step": 11356 }, { "epoch": 0.5162727272727273, "grad_norm": 4.5625, "grad_norm_var": 0.21013997395833334, "learning_rate": 0.0001, "loss": 5.0975, "loss/crossentropy": 2.176319271326065, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.14328520745038986, "step": 11358 }, { "epoch": 0.5163636363636364, "grad_norm": 4.28125, "grad_norm_var": 0.24163004557291667, "learning_rate": 0.0001, "loss": 5.3826, "loss/crossentropy": 2.4029247164726257, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15109299682080746, "step": 11360 }, { "epoch": 0.5164545454545455, "grad_norm": 5.09375, "grad_norm_var": 0.24060872395833333, "learning_rate": 0.0001, "loss": 5.4312, "loss/crossentropy": 2.3360416889190674, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16029243171215057, "step": 11362 }, { "epoch": 0.5165454545454545, "grad_norm": 4.90625, "grad_norm_var": 0.24495035807291668, "learning_rate": 0.0001, "loss": 5.5714, "loss/crossentropy": 2.529927968978882, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1563001684844494, "step": 11364 }, { "epoch": 0.5166363636363637, "grad_norm": 4.6875, "grad_norm_var": 0.26060791015625, "learning_rate": 0.0001, "loss": 5.2423, "loss/crossentropy": 2.2399765253067017, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.15042971074581146, "step": 11366 }, { "epoch": 0.5167272727272727, "grad_norm": 4.71875, "grad_norm_var": 0.233203125, "learning_rate": 0.0001, "loss": 5.5758, "loss/crossentropy": 2.4445695877075195, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16371175274252892, "step": 11368 }, { "epoch": 0.5168181818181818, "grad_norm": 4.875, "grad_norm_var": 0.09798177083333333, "learning_rate": 0.0001, "loss": 5.1736, "loss/crossentropy": 2.2583247125148773, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.14426120929419994, "step": 11370 }, { "epoch": 0.5169090909090909, "grad_norm": 5.28125, "grad_norm_var": 0.09924723307291666, "learning_rate": 0.0001, "loss": 5.942, "loss/crossentropy": 2.7258222103118896, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.17337420210242271, "step": 11372 }, { "epoch": 0.517, "grad_norm": 5.25, "grad_norm_var": 0.09657796223958333, "learning_rate": 0.0001, "loss": 5.7265, "loss/crossentropy": 2.55788791179657, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16588224470615387, "step": 11374 }, { "epoch": 0.517090909090909, "grad_norm": 5.0625, "grad_norm_var": 0.06194254557291667, "learning_rate": 0.0001, "loss": 5.9186, "loss/crossentropy": 2.6688587069511414, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17458417266607285, "step": 11376 }, { "epoch": 0.5171818181818182, "grad_norm": 5.3125, "grad_norm_var": 0.0703125, "learning_rate": 0.0001, "loss": 5.6687, "loss/crossentropy": 2.546813726425171, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16238567233085632, "step": 11378 }, { "epoch": 0.5172727272727272, "grad_norm": 5.78125, "grad_norm_var": 0.10920817057291667, "learning_rate": 0.0001, "loss": 5.1927, "loss/crossentropy": 2.169510066509247, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.14899695292115211, "step": 11380 }, { "epoch": 0.5173636363636364, "grad_norm": 5.0, "grad_norm_var": 0.08892822265625, "learning_rate": 0.0001, "loss": 5.4732, "loss/crossentropy": 2.390267252922058, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15887660905718803, "step": 11382 }, { "epoch": 0.5174545454545455, "grad_norm": 5.25, "grad_norm_var": 0.08547770182291667, "learning_rate": 0.0001, "loss": 5.354, "loss/crossentropy": 2.254630982875824, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15993855521082878, "step": 11384 }, { "epoch": 0.5175454545454545, "grad_norm": 5.125, "grad_norm_var": 0.081884765625, "learning_rate": 0.0001, "loss": 6.0824, "loss/crossentropy": 2.7935284972190857, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17692938819527626, "step": 11386 }, { "epoch": 0.5176363636363637, "grad_norm": 5.75, "grad_norm_var": 0.13004150390625, "learning_rate": 0.0001, "loss": 5.4243, "loss/crossentropy": 2.387833058834076, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.15247710794210434, "step": 11388 }, { "epoch": 0.5177272727272727, "grad_norm": 5.4375, "grad_norm_var": 0.153369140625, "learning_rate": 0.0001, "loss": 5.511, "loss/crossentropy": 2.4559649229049683, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1558930166065693, "step": 11390 }, { "epoch": 0.5178181818181818, "grad_norm": 4.75, "grad_norm_var": 0.1671875, "learning_rate": 0.0001, "loss": 5.5202, "loss/crossentropy": 2.4777923822402954, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15463513135910034, "step": 11392 }, { "epoch": 0.5179090909090909, "grad_norm": 4.90625, "grad_norm_var": 0.16783854166666667, "learning_rate": 0.0001, "loss": 5.7455, "loss/crossentropy": 2.5909385681152344, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16545889526605606, "step": 11394 }, { "epoch": 0.518, "grad_norm": 5.875, "grad_norm_var": 0.16365559895833334, "learning_rate": 0.0001, "loss": 5.5323, "loss/crossentropy": 2.4424089193344116, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.15742620453238487, "step": 11396 }, { "epoch": 0.518090909090909, "grad_norm": 5.5625, "grad_norm_var": 0.19464518229166666, "learning_rate": 0.0001, "loss": 5.58, "loss/crossentropy": 2.4999775886535645, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16112461686134338, "step": 11398 }, { "epoch": 0.5181818181818182, "grad_norm": 5.1875, "grad_norm_var": 0.19156494140625, "learning_rate": 0.0001, "loss": 5.9176, "loss/crossentropy": 2.7095948457717896, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.17255688831210136, "step": 11400 }, { "epoch": 0.5182727272727272, "grad_norm": 4.9375, "grad_norm_var": 0.201025390625, "learning_rate": 0.0001, "loss": 5.2092, "loss/crossentropy": 2.249884247779846, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.14827637746930122, "step": 11402 }, { "epoch": 0.5183636363636364, "grad_norm": 4.78125, "grad_norm_var": 0.14933268229166666, "learning_rate": 0.0001, "loss": 5.7739, "loss/crossentropy": 2.6687061488628387, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16052328422665596, "step": 11404 }, { "epoch": 0.5184545454545455, "grad_norm": 5.03125, "grad_norm_var": 0.1271484375, "learning_rate": 0.0001, "loss": 5.9265, "loss/crossentropy": 2.763790249824524, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16939810290932655, "step": 11406 }, { "epoch": 0.5185454545454545, "grad_norm": 5.5625, "grad_norm_var": 0.14244384765625, "learning_rate": 0.0001, "loss": 5.6622, "loss/crossentropy": 2.5578179359436035, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1612205021083355, "step": 11408 }, { "epoch": 0.5186363636363637, "grad_norm": 5.0, "grad_norm_var": 0.13498942057291666, "learning_rate": 0.0001, "loss": 5.257, "loss/crossentropy": 2.2030357122421265, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1550075300037861, "step": 11410 }, { "epoch": 0.5187272727272727, "grad_norm": 5.65625, "grad_norm_var": 0.123681640625, "learning_rate": 0.0001, "loss": 5.6982, "loss/crossentropy": 2.488339841365814, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.1688385307788849, "step": 11412 }, { "epoch": 0.5188181818181818, "grad_norm": 5.75, "grad_norm_var": 0.217041015625, "learning_rate": 0.0001, "loss": 5.5888, "loss/crossentropy": 2.4880114793777466, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.16262184083461761, "step": 11414 }, { "epoch": 0.5189090909090909, "grad_norm": 5.1875, "grad_norm_var": 0.23046875, "learning_rate": 0.0001, "loss": 5.8365, "loss/crossentropy": 2.6189099550247192, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.1703910455107689, "step": 11416 }, { "epoch": 0.519, "grad_norm": 4.90625, "grad_norm_var": 0.2142578125, "learning_rate": 0.0001, "loss": 5.5589, "loss/crossentropy": 2.3746370673179626, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16842889040708542, "step": 11418 }, { "epoch": 0.519090909090909, "grad_norm": 5.75, "grad_norm_var": 0.24737955729166666, "learning_rate": 0.0001, "loss": 5.5748, "loss/crossentropy": 2.3437707722187042, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1715366132557392, "step": 11420 }, { "epoch": 0.5191818181818182, "grad_norm": 5.15625, "grad_norm_var": 0.22066650390625, "learning_rate": 0.0001, "loss": 5.278, "loss/crossentropy": 2.2031770050525665, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15865514799952507, "step": 11422 }, { "epoch": 0.5192727272727272, "grad_norm": 4.75, "grad_norm_var": 0.2408203125, "learning_rate": 0.0001, "loss": 5.4407, "loss/crossentropy": 2.3997054994106293, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15624727681279182, "step": 11424 }, { "epoch": 0.5193636363636364, "grad_norm": 4.9375, "grad_norm_var": 0.24433186848958333, "learning_rate": 0.0001, "loss": 5.5384, "loss/crossentropy": 2.4400044083595276, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16198882460594177, "step": 11426 }, { "epoch": 0.5194545454545455, "grad_norm": 5.0625, "grad_norm_var": 0.23007405598958333, "learning_rate": 0.0001, "loss": 5.4773, "loss/crossentropy": 2.355273127555847, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16141759604215622, "step": 11428 }, { "epoch": 0.5195454545454545, "grad_norm": 5.1875, "grad_norm_var": 0.18860677083333333, "learning_rate": 0.0001, "loss": 5.5787, "loss/crossentropy": 2.4835284054279327, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.15932218730449677, "step": 11430 }, { "epoch": 0.5196363636363637, "grad_norm": 5.28125, "grad_norm_var": 0.20220947265625, "learning_rate": 0.0001, "loss": 5.5927, "loss/crossentropy": 2.5181837677955627, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16018524393439293, "step": 11432 }, { "epoch": 0.5197272727272727, "grad_norm": 5.09375, "grad_norm_var": 0.19576416015625, "learning_rate": 0.0001, "loss": 5.2714, "loss/crossentropy": 2.2736304998397827, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1513417512178421, "step": 11434 }, { "epoch": 0.5198181818181818, "grad_norm": 5.0, "grad_norm_var": 0.10260009765625, "learning_rate": 0.0001, "loss": 5.1894, "loss/crossentropy": 2.2153303027153015, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.14858205989003181, "step": 11436 }, { "epoch": 0.5199090909090909, "grad_norm": 5.40625, "grad_norm_var": 0.110791015625, "learning_rate": 0.0001, "loss": 5.8924, "loss/crossentropy": 2.658789575099945, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17257612571120262, "step": 11438 }, { "epoch": 0.52, "grad_norm": 4.71875, "grad_norm_var": 0.125244140625, "learning_rate": 0.0001, "loss": 5.6104, "loss/crossentropy": 2.510727196931839, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16055257990956306, "step": 11440 }, { "epoch": 0.520090909090909, "grad_norm": 5.15625, "grad_norm_var": 0.12170817057291666, "learning_rate": 0.0001, "loss": 6.24, "loss/crossentropy": 2.8361973762512207, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.19116373360157013, "step": 11442 }, { "epoch": 0.5201818181818182, "grad_norm": 5.0, "grad_norm_var": 0.10128580729166667, "learning_rate": 0.0001, "loss": 5.6634, "loss/crossentropy": 2.524344503879547, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16780756786465645, "step": 11444 }, { "epoch": 0.5202727272727272, "grad_norm": 4.71875, "grad_norm_var": 0.10227457682291667, "learning_rate": 0.0001, "loss": 5.4633, "loss/crossentropy": 2.3295190930366516, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16435548663139343, "step": 11446 }, { "epoch": 0.5203636363636364, "grad_norm": 5.21875, "grad_norm_var": 0.12980143229166666, "learning_rate": 0.0001, "loss": 5.8949, "loss/crossentropy": 2.661980092525482, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.170556478202343, "step": 11448 }, { "epoch": 0.5204545454545455, "grad_norm": 5.5625, "grad_norm_var": 0.12578125, "learning_rate": 0.0001, "loss": 5.9482, "loss/crossentropy": 2.65312123298645, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.177357729524374, "step": 11450 }, { "epoch": 0.5205454545454545, "grad_norm": 5.25, "grad_norm_var": 0.11248372395833334, "learning_rate": 0.0001, "loss": 5.6772, "loss/crossentropy": 2.546341061592102, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16269570216536522, "step": 11452 }, { "epoch": 0.5206363636363637, "grad_norm": 8.125, "grad_norm_var": 0.6403483072916667, "learning_rate": 0.0001, "loss": 5.3643, "loss/crossentropy": 2.1597833931446075, "loss/hidden": 1.599609375, "loss/jsd": 0.0, "loss/logits": 0.16049188002943993, "step": 11454 }, { "epoch": 0.5207272727272727, "grad_norm": 4.71875, "grad_norm_var": 0.6419881184895834, "learning_rate": 0.0001, "loss": 5.5811, "loss/crossentropy": 2.447795569896698, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1648901328444481, "step": 11456 }, { "epoch": 0.5208181818181818, "grad_norm": 5.5, "grad_norm_var": 0.63980712890625, "learning_rate": 0.0001, "loss": 6.1052, "loss/crossentropy": 2.827401340007782, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17660382762551308, "step": 11458 }, { "epoch": 0.5209090909090909, "grad_norm": 4.6875, "grad_norm_var": 0.6649739583333333, "learning_rate": 0.0001, "loss": 5.5856, "loss/crossentropy": 2.500984847545624, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16021526604890823, "step": 11460 }, { "epoch": 0.521, "grad_norm": 4.90625, "grad_norm_var": 0.7337239583333334, "learning_rate": 0.0001, "loss": 5.126, "loss/crossentropy": 2.2035148441791534, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.14693856239318848, "step": 11462 }, { "epoch": 0.521090909090909, "grad_norm": 5.3125, "grad_norm_var": 0.7050618489583333, "learning_rate": 0.0001, "loss": 5.5091, "loss/crossentropy": 2.352773368358612, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16426919028162956, "step": 11464 }, { "epoch": 0.5211818181818182, "grad_norm": 4.75, "grad_norm_var": 0.7121378580729166, "learning_rate": 0.0001, "loss": 5.4145, "loss/crossentropy": 2.3137649595737457, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.15773220732808113, "step": 11466 }, { "epoch": 0.5212727272727272, "grad_norm": 4.90625, "grad_norm_var": 0.71324462890625, "learning_rate": 0.0001, "loss": 5.7179, "loss/crossentropy": 2.565337836742401, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16467201709747314, "step": 11468 }, { "epoch": 0.5213636363636364, "grad_norm": 5.0625, "grad_norm_var": 0.11315104166666666, "learning_rate": 0.0001, "loss": 5.5783, "loss/crossentropy": 2.457634150981903, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1618700474500656, "step": 11470 }, { "epoch": 0.5214545454545455, "grad_norm": 5.40625, "grad_norm_var": 0.12108968098958334, "learning_rate": 0.0001, "loss": 5.5921, "loss/crossentropy": 2.528220236301422, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15893124043941498, "step": 11472 }, { "epoch": 0.5215454545454545, "grad_norm": 4.90625, "grad_norm_var": 0.11138916015625, "learning_rate": 0.0001, "loss": 5.4638, "loss/crossentropy": 2.486754357814789, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15141934528946877, "step": 11474 }, { "epoch": 0.5216363636363637, "grad_norm": 5.1875, "grad_norm_var": 0.11047770182291666, "learning_rate": 0.0001, "loss": 5.4699, "loss/crossentropy": 2.3940068185329437, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1570039838552475, "step": 11476 }, { "epoch": 0.5217272727272727, "grad_norm": 5.0, "grad_norm_var": 0.07355143229166666, "learning_rate": 0.0001, "loss": 5.4108, "loss/crossentropy": 2.3701483011245728, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15504568815231323, "step": 11478 }, { "epoch": 0.5218181818181818, "grad_norm": 4.6875, "grad_norm_var": 0.06054280598958333, "learning_rate": 0.0001, "loss": 5.4618, "loss/crossentropy": 2.42620712518692, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15590038150548935, "step": 11480 }, { "epoch": 0.5219090909090909, "grad_norm": 4.625, "grad_norm_var": 0.056640625, "learning_rate": 0.0001, "loss": 5.6726, "loss/crossentropy": 2.5711668729782104, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16366126388311386, "step": 11482 }, { "epoch": 0.522, "grad_norm": 5.34375, "grad_norm_var": 2.6347615559895834, "learning_rate": 0.0001, "loss": 5.3189, "loss/crossentropy": 2.3260642290115356, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.14713888987898827, "step": 11484 }, { "epoch": 0.522090909090909, "grad_norm": 5.15625, "grad_norm_var": 2.6177734375, "learning_rate": 0.0001, "loss": 5.8759, "loss/crossentropy": 2.6344969868659973, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17394323647022247, "step": 11486 }, { "epoch": 0.5221818181818182, "grad_norm": 5.34375, "grad_norm_var": 2.59644775390625, "learning_rate": 0.0001, "loss": 5.8951, "loss/crossentropy": 2.6187047362327576, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.17666680365800858, "step": 11488 }, { "epoch": 0.5222727272727272, "grad_norm": 6.0625, "grad_norm_var": 2.6748046875, "learning_rate": 0.0001, "loss": 5.4564, "loss/crossentropy": 2.337107867002487, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16095517575740814, "step": 11490 }, { "epoch": 0.5223636363636364, "grad_norm": 5.6875, "grad_norm_var": 2.66109619140625, "learning_rate": 0.0001, "loss": 5.9471, "loss/crossentropy": 2.671757400035858, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17557881772518158, "step": 11492 }, { "epoch": 0.5224545454545455, "grad_norm": 4.90625, "grad_norm_var": 2.631884765625, "learning_rate": 0.0001, "loss": 5.6769, "loss/crossentropy": 2.489877760410309, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16929200291633606, "step": 11494 }, { "epoch": 0.5225454545454545, "grad_norm": 5.28125, "grad_norm_var": 2.549072265625, "learning_rate": 0.0001, "loss": 5.9899, "loss/crossentropy": 2.7057414054870605, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.17470479011535645, "step": 11496 }, { "epoch": 0.5226363636363637, "grad_norm": 4.75, "grad_norm_var": 2.51597900390625, "learning_rate": 0.0001, "loss": 5.4272, "loss/crossentropy": 2.4006329774856567, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15578345954418182, "step": 11498 }, { "epoch": 0.5227272727272727, "grad_norm": 6.125, "grad_norm_var": 0.4276041666666667, "learning_rate": 0.0001, "loss": 5.6241, "loss/crossentropy": 2.3832620978355408, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.16920045390725136, "step": 11500 }, { "epoch": 0.5228181818181818, "grad_norm": 5.09375, "grad_norm_var": 0.4161295572916667, "learning_rate": 0.0001, "loss": 5.65, "loss/crossentropy": 2.4329758882522583, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.168580774217844, "step": 11502 }, { "epoch": 0.5229090909090909, "grad_norm": 5.03125, "grad_norm_var": 0.49361572265625, "learning_rate": 0.0001, "loss": 5.7085, "loss/crossentropy": 2.5622613728046417, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16501890867948532, "step": 11504 }, { "epoch": 0.523, "grad_norm": 5.25, "grad_norm_var": 0.3470052083333333, "learning_rate": 0.0001, "loss": 5.9788, "loss/crossentropy": 2.6987921595573425, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17780274152755737, "step": 11506 }, { "epoch": 0.523090909090909, "grad_norm": 4.875, "grad_norm_var": 0.346337890625, "learning_rate": 0.0001, "loss": 5.8071, "loss/crossentropy": 2.6378459334373474, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16653570160269737, "step": 11508 }, { "epoch": 0.5231818181818182, "grad_norm": 4.9375, "grad_norm_var": 0.38943684895833336, "learning_rate": 0.0001, "loss": 5.2433, "loss/crossentropy": 2.206461250782013, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1534872055053711, "step": 11510 }, { "epoch": 0.5232727272727272, "grad_norm": 6.0, "grad_norm_var": 0.4071248372395833, "learning_rate": 0.0001, "loss": 5.6086, "loss/crossentropy": 2.427219331264496, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1646215170621872, "step": 11512 }, { "epoch": 0.5233636363636364, "grad_norm": 6.0, "grad_norm_var": 0.4091796875, "learning_rate": 0.0001, "loss": 5.7567, "loss/crossentropy": 2.439964920282364, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.17191115766763687, "step": 11514 }, { "epoch": 0.5234545454545455, "grad_norm": 5.28125, "grad_norm_var": 0.33238525390625, "learning_rate": 0.0001, "loss": 6.1568, "loss/crossentropy": 2.848033905029297, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.17990372329950333, "step": 11516 }, { "epoch": 0.5235454545454545, "grad_norm": 4.875, "grad_norm_var": 0.37509358723958336, "learning_rate": 0.0001, "loss": 5.7325, "loss/crossentropy": 2.5679110288619995, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16646388918161392, "step": 11518 }, { "epoch": 0.5236363636363637, "grad_norm": 5.21875, "grad_norm_var": 0.3206868489583333, "learning_rate": 0.0001, "loss": 5.4836, "loss/crossentropy": 2.4734872579574585, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15140066668391228, "step": 11520 }, { "epoch": 0.5237272727272727, "grad_norm": 5.0625, "grad_norm_var": 0.33707275390625, "learning_rate": 0.0001, "loss": 5.5566, "loss/crossentropy": 2.4273542165756226, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1613624170422554, "step": 11522 }, { "epoch": 0.5238181818181818, "grad_norm": 5.40625, "grad_norm_var": 0.31881103515625, "learning_rate": 0.0001, "loss": 5.7165, "loss/crossentropy": 2.533910036087036, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16806117817759514, "step": 11524 }, { "epoch": 0.5239090909090909, "grad_norm": 4.78125, "grad_norm_var": 0.310400390625, "learning_rate": 0.0001, "loss": 5.3446, "loss/crossentropy": 2.336532175540924, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15178417041897774, "step": 11526 }, { "epoch": 0.524, "grad_norm": 4.75, "grad_norm_var": 0.29217122395833334, "learning_rate": 0.0001, "loss": 5.6224, "loss/crossentropy": 2.560520201921463, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15950957685709, "step": 11528 }, { "epoch": 0.524090909090909, "grad_norm": 4.59375, "grad_norm_var": 0.15872395833333333, "learning_rate": 0.0001, "loss": 5.1753, "loss/crossentropy": 2.187934249639511, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1499098688364029, "step": 11530 }, { "epoch": 0.5241818181818182, "grad_norm": 5.0, "grad_norm_var": 0.15813802083333334, "learning_rate": 0.0001, "loss": 5.3689, "loss/crossentropy": 2.3190948367118835, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1538068987429142, "step": 11532 }, { "epoch": 0.5242727272727272, "grad_norm": 5.0625, "grad_norm_var": 0.07472330729166667, "learning_rate": 0.0001, "loss": 5.6095, "loss/crossentropy": 2.4901386499404907, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1580318845808506, "step": 11534 }, { "epoch": 0.5243636363636364, "grad_norm": 5.09375, "grad_norm_var": 0.07812093098958334, "learning_rate": 0.0001, "loss": 5.6773, "loss/crossentropy": 2.5465354323387146, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.1617058888077736, "step": 11536 }, { "epoch": 0.5244545454545455, "grad_norm": 5.5625, "grad_norm_var": 0.22628580729166667, "learning_rate": 0.0001, "loss": 5.876, "loss/crossentropy": 2.627891182899475, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.17344705015420914, "step": 11538 }, { "epoch": 0.5245454545454545, "grad_norm": 5.46875, "grad_norm_var": 0.21926676432291667, "learning_rate": 0.0001, "loss": 5.6861, "loss/crossentropy": 2.4310396015644073, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17160186916589737, "step": 11540 }, { "epoch": 0.5246363636363637, "grad_norm": 5.125, "grad_norm_var": 0.21112874348958333, "learning_rate": 0.0001, "loss": 5.3056, "loss/crossentropy": 2.3273326456546783, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.1503661759197712, "step": 11542 }, { "epoch": 0.5247272727272727, "grad_norm": 5.0625, "grad_norm_var": 0.20110270182291667, "learning_rate": 0.0001, "loss": 5.5753, "loss/crossentropy": 2.4227343797683716, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16447628289461136, "step": 11544 }, { "epoch": 0.5248181818181819, "grad_norm": 5.3125, "grad_norm_var": 0.17589518229166667, "learning_rate": 0.0001, "loss": 5.9456, "loss/crossentropy": 2.736989736557007, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17242559418082237, "step": 11546 }, { "epoch": 0.5249090909090909, "grad_norm": 4.9375, "grad_norm_var": 0.17974853515625, "learning_rate": 0.0001, "loss": 5.4083, "loss/crossentropy": 2.394290566444397, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15413412638008595, "step": 11548 }, { "epoch": 0.525, "grad_norm": 4.9375, "grad_norm_var": 0.20989176432291667, "learning_rate": 0.0001, "loss": 5.2792, "loss/crossentropy": 2.2813682854175568, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1529046967625618, "step": 11550 }, { "epoch": 0.525090909090909, "grad_norm": 5.21875, "grad_norm_var": 0.246728515625, "learning_rate": 0.0001, "loss": 5.6256, "loss/crossentropy": 2.515303075313568, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16220545023679733, "step": 11552 }, { "epoch": 0.5251818181818182, "grad_norm": 4.6875, "grad_norm_var": 0.096337890625, "learning_rate": 0.0001, "loss": 5.6174, "loss/crossentropy": 2.527224600315094, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16233909130096436, "step": 11554 }, { "epoch": 0.5252727272727272, "grad_norm": 6.28125, "grad_norm_var": 0.18593343098958334, "learning_rate": 0.0001, "loss": 5.6319, "loss/crossentropy": 2.495100259780884, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16212228313088417, "step": 11556 }, { "epoch": 0.5253636363636364, "grad_norm": 5.15625, "grad_norm_var": 0.22164306640625, "learning_rate": 0.0001, "loss": 5.8962, "loss/crossentropy": 2.6610118746757507, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.17215410992503166, "step": 11558 }, { "epoch": 0.5254545454545455, "grad_norm": 4.84375, "grad_norm_var": 0.24029947916666666, "learning_rate": 0.0001, "loss": 5.5323, "loss/crossentropy": 2.424234390258789, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1623692661523819, "step": 11560 }, { "epoch": 0.5255454545454545, "grad_norm": 4.5625, "grad_norm_var": 0.25758056640625, "learning_rate": 0.0001, "loss": 5.2474, "loss/crossentropy": 2.217895746231079, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15216735377907753, "step": 11562 }, { "epoch": 0.5256363636363637, "grad_norm": 4.75, "grad_norm_var": 0.2607421875, "learning_rate": 0.0001, "loss": 5.4529, "loss/crossentropy": 2.3945156037807465, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.1560349501669407, "step": 11564 }, { "epoch": 0.5257272727272727, "grad_norm": 4.90625, "grad_norm_var": 0.29250895182291664, "learning_rate": 0.0001, "loss": 5.8089, "loss/crossentropy": 2.6121813654899597, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1638164035975933, "step": 11566 }, { "epoch": 0.5258181818181819, "grad_norm": 5.03125, "grad_norm_var": 0.26080729166666666, "learning_rate": 0.0001, "loss": 5.4218, "loss/crossentropy": 2.402151584625244, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.15040480345487595, "step": 11568 }, { "epoch": 0.5259090909090909, "grad_norm": 5.15625, "grad_norm_var": 0.24342041015625, "learning_rate": 0.0001, "loss": 5.7599, "loss/crossentropy": 2.583667039871216, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16664284840226173, "step": 11570 }, { "epoch": 0.526, "grad_norm": 4.84375, "grad_norm_var": 0.17528889973958334, "learning_rate": 0.0001, "loss": 5.4527, "loss/crossentropy": 2.390179932117462, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15781289711594582, "step": 11572 }, { "epoch": 0.526090909090909, "grad_norm": 4.875, "grad_norm_var": 0.14798177083333333, "learning_rate": 0.0001, "loss": 5.4225, "loss/crossentropy": 2.3991548120975494, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15448780357837677, "step": 11574 }, { "epoch": 0.5261818181818182, "grad_norm": 5.71875, "grad_norm_var": 0.14940999348958334, "learning_rate": 0.0001, "loss": 5.8785, "loss/crossentropy": 2.7165735363960266, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.1663876622915268, "step": 11576 }, { "epoch": 0.5262727272727272, "grad_norm": 5.28125, "grad_norm_var": 0.129541015625, "learning_rate": 0.0001, "loss": 5.6353, "loss/crossentropy": 2.5132967829704285, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16180655360221863, "step": 11578 }, { "epoch": 0.5263636363636364, "grad_norm": 4.90625, "grad_norm_var": 0.12489827473958333, "learning_rate": 0.0001, "loss": 5.6716, "loss/crossentropy": 2.5096209049224854, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1654185615479946, "step": 11580 }, { "epoch": 0.5264545454545455, "grad_norm": 4.71875, "grad_norm_var": 0.08437093098958333, "learning_rate": 0.0001, "loss": 5.6622, "loss/crossentropy": 2.5605412125587463, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16133761778473854, "step": 11582 }, { "epoch": 0.5265454545454545, "grad_norm": 4.78125, "grad_norm_var": 0.09060872395833333, "learning_rate": 0.0001, "loss": 5.7532, "loss/crossentropy": 2.5263421535491943, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17150931432843208, "step": 11584 }, { "epoch": 0.5266363636363637, "grad_norm": 4.625, "grad_norm_var": 0.13619791666666667, "learning_rate": 0.0001, "loss": 5.8729, "loss/crossentropy": 2.7402184009552, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16444235295057297, "step": 11586 }, { "epoch": 0.5267272727272727, "grad_norm": 5.34375, "grad_norm_var": 0.336572265625, "learning_rate": 0.0001, "loss": 5.8128, "loss/crossentropy": 2.555798888206482, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17549997568130493, "step": 11588 }, { "epoch": 0.5268181818181819, "grad_norm": 5.3125, "grad_norm_var": 0.3340983072916667, "learning_rate": 0.0001, "loss": 5.9195, "loss/crossentropy": 2.696958601474762, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17225219309329987, "step": 11590 }, { "epoch": 0.5269090909090909, "grad_norm": 4.96875, "grad_norm_var": 0.33865559895833336, "learning_rate": 0.0001, "loss": 5.3239, "loss/crossentropy": 2.294593721628189, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15429406985640526, "step": 11592 }, { "epoch": 0.527, "grad_norm": 4.71875, "grad_norm_var": 0.3480428059895833, "learning_rate": 0.0001, "loss": 5.5557, "loss/crossentropy": 2.4792935848236084, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1586216203868389, "step": 11594 }, { "epoch": 0.527090909090909, "grad_norm": 4.71875, "grad_norm_var": 0.36847330729166666, "learning_rate": 0.0001, "loss": 5.4953, "loss/crossentropy": 2.477485716342926, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1549060009419918, "step": 11596 }, { "epoch": 0.5271818181818182, "grad_norm": 5.09375, "grad_norm_var": 0.3549112955729167, "learning_rate": 0.0001, "loss": 5.8819, "loss/crossentropy": 2.625824987888336, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.17306401580572128, "step": 11598 }, { "epoch": 0.5272727272727272, "grad_norm": 5.28125, "grad_norm_var": 0.3473307291666667, "learning_rate": 0.0001, "loss": 6.0317, "loss/crossentropy": 2.758597433567047, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.17789938673377037, "step": 11600 }, { "epoch": 0.5273636363636364, "grad_norm": 4.9375, "grad_norm_var": 0.28592122395833336, "learning_rate": 0.0001, "loss": 5.7264, "loss/crossentropy": 2.581452190876007, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16312434524297714, "step": 11602 }, { "epoch": 0.5274545454545455, "grad_norm": 5.0, "grad_norm_var": 0.11077067057291666, "learning_rate": 0.0001, "loss": 5.7988, "loss/crossentropy": 2.6411542892456055, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1677161306142807, "step": 11604 }, { "epoch": 0.5275454545454545, "grad_norm": 5.0, "grad_norm_var": 0.097119140625, "learning_rate": 0.0001, "loss": 5.5884, "loss/crossentropy": 2.4576746225357056, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16326841339468956, "step": 11606 }, { "epoch": 0.5276363636363637, "grad_norm": 4.6875, "grad_norm_var": 0.10974934895833334, "learning_rate": 0.0001, "loss": 5.4934, "loss/crossentropy": 2.4006629586219788, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16122816503047943, "step": 11608 }, { "epoch": 0.5277272727272727, "grad_norm": 5.15625, "grad_norm_var": 0.11263020833333333, "learning_rate": 0.0001, "loss": 5.6265, "loss/crossentropy": 2.5144362449645996, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16276629269123077, "step": 11610 }, { "epoch": 0.5278181818181819, "grad_norm": 5.0625, "grad_norm_var": 0.09034830729166667, "learning_rate": 0.0001, "loss": 5.9381, "loss/crossentropy": 2.7489136457443237, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.17028168588876724, "step": 11612 }, { "epoch": 0.5279090909090909, "grad_norm": 4.71875, "grad_norm_var": 0.11903889973958333, "learning_rate": 0.0001, "loss": 5.2029, "loss/crossentropy": 2.2286249697208405, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.14840072020888329, "step": 11614 }, { "epoch": 0.528, "grad_norm": 5.0625, "grad_norm_var": 0.12068684895833333, "learning_rate": 0.0001, "loss": 5.2158, "loss/crossentropy": 2.184873938560486, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.1532907448709011, "step": 11616 }, { "epoch": 0.528090909090909, "grad_norm": 5.03125, "grad_norm_var": 0.15211181640625, "learning_rate": 0.0001, "loss": 5.481, "loss/crossentropy": 2.4617788195610046, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15504543483257294, "step": 11618 }, { "epoch": 0.5281818181818182, "grad_norm": 4.53125, "grad_norm_var": 0.11067708333333333, "learning_rate": 0.0001, "loss": 5.5992, "loss/crossentropy": 2.485476016998291, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16039685159921646, "step": 11620 }, { "epoch": 0.5282727272727272, "grad_norm": 5.28125, "grad_norm_var": 0.11845296223958333, "learning_rate": 0.0001, "loss": 5.7033, "loss/crossentropy": 2.5420307517051697, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16632424667477608, "step": 11622 }, { "epoch": 0.5283636363636364, "grad_norm": 4.53125, "grad_norm_var": 0.10271809895833334, "learning_rate": 0.0001, "loss": 5.4248, "loss/crossentropy": 2.4030017256736755, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15472382307052612, "step": 11624 }, { "epoch": 0.5284545454545454, "grad_norm": 5.4375, "grad_norm_var": 0.09542643229166667, "learning_rate": 0.0001, "loss": 5.6959, "loss/crossentropy": 2.5539764761924744, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1634124480187893, "step": 11626 }, { "epoch": 0.5285454545454545, "grad_norm": 5.03125, "grad_norm_var": 0.09625244140625, "learning_rate": 0.0001, "loss": 5.767, "loss/crossentropy": 2.6526360511779785, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16378260403871536, "step": 11628 }, { "epoch": 0.5286363636363637, "grad_norm": 5.34375, "grad_norm_var": 0.12029622395833334, "learning_rate": 0.0001, "loss": 5.4258, "loss/crossentropy": 2.359153628349304, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15979433059692383, "step": 11630 }, { "epoch": 0.5287272727272727, "grad_norm": 4.53125, "grad_norm_var": 0.129150390625, "learning_rate": 0.0001, "loss": 5.341, "loss/crossentropy": 2.3381743133068085, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15301954373717308, "step": 11632 }, { "epoch": 0.5288181818181819, "grad_norm": 4.78125, "grad_norm_var": 0.1087890625, "learning_rate": 0.0001, "loss": 5.5966, "loss/crossentropy": 2.508383870124817, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16018449142575264, "step": 11634 }, { "epoch": 0.5289090909090909, "grad_norm": 4.375, "grad_norm_var": 0.11064046223958333, "learning_rate": 0.0001, "loss": 5.6689, "loss/crossentropy": 2.614964723587036, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.15949412435293198, "step": 11636 }, { "epoch": 0.529, "grad_norm": 4.875, "grad_norm_var": 0.12513020833333333, "learning_rate": 0.0001, "loss": 5.5694, "loss/crossentropy": 2.3919665217399597, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16969557106494904, "step": 11638 }, { "epoch": 0.5290909090909091, "grad_norm": 5.71875, "grad_norm_var": 0.15836181640625, "learning_rate": 0.0001, "loss": 5.8369, "loss/crossentropy": 2.6039292216300964, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.17271031439304352, "step": 11640 }, { "epoch": 0.5291818181818182, "grad_norm": 4.875, "grad_norm_var": 0.13943684895833333, "learning_rate": 0.0001, "loss": 5.8409, "loss/crossentropy": 2.6346781849861145, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1713993363082409, "step": 11642 }, { "epoch": 0.5292727272727272, "grad_norm": 5.0625, "grad_norm_var": 0.14019775390625, "learning_rate": 0.0001, "loss": 5.4941, "loss/crossentropy": 2.440717101097107, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15749037638306618, "step": 11644 }, { "epoch": 0.5293636363636364, "grad_norm": 5.5, "grad_norm_var": 0.13938395182291666, "learning_rate": 0.0001, "loss": 5.6695, "loss/crossentropy": 2.557267725467682, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.1637645997107029, "step": 11646 }, { "epoch": 0.5294545454545454, "grad_norm": 5.0625, "grad_norm_var": 0.1302734375, "learning_rate": 0.0001, "loss": 5.4034, "loss/crossentropy": 2.358051985502243, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15668338537216187, "step": 11648 }, { "epoch": 0.5295454545454545, "grad_norm": 5.15625, "grad_norm_var": 0.133837890625, "learning_rate": 0.0001, "loss": 5.2725, "loss/crossentropy": 2.2257463932037354, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.15135956183075905, "step": 11650 }, { "epoch": 0.5296363636363637, "grad_norm": 5.125, "grad_norm_var": 0.09934895833333333, "learning_rate": 0.0001, "loss": 5.7998, "loss/crossentropy": 2.6513434648513794, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16679920256137848, "step": 11652 }, { "epoch": 0.5297272727272727, "grad_norm": 4.8125, "grad_norm_var": 0.08824462890625, "learning_rate": 0.0001, "loss": 5.6563, "loss/crossentropy": 2.563631236553192, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16161536425352097, "step": 11654 }, { "epoch": 0.5298181818181819, "grad_norm": 4.84375, "grad_norm_var": 0.05390625, "learning_rate": 0.0001, "loss": 5.8201, "loss/crossentropy": 2.666722357273102, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16767966747283936, "step": 11656 }, { "epoch": 0.5299090909090909, "grad_norm": 4.65625, "grad_norm_var": 0.07685139973958334, "learning_rate": 0.0001, "loss": 5.5374, "loss/crossentropy": 2.511270970106125, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15398243069648743, "step": 11658 }, { "epoch": 0.53, "grad_norm": 4.8125, "grad_norm_var": 0.07545572916666667, "learning_rate": 0.0001, "loss": 5.6958, "loss/crossentropy": 2.5582846999168396, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.165114875882864, "step": 11660 }, { "epoch": 0.5300909090909091, "grad_norm": 4.59375, "grad_norm_var": 0.04892171223958333, "learning_rate": 0.0001, "loss": 5.7519, "loss/crossentropy": 2.6091405153274536, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16388370096683502, "step": 11662 }, { "epoch": 0.5301818181818182, "grad_norm": 5.4375, "grad_norm_var": 0.06842041015625, "learning_rate": 0.0001, "loss": 5.5966, "loss/crossentropy": 2.4204812049865723, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16722046583890915, "step": 11664 }, { "epoch": 0.5302727272727272, "grad_norm": 4.5625, "grad_norm_var": 0.07042643229166666, "learning_rate": 0.0001, "loss": 5.3525, "loss/crossentropy": 2.3112173080444336, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.15275727584958076, "step": 11666 }, { "epoch": 0.5303636363636364, "grad_norm": 5.15625, "grad_norm_var": 0.06803385416666667, "learning_rate": 0.0001, "loss": 5.6482, "loss/crossentropy": 2.4513262510299683, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1669505573809147, "step": 11668 }, { "epoch": 0.5304545454545454, "grad_norm": 4.96875, "grad_norm_var": 0.06842041015625, "learning_rate": 0.0001, "loss": 5.4119, "loss/crossentropy": 2.3529414534568787, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15902427956461906, "step": 11670 }, { "epoch": 0.5305454545454545, "grad_norm": 4.78125, "grad_norm_var": 0.07665608723958334, "learning_rate": 0.0001, "loss": 5.5447, "loss/crossentropy": 2.4977696537971497, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15840510837733746, "step": 11672 }, { "epoch": 0.5306363636363637, "grad_norm": 4.875, "grad_norm_var": 0.05924072265625, "learning_rate": 0.0001, "loss": 5.2933, "loss/crossentropy": 2.198571890592575, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.15966922417283058, "step": 11674 }, { "epoch": 0.5307272727272727, "grad_norm": 4.5625, "grad_norm_var": 0.06339518229166667, "learning_rate": 0.0001, "loss": 5.4263, "loss/crossentropy": 2.364956349134445, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15750199928879738, "step": 11676 }, { "epoch": 0.5308181818181819, "grad_norm": 5.25, "grad_norm_var": 0.06526285807291667, "learning_rate": 0.0001, "loss": 5.7387, "loss/crossentropy": 2.5375404953956604, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.1679626628756523, "step": 11678 }, { "epoch": 0.5309090909090909, "grad_norm": 5.0625, "grad_norm_var": 0.051102701822916666, "learning_rate": 0.0001, "loss": 5.066, "loss/crossentropy": 2.0973684787750244, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.14568718150258064, "step": 11680 }, { "epoch": 0.531, "grad_norm": 4.75, "grad_norm_var": 0.044755045572916666, "learning_rate": 0.0001, "loss": 5.6376, "loss/crossentropy": 2.5309077501296997, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16125498339533806, "step": 11682 }, { "epoch": 0.5310909090909091, "grad_norm": 5.09375, "grad_norm_var": 0.04303385416666667, "learning_rate": 0.0001, "loss": 5.7035, "loss/crossentropy": 2.5160232186317444, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16738487035036087, "step": 11684 }, { "epoch": 0.5311818181818182, "grad_norm": 5.21875, "grad_norm_var": 0.05792643229166667, "learning_rate": 0.0001, "loss": 5.2444, "loss/crossentropy": 2.20520082116127, "loss/hidden": 1.576171875, "loss/jsd": 0.0, "loss/logits": 0.14630625769495964, "step": 11686 }, { "epoch": 0.5312727272727272, "grad_norm": 4.84375, "grad_norm_var": 0.08059895833333333, "learning_rate": 0.0001, "loss": 5.6009, "loss/crossentropy": 2.4447956681251526, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16404446586966515, "step": 11688 }, { "epoch": 0.5313636363636364, "grad_norm": 5.375, "grad_norm_var": 0.083837890625, "learning_rate": 0.0001, "loss": 5.9276, "loss/crossentropy": 2.6452311873435974, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.17726048082113266, "step": 11690 }, { "epoch": 0.5314545454545454, "grad_norm": 4.75, "grad_norm_var": 0.116015625, "learning_rate": 0.0001, "loss": 5.1875, "loss/crossentropy": 2.2814596593379974, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.14763294532895088, "step": 11692 }, { "epoch": 0.5315454545454545, "grad_norm": 5.15625, "grad_norm_var": 0.13179931640625, "learning_rate": 0.0001, "loss": 5.7445, "loss/crossentropy": 2.6153308153152466, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16428404301404953, "step": 11694 }, { "epoch": 0.5316363636363637, "grad_norm": 5.1875, "grad_norm_var": 0.12589518229166666, "learning_rate": 0.0001, "loss": 5.9873, "loss/crossentropy": 2.698302388191223, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.17596612498164177, "step": 11696 }, { "epoch": 0.5317272727272727, "grad_norm": 4.59375, "grad_norm_var": 0.14700520833333333, "learning_rate": 0.0001, "loss": 5.4529, "loss/crossentropy": 2.4881181120872498, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.14803612604737282, "step": 11698 }, { "epoch": 0.5318181818181819, "grad_norm": 5.3125, "grad_norm_var": 0.15810139973958334, "learning_rate": 0.0001, "loss": 5.4222, "loss/crossentropy": 2.35819011926651, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.157177422195673, "step": 11700 }, { "epoch": 0.5319090909090909, "grad_norm": 4.5625, "grad_norm_var": 0.14706624348958333, "learning_rate": 0.0001, "loss": 5.1158, "loss/crossentropy": 2.1986855268478394, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.1454182006418705, "step": 11702 }, { "epoch": 0.532, "grad_norm": 4.65625, "grad_norm_var": 0.12513020833333333, "learning_rate": 0.0001, "loss": 5.5588, "loss/crossentropy": 2.499589443206787, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15729175135493279, "step": 11704 }, { "epoch": 0.5320909090909091, "grad_norm": 4.46875, "grad_norm_var": 0.1099609375, "learning_rate": 0.0001, "loss": 5.5603, "loss/crossentropy": 2.500252068042755, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15521949157118797, "step": 11706 }, { "epoch": 0.5321818181818182, "grad_norm": 5.5, "grad_norm_var": 0.11825764973958333, "learning_rate": 0.0001, "loss": 5.7007, "loss/crossentropy": 2.5778854489326477, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16306642815470695, "step": 11708 }, { "epoch": 0.5322727272727272, "grad_norm": 5.375, "grad_norm_var": 0.12252197265625, "learning_rate": 0.0001, "loss": 5.7818, "loss/crossentropy": 2.5759984254837036, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.171558428555727, "step": 11710 }, { "epoch": 0.5323636363636364, "grad_norm": 5.15625, "grad_norm_var": 0.12786458333333334, "learning_rate": 0.0001, "loss": 5.6079, "loss/crossentropy": 2.563403367996216, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.15932776033878326, "step": 11712 }, { "epoch": 0.5324545454545454, "grad_norm": 4.9375, "grad_norm_var": 0.11131184895833333, "learning_rate": 0.0001, "loss": 6.0686, "loss/crossentropy": 2.8392905592918396, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17274058610200882, "step": 11714 }, { "epoch": 0.5325454545454545, "grad_norm": 4.90625, "grad_norm_var": 0.0974609375, "learning_rate": 0.0001, "loss": 5.199, "loss/crossentropy": 2.204054206609726, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.14930040389299393, "step": 11716 }, { "epoch": 0.5326363636363637, "grad_norm": 4.96875, "grad_norm_var": 0.09078369140625, "learning_rate": 0.0001, "loss": 5.6608, "loss/crossentropy": 2.51526802778244, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16259821876883507, "step": 11718 }, { "epoch": 0.5327272727272727, "grad_norm": 4.5, "grad_norm_var": 0.111572265625, "learning_rate": 0.0001, "loss": 5.7807, "loss/crossentropy": 2.579286217689514, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1689651571214199, "step": 11720 }, { "epoch": 0.5328181818181819, "grad_norm": 5.21875, "grad_norm_var": 0.11044514973958333, "learning_rate": 0.0001, "loss": 5.871, "loss/crossentropy": 2.606703817844391, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17369906604290009, "step": 11722 }, { "epoch": 0.5329090909090909, "grad_norm": 5.34375, "grad_norm_var": 0.14811197916666666, "learning_rate": 0.0001, "loss": 5.4309, "loss/crossentropy": 2.396897554397583, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15691709890961647, "step": 11724 }, { "epoch": 0.533, "grad_norm": 4.40625, "grad_norm_var": 0.1591796875, "learning_rate": 0.0001, "loss": 5.0627, "loss/crossentropy": 2.207626849412918, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.13883044198155403, "step": 11726 }, { "epoch": 0.5330909090909091, "grad_norm": 5.1875, "grad_norm_var": 0.15305582682291666, "learning_rate": 0.0001, "loss": 5.8195, "loss/crossentropy": 2.629148244857788, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16981158778071404, "step": 11728 }, { "epoch": 0.5331818181818182, "grad_norm": 4.96875, "grad_norm_var": 0.16015218098958334, "learning_rate": 0.0001, "loss": 5.8984, "loss/crossentropy": 2.7073184847831726, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16930412128567696, "step": 11730 }, { "epoch": 0.5332727272727272, "grad_norm": 5.1875, "grad_norm_var": 0.18040364583333332, "learning_rate": 0.0001, "loss": 5.5991, "loss/crossentropy": 2.535825252532959, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15867197513580322, "step": 11732 }, { "epoch": 0.5333636363636364, "grad_norm": 5.125, "grad_norm_var": 0.31008707682291664, "learning_rate": 0.0001, "loss": 6.0241, "loss/crossentropy": 2.711982309818268, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.17710953950881958, "step": 11734 }, { "epoch": 0.5334545454545454, "grad_norm": 4.34375, "grad_norm_var": 0.32580973307291666, "learning_rate": 0.0001, "loss": 5.3254, "loss/crossentropy": 2.366285651922226, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.15118397772312164, "step": 11736 }, { "epoch": 0.5335454545454545, "grad_norm": 5.0625, "grad_norm_var": 0.31145833333333334, "learning_rate": 0.0001, "loss": 5.6353, "loss/crossentropy": 2.4454724192619324, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16840071603655815, "step": 11738 }, { "epoch": 0.5336363636363637, "grad_norm": 5.15625, "grad_norm_var": 0.257275390625, "learning_rate": 0.0001, "loss": 5.6061, "loss/crossentropy": 2.464336007833481, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16632281243801117, "step": 11740 }, { "epoch": 0.5337272727272727, "grad_norm": 5.1875, "grad_norm_var": 0.23928629557291667, "learning_rate": 0.0001, "loss": 5.8406, "loss/crossentropy": 2.6892940402030945, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16493145748972893, "step": 11742 }, { "epoch": 0.5338181818181819, "grad_norm": 4.78125, "grad_norm_var": 0.23683268229166668, "learning_rate": 0.0001, "loss": 5.6198, "loss/crossentropy": 2.5105109810829163, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16210034489631653, "step": 11744 }, { "epoch": 0.5339090909090909, "grad_norm": 5.53125, "grad_norm_var": 0.26764322916666666, "learning_rate": 0.0001, "loss": 5.4914, "loss/crossentropy": 2.4667125940322876, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15480920299887657, "step": 11746 }, { "epoch": 0.534, "grad_norm": 4.59375, "grad_norm_var": 0.26417643229166665, "learning_rate": 0.0001, "loss": 5.3359, "loss/crossentropy": 2.3916077315807343, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.148136705160141, "step": 11748 }, { "epoch": 0.5340909090909091, "grad_norm": 4.875, "grad_norm_var": 0.11357014973958333, "learning_rate": 0.0001, "loss": 5.5307, "loss/crossentropy": 2.4350056052207947, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1607387363910675, "step": 11750 }, { "epoch": 0.5341818181818182, "grad_norm": 5.15625, "grad_norm_var": 0.07858072916666667, "learning_rate": 0.0001, "loss": 5.484, "loss/crossentropy": 2.462287127971649, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15334616973996162, "step": 11752 }, { "epoch": 0.5342727272727272, "grad_norm": 4.84375, "grad_norm_var": 0.08209228515625, "learning_rate": 0.0001, "loss": 5.4486, "loss/crossentropy": 2.4248600602149963, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15549679100513458, "step": 11754 }, { "epoch": 0.5343636363636364, "grad_norm": 4.84375, "grad_norm_var": 0.07877604166666667, "learning_rate": 0.0001, "loss": 5.3148, "loss/crossentropy": 2.3434738516807556, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15025310590863228, "step": 11756 }, { "epoch": 0.5344545454545454, "grad_norm": 4.78125, "grad_norm_var": 0.07353108723958333, "learning_rate": 0.0001, "loss": 5.5632, "loss/crossentropy": 2.472403109073639, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1600608415901661, "step": 11758 }, { "epoch": 0.5345454545454545, "grad_norm": 5.71875, "grad_norm_var": 0.10885416666666667, "learning_rate": 0.0001, "loss": 5.4445, "loss/crossentropy": 2.4265640676021576, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1531626284122467, "step": 11760 }, { "epoch": 0.5346363636363637, "grad_norm": 4.96875, "grad_norm_var": 0.07281494140625, "learning_rate": 0.0001, "loss": 5.5033, "loss/crossentropy": 2.4085628390312195, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16279039904475212, "step": 11762 }, { "epoch": 0.5347272727272727, "grad_norm": 5.15625, "grad_norm_var": 0.06985677083333333, "learning_rate": 0.0001, "loss": 5.7012, "loss/crossentropy": 2.475400984287262, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.1688736416399479, "step": 11764 }, { "epoch": 0.5348181818181819, "grad_norm": 5.40625, "grad_norm_var": 0.08134358723958333, "learning_rate": 0.0001, "loss": 5.5436, "loss/crossentropy": 2.4539679884910583, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15993741527199745, "step": 11766 }, { "epoch": 0.5349090909090909, "grad_norm": 4.71875, "grad_norm_var": 0.09739176432291667, "learning_rate": 0.0001, "loss": 5.7824, "loss/crossentropy": 2.6366466879844666, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16692066192626953, "step": 11768 }, { "epoch": 0.535, "grad_norm": 5.0, "grad_norm_var": 0.097119140625, "learning_rate": 0.0001, "loss": 5.7453, "loss/crossentropy": 2.637820780277252, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1623111367225647, "step": 11770 }, { "epoch": 0.5350909090909091, "grad_norm": 5.28125, "grad_norm_var": 0.09529622395833333, "learning_rate": 0.0001, "loss": 5.7724, "loss/crossentropy": 2.612788200378418, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16381338611245155, "step": 11772 }, { "epoch": 0.5351818181818182, "grad_norm": 5.125, "grad_norm_var": 0.07537434895833334, "learning_rate": 0.0001, "loss": 5.9614, "loss/crossentropy": 2.7422853112220764, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17229974642395973, "step": 11774 }, { "epoch": 0.5352727272727272, "grad_norm": 5.59375, "grad_norm_var": 0.06451416015625, "learning_rate": 0.0001, "loss": 5.9461, "loss/crossentropy": 2.680665969848633, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17576489597558975, "step": 11776 }, { "epoch": 0.5353636363636364, "grad_norm": 5.125, "grad_norm_var": 0.2216796875, "learning_rate": 0.0001, "loss": 5.9252, "loss/crossentropy": 2.6677042841911316, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.1732068955898285, "step": 11778 }, { "epoch": 0.5354545454545454, "grad_norm": 4.75, "grad_norm_var": 0.246728515625, "learning_rate": 0.0001, "loss": 5.6443, "loss/crossentropy": 2.5280286073684692, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1606515273451805, "step": 11780 }, { "epoch": 0.5355454545454545, "grad_norm": 4.65625, "grad_norm_var": 0.26672770182291666, "learning_rate": 0.0001, "loss": 5.4485, "loss/crossentropy": 2.3490573167800903, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1615084707736969, "step": 11782 }, { "epoch": 0.5356363636363637, "grad_norm": 5.1875, "grad_norm_var": 0.261328125, "learning_rate": 0.0001, "loss": 5.5085, "loss/crossentropy": 2.381419390439987, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16133763268589973, "step": 11784 }, { "epoch": 0.5357272727272727, "grad_norm": 5.125, "grad_norm_var": 0.28033447265625, "learning_rate": 0.0001, "loss": 5.166, "loss/crossentropy": 2.221637636423111, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.14677541702985764, "step": 11786 }, { "epoch": 0.5358181818181819, "grad_norm": 5.40625, "grad_norm_var": 0.28765869140625, "learning_rate": 0.0001, "loss": 5.6869, "loss/crossentropy": 2.5278281569480896, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16805700957775116, "step": 11788 }, { "epoch": 0.5359090909090909, "grad_norm": 5.21875, "grad_norm_var": 0.281884765625, "learning_rate": 0.0001, "loss": 5.8858, "loss/crossentropy": 2.6334755420684814, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.174455925822258, "step": 11790 }, { "epoch": 0.536, "grad_norm": 4.5625, "grad_norm_var": 0.29781494140625, "learning_rate": 0.0001, "loss": 5.3698, "loss/crossentropy": 2.353064715862274, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15421009808778763, "step": 11792 }, { "epoch": 0.5360909090909091, "grad_norm": 5.15625, "grad_norm_var": 0.08215738932291666, "learning_rate": 0.0001, "loss": 5.3372, "loss/crossentropy": 2.3106865882873535, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15343176573514938, "step": 11794 }, { "epoch": 0.5361818181818182, "grad_norm": 4.875, "grad_norm_var": 0.07975260416666667, "learning_rate": 0.0001, "loss": 5.5492, "loss/crossentropy": 2.4705324172973633, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15903674438595772, "step": 11796 }, { "epoch": 0.5362727272727272, "grad_norm": 5.96875, "grad_norm_var": 0.11695556640625, "learning_rate": 0.0001, "loss": 5.5668, "loss/crossentropy": 2.451120764017105, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.15922871604561806, "step": 11798 }, { "epoch": 0.5363636363636364, "grad_norm": 5.03125, "grad_norm_var": 0.11542561848958334, "learning_rate": 0.0001, "loss": 5.5601, "loss/crossentropy": 2.4737014174461365, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16059113666415215, "step": 11800 }, { "epoch": 0.5364545454545454, "grad_norm": 4.5625, "grad_norm_var": 0.11951497395833334, "learning_rate": 0.0001, "loss": 5.3668, "loss/crossentropy": 2.353312313556671, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15330573171377182, "step": 11802 }, { "epoch": 0.5365454545454545, "grad_norm": 5.25, "grad_norm_var": 0.11568603515625, "learning_rate": 0.0001, "loss": 5.7009, "loss/crossentropy": 2.5508822798728943, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16676024347543716, "step": 11804 }, { "epoch": 0.5366363636363637, "grad_norm": 4.90625, "grad_norm_var": 0.12115885416666666, "learning_rate": 0.0001, "loss": 5.7271, "loss/crossentropy": 2.6198853254318237, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16248339787125587, "step": 11806 }, { "epoch": 0.5367272727272727, "grad_norm": 4.625, "grad_norm_var": 0.12073160807291666, "learning_rate": 0.0001, "loss": 5.6095, "loss/crossentropy": 2.539943039417267, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15773403272032738, "step": 11808 }, { "epoch": 0.5368181818181819, "grad_norm": 4.84375, "grad_norm_var": 0.12057291666666667, "learning_rate": 0.0001, "loss": 5.2866, "loss/crossentropy": 2.269374907016754, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.14996064826846123, "step": 11810 }, { "epoch": 0.5369090909090909, "grad_norm": 5.0625, "grad_norm_var": 0.12604166666666666, "learning_rate": 0.0001, "loss": 5.3796, "loss/crossentropy": 2.341698467731476, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15633050724864006, "step": 11812 }, { "epoch": 0.537, "grad_norm": 5.375, "grad_norm_var": 0.07003580729166667, "learning_rate": 0.0001, "loss": 5.8233, "loss/crossentropy": 2.6461498737335205, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.17142247781157494, "step": 11814 }, { "epoch": 0.5370909090909091, "grad_norm": 5.3125, "grad_norm_var": 0.0796875, "learning_rate": 0.0001, "loss": 5.905, "loss/crossentropy": 2.7072067260742188, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16841626167297363, "step": 11816 }, { "epoch": 0.5371818181818182, "grad_norm": 4.96875, "grad_norm_var": 0.07274983723958334, "learning_rate": 0.0001, "loss": 5.7316, "loss/crossentropy": 2.5829994678497314, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1681823804974556, "step": 11818 }, { "epoch": 0.5372727272727272, "grad_norm": 5.15625, "grad_norm_var": 0.0888671875, "learning_rate": 0.0001, "loss": 5.8833, "loss/crossentropy": 2.6628684401512146, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17165104299783707, "step": 11820 }, { "epoch": 0.5373636363636364, "grad_norm": 4.75, "grad_norm_var": 0.08566080729166667, "learning_rate": 0.0001, "loss": 5.3903, "loss/crossentropy": 2.381030321121216, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1530761756002903, "step": 11822 }, { "epoch": 0.5374545454545454, "grad_norm": 5.3125, "grad_norm_var": 0.08277587890625, "learning_rate": 0.0001, "loss": 5.6364, "loss/crossentropy": 2.428826928138733, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1680193617939949, "step": 11824 }, { "epoch": 0.5375454545454545, "grad_norm": 4.6875, "grad_norm_var": 0.11968994140625, "learning_rate": 0.0001, "loss": 5.3313, "loss/crossentropy": 2.389039099216461, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.14969687536358833, "step": 11826 }, { "epoch": 0.5376363636363637, "grad_norm": 5.65625, "grad_norm_var": 0.14205322265625, "learning_rate": 0.0001, "loss": 5.5913, "loss/crossentropy": 2.483102321624756, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16140318661928177, "step": 11828 }, { "epoch": 0.5377272727272727, "grad_norm": 4.65625, "grad_norm_var": 0.145166015625, "learning_rate": 0.0001, "loss": 5.4699, "loss/crossentropy": 2.3997111320495605, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15818779170513153, "step": 11830 }, { "epoch": 0.5378181818181819, "grad_norm": 5.34375, "grad_norm_var": 0.15286458333333333, "learning_rate": 0.0001, "loss": 5.5905, "loss/crossentropy": 2.4827281832695007, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16370191425085068, "step": 11832 }, { "epoch": 0.5379090909090909, "grad_norm": 4.9375, "grad_norm_var": 0.1490234375, "learning_rate": 0.0001, "loss": 5.5287, "loss/crossentropy": 2.4512845277786255, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1594988778233528, "step": 11834 }, { "epoch": 0.538, "grad_norm": 4.9375, "grad_norm_var": 0.122509765625, "learning_rate": 0.0001, "loss": 5.4667, "loss/crossentropy": 2.436112403869629, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1569620743393898, "step": 11836 }, { "epoch": 0.5380909090909091, "grad_norm": 4.53125, "grad_norm_var": 0.130712890625, "learning_rate": 0.0001, "loss": 5.3458, "loss/crossentropy": 2.300630509853363, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15627339109778404, "step": 11838 }, { "epoch": 0.5381818181818182, "grad_norm": 4.90625, "grad_norm_var": 0.10104166666666667, "learning_rate": 0.0001, "loss": 5.253, "loss/crossentropy": 2.287660837173462, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1496546920388937, "step": 11840 }, { "epoch": 0.5382727272727272, "grad_norm": 5.3125, "grad_norm_var": 0.09426676432291667, "learning_rate": 0.0001, "loss": 5.8434, "loss/crossentropy": 2.66462641954422, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16710053384304047, "step": 11842 }, { "epoch": 0.5383636363636364, "grad_norm": 4.6875, "grad_norm_var": 0.08561197916666667, "learning_rate": 0.0001, "loss": 5.6658, "loss/crossentropy": 2.5906155109405518, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1590837873518467, "step": 11844 }, { "epoch": 0.5384545454545454, "grad_norm": 4.78125, "grad_norm_var": 0.08756103515625, "learning_rate": 0.0001, "loss": 5.7169, "loss/crossentropy": 2.533994138240814, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1671147607266903, "step": 11846 }, { "epoch": 0.5385454545454545, "grad_norm": 4.96875, "grad_norm_var": 0.57105712890625, "learning_rate": 0.0001, "loss": 5.8051, "loss/crossentropy": 2.605129897594452, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.17136871069669724, "step": 11848 }, { "epoch": 0.5386363636363637, "grad_norm": 5.03125, "grad_norm_var": 0.574462890625, "learning_rate": 0.0001, "loss": 5.5042, "loss/crossentropy": 2.4175845980644226, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16002702340483665, "step": 11850 }, { "epoch": 0.5387272727272727, "grad_norm": 5.5, "grad_norm_var": 0.569775390625, "learning_rate": 0.0001, "loss": 5.8614, "loss/crossentropy": 2.723442852497101, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16301490738987923, "step": 11852 }, { "epoch": 0.5388181818181819, "grad_norm": 4.6875, "grad_norm_var": 0.59453125, "learning_rate": 0.0001, "loss": 5.9737, "loss/crossentropy": 2.6872878074645996, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.18059051409363747, "step": 11854 }, { "epoch": 0.5389090909090909, "grad_norm": 5.1875, "grad_norm_var": 0.5672159830729167, "learning_rate": 0.0001, "loss": 5.383, "loss/crossentropy": 2.3371022045612335, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15673777647316456, "step": 11856 }, { "epoch": 0.539, "grad_norm": 4.9375, "grad_norm_var": 0.576025390625, "learning_rate": 0.0001, "loss": 5.5244, "loss/crossentropy": 2.49228572845459, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15555689483880997, "step": 11858 }, { "epoch": 0.5390909090909091, "grad_norm": 5.09375, "grad_norm_var": 0.55113525390625, "learning_rate": 0.0001, "loss": 5.6003, "loss/crossentropy": 2.4790830612182617, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16290712729096413, "step": 11860 }, { "epoch": 0.5391818181818182, "grad_norm": 4.46875, "grad_norm_var": 0.6294230143229167, "learning_rate": 0.0001, "loss": 5.2854, "loss/crossentropy": 2.2917089760303497, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1521005854010582, "step": 11862 }, { "epoch": 0.5392727272727272, "grad_norm": 5.0, "grad_norm_var": 0.20963134765625, "learning_rate": 0.0001, "loss": 5.6498, "loss/crossentropy": 2.4831774830818176, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.164518341422081, "step": 11864 }, { "epoch": 0.5393636363636364, "grad_norm": 4.9375, "grad_norm_var": 0.20201822916666667, "learning_rate": 0.0001, "loss": 5.699, "loss/crossentropy": 2.531345784664154, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1638316661119461, "step": 11866 }, { "epoch": 0.5394545454545454, "grad_norm": 4.875, "grad_norm_var": 0.20076497395833334, "learning_rate": 0.0001, "loss": 5.4312, "loss/crossentropy": 2.4150264263153076, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15220632776618004, "step": 11868 }, { "epoch": 0.5395454545454546, "grad_norm": 4.8125, "grad_norm_var": 0.143994140625, "learning_rate": 0.0001, "loss": 5.6184, "loss/crossentropy": 2.536176562309265, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.15724720805883408, "step": 11870 }, { "epoch": 0.5396363636363637, "grad_norm": 4.6875, "grad_norm_var": 0.15292561848958333, "learning_rate": 0.0001, "loss": 5.5657, "loss/crossentropy": 2.517603099346161, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1559770107269287, "step": 11872 }, { "epoch": 0.5397272727272727, "grad_norm": 4.96875, "grad_norm_var": 0.15608317057291668, "learning_rate": 0.0001, "loss": 5.5396, "loss/crossentropy": 2.4932432770729065, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15815414860844612, "step": 11874 }, { "epoch": 0.5398181818181819, "grad_norm": 4.84375, "grad_norm_var": 0.16526285807291666, "learning_rate": 0.0001, "loss": 5.5047, "loss/crossentropy": 2.4761269092559814, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15559198334813118, "step": 11876 }, { "epoch": 0.5399090909090909, "grad_norm": 5.25, "grad_norm_var": 0.059488932291666664, "learning_rate": 0.0001, "loss": 5.4026, "loss/crossentropy": 2.357177436351776, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1543460376560688, "step": 11878 }, { "epoch": 0.54, "grad_norm": 4.65625, "grad_norm_var": 0.024983723958333332, "learning_rate": 0.0001, "loss": 5.3945, "loss/crossentropy": 2.4013152718544006, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1532265916466713, "step": 11880 }, { "epoch": 0.5400909090909091, "grad_norm": 5.125, "grad_norm_var": 0.05328369140625, "learning_rate": 0.0001, "loss": 5.5749, "loss/crossentropy": 2.4432485103607178, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16335546597838402, "step": 11882 }, { "epoch": 0.5401818181818182, "grad_norm": 4.625, "grad_norm_var": 0.05907796223958333, "learning_rate": 0.0001, "loss": 5.4148, "loss/crossentropy": 2.358603537082672, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1565932296216488, "step": 11884 }, { "epoch": 0.5402727272727272, "grad_norm": 5.53125, "grad_norm_var": 0.081494140625, "learning_rate": 0.0001, "loss": 5.7796, "loss/crossentropy": 2.57023823261261, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.1711283065378666, "step": 11886 }, { "epoch": 0.5403636363636364, "grad_norm": 5.5, "grad_norm_var": 0.11217447916666666, "learning_rate": 0.0001, "loss": 5.6808, "loss/crossentropy": 2.5407851338386536, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.1641961969435215, "step": 11888 }, { "epoch": 0.5404545454545454, "grad_norm": 5.3125, "grad_norm_var": 0.108056640625, "learning_rate": 0.0001, "loss": 5.3377, "loss/crossentropy": 2.340271770954132, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15090976282954216, "step": 11890 }, { "epoch": 0.5405454545454546, "grad_norm": 4.5625, "grad_norm_var": 0.13707275390625, "learning_rate": 0.0001, "loss": 5.3281, "loss/crossentropy": 2.395107477903366, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.1481868140399456, "step": 11892 }, { "epoch": 0.5406363636363636, "grad_norm": 5.09375, "grad_norm_var": 0.17434895833333333, "learning_rate": 0.0001, "loss": 4.9388, "loss/crossentropy": 2.0923281013965607, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.13737668842077255, "step": 11894 }, { "epoch": 0.5407272727272727, "grad_norm": 4.875, "grad_norm_var": 0.17307535807291666, "learning_rate": 0.0001, "loss": 5.6575, "loss/crossentropy": 2.5359903275966644, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1631249226629734, "step": 11896 }, { "epoch": 0.5408181818181819, "grad_norm": 4.90625, "grad_norm_var": 0.15406494140625, "learning_rate": 0.0001, "loss": 5.5006, "loss/crossentropy": 2.3463075160980225, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1654244400560856, "step": 11898 }, { "epoch": 0.5409090909090909, "grad_norm": 5.15625, "grad_norm_var": 0.16835530598958334, "learning_rate": 0.0001, "loss": 5.6515, "loss/crossentropy": 2.531062036752701, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.16457895562052727, "step": 11900 }, { "epoch": 0.541, "grad_norm": 5.0625, "grad_norm_var": 0.15194905598958333, "learning_rate": 0.0001, "loss": 5.8691, "loss/crossentropy": 2.6942979097366333, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16943366453051567, "step": 11902 }, { "epoch": 0.5410909090909091, "grad_norm": 4.9375, "grad_norm_var": 0.11015218098958333, "learning_rate": 0.0001, "loss": 5.5018, "loss/crossentropy": 2.4487345218658447, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15667764469981194, "step": 11904 }, { "epoch": 0.5411818181818182, "grad_norm": 5.125, "grad_norm_var": 0.10217692057291666, "learning_rate": 0.0001, "loss": 5.366, "loss/crossentropy": 2.3015649616718292, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15956929698586464, "step": 11906 }, { "epoch": 0.5412727272727272, "grad_norm": 5.34375, "grad_norm_var": 0.09230143229166667, "learning_rate": 0.0001, "loss": 5.8585, "loss/crossentropy": 2.650891900062561, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16919738799333572, "step": 11908 }, { "epoch": 0.5413636363636364, "grad_norm": 4.71875, "grad_norm_var": 0.07502848307291667, "learning_rate": 0.0001, "loss": 5.492, "loss/crossentropy": 2.453842341899872, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1573301926255226, "step": 11910 }, { "epoch": 0.5414545454545454, "grad_norm": 4.78125, "grad_norm_var": 0.07675374348958333, "learning_rate": 0.0001, "loss": 5.6994, "loss/crossentropy": 2.5135883688926697, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16604382917284966, "step": 11912 }, { "epoch": 0.5415454545454546, "grad_norm": 5.21875, "grad_norm_var": 0.10494384765625, "learning_rate": 0.0001, "loss": 5.5936, "loss/crossentropy": 2.5471272468566895, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15776770934462547, "step": 11914 }, { "epoch": 0.5416363636363636, "grad_norm": 5.53125, "grad_norm_var": 0.09803059895833334, "learning_rate": 0.0001, "loss": 5.5162, "loss/crossentropy": 2.3512961864471436, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16434543952345848, "step": 11916 }, { "epoch": 0.5417272727272727, "grad_norm": 4.6875, "grad_norm_var": 0.10084228515625, "learning_rate": 0.0001, "loss": 5.6582, "loss/crossentropy": 2.5585391521453857, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16113493219017982, "step": 11918 }, { "epoch": 0.5418181818181819, "grad_norm": 5.15625, "grad_norm_var": 0.12301025390625, "learning_rate": 0.0001, "loss": 5.5348, "loss/crossentropy": 2.3259292244911194, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16697800904512405, "step": 11920 }, { "epoch": 0.5419090909090909, "grad_norm": 5.375, "grad_norm_var": 0.12810872395833334, "learning_rate": 0.0001, "loss": 5.4, "loss/crossentropy": 2.363675892353058, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15539118275046349, "step": 11922 }, { "epoch": 0.542, "grad_norm": 4.625, "grad_norm_var": 0.13218994140625, "learning_rate": 0.0001, "loss": 5.7265, "loss/crossentropy": 2.599820077419281, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16364599391818047, "step": 11924 }, { "epoch": 0.5420909090909091, "grad_norm": 4.90625, "grad_norm_var": 0.13124593098958334, "learning_rate": 0.0001, "loss": 5.688, "loss/crossentropy": 2.574453830718994, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16310754045844078, "step": 11926 }, { "epoch": 0.5421818181818182, "grad_norm": 4.78125, "grad_norm_var": 0.14112955729166668, "learning_rate": 0.0001, "loss": 5.4873, "loss/crossentropy": 2.3061976730823517, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.16303027048707008, "step": 11928 }, { "epoch": 0.5422727272727272, "grad_norm": 4.53125, "grad_norm_var": 0.13043212890625, "learning_rate": 0.0001, "loss": 5.1071, "loss/crossentropy": 2.2279416918754578, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.14221550896763802, "step": 11930 }, { "epoch": 0.5423636363636364, "grad_norm": 5.0, "grad_norm_var": 0.11959228515625, "learning_rate": 0.0001, "loss": 5.9299, "loss/crossentropy": 2.717057466506958, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.17226484417915344, "step": 11932 }, { "epoch": 0.5424545454545454, "grad_norm": 5.34375, "grad_norm_var": 0.20006510416666667, "learning_rate": 0.0001, "loss": 5.9907, "loss/crossentropy": 2.6609758138656616, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18063008040189743, "step": 11934 }, { "epoch": 0.5425454545454546, "grad_norm": 4.96875, "grad_norm_var": 0.18450113932291667, "learning_rate": 0.0001, "loss": 5.5085, "loss/crossentropy": 2.438881814479828, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15872181579470634, "step": 11936 }, { "epoch": 0.5426363636363636, "grad_norm": 4.96875, "grad_norm_var": 0.20206705729166666, "learning_rate": 0.0001, "loss": 5.3146, "loss/crossentropy": 2.313203603029251, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15248818695545197, "step": 11938 }, { "epoch": 0.5427272727272727, "grad_norm": 4.9375, "grad_norm_var": 0.20569254557291666, "learning_rate": 0.0001, "loss": 5.6263, "loss/crossentropy": 2.4365834295749664, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1687805987894535, "step": 11940 }, { "epoch": 0.5428181818181819, "grad_norm": 4.5625, "grad_norm_var": 0.223828125, "learning_rate": 0.0001, "loss": 4.9918, "loss/crossentropy": 2.127735823392868, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.1420668140053749, "step": 11942 }, { "epoch": 0.5429090909090909, "grad_norm": 5.0625, "grad_norm_var": 0.20753580729166668, "learning_rate": 0.0001, "loss": 5.6986, "loss/crossentropy": 2.587211489677429, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1626996286213398, "step": 11944 }, { "epoch": 0.543, "grad_norm": 5.03125, "grad_norm_var": 0.19138997395833332, "learning_rate": 0.0001, "loss": 5.4987, "loss/crossentropy": 2.430412471294403, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15741149708628654, "step": 11946 }, { "epoch": 0.5430909090909091, "grad_norm": 6.09375, "grad_norm_var": 0.25497639973958336, "learning_rate": 0.0001, "loss": 5.8765, "loss/crossentropy": 2.59558767080307, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17770399898290634, "step": 11948 }, { "epoch": 0.5431818181818182, "grad_norm": 4.90625, "grad_norm_var": 0.16881510416666667, "learning_rate": 0.0001, "loss": 5.5338, "loss/crossentropy": 2.418336033821106, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.15998810902237892, "step": 11950 }, { "epoch": 0.5432727272727272, "grad_norm": 5.09375, "grad_norm_var": 0.17030843098958334, "learning_rate": 0.0001, "loss": 5.4809, "loss/crossentropy": 2.386384278535843, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.15925810113549232, "step": 11952 }, { "epoch": 0.5433636363636364, "grad_norm": 5.03125, "grad_norm_var": 0.13843994140625, "learning_rate": 0.0001, "loss": 5.5639, "loss/crossentropy": 2.396286725997925, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.1646169200539589, "step": 11954 }, { "epoch": 0.5434545454545454, "grad_norm": 4.6875, "grad_norm_var": 0.13580729166666666, "learning_rate": 0.0001, "loss": 5.4456, "loss/crossentropy": 2.426262080669403, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1534961760044098, "step": 11956 }, { "epoch": 0.5435454545454546, "grad_norm": 4.75, "grad_norm_var": 0.11246337890625, "learning_rate": 0.0001, "loss": 5.3457, "loss/crossentropy": 2.347665637731552, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15311919152736664, "step": 11958 }, { "epoch": 0.5436363636363636, "grad_norm": 5.09375, "grad_norm_var": 0.11261393229166666, "learning_rate": 0.0001, "loss": 5.4832, "loss/crossentropy": 2.430385649204254, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15742652863264084, "step": 11960 }, { "epoch": 0.5437272727272727, "grad_norm": 4.65625, "grad_norm_var": 0.13704020182291668, "learning_rate": 0.0001, "loss": 5.1544, "loss/crossentropy": 2.249367117881775, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1440209411084652, "step": 11962 }, { "epoch": 0.5438181818181819, "grad_norm": 4.84375, "grad_norm_var": 0.110791015625, "learning_rate": 0.0001, "loss": 5.5796, "loss/crossentropy": 2.443424105644226, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16146672517061234, "step": 11964 }, { "epoch": 0.5439090909090909, "grad_norm": 5.40625, "grad_norm_var": 0.12376302083333333, "learning_rate": 0.0001, "loss": 5.481, "loss/crossentropy": 2.3938831090927124, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15792886167764664, "step": 11966 }, { "epoch": 0.544, "grad_norm": 4.4375, "grad_norm_var": 0.16404622395833332, "learning_rate": 0.0001, "loss": 5.3765, "loss/crossentropy": 2.382270395755768, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.14961721561849117, "step": 11968 }, { "epoch": 0.5440909090909091, "grad_norm": 4.75, "grad_norm_var": 0.16500244140625, "learning_rate": 0.0001, "loss": 5.7488, "loss/crossentropy": 2.5950971245765686, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16361607611179352, "step": 11970 }, { "epoch": 0.5441818181818182, "grad_norm": 4.65625, "grad_norm_var": 0.16612955729166667, "learning_rate": 0.0001, "loss": 5.4055, "loss/crossentropy": 2.3284749388694763, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15848413482308388, "step": 11972 }, { "epoch": 0.5442727272727272, "grad_norm": 5.4375, "grad_norm_var": 0.17047119140625, "learning_rate": 0.0001, "loss": 6.0289, "loss/crossentropy": 2.8004690408706665, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17206457257270813, "step": 11974 }, { "epoch": 0.5443636363636364, "grad_norm": 4.8125, "grad_norm_var": 0.18518473307291666, "learning_rate": 0.0001, "loss": 5.4428, "loss/crossentropy": 2.3943461775779724, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15581777691841125, "step": 11976 }, { "epoch": 0.5444545454545454, "grad_norm": 5.5, "grad_norm_var": 0.1591796875, "learning_rate": 0.0001, "loss": 5.9986, "loss/crossentropy": 2.731560528278351, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17455783486366272, "step": 11978 }, { "epoch": 0.5445454545454546, "grad_norm": 5.0625, "grad_norm_var": 0.11461181640625, "learning_rate": 0.0001, "loss": 5.8803, "loss/crossentropy": 2.66705322265625, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.17308074608445168, "step": 11980 }, { "epoch": 0.5446363636363636, "grad_norm": 5.09375, "grad_norm_var": 0.12636311848958334, "learning_rate": 0.0001, "loss": 5.422, "loss/crossentropy": 2.413965404033661, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1521703042089939, "step": 11982 }, { "epoch": 0.5447272727272727, "grad_norm": 5.21875, "grad_norm_var": 0.123828125, "learning_rate": 0.0001, "loss": 4.9268, "loss/crossentropy": 2.038184642791748, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.14042218402028084, "step": 11984 }, { "epoch": 0.5448181818181819, "grad_norm": 5.125, "grad_norm_var": 0.13053385416666666, "learning_rate": 0.0001, "loss": 5.4719, "loss/crossentropy": 2.432144671678543, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15729444101452827, "step": 11986 }, { "epoch": 0.5449090909090909, "grad_norm": 4.6875, "grad_norm_var": 0.13616129557291667, "learning_rate": 0.0001, "loss": 5.2295, "loss/crossentropy": 2.2877834141254425, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.14846938103437424, "step": 11988 }, { "epoch": 0.545, "grad_norm": 5.1875, "grad_norm_var": 0.12454020182291667, "learning_rate": 0.0001, "loss": 5.559, "loss/crossentropy": 2.4531672596931458, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15979938209056854, "step": 11990 }, { "epoch": 0.5450909090909091, "grad_norm": 5.03125, "grad_norm_var": 0.10559895833333334, "learning_rate": 0.0001, "loss": 5.5285, "loss/crossentropy": 2.432464599609375, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16116851940751076, "step": 11992 }, { "epoch": 0.5451818181818182, "grad_norm": 4.8125, "grad_norm_var": 0.08251546223958334, "learning_rate": 0.0001, "loss": 5.541, "loss/crossentropy": 2.4687169194221497, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1583983302116394, "step": 11994 }, { "epoch": 0.5452727272727272, "grad_norm": 5.0625, "grad_norm_var": 0.081884765625, "learning_rate": 0.0001, "loss": 6.0458, "loss/crossentropy": 2.8344866037368774, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.17171285301446915, "step": 11996 }, { "epoch": 0.5453636363636364, "grad_norm": 4.59375, "grad_norm_var": 0.08404947916666666, "learning_rate": 0.0001, "loss": 5.4524, "loss/crossentropy": 2.390717029571533, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15694787725806236, "step": 11998 }, { "epoch": 0.5454545454545454, "grad_norm": 6.4375, "grad_norm_var": 0.19983317057291666, "learning_rate": 0.0001, "loss": 5.5334, "loss/crossentropy": 2.4315099716186523, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16018693894147873, "step": 12000 }, { "epoch": 0.5455454545454546, "grad_norm": 5.375, "grad_norm_var": 0.19763997395833333, "learning_rate": 0.0001, "loss": 5.5844, "loss/crossentropy": 2.441077172756195, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16316482797265053, "step": 12002 }, { "epoch": 0.5456363636363636, "grad_norm": 4.5625, "grad_norm_var": 0.204541015625, "learning_rate": 0.0001, "loss": 4.9272, "loss/crossentropy": 2.0031445920467377, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.14377465099096298, "step": 12004 }, { "epoch": 0.5457272727272727, "grad_norm": 4.8125, "grad_norm_var": 0.20875244140625, "learning_rate": 0.0001, "loss": 5.6588, "loss/crossentropy": 2.5406020879745483, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.162017609924078, "step": 12006 }, { "epoch": 0.5458181818181819, "grad_norm": 5.15625, "grad_norm_var": 0.21053059895833334, "learning_rate": 0.0001, "loss": 5.9398, "loss/crossentropy": 2.7394732236862183, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.170427355915308, "step": 12008 }, { "epoch": 0.5459090909090909, "grad_norm": 4.96875, "grad_norm_var": 0.200634765625, "learning_rate": 0.0001, "loss": 5.5022, "loss/crossentropy": 2.3745588958263397, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1627674587070942, "step": 12010 }, { "epoch": 0.546, "grad_norm": 4.78125, "grad_norm_var": 0.22906494140625, "learning_rate": 0.0001, "loss": 5.9173, "loss/crossentropy": 2.7308385968208313, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16942474618554115, "step": 12012 }, { "epoch": 0.5460909090909091, "grad_norm": 4.6875, "grad_norm_var": 0.22486979166666668, "learning_rate": 0.0001, "loss": 5.2492, "loss/crossentropy": 2.27278795838356, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15017734840512276, "step": 12014 }, { "epoch": 0.5461818181818182, "grad_norm": 4.5625, "grad_norm_var": 0.09230143229166667, "learning_rate": 0.0001, "loss": 5.6368, "loss/crossentropy": 2.49781596660614, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1662420965731144, "step": 12016 }, { "epoch": 0.5462727272727272, "grad_norm": 5.15625, "grad_norm_var": 0.08241780598958333, "learning_rate": 0.0001, "loss": 5.7288, "loss/crossentropy": 2.5434579849243164, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16794705390930176, "step": 12018 }, { "epoch": 0.5463636363636364, "grad_norm": 5.0, "grad_norm_var": 0.06767171223958333, "learning_rate": 0.0001, "loss": 5.5949, "loss/crossentropy": 2.5181995630264282, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15942644327878952, "step": 12020 }, { "epoch": 0.5464545454545454, "grad_norm": 5.0, "grad_norm_var": 0.07483317057291666, "learning_rate": 0.0001, "loss": 5.6434, "loss/crossentropy": 2.520175337791443, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16349931061267853, "step": 12022 }, { "epoch": 0.5465454545454546, "grad_norm": 5.03125, "grad_norm_var": 0.07525634765625, "learning_rate": 0.0001, "loss": 6.1171, "loss/crossentropy": 2.8584049940109253, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17626241222023964, "step": 12024 }, { "epoch": 0.5466363636363636, "grad_norm": 5.0625, "grad_norm_var": 0.07515869140625, "learning_rate": 0.0001, "loss": 5.1799, "loss/crossentropy": 2.1684990227222443, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.14938490092754364, "step": 12026 }, { "epoch": 0.5467272727272727, "grad_norm": 4.6875, "grad_norm_var": 0.04869384765625, "learning_rate": 0.0001, "loss": 5.5517, "loss/crossentropy": 2.427734911441803, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16278589144349098, "step": 12028 }, { "epoch": 0.5468181818181819, "grad_norm": 5.34375, "grad_norm_var": 0.05543212890625, "learning_rate": 0.0001, "loss": 5.6373, "loss/crossentropy": 2.4961383938789368, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16313549131155014, "step": 12030 }, { "epoch": 0.5469090909090909, "grad_norm": 5.0625, "grad_norm_var": 0.05543212890625, "learning_rate": 0.0001, "loss": 5.5897, "loss/crossentropy": 2.498049259185791, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1601371020078659, "step": 12032 }, { "epoch": 0.547, "grad_norm": 10.1875, "grad_norm_var": 1.7553019205729166, "learning_rate": 0.0001, "loss": 5.6656, "loss/crossentropy": 2.3979645669460297, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1763778254389763, "step": 12034 }, { "epoch": 0.5470909090909091, "grad_norm": 4.625, "grad_norm_var": 1.7646484375, "learning_rate": 0.0001, "loss": 5.4872, "loss/crossentropy": 2.4233224391937256, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15971248969435692, "step": 12036 }, { "epoch": 0.5471818181818182, "grad_norm": 5.03125, "grad_norm_var": 1.76343994140625, "learning_rate": 0.0001, "loss": 5.2556, "loss/crossentropy": 2.2139559388160706, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15513892099261284, "step": 12038 }, { "epoch": 0.5472727272727272, "grad_norm": 4.96875, "grad_norm_var": 1.76597900390625, "learning_rate": 0.0001, "loss": 5.4517, "loss/crossentropy": 2.36692214012146, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16043269634246826, "step": 12040 }, { "epoch": 0.5473636363636364, "grad_norm": 4.75, "grad_norm_var": 1.78209228515625, "learning_rate": 0.0001, "loss": 5.2878, "loss/crossentropy": 2.3611292839050293, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.14617976546287537, "step": 12042 }, { "epoch": 0.5474545454545454, "grad_norm": 5.03125, "grad_norm_var": 1.7646769205729167, "learning_rate": 0.0001, "loss": 5.6721, "loss/crossentropy": 2.5895131826400757, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16099027544260025, "step": 12044 }, { "epoch": 0.5475454545454546, "grad_norm": 5.40625, "grad_norm_var": 1.752978515625, "learning_rate": 0.0001, "loss": 5.6635, "loss/crossentropy": 2.485221266746521, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16529299318790436, "step": 12046 }, { "epoch": 0.5476363636363636, "grad_norm": 4.9375, "grad_norm_var": 1.73902587890625, "learning_rate": 0.0001, "loss": 5.6871, "loss/crossentropy": 2.5230249762535095, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.1642569899559021, "step": 12048 }, { "epoch": 0.5477272727272727, "grad_norm": 5.125, "grad_norm_var": 0.08000895182291666, "learning_rate": 0.0001, "loss": 5.7031, "loss/crossentropy": 2.5264374017715454, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16864143684506416, "step": 12050 }, { "epoch": 0.5478181818181819, "grad_norm": 4.8125, "grad_norm_var": 0.07073160807291666, "learning_rate": 0.0001, "loss": 5.3211, "loss/crossentropy": 2.2726261019706726, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1544606313109398, "step": 12052 }, { "epoch": 0.5479090909090909, "grad_norm": 5.03125, "grad_norm_var": 0.0591796875, "learning_rate": 0.0001, "loss": 5.4534, "loss/crossentropy": 2.3432289361953735, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.15964578837156296, "step": 12054 }, { "epoch": 0.548, "grad_norm": 4.6875, "grad_norm_var": 0.06808268229166667, "learning_rate": 0.0001, "loss": 5.2478, "loss/crossentropy": 2.2864224910736084, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15161020681262016, "step": 12056 }, { "epoch": 0.5480909090909091, "grad_norm": 5.375, "grad_norm_var": 0.082666015625, "learning_rate": 0.0001, "loss": 5.7025, "loss/crossentropy": 2.5219152569770813, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16689015179872513, "step": 12058 }, { "epoch": 0.5481818181818182, "grad_norm": 4.875, "grad_norm_var": 0.059619140625, "learning_rate": 0.0001, "loss": 5.8413, "loss/crossentropy": 2.680335819721222, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16707008704543114, "step": 12060 }, { "epoch": 0.5482727272727272, "grad_norm": 4.8125, "grad_norm_var": 0.053120930989583336, "learning_rate": 0.0001, "loss": 5.4978, "loss/crossentropy": 2.4129289984703064, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1584879606962204, "step": 12062 }, { "epoch": 0.5483636363636364, "grad_norm": 4.65625, "grad_norm_var": 0.06968994140625, "learning_rate": 0.0001, "loss": 5.367, "loss/crossentropy": 2.3190665543079376, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1559655610471964, "step": 12064 }, { "epoch": 0.5484545454545454, "grad_norm": 4.90625, "grad_norm_var": 0.06910400390625, "learning_rate": 0.0001, "loss": 5.7127, "loss/crossentropy": 2.580237090587616, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16266440972685814, "step": 12066 }, { "epoch": 0.5485454545454546, "grad_norm": 4.84375, "grad_norm_var": 0.064697265625, "learning_rate": 0.0001, "loss": 5.5107, "loss/crossentropy": 2.500630497932434, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15452483296394348, "step": 12068 }, { "epoch": 0.5486363636363636, "grad_norm": 5.03125, "grad_norm_var": 0.06842041015625, "learning_rate": 0.0001, "loss": 5.8777, "loss/crossentropy": 2.654220223426819, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.17332900688052177, "step": 12070 }, { "epoch": 0.5487272727272727, "grad_norm": 4.6875, "grad_norm_var": 0.061421712239583336, "learning_rate": 0.0001, "loss": 5.8536, "loss/crossentropy": 2.6666173338890076, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17143207043409348, "step": 12072 }, { "epoch": 0.5488181818181819, "grad_norm": 5.59375, "grad_norm_var": 0.07198893229166667, "learning_rate": 0.0001, "loss": 5.7839, "loss/crossentropy": 2.6354020833969116, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16543930023908615, "step": 12074 }, { "epoch": 0.5489090909090909, "grad_norm": 5.03125, "grad_norm_var": 0.06770426432291667, "learning_rate": 0.0001, "loss": 5.4412, "loss/crossentropy": 2.376191735267639, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.1566983312368393, "step": 12076 }, { "epoch": 0.549, "grad_norm": 5.1875, "grad_norm_var": 0.06220296223958333, "learning_rate": 0.0001, "loss": 5.7555, "loss/crossentropy": 2.570434808731079, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1685025319457054, "step": 12078 }, { "epoch": 0.5490909090909091, "grad_norm": 5.75, "grad_norm_var": 0.08370768229166667, "learning_rate": 0.0001, "loss": 5.744, "loss/crossentropy": 2.531863808631897, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.16828493773937225, "step": 12080 }, { "epoch": 0.5491818181818182, "grad_norm": 4.75, "grad_norm_var": 0.10767822265625, "learning_rate": 0.0001, "loss": 6.0307, "loss/crossentropy": 2.804016649723053, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.17208464443683624, "step": 12082 }, { "epoch": 0.5492727272727272, "grad_norm": 5.78125, "grad_norm_var": 0.16013997395833332, "learning_rate": 0.0001, "loss": 5.8164, "loss/crossentropy": 2.5023974180221558, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.17885972931981087, "step": 12084 }, { "epoch": 0.5493636363636364, "grad_norm": 4.78125, "grad_norm_var": 0.153759765625, "learning_rate": 0.0001, "loss": 5.5651, "loss/crossentropy": 2.465980350971222, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16010596975684166, "step": 12086 }, { "epoch": 0.5494545454545454, "grad_norm": 4.90625, "grad_norm_var": 0.16347249348958334, "learning_rate": 0.0001, "loss": 5.5053, "loss/crossentropy": 2.472873270511627, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1567607969045639, "step": 12088 }, { "epoch": 0.5495454545454546, "grad_norm": 4.90625, "grad_norm_var": 0.15924072265625, "learning_rate": 0.0001, "loss": 5.3332, "loss/crossentropy": 2.3013472855091095, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15572846494615078, "step": 12090 }, { "epoch": 0.5496363636363636, "grad_norm": 4.46875, "grad_norm_var": 0.20679931640625, "learning_rate": 0.0001, "loss": 5.8359, "loss/crossentropy": 2.679712951183319, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16425445675849915, "step": 12092 }, { "epoch": 0.5497272727272727, "grad_norm": 4.25, "grad_norm_var": 0.26171875, "learning_rate": 0.0001, "loss": 5.132, "loss/crossentropy": 2.1865033507347107, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.14591878652572632, "step": 12094 }, { "epoch": 0.5498181818181819, "grad_norm": 5.25, "grad_norm_var": 0.23196207682291667, "learning_rate": 0.0001, "loss": 5.9268, "loss/crossentropy": 2.685212731361389, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1723962016403675, "step": 12096 }, { "epoch": 0.5499090909090909, "grad_norm": 5.84375, "grad_norm_var": 0.29159749348958336, "learning_rate": 0.0001, "loss": 5.6344, "loss/crossentropy": 2.4947914481163025, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16454292833805084, "step": 12098 }, { "epoch": 0.55, "grad_norm": 5.0625, "grad_norm_var": 0.23137613932291667, "learning_rate": 0.0001, "loss": 5.5842, "loss/crossentropy": 2.451418161392212, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16289153322577477, "step": 12100 }, { "epoch": 0.5500909090909091, "grad_norm": 5.28125, "grad_norm_var": 0.22945556640625, "learning_rate": 0.0001, "loss": 5.7433, "loss/crossentropy": 2.542769968509674, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16869046539068222, "step": 12102 }, { "epoch": 0.5501818181818182, "grad_norm": 4.84375, "grad_norm_var": 0.21721598307291667, "learning_rate": 0.0001, "loss": 5.5505, "loss/crossentropy": 2.496416985988617, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15540575981140137, "step": 12104 }, { "epoch": 0.5502727272727272, "grad_norm": 5.0625, "grad_norm_var": 8.05006103515625, "learning_rate": 0.0001, "loss": 5.3147, "loss/crossentropy": 2.1708960235118866, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16223157942295074, "step": 12106 }, { "epoch": 0.5503636363636364, "grad_norm": 5.09375, "grad_norm_var": 7.99537353515625, "learning_rate": 0.0001, "loss": 5.5691, "loss/crossentropy": 2.435040056705475, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16243384033441544, "step": 12108 }, { "epoch": 0.5504545454545454, "grad_norm": 4.84375, "grad_norm_var": 7.923140462239584, "learning_rate": 0.0001, "loss": 5.6976, "loss/crossentropy": 2.5275803804397583, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16563482955098152, "step": 12110 }, { "epoch": 0.5505454545454546, "grad_norm": 4.875, "grad_norm_var": 8.008882649739583, "learning_rate": 0.0001, "loss": 5.1005, "loss/crossentropy": 2.150497317314148, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.14578080922365189, "step": 12112 }, { "epoch": 0.5506363636363636, "grad_norm": 4.71875, "grad_norm_var": 8.133577473958333, "learning_rate": 0.0001, "loss": 5.3226, "loss/crossentropy": 2.3108710646629333, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15254252403974533, "step": 12114 }, { "epoch": 0.5507272727272727, "grad_norm": 4.875, "grad_norm_var": 8.181380208333334, "learning_rate": 0.0001, "loss": 5.5687, "loss/crossentropy": 2.46711128950119, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16094323620200157, "step": 12116 }, { "epoch": 0.5508181818181819, "grad_norm": 4.5625, "grad_norm_var": 8.287430826822916, "learning_rate": 0.0001, "loss": 5.5026, "loss/crossentropy": 2.476740777492523, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1557156704366207, "step": 12118 }, { "epoch": 0.5509090909090909, "grad_norm": 4.84375, "grad_norm_var": 8.324702962239583, "learning_rate": 0.0001, "loss": 5.7377, "loss/crossentropy": 2.6227015256881714, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16423814371228218, "step": 12120 }, { "epoch": 0.551, "grad_norm": 5.65625, "grad_norm_var": 0.0654296875, "learning_rate": 0.0001, "loss": 5.696, "loss/crossentropy": 2.539680600166321, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16485026106238365, "step": 12122 }, { "epoch": 0.5510909090909091, "grad_norm": 4.65625, "grad_norm_var": 0.07281494140625, "learning_rate": 0.0001, "loss": 5.5091, "loss/crossentropy": 2.4072123765945435, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16136322170495987, "step": 12124 }, { "epoch": 0.5511818181818182, "grad_norm": 5.46875, "grad_norm_var": 0.09407552083333333, "learning_rate": 0.0001, "loss": 5.8505, "loss/crossentropy": 2.671629011631012, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16866881400346756, "step": 12126 }, { "epoch": 0.5512727272727272, "grad_norm": 5.03125, "grad_norm_var": 0.10753580729166666, "learning_rate": 0.0001, "loss": 5.5994, "loss/crossentropy": 2.452264189720154, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16198041290044785, "step": 12128 }, { "epoch": 0.5513636363636364, "grad_norm": 5.125, "grad_norm_var": 0.10428059895833333, "learning_rate": 0.0001, "loss": 5.3913, "loss/crossentropy": 2.4063426852226257, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15357832983136177, "step": 12130 }, { "epoch": 0.5514545454545454, "grad_norm": 4.84375, "grad_norm_var": 0.10468343098958334, "learning_rate": 0.0001, "loss": 5.8478, "loss/crossentropy": 2.6562403440475464, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17189422994852066, "step": 12132 }, { "epoch": 0.5515454545454546, "grad_norm": 4.65625, "grad_norm_var": 0.11503499348958333, "learning_rate": 0.0001, "loss": 5.4716, "loss/crossentropy": 2.4511476159095764, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15380483120679855, "step": 12134 }, { "epoch": 0.5516363636363636, "grad_norm": 5.03125, "grad_norm_var": 0.10794270833333333, "learning_rate": 0.0001, "loss": 5.3884, "loss/crossentropy": 2.3642712235450745, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15455949679017067, "step": 12136 }, { "epoch": 0.5517272727272727, "grad_norm": 4.96875, "grad_norm_var": 0.08095296223958333, "learning_rate": 0.0001, "loss": 5.1557, "loss/crossentropy": 2.181484341621399, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1493772454559803, "step": 12138 }, { "epoch": 0.5518181818181818, "grad_norm": 4.65625, "grad_norm_var": 0.19716389973958334, "learning_rate": 0.0001, "loss": 5.5561, "loss/crossentropy": 2.4878833293914795, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15935614705085754, "step": 12140 }, { "epoch": 0.5519090909090909, "grad_norm": 5.3125, "grad_norm_var": 0.19755452473958332, "learning_rate": 0.0001, "loss": 5.4249, "loss/crossentropy": 2.4193437099456787, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15290168300271034, "step": 12142 }, { "epoch": 0.552, "grad_norm": 4.90625, "grad_norm_var": 0.19029541015625, "learning_rate": 0.0001, "loss": 5.6169, "loss/crossentropy": 2.5359317660331726, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.16181065887212753, "step": 12144 }, { "epoch": 0.5520909090909091, "grad_norm": 4.78125, "grad_norm_var": 0.20546468098958334, "learning_rate": 0.0001, "loss": 5.5336, "loss/crossentropy": 2.428410232067108, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.15915046259760857, "step": 12146 }, { "epoch": 0.5521818181818182, "grad_norm": 5.25, "grad_norm_var": 0.21392822265625, "learning_rate": 0.0001, "loss": 5.5166, "loss/crossentropy": 2.3964964151382446, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16103562340140343, "step": 12148 }, { "epoch": 0.5522727272727272, "grad_norm": 4.75, "grad_norm_var": 0.20836181640625, "learning_rate": 0.0001, "loss": 5.6443, "loss/crossentropy": 2.574618846178055, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15852880105376244, "step": 12150 }, { "epoch": 0.5523636363636364, "grad_norm": 4.84375, "grad_norm_var": 0.20689697265625, "learning_rate": 0.0001, "loss": 5.1682, "loss/crossentropy": 2.224143624305725, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.14459966123104095, "step": 12152 }, { "epoch": 0.5524545454545454, "grad_norm": 4.4375, "grad_norm_var": 0.232275390625, "learning_rate": 0.0001, "loss": 5.1186, "loss/crossentropy": 2.1792072057724, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.14647462964057922, "step": 12154 }, { "epoch": 0.5525454545454546, "grad_norm": 5.40625, "grad_norm_var": 0.11103108723958334, "learning_rate": 0.0001, "loss": 5.8347, "loss/crossentropy": 2.683619499206543, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16608503088355064, "step": 12156 }, { "epoch": 0.5526363636363636, "grad_norm": 5.0, "grad_norm_var": 0.09931233723958334, "learning_rate": 0.0001, "loss": 5.3, "loss/crossentropy": 2.2990229427814484, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15165656246244907, "step": 12158 }, { "epoch": 0.5527272727272727, "grad_norm": 5.0625, "grad_norm_var": 0.10271809895833334, "learning_rate": 0.0001, "loss": 5.9472, "loss/crossentropy": 2.694692552089691, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1748630590736866, "step": 12160 }, { "epoch": 0.5528181818181818, "grad_norm": 4.84375, "grad_norm_var": 0.08961181640625, "learning_rate": 0.0001, "loss": 5.8658, "loss/crossentropy": 2.6588631868362427, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.17166842892766, "step": 12162 }, { "epoch": 0.5529090909090909, "grad_norm": 4.9375, "grad_norm_var": 0.07323811848958334, "learning_rate": 0.0001, "loss": 5.2775, "loss/crossentropy": 2.2313567996025085, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1553943268954754, "step": 12164 }, { "epoch": 0.553, "grad_norm": 5.8125, "grad_norm_var": 6.606494140625, "learning_rate": 0.0001, "loss": 5.8099, "loss/crossentropy": 2.46657395362854, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17729934304952621, "step": 12166 }, { "epoch": 0.5530909090909091, "grad_norm": 5.15625, "grad_norm_var": 6.52398681640625, "learning_rate": 0.0001, "loss": 5.7746, "loss/crossentropy": 2.545652210712433, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17211183533072472, "step": 12168 }, { "epoch": 0.5531818181818182, "grad_norm": 6.1875, "grad_norm_var": 6.39742431640625, "learning_rate": 0.0001, "loss": 5.6012, "loss/crossentropy": 2.464741140604019, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16286784410476685, "step": 12170 }, { "epoch": 0.5532727272727272, "grad_norm": 4.78125, "grad_norm_var": 6.40078125, "learning_rate": 0.0001, "loss": 5.4235, "loss/crossentropy": 2.3559176325798035, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15675660222768784, "step": 12172 }, { "epoch": 0.5533636363636364, "grad_norm": 4.40625, "grad_norm_var": 6.438602701822917, "learning_rate": 0.0001, "loss": 5.3853, "loss/crossentropy": 2.3638344407081604, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1535091195255518, "step": 12174 }, { "epoch": 0.5534545454545454, "grad_norm": 5.65625, "grad_norm_var": 6.568550618489583, "learning_rate": 0.0001, "loss": 5.406, "loss/crossentropy": 2.3860928416252136, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15492126904428005, "step": 12176 }, { "epoch": 0.5535454545454546, "grad_norm": 4.59375, "grad_norm_var": 6.597880045572917, "learning_rate": 0.0001, "loss": 5.9068, "loss/crossentropy": 2.7228598594665527, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16897611320018768, "step": 12178 }, { "epoch": 0.5536363636363636, "grad_norm": 5.15625, "grad_norm_var": 6.551167805989583, "learning_rate": 0.0001, "loss": 5.7979, "loss/crossentropy": 2.5969238877296448, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1718524992465973, "step": 12180 }, { "epoch": 0.5537272727272727, "grad_norm": 5.0625, "grad_norm_var": 0.252587890625, "learning_rate": 0.0001, "loss": 5.4484, "loss/crossentropy": 2.414106070995331, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15694503113627434, "step": 12182 }, { "epoch": 0.5538181818181818, "grad_norm": 4.6875, "grad_norm_var": 0.25054931640625, "learning_rate": 0.0001, "loss": 5.5301, "loss/crossentropy": 2.493352711200714, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15582389011979103, "step": 12184 }, { "epoch": 0.5539090909090909, "grad_norm": 5.6875, "grad_norm_var": 0.186572265625, "learning_rate": 0.0001, "loss": 5.7096, "loss/crossentropy": 2.4938624501228333, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16883507370948792, "step": 12186 }, { "epoch": 0.554, "grad_norm": 5.28125, "grad_norm_var": 0.18880208333333334, "learning_rate": 0.0001, "loss": 5.8837, "loss/crossentropy": 2.662484109401703, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16958299279212952, "step": 12188 }, { "epoch": 0.5540909090909091, "grad_norm": 5.34375, "grad_norm_var": 0.16951497395833334, "learning_rate": 0.0001, "loss": 5.602, "loss/crossentropy": 2.550877571105957, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15686535090208054, "step": 12190 }, { "epoch": 0.5541818181818182, "grad_norm": 5.125, "grad_norm_var": 0.08681233723958333, "learning_rate": 0.0001, "loss": 5.6913, "loss/crossentropy": 2.517401933670044, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.1675814464688301, "step": 12192 }, { "epoch": 0.5542727272727273, "grad_norm": 4.53125, "grad_norm_var": 0.08918863932291667, "learning_rate": 0.0001, "loss": 5.4925, "loss/crossentropy": 2.4495535492897034, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15702901408076286, "step": 12194 }, { "epoch": 0.5543636363636364, "grad_norm": 4.40625, "grad_norm_var": 0.116650390625, "learning_rate": 0.0001, "loss": 5.203, "loss/crossentropy": 2.2093261778354645, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.14701969549059868, "step": 12196 }, { "epoch": 0.5544545454545454, "grad_norm": 5.1875, "grad_norm_var": 0.12812093098958333, "learning_rate": 0.0001, "loss": 5.4325, "loss/crossentropy": 2.3332322239875793, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1579759567975998, "step": 12198 }, { "epoch": 0.5545454545454546, "grad_norm": 5.53125, "grad_norm_var": 0.13017171223958332, "learning_rate": 0.0001, "loss": 5.3978, "loss/crossentropy": 2.2536393105983734, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16460728645324707, "step": 12200 }, { "epoch": 0.5546363636363636, "grad_norm": 4.65625, "grad_norm_var": 0.10188395182291667, "learning_rate": 0.0001, "loss": 5.3032, "loss/crossentropy": 2.285421371459961, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15236322209239006, "step": 12202 }, { "epoch": 0.5547272727272727, "grad_norm": 5.6875, "grad_norm_var": 0.22053629557291668, "learning_rate": 0.0001, "loss": 5.6812, "loss/crossentropy": 2.42428857088089, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1678810752928257, "step": 12204 }, { "epoch": 0.5548181818181818, "grad_norm": 4.8125, "grad_norm_var": 0.21741129557291666, "learning_rate": 0.0001, "loss": 5.2286, "loss/crossentropy": 2.2452887296676636, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1500888131558895, "step": 12206 }, { "epoch": 0.5549090909090909, "grad_norm": 5.0625, "grad_norm_var": 0.282666015625, "learning_rate": 0.0001, "loss": 6.1086, "loss/crossentropy": 2.7861289978027344, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1822461448609829, "step": 12208 }, { "epoch": 0.555, "grad_norm": 4.9375, "grad_norm_var": 0.26458333333333334, "learning_rate": 0.0001, "loss": 5.6825, "loss/crossentropy": 2.572957158088684, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1633010320365429, "step": 12210 }, { "epoch": 0.5550909090909091, "grad_norm": 5.21875, "grad_norm_var": 0.23632405598958334, "learning_rate": 0.0001, "loss": 5.3983, "loss/crossentropy": 2.3527638912200928, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1572888419032097, "step": 12212 }, { "epoch": 0.5551818181818182, "grad_norm": 4.96875, "grad_norm_var": 0.22196858723958332, "learning_rate": 0.0001, "loss": 5.4268, "loss/crossentropy": 2.4062750935554504, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15322594717144966, "step": 12214 }, { "epoch": 0.5552727272727273, "grad_norm": 5.09375, "grad_norm_var": 0.204150390625, "learning_rate": 0.0001, "loss": 5.8697, "loss/crossentropy": 2.7082927227020264, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16594843566417694, "step": 12216 }, { "epoch": 0.5553636363636364, "grad_norm": 5.21875, "grad_norm_var": 0.182666015625, "learning_rate": 0.0001, "loss": 5.6294, "loss/crossentropy": 2.4917418360710144, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16513008251786232, "step": 12218 }, { "epoch": 0.5554545454545454, "grad_norm": 4.5625, "grad_norm_var": 0.10491129557291666, "learning_rate": 0.0001, "loss": 5.5257, "loss/crossentropy": 2.4858800172805786, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15691284462809563, "step": 12220 }, { "epoch": 0.5555454545454546, "grad_norm": 5.53125, "grad_norm_var": 0.11441650390625, "learning_rate": 0.0001, "loss": 6.0811, "loss/crossentropy": 2.7765572667121887, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1781153455376625, "step": 12222 }, { "epoch": 0.5556363636363636, "grad_norm": 4.90625, "grad_norm_var": 0.05093994140625, "learning_rate": 0.0001, "loss": 5.4485, "loss/crossentropy": 2.366093337535858, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1586274728178978, "step": 12224 }, { "epoch": 0.5557272727272727, "grad_norm": 4.96875, "grad_norm_var": 0.05084635416666667, "learning_rate": 0.0001, "loss": 5.8366, "loss/crossentropy": 2.6479142904281616, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1700436919927597, "step": 12226 }, { "epoch": 0.5558181818181818, "grad_norm": 4.78125, "grad_norm_var": 0.04540608723958333, "learning_rate": 0.0001, "loss": 5.4629, "loss/crossentropy": 2.3423826694488525, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1634194776415825, "step": 12228 }, { "epoch": 0.5559090909090909, "grad_norm": 5.78125, "grad_norm_var": 0.08658447265625, "learning_rate": 0.0001, "loss": 5.7193, "loss/crossentropy": 2.5309571027755737, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16863582283258438, "step": 12230 }, { "epoch": 0.556, "grad_norm": 4.6875, "grad_norm_var": 0.09256184895833333, "learning_rate": 0.0001, "loss": 5.1926, "loss/crossentropy": 2.1851513385772705, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.15094373002648354, "step": 12232 }, { "epoch": 0.5560909090909091, "grad_norm": 4.90625, "grad_norm_var": 0.09108072916666667, "learning_rate": 0.0001, "loss": 5.5733, "loss/crossentropy": 2.473851501941681, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1597527228295803, "step": 12234 }, { "epoch": 0.5561818181818182, "grad_norm": 4.8125, "grad_norm_var": 0.08258056640625, "learning_rate": 0.0001, "loss": 5.4352, "loss/crossentropy": 2.426235318183899, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1520729511976242, "step": 12236 }, { "epoch": 0.5562727272727273, "grad_norm": 5.875, "grad_norm_var": 351.7364868164062, "learning_rate": 0.0001, "loss": 6.5218, "loss/crossentropy": 2.7042306661605835, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.22804607823491096, "step": 12238 }, { "epoch": 0.5563636363636364, "grad_norm": 15.5625, "grad_norm_var": 351.91484375, "learning_rate": 0.0001, "loss": 5.4025, "loss/crossentropy": 2.3346155285835266, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15952078998088837, "step": 12240 }, { "epoch": 0.5564545454545454, "grad_norm": 5.0625, "grad_norm_var": 351.372119140625, "learning_rate": 0.0001, "loss": 5.1957, "loss/crossentropy": 2.228040874004364, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.14520208910107613, "step": 12242 }, { "epoch": 0.5565454545454546, "grad_norm": 4.90625, "grad_norm_var": 351.1653279622396, "learning_rate": 0.0001, "loss": 5.656, "loss/crossentropy": 2.4659744799137115, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16861482709646225, "step": 12244 }, { "epoch": 0.5566363636363636, "grad_norm": 5.28125, "grad_norm_var": 351.2680623372396, "learning_rate": 0.0001, "loss": 5.9153, "loss/crossentropy": 2.688238739967346, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.17289778590202332, "step": 12246 }, { "epoch": 0.5567272727272727, "grad_norm": 5.21875, "grad_norm_var": 348.8878540039062, "learning_rate": 0.0001, "loss": 6.0363, "loss/crossentropy": 2.68748539686203, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.18116654083132744, "step": 12248 }, { "epoch": 0.5568181818181818, "grad_norm": 4.59375, "grad_norm_var": 349.3365844726562, "learning_rate": 0.0001, "loss": 5.3315, "loss/crossentropy": 2.340037167072296, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15266256779432297, "step": 12250 }, { "epoch": 0.5569090909090909, "grad_norm": 5.09375, "grad_norm_var": 348.8102172851562, "learning_rate": 0.0001, "loss": 5.7378, "loss/crossentropy": 2.564254581928253, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16794361546635628, "step": 12252 }, { "epoch": 0.557, "grad_norm": 5.21875, "grad_norm_var": 7.516890462239584, "learning_rate": 0.0001, "loss": 5.8311, "loss/crossentropy": 2.528565764427185, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1816231571137905, "step": 12254 }, { "epoch": 0.5570909090909091, "grad_norm": 5.5625, "grad_norm_var": 1.1221354166666666, "learning_rate": 0.0001, "loss": 5.9433, "loss/crossentropy": 2.757791042327881, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1685473918914795, "step": 12256 }, { "epoch": 0.5571818181818182, "grad_norm": 5.59375, "grad_norm_var": 1.1521769205729167, "learning_rate": 0.0001, "loss": 5.287, "loss/crossentropy": 2.269909143447876, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15307438001036644, "step": 12258 }, { "epoch": 0.5572727272727273, "grad_norm": 4.8125, "grad_norm_var": 1.1856119791666666, "learning_rate": 0.0001, "loss": 5.2697, "loss/crossentropy": 2.301614463329315, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.14700761809945107, "step": 12260 }, { "epoch": 0.5573636363636364, "grad_norm": 5.09375, "grad_norm_var": 1.2199869791666667, "learning_rate": 0.0001, "loss": 5.4358, "loss/crossentropy": 2.3511667251586914, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.15612239390611649, "step": 12262 }, { "epoch": 0.5574545454545454, "grad_norm": 5.03125, "grad_norm_var": 0.24055582682291668, "learning_rate": 0.0001, "loss": 5.6941, "loss/crossentropy": 2.4923476576805115, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.1664666347205639, "step": 12264 }, { "epoch": 0.5575454545454546, "grad_norm": 4.5625, "grad_norm_var": 0.24508056640625, "learning_rate": 0.0001, "loss": 5.4259, "loss/crossentropy": 2.3862876296043396, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15512826293706894, "step": 12266 }, { "epoch": 0.5576363636363636, "grad_norm": 5.875, "grad_norm_var": 0.30286051432291666, "learning_rate": 0.0001, "loss": 5.7808, "loss/crossentropy": 2.5392532348632812, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1755206249654293, "step": 12268 }, { "epoch": 0.5577272727272727, "grad_norm": 5.375, "grad_norm_var": 0.23528238932291667, "learning_rate": 0.0001, "loss": 5.7025, "loss/crossentropy": 2.5421494841575623, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.164864182472229, "step": 12270 }, { "epoch": 0.5578181818181818, "grad_norm": 4.65625, "grad_norm_var": 0.24049072265625, "learning_rate": 0.0001, "loss": 5.6517, "loss/crossentropy": 2.580172121524811, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16028229147195816, "step": 12272 }, { "epoch": 0.5579090909090909, "grad_norm": 4.78125, "grad_norm_var": 0.22858072916666666, "learning_rate": 0.0001, "loss": 5.8054, "loss/crossentropy": 2.607712209224701, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1678185984492302, "step": 12274 }, { "epoch": 0.558, "grad_norm": 5.0, "grad_norm_var": 0.21495768229166667, "learning_rate": 0.0001, "loss": 5.6271, "loss/crossentropy": 2.504303455352783, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16032563149929047, "step": 12276 }, { "epoch": 0.5580909090909091, "grad_norm": 4.84375, "grad_norm_var": 0.20627848307291666, "learning_rate": 0.0001, "loss": 5.4005, "loss/crossentropy": 2.3441523909568787, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1570066288113594, "step": 12278 }, { "epoch": 0.5581818181818182, "grad_norm": 4.5625, "grad_norm_var": 0.21164957682291666, "learning_rate": 0.0001, "loss": 5.2332, "loss/crossentropy": 2.2227862775325775, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1541709452867508, "step": 12280 }, { "epoch": 0.5582727272727273, "grad_norm": 5.71875, "grad_norm_var": 0.19364827473958332, "learning_rate": 0.0001, "loss": 5.5042, "loss/crossentropy": 2.4432972073554993, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15687483921647072, "step": 12282 }, { "epoch": 0.5583636363636364, "grad_norm": 4.59375, "grad_norm_var": 0.16534830729166666, "learning_rate": 0.0001, "loss": 5.3343, "loss/crossentropy": 2.3686773777008057, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.14949332922697067, "step": 12284 }, { "epoch": 0.5584545454545454, "grad_norm": 8.8125, "grad_norm_var": 1.0593587239583333, "learning_rate": 0.0001, "loss": 5.5831, "loss/crossentropy": 2.437762886285782, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16218645870685577, "step": 12286 }, { "epoch": 0.5585454545454546, "grad_norm": 5.4375, "grad_norm_var": 1.0285115559895834, "learning_rate": 0.0001, "loss": 5.4293, "loss/crossentropy": 2.2771124243736267, "loss/hidden": 1.548828125, "loss/jsd": 0.0, "loss/logits": 0.16033988818526268, "step": 12288 }, { "epoch": 0.5586363636363636, "grad_norm": 4.78125, "grad_norm_var": 1.04195556640625, "learning_rate": 0.0001, "loss": 5.5256, "loss/crossentropy": 2.472592353820801, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15803131088614464, "step": 12290 }, { "epoch": 0.5587272727272727, "grad_norm": 5.3125, "grad_norm_var": 1.037109375, "learning_rate": 0.0001, "loss": 5.6909, "loss/crossentropy": 2.5554531812667847, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16549433395266533, "step": 12292 }, { "epoch": 0.5588181818181818, "grad_norm": 5.65625, "grad_norm_var": 1.05279541015625, "learning_rate": 0.0001, "loss": 5.4396, "loss/crossentropy": 2.403671443462372, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15515902265906334, "step": 12294 }, { "epoch": 0.5589090909090909, "grad_norm": 5.09375, "grad_norm_var": 1.034375, "learning_rate": 0.0001, "loss": 5.5635, "loss/crossentropy": 2.4665570855140686, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16242538392543793, "step": 12296 }, { "epoch": 0.559, "grad_norm": 4.625, "grad_norm_var": 1.0318318684895833, "learning_rate": 0.0001, "loss": 5.6596, "loss/crossentropy": 2.5560131669044495, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16133318096399307, "step": 12298 }, { "epoch": 0.5590909090909091, "grad_norm": 5.1875, "grad_norm_var": 0.9929646809895833, "learning_rate": 0.0001, "loss": 5.9888, "loss/crossentropy": 2.736272156238556, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17525555193424225, "step": 12300 }, { "epoch": 0.5591818181818182, "grad_norm": 5.375, "grad_norm_var": 0.09894205729166666, "learning_rate": 0.0001, "loss": 5.7311, "loss/crossentropy": 2.549500912427902, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16503478959202766, "step": 12302 }, { "epoch": 0.5592727272727273, "grad_norm": 5.09375, "grad_norm_var": 0.3683878580729167, "learning_rate": 0.0001, "loss": 5.6008, "loss/crossentropy": 2.4362313747406006, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1658664606511593, "step": 12304 }, { "epoch": 0.5593636363636364, "grad_norm": 4.84375, "grad_norm_var": 0.363671875, "learning_rate": 0.0001, "loss": 5.5441, "loss/crossentropy": 2.4832570552825928, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1568659096956253, "step": 12306 }, { "epoch": 0.5594545454545454, "grad_norm": 5.15625, "grad_norm_var": 0.36300455729166664, "learning_rate": 0.0001, "loss": 5.5328, "loss/crossentropy": 2.4394392371177673, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.15913714095950127, "step": 12308 }, { "epoch": 0.5595454545454546, "grad_norm": 5.21875, "grad_norm_var": 0.3338826497395833, "learning_rate": 0.0001, "loss": 6.0476, "loss/crossentropy": 2.805640935897827, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.17165199667215347, "step": 12310 }, { "epoch": 0.5596363636363636, "grad_norm": 4.75, "grad_norm_var": 0.34524332682291664, "learning_rate": 0.0001, "loss": 5.474, "loss/crossentropy": 2.417115092277527, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15626973286271095, "step": 12312 }, { "epoch": 0.5597272727272727, "grad_norm": 4.9375, "grad_norm_var": 0.3349609375, "learning_rate": 0.0001, "loss": 5.6984, "loss/crossentropy": 2.5278199315071106, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16607894748449326, "step": 12314 }, { "epoch": 0.5598181818181818, "grad_norm": 5.0, "grad_norm_var": 0.3294921875, "learning_rate": 0.0001, "loss": 5.8477, "loss/crossentropy": 2.6091421246528625, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17307798564434052, "step": 12316 }, { "epoch": 0.5599090909090909, "grad_norm": 5.0, "grad_norm_var": 0.3433430989583333, "learning_rate": 0.0001, "loss": 5.6133, "loss/crossentropy": 2.5156832337379456, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1624949797987938, "step": 12318 }, { "epoch": 0.56, "grad_norm": 5.09375, "grad_norm_var": 0.04309895833333333, "learning_rate": 0.0001, "loss": 5.6679, "loss/crossentropy": 2.56925505399704, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1610344536602497, "step": 12320 }, { "epoch": 0.5600909090909091, "grad_norm": 5.0, "grad_norm_var": 0.03925374348958333, "learning_rate": 0.0001, "loss": 5.6931, "loss/crossentropy": 2.52028089761734, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16865187138319016, "step": 12322 }, { "epoch": 0.5601818181818182, "grad_norm": 5.03125, "grad_norm_var": 0.0384765625, "learning_rate": 0.0001, "loss": 5.4458, "loss/crossentropy": 2.3097259998321533, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16400037333369255, "step": 12324 }, { "epoch": 0.5602727272727273, "grad_norm": 5.125, "grad_norm_var": 0.03746337890625, "learning_rate": 0.0001, "loss": 5.6631, "loss/crossentropy": 2.566804885864258, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.15904304012656212, "step": 12326 }, { "epoch": 0.5603636363636364, "grad_norm": 5.0, "grad_norm_var": 0.03253580729166667, "learning_rate": 0.0001, "loss": 5.7718, "loss/crossentropy": 2.57272070646286, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.17088723927736282, "step": 12328 }, { "epoch": 0.5604545454545454, "grad_norm": 4.8125, "grad_norm_var": 0.03388264973958333, "learning_rate": 0.0001, "loss": 5.5165, "loss/crossentropy": 2.4308376014232635, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.1587589904665947, "step": 12330 }, { "epoch": 0.5605454545454546, "grad_norm": 4.46875, "grad_norm_var": 0.06151936848958333, "learning_rate": 0.0001, "loss": 5.3857, "loss/crossentropy": 2.3660449385643005, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1552852801978588, "step": 12332 }, { "epoch": 0.5606363636363636, "grad_norm": 4.46875, "grad_norm_var": 0.06995035807291666, "learning_rate": 0.0001, "loss": 5.3832, "loss/crossentropy": 2.3770184218883514, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.1492556668817997, "step": 12334 }, { "epoch": 0.5607272727272727, "grad_norm": 4.96875, "grad_norm_var": 0.07047119140625, "learning_rate": 0.0001, "loss": 5.8066, "loss/crossentropy": 2.686662942171097, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16414058208465576, "step": 12336 }, { "epoch": 0.5608181818181818, "grad_norm": 5.09375, "grad_norm_var": 0.078369140625, "learning_rate": 0.0001, "loss": 5.7269, "loss/crossentropy": 2.6134073138237, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1631072200834751, "step": 12338 }, { "epoch": 0.5609090909090909, "grad_norm": 4.84375, "grad_norm_var": 0.07255452473958333, "learning_rate": 0.0001, "loss": 5.2411, "loss/crossentropy": 2.2624399065971375, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1511908955872059, "step": 12340 }, { "epoch": 0.561, "grad_norm": 5.40625, "grad_norm_var": 0.08058268229166667, "learning_rate": 0.0001, "loss": 5.7249, "loss/crossentropy": 2.613484799861908, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16231799125671387, "step": 12342 }, { "epoch": 0.5610909090909091, "grad_norm": 5.09375, "grad_norm_var": 0.11183268229166667, "learning_rate": 0.0001, "loss": 5.8547, "loss/crossentropy": 2.655280888080597, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16740470379590988, "step": 12344 }, { "epoch": 0.5611818181818182, "grad_norm": 4.84375, "grad_norm_var": 0.121337890625, "learning_rate": 0.0001, "loss": 5.4767, "loss/crossentropy": 2.4149802923202515, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15832047536969185, "step": 12346 }, { "epoch": 0.5612727272727273, "grad_norm": 5.34375, "grad_norm_var": 0.109228515625, "learning_rate": 0.0001, "loss": 5.7937, "loss/crossentropy": 2.6330209970474243, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1666564978659153, "step": 12348 }, { "epoch": 0.5613636363636364, "grad_norm": 4.65625, "grad_norm_var": 0.10191650390625, "learning_rate": 0.0001, "loss": 5.2589, "loss/crossentropy": 2.290423482656479, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1499680131673813, "step": 12350 }, { "epoch": 0.5614545454545454, "grad_norm": 4.75, "grad_norm_var": 0.10084228515625, "learning_rate": 0.0001, "loss": 5.6626, "loss/crossentropy": 2.550921320915222, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16233883425593376, "step": 12352 }, { "epoch": 0.5615454545454546, "grad_norm": 5.15625, "grad_norm_var": 0.09563802083333334, "learning_rate": 0.0001, "loss": 5.5873, "loss/crossentropy": 2.497893989086151, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1593339666724205, "step": 12354 }, { "epoch": 0.5616363636363636, "grad_norm": 5.09375, "grad_norm_var": 0.08593343098958334, "learning_rate": 0.0001, "loss": 5.7844, "loss/crossentropy": 2.6043895483016968, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.17054082080721855, "step": 12356 }, { "epoch": 0.5617272727272727, "grad_norm": 4.625, "grad_norm_var": 0.08023681640625, "learning_rate": 0.0001, "loss": 5.5409, "loss/crossentropy": 2.438837766647339, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16137400269508362, "step": 12358 }, { "epoch": 0.5618181818181818, "grad_norm": 4.78125, "grad_norm_var": 0.05907796223958333, "learning_rate": 0.0001, "loss": 5.6697, "loss/crossentropy": 2.5313850045204163, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16500268504023552, "step": 12360 }, { "epoch": 0.5619090909090909, "grad_norm": 5.25, "grad_norm_var": 0.05870768229166667, "learning_rate": 0.0001, "loss": 5.7122, "loss/crossentropy": 2.544954001903534, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16516189649701118, "step": 12362 }, { "epoch": 0.562, "grad_norm": 4.8125, "grad_norm_var": 0.0751953125, "learning_rate": 0.0001, "loss": 5.8012, "loss/crossentropy": 2.6285024285316467, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16863516718149185, "step": 12364 }, { "epoch": 0.5620909090909091, "grad_norm": 4.6875, "grad_norm_var": 0.06802978515625, "learning_rate": 0.0001, "loss": 5.5682, "loss/crossentropy": 2.4734602570533752, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.16357142850756645, "step": 12366 }, { "epoch": 0.5621818181818182, "grad_norm": 4.875, "grad_norm_var": 0.06783447265625, "learning_rate": 0.0001, "loss": 5.8553, "loss/crossentropy": 2.6685399413108826, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16906602680683136, "step": 12368 }, { "epoch": 0.5622727272727273, "grad_norm": 4.75, "grad_norm_var": 0.07987874348958333, "learning_rate": 0.0001, "loss": 5.1491, "loss/crossentropy": 2.1997784078121185, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.14844798296689987, "step": 12370 }, { "epoch": 0.5623636363636364, "grad_norm": 5.125, "grad_norm_var": 0.13085530598958334, "learning_rate": 0.0001, "loss": 5.3717, "loss/crossentropy": 2.2994065284729004, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.15664424002170563, "step": 12372 }, { "epoch": 0.5624545454545454, "grad_norm": 4.9375, "grad_norm_var": 0.12353108723958334, "learning_rate": 0.0001, "loss": 5.7808, "loss/crossentropy": 2.6680538058280945, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.16615672409534454, "step": 12374 }, { "epoch": 0.5625454545454546, "grad_norm": 4.75, "grad_norm_var": 0.11894124348958333, "learning_rate": 0.0001, "loss": 5.7851, "loss/crossentropy": 2.636833369731903, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16697156056761742, "step": 12376 }, { "epoch": 0.5626363636363636, "grad_norm": 4.875, "grad_norm_var": 0.12141927083333333, "learning_rate": 0.0001, "loss": 5.6668, "loss/crossentropy": 2.4783145785331726, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16689380630850792, "step": 12378 }, { "epoch": 0.5627272727272727, "grad_norm": 4.4375, "grad_norm_var": 0.12486979166666666, "learning_rate": 0.0001, "loss": 5.2876, "loss/crossentropy": 2.2840095162391663, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.151925191283226, "step": 12380 }, { "epoch": 0.5628181818181818, "grad_norm": 5.3125, "grad_norm_var": 0.12537434895833333, "learning_rate": 0.0001, "loss": 6.0153, "loss/crossentropy": 2.727947473526001, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17716992646455765, "step": 12382 }, { "epoch": 0.5629090909090909, "grad_norm": 4.5625, "grad_norm_var": 0.13982747395833334, "learning_rate": 0.0001, "loss": 5.7112, "loss/crossentropy": 2.590764284133911, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.1649751141667366, "step": 12384 }, { "epoch": 0.563, "grad_norm": 5.625, "grad_norm_var": 0.13873291015625, "learning_rate": 0.0001, "loss": 5.9002, "loss/crossentropy": 2.665815055370331, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17343691363930702, "step": 12386 }, { "epoch": 0.5630909090909091, "grad_norm": 5.15625, "grad_norm_var": 0.116015625, "learning_rate": 0.0001, "loss": 5.3595, "loss/crossentropy": 2.4203909039497375, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.14508366957306862, "step": 12388 }, { "epoch": 0.5631818181818182, "grad_norm": 4.9375, "grad_norm_var": 0.14117431640625, "learning_rate": 0.0001, "loss": 5.5719, "loss/crossentropy": 2.5067862272262573, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15944526717066765, "step": 12390 }, { "epoch": 0.5632727272727273, "grad_norm": 5.21875, "grad_norm_var": 0.133056640625, "learning_rate": 0.0001, "loss": 5.8814, "loss/crossentropy": 2.7080366611480713, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17046190798282623, "step": 12392 }, { "epoch": 0.5633636363636364, "grad_norm": 5.0625, "grad_norm_var": 0.12823893229166666, "learning_rate": 0.0001, "loss": 5.4374, "loss/crossentropy": 2.3470643162727356, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.1568896882236004, "step": 12394 }, { "epoch": 0.5634545454545454, "grad_norm": 4.75, "grad_norm_var": 0.11751302083333333, "learning_rate": 0.0001, "loss": 5.2328, "loss/crossentropy": 2.2432524263858795, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.14895961619913578, "step": 12396 }, { "epoch": 0.5635454545454546, "grad_norm": 4.96875, "grad_norm_var": 0.112109375, "learning_rate": 0.0001, "loss": 5.5136, "loss/crossentropy": 2.3976858258247375, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16218146309256554, "step": 12398 }, { "epoch": 0.5636363636363636, "grad_norm": 4.875, "grad_norm_var": 0.09724934895833333, "learning_rate": 0.0001, "loss": 5.6269, "loss/crossentropy": 2.5535377264022827, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15850309655070305, "step": 12400 }, { "epoch": 0.5637272727272727, "grad_norm": 4.625, "grad_norm_var": 0.07565104166666667, "learning_rate": 0.0001, "loss": 5.5156, "loss/crossentropy": 2.4240078926086426, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15955087542533875, "step": 12402 }, { "epoch": 0.5638181818181818, "grad_norm": 5.28125, "grad_norm_var": 0.072119140625, "learning_rate": 0.0001, "loss": 5.6003, "loss/crossentropy": 2.4877944588661194, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16320621222257614, "step": 12404 }, { "epoch": 0.5639090909090909, "grad_norm": 4.78125, "grad_norm_var": 0.036454264322916666, "learning_rate": 0.0001, "loss": 5.4967, "loss/crossentropy": 2.439744770526886, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15647710859775543, "step": 12406 }, { "epoch": 0.564, "grad_norm": 4.96875, "grad_norm_var": 0.0390625, "learning_rate": 0.0001, "loss": 5.5224, "loss/crossentropy": 2.460795283317566, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.1602589264512062, "step": 12408 }, { "epoch": 0.5640909090909091, "grad_norm": 5.0625, "grad_norm_var": 0.04071858723958333, "learning_rate": 0.0001, "loss": 5.5402, "loss/crossentropy": 2.519256591796875, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1560007818043232, "step": 12410 }, { "epoch": 0.5641818181818182, "grad_norm": 4.6875, "grad_norm_var": 0.043680826822916664, "learning_rate": 0.0001, "loss": 5.3598, "loss/crossentropy": 2.407894253730774, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.1492931805551052, "step": 12412 }, { "epoch": 0.5642727272727273, "grad_norm": 6.0, "grad_norm_var": 2.150130208333333, "learning_rate": 0.0001, "loss": 5.8041, "loss/crossentropy": 2.452794849872589, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18122129887342453, "step": 12414 }, { "epoch": 0.5643636363636364, "grad_norm": 4.6875, "grad_norm_var": 2.1543253580729167, "learning_rate": 0.0001, "loss": 5.6709, "loss/crossentropy": 2.5243276357650757, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16387607902288437, "step": 12416 }, { "epoch": 0.5644545454545454, "grad_norm": 5.3125, "grad_norm_var": 2.1657511393229165, "learning_rate": 0.0001, "loss": 5.7069, "loss/crossentropy": 2.616263270378113, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.161607313901186, "step": 12418 }, { "epoch": 0.5645454545454546, "grad_norm": 4.875, "grad_norm_var": 2.199898274739583, "learning_rate": 0.0001, "loss": 5.2688, "loss/crossentropy": 2.3165701031684875, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15146982669830322, "step": 12420 }, { "epoch": 0.5646363636363636, "grad_norm": 4.6875, "grad_norm_var": 2.1942708333333334, "learning_rate": 0.0001, "loss": 5.339, "loss/crossentropy": 2.344943344593048, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1511613391339779, "step": 12422 }, { "epoch": 0.5647272727272727, "grad_norm": 4.65625, "grad_norm_var": 2.19742431640625, "learning_rate": 0.0001, "loss": 5.4621, "loss/crossentropy": 2.518851339817047, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.14686529338359833, "step": 12424 }, { "epoch": 0.5648181818181818, "grad_norm": 5.90625, "grad_norm_var": 2.19664306640625, "learning_rate": 0.0001, "loss": 5.8809, "loss/crossentropy": 2.6391289234161377, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17339275777339935, "step": 12426 }, { "epoch": 0.5649090909090909, "grad_norm": 5.1875, "grad_norm_var": 2.124853515625, "learning_rate": 0.0001, "loss": 5.6547, "loss/crossentropy": 2.5233870148658752, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16313588619232178, "step": 12428 }, { "epoch": 0.565, "grad_norm": 4.8125, "grad_norm_var": 0.163525390625, "learning_rate": 0.0001, "loss": 5.8361, "loss/crossentropy": 2.6430333852767944, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16930582746863365, "step": 12430 }, { "epoch": 0.5650909090909091, "grad_norm": 6.0, "grad_norm_var": 0.21432291666666667, "learning_rate": 0.0001, "loss": 5.694, "loss/crossentropy": 2.5287997722625732, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.1651521511375904, "step": 12432 }, { "epoch": 0.5651818181818182, "grad_norm": 4.75, "grad_norm_var": 0.3402180989583333, "learning_rate": 0.0001, "loss": 5.5325, "loss/crossentropy": 2.417554020881653, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.162078108638525, "step": 12434 }, { "epoch": 0.5652727272727273, "grad_norm": 4.1875, "grad_norm_var": 0.36763916015625, "learning_rate": 0.0001, "loss": 4.9198, "loss/crossentropy": 2.1145244240760803, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1344349905848503, "step": 12436 }, { "epoch": 0.5653636363636364, "grad_norm": 4.9375, "grad_norm_var": 0.35846354166666666, "learning_rate": 0.0001, "loss": 5.8726, "loss/crossentropy": 2.658089756965637, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17106376588344574, "step": 12438 }, { "epoch": 0.5654545454545454, "grad_norm": 4.65625, "grad_norm_var": 0.344384765625, "learning_rate": 0.0001, "loss": 5.6559, "loss/crossentropy": 2.5463132858276367, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16290981322526932, "step": 12440 }, { "epoch": 0.5655454545454546, "grad_norm": 4.40625, "grad_norm_var": 0.333203125, "learning_rate": 0.0001, "loss": 5.7023, "loss/crossentropy": 2.5873001515865326, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16423833556473255, "step": 12442 }, { "epoch": 0.5656363636363636, "grad_norm": 4.59375, "grad_norm_var": 0.34205322265625, "learning_rate": 0.0001, "loss": 5.4248, "loss/crossentropy": 2.4184764623641968, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15395406261086464, "step": 12444 }, { "epoch": 0.5657272727272727, "grad_norm": 4.90625, "grad_norm_var": 0.3408203125, "learning_rate": 0.0001, "loss": 5.7844, "loss/crossentropy": 2.6063942909240723, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16721286997199059, "step": 12446 }, { "epoch": 0.5658181818181818, "grad_norm": 4.625, "grad_norm_var": 0.27146809895833335, "learning_rate": 0.0001, "loss": 5.3925, "loss/crossentropy": 2.313130259513855, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15794016420841217, "step": 12448 }, { "epoch": 0.5659090909090909, "grad_norm": 5.0, "grad_norm_var": 0.07615559895833333, "learning_rate": 0.0001, "loss": 5.6748, "loss/crossentropy": 2.467174470424652, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16861893609166145, "step": 12450 }, { "epoch": 0.566, "grad_norm": 5.1875, "grad_norm_var": 0.056884765625, "learning_rate": 0.0001, "loss": 5.9029, "loss/crossentropy": 2.7430806756019592, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16520003229379654, "step": 12452 }, { "epoch": 0.5660909090909091, "grad_norm": 5.84375, "grad_norm_var": 0.19465738932291668, "learning_rate": 0.0001, "loss": 5.9223, "loss/crossentropy": 2.7103158235549927, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17002641409635544, "step": 12454 }, { "epoch": 0.5661818181818182, "grad_norm": 6.46875, "grad_norm_var": 0.34163004557291665, "learning_rate": 0.0001, "loss": 5.9248, "loss/crossentropy": 2.7172224521636963, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.1736917607486248, "step": 12456 }, { "epoch": 0.5662727272727273, "grad_norm": 4.5, "grad_norm_var": 0.335791015625, "learning_rate": 0.0001, "loss": 5.5414, "loss/crossentropy": 2.4445483684539795, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16105085611343384, "step": 12458 }, { "epoch": 0.5663636363636364, "grad_norm": 4.71875, "grad_norm_var": 0.32081705729166665, "learning_rate": 0.0001, "loss": 5.6187, "loss/crossentropy": 2.5712639689445496, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.1584542691707611, "step": 12460 }, { "epoch": 0.5664545454545454, "grad_norm": 5.09375, "grad_norm_var": 0.3152994791666667, "learning_rate": 0.0001, "loss": 5.7237, "loss/crossentropy": 2.5880876779556274, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.1664932668209076, "step": 12462 }, { "epoch": 0.5665454545454546, "grad_norm": 6.625, "grad_norm_var": 0.4595703125, "learning_rate": 0.0001, "loss": 5.2684, "loss/crossentropy": 2.238225072622299, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15477661415934563, "step": 12464 }, { "epoch": 0.5666363636363636, "grad_norm": 5.375, "grad_norm_var": 0.45299072265625, "learning_rate": 0.0001, "loss": 5.965, "loss/crossentropy": 2.7440072298049927, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.17424371093511581, "step": 12466 }, { "epoch": 0.5667272727272727, "grad_norm": 5.03125, "grad_norm_var": 0.44908447265625, "learning_rate": 0.0001, "loss": 5.815, "loss/crossentropy": 2.6777132749557495, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16451410576701164, "step": 12468 }, { "epoch": 0.5668181818181818, "grad_norm": 5.15625, "grad_norm_var": 0.38046875, "learning_rate": 0.0001, "loss": 5.5708, "loss/crossentropy": 2.398127853870392, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.1674637496471405, "step": 12470 }, { "epoch": 0.5669090909090909, "grad_norm": 4.6875, "grad_norm_var": 0.250244140625, "learning_rate": 0.0001, "loss": 5.393, "loss/crossentropy": 2.3585721850395203, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15461960434913635, "step": 12472 }, { "epoch": 0.567, "grad_norm": 4.6875, "grad_norm_var": 0.23853759765625, "learning_rate": 0.0001, "loss": 5.5419, "loss/crossentropy": 2.497170627117157, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15603407472372055, "step": 12474 }, { "epoch": 0.5670909090909091, "grad_norm": 5.125, "grad_norm_var": 0.22239176432291666, "learning_rate": 0.0001, "loss": 5.8153, "loss/crossentropy": 2.6307443976402283, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16904527321457863, "step": 12476 }, { "epoch": 0.5671818181818182, "grad_norm": 4.96875, "grad_norm_var": 0.22476806640625, "learning_rate": 0.0001, "loss": 5.3422, "loss/crossentropy": 2.3128874003887177, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15527592599391937, "step": 12478 }, { "epoch": 0.5672727272727273, "grad_norm": 4.53125, "grad_norm_var": 0.06926676432291666, "learning_rate": 0.0001, "loss": 5.0664, "loss/crossentropy": 2.139479637145996, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1411321461200714, "step": 12480 }, { "epoch": 0.5673636363636364, "grad_norm": 5.3125, "grad_norm_var": 0.10501302083333333, "learning_rate": 0.0001, "loss": 5.8262, "loss/crossentropy": 2.6218754649162292, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17043724656105042, "step": 12482 }, { "epoch": 0.5674545454545454, "grad_norm": 5.34375, "grad_norm_var": 0.110791015625, "learning_rate": 0.0001, "loss": 5.6958, "loss/crossentropy": 2.5374042987823486, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16368839144706726, "step": 12484 }, { "epoch": 0.5675454545454546, "grad_norm": 5.09375, "grad_norm_var": 0.11161702473958333, "learning_rate": 0.0001, "loss": 5.9496, "loss/crossentropy": 2.7113914489746094, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17459870129823685, "step": 12486 }, { "epoch": 0.5676363636363636, "grad_norm": 4.96875, "grad_norm_var": 0.09895833333333333, "learning_rate": 0.0001, "loss": 5.6793, "loss/crossentropy": 2.515933930873871, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16536100581288338, "step": 12488 }, { "epoch": 0.5677272727272727, "grad_norm": 5.1875, "grad_norm_var": 0.09625244140625, "learning_rate": 0.0001, "loss": 5.4134, "loss/crossentropy": 2.3624037504196167, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.15529590100049973, "step": 12490 }, { "epoch": 0.5678181818181818, "grad_norm": 4.625, "grad_norm_var": 0.12157796223958334, "learning_rate": 0.0001, "loss": 5.0946, "loss/crossentropy": 2.2208310663700104, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.14108539000153542, "step": 12492 }, { "epoch": 0.5679090909090909, "grad_norm": 4.53125, "grad_norm_var": 0.13843994140625, "learning_rate": 0.0001, "loss": 5.5344, "loss/crossentropy": 2.4788554310798645, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15731456875801086, "step": 12494 }, { "epoch": 0.568, "grad_norm": 5.4375, "grad_norm_var": 0.13681233723958333, "learning_rate": 0.0001, "loss": 5.7618, "loss/crossentropy": 2.632826507091522, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1664118506014347, "step": 12496 }, { "epoch": 0.5680909090909091, "grad_norm": 5.65625, "grad_norm_var": 0.110546875, "learning_rate": 0.0001, "loss": 5.6459, "loss/crossentropy": 2.530439019203186, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16193854063749313, "step": 12498 }, { "epoch": 0.5681818181818182, "grad_norm": 4.8125, "grad_norm_var": 0.10670166015625, "learning_rate": 0.0001, "loss": 5.749, "loss/crossentropy": 2.5376789569854736, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17191049829125404, "step": 12500 }, { "epoch": 0.5682727272727273, "grad_norm": 4.71875, "grad_norm_var": 0.11832275390625, "learning_rate": 0.0001, "loss": 5.4336, "loss/crossentropy": 2.4506676197052, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.15278635546565056, "step": 12502 }, { "epoch": 0.5683636363636364, "grad_norm": 5.03125, "grad_norm_var": 0.1181640625, "learning_rate": 0.0001, "loss": 5.6235, "loss/crossentropy": 2.5342578291893005, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.1591196283698082, "step": 12504 }, { "epoch": 0.5684545454545454, "grad_norm": 5.4375, "grad_norm_var": 0.14479166666666668, "learning_rate": 0.0001, "loss": 5.7355, "loss/crossentropy": 2.5835026502609253, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16520339995622635, "step": 12506 }, { "epoch": 0.5685454545454546, "grad_norm": 4.75, "grad_norm_var": 0.13489176432291666, "learning_rate": 0.0001, "loss": 5.4421, "loss/crossentropy": 2.4699018001556396, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.1517101563513279, "step": 12508 }, { "epoch": 0.5686363636363636, "grad_norm": 4.90625, "grad_norm_var": 0.11901041666666666, "learning_rate": 0.0001, "loss": 5.2028, "loss/crossentropy": 2.1922098994255066, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15184422954916954, "step": 12510 }, { "epoch": 0.5687272727272727, "grad_norm": 4.46875, "grad_norm_var": 0.12589518229166666, "learning_rate": 0.0001, "loss": 5.4463, "loss/crossentropy": 2.508118987083435, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15085142850875854, "step": 12512 }, { "epoch": 0.5688181818181818, "grad_norm": 5.34375, "grad_norm_var": 0.10601806640625, "learning_rate": 0.0001, "loss": 5.6226, "loss/crossentropy": 2.586339056491852, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.1577323153614998, "step": 12514 }, { "epoch": 0.5689090909090909, "grad_norm": 4.78125, "grad_norm_var": 0.10974934895833334, "learning_rate": 0.0001, "loss": 5.2403, "loss/crossentropy": 2.3109437227249146, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.14449363201856613, "step": 12516 }, { "epoch": 0.569, "grad_norm": 4.65625, "grad_norm_var": 0.09840087890625, "learning_rate": 0.0001, "loss": 5.1658, "loss/crossentropy": 2.175502747297287, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.148447435349226, "step": 12518 }, { "epoch": 0.5690909090909091, "grad_norm": 4.9375, "grad_norm_var": 0.09527587890625, "learning_rate": 0.0001, "loss": 5.394, "loss/crossentropy": 2.3509193062782288, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1550910696387291, "step": 12520 }, { "epoch": 0.5691818181818182, "grad_norm": 6.96875, "grad_norm_var": 0.33592122395833335, "learning_rate": 0.0001, "loss": 5.8068, "loss/crossentropy": 2.543339490890503, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17322121188044548, "step": 12522 }, { "epoch": 0.5692727272727273, "grad_norm": 4.65625, "grad_norm_var": 0.3343587239583333, "learning_rate": 0.0001, "loss": 5.1477, "loss/crossentropy": 2.2352875471115112, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.14475348964333534, "step": 12524 }, { "epoch": 0.5693636363636364, "grad_norm": 6.84375, "grad_norm_var": 0.5570597330729167, "learning_rate": 0.0001, "loss": 6.0529, "loss/crossentropy": 2.7174421548843384, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1808066889643669, "step": 12526 }, { "epoch": 0.5694545454545454, "grad_norm": 5.125, "grad_norm_var": 0.517578125, "learning_rate": 0.0001, "loss": 5.6649, "loss/crossentropy": 2.5530218482017517, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16119248420000076, "step": 12528 }, { "epoch": 0.5695454545454546, "grad_norm": 5.03125, "grad_norm_var": 0.504296875, "learning_rate": 0.0001, "loss": 5.9255, "loss/crossentropy": 2.706557273864746, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1695527844130993, "step": 12530 }, { "epoch": 0.5696363636363636, "grad_norm": 5.15625, "grad_norm_var": 0.5094034830729167, "learning_rate": 0.0001, "loss": 5.8863, "loss/crossentropy": 2.6816256642341614, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16890133544802666, "step": 12532 }, { "epoch": 0.5697272727272727, "grad_norm": 4.5, "grad_norm_var": 0.501171875, "learning_rate": 0.0001, "loss": 5.4632, "loss/crossentropy": 2.3589871525764465, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16159706935286522, "step": 12534 }, { "epoch": 0.5698181818181818, "grad_norm": 5.15625, "grad_norm_var": 0.47213134765625, "learning_rate": 0.0001, "loss": 5.989, "loss/crossentropy": 2.6729572415351868, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18003839626908302, "step": 12536 }, { "epoch": 0.5699090909090909, "grad_norm": 5.375, "grad_norm_var": 0.28853759765625, "learning_rate": 0.0001, "loss": 5.8111, "loss/crossentropy": 2.677076518535614, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16184039413928986, "step": 12538 }, { "epoch": 0.57, "grad_norm": 4.96875, "grad_norm_var": 0.26038004557291666, "learning_rate": 0.0001, "loss": 5.8246, "loss/crossentropy": 2.632142424583435, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16845954954624176, "step": 12540 }, { "epoch": 0.5700909090909091, "grad_norm": 5.4375, "grad_norm_var": 0.10130208333333333, "learning_rate": 0.0001, "loss": 5.5718, "loss/crossentropy": 2.454837590456009, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16130495816469193, "step": 12542 }, { "epoch": 0.5701818181818182, "grad_norm": 5.0, "grad_norm_var": 0.12675374348958332, "learning_rate": 0.0001, "loss": 5.4363, "loss/crossentropy": 2.388817548751831, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15572696551680565, "step": 12544 }, { "epoch": 0.5702727272727273, "grad_norm": 4.8125, "grad_norm_var": 0.13827718098958333, "learning_rate": 0.0001, "loss": 5.166, "loss/crossentropy": 2.184056520462036, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.14780095219612122, "step": 12546 }, { "epoch": 0.5703636363636364, "grad_norm": 4.96875, "grad_norm_var": 0.08596598307291667, "learning_rate": 0.0001, "loss": 5.5549, "loss/crossentropy": 2.4440484046936035, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16205701231956482, "step": 12548 }, { "epoch": 0.5704545454545454, "grad_norm": 4.875, "grad_norm_var": 0.05779622395833333, "learning_rate": 0.0001, "loss": 5.9219, "loss/crossentropy": 2.7033936381340027, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17341173812747002, "step": 12550 }, { "epoch": 0.5705454545454546, "grad_norm": 4.90625, "grad_norm_var": 0.050191243489583336, "learning_rate": 0.0001, "loss": 5.6174, "loss/crossentropy": 2.4143083095550537, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16991867125034332, "step": 12552 }, { "epoch": 0.5706363636363636, "grad_norm": 5.09375, "grad_norm_var": 0.03583577473958333, "learning_rate": 0.0001, "loss": 5.6452, "loss/crossentropy": 2.4560999870300293, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16813290864229202, "step": 12554 }, { "epoch": 0.5707272727272727, "grad_norm": 6.3125, "grad_norm_var": 0.15025634765625, "learning_rate": 0.0001, "loss": 5.9779, "loss/crossentropy": 2.661012828350067, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.17876141145825386, "step": 12556 }, { "epoch": 0.5708181818181818, "grad_norm": 5.1875, "grad_norm_var": 0.14208577473958334, "learning_rate": 0.0001, "loss": 5.5662, "loss/crossentropy": 2.378601610660553, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16660858690738678, "step": 12558 }, { "epoch": 0.5709090909090909, "grad_norm": 11.375, "grad_norm_var": 2.57330322265625, "learning_rate": 0.0001, "loss": 5.5741, "loss/crossentropy": 2.3754146099090576, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1683095432817936, "step": 12560 }, { "epoch": 0.571, "grad_norm": 5.03125, "grad_norm_var": 2.5208170572916666, "learning_rate": 0.0001, "loss": 5.2443, "loss/crossentropy": 2.196614980697632, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1541842482984066, "step": 12562 }, { "epoch": 0.5710909090909091, "grad_norm": 4.96875, "grad_norm_var": 2.515885416666667, "learning_rate": 0.0001, "loss": 5.5119, "loss/crossentropy": 2.499367594718933, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15496844053268433, "step": 12564 }, { "epoch": 0.5711818181818182, "grad_norm": 5.0, "grad_norm_var": 2.4945271809895835, "learning_rate": 0.0001, "loss": 5.4004, "loss/crossentropy": 2.3762601613998413, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15456349775195122, "step": 12566 }, { "epoch": 0.5712727272727273, "grad_norm": 5.1875, "grad_norm_var": 2.5399739583333334, "learning_rate": 0.0001, "loss": 5.1916, "loss/crossentropy": 2.2285718619823456, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.14629879780113697, "step": 12568 }, { "epoch": 0.5713636363636364, "grad_norm": 5.53125, "grad_norm_var": 2.5560546875, "learning_rate": 0.0001, "loss": 5.5917, "loss/crossentropy": 2.443901240825653, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1640026494860649, "step": 12570 }, { "epoch": 0.5714545454545454, "grad_norm": 4.5, "grad_norm_var": 2.597359212239583, "learning_rate": 0.0001, "loss": 5.4748, "loss/crossentropy": 2.4624048471450806, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15221330896019936, "step": 12572 }, { "epoch": 0.5715454545454546, "grad_norm": 4.5625, "grad_norm_var": 2.6690388997395833, "learning_rate": 0.0001, "loss": 5.6019, "loss/crossentropy": 2.518677592277527, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.16203340142965317, "step": 12574 }, { "epoch": 0.5716363636363636, "grad_norm": 4.625, "grad_norm_var": 0.13004150390625, "learning_rate": 0.0001, "loss": 5.5239, "loss/crossentropy": 2.488992840051651, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1581779643893242, "step": 12576 }, { "epoch": 0.5717272727272728, "grad_norm": 5.3125, "grad_norm_var": 0.09459228515625, "learning_rate": 0.0001, "loss": 5.8316, "loss/crossentropy": 2.6219934225082397, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.1688152179121971, "step": 12578 }, { "epoch": 0.5718181818181818, "grad_norm": 4.59375, "grad_norm_var": 0.104931640625, "learning_rate": 0.0001, "loss": 5.4033, "loss/crossentropy": 2.4028836488723755, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15218858793377876, "step": 12580 }, { "epoch": 0.5719090909090909, "grad_norm": 4.53125, "grad_norm_var": 0.10963134765625, "learning_rate": 0.0001, "loss": 5.6692, "loss/crossentropy": 2.5963767766952515, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15923921391367912, "step": 12582 }, { "epoch": 0.572, "grad_norm": 4.53125, "grad_norm_var": 0.09737955729166667, "learning_rate": 0.0001, "loss": 5.6744, "loss/crossentropy": 2.61434805393219, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.16088807955384254, "step": 12584 }, { "epoch": 0.5720909090909091, "grad_norm": 4.53125, "grad_norm_var": 0.09254150390625, "learning_rate": 0.0001, "loss": 5.4456, "loss/crossentropy": 2.380621373653412, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15649783611297607, "step": 12586 }, { "epoch": 0.5721818181818182, "grad_norm": 4.65625, "grad_norm_var": 0.09384358723958333, "learning_rate": 0.0001, "loss": 5.5957, "loss/crossentropy": 2.473643183708191, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16161930188536644, "step": 12588 }, { "epoch": 0.5722727272727273, "grad_norm": 4.625, "grad_norm_var": 0.12571207682291666, "learning_rate": 0.0001, "loss": 5.6756, "loss/crossentropy": 2.5975553393363953, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15877661854028702, "step": 12590 }, { "epoch": 0.5723636363636364, "grad_norm": 4.5, "grad_norm_var": 0.172900390625, "learning_rate": 0.0001, "loss": 5.6913, "loss/crossentropy": 2.5567031800746918, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16639236733317375, "step": 12592 }, { "epoch": 0.5724545454545454, "grad_norm": 4.65625, "grad_norm_var": 0.16809488932291666, "learning_rate": 0.0001, "loss": 5.4099, "loss/crossentropy": 2.3945192098617554, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15192433446645737, "step": 12594 }, { "epoch": 0.5725454545454546, "grad_norm": 4.5, "grad_norm_var": 0.16495768229166666, "learning_rate": 0.0001, "loss": 5.197, "loss/crossentropy": 2.280409663915634, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.14692838676273823, "step": 12596 }, { "epoch": 0.5726363636363636, "grad_norm": 5.34375, "grad_norm_var": 0.19306233723958333, "learning_rate": 0.0001, "loss": 5.7284, "loss/crossentropy": 2.557257056236267, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1677020937204361, "step": 12598 }, { "epoch": 0.5727272727272728, "grad_norm": 4.6875, "grad_norm_var": 0.18502604166666667, "learning_rate": 0.0001, "loss": 5.2561, "loss/crossentropy": 2.275733232498169, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.14940858259797096, "step": 12600 }, { "epoch": 0.5728181818181818, "grad_norm": 5.15625, "grad_norm_var": 0.3092447916666667, "learning_rate": 0.0001, "loss": 5.6567, "loss/crossentropy": 2.4790654480457306, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.1656159982085228, "step": 12602 }, { "epoch": 0.5729090909090909, "grad_norm": 4.8125, "grad_norm_var": 0.3093058268229167, "learning_rate": 0.0001, "loss": 5.5234, "loss/crossentropy": 2.454949915409088, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16016291081905365, "step": 12604 }, { "epoch": 0.573, "grad_norm": 5.5, "grad_norm_var": 0.2951171875, "learning_rate": 0.0001, "loss": 5.7642, "loss/crossentropy": 2.5987899899482727, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16732396185398102, "step": 12606 }, { "epoch": 0.5730909090909091, "grad_norm": 5.1875, "grad_norm_var": 0.25657552083333335, "learning_rate": 0.0001, "loss": 5.5832, "loss/crossentropy": 2.492612063884735, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16101237013936043, "step": 12608 }, { "epoch": 0.5731818181818182, "grad_norm": 4.34375, "grad_norm_var": 0.28277587890625, "learning_rate": 0.0001, "loss": 5.285, "loss/crossentropy": 2.2974409759044647, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15070682018995285, "step": 12610 }, { "epoch": 0.5732727272727273, "grad_norm": 5.0625, "grad_norm_var": 0.27537434895833335, "learning_rate": 0.0001, "loss": 5.6999, "loss/crossentropy": 2.6505894660949707, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15727685019373894, "step": 12612 }, { "epoch": 0.5733636363636364, "grad_norm": 8.3125, "grad_norm_var": 0.92545166015625, "learning_rate": 0.0001, "loss": 5.4716, "loss/crossentropy": 2.3866373896598816, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.15869001671671867, "step": 12614 }, { "epoch": 0.5734545454545454, "grad_norm": 4.5625, "grad_norm_var": 0.9352498372395833, "learning_rate": 0.0001, "loss": 5.1225, "loss/crossentropy": 2.177122265100479, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.1431664451956749, "step": 12616 }, { "epoch": 0.5735454545454546, "grad_norm": 5.34375, "grad_norm_var": 0.8118123372395833, "learning_rate": 0.0001, "loss": 5.9439, "loss/crossentropy": 2.752324402332306, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16915984451770782, "step": 12618 }, { "epoch": 0.5736363636363636, "grad_norm": 5.59375, "grad_norm_var": 0.8031209309895834, "learning_rate": 0.0001, "loss": 5.75, "loss/crossentropy": 2.5343589782714844, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17156235501170158, "step": 12620 }, { "epoch": 0.5737272727272728, "grad_norm": 4.65625, "grad_norm_var": 0.8210286458333333, "learning_rate": 0.0001, "loss": 5.697, "loss/crossentropy": 2.6087059378623962, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1615646593272686, "step": 12622 }, { "epoch": 0.5738181818181818, "grad_norm": 5.5625, "grad_norm_var": 0.8302083333333333, "learning_rate": 0.0001, "loss": 5.7407, "loss/crossentropy": 2.5721961855888367, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16665035113692284, "step": 12624 }, { "epoch": 0.5739090909090909, "grad_norm": 4.71875, "grad_norm_var": 0.7933430989583333, "learning_rate": 0.0001, "loss": 5.7696, "loss/crossentropy": 2.6403281688690186, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16507647559046745, "step": 12626 }, { "epoch": 0.574, "grad_norm": 5.03125, "grad_norm_var": 0.775244140625, "learning_rate": 0.0001, "loss": 5.6562, "loss/crossentropy": 2.5560121536254883, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16333546862006187, "step": 12628 }, { "epoch": 0.5740909090909091, "grad_norm": 4.6875, "grad_norm_var": 0.11842041015625, "learning_rate": 0.0001, "loss": 5.0528, "loss/crossentropy": 2.157546103000641, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.1447979211807251, "step": 12630 }, { "epoch": 0.5741818181818181, "grad_norm": 4.5, "grad_norm_var": 0.12307535807291667, "learning_rate": 0.0001, "loss": 5.8717, "loss/crossentropy": 2.703748196363449, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.1654311716556549, "step": 12632 }, { "epoch": 0.5742727272727273, "grad_norm": 5.25, "grad_norm_var": 0.11080729166666667, "learning_rate": 0.0001, "loss": 5.7369, "loss/crossentropy": 2.5185986161231995, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17026911675930023, "step": 12634 }, { "epoch": 0.5743636363636364, "grad_norm": 4.65625, "grad_norm_var": 0.07854410807291666, "learning_rate": 0.0001, "loss": 5.5409, "loss/crossentropy": 2.5306065380573273, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15318153239786625, "step": 12636 }, { "epoch": 0.5744545454545454, "grad_norm": 5.1875, "grad_norm_var": 0.07743733723958333, "learning_rate": 0.0001, "loss": 5.5192, "loss/crossentropy": 2.3535690903663635, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1649978868663311, "step": 12638 }, { "epoch": 0.5745454545454546, "grad_norm": 5.0, "grad_norm_var": 0.049214680989583336, "learning_rate": 0.0001, "loss": 5.5453, "loss/crossentropy": 2.5112658739089966, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15496457368135452, "step": 12640 }, { "epoch": 0.5746363636363636, "grad_norm": 4.875, "grad_norm_var": 0.0466796875, "learning_rate": 0.0001, "loss": 5.5205, "loss/crossentropy": 2.4394948482513428, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16024967655539513, "step": 12642 }, { "epoch": 0.5747272727272728, "grad_norm": 5.0625, "grad_norm_var": 0.04998372395833333, "learning_rate": 0.0001, "loss": 5.5027, "loss/crossentropy": 2.3720807433128357, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16189342737197876, "step": 12644 }, { "epoch": 0.5748181818181818, "grad_norm": 5.0625, "grad_norm_var": 0.04217122395833333, "learning_rate": 0.0001, "loss": 5.3726, "loss/crossentropy": 2.3237873315811157, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15449083596467972, "step": 12646 }, { "epoch": 0.5749090909090909, "grad_norm": 4.875, "grad_norm_var": 0.025679524739583334, "learning_rate": 0.0001, "loss": 5.5614, "loss/crossentropy": 2.5444445610046387, "loss/hidden": 1.431640625, "loss/jsd": 0.0, "loss/logits": 0.15853150933980942, "step": 12648 }, { "epoch": 0.575, "grad_norm": 4.8125, "grad_norm_var": 0.022119140625, "learning_rate": 0.0001, "loss": 5.6278, "loss/crossentropy": 2.5213130712509155, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16377481818199158, "step": 12650 }, { "epoch": 0.5750909090909091, "grad_norm": 5.40625, "grad_norm_var": 0.0330078125, "learning_rate": 0.0001, "loss": 5.5616, "loss/crossentropy": 2.4559340476989746, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1632968820631504, "step": 12652 }, { "epoch": 0.5751818181818181, "grad_norm": 4.65625, "grad_norm_var": 0.04260660807291667, "learning_rate": 0.0001, "loss": 5.0612, "loss/crossentropy": 2.1392775177955627, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1455100141465664, "step": 12654 }, { "epoch": 0.5752727272727273, "grad_norm": 5.21875, "grad_norm_var": 0.04816080729166667, "learning_rate": 0.0001, "loss": 5.6599, "loss/crossentropy": 2.523610293865204, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16694991663098335, "step": 12656 }, { "epoch": 0.5753636363636364, "grad_norm": 4.46875, "grad_norm_var": 0.06484375, "learning_rate": 0.0001, "loss": 5.6036, "loss/crossentropy": 2.5419200658798218, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15968641266226768, "step": 12658 }, { "epoch": 0.5754545454545454, "grad_norm": 4.90625, "grad_norm_var": 0.06529947916666666, "learning_rate": 0.0001, "loss": 6.1634, "loss/crossentropy": 2.8666704297065735, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.18025494366884232, "step": 12660 }, { "epoch": 0.5755454545454546, "grad_norm": 4.71875, "grad_norm_var": 0.07379150390625, "learning_rate": 0.0001, "loss": 5.3186, "loss/crossentropy": 2.323385000228882, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.15362155064940453, "step": 12662 }, { "epoch": 0.5756363636363636, "grad_norm": 5.65625, "grad_norm_var": 0.10745035807291667, "learning_rate": 0.0001, "loss": 6.1105, "loss/crossentropy": 2.829956829547882, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17688684538006783, "step": 12664 }, { "epoch": 0.5757272727272728, "grad_norm": 4.9375, "grad_norm_var": 0.10797119140625, "learning_rate": 0.0001, "loss": 5.4256, "loss/crossentropy": 2.4120504558086395, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15428025275468826, "step": 12666 }, { "epoch": 0.5758181818181818, "grad_norm": 4.71875, "grad_norm_var": 0.09169514973958333, "learning_rate": 0.0001, "loss": 5.5017, "loss/crossentropy": 2.472950041294098, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1552225574851036, "step": 12668 }, { "epoch": 0.5759090909090909, "grad_norm": 4.96875, "grad_norm_var": 0.08534749348958333, "learning_rate": 0.0001, "loss": 5.677, "loss/crossentropy": 2.569889783859253, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16051161661744118, "step": 12670 }, { "epoch": 0.576, "grad_norm": 4.84375, "grad_norm_var": 0.08136393229166666, "learning_rate": 0.0001, "loss": 5.6104, "loss/crossentropy": 2.539670169353485, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.16117635369300842, "step": 12672 }, { "epoch": 0.5760909090909091, "grad_norm": 4.5, "grad_norm_var": 0.08297119140625, "learning_rate": 0.0001, "loss": 5.4347, "loss/crossentropy": 2.440650999546051, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15253276005387306, "step": 12674 }, { "epoch": 0.5761818181818181, "grad_norm": 4.875, "grad_norm_var": 0.08958333333333333, "learning_rate": 0.0001, "loss": 6.042, "loss/crossentropy": 2.781181275844574, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17842936143279076, "step": 12676 }, { "epoch": 0.5762727272727273, "grad_norm": 4.96875, "grad_norm_var": 0.38448893229166664, "learning_rate": 0.0001, "loss": 5.5725, "loss/crossentropy": 2.4382214546203613, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16245021298527718, "step": 12678 }, { "epoch": 0.5763636363636364, "grad_norm": 5.6875, "grad_norm_var": 0.40377197265625, "learning_rate": 0.0001, "loss": 5.4265, "loss/crossentropy": 2.327479064464569, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16263576224446297, "step": 12680 }, { "epoch": 0.5764545454545454, "grad_norm": 5.1875, "grad_norm_var": 0.40735270182291666, "learning_rate": 0.0001, "loss": 5.8449, "loss/crossentropy": 2.674961745738983, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16836199909448624, "step": 12682 }, { "epoch": 0.5765454545454546, "grad_norm": 4.3125, "grad_norm_var": 0.444775390625, "learning_rate": 0.0001, "loss": 5.1961, "loss/crossentropy": 2.2762894928455353, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.1468641310930252, "step": 12684 }, { "epoch": 0.5766363636363636, "grad_norm": 5.21875, "grad_norm_var": 0.4364217122395833, "learning_rate": 0.0001, "loss": 5.5659, "loss/crossentropy": 2.4693800806999207, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.15945670753717422, "step": 12686 }, { "epoch": 0.5767272727272728, "grad_norm": 4.90625, "grad_norm_var": 0.4281534830729167, "learning_rate": 0.0001, "loss": 5.5185, "loss/crossentropy": 2.4851436018943787, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15450286865234375, "step": 12688 }, { "epoch": 0.5768181818181818, "grad_norm": 5.28125, "grad_norm_var": 0.4003214518229167, "learning_rate": 0.0001, "loss": 5.7978, "loss/crossentropy": 2.647874414920807, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16596507281064987, "step": 12690 }, { "epoch": 0.5769090909090909, "grad_norm": 4.84375, "grad_norm_var": 0.39915364583333335, "learning_rate": 0.0001, "loss": 5.5564, "loss/crossentropy": 2.513614594936371, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.15876969322562218, "step": 12692 }, { "epoch": 0.577, "grad_norm": 4.65625, "grad_norm_var": 0.12864583333333332, "learning_rate": 0.0001, "loss": 5.6004, "loss/crossentropy": 2.52165549993515, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16080666705965996, "step": 12694 }, { "epoch": 0.5770909090909091, "grad_norm": 5.40625, "grad_norm_var": 0.09295247395833334, "learning_rate": 0.0001, "loss": 5.739, "loss/crossentropy": 2.5542714595794678, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16769636049866676, "step": 12696 }, { "epoch": 0.5771818181818181, "grad_norm": 5.65625, "grad_norm_var": 0.11588134765625, "learning_rate": 0.0001, "loss": 5.7101, "loss/crossentropy": 2.533351719379425, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.17021428048610687, "step": 12698 }, { "epoch": 0.5772727272727273, "grad_norm": 5.90625, "grad_norm_var": 0.12890625, "learning_rate": 0.0001, "loss": 5.7029, "loss/crossentropy": 2.525662422180176, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16635776683688164, "step": 12700 }, { "epoch": 0.5773636363636364, "grad_norm": 4.84375, "grad_norm_var": 0.14345296223958334, "learning_rate": 0.0001, "loss": 5.6347, "loss/crossentropy": 2.492663323879242, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16479456052184105, "step": 12702 }, { "epoch": 0.5774545454545454, "grad_norm": 4.8125, "grad_norm_var": 0.14256184895833332, "learning_rate": 0.0001, "loss": 5.6449, "loss/crossentropy": 2.596120595932007, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.15507284179329872, "step": 12704 }, { "epoch": 0.5775454545454546, "grad_norm": 5.28125, "grad_norm_var": 0.15037434895833332, "learning_rate": 0.0001, "loss": 5.9023, "loss/crossentropy": 2.630462944507599, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17718296125531197, "step": 12706 }, { "epoch": 0.5776363636363636, "grad_norm": 4.75, "grad_norm_var": 0.15403645833333332, "learning_rate": 0.0001, "loss": 5.6356, "loss/crossentropy": 2.53184574842453, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16018490493297577, "step": 12708 }, { "epoch": 0.5777272727272728, "grad_norm": 4.71875, "grad_norm_var": 0.19058837890625, "learning_rate": 0.0001, "loss": 5.5223, "loss/crossentropy": 2.4566612243652344, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.15480761602520943, "step": 12710 }, { "epoch": 0.5778181818181818, "grad_norm": 5.1875, "grad_norm_var": 0.19127197265625, "learning_rate": 0.0001, "loss": 6.041, "loss/crossentropy": 2.7836720943450928, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17768723517656326, "step": 12712 }, { "epoch": 0.5779090909090909, "grad_norm": 4.5, "grad_norm_var": 0.18430989583333332, "learning_rate": 0.0001, "loss": 5.6506, "loss/crossentropy": 2.5826547145843506, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15992371737957, "step": 12714 }, { "epoch": 0.578, "grad_norm": 4.8125, "grad_norm_var": 0.125634765625, "learning_rate": 0.0001, "loss": 5.6727, "loss/crossentropy": 2.532552123069763, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16576913371682167, "step": 12716 }, { "epoch": 0.5780909090909091, "grad_norm": 5.21875, "grad_norm_var": 0.11451416015625, "learning_rate": 0.0001, "loss": 5.5038, "loss/crossentropy": 2.4517017602920532, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15442543476819992, "step": 12718 }, { "epoch": 0.5781818181818181, "grad_norm": 5.40625, "grad_norm_var": 0.13166910807291668, "learning_rate": 0.0001, "loss": 5.5865, "loss/crossentropy": 2.5025119185447693, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1611309014260769, "step": 12720 }, { "epoch": 0.5782727272727273, "grad_norm": 4.5, "grad_norm_var": 0.08365478515625, "learning_rate": 0.0001, "loss": 5.7077, "loss/crossentropy": 2.577452778816223, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16615383327007294, "step": 12722 }, { "epoch": 0.5783636363636364, "grad_norm": 4.75, "grad_norm_var": 0.08899332682291666, "learning_rate": 0.0001, "loss": 5.3563, "loss/crossentropy": 2.4050285816192627, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.14884283021092415, "step": 12724 }, { "epoch": 0.5784545454545454, "grad_norm": 5.40625, "grad_norm_var": 0.09646809895833333, "learning_rate": 0.0001, "loss": 5.9824, "loss/crossentropy": 2.8304759860038757, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16519664600491524, "step": 12726 }, { "epoch": 0.5785454545454546, "grad_norm": 5.15625, "grad_norm_var": 0.169921875, "learning_rate": 0.0001, "loss": 5.3534, "loss/crossentropy": 2.339946150779724, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15213003009557724, "step": 12728 }, { "epoch": 0.5786363636363636, "grad_norm": 4.65625, "grad_norm_var": 0.16652018229166668, "learning_rate": 0.0001, "loss": 5.1364, "loss/crossentropy": 2.278413414955139, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1416570469737053, "step": 12730 }, { "epoch": 0.5787272727272728, "grad_norm": 5.46875, "grad_norm_var": 0.18609619140625, "learning_rate": 0.0001, "loss": 5.7067, "loss/crossentropy": 2.5141561031341553, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16945136338472366, "step": 12732 }, { "epoch": 0.5788181818181818, "grad_norm": 4.78125, "grad_norm_var": 0.17356770833333332, "learning_rate": 0.0001, "loss": 5.697, "loss/crossentropy": 2.6027334928512573, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.1623549871146679, "step": 12734 }, { "epoch": 0.5789090909090909, "grad_norm": 5.46875, "grad_norm_var": 0.17587483723958333, "learning_rate": 0.0001, "loss": 5.229, "loss/crossentropy": 2.198225736618042, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1528797224164009, "step": 12736 }, { "epoch": 0.579, "grad_norm": 4.5625, "grad_norm_var": 0.177587890625, "learning_rate": 0.0001, "loss": 5.1024, "loss/crossentropy": 2.1944727897644043, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.14196382462978363, "step": 12738 }, { "epoch": 0.5790909090909091, "grad_norm": 5.75, "grad_norm_var": 0.17745768229166667, "learning_rate": 0.0001, "loss": 5.5305, "loss/crossentropy": 2.4240750074386597, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1624011993408203, "step": 12740 }, { "epoch": 0.5791818181818181, "grad_norm": 4.4375, "grad_norm_var": 0.19820556640625, "learning_rate": 0.0001, "loss": 5.2863, "loss/crossentropy": 2.302164673805237, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.14977984875440598, "step": 12742 }, { "epoch": 0.5792727272727273, "grad_norm": 4.96875, "grad_norm_var": 0.140087890625, "learning_rate": 0.0001, "loss": 5.6022, "loss/crossentropy": 2.5024898052215576, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15997319296002388, "step": 12744 }, { "epoch": 0.5793636363636364, "grad_norm": 4.8125, "grad_norm_var": 0.126806640625, "learning_rate": 0.0001, "loss": 5.4792, "loss/crossentropy": 2.3831779956817627, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15959947183728218, "step": 12746 }, { "epoch": 0.5794545454545454, "grad_norm": 5.40625, "grad_norm_var": 0.157275390625, "learning_rate": 0.0001, "loss": 5.7527, "loss/crossentropy": 2.5795753598213196, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16614112630486488, "step": 12748 }, { "epoch": 0.5795454545454546, "grad_norm": 5.59375, "grad_norm_var": 0.20104166666666667, "learning_rate": 0.0001, "loss": 5.5021, "loss/crossentropy": 2.3807377219200134, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16193775460124016, "step": 12750 }, { "epoch": 0.5796363636363636, "grad_norm": 5.03125, "grad_norm_var": 0.18264567057291667, "learning_rate": 0.0001, "loss": 5.2887, "loss/crossentropy": 2.3353514671325684, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.14826583862304688, "step": 12752 }, { "epoch": 0.5797272727272728, "grad_norm": 4.75, "grad_norm_var": 0.22258707682291667, "learning_rate": 0.0001, "loss": 5.2008, "loss/crossentropy": 2.2604178190231323, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.14716190472245216, "step": 12754 }, { "epoch": 0.5798181818181818, "grad_norm": 5.15625, "grad_norm_var": 0.19501546223958333, "learning_rate": 0.0001, "loss": 5.707, "loss/crossentropy": 2.5187288522720337, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17038549110293388, "step": 12756 }, { "epoch": 0.5799090909090909, "grad_norm": 4.84375, "grad_norm_var": 0.17086181640625, "learning_rate": 0.0001, "loss": 5.3947, "loss/crossentropy": 2.368385076522827, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15360470488667488, "step": 12758 }, { "epoch": 0.58, "grad_norm": 4.5, "grad_norm_var": 0.2080078125, "learning_rate": 0.0001, "loss": 5.7126, "loss/crossentropy": 2.519753724336624, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16752708330750465, "step": 12760 }, { "epoch": 0.5800909090909091, "grad_norm": 5.09375, "grad_norm_var": 0.20113525390625, "learning_rate": 0.0001, "loss": 5.8704, "loss/crossentropy": 2.668766438961029, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16918673738837242, "step": 12762 }, { "epoch": 0.5801818181818181, "grad_norm": 5.15625, "grad_norm_var": 0.18564046223958333, "learning_rate": 0.0001, "loss": 5.8097, "loss/crossentropy": 2.670236349105835, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16472547873854637, "step": 12764 }, { "epoch": 0.5802727272727273, "grad_norm": 4.96875, "grad_norm_var": 0.104296875, "learning_rate": 0.0001, "loss": 5.3604, "loss/crossentropy": 2.375097632408142, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15322131291031837, "step": 12766 }, { "epoch": 0.5803636363636364, "grad_norm": 6.5, "grad_norm_var": 0.254931640625, "learning_rate": 0.0001, "loss": 5.4722, "loss/crossentropy": 2.3366559743881226, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.15964803472161293, "step": 12768 }, { "epoch": 0.5804545454545454, "grad_norm": 4.53125, "grad_norm_var": 0.224462890625, "learning_rate": 0.0001, "loss": 5.5331, "loss/crossentropy": 2.514002352952957, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15562565810978413, "step": 12770 }, { "epoch": 0.5805454545454546, "grad_norm": 4.65625, "grad_norm_var": 0.24322509765625, "learning_rate": 0.0001, "loss": 5.1658, "loss/crossentropy": 2.120842009782791, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.15273740887641907, "step": 12772 }, { "epoch": 0.5806363636363636, "grad_norm": 5.03125, "grad_norm_var": 0.25149739583333336, "learning_rate": 0.0001, "loss": 5.6209, "loss/crossentropy": 2.553505539894104, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.158300518989563, "step": 12774 }, { "epoch": 0.5807272727272728, "grad_norm": 4.875, "grad_norm_var": 0.22398681640625, "learning_rate": 0.0001, "loss": 5.6359, "loss/crossentropy": 2.4999959468841553, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1626092866063118, "step": 12776 }, { "epoch": 0.5808181818181818, "grad_norm": 4.96875, "grad_norm_var": 0.22405192057291667, "learning_rate": 0.0001, "loss": 5.6106, "loss/crossentropy": 2.463769257068634, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.1633109748363495, "step": 12778 }, { "epoch": 0.5809090909090909, "grad_norm": 4.9375, "grad_norm_var": 0.20618082682291666, "learning_rate": 0.0001, "loss": 5.8422, "loss/crossentropy": 2.6502593755722046, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1697794534265995, "step": 12780 }, { "epoch": 0.581, "grad_norm": 4.5, "grad_norm_var": 0.23547770182291666, "learning_rate": 0.0001, "loss": 5.1313, "loss/crossentropy": 2.1847437918186188, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.14993081986904144, "step": 12782 }, { "epoch": 0.5810909090909091, "grad_norm": 4.90625, "grad_norm_var": 0.122900390625, "learning_rate": 0.0001, "loss": 5.2224, "loss/crossentropy": 2.3072132766246796, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.1471787728369236, "step": 12784 }, { "epoch": 0.5811818181818181, "grad_norm": 5.0, "grad_norm_var": 0.10740559895833333, "learning_rate": 0.0001, "loss": 5.7631, "loss/crossentropy": 2.6021578311920166, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16746427863836288, "step": 12786 }, { "epoch": 0.5812727272727273, "grad_norm": 5.59375, "grad_norm_var": 0.11599934895833333, "learning_rate": 0.0001, "loss": 5.532, "loss/crossentropy": 2.4015230536460876, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16246238723397255, "step": 12788 }, { "epoch": 0.5813636363636364, "grad_norm": 4.78125, "grad_norm_var": 0.12261962890625, "learning_rate": 0.0001, "loss": 5.7961, "loss/crossentropy": 2.545253038406372, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17488958686590195, "step": 12790 }, { "epoch": 0.5814545454545454, "grad_norm": 4.65625, "grad_norm_var": 0.11829020182291666, "learning_rate": 0.0001, "loss": 5.263, "loss/crossentropy": 2.2652284502983093, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15329403057694435, "step": 12792 }, { "epoch": 0.5815454545454546, "grad_norm": 4.96875, "grad_norm_var": 0.12121988932291666, "learning_rate": 0.0001, "loss": 5.2793, "loss/crossentropy": 2.2400320768356323, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15510188043117523, "step": 12794 }, { "epoch": 0.5816363636363636, "grad_norm": 4.78125, "grad_norm_var": 0.138671875, "learning_rate": 0.0001, "loss": 5.7087, "loss/crossentropy": 2.564660966396332, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16596544533967972, "step": 12796 }, { "epoch": 0.5817272727272728, "grad_norm": 4.5, "grad_norm_var": 0.123828125, "learning_rate": 0.0001, "loss": 5.6936, "loss/crossentropy": 2.5923526287078857, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1636379137635231, "step": 12798 }, { "epoch": 0.5818181818181818, "grad_norm": 4.90625, "grad_norm_var": 0.094775390625, "learning_rate": 0.0001, "loss": 5.7185, "loss/crossentropy": 2.564977705478668, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16691455990076065, "step": 12800 }, { "epoch": 0.5819090909090909, "grad_norm": 5.21875, "grad_norm_var": 0.13385416666666666, "learning_rate": 0.0001, "loss": 5.9462, "loss/crossentropy": 2.6563745737075806, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17507575452327728, "step": 12802 }, { "epoch": 0.582, "grad_norm": 4.625, "grad_norm_var": 0.130712890625, "learning_rate": 0.0001, "loss": 5.309, "loss/crossentropy": 2.3462361097335815, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.14783475920557976, "step": 12804 }, { "epoch": 0.5820909090909091, "grad_norm": 4.875, "grad_norm_var": 0.11715087890625, "learning_rate": 0.0001, "loss": 5.5208, "loss/crossentropy": 2.4949156641960144, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15414972975850105, "step": 12806 }, { "epoch": 0.5821818181818181, "grad_norm": 4.75, "grad_norm_var": 0.115087890625, "learning_rate": 0.0001, "loss": 5.6093, "loss/crossentropy": 2.517456293106079, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16230876743793488, "step": 12808 }, { "epoch": 0.5822727272727273, "grad_norm": 5.4375, "grad_norm_var": 0.13346354166666666, "learning_rate": 0.0001, "loss": 5.4964, "loss/crossentropy": 2.474238783121109, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15690241754055023, "step": 12810 }, { "epoch": 0.5823636363636364, "grad_norm": 5.5, "grad_norm_var": 0.13678385416666666, "learning_rate": 0.0001, "loss": 5.652, "loss/crossentropy": 2.5125503540039062, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16257357597351074, "step": 12812 }, { "epoch": 0.5824545454545454, "grad_norm": 4.53125, "grad_norm_var": 0.1361328125, "learning_rate": 0.0001, "loss": 5.4246, "loss/crossentropy": 2.4224600791931152, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15373342856764793, "step": 12814 }, { "epoch": 0.5825454545454546, "grad_norm": 4.90625, "grad_norm_var": 0.133447265625, "learning_rate": 0.0001, "loss": 5.83, "loss/crossentropy": 2.6455368399620056, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16844208166003227, "step": 12816 }, { "epoch": 0.5826363636363636, "grad_norm": 5.21875, "grad_norm_var": 0.09375, "learning_rate": 0.0001, "loss": 5.7911, "loss/crossentropy": 2.5952720046043396, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16978271305561066, "step": 12818 }, { "epoch": 0.5827272727272728, "grad_norm": 5.21875, "grad_norm_var": 0.08995768229166666, "learning_rate": 0.0001, "loss": 6.0781, "loss/crossentropy": 2.81308776140213, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.17669494077563286, "step": 12820 }, { "epoch": 0.5828181818181818, "grad_norm": 5.40625, "grad_norm_var": 0.12776285807291668, "learning_rate": 0.0001, "loss": 5.093, "loss/crossentropy": 2.2032639384269714, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.14464083686470985, "step": 12822 }, { "epoch": 0.5829090909090909, "grad_norm": 4.84375, "grad_norm_var": 0.13671875, "learning_rate": 0.0001, "loss": 5.5655, "loss/crossentropy": 2.451945513486862, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16272181272506714, "step": 12824 }, { "epoch": 0.583, "grad_norm": 4.8125, "grad_norm_var": 0.12154947916666667, "learning_rate": 0.0001, "loss": 5.8528, "loss/crossentropy": 2.7220672965049744, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1669800505042076, "step": 12826 }, { "epoch": 0.5830909090909091, "grad_norm": 5.15625, "grad_norm_var": 0.110009765625, "learning_rate": 0.0001, "loss": 5.3135, "loss/crossentropy": 2.3314926624298096, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.14781177788972855, "step": 12828 }, { "epoch": 0.5831818181818181, "grad_norm": 4.875, "grad_norm_var": 0.09553629557291667, "learning_rate": 0.0001, "loss": 5.4622, "loss/crossentropy": 2.4264811277389526, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15493548288941383, "step": 12830 }, { "epoch": 0.5832727272727273, "grad_norm": 5.0625, "grad_norm_var": 0.09791259765625, "learning_rate": 0.0001, "loss": 5.698, "loss/crossentropy": 2.6023080348968506, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16464589536190033, "step": 12832 }, { "epoch": 0.5833636363636364, "grad_norm": 4.71875, "grad_norm_var": 0.10510660807291666, "learning_rate": 0.0001, "loss": 5.1564, "loss/crossentropy": 2.2188839614391327, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1484438069164753, "step": 12834 }, { "epoch": 0.5834545454545454, "grad_norm": 5.125, "grad_norm_var": 0.09732666015625, "learning_rate": 0.0001, "loss": 5.7692, "loss/crossentropy": 2.683552861213684, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1607140712440014, "step": 12836 }, { "epoch": 0.5835454545454546, "grad_norm": 4.71875, "grad_norm_var": 0.06106770833333333, "learning_rate": 0.0001, "loss": 5.5879, "loss/crossentropy": 2.532575309276581, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15944061800837517, "step": 12838 }, { "epoch": 0.5836363636363636, "grad_norm": 5.5625, "grad_norm_var": 0.12511393229166667, "learning_rate": 0.0001, "loss": 5.7591, "loss/crossentropy": 2.568789482116699, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.17196469753980637, "step": 12840 }, { "epoch": 0.5837272727272728, "grad_norm": 6.34375, "grad_norm_var": 0.753125, "learning_rate": 0.0001, "loss": 5.7211, "loss/crossentropy": 2.484728157520294, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.169145867228508, "step": 12842 }, { "epoch": 0.5838181818181818, "grad_norm": 4.78125, "grad_norm_var": 0.7405232747395833, "learning_rate": 0.0001, "loss": 5.1996, "loss/crossentropy": 2.1886436343193054, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1516808494925499, "step": 12844 }, { "epoch": 0.5839090909090909, "grad_norm": 4.90625, "grad_norm_var": 0.76090087890625, "learning_rate": 0.0001, "loss": 5.3781, "loss/crossentropy": 2.374707520008087, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1546379141509533, "step": 12846 }, { "epoch": 0.584, "grad_norm": 6.09375, "grad_norm_var": 0.777978515625, "learning_rate": 0.0001, "loss": 5.7639, "loss/crossentropy": 2.577345848083496, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1682620644569397, "step": 12848 }, { "epoch": 0.5840909090909091, "grad_norm": 5.0, "grad_norm_var": 0.74351806640625, "learning_rate": 0.0001, "loss": 5.7102, "loss/crossentropy": 2.5577890872955322, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16426847875118256, "step": 12850 }, { "epoch": 0.5841818181818181, "grad_norm": 5.1875, "grad_norm_var": 0.73248291015625, "learning_rate": 0.0001, "loss": 5.6066, "loss/crossentropy": 2.43359911441803, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16475963965058327, "step": 12852 }, { "epoch": 0.5842727272727273, "grad_norm": 5.1875, "grad_norm_var": 0.6786092122395834, "learning_rate": 0.0001, "loss": 5.4439, "loss/crossentropy": 2.4108656644821167, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.15350103750824928, "step": 12854 }, { "epoch": 0.5843636363636364, "grad_norm": 5.40625, "grad_norm_var": 0.7274698893229167, "learning_rate": 0.0001, "loss": 5.7858, "loss/crossentropy": 2.5951963663101196, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1700357086956501, "step": 12856 }, { "epoch": 0.5844545454545454, "grad_norm": 5.375, "grad_norm_var": 0.30273030598958334, "learning_rate": 0.0001, "loss": 5.4131, "loss/crossentropy": 2.220782458782196, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.16297921165823936, "step": 12858 }, { "epoch": 0.5845454545454546, "grad_norm": 5.1875, "grad_norm_var": 0.3112589518229167, "learning_rate": 0.0001, "loss": 5.536, "loss/crossentropy": 2.471416622400284, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15743575990200043, "step": 12860 }, { "epoch": 0.5846363636363636, "grad_norm": 4.84375, "grad_norm_var": 0.30549723307291665, "learning_rate": 0.0001, "loss": 5.7373, "loss/crossentropy": 2.5496240854263306, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16739784181118011, "step": 12862 }, { "epoch": 0.5847272727272728, "grad_norm": 4.75, "grad_norm_var": 0.27734375, "learning_rate": 0.0001, "loss": 5.2632, "loss/crossentropy": 2.2917995154857635, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1492869295179844, "step": 12864 }, { "epoch": 0.5848181818181818, "grad_norm": 4.6875, "grad_norm_var": 0.22825113932291666, "learning_rate": 0.0001, "loss": 5.2339, "loss/crossentropy": 2.253520429134369, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1486198902130127, "step": 12866 }, { "epoch": 0.5849090909090909, "grad_norm": 4.96875, "grad_norm_var": 0.20930989583333334, "learning_rate": 0.0001, "loss": 5.846, "loss/crossentropy": 2.6876875162124634, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1664128303527832, "step": 12868 }, { "epoch": 0.585, "grad_norm": 4.4375, "grad_norm_var": 0.23411458333333332, "learning_rate": 0.0001, "loss": 5.41, "loss/crossentropy": 2.341092586517334, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1590374819934368, "step": 12870 }, { "epoch": 0.5850909090909091, "grad_norm": 4.84375, "grad_norm_var": 0.22209879557291667, "learning_rate": 0.0001, "loss": 5.1966, "loss/crossentropy": 2.239412158727646, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.14903904125094414, "step": 12872 }, { "epoch": 0.5851818181818181, "grad_norm": 4.71875, "grad_norm_var": 0.07693684895833333, "learning_rate": 0.0001, "loss": 5.3881, "loss/crossentropy": 2.3582454323768616, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15494176000356674, "step": 12874 }, { "epoch": 0.5852727272727273, "grad_norm": 5.4375, "grad_norm_var": 0.08605143229166666, "learning_rate": 0.0001, "loss": 5.7568, "loss/crossentropy": 2.597515106201172, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16748683527112007, "step": 12876 }, { "epoch": 0.5853636363636363, "grad_norm": 5.125, "grad_norm_var": 0.09191080729166666, "learning_rate": 0.0001, "loss": 5.6126, "loss/crossentropy": 2.444057583808899, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16841308400034904, "step": 12878 }, { "epoch": 0.5854545454545454, "grad_norm": 4.75, "grad_norm_var": 0.09202067057291667, "learning_rate": 0.0001, "loss": 5.5784, "loss/crossentropy": 2.4943723678588867, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1595764011144638, "step": 12880 }, { "epoch": 0.5855454545454546, "grad_norm": 5.03125, "grad_norm_var": 0.06458333333333334, "learning_rate": 0.0001, "loss": 5.7639, "loss/crossentropy": 2.636373817920685, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.164903923869133, "step": 12882 }, { "epoch": 0.5856363636363636, "grad_norm": 5.09375, "grad_norm_var": 0.0603515625, "learning_rate": 0.0001, "loss": 5.5909, "loss/crossentropy": 2.4636634588241577, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1623367890715599, "step": 12884 }, { "epoch": 0.5857272727272728, "grad_norm": 4.6875, "grad_norm_var": 0.05038655598958333, "learning_rate": 0.0001, "loss": 5.1666, "loss/crossentropy": 2.238316059112549, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.14419101923704147, "step": 12886 }, { "epoch": 0.5858181818181818, "grad_norm": 5.40625, "grad_norm_var": 0.057356770833333334, "learning_rate": 0.0001, "loss": 5.6254, "loss/crossentropy": 2.523546040058136, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16018548980355263, "step": 12888 }, { "epoch": 0.5859090909090909, "grad_norm": 4.8125, "grad_norm_var": 0.07141927083333334, "learning_rate": 0.0001, "loss": 5.3267, "loss/crossentropy": 2.356657862663269, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.14954613521695137, "step": 12890 }, { "epoch": 0.586, "grad_norm": 4.75, "grad_norm_var": 0.06638997395833333, "learning_rate": 0.0001, "loss": 5.0957, "loss/crossentropy": 2.152902841567993, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.14565159380435944, "step": 12892 }, { "epoch": 0.5860909090909091, "grad_norm": 4.96875, "grad_norm_var": 0.05627848307291667, "learning_rate": 0.0001, "loss": 5.2247, "loss/crossentropy": 2.2518163919448853, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.14983205124735832, "step": 12894 }, { "epoch": 0.5861818181818181, "grad_norm": 5.15625, "grad_norm_var": 0.0671875, "learning_rate": 0.0001, "loss": 5.3792, "loss/crossentropy": 2.3365888595581055, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1575806699693203, "step": 12896 }, { "epoch": 0.5862727272727273, "grad_norm": 4.71875, "grad_norm_var": 0.06809488932291667, "learning_rate": 0.0001, "loss": 5.5125, "loss/crossentropy": 2.4503390192985535, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1568012572824955, "step": 12898 }, { "epoch": 0.5863636363636363, "grad_norm": 5.09375, "grad_norm_var": 0.06725260416666666, "learning_rate": 0.0001, "loss": 5.9378, "loss/crossentropy": 2.7365885376930237, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16992752999067307, "step": 12900 }, { "epoch": 0.5864545454545455, "grad_norm": 4.875, "grad_norm_var": 0.05579427083333333, "learning_rate": 0.0001, "loss": 5.2154, "loss/crossentropy": 2.2256321609020233, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.14917635917663574, "step": 12902 }, { "epoch": 0.5865454545454546, "grad_norm": 5.28125, "grad_norm_var": 0.05347900390625, "learning_rate": 0.0001, "loss": 5.195, "loss/crossentropy": 2.2594750225543976, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1445311326533556, "step": 12904 }, { "epoch": 0.5866363636363636, "grad_norm": 5.28125, "grad_norm_var": 0.073681640625, "learning_rate": 0.0001, "loss": 5.6983, "loss/crossentropy": 2.5408636927604675, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16652637720108032, "step": 12906 }, { "epoch": 0.5867272727272728, "grad_norm": 5.34375, "grad_norm_var": 0.082275390625, "learning_rate": 0.0001, "loss": 5.7946, "loss/crossentropy": 2.6222561597824097, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1668473333120346, "step": 12908 }, { "epoch": 0.5868181818181818, "grad_norm": 5.625, "grad_norm_var": 7.794986979166667, "learning_rate": 0.0001, "loss": 5.7832, "loss/crossentropy": 2.4721696376800537, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17915406823158264, "step": 12910 }, { "epoch": 0.5869090909090909, "grad_norm": 5.09375, "grad_norm_var": 7.7564453125, "learning_rate": 0.0001, "loss": 5.3187, "loss/crossentropy": 2.2676642537117004, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15607670322060585, "step": 12912 }, { "epoch": 0.587, "grad_norm": 5.3125, "grad_norm_var": 7.760347493489584, "learning_rate": 0.0001, "loss": 5.1942, "loss/crossentropy": 2.1863989531993866, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1490219682455063, "step": 12914 }, { "epoch": 0.5870909090909091, "grad_norm": 4.75, "grad_norm_var": 7.820703125, "learning_rate": 0.0001, "loss": 5.3903, "loss/crossentropy": 2.4013232588768005, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15260930359363556, "step": 12916 }, { "epoch": 0.5871818181818181, "grad_norm": 5.5625, "grad_norm_var": 7.702587890625, "learning_rate": 0.0001, "loss": 5.998, "loss/crossentropy": 2.686691462993622, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17898161336779594, "step": 12918 }, { "epoch": 0.5872727272727273, "grad_norm": 4.9375, "grad_norm_var": 7.628645833333334, "learning_rate": 0.0001, "loss": 5.8038, "loss/crossentropy": 2.5977508425712585, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.17002246901392937, "step": 12920 }, { "epoch": 0.5873636363636363, "grad_norm": 4.875, "grad_norm_var": 7.724088541666666, "learning_rate": 0.0001, "loss": 5.2169, "loss/crossentropy": 2.2297788560390472, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15027765184640884, "step": 12922 }, { "epoch": 0.5874545454545455, "grad_norm": 4.875, "grad_norm_var": 7.748111979166667, "learning_rate": 0.0001, "loss": 5.7972, "loss/crossentropy": 2.6412994861602783, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.167345829308033, "step": 12924 }, { "epoch": 0.5875454545454546, "grad_norm": 5.96875, "grad_norm_var": 0.17486572265625, "learning_rate": 0.0001, "loss": 5.3266, "loss/crossentropy": 2.321482688188553, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15129713341593742, "step": 12926 }, { "epoch": 0.5876363636363636, "grad_norm": 4.8125, "grad_norm_var": 0.177197265625, "learning_rate": 0.0001, "loss": 5.6723, "loss/crossentropy": 2.585103690624237, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1569651197642088, "step": 12928 }, { "epoch": 0.5877272727272728, "grad_norm": 5.15625, "grad_norm_var": 0.16483968098958332, "learning_rate": 0.0001, "loss": 5.1446, "loss/crossentropy": 2.1569724678993225, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1509106457233429, "step": 12930 }, { "epoch": 0.5878181818181818, "grad_norm": 4.96875, "grad_norm_var": 0.169775390625, "learning_rate": 0.0001, "loss": 5.5159, "loss/crossentropy": 2.451673150062561, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15935410931706429, "step": 12932 }, { "epoch": 0.5879090909090909, "grad_norm": 4.5625, "grad_norm_var": 0.18450520833333334, "learning_rate": 0.0001, "loss": 5.1398, "loss/crossentropy": 2.230955958366394, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.1449855100363493, "step": 12934 }, { "epoch": 0.588, "grad_norm": 5.90625, "grad_norm_var": 0.2236328125, "learning_rate": 0.0001, "loss": 5.6505, "loss/crossentropy": 2.636913299560547, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15760723128914833, "step": 12936 }, { "epoch": 0.5880909090909091, "grad_norm": 5.84375, "grad_norm_var": 0.26829427083333335, "learning_rate": 0.0001, "loss": 5.7857, "loss/crossentropy": 2.5806131958961487, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.17070797458291054, "step": 12938 }, { "epoch": 0.5881818181818181, "grad_norm": 5.0625, "grad_norm_var": 0.25221354166666665, "learning_rate": 0.0001, "loss": 5.3842, "loss/crossentropy": 2.348509758710861, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15454557910561562, "step": 12940 }, { "epoch": 0.5882727272727273, "grad_norm": 4.875, "grad_norm_var": 0.18023681640625, "learning_rate": 0.0001, "loss": 5.6493, "loss/crossentropy": 2.536854088306427, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16280335560441017, "step": 12942 }, { "epoch": 0.5883636363636363, "grad_norm": 5.03125, "grad_norm_var": 0.19003499348958333, "learning_rate": 0.0001, "loss": 5.9419, "loss/crossentropy": 2.707606256008148, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.17362496256828308, "step": 12944 }, { "epoch": 0.5884545454545455, "grad_norm": 5.0, "grad_norm_var": 0.18255208333333334, "learning_rate": 0.0001, "loss": 5.2861, "loss/crossentropy": 2.302984893321991, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15026502311229706, "step": 12946 }, { "epoch": 0.5885454545454546, "grad_norm": 4.59375, "grad_norm_var": 0.17942708333333332, "learning_rate": 0.0001, "loss": 5.36, "loss/crossentropy": 2.401233732700348, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.15076059103012085, "step": 12948 }, { "epoch": 0.5886363636363636, "grad_norm": 4.65625, "grad_norm_var": 0.15455322265625, "learning_rate": 0.0001, "loss": 5.5478, "loss/crossentropy": 2.516247272491455, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1558912917971611, "step": 12950 }, { "epoch": 0.5887272727272728, "grad_norm": 5.1875, "grad_norm_var": 0.09687093098958334, "learning_rate": 0.0001, "loss": 5.8388, "loss/crossentropy": 2.5634588599205017, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17538171634078026, "step": 12952 }, { "epoch": 0.5888181818181818, "grad_norm": 5.0, "grad_norm_var": 0.044905598958333334, "learning_rate": 0.0001, "loss": 5.6754, "loss/crossentropy": 2.6229324340820312, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.1589583456516266, "step": 12954 }, { "epoch": 0.5889090909090909, "grad_norm": 4.9375, "grad_norm_var": 0.08938802083333333, "learning_rate": 0.0001, "loss": 5.378, "loss/crossentropy": 2.3320858478546143, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15615279600024223, "step": 12956 }, { "epoch": 0.589, "grad_norm": 4.90625, "grad_norm_var": 0.08896077473958333, "learning_rate": 0.0001, "loss": 5.5867, "loss/crossentropy": 2.5116265416145325, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16082816198468208, "step": 12958 }, { "epoch": 0.5890909090909091, "grad_norm": 5.34375, "grad_norm_var": 0.08420817057291667, "learning_rate": 0.0001, "loss": 6.1487, "loss/crossentropy": 2.8384019136428833, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17908036336302757, "step": 12960 }, { "epoch": 0.5891818181818181, "grad_norm": 4.875, "grad_norm_var": 0.092041015625, "learning_rate": 0.0001, "loss": 5.5761, "loss/crossentropy": 2.46398001909256, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16198864579200745, "step": 12962 }, { "epoch": 0.5892727272727273, "grad_norm": 4.21875, "grad_norm_var": 0.12316080729166666, "learning_rate": 0.0001, "loss": 5.4144, "loss/crossentropy": 2.443458139896393, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.15237142145633698, "step": 12964 }, { "epoch": 0.5893636363636363, "grad_norm": 5.21875, "grad_norm_var": 0.12034098307291667, "learning_rate": 0.0001, "loss": 5.6919, "loss/crossentropy": 2.533086597919464, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16764162480831146, "step": 12966 }, { "epoch": 0.5894545454545455, "grad_norm": 4.90625, "grad_norm_var": 0.11978759765625, "learning_rate": 0.0001, "loss": 5.5268, "loss/crossentropy": 2.4573318362236023, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1567513607442379, "step": 12968 }, { "epoch": 0.5895454545454546, "grad_norm": 4.9375, "grad_norm_var": 0.11832275390625, "learning_rate": 0.0001, "loss": 5.2337, "loss/crossentropy": 2.2466468513011932, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.14909444004297256, "step": 12970 }, { "epoch": 0.5896363636363636, "grad_norm": 4.5, "grad_norm_var": 0.09940999348958333, "learning_rate": 0.0001, "loss": 5.783, "loss/crossentropy": 2.6419954895973206, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16644896194338799, "step": 12972 }, { "epoch": 0.5897272727272728, "grad_norm": 4.78125, "grad_norm_var": 0.12941080729166668, "learning_rate": 0.0001, "loss": 5.8821, "loss/crossentropy": 2.662723183631897, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17115986347198486, "step": 12974 }, { "epoch": 0.5898181818181818, "grad_norm": 4.875, "grad_norm_var": 0.145703125, "learning_rate": 0.0001, "loss": 6.0848, "loss/crossentropy": 2.77221542596817, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.17949574068188667, "step": 12976 }, { "epoch": 0.5899090909090909, "grad_norm": 4.40625, "grad_norm_var": 0.16343994140625, "learning_rate": 0.0001, "loss": 5.3785, "loss/crossentropy": 2.4222320914268494, "loss/hidden": 1.427734375, "loss/jsd": 0.0, "loss/logits": 0.15285135805606842, "step": 12978 }, { "epoch": 0.59, "grad_norm": 5.09375, "grad_norm_var": 0.14016927083333333, "learning_rate": 0.0001, "loss": 5.6764, "loss/crossentropy": 2.6099212765693665, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1597764901816845, "step": 12980 }, { "epoch": 0.5900909090909091, "grad_norm": 5.0625, "grad_norm_var": 0.15045572916666666, "learning_rate": 0.0001, "loss": 5.4053, "loss/crossentropy": 2.3825854659080505, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15383205190300941, "step": 12982 }, { "epoch": 0.5901818181818181, "grad_norm": 4.75, "grad_norm_var": 0.16990559895833332, "learning_rate": 0.0001, "loss": 5.6583, "loss/crossentropy": 2.512786090373993, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16396550461649895, "step": 12984 }, { "epoch": 0.5902727272727273, "grad_norm": 4.375, "grad_norm_var": 0.19442952473958333, "learning_rate": 0.0001, "loss": 5.3071, "loss/crossentropy": 2.3423084914684296, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1482418216764927, "step": 12986 }, { "epoch": 0.5903636363636363, "grad_norm": 4.65625, "grad_norm_var": 0.17291259765625, "learning_rate": 0.0001, "loss": 5.4705, "loss/crossentropy": 2.418650269508362, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15948064625263214, "step": 12988 }, { "epoch": 0.5904545454545455, "grad_norm": 4.59375, "grad_norm_var": 0.13463134765625, "learning_rate": 0.0001, "loss": 5.6513, "loss/crossentropy": 2.551933765411377, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16286912560462952, "step": 12990 }, { "epoch": 0.5905454545454546, "grad_norm": 6.03125, "grad_norm_var": 0.19334309895833332, "learning_rate": 0.0001, "loss": 5.8914, "loss/crossentropy": 2.6064648032188416, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.17790738493204117, "step": 12992 }, { "epoch": 0.5906363636363636, "grad_norm": 5.21875, "grad_norm_var": 0.18020833333333333, "learning_rate": 0.0001, "loss": 5.7567, "loss/crossentropy": 2.5502389669418335, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17045365273952484, "step": 12994 }, { "epoch": 0.5907272727272728, "grad_norm": 4.8125, "grad_norm_var": 0.17148030598958333, "learning_rate": 0.0001, "loss": 5.771, "loss/crossentropy": 2.6138010025024414, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16649818420410156, "step": 12996 }, { "epoch": 0.5908181818181818, "grad_norm": 4.6875, "grad_norm_var": 0.16165364583333333, "learning_rate": 0.0001, "loss": 5.3363, "loss/crossentropy": 2.294845163822174, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15512369945645332, "step": 12998 }, { "epoch": 0.5909090909090909, "grad_norm": 4.5, "grad_norm_var": 0.16685791015625, "learning_rate": 0.0001, "loss": 5.1489, "loss/crossentropy": 2.201725959777832, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1462802067399025, "step": 13000 }, { "epoch": 0.591, "grad_norm": 4.71875, "grad_norm_var": 0.15240885416666666, "learning_rate": 0.0001, "loss": 5.9168, "loss/crossentropy": 2.7227046489715576, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16960430890321732, "step": 13002 }, { "epoch": 0.5910909090909091, "grad_norm": 4.71875, "grad_norm_var": 0.15128580729166666, "learning_rate": 0.0001, "loss": 5.8655, "loss/crossentropy": 2.7283406853675842, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16840184852480888, "step": 13004 }, { "epoch": 0.5911818181818181, "grad_norm": 4.875, "grad_norm_var": 0.14289957682291668, "learning_rate": 0.0001, "loss": 5.5639, "loss/crossentropy": 2.444461405277252, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.1578436903655529, "step": 13006 }, { "epoch": 0.5912727272727273, "grad_norm": 5.625, "grad_norm_var": 0.09505208333333333, "learning_rate": 0.0001, "loss": 5.8111, "loss/crossentropy": 2.61089825630188, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1692386381328106, "step": 13008 }, { "epoch": 0.5913636363636363, "grad_norm": 5.0, "grad_norm_var": 0.08709309895833334, "learning_rate": 0.0001, "loss": 5.9902, "loss/crossentropy": 2.850160777568817, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16869624331593513, "step": 13010 }, { "epoch": 0.5914545454545455, "grad_norm": 6.125, "grad_norm_var": 0.17704671223958332, "learning_rate": 0.0001, "loss": 5.3577, "loss/crossentropy": 2.2749541997909546, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15983812138438225, "step": 13012 }, { "epoch": 0.5915454545454546, "grad_norm": 5.0625, "grad_norm_var": 0.17024739583333334, "learning_rate": 0.0001, "loss": 5.6387, "loss/crossentropy": 2.4969412088394165, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16554687917232513, "step": 13014 }, { "epoch": 0.5916363636363636, "grad_norm": 5.09375, "grad_norm_var": 0.14759114583333333, "learning_rate": 0.0001, "loss": 5.8359, "loss/crossentropy": 2.6033886671066284, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.17227354645729065, "step": 13016 }, { "epoch": 0.5917272727272728, "grad_norm": 5.375, "grad_norm_var": 0.14761962890625, "learning_rate": 0.0001, "loss": 5.6355, "loss/crossentropy": 2.4964978396892548, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16468515247106552, "step": 13018 }, { "epoch": 0.5918181818181818, "grad_norm": 4.53125, "grad_norm_var": 0.16731770833333334, "learning_rate": 0.0001, "loss": 5.2514, "loss/crossentropy": 2.311902701854706, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1472725346684456, "step": 13020 }, { "epoch": 0.5919090909090909, "grad_norm": 4.59375, "grad_norm_var": 0.18531494140625, "learning_rate": 0.0001, "loss": 5.3714, "loss/crossentropy": 2.404983103275299, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.15230777114629745, "step": 13022 }, { "epoch": 0.592, "grad_norm": 4.71875, "grad_norm_var": 0.180322265625, "learning_rate": 0.0001, "loss": 5.2777, "loss/crossentropy": 2.286839246749878, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1502585969865322, "step": 13024 }, { "epoch": 0.5920909090909091, "grad_norm": 4.59375, "grad_norm_var": 0.18593343098958334, "learning_rate": 0.0001, "loss": 5.4828, "loss/crossentropy": 2.4141191840171814, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15979482978582382, "step": 13026 }, { "epoch": 0.5921818181818181, "grad_norm": 4.875, "grad_norm_var": 0.09217122395833334, "learning_rate": 0.0001, "loss": 5.562, "loss/crossentropy": 2.5364628732204437, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15646293014287949, "step": 13028 }, { "epoch": 0.5922727272727273, "grad_norm": 4.75, "grad_norm_var": 0.12763264973958333, "learning_rate": 0.0001, "loss": 6.0313, "loss/crossentropy": 2.787810981273651, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.17571473494172096, "step": 13030 }, { "epoch": 0.5923636363636363, "grad_norm": 5.03125, "grad_norm_var": 0.1177734375, "learning_rate": 0.0001, "loss": 5.7671, "loss/crossentropy": 2.6253777146339417, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.1678861267864704, "step": 13032 }, { "epoch": 0.5924545454545455, "grad_norm": 4.53125, "grad_norm_var": 0.10728759765625, "learning_rate": 0.0001, "loss": 5.6048, "loss/crossentropy": 2.5480566024780273, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1556701622903347, "step": 13034 }, { "epoch": 0.5925454545454546, "grad_norm": 4.4375, "grad_norm_var": 0.08469645182291667, "learning_rate": 0.0001, "loss": 5.3919, "loss/crossentropy": 2.3856500387191772, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1527719758450985, "step": 13036 }, { "epoch": 0.5926363636363636, "grad_norm": 4.9375, "grad_norm_var": 0.087890625, "learning_rate": 0.0001, "loss": 5.6376, "loss/crossentropy": 2.5390196442604065, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1610298827290535, "step": 13038 }, { "epoch": 0.5927272727272728, "grad_norm": 5.125, "grad_norm_var": 0.08892822265625, "learning_rate": 0.0001, "loss": 5.2872, "loss/crossentropy": 2.3063166439533234, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.14906353503465652, "step": 13040 }, { "epoch": 0.5928181818181818, "grad_norm": 5.0625, "grad_norm_var": 0.08515625, "learning_rate": 0.0001, "loss": 5.6094, "loss/crossentropy": 2.4892436861991882, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16162479668855667, "step": 13042 }, { "epoch": 0.5929090909090909, "grad_norm": 5.53125, "grad_norm_var": 0.11197509765625, "learning_rate": 0.0001, "loss": 5.8996, "loss/crossentropy": 2.6827020049095154, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.17266525700688362, "step": 13044 }, { "epoch": 0.593, "grad_norm": 4.71875, "grad_norm_var": 0.074609375, "learning_rate": 0.0001, "loss": 5.5866, "loss/crossentropy": 2.497205078601837, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16128095611929893, "step": 13046 }, { "epoch": 0.5930909090909091, "grad_norm": 5.71875, "grad_norm_var": 0.10784098307291666, "learning_rate": 0.0001, "loss": 5.6753, "loss/crossentropy": 2.4484336972236633, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17073404788970947, "step": 13048 }, { "epoch": 0.5931818181818181, "grad_norm": 4.53125, "grad_norm_var": 0.10597330729166667, "learning_rate": 0.0001, "loss": 5.4318, "loss/crossentropy": 2.442477971315384, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15069254860281944, "step": 13050 }, { "epoch": 0.5932727272727273, "grad_norm": 4.84375, "grad_norm_var": 0.0880859375, "learning_rate": 0.0001, "loss": 5.4465, "loss/crossentropy": 2.4351808428764343, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15288659185171127, "step": 13052 }, { "epoch": 0.5933636363636363, "grad_norm": 5.96875, "grad_norm_var": 0.14927978515625, "learning_rate": 0.0001, "loss": 5.803, "loss/crossentropy": 2.535852789878845, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1755416840314865, "step": 13054 }, { "epoch": 0.5934545454545455, "grad_norm": 4.875, "grad_norm_var": 0.14612223307291666, "learning_rate": 0.0001, "loss": 5.4508, "loss/crossentropy": 2.4302980303764343, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15536685287952423, "step": 13056 }, { "epoch": 0.5935454545454546, "grad_norm": 5.53125, "grad_norm_var": 0.178125, "learning_rate": 0.0001, "loss": 5.8687, "loss/crossentropy": 2.5813333988189697, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17755988985300064, "step": 13058 }, { "epoch": 0.5936363636363636, "grad_norm": 4.6875, "grad_norm_var": 0.17472330729166666, "learning_rate": 0.0001, "loss": 5.4386, "loss/crossentropy": 2.365623891353607, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15885701775550842, "step": 13060 }, { "epoch": 0.5937272727272728, "grad_norm": 4.71875, "grad_norm_var": 0.17902018229166666, "learning_rate": 0.0001, "loss": 5.737, "loss/crossentropy": 2.619325876235962, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16645066067576408, "step": 13062 }, { "epoch": 0.5938181818181818, "grad_norm": 5.21875, "grad_norm_var": 0.15885009765625, "learning_rate": 0.0001, "loss": 6.1114, "loss/crossentropy": 2.8551997542381287, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.17620301619172096, "step": 13064 }, { "epoch": 0.5939090909090909, "grad_norm": 5.125, "grad_norm_var": 0.15263264973958332, "learning_rate": 0.0001, "loss": 5.5226, "loss/crossentropy": 2.4218417406082153, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16202502325177193, "step": 13066 }, { "epoch": 0.594, "grad_norm": 4.84375, "grad_norm_var": 0.16217447916666666, "learning_rate": 0.0001, "loss": 5.3878, "loss/crossentropy": 2.3727846145629883, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15364569053053856, "step": 13068 }, { "epoch": 0.5940909090909091, "grad_norm": 5.28125, "grad_norm_var": 0.515478515625, "learning_rate": 0.0001, "loss": 5.7468, "loss/crossentropy": 2.5909459590911865, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16480199992656708, "step": 13070 }, { "epoch": 0.5941818181818181, "grad_norm": 5.40625, "grad_norm_var": 0.50396728515625, "learning_rate": 0.0001, "loss": 5.0196, "loss/crossentropy": 2.064130187034607, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.14593354612588882, "step": 13072 }, { "epoch": 0.5942727272727273, "grad_norm": 4.6875, "grad_norm_var": 0.513916015625, "learning_rate": 0.0001, "loss": 5.7515, "loss/crossentropy": 2.6614574193954468, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1623275987803936, "step": 13074 }, { "epoch": 0.5943636363636363, "grad_norm": 5.0625, "grad_norm_var": 0.5143229166666666, "learning_rate": 0.0001, "loss": 5.4888, "loss/crossentropy": 2.4318927526474, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15705391019582748, "step": 13076 }, { "epoch": 0.5944545454545455, "grad_norm": 4.71875, "grad_norm_var": 0.51640625, "learning_rate": 0.0001, "loss": 5.5591, "loss/crossentropy": 2.5032665729522705, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15890008583664894, "step": 13078 }, { "epoch": 0.5945454545454546, "grad_norm": 4.90625, "grad_norm_var": 0.5117146809895833, "learning_rate": 0.0001, "loss": 5.7501, "loss/crossentropy": 2.585473656654358, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16821817681193352, "step": 13080 }, { "epoch": 0.5946363636363636, "grad_norm": 4.78125, "grad_norm_var": 0.49972330729166664, "learning_rate": 0.0001, "loss": 5.7322, "loss/crossentropy": 2.6400110125541687, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1619493067264557, "step": 13082 }, { "epoch": 0.5947272727272728, "grad_norm": 4.90625, "grad_norm_var": 0.48723958333333334, "learning_rate": 0.0001, "loss": 5.3579, "loss/crossentropy": 2.3628477454185486, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.1547766625881195, "step": 13084 }, { "epoch": 0.5948181818181818, "grad_norm": 5.25, "grad_norm_var": 0.05597330729166667, "learning_rate": 0.0001, "loss": 5.2808, "loss/crossentropy": 2.241294205188751, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15571137890219688, "step": 13086 }, { "epoch": 0.5949090909090909, "grad_norm": 4.59375, "grad_norm_var": 0.043619791666666664, "learning_rate": 0.0001, "loss": 5.2295, "loss/crossentropy": 2.2724371552467346, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1451207473874092, "step": 13088 }, { "epoch": 0.595, "grad_norm": 5.0625, "grad_norm_var": 0.05455729166666667, "learning_rate": 0.0001, "loss": 5.6607, "loss/crossentropy": 2.594714045524597, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1577659696340561, "step": 13090 }, { "epoch": 0.5950909090909091, "grad_norm": 5.0625, "grad_norm_var": 0.04976806640625, "learning_rate": 0.0001, "loss": 5.3871, "loss/crossentropy": 2.3948134779930115, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15118132531642914, "step": 13092 }, { "epoch": 0.5951818181818181, "grad_norm": 4.75, "grad_norm_var": 0.05484619140625, "learning_rate": 0.0001, "loss": 4.9844, "loss/crossentropy": 2.058290809392929, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.14494982361793518, "step": 13094 }, { "epoch": 0.5952727272727273, "grad_norm": 5.625, "grad_norm_var": 0.08201497395833333, "learning_rate": 0.0001, "loss": 5.677, "loss/crossentropy": 2.554017424583435, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.162685826420784, "step": 13096 }, { "epoch": 0.5953636363636363, "grad_norm": 5.0625, "grad_norm_var": 0.0810546875, "learning_rate": 0.0001, "loss": 5.8151, "loss/crossentropy": 2.5986998677253723, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1681247353553772, "step": 13098 }, { "epoch": 0.5954545454545455, "grad_norm": 4.65625, "grad_norm_var": 0.09218343098958333, "learning_rate": 0.0001, "loss": 5.4829, "loss/crossentropy": 2.433473229408264, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15650127455592155, "step": 13100 }, { "epoch": 0.5955454545454546, "grad_norm": 4.90625, "grad_norm_var": 0.07867431640625, "learning_rate": 0.0001, "loss": 5.6037, "loss/crossentropy": 2.520003378391266, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16188176348805428, "step": 13102 }, { "epoch": 0.5956363636363636, "grad_norm": 4.75, "grad_norm_var": 0.07115478515625, "learning_rate": 0.0001, "loss": 5.8028, "loss/crossentropy": 2.611322343349457, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16953802108764648, "step": 13104 }, { "epoch": 0.5957272727272728, "grad_norm": 4.84375, "grad_norm_var": 0.06326497395833333, "learning_rate": 0.0001, "loss": 5.4801, "loss/crossentropy": 2.3528942465782166, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1619361788034439, "step": 13106 }, { "epoch": 0.5958181818181818, "grad_norm": 5.0, "grad_norm_var": 0.06317952473958334, "learning_rate": 0.0001, "loss": 5.7693, "loss/crossentropy": 2.5881011486053467, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.17026937380433083, "step": 13108 }, { "epoch": 0.5959090909090909, "grad_norm": 4.84375, "grad_norm_var": 0.05478108723958333, "learning_rate": 0.0001, "loss": 5.6008, "loss/crossentropy": 2.4495511054992676, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16453707218170166, "step": 13110 }, { "epoch": 0.596, "grad_norm": 5.03125, "grad_norm_var": 0.15862223307291667, "learning_rate": 0.0001, "loss": 5.8609, "loss/crossentropy": 2.6291807293891907, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.17414913326501846, "step": 13112 }, { "epoch": 0.5960909090909091, "grad_norm": 5.28125, "grad_norm_var": 0.1634765625, "learning_rate": 0.0001, "loss": 5.7332, "loss/crossentropy": 2.596071779727936, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1633242592215538, "step": 13114 }, { "epoch": 0.5961818181818181, "grad_norm": 5.8125, "grad_norm_var": 0.18482666015625, "learning_rate": 0.0001, "loss": 5.8024, "loss/crossentropy": 2.549975335597992, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17055293172597885, "step": 13116 }, { "epoch": 0.5962727272727273, "grad_norm": 6.3125, "grad_norm_var": 0.27369384765625, "learning_rate": 0.0001, "loss": 5.743, "loss/crossentropy": 2.6127020716667175, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16459351778030396, "step": 13118 }, { "epoch": 0.5963636363636363, "grad_norm": 4.6875, "grad_norm_var": 0.29147135416666664, "learning_rate": 0.0001, "loss": 5.3025, "loss/crossentropy": 2.3244688510894775, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.14956063404679298, "step": 13120 }, { "epoch": 0.5964545454545455, "grad_norm": 4.875, "grad_norm_var": 0.34881184895833334, "learning_rate": 0.0001, "loss": 5.4933, "loss/crossentropy": 2.4613835513591766, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1571020446717739, "step": 13122 }, { "epoch": 0.5965454545454546, "grad_norm": 5.0, "grad_norm_var": 0.34490559895833334, "learning_rate": 0.0001, "loss": 5.9361, "loss/crossentropy": 2.7014739513397217, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17326701432466507, "step": 13124 }, { "epoch": 0.5966363636363636, "grad_norm": 6.625, "grad_norm_var": 0.47003580729166666, "learning_rate": 0.0001, "loss": 5.9732, "loss/crossentropy": 2.7262784838676453, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17312711849808693, "step": 13126 }, { "epoch": 0.5967272727272728, "grad_norm": 4.75, "grad_norm_var": 0.3973795572916667, "learning_rate": 0.0001, "loss": 5.3606, "loss/crossentropy": 2.3435181975364685, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1536656878888607, "step": 13128 }, { "epoch": 0.5968181818181818, "grad_norm": 4.6875, "grad_norm_var": 0.6104777018229167, "learning_rate": 0.0001, "loss": 5.502, "loss/crossentropy": 2.398867666721344, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16324134171009064, "step": 13130 }, { "epoch": 0.596909090909091, "grad_norm": 5.5625, "grad_norm_var": 0.6043253580729167, "learning_rate": 0.0001, "loss": 6.0585, "loss/crossentropy": 2.8271849751472473, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.17449689283967018, "step": 13132 }, { "epoch": 0.597, "grad_norm": 5.03125, "grad_norm_var": 0.5015909830729167, "learning_rate": 0.0001, "loss": 5.8685, "loss/crossentropy": 2.67732036113739, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1699039414525032, "step": 13134 }, { "epoch": 0.5970909090909091, "grad_norm": 4.84375, "grad_norm_var": 0.4909138997395833, "learning_rate": 0.0001, "loss": 5.8262, "loss/crossentropy": 2.5951598286628723, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.173301599919796, "step": 13136 }, { "epoch": 0.5971818181818181, "grad_norm": 4.46875, "grad_norm_var": 0.45959879557291666, "learning_rate": 0.0001, "loss": 5.583, "loss/crossentropy": 2.51961150765419, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1584889031946659, "step": 13138 }, { "epoch": 0.5972727272727273, "grad_norm": 4.8125, "grad_norm_var": 0.48053385416666666, "learning_rate": 0.0001, "loss": 5.4131, "loss/crossentropy": 2.4426139295101166, "loss/hidden": 1.439453125, "loss/jsd": 0.0, "loss/logits": 0.15310735628008842, "step": 13140 }, { "epoch": 0.5973636363636363, "grad_norm": 5.0, "grad_norm_var": 0.33345947265625, "learning_rate": 0.0001, "loss": 5.3469, "loss/crossentropy": 2.3660163581371307, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15081876143813133, "step": 13142 }, { "epoch": 0.5974545454545455, "grad_norm": 5.4375, "grad_norm_var": 0.3312784830729167, "learning_rate": 0.0001, "loss": 5.8796, "loss/crossentropy": 2.6416966319084167, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17379150539636612, "step": 13144 }, { "epoch": 0.5975454545454545, "grad_norm": 4.71875, "grad_norm_var": 0.09843343098958333, "learning_rate": 0.0001, "loss": 5.4919, "loss/crossentropy": 2.475735366344452, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1520027481019497, "step": 13146 }, { "epoch": 0.5976363636363636, "grad_norm": 4.5625, "grad_norm_var": 0.10715738932291667, "learning_rate": 0.0001, "loss": 5.0047, "loss/crossentropy": 2.1856743693351746, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1365935541689396, "step": 13148 }, { "epoch": 0.5977272727272728, "grad_norm": 5.6875, "grad_norm_var": 0.14386393229166666, "learning_rate": 0.0001, "loss": 5.695, "loss/crossentropy": 2.530966877937317, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16660041362047195, "step": 13150 }, { "epoch": 0.5978181818181818, "grad_norm": 5.125, "grad_norm_var": 0.11799723307291667, "learning_rate": 0.0001, "loss": 5.6598, "loss/crossentropy": 2.4979645907878876, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1671619638800621, "step": 13152 }, { "epoch": 0.597909090909091, "grad_norm": 4.65625, "grad_norm_var": 0.10972900390625, "learning_rate": 0.0001, "loss": 5.6703, "loss/crossentropy": 2.5856587290763855, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.16256918385624886, "step": 13154 }, { "epoch": 0.598, "grad_norm": 4.3125, "grad_norm_var": 0.12893473307291667, "learning_rate": 0.0001, "loss": 5.5142, "loss/crossentropy": 2.527314603328705, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.15434861183166504, "step": 13156 }, { "epoch": 0.5980909090909091, "grad_norm": 4.90625, "grad_norm_var": 0.16490885416666667, "learning_rate": 0.0001, "loss": 5.9965, "loss/crossentropy": 2.7465386986732483, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17655743286013603, "step": 13158 }, { "epoch": 0.5981818181818181, "grad_norm": 4.90625, "grad_norm_var": 0.156884765625, "learning_rate": 0.0001, "loss": 5.1456, "loss/crossentropy": 2.175815999507904, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.14717816933989525, "step": 13160 }, { "epoch": 0.5982727272727273, "grad_norm": 4.34375, "grad_norm_var": 0.17066650390625, "learning_rate": 0.0001, "loss": 5.2894, "loss/crossentropy": 2.29888653755188, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.14925163984298706, "step": 13162 }, { "epoch": 0.5983636363636363, "grad_norm": 4.5625, "grad_norm_var": 0.1544921875, "learning_rate": 0.0001, "loss": 5.3821, "loss/crossentropy": 2.3537777960300446, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1563475877046585, "step": 13164 }, { "epoch": 0.5984545454545455, "grad_norm": 4.875, "grad_norm_var": 0.10924479166666666, "learning_rate": 0.0001, "loss": 5.3982, "loss/crossentropy": 2.3643885254859924, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1547526754438877, "step": 13166 }, { "epoch": 0.5985454545454545, "grad_norm": 4.6875, "grad_norm_var": 0.11653645833333333, "learning_rate": 0.0001, "loss": 5.7395, "loss/crossentropy": 2.5530406832695007, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16884173080325127, "step": 13168 }, { "epoch": 0.5986363636363636, "grad_norm": 4.6875, "grad_norm_var": 0.11558837890625, "learning_rate": 0.0001, "loss": 5.5721, "loss/crossentropy": 2.5026203393936157, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.16144488006830215, "step": 13170 }, { "epoch": 0.5987272727272728, "grad_norm": 4.75, "grad_norm_var": 0.10025634765625, "learning_rate": 0.0001, "loss": 5.4533, "loss/crossentropy": 2.3873945474624634, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15757139027118683, "step": 13172 }, { "epoch": 0.5988181818181818, "grad_norm": 4.75, "grad_norm_var": 0.050634765625, "learning_rate": 0.0001, "loss": 5.4501, "loss/crossentropy": 2.5023956894874573, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15023517236113548, "step": 13174 }, { "epoch": 0.598909090909091, "grad_norm": 5.125, "grad_norm_var": 0.05152587890625, "learning_rate": 0.0001, "loss": 5.534, "loss/crossentropy": 2.496830701828003, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1552811898291111, "step": 13176 }, { "epoch": 0.599, "grad_norm": 4.625, "grad_norm_var": 0.038004557291666664, "learning_rate": 0.0001, "loss": 5.6403, "loss/crossentropy": 2.5815635919570923, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15861058607697487, "step": 13178 }, { "epoch": 0.5990909090909091, "grad_norm": 5.59375, "grad_norm_var": 0.06808268229166667, "learning_rate": 0.0001, "loss": 5.1642, "loss/crossentropy": 2.2524921596050262, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.14663701504468918, "step": 13180 }, { "epoch": 0.5991818181818181, "grad_norm": 4.65625, "grad_norm_var": 0.07330322265625, "learning_rate": 0.0001, "loss": 5.6826, "loss/crossentropy": 2.6390803456306458, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15766951069235802, "step": 13182 }, { "epoch": 0.5992727272727273, "grad_norm": 5.1875, "grad_norm_var": 0.068994140625, "learning_rate": 0.0001, "loss": 5.971, "loss/crossentropy": 2.7782201766967773, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1704460084438324, "step": 13184 }, { "epoch": 0.5993636363636363, "grad_norm": 4.53125, "grad_norm_var": 0.07454020182291667, "learning_rate": 0.0001, "loss": 5.1849, "loss/crossentropy": 2.250243365764618, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.14776727557182312, "step": 13186 }, { "epoch": 0.5994545454545455, "grad_norm": 4.78125, "grad_norm_var": 0.07151285807291667, "learning_rate": 0.0001, "loss": 5.4054, "loss/crossentropy": 2.3928995728492737, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15418021380901337, "step": 13188 }, { "epoch": 0.5995454545454545, "grad_norm": 4.90625, "grad_norm_var": 0.06842447916666666, "learning_rate": 0.0001, "loss": 5.8394, "loss/crossentropy": 2.6491185426712036, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16961539909243584, "step": 13190 }, { "epoch": 0.5996363636363636, "grad_norm": 4.53125, "grad_norm_var": 0.07317708333333334, "learning_rate": 0.0001, "loss": 5.2032, "loss/crossentropy": 2.284791052341461, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1465298868715763, "step": 13192 }, { "epoch": 0.5997272727272728, "grad_norm": 4.40625, "grad_norm_var": 0.083056640625, "learning_rate": 0.0001, "loss": 5.4908, "loss/crossentropy": 2.4513997435569763, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15686534717679024, "step": 13194 }, { "epoch": 0.5998181818181818, "grad_norm": 6.40625, "grad_norm_var": 0.21350504557291666, "learning_rate": 0.0001, "loss": 5.765, "loss/crossentropy": 2.484646499156952, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.17471011355519295, "step": 13196 }, { "epoch": 0.599909090909091, "grad_norm": 5.15625, "grad_norm_var": 0.21942952473958333, "learning_rate": 0.0001, "loss": 5.6583, "loss/crossentropy": 2.5263527631759644, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1630026251077652, "step": 13198 }, { "epoch": 0.6, "grad_norm": 5.59375, "grad_norm_var": 0.43215738932291664, "learning_rate": 0.0001, "loss": 5.6513, "loss/crossentropy": 2.5130882263183594, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16420749947428703, "step": 13200 }, { "epoch": 0.6000909090909091, "grad_norm": 4.75, "grad_norm_var": 0.41763916015625, "learning_rate": 0.0001, "loss": 5.6382, "loss/crossentropy": 2.5550536513328552, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16143720969557762, "step": 13202 }, { "epoch": 0.6001818181818181, "grad_norm": 5.3125, "grad_norm_var": 0.3974609375, "learning_rate": 0.0001, "loss": 5.8982, "loss/crossentropy": 2.6857409477233887, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17358973994851112, "step": 13204 }, { "epoch": 0.6002727272727273, "grad_norm": 5.15625, "grad_norm_var": 0.39332275390625, "learning_rate": 0.0001, "loss": 5.499, "loss/crossentropy": 2.388649821281433, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1643580161035061, "step": 13206 }, { "epoch": 0.6003636363636363, "grad_norm": 5.34375, "grad_norm_var": 0.38062744140625, "learning_rate": 0.0001, "loss": 5.7425, "loss/crossentropy": 2.4868844151496887, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.17301885411143303, "step": 13208 }, { "epoch": 0.6004545454545455, "grad_norm": 5.15625, "grad_norm_var": 0.323681640625, "learning_rate": 0.0001, "loss": 5.5718, "loss/crossentropy": 2.46939480304718, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16063520312309265, "step": 13210 }, { "epoch": 0.6005454545454545, "grad_norm": 4.65625, "grad_norm_var": 0.28131510416666666, "learning_rate": 0.0001, "loss": 5.847, "loss/crossentropy": 2.7427425384521484, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16354884207248688, "step": 13212 }, { "epoch": 0.6006363636363636, "grad_norm": 5.96875, "grad_norm_var": 0.32420247395833335, "learning_rate": 0.0001, "loss": 5.7634, "loss/crossentropy": 2.5539660453796387, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1697758175432682, "step": 13214 }, { "epoch": 0.6007272727272728, "grad_norm": 5.3125, "grad_norm_var": 0.16326497395833334, "learning_rate": 0.0001, "loss": 5.7243, "loss/crossentropy": 2.643153488636017, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.16182073578238487, "step": 13216 }, { "epoch": 0.6008181818181818, "grad_norm": 5.375, "grad_norm_var": 0.16519775390625, "learning_rate": 0.0001, "loss": 5.6102, "loss/crossentropy": 2.4582282304763794, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16324791312217712, "step": 13218 }, { "epoch": 0.600909090909091, "grad_norm": 4.625, "grad_norm_var": 0.20207926432291667, "learning_rate": 0.0001, "loss": 5.6577, "loss/crossentropy": 2.58210551738739, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15853331238031387, "step": 13220 }, { "epoch": 0.601, "grad_norm": 4.96875, "grad_norm_var": 0.21252848307291666, "learning_rate": 0.0001, "loss": 5.1463, "loss/crossentropy": 2.188431680202484, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.14774325117468834, "step": 13222 }, { "epoch": 0.6010909090909091, "grad_norm": 5.125, "grad_norm_var": 0.13331705729166668, "learning_rate": 0.0001, "loss": 5.5316, "loss/crossentropy": 2.4156614542007446, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16432326287031174, "step": 13224 }, { "epoch": 0.6011818181818182, "grad_norm": 4.625, "grad_norm_var": 0.14073893229166667, "learning_rate": 0.0001, "loss": 5.3738, "loss/crossentropy": 2.3682169020175934, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15329153835773468, "step": 13226 }, { "epoch": 0.6012727272727273, "grad_norm": 4.78125, "grad_norm_var": 0.140087890625, "learning_rate": 0.0001, "loss": 5.1545, "loss/crossentropy": 2.227293848991394, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.145841084420681, "step": 13228 }, { "epoch": 0.6013636363636363, "grad_norm": 5.625, "grad_norm_var": 0.12877604166666667, "learning_rate": 0.0001, "loss": 5.8619, "loss/crossentropy": 2.59367835521698, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.17545780539512634, "step": 13230 }, { "epoch": 0.6014545454545455, "grad_norm": 4.6875, "grad_norm_var": 0.12771809895833333, "learning_rate": 0.0001, "loss": 5.597, "loss/crossentropy": 2.4965121746063232, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16278129443526268, "step": 13232 }, { "epoch": 0.6015454545454545, "grad_norm": 4.6875, "grad_norm_var": 0.11978759765625, "learning_rate": 0.0001, "loss": 5.5519, "loss/crossentropy": 2.48773729801178, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1570051908493042, "step": 13234 }, { "epoch": 0.6016363636363636, "grad_norm": 4.5625, "grad_norm_var": 0.11516520182291666, "learning_rate": 0.0001, "loss": 5.7242, "loss/crossentropy": 2.6411865949630737, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1582981012761593, "step": 13236 }, { "epoch": 0.6017272727272728, "grad_norm": 4.90625, "grad_norm_var": 0.12316080729166666, "learning_rate": 0.0001, "loss": 5.6507, "loss/crossentropy": 2.602257192134857, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.15933772176504135, "step": 13238 }, { "epoch": 0.6018181818181818, "grad_norm": 4.875, "grad_norm_var": 0.17707926432291668, "learning_rate": 0.0001, "loss": 5.9094, "loss/crossentropy": 2.7092089653015137, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.17177360132336617, "step": 13240 }, { "epoch": 0.601909090909091, "grad_norm": 4.375, "grad_norm_var": 0.19166666666666668, "learning_rate": 0.0001, "loss": 5.6073, "loss/crossentropy": 2.5710765719413757, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15596768632531166, "step": 13242 }, { "epoch": 0.602, "grad_norm": 5.1875, "grad_norm_var": 0.19010009765625, "learning_rate": 0.0001, "loss": 5.681, "loss/crossentropy": 2.5679922699928284, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1609140895307064, "step": 13244 }, { "epoch": 0.6020909090909091, "grad_norm": 4.75, "grad_norm_var": 0.13253580729166667, "learning_rate": 0.0001, "loss": 5.3658, "loss/crossentropy": 2.4245828986167908, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.14861777052283287, "step": 13246 }, { "epoch": 0.6021818181818182, "grad_norm": 5.0, "grad_norm_var": 0.13292643229166667, "learning_rate": 0.0001, "loss": 5.9399, "loss/crossentropy": 2.7722641825675964, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16911085322499275, "step": 13248 }, { "epoch": 0.6022727272727273, "grad_norm": 4.84375, "grad_norm_var": 0.13538004557291666, "learning_rate": 0.0001, "loss": 5.0593, "loss/crossentropy": 2.2389366030693054, "loss/hidden": 1.439453125, "loss/jsd": 0.0, "loss/logits": 0.13809145614504814, "step": 13250 }, { "epoch": 0.6023636363636363, "grad_norm": 5.0, "grad_norm_var": 0.13215738932291668, "learning_rate": 0.0001, "loss": 5.9877, "loss/crossentropy": 2.8109993934631348, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17196938022971153, "step": 13252 }, { "epoch": 0.6024545454545455, "grad_norm": 4.5625, "grad_norm_var": 0.12838541666666667, "learning_rate": 0.0001, "loss": 5.0762, "loss/crossentropy": 2.155842751264572, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.14457407593727112, "step": 13254 }, { "epoch": 0.6025454545454545, "grad_norm": 5.125, "grad_norm_var": 0.06422119140625, "learning_rate": 0.0001, "loss": 5.5281, "loss/crossentropy": 2.4204238057136536, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.15959205478429794, "step": 13256 }, { "epoch": 0.6026363636363636, "grad_norm": 5.40625, "grad_norm_var": 0.5253743489583333, "learning_rate": 0.0001, "loss": 5.8298, "loss/crossentropy": 2.5972495675086975, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17168844491243362, "step": 13258 }, { "epoch": 0.6027272727272728, "grad_norm": 5.125, "grad_norm_var": 0.5251139322916667, "learning_rate": 0.0001, "loss": 5.6847, "loss/crossentropy": 2.5864909291267395, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16275355219841003, "step": 13260 }, { "epoch": 0.6028181818181818, "grad_norm": 4.75, "grad_norm_var": 0.5125, "learning_rate": 0.0001, "loss": 5.6291, "loss/crossentropy": 2.549295127391815, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.16168807819485664, "step": 13262 }, { "epoch": 0.602909090909091, "grad_norm": 5.5, "grad_norm_var": 0.5285807291666667, "learning_rate": 0.0001, "loss": 5.7075, "loss/crossentropy": 2.626074433326721, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16146034002304077, "step": 13264 }, { "epoch": 0.603, "grad_norm": 4.5625, "grad_norm_var": 0.5204386393229167, "learning_rate": 0.0001, "loss": 5.0834, "loss/crossentropy": 2.1499944925308228, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1437312588095665, "step": 13266 }, { "epoch": 0.6030909090909091, "grad_norm": 4.5, "grad_norm_var": 0.5583292643229166, "learning_rate": 0.0001, "loss": 5.296, "loss/crossentropy": 2.385276734828949, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.14595041424036026, "step": 13268 }, { "epoch": 0.6031818181818182, "grad_norm": 4.5625, "grad_norm_var": 0.67271728515625, "learning_rate": 0.0001, "loss": 5.7809, "loss/crossentropy": 2.613357901573181, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1698807142674923, "step": 13270 }, { "epoch": 0.6032727272727273, "grad_norm": 6.0625, "grad_norm_var": 0.8593587239583333, "learning_rate": 0.0001, "loss": 5.9352, "loss/crossentropy": 2.6863903999328613, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17839810624718666, "step": 13272 }, { "epoch": 0.6033636363636363, "grad_norm": 5.0625, "grad_norm_var": 0.5505045572916667, "learning_rate": 0.0001, "loss": 6.0022, "loss/crossentropy": 2.7551998496055603, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.17411867156624794, "step": 13274 }, { "epoch": 0.6034545454545455, "grad_norm": 5.46875, "grad_norm_var": 0.5538045247395833, "learning_rate": 0.0001, "loss": 5.6842, "loss/crossentropy": 2.535228729248047, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16450176760554314, "step": 13276 }, { "epoch": 0.6035454545454545, "grad_norm": 4.78125, "grad_norm_var": 0.5622395833333333, "learning_rate": 0.0001, "loss": 5.1773, "loss/crossentropy": 2.2751423716545105, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.14568665623664856, "step": 13278 }, { "epoch": 0.6036363636363636, "grad_norm": 5.125, "grad_norm_var": 0.5536458333333333, "learning_rate": 0.0001, "loss": 5.5811, "loss/crossentropy": 2.4948442578315735, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1584291309118271, "step": 13280 }, { "epoch": 0.6037272727272728, "grad_norm": 5.125, "grad_norm_var": 0.5295857747395833, "learning_rate": 0.0001, "loss": 5.8679, "loss/crossentropy": 2.677632510662079, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1701999492943287, "step": 13282 }, { "epoch": 0.6038181818181818, "grad_norm": 5.1875, "grad_norm_var": 0.46090087890625, "learning_rate": 0.0001, "loss": 5.6117, "loss/crossentropy": 2.4847099781036377, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1642628014087677, "step": 13284 }, { "epoch": 0.603909090909091, "grad_norm": 4.90625, "grad_norm_var": 0.35383707682291665, "learning_rate": 0.0001, "loss": 5.3898, "loss/crossentropy": 2.421732008457184, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15149536356329918, "step": 13286 }, { "epoch": 0.604, "grad_norm": 5.21875, "grad_norm_var": 0.119384765625, "learning_rate": 0.0001, "loss": 5.5005, "loss/crossentropy": 2.4364213347434998, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15777258574962616, "step": 13288 }, { "epoch": 0.6040909090909091, "grad_norm": 4.84375, "grad_norm_var": 0.05725504557291667, "learning_rate": 0.0001, "loss": 5.2265, "loss/crossentropy": 2.263393223285675, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1490405984222889, "step": 13290 }, { "epoch": 0.6041818181818182, "grad_norm": 4.78125, "grad_norm_var": 0.04586181640625, "learning_rate": 0.0001, "loss": 5.1311, "loss/crossentropy": 2.2565248608589172, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.14214203134179115, "step": 13292 }, { "epoch": 0.6042727272727273, "grad_norm": 4.84375, "grad_norm_var": 0.046223958333333336, "learning_rate": 0.0001, "loss": 5.6349, "loss/crossentropy": 2.518991708755493, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16295583173632622, "step": 13294 }, { "epoch": 0.6043636363636363, "grad_norm": 4.75, "grad_norm_var": 0.04803059895833333, "learning_rate": 0.0001, "loss": 5.6243, "loss/crossentropy": 2.570718288421631, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15926288813352585, "step": 13296 }, { "epoch": 0.6044545454545455, "grad_norm": 4.78125, "grad_norm_var": 0.045210774739583334, "learning_rate": 0.0001, "loss": 5.4022, "loss/crossentropy": 2.4169577956199646, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.14813815243542194, "step": 13298 }, { "epoch": 0.6045454545454545, "grad_norm": 5.53125, "grad_norm_var": 5.769038899739583, "learning_rate": 0.0001, "loss": 5.4613, "loss/crossentropy": 2.3328678011894226, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16284482181072235, "step": 13300 }, { "epoch": 0.6046363636363636, "grad_norm": 5.0625, "grad_norm_var": 5.7271484375, "learning_rate": 0.0001, "loss": 5.6006, "loss/crossentropy": 2.4668155908584595, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16454803571105003, "step": 13302 }, { "epoch": 0.6047272727272728, "grad_norm": 4.75, "grad_norm_var": 5.756754557291667, "learning_rate": 0.0001, "loss": 5.3251, "loss/crossentropy": 2.3632932603359222, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.14969193190336227, "step": 13304 }, { "epoch": 0.6048181818181818, "grad_norm": 5.125, "grad_norm_var": 5.919266764322916, "learning_rate": 0.0001, "loss": 5.9163, "loss/crossentropy": 2.6530949473381042, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1745624616742134, "step": 13306 }, { "epoch": 0.604909090909091, "grad_norm": 4.8125, "grad_norm_var": 5.884505208333334, "learning_rate": 0.0001, "loss": 5.6656, "loss/crossentropy": 2.581167221069336, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1605941355228424, "step": 13308 }, { "epoch": 0.605, "grad_norm": 4.65625, "grad_norm_var": 5.976688639322917, "learning_rate": 0.0001, "loss": 5.3338, "loss/crossentropy": 2.4129437804222107, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.14677563682198524, "step": 13310 }, { "epoch": 0.6050909090909091, "grad_norm": 4.625, "grad_norm_var": 6.001102701822917, "learning_rate": 0.0001, "loss": 5.5557, "loss/crossentropy": 2.5000206530094147, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15869216993451118, "step": 13312 }, { "epoch": 0.6051818181818182, "grad_norm": 5.34375, "grad_norm_var": 5.96920166015625, "learning_rate": 0.0001, "loss": 5.595, "loss/crossentropy": 2.505952000617981, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15988320484757423, "step": 13314 }, { "epoch": 0.6052727272727273, "grad_norm": 4.4375, "grad_norm_var": 0.5344034830729166, "learning_rate": 0.0001, "loss": 5.2887, "loss/crossentropy": 2.359533190727234, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.14838217571377754, "step": 13316 }, { "epoch": 0.6053636363636363, "grad_norm": 4.9375, "grad_norm_var": 0.5230428059895833, "learning_rate": 0.0001, "loss": 5.8151, "loss/crossentropy": 2.6509228944778442, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16739663109183311, "step": 13318 }, { "epoch": 0.6054545454545455, "grad_norm": 4.78125, "grad_norm_var": 0.5184733072916666, "learning_rate": 0.0001, "loss": 5.9096, "loss/crossentropy": 2.713243842124939, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.17061319202184677, "step": 13320 }, { "epoch": 0.6055454545454545, "grad_norm": 5.46875, "grad_norm_var": 3.5537068684895834, "learning_rate": 0.0001, "loss": 6.0187, "loss/crossentropy": 2.5558062195777893, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.19589563459157944, "step": 13322 }, { "epoch": 0.6056363636363636, "grad_norm": 5.09375, "grad_norm_var": 3.529541015625, "learning_rate": 0.0001, "loss": 5.3778, "loss/crossentropy": 2.3600906133651733, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15372032299637794, "step": 13324 }, { "epoch": 0.6057272727272728, "grad_norm": 4.6875, "grad_norm_var": 3.49459228515625, "learning_rate": 0.0001, "loss": 5.2877, "loss/crossentropy": 2.319792926311493, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15030836313962936, "step": 13326 }, { "epoch": 0.6058181818181818, "grad_norm": 4.5625, "grad_norm_var": 3.5010701497395833, "learning_rate": 0.0001, "loss": 5.2891, "loss/crossentropy": 2.3681472539901733, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.14775670692324638, "step": 13328 }, { "epoch": 0.605909090909091, "grad_norm": 4.78125, "grad_norm_var": 3.5163899739583333, "learning_rate": 0.0001, "loss": 5.5907, "loss/crossentropy": 2.5010276436805725, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16170088201761246, "step": 13330 }, { "epoch": 0.606, "grad_norm": 4.875, "grad_norm_var": 3.4774373372395835, "learning_rate": 0.0001, "loss": 5.4742, "loss/crossentropy": 2.4159963726997375, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15855331346392632, "step": 13332 }, { "epoch": 0.6060909090909091, "grad_norm": 4.21875, "grad_norm_var": 3.5382649739583334, "learning_rate": 0.0001, "loss": 5.3494, "loss/crossentropy": 2.382845848798752, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.14899759367108345, "step": 13334 }, { "epoch": 0.6061818181818182, "grad_norm": 5.3125, "grad_norm_var": 3.547782389322917, "learning_rate": 0.0001, "loss": 5.5365, "loss/crossentropy": 2.410194605588913, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16067684069275856, "step": 13336 }, { "epoch": 0.6062727272727273, "grad_norm": 5.90625, "grad_norm_var": 0.24724934895833334, "learning_rate": 0.0001, "loss": 5.9378, "loss/crossentropy": 2.6091347336769104, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.18110663443803787, "step": 13338 }, { "epoch": 0.6063636363636363, "grad_norm": 4.65625, "grad_norm_var": 0.2720703125, "learning_rate": 0.0001, "loss": 5.477, "loss/crossentropy": 2.4703749120235443, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.15046873316168785, "step": 13340 }, { "epoch": 0.6064545454545455, "grad_norm": 4.53125, "grad_norm_var": 0.279931640625, "learning_rate": 0.0001, "loss": 5.5245, "loss/crossentropy": 2.5283278822898865, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1566511169075966, "step": 13342 }, { "epoch": 0.6065454545454545, "grad_norm": 4.9375, "grad_norm_var": 0.265087890625, "learning_rate": 0.0001, "loss": 5.4862, "loss/crossentropy": 2.4469814896583557, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15392126515507698, "step": 13344 }, { "epoch": 0.6066363636363636, "grad_norm": 5.5, "grad_norm_var": 0.26737874348958335, "learning_rate": 0.0001, "loss": 5.9068, "loss/crossentropy": 2.660727322101593, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.17558301240205765, "step": 13346 }, { "epoch": 0.6067272727272728, "grad_norm": 5.25, "grad_norm_var": 0.2609375, "learning_rate": 0.0001, "loss": 5.6745, "loss/crossentropy": 2.481992155313492, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.17022350057959557, "step": 13348 }, { "epoch": 0.6068181818181818, "grad_norm": 5.03125, "grad_norm_var": 0.20061442057291667, "learning_rate": 0.0001, "loss": 5.6291, "loss/crossentropy": 2.453981935977936, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.1677098385989666, "step": 13350 }, { "epoch": 0.606909090909091, "grad_norm": 4.53125, "grad_norm_var": 0.1466796875, "learning_rate": 0.0001, "loss": 5.2806, "loss/crossentropy": 2.3328853845596313, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.14848090708255768, "step": 13352 }, { "epoch": 0.607, "grad_norm": 4.5, "grad_norm_var": 0.11575113932291667, "learning_rate": 0.0001, "loss": 5.2504, "loss/crossentropy": 2.3059535920619965, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1448319125920534, "step": 13354 }, { "epoch": 0.6070909090909091, "grad_norm": 5.25, "grad_norm_var": 0.11458333333333333, "learning_rate": 0.0001, "loss": 5.2205, "loss/crossentropy": 2.3005822598934174, "loss/hidden": 1.419921875, "loss/jsd": 0.0, "loss/logits": 0.14999573677778244, "step": 13356 }, { "epoch": 0.6071818181818182, "grad_norm": 5.21875, "grad_norm_var": 0.4056640625, "learning_rate": 0.0001, "loss": 5.4431, "loss/crossentropy": 2.3387625217437744, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.15652533248066902, "step": 13358 }, { "epoch": 0.6072727272727273, "grad_norm": 4.46875, "grad_norm_var": 0.4175618489583333, "learning_rate": 0.0001, "loss": 5.3636, "loss/crossentropy": 2.399239420890808, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15072985365986824, "step": 13360 }, { "epoch": 0.6073636363636363, "grad_norm": 4.6875, "grad_norm_var": 0.4299112955729167, "learning_rate": 0.0001, "loss": 5.8879, "loss/crossentropy": 2.738557815551758, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16805585846304893, "step": 13362 }, { "epoch": 0.6074545454545455, "grad_norm": 4.53125, "grad_norm_var": 0.4443359375, "learning_rate": 0.0001, "loss": 5.445, "loss/crossentropy": 2.414939761161804, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15593304857611656, "step": 13364 }, { "epoch": 0.6075454545454545, "grad_norm": 5.21875, "grad_norm_var": 0.44439697265625, "learning_rate": 0.0001, "loss": 5.6535, "loss/crossentropy": 2.6108436584472656, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.1571936309337616, "step": 13366 }, { "epoch": 0.6076363636363636, "grad_norm": 4.15625, "grad_norm_var": 0.46724853515625, "learning_rate": 0.0001, "loss": 5.293, "loss/crossentropy": 2.326842784881592, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1513056345283985, "step": 13368 }, { "epoch": 0.6077272727272728, "grad_norm": 4.65625, "grad_norm_var": 0.45885009765625, "learning_rate": 0.0001, "loss": 5.5528, "loss/crossentropy": 2.4438453316688538, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.16343355178833008, "step": 13370 }, { "epoch": 0.6078181818181818, "grad_norm": 5.125, "grad_norm_var": 0.440869140625, "learning_rate": 0.0001, "loss": 5.9737, "loss/crossentropy": 2.7419592142105103, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.17492758482694626, "step": 13372 }, { "epoch": 0.607909090909091, "grad_norm": 4.4375, "grad_norm_var": 0.10429280598958333, "learning_rate": 0.0001, "loss": 5.4047, "loss/crossentropy": 2.4354814887046814, "loss/hidden": 1.431640625, "loss/jsd": 0.0, "loss/logits": 0.15375278517603874, "step": 13374 }, { "epoch": 0.608, "grad_norm": 4.875, "grad_norm_var": 0.09694010416666667, "learning_rate": 0.0001, "loss": 5.7454, "loss/crossentropy": 2.6215182542800903, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16375133022665977, "step": 13376 }, { "epoch": 0.6080909090909091, "grad_norm": 4.8125, "grad_norm_var": 0.09840087890625, "learning_rate": 0.0001, "loss": 5.6001, "loss/crossentropy": 2.537400245666504, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15802910551428795, "step": 13378 }, { "epoch": 0.6081818181818182, "grad_norm": 4.59375, "grad_norm_var": 0.10050455729166667, "learning_rate": 0.0001, "loss": 5.4348, "loss/crossentropy": 2.4860468804836273, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.14956235513091087, "step": 13380 }, { "epoch": 0.6082727272727273, "grad_norm": 4.78125, "grad_norm_var": 0.08795166015625, "learning_rate": 0.0001, "loss": 5.5547, "loss/crossentropy": 2.5136411786079407, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.15859729424118996, "step": 13382 }, { "epoch": 0.6083636363636363, "grad_norm": 5.34375, "grad_norm_var": 0.07753499348958333, "learning_rate": 0.0001, "loss": 5.7088, "loss/crossentropy": 2.487191617488861, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1739226058125496, "step": 13384 }, { "epoch": 0.6084545454545455, "grad_norm": 4.8125, "grad_norm_var": 0.07489827473958334, "learning_rate": 0.0001, "loss": 5.9559, "loss/crossentropy": 2.7266381978988647, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17273498699069023, "step": 13386 }, { "epoch": 0.6085454545454545, "grad_norm": 5.5, "grad_norm_var": 0.08427327473958333, "learning_rate": 0.0001, "loss": 6.0492, "loss/crossentropy": 2.7601717114448547, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17968083545565605, "step": 13388 }, { "epoch": 0.6086363636363636, "grad_norm": 4.84375, "grad_norm_var": 0.07513020833333334, "learning_rate": 0.0001, "loss": 5.6947, "loss/crossentropy": 2.630356788635254, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1595630720257759, "step": 13390 }, { "epoch": 0.6087272727272727, "grad_norm": 4.875, "grad_norm_var": 0.06767171223958333, "learning_rate": 0.0001, "loss": 5.4187, "loss/crossentropy": 2.3571101129055023, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15713462606072426, "step": 13392 }, { "epoch": 0.6088181818181818, "grad_norm": 4.6875, "grad_norm_var": 0.066259765625, "learning_rate": 0.0001, "loss": 5.5892, "loss/crossentropy": 2.4779102206230164, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1624961644411087, "step": 13394 }, { "epoch": 0.608909090909091, "grad_norm": 5.0, "grad_norm_var": 0.079931640625, "learning_rate": 0.0001, "loss": 5.4178, "loss/crossentropy": 2.347044348716736, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.157859455794096, "step": 13396 }, { "epoch": 0.609, "grad_norm": 5.3125, "grad_norm_var": 0.09517822265625, "learning_rate": 0.0001, "loss": 5.4534, "loss/crossentropy": 2.437023639678955, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1530061475932598, "step": 13398 }, { "epoch": 0.6090909090909091, "grad_norm": 5.375, "grad_norm_var": 0.09983317057291667, "learning_rate": 0.0001, "loss": 5.3749, "loss/crossentropy": 2.3272646069526672, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15593786165118217, "step": 13400 }, { "epoch": 0.6091818181818182, "grad_norm": 5.25, "grad_norm_var": 0.10963134765625, "learning_rate": 0.0001, "loss": 5.7186, "loss/crossentropy": 2.4883328080177307, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.17360872775316238, "step": 13402 }, { "epoch": 0.6092727272727273, "grad_norm": 4.71875, "grad_norm_var": 0.096728515625, "learning_rate": 0.0001, "loss": 5.3504, "loss/crossentropy": 2.357819139957428, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1510186828672886, "step": 13404 }, { "epoch": 0.6093636363636363, "grad_norm": 5.53125, "grad_norm_var": 0.09680989583333334, "learning_rate": 0.0001, "loss": 5.9814, "loss/crossentropy": 2.7481072545051575, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.17430227249860764, "step": 13406 }, { "epoch": 0.6094545454545455, "grad_norm": 5.25, "grad_norm_var": 0.11890869140625, "learning_rate": 0.0001, "loss": 5.4876, "loss/crossentropy": 2.4045779705047607, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1586904153227806, "step": 13408 }, { "epoch": 0.6095454545454545, "grad_norm": 5.0, "grad_norm_var": 0.10266520182291666, "learning_rate": 0.0001, "loss": 5.4968, "loss/crossentropy": 2.463227391242981, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1541408784687519, "step": 13410 }, { "epoch": 0.6096363636363636, "grad_norm": 4.8125, "grad_norm_var": 0.10299479166666667, "learning_rate": 0.0001, "loss": 5.7009, "loss/crossentropy": 2.6167253255844116, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1578356809914112, "step": 13412 }, { "epoch": 0.6097272727272727, "grad_norm": 6.21875, "grad_norm_var": 3.223763020833333, "learning_rate": 0.0001, "loss": 5.8865, "loss/crossentropy": 2.588716745376587, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.17528939247131348, "step": 13414 }, { "epoch": 0.6098181818181818, "grad_norm": 4.71875, "grad_norm_var": 3.293745930989583, "learning_rate": 0.0001, "loss": 5.5341, "loss/crossentropy": 2.4732799530029297, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15881211683154106, "step": 13416 }, { "epoch": 0.609909090909091, "grad_norm": 4.625, "grad_norm_var": 3.41148681640625, "learning_rate": 0.0001, "loss": 5.2927, "loss/crossentropy": 2.3249897956848145, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.15165027603507042, "step": 13418 }, { "epoch": 0.61, "grad_norm": 5.0625, "grad_norm_var": 3.430192057291667, "learning_rate": 0.0001, "loss": 5.4369, "loss/crossentropy": 2.4274608492851257, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15192106366157532, "step": 13420 }, { "epoch": 0.6100909090909091, "grad_norm": 5.1875, "grad_norm_var": 3.503889973958333, "learning_rate": 0.0001, "loss": 5.5199, "loss/crossentropy": 2.4530814588069916, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1572720631957054, "step": 13422 }, { "epoch": 0.6101818181818182, "grad_norm": 5.0, "grad_norm_var": 3.5360310872395835, "learning_rate": 0.0001, "loss": 5.2679, "loss/crossentropy": 2.2907902002334595, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15064018219709396, "step": 13424 }, { "epoch": 0.6102727272727273, "grad_norm": 5.3125, "grad_norm_var": 3.54000244140625, "learning_rate": 0.0001, "loss": 5.7447, "loss/crossentropy": 2.592038929462433, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.16976257786154747, "step": 13426 }, { "epoch": 0.6103636363636363, "grad_norm": 4.78125, "grad_norm_var": 3.565087890625, "learning_rate": 0.0001, "loss": 5.4343, "loss/crossentropy": 2.41688472032547, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15349455922842026, "step": 13428 }, { "epoch": 0.6104545454545455, "grad_norm": 5.1875, "grad_norm_var": 0.06510416666666667, "learning_rate": 0.0001, "loss": 5.8459, "loss/crossentropy": 2.711695373058319, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16576745361089706, "step": 13430 }, { "epoch": 0.6105454545454545, "grad_norm": 5.09375, "grad_norm_var": 0.09290364583333334, "learning_rate": 0.0001, "loss": 5.8876, "loss/crossentropy": 2.6194169521331787, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17760059982538223, "step": 13432 }, { "epoch": 0.6106363636363636, "grad_norm": 4.53125, "grad_norm_var": 0.09420166015625, "learning_rate": 0.0001, "loss": 5.5503, "loss/crossentropy": 2.4614731669425964, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15986104309558868, "step": 13434 }, { "epoch": 0.6107272727272727, "grad_norm": 6.25, "grad_norm_var": 0.18671468098958333, "learning_rate": 0.0001, "loss": 5.4411, "loss/crossentropy": 2.338426947593689, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1598796397447586, "step": 13436 }, { "epoch": 0.6108181818181818, "grad_norm": 5.375, "grad_norm_var": 0.16790364583333334, "learning_rate": 0.0001, "loss": 5.7101, "loss/crossentropy": 2.6199235320091248, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1603863313794136, "step": 13438 }, { "epoch": 0.610909090909091, "grad_norm": 5.6875, "grad_norm_var": 0.17486572265625, "learning_rate": 0.0001, "loss": 5.7381, "loss/crossentropy": 2.5851326286792755, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16705921292304993, "step": 13440 }, { "epoch": 0.611, "grad_norm": 4.59375, "grad_norm_var": 0.19110921223958333, "learning_rate": 0.0001, "loss": 5.6592, "loss/crossentropy": 2.5379753708839417, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16329049691557884, "step": 13442 }, { "epoch": 0.6110909090909091, "grad_norm": 5.3125, "grad_norm_var": 0.16728108723958332, "learning_rate": 0.0001, "loss": 5.9704, "loss/crossentropy": 2.753849983215332, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.17068196088075638, "step": 13444 }, { "epoch": 0.6111818181818182, "grad_norm": 5.125, "grad_norm_var": 0.169921875, "learning_rate": 0.0001, "loss": 5.5348, "loss/crossentropy": 2.46734356880188, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.15440108254551888, "step": 13446 }, { "epoch": 0.6112727272727273, "grad_norm": 4.625, "grad_norm_var": 0.19426676432291667, "learning_rate": 0.0001, "loss": 5.4134, "loss/crossentropy": 2.3416508436203003, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15756739675998688, "step": 13448 }, { "epoch": 0.6113636363636363, "grad_norm": 4.3125, "grad_norm_var": 0.22198893229166666, "learning_rate": 0.0001, "loss": 5.0413, "loss/crossentropy": 2.1659111976623535, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.14007296040654182, "step": 13450 }, { "epoch": 0.6114545454545455, "grad_norm": 5.125, "grad_norm_var": 0.12389322916666666, "learning_rate": 0.0001, "loss": 5.5043, "loss/crossentropy": 2.433621883392334, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.15687133371829987, "step": 13452 }, { "epoch": 0.6115454545454545, "grad_norm": 4.5, "grad_norm_var": 0.1525390625, "learning_rate": 0.0001, "loss": 5.4754, "loss/crossentropy": 2.529143512248993, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.14931030571460724, "step": 13454 }, { "epoch": 0.6116363636363636, "grad_norm": 5.40625, "grad_norm_var": 0.1099609375, "learning_rate": 0.0001, "loss": 6.152, "loss/crossentropy": 2.83367782831192, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.18085217475891113, "step": 13456 }, { "epoch": 0.6117272727272727, "grad_norm": 4.8125, "grad_norm_var": 0.10154622395833333, "learning_rate": 0.0001, "loss": 5.5688, "loss/crossentropy": 2.563223958015442, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15407317131757736, "step": 13458 }, { "epoch": 0.6118181818181818, "grad_norm": 5.3125, "grad_norm_var": 0.10533447265625, "learning_rate": 0.0001, "loss": 5.873, "loss/crossentropy": 2.663998544216156, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1714904122054577, "step": 13460 }, { "epoch": 0.611909090909091, "grad_norm": 4.875, "grad_norm_var": 0.0966796875, "learning_rate": 0.0001, "loss": 5.5281, "loss/crossentropy": 2.474609076976776, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15691332146525383, "step": 13462 }, { "epoch": 0.612, "grad_norm": 5.15625, "grad_norm_var": 0.10533447265625, "learning_rate": 0.0001, "loss": 5.8128, "loss/crossentropy": 2.6607489585876465, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16676495969295502, "step": 13464 }, { "epoch": 0.6120909090909091, "grad_norm": 5.21875, "grad_norm_var": 0.09176025390625, "learning_rate": 0.0001, "loss": 5.9659, "loss/crossentropy": 2.8130205869674683, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16646437719464302, "step": 13466 }, { "epoch": 0.6121818181818182, "grad_norm": 4.84375, "grad_norm_var": 0.5057291666666667, "learning_rate": 0.0001, "loss": 5.5981, "loss/crossentropy": 2.446286141872406, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.162837915122509, "step": 13468 }, { "epoch": 0.6122727272727273, "grad_norm": 4.75, "grad_norm_var": 0.484619140625, "learning_rate": 0.0001, "loss": 5.6089, "loss/crossentropy": 2.5663070678710938, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15757878124713898, "step": 13470 }, { "epoch": 0.6123636363636363, "grad_norm": 5.15625, "grad_norm_var": 0.501416015625, "learning_rate": 0.0001, "loss": 5.4532, "loss/crossentropy": 2.435228168964386, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15511441230773926, "step": 13472 }, { "epoch": 0.6124545454545455, "grad_norm": 4.875, "grad_norm_var": 0.4891764322916667, "learning_rate": 0.0001, "loss": 5.5672, "loss/crossentropy": 2.4584423303604126, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16244256868958473, "step": 13474 }, { "epoch": 0.6125454545454545, "grad_norm": 4.8125, "grad_norm_var": 0.46920572916666664, "learning_rate": 0.0001, "loss": 5.1459, "loss/crossentropy": 2.0853562355041504, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.15214965119957924, "step": 13476 }, { "epoch": 0.6126363636363636, "grad_norm": 4.875, "grad_norm_var": 0.479541015625, "learning_rate": 0.0001, "loss": 5.4678, "loss/crossentropy": 2.40700626373291, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15822458639740944, "step": 13478 }, { "epoch": 0.6127272727272727, "grad_norm": 5.0, "grad_norm_var": 0.480078125, "learning_rate": 0.0001, "loss": 5.6804, "loss/crossentropy": 2.4806538224220276, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16957995668053627, "step": 13480 }, { "epoch": 0.6128181818181818, "grad_norm": 4.8125, "grad_norm_var": 0.4811197916666667, "learning_rate": 0.0001, "loss": 5.2588, "loss/crossentropy": 2.2642974257469177, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15081555023789406, "step": 13482 }, { "epoch": 0.612909090909091, "grad_norm": 4.6875, "grad_norm_var": 0.05142822265625, "learning_rate": 0.0001, "loss": 5.5781, "loss/crossentropy": 2.5047664046287537, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15831426158547401, "step": 13484 }, { "epoch": 0.613, "grad_norm": 4.5625, "grad_norm_var": 0.04440104166666667, "learning_rate": 0.0001, "loss": 5.2584, "loss/crossentropy": 2.324939042329788, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.14608265087008476, "step": 13486 }, { "epoch": 0.6130909090909091, "grad_norm": 5.40625, "grad_norm_var": 0.04620768229166667, "learning_rate": 0.0001, "loss": 5.5729, "loss/crossentropy": 2.4918423295021057, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15966329351067543, "step": 13488 }, { "epoch": 0.6131818181818182, "grad_norm": 4.5625, "grad_norm_var": 0.05530192057291667, "learning_rate": 0.0001, "loss": 5.4616, "loss/crossentropy": 2.428170382976532, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15764391422271729, "step": 13490 }, { "epoch": 0.6132727272727273, "grad_norm": 5.21875, "grad_norm_var": 0.06360270182291666, "learning_rate": 0.0001, "loss": 5.597, "loss/crossentropy": 2.534876048564911, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15855344757437706, "step": 13492 }, { "epoch": 0.6133636363636363, "grad_norm": 5.40625, "grad_norm_var": 0.07314046223958333, "learning_rate": 0.0001, "loss": 6.0156, "loss/crossentropy": 2.81664901971817, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17106664553284645, "step": 13494 }, { "epoch": 0.6134545454545455, "grad_norm": 4.90625, "grad_norm_var": 0.07537434895833334, "learning_rate": 0.0001, "loss": 5.5704, "loss/crossentropy": 2.5862680673599243, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1488063856959343, "step": 13496 }, { "epoch": 0.6135454545454545, "grad_norm": 4.9375, "grad_norm_var": 0.07459309895833334, "learning_rate": 0.0001, "loss": 5.6993, "loss/crossentropy": 2.6370437145233154, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15993314608931541, "step": 13498 }, { "epoch": 0.6136363636363636, "grad_norm": 4.40625, "grad_norm_var": 0.08414306640625, "learning_rate": 0.0001, "loss": 5.2923, "loss/crossentropy": 2.332246243953705, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1463930793106556, "step": 13500 }, { "epoch": 0.6137272727272727, "grad_norm": 5.0625, "grad_norm_var": 0.08391520182291666, "learning_rate": 0.0001, "loss": 5.7199, "loss/crossentropy": 2.5336705446243286, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16979826241731644, "step": 13502 }, { "epoch": 0.6138181818181818, "grad_norm": 4.90625, "grad_norm_var": 0.06770833333333333, "learning_rate": 0.0001, "loss": 5.2976, "loss/crossentropy": 2.370426058769226, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.14760393276810646, "step": 13504 }, { "epoch": 0.613909090909091, "grad_norm": 4.78125, "grad_norm_var": 0.061328125, "learning_rate": 0.0001, "loss": 5.8703, "loss/crossentropy": 2.630436599254608, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17632558941841125, "step": 13506 }, { "epoch": 0.614, "grad_norm": 4.34375, "grad_norm_var": 0.07694905598958333, "learning_rate": 0.0001, "loss": 5.1811, "loss/crossentropy": 2.284262776374817, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.1445678435266018, "step": 13508 }, { "epoch": 0.6140909090909091, "grad_norm": 4.75, "grad_norm_var": 0.06015218098958333, "learning_rate": 0.0001, "loss": 5.7152, "loss/crossentropy": 2.62159925699234, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.16307099536061287, "step": 13510 }, { "epoch": 0.6141818181818182, "grad_norm": 4.9375, "grad_norm_var": 0.058492024739583336, "learning_rate": 0.0001, "loss": 5.3962, "loss/crossentropy": 2.440258353948593, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.14950529485940933, "step": 13512 }, { "epoch": 0.6142727272727273, "grad_norm": 5.28125, "grad_norm_var": 0.07662353515625, "learning_rate": 0.0001, "loss": 5.8085, "loss/crossentropy": 2.608160436153412, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17120660468935966, "step": 13514 }, { "epoch": 0.6143636363636363, "grad_norm": 5.15625, "grad_norm_var": 0.064453125, "learning_rate": 0.0001, "loss": 5.5753, "loss/crossentropy": 2.4889637231826782, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16058924421668053, "step": 13516 }, { "epoch": 0.6144545454545455, "grad_norm": 5.4375, "grad_norm_var": 0.07496337890625, "learning_rate": 0.0001, "loss": 5.7359, "loss/crossentropy": 2.596680223941803, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16528761386871338, "step": 13518 }, { "epoch": 0.6145454545454545, "grad_norm": 4.6875, "grad_norm_var": 0.07906494140625, "learning_rate": 0.0001, "loss": 5.5598, "loss/crossentropy": 2.433913767337799, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16317866742610931, "step": 13520 }, { "epoch": 0.6146363636363636, "grad_norm": 4.6875, "grad_norm_var": 0.07877197265625, "learning_rate": 0.0001, "loss": 5.4015, "loss/crossentropy": 2.3707969784736633, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15560739487409592, "step": 13522 }, { "epoch": 0.6147272727272727, "grad_norm": 4.78125, "grad_norm_var": 0.06461181640625, "learning_rate": 0.0001, "loss": 5.1903, "loss/crossentropy": 2.260016143321991, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.14752408862113953, "step": 13524 }, { "epoch": 0.6148181818181818, "grad_norm": 4.625, "grad_norm_var": 0.10227457682291667, "learning_rate": 0.0001, "loss": 5.3117, "loss/crossentropy": 2.315606951713562, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1527356617152691, "step": 13526 }, { "epoch": 0.614909090909091, "grad_norm": 4.125, "grad_norm_var": 0.14451497395833332, "learning_rate": 0.0001, "loss": 4.9323, "loss/crossentropy": 2.0726427733898163, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.13967311941087246, "step": 13528 }, { "epoch": 0.615, "grad_norm": 4.96875, "grad_norm_var": 0.12903238932291666, "learning_rate": 0.0001, "loss": 5.7416, "loss/crossentropy": 2.6263691782951355, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1623004786670208, "step": 13530 }, { "epoch": 0.6150909090909091, "grad_norm": 4.9375, "grad_norm_var": 0.12590738932291667, "learning_rate": 0.0001, "loss": 5.7549, "loss/crossentropy": 2.6344215273857117, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16517051309347153, "step": 13532 }, { "epoch": 0.6151818181818182, "grad_norm": 4.65625, "grad_norm_var": 0.10816650390625, "learning_rate": 0.0001, "loss": 5.2464, "loss/crossentropy": 2.2749089896678925, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.14910036697983742, "step": 13534 }, { "epoch": 0.6152727272727273, "grad_norm": 4.9375, "grad_norm_var": 0.10836181640625, "learning_rate": 0.0001, "loss": 5.3439, "loss/crossentropy": 2.340679258108139, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15325133502483368, "step": 13536 }, { "epoch": 0.6153636363636363, "grad_norm": 4.40625, "grad_norm_var": 0.16200764973958334, "learning_rate": 0.0001, "loss": 5.604, "loss/crossentropy": 2.566130518913269, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.15788887068629265, "step": 13538 }, { "epoch": 0.6154545454545455, "grad_norm": 5.34375, "grad_norm_var": 0.17923177083333333, "learning_rate": 0.0001, "loss": 5.9672, "loss/crossentropy": 2.7252528071403503, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17732419818639755, "step": 13540 }, { "epoch": 0.6155454545454545, "grad_norm": 4.53125, "grad_norm_var": 0.15266927083333334, "learning_rate": 0.0001, "loss": 4.9569, "loss/crossentropy": 2.1016883552074432, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.13981429487466812, "step": 13542 }, { "epoch": 0.6156363636363636, "grad_norm": 4.6875, "grad_norm_var": 0.10940348307291667, "learning_rate": 0.0001, "loss": 5.9734, "loss/crossentropy": 2.7957730889320374, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1705019548535347, "step": 13544 }, { "epoch": 0.6157272727272727, "grad_norm": 4.53125, "grad_norm_var": 0.1333984375, "learning_rate": 0.0001, "loss": 5.4946, "loss/crossentropy": 2.491583228111267, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15459440648555756, "step": 13546 }, { "epoch": 0.6158181818181818, "grad_norm": 5.0625, "grad_norm_var": 0.154931640625, "learning_rate": 0.0001, "loss": 5.4277, "loss/crossentropy": 2.3726036846637726, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15472977235913277, "step": 13548 }, { "epoch": 0.615909090909091, "grad_norm": 5.25, "grad_norm_var": 0.15362955729166666, "learning_rate": 0.0001, "loss": 5.7139, "loss/crossentropy": 2.5399115085601807, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1674036867916584, "step": 13550 }, { "epoch": 0.616, "grad_norm": 5.125, "grad_norm_var": 0.16304931640625, "learning_rate": 0.0001, "loss": 5.5663, "loss/crossentropy": 2.5405367612838745, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15609240904450417, "step": 13552 }, { "epoch": 0.6160909090909091, "grad_norm": 5.375, "grad_norm_var": 0.237353515625, "learning_rate": 0.0001, "loss": 5.7572, "loss/crossentropy": 2.549697697162628, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.17250507324934006, "step": 13554 }, { "epoch": 0.6161818181818182, "grad_norm": 4.53125, "grad_norm_var": 0.23411051432291666, "learning_rate": 0.0001, "loss": 5.4317, "loss/crossentropy": 2.3964259028434753, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15411678701639175, "step": 13556 }, { "epoch": 0.6162727272727273, "grad_norm": 5.75, "grad_norm_var": 2.346337890625, "learning_rate": 0.0001, "loss": 5.803, "loss/crossentropy": 2.506345510482788, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.17830168083310127, "step": 13558 }, { "epoch": 0.6163636363636363, "grad_norm": 5.09375, "grad_norm_var": 2.3347493489583333, "learning_rate": 0.0001, "loss": 5.1142, "loss/crossentropy": 2.1237070560455322, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15100491046905518, "step": 13560 }, { "epoch": 0.6164545454545455, "grad_norm": 5.125, "grad_norm_var": 2.2462890625, "learning_rate": 0.0001, "loss": 5.5272, "loss/crossentropy": 2.4637849032878876, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15829958021640778, "step": 13562 }, { "epoch": 0.6165454545454545, "grad_norm": 5.6875, "grad_norm_var": 2.2390625, "learning_rate": 0.0001, "loss": 5.7794, "loss/crossentropy": 2.5610952377319336, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1716320998966694, "step": 13564 }, { "epoch": 0.6166363636363636, "grad_norm": 4.9375, "grad_norm_var": 2.2431640625, "learning_rate": 0.0001, "loss": 5.4877, "loss/crossentropy": 2.4639358520507812, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1543276160955429, "step": 13566 }, { "epoch": 0.6167272727272727, "grad_norm": 4.59375, "grad_norm_var": 2.237044270833333, "learning_rate": 0.0001, "loss": 5.6094, "loss/crossentropy": 2.498253673315048, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16208918765187263, "step": 13568 }, { "epoch": 0.6168181818181818, "grad_norm": 5.4375, "grad_norm_var": 2.2370402018229165, "learning_rate": 0.0001, "loss": 5.4574, "loss/crossentropy": 2.4299910068511963, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15333160012960434, "step": 13570 }, { "epoch": 0.616909090909091, "grad_norm": 5.46875, "grad_norm_var": 2.1838175455729165, "learning_rate": 0.0001, "loss": 5.7573, "loss/crossentropy": 2.641711175441742, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1625399999320507, "step": 13572 }, { "epoch": 0.617, "grad_norm": 4.8125, "grad_norm_var": 0.10240885416666666, "learning_rate": 0.0001, "loss": 5.5455, "loss/crossentropy": 2.4958184361457825, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16004882752895355, "step": 13574 }, { "epoch": 0.6170909090909091, "grad_norm": 4.875, "grad_norm_var": 0.108203125, "learning_rate": 0.0001, "loss": 5.7835, "loss/crossentropy": 2.599367320537567, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.17095716297626495, "step": 13576 }, { "epoch": 0.6171818181818182, "grad_norm": 4.65625, "grad_norm_var": 0.11754150390625, "learning_rate": 0.0001, "loss": 5.5076, "loss/crossentropy": 2.4512023329734802, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15700485557317734, "step": 13578 }, { "epoch": 0.6172727272727273, "grad_norm": 4.875, "grad_norm_var": 0.08931884765625, "learning_rate": 0.0001, "loss": 5.6084, "loss/crossentropy": 2.489917576313019, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16379958391189575, "step": 13580 }, { "epoch": 0.6173636363636363, "grad_norm": 4.46875, "grad_norm_var": 0.108056640625, "learning_rate": 0.0001, "loss": 5.6997, "loss/crossentropy": 2.58688485622406, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16265011578798294, "step": 13582 }, { "epoch": 0.6174545454545455, "grad_norm": 4.4375, "grad_norm_var": 0.115234375, "learning_rate": 0.0001, "loss": 5.186, "loss/crossentropy": 2.2484651505947113, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.14570514857769012, "step": 13584 }, { "epoch": 0.6175454545454545, "grad_norm": 4.53125, "grad_norm_var": 0.103369140625, "learning_rate": 0.0001, "loss": 5.5392, "loss/crossentropy": 2.521575629711151, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15332607179880142, "step": 13586 }, { "epoch": 0.6176363636363637, "grad_norm": 8.5625, "grad_norm_var": 0.9571451822916667, "learning_rate": 0.0001, "loss": 5.6285, "loss/crossentropy": 2.537859261035919, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16179462149739265, "step": 13588 }, { "epoch": 0.6177272727272727, "grad_norm": 5.0, "grad_norm_var": 0.9719889322916667, "learning_rate": 0.0001, "loss": 5.6724, "loss/crossentropy": 2.516273260116577, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16385949030518532, "step": 13590 }, { "epoch": 0.6178181818181818, "grad_norm": 4.59375, "grad_norm_var": 0.9843098958333333, "learning_rate": 0.0001, "loss": 5.3532, "loss/crossentropy": 2.4078996777534485, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1478521227836609, "step": 13592 }, { "epoch": 0.617909090909091, "grad_norm": 4.875, "grad_norm_var": 0.9839680989583334, "learning_rate": 0.0001, "loss": 5.6181, "loss/crossentropy": 2.572903275489807, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15530182793736458, "step": 13594 }, { "epoch": 0.618, "grad_norm": 5.9375, "grad_norm_var": 1.03814697265625, "learning_rate": 0.0001, "loss": 6.0335, "loss/crossentropy": 2.7870341539382935, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.17562781646847725, "step": 13596 }, { "epoch": 0.6180909090909091, "grad_norm": 5.53125, "grad_norm_var": 1.022509765625, "learning_rate": 0.0001, "loss": 5.6749, "loss/crossentropy": 2.522008627653122, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16568288207054138, "step": 13598 }, { "epoch": 0.6181818181818182, "grad_norm": 4.375, "grad_norm_var": 1.0288411458333333, "learning_rate": 0.0001, "loss": 4.8127, "loss/crossentropy": 1.9705303013324738, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.13499920070171356, "step": 13600 }, { "epoch": 0.6182727272727273, "grad_norm": 5.03125, "grad_norm_var": 0.979150390625, "learning_rate": 0.0001, "loss": 5.5603, "loss/crossentropy": 2.51270991563797, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1580827608704567, "step": 13602 }, { "epoch": 0.6183636363636363, "grad_norm": 4.9375, "grad_norm_var": 0.18957926432291666, "learning_rate": 0.0001, "loss": 5.7436, "loss/crossentropy": 2.666111946105957, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16087251529097557, "step": 13604 }, { "epoch": 0.6184545454545455, "grad_norm": 4.8125, "grad_norm_var": 0.17164306640625, "learning_rate": 0.0001, "loss": 5.6274, "loss/crossentropy": 2.5875118374824524, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15906419977545738, "step": 13606 }, { "epoch": 0.6185454545454545, "grad_norm": 5.25, "grad_norm_var": 0.158837890625, "learning_rate": 0.0001, "loss": 5.6654, "loss/crossentropy": 2.500241756439209, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16788484156131744, "step": 13608 }, { "epoch": 0.6186363636363637, "grad_norm": 5.21875, "grad_norm_var": 0.16659749348958333, "learning_rate": 0.0001, "loss": 5.7107, "loss/crossentropy": 2.5615034699440002, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16823548078536987, "step": 13610 }, { "epoch": 0.6187272727272727, "grad_norm": 4.84375, "grad_norm_var": 0.09524739583333333, "learning_rate": 0.0001, "loss": 5.2957, "loss/crossentropy": 2.2879849076271057, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15467306226491928, "step": 13612 }, { "epoch": 0.6188181818181818, "grad_norm": 4.875, "grad_norm_var": 0.069384765625, "learning_rate": 0.0001, "loss": 5.6778, "loss/crossentropy": 2.592259109020233, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.16226530447602272, "step": 13614 }, { "epoch": 0.618909090909091, "grad_norm": 5.0625, "grad_norm_var": 0.04508056640625, "learning_rate": 0.0001, "loss": 5.6485, "loss/crossentropy": 2.5437480211257935, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16242657229304314, "step": 13616 }, { "epoch": 0.619, "grad_norm": 5.0, "grad_norm_var": 0.06834309895833333, "learning_rate": 0.0001, "loss": 5.6287, "loss/crossentropy": 2.5325984358787537, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.1598021648824215, "step": 13618 }, { "epoch": 0.6190909090909091, "grad_norm": 5.125, "grad_norm_var": 0.06975504557291666, "learning_rate": 0.0001, "loss": 5.8084, "loss/crossentropy": 2.656123995780945, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16991962119936943, "step": 13620 }, { "epoch": 0.6191818181818182, "grad_norm": 4.96875, "grad_norm_var": 0.05852457682291667, "learning_rate": 0.0001, "loss": 5.6417, "loss/crossentropy": 2.560681641101837, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.16180900484323502, "step": 13622 }, { "epoch": 0.6192727272727273, "grad_norm": 5.34375, "grad_norm_var": 0.07620035807291667, "learning_rate": 0.0001, "loss": 5.7831, "loss/crossentropy": 2.6093443036079407, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16932981833815575, "step": 13624 }, { "epoch": 0.6193636363636363, "grad_norm": 4.40625, "grad_norm_var": 0.10623372395833333, "learning_rate": 0.0001, "loss": 5.3821, "loss/crossentropy": 2.2520551085472107, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1620306894183159, "step": 13626 }, { "epoch": 0.6194545454545455, "grad_norm": 5.75, "grad_norm_var": 0.16302083333333334, "learning_rate": 0.0001, "loss": 4.9578, "loss/crossentropy": 2.04289248585701, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.142859049141407, "step": 13628 }, { "epoch": 0.6195454545454545, "grad_norm": 5.15625, "grad_norm_var": 0.16689046223958334, "learning_rate": 0.0001, "loss": 5.2878, "loss/crossentropy": 2.303801119327545, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.14938055723905563, "step": 13630 }, { "epoch": 0.6196363636363637, "grad_norm": 4.96875, "grad_norm_var": 0.165087890625, "learning_rate": 0.0001, "loss": 5.5118, "loss/crossentropy": 2.4572924375534058, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15818778797984123, "step": 13632 }, { "epoch": 0.6197272727272727, "grad_norm": 5.0625, "grad_norm_var": 0.14674072265625, "learning_rate": 0.0001, "loss": 5.7097, "loss/crossentropy": 2.535703182220459, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16681509837508202, "step": 13634 }, { "epoch": 0.6198181818181818, "grad_norm": 4.71875, "grad_norm_var": 0.14716389973958333, "learning_rate": 0.0001, "loss": 5.6971, "loss/crossentropy": 2.593695819377899, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16464032977819443, "step": 13636 }, { "epoch": 0.6199090909090909, "grad_norm": 5.09375, "grad_norm_var": 0.15319010416666667, "learning_rate": 0.0001, "loss": 5.5671, "loss/crossentropy": 2.4816934466362, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.1614728718996048, "step": 13638 }, { "epoch": 0.62, "grad_norm": 4.90625, "grad_norm_var": 0.13839518229166667, "learning_rate": 0.0001, "loss": 5.8876, "loss/crossentropy": 2.6758915781974792, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.17586078494787216, "step": 13640 }, { "epoch": 0.6200909090909091, "grad_norm": 5.15625, "grad_norm_var": 0.10172119140625, "learning_rate": 0.0001, "loss": 5.398, "loss/crossentropy": 2.351638078689575, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.15405187755823135, "step": 13642 }, { "epoch": 0.6201818181818182, "grad_norm": 5.28125, "grad_norm_var": 0.04412434895833333, "learning_rate": 0.0001, "loss": 5.8676, "loss/crossentropy": 2.670478403568268, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.17029396817088127, "step": 13644 }, { "epoch": 0.6202727272727273, "grad_norm": 4.4375, "grad_norm_var": 0.06864827473958333, "learning_rate": 0.0001, "loss": 5.225, "loss/crossentropy": 2.2856927514076233, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.14783446863293648, "step": 13646 }, { "epoch": 0.6203636363636363, "grad_norm": 4.65625, "grad_norm_var": 0.07277018229166667, "learning_rate": 0.0001, "loss": 5.4556, "loss/crossentropy": 2.4595329761505127, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.1540967896580696, "step": 13648 }, { "epoch": 0.6204545454545455, "grad_norm": 5.9375, "grad_norm_var": 0.14034830729166667, "learning_rate": 0.0001, "loss": 5.5805, "loss/crossentropy": 2.4853187799453735, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16010470688343048, "step": 13650 }, { "epoch": 0.6205454545454545, "grad_norm": 4.71875, "grad_norm_var": 0.14016520182291667, "learning_rate": 0.0001, "loss": 5.5455, "loss/crossentropy": 2.487966537475586, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15575774386525154, "step": 13652 }, { "epoch": 0.6206363636363637, "grad_norm": 4.96875, "grad_norm_var": 0.12831624348958334, "learning_rate": 0.0001, "loss": 5.8481, "loss/crossentropy": 2.689804971218109, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.16836468875408173, "step": 13654 }, { "epoch": 0.6207272727272727, "grad_norm": 5.4375, "grad_norm_var": 0.13733317057291666, "learning_rate": 0.0001, "loss": 5.4893, "loss/crossentropy": 2.4026240706443787, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15944599360227585, "step": 13656 }, { "epoch": 0.6208181818181818, "grad_norm": 4.59375, "grad_norm_var": 0.14166666666666666, "learning_rate": 0.0001, "loss": 5.4209, "loss/crossentropy": 2.449820816516876, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1502346768975258, "step": 13658 }, { "epoch": 0.6209090909090909, "grad_norm": 4.90625, "grad_norm_var": 0.22213541666666667, "learning_rate": 0.0001, "loss": 5.8187, "loss/crossentropy": 2.680106043815613, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1656169258058071, "step": 13660 }, { "epoch": 0.621, "grad_norm": 4.625, "grad_norm_var": 0.19401041666666666, "learning_rate": 0.0001, "loss": 5.5326, "loss/crossentropy": 2.4818212687969208, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15703411772847176, "step": 13662 }, { "epoch": 0.6210909090909091, "grad_norm": 4.8125, "grad_norm_var": 0.20823160807291666, "learning_rate": 0.0001, "loss": 5.3972, "loss/crossentropy": 2.383641690015793, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1546805016696453, "step": 13664 }, { "epoch": 0.6211818181818182, "grad_norm": 4.40625, "grad_norm_var": 0.18722330729166667, "learning_rate": 0.0001, "loss": 5.4473, "loss/crossentropy": 2.448235869407654, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15459777042269707, "step": 13666 }, { "epoch": 0.6212727272727273, "grad_norm": 5.375, "grad_norm_var": 0.19503580729166667, "learning_rate": 0.0001, "loss": 5.881, "loss/crossentropy": 2.7424033880233765, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1656157448887825, "step": 13668 }, { "epoch": 0.6213636363636363, "grad_norm": 4.8125, "grad_norm_var": 0.20089518229166667, "learning_rate": 0.0001, "loss": 5.7952, "loss/crossentropy": 2.619001567363739, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16781343147158623, "step": 13670 }, { "epoch": 0.6214545454545455, "grad_norm": 5.125, "grad_norm_var": 0.19582926432291667, "learning_rate": 0.0001, "loss": 5.6057, "loss/crossentropy": 2.4561498165130615, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16476232931017876, "step": 13672 }, { "epoch": 0.6215454545454545, "grad_norm": 10.3125, "grad_norm_var": 1.935009765625, "learning_rate": 0.0001, "loss": 5.589, "loss/crossentropy": 2.502582013607025, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1584477461874485, "step": 13674 }, { "epoch": 0.6216363636363637, "grad_norm": 8.0625, "grad_norm_var": 2.347847493489583, "learning_rate": 0.0001, "loss": 5.5196, "loss/crossentropy": 2.391885817050934, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16159668564796448, "step": 13676 }, { "epoch": 0.6217272727272727, "grad_norm": 5.375, "grad_norm_var": 2.3184244791666666, "learning_rate": 0.0001, "loss": 5.2477, "loss/crossentropy": 2.23673278093338, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15266156941652298, "step": 13678 }, { "epoch": 0.6218181818181818, "grad_norm": 4.75, "grad_norm_var": 2.304878743489583, "learning_rate": 0.0001, "loss": 5.5882, "loss/crossentropy": 2.5066946148872375, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15913061052560806, "step": 13680 }, { "epoch": 0.6219090909090909, "grad_norm": 5.03125, "grad_norm_var": 2.1972941080729167, "learning_rate": 0.0001, "loss": 5.5565, "loss/crossentropy": 2.4684625267982483, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15958917140960693, "step": 13682 }, { "epoch": 0.622, "grad_norm": 5.0625, "grad_norm_var": 2.26998291015625, "learning_rate": 0.0001, "loss": 5.1387, "loss/crossentropy": 2.2128196358680725, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.14258464239537716, "step": 13684 }, { "epoch": 0.6220909090909091, "grad_norm": 5.46875, "grad_norm_var": 2.2744425455729167, "learning_rate": 0.0001, "loss": 5.62, "loss/crossentropy": 2.4933459758758545, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1618805006146431, "step": 13686 }, { "epoch": 0.6221818181818182, "grad_norm": 5.15625, "grad_norm_var": 2.304520670572917, "learning_rate": 0.0001, "loss": 5.479, "loss/crossentropy": 2.431627333164215, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15903542935848236, "step": 13688 }, { "epoch": 0.6222727272727273, "grad_norm": 5.25, "grad_norm_var": 0.7093709309895834, "learning_rate": 0.0001, "loss": 5.5769, "loss/crossentropy": 2.4907152354717255, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16134875267744064, "step": 13690 }, { "epoch": 0.6223636363636363, "grad_norm": 4.65625, "grad_norm_var": 0.09646809895833333, "learning_rate": 0.0001, "loss": 5.2996, "loss/crossentropy": 2.300475776195526, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15089058503508568, "step": 13692 }, { "epoch": 0.6224545454545455, "grad_norm": 4.875, "grad_norm_var": 0.085400390625, "learning_rate": 0.0001, "loss": 5.4244, "loss/crossentropy": 2.3611140847206116, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15769241005182266, "step": 13694 }, { "epoch": 0.6225454545454545, "grad_norm": 4.90625, "grad_norm_var": 0.08140869140625, "learning_rate": 0.0001, "loss": 5.5667, "loss/crossentropy": 2.4851489067077637, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15951864421367645, "step": 13696 }, { "epoch": 0.6226363636363637, "grad_norm": 4.84375, "grad_norm_var": 0.11236572265625, "learning_rate": 0.0001, "loss": 5.2949, "loss/crossentropy": 2.3964130878448486, "loss/hidden": 1.431640625, "loss/jsd": 0.0, "loss/logits": 0.14668821915984154, "step": 13698 }, { "epoch": 0.6227272727272727, "grad_norm": 4.46875, "grad_norm_var": 0.11256510416666667, "learning_rate": 0.0001, "loss": 5.6901, "loss/crossentropy": 2.6367610692977905, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15669666975736618, "step": 13700 }, { "epoch": 0.6228181818181818, "grad_norm": 4.84375, "grad_norm_var": 0.08448893229166667, "learning_rate": 0.0001, "loss": 5.7234, "loss/crossentropy": 2.564587712287903, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16705166921019554, "step": 13702 }, { "epoch": 0.6229090909090909, "grad_norm": 4.9375, "grad_norm_var": 0.07923177083333334, "learning_rate": 0.0001, "loss": 5.7977, "loss/crossentropy": 2.6735346913337708, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16221743822097778, "step": 13704 }, { "epoch": 0.623, "grad_norm": 5.0, "grad_norm_var": 0.06327718098958333, "learning_rate": 0.0001, "loss": 5.7129, "loss/crossentropy": 2.62945818901062, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1610768511891365, "step": 13706 }, { "epoch": 0.6230909090909091, "grad_norm": 4.65625, "grad_norm_var": 0.06575520833333333, "learning_rate": 0.0001, "loss": 5.5035, "loss/crossentropy": 2.4751113653182983, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.15772157907485962, "step": 13708 }, { "epoch": 0.6231818181818182, "grad_norm": 4.875, "grad_norm_var": 0.05780843098958333, "learning_rate": 0.0001, "loss": 5.9011, "loss/crossentropy": 2.7440091967582703, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16805072873830795, "step": 13710 }, { "epoch": 0.6232727272727273, "grad_norm": 4.96875, "grad_norm_var": 0.055008951822916666, "learning_rate": 0.0001, "loss": 5.2471, "loss/crossentropy": 2.256155550479889, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.14597084186971188, "step": 13712 }, { "epoch": 0.6233636363636363, "grad_norm": 4.875, "grad_norm_var": 0.03326416015625, "learning_rate": 0.0001, "loss": 5.9508, "loss/crossentropy": 2.764966130256653, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.17033912986516953, "step": 13714 }, { "epoch": 0.6234545454545455, "grad_norm": 5.09375, "grad_norm_var": 0.07600504557291667, "learning_rate": 0.0001, "loss": 5.3091, "loss/crossentropy": 2.303363174200058, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15428738296031952, "step": 13716 }, { "epoch": 0.6235454545454545, "grad_norm": 4.875, "grad_norm_var": 0.08840738932291667, "learning_rate": 0.0001, "loss": 5.0995, "loss/crossentropy": 2.161522328853607, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.14477057568728924, "step": 13718 }, { "epoch": 0.6236363636363637, "grad_norm": 4.9375, "grad_norm_var": 0.08752848307291666, "learning_rate": 0.0001, "loss": 5.6982, "loss/crossentropy": 2.5381361842155457, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16600258648395538, "step": 13720 }, { "epoch": 0.6237272727272727, "grad_norm": 5.59375, "grad_norm_var": 0.11796875, "learning_rate": 0.0001, "loss": 5.7482, "loss/crossentropy": 2.6087452173233032, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1655050814151764, "step": 13722 }, { "epoch": 0.6238181818181818, "grad_norm": 5.0625, "grad_norm_var": 0.120166015625, "learning_rate": 0.0001, "loss": 5.8665, "loss/crossentropy": 2.668575704097748, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.17154676467180252, "step": 13724 }, { "epoch": 0.6239090909090909, "grad_norm": 5.0625, "grad_norm_var": 0.12161051432291667, "learning_rate": 0.0001, "loss": 5.5558, "loss/crossentropy": 2.483163833618164, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16117290779948235, "step": 13726 }, { "epoch": 0.624, "grad_norm": 4.59375, "grad_norm_var": 0.12392171223958333, "learning_rate": 0.0001, "loss": 5.6394, "loss/crossentropy": 2.5945608615875244, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15448236837983131, "step": 13728 }, { "epoch": 0.6240909090909091, "grad_norm": 5.125, "grad_norm_var": 0.14589436848958334, "learning_rate": 0.0001, "loss": 5.044, "loss/crossentropy": 2.1192452907562256, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.14462370984256268, "step": 13730 }, { "epoch": 0.6241818181818182, "grad_norm": 4.875, "grad_norm_var": 0.09901936848958333, "learning_rate": 0.0001, "loss": 5.505, "loss/crossentropy": 2.4081311225891113, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1600801944732666, "step": 13732 }, { "epoch": 0.6242727272727273, "grad_norm": 4.90625, "grad_norm_var": 0.09114583333333333, "learning_rate": 0.0001, "loss": 5.3418, "loss/crossentropy": 2.3302783966064453, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15291403234004974, "step": 13734 }, { "epoch": 0.6243636363636363, "grad_norm": 5.40625, "grad_norm_var": 0.1103515625, "learning_rate": 0.0001, "loss": 5.4077, "loss/crossentropy": 2.366298109292984, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15570450760424137, "step": 13736 }, { "epoch": 0.6244545454545455, "grad_norm": 4.75, "grad_norm_var": 0.07447509765625, "learning_rate": 0.0001, "loss": 5.3584, "loss/crossentropy": 2.3462542593479156, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.1557062231004238, "step": 13738 }, { "epoch": 0.6245454545454545, "grad_norm": 4.5625, "grad_norm_var": 0.06951497395833334, "learning_rate": 0.0001, "loss": 5.4147, "loss/crossentropy": 2.3745952248573303, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1547933593392372, "step": 13740 }, { "epoch": 0.6246363636363637, "grad_norm": 5.09375, "grad_norm_var": 0.07131754557291667, "learning_rate": 0.0001, "loss": 5.7623, "loss/crossentropy": 2.6809111833572388, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16145918890833855, "step": 13742 }, { "epoch": 0.6247272727272727, "grad_norm": 4.78125, "grad_norm_var": 0.06721598307291667, "learning_rate": 0.0001, "loss": 5.6959, "loss/crossentropy": 2.5996997356414795, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16293713077902794, "step": 13744 }, { "epoch": 0.6248181818181818, "grad_norm": 4.59375, "grad_norm_var": 0.04950764973958333, "learning_rate": 0.0001, "loss": 5.738, "loss/crossentropy": 2.655427038669586, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.16275037080049515, "step": 13746 }, { "epoch": 0.6249090909090909, "grad_norm": 5.03125, "grad_norm_var": 0.051171875, "learning_rate": 0.0001, "loss": 5.1938, "loss/crossentropy": 2.178255259990692, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1505788043141365, "step": 13748 }, { "epoch": 0.625, "grad_norm": 4.59375, "grad_norm_var": 0.05227457682291667, "learning_rate": 0.0001, "loss": 5.6163, "loss/crossentropy": 2.5575879216194153, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15821511298418045, "step": 13750 }, { "epoch": 0.6250909090909091, "grad_norm": 5.03125, "grad_norm_var": 0.03683268229166667, "learning_rate": 0.0001, "loss": 5.3694, "loss/crossentropy": 2.334297776222229, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1544872373342514, "step": 13752 }, { "epoch": 0.6251818181818182, "grad_norm": 5.34375, "grad_norm_var": 0.04440104166666667, "learning_rate": 0.0001, "loss": 5.7784, "loss/crossentropy": 2.5816864669322968, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16967444866895676, "step": 13754 }, { "epoch": 0.6252727272727273, "grad_norm": 7.03125, "grad_norm_var": 0.33834635416666664, "learning_rate": 0.0001, "loss": 5.7195, "loss/crossentropy": 2.6021084785461426, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1629137024283409, "step": 13756 }, { "epoch": 0.6253636363636363, "grad_norm": 5.1875, "grad_norm_var": 0.3848795572916667, "learning_rate": 0.0001, "loss": 5.7496, "loss/crossentropy": 2.554400324821472, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1706942394375801, "step": 13758 }, { "epoch": 0.6254545454545455, "grad_norm": 5.125, "grad_norm_var": 0.37037760416666665, "learning_rate": 0.0001, "loss": 5.8257, "loss/crossentropy": 2.634198546409607, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.17052210122346878, "step": 13760 }, { "epoch": 0.6255454545454545, "grad_norm": 4.96875, "grad_norm_var": 0.3522135416666667, "learning_rate": 0.0001, "loss": 5.3763, "loss/crossentropy": 2.2876062989234924, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1602391041815281, "step": 13762 }, { "epoch": 0.6256363636363637, "grad_norm": 4.8125, "grad_norm_var": 0.38824462890625, "learning_rate": 0.0001, "loss": 5.3991, "loss/crossentropy": 2.400828719139099, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.15392785146832466, "step": 13764 }, { "epoch": 0.6257272727272727, "grad_norm": 4.8125, "grad_norm_var": 0.39140625, "learning_rate": 0.0001, "loss": 5.074, "loss/crossentropy": 2.166716128587723, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.14150959253311157, "step": 13766 }, { "epoch": 0.6258181818181818, "grad_norm": 5.71875, "grad_norm_var": 0.41744791666666664, "learning_rate": 0.0001, "loss": 5.7987, "loss/crossentropy": 2.586951971054077, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1721520535647869, "step": 13768 }, { "epoch": 0.6259090909090909, "grad_norm": 5.5, "grad_norm_var": 0.42141520182291664, "learning_rate": 0.0001, "loss": 5.3271, "loss/crossentropy": 2.2904950380325317, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15522154048085213, "step": 13770 }, { "epoch": 0.626, "grad_norm": 5.34375, "grad_norm_var": 0.17180989583333334, "learning_rate": 0.0001, "loss": 5.9325, "loss/crossentropy": 2.7839518785476685, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16661624982953072, "step": 13772 }, { "epoch": 0.6260909090909091, "grad_norm": 5.3125, "grad_norm_var": 0.12766520182291666, "learning_rate": 0.0001, "loss": 5.5925, "loss/crossentropy": 2.5011700987815857, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1597188487648964, "step": 13774 }, { "epoch": 0.6261818181818182, "grad_norm": 5.28125, "grad_norm_var": 0.14449462890625, "learning_rate": 0.0001, "loss": 5.4417, "loss/crossentropy": 2.4230279326438904, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15440216660499573, "step": 13776 }, { "epoch": 0.6262727272727273, "grad_norm": 4.71875, "grad_norm_var": 0.15230712890625, "learning_rate": 0.0001, "loss": 5.0307, "loss/crossentropy": 2.1118745505809784, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.14109744876623154, "step": 13778 }, { "epoch": 0.6263636363636363, "grad_norm": 9.5625, "grad_norm_var": 1.3711588541666666, "learning_rate": 0.0001, "loss": 5.5321, "loss/crossentropy": 2.402574121952057, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1611981876194477, "step": 13780 }, { "epoch": 0.6264545454545455, "grad_norm": 4.90625, "grad_norm_var": 1.3227498372395834, "learning_rate": 0.0001, "loss": 5.7315, "loss/crossentropy": 2.609156519174576, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16164854168891907, "step": 13782 }, { "epoch": 0.6265454545454545, "grad_norm": 5.21875, "grad_norm_var": 1.3020670572916666, "learning_rate": 0.0001, "loss": 5.9328, "loss/crossentropy": 2.747963786125183, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16926047205924988, "step": 13784 }, { "epoch": 0.6266363636363637, "grad_norm": 4.875, "grad_norm_var": 1.34683837890625, "learning_rate": 0.0001, "loss": 5.2767, "loss/crossentropy": 2.332880914211273, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.14594518765807152, "step": 13786 }, { "epoch": 0.6267272727272727, "grad_norm": 4.8125, "grad_norm_var": 1.3715494791666667, "learning_rate": 0.0001, "loss": 5.3936, "loss/crossentropy": 2.3987341821193695, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15319564938545227, "step": 13788 }, { "epoch": 0.6268181818181818, "grad_norm": 4.53125, "grad_norm_var": 1.4026652018229167, "learning_rate": 0.0001, "loss": 5.7021, "loss/crossentropy": 2.593368947505951, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.16575371474027634, "step": 13790 }, { "epoch": 0.6269090909090909, "grad_norm": 4.8125, "grad_norm_var": 1.3882649739583333, "learning_rate": 0.0001, "loss": 5.7185, "loss/crossentropy": 2.6127507090568542, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1648765280842781, "step": 13792 }, { "epoch": 0.627, "grad_norm": 5.125, "grad_norm_var": 1.4334920247395833, "learning_rate": 0.0001, "loss": 5.5069, "loss/crossentropy": 2.4222662448883057, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1522086225450039, "step": 13794 }, { "epoch": 0.6270909090909091, "grad_norm": 4.71875, "grad_norm_var": 0.19706624348958332, "learning_rate": 0.0001, "loss": 5.2391, "loss/crossentropy": 2.291296750307083, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.14653894677758217, "step": 13796 }, { "epoch": 0.6271818181818182, "grad_norm": 5.28125, "grad_norm_var": 0.19765625, "learning_rate": 0.0001, "loss": 5.729, "loss/crossentropy": 2.5916918516159058, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16392650082707405, "step": 13798 }, { "epoch": 0.6272727272727273, "grad_norm": 5.28125, "grad_norm_var": 0.19191080729166668, "learning_rate": 0.0001, "loss": 5.5641, "loss/crossentropy": 2.430114269256592, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16379384323954582, "step": 13800 }, { "epoch": 0.6273636363636363, "grad_norm": 4.96875, "grad_norm_var": 0.18254801432291667, "learning_rate": 0.0001, "loss": 5.4738, "loss/crossentropy": 2.426866203546524, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15390878170728683, "step": 13802 }, { "epoch": 0.6274545454545455, "grad_norm": 5.25, "grad_norm_var": 0.7106770833333333, "learning_rate": 0.0001, "loss": 6.0782, "loss/crossentropy": 2.802955746650696, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1759633533656597, "step": 13804 }, { "epoch": 0.6275454545454545, "grad_norm": 4.96875, "grad_norm_var": 0.7186808268229167, "learning_rate": 0.0001, "loss": 5.2823, "loss/crossentropy": 2.32105353474617, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.14964070171117783, "step": 13806 }, { "epoch": 0.6276363636363637, "grad_norm": 4.90625, "grad_norm_var": 0.7307291666666667, "learning_rate": 0.0001, "loss": 5.852, "loss/crossentropy": 2.695583939552307, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1673986241221428, "step": 13808 }, { "epoch": 0.6277272727272727, "grad_norm": 4.71875, "grad_norm_var": 0.649853515625, "learning_rate": 0.0001, "loss": 5.7085, "loss/crossentropy": 2.6518600583076477, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15918341279029846, "step": 13810 }, { "epoch": 0.6278181818181818, "grad_norm": 5.15625, "grad_norm_var": 0.6753743489583334, "learning_rate": 0.0001, "loss": 5.6184, "loss/crossentropy": 2.523060917854309, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15992547571659088, "step": 13812 }, { "epoch": 0.6279090909090909, "grad_norm": 5.0625, "grad_norm_var": 0.6953084309895833, "learning_rate": 0.0001, "loss": 5.5118, "loss/crossentropy": 2.4672951102256775, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1562097780406475, "step": 13814 }, { "epoch": 0.628, "grad_norm": 5.53125, "grad_norm_var": 0.7288370768229167, "learning_rate": 0.0001, "loss": 5.2173, "loss/crossentropy": 2.2567247450351715, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.1489909179508686, "step": 13816 }, { "epoch": 0.6280909090909091, "grad_norm": 4.75, "grad_norm_var": 0.7392862955729167, "learning_rate": 0.0001, "loss": 5.2568, "loss/crossentropy": 2.311992734670639, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1483913417905569, "step": 13818 }, { "epoch": 0.6281818181818182, "grad_norm": 5.0625, "grad_norm_var": 0.123291015625, "learning_rate": 0.0001, "loss": 5.6394, "loss/crossentropy": 2.514432966709137, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16542880237102509, "step": 13820 }, { "epoch": 0.6282727272727273, "grad_norm": 5.125, "grad_norm_var": 0.14345296223958334, "learning_rate": 0.0001, "loss": 6.0253, "loss/crossentropy": 2.7613102197647095, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.17503434792160988, "step": 13822 }, { "epoch": 0.6283636363636363, "grad_norm": 4.71875, "grad_norm_var": 0.14403889973958334, "learning_rate": 0.0001, "loss": 5.2059, "loss/crossentropy": 2.2781902551651, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1439426690340042, "step": 13824 }, { "epoch": 0.6284545454545455, "grad_norm": 5.3125, "grad_norm_var": 0.133837890625, "learning_rate": 0.0001, "loss": 5.9001, "loss/crossentropy": 2.68224036693573, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1698332391679287, "step": 13826 }, { "epoch": 0.6285454545454545, "grad_norm": 5.15625, "grad_norm_var": 0.10305582682291667, "learning_rate": 0.0001, "loss": 5.4918, "loss/crossentropy": 2.438932776451111, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1580234132707119, "step": 13828 }, { "epoch": 0.6286363636363637, "grad_norm": 4.71875, "grad_norm_var": 0.1251953125, "learning_rate": 0.0001, "loss": 5.0256, "loss/crossentropy": 2.174152821302414, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.1396374050527811, "step": 13830 }, { "epoch": 0.6287272727272727, "grad_norm": 5.28125, "grad_norm_var": 0.10009358723958334, "learning_rate": 0.0001, "loss": 5.7066, "loss/crossentropy": 2.597393751144409, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.16346264630556107, "step": 13832 }, { "epoch": 0.6288181818181818, "grad_norm": 5.15625, "grad_norm_var": 0.10154622395833333, "learning_rate": 0.0001, "loss": 5.6876, "loss/crossentropy": 2.6425670981407166, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15665346011519432, "step": 13834 }, { "epoch": 0.6289090909090909, "grad_norm": 5.15625, "grad_norm_var": 0.10761311848958334, "learning_rate": 0.0001, "loss": 5.3179, "loss/crossentropy": 2.3181647658348083, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15212685242295265, "step": 13836 }, { "epoch": 0.629, "grad_norm": 5.03125, "grad_norm_var": 0.2819295247395833, "learning_rate": 0.0001, "loss": 5.7905, "loss/crossentropy": 2.536300241947174, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.173665851354599, "step": 13838 }, { "epoch": 0.6290909090909091, "grad_norm": 5.09375, "grad_norm_var": 4.135139973958333, "learning_rate": 0.0001, "loss": 5.848, "loss/crossentropy": 2.454160511493683, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.19114339351654053, "step": 13840 }, { "epoch": 0.6291818181818182, "grad_norm": 5.15625, "grad_norm_var": 4.16451416015625, "learning_rate": 0.0001, "loss": 5.4347, "loss/crossentropy": 2.388921856880188, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15750835835933685, "step": 13842 }, { "epoch": 0.6292727272727273, "grad_norm": 4.84375, "grad_norm_var": 4.189306640625, "learning_rate": 0.0001, "loss": 5.4907, "loss/crossentropy": 2.4095123410224915, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15811635181307793, "step": 13844 }, { "epoch": 0.6293636363636363, "grad_norm": 5.03125, "grad_norm_var": 4.071598307291667, "learning_rate": 0.0001, "loss": 5.7752, "loss/crossentropy": 2.594937026500702, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16841840744018555, "step": 13846 }, { "epoch": 0.6294545454545455, "grad_norm": 5.125, "grad_norm_var": 4.052327473958333, "learning_rate": 0.0001, "loss": 5.7552, "loss/crossentropy": 2.6134223341941833, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16710661724209785, "step": 13848 }, { "epoch": 0.6295454545454545, "grad_norm": 4.78125, "grad_norm_var": 4.07906494140625, "learning_rate": 0.0001, "loss": 5.4786, "loss/crossentropy": 2.4181235432624817, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.15487785264849663, "step": 13850 }, { "epoch": 0.6296363636363637, "grad_norm": 4.8125, "grad_norm_var": 4.084175618489583, "learning_rate": 0.0001, "loss": 5.6609, "loss/crossentropy": 2.579773783683777, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.161623764783144, "step": 13852 }, { "epoch": 0.6297272727272727, "grad_norm": 5.34375, "grad_norm_var": 3.993941243489583, "learning_rate": 0.0001, "loss": 5.9938, "loss/crossentropy": 2.770217478275299, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.17450888454914093, "step": 13854 }, { "epoch": 0.6298181818181818, "grad_norm": 4.75, "grad_norm_var": 0.05220947265625, "learning_rate": 0.0001, "loss": 5.8933, "loss/crossentropy": 2.688464343547821, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17399824783205986, "step": 13856 }, { "epoch": 0.6299090909090909, "grad_norm": 4.84375, "grad_norm_var": 0.059859212239583334, "learning_rate": 0.0001, "loss": 5.2602, "loss/crossentropy": 2.3468905687332153, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.14524199441075325, "step": 13858 }, { "epoch": 0.63, "grad_norm": 5.0, "grad_norm_var": 0.0609375, "learning_rate": 0.0001, "loss": 5.543, "loss/crossentropy": 2.4803218841552734, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1572457030415535, "step": 13860 }, { "epoch": 0.6300909090909091, "grad_norm": 5.9375, "grad_norm_var": 0.11802978515625, "learning_rate": 0.0001, "loss": 5.6972, "loss/crossentropy": 2.4336213171482086, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17245449125766754, "step": 13862 }, { "epoch": 0.6301818181818182, "grad_norm": 5.5625, "grad_norm_var": 0.13592122395833334, "learning_rate": 0.0001, "loss": 5.4654, "loss/crossentropy": 2.422806203365326, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.15308533608913422, "step": 13864 }, { "epoch": 0.6302727272727273, "grad_norm": 5.1875, "grad_norm_var": 0.12057291666666667, "learning_rate": 0.0001, "loss": 5.3142, "loss/crossentropy": 2.3081869781017303, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1525505818426609, "step": 13866 }, { "epoch": 0.6303636363636363, "grad_norm": 4.78125, "grad_norm_var": 0.12263997395833333, "learning_rate": 0.0001, "loss": 5.4475, "loss/crossentropy": 2.415437161922455, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15281761065125465, "step": 13868 }, { "epoch": 0.6304545454545455, "grad_norm": 4.71875, "grad_norm_var": 0.13303629557291666, "learning_rate": 0.0001, "loss": 5.4214, "loss/crossentropy": 2.43136066198349, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.15466763824224472, "step": 13870 }, { "epoch": 0.6305454545454545, "grad_norm": 4.9375, "grad_norm_var": 0.11835530598958334, "learning_rate": 0.0001, "loss": 5.6739, "loss/crossentropy": 2.554092586040497, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16530035436153412, "step": 13872 }, { "epoch": 0.6306363636363637, "grad_norm": 4.8125, "grad_norm_var": 0.24576822916666666, "learning_rate": 0.0001, "loss": 5.7749, "loss/crossentropy": 2.6308032870292664, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16714539751410484, "step": 13874 }, { "epoch": 0.6307272727272727, "grad_norm": 4.84375, "grad_norm_var": 0.2530598958333333, "learning_rate": 0.0001, "loss": 5.5542, "loss/crossentropy": 2.4173203110694885, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16544605046510696, "step": 13876 }, { "epoch": 0.6308181818181818, "grad_norm": 4.90625, "grad_norm_var": 0.20579427083333332, "learning_rate": 0.0001, "loss": 5.9005, "loss/crossentropy": 2.7659467458724976, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16638562083244324, "step": 13878 }, { "epoch": 0.6309090909090909, "grad_norm": 4.9375, "grad_norm_var": 0.18756103515625, "learning_rate": 0.0001, "loss": 5.6611, "loss/crossentropy": 2.5294124484062195, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1664857193827629, "step": 13880 }, { "epoch": 0.631, "grad_norm": 4.34375, "grad_norm_var": 0.212353515625, "learning_rate": 0.0001, "loss": 4.989, "loss/crossentropy": 2.1200978755950928, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.139817388728261, "step": 13882 }, { "epoch": 0.631090909090909, "grad_norm": 5.03125, "grad_norm_var": 0.22053629557291668, "learning_rate": 0.0001, "loss": 5.3925, "loss/crossentropy": 2.4119372069835663, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15157292783260345, "step": 13884 }, { "epoch": 0.6311818181818182, "grad_norm": 5.15625, "grad_norm_var": 0.21756184895833333, "learning_rate": 0.0001, "loss": 5.6704, "loss/crossentropy": 2.524015188217163, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16600703448057175, "step": 13886 }, { "epoch": 0.6312727272727273, "grad_norm": 5.0, "grad_norm_var": 0.21438802083333333, "learning_rate": 0.0001, "loss": 5.583, "loss/crossentropy": 2.491507828235626, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16246576979756355, "step": 13888 }, { "epoch": 0.6313636363636363, "grad_norm": 5.03125, "grad_norm_var": 0.06054280598958333, "learning_rate": 0.0001, "loss": 5.8943, "loss/crossentropy": 2.7312594056129456, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1653294377028942, "step": 13890 }, { "epoch": 0.6314545454545455, "grad_norm": 4.5, "grad_norm_var": 0.07548421223958333, "learning_rate": 0.0001, "loss": 5.1495, "loss/crossentropy": 2.2019418478012085, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.14749427512288094, "step": 13892 }, { "epoch": 0.6315454545454545, "grad_norm": 4.59375, "grad_norm_var": 0.074853515625, "learning_rate": 0.0001, "loss": 5.5329, "loss/crossentropy": 2.4698176980018616, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15962433069944382, "step": 13894 }, { "epoch": 0.6316363636363637, "grad_norm": 6.0625, "grad_norm_var": 0.16529541015625, "learning_rate": 0.0001, "loss": 5.8561, "loss/crossentropy": 2.6384132504463196, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1702074073255062, "step": 13896 }, { "epoch": 0.6317272727272727, "grad_norm": 5.0, "grad_norm_var": 0.13967692057291667, "learning_rate": 0.0001, "loss": 5.4069, "loss/crossentropy": 2.362175941467285, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15505509451031685, "step": 13898 }, { "epoch": 0.6318181818181818, "grad_norm": 4.9375, "grad_norm_var": 0.12784830729166666, "learning_rate": 0.0001, "loss": 5.698, "loss/crossentropy": 2.6071027517318726, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1633821725845337, "step": 13900 }, { "epoch": 0.6319090909090909, "grad_norm": 4.53125, "grad_norm_var": 0.14451497395833332, "learning_rate": 0.0001, "loss": 5.6176, "loss/crossentropy": 2.5925057232379913, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.15661460906267166, "step": 13902 }, { "epoch": 0.632, "grad_norm": 4.84375, "grad_norm_var": 0.17118733723958332, "learning_rate": 0.0001, "loss": 5.2044, "loss/crossentropy": 2.331599682569504, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.14216182194650173, "step": 13904 }, { "epoch": 0.632090909090909, "grad_norm": 4.78125, "grad_norm_var": 0.15104166666666666, "learning_rate": 0.0001, "loss": 5.6781, "loss/crossentropy": 2.5969226956367493, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16104752197861671, "step": 13906 }, { "epoch": 0.6321818181818182, "grad_norm": 4.84375, "grad_norm_var": 0.13690999348958333, "learning_rate": 0.0001, "loss": 5.3271, "loss/crossentropy": 2.3269498348236084, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15118947252631187, "step": 13908 }, { "epoch": 0.6322727272727273, "grad_norm": 6.28125, "grad_norm_var": 0.2613118489583333, "learning_rate": 0.0001, "loss": 5.513, "loss/crossentropy": 2.3669484853744507, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16362984851002693, "step": 13910 }, { "epoch": 0.6323636363636363, "grad_norm": 4.3125, "grad_norm_var": 0.19097900390625, "learning_rate": 0.0001, "loss": 5.2647, "loss/crossentropy": 2.295670509338379, "loss/hidden": 1.439453125, "loss/jsd": 0.0, "loss/logits": 0.15295684337615967, "step": 13912 }, { "epoch": 0.6324545454545455, "grad_norm": 5.09375, "grad_norm_var": 0.19455973307291666, "learning_rate": 0.0001, "loss": 5.4189, "loss/crossentropy": 2.354986250400543, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15697333961725235, "step": 13914 }, { "epoch": 0.6325454545454545, "grad_norm": 5.09375, "grad_norm_var": 0.20676676432291666, "learning_rate": 0.0001, "loss": 5.6643, "loss/crossentropy": 2.612419366836548, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15889639034867287, "step": 13916 }, { "epoch": 0.6326363636363637, "grad_norm": 5.3125, "grad_norm_var": 0.23566080729166666, "learning_rate": 0.0001, "loss": 5.7498, "loss/crossentropy": 2.5915122032165527, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16641896218061447, "step": 13918 }, { "epoch": 0.6327272727272727, "grad_norm": 4.375, "grad_norm_var": 0.23443603515625, "learning_rate": 0.0001, "loss": 5.3516, "loss/crossentropy": 2.3650439977645874, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1496340036392212, "step": 13920 }, { "epoch": 0.6328181818181818, "grad_norm": 4.5625, "grad_norm_var": 0.2956339518229167, "learning_rate": 0.0001, "loss": 4.881, "loss/crossentropy": 2.1249808967113495, "loss/hidden": 1.416015625, "loss/jsd": 0.0, "loss/logits": 0.13400067389011383, "step": 13922 }, { "epoch": 0.6329090909090909, "grad_norm": 5.46875, "grad_norm_var": 0.3439412434895833, "learning_rate": 0.0001, "loss": 5.9608, "loss/crossentropy": 2.70715993642807, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.17400041595101357, "step": 13924 }, { "epoch": 0.633, "grad_norm": 4.46875, "grad_norm_var": 0.2555338541666667, "learning_rate": 0.0001, "loss": 5.3959, "loss/crossentropy": 2.3758644461631775, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.15181152150034904, "step": 13926 }, { "epoch": 0.633090909090909, "grad_norm": 4.6875, "grad_norm_var": 0.23235270182291667, "learning_rate": 0.0001, "loss": 5.7445, "loss/crossentropy": 2.636206269264221, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1627870723605156, "step": 13928 }, { "epoch": 0.6331818181818182, "grad_norm": 4.6875, "grad_norm_var": 0.23631184895833332, "learning_rate": 0.0001, "loss": 5.4378, "loss/crossentropy": 2.4255980849266052, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15395845100283623, "step": 13930 }, { "epoch": 0.6332727272727273, "grad_norm": 4.875, "grad_norm_var": 0.23316650390625, "learning_rate": 0.0001, "loss": 6.0279, "loss/crossentropy": 2.775606334209442, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.174841970205307, "step": 13932 }, { "epoch": 0.6333636363636364, "grad_norm": 5.625, "grad_norm_var": 0.23359375, "learning_rate": 0.0001, "loss": 5.605, "loss/crossentropy": 2.5272789001464844, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.1618751361966133, "step": 13934 }, { "epoch": 0.6334545454545455, "grad_norm": 5.28125, "grad_norm_var": 0.3594889322916667, "learning_rate": 0.0001, "loss": 5.6387, "loss/crossentropy": 2.4590471386909485, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1640564538538456, "step": 13936 }, { "epoch": 0.6335454545454545, "grad_norm": 4.90625, "grad_norm_var": 0.25852864583333335, "learning_rate": 0.0001, "loss": 6.0749, "loss/crossentropy": 2.842562258243561, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17245712503790855, "step": 13938 }, { "epoch": 0.6336363636363637, "grad_norm": 4.6875, "grad_norm_var": 0.26002197265625, "learning_rate": 0.0001, "loss": 5.6939, "loss/crossentropy": 2.5877389907836914, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.164134431630373, "step": 13940 }, { "epoch": 0.6337272727272727, "grad_norm": 5.03125, "grad_norm_var": 0.2330078125, "learning_rate": 0.0001, "loss": 5.4536, "loss/crossentropy": 2.4143625497817993, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1548989824950695, "step": 13942 }, { "epoch": 0.6338181818181818, "grad_norm": 5.125, "grad_norm_var": 0.22545572916666667, "learning_rate": 0.0001, "loss": 5.339, "loss/crossentropy": 2.3294999599456787, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15134247392416, "step": 13944 }, { "epoch": 0.6339090909090909, "grad_norm": 5.28125, "grad_norm_var": 0.21482747395833332, "learning_rate": 0.0001, "loss": 5.6828, "loss/crossentropy": 2.5378116369247437, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16508269682526588, "step": 13946 }, { "epoch": 0.634, "grad_norm": 4.625, "grad_norm_var": 0.23804931640625, "learning_rate": 0.0001, "loss": 5.551, "loss/crossentropy": 2.4882892966270447, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.1599823236465454, "step": 13948 }, { "epoch": 0.634090909090909, "grad_norm": 4.71875, "grad_norm_var": 0.23310139973958333, "learning_rate": 0.0001, "loss": 5.1334, "loss/crossentropy": 2.2340022921562195, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.14267435669898987, "step": 13950 }, { "epoch": 0.6341818181818182, "grad_norm": 4.75, "grad_norm_var": 0.051676432291666664, "learning_rate": 0.0001, "loss": 5.3119, "loss/crossentropy": 2.329666703939438, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15135161206126213, "step": 13952 }, { "epoch": 0.6342727272727273, "grad_norm": 4.625, "grad_norm_var": 0.06480712890625, "learning_rate": 0.0001, "loss": 5.5807, "loss/crossentropy": 2.493431031703949, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.16321707889437675, "step": 13954 }, { "epoch": 0.6343636363636364, "grad_norm": 4.5625, "grad_norm_var": 0.07235921223958333, "learning_rate": 0.0001, "loss": 5.3323, "loss/crossentropy": 2.4275718331336975, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1451611965894699, "step": 13956 }, { "epoch": 0.6344545454545455, "grad_norm": 5.09375, "grad_norm_var": 0.07459309895833334, "learning_rate": 0.0001, "loss": 5.5672, "loss/crossentropy": 2.524083912372589, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15548516809940338, "step": 13958 }, { "epoch": 0.6345454545454545, "grad_norm": 4.9375, "grad_norm_var": 0.070703125, "learning_rate": 0.0001, "loss": 5.5643, "loss/crossentropy": 2.4934926629066467, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15962206944823265, "step": 13960 }, { "epoch": 0.6346363636363637, "grad_norm": 5.125, "grad_norm_var": 0.06864827473958333, "learning_rate": 0.0001, "loss": 5.8722, "loss/crossentropy": 2.635405957698822, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1740693561732769, "step": 13962 }, { "epoch": 0.6347272727272727, "grad_norm": 4.3125, "grad_norm_var": 0.08175455729166667, "learning_rate": 0.0001, "loss": 5.0472, "loss/crossentropy": 2.171990752220154, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.14044667780399323, "step": 13964 }, { "epoch": 0.6348181818181818, "grad_norm": 4.96875, "grad_norm_var": 0.08111572265625, "learning_rate": 0.0001, "loss": 5.858, "loss/crossentropy": 2.6597747206687927, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16963090375065804, "step": 13966 }, { "epoch": 0.6349090909090909, "grad_norm": 4.25, "grad_norm_var": 0.1060546875, "learning_rate": 0.0001, "loss": 5.0396, "loss/crossentropy": 2.1831861436367035, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1407222654670477, "step": 13968 }, { "epoch": 0.635, "grad_norm": 4.6875, "grad_norm_var": 0.075244140625, "learning_rate": 0.0001, "loss": 5.5654, "loss/crossentropy": 2.5059346556663513, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15946603938937187, "step": 13970 }, { "epoch": 0.635090909090909, "grad_norm": 4.46875, "grad_norm_var": 0.07867431640625, "learning_rate": 0.0001, "loss": 5.2741, "loss/crossentropy": 2.3195536732673645, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15248994156718254, "step": 13972 }, { "epoch": 0.6351818181818182, "grad_norm": 5.0625, "grad_norm_var": 0.20050455729166666, "learning_rate": 0.0001, "loss": 5.6124, "loss/crossentropy": 2.496102035045624, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16201941668987274, "step": 13974 }, { "epoch": 0.6352727272727273, "grad_norm": 5.03125, "grad_norm_var": 0.25123291015625, "learning_rate": 0.0001, "loss": 5.4485, "loss/crossentropy": 2.3605761528015137, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15977340936660767, "step": 13976 }, { "epoch": 0.6353636363636364, "grad_norm": 4.59375, "grad_norm_var": 0.25271809895833336, "learning_rate": 0.0001, "loss": 5.236, "loss/crossentropy": 2.329701066017151, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.14453547075390816, "step": 13978 }, { "epoch": 0.6354545454545455, "grad_norm": 4.9375, "grad_norm_var": 0.22805989583333333, "learning_rate": 0.0001, "loss": 5.574, "loss/crossentropy": 2.4852933287620544, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15926172584295273, "step": 13980 }, { "epoch": 0.6355454545454545, "grad_norm": 5.84375, "grad_norm_var": 0.3025390625, "learning_rate": 0.0001, "loss": 5.1522, "loss/crossentropy": 2.165558636188507, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.14690501615405083, "step": 13982 }, { "epoch": 0.6356363636363637, "grad_norm": 4.96875, "grad_norm_var": 0.2591145833333333, "learning_rate": 0.0001, "loss": 5.682, "loss/crossentropy": 2.5594189167022705, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1628413125872612, "step": 13984 }, { "epoch": 0.6357272727272727, "grad_norm": 4.65625, "grad_norm_var": 0.272509765625, "learning_rate": 0.0001, "loss": 5.3513, "loss/crossentropy": 2.3719271421432495, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15262045711278915, "step": 13986 }, { "epoch": 0.6358181818181818, "grad_norm": 4.8125, "grad_norm_var": 0.24257405598958334, "learning_rate": 0.0001, "loss": 5.8077, "loss/crossentropy": 2.6855915784835815, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1633841060101986, "step": 13988 }, { "epoch": 0.6359090909090909, "grad_norm": 4.96875, "grad_norm_var": 0.15467122395833333, "learning_rate": 0.0001, "loss": 5.6052, "loss/crossentropy": 2.492613673210144, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16223231330513954, "step": 13990 }, { "epoch": 0.636, "grad_norm": 4.78125, "grad_norm_var": 0.10871988932291667, "learning_rate": 0.0001, "loss": 5.0734, "loss/crossentropy": 2.187904268503189, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.14421211183071136, "step": 13992 }, { "epoch": 0.636090909090909, "grad_norm": 4.65625, "grad_norm_var": 0.10585530598958333, "learning_rate": 0.0001, "loss": 5.533, "loss/crossentropy": 2.5028680562973022, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15554944798350334, "step": 13994 }, { "epoch": 0.6361818181818182, "grad_norm": 5.1875, "grad_norm_var": 0.11964518229166667, "learning_rate": 0.0001, "loss": 5.3062, "loss/crossentropy": 2.3213813304901123, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15140918269753456, "step": 13996 }, { "epoch": 0.6362727272727273, "grad_norm": 4.96875, "grad_norm_var": 0.04110921223958333, "learning_rate": 0.0001, "loss": 5.8394, "loss/crossentropy": 2.7100530862808228, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.1666504144668579, "step": 13998 }, { "epoch": 0.6363636363636364, "grad_norm": 4.65625, "grad_norm_var": 0.045699055989583334, "learning_rate": 0.0001, "loss": 5.4535, "loss/crossentropy": 2.437196433544159, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15514655783772469, "step": 14000 }, { "epoch": 0.6364545454545455, "grad_norm": 4.5, "grad_norm_var": 0.052632649739583336, "learning_rate": 0.0001, "loss": 5.4446, "loss/crossentropy": 2.478797972202301, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.15185842663049698, "step": 14002 }, { "epoch": 0.6365454545454545, "grad_norm": 4.59375, "grad_norm_var": 0.05618489583333333, "learning_rate": 0.0001, "loss": 5.2489, "loss/crossentropy": 2.301930248737335, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.14762281253933907, "step": 14004 }, { "epoch": 0.6366363636363637, "grad_norm": 5.375, "grad_norm_var": 0.07291666666666667, "learning_rate": 0.0001, "loss": 5.751, "loss/crossentropy": 2.546574115753174, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1700490340590477, "step": 14006 }, { "epoch": 0.6367272727272727, "grad_norm": 4.65625, "grad_norm_var": 0.07857666015625, "learning_rate": 0.0001, "loss": 5.3347, "loss/crossentropy": 2.352039098739624, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15119142085313797, "step": 14008 }, { "epoch": 0.6368181818181818, "grad_norm": 5.1875, "grad_norm_var": 0.14505208333333333, "learning_rate": 0.0001, "loss": 5.8758, "loss/crossentropy": 2.5909829139709473, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17633122578263283, "step": 14010 }, { "epoch": 0.6369090909090909, "grad_norm": 4.96875, "grad_norm_var": 0.130322265625, "learning_rate": 0.0001, "loss": 5.5798, "loss/crossentropy": 2.51230251789093, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15870276466012, "step": 14012 }, { "epoch": 0.637, "grad_norm": 4.96875, "grad_norm_var": 0.132421875, "learning_rate": 0.0001, "loss": 5.6156, "loss/crossentropy": 2.4643360674381256, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1647370457649231, "step": 14014 }, { "epoch": 0.637090909090909, "grad_norm": 5.84375, "grad_norm_var": 0.9667317708333333, "learning_rate": 0.0001, "loss": 5.6748, "loss/crossentropy": 2.508821487426758, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16484343633055687, "step": 14016 }, { "epoch": 0.6371818181818182, "grad_norm": 4.6875, "grad_norm_var": 0.9348795572916667, "learning_rate": 0.0001, "loss": 5.2038, "loss/crossentropy": 2.241334408521652, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.149366594851017, "step": 14018 }, { "epoch": 0.6372727272727273, "grad_norm": 4.4375, "grad_norm_var": 0.9407389322916667, "learning_rate": 0.0001, "loss": 5.3001, "loss/crossentropy": 2.3194613456726074, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.14962775260210037, "step": 14020 }, { "epoch": 0.6373636363636364, "grad_norm": 4.9375, "grad_norm_var": 0.9402994791666667, "learning_rate": 0.0001, "loss": 5.9708, "loss/crossentropy": 2.7690637707710266, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1709522306919098, "step": 14022 }, { "epoch": 0.6374545454545455, "grad_norm": 4.71875, "grad_norm_var": 0.9083943684895833, "learning_rate": 0.0001, "loss": 5.7656, "loss/crossentropy": 2.5997495651245117, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1681448370218277, "step": 14024 }, { "epoch": 0.6375454545454545, "grad_norm": 5.34375, "grad_norm_var": 0.88365478515625, "learning_rate": 0.0001, "loss": 5.4831, "loss/crossentropy": 2.431674510240555, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1557306982576847, "step": 14026 }, { "epoch": 0.6376363636363637, "grad_norm": 4.25, "grad_norm_var": 0.9461873372395834, "learning_rate": 0.0001, "loss": 5.1218, "loss/crossentropy": 2.2200260758399963, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.14369201846420765, "step": 14028 }, { "epoch": 0.6377272727272727, "grad_norm": 4.6875, "grad_norm_var": 0.9473592122395833, "learning_rate": 0.0001, "loss": 5.4838, "loss/crossentropy": 2.403561532497406, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15958283841609955, "step": 14030 }, { "epoch": 0.6378181818181818, "grad_norm": 4.84375, "grad_norm_var": 0.12732747395833333, "learning_rate": 0.0001, "loss": 5.5543, "loss/crossentropy": 2.4824615716934204, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1605086736381054, "step": 14032 }, { "epoch": 0.6379090909090909, "grad_norm": 4.75, "grad_norm_var": 0.13860677083333334, "learning_rate": 0.0001, "loss": 5.3536, "loss/crossentropy": 2.337603986263275, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15433797612786293, "step": 14034 }, { "epoch": 0.638, "grad_norm": 4.625, "grad_norm_var": 0.13865559895833332, "learning_rate": 0.0001, "loss": 5.7267, "loss/crossentropy": 2.648826777935028, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.16228017956018448, "step": 14036 }, { "epoch": 0.638090909090909, "grad_norm": 4.90625, "grad_norm_var": 0.12314046223958333, "learning_rate": 0.0001, "loss": 5.4601, "loss/crossentropy": 2.4381436109542847, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15317116305232048, "step": 14038 }, { "epoch": 0.6381818181818182, "grad_norm": 4.78125, "grad_norm_var": 0.09309488932291667, "learning_rate": 0.0001, "loss": 5.7206, "loss/crossentropy": 2.6311462819576263, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.16382605582475662, "step": 14040 }, { "epoch": 0.6382727272727273, "grad_norm": 4.90625, "grad_norm_var": 0.07509358723958333, "learning_rate": 0.0001, "loss": 5.825, "loss/crossentropy": 2.6947160959243774, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16303111240267754, "step": 14042 }, { "epoch": 0.6383636363636364, "grad_norm": 4.8125, "grad_norm_var": 0.05797119140625, "learning_rate": 0.0001, "loss": 5.6742, "loss/crossentropy": 2.5952455401420593, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16180380806326866, "step": 14044 }, { "epoch": 0.6384545454545455, "grad_norm": 4.65625, "grad_norm_var": 0.05683186848958333, "learning_rate": 0.0001, "loss": 5.2506, "loss/crossentropy": 2.2751267552375793, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15047211945056915, "step": 14046 }, { "epoch": 0.6385454545454545, "grad_norm": 4.6875, "grad_norm_var": 0.055322265625, "learning_rate": 0.0001, "loss": 5.2542, "loss/crossentropy": 2.2862476110458374, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.14601044729351997, "step": 14048 }, { "epoch": 0.6386363636363637, "grad_norm": 5.0, "grad_norm_var": 0.06015625, "learning_rate": 0.0001, "loss": 5.7723, "loss/crossentropy": 2.588358700275421, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.166445791721344, "step": 14050 }, { "epoch": 0.6387272727272727, "grad_norm": 4.59375, "grad_norm_var": 0.05494384765625, "learning_rate": 0.0001, "loss": 5.4861, "loss/crossentropy": 2.4418524503707886, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1565733216702938, "step": 14052 }, { "epoch": 0.6388181818181818, "grad_norm": 4.75, "grad_norm_var": 0.054931640625, "learning_rate": 0.0001, "loss": 5.3759, "loss/crossentropy": 2.299728274345398, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.15742067620158195, "step": 14054 }, { "epoch": 0.6389090909090909, "grad_norm": 10.5, "grad_norm_var": 2.036942545572917, "learning_rate": 0.0001, "loss": 5.4498, "loss/crossentropy": 2.39930123090744, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.1575850248336792, "step": 14056 }, { "epoch": 0.639, "grad_norm": 5.46875, "grad_norm_var": 2.032421875, "learning_rate": 0.0001, "loss": 5.5361, "loss/crossentropy": 2.4168885350227356, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16074854508042336, "step": 14058 }, { "epoch": 0.639090909090909, "grad_norm": 5.1875, "grad_norm_var": 1.99263916015625, "learning_rate": 0.0001, "loss": 5.5605, "loss/crossentropy": 2.4578709602355957, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16123966872692108, "step": 14060 }, { "epoch": 0.6391818181818182, "grad_norm": 4.9375, "grad_norm_var": 2.01314697265625, "learning_rate": 0.0001, "loss": 5.3005, "loss/crossentropy": 2.3851672410964966, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.14348358288407326, "step": 14062 }, { "epoch": 0.6392727272727273, "grad_norm": 4.875, "grad_norm_var": 2.06314697265625, "learning_rate": 0.0001, "loss": 5.5585, "loss/crossentropy": 2.5246835947036743, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15553292259573936, "step": 14064 }, { "epoch": 0.6393636363636364, "grad_norm": 4.4375, "grad_norm_var": 2.113134765625, "learning_rate": 0.0001, "loss": 5.5765, "loss/crossentropy": 2.574450194835663, "loss/hidden": 1.419921875, "loss/jsd": 0.0, "loss/logits": 0.15821772068738937, "step": 14066 }, { "epoch": 0.6394545454545455, "grad_norm": 5.25, "grad_norm_var": 2.10963134765625, "learning_rate": 0.0001, "loss": 5.5799, "loss/crossentropy": 2.4594247341156006, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16126887872815132, "step": 14068 }, { "epoch": 0.6395454545454545, "grad_norm": 5.0, "grad_norm_var": 2.09967041015625, "learning_rate": 0.0001, "loss": 5.8626, "loss/crossentropy": 2.7205663919448853, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.16674614325165749, "step": 14070 }, { "epoch": 0.6396363636363637, "grad_norm": 4.71875, "grad_norm_var": 0.107421875, "learning_rate": 0.0001, "loss": 5.6542, "loss/crossentropy": 2.55336195230484, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1614464707672596, "step": 14072 }, { "epoch": 0.6397272727272727, "grad_norm": 5.0625, "grad_norm_var": 0.07841389973958333, "learning_rate": 0.0001, "loss": 5.3782, "loss/crossentropy": 2.359157830476761, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1530778668820858, "step": 14074 }, { "epoch": 0.6398181818181818, "grad_norm": 5.03125, "grad_norm_var": 0.07125244140625, "learning_rate": 0.0001, "loss": 5.743, "loss/crossentropy": 2.638343870639801, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1632009856402874, "step": 14076 }, { "epoch": 0.6399090909090909, "grad_norm": 5.3125, "grad_norm_var": 0.083447265625, "learning_rate": 0.0001, "loss": 5.7472, "loss/crossentropy": 2.6389856338500977, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1629740372300148, "step": 14078 }, { "epoch": 0.64, "grad_norm": 4.78125, "grad_norm_var": 0.06718343098958333, "learning_rate": 0.0001, "loss": 5.7811, "loss/crossentropy": 2.6519010066986084, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.166042510420084, "step": 14080 }, { "epoch": 0.640090909090909, "grad_norm": 4.875, "grad_norm_var": 0.054036458333333336, "learning_rate": 0.0001, "loss": 5.8313, "loss/crossentropy": 2.67627215385437, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16784464940428734, "step": 14082 }, { "epoch": 0.6401818181818182, "grad_norm": 5.3125, "grad_norm_var": 0.0568359375, "learning_rate": 0.0001, "loss": 5.3694, "loss/crossentropy": 2.342897355556488, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.14834897592663765, "step": 14084 }, { "epoch": 0.6402727272727273, "grad_norm": 3.96875, "grad_norm_var": 0.11249593098958334, "learning_rate": 0.0001, "loss": 4.9188, "loss/crossentropy": 2.1348380148410797, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1354225929826498, "step": 14086 }, { "epoch": 0.6403636363636364, "grad_norm": 4.84375, "grad_norm_var": 0.11483968098958333, "learning_rate": 0.0001, "loss": 6.2232, "loss/crossentropy": 2.911830484867096, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.18133585900068283, "step": 14088 }, { "epoch": 0.6404545454545455, "grad_norm": 5.03125, "grad_norm_var": 0.13095296223958333, "learning_rate": 0.0001, "loss": 5.5644, "loss/crossentropy": 2.4929592609405518, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15831639617681503, "step": 14090 }, { "epoch": 0.6405454545454545, "grad_norm": 5.1875, "grad_norm_var": 0.13411458333333334, "learning_rate": 0.0001, "loss": 5.8671, "loss/crossentropy": 2.7023789286613464, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1674531176686287, "step": 14092 }, { "epoch": 0.6406363636363637, "grad_norm": 5.09375, "grad_norm_var": 0.12753499348958333, "learning_rate": 0.0001, "loss": 5.7204, "loss/crossentropy": 2.6023258566856384, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16297884285449982, "step": 14094 }, { "epoch": 0.6407272727272727, "grad_norm": 4.78125, "grad_norm_var": 0.12733968098958334, "learning_rate": 0.0001, "loss": 5.564, "loss/crossentropy": 2.4935577511787415, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15958424657583237, "step": 14096 }, { "epoch": 0.6408181818181818, "grad_norm": 5.40625, "grad_norm_var": 0.139306640625, "learning_rate": 0.0001, "loss": 5.8058, "loss/crossentropy": 2.621978998184204, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16994208097457886, "step": 14098 }, { "epoch": 0.6409090909090909, "grad_norm": 5.34375, "grad_norm_var": 0.17952067057291668, "learning_rate": 0.0001, "loss": 5.6196, "loss/crossentropy": 2.420749306678772, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16598280519247055, "step": 14100 }, { "epoch": 0.641, "grad_norm": 4.625, "grad_norm_var": 0.1146484375, "learning_rate": 0.0001, "loss": 5.5149, "loss/crossentropy": 2.4915637969970703, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15526238456368446, "step": 14102 }, { "epoch": 0.641090909090909, "grad_norm": 4.53125, "grad_norm_var": 0.13059895833333332, "learning_rate": 0.0001, "loss": 5.4211, "loss/crossentropy": 2.3863351941108704, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15484268963336945, "step": 14104 }, { "epoch": 0.6411818181818182, "grad_norm": 5.21875, "grad_norm_var": 0.13017171223958332, "learning_rate": 0.0001, "loss": 5.704, "loss/crossentropy": 2.533920407295227, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16798141971230507, "step": 14106 }, { "epoch": 0.6412727272727273, "grad_norm": 5.1875, "grad_norm_var": 0.13791910807291666, "learning_rate": 0.0001, "loss": 5.9009, "loss/crossentropy": 2.684028923511505, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17324917390942574, "step": 14108 }, { "epoch": 0.6413636363636364, "grad_norm": 4.96875, "grad_norm_var": 0.12727457682291668, "learning_rate": 0.0001, "loss": 5.531, "loss/crossentropy": 2.402980327606201, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1651453971862793, "step": 14110 }, { "epoch": 0.6414545454545455, "grad_norm": 6.09375, "grad_norm_var": 0.17808837890625, "learning_rate": 0.0001, "loss": 5.5174, "loss/crossentropy": 2.475772976875305, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15689251571893692, "step": 14112 }, { "epoch": 0.6415454545454545, "grad_norm": 5.0, "grad_norm_var": 0.18756103515625, "learning_rate": 0.0001, "loss": 5.5072, "loss/crossentropy": 2.415327727794647, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15996969118714333, "step": 14114 }, { "epoch": 0.6416363636363637, "grad_norm": 4.65625, "grad_norm_var": 0.16751302083333333, "learning_rate": 0.0001, "loss": 5.3266, "loss/crossentropy": 2.3090521693229675, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15644555911421776, "step": 14116 }, { "epoch": 0.6417272727272727, "grad_norm": 4.59375, "grad_norm_var": 0.17382405598958334, "learning_rate": 0.0001, "loss": 5.2217, "loss/crossentropy": 2.2606255412101746, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.14981364458799362, "step": 14118 }, { "epoch": 0.6418181818181818, "grad_norm": 5.8125, "grad_norm_var": 0.20631103515625, "learning_rate": 0.0001, "loss": 5.5901, "loss/crossentropy": 2.5170169472694397, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1586710512638092, "step": 14120 }, { "epoch": 0.6419090909090909, "grad_norm": 4.65625, "grad_norm_var": 0.203369140625, "learning_rate": 0.0001, "loss": 4.8027, "loss/crossentropy": 1.975707471370697, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.13543026708066463, "step": 14122 }, { "epoch": 0.642, "grad_norm": 4.96875, "grad_norm_var": 0.19840087890625, "learning_rate": 0.0001, "loss": 5.6582, "loss/crossentropy": 2.5082982778549194, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16381487995386124, "step": 14124 }, { "epoch": 0.642090909090909, "grad_norm": 5.03125, "grad_norm_var": 0.28463541666666664, "learning_rate": 0.0001, "loss": 5.4231, "loss/crossentropy": 2.2915040254592896, "loss/hidden": 1.541015625, "loss/jsd": 0.0, "loss/logits": 0.15906236320734024, "step": 14126 }, { "epoch": 0.6421818181818182, "grad_norm": 5.125, "grad_norm_var": 0.216796875, "learning_rate": 0.0001, "loss": 5.7822, "loss/crossentropy": 2.6297618746757507, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16700155287981033, "step": 14128 }, { "epoch": 0.6422727272727272, "grad_norm": 4.8125, "grad_norm_var": 0.220703125, "learning_rate": 0.0001, "loss": 5.7109, "loss/crossentropy": 2.5794067978858948, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16217628866434097, "step": 14130 }, { "epoch": 0.6423636363636364, "grad_norm": 4.8125, "grad_norm_var": 0.20933837890625, "learning_rate": 0.0001, "loss": 5.6865, "loss/crossentropy": 2.622674584388733, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.16127019003033638, "step": 14132 }, { "epoch": 0.6424545454545455, "grad_norm": 4.625, "grad_norm_var": 0.20067952473958334, "learning_rate": 0.0001, "loss": 5.4389, "loss/crossentropy": 2.3972830772399902, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15689337253570557, "step": 14134 }, { "epoch": 0.6425454545454545, "grad_norm": 5.25, "grad_norm_var": 0.21737874348958333, "learning_rate": 0.0001, "loss": 5.4897, "loss/crossentropy": 2.384045362472534, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1619286946952343, "step": 14136 }, { "epoch": 0.6426363636363637, "grad_norm": 3.9375, "grad_norm_var": 0.323046875, "learning_rate": 0.0001, "loss": 4.7693, "loss/crossentropy": 2.040207713842392, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.12739992141723633, "step": 14138 }, { "epoch": 0.6427272727272727, "grad_norm": 5.0, "grad_norm_var": 0.3154296875, "learning_rate": 0.0001, "loss": 5.6271, "loss/crossentropy": 2.554409146308899, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15882931277155876, "step": 14140 }, { "epoch": 0.6428181818181818, "grad_norm": 5.0625, "grad_norm_var": 0.20907796223958333, "learning_rate": 0.0001, "loss": 5.5746, "loss/crossentropy": 2.4424397945404053, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16224274039268494, "step": 14142 }, { "epoch": 0.6429090909090909, "grad_norm": 5.21875, "grad_norm_var": 0.214697265625, "learning_rate": 0.0001, "loss": 5.4993, "loss/crossentropy": 2.414202570915222, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15772362798452377, "step": 14144 }, { "epoch": 0.643, "grad_norm": 6.125, "grad_norm_var": 0.28560791015625, "learning_rate": 0.0001, "loss": 5.4089, "loss/crossentropy": 2.2324831783771515, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16666733846068382, "step": 14146 }, { "epoch": 0.643090909090909, "grad_norm": 4.71875, "grad_norm_var": 0.288671875, "learning_rate": 0.0001, "loss": 5.6606, "loss/crossentropy": 2.554735839366913, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16097895056009293, "step": 14148 }, { "epoch": 0.6431818181818182, "grad_norm": 4.75, "grad_norm_var": 0.28170166015625, "learning_rate": 0.0001, "loss": 5.0075, "loss/crossentropy": 2.05866801738739, "loss/hidden": 1.533203125, "loss/jsd": 0.0, "loss/logits": 0.14156024158000946, "step": 14150 }, { "epoch": 0.6432727272727272, "grad_norm": 5.34375, "grad_norm_var": 0.21897379557291666, "learning_rate": 0.0001, "loss": 5.5603, "loss/crossentropy": 2.5115471482276917, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15623825415968895, "step": 14152 }, { "epoch": 0.6433636363636364, "grad_norm": 5.25, "grad_norm_var": 0.12185872395833333, "learning_rate": 0.0001, "loss": 5.5204, "loss/crossentropy": 2.463449537754059, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15940947085618973, "step": 14154 }, { "epoch": 0.6434545454545455, "grad_norm": 4.71875, "grad_norm_var": 0.13092041015625, "learning_rate": 0.0001, "loss": 5.3121, "loss/crossentropy": 2.3006229996681213, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15251773595809937, "step": 14156 }, { "epoch": 0.6435454545454545, "grad_norm": 5.15625, "grad_norm_var": 0.13183186848958334, "learning_rate": 0.0001, "loss": 5.4798, "loss/crossentropy": 2.421540915966034, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15543607249855995, "step": 14158 }, { "epoch": 0.6436363636363637, "grad_norm": 4.6875, "grad_norm_var": 0.15732014973958333, "learning_rate": 0.0001, "loss": 5.9496, "loss/crossentropy": 2.746300995349884, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.17248155921697617, "step": 14160 }, { "epoch": 0.6437272727272727, "grad_norm": 4.96875, "grad_norm_var": 0.0841796875, "learning_rate": 0.0001, "loss": 5.0841, "loss/crossentropy": 2.219233453273773, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.14156050607562065, "step": 14162 }, { "epoch": 0.6438181818181818, "grad_norm": 5.0, "grad_norm_var": 0.076025390625, "learning_rate": 0.0001, "loss": 5.7318, "loss/crossentropy": 2.5571871995925903, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16882672160863876, "step": 14164 }, { "epoch": 0.6439090909090909, "grad_norm": 5.09375, "grad_norm_var": 0.07317708333333334, "learning_rate": 0.0001, "loss": 5.2919, "loss/crossentropy": 2.242592513561249, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15532608702778816, "step": 14166 }, { "epoch": 0.644, "grad_norm": 7.46875, "grad_norm_var": 0.440087890625, "learning_rate": 0.0001, "loss": 5.9043, "loss/crossentropy": 2.6671549677848816, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1719580888748169, "step": 14168 }, { "epoch": 0.644090909090909, "grad_norm": 5.21875, "grad_norm_var": 0.43664957682291666, "learning_rate": 0.0001, "loss": 5.5909, "loss/crossentropy": 2.4567374885082245, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1665395423769951, "step": 14170 }, { "epoch": 0.6441818181818182, "grad_norm": 4.96875, "grad_norm_var": 0.4298828125, "learning_rate": 0.0001, "loss": 5.7725, "loss/crossentropy": 2.721900165081024, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15740039944648743, "step": 14172 }, { "epoch": 0.6442727272727272, "grad_norm": 5.09375, "grad_norm_var": 0.4371744791666667, "learning_rate": 0.0001, "loss": 5.1202, "loss/crossentropy": 2.2367148995399475, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.14225225523114204, "step": 14174 }, { "epoch": 0.6443636363636364, "grad_norm": 4.78125, "grad_norm_var": 0.42433268229166665, "learning_rate": 0.0001, "loss": 5.4277, "loss/crossentropy": 2.484602391719818, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.14919504895806313, "step": 14176 }, { "epoch": 0.6444545454545455, "grad_norm": 4.625, "grad_norm_var": 0.44661051432291665, "learning_rate": 0.0001, "loss": 5.2258, "loss/crossentropy": 2.3125839233398438, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.14600756391882896, "step": 14178 }, { "epoch": 0.6445454545454545, "grad_norm": 5.125, "grad_norm_var": 0.46588541666666666, "learning_rate": 0.0001, "loss": 5.1682, "loss/crossentropy": 2.162985622882843, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1489606536924839, "step": 14180 }, { "epoch": 0.6446363636363637, "grad_norm": 4.46875, "grad_norm_var": 0.5218709309895834, "learning_rate": 0.0001, "loss": 5.0877, "loss/crossentropy": 2.255551517009735, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.13907012343406677, "step": 14182 }, { "epoch": 0.6447272727272727, "grad_norm": 5.21875, "grad_norm_var": 0.12903238932291666, "learning_rate": 0.0001, "loss": 5.6418, "loss/crossentropy": 2.5240834951400757, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.15904034301638603, "step": 14184 }, { "epoch": 0.6448181818181818, "grad_norm": 4.625, "grad_norm_var": 0.129931640625, "learning_rate": 0.0001, "loss": 5.6806, "loss/crossentropy": 2.5794569849967957, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16226473078131676, "step": 14186 }, { "epoch": 0.6449090909090909, "grad_norm": 5.0625, "grad_norm_var": 0.13469645182291667, "learning_rate": 0.0001, "loss": 5.5973, "loss/crossentropy": 2.5037556290626526, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16228406876325607, "step": 14188 }, { "epoch": 0.645, "grad_norm": 4.96875, "grad_norm_var": 0.13313802083333334, "learning_rate": 0.0001, "loss": 5.8416, "loss/crossentropy": 2.6570287346839905, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16787352412939072, "step": 14190 }, { "epoch": 0.645090909090909, "grad_norm": 5.25, "grad_norm_var": 0.14694010416666667, "learning_rate": 0.0001, "loss": 5.806, "loss/crossentropy": 2.640306055545807, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16793566197156906, "step": 14192 }, { "epoch": 0.6451818181818182, "grad_norm": 4.9375, "grad_norm_var": 0.12089436848958333, "learning_rate": 0.0001, "loss": 5.6933, "loss/crossentropy": 2.622140645980835, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1602361761033535, "step": 14194 }, { "epoch": 0.6452727272727272, "grad_norm": 4.9375, "grad_norm_var": 0.10077718098958334, "learning_rate": 0.0001, "loss": 5.285, "loss/crossentropy": 2.335495173931122, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.14924559369683266, "step": 14196 }, { "epoch": 0.6453636363636364, "grad_norm": 4.71875, "grad_norm_var": 0.09283447265625, "learning_rate": 0.0001, "loss": 5.3292, "loss/crossentropy": 2.3009188175201416, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.1553656868636608, "step": 14198 }, { "epoch": 0.6454545454545455, "grad_norm": 4.96875, "grad_norm_var": 0.08561197916666667, "learning_rate": 0.0001, "loss": 5.9032, "loss/crossentropy": 2.6857473850250244, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.1742844209074974, "step": 14200 }, { "epoch": 0.6455454545454545, "grad_norm": 5.0625, "grad_norm_var": 0.06506754557291666, "learning_rate": 0.0001, "loss": 5.6489, "loss/crossentropy": 2.551121413707733, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16251220181584358, "step": 14202 }, { "epoch": 0.6456363636363637, "grad_norm": 4.9375, "grad_norm_var": 0.0763671875, "learning_rate": 0.0001, "loss": 5.8836, "loss/crossentropy": 2.6620081663131714, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.17235445603728294, "step": 14204 }, { "epoch": 0.6457272727272727, "grad_norm": 4.75, "grad_norm_var": 0.09361979166666666, "learning_rate": 0.0001, "loss": 5.5783, "loss/crossentropy": 2.540731370449066, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15512024611234665, "step": 14206 }, { "epoch": 0.6458181818181818, "grad_norm": 4.9375, "grad_norm_var": 0.089306640625, "learning_rate": 0.0001, "loss": 5.7263, "loss/crossentropy": 2.6361371278762817, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16018756851553917, "step": 14208 }, { "epoch": 0.6459090909090909, "grad_norm": 5.0, "grad_norm_var": 0.08935139973958334, "learning_rate": 0.0001, "loss": 5.5778, "loss/crossentropy": 2.4465640783309937, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16137071698904037, "step": 14210 }, { "epoch": 0.646, "grad_norm": 4.53125, "grad_norm_var": 0.119140625, "learning_rate": 0.0001, "loss": 5.3592, "loss/crossentropy": 2.390940010547638, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15112164989113808, "step": 14212 }, { "epoch": 0.646090909090909, "grad_norm": 4.875, "grad_norm_var": 0.08518473307291667, "learning_rate": 0.0001, "loss": 5.6254, "loss/crossentropy": 2.5198898315429688, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.1607498973608017, "step": 14214 }, { "epoch": 0.6461818181818182, "grad_norm": 4.9375, "grad_norm_var": 0.07968343098958333, "learning_rate": 0.0001, "loss": 5.5235, "loss/crossentropy": 2.477197289466858, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.1595108099281788, "step": 14216 }, { "epoch": 0.6462727272727272, "grad_norm": 4.75, "grad_norm_var": 0.07903238932291666, "learning_rate": 0.0001, "loss": 5.5061, "loss/crossentropy": 2.5260009765625, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1538666971027851, "step": 14218 }, { "epoch": 0.6463636363636364, "grad_norm": 5.21875, "grad_norm_var": 0.06571858723958333, "learning_rate": 0.0001, "loss": 5.865, "loss/crossentropy": 2.685506820678711, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.17087891325354576, "step": 14220 }, { "epoch": 0.6464545454545455, "grad_norm": 5.53125, "grad_norm_var": 0.08437093098958333, "learning_rate": 0.0001, "loss": 5.6385, "loss/crossentropy": 2.507300913333893, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1648782715201378, "step": 14222 }, { "epoch": 0.6465454545454545, "grad_norm": 4.96875, "grad_norm_var": 0.08052978515625, "learning_rate": 0.0001, "loss": 5.5692, "loss/crossentropy": 2.4805822372436523, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16198771446943283, "step": 14224 }, { "epoch": 0.6466363636363637, "grad_norm": 5.03125, "grad_norm_var": 0.08860270182291667, "learning_rate": 0.0001, "loss": 5.2237, "loss/crossentropy": 2.2353450655937195, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.14903374761343002, "step": 14226 }, { "epoch": 0.6467272727272727, "grad_norm": 4.8125, "grad_norm_var": 0.052018229166666666, "learning_rate": 0.0001, "loss": 5.6144, "loss/crossentropy": 2.540363609790802, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16072136163711548, "step": 14228 }, { "epoch": 0.6468181818181818, "grad_norm": 4.625, "grad_norm_var": 0.058577473958333334, "learning_rate": 0.0001, "loss": 5.3978, "loss/crossentropy": 2.4148789644241333, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15064074471592903, "step": 14230 }, { "epoch": 0.6469090909090909, "grad_norm": 4.96875, "grad_norm_var": 0.051106770833333336, "learning_rate": 0.0001, "loss": 5.8096, "loss/crossentropy": 2.706482172012329, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1638236716389656, "step": 14232 }, { "epoch": 0.647, "grad_norm": 5.21875, "grad_norm_var": 0.05810139973958333, "learning_rate": 0.0001, "loss": 5.9137, "loss/crossentropy": 2.762599766254425, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16666821762919426, "step": 14234 }, { "epoch": 0.647090909090909, "grad_norm": 5.03125, "grad_norm_var": 0.05673421223958333, "learning_rate": 0.0001, "loss": 5.3142, "loss/crossentropy": 2.2673343122005463, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15430086106061935, "step": 14236 }, { "epoch": 0.6471818181818182, "grad_norm": 4.625, "grad_norm_var": 0.039306640625, "learning_rate": 0.0001, "loss": 5.6336, "loss/crossentropy": 2.613314628601074, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.15769357979297638, "step": 14238 }, { "epoch": 0.6472727272727272, "grad_norm": 5.0, "grad_norm_var": 0.10362955729166666, "learning_rate": 0.0001, "loss": 5.6532, "loss/crossentropy": 2.6095167994499207, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15593114867806435, "step": 14240 }, { "epoch": 0.6473636363636364, "grad_norm": 5.09375, "grad_norm_var": 0.09830729166666667, "learning_rate": 0.0001, "loss": 5.5342, "loss/crossentropy": 2.4323567152023315, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16331318393349648, "step": 14242 }, { "epoch": 0.6474545454545455, "grad_norm": 5.5625, "grad_norm_var": 0.11803385416666666, "learning_rate": 0.0001, "loss": 5.8929, "loss/crossentropy": 2.72109317779541, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.1673731952905655, "step": 14244 }, { "epoch": 0.6475454545454545, "grad_norm": 6.09375, "grad_norm_var": 0.18409830729166668, "learning_rate": 0.0001, "loss": 5.0625, "loss/crossentropy": 2.1815007030963898, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.1425936110317707, "step": 14246 }, { "epoch": 0.6476363636363637, "grad_norm": 4.84375, "grad_norm_var": 0.183203125, "learning_rate": 0.0001, "loss": 5.2373, "loss/crossentropy": 2.2881979644298553, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1464766301214695, "step": 14248 }, { "epoch": 0.6477272727272727, "grad_norm": 5.59375, "grad_norm_var": 0.18134358723958333, "learning_rate": 0.0001, "loss": 5.6902, "loss/crossentropy": 2.459657847881317, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17188234254717827, "step": 14250 }, { "epoch": 0.6478181818181818, "grad_norm": 4.875, "grad_norm_var": 0.19921875, "learning_rate": 0.0001, "loss": 5.2039, "loss/crossentropy": 2.2518559396266937, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.14832885563373566, "step": 14252 }, { "epoch": 0.6479090909090909, "grad_norm": 4.96875, "grad_norm_var": 0.19023030598958332, "learning_rate": 0.0001, "loss": 5.5501, "loss/crossentropy": 2.511933445930481, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1585010513663292, "step": 14254 }, { "epoch": 0.648, "grad_norm": 4.9375, "grad_norm_var": 0.15012613932291666, "learning_rate": 0.0001, "loss": 5.3357, "loss/crossentropy": 2.297892838716507, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1561284437775612, "step": 14256 }, { "epoch": 0.648090909090909, "grad_norm": 4.96875, "grad_norm_var": 0.15064697265625, "learning_rate": 0.0001, "loss": 5.4731, "loss/crossentropy": 2.450223743915558, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15424147248268127, "step": 14258 }, { "epoch": 0.6481818181818182, "grad_norm": 5.0, "grad_norm_var": 0.131103515625, "learning_rate": 0.0001, "loss": 5.6259, "loss/crossentropy": 2.548853576183319, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.161215677857399, "step": 14260 }, { "epoch": 0.6482727272727272, "grad_norm": 4.6875, "grad_norm_var": 0.05767822265625, "learning_rate": 0.0001, "loss": 5.5097, "loss/crossentropy": 2.4296090602874756, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.1617184914648533, "step": 14262 }, { "epoch": 0.6483636363636364, "grad_norm": 4.75, "grad_norm_var": 0.71676025390625, "learning_rate": 0.0001, "loss": 5.5496, "loss/crossentropy": 2.385088801383972, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16371673718094826, "step": 14264 }, { "epoch": 0.6484545454545455, "grad_norm": 4.6875, "grad_norm_var": 0.7179524739583333, "learning_rate": 0.0001, "loss": 5.4603, "loss/crossentropy": 2.4189411997795105, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15628531202673912, "step": 14266 }, { "epoch": 0.6485454545454545, "grad_norm": 4.6875, "grad_norm_var": 0.7616170247395834, "learning_rate": 0.0001, "loss": 5.1638, "loss/crossentropy": 2.2774382829666138, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.14195699244737625, "step": 14268 }, { "epoch": 0.6486363636363637, "grad_norm": 4.78125, "grad_norm_var": 0.7587076822916666, "learning_rate": 0.0001, "loss": 5.8537, "loss/crossentropy": 2.6535674333572388, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1705964170396328, "step": 14270 }, { "epoch": 0.6487272727272727, "grad_norm": 4.78125, "grad_norm_var": 0.7619425455729166, "learning_rate": 0.0001, "loss": 5.1997, "loss/crossentropy": 2.2677895724773407, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.14573423191905022, "step": 14272 }, { "epoch": 0.6488181818181818, "grad_norm": 5.25, "grad_norm_var": 0.766259765625, "learning_rate": 0.0001, "loss": 5.404, "loss/crossentropy": 2.332013487815857, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.158172145485878, "step": 14274 }, { "epoch": 0.6489090909090909, "grad_norm": 4.46875, "grad_norm_var": 0.783837890625, "learning_rate": 0.0001, "loss": 5.6325, "loss/crossentropy": 2.568618893623352, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.16088049113750458, "step": 14276 }, { "epoch": 0.649, "grad_norm": 5.125, "grad_norm_var": 0.76978759765625, "learning_rate": 0.0001, "loss": 5.9311, "loss/crossentropy": 2.756606876850128, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1697978787124157, "step": 14278 }, { "epoch": 0.649090909090909, "grad_norm": 4.625, "grad_norm_var": 0.07066650390625, "learning_rate": 0.0001, "loss": 5.4159, "loss/crossentropy": 2.423298954963684, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15433703735470772, "step": 14280 }, { "epoch": 0.6491818181818182, "grad_norm": 4.96875, "grad_norm_var": 0.07359619140625, "learning_rate": 0.0001, "loss": 5.6058, "loss/crossentropy": 2.518462359905243, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.16244519501924515, "step": 14282 }, { "epoch": 0.6492727272727272, "grad_norm": 5.0625, "grad_norm_var": 0.043212890625, "learning_rate": 0.0001, "loss": 5.5221, "loss/crossentropy": 2.414330244064331, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.15921639278531075, "step": 14284 }, { "epoch": 0.6493636363636364, "grad_norm": 4.8125, "grad_norm_var": 0.0431640625, "learning_rate": 0.0001, "loss": 5.5183, "loss/crossentropy": 2.498883932828903, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15311096236109734, "step": 14286 }, { "epoch": 0.6494545454545455, "grad_norm": 4.5625, "grad_norm_var": 0.05987955729166667, "learning_rate": 0.0001, "loss": 5.5289, "loss/crossentropy": 2.5428152084350586, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.15427425503730774, "step": 14288 }, { "epoch": 0.6495454545454545, "grad_norm": 5.125, "grad_norm_var": 0.05565999348958333, "learning_rate": 0.0001, "loss": 5.2963, "loss/crossentropy": 2.316330701112747, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15033939853310585, "step": 14290 }, { "epoch": 0.6496363636363637, "grad_norm": 5.375, "grad_norm_var": 0.0546875, "learning_rate": 0.0001, "loss": 5.484, "loss/crossentropy": 2.4260010719299316, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.15560712665319443, "step": 14292 }, { "epoch": 0.6497272727272727, "grad_norm": 4.9375, "grad_norm_var": 0.05703125, "learning_rate": 0.0001, "loss": 5.7875, "loss/crossentropy": 2.649525821208954, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.1639886274933815, "step": 14294 }, { "epoch": 0.6498181818181819, "grad_norm": 4.9375, "grad_norm_var": 0.05987955729166667, "learning_rate": 0.0001, "loss": 5.2091, "loss/crossentropy": 2.2795659601688385, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.14490843936800957, "step": 14296 }, { "epoch": 0.6499090909090909, "grad_norm": 5.1875, "grad_norm_var": 0.06731363932291666, "learning_rate": 0.0001, "loss": 5.7578, "loss/crossentropy": 2.601398229598999, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16583513468503952, "step": 14298 }, { "epoch": 0.65, "grad_norm": 4.875, "grad_norm_var": 0.06467692057291667, "learning_rate": 0.0001, "loss": 5.4694, "loss/crossentropy": 2.4504769444465637, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1546255573630333, "step": 14300 }, { "epoch": 0.650090909090909, "grad_norm": 4.65625, "grad_norm_var": 0.06769205729166666, "learning_rate": 0.0001, "loss": 5.5292, "loss/crossentropy": 2.493425726890564, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1574809066951275, "step": 14302 }, { "epoch": 0.6501818181818182, "grad_norm": 5.5, "grad_norm_var": 0.07336832682291666, "learning_rate": 0.0001, "loss": 5.0443, "loss/crossentropy": 2.153278946876526, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.14184126444160938, "step": 14304 }, { "epoch": 0.6502727272727272, "grad_norm": 4.65625, "grad_norm_var": 0.07745768229166666, "learning_rate": 0.0001, "loss": 5.5465, "loss/crossentropy": 2.4783300161361694, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15779193863272667, "step": 14306 }, { "epoch": 0.6503636363636364, "grad_norm": 4.5625, "grad_norm_var": 0.0755859375, "learning_rate": 0.0001, "loss": 5.01, "loss/crossentropy": 2.135793298482895, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.14034938253462315, "step": 14308 }, { "epoch": 0.6504545454545455, "grad_norm": 5.125, "grad_norm_var": 0.07220052083333334, "learning_rate": 0.0001, "loss": 5.5806, "loss/crossentropy": 2.482321858406067, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1596350036561489, "step": 14310 }, { "epoch": 0.6505454545454545, "grad_norm": 4.78125, "grad_norm_var": 0.07209879557291667, "learning_rate": 0.0001, "loss": 5.6205, "loss/crossentropy": 2.537259101867676, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16105420887470245, "step": 14312 }, { "epoch": 0.6506363636363637, "grad_norm": 5.0, "grad_norm_var": 0.059794108072916664, "learning_rate": 0.0001, "loss": 5.3436, "loss/crossentropy": 2.320305496454239, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1535002626478672, "step": 14314 }, { "epoch": 0.6507272727272727, "grad_norm": 4.90625, "grad_norm_var": 0.16314697265625, "learning_rate": 0.0001, "loss": 5.6852, "loss/crossentropy": 2.562361717224121, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16365085542201996, "step": 14316 }, { "epoch": 0.6508181818181819, "grad_norm": 4.59375, "grad_norm_var": 0.17018229166666668, "learning_rate": 0.0001, "loss": 5.0234, "loss/crossentropy": 2.0985576510429382, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.14463403075933456, "step": 14318 }, { "epoch": 0.6509090909090909, "grad_norm": 5.0, "grad_norm_var": 0.15373942057291667, "learning_rate": 0.0001, "loss": 5.3098, "loss/crossentropy": 2.2868571877479553, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.1560027226805687, "step": 14320 }, { "epoch": 0.651, "grad_norm": 4.84375, "grad_norm_var": 0.173046875, "learning_rate": 0.0001, "loss": 5.5337, "loss/crossentropy": 2.501464545726776, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15381420403718948, "step": 14322 }, { "epoch": 0.651090909090909, "grad_norm": 4.625, "grad_norm_var": 0.16417643229166667, "learning_rate": 0.0001, "loss": 5.4402, "loss/crossentropy": 2.365739583969116, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15744513645768166, "step": 14324 }, { "epoch": 0.6511818181818182, "grad_norm": 4.6875, "grad_norm_var": 0.16682535807291668, "learning_rate": 0.0001, "loss": 5.5552, "loss/crossentropy": 2.5387349724769592, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.15574707835912704, "step": 14326 }, { "epoch": 0.6512727272727272, "grad_norm": 4.8125, "grad_norm_var": 0.17141520182291667, "learning_rate": 0.0001, "loss": 5.3731, "loss/crossentropy": 2.4046505093574524, "loss/hidden": 1.435546875, "loss/jsd": 0.0, "loss/logits": 0.1532924436032772, "step": 14328 }, { "epoch": 0.6513636363636364, "grad_norm": 4.625, "grad_norm_var": 0.18170572916666666, "learning_rate": 0.0001, "loss": 5.4539, "loss/crossentropy": 2.4422318935394287, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.1540948785841465, "step": 14330 }, { "epoch": 0.6514545454545455, "grad_norm": 5.1875, "grad_norm_var": 0.090234375, "learning_rate": 0.0001, "loss": 5.3024, "loss/crossentropy": 2.2360511422157288, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15839488059282303, "step": 14332 }, { "epoch": 0.6515454545454545, "grad_norm": 4.6875, "grad_norm_var": 0.0916015625, "learning_rate": 0.0001, "loss": 5.9381, "loss/crossentropy": 2.725403845310211, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17478029802441597, "step": 14334 }, { "epoch": 0.6516363636363637, "grad_norm": 5.5, "grad_norm_var": 1.137744140625, "learning_rate": 0.0001, "loss": 5.6348, "loss/crossentropy": 2.4776384234428406, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1639548670500517, "step": 14336 }, { "epoch": 0.6517272727272727, "grad_norm": 4.8125, "grad_norm_var": 1.1312784830729166, "learning_rate": 0.0001, "loss": 5.6827, "loss/crossentropy": 2.5576648116111755, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1623055636882782, "step": 14338 }, { "epoch": 0.6518181818181819, "grad_norm": 4.71875, "grad_norm_var": 1.1250284830729167, "learning_rate": 0.0001, "loss": 5.5618, "loss/crossentropy": 2.4793514013290405, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1598033532500267, "step": 14340 }, { "epoch": 0.6519090909090909, "grad_norm": 4.78125, "grad_norm_var": 1.1157389322916667, "learning_rate": 0.0001, "loss": 5.5665, "loss/crossentropy": 2.476914405822754, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1595483236014843, "step": 14342 }, { "epoch": 0.652, "grad_norm": 5.4375, "grad_norm_var": 1.0865519205729166, "learning_rate": 0.0001, "loss": 5.7033, "loss/crossentropy": 2.5598138570785522, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16728225722908974, "step": 14344 }, { "epoch": 0.652090909090909, "grad_norm": 5.0625, "grad_norm_var": 1.0638631184895833, "learning_rate": 0.0001, "loss": 5.7279, "loss/crossentropy": 2.6087517142295837, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.1644529178738594, "step": 14346 }, { "epoch": 0.6521818181818182, "grad_norm": 4.59375, "grad_norm_var": 1.0704386393229166, "learning_rate": 0.0001, "loss": 5.469, "loss/crossentropy": 2.4359212517738342, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15720979496836662, "step": 14348 }, { "epoch": 0.6522727272727272, "grad_norm": 5.75, "grad_norm_var": 1.0714152018229166, "learning_rate": 0.0001, "loss": 5.7515, "loss/crossentropy": 2.6445393562316895, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16186432167887688, "step": 14350 }, { "epoch": 0.6523636363636364, "grad_norm": 5.125, "grad_norm_var": 0.09566650390625, "learning_rate": 0.0001, "loss": 5.6646, "loss/crossentropy": 2.550412029027939, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16317836940288544, "step": 14352 }, { "epoch": 0.6524545454545455, "grad_norm": 5.46875, "grad_norm_var": 0.11291910807291666, "learning_rate": 0.0001, "loss": 5.7156, "loss/crossentropy": 2.5470350980758667, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16900620609521866, "step": 14354 }, { "epoch": 0.6525454545454545, "grad_norm": 5.5, "grad_norm_var": 0.1205078125, "learning_rate": 0.0001, "loss": 5.6868, "loss/crossentropy": 2.560868501663208, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.16044573858380318, "step": 14356 }, { "epoch": 0.6526363636363637, "grad_norm": 4.625, "grad_norm_var": 0.13004150390625, "learning_rate": 0.0001, "loss": 5.3356, "loss/crossentropy": 2.395979881286621, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.14884667471051216, "step": 14358 }, { "epoch": 0.6527272727272727, "grad_norm": 4.53125, "grad_norm_var": 0.13775634765625, "learning_rate": 0.0001, "loss": 5.5813, "loss/crossentropy": 2.5444082617759705, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1558339148759842, "step": 14360 }, { "epoch": 0.6528181818181819, "grad_norm": 4.46875, "grad_norm_var": 0.13857014973958334, "learning_rate": 0.0001, "loss": 5.1216, "loss/crossentropy": 2.196814000606537, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.14638715982437134, "step": 14362 }, { "epoch": 0.6529090909090909, "grad_norm": 4.9375, "grad_norm_var": 0.13254801432291666, "learning_rate": 0.0001, "loss": 5.5168, "loss/crossentropy": 2.42292720079422, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15978022292256355, "step": 14364 }, { "epoch": 0.653, "grad_norm": 4.71875, "grad_norm_var": 0.08681233723958333, "learning_rate": 0.0001, "loss": 5.8356, "loss/crossentropy": 2.7143812775611877, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1642725020647049, "step": 14366 }, { "epoch": 0.653090909090909, "grad_norm": 5.0, "grad_norm_var": 0.090478515625, "learning_rate": 0.0001, "loss": 5.9246, "loss/crossentropy": 2.632205009460449, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17708922550082207, "step": 14368 }, { "epoch": 0.6531818181818182, "grad_norm": 5.09375, "grad_norm_var": 0.06802978515625, "learning_rate": 0.0001, "loss": 5.7474, "loss/crossentropy": 2.616843283176422, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1640358753502369, "step": 14370 }, { "epoch": 0.6532727272727272, "grad_norm": 5.09375, "grad_norm_var": 0.05035400390625, "learning_rate": 0.0001, "loss": 5.9704, "loss/crossentropy": 2.724435031414032, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17303292080760002, "step": 14372 }, { "epoch": 0.6533636363636364, "grad_norm": 4.96875, "grad_norm_var": 0.08671875, "learning_rate": 0.0001, "loss": 5.7556, "loss/crossentropy": 2.5384770035743713, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.17347301170229912, "step": 14374 }, { "epoch": 0.6534545454545454, "grad_norm": 4.625, "grad_norm_var": 0.08378499348958333, "learning_rate": 0.0001, "loss": 5.6116, "loss/crossentropy": 2.5740877985954285, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15804360061883926, "step": 14376 }, { "epoch": 0.6535454545454545, "grad_norm": 4.625, "grad_norm_var": 0.07200520833333333, "learning_rate": 0.0001, "loss": 5.7369, "loss/crossentropy": 2.6431787610054016, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16054195538163185, "step": 14378 }, { "epoch": 0.6536363636363637, "grad_norm": 4.90625, "grad_norm_var": 0.07545572916666667, "learning_rate": 0.0001, "loss": 5.5545, "loss/crossentropy": 2.504837691783905, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.16023746877908707, "step": 14380 }, { "epoch": 0.6537272727272727, "grad_norm": 4.71875, "grad_norm_var": 0.07639567057291667, "learning_rate": 0.0001, "loss": 5.4032, "loss/crossentropy": 2.428323268890381, "loss/hidden": 1.427734375, "loss/jsd": 0.0, "loss/logits": 0.15470951423048973, "step": 14382 }, { "epoch": 0.6538181818181819, "grad_norm": 4.65625, "grad_norm_var": 0.07623697916666666, "learning_rate": 0.0001, "loss": 5.5814, "loss/crossentropy": 2.550226151943207, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15487508848309517, "step": 14384 }, { "epoch": 0.6539090909090909, "grad_norm": 5.28125, "grad_norm_var": 0.08307291666666666, "learning_rate": 0.0001, "loss": 5.5425, "loss/crossentropy": 2.4516382217407227, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.16280150786042213, "step": 14386 }, { "epoch": 0.654, "grad_norm": 5.8125, "grad_norm_var": 0.13043212890625, "learning_rate": 0.0001, "loss": 5.3828, "loss/crossentropy": 2.298442006111145, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.157658439129591, "step": 14388 }, { "epoch": 0.6540909090909091, "grad_norm": 5.09375, "grad_norm_var": 0.09534098307291666, "learning_rate": 0.0001, "loss": 5.8143, "loss/crossentropy": 2.6575233340263367, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16606856510043144, "step": 14390 }, { "epoch": 0.6541818181818182, "grad_norm": 4.625, "grad_norm_var": 0.09338785807291666, "learning_rate": 0.0001, "loss": 5.743, "loss/crossentropy": 2.62918421626091, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.16548157297074795, "step": 14392 }, { "epoch": 0.6542727272727272, "grad_norm": 4.34375, "grad_norm_var": 0.11764322916666667, "learning_rate": 0.0001, "loss": 5.0717, "loss/crossentropy": 2.178998053073883, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.14473718777298927, "step": 14394 }, { "epoch": 0.6543636363636364, "grad_norm": 4.71875, "grad_norm_var": 0.11679280598958333, "learning_rate": 0.0001, "loss": 5.4725, "loss/crossentropy": 2.4463430643081665, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15593810752034187, "step": 14396 }, { "epoch": 0.6544545454545454, "grad_norm": 5.0625, "grad_norm_var": 0.12102457682291666, "learning_rate": 0.0001, "loss": 5.8637, "loss/crossentropy": 2.629501223564148, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.17205233126878738, "step": 14398 }, { "epoch": 0.6545454545454545, "grad_norm": 4.59375, "grad_norm_var": 0.12362874348958333, "learning_rate": 0.0001, "loss": 6.0059, "loss/crossentropy": 2.793469190597534, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17280084267258644, "step": 14400 }, { "epoch": 0.6546363636363637, "grad_norm": 4.75, "grad_norm_var": 0.12245686848958333, "learning_rate": 0.0001, "loss": 5.4208, "loss/crossentropy": 2.439429521560669, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.15379885956645012, "step": 14402 }, { "epoch": 0.6547272727272727, "grad_norm": 5.4375, "grad_norm_var": 0.08964436848958333, "learning_rate": 0.0001, "loss": 5.8617, "loss/crossentropy": 2.6780622601509094, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.17168063670396805, "step": 14404 }, { "epoch": 0.6548181818181819, "grad_norm": 5.0625, "grad_norm_var": 0.09006754557291667, "learning_rate": 0.0001, "loss": 5.7905, "loss/crossentropy": 2.6676629185676575, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1628650762140751, "step": 14406 }, { "epoch": 0.6549090909090909, "grad_norm": 4.75, "grad_norm_var": 0.10676676432291667, "learning_rate": 0.0001, "loss": 5.924, "loss/crossentropy": 2.7655044198036194, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16721495985984802, "step": 14408 }, { "epoch": 0.655, "grad_norm": 4.96875, "grad_norm_var": 0.095556640625, "learning_rate": 0.0001, "loss": 5.5829, "loss/crossentropy": 2.4835711121559143, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16052226349711418, "step": 14410 }, { "epoch": 0.6550909090909091, "grad_norm": 5.03125, "grad_norm_var": 0.08905843098958334, "learning_rate": 0.0001, "loss": 5.7792, "loss/crossentropy": 2.670194447040558, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.16460682824254036, "step": 14412 }, { "epoch": 0.6551818181818182, "grad_norm": 4.375, "grad_norm_var": 0.11717122395833333, "learning_rate": 0.0001, "loss": 5.4987, "loss/crossentropy": 2.5170775055885315, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.15343941375613213, "step": 14414 }, { "epoch": 0.6552727272727272, "grad_norm": 4.59375, "grad_norm_var": 0.11790364583333333, "learning_rate": 0.0001, "loss": 5.1636, "loss/crossentropy": 2.2113208770751953, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.1477675624191761, "step": 14416 }, { "epoch": 0.6553636363636364, "grad_norm": 4.90625, "grad_norm_var": 0.11207275390625, "learning_rate": 0.0001, "loss": 5.5657, "loss/crossentropy": 2.4536340832710266, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.15886571258306503, "step": 14418 }, { "epoch": 0.6554545454545454, "grad_norm": 4.3125, "grad_norm_var": 0.12584228515625, "learning_rate": 0.0001, "loss": 5.2119, "loss/crossentropy": 2.307181030511856, "loss/hidden": 1.423828125, "loss/jsd": 0.0, "loss/logits": 0.1480897068977356, "step": 14420 }, { "epoch": 0.6555454545454545, "grad_norm": 4.6875, "grad_norm_var": 0.127587890625, "learning_rate": 0.0001, "loss": 5.5961, "loss/crossentropy": 2.533249616622925, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.16077588126063347, "step": 14422 }, { "epoch": 0.6556363636363637, "grad_norm": 4.5, "grad_norm_var": 0.11691080729166667, "learning_rate": 0.0001, "loss": 5.5, "loss/crossentropy": 2.416747808456421, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16028204187750816, "step": 14424 }, { "epoch": 0.6557272727272727, "grad_norm": 4.625, "grad_norm_var": 0.07782796223958334, "learning_rate": 0.0001, "loss": 5.1077, "loss/crossentropy": 2.1828812062740326, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.14540854841470718, "step": 14426 }, { "epoch": 0.6558181818181819, "grad_norm": 4.8125, "grad_norm_var": 0.077978515625, "learning_rate": 0.0001, "loss": 5.5182, "loss/crossentropy": 2.4480252861976624, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15975582599639893, "step": 14428 }, { "epoch": 0.6559090909090909, "grad_norm": 4.65625, "grad_norm_var": 0.07369384765625, "learning_rate": 0.0001, "loss": 5.2592, "loss/crossentropy": 2.2454375326633453, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15391898155212402, "step": 14430 }, { "epoch": 0.656, "grad_norm": 4.9375, "grad_norm_var": 0.13635660807291666, "learning_rate": 0.0001, "loss": 5.4825, "loss/crossentropy": 2.4606877267360687, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15315360389649868, "step": 14432 }, { "epoch": 0.6560909090909091, "grad_norm": 4.9375, "grad_norm_var": 0.13837483723958333, "learning_rate": 0.0001, "loss": 5.9403, "loss/crossentropy": 2.753790259361267, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16963079944252968, "step": 14434 }, { "epoch": 0.6561818181818182, "grad_norm": 4.84375, "grad_norm_var": 0.11405843098958333, "learning_rate": 0.0001, "loss": 5.7083, "loss/crossentropy": 2.6456146836280823, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.1603698804974556, "step": 14436 }, { "epoch": 0.6562727272727272, "grad_norm": 4.65625, "grad_norm_var": 0.11776936848958333, "learning_rate": 0.0001, "loss": 5.6778, "loss/crossentropy": 2.632447123527527, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15961457043886185, "step": 14438 }, { "epoch": 0.6563636363636364, "grad_norm": 4.84375, "grad_norm_var": 0.10546875, "learning_rate": 0.0001, "loss": 5.3739, "loss/crossentropy": 2.406282365322113, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.1504744477570057, "step": 14440 }, { "epoch": 0.6564545454545454, "grad_norm": 4.1875, "grad_norm_var": 0.13306884765625, "learning_rate": 0.0001, "loss": 5.284, "loss/crossentropy": 2.372660994529724, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1469908207654953, "step": 14442 }, { "epoch": 0.6565454545454545, "grad_norm": 5.15625, "grad_norm_var": 0.15497639973958333, "learning_rate": 0.0001, "loss": 5.8592, "loss/crossentropy": 2.6486659049987793, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17027616873383522, "step": 14444 }, { "epoch": 0.6566363636363637, "grad_norm": 4.875, "grad_norm_var": 0.15128580729166666, "learning_rate": 0.0001, "loss": 5.2542, "loss/crossentropy": 2.2999744415283203, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.14737535454332829, "step": 14446 }, { "epoch": 0.6567272727272727, "grad_norm": 4.5625, "grad_norm_var": 0.10530192057291667, "learning_rate": 0.0001, "loss": 5.3525, "loss/crossentropy": 2.31733700633049, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15663789957761765, "step": 14448 }, { "epoch": 0.6568181818181819, "grad_norm": 4.53125, "grad_norm_var": 0.10347900390625, "learning_rate": 0.0001, "loss": 5.254, "loss/crossentropy": 2.3483442068099976, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1456434391438961, "step": 14450 }, { "epoch": 0.6569090909090909, "grad_norm": 5.09375, "grad_norm_var": 0.10725504557291667, "learning_rate": 0.0001, "loss": 5.3476, "loss/crossentropy": 2.392181098461151, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.15042224898934364, "step": 14452 }, { "epoch": 0.657, "grad_norm": 4.90625, "grad_norm_var": 0.10852457682291666, "learning_rate": 0.0001, "loss": 5.6586, "loss/crossentropy": 2.5549997091293335, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16329244896769524, "step": 14454 }, { "epoch": 0.6570909090909091, "grad_norm": 5.28125, "grad_norm_var": 0.12157796223958334, "learning_rate": 0.0001, "loss": 5.6909, "loss/crossentropy": 2.5629040598869324, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16162606701254845, "step": 14456 }, { "epoch": 0.6571818181818182, "grad_norm": 4.65625, "grad_norm_var": 0.09237874348958333, "learning_rate": 0.0001, "loss": 5.6228, "loss/crossentropy": 2.581303358078003, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1592266485095024, "step": 14458 }, { "epoch": 0.6572727272727272, "grad_norm": 5.09375, "grad_norm_var": 0.06412353515625, "learning_rate": 0.0001, "loss": 5.9393, "loss/crossentropy": 2.7067508697509766, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17559875920414925, "step": 14460 }, { "epoch": 0.6573636363636364, "grad_norm": 4.9375, "grad_norm_var": 0.06350504557291667, "learning_rate": 0.0001, "loss": 5.2864, "loss/crossentropy": 2.2676753997802734, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15343939885497093, "step": 14462 }, { "epoch": 0.6574545454545454, "grad_norm": 4.59375, "grad_norm_var": 0.06194254557291667, "learning_rate": 0.0001, "loss": 5.6654, "loss/crossentropy": 2.637107789516449, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.1569286547601223, "step": 14464 }, { "epoch": 0.6575454545454545, "grad_norm": 5.5625, "grad_norm_var": 0.11444905598958334, "learning_rate": 0.0001, "loss": 5.2419, "loss/crossentropy": 2.2906819880008698, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.14824440889060497, "step": 14466 }, { "epoch": 0.6576363636363637, "grad_norm": 5.125, "grad_norm_var": 0.10943603515625, "learning_rate": 0.0001, "loss": 5.5577, "loss/crossentropy": 2.4748148322105408, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.15672430023550987, "step": 14468 }, { "epoch": 0.6577272727272727, "grad_norm": 4.75, "grad_norm_var": 0.10675455729166666, "learning_rate": 0.0001, "loss": 5.2573, "loss/crossentropy": 2.273584008216858, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1511084958910942, "step": 14470 }, { "epoch": 0.6578181818181819, "grad_norm": 4.3125, "grad_norm_var": 0.11428629557291667, "learning_rate": 0.0001, "loss": 5.4797, "loss/crossentropy": 2.4776092171669006, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15489595010876656, "step": 14472 }, { "epoch": 0.6579090909090909, "grad_norm": 4.625, "grad_norm_var": 0.11964518229166667, "learning_rate": 0.0001, "loss": 5.6238, "loss/crossentropy": 2.5995258688926697, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1559380330145359, "step": 14474 }, { "epoch": 0.658, "grad_norm": 4.84375, "grad_norm_var": 0.11412760416666666, "learning_rate": 0.0001, "loss": 5.4913, "loss/crossentropy": 2.474304437637329, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.15579887479543686, "step": 14476 }, { "epoch": 0.6580909090909091, "grad_norm": 4.625, "grad_norm_var": 0.12037760416666667, "learning_rate": 0.0001, "loss": 5.4303, "loss/crossentropy": 2.44503653049469, "loss/hidden": 1.435546875, "loss/jsd": 0.0, "loss/logits": 0.15497178956866264, "step": 14478 }, { "epoch": 0.6581818181818182, "grad_norm": 4.8125, "grad_norm_var": 0.13287760416666666, "learning_rate": 0.0001, "loss": 5.6853, "loss/crossentropy": 2.4411635398864746, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17207315936684608, "step": 14480 }, { "epoch": 0.6582727272727272, "grad_norm": 4.90625, "grad_norm_var": 0.06347249348958334, "learning_rate": 0.0001, "loss": 5.5897, "loss/crossentropy": 2.528580963611603, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15864717587828636, "step": 14482 }, { "epoch": 0.6583636363636364, "grad_norm": 4.625, "grad_norm_var": 0.05377604166666667, "learning_rate": 0.0001, "loss": 5.6407, "loss/crossentropy": 2.55942165851593, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16046912223100662, "step": 14484 }, { "epoch": 0.6584545454545454, "grad_norm": 5.46875, "grad_norm_var": 0.09724934895833333, "learning_rate": 0.0001, "loss": 5.8752, "loss/crossentropy": 2.6880213022232056, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17262382060289383, "step": 14486 }, { "epoch": 0.6585454545454545, "grad_norm": 4.84375, "grad_norm_var": 0.08092041015625, "learning_rate": 0.0001, "loss": 5.5123, "loss/crossentropy": 2.4725944995880127, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15651452168822289, "step": 14488 }, { "epoch": 0.6586363636363637, "grad_norm": 5.1875, "grad_norm_var": 0.09224853515625, "learning_rate": 0.0001, "loss": 5.3244, "loss/crossentropy": 2.404356062412262, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.14767219498753548, "step": 14490 }, { "epoch": 0.6587272727272727, "grad_norm": 5.15625, "grad_norm_var": 0.09120686848958333, "learning_rate": 0.0001, "loss": 5.8961, "loss/crossentropy": 2.661442995071411, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.17092788964509964, "step": 14492 }, { "epoch": 0.6588181818181819, "grad_norm": 5.625, "grad_norm_var": 0.09602864583333333, "learning_rate": 0.0001, "loss": 5.4416, "loss/crossentropy": 2.3565969467163086, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16006315499544144, "step": 14494 }, { "epoch": 0.6589090909090909, "grad_norm": 4.875, "grad_norm_var": 0.107666015625, "learning_rate": 0.0001, "loss": 5.3205, "loss/crossentropy": 2.4001147150993347, "loss/hidden": 1.439453125, "loss/jsd": 0.0, "loss/logits": 0.1480942666530609, "step": 14496 }, { "epoch": 0.659, "grad_norm": 5.09375, "grad_norm_var": 0.112353515625, "learning_rate": 0.0001, "loss": 5.4624, "loss/crossentropy": 2.4383997917175293, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15493753924965858, "step": 14498 }, { "epoch": 0.6590909090909091, "grad_norm": 4.375, "grad_norm_var": 0.13655192057291668, "learning_rate": 0.0001, "loss": 5.3689, "loss/crossentropy": 2.38688200712204, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.15230586379766464, "step": 14500 }, { "epoch": 0.6591818181818182, "grad_norm": 4.65625, "grad_norm_var": 0.11623942057291667, "learning_rate": 0.0001, "loss": 5.6144, "loss/crossentropy": 2.5920315980911255, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15575332194566727, "step": 14502 }, { "epoch": 0.6592727272727272, "grad_norm": 5.0, "grad_norm_var": 0.11851806640625, "learning_rate": 0.0001, "loss": 5.7959, "loss/crossentropy": 2.6427462697029114, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1659025028347969, "step": 14504 }, { "epoch": 0.6593636363636364, "grad_norm": 5.46875, "grad_norm_var": 0.13404541015625, "learning_rate": 0.0001, "loss": 5.695, "loss/crossentropy": 2.482367157936096, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16774774715304375, "step": 14506 }, { "epoch": 0.6594545454545454, "grad_norm": 5.3125, "grad_norm_var": 0.20284830729166667, "learning_rate": 0.0001, "loss": 5.7313, "loss/crossentropy": 2.5813097953796387, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16480672359466553, "step": 14508 }, { "epoch": 0.6595454545454545, "grad_norm": 5.03125, "grad_norm_var": 0.31526285807291665, "learning_rate": 0.0001, "loss": 5.9672, "loss/crossentropy": 2.774557113647461, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17004166170954704, "step": 14510 }, { "epoch": 0.6596363636363637, "grad_norm": 5.15625, "grad_norm_var": 0.2897745768229167, "learning_rate": 0.0001, "loss": 5.7808, "loss/crossentropy": 2.584555149078369, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16962365061044693, "step": 14512 }, { "epoch": 0.6597272727272727, "grad_norm": 5.28125, "grad_norm_var": 0.29547119140625, "learning_rate": 0.0001, "loss": 5.4149, "loss/crossentropy": 2.3881096243858337, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15560809522867203, "step": 14514 }, { "epoch": 0.6598181818181819, "grad_norm": 4.65625, "grad_norm_var": 0.27574462890625, "learning_rate": 0.0001, "loss": 5.1216, "loss/crossentropy": 2.197768807411194, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.14726419001817703, "step": 14516 }, { "epoch": 0.6599090909090909, "grad_norm": 5.1875, "grad_norm_var": 0.25536702473958334, "learning_rate": 0.0001, "loss": 5.5935, "loss/crossentropy": 2.4689524173736572, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16460811346769333, "step": 14518 }, { "epoch": 0.66, "grad_norm": 4.71875, "grad_norm_var": 0.2664998372395833, "learning_rate": 0.0001, "loss": 5.6775, "loss/crossentropy": 2.621710777282715, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15792334452271461, "step": 14520 }, { "epoch": 0.6600909090909091, "grad_norm": 5.03125, "grad_norm_var": 0.26282145182291666, "learning_rate": 0.0001, "loss": 5.5124, "loss/crossentropy": 2.4317670464515686, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1600189507007599, "step": 14522 }, { "epoch": 0.6601818181818182, "grad_norm": 5.0, "grad_norm_var": 0.22779947916666668, "learning_rate": 0.0001, "loss": 5.5015, "loss/crossentropy": 2.4581679701805115, "loss/hidden": 1.439453125, "loss/jsd": 0.0, "loss/logits": 0.16038663685321808, "step": 14524 }, { "epoch": 0.6602727272727272, "grad_norm": 4.9375, "grad_norm_var": 0.092041015625, "learning_rate": 0.0001, "loss": 5.486, "loss/crossentropy": 2.470252573490143, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15352598577737808, "step": 14526 }, { "epoch": 0.6603636363636364, "grad_norm": 5.28125, "grad_norm_var": 0.09568684895833333, "learning_rate": 0.0001, "loss": 5.6127, "loss/crossentropy": 2.5201398730278015, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16120431572198868, "step": 14528 }, { "epoch": 0.6604545454545454, "grad_norm": 4.59375, "grad_norm_var": 0.09000244140625, "learning_rate": 0.0001, "loss": 5.3149, "loss/crossentropy": 2.3355169892311096, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15223336964845657, "step": 14530 }, { "epoch": 0.6605454545454545, "grad_norm": 5.4375, "grad_norm_var": 0.08808186848958334, "learning_rate": 0.0001, "loss": 5.6155, "loss/crossentropy": 2.5830517411231995, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.154418732970953, "step": 14532 }, { "epoch": 0.6606363636363637, "grad_norm": 4.75, "grad_norm_var": 0.058186848958333336, "learning_rate": 0.0001, "loss": 5.5585, "loss/crossentropy": 2.5293153524398804, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15604420378804207, "step": 14534 }, { "epoch": 0.6607272727272727, "grad_norm": 5.0, "grad_norm_var": 0.054150390625, "learning_rate": 0.0001, "loss": 5.2381, "loss/crossentropy": 2.257128059864044, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.14887730032205582, "step": 14536 }, { "epoch": 0.6608181818181819, "grad_norm": 4.5, "grad_norm_var": 0.06526285807291667, "learning_rate": 0.0001, "loss": 5.7933, "loss/crossentropy": 2.6584794521331787, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16816936433315277, "step": 14538 }, { "epoch": 0.6609090909090909, "grad_norm": 5.28125, "grad_norm_var": 0.07746988932291667, "learning_rate": 0.0001, "loss": 5.5199, "loss/crossentropy": 2.4204304814338684, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16072753071784973, "step": 14540 }, { "epoch": 0.661, "grad_norm": 4.28125, "grad_norm_var": 0.09866129557291667, "learning_rate": 0.0001, "loss": 5.1868, "loss/crossentropy": 2.312116652727127, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.1423475183546543, "step": 14542 }, { "epoch": 0.6610909090909091, "grad_norm": 4.65625, "grad_norm_var": 0.09420572916666667, "learning_rate": 0.0001, "loss": 5.5222, "loss/crossentropy": 2.4573349952697754, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.1590249091386795, "step": 14544 }, { "epoch": 0.6611818181818182, "grad_norm": 6.28125, "grad_norm_var": 0.21275634765625, "learning_rate": 0.0001, "loss": 5.7945, "loss/crossentropy": 2.6356093883514404, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16804156824946404, "step": 14546 }, { "epoch": 0.6612727272727272, "grad_norm": 4.46875, "grad_norm_var": 0.21751302083333332, "learning_rate": 0.0001, "loss": 5.4138, "loss/crossentropy": 2.3938942551612854, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15667376294732094, "step": 14548 }, { "epoch": 0.6613636363636364, "grad_norm": 4.8125, "grad_norm_var": 0.216259765625, "learning_rate": 0.0001, "loss": 5.8679, "loss/crossentropy": 2.752472758293152, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16428076475858688, "step": 14550 }, { "epoch": 0.6614545454545454, "grad_norm": 4.875, "grad_norm_var": 0.21575520833333334, "learning_rate": 0.0001, "loss": 5.5714, "loss/crossentropy": 2.4784713983535767, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1616399995982647, "step": 14552 }, { "epoch": 0.6615454545454545, "grad_norm": 4.53125, "grad_norm_var": 0.22340087890625, "learning_rate": 0.0001, "loss": 5.3312, "loss/crossentropy": 2.3758413195610046, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.1496373638510704, "step": 14554 }, { "epoch": 0.6616363636363637, "grad_norm": 4.78125, "grad_norm_var": 0.214697265625, "learning_rate": 0.0001, "loss": 5.4647, "loss/crossentropy": 2.4470531344413757, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15606020018458366, "step": 14556 }, { "epoch": 0.6617272727272727, "grad_norm": 5.1875, "grad_norm_var": 0.21170247395833333, "learning_rate": 0.0001, "loss": 6.0174, "loss/crossentropy": 2.816870927810669, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16927630826830864, "step": 14558 }, { "epoch": 0.6618181818181819, "grad_norm": 4.75, "grad_norm_var": 0.2080078125, "learning_rate": 0.0001, "loss": 5.3651, "loss/crossentropy": 2.395837426185608, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.15181058645248413, "step": 14560 }, { "epoch": 0.6619090909090909, "grad_norm": 4.84375, "grad_norm_var": 0.07862955729166667, "learning_rate": 0.0001, "loss": 5.5073, "loss/crossentropy": 2.4301341772079468, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16240685433149338, "step": 14562 }, { "epoch": 0.662, "grad_norm": 4.875, "grad_norm_var": 0.0669921875, "learning_rate": 0.0001, "loss": 5.2399, "loss/crossentropy": 2.2994531095027924, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1483430713415146, "step": 14564 }, { "epoch": 0.6620909090909091, "grad_norm": 4.90625, "grad_norm_var": 0.0859375, "learning_rate": 0.0001, "loss": 5.262, "loss/crossentropy": 2.390290319919586, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.14380742982029915, "step": 14566 }, { "epoch": 0.6621818181818182, "grad_norm": 4.875, "grad_norm_var": 0.08453369140625, "learning_rate": 0.0001, "loss": 5.4693, "loss/crossentropy": 2.424134850502014, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15666841343045235, "step": 14568 }, { "epoch": 0.6622727272727272, "grad_norm": 4.90625, "grad_norm_var": 0.08958333333333333, "learning_rate": 0.0001, "loss": 5.9155, "loss/crossentropy": 2.6389726400375366, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17745810747146606, "step": 14570 }, { "epoch": 0.6623636363636364, "grad_norm": 4.5625, "grad_norm_var": 0.08800455729166666, "learning_rate": 0.0001, "loss": 5.1077, "loss/crossentropy": 2.190793991088867, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.14462241530418396, "step": 14572 }, { "epoch": 0.6624545454545454, "grad_norm": 4.375, "grad_norm_var": 0.06636962890625, "learning_rate": 0.0001, "loss": 5.4863, "loss/crossentropy": 2.5138657093048096, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.1529104746878147, "step": 14574 }, { "epoch": 0.6625454545454545, "grad_norm": 4.78125, "grad_norm_var": 0.0625, "learning_rate": 0.0001, "loss": 5.6306, "loss/crossentropy": 2.5598538517951965, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.160395085811615, "step": 14576 }, { "epoch": 0.6626363636363637, "grad_norm": 4.75, "grad_norm_var": 0.06847330729166666, "learning_rate": 0.0001, "loss": 5.1626, "loss/crossentropy": 2.276053637266159, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.14431520365178585, "step": 14578 }, { "epoch": 0.6627272727272727, "grad_norm": 6.0625, "grad_norm_var": 0.19833577473958333, "learning_rate": 0.0001, "loss": 6.1435, "loss/crossentropy": 2.8314537405967712, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.18139630183577538, "step": 14580 }, { "epoch": 0.6628181818181819, "grad_norm": 4.875, "grad_norm_var": 0.176416015625, "learning_rate": 0.0001, "loss": 5.4959, "loss/crossentropy": 2.4862895607948303, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15271999314427376, "step": 14582 }, { "epoch": 0.6629090909090909, "grad_norm": 4.65625, "grad_norm_var": 0.19299723307291666, "learning_rate": 0.0001, "loss": 5.2465, "loss/crossentropy": 2.291154205799103, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.14944316446781158, "step": 14584 }, { "epoch": 0.663, "grad_norm": 4.53125, "grad_norm_var": 0.188916015625, "learning_rate": 0.0001, "loss": 5.3491, "loss/crossentropy": 2.3372650146484375, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1544988490641117, "step": 14586 }, { "epoch": 0.6630909090909091, "grad_norm": 5.1875, "grad_norm_var": 0.18843994140625, "learning_rate": 0.0001, "loss": 5.7246, "loss/crossentropy": 2.579487979412079, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16255472972989082, "step": 14588 }, { "epoch": 0.6631818181818182, "grad_norm": 4.90625, "grad_norm_var": 0.16769205729166667, "learning_rate": 0.0001, "loss": 5.7467, "loss/crossentropy": 2.5875194668769836, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16669975593686104, "step": 14590 }, { "epoch": 0.6632727272727272, "grad_norm": 5.21875, "grad_norm_var": 0.17398681640625, "learning_rate": 0.0001, "loss": 5.5104, "loss/crossentropy": 2.38445508480072, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16200730204582214, "step": 14592 }, { "epoch": 0.6633636363636364, "grad_norm": 5.34375, "grad_norm_var": 0.16730143229166666, "learning_rate": 0.0001, "loss": 5.2602, "loss/crossentropy": 2.3341862559318542, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.14670155569911003, "step": 14594 }, { "epoch": 0.6634545454545454, "grad_norm": 4.8125, "grad_norm_var": 0.06454671223958333, "learning_rate": 0.0001, "loss": 5.577, "loss/crossentropy": 2.5113143920898438, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1579369716346264, "step": 14596 }, { "epoch": 0.6635454545454545, "grad_norm": 5.03125, "grad_norm_var": 0.11087239583333333, "learning_rate": 0.0001, "loss": 5.7149, "loss/crossentropy": 2.5721338987350464, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1656433343887329, "step": 14598 }, { "epoch": 0.6636363636363637, "grad_norm": 4.875, "grad_norm_var": 0.09412434895833334, "learning_rate": 0.0001, "loss": 5.642, "loss/crossentropy": 2.6024346947669983, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1602061465382576, "step": 14600 }, { "epoch": 0.6637272727272727, "grad_norm": 6.09375, "grad_norm_var": 0.16874593098958332, "learning_rate": 0.0001, "loss": 5.7162, "loss/crossentropy": 2.5933499336242676, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1620929129421711, "step": 14602 }, { "epoch": 0.6638181818181819, "grad_norm": 4.65625, "grad_norm_var": 0.17955729166666667, "learning_rate": 0.0001, "loss": 5.4993, "loss/crossentropy": 2.4710326194763184, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.15771254152059555, "step": 14604 }, { "epoch": 0.6639090909090909, "grad_norm": 4.75, "grad_norm_var": 0.19159749348958333, "learning_rate": 0.0001, "loss": 5.8925, "loss/crossentropy": 2.701278507709503, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17146345973014832, "step": 14606 }, { "epoch": 0.664, "grad_norm": 4.5625, "grad_norm_var": 0.21210530598958333, "learning_rate": 0.0001, "loss": 5.2111, "loss/crossentropy": 2.2963694632053375, "loss/hidden": 1.439453125, "loss/jsd": 0.0, "loss/logits": 0.14752573147416115, "step": 14608 }, { "epoch": 0.6640909090909091, "grad_norm": 5.0625, "grad_norm_var": 0.19518229166666667, "learning_rate": 0.0001, "loss": 5.7464, "loss/crossentropy": 2.59967303276062, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16760087758302689, "step": 14610 }, { "epoch": 0.6641818181818182, "grad_norm": 4.78125, "grad_norm_var": 0.19654541015625, "learning_rate": 0.0001, "loss": 5.8409, "loss/crossentropy": 2.6769095063209534, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16698814556002617, "step": 14612 }, { "epoch": 0.6642727272727272, "grad_norm": 5.625, "grad_norm_var": 0.18336181640625, "learning_rate": 0.0001, "loss": 5.5525, "loss/crossentropy": 2.4396756291389465, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.15776648744940758, "step": 14614 }, { "epoch": 0.6643636363636364, "grad_norm": 4.5, "grad_norm_var": 0.19381103515625, "learning_rate": 0.0001, "loss": 5.8385, "loss/crossentropy": 2.7131020426750183, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1668335124850273, "step": 14616 }, { "epoch": 0.6644545454545454, "grad_norm": 5.25, "grad_norm_var": 0.10299072265625, "learning_rate": 0.0001, "loss": 5.4226, "loss/crossentropy": 2.404298275709152, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15338973328471184, "step": 14618 }, { "epoch": 0.6645454545454546, "grad_norm": 4.34375, "grad_norm_var": 0.12291666666666666, "learning_rate": 0.0001, "loss": 5.2001, "loss/crossentropy": 2.287385582923889, "loss/hidden": 1.435546875, "loss/jsd": 0.0, "loss/logits": 0.14771421253681183, "step": 14620 }, { "epoch": 0.6646363636363637, "grad_norm": 4.75, "grad_norm_var": 0.10953369140625, "learning_rate": 0.0001, "loss": 5.5228, "loss/crossentropy": 2.4583874344825745, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15858763828873634, "step": 14622 }, { "epoch": 0.6647272727272727, "grad_norm": 5.25, "grad_norm_var": 0.10139567057291667, "learning_rate": 0.0001, "loss": 5.2947, "loss/crossentropy": 2.291873037815094, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15321504324674606, "step": 14624 }, { "epoch": 0.6648181818181819, "grad_norm": 5.34375, "grad_norm_var": 7.065104166666667, "learning_rate": 0.0001, "loss": 5.581, "loss/crossentropy": 2.4558881521224976, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1630980148911476, "step": 14626 }, { "epoch": 0.6649090909090909, "grad_norm": 5.25, "grad_norm_var": 7.032535807291667, "learning_rate": 0.0001, "loss": 5.6237, "loss/crossentropy": 2.464868903160095, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1649041436612606, "step": 14628 }, { "epoch": 0.665, "grad_norm": 5.0, "grad_norm_var": 7.1173828125, "learning_rate": 0.0001, "loss": 5.2447, "loss/crossentropy": 2.2341561913490295, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1522250920534134, "step": 14630 }, { "epoch": 0.6650909090909091, "grad_norm": 4.9375, "grad_norm_var": 7.037430826822916, "learning_rate": 0.0001, "loss": 5.9869, "loss/crossentropy": 2.7075061798095703, "loss/hidden": 1.521484375, "loss/jsd": 0.0, "loss/logits": 0.17579179629683495, "step": 14632 }, { "epoch": 0.6651818181818182, "grad_norm": 5.28125, "grad_norm_var": 7.075325520833333, "learning_rate": 0.0001, "loss": 5.7224, "loss/crossentropy": 2.634636104106903, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16073094680905342, "step": 14634 }, { "epoch": 0.6652727272727272, "grad_norm": 5.5, "grad_norm_var": 21.170048014322916, "learning_rate": 0.0001, "loss": 5.3784, "loss/crossentropy": 2.3791489005088806, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.151484202593565, "step": 14636 }, { "epoch": 0.6653636363636364, "grad_norm": 9.4375, "grad_norm_var": 21.043452962239584, "learning_rate": 0.0001, "loss": 5.8716, "loss/crossentropy": 2.574512779712677, "loss/hidden": 1.568359375, "loss/jsd": 0.0, "loss/logits": 0.17287568747997284, "step": 14638 }, { "epoch": 0.6654545454545454, "grad_norm": 4.84375, "grad_norm_var": 21.10836181640625, "learning_rate": 0.0001, "loss": 5.6818, "loss/crossentropy": 2.5617367029190063, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16220280528068542, "step": 14640 }, { "epoch": 0.6655454545454546, "grad_norm": 5.40625, "grad_norm_var": 16.267378743489584, "learning_rate": 0.0001, "loss": 5.826, "loss/crossentropy": 2.6934983134269714, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1638370342552662, "step": 14642 }, { "epoch": 0.6656363636363636, "grad_norm": 4.8125, "grad_norm_var": 16.511848958333335, "learning_rate": 0.0001, "loss": 4.9744, "loss/crossentropy": 2.0803614258766174, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.14116486348211765, "step": 14644 }, { "epoch": 0.6657272727272727, "grad_norm": 4.5, "grad_norm_var": 16.484761555989582, "learning_rate": 0.0001, "loss": 5.587, "loss/crossentropy": 2.5679538249969482, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15619712322950363, "step": 14646 }, { "epoch": 0.6658181818181819, "grad_norm": 4.40625, "grad_norm_var": 16.7890625, "learning_rate": 0.0001, "loss": 5.6185, "loss/crossentropy": 2.637032210826874, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15400145202875137, "step": 14648 }, { "epoch": 0.6659090909090909, "grad_norm": 4.75, "grad_norm_var": 16.848502604166665, "learning_rate": 0.0001, "loss": 5.7609, "loss/crossentropy": 2.648809015750885, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16179189458489418, "step": 14650 }, { "epoch": 0.666, "grad_norm": 4.96875, "grad_norm_var": 1.4295857747395833, "learning_rate": 0.0001, "loss": 5.3797, "loss/crossentropy": 2.3703519105911255, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1515166461467743, "step": 14652 }, { "epoch": 0.6660909090909091, "grad_norm": 4.78125, "grad_norm_var": 0.08264567057291666, "learning_rate": 0.0001, "loss": 5.4946, "loss/crossentropy": 2.4462444186210632, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15620705857872963, "step": 14654 }, { "epoch": 0.6661818181818182, "grad_norm": 4.8125, "grad_norm_var": 0.08336181640625, "learning_rate": 0.0001, "loss": 5.7302, "loss/crossentropy": 2.6429893374443054, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.16321339830756187, "step": 14656 }, { "epoch": 0.6662727272727272, "grad_norm": 4.875, "grad_norm_var": 0.075244140625, "learning_rate": 0.0001, "loss": 5.3645, "loss/crossentropy": 2.370669424533844, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15328679978847504, "step": 14658 }, { "epoch": 0.6663636363636364, "grad_norm": 5.1875, "grad_norm_var": 0.08775634765625, "learning_rate": 0.0001, "loss": 5.7478, "loss/crossentropy": 2.6256787180900574, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16163047030568123, "step": 14660 }, { "epoch": 0.6664545454545454, "grad_norm": 8.6875, "grad_norm_var": 1.01119384765625, "learning_rate": 0.0001, "loss": 5.5824, "loss/crossentropy": 2.459168553352356, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16056132316589355, "step": 14662 }, { "epoch": 0.6665454545454546, "grad_norm": 4.96875, "grad_norm_var": 0.9732381184895833, "learning_rate": 0.0001, "loss": 5.6034, "loss/crossentropy": 2.573319435119629, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15730908885598183, "step": 14664 }, { "epoch": 0.6666363636363636, "grad_norm": 5.8125, "grad_norm_var": 1.3969889322916667, "learning_rate": 0.0001, "loss": 5.502, "loss/crossentropy": 2.4034863710403442, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1610247679054737, "step": 14666 }, { "epoch": 0.6667272727272727, "grad_norm": 6.5, "grad_norm_var": 1.48580322265625, "learning_rate": 0.0001, "loss": 5.1866, "loss/crossentropy": 2.222708225250244, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.14873061142861843, "step": 14668 }, { "epoch": 0.6668181818181819, "grad_norm": 5.25, "grad_norm_var": 1.43199462890625, "learning_rate": 0.0001, "loss": 5.3174, "loss/crossentropy": 2.1489322185516357, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1650925576686859, "step": 14670 }, { "epoch": 0.6669090909090909, "grad_norm": 6.59375, "grad_norm_var": 1.4498046875, "learning_rate": 0.0001, "loss": 5.5278, "loss/crossentropy": 2.443805992603302, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16074083000421524, "step": 14672 }, { "epoch": 0.667, "grad_norm": 5.21875, "grad_norm_var": 1.3261555989583333, "learning_rate": 0.0001, "loss": 5.8145, "loss/crossentropy": 2.6732526421546936, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16451715305447578, "step": 14674 }, { "epoch": 0.6670909090909091, "grad_norm": 4.5, "grad_norm_var": 1.3998006184895833, "learning_rate": 0.0001, "loss": 5.3386, "loss/crossentropy": 2.411792367696762, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1450262814760208, "step": 14676 }, { "epoch": 0.6671818181818182, "grad_norm": 4.90625, "grad_norm_var": 0.7348958333333333, "learning_rate": 0.0001, "loss": 5.5463, "loss/crossentropy": 2.4674432277679443, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15964435413479805, "step": 14678 }, { "epoch": 0.6672727272727272, "grad_norm": 5.21875, "grad_norm_var": 0.721337890625, "learning_rate": 0.0001, "loss": 5.6645, "loss/crossentropy": 2.483033776283264, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16990790888667107, "step": 14680 }, { "epoch": 0.6673636363636364, "grad_norm": 4.53125, "grad_norm_var": 0.38238525390625, "learning_rate": 0.0001, "loss": 5.0222, "loss/crossentropy": 2.217642992734909, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.13416637480258942, "step": 14682 }, { "epoch": 0.6674545454545454, "grad_norm": 4.75, "grad_norm_var": 0.24459228515625, "learning_rate": 0.0001, "loss": 5.6129, "loss/crossentropy": 2.493801772594452, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16229823604226112, "step": 14684 }, { "epoch": 0.6675454545454546, "grad_norm": 5.625, "grad_norm_var": 0.23622639973958334, "learning_rate": 0.0001, "loss": 5.5001, "loss/crossentropy": 2.353171706199646, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16508278623223305, "step": 14686 }, { "epoch": 0.6676363636363636, "grad_norm": 4.78125, "grad_norm_var": 0.10230712890625, "learning_rate": 0.0001, "loss": 5.5619, "loss/crossentropy": 2.4851110577583313, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16080009937286377, "step": 14688 }, { "epoch": 0.6677272727272727, "grad_norm": 5.03125, "grad_norm_var": 0.10230712890625, "learning_rate": 0.0001, "loss": 5.7441, "loss/crossentropy": 2.6108075380325317, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1650853306055069, "step": 14690 }, { "epoch": 0.6678181818181819, "grad_norm": 5.03125, "grad_norm_var": 0.08411051432291666, "learning_rate": 0.0001, "loss": 5.8407, "loss/crossentropy": 2.690009295940399, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1664368100464344, "step": 14692 }, { "epoch": 0.6679090909090909, "grad_norm": 5.0, "grad_norm_var": 0.08137613932291667, "learning_rate": 0.0001, "loss": 5.6162, "loss/crossentropy": 2.5182636976242065, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16037816554307938, "step": 14694 }, { "epoch": 0.668, "grad_norm": 5.125, "grad_norm_var": 0.07727457682291666, "learning_rate": 0.0001, "loss": 5.5565, "loss/crossentropy": 2.4498541355133057, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16144826263189316, "step": 14696 }, { "epoch": 0.6680909090909091, "grad_norm": 4.75, "grad_norm_var": 0.07138264973958333, "learning_rate": 0.0001, "loss": 5.4898, "loss/crossentropy": 2.485405683517456, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1514110527932644, "step": 14698 }, { "epoch": 0.6681818181818182, "grad_norm": 5.25, "grad_norm_var": 0.055497233072916666, "learning_rate": 0.0001, "loss": 6.0108, "loss/crossentropy": 2.7887539863586426, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17220144718885422, "step": 14700 }, { "epoch": 0.6682727272727272, "grad_norm": 4.46875, "grad_norm_var": 0.085791015625, "learning_rate": 0.0001, "loss": 5.3844, "loss/crossentropy": 2.399520993232727, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15317891165614128, "step": 14702 }, { "epoch": 0.6683636363636364, "grad_norm": 4.84375, "grad_norm_var": 0.08469645182291667, "learning_rate": 0.0001, "loss": 5.3399, "loss/crossentropy": 2.3223689794540405, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15253232419490814, "step": 14704 }, { "epoch": 0.6684545454545454, "grad_norm": 5.25, "grad_norm_var": 0.09293212890625, "learning_rate": 0.0001, "loss": 5.8724, "loss/crossentropy": 2.70592337846756, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16898903995752335, "step": 14706 }, { "epoch": 0.6685454545454546, "grad_norm": 5.09375, "grad_norm_var": 0.09146728515625, "learning_rate": 0.0001, "loss": 5.2944, "loss/crossentropy": 2.3047215342521667, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15112097933888435, "step": 14708 }, { "epoch": 0.6686363636363636, "grad_norm": 5.34375, "grad_norm_var": 0.16028238932291666, "learning_rate": 0.0001, "loss": 5.6566, "loss/crossentropy": 2.529391646385193, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16291284188628197, "step": 14710 }, { "epoch": 0.6687272727272727, "grad_norm": 4.96875, "grad_norm_var": 0.16272379557291666, "learning_rate": 0.0001, "loss": 5.6186, "loss/crossentropy": 2.495147466659546, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16312279552221298, "step": 14712 }, { "epoch": 0.6688181818181819, "grad_norm": 4.78125, "grad_norm_var": 0.155712890625, "learning_rate": 0.0001, "loss": 5.3598, "loss/crossentropy": 2.3523505330085754, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15113554894924164, "step": 14714 }, { "epoch": 0.6689090909090909, "grad_norm": 5.28125, "grad_norm_var": 0.1845703125, "learning_rate": 0.0001, "loss": 5.5017, "loss/crossentropy": 2.4914302229881287, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15414700657129288, "step": 14716 }, { "epoch": 0.669, "grad_norm": 4.59375, "grad_norm_var": 0.15208333333333332, "learning_rate": 0.0001, "loss": 5.7675, "loss/crossentropy": 2.646491140127182, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16600725799798965, "step": 14718 }, { "epoch": 0.6690909090909091, "grad_norm": 5.0625, "grad_norm_var": 0.15266520182291668, "learning_rate": 0.0001, "loss": 5.6936, "loss/crossentropy": 2.5623375177383423, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16586025059223175, "step": 14720 }, { "epoch": 0.6691818181818182, "grad_norm": 4.9375, "grad_norm_var": 0.16756184895833334, "learning_rate": 0.0001, "loss": 5.2108, "loss/crossentropy": 2.3075430393218994, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.14579923450946808, "step": 14722 }, { "epoch": 0.6692727272727272, "grad_norm": 4.65625, "grad_norm_var": 0.1751953125, "learning_rate": 0.0001, "loss": 5.7555, "loss/crossentropy": 2.6116074323654175, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1682937666773796, "step": 14724 }, { "epoch": 0.6693636363636364, "grad_norm": 4.625, "grad_norm_var": 0.09381103515625, "learning_rate": 0.0001, "loss": 5.4043, "loss/crossentropy": 2.3865214586257935, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.15626663342118263, "step": 14726 }, { "epoch": 0.6694545454545454, "grad_norm": 4.5625, "grad_norm_var": 0.09241129557291666, "learning_rate": 0.0001, "loss": 5.2294, "loss/crossentropy": 2.3199295103549957, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.14485789090394974, "step": 14728 }, { "epoch": 0.6695454545454546, "grad_norm": 5.3125, "grad_norm_var": 0.11875, "learning_rate": 0.0001, "loss": 5.1824, "loss/crossentropy": 2.152264326810837, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1512589193880558, "step": 14730 }, { "epoch": 0.6696363636363636, "grad_norm": 5.5, "grad_norm_var": 0.12994384765625, "learning_rate": 0.0001, "loss": 5.9307, "loss/crossentropy": 2.7600613236427307, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16804517433047295, "step": 14732 }, { "epoch": 0.6697272727272727, "grad_norm": 4.53125, "grad_norm_var": 0.131884765625, "learning_rate": 0.0001, "loss": 5.5143, "loss/crossentropy": 2.4980695843696594, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15747914463281631, "step": 14734 }, { "epoch": 0.6698181818181819, "grad_norm": 5.125, "grad_norm_var": 0.12135009765625, "learning_rate": 0.0001, "loss": 5.5159, "loss/crossentropy": 2.4881858229637146, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.15725995227694511, "step": 14736 }, { "epoch": 0.6699090909090909, "grad_norm": 5.53125, "grad_norm_var": 0.15388997395833334, "learning_rate": 0.0001, "loss": 5.5949, "loss/crossentropy": 2.5038620233535767, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15832163393497467, "step": 14738 }, { "epoch": 0.67, "grad_norm": 4.71875, "grad_norm_var": 0.15507405598958332, "learning_rate": 0.0001, "loss": 5.5928, "loss/crossentropy": 2.5009512305259705, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16211550682783127, "step": 14740 }, { "epoch": 0.6700909090909091, "grad_norm": 5.0, "grad_norm_var": 0.17265218098958332, "learning_rate": 0.0001, "loss": 5.5216, "loss/crossentropy": 2.495528817176819, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.1551411636173725, "step": 14742 }, { "epoch": 0.6701818181818182, "grad_norm": 5.0, "grad_norm_var": 0.14478759765625, "learning_rate": 0.0001, "loss": 5.7446, "loss/crossentropy": 2.620486319065094, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1631913259625435, "step": 14744 }, { "epoch": 0.6702727272727272, "grad_norm": 4.71875, "grad_norm_var": 0.13280843098958334, "learning_rate": 0.0001, "loss": 5.7103, "loss/crossentropy": 2.6095163822174072, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1616438329219818, "step": 14746 }, { "epoch": 0.6703636363636364, "grad_norm": 4.71875, "grad_norm_var": 0.11864827473958334, "learning_rate": 0.0001, "loss": 5.4142, "loss/crossentropy": 2.431125223636627, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15104178711771965, "step": 14748 }, { "epoch": 0.6704545454545454, "grad_norm": 4.875, "grad_norm_var": 0.10982666015625, "learning_rate": 0.0001, "loss": 5.8449, "loss/crossentropy": 2.73796147108078, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.16518713906407356, "step": 14750 }, { "epoch": 0.6705454545454546, "grad_norm": 5.125, "grad_norm_var": 0.08391520182291666, "learning_rate": 0.0001, "loss": 5.5976, "loss/crossentropy": 2.54883474111557, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15643977001309395, "step": 14752 }, { "epoch": 0.6706363636363636, "grad_norm": 4.75, "grad_norm_var": 0.07200520833333333, "learning_rate": 0.0001, "loss": 5.0085, "loss/crossentropy": 2.161517083644867, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.13977921195328236, "step": 14754 }, { "epoch": 0.6707272727272727, "grad_norm": 4.78125, "grad_norm_var": 0.07408447265625, "learning_rate": 0.0001, "loss": 5.6369, "loss/crossentropy": 2.542103350162506, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16026446968317032, "step": 14756 }, { "epoch": 0.6708181818181819, "grad_norm": 8.375, "grad_norm_var": 0.8562337239583333, "learning_rate": 0.0001, "loss": 5.5705, "loss/crossentropy": 2.5307381749153137, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15749597176909447, "step": 14758 }, { "epoch": 0.6709090909090909, "grad_norm": 5.40625, "grad_norm_var": 0.86328125, "learning_rate": 0.0001, "loss": 5.6132, "loss/crossentropy": 2.4867268800735474, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1649920865893364, "step": 14760 }, { "epoch": 0.671, "grad_norm": 4.6875, "grad_norm_var": 0.8456380208333333, "learning_rate": 0.0001, "loss": 5.3226, "loss/crossentropy": 2.2674600481987, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15824555605649948, "step": 14762 }, { "epoch": 0.6710909090909091, "grad_norm": 4.65625, "grad_norm_var": 0.847900390625, "learning_rate": 0.0001, "loss": 5.3482, "loss/crossentropy": 2.351358026266098, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15261723846197128, "step": 14764 }, { "epoch": 0.6711818181818182, "grad_norm": 4.8125, "grad_norm_var": 0.86861572265625, "learning_rate": 0.0001, "loss": 5.4898, "loss/crossentropy": 2.535560190677643, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.14854972437024117, "step": 14766 }, { "epoch": 0.6712727272727272, "grad_norm": 4.75, "grad_norm_var": 0.8753865559895834, "learning_rate": 0.0001, "loss": 5.3989, "loss/crossentropy": 2.3662914633750916, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15521281585097313, "step": 14768 }, { "epoch": 0.6713636363636364, "grad_norm": 4.9375, "grad_norm_var": 0.8495930989583333, "learning_rate": 0.0001, "loss": 5.4331, "loss/crossentropy": 2.4199856519699097, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.1550198830664158, "step": 14770 }, { "epoch": 0.6714545454545454, "grad_norm": 4.65625, "grad_norm_var": 0.8547526041666667, "learning_rate": 0.0001, "loss": 5.5669, "loss/crossentropy": 2.4502986073493958, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16263531893491745, "step": 14772 }, { "epoch": 0.6715454545454546, "grad_norm": 5.34375, "grad_norm_var": 0.06808268229166667, "learning_rate": 0.0001, "loss": 5.8937, "loss/crossentropy": 2.719221353530884, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16724826022982597, "step": 14774 }, { "epoch": 0.6716363636363636, "grad_norm": 4.6875, "grad_norm_var": 0.052958170572916664, "learning_rate": 0.0001, "loss": 5.8613, "loss/crossentropy": 2.7312249541282654, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1649584360420704, "step": 14776 }, { "epoch": 0.6717272727272727, "grad_norm": 4.78125, "grad_norm_var": 0.060868326822916666, "learning_rate": 0.0001, "loss": 5.3249, "loss/crossentropy": 2.4034529626369476, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.14643876999616623, "step": 14778 }, { "epoch": 0.6718181818181819, "grad_norm": 4.625, "grad_norm_var": 0.06061197916666667, "learning_rate": 0.0001, "loss": 5.578, "loss/crossentropy": 2.5282622575759888, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1580955684185028, "step": 14780 }, { "epoch": 0.6719090909090909, "grad_norm": 4.59375, "grad_norm_var": 0.062235514322916664, "learning_rate": 0.0001, "loss": 5.8477, "loss/crossentropy": 2.6771205067634583, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16666514798998833, "step": 14782 }, { "epoch": 0.672, "grad_norm": 4.65625, "grad_norm_var": 0.07799479166666666, "learning_rate": 0.0001, "loss": 5.357, "loss/crossentropy": 2.3994439244270325, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.14985255897045135, "step": 14784 }, { "epoch": 0.6720909090909091, "grad_norm": 4.90625, "grad_norm_var": 0.084375, "learning_rate": 0.0001, "loss": 5.6827, "loss/crossentropy": 2.653828978538513, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1564064733684063, "step": 14786 }, { "epoch": 0.6721818181818182, "grad_norm": 4.5, "grad_norm_var": 0.088134765625, "learning_rate": 0.0001, "loss": 5.4341, "loss/crossentropy": 2.385296642780304, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15410209074616432, "step": 14788 }, { "epoch": 0.6722727272727272, "grad_norm": 4.65625, "grad_norm_var": 0.04420166015625, "learning_rate": 0.0001, "loss": 5.6054, "loss/crossentropy": 2.560108184814453, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1560966596007347, "step": 14790 }, { "epoch": 0.6723636363636364, "grad_norm": 4.875, "grad_norm_var": 0.044722493489583334, "learning_rate": 0.0001, "loss": 5.5479, "loss/crossentropy": 2.5007128715515137, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1568627804517746, "step": 14792 }, { "epoch": 0.6724545454545454, "grad_norm": 5.03125, "grad_norm_var": 0.04605712890625, "learning_rate": 0.0001, "loss": 5.7323, "loss/crossentropy": 2.6086812019348145, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16392884403467178, "step": 14794 }, { "epoch": 0.6725454545454546, "grad_norm": 4.0625, "grad_norm_var": 0.08261311848958333, "learning_rate": 0.0001, "loss": 4.8648, "loss/crossentropy": 2.1229419112205505, "loss/hidden": 1.431640625, "loss/jsd": 0.0, "loss/logits": 0.13101987540721893, "step": 14796 }, { "epoch": 0.6726363636363636, "grad_norm": 4.53125, "grad_norm_var": 0.0720703125, "learning_rate": 0.0001, "loss": 5.0562, "loss/crossentropy": 2.15819251537323, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1448834463953972, "step": 14798 }, { "epoch": 0.6727272727272727, "grad_norm": 5.03125, "grad_norm_var": 0.08508707682291666, "learning_rate": 0.0001, "loss": 5.6065, "loss/crossentropy": 2.5013455748558044, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16031523421406746, "step": 14800 }, { "epoch": 0.6728181818181819, "grad_norm": 4.53125, "grad_norm_var": 0.08255208333333333, "learning_rate": 0.0001, "loss": 5.5837, "loss/crossentropy": 2.5305891036987305, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15961020812392235, "step": 14802 }, { "epoch": 0.6729090909090909, "grad_norm": 4.65625, "grad_norm_var": 0.07649332682291667, "learning_rate": 0.0001, "loss": 5.3545, "loss/crossentropy": 2.3898249864578247, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.14842498302459717, "step": 14804 }, { "epoch": 0.673, "grad_norm": 4.59375, "grad_norm_var": 0.094921875, "learning_rate": 0.0001, "loss": 5.5336, "loss/crossentropy": 2.476577788591385, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1580447480082512, "step": 14806 }, { "epoch": 0.6730909090909091, "grad_norm": 4.78125, "grad_norm_var": 0.10868733723958333, "learning_rate": 0.0001, "loss": 5.6993, "loss/crossentropy": 2.6263473629951477, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.16101045161485672, "step": 14808 }, { "epoch": 0.6731818181818182, "grad_norm": 6.15625, "grad_norm_var": 2.150764973958333, "learning_rate": 0.0001, "loss": 5.9132, "loss/crossentropy": 2.64788556098938, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.1790732704102993, "step": 14810 }, { "epoch": 0.6732727272727272, "grad_norm": 6.625, "grad_norm_var": 2.1065388997395833, "learning_rate": 0.0001, "loss": 5.5264, "loss/crossentropy": 2.425367623567581, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16068832203745842, "step": 14812 }, { "epoch": 0.6733636363636364, "grad_norm": 4.875, "grad_norm_var": 2.063895670572917, "learning_rate": 0.0001, "loss": 5.3226, "loss/crossentropy": 2.30625382065773, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15534116327762604, "step": 14814 }, { "epoch": 0.6734545454545454, "grad_norm": 4.90625, "grad_norm_var": 2.102469889322917, "learning_rate": 0.0001, "loss": 5.3237, "loss/crossentropy": 2.374607741832733, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.14861800149083138, "step": 14816 }, { "epoch": 0.6735454545454546, "grad_norm": 5.15625, "grad_norm_var": 2.068229166666667, "learning_rate": 0.0001, "loss": 5.1954, "loss/crossentropy": 2.2406651973724365, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.14586044661700726, "step": 14818 }, { "epoch": 0.6736363636363636, "grad_norm": 5.09375, "grad_norm_var": 2.00220947265625, "learning_rate": 0.0001, "loss": 5.5817, "loss/crossentropy": 2.465458631515503, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16103416681289673, "step": 14820 }, { "epoch": 0.6737272727272727, "grad_norm": 5.0625, "grad_norm_var": 1.971875, "learning_rate": 0.0001, "loss": 5.909, "loss/crossentropy": 2.641572952270508, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17595941200852394, "step": 14822 }, { "epoch": 0.6738181818181819, "grad_norm": 5.0, "grad_norm_var": 1.9678995768229166, "learning_rate": 0.0001, "loss": 5.5933, "loss/crossentropy": 2.5282087922096252, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15749019756913185, "step": 14824 }, { "epoch": 0.6739090909090909, "grad_norm": 5.3125, "grad_norm_var": 0.27615559895833336, "learning_rate": 0.0001, "loss": 5.4989, "loss/crossentropy": 2.5026021599769592, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1531408168375492, "step": 14826 }, { "epoch": 0.674, "grad_norm": 6.09375, "grad_norm_var": 0.21454671223958333, "learning_rate": 0.0001, "loss": 6.0605, "loss/crossentropy": 2.7982600927352905, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.17719874903559685, "step": 14828 }, { "epoch": 0.6740909090909091, "grad_norm": 5.3125, "grad_norm_var": 0.19358317057291666, "learning_rate": 0.0001, "loss": 5.5363, "loss/crossentropy": 2.456417143344879, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15935982391238213, "step": 14830 }, { "epoch": 0.6741818181818182, "grad_norm": 4.9375, "grad_norm_var": 0.192431640625, "learning_rate": 0.0001, "loss": 5.5167, "loss/crossentropy": 2.495418429374695, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.15623139217495918, "step": 14832 }, { "epoch": 0.6742727272727272, "grad_norm": 4.9375, "grad_norm_var": 0.198681640625, "learning_rate": 0.0001, "loss": 5.6594, "loss/crossentropy": 2.542004704475403, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16369467973709106, "step": 14834 }, { "epoch": 0.6743636363636364, "grad_norm": 5.15625, "grad_norm_var": 0.91275634765625, "learning_rate": 0.0001, "loss": 5.1304, "loss/crossentropy": 2.171060025691986, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1490582786500454, "step": 14836 }, { "epoch": 0.6744545454545454, "grad_norm": 4.78125, "grad_norm_var": 0.910009765625, "learning_rate": 0.0001, "loss": 5.3238, "loss/crossentropy": 2.3311139941215515, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15356320142745972, "step": 14838 }, { "epoch": 0.6745454545454546, "grad_norm": 5.0, "grad_norm_var": 0.90640869140625, "learning_rate": 0.0001, "loss": 5.6963, "loss/crossentropy": 2.5634007453918457, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.163879182189703, "step": 14840 }, { "epoch": 0.6746363636363636, "grad_norm": 4.8125, "grad_norm_var": 0.9267862955729167, "learning_rate": 0.0001, "loss": 5.695, "loss/crossentropy": 2.647094964981079, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.1600625216960907, "step": 14842 }, { "epoch": 0.6747272727272727, "grad_norm": 5.0, "grad_norm_var": 0.86109619140625, "learning_rate": 0.0001, "loss": 5.7909, "loss/crossentropy": 2.6277424693107605, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17022358998656273, "step": 14844 }, { "epoch": 0.6748181818181819, "grad_norm": 4.9375, "grad_norm_var": 0.9246744791666667, "learning_rate": 0.0001, "loss": 5.194, "loss/crossentropy": 2.2414469122886658, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.14876921847462654, "step": 14846 }, { "epoch": 0.6749090909090909, "grad_norm": 4.875, "grad_norm_var": 0.926025390625, "learning_rate": 0.0001, "loss": 5.3815, "loss/crossentropy": 2.3841670155525208, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1524643339216709, "step": 14848 }, { "epoch": 0.675, "grad_norm": 4.875, "grad_norm_var": 0.938916015625, "learning_rate": 0.0001, "loss": 5.398, "loss/crossentropy": 2.3979039788246155, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.1548878401517868, "step": 14850 }, { "epoch": 0.6750909090909091, "grad_norm": 4.21875, "grad_norm_var": 0.07707926432291666, "learning_rate": 0.0001, "loss": 4.7381, "loss/crossentropy": 1.9580199718475342, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.13386878371238708, "step": 14852 }, { "epoch": 0.6751818181818182, "grad_norm": 5.03125, "grad_norm_var": 0.08802083333333334, "learning_rate": 0.0001, "loss": 5.8953, "loss/crossentropy": 2.699602425098419, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16937736421823502, "step": 14854 }, { "epoch": 0.6752727272727272, "grad_norm": 4.5625, "grad_norm_var": 0.08941650390625, "learning_rate": 0.0001, "loss": 5.0724, "loss/crossentropy": 2.1688239872455597, "loss/hidden": 1.435546875, "loss/jsd": 0.0, "loss/logits": 0.1468065343797207, "step": 14856 }, { "epoch": 0.6753636363636364, "grad_norm": 5.03125, "grad_norm_var": 0.09394124348958334, "learning_rate": 0.0001, "loss": 5.5113, "loss/crossentropy": 2.4745628237724304, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.15817086026072502, "step": 14858 }, { "epoch": 0.6754545454545454, "grad_norm": 4.3125, "grad_norm_var": 0.12590738932291667, "learning_rate": 0.0001, "loss": 5.3573, "loss/crossentropy": 2.3403390049934387, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1522846333682537, "step": 14860 }, { "epoch": 0.6755454545454546, "grad_norm": 4.625, "grad_norm_var": 1.2266886393229166, "learning_rate": 0.0001, "loss": 5.2157, "loss/crossentropy": 2.1624057292938232, "loss/hidden": 1.529296875, "loss/jsd": 0.0, "loss/logits": 0.1523999720811844, "step": 14862 }, { "epoch": 0.6756363636363636, "grad_norm": 5.0625, "grad_norm_var": 1.2190755208333333, "learning_rate": 0.0001, "loss": 5.6943, "loss/crossentropy": 2.569103419780731, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16368817165493965, "step": 14864 }, { "epoch": 0.6757272727272727, "grad_norm": 4.84375, "grad_norm_var": 1.20045166015625, "learning_rate": 0.0001, "loss": 5.2979, "loss/crossentropy": 2.2738258242607117, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.152802973985672, "step": 14866 }, { "epoch": 0.6758181818181819, "grad_norm": 5.4375, "grad_norm_var": 1.162353515625, "learning_rate": 0.0001, "loss": 5.447, "loss/crossentropy": 2.285519987344742, "loss/hidden": 1.556640625, "loss/jsd": 0.0, "loss/logits": 0.16048305481672287, "step": 14868 }, { "epoch": 0.6759090909090909, "grad_norm": 6.09375, "grad_norm_var": 1.2190755208333333, "learning_rate": 0.0001, "loss": 5.7402, "loss/crossentropy": 2.56396746635437, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16586197912693024, "step": 14870 }, { "epoch": 0.676, "grad_norm": 5.46875, "grad_norm_var": 1.1478800455729166, "learning_rate": 0.0001, "loss": 5.5837, "loss/crossentropy": 2.5111696124076843, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1594051495194435, "step": 14872 }, { "epoch": 0.6760909090909091, "grad_norm": 5.71875, "grad_norm_var": 1.1305989583333333, "learning_rate": 0.0001, "loss": 5.7749, "loss/crossentropy": 2.638635277748108, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16343194246292114, "step": 14874 }, { "epoch": 0.6761818181818182, "grad_norm": 4.90625, "grad_norm_var": 1.1070597330729166, "learning_rate": 0.0001, "loss": 5.3968, "loss/crossentropy": 2.450125992298126, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.14993639662861824, "step": 14876 }, { "epoch": 0.6762727272727272, "grad_norm": 4.40625, "grad_norm_var": 0.17649332682291666, "learning_rate": 0.0001, "loss": 5.2392, "loss/crossentropy": 2.270824819803238, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1507445126771927, "step": 14878 }, { "epoch": 0.6763636363636364, "grad_norm": 5.125, "grad_norm_var": 0.17274983723958334, "learning_rate": 0.0001, "loss": 5.9242, "loss/crossentropy": 2.6987059116363525, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17098333686590195, "step": 14880 }, { "epoch": 0.6764545454545454, "grad_norm": 5.34375, "grad_norm_var": 0.16838785807291667, "learning_rate": 0.0001, "loss": 5.8376, "loss/crossentropy": 2.6837639808654785, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.16596636921167374, "step": 14882 }, { "epoch": 0.6765454545454546, "grad_norm": 5.125, "grad_norm_var": 0.15478108723958334, "learning_rate": 0.0001, "loss": 5.7921, "loss/crossentropy": 2.6174458265304565, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.17000731080770493, "step": 14884 }, { "epoch": 0.6766363636363636, "grad_norm": 5.03125, "grad_norm_var": 0.10048421223958333, "learning_rate": 0.0001, "loss": 5.4444, "loss/crossentropy": 2.4098557233810425, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1577465832233429, "step": 14886 }, { "epoch": 0.6767272727272727, "grad_norm": 4.90625, "grad_norm_var": 0.09026285807291666, "learning_rate": 0.0001, "loss": 5.4009, "loss/crossentropy": 2.316635489463806, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1592080481350422, "step": 14888 }, { "epoch": 0.6768181818181818, "grad_norm": 5.09375, "grad_norm_var": 0.05553385416666667, "learning_rate": 0.0001, "loss": 5.7757, "loss/crossentropy": 2.6555426120758057, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1653316766023636, "step": 14890 }, { "epoch": 0.6769090909090909, "grad_norm": 4.75, "grad_norm_var": 0.0529296875, "learning_rate": 0.0001, "loss": 5.3007, "loss/crossentropy": 2.307871997356415, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15221336111426353, "step": 14892 }, { "epoch": 0.677, "grad_norm": 4.78125, "grad_norm_var": 0.04107666015625, "learning_rate": 0.0001, "loss": 4.9556, "loss/crossentropy": 2.051932781934738, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.14114334806799889, "step": 14894 }, { "epoch": 0.6770909090909091, "grad_norm": 5.0, "grad_norm_var": 0.040478515625, "learning_rate": 0.0001, "loss": 5.8142, "loss/crossentropy": 2.5898659229278564, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1722402125597, "step": 14896 }, { "epoch": 0.6771818181818182, "grad_norm": 4.65625, "grad_norm_var": 0.031571451822916666, "learning_rate": 0.0001, "loss": 5.6724, "loss/crossentropy": 2.614823579788208, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15868839249014854, "step": 14898 }, { "epoch": 0.6772727272727272, "grad_norm": 5.125, "grad_norm_var": 0.03365885416666667, "learning_rate": 0.0001, "loss": 5.858, "loss/crossentropy": 2.708995521068573, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1654854528605938, "step": 14900 }, { "epoch": 0.6773636363636364, "grad_norm": 4.46875, "grad_norm_var": 0.04241129557291667, "learning_rate": 0.0001, "loss": 5.7608, "loss/crossentropy": 2.6420387029647827, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16480176523327827, "step": 14902 }, { "epoch": 0.6774545454545454, "grad_norm": 4.8125, "grad_norm_var": 0.037890625, "learning_rate": 0.0001, "loss": 5.4816, "loss/crossentropy": 2.4349318742752075, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.1595461219549179, "step": 14904 }, { "epoch": 0.6775454545454546, "grad_norm": 4.71875, "grad_norm_var": 0.035009765625, "learning_rate": 0.0001, "loss": 5.591, "loss/crossentropy": 2.6034812927246094, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1546124964952469, "step": 14906 }, { "epoch": 0.6776363636363636, "grad_norm": 5.0, "grad_norm_var": 0.043680826822916664, "learning_rate": 0.0001, "loss": 5.3594, "loss/crossentropy": 2.356455087661743, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.1528318841010332, "step": 14908 }, { "epoch": 0.6777272727272727, "grad_norm": 5.0625, "grad_norm_var": 0.0830078125, "learning_rate": 0.0001, "loss": 5.8976, "loss/crossentropy": 2.6962955594062805, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.16993698477745056, "step": 14910 }, { "epoch": 0.6778181818181818, "grad_norm": 5.21875, "grad_norm_var": 0.08694254557291667, "learning_rate": 0.0001, "loss": 5.6224, "loss/crossentropy": 2.5482494235038757, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16132031753659248, "step": 14912 }, { "epoch": 0.6779090909090909, "grad_norm": 4.9375, "grad_norm_var": 0.08173421223958334, "learning_rate": 0.0001, "loss": 5.7701, "loss/crossentropy": 2.631121277809143, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16741415858268738, "step": 14914 }, { "epoch": 0.678, "grad_norm": 5.09375, "grad_norm_var": 0.08046468098958333, "learning_rate": 0.0001, "loss": 5.6389, "loss/crossentropy": 2.561305344104767, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1612737476825714, "step": 14916 }, { "epoch": 0.6780909090909091, "grad_norm": 5.1875, "grad_norm_var": 0.06756184895833334, "learning_rate": 0.0001, "loss": 5.8728, "loss/crossentropy": 2.6863314509391785, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17137952521443367, "step": 14918 }, { "epoch": 0.6781818181818182, "grad_norm": 4.40625, "grad_norm_var": 0.09524332682291667, "learning_rate": 0.0001, "loss": 5.4751, "loss/crossentropy": 2.4379305243492126, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15664466470479965, "step": 14920 }, { "epoch": 0.6782727272727272, "grad_norm": 4.5625, "grad_norm_var": 0.110400390625, "learning_rate": 0.0001, "loss": 5.4642, "loss/crossentropy": 2.450079381465912, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.15551115572452545, "step": 14922 }, { "epoch": 0.6783636363636364, "grad_norm": 4.8125, "grad_norm_var": 0.098681640625, "learning_rate": 0.0001, "loss": 5.382, "loss/crossentropy": 2.3736481368541718, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15434961020946503, "step": 14924 }, { "epoch": 0.6784545454545454, "grad_norm": 5.375, "grad_norm_var": 0.11184488932291667, "learning_rate": 0.0001, "loss": 5.3898, "loss/crossentropy": 2.332248866558075, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1567281410098076, "step": 14926 }, { "epoch": 0.6785454545454546, "grad_norm": 4.71875, "grad_norm_var": 0.11875, "learning_rate": 0.0001, "loss": 5.7708, "loss/crossentropy": 2.636729598045349, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16262968629598618, "step": 14928 }, { "epoch": 0.6786363636363636, "grad_norm": 5.09375, "grad_norm_var": 0.12224934895833334, "learning_rate": 0.0001, "loss": 5.7605, "loss/crossentropy": 2.6908512711524963, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1591094583272934, "step": 14930 }, { "epoch": 0.6787272727272727, "grad_norm": 4.46875, "grad_norm_var": 0.14592692057291667, "learning_rate": 0.0001, "loss": 5.5923, "loss/crossentropy": 2.5667012333869934, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15470550954341888, "step": 14932 }, { "epoch": 0.6788181818181818, "grad_norm": 5.3125, "grad_norm_var": 0.15896809895833333, "learning_rate": 0.0001, "loss": 5.6414, "loss/crossentropy": 2.5468209981918335, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16063199006021023, "step": 14934 }, { "epoch": 0.6789090909090909, "grad_norm": 4.71875, "grad_norm_var": 0.14120686848958333, "learning_rate": 0.0001, "loss": 5.3909, "loss/crossentropy": 2.389750897884369, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15558377653360367, "step": 14936 }, { "epoch": 0.679, "grad_norm": 5.21875, "grad_norm_var": 0.12522379557291666, "learning_rate": 0.0001, "loss": 5.3307, "loss/crossentropy": 2.335719883441925, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15223658457398415, "step": 14938 }, { "epoch": 0.6790909090909091, "grad_norm": 5.03125, "grad_norm_var": 0.129541015625, "learning_rate": 0.0001, "loss": 5.4792, "loss/crossentropy": 2.49638968706131, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15042498335242271, "step": 14940 }, { "epoch": 0.6791818181818182, "grad_norm": 5.125, "grad_norm_var": 0.08878580729166667, "learning_rate": 0.0001, "loss": 5.8703, "loss/crossentropy": 2.6880595088005066, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16959556937217712, "step": 14942 }, { "epoch": 0.6792727272727273, "grad_norm": 5.375, "grad_norm_var": 0.09416910807291666, "learning_rate": 0.0001, "loss": 5.816, "loss/crossentropy": 2.5439570546150208, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17564282193779945, "step": 14944 }, { "epoch": 0.6793636363636364, "grad_norm": 4.8125, "grad_norm_var": 0.094140625, "learning_rate": 0.0001, "loss": 5.8455, "loss/crossentropy": 2.6634727716445923, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.1695687733590603, "step": 14946 }, { "epoch": 0.6794545454545454, "grad_norm": 4.5625, "grad_norm_var": 0.0896484375, "learning_rate": 0.0001, "loss": 5.5508, "loss/crossentropy": 2.474051296710968, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1607980914413929, "step": 14948 }, { "epoch": 0.6795454545454546, "grad_norm": 5.03125, "grad_norm_var": 0.12102864583333334, "learning_rate": 0.0001, "loss": 5.7661, "loss/crossentropy": 2.5782328844070435, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16976484283804893, "step": 14950 }, { "epoch": 0.6796363636363636, "grad_norm": 5.15625, "grad_norm_var": 0.11474202473958334, "learning_rate": 0.0001, "loss": 5.6002, "loss/crossentropy": 2.5298526883125305, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15742451697587967, "step": 14952 }, { "epoch": 0.6797272727272727, "grad_norm": 4.625, "grad_norm_var": 0.12607014973958333, "learning_rate": 0.0001, "loss": 5.1383, "loss/crossentropy": 2.219993829727173, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.1459367386996746, "step": 14954 }, { "epoch": 0.6798181818181818, "grad_norm": 4.53125, "grad_norm_var": 0.13922119140625, "learning_rate": 0.0001, "loss": 5.3967, "loss/crossentropy": 2.340772360563278, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15695981681346893, "step": 14956 }, { "epoch": 0.6799090909090909, "grad_norm": 4.4375, "grad_norm_var": 0.15126546223958334, "learning_rate": 0.0001, "loss": 5.1067, "loss/crossentropy": 2.2492027580738068, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.14122144132852554, "step": 14958 }, { "epoch": 0.68, "grad_norm": 5.125, "grad_norm_var": 0.12493082682291666, "learning_rate": 0.0001, "loss": 6.0761, "loss/crossentropy": 2.8164268732070923, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.177141010761261, "step": 14960 }, { "epoch": 0.6800909090909091, "grad_norm": 4.9375, "grad_norm_var": 0.11910400390625, "learning_rate": 0.0001, "loss": 5.0958, "loss/crossentropy": 2.190837800502777, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.1442045047879219, "step": 14962 }, { "epoch": 0.6801818181818182, "grad_norm": 4.25, "grad_norm_var": 0.133447265625, "learning_rate": 0.0001, "loss": 5.204, "loss/crossentropy": 2.2545872926712036, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.14513596892356873, "step": 14964 }, { "epoch": 0.6802727272727273, "grad_norm": 4.59375, "grad_norm_var": 0.067822265625, "learning_rate": 0.0001, "loss": 4.9893, "loss/crossentropy": 2.109420597553253, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.14248377457261086, "step": 14966 }, { "epoch": 0.6803636363636364, "grad_norm": 4.625, "grad_norm_var": 0.05318603515625, "learning_rate": 0.0001, "loss": 5.1098, "loss/crossentropy": 2.199271023273468, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.14222596026957035, "step": 14968 }, { "epoch": 0.6804545454545454, "grad_norm": 5.09375, "grad_norm_var": 0.06484375, "learning_rate": 0.0001, "loss": 5.4174, "loss/crossentropy": 2.395630180835724, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.156475231051445, "step": 14970 }, { "epoch": 0.6805454545454546, "grad_norm": 4.78125, "grad_norm_var": 0.078125, "learning_rate": 0.0001, "loss": 5.4556, "loss/crossentropy": 2.415020674467087, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15640535205602646, "step": 14972 }, { "epoch": 0.6806363636363636, "grad_norm": 5.0625, "grad_norm_var": 0.08420817057291667, "learning_rate": 0.0001, "loss": 5.4107, "loss/crossentropy": 2.3671417236328125, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15748325362801552, "step": 14974 }, { "epoch": 0.6807272727272727, "grad_norm": 5.5625, "grad_norm_var": 0.11555989583333333, "learning_rate": 0.0001, "loss": 5.6522, "loss/crossentropy": 2.586203873157501, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.1564052663743496, "step": 14976 }, { "epoch": 0.6808181818181818, "grad_norm": 4.96875, "grad_norm_var": 0.12459309895833333, "learning_rate": 0.0001, "loss": 5.301, "loss/crossentropy": 2.3432498276233673, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.14772344380617142, "step": 14978 }, { "epoch": 0.6809090909090909, "grad_norm": 4.84375, "grad_norm_var": 0.10640869140625, "learning_rate": 0.0001, "loss": 5.5588, "loss/crossentropy": 2.5540916323661804, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15398849919438362, "step": 14980 }, { "epoch": 0.681, "grad_norm": 5.25, "grad_norm_var": 0.0859375, "learning_rate": 0.0001, "loss": 5.7826, "loss/crossentropy": 2.703576683998108, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1625860445201397, "step": 14982 }, { "epoch": 0.6810909090909091, "grad_norm": 5.78125, "grad_norm_var": 0.38088785807291664, "learning_rate": 0.0001, "loss": 5.6926, "loss/crossentropy": 2.5570316910743713, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1612122505903244, "step": 14984 }, { "epoch": 0.6811818181818182, "grad_norm": 5.25, "grad_norm_var": 0.40507405598958335, "learning_rate": 0.0001, "loss": 5.5291, "loss/crossentropy": 2.49503356218338, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15457841008901596, "step": 14986 }, { "epoch": 0.6812727272727273, "grad_norm": 4.90625, "grad_norm_var": 0.4236979166666667, "learning_rate": 0.0001, "loss": 5.5553, "loss/crossentropy": 2.4643154740333557, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.16319508850574493, "step": 14988 }, { "epoch": 0.6813636363636364, "grad_norm": 4.78125, "grad_norm_var": 0.4312337239583333, "learning_rate": 0.0001, "loss": 5.6492, "loss/crossentropy": 2.5782920718193054, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16021272167563438, "step": 14990 }, { "epoch": 0.6814545454545454, "grad_norm": 5.34375, "grad_norm_var": 0.4251302083333333, "learning_rate": 0.0001, "loss": 5.6687, "loss/crossentropy": 2.5417363047599792, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16602082177996635, "step": 14992 }, { "epoch": 0.6815454545454546, "grad_norm": 4.625, "grad_norm_var": 0.4293904622395833, "learning_rate": 0.0001, "loss": 5.3926, "loss/crossentropy": 2.423864960670471, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.14980093762278557, "step": 14994 }, { "epoch": 0.6816363636363636, "grad_norm": 4.59375, "grad_norm_var": 0.45299072265625, "learning_rate": 0.0001, "loss": 5.4817, "loss/crossentropy": 2.4850744009017944, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15298421680927277, "step": 14996 }, { "epoch": 0.6817272727272727, "grad_norm": 4.78125, "grad_norm_var": 0.45084228515625, "learning_rate": 0.0001, "loss": 5.6664, "loss/crossentropy": 2.5292633771896362, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.16117523610591888, "step": 14998 }, { "epoch": 0.6818181818181818, "grad_norm": 5.53125, "grad_norm_var": 0.13983968098958333, "learning_rate": 0.0001, "loss": 5.4729, "loss/crossentropy": 2.499454975128174, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15066300332546234, "step": 15000 }, { "epoch": 0.6819090909090909, "grad_norm": 4.375, "grad_norm_var": 0.13391927083333333, "learning_rate": 0.0001, "loss": 5.4734, "loss/crossentropy": 2.4397996068000793, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15629408694803715, "step": 15002 }, { "epoch": 0.682, "grad_norm": 5.15625, "grad_norm_var": 0.13084309895833332, "learning_rate": 0.0001, "loss": 5.3708, "loss/crossentropy": 2.443451464176178, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.14644931629300117, "step": 15004 }, { "epoch": 0.6820909090909091, "grad_norm": 4.84375, "grad_norm_var": 0.13189697265625, "learning_rate": 0.0001, "loss": 5.6987, "loss/crossentropy": 2.585091471672058, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16429346054792404, "step": 15006 }, { "epoch": 0.6821818181818182, "grad_norm": 4.8125, "grad_norm_var": 0.11702067057291667, "learning_rate": 0.0001, "loss": 5.5305, "loss/crossentropy": 2.50614595413208, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15575288236141205, "step": 15008 }, { "epoch": 0.6822727272727273, "grad_norm": 6.59375, "grad_norm_var": 0.2782185872395833, "learning_rate": 0.0001, "loss": 5.4595, "loss/crossentropy": 2.372732937335968, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1606285236775875, "step": 15010 }, { "epoch": 0.6823636363636364, "grad_norm": 4.875, "grad_norm_var": 0.2740234375, "learning_rate": 0.0001, "loss": 5.1975, "loss/crossentropy": 2.253134101629257, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.14600179344415665, "step": 15012 }, { "epoch": 0.6824545454545454, "grad_norm": 4.65625, "grad_norm_var": 0.27909749348958335, "learning_rate": 0.0001, "loss": 5.6167, "loss/crossentropy": 2.544855058193207, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16304529830813408, "step": 15014 }, { "epoch": 0.6825454545454546, "grad_norm": 4.875, "grad_norm_var": 0.25198160807291664, "learning_rate": 0.0001, "loss": 5.7085, "loss/crossentropy": 2.686085522174835, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1577080711722374, "step": 15016 }, { "epoch": 0.6826363636363636, "grad_norm": 4.84375, "grad_norm_var": 0.22981363932291668, "learning_rate": 0.0001, "loss": 5.4138, "loss/crossentropy": 2.38978773355484, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15591515228152275, "step": 15018 }, { "epoch": 0.6827272727272727, "grad_norm": 4.9375, "grad_norm_var": 0.22740478515625, "learning_rate": 0.0001, "loss": 5.9393, "loss/crossentropy": 2.742184042930603, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17166420072317123, "step": 15020 }, { "epoch": 0.6828181818181818, "grad_norm": 5.59375, "grad_norm_var": 0.26711832682291664, "learning_rate": 0.0001, "loss": 5.174, "loss/crossentropy": 2.2696699798107147, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.14317166060209274, "step": 15022 }, { "epoch": 0.6829090909090909, "grad_norm": 4.9375, "grad_norm_var": 0.26763916015625, "learning_rate": 0.0001, "loss": 5.4436, "loss/crossentropy": 2.4414515495300293, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.15509294345974922, "step": 15024 }, { "epoch": 0.683, "grad_norm": 4.96875, "grad_norm_var": 0.07623291015625, "learning_rate": 0.0001, "loss": 5.8677, "loss/crossentropy": 2.6623284220695496, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.17307787016034126, "step": 15026 }, { "epoch": 0.6830909090909091, "grad_norm": 4.625, "grad_norm_var": 0.0681640625, "learning_rate": 0.0001, "loss": 5.1339, "loss/crossentropy": 2.234265089035034, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.1444525085389614, "step": 15028 }, { "epoch": 0.6831818181818182, "grad_norm": 4.75, "grad_norm_var": 0.06438802083333334, "learning_rate": 0.0001, "loss": 5.2517, "loss/crossentropy": 2.267657697200775, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1530958004295826, "step": 15030 }, { "epoch": 0.6832727272727273, "grad_norm": 4.9375, "grad_norm_var": 0.06972249348958333, "learning_rate": 0.0001, "loss": 5.8734, "loss/crossentropy": 2.755321502685547, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.16747689247131348, "step": 15032 }, { "epoch": 0.6833636363636364, "grad_norm": 4.46875, "grad_norm_var": 0.08631184895833334, "learning_rate": 0.0001, "loss": 5.4645, "loss/crossentropy": 2.4688007831573486, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.1544530764222145, "step": 15034 }, { "epoch": 0.6834545454545454, "grad_norm": 4.90625, "grad_norm_var": 0.09416910807291666, "learning_rate": 0.0001, "loss": 5.4365, "loss/crossentropy": 2.424635797739029, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.15645525604486465, "step": 15036 }, { "epoch": 0.6835454545454546, "grad_norm": 5.1875, "grad_norm_var": 0.08006184895833333, "learning_rate": 0.0001, "loss": 5.6428, "loss/crossentropy": 2.5597403049468994, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.15693595260381699, "step": 15038 }, { "epoch": 0.6836363636363636, "grad_norm": 4.46875, "grad_norm_var": 0.08704020182291666, "learning_rate": 0.0001, "loss": 5.8631, "loss/crossentropy": 2.7387152910232544, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16575715318322182, "step": 15040 }, { "epoch": 0.6837272727272727, "grad_norm": 4.71875, "grad_norm_var": 0.09146728515625, "learning_rate": 0.0001, "loss": 5.1112, "loss/crossentropy": 2.2403500378131866, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.14196569845080376, "step": 15042 }, { "epoch": 0.6838181818181818, "grad_norm": 5.09375, "grad_norm_var": 0.097265625, "learning_rate": 0.0001, "loss": 5.626, "loss/crossentropy": 2.500198006629944, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16394656896591187, "step": 15044 }, { "epoch": 0.6839090909090909, "grad_norm": 4.71875, "grad_norm_var": 0.09763997395833333, "learning_rate": 0.0001, "loss": 5.5385, "loss/crossentropy": 2.489265501499176, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15765491500496864, "step": 15046 }, { "epoch": 0.684, "grad_norm": 4.5, "grad_norm_var": 0.09853108723958333, "learning_rate": 0.0001, "loss": 5.3554, "loss/crossentropy": 2.399509608745575, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.14930225163698196, "step": 15048 }, { "epoch": 0.6840909090909091, "grad_norm": 5.59375, "grad_norm_var": 0.12928059895833333, "learning_rate": 0.0001, "loss": 5.5699, "loss/crossentropy": 2.4638743698596954, "loss/hidden": 1.525390625, "loss/jsd": 0.0, "loss/logits": 0.15805918164551258, "step": 15050 }, { "epoch": 0.6841818181818182, "grad_norm": 4.6875, "grad_norm_var": 0.120166015625, "learning_rate": 0.0001, "loss": 5.506, "loss/crossentropy": 2.499079644680023, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1522573009133339, "step": 15052 }, { "epoch": 0.6842727272727273, "grad_norm": 4.53125, "grad_norm_var": 0.08828125, "learning_rate": 0.0001, "loss": 5.4799, "loss/crossentropy": 2.429978311061859, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15869877859950066, "step": 15054 }, { "epoch": 0.6843636363636364, "grad_norm": 4.71875, "grad_norm_var": 0.08013916015625, "learning_rate": 0.0001, "loss": 5.5092, "loss/crossentropy": 2.45806485414505, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1590244099497795, "step": 15056 }, { "epoch": 0.6844545454545454, "grad_norm": 4.90625, "grad_norm_var": 0.07779541015625, "learning_rate": 0.0001, "loss": 5.815, "loss/crossentropy": 2.6889867186546326, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16279328987002373, "step": 15058 }, { "epoch": 0.6845454545454546, "grad_norm": 4.78125, "grad_norm_var": 0.072900390625, "learning_rate": 0.0001, "loss": 5.6021, "loss/crossentropy": 2.536576569080353, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15987011045217514, "step": 15060 }, { "epoch": 0.6846363636363636, "grad_norm": 4.625, "grad_norm_var": 0.07486572265625, "learning_rate": 0.0001, "loss": 5.4631, "loss/crossentropy": 2.443257451057434, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15316038206219673, "step": 15062 }, { "epoch": 0.6847272727272727, "grad_norm": 4.625, "grad_norm_var": 0.08271077473958334, "learning_rate": 0.0001, "loss": 5.5009, "loss/crossentropy": 2.4705925583839417, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.156933955848217, "step": 15064 }, { "epoch": 0.6848181818181818, "grad_norm": 4.90625, "grad_norm_var": 0.042822265625, "learning_rate": 0.0001, "loss": 5.5257, "loss/crossentropy": 2.455219805240631, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15841132774949074, "step": 15066 }, { "epoch": 0.6849090909090909, "grad_norm": 4.75, "grad_norm_var": 0.041259765625, "learning_rate": 0.0001, "loss": 5.4411, "loss/crossentropy": 2.3986820578575134, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.1567809022963047, "step": 15068 }, { "epoch": 0.685, "grad_norm": 4.84375, "grad_norm_var": 0.03179931640625, "learning_rate": 0.0001, "loss": 5.6975, "loss/crossentropy": 2.5687695145606995, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16600165516138077, "step": 15070 }, { "epoch": 0.6850909090909091, "grad_norm": 5.15625, "grad_norm_var": 0.03292643229166667, "learning_rate": 0.0001, "loss": 5.5764, "loss/crossentropy": 2.5185726583004, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15753916278481483, "step": 15072 }, { "epoch": 0.6851818181818182, "grad_norm": 4.875, "grad_norm_var": 0.031966145833333334, "learning_rate": 0.0001, "loss": 5.7005, "loss/crossentropy": 2.546187460422516, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16699539124965668, "step": 15074 }, { "epoch": 0.6852727272727273, "grad_norm": 4.96875, "grad_norm_var": 0.03954671223958333, "learning_rate": 0.0001, "loss": 5.5273, "loss/crossentropy": 2.499878764152527, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.1556677259504795, "step": 15076 }, { "epoch": 0.6853636363636364, "grad_norm": 4.625, "grad_norm_var": 0.04016520182291667, "learning_rate": 0.0001, "loss": 5.6285, "loss/crossentropy": 2.517319977283478, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1620922815054655, "step": 15078 }, { "epoch": 0.6854545454545454, "grad_norm": 4.9375, "grad_norm_var": 0.040755208333333334, "learning_rate": 0.0001, "loss": 4.9036, "loss/crossentropy": 2.1094295382499695, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1364511474967003, "step": 15080 }, { "epoch": 0.6855454545454546, "grad_norm": 4.1875, "grad_norm_var": 0.0736328125, "learning_rate": 0.0001, "loss": 5.1417, "loss/crossentropy": 2.284202367067337, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.14141416177153587, "step": 15082 }, { "epoch": 0.6856363636363636, "grad_norm": 4.625, "grad_norm_var": 0.07415364583333334, "learning_rate": 0.0001, "loss": 5.4058, "loss/crossentropy": 2.3629150986671448, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1570240817964077, "step": 15084 }, { "epoch": 0.6857272727272727, "grad_norm": 4.15625, "grad_norm_var": 0.10305989583333333, "learning_rate": 0.0001, "loss": 5.501, "loss/crossentropy": 2.531556159257889, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1535816676914692, "step": 15086 }, { "epoch": 0.6858181818181818, "grad_norm": 5.09375, "grad_norm_var": 0.35705973307291666, "learning_rate": 0.0001, "loss": 4.9636, "loss/crossentropy": 2.078034430742264, "loss/hidden": 1.439453125, "loss/jsd": 0.0, "loss/logits": 0.1446160338819027, "step": 15088 }, { "epoch": 0.6859090909090909, "grad_norm": 5.34375, "grad_norm_var": 0.37382405598958335, "learning_rate": 0.0001, "loss": 5.6297, "loss/crossentropy": 2.4608113765716553, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16786768659949303, "step": 15090 }, { "epoch": 0.686, "grad_norm": 4.84375, "grad_norm_var": 0.3697916666666667, "learning_rate": 0.0001, "loss": 5.6156, "loss/crossentropy": 2.5847169160842896, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15504619106650352, "step": 15092 }, { "epoch": 0.6860909090909091, "grad_norm": 4.96875, "grad_norm_var": 0.3653483072916667, "learning_rate": 0.0001, "loss": 5.4817, "loss/crossentropy": 2.4908827543258667, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.15435537695884705, "step": 15094 }, { "epoch": 0.6861818181818182, "grad_norm": 5.9375, "grad_norm_var": 0.41373291015625, "learning_rate": 0.0001, "loss": 5.7328, "loss/crossentropy": 2.60941743850708, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16429101675748825, "step": 15096 }, { "epoch": 0.6862727272727273, "grad_norm": 4.90625, "grad_norm_var": 0.8656534830729167, "learning_rate": 0.0001, "loss": 5.6279, "loss/crossentropy": 2.452177256345749, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16561734676361084, "step": 15098 }, { "epoch": 0.6863636363636364, "grad_norm": 5.59375, "grad_norm_var": 0.847900390625, "learning_rate": 0.0001, "loss": 5.7729, "loss/crossentropy": 2.6640750765800476, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16127722337841988, "step": 15100 }, { "epoch": 0.6864545454545454, "grad_norm": 4.75, "grad_norm_var": 0.7725260416666667, "learning_rate": 0.0001, "loss": 5.3351, "loss/crossentropy": 2.3000391721725464, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15526407584547997, "step": 15102 }, { "epoch": 0.6865454545454546, "grad_norm": 4.875, "grad_norm_var": 0.6723592122395833, "learning_rate": 0.0001, "loss": 5.0816, "loss/crossentropy": 2.1997823119163513, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.14052722416818142, "step": 15104 }, { "epoch": 0.6866363636363636, "grad_norm": 4.90625, "grad_norm_var": 0.6991536458333333, "learning_rate": 0.0001, "loss": 5.2247, "loss/crossentropy": 2.297314941883087, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1470322199165821, "step": 15106 }, { "epoch": 0.6867272727272727, "grad_norm": 4.5625, "grad_norm_var": 0.70445556640625, "learning_rate": 0.0001, "loss": 5.3334, "loss/crossentropy": 2.3365259170532227, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1492919586598873, "step": 15108 }, { "epoch": 0.6868181818181818, "grad_norm": 5.0, "grad_norm_var": 0.7025675455729167, "learning_rate": 0.0001, "loss": 5.7203, "loss/crossentropy": 2.619702100753784, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1610342673957348, "step": 15110 }, { "epoch": 0.6869090909090909, "grad_norm": 5.40625, "grad_norm_var": 0.6654256184895834, "learning_rate": 0.0001, "loss": 5.7116, "loss/crossentropy": 2.58221435546875, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16430460289120674, "step": 15112 }, { "epoch": 0.687, "grad_norm": 4.75, "grad_norm_var": 0.09479166666666666, "learning_rate": 0.0001, "loss": 5.5789, "loss/crossentropy": 2.584923505783081, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.15388642624020576, "step": 15114 }, { "epoch": 0.6870909090909091, "grad_norm": 5.21875, "grad_norm_var": 0.07667643229166667, "learning_rate": 0.0001, "loss": 5.32, "loss/crossentropy": 2.2886132895946503, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1521605271846056, "step": 15116 }, { "epoch": 0.6871818181818182, "grad_norm": 4.21875, "grad_norm_var": 0.10406494140625, "learning_rate": 0.0001, "loss": 5.4652, "loss/crossentropy": 2.450714409351349, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15359895303845406, "step": 15118 }, { "epoch": 0.6872727272727273, "grad_norm": 9.5, "grad_norm_var": 1.47808837890625, "learning_rate": 0.0001, "loss": 6.2513, "loss/crossentropy": 2.9379690885543823, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.18270468711853027, "step": 15120 }, { "epoch": 0.6873636363636364, "grad_norm": 4.9375, "grad_norm_var": 1.45113525390625, "learning_rate": 0.0001, "loss": 5.037, "loss/crossentropy": 2.0815237760543823, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1475054696202278, "step": 15122 }, { "epoch": 0.6874545454545454, "grad_norm": 5.53125, "grad_norm_var": 1.44761962890625, "learning_rate": 0.0001, "loss": 5.7495, "loss/crossentropy": 2.579326093196869, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16525593400001526, "step": 15124 }, { "epoch": 0.6875454545454546, "grad_norm": 5.3125, "grad_norm_var": 1.4372233072916667, "learning_rate": 0.0001, "loss": 5.7847, "loss/crossentropy": 2.614237904548645, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16645579412579536, "step": 15126 }, { "epoch": 0.6876363636363636, "grad_norm": 5.28125, "grad_norm_var": 1.4263671875, "learning_rate": 0.0001, "loss": 5.743, "loss/crossentropy": 2.5924883484840393, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16700627654790878, "step": 15128 }, { "epoch": 0.6877272727272727, "grad_norm": 4.53125, "grad_norm_var": 1.452197265625, "learning_rate": 0.0001, "loss": 5.245, "loss/crossentropy": 2.2910314202308655, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.14950120821595192, "step": 15130 }, { "epoch": 0.6878181818181818, "grad_norm": 4.6875, "grad_norm_var": 1.4271443684895833, "learning_rate": 0.0001, "loss": 5.653, "loss/crossentropy": 2.522467017173767, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1650017388164997, "step": 15132 }, { "epoch": 0.6879090909090909, "grad_norm": 4.625, "grad_norm_var": 1.39732666015625, "learning_rate": 0.0001, "loss": 5.4359, "loss/crossentropy": 2.429227352142334, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15223262086510658, "step": 15134 }, { "epoch": 0.688, "grad_norm": 4.90625, "grad_norm_var": 0.09972330729166666, "learning_rate": 0.0001, "loss": 5.3992, "loss/crossentropy": 2.4452385306358337, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.150674257427454, "step": 15136 }, { "epoch": 0.6880909090909091, "grad_norm": 5.28125, "grad_norm_var": 0.12248942057291666, "learning_rate": 0.0001, "loss": 5.3613, "loss/crossentropy": 2.364413559436798, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15301282703876495, "step": 15138 }, { "epoch": 0.6881818181818182, "grad_norm": 4.625, "grad_norm_var": 0.10388997395833334, "learning_rate": 0.0001, "loss": 5.6429, "loss/crossentropy": 2.5932493209838867, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1580948270857334, "step": 15140 }, { "epoch": 0.6882727272727273, "grad_norm": 4.8125, "grad_norm_var": 0.09479166666666666, "learning_rate": 0.0001, "loss": 5.4569, "loss/crossentropy": 2.4226343035697937, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1524488590657711, "step": 15142 }, { "epoch": 0.6883636363636364, "grad_norm": 5.03125, "grad_norm_var": 0.09146728515625, "learning_rate": 0.0001, "loss": 5.4585, "loss/crossentropy": 2.511614978313446, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.14996232651174068, "step": 15144 }, { "epoch": 0.6884545454545454, "grad_norm": 5.09375, "grad_norm_var": 0.06953125, "learning_rate": 0.0001, "loss": 5.6471, "loss/crossentropy": 2.53956401348114, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1601705327630043, "step": 15146 }, { "epoch": 0.6885454545454546, "grad_norm": 5.0, "grad_norm_var": 0.05584309895833333, "learning_rate": 0.0001, "loss": 5.6383, "loss/crossentropy": 2.552858829498291, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1618623435497284, "step": 15148 }, { "epoch": 0.6886363636363636, "grad_norm": 5.15625, "grad_norm_var": 0.053515625, "learning_rate": 0.0001, "loss": 5.6229, "loss/crossentropy": 2.4977126121520996, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16368931531906128, "step": 15150 }, { "epoch": 0.6887272727272727, "grad_norm": 5.0, "grad_norm_var": 0.04797770182291667, "learning_rate": 0.0001, "loss": 5.4042, "loss/crossentropy": 2.3014856576919556, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16026866808533669, "step": 15152 }, { "epoch": 0.6888181818181818, "grad_norm": 6.65625, "grad_norm_var": 0.22630208333333332, "learning_rate": 0.0001, "loss": 4.9461, "loss/crossentropy": 2.044435828924179, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.14309265464544296, "step": 15154 }, { "epoch": 0.6889090909090909, "grad_norm": 5.46875, "grad_norm_var": 0.22473958333333333, "learning_rate": 0.0001, "loss": 5.7874, "loss/crossentropy": 2.6013399958610535, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1678214743733406, "step": 15156 }, { "epoch": 0.689, "grad_norm": 4.78125, "grad_norm_var": 0.22678629557291666, "learning_rate": 0.0001, "loss": 5.641, "loss/crossentropy": 2.515192747116089, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16531796008348465, "step": 15158 }, { "epoch": 0.6890909090909091, "grad_norm": 5.28125, "grad_norm_var": 0.23053385416666666, "learning_rate": 0.0001, "loss": 5.5149, "loss/crossentropy": 2.5070137977600098, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.15645096078515053, "step": 15160 }, { "epoch": 0.6891818181818182, "grad_norm": 4.84375, "grad_norm_var": 0.26249593098958335, "learning_rate": 0.0001, "loss": 5.345, "loss/crossentropy": 2.404625177383423, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1479460522532463, "step": 15162 }, { "epoch": 0.6892727272727273, "grad_norm": 5.875, "grad_norm_var": 0.31521809895833336, "learning_rate": 0.0001, "loss": 5.8875, "loss/crossentropy": 2.6787039637565613, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1718585342168808, "step": 15164 }, { "epoch": 0.6893636363636364, "grad_norm": 4.65625, "grad_norm_var": 0.32615559895833335, "learning_rate": 0.0001, "loss": 5.4563, "loss/crossentropy": 2.401111513376236, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15474002249538898, "step": 15166 }, { "epoch": 0.6894545454545454, "grad_norm": 4.78125, "grad_norm_var": 0.331640625, "learning_rate": 0.0001, "loss": 5.5257, "loss/crossentropy": 2.4568422436714172, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.1617705225944519, "step": 15168 }, { "epoch": 0.6895454545454546, "grad_norm": 4.65625, "grad_norm_var": 0.17330729166666667, "learning_rate": 0.0001, "loss": 5.1766, "loss/crossentropy": 2.2595038414001465, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.14718206599354744, "step": 15170 }, { "epoch": 0.6896363636363636, "grad_norm": 5.71875, "grad_norm_var": 0.19280192057291667, "learning_rate": 0.0001, "loss": 5.8841, "loss/crossentropy": 2.692972183227539, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.17009006813168526, "step": 15172 }, { "epoch": 0.6897272727272727, "grad_norm": 5.84375, "grad_norm_var": 0.23326822916666667, "learning_rate": 0.0001, "loss": 5.3827, "loss/crossentropy": 2.375341773033142, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1517142355442047, "step": 15174 }, { "epoch": 0.6898181818181818, "grad_norm": 5.09375, "grad_norm_var": 0.23019205729166667, "learning_rate": 0.0001, "loss": 5.6361, "loss/crossentropy": 2.6083914041519165, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15667397528886795, "step": 15176 }, { "epoch": 0.6899090909090909, "grad_norm": 4.5, "grad_norm_var": 0.21666259765625, "learning_rate": 0.0001, "loss": 5.5636, "loss/crossentropy": 2.517033338546753, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15895314142107964, "step": 15178 }, { "epoch": 0.69, "grad_norm": 4.9375, "grad_norm_var": 0.16477864583333332, "learning_rate": 0.0001, "loss": 5.4088, "loss/crossentropy": 2.4169522523880005, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15172581747174263, "step": 15180 }, { "epoch": 0.6900909090909091, "grad_norm": 5.46875, "grad_norm_var": 0.154150390625, "learning_rate": 0.0001, "loss": 5.3928, "loss/crossentropy": 2.3289132714271545, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.1554105021059513, "step": 15182 }, { "epoch": 0.6901818181818182, "grad_norm": 4.625, "grad_norm_var": 0.15819905598958334, "learning_rate": 0.0001, "loss": 4.8353, "loss/crossentropy": 1.9664542973041534, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.13629781641066074, "step": 15184 }, { "epoch": 0.6902727272727273, "grad_norm": 4.8125, "grad_norm_var": 0.1529296875, "learning_rate": 0.0001, "loss": 4.978, "loss/crossentropy": 2.127328783273697, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.13838543370366096, "step": 15186 }, { "epoch": 0.6903636363636364, "grad_norm": 4.71875, "grad_norm_var": 0.11847330729166666, "learning_rate": 0.0001, "loss": 5.6372, "loss/crossentropy": 2.5826175808906555, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1597595326602459, "step": 15188 }, { "epoch": 0.6904545454545454, "grad_norm": 6.59375, "grad_norm_var": 0.24459635416666667, "learning_rate": 0.0001, "loss": 5.5482, "loss/crossentropy": 2.4947391152381897, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1561252698302269, "step": 15190 }, { "epoch": 0.6905454545454546, "grad_norm": 4.90625, "grad_norm_var": 0.24856363932291667, "learning_rate": 0.0001, "loss": 5.7531, "loss/crossentropy": 2.641813337802887, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16327305883169174, "step": 15192 }, { "epoch": 0.6906363636363636, "grad_norm": 5.125, "grad_norm_var": 0.23290608723958334, "learning_rate": 0.0001, "loss": 5.5707, "loss/crossentropy": 2.455371141433716, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16153453662991524, "step": 15194 }, { "epoch": 0.6907272727272727, "grad_norm": 4.6875, "grad_norm_var": 0.235009765625, "learning_rate": 0.0001, "loss": 5.2111, "loss/crossentropy": 2.29635351896286, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.14499091356992722, "step": 15196 }, { "epoch": 0.6908181818181818, "grad_norm": 4.53125, "grad_norm_var": 0.23394775390625, "learning_rate": 0.0001, "loss": 5.5453, "loss/crossentropy": 2.5396187901496887, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1544697806239128, "step": 15198 }, { "epoch": 0.6909090909090909, "grad_norm": 4.125, "grad_norm_var": 0.26982014973958335, "learning_rate": 0.0001, "loss": 5.1233, "loss/crossentropy": 2.2495445907115936, "loss/hidden": 1.431640625, "loss/jsd": 0.0, "loss/logits": 0.14421499893069267, "step": 15200 }, { "epoch": 0.691, "grad_norm": 4.9375, "grad_norm_var": 0.26287434895833334, "learning_rate": 0.0001, "loss": 5.6354, "loss/crossentropy": 2.5699339509010315, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15693668276071548, "step": 15202 }, { "epoch": 0.6910909090909091, "grad_norm": 5.125, "grad_norm_var": 0.2711588541666667, "learning_rate": 0.0001, "loss": 5.5472, "loss/crossentropy": 2.4921794533729553, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.1592099517583847, "step": 15204 }, { "epoch": 0.6911818181818182, "grad_norm": 4.5625, "grad_norm_var": 0.08683268229166667, "learning_rate": 0.0001, "loss": 5.297, "loss/crossentropy": 2.3543097376823425, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.14934923499822617, "step": 15206 }, { "epoch": 0.6912727272727273, "grad_norm": 4.90625, "grad_norm_var": 0.07838541666666667, "learning_rate": 0.0001, "loss": 5.7071, "loss/crossentropy": 2.5897671580314636, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1644635647535324, "step": 15208 }, { "epoch": 0.6913636363636364, "grad_norm": 5.25, "grad_norm_var": 0.08365478515625, "learning_rate": 0.0001, "loss": 5.7287, "loss/crossentropy": 2.5633251667022705, "loss/hidden": 1.537109375, "loss/jsd": 0.0, "loss/logits": 0.16282198950648308, "step": 15210 }, { "epoch": 0.6914545454545454, "grad_norm": 5.28125, "grad_norm_var": 0.09374593098958334, "learning_rate": 0.0001, "loss": 5.6176, "loss/crossentropy": 2.472905606031418, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16446491330862045, "step": 15212 }, { "epoch": 0.6915454545454546, "grad_norm": 4.875, "grad_norm_var": 0.08995768229166666, "learning_rate": 0.0001, "loss": 5.4822, "loss/crossentropy": 2.454251319169998, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.153769850730896, "step": 15214 }, { "epoch": 0.6916363636363636, "grad_norm": 4.8125, "grad_norm_var": 0.050191243489583336, "learning_rate": 0.0001, "loss": 5.5213, "loss/crossentropy": 2.477287471294403, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.157912690192461, "step": 15216 }, { "epoch": 0.6917272727272727, "grad_norm": 4.90625, "grad_norm_var": 0.045817057291666664, "learning_rate": 0.0001, "loss": 5.5485, "loss/crossentropy": 2.5077935457229614, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.15856670588254929, "step": 15218 }, { "epoch": 0.6918181818181818, "grad_norm": 4.90625, "grad_norm_var": 0.03515625, "learning_rate": 0.0001, "loss": 5.7329, "loss/crossentropy": 2.6410263180732727, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16114160791039467, "step": 15220 }, { "epoch": 0.6919090909090909, "grad_norm": 4.8125, "grad_norm_var": 0.03290608723958333, "learning_rate": 0.0001, "loss": 5.5808, "loss/crossentropy": 2.573550760746002, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15502097085118294, "step": 15222 }, { "epoch": 0.692, "grad_norm": 4.5625, "grad_norm_var": 0.03863525390625, "learning_rate": 0.0001, "loss": 5.4796, "loss/crossentropy": 2.4795591235160828, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15293358266353607, "step": 15224 }, { "epoch": 0.6920909090909091, "grad_norm": 4.625, "grad_norm_var": 0.034891764322916664, "learning_rate": 0.0001, "loss": 5.358, "loss/crossentropy": 2.363064229488373, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15379272773861885, "step": 15226 }, { "epoch": 0.6921818181818182, "grad_norm": 4.8125, "grad_norm_var": 0.021028645833333335, "learning_rate": 0.0001, "loss": 5.2975, "loss/crossentropy": 2.2933908104896545, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1509929709136486, "step": 15228 }, { "epoch": 0.6922727272727273, "grad_norm": 4.8125, "grad_norm_var": 0.020015462239583334, "learning_rate": 0.0001, "loss": 5.7702, "loss/crossentropy": 2.616370916366577, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1647966131567955, "step": 15230 }, { "epoch": 0.6923636363636364, "grad_norm": 4.59375, "grad_norm_var": 0.026416015625, "learning_rate": 0.0001, "loss": 5.7238, "loss/crossentropy": 2.6088046431541443, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.1655963957309723, "step": 15232 }, { "epoch": 0.6924545454545454, "grad_norm": 5.625, "grad_norm_var": 0.06744791666666666, "learning_rate": 0.0001, "loss": 5.7328, "loss/crossentropy": 2.5936232209205627, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.1633293367922306, "step": 15234 }, { "epoch": 0.6925454545454546, "grad_norm": 4.96875, "grad_norm_var": 0.07164306640625, "learning_rate": 0.0001, "loss": 5.2663, "loss/crossentropy": 2.2857706248760223, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15137168020009995, "step": 15236 }, { "epoch": 0.6926363636363636, "grad_norm": 4.78125, "grad_norm_var": 0.068603515625, "learning_rate": 0.0001, "loss": 5.3667, "loss/crossentropy": 2.3692628145217896, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15228023380041122, "step": 15238 }, { "epoch": 0.6927272727272727, "grad_norm": 4.96875, "grad_norm_var": 0.06131184895833333, "learning_rate": 0.0001, "loss": 5.6753, "loss/crossentropy": 2.5616660714149475, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1619538888335228, "step": 15240 }, { "epoch": 0.6928181818181818, "grad_norm": 4.8125, "grad_norm_var": 0.16399739583333334, "learning_rate": 0.0001, "loss": 5.6365, "loss/crossentropy": 2.5810431241989136, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15847072377800941, "step": 15242 }, { "epoch": 0.6929090909090909, "grad_norm": 4.875, "grad_norm_var": 0.165087890625, "learning_rate": 0.0001, "loss": 5.4845, "loss/crossentropy": 2.4928985238075256, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15033144131302834, "step": 15244 }, { "epoch": 0.693, "grad_norm": 4.34375, "grad_norm_var": 0.19166259765625, "learning_rate": 0.0001, "loss": 5.3052, "loss/crossentropy": 2.3321645855903625, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15101932734251022, "step": 15246 }, { "epoch": 0.6930909090909091, "grad_norm": 4.3125, "grad_norm_var": 0.21744791666666666, "learning_rate": 0.0001, "loss": 5.2165, "loss/crossentropy": 2.313879281282425, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.14651662856340408, "step": 15248 }, { "epoch": 0.6931818181818182, "grad_norm": 4.90625, "grad_norm_var": 0.18427327473958333, "learning_rate": 0.0001, "loss": 5.6053, "loss/crossentropy": 2.5482121109962463, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15804829820990562, "step": 15250 }, { "epoch": 0.6932727272727273, "grad_norm": 4.71875, "grad_norm_var": 0.181103515625, "learning_rate": 0.0001, "loss": 5.5264, "loss/crossentropy": 2.467728078365326, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15782207250595093, "step": 15252 }, { "epoch": 0.6933636363636364, "grad_norm": 4.5625, "grad_norm_var": 0.19110921223958333, "learning_rate": 0.0001, "loss": 5.1665, "loss/crossentropy": 2.2446649074554443, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1457037664949894, "step": 15254 }, { "epoch": 0.6934545454545454, "grad_norm": 5.125, "grad_norm_var": 0.20052083333333334, "learning_rate": 0.0001, "loss": 5.7779, "loss/crossentropy": 2.564128875732422, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.16961493715643883, "step": 15256 }, { "epoch": 0.6935454545454546, "grad_norm": 4.84375, "grad_norm_var": 0.06646728515625, "learning_rate": 0.0001, "loss": 5.4039, "loss/crossentropy": 2.3549468517303467, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15450917556881905, "step": 15258 }, { "epoch": 0.6936363636363636, "grad_norm": 4.46875, "grad_norm_var": 0.081640625, "learning_rate": 0.0001, "loss": 5.4016, "loss/crossentropy": 2.397127151489258, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15220754593610764, "step": 15260 }, { "epoch": 0.6937272727272727, "grad_norm": 4.6875, "grad_norm_var": 0.06365559895833334, "learning_rate": 0.0001, "loss": 4.979, "loss/crossentropy": 2.153014838695526, "loss/hidden": 1.439453125, "loss/jsd": 0.0, "loss/logits": 0.13865121826529503, "step": 15262 }, { "epoch": 0.6938181818181818, "grad_norm": 4.9375, "grad_norm_var": 0.10859375, "learning_rate": 0.0001, "loss": 5.6123, "loss/crossentropy": 2.490121901035309, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.16085312515497208, "step": 15264 }, { "epoch": 0.6939090909090909, "grad_norm": 4.75, "grad_norm_var": 0.1080078125, "learning_rate": 0.0001, "loss": 5.8333, "loss/crossentropy": 2.66463041305542, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16902001202106476, "step": 15266 }, { "epoch": 0.694, "grad_norm": 4.78125, "grad_norm_var": 0.10871988932291667, "learning_rate": 0.0001, "loss": 5.4845, "loss/crossentropy": 2.482460856437683, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.1543041728436947, "step": 15268 }, { "epoch": 0.6940909090909091, "grad_norm": 5.09375, "grad_norm_var": 0.1, "learning_rate": 0.0001, "loss": 5.6109, "loss/crossentropy": 2.5278186202049255, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1610407680273056, "step": 15270 }, { "epoch": 0.6941818181818182, "grad_norm": 4.71875, "grad_norm_var": 0.10631103515625, "learning_rate": 0.0001, "loss": 5.5655, "loss/crossentropy": 2.510043829679489, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.16042587161064148, "step": 15272 }, { "epoch": 0.6942727272727273, "grad_norm": 5.09375, "grad_norm_var": 0.10722249348958333, "learning_rate": 0.0001, "loss": 5.7817, "loss/crossentropy": 2.644769310951233, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16603977978229523, "step": 15274 }, { "epoch": 0.6943636363636364, "grad_norm": 4.5, "grad_norm_var": 0.10907796223958334, "learning_rate": 0.0001, "loss": 5.4339, "loss/crossentropy": 2.3827038407325745, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15687650442123413, "step": 15276 }, { "epoch": 0.6944545454545454, "grad_norm": 4.5, "grad_norm_var": 0.11698811848958333, "learning_rate": 0.0001, "loss": 5.5279, "loss/crossentropy": 2.537885844707489, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.152713380753994, "step": 15278 }, { "epoch": 0.6945454545454546, "grad_norm": 4.75, "grad_norm_var": 0.06672770182291667, "learning_rate": 0.0001, "loss": 5.4887, "loss/crossentropy": 2.4776374101638794, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15403971448540688, "step": 15280 }, { "epoch": 0.6946363636363636, "grad_norm": 4.71875, "grad_norm_var": 0.07899983723958333, "learning_rate": 0.0001, "loss": 5.3931, "loss/crossentropy": 2.4417805671691895, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.150800172239542, "step": 15282 }, { "epoch": 0.6947272727272727, "grad_norm": 4.875, "grad_norm_var": 0.08235677083333333, "learning_rate": 0.0001, "loss": 5.1876, "loss/crossentropy": 2.220409780740738, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.14866885915398598, "step": 15284 }, { "epoch": 0.6948181818181818, "grad_norm": 4.59375, "grad_norm_var": 0.07890218098958333, "learning_rate": 0.0001, "loss": 5.1284, "loss/crossentropy": 2.196730315685272, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.1449253261089325, "step": 15286 }, { "epoch": 0.6949090909090909, "grad_norm": 4.90625, "grad_norm_var": 0.07029622395833333, "learning_rate": 0.0001, "loss": 5.7406, "loss/crossentropy": 2.6054821014404297, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.16761674731969833, "step": 15288 }, { "epoch": 0.695, "grad_norm": 5.34375, "grad_norm_var": 0.11287434895833333, "learning_rate": 0.0001, "loss": 5.8297, "loss/crossentropy": 2.634186029434204, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1707209013402462, "step": 15290 }, { "epoch": 0.6950909090909091, "grad_norm": 5.34375, "grad_norm_var": 0.12342122395833334, "learning_rate": 0.0001, "loss": 5.1448, "loss/crossentropy": 2.272458076477051, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.14211992546916008, "step": 15292 }, { "epoch": 0.6951818181818182, "grad_norm": 5.03125, "grad_norm_var": 0.11018473307291667, "learning_rate": 0.0001, "loss": 5.632, "loss/crossentropy": 2.578502833843231, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.158083975315094, "step": 15294 }, { "epoch": 0.6952727272727273, "grad_norm": 4.9375, "grad_norm_var": 0.10924479166666666, "learning_rate": 0.0001, "loss": 5.8444, "loss/crossentropy": 2.7360798716545105, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16434605047106743, "step": 15296 }, { "epoch": 0.6953636363636364, "grad_norm": 4.5625, "grad_norm_var": 0.1005859375, "learning_rate": 0.0001, "loss": 5.3729, "loss/crossentropy": 2.4999848008155823, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.1421758569777012, "step": 15298 }, { "epoch": 0.6954545454545454, "grad_norm": 4.53125, "grad_norm_var": 0.10201822916666667, "learning_rate": 0.0001, "loss": 5.69, "loss/crossentropy": 2.6148542761802673, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.16004910692572594, "step": 15300 }, { "epoch": 0.6955454545454546, "grad_norm": 5.1875, "grad_norm_var": 0.09895426432291667, "learning_rate": 0.0001, "loss": 5.5035, "loss/crossentropy": 2.48470675945282, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15539145469665527, "step": 15302 }, { "epoch": 0.6956363636363636, "grad_norm": 5.34375, "grad_norm_var": 0.11184488932291667, "learning_rate": 0.0001, "loss": 5.6524, "loss/crossentropy": 2.6148207783699036, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15512390434741974, "step": 15304 }, { "epoch": 0.6957272727272727, "grad_norm": 4.8125, "grad_norm_var": 0.09073893229166667, "learning_rate": 0.0001, "loss": 5.3922, "loss/crossentropy": 2.4220827221870422, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15091479569673538, "step": 15306 }, { "epoch": 0.6958181818181818, "grad_norm": 4.90625, "grad_norm_var": 0.068603515625, "learning_rate": 0.0001, "loss": 5.6429, "loss/crossentropy": 2.5292393565177917, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16332417726516724, "step": 15308 }, { "epoch": 0.6959090909090909, "grad_norm": 4.65625, "grad_norm_var": 0.07981770833333333, "learning_rate": 0.0001, "loss": 4.9467, "loss/crossentropy": 2.0863985121250153, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.13876168243587017, "step": 15310 }, { "epoch": 0.696, "grad_norm": 4.6875, "grad_norm_var": 0.08331705729166666, "learning_rate": 0.0001, "loss": 5.6105, "loss/crossentropy": 2.5136342644691467, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.161638081073761, "step": 15312 }, { "epoch": 0.6960909090909091, "grad_norm": 4.84375, "grad_norm_var": 0.07394205729166667, "learning_rate": 0.0001, "loss": 5.2879, "loss/crossentropy": 2.259947806596756, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1537681370973587, "step": 15314 }, { "epoch": 0.6961818181818182, "grad_norm": 6.03125, "grad_norm_var": 0.14862874348958333, "learning_rate": 0.0001, "loss": 5.6045, "loss/crossentropy": 2.5473129749298096, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1557176485657692, "step": 15316 }, { "epoch": 0.6962727272727273, "grad_norm": 4.46875, "grad_norm_var": 0.158056640625, "learning_rate": 0.0001, "loss": 5.1157, "loss/crossentropy": 2.234911620616913, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.14296137169003487, "step": 15318 }, { "epoch": 0.6963636363636364, "grad_norm": 5.25, "grad_norm_var": 0.15875244140625, "learning_rate": 0.0001, "loss": 4.9952, "loss/crossentropy": 2.0580098628997803, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.1466493308544159, "step": 15320 }, { "epoch": 0.6964545454545454, "grad_norm": 4.59375, "grad_norm_var": 0.15162353515625, "learning_rate": 0.0001, "loss": 5.5767, "loss/crossentropy": 2.583648145198822, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15672307461500168, "step": 15322 }, { "epoch": 0.6965454545454546, "grad_norm": 4.84375, "grad_norm_var": 0.148291015625, "learning_rate": 0.0001, "loss": 5.6123, "loss/crossentropy": 2.5582881569862366, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1600862592458725, "step": 15324 }, { "epoch": 0.6966363636363636, "grad_norm": 5.1875, "grad_norm_var": 0.14400634765625, "learning_rate": 0.0001, "loss": 5.6441, "loss/crossentropy": 2.6292887926101685, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15402071177959442, "step": 15326 }, { "epoch": 0.6967272727272728, "grad_norm": 4.78125, "grad_norm_var": 0.1720703125, "learning_rate": 0.0001, "loss": 5.8055, "loss/crossentropy": 2.677483022212982, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16573090851306915, "step": 15328 }, { "epoch": 0.6968181818181818, "grad_norm": 4.4375, "grad_norm_var": 0.18697916666666667, "learning_rate": 0.0001, "loss": 5.3893, "loss/crossentropy": 2.38733172416687, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15488333255052567, "step": 15330 }, { "epoch": 0.6969090909090909, "grad_norm": 4.5, "grad_norm_var": 0.11506754557291667, "learning_rate": 0.0001, "loss": 5.2861, "loss/crossentropy": 2.3563262224197388, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1492321416735649, "step": 15332 }, { "epoch": 0.697, "grad_norm": 4.5, "grad_norm_var": 0.11802978515625, "learning_rate": 0.0001, "loss": 5.3519, "loss/crossentropy": 2.398404896259308, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1500369906425476, "step": 15334 }, { "epoch": 0.6970909090909091, "grad_norm": 4.78125, "grad_norm_var": 0.10448811848958334, "learning_rate": 0.0001, "loss": 5.5363, "loss/crossentropy": 2.446496456861496, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15976086631417274, "step": 15336 }, { "epoch": 0.6971818181818182, "grad_norm": 4.8125, "grad_norm_var": 0.14329427083333332, "learning_rate": 0.0001, "loss": 5.8046, "loss/crossentropy": 2.5650145411491394, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.17962215840816498, "step": 15338 }, { "epoch": 0.6972727272727273, "grad_norm": 5.375, "grad_norm_var": 0.15601806640625, "learning_rate": 0.0001, "loss": 5.124, "loss/crossentropy": 2.1054129898548126, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15146513283252716, "step": 15340 }, { "epoch": 0.6973636363636364, "grad_norm": 5.21875, "grad_norm_var": 0.16057535807291667, "learning_rate": 0.0001, "loss": 5.4654, "loss/crossentropy": 2.443997085094452, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15565520524978638, "step": 15342 }, { "epoch": 0.6974545454545454, "grad_norm": 4.6875, "grad_norm_var": 0.12528889973958332, "learning_rate": 0.0001, "loss": 5.2568, "loss/crossentropy": 2.3148113489151, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.14732561260461807, "step": 15344 }, { "epoch": 0.6975454545454546, "grad_norm": 4.71875, "grad_norm_var": 0.12440999348958333, "learning_rate": 0.0001, "loss": 5.7927, "loss/crossentropy": 2.670605421066284, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16377247124910355, "step": 15346 }, { "epoch": 0.6976363636363636, "grad_norm": 4.875, "grad_norm_var": 0.10319010416666667, "learning_rate": 0.0001, "loss": 5.7003, "loss/crossentropy": 2.5692341327667236, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1642790324985981, "step": 15348 }, { "epoch": 0.6977272727272728, "grad_norm": 4.78125, "grad_norm_var": 0.08316650390625, "learning_rate": 0.0001, "loss": 5.6463, "loss/crossentropy": 2.4768600165843964, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16792330890893936, "step": 15350 }, { "epoch": 0.6978181818181818, "grad_norm": 5.0, "grad_norm_var": 0.08097330729166667, "learning_rate": 0.0001, "loss": 5.5347, "loss/crossentropy": 2.5317476391792297, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1536115389317274, "step": 15352 }, { "epoch": 0.6979090909090909, "grad_norm": 4.71875, "grad_norm_var": 0.06261393229166666, "learning_rate": 0.0001, "loss": 5.847, "loss/crossentropy": 2.681263267993927, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16891705244779587, "step": 15354 }, { "epoch": 0.698, "grad_norm": 4.6875, "grad_norm_var": 0.08951416015625, "learning_rate": 0.0001, "loss": 5.0299, "loss/crossentropy": 2.1169793009757996, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1455906629562378, "step": 15356 }, { "epoch": 0.6980909090909091, "grad_norm": 4.875, "grad_norm_var": 0.07841389973958333, "learning_rate": 0.0001, "loss": 5.874, "loss/crossentropy": 2.718153476715088, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16734542325139046, "step": 15358 }, { "epoch": 0.6981818181818182, "grad_norm": 4.78125, "grad_norm_var": 0.08318684895833334, "learning_rate": 0.0001, "loss": 5.6083, "loss/crossentropy": 2.607272446155548, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1528349705040455, "step": 15360 }, { "epoch": 0.6982727272727273, "grad_norm": 4.4375, "grad_norm_var": 0.08417561848958334, "learning_rate": 0.0001, "loss": 5.0783, "loss/crossentropy": 2.1626283526420593, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.14664671197533607, "step": 15362 }, { "epoch": 0.6983636363636364, "grad_norm": 5.0, "grad_norm_var": 0.07955729166666667, "learning_rate": 0.0001, "loss": 5.6397, "loss/crossentropy": 2.516181468963623, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16547711566090584, "step": 15364 }, { "epoch": 0.6984545454545454, "grad_norm": 5.59375, "grad_norm_var": 0.13069254557291668, "learning_rate": 0.0001, "loss": 5.7675, "loss/crossentropy": 2.619000792503357, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16816620901226997, "step": 15366 }, { "epoch": 0.6985454545454546, "grad_norm": 5.03125, "grad_norm_var": 0.18896077473958334, "learning_rate": 0.0001, "loss": 5.3774, "loss/crossentropy": 2.3425838947296143, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15406683087348938, "step": 15368 }, { "epoch": 0.6986363636363636, "grad_norm": 4.375, "grad_norm_var": 0.21282552083333334, "learning_rate": 0.0001, "loss": 5.537, "loss/crossentropy": 2.5247331261634827, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15493424981832504, "step": 15370 }, { "epoch": 0.6987272727272728, "grad_norm": 4.4375, "grad_norm_var": 0.21379801432291667, "learning_rate": 0.0001, "loss": 5.0916, "loss/crossentropy": 2.248545378446579, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.13957416266202927, "step": 15372 }, { "epoch": 0.6988181818181818, "grad_norm": 5.09375, "grad_norm_var": 0.21702067057291666, "learning_rate": 0.0001, "loss": 5.8762, "loss/crossentropy": 2.722472071647644, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.169671431183815, "step": 15374 }, { "epoch": 0.6989090909090909, "grad_norm": 4.96875, "grad_norm_var": 0.20911458333333333, "learning_rate": 0.0001, "loss": 5.0092, "loss/crossentropy": 2.0990125238895416, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.14277168549597263, "step": 15376 }, { "epoch": 0.699, "grad_norm": 5.59375, "grad_norm_var": 0.22281494140625, "learning_rate": 0.0001, "loss": 5.5609, "loss/crossentropy": 2.4462759494781494, "loss/hidden": 1.505859375, "loss/jsd": 0.0, "loss/logits": 0.16087587550282478, "step": 15378 }, { "epoch": 0.6990909090909091, "grad_norm": 4.9375, "grad_norm_var": 0.217041015625, "learning_rate": 0.0001, "loss": 5.6899, "loss/crossentropy": 2.681197762489319, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.15653471648693085, "step": 15380 }, { "epoch": 0.6991818181818181, "grad_norm": 4.84375, "grad_norm_var": 0.193212890625, "learning_rate": 0.0001, "loss": 5.6621, "loss/crossentropy": 2.5429386496543884, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16523343324661255, "step": 15382 }, { "epoch": 0.6992727272727273, "grad_norm": 4.46875, "grad_norm_var": 0.16184895833333332, "learning_rate": 0.0001, "loss": 5.5256, "loss/crossentropy": 2.5652246177196503, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15189741551876068, "step": 15384 }, { "epoch": 0.6993636363636364, "grad_norm": 4.71875, "grad_norm_var": 0.12636311848958334, "learning_rate": 0.0001, "loss": 5.3637, "loss/crossentropy": 2.3713961243629456, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15255003049969673, "step": 15386 }, { "epoch": 0.6994545454545454, "grad_norm": 5.25, "grad_norm_var": 0.08904622395833334, "learning_rate": 0.0001, "loss": 5.3168, "loss/crossentropy": 2.350399911403656, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1497630886733532, "step": 15388 }, { "epoch": 0.6995454545454546, "grad_norm": 5.6875, "grad_norm_var": 0.1279296875, "learning_rate": 0.0001, "loss": 5.4434, "loss/crossentropy": 2.4902723729610443, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.14980322867631912, "step": 15390 }, { "epoch": 0.6996363636363636, "grad_norm": 5.40625, "grad_norm_var": 0.13723551432291667, "learning_rate": 0.0001, "loss": 5.5104, "loss/crossentropy": 2.449519544839859, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15843546763062477, "step": 15392 }, { "epoch": 0.6997272727272728, "grad_norm": 4.375, "grad_norm_var": 0.12932535807291667, "learning_rate": 0.0001, "loss": 5.4372, "loss/crossentropy": 2.4398016333580017, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15345413237810135, "step": 15394 }, { "epoch": 0.6998181818181818, "grad_norm": 5.0, "grad_norm_var": 0.1259765625, "learning_rate": 0.0001, "loss": 5.9643, "loss/crossentropy": 2.7807021141052246, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16933783143758774, "step": 15396 }, { "epoch": 0.6999090909090909, "grad_norm": 5.15625, "grad_norm_var": 0.13318684895833333, "learning_rate": 0.0001, "loss": 5.3631, "loss/crossentropy": 2.345616966485977, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15331536158919334, "step": 15398 }, { "epoch": 0.7, "grad_norm": 4.90625, "grad_norm_var": 0.10536702473958333, "learning_rate": 0.0001, "loss": 5.6387, "loss/crossentropy": 2.5422781109809875, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15964630991220474, "step": 15400 }, { "epoch": 0.7000909090909091, "grad_norm": 4.65625, "grad_norm_var": 0.11158854166666667, "learning_rate": 0.0001, "loss": 5.9276, "loss/crossentropy": 2.7756441831588745, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16694866493344307, "step": 15402 }, { "epoch": 0.7001818181818181, "grad_norm": 5.8125, "grad_norm_var": 0.15562744140625, "learning_rate": 0.0001, "loss": 5.6725, "loss/crossentropy": 2.555400311946869, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16171270981431007, "step": 15404 }, { "epoch": 0.7002727272727273, "grad_norm": 4.9375, "grad_norm_var": 0.11855061848958333, "learning_rate": 0.0001, "loss": 5.5595, "loss/crossentropy": 2.49951308965683, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15912789478898048, "step": 15406 }, { "epoch": 0.7003636363636364, "grad_norm": 5.34375, "grad_norm_var": 0.11607666015625, "learning_rate": 0.0001, "loss": 5.9658, "loss/crossentropy": 2.7848252058029175, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1692717708647251, "step": 15408 }, { "epoch": 0.7004545454545454, "grad_norm": 4.6875, "grad_norm_var": 0.10435791015625, "learning_rate": 0.0001, "loss": 5.5963, "loss/crossentropy": 2.56128990650177, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15858344361186028, "step": 15410 }, { "epoch": 0.7005454545454546, "grad_norm": 4.875, "grad_norm_var": 0.10623372395833333, "learning_rate": 0.0001, "loss": 5.27, "loss/crossentropy": 2.2774549424648285, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15022851340472698, "step": 15412 }, { "epoch": 0.7006363636363636, "grad_norm": 5.15625, "grad_norm_var": 0.0998046875, "learning_rate": 0.0001, "loss": 5.4895, "loss/crossentropy": 2.4418811798095703, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.1576923429965973, "step": 15414 }, { "epoch": 0.7007272727272728, "grad_norm": 4.75, "grad_norm_var": 0.13114827473958332, "learning_rate": 0.0001, "loss": 5.093, "loss/crossentropy": 2.2686984837055206, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1386814322322607, "step": 15416 }, { "epoch": 0.7008181818181818, "grad_norm": 4.65625, "grad_norm_var": 0.13821207682291667, "learning_rate": 0.0001, "loss": 4.9702, "loss/crossentropy": 2.082810401916504, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.14283610507845879, "step": 15418 }, { "epoch": 0.7009090909090909, "grad_norm": 5.125, "grad_norm_var": 0.09026285807291666, "learning_rate": 0.0001, "loss": 5.6433, "loss/crossentropy": 2.5365796089172363, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1634104661643505, "step": 15420 }, { "epoch": 0.701, "grad_norm": 4.5, "grad_norm_var": 0.10504150390625, "learning_rate": 0.0001, "loss": 5.1699, "loss/crossentropy": 2.2736953496932983, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.14509180188179016, "step": 15422 }, { "epoch": 0.7010909090909091, "grad_norm": 5.4375, "grad_norm_var": 0.112744140625, "learning_rate": 0.0001, "loss": 5.5137, "loss/crossentropy": 2.4405232667922974, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15771209076046944, "step": 15424 }, { "epoch": 0.7011818181818181, "grad_norm": 5.03125, "grad_norm_var": 0.11464436848958333, "learning_rate": 0.0001, "loss": 5.2065, "loss/crossentropy": 2.290461480617523, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.14413957297801971, "step": 15426 }, { "epoch": 0.7012727272727273, "grad_norm": 4.1875, "grad_norm_var": 0.13802083333333334, "learning_rate": 0.0001, "loss": 5.4575, "loss/crossentropy": 2.4814448952674866, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15131745487451553, "step": 15428 }, { "epoch": 0.7013636363636364, "grad_norm": 4.78125, "grad_norm_var": 0.12667643229166667, "learning_rate": 0.0001, "loss": 5.3513, "loss/crossentropy": 2.339711904525757, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15369779989123344, "step": 15430 }, { "epoch": 0.7014545454545454, "grad_norm": 5.15625, "grad_norm_var": 0.12323811848958334, "learning_rate": 0.0001, "loss": 5.6317, "loss/crossentropy": 2.574380934238434, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.1582750752568245, "step": 15432 }, { "epoch": 0.7015454545454546, "grad_norm": 4.8125, "grad_norm_var": 0.11495768229166667, "learning_rate": 0.0001, "loss": 5.4604, "loss/crossentropy": 2.5545068979263306, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.14527222514152527, "step": 15434 }, { "epoch": 0.7016363636363636, "grad_norm": 4.53125, "grad_norm_var": 0.09983317057291667, "learning_rate": 0.0001, "loss": 5.436, "loss/crossentropy": 2.3960867524147034, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1555529534816742, "step": 15436 }, { "epoch": 0.7017272727272728, "grad_norm": 4.5, "grad_norm_var": 0.08866780598958333, "learning_rate": 0.0001, "loss": 5.3812, "loss/crossentropy": 2.3722246885299683, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.15617243200540543, "step": 15438 }, { "epoch": 0.7018181818181818, "grad_norm": 4.6875, "grad_norm_var": 0.06597900390625, "learning_rate": 0.0001, "loss": 5.5137, "loss/crossentropy": 2.4787269830703735, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.15798833593726158, "step": 15440 }, { "epoch": 0.7019090909090909, "grad_norm": 4.6875, "grad_norm_var": 0.062483723958333334, "learning_rate": 0.0001, "loss": 5.7639, "loss/crossentropy": 2.684257447719574, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.16206562519073486, "step": 15442 }, { "epoch": 0.702, "grad_norm": 5.0, "grad_norm_var": 0.045556640625, "learning_rate": 0.0001, "loss": 5.7663, "loss/crossentropy": 2.637068808078766, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.16546301171183586, "step": 15444 }, { "epoch": 0.7020909090909091, "grad_norm": 5.09375, "grad_norm_var": 0.050244140625, "learning_rate": 0.0001, "loss": 5.7153, "loss/crossentropy": 2.589369475841522, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16552743315696716, "step": 15446 }, { "epoch": 0.7021818181818181, "grad_norm": 4.375, "grad_norm_var": 0.046337890625, "learning_rate": 0.0001, "loss": 5.4192, "loss/crossentropy": 2.401302456855774, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1545262672007084, "step": 15448 }, { "epoch": 0.7022727272727273, "grad_norm": 4.8125, "grad_norm_var": 0.04332275390625, "learning_rate": 0.0001, "loss": 5.3534, "loss/crossentropy": 2.382058262825012, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1526004746556282, "step": 15450 }, { "epoch": 0.7023636363636364, "grad_norm": 5.0625, "grad_norm_var": 0.13359375, "learning_rate": 0.0001, "loss": 5.9225, "loss/crossentropy": 2.735133111476898, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1699126847088337, "step": 15452 }, { "epoch": 0.7024545454545454, "grad_norm": 5.4375, "grad_norm_var": 0.14654947916666666, "learning_rate": 0.0001, "loss": 5.6172, "loss/crossentropy": 2.470603108406067, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16582705453038216, "step": 15454 }, { "epoch": 0.7025454545454546, "grad_norm": 4.40625, "grad_norm_var": 0.16073811848958333, "learning_rate": 0.0001, "loss": 5.1856, "loss/crossentropy": 2.274991273880005, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.14614185690879822, "step": 15456 }, { "epoch": 0.7026363636363636, "grad_norm": 4.90625, "grad_norm_var": 0.15709635416666667, "learning_rate": 0.0001, "loss": 5.8424, "loss/crossentropy": 2.69235759973526, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.16754576191306114, "step": 15458 }, { "epoch": 0.7027272727272728, "grad_norm": 6.96875, "grad_norm_var": 0.397900390625, "learning_rate": 0.0001, "loss": 5.5733, "loss/crossentropy": 2.4592632353305817, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16296855732798576, "step": 15460 }, { "epoch": 0.7028181818181818, "grad_norm": 4.84375, "grad_norm_var": 0.4090983072916667, "learning_rate": 0.0001, "loss": 5.4129, "loss/crossentropy": 2.377541482448578, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.15646443143486977, "step": 15462 }, { "epoch": 0.7029090909090909, "grad_norm": 4.78125, "grad_norm_var": 0.38313395182291665, "learning_rate": 0.0001, "loss": 6.0475, "loss/crossentropy": 2.8153685927391052, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17477787658572197, "step": 15464 }, { "epoch": 0.703, "grad_norm": 4.53125, "grad_norm_var": 0.3971964518229167, "learning_rate": 0.0001, "loss": 5.6333, "loss/crossentropy": 2.4880419969558716, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16608543694019318, "step": 15466 }, { "epoch": 0.7030909090909091, "grad_norm": 4.84375, "grad_norm_var": 0.34752197265625, "learning_rate": 0.0001, "loss": 5.571, "loss/crossentropy": 2.5340182185173035, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.15467166155576706, "step": 15468 }, { "epoch": 0.7031818181818181, "grad_norm": 4.78125, "grad_norm_var": 0.33619384765625, "learning_rate": 0.0001, "loss": 5.6202, "loss/crossentropy": 2.5784313678741455, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15808238461613655, "step": 15470 }, { "epoch": 0.7032727272727273, "grad_norm": 5.0, "grad_norm_var": 0.32063395182291665, "learning_rate": 0.0001, "loss": 5.4724, "loss/crossentropy": 2.489358901977539, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15181905031204224, "step": 15472 }, { "epoch": 0.7033636363636364, "grad_norm": 4.8125, "grad_norm_var": 0.3298136393229167, "learning_rate": 0.0001, "loss": 5.548, "loss/crossentropy": 2.5155546069145203, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.15734227746725082, "step": 15474 }, { "epoch": 0.7034545454545454, "grad_norm": 4.3125, "grad_norm_var": 0.06725260416666666, "learning_rate": 0.0001, "loss": 5.3958, "loss/crossentropy": 2.4328156113624573, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.15196696296334267, "step": 15476 }, { "epoch": 0.7035454545454546, "grad_norm": 4.40625, "grad_norm_var": 0.07610270182291666, "learning_rate": 0.0001, "loss": 5.1703, "loss/crossentropy": 2.2869012355804443, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.1436164602637291, "step": 15478 }, { "epoch": 0.7036363636363636, "grad_norm": 4.53125, "grad_norm_var": 0.05273030598958333, "learning_rate": 0.0001, "loss": 5.2848, "loss/crossentropy": 2.339174598455429, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1467132568359375, "step": 15480 }, { "epoch": 0.7037272727272728, "grad_norm": 4.8125, "grad_norm_var": 0.0380859375, "learning_rate": 0.0001, "loss": 5.4934, "loss/crossentropy": 2.431028425693512, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15995275229215622, "step": 15482 }, { "epoch": 0.7038181818181818, "grad_norm": 4.8125, "grad_norm_var": 0.03683268229166667, "learning_rate": 0.0001, "loss": 5.4633, "loss/crossentropy": 2.4441736340522766, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15542742982506752, "step": 15484 }, { "epoch": 0.7039090909090909, "grad_norm": 4.9375, "grad_norm_var": 0.04537760416666667, "learning_rate": 0.0001, "loss": 5.8142, "loss/crossentropy": 2.6019595861434937, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17160971090197563, "step": 15486 }, { "epoch": 0.704, "grad_norm": 4.8125, "grad_norm_var": 0.05245768229166667, "learning_rate": 0.0001, "loss": 5.5146, "loss/crossentropy": 2.4459442496299744, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15764635056257248, "step": 15488 }, { "epoch": 0.7040909090909091, "grad_norm": 5.125, "grad_norm_var": 0.06011962890625, "learning_rate": 0.0001, "loss": 5.4552, "loss/crossentropy": 2.4830148220062256, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15229397639632225, "step": 15490 }, { "epoch": 0.7041818181818181, "grad_norm": 5.09375, "grad_norm_var": 0.049853515625, "learning_rate": 0.0001, "loss": 5.5328, "loss/crossentropy": 2.510638952255249, "loss/hidden": 1.435546875, "loss/jsd": 0.0, "loss/logits": 0.15866251289844513, "step": 15492 }, { "epoch": 0.7042727272727273, "grad_norm": 6.0, "grad_norm_var": 0.11480712890625, "learning_rate": 0.0001, "loss": 6.1937, "loss/crossentropy": 2.882744252681732, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1822633221745491, "step": 15494 }, { "epoch": 0.7043636363636364, "grad_norm": 4.59375, "grad_norm_var": 0.1185546875, "learning_rate": 0.0001, "loss": 5.3222, "loss/crossentropy": 2.369798183441162, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.14914477989077568, "step": 15496 }, { "epoch": 0.7044545454545454, "grad_norm": 4.71875, "grad_norm_var": 0.11226806640625, "learning_rate": 0.0001, "loss": 5.7089, "loss/crossentropy": 2.6051777005195618, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.1640792265534401, "step": 15498 }, { "epoch": 0.7045454545454546, "grad_norm": 4.8125, "grad_norm_var": 0.11422119140625, "learning_rate": 0.0001, "loss": 5.9454, "loss/crossentropy": 2.758245348930359, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.17047734558582306, "step": 15500 }, { "epoch": 0.7046363636363636, "grad_norm": 4.84375, "grad_norm_var": 0.1158203125, "learning_rate": 0.0001, "loss": 5.7091, "loss/crossentropy": 2.651557981967926, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.15594613179564476, "step": 15502 }, { "epoch": 0.7047272727272728, "grad_norm": 6.21875, "grad_norm_var": 0.21815999348958334, "learning_rate": 0.0001, "loss": 5.7378, "loss/crossentropy": 2.6489926278591156, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1610279232263565, "step": 15504 }, { "epoch": 0.7048181818181818, "grad_norm": 5.03125, "grad_norm_var": 0.223681640625, "learning_rate": 0.0001, "loss": 5.5226, "loss/crossentropy": 2.4858256578445435, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15563423931598663, "step": 15506 }, { "epoch": 0.7049090909090909, "grad_norm": 5.15625, "grad_norm_var": 0.22454427083333334, "learning_rate": 0.0001, "loss": 5.3358, "loss/crossentropy": 2.3539036214351654, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15073120594024658, "step": 15508 }, { "epoch": 0.705, "grad_norm": 5.15625, "grad_norm_var": 0.15868733723958334, "learning_rate": 0.0001, "loss": 5.7788, "loss/crossentropy": 2.6635037064552307, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16485121473670006, "step": 15510 }, { "epoch": 0.7050909090909091, "grad_norm": 5.125, "grad_norm_var": 0.15240478515625, "learning_rate": 0.0001, "loss": 5.7337, "loss/crossentropy": 2.6222140192985535, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16192809492349625, "step": 15512 }, { "epoch": 0.7051818181818181, "grad_norm": 4.34375, "grad_norm_var": 0.18756103515625, "learning_rate": 0.0001, "loss": 5.0965, "loss/crossentropy": 2.1942755579948425, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.1443264801055193, "step": 15514 }, { "epoch": 0.7052727272727273, "grad_norm": 4.59375, "grad_norm_var": 0.215478515625, "learning_rate": 0.0001, "loss": 5.3866, "loss/crossentropy": 2.3878509998321533, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15143244341015816, "step": 15516 }, { "epoch": 0.7053636363636364, "grad_norm": 4.5625, "grad_norm_var": 0.22506103515625, "learning_rate": 0.0001, "loss": 5.6038, "loss/crossentropy": 2.5791831016540527, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1555861085653305, "step": 15518 }, { "epoch": 0.7054545454545454, "grad_norm": 4.84375, "grad_norm_var": 0.12845052083333333, "learning_rate": 0.0001, "loss": 5.5946, "loss/crossentropy": 2.4861350059509277, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16318709775805473, "step": 15520 }, { "epoch": 0.7055454545454546, "grad_norm": 4.84375, "grad_norm_var": 0.1310546875, "learning_rate": 0.0001, "loss": 5.3475, "loss/crossentropy": 2.4374494552612305, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.14589087665081024, "step": 15522 }, { "epoch": 0.7056363636363636, "grad_norm": 4.9375, "grad_norm_var": 0.12691650390625, "learning_rate": 0.0001, "loss": 5.3202, "loss/crossentropy": 2.3644493222236633, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.14791987091302872, "step": 15524 }, { "epoch": 0.7057272727272728, "grad_norm": 4.5625, "grad_norm_var": 0.15286051432291667, "learning_rate": 0.0001, "loss": 5.5991, "loss/crossentropy": 2.5202776193618774, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16061190888285637, "step": 15526 }, { "epoch": 0.7058181818181818, "grad_norm": 4.75, "grad_norm_var": 0.13404541015625, "learning_rate": 0.0001, "loss": 5.66, "loss/crossentropy": 2.5336228013038635, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.1679137758910656, "step": 15528 }, { "epoch": 0.7059090909090909, "grad_norm": 5.09375, "grad_norm_var": 0.1212890625, "learning_rate": 0.0001, "loss": 5.6247, "loss/crossentropy": 2.5990737676620483, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15412861108779907, "step": 15530 }, { "epoch": 0.706, "grad_norm": 5.40625, "grad_norm_var": 0.10373942057291667, "learning_rate": 0.0001, "loss": 5.6007, "loss/crossentropy": 2.5143296122550964, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16195562481880188, "step": 15532 }, { "epoch": 0.7060909090909091, "grad_norm": 4.65625, "grad_norm_var": 0.10022379557291666, "learning_rate": 0.0001, "loss": 5.6061, "loss/crossentropy": 2.5366826951503754, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15870357304811478, "step": 15534 }, { "epoch": 0.7061818181818181, "grad_norm": 4.90625, "grad_norm_var": 0.08255208333333333, "learning_rate": 0.0001, "loss": 5.7229, "loss/crossentropy": 2.56122088432312, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16890301182866096, "step": 15536 }, { "epoch": 0.7062727272727273, "grad_norm": 4.71875, "grad_norm_var": 0.08709309895833334, "learning_rate": 0.0001, "loss": 5.5813, "loss/crossentropy": 2.5397143363952637, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1557236611843109, "step": 15538 }, { "epoch": 0.7063636363636364, "grad_norm": 4.53125, "grad_norm_var": 0.09524332682291667, "learning_rate": 0.0001, "loss": 5.439, "loss/crossentropy": 2.400396764278412, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.15835639834403992, "step": 15540 }, { "epoch": 0.7064545454545454, "grad_norm": 4.71875, "grad_norm_var": 0.0732421875, "learning_rate": 0.0001, "loss": 5.9317, "loss/crossentropy": 2.7843554615974426, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16785834729671478, "step": 15542 }, { "epoch": 0.7065454545454546, "grad_norm": 5.21875, "grad_norm_var": 0.08205973307291667, "learning_rate": 0.0001, "loss": 5.4516, "loss/crossentropy": 2.3771032094955444, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15900900587439537, "step": 15544 }, { "epoch": 0.7066363636363636, "grad_norm": 4.3125, "grad_norm_var": 0.08826497395833334, "learning_rate": 0.0001, "loss": 5.1618, "loss/crossentropy": 2.239344358444214, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.14615124091506004, "step": 15546 }, { "epoch": 0.7067272727272728, "grad_norm": 4.96875, "grad_norm_var": 0.06669514973958333, "learning_rate": 0.0001, "loss": 5.6833, "loss/crossentropy": 2.6119887232780457, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15927838906645775, "step": 15548 }, { "epoch": 0.7068181818181818, "grad_norm": 4.59375, "grad_norm_var": 0.07600504557291667, "learning_rate": 0.0001, "loss": 5.3869, "loss/crossentropy": 2.344145119190216, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15759719163179398, "step": 15550 }, { "epoch": 0.7069090909090909, "grad_norm": 4.9375, "grad_norm_var": 0.07578125, "learning_rate": 0.0001, "loss": 5.4275, "loss/crossentropy": 2.4135733246803284, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15529697388410568, "step": 15552 }, { "epoch": 0.707, "grad_norm": 4.9375, "grad_norm_var": 0.07343343098958334, "learning_rate": 0.0001, "loss": 5.3977, "loss/crossentropy": 2.3703730702400208, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15586240962147713, "step": 15554 }, { "epoch": 0.7070909090909091, "grad_norm": 4.5, "grad_norm_var": 0.076806640625, "learning_rate": 0.0001, "loss": 5.2236, "loss/crossentropy": 2.31178617477417, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.14743054285645485, "step": 15556 }, { "epoch": 0.7071818181818181, "grad_norm": 4.96875, "grad_norm_var": 0.0763671875, "learning_rate": 0.0001, "loss": 5.292, "loss/crossentropy": 2.3241893649101257, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.15205925330519676, "step": 15558 }, { "epoch": 0.7072727272727273, "grad_norm": 4.71875, "grad_norm_var": 0.066259765625, "learning_rate": 0.0001, "loss": 5.5332, "loss/crossentropy": 2.5038142800331116, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15606122463941574, "step": 15560 }, { "epoch": 0.7073636363636364, "grad_norm": 4.6875, "grad_norm_var": 0.049723307291666664, "learning_rate": 0.0001, "loss": 5.3819, "loss/crossentropy": 2.403307020664215, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15117811039090157, "step": 15562 }, { "epoch": 0.7074545454545454, "grad_norm": 4.5, "grad_norm_var": 0.057145182291666666, "learning_rate": 0.0001, "loss": 5.3954, "loss/crossentropy": 2.418062150478363, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15281193889677525, "step": 15564 }, { "epoch": 0.7075454545454546, "grad_norm": 5.375, "grad_norm_var": 0.07828369140625, "learning_rate": 0.0001, "loss": 5.782, "loss/crossentropy": 2.6261412501335144, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16558850184082985, "step": 15566 }, { "epoch": 0.7076363636363636, "grad_norm": 4.71875, "grad_norm_var": 0.0794921875, "learning_rate": 0.0001, "loss": 5.5246, "loss/crossentropy": 2.520792067050934, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15780334547162056, "step": 15568 }, { "epoch": 0.7077272727272728, "grad_norm": 4.65625, "grad_norm_var": 0.07454427083333333, "learning_rate": 0.0001, "loss": 5.2205, "loss/crossentropy": 2.315966248512268, "loss/hidden": 1.439453125, "loss/jsd": 0.0, "loss/logits": 0.14650996401906013, "step": 15570 }, { "epoch": 0.7078181818181818, "grad_norm": 4.9375, "grad_norm_var": 0.10545247395833333, "learning_rate": 0.0001, "loss": 6.0006, "loss/crossentropy": 2.857744336128235, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16779853776097298, "step": 15572 }, { "epoch": 0.7079090909090909, "grad_norm": 4.53125, "grad_norm_var": 0.10683186848958333, "learning_rate": 0.0001, "loss": 5.0895, "loss/crossentropy": 2.2251909971237183, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.14131609722971916, "step": 15574 }, { "epoch": 0.708, "grad_norm": 4.84375, "grad_norm_var": 0.10388997395833334, "learning_rate": 0.0001, "loss": 5.519, "loss/crossentropy": 2.492396056652069, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15539120882749557, "step": 15576 }, { "epoch": 0.7080909090909091, "grad_norm": 5.03125, "grad_norm_var": 0.104541015625, "learning_rate": 0.0001, "loss": 5.8649, "loss/crossentropy": 2.7446078062057495, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16515647992491722, "step": 15578 }, { "epoch": 0.7081818181818181, "grad_norm": 4.84375, "grad_norm_var": 0.09386393229166666, "learning_rate": 0.0001, "loss": 5.5104, "loss/crossentropy": 2.439342975616455, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15827451273798943, "step": 15580 }, { "epoch": 0.7082727272727273, "grad_norm": 4.96875, "grad_norm_var": 0.07042643229166666, "learning_rate": 0.0001, "loss": 5.6149, "loss/crossentropy": 2.58961683511734, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15409007295966148, "step": 15582 }, { "epoch": 0.7083636363636364, "grad_norm": 4.90625, "grad_norm_var": 0.06940104166666666, "learning_rate": 0.0001, "loss": 5.9179, "loss/crossentropy": 2.744641363620758, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17045165970921516, "step": 15584 }, { "epoch": 0.7084545454545454, "grad_norm": 5.0, "grad_norm_var": 0.061812337239583334, "learning_rate": 0.0001, "loss": 5.9015, "loss/crossentropy": 2.738995909690857, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16781001165509224, "step": 15586 }, { "epoch": 0.7085454545454546, "grad_norm": 4.625, "grad_norm_var": 0.02359619140625, "learning_rate": 0.0001, "loss": 5.5134, "loss/crossentropy": 2.514311134815216, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1532323658466339, "step": 15588 }, { "epoch": 0.7086363636363636, "grad_norm": 4.9375, "grad_norm_var": 0.016341145833333334, "learning_rate": 0.0001, "loss": 5.8021, "loss/crossentropy": 2.6875081658363342, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.16517141833901405, "step": 15590 }, { "epoch": 0.7087272727272728, "grad_norm": 4.5625, "grad_norm_var": 0.020296223958333335, "learning_rate": 0.0001, "loss": 5.6663, "loss/crossentropy": 2.5989599227905273, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16181223839521408, "step": 15592 }, { "epoch": 0.7088181818181818, "grad_norm": 4.6875, "grad_norm_var": 0.018229166666666668, "learning_rate": 0.0001, "loss": 5.7533, "loss/crossentropy": 2.680032193660736, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16240793466567993, "step": 15594 }, { "epoch": 0.7089090909090909, "grad_norm": 4.4375, "grad_norm_var": 0.029410807291666667, "learning_rate": 0.0001, "loss": 5.4916, "loss/crossentropy": 2.5397902131080627, "loss/hidden": 1.431640625, "loss/jsd": 0.0, "loss/logits": 0.15201909840106964, "step": 15596 }, { "epoch": 0.709, "grad_norm": 5.1875, "grad_norm_var": 0.03883056640625, "learning_rate": 0.0001, "loss": 5.6668, "loss/crossentropy": 2.6120806336402893, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.1607496291399002, "step": 15598 }, { "epoch": 0.7090909090909091, "grad_norm": 5.03125, "grad_norm_var": 0.04388020833333333, "learning_rate": 0.0001, "loss": 5.4504, "loss/crossentropy": 2.4572813510894775, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.1530195213854313, "step": 15600 }, { "epoch": 0.7091818181818181, "grad_norm": 4.1875, "grad_norm_var": 0.06482747395833334, "learning_rate": 0.0001, "loss": 5.2453, "loss/crossentropy": 2.348776936531067, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.14395192824304104, "step": 15602 }, { "epoch": 0.7092727272727273, "grad_norm": 5.375, "grad_norm_var": 0.09032796223958334, "learning_rate": 0.0001, "loss": 6.0354, "loss/crossentropy": 2.827039420604706, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.17415333166718483, "step": 15604 }, { "epoch": 0.7093636363636364, "grad_norm": 5.65625, "grad_norm_var": 0.13058268229166667, "learning_rate": 0.0001, "loss": 5.8301, "loss/crossentropy": 2.6645882725715637, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16772745177149773, "step": 15606 }, { "epoch": 0.7094545454545454, "grad_norm": 4.78125, "grad_norm_var": 0.12688802083333334, "learning_rate": 0.0001, "loss": 4.6981, "loss/crossentropy": 1.9633700549602509, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.12971938960254192, "step": 15608 }, { "epoch": 0.7095454545454546, "grad_norm": 5.03125, "grad_norm_var": 0.13917643229166668, "learning_rate": 0.0001, "loss": 5.9201, "loss/crossentropy": 2.751846432685852, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16858338937163353, "step": 15610 }, { "epoch": 0.7096363636363636, "grad_norm": 4.9375, "grad_norm_var": 0.10836181640625, "learning_rate": 0.0001, "loss": 5.8382, "loss/crossentropy": 2.7755603194236755, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.1599709391593933, "step": 15612 }, { "epoch": 0.7097272727272728, "grad_norm": 5.25, "grad_norm_var": 0.11144205729166666, "learning_rate": 0.0001, "loss": 5.5265, "loss/crossentropy": 2.380308896303177, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16071725264191628, "step": 15614 }, { "epoch": 0.7098181818181818, "grad_norm": 5.46875, "grad_norm_var": 0.15579427083333333, "learning_rate": 0.0001, "loss": 5.6127, "loss/crossentropy": 2.473143458366394, "loss/hidden": 1.509765625, "loss/jsd": 0.0, "loss/logits": 0.16298267990350723, "step": 15616 }, { "epoch": 0.7099090909090909, "grad_norm": 5.8125, "grad_norm_var": 0.123291015625, "learning_rate": 0.0001, "loss": 5.8719, "loss/crossentropy": 2.7055364847183228, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16663656383752823, "step": 15618 }, { "epoch": 0.71, "grad_norm": 4.75, "grad_norm_var": 0.131884765625, "learning_rate": 0.0001, "loss": 5.5019, "loss/crossentropy": 2.4997122287750244, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15392785891890526, "step": 15620 }, { "epoch": 0.7100909090909091, "grad_norm": 5.125, "grad_norm_var": 0.12994791666666666, "learning_rate": 0.0001, "loss": 5.5859, "loss/crossentropy": 2.527077376842499, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15959575027227402, "step": 15622 }, { "epoch": 0.7101818181818181, "grad_norm": 4.71875, "grad_norm_var": 0.12825113932291668, "learning_rate": 0.0001, "loss": 5.3194, "loss/crossentropy": 2.3667706847190857, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.14917240291833878, "step": 15624 }, { "epoch": 0.7102727272727273, "grad_norm": 5.4375, "grad_norm_var": 19.279931640625, "learning_rate": 0.0001, "loss": 5.8326, "loss/crossentropy": 2.250419497489929, "loss/hidden": 1.544921875, "loss/jsd": 0.0, "loss/logits": 0.2037237025797367, "step": 15626 }, { "epoch": 0.7103636363636363, "grad_norm": 5.34375, "grad_norm_var": 19.13218994140625, "learning_rate": 0.0001, "loss": 5.8475, "loss/crossentropy": 2.6457881331443787, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17056390270590782, "step": 15628 }, { "epoch": 0.7104545454545454, "grad_norm": 5.09375, "grad_norm_var": 19.252372233072915, "learning_rate": 0.0001, "loss": 5.1355, "loss/crossentropy": 2.2628673017024994, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.14175518974661827, "step": 15630 }, { "epoch": 0.7105454545454546, "grad_norm": 5.0625, "grad_norm_var": 19.359619140625, "learning_rate": 0.0001, "loss": 5.4131, "loss/crossentropy": 2.4183364510536194, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15142665058374405, "step": 15632 }, { "epoch": 0.7106363636363636, "grad_norm": 5.0625, "grad_norm_var": 19.538895670572916, "learning_rate": 0.0001, "loss": 5.1302, "loss/crossentropy": 2.3075363636016846, "loss/hidden": 1.439453125, "loss/jsd": 0.0, "loss/logits": 0.13832053169608116, "step": 15634 }, { "epoch": 0.7107272727272728, "grad_norm": 4.84375, "grad_norm_var": 19.599609375, "learning_rate": 0.0001, "loss": 5.5201, "loss/crossentropy": 2.493843972682953, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1559484302997589, "step": 15636 }, { "epoch": 0.7108181818181818, "grad_norm": 4.59375, "grad_norm_var": 19.634114583333332, "learning_rate": 0.0001, "loss": 5.4204, "loss/crossentropy": 2.441509783267975, "loss/hidden": 1.439453125, "loss/jsd": 0.0, "loss/logits": 0.15394264832139015, "step": 15638 }, { "epoch": 0.7109090909090909, "grad_norm": 4.4375, "grad_norm_var": 19.624202473958334, "learning_rate": 0.0001, "loss": 4.94, "loss/crossentropy": 2.017343431711197, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.14109859243035316, "step": 15640 }, { "epoch": 0.711, "grad_norm": 4.90625, "grad_norm_var": 0.13683268229166667, "learning_rate": 0.0001, "loss": 5.4056, "loss/crossentropy": 2.3564905524253845, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.15315302461385727, "step": 15642 }, { "epoch": 0.7110909090909091, "grad_norm": 4.84375, "grad_norm_var": 0.10245768229166667, "learning_rate": 0.0001, "loss": 5.2974, "loss/crossentropy": 2.350173592567444, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.1488200817257166, "step": 15644 }, { "epoch": 0.7111818181818181, "grad_norm": 4.96875, "grad_norm_var": 0.09830322265625, "learning_rate": 0.0001, "loss": 5.4829, "loss/crossentropy": 2.4564724564552307, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1565503366291523, "step": 15646 }, { "epoch": 0.7112727272727273, "grad_norm": 4.90625, "grad_norm_var": 0.08756103515625, "learning_rate": 0.0001, "loss": 5.5522, "loss/crossentropy": 2.4978992342948914, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1597272902727127, "step": 15648 }, { "epoch": 0.7113636363636363, "grad_norm": 5.625, "grad_norm_var": 0.114306640625, "learning_rate": 0.0001, "loss": 5.6716, "loss/crossentropy": 2.4993537068367004, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.16741949319839478, "step": 15650 }, { "epoch": 0.7114545454545455, "grad_norm": 5.28125, "grad_norm_var": 0.12646077473958334, "learning_rate": 0.0001, "loss": 5.8234, "loss/crossentropy": 2.6962907910346985, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16563793271780014, "step": 15652 }, { "epoch": 0.7115454545454546, "grad_norm": 4.84375, "grad_norm_var": 0.12376302083333333, "learning_rate": 0.0001, "loss": 5.7971, "loss/crossentropy": 2.6471580266952515, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16792718321084976, "step": 15654 }, { "epoch": 0.7116363636363636, "grad_norm": 4.6875, "grad_norm_var": 0.1060546875, "learning_rate": 0.0001, "loss": 5.0935, "loss/crossentropy": 2.1606865525245667, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.14289012551307678, "step": 15656 }, { "epoch": 0.7117272727272728, "grad_norm": 4.90625, "grad_norm_var": 0.09720052083333333, "learning_rate": 0.0001, "loss": 5.5497, "loss/crossentropy": 2.494159758090973, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15887797623872757, "step": 15658 }, { "epoch": 0.7118181818181818, "grad_norm": 4.375, "grad_norm_var": 0.08876546223958333, "learning_rate": 0.0001, "loss": 5.2286, "loss/crossentropy": 2.3503342866897583, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.1460295207798481, "step": 15660 }, { "epoch": 0.7119090909090909, "grad_norm": 5.1875, "grad_norm_var": 0.09581705729166666, "learning_rate": 0.0001, "loss": 5.5929, "loss/crossentropy": 2.523803949356079, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15945138782262802, "step": 15662 }, { "epoch": 0.712, "grad_norm": 4.8125, "grad_norm_var": 0.08918863932291667, "learning_rate": 0.0001, "loss": 5.4746, "loss/crossentropy": 2.384294033050537, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1611829176545143, "step": 15664 }, { "epoch": 0.7120909090909091, "grad_norm": 5.5, "grad_norm_var": 0.07884114583333333, "learning_rate": 0.0001, "loss": 5.702, "loss/crossentropy": 2.595898449420929, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.16198061779141426, "step": 15666 }, { "epoch": 0.7121818181818181, "grad_norm": 5.40625, "grad_norm_var": 0.0986328125, "learning_rate": 0.0001, "loss": 5.4756, "loss/crossentropy": 2.4041056036949158, "loss/hidden": 1.513671875, "loss/jsd": 0.0, "loss/logits": 0.15578315034508705, "step": 15668 }, { "epoch": 0.7122727272727273, "grad_norm": 4.84375, "grad_norm_var": 0.09820556640625, "learning_rate": 0.0001, "loss": 5.4407, "loss/crossentropy": 2.469397008419037, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15084585919976234, "step": 15670 }, { "epoch": 0.7123636363636363, "grad_norm": 4.75, "grad_norm_var": 0.09153238932291667, "learning_rate": 0.0001, "loss": 5.8647, "loss/crossentropy": 2.705290198326111, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1678973287343979, "step": 15672 }, { "epoch": 0.7124545454545455, "grad_norm": 4.65625, "grad_norm_var": 0.10520833333333333, "learning_rate": 0.0001, "loss": 5.3773, "loss/crossentropy": 2.38366636633873, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.1503419540822506, "step": 15674 }, { "epoch": 0.7125454545454546, "grad_norm": 4.84375, "grad_norm_var": 0.07496337890625, "learning_rate": 0.0001, "loss": 5.5163, "loss/crossentropy": 2.4541679322719574, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15777188539505005, "step": 15676 }, { "epoch": 0.7126363636363636, "grad_norm": 5.21875, "grad_norm_var": 0.090087890625, "learning_rate": 0.0001, "loss": 5.549, "loss/crossentropy": 2.4704172611236572, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15922529622912407, "step": 15678 }, { "epoch": 0.7127272727272728, "grad_norm": 4.90625, "grad_norm_var": 0.08826497395833334, "learning_rate": 0.0001, "loss": 5.627, "loss/crossentropy": 2.5248828530311584, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16294502094388008, "step": 15680 }, { "epoch": 0.7128181818181818, "grad_norm": 4.625, "grad_norm_var": 0.08485921223958333, "learning_rate": 0.0001, "loss": 5.1318, "loss/crossentropy": 2.2395263612270355, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.14254742115736008, "step": 15682 }, { "epoch": 0.7129090909090909, "grad_norm": 4.96875, "grad_norm_var": 0.04657796223958333, "learning_rate": 0.0001, "loss": 5.2206, "loss/crossentropy": 2.2544912099838257, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1501246765255928, "step": 15684 }, { "epoch": 0.713, "grad_norm": 5.09375, "grad_norm_var": 0.05282796223958333, "learning_rate": 0.0001, "loss": 5.3038, "loss/crossentropy": 2.3537960052490234, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.14832548052072525, "step": 15686 }, { "epoch": 0.7130909090909091, "grad_norm": 4.28125, "grad_norm_var": 0.07185872395833333, "learning_rate": 0.0001, "loss": 5.027, "loss/crossentropy": 2.2018037736415863, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.13799218088388443, "step": 15688 }, { "epoch": 0.7131818181818181, "grad_norm": 4.6875, "grad_norm_var": 0.06261393229166666, "learning_rate": 0.0001, "loss": 5.7087, "loss/crossentropy": 2.6209859251976013, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.16169636324048042, "step": 15690 }, { "epoch": 0.7132727272727273, "grad_norm": 4.90625, "grad_norm_var": 0.063916015625, "learning_rate": 0.0001, "loss": 5.5508, "loss/crossentropy": 2.503214716911316, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15709742903709412, "step": 15692 }, { "epoch": 0.7133636363636363, "grad_norm": 4.78125, "grad_norm_var": 0.047652180989583334, "learning_rate": 0.0001, "loss": 5.6812, "loss/crossentropy": 2.5777087807655334, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16249596700072289, "step": 15694 }, { "epoch": 0.7134545454545455, "grad_norm": 5.03125, "grad_norm_var": 0.06516927083333333, "learning_rate": 0.0001, "loss": 5.7598, "loss/crossentropy": 2.5827746987342834, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1680980995297432, "step": 15696 }, { "epoch": 0.7135454545454546, "grad_norm": 4.8125, "grad_norm_var": 0.06106770833333333, "learning_rate": 0.0001, "loss": 5.4687, "loss/crossentropy": 2.3855721950531006, "loss/hidden": 1.517578125, "loss/jsd": 0.0, "loss/logits": 0.1565561629831791, "step": 15698 }, { "epoch": 0.7136363636363636, "grad_norm": 4.4375, "grad_norm_var": 0.070556640625, "learning_rate": 0.0001, "loss": 5.4114, "loss/crossentropy": 2.409366011619568, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15235665440559387, "step": 15700 }, { "epoch": 0.7137272727272728, "grad_norm": 4.9375, "grad_norm_var": 0.0841796875, "learning_rate": 0.0001, "loss": 5.7811, "loss/crossentropy": 2.6386348605155945, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16971943899989128, "step": 15702 }, { "epoch": 0.7138181818181818, "grad_norm": 4.75, "grad_norm_var": 0.06717122395833333, "learning_rate": 0.0001, "loss": 5.7586, "loss/crossentropy": 2.6272735595703125, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.16567456349730492, "step": 15704 }, { "epoch": 0.7139090909090909, "grad_norm": 4.375, "grad_norm_var": 0.06847330729166666, "learning_rate": 0.0001, "loss": 5.4803, "loss/crossentropy": 2.5135130882263184, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.14999553561210632, "step": 15706 }, { "epoch": 0.714, "grad_norm": 5.15625, "grad_norm_var": 0.081640625, "learning_rate": 0.0001, "loss": 5.7225, "loss/crossentropy": 2.5747000575065613, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16575592756271362, "step": 15708 }, { "epoch": 0.7140909090909091, "grad_norm": 4.84375, "grad_norm_var": 0.09452718098958333, "learning_rate": 0.0001, "loss": 5.7716, "loss/crossentropy": 2.6179002821445465, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.1679079867899418, "step": 15710 }, { "epoch": 0.7141818181818181, "grad_norm": 4.625, "grad_norm_var": 0.0771484375, "learning_rate": 0.0001, "loss": 5.4709, "loss/crossentropy": 2.472455322742462, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.15395011752843857, "step": 15712 }, { "epoch": 0.7142727272727273, "grad_norm": 5.1875, "grad_norm_var": 0.08804931640625, "learning_rate": 0.0001, "loss": 5.5027, "loss/crossentropy": 2.455199122428894, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15748256072402, "step": 15714 }, { "epoch": 0.7143636363636363, "grad_norm": 4.84375, "grad_norm_var": 0.08982747395833333, "learning_rate": 0.0001, "loss": 5.3749, "loss/crossentropy": 2.3318116664886475, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15586841106414795, "step": 15716 }, { "epoch": 0.7144545454545455, "grad_norm": 4.78125, "grad_norm_var": 0.070556640625, "learning_rate": 0.0001, "loss": 5.9541, "loss/crossentropy": 2.800969183444977, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.16902295500040054, "step": 15718 }, { "epoch": 0.7145454545454546, "grad_norm": 4.65625, "grad_norm_var": 0.072119140625, "learning_rate": 0.0001, "loss": 5.2203, "loss/crossentropy": 2.2693081200122833, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.14900407940149307, "step": 15720 }, { "epoch": 0.7146363636363636, "grad_norm": 4.78125, "grad_norm_var": 0.11178385416666667, "learning_rate": 0.0001, "loss": 5.6426, "loss/crossentropy": 2.4379906356334686, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17163247615098953, "step": 15722 }, { "epoch": 0.7147272727272728, "grad_norm": 4.84375, "grad_norm_var": 0.114453125, "learning_rate": 0.0001, "loss": 5.1839, "loss/crossentropy": 2.2764680981636047, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.14563005790114403, "step": 15724 }, { "epoch": 0.7148181818181818, "grad_norm": 4.75, "grad_norm_var": 0.11578369140625, "learning_rate": 0.0001, "loss": 5.6739, "loss/crossentropy": 2.645602762699127, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.15614702180027962, "step": 15726 }, { "epoch": 0.7149090909090909, "grad_norm": 4.59375, "grad_norm_var": 0.116650390625, "learning_rate": 0.0001, "loss": 5.4347, "loss/crossentropy": 2.463605523109436, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15413781628012657, "step": 15728 }, { "epoch": 0.715, "grad_norm": 4.96875, "grad_norm_var": 0.11925455729166666, "learning_rate": 0.0001, "loss": 5.4257, "loss/crossentropy": 2.451568454504013, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15112631022930145, "step": 15730 }, { "epoch": 0.7150909090909091, "grad_norm": 7.65625, "grad_norm_var": 0.6070963541666666, "learning_rate": 0.0001, "loss": 5.6736, "loss/crossentropy": 2.4936853647232056, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16916536167263985, "step": 15732 }, { "epoch": 0.7151818181818181, "grad_norm": 4.96875, "grad_norm_var": 0.6050130208333333, "learning_rate": 0.0001, "loss": 5.4804, "loss/crossentropy": 2.3920748829841614, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.15902535989880562, "step": 15734 }, { "epoch": 0.7152727272727273, "grad_norm": 4.9375, "grad_norm_var": 0.601171875, "learning_rate": 0.0001, "loss": 5.5009, "loss/crossentropy": 2.4467357993125916, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15698335319757462, "step": 15736 }, { "epoch": 0.7153636363636363, "grad_norm": 4.46875, "grad_norm_var": 0.57281494140625, "learning_rate": 0.0001, "loss": 5.0578, "loss/crossentropy": 2.1600789427757263, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.14309093914926052, "step": 15738 }, { "epoch": 0.7154545454545455, "grad_norm": 4.46875, "grad_norm_var": 0.5752604166666667, "learning_rate": 0.0001, "loss": 5.4519, "loss/crossentropy": 2.466528445482254, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15400993451476097, "step": 15740 }, { "epoch": 0.7155454545454546, "grad_norm": 4.4375, "grad_norm_var": 0.58984375, "learning_rate": 0.0001, "loss": 5.3692, "loss/crossentropy": 2.384841740131378, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15097802132368088, "step": 15742 }, { "epoch": 0.7156363636363636, "grad_norm": 5.03125, "grad_norm_var": 0.5817545572916667, "learning_rate": 0.0001, "loss": 5.4355, "loss/crossentropy": 2.3977039456367493, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.15788142755627632, "step": 15744 }, { "epoch": 0.7157272727272728, "grad_norm": 4.90625, "grad_norm_var": 0.615625, "learning_rate": 0.0001, "loss": 5.6271, "loss/crossentropy": 2.531294882297516, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16133777052164078, "step": 15746 }, { "epoch": 0.7158181818181818, "grad_norm": 4.84375, "grad_norm_var": 0.13437093098958333, "learning_rate": 0.0001, "loss": 5.3427, "loss/crossentropy": 2.375728726387024, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.1496226117014885, "step": 15748 }, { "epoch": 0.7159090909090909, "grad_norm": 4.25, "grad_norm_var": 0.14654947916666666, "learning_rate": 0.0001, "loss": 5.3116, "loss/crossentropy": 2.41439750790596, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1451937910169363, "step": 15750 }, { "epoch": 0.716, "grad_norm": 4.53125, "grad_norm_var": 0.15182291666666667, "learning_rate": 0.0001, "loss": 5.1823, "loss/crossentropy": 2.2704721689224243, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.1456742323935032, "step": 15752 }, { "epoch": 0.7160909090909091, "grad_norm": 4.90625, "grad_norm_var": 0.15546875, "learning_rate": 0.0001, "loss": 5.6165, "loss/crossentropy": 2.503541350364685, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16285444423556328, "step": 15754 }, { "epoch": 0.7161818181818181, "grad_norm": 4.5, "grad_norm_var": 0.153759765625, "learning_rate": 0.0001, "loss": 5.5368, "loss/crossentropy": 2.4812952876091003, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1590622067451477, "step": 15756 }, { "epoch": 0.7162727272727273, "grad_norm": 4.90625, "grad_norm_var": 0.146337890625, "learning_rate": 0.0001, "loss": 5.4592, "loss/crossentropy": 2.489649176597595, "loss/hidden": 1.419921875, "loss/jsd": 0.0, "loss/logits": 0.1549597680568695, "step": 15758 }, { "epoch": 0.7163636363636363, "grad_norm": 5.25, "grad_norm_var": 0.15963541666666667, "learning_rate": 0.0001, "loss": 5.5423, "loss/crossentropy": 2.487386107444763, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.1595892757177353, "step": 15760 }, { "epoch": 0.7164545454545455, "grad_norm": 4.78125, "grad_norm_var": 0.07877197265625, "learning_rate": 0.0001, "loss": 5.3218, "loss/crossentropy": 2.359964907169342, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.15028223022818565, "step": 15762 }, { "epoch": 0.7165454545454546, "grad_norm": 5.0625, "grad_norm_var": 0.10545247395833333, "learning_rate": 0.0001, "loss": 5.377, "loss/crossentropy": 2.3609440326690674, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1535632535815239, "step": 15764 }, { "epoch": 0.7166363636363636, "grad_norm": 5.34375, "grad_norm_var": 0.10536702473958333, "learning_rate": 0.0001, "loss": 5.8487, "loss/crossentropy": 2.6656216979026794, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1671365574002266, "step": 15766 }, { "epoch": 0.7167272727272728, "grad_norm": 5.34375, "grad_norm_var": 0.10155843098958334, "learning_rate": 0.0001, "loss": 5.8817, "loss/crossentropy": 2.7271727919578552, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16857516020536423, "step": 15768 }, { "epoch": 0.7168181818181818, "grad_norm": 4.6875, "grad_norm_var": 0.10123697916666667, "learning_rate": 0.0001, "loss": 5.1993, "loss/crossentropy": 2.301594167947769, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.14543049409985542, "step": 15770 }, { "epoch": 0.7169090909090909, "grad_norm": 5.4375, "grad_norm_var": 0.112353515625, "learning_rate": 0.0001, "loss": 5.6258, "loss/crossentropy": 2.5459945797920227, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16188785433769226, "step": 15772 }, { "epoch": 0.717, "grad_norm": 4.4375, "grad_norm_var": 0.10826416015625, "learning_rate": 0.0001, "loss": 5.4562, "loss/crossentropy": 2.472679555416107, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.1532338261604309, "step": 15774 }, { "epoch": 0.7170909090909091, "grad_norm": 5.0625, "grad_norm_var": 0.09505208333333333, "learning_rate": 0.0001, "loss": 5.7895, "loss/crossentropy": 2.6711838245391846, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16534532234072685, "step": 15776 }, { "epoch": 0.7171818181818181, "grad_norm": 4.9375, "grad_norm_var": 0.07899983723958333, "learning_rate": 0.0001, "loss": 5.5909, "loss/crossentropy": 2.4649124145507812, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.16709371656179428, "step": 15778 }, { "epoch": 0.7172727272727273, "grad_norm": 4.8125, "grad_norm_var": 0.08658447265625, "learning_rate": 0.0001, "loss": 5.7461, "loss/crossentropy": 2.6803303956985474, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16204624995589256, "step": 15780 }, { "epoch": 0.7173636363636363, "grad_norm": 4.9375, "grad_norm_var": 0.08131103515625, "learning_rate": 0.0001, "loss": 5.4688, "loss/crossentropy": 2.4383515119552612, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.1567559316754341, "step": 15782 }, { "epoch": 0.7174545454545455, "grad_norm": 4.53125, "grad_norm_var": 0.08138020833333333, "learning_rate": 0.0001, "loss": 5.5328, "loss/crossentropy": 2.5348410606384277, "loss/hidden": 1.419921875, "loss/jsd": 0.0, "loss/logits": 0.15780147165060043, "step": 15784 }, { "epoch": 0.7175454545454546, "grad_norm": 4.75, "grad_norm_var": 0.08362223307291666, "learning_rate": 0.0001, "loss": 5.3137, "loss/crossentropy": 2.383007138967514, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.14834440127015114, "step": 15786 }, { "epoch": 0.7176363636363636, "grad_norm": 4.5, "grad_norm_var": 0.06750895182291666, "learning_rate": 0.0001, "loss": 5.1084, "loss/crossentropy": 2.1906117498874664, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1451025903224945, "step": 15788 }, { "epoch": 0.7177272727272728, "grad_norm": 4.65625, "grad_norm_var": 0.059619140625, "learning_rate": 0.0001, "loss": 5.5129, "loss/crossentropy": 2.4503175616264343, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16133873164653778, "step": 15790 }, { "epoch": 0.7178181818181818, "grad_norm": 4.6875, "grad_norm_var": 0.06298421223958334, "learning_rate": 0.0001, "loss": 5.5495, "loss/crossentropy": 2.4944825172424316, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15667686238884926, "step": 15792 }, { "epoch": 0.7179090909090909, "grad_norm": 4.3125, "grad_norm_var": 0.09816080729166667, "learning_rate": 0.0001, "loss": 4.4664, "loss/crossentropy": 1.8284907937049866, "loss/hidden": 1.38671875, "loss/jsd": 0.0, "loss/logits": 0.12511418014764786, "step": 15794 }, { "epoch": 0.718, "grad_norm": 5.625, "grad_norm_var": 0.12291666666666666, "learning_rate": 0.0001, "loss": 5.7657, "loss/crossentropy": 2.551359534263611, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.171829205006361, "step": 15796 }, { "epoch": 0.7180909090909091, "grad_norm": 4.625, "grad_norm_var": 0.13625895182291667, "learning_rate": 0.0001, "loss": 5.8115, "loss/crossentropy": 2.670877993106842, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16621258482336998, "step": 15798 }, { "epoch": 0.7181818181818181, "grad_norm": 4.59375, "grad_norm_var": 0.13515625, "learning_rate": 0.0001, "loss": 5.4417, "loss/crossentropy": 2.4613198041915894, "loss/hidden": 1.435546875, "loss/jsd": 0.0, "loss/logits": 0.1544867865741253, "step": 15800 }, { "epoch": 0.7182727272727273, "grad_norm": 4.0625, "grad_norm_var": 0.16484375, "learning_rate": 0.0001, "loss": 5.0508, "loss/crossentropy": 2.1977867782115936, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.13920745626091957, "step": 15802 }, { "epoch": 0.7183636363636363, "grad_norm": 5.0, "grad_norm_var": 0.16721598307291666, "learning_rate": 0.0001, "loss": 5.7393, "loss/crossentropy": 2.6539129614830017, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.15873290970921516, "step": 15804 }, { "epoch": 0.7184545454545455, "grad_norm": 4.8125, "grad_norm_var": 0.16799723307291667, "learning_rate": 0.0001, "loss": 5.6191, "loss/crossentropy": 2.567414879798889, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15771108493208885, "step": 15806 }, { "epoch": 0.7185454545454546, "grad_norm": 5.1875, "grad_norm_var": 0.17131754557291667, "learning_rate": 0.0001, "loss": 5.1438, "loss/crossentropy": 2.217124253511429, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.14676546305418015, "step": 15808 }, { "epoch": 0.7186363636363636, "grad_norm": 5.15625, "grad_norm_var": 0.12550455729166668, "learning_rate": 0.0001, "loss": 5.9929, "loss/crossentropy": 2.75366348028183, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.17373152077198029, "step": 15810 }, { "epoch": 0.7187272727272728, "grad_norm": 4.09375, "grad_norm_var": 0.13131510416666667, "learning_rate": 0.0001, "loss": 5.0222, "loss/crossentropy": 2.224964439868927, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.13753383606672287, "step": 15812 }, { "epoch": 0.7188181818181818, "grad_norm": 5.125, "grad_norm_var": 0.13580322265625, "learning_rate": 0.0001, "loss": 5.1041, "loss/crossentropy": 2.205590009689331, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.14434438198804855, "step": 15814 }, { "epoch": 0.7189090909090909, "grad_norm": 4.625, "grad_norm_var": 0.135009765625, "learning_rate": 0.0001, "loss": 4.817, "loss/crossentropy": 2.0134765207767487, "loss/hidden": 1.435546875, "loss/jsd": 0.0, "loss/logits": 0.13679314218461514, "step": 15816 }, { "epoch": 0.719, "grad_norm": 4.75, "grad_norm_var": 0.12753499348958333, "learning_rate": 0.0001, "loss": 5.2689, "loss/crossentropy": 2.23820224404335, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15424123406410217, "step": 15818 }, { "epoch": 0.7190909090909091, "grad_norm": 4.40625, "grad_norm_var": 0.13357747395833333, "learning_rate": 0.0001, "loss": 5.4678, "loss/crossentropy": 2.4790061712265015, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15396112948656082, "step": 15820 }, { "epoch": 0.7191818181818181, "grad_norm": 5.125, "grad_norm_var": 0.14195556640625, "learning_rate": 0.0001, "loss": 5.7557, "loss/crossentropy": 2.6110225319862366, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.16661401093006134, "step": 15822 }, { "epoch": 0.7192727272727273, "grad_norm": 4.71875, "grad_norm_var": 0.129931640625, "learning_rate": 0.0001, "loss": 5.76, "loss/crossentropy": 2.662331759929657, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16308654099702835, "step": 15824 }, { "epoch": 0.7193636363636363, "grad_norm": 4.53125, "grad_norm_var": 0.12226155598958334, "learning_rate": 0.0001, "loss": 5.3345, "loss/crossentropy": 2.344639539718628, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15152490511536598, "step": 15826 }, { "epoch": 0.7194545454545455, "grad_norm": 4.53125, "grad_norm_var": 0.08323160807291667, "learning_rate": 0.0001, "loss": 5.2129, "loss/crossentropy": 2.2787189185619354, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.14595308527350426, "step": 15828 }, { "epoch": 0.7195454545454546, "grad_norm": 4.71875, "grad_norm_var": 0.07102457682291667, "learning_rate": 0.0001, "loss": 5.184, "loss/crossentropy": 2.2709426283836365, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.14345046877861023, "step": 15830 }, { "epoch": 0.7196363636363636, "grad_norm": 4.5625, "grad_norm_var": 0.08209228515625, "learning_rate": 0.0001, "loss": 5.5739, "loss/crossentropy": 2.4833813905715942, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16061320528388023, "step": 15832 }, { "epoch": 0.7197272727272728, "grad_norm": 4.5, "grad_norm_var": 0.06744791666666666, "learning_rate": 0.0001, "loss": 5.0218, "loss/crossentropy": 2.138951152563095, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.14140916615724564, "step": 15834 }, { "epoch": 0.7198181818181818, "grad_norm": 5.09375, "grad_norm_var": 0.09034830729166667, "learning_rate": 0.0001, "loss": 5.5589, "loss/crossentropy": 2.5145879685878754, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15716120600700378, "step": 15836 }, { "epoch": 0.7199090909090909, "grad_norm": 4.625, "grad_norm_var": 0.08020833333333334, "learning_rate": 0.0001, "loss": 5.713, "loss/crossentropy": 2.610795259475708, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.16354496404528618, "step": 15838 }, { "epoch": 0.72, "grad_norm": 4.75, "grad_norm_var": 0.08404541015625, "learning_rate": 0.0001, "loss": 5.3356, "loss/crossentropy": 2.40803724527359, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.14901067316532135, "step": 15840 }, { "epoch": 0.7200909090909091, "grad_norm": 4.4375, "grad_norm_var": 0.08746337890625, "learning_rate": 0.0001, "loss": 5.3665, "loss/crossentropy": 2.4166762828826904, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.14947381243109703, "step": 15842 }, { "epoch": 0.7201818181818181, "grad_norm": 4.875, "grad_norm_var": 0.09482014973958333, "learning_rate": 0.0001, "loss": 5.3503, "loss/crossentropy": 2.4470048546791077, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.14657548069953918, "step": 15844 }, { "epoch": 0.7202727272727273, "grad_norm": 4.84375, "grad_norm_var": 0.08873697916666666, "learning_rate": 0.0001, "loss": 5.9786, "loss/crossentropy": 2.788360595703125, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.1711726263165474, "step": 15846 }, { "epoch": 0.7203636363636363, "grad_norm": 5.125, "grad_norm_var": 0.08179931640625, "learning_rate": 0.0001, "loss": 5.5508, "loss/crossentropy": 2.519824266433716, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15504658594727516, "step": 15848 }, { "epoch": 0.7204545454545455, "grad_norm": 4.71875, "grad_norm_var": 0.08814697265625, "learning_rate": 0.0001, "loss": 5.6179, "loss/crossentropy": 2.5467644333839417, "loss/hidden": 1.435546875, "loss/jsd": 0.0, "loss/logits": 0.16355577483773232, "step": 15850 }, { "epoch": 0.7205454545454546, "grad_norm": 5.75, "grad_norm_var": 0.11263020833333333, "learning_rate": 0.0001, "loss": 5.4082, "loss/crossentropy": 2.389118731021881, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15385904535651207, "step": 15852 }, { "epoch": 0.7206363636363636, "grad_norm": 4.6875, "grad_norm_var": 0.12782796223958334, "learning_rate": 0.0001, "loss": 4.9781, "loss/crossentropy": 2.1320849657058716, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.13948030397295952, "step": 15854 }, { "epoch": 0.7207272727272728, "grad_norm": 4.5625, "grad_norm_var": 0.13212483723958332, "learning_rate": 0.0001, "loss": 5.3161, "loss/crossentropy": 2.3750340342521667, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1487988419830799, "step": 15856 }, { "epoch": 0.7208181818181818, "grad_norm": 5.78125, "grad_norm_var": 0.183447265625, "learning_rate": 0.0001, "loss": 5.8637, "loss/crossentropy": 2.7136016488075256, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16657065600156784, "step": 15858 }, { "epoch": 0.7209090909090909, "grad_norm": 4.78125, "grad_norm_var": 0.164306640625, "learning_rate": 0.0001, "loss": 5.5907, "loss/crossentropy": 2.5576052069664, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.1574113741517067, "step": 15860 }, { "epoch": 0.721, "grad_norm": 4.8125, "grad_norm_var": 0.1658203125, "learning_rate": 0.0001, "loss": 5.3632, "loss/crossentropy": 2.3437901437282562, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15272707119584084, "step": 15862 }, { "epoch": 0.7210909090909091, "grad_norm": 4.0625, "grad_norm_var": 0.22421875, "learning_rate": 0.0001, "loss": 4.7518, "loss/crossentropy": 2.03557088971138, "loss/hidden": 1.419921875, "loss/jsd": 0.0, "loss/logits": 0.12962960824370384, "step": 15864 }, { "epoch": 0.7211818181818181, "grad_norm": 4.4375, "grad_norm_var": 0.22375895182291666, "learning_rate": 0.0001, "loss": 5.2382, "loss/crossentropy": 2.330113172531128, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.14784467592835426, "step": 15866 }, { "epoch": 0.7212727272727273, "grad_norm": 4.625, "grad_norm_var": 0.16861979166666666, "learning_rate": 0.0001, "loss": 5.5367, "loss/crossentropy": 2.5270906686782837, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1556437462568283, "step": 15868 }, { "epoch": 0.7213636363636363, "grad_norm": 5.03125, "grad_norm_var": 0.16652018229166668, "learning_rate": 0.0001, "loss": 5.63, "loss/crossentropy": 2.581632077693939, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16069117560982704, "step": 15870 }, { "epoch": 0.7214545454545455, "grad_norm": 4.1875, "grad_norm_var": 0.18411051432291667, "learning_rate": 0.0001, "loss": 5.4562, "loss/crossentropy": 2.513001322746277, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.14802901446819305, "step": 15872 }, { "epoch": 0.7215454545454546, "grad_norm": 4.03125, "grad_norm_var": 0.13020833333333334, "learning_rate": 0.0001, "loss": 5.2154, "loss/crossentropy": 2.324406385421753, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.14163901656866074, "step": 15874 }, { "epoch": 0.7216363636363636, "grad_norm": 4.90625, "grad_norm_var": 0.12720947265625, "learning_rate": 0.0001, "loss": 5.7849, "loss/crossentropy": 2.644643187522888, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16480235010385513, "step": 15876 }, { "epoch": 0.7217272727272728, "grad_norm": 4.625, "grad_norm_var": 0.13411051432291668, "learning_rate": 0.0001, "loss": 5.619, "loss/crossentropy": 2.539761781692505, "loss/hidden": 1.443359375, "loss/jsd": 0.0, "loss/logits": 0.163586113601923, "step": 15878 }, { "epoch": 0.7218181818181818, "grad_norm": 5.03125, "grad_norm_var": 0.112109375, "learning_rate": 0.0001, "loss": 5.9692, "loss/crossentropy": 2.753280222415924, "loss/hidden": 1.498046875, "loss/jsd": 0.0, "loss/logits": 0.17178421095013618, "step": 15880 }, { "epoch": 0.721909090909091, "grad_norm": 5.0, "grad_norm_var": 0.12584228515625, "learning_rate": 0.0001, "loss": 5.4681, "loss/crossentropy": 2.5153728127479553, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.14976193755865097, "step": 15882 }, { "epoch": 0.722, "grad_norm": 4.46875, "grad_norm_var": 0.140478515625, "learning_rate": 0.0001, "loss": 5.5173, "loss/crossentropy": 2.4675386548042297, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.15556345134973526, "step": 15884 }, { "epoch": 0.7220909090909091, "grad_norm": 4.6875, "grad_norm_var": 0.13111572265625, "learning_rate": 0.0001, "loss": 5.2423, "loss/crossentropy": 2.2885093986988068, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.14791834354400635, "step": 15886 }, { "epoch": 0.7221818181818181, "grad_norm": 4.625, "grad_norm_var": 0.11119791666666666, "learning_rate": 0.0001, "loss": 5.1748, "loss/crossentropy": 2.267673671245575, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.14774850755929947, "step": 15888 }, { "epoch": 0.7222727272727273, "grad_norm": 5.0625, "grad_norm_var": 0.07366129557291666, "learning_rate": 0.0001, "loss": 5.7464, "loss/crossentropy": 2.6481314301490784, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.16392898559570312, "step": 15890 }, { "epoch": 0.7223636363636363, "grad_norm": 4.96875, "grad_norm_var": 0.12381184895833333, "learning_rate": 0.0001, "loss": 5.5822, "loss/crossentropy": 2.5348499417304993, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15629855543375015, "step": 15892 }, { "epoch": 0.7224545454545455, "grad_norm": 4.5625, "grad_norm_var": 0.12851155598958333, "learning_rate": 0.0001, "loss": 5.4838, "loss/crossentropy": 2.5112680792808533, "loss/hidden": 1.439453125, "loss/jsd": 0.0, "loss/logits": 0.1533091738820076, "step": 15894 }, { "epoch": 0.7225454545454545, "grad_norm": 4.96875, "grad_norm_var": 0.13118489583333334, "learning_rate": 0.0001, "loss": 5.4108, "loss/crossentropy": 2.4064126014709473, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15278209000825882, "step": 15896 }, { "epoch": 0.7226363636363636, "grad_norm": 4.71875, "grad_norm_var": 0.11073811848958333, "learning_rate": 0.0001, "loss": 5.3406, "loss/crossentropy": 2.377955198287964, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.14860576391220093, "step": 15898 }, { "epoch": 0.7227272727272728, "grad_norm": 4.625, "grad_norm_var": 0.10358072916666666, "learning_rate": 0.0001, "loss": 4.9014, "loss/crossentropy": 2.0927498042583466, "loss/hidden": 1.439453125, "loss/jsd": 0.0, "loss/logits": 0.13691600039601326, "step": 15900 }, { "epoch": 0.7228181818181818, "grad_norm": 5.15625, "grad_norm_var": 0.12161051432291667, "learning_rate": 0.0001, "loss": 5.4064, "loss/crossentropy": 2.424302726984024, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.1527002714574337, "step": 15902 }, { "epoch": 0.722909090909091, "grad_norm": 5.03125, "grad_norm_var": 0.1263671875, "learning_rate": 0.0001, "loss": 5.7855, "loss/crossentropy": 2.5847994685173035, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1712394878268242, "step": 15904 }, { "epoch": 0.723, "grad_norm": 4.40625, "grad_norm_var": 0.145703125, "learning_rate": 0.0001, "loss": 4.9467, "loss/crossentropy": 2.1086485385894775, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.13615125603973866, "step": 15906 }, { "epoch": 0.7230909090909091, "grad_norm": 5.09375, "grad_norm_var": 0.08092041015625, "learning_rate": 0.0001, "loss": 5.2134, "loss/crossentropy": 2.212599992752075, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1528145968914032, "step": 15908 }, { "epoch": 0.7231818181818181, "grad_norm": 4.1875, "grad_norm_var": 0.11278889973958334, "learning_rate": 0.0001, "loss": 5.2045, "loss/crossentropy": 2.2828786969184875, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1448955461382866, "step": 15910 }, { "epoch": 0.7232727272727273, "grad_norm": 4.78125, "grad_norm_var": 0.13876546223958333, "learning_rate": 0.0001, "loss": 5.1336, "loss/crossentropy": 2.2595453560352325, "loss/hidden": 1.458984375, "loss/jsd": 0.0, "loss/logits": 0.14150341227650642, "step": 15912 }, { "epoch": 0.7233636363636363, "grad_norm": 4.25, "grad_norm_var": 0.15370686848958334, "learning_rate": 0.0001, "loss": 5.3998, "loss/crossentropy": 2.404416084289551, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15324760600924492, "step": 15914 }, { "epoch": 0.7234545454545455, "grad_norm": 4.5625, "grad_norm_var": 0.159375, "learning_rate": 0.0001, "loss": 5.5798, "loss/crossentropy": 2.5414194464683533, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15637391433119774, "step": 15916 }, { "epoch": 0.7235454545454545, "grad_norm": 4.78125, "grad_norm_var": 0.15467122395833333, "learning_rate": 0.0001, "loss": 5.9942, "loss/crossentropy": 2.792307198047638, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.1738957017660141, "step": 15918 }, { "epoch": 0.7236363636363636, "grad_norm": 4.9375, "grad_norm_var": 0.14462483723958333, "learning_rate": 0.0001, "loss": 5.3479, "loss/crossentropy": 2.3572131395339966, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1518065370619297, "step": 15920 }, { "epoch": 0.7237272727272728, "grad_norm": 5.375, "grad_norm_var": 0.15221354166666667, "learning_rate": 0.0001, "loss": 5.7168, "loss/crossentropy": 2.6422542929649353, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16018719598650932, "step": 15922 }, { "epoch": 0.7238181818181818, "grad_norm": 5.03125, "grad_norm_var": 0.19440104166666666, "learning_rate": 0.0001, "loss": 5.4162, "loss/crossentropy": 2.372770607471466, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.15688436850905418, "step": 15924 }, { "epoch": 0.723909090909091, "grad_norm": 4.65625, "grad_norm_var": 0.15709228515625, "learning_rate": 0.0001, "loss": 5.3731, "loss/crossentropy": 2.383422166109085, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15052670240402222, "step": 15926 }, { "epoch": 0.724, "grad_norm": 5.28125, "grad_norm_var": 0.8003865559895833, "learning_rate": 0.0001, "loss": 5.4588, "loss/crossentropy": 2.494476556777954, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1495569720864296, "step": 15928 }, { "epoch": 0.7240909090909091, "grad_norm": 4.625, "grad_norm_var": 0.7597493489583333, "learning_rate": 0.0001, "loss": 5.2746, "loss/crossentropy": 2.2567934095859528, "loss/hidden": 1.486328125, "loss/jsd": 0.0, "loss/logits": 0.15315201506018639, "step": 15930 }, { "epoch": 0.7241818181818181, "grad_norm": 5.375, "grad_norm_var": 1.1724609375, "learning_rate": 0.0001, "loss": 5.5956, "loss/crossentropy": 2.5074672996997833, "loss/hidden": 1.474609375, "loss/jsd": 0.0, "loss/logits": 0.16135511547327042, "step": 15932 }, { "epoch": 0.7242727272727273, "grad_norm": 4.84375, "grad_norm_var": 1.1837239583333334, "learning_rate": 0.0001, "loss": 5.375, "loss/crossentropy": 2.4087381958961487, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.148965023458004, "step": 15934 }, { "epoch": 0.7243636363636363, "grad_norm": 5.25, "grad_norm_var": 1.173046875, "learning_rate": 0.0001, "loss": 5.2738, "loss/crossentropy": 2.3004143238067627, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1479254625737667, "step": 15936 }, { "epoch": 0.7244545454545455, "grad_norm": 4.75, "grad_norm_var": 1.194921875, "learning_rate": 0.0001, "loss": 5.6573, "loss/crossentropy": 2.6021387577056885, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1594252921640873, "step": 15938 }, { "epoch": 0.7245454545454545, "grad_norm": 4.59375, "grad_norm_var": 1.243603515625, "learning_rate": 0.0001, "loss": 5.354, "loss/crossentropy": 2.35964235663414, "loss/hidden": 1.462890625, "loss/jsd": 0.0, "loss/logits": 0.15314973518252373, "step": 15940 }, { "epoch": 0.7246363636363636, "grad_norm": 4.96875, "grad_norm_var": 1.2108072916666666, "learning_rate": 0.0001, "loss": 5.2319, "loss/crossentropy": 2.2895249724388123, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.14619353041052818, "step": 15942 }, { "epoch": 0.7247272727272728, "grad_norm": 4.75, "grad_norm_var": 0.6489217122395833, "learning_rate": 0.0001, "loss": 5.7908, "loss/crossentropy": 2.6577311754226685, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1636965498328209, "step": 15944 }, { "epoch": 0.7248181818181818, "grad_norm": 4.90625, "grad_norm_var": 0.6438761393229167, "learning_rate": 0.0001, "loss": 5.5018, "loss/crossentropy": 2.4808125495910645, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15444353222846985, "step": 15946 }, { "epoch": 0.724909090909091, "grad_norm": 6.15625, "grad_norm_var": 0.19967041015625, "learning_rate": 0.0001, "loss": 5.954, "loss/crossentropy": 2.744844079017639, "loss/hidden": 1.494140625, "loss/jsd": 0.0, "loss/logits": 0.1715031899511814, "step": 15948 }, { "epoch": 0.725, "grad_norm": 4.5625, "grad_norm_var": 0.15935872395833334, "learning_rate": 0.0001, "loss": 5.1784, "loss/crossentropy": 2.285864621400833, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.14159744419157505, "step": 15950 }, { "epoch": 0.7250909090909091, "grad_norm": 4.9375, "grad_norm_var": 0.15325520833333334, "learning_rate": 0.0001, "loss": 5.4346, "loss/crossentropy": 2.422718584537506, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15294532477855682, "step": 15952 }, { "epoch": 0.7251818181818181, "grad_norm": 4.375, "grad_norm_var": 0.17105712890625, "learning_rate": 0.0001, "loss": 5.2422, "loss/crossentropy": 2.276267796754837, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15166967175900936, "step": 15954 }, { "epoch": 0.7252727272727273, "grad_norm": 4.625, "grad_norm_var": 0.169775390625, "learning_rate": 0.0001, "loss": 5.3785, "loss/crossentropy": 2.415549099445343, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.14922702312469482, "step": 15956 }, { "epoch": 0.7253636363636363, "grad_norm": 4.8125, "grad_norm_var": 0.17408854166666668, "learning_rate": 0.0001, "loss": 5.2799, "loss/crossentropy": 2.3313016295433044, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.1501355953514576, "step": 15958 }, { "epoch": 0.7254545454545455, "grad_norm": 5.0625, "grad_norm_var": 0.19156494140625, "learning_rate": 0.0001, "loss": 5.6716, "loss/crossentropy": 2.562635898590088, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16323670372366905, "step": 15960 }, { "epoch": 0.7255454545454545, "grad_norm": 4.6875, "grad_norm_var": 0.20536702473958332, "learning_rate": 0.0001, "loss": 5.7133, "loss/crossentropy": 2.6238372921943665, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.164022795855999, "step": 15962 }, { "epoch": 0.7256363636363636, "grad_norm": 4.65625, "grad_norm_var": 0.07610677083333334, "learning_rate": 0.0001, "loss": 5.6773, "loss/crossentropy": 2.5206223726272583, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.16742663457989693, "step": 15964 }, { "epoch": 0.7257272727272728, "grad_norm": 4.6875, "grad_norm_var": 0.09143473307291666, "learning_rate": 0.0001, "loss": 5.1339, "loss/crossentropy": 2.2314231395721436, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1441533863544464, "step": 15966 }, { "epoch": 0.7258181818181818, "grad_norm": 5.25, "grad_norm_var": 0.10779622395833334, "learning_rate": 0.0001, "loss": 5.8322, "loss/crossentropy": 2.6781500577926636, "loss/hidden": 1.490234375, "loss/jsd": 0.0, "loss/logits": 0.16637765616178513, "step": 15968 }, { "epoch": 0.725909090909091, "grad_norm": 4.78125, "grad_norm_var": 0.09505208333333333, "learning_rate": 0.0001, "loss": 6.02, "loss/crossentropy": 2.8372421860694885, "loss/hidden": 1.470703125, "loss/jsd": 0.0, "loss/logits": 0.17120429128408432, "step": 15970 }, { "epoch": 0.726, "grad_norm": 4.8125, "grad_norm_var": 0.092041015625, "learning_rate": 0.0001, "loss": 5.7046, "loss/crossentropy": 2.6740498542785645, "loss/hidden": 1.482421875, "loss/jsd": 0.0, "loss/logits": 0.15480852872133255, "step": 15972 }, { "epoch": 0.7260909090909091, "grad_norm": 5.34375, "grad_norm_var": 0.11275634765625, "learning_rate": 0.0001, "loss": 5.4901, "loss/crossentropy": 2.488378405570984, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.1534878220409155, "step": 15974 }, { "epoch": 0.7261818181818182, "grad_norm": 4.625, "grad_norm_var": 0.084228515625, "learning_rate": 0.0001, "loss": 5.471, "loss/crossentropy": 2.4852702021598816, "loss/hidden": 1.447265625, "loss/jsd": 0.0, "loss/logits": 0.15385020896792412, "step": 15976 }, { "epoch": 0.7262727272727273, "grad_norm": 4.03125, "grad_norm_var": 0.118359375, "learning_rate": 0.0001, "loss": 5.2107, "loss/crossentropy": 2.3398948311805725, "loss/hidden": 1.423828125, "loss/jsd": 0.0, "loss/logits": 0.14469926804304123, "step": 15978 }, { "epoch": 0.7263636363636363, "grad_norm": 4.96875, "grad_norm_var": 0.11314697265625, "learning_rate": 0.0001, "loss": 5.2307, "loss/crossentropy": 2.334214210510254, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.14413756504654884, "step": 15980 }, { "epoch": 0.7264545454545455, "grad_norm": 4.375, "grad_norm_var": 0.10572916666666667, "learning_rate": 0.0001, "loss": 5.0453, "loss/crossentropy": 2.2194370925426483, "loss/hidden": 1.435546875, "loss/jsd": 0.0, "loss/logits": 0.13903527706861496, "step": 15982 }, { "epoch": 0.7265454545454545, "grad_norm": 5.03125, "grad_norm_var": 0.09581705729166666, "learning_rate": 0.0001, "loss": 5.4545, "loss/crossentropy": 2.3948992490768433, "loss/hidden": 1.501953125, "loss/jsd": 0.0, "loss/logits": 0.15576554089784622, "step": 15984 }, { "epoch": 0.7266363636363636, "grad_norm": 4.71875, "grad_norm_var": 0.09607747395833334, "learning_rate": 0.0001, "loss": 5.3589, "loss/crossentropy": 2.3849769234657288, "loss/hidden": 1.455078125, "loss/jsd": 0.0, "loss/logits": 0.15188242122530937, "step": 15986 }, { "epoch": 0.7267272727272728, "grad_norm": 4.625, "grad_norm_var": 0.10028889973958334, "learning_rate": 0.0001, "loss": 5.5144, "loss/crossentropy": 2.5085206627845764, "loss/hidden": 1.466796875, "loss/jsd": 0.0, "loss/logits": 0.153905738145113, "step": 15988 }, { "epoch": 0.7268181818181818, "grad_norm": 5.03125, "grad_norm_var": 0.08069254557291666, "learning_rate": 0.0001, "loss": 5.7295, "loss/crossentropy": 2.6296277046203613, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1631106324493885, "step": 15990 }, { "epoch": 0.726909090909091, "grad_norm": 4.8125, "grad_norm_var": 0.07916666666666666, "learning_rate": 0.0001, "loss": 5.5993, "loss/crossentropy": 2.540049135684967, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.1608063541352749, "step": 15992 }, { "epoch": 0.727, "grad_norm": 4.96875, "grad_norm_var": 0.04455973307291667, "learning_rate": 0.0001, "loss": 5.3664, "loss/crossentropy": 2.3609424829483032, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15250037610530853, "step": 15994 }, { "epoch": 0.7270909090909091, "grad_norm": 4.6875, "grad_norm_var": 0.039697265625, "learning_rate": 0.0001, "loss": 5.5015, "loss/crossentropy": 2.5104551911354065, "loss/hidden": 1.451171875, "loss/jsd": 0.0, "loss/logits": 0.15398623049259186, "step": 15996 }, { "epoch": 0.7271818181818182, "grad_norm": 4.8125, "grad_norm_var": 0.023661295572916668, "learning_rate": 0.0001, "loss": 5.3694, "loss/crossentropy": 2.383281111717224, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.148611418902874, "step": 15998 }, { "epoch": 0.7272727272727273, "grad_norm": 5.3125, "grad_norm_var": 0.03756510416666667, "learning_rate": 0.0001, "loss": 5.5348, "loss/crossentropy": 2.512751340866089, "loss/hidden": 1.478515625, "loss/jsd": 0.0, "loss/logits": 0.15434890612959862, "step": 16000 } ], "logging_steps": 2, "max_steps": 22000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.13164042715136e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }