diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,168027 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.875, + "eval_steps": 2000, + "global_step": 28000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 6.25e-05, + "grad_norm": 660.0, + "learning_rate": 2.1600000000000003e-05, + "loss": 89.3086, + "loss/crossentropy": 9.156324863433838, + "loss/hidden": 10.34375, + "loss/jsd": 0.0, + "loss/logits": 6.980849266052246, + "step": 2 + }, + { + "epoch": 0.000125, + "grad_norm": 724.0, + "learning_rate": 2.32e-05, + "loss": 92.6541, + "loss/crossentropy": 9.248075485229492, + "loss/hidden": 10.25, + "loss/jsd": 0.0, + "loss/logits": 7.315607070922852, + "step": 4 + }, + { + "epoch": 0.0001875, + "grad_norm": 652.0, + "learning_rate": 2.48e-05, + "loss": 90.2102, + "loss/crossentropy": 9.289902210235596, + "loss/hidden": 10.125, + "loss/jsd": 0.0, + "loss/logits": 7.079533338546753, + "step": 6 + }, + { + "epoch": 0.00025, + "grad_norm": 398.0, + "learning_rate": 2.64e-05, + "loss": 86.3137, + "loss/crossentropy": 8.973662853240967, + "loss/hidden": 10.15625, + "loss/jsd": 0.0, + "loss/logits": 6.718380928039551, + "step": 8 + }, + { + "epoch": 0.0003125, + "grad_norm": 246.0, + "learning_rate": 2.8000000000000003e-05, + "loss": 74.1221, + "loss/crossentropy": 7.941339015960693, + "loss/hidden": 9.875, + "loss/jsd": 0.0, + "loss/logits": 5.630578994750977, + "step": 10 + }, + { + "epoch": 0.000375, + "grad_norm": 149.0, + "learning_rate": 2.9600000000000005e-05, + "loss": 67.799, + "loss/crossentropy": 7.281719923019409, + "loss/hidden": 9.84375, + "loss/jsd": 0.0, + "loss/logits": 5.067355394363403, + "step": 12 + }, + { + "epoch": 0.0004375, + "grad_norm": 96.0, + "learning_rate": 3.1200000000000006e-05, + "loss": 59.7456, + "loss/crossentropy": 6.664980173110962, + "loss/hidden": 9.25, + "loss/jsd": 0.0, + "loss/logits": 4.383058547973633, + "step": 14 + }, + { + "epoch": 0.0005, + "grad_norm": 82.0, + "grad_norm_var": 65283.240625, + "learning_rate": 3.2800000000000004e-05, + "loss": 54.7434, + "loss/crossentropy": 6.230701446533203, + "loss/hidden": 8.96875, + "loss/jsd": 0.0, + "loss/logits": 3.954396367073059, + "step": 16 + }, + { + "epoch": 0.0005625, + "grad_norm": 82.5, + "grad_norm_var": 59929.1625, + "learning_rate": 3.4399999999999996e-05, + "loss": 49.011, + "loss/crossentropy": 5.828659772872925, + "loss/hidden": 8.75, + "loss/jsd": 0.0, + "loss/logits": 3.443238377571106, + "step": 18 + }, + { + "epoch": 0.000625, + "grad_norm": 58.0, + "grad_norm_var": 41848.00729166667, + "learning_rate": 3.600000000000001e-05, + "loss": 44.1889, + "loss/crossentropy": 5.225403785705566, + "loss/hidden": 8.21875, + "loss/jsd": 0.0, + "loss/logits": 3.074475884437561, + "step": 20 + }, + { + "epoch": 0.0006875, + "grad_norm": 61.5, + "grad_norm_var": 21039.383333333335, + "learning_rate": 3.76e-05, + "loss": 40.1213, + "loss/crossentropy": 5.027252197265625, + "loss/hidden": 7.984375, + "loss/jsd": 0.0, + "loss/logits": 2.710965871810913, + "step": 22 + }, + { + "epoch": 0.00075, + "grad_norm": 58.5, + "grad_norm_var": 9249.7625, + "learning_rate": 3.9200000000000004e-05, + "loss": 36.1313, + "loss/crossentropy": 4.690935373306274, + "loss/hidden": 7.578125, + "loss/jsd": 0.0, + "loss/logits": 2.386227607727051, + "step": 24 + }, + { + "epoch": 0.0008125, + "grad_norm": 48.5, + "grad_norm_var": 4270.605989583333, + "learning_rate": 4.08e-05, + "loss": 31.8483, + "loss/crossentropy": 4.4289772510528564, + "loss/hidden": 7.140625, + "loss/jsd": 0.0, + "loss/logits": 2.0278735160827637, + "step": 26 + }, + { + "epoch": 0.000875, + "grad_norm": 61.5, + "grad_norm_var": 340.2708333333333, + "learning_rate": 4.240000000000001e-05, + "loss": 32.7749, + "loss/crossentropy": 4.644165277481079, + "loss/hidden": 6.71875, + "loss/jsd": 0.0, + "loss/logits": 2.1411956548690796, + "step": 28 + }, + { + "epoch": 0.0009375, + "grad_norm": 193.0, + "grad_norm_var": 1174.7375, + "learning_rate": 4.4000000000000006e-05, + "loss": 28.865, + "loss/crossentropy": 4.189300537109375, + "loss/hidden": 6.546875, + "loss/jsd": 0.0, + "loss/logits": 1.81288480758667, + "step": 30 + }, + { + "epoch": 0.001, + "grad_norm": 32.25, + "grad_norm_var": 1289.6643229166666, + "learning_rate": 4.5600000000000004e-05, + "loss": 25.722, + "loss/crossentropy": 3.7905068397521973, + "loss/hidden": 6.078125, + "loss/jsd": 0.0, + "loss/logits": 1.5853379368782043, + "step": 32 + }, + { + "epoch": 0.0010625, + "grad_norm": 48.75, + "grad_norm_var": 1344.2759765625, + "learning_rate": 4.72e-05, + "loss": 24.645, + "loss/crossentropy": 3.972483277320862, + "loss/hidden": 5.890625, + "loss/jsd": 0.0, + "loss/logits": 1.4781858325004578, + "step": 34 + }, + { + "epoch": 0.001125, + "grad_norm": 28.5, + "grad_norm_var": 1459.4322265625, + "learning_rate": 4.88e-05, + "loss": 21.5933, + "loss/crossentropy": 3.3368273973464966, + "loss/hidden": 5.59375, + "loss/jsd": 0.0, + "loss/logits": 1.2662731409072876, + "step": 36 + }, + { + "epoch": 0.0011875, + "grad_norm": 23.75, + "grad_norm_var": 1564.7108723958333, + "learning_rate": 5.0400000000000005e-05, + "loss": 21.9488, + "loss/crossentropy": 3.6340177059173584, + "loss/hidden": 5.484375, + "loss/jsd": 0.0, + "loss/logits": 1.2830361127853394, + "step": 38 + }, + { + "epoch": 0.00125, + "grad_norm": 28.375, + "grad_norm_var": 1630.9760416666666, + "learning_rate": 5.2000000000000004e-05, + "loss": 21.1924, + "loss/crossentropy": 3.7127119302749634, + "loss/hidden": 5.25, + "loss/jsd": 0.0, + "loss/logits": 1.2229673862457275, + "step": 40 + }, + { + "epoch": 0.0013125, + "grad_norm": 27.875, + "grad_norm_var": 1695.2822916666667, + "learning_rate": 5.360000000000001e-05, + "loss": 19.8704, + "loss/crossentropy": 3.402773141860962, + "loss/hidden": 5.203125, + "loss/jsd": 0.0, + "loss/logits": 1.1264490485191345, + "step": 42 + }, + { + "epoch": 0.001375, + "grad_norm": 35.5, + "grad_norm_var": 1718.1832682291667, + "learning_rate": 5.520000000000001e-05, + "loss": 18.1969, + "loss/crossentropy": 3.2803523540496826, + "loss/hidden": 4.9375, + "loss/jsd": 0.0, + "loss/logits": 0.9979034960269928, + "step": 44 + }, + { + "epoch": 0.0014375, + "grad_norm": 23.5, + "grad_norm_var": 72.73639322916667, + "learning_rate": 5.680000000000001e-05, + "loss": 20.0092, + "loss/crossentropy": 3.6576796770095825, + "loss/hidden": 4.984375, + "loss/jsd": 0.0, + "loss/logits": 1.136712908744812, + "step": 46 + }, + { + "epoch": 0.0015, + "grad_norm": 18.125, + "grad_norm_var": 58.280208333333334, + "learning_rate": 5.840000000000001e-05, + "loss": 17.1218, + "loss/crossentropy": 3.304303526878357, + "loss/hidden": 4.6875, + "loss/jsd": 0.0, + "loss/logits": 0.9130024909973145, + "step": 48 + }, + { + "epoch": 0.0015625, + "grad_norm": 26.75, + "grad_norm_var": 63.002018229166666, + "learning_rate": 6.0000000000000015e-05, + "loss": 17.7318, + "loss/crossentropy": 3.2923504114151, + "loss/hidden": 4.78125, + "loss/jsd": 0.0, + "loss/logits": 0.9658186733722687, + "step": 50 + }, + { + "epoch": 0.001625, + "grad_norm": 35.0, + "grad_norm_var": 67.05670572916667, + "learning_rate": 6.16e-05, + "loss": 16.8527, + "loss/crossentropy": 3.1591343879699707, + "loss/hidden": 4.625, + "loss/jsd": 0.0, + "loss/logits": 0.9068574905395508, + "step": 52 + }, + { + "epoch": 0.0016875, + "grad_norm": 16.25, + "grad_norm_var": 78.15983072916667, + "learning_rate": 6.320000000000002e-05, + "loss": 16.2787, + "loss/crossentropy": 3.1270612478256226, + "loss/hidden": 4.453125, + "loss/jsd": 0.0, + "loss/logits": 0.8698541224002838, + "step": 54 + }, + { + "epoch": 0.00175, + "grad_norm": 17.75, + "grad_norm_var": 86.74993489583333, + "learning_rate": 6.480000000000002e-05, + "loss": 16.5175, + "loss/crossentropy": 3.2500909566879272, + "loss/hidden": 4.46875, + "loss/jsd": 0.0, + "loss/logits": 0.8798635005950928, + "step": 56 + }, + { + "epoch": 0.0018125, + "grad_norm": 23.0, + "grad_norm_var": 93.07473958333334, + "learning_rate": 6.64e-05, + "loss": 16.7407, + "loss/crossentropy": 3.3384851217269897, + "loss/hidden": 4.359375, + "loss/jsd": 0.0, + "loss/logits": 0.9042791426181793, + "step": 58 + }, + { + "epoch": 0.001875, + "grad_norm": 19.0, + "grad_norm_var": 88.33587239583333, + "learning_rate": 6.8e-05, + "loss": 15.3873, + "loss/crossentropy": 2.9939886331558228, + "loss/hidden": 4.234375, + "loss/jsd": 0.0, + "loss/logits": 0.8158909380435944, + "step": 60 + }, + { + "epoch": 0.0019375, + "grad_norm": 17.0, + "grad_norm_var": 80.16223958333333, + "learning_rate": 6.96e-05, + "loss": 16.1478, + "loss/crossentropy": 3.2266587018966675, + "loss/hidden": 4.296875, + "loss/jsd": 0.0, + "loss/logits": 0.8624304533004761, + "step": 62 + }, + { + "epoch": 0.002, + "grad_norm": 16.5, + "grad_norm_var": 79.83515625, + "learning_rate": 7.12e-05, + "loss": 13.9403, + "loss/crossentropy": 2.7177222967147827, + "loss/hidden": 4.140625, + "loss/jsd": 0.0, + "loss/logits": 0.7081980109214783, + "step": 64 + }, + { + "epoch": 0.0020625, + "grad_norm": 17.875, + "grad_norm_var": 22.685791015625, + "learning_rate": 7.280000000000001e-05, + "loss": 14.943, + "loss/crossentropy": 3.1492207050323486, + "loss/hidden": 4.125, + "loss/jsd": 0.0, + "loss/logits": 0.7668733894824982, + "step": 66 + }, + { + "epoch": 0.002125, + "grad_norm": 13.875, + "grad_norm_var": 5.928369140625, + "learning_rate": 7.44e-05, + "loss": 14.6872, + "loss/crossentropy": 3.163086175918579, + "loss/hidden": 3.9375, + "loss/jsd": 0.0, + "loss/logits": 0.758662760257721, + "step": 68 + }, + { + "epoch": 0.0021875, + "grad_norm": 20.0, + "grad_norm_var": 7.180712890625, + "learning_rate": 7.6e-05, + "loss": 15.1751, + "loss/crossentropy": 3.189119815826416, + "loss/hidden": 4.015625, + "loss/jsd": 0.0, + "loss/logits": 0.7970321774482727, + "step": 70 + }, + { + "epoch": 0.00225, + "grad_norm": 18.25, + "grad_norm_var": 7.503059895833333, + "learning_rate": 7.76e-05, + "loss": 14.9421, + "loss/crossentropy": 3.257962226867676, + "loss/hidden": 3.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.777004063129425, + "step": 72 + }, + { + "epoch": 0.0023125, + "grad_norm": 18.5, + "grad_norm_var": 6.022395833333333, + "learning_rate": 7.920000000000001e-05, + "loss": 15.0205, + "loss/crossentropy": 3.289088249206543, + "loss/hidden": 3.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.7801713049411774, + "step": 74 + }, + { + "epoch": 0.002375, + "grad_norm": 17.25, + "grad_norm_var": 5.628059895833333, + "learning_rate": 8.080000000000001e-05, + "loss": 14.6665, + "loss/crossentropy": 3.132324695587158, + "loss/hidden": 3.875, + "loss/jsd": 0.0, + "loss/logits": 0.7659187614917755, + "step": 76 + }, + { + "epoch": 0.0024375, + "grad_norm": 14.125, + "grad_norm_var": 6.317643229166666, + "learning_rate": 8.240000000000001e-05, + "loss": 14.5497, + "loss/crossentropy": 3.1414411067962646, + "loss/hidden": 3.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.7525473237037659, + "step": 78 + }, + { + "epoch": 0.0025, + "grad_norm": 22.5, + "grad_norm_var": 8.394205729166666, + "learning_rate": 8.400000000000001e-05, + "loss": 14.471, + "loss/crossentropy": 3.19692599773407, + "loss/hidden": 3.796875, + "loss/jsd": 0.0, + "loss/logits": 0.7477201521396637, + "step": 80 + }, + { + "epoch": 0.0025625, + "grad_norm": 16.5, + "grad_norm_var": 8.338802083333333, + "learning_rate": 8.560000000000001e-05, + "loss": 13.6071, + "loss/crossentropy": 2.9412546157836914, + "loss/hidden": 3.78125, + "loss/jsd": 0.0, + "loss/logits": 0.6884627342224121, + "step": 82 + }, + { + "epoch": 0.002625, + "grad_norm": 21.875, + "grad_norm_var": 8.0587890625, + "learning_rate": 8.720000000000002e-05, + "loss": 13.4559, + "loss/crossentropy": 3.074158549308777, + "loss/hidden": 3.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.6702075600624084, + "step": 84 + }, + { + "epoch": 0.0026875, + "grad_norm": 11.875, + "grad_norm_var": 8.190104166666666, + "learning_rate": 8.880000000000002e-05, + "loss": 13.1482, + "loss/crossentropy": 2.9633986949920654, + "loss/hidden": 3.734375, + "loss/jsd": 0.0, + "loss/logits": 0.6450382769107819, + "step": 86 + }, + { + "epoch": 0.00275, + "grad_norm": 13.3125, + "grad_norm_var": 8.740885416666666, + "learning_rate": 9.040000000000002e-05, + "loss": 13.2811, + "loss/crossentropy": 3.0899221897125244, + "loss/hidden": 3.59375, + "loss/jsd": 0.0, + "loss/logits": 0.6597437262535095, + "step": 88 + }, + { + "epoch": 0.0028125, + "grad_norm": 14.5, + "grad_norm_var": 9.731770833333334, + "learning_rate": 9.200000000000001e-05, + "loss": 12.5622, + "loss/crossentropy": 2.6711933612823486, + "loss/hidden": 3.640625, + "loss/jsd": 0.0, + "loss/logits": 0.6250402927398682, + "step": 90 + }, + { + "epoch": 0.002875, + "grad_norm": 22.125, + "grad_norm_var": 11.3453125, + "learning_rate": 9.360000000000003e-05, + "loss": 12.9117, + "loss/crossentropy": 2.9529306888580322, + "loss/hidden": 3.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.6279078125953674, + "step": 92 + }, + { + "epoch": 0.0029375, + "grad_norm": 15.75, + "grad_norm_var": 11.020247395833334, + "learning_rate": 9.52e-05, + "loss": 13.1727, + "loss/crossentropy": 3.0362765789031982, + "loss/hidden": 3.578125, + "loss/jsd": 0.0, + "loss/logits": 0.655828982591629, + "step": 94 + }, + { + "epoch": 0.003, + "grad_norm": 14.8125, + "grad_norm_var": 9.081103515625, + "learning_rate": 9.680000000000001e-05, + "loss": 13.2317, + "loss/crossentropy": 3.050165295600891, + "loss/hidden": 3.65625, + "loss/jsd": 0.0, + "loss/logits": 0.6525256931781769, + "step": 96 + }, + { + "epoch": 0.0030625, + "grad_norm": 16.125, + "grad_norm_var": 10.2056640625, + "learning_rate": 9.84e-05, + "loss": 12.9596, + "loss/crossentropy": 3.007634401321411, + "loss/hidden": 3.515625, + "loss/jsd": 0.0, + "loss/logits": 0.6436329782009125, + "step": 98 + }, + { + "epoch": 0.003125, + "grad_norm": 12.6875, + "grad_norm_var": 8.851546223958334, + "learning_rate": 0.0001, + "loss": 12.526, + "loss/crossentropy": 3.0878303050994873, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.5985024273395538, + "step": 100 + }, + { + "epoch": 0.0031875, + "grad_norm": 14.875, + "grad_norm_var": 8.074934895833334, + "learning_rate": 0.0001, + "loss": 12.5772, + "loss/crossentropy": 2.8073278665542603, + "loss/hidden": 3.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.6262076199054718, + "step": 102 + }, + { + "epoch": 0.00325, + "grad_norm": 12.8125, + "grad_norm_var": 8.747639973958334, + "learning_rate": 0.0001, + "loss": 12.8581, + "loss/crossentropy": 2.9922837018966675, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.6381443738937378, + "step": 104 + }, + { + "epoch": 0.0033125, + "grad_norm": 13.25, + "grad_norm_var": 8.709358723958333, + "learning_rate": 0.0001, + "loss": 12.9596, + "loss/crossentropy": 2.9500906467437744, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.6517307162284851, + "step": 106 + }, + { + "epoch": 0.003375, + "grad_norm": 12.25, + "grad_norm_var": 4.660660807291666, + "learning_rate": 0.0001, + "loss": 11.768, + "loss/crossentropy": 2.8668397665023804, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.5580830276012421, + "step": 108 + }, + { + "epoch": 0.0034375, + "grad_norm": 16.625, + "grad_norm_var": 4.357666015625, + "learning_rate": 0.0001, + "loss": 12.1518, + "loss/crossentropy": 2.9460701942443848, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.5986941456794739, + "step": 110 + }, + { + "epoch": 0.0035, + "grad_norm": 13.5625, + "grad_norm_var": 2.1166015625, + "learning_rate": 0.0001, + "loss": 11.8625, + "loss/crossentropy": 2.9012417793273926, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.5570653975009918, + "step": 112 + }, + { + "epoch": 0.0035625, + "grad_norm": 11.875, + "grad_norm_var": 1.6338541666666666, + "learning_rate": 0.0001, + "loss": 11.8177, + "loss/crossentropy": 2.9161791801452637, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.5620248317718506, + "step": 114 + }, + { + "epoch": 0.003625, + "grad_norm": 15.5, + "grad_norm_var": 2.4508951822916667, + "learning_rate": 0.0001, + "loss": 12.2833, + "loss/crossentropy": 3.0185381174087524, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.596792608499527, + "step": 116 + }, + { + "epoch": 0.0036875, + "grad_norm": 13.0625, + "grad_norm_var": 2.618603515625, + "learning_rate": 0.0001, + "loss": 12.3115, + "loss/crossentropy": 3.1687170267105103, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.5845893919467926, + "step": 118 + }, + { + "epoch": 0.00375, + "grad_norm": 13.8125, + "grad_norm_var": 2.6577962239583335, + "learning_rate": 0.0001, + "loss": 12.3601, + "loss/crossentropy": 3.0240886211395264, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.6023522615432739, + "step": 120 + }, + { + "epoch": 0.0038125, + "grad_norm": 10.9375, + "grad_norm_var": 3.499462890625, + "learning_rate": 0.0001, + "loss": 10.8604, + "loss/crossentropy": 2.6125407218933105, + "loss/hidden": 3.171875, + "loss/jsd": 0.0, + "loss/logits": 0.5075993537902832, + "step": 122 + }, + { + "epoch": 0.003875, + "grad_norm": 12.0, + "grad_norm_var": 3.5036295572916667, + "learning_rate": 0.0001, + "loss": 11.8795, + "loss/crossentropy": 2.8815178871154785, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.581826388835907, + "step": 124 + }, + { + "epoch": 0.0039375, + "grad_norm": 13.375, + "grad_norm_var": 2.6025390625, + "learning_rate": 0.0001, + "loss": 11.8437, + "loss/crossentropy": 2.966344118118286, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.5650805234909058, + "step": 126 + }, + { + "epoch": 0.004, + "grad_norm": 10.9375, + "grad_norm_var": 2.746875, + "learning_rate": 0.0001, + "loss": 11.7435, + "loss/crossentropy": 2.978629946708679, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.5546101331710815, + "step": 128 + }, + { + "epoch": 0.0040625, + "grad_norm": 9.875, + "grad_norm_var": 3.224723307291667, + "learning_rate": 0.0001, + "loss": 11.4013, + "loss/crossentropy": 2.8617138862609863, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.5406723916530609, + "step": 130 + }, + { + "epoch": 0.004125, + "grad_norm": 11.125, + "grad_norm_var": 2.3733723958333335, + "learning_rate": 0.0001, + "loss": 11.7582, + "loss/crossentropy": 2.960026741027832, + "loss/hidden": 3.203125, + "loss/jsd": 0.0, + "loss/logits": 0.5595053732395172, + "step": 132 + }, + { + "epoch": 0.0041875, + "grad_norm": 11.125, + "grad_norm_var": 1.3651041666666666, + "learning_rate": 0.0001, + "loss": 11.0347, + "loss/crossentropy": 2.877363085746765, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.5071391761302948, + "step": 134 + }, + { + "epoch": 0.00425, + "grad_norm": 12.8125, + "grad_norm_var": 1.1153483072916666, + "learning_rate": 0.0001, + "loss": 11.4875, + "loss/crossentropy": 2.904009461402893, + "loss/hidden": 3.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.5450641810894012, + "step": 136 + }, + { + "epoch": 0.0043125, + "grad_norm": 10.6875, + "grad_norm_var": 1.0735514322916666, + "learning_rate": 0.0001, + "loss": 10.8593, + "loss/crossentropy": 2.7577372789382935, + "loss/hidden": 3.09375, + "loss/jsd": 0.0, + "loss/logits": 0.5007785558700562, + "step": 138 + }, + { + "epoch": 0.004375, + "grad_norm": 11.875, + "grad_norm_var": 1.42734375, + "learning_rate": 0.0001, + "loss": 12.4117, + "loss/crossentropy": 3.1381568908691406, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.6015740633010864, + "step": 140 + }, + { + "epoch": 0.0044375, + "grad_norm": 12.8125, + "grad_norm_var": 1.5671875, + "learning_rate": 0.0001, + "loss": 11.4659, + "loss/crossentropy": 3.0004279613494873, + "loss/hidden": 3.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.537956103682518, + "step": 142 + }, + { + "epoch": 0.0045, + "grad_norm": 11.5, + "grad_norm_var": 1.3536295572916666, + "learning_rate": 0.0001, + "loss": 10.9772, + "loss/crossentropy": 2.6849220991134644, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.5253205299377441, + "step": 144 + }, + { + "epoch": 0.0045625, + "grad_norm": 13.0, + "grad_norm_var": 1.42890625, + "learning_rate": 0.0001, + "loss": 10.991, + "loss/crossentropy": 2.929854989051819, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.501423716545105, + "step": 146 + }, + { + "epoch": 0.004625, + "grad_norm": 8.75, + "grad_norm_var": 2.023697916666667, + "learning_rate": 0.0001, + "loss": 10.8928, + "loss/crossentropy": 2.955693483352661, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.4944879859685898, + "step": 148 + }, + { + "epoch": 0.0046875, + "grad_norm": 14.625, + "grad_norm_var": 2.6907389322916666, + "learning_rate": 0.0001, + "loss": 10.9484, + "loss/crossentropy": 2.745115876197815, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.5187694430351257, + "step": 150 + }, + { + "epoch": 0.00475, + "grad_norm": 9.4375, + "grad_norm_var": 3.390625, + "learning_rate": 0.0001, + "loss": 11.0682, + "loss/crossentropy": 2.793348789215088, + "loss/hidden": 3.046875, + "loss/jsd": 0.0, + "loss/logits": 0.5228001177310944, + "step": 152 + }, + { + "epoch": 0.0048125, + "grad_norm": 9.0625, + "grad_norm_var": 3.870556640625, + "learning_rate": 0.0001, + "loss": 9.9671, + "loss/crossentropy": 2.544915556907654, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.45315225422382355, + "step": 154 + }, + { + "epoch": 0.004875, + "grad_norm": 9.75, + "grad_norm_var": 3.581103515625, + "learning_rate": 0.0001, + "loss": 10.2548, + "loss/crossentropy": 2.7735856771469116, + "loss/hidden": 2.984375, + "loss/jsd": 0.0, + "loss/logits": 0.44968172907829285, + "step": 156 + }, + { + "epoch": 0.0049375, + "grad_norm": 9.5, + "grad_norm_var": 3.4734212239583333, + "learning_rate": 0.0001, + "loss": 10.6978, + "loss/crossentropy": 2.61746346950531, + "loss/hidden": 3.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.5010055005550385, + "step": 158 + }, + { + "epoch": 0.005, + "grad_norm": 12.375, + "grad_norm_var": 3.4423014322916665, + "learning_rate": 0.0001, + "loss": 11.1114, + "loss/crossentropy": 3.0405282974243164, + "loss/hidden": 3.015625, + "loss/jsd": 0.0, + "loss/logits": 0.5055254101753235, + "step": 160 + }, + { + "epoch": 0.0050625, + "grad_norm": 13.5, + "grad_norm_var": 3.6130045572916667, + "learning_rate": 0.0001, + "loss": 11.3564, + "loss/crossentropy": 2.9557673931121826, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.5361583232879639, + "step": 162 + }, + { + "epoch": 0.005125, + "grad_norm": 9.6875, + "grad_norm_var": 3.3590983072916667, + "learning_rate": 0.0001, + "loss": 10.8173, + "loss/crossentropy": 2.9696191549301147, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.49414533376693726, + "step": 164 + }, + { + "epoch": 0.0051875, + "grad_norm": 10.9375, + "grad_norm_var": 2.4331868489583335, + "learning_rate": 0.0001, + "loss": 10.7629, + "loss/crossentropy": 2.870342493057251, + "loss/hidden": 2.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.4900369644165039, + "step": 166 + }, + { + "epoch": 0.00525, + "grad_norm": 11.9375, + "grad_norm_var": 1.7266764322916666, + "learning_rate": 0.0001, + "loss": 11.3727, + "loss/crossentropy": 2.9022059440612793, + "loss/hidden": 3.03125, + "loss/jsd": 0.0, + "loss/logits": 0.5439256131649017, + "step": 168 + }, + { + "epoch": 0.0053125, + "grad_norm": 13.6875, + "grad_norm_var": 1.939697265625, + "learning_rate": 0.0001, + "loss": 11.2794, + "loss/crossentropy": 2.9904470443725586, + "loss/hidden": 2.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.5312376618385315, + "step": 170 + }, + { + "epoch": 0.005375, + "grad_norm": 11.875, + "grad_norm_var": 1.6367024739583333, + "learning_rate": 0.0001, + "loss": 11.0391, + "loss/crossentropy": 2.9553390741348267, + "loss/hidden": 3.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.5044730305671692, + "step": 172 + }, + { + "epoch": 0.0054375, + "grad_norm": 8.5, + "grad_norm_var": 2.192822265625, + "learning_rate": 0.0001, + "loss": 10.5749, + "loss/crossentropy": 2.886873483657837, + "loss/hidden": 2.921875, + "loss/jsd": 0.0, + "loss/logits": 0.4766187369823456, + "step": 174 + }, + { + "epoch": 0.0055, + "grad_norm": 10.625, + "grad_norm_var": 2.0989420572916666, + "learning_rate": 0.0001, + "loss": 10.7418, + "loss/crossentropy": 2.8771119117736816, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.5020954459905624, + "step": 176 + }, + { + "epoch": 0.0055625, + "grad_norm": 9.5, + "grad_norm_var": 1.9636555989583333, + "learning_rate": 0.0001, + "loss": 10.3281, + "loss/crossentropy": 2.769477367401123, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.4746148884296417, + "step": 178 + }, + { + "epoch": 0.005625, + "grad_norm": 9.4375, + "grad_norm_var": 2.148177083333333, + "learning_rate": 0.0001, + "loss": 10.6448, + "loss/crossentropy": 2.889192581176758, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.4950959086418152, + "step": 180 + }, + { + "epoch": 0.0056875, + "grad_norm": 9.3125, + "grad_norm_var": 2.2296712239583334, + "learning_rate": 0.0001, + "loss": 10.4231, + "loss/crossentropy": 2.9166088104248047, + "loss/hidden": 2.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.4686211049556732, + "step": 182 + }, + { + "epoch": 0.00575, + "grad_norm": 10.25, + "grad_norm_var": 1.7809895833333333, + "learning_rate": 0.0001, + "loss": 10.4394, + "loss/crossentropy": 2.962415933609009, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.4633220136165619, + "step": 184 + }, + { + "epoch": 0.0058125, + "grad_norm": 9.625, + "grad_norm_var": 0.896728515625, + "learning_rate": 0.0001, + "loss": 10.7424, + "loss/crossentropy": 2.9772469997406006, + "loss/hidden": 2.90625, + "loss/jsd": 0.0, + "loss/logits": 0.48589444160461426, + "step": 186 + }, + { + "epoch": 0.005875, + "grad_norm": 11.25, + "grad_norm_var": 0.6596354166666667, + "learning_rate": 0.0001, + "loss": 11.4098, + "loss/crossentropy": 3.0638362169265747, + "loss/hidden": 2.96875, + "loss/jsd": 0.0, + "loss/logits": 0.5377195775508881, + "step": 188 + }, + { + "epoch": 0.0059375, + "grad_norm": 11.125, + "grad_norm_var": 0.6338541666666667, + "learning_rate": 0.0001, + "loss": 10.2039, + "loss/crossentropy": 2.698214054107666, + "loss/hidden": 2.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.46385255455970764, + "step": 190 + }, + { + "epoch": 0.006, + "grad_norm": 13.4375, + "grad_norm_var": 1.5541015625, + "learning_rate": 0.0001, + "loss": 10.3437, + "loss/crossentropy": 2.7819976806640625, + "loss/hidden": 2.84375, + "loss/jsd": 0.0, + "loss/logits": 0.47179484367370605, + "step": 192 + }, + { + "epoch": 0.0060625, + "grad_norm": 9.625, + "grad_norm_var": 1.4389973958333333, + "learning_rate": 0.0001, + "loss": 10.434, + "loss/crossentropy": 2.776822566986084, + "loss/hidden": 2.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.48525065183639526, + "step": 194 + }, + { + "epoch": 0.006125, + "grad_norm": 9.9375, + "grad_norm_var": 1.3817057291666666, + "learning_rate": 0.0001, + "loss": 10.5311, + "loss/crossentropy": 2.9022562503814697, + "loss/hidden": 2.890625, + "loss/jsd": 0.0, + "loss/logits": 0.473824679851532, + "step": 196 + }, + { + "epoch": 0.0061875, + "grad_norm": 9.0625, + "grad_norm_var": 1.4453125, + "learning_rate": 0.0001, + "loss": 10.6488, + "loss/crossentropy": 3.086912989616394, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.4749341607093811, + "step": 198 + }, + { + "epoch": 0.00625, + "grad_norm": 9.9375, + "grad_norm_var": 1.6442057291666667, + "learning_rate": 0.0001, + "loss": 10.5391, + "loss/crossentropy": 2.9904398918151855, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.4751739054918289, + "step": 200 + }, + { + "epoch": 0.0063125, + "grad_norm": 9.3125, + "grad_norm_var": 1.6425618489583333, + "learning_rate": 0.0001, + "loss": 10.7217, + "loss/crossentropy": 3.009773015975952, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.48994509875774384, + "step": 202 + }, + { + "epoch": 0.006375, + "grad_norm": 9.1875, + "grad_norm_var": 1.433447265625, + "learning_rate": 0.0001, + "loss": 10.2394, + "loss/crossentropy": 2.7742444276809692, + "loss/hidden": 2.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.46136191487312317, + "step": 204 + }, + { + "epoch": 0.0064375, + "grad_norm": 8.5625, + "grad_norm_var": 1.3983723958333334, + "learning_rate": 0.0001, + "loss": 10.1427, + "loss/crossentropy": 2.848304867744446, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.45365576446056366, + "step": 206 + }, + { + "epoch": 0.0065, + "grad_norm": 8.4375, + "grad_norm_var": 0.33151041666666664, + "learning_rate": 0.0001, + "loss": 10.3211, + "loss/crossentropy": 2.9591645002365112, + "loss/hidden": 2.765625, + "loss/jsd": 0.0, + "loss/logits": 0.4596277326345444, + "step": 208 + }, + { + "epoch": 0.0065625, + "grad_norm": 8.8125, + "grad_norm_var": 0.5629557291666667, + "learning_rate": 0.0001, + "loss": 10.0751, + "loss/crossentropy": 2.704858183860779, + "loss/hidden": 2.734375, + "loss/jsd": 0.0, + "loss/logits": 0.4635818302631378, + "step": 210 + }, + { + "epoch": 0.006625, + "grad_norm": 9.5, + "grad_norm_var": 0.5410807291666667, + "learning_rate": 0.0001, + "loss": 10.6019, + "loss/crossentropy": 2.992745518684387, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.48356935381889343, + "step": 212 + }, + { + "epoch": 0.0066875, + "grad_norm": 9.625, + "grad_norm_var": 0.6176920572916667, + "learning_rate": 0.0001, + "loss": 10.0108, + "loss/crossentropy": 2.840528726577759, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.4396800994873047, + "step": 214 + }, + { + "epoch": 0.00675, + "grad_norm": 10.4375, + "grad_norm_var": 0.605322265625, + "learning_rate": 0.0001, + "loss": 9.8039, + "loss/crossentropy": 2.64647901058197, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.43996211886405945, + "step": 216 + }, + { + "epoch": 0.0068125, + "grad_norm": 8.1875, + "grad_norm_var": 0.8450520833333334, + "learning_rate": 0.0001, + "loss": 10.1335, + "loss/crossentropy": 2.840444326400757, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.4503984898328781, + "step": 218 + }, + { + "epoch": 0.006875, + "grad_norm": 10.3125, + "grad_norm_var": 0.9781087239583334, + "learning_rate": 0.0001, + "loss": 9.9466, + "loss/crossentropy": 2.655266284942627, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.4494474083185196, + "step": 220 + }, + { + "epoch": 0.0069375, + "grad_norm": 10.9375, + "grad_norm_var": 1.1989583333333333, + "learning_rate": 0.0001, + "loss": 10.1344, + "loss/crossentropy": 2.819695830345154, + "loss/hidden": 2.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.45725369453430176, + "step": 222 + }, + { + "epoch": 0.007, + "grad_norm": 8.9375, + "grad_norm_var": 1.1447916666666667, + "learning_rate": 0.0001, + "loss": 10.2209, + "loss/crossentropy": 2.9145302772521973, + "loss/hidden": 2.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.4532930552959442, + "step": 224 + }, + { + "epoch": 0.0070625, + "grad_norm": 8.4375, + "grad_norm_var": 0.9332682291666666, + "learning_rate": 0.0001, + "loss": 10.0121, + "loss/crossentropy": 2.8794121742248535, + "loss/hidden": 2.75, + "loss/jsd": 0.0, + "loss/logits": 0.4382711499929428, + "step": 226 + }, + { + "epoch": 0.007125, + "grad_norm": 10.125, + "grad_norm_var": 1.0872395833333333, + "learning_rate": 0.0001, + "loss": 10.397, + "loss/crossentropy": 2.894919991493225, + "loss/hidden": 2.734375, + "loss/jsd": 0.0, + "loss/logits": 0.4767727255821228, + "step": 228 + }, + { + "epoch": 0.0071875, + "grad_norm": 8.9375, + "grad_norm_var": 1.021728515625, + "learning_rate": 0.0001, + "loss": 10.1881, + "loss/crossentropy": 2.923813581466675, + "loss/hidden": 2.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.4506445974111557, + "step": 230 + }, + { + "epoch": 0.00725, + "grad_norm": 9.4375, + "grad_norm_var": 0.9921875, + "learning_rate": 0.0001, + "loss": 10.1729, + "loss/crossentropy": 2.806499481201172, + "loss/hidden": 2.8125, + "loss/jsd": 0.0, + "loss/logits": 0.4553864002227783, + "step": 232 + }, + { + "epoch": 0.0073125, + "grad_norm": 8.75, + "grad_norm_var": 0.771728515625, + "learning_rate": 0.0001, + "loss": 10.0788, + "loss/crossentropy": 2.8656084537506104, + "loss/hidden": 2.6875, + "loss/jsd": 0.0, + "loss/logits": 0.4525725245475769, + "step": 234 + }, + { + "epoch": 0.007375, + "grad_norm": 9.875, + "grad_norm_var": 0.6760416666666667, + "learning_rate": 0.0001, + "loss": 10.0307, + "loss/crossentropy": 2.7157695293426514, + "loss/hidden": 2.796875, + "loss/jsd": 0.0, + "loss/logits": 0.45180511474609375, + "step": 236 + }, + { + "epoch": 0.0074375, + "grad_norm": 8.8125, + "grad_norm_var": 0.3675618489583333, + "learning_rate": 0.0001, + "loss": 10.218, + "loss/crossentropy": 2.8563307523727417, + "loss/hidden": 2.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.46350690722465515, + "step": 238 + }, + { + "epoch": 0.0075, + "grad_norm": 8.5, + "grad_norm_var": 0.38795572916666665, + "learning_rate": 0.0001, + "loss": 10.0956, + "loss/crossentropy": 2.801733136177063, + "loss/hidden": 2.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.45985159277915955, + "step": 240 + }, + { + "epoch": 0.0075625, + "grad_norm": 9.25, + "grad_norm_var": 0.372509765625, + "learning_rate": 0.0001, + "loss": 10.3281, + "loss/crossentropy": 3.0221978425979614, + "loss/hidden": 2.6875, + "loss/jsd": 0.0, + "loss/logits": 0.46184317767620087, + "step": 242 + }, + { + "epoch": 0.007625, + "grad_norm": 8.5, + "grad_norm_var": 0.2699055989583333, + "learning_rate": 0.0001, + "loss": 10.069, + "loss/crossentropy": 2.868880271911621, + "loss/hidden": 2.6875, + "loss/jsd": 0.0, + "loss/logits": 0.4512626975774765, + "step": 244 + }, + { + "epoch": 0.0076875, + "grad_norm": 9.125, + "grad_norm_var": 0.37405192057291664, + "learning_rate": 0.0001, + "loss": 10.1415, + "loss/crossentropy": 3.017348051071167, + "loss/hidden": 2.6875, + "loss/jsd": 0.0, + "loss/logits": 0.4436669647693634, + "step": 246 + }, + { + "epoch": 0.00775, + "grad_norm": 7.5625, + "grad_norm_var": 0.4295857747395833, + "learning_rate": 0.0001, + "loss": 9.7982, + "loss/crossentropy": 2.8351858854293823, + "loss/hidden": 2.640625, + "loss/jsd": 0.0, + "loss/logits": 0.432241827249527, + "step": 248 + }, + { + "epoch": 0.0078125, + "grad_norm": 8.75, + "grad_norm_var": 0.4040323893229167, + "learning_rate": 0.0001, + "loss": 10.0516, + "loss/crossentropy": 2.9942902326583862, + "loss/hidden": 2.625, + "loss/jsd": 0.0, + "loss/logits": 0.44322872161865234, + "step": 250 + }, + { + "epoch": 0.007875, + "grad_norm": 7.0625, + "grad_norm_var": 0.5571451822916667, + "learning_rate": 0.0001, + "loss": 9.6001, + "loss/crossentropy": 2.8024221658706665, + "loss/hidden": 2.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.42117369174957275, + "step": 252 + }, + { + "epoch": 0.0079375, + "grad_norm": 7.53125, + "grad_norm_var": 0.6127237955729167, + "learning_rate": 0.0001, + "loss": 9.4012, + "loss/crossentropy": 2.659646511077881, + "loss/hidden": 2.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.4124371409416199, + "step": 254 + }, + { + "epoch": 0.008, + "grad_norm": 8.5, + "grad_norm_var": 4.341044108072917, + "learning_rate": 0.0001, + "loss": 9.9692, + "loss/crossentropy": 2.7675873041152954, + "loss/hidden": 2.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.43657153844833374, + "step": 256 + }, + { + "epoch": 0.0080625, + "grad_norm": 7.375, + "grad_norm_var": 4.434098307291666, + "learning_rate": 0.0001, + "loss": 9.6241, + "loss/crossentropy": 2.752623677253723, + "loss/hidden": 2.65625, + "loss/jsd": 0.0, + "loss/logits": 0.421526238322258, + "step": 258 + }, + { + "epoch": 0.008125, + "grad_norm": 9.75, + "grad_norm_var": 4.814322916666667, + "learning_rate": 0.0001, + "loss": 10.5617, + "loss/crossentropy": 2.9343960285186768, + "loss/hidden": 2.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.4900728166103363, + "step": 260 + }, + { + "epoch": 0.0081875, + "grad_norm": 8.5, + "grad_norm_var": 4.757255045572917, + "learning_rate": 0.0001, + "loss": 9.813, + "loss/crossentropy": 2.8956027030944824, + "loss/hidden": 2.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.43002206087112427, + "step": 262 + }, + { + "epoch": 0.00825, + "grad_norm": 6.90625, + "grad_norm_var": 4.924479166666667, + "learning_rate": 0.0001, + "loss": 9.1786, + "loss/crossentropy": 2.5651010274887085, + "loss/hidden": 2.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.39650242030620575, + "step": 264 + }, + { + "epoch": 0.0083125, + "grad_norm": 10.125, + "grad_norm_var": 5.049674479166667, + "learning_rate": 0.0001, + "loss": 10.1403, + "loss/crossentropy": 2.9591987133026123, + "loss/hidden": 2.734375, + "loss/jsd": 0.0, + "loss/logits": 0.4446714520454407, + "step": 266 + }, + { + "epoch": 0.008375, + "grad_norm": 10.4375, + "grad_norm_var": 4.84810791015625, + "learning_rate": 0.0001, + "loss": 10.0619, + "loss/crossentropy": 2.9117506742477417, + "loss/hidden": 2.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.4439200907945633, + "step": 268 + }, + { + "epoch": 0.0084375, + "grad_norm": 12.8125, + "grad_norm_var": 5.396858723958333, + "learning_rate": 0.0001, + "loss": 9.8102, + "loss/crossentropy": 2.7316226959228516, + "loss/hidden": 2.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.435204416513443, + "step": 270 + }, + { + "epoch": 0.0085, + "grad_norm": 11.625, + "grad_norm_var": 2.6581380208333334, + "learning_rate": 0.0001, + "loss": 10.3313, + "loss/crossentropy": 2.8637452125549316, + "loss/hidden": 2.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.4678504019975662, + "step": 272 + }, + { + "epoch": 0.0085625, + "grad_norm": 8.25, + "grad_norm_var": 2.36773681640625, + "learning_rate": 0.0001, + "loss": 10.0692, + "loss/crossentropy": 2.905339241027832, + "loss/hidden": 2.703125, + "loss/jsd": 0.0, + "loss/logits": 0.44607456028461456, + "step": 274 + }, + { + "epoch": 0.008625, + "grad_norm": 8.875, + "grad_norm_var": 2.241890462239583, + "learning_rate": 0.0001, + "loss": 9.7537, + "loss/crossentropy": 2.7921040058135986, + "loss/hidden": 2.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.43287399411201477, + "step": 276 + }, + { + "epoch": 0.0086875, + "grad_norm": 11.0, + "grad_norm_var": 2.432059733072917, + "learning_rate": 0.0001, + "loss": 9.8559, + "loss/crossentropy": 2.66982901096344, + "loss/hidden": 2.671875, + "loss/jsd": 0.0, + "loss/logits": 0.45141659677028656, + "step": 278 + }, + { + "epoch": 0.00875, + "grad_norm": 8.1875, + "grad_norm_var": 2.079150390625, + "learning_rate": 0.0001, + "loss": 9.8994, + "loss/crossentropy": 2.9137284755706787, + "loss/hidden": 2.59375, + "loss/jsd": 0.0, + "loss/logits": 0.4391949772834778, + "step": 280 + }, + { + "epoch": 0.0088125, + "grad_norm": 8.1875, + "grad_norm_var": 2.11875, + "learning_rate": 0.0001, + "loss": 9.8661, + "loss/crossentropy": 2.753507614135742, + "loss/hidden": 2.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.4479823410511017, + "step": 282 + }, + { + "epoch": 0.008875, + "grad_norm": 8.25, + "grad_norm_var": 2.0929524739583334, + "learning_rate": 0.0001, + "loss": 9.7656, + "loss/crossentropy": 2.8703192472457886, + "loss/hidden": 2.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.4324973225593567, + "step": 284 + }, + { + "epoch": 0.0089375, + "grad_norm": 8.1875, + "grad_norm_var": 1.2997233072916667, + "learning_rate": 0.0001, + "loss": 9.2983, + "loss/crossentropy": 2.6686055660247803, + "loss/hidden": 2.609375, + "loss/jsd": 0.0, + "loss/logits": 0.4020322114229202, + "step": 286 + }, + { + "epoch": 0.009, + "grad_norm": 7.28125, + "grad_norm_var": 0.891796875, + "learning_rate": 0.0001, + "loss": 9.6798, + "loss/crossentropy": 2.915258288383484, + "loss/hidden": 2.546875, + "loss/jsd": 0.0, + "loss/logits": 0.42177151143550873, + "step": 288 + }, + { + "epoch": 0.0090625, + "grad_norm": 7.78125, + "grad_norm_var": 0.80250244140625, + "learning_rate": 0.0001, + "loss": 9.7686, + "loss/crossentropy": 2.877256751060486, + "loss/hidden": 2.609375, + "loss/jsd": 0.0, + "loss/logits": 0.42820094525814056, + "step": 290 + }, + { + "epoch": 0.009125, + "grad_norm": 7.8125, + "grad_norm_var": 0.90875244140625, + "learning_rate": 0.0001, + "loss": 9.3521, + "loss/crossentropy": 2.6476125717163086, + "loss/hidden": 2.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.4134131520986557, + "step": 292 + }, + { + "epoch": 0.0091875, + "grad_norm": 9.5, + "grad_norm_var": 0.52418212890625, + "learning_rate": 0.0001, + "loss": 9.1739, + "loss/crossentropy": 2.618640184402466, + "loss/hidden": 2.640625, + "loss/jsd": 0.0, + "loss/logits": 0.391463965177536, + "step": 294 + }, + { + "epoch": 0.00925, + "grad_norm": 7.90625, + "grad_norm_var": 0.579541015625, + "learning_rate": 0.0001, + "loss": 9.3249, + "loss/crossentropy": 2.5597482919692993, + "loss/hidden": 2.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.4147980213165283, + "step": 296 + }, + { + "epoch": 0.0093125, + "grad_norm": 7.46875, + "grad_norm_var": 0.4962076822916667, + "learning_rate": 0.0001, + "loss": 9.5687, + "loss/crossentropy": 2.7897287607192993, + "loss/hidden": 2.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.4224274307489395, + "step": 298 + }, + { + "epoch": 0.009375, + "grad_norm": 8.9375, + "grad_norm_var": 0.53707275390625, + "learning_rate": 0.0001, + "loss": 9.7683, + "loss/crossentropy": 2.743291974067688, + "loss/hidden": 2.625, + "loss/jsd": 0.0, + "loss/logits": 0.43999941647052765, + "step": 300 + }, + { + "epoch": 0.0094375, + "grad_norm": 9.625, + "grad_norm_var": 0.9377888997395833, + "learning_rate": 0.0001, + "loss": 10.1141, + "loss/crossentropy": 2.968402862548828, + "loss/hidden": 2.65625, + "loss/jsd": 0.0, + "loss/logits": 0.4489475339651108, + "step": 302 + }, + { + "epoch": 0.0095, + "grad_norm": 7.65625, + "grad_norm_var": 1.1374837239583333, + "learning_rate": 0.0001, + "loss": 9.9672, + "loss/crossentropy": 2.921197533607483, + "loss/hidden": 2.609375, + "loss/jsd": 0.0, + "loss/logits": 0.4436652660369873, + "step": 304 + }, + { + "epoch": 0.0095625, + "grad_norm": 9.125, + "grad_norm_var": 1.0794230143229167, + "learning_rate": 0.0001, + "loss": 9.9847, + "loss/crossentropy": 2.8543198108673096, + "loss/hidden": 2.578125, + "loss/jsd": 0.0, + "loss/logits": 0.45522603392601013, + "step": 306 + }, + { + "epoch": 0.009625, + "grad_norm": 7.96875, + "grad_norm_var": 1.0253255208333334, + "learning_rate": 0.0001, + "loss": 9.478, + "loss/crossentropy": 2.711910605430603, + "loss/hidden": 2.625, + "loss/jsd": 0.0, + "loss/logits": 0.41411033272743225, + "step": 308 + }, + { + "epoch": 0.0096875, + "grad_norm": 8.6875, + "grad_norm_var": 0.957666015625, + "learning_rate": 0.0001, + "loss": 9.7925, + "loss/crossentropy": 2.9207193851470947, + "loss/hidden": 2.625, + "loss/jsd": 0.0, + "loss/logits": 0.42468030750751495, + "step": 310 + }, + { + "epoch": 0.00975, + "grad_norm": 8.375, + "grad_norm_var": 0.952197265625, + "learning_rate": 0.0001, + "loss": 9.4838, + "loss/crossentropy": 2.861418128013611, + "loss/hidden": 2.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.40833115577697754, + "step": 312 + }, + { + "epoch": 0.0098125, + "grad_norm": 8.8125, + "grad_norm_var": 3.8309733072916665, + "learning_rate": 0.0001, + "loss": 9.8637, + "loss/crossentropy": 2.8941240310668945, + "loss/hidden": 2.5625, + "loss/jsd": 0.0, + "loss/logits": 0.44070352613925934, + "step": 314 + }, + { + "epoch": 0.009875, + "grad_norm": 8.1875, + "grad_norm_var": 3.8521443684895833, + "learning_rate": 0.0001, + "loss": 9.5803, + "loss/crossentropy": 2.8630210161209106, + "loss/hidden": 2.5625, + "loss/jsd": 0.0, + "loss/logits": 0.41548123955726624, + "step": 316 + }, + { + "epoch": 0.0099375, + "grad_norm": 8.875, + "grad_norm_var": 3.709403483072917, + "learning_rate": 0.0001, + "loss": 9.5554, + "loss/crossentropy": 2.6127843856811523, + "loss/hidden": 2.625, + "loss/jsd": 0.0, + "loss/logits": 0.43175867199897766, + "step": 318 + }, + { + "epoch": 0.01, + "grad_norm": 7.9375, + "grad_norm_var": 3.5029947916666666, + "learning_rate": 0.0001, + "loss": 9.2689, + "loss/crossentropy": 2.6942365169525146, + "loss/hidden": 2.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.40199390053749084, + "step": 320 + }, + { + "epoch": 0.0100625, + "grad_norm": 9.0, + "grad_norm_var": 3.4977213541666665, + "learning_rate": 0.0001, + "loss": 9.6463, + "loss/crossentropy": 2.8481950759887695, + "loss/hidden": 2.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.42277809977531433, + "step": 322 + }, + { + "epoch": 0.010125, + "grad_norm": 8.3125, + "grad_norm_var": 3.5474894205729166, + "learning_rate": 0.0001, + "loss": 9.4688, + "loss/crossentropy": 2.792279362678528, + "loss/hidden": 2.546875, + "loss/jsd": 0.0, + "loss/logits": 0.4129619151353836, + "step": 324 + }, + { + "epoch": 0.0101875, + "grad_norm": 7.71875, + "grad_norm_var": 3.8777180989583333, + "learning_rate": 0.0001, + "loss": 9.6284, + "loss/crossentropy": 2.919487237930298, + "loss/hidden": 2.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.42011047899723053, + "step": 326 + }, + { + "epoch": 0.01025, + "grad_norm": 11.6875, + "grad_norm_var": 4.332255045572917, + "learning_rate": 0.0001, + "loss": 8.8365, + "loss/crossentropy": 2.4436700344085693, + "loss/hidden": 2.5625, + "loss/jsd": 0.0, + "loss/logits": 0.3830350488424301, + "step": 328 + }, + { + "epoch": 0.0103125, + "grad_norm": 8.6875, + "grad_norm_var": 1.2620402018229167, + "learning_rate": 0.0001, + "loss": 10.4608, + "loss/crossentropy": 3.0949630737304688, + "loss/hidden": 2.625, + "loss/jsd": 0.0, + "loss/logits": 0.4740859717130661, + "step": 330 + }, + { + "epoch": 0.010375, + "grad_norm": 9.8125, + "grad_norm_var": 1.4888631184895833, + "learning_rate": 0.0001, + "loss": 10.0661, + "loss/crossentropy": 2.9477418661117554, + "loss/hidden": 2.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.45167967677116394, + "step": 332 + }, + { + "epoch": 0.0104375, + "grad_norm": 7.1875, + "grad_norm_var": 1.8235677083333333, + "learning_rate": 0.0001, + "loss": 9.2473, + "loss/crossentropy": 2.713068962097168, + "loss/hidden": 2.546875, + "loss/jsd": 0.0, + "loss/logits": 0.39873576164245605, + "step": 334 + }, + { + "epoch": 0.0105, + "grad_norm": 8.4375, + "grad_norm_var": 1.9116170247395834, + "learning_rate": 0.0001, + "loss": 9.017, + "loss/crossentropy": 2.5754904747009277, + "loss/hidden": 2.546875, + "loss/jsd": 0.0, + "loss/logits": 0.3894636482000351, + "step": 336 + }, + { + "epoch": 0.0105625, + "grad_norm": 7.5625, + "grad_norm_var": 1.9513020833333334, + "learning_rate": 0.0001, + "loss": 9.322, + "loss/crossentropy": 2.7804969549179077, + "loss/hidden": 2.5, + "loss/jsd": 0.0, + "loss/logits": 0.40414653718471527, + "step": 338 + }, + { + "epoch": 0.010625, + "grad_norm": 7.28125, + "grad_norm_var": 2.0599568684895835, + "learning_rate": 0.0001, + "loss": 9.4558, + "loss/crossentropy": 2.9021013975143433, + "loss/hidden": 2.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.40615350008010864, + "step": 340 + }, + { + "epoch": 0.0106875, + "grad_norm": 7.53125, + "grad_norm_var": 1.9496053059895833, + "learning_rate": 0.0001, + "loss": 9.7347, + "loss/crossentropy": 2.8895071744918823, + "loss/hidden": 2.53125, + "loss/jsd": 0.0, + "loss/logits": 0.4313907325267792, + "step": 342 + }, + { + "epoch": 0.01075, + "grad_norm": 7.5625, + "grad_norm_var": 1.24234619140625, + "learning_rate": 0.0001, + "loss": 9.2208, + "loss/crossentropy": 2.8096436262130737, + "loss/hidden": 2.53125, + "loss/jsd": 0.0, + "loss/logits": 0.3879920691251755, + "step": 344 + }, + { + "epoch": 0.0108125, + "grad_norm": 8.375, + "grad_norm_var": 0.9823567708333333, + "learning_rate": 0.0001, + "loss": 8.3941, + "loss/crossentropy": 2.444824457168579, + "loss/hidden": 2.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.3472696393728256, + "step": 346 + }, + { + "epoch": 0.010875, + "grad_norm": 6.4375, + "grad_norm_var": 0.27044270833333334, + "learning_rate": 0.0001, + "loss": 9.3588, + "loss/crossentropy": 2.8576740026474, + "loss/hidden": 2.5, + "loss/jsd": 0.0, + "loss/logits": 0.40011417865753174, + "step": 348 + }, + { + "epoch": 0.0109375, + "grad_norm": 6.875, + "grad_norm_var": 0.30201416015625, + "learning_rate": 0.0001, + "loss": 9.0438, + "loss/crossentropy": 2.7489218711853027, + "loss/hidden": 2.453125, + "loss/jsd": 0.0, + "loss/logits": 0.3841765522956848, + "step": 350 + }, + { + "epoch": 0.011, + "grad_norm": 8.0, + "grad_norm_var": 0.26008707682291665, + "learning_rate": 0.0001, + "loss": 9.2174, + "loss/crossentropy": 2.693643808364868, + "loss/hidden": 2.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.398467093706131, + "step": 352 + }, + { + "epoch": 0.0110625, + "grad_norm": 7.34375, + "grad_norm_var": 0.2731404622395833, + "learning_rate": 0.0001, + "loss": 9.2476, + "loss/crossentropy": 2.8319369554519653, + "loss/hidden": 2.46875, + "loss/jsd": 0.0, + "loss/logits": 0.3946947753429413, + "step": 354 + }, + { + "epoch": 0.011125, + "grad_norm": 7.4375, + "grad_norm_var": 3.001936848958333, + "learning_rate": 0.0001, + "loss": 9.8609, + "loss/crossentropy": 2.928380846977234, + "loss/hidden": 2.515625, + "loss/jsd": 0.0, + "loss/logits": 0.44168923795223236, + "step": 356 + }, + { + "epoch": 0.0111875, + "grad_norm": 7.5625, + "grad_norm_var": 3.0012858072916666, + "learning_rate": 0.0001, + "loss": 9.7272, + "loss/crossentropy": 2.978438377380371, + "loss/hidden": 2.546875, + "loss/jsd": 0.0, + "loss/logits": 0.4201928675174713, + "step": 358 + }, + { + "epoch": 0.01125, + "grad_norm": 7.9375, + "grad_norm_var": 2.9525390625, + "learning_rate": 0.0001, + "loss": 9.7725, + "loss/crossentropy": 2.9407122135162354, + "loss/hidden": 2.515625, + "loss/jsd": 0.0, + "loss/logits": 0.4316175580024719, + "step": 360 + }, + { + "epoch": 0.0113125, + "grad_norm": 7.40625, + "grad_norm_var": 2.9946573893229167, + "learning_rate": 0.0001, + "loss": 9.1066, + "loss/crossentropy": 2.8210840225219727, + "loss/hidden": 2.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.38558244705200195, + "step": 362 + }, + { + "epoch": 0.011375, + "grad_norm": 6.84375, + "grad_norm_var": 2.9395792643229166, + "learning_rate": 0.0001, + "loss": 9.1715, + "loss/crossentropy": 2.683255434036255, + "loss/hidden": 2.484375, + "loss/jsd": 0.0, + "loss/logits": 0.40038590133190155, + "step": 364 + }, + { + "epoch": 0.0114375, + "grad_norm": 7.28125, + "grad_norm_var": 2.906233723958333, + "learning_rate": 0.0001, + "loss": 8.6687, + "loss/crossentropy": 2.61862576007843, + "loss/hidden": 2.40625, + "loss/jsd": 0.0, + "loss/logits": 0.3643851727247238, + "step": 366 + }, + { + "epoch": 0.0115, + "grad_norm": 7.71875, + "grad_norm_var": 2.9218098958333334, + "learning_rate": 0.0001, + "loss": 8.9238, + "loss/crossentropy": 2.6416503190994263, + "loss/hidden": 2.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.38056348264217377, + "step": 368 + }, + { + "epoch": 0.0115625, + "grad_norm": 7.90625, + "grad_norm_var": 2.856864420572917, + "learning_rate": 0.0001, + "loss": 9.4662, + "loss/crossentropy": 2.924672245979309, + "loss/hidden": 2.46875, + "loss/jsd": 0.0, + "loss/logits": 0.40727290511131287, + "step": 370 + }, + { + "epoch": 0.011625, + "grad_norm": 7.625, + "grad_norm_var": 0.15422770182291667, + "learning_rate": 0.0001, + "loss": 9.5455, + "loss/crossentropy": 2.887393593788147, + "loss/hidden": 2.484375, + "loss/jsd": 0.0, + "loss/logits": 0.4173741787672043, + "step": 372 + }, + { + "epoch": 0.0116875, + "grad_norm": 7.09375, + "grad_norm_var": 0.160009765625, + "learning_rate": 0.0001, + "loss": 9.3231, + "loss/crossentropy": 2.940009832382202, + "loss/hidden": 2.421875, + "loss/jsd": 0.0, + "loss/logits": 0.39612552523612976, + "step": 374 + }, + { + "epoch": 0.01175, + "grad_norm": 7.4375, + "grad_norm_var": 0.14763997395833334, + "learning_rate": 0.0001, + "loss": 8.7557, + "loss/crossentropy": 2.558520793914795, + "loss/hidden": 2.4375, + "loss/jsd": 0.0, + "loss/logits": 0.3759680688381195, + "step": 376 + }, + { + "epoch": 0.0118125, + "grad_norm": 7.84375, + "grad_norm_var": 0.17099202473958333, + "learning_rate": 0.0001, + "loss": 9.4308, + "loss/crossentropy": 2.7902355194091797, + "loss/hidden": 2.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.4195282459259033, + "step": 378 + }, + { + "epoch": 0.011875, + "grad_norm": 6.90625, + "grad_norm_var": 0.19140625, + "learning_rate": 0.0001, + "loss": 8.8902, + "loss/crossentropy": 2.752501130104065, + "loss/hidden": 2.375, + "loss/jsd": 0.0, + "loss/logits": 0.37626585364341736, + "step": 380 + }, + { + "epoch": 0.0119375, + "grad_norm": 8.125, + "grad_norm_var": 0.71929931640625, + "learning_rate": 0.0001, + "loss": 9.3502, + "loss/crossentropy": 2.772351384162903, + "loss/hidden": 2.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.40857018530368805, + "step": 382 + }, + { + "epoch": 0.012, + "grad_norm": 7.21875, + "grad_norm_var": 0.77066650390625, + "learning_rate": 0.0001, + "loss": 9.3194, + "loss/crossentropy": 2.7217490673065186, + "loss/hidden": 2.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.4089791476726532, + "step": 384 + }, + { + "epoch": 0.0120625, + "grad_norm": 7.65625, + "grad_norm_var": 0.7696451822916667, + "learning_rate": 0.0001, + "loss": 9.2867, + "loss/crossentropy": 2.7904086112976074, + "loss/hidden": 2.46875, + "loss/jsd": 0.0, + "loss/logits": 0.402749627828598, + "step": 386 + }, + { + "epoch": 0.012125, + "grad_norm": 7.84375, + "grad_norm_var": 0.7747029622395833, + "learning_rate": 0.0001, + "loss": 9.4515, + "loss/crossentropy": 2.9278364181518555, + "loss/hidden": 2.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.4109601080417633, + "step": 388 + }, + { + "epoch": 0.0121875, + "grad_norm": 7.78125, + "grad_norm_var": 0.775390625, + "learning_rate": 0.0001, + "loss": 9.0024, + "loss/crossentropy": 2.808686137199402, + "loss/hidden": 2.40625, + "loss/jsd": 0.0, + "loss/logits": 0.37874314188957214, + "step": 390 + }, + { + "epoch": 0.01225, + "grad_norm": 7.40625, + "grad_norm_var": 0.765869140625, + "learning_rate": 0.0001, + "loss": 8.8575, + "loss/crossentropy": 2.651176333427429, + "loss/hidden": 2.390625, + "loss/jsd": 0.0, + "loss/logits": 0.38156652450561523, + "step": 392 + }, + { + "epoch": 0.0123125, + "grad_norm": 8.625, + "grad_norm_var": 0.8224568684895833, + "learning_rate": 0.0001, + "loss": 10.1481, + "loss/crossentropy": 3.143176555633545, + "loss/hidden": 2.484375, + "loss/jsd": 0.0, + "loss/logits": 0.45205217599868774, + "step": 394 + }, + { + "epoch": 0.012375, + "grad_norm": 7.03125, + "grad_norm_var": 0.7639933268229167, + "learning_rate": 0.0001, + "loss": 8.8441, + "loss/crossentropy": 2.7559027671813965, + "loss/hidden": 2.390625, + "loss/jsd": 0.0, + "loss/logits": 0.36975668370723724, + "step": 396 + }, + { + "epoch": 0.0124375, + "grad_norm": 7.03125, + "grad_norm_var": 0.32005208333333335, + "learning_rate": 0.0001, + "loss": 9.0988, + "loss/crossentropy": 2.8294628858566284, + "loss/hidden": 2.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.3824039399623871, + "step": 398 + }, + { + "epoch": 0.0125, + "grad_norm": 7.125, + "grad_norm_var": 0.24947509765625, + "learning_rate": 0.0001, + "loss": 9.2609, + "loss/crossentropy": 2.8465192317962646, + "loss/hidden": 2.453125, + "loss/jsd": 0.0, + "loss/logits": 0.39612245559692383, + "step": 400 + }, + { + "epoch": 0.0125625, + "grad_norm": 7.375, + "grad_norm_var": 0.25227457682291665, + "learning_rate": 0.0001, + "loss": 9.135, + "loss/crossentropy": 2.829011917114258, + "loss/hidden": 2.375, + "loss/jsd": 0.0, + "loss/logits": 0.39309877157211304, + "step": 402 + }, + { + "epoch": 0.012625, + "grad_norm": 6.625, + "grad_norm_var": 0.26951497395833335, + "learning_rate": 0.0001, + "loss": 8.5141, + "loss/crossentropy": 2.407685399055481, + "loss/hidden": 2.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.37392735481262207, + "step": 404 + }, + { + "epoch": 0.0126875, + "grad_norm": 6.96875, + "grad_norm_var": 0.25638020833333336, + "learning_rate": 0.0001, + "loss": 9.0194, + "loss/crossentropy": 2.782030463218689, + "loss/hidden": 2.390625, + "loss/jsd": 0.0, + "loss/logits": 0.3846723139286041, + "step": 406 + }, + { + "epoch": 0.01275, + "grad_norm": 6.84375, + "grad_norm_var": 0.3097005208333333, + "learning_rate": 0.0001, + "loss": 7.9774, + "loss/crossentropy": 2.263835072517395, + "loss/hidden": 2.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.336203470826149, + "step": 408 + }, + { + "epoch": 0.0128125, + "grad_norm": 7.25, + "grad_norm_var": 0.13372395833333334, + "learning_rate": 0.0001, + "loss": 9.1392, + "loss/crossentropy": 2.795884609222412, + "loss/hidden": 2.453125, + "loss/jsd": 0.0, + "loss/logits": 0.38901545107364655, + "step": 410 + }, + { + "epoch": 0.012875, + "grad_norm": 7.25, + "grad_norm_var": 0.24560139973958334, + "learning_rate": 0.0001, + "loss": 9.8796, + "loss/crossentropy": 3.1521027088165283, + "loss/hidden": 2.453125, + "loss/jsd": 0.0, + "loss/logits": 0.427437886595726, + "step": 412 + }, + { + "epoch": 0.0129375, + "grad_norm": 7.125, + "grad_norm_var": 0.24451497395833333, + "learning_rate": 0.0001, + "loss": 9.2833, + "loss/crossentropy": 2.9037975072860718, + "loss/hidden": 2.40625, + "loss/jsd": 0.0, + "loss/logits": 0.3973206430673599, + "step": 414 + }, + { + "epoch": 0.013, + "grad_norm": 6.65625, + "grad_norm_var": 0.2760701497395833, + "learning_rate": 0.0001, + "loss": 9.1047, + "loss/crossentropy": 2.8156282901763916, + "loss/hidden": 2.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.38906630873680115, + "step": 416 + }, + { + "epoch": 0.0130625, + "grad_norm": 8.75, + "grad_norm_var": 0.44397379557291666, + "learning_rate": 0.0001, + "loss": 9.1789, + "loss/crossentropy": 2.7314385175704956, + "loss/hidden": 2.375, + "loss/jsd": 0.0, + "loss/logits": 0.40724538266658783, + "step": 418 + }, + { + "epoch": 0.013125, + "grad_norm": 8.125, + "grad_norm_var": 0.4571451822916667, + "learning_rate": 0.0001, + "loss": 9.4066, + "loss/crossentropy": 2.7825275659561157, + "loss/hidden": 2.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.4147540330886841, + "step": 420 + }, + { + "epoch": 0.0131875, + "grad_norm": 6.03125, + "grad_norm_var": 0.5766276041666667, + "learning_rate": 0.0001, + "loss": 9.1609, + "loss/crossentropy": 2.930117607116699, + "loss/hidden": 2.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.3785484880208969, + "step": 422 + }, + { + "epoch": 0.01325, + "grad_norm": 5.90625, + "grad_norm_var": 0.6495442708333333, + "learning_rate": 0.0001, + "loss": 8.6362, + "loss/crossentropy": 2.676853656768799, + "loss/hidden": 2.359375, + "loss/jsd": 0.0, + "loss/logits": 0.3599983751773834, + "step": 424 + }, + { + "epoch": 0.0133125, + "grad_norm": 7.5, + "grad_norm_var": 0.624853515625, + "learning_rate": 0.0001, + "loss": 8.7011, + "loss/crossentropy": 2.5884207487106323, + "loss/hidden": 2.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.36829495429992676, + "step": 426 + }, + { + "epoch": 0.013375, + "grad_norm": 7.53125, + "grad_norm_var": 0.5376953125, + "learning_rate": 0.0001, + "loss": 9.7283, + "loss/crossentropy": 2.9578946828842163, + "loss/hidden": 2.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.42625896632671356, + "step": 428 + }, + { + "epoch": 0.0134375, + "grad_norm": 7.8125, + "grad_norm_var": 0.5708943684895833, + "learning_rate": 0.0001, + "loss": 9.2129, + "loss/crossentropy": 2.830846667289734, + "loss/hidden": 2.34375, + "loss/jsd": 0.0, + "loss/logits": 0.4038323014974594, + "step": 430 + }, + { + "epoch": 0.0135, + "grad_norm": 7.15625, + "grad_norm_var": 0.53961181640625, + "learning_rate": 0.0001, + "loss": 9.1391, + "loss/crossentropy": 2.934818983078003, + "loss/hidden": 2.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.3837139457464218, + "step": 432 + }, + { + "epoch": 0.0135625, + "grad_norm": 6.53125, + "grad_norm_var": 0.42125244140625, + "learning_rate": 0.0001, + "loss": 8.8191, + "loss/crossentropy": 2.789232611656189, + "loss/hidden": 2.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.3709518015384674, + "step": 434 + }, + { + "epoch": 0.013625, + "grad_norm": 7.03125, + "grad_norm_var": 0.33824462890625, + "learning_rate": 0.0001, + "loss": 8.8767, + "loss/crossentropy": 2.723211646080017, + "loss/hidden": 2.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.3770655542612076, + "step": 436 + }, + { + "epoch": 0.0136875, + "grad_norm": 7.15625, + "grad_norm_var": 0.3138631184895833, + "learning_rate": 0.0001, + "loss": 8.6882, + "loss/crossentropy": 2.7312934398651123, + "loss/hidden": 2.34375, + "loss/jsd": 0.0, + "loss/logits": 0.36132051050662994, + "step": 438 + }, + { + "epoch": 0.01375, + "grad_norm": 7.40625, + "grad_norm_var": 0.30740559895833336, + "learning_rate": 0.0001, + "loss": 9.2994, + "loss/crossentropy": 3.003957748413086, + "loss/hidden": 2.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.38345402479171753, + "step": 440 + }, + { + "epoch": 0.0138125, + "grad_norm": 7.1875, + "grad_norm_var": 0.32190348307291666, + "learning_rate": 0.0001, + "loss": 9.5717, + "loss/crossentropy": 2.9630134105682373, + "loss/hidden": 2.421875, + "loss/jsd": 0.0, + "loss/logits": 0.4186822474002838, + "step": 442 + }, + { + "epoch": 0.013875, + "grad_norm": 7.5, + "grad_norm_var": 0.3284993489583333, + "learning_rate": 0.0001, + "loss": 8.9237, + "loss/crossentropy": 2.695352554321289, + "loss/hidden": 2.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.3845515549182892, + "step": 444 + }, + { + "epoch": 0.0139375, + "grad_norm": 12.75, + "grad_norm_var": 2.301416015625, + "learning_rate": 0.0001, + "loss": 10.1005, + "loss/crossentropy": 2.9550745487213135, + "loss/hidden": 2.5, + "loss/jsd": 0.0, + "loss/logits": 0.4645442068576813, + "step": 446 + }, + { + "epoch": 0.014, + "grad_norm": 7.125, + "grad_norm_var": 2.3378865559895834, + "learning_rate": 0.0001, + "loss": 9.2894, + "loss/crossentropy": 2.7871328592300415, + "loss/hidden": 2.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.40257345139980316, + "step": 448 + }, + { + "epoch": 0.0140625, + "grad_norm": 6.59375, + "grad_norm_var": 2.273270670572917, + "learning_rate": 0.0001, + "loss": 8.9407, + "loss/crossentropy": 2.749815583229065, + "loss/hidden": 2.328125, + "loss/jsd": 0.0, + "loss/logits": 0.38627225160598755, + "step": 450 + }, + { + "epoch": 0.014125, + "grad_norm": 7.0, + "grad_norm_var": 2.2490519205729167, + "learning_rate": 0.0001, + "loss": 9.0574, + "loss/crossentropy": 2.733398675918579, + "loss/hidden": 2.359375, + "loss/jsd": 0.0, + "loss/logits": 0.39646580815315247, + "step": 452 + }, + { + "epoch": 0.0141875, + "grad_norm": 6.90625, + "grad_norm_var": 2.060530598958333, + "learning_rate": 0.0001, + "loss": 9.1024, + "loss/crossentropy": 2.782361626625061, + "loss/hidden": 2.390625, + "loss/jsd": 0.0, + "loss/logits": 0.3929390609264374, + "step": 454 + }, + { + "epoch": 0.01425, + "grad_norm": 7.125, + "grad_norm_var": 2.107405598958333, + "learning_rate": 0.0001, + "loss": 8.9667, + "loss/crossentropy": 2.81704843044281, + "loss/hidden": 2.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.3735543340444565, + "step": 456 + }, + { + "epoch": 0.0143125, + "grad_norm": 7.40625, + "grad_norm_var": 2.492041015625, + "learning_rate": 0.0001, + "loss": 8.9565, + "loss/crossentropy": 2.850237250328064, + "loss/hidden": 2.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.3707829564809799, + "step": 458 + }, + { + "epoch": 0.014375, + "grad_norm": 6.78125, + "grad_norm_var": 2.554931640625, + "learning_rate": 0.0001, + "loss": 8.6868, + "loss/crossentropy": 2.717839241027832, + "loss/hidden": 2.3125, + "loss/jsd": 0.0, + "loss/logits": 0.3656424283981323, + "step": 460 + }, + { + "epoch": 0.0144375, + "grad_norm": 6.96875, + "grad_norm_var": 0.7952473958333334, + "learning_rate": 0.0001, + "loss": 9.2741, + "loss/crossentropy": 2.896919012069702, + "loss/hidden": 2.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.3994409739971161, + "step": 462 + }, + { + "epoch": 0.0145, + "grad_norm": 6.6875, + "grad_norm_var": 0.8183430989583333, + "learning_rate": 0.0001, + "loss": 8.9309, + "loss/crossentropy": 2.8172919750213623, + "loss/hidden": 2.359375, + "loss/jsd": 0.0, + "loss/logits": 0.3754217326641083, + "step": 464 + }, + { + "epoch": 0.0145625, + "grad_norm": 6.78125, + "grad_norm_var": 0.8165323893229167, + "learning_rate": 0.0001, + "loss": 8.9581, + "loss/crossentropy": 2.779883623123169, + "loss/hidden": 2.375, + "loss/jsd": 0.0, + "loss/logits": 0.3803219199180603, + "step": 466 + }, + { + "epoch": 0.014625, + "grad_norm": 7.1875, + "grad_norm_var": 0.7966145833333333, + "learning_rate": 0.0001, + "loss": 8.8229, + "loss/crossentropy": 2.738626718521118, + "loss/hidden": 2.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.3732661008834839, + "step": 468 + }, + { + "epoch": 0.0146875, + "grad_norm": 10.875, + "grad_norm_var": 1.6305338541666667, + "learning_rate": 0.0001, + "loss": 9.3225, + "loss/crossentropy": 2.7142670154571533, + "loss/hidden": 2.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.4131620526313782, + "step": 470 + }, + { + "epoch": 0.01475, + "grad_norm": 6.75, + "grad_norm_var": 1.64146728515625, + "learning_rate": 0.0001, + "loss": 8.7388, + "loss/crossentropy": 2.8026680946350098, + "loss/hidden": 2.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.35846084356307983, + "step": 472 + }, + { + "epoch": 0.0148125, + "grad_norm": 6.3125, + "grad_norm_var": 1.345556640625, + "learning_rate": 0.0001, + "loss": 9.0225, + "loss/crossentropy": 2.877347230911255, + "loss/hidden": 2.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.38092535734176636, + "step": 474 + }, + { + "epoch": 0.014875, + "grad_norm": 6.375, + "grad_norm_var": 1.351416015625, + "learning_rate": 0.0001, + "loss": 9.2148, + "loss/crossentropy": 2.9207284450531006, + "loss/hidden": 2.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.39269061386585236, + "step": 476 + }, + { + "epoch": 0.0149375, + "grad_norm": 6.46875, + "grad_norm_var": 1.3369140625, + "learning_rate": 0.0001, + "loss": 8.772, + "loss/crossentropy": 2.7076833248138428, + "loss/hidden": 2.421875, + "loss/jsd": 0.0, + "loss/logits": 0.3642459362745285, + "step": 478 + }, + { + "epoch": 0.015, + "grad_norm": 6.21875, + "grad_norm_var": 1.34088134765625, + "learning_rate": 0.0001, + "loss": 8.7427, + "loss/crossentropy": 2.684576988220215, + "loss/hidden": 2.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.3769093304872513, + "step": 480 + }, + { + "epoch": 0.0150625, + "grad_norm": 6.59375, + "grad_norm_var": 1.45689697265625, + "learning_rate": 0.0001, + "loss": 8.6295, + "loss/crossentropy": 2.735003113746643, + "loss/hidden": 2.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.36054036021232605, + "step": 482 + }, + { + "epoch": 0.015125, + "grad_norm": 7.59375, + "grad_norm_var": 1.4751139322916667, + "learning_rate": 0.0001, + "loss": 9.1039, + "loss/crossentropy": 2.8056256771087646, + "loss/hidden": 2.40625, + "loss/jsd": 0.0, + "loss/logits": 0.3891993761062622, + "step": 484 + }, + { + "epoch": 0.0151875, + "grad_norm": 6.78125, + "grad_norm_var": 0.46174723307291665, + "learning_rate": 0.0001, + "loss": 9.1391, + "loss/crossentropy": 2.9603331089019775, + "loss/hidden": 2.359375, + "loss/jsd": 0.0, + "loss/logits": 0.3819381147623062, + "step": 486 + }, + { + "epoch": 0.01525, + "grad_norm": 7.84375, + "grad_norm_var": 0.5404256184895834, + "learning_rate": 0.0001, + "loss": 9.0152, + "loss/crossentropy": 2.802178978919983, + "loss/hidden": 2.34375, + "loss/jsd": 0.0, + "loss/logits": 0.3869243860244751, + "step": 488 + }, + { + "epoch": 0.0153125, + "grad_norm": 6.28125, + "grad_norm_var": 0.27545166015625, + "learning_rate": 0.0001, + "loss": 8.6984, + "loss/crossentropy": 2.745243191719055, + "loss/hidden": 2.3125, + "loss/jsd": 0.0, + "loss/logits": 0.3640620857477188, + "step": 490 + }, + { + "epoch": 0.015375, + "grad_norm": 6.59375, + "grad_norm_var": 0.27112223307291666, + "learning_rate": 0.0001, + "loss": 8.5364, + "loss/crossentropy": 2.6457338333129883, + "loss/hidden": 2.296875, + "loss/jsd": 0.0, + "loss/logits": 0.3593788295984268, + "step": 492 + }, + { + "epoch": 0.0154375, + "grad_norm": 6.3125, + "grad_norm_var": 0.2749837239583333, + "learning_rate": 0.0001, + "loss": 8.7473, + "loss/crossentropy": 2.8219770193099976, + "loss/hidden": 2.296875, + "loss/jsd": 0.0, + "loss/logits": 0.3628465384244919, + "step": 494 + }, + { + "epoch": 0.0155, + "grad_norm": 6.46875, + "grad_norm_var": 0.28863525390625, + "learning_rate": 0.0001, + "loss": 9.2873, + "loss/crossentropy": 2.9155017137527466, + "loss/hidden": 2.375, + "loss/jsd": 0.0, + "loss/logits": 0.3996797800064087, + "step": 496 + }, + { + "epoch": 0.0155625, + "grad_norm": 7.9375, + "grad_norm_var": 0.3087076822916667, + "learning_rate": 0.0001, + "loss": 8.8924, + "loss/crossentropy": 2.916568875312805, + "loss/hidden": 2.3125, + "loss/jsd": 0.0, + "loss/logits": 0.3663354367017746, + "step": 498 + }, + { + "epoch": 0.015625, + "grad_norm": 6.96875, + "grad_norm_var": 0.2665201822916667, + "learning_rate": 0.0001, + "loss": 8.9003, + "loss/crossentropy": 2.785780906677246, + "loss/hidden": 2.421875, + "loss/jsd": 0.0, + "loss/logits": 0.36926528811454773, + "step": 500 + }, + { + "epoch": 0.0156875, + "grad_norm": 7.15625, + "grad_norm_var": 0.276025390625, + "learning_rate": 0.0001, + "loss": 8.8767, + "loss/crossentropy": 2.8471392393112183, + "loss/hidden": 2.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.3724879324436188, + "step": 502 + }, + { + "epoch": 0.01575, + "grad_norm": 6.15625, + "grad_norm_var": 0.22040608723958333, + "learning_rate": 0.0001, + "loss": 8.8554, + "loss/crossentropy": 2.868078351020813, + "loss/hidden": 2.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.36670316755771637, + "step": 504 + }, + { + "epoch": 0.0158125, + "grad_norm": 8.375, + "grad_norm_var": 0.3575358072916667, + "learning_rate": 0.0001, + "loss": 8.9408, + "loss/crossentropy": 2.7880598306655884, + "loss/hidden": 2.34375, + "loss/jsd": 0.0, + "loss/logits": 0.3809019774198532, + "step": 506 + }, + { + "epoch": 0.015875, + "grad_norm": 7.21875, + "grad_norm_var": 0.380712890625, + "learning_rate": 0.0001, + "loss": 8.7033, + "loss/crossentropy": 2.674094319343567, + "loss/hidden": 2.3125, + "loss/jsd": 0.0, + "loss/logits": 0.37166814506053925, + "step": 508 + }, + { + "epoch": 0.0159375, + "grad_norm": 6.28125, + "grad_norm_var": 0.3999837239583333, + "learning_rate": 0.0001, + "loss": 8.4149, + "loss/crossentropy": 2.6351935863494873, + "loss/hidden": 2.28125, + "loss/jsd": 0.0, + "loss/logits": 0.34984463453292847, + "step": 510 + }, + { + "epoch": 0.016, + "grad_norm": 5.875, + "grad_norm_var": 0.46190999348958334, + "learning_rate": 0.0001, + "loss": 8.6473, + "loss/crossentropy": 2.8746068477630615, + "loss/hidden": 2.28125, + "loss/jsd": 0.0, + "loss/logits": 0.3491448760032654, + "step": 512 + }, + { + "epoch": 0.0160625, + "grad_norm": 9.0625, + "grad_norm_var": 0.7147420247395834, + "learning_rate": 0.0001, + "loss": 9.3762, + "loss/crossentropy": 2.979753851890564, + "loss/hidden": 2.4375, + "loss/jsd": 0.0, + "loss/logits": 0.39589935541152954, + "step": 514 + }, + { + "epoch": 0.016125, + "grad_norm": 6.28125, + "grad_norm_var": 0.73521728515625, + "learning_rate": 0.0001, + "loss": 8.8009, + "loss/crossentropy": 2.7439844608306885, + "loss/hidden": 2.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.3720976561307907, + "step": 516 + }, + { + "epoch": 0.0161875, + "grad_norm": 7.3125, + "grad_norm_var": 0.7470703125, + "learning_rate": 0.0001, + "loss": 9.0876, + "loss/crossentropy": 2.9311214685440063, + "loss/hidden": 2.34375, + "loss/jsd": 0.0, + "loss/logits": 0.38126952946186066, + "step": 518 + }, + { + "epoch": 0.01625, + "grad_norm": 6.71875, + "grad_norm_var": 0.74068603515625, + "learning_rate": 0.0001, + "loss": 8.3644, + "loss/crossentropy": 2.749508023262024, + "loss/hidden": 2.296875, + "loss/jsd": 0.0, + "loss/logits": 0.331801176071167, + "step": 520 + }, + { + "epoch": 0.0163125, + "grad_norm": 5.875, + "grad_norm_var": 0.6261555989583333, + "learning_rate": 0.0001, + "loss": 8.3462, + "loss/crossentropy": 2.638471841812134, + "loss/hidden": 2.296875, + "loss/jsd": 0.0, + "loss/logits": 0.34108367562294006, + "step": 522 + }, + { + "epoch": 0.016375, + "grad_norm": 6.4375, + "grad_norm_var": 0.5886555989583333, + "learning_rate": 0.0001, + "loss": 8.7263, + "loss/crossentropy": 2.772747278213501, + "loss/hidden": 2.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.3680117428302765, + "step": 524 + }, + { + "epoch": 0.0164375, + "grad_norm": 7.8125, + "grad_norm_var": 0.6463826497395834, + "learning_rate": 0.0001, + "loss": 8.424, + "loss/crossentropy": 2.672973871231079, + "loss/hidden": 2.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.3415112644433975, + "step": 526 + }, + { + "epoch": 0.0165, + "grad_norm": 6.03125, + "grad_norm_var": 0.6551432291666667, + "learning_rate": 0.0001, + "loss": 8.9393, + "loss/crossentropy": 2.907612681388855, + "loss/hidden": 2.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.36489084362983704, + "step": 528 + }, + { + "epoch": 0.0165625, + "grad_norm": 7.78125, + "grad_norm_var": 0.3875, + "learning_rate": 0.0001, + "loss": 8.8, + "loss/crossentropy": 2.7299684286117554, + "loss/hidden": 2.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.3687261939048767, + "step": 530 + }, + { + "epoch": 0.016625, + "grad_norm": 6.90625, + "grad_norm_var": 0.36607666015625, + "learning_rate": 0.0001, + "loss": 8.7147, + "loss/crossentropy": 2.771029829978943, + "loss/hidden": 2.28125, + "loss/jsd": 0.0, + "loss/logits": 0.36624111235141754, + "step": 532 + }, + { + "epoch": 0.0166875, + "grad_norm": 7.03125, + "grad_norm_var": 0.36376546223958334, + "learning_rate": 0.0001, + "loss": 8.9628, + "loss/crossentropy": 2.915344715118408, + "loss/hidden": 2.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.36958499252796173, + "step": 534 + }, + { + "epoch": 0.01675, + "grad_norm": 8.6875, + "grad_norm_var": 0.5572224934895833, + "learning_rate": 0.0001, + "loss": 8.6004, + "loss/crossentropy": 2.746902346611023, + "loss/hidden": 2.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.3533162772655487, + "step": 536 + }, + { + "epoch": 0.0168125, + "grad_norm": 6.21875, + "grad_norm_var": 0.5417805989583333, + "learning_rate": 0.0001, + "loss": 8.8733, + "loss/crossentropy": 2.7585846185684204, + "loss/hidden": 2.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.37631259858608246, + "step": 538 + }, + { + "epoch": 0.016875, + "grad_norm": 6.625, + "grad_norm_var": 0.5559529622395833, + "learning_rate": 0.0001, + "loss": 8.5898, + "loss/crossentropy": 2.757652521133423, + "loss/hidden": 2.265625, + "loss/jsd": 0.0, + "loss/logits": 0.35664936900138855, + "step": 540 + }, + { + "epoch": 0.0169375, + "grad_norm": 6.40625, + "grad_norm_var": 0.5765909830729167, + "learning_rate": 0.0001, + "loss": 8.5212, + "loss/crossentropy": 2.701608419418335, + "loss/hidden": 2.265625, + "loss/jsd": 0.0, + "loss/logits": 0.35539373755455017, + "step": 542 + }, + { + "epoch": 0.017, + "grad_norm": 7.40625, + "grad_norm_var": 0.5002888997395833, + "learning_rate": 0.0001, + "loss": 8.5642, + "loss/crossentropy": 2.677858829498291, + "loss/hidden": 2.328125, + "loss/jsd": 0.0, + "loss/logits": 0.35582463443279266, + "step": 544 + }, + { + "epoch": 0.0170625, + "grad_norm": 6.375, + "grad_norm_var": 0.53140869140625, + "learning_rate": 0.0001, + "loss": 8.5848, + "loss/crossentropy": 2.7882959842681885, + "loss/hidden": 2.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.3507450073957443, + "step": 546 + }, + { + "epoch": 0.017125, + "grad_norm": 6.65625, + "grad_norm_var": 0.5334269205729166, + "learning_rate": 0.0001, + "loss": 8.2292, + "loss/crossentropy": 2.616737723350525, + "loss/hidden": 2.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.33077819645404816, + "step": 548 + }, + { + "epoch": 0.0171875, + "grad_norm": 7.0625, + "grad_norm_var": 0.5574503580729167, + "learning_rate": 0.0001, + "loss": 8.6128, + "loss/crossentropy": 2.8342689275741577, + "loss/hidden": 2.265625, + "loss/jsd": 0.0, + "loss/logits": 0.351292222738266, + "step": 550 + }, + { + "epoch": 0.01725, + "grad_norm": 6.375, + "grad_norm_var": 0.29099934895833335, + "learning_rate": 0.0001, + "loss": 9.0425, + "loss/crossentropy": 2.837349534034729, + "loss/hidden": 2.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.38379451632499695, + "step": 552 + }, + { + "epoch": 0.0173125, + "grad_norm": 9.75, + "grad_norm_var": 0.8583943684895833, + "learning_rate": 0.0001, + "loss": 8.8276, + "loss/crossentropy": 2.689685583114624, + "loss/hidden": 2.328125, + "loss/jsd": 0.0, + "loss/logits": 0.38097959756851196, + "step": 554 + }, + { + "epoch": 0.017375, + "grad_norm": 6.78125, + "grad_norm_var": 0.87076416015625, + "learning_rate": 0.0001, + "loss": 8.5501, + "loss/crossentropy": 2.777572512626648, + "loss/hidden": 2.265625, + "loss/jsd": 0.0, + "loss/logits": 0.35068848729133606, + "step": 556 + }, + { + "epoch": 0.0174375, + "grad_norm": 6.21875, + "grad_norm_var": 0.8652628580729167, + "learning_rate": 0.0001, + "loss": 8.6392, + "loss/crossentropy": 2.783918261528015, + "loss/hidden": 2.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.3566194474697113, + "step": 558 + }, + { + "epoch": 0.0175, + "grad_norm": 6.1875, + "grad_norm_var": 0.8312337239583333, + "learning_rate": 0.0001, + "loss": 8.3345, + "loss/crossentropy": 2.6030431985855103, + "loss/hidden": 2.265625, + "loss/jsd": 0.0, + "loss/logits": 0.3465864509344101, + "step": 560 + }, + { + "epoch": 0.0175625, + "grad_norm": 6.46875, + "grad_norm_var": 0.80621337890625, + "learning_rate": 0.0001, + "loss": 9.2577, + "loss/crossentropy": 2.9951740503311157, + "loss/hidden": 2.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.39578162133693695, + "step": 562 + }, + { + "epoch": 0.017625, + "grad_norm": 6.46875, + "grad_norm_var": 0.83209228515625, + "learning_rate": 0.0001, + "loss": 8.6354, + "loss/crossentropy": 2.8244831562042236, + "loss/hidden": 2.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.35530561208724976, + "step": 564 + }, + { + "epoch": 0.0176875, + "grad_norm": 6.6875, + "grad_norm_var": 0.7983357747395833, + "learning_rate": 0.0001, + "loss": 8.4636, + "loss/crossentropy": 2.7996731996536255, + "loss/hidden": 2.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.34061114490032196, + "step": 566 + }, + { + "epoch": 0.01775, + "grad_norm": 5.96875, + "grad_norm_var": 0.854296875, + "learning_rate": 0.0001, + "loss": 7.9341, + "loss/crossentropy": 2.5139763355255127, + "loss/hidden": 2.171875, + "loss/jsd": 0.0, + "loss/logits": 0.3248262405395508, + "step": 568 + }, + { + "epoch": 0.0178125, + "grad_norm": 6.0625, + "grad_norm_var": 0.12476806640625, + "learning_rate": 0.0001, + "loss": 8.2548, + "loss/crossentropy": 2.5873286724090576, + "loss/hidden": 2.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.3347187936306, + "step": 570 + }, + { + "epoch": 0.017875, + "grad_norm": 6.96875, + "grad_norm_var": 0.17823893229166668, + "learning_rate": 0.0001, + "loss": 8.2163, + "loss/crossentropy": 2.6500922441482544, + "loss/hidden": 2.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.3292814791202545, + "step": 572 + }, + { + "epoch": 0.0179375, + "grad_norm": 6.25, + "grad_norm_var": 0.6076456705729166, + "learning_rate": 0.0001, + "loss": 8.0095, + "loss/crossentropy": 2.4520283937454224, + "loss/hidden": 2.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.3377826511859894, + "step": 574 + }, + { + "epoch": 0.018, + "grad_norm": 5.96875, + "grad_norm_var": 0.6046712239583333, + "learning_rate": 0.0001, + "loss": 8.5973, + "loss/crossentropy": 2.8165611028671265, + "loss/hidden": 2.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.35541336238384247, + "step": 576 + }, + { + "epoch": 0.0180625, + "grad_norm": 5.71875, + "grad_norm_var": 0.6470052083333333, + "learning_rate": 0.0001, + "loss": 8.5303, + "loss/crossentropy": 2.7367520332336426, + "loss/hidden": 2.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.3551349341869354, + "step": 578 + }, + { + "epoch": 0.018125, + "grad_norm": 7.71875, + "grad_norm_var": 0.722900390625, + "learning_rate": 0.0001, + "loss": 8.5377, + "loss/crossentropy": 2.64441180229187, + "loss/hidden": 2.265625, + "loss/jsd": 0.0, + "loss/logits": 0.3627614676952362, + "step": 580 + }, + { + "epoch": 0.0181875, + "grad_norm": 6.375, + "grad_norm_var": 1.022119140625, + "learning_rate": 0.0001, + "loss": 8.7801, + "loss/crossentropy": 2.859044909477234, + "loss/hidden": 2.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.36319929361343384, + "step": 582 + }, + { + "epoch": 0.01825, + "grad_norm": 6.5625, + "grad_norm_var": 0.9454264322916667, + "learning_rate": 0.0001, + "loss": 8.5919, + "loss/crossentropy": 2.662962317466736, + "loss/hidden": 2.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.3624245524406433, + "step": 584 + }, + { + "epoch": 0.0183125, + "grad_norm": 7.59375, + "grad_norm_var": 1.02203369140625, + "learning_rate": 0.0001, + "loss": 8.4608, + "loss/crossentropy": 2.702337145805359, + "loss/hidden": 2.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.3500698506832123, + "step": 586 + }, + { + "epoch": 0.018375, + "grad_norm": 5.96875, + "grad_norm_var": 0.9475260416666667, + "learning_rate": 0.0001, + "loss": 8.4998, + "loss/crossentropy": 2.7028015851974487, + "loss/hidden": 2.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.3570452928543091, + "step": 588 + }, + { + "epoch": 0.0184375, + "grad_norm": 7.40625, + "grad_norm_var": 0.7027303059895833, + "learning_rate": 0.0001, + "loss": 8.7391, + "loss/crossentropy": 2.8359057903289795, + "loss/hidden": 2.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.36766134202480316, + "step": 590 + }, + { + "epoch": 0.0185, + "grad_norm": 6.03125, + "grad_norm_var": 0.7144816080729167, + "learning_rate": 0.0001, + "loss": 8.4026, + "loss/crossentropy": 2.7504743337631226, + "loss/hidden": 2.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.34099745750427246, + "step": 592 + }, + { + "epoch": 0.0185625, + "grad_norm": 7.21875, + "grad_norm_var": 0.63756103515625, + "learning_rate": 0.0001, + "loss": 8.8234, + "loss/crossentropy": 2.694290280342102, + "loss/hidden": 2.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.3777501881122589, + "step": 594 + }, + { + "epoch": 0.018625, + "grad_norm": 6.21875, + "grad_norm_var": 0.6261067708333333, + "learning_rate": 0.0001, + "loss": 8.3699, + "loss/crossentropy": 2.754095435142517, + "loss/hidden": 2.28125, + "loss/jsd": 0.0, + "loss/logits": 0.3334520310163498, + "step": 596 + }, + { + "epoch": 0.0186875, + "grad_norm": 5.9375, + "grad_norm_var": 0.3947224934895833, + "learning_rate": 0.0001, + "loss": 8.3719, + "loss/crossentropy": 2.728385329246521, + "loss/hidden": 2.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.3416992574930191, + "step": 598 + }, + { + "epoch": 0.01875, + "grad_norm": 6.0, + "grad_norm_var": 0.4166666666666667, + "learning_rate": 0.0001, + "loss": 8.6437, + "loss/crossentropy": 2.7940341234207153, + "loss/hidden": 2.21875, + "loss/jsd": 0.0, + "loss/logits": 0.36308759450912476, + "step": 600 + }, + { + "epoch": 0.0188125, + "grad_norm": 6.125, + "grad_norm_var": 0.23287760416666667, + "learning_rate": 0.0001, + "loss": 8.5931, + "loss/crossentropy": 2.888337016105652, + "loss/hidden": 2.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.3462534695863724, + "step": 602 + }, + { + "epoch": 0.018875, + "grad_norm": 7.21875, + "grad_norm_var": 0.30959879557291664, + "learning_rate": 0.0001, + "loss": 8.4772, + "loss/crossentropy": 2.8515857458114624, + "loss/hidden": 2.234375, + "loss/jsd": 0.0, + "loss/logits": 0.33912205696105957, + "step": 604 + }, + { + "epoch": 0.0189375, + "grad_norm": 6.5, + "grad_norm_var": 0.23274332682291668, + "learning_rate": 0.0001, + "loss": 8.2908, + "loss/crossentropy": 2.711584448814392, + "loss/hidden": 2.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.33683064579963684, + "step": 606 + }, + { + "epoch": 0.019, + "grad_norm": 6.3125, + "grad_norm_var": 0.23814697265625, + "learning_rate": 0.0001, + "loss": 8.6234, + "loss/crossentropy": 2.8399088382720947, + "loss/hidden": 2.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.35256920754909515, + "step": 608 + }, + { + "epoch": 0.0190625, + "grad_norm": 7.34375, + "grad_norm_var": 0.23062744140625, + "learning_rate": 0.0001, + "loss": 8.5065, + "loss/crossentropy": 2.7695130109786987, + "loss/hidden": 2.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.3463588356971741, + "step": 610 + }, + { + "epoch": 0.019125, + "grad_norm": 7.09375, + "grad_norm_var": 0.2593587239583333, + "learning_rate": 0.0001, + "loss": 8.7198, + "loss/crossentropy": 2.824345588684082, + "loss/hidden": 2.28125, + "loss/jsd": 0.0, + "loss/logits": 0.36141711473464966, + "step": 612 + }, + { + "epoch": 0.0191875, + "grad_norm": 5.5, + "grad_norm_var": 0.29983317057291664, + "learning_rate": 0.0001, + "loss": 8.6518, + "loss/crossentropy": 3.0193722248077393, + "loss/hidden": 2.28125, + "loss/jsd": 0.0, + "loss/logits": 0.3351168632507324, + "step": 614 + }, + { + "epoch": 0.01925, + "grad_norm": 6.875, + "grad_norm_var": 0.35605061848958336, + "learning_rate": 0.0001, + "loss": 8.4348, + "loss/crossentropy": 2.788522481918335, + "loss/hidden": 2.234375, + "loss/jsd": 0.0, + "loss/logits": 0.3411922752857208, + "step": 616 + }, + { + "epoch": 0.0193125, + "grad_norm": 6.75, + "grad_norm_var": 0.3453125, + "learning_rate": 0.0001, + "loss": 8.7769, + "loss/crossentropy": 2.954568028450012, + "loss/hidden": 2.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.353326752781868, + "step": 618 + }, + { + "epoch": 0.019375, + "grad_norm": 6.1875, + "grad_norm_var": 0.25028889973958335, + "learning_rate": 0.0001, + "loss": 8.2421, + "loss/crossentropy": 2.671786308288574, + "loss/hidden": 2.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.3343764841556549, + "step": 620 + }, + { + "epoch": 0.0194375, + "grad_norm": 6.15625, + "grad_norm_var": 0.27727864583333334, + "learning_rate": 0.0001, + "loss": 8.3061, + "loss/crossentropy": 2.692282795906067, + "loss/hidden": 2.171875, + "loss/jsd": 0.0, + "loss/logits": 0.34419384598731995, + "step": 622 + }, + { + "epoch": 0.0195, + "grad_norm": 6.875, + "grad_norm_var": 0.5027180989583333, + "learning_rate": 0.0001, + "loss": 7.9189, + "loss/crossentropy": 2.3546725511550903, + "loss/hidden": 2.265625, + "loss/jsd": 0.0, + "loss/logits": 0.32986071705818176, + "step": 624 + }, + { + "epoch": 0.0195625, + "grad_norm": 6.15625, + "grad_norm_var": 0.47144775390625, + "learning_rate": 0.0001, + "loss": 8.5574, + "loss/crossentropy": 2.7879234552383423, + "loss/hidden": 2.265625, + "loss/jsd": 0.0, + "loss/logits": 0.3503849357366562, + "step": 626 + }, + { + "epoch": 0.019625, + "grad_norm": 6.34375, + "grad_norm_var": 0.47261962890625, + "learning_rate": 0.0001, + "loss": 8.3498, + "loss/crossentropy": 2.6622077226638794, + "loss/hidden": 2.265625, + "loss/jsd": 0.0, + "loss/logits": 0.3421996682882309, + "step": 628 + }, + { + "epoch": 0.0196875, + "grad_norm": 6.625, + "grad_norm_var": 0.42604166666666665, + "learning_rate": 0.0001, + "loss": 8.5607, + "loss/crossentropy": 2.743830442428589, + "loss/hidden": 2.21875, + "loss/jsd": 0.0, + "loss/logits": 0.3598093241453171, + "step": 630 + }, + { + "epoch": 0.01975, + "grad_norm": 5.84375, + "grad_norm_var": 0.3856404622395833, + "learning_rate": 0.0001, + "loss": 8.3227, + "loss/crossentropy": 2.563997983932495, + "loss/hidden": 2.234375, + "loss/jsd": 0.0, + "loss/logits": 0.35243353247642517, + "step": 632 + }, + { + "epoch": 0.0198125, + "grad_norm": 5.46875, + "grad_norm_var": 0.4461588541666667, + "learning_rate": 0.0001, + "loss": 8.1279, + "loss/crossentropy": 2.6363645792007446, + "loss/hidden": 2.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.32962509989738464, + "step": 634 + }, + { + "epoch": 0.019875, + "grad_norm": 12.0, + "grad_norm_var": 3.256363932291667, + "learning_rate": 0.0001, + "loss": 9.5295, + "loss/crossentropy": 3.0127965211868286, + "loss/hidden": 2.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.4227666109800339, + "step": 636 + }, + { + "epoch": 0.0199375, + "grad_norm": 6.5, + "grad_norm_var": 3.189322916666667, + "learning_rate": 0.0001, + "loss": 7.9371, + "loss/crossentropy": 2.4866052865982056, + "loss/hidden": 2.21875, + "loss/jsd": 0.0, + "loss/logits": 0.32317017018795013, + "step": 638 + }, + { + "epoch": 0.02, + "grad_norm": 6.03125, + "grad_norm_var": 3.162369791666667, + "learning_rate": 0.0001, + "loss": 7.8481, + "loss/crossentropy": 2.403463363647461, + "loss/hidden": 2.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.3249284029006958, + "step": 640 + }, + { + "epoch": 0.0200625, + "grad_norm": 6.875, + "grad_norm_var": 3.123763020833333, + "learning_rate": 0.0001, + "loss": 9.3288, + "loss/crossentropy": 2.9746265411376953, + "loss/hidden": 2.296875, + "loss/jsd": 0.0, + "loss/logits": 0.40572839975357056, + "step": 642 + }, + { + "epoch": 0.020125, + "grad_norm": 7.625, + "grad_norm_var": 3.1727701822916665, + "learning_rate": 0.0001, + "loss": 8.5278, + "loss/crossentropy": 2.7414538860321045, + "loss/hidden": 2.21875, + "loss/jsd": 0.0, + "loss/logits": 0.35675469040870667, + "step": 644 + }, + { + "epoch": 0.0201875, + "grad_norm": 5.53125, + "grad_norm_var": 3.2625, + "learning_rate": 0.0001, + "loss": 8.8497, + "loss/crossentropy": 2.9117285013198853, + "loss/hidden": 2.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.3711456209421158, + "step": 646 + }, + { + "epoch": 0.02025, + "grad_norm": 8.0625, + "grad_norm_var": 3.316259765625, + "learning_rate": 0.0001, + "loss": 8.5108, + "loss/crossentropy": 2.7149282693862915, + "loss/hidden": 2.234375, + "loss/jsd": 0.0, + "loss/logits": 0.35615289211273193, + "step": 648 + }, + { + "epoch": 0.0203125, + "grad_norm": 7.34375, + "grad_norm_var": 3.121903483072917, + "learning_rate": 0.0001, + "loss": 8.6546, + "loss/crossentropy": 2.6386566162109375, + "loss/hidden": 2.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.37424588203430176, + "step": 650 + }, + { + "epoch": 0.020375, + "grad_norm": 5.46875, + "grad_norm_var": 0.7382120768229167, + "learning_rate": 0.0001, + "loss": 8.4152, + "loss/crossentropy": 2.574658155441284, + "loss/hidden": 2.3125, + "loss/jsd": 0.0, + "loss/logits": 0.3528069108724594, + "step": 652 + }, + { + "epoch": 0.0204375, + "grad_norm": 6.1875, + "grad_norm_var": 0.7291951497395833, + "learning_rate": 0.0001, + "loss": 8.6135, + "loss/crossentropy": 2.879882335662842, + "loss/hidden": 2.25, + "loss/jsd": 0.0, + "loss/logits": 0.3483603298664093, + "step": 654 + }, + { + "epoch": 0.0205, + "grad_norm": 6.5, + "grad_norm_var": 0.6912760416666667, + "learning_rate": 0.0001, + "loss": 8.6924, + "loss/crossentropy": 2.702815890312195, + "loss/hidden": 2.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.3700554370880127, + "step": 656 + }, + { + "epoch": 0.0205625, + "grad_norm": 5.90625, + "grad_norm_var": 0.755322265625, + "learning_rate": 0.0001, + "loss": 8.514, + "loss/crossentropy": 2.626859426498413, + "loss/hidden": 2.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.3629360496997833, + "step": 658 + }, + { + "epoch": 0.020625, + "grad_norm": 6.125, + "grad_norm_var": 0.7024739583333334, + "learning_rate": 0.0001, + "loss": 8.7634, + "loss/crossentropy": 2.8795636892318726, + "loss/hidden": 2.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.3657243996858597, + "step": 660 + }, + { + "epoch": 0.0206875, + "grad_norm": 7.15625, + "grad_norm_var": 0.5881144205729166, + "learning_rate": 0.0001, + "loss": 8.3306, + "loss/crossentropy": 2.7441413402557373, + "loss/hidden": 2.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.34223654866218567, + "step": 662 + }, + { + "epoch": 0.02075, + "grad_norm": 5.1875, + "grad_norm_var": 0.5191691080729167, + "learning_rate": 0.0001, + "loss": 7.7663, + "loss/crossentropy": 2.452701687812805, + "loss/hidden": 2.1875, + "loss/jsd": 0.0, + "loss/logits": 0.31261467933654785, + "step": 664 + }, + { + "epoch": 0.0208125, + "grad_norm": 5.75, + "grad_norm_var": 0.38619384765625, + "learning_rate": 0.0001, + "loss": 8.093, + "loss/crossentropy": 2.546747088432312, + "loss/hidden": 2.25, + "loss/jsd": 0.0, + "loss/logits": 0.32962463796138763, + "step": 666 + }, + { + "epoch": 0.020875, + "grad_norm": 5.96875, + "grad_norm_var": 0.306640625, + "learning_rate": 0.0001, + "loss": 8.5068, + "loss/crossentropy": 2.8044780492782593, + "loss/hidden": 2.1875, + "loss/jsd": 0.0, + "loss/logits": 0.351484552025795, + "step": 668 + }, + { + "epoch": 0.0209375, + "grad_norm": 6.78125, + "grad_norm_var": 0.29547119140625, + "learning_rate": 0.0001, + "loss": 8.7689, + "loss/crossentropy": 3.0153743028640747, + "loss/hidden": 2.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.34957103431224823, + "step": 670 + }, + { + "epoch": 0.021, + "grad_norm": 5.75, + "grad_norm_var": 0.29178059895833336, + "learning_rate": 0.0001, + "loss": 8.4564, + "loss/crossentropy": 2.712641477584839, + "loss/hidden": 2.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.3564087748527527, + "step": 672 + }, + { + "epoch": 0.0210625, + "grad_norm": 5.90625, + "grad_norm_var": 0.22476806640625, + "learning_rate": 0.0001, + "loss": 8.5781, + "loss/crossentropy": 2.984122633934021, + "loss/hidden": 2.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.3383050411939621, + "step": 674 + }, + { + "epoch": 0.021125, + "grad_norm": 5.84375, + "grad_norm_var": 0.24133707682291666, + "learning_rate": 0.0001, + "loss": 8.5681, + "loss/crossentropy": 2.6959644556045532, + "loss/hidden": 2.21875, + "loss/jsd": 0.0, + "loss/logits": 0.3653368502855301, + "step": 676 + }, + { + "epoch": 0.0211875, + "grad_norm": 6.75, + "grad_norm_var": 0.21951497395833333, + "learning_rate": 0.0001, + "loss": 8.3572, + "loss/crossentropy": 2.661711096763611, + "loss/hidden": 2.28125, + "loss/jsd": 0.0, + "loss/logits": 0.34142591059207916, + "step": 678 + }, + { + "epoch": 0.02125, + "grad_norm": 6.96875, + "grad_norm_var": 0.23346354166666666, + "learning_rate": 0.0001, + "loss": 8.1775, + "loss/crossentropy": 2.5807024240493774, + "loss/hidden": 2.25, + "loss/jsd": 0.0, + "loss/logits": 0.3346793055534363, + "step": 680 + }, + { + "epoch": 0.0213125, + "grad_norm": 6.6875, + "grad_norm_var": 0.34814046223958334, + "learning_rate": 0.0001, + "loss": 9.2846, + "loss/crossentropy": 3.140147089958191, + "loss/hidden": 2.265625, + "loss/jsd": 0.0, + "loss/logits": 0.38788214325904846, + "step": 682 + }, + { + "epoch": 0.021375, + "grad_norm": 6.6875, + "grad_norm_var": 0.3236328125, + "learning_rate": 0.0001, + "loss": 8.1472, + "loss/crossentropy": 2.5489360094070435, + "loss/hidden": 2.25, + "loss/jsd": 0.0, + "loss/logits": 0.33482369780540466, + "step": 684 + }, + { + "epoch": 0.0214375, + "grad_norm": 5.9375, + "grad_norm_var": 0.33414306640625, + "learning_rate": 0.0001, + "loss": 8.3896, + "loss/crossentropy": 2.6653178930282593, + "loss/hidden": 2.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.34508879482746124, + "step": 686 + }, + { + "epoch": 0.0215, + "grad_norm": 6.875, + "grad_norm_var": 0.31392822265625, + "learning_rate": 0.0001, + "loss": 8.701, + "loss/crossentropy": 2.9853092432022095, + "loss/hidden": 2.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.35047847032546997, + "step": 688 + }, + { + "epoch": 0.0215625, + "grad_norm": 5.59375, + "grad_norm_var": 0.32591145833333335, + "learning_rate": 0.0001, + "loss": 8.1641, + "loss/crossentropy": 2.6723328828811646, + "loss/hidden": 2.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.3280813992023468, + "step": 690 + }, + { + "epoch": 0.021625, + "grad_norm": 6.46875, + "grad_norm_var": 0.305078125, + "learning_rate": 0.0001, + "loss": 8.4814, + "loss/crossentropy": 2.7639589309692383, + "loss/hidden": 2.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.34440168738365173, + "step": 692 + }, + { + "epoch": 0.0216875, + "grad_norm": 7.40625, + "grad_norm_var": 0.35523681640625, + "learning_rate": 0.0001, + "loss": 8.0945, + "loss/crossentropy": 2.505578875541687, + "loss/hidden": 2.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.3331110030412674, + "step": 694 + }, + { + "epoch": 0.02175, + "grad_norm": 5.6875, + "grad_norm_var": 0.3683553059895833, + "learning_rate": 0.0001, + "loss": 8.3149, + "loss/crossentropy": 2.799286961555481, + "loss/hidden": 2.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.336715966463089, + "step": 696 + }, + { + "epoch": 0.0218125, + "grad_norm": 6.3125, + "grad_norm_var": 0.2619099934895833, + "learning_rate": 0.0001, + "loss": 8.1849, + "loss/crossentropy": 2.5903667211532593, + "loss/hidden": 2.203125, + "loss/jsd": 0.0, + "loss/logits": 0.3391364514827728, + "step": 698 + }, + { + "epoch": 0.021875, + "grad_norm": 6.125, + "grad_norm_var": 0.2579427083333333, + "learning_rate": 0.0001, + "loss": 8.5062, + "loss/crossentropy": 2.7039811611175537, + "loss/hidden": 2.28125, + "loss/jsd": 0.0, + "loss/logits": 0.3520972728729248, + "step": 700 + }, + { + "epoch": 0.0219375, + "grad_norm": 10.375, + "grad_norm_var": 1.3748982747395833, + "learning_rate": 0.0001, + "loss": 8.151, + "loss/crossentropy": 2.5421234369277954, + "loss/hidden": 2.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.3444819301366806, + "step": 702 + }, + { + "epoch": 0.022, + "grad_norm": 5.40625, + "grad_norm_var": 1.465625, + "learning_rate": 0.0001, + "loss": 7.8739, + "loss/crossentropy": 2.482094407081604, + "loss/hidden": 2.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.32745902240276337, + "step": 704 + }, + { + "epoch": 0.0220625, + "grad_norm": 7.21875, + "grad_norm_var": 1.5061848958333333, + "learning_rate": 0.0001, + "loss": 8.6827, + "loss/crossentropy": 2.8615206480026245, + "loss/hidden": 2.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.3610275834798813, + "step": 706 + }, + { + "epoch": 0.022125, + "grad_norm": 5.875, + "grad_norm_var": 1.5166951497395833, + "learning_rate": 0.0001, + "loss": 8.1118, + "loss/crossentropy": 2.5979604721069336, + "loss/hidden": 2.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.339664563536644, + "step": 708 + }, + { + "epoch": 0.0221875, + "grad_norm": 7.125, + "grad_norm_var": 1.48492431640625, + "learning_rate": 0.0001, + "loss": 8.3526, + "loss/crossentropy": 2.7990275621414185, + "loss/hidden": 2.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.3373870253562927, + "step": 710 + }, + { + "epoch": 0.02225, + "grad_norm": 6.0, + "grad_norm_var": 1.4725545247395833, + "learning_rate": 0.0001, + "loss": 8.3694, + "loss/crossentropy": 2.773742437362671, + "loss/hidden": 2.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.3400302082300186, + "step": 712 + }, + { + "epoch": 0.0223125, + "grad_norm": 5.75, + "grad_norm_var": 1.4877237955729166, + "learning_rate": 0.0001, + "loss": 8.5224, + "loss/crossentropy": 2.9048824310302734, + "loss/hidden": 2.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.3406597226858139, + "step": 714 + }, + { + "epoch": 0.022375, + "grad_norm": 6.21875, + "grad_norm_var": 1.4764933268229166, + "learning_rate": 0.0001, + "loss": 8.2475, + "loss/crossentropy": 2.597532033920288, + "loss/hidden": 2.21875, + "loss/jsd": 0.0, + "loss/logits": 0.3431176245212555, + "step": 716 + }, + { + "epoch": 0.0224375, + "grad_norm": 5.6875, + "grad_norm_var": 0.3088175455729167, + "learning_rate": 0.0001, + "loss": 7.8735, + "loss/crossentropy": 2.5067347288131714, + "loss/hidden": 2.15625, + "loss/jsd": 0.0, + "loss/logits": 0.3210519254207611, + "step": 718 + }, + { + "epoch": 0.0225, + "grad_norm": 6.5, + "grad_norm_var": 0.29811197916666665, + "learning_rate": 0.0001, + "loss": 8.3045, + "loss/crossentropy": 2.7840747833251953, + "loss/hidden": 2.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.3340781629085541, + "step": 720 + }, + { + "epoch": 0.0225625, + "grad_norm": 6.9375, + "grad_norm_var": 0.461962890625, + "learning_rate": 0.0001, + "loss": 8.5005, + "loss/crossentropy": 2.7896593809127808, + "loss/hidden": 2.265625, + "loss/jsd": 0.0, + "loss/logits": 0.34451836347579956, + "step": 722 + }, + { + "epoch": 0.022625, + "grad_norm": 7.53125, + "grad_norm_var": 0.5669108072916667, + "learning_rate": 0.0001, + "loss": 8.2398, + "loss/crossentropy": 2.7475579977035522, + "loss/hidden": 2.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.3343813419342041, + "step": 724 + }, + { + "epoch": 0.0226875, + "grad_norm": 6.75, + "grad_norm_var": 0.4945271809895833, + "learning_rate": 0.0001, + "loss": 8.5406, + "loss/crossentropy": 2.7411571741104126, + "loss/hidden": 2.21875, + "loss/jsd": 0.0, + "loss/logits": 0.35806816816329956, + "step": 726 + }, + { + "epoch": 0.02275, + "grad_norm": 6.40625, + "grad_norm_var": 0.47711181640625, + "learning_rate": 0.0001, + "loss": 8.5912, + "loss/crossentropy": 2.766042709350586, + "loss/hidden": 2.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.36298026144504547, + "step": 728 + }, + { + "epoch": 0.0228125, + "grad_norm": 6.34375, + "grad_norm_var": 0.4973958333333333, + "learning_rate": 0.0001, + "loss": 7.9852, + "loss/crossentropy": 2.5189281702041626, + "loss/hidden": 2.140625, + "loss/jsd": 0.0, + "loss/logits": 0.33256661891937256, + "step": 730 + }, + { + "epoch": 0.022875, + "grad_norm": 9.9375, + "grad_norm_var": 1.2870402018229166, + "learning_rate": 0.0001, + "loss": 8.3075, + "loss/crossentropy": 2.7226722240448, + "loss/hidden": 2.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.33426010608673096, + "step": 732 + }, + { + "epoch": 0.0229375, + "grad_norm": 5.625, + "grad_norm_var": 1.3407389322916667, + "learning_rate": 0.0001, + "loss": 8.4679, + "loss/crossentropy": 2.7558926343917847, + "loss/hidden": 2.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.3547917455434799, + "step": 734 + }, + { + "epoch": 0.023, + "grad_norm": 6.34375, + "grad_norm_var": 1.37720947265625, + "learning_rate": 0.0001, + "loss": 8.9548, + "loss/crossentropy": 2.8543918132781982, + "loss/hidden": 2.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.3858264982700348, + "step": 736 + }, + { + "epoch": 0.0230625, + "grad_norm": 6.125, + "grad_norm_var": 1.3258951822916667, + "learning_rate": 0.0001, + "loss": 8.4039, + "loss/crossentropy": 2.701572895050049, + "loss/hidden": 2.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.3491367846727371, + "step": 738 + }, + { + "epoch": 0.023125, + "grad_norm": 5.46875, + "grad_norm_var": 1.3380818684895834, + "learning_rate": 0.0001, + "loss": 8.2909, + "loss/crossentropy": 2.7676188945770264, + "loss/hidden": 2.15625, + "loss/jsd": 0.0, + "loss/logits": 0.3367016613483429, + "step": 740 + }, + { + "epoch": 0.0231875, + "grad_norm": 6.25, + "grad_norm_var": 1.4073527018229166, + "learning_rate": 0.0001, + "loss": 7.9941, + "loss/crossentropy": 2.589860200881958, + "loss/hidden": 2.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.32557548582553864, + "step": 742 + }, + { + "epoch": 0.02325, + "grad_norm": 5.59375, + "grad_norm_var": 1.5048787434895834, + "learning_rate": 0.0001, + "loss": 8.1793, + "loss/crossentropy": 2.689063787460327, + "loss/hidden": 2.125, + "loss/jsd": 0.0, + "loss/logits": 0.3365195244550705, + "step": 744 + }, + { + "epoch": 0.0233125, + "grad_norm": 6.5, + "grad_norm_var": 1.434619140625, + "learning_rate": 0.0001, + "loss": 8.2082, + "loss/crossentropy": 2.6279042959213257, + "loss/hidden": 2.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.3338083028793335, + "step": 746 + }, + { + "epoch": 0.023375, + "grad_norm": 5.375, + "grad_norm_var": 0.6980305989583333, + "learning_rate": 0.0001, + "loss": 8.2409, + "loss/crossentropy": 2.7593994140625, + "loss/hidden": 2.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.33018162846565247, + "step": 748 + }, + { + "epoch": 0.0234375, + "grad_norm": 7.34375, + "grad_norm_var": 0.6168253580729167, + "learning_rate": 0.0001, + "loss": 8.6918, + "loss/crossentropy": 2.853471040725708, + "loss/hidden": 2.21875, + "loss/jsd": 0.0, + "loss/logits": 0.361956387758255, + "step": 750 + }, + { + "epoch": 0.0235, + "grad_norm": 5.21875, + "grad_norm_var": 0.31187744140625, + "learning_rate": 0.0001, + "loss": 7.8511, + "loss/crossentropy": 2.6204041242599487, + "loss/hidden": 2.109375, + "loss/jsd": 0.0, + "loss/logits": 0.3121359795331955, + "step": 752 + }, + { + "epoch": 0.0235625, + "grad_norm": 6.03125, + "grad_norm_var": 0.308447265625, + "learning_rate": 0.0001, + "loss": 8.0742, + "loss/crossentropy": 2.605458617210388, + "loss/hidden": 2.1875, + "loss/jsd": 0.0, + "loss/logits": 0.3281271606683731, + "step": 754 + }, + { + "epoch": 0.023625, + "grad_norm": 6.46875, + "grad_norm_var": 0.30725504557291666, + "learning_rate": 0.0001, + "loss": 8.6584, + "loss/crossentropy": 2.7429229021072388, + "loss/hidden": 2.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.37045322358608246, + "step": 756 + }, + { + "epoch": 0.0236875, + "grad_norm": 6.0625, + "grad_norm_var": 0.29225260416666665, + "learning_rate": 0.0001, + "loss": 8.1709, + "loss/crossentropy": 2.6542168855667114, + "loss/hidden": 2.125, + "loss/jsd": 0.0, + "loss/logits": 0.33917176723480225, + "step": 758 + }, + { + "epoch": 0.02375, + "grad_norm": 5.71875, + "grad_norm_var": 0.287890625, + "learning_rate": 0.0001, + "loss": 8.4722, + "loss/crossentropy": 2.7812271118164062, + "loss/hidden": 2.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.349564865231514, + "step": 760 + }, + { + "epoch": 0.0238125, + "grad_norm": 5.96875, + "grad_norm_var": 0.27245686848958334, + "learning_rate": 0.0001, + "loss": 8.1857, + "loss/crossentropy": 2.6698429584503174, + "loss/hidden": 2.15625, + "loss/jsd": 0.0, + "loss/logits": 0.33596329391002655, + "step": 762 + }, + { + "epoch": 0.023875, + "grad_norm": 6.25, + "grad_norm_var": 0.264306640625, + "learning_rate": 0.0001, + "loss": 8.2672, + "loss/crossentropy": 2.8197702169418335, + "loss/hidden": 2.140625, + "loss/jsd": 0.0, + "loss/logits": 0.3306830823421478, + "step": 764 + }, + { + "epoch": 0.0239375, + "grad_norm": 6.53125, + "grad_norm_var": 0.14451497395833332, + "learning_rate": 0.0001, + "loss": 8.1106, + "loss/crossentropy": 2.594325065612793, + "loss/hidden": 2.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.3320969194173813, + "step": 766 + }, + { + "epoch": 0.024, + "grad_norm": 6.8125, + "grad_norm_var": 0.18153889973958334, + "learning_rate": 0.0001, + "loss": 8.8065, + "loss/crossentropy": 2.9510093927383423, + "loss/hidden": 2.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.36289557814598083, + "step": 768 + }, + { + "epoch": 0.0240625, + "grad_norm": 5.5625, + "grad_norm_var": 0.15584309895833334, + "learning_rate": 0.0001, + "loss": 8.0488, + "loss/crossentropy": 2.677095890045166, + "loss/hidden": 2.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.3270166665315628, + "step": 770 + }, + { + "epoch": 0.024125, + "grad_norm": 5.4375, + "grad_norm_var": 0.16698811848958334, + "learning_rate": 0.0001, + "loss": 8.2652, + "loss/crossentropy": 2.7432464361190796, + "loss/hidden": 2.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.3373481184244156, + "step": 772 + }, + { + "epoch": 0.0241875, + "grad_norm": 5.71875, + "grad_norm_var": 0.20480143229166667, + "learning_rate": 0.0001, + "loss": 8.045, + "loss/crossentropy": 2.7736769914627075, + "loss/hidden": 2.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.31229038536548615, + "step": 774 + }, + { + "epoch": 0.02425, + "grad_norm": 5.53125, + "grad_norm_var": 0.21330973307291667, + "learning_rate": 0.0001, + "loss": 7.8055, + "loss/crossentropy": 2.4817134141921997, + "loss/hidden": 2.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.3206585496664047, + "step": 776 + }, + { + "epoch": 0.0243125, + "grad_norm": 5.125, + "grad_norm_var": 0.25689697265625, + "learning_rate": 0.0001, + "loss": 7.8784, + "loss/crossentropy": 2.6201757192611694, + "loss/hidden": 2.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.3156616538763046, + "step": 778 + }, + { + "epoch": 0.024375, + "grad_norm": 5.3125, + "grad_norm_var": 0.2800089518229167, + "learning_rate": 0.0001, + "loss": 8.0161, + "loss/crossentropy": 2.720884919166565, + "loss/hidden": 2.09375, + "loss/jsd": 0.0, + "loss/logits": 0.320142462849617, + "step": 780 + }, + { + "epoch": 0.0244375, + "grad_norm": 7.125, + "grad_norm_var": 0.38752848307291665, + "learning_rate": 0.0001, + "loss": 8.5161, + "loss/crossentropy": 2.83653724193573, + "loss/hidden": 2.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.34842240810394287, + "step": 782 + }, + { + "epoch": 0.0245, + "grad_norm": 8.1875, + "grad_norm_var": 0.666650390625, + "learning_rate": 0.0001, + "loss": 8.2143, + "loss/crossentropy": 2.7162340879440308, + "loss/hidden": 2.171875, + "loss/jsd": 0.0, + "loss/logits": 0.3326167166233063, + "step": 784 + }, + { + "epoch": 0.0245625, + "grad_norm": 6.03125, + "grad_norm_var": 0.657275390625, + "learning_rate": 0.0001, + "loss": 8.0563, + "loss/crossentropy": 2.6525591611862183, + "loss/hidden": 2.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.32709620893001556, + "step": 786 + }, + { + "epoch": 0.024625, + "grad_norm": 5.71875, + "grad_norm_var": 0.6537109375, + "learning_rate": 0.0001, + "loss": 8.2529, + "loss/crossentropy": 2.7945055961608887, + "loss/hidden": 2.125, + "loss/jsd": 0.0, + "loss/logits": 0.3333374410867691, + "step": 788 + }, + { + "epoch": 0.0246875, + "grad_norm": 6.34375, + "grad_norm_var": 0.6299112955729167, + "learning_rate": 0.0001, + "loss": 8.3067, + "loss/crossentropy": 2.855311632156372, + "loss/hidden": 2.15625, + "loss/jsd": 0.0, + "loss/logits": 0.3295145481824875, + "step": 790 + }, + { + "epoch": 0.02475, + "grad_norm": 6.65625, + "grad_norm_var": 0.6583292643229167, + "learning_rate": 0.0001, + "loss": 8.5236, + "loss/crossentropy": 2.885672926902771, + "loss/hidden": 2.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.34895357489585876, + "step": 792 + }, + { + "epoch": 0.0248125, + "grad_norm": 6.125, + "grad_norm_var": 0.58814697265625, + "learning_rate": 0.0001, + "loss": 8.077, + "loss/crossentropy": 2.6696921586990356, + "loss/hidden": 2.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.327450156211853, + "step": 794 + }, + { + "epoch": 0.024875, + "grad_norm": 5.3125, + "grad_norm_var": 0.56900634765625, + "learning_rate": 0.0001, + "loss": 8.3296, + "loss/crossentropy": 2.806466221809387, + "loss/hidden": 2.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.34059543907642365, + "step": 796 + }, + { + "epoch": 0.0249375, + "grad_norm": 5.6875, + "grad_norm_var": 1.035400390625, + "learning_rate": 0.0001, + "loss": 8.0681, + "loss/crossentropy": 2.5905505418777466, + "loss/hidden": 2.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.33134762942790985, + "step": 798 + }, + { + "epoch": 0.025, + "grad_norm": 6.28125, + "grad_norm_var": 0.76422119140625, + "learning_rate": 0.0001, + "loss": 8.2224, + "loss/crossentropy": 2.6438721418380737, + "loss/hidden": 2.1875, + "loss/jsd": 0.0, + "loss/logits": 0.339097797870636, + "step": 800 + }, + { + "epoch": 0.0250625, + "grad_norm": 5.6875, + "grad_norm_var": 0.779931640625, + "learning_rate": 0.0001, + "loss": 7.9234, + "loss/crossentropy": 2.5964959859848022, + "loss/hidden": 2.140625, + "loss/jsd": 0.0, + "loss/logits": 0.31862902641296387, + "step": 802 + }, + { + "epoch": 0.025125, + "grad_norm": 6.21875, + "grad_norm_var": 0.7956013997395833, + "learning_rate": 0.0001, + "loss": 8.1019, + "loss/crossentropy": 2.6731587648391724, + "loss/hidden": 2.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.32959243655204773, + "step": 804 + }, + { + "epoch": 0.0251875, + "grad_norm": 7.21875, + "grad_norm_var": 0.8546223958333333, + "learning_rate": 0.0001, + "loss": 8.605, + "loss/crossentropy": 2.872212290763855, + "loss/hidden": 2.2109375, + "loss/jsd": 0.0, + "loss/logits": 0.352187842130661, + "step": 806 + }, + { + "epoch": 0.02525, + "grad_norm": 6.375, + "grad_norm_var": 0.8997395833333334, + "learning_rate": 0.0001, + "loss": 8.2991, + "loss/crossentropy": 2.5583417415618896, + "loss/hidden": 2.203125, + "loss/jsd": 0.0, + "loss/logits": 0.35376642644405365, + "step": 808 + }, + { + "epoch": 0.0253125, + "grad_norm": 6.25, + "grad_norm_var": 0.8658854166666666, + "learning_rate": 0.0001, + "loss": 8.3245, + "loss/crossentropy": 2.7695552110671997, + "loss/hidden": 2.171875, + "loss/jsd": 0.0, + "loss/logits": 0.3383059650659561, + "step": 810 + }, + { + "epoch": 0.025375, + "grad_norm": 6.28125, + "grad_norm_var": 0.7844889322916667, + "learning_rate": 0.0001, + "loss": 8.36, + "loss/crossentropy": 2.7573885917663574, + "loss/hidden": 2.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.3438550680875778, + "step": 812 + }, + { + "epoch": 0.0254375, + "grad_norm": 5.46875, + "grad_norm_var": 0.2918904622395833, + "learning_rate": 0.0001, + "loss": 7.7418, + "loss/crossentropy": 2.553446650505066, + "loss/hidden": 2.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.30242519080638885, + "step": 814 + }, + { + "epoch": 0.0255, + "grad_norm": 13.8125, + "grad_norm_var": 3.9885050455729165, + "learning_rate": 0.0001, + "loss": 8.9974, + "loss/crossentropy": 2.893692135810852, + "loss/hidden": 2.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.3845931142568588, + "step": 816 + }, + { + "epoch": 0.0255625, + "grad_norm": 6.40625, + "grad_norm_var": 3.9234212239583335, + "learning_rate": 0.0001, + "loss": 8.7496, + "loss/crossentropy": 2.876673936843872, + "loss/hidden": 2.21875, + "loss/jsd": 0.0, + "loss/logits": 0.36541396379470825, + "step": 818 + }, + { + "epoch": 0.025625, + "grad_norm": 7.28125, + "grad_norm_var": 3.7567545572916665, + "learning_rate": 0.0001, + "loss": 8.6539, + "loss/crossentropy": 2.7950828075408936, + "loss/hidden": 2.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.36635273694992065, + "step": 820 + }, + { + "epoch": 0.0256875, + "grad_norm": 5.40625, + "grad_norm_var": 3.863541666666667, + "learning_rate": 0.0001, + "loss": 8.036, + "loss/crossentropy": 2.569133162498474, + "loss/hidden": 2.140625, + "loss/jsd": 0.0, + "loss/logits": 0.3326212763786316, + "step": 822 + }, + { + "epoch": 0.02575, + "grad_norm": 5.75, + "grad_norm_var": 4.04976806640625, + "learning_rate": 0.0001, + "loss": 8.4846, + "loss/crossentropy": 2.9390757083892822, + "loss/hidden": 2.21875, + "loss/jsd": 0.0, + "loss/logits": 0.33267849683761597, + "step": 824 + }, + { + "epoch": 0.0258125, + "grad_norm": 6.03125, + "grad_norm_var": 4.074898274739583, + "learning_rate": 0.0001, + "loss": 8.1337, + "loss/crossentropy": 2.7280073165893555, + "loss/hidden": 2.15625, + "loss/jsd": 0.0, + "loss/logits": 0.32494574785232544, + "step": 826 + }, + { + "epoch": 0.025875, + "grad_norm": 6.3125, + "grad_norm_var": 4.099833170572917, + "learning_rate": 0.0001, + "loss": 7.9933, + "loss/crossentropy": 2.6511049270629883, + "loss/hidden": 2.1875, + "loss/jsd": 0.0, + "loss/logits": 0.31547415256500244, + "step": 828 + }, + { + "epoch": 0.0259375, + "grad_norm": 5.5625, + "grad_norm_var": 4.104410807291667, + "learning_rate": 0.0001, + "loss": 8.1625, + "loss/crossentropy": 2.761909246444702, + "loss/hidden": 2.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.3345913887023926, + "step": 830 + }, + { + "epoch": 0.026, + "grad_norm": 5.5625, + "grad_norm_var": 0.4061197916666667, + "learning_rate": 0.0001, + "loss": 7.8933, + "loss/crossentropy": 2.57563316822052, + "loss/hidden": 2.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.3200434446334839, + "step": 832 + }, + { + "epoch": 0.0260625, + "grad_norm": 5.28125, + "grad_norm_var": 0.3856404622395833, + "learning_rate": 0.0001, + "loss": 7.7928, + "loss/crossentropy": 2.5671942234039307, + "loss/hidden": 2.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.3155245780944824, + "step": 834 + }, + { + "epoch": 0.026125, + "grad_norm": 5.1875, + "grad_norm_var": 0.23222249348958332, + "learning_rate": 0.0001, + "loss": 7.8599, + "loss/crossentropy": 2.665991187095642, + "loss/hidden": 2.078125, + "loss/jsd": 0.0, + "loss/logits": 0.3115830421447754, + "step": 836 + }, + { + "epoch": 0.0261875, + "grad_norm": 5.40625, + "grad_norm_var": 0.11620686848958334, + "learning_rate": 0.0001, + "loss": 8.0253, + "loss/crossentropy": 2.748578667640686, + "loss/hidden": 2.109375, + "loss/jsd": 0.0, + "loss/logits": 0.3167339563369751, + "step": 838 + }, + { + "epoch": 0.02625, + "grad_norm": 5.34375, + "grad_norm_var": 0.126416015625, + "learning_rate": 0.0001, + "loss": 7.9042, + "loss/crossentropy": 2.693682074546814, + "loss/hidden": 2.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.31089678406715393, + "step": 840 + }, + { + "epoch": 0.0263125, + "grad_norm": 6.34375, + "grad_norm_var": 0.15662434895833333, + "learning_rate": 0.0001, + "loss": 8.5614, + "loss/crossentropy": 2.8782442808151245, + "loss/hidden": 2.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.35191184282302856, + "step": 842 + }, + { + "epoch": 0.026375, + "grad_norm": 6.46875, + "grad_norm_var": 0.17574462890625, + "learning_rate": 0.0001, + "loss": 8.2453, + "loss/crossentropy": 2.783571243286133, + "loss/hidden": 2.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.3313324302434921, + "step": 844 + }, + { + "epoch": 0.0264375, + "grad_norm": 5.6875, + "grad_norm_var": 0.16897379557291667, + "learning_rate": 0.0001, + "loss": 7.8925, + "loss/crossentropy": 2.6488534212112427, + "loss/hidden": 2.078125, + "loss/jsd": 0.0, + "loss/logits": 0.3165491074323654, + "step": 846 + }, + { + "epoch": 0.0265, + "grad_norm": 5.71875, + "grad_norm_var": 0.15937093098958333, + "learning_rate": 0.0001, + "loss": 8.2695, + "loss/crossentropy": 2.808097720146179, + "loss/hidden": 2.1484375, + "loss/jsd": 0.0, + "loss/logits": 0.33130063116550446, + "step": 848 + }, + { + "epoch": 0.0265625, + "grad_norm": 5.40625, + "grad_norm_var": 0.14685872395833333, + "learning_rate": 0.0001, + "loss": 8.1955, + "loss/crossentropy": 2.867979645729065, + "loss/hidden": 2.109375, + "loss/jsd": 0.0, + "loss/logits": 0.3218100666999817, + "step": 850 + }, + { + "epoch": 0.026625, + "grad_norm": 5.6875, + "grad_norm_var": 0.134619140625, + "learning_rate": 0.0001, + "loss": 8.4039, + "loss/crossentropy": 2.8698208332061768, + "loss/hidden": 2.125, + "loss/jsd": 0.0, + "loss/logits": 0.3409119248390198, + "step": 852 + }, + { + "epoch": 0.0266875, + "grad_norm": 6.53125, + "grad_norm_var": 0.18111979166666667, + "learning_rate": 0.0001, + "loss": 8.0889, + "loss/crossentropy": 2.7214276790618896, + "loss/hidden": 2.125, + "loss/jsd": 0.0, + "loss/logits": 0.32424378395080566, + "step": 854 + }, + { + "epoch": 0.02675, + "grad_norm": 6.125, + "grad_norm_var": 0.17069905598958332, + "learning_rate": 0.0001, + "loss": 8.3495, + "loss/crossentropy": 2.6523544788360596, + "loss/hidden": 2.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.34237538278102875, + "step": 856 + }, + { + "epoch": 0.0268125, + "grad_norm": 6.25, + "grad_norm_var": 0.18821614583333332, + "learning_rate": 0.0001, + "loss": 8.1074, + "loss/crossentropy": 2.735729455947876, + "loss/hidden": 2.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.3285723030567169, + "step": 858 + }, + { + "epoch": 0.026875, + "grad_norm": 5.03125, + "grad_norm_var": 0.17589518229166667, + "learning_rate": 0.0001, + "loss": 7.7916, + "loss/crossentropy": 2.623154044151306, + "loss/hidden": 2.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.3113795071840286, + "step": 860 + }, + { + "epoch": 0.0269375, + "grad_norm": 5.65625, + "grad_norm_var": 0.175244140625, + "learning_rate": 0.0001, + "loss": 8.0075, + "loss/crossentropy": 2.6751527786254883, + "loss/hidden": 2.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.3199501931667328, + "step": 862 + }, + { + "epoch": 0.027, + "grad_norm": 6.125, + "grad_norm_var": 0.16978759765625, + "learning_rate": 0.0001, + "loss": 8.3152, + "loss/crossentropy": 2.830706477165222, + "loss/hidden": 2.171875, + "loss/jsd": 0.0, + "loss/logits": 0.331265926361084, + "step": 864 + }, + { + "epoch": 0.0270625, + "grad_norm": 5.0625, + "grad_norm_var": 0.21604410807291666, + "learning_rate": 0.0001, + "loss": 7.6622, + "loss/crossentropy": 2.5602376461029053, + "loss/hidden": 2.03125, + "loss/jsd": 0.0, + "loss/logits": 0.3070688992738724, + "step": 866 + }, + { + "epoch": 0.027125, + "grad_norm": 5.21875, + "grad_norm_var": 0.23385416666666667, + "learning_rate": 0.0001, + "loss": 7.5858, + "loss/crossentropy": 2.4632397890090942, + "loss/hidden": 2.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.30210280418395996, + "step": 868 + }, + { + "epoch": 0.0271875, + "grad_norm": 5.40625, + "grad_norm_var": 0.19134114583333334, + "learning_rate": 0.0001, + "loss": 7.9338, + "loss/crossentropy": 2.7243661880493164, + "loss/hidden": 2.078125, + "loss/jsd": 0.0, + "loss/logits": 0.3131289482116699, + "step": 870 + }, + { + "epoch": 0.02725, + "grad_norm": 6.46875, + "grad_norm_var": 0.19524332682291667, + "learning_rate": 0.0001, + "loss": 8.6795, + "loss/crossentropy": 3.0478591918945312, + "loss/hidden": 2.140625, + "loss/jsd": 0.0, + "loss/logits": 0.3490995764732361, + "step": 872 + }, + { + "epoch": 0.0273125, + "grad_norm": 5.78125, + "grad_norm_var": 0.16415608723958333, + "learning_rate": 0.0001, + "loss": 8.155, + "loss/crossentropy": 2.7137235403060913, + "loss/hidden": 2.09375, + "loss/jsd": 0.0, + "loss/logits": 0.3347555547952652, + "step": 874 + }, + { + "epoch": 0.027375, + "grad_norm": 5.09375, + "grad_norm_var": 0.1630859375, + "learning_rate": 0.0001, + "loss": 7.9233, + "loss/crossentropy": 2.6758487224578857, + "loss/hidden": 2.078125, + "loss/jsd": 0.0, + "loss/logits": 0.31693698465824127, + "step": 876 + }, + { + "epoch": 0.0274375, + "grad_norm": 6.1875, + "grad_norm_var": 0.17623697916666667, + "learning_rate": 0.0001, + "loss": 7.9173, + "loss/crossentropy": 2.528697967529297, + "loss/hidden": 2.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.3302699476480484, + "step": 878 + }, + { + "epoch": 0.0275, + "grad_norm": 5.875, + "grad_norm_var": 0.28787434895833336, + "learning_rate": 0.0001, + "loss": 8.5773, + "loss/crossentropy": 2.8210952281951904, + "loss/hidden": 2.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.35140474140644073, + "step": 880 + }, + { + "epoch": 0.0275625, + "grad_norm": 5.40625, + "grad_norm_var": 0.2482421875, + "learning_rate": 0.0001, + "loss": 8.0685, + "loss/crossentropy": 2.7750041484832764, + "loss/hidden": 2.078125, + "loss/jsd": 0.0, + "loss/logits": 0.3215404748916626, + "step": 882 + }, + { + "epoch": 0.027625, + "grad_norm": 5.46875, + "grad_norm_var": 0.23173421223958332, + "learning_rate": 0.0001, + "loss": 7.9534, + "loss/crossentropy": 2.611912250518799, + "loss/hidden": 2.09375, + "loss/jsd": 0.0, + "loss/logits": 0.32476896047592163, + "step": 884 + }, + { + "epoch": 0.0276875, + "grad_norm": 4.90625, + "grad_norm_var": 0.2774739583333333, + "learning_rate": 0.0001, + "loss": 7.6661, + "loss/crossentropy": 2.6392383575439453, + "loss/hidden": 2.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2940940260887146, + "step": 886 + }, + { + "epoch": 0.02775, + "grad_norm": 6.625, + "grad_norm_var": 0.2930989583333333, + "learning_rate": 0.0001, + "loss": 8.6536, + "loss/crossentropy": 3.054406762123108, + "loss/hidden": 2.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.34351395070552826, + "step": 888 + }, + { + "epoch": 0.0278125, + "grad_norm": 5.40625, + "grad_norm_var": 0.304541015625, + "learning_rate": 0.0001, + "loss": 7.8948, + "loss/crossentropy": 2.5886287689208984, + "loss/hidden": 2.078125, + "loss/jsd": 0.0, + "loss/logits": 0.3228069841861725, + "step": 890 + }, + { + "epoch": 0.027875, + "grad_norm": 5.75, + "grad_norm_var": 0.27063395182291666, + "learning_rate": 0.0001, + "loss": 7.9995, + "loss/crossentropy": 2.773493528366089, + "loss/hidden": 2.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.3124459385871887, + "step": 892 + }, + { + "epoch": 0.0279375, + "grad_norm": 5.3125, + "grad_norm_var": 0.273681640625, + "learning_rate": 0.0001, + "loss": 8.0724, + "loss/crossentropy": 2.750393271446228, + "loss/hidden": 2.078125, + "loss/jsd": 0.0, + "loss/logits": 0.3243854194879532, + "step": 894 + }, + { + "epoch": 0.028, + "grad_norm": 6.15625, + "grad_norm_var": 0.17213134765625, + "learning_rate": 0.0001, + "loss": 7.6506, + "loss/crossentropy": 2.4986928701400757, + "loss/hidden": 2.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.30815713107585907, + "step": 896 + }, + { + "epoch": 0.0280625, + "grad_norm": 5.46875, + "grad_norm_var": 0.19250895182291666, + "learning_rate": 0.0001, + "loss": 7.845, + "loss/crossentropy": 2.671201229095459, + "loss/hidden": 2.046875, + "loss/jsd": 0.0, + "loss/logits": 0.3126888573169708, + "step": 898 + }, + { + "epoch": 0.028125, + "grad_norm": 6.59375, + "grad_norm_var": 0.25100504557291664, + "learning_rate": 0.0001, + "loss": 8.2131, + "loss/crossentropy": 2.8476167917251587, + "loss/hidden": 2.078125, + "loss/jsd": 0.0, + "loss/logits": 0.3287386894226074, + "step": 900 + }, + { + "epoch": 0.0281875, + "grad_norm": 5.1875, + "grad_norm_var": 0.24221598307291667, + "learning_rate": 0.0001, + "loss": 7.9984, + "loss/crossentropy": 2.7201178073883057, + "loss/hidden": 2.140625, + "loss/jsd": 0.0, + "loss/logits": 0.31376519799232483, + "step": 902 + }, + { + "epoch": 0.02825, + "grad_norm": 5.6875, + "grad_norm_var": 0.22470296223958333, + "learning_rate": 0.0001, + "loss": 7.7041, + "loss/crossentropy": 2.569111943244934, + "loss/hidden": 2.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.3080315589904785, + "step": 904 + }, + { + "epoch": 0.0283125, + "grad_norm": 5.28125, + "grad_norm_var": 0.3753743489583333, + "learning_rate": 0.0001, + "loss": 8.047, + "loss/crossentropy": 2.642240047454834, + "loss/hidden": 2.125, + "loss/jsd": 0.0, + "loss/logits": 0.3279738128185272, + "step": 906 + }, + { + "epoch": 0.028375, + "grad_norm": 6.03125, + "grad_norm_var": 0.38293863932291666, + "learning_rate": 0.0001, + "loss": 7.88, + "loss/crossentropy": 2.5164895057678223, + "loss/hidden": 2.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.32775919139385223, + "step": 908 + }, + { + "epoch": 0.0284375, + "grad_norm": 5.53125, + "grad_norm_var": 0.38437093098958336, + "learning_rate": 0.0001, + "loss": 8.5437, + "loss/crossentropy": 2.972300410270691, + "loss/hidden": 2.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.33917422592639923, + "step": 910 + }, + { + "epoch": 0.0285, + "grad_norm": 5.09375, + "grad_norm_var": 0.390625, + "learning_rate": 0.0001, + "loss": 7.7213, + "loss/crossentropy": 2.5361626148223877, + "loss/hidden": 2.109375, + "loss/jsd": 0.0, + "loss/logits": 0.30757124722003937, + "step": 912 + }, + { + "epoch": 0.0285625, + "grad_norm": 9.375, + "grad_norm_var": 1.1869425455729166, + "learning_rate": 0.0001, + "loss": 8.2299, + "loss/crossentropy": 2.709487557411194, + "loss/hidden": 2.21875, + "loss/jsd": 0.0, + "loss/logits": 0.3301650881767273, + "step": 914 + }, + { + "epoch": 0.028625, + "grad_norm": 6.5625, + "grad_norm_var": 1.1613118489583334, + "learning_rate": 0.0001, + "loss": 7.923, + "loss/crossentropy": 2.6102263927459717, + "loss/hidden": 2.09375, + "loss/jsd": 0.0, + "loss/logits": 0.32190655171871185, + "step": 916 + }, + { + "epoch": 0.0286875, + "grad_norm": 6.875, + "grad_norm_var": 1.3302042643229166, + "learning_rate": 0.0001, + "loss": 8.4869, + "loss/crossentropy": 2.9106889963150024, + "loss/hidden": 2.140625, + "loss/jsd": 0.0, + "loss/logits": 0.3435541093349457, + "step": 918 + }, + { + "epoch": 0.02875, + "grad_norm": 5.375, + "grad_norm_var": 1.285009765625, + "learning_rate": 0.0001, + "loss": 8.1751, + "loss/crossentropy": 2.721361994743347, + "loss/hidden": 2.125, + "loss/jsd": 0.0, + "loss/logits": 0.33287379145622253, + "step": 920 + }, + { + "epoch": 0.0288125, + "grad_norm": 6.09375, + "grad_norm_var": 1.2374837239583334, + "learning_rate": 0.0001, + "loss": 8.1547, + "loss/crossentropy": 2.76972234249115, + "loss/hidden": 2.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.32990050315856934, + "step": 922 + }, + { + "epoch": 0.028875, + "grad_norm": 5.75, + "grad_norm_var": 1.2248982747395833, + "learning_rate": 0.0001, + "loss": 8.235, + "loss/crossentropy": 2.905160665512085, + "loss/hidden": 2.078125, + "loss/jsd": 0.0, + "loss/logits": 0.32516761124134064, + "step": 924 + }, + { + "epoch": 0.0289375, + "grad_norm": 5.75, + "grad_norm_var": 1.21920166015625, + "learning_rate": 0.0001, + "loss": 7.8167, + "loss/crossentropy": 2.572208523750305, + "loss/hidden": 2.09375, + "loss/jsd": 0.0, + "loss/logits": 0.3150770515203476, + "step": 926 + }, + { + "epoch": 0.029, + "grad_norm": 6.0, + "grad_norm_var": 1.0982381184895833, + "learning_rate": 0.0001, + "loss": 7.6862, + "loss/crossentropy": 2.576735019683838, + "loss/hidden": 2.078125, + "loss/jsd": 0.0, + "loss/logits": 0.3031381666660309, + "step": 928 + }, + { + "epoch": 0.0290625, + "grad_norm": 5.28125, + "grad_norm_var": 0.45636393229166666, + "learning_rate": 0.0001, + "loss": 7.9539, + "loss/crossentropy": 2.7378779649734497, + "loss/hidden": 2.09375, + "loss/jsd": 0.0, + "loss/logits": 0.3122238516807556, + "step": 930 + }, + { + "epoch": 0.029125, + "grad_norm": 5.65625, + "grad_norm_var": 0.47415364583333336, + "learning_rate": 0.0001, + "loss": 7.7871, + "loss/crossentropy": 2.664810299873352, + "loss/hidden": 1.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.3130109906196594, + "step": 932 + }, + { + "epoch": 0.0291875, + "grad_norm": 5.46875, + "grad_norm_var": 0.12987874348958334, + "learning_rate": 0.0001, + "loss": 8.0233, + "loss/crossentropy": 2.762031674385071, + "loss/hidden": 2.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.31909996271133423, + "step": 934 + }, + { + "epoch": 0.02925, + "grad_norm": 11.125, + "grad_norm_var": 1.95758056640625, + "learning_rate": 0.0001, + "loss": 8.1688, + "loss/crossentropy": 2.6960248947143555, + "loss/hidden": 2.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.3339923918247223, + "step": 936 + }, + { + "epoch": 0.0293125, + "grad_norm": 5.84375, + "grad_norm_var": 1.9125651041666667, + "learning_rate": 0.0001, + "loss": 7.8335, + "loss/crossentropy": 2.756159782409668, + "loss/hidden": 2.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.30227015912532806, + "step": 938 + }, + { + "epoch": 0.029375, + "grad_norm": 5.53125, + "grad_norm_var": 1.9476847330729166, + "learning_rate": 0.0001, + "loss": 7.8498, + "loss/crossentropy": 2.552929639816284, + "loss/hidden": 2.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.3179690092802048, + "step": 940 + }, + { + "epoch": 0.0294375, + "grad_norm": 6.0625, + "grad_norm_var": 1.9318644205729167, + "learning_rate": 0.0001, + "loss": 8.2395, + "loss/crossentropy": 2.7607001066207886, + "loss/hidden": 2.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.3361617773771286, + "step": 942 + }, + { + "epoch": 0.0295, + "grad_norm": 5.71875, + "grad_norm_var": 1.9405232747395833, + "learning_rate": 0.0001, + "loss": 7.8254, + "loss/crossentropy": 2.6775211095809937, + "loss/hidden": 2.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.3077540248632431, + "step": 944 + }, + { + "epoch": 0.0295625, + "grad_norm": 7.34375, + "grad_norm_var": 2.0398274739583333, + "learning_rate": 0.0001, + "loss": 7.93, + "loss/crossentropy": 2.6490660905838013, + "loss/hidden": 2.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.3210662305355072, + "step": 946 + }, + { + "epoch": 0.029625, + "grad_norm": 5.375, + "grad_norm_var": 2.0617024739583334, + "learning_rate": 0.0001, + "loss": 7.4648, + "loss/crossentropy": 2.5306503772735596, + "loss/hidden": 2.0, + "loss/jsd": 0.0, + "loss/logits": 0.29341667890548706, + "step": 948 + }, + { + "epoch": 0.0296875, + "grad_norm": 5.3125, + "grad_norm_var": 2.0951171875, + "learning_rate": 0.0001, + "loss": 7.4159, + "loss/crossentropy": 2.4712361097335815, + "loss/hidden": 2.09375, + "loss/jsd": 0.0, + "loss/logits": 0.28509533405303955, + "step": 950 + }, + { + "epoch": 0.02975, + "grad_norm": 5.375, + "grad_norm_var": 0.3578084309895833, + "learning_rate": 0.0001, + "loss": 7.9328, + "loss/crossentropy": 2.6684658527374268, + "loss/hidden": 2.09375, + "loss/jsd": 0.0, + "loss/logits": 0.3170586824417114, + "step": 952 + }, + { + "epoch": 0.0298125, + "grad_norm": 5.0625, + "grad_norm_var": 0.3912760416666667, + "learning_rate": 0.0001, + "loss": 7.7893, + "loss/crossentropy": 2.586375594139099, + "loss/hidden": 2.078125, + "loss/jsd": 0.0, + "loss/logits": 0.31247901916503906, + "step": 954 + }, + { + "epoch": 0.029875, + "grad_norm": 4.8125, + "grad_norm_var": 0.41243082682291665, + "learning_rate": 0.0001, + "loss": 7.4818, + "loss/crossentropy": 2.563793182373047, + "loss/hidden": 1.99609375, + "loss/jsd": 0.0, + "loss/logits": 0.2921905666589737, + "step": 956 + }, + { + "epoch": 0.0299375, + "grad_norm": 5.125, + "grad_norm_var": 0.39143473307291665, + "learning_rate": 0.0001, + "loss": 7.8392, + "loss/crossentropy": 2.7233054637908936, + "loss/hidden": 2.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.3014303147792816, + "step": 958 + }, + { + "epoch": 0.03, + "grad_norm": 12.3125, + "grad_norm_var": 3.29986572265625, + "learning_rate": 0.0001, + "loss": 8.3685, + "loss/crossentropy": 2.7161797285079956, + "loss/hidden": 2.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.345696359872818, + "step": 960 + }, + { + "epoch": 0.0300625, + "grad_norm": 5.8125, + "grad_norm_var": 3.100455729166667, + "learning_rate": 0.0001, + "loss": 7.8995, + "loss/crossentropy": 2.6865986585617065, + "loss/hidden": 2.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.315825879573822, + "step": 962 + }, + { + "epoch": 0.030125, + "grad_norm": 5.84375, + "grad_norm_var": 3.0404947916666667, + "learning_rate": 0.0001, + "loss": 7.5698, + "loss/crossentropy": 2.4539116621017456, + "loss/hidden": 2.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.30924659967422485, + "step": 964 + }, + { + "epoch": 0.0301875, + "grad_norm": 5.09375, + "grad_norm_var": 3.0578125, + "learning_rate": 0.0001, + "loss": 7.8823, + "loss/crossentropy": 2.7271156311035156, + "loss/hidden": 2.0625, + "loss/jsd": 0.0, + "loss/logits": 0.3092683255672455, + "step": 966 + }, + { + "epoch": 0.03025, + "grad_norm": 5.34375, + "grad_norm_var": 3.09375, + "learning_rate": 0.0001, + "loss": 8.3393, + "loss/crossentropy": 2.7491281032562256, + "loss/hidden": 2.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.3394854813814163, + "step": 968 + }, + { + "epoch": 0.0303125, + "grad_norm": 5.40625, + "grad_norm_var": 3.076806640625, + "learning_rate": 0.0001, + "loss": 8.0177, + "loss/crossentropy": 2.7733538150787354, + "loss/hidden": 2.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.3142744302749634, + "step": 970 + }, + { + "epoch": 0.030375, + "grad_norm": 5.875, + "grad_norm_var": 2.9033854166666666, + "learning_rate": 0.0001, + "loss": 8.4779, + "loss/crossentropy": 3.035637617111206, + "loss/hidden": 2.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.33406516909599304, + "step": 972 + }, + { + "epoch": 0.0304375, + "grad_norm": 6.21875, + "grad_norm_var": 2.83970947265625, + "learning_rate": 0.0001, + "loss": 8.3789, + "loss/crossentropy": 2.986885666847229, + "loss/hidden": 2.078125, + "loss/jsd": 0.0, + "loss/logits": 0.331390380859375, + "step": 974 + }, + { + "epoch": 0.0305, + "grad_norm": 6.65625, + "grad_norm_var": 0.195703125, + "learning_rate": 0.0001, + "loss": 8.4519, + "loss/crossentropy": 2.8858373165130615, + "loss/hidden": 2.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.3464466333389282, + "step": 976 + }, + { + "epoch": 0.0305625, + "grad_norm": 6.4375, + "grad_norm_var": 0.47063802083333334, + "learning_rate": 0.0001, + "loss": 8.3219, + "loss/crossentropy": 2.7667852640151978, + "loss/hidden": 2.171875, + "loss/jsd": 0.0, + "loss/logits": 0.3383200764656067, + "step": 978 + }, + { + "epoch": 0.030625, + "grad_norm": 5.53125, + "grad_norm_var": 0.5205362955729167, + "learning_rate": 0.0001, + "loss": 7.7302, + "loss/crossentropy": 2.6153576374053955, + "loss/hidden": 2.046875, + "loss/jsd": 0.0, + "loss/logits": 0.3067933917045593, + "step": 980 + }, + { + "epoch": 0.0306875, + "grad_norm": 5.96875, + "grad_norm_var": 0.510791015625, + "learning_rate": 0.0001, + "loss": 8.5684, + "loss/crossentropy": 2.9319427013397217, + "loss/hidden": 2.15625, + "loss/jsd": 0.0, + "loss/logits": 0.34801939129829407, + "step": 982 + }, + { + "epoch": 0.03075, + "grad_norm": 5.6875, + "grad_norm_var": 0.4574178059895833, + "learning_rate": 0.0001, + "loss": 8.154, + "loss/crossentropy": 2.73896861076355, + "loss/hidden": 2.109375, + "loss/jsd": 0.0, + "loss/logits": 0.3305632919073105, + "step": 984 + }, + { + "epoch": 0.0308125, + "grad_norm": 5.375, + "grad_norm_var": 0.44302978515625, + "learning_rate": 0.0001, + "loss": 7.9289, + "loss/crossentropy": 2.6419018507003784, + "loss/hidden": 2.125, + "loss/jsd": 0.0, + "loss/logits": 0.31620074808597565, + "step": 986 + }, + { + "epoch": 0.030875, + "grad_norm": 6.03125, + "grad_norm_var": 0.4825520833333333, + "learning_rate": 0.0001, + "loss": 7.8493, + "loss/crossentropy": 2.6793285608291626, + "loss/hidden": 2.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.313095286488533, + "step": 988 + }, + { + "epoch": 0.0309375, + "grad_norm": 4.96875, + "grad_norm_var": 0.529541015625, + "learning_rate": 0.0001, + "loss": 7.8312, + "loss/crossentropy": 2.6495360136032104, + "loss/hidden": 2.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.3142629563808441, + "step": 990 + }, + { + "epoch": 0.031, + "grad_norm": 5.65625, + "grad_norm_var": 0.5310506184895833, + "learning_rate": 0.0001, + "loss": 7.8569, + "loss/crossentropy": 2.6996525526046753, + "loss/hidden": 2.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.3086983859539032, + "step": 992 + }, + { + "epoch": 0.0310625, + "grad_norm": 5.78125, + "grad_norm_var": 0.19765625, + "learning_rate": 0.0001, + "loss": 8.2429, + "loss/crossentropy": 2.787962317466736, + "loss/hidden": 2.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.33689984679222107, + "step": 994 + }, + { + "epoch": 0.031125, + "grad_norm": 6.21875, + "grad_norm_var": 0.22366129557291667, + "learning_rate": 0.0001, + "loss": 7.8903, + "loss/crossentropy": 2.657445192337036, + "loss/hidden": 2.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.31156356632709503, + "step": 996 + }, + { + "epoch": 0.0311875, + "grad_norm": 5.5625, + "grad_norm_var": 0.13487955729166667, + "learning_rate": 0.0001, + "loss": 7.927, + "loss/crossentropy": 2.7342272996902466, + "loss/hidden": 2.09375, + "loss/jsd": 0.0, + "loss/logits": 0.3099045604467392, + "step": 998 + }, + { + "epoch": 0.03125, + "grad_norm": 5.125, + "grad_norm_var": 0.13541666666666666, + "learning_rate": 0.0001, + "loss": 7.7652, + "loss/crossentropy": 2.6236950159072876, + "loss/hidden": 2.046875, + "loss/jsd": 0.0, + "loss/logits": 0.30945977568626404, + "step": 1000 + }, + { + "epoch": 0.0313125, + "grad_norm": 5.8125, + "grad_norm_var": 0.14263916015625, + "learning_rate": 0.0001, + "loss": 8.256, + "loss/crossentropy": 2.8440134525299072, + "loss/hidden": 2.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.329479455947876, + "step": 1002 + }, + { + "epoch": 0.031375, + "grad_norm": 5.78125, + "grad_norm_var": 0.12047119140625, + "learning_rate": 0.0001, + "loss": 7.9484, + "loss/crossentropy": 2.6811503171920776, + "loss/hidden": 2.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.313446044921875, + "step": 1004 + }, + { + "epoch": 0.0314375, + "grad_norm": 5.15625, + "grad_norm_var": 0.12053629557291666, + "learning_rate": 0.0001, + "loss": 7.8344, + "loss/crossentropy": 2.676396131515503, + "loss/hidden": 2.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.3118928074836731, + "step": 1006 + }, + { + "epoch": 0.0315, + "grad_norm": 5.28125, + "grad_norm_var": 0.10519205729166667, + "learning_rate": 0.0001, + "loss": 7.6031, + "loss/crossentropy": 2.5833935737609863, + "loss/hidden": 2.046875, + "loss/jsd": 0.0, + "loss/logits": 0.2972829341888428, + "step": 1008 + }, + { + "epoch": 0.0315625, + "grad_norm": 4.71875, + "grad_norm_var": 0.14134114583333332, + "learning_rate": 0.0001, + "loss": 7.8421, + "loss/crossentropy": 2.774709463119507, + "loss/hidden": 2.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2981446832418442, + "step": 1010 + }, + { + "epoch": 0.031625, + "grad_norm": 6.15625, + "grad_norm_var": 0.57730712890625, + "learning_rate": 0.0001, + "loss": 7.9047, + "loss/crossentropy": 2.4489853382110596, + "loss/hidden": 2.1640625, + "loss/jsd": 0.0, + "loss/logits": 0.329169899225235, + "step": 1012 + }, + { + "epoch": 0.0316875, + "grad_norm": 4.96875, + "grad_norm_var": 0.6143513997395833, + "learning_rate": 0.0001, + "loss": 7.7484, + "loss/crossentropy": 2.7602421045303345, + "loss/hidden": 2.03125, + "loss/jsd": 0.0, + "loss/logits": 0.2956867665052414, + "step": 1014 + }, + { + "epoch": 0.03175, + "grad_norm": 5.28125, + "grad_norm_var": 0.60299072265625, + "learning_rate": 0.0001, + "loss": 7.9042, + "loss/crossentropy": 2.6339290142059326, + "loss/hidden": 2.046875, + "loss/jsd": 0.0, + "loss/logits": 0.32234111428260803, + "step": 1016 + }, + { + "epoch": 0.0318125, + "grad_norm": 5.71875, + "grad_norm_var": 0.6020833333333333, + "learning_rate": 0.0001, + "loss": 7.887, + "loss/crossentropy": 2.5684269666671753, + "loss/hidden": 2.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.3217056393623352, + "step": 1018 + }, + { + "epoch": 0.031875, + "grad_norm": 5.8125, + "grad_norm_var": 0.5924763997395833, + "learning_rate": 0.0001, + "loss": 7.8581, + "loss/crossentropy": 2.7317564487457275, + "loss/hidden": 2.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.3056041747331619, + "step": 1020 + }, + { + "epoch": 0.0319375, + "grad_norm": 9.3125, + "grad_norm_var": 1.39703369140625, + "learning_rate": 0.0001, + "loss": 7.7259, + "loss/crossentropy": 2.5896379947662354, + "loss/hidden": 2.09375, + "loss/jsd": 0.0, + "loss/logits": 0.30425289273262024, + "step": 1022 + }, + { + "epoch": 0.032, + "grad_norm": 5.09375, + "grad_norm_var": 1.40953369140625, + "learning_rate": 0.0001, + "loss": 7.7384, + "loss/crossentropy": 2.5754462480545044, + "loss/hidden": 2.078125, + "loss/jsd": 0.0, + "loss/logits": 0.3084825873374939, + "step": 1024 + }, + { + "epoch": 0.0320625, + "grad_norm": 5.4375, + "grad_norm_var": 1.3848958333333334, + "learning_rate": 0.0001, + "loss": 8.0083, + "loss/crossentropy": 2.7684485912323, + "loss/hidden": 2.078125, + "loss/jsd": 0.0, + "loss/logits": 0.31617075204849243, + "step": 1026 + }, + { + "epoch": 0.032125, + "grad_norm": 5.5, + "grad_norm_var": 1.0278605143229167, + "learning_rate": 0.0001, + "loss": 7.4014, + "loss/crossentropy": 2.4708162546157837, + "loss/hidden": 2.03125, + "loss/jsd": 0.0, + "loss/logits": 0.2899314910173416, + "step": 1028 + }, + { + "epoch": 0.0321875, + "grad_norm": 5.8125, + "grad_norm_var": 0.9800130208333333, + "learning_rate": 0.0001, + "loss": 8.1904, + "loss/crossentropy": 2.7571115493774414, + "loss/hidden": 2.125, + "loss/jsd": 0.0, + "loss/logits": 0.33083009719848633, + "step": 1030 + }, + { + "epoch": 0.03225, + "grad_norm": 4.875, + "grad_norm_var": 1.01529541015625, + "learning_rate": 0.0001, + "loss": 7.6514, + "loss/crossentropy": 2.638901114463806, + "loss/hidden": 2.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.29890111088752747, + "step": 1032 + }, + { + "epoch": 0.0323125, + "grad_norm": 7.90625, + "grad_norm_var": 1.320947265625, + "learning_rate": 0.0001, + "loss": 8.1064, + "loss/crossentropy": 2.8018993139266968, + "loss/hidden": 2.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.3218534588813782, + "step": 1034 + }, + { + "epoch": 0.032375, + "grad_norm": 7.5, + "grad_norm_var": 1.4863240559895834, + "learning_rate": 0.0001, + "loss": 8.4959, + "loss/crossentropy": 2.9163622856140137, + "loss/hidden": 2.1953125, + "loss/jsd": 0.0, + "loss/logits": 0.33842067420482635, + "step": 1036 + }, + { + "epoch": 0.0324375, + "grad_norm": 5.71875, + "grad_norm_var": 0.6952473958333333, + "learning_rate": 0.0001, + "loss": 7.8989, + "loss/crossentropy": 2.7055130004882812, + "loss/hidden": 2.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.31543323397636414, + "step": 1038 + }, + { + "epoch": 0.0325, + "grad_norm": 5.78125, + "grad_norm_var": 0.6703084309895834, + "learning_rate": 0.0001, + "loss": 7.9723, + "loss/crossentropy": 2.5736807584762573, + "loss/hidden": 2.125, + "loss/jsd": 0.0, + "loss/logits": 0.3273621052503586, + "step": 1040 + }, + { + "epoch": 0.0325625, + "grad_norm": 6.09375, + "grad_norm_var": 1.0708333333333333, + "learning_rate": 0.0001, + "loss": 8.718, + "loss/crossentropy": 2.940716505050659, + "loss/hidden": 2.140625, + "loss/jsd": 0.0, + "loss/logits": 0.3636625409126282, + "step": 1042 + }, + { + "epoch": 0.032625, + "grad_norm": 6.25, + "grad_norm_var": 1.0049479166666666, + "learning_rate": 0.0001, + "loss": 8.4934, + "loss/crossentropy": 2.8877869844436646, + "loss/hidden": 2.1328125, + "loss/jsd": 0.0, + "loss/logits": 0.34728457033634186, + "step": 1044 + }, + { + "epoch": 0.0326875, + "grad_norm": 5.28125, + "grad_norm_var": 1.0175618489583333, + "learning_rate": 0.0001, + "loss": 8.099, + "loss/crossentropy": 2.7789658308029175, + "loss/hidden": 2.078125, + "loss/jsd": 0.0, + "loss/logits": 0.3241891860961914, + "step": 1046 + }, + { + "epoch": 0.03275, + "grad_norm": 6.1875, + "grad_norm_var": 0.8885701497395834, + "learning_rate": 0.0001, + "loss": 8.1815, + "loss/crossentropy": 2.7968313694000244, + "loss/hidden": 2.15625, + "loss/jsd": 0.0, + "loss/logits": 0.32283732295036316, + "step": 1048 + }, + { + "epoch": 0.0328125, + "grad_norm": 5.40625, + "grad_norm_var": 0.7373046875, + "learning_rate": 0.0001, + "loss": 7.472, + "loss/crossentropy": 2.44843852519989, + "loss/hidden": 2.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.2937634289264679, + "step": 1050 + }, + { + "epoch": 0.032875, + "grad_norm": 5.9375, + "grad_norm_var": 0.5847941080729167, + "learning_rate": 0.0001, + "loss": 8.6614, + "loss/crossentropy": 3.1080269813537598, + "loss/hidden": 2.109375, + "loss/jsd": 0.0, + "loss/logits": 0.34440042078495026, + "step": 1052 + }, + { + "epoch": 0.0329375, + "grad_norm": 5.28125, + "grad_norm_var": 0.6066365559895833, + "learning_rate": 0.0001, + "loss": 7.9554, + "loss/crossentropy": 2.694094657897949, + "loss/hidden": 2.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.322227418422699, + "step": 1054 + }, + { + "epoch": 0.033, + "grad_norm": 4.9375, + "grad_norm_var": 0.69879150390625, + "learning_rate": 0.0001, + "loss": 7.6204, + "loss/crossentropy": 2.594877004623413, + "loss/hidden": 2.03515625, + "loss/jsd": 0.0, + "loss/logits": 0.2990366369485855, + "step": 1056 + }, + { + "epoch": 0.0330625, + "grad_norm": 5.6875, + "grad_norm_var": 0.18345947265625, + "learning_rate": 0.0001, + "loss": 7.7801, + "loss/crossentropy": 2.691964864730835, + "loss/hidden": 2.0625, + "loss/jsd": 0.0, + "loss/logits": 0.30256471037864685, + "step": 1058 + }, + { + "epoch": 0.033125, + "grad_norm": 5.5, + "grad_norm_var": 0.16809895833333333, + "learning_rate": 0.0001, + "loss": 7.9022, + "loss/crossentropy": 2.747640609741211, + "loss/hidden": 2.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.3084237426519394, + "step": 1060 + }, + { + "epoch": 0.0331875, + "grad_norm": 4.6875, + "grad_norm_var": 0.1982421875, + "learning_rate": 0.0001, + "loss": 7.4803, + "loss/crossentropy": 2.540893077850342, + "loss/hidden": 1.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.29471834003925323, + "step": 1062 + }, + { + "epoch": 0.03325, + "grad_norm": 4.8125, + "grad_norm_var": 0.18079020182291666, + "learning_rate": 0.0001, + "loss": 7.8368, + "loss/crossentropy": 2.8419036865234375, + "loss/hidden": 2.015625, + "loss/jsd": 0.0, + "loss/logits": 0.2979314625263214, + "step": 1064 + }, + { + "epoch": 0.0333125, + "grad_norm": 5.34375, + "grad_norm_var": 0.19078369140625, + "learning_rate": 0.0001, + "loss": 7.6024, + "loss/crossentropy": 2.562094211578369, + "loss/hidden": 2.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.3016863167285919, + "step": 1066 + }, + { + "epoch": 0.033375, + "grad_norm": 4.875, + "grad_norm_var": 0.13828125, + "learning_rate": 0.0001, + "loss": 7.4647, + "loss/crossentropy": 2.46220201253891, + "loss/hidden": 2.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.29322096705436707, + "step": 1068 + }, + { + "epoch": 0.0334375, + "grad_norm": 5.1875, + "grad_norm_var": 0.14286702473958332, + "learning_rate": 0.0001, + "loss": 7.6626, + "loss/crossentropy": 2.729177236557007, + "loss/hidden": 2.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.29100048542022705, + "step": 1070 + }, + { + "epoch": 0.0335, + "grad_norm": 5.21875, + "grad_norm_var": 0.14036458333333332, + "learning_rate": 0.0001, + "loss": 7.565, + "loss/crossentropy": 2.565149426460266, + "loss/hidden": 2.046875, + "loss/jsd": 0.0, + "loss/logits": 0.29529547691345215, + "step": 1072 + }, + { + "epoch": 0.0335625, + "grad_norm": 5.53125, + "grad_norm_var": 0.13606363932291668, + "learning_rate": 0.0001, + "loss": 8.2564, + "loss/crossentropy": 2.856488823890686, + "loss/hidden": 2.078125, + "loss/jsd": 0.0, + "loss/logits": 0.33218303322792053, + "step": 1074 + }, + { + "epoch": 0.033625, + "grad_norm": 4.9375, + "grad_norm_var": 0.13307291666666668, + "learning_rate": 0.0001, + "loss": 7.3967, + "loss/crossentropy": 2.5067635774612427, + "loss/hidden": 1.984375, + "loss/jsd": 0.0, + "loss/logits": 0.29055845737457275, + "step": 1076 + }, + { + "epoch": 0.0336875, + "grad_norm": 5.625, + "grad_norm_var": 0.13527018229166668, + "learning_rate": 0.0001, + "loss": 8.2068, + "loss/crossentropy": 2.904178261756897, + "loss/hidden": 2.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.32166482508182526, + "step": 1078 + }, + { + "epoch": 0.03375, + "grad_norm": 5.84375, + "grad_norm_var": 0.09478759765625, + "learning_rate": 0.0001, + "loss": 7.9513, + "loss/crossentropy": 2.6391228437423706, + "loss/hidden": 2.125, + "loss/jsd": 0.0, + "loss/logits": 0.3187223821878433, + "step": 1080 + }, + { + "epoch": 0.0338125, + "grad_norm": 5.75, + "grad_norm_var": 0.10234375, + "learning_rate": 0.0001, + "loss": 7.9395, + "loss/crossentropy": 2.7211371660232544, + "loss/hidden": 2.08984375, + "loss/jsd": 0.0, + "loss/logits": 0.31285470724105835, + "step": 1082 + }, + { + "epoch": 0.033875, + "grad_norm": 5.25, + "grad_norm_var": 0.093603515625, + "learning_rate": 0.0001, + "loss": 7.8391, + "loss/crossentropy": 2.802746534347534, + "loss/hidden": 2.03515625, + "loss/jsd": 0.0, + "loss/logits": 0.3001168519258499, + "step": 1084 + }, + { + "epoch": 0.0339375, + "grad_norm": 5.34375, + "grad_norm_var": 0.09425455729166667, + "learning_rate": 0.0001, + "loss": 7.5762, + "loss/crossentropy": 2.604748845100403, + "loss/hidden": 1.99609375, + "loss/jsd": 0.0, + "loss/logits": 0.2975347489118576, + "step": 1086 + }, + { + "epoch": 0.034, + "grad_norm": 5.15625, + "grad_norm_var": 0.09062093098958333, + "learning_rate": 0.0001, + "loss": 7.8203, + "loss/crossentropy": 2.646591305732727, + "loss/hidden": 2.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.3134637773036957, + "step": 1088 + }, + { + "epoch": 0.0340625, + "grad_norm": 4.96875, + "grad_norm_var": 0.09803059895833334, + "learning_rate": 0.0001, + "loss": 7.6995, + "loss/crossentropy": 2.7274059057235718, + "loss/hidden": 2.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.29486148059368134, + "step": 1090 + }, + { + "epoch": 0.034125, + "grad_norm": 5.5625, + "grad_norm_var": 0.19062093098958333, + "learning_rate": 0.0001, + "loss": 7.3454, + "loss/crossentropy": 2.3077635765075684, + "loss/hidden": 2.03515625, + "loss/jsd": 0.0, + "loss/logits": 0.3002438396215439, + "step": 1092 + }, + { + "epoch": 0.0341875, + "grad_norm": 5.09375, + "grad_norm_var": 0.19384358723958334, + "learning_rate": 0.0001, + "loss": 7.5719, + "loss/crossentropy": 2.6316027641296387, + "loss/hidden": 2.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.29012222588062286, + "step": 1094 + }, + { + "epoch": 0.03425, + "grad_norm": 5.03125, + "grad_norm_var": 0.18843994140625, + "learning_rate": 0.0001, + "loss": 7.4854, + "loss/crossentropy": 2.538474917411804, + "loss/hidden": 1.97265625, + "loss/jsd": 0.0, + "loss/logits": 0.2974313497543335, + "step": 1096 + }, + { + "epoch": 0.0343125, + "grad_norm": 5.40625, + "grad_norm_var": 0.22355143229166666, + "learning_rate": 0.0001, + "loss": 7.938, + "loss/crossentropy": 2.862139940261841, + "loss/hidden": 2.015625, + "loss/jsd": 0.0, + "loss/logits": 0.3060276210308075, + "step": 1098 + }, + { + "epoch": 0.034375, + "grad_norm": 5.28125, + "grad_norm_var": 0.21444905598958333, + "learning_rate": 0.0001, + "loss": 7.7295, + "loss/crossentropy": 2.5745826959609985, + "loss/hidden": 2.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.31002719700336456, + "step": 1100 + }, + { + "epoch": 0.0344375, + "grad_norm": 5.25, + "grad_norm_var": 0.198291015625, + "learning_rate": 0.0001, + "loss": 8.1631, + "loss/crossentropy": 2.921027660369873, + "loss/hidden": 2.109375, + "loss/jsd": 0.0, + "loss/logits": 0.313272625207901, + "step": 1102 + }, + { + "epoch": 0.0345, + "grad_norm": 4.96875, + "grad_norm_var": 0.21783854166666666, + "learning_rate": 0.0001, + "loss": 7.2742, + "loss/crossentropy": 2.430017828941345, + "loss/hidden": 2.03125, + "loss/jsd": 0.0, + "loss/logits": 0.28129300475120544, + "step": 1104 + }, + { + "epoch": 0.0345625, + "grad_norm": 5.59375, + "grad_norm_var": 0.20753580729166668, + "learning_rate": 0.0001, + "loss": 7.9823, + "loss/crossentropy": 2.8249661922454834, + "loss/hidden": 2.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.31338855624198914, + "step": 1106 + }, + { + "epoch": 0.034625, + "grad_norm": 6.0, + "grad_norm_var": 0.123681640625, + "learning_rate": 0.0001, + "loss": 7.9694, + "loss/crossentropy": 2.717615008354187, + "loss/hidden": 2.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.3150188624858856, + "step": 1108 + }, + { + "epoch": 0.0346875, + "grad_norm": 5.53125, + "grad_norm_var": 0.12056884765625, + "learning_rate": 0.0001, + "loss": 7.7475, + "loss/crossentropy": 2.7591657638549805, + "loss/hidden": 2.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.2918030321598053, + "step": 1110 + }, + { + "epoch": 0.03475, + "grad_norm": 4.78125, + "grad_norm_var": 0.13730061848958333, + "learning_rate": 0.0001, + "loss": 7.624, + "loss/crossentropy": 2.5710976123809814, + "loss/hidden": 2.046875, + "loss/jsd": 0.0, + "loss/logits": 0.3006017506122589, + "step": 1112 + }, + { + "epoch": 0.0348125, + "grad_norm": 7.125, + "grad_norm_var": 0.30250244140625, + "learning_rate": 0.0001, + "loss": 7.9458, + "loss/crossentropy": 2.817541718482971, + "loss/hidden": 2.0625, + "loss/jsd": 0.0, + "loss/logits": 0.3065790385007858, + "step": 1114 + }, + { + "epoch": 0.034875, + "grad_norm": 5.1875, + "grad_norm_var": 0.30601806640625, + "learning_rate": 0.0001, + "loss": 8.3373, + "loss/crossentropy": 3.012556791305542, + "loss/hidden": 2.09375, + "loss/jsd": 0.0, + "loss/logits": 0.323103591799736, + "step": 1116 + }, + { + "epoch": 0.0349375, + "grad_norm": 6.09375, + "grad_norm_var": 0.33033447265625, + "learning_rate": 0.0001, + "loss": 8.2642, + "loss/crossentropy": 2.9240721464157104, + "loss/hidden": 2.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.32698529958724976, + "step": 1118 + }, + { + "epoch": 0.035, + "grad_norm": 4.96875, + "grad_norm_var": 0.316015625, + "learning_rate": 0.0001, + "loss": 7.7638, + "loss/crossentropy": 2.6127941608428955, + "loss/hidden": 2.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.306509867310524, + "step": 1120 + }, + { + "epoch": 0.0350625, + "grad_norm": 5.0, + "grad_norm_var": 0.32821858723958336, + "learning_rate": 0.0001, + "loss": 8.0152, + "loss/crossentropy": 2.7948319911956787, + "loss/hidden": 2.0625, + "loss/jsd": 0.0, + "loss/logits": 0.31578800082206726, + "step": 1122 + }, + { + "epoch": 0.035125, + "grad_norm": 6.15625, + "grad_norm_var": 0.36256103515625, + "learning_rate": 0.0001, + "loss": 7.5683, + "loss/crossentropy": 2.5288106203079224, + "loss/hidden": 2.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.301602378487587, + "step": 1124 + }, + { + "epoch": 0.0351875, + "grad_norm": 5.5, + "grad_norm_var": 0.3632120768229167, + "learning_rate": 0.0001, + "loss": 8.1987, + "loss/crossentropy": 2.9701608419418335, + "loss/hidden": 2.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.31426164507865906, + "step": 1126 + }, + { + "epoch": 0.03525, + "grad_norm": 5.125, + "grad_norm_var": 0.33684895833333334, + "learning_rate": 0.0001, + "loss": 7.9655, + "loss/crossentropy": 2.819477081298828, + "loss/hidden": 2.046875, + "loss/jsd": 0.0, + "loss/logits": 0.3099192678928375, + "step": 1128 + }, + { + "epoch": 0.0353125, + "grad_norm": 4.875, + "grad_norm_var": 0.15735270182291666, + "learning_rate": 0.0001, + "loss": 7.3771, + "loss/crossentropy": 2.5352863073349, + "loss/hidden": 1.953125, + "loss/jsd": 0.0, + "loss/logits": 0.28886666893959045, + "step": 1130 + }, + { + "epoch": 0.035375, + "grad_norm": 5.5625, + "grad_norm_var": 0.15623372395833332, + "learning_rate": 0.0001, + "loss": 7.9852, + "loss/crossentropy": 2.8817960023880005, + "loss/hidden": 2.015625, + "loss/jsd": 0.0, + "loss/logits": 0.30878154933452606, + "step": 1132 + }, + { + "epoch": 0.0354375, + "grad_norm": 5.75, + "grad_norm_var": 0.13157145182291666, + "learning_rate": 0.0001, + "loss": 8.0863, + "loss/crossentropy": 2.783313274383545, + "loss/hidden": 2.0625, + "loss/jsd": 0.0, + "loss/logits": 0.3240460753440857, + "step": 1134 + }, + { + "epoch": 0.0355, + "grad_norm": 5.84375, + "grad_norm_var": 0.14817708333333332, + "learning_rate": 0.0001, + "loss": 8.2244, + "loss/crossentropy": 2.797518014907837, + "loss/hidden": 2.1171875, + "loss/jsd": 0.0, + "loss/logits": 0.3309740275144577, + "step": 1136 + }, + { + "epoch": 0.0355625, + "grad_norm": 4.90625, + "grad_norm_var": 0.19295247395833334, + "learning_rate": 0.0001, + "loss": 7.3743, + "loss/crossentropy": 2.5806562900543213, + "loss/hidden": 1.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.28640007972717285, + "step": 1138 + }, + { + "epoch": 0.035625, + "grad_norm": 5.375, + "grad_norm_var": 0.13605143229166666, + "learning_rate": 0.0001, + "loss": 7.825, + "loss/crossentropy": 2.722290277481079, + "loss/hidden": 2.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.30167827010154724, + "step": 1140 + }, + { + "epoch": 0.0356875, + "grad_norm": 5.65625, + "grad_norm_var": 0.141259765625, + "learning_rate": 0.0001, + "loss": 7.7702, + "loss/crossentropy": 2.702438473701477, + "loss/hidden": 2.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.30130916833877563, + "step": 1142 + }, + { + "epoch": 0.03575, + "grad_norm": 4.84375, + "grad_norm_var": 0.146875, + "learning_rate": 0.0001, + "loss": 7.8102, + "loss/crossentropy": 2.712978720664978, + "loss/hidden": 2.046875, + "loss/jsd": 0.0, + "loss/logits": 0.3050341159105301, + "step": 1144 + }, + { + "epoch": 0.0358125, + "grad_norm": 5.625, + "grad_norm_var": 0.1291015625, + "learning_rate": 0.0001, + "loss": 7.8463, + "loss/crossentropy": 2.7124587297439575, + "loss/hidden": 2.03125, + "loss/jsd": 0.0, + "loss/logits": 0.3102594017982483, + "step": 1146 + }, + { + "epoch": 0.035875, + "grad_norm": 5.40625, + "grad_norm_var": 0.13014322916666668, + "learning_rate": 0.0001, + "loss": 7.6124, + "loss/crossentropy": 2.6879937648773193, + "loss/hidden": 1.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.29478274285793304, + "step": 1148 + }, + { + "epoch": 0.0359375, + "grad_norm": 4.71875, + "grad_norm_var": 0.14752604166666666, + "learning_rate": 0.0001, + "loss": 7.7755, + "loss/crossentropy": 2.7245869636535645, + "loss/hidden": 2.01171875, + "loss/jsd": 0.0, + "loss/logits": 0.3039206862449646, + "step": 1150 + }, + { + "epoch": 0.036, + "grad_norm": 5.5, + "grad_norm_var": 0.10480143229166666, + "learning_rate": 0.0001, + "loss": 7.835, + "loss/crossentropy": 2.744845390319824, + "loss/hidden": 2.015625, + "loss/jsd": 0.0, + "loss/logits": 0.3074570447206497, + "step": 1152 + }, + { + "epoch": 0.0360625, + "grad_norm": 8.25, + "grad_norm_var": 0.6361287434895834, + "learning_rate": 0.0001, + "loss": 7.9799, + "loss/crossentropy": 2.694003462791443, + "loss/hidden": 2.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.32155656814575195, + "step": 1154 + }, + { + "epoch": 0.036125, + "grad_norm": 5.28125, + "grad_norm_var": 0.6376912434895833, + "learning_rate": 0.0001, + "loss": 7.9291, + "loss/crossentropy": 2.8468209505081177, + "loss/hidden": 2.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.30744484066963196, + "step": 1156 + }, + { + "epoch": 0.0361875, + "grad_norm": 4.9375, + "grad_norm_var": 0.7028483072916667, + "learning_rate": 0.0001, + "loss": 7.8866, + "loss/crossentropy": 2.731719493865967, + "loss/hidden": 2.0625, + "loss/jsd": 0.0, + "loss/logits": 0.3092362582683563, + "step": 1158 + }, + { + "epoch": 0.03625, + "grad_norm": 5.71875, + "grad_norm_var": 0.7010050455729167, + "learning_rate": 0.0001, + "loss": 7.7235, + "loss/crossentropy": 2.7183526754379272, + "loss/hidden": 2.01953125, + "loss/jsd": 0.0, + "loss/logits": 0.2985660433769226, + "step": 1160 + }, + { + "epoch": 0.0363125, + "grad_norm": 5.21875, + "grad_norm_var": 0.7028483072916667, + "learning_rate": 0.0001, + "loss": 7.5857, + "loss/crossentropy": 2.5812745094299316, + "loss/hidden": 2.078125, + "loss/jsd": 0.0, + "loss/logits": 0.2926321029663086, + "step": 1162 + }, + { + "epoch": 0.036375, + "grad_norm": 5.53125, + "grad_norm_var": 0.7484334309895834, + "learning_rate": 0.0001, + "loss": 7.7259, + "loss/crossentropy": 2.694801926612854, + "loss/hidden": 1.98828125, + "loss/jsd": 0.0, + "loss/logits": 0.3042774498462677, + "step": 1164 + }, + { + "epoch": 0.0364375, + "grad_norm": 5.21875, + "grad_norm_var": 0.7175130208333333, + "learning_rate": 0.0001, + "loss": 8.0344, + "loss/crossentropy": 2.8037021160125732, + "loss/hidden": 2.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.3191618323326111, + "step": 1166 + }, + { + "epoch": 0.0365, + "grad_norm": 4.625, + "grad_norm_var": 0.7555989583333333, + "learning_rate": 0.0001, + "loss": 7.6207, + "loss/crossentropy": 2.678989052772522, + "loss/hidden": 1.98046875, + "loss/jsd": 0.0, + "loss/logits": 0.2961277812719345, + "step": 1168 + }, + { + "epoch": 0.0365625, + "grad_norm": 4.9375, + "grad_norm_var": 0.18553059895833332, + "learning_rate": 0.0001, + "loss": 7.8717, + "loss/crossentropy": 2.750701904296875, + "loss/hidden": 2.03125, + "loss/jsd": 0.0, + "loss/logits": 0.3089783787727356, + "step": 1170 + }, + { + "epoch": 0.036625, + "grad_norm": 4.875, + "grad_norm_var": 0.19503580729166667, + "learning_rate": 0.0001, + "loss": 7.5382, + "loss/crossentropy": 2.6573996543884277, + "loss/hidden": 2.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.2873009145259857, + "step": 1172 + }, + { + "epoch": 0.0366875, + "grad_norm": 5.625, + "grad_norm_var": 0.116796875, + "learning_rate": 0.0001, + "loss": 7.6217, + "loss/crossentropy": 2.5741889476776123, + "loss/hidden": 2.0625, + "loss/jsd": 0.0, + "loss/logits": 0.2985018193721771, + "step": 1174 + }, + { + "epoch": 0.03675, + "grad_norm": 4.875, + "grad_norm_var": 0.20950113932291667, + "learning_rate": 0.0001, + "loss": 7.9366, + "loss/crossentropy": 2.7274372577667236, + "loss/hidden": 2.125, + "loss/jsd": 0.0, + "loss/logits": 0.308414101600647, + "step": 1176 + }, + { + "epoch": 0.0368125, + "grad_norm": 4.84375, + "grad_norm_var": 0.20976155598958332, + "learning_rate": 0.0001, + "loss": 7.4378, + "loss/crossentropy": 2.578323483467102, + "loss/hidden": 1.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.29141393303871155, + "step": 1178 + }, + { + "epoch": 0.036875, + "grad_norm": 5.78125, + "grad_norm_var": 0.23632405598958334, + "learning_rate": 0.0001, + "loss": 7.782, + "loss/crossentropy": 2.6867313385009766, + "loss/hidden": 2.015625, + "loss/jsd": 0.0, + "loss/logits": 0.3079614043235779, + "step": 1180 + }, + { + "epoch": 0.0369375, + "grad_norm": 5.1875, + "grad_norm_var": 0.23717447916666667, + "learning_rate": 0.0001, + "loss": 7.6463, + "loss/crossentropy": 2.5722291469573975, + "loss/hidden": 1.97265625, + "loss/jsd": 0.0, + "loss/logits": 0.31014107167720795, + "step": 1182 + }, + { + "epoch": 0.037, + "grad_norm": 5.1875, + "grad_norm_var": 0.21139322916666667, + "learning_rate": 0.0001, + "loss": 7.3991, + "loss/crossentropy": 2.6571277379989624, + "loss/hidden": 1.98828125, + "loss/jsd": 0.0, + "loss/logits": 0.27536794543266296, + "step": 1184 + }, + { + "epoch": 0.0370625, + "grad_norm": 5.78125, + "grad_norm_var": 0.22805582682291667, + "learning_rate": 0.0001, + "loss": 7.8376, + "loss/crossentropy": 2.785367250442505, + "loss/hidden": 2.046875, + "loss/jsd": 0.0, + "loss/logits": 0.3005323112010956, + "step": 1186 + }, + { + "epoch": 0.037125, + "grad_norm": 4.6875, + "grad_norm_var": 0.23267822265625, + "learning_rate": 0.0001, + "loss": 7.5326, + "loss/crossentropy": 2.6078680753707886, + "loss/hidden": 2.0, + "loss/jsd": 0.0, + "loss/logits": 0.292471781373024, + "step": 1188 + }, + { + "epoch": 0.0371875, + "grad_norm": 4.84375, + "grad_norm_var": 0.25006510416666666, + "learning_rate": 0.0001, + "loss": 7.6366, + "loss/crossentropy": 2.6924532651901245, + "loss/hidden": 2.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.2920667827129364, + "step": 1190 + }, + { + "epoch": 0.03725, + "grad_norm": 5.0625, + "grad_norm_var": 0.14954020182291666, + "learning_rate": 0.0001, + "loss": 7.6194, + "loss/crossentropy": 2.57063090801239, + "loss/hidden": 1.99609375, + "loss/jsd": 0.0, + "loss/logits": 0.3052666634321213, + "step": 1192 + }, + { + "epoch": 0.0373125, + "grad_norm": 4.90625, + "grad_norm_var": 0.15653889973958332, + "learning_rate": 0.0001, + "loss": 7.9032, + "loss/crossentropy": 2.867217540740967, + "loss/hidden": 1.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.30750520527362823, + "step": 1194 + }, + { + "epoch": 0.037375, + "grad_norm": 5.3125, + "grad_norm_var": 0.11092122395833333, + "learning_rate": 0.0001, + "loss": 7.5179, + "loss/crossentropy": 2.601387858390808, + "loss/hidden": 1.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.292428120970726, + "step": 1196 + }, + { + "epoch": 0.0374375, + "grad_norm": 5.6875, + "grad_norm_var": 0.12561442057291666, + "learning_rate": 0.0001, + "loss": 8.2058, + "loss/crossentropy": 2.918960690498352, + "loss/hidden": 2.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.32165729999542236, + "step": 1198 + }, + { + "epoch": 0.0375, + "grad_norm": 4.8125, + "grad_norm_var": 0.17003580729166667, + "learning_rate": 0.0001, + "loss": 7.6882, + "loss/crossentropy": 2.5691174268722534, + "loss/hidden": 2.1015625, + "loss/jsd": 0.0, + "loss/logits": 0.3017522841691971, + "step": 1200 + }, + { + "epoch": 0.0375625, + "grad_norm": 4.875, + "grad_norm_var": 0.16925455729166666, + "learning_rate": 0.0001, + "loss": 7.5997, + "loss/crossentropy": 2.5958189964294434, + "loss/hidden": 2.046875, + "loss/jsd": 0.0, + "loss/logits": 0.2956975996494293, + "step": 1202 + }, + { + "epoch": 0.037625, + "grad_norm": 5.3125, + "grad_norm_var": 0.14905192057291666, + "learning_rate": 0.0001, + "loss": 7.9452, + "loss/crossentropy": 2.7279187440872192, + "loss/hidden": 2.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.31938889622688293, + "step": 1204 + }, + { + "epoch": 0.0376875, + "grad_norm": 5.09375, + "grad_norm_var": 0.13006184895833334, + "learning_rate": 0.0001, + "loss": 7.5916, + "loss/crossentropy": 2.6716209650039673, + "loss/hidden": 1.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2951197922229767, + "step": 1206 + }, + { + "epoch": 0.03775, + "grad_norm": 5.40625, + "grad_norm_var": 0.150390625, + "learning_rate": 0.0001, + "loss": 7.7079, + "loss/crossentropy": 2.808968424797058, + "loss/hidden": 1.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.29067111015319824, + "step": 1208 + }, + { + "epoch": 0.0378125, + "grad_norm": 5.375, + "grad_norm_var": 0.13433837890625, + "learning_rate": 0.0001, + "loss": 7.9013, + "loss/crossentropy": 2.709269404411316, + "loss/hidden": 2.046875, + "loss/jsd": 0.0, + "loss/logits": 0.3145200312137604, + "step": 1210 + }, + { + "epoch": 0.037875, + "grad_norm": 5.09375, + "grad_norm_var": 0.13411051432291668, + "learning_rate": 0.0001, + "loss": 7.62, + "loss/crossentropy": 2.611648201942444, + "loss/hidden": 2.02734375, + "loss/jsd": 0.0, + "loss/logits": 0.2980997562408447, + "step": 1212 + }, + { + "epoch": 0.0379375, + "grad_norm": 4.78125, + "grad_norm_var": 0.14256184895833332, + "learning_rate": 0.0001, + "loss": 7.4184, + "loss/crossentropy": 2.5281219482421875, + "loss/hidden": 2.015625, + "loss/jsd": 0.0, + "loss/logits": 0.28746722638607025, + "step": 1214 + }, + { + "epoch": 0.038, + "grad_norm": 5.4375, + "grad_norm_var": 0.122509765625, + "learning_rate": 0.0001, + "loss": 7.7106, + "loss/crossentropy": 2.614644765853882, + "loss/hidden": 2.03125, + "loss/jsd": 0.0, + "loss/logits": 0.30647440254688263, + "step": 1216 + }, + { + "epoch": 0.0380625, + "grad_norm": 4.8125, + "grad_norm_var": 0.09927978515625, + "learning_rate": 0.0001, + "loss": 7.4034, + "loss/crossentropy": 2.629401683807373, + "loss/hidden": 1.95703125, + "loss/jsd": 0.0, + "loss/logits": 0.2816943824291229, + "step": 1218 + }, + { + "epoch": 0.038125, + "grad_norm": 4.9375, + "grad_norm_var": 0.10702718098958333, + "learning_rate": 0.0001, + "loss": 7.5036, + "loss/crossentropy": 2.6855201721191406, + "loss/hidden": 2.0, + "loss/jsd": 0.0, + "loss/logits": 0.28180426359176636, + "step": 1220 + }, + { + "epoch": 0.0381875, + "grad_norm": 5.09375, + "grad_norm_var": 0.11998697916666666, + "learning_rate": 0.0001, + "loss": 7.6283, + "loss/crossentropy": 2.6732563972473145, + "loss/hidden": 1.96484375, + "loss/jsd": 0.0, + "loss/logits": 0.29901817440986633, + "step": 1222 + }, + { + "epoch": 0.03825, + "grad_norm": 5.3125, + "grad_norm_var": 0.10178629557291667, + "learning_rate": 0.0001, + "loss": 7.7806, + "loss/crossentropy": 2.826427936553955, + "loss/hidden": 2.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.29463550448417664, + "step": 1224 + }, + { + "epoch": 0.0383125, + "grad_norm": 4.875, + "grad_norm_var": 0.08203125, + "learning_rate": 0.0001, + "loss": 7.5518, + "loss/crossentropy": 2.679568648338318, + "loss/hidden": 1.97265625, + "loss/jsd": 0.0, + "loss/logits": 0.2899537980556488, + "step": 1226 + }, + { + "epoch": 0.038375, + "grad_norm": 10.9375, + "grad_norm_var": 2.25299072265625, + "learning_rate": 0.0001, + "loss": 7.8777, + "loss/crossentropy": 2.711169719696045, + "loss/hidden": 2.015625, + "loss/jsd": 0.0, + "loss/logits": 0.3150908648967743, + "step": 1228 + }, + { + "epoch": 0.0384375, + "grad_norm": 5.15625, + "grad_norm_var": 2.215234375, + "learning_rate": 0.0001, + "loss": 7.7375, + "loss/crossentropy": 2.791654109954834, + "loss/hidden": 1.96484375, + "loss/jsd": 0.0, + "loss/logits": 0.298099547624588, + "step": 1230 + }, + { + "epoch": 0.0385, + "grad_norm": 5.40625, + "grad_norm_var": 2.214774576822917, + "learning_rate": 0.0001, + "loss": 7.6766, + "loss/crossentropy": 2.561646342277527, + "loss/hidden": 2.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.3044602572917938, + "step": 1232 + }, + { + "epoch": 0.0385625, + "grad_norm": 4.65625, + "grad_norm_var": 2.287239583333333, + "learning_rate": 0.0001, + "loss": 7.1393, + "loss/crossentropy": 2.4994441270828247, + "loss/hidden": 1.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.26945771276950836, + "step": 1234 + }, + { + "epoch": 0.038625, + "grad_norm": 5.0625, + "grad_norm_var": 2.287040201822917, + "learning_rate": 0.0001, + "loss": 7.9031, + "loss/crossentropy": 2.8577252626419067, + "loss/hidden": 1.96875, + "loss/jsd": 0.0, + "loss/logits": 0.3076618164777756, + "step": 1236 + }, + { + "epoch": 0.0386875, + "grad_norm": 5.40625, + "grad_norm_var": 2.2509765625, + "learning_rate": 0.0001, + "loss": 7.9173, + "loss/crossentropy": 2.8302924633026123, + "loss/hidden": 2.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.3032368868589401, + "step": 1238 + }, + { + "epoch": 0.03875, + "grad_norm": 5.78125, + "grad_norm_var": 2.30924072265625, + "learning_rate": 0.0001, + "loss": 7.7384, + "loss/crossentropy": 2.829968810081482, + "loss/hidden": 1.98046875, + "loss/jsd": 0.0, + "loss/logits": 0.2927999347448349, + "step": 1240 + }, + { + "epoch": 0.0388125, + "grad_norm": 5.25, + "grad_norm_var": 2.2814453125, + "learning_rate": 0.0001, + "loss": 7.7618, + "loss/crossentropy": 2.735278010368347, + "loss/hidden": 2.046875, + "loss/jsd": 0.0, + "loss/logits": 0.2979622185230255, + "step": 1242 + }, + { + "epoch": 0.038875, + "grad_norm": 5.4375, + "grad_norm_var": 0.14737955729166666, + "learning_rate": 0.0001, + "loss": 7.7466, + "loss/crossentropy": 2.716152548789978, + "loss/hidden": 2.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.29913707077503204, + "step": 1244 + }, + { + "epoch": 0.0389375, + "grad_norm": 4.625, + "grad_norm_var": 0.14700113932291667, + "learning_rate": 0.0001, + "loss": 7.3482, + "loss/crossentropy": 2.594505190849304, + "loss/hidden": 1.9375, + "loss/jsd": 0.0, + "loss/logits": 0.2816225290298462, + "step": 1246 + }, + { + "epoch": 0.039, + "grad_norm": 5.40625, + "grad_norm_var": 0.14719645182291666, + "learning_rate": 0.0001, + "loss": 7.7186, + "loss/crossentropy": 2.7304205894470215, + "loss/hidden": 1.984375, + "loss/jsd": 0.0, + "loss/logits": 0.3003810793161392, + "step": 1248 + }, + { + "epoch": 0.0390625, + "grad_norm": 5.0, + "grad_norm_var": 0.113134765625, + "learning_rate": 0.0001, + "loss": 7.3257, + "loss/crossentropy": 2.4609283208847046, + "loss/hidden": 1.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.2872539907693863, + "step": 1250 + }, + { + "epoch": 0.039125, + "grad_norm": 5.25, + "grad_norm_var": 0.30038655598958336, + "learning_rate": 0.0001, + "loss": 7.8434, + "loss/crossentropy": 2.7610113620758057, + "loss/hidden": 2.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.3074617087841034, + "step": 1252 + }, + { + "epoch": 0.0391875, + "grad_norm": 5.09375, + "grad_norm_var": 0.29940999348958336, + "learning_rate": 0.0001, + "loss": 7.8855, + "loss/crossentropy": 2.828143000602722, + "loss/hidden": 1.95703125, + "loss/jsd": 0.0, + "loss/logits": 0.3100292235612869, + "step": 1254 + }, + { + "epoch": 0.03925, + "grad_norm": 5.625, + "grad_norm_var": 0.2684733072916667, + "learning_rate": 0.0001, + "loss": 7.429, + "loss/crossentropy": 2.530988335609436, + "loss/hidden": 1.98046875, + "loss/jsd": 0.0, + "loss/logits": 0.2917501926422119, + "step": 1256 + }, + { + "epoch": 0.0393125, + "grad_norm": 4.625, + "grad_norm_var": 0.2939453125, + "learning_rate": 0.0001, + "loss": 7.4206, + "loss/crossentropy": 2.5805318355560303, + "loss/hidden": 1.9375, + "loss/jsd": 0.0, + "loss/logits": 0.2902548015117645, + "step": 1258 + }, + { + "epoch": 0.039375, + "grad_norm": 5.0, + "grad_norm_var": 0.29010416666666666, + "learning_rate": 0.0001, + "loss": 7.7127, + "loss/crossentropy": 2.7116379737854004, + "loss/hidden": 1.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.3024456202983856, + "step": 1260 + }, + { + "epoch": 0.0394375, + "grad_norm": 5.125, + "grad_norm_var": 0.26038004557291666, + "learning_rate": 0.0001, + "loss": 7.9923, + "loss/crossentropy": 2.866301417350769, + "loss/hidden": 2.0, + "loss/jsd": 0.0, + "loss/logits": 0.3125974237918854, + "step": 1262 + }, + { + "epoch": 0.0395, + "grad_norm": 5.34375, + "grad_norm_var": 0.291650390625, + "learning_rate": 0.0001, + "loss": 7.584, + "loss/crossentropy": 2.6004990339279175, + "loss/hidden": 1.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.2991277277469635, + "step": 1264 + }, + { + "epoch": 0.0395625, + "grad_norm": 5.21875, + "grad_norm_var": 0.29542643229166665, + "learning_rate": 0.0001, + "loss": 7.8619, + "loss/crossentropy": 2.7568334341049194, + "loss/hidden": 2.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.3050409257411957, + "step": 1266 + }, + { + "epoch": 0.039625, + "grad_norm": 5.53125, + "grad_norm_var": 0.11887613932291667, + "learning_rate": 0.0001, + "loss": 8.04, + "loss/crossentropy": 2.8222177028656006, + "loss/hidden": 2.01953125, + "loss/jsd": 0.0, + "loss/logits": 0.31982044875621796, + "step": 1268 + }, + { + "epoch": 0.0396875, + "grad_norm": 4.625, + "grad_norm_var": 0.144921875, + "learning_rate": 0.0001, + "loss": 7.9355, + "loss/crossentropy": 2.8479357957839966, + "loss/hidden": 1.984375, + "loss/jsd": 0.0, + "loss/logits": 0.31032055616378784, + "step": 1270 + }, + { + "epoch": 0.03975, + "grad_norm": 5.1875, + "grad_norm_var": 2.83492431640625, + "learning_rate": 0.0001, + "loss": 7.5238, + "loss/crossentropy": 2.5936185121536255, + "loss/hidden": 1.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2961418032646179, + "step": 1272 + }, + { + "epoch": 0.0398125, + "grad_norm": 5.65625, + "grad_norm_var": 2.822066243489583, + "learning_rate": 0.0001, + "loss": 8.0667, + "loss/crossentropy": 2.8308135271072388, + "loss/hidden": 2.03125, + "loss/jsd": 0.0, + "loss/logits": 0.32046134769916534, + "step": 1274 + }, + { + "epoch": 0.039875, + "grad_norm": 5.125, + "grad_norm_var": 2.891259765625, + "learning_rate": 0.0001, + "loss": 7.4237, + "loss/crossentropy": 2.5655672550201416, + "loss/hidden": 1.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2912827283143997, + "step": 1276 + }, + { + "epoch": 0.0399375, + "grad_norm": 5.0625, + "grad_norm_var": 2.91734619140625, + "learning_rate": 0.0001, + "loss": 7.68, + "loss/crossentropy": 2.714452862739563, + "loss/hidden": 1.99609375, + "loss/jsd": 0.0, + "loss/logits": 0.2969430685043335, + "step": 1278 + }, + { + "epoch": 0.04, + "grad_norm": 7.09375, + "grad_norm_var": 2.90718994140625, + "learning_rate": 0.0001, + "loss": 7.5165, + "loss/crossentropy": 2.4779699444770813, + "loss/hidden": 2.046875, + "loss/jsd": 0.0, + "loss/logits": 0.2991633117198944, + "step": 1280 + }, + { + "epoch": 0.0400625, + "grad_norm": 5.625, + "grad_norm_var": 2.8809529622395833, + "learning_rate": 0.0001, + "loss": 7.6352, + "loss/crossentropy": 2.7353172302246094, + "loss/hidden": 1.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2931157201528549, + "step": 1282 + }, + { + "epoch": 0.040125, + "grad_norm": 6.3125, + "grad_norm_var": 2.894462076822917, + "learning_rate": 0.0001, + "loss": 7.668, + "loss/crossentropy": 2.70292329788208, + "loss/hidden": 2.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.295722633600235, + "step": 1284 + }, + { + "epoch": 0.0401875, + "grad_norm": 4.875, + "grad_norm_var": 2.886031087239583, + "learning_rate": 0.0001, + "loss": 7.5251, + "loss/crossentropy": 2.6100287437438965, + "loss/hidden": 2.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.29073064029216766, + "step": 1286 + }, + { + "epoch": 0.04025, + "grad_norm": 5.21875, + "grad_norm_var": 0.698681640625, + "learning_rate": 0.0001, + "loss": 8.0385, + "loss/crossentropy": 2.6995733976364136, + "loss/hidden": 2.0625, + "loss/jsd": 0.0, + "loss/logits": 0.3276410549879074, + "step": 1288 + }, + { + "epoch": 0.0403125, + "grad_norm": 4.875, + "grad_norm_var": 0.6340983072916667, + "learning_rate": 0.0001, + "loss": 7.5085, + "loss/crossentropy": 2.571585774421692, + "loss/hidden": 2.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.29291462898254395, + "step": 1290 + }, + { + "epoch": 0.040375, + "grad_norm": 6.03125, + "grad_norm_var": 0.6041951497395833, + "learning_rate": 0.0001, + "loss": 7.5429, + "loss/crossentropy": 2.6209115982055664, + "loss/hidden": 2.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.29141952097415924, + "step": 1292 + }, + { + "epoch": 0.0404375, + "grad_norm": 5.75, + "grad_norm_var": 0.5770792643229167, + "learning_rate": 0.0001, + "loss": 7.5796, + "loss/crossentropy": 2.5977821350097656, + "loss/hidden": 1.98828125, + "loss/jsd": 0.0, + "loss/logits": 0.299348846077919, + "step": 1294 + }, + { + "epoch": 0.0405, + "grad_norm": 5.5, + "grad_norm_var": 0.43292643229166666, + "learning_rate": 0.0001, + "loss": 7.8398, + "loss/crossentropy": 2.8123137950897217, + "loss/hidden": 2.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.2988448143005371, + "step": 1296 + }, + { + "epoch": 0.0405625, + "grad_norm": 6.0625, + "grad_norm_var": 0.4564737955729167, + "learning_rate": 0.0001, + "loss": 8.0537, + "loss/crossentropy": 2.8605915307998657, + "loss/hidden": 2.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.3138464093208313, + "step": 1298 + }, + { + "epoch": 0.040625, + "grad_norm": 5.8125, + "grad_norm_var": 0.41858317057291666, + "learning_rate": 0.0001, + "loss": 7.4912, + "loss/crossentropy": 2.502464771270752, + "loss/hidden": 1.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.29965806007385254, + "step": 1300 + }, + { + "epoch": 0.0406875, + "grad_norm": 4.84375, + "grad_norm_var": 0.41389567057291665, + "learning_rate": 0.0001, + "loss": 7.9667, + "loss/crossentropy": 2.812674403190613, + "loss/hidden": 2.01953125, + "loss/jsd": 0.0, + "loss/logits": 0.3134448826313019, + "step": 1302 + }, + { + "epoch": 0.04075, + "grad_norm": 5.0, + "grad_norm_var": 0.16886393229166666, + "learning_rate": 0.0001, + "loss": 7.5189, + "loss/crossentropy": 2.5777794122695923, + "loss/hidden": 2.046875, + "loss/jsd": 0.0, + "loss/logits": 0.2894267141819, + "step": 1304 + }, + { + "epoch": 0.0408125, + "grad_norm": 5.25, + "grad_norm_var": 0.30690104166666665, + "learning_rate": 0.0001, + "loss": 7.9687, + "loss/crossentropy": 2.8120635747909546, + "loss/hidden": 2.046875, + "loss/jsd": 0.0, + "loss/logits": 0.3109755218029022, + "step": 1306 + }, + { + "epoch": 0.040875, + "grad_norm": 4.96875, + "grad_norm_var": 0.2977701822916667, + "learning_rate": 0.0001, + "loss": 7.529, + "loss/crossentropy": 2.723876118659973, + "loss/hidden": 1.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.2812942564487457, + "step": 1308 + }, + { + "epoch": 0.0409375, + "grad_norm": 5.28125, + "grad_norm_var": 0.2884724934895833, + "learning_rate": 0.0001, + "loss": 7.9555, + "loss/crossentropy": 2.883132815361023, + "loss/hidden": 2.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.30488817393779755, + "step": 1310 + }, + { + "epoch": 0.041, + "grad_norm": 5.3125, + "grad_norm_var": 0.31438802083333334, + "learning_rate": 0.0001, + "loss": 7.2809, + "loss/crossentropy": 2.370841383934021, + "loss/hidden": 1.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2933536022901535, + "step": 1312 + }, + { + "epoch": 0.0410625, + "grad_norm": 5.46875, + "grad_norm_var": 0.3129191080729167, + "learning_rate": 0.0001, + "loss": 7.2967, + "loss/crossentropy": 2.5285329818725586, + "loss/hidden": 1.984375, + "loss/jsd": 0.0, + "loss/logits": 0.27837996184825897, + "step": 1314 + }, + { + "epoch": 0.041125, + "grad_norm": 5.46875, + "grad_norm_var": 0.29664306640625, + "learning_rate": 0.0001, + "loss": 7.8327, + "loss/crossentropy": 2.757304310798645, + "loss/hidden": 1.98828125, + "loss/jsd": 0.0, + "loss/logits": 0.30871395766735077, + "step": 1316 + }, + { + "epoch": 0.0411875, + "grad_norm": 5.0625, + "grad_norm_var": 0.27810872395833336, + "learning_rate": 0.0001, + "loss": 7.576, + "loss/crossentropy": 2.7341228723526, + "loss/hidden": 1.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2865341305732727, + "step": 1318 + }, + { + "epoch": 0.04125, + "grad_norm": 4.875, + "grad_norm_var": 0.284765625, + "learning_rate": 0.0001, + "loss": 7.3078, + "loss/crossentropy": 2.576223611831665, + "loss/hidden": 1.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.27549774944782257, + "step": 1320 + }, + { + "epoch": 0.0413125, + "grad_norm": 4.59375, + "grad_norm_var": 0.08121337890625, + "learning_rate": 0.0001, + "loss": 7.2132, + "loss/crossentropy": 2.5026360750198364, + "loss/hidden": 1.96484375, + "loss/jsd": 0.0, + "loss/logits": 0.2745731547474861, + "step": 1322 + }, + { + "epoch": 0.041375, + "grad_norm": 5.3125, + "grad_norm_var": 0.08599853515625, + "learning_rate": 0.0001, + "loss": 7.0476, + "loss/crossentropy": 2.394889712333679, + "loss/hidden": 1.92578125, + "loss/jsd": 0.0, + "loss/logits": 0.27269674837589264, + "step": 1324 + }, + { + "epoch": 0.0414375, + "grad_norm": 4.59375, + "grad_norm_var": 0.26248372395833336, + "learning_rate": 0.0001, + "loss": 7.624, + "loss/crossentropy": 2.6320308446884155, + "loss/hidden": 2.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.29373273253440857, + "step": 1326 + }, + { + "epoch": 0.0415, + "grad_norm": 5.21875, + "grad_norm_var": 0.28046875, + "learning_rate": 0.0001, + "loss": 7.9352, + "loss/crossentropy": 2.927828311920166, + "loss/hidden": 2.0, + "loss/jsd": 0.0, + "loss/logits": 0.300733357667923, + "step": 1328 + }, + { + "epoch": 0.0415625, + "grad_norm": 6.6875, + "grad_norm_var": 0.39078369140625, + "learning_rate": 0.0001, + "loss": 7.9079, + "loss/crossentropy": 2.6833643913269043, + "loss/hidden": 2.01171875, + "loss/jsd": 0.0, + "loss/logits": 0.3212866932153702, + "step": 1330 + }, + { + "epoch": 0.041625, + "grad_norm": 5.75, + "grad_norm_var": 0.40312093098958335, + "learning_rate": 0.0001, + "loss": 8.1211, + "loss/crossentropy": 2.897345185279846, + "loss/hidden": 2.0703125, + "loss/jsd": 0.0, + "loss/logits": 0.31534482538700104, + "step": 1332 + }, + { + "epoch": 0.0416875, + "grad_norm": 5.46875, + "grad_norm_var": 0.38917643229166665, + "learning_rate": 0.0001, + "loss": 7.7456, + "loss/crossentropy": 2.584378242492676, + "loss/hidden": 1.984375, + "loss/jsd": 0.0, + "loss/logits": 0.3176807314157486, + "step": 1334 + }, + { + "epoch": 0.04175, + "grad_norm": 5.15625, + "grad_norm_var": 0.3631510416666667, + "learning_rate": 0.0001, + "loss": 7.6088, + "loss/crossentropy": 2.5694063901901245, + "loss/hidden": 2.03125, + "loss/jsd": 0.0, + "loss/logits": 0.3008142113685608, + "step": 1336 + }, + { + "epoch": 0.0418125, + "grad_norm": 4.34375, + "grad_norm_var": 0.405322265625, + "learning_rate": 0.0001, + "loss": 7.2625, + "loss/crossentropy": 2.5970876216888428, + "loss/hidden": 1.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.27201347053050995, + "step": 1338 + }, + { + "epoch": 0.041875, + "grad_norm": 5.03125, + "grad_norm_var": 0.41627197265625, + "learning_rate": 0.0001, + "loss": 7.5859, + "loss/crossentropy": 2.6498775482177734, + "loss/hidden": 2.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.2896941006183624, + "step": 1340 + }, + { + "epoch": 0.0419375, + "grad_norm": 6.53125, + "grad_norm_var": 0.4341796875, + "learning_rate": 0.0001, + "loss": 7.1471, + "loss/crossentropy": 2.4250807762145996, + "loss/hidden": 2.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.2698608785867691, + "step": 1342 + }, + { + "epoch": 0.042, + "grad_norm": 5.34375, + "grad_norm_var": 0.4249959309895833, + "learning_rate": 0.0001, + "loss": 7.8104, + "loss/crossentropy": 2.8312790393829346, + "loss/hidden": 1.984375, + "loss/jsd": 0.0, + "loss/logits": 0.2994709312915802, + "step": 1344 + }, + { + "epoch": 0.0420625, + "grad_norm": 4.78125, + "grad_norm_var": 0.33548177083333336, + "learning_rate": 0.0001, + "loss": 7.5373, + "loss/crossentropy": 2.687604069709778, + "loss/hidden": 1.953125, + "loss/jsd": 0.0, + "loss/logits": 0.28966057300567627, + "step": 1346 + }, + { + "epoch": 0.042125, + "grad_norm": 4.84375, + "grad_norm_var": 0.33264567057291666, + "learning_rate": 0.0001, + "loss": 7.5791, + "loss/crossentropy": 2.6224461793899536, + "loss/hidden": 2.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.2933178097009659, + "step": 1348 + }, + { + "epoch": 0.0421875, + "grad_norm": 5.0, + "grad_norm_var": 0.350244140625, + "learning_rate": 0.0001, + "loss": 7.8897, + "loss/crossentropy": 2.7649588584899902, + "loss/hidden": 2.01171875, + "loss/jsd": 0.0, + "loss/logits": 0.311299666762352, + "step": 1350 + }, + { + "epoch": 0.04225, + "grad_norm": 4.78125, + "grad_norm_var": 0.31842041015625, + "learning_rate": 0.0001, + "loss": 7.5599, + "loss/crossentropy": 2.6638940572738647, + "loss/hidden": 1.94140625, + "loss/jsd": 0.0, + "loss/logits": 0.29546037316322327, + "step": 1352 + }, + { + "epoch": 0.0423125, + "grad_norm": 5.15625, + "grad_norm_var": 0.2779947916666667, + "learning_rate": 0.0001, + "loss": 7.7463, + "loss/crossentropy": 2.7206530570983887, + "loss/hidden": 2.0, + "loss/jsd": 0.0, + "loss/logits": 0.30256910622119904, + "step": 1354 + }, + { + "epoch": 0.042375, + "grad_norm": 5.25, + "grad_norm_var": 0.271728515625, + "learning_rate": 0.0001, + "loss": 7.5532, + "loss/crossentropy": 2.679394841194153, + "loss/hidden": 1.92578125, + "loss/jsd": 0.0, + "loss/logits": 0.29480086266994476, + "step": 1356 + }, + { + "epoch": 0.0424375, + "grad_norm": 5.125, + "grad_norm_var": 0.12278645833333333, + "learning_rate": 0.0001, + "loss": 7.4359, + "loss/crossentropy": 2.6582494974136353, + "loss/hidden": 1.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.28323106467723846, + "step": 1358 + }, + { + "epoch": 0.0425, + "grad_norm": 4.9375, + "grad_norm_var": 0.13513997395833333, + "learning_rate": 0.0001, + "loss": 7.5235, + "loss/crossentropy": 2.6920082569122314, + "loss/hidden": 1.94921875, + "loss/jsd": 0.0, + "loss/logits": 0.2882232964038849, + "step": 1360 + }, + { + "epoch": 0.0425625, + "grad_norm": 5.09375, + "grad_norm_var": 0.19777018229166668, + "learning_rate": 0.0001, + "loss": 7.6063, + "loss/crossentropy": 2.6489486694335938, + "loss/hidden": 2.0625, + "loss/jsd": 0.0, + "loss/logits": 0.2894832342863083, + "step": 1362 + }, + { + "epoch": 0.042625, + "grad_norm": 5.1875, + "grad_norm_var": 0.16953125, + "learning_rate": 0.0001, + "loss": 7.7816, + "loss/crossentropy": 2.7566269636154175, + "loss/hidden": 1.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.3048381954431534, + "step": 1364 + }, + { + "epoch": 0.0426875, + "grad_norm": 4.71875, + "grad_norm_var": 0.14920247395833333, + "learning_rate": 0.0001, + "loss": 7.3469, + "loss/crossentropy": 2.634415030479431, + "loss/hidden": 1.9375, + "loss/jsd": 0.0, + "loss/logits": 0.27750201523303986, + "step": 1366 + }, + { + "epoch": 0.04275, + "grad_norm": 4.9375, + "grad_norm_var": 0.15572509765625, + "learning_rate": 0.0001, + "loss": 7.5947, + "loss/crossentropy": 2.707185387611389, + "loss/hidden": 2.01953125, + "loss/jsd": 0.0, + "loss/logits": 0.2867947816848755, + "step": 1368 + }, + { + "epoch": 0.0428125, + "grad_norm": 4.71875, + "grad_norm_var": 0.15575764973958334, + "learning_rate": 0.0001, + "loss": 7.1529, + "loss/crossentropy": 2.4458247423171997, + "loss/hidden": 2.0, + "loss/jsd": 0.0, + "loss/logits": 0.27070252597332, + "step": 1370 + }, + { + "epoch": 0.042875, + "grad_norm": 4.84375, + "grad_norm_var": 0.16929931640625, + "learning_rate": 0.0001, + "loss": 7.482, + "loss/crossentropy": 2.642867088317871, + "loss/hidden": 2.0625, + "loss/jsd": 0.0, + "loss/logits": 0.2776651754975319, + "step": 1372 + }, + { + "epoch": 0.0429375, + "grad_norm": 4.59375, + "grad_norm_var": 0.17170817057291668, + "learning_rate": 0.0001, + "loss": 7.7102, + "loss/crossentropy": 2.722243070602417, + "loss/hidden": 2.015625, + "loss/jsd": 0.0, + "loss/logits": 0.2972361445426941, + "step": 1374 + }, + { + "epoch": 0.043, + "grad_norm": 4.84375, + "grad_norm_var": 0.16534830729166666, + "learning_rate": 0.0001, + "loss": 7.3228, + "loss/crossentropy": 2.5705126523971558, + "loss/hidden": 1.921875, + "loss/jsd": 0.0, + "loss/logits": 0.28303705155849457, + "step": 1376 + }, + { + "epoch": 0.0430625, + "grad_norm": 5.0, + "grad_norm_var": 0.06278889973958333, + "learning_rate": 0.0001, + "loss": 7.5588, + "loss/crossentropy": 2.6257171630859375, + "loss/hidden": 1.98046875, + "loss/jsd": 0.0, + "loss/logits": 0.295265793800354, + "step": 1378 + }, + { + "epoch": 0.043125, + "grad_norm": 5.375, + "grad_norm_var": 0.07203369140625, + "learning_rate": 0.0001, + "loss": 7.7361, + "loss/crossentropy": 2.775424599647522, + "loss/hidden": 1.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2984114736318588, + "step": 1380 + }, + { + "epoch": 0.0431875, + "grad_norm": 6.9375, + "grad_norm_var": 0.311181640625, + "learning_rate": 0.0001, + "loss": 7.899, + "loss/crossentropy": 2.7836259603500366, + "loss/hidden": 2.03125, + "loss/jsd": 0.0, + "loss/logits": 0.3084150403738022, + "step": 1382 + }, + { + "epoch": 0.04325, + "grad_norm": 4.84375, + "grad_norm_var": 0.29814046223958335, + "learning_rate": 0.0001, + "loss": 7.7099, + "loss/crossentropy": 2.850728750228882, + "loss/hidden": 1.9375, + "loss/jsd": 0.0, + "loss/logits": 0.2921678125858307, + "step": 1384 + }, + { + "epoch": 0.0433125, + "grad_norm": 4.84375, + "grad_norm_var": 0.29179280598958335, + "learning_rate": 0.0001, + "loss": 7.3731, + "loss/crossentropy": 2.5699057579040527, + "loss/hidden": 1.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2834426909685135, + "step": 1386 + }, + { + "epoch": 0.043375, + "grad_norm": 4.84375, + "grad_norm_var": 0.28982747395833336, + "learning_rate": 0.0001, + "loss": 7.3414, + "loss/crossentropy": 2.566841721534729, + "loss/hidden": 1.97265625, + "loss/jsd": 0.0, + "loss/logits": 0.28018754720687866, + "step": 1388 + }, + { + "epoch": 0.0434375, + "grad_norm": 4.90625, + "grad_norm_var": 0.29019775390625, + "learning_rate": 0.0001, + "loss": 7.627, + "loss/crossentropy": 2.803957223892212, + "loss/hidden": 1.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2846508026123047, + "step": 1390 + }, + { + "epoch": 0.0435, + "grad_norm": 4.8125, + "grad_norm_var": 0.28404541015625, + "learning_rate": 0.0001, + "loss": 7.3805, + "loss/crossentropy": 2.4680495262145996, + "loss/hidden": 2.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.28734005987644196, + "step": 1392 + }, + { + "epoch": 0.0435625, + "grad_norm": 9.5625, + "grad_norm_var": 1.4954060872395833, + "learning_rate": 0.0001, + "loss": 7.6419, + "loss/crossentropy": 2.6241623163223267, + "loss/hidden": 2.0625, + "loss/jsd": 0.0, + "loss/logits": 0.29552070796489716, + "step": 1394 + }, + { + "epoch": 0.043625, + "grad_norm": 4.84375, + "grad_norm_var": 1.5164998372395833, + "learning_rate": 0.0001, + "loss": 7.0692, + "loss/crossentropy": 2.2776577472686768, + "loss/hidden": 1.984375, + "loss/jsd": 0.0, + "loss/logits": 0.2807151973247528, + "step": 1396 + }, + { + "epoch": 0.0436875, + "grad_norm": 4.875, + "grad_norm_var": 1.3636555989583334, + "learning_rate": 0.0001, + "loss": 7.6524, + "loss/crossentropy": 2.6224864721298218, + "loss/hidden": 2.0859375, + "loss/jsd": 0.0, + "loss/logits": 0.29439981281757355, + "step": 1398 + }, + { + "epoch": 0.04375, + "grad_norm": 4.71875, + "grad_norm_var": 1.3757771809895833, + "learning_rate": 0.0001, + "loss": 7.5251, + "loss/crossentropy": 2.6127312183380127, + "loss/hidden": 1.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.29357995092868805, + "step": 1400 + }, + { + "epoch": 0.0438125, + "grad_norm": 5.25, + "grad_norm_var": 1.3815388997395834, + "learning_rate": 0.0001, + "loss": 7.6992, + "loss/crossentropy": 2.692687511444092, + "loss/hidden": 2.046875, + "loss/jsd": 0.0, + "loss/logits": 0.2959606945514679, + "step": 1402 + }, + { + "epoch": 0.043875, + "grad_norm": 4.75, + "grad_norm_var": 1.3755818684895833, + "learning_rate": 0.0001, + "loss": 7.694, + "loss/crossentropy": 2.7359989881515503, + "loss/hidden": 1.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.3028276115655899, + "step": 1404 + }, + { + "epoch": 0.0439375, + "grad_norm": 5.125, + "grad_norm_var": 1.3676920572916667, + "learning_rate": 0.0001, + "loss": 7.5041, + "loss/crossentropy": 2.6671664714813232, + "loss/hidden": 1.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.2875981330871582, + "step": 1406 + }, + { + "epoch": 0.044, + "grad_norm": 4.71875, + "grad_norm_var": 1.4091145833333334, + "learning_rate": 0.0001, + "loss": 7.2373, + "loss/crossentropy": 2.509815812110901, + "loss/hidden": 1.9375, + "loss/jsd": 0.0, + "loss/logits": 0.2789994776248932, + "step": 1408 + }, + { + "epoch": 0.0440625, + "grad_norm": 5.09375, + "grad_norm_var": 0.06513264973958334, + "learning_rate": 0.0001, + "loss": 7.7282, + "loss/crossentropy": 2.7103008031845093, + "loss/hidden": 2.01171875, + "loss/jsd": 0.0, + "loss/logits": 0.3006156384944916, + "step": 1410 + }, + { + "epoch": 0.044125, + "grad_norm": 5.15625, + "grad_norm_var": 0.06676025390625, + "learning_rate": 0.0001, + "loss": 7.5621, + "loss/crossentropy": 2.7707459926605225, + "loss/hidden": 2.0, + "loss/jsd": 0.0, + "loss/logits": 0.2791343182325363, + "step": 1412 + }, + { + "epoch": 0.0441875, + "grad_norm": 5.3125, + "grad_norm_var": 0.07636311848958334, + "learning_rate": 0.0001, + "loss": 7.5706, + "loss/crossentropy": 2.721479654312134, + "loss/hidden": 1.98046875, + "loss/jsd": 0.0, + "loss/logits": 0.2868632972240448, + "step": 1414 + }, + { + "epoch": 0.04425, + "grad_norm": 4.65625, + "grad_norm_var": 0.08456624348958333, + "learning_rate": 0.0001, + "loss": 7.2394, + "loss/crossentropy": 2.603902578353882, + "loss/hidden": 1.92578125, + "loss/jsd": 0.0, + "loss/logits": 0.2709691673517227, + "step": 1416 + }, + { + "epoch": 0.0443125, + "grad_norm": 4.3125, + "grad_norm_var": 0.10087483723958333, + "learning_rate": 0.0001, + "loss": 6.9187, + "loss/crossentropy": 2.4536471366882324, + "loss/hidden": 1.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.25783771276474, + "step": 1418 + }, + { + "epoch": 0.044375, + "grad_norm": 4.78125, + "grad_norm_var": 0.08489176432291666, + "learning_rate": 0.0001, + "loss": 6.9662, + "loss/crossentropy": 2.4370354413986206, + "loss/hidden": 1.94921875, + "loss/jsd": 0.0, + "loss/logits": 0.25799560546875, + "step": 1420 + }, + { + "epoch": 0.0444375, + "grad_norm": 4.84375, + "grad_norm_var": 0.09192301432291666, + "learning_rate": 0.0001, + "loss": 7.5708, + "loss/crossentropy": 2.74661386013031, + "loss/hidden": 1.9375, + "loss/jsd": 0.0, + "loss/logits": 0.2886638045310974, + "step": 1422 + }, + { + "epoch": 0.0445, + "grad_norm": 5.1875, + "grad_norm_var": 0.13909098307291667, + "learning_rate": 0.0001, + "loss": 7.7308, + "loss/crossentropy": 2.716087222099304, + "loss/hidden": 1.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.3022549897432327, + "step": 1424 + }, + { + "epoch": 0.0445625, + "grad_norm": 4.625, + "grad_norm_var": 0.14034830729166667, + "learning_rate": 0.0001, + "loss": 7.4306, + "loss/crossentropy": 2.7329652309417725, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.27796949446201324, + "step": 1426 + }, + { + "epoch": 0.044625, + "grad_norm": 4.875, + "grad_norm_var": 0.15035400390625, + "learning_rate": 0.0001, + "loss": 7.4232, + "loss/crossentropy": 2.7089322805404663, + "loss/hidden": 1.96484375, + "loss/jsd": 0.0, + "loss/logits": 0.27493977546691895, + "step": 1428 + }, + { + "epoch": 0.0446875, + "grad_norm": 4.375, + "grad_norm_var": 0.162353515625, + "learning_rate": 0.0001, + "loss": 7.1642, + "loss/crossentropy": 2.5490864515304565, + "loss/hidden": 1.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2638545036315918, + "step": 1430 + }, + { + "epoch": 0.04475, + "grad_norm": 5.15625, + "grad_norm_var": 0.22652587890625, + "learning_rate": 0.0001, + "loss": 8.1627, + "loss/crossentropy": 2.870542287826538, + "loss/hidden": 2.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.32375185191631317, + "step": 1432 + }, + { + "epoch": 0.0448125, + "grad_norm": 5.0, + "grad_norm_var": 0.19529622395833332, + "learning_rate": 0.0001, + "loss": 7.4056, + "loss/crossentropy": 2.5882151126861572, + "loss/hidden": 1.95703125, + "loss/jsd": 0.0, + "loss/logits": 0.2860320508480072, + "step": 1434 + }, + { + "epoch": 0.044875, + "grad_norm": 6.09375, + "grad_norm_var": 0.2537760416666667, + "learning_rate": 0.0001, + "loss": 7.894, + "loss/crossentropy": 2.7808961868286133, + "loss/hidden": 2.03125, + "loss/jsd": 0.0, + "loss/logits": 0.30818620324134827, + "step": 1436 + }, + { + "epoch": 0.0449375, + "grad_norm": 5.9375, + "grad_norm_var": 0.2899576822916667, + "learning_rate": 0.0001, + "loss": 7.9565, + "loss/crossentropy": 2.932820439338684, + "loss/hidden": 2.01953125, + "loss/jsd": 0.0, + "loss/logits": 0.30041807889938354, + "step": 1438 + }, + { + "epoch": 0.045, + "grad_norm": 5.25, + "grad_norm_var": 0.268212890625, + "learning_rate": 0.0001, + "loss": 7.5398, + "loss/crossentropy": 2.500837206840515, + "loss/hidden": 2.03515625, + "loss/jsd": 0.0, + "loss/logits": 0.30038216710090637, + "step": 1440 + }, + { + "epoch": 0.0450625, + "grad_norm": 6.1875, + "grad_norm_var": 0.2953409830729167, + "learning_rate": 0.0001, + "loss": 7.5575, + "loss/crossentropy": 2.5460762977600098, + "loss/hidden": 2.03125, + "loss/jsd": 0.0, + "loss/logits": 0.29801732301712036, + "step": 1442 + }, + { + "epoch": 0.045125, + "grad_norm": 5.0, + "grad_norm_var": 0.23290608723958334, + "learning_rate": 0.0001, + "loss": 8.0496, + "loss/crossentropy": 2.860416531562805, + "loss/hidden": 2.0078125, + "loss/jsd": 0.0, + "loss/logits": 0.3181406408548355, + "step": 1444 + }, + { + "epoch": 0.0451875, + "grad_norm": 5.5, + "grad_norm_var": 0.19029947916666667, + "learning_rate": 0.0001, + "loss": 7.5155, + "loss/crossentropy": 2.6714009046554565, + "loss/hidden": 1.9375, + "loss/jsd": 0.0, + "loss/logits": 0.29065585136413574, + "step": 1446 + }, + { + "epoch": 0.04525, + "grad_norm": 5.125, + "grad_norm_var": 0.17509358723958332, + "learning_rate": 0.0001, + "loss": 7.2614, + "loss/crossentropy": 2.548215627670288, + "loss/hidden": 1.953125, + "loss/jsd": 0.0, + "loss/logits": 0.2760085165500641, + "step": 1448 + }, + { + "epoch": 0.0453125, + "grad_norm": 4.96875, + "grad_norm_var": 0.18644205729166666, + "learning_rate": 0.0001, + "loss": 7.7885, + "loss/crossentropy": 2.8646936416625977, + "loss/hidden": 1.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.2931659668684006, + "step": 1450 + }, + { + "epoch": 0.045375, + "grad_norm": 5.25, + "grad_norm_var": 0.17561442057291668, + "learning_rate": 0.0001, + "loss": 7.5222, + "loss/crossentropy": 2.7001854181289673, + "loss/hidden": 1.93359375, + "loss/jsd": 0.0, + "loss/logits": 0.28884437680244446, + "step": 1452 + }, + { + "epoch": 0.0454375, + "grad_norm": 5.375, + "grad_norm_var": 0.14674479166666668, + "learning_rate": 0.0001, + "loss": 7.3909, + "loss/crossentropy": 2.587071418762207, + "loss/hidden": 2.0234375, + "loss/jsd": 0.0, + "loss/logits": 0.2780349850654602, + "step": 1454 + }, + { + "epoch": 0.0455, + "grad_norm": 4.96875, + "grad_norm_var": 0.15011393229166667, + "learning_rate": 0.0001, + "loss": 7.6218, + "loss/crossentropy": 2.7357864379882812, + "loss/hidden": 1.984375, + "loss/jsd": 0.0, + "loss/logits": 0.2901657521724701, + "step": 1456 + }, + { + "epoch": 0.0455625, + "grad_norm": 5.25, + "grad_norm_var": 0.08912353515625, + "learning_rate": 0.0001, + "loss": 7.8304, + "loss/crossentropy": 2.845029592514038, + "loss/hidden": 1.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.2993154674768448, + "step": 1458 + }, + { + "epoch": 0.045625, + "grad_norm": 4.9375, + "grad_norm_var": 0.07277018229166667, + "learning_rate": 0.0001, + "loss": 7.627, + "loss/crossentropy": 2.732051730155945, + "loss/hidden": 1.98046875, + "loss/jsd": 0.0, + "loss/logits": 0.2914441227912903, + "step": 1460 + }, + { + "epoch": 0.0456875, + "grad_norm": 4.84375, + "grad_norm_var": 0.09303385416666667, + "learning_rate": 0.0001, + "loss": 7.2475, + "loss/crossentropy": 2.6173033714294434, + "loss/hidden": 1.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.27278298139572144, + "step": 1462 + }, + { + "epoch": 0.04575, + "grad_norm": 4.625, + "grad_norm_var": 0.10012613932291667, + "learning_rate": 0.0001, + "loss": 7.2154, + "loss/crossentropy": 2.5818458795547485, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.27155643701553345, + "step": 1464 + }, + { + "epoch": 0.0458125, + "grad_norm": 4.875, + "grad_norm_var": 0.10858968098958334, + "learning_rate": 0.0001, + "loss": 7.3674, + "loss/crossentropy": 2.595680594444275, + "loss/hidden": 1.921875, + "loss/jsd": 0.0, + "loss/logits": 0.2849871814250946, + "step": 1466 + }, + { + "epoch": 0.045875, + "grad_norm": 5.0625, + "grad_norm_var": 0.09620768229166667, + "learning_rate": 0.0001, + "loss": 7.5837, + "loss/crossentropy": 2.663433074951172, + "loss/hidden": 1.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.29280896484851837, + "step": 1468 + }, + { + "epoch": 0.0459375, + "grad_norm": 5.21875, + "grad_norm_var": 0.071728515625, + "learning_rate": 0.0001, + "loss": 7.2683, + "loss/crossentropy": 2.5311508178710938, + "loss/hidden": 1.93359375, + "loss/jsd": 0.0, + "loss/logits": 0.2803528904914856, + "step": 1470 + }, + { + "epoch": 0.046, + "grad_norm": 4.5625, + "grad_norm_var": 0.07107747395833333, + "learning_rate": 0.0001, + "loss": 7.5451, + "loss/crossentropy": 2.698567032814026, + "loss/hidden": 1.921875, + "loss/jsd": 0.0, + "loss/logits": 0.292463943362236, + "step": 1472 + }, + { + "epoch": 0.0460625, + "grad_norm": 5.8125, + "grad_norm_var": 0.12903238932291666, + "learning_rate": 0.0001, + "loss": 7.2625, + "loss/crossentropy": 2.4582966566085815, + "loss/hidden": 2.00390625, + "loss/jsd": 0.0, + "loss/logits": 0.2800302058458328, + "step": 1474 + }, + { + "epoch": 0.046125, + "grad_norm": 4.90625, + "grad_norm_var": 0.12496337890625, + "learning_rate": 0.0001, + "loss": 7.458, + "loss/crossentropy": 2.7098604440689087, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.2830130606889725, + "step": 1476 + }, + { + "epoch": 0.0461875, + "grad_norm": 5.90625, + "grad_norm_var": 0.2095703125, + "learning_rate": 0.0001, + "loss": 8.0177, + "loss/crossentropy": 2.9519091844558716, + "loss/hidden": 1.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.30735622346401215, + "step": 1478 + }, + { + "epoch": 0.04625, + "grad_norm": 4.78125, + "grad_norm_var": 0.21249593098958333, + "learning_rate": 0.0001, + "loss": 7.5442, + "loss/crossentropy": 2.6207966804504395, + "loss/hidden": 1.94140625, + "loss/jsd": 0.0, + "loss/logits": 0.2981995493173599, + "step": 1480 + }, + { + "epoch": 0.0463125, + "grad_norm": 4.96875, + "grad_norm_var": 0.26122639973958334, + "learning_rate": 0.0001, + "loss": 7.6388, + "loss/crossentropy": 2.5771687030792236, + "loss/hidden": 2.046875, + "loss/jsd": 0.0, + "loss/logits": 0.3014744073152542, + "step": 1482 + }, + { + "epoch": 0.046375, + "grad_norm": 5.28125, + "grad_norm_var": 0.26610921223958334, + "learning_rate": 0.0001, + "loss": 7.6108, + "loss/crossentropy": 2.6574681997299194, + "loss/hidden": 1.984375, + "loss/jsd": 0.0, + "loss/logits": 0.29689261317253113, + "step": 1484 + }, + { + "epoch": 0.0464375, + "grad_norm": 5.0, + "grad_norm_var": 0.24306233723958334, + "learning_rate": 0.0001, + "loss": 7.9907, + "loss/crossentropy": 2.8251935243606567, + "loss/hidden": 2.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.31264084577560425, + "step": 1486 + }, + { + "epoch": 0.0465, + "grad_norm": 4.59375, + "grad_norm_var": 0.26248372395833336, + "learning_rate": 0.0001, + "loss": 7.0941, + "loss/crossentropy": 2.5712080001831055, + "loss/hidden": 1.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.26361557841300964, + "step": 1488 + }, + { + "epoch": 0.0465625, + "grad_norm": 4.25, + "grad_norm_var": 0.2503743489583333, + "learning_rate": 0.0001, + "loss": 7.742, + "loss/crossentropy": 2.824060797691345, + "loss/hidden": 1.97265625, + "loss/jsd": 0.0, + "loss/logits": 0.29452717304229736, + "step": 1490 + }, + { + "epoch": 0.046625, + "grad_norm": 5.125, + "grad_norm_var": 0.24933268229166666, + "learning_rate": 0.0001, + "loss": 7.5885, + "loss/crossentropy": 2.7565609216690063, + "loss/hidden": 1.97265625, + "loss/jsd": 0.0, + "loss/logits": 0.28592388331890106, + "step": 1492 + }, + { + "epoch": 0.0466875, + "grad_norm": 4.6875, + "grad_norm_var": 0.16730143229166666, + "learning_rate": 0.0001, + "loss": 7.2391, + "loss/crossentropy": 2.4160306453704834, + "loss/hidden": 1.94140625, + "loss/jsd": 0.0, + "loss/logits": 0.2881673574447632, + "step": 1494 + }, + { + "epoch": 0.04675, + "grad_norm": 4.65625, + "grad_norm_var": 0.17081705729166666, + "learning_rate": 0.0001, + "loss": 7.0716, + "loss/crossentropy": 2.5062583684921265, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.26747608184814453, + "step": 1496 + }, + { + "epoch": 0.0468125, + "grad_norm": 4.84375, + "grad_norm_var": 0.11243082682291666, + "learning_rate": 0.0001, + "loss": 7.6162, + "loss/crossentropy": 2.732138156890869, + "loss/hidden": 1.96484375, + "loss/jsd": 0.0, + "loss/logits": 0.29191985726356506, + "step": 1498 + }, + { + "epoch": 0.046875, + "grad_norm": 4.4375, + "grad_norm_var": 0.108447265625, + "learning_rate": 0.0001, + "loss": 7.2309, + "loss/crossentropy": 2.6066497564315796, + "loss/hidden": 1.9375, + "loss/jsd": 0.0, + "loss/logits": 0.2686777710914612, + "step": 1500 + }, + { + "epoch": 0.0469375, + "grad_norm": 5.21875, + "grad_norm_var": 0.12580973307291668, + "learning_rate": 0.0001, + "loss": 7.8604, + "loss/crossentropy": 2.8097376823425293, + "loss/hidden": 2.01953125, + "loss/jsd": 0.0, + "loss/logits": 0.30311088263988495, + "step": 1502 + }, + { + "epoch": 0.047, + "grad_norm": 4.6875, + "grad_norm_var": 0.10774332682291667, + "learning_rate": 0.0001, + "loss": 7.7376, + "loss/crossentropy": 2.753267288208008, + "loss/hidden": 1.97265625, + "loss/jsd": 0.0, + "loss/logits": 0.30116328597068787, + "step": 1504 + }, + { + "epoch": 0.0470625, + "grad_norm": 4.96875, + "grad_norm_var": 0.3541015625, + "learning_rate": 0.0001, + "loss": 7.3779, + "loss/crossentropy": 2.5379709005355835, + "loss/hidden": 1.94140625, + "loss/jsd": 0.0, + "loss/logits": 0.28985071182250977, + "step": 1506 + }, + { + "epoch": 0.047125, + "grad_norm": 5.6875, + "grad_norm_var": 0.37183837890625, + "learning_rate": 0.0001, + "loss": 7.4822, + "loss/crossentropy": 2.641207695007324, + "loss/hidden": 1.98046875, + "loss/jsd": 0.0, + "loss/logits": 0.28605230152606964, + "step": 1508 + }, + { + "epoch": 0.0471875, + "grad_norm": 4.71875, + "grad_norm_var": 0.372119140625, + "learning_rate": 0.0001, + "loss": 7.5333, + "loss/crossentropy": 2.6981176137924194, + "loss/hidden": 1.94921875, + "loss/jsd": 0.0, + "loss/logits": 0.28859612345695496, + "step": 1510 + }, + { + "epoch": 0.04725, + "grad_norm": 4.59375, + "grad_norm_var": 0.36682535807291666, + "learning_rate": 0.0001, + "loss": 7.5909, + "loss/crossentropy": 2.744605541229248, + "loss/hidden": 1.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.2854115813970566, + "step": 1512 + }, + { + "epoch": 0.0473125, + "grad_norm": 4.28125, + "grad_norm_var": 0.403369140625, + "learning_rate": 0.0001, + "loss": 7.3829, + "loss/crossentropy": 2.692691683769226, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.27800965309143066, + "step": 1514 + }, + { + "epoch": 0.047375, + "grad_norm": 5.125, + "grad_norm_var": 0.37821858723958335, + "learning_rate": 0.0001, + "loss": 7.519, + "loss/crossentropy": 2.72913920879364, + "loss/hidden": 1.99609375, + "loss/jsd": 0.0, + "loss/logits": 0.27937404811382294, + "step": 1516 + }, + { + "epoch": 0.0474375, + "grad_norm": 5.0, + "grad_norm_var": 0.37154947916666664, + "learning_rate": 0.0001, + "loss": 7.5057, + "loss/crossentropy": 2.726033926010132, + "loss/hidden": 1.953125, + "loss/jsd": 0.0, + "loss/logits": 0.28265441954135895, + "step": 1518 + }, + { + "epoch": 0.0475, + "grad_norm": 4.5625, + "grad_norm_var": 0.40818684895833335, + "learning_rate": 0.0001, + "loss": 7.2604, + "loss/crossentropy": 2.455228567123413, + "loss/hidden": 1.98828125, + "loss/jsd": 0.0, + "loss/logits": 0.2816852927207947, + "step": 1520 + }, + { + "epoch": 0.0475625, + "grad_norm": 4.625, + "grad_norm_var": 0.17095947265625, + "learning_rate": 0.0001, + "loss": 6.9734, + "loss/crossentropy": 2.4943045377731323, + "loss/hidden": 1.875, + "loss/jsd": 0.0, + "loss/logits": 0.2604096084833145, + "step": 1522 + }, + { + "epoch": 0.047625, + "grad_norm": 4.78125, + "grad_norm_var": 0.13645833333333332, + "learning_rate": 0.0001, + "loss": 7.0076, + "loss/crossentropy": 2.4553334712982178, + "loss/hidden": 1.92578125, + "loss/jsd": 0.0, + "loss/logits": 0.2626507952809334, + "step": 1524 + }, + { + "epoch": 0.0476875, + "grad_norm": 4.625, + "grad_norm_var": 0.12654622395833334, + "learning_rate": 0.0001, + "loss": 7.3117, + "loss/crossentropy": 2.6219717264175415, + "loss/hidden": 1.93359375, + "loss/jsd": 0.0, + "loss/logits": 0.27561162412166595, + "step": 1526 + }, + { + "epoch": 0.04775, + "grad_norm": 5.21875, + "grad_norm_var": 0.13336181640625, + "learning_rate": 0.0001, + "loss": 7.8256, + "loss/crossentropy": 2.863209366798401, + "loss/hidden": 1.93359375, + "loss/jsd": 0.0, + "loss/logits": 0.3028797209262848, + "step": 1528 + }, + { + "epoch": 0.0478125, + "grad_norm": 4.8125, + "grad_norm_var": 0.11070556640625, + "learning_rate": 0.0001, + "loss": 7.9175, + "loss/crossentropy": 2.9292283058166504, + "loss/hidden": 1.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.30116941034793854, + "step": 1530 + }, + { + "epoch": 0.047875, + "grad_norm": 4.875, + "grad_norm_var": 0.10513916015625, + "learning_rate": 0.0001, + "loss": 7.1054, + "loss/crossentropy": 2.462781071662903, + "loss/hidden": 1.98046875, + "loss/jsd": 0.0, + "loss/logits": 0.2662145644426346, + "step": 1532 + }, + { + "epoch": 0.0479375, + "grad_norm": 4.78125, + "grad_norm_var": 0.10234375, + "learning_rate": 0.0001, + "loss": 7.2958, + "loss/crossentropy": 2.6526283025741577, + "loss/hidden": 1.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.2740848809480667, + "step": 1534 + }, + { + "epoch": 0.048, + "grad_norm": 5.25, + "grad_norm_var": 0.04882405598958333, + "learning_rate": 0.0001, + "loss": 7.1055, + "loss/crossentropy": 2.424253225326538, + "loss/hidden": 1.94140625, + "loss/jsd": 0.0, + "loss/logits": 0.27398423850536346, + "step": 1536 + }, + { + "epoch": 0.0480625, + "grad_norm": 5.46875, + "grad_norm_var": 0.06599934895833333, + "learning_rate": 0.0001, + "loss": 7.5641, + "loss/crossentropy": 2.689168930053711, + "loss/hidden": 1.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.28827279806137085, + "step": 1538 + }, + { + "epoch": 0.048125, + "grad_norm": 5.34375, + "grad_norm_var": 0.07213541666666666, + "learning_rate": 0.0001, + "loss": 7.7088, + "loss/crossentropy": 2.7261996269226074, + "loss/hidden": 1.95703125, + "loss/jsd": 0.0, + "loss/logits": 0.30255410075187683, + "step": 1540 + }, + { + "epoch": 0.0481875, + "grad_norm": 4.78125, + "grad_norm_var": 0.08136393229166666, + "learning_rate": 0.0001, + "loss": 6.9039, + "loss/crossentropy": 2.494332194328308, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.24916400015354156, + "step": 1542 + }, + { + "epoch": 0.04825, + "grad_norm": 4.90625, + "grad_norm_var": 0.06834309895833333, + "learning_rate": 0.0001, + "loss": 7.4646, + "loss/crossentropy": 2.5675861835479736, + "loss/hidden": 1.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.2904863655567169, + "step": 1544 + }, + { + "epoch": 0.0483125, + "grad_norm": 5.03125, + "grad_norm_var": 0.06894124348958333, + "learning_rate": 0.0001, + "loss": 7.4416, + "loss/crossentropy": 2.6678693294525146, + "loss/hidden": 1.953125, + "loss/jsd": 0.0, + "loss/logits": 0.2820626497268677, + "step": 1546 + }, + { + "epoch": 0.048375, + "grad_norm": 4.75, + "grad_norm_var": 0.06946207682291666, + "learning_rate": 0.0001, + "loss": 7.3052, + "loss/crossentropy": 2.563589096069336, + "loss/hidden": 1.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.27650561928749084, + "step": 1548 + }, + { + "epoch": 0.0484375, + "grad_norm": 4.75, + "grad_norm_var": 0.07548421223958333, + "learning_rate": 0.0001, + "loss": 7.7503, + "loss/crossentropy": 2.8281824588775635, + "loss/hidden": 1.984375, + "loss/jsd": 0.0, + "loss/logits": 0.2937776744365692, + "step": 1550 + }, + { + "epoch": 0.0485, + "grad_norm": 4.625, + "grad_norm_var": 0.06834309895833333, + "learning_rate": 0.0001, + "loss": 7.2171, + "loss/crossentropy": 2.690904974937439, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.263553261756897, + "step": 1552 + }, + { + "epoch": 0.0485625, + "grad_norm": 5.09375, + "grad_norm_var": 0.05310872395833333, + "learning_rate": 0.0001, + "loss": 7.1013, + "loss/crossentropy": 2.5671788454055786, + "loss/hidden": 1.92578125, + "loss/jsd": 0.0, + "loss/logits": 0.26082948595285416, + "step": 1554 + }, + { + "epoch": 0.048625, + "grad_norm": 5.21875, + "grad_norm_var": 0.04674479166666667, + "learning_rate": 0.0001, + "loss": 7.3052, + "loss/crossentropy": 2.6749120950698853, + "loss/hidden": 1.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.27279847860336304, + "step": 1556 + }, + { + "epoch": 0.0486875, + "grad_norm": 4.625, + "grad_norm_var": 0.03876546223958333, + "learning_rate": 0.0001, + "loss": 7.3388, + "loss/crossentropy": 2.7545392513275146, + "loss/hidden": 1.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.2685810327529907, + "step": 1558 + }, + { + "epoch": 0.04875, + "grad_norm": 6.875, + "grad_norm_var": 0.2933553059895833, + "learning_rate": 0.0001, + "loss": 7.5524, + "loss/crossentropy": 2.611665725708008, + "loss/hidden": 2.00390625, + "loss/jsd": 0.0, + "loss/logits": 0.2936825156211853, + "step": 1560 + }, + { + "epoch": 0.0488125, + "grad_norm": 5.4375, + "grad_norm_var": 0.3044230143229167, + "learning_rate": 0.0001, + "loss": 7.5063, + "loss/crossentropy": 2.619522213935852, + "loss/hidden": 1.97265625, + "loss/jsd": 0.0, + "loss/logits": 0.29141077399253845, + "step": 1562 + }, + { + "epoch": 0.048875, + "grad_norm": 6.25, + "grad_norm_var": 0.4083170572916667, + "learning_rate": 0.0001, + "loss": 7.5491, + "loss/crossentropy": 2.712641716003418, + "loss/hidden": 1.98046875, + "loss/jsd": 0.0, + "loss/logits": 0.2856023460626602, + "step": 1564 + }, + { + "epoch": 0.0489375, + "grad_norm": 5.46875, + "grad_norm_var": 0.4775390625, + "learning_rate": 0.0001, + "loss": 7.9109, + "loss/crossentropy": 2.710148334503174, + "loss/hidden": 2.0390625, + "loss/jsd": 0.0, + "loss/logits": 0.31617045402526855, + "step": 1566 + }, + { + "epoch": 0.049, + "grad_norm": 4.625, + "grad_norm_var": 0.49078369140625, + "learning_rate": 0.0001, + "loss": 7.2407, + "loss/crossentropy": 2.6541051864624023, + "loss/hidden": 1.87109375, + "loss/jsd": 0.0, + "loss/logits": 0.27155420184135437, + "step": 1568 + }, + { + "epoch": 0.0490625, + "grad_norm": 4.65625, + "grad_norm_var": 0.4930948893229167, + "learning_rate": 0.0001, + "loss": 7.4394, + "loss/crossentropy": 2.746522545814514, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.28022365272045135, + "step": 1570 + }, + { + "epoch": 0.049125, + "grad_norm": 4.65625, + "grad_norm_var": 0.49719645182291666, + "learning_rate": 0.0001, + "loss": 7.3557, + "loss/crossentropy": 2.669800877571106, + "loss/hidden": 1.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.27991996705532074, + "step": 1572 + }, + { + "epoch": 0.0491875, + "grad_norm": 5.03125, + "grad_norm_var": 0.48879801432291664, + "learning_rate": 0.0001, + "loss": 7.5363, + "loss/crossentropy": 2.7022202014923096, + "loss/hidden": 1.984375, + "loss/jsd": 0.0, + "loss/logits": 0.28497424721717834, + "step": 1574 + }, + { + "epoch": 0.04925, + "grad_norm": 4.375, + "grad_norm_var": 0.3167805989583333, + "learning_rate": 0.0001, + "loss": 7.1079, + "loss/crossentropy": 2.5182780027389526, + "loss/hidden": 1.90625, + "loss/jsd": 0.0, + "loss/logits": 0.26834164559841156, + "step": 1576 + }, + { + "epoch": 0.0493125, + "grad_norm": 5.5625, + "grad_norm_var": 0.32958577473958334, + "learning_rate": 0.0001, + "loss": 7.4399, + "loss/crossentropy": 2.5361671447753906, + "loss/hidden": 1.92578125, + "loss/jsd": 0.0, + "loss/logits": 0.2977970540523529, + "step": 1578 + }, + { + "epoch": 0.049375, + "grad_norm": 5.0625, + "grad_norm_var": 0.24976806640625, + "learning_rate": 0.0001, + "loss": 7.2944, + "loss/crossentropy": 2.7187254428863525, + "loss/hidden": 1.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.2677236646413803, + "step": 1580 + }, + { + "epoch": 0.0494375, + "grad_norm": 4.96875, + "grad_norm_var": 0.11907145182291666, + "learning_rate": 0.0001, + "loss": 6.5479, + "loss/crossentropy": 2.1634461879730225, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.24937818944454193, + "step": 1582 + }, + { + "epoch": 0.0495, + "grad_norm": 4.375, + "grad_norm_var": 0.128759765625, + "learning_rate": 0.0001, + "loss": 7.1655, + "loss/crossentropy": 2.545617699623108, + "loss/hidden": 1.875, + "loss/jsd": 0.0, + "loss/logits": 0.27449171245098114, + "step": 1584 + }, + { + "epoch": 0.0495625, + "grad_norm": 5.0, + "grad_norm_var": 0.13020833333333334, + "learning_rate": 0.0001, + "loss": 7.8215, + "loss/crossentropy": 2.8528627157211304, + "loss/hidden": 1.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.2976495623588562, + "step": 1586 + }, + { + "epoch": 0.049625, + "grad_norm": 4.78125, + "grad_norm_var": 0.128759765625, + "learning_rate": 0.0001, + "loss": 7.2978, + "loss/crossentropy": 2.632421135902405, + "loss/hidden": 1.921875, + "loss/jsd": 0.0, + "loss/logits": 0.27435266971588135, + "step": 1588 + }, + { + "epoch": 0.0496875, + "grad_norm": 6.75, + "grad_norm_var": 0.363671875, + "learning_rate": 0.0001, + "loss": 7.6244, + "loss/crossentropy": 2.7735743522644043, + "loss/hidden": 1.93359375, + "loss/jsd": 0.0, + "loss/logits": 0.2917233109474182, + "step": 1590 + }, + { + "epoch": 0.04975, + "grad_norm": 6.5625, + "grad_norm_var": 0.49176025390625, + "learning_rate": 0.0001, + "loss": 7.995, + "loss/crossentropy": 2.765578508377075, + "loss/hidden": 1.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.3252904415130615, + "step": 1592 + }, + { + "epoch": 0.0498125, + "grad_norm": 5.75, + "grad_norm_var": 0.51353759765625, + "learning_rate": 0.0001, + "loss": 7.5683, + "loss/crossentropy": 2.6407854557037354, + "loss/hidden": 1.94140625, + "loss/jsd": 0.0, + "loss/logits": 0.298612505197525, + "step": 1594 + }, + { + "epoch": 0.049875, + "grad_norm": 5.5625, + "grad_norm_var": 0.46640218098958336, + "learning_rate": 0.0001, + "loss": 7.3369, + "loss/crossentropy": 2.433140516281128, + "loss/hidden": 2.03515625, + "loss/jsd": 0.0, + "loss/logits": 0.2868652194738388, + "step": 1596 + }, + { + "epoch": 0.0499375, + "grad_norm": 4.5625, + "grad_norm_var": 0.4823527018229167, + "learning_rate": 0.0001, + "loss": 7.2654, + "loss/crossentropy": 2.6077977418899536, + "loss/hidden": 1.92578125, + "loss/jsd": 0.0, + "loss/logits": 0.2731828987598419, + "step": 1598 + }, + { + "epoch": 0.05, + "grad_norm": 5.3125, + "grad_norm_var": 0.43136393229166664, + "learning_rate": 0.0001, + "loss": 7.3095, + "loss/crossentropy": 2.6041181087493896, + "loss/hidden": 1.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2736643999814987, + "step": 1600 + }, + { + "epoch": 0.0500625, + "grad_norm": 5.34375, + "grad_norm_var": 0.40315348307291665, + "learning_rate": 0.0001, + "loss": 7.1878, + "loss/crossentropy": 2.42449951171875, + "loss/hidden": 1.94140625, + "loss/jsd": 0.0, + "loss/logits": 0.2821933627128601, + "step": 1602 + }, + { + "epoch": 0.050125, + "grad_norm": 5.46875, + "grad_norm_var": 0.35279947916666665, + "learning_rate": 0.0001, + "loss": 7.665, + "loss/crossentropy": 2.7596222162246704, + "loss/hidden": 1.95703125, + "loss/jsd": 0.0, + "loss/logits": 0.29483360052108765, + "step": 1604 + }, + { + "epoch": 0.0501875, + "grad_norm": 4.65625, + "grad_norm_var": 0.26431884765625, + "learning_rate": 0.0001, + "loss": 7.2976, + "loss/crossentropy": 2.743039131164551, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.26444223523139954, + "step": 1606 + }, + { + "epoch": 0.05025, + "grad_norm": 5.34375, + "grad_norm_var": 0.141015625, + "learning_rate": 0.0001, + "loss": 7.515, + "loss/crossentropy": 2.707811117172241, + "loss/hidden": 1.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2830655127763748, + "step": 1608 + }, + { + "epoch": 0.0503125, + "grad_norm": 4.53125, + "grad_norm_var": 0.12550455729166668, + "learning_rate": 0.0001, + "loss": 7.175, + "loss/crossentropy": 2.577424645423889, + "loss/hidden": 1.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.2695184051990509, + "step": 1610 + }, + { + "epoch": 0.050375, + "grad_norm": 5.4375, + "grad_norm_var": 0.11591389973958334, + "learning_rate": 0.0001, + "loss": 7.1828, + "loss/crossentropy": 2.5045968294143677, + "loss/hidden": 1.984375, + "loss/jsd": 0.0, + "loss/logits": 0.2693790942430496, + "step": 1612 + }, + { + "epoch": 0.0504375, + "grad_norm": 5.0, + "grad_norm_var": 0.10076497395833334, + "learning_rate": 0.0001, + "loss": 7.6011, + "loss/crossentropy": 2.6588661670684814, + "loss/hidden": 1.9375, + "loss/jsd": 0.0, + "loss/logits": 0.3004702776670456, + "step": 1614 + }, + { + "epoch": 0.0505, + "grad_norm": 4.5, + "grad_norm_var": 0.12099202473958333, + "learning_rate": 0.0001, + "loss": 7.3598, + "loss/crossentropy": 2.62858247756958, + "loss/hidden": 1.921875, + "loss/jsd": 0.0, + "loss/logits": 0.28093548119068146, + "step": 1616 + }, + { + "epoch": 0.0505625, + "grad_norm": 4.46875, + "grad_norm_var": 0.13435872395833334, + "learning_rate": 0.0001, + "loss": 7.2105, + "loss/crossentropy": 2.5444538593292236, + "loss/hidden": 1.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.27363789081573486, + "step": 1618 + }, + { + "epoch": 0.050625, + "grad_norm": 5.28125, + "grad_norm_var": 0.12854410807291666, + "learning_rate": 0.0001, + "loss": 7.3259, + "loss/crossentropy": 2.6291420459747314, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.2778768092393875, + "step": 1620 + }, + { + "epoch": 0.0506875, + "grad_norm": 4.46875, + "grad_norm_var": 0.15154622395833334, + "learning_rate": 0.0001, + "loss": 7.2225, + "loss/crossentropy": 2.6978687047958374, + "loss/hidden": 1.86328125, + "loss/jsd": 0.0, + "loss/logits": 0.2661316245794296, + "step": 1622 + }, + { + "epoch": 0.05075, + "grad_norm": 4.84375, + "grad_norm_var": 0.14163004557291667, + "learning_rate": 0.0001, + "loss": 7.2325, + "loss/crossentropy": 2.544032335281372, + "loss/hidden": 1.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.27861447632312775, + "step": 1624 + }, + { + "epoch": 0.0508125, + "grad_norm": 5.03125, + "grad_norm_var": 0.14329427083333332, + "learning_rate": 0.0001, + "loss": 7.3249, + "loss/crossentropy": 2.662850856781006, + "loss/hidden": 1.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.275974377989769, + "step": 1626 + }, + { + "epoch": 0.050875, + "grad_norm": 5.3125, + "grad_norm_var": 0.13893229166666668, + "learning_rate": 0.0001, + "loss": 7.4051, + "loss/crossentropy": 2.6046046018600464, + "loss/hidden": 1.97265625, + "loss/jsd": 0.0, + "loss/logits": 0.2827852815389633, + "step": 1628 + }, + { + "epoch": 0.0509375, + "grad_norm": 4.8125, + "grad_norm_var": 0.107666015625, + "learning_rate": 0.0001, + "loss": 7.3616, + "loss/crossentropy": 2.651221752166748, + "loss/hidden": 1.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.27963121235370636, + "step": 1630 + }, + { + "epoch": 0.051, + "grad_norm": 4.6875, + "grad_norm_var": 0.14420166015625, + "learning_rate": 0.0001, + "loss": 7.4135, + "loss/crossentropy": 2.646748185157776, + "loss/hidden": 1.97265625, + "loss/jsd": 0.0, + "loss/logits": 0.2794112116098404, + "step": 1632 + }, + { + "epoch": 0.0510625, + "grad_norm": 5.15625, + "grad_norm_var": 0.137744140625, + "learning_rate": 0.0001, + "loss": 7.5045, + "loss/crossentropy": 2.708313465118408, + "loss/hidden": 1.9375, + "loss/jsd": 0.0, + "loss/logits": 0.2858666926622391, + "step": 1634 + }, + { + "epoch": 0.051125, + "grad_norm": 5.34375, + "grad_norm_var": 0.13944905598958332, + "learning_rate": 0.0001, + "loss": 7.6362, + "loss/crossentropy": 2.700982093811035, + "loss/hidden": 1.95703125, + "loss/jsd": 0.0, + "loss/logits": 0.2978232800960541, + "step": 1636 + }, + { + "epoch": 0.0511875, + "grad_norm": 5.34375, + "grad_norm_var": 0.6510050455729167, + "learning_rate": 0.0001, + "loss": 7.2766, + "loss/crossentropy": 2.491866707801819, + "loss/hidden": 1.984375, + "loss/jsd": 0.0, + "loss/logits": 0.28003498911857605, + "step": 1638 + }, + { + "epoch": 0.05125, + "grad_norm": 4.40625, + "grad_norm_var": 0.6700480143229167, + "learning_rate": 0.0001, + "loss": 7.4596, + "loss/crossentropy": 2.732044219970703, + "loss/hidden": 1.921875, + "loss/jsd": 0.0, + "loss/logits": 0.2805637717247009, + "step": 1640 + }, + { + "epoch": 0.0513125, + "grad_norm": 5.1875, + "grad_norm_var": 0.6787109375, + "learning_rate": 0.0001, + "loss": 7.5031, + "loss/crossentropy": 2.650493025779724, + "loss/hidden": 1.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.29072538018226624, + "step": 1642 + }, + { + "epoch": 0.051375, + "grad_norm": 5.125, + "grad_norm_var": 0.656494140625, + "learning_rate": 0.0001, + "loss": 7.6908, + "loss/crossentropy": 2.7976644039154053, + "loss/hidden": 2.03125, + "loss/jsd": 0.0, + "loss/logits": 0.28619284927845, + "step": 1644 + }, + { + "epoch": 0.0514375, + "grad_norm": 4.75, + "grad_norm_var": 0.6514933268229167, + "learning_rate": 0.0001, + "loss": 7.557, + "loss/crossentropy": 2.8387088775634766, + "loss/hidden": 1.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.2788640707731247, + "step": 1646 + }, + { + "epoch": 0.0515, + "grad_norm": 4.65625, + "grad_norm_var": 0.649462890625, + "learning_rate": 0.0001, + "loss": 7.3634, + "loss/crossentropy": 2.669367551803589, + "loss/hidden": 1.93359375, + "loss/jsd": 0.0, + "loss/logits": 0.2760390490293503, + "step": 1648 + }, + { + "epoch": 0.0515625, + "grad_norm": 4.71875, + "grad_norm_var": 0.6587198893229167, + "learning_rate": 0.0001, + "loss": 7.625, + "loss/crossentropy": 2.7615777254104614, + "loss/hidden": 1.9375, + "loss/jsd": 0.0, + "loss/logits": 0.29259105026721954, + "step": 1650 + }, + { + "epoch": 0.051625, + "grad_norm": 5.0, + "grad_norm_var": 0.65562744140625, + "learning_rate": 0.0001, + "loss": 7.8787, + "loss/crossentropy": 2.975517749786377, + "loss/hidden": 1.98046875, + "loss/jsd": 0.0, + "loss/logits": 0.29227523505687714, + "step": 1652 + }, + { + "epoch": 0.0516875, + "grad_norm": 4.46875, + "grad_norm_var": 0.15979410807291666, + "learning_rate": 0.0001, + "loss": 7.3841, + "loss/crossentropy": 2.7546948194503784, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.2719228267669678, + "step": 1654 + }, + { + "epoch": 0.05175, + "grad_norm": 4.40625, + "grad_norm_var": 0.1611328125, + "learning_rate": 0.0001, + "loss": 7.5601, + "loss/crossentropy": 2.7170467376708984, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.2925069183111191, + "step": 1656 + }, + { + "epoch": 0.0518125, + "grad_norm": 4.53125, + "grad_norm_var": 0.07076416015625, + "learning_rate": 0.0001, + "loss": 7.4027, + "loss/crossentropy": 2.742368459701538, + "loss/hidden": 1.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.274629682302475, + "step": 1658 + }, + { + "epoch": 0.051875, + "grad_norm": 4.78125, + "grad_norm_var": 0.06467692057291667, + "learning_rate": 0.0001, + "loss": 7.491, + "loss/crossentropy": 2.882826805114746, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.2697974443435669, + "step": 1660 + }, + { + "epoch": 0.0519375, + "grad_norm": 5.03125, + "grad_norm_var": 0.072119140625, + "learning_rate": 0.0001, + "loss": 7.0602, + "loss/crossentropy": 2.6344101428985596, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2574244886636734, + "step": 1662 + }, + { + "epoch": 0.052, + "grad_norm": 4.84375, + "grad_norm_var": 0.07496337890625, + "learning_rate": 0.0001, + "loss": 7.5474, + "loss/crossentropy": 2.7764055728912354, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.2860814034938812, + "step": 1664 + }, + { + "epoch": 0.0520625, + "grad_norm": 4.5, + "grad_norm_var": 0.07120768229166667, + "learning_rate": 0.0001, + "loss": 7.333, + "loss/crossentropy": 2.6493825912475586, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.2773485779762268, + "step": 1666 + }, + { + "epoch": 0.052125, + "grad_norm": 5.125, + "grad_norm_var": 0.07226155598958334, + "learning_rate": 0.0001, + "loss": 7.801, + "loss/crossentropy": 2.91958749294281, + "loss/hidden": 1.95703125, + "loss/jsd": 0.0, + "loss/logits": 0.2924348711967468, + "step": 1668 + }, + { + "epoch": 0.0521875, + "grad_norm": 4.75, + "grad_norm_var": 0.09518229166666667, + "learning_rate": 0.0001, + "loss": 6.945, + "loss/crossentropy": 2.4583855867385864, + "loss/hidden": 1.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.258427232503891, + "step": 1670 + }, + { + "epoch": 0.05225, + "grad_norm": 4.90625, + "grad_norm_var": 0.07610677083333334, + "learning_rate": 0.0001, + "loss": 7.5141, + "loss/crossentropy": 2.660768151283264, + "loss/hidden": 1.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.2861160635948181, + "step": 1672 + }, + { + "epoch": 0.0523125, + "grad_norm": 5.53125, + "grad_norm_var": 0.10065104166666666, + "learning_rate": 0.0001, + "loss": 7.4877, + "loss/crossentropy": 2.635893940925598, + "loss/hidden": 1.984375, + "loss/jsd": 0.0, + "loss/logits": 0.2867467477917671, + "step": 1674 + }, + { + "epoch": 0.052375, + "grad_norm": 4.75, + "grad_norm_var": 0.096875, + "learning_rate": 0.0001, + "loss": 7.3388, + "loss/crossentropy": 2.656780481338501, + "loss/hidden": 1.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.27952560782432556, + "step": 1676 + }, + { + "epoch": 0.0524375, + "grad_norm": 4.34375, + "grad_norm_var": 0.10279947916666667, + "learning_rate": 0.0001, + "loss": 7.4772, + "loss/crossentropy": 2.795539140701294, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.2771495431661606, + "step": 1678 + }, + { + "epoch": 0.0525, + "grad_norm": 5.15625, + "grad_norm_var": 0.11968994140625, + "learning_rate": 0.0001, + "loss": 7.3803, + "loss/crossentropy": 2.764199137687683, + "loss/hidden": 1.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.2713741958141327, + "step": 1680 + }, + { + "epoch": 0.0525625, + "grad_norm": 5.6875, + "grad_norm_var": 0.148046875, + "learning_rate": 0.0001, + "loss": 7.4873, + "loss/crossentropy": 2.785406231880188, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.27917732298374176, + "step": 1682 + }, + { + "epoch": 0.052625, + "grad_norm": 5.0625, + "grad_norm_var": 0.17476806640625, + "learning_rate": 0.0001, + "loss": 7.2824, + "loss/crossentropy": 2.638624429702759, + "loss/hidden": 1.90625, + "loss/jsd": 0.0, + "loss/logits": 0.27375538647174835, + "step": 1684 + }, + { + "epoch": 0.0526875, + "grad_norm": 4.15625, + "grad_norm_var": 0.18625895182291666, + "learning_rate": 0.0001, + "loss": 7.5929, + "loss/crossentropy": 2.8473161458969116, + "loss/hidden": 1.94921875, + "loss/jsd": 0.0, + "loss/logits": 0.2796381860971451, + "step": 1686 + }, + { + "epoch": 0.05275, + "grad_norm": 5.46875, + "grad_norm_var": 0.21200764973958333, + "learning_rate": 0.0001, + "loss": 7.5646, + "loss/crossentropy": 2.6495203971862793, + "loss/hidden": 1.984375, + "loss/jsd": 0.0, + "loss/logits": 0.2930660992860794, + "step": 1688 + }, + { + "epoch": 0.0528125, + "grad_norm": 4.875, + "grad_norm_var": 0.18631184895833333, + "learning_rate": 0.0001, + "loss": 7.16, + "loss/crossentropy": 2.5938304662704468, + "loss/hidden": 1.89453125, + "loss/jsd": 0.0, + "loss/logits": 0.26715944707393646, + "step": 1690 + }, + { + "epoch": 0.052875, + "grad_norm": 4.90625, + "grad_norm_var": 0.19464518229166666, + "learning_rate": 0.0001, + "loss": 7.2846, + "loss/crossentropy": 2.5637892484664917, + "loss/hidden": 1.9375, + "loss/jsd": 0.0, + "loss/logits": 0.27832843363285065, + "step": 1692 + }, + { + "epoch": 0.0529375, + "grad_norm": 4.75, + "grad_norm_var": 0.19178059895833333, + "learning_rate": 0.0001, + "loss": 7.7704, + "loss/crossentropy": 2.8994187116622925, + "loss/hidden": 1.94921875, + "loss/jsd": 0.0, + "loss/logits": 0.29217809438705444, + "step": 1694 + }, + { + "epoch": 0.053, + "grad_norm": 4.75, + "grad_norm_var": 0.174853515625, + "learning_rate": 0.0001, + "loss": 7.0468, + "loss/crossentropy": 2.444552779197693, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.2692095637321472, + "step": 1696 + }, + { + "epoch": 0.0530625, + "grad_norm": 4.96875, + "grad_norm_var": 0.12893473307291667, + "learning_rate": 0.0001, + "loss": 7.7064, + "loss/crossentropy": 2.796920657157898, + "loss/hidden": 1.97265625, + "loss/jsd": 0.0, + "loss/logits": 0.29368677735328674, + "step": 1698 + }, + { + "epoch": 0.053125, + "grad_norm": 5.28125, + "grad_norm_var": 0.12107747395833333, + "learning_rate": 0.0001, + "loss": 7.6303, + "loss/crossentropy": 2.813141107559204, + "loss/hidden": 1.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.2887490391731262, + "step": 1700 + }, + { + "epoch": 0.0531875, + "grad_norm": 4.84375, + "grad_norm_var": 0.09192301432291666, + "learning_rate": 0.0001, + "loss": 7.3593, + "loss/crossentropy": 2.739987373352051, + "loss/hidden": 1.90625, + "loss/jsd": 0.0, + "loss/logits": 0.27130600810050964, + "step": 1702 + }, + { + "epoch": 0.05325, + "grad_norm": 4.625, + "grad_norm_var": 0.07307535807291667, + "learning_rate": 0.0001, + "loss": 7.1669, + "loss/crossentropy": 2.6518582105636597, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2624403387308121, + "step": 1704 + }, + { + "epoch": 0.0533125, + "grad_norm": 5.0625, + "grad_norm_var": 0.06300455729166667, + "learning_rate": 0.0001, + "loss": 7.661, + "loss/crossentropy": 2.716781973838806, + "loss/hidden": 1.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2975441813468933, + "step": 1706 + }, + { + "epoch": 0.053375, + "grad_norm": 4.53125, + "grad_norm_var": 0.05813802083333333, + "learning_rate": 0.0001, + "loss": 7.1827, + "loss/crossentropy": 2.491211175918579, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.27813446521759033, + "step": 1708 + }, + { + "epoch": 0.0534375, + "grad_norm": 4.78125, + "grad_norm_var": 0.07604166666666666, + "learning_rate": 0.0001, + "loss": 7.0346, + "loss/crossentropy": 2.6145761013031006, + "loss/hidden": 1.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.25840601325035095, + "step": 1710 + }, + { + "epoch": 0.0535, + "grad_norm": 4.6875, + "grad_norm_var": 0.07730712890625, + "learning_rate": 0.0001, + "loss": 7.2211, + "loss/crossentropy": 2.608027696609497, + "loss/hidden": 1.90625, + "loss/jsd": 0.0, + "loss/logits": 0.27067790925502777, + "step": 1712 + }, + { + "epoch": 0.0535625, + "grad_norm": 6.6875, + "grad_norm_var": 0.30857747395833335, + "learning_rate": 0.0001, + "loss": 7.2801, + "loss/crossentropy": 2.5958139896392822, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.2766307219862938, + "step": 1714 + }, + { + "epoch": 0.053625, + "grad_norm": 6.5625, + "grad_norm_var": 0.4998982747395833, + "learning_rate": 0.0001, + "loss": 7.4305, + "loss/crossentropy": 2.6899927854537964, + "loss/hidden": 1.9375, + "loss/jsd": 0.0, + "loss/logits": 0.2803005874156952, + "step": 1716 + }, + { + "epoch": 0.0536875, + "grad_norm": 5.4375, + "grad_norm_var": 0.5143513997395833, + "learning_rate": 0.0001, + "loss": 7.3846, + "loss/crossentropy": 2.563607931137085, + "loss/hidden": 1.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2852242588996887, + "step": 1718 + }, + { + "epoch": 0.05375, + "grad_norm": 5.40625, + "grad_norm_var": 0.5321451822916666, + "learning_rate": 0.0001, + "loss": 7.3026, + "loss/crossentropy": 2.646055579185486, + "loss/hidden": 1.921875, + "loss/jsd": 0.0, + "loss/logits": 0.2734649181365967, + "step": 1720 + }, + { + "epoch": 0.0538125, + "grad_norm": 4.3125, + "grad_norm_var": 0.5678995768229167, + "learning_rate": 0.0001, + "loss": 6.968, + "loss/crossentropy": 2.5653910636901855, + "loss/hidden": 1.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.25354039669036865, + "step": 1722 + }, + { + "epoch": 0.053875, + "grad_norm": 5.28125, + "grad_norm_var": 0.5819620768229167, + "learning_rate": 0.0001, + "loss": 7.4924, + "loss/crossentropy": 2.834080696105957, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.27403783798217773, + "step": 1724 + }, + { + "epoch": 0.0539375, + "grad_norm": 4.5, + "grad_norm_var": 0.6571248372395834, + "learning_rate": 0.0001, + "loss": 7.7559, + "loss/crossentropy": 2.8956801891326904, + "loss/hidden": 1.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2883637845516205, + "step": 1726 + }, + { + "epoch": 0.054, + "grad_norm": 4.3125, + "grad_norm_var": 0.7003743489583333, + "learning_rate": 0.0001, + "loss": 7.1868, + "loss/crossentropy": 2.660465121269226, + "loss/hidden": 1.859375, + "loss/jsd": 0.0, + "loss/logits": 0.26670072972774506, + "step": 1728 + }, + { + "epoch": 0.0540625, + "grad_norm": 4.90625, + "grad_norm_var": 0.47978108723958335, + "learning_rate": 0.0001, + "loss": 7.3968, + "loss/crossentropy": 2.6902763843536377, + "loss/hidden": 1.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.27768830955028534, + "step": 1730 + }, + { + "epoch": 0.054125, + "grad_norm": 5.1875, + "grad_norm_var": 0.2855428059895833, + "learning_rate": 0.0001, + "loss": 7.5651, + "loss/crossentropy": 2.790587067604065, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.28643281757831573, + "step": 1732 + }, + { + "epoch": 0.0541875, + "grad_norm": 4.8125, + "grad_norm_var": 0.2678019205729167, + "learning_rate": 0.0001, + "loss": 7.267, + "loss/crossentropy": 2.6578781604766846, + "loss/hidden": 1.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.27107058465480804, + "step": 1734 + }, + { + "epoch": 0.05425, + "grad_norm": 5.03125, + "grad_norm_var": 0.22975260416666668, + "learning_rate": 0.0001, + "loss": 7.4316, + "loss/crossentropy": 2.717787742614746, + "loss/hidden": 1.9375, + "loss/jsd": 0.0, + "loss/logits": 0.2776351869106293, + "step": 1736 + }, + { + "epoch": 0.0543125, + "grad_norm": 4.71875, + "grad_norm_var": 0.20924072265625, + "learning_rate": 0.0001, + "loss": 7.4847, + "loss/crossentropy": 2.774222254753113, + "loss/hidden": 1.921875, + "loss/jsd": 0.0, + "loss/logits": 0.27885735034942627, + "step": 1738 + }, + { + "epoch": 0.054375, + "grad_norm": 5.03125, + "grad_norm_var": 0.2652180989583333, + "learning_rate": 0.0001, + "loss": 7.2475, + "loss/crossentropy": 2.5218154191970825, + "loss/hidden": 1.96484375, + "loss/jsd": 0.0, + "loss/logits": 0.27608276903629303, + "step": 1740 + }, + { + "epoch": 0.0544375, + "grad_norm": 4.65625, + "grad_norm_var": 0.16047770182291668, + "learning_rate": 0.0001, + "loss": 7.3771, + "loss/crossentropy": 2.7365787029266357, + "loss/hidden": 1.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.27381476759910583, + "step": 1742 + }, + { + "epoch": 0.0545, + "grad_norm": 4.375, + "grad_norm_var": 0.17029622395833333, + "learning_rate": 0.0001, + "loss": 7.1074, + "loss/crossentropy": 2.650601625442505, + "loss/hidden": 1.87109375, + "loss/jsd": 0.0, + "loss/logits": 0.2585696280002594, + "step": 1744 + }, + { + "epoch": 0.0545625, + "grad_norm": 5.4375, + "grad_norm_var": 0.19088134765625, + "learning_rate": 0.0001, + "loss": 7.2373, + "loss/crossentropy": 2.4645010232925415, + "loss/hidden": 1.9609375, + "loss/jsd": 0.0, + "loss/logits": 0.2811823785305023, + "step": 1746 + }, + { + "epoch": 0.054625, + "grad_norm": 4.8125, + "grad_norm_var": 0.18046875, + "learning_rate": 0.0001, + "loss": 7.0984, + "loss/crossentropy": 2.5452595949172974, + "loss/hidden": 1.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2584358751773834, + "step": 1748 + }, + { + "epoch": 0.0546875, + "grad_norm": 4.90625, + "grad_norm_var": 0.183837890625, + "learning_rate": 0.0001, + "loss": 7.7758, + "loss/crossentropy": 2.812344789505005, + "loss/hidden": 1.9375, + "loss/jsd": 0.0, + "loss/logits": 0.3026003837585449, + "step": 1750 + }, + { + "epoch": 0.05475, + "grad_norm": 4.71875, + "grad_norm_var": 0.18756103515625, + "learning_rate": 0.0001, + "loss": 7.1996, + "loss/crossentropy": 2.7080670595169067, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.2612666040658951, + "step": 1752 + }, + { + "epoch": 0.0548125, + "grad_norm": 4.84375, + "grad_norm_var": 0.19273681640625, + "learning_rate": 0.0001, + "loss": 7.3367, + "loss/crossentropy": 2.656891942024231, + "loss/hidden": 1.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2797033041715622, + "step": 1754 + }, + { + "epoch": 0.054875, + "grad_norm": 4.5, + "grad_norm_var": 0.11409098307291667, + "learning_rate": 0.0001, + "loss": 7.1449, + "loss/crossentropy": 2.577438712120056, + "loss/hidden": 1.86328125, + "loss/jsd": 0.0, + "loss/logits": 0.2704222649335861, + "step": 1756 + }, + { + "epoch": 0.0549375, + "grad_norm": 4.875, + "grad_norm_var": 0.11855061848958333, + "learning_rate": 0.0001, + "loss": 7.4559, + "loss/crossentropy": 2.8306283950805664, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2734677642583847, + "step": 1758 + }, + { + "epoch": 0.055, + "grad_norm": 6.5625, + "grad_norm_var": 0.266650390625, + "learning_rate": 0.0001, + "loss": 7.3112, + "loss/crossentropy": 2.6345953941345215, + "loss/hidden": 1.93359375, + "loss/jsd": 0.0, + "loss/logits": 0.2743034064769745, + "step": 1760 + }, + { + "epoch": 0.0550625, + "grad_norm": 4.6875, + "grad_norm_var": 0.25494791666666666, + "learning_rate": 0.0001, + "loss": 7.4958, + "loss/crossentropy": 2.742944121360779, + "loss/hidden": 1.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.2823123633861542, + "step": 1762 + }, + { + "epoch": 0.055125, + "grad_norm": 4.96875, + "grad_norm_var": 0.2669921875, + "learning_rate": 0.0001, + "loss": 7.3314, + "loss/crossentropy": 2.6937159299850464, + "loss/hidden": 1.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.2735290676355362, + "step": 1764 + }, + { + "epoch": 0.0551875, + "grad_norm": 4.375, + "grad_norm_var": 0.2847493489583333, + "learning_rate": 0.0001, + "loss": 7.0821, + "loss/crossentropy": 2.6495813131332397, + "loss/hidden": 1.86328125, + "loss/jsd": 0.0, + "loss/logits": 0.2569257989525795, + "step": 1766 + }, + { + "epoch": 0.05525, + "grad_norm": 5.0, + "grad_norm_var": 0.2819295247395833, + "learning_rate": 0.0001, + "loss": 7.654, + "loss/crossentropy": 2.885606050491333, + "loss/hidden": 1.92578125, + "loss/jsd": 0.0, + "loss/logits": 0.2842566519975662, + "step": 1768 + }, + { + "epoch": 0.0553125, + "grad_norm": 4.59375, + "grad_norm_var": 0.28938395182291665, + "learning_rate": 0.0001, + "loss": 7.2599, + "loss/crossentropy": 2.683700203895569, + "loss/hidden": 1.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2662117928266525, + "step": 1770 + }, + { + "epoch": 0.055375, + "grad_norm": 4.28125, + "grad_norm_var": 0.297509765625, + "learning_rate": 0.0001, + "loss": 7.4357, + "loss/crossentropy": 2.809111475944519, + "loss/hidden": 1.875, + "loss/jsd": 0.0, + "loss/logits": 0.2751620411872864, + "step": 1772 + }, + { + "epoch": 0.0554375, + "grad_norm": 4.3125, + "grad_norm_var": 0.29944254557291666, + "learning_rate": 0.0001, + "loss": 7.3595, + "loss/crossentropy": 2.7361397743225098, + "loss/hidden": 1.89453125, + "loss/jsd": 0.0, + "loss/logits": 0.2728865295648575, + "step": 1774 + }, + { + "epoch": 0.0555, + "grad_norm": 4.625, + "grad_norm_var": 0.071484375, + "learning_rate": 0.0001, + "loss": 7.2885, + "loss/crossentropy": 2.7563177347183228, + "loss/hidden": 1.859375, + "loss/jsd": 0.0, + "loss/logits": 0.26727694272994995, + "step": 1776 + }, + { + "epoch": 0.0555625, + "grad_norm": 4.6875, + "grad_norm_var": 0.06933186848958334, + "learning_rate": 0.0001, + "loss": 7.1418, + "loss/crossentropy": 2.609483480453491, + "loss/hidden": 1.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2672937512397766, + "step": 1778 + }, + { + "epoch": 0.055625, + "grad_norm": 4.71875, + "grad_norm_var": 0.060009765625, + "learning_rate": 0.0001, + "loss": 7.373, + "loss/crossentropy": 2.7315728664398193, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2750767469406128, + "step": 1780 + }, + { + "epoch": 0.0556875, + "grad_norm": 4.875, + "grad_norm_var": 0.05640869140625, + "learning_rate": 0.0001, + "loss": 7.1895, + "loss/crossentropy": 2.5997053384780884, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.2679608315229416, + "step": 1782 + }, + { + "epoch": 0.05575, + "grad_norm": 4.25, + "grad_norm_var": 0.047509765625, + "learning_rate": 0.0001, + "loss": 7.2201, + "loss/crossentropy": 2.680380940437317, + "loss/hidden": 1.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.2641296237707138, + "step": 1784 + }, + { + "epoch": 0.0558125, + "grad_norm": 4.28125, + "grad_norm_var": 0.053629557291666664, + "learning_rate": 0.0001, + "loss": 7.1739, + "loss/crossentropy": 2.6020020246505737, + "loss/hidden": 1.87109375, + "loss/jsd": 0.0, + "loss/logits": 0.27007773518562317, + "step": 1786 + }, + { + "epoch": 0.055875, + "grad_norm": 4.78125, + "grad_norm_var": 0.04641520182291667, + "learning_rate": 0.0001, + "loss": 7.3154, + "loss/crossentropy": 2.705802083015442, + "loss/hidden": 1.92578125, + "loss/jsd": 0.0, + "loss/logits": 0.26838430762290955, + "step": 1788 + }, + { + "epoch": 0.0559375, + "grad_norm": 7.71875, + "grad_norm_var": 0.6591796875, + "learning_rate": 0.0001, + "loss": 7.5472, + "loss/crossentropy": 2.700193166732788, + "loss/hidden": 1.93359375, + "loss/jsd": 0.0, + "loss/logits": 0.2913414239883423, + "step": 1790 + }, + { + "epoch": 0.056, + "grad_norm": 5.34375, + "grad_norm_var": 0.67265625, + "learning_rate": 0.0001, + "loss": 7.6119, + "loss/crossentropy": 2.830447196960449, + "loss/hidden": 1.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.28361743688583374, + "step": 1792 + }, + { + "epoch": 0.0560625, + "grad_norm": 4.5, + "grad_norm_var": 0.660009765625, + "learning_rate": 0.0001, + "loss": 7.2971, + "loss/crossentropy": 2.7628878355026245, + "loss/hidden": 1.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.26201970875263214, + "step": 1794 + }, + { + "epoch": 0.056125, + "grad_norm": 5.03125, + "grad_norm_var": 0.7589680989583333, + "learning_rate": 0.0001, + "loss": 7.3627, + "loss/crossentropy": 2.752334475517273, + "loss/hidden": 1.86328125, + "loss/jsd": 0.0, + "loss/logits": 0.2747083008289337, + "step": 1796 + }, + { + "epoch": 0.0561875, + "grad_norm": 5.65625, + "grad_norm_var": 0.7759765625, + "learning_rate": 0.0001, + "loss": 7.6213, + "loss/crossentropy": 2.984766721725464, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.27263370156288147, + "step": 1798 + }, + { + "epoch": 0.05625, + "grad_norm": 4.625, + "grad_norm_var": 0.732275390625, + "learning_rate": 0.0001, + "loss": 7.3488, + "loss/crossentropy": 2.7198057174682617, + "loss/hidden": 1.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2746191918849945, + "step": 1800 + }, + { + "epoch": 0.0563125, + "grad_norm": 4.46875, + "grad_norm_var": 0.7081868489583333, + "learning_rate": 0.0001, + "loss": 7.2877, + "loss/crossentropy": 2.6790480613708496, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.27297329902648926, + "step": 1802 + }, + { + "epoch": 0.056375, + "grad_norm": 5.21875, + "grad_norm_var": 0.69107666015625, + "learning_rate": 0.0001, + "loss": 6.8517, + "loss/crossentropy": 2.3501633405685425, + "loss/hidden": 1.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.26148542761802673, + "step": 1804 + }, + { + "epoch": 0.0564375, + "grad_norm": 4.53125, + "grad_norm_var": 0.22669270833333333, + "learning_rate": 0.0001, + "loss": 7.5052, + "loss/crossentropy": 2.75057852268219, + "loss/hidden": 1.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.28718413412570953, + "step": 1806 + }, + { + "epoch": 0.0565, + "grad_norm": 4.71875, + "grad_norm_var": 0.22301025390625, + "learning_rate": 0.0001, + "loss": 7.2447, + "loss/crossentropy": 2.6712071895599365, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.26633328199386597, + "step": 1808 + }, + { + "epoch": 0.0565625, + "grad_norm": 5.3125, + "grad_norm_var": 0.220947265625, + "learning_rate": 0.0001, + "loss": 7.5968, + "loss/crossentropy": 2.8121140003204346, + "loss/hidden": 1.9921875, + "loss/jsd": 0.0, + "loss/logits": 0.27925361692905426, + "step": 1810 + }, + { + "epoch": 0.056625, + "grad_norm": 4.6875, + "grad_norm_var": 0.1205078125, + "learning_rate": 0.0001, + "loss": 7.1545, + "loss/crossentropy": 2.555493950843811, + "loss/hidden": 1.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.27317963540554047, + "step": 1812 + }, + { + "epoch": 0.0566875, + "grad_norm": 4.15625, + "grad_norm_var": 0.11138916015625, + "learning_rate": 0.0001, + "loss": 6.9804, + "loss/crossentropy": 2.5900630950927734, + "loss/hidden": 1.83203125, + "loss/jsd": 0.0, + "loss/logits": 0.25583045184612274, + "step": 1814 + }, + { + "epoch": 0.05675, + "grad_norm": 5.4375, + "grad_norm_var": 0.13201497395833334, + "learning_rate": 0.0001, + "loss": 7.5876, + "loss/crossentropy": 2.8362863063812256, + "loss/hidden": 1.94140625, + "loss/jsd": 0.0, + "loss/logits": 0.2809862047433853, + "step": 1816 + }, + { + "epoch": 0.0568125, + "grad_norm": 5.03125, + "grad_norm_var": 0.11678059895833333, + "learning_rate": 0.0001, + "loss": 7.6393, + "loss/crossentropy": 2.8320658206939697, + "loss/hidden": 1.90625, + "loss/jsd": 0.0, + "loss/logits": 0.29009483754634857, + "step": 1818 + }, + { + "epoch": 0.056875, + "grad_norm": 4.5625, + "grad_norm_var": 0.11044514973958333, + "learning_rate": 0.0001, + "loss": 7.4099, + "loss/crossentropy": 2.7755016088485718, + "loss/hidden": 1.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.27359677851200104, + "step": 1820 + }, + { + "epoch": 0.0569375, + "grad_norm": 4.1875, + "grad_norm_var": 0.1587890625, + "learning_rate": 0.0001, + "loss": 6.747, + "loss/crossentropy": 2.4457638263702393, + "loss/hidden": 1.828125, + "loss/jsd": 0.0, + "loss/logits": 0.24731196463108063, + "step": 1822 + }, + { + "epoch": 0.057, + "grad_norm": 4.34375, + "grad_norm_var": 0.158056640625, + "learning_rate": 0.0001, + "loss": 7.3074, + "loss/crossentropy": 2.709230422973633, + "loss/hidden": 1.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.2711452543735504, + "step": 1824 + }, + { + "epoch": 0.0570625, + "grad_norm": 5.15625, + "grad_norm_var": 0.14768473307291666, + "learning_rate": 0.0001, + "loss": 7.6037, + "loss/crossentropy": 2.705008387565613, + "loss/hidden": 1.95703125, + "loss/jsd": 0.0, + "loss/logits": 0.29416972398757935, + "step": 1826 + }, + { + "epoch": 0.057125, + "grad_norm": 4.34375, + "grad_norm_var": 0.15325520833333334, + "learning_rate": 0.0001, + "loss": 6.9901, + "loss/crossentropy": 2.60453999042511, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2533971518278122, + "step": 1828 + }, + { + "epoch": 0.0571875, + "grad_norm": 4.625, + "grad_norm_var": 0.13787434895833334, + "learning_rate": 0.0001, + "loss": 7.1899, + "loss/crossentropy": 2.5849435329437256, + "loss/hidden": 1.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2659648209810257, + "step": 1830 + }, + { + "epoch": 0.05725, + "grad_norm": 4.625, + "grad_norm_var": 0.08670247395833333, + "learning_rate": 0.0001, + "loss": 7.0161, + "loss/crossentropy": 2.420079231262207, + "loss/hidden": 1.93359375, + "loss/jsd": 0.0, + "loss/logits": 0.2662433907389641, + "step": 1832 + }, + { + "epoch": 0.0573125, + "grad_norm": 4.34375, + "grad_norm_var": 0.08098958333333334, + "learning_rate": 0.0001, + "loss": 6.947, + "loss/crossentropy": 2.5382049083709717, + "loss/hidden": 1.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2494724839925766, + "step": 1834 + }, + { + "epoch": 0.057375, + "grad_norm": 4.59375, + "grad_norm_var": 0.07668863932291667, + "learning_rate": 0.0001, + "loss": 7.4878, + "loss/crossentropy": 2.8954910039901733, + "loss/hidden": 1.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2709462493658066, + "step": 1836 + }, + { + "epoch": 0.0574375, + "grad_norm": 4.1875, + "grad_norm_var": 0.06139322916666667, + "learning_rate": 0.0001, + "loss": 7.0911, + "loss/crossentropy": 2.6605488061904907, + "loss/hidden": 1.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.2532087415456772, + "step": 1838 + }, + { + "epoch": 0.0575, + "grad_norm": 4.6875, + "grad_norm_var": 0.07224934895833333, + "learning_rate": 0.0001, + "loss": 7.4682, + "loss/crossentropy": 2.7157968282699585, + "loss/hidden": 1.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.28500914573669434, + "step": 1840 + }, + { + "epoch": 0.0575625, + "grad_norm": 4.0, + "grad_norm_var": 0.07867431640625, + "learning_rate": 0.0001, + "loss": 6.9401, + "loss/crossentropy": 2.514353036880493, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.26054561138153076, + "step": 1842 + }, + { + "epoch": 0.057625, + "grad_norm": 4.75, + "grad_norm_var": 0.11404622395833333, + "learning_rate": 0.0001, + "loss": 7.3218, + "loss/crossentropy": 2.6728512048721313, + "loss/hidden": 1.9375, + "loss/jsd": 0.0, + "loss/logits": 0.27114808559417725, + "step": 1844 + }, + { + "epoch": 0.0576875, + "grad_norm": 4.4375, + "grad_norm_var": 0.11599934895833333, + "learning_rate": 0.0001, + "loss": 6.7175, + "loss/crossentropy": 2.397219181060791, + "loss/hidden": 1.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2374936044216156, + "step": 1846 + }, + { + "epoch": 0.05775, + "grad_norm": 5.125, + "grad_norm_var": 0.145556640625, + "learning_rate": 0.0001, + "loss": 6.8971, + "loss/crossentropy": 2.435696005821228, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.25512565672397614, + "step": 1848 + }, + { + "epoch": 0.0578125, + "grad_norm": 4.78125, + "grad_norm_var": 0.14934895833333334, + "learning_rate": 0.0001, + "loss": 7.4901, + "loss/crossentropy": 2.7314751148223877, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.2840634733438492, + "step": 1850 + }, + { + "epoch": 0.057875, + "grad_norm": 5.125, + "grad_norm_var": 0.16090087890625, + "learning_rate": 0.0001, + "loss": 7.3878, + "loss/crossentropy": 2.701223373413086, + "loss/hidden": 1.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.275688573718071, + "step": 1852 + }, + { + "epoch": 0.0579375, + "grad_norm": 4.625, + "grad_norm_var": 0.16480712890625, + "learning_rate": 0.0001, + "loss": 7.1744, + "loss/crossentropy": 2.564082622528076, + "loss/hidden": 1.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.27274955809116364, + "step": 1854 + }, + { + "epoch": 0.058, + "grad_norm": 4.375, + "grad_norm_var": 0.16873372395833333, + "learning_rate": 0.0001, + "loss": 7.1284, + "loss/crossentropy": 2.5915364027023315, + "loss/hidden": 1.89453125, + "loss/jsd": 0.0, + "loss/logits": 0.2642373740673065, + "step": 1856 + }, + { + "epoch": 0.0580625, + "grad_norm": 5.3125, + "grad_norm_var": 0.15331624348958334, + "learning_rate": 0.0001, + "loss": 7.493, + "loss/crossentropy": 2.7819111347198486, + "loss/hidden": 1.9375, + "loss/jsd": 0.0, + "loss/logits": 0.27735432982444763, + "step": 1858 + }, + { + "epoch": 0.058125, + "grad_norm": 5.96875, + "grad_norm_var": 0.22011311848958334, + "learning_rate": 0.0001, + "loss": 7.3859, + "loss/crossentropy": 2.679842948913574, + "loss/hidden": 1.90625, + "loss/jsd": 0.0, + "loss/logits": 0.27998194098472595, + "step": 1860 + }, + { + "epoch": 0.0581875, + "grad_norm": 4.65625, + "grad_norm_var": 0.20553385416666667, + "learning_rate": 0.0001, + "loss": 6.9421, + "loss/crossentropy": 2.5038408041000366, + "loss/hidden": 1.82421875, + "loss/jsd": 0.0, + "loss/logits": 0.2614058554172516, + "step": 1862 + }, + { + "epoch": 0.05825, + "grad_norm": 4.71875, + "grad_norm_var": 0.17626546223958334, + "learning_rate": 0.0001, + "loss": 7.2092, + "loss/crossentropy": 2.6554737091064453, + "loss/hidden": 1.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.26396705210208893, + "step": 1864 + }, + { + "epoch": 0.0583125, + "grad_norm": 4.34375, + "grad_norm_var": 0.18982747395833333, + "learning_rate": 0.0001, + "loss": 7.1009, + "loss/crossentropy": 2.6473816633224487, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.26058557629585266, + "step": 1866 + }, + { + "epoch": 0.058375, + "grad_norm": 4.59375, + "grad_norm_var": 0.1978515625, + "learning_rate": 0.0001, + "loss": 7.3526, + "loss/crossentropy": 2.696037769317627, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.273857519030571, + "step": 1868 + }, + { + "epoch": 0.0584375, + "grad_norm": 4.71875, + "grad_norm_var": 0.18058268229166666, + "learning_rate": 0.0001, + "loss": 6.9596, + "loss/crossentropy": 2.452818512916565, + "loss/hidden": 1.94921875, + "loss/jsd": 0.0, + "loss/logits": 0.2557523772120476, + "step": 1870 + }, + { + "epoch": 0.0585, + "grad_norm": 4.53125, + "grad_norm_var": 0.17144775390625, + "learning_rate": 0.0001, + "loss": 7.3039, + "loss/crossentropy": 2.809278964996338, + "loss/hidden": 1.86328125, + "loss/jsd": 0.0, + "loss/logits": 0.2631368637084961, + "step": 1872 + }, + { + "epoch": 0.0585625, + "grad_norm": 6.53125, + "grad_norm_var": 0.36578369140625, + "learning_rate": 0.0001, + "loss": 7.12, + "loss/crossentropy": 2.583288073539734, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.26265595853328705, + "step": 1874 + }, + { + "epoch": 0.058625, + "grad_norm": 4.125, + "grad_norm_var": 0.3079427083333333, + "learning_rate": 0.0001, + "loss": 6.8005, + "loss/crossentropy": 2.5182260274887085, + "loss/hidden": 1.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.23525764048099518, + "step": 1876 + }, + { + "epoch": 0.0586875, + "grad_norm": 4.03125, + "grad_norm_var": 0.3358072916666667, + "learning_rate": 0.0001, + "loss": 7.1627, + "loss/crossentropy": 2.6038613319396973, + "loss/hidden": 1.92578125, + "loss/jsd": 0.0, + "loss/logits": 0.2633034437894821, + "step": 1878 + }, + { + "epoch": 0.05875, + "grad_norm": 6.46875, + "grad_norm_var": 0.55992431640625, + "learning_rate": 0.0001, + "loss": 7.0627, + "loss/crossentropy": 2.5273290872573853, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.2687753736972809, + "step": 1880 + }, + { + "epoch": 0.0588125, + "grad_norm": 4.6875, + "grad_norm_var": 0.5538899739583333, + "learning_rate": 0.0001, + "loss": 7.1662, + "loss/crossentropy": 2.563568949699402, + "loss/hidden": 1.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.27159547805786133, + "step": 1882 + }, + { + "epoch": 0.058875, + "grad_norm": 4.53125, + "grad_norm_var": 0.540625, + "learning_rate": 0.0001, + "loss": 7.0803, + "loss/crossentropy": 2.534513831138611, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.2635657638311386, + "step": 1884 + }, + { + "epoch": 0.0589375, + "grad_norm": 4.34375, + "grad_norm_var": 0.56256103515625, + "learning_rate": 0.0001, + "loss": 6.81, + "loss/crossentropy": 2.3854469060897827, + "loss/hidden": 1.89453125, + "loss/jsd": 0.0, + "loss/logits": 0.2530023232102394, + "step": 1886 + }, + { + "epoch": 0.059, + "grad_norm": 4.9375, + "grad_norm_var": 0.558447265625, + "learning_rate": 0.0001, + "loss": 7.5175, + "loss/crossentropy": 2.8702194690704346, + "loss/hidden": 1.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.2760562151670456, + "step": 1888 + }, + { + "epoch": 0.0590625, + "grad_norm": 4.5, + "grad_norm_var": 0.33114827473958336, + "learning_rate": 0.0001, + "loss": 7.3912, + "loss/crossentropy": 2.72122585773468, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.2752014696598053, + "step": 1890 + }, + { + "epoch": 0.059125, + "grad_norm": 4.96875, + "grad_norm_var": 0.3114217122395833, + "learning_rate": 0.0001, + "loss": 7.2087, + "loss/crossentropy": 2.669129967689514, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.26606956124305725, + "step": 1892 + }, + { + "epoch": 0.0591875, + "grad_norm": 4.5625, + "grad_norm_var": 0.29269205729166664, + "learning_rate": 0.0001, + "loss": 6.8762, + "loss/crossentropy": 2.4340004920959473, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.2563306391239166, + "step": 1894 + }, + { + "epoch": 0.05925, + "grad_norm": 4.875, + "grad_norm_var": 0.08118082682291666, + "learning_rate": 0.0001, + "loss": 6.9578, + "loss/crossentropy": 2.363881826400757, + "loss/hidden": 1.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.2695441246032715, + "step": 1896 + }, + { + "epoch": 0.0593125, + "grad_norm": 4.875, + "grad_norm_var": 0.14895833333333333, + "learning_rate": 0.0001, + "loss": 7.2834, + "loss/crossentropy": 2.5548282861709595, + "loss/hidden": 1.90625, + "loss/jsd": 0.0, + "loss/logits": 0.2822292298078537, + "step": 1898 + }, + { + "epoch": 0.059375, + "grad_norm": 4.75, + "grad_norm_var": 0.15206705729166667, + "learning_rate": 0.0001, + "loss": 7.3495, + "loss/crossentropy": 2.674235224723816, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.27845965325832367, + "step": 1900 + }, + { + "epoch": 0.0594375, + "grad_norm": 7.15625, + "grad_norm_var": 0.47903238932291664, + "learning_rate": 0.0001, + "loss": 7.4386, + "loss/crossentropy": 2.8135560750961304, + "loss/hidden": 1.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.27578939497470856, + "step": 1902 + }, + { + "epoch": 0.0595, + "grad_norm": 4.65625, + "grad_norm_var": 0.47470296223958336, + "learning_rate": 0.0001, + "loss": 7.3103, + "loss/crossentropy": 2.764998435974121, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.26663658022880554, + "step": 1904 + }, + { + "epoch": 0.0595625, + "grad_norm": 4.4375, + "grad_norm_var": 0.517041015625, + "learning_rate": 0.0001, + "loss": 7.5119, + "loss/crossentropy": 2.6685441732406616, + "loss/hidden": 1.94140625, + "loss/jsd": 0.0, + "loss/logits": 0.2901953458786011, + "step": 1906 + }, + { + "epoch": 0.059625, + "grad_norm": 5.1875, + "grad_norm_var": 0.5104817708333333, + "learning_rate": 0.0001, + "loss": 7.0694, + "loss/crossentropy": 2.5350207090377808, + "loss/hidden": 1.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2667199671268463, + "step": 1908 + }, + { + "epoch": 0.0596875, + "grad_norm": 4.84375, + "grad_norm_var": 0.49635416666666665, + "learning_rate": 0.0001, + "loss": 7.0891, + "loss/crossentropy": 2.5018303394317627, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.27083566784858704, + "step": 1910 + }, + { + "epoch": 0.05975, + "grad_norm": 4.75, + "grad_norm_var": 0.518212890625, + "learning_rate": 0.0001, + "loss": 6.9844, + "loss/crossentropy": 2.463659405708313, + "loss/hidden": 1.87109375, + "loss/jsd": 0.0, + "loss/logits": 0.2649676352739334, + "step": 1912 + }, + { + "epoch": 0.0598125, + "grad_norm": 4.78125, + "grad_norm_var": 0.480859375, + "learning_rate": 0.0001, + "loss": 7.5348, + "loss/crossentropy": 2.796781539916992, + "loss/hidden": 1.9453125, + "loss/jsd": 0.0, + "loss/logits": 0.2792717218399048, + "step": 1914 + }, + { + "epoch": 0.059875, + "grad_norm": 5.0, + "grad_norm_var": 0.45636393229166666, + "learning_rate": 0.0001, + "loss": 7.4205, + "loss/crossentropy": 2.717116117477417, + "loss/hidden": 1.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.2726837396621704, + "step": 1916 + }, + { + "epoch": 0.0599375, + "grad_norm": 5.1875, + "grad_norm_var": 0.14191080729166666, + "learning_rate": 0.0001, + "loss": 7.9693, + "loss/crossentropy": 3.0037057399749756, + "loss/hidden": 1.93359375, + "loss/jsd": 0.0, + "loss/logits": 0.30319735407829285, + "step": 1918 + }, + { + "epoch": 0.06, + "grad_norm": 4.59375, + "grad_norm_var": 0.14514567057291666, + "learning_rate": 0.0001, + "loss": 7.5668, + "loss/crossentropy": 2.965430498123169, + "loss/hidden": 1.92578125, + "loss/jsd": 0.0, + "loss/logits": 0.2675633579492569, + "step": 1920 + }, + { + "epoch": 0.0600625, + "grad_norm": 4.25, + "grad_norm_var": 0.10286051432291667, + "learning_rate": 0.0001, + "loss": 6.6429, + "loss/crossentropy": 2.376823306083679, + "loss/hidden": 1.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.24300982803106308, + "step": 1922 + }, + { + "epoch": 0.060125, + "grad_norm": 4.96875, + "grad_norm_var": 0.089697265625, + "learning_rate": 0.0001, + "loss": 6.9851, + "loss/crossentropy": 2.5577648878097534, + "loss/hidden": 1.94140625, + "loss/jsd": 0.0, + "loss/logits": 0.2485908716917038, + "step": 1924 + }, + { + "epoch": 0.0601875, + "grad_norm": 4.9375, + "grad_norm_var": 0.0943359375, + "learning_rate": 0.0001, + "loss": 7.3381, + "loss/crossentropy": 2.583045721054077, + "loss/hidden": 1.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.28253433108329773, + "step": 1926 + }, + { + "epoch": 0.06025, + "grad_norm": 5.3125, + "grad_norm_var": 0.37906494140625, + "learning_rate": 0.0001, + "loss": 7.8211, + "loss/crossentropy": 3.004792094230652, + "loss/hidden": 1.92578125, + "loss/jsd": 0.0, + "loss/logits": 0.2890557497739792, + "step": 1928 + }, + { + "epoch": 0.0603125, + "grad_norm": 4.625, + "grad_norm_var": 0.374853515625, + "learning_rate": 0.0001, + "loss": 7.1874, + "loss/crossentropy": 2.61250638961792, + "loss/hidden": 1.921875, + "loss/jsd": 0.0, + "loss/logits": 0.2653019577264786, + "step": 1930 + }, + { + "epoch": 0.060375, + "grad_norm": 4.25, + "grad_norm_var": 0.408447265625, + "learning_rate": 0.0001, + "loss": 7.2217, + "loss/crossentropy": 2.6708651781082153, + "loss/hidden": 1.89453125, + "loss/jsd": 0.0, + "loss/logits": 0.2656322345137596, + "step": 1932 + }, + { + "epoch": 0.0604375, + "grad_norm": 6.5, + "grad_norm_var": 0.54166259765625, + "learning_rate": 0.0001, + "loss": 7.7014, + "loss/crossentropy": 2.7332130670547485, + "loss/hidden": 1.90625, + "loss/jsd": 0.0, + "loss/logits": 0.3061896413564682, + "step": 1934 + }, + { + "epoch": 0.0605, + "grad_norm": 4.65625, + "grad_norm_var": 0.5347615559895833, + "learning_rate": 0.0001, + "loss": 7.3152, + "loss/crossentropy": 2.6586803197860718, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.27776092290878296, + "step": 1936 + }, + { + "epoch": 0.0605625, + "grad_norm": 6.28125, + "grad_norm_var": 0.5535807291666667, + "learning_rate": 0.0001, + "loss": 7.6632, + "loss/crossentropy": 2.7831838130950928, + "loss/hidden": 1.94140625, + "loss/jsd": 0.0, + "loss/logits": 0.2938612252473831, + "step": 1938 + }, + { + "epoch": 0.060625, + "grad_norm": 4.875, + "grad_norm_var": 0.5571573893229167, + "learning_rate": 0.0001, + "loss": 7.1143, + "loss/crossentropy": 2.658159375190735, + "loss/hidden": 1.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.25889691710472107, + "step": 1940 + }, + { + "epoch": 0.0606875, + "grad_norm": 9.3125, + "grad_norm_var": 1.6083943684895834, + "learning_rate": 0.0001, + "loss": 7.8654, + "loss/crossentropy": 2.923017978668213, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.30244556069374084, + "step": 1942 + }, + { + "epoch": 0.06075, + "grad_norm": 18.5, + "grad_norm_var": 12.256766764322917, + "learning_rate": 0.0001, + "loss": 7.4521, + "loss/crossentropy": 2.684673309326172, + "loss/hidden": 1.96875, + "loss/jsd": 0.0, + "loss/logits": 0.2798672914505005, + "step": 1944 + }, + { + "epoch": 0.0608125, + "grad_norm": 4.625, + "grad_norm_var": 12.309244791666666, + "learning_rate": 0.0001, + "loss": 7.3468, + "loss/crossentropy": 2.713373899459839, + "loss/hidden": 1.87109375, + "loss/jsd": 0.0, + "loss/logits": 0.27623049914836884, + "step": 1946 + }, + { + "epoch": 0.060875, + "grad_norm": 5.65625, + "grad_norm_var": 12.200907389322916, + "learning_rate": 0.0001, + "loss": 7.3181, + "loss/crossentropy": 2.609257936477661, + "loss/hidden": 1.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2794778645038605, + "step": 1948 + }, + { + "epoch": 0.0609375, + "grad_norm": 4.5, + "grad_norm_var": 12.456343587239584, + "learning_rate": 0.0001, + "loss": 6.7497, + "loss/crossentropy": 2.5331573486328125, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.24157867580652237, + "step": 1950 + }, + { + "epoch": 0.061, + "grad_norm": 4.96875, + "grad_norm_var": 12.452197265625, + "learning_rate": 0.0001, + "loss": 7.6656, + "loss/crossentropy": 2.8852624893188477, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.28624095022678375, + "step": 1952 + }, + { + "epoch": 0.0610625, + "grad_norm": 4.71875, + "grad_norm_var": 12.616402180989583, + "learning_rate": 0.0001, + "loss": 7.1288, + "loss/crossentropy": 2.5084198713302612, + "loss/hidden": 1.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.26907259225845337, + "step": 1954 + }, + { + "epoch": 0.061125, + "grad_norm": 4.34375, + "grad_norm_var": 12.776590983072916, + "learning_rate": 0.0001, + "loss": 7.1834, + "loss/crossentropy": 2.7280231714248657, + "loss/hidden": 1.85546875, + "loss/jsd": 0.0, + "loss/logits": 0.25999484956264496, + "step": 1956 + }, + { + "epoch": 0.0611875, + "grad_norm": 4.5625, + "grad_norm_var": 12.005952962239583, + "learning_rate": 0.0001, + "loss": 7.1353, + "loss/crossentropy": 2.5946719646453857, + "loss/hidden": 1.86328125, + "loss/jsd": 0.0, + "loss/logits": 0.26773855835199356, + "step": 1958 + }, + { + "epoch": 0.06125, + "grad_norm": 4.84375, + "grad_norm_var": 0.11272379557291666, + "learning_rate": 0.0001, + "loss": 7.3448, + "loss/crossentropy": 2.718293786048889, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.27085064351558685, + "step": 1960 + }, + { + "epoch": 0.0613125, + "grad_norm": 4.8125, + "grad_norm_var": 0.11471354166666667, + "learning_rate": 0.0001, + "loss": 7.2067, + "loss/crossentropy": 2.6649069786071777, + "loss/hidden": 1.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.2643337845802307, + "step": 1962 + }, + { + "epoch": 0.061375, + "grad_norm": 5.03125, + "grad_norm_var": 0.07021077473958333, + "learning_rate": 0.0001, + "loss": 7.0607, + "loss/crossentropy": 2.586350440979004, + "loss/hidden": 1.83984375, + "loss/jsd": 0.0, + "loss/logits": 0.26345033198595047, + "step": 1964 + }, + { + "epoch": 0.0614375, + "grad_norm": 4.0625, + "grad_norm_var": 0.09211832682291667, + "learning_rate": 0.0001, + "loss": 7.3764, + "loss/crossentropy": 2.824537992477417, + "loss/hidden": 1.86328125, + "loss/jsd": 0.0, + "loss/logits": 0.2688605338335037, + "step": 1966 + }, + { + "epoch": 0.0615, + "grad_norm": 4.9375, + "grad_norm_var": 0.09081624348958334, + "learning_rate": 0.0001, + "loss": 7.2516, + "loss/crossentropy": 2.66815447807312, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.27045372128486633, + "step": 1968 + }, + { + "epoch": 0.0615625, + "grad_norm": 4.5, + "grad_norm_var": 0.09013264973958333, + "learning_rate": 0.0001, + "loss": 7.1873, + "loss/crossentropy": 2.714627504348755, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2621074616909027, + "step": 1970 + }, + { + "epoch": 0.061625, + "grad_norm": 4.40625, + "grad_norm_var": 0.083984375, + "learning_rate": 0.0001, + "loss": 7.148, + "loss/crossentropy": 2.732495427131653, + "loss/hidden": 1.83203125, + "loss/jsd": 0.0, + "loss/logits": 0.2583463042974472, + "step": 1972 + }, + { + "epoch": 0.0616875, + "grad_norm": 6.875, + "grad_norm_var": 0.403369140625, + "learning_rate": 0.0001, + "loss": 7.7964, + "loss/crossentropy": 2.6379644870758057, + "loss/hidden": 1.984375, + "loss/jsd": 0.0, + "loss/logits": 0.31740613281726837, + "step": 1974 + }, + { + "epoch": 0.06175, + "grad_norm": 5.125, + "grad_norm_var": 0.7941243489583333, + "learning_rate": 0.0001, + "loss": 7.8055, + "loss/crossentropy": 2.8083606958389282, + "loss/hidden": 2.01171875, + "loss/jsd": 0.0, + "loss/logits": 0.29854556918144226, + "step": 1976 + }, + { + "epoch": 0.0618125, + "grad_norm": 4.6875, + "grad_norm_var": 0.7879191080729167, + "learning_rate": 0.0001, + "loss": 6.9407, + "loss/crossentropy": 2.48198664188385, + "loss/hidden": 1.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.25602778792381287, + "step": 1978 + }, + { + "epoch": 0.061875, + "grad_norm": 4.8125, + "grad_norm_var": 0.75924072265625, + "learning_rate": 0.0001, + "loss": 7.4327, + "loss/crossentropy": 2.723036050796509, + "loss/hidden": 1.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2826816141605377, + "step": 1980 + }, + { + "epoch": 0.0619375, + "grad_norm": 4.75, + "grad_norm_var": 0.7576456705729167, + "learning_rate": 0.0001, + "loss": 6.9662, + "loss/crossentropy": 2.535414457321167, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2540201246738434, + "step": 1982 + }, + { + "epoch": 0.062, + "grad_norm": 4.40625, + "grad_norm_var": 0.7925130208333333, + "learning_rate": 0.0001, + "loss": 7.1227, + "loss/crossentropy": 2.690464735031128, + "loss/hidden": 1.84375, + "loss/jsd": 0.0, + "loss/logits": 0.25885075330734253, + "step": 1984 + }, + { + "epoch": 0.0620625, + "grad_norm": 4.0625, + "grad_norm_var": 0.82467041015625, + "learning_rate": 0.0001, + "loss": 7.2165, + "loss/crossentropy": 2.705548882484436, + "loss/hidden": 1.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.26750563085079193, + "step": 1986 + }, + { + "epoch": 0.062125, + "grad_norm": 4.59375, + "grad_norm_var": 0.8031901041666667, + "learning_rate": 0.0001, + "loss": 7.3331, + "loss/crossentropy": 2.8012691736221313, + "loss/hidden": 1.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.26646314561367035, + "step": 1988 + }, + { + "epoch": 0.0621875, + "grad_norm": 4.34375, + "grad_norm_var": 0.5442545572916667, + "learning_rate": 0.0001, + "loss": 6.8132, + "loss/crossentropy": 2.3716607093811035, + "loss/hidden": 1.86328125, + "loss/jsd": 0.0, + "loss/logits": 0.2578277885913849, + "step": 1990 + }, + { + "epoch": 0.06225, + "grad_norm": 4.625, + "grad_norm_var": 0.9739217122395833, + "learning_rate": 0.0001, + "loss": 7.441, + "loss/crossentropy": 2.5655524730682373, + "loss/hidden": 1.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.29887592792510986, + "step": 1992 + }, + { + "epoch": 0.0623125, + "grad_norm": 5.875, + "grad_norm_var": 1.05181884765625, + "learning_rate": 0.0001, + "loss": 7.5642, + "loss/crossentropy": 2.728899836540222, + "loss/hidden": 1.984375, + "loss/jsd": 0.0, + "loss/logits": 0.2850935161113739, + "step": 1994 + }, + { + "epoch": 0.062375, + "grad_norm": 8.25, + "grad_norm_var": 1.738134765625, + "learning_rate": 0.0001, + "loss": 7.8895, + "loss/crossentropy": 2.779134511947632, + "loss/hidden": 2.0, + "loss/jsd": 0.0, + "loss/logits": 0.31103692948818207, + "step": 1996 + }, + { + "epoch": 0.0624375, + "grad_norm": 4.59375, + "grad_norm_var": 1.6788899739583334, + "learning_rate": 0.0001, + "loss": 7.2849, + "loss/crossentropy": 2.6748945713043213, + "loss/hidden": 1.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.27271807193756104, + "step": 1998 + }, + { + "epoch": 0.0625, + "grad_norm": 4.875, + "grad_norm_var": 2.008658854166667, + "learning_rate": 0.0001, + "loss": 7.8282, + "loss/crossentropy": 2.801780104637146, + "loss/hidden": 1.9765625, + "loss/jsd": 0.0, + "loss/logits": 0.30498483777046204, + "step": 2000 + }, + { + "epoch": 0.0625625, + "grad_norm": 4.8125, + "grad_norm_var": 1.9149576822916667, + "learning_rate": 0.0001, + "loss": 6.9057, + "loss/crossentropy": 2.453532576560974, + "loss/hidden": 1.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2584947943687439, + "step": 2002 + }, + { + "epoch": 0.062625, + "grad_norm": 4.71875, + "grad_norm_var": 1.8465983072916667, + "learning_rate": 0.0001, + "loss": 6.7113, + "loss/crossentropy": 2.292815327644348, + "loss/hidden": 1.94140625, + "loss/jsd": 0.0, + "loss/logits": 0.2477075159549713, + "step": 2004 + }, + { + "epoch": 0.0626875, + "grad_norm": 4.6875, + "grad_norm_var": 1.9022786458333334, + "learning_rate": 0.0001, + "loss": 7.4308, + "loss/crossentropy": 2.834795117378235, + "loss/hidden": 1.89453125, + "loss/jsd": 0.0, + "loss/logits": 0.27014750242233276, + "step": 2006 + }, + { + "epoch": 0.06275, + "grad_norm": 4.875, + "grad_norm_var": 1.36256103515625, + "learning_rate": 0.0001, + "loss": 7.6104, + "loss/crossentropy": 2.861023187637329, + "loss/hidden": 1.93359375, + "loss/jsd": 0.0, + "loss/logits": 0.2815757244825363, + "step": 2008 + }, + { + "epoch": 0.0628125, + "grad_norm": 4.375, + "grad_norm_var": 1.40963134765625, + "learning_rate": 0.0001, + "loss": 7.0763, + "loss/crossentropy": 2.5962413549423218, + "loss/hidden": 1.86328125, + "loss/jsd": 0.0, + "loss/logits": 0.2616799771785736, + "step": 2010 + }, + { + "epoch": 0.062875, + "grad_norm": 4.40625, + "grad_norm_var": 0.7212239583333333, + "learning_rate": 0.0001, + "loss": 7.2025, + "loss/crossentropy": 2.6350537538528442, + "loss/hidden": 1.9375, + "loss/jsd": 0.0, + "loss/logits": 0.26299260556697845, + "step": 2012 + }, + { + "epoch": 0.0629375, + "grad_norm": 4.34375, + "grad_norm_var": 0.7535441080729167, + "learning_rate": 0.0001, + "loss": 7.032, + "loss/crossentropy": 2.6000778675079346, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.25842562317848206, + "step": 2014 + }, + { + "epoch": 0.063, + "grad_norm": 4.125, + "grad_norm_var": 0.07913004557291667, + "learning_rate": 0.0001, + "loss": 7.0661, + "loss/crossentropy": 2.6812326908111572, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2564576119184494, + "step": 2016 + }, + { + "epoch": 0.0630625, + "grad_norm": 5.03125, + "grad_norm_var": 0.093603515625, + "learning_rate": 0.0001, + "loss": 7.2916, + "loss/crossentropy": 2.6755926609039307, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.2705804705619812, + "step": 2018 + }, + { + "epoch": 0.063125, + "grad_norm": 4.15625, + "grad_norm_var": 0.12945556640625, + "learning_rate": 0.0001, + "loss": 7.1084, + "loss/crossentropy": 2.57517671585083, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.2654324173927307, + "step": 2020 + }, + { + "epoch": 0.0631875, + "grad_norm": 5.4375, + "grad_norm_var": 0.193212890625, + "learning_rate": 0.0001, + "loss": 7.435, + "loss/crossentropy": 2.6259225606918335, + "loss/hidden": 1.97265625, + "loss/jsd": 0.0, + "loss/logits": 0.28364163637161255, + "step": 2022 + }, + { + "epoch": 0.06325, + "grad_norm": 4.78125, + "grad_norm_var": 0.203759765625, + "learning_rate": 0.0001, + "loss": 7.0103, + "loss/crossentropy": 2.5401690006256104, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2657654732465744, + "step": 2024 + }, + { + "epoch": 0.0633125, + "grad_norm": 4.84375, + "grad_norm_var": 0.201171875, + "learning_rate": 0.0001, + "loss": 6.892, + "loss/crossentropy": 2.448048710823059, + "loss/hidden": 1.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.2557239532470703, + "step": 2026 + }, + { + "epoch": 0.063375, + "grad_norm": 5.15625, + "grad_norm_var": 0.210400390625, + "learning_rate": 0.0001, + "loss": 7.1379, + "loss/crossentropy": 2.5381767749786377, + "loss/hidden": 1.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.26856717467308044, + "step": 2028 + }, + { + "epoch": 0.0634375, + "grad_norm": 4.625, + "grad_norm_var": 0.19138997395833332, + "learning_rate": 0.0001, + "loss": 7.3956, + "loss/crossentropy": 2.7830891609191895, + "loss/hidden": 1.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.2714068740606308, + "step": 2030 + }, + { + "epoch": 0.0635, + "grad_norm": 5.96875, + "grad_norm_var": 0.2626261393229167, + "learning_rate": 0.0001, + "loss": 7.0564, + "loss/crossentropy": 2.473397970199585, + "loss/hidden": 1.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2668927237391472, + "step": 2032 + }, + { + "epoch": 0.0635625, + "grad_norm": 4.6875, + "grad_norm_var": 0.26495768229166666, + "learning_rate": 0.0001, + "loss": 7.0239, + "loss/crossentropy": 2.6535342931747437, + "loss/hidden": 1.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2526656314730644, + "step": 2034 + }, + { + "epoch": 0.063625, + "grad_norm": 4.96875, + "grad_norm_var": 0.21588541666666666, + "learning_rate": 0.0001, + "loss": 7.3198, + "loss/crossentropy": 2.62338125705719, + "loss/hidden": 1.92578125, + "loss/jsd": 0.0, + "loss/logits": 0.2770598828792572, + "step": 2036 + }, + { + "epoch": 0.0636875, + "grad_norm": 4.3125, + "grad_norm_var": 0.20494791666666667, + "learning_rate": 0.0001, + "loss": 6.834, + "loss/crossentropy": 2.4843112230300903, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2544984146952629, + "step": 2038 + }, + { + "epoch": 0.06375, + "grad_norm": 4.375, + "grad_norm_var": 0.19140625, + "learning_rate": 0.0001, + "loss": 7.0979, + "loss/crossentropy": 2.6469043493270874, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2599470168352127, + "step": 2040 + }, + { + "epoch": 0.0638125, + "grad_norm": 4.5, + "grad_norm_var": 0.19073893229166666, + "learning_rate": 0.0001, + "loss": 6.9246, + "loss/crossentropy": 2.4736326932907104, + "loss/hidden": 1.875, + "loss/jsd": 0.0, + "loss/logits": 0.2576009929180145, + "step": 2042 + }, + { + "epoch": 0.063875, + "grad_norm": 4.1875, + "grad_norm_var": 0.19659830729166666, + "learning_rate": 0.0001, + "loss": 6.8111, + "loss/crossentropy": 2.47759473323822, + "loss/hidden": 1.828125, + "loss/jsd": 0.0, + "loss/logits": 0.25053443014621735, + "step": 2044 + }, + { + "epoch": 0.0639375, + "grad_norm": 4.59375, + "grad_norm_var": 0.20572916666666666, + "learning_rate": 0.0001, + "loss": 6.8598, + "loss/crossentropy": 2.4240305423736572, + "loss/hidden": 1.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2576344683766365, + "step": 2046 + }, + { + "epoch": 0.064, + "grad_norm": 4.78125, + "grad_norm_var": 0.071875, + "learning_rate": 0.0001, + "loss": 7.2763, + "loss/crossentropy": 2.636983275413513, + "loss/hidden": 1.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.27525630593299866, + "step": 2048 + }, + { + "epoch": 0.0640625, + "grad_norm": 5.25, + "grad_norm_var": 0.13248697916666666, + "learning_rate": 0.0001, + "loss": 7.3148, + "loss/crossentropy": 2.602674722671509, + "loss/hidden": 1.90625, + "loss/jsd": 0.0, + "loss/logits": 0.2805839627981186, + "step": 2050 + }, + { + "epoch": 0.064125, + "grad_norm": 4.78125, + "grad_norm_var": 0.11145426432291666, + "learning_rate": 0.0001, + "loss": 7.5156, + "loss/crossentropy": 2.8106677532196045, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.2786969691514969, + "step": 2052 + }, + { + "epoch": 0.0641875, + "grad_norm": 4.28125, + "grad_norm_var": 0.11145426432291666, + "learning_rate": 0.0001, + "loss": 6.9285, + "loss/crossentropy": 2.535367965698242, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.258063942193985, + "step": 2054 + }, + { + "epoch": 0.06425, + "grad_norm": 4.125, + "grad_norm_var": 0.14810282389322918, + "learning_rate": 0.0001, + "loss": 7.0355, + "loss/crossentropy": 2.7271558046340942, + "loss/hidden": 1.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2480250746011734, + "step": 2056 + }, + { + "epoch": 0.0643125, + "grad_norm": 4.71875, + "grad_norm_var": 0.16648661295572917, + "learning_rate": 0.0001, + "loss": 7.6621, + "loss/crossentropy": 2.8376705646514893, + "loss/hidden": 1.90625, + "loss/jsd": 0.0, + "loss/logits": 0.2918137311935425, + "step": 2058 + }, + { + "epoch": 0.064375, + "grad_norm": 4.375, + "grad_norm_var": 0.16048075358072916, + "learning_rate": 0.0001, + "loss": 7.4221, + "loss/crossentropy": 2.738136410713196, + "loss/hidden": 1.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2769925594329834, + "step": 2060 + }, + { + "epoch": 0.0644375, + "grad_norm": 5.21875, + "grad_norm_var": 0.1754547119140625, + "learning_rate": 0.0001, + "loss": 7.5939, + "loss/crossentropy": 2.849211812019348, + "loss/hidden": 1.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.283063068985939, + "step": 2062 + }, + { + "epoch": 0.0645, + "grad_norm": 4.71875, + "grad_norm_var": 0.17480367024739582, + "learning_rate": 0.0001, + "loss": 7.5665, + "loss/crossentropy": 2.8930327892303467, + "loss/hidden": 1.875, + "loss/jsd": 0.0, + "loss/logits": 0.2798466980457306, + "step": 2064 + }, + { + "epoch": 0.0645625, + "grad_norm": 4.15625, + "grad_norm_var": 0.13145243326822917, + "learning_rate": 0.0001, + "loss": 6.9794, + "loss/crossentropy": 2.5419009923934937, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.25585878640413284, + "step": 2066 + }, + { + "epoch": 0.064625, + "grad_norm": 4.65625, + "grad_norm_var": 0.1286773681640625, + "learning_rate": 0.0001, + "loss": 7.3639, + "loss/crossentropy": 2.8139398097991943, + "loss/hidden": 1.85546875, + "loss/jsd": 0.0, + "loss/logits": 0.2694520950317383, + "step": 2068 + }, + { + "epoch": 0.0646875, + "grad_norm": 4.3125, + "grad_norm_var": 0.1372955322265625, + "learning_rate": 0.0001, + "loss": 7.0513, + "loss/crossentropy": 2.6727343797683716, + "loss/hidden": 1.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.25425978749990463, + "step": 2070 + }, + { + "epoch": 0.06475, + "grad_norm": 4.65625, + "grad_norm_var": 0.09823811848958333, + "learning_rate": 0.0001, + "loss": 7.0751, + "loss/crossentropy": 2.564603090286255, + "loss/hidden": 1.90625, + "loss/jsd": 0.0, + "loss/logits": 0.26042503118515015, + "step": 2072 + }, + { + "epoch": 0.0648125, + "grad_norm": 4.1875, + "grad_norm_var": 0.09440104166666667, + "learning_rate": 0.0001, + "loss": 7.3307, + "loss/crossentropy": 2.74035382270813, + "loss/hidden": 1.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.27035802602767944, + "step": 2074 + }, + { + "epoch": 0.064875, + "grad_norm": 4.3125, + "grad_norm_var": 0.08847249348958333, + "learning_rate": 0.0001, + "loss": 6.7706, + "loss/crossentropy": 2.4303542375564575, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.2539462745189667, + "step": 2076 + }, + { + "epoch": 0.0649375, + "grad_norm": 4.59375, + "grad_norm_var": 0.06291910807291666, + "learning_rate": 0.0001, + "loss": 7.1118, + "loss/crossentropy": 2.539111375808716, + "loss/hidden": 1.921875, + "loss/jsd": 0.0, + "loss/logits": 0.26507870107889175, + "step": 2078 + }, + { + "epoch": 0.065, + "grad_norm": 4.25, + "grad_norm_var": 0.083203125, + "learning_rate": 0.0001, + "loss": 7.1723, + "loss/crossentropy": 2.6623255014419556, + "loss/hidden": 1.859375, + "loss/jsd": 0.0, + "loss/logits": 0.26506394147872925, + "step": 2080 + }, + { + "epoch": 0.0650625, + "grad_norm": 4.3125, + "grad_norm_var": 0.079296875, + "learning_rate": 0.0001, + "loss": 6.9769, + "loss/crossentropy": 2.581335186958313, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2504977136850357, + "step": 2082 + }, + { + "epoch": 0.065125, + "grad_norm": 4.46875, + "grad_norm_var": 0.077734375, + "learning_rate": 0.0001, + "loss": 6.7248, + "loss/crossentropy": 2.428415536880493, + "loss/hidden": 1.86328125, + "loss/jsd": 0.0, + "loss/logits": 0.2433120310306549, + "step": 2084 + }, + { + "epoch": 0.0651875, + "grad_norm": 5.6875, + "grad_norm_var": 0.154541015625, + "learning_rate": 0.0001, + "loss": 7.3285, + "loss/crossentropy": 2.767895817756653, + "loss/hidden": 1.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.26739276945590973, + "step": 2086 + }, + { + "epoch": 0.06525, + "grad_norm": 4.6875, + "grad_norm_var": 0.1654296875, + "learning_rate": 0.0001, + "loss": 7.5187, + "loss/crossentropy": 2.7806981801986694, + "loss/hidden": 1.9296875, + "loss/jsd": 0.0, + "loss/logits": 0.2808331549167633, + "step": 2088 + }, + { + "epoch": 0.0653125, + "grad_norm": 4.65625, + "grad_norm_var": 0.1556640625, + "learning_rate": 0.0001, + "loss": 6.844, + "loss/crossentropy": 2.510632038116455, + "loss/hidden": 1.86328125, + "loss/jsd": 0.0, + "loss/logits": 0.24700827151536942, + "step": 2090 + }, + { + "epoch": 0.065375, + "grad_norm": 4.03125, + "grad_norm_var": 0.1995513916015625, + "learning_rate": 0.0001, + "loss": 6.7551, + "loss/crossentropy": 2.520743489265442, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.2453150451183319, + "step": 2092 + }, + { + "epoch": 0.0654375, + "grad_norm": 5.625, + "grad_norm_var": 0.2886383056640625, + "learning_rate": 0.0001, + "loss": 7.2018, + "loss/crossentropy": 2.580179214477539, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.2703619748353958, + "step": 2094 + }, + { + "epoch": 0.0655, + "grad_norm": 4.40625, + "grad_norm_var": 0.2673980712890625, + "learning_rate": 0.0001, + "loss": 7.1838, + "loss/crossentropy": 2.655561089515686, + "loss/hidden": 1.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.26454417407512665, + "step": 2096 + }, + { + "epoch": 0.0655625, + "grad_norm": 4.46875, + "grad_norm_var": 0.27241923014322916, + "learning_rate": 0.0001, + "loss": 7.056, + "loss/crossentropy": 2.610250473022461, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.2598103657364845, + "step": 2098 + }, + { + "epoch": 0.065625, + "grad_norm": 4.625, + "grad_norm_var": 0.2668853759765625, + "learning_rate": 0.0001, + "loss": 6.9496, + "loss/crossentropy": 2.5838887691497803, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.25180892646312714, + "step": 2100 + }, + { + "epoch": 0.0656875, + "grad_norm": 4.5, + "grad_norm_var": 0.1876373291015625, + "learning_rate": 0.0001, + "loss": 7.208, + "loss/crossentropy": 2.717166543006897, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2639300525188446, + "step": 2102 + }, + { + "epoch": 0.06575, + "grad_norm": 4.59375, + "grad_norm_var": 0.1708160400390625, + "learning_rate": 0.0001, + "loss": 6.649, + "loss/crossentropy": 2.3968560695648193, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.24005791544914246, + "step": 2104 + }, + { + "epoch": 0.0658125, + "grad_norm": 4.78125, + "grad_norm_var": 0.17692769368489583, + "learning_rate": 0.0001, + "loss": 7.0411, + "loss/crossentropy": 2.5781623125076294, + "loss/hidden": 1.86328125, + "loss/jsd": 0.0, + "loss/logits": 0.25996091961860657, + "step": 2106 + }, + { + "epoch": 0.065875, + "grad_norm": 4.3125, + "grad_norm_var": 0.14208577473958334, + "learning_rate": 0.0001, + "loss": 6.7822, + "loss/crossentropy": 2.490805149078369, + "loss/hidden": 1.78515625, + "loss/jsd": 0.0, + "loss/logits": 0.2506261169910431, + "step": 2108 + }, + { + "epoch": 0.0659375, + "grad_norm": 4.75, + "grad_norm_var": 0.04898681640625, + "learning_rate": 0.0001, + "loss": 7.1099, + "loss/crossentropy": 2.639052987098694, + "loss/hidden": 1.859375, + "loss/jsd": 0.0, + "loss/logits": 0.26115116477012634, + "step": 2110 + }, + { + "epoch": 0.066, + "grad_norm": 4.5625, + "grad_norm_var": 0.04544270833333333, + "learning_rate": 0.0001, + "loss": 7.1626, + "loss/crossentropy": 2.600106716156006, + "loss/hidden": 1.94140625, + "loss/jsd": 0.0, + "loss/logits": 0.2621118575334549, + "step": 2112 + }, + { + "epoch": 0.0660625, + "grad_norm": 4.65625, + "grad_norm_var": 0.03209635416666667, + "learning_rate": 0.0001, + "loss": 7.4427, + "loss/crossentropy": 2.763522505760193, + "loss/hidden": 1.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.27768468856811523, + "step": 2114 + }, + { + "epoch": 0.066125, + "grad_norm": 4.53125, + "grad_norm_var": 0.043473307291666666, + "learning_rate": 0.0001, + "loss": 7.2648, + "loss/crossentropy": 2.662122130393982, + "loss/hidden": 1.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.27199098467826843, + "step": 2116 + }, + { + "epoch": 0.0661875, + "grad_norm": 4.375, + "grad_norm_var": 0.04260660807291667, + "learning_rate": 0.0001, + "loss": 6.6901, + "loss/crossentropy": 2.4105567932128906, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.2486593872308731, + "step": 2118 + }, + { + "epoch": 0.06625, + "grad_norm": 4.8125, + "grad_norm_var": 0.07623697916666666, + "learning_rate": 0.0001, + "loss": 7.4795, + "loss/crossentropy": 2.7654476165771484, + "loss/hidden": 1.89453125, + "loss/jsd": 0.0, + "loss/logits": 0.28194795548915863, + "step": 2120 + }, + { + "epoch": 0.0663125, + "grad_norm": 4.625, + "grad_norm_var": 0.06614583333333333, + "learning_rate": 0.0001, + "loss": 7.1431, + "loss/crossentropy": 2.657179594039917, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2634367048740387, + "step": 2122 + }, + { + "epoch": 0.066375, + "grad_norm": 6.03125, + "grad_norm_var": 0.17248942057291666, + "learning_rate": 0.0001, + "loss": 7.4061, + "loss/crossentropy": 2.7268136739730835, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2788669764995575, + "step": 2124 + }, + { + "epoch": 0.0664375, + "grad_norm": 5.03125, + "grad_norm_var": 0.17203369140625, + "learning_rate": 0.0001, + "loss": 7.2446, + "loss/crossentropy": 2.6502318382263184, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.26764001697301865, + "step": 2126 + }, + { + "epoch": 0.0665, + "grad_norm": 4.46875, + "grad_norm_var": 0.18108317057291667, + "learning_rate": 0.0001, + "loss": 7.014, + "loss/crossentropy": 2.633153796195984, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2576177716255188, + "step": 2128 + }, + { + "epoch": 0.0665625, + "grad_norm": 4.21875, + "grad_norm_var": 0.20506184895833332, + "learning_rate": 0.0001, + "loss": 7.1342, + "loss/crossentropy": 2.7031787633895874, + "loss/hidden": 1.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.25443191826343536, + "step": 2130 + }, + { + "epoch": 0.066625, + "grad_norm": 4.75, + "grad_norm_var": 0.19849853515625, + "learning_rate": 0.0001, + "loss": 7.3439, + "loss/crossentropy": 2.7594083547592163, + "loss/hidden": 1.875, + "loss/jsd": 0.0, + "loss/logits": 0.2709462195634842, + "step": 2132 + }, + { + "epoch": 0.0666875, + "grad_norm": 4.75, + "grad_norm_var": 0.19127197265625, + "learning_rate": 0.0001, + "loss": 6.8376, + "loss/crossentropy": 2.4818824529647827, + "loss/hidden": 1.83984375, + "loss/jsd": 0.0, + "loss/logits": 0.25159231573343277, + "step": 2134 + }, + { + "epoch": 0.06675, + "grad_norm": 4.15625, + "grad_norm_var": 0.20149332682291668, + "learning_rate": 0.0001, + "loss": 7.1332, + "loss/crossentropy": 2.6554126739501953, + "loss/hidden": 1.85546875, + "loss/jsd": 0.0, + "loss/logits": 0.2622327506542206, + "step": 2136 + }, + { + "epoch": 0.0668125, + "grad_norm": 4.6875, + "grad_norm_var": 0.20015869140625, + "learning_rate": 0.0001, + "loss": 7.2519, + "loss/crossentropy": 2.650509238243103, + "loss/hidden": 1.90625, + "loss/jsd": 0.0, + "loss/logits": 0.26951250433921814, + "step": 2138 + }, + { + "epoch": 0.066875, + "grad_norm": 5.28125, + "grad_norm_var": 0.09947509765625, + "learning_rate": 0.0001, + "loss": 7.3845, + "loss/crossentropy": 2.764272689819336, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2729562968015671, + "step": 2140 + }, + { + "epoch": 0.0669375, + "grad_norm": 4.46875, + "grad_norm_var": 0.43800455729166665, + "learning_rate": 0.0001, + "loss": 7.2234, + "loss/crossentropy": 2.6682363748550415, + "loss/hidden": 1.87109375, + "loss/jsd": 0.0, + "loss/logits": 0.26840950548648834, + "step": 2142 + }, + { + "epoch": 0.067, + "grad_norm": 4.65625, + "grad_norm_var": 0.4259114583333333, + "learning_rate": 0.0001, + "loss": 7.2398, + "loss/crossentropy": 2.713719964027405, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2635442316532135, + "step": 2144 + }, + { + "epoch": 0.0670625, + "grad_norm": 4.40625, + "grad_norm_var": 0.42414957682291665, + "learning_rate": 0.0001, + "loss": 7.175, + "loss/crossentropy": 2.7028512954711914, + "loss/hidden": 1.87109375, + "loss/jsd": 0.0, + "loss/logits": 0.260101780295372, + "step": 2146 + }, + { + "epoch": 0.067125, + "grad_norm": 7.0, + "grad_norm_var": 1.5235026041666666, + "learning_rate": 0.0001, + "loss": 7.2175, + "loss/crossentropy": 2.663016438484192, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.26442950963974, + "step": 2148 + }, + { + "epoch": 0.0671875, + "grad_norm": 5.03125, + "grad_norm_var": 1.506884765625, + "learning_rate": 0.0001, + "loss": 7.5776, + "loss/crossentropy": 2.747779130935669, + "loss/hidden": 1.9375, + "loss/jsd": 0.0, + "loss/logits": 0.28923508524894714, + "step": 2150 + }, + { + "epoch": 0.06725, + "grad_norm": 4.125, + "grad_norm_var": 1.5227823893229167, + "learning_rate": 0.0001, + "loss": 6.7097, + "loss/crossentropy": 2.3725602626800537, + "loss/hidden": 1.875, + "loss/jsd": 0.0, + "loss/logits": 0.24621078372001648, + "step": 2152 + }, + { + "epoch": 0.0673125, + "grad_norm": 4.75, + "grad_norm_var": 1.5213541666666666, + "learning_rate": 0.0001, + "loss": 6.9537, + "loss/crossentropy": 2.5790809392929077, + "loss/hidden": 1.875, + "loss/jsd": 0.0, + "loss/logits": 0.24995902180671692, + "step": 2154 + }, + { + "epoch": 0.067375, + "grad_norm": 4.46875, + "grad_norm_var": 1.5811808268229166, + "learning_rate": 0.0001, + "loss": 7.2648, + "loss/crossentropy": 2.7113585472106934, + "loss/hidden": 1.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.26862309873104095, + "step": 2156 + }, + { + "epoch": 0.0674375, + "grad_norm": 4.0625, + "grad_norm_var": 1.39107666015625, + "learning_rate": 0.0001, + "loss": 6.6646, + "loss/crossentropy": 2.4880530834198, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.23874470591545105, + "step": 2158 + }, + { + "epoch": 0.0675, + "grad_norm": 4.40625, + "grad_norm_var": 1.3983357747395833, + "learning_rate": 0.0001, + "loss": 7.1278, + "loss/crossentropy": 2.6453821659088135, + "loss/hidden": 1.92578125, + "loss/jsd": 0.0, + "loss/logits": 0.25566530227661133, + "step": 2160 + }, + { + "epoch": 0.0675625, + "grad_norm": 4.875, + "grad_norm_var": 1.3705037434895833, + "learning_rate": 0.0001, + "loss": 7.3912, + "loss/crossentropy": 2.793515920639038, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.2687506824731827, + "step": 2162 + }, + { + "epoch": 0.067625, + "grad_norm": 5.0, + "grad_norm_var": 0.09607747395833334, + "learning_rate": 0.0001, + "loss": 7.5826, + "loss/crossentropy": 2.8772777318954468, + "loss/hidden": 1.96484375, + "loss/jsd": 0.0, + "loss/logits": 0.27405229210853577, + "step": 2164 + }, + { + "epoch": 0.0676875, + "grad_norm": 4.03125, + "grad_norm_var": 0.08372395833333333, + "learning_rate": 0.0001, + "loss": 6.822, + "loss/crossentropy": 2.485366106033325, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.2457720786333084, + "step": 2166 + }, + { + "epoch": 0.06775, + "grad_norm": 4.96875, + "grad_norm_var": 0.08866780598958333, + "learning_rate": 0.0001, + "loss": 6.6891, + "loss/crossentropy": 2.4618345499038696, + "loss/hidden": 1.875, + "loss/jsd": 0.0, + "loss/logits": 0.2352314591407776, + "step": 2168 + }, + { + "epoch": 0.0678125, + "grad_norm": 4.28125, + "grad_norm_var": 0.09425455729166667, + "learning_rate": 0.0001, + "loss": 7.4168, + "loss/crossentropy": 2.745675206184387, + "loss/hidden": 1.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.2768822908401489, + "step": 2170 + }, + { + "epoch": 0.067875, + "grad_norm": 4.28125, + "grad_norm_var": 0.11060791015625, + "learning_rate": 0.0001, + "loss": 7.2711, + "loss/crossentropy": 2.8049396276474, + "loss/hidden": 1.85546875, + "loss/jsd": 0.0, + "loss/logits": 0.26107245683670044, + "step": 2172 + }, + { + "epoch": 0.0679375, + "grad_norm": 4.0, + "grad_norm_var": 0.11796468098958333, + "learning_rate": 0.0001, + "loss": 7.1348, + "loss/crossentropy": 2.637243390083313, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2645973861217499, + "step": 2174 + }, + { + "epoch": 0.068, + "grad_norm": 5.3125, + "grad_norm_var": 0.1611328125, + "learning_rate": 0.0001, + "loss": 7.3615, + "loss/crossentropy": 2.7797021865844727, + "loss/hidden": 1.875, + "loss/jsd": 0.0, + "loss/logits": 0.27068354189395905, + "step": 2176 + }, + { + "epoch": 0.0680625, + "grad_norm": 4.4375, + "grad_norm_var": 0.15793863932291666, + "learning_rate": 0.0001, + "loss": 7.033, + "loss/crossentropy": 2.639753580093384, + "loss/hidden": 1.83984375, + "loss/jsd": 0.0, + "loss/logits": 0.2553429752588272, + "step": 2178 + }, + { + "epoch": 0.068125, + "grad_norm": 4.46875, + "grad_norm_var": 0.13358968098958332, + "learning_rate": 0.0001, + "loss": 7.1443, + "loss/crossentropy": 2.788546323776245, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.25041744858026505, + "step": 2180 + }, + { + "epoch": 0.0681875, + "grad_norm": 4.375, + "grad_norm_var": 0.13993733723958332, + "learning_rate": 0.0001, + "loss": 7.4631, + "loss/crossentropy": 2.8808815479278564, + "loss/hidden": 1.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.2695489227771759, + "step": 2182 + }, + { + "epoch": 0.06825, + "grad_norm": 4.375, + "grad_norm_var": 0.13411051432291668, + "learning_rate": 0.0001, + "loss": 7.1354, + "loss/crossentropy": 2.6089104413986206, + "loss/hidden": 1.89453125, + "loss/jsd": 0.0, + "loss/logits": 0.2631940394639969, + "step": 2184 + }, + { + "epoch": 0.0683125, + "grad_norm": 4.5, + "grad_norm_var": 0.12688802083333334, + "learning_rate": 0.0001, + "loss": 7.4842, + "loss/crossentropy": 2.8750851154327393, + "loss/hidden": 1.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.2710645943880081, + "step": 2186 + }, + { + "epoch": 0.068375, + "grad_norm": 4.375, + "grad_norm_var": 0.14178059895833334, + "learning_rate": 0.0001, + "loss": 7.0685, + "loss/crossentropy": 2.5792561769485474, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2637658417224884, + "step": 2188 + }, + { + "epoch": 0.0684375, + "grad_norm": 4.0625, + "grad_norm_var": 0.13993733723958332, + "learning_rate": 0.0001, + "loss": 6.7785, + "loss/crossentropy": 2.4673629999160767, + "loss/hidden": 1.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2443905919790268, + "step": 2190 + }, + { + "epoch": 0.0685, + "grad_norm": 3.96875, + "grad_norm_var": 0.13014322916666668, + "learning_rate": 0.0001, + "loss": 6.7141, + "loss/crossentropy": 2.534016251564026, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.24066344648599625, + "step": 2192 + }, + { + "epoch": 0.0685625, + "grad_norm": 4.78125, + "grad_norm_var": 0.15142822265625, + "learning_rate": 0.0001, + "loss": 7.1777, + "loss/crossentropy": 2.86741304397583, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.25017230212688446, + "step": 2194 + }, + { + "epoch": 0.068625, + "grad_norm": 4.0625, + "grad_norm_var": 0.16597900390625, + "learning_rate": 0.0001, + "loss": 6.9375, + "loss/crossentropy": 2.5084309577941895, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.26087313890457153, + "step": 2196 + }, + { + "epoch": 0.0686875, + "grad_norm": 4.3125, + "grad_norm_var": 0.15260416666666668, + "learning_rate": 0.0001, + "loss": 7.1154, + "loss/crossentropy": 2.6377168893814087, + "loss/hidden": 1.85546875, + "loss/jsd": 0.0, + "loss/logits": 0.26222096383571625, + "step": 2198 + }, + { + "epoch": 0.06875, + "grad_norm": 4.34375, + "grad_norm_var": 0.13980712890625, + "learning_rate": 0.0001, + "loss": 6.8151, + "loss/crossentropy": 2.562336802482605, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.24793028831481934, + "step": 2200 + }, + { + "epoch": 0.0688125, + "grad_norm": 5.3125, + "grad_norm_var": 0.21320699055989584, + "learning_rate": 0.0001, + "loss": 7.058, + "loss/crossentropy": 2.5885305404663086, + "loss/hidden": 1.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.25827546417713165, + "step": 2202 + }, + { + "epoch": 0.068875, + "grad_norm": 4.34375, + "grad_norm_var": 0.1595123291015625, + "learning_rate": 0.0001, + "loss": 6.9934, + "loss/crossentropy": 2.6797332763671875, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.25050613284111023, + "step": 2204 + }, + { + "epoch": 0.0689375, + "grad_norm": 4.3125, + "grad_norm_var": 0.15498758951822916, + "learning_rate": 0.0001, + "loss": 6.7623, + "loss/crossentropy": 2.4612139463424683, + "loss/hidden": 1.83984375, + "loss/jsd": 0.0, + "loss/logits": 0.24612630903720856, + "step": 2206 + }, + { + "epoch": 0.069, + "grad_norm": 4.4375, + "grad_norm_var": 0.17112528483072917, + "learning_rate": 0.0001, + "loss": 7.2423, + "loss/crossentropy": 2.674578070640564, + "loss/hidden": 1.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.2665373533964157, + "step": 2208 + }, + { + "epoch": 0.0690625, + "grad_norm": 4.40625, + "grad_norm_var": 0.14546610514322916, + "learning_rate": 0.0001, + "loss": 7.1637, + "loss/crossentropy": 2.640005946159363, + "loss/hidden": 1.875, + "loss/jsd": 0.0, + "loss/logits": 0.26486851274967194, + "step": 2210 + }, + { + "epoch": 0.069125, + "grad_norm": 4.90625, + "grad_norm_var": 0.14099019368489582, + "learning_rate": 0.0001, + "loss": 7.202, + "loss/crossentropy": 2.744121551513672, + "loss/hidden": 1.85546875, + "loss/jsd": 0.0, + "loss/logits": 0.26024456322193146, + "step": 2212 + }, + { + "epoch": 0.0691875, + "grad_norm": 4.375, + "grad_norm_var": 0.13904520670572917, + "learning_rate": 0.0001, + "loss": 7.4197, + "loss/crossentropy": 2.8771705627441406, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.26518765091896057, + "step": 2214 + }, + { + "epoch": 0.06925, + "grad_norm": 3.84375, + "grad_norm_var": 0.1639801025390625, + "learning_rate": 0.0001, + "loss": 6.836, + "loss/crossentropy": 2.6073604822158813, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.24161886423826218, + "step": 2216 + }, + { + "epoch": 0.0693125, + "grad_norm": 4.34375, + "grad_norm_var": 0.11125895182291666, + "learning_rate": 0.0001, + "loss": 6.8475, + "loss/crossentropy": 2.4261631965637207, + "loss/hidden": 1.86328125, + "loss/jsd": 0.0, + "loss/logits": 0.25580471009016037, + "step": 2218 + }, + { + "epoch": 0.069375, + "grad_norm": 4.375, + "grad_norm_var": 0.1138671875, + "learning_rate": 0.0001, + "loss": 7.3131, + "loss/crossentropy": 2.8144900798797607, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.2650986611843109, + "step": 2220 + }, + { + "epoch": 0.0694375, + "grad_norm": 4.71875, + "grad_norm_var": 0.1197265625, + "learning_rate": 0.0001, + "loss": 6.8706, + "loss/crossentropy": 2.5216604471206665, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.25286050140857697, + "step": 2222 + }, + { + "epoch": 0.0695, + "grad_norm": 4.28125, + "grad_norm_var": 0.10403238932291667, + "learning_rate": 0.0001, + "loss": 6.7715, + "loss/crossentropy": 2.4990915060043335, + "loss/hidden": 1.83203125, + "loss/jsd": 0.0, + "loss/logits": 0.24404004216194153, + "step": 2224 + }, + { + "epoch": 0.0695625, + "grad_norm": 4.5625, + "grad_norm_var": 0.10402018229166667, + "learning_rate": 0.0001, + "loss": 7.2957, + "loss/crossentropy": 2.741424560546875, + "loss/hidden": 1.86328125, + "loss/jsd": 0.0, + "loss/logits": 0.26909762620925903, + "step": 2226 + }, + { + "epoch": 0.069625, + "grad_norm": 4.46875, + "grad_norm_var": 0.097509765625, + "learning_rate": 0.0001, + "loss": 6.9829, + "loss/crossentropy": 2.6196401119232178, + "loss/hidden": 1.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2527298480272293, + "step": 2228 + }, + { + "epoch": 0.0696875, + "grad_norm": 4.25, + "grad_norm_var": 0.10338541666666666, + "learning_rate": 0.0001, + "loss": 6.6484, + "loss/crossentropy": 2.543282628059387, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.23239174485206604, + "step": 2230 + }, + { + "epoch": 0.06975, + "grad_norm": 4.28125, + "grad_norm_var": 0.09036458333333333, + "learning_rate": 0.0001, + "loss": 7.2512, + "loss/crossentropy": 2.6861058473587036, + "loss/hidden": 1.86328125, + "loss/jsd": 0.0, + "loss/logits": 0.27017952501773834, + "step": 2232 + }, + { + "epoch": 0.0698125, + "grad_norm": 4.78125, + "grad_norm_var": 0.07317708333333334, + "learning_rate": 0.0001, + "loss": 7.216, + "loss/crossentropy": 2.6648640632629395, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.2672192007303238, + "step": 2234 + }, + { + "epoch": 0.069875, + "grad_norm": 4.46875, + "grad_norm_var": 0.06482747395833334, + "learning_rate": 0.0001, + "loss": 7.0849, + "loss/crossentropy": 2.549190878868103, + "loss/hidden": 1.875, + "loss/jsd": 0.0, + "loss/logits": 0.2660679966211319, + "step": 2236 + }, + { + "epoch": 0.0699375, + "grad_norm": 4.0, + "grad_norm_var": 0.06874593098958333, + "learning_rate": 0.0001, + "loss": 7.0557, + "loss/crossentropy": 2.736733317375183, + "loss/hidden": 1.81640625, + "loss/jsd": 0.0, + "loss/logits": 0.2502536326646805, + "step": 2238 + }, + { + "epoch": 0.07, + "grad_norm": 4.53125, + "grad_norm_var": 0.061962890625, + "learning_rate": 0.0001, + "loss": 7.019, + "loss/crossentropy": 2.588713526725769, + "loss/hidden": 1.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2570897936820984, + "step": 2240 + }, + { + "epoch": 0.0700625, + "grad_norm": 4.6875, + "grad_norm_var": 0.07428385416666666, + "learning_rate": 0.0001, + "loss": 6.9907, + "loss/crossentropy": 2.584397792816162, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.24961788207292557, + "step": 2242 + }, + { + "epoch": 0.070125, + "grad_norm": 4.6875, + "grad_norm_var": 0.07375895182291667, + "learning_rate": 0.0001, + "loss": 7.0313, + "loss/crossentropy": 2.6812790632247925, + "loss/hidden": 1.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.24672441184520721, + "step": 2244 + }, + { + "epoch": 0.0701875, + "grad_norm": 4.28125, + "grad_norm_var": 0.07538655598958334, + "learning_rate": 0.0001, + "loss": 6.9258, + "loss/crossentropy": 2.5161720514297485, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.256193146109581, + "step": 2246 + }, + { + "epoch": 0.07025, + "grad_norm": 4.65625, + "grad_norm_var": 0.06269124348958334, + "learning_rate": 0.0001, + "loss": 7.3589, + "loss/crossentropy": 2.8025104999542236, + "loss/hidden": 1.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2689158171415329, + "step": 2248 + }, + { + "epoch": 0.0703125, + "grad_norm": 4.3125, + "grad_norm_var": 0.05755208333333333, + "learning_rate": 0.0001, + "loss": 7.0828, + "loss/crossentropy": 2.700048565864563, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2578069567680359, + "step": 2250 + }, + { + "epoch": 0.070375, + "grad_norm": 4.40625, + "grad_norm_var": 0.605712890625, + "learning_rate": 0.0001, + "loss": 7.0709, + "loss/crossentropy": 2.5707215070724487, + "loss/hidden": 1.83984375, + "loss/jsd": 0.0, + "loss/logits": 0.26603344082832336, + "step": 2252 + }, + { + "epoch": 0.0704375, + "grad_norm": 4.21875, + "grad_norm_var": 0.5828125, + "learning_rate": 0.0001, + "loss": 7.0732, + "loss/crossentropy": 2.6843878030776978, + "loss/hidden": 1.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.25528310239315033, + "step": 2254 + }, + { + "epoch": 0.0705, + "grad_norm": 4.25, + "grad_norm_var": 0.5912760416666667, + "learning_rate": 0.0001, + "loss": 7.1331, + "loss/crossentropy": 2.6784332990646362, + "loss/hidden": 1.87109375, + "loss/jsd": 0.0, + "loss/logits": 0.25835826992988586, + "step": 2256 + }, + { + "epoch": 0.0705625, + "grad_norm": 3.9375, + "grad_norm_var": 0.6138671875, + "learning_rate": 0.0001, + "loss": 6.7684, + "loss/crossentropy": 2.484556198120117, + "loss/hidden": 1.859375, + "loss/jsd": 0.0, + "loss/logits": 0.24244347214698792, + "step": 2258 + }, + { + "epoch": 0.070625, + "grad_norm": 4.59375, + "grad_norm_var": 0.6217732747395833, + "learning_rate": 0.0001, + "loss": 7.0758, + "loss/crossentropy": 2.5867775678634644, + "loss/hidden": 1.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.25866710394620895, + "step": 2260 + }, + { + "epoch": 0.0706875, + "grad_norm": 3.859375, + "grad_norm_var": 0.6412750244140625, + "learning_rate": 0.0001, + "loss": 6.9539, + "loss/crossentropy": 2.667303681373596, + "loss/hidden": 1.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2450612187385559, + "step": 2262 + }, + { + "epoch": 0.07075, + "grad_norm": 4.53125, + "grad_norm_var": 0.6617746988932292, + "learning_rate": 0.0001, + "loss": 7.1908, + "loss/crossentropy": 2.8160548210144043, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.256610170006752, + "step": 2264 + }, + { + "epoch": 0.0708125, + "grad_norm": 4.53125, + "grad_norm_var": 0.6638661702473958, + "learning_rate": 0.0001, + "loss": 7.3165, + "loss/crossentropy": 2.7610117197036743, + "loss/hidden": 1.921875, + "loss/jsd": 0.0, + "loss/logits": 0.2633662298321724, + "step": 2266 + }, + { + "epoch": 0.070875, + "grad_norm": 4.4375, + "grad_norm_var": 0.10606180826822917, + "learning_rate": 0.0001, + "loss": 6.9168, + "loss/crossentropy": 2.6086331605911255, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.24604862928390503, + "step": 2268 + }, + { + "epoch": 0.0709375, + "grad_norm": 4.4375, + "grad_norm_var": 0.11240946451822917, + "learning_rate": 0.0001, + "loss": 6.8032, + "loss/crossentropy": 2.4364657402038574, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.25191039592027664, + "step": 2270 + }, + { + "epoch": 0.071, + "grad_norm": 4.1875, + "grad_norm_var": 0.11458231608072916, + "learning_rate": 0.0001, + "loss": 7.0964, + "loss/crossentropy": 2.70850145816803, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.25089629739522934, + "step": 2272 + }, + { + "epoch": 0.0710625, + "grad_norm": 5.15625, + "grad_norm_var": 0.13074442545572917, + "learning_rate": 0.0001, + "loss": 6.8514, + "loss/crossentropy": 2.5284905433654785, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.24713944643735886, + "step": 2274 + }, + { + "epoch": 0.071125, + "grad_norm": 5.15625, + "grad_norm_var": 0.15869038899739582, + "learning_rate": 0.0001, + "loss": 6.9768, + "loss/crossentropy": 2.5898996591567993, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.2539285346865654, + "step": 2276 + }, + { + "epoch": 0.0711875, + "grad_norm": 4.25, + "grad_norm_var": 0.13668212890625, + "learning_rate": 0.0001, + "loss": 6.8138, + "loss/crossentropy": 2.4170562028884888, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2506166696548462, + "step": 2278 + }, + { + "epoch": 0.07125, + "grad_norm": 4.21875, + "grad_norm_var": 0.12237955729166666, + "learning_rate": 0.0001, + "loss": 7.1489, + "loss/crossentropy": 2.708563208580017, + "loss/hidden": 1.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.25575482845306396, + "step": 2280 + }, + { + "epoch": 0.0713125, + "grad_norm": 5.3125, + "grad_norm_var": 0.15930582682291666, + "learning_rate": 0.0001, + "loss": 6.9591, + "loss/crossentropy": 2.462288975715637, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2645288556814194, + "step": 2282 + }, + { + "epoch": 0.071375, + "grad_norm": 5.4375, + "grad_norm_var": 0.21477864583333334, + "learning_rate": 0.0001, + "loss": 7.2023, + "loss/crossentropy": 2.656105160713196, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.26360785216093063, + "step": 2284 + }, + { + "epoch": 0.0714375, + "grad_norm": 4.65625, + "grad_norm_var": 0.20078125, + "learning_rate": 0.0001, + "loss": 7.2032, + "loss/crossentropy": 2.6529760360717773, + "loss/hidden": 1.85546875, + "loss/jsd": 0.0, + "loss/logits": 0.26947909593582153, + "step": 2286 + }, + { + "epoch": 0.0715, + "grad_norm": 4.25, + "grad_norm_var": 0.18899739583333333, + "learning_rate": 0.0001, + "loss": 7.1448, + "loss/crossentropy": 2.6024595499038696, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2722066640853882, + "step": 2288 + }, + { + "epoch": 0.0715625, + "grad_norm": 4.15625, + "grad_norm_var": 0.18162434895833332, + "learning_rate": 0.0001, + "loss": 7.2628, + "loss/crossentropy": 2.8925254344940186, + "loss/hidden": 1.828125, + "loss/jsd": 0.0, + "loss/logits": 0.25421494245529175, + "step": 2290 + }, + { + "epoch": 0.071625, + "grad_norm": 4.15625, + "grad_norm_var": 0.18349202473958334, + "learning_rate": 0.0001, + "loss": 7.2124, + "loss/crossentropy": 2.778253436088562, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.25826049596071243, + "step": 2292 + }, + { + "epoch": 0.0716875, + "grad_norm": 4.6875, + "grad_norm_var": 0.17978108723958333, + "learning_rate": 0.0001, + "loss": 7.4538, + "loss/crossentropy": 2.843157172203064, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.27200615406036377, + "step": 2294 + }, + { + "epoch": 0.07175, + "grad_norm": 4.46875, + "grad_norm_var": 0.16291910807291668, + "learning_rate": 0.0001, + "loss": 7.2753, + "loss/crossentropy": 2.7370086908340454, + "loss/hidden": 1.85546875, + "loss/jsd": 0.0, + "loss/logits": 0.26828624308109283, + "step": 2296 + }, + { + "epoch": 0.0718125, + "grad_norm": 4.0, + "grad_norm_var": 0.146728515625, + "learning_rate": 0.0001, + "loss": 6.9942, + "loss/crossentropy": 2.6206599473953247, + "loss/hidden": 1.83984375, + "loss/jsd": 0.0, + "loss/logits": 0.2533659115433693, + "step": 2298 + }, + { + "epoch": 0.071875, + "grad_norm": 5.0625, + "grad_norm_var": 0.15881754557291666, + "learning_rate": 0.0001, + "loss": 7.5766, + "loss/crossentropy": 2.930529475212097, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.27671411633491516, + "step": 2300 + }, + { + "epoch": 0.0719375, + "grad_norm": 4.53125, + "grad_norm_var": 0.14914957682291666, + "learning_rate": 0.0001, + "loss": 7.2869, + "loss/crossentropy": 2.8419981002807617, + "loss/hidden": 1.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2577664852142334, + "step": 2302 + }, + { + "epoch": 0.072, + "grad_norm": 4.8125, + "grad_norm_var": 0.14768473307291666, + "learning_rate": 0.0001, + "loss": 7.244, + "loss/crossentropy": 2.6828211545944214, + "loss/hidden": 1.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.26588352024555206, + "step": 2304 + }, + { + "epoch": 0.0720625, + "grad_norm": 5.3125, + "grad_norm_var": 0.16634114583333334, + "learning_rate": 0.0001, + "loss": 7.2035, + "loss/crossentropy": 2.6578863859176636, + "loss/hidden": 1.90625, + "loss/jsd": 0.0, + "loss/logits": 0.26393402367830276, + "step": 2306 + }, + { + "epoch": 0.072125, + "grad_norm": 4.1875, + "grad_norm_var": 0.138671875, + "learning_rate": 0.0001, + "loss": 6.9004, + "loss/crossentropy": 2.594657063484192, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.24971309304237366, + "step": 2308 + }, + { + "epoch": 0.0721875, + "grad_norm": 4.28125, + "grad_norm_var": 0.15597330729166667, + "learning_rate": 0.0001, + "loss": 6.9301, + "loss/crossentropy": 2.602201819419861, + "loss/hidden": 1.83203125, + "loss/jsd": 0.0, + "loss/logits": 0.2495882734656334, + "step": 2310 + }, + { + "epoch": 0.07225, + "grad_norm": 4.40625, + "grad_norm_var": 0.15909830729166666, + "learning_rate": 0.0001, + "loss": 7.218, + "loss/crossentropy": 2.758484959602356, + "loss/hidden": 1.828125, + "loss/jsd": 0.0, + "loss/logits": 0.263138011097908, + "step": 2312 + }, + { + "epoch": 0.0723125, + "grad_norm": 4.21875, + "grad_norm_var": 0.14862874348958333, + "learning_rate": 0.0001, + "loss": 6.7638, + "loss/crossentropy": 2.4779679775238037, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.24811531603336334, + "step": 2314 + }, + { + "epoch": 0.072375, + "grad_norm": 4.125, + "grad_norm_var": 0.10048421223958333, + "learning_rate": 0.0001, + "loss": 7.0218, + "loss/crossentropy": 2.633275866508484, + "loss/hidden": 1.83203125, + "loss/jsd": 0.0, + "loss/logits": 0.25565214455127716, + "step": 2316 + }, + { + "epoch": 0.0724375, + "grad_norm": 4.84375, + "grad_norm_var": 0.11900634765625, + "learning_rate": 0.0001, + "loss": 7.4657, + "loss/crossentropy": 2.900314688682556, + "loss/hidden": 1.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.26982201635837555, + "step": 2318 + }, + { + "epoch": 0.0725, + "grad_norm": 4.34375, + "grad_norm_var": 0.12750244140625, + "learning_rate": 0.0001, + "loss": 7.4663, + "loss/crossentropy": 2.815197467803955, + "loss/hidden": 1.9140625, + "loss/jsd": 0.0, + "loss/logits": 0.2737061530351639, + "step": 2320 + }, + { + "epoch": 0.0725625, + "grad_norm": 4.625, + "grad_norm_var": 0.08837483723958334, + "learning_rate": 0.0001, + "loss": 7.0731, + "loss/crossentropy": 2.589448571205139, + "loss/hidden": 1.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.25969552993774414, + "step": 2322 + }, + { + "epoch": 0.072625, + "grad_norm": 4.125, + "grad_norm_var": 0.09178059895833333, + "learning_rate": 0.0001, + "loss": 6.8812, + "loss/crossentropy": 2.5958632230758667, + "loss/hidden": 1.78515625, + "loss/jsd": 0.0, + "loss/logits": 0.25002047419548035, + "step": 2324 + }, + { + "epoch": 0.0726875, + "grad_norm": 4.28125, + "grad_norm_var": 0.08857014973958334, + "learning_rate": 0.0001, + "loss": 7.0842, + "loss/crossentropy": 2.6766932010650635, + "loss/hidden": 1.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2548135668039322, + "step": 2326 + }, + { + "epoch": 0.07275, + "grad_norm": 4.21875, + "grad_norm_var": 0.10354410807291667, + "learning_rate": 0.0001, + "loss": 6.875, + "loss/crossentropy": 2.6552330255508423, + "loss/hidden": 1.78515625, + "loss/jsd": 0.0, + "loss/logits": 0.2434604912996292, + "step": 2328 + }, + { + "epoch": 0.0728125, + "grad_norm": 4.625, + "grad_norm_var": 0.090869140625, + "learning_rate": 0.0001, + "loss": 6.878, + "loss/crossentropy": 2.5861902236938477, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.24948906898498535, + "step": 2330 + }, + { + "epoch": 0.072875, + "grad_norm": 4.8125, + "grad_norm_var": 0.10104166666666667, + "learning_rate": 0.0001, + "loss": 6.9627, + "loss/crossentropy": 2.5423258543014526, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.25726811587810516, + "step": 2332 + }, + { + "epoch": 0.0729375, + "grad_norm": 4.40625, + "grad_norm_var": 0.07372639973958334, + "learning_rate": 0.0001, + "loss": 6.9905, + "loss/crossentropy": 2.631394863128662, + "loss/hidden": 1.83984375, + "loss/jsd": 0.0, + "loss/logits": 0.2519259601831436, + "step": 2334 + }, + { + "epoch": 0.073, + "grad_norm": 4.46875, + "grad_norm_var": 0.05813802083333333, + "learning_rate": 0.0001, + "loss": 7.1794, + "loss/crossentropy": 2.736093521118164, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.25956377387046814, + "step": 2336 + }, + { + "epoch": 0.0730625, + "grad_norm": 4.1875, + "grad_norm_var": 0.059305826822916664, + "learning_rate": 0.0001, + "loss": 6.9842, + "loss/crossentropy": 2.6525719165802, + "loss/hidden": 1.859375, + "loss/jsd": 0.0, + "loss/logits": 0.24722251296043396, + "step": 2338 + }, + { + "epoch": 0.073125, + "grad_norm": 4.34375, + "grad_norm_var": 0.056050618489583336, + "learning_rate": 0.0001, + "loss": 6.9389, + "loss/crossentropy": 2.6012275218963623, + "loss/hidden": 1.83203125, + "loss/jsd": 0.0, + "loss/logits": 0.2505672574043274, + "step": 2340 + }, + { + "epoch": 0.0731875, + "grad_norm": 4.1875, + "grad_norm_var": 0.05927327473958333, + "learning_rate": 0.0001, + "loss": 6.6983, + "loss/crossentropy": 2.4778064489364624, + "loss/hidden": 1.828125, + "loss/jsd": 0.0, + "loss/logits": 0.23923813551664352, + "step": 2342 + }, + { + "epoch": 0.07325, + "grad_norm": 5.25, + "grad_norm_var": 0.9503255208333333, + "learning_rate": 0.0001, + "loss": 7.0442, + "loss/crossentropy": 2.479038953781128, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.26549845933914185, + "step": 2344 + }, + { + "epoch": 0.0733125, + "grad_norm": 4.71875, + "grad_norm_var": 0.9283203125, + "learning_rate": 0.0001, + "loss": 7.0029, + "loss/crossentropy": 2.595993757247925, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.25943995267152786, + "step": 2346 + }, + { + "epoch": 0.073375, + "grad_norm": 4.84375, + "grad_norm_var": 0.9596354166666666, + "learning_rate": 0.0001, + "loss": 7.6093, + "loss/crossentropy": 2.858232021331787, + "loss/hidden": 1.95703125, + "loss/jsd": 0.0, + "loss/logits": 0.27940085530281067, + "step": 2348 + }, + { + "epoch": 0.0734375, + "grad_norm": 4.46875, + "grad_norm_var": 0.95078125, + "learning_rate": 0.0001, + "loss": 7.2626, + "loss/crossentropy": 2.632628321647644, + "loss/hidden": 1.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2747160494327545, + "step": 2350 + }, + { + "epoch": 0.0735, + "grad_norm": 4.1875, + "grad_norm_var": 0.97877197265625, + "learning_rate": 0.0001, + "loss": 6.8277, + "loss/crossentropy": 2.6192827224731445, + "loss/hidden": 1.78515625, + "loss/jsd": 0.0, + "loss/logits": 0.24232970923185349, + "step": 2352 + }, + { + "epoch": 0.0735625, + "grad_norm": 4.59375, + "grad_norm_var": 0.9653645833333333, + "learning_rate": 0.0001, + "loss": 6.9852, + "loss/crossentropy": 2.6112306118011475, + "loss/hidden": 1.83203125, + "loss/jsd": 0.0, + "loss/logits": 0.2541925981640816, + "step": 2354 + }, + { + "epoch": 0.073625, + "grad_norm": 3.890625, + "grad_norm_var": 1.0204905192057292, + "learning_rate": 0.0001, + "loss": 6.8813, + "loss/crossentropy": 2.5602935552597046, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2516297549009323, + "step": 2356 + }, + { + "epoch": 0.0736875, + "grad_norm": 3.796875, + "grad_norm_var": 1.06181640625, + "learning_rate": 0.0001, + "loss": 7.0254, + "loss/crossentropy": 2.710487961769104, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.250630758702755, + "step": 2358 + }, + { + "epoch": 0.07375, + "grad_norm": 4.84375, + "grad_norm_var": 0.17678629557291667, + "learning_rate": 0.0001, + "loss": 6.9378, + "loss/crossentropy": 2.587727904319763, + "loss/hidden": 1.85546875, + "loss/jsd": 0.0, + "loss/logits": 0.24946476519107819, + "step": 2360 + }, + { + "epoch": 0.0738125, + "grad_norm": 4.25, + "grad_norm_var": 0.21761067708333334, + "learning_rate": 0.0001, + "loss": 7.5178, + "loss/crossentropy": 2.8490744829177856, + "loss/hidden": 1.95703125, + "loss/jsd": 0.0, + "loss/logits": 0.2711695656180382, + "step": 2362 + }, + { + "epoch": 0.073875, + "grad_norm": 4.40625, + "grad_norm_var": 0.17144775390625, + "learning_rate": 0.0001, + "loss": 6.7725, + "loss/crossentropy": 2.4507436752319336, + "loss/hidden": 1.89453125, + "loss/jsd": 0.0, + "loss/logits": 0.2427205666899681, + "step": 2364 + }, + { + "epoch": 0.0739375, + "grad_norm": 5.21875, + "grad_norm_var": 0.19394124348958333, + "learning_rate": 0.0001, + "loss": 6.8787, + "loss/crossentropy": 2.422315001487732, + "loss/hidden": 1.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.2589210420846939, + "step": 2366 + }, + { + "epoch": 0.074, + "grad_norm": 4.90625, + "grad_norm_var": 0.20370686848958333, + "learning_rate": 0.0001, + "loss": 6.9107, + "loss/crossentropy": 2.5740681886672974, + "loss/hidden": 1.81640625, + "loss/jsd": 0.0, + "loss/logits": 0.25202006101608276, + "step": 2368 + }, + { + "epoch": 0.0740625, + "grad_norm": 4.65625, + "grad_norm_var": 0.24055582682291668, + "learning_rate": 0.0001, + "loss": 6.6577, + "loss/crossentropy": 2.347817301750183, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2505149394273758, + "step": 2370 + }, + { + "epoch": 0.074125, + "grad_norm": 4.53125, + "grad_norm_var": 0.21880594889322916, + "learning_rate": 0.0001, + "loss": 7.4111, + "loss/crossentropy": 2.738089680671692, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.27940770983695984, + "step": 2372 + }, + { + "epoch": 0.0741875, + "grad_norm": 4.625, + "grad_norm_var": 0.18175455729166667, + "learning_rate": 0.0001, + "loss": 6.877, + "loss/crossentropy": 2.653886318206787, + "loss/hidden": 1.81640625, + "loss/jsd": 0.0, + "loss/logits": 0.24067474901676178, + "step": 2374 + }, + { + "epoch": 0.07425, + "grad_norm": 4.21875, + "grad_norm_var": 0.18502604166666667, + "learning_rate": 0.0001, + "loss": 7.034, + "loss/crossentropy": 2.6042428016662598, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.26328346133232117, + "step": 2376 + }, + { + "epoch": 0.0743125, + "grad_norm": 4.0, + "grad_norm_var": 0.16568603515625, + "learning_rate": 0.0001, + "loss": 6.7866, + "loss/crossentropy": 2.5228244066238403, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2451310157775879, + "step": 2378 + }, + { + "epoch": 0.074375, + "grad_norm": 4.78125, + "grad_norm_var": 0.15245768229166667, + "learning_rate": 0.0001, + "loss": 7.0086, + "loss/crossentropy": 2.58968186378479, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.2571246773004532, + "step": 2380 + }, + { + "epoch": 0.0744375, + "grad_norm": 4.34375, + "grad_norm_var": 0.106884765625, + "learning_rate": 0.0001, + "loss": 7.0466, + "loss/crossentropy": 2.6445672512054443, + "loss/hidden": 1.83984375, + "loss/jsd": 0.0, + "loss/logits": 0.2562218904495239, + "step": 2382 + }, + { + "epoch": 0.0745, + "grad_norm": 4.125, + "grad_norm_var": 0.08958333333333333, + "learning_rate": 0.0001, + "loss": 6.81, + "loss/crossentropy": 2.5752354860305786, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.24613595008850098, + "step": 2384 + }, + { + "epoch": 0.0745625, + "grad_norm": 3.78125, + "grad_norm_var": 0.08033447265625, + "learning_rate": 0.0001, + "loss": 6.9493, + "loss/crossentropy": 2.652430534362793, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.2488238662481308, + "step": 2386 + }, + { + "epoch": 0.074625, + "grad_norm": 4.46875, + "grad_norm_var": 0.058186848958333336, + "learning_rate": 0.0001, + "loss": 7.2682, + "loss/crossentropy": 2.7897156476974487, + "loss/hidden": 1.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2619110345840454, + "step": 2388 + }, + { + "epoch": 0.0746875, + "grad_norm": 4.3125, + "grad_norm_var": 0.053515625, + "learning_rate": 0.0001, + "loss": 7.0406, + "loss/crossentropy": 2.68414843082428, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.25517675280570984, + "step": 2390 + }, + { + "epoch": 0.07475, + "grad_norm": 4.96875, + "grad_norm_var": 0.08912353515625, + "learning_rate": 0.0001, + "loss": 6.6606, + "loss/crossentropy": 2.4219181537628174, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2418384850025177, + "step": 2392 + }, + { + "epoch": 0.0748125, + "grad_norm": 4.0625, + "grad_norm_var": 0.09413655598958333, + "learning_rate": 0.0001, + "loss": 6.7802, + "loss/crossentropy": 2.5747569799423218, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.24475835263729095, + "step": 2394 + }, + { + "epoch": 0.074875, + "grad_norm": 4.53125, + "grad_norm_var": 0.07884114583333333, + "learning_rate": 0.0001, + "loss": 6.8577, + "loss/crossentropy": 2.540056586265564, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.25363685190677643, + "step": 2396 + }, + { + "epoch": 0.0749375, + "grad_norm": 3.8125, + "grad_norm_var": 0.10328369140625, + "learning_rate": 0.0001, + "loss": 6.8691, + "loss/crossentropy": 2.5701223611831665, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.24942508339881897, + "step": 2398 + }, + { + "epoch": 0.075, + "grad_norm": 4.3125, + "grad_norm_var": 0.10064697265625, + "learning_rate": 0.0001, + "loss": 7.2102, + "loss/crossentropy": 2.895754337310791, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.25058774650096893, + "step": 2400 + }, + { + "epoch": 0.0750625, + "grad_norm": 4.5625, + "grad_norm_var": 0.08388264973958333, + "learning_rate": 0.0001, + "loss": 7.0321, + "loss/crossentropy": 2.605707287788391, + "loss/hidden": 1.84375, + "loss/jsd": 0.0, + "loss/logits": 0.25826863944530487, + "step": 2402 + }, + { + "epoch": 0.075125, + "grad_norm": 5.1875, + "grad_norm_var": 0.12615559895833334, + "learning_rate": 0.0001, + "loss": 7.0141, + "loss/crossentropy": 2.5209784507751465, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2602469325065613, + "step": 2404 + }, + { + "epoch": 0.0751875, + "grad_norm": 4.4375, + "grad_norm_var": 0.17769775390625, + "learning_rate": 0.0001, + "loss": 7.0747, + "loss/crossentropy": 2.622338056564331, + "loss/hidden": 1.85546875, + "loss/jsd": 0.0, + "loss/logits": 0.25968825817108154, + "step": 2406 + }, + { + "epoch": 0.07525, + "grad_norm": 4.90625, + "grad_norm_var": 0.17310791015625, + "learning_rate": 0.0001, + "loss": 7.479, + "loss/crossentropy": 2.869504928588867, + "loss/hidden": 1.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.27423471212387085, + "step": 2408 + }, + { + "epoch": 0.0753125, + "grad_norm": 4.0625, + "grad_norm_var": 0.16378580729166667, + "learning_rate": 0.0001, + "loss": 7.0745, + "loss/crossentropy": 2.7071975469589233, + "loss/hidden": 1.86328125, + "loss/jsd": 0.0, + "loss/logits": 0.25040150433778763, + "step": 2410 + }, + { + "epoch": 0.075375, + "grad_norm": 4.46875, + "grad_norm_var": 0.2552083333333333, + "learning_rate": 0.0001, + "loss": 7.2737, + "loss/crossentropy": 2.802687883377075, + "loss/hidden": 1.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.26350709050893784, + "step": 2412 + }, + { + "epoch": 0.0754375, + "grad_norm": 4.0625, + "grad_norm_var": 0.23303629557291666, + "learning_rate": 0.0001, + "loss": 6.7801, + "loss/crossentropy": 2.491695761680603, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.2499319687485695, + "step": 2414 + }, + { + "epoch": 0.0755, + "grad_norm": 4.75, + "grad_norm_var": 0.21808268229166666, + "learning_rate": 0.0001, + "loss": 7.4099, + "loss/crossentropy": 2.7706817388534546, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.2729024589061737, + "step": 2416 + }, + { + "epoch": 0.0755625, + "grad_norm": 4.3125, + "grad_norm_var": 0.2091796875, + "learning_rate": 0.0001, + "loss": 6.9253, + "loss/crossentropy": 2.564241409301758, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.25094784796237946, + "step": 2418 + }, + { + "epoch": 0.075625, + "grad_norm": 3.890625, + "grad_norm_var": 1.3548329671223958, + "learning_rate": 0.0001, + "loss": 7.1162, + "loss/crossentropy": 2.616005301475525, + "loss/hidden": 1.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.26642537117004395, + "step": 2420 + }, + { + "epoch": 0.0756875, + "grad_norm": 4.09375, + "grad_norm_var": 1.411766560872396, + "learning_rate": 0.0001, + "loss": 6.7817, + "loss/crossentropy": 2.5218793153762817, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.245124451816082, + "step": 2422 + }, + { + "epoch": 0.07575, + "grad_norm": 4.125, + "grad_norm_var": 1.4401601155598958, + "learning_rate": 0.0001, + "loss": 6.9256, + "loss/crossentropy": 2.559067964553833, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.25657251477241516, + "step": 2424 + }, + { + "epoch": 0.0758125, + "grad_norm": 4.59375, + "grad_norm_var": 1.4102203369140625, + "learning_rate": 0.0001, + "loss": 7.1871, + "loss/crossentropy": 2.6583296060562134, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.26381099224090576, + "step": 2426 + }, + { + "epoch": 0.075875, + "grad_norm": 4.03125, + "grad_norm_var": 1.3658599853515625, + "learning_rate": 0.0001, + "loss": 6.7713, + "loss/crossentropy": 2.5485886335372925, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.24102021753787994, + "step": 2428 + }, + { + "epoch": 0.0759375, + "grad_norm": 4.46875, + "grad_norm_var": 1.3427073160807292, + "learning_rate": 0.0001, + "loss": 7.1977, + "loss/crossentropy": 2.817766785621643, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.25596490502357483, + "step": 2430 + }, + { + "epoch": 0.076, + "grad_norm": 25.5, + "grad_norm_var": 28.454483032226562, + "learning_rate": 0.0001, + "loss": 7.4982, + "loss/crossentropy": 2.580443024635315, + "loss/hidden": 2.01171875, + "loss/jsd": 0.0, + "loss/logits": 0.2906050682067871, + "step": 2432 + }, + { + "epoch": 0.0760625, + "grad_norm": 4.59375, + "grad_norm_var": 28.318000284830728, + "learning_rate": 0.0001, + "loss": 7.4054, + "loss/crossentropy": 2.8410911560058594, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.2685410678386688, + "step": 2434 + }, + { + "epoch": 0.076125, + "grad_norm": 4.75, + "grad_norm_var": 27.567769368489582, + "learning_rate": 0.0001, + "loss": 7.0348, + "loss/crossentropy": 2.5193880796432495, + "loss/hidden": 1.86328125, + "loss/jsd": 0.0, + "loss/logits": 0.2652158737182617, + "step": 2436 + }, + { + "epoch": 0.0761875, + "grad_norm": 4.375, + "grad_norm_var": 27.481148274739585, + "learning_rate": 0.0001, + "loss": 6.8777, + "loss/crossentropy": 2.5700310468673706, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.24951593577861786, + "step": 2438 + }, + { + "epoch": 0.07625, + "grad_norm": 4.5625, + "grad_norm_var": 27.446207682291668, + "learning_rate": 0.0001, + "loss": 6.4905, + "loss/crossentropy": 2.379134774208069, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.23027870059013367, + "step": 2440 + }, + { + "epoch": 0.0763125, + "grad_norm": 4.125, + "grad_norm_var": 27.518229166666668, + "learning_rate": 0.0001, + "loss": 6.9303, + "loss/crossentropy": 2.5619860887527466, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.24503721296787262, + "step": 2442 + }, + { + "epoch": 0.076375, + "grad_norm": 4.1875, + "grad_norm_var": 27.57097880045573, + "learning_rate": 0.0001, + "loss": 6.5573, + "loss/crossentropy": 2.392979383468628, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.23713234812021255, + "step": 2444 + }, + { + "epoch": 0.0764375, + "grad_norm": 5.125, + "grad_norm_var": 27.42019755045573, + "learning_rate": 0.0001, + "loss": 7.196, + "loss/crossentropy": 2.5500316619873047, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.2727985829114914, + "step": 2446 + }, + { + "epoch": 0.0765, + "grad_norm": 4.40625, + "grad_norm_var": 0.3015696207682292, + "learning_rate": 0.0001, + "loss": 7.0044, + "loss/crossentropy": 2.5839664936065674, + "loss/hidden": 1.84375, + "loss/jsd": 0.0, + "loss/logits": 0.257663831114769, + "step": 2448 + }, + { + "epoch": 0.0765625, + "grad_norm": 4.59375, + "grad_norm_var": 0.4693349202473958, + "learning_rate": 0.0001, + "loss": 6.9517, + "loss/crossentropy": 2.5211989879608154, + "loss/hidden": 1.953125, + "loss/jsd": 0.0, + "loss/logits": 0.24773336946964264, + "step": 2450 + }, + { + "epoch": 0.076625, + "grad_norm": 3.75, + "grad_norm_var": 0.37576395670572915, + "learning_rate": 0.0001, + "loss": 7.0608, + "loss/crossentropy": 2.7452404499053955, + "loss/hidden": 1.82421875, + "loss/jsd": 0.0, + "loss/logits": 0.24913445115089417, + "step": 2452 + }, + { + "epoch": 0.0766875, + "grad_norm": 3.953125, + "grad_norm_var": 0.3881144205729167, + "learning_rate": 0.0001, + "loss": 7.0236, + "loss/crossentropy": 2.7359414100646973, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.2486885040998459, + "step": 2454 + }, + { + "epoch": 0.07675, + "grad_norm": 4.46875, + "grad_norm_var": 0.4288736979166667, + "learning_rate": 0.0001, + "loss": 6.6268, + "loss/crossentropy": 2.423543334007263, + "loss/hidden": 1.78515625, + "loss/jsd": 0.0, + "loss/logits": 0.24180874228477478, + "step": 2456 + }, + { + "epoch": 0.0768125, + "grad_norm": 4.875, + "grad_norm_var": 0.5277506510416666, + "learning_rate": 0.0001, + "loss": 7.222, + "loss/crossentropy": 2.639382839202881, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.2664603292942047, + "step": 2458 + }, + { + "epoch": 0.076875, + "grad_norm": 4.78125, + "grad_norm_var": 0.5103830973307292, + "learning_rate": 0.0001, + "loss": 7.0783, + "loss/crossentropy": 2.7202084064483643, + "loss/hidden": 1.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.24714156985282898, + "step": 2460 + }, + { + "epoch": 0.0769375, + "grad_norm": 3.953125, + "grad_norm_var": 0.5262532552083333, + "learning_rate": 0.0001, + "loss": 6.6897, + "loss/crossentropy": 2.4970299005508423, + "loss/hidden": 1.82421875, + "loss/jsd": 0.0, + "loss/logits": 0.236849345266819, + "step": 2462 + }, + { + "epoch": 0.077, + "grad_norm": 4.4375, + "grad_norm_var": 0.5183919270833334, + "learning_rate": 0.0001, + "loss": 7.4318, + "loss/crossentropy": 2.875585675239563, + "loss/hidden": 1.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2696887403726578, + "step": 2464 + }, + { + "epoch": 0.0770625, + "grad_norm": 4.28125, + "grad_norm_var": 0.30789388020833336, + "learning_rate": 0.0001, + "loss": 7.137, + "loss/crossentropy": 2.7693514823913574, + "loss/hidden": 1.81640625, + "loss/jsd": 0.0, + "loss/logits": 0.2551232725381851, + "step": 2466 + }, + { + "epoch": 0.077125, + "grad_norm": 4.375, + "grad_norm_var": 0.27701416015625, + "learning_rate": 0.0001, + "loss": 6.8556, + "loss/crossentropy": 2.479888916015625, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2555390000343323, + "step": 2468 + }, + { + "epoch": 0.0771875, + "grad_norm": 3.75, + "grad_norm_var": 0.30461324055989586, + "learning_rate": 0.0001, + "loss": 6.6706, + "loss/crossentropy": 2.5898760557174683, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.2334597110748291, + "step": 2470 + }, + { + "epoch": 0.07725, + "grad_norm": 5.0625, + "grad_norm_var": 0.3222401936848958, + "learning_rate": 0.0001, + "loss": 7.4498, + "loss/crossentropy": 2.784128785133362, + "loss/hidden": 1.91796875, + "loss/jsd": 0.0, + "loss/logits": 0.27476924657821655, + "step": 2472 + }, + { + "epoch": 0.0773125, + "grad_norm": 4.5625, + "grad_norm_var": 0.2200592041015625, + "learning_rate": 0.0001, + "loss": 7.3398, + "loss/crossentropy": 2.880878210067749, + "loss/hidden": 1.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.2556593418121338, + "step": 2474 + }, + { + "epoch": 0.077375, + "grad_norm": 4.21875, + "grad_norm_var": 0.17563374837239584, + "learning_rate": 0.0001, + "loss": 7.2807, + "loss/crossentropy": 2.877553939819336, + "loss/hidden": 1.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2574998736381531, + "step": 2476 + }, + { + "epoch": 0.0774375, + "grad_norm": 4.65625, + "grad_norm_var": 0.16825764973958332, + "learning_rate": 0.0001, + "loss": 6.9746, + "loss/crossentropy": 2.594506025314331, + "loss/hidden": 1.82421875, + "loss/jsd": 0.0, + "loss/logits": 0.25558608770370483, + "step": 2478 + }, + { + "epoch": 0.0775, + "grad_norm": 4.78125, + "grad_norm_var": 0.17343343098958333, + "learning_rate": 0.0001, + "loss": 7.2236, + "loss/crossentropy": 2.823242425918579, + "loss/hidden": 1.82421875, + "loss/jsd": 0.0, + "loss/logits": 0.25761541724205017, + "step": 2480 + }, + { + "epoch": 0.0775625, + "grad_norm": 5.09375, + "grad_norm_var": 0.19537353515625, + "learning_rate": 0.0001, + "loss": 7.0229, + "loss/crossentropy": 2.6774028539657593, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.2556390166282654, + "step": 2482 + }, + { + "epoch": 0.077625, + "grad_norm": 4.375, + "grad_norm_var": 0.1978515625, + "learning_rate": 0.0001, + "loss": 6.6963, + "loss/crossentropy": 2.5514711141586304, + "loss/hidden": 1.81640625, + "loss/jsd": 0.0, + "loss/logits": 0.23284562677145004, + "step": 2484 + }, + { + "epoch": 0.0776875, + "grad_norm": 5.9375, + "grad_norm_var": 0.25780843098958334, + "learning_rate": 0.0001, + "loss": 7.4183, + "loss/crossentropy": 2.7475714683532715, + "loss/hidden": 1.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.2768399715423584, + "step": 2486 + }, + { + "epoch": 0.07775, + "grad_norm": 3.953125, + "grad_norm_var": 0.22437235514322917, + "learning_rate": 0.0001, + "loss": 6.837, + "loss/crossentropy": 2.534694790840149, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.25562240928411484, + "step": 2488 + }, + { + "epoch": 0.0778125, + "grad_norm": 3.921875, + "grad_norm_var": 0.2560831705729167, + "learning_rate": 0.0001, + "loss": 6.665, + "loss/crossentropy": 2.4949283599853516, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.23770590126514435, + "step": 2490 + }, + { + "epoch": 0.077875, + "grad_norm": 4.90625, + "grad_norm_var": 0.86773681640625, + "learning_rate": 0.0001, + "loss": 7.0473, + "loss/crossentropy": 2.5547311305999756, + "loss/hidden": 1.90234375, + "loss/jsd": 0.0, + "loss/logits": 0.25901947915554047, + "step": 2492 + }, + { + "epoch": 0.0779375, + "grad_norm": 4.78125, + "grad_norm_var": 0.895458984375, + "learning_rate": 0.0001, + "loss": 6.822, + "loss/crossentropy": 2.5802226066589355, + "loss/hidden": 1.84375, + "loss/jsd": 0.0, + "loss/logits": 0.23980476707220078, + "step": 2494 + }, + { + "epoch": 0.078, + "grad_norm": 3.71875, + "grad_norm_var": 0.9597819010416667, + "learning_rate": 0.0001, + "loss": 6.2974, + "loss/crossentropy": 2.323503851890564, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.21965843439102173, + "step": 2496 + }, + { + "epoch": 0.0780625, + "grad_norm": 4.09375, + "grad_norm_var": 0.96005859375, + "learning_rate": 0.0001, + "loss": 6.6043, + "loss/crossentropy": 2.4191696643829346, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.2384398654103279, + "step": 2498 + }, + { + "epoch": 0.078125, + "grad_norm": 4.21875, + "grad_norm_var": 0.9535441080729167, + "learning_rate": 0.0001, + "loss": 7.2297, + "loss/crossentropy": 2.8721203804016113, + "loss/hidden": 1.83984375, + "loss/jsd": 0.0, + "loss/logits": 0.251773476600647, + "step": 2500 + }, + { + "epoch": 0.0781875, + "grad_norm": 4.1875, + "grad_norm_var": 0.83916015625, + "learning_rate": 0.0001, + "loss": 6.9023, + "loss/crossentropy": 2.6179966926574707, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.247569240629673, + "step": 2502 + }, + { + "epoch": 0.07825, + "grad_norm": 4.6875, + "grad_norm_var": 1.0491770426432292, + "learning_rate": 0.0001, + "loss": 7.201, + "loss/crossentropy": 2.5599087476730347, + "loss/hidden": 1.94140625, + "loss/jsd": 0.0, + "loss/logits": 0.26996414363384247, + "step": 2504 + }, + { + "epoch": 0.0783125, + "grad_norm": 4.75, + "grad_norm_var": 1.005322265625, + "learning_rate": 0.0001, + "loss": 7.369, + "loss/crossentropy": 2.879120349884033, + "loss/hidden": 1.82421875, + "loss/jsd": 0.0, + "loss/logits": 0.26656413078308105, + "step": 2506 + }, + { + "epoch": 0.078375, + "grad_norm": 4.375, + "grad_norm_var": 0.3843587239583333, + "learning_rate": 0.0001, + "loss": 7.1051, + "loss/crossentropy": 2.634618043899536, + "loss/hidden": 1.83203125, + "loss/jsd": 0.0, + "loss/logits": 0.26384034752845764, + "step": 2508 + }, + { + "epoch": 0.0784375, + "grad_norm": 4.1875, + "grad_norm_var": 0.36886393229166664, + "learning_rate": 0.0001, + "loss": 6.8611, + "loss/crossentropy": 2.601563811302185, + "loss/hidden": 1.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.24236443638801575, + "step": 2510 + }, + { + "epoch": 0.0785, + "grad_norm": 4.5, + "grad_norm_var": 0.39342041015625, + "learning_rate": 0.0001, + "loss": 7.0797, + "loss/crossentropy": 2.5156508684158325, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.2685118168592453, + "step": 2512 + }, + { + "epoch": 0.0785625, + "grad_norm": 4.125, + "grad_norm_var": 0.398681640625, + "learning_rate": 0.0001, + "loss": 6.9509, + "loss/crossentropy": 2.646916389465332, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.24993188679218292, + "step": 2514 + }, + { + "epoch": 0.078625, + "grad_norm": 4.3125, + "grad_norm_var": 0.3974609375, + "learning_rate": 0.0001, + "loss": 6.8179, + "loss/crossentropy": 2.5790480375289917, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.24420031160116196, + "step": 2516 + }, + { + "epoch": 0.0786875, + "grad_norm": 4.3125, + "grad_norm_var": 0.38648681640625, + "learning_rate": 0.0001, + "loss": 6.9183, + "loss/crossentropy": 2.5294101238250732, + "loss/hidden": 1.828125, + "loss/jsd": 0.0, + "loss/logits": 0.25607292354106903, + "step": 2518 + }, + { + "epoch": 0.07875, + "grad_norm": 3.765625, + "grad_norm_var": 0.18620503743489583, + "learning_rate": 0.0001, + "loss": 6.6685, + "loss/crossentropy": 2.481043577194214, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.2414042055606842, + "step": 2520 + }, + { + "epoch": 0.0788125, + "grad_norm": 4.46875, + "grad_norm_var": 0.1506500244140625, + "learning_rate": 0.0001, + "loss": 7.3269, + "loss/crossentropy": 2.931631326675415, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2582782357931137, + "step": 2522 + }, + { + "epoch": 0.078875, + "grad_norm": 4.09375, + "grad_norm_var": 0.1521148681640625, + "learning_rate": 0.0001, + "loss": 6.8272, + "loss/crossentropy": 2.5830577611923218, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.24512076377868652, + "step": 2524 + }, + { + "epoch": 0.0789375, + "grad_norm": 4.46875, + "grad_norm_var": 0.14638570149739583, + "learning_rate": 0.0001, + "loss": 6.795, + "loss/crossentropy": 2.4542908668518066, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.24617988616228104, + "step": 2526 + }, + { + "epoch": 0.079, + "grad_norm": 3.96875, + "grad_norm_var": 0.0508941650390625, + "learning_rate": 0.0001, + "loss": 6.8898, + "loss/crossentropy": 2.6073638200759888, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.2509048581123352, + "step": 2528 + }, + { + "epoch": 0.0790625, + "grad_norm": 4.0, + "grad_norm_var": 0.0536041259765625, + "learning_rate": 0.0001, + "loss": 6.7992, + "loss/crossentropy": 2.7190089225769043, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.22716018557548523, + "step": 2530 + }, + { + "epoch": 0.079125, + "grad_norm": 4.84375, + "grad_norm_var": 0.09160054524739583, + "learning_rate": 0.0001, + "loss": 7.4126, + "loss/crossentropy": 2.7276889085769653, + "loss/hidden": 1.89453125, + "loss/jsd": 0.0, + "loss/logits": 0.27903781831264496, + "step": 2532 + }, + { + "epoch": 0.0791875, + "grad_norm": 4.09375, + "grad_norm_var": 0.09919331868489584, + "learning_rate": 0.0001, + "loss": 6.6975, + "loss/crossentropy": 2.497929811477661, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.2410556524991989, + "step": 2534 + }, + { + "epoch": 0.07925, + "grad_norm": 4.3125, + "grad_norm_var": 0.16210530598958334, + "learning_rate": 0.0001, + "loss": 7.0674, + "loss/crossentropy": 2.7110267877578735, + "loss/hidden": 1.828125, + "loss/jsd": 0.0, + "loss/logits": 0.25282643735408783, + "step": 2536 + }, + { + "epoch": 0.0793125, + "grad_norm": 4.78125, + "grad_norm_var": 0.16951497395833334, + "learning_rate": 0.0001, + "loss": 6.9377, + "loss/crossentropy": 2.6172502040863037, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.2523553669452667, + "step": 2538 + }, + { + "epoch": 0.079375, + "grad_norm": 6.71875, + "grad_norm_var": 0.47849934895833335, + "learning_rate": 0.0001, + "loss": 6.696, + "loss/crossentropy": 2.4634408950805664, + "loss/hidden": 1.87109375, + "loss/jsd": 0.0, + "loss/logits": 0.23614561557769775, + "step": 2540 + }, + { + "epoch": 0.0794375, + "grad_norm": 4.0, + "grad_norm_var": 0.5531209309895834, + "learning_rate": 0.0001, + "loss": 7.1948, + "loss/crossentropy": 2.777570605278015, + "loss/hidden": 1.84375, + "loss/jsd": 0.0, + "loss/logits": 0.2573477476835251, + "step": 2542 + }, + { + "epoch": 0.0795, + "grad_norm": 4.96875, + "grad_norm_var": 0.5543619791666666, + "learning_rate": 0.0001, + "loss": 6.8532, + "loss/crossentropy": 2.5255823135375977, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.24799229949712753, + "step": 2544 + }, + { + "epoch": 0.0795625, + "grad_norm": 4.25, + "grad_norm_var": 0.52720947265625, + "learning_rate": 0.0001, + "loss": 7.1704, + "loss/crossentropy": 2.666976809501648, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.26518701016902924, + "step": 2546 + }, + { + "epoch": 0.079625, + "grad_norm": 4.84375, + "grad_norm_var": 0.5709869384765625, + "learning_rate": 0.0001, + "loss": 6.6822, + "loss/crossentropy": 2.4179537296295166, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.24439681321382523, + "step": 2548 + }, + { + "epoch": 0.0796875, + "grad_norm": 4.875, + "grad_norm_var": 0.5432688395182291, + "learning_rate": 0.0001, + "loss": 7.5325, + "loss/crossentropy": 2.8433114290237427, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2798580527305603, + "step": 2550 + }, + { + "epoch": 0.07975, + "grad_norm": 5.1875, + "grad_norm_var": 0.5427805582682291, + "learning_rate": 0.0001, + "loss": 6.5928, + "loss/crossentropy": 2.402013421058655, + "loss/hidden": 1.83984375, + "loss/jsd": 0.0, + "loss/logits": 0.23509907722473145, + "step": 2552 + }, + { + "epoch": 0.0798125, + "grad_norm": 3.71875, + "grad_norm_var": 0.5950429280598958, + "learning_rate": 0.0001, + "loss": 6.832, + "loss/crossentropy": 2.5818487405776978, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.24493704736232758, + "step": 2554 + }, + { + "epoch": 0.079875, + "grad_norm": 5.09375, + "grad_norm_var": 0.3174468994140625, + "learning_rate": 0.0001, + "loss": 7.0319, + "loss/crossentropy": 2.6635189056396484, + "loss/hidden": 1.87109375, + "loss/jsd": 0.0, + "loss/logits": 0.24973279982805252, + "step": 2556 + }, + { + "epoch": 0.0799375, + "grad_norm": 4.9375, + "grad_norm_var": 0.2541737874348958, + "learning_rate": 0.0001, + "loss": 7.4462, + "loss/crossentropy": 2.94194233417511, + "loss/hidden": 1.859375, + "loss/jsd": 0.0, + "loss/logits": 0.2644868791103363, + "step": 2558 + }, + { + "epoch": 0.08, + "grad_norm": 4.15625, + "grad_norm_var": 0.24468485514322916, + "learning_rate": 0.0001, + "loss": 6.816, + "loss/crossentropy": 2.615973472595215, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.24071022123098373, + "step": 2560 + }, + { + "epoch": 0.0800625, + "grad_norm": 4.53125, + "grad_norm_var": 0.23405659993489583, + "learning_rate": 0.0001, + "loss": 7.2102, + "loss/crossentropy": 2.626392126083374, + "loss/hidden": 1.87109375, + "loss/jsd": 0.0, + "loss/logits": 0.27126961946487427, + "step": 2562 + }, + { + "epoch": 0.080125, + "grad_norm": 4.15625, + "grad_norm_var": 0.23209228515625, + "learning_rate": 0.0001, + "loss": 7.2899, + "loss/crossentropy": 2.768537402153015, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.27244727313518524, + "step": 2564 + }, + { + "epoch": 0.0801875, + "grad_norm": 4.84375, + "grad_norm_var": 0.23557535807291666, + "learning_rate": 0.0001, + "loss": 6.807, + "loss/crossentropy": 2.501596689224243, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.24968481063842773, + "step": 2566 + }, + { + "epoch": 0.08025, + "grad_norm": 5.78125, + "grad_norm_var": 0.3113433837890625, + "learning_rate": 0.0001, + "loss": 6.8229, + "loss/crossentropy": 2.6043057441711426, + "loss/hidden": 1.76953125, + "loss/jsd": 0.0, + "loss/logits": 0.24490661919116974, + "step": 2568 + }, + { + "epoch": 0.0803125, + "grad_norm": 5.625, + "grad_norm_var": 0.3254954020182292, + "learning_rate": 0.0001, + "loss": 6.9471, + "loss/crossentropy": 2.533547282218933, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.26205888390541077, + "step": 2570 + }, + { + "epoch": 0.080375, + "grad_norm": 4.34375, + "grad_norm_var": 0.3199534098307292, + "learning_rate": 0.0001, + "loss": 7.1706, + "loss/crossentropy": 2.7811293601989746, + "loss/hidden": 1.82421875, + "loss/jsd": 0.0, + "loss/logits": 0.25652652978897095, + "step": 2572 + }, + { + "epoch": 0.0804375, + "grad_norm": 4.53125, + "grad_norm_var": 0.31607157389322915, + "learning_rate": 0.0001, + "loss": 7.0857, + "loss/crossentropy": 2.6379505395889282, + "loss/hidden": 1.875, + "loss/jsd": 0.0, + "loss/logits": 0.25727570056915283, + "step": 2574 + }, + { + "epoch": 0.0805, + "grad_norm": 4.625, + "grad_norm_var": 0.28181864420572916, + "learning_rate": 0.0001, + "loss": 7.2665, + "loss/crossentropy": 2.77507221698761, + "loss/hidden": 1.89453125, + "loss/jsd": 0.0, + "loss/logits": 0.2596895396709442, + "step": 2576 + }, + { + "epoch": 0.0805625, + "grad_norm": 4.53125, + "grad_norm_var": 0.2671132405598958, + "learning_rate": 0.0001, + "loss": 7.2025, + "loss/crossentropy": 2.7237536907196045, + "loss/hidden": 1.83984375, + "loss/jsd": 0.0, + "loss/logits": 0.26388655602931976, + "step": 2578 + }, + { + "epoch": 0.080625, + "grad_norm": 4.46875, + "grad_norm_var": 0.2418121337890625, + "learning_rate": 0.0001, + "loss": 6.6304, + "loss/crossentropy": 2.4240094423294067, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.23938723653554916, + "step": 2580 + }, + { + "epoch": 0.0806875, + "grad_norm": 3.984375, + "grad_norm_var": 0.248388671875, + "learning_rate": 0.0001, + "loss": 6.8911, + "loss/crossentropy": 2.5649718046188354, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2521471083164215, + "step": 2582 + }, + { + "epoch": 0.08075, + "grad_norm": 4.125, + "grad_norm_var": 0.1258209228515625, + "learning_rate": 0.0001, + "loss": 6.8676, + "loss/crossentropy": 2.6262258291244507, + "loss/hidden": 1.76953125, + "loss/jsd": 0.0, + "loss/logits": 0.24718241393566132, + "step": 2584 + }, + { + "epoch": 0.0808125, + "grad_norm": 4.1875, + "grad_norm_var": 0.040852864583333336, + "learning_rate": 0.0001, + "loss": 6.9075, + "loss/crossentropy": 2.565857768058777, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.2533073127269745, + "step": 2586 + }, + { + "epoch": 0.080875, + "grad_norm": 4.03125, + "grad_norm_var": 0.05105794270833333, + "learning_rate": 0.0001, + "loss": 7.0589, + "loss/crossentropy": 2.7443935871124268, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.2537176012992859, + "step": 2588 + }, + { + "epoch": 0.0809375, + "grad_norm": 4.625, + "grad_norm_var": 0.05431315104166667, + "learning_rate": 0.0001, + "loss": 7.094, + "loss/crossentropy": 2.6940484046936035, + "loss/hidden": 1.83984375, + "loss/jsd": 0.0, + "loss/logits": 0.25601277500391006, + "step": 2590 + }, + { + "epoch": 0.081, + "grad_norm": 5.15625, + "grad_norm_var": 0.099755859375, + "learning_rate": 0.0001, + "loss": 6.7848, + "loss/crossentropy": 2.6095337867736816, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.24252191185951233, + "step": 2592 + }, + { + "epoch": 0.0810625, + "grad_norm": 4.875, + "grad_norm_var": 0.11858723958333334, + "learning_rate": 0.0001, + "loss": 6.9664, + "loss/crossentropy": 2.6425299644470215, + "loss/hidden": 1.83203125, + "loss/jsd": 0.0, + "loss/logits": 0.24917972832918167, + "step": 2594 + }, + { + "epoch": 0.081125, + "grad_norm": 3.9375, + "grad_norm_var": 0.12763264973958333, + "learning_rate": 0.0001, + "loss": 6.6924, + "loss/crossentropy": 2.5188519954681396, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.23844516277313232, + "step": 2596 + }, + { + "epoch": 0.0811875, + "grad_norm": 4.40625, + "grad_norm_var": 0.11960347493489583, + "learning_rate": 0.0001, + "loss": 7.2267, + "loss/crossentropy": 2.77741277217865, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2597716152667999, + "step": 2598 + }, + { + "epoch": 0.08125, + "grad_norm": 4.46875, + "grad_norm_var": 0.11912333170572917, + "learning_rate": 0.0001, + "loss": 6.9949, + "loss/crossentropy": 2.707468867301941, + "loss/hidden": 1.81640625, + "loss/jsd": 0.0, + "loss/logits": 0.2471056804060936, + "step": 2600 + }, + { + "epoch": 0.0813125, + "grad_norm": 3.90625, + "grad_norm_var": 0.11705729166666666, + "learning_rate": 0.0001, + "loss": 6.7894, + "loss/crossentropy": 2.567119598388672, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.24449165165424347, + "step": 2602 + }, + { + "epoch": 0.081375, + "grad_norm": 4.25, + "grad_norm_var": 0.10214436848958333, + "learning_rate": 0.0001, + "loss": 6.9362, + "loss/crossentropy": 2.680039644241333, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.2408483326435089, + "step": 2604 + }, + { + "epoch": 0.0814375, + "grad_norm": 4.53125, + "grad_norm_var": 0.10441080729166667, + "learning_rate": 0.0001, + "loss": 6.6155, + "loss/crossentropy": 2.513710618019104, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.23205246031284332, + "step": 2606 + }, + { + "epoch": 0.0815, + "grad_norm": 4.46875, + "grad_norm_var": 0.056473795572916666, + "learning_rate": 0.0001, + "loss": 6.9472, + "loss/crossentropy": 2.6492717266082764, + "loss/hidden": 1.82421875, + "loss/jsd": 0.0, + "loss/logits": 0.24736837297677994, + "step": 2608 + }, + { + "epoch": 0.0815625, + "grad_norm": 3.828125, + "grad_norm_var": 0.05537821451822917, + "learning_rate": 0.0001, + "loss": 6.8619, + "loss/crossentropy": 2.6352410316467285, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.24454277008771896, + "step": 2610 + }, + { + "epoch": 0.081625, + "grad_norm": 4.3125, + "grad_norm_var": 0.048460896809895834, + "learning_rate": 0.0001, + "loss": 7.0136, + "loss/crossentropy": 2.6773797273635864, + "loss/hidden": 1.82421875, + "loss/jsd": 0.0, + "loss/logits": 0.2512050122022629, + "step": 2612 + }, + { + "epoch": 0.0816875, + "grad_norm": 3.9375, + "grad_norm_var": 0.06180013020833333, + "learning_rate": 0.0001, + "loss": 6.8441, + "loss/crossentropy": 2.6951568126678467, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.2355944886803627, + "step": 2614 + }, + { + "epoch": 0.08175, + "grad_norm": 3.703125, + "grad_norm_var": 0.08315327962239584, + "learning_rate": 0.0001, + "loss": 7.0625, + "loss/crossentropy": 2.8284648656845093, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.2441062480211258, + "step": 2616 + }, + { + "epoch": 0.0818125, + "grad_norm": 3.90625, + "grad_norm_var": 0.08290608723958333, + "learning_rate": 0.0001, + "loss": 7.0338, + "loss/crossentropy": 2.8697853088378906, + "loss/hidden": 1.78515625, + "loss/jsd": 0.0, + "loss/logits": 0.23789025843143463, + "step": 2618 + }, + { + "epoch": 0.081875, + "grad_norm": 4.03125, + "grad_norm_var": 0.0837890625, + "learning_rate": 0.0001, + "loss": 6.5213, + "loss/crossentropy": 2.4423439502716064, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.23367558419704437, + "step": 2620 + }, + { + "epoch": 0.0819375, + "grad_norm": 4.25, + "grad_norm_var": 0.085400390625, + "learning_rate": 0.0001, + "loss": 6.9671, + "loss/crossentropy": 2.7237391471862793, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.24503568559885025, + "step": 2622 + }, + { + "epoch": 0.082, + "grad_norm": 4.40625, + "grad_norm_var": 0.07935791015625, + "learning_rate": 0.0001, + "loss": 6.7533, + "loss/crossentropy": 2.7278497219085693, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.22793515026569366, + "step": 2624 + }, + { + "epoch": 0.0820625, + "grad_norm": 4.46875, + "grad_norm_var": 0.0682769775390625, + "learning_rate": 0.0001, + "loss": 6.983, + "loss/crossentropy": 2.63441264629364, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2528279647231102, + "step": 2626 + }, + { + "epoch": 0.082125, + "grad_norm": 4.28125, + "grad_norm_var": 0.07888895670572917, + "learning_rate": 0.0001, + "loss": 7.0744, + "loss/crossentropy": 2.7248504161834717, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.25292399525642395, + "step": 2628 + }, + { + "epoch": 0.0821875, + "grad_norm": 4.28125, + "grad_norm_var": 0.06802978515625, + "learning_rate": 0.0001, + "loss": 6.8822, + "loss/crossentropy": 2.598918318748474, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.24317240715026855, + "step": 2630 + }, + { + "epoch": 0.08225, + "grad_norm": 3.953125, + "grad_norm_var": 0.06279296875, + "learning_rate": 0.0001, + "loss": 7.1108, + "loss/crossentropy": 2.6071430444717407, + "loss/hidden": 1.85546875, + "loss/jsd": 0.0, + "loss/logits": 0.2648216784000397, + "step": 2632 + }, + { + "epoch": 0.0823125, + "grad_norm": 4.0625, + "grad_norm_var": 0.05176493326822917, + "learning_rate": 0.0001, + "loss": 7.0101, + "loss/crossentropy": 2.7162694931030273, + "loss/hidden": 1.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2465699315071106, + "step": 2634 + }, + { + "epoch": 0.082375, + "grad_norm": 4.0625, + "grad_norm_var": 0.05611572265625, + "learning_rate": 0.0001, + "loss": 7.1623, + "loss/crossentropy": 2.8163429498672485, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.2545183300971985, + "step": 2636 + }, + { + "epoch": 0.0824375, + "grad_norm": 4.34375, + "grad_norm_var": 0.06080322265625, + "learning_rate": 0.0001, + "loss": 7.2852, + "loss/crossentropy": 2.8388519287109375, + "loss/hidden": 1.83984375, + "loss/jsd": 0.0, + "loss/logits": 0.2606465071439743, + "step": 2638 + }, + { + "epoch": 0.0825, + "grad_norm": 4.5, + "grad_norm_var": 0.06549072265625, + "learning_rate": 0.0001, + "loss": 6.9992, + "loss/crossentropy": 2.734708309173584, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.24676088988780975, + "step": 2640 + }, + { + "epoch": 0.0825625, + "grad_norm": 4.5, + "grad_norm_var": 0.06012369791666667, + "learning_rate": 0.0001, + "loss": 6.7951, + "loss/crossentropy": 2.6085119247436523, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.24052976816892624, + "step": 2642 + }, + { + "epoch": 0.082625, + "grad_norm": 4.40625, + "grad_norm_var": 0.055615234375, + "learning_rate": 0.0001, + "loss": 6.8036, + "loss/crossentropy": 2.632908344268799, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.24050304293632507, + "step": 2644 + }, + { + "epoch": 0.0826875, + "grad_norm": 4.1875, + "grad_norm_var": 0.05563151041666667, + "learning_rate": 0.0001, + "loss": 7.0268, + "loss/crossentropy": 2.7476726770401, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.24588297307491302, + "step": 2646 + }, + { + "epoch": 0.08275, + "grad_norm": 4.5625, + "grad_norm_var": 0.04209696451822917, + "learning_rate": 0.0001, + "loss": 6.8453, + "loss/crossentropy": 2.6198912858963013, + "loss/hidden": 1.83203125, + "loss/jsd": 0.0, + "loss/logits": 0.23934194445610046, + "step": 2648 + }, + { + "epoch": 0.0828125, + "grad_norm": 4.09375, + "grad_norm_var": 0.042455037434895836, + "learning_rate": 0.0001, + "loss": 7.0919, + "loss/crossentropy": 2.7107146978378296, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.25687095522880554, + "step": 2650 + }, + { + "epoch": 0.082875, + "grad_norm": 3.921875, + "grad_norm_var": 0.04362691243489583, + "learning_rate": 0.0001, + "loss": 6.6233, + "loss/crossentropy": 2.5405800342559814, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.2360065057873726, + "step": 2652 + }, + { + "epoch": 0.0829375, + "grad_norm": 4.0, + "grad_norm_var": 0.0378326416015625, + "learning_rate": 0.0001, + "loss": 6.5801, + "loss/crossentropy": 2.442927122116089, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.23364173620939255, + "step": 2654 + }, + { + "epoch": 0.083, + "grad_norm": 4.46875, + "grad_norm_var": 0.0396392822265625, + "learning_rate": 0.0001, + "loss": 6.8022, + "loss/crossentropy": 2.5995049476623535, + "loss/hidden": 1.78515625, + "loss/jsd": 0.0, + "loss/logits": 0.24175354093313217, + "step": 2656 + }, + { + "epoch": 0.0830625, + "grad_norm": 4.59375, + "grad_norm_var": 0.0486724853515625, + "learning_rate": 0.0001, + "loss": 7.1417, + "loss/crossentropy": 2.7718758583068848, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.25222010165452957, + "step": 2658 + }, + { + "epoch": 0.083125, + "grad_norm": 4.71875, + "grad_norm_var": 0.0679107666015625, + "learning_rate": 0.0001, + "loss": 6.9471, + "loss/crossentropy": 2.580165386199951, + "loss/hidden": 1.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.25309784710407257, + "step": 2660 + }, + { + "epoch": 0.0831875, + "grad_norm": 4.34375, + "grad_norm_var": 0.0666412353515625, + "learning_rate": 0.0001, + "loss": 7.1886, + "loss/crossentropy": 2.7995067834854126, + "loss/hidden": 1.875, + "loss/jsd": 0.0, + "loss/logits": 0.25140853226184845, + "step": 2662 + }, + { + "epoch": 0.08325, + "grad_norm": 4.28125, + "grad_norm_var": 0.0606109619140625, + "learning_rate": 0.0001, + "loss": 6.9854, + "loss/crossentropy": 2.60084867477417, + "loss/hidden": 1.828125, + "loss/jsd": 0.0, + "loss/logits": 0.255643293261528, + "step": 2664 + }, + { + "epoch": 0.0833125, + "grad_norm": 4.40625, + "grad_norm_var": 0.2248199462890625, + "learning_rate": 0.0001, + "loss": 7.6103, + "loss/crossentropy": 2.9394861459732056, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2858267277479172, + "step": 2666 + }, + { + "epoch": 0.083375, + "grad_norm": 5.53125, + "grad_norm_var": 0.3035552978515625, + "learning_rate": 0.0001, + "loss": 6.9792, + "loss/crossentropy": 2.661886215209961, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2497013434767723, + "step": 2668 + }, + { + "epoch": 0.0834375, + "grad_norm": 4.09375, + "grad_norm_var": 0.3045857747395833, + "learning_rate": 0.0001, + "loss": 6.5979, + "loss/crossentropy": 2.470514416694641, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2314888834953308, + "step": 2670 + }, + { + "epoch": 0.0835, + "grad_norm": 4.40625, + "grad_norm_var": 0.28738606770833336, + "learning_rate": 0.0001, + "loss": 7.3821, + "loss/crossentropy": 2.8701629638671875, + "loss/hidden": 1.86328125, + "loss/jsd": 0.0, + "loss/logits": 0.2648680955171585, + "step": 2672 + }, + { + "epoch": 0.0835625, + "grad_norm": 4.21875, + "grad_norm_var": 0.29339192708333334, + "learning_rate": 0.0001, + "loss": 6.7866, + "loss/crossentropy": 2.513500213623047, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.24605898559093475, + "step": 2674 + }, + { + "epoch": 0.083625, + "grad_norm": 5.75, + "grad_norm_var": 0.43191731770833336, + "learning_rate": 0.0001, + "loss": 6.4642, + "loss/crossentropy": 2.3846570253372192, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.22982840240001678, + "step": 2676 + }, + { + "epoch": 0.0836875, + "grad_norm": 4.34375, + "grad_norm_var": 0.43017171223958334, + "learning_rate": 0.0001, + "loss": 7.1463, + "loss/crossentropy": 2.7192554473876953, + "loss/hidden": 1.828125, + "loss/jsd": 0.0, + "loss/logits": 0.2598957419395447, + "step": 2678 + }, + { + "epoch": 0.08375, + "grad_norm": 4.375, + "grad_norm_var": 0.4313435872395833, + "learning_rate": 0.0001, + "loss": 7.0543, + "loss/crossentropy": 2.72619366645813, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.25624796748161316, + "step": 2680 + }, + { + "epoch": 0.0838125, + "grad_norm": 4.28125, + "grad_norm_var": 0.30953776041666664, + "learning_rate": 0.0001, + "loss": 6.8765, + "loss/crossentropy": 2.620025634765625, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2451772838830948, + "step": 2682 + }, + { + "epoch": 0.083875, + "grad_norm": 4.71875, + "grad_norm_var": 0.20747782389322916, + "learning_rate": 0.0001, + "loss": 6.9777, + "loss/crossentropy": 2.6200352907180786, + "loss/hidden": 1.82421875, + "loss/jsd": 0.0, + "loss/logits": 0.2533438503742218, + "step": 2684 + }, + { + "epoch": 0.0839375, + "grad_norm": 4.46875, + "grad_norm_var": 0.19401041666666666, + "learning_rate": 0.0001, + "loss": 6.9249, + "loss/crossentropy": 2.5884883403778076, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2531713396310806, + "step": 2686 + }, + { + "epoch": 0.084, + "grad_norm": 4.46875, + "grad_norm_var": 0.20788472493489582, + "learning_rate": 0.0001, + "loss": 6.4523, + "loss/crossentropy": 2.364005446434021, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.22406137734651566, + "step": 2688 + }, + { + "epoch": 0.0840625, + "grad_norm": 4.53125, + "grad_norm_var": 0.20716044108072917, + "learning_rate": 0.0001, + "loss": 7.1637, + "loss/crossentropy": 2.6903235912323, + "loss/hidden": 1.83203125, + "loss/jsd": 0.0, + "loss/logits": 0.26413694024086, + "step": 2690 + }, + { + "epoch": 0.084125, + "grad_norm": 4.15625, + "grad_norm_var": 0.05725809733072917, + "learning_rate": 0.0001, + "loss": 7.0104, + "loss/crossentropy": 2.6367440223693848, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2561148405075073, + "step": 2692 + }, + { + "epoch": 0.0841875, + "grad_norm": 4.09375, + "grad_norm_var": 0.0597320556640625, + "learning_rate": 0.0001, + "loss": 7.0027, + "loss/crossentropy": 2.7296712398529053, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.2472282499074936, + "step": 2694 + }, + { + "epoch": 0.08425, + "grad_norm": 5.28125, + "grad_norm_var": 0.11960347493489583, + "learning_rate": 0.0001, + "loss": 6.5956, + "loss/crossentropy": 2.212746024131775, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2562513202428818, + "step": 2696 + }, + { + "epoch": 0.0843125, + "grad_norm": 4.125, + "grad_norm_var": 0.12383524576822917, + "learning_rate": 0.0001, + "loss": 6.456, + "loss/crossentropy": 2.245554804801941, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.23628269135951996, + "step": 2698 + }, + { + "epoch": 0.084375, + "grad_norm": 4.375, + "grad_norm_var": 0.11800028483072916, + "learning_rate": 0.0001, + "loss": 7.0587, + "loss/crossentropy": 2.7779324054718018, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.24682226032018661, + "step": 2700 + }, + { + "epoch": 0.0844375, + "grad_norm": 6.0625, + "grad_norm_var": 0.2856597900390625, + "learning_rate": 0.0001, + "loss": 6.8239, + "loss/crossentropy": 2.4644941091537476, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.2566477209329605, + "step": 2702 + }, + { + "epoch": 0.0845, + "grad_norm": 4.75, + "grad_norm_var": 0.267822265625, + "learning_rate": 0.0001, + "loss": 7.1954, + "loss/crossentropy": 2.770855665206909, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.2545660585165024, + "step": 2704 + }, + { + "epoch": 0.0845625, + "grad_norm": 3.859375, + "grad_norm_var": 0.3005116780598958, + "learning_rate": 0.0001, + "loss": 7.1604, + "loss/crossentropy": 2.833829402923584, + "loss/hidden": 1.83203125, + "loss/jsd": 0.0, + "loss/logits": 0.24945129454135895, + "step": 2706 + }, + { + "epoch": 0.084625, + "grad_norm": 4.53125, + "grad_norm_var": 0.2871734619140625, + "learning_rate": 0.0001, + "loss": 6.8071, + "loss/crossentropy": 2.4861559867858887, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2508440986275673, + "step": 2708 + }, + { + "epoch": 0.0846875, + "grad_norm": 4.21875, + "grad_norm_var": 0.28145243326822916, + "learning_rate": 0.0001, + "loss": 7.2675, + "loss/crossentropy": 2.794210195541382, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.2625606060028076, + "step": 2710 + }, + { + "epoch": 0.08475, + "grad_norm": 3.875, + "grad_norm_var": 0.2695465087890625, + "learning_rate": 0.0001, + "loss": 6.6533, + "loss/crossentropy": 2.558227300643921, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.23138165473937988, + "step": 2712 + }, + { + "epoch": 0.0848125, + "grad_norm": 4.03125, + "grad_norm_var": 0.27464090983072914, + "learning_rate": 0.0001, + "loss": 6.8092, + "loss/crossentropy": 2.6484687328338623, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.24107811599969864, + "step": 2714 + }, + { + "epoch": 0.084875, + "grad_norm": 4.53125, + "grad_norm_var": 0.29113667805989585, + "learning_rate": 0.0001, + "loss": 6.639, + "loss/crossentropy": 2.3600529432296753, + "loss/hidden": 1.8984375, + "loss/jsd": 0.0, + "loss/logits": 0.23805497586727142, + "step": 2716 + }, + { + "epoch": 0.0849375, + "grad_norm": 4.15625, + "grad_norm_var": 0.13931884765625, + "learning_rate": 0.0001, + "loss": 6.7615, + "loss/crossentropy": 2.654744267463684, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.23645318299531937, + "step": 2718 + }, + { + "epoch": 0.085, + "grad_norm": 4.65625, + "grad_norm_var": 0.13863932291666667, + "learning_rate": 0.0001, + "loss": 6.7733, + "loss/crossentropy": 2.5434645414352417, + "loss/hidden": 1.81640625, + "loss/jsd": 0.0, + "loss/logits": 0.2413441687822342, + "step": 2720 + }, + { + "epoch": 0.0850625, + "grad_norm": 4.09375, + "grad_norm_var": 0.12547098795572917, + "learning_rate": 0.0001, + "loss": 6.9629, + "loss/crossentropy": 2.571885824203491, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2570737153291702, + "step": 2722 + }, + { + "epoch": 0.085125, + "grad_norm": 4.46875, + "grad_norm_var": 0.1234039306640625, + "learning_rate": 0.0001, + "loss": 7.1714, + "loss/crossentropy": 2.752210855484009, + "loss/hidden": 1.85546875, + "loss/jsd": 0.0, + "loss/logits": 0.2563716471195221, + "step": 2724 + }, + { + "epoch": 0.0851875, + "grad_norm": 4.15625, + "grad_norm_var": 0.12360738118489584, + "learning_rate": 0.0001, + "loss": 7.13, + "loss/crossentropy": 2.7513015270233154, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.2570132464170456, + "step": 2726 + }, + { + "epoch": 0.08525, + "grad_norm": 5.125, + "grad_norm_var": 0.15066731770833333, + "learning_rate": 0.0001, + "loss": 6.8562, + "loss/crossentropy": 2.527231216430664, + "loss/hidden": 1.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.24617478996515274, + "step": 2728 + }, + { + "epoch": 0.0853125, + "grad_norm": 7.53125, + "grad_norm_var": 0.7095987955729167, + "learning_rate": 0.0001, + "loss": 7.6808, + "loss/crossentropy": 2.98296856880188, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.2807210758328438, + "step": 2730 + }, + { + "epoch": 0.085375, + "grad_norm": 4.09375, + "grad_norm_var": 0.73336181640625, + "learning_rate": 0.0001, + "loss": 6.8353, + "loss/crossentropy": 2.6061861515045166, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.24556761980056763, + "step": 2732 + }, + { + "epoch": 0.0854375, + "grad_norm": 4.15625, + "grad_norm_var": 0.74990234375, + "learning_rate": 0.0001, + "loss": 6.569, + "loss/crossentropy": 2.427661895751953, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.23678777366876602, + "step": 2734 + }, + { + "epoch": 0.0855, + "grad_norm": 3.90625, + "grad_norm_var": 0.8070220947265625, + "learning_rate": 0.0001, + "loss": 6.4683, + "loss/crossentropy": 2.5308948755264282, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.21951919049024582, + "step": 2736 + }, + { + "epoch": 0.0855625, + "grad_norm": 4.59375, + "grad_norm_var": 0.8096832275390625, + "learning_rate": 0.0001, + "loss": 7.1404, + "loss/crossentropy": 2.737537384033203, + "loss/hidden": 1.83984375, + "loss/jsd": 0.0, + "loss/logits": 0.2563035860657692, + "step": 2738 + }, + { + "epoch": 0.085625, + "grad_norm": 3.875, + "grad_norm_var": 0.8395416259765625, + "learning_rate": 0.0001, + "loss": 6.6333, + "loss/crossentropy": 2.508602023124695, + "loss/hidden": 1.76171875, + "loss/jsd": 0.0, + "loss/logits": 0.23629501461982727, + "step": 2740 + }, + { + "epoch": 0.0856875, + "grad_norm": 4.40625, + "grad_norm_var": 0.8300201416015625, + "learning_rate": 0.0001, + "loss": 7.1187, + "loss/crossentropy": 2.715458035469055, + "loss/hidden": 1.82421875, + "loss/jsd": 0.0, + "loss/logits": 0.2579001486301422, + "step": 2742 + }, + { + "epoch": 0.08575, + "grad_norm": 4.03125, + "grad_norm_var": 0.802294921875, + "learning_rate": 0.0001, + "loss": 6.6738, + "loss/crossentropy": 2.5113805532455444, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.2396751567721367, + "step": 2744 + }, + { + "epoch": 0.0858125, + "grad_norm": 4.03125, + "grad_norm_var": 0.04358723958333333, + "learning_rate": 0.0001, + "loss": 6.8626, + "loss/crossentropy": 2.678552985191345, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.2434050738811493, + "step": 2746 + }, + { + "epoch": 0.085875, + "grad_norm": 4.40625, + "grad_norm_var": 0.048014322916666664, + "learning_rate": 0.0001, + "loss": 7.0644, + "loss/crossentropy": 2.6373531818389893, + "loss/hidden": 1.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.25911282002925873, + "step": 2748 + }, + { + "epoch": 0.0859375, + "grad_norm": 4.25, + "grad_norm_var": 0.046808878580729164, + "learning_rate": 0.0001, + "loss": 6.9114, + "loss/crossentropy": 2.703710198402405, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.24264460057020187, + "step": 2750 + }, + { + "epoch": 0.086, + "grad_norm": 7.09375, + "grad_norm_var": 0.60865478515625, + "learning_rate": 0.0001, + "loss": 7.2531, + "loss/crossentropy": 2.7426042556762695, + "loss/hidden": 1.875, + "loss/jsd": 0.0, + "loss/logits": 0.2635515555739403, + "step": 2752 + }, + { + "epoch": 0.0860625, + "grad_norm": 4.15625, + "grad_norm_var": 0.6038899739583333, + "learning_rate": 0.0001, + "loss": 7.0144, + "loss/crossentropy": 2.686529755592346, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.25348879396915436, + "step": 2754 + }, + { + "epoch": 0.086125, + "grad_norm": 4.125, + "grad_norm_var": 0.5939737955729166, + "learning_rate": 0.0001, + "loss": 6.8584, + "loss/crossentropy": 2.621758818626404, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.24397746473550797, + "step": 2756 + }, + { + "epoch": 0.0861875, + "grad_norm": 3.84375, + "grad_norm_var": 0.6249257405598958, + "learning_rate": 0.0001, + "loss": 6.4882, + "loss/crossentropy": 2.4892083406448364, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.22528532147407532, + "step": 2758 + }, + { + "epoch": 0.08625, + "grad_norm": 4.09375, + "grad_norm_var": 0.6170155843098958, + "learning_rate": 0.0001, + "loss": 6.6869, + "loss/crossentropy": 2.5306947231292725, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.2367103397846222, + "step": 2760 + }, + { + "epoch": 0.0863125, + "grad_norm": 4.125, + "grad_norm_var": 0.6169260660807292, + "learning_rate": 0.0001, + "loss": 7.078, + "loss/crossentropy": 2.7038986682891846, + "loss/hidden": 1.828125, + "loss/jsd": 0.0, + "loss/logits": 0.25459785759449005, + "step": 2762 + }, + { + "epoch": 0.086375, + "grad_norm": 4.03125, + "grad_norm_var": 0.6237375895182292, + "learning_rate": 0.0001, + "loss": 6.8762, + "loss/crossentropy": 2.646838665008545, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.2440301477909088, + "step": 2764 + }, + { + "epoch": 0.0864375, + "grad_norm": 4.03125, + "grad_norm_var": 0.6159332275390625, + "learning_rate": 0.0001, + "loss": 7.0619, + "loss/crossentropy": 2.8066179752349854, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.2477927803993225, + "step": 2766 + }, + { + "epoch": 0.0865, + "grad_norm": 4.15625, + "grad_norm_var": 0.0525787353515625, + "learning_rate": 0.0001, + "loss": 7.0585, + "loss/crossentropy": 2.8025286197662354, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.24552123248577118, + "step": 2768 + }, + { + "epoch": 0.0865625, + "grad_norm": 3.90625, + "grad_norm_var": 0.06516520182291667, + "learning_rate": 0.0001, + "loss": 6.6905, + "loss/crossentropy": 2.5861109495162964, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.2338722199201584, + "step": 2770 + }, + { + "epoch": 0.086625, + "grad_norm": 4.09375, + "grad_norm_var": 0.06992085774739583, + "learning_rate": 0.0001, + "loss": 6.5937, + "loss/crossentropy": 2.4812984466552734, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.23702051490545273, + "step": 2772 + }, + { + "epoch": 0.0866875, + "grad_norm": 4.09375, + "grad_norm_var": 0.08655598958333334, + "learning_rate": 0.0001, + "loss": 6.9514, + "loss/crossentropy": 2.6603236198425293, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.25136835873126984, + "step": 2774 + }, + { + "epoch": 0.08675, + "grad_norm": 3.828125, + "grad_norm_var": 0.10271809895833334, + "learning_rate": 0.0001, + "loss": 6.6624, + "loss/crossentropy": 2.646838068962097, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.22812309861183167, + "step": 2776 + }, + { + "epoch": 0.0868125, + "grad_norm": 4.0, + "grad_norm_var": 0.08000895182291666, + "learning_rate": 0.0001, + "loss": 6.6274, + "loss/crossentropy": 2.505067229270935, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.2345016449689865, + "step": 2778 + }, + { + "epoch": 0.086875, + "grad_norm": 4.8125, + "grad_norm_var": 0.10718485514322916, + "learning_rate": 0.0001, + "loss": 6.8672, + "loss/crossentropy": 2.6379599571228027, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.24636013805866241, + "step": 2780 + }, + { + "epoch": 0.0869375, + "grad_norm": 3.859375, + "grad_norm_var": 0.11272786458333334, + "learning_rate": 0.0001, + "loss": 6.9603, + "loss/crossentropy": 2.7323312759399414, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.24545498192310333, + "step": 2782 + }, + { + "epoch": 0.087, + "grad_norm": 4.03125, + "grad_norm_var": 0.09674072265625, + "learning_rate": 0.0001, + "loss": 6.5846, + "loss/crossentropy": 2.439736485481262, + "loss/hidden": 1.83203125, + "loss/jsd": 0.0, + "loss/logits": 0.23127836734056473, + "step": 2784 + }, + { + "epoch": 0.0870625, + "grad_norm": 4.5625, + "grad_norm_var": 0.1028472900390625, + "learning_rate": 0.0001, + "loss": 7.1149, + "loss/crossentropy": 2.7336888313293457, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.25842948257923126, + "step": 2786 + }, + { + "epoch": 0.087125, + "grad_norm": 3.890625, + "grad_norm_var": 0.10173238118489583, + "learning_rate": 0.0001, + "loss": 6.6214, + "loss/crossentropy": 2.5512136220932007, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.23397225886583328, + "step": 2788 + }, + { + "epoch": 0.0871875, + "grad_norm": 4.34375, + "grad_norm_var": 0.07649637858072916, + "learning_rate": 0.0001, + "loss": 6.8521, + "loss/crossentropy": 2.6787188053131104, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.23803870379924774, + "step": 2790 + }, + { + "epoch": 0.08725, + "grad_norm": 5.09375, + "grad_norm_var": 0.12491861979166667, + "learning_rate": 0.0001, + "loss": 6.6633, + "loss/crossentropy": 2.4818060398101807, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.23885706812143326, + "step": 2792 + }, + { + "epoch": 0.0873125, + "grad_norm": 5.21875, + "grad_norm_var": 0.197265625, + "learning_rate": 0.0001, + "loss": 7.4022, + "loss/crossentropy": 2.7372331619262695, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.2754848003387451, + "step": 2794 + }, + { + "epoch": 0.087375, + "grad_norm": 4.125, + "grad_norm_var": 0.17433980305989583, + "learning_rate": 0.0001, + "loss": 6.979, + "loss/crossentropy": 2.7950183153152466, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.24418172985315323, + "step": 2796 + }, + { + "epoch": 0.0874375, + "grad_norm": 4.0, + "grad_norm_var": 0.17021077473958332, + "learning_rate": 0.0001, + "loss": 6.909, + "loss/crossentropy": 2.6368402242660522, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.24987629055976868, + "step": 2798 + }, + { + "epoch": 0.0875, + "grad_norm": 4.15625, + "grad_norm_var": 0.16259358723958334, + "learning_rate": 0.0001, + "loss": 6.9035, + "loss/crossentropy": 2.65077543258667, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.24793104082345963, + "step": 2800 + }, + { + "epoch": 0.0875625, + "grad_norm": 4.25, + "grad_norm_var": 0.15861002604166666, + "learning_rate": 0.0001, + "loss": 6.7637, + "loss/crossentropy": 2.6732107400894165, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.23014727979898453, + "step": 2802 + }, + { + "epoch": 0.087625, + "grad_norm": 4.1875, + "grad_norm_var": 0.16157124837239584, + "learning_rate": 0.0001, + "loss": 7.1957, + "loss/crossentropy": 2.7851150035858154, + "loss/hidden": 1.82421875, + "loss/jsd": 0.0, + "loss/logits": 0.2586326599121094, + "step": 2804 + }, + { + "epoch": 0.0876875, + "grad_norm": 4.25, + "grad_norm_var": 0.1755279541015625, + "learning_rate": 0.0001, + "loss": 6.8912, + "loss/crossentropy": 2.5576705932617188, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.25679074227809906, + "step": 2806 + }, + { + "epoch": 0.08775, + "grad_norm": 4.03125, + "grad_norm_var": 0.13225504557291667, + "learning_rate": 0.0001, + "loss": 6.9721, + "loss/crossentropy": 2.7346194982528687, + "loss/hidden": 1.76171875, + "loss/jsd": 0.0, + "loss/logits": 0.24757720530033112, + "step": 2808 + }, + { + "epoch": 0.0878125, + "grad_norm": 3.796875, + "grad_norm_var": 0.08034566243489584, + "learning_rate": 0.0001, + "loss": 7.0882, + "loss/crossentropy": 2.774062395095825, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.25055310130119324, + "step": 2810 + }, + { + "epoch": 0.087875, + "grad_norm": 4.15625, + "grad_norm_var": 0.0814849853515625, + "learning_rate": 0.0001, + "loss": 6.645, + "loss/crossentropy": 2.5014760494232178, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.24013769626617432, + "step": 2812 + }, + { + "epoch": 0.0879375, + "grad_norm": 4.34375, + "grad_norm_var": 0.08295796712239584, + "learning_rate": 0.0001, + "loss": 6.8454, + "loss/crossentropy": 2.706041693687439, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.2389346957206726, + "step": 2814 + }, + { + "epoch": 0.088, + "grad_norm": 4.28125, + "grad_norm_var": 0.0823883056640625, + "learning_rate": 0.0001, + "loss": 6.6613, + "loss/crossentropy": 2.494887948036194, + "loss/hidden": 1.81640625, + "loss/jsd": 0.0, + "loss/logits": 0.2349967360496521, + "step": 2816 + }, + { + "epoch": 0.0880625, + "grad_norm": 4.5625, + "grad_norm_var": 5.473029581705729, + "learning_rate": 0.0001, + "loss": 7.6397, + "loss/crossentropy": 2.7328039407730103, + "loss/hidden": 1.87109375, + "loss/jsd": 0.0, + "loss/logits": 0.30358322709798813, + "step": 2818 + }, + { + "epoch": 0.088125, + "grad_norm": 4.15625, + "grad_norm_var": 5.509585571289063, + "learning_rate": 0.0001, + "loss": 6.719, + "loss/crossentropy": 2.626744270324707, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.2346126213669777, + "step": 2820 + }, + { + "epoch": 0.0881875, + "grad_norm": 4.03125, + "grad_norm_var": 5.575602213541667, + "learning_rate": 0.0001, + "loss": 6.7236, + "loss/crossentropy": 2.6262214183807373, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.23395337909460068, + "step": 2822 + }, + { + "epoch": 0.08825, + "grad_norm": 4.4375, + "grad_norm_var": 5.55611572265625, + "learning_rate": 0.0001, + "loss": 6.95, + "loss/crossentropy": 2.6433587074279785, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.2505885064601898, + "step": 2824 + }, + { + "epoch": 0.0883125, + "grad_norm": 4.15625, + "grad_norm_var": 5.5215810139973955, + "learning_rate": 0.0001, + "loss": 6.9737, + "loss/crossentropy": 2.6511499881744385, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2502279579639435, + "step": 2826 + }, + { + "epoch": 0.088375, + "grad_norm": 4.5, + "grad_norm_var": 5.497638956705729, + "learning_rate": 0.0001, + "loss": 7.2043, + "loss/crossentropy": 2.8375182151794434, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.2565983235836029, + "step": 2828 + }, + { + "epoch": 0.0884375, + "grad_norm": 4.03125, + "grad_norm_var": 5.498542277018229, + "learning_rate": 0.0001, + "loss": 6.836, + "loss/crossentropy": 2.7026480436325073, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.23598749190568924, + "step": 2830 + }, + { + "epoch": 0.0885, + "grad_norm": 5.8125, + "grad_norm_var": 5.547028605143229, + "learning_rate": 0.0001, + "loss": 6.8172, + "loss/crossentropy": 2.4725399017333984, + "loss/hidden": 1.84375, + "loss/jsd": 0.0, + "loss/logits": 0.25009439140558243, + "step": 2832 + }, + { + "epoch": 0.0885625, + "grad_norm": 4.21875, + "grad_norm_var": 0.1890777587890625, + "learning_rate": 0.0001, + "loss": 6.9316, + "loss/crossentropy": 2.7388436794281006, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.24193084985017776, + "step": 2834 + }, + { + "epoch": 0.088625, + "grad_norm": 4.0, + "grad_norm_var": 0.1932281494140625, + "learning_rate": 0.0001, + "loss": 6.8544, + "loss/crossentropy": 2.6529780626296997, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.2424093559384346, + "step": 2836 + }, + { + "epoch": 0.0886875, + "grad_norm": 3.984375, + "grad_norm_var": 0.1971832275390625, + "learning_rate": 0.0001, + "loss": 6.4944, + "loss/crossentropy": 2.533252477645874, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.22384879738092422, + "step": 2838 + }, + { + "epoch": 0.08875, + "grad_norm": 4.375, + "grad_norm_var": 0.19909566243489582, + "learning_rate": 0.0001, + "loss": 6.6716, + "loss/crossentropy": 2.474052309989929, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.23772280663251877, + "step": 2840 + }, + { + "epoch": 0.0888125, + "grad_norm": 4.1875, + "grad_norm_var": 0.20845438639322916, + "learning_rate": 0.0001, + "loss": 6.8851, + "loss/crossentropy": 2.7143443822860718, + "loss/hidden": 1.78515625, + "loss/jsd": 0.0, + "loss/logits": 0.2385583370923996, + "step": 2842 + }, + { + "epoch": 0.088875, + "grad_norm": 4.46875, + "grad_norm_var": 0.20620829264322918, + "learning_rate": 0.0001, + "loss": 7.1547, + "loss/crossentropy": 2.82839298248291, + "loss/hidden": 1.828125, + "loss/jsd": 0.0, + "loss/logits": 0.24982231855392456, + "step": 2844 + }, + { + "epoch": 0.0889375, + "grad_norm": 4.375, + "grad_norm_var": 0.20735575358072916, + "learning_rate": 0.0001, + "loss": 6.9194, + "loss/crossentropy": 2.689344048500061, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.2440999150276184, + "step": 2846 + }, + { + "epoch": 0.089, + "grad_norm": 4.0625, + "grad_norm_var": 0.03247782389322917, + "learning_rate": 0.0001, + "loss": 7.0278, + "loss/crossentropy": 2.8314419984817505, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.2458094358444214, + "step": 2848 + }, + { + "epoch": 0.0890625, + "grad_norm": 4.15625, + "grad_norm_var": 0.04157613118489583, + "learning_rate": 0.0001, + "loss": 6.9939, + "loss/crossentropy": 2.6279104948043823, + "loss/hidden": 1.828125, + "loss/jsd": 0.0, + "loss/logits": 0.25378216803073883, + "step": 2850 + }, + { + "epoch": 0.089125, + "grad_norm": 4.0625, + "grad_norm_var": 0.04472554524739583, + "learning_rate": 0.0001, + "loss": 6.8556, + "loss/crossentropy": 2.684327483177185, + "loss/hidden": 1.76953125, + "loss/jsd": 0.0, + "loss/logits": 0.24017417430877686, + "step": 2852 + }, + { + "epoch": 0.0891875, + "grad_norm": 3.953125, + "grad_norm_var": 0.06446024576822916, + "learning_rate": 0.0001, + "loss": 6.9153, + "loss/crossentropy": 2.611162781715393, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.2511187493801117, + "step": 2854 + }, + { + "epoch": 0.08925, + "grad_norm": 3.828125, + "grad_norm_var": 0.08336588541666666, + "learning_rate": 0.0001, + "loss": 7.3155, + "loss/crossentropy": 2.903210401535034, + "loss/hidden": 1.828125, + "loss/jsd": 0.0, + "loss/logits": 0.25841856747865677, + "step": 2856 + }, + { + "epoch": 0.0893125, + "grad_norm": 4.1875, + "grad_norm_var": 0.07395426432291667, + "learning_rate": 0.0001, + "loss": 6.7204, + "loss/crossentropy": 2.5703028440475464, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.23415035754442215, + "step": 2858 + }, + { + "epoch": 0.089375, + "grad_norm": 3.96875, + "grad_norm_var": 0.07742513020833333, + "learning_rate": 0.0001, + "loss": 6.6673, + "loss/crossentropy": 2.515069603919983, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.2394431233406067, + "step": 2860 + }, + { + "epoch": 0.0894375, + "grad_norm": 3.96875, + "grad_norm_var": 0.07705078125, + "learning_rate": 0.0001, + "loss": 6.7068, + "loss/crossentropy": 2.5977174043655396, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.23278646916151047, + "step": 2862 + }, + { + "epoch": 0.0895, + "grad_norm": 3.84375, + "grad_norm_var": 0.106982421875, + "learning_rate": 0.0001, + "loss": 7.1021, + "loss/crossentropy": 2.7704328298568726, + "loss/hidden": 1.83203125, + "loss/jsd": 0.0, + "loss/logits": 0.24995948374271393, + "step": 2864 + }, + { + "epoch": 0.0895625, + "grad_norm": 4.125, + "grad_norm_var": 0.11728108723958333, + "learning_rate": 0.0001, + "loss": 7.1029, + "loss/crossentropy": 2.7944629192352295, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.25193439424037933, + "step": 2866 + }, + { + "epoch": 0.089625, + "grad_norm": 4.15625, + "grad_norm_var": 0.11506754557291667, + "learning_rate": 0.0001, + "loss": 6.8402, + "loss/crossentropy": 2.615453004837036, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.24434728920459747, + "step": 2868 + }, + { + "epoch": 0.0896875, + "grad_norm": 4.6875, + "grad_norm_var": 0.11043294270833333, + "learning_rate": 0.0001, + "loss": 7.2103, + "loss/crossentropy": 2.9622833728790283, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.2447189837694168, + "step": 2870 + }, + { + "epoch": 0.08975, + "grad_norm": 4.78125, + "grad_norm_var": 0.10915425618489584, + "learning_rate": 0.0001, + "loss": 7.1553, + "loss/crossentropy": 2.7944257259368896, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2548411712050438, + "step": 2872 + }, + { + "epoch": 0.0898125, + "grad_norm": 4.46875, + "grad_norm_var": 0.11044820149739583, + "learning_rate": 0.0001, + "loss": 6.7337, + "loss/crossentropy": 2.5883902311325073, + "loss/hidden": 1.78515625, + "loss/jsd": 0.0, + "loss/logits": 0.23601321130990982, + "step": 2874 + }, + { + "epoch": 0.089875, + "grad_norm": 4.15625, + "grad_norm_var": 0.1197662353515625, + "learning_rate": 0.0001, + "loss": 7.1886, + "loss/crossentropy": 2.669734477996826, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2667340338230133, + "step": 2876 + }, + { + "epoch": 0.0899375, + "grad_norm": 4.59375, + "grad_norm_var": 0.1191070556640625, + "learning_rate": 0.0001, + "loss": 6.9796, + "loss/crossentropy": 2.673854112625122, + "loss/hidden": 1.83984375, + "loss/jsd": 0.0, + "loss/logits": 0.24659424275159836, + "step": 2878 + }, + { + "epoch": 0.09, + "grad_norm": 3.84375, + "grad_norm_var": 0.1076324462890625, + "learning_rate": 0.0001, + "loss": 6.6908, + "loss/crossentropy": 2.657265067100525, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.22796591371297836, + "step": 2880 + }, + { + "epoch": 0.0900625, + "grad_norm": 3.984375, + "grad_norm_var": 0.10998942057291666, + "learning_rate": 0.0001, + "loss": 6.9527, + "loss/crossentropy": 2.5288443565368652, + "loss/hidden": 1.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2587915509939194, + "step": 2882 + }, + { + "epoch": 0.090125, + "grad_norm": 4.375, + "grad_norm_var": 0.10878499348958333, + "learning_rate": 0.0001, + "loss": 6.9951, + "loss/crossentropy": 2.815529942512512, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.23827117681503296, + "step": 2884 + }, + { + "epoch": 0.0901875, + "grad_norm": 4.84375, + "grad_norm_var": 0.11975504557291666, + "learning_rate": 0.0001, + "loss": 6.8181, + "loss/crossentropy": 2.495347023010254, + "loss/hidden": 1.82421875, + "loss/jsd": 0.0, + "loss/logits": 0.24985623359680176, + "step": 2886 + }, + { + "epoch": 0.09025, + "grad_norm": 6.25, + "grad_norm_var": 0.34085286458333336, + "learning_rate": 0.0001, + "loss": 7.1716, + "loss/crossentropy": 2.736644744873047, + "loss/hidden": 1.81640625, + "loss/jsd": 0.0, + "loss/logits": 0.26185525953769684, + "step": 2888 + }, + { + "epoch": 0.0903125, + "grad_norm": 4.46875, + "grad_norm_var": 0.3358683268229167, + "learning_rate": 0.0001, + "loss": 7.1126, + "loss/crossentropy": 2.7053741216659546, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.2606474459171295, + "step": 2890 + }, + { + "epoch": 0.090375, + "grad_norm": 4.28125, + "grad_norm_var": 0.31735026041666664, + "learning_rate": 0.0001, + "loss": 7.067, + "loss/crossentropy": 2.699031949043274, + "loss/hidden": 1.81640625, + "loss/jsd": 0.0, + "loss/logits": 0.25515763461589813, + "step": 2892 + }, + { + "epoch": 0.0904375, + "grad_norm": 5.25, + "grad_norm_var": 0.38806050618489585, + "learning_rate": 0.0001, + "loss": 6.9426, + "loss/crossentropy": 2.710414171218872, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.24353402853012085, + "step": 2894 + }, + { + "epoch": 0.0905, + "grad_norm": 4.84375, + "grad_norm_var": 0.3643951416015625, + "learning_rate": 0.0001, + "loss": 6.9086, + "loss/crossentropy": 2.6190038919448853, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.25004892796278, + "step": 2896 + }, + { + "epoch": 0.0905625, + "grad_norm": 5.46875, + "grad_norm_var": 0.4073079427083333, + "learning_rate": 0.0001, + "loss": 7.2013, + "loss/crossentropy": 2.7578283548355103, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.2591920793056488, + "step": 2898 + }, + { + "epoch": 0.090625, + "grad_norm": 4.25, + "grad_norm_var": 0.4041951497395833, + "learning_rate": 0.0001, + "loss": 6.7156, + "loss/crossentropy": 2.523174524307251, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.2383802831172943, + "step": 2900 + }, + { + "epoch": 0.0906875, + "grad_norm": 4.09375, + "grad_norm_var": 0.3777089436848958, + "learning_rate": 0.0001, + "loss": 6.6932, + "loss/crossentropy": 2.539914608001709, + "loss/hidden": 1.83984375, + "loss/jsd": 0.0, + "loss/logits": 0.23134269565343857, + "step": 2902 + }, + { + "epoch": 0.09075, + "grad_norm": 4.0, + "grad_norm_var": 0.20273335774739584, + "learning_rate": 0.0001, + "loss": 6.8114, + "loss/crossentropy": 2.737942695617676, + "loss/hidden": 1.76171875, + "loss/jsd": 0.0, + "loss/logits": 0.23117107152938843, + "step": 2904 + }, + { + "epoch": 0.0908125, + "grad_norm": 4.21875, + "grad_norm_var": 0.3624827067057292, + "learning_rate": 0.0001, + "loss": 7.1634, + "loss/crossentropy": 2.7246967554092407, + "loss/hidden": 1.83984375, + "loss/jsd": 0.0, + "loss/logits": 0.2598849982023239, + "step": 2906 + }, + { + "epoch": 0.090875, + "grad_norm": 4.625, + "grad_norm_var": 0.39011942545572914, + "learning_rate": 0.0001, + "loss": 7.0006, + "loss/crossentropy": 2.6638818979263306, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.2555497959256172, + "step": 2908 + }, + { + "epoch": 0.0909375, + "grad_norm": 3.859375, + "grad_norm_var": 0.3447011311848958, + "learning_rate": 0.0001, + "loss": 6.7047, + "loss/crossentropy": 2.5874160528182983, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.23985768109560013, + "step": 2910 + }, + { + "epoch": 0.091, + "grad_norm": 4.28125, + "grad_norm_var": 0.3924763997395833, + "learning_rate": 0.0001, + "loss": 6.6062, + "loss/crossentropy": 2.608031749725342, + "loss/hidden": 1.76171875, + "loss/jsd": 0.0, + "loss/logits": 0.2236482873558998, + "step": 2912 + }, + { + "epoch": 0.0910625, + "grad_norm": 4.25, + "grad_norm_var": 0.34807942708333334, + "learning_rate": 0.0001, + "loss": 6.8533, + "loss/crossentropy": 2.6706267595291138, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.24327071011066437, + "step": 2914 + }, + { + "epoch": 0.091125, + "grad_norm": 4.0625, + "grad_norm_var": 0.3446248372395833, + "learning_rate": 0.0001, + "loss": 6.8203, + "loss/crossentropy": 2.655190944671631, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.23877842724323273, + "step": 2916 + }, + { + "epoch": 0.0911875, + "grad_norm": 4.6875, + "grad_norm_var": 0.3470865885416667, + "learning_rate": 0.0001, + "loss": 7.2147, + "loss/crossentropy": 2.751248002052307, + "loss/hidden": 1.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2627522945404053, + "step": 2918 + }, + { + "epoch": 0.09125, + "grad_norm": 3.96875, + "grad_norm_var": 0.34788411458333335, + "learning_rate": 0.0001, + "loss": 6.9592, + "loss/crossentropy": 2.6001957654953003, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.25777673721313477, + "step": 2920 + }, + { + "epoch": 0.0913125, + "grad_norm": 5.28125, + "grad_norm_var": 0.22195638020833333, + "learning_rate": 0.0001, + "loss": 6.786, + "loss/crossentropy": 2.5922285318374634, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.23969252407550812, + "step": 2922 + }, + { + "epoch": 0.091375, + "grad_norm": 4.3125, + "grad_norm_var": 0.14903971354166667, + "learning_rate": 0.0001, + "loss": 7.0038, + "loss/crossentropy": 2.7293628454208374, + "loss/hidden": 1.81640625, + "loss/jsd": 0.0, + "loss/logits": 0.2457985281944275, + "step": 2924 + }, + { + "epoch": 0.0914375, + "grad_norm": 4.125, + "grad_norm_var": 0.15593159993489583, + "learning_rate": 0.0001, + "loss": 6.701, + "loss/crossentropy": 2.60032856464386, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.2342819646000862, + "step": 2926 + }, + { + "epoch": 0.0915, + "grad_norm": 3.703125, + "grad_norm_var": 0.15835673014322918, + "learning_rate": 0.0001, + "loss": 6.4028, + "loss/crossentropy": 2.512592077255249, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.21519330888986588, + "step": 2928 + }, + { + "epoch": 0.0915625, + "grad_norm": 4.34375, + "grad_norm_var": 0.15203348795572916, + "learning_rate": 0.0001, + "loss": 6.8907, + "loss/crossentropy": 2.677201509475708, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.24010415375232697, + "step": 2930 + }, + { + "epoch": 0.091625, + "grad_norm": 4.4375, + "grad_norm_var": 0.15335184733072918, + "learning_rate": 0.0001, + "loss": 7.0395, + "loss/crossentropy": 2.7401055097579956, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.2522094398736954, + "step": 2932 + }, + { + "epoch": 0.0916875, + "grad_norm": 4.21875, + "grad_norm_var": 0.13443094889322918, + "learning_rate": 0.0001, + "loss": 6.786, + "loss/crossentropy": 2.6013104915618896, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.24542643129825592, + "step": 2934 + }, + { + "epoch": 0.09175, + "grad_norm": 4.59375, + "grad_norm_var": 2.154735310872396, + "learning_rate": 0.0001, + "loss": 7.3931, + "loss/crossentropy": 2.691000461578369, + "loss/hidden": 1.921875, + "loss/jsd": 0.0, + "loss/logits": 0.278026819229126, + "step": 2936 + }, + { + "epoch": 0.0918125, + "grad_norm": 4.40625, + "grad_norm_var": 2.124315388997396, + "learning_rate": 0.0001, + "loss": 6.5102, + "loss/crossentropy": 2.412446618080139, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2285284548997879, + "step": 2938 + }, + { + "epoch": 0.091875, + "grad_norm": 4.21875, + "grad_norm_var": 2.149030558268229, + "learning_rate": 0.0001, + "loss": 6.8509, + "loss/crossentropy": 2.6718616485595703, + "loss/hidden": 1.76953125, + "loss/jsd": 0.0, + "loss/logits": 0.24095501005649567, + "step": 2940 + }, + { + "epoch": 0.0919375, + "grad_norm": 4.4375, + "grad_norm_var": 2.1091461181640625, + "learning_rate": 0.0001, + "loss": 7.2243, + "loss/crossentropy": 2.857651472091675, + "loss/hidden": 1.84375, + "loss/jsd": 0.0, + "loss/logits": 0.25229182839393616, + "step": 2942 + }, + { + "epoch": 0.092, + "grad_norm": 3.796875, + "grad_norm_var": 2.1014556884765625, + "learning_rate": 0.0001, + "loss": 6.4888, + "loss/crossentropy": 2.57748019695282, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.21808737516403198, + "step": 2944 + }, + { + "epoch": 0.0920625, + "grad_norm": 3.953125, + "grad_norm_var": 2.124755859375, + "learning_rate": 0.0001, + "loss": 6.6413, + "loss/crossentropy": 2.599161982536316, + "loss/hidden": 1.83984375, + "loss/jsd": 0.0, + "loss/logits": 0.2202320247888565, + "step": 2946 + }, + { + "epoch": 0.092125, + "grad_norm": 4.3125, + "grad_norm_var": 2.14498291015625, + "learning_rate": 0.0001, + "loss": 6.6873, + "loss/crossentropy": 2.5599820613861084, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.23929814249277115, + "step": 2948 + }, + { + "epoch": 0.0921875, + "grad_norm": 4.03125, + "grad_norm_var": 2.14400634765625, + "learning_rate": 0.0001, + "loss": 6.4769, + "loss/crossentropy": 2.4159616231918335, + "loss/hidden": 1.81640625, + "loss/jsd": 0.0, + "loss/logits": 0.22445113956928253, + "step": 2950 + }, + { + "epoch": 0.09225, + "grad_norm": 5.65625, + "grad_norm_var": 0.32291259765625, + "learning_rate": 0.0001, + "loss": 7.0604, + "loss/crossentropy": 2.6208486557006836, + "loss/hidden": 1.85546875, + "loss/jsd": 0.0, + "loss/logits": 0.2584053575992584, + "step": 2952 + }, + { + "epoch": 0.0923125, + "grad_norm": 3.9375, + "grad_norm_var": 0.33408203125, + "learning_rate": 0.0001, + "loss": 6.8853, + "loss/crossentropy": 2.723706603050232, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.24116095155477524, + "step": 2954 + }, + { + "epoch": 0.092375, + "grad_norm": 3.875, + "grad_norm_var": 0.33986002604166665, + "learning_rate": 0.0001, + "loss": 6.808, + "loss/crossentropy": 2.670522689819336, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.23874707520008087, + "step": 2956 + }, + { + "epoch": 0.0924375, + "grad_norm": 4.34375, + "grad_norm_var": 0.62880859375, + "learning_rate": 0.0001, + "loss": 6.943, + "loss/crossentropy": 2.642168879508972, + "loss/hidden": 1.875, + "loss/jsd": 0.0, + "loss/logits": 0.24258745461702347, + "step": 2958 + }, + { + "epoch": 0.0925, + "grad_norm": 4.28125, + "grad_norm_var": 1.5287913004557292, + "learning_rate": 0.0001, + "loss": 6.9016, + "loss/crossentropy": 2.5931923389434814, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.2499857395887375, + "step": 2960 + }, + { + "epoch": 0.0925625, + "grad_norm": 4.46875, + "grad_norm_var": 1.5153483072916667, + "learning_rate": 0.0001, + "loss": 7.1575, + "loss/crossentropy": 2.789669632911682, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2563166320323944, + "step": 2962 + }, + { + "epoch": 0.092625, + "grad_norm": 4.4375, + "grad_norm_var": 1.4841105143229167, + "learning_rate": 0.0001, + "loss": 6.93, + "loss/crossentropy": 2.761322498321533, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.24265322089195251, + "step": 2964 + }, + { + "epoch": 0.0926875, + "grad_norm": 4.375, + "grad_norm_var": 1.4745076497395833, + "learning_rate": 0.0001, + "loss": 6.6479, + "loss/crossentropy": 2.478193521499634, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.2404085099697113, + "step": 2966 + }, + { + "epoch": 0.09275, + "grad_norm": 4.53125, + "grad_norm_var": 1.37994384765625, + "learning_rate": 0.0001, + "loss": 7.2748, + "loss/crossentropy": 2.903393268585205, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.2597960978746414, + "step": 2968 + }, + { + "epoch": 0.0928125, + "grad_norm": 4.75, + "grad_norm_var": 1.3373331705729166, + "learning_rate": 0.0001, + "loss": 7.0387, + "loss/crossentropy": 2.710214138031006, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.24768862128257751, + "step": 2970 + }, + { + "epoch": 0.092875, + "grad_norm": 4.21875, + "grad_norm_var": 1.2830078125, + "learning_rate": 0.0001, + "loss": 6.5448, + "loss/crossentropy": 2.471164584159851, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.23392198234796524, + "step": 2972 + }, + { + "epoch": 0.0929375, + "grad_norm": 4.46875, + "grad_norm_var": 1.1928670247395834, + "learning_rate": 0.0001, + "loss": 7.2907, + "loss/crossentropy": 2.745394468307495, + "loss/hidden": 1.8515625, + "loss/jsd": 0.0, + "loss/logits": 0.26937687397003174, + "step": 2974 + }, + { + "epoch": 0.093, + "grad_norm": 4.3125, + "grad_norm_var": 0.21354166666666666, + "learning_rate": 0.0001, + "loss": 6.8719, + "loss/crossentropy": 2.703699469566345, + "loss/hidden": 1.828125, + "loss/jsd": 0.0, + "loss/logits": 0.23400261253118515, + "step": 2976 + }, + { + "epoch": 0.0930625, + "grad_norm": 4.0, + "grad_norm_var": 0.22515869140625, + "learning_rate": 0.0001, + "loss": 6.9013, + "loss/crossentropy": 2.700608015060425, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.2427232563495636, + "step": 2978 + }, + { + "epoch": 0.093125, + "grad_norm": 3.859375, + "grad_norm_var": 0.24124247233072918, + "learning_rate": 0.0001, + "loss": 6.305, + "loss/crossentropy": 2.276304244995117, + "loss/hidden": 1.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.2192765772342682, + "step": 2980 + }, + { + "epoch": 0.0931875, + "grad_norm": 4.09375, + "grad_norm_var": 0.24733784993489583, + "learning_rate": 0.0001, + "loss": 6.8706, + "loss/crossentropy": 2.6530104875564575, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.2436370849609375, + "step": 2982 + }, + { + "epoch": 0.09325, + "grad_norm": 4.40625, + "grad_norm_var": 0.2544179280598958, + "learning_rate": 0.0001, + "loss": 6.9514, + "loss/crossentropy": 2.6985493898391724, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.24325336515903473, + "step": 2984 + }, + { + "epoch": 0.0933125, + "grad_norm": 3.90625, + "grad_norm_var": 0.26167704264322916, + "learning_rate": 0.0001, + "loss": 6.5768, + "loss/crossentropy": 2.4524112939834595, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.2331458330154419, + "step": 2986 + }, + { + "epoch": 0.093375, + "grad_norm": 4.4375, + "grad_norm_var": 0.2630523681640625, + "learning_rate": 0.0001, + "loss": 6.5181, + "loss/crossentropy": 2.4587045907974243, + "loss/hidden": 1.83203125, + "loss/jsd": 0.0, + "loss/logits": 0.22274015843868256, + "step": 2988 + }, + { + "epoch": 0.0934375, + "grad_norm": 4.5625, + "grad_norm_var": 0.0508941650390625, + "learning_rate": 0.0001, + "loss": 6.4799, + "loss/crossentropy": 2.4741140604019165, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.2240130975842476, + "step": 2990 + }, + { + "epoch": 0.0935, + "grad_norm": 3.734375, + "grad_norm_var": 0.06552327473958333, + "learning_rate": 0.0001, + "loss": 6.5749, + "loss/crossentropy": 2.5307430028915405, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.2309805527329445, + "step": 2992 + }, + { + "epoch": 0.0935625, + "grad_norm": 4.0, + "grad_norm_var": 0.06594645182291667, + "learning_rate": 0.0001, + "loss": 6.7368, + "loss/crossentropy": 2.6206862926483154, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.23621946573257446, + "step": 2994 + }, + { + "epoch": 0.093625, + "grad_norm": 3.84375, + "grad_norm_var": 0.05030008951822917, + "learning_rate": 0.0001, + "loss": 6.4821, + "loss/crossentropy": 2.4800742864608765, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.2244247943162918, + "step": 2996 + }, + { + "epoch": 0.0936875, + "grad_norm": 4.0625, + "grad_norm_var": 0.05836181640625, + "learning_rate": 0.0001, + "loss": 6.9033, + "loss/crossentropy": 2.7235101461410522, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.24532272666692734, + "step": 2998 + }, + { + "epoch": 0.09375, + "grad_norm": 4.21875, + "grad_norm_var": 0.289404296875, + "learning_rate": 0.0001, + "loss": 7.3286, + "loss/crossentropy": 2.658680558204651, + "loss/hidden": 1.91015625, + "loss/jsd": 0.0, + "loss/logits": 0.27597957849502563, + "step": 3000 + }, + { + "epoch": 0.0938125, + "grad_norm": 3.90625, + "grad_norm_var": 0.29117431640625, + "learning_rate": 0.0001, + "loss": 6.7126, + "loss/crossentropy": 2.571900725364685, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.23906809091567993, + "step": 3002 + }, + { + "epoch": 0.093875, + "grad_norm": 3.984375, + "grad_norm_var": 0.2908681233723958, + "learning_rate": 0.0001, + "loss": 6.8183, + "loss/crossentropy": 2.616904139518738, + "loss/hidden": 1.78515625, + "loss/jsd": 0.0, + "loss/logits": 0.24162866920232773, + "step": 3004 + }, + { + "epoch": 0.0939375, + "grad_norm": 4.09375, + "grad_norm_var": 0.28135477701822914, + "learning_rate": 0.0001, + "loss": 6.5828, + "loss/crossentropy": 2.5343810319900513, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.2294553965330124, + "step": 3006 + }, + { + "epoch": 0.094, + "grad_norm": 4.21875, + "grad_norm_var": 0.27568359375, + "learning_rate": 0.0001, + "loss": 6.8372, + "loss/crossentropy": 2.6878076791763306, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.2391548901796341, + "step": 3008 + }, + { + "epoch": 0.0940625, + "grad_norm": 3.890625, + "grad_norm_var": 0.28186442057291666, + "learning_rate": 0.0001, + "loss": 6.8996, + "loss/crossentropy": 2.6858986616134644, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.2455882728099823, + "step": 3010 + }, + { + "epoch": 0.094125, + "grad_norm": 4.375, + "grad_norm_var": 0.2776763916015625, + "learning_rate": 0.0001, + "loss": 6.9203, + "loss/crossentropy": 2.6672648191452026, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.24874596297740936, + "step": 3012 + }, + { + "epoch": 0.0941875, + "grad_norm": 4.15625, + "grad_norm_var": 0.25683186848958334, + "learning_rate": 0.0001, + "loss": 6.8733, + "loss/crossentropy": 2.6610859632492065, + "loss/hidden": 1.78515625, + "loss/jsd": 0.0, + "loss/logits": 0.2427053600549698, + "step": 3014 + }, + { + "epoch": 0.09425, + "grad_norm": 4.625, + "grad_norm_var": 0.04869791666666667, + "learning_rate": 0.0001, + "loss": 6.8369, + "loss/crossentropy": 2.577809453010559, + "loss/hidden": 1.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.23919511586427689, + "step": 3016 + }, + { + "epoch": 0.0943125, + "grad_norm": 6.75, + "grad_norm_var": 0.531884765625, + "learning_rate": 0.0001, + "loss": 7.1416, + "loss/crossentropy": 2.660222053527832, + "loss/hidden": 1.8828125, + "loss/jsd": 0.0, + "loss/logits": 0.2598596066236496, + "step": 3018 + }, + { + "epoch": 0.094375, + "grad_norm": 5.03125, + "grad_norm_var": 0.5339182535807292, + "learning_rate": 0.0001, + "loss": 6.722, + "loss/crossentropy": 2.464984178543091, + "loss/hidden": 1.8671875, + "loss/jsd": 0.0, + "loss/logits": 0.23898254334926605, + "step": 3020 + }, + { + "epoch": 0.0944375, + "grad_norm": 4.46875, + "grad_norm_var": 0.5307607014973958, + "learning_rate": 0.0001, + "loss": 6.9155, + "loss/crossentropy": 2.7366366386413574, + "loss/hidden": 1.78515625, + "loss/jsd": 0.0, + "loss/logits": 0.23937073349952698, + "step": 3022 + }, + { + "epoch": 0.0945, + "grad_norm": 4.21875, + "grad_norm_var": 0.5398590087890625, + "learning_rate": 0.0001, + "loss": 7.1708, + "loss/crossentropy": 2.825117588043213, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.2568327337503433, + "step": 3024 + }, + { + "epoch": 0.0945625, + "grad_norm": 3.9375, + "grad_norm_var": 0.55103759765625, + "learning_rate": 0.0001, + "loss": 6.7961, + "loss/crossentropy": 2.6475404500961304, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.23751483112573624, + "step": 3026 + }, + { + "epoch": 0.094625, + "grad_norm": 4.1875, + "grad_norm_var": 0.5415191650390625, + "learning_rate": 0.0001, + "loss": 6.4474, + "loss/crossentropy": 2.3609050512313843, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.2356044054031372, + "step": 3028 + }, + { + "epoch": 0.0946875, + "grad_norm": 3.921875, + "grad_norm_var": 0.5520345052083333, + "learning_rate": 0.0001, + "loss": 6.6852, + "loss/crossentropy": 2.574931025505066, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.23563718795776367, + "step": 3030 + }, + { + "epoch": 0.09475, + "grad_norm": 5.5, + "grad_norm_var": 0.6260050455729167, + "learning_rate": 0.0001, + "loss": 6.8219, + "loss/crossentropy": 2.615893244743347, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.24130163341760635, + "step": 3032 + }, + { + "epoch": 0.0948125, + "grad_norm": 4.03125, + "grad_norm_var": 0.18860270182291666, + "learning_rate": 0.0001, + "loss": 6.5735, + "loss/crossentropy": 2.4803576469421387, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.23431752622127533, + "step": 3034 + }, + { + "epoch": 0.094875, + "grad_norm": 3.890625, + "grad_norm_var": 0.15572001139322916, + "learning_rate": 0.0001, + "loss": 6.739, + "loss/crossentropy": 2.682898163795471, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.23217643052339554, + "step": 3036 + }, + { + "epoch": 0.0949375, + "grad_norm": 4.59375, + "grad_norm_var": 0.16021219889322916, + "learning_rate": 0.0001, + "loss": 6.7686, + "loss/crossentropy": 2.6070168018341064, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.2411540225148201, + "step": 3038 + }, + { + "epoch": 0.095, + "grad_norm": 4.28125, + "grad_norm_var": 0.1994781494140625, + "learning_rate": 0.0001, + "loss": 7.0366, + "loss/crossentropy": 2.666748523712158, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.25495024025440216, + "step": 3040 + }, + { + "epoch": 0.0950625, + "grad_norm": 4.21875, + "grad_norm_var": 0.1870269775390625, + "learning_rate": 0.0001, + "loss": 6.8307, + "loss/crossentropy": 2.6568331718444824, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.24082067608833313, + "step": 3042 + }, + { + "epoch": 0.095125, + "grad_norm": 3.9375, + "grad_norm_var": 0.1998046875, + "learning_rate": 0.0001, + "loss": 6.7003, + "loss/crossentropy": 2.595793128013611, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.2358388602733612, + "step": 3044 + }, + { + "epoch": 0.0951875, + "grad_norm": 4.6875, + "grad_norm_var": 0.20746968587239584, + "learning_rate": 0.0001, + "loss": 6.4576, + "loss/crossentropy": 2.3486874103546143, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.23745301365852356, + "step": 3046 + }, + { + "epoch": 0.09525, + "grad_norm": 4.0625, + "grad_norm_var": 0.1053131103515625, + "learning_rate": 0.0001, + "loss": 6.8214, + "loss/crossentropy": 2.672357678413391, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.23716489970684052, + "step": 3048 + }, + { + "epoch": 0.0953125, + "grad_norm": 3.859375, + "grad_norm_var": 0.115087890625, + "learning_rate": 0.0001, + "loss": 6.7248, + "loss/crossentropy": 2.5690304040908813, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.23588933050632477, + "step": 3050 + }, + { + "epoch": 0.095375, + "grad_norm": 4.46875, + "grad_norm_var": 0.11972249348958333, + "learning_rate": 0.0001, + "loss": 6.8619, + "loss/crossentropy": 2.6711432933807373, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.2390008568763733, + "step": 3052 + }, + { + "epoch": 0.0954375, + "grad_norm": 4.09375, + "grad_norm_var": 0.10676676432291667, + "learning_rate": 0.0001, + "loss": 6.5898, + "loss/crossentropy": 2.5515469312667847, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.2288241982460022, + "step": 3054 + }, + { + "epoch": 0.0955, + "grad_norm": 4.3125, + "grad_norm_var": 0.0671539306640625, + "learning_rate": 0.0001, + "loss": 6.6707, + "loss/crossentropy": 2.6111743450164795, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.2317328155040741, + "step": 3056 + }, + { + "epoch": 0.0955625, + "grad_norm": 4.0625, + "grad_norm_var": 0.22987874348958334, + "learning_rate": 0.0001, + "loss": 7.0313, + "loss/crossentropy": 2.750077962875366, + "loss/hidden": 1.82421875, + "loss/jsd": 0.0, + "loss/logits": 0.24569693207740784, + "step": 3058 + }, + { + "epoch": 0.095625, + "grad_norm": 4.1875, + "grad_norm_var": 0.2235015869140625, + "learning_rate": 0.0001, + "loss": 6.6612, + "loss/crossentropy": 2.4988226890563965, + "loss/hidden": 1.81640625, + "loss/jsd": 0.0, + "loss/logits": 0.23459364473819733, + "step": 3060 + }, + { + "epoch": 0.0956875, + "grad_norm": 4.875, + "grad_norm_var": 0.23723042805989583, + "learning_rate": 0.0001, + "loss": 6.9914, + "loss/crossentropy": 2.6185423135757446, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.25251929461956024, + "step": 3062 + }, + { + "epoch": 0.09575, + "grad_norm": 5.65625, + "grad_norm_var": 0.35871480305989584, + "learning_rate": 0.0001, + "loss": 6.6228, + "loss/crossentropy": 2.582352042198181, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.2243526726961136, + "step": 3064 + }, + { + "epoch": 0.0958125, + "grad_norm": 4.03125, + "grad_norm_var": 0.33693033854166665, + "learning_rate": 0.0001, + "loss": 6.735, + "loss/crossentropy": 2.521291971206665, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.24168409407138824, + "step": 3066 + }, + { + "epoch": 0.095875, + "grad_norm": 4.15625, + "grad_norm_var": 0.3283762613932292, + "learning_rate": 0.0001, + "loss": 6.9965, + "loss/crossentropy": 2.7500585317611694, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.24456849694252014, + "step": 3068 + }, + { + "epoch": 0.0959375, + "grad_norm": 4.03125, + "grad_norm_var": 0.3553538004557292, + "learning_rate": 0.0001, + "loss": 7.0783, + "loss/crossentropy": 2.7938302755355835, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.24797877669334412, + "step": 3070 + }, + { + "epoch": 0.096, + "grad_norm": 3.4375, + "grad_norm_var": 0.37916666666666665, + "learning_rate": 0.0001, + "loss": 6.7114, + "loss/crossentropy": 2.6448922157287598, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.22930297255516052, + "step": 3072 + }, + { + "epoch": 0.0960625, + "grad_norm": 3.890625, + "grad_norm_var": 0.2966217041015625, + "learning_rate": 0.0001, + "loss": 6.7515, + "loss/crossentropy": 2.675472378730774, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.23376993834972382, + "step": 3074 + }, + { + "epoch": 0.096125, + "grad_norm": 4.125, + "grad_norm_var": 0.30192769368489586, + "learning_rate": 0.0001, + "loss": 6.5454, + "loss/crossentropy": 2.5147024393081665, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.22806604951620102, + "step": 3076 + }, + { + "epoch": 0.0961875, + "grad_norm": 4.0, + "grad_norm_var": 0.2820709228515625, + "learning_rate": 0.0001, + "loss": 6.6275, + "loss/crossentropy": 2.5008946657180786, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.23336394876241684, + "step": 3078 + }, + { + "epoch": 0.09625, + "grad_norm": 4.15625, + "grad_norm_var": 0.1670806884765625, + "learning_rate": 0.0001, + "loss": 6.8559, + "loss/crossentropy": 2.682113766670227, + "loss/hidden": 1.78515625, + "loss/jsd": 0.0, + "loss/logits": 0.23885856568813324, + "step": 3080 + }, + { + "epoch": 0.0963125, + "grad_norm": 4.21875, + "grad_norm_var": 0.1740631103515625, + "learning_rate": 0.0001, + "loss": 6.5302, + "loss/crossentropy": 2.4620113372802734, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.22791030257940292, + "step": 3082 + }, + { + "epoch": 0.096375, + "grad_norm": 4.375, + "grad_norm_var": 0.1577301025390625, + "learning_rate": 0.0001, + "loss": 7.1082, + "loss/crossentropy": 2.7661988735198975, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.2533379793167114, + "step": 3084 + }, + { + "epoch": 0.0964375, + "grad_norm": 3.828125, + "grad_norm_var": 0.09364827473958333, + "learning_rate": 0.0001, + "loss": 6.6862, + "loss/crossentropy": 2.629671812057495, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.23182903230190277, + "step": 3086 + }, + { + "epoch": 0.0965, + "grad_norm": 4.40625, + "grad_norm_var": 0.057633463541666666, + "learning_rate": 0.0001, + "loss": 6.8724, + "loss/crossentropy": 2.6740200519561768, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.24522805958986282, + "step": 3088 + }, + { + "epoch": 0.0965625, + "grad_norm": 4.0625, + "grad_norm_var": 0.0516510009765625, + "learning_rate": 0.0001, + "loss": 6.7875, + "loss/crossentropy": 2.6869399547576904, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.23505789041519165, + "step": 3090 + }, + { + "epoch": 0.096625, + "grad_norm": 4.25, + "grad_norm_var": 0.05257059733072917, + "learning_rate": 0.0001, + "loss": 6.9584, + "loss/crossentropy": 2.700324773788452, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.24651554971933365, + "step": 3092 + }, + { + "epoch": 0.0966875, + "grad_norm": 3.84375, + "grad_norm_var": 0.06670633951822917, + "learning_rate": 0.0001, + "loss": 6.6047, + "loss/crossentropy": 2.5858383178710938, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.2296229526400566, + "step": 3094 + }, + { + "epoch": 0.09675, + "grad_norm": 4.375, + "grad_norm_var": 0.0513092041015625, + "learning_rate": 0.0001, + "loss": 6.897, + "loss/crossentropy": 2.5975048542022705, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.25104597210884094, + "step": 3096 + }, + { + "epoch": 0.0968125, + "grad_norm": 3.6875, + "grad_norm_var": 0.06236063639322917, + "learning_rate": 0.0001, + "loss": 6.4617, + "loss/crossentropy": 2.5920755863189697, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.21313737332820892, + "step": 3098 + }, + { + "epoch": 0.096875, + "grad_norm": 4.0, + "grad_norm_var": 0.0770660400390625, + "learning_rate": 0.0001, + "loss": 6.7644, + "loss/crossentropy": 2.6767162084579468, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.23689288645982742, + "step": 3100 + }, + { + "epoch": 0.0969375, + "grad_norm": 4.0625, + "grad_norm_var": 0.07552083333333333, + "learning_rate": 0.0001, + "loss": 6.9666, + "loss/crossentropy": 2.70647394657135, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.24476733803749084, + "step": 3102 + }, + { + "epoch": 0.097, + "grad_norm": 7.8125, + "grad_norm_var": 0.961328125, + "learning_rate": 0.0001, + "loss": 7.8022, + "loss/crossentropy": 3.112025260925293, + "loss/hidden": 1.88671875, + "loss/jsd": 0.0, + "loss/logits": 0.28034333884716034, + "step": 3104 + }, + { + "epoch": 0.0970625, + "grad_norm": 4.375, + "grad_norm_var": 0.9477864583333333, + "learning_rate": 0.0001, + "loss": 6.7555, + "loss/crossentropy": 2.5812584161758423, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.24008256942033768, + "step": 3106 + }, + { + "epoch": 0.097125, + "grad_norm": 5.25, + "grad_norm_var": 0.9876139322916667, + "learning_rate": 0.0001, + "loss": 6.916, + "loss/crossentropy": 2.659627318382263, + "loss/hidden": 1.82421875, + "loss/jsd": 0.0, + "loss/logits": 0.24321314692497253, + "step": 3108 + }, + { + "epoch": 0.0971875, + "grad_norm": 4.03125, + "grad_norm_var": 0.9808553059895834, + "learning_rate": 0.0001, + "loss": 6.8265, + "loss/crossentropy": 2.709407329559326, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.23201733827590942, + "step": 3110 + }, + { + "epoch": 0.09725, + "grad_norm": 4.4375, + "grad_norm_var": 1.1321614583333333, + "learning_rate": 0.0001, + "loss": 7.1443, + "loss/crossentropy": 2.7275675535202026, + "loss/hidden": 1.875, + "loss/jsd": 0.0, + "loss/logits": 0.2541683614253998, + "step": 3112 + }, + { + "epoch": 0.0973125, + "grad_norm": 5.125, + "grad_norm_var": 1.0579386393229167, + "learning_rate": 0.0001, + "loss": 7.3439, + "loss/crossentropy": 2.883936047554016, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.26474176347255707, + "step": 3114 + }, + { + "epoch": 0.097375, + "grad_norm": 4.09375, + "grad_norm_var": 0.9590494791666667, + "learning_rate": 0.0001, + "loss": 6.8974, + "loss/crossentropy": 2.684443473815918, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.24473382532596588, + "step": 3116 + }, + { + "epoch": 0.0974375, + "grad_norm": 6.53125, + "grad_norm_var": 1.1113240559895834, + "learning_rate": 0.0001, + "loss": 6.6607, + "loss/crossentropy": 2.57452130317688, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.22854435443878174, + "step": 3118 + }, + { + "epoch": 0.0975, + "grad_norm": 4.46875, + "grad_norm_var": 0.5167805989583333, + "learning_rate": 0.0001, + "loss": 6.5166, + "loss/crossentropy": 2.3872928619384766, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.2332434430718422, + "step": 3120 + }, + { + "epoch": 0.0975625, + "grad_norm": 4.4375, + "grad_norm_var": 0.5370402018229167, + "learning_rate": 0.0001, + "loss": 7.0696, + "loss/crossentropy": 2.7737441062927246, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.253020279109478, + "step": 3122 + }, + { + "epoch": 0.097625, + "grad_norm": 3.9375, + "grad_norm_var": 0.5593170166015625, + "learning_rate": 0.0001, + "loss": 6.8688, + "loss/crossentropy": 2.7830101251602173, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.23279358446598053, + "step": 3124 + }, + { + "epoch": 0.0976875, + "grad_norm": 5.34375, + "grad_norm_var": 0.5707265218098958, + "learning_rate": 0.0001, + "loss": 7.207, + "loss/crossentropy": 2.699463367462158, + "loss/hidden": 1.859375, + "loss/jsd": 0.0, + "loss/logits": 0.26481975615024567, + "step": 3126 + }, + { + "epoch": 0.09775, + "grad_norm": 4.78125, + "grad_norm_var": 0.4343658447265625, + "learning_rate": 0.0001, + "loss": 7.1911, + "loss/crossentropy": 2.7707037925720215, + "loss/hidden": 1.84375, + "loss/jsd": 0.0, + "loss/logits": 0.25765983760356903, + "step": 3128 + }, + { + "epoch": 0.0978125, + "grad_norm": 4.28125, + "grad_norm_var": 0.4162831624348958, + "learning_rate": 0.0001, + "loss": 7.0955, + "loss/crossentropy": 2.7431172132492065, + "loss/hidden": 1.81640625, + "loss/jsd": 0.0, + "loss/logits": 0.253596693277359, + "step": 3130 + }, + { + "epoch": 0.097875, + "grad_norm": 3.96875, + "grad_norm_var": 0.5329661051432292, + "learning_rate": 0.0001, + "loss": 6.8752, + "loss/crossentropy": 2.6066911220550537, + "loss/hidden": 1.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.24325723201036453, + "step": 3132 + }, + { + "epoch": 0.0979375, + "grad_norm": 4.53125, + "grad_norm_var": 0.2732086181640625, + "learning_rate": 0.0001, + "loss": 6.5346, + "loss/crossentropy": 2.51951003074646, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.2269037440419197, + "step": 3134 + }, + { + "epoch": 0.098, + "grad_norm": 4.25, + "grad_norm_var": 0.2782379150390625, + "learning_rate": 0.0001, + "loss": 6.609, + "loss/crossentropy": 2.4421186447143555, + "loss/hidden": 1.81640625, + "loss/jsd": 0.0, + "loss/logits": 0.23504481464624405, + "step": 3136 + }, + { + "epoch": 0.0980625, + "grad_norm": 4.3125, + "grad_norm_var": 0.29846089680989585, + "learning_rate": 0.0001, + "loss": 6.7649, + "loss/crossentropy": 2.7085916996002197, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.2302408069372177, + "step": 3138 + }, + { + "epoch": 0.098125, + "grad_norm": 4.78125, + "grad_norm_var": 0.35513407389322915, + "learning_rate": 0.0001, + "loss": 6.7473, + "loss/crossentropy": 2.6201740503311157, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.2330276444554329, + "step": 3140 + }, + { + "epoch": 0.0981875, + "grad_norm": 4.1875, + "grad_norm_var": 0.2961578369140625, + "learning_rate": 0.0001, + "loss": 6.9348, + "loss/crossentropy": 2.6920173168182373, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.24536843597888947, + "step": 3142 + }, + { + "epoch": 0.09825, + "grad_norm": 3.75, + "grad_norm_var": 0.3045074462890625, + "learning_rate": 0.0001, + "loss": 6.7441, + "loss/crossentropy": 2.5798075199127197, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.24338270723819733, + "step": 3144 + }, + { + "epoch": 0.0983125, + "grad_norm": 4.15625, + "grad_norm_var": 0.3104329427083333, + "learning_rate": 0.0001, + "loss": 6.8746, + "loss/crossentropy": 2.736800789833069, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.239170603454113, + "step": 3146 + }, + { + "epoch": 0.098375, + "grad_norm": 4.53125, + "grad_norm_var": 0.1495758056640625, + "learning_rate": 0.0001, + "loss": 6.8822, + "loss/crossentropy": 2.7565410137176514, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.23443719744682312, + "step": 3148 + }, + { + "epoch": 0.0984375, + "grad_norm": 4.09375, + "grad_norm_var": 0.13869527180989583, + "learning_rate": 0.0001, + "loss": 6.7574, + "loss/crossentropy": 2.711781859397888, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.23190922290086746, + "step": 3150 + }, + { + "epoch": 0.0985, + "grad_norm": 4.21875, + "grad_norm_var": 0.11704813639322917, + "learning_rate": 0.0001, + "loss": 6.5084, + "loss/crossentropy": 2.5728635787963867, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.22167624533176422, + "step": 3152 + }, + { + "epoch": 0.0985625, + "grad_norm": 3.828125, + "grad_norm_var": 0.11435139973958333, + "learning_rate": 0.0001, + "loss": 6.4095, + "loss/crossentropy": 2.422892928123474, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.22132034599781036, + "step": 3154 + }, + { + "epoch": 0.098625, + "grad_norm": 4.25, + "grad_norm_var": 0.04759012858072917, + "learning_rate": 0.0001, + "loss": 6.8278, + "loss/crossentropy": 2.6931371688842773, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.23808039724826813, + "step": 3156 + }, + { + "epoch": 0.0986875, + "grad_norm": 3.9375, + "grad_norm_var": 0.04902242024739583, + "learning_rate": 0.0001, + "loss": 6.7769, + "loss/crossentropy": 2.723364233970642, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.2311325967311859, + "step": 3158 + }, + { + "epoch": 0.09875, + "grad_norm": 3.578125, + "grad_norm_var": 0.05366923014322917, + "learning_rate": 0.0001, + "loss": 6.0851, + "loss/crossentropy": 2.2519463300704956, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.21300450712442398, + "step": 3160 + }, + { + "epoch": 0.0988125, + "grad_norm": 4.09375, + "grad_norm_var": 0.05276285807291667, + "learning_rate": 0.0001, + "loss": 6.7297, + "loss/crossentropy": 2.5974432229995728, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.23119933903217316, + "step": 3162 + }, + { + "epoch": 0.098875, + "grad_norm": 4.21875, + "grad_norm_var": 0.0400054931640625, + "learning_rate": 0.0001, + "loss": 7.0831, + "loss/crossentropy": 2.8298484086990356, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.24407267570495605, + "step": 3164 + }, + { + "epoch": 0.0989375, + "grad_norm": 4.21875, + "grad_norm_var": 0.0450103759765625, + "learning_rate": 0.0001, + "loss": 6.7796, + "loss/crossentropy": 2.7380000352859497, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.22993908822536469, + "step": 3166 + }, + { + "epoch": 0.099, + "grad_norm": 4.5625, + "grad_norm_var": 0.0597808837890625, + "learning_rate": 0.0001, + "loss": 6.7739, + "loss/crossentropy": 2.6205304861068726, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.2395516186952591, + "step": 3168 + }, + { + "epoch": 0.0990625, + "grad_norm": 3.796875, + "grad_norm_var": 0.0579254150390625, + "learning_rate": 0.0001, + "loss": 7.0631, + "loss/crossentropy": 2.8302695751190186, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.24398455023765564, + "step": 3170 + }, + { + "epoch": 0.099125, + "grad_norm": 5.0625, + "grad_norm_var": 0.11843973795572917, + "learning_rate": 0.0001, + "loss": 7.3957, + "loss/crossentropy": 2.925995945930481, + "loss/hidden": 1.890625, + "loss/jsd": 0.0, + "loss/logits": 0.25790391862392426, + "step": 3172 + }, + { + "epoch": 0.0991875, + "grad_norm": 4.1875, + "grad_norm_var": 0.11941630045572917, + "learning_rate": 0.0001, + "loss": 6.684, + "loss/crossentropy": 2.4603766202926636, + "loss/hidden": 1.83203125, + "loss/jsd": 0.0, + "loss/logits": 0.23915697634220123, + "step": 3174 + }, + { + "epoch": 0.09925, + "grad_norm": 3.828125, + "grad_norm_var": 0.08981119791666667, + "learning_rate": 0.0001, + "loss": 6.8299, + "loss/crossentropy": 2.674596667289734, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.24092095345258713, + "step": 3176 + }, + { + "epoch": 0.0993125, + "grad_norm": 3.859375, + "grad_norm_var": 0.1056549072265625, + "learning_rate": 0.0001, + "loss": 6.3702, + "loss/crossentropy": 2.389748215675354, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.22499437630176544, + "step": 3178 + }, + { + "epoch": 0.099375, + "grad_norm": 4.25, + "grad_norm_var": 0.3156077067057292, + "learning_rate": 0.0001, + "loss": 6.9551, + "loss/crossentropy": 2.7173407077789307, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.24916420876979828, + "step": 3180 + }, + { + "epoch": 0.0994375, + "grad_norm": 8.625, + "grad_norm_var": 1.487555948893229, + "learning_rate": 0.0001, + "loss": 6.6407, + "loss/crossentropy": 2.406462073326111, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.24568749964237213, + "step": 3182 + }, + { + "epoch": 0.0995, + "grad_norm": 4.46875, + "grad_norm_var": 1.4655100504557292, + "learning_rate": 0.0001, + "loss": 6.8779, + "loss/crossentropy": 2.5153772830963135, + "loss/hidden": 1.86328125, + "loss/jsd": 0.0, + "loss/logits": 0.24992363154888153, + "step": 3184 + }, + { + "epoch": 0.0995625, + "grad_norm": 4.8125, + "grad_norm_var": 1.4021769205729167, + "learning_rate": 0.0001, + "loss": 6.5114, + "loss/crossentropy": 2.387337803840637, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.23193595558404922, + "step": 3186 + }, + { + "epoch": 0.099625, + "grad_norm": 4.0625, + "grad_norm_var": 1.4149373372395833, + "learning_rate": 0.0001, + "loss": 6.844, + "loss/crossentropy": 2.6651508808135986, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.23741832375526428, + "step": 3188 + }, + { + "epoch": 0.0996875, + "grad_norm": 3.734375, + "grad_norm_var": 1.458503214518229, + "learning_rate": 0.0001, + "loss": 6.4834, + "loss/crossentropy": 2.3933539390563965, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.23244161903858185, + "step": 3190 + }, + { + "epoch": 0.09975, + "grad_norm": 4.96875, + "grad_norm_var": 1.44927978515625, + "learning_rate": 0.0001, + "loss": 7.1836, + "loss/crossentropy": 2.730854630470276, + "loss/hidden": 1.89453125, + "loss/jsd": 0.0, + "loss/logits": 0.25581687688827515, + "step": 3192 + }, + { + "epoch": 0.0998125, + "grad_norm": 4.28125, + "grad_norm_var": 1.3643137613932292, + "learning_rate": 0.0001, + "loss": 6.8291, + "loss/crossentropy": 2.6246731281280518, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.24309448152780533, + "step": 3194 + }, + { + "epoch": 0.099875, + "grad_norm": 4.21875, + "grad_norm_var": 1.2745676676432292, + "learning_rate": 0.0001, + "loss": 7.1324, + "loss/crossentropy": 2.785401940345764, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.2550133019685745, + "step": 3196 + }, + { + "epoch": 0.0999375, + "grad_norm": 4.75, + "grad_norm_var": 0.16503804524739582, + "learning_rate": 0.0001, + "loss": 6.8579, + "loss/crossentropy": 2.624993920326233, + "loss/hidden": 1.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.23969610035419464, + "step": 3198 + }, + { + "epoch": 0.1, + "grad_norm": 4.4375, + "grad_norm_var": 0.16879781087239584, + "learning_rate": 0.0001, + "loss": 6.7553, + "loss/crossentropy": 2.605829358100891, + "loss/hidden": 1.76953125, + "loss/jsd": 0.0, + "loss/logits": 0.23798971623182297, + "step": 3200 + }, + { + "epoch": 0.1000625, + "grad_norm": 3.703125, + "grad_norm_var": 0.18420817057291666, + "learning_rate": 0.0001, + "loss": 6.8028, + "loss/crossentropy": 2.650471568107605, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.23710602521896362, + "step": 3202 + }, + { + "epoch": 0.100125, + "grad_norm": 3.6875, + "grad_norm_var": 0.21162109375, + "learning_rate": 0.0001, + "loss": 6.7042, + "loss/crossentropy": 2.5433319807052612, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.23484085500240326, + "step": 3204 + }, + { + "epoch": 0.1001875, + "grad_norm": 3.75, + "grad_norm_var": 0.22713114420572916, + "learning_rate": 0.0001, + "loss": 6.457, + "loss/crossentropy": 2.5292232036590576, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.22207817435264587, + "step": 3206 + }, + { + "epoch": 0.10025, + "grad_norm": 3.734375, + "grad_norm_var": 0.09065755208333333, + "learning_rate": 0.0001, + "loss": 6.7258, + "loss/crossentropy": 2.6866955757141113, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.23047468811273575, + "step": 3208 + }, + { + "epoch": 0.1003125, + "grad_norm": 4.1875, + "grad_norm_var": 0.08564046223958334, + "learning_rate": 0.0001, + "loss": 6.8028, + "loss/crossentropy": 2.612138509750366, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.24016397446393967, + "step": 3210 + }, + { + "epoch": 0.100375, + "grad_norm": 3.90625, + "grad_norm_var": 0.087255859375, + "learning_rate": 0.0001, + "loss": 6.9648, + "loss/crossentropy": 2.7992119789123535, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.23726076632738113, + "step": 3212 + }, + { + "epoch": 0.1004375, + "grad_norm": 4.3125, + "grad_norm_var": 0.07073567708333334, + "learning_rate": 0.0001, + "loss": 6.7238, + "loss/crossentropy": 2.5705056190490723, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.2379813715815544, + "step": 3214 + }, + { + "epoch": 0.1005, + "grad_norm": 4.4375, + "grad_norm_var": 0.07086181640625, + "learning_rate": 0.0001, + "loss": 6.9679, + "loss/crossentropy": 2.720636248588562, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.24972644448280334, + "step": 3216 + }, + { + "epoch": 0.1005625, + "grad_norm": 4.4375, + "grad_norm_var": 0.07771708170572916, + "learning_rate": 0.0001, + "loss": 6.9827, + "loss/crossentropy": 2.7298405170440674, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.24754898250102997, + "step": 3218 + }, + { + "epoch": 0.100625, + "grad_norm": 4.71875, + "grad_norm_var": 0.0835601806640625, + "learning_rate": 0.0001, + "loss": 6.7132, + "loss/crossentropy": 2.5458946228027344, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.2393861562013626, + "step": 3220 + }, + { + "epoch": 0.1006875, + "grad_norm": 4.46875, + "grad_norm_var": 0.10512593587239584, + "learning_rate": 0.0001, + "loss": 7.0824, + "loss/crossentropy": 2.813864588737488, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.2487269639968872, + "step": 3222 + }, + { + "epoch": 0.10075, + "grad_norm": 4.375, + "grad_norm_var": 0.08682352701822917, + "learning_rate": 0.0001, + "loss": 6.5424, + "loss/crossentropy": 2.4539612531661987, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.23228086531162262, + "step": 3224 + }, + { + "epoch": 0.1008125, + "grad_norm": 4.0, + "grad_norm_var": 0.10008138020833333, + "learning_rate": 0.0001, + "loss": 6.5922, + "loss/crossentropy": 2.486897349357605, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.2347516268491745, + "step": 3226 + }, + { + "epoch": 0.100875, + "grad_norm": 4.375, + "grad_norm_var": 0.11945699055989584, + "learning_rate": 0.0001, + "loss": 6.6798, + "loss/crossentropy": 2.6234965324401855, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.22828619182109833, + "step": 3228 + }, + { + "epoch": 0.1009375, + "grad_norm": 4.28125, + "grad_norm_var": 0.1401275634765625, + "learning_rate": 0.0001, + "loss": 6.5299, + "loss/crossentropy": 2.453442335128784, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.22756576538085938, + "step": 3230 + }, + { + "epoch": 0.101, + "grad_norm": 4.03125, + "grad_norm_var": 0.16022135416666666, + "learning_rate": 0.0001, + "loss": 6.2773, + "loss/crossentropy": 2.4830336570739746, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.20910926163196564, + "step": 3232 + }, + { + "epoch": 0.1010625, + "grad_norm": 4.34375, + "grad_norm_var": 0.15885416666666666, + "learning_rate": 0.0001, + "loss": 6.9401, + "loss/crossentropy": 2.710699439048767, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.24559374898672104, + "step": 3234 + }, + { + "epoch": 0.101125, + "grad_norm": 4.0625, + "grad_norm_var": 0.14816080729166667, + "learning_rate": 0.0001, + "loss": 6.8652, + "loss/crossentropy": 2.6526342630386353, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.24469351768493652, + "step": 3236 + }, + { + "epoch": 0.1011875, + "grad_norm": 3.90625, + "grad_norm_var": 0.10041910807291667, + "learning_rate": 0.0001, + "loss": 6.4651, + "loss/crossentropy": 2.483692169189453, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.22392398118972778, + "step": 3238 + }, + { + "epoch": 0.10125, + "grad_norm": 3.78125, + "grad_norm_var": 0.10042215983072916, + "learning_rate": 0.0001, + "loss": 6.5394, + "loss/crossentropy": 2.5091440677642822, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.23388579487800598, + "step": 3240 + }, + { + "epoch": 0.1013125, + "grad_norm": 4.90625, + "grad_norm_var": 0.14345296223958334, + "learning_rate": 0.0001, + "loss": 6.6786, + "loss/crossentropy": 2.4868510961532593, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.24143566191196442, + "step": 3242 + }, + { + "epoch": 0.101375, + "grad_norm": 3.765625, + "grad_norm_var": 0.13547261555989584, + "learning_rate": 0.0001, + "loss": 6.4898, + "loss/crossentropy": 2.5291796922683716, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.22028034180402756, + "step": 3244 + }, + { + "epoch": 0.1014375, + "grad_norm": 4.34375, + "grad_norm_var": 0.0960601806640625, + "learning_rate": 0.0001, + "loss": 6.854, + "loss/crossentropy": 2.7553012371063232, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.23409316688776016, + "step": 3246 + }, + { + "epoch": 0.1015, + "grad_norm": 3.796875, + "grad_norm_var": 0.10049540201822917, + "learning_rate": 0.0001, + "loss": 6.5228, + "loss/crossentropy": 2.4410321712493896, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.23552102595567703, + "step": 3248 + }, + { + "epoch": 0.1015625, + "grad_norm": 3.84375, + "grad_norm_var": 0.1018463134765625, + "learning_rate": 0.0001, + "loss": 6.7058, + "loss/crossentropy": 2.555396318435669, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.23418189585208893, + "step": 3250 + }, + { + "epoch": 0.101625, + "grad_norm": 3.828125, + "grad_norm_var": 0.14159749348958334, + "learning_rate": 0.0001, + "loss": 6.7568, + "loss/crossentropy": 2.5550049543380737, + "loss/hidden": 1.83984375, + "loss/jsd": 0.0, + "loss/logits": 0.23619654774665833, + "step": 3252 + }, + { + "epoch": 0.1016875, + "grad_norm": 4.53125, + "grad_norm_var": 0.13186442057291667, + "learning_rate": 0.0001, + "loss": 6.6336, + "loss/crossentropy": 2.5909934043884277, + "loss/hidden": 1.8359375, + "loss/jsd": 0.0, + "loss/logits": 0.22066353261470795, + "step": 3254 + }, + { + "epoch": 0.10175, + "grad_norm": 4.3125, + "grad_norm_var": 0.12200113932291666, + "learning_rate": 0.0001, + "loss": 6.9677, + "loss/crossentropy": 2.701515555381775, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.24731867015361786, + "step": 3256 + }, + { + "epoch": 0.1018125, + "grad_norm": 4.34375, + "grad_norm_var": 0.09191080729166666, + "learning_rate": 0.0001, + "loss": 6.6695, + "loss/crossentropy": 2.6422771215438843, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.23046043515205383, + "step": 3258 + }, + { + "epoch": 0.101875, + "grad_norm": 4.09375, + "grad_norm_var": 0.07873942057291666, + "learning_rate": 0.0001, + "loss": 6.2728, + "loss/crossentropy": 2.242031455039978, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.2257358878850937, + "step": 3260 + }, + { + "epoch": 0.1019375, + "grad_norm": 3.953125, + "grad_norm_var": 0.07946675618489583, + "learning_rate": 0.0001, + "loss": 6.5906, + "loss/crossentropy": 2.5818673372268677, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.22587288916110992, + "step": 3262 + }, + { + "epoch": 0.102, + "grad_norm": 3.765625, + "grad_norm_var": 0.0798736572265625, + "learning_rate": 0.0001, + "loss": 6.6192, + "loss/crossentropy": 2.602246403694153, + "loss/hidden": 1.76953125, + "loss/jsd": 0.0, + "loss/logits": 0.2247394099831581, + "step": 3264 + }, + { + "epoch": 0.1020625, + "grad_norm": 3.796875, + "grad_norm_var": 0.08364156087239584, + "learning_rate": 0.0001, + "loss": 6.7432, + "loss/crossentropy": 2.6607872247695923, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.23480576276779175, + "step": 3266 + }, + { + "epoch": 0.102125, + "grad_norm": 5.03125, + "grad_norm_var": 0.10357666015625, + "learning_rate": 0.0001, + "loss": 7.0483, + "loss/crossentropy": 2.6793575286865234, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2564301863312721, + "step": 3268 + }, + { + "epoch": 0.1021875, + "grad_norm": 4.03125, + "grad_norm_var": 0.09527079264322917, + "learning_rate": 0.0001, + "loss": 6.6106, + "loss/crossentropy": 2.5537089109420776, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.23264338821172714, + "step": 3270 + }, + { + "epoch": 0.10225, + "grad_norm": 3.984375, + "grad_norm_var": 0.09326171875, + "learning_rate": 0.0001, + "loss": 6.6138, + "loss/crossentropy": 2.5651522874832153, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.23298664391040802, + "step": 3272 + }, + { + "epoch": 0.1023125, + "grad_norm": 4.6875, + "grad_norm_var": 0.12069905598958333, + "learning_rate": 0.0001, + "loss": 6.2125, + "loss/crossentropy": 2.287251889705658, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.21595896035432816, + "step": 3274 + }, + { + "epoch": 0.102375, + "grad_norm": 4.53125, + "grad_norm_var": 0.12675374348958332, + "learning_rate": 0.0001, + "loss": 6.6801, + "loss/crossentropy": 2.540787935256958, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.2358093112707138, + "step": 3276 + }, + { + "epoch": 0.1024375, + "grad_norm": 4.0, + "grad_norm_var": 0.1319732666015625, + "learning_rate": 0.0001, + "loss": 6.6532, + "loss/crossentropy": 2.479809880256653, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.24272619932889938, + "step": 3278 + }, + { + "epoch": 0.1025, + "grad_norm": 4.0625, + "grad_norm_var": 0.11599934895833333, + "learning_rate": 0.0001, + "loss": 6.5609, + "loss/crossentropy": 2.5186489820480347, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.2311742752790451, + "step": 3280 + }, + { + "epoch": 0.1025625, + "grad_norm": 4.125, + "grad_norm_var": 0.09345703125, + "learning_rate": 0.0001, + "loss": 6.5359, + "loss/crossentropy": 2.5733646154403687, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.2251606062054634, + "step": 3282 + }, + { + "epoch": 0.102625, + "grad_norm": 4.4375, + "grad_norm_var": 0.05789388020833333, + "learning_rate": 0.0001, + "loss": 6.8128, + "loss/crossentropy": 2.6851168870925903, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.23737867176532745, + "step": 3284 + }, + { + "epoch": 0.1026875, + "grad_norm": 4.40625, + "grad_norm_var": 0.2830149332682292, + "learning_rate": 0.0001, + "loss": 7.2477, + "loss/crossentropy": 2.7112995386123657, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2723904848098755, + "step": 3286 + }, + { + "epoch": 0.10275, + "grad_norm": 3.9375, + "grad_norm_var": 0.27584635416666664, + "learning_rate": 0.0001, + "loss": 6.889, + "loss/crossentropy": 2.689952254295349, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.24490958452224731, + "step": 3288 + }, + { + "epoch": 0.1028125, + "grad_norm": 4.0, + "grad_norm_var": 0.30777079264322915, + "learning_rate": 0.0001, + "loss": 6.6267, + "loss/crossentropy": 2.6512755155563354, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.22684409469366074, + "step": 3290 + }, + { + "epoch": 0.102875, + "grad_norm": 3.8125, + "grad_norm_var": 0.3424763997395833, + "learning_rate": 0.0001, + "loss": 6.3049, + "loss/crossentropy": 2.470964550971985, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.21112415194511414, + "step": 3292 + }, + { + "epoch": 0.1029375, + "grad_norm": 4.03125, + "grad_norm_var": 0.3399566650390625, + "learning_rate": 0.0001, + "loss": 6.601, + "loss/crossentropy": 2.5517722368240356, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.22992479801177979, + "step": 3294 + }, + { + "epoch": 0.103, + "grad_norm": 3.953125, + "grad_norm_var": 0.34455973307291665, + "learning_rate": 0.0001, + "loss": 6.9876, + "loss/crossentropy": 2.772384524345398, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.24378702044487, + "step": 3296 + }, + { + "epoch": 0.1030625, + "grad_norm": 4.1875, + "grad_norm_var": 0.3600870768229167, + "learning_rate": 0.0001, + "loss": 6.5065, + "loss/crossentropy": 2.5865761041641235, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.22168077528476715, + "step": 3298 + }, + { + "epoch": 0.103125, + "grad_norm": 4.5, + "grad_norm_var": 0.36243082682291666, + "learning_rate": 0.0001, + "loss": 6.7596, + "loss/crossentropy": 2.6341527700424194, + "loss/hidden": 1.76953125, + "loss/jsd": 0.0, + "loss/logits": 0.23559296876192093, + "step": 3300 + }, + { + "epoch": 0.1031875, + "grad_norm": 4.59375, + "grad_norm_var": 0.08058268229166667, + "learning_rate": 0.0001, + "loss": 6.8533, + "loss/crossentropy": 2.7177610397338867, + "loss/hidden": 1.76953125, + "loss/jsd": 0.0, + "loss/logits": 0.23659591376781464, + "step": 3302 + }, + { + "epoch": 0.10325, + "grad_norm": 4.4375, + "grad_norm_var": 0.083447265625, + "learning_rate": 0.0001, + "loss": 6.9423, + "loss/crossentropy": 2.6986615657806396, + "loss/hidden": 1.76953125, + "loss/jsd": 0.0, + "loss/logits": 0.24740910530090332, + "step": 3304 + }, + { + "epoch": 0.1033125, + "grad_norm": 4.3125, + "grad_norm_var": 0.0849761962890625, + "learning_rate": 0.0001, + "loss": 6.3193, + "loss/crossentropy": 2.3887938261032104, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.22156642377376556, + "step": 3306 + }, + { + "epoch": 0.103375, + "grad_norm": 4.15625, + "grad_norm_var": 0.06568603515625, + "learning_rate": 0.0001, + "loss": 7.2802, + "loss/crossentropy": 3.0000956058502197, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.25066203624010086, + "step": 3308 + }, + { + "epoch": 0.1034375, + "grad_norm": 55.25, + "grad_norm_var": 163.27130432128905, + "learning_rate": 0.0001, + "loss": 7.7819, + "loss/crossentropy": 2.6019656658172607, + "loss/hidden": 1.9375, + "loss/jsd": 0.0, + "loss/logits": 0.3242449462413788, + "step": 3310 + }, + { + "epoch": 0.1035, + "grad_norm": 4.28125, + "grad_norm_var": 163.11725260416668, + "learning_rate": 0.0001, + "loss": 6.8478, + "loss/crossentropy": 2.632767677307129, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.24064189940690994, + "step": 3312 + }, + { + "epoch": 0.1035625, + "grad_norm": 6.65625, + "grad_norm_var": 162.36551005045573, + "learning_rate": 0.0001, + "loss": 6.6017, + "loss/crossentropy": 2.5260356664657593, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.23256751894950867, + "step": 3314 + }, + { + "epoch": 0.103625, + "grad_norm": 4.46875, + "grad_norm_var": 162.21673075358072, + "learning_rate": 0.0001, + "loss": 6.9282, + "loss/crossentropy": 2.644586205482483, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.24828049540519714, + "step": 3316 + }, + { + "epoch": 0.1036875, + "grad_norm": 4.21875, + "grad_norm_var": 162.37351786295574, + "learning_rate": 0.0001, + "loss": 6.4069, + "loss/crossentropy": 2.370494246482849, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.22707335650920868, + "step": 3318 + }, + { + "epoch": 0.10375, + "grad_norm": 4.15625, + "grad_norm_var": 162.66480204264323, + "learning_rate": 0.0001, + "loss": 6.7115, + "loss/crossentropy": 2.620364189147949, + "loss/hidden": 1.76953125, + "loss/jsd": 0.0, + "loss/logits": 0.23216355592012405, + "step": 3320 + }, + { + "epoch": 0.1038125, + "grad_norm": 3.921875, + "grad_norm_var": 162.7859120686849, + "learning_rate": 0.0001, + "loss": 6.739, + "loss/crossentropy": 2.599918246269226, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.23734501004219055, + "step": 3322 + }, + { + "epoch": 0.103875, + "grad_norm": 3.71875, + "grad_norm_var": 163.06065165201824, + "learning_rate": 0.0001, + "loss": 6.007, + "loss/crossentropy": 2.29317045211792, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.20107153803110123, + "step": 3324 + }, + { + "epoch": 0.1039375, + "grad_norm": 4.75, + "grad_norm_var": 0.49283447265625, + "learning_rate": 0.0001, + "loss": 6.6523, + "loss/crossentropy": 2.5400946140289307, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.2369968742132187, + "step": 3326 + }, + { + "epoch": 0.104, + "grad_norm": 5.34375, + "grad_norm_var": 0.5761220296223958, + "learning_rate": 0.0001, + "loss": 6.9468, + "loss/crossentropy": 2.67788302898407, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.24720104038715363, + "step": 3328 + }, + { + "epoch": 0.1040625, + "grad_norm": 4.03125, + "grad_norm_var": 0.18349202473958334, + "learning_rate": 0.0001, + "loss": 6.7984, + "loss/crossentropy": 2.7285367250442505, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.2316000536084175, + "step": 3330 + }, + { + "epoch": 0.104125, + "grad_norm": 4.3125, + "grad_norm_var": 0.18943684895833332, + "learning_rate": 0.0001, + "loss": 7.0711, + "loss/crossentropy": 2.7006884813308716, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.24914763867855072, + "step": 3332 + }, + { + "epoch": 0.1041875, + "grad_norm": 4.15625, + "grad_norm_var": 0.19184468587239584, + "learning_rate": 0.0001, + "loss": 6.2808, + "loss/crossentropy": 2.314067244529724, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.22089679539203644, + "step": 3334 + }, + { + "epoch": 0.10425, + "grad_norm": 4.3125, + "grad_norm_var": 0.18879292805989584, + "learning_rate": 0.0001, + "loss": 6.6084, + "loss/crossentropy": 2.5610376596450806, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.23208405077457428, + "step": 3336 + }, + { + "epoch": 0.1043125, + "grad_norm": 3.921875, + "grad_norm_var": 0.1903717041015625, + "learning_rate": 0.0001, + "loss": 6.4163, + "loss/crossentropy": 2.499178647994995, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.2198343127965927, + "step": 3338 + }, + { + "epoch": 0.104375, + "grad_norm": 5.1875, + "grad_norm_var": 0.2302398681640625, + "learning_rate": 0.0001, + "loss": 6.9003, + "loss/crossentropy": 2.6422451734542847, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.24767768383026123, + "step": 3340 + }, + { + "epoch": 0.1044375, + "grad_norm": 4.40625, + "grad_norm_var": 0.20227864583333333, + "learning_rate": 0.0001, + "loss": 6.6569, + "loss/crossentropy": 2.496048331260681, + "loss/hidden": 1.76953125, + "loss/jsd": 0.0, + "loss/logits": 0.23913077265024185, + "step": 3342 + }, + { + "epoch": 0.1045, + "grad_norm": 4.25, + "grad_norm_var": 0.11485087076822917, + "learning_rate": 0.0001, + "loss": 7.0398, + "loss/crossentropy": 2.788357138633728, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.24741224944591522, + "step": 3344 + }, + { + "epoch": 0.1045625, + "grad_norm": 3.78125, + "grad_norm_var": 0.12363993326822917, + "learning_rate": 0.0001, + "loss": 6.7648, + "loss/crossentropy": 2.645902395248413, + "loss/hidden": 1.76953125, + "loss/jsd": 0.0, + "loss/logits": 0.23493220657110214, + "step": 3346 + }, + { + "epoch": 0.104625, + "grad_norm": 4.84375, + "grad_norm_var": 0.1349761962890625, + "learning_rate": 0.0001, + "loss": 6.6878, + "loss/crossentropy": 2.617870330810547, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.23355630785226822, + "step": 3348 + }, + { + "epoch": 0.1046875, + "grad_norm": 3.953125, + "grad_norm_var": 0.14269917805989582, + "learning_rate": 0.0001, + "loss": 6.9487, + "loss/crossentropy": 2.728622317314148, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.2442745715379715, + "step": 3350 + }, + { + "epoch": 0.10475, + "grad_norm": 4.15625, + "grad_norm_var": 0.1415435791015625, + "learning_rate": 0.0001, + "loss": 6.8727, + "loss/crossentropy": 2.638767719268799, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.24331195652484894, + "step": 3352 + }, + { + "epoch": 0.1048125, + "grad_norm": 4.0625, + "grad_norm_var": 0.12536519368489582, + "learning_rate": 0.0001, + "loss": 6.8102, + "loss/crossentropy": 2.7050987482070923, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.23511598259210587, + "step": 3354 + }, + { + "epoch": 0.104875, + "grad_norm": 3.71875, + "grad_norm_var": 0.07929585774739584, + "learning_rate": 0.0001, + "loss": 6.2211, + "loss/crossentropy": 2.3178874254226685, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.21766043454408646, + "step": 3356 + }, + { + "epoch": 0.1049375, + "grad_norm": 3.640625, + "grad_norm_var": 0.09163309733072916, + "learning_rate": 0.0001, + "loss": 6.1729, + "loss/crossentropy": 2.3861899375915527, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.21108877658843994, + "step": 3358 + }, + { + "epoch": 0.105, + "grad_norm": 4.09375, + "grad_norm_var": 0.09419657389322916, + "learning_rate": 0.0001, + "loss": 6.5909, + "loss/crossentropy": 2.560970664024353, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.23072851449251175, + "step": 3360 + }, + { + "epoch": 0.1050625, + "grad_norm": 4.0, + "grad_norm_var": 0.09385477701822917, + "learning_rate": 0.0001, + "loss": 6.4854, + "loss/crossentropy": 2.5368300676345825, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.22220295667648315, + "step": 3362 + }, + { + "epoch": 0.105125, + "grad_norm": 3.953125, + "grad_norm_var": 0.053515625, + "learning_rate": 0.0001, + "loss": 6.4987, + "loss/crossentropy": 2.570409417152405, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.21977746486663818, + "step": 3364 + }, + { + "epoch": 0.1051875, + "grad_norm": 3.875, + "grad_norm_var": 0.03459370930989583, + "learning_rate": 0.0001, + "loss": 6.8575, + "loss/crossentropy": 2.7380369901657104, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.23538140952587128, + "step": 3366 + }, + { + "epoch": 0.10525, + "grad_norm": 4.3125, + "grad_norm_var": 0.031712849934895836, + "learning_rate": 0.0001, + "loss": 6.6548, + "loss/crossentropy": 2.5999975204467773, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.23086804151535034, + "step": 3368 + }, + { + "epoch": 0.1053125, + "grad_norm": 3.40625, + "grad_norm_var": 0.0510894775390625, + "learning_rate": 0.0001, + "loss": 6.2373, + "loss/crossentropy": 2.3586994409561157, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.21403686702251434, + "step": 3370 + }, + { + "epoch": 0.105375, + "grad_norm": 3.9375, + "grad_norm_var": 0.0415679931640625, + "learning_rate": 0.0001, + "loss": 6.3877, + "loss/crossentropy": 2.436392903327942, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.22168905287981033, + "step": 3372 + }, + { + "epoch": 0.1054375, + "grad_norm": 3.8125, + "grad_norm_var": 0.046930948893229164, + "learning_rate": 0.0001, + "loss": 6.3903, + "loss/crossentropy": 2.537071108818054, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.21617799997329712, + "step": 3374 + }, + { + "epoch": 0.1055, + "grad_norm": 3.734375, + "grad_norm_var": 0.0477447509765625, + "learning_rate": 0.0001, + "loss": 6.3936, + "loss/crossentropy": 2.44389808177948, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.22348541766405106, + "step": 3376 + }, + { + "epoch": 0.1055625, + "grad_norm": 3.828125, + "grad_norm_var": 0.047265625, + "learning_rate": 0.0001, + "loss": 6.1489, + "loss/crossentropy": 2.351946711540222, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.2089892104268074, + "step": 3378 + }, + { + "epoch": 0.105625, + "grad_norm": 3.640625, + "grad_norm_var": 0.05133056640625, + "learning_rate": 0.0001, + "loss": 6.4778, + "loss/crossentropy": 2.489010810852051, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.22818031907081604, + "step": 3380 + }, + { + "epoch": 0.1056875, + "grad_norm": 3.9375, + "grad_norm_var": 0.05028889973958333, + "learning_rate": 0.0001, + "loss": 6.8609, + "loss/crossentropy": 2.7186564207077026, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.24039897322654724, + "step": 3382 + }, + { + "epoch": 0.10575, + "grad_norm": 4.25, + "grad_norm_var": 0.11812744140625, + "learning_rate": 0.0001, + "loss": 7.0582, + "loss/crossentropy": 2.758796215057373, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.2510346248745918, + "step": 3384 + }, + { + "epoch": 0.1058125, + "grad_norm": 3.953125, + "grad_norm_var": 0.1041168212890625, + "learning_rate": 0.0001, + "loss": 6.4993, + "loss/crossentropy": 2.5294731855392456, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.22120419889688492, + "step": 3386 + }, + { + "epoch": 0.105875, + "grad_norm": 3.9375, + "grad_norm_var": 0.10358784993489584, + "learning_rate": 0.0001, + "loss": 6.5889, + "loss/crossentropy": 2.5931564569473267, + "loss/hidden": 1.76171875, + "loss/jsd": 0.0, + "loss/logits": 0.22339749336242676, + "step": 3388 + }, + { + "epoch": 0.1059375, + "grad_norm": 4.34375, + "grad_norm_var": 0.09595947265625, + "learning_rate": 0.0001, + "loss": 6.754, + "loss/crossentropy": 2.637158513069153, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.23903284966945648, + "step": 3390 + }, + { + "epoch": 0.106, + "grad_norm": 4.0, + "grad_norm_var": 0.08388264973958333, + "learning_rate": 0.0001, + "loss": 6.38, + "loss/crossentropy": 2.390872836112976, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.22664637118577957, + "step": 3392 + }, + { + "epoch": 0.1060625, + "grad_norm": 3.84375, + "grad_norm_var": 0.0847320556640625, + "learning_rate": 0.0001, + "loss": 6.859, + "loss/crossentropy": 2.7824547290802, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.23460885882377625, + "step": 3394 + }, + { + "epoch": 0.106125, + "grad_norm": 3.859375, + "grad_norm_var": 0.0758209228515625, + "learning_rate": 0.0001, + "loss": 6.4763, + "loss/crossentropy": 2.5256487131118774, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.22709660977125168, + "step": 3396 + }, + { + "epoch": 0.1061875, + "grad_norm": 3.78125, + "grad_norm_var": 0.08244527180989583, + "learning_rate": 0.0001, + "loss": 6.7439, + "loss/crossentropy": 2.5768805742263794, + "loss/hidden": 1.78515625, + "loss/jsd": 0.0, + "loss/logits": 0.2381836324930191, + "step": 3398 + }, + { + "epoch": 0.10625, + "grad_norm": 3.6875, + "grad_norm_var": 0.04130757649739583, + "learning_rate": 0.0001, + "loss": 6.611, + "loss/crossentropy": 2.593908429145813, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.22827593237161636, + "step": 3400 + }, + { + "epoch": 0.1063125, + "grad_norm": 3.921875, + "grad_norm_var": 0.044066365559895834, + "learning_rate": 0.0001, + "loss": 6.2723, + "loss/crossentropy": 2.3864080905914307, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.21359364688396454, + "step": 3402 + }, + { + "epoch": 0.106375, + "grad_norm": 4.28125, + "grad_norm_var": 0.05420633951822917, + "learning_rate": 0.0001, + "loss": 6.8791, + "loss/crossentropy": 2.740124464035034, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.23929176479578018, + "step": 3404 + }, + { + "epoch": 0.1064375, + "grad_norm": 3.96875, + "grad_norm_var": 0.044733683268229164, + "learning_rate": 0.0001, + "loss": 6.8561, + "loss/crossentropy": 2.72505784034729, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.23966719955205917, + "step": 3406 + }, + { + "epoch": 0.1065, + "grad_norm": 4.375, + "grad_norm_var": 0.054541015625, + "learning_rate": 0.0001, + "loss": 7.0188, + "loss/crossentropy": 2.7997384071350098, + "loss/hidden": 1.78515625, + "loss/jsd": 0.0, + "loss/logits": 0.24338825047016144, + "step": 3408 + }, + { + "epoch": 0.1065625, + "grad_norm": 3.828125, + "grad_norm_var": 0.05488993326822917, + "learning_rate": 0.0001, + "loss": 6.9797, + "loss/crossentropy": 2.819231390953064, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.24026915431022644, + "step": 3410 + }, + { + "epoch": 0.106625, + "grad_norm": 4.96875, + "grad_norm_var": 2.1745402018229165, + "learning_rate": 0.0001, + "loss": 7.0188, + "loss/crossentropy": 2.718687415122986, + "loss/hidden": 1.84375, + "loss/jsd": 0.0, + "loss/logits": 0.24564100801944733, + "step": 3412 + }, + { + "epoch": 0.1066875, + "grad_norm": 4.21875, + "grad_norm_var": 2.156966145833333, + "learning_rate": 0.0001, + "loss": 6.5127, + "loss/crossentropy": 2.5084104537963867, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.2297305017709732, + "step": 3414 + }, + { + "epoch": 0.10675, + "grad_norm": 3.796875, + "grad_norm_var": 2.158740234375, + "learning_rate": 0.0001, + "loss": 6.6456, + "loss/crossentropy": 2.6552449464797974, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.22754787653684616, + "step": 3416 + }, + { + "epoch": 0.1068125, + "grad_norm": 4.40625, + "grad_norm_var": 2.106273396809896, + "learning_rate": 0.0001, + "loss": 6.9007, + "loss/crossentropy": 2.6753886938095093, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.24674660712480545, + "step": 3418 + }, + { + "epoch": 0.106875, + "grad_norm": 4.21875, + "grad_norm_var": 2.1166178385416665, + "learning_rate": 0.0001, + "loss": 6.4056, + "loss/crossentropy": 2.5564688444137573, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.21655535697937012, + "step": 3420 + }, + { + "epoch": 0.1069375, + "grad_norm": 3.78125, + "grad_norm_var": 2.1394195556640625, + "learning_rate": 0.0001, + "loss": 6.6441, + "loss/crossentropy": 2.6539390087127686, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.22596576809883118, + "step": 3422 + }, + { + "epoch": 0.107, + "grad_norm": 3.484375, + "grad_norm_var": 2.1920562744140626, + "learning_rate": 0.0001, + "loss": 6.0564, + "loss/crossentropy": 2.279597282409668, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.20502237975597382, + "step": 3424 + }, + { + "epoch": 0.1070625, + "grad_norm": 4.03125, + "grad_norm_var": 2.3096964518229166, + "learning_rate": 0.0001, + "loss": 6.9587, + "loss/crossentropy": 2.729211688041687, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.2440449818968773, + "step": 3426 + }, + { + "epoch": 0.107125, + "grad_norm": 4.09375, + "grad_norm_var": 0.2835774739583333, + "learning_rate": 0.0001, + "loss": 6.672, + "loss/crossentropy": 2.5828728675842285, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.233519047498703, + "step": 3428 + }, + { + "epoch": 0.1071875, + "grad_norm": 3.625, + "grad_norm_var": 0.29830322265625, + "learning_rate": 0.0001, + "loss": 6.5412, + "loss/crossentropy": 2.6584055423736572, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.2175753340125084, + "step": 3430 + }, + { + "epoch": 0.10725, + "grad_norm": 4.125, + "grad_norm_var": 0.29244384765625, + "learning_rate": 0.0001, + "loss": 7.0144, + "loss/crossentropy": 2.808635711669922, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.24245436489582062, + "step": 3432 + }, + { + "epoch": 0.1073125, + "grad_norm": 4.28125, + "grad_norm_var": 0.29611002604166664, + "learning_rate": 0.0001, + "loss": 6.582, + "loss/crossentropy": 2.5630985498428345, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.2292320355772972, + "step": 3434 + }, + { + "epoch": 0.107375, + "grad_norm": 4.34375, + "grad_norm_var": 0.29687398274739585, + "learning_rate": 0.0001, + "loss": 6.3284, + "loss/crossentropy": 2.5125374794006348, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.20814663916826248, + "step": 3436 + }, + { + "epoch": 0.1074375, + "grad_norm": 4.15625, + "grad_norm_var": 0.288427734375, + "learning_rate": 0.0001, + "loss": 6.6832, + "loss/crossentropy": 2.553908348083496, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.23793192207813263, + "step": 3438 + }, + { + "epoch": 0.1075, + "grad_norm": 3.890625, + "grad_norm_var": 0.267138671875, + "learning_rate": 0.0001, + "loss": 6.6803, + "loss/crossentropy": 2.5226951837539673, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.23919696360826492, + "step": 3440 + }, + { + "epoch": 0.1075625, + "grad_norm": 3.921875, + "grad_norm_var": 0.04299214680989583, + "learning_rate": 0.0001, + "loss": 6.8391, + "loss/crossentropy": 2.732064366340637, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.2357020601630211, + "step": 3442 + }, + { + "epoch": 0.107625, + "grad_norm": 4.09375, + "grad_norm_var": 0.050455729166666664, + "learning_rate": 0.0001, + "loss": 6.4614, + "loss/crossentropy": 2.5358023643493652, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.22224289923906326, + "step": 3444 + }, + { + "epoch": 0.1076875, + "grad_norm": 4.34375, + "grad_norm_var": 0.09778645833333334, + "learning_rate": 0.0001, + "loss": 7.4274, + "loss/crossentropy": 2.9860514402389526, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.26366259157657623, + "step": 3446 + }, + { + "epoch": 0.10775, + "grad_norm": 3.953125, + "grad_norm_var": 0.13871968587239583, + "learning_rate": 0.0001, + "loss": 6.7431, + "loss/crossentropy": 2.5070163011550903, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.24586967378854752, + "step": 3448 + }, + { + "epoch": 0.1078125, + "grad_norm": 4.28125, + "grad_norm_var": 0.12927144368489582, + "learning_rate": 0.0001, + "loss": 6.9973, + "loss/crossentropy": 2.7542529106140137, + "loss/hidden": 1.81640625, + "loss/jsd": 0.0, + "loss/logits": 0.24266376346349716, + "step": 3450 + }, + { + "epoch": 0.107875, + "grad_norm": 3.640625, + "grad_norm_var": 0.12939453125, + "learning_rate": 0.0001, + "loss": 6.5825, + "loss/crossentropy": 2.6318721771240234, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.22435743361711502, + "step": 3452 + }, + { + "epoch": 0.1079375, + "grad_norm": 3.984375, + "grad_norm_var": 0.12965494791666668, + "learning_rate": 0.0001, + "loss": 6.9525, + "loss/crossentropy": 2.752131462097168, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2395711988210678, + "step": 3454 + }, + { + "epoch": 0.108, + "grad_norm": 4.0625, + "grad_norm_var": 0.12970377604166666, + "learning_rate": 0.0001, + "loss": 6.811, + "loss/crossentropy": 2.6434799432754517, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.23862747848033905, + "step": 3456 + }, + { + "epoch": 0.1080625, + "grad_norm": 4.53125, + "grad_norm_var": 0.13997294108072916, + "learning_rate": 0.0001, + "loss": 6.7528, + "loss/crossentropy": 2.690545082092285, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.23083413392305374, + "step": 3458 + }, + { + "epoch": 0.108125, + "grad_norm": 3.96875, + "grad_norm_var": 0.1258453369140625, + "learning_rate": 0.0001, + "loss": 6.6226, + "loss/crossentropy": 2.639745831489563, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.22484835237264633, + "step": 3460 + }, + { + "epoch": 0.1081875, + "grad_norm": 4.0625, + "grad_norm_var": 0.09070536295572916, + "learning_rate": 0.0001, + "loss": 6.2426, + "loss/crossentropy": 2.396896004676819, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.21152300387620926, + "step": 3462 + }, + { + "epoch": 0.10825, + "grad_norm": 3.578125, + "grad_norm_var": 0.06910807291666667, + "learning_rate": 0.0001, + "loss": 6.324, + "loss/crossentropy": 2.4832128286361694, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.21220114827156067, + "step": 3464 + }, + { + "epoch": 0.1083125, + "grad_norm": 3.921875, + "grad_norm_var": 0.05734049479166667, + "learning_rate": 0.0001, + "loss": 6.793, + "loss/crossentropy": 2.7297017574310303, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.23250501602888107, + "step": 3466 + }, + { + "epoch": 0.108375, + "grad_norm": 4.1875, + "grad_norm_var": 0.05925191243489583, + "learning_rate": 0.0001, + "loss": 6.7522, + "loss/crossentropy": 2.6332927942276, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.23767106980085373, + "step": 3468 + }, + { + "epoch": 0.1084375, + "grad_norm": 4.0625, + "grad_norm_var": 0.06487630208333334, + "learning_rate": 0.0001, + "loss": 7.205, + "loss/crossentropy": 2.95207941532135, + "loss/hidden": 1.80859375, + "loss/jsd": 0.0, + "loss/logits": 0.24442926049232483, + "step": 3470 + }, + { + "epoch": 0.1085, + "grad_norm": 3.90625, + "grad_norm_var": 0.06583658854166667, + "learning_rate": 0.0001, + "loss": 6.8472, + "loss/crossentropy": 2.7336323261260986, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.2383066490292549, + "step": 3472 + }, + { + "epoch": 0.1085625, + "grad_norm": 3.78125, + "grad_norm_var": 0.05103759765625, + "learning_rate": 0.0001, + "loss": 6.5448, + "loss/crossentropy": 2.5316661596298218, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.22787390649318695, + "step": 3474 + }, + { + "epoch": 0.108625, + "grad_norm": 3.984375, + "grad_norm_var": 0.30260009765625, + "learning_rate": 0.0001, + "loss": 6.6127, + "loss/crossentropy": 2.57386314868927, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.2292788252234459, + "step": 3476 + }, + { + "epoch": 0.1086875, + "grad_norm": 4.375, + "grad_norm_var": 0.30172119140625, + "learning_rate": 0.0001, + "loss": 6.4784, + "loss/crossentropy": 2.4776185750961304, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.22429370135068893, + "step": 3478 + }, + { + "epoch": 0.10875, + "grad_norm": 4.40625, + "grad_norm_var": 0.26549072265625, + "learning_rate": 0.0001, + "loss": 6.7101, + "loss/crossentropy": 2.6190848350524902, + "loss/hidden": 1.76171875, + "loss/jsd": 0.0, + "loss/logits": 0.23293334245681763, + "step": 3480 + }, + { + "epoch": 0.1088125, + "grad_norm": 3.75, + "grad_norm_var": 0.26448160807291665, + "learning_rate": 0.0001, + "loss": 6.7926, + "loss/crossentropy": 2.765929102897644, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.2296176701784134, + "step": 3482 + }, + { + "epoch": 0.108875, + "grad_norm": 3.796875, + "grad_norm_var": 0.27838541666666666, + "learning_rate": 0.0001, + "loss": 6.4387, + "loss/crossentropy": 2.511491060256958, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.2204545959830284, + "step": 3484 + }, + { + "epoch": 0.1089375, + "grad_norm": 3.75, + "grad_norm_var": 0.29277242024739586, + "learning_rate": 0.0001, + "loss": 6.5973, + "loss/crossentropy": 2.638323187828064, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.22637015581130981, + "step": 3486 + }, + { + "epoch": 0.109, + "grad_norm": 4.25, + "grad_norm_var": 0.2879384358723958, + "learning_rate": 0.0001, + "loss": 6.7602, + "loss/crossentropy": 2.6970373392105103, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.23366264253854752, + "step": 3488 + }, + { + "epoch": 0.1090625, + "grad_norm": 4.65625, + "grad_norm_var": 0.2937164306640625, + "learning_rate": 0.0001, + "loss": 6.8439, + "loss/crossentropy": 2.669746994972229, + "loss/hidden": 1.76171875, + "loss/jsd": 0.0, + "loss/logits": 0.24124659597873688, + "step": 3490 + }, + { + "epoch": 0.109125, + "grad_norm": 3.9375, + "grad_norm_var": 0.07747395833333333, + "learning_rate": 0.0001, + "loss": 6.8294, + "loss/crossentropy": 2.710958242416382, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.23527763038873672, + "step": 3492 + }, + { + "epoch": 0.1091875, + "grad_norm": 4.46875, + "grad_norm_var": 0.08189697265625, + "learning_rate": 0.0001, + "loss": 6.9861, + "loss/crossentropy": 2.617711901664734, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2555924579501152, + "step": 3494 + }, + { + "epoch": 0.10925, + "grad_norm": 3.640625, + "grad_norm_var": 0.0930816650390625, + "learning_rate": 0.0001, + "loss": 6.425, + "loss/crossentropy": 2.4971325397491455, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.22208131849765778, + "step": 3496 + }, + { + "epoch": 0.1093125, + "grad_norm": 3.875, + "grad_norm_var": 0.0868316650390625, + "learning_rate": 0.0001, + "loss": 6.8564, + "loss/crossentropy": 2.7396827936172485, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.23667167872190475, + "step": 3498 + }, + { + "epoch": 0.109375, + "grad_norm": 3.984375, + "grad_norm_var": 0.08381754557291667, + "learning_rate": 0.0001, + "loss": 6.3491, + "loss/crossentropy": 2.466445207595825, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.219122052192688, + "step": 3500 + }, + { + "epoch": 0.1094375, + "grad_norm": 3.9375, + "grad_norm_var": 0.07215067545572916, + "learning_rate": 0.0001, + "loss": 6.7396, + "loss/crossentropy": 2.6930452585220337, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.2312178760766983, + "step": 3502 + }, + { + "epoch": 0.1095, + "grad_norm": 3.984375, + "grad_norm_var": 0.069189453125, + "learning_rate": 0.0001, + "loss": 7.0488, + "loss/crossentropy": 2.8537940979003906, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.24645072221755981, + "step": 3504 + }, + { + "epoch": 0.1095625, + "grad_norm": 4.03125, + "grad_norm_var": 0.059789021809895836, + "learning_rate": 0.0001, + "loss": 6.2965, + "loss/crossentropy": 2.4314013719558716, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.22010138630867004, + "step": 3506 + }, + { + "epoch": 0.109625, + "grad_norm": 3.6875, + "grad_norm_var": 0.05520426432291667, + "learning_rate": 0.0001, + "loss": 6.1617, + "loss/crossentropy": 2.3627779483795166, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.2111404687166214, + "step": 3508 + }, + { + "epoch": 0.1096875, + "grad_norm": 4.28125, + "grad_norm_var": 0.04342041015625, + "learning_rate": 0.0001, + "loss": 7.208, + "loss/crossentropy": 2.9820592403411865, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.24798868596553802, + "step": 3510 + }, + { + "epoch": 0.10975, + "grad_norm": 3.8125, + "grad_norm_var": 0.030790201822916665, + "learning_rate": 0.0001, + "loss": 6.5128, + "loss/crossentropy": 2.5333805084228516, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.22724204510450363, + "step": 3512 + }, + { + "epoch": 0.1098125, + "grad_norm": 3.703125, + "grad_norm_var": 0.03193257649739583, + "learning_rate": 0.0001, + "loss": 6.6557, + "loss/crossentropy": 2.628496289253235, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.22694183886051178, + "step": 3514 + }, + { + "epoch": 0.109875, + "grad_norm": 4.46875, + "grad_norm_var": 0.85230712890625, + "learning_rate": 0.0001, + "loss": 6.923, + "loss/crossentropy": 2.598778486251831, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.253519706428051, + "step": 3516 + }, + { + "epoch": 0.1099375, + "grad_norm": 4.125, + "grad_norm_var": 0.8498931884765625, + "learning_rate": 0.0001, + "loss": 6.7063, + "loss/crossentropy": 2.7180824279785156, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.22694354504346848, + "step": 3518 + }, + { + "epoch": 0.11, + "grad_norm": 4.21875, + "grad_norm_var": 0.8428944905598958, + "learning_rate": 0.0001, + "loss": 6.8915, + "loss/crossentropy": 2.7829537391662598, + "loss/hidden": 1.78515625, + "loss/jsd": 0.0, + "loss/logits": 0.2323419153690338, + "step": 3520 + }, + { + "epoch": 0.1100625, + "grad_norm": 3.84375, + "grad_norm_var": 0.8219685872395833, + "learning_rate": 0.0001, + "loss": 6.619, + "loss/crossentropy": 2.5749711990356445, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.23018094897270203, + "step": 3522 + }, + { + "epoch": 0.110125, + "grad_norm": 3.71875, + "grad_norm_var": 0.8130116780598958, + "learning_rate": 0.0001, + "loss": 6.526, + "loss/crossentropy": 2.5394864082336426, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.2267736867070198, + "step": 3524 + }, + { + "epoch": 0.1101875, + "grad_norm": 4.0, + "grad_norm_var": 0.83092041015625, + "learning_rate": 0.0001, + "loss": 6.6519, + "loss/crossentropy": 2.6654670238494873, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.2291162833571434, + "step": 3526 + }, + { + "epoch": 0.11025, + "grad_norm": 4.3125, + "grad_norm_var": 0.8125315348307292, + "learning_rate": 0.0001, + "loss": 6.7104, + "loss/crossentropy": 2.6382076740264893, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.2345614731311798, + "step": 3528 + }, + { + "epoch": 0.1103125, + "grad_norm": 3.453125, + "grad_norm_var": 0.8322336832682292, + "learning_rate": 0.0001, + "loss": 6.7222, + "loss/crossentropy": 2.7326756715774536, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.22864137589931488, + "step": 3530 + }, + { + "epoch": 0.110375, + "grad_norm": 4.25, + "grad_norm_var": 0.0852447509765625, + "learning_rate": 0.0001, + "loss": 7.1299, + "loss/crossentropy": 2.849326014518738, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.2483738362789154, + "step": 3532 + }, + { + "epoch": 0.1104375, + "grad_norm": 4.09375, + "grad_norm_var": 0.08559468587239584, + "learning_rate": 0.0001, + "loss": 6.6426, + "loss/crossentropy": 2.622498631477356, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.227398082613945, + "step": 3534 + }, + { + "epoch": 0.1105, + "grad_norm": 3.703125, + "grad_norm_var": 0.08915608723958333, + "learning_rate": 0.0001, + "loss": 6.3708, + "loss/crossentropy": 2.4668338298797607, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.21461939066648483, + "step": 3536 + }, + { + "epoch": 0.1105625, + "grad_norm": 4.40625, + "grad_norm_var": 0.09412434895833334, + "learning_rate": 0.0001, + "loss": 6.6304, + "loss/crossentropy": 2.595758080482483, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.23197656869888306, + "step": 3538 + }, + { + "epoch": 0.110625, + "grad_norm": 3.734375, + "grad_norm_var": 0.09536031087239584, + "learning_rate": 0.0001, + "loss": 6.5308, + "loss/crossentropy": 2.606462597846985, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.2221251055598259, + "step": 3540 + }, + { + "epoch": 0.1106875, + "grad_norm": 4.09375, + "grad_norm_var": 0.10174153645833334, + "learning_rate": 0.0001, + "loss": 6.4257, + "loss/crossentropy": 2.4909168481826782, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.22277754545211792, + "step": 3542 + }, + { + "epoch": 0.11075, + "grad_norm": 4.40625, + "grad_norm_var": 0.10397135416666667, + "learning_rate": 0.0001, + "loss": 6.7788, + "loss/crossentropy": 2.604660987854004, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.23694781959056854, + "step": 3544 + }, + { + "epoch": 0.1108125, + "grad_norm": 4.125, + "grad_norm_var": 0.077587890625, + "learning_rate": 0.0001, + "loss": 6.3997, + "loss/crossentropy": 2.427910327911377, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.22608358412981033, + "step": 3546 + }, + { + "epoch": 0.110875, + "grad_norm": 3.65625, + "grad_norm_var": 0.06253153483072917, + "learning_rate": 0.0001, + "loss": 6.204, + "loss/crossentropy": 2.4499400854110718, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.2117353156208992, + "step": 3548 + }, + { + "epoch": 0.1109375, + "grad_norm": 4.5625, + "grad_norm_var": 0.08551025390625, + "learning_rate": 0.0001, + "loss": 6.8613, + "loss/crossentropy": 2.785742163658142, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.23216184228658676, + "step": 3550 + }, + { + "epoch": 0.111, + "grad_norm": 4.375, + "grad_norm_var": 0.08940327962239583, + "learning_rate": 0.0001, + "loss": 6.8523, + "loss/crossentropy": 2.646605968475342, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.24713413417339325, + "step": 3552 + }, + { + "epoch": 0.1110625, + "grad_norm": 3.6875, + "grad_norm_var": 0.08571675618489584, + "learning_rate": 0.0001, + "loss": 6.5563, + "loss/crossentropy": 2.618689775466919, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.22383636236190796, + "step": 3554 + }, + { + "epoch": 0.111125, + "grad_norm": 4.09375, + "grad_norm_var": 0.26513264973958334, + "learning_rate": 0.0001, + "loss": 6.6827, + "loss/crossentropy": 2.5601232051849365, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.23452390730381012, + "step": 3556 + }, + { + "epoch": 0.1111875, + "grad_norm": 3.640625, + "grad_norm_var": 0.26907145182291664, + "learning_rate": 0.0001, + "loss": 6.2314, + "loss/crossentropy": 2.4062451124191284, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.21611201018095016, + "step": 3558 + }, + { + "epoch": 0.11125, + "grad_norm": 3.578125, + "grad_norm_var": 0.2773671468098958, + "learning_rate": 0.0001, + "loss": 6.465, + "loss/crossentropy": 2.562098264694214, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.22115087509155273, + "step": 3560 + }, + { + "epoch": 0.1113125, + "grad_norm": 3.6875, + "grad_norm_var": 0.2841796875, + "learning_rate": 0.0001, + "loss": 6.3455, + "loss/crossentropy": 2.5443878173828125, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.21370669454336166, + "step": 3562 + }, + { + "epoch": 0.111375, + "grad_norm": 3.859375, + "grad_norm_var": 0.27547098795572916, + "learning_rate": 0.0001, + "loss": 6.4085, + "loss/crossentropy": 2.515444278717041, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.22134120762348175, + "step": 3564 + }, + { + "epoch": 0.1114375, + "grad_norm": 4.78125, + "grad_norm_var": 0.2936024983723958, + "learning_rate": 0.0001, + "loss": 6.2856, + "loss/crossentropy": 2.3123891353607178, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.2250596508383751, + "step": 3566 + }, + { + "epoch": 0.1115, + "grad_norm": 4.40625, + "grad_norm_var": 0.2957672119140625, + "learning_rate": 0.0001, + "loss": 6.6701, + "loss/crossentropy": 2.613504409790039, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.23065698146820068, + "step": 3568 + }, + { + "epoch": 0.1115625, + "grad_norm": 4.5625, + "grad_norm_var": 0.30191650390625, + "learning_rate": 0.0001, + "loss": 6.2952, + "loss/crossentropy": 2.3294789791107178, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.22352328151464462, + "step": 3570 + }, + { + "epoch": 0.111625, + "grad_norm": 4.0625, + "grad_norm_var": 0.11542561848958334, + "learning_rate": 0.0001, + "loss": 6.5475, + "loss/crossentropy": 2.4532060623168945, + "loss/hidden": 1.76953125, + "loss/jsd": 0.0, + "loss/logits": 0.23247961699962616, + "step": 3572 + }, + { + "epoch": 0.1116875, + "grad_norm": 3.875, + "grad_norm_var": 0.10655008951822917, + "learning_rate": 0.0001, + "loss": 6.4323, + "loss/crossentropy": 2.528500556945801, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.21928536146879196, + "step": 3574 + }, + { + "epoch": 0.11175, + "grad_norm": 4.5625, + "grad_norm_var": 0.3978352864583333, + "learning_rate": 0.0001, + "loss": 7.1899, + "loss/crossentropy": 2.9368622303009033, + "loss/hidden": 1.84765625, + "loss/jsd": 0.0, + "loss/logits": 0.24054250866174698, + "step": 3576 + }, + { + "epoch": 0.1118125, + "grad_norm": 4.1875, + "grad_norm_var": 0.37014567057291664, + "learning_rate": 0.0001, + "loss": 6.6702, + "loss/crossentropy": 2.626793384552002, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.23051141202449799, + "step": 3578 + }, + { + "epoch": 0.111875, + "grad_norm": 3.53125, + "grad_norm_var": 0.37301025390625, + "learning_rate": 0.0001, + "loss": 6.3652, + "loss/crossentropy": 2.3946304321289062, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.22322587668895721, + "step": 3580 + }, + { + "epoch": 0.1119375, + "grad_norm": 3.875, + "grad_norm_var": 0.38766276041666664, + "learning_rate": 0.0001, + "loss": 6.512, + "loss/crossentropy": 2.6459895372390747, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.219414621591568, + "step": 3582 + }, + { + "epoch": 0.112, + "grad_norm": 3.9375, + "grad_norm_var": 0.3851236979166667, + "learning_rate": 0.0001, + "loss": 6.5002, + "loss/crossentropy": 2.5649925470352173, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.22086559236049652, + "step": 3584 + }, + { + "epoch": 0.1120625, + "grad_norm": 3.984375, + "grad_norm_var": 0.3741048177083333, + "learning_rate": 0.0001, + "loss": 6.636, + "loss/crossentropy": 2.722179889678955, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.21716240048408508, + "step": 3586 + }, + { + "epoch": 0.112125, + "grad_norm": 4.5625, + "grad_norm_var": 0.387255859375, + "learning_rate": 0.0001, + "loss": 6.7132, + "loss/crossentropy": 2.6352118253707886, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.22889728099107742, + "step": 3588 + }, + { + "epoch": 0.1121875, + "grad_norm": 3.671875, + "grad_norm_var": 0.40722249348958334, + "learning_rate": 0.0001, + "loss": 6.5121, + "loss/crossentropy": 2.532341957092285, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.22219117730855942, + "step": 3590 + }, + { + "epoch": 0.11225, + "grad_norm": 4.125, + "grad_norm_var": 0.09814453125, + "learning_rate": 0.0001, + "loss": 6.7784, + "loss/crossentropy": 2.6766886711120605, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.23634084314107895, + "step": 3592 + }, + { + "epoch": 0.1123125, + "grad_norm": 4.125, + "grad_norm_var": 0.10041910807291667, + "learning_rate": 0.0001, + "loss": 6.839, + "loss/crossentropy": 2.7825467586517334, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.23181992769241333, + "step": 3594 + }, + { + "epoch": 0.112375, + "grad_norm": 4.25, + "grad_norm_var": 0.08924051920572916, + "learning_rate": 0.0001, + "loss": 6.8486, + "loss/crossentropy": 2.769296169281006, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.23527134954929352, + "step": 3596 + }, + { + "epoch": 0.1124375, + "grad_norm": 3.96875, + "grad_norm_var": 0.08483072916666666, + "learning_rate": 0.0001, + "loss": 6.3203, + "loss/crossentropy": 2.4583276510238647, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.2151002734899521, + "step": 3598 + }, + { + "epoch": 0.1125, + "grad_norm": 3.859375, + "grad_norm_var": 0.0872955322265625, + "learning_rate": 0.0001, + "loss": 6.4537, + "loss/crossentropy": 2.5872033834457397, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.21555408835411072, + "step": 3600 + }, + { + "epoch": 0.1125625, + "grad_norm": 3.65625, + "grad_norm_var": 0.10201822916666667, + "learning_rate": 0.0001, + "loss": 6.635, + "loss/crossentropy": 2.611227035522461, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.22894278168678284, + "step": 3602 + }, + { + "epoch": 0.112625, + "grad_norm": 3.765625, + "grad_norm_var": 0.08616129557291667, + "learning_rate": 0.0001, + "loss": 6.8173, + "loss/crossentropy": 2.761025547981262, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.23336273431777954, + "step": 3604 + }, + { + "epoch": 0.1126875, + "grad_norm": 3.640625, + "grad_norm_var": 0.06401265462239583, + "learning_rate": 0.0001, + "loss": 6.1301, + "loss/crossentropy": 2.4436473846435547, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.199115589261055, + "step": 3606 + }, + { + "epoch": 0.11275, + "grad_norm": 4.40625, + "grad_norm_var": 0.07748921712239583, + "learning_rate": 0.0001, + "loss": 6.5835, + "loss/crossentropy": 2.56386399269104, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.22930476069450378, + "step": 3608 + }, + { + "epoch": 0.1128125, + "grad_norm": 3.890625, + "grad_norm_var": 0.11940104166666667, + "learning_rate": 0.0001, + "loss": 6.7951, + "loss/crossentropy": 2.6115182638168335, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.2386746183037758, + "step": 3610 + }, + { + "epoch": 0.112875, + "grad_norm": 3.875, + "grad_norm_var": 0.11263020833333333, + "learning_rate": 0.0001, + "loss": 6.3766, + "loss/crossentropy": 2.4483206272125244, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.2221250981092453, + "step": 3612 + }, + { + "epoch": 0.1129375, + "grad_norm": 3.796875, + "grad_norm_var": 0.10579427083333333, + "learning_rate": 0.0001, + "loss": 6.7492, + "loss/crossentropy": 2.800732374191284, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.2198447734117508, + "step": 3614 + }, + { + "epoch": 0.113, + "grad_norm": 3.78125, + "grad_norm_var": 0.10694071451822916, + "learning_rate": 0.0001, + "loss": 6.9306, + "loss/crossentropy": 2.844248056411743, + "loss/hidden": 1.76171875, + "loss/jsd": 0.0, + "loss/logits": 0.23246607184410095, + "step": 3616 + }, + { + "epoch": 0.1130625, + "grad_norm": 4.5, + "grad_norm_var": 0.11110738118489584, + "learning_rate": 0.0001, + "loss": 6.5568, + "loss/crossentropy": 2.523680090904236, + "loss/hidden": 1.76171875, + "loss/jsd": 0.0, + "loss/logits": 0.22713688760995865, + "step": 3618 + }, + { + "epoch": 0.113125, + "grad_norm": 3.640625, + "grad_norm_var": 0.1162017822265625, + "learning_rate": 0.0001, + "loss": 6.2764, + "loss/crossentropy": 2.5017552375793457, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.20753761380910873, + "step": 3620 + }, + { + "epoch": 0.1131875, + "grad_norm": 4.03125, + "grad_norm_var": 0.1085113525390625, + "learning_rate": 0.0001, + "loss": 6.5934, + "loss/crossentropy": 2.6008695363998413, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.2297266125679016, + "step": 3622 + }, + { + "epoch": 0.11325, + "grad_norm": 4.34375, + "grad_norm_var": 0.09004618326822916, + "learning_rate": 0.0001, + "loss": 6.6576, + "loss/crossentropy": 2.6061915159225464, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.23014119267463684, + "step": 3624 + }, + { + "epoch": 0.1133125, + "grad_norm": 3.875, + "grad_norm_var": 0.0517578125, + "learning_rate": 0.0001, + "loss": 6.828, + "loss/crossentropy": 2.762625575065613, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.233097642660141, + "step": 3626 + }, + { + "epoch": 0.113375, + "grad_norm": 3.984375, + "grad_norm_var": 0.0546051025390625, + "learning_rate": 0.0001, + "loss": 6.4933, + "loss/crossentropy": 2.5349791049957275, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.2184930369257927, + "step": 3628 + }, + { + "epoch": 0.1134375, + "grad_norm": 4.1875, + "grad_norm_var": 0.06451416015625, + "learning_rate": 0.0001, + "loss": 6.8052, + "loss/crossentropy": 2.711672782897949, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.23591265082359314, + "step": 3630 + }, + { + "epoch": 0.1135, + "grad_norm": 4.03125, + "grad_norm_var": 0.0630523681640625, + "learning_rate": 0.0001, + "loss": 6.4884, + "loss/crossentropy": 2.4987112283706665, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.22787494212388992, + "step": 3632 + }, + { + "epoch": 0.1135625, + "grad_norm": 4.71875, + "grad_norm_var": 0.07967020670572916, + "learning_rate": 0.0001, + "loss": 6.7434, + "loss/crossentropy": 2.6017539501190186, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.2321312204003334, + "step": 3634 + }, + { + "epoch": 0.113625, + "grad_norm": 3.828125, + "grad_norm_var": 0.07819010416666666, + "learning_rate": 0.0001, + "loss": 6.7524, + "loss/crossentropy": 2.789211392402649, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.22600264102220535, + "step": 3636 + }, + { + "epoch": 0.1136875, + "grad_norm": 4.1875, + "grad_norm_var": 0.0706207275390625, + "learning_rate": 0.0001, + "loss": 6.9458, + "loss/crossentropy": 2.745774269104004, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.24265731871128082, + "step": 3638 + }, + { + "epoch": 0.11375, + "grad_norm": 3.984375, + "grad_norm_var": 0.07154541015625, + "learning_rate": 0.0001, + "loss": 7.0743, + "loss/crossentropy": 2.8300269842147827, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.24474012106657028, + "step": 3640 + }, + { + "epoch": 0.1138125, + "grad_norm": 3.859375, + "grad_norm_var": 0.0783111572265625, + "learning_rate": 0.0001, + "loss": 6.295, + "loss/crossentropy": 2.3331433534622192, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.2207912728190422, + "step": 3642 + }, + { + "epoch": 0.113875, + "grad_norm": 3.765625, + "grad_norm_var": 0.08606363932291666, + "learning_rate": 0.0001, + "loss": 6.7028, + "loss/crossentropy": 2.7445106506347656, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.2243453711271286, + "step": 3644 + }, + { + "epoch": 0.1139375, + "grad_norm": 4.125, + "grad_norm_var": 0.08013916015625, + "learning_rate": 0.0001, + "loss": 6.8632, + "loss/crossentropy": 2.7395485639572144, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.2373623326420784, + "step": 3646 + }, + { + "epoch": 0.114, + "grad_norm": 3.78125, + "grad_norm_var": 0.08299051920572917, + "learning_rate": 0.0001, + "loss": 6.9405, + "loss/crossentropy": 2.8632638454437256, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.23233596980571747, + "step": 3648 + }, + { + "epoch": 0.1140625, + "grad_norm": 10.625, + "grad_norm_var": 2.787287394205729, + "learning_rate": 0.0001, + "loss": 7.184, + "loss/crossentropy": 2.526134729385376, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.2864900380373001, + "step": 3650 + }, + { + "epoch": 0.114125, + "grad_norm": 4.15625, + "grad_norm_var": 2.727408854166667, + "learning_rate": 0.0001, + "loss": 6.6539, + "loss/crossentropy": 2.5402532815933228, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.23714881390333176, + "step": 3652 + }, + { + "epoch": 0.1141875, + "grad_norm": 4.21875, + "grad_norm_var": 2.713703409830729, + "learning_rate": 0.0001, + "loss": 6.467, + "loss/crossentropy": 2.4508432149887085, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.2289644032716751, + "step": 3654 + }, + { + "epoch": 0.11425, + "grad_norm": 4.625, + "grad_norm_var": 2.7493967692057293, + "learning_rate": 0.0001, + "loss": 6.9745, + "loss/crossentropy": 2.7536582946777344, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.24474085122346878, + "step": 3656 + }, + { + "epoch": 0.1143125, + "grad_norm": 4.09375, + "grad_norm_var": 2.7429850260416666, + "learning_rate": 0.0001, + "loss": 6.775, + "loss/crossentropy": 2.79840886592865, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.22461071610450745, + "step": 3658 + }, + { + "epoch": 0.114375, + "grad_norm": 3.71875, + "grad_norm_var": 2.7027659098307293, + "learning_rate": 0.0001, + "loss": 6.643, + "loss/crossentropy": 2.691011071205139, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.22332537174224854, + "step": 3660 + }, + { + "epoch": 0.1144375, + "grad_norm": 3.671875, + "grad_norm_var": 2.7574859619140626, + "learning_rate": 0.0001, + "loss": 6.3804, + "loss/crossentropy": 2.5471014976501465, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.2141846865415573, + "step": 3662 + }, + { + "epoch": 0.1145, + "grad_norm": 4.375, + "grad_norm_var": 2.7490875244140627, + "learning_rate": 0.0001, + "loss": 6.5308, + "loss/crossentropy": 2.5367395877838135, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.2294824793934822, + "step": 3664 + }, + { + "epoch": 0.1145625, + "grad_norm": 4.4375, + "grad_norm_var": 0.0964996337890625, + "learning_rate": 0.0001, + "loss": 6.7434, + "loss/crossentropy": 2.6196374893188477, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.23854342103004456, + "step": 3666 + }, + { + "epoch": 0.114625, + "grad_norm": 3.625, + "grad_norm_var": 0.1001373291015625, + "learning_rate": 0.0001, + "loss": 6.3404, + "loss/crossentropy": 2.478997230529785, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.21505115926265717, + "step": 3668 + }, + { + "epoch": 0.1146875, + "grad_norm": 4.0, + "grad_norm_var": 0.09562886555989583, + "learning_rate": 0.0001, + "loss": 6.8886, + "loss/crossentropy": 2.7907516956329346, + "loss/hidden": 1.76953125, + "loss/jsd": 0.0, + "loss/logits": 0.23283251374959946, + "step": 3670 + }, + { + "epoch": 0.11475, + "grad_norm": 4.15625, + "grad_norm_var": 0.06789957682291667, + "learning_rate": 0.0001, + "loss": 6.8005, + "loss/crossentropy": 2.735516667366028, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.23501239717006683, + "step": 3672 + }, + { + "epoch": 0.1148125, + "grad_norm": 3.765625, + "grad_norm_var": 0.07175191243489583, + "learning_rate": 0.0001, + "loss": 6.7163, + "loss/crossentropy": 2.6947312355041504, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.2294972613453865, + "step": 3674 + }, + { + "epoch": 0.114875, + "grad_norm": 3.984375, + "grad_norm_var": 0.05624593098958333, + "learning_rate": 0.0001, + "loss": 6.6487, + "loss/crossentropy": 2.6631579399108887, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.22668370604515076, + "step": 3676 + }, + { + "epoch": 0.1149375, + "grad_norm": 3.75, + "grad_norm_var": 0.05592041015625, + "learning_rate": 0.0001, + "loss": 6.8093, + "loss/crossentropy": 2.752833366394043, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.23220966756343842, + "step": 3678 + }, + { + "epoch": 0.115, + "grad_norm": 4.375, + "grad_norm_var": 0.053548177083333336, + "learning_rate": 0.0001, + "loss": 6.8694, + "loss/crossentropy": 2.7636189460754395, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.23674701154232025, + "step": 3680 + }, + { + "epoch": 0.1150625, + "grad_norm": 4.03125, + "grad_norm_var": 0.04224853515625, + "learning_rate": 0.0001, + "loss": 6.6997, + "loss/crossentropy": 2.702791452407837, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.21960802376270294, + "step": 3682 + }, + { + "epoch": 0.115125, + "grad_norm": 3.8125, + "grad_norm_var": 0.03583984375, + "learning_rate": 0.0001, + "loss": 6.6096, + "loss/crossentropy": 2.5904924869537354, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.2319922372698784, + "step": 3684 + }, + { + "epoch": 0.1151875, + "grad_norm": 4.03125, + "grad_norm_var": 0.05891825358072917, + "learning_rate": 0.0001, + "loss": 6.2208, + "loss/crossentropy": 2.417539119720459, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.2107919454574585, + "step": 3686 + }, + { + "epoch": 0.11525, + "grad_norm": 4.28125, + "grad_norm_var": 0.05426025390625, + "learning_rate": 0.0001, + "loss": 6.5656, + "loss/crossentropy": 2.5044217109680176, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.23268309235572815, + "step": 3688 + }, + { + "epoch": 0.1153125, + "grad_norm": 3.75, + "grad_norm_var": 0.05659077962239583, + "learning_rate": 0.0001, + "loss": 6.5324, + "loss/crossentropy": 2.5434666872024536, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.2238949090242386, + "step": 3690 + }, + { + "epoch": 0.115375, + "grad_norm": 3.953125, + "grad_norm_var": 0.25015360514322915, + "learning_rate": 0.0001, + "loss": 6.4936, + "loss/crossentropy": 2.470642566680908, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.22456327080726624, + "step": 3692 + }, + { + "epoch": 0.1154375, + "grad_norm": 4.0, + "grad_norm_var": 0.2470703125, + "learning_rate": 0.0001, + "loss": 6.7092, + "loss/crossentropy": 2.697585701942444, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.23045646399259567, + "step": 3694 + }, + { + "epoch": 0.1155, + "grad_norm": 3.9375, + "grad_norm_var": 0.24149983723958332, + "learning_rate": 0.0001, + "loss": 6.7061, + "loss/crossentropy": 2.6695618629455566, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.23177966475486755, + "step": 3696 + }, + { + "epoch": 0.1155625, + "grad_norm": 3.921875, + "grad_norm_var": 0.2424224853515625, + "learning_rate": 0.0001, + "loss": 6.2392, + "loss/crossentropy": 2.3701905012130737, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.21150615811347961, + "step": 3698 + }, + { + "epoch": 0.115625, + "grad_norm": 4.0625, + "grad_norm_var": 0.2453765869140625, + "learning_rate": 0.0001, + "loss": 6.4243, + "loss/crossentropy": 2.4719810485839844, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.2221815437078476, + "step": 3700 + }, + { + "epoch": 0.1156875, + "grad_norm": 3.90625, + "grad_norm_var": 0.22551167805989583, + "learning_rate": 0.0001, + "loss": 6.5181, + "loss/crossentropy": 2.5747958421707153, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.22245176136493683, + "step": 3702 + }, + { + "epoch": 0.11575, + "grad_norm": 3.65625, + "grad_norm_var": 0.23251546223958333, + "learning_rate": 0.0001, + "loss": 6.4551, + "loss/crossentropy": 2.557185173034668, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.2183024138212204, + "step": 3704 + }, + { + "epoch": 0.1158125, + "grad_norm": 4.03125, + "grad_norm_var": 0.23841145833333333, + "learning_rate": 0.0001, + "loss": 6.4098, + "loss/crossentropy": 2.527738094329834, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.21828729659318924, + "step": 3706 + }, + { + "epoch": 0.115875, + "grad_norm": 3.765625, + "grad_norm_var": 0.042708333333333334, + "learning_rate": 0.0001, + "loss": 6.6301, + "loss/crossentropy": 2.637513756752014, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.22776946425437927, + "step": 3708 + }, + { + "epoch": 0.1159375, + "grad_norm": 4.5, + "grad_norm_var": 0.062548828125, + "learning_rate": 0.0001, + "loss": 6.6892, + "loss/crossentropy": 2.666839361190796, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.23036272078752518, + "step": 3710 + }, + { + "epoch": 0.116, + "grad_norm": 3.75, + "grad_norm_var": 0.0642578125, + "learning_rate": 0.0001, + "loss": 6.757, + "loss/crossentropy": 2.6890220642089844, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.23218737542629242, + "step": 3712 + }, + { + "epoch": 0.1160625, + "grad_norm": 3.859375, + "grad_norm_var": 0.06907552083333333, + "learning_rate": 0.0001, + "loss": 6.4918, + "loss/crossentropy": 2.558487057685852, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.22340869903564453, + "step": 3714 + }, + { + "epoch": 0.116125, + "grad_norm": 4.25, + "grad_norm_var": 0.07427978515625, + "learning_rate": 0.0001, + "loss": 6.694, + "loss/crossentropy": 2.642730474472046, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.22778676450252533, + "step": 3716 + }, + { + "epoch": 0.1161875, + "grad_norm": 3.765625, + "grad_norm_var": 0.0743316650390625, + "learning_rate": 0.0001, + "loss": 6.6046, + "loss/crossentropy": 2.582736015319824, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.22640958428382874, + "step": 3718 + }, + { + "epoch": 0.11625, + "grad_norm": 4.0625, + "grad_norm_var": 0.07993876139322917, + "learning_rate": 0.0001, + "loss": 6.8234, + "loss/crossentropy": 2.6626198291778564, + "loss/hidden": 1.80078125, + "loss/jsd": 0.0, + "loss/logits": 0.23600252717733383, + "step": 3720 + }, + { + "epoch": 0.1163125, + "grad_norm": 3.953125, + "grad_norm_var": 0.07737630208333333, + "learning_rate": 0.0001, + "loss": 6.7206, + "loss/crossentropy": 2.554721474647522, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.23533476889133453, + "step": 3722 + }, + { + "epoch": 0.116375, + "grad_norm": 4.90625, + "grad_norm_var": 0.13069254557291668, + "learning_rate": 0.0001, + "loss": 6.7879, + "loss/crossentropy": 2.7431427240371704, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.2325975000858307, + "step": 3724 + }, + { + "epoch": 0.1164375, + "grad_norm": 4.3125, + "grad_norm_var": 21.46592508951823, + "learning_rate": 0.0001, + "loss": 6.9384, + "loss/crossentropy": 2.5549099445343018, + "loss/hidden": 1.87890625, + "loss/jsd": 0.0, + "loss/logits": 0.25046199560165405, + "step": 3726 + }, + { + "epoch": 0.1165, + "grad_norm": 7.46875, + "grad_norm_var": 21.633101399739584, + "learning_rate": 0.0001, + "loss": 7.0932, + "loss/crossentropy": 2.690193295478821, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.2629580646753311, + "step": 3728 + }, + { + "epoch": 0.1165625, + "grad_norm": 3.5625, + "grad_norm_var": 21.62051493326823, + "learning_rate": 0.0001, + "loss": 6.3362, + "loss/crossentropy": 2.4794082641601562, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.21380110830068588, + "step": 3730 + }, + { + "epoch": 0.116625, + "grad_norm": 4.03125, + "grad_norm_var": 21.58226623535156, + "learning_rate": 0.0001, + "loss": 7.0605, + "loss/crossentropy": 2.8624175786972046, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.2451971471309662, + "step": 3732 + }, + { + "epoch": 0.1166875, + "grad_norm": 3.828125, + "grad_norm_var": 21.607673136393228, + "learning_rate": 0.0001, + "loss": 6.5621, + "loss/crossentropy": 2.6379172801971436, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.22171756625175476, + "step": 3734 + }, + { + "epoch": 0.11675, + "grad_norm": 6.0625, + "grad_norm_var": 21.62181701660156, + "learning_rate": 0.0001, + "loss": 6.755, + "loss/crossentropy": 2.536095380783081, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.24298104643821716, + "step": 3736 + }, + { + "epoch": 0.1168125, + "grad_norm": 3.671875, + "grad_norm_var": 21.738084920247395, + "learning_rate": 0.0001, + "loss": 6.6708, + "loss/crossentropy": 2.7039905786514282, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.22636428475379944, + "step": 3738 + }, + { + "epoch": 0.116875, + "grad_norm": 3.890625, + "grad_norm_var": 21.764842732747397, + "learning_rate": 0.0001, + "loss": 6.7239, + "loss/crossentropy": 2.686025619506836, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.23073802888393402, + "step": 3740 + }, + { + "epoch": 0.1169375, + "grad_norm": 3.375, + "grad_norm_var": 1.1045857747395833, + "learning_rate": 0.0001, + "loss": 5.9112, + "loss/crossentropy": 2.2144126892089844, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.19663581252098083, + "step": 3742 + }, + { + "epoch": 0.117, + "grad_norm": 3.765625, + "grad_norm_var": 0.3494954427083333, + "learning_rate": 0.0001, + "loss": 6.4705, + "loss/crossentropy": 2.5193395614624023, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.2220740020275116, + "step": 3744 + }, + { + "epoch": 0.1170625, + "grad_norm": 3.9375, + "grad_norm_var": 0.33886311848958334, + "learning_rate": 0.0001, + "loss": 6.5074, + "loss/crossentropy": 2.5095298290252686, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.2279161512851715, + "step": 3746 + }, + { + "epoch": 0.117125, + "grad_norm": 4.375, + "grad_norm_var": 0.3591949462890625, + "learning_rate": 0.0001, + "loss": 6.3287, + "loss/crossentropy": 2.5012372732162476, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.21555764973163605, + "step": 3748 + }, + { + "epoch": 0.1171875, + "grad_norm": 4.25, + "grad_norm_var": 0.3813547770182292, + "learning_rate": 0.0001, + "loss": 6.5337, + "loss/crossentropy": 2.591525673866272, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.22234736382961273, + "step": 3750 + }, + { + "epoch": 0.11725, + "grad_norm": 4.3125, + "grad_norm_var": 0.10349833170572917, + "learning_rate": 0.0001, + "loss": 6.8932, + "loss/crossentropy": 2.746386170387268, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.2400682345032692, + "step": 3752 + }, + { + "epoch": 0.1173125, + "grad_norm": 3.90625, + "grad_norm_var": 0.10373942057291667, + "learning_rate": 0.0001, + "loss": 6.6784, + "loss/crossentropy": 2.580140709877014, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.23561152815818787, + "step": 3754 + }, + { + "epoch": 0.117375, + "grad_norm": 4.28125, + "grad_norm_var": 0.11262613932291667, + "learning_rate": 0.0001, + "loss": 6.4309, + "loss/crossentropy": 2.4955955743789673, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.22243603318929672, + "step": 3756 + }, + { + "epoch": 0.1174375, + "grad_norm": 3.96875, + "grad_norm_var": 0.09172770182291666, + "learning_rate": 0.0001, + "loss": 6.5724, + "loss/crossentropy": 2.580453872680664, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.2273193672299385, + "step": 3758 + }, + { + "epoch": 0.1175, + "grad_norm": 4.25, + "grad_norm_var": 0.10486551920572916, + "learning_rate": 0.0001, + "loss": 7.1613, + "loss/crossentropy": 2.834384322166443, + "loss/hidden": 1.81640625, + "loss/jsd": 0.0, + "loss/logits": 0.2510491907596588, + "step": 3760 + }, + { + "epoch": 0.1175625, + "grad_norm": 3.90625, + "grad_norm_var": 0.10698954264322917, + "learning_rate": 0.0001, + "loss": 6.5418, + "loss/crossentropy": 2.5571680068969727, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.220725879073143, + "step": 3762 + }, + { + "epoch": 0.117625, + "grad_norm": 4.15625, + "grad_norm_var": 0.10991109212239583, + "learning_rate": 0.0001, + "loss": 6.6508, + "loss/crossentropy": 2.6110875606536865, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.22662819921970367, + "step": 3764 + }, + { + "epoch": 0.1176875, + "grad_norm": 3.4375, + "grad_norm_var": 0.10447489420572917, + "learning_rate": 0.0001, + "loss": 6.5068, + "loss/crossentropy": 2.6286540031433105, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.2217947244644165, + "step": 3766 + }, + { + "epoch": 0.11775, + "grad_norm": 3.765625, + "grad_norm_var": 0.0976959228515625, + "learning_rate": 0.0001, + "loss": 6.2205, + "loss/crossentropy": 2.449827790260315, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.2059696912765503, + "step": 3768 + }, + { + "epoch": 0.1178125, + "grad_norm": 3.90625, + "grad_norm_var": 0.12255757649739583, + "learning_rate": 0.0001, + "loss": 6.7584, + "loss/crossentropy": 2.619004487991333, + "loss/hidden": 1.8203125, + "loss/jsd": 0.0, + "loss/logits": 0.23191242665052414, + "step": 3770 + }, + { + "epoch": 0.117875, + "grad_norm": 3.890625, + "grad_norm_var": 0.11604715983072916, + "learning_rate": 0.0001, + "loss": 6.4862, + "loss/crossentropy": 2.4522976875305176, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.2303444668650627, + "step": 3772 + }, + { + "epoch": 0.1179375, + "grad_norm": 4.3125, + "grad_norm_var": 0.12135009765625, + "learning_rate": 0.0001, + "loss": 6.9901, + "loss/crossentropy": 2.853781223297119, + "loss/hidden": 1.79296875, + "loss/jsd": 0.0, + "loss/logits": 0.2343396246433258, + "step": 3774 + }, + { + "epoch": 0.118, + "grad_norm": 4.4375, + "grad_norm_var": 0.11217041015625, + "learning_rate": 0.0001, + "loss": 6.8352, + "loss/crossentropy": 2.7490190267562866, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.2340097874403, + "step": 3776 + }, + { + "epoch": 0.1180625, + "grad_norm": 3.4375, + "grad_norm_var": 0.134814453125, + "learning_rate": 0.0001, + "loss": 6.4295, + "loss/crossentropy": 2.532248616218567, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.21746356040239334, + "step": 3778 + }, + { + "epoch": 0.118125, + "grad_norm": 4.09375, + "grad_norm_var": 0.11428120930989584, + "learning_rate": 0.0001, + "loss": 6.8923, + "loss/crossentropy": 2.812334418296814, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.2357269674539566, + "step": 3780 + }, + { + "epoch": 0.1181875, + "grad_norm": 4.09375, + "grad_norm_var": 0.09714253743489583, + "learning_rate": 0.0001, + "loss": 6.6251, + "loss/crossentropy": 2.6684558391571045, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.2257475033402443, + "step": 3782 + }, + { + "epoch": 0.11825, + "grad_norm": 3.984375, + "grad_norm_var": 0.07913004557291667, + "learning_rate": 0.0001, + "loss": 6.5944, + "loss/crossentropy": 2.530303120613098, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.228675976395607, + "step": 3784 + }, + { + "epoch": 0.1183125, + "grad_norm": 3.8125, + "grad_norm_var": 0.06564127604166667, + "learning_rate": 0.0001, + "loss": 6.8646, + "loss/crossentropy": 2.777267575263977, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.23685341328382492, + "step": 3786 + }, + { + "epoch": 0.118375, + "grad_norm": 4.03125, + "grad_norm_var": 0.07919820149739583, + "learning_rate": 0.0001, + "loss": 6.6104, + "loss/crossentropy": 2.720764994621277, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.21825769543647766, + "step": 3788 + }, + { + "epoch": 0.1184375, + "grad_norm": 3.71875, + "grad_norm_var": 0.0782623291015625, + "learning_rate": 0.0001, + "loss": 6.5611, + "loss/crossentropy": 2.617440700531006, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.22248947620391846, + "step": 3790 + }, + { + "epoch": 0.1185, + "grad_norm": 3.78125, + "grad_norm_var": 0.06314697265625, + "learning_rate": 0.0001, + "loss": 6.5085, + "loss/crossentropy": 2.5625646114349365, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.22310566902160645, + "step": 3792 + }, + { + "epoch": 0.1185625, + "grad_norm": 3.796875, + "grad_norm_var": 0.03997395833333333, + "learning_rate": 0.0001, + "loss": 6.8123, + "loss/crossentropy": 2.7593398094177246, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.23342178761959076, + "step": 3794 + }, + { + "epoch": 0.118625, + "grad_norm": 3.890625, + "grad_norm_var": 0.0342926025390625, + "learning_rate": 0.0001, + "loss": 6.43, + "loss/crossentropy": 2.5027835369110107, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.22123384475708008, + "step": 3796 + }, + { + "epoch": 0.1186875, + "grad_norm": 4.59375, + "grad_norm_var": 0.06208394368489583, + "learning_rate": 0.0001, + "loss": 7.0069, + "loss/crossentropy": 2.9035476446151733, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.23689740151166916, + "step": 3798 + }, + { + "epoch": 0.11875, + "grad_norm": 3.84375, + "grad_norm_var": 0.06220296223958333, + "learning_rate": 0.0001, + "loss": 6.7092, + "loss/crossentropy": 2.609599232673645, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.23574373871088028, + "step": 3800 + }, + { + "epoch": 0.1188125, + "grad_norm": 3.859375, + "grad_norm_var": 0.14558817545572916, + "learning_rate": 0.0001, + "loss": 6.6645, + "loss/crossentropy": 2.588888168334961, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.23372826725244522, + "step": 3802 + }, + { + "epoch": 0.118875, + "grad_norm": 3.40625, + "grad_norm_var": 0.16487528483072916, + "learning_rate": 0.0001, + "loss": 6.3971, + "loss/crossentropy": 2.6425379514694214, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.21139515936374664, + "step": 3804 + }, + { + "epoch": 0.1189375, + "grad_norm": 4.34375, + "grad_norm_var": 0.18788960774739583, + "learning_rate": 0.0001, + "loss": 6.4841, + "loss/crossentropy": 2.421231508255005, + "loss/hidden": 1.76953125, + "loss/jsd": 0.0, + "loss/logits": 0.2293347418308258, + "step": 3806 + }, + { + "epoch": 0.119, + "grad_norm": 5.28125, + "grad_norm_var": 0.2775349934895833, + "learning_rate": 0.0001, + "loss": 6.4633, + "loss/crossentropy": 2.3681161403656006, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2282676324248314, + "step": 3808 + }, + { + "epoch": 0.1190625, + "grad_norm": 3.953125, + "grad_norm_var": 0.2936197916666667, + "learning_rate": 0.0001, + "loss": 6.316, + "loss/crossentropy": 2.4912995100021362, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.21489524841308594, + "step": 3810 + }, + { + "epoch": 0.119125, + "grad_norm": 3.796875, + "grad_norm_var": 0.29673563639322914, + "learning_rate": 0.0001, + "loss": 6.6797, + "loss/crossentropy": 2.7319802045822144, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.22250615060329437, + "step": 3812 + }, + { + "epoch": 0.1191875, + "grad_norm": 3.875, + "grad_norm_var": 0.2789947509765625, + "learning_rate": 0.0001, + "loss": 6.9102, + "loss/crossentropy": 2.805300712585449, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.2366633266210556, + "step": 3814 + }, + { + "epoch": 0.11925, + "grad_norm": 4.1875, + "grad_norm_var": 0.2746246337890625, + "learning_rate": 0.0001, + "loss": 6.5752, + "loss/crossentropy": 2.6415826082229614, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.21875379979610443, + "step": 3816 + }, + { + "epoch": 0.1193125, + "grad_norm": 4.5, + "grad_norm_var": 0.21341145833333333, + "learning_rate": 0.0001, + "loss": 6.5314, + "loss/crossentropy": 2.5881221294403076, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.22089115530252457, + "step": 3818 + }, + { + "epoch": 0.119375, + "grad_norm": 13.125, + "grad_norm_var": 5.160139973958334, + "learning_rate": 0.0001, + "loss": 6.6154, + "loss/crossentropy": 2.380750298500061, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.2476859986782074, + "step": 3820 + }, + { + "epoch": 0.1194375, + "grad_norm": 3.828125, + "grad_norm_var": 5.204002888997396, + "learning_rate": 0.0001, + "loss": 6.6778, + "loss/crossentropy": 2.7415353059768677, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.22291918098926544, + "step": 3822 + }, + { + "epoch": 0.1195, + "grad_norm": 4.71875, + "grad_norm_var": 5.1928049723307295, + "learning_rate": 0.0001, + "loss": 7.1779, + "loss/crossentropy": 2.8588478565216064, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.25300103425979614, + "step": 3824 + }, + { + "epoch": 0.1195625, + "grad_norm": 4.3125, + "grad_norm_var": 5.140648396809896, + "learning_rate": 0.0001, + "loss": 6.5702, + "loss/crossentropy": 2.557085633277893, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.22669967263936996, + "step": 3826 + }, + { + "epoch": 0.119625, + "grad_norm": 4.4375, + "grad_norm_var": 5.0800120035807295, + "learning_rate": 0.0001, + "loss": 6.8565, + "loss/crossentropy": 2.6817139387130737, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.2444320172071457, + "step": 3828 + }, + { + "epoch": 0.1196875, + "grad_norm": 3.5, + "grad_norm_var": 5.15458984375, + "learning_rate": 0.0001, + "loss": 6.2591, + "loss/crossentropy": 2.42851722240448, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.21665531396865845, + "step": 3830 + }, + { + "epoch": 0.11975, + "grad_norm": 4.3125, + "grad_norm_var": 5.154011027018229, + "learning_rate": 0.0001, + "loss": 6.7774, + "loss/crossentropy": 2.70755672454834, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.23315252363681793, + "step": 3832 + }, + { + "epoch": 0.1198125, + "grad_norm": 4.59375, + "grad_norm_var": 5.165925089518229, + "learning_rate": 0.0001, + "loss": 6.8659, + "loss/crossentropy": 2.693860173225403, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.24063920229673386, + "step": 3834 + }, + { + "epoch": 0.119875, + "grad_norm": 3.859375, + "grad_norm_var": 0.14049072265625, + "learning_rate": 0.0001, + "loss": 6.7968, + "loss/crossentropy": 2.7572206258773804, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.2312999740242958, + "step": 3836 + }, + { + "epoch": 0.1199375, + "grad_norm": 4.09375, + "grad_norm_var": 0.10696512858072917, + "learning_rate": 0.0001, + "loss": 6.4984, + "loss/crossentropy": 2.5449635982513428, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.22463569045066833, + "step": 3838 + }, + { + "epoch": 0.12, + "grad_norm": 3.65625, + "grad_norm_var": 0.09641927083333333, + "learning_rate": 0.0001, + "loss": 6.4033, + "loss/crossentropy": 2.559556007385254, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.21288719773292542, + "step": 3840 + }, + { + "epoch": 0.1200625, + "grad_norm": 4.0625, + "grad_norm_var": 0.08531494140625, + "learning_rate": 0.0001, + "loss": 6.4479, + "loss/crossentropy": 2.5646849870681763, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.21644552052021027, + "step": 3842 + }, + { + "epoch": 0.120125, + "grad_norm": 3.953125, + "grad_norm_var": 0.07693583170572917, + "learning_rate": 0.0001, + "loss": 6.7152, + "loss/crossentropy": 2.7804479598999023, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.22355540096759796, + "step": 3844 + }, + { + "epoch": 0.1201875, + "grad_norm": 3.984375, + "grad_norm_var": 0.06265360514322917, + "learning_rate": 0.0001, + "loss": 6.7568, + "loss/crossentropy": 2.7693997621536255, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.2260872721672058, + "step": 3846 + }, + { + "epoch": 0.12025, + "grad_norm": 5.8125, + "grad_norm_var": 0.26751302083333334, + "learning_rate": 0.0001, + "loss": 6.9386, + "loss/crossentropy": 2.737673044204712, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.24353280663490295, + "step": 3848 + }, + { + "epoch": 0.1203125, + "grad_norm": 4.0, + "grad_norm_var": 0.42926025390625, + "learning_rate": 0.0001, + "loss": 6.6092, + "loss/crossentropy": 2.4701892137527466, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.23421593010425568, + "step": 3850 + }, + { + "epoch": 0.120375, + "grad_norm": 4.1875, + "grad_norm_var": 0.4547526041666667, + "learning_rate": 0.0001, + "loss": 6.4197, + "loss/crossentropy": 2.488594174385071, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.22005945444107056, + "step": 3852 + }, + { + "epoch": 0.1204375, + "grad_norm": 4.15625, + "grad_norm_var": 0.44224853515625, + "learning_rate": 0.0001, + "loss": 6.156, + "loss/crossentropy": 2.3247268199920654, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.21008112281560898, + "step": 3854 + }, + { + "epoch": 0.1205, + "grad_norm": 3.75, + "grad_norm_var": 0.4252675374348958, + "learning_rate": 0.0001, + "loss": 6.46, + "loss/crossentropy": 2.516259789466858, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.22366880625486374, + "step": 3856 + }, + { + "epoch": 0.1205625, + "grad_norm": 4.09375, + "grad_norm_var": 0.42454325358072914, + "learning_rate": 0.0001, + "loss": 6.6779, + "loss/crossentropy": 2.6965575218200684, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.22625887393951416, + "step": 3858 + }, + { + "epoch": 0.120625, + "grad_norm": 3.671875, + "grad_norm_var": 0.4255442301432292, + "learning_rate": 0.0001, + "loss": 6.2982, + "loss/crossentropy": 2.446297526359558, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.21058575063943863, + "step": 3860 + }, + { + "epoch": 0.1206875, + "grad_norm": 3.921875, + "grad_norm_var": 0.4435373942057292, + "learning_rate": 0.0001, + "loss": 6.7492, + "loss/crossentropy": 2.723504066467285, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.2279648631811142, + "step": 3862 + }, + { + "epoch": 0.12075, + "grad_norm": 3.828125, + "grad_norm_var": 0.2752604166666667, + "learning_rate": 0.0001, + "loss": 6.3156, + "loss/crossentropy": 2.4643293619155884, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.2163769155740738, + "step": 3864 + }, + { + "epoch": 0.1208125, + "grad_norm": 3.984375, + "grad_norm_var": 0.07724507649739583, + "learning_rate": 0.0001, + "loss": 6.578, + "loss/crossentropy": 2.6052640676498413, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.22696109861135483, + "step": 3866 + }, + { + "epoch": 0.120875, + "grad_norm": 4.1875, + "grad_norm_var": 0.0659088134765625, + "learning_rate": 0.0001, + "loss": 6.4002, + "loss/crossentropy": 2.4768152236938477, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.22280435264110565, + "step": 3868 + }, + { + "epoch": 0.1209375, + "grad_norm": 3.609375, + "grad_norm_var": 0.05943603515625, + "learning_rate": 0.0001, + "loss": 6.4668, + "loss/crossentropy": 2.5721131563186646, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.22150424122810364, + "step": 3870 + }, + { + "epoch": 0.121, + "grad_norm": 3.921875, + "grad_norm_var": 0.058958943684895834, + "learning_rate": 0.0001, + "loss": 6.5498, + "loss/crossentropy": 2.6573235988616943, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.22010327875614166, + "step": 3872 + }, + { + "epoch": 0.1210625, + "grad_norm": 4.28125, + "grad_norm_var": 0.060400390625, + "learning_rate": 0.0001, + "loss": 6.1348, + "loss/crossentropy": 2.4366344213485718, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.20107021182775497, + "step": 3874 + }, + { + "epoch": 0.121125, + "grad_norm": 3.984375, + "grad_norm_var": 0.044905598958333334, + "learning_rate": 0.0001, + "loss": 6.6133, + "loss/crossentropy": 2.625810146331787, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.22570153325796127, + "step": 3876 + }, + { + "epoch": 0.1211875, + "grad_norm": 4.1875, + "grad_norm_var": 0.050699869791666664, + "learning_rate": 0.0001, + "loss": 6.3149, + "loss/crossentropy": 2.4852793216705322, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.2165529802441597, + "step": 3878 + }, + { + "epoch": 0.12125, + "grad_norm": 3.9375, + "grad_norm_var": 0.05750223795572917, + "learning_rate": 0.0001, + "loss": 6.4477, + "loss/crossentropy": 2.55775785446167, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.22141185402870178, + "step": 3880 + }, + { + "epoch": 0.1213125, + "grad_norm": 4.0625, + "grad_norm_var": 0.06702067057291666, + "learning_rate": 0.0001, + "loss": 6.6623, + "loss/crossentropy": 2.6345800161361694, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.22972697764635086, + "step": 3882 + }, + { + "epoch": 0.121375, + "grad_norm": 4.25, + "grad_norm_var": 0.06428934733072916, + "learning_rate": 0.0001, + "loss": 6.9427, + "loss/crossentropy": 2.794018030166626, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.24182262271642685, + "step": 3884 + }, + { + "epoch": 0.1214375, + "grad_norm": 3.71875, + "grad_norm_var": 0.060445149739583336, + "learning_rate": 0.0001, + "loss": 6.6471, + "loss/crossentropy": 2.7294814586639404, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.22262265533208847, + "step": 3886 + }, + { + "epoch": 0.1215, + "grad_norm": 3.53125, + "grad_norm_var": 0.0771148681640625, + "learning_rate": 0.0001, + "loss": 6.5541, + "loss/crossentropy": 2.7204082012176514, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.21188200265169144, + "step": 3888 + }, + { + "epoch": 0.1215625, + "grad_norm": 3.578125, + "grad_norm_var": 0.09329427083333333, + "learning_rate": 0.0001, + "loss": 6.6385, + "loss/crossentropy": 2.7236995697021484, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.22312351316213608, + "step": 3890 + }, + { + "epoch": 0.121625, + "grad_norm": 3.96875, + "grad_norm_var": 0.09780985514322917, + "learning_rate": 0.0001, + "loss": 6.6929, + "loss/crossentropy": 2.778989553451538, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.2195180281996727, + "step": 3892 + }, + { + "epoch": 0.1216875, + "grad_norm": 3.59375, + "grad_norm_var": 0.10478108723958333, + "learning_rate": 0.0001, + "loss": 6.6652, + "loss/crossentropy": 2.6517540216445923, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.22790998965501785, + "step": 3894 + }, + { + "epoch": 0.12175, + "grad_norm": 3.765625, + "grad_norm_var": 0.10888264973958334, + "learning_rate": 0.0001, + "loss": 6.5588, + "loss/crossentropy": 2.6892133951187134, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.21742477267980576, + "step": 3896 + }, + { + "epoch": 0.1218125, + "grad_norm": 3.828125, + "grad_norm_var": 0.096044921875, + "learning_rate": 0.0001, + "loss": 6.3872, + "loss/crossentropy": 2.534321904182434, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.21693114936351776, + "step": 3898 + }, + { + "epoch": 0.121875, + "grad_norm": 3.71875, + "grad_norm_var": 0.07714436848958334, + "learning_rate": 0.0001, + "loss": 6.4921, + "loss/crossentropy": 2.593091368675232, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.21763041615486145, + "step": 3900 + }, + { + "epoch": 0.1219375, + "grad_norm": 4.1875, + "grad_norm_var": 0.08701171875, + "learning_rate": 0.0001, + "loss": 6.5717, + "loss/crossentropy": 2.5926719903945923, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.22290125489234924, + "step": 3902 + }, + { + "epoch": 0.122, + "grad_norm": 3.71875, + "grad_norm_var": 0.06660868326822916, + "learning_rate": 0.0001, + "loss": 6.6787, + "loss/crossentropy": 2.6835639476776123, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.2291964739561081, + "step": 3904 + }, + { + "epoch": 0.1220625, + "grad_norm": 4.3125, + "grad_norm_var": 0.07808329264322916, + "learning_rate": 0.0001, + "loss": 6.485, + "loss/crossentropy": 2.571964383125305, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.22138310968875885, + "step": 3906 + }, + { + "epoch": 0.122125, + "grad_norm": 4.0625, + "grad_norm_var": 0.06379292805989584, + "learning_rate": 0.0001, + "loss": 6.6788, + "loss/crossentropy": 2.6926345825195312, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.2247883677482605, + "step": 3908 + }, + { + "epoch": 0.1221875, + "grad_norm": 5.03125, + "grad_norm_var": 0.1368560791015625, + "learning_rate": 0.0001, + "loss": 5.9859, + "loss/crossentropy": 2.3010092973709106, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.19700200110673904, + "step": 3910 + }, + { + "epoch": 0.12225, + "grad_norm": 4.25, + "grad_norm_var": 0.13918863932291667, + "learning_rate": 0.0001, + "loss": 6.426, + "loss/crossentropy": 2.584962010383606, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.2141772359609604, + "step": 3912 + }, + { + "epoch": 0.1223125, + "grad_norm": 3.75, + "grad_norm_var": 0.13753255208333334, + "learning_rate": 0.0001, + "loss": 6.4614, + "loss/crossentropy": 2.509815812110901, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.22484420239925385, + "step": 3914 + }, + { + "epoch": 0.122375, + "grad_norm": 4.0, + "grad_norm_var": 0.15249735514322918, + "learning_rate": 0.0001, + "loss": 6.3401, + "loss/crossentropy": 2.4113447666168213, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.21904617547988892, + "step": 3916 + }, + { + "epoch": 0.1224375, + "grad_norm": 10.625, + "grad_norm_var": 2.94146728515625, + "learning_rate": 0.0001, + "loss": 7.0081, + "loss/crossentropy": 2.5848923921585083, + "loss/hidden": 1.89453125, + "loss/jsd": 0.0, + "loss/logits": 0.25286970287561417, + "step": 3918 + }, + { + "epoch": 0.1225, + "grad_norm": 5.5625, + "grad_norm_var": 3.3592844645182294, + "learning_rate": 0.0001, + "loss": 7.1548, + "loss/crossentropy": 2.74170184135437, + "loss/hidden": 1.82421875, + "loss/jsd": 0.0, + "loss/logits": 0.2588881254196167, + "step": 3920 + }, + { + "epoch": 0.1225625, + "grad_norm": 3.984375, + "grad_norm_var": 3.308463541666667, + "learning_rate": 0.0001, + "loss": 6.5678, + "loss/crossentropy": 2.5315656661987305, + "loss/hidden": 1.78515625, + "loss/jsd": 0.0, + "loss/logits": 0.22511044144630432, + "step": 3922 + }, + { + "epoch": 0.122625, + "grad_norm": 4.46875, + "grad_norm_var": 3.26783447265625, + "learning_rate": 0.0001, + "loss": 6.649, + "loss/crossentropy": 2.4766751527786255, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.2430151253938675, + "step": 3924 + }, + { + "epoch": 0.1226875, + "grad_norm": 3.53125, + "grad_norm_var": 3.360798136393229, + "learning_rate": 0.0001, + "loss": 6.3786, + "loss/crossentropy": 2.5347131490707397, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.21212033182382584, + "step": 3926 + }, + { + "epoch": 0.12275, + "grad_norm": 3.9375, + "grad_norm_var": 3.3823232014973956, + "learning_rate": 0.0001, + "loss": 6.5134, + "loss/crossentropy": 2.561252474784851, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.22451211512088776, + "step": 3928 + }, + { + "epoch": 0.1228125, + "grad_norm": 4.28125, + "grad_norm_var": 3.322021484375, + "learning_rate": 0.0001, + "loss": 6.4125, + "loss/crossentropy": 2.4684640169143677, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.22292188555002213, + "step": 3930 + }, + { + "epoch": 0.122875, + "grad_norm": 3.90625, + "grad_norm_var": 3.2751617431640625, + "learning_rate": 0.0001, + "loss": 6.6115, + "loss/crossentropy": 2.600993514060974, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.22604826837778091, + "step": 3932 + }, + { + "epoch": 0.1229375, + "grad_norm": 3.8125, + "grad_norm_var": 0.7657389322916667, + "learning_rate": 0.0001, + "loss": 6.5081, + "loss/crossentropy": 2.5906589031219482, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.21791699528694153, + "step": 3934 + }, + { + "epoch": 0.123, + "grad_norm": 3.640625, + "grad_norm_var": 0.08106180826822916, + "learning_rate": 0.0001, + "loss": 6.5433, + "loss/crossentropy": 2.596111297607422, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.2240198478102684, + "step": 3936 + }, + { + "epoch": 0.1230625, + "grad_norm": 4.84375, + "grad_norm_var": 0.203173828125, + "learning_rate": 0.0001, + "loss": 7.1452, + "loss/crossentropy": 2.843635082244873, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.24968495965003967, + "step": 3938 + }, + { + "epoch": 0.123125, + "grad_norm": 3.6875, + "grad_norm_var": 0.19171549479166666, + "learning_rate": 0.0001, + "loss": 6.8274, + "loss/crossentropy": 2.7907726764678955, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.23061522096395493, + "step": 3940 + }, + { + "epoch": 0.1231875, + "grad_norm": 3.734375, + "grad_norm_var": 0.17717997233072916, + "learning_rate": 0.0001, + "loss": 6.6786, + "loss/crossentropy": 2.7477582693099976, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.22550544887781143, + "step": 3942 + }, + { + "epoch": 0.12325, + "grad_norm": 3.875, + "grad_norm_var": 0.17346598307291666, + "learning_rate": 0.0001, + "loss": 6.4862, + "loss/crossentropy": 2.572945475578308, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.22218024730682373, + "step": 3944 + }, + { + "epoch": 0.1233125, + "grad_norm": 3.65625, + "grad_norm_var": 0.2100982666015625, + "learning_rate": 0.0001, + "loss": 6.6538, + "loss/crossentropy": 2.5931992530822754, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.23379892855882645, + "step": 3946 + }, + { + "epoch": 0.123375, + "grad_norm": 4.71875, + "grad_norm_var": 0.23860677083333334, + "learning_rate": 0.0001, + "loss": 6.928, + "loss/crossentropy": 2.7630093097686768, + "loss/hidden": 1.76953125, + "loss/jsd": 0.0, + "loss/logits": 0.23954720050096512, + "step": 3948 + }, + { + "epoch": 0.1234375, + "grad_norm": 3.953125, + "grad_norm_var": 0.23022359212239582, + "learning_rate": 0.0001, + "loss": 7.0825, + "loss/crossentropy": 2.878159523010254, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.24269741028547287, + "step": 3950 + }, + { + "epoch": 0.1235, + "grad_norm": 5.65625, + "grad_norm_var": 0.4005045572916667, + "learning_rate": 0.0001, + "loss": 6.4622, + "loss/crossentropy": 2.620681405067444, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.21735333651304245, + "step": 3952 + }, + { + "epoch": 0.1235625, + "grad_norm": 4.375, + "grad_norm_var": 0.3162180582682292, + "learning_rate": 0.0001, + "loss": 6.6901, + "loss/crossentropy": 2.6496429443359375, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.23138756304979324, + "step": 3954 + }, + { + "epoch": 0.123625, + "grad_norm": 4.1875, + "grad_norm_var": 0.31428934733072916, + "learning_rate": 0.0001, + "loss": 6.4552, + "loss/crossentropy": 2.529421091079712, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.219528928399086, + "step": 3956 + }, + { + "epoch": 0.1236875, + "grad_norm": 4.15625, + "grad_norm_var": 0.2940338134765625, + "learning_rate": 0.0001, + "loss": 6.5467, + "loss/crossentropy": 2.5284109115600586, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.230732724070549, + "step": 3958 + }, + { + "epoch": 0.12375, + "grad_norm": 5.125, + "grad_norm_var": 0.3382639567057292, + "learning_rate": 0.0001, + "loss": 7.0126, + "loss/crossentropy": 2.92387056350708, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.23621230572462082, + "step": 3960 + }, + { + "epoch": 0.1238125, + "grad_norm": 3.734375, + "grad_norm_var": 0.31686197916666664, + "learning_rate": 0.0001, + "loss": 6.4582, + "loss/crossentropy": 2.526583433151245, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.22324207425117493, + "step": 3962 + }, + { + "epoch": 0.123875, + "grad_norm": 3.90625, + "grad_norm_var": 0.2927805582682292, + "learning_rate": 0.0001, + "loss": 6.6438, + "loss/crossentropy": 2.6276822090148926, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.2281695306301117, + "step": 3964 + }, + { + "epoch": 0.1239375, + "grad_norm": 3.828125, + "grad_norm_var": 0.29655659993489586, + "learning_rate": 0.0001, + "loss": 6.905, + "loss/crossentropy": 2.866371512413025, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.23003342747688293, + "step": 3966 + }, + { + "epoch": 0.124, + "grad_norm": 3.40625, + "grad_norm_var": 0.14641927083333334, + "learning_rate": 0.0001, + "loss": 6.8189, + "loss/crossentropy": 2.93355929851532, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.21705003827810287, + "step": 3968 + }, + { + "epoch": 0.1240625, + "grad_norm": 3.375, + "grad_norm_var": 0.16176656087239583, + "learning_rate": 0.0001, + "loss": 6.5506, + "loss/crossentropy": 2.607635974884033, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.2200758457183838, + "step": 3970 + }, + { + "epoch": 0.124125, + "grad_norm": 4.0625, + "grad_norm_var": 0.1582916259765625, + "learning_rate": 0.0001, + "loss": 6.6297, + "loss/crossentropy": 2.632441759109497, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.22823940962553024, + "step": 3972 + }, + { + "epoch": 0.1241875, + "grad_norm": 3.453125, + "grad_norm_var": 0.1716461181640625, + "learning_rate": 0.0001, + "loss": 6.3758, + "loss/crossentropy": 2.6422054767608643, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.2046114206314087, + "step": 3974 + }, + { + "epoch": 0.12425, + "grad_norm": 3.78125, + "grad_norm_var": 0.06135152180989583, + "learning_rate": 0.0001, + "loss": 6.7175, + "loss/crossentropy": 2.7409368753433228, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.2250024750828743, + "step": 3976 + }, + { + "epoch": 0.1243125, + "grad_norm": 3.734375, + "grad_norm_var": 0.05761311848958333, + "learning_rate": 0.0001, + "loss": 6.096, + "loss/crossentropy": 2.399104118347168, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.2017187550663948, + "step": 3978 + }, + { + "epoch": 0.124375, + "grad_norm": 3.859375, + "grad_norm_var": 0.08145243326822917, + "learning_rate": 0.0001, + "loss": 6.5613, + "loss/crossentropy": 2.7199209928512573, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.20992152392864227, + "step": 3980 + }, + { + "epoch": 0.1244375, + "grad_norm": 4.1875, + "grad_norm_var": 0.09019775390625, + "learning_rate": 0.0001, + "loss": 6.7587, + "loss/crossentropy": 2.6355478763580322, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.23653244227170944, + "step": 3982 + }, + { + "epoch": 0.1245, + "grad_norm": 4.0, + "grad_norm_var": 0.0809478759765625, + "learning_rate": 0.0001, + "loss": 6.6411, + "loss/crossentropy": 2.5781519412994385, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.23051701486110687, + "step": 3984 + }, + { + "epoch": 0.1245625, + "grad_norm": 3.453125, + "grad_norm_var": 0.06982014973958334, + "learning_rate": 0.0001, + "loss": 6.0452, + "loss/crossentropy": 2.3460733890533447, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.2062443271279335, + "step": 3986 + }, + { + "epoch": 0.124625, + "grad_norm": 3.71875, + "grad_norm_var": 0.06868387858072916, + "learning_rate": 0.0001, + "loss": 6.2594, + "loss/crossentropy": 2.419608950614929, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.20937252044677734, + "step": 3988 + }, + { + "epoch": 0.1246875, + "grad_norm": 3.71875, + "grad_norm_var": 0.0579986572265625, + "learning_rate": 0.0001, + "loss": 6.3339, + "loss/crossentropy": 2.4288982152938843, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.22058139741420746, + "step": 3990 + }, + { + "epoch": 0.12475, + "grad_norm": 6.6875, + "grad_norm_var": 0.5380279541015625, + "learning_rate": 0.0001, + "loss": 6.838, + "loss/crossentropy": 2.5921448469161987, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.24646371603012085, + "step": 3992 + }, + { + "epoch": 0.1248125, + "grad_norm": 4.1875, + "grad_norm_var": 0.5374582926432292, + "learning_rate": 0.0001, + "loss": 6.9783, + "loss/crossentropy": 2.6753528118133545, + "loss/hidden": 1.76953125, + "loss/jsd": 0.0, + "loss/logits": 0.25334077328443527, + "step": 3994 + }, + { + "epoch": 0.124875, + "grad_norm": 3.890625, + "grad_norm_var": 0.5369781494140625, + "learning_rate": 0.0001, + "loss": 6.3665, + "loss/crossentropy": 2.5173572301864624, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.20991378277540207, + "step": 3996 + }, + { + "epoch": 0.1249375, + "grad_norm": 4.0, + "grad_norm_var": 0.53902587890625, + "learning_rate": 0.0001, + "loss": 6.4783, + "loss/crossentropy": 2.622048854827881, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.21140175312757492, + "step": 3998 + }, + { + "epoch": 0.125, + "grad_norm": 4.75, + "grad_norm_var": 0.5801432291666667, + "learning_rate": 0.0001, + "loss": 6.6491, + "loss/crossentropy": 2.7371960878372192, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.2208828255534172, + "step": 4000 + }, + { + "epoch": 0.1250625, + "grad_norm": 4.65625, + "grad_norm_var": 0.5559804280598958, + "learning_rate": 0.0001, + "loss": 6.7283, + "loss/crossentropy": 2.761434555053711, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.22364237159490585, + "step": 4002 + }, + { + "epoch": 0.125125, + "grad_norm": 4.40625, + "grad_norm_var": 0.5261545817057292, + "learning_rate": 0.0001, + "loss": 6.6485, + "loss/crossentropy": 2.512582302093506, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.23859276622533798, + "step": 4004 + }, + { + "epoch": 0.1251875, + "grad_norm": 4.0, + "grad_norm_var": 0.49958394368489584, + "learning_rate": 0.0001, + "loss": 6.6833, + "loss/crossentropy": 2.5966222286224365, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.23327559977769852, + "step": 4006 + }, + { + "epoch": 0.12525, + "grad_norm": 3.890625, + "grad_norm_var": 0.1353912353515625, + "learning_rate": 0.0001, + "loss": 6.2893, + "loss/crossentropy": 2.4675627946853638, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.2126445323228836, + "step": 4008 + }, + { + "epoch": 0.1253125, + "grad_norm": 4.75, + "grad_norm_var": 0.14102274576822918, + "learning_rate": 0.0001, + "loss": 6.5078, + "loss/crossentropy": 2.4831652641296387, + "loss/hidden": 1.78515625, + "loss/jsd": 0.0, + "loss/logits": 0.22395003587007523, + "step": 4010 + }, + { + "epoch": 0.125375, + "grad_norm": 4.6875, + "grad_norm_var": 0.15494384765625, + "learning_rate": 0.0001, + "loss": 6.6971, + "loss/crossentropy": 2.6222360134124756, + "loss/hidden": 1.796875, + "loss/jsd": 0.0, + "loss/logits": 0.22779580950737, + "step": 4012 + }, + { + "epoch": 0.1254375, + "grad_norm": 3.90625, + "grad_norm_var": 0.15995992024739583, + "learning_rate": 0.0001, + "loss": 6.6417, + "loss/crossentropy": 2.7136658430099487, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.22092662006616592, + "step": 4014 + }, + { + "epoch": 0.1255, + "grad_norm": 3.859375, + "grad_norm_var": 0.13006184895833334, + "learning_rate": 0.0001, + "loss": 6.5771, + "loss/crossentropy": 2.63912570476532, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.22387553751468658, + "step": 4016 + }, + { + "epoch": 0.1255625, + "grad_norm": 4.34375, + "grad_norm_var": 0.3907552083333333, + "learning_rate": 0.0001, + "loss": 7.2111, + "loss/crossentropy": 2.8089661598205566, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2597440257668495, + "step": 4018 + }, + { + "epoch": 0.125625, + "grad_norm": 4.21875, + "grad_norm_var": 0.40019429524739586, + "learning_rate": 0.0001, + "loss": 6.7308, + "loss/crossentropy": 2.6494024991989136, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.23391643166542053, + "step": 4020 + }, + { + "epoch": 0.1256875, + "grad_norm": 3.96875, + "grad_norm_var": 0.406884765625, + "learning_rate": 0.0001, + "loss": 6.8903, + "loss/crossentropy": 2.84161376953125, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.23064526915550232, + "step": 4022 + }, + { + "epoch": 0.12575, + "grad_norm": 5.15625, + "grad_norm_var": 0.43072509765625, + "learning_rate": 0.0001, + "loss": 7.1108, + "loss/crossentropy": 2.7824318408966064, + "loss/hidden": 1.8125, + "loss/jsd": 0.0, + "loss/logits": 0.2515895813703537, + "step": 4024 + }, + { + "epoch": 0.1258125, + "grad_norm": 3.921875, + "grad_norm_var": 0.4175771077473958, + "learning_rate": 0.0001, + "loss": 6.5405, + "loss/crossentropy": 2.5641770362854004, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.22731705754995346, + "step": 4026 + }, + { + "epoch": 0.125875, + "grad_norm": 3.5625, + "grad_norm_var": 0.4420572916666667, + "learning_rate": 0.0001, + "loss": 6.4007, + "loss/crossentropy": 2.6236324310302734, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.2062263935804367, + "step": 4028 + }, + { + "epoch": 0.1259375, + "grad_norm": 4.09375, + "grad_norm_var": 0.44380594889322916, + "learning_rate": 0.0001, + "loss": 6.1651, + "loss/crossentropy": 2.388764262199402, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.2084936499595642, + "step": 4030 + }, + { + "epoch": 0.126, + "grad_norm": 6.90625, + "grad_norm_var": 0.8959299723307291, + "learning_rate": 0.0001, + "loss": 6.8036, + "loss/crossentropy": 2.6765114068984985, + "loss/hidden": 1.76953125, + "loss/jsd": 0.0, + "loss/logits": 0.2357548102736473, + "step": 4032 + }, + { + "epoch": 0.1260625, + "grad_norm": 3.671875, + "grad_norm_var": 0.6935831705729166, + "learning_rate": 0.0001, + "loss": 6.3616, + "loss/crossentropy": 2.583994150161743, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.21330882608890533, + "step": 4034 + }, + { + "epoch": 0.126125, + "grad_norm": 3.9375, + "grad_norm_var": 0.68369140625, + "learning_rate": 0.0001, + "loss": 6.6707, + "loss/crossentropy": 2.6019418239593506, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.23304607719182968, + "step": 4036 + }, + { + "epoch": 0.1261875, + "grad_norm": 4.09375, + "grad_norm_var": 0.6779612223307292, + "learning_rate": 0.0001, + "loss": 6.7854, + "loss/crossentropy": 2.8110634088516235, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.22712517529726028, + "step": 4038 + }, + { + "epoch": 0.12625, + "grad_norm": 4.34375, + "grad_norm_var": 0.61314697265625, + "learning_rate": 0.0001, + "loss": 7.036, + "loss/crossentropy": 2.830755352973938, + "loss/hidden": 1.76171875, + "loss/jsd": 0.0, + "loss/logits": 0.2443506345152855, + "step": 4040 + }, + { + "epoch": 0.1263125, + "grad_norm": 3.984375, + "grad_norm_var": 0.6087198893229167, + "learning_rate": 0.0001, + "loss": 6.6535, + "loss/crossentropy": 2.6730138063430786, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.2289079651236534, + "step": 4042 + }, + { + "epoch": 0.126375, + "grad_norm": 3.5625, + "grad_norm_var": 0.60465087890625, + "learning_rate": 0.0001, + "loss": 6.5962, + "loss/crossentropy": 2.6363285779953003, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.22567617148160934, + "step": 4044 + }, + { + "epoch": 0.1264375, + "grad_norm": 3.6875, + "grad_norm_var": 0.6362782796223958, + "learning_rate": 0.0001, + "loss": 6.6012, + "loss/crossentropy": 2.7529706954956055, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.2164599373936653, + "step": 4046 + }, + { + "epoch": 0.1265, + "grad_norm": 3.71875, + "grad_norm_var": 0.07694905598958333, + "learning_rate": 0.0001, + "loss": 6.2872, + "loss/crossentropy": 2.568008065223694, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.20707439631223679, + "step": 4048 + }, + { + "epoch": 0.1265625, + "grad_norm": 4.03125, + "grad_norm_var": 0.06897379557291666, + "learning_rate": 0.0001, + "loss": 6.7791, + "loss/crossentropy": 2.7470881938934326, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.22820251435041428, + "step": 4050 + }, + { + "epoch": 0.126625, + "grad_norm": 3.921875, + "grad_norm_var": 0.06415913899739584, + "learning_rate": 0.0001, + "loss": 6.6669, + "loss/crossentropy": 2.7223349809646606, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.2229759618639946, + "step": 4052 + }, + { + "epoch": 0.1266875, + "grad_norm": 4.0625, + "grad_norm_var": 0.06628316243489583, + "learning_rate": 0.0001, + "loss": 6.6013, + "loss/crossentropy": 2.7354965209960938, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.21587659418582916, + "step": 4054 + }, + { + "epoch": 0.12675, + "grad_norm": 4.09375, + "grad_norm_var": 0.05424702962239583, + "learning_rate": 0.0001, + "loss": 6.8119, + "loss/crossentropy": 2.8012603521347046, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.22879604250192642, + "step": 4056 + }, + { + "epoch": 0.1268125, + "grad_norm": 4.125, + "grad_norm_var": 0.05943603515625, + "learning_rate": 0.0001, + "loss": 6.4312, + "loss/crossentropy": 2.5267443656921387, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.21817763149738312, + "step": 4058 + }, + { + "epoch": 0.126875, + "grad_norm": 3.546875, + "grad_norm_var": 0.0608306884765625, + "learning_rate": 0.0001, + "loss": 6.6004, + "loss/crossentropy": 2.601567268371582, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.2241053283214569, + "step": 4060 + }, + { + "epoch": 0.1269375, + "grad_norm": 4.21875, + "grad_norm_var": 0.05134989420572917, + "learning_rate": 0.0001, + "loss": 6.7407, + "loss/crossentropy": 2.706727147102356, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.2307443767786026, + "step": 4062 + }, + { + "epoch": 0.127, + "grad_norm": 3.578125, + "grad_norm_var": 0.035868326822916664, + "learning_rate": 0.0001, + "loss": 6.2964, + "loss/crossentropy": 2.4649651050567627, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.21400236338377, + "step": 4064 + }, + { + "epoch": 0.1270625, + "grad_norm": 3.703125, + "grad_norm_var": 0.04338785807291667, + "learning_rate": 0.0001, + "loss": 6.561, + "loss/crossentropy": 2.6405688524246216, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.22134052217006683, + "step": 4066 + }, + { + "epoch": 0.127125, + "grad_norm": 3.921875, + "grad_norm_var": 0.04312744140625, + "learning_rate": 0.0001, + "loss": 6.7403, + "loss/crossentropy": 2.7473138570785522, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.228990375995636, + "step": 4068 + }, + { + "epoch": 0.1271875, + "grad_norm": 3.984375, + "grad_norm_var": 0.04299214680989583, + "learning_rate": 0.0001, + "loss": 6.701, + "loss/crossentropy": 2.588662028312683, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.23818998783826828, + "step": 4070 + }, + { + "epoch": 0.12725, + "grad_norm": 3.828125, + "grad_norm_var": 0.050593058268229164, + "learning_rate": 0.0001, + "loss": 6.6683, + "loss/crossentropy": 2.6175063848495483, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.2296886220574379, + "step": 4072 + }, + { + "epoch": 0.1273125, + "grad_norm": 3.765625, + "grad_norm_var": 0.04804280598958333, + "learning_rate": 0.0001, + "loss": 6.4641, + "loss/crossentropy": 2.5841997861862183, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.216502346098423, + "step": 4074 + }, + { + "epoch": 0.127375, + "grad_norm": 4.65625, + "grad_norm_var": 0.07437744140625, + "learning_rate": 0.0001, + "loss": 6.6996, + "loss/crossentropy": 2.647361397743225, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.23296315222978592, + "step": 4076 + }, + { + "epoch": 0.1274375, + "grad_norm": 3.625, + "grad_norm_var": 0.07527669270833333, + "learning_rate": 0.0001, + "loss": 6.6153, + "loss/crossentropy": 2.6356441974639893, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.22570262849330902, + "step": 4078 + }, + { + "epoch": 0.1275, + "grad_norm": 4.46875, + "grad_norm_var": 0.0843658447265625, + "learning_rate": 0.0001, + "loss": 6.5986, + "loss/crossentropy": 2.5087159872055054, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.2359411045908928, + "step": 4080 + }, + { + "epoch": 0.1275625, + "grad_norm": 3.921875, + "grad_norm_var": 0.07222391764322916, + "learning_rate": 0.0001, + "loss": 6.2928, + "loss/crossentropy": 2.4719302654266357, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.20865336060523987, + "step": 4082 + }, + { + "epoch": 0.127625, + "grad_norm": 11.875, + "grad_norm_var": 3.9582590738932293, + "learning_rate": 0.0001, + "loss": 6.6642, + "loss/crossentropy": 2.618984341621399, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.2365577220916748, + "step": 4084 + }, + { + "epoch": 0.1276875, + "grad_norm": 4.03125, + "grad_norm_var": 3.9661936442057293, + "learning_rate": 0.0001, + "loss": 6.4982, + "loss/crossentropy": 2.5682222843170166, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.2207360565662384, + "step": 4086 + }, + { + "epoch": 0.12775, + "grad_norm": 3.828125, + "grad_norm_var": 3.9713205973307293, + "learning_rate": 0.0001, + "loss": 6.3929, + "loss/crossentropy": 2.5119932889938354, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.21894660592079163, + "step": 4088 + }, + { + "epoch": 0.1278125, + "grad_norm": 3.671875, + "grad_norm_var": 3.978327433268229, + "learning_rate": 0.0001, + "loss": 6.4042, + "loss/crossentropy": 2.5663267374038696, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.2173830345273018, + "step": 4090 + }, + { + "epoch": 0.127875, + "grad_norm": 4.1875, + "grad_norm_var": 4.021939086914062, + "learning_rate": 0.0001, + "loss": 6.4591, + "loss/crossentropy": 2.5778207778930664, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.2201596274971962, + "step": 4092 + }, + { + "epoch": 0.1279375, + "grad_norm": 3.484375, + "grad_norm_var": 4.07906494140625, + "learning_rate": 0.0001, + "loss": 6.2625, + "loss/crossentropy": 2.538253903388977, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.2040649726986885, + "step": 4094 + }, + { + "epoch": 0.128, + "grad_norm": 4.0625, + "grad_norm_var": 4.089891560872396, + "learning_rate": 0.0001, + "loss": 6.6743, + "loss/crossentropy": 2.7568410634994507, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.22143083810806274, + "step": 4096 + }, + { + "epoch": 0.1280625, + "grad_norm": 3.609375, + "grad_norm_var": 4.11890869140625, + "learning_rate": 0.0001, + "loss": 6.4438, + "loss/crossentropy": 2.6530957221984863, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.21070635318756104, + "step": 4098 + }, + { + "epoch": 0.128125, + "grad_norm": 3.90625, + "grad_norm_var": 0.07829488118489583, + "learning_rate": 0.0001, + "loss": 6.6074, + "loss/crossentropy": 2.625568151473999, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.22631164640188217, + "step": 4100 + }, + { + "epoch": 0.1281875, + "grad_norm": 4.125, + "grad_norm_var": 0.08401285807291667, + "learning_rate": 0.0001, + "loss": 6.5135, + "loss/crossentropy": 2.6141321659088135, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.2192334309220314, + "step": 4102 + }, + { + "epoch": 0.12825, + "grad_norm": 4.03125, + "grad_norm_var": 0.0801422119140625, + "learning_rate": 0.0001, + "loss": 6.7171, + "loss/crossentropy": 2.7333229780197144, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.23002268373966217, + "step": 4104 + }, + { + "epoch": 0.1283125, + "grad_norm": 3.4375, + "grad_norm_var": 0.09021809895833334, + "learning_rate": 0.0001, + "loss": 6.4842, + "loss/crossentropy": 2.6517757177352905, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.21566757559776306, + "step": 4106 + }, + { + "epoch": 0.128375, + "grad_norm": 3.625, + "grad_norm_var": 0.08058268229166667, + "learning_rate": 0.0001, + "loss": 6.4124, + "loss/crossentropy": 2.5258562564849854, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.21912791579961777, + "step": 4108 + }, + { + "epoch": 0.1284375, + "grad_norm": 3.9375, + "grad_norm_var": 0.07349344889322916, + "learning_rate": 0.0001, + "loss": 6.4231, + "loss/crossentropy": 2.5245391130447388, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.22032760083675385, + "step": 4110 + }, + { + "epoch": 0.1285, + "grad_norm": 3.84375, + "grad_norm_var": 0.06735026041666667, + "learning_rate": 0.0001, + "loss": 6.5448, + "loss/crossentropy": 2.599327564239502, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.2211119681596756, + "step": 4112 + }, + { + "epoch": 0.1285625, + "grad_norm": 3.59375, + "grad_norm_var": 0.06770426432291667, + "learning_rate": 0.0001, + "loss": 6.4937, + "loss/crossentropy": 2.6408677101135254, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.21536342799663544, + "step": 4114 + }, + { + "epoch": 0.128625, + "grad_norm": 3.578125, + "grad_norm_var": 0.0384674072265625, + "learning_rate": 0.0001, + "loss": 6.4221, + "loss/crossentropy": 2.54049289226532, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.22175266593694687, + "step": 4116 + }, + { + "epoch": 0.1286875, + "grad_norm": 3.5625, + "grad_norm_var": 0.03052978515625, + "learning_rate": 0.0001, + "loss": 6.659, + "loss/crossentropy": 2.755501627922058, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.2204309031367302, + "step": 4118 + }, + { + "epoch": 0.12875, + "grad_norm": 4.625, + "grad_norm_var": 0.19879150390625, + "learning_rate": 0.0001, + "loss": 6.7089, + "loss/crossentropy": 2.5334556102752686, + "loss/hidden": 1.8046875, + "loss/jsd": 0.0, + "loss/logits": 0.2370723932981491, + "step": 4120 + }, + { + "epoch": 0.1288125, + "grad_norm": 4.40625, + "grad_norm_var": 0.2015533447265625, + "learning_rate": 0.0001, + "loss": 6.4647, + "loss/crossentropy": 2.4602530002593994, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.22739624977111816, + "step": 4122 + }, + { + "epoch": 0.128875, + "grad_norm": 3.453125, + "grad_norm_var": 0.20187886555989584, + "learning_rate": 0.0001, + "loss": 6.684, + "loss/crossentropy": 2.7768908739089966, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.22195608168840408, + "step": 4124 + }, + { + "epoch": 0.1289375, + "grad_norm": 3.640625, + "grad_norm_var": 0.20279541015625, + "learning_rate": 0.0001, + "loss": 6.5967, + "loss/crossentropy": 2.624576449394226, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.22690071165561676, + "step": 4126 + }, + { + "epoch": 0.129, + "grad_norm": 4.15625, + "grad_norm_var": 0.20142822265625, + "learning_rate": 0.0001, + "loss": 6.0872, + "loss/crossentropy": 2.3337572813034058, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.20541912317276, + "step": 4128 + }, + { + "epoch": 0.1290625, + "grad_norm": 4.09375, + "grad_norm_var": 0.5465128580729167, + "learning_rate": 0.0001, + "loss": 6.8517, + "loss/crossentropy": 2.7883676290512085, + "loss/hidden": 1.77734375, + "loss/jsd": 0.0, + "loss/logits": 0.22859449684619904, + "step": 4130 + }, + { + "epoch": 0.129125, + "grad_norm": 3.796875, + "grad_norm_var": 0.52691650390625, + "learning_rate": 0.0001, + "loss": 6.9007, + "loss/crossentropy": 2.802171230316162, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.23485445231199265, + "step": 4132 + }, + { + "epoch": 0.1291875, + "grad_norm": 4.78125, + "grad_norm_var": 0.5078521728515625, + "learning_rate": 0.0001, + "loss": 6.6574, + "loss/crossentropy": 2.6423803567886353, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.22728128731250763, + "step": 4134 + }, + { + "epoch": 0.12925, + "grad_norm": 3.375, + "grad_norm_var": 0.4954254150390625, + "learning_rate": 0.0001, + "loss": 6.2662, + "loss/crossentropy": 2.5183045864105225, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20916834473609924, + "step": 4136 + }, + { + "epoch": 0.1293125, + "grad_norm": 4.0625, + "grad_norm_var": 0.48957417805989584, + "learning_rate": 0.0001, + "loss": 6.5832, + "loss/crossentropy": 2.6430411338806152, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.22097034752368927, + "step": 4138 + }, + { + "epoch": 0.129375, + "grad_norm": 3.953125, + "grad_norm_var": 0.4623687744140625, + "learning_rate": 0.0001, + "loss": 6.8817, + "loss/crossentropy": 2.7385300397872925, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.2416619285941124, + "step": 4140 + }, + { + "epoch": 0.1294375, + "grad_norm": 3.671875, + "grad_norm_var": 0.47001953125, + "learning_rate": 0.0001, + "loss": 6.2309, + "loss/crossentropy": 2.5208064317703247, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20694401860237122, + "step": 4142 + }, + { + "epoch": 0.1295, + "grad_norm": 3.734375, + "grad_norm_var": 0.4915852864583333, + "learning_rate": 0.0001, + "loss": 6.3752, + "loss/crossentropy": 2.5856963396072388, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.21332371979951859, + "step": 4144 + }, + { + "epoch": 0.1295625, + "grad_norm": 3.890625, + "grad_norm_var": 0.10068257649739583, + "learning_rate": 0.0001, + "loss": 6.302, + "loss/crossentropy": 2.464065670967102, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.2130947783589363, + "step": 4146 + }, + { + "epoch": 0.129625, + "grad_norm": 3.625, + "grad_norm_var": 0.102978515625, + "learning_rate": 0.0001, + "loss": 6.0577, + "loss/crossentropy": 2.259469985961914, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.20638281852006912, + "step": 4148 + }, + { + "epoch": 0.1296875, + "grad_norm": 4.0625, + "grad_norm_var": 0.0470367431640625, + "learning_rate": 0.0001, + "loss": 6.3444, + "loss/crossentropy": 2.5508021116256714, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.208266943693161, + "step": 4150 + }, + { + "epoch": 0.12975, + "grad_norm": 3.5625, + "grad_norm_var": 0.0468414306640625, + "learning_rate": 0.0001, + "loss": 6.8325, + "loss/crossentropy": 2.7907811403274536, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.23268505930900574, + "step": 4152 + }, + { + "epoch": 0.1298125, + "grad_norm": 4.40625, + "grad_norm_var": 0.0658203125, + "learning_rate": 0.0001, + "loss": 6.7278, + "loss/crossentropy": 2.6292165517807007, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.23759499937295914, + "step": 4154 + }, + { + "epoch": 0.129875, + "grad_norm": 4.09375, + "grad_norm_var": 0.07007548014322916, + "learning_rate": 0.0001, + "loss": 6.8438, + "loss/crossentropy": 2.810207486152649, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.23031456023454666, + "step": 4156 + }, + { + "epoch": 0.1299375, + "grad_norm": 3.625, + "grad_norm_var": 0.07054036458333333, + "learning_rate": 0.0001, + "loss": 6.3715, + "loss/crossentropy": 2.574985980987549, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.21402358263731003, + "step": 4158 + }, + { + "epoch": 0.13, + "grad_norm": 3.625, + "grad_norm_var": 0.07649332682291667, + "learning_rate": 0.0001, + "loss": 6.3259, + "loss/crossentropy": 2.5313199758529663, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.21266181766986847, + "step": 4160 + }, + { + "epoch": 0.1300625, + "grad_norm": 3.859375, + "grad_norm_var": 0.07180887858072917, + "learning_rate": 0.0001, + "loss": 6.4371, + "loss/crossentropy": 2.6102746725082397, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.21354027092456818, + "step": 4162 + }, + { + "epoch": 0.130125, + "grad_norm": 3.53125, + "grad_norm_var": 0.07256571451822917, + "learning_rate": 0.0001, + "loss": 6.482, + "loss/crossentropy": 2.6907159090042114, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.20959646999835968, + "step": 4164 + }, + { + "epoch": 0.1301875, + "grad_norm": 4.125, + "grad_norm_var": 0.07476806640625, + "learning_rate": 0.0001, + "loss": 6.3805, + "loss/crossentropy": 2.4660550355911255, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.22347452491521835, + "step": 4166 + }, + { + "epoch": 0.13025, + "grad_norm": 4.03125, + "grad_norm_var": 0.07359619140625, + "learning_rate": 0.0001, + "loss": 6.7581, + "loss/crossentropy": 2.7357442378997803, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.22958087921142578, + "step": 4168 + }, + { + "epoch": 0.1303125, + "grad_norm": 3.46875, + "grad_norm_var": 0.12799479166666666, + "learning_rate": 0.0001, + "loss": 6.0945, + "loss/crossentropy": 2.3649386167526245, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.20498840510845184, + "step": 4170 + }, + { + "epoch": 0.130375, + "grad_norm": 3.71875, + "grad_norm_var": 0.13772684733072918, + "learning_rate": 0.0001, + "loss": 6.4307, + "loss/crossentropy": 2.4999207258224487, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.21886222064495087, + "step": 4172 + }, + { + "epoch": 0.1304375, + "grad_norm": 6.28125, + "grad_norm_var": 0.500537109375, + "learning_rate": 0.0001, + "loss": 6.2569, + "loss/crossentropy": 2.4742400646209717, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.2157633751630783, + "step": 4174 + }, + { + "epoch": 0.1305, + "grad_norm": 3.640625, + "grad_norm_var": 0.48405659993489586, + "learning_rate": 0.0001, + "loss": 6.3167, + "loss/crossentropy": 2.4573535919189453, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.2179667130112648, + "step": 4176 + }, + { + "epoch": 0.1305625, + "grad_norm": 3.921875, + "grad_norm_var": 0.47384440104166664, + "learning_rate": 0.0001, + "loss": 6.4765, + "loss/crossentropy": 2.6034940481185913, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.2177659273147583, + "step": 4178 + }, + { + "epoch": 0.130625, + "grad_norm": 3.859375, + "grad_norm_var": 0.4401326497395833, + "learning_rate": 0.0001, + "loss": 6.6969, + "loss/crossentropy": 2.6809717416763306, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.22854258120059967, + "step": 4180 + }, + { + "epoch": 0.1306875, + "grad_norm": 3.96875, + "grad_norm_var": 0.44468994140625, + "learning_rate": 0.0001, + "loss": 6.9014, + "loss/crossentropy": 2.9240981340408325, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.22624869644641876, + "step": 4182 + }, + { + "epoch": 0.13075, + "grad_norm": 3.703125, + "grad_norm_var": 0.4532704671223958, + "learning_rate": 0.0001, + "loss": 6.1262, + "loss/crossentropy": 2.3521162271499634, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.2070925459265709, + "step": 4184 + }, + { + "epoch": 0.1308125, + "grad_norm": 3.546875, + "grad_norm_var": 0.41184794108072914, + "learning_rate": 0.0001, + "loss": 6.2821, + "loss/crossentropy": 2.519057512283325, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.20872611552476883, + "step": 4186 + }, + { + "epoch": 0.130875, + "grad_norm": 3.703125, + "grad_norm_var": 0.4066243489583333, + "learning_rate": 0.0001, + "loss": 6.5832, + "loss/crossentropy": 2.6278737783432007, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.22521910816431046, + "step": 4188 + }, + { + "epoch": 0.1309375, + "grad_norm": 4.0, + "grad_norm_var": 0.034440104166666666, + "learning_rate": 0.0001, + "loss": 6.9223, + "loss/crossentropy": 2.7981791496276855, + "loss/hidden": 1.78125, + "loss/jsd": 0.0, + "loss/logits": 0.23429103940725327, + "step": 4190 + }, + { + "epoch": 0.131, + "grad_norm": 3.984375, + "grad_norm_var": 0.033919270833333334, + "learning_rate": 0.0001, + "loss": 6.48, + "loss/crossentropy": 2.611246943473816, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.2153891921043396, + "step": 4192 + }, + { + "epoch": 0.1310625, + "grad_norm": 4.6875, + "grad_norm_var": 0.07752278645833334, + "learning_rate": 0.0001, + "loss": 6.7144, + "loss/crossentropy": 2.749796986579895, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.2253640741109848, + "step": 4194 + }, + { + "epoch": 0.131125, + "grad_norm": 3.734375, + "grad_norm_var": 0.08818257649739583, + "learning_rate": 0.0001, + "loss": 6.3536, + "loss/crossentropy": 2.5561606884002686, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.21177595853805542, + "step": 4196 + }, + { + "epoch": 0.1311875, + "grad_norm": 4.03125, + "grad_norm_var": 0.08901265462239584, + "learning_rate": 0.0001, + "loss": 6.313, + "loss/crossentropy": 2.41589617729187, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.21978683024644852, + "step": 4198 + }, + { + "epoch": 0.13125, + "grad_norm": 3.859375, + "grad_norm_var": 0.09361572265625, + "learning_rate": 0.0001, + "loss": 6.2272, + "loss/crossentropy": 2.4662758111953735, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.20538844913244247, + "step": 4200 + }, + { + "epoch": 0.1313125, + "grad_norm": 3.796875, + "grad_norm_var": 0.09602864583333333, + "learning_rate": 0.0001, + "loss": 6.4304, + "loss/crossentropy": 2.605375051498413, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.2141440585255623, + "step": 4202 + }, + { + "epoch": 0.131375, + "grad_norm": 3.671875, + "grad_norm_var": 0.09169514973958333, + "learning_rate": 0.0001, + "loss": 6.4819, + "loss/crossentropy": 2.5840961933135986, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.21673616021871567, + "step": 4204 + }, + { + "epoch": 0.1314375, + "grad_norm": 3.6875, + "grad_norm_var": 0.08472900390625, + "learning_rate": 0.0001, + "loss": 6.5872, + "loss/crossentropy": 2.652251124382019, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.21966180205345154, + "step": 4206 + }, + { + "epoch": 0.1315, + "grad_norm": 3.359375, + "grad_norm_var": 0.0946197509765625, + "learning_rate": 0.0001, + "loss": 6.3679, + "loss/crossentropy": 2.7024388313293457, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.19545652717351913, + "step": 4208 + }, + { + "epoch": 0.1315625, + "grad_norm": 3.78125, + "grad_norm_var": 0.034012858072916666, + "learning_rate": 0.0001, + "loss": 6.2644, + "loss/crossentropy": 2.55469286441803, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.20300530642271042, + "step": 4210 + }, + { + "epoch": 0.131625, + "grad_norm": 3.6875, + "grad_norm_var": 0.036799112955729164, + "learning_rate": 0.0001, + "loss": 6.7124, + "loss/crossentropy": 2.8051246404647827, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.2215913087129593, + "step": 4212 + }, + { + "epoch": 0.1316875, + "grad_norm": 4.9375, + "grad_norm_var": 0.12801005045572916, + "learning_rate": 0.0001, + "loss": 6.2274, + "loss/crossentropy": 2.3926981687545776, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.21198853850364685, + "step": 4214 + }, + { + "epoch": 0.13175, + "grad_norm": 3.953125, + "grad_norm_var": 0.12809244791666666, + "learning_rate": 0.0001, + "loss": 6.4395, + "loss/crossentropy": 2.504998803138733, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.22040077298879623, + "step": 4216 + }, + { + "epoch": 0.1318125, + "grad_norm": 3.921875, + "grad_norm_var": 0.12309468587239583, + "learning_rate": 0.0001, + "loss": 6.483, + "loss/crossentropy": 2.5757195949554443, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.2208108976483345, + "step": 4218 + }, + { + "epoch": 0.131875, + "grad_norm": 3.5, + "grad_norm_var": 0.12935282389322916, + "learning_rate": 0.0001, + "loss": 6.3703, + "loss/crossentropy": 2.5360913276672363, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.2162364348769188, + "step": 4220 + }, + { + "epoch": 0.1319375, + "grad_norm": 4.15625, + "grad_norm_var": 0.13492838541666666, + "learning_rate": 0.0001, + "loss": 6.4147, + "loss/crossentropy": 2.4870160818099976, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.21854856610298157, + "step": 4222 + }, + { + "epoch": 0.132, + "grad_norm": 4.3125, + "grad_norm_var": 0.1375, + "learning_rate": 0.0001, + "loss": 6.8119, + "loss/crossentropy": 2.793803095817566, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.2283768579363823, + "step": 4224 + }, + { + "epoch": 0.1320625, + "grad_norm": 4.28125, + "grad_norm_var": 0.14158528645833332, + "learning_rate": 0.0001, + "loss": 6.9475, + "loss/crossentropy": 2.823140859603882, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.2393893525004387, + "step": 4226 + }, + { + "epoch": 0.132125, + "grad_norm": 4.25, + "grad_norm_var": 0.14159749348958334, + "learning_rate": 0.0001, + "loss": 6.4212, + "loss/crossentropy": 2.466855525970459, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.22121071815490723, + "step": 4228 + }, + { + "epoch": 0.1321875, + "grad_norm": 3.9375, + "grad_norm_var": 0.06571858723958333, + "learning_rate": 0.0001, + "loss": 6.1528, + "loss/crossentropy": 2.2941900491714478, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.2139892429113388, + "step": 4230 + }, + { + "epoch": 0.13225, + "grad_norm": 3.875, + "grad_norm_var": 0.06431376139322917, + "learning_rate": 0.0001, + "loss": 6.3381, + "loss/crossentropy": 2.562358021736145, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.20882709324359894, + "step": 4232 + }, + { + "epoch": 0.1323125, + "grad_norm": 3.96875, + "grad_norm_var": 0.06397196451822916, + "learning_rate": 0.0001, + "loss": 6.0432, + "loss/crossentropy": 2.2380497455596924, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.20981623232364655, + "step": 4234 + }, + { + "epoch": 0.132375, + "grad_norm": 4.1875, + "grad_norm_var": 0.05945638020833333, + "learning_rate": 0.0001, + "loss": 6.8093, + "loss/crossentropy": 2.7688835859298706, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.23021194338798523, + "step": 4236 + }, + { + "epoch": 0.1324375, + "grad_norm": 3.484375, + "grad_norm_var": 0.07353108723958333, + "learning_rate": 0.0001, + "loss": 6.1162, + "loss/crossentropy": 2.437235474586487, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.2030561864376068, + "step": 4238 + }, + { + "epoch": 0.1325, + "grad_norm": 3.703125, + "grad_norm_var": 0.052718098958333334, + "learning_rate": 0.0001, + "loss": 6.4468, + "loss/crossentropy": 2.6158939599990845, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.21629557013511658, + "step": 4240 + }, + { + "epoch": 0.1325625, + "grad_norm": 3.859375, + "grad_norm_var": 0.04348551432291667, + "learning_rate": 0.0001, + "loss": 6.4783, + "loss/crossentropy": 2.6150516271591187, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.21835701167583466, + "step": 4242 + }, + { + "epoch": 0.132625, + "grad_norm": 3.703125, + "grad_norm_var": 0.037596638997395834, + "learning_rate": 0.0001, + "loss": 6.3232, + "loss/crossentropy": 2.542102098464966, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.21014492213726044, + "step": 4244 + }, + { + "epoch": 0.1326875, + "grad_norm": 3.75, + "grad_norm_var": 0.0372467041015625, + "learning_rate": 0.0001, + "loss": 6.7724, + "loss/crossentropy": 2.832777500152588, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.22364888340234756, + "step": 4246 + }, + { + "epoch": 0.13275, + "grad_norm": 3.671875, + "grad_norm_var": 0.0381744384765625, + "learning_rate": 0.0001, + "loss": 6.4954, + "loss/crossentropy": 2.6658281087875366, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.21499013900756836, + "step": 4248 + }, + { + "epoch": 0.1328125, + "grad_norm": 3.703125, + "grad_norm_var": 0.03746337890625, + "learning_rate": 0.0001, + "loss": 6.6817, + "loss/crossentropy": 2.795152187347412, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.22029437124729156, + "step": 4250 + }, + { + "epoch": 0.132875, + "grad_norm": 3.71875, + "grad_norm_var": 0.0105377197265625, + "learning_rate": 0.0001, + "loss": 6.5964, + "loss/crossentropy": 2.6854653358459473, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.22233900427818298, + "step": 4252 + }, + { + "epoch": 0.1329375, + "grad_norm": 3.859375, + "grad_norm_var": 0.0064117431640625, + "learning_rate": 0.0001, + "loss": 6.7142, + "loss/crossentropy": 2.7894644737243652, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.22490007430315018, + "step": 4254 + }, + { + "epoch": 0.133, + "grad_norm": 3.84375, + "grad_norm_var": 0.006965128580729166, + "learning_rate": 0.0001, + "loss": 6.5515, + "loss/crossentropy": 2.6032466888427734, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.22568415105342865, + "step": 4256 + }, + { + "epoch": 0.1330625, + "grad_norm": 3.546875, + "grad_norm_var": 0.009447224934895833, + "learning_rate": 0.0001, + "loss": 6.6607, + "loss/crossentropy": 2.729295015335083, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.22438761591911316, + "step": 4258 + }, + { + "epoch": 0.133125, + "grad_norm": 4.0625, + "grad_norm_var": 0.014842732747395834, + "learning_rate": 0.0001, + "loss": 6.6664, + "loss/crossentropy": 2.7235976457595825, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.2239716351032257, + "step": 4260 + }, + { + "epoch": 0.1331875, + "grad_norm": 3.84375, + "grad_norm_var": 0.018505859375, + "learning_rate": 0.0001, + "loss": 6.2934, + "loss/crossentropy": 2.503963351249695, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.2164425626397133, + "step": 4262 + }, + { + "epoch": 0.13325, + "grad_norm": 3.890625, + "grad_norm_var": 0.055028279622395836, + "learning_rate": 0.0001, + "loss": 6.5547, + "loss/crossentropy": 2.598883867263794, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.22526460140943527, + "step": 4264 + }, + { + "epoch": 0.1333125, + "grad_norm": 3.890625, + "grad_norm_var": 0.052000935872395834, + "learning_rate": 0.0001, + "loss": 6.4364, + "loss/crossentropy": 2.55340576171875, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.22267234325408936, + "step": 4266 + }, + { + "epoch": 0.133375, + "grad_norm": 3.59375, + "grad_norm_var": 0.05603841145833333, + "learning_rate": 0.0001, + "loss": 6.4032, + "loss/crossentropy": 2.63783061504364, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.2109164074063301, + "step": 4268 + }, + { + "epoch": 0.1334375, + "grad_norm": 4.09375, + "grad_norm_var": 0.0647369384765625, + "learning_rate": 0.0001, + "loss": 6.8678, + "loss/crossentropy": 2.767982602119446, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.23849761486053467, + "step": 4270 + }, + { + "epoch": 0.1335, + "grad_norm": 3.78125, + "grad_norm_var": 0.06555989583333334, + "learning_rate": 0.0001, + "loss": 6.5162, + "loss/crossentropy": 2.573201060295105, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.22203601151704788, + "step": 4272 + }, + { + "epoch": 0.1335625, + "grad_norm": 3.453125, + "grad_norm_var": 0.07156473795572917, + "learning_rate": 0.0001, + "loss": 6.5034, + "loss/crossentropy": 2.6283360719680786, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.21602189540863037, + "step": 4274 + }, + { + "epoch": 0.133625, + "grad_norm": 4.0, + "grad_norm_var": 0.071044921875, + "learning_rate": 0.0001, + "loss": 6.3222, + "loss/crossentropy": 2.5181901454925537, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.21126443147659302, + "step": 4276 + }, + { + "epoch": 0.1336875, + "grad_norm": 3.984375, + "grad_norm_var": 0.06546223958333333, + "learning_rate": 0.0001, + "loss": 6.9159, + "loss/crossentropy": 2.8771623373031616, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.2319938987493515, + "step": 4278 + }, + { + "epoch": 0.13375, + "grad_norm": 4.0625, + "grad_norm_var": 0.07353515625, + "learning_rate": 0.0001, + "loss": 6.1239, + "loss/crossentropy": 2.4219969511032104, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.19753269106149673, + "step": 4280 + }, + { + "epoch": 0.1338125, + "grad_norm": 3.921875, + "grad_norm_var": 0.07343343098958334, + "learning_rate": 0.0001, + "loss": 6.3541, + "loss/crossentropy": 2.4714077711105347, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.21717225015163422, + "step": 4282 + }, + { + "epoch": 0.133875, + "grad_norm": 3.984375, + "grad_norm_var": 0.0681640625, + "learning_rate": 0.0001, + "loss": 6.5769, + "loss/crossentropy": 2.685715436935425, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.2188032567501068, + "step": 4284 + }, + { + "epoch": 0.1339375, + "grad_norm": 4.28125, + "grad_norm_var": 0.07177327473958334, + "learning_rate": 0.0001, + "loss": 6.4156, + "loss/crossentropy": 2.4861207008361816, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.22067828476428986, + "step": 4286 + }, + { + "epoch": 0.134, + "grad_norm": 4.21875, + "grad_norm_var": 0.0796539306640625, + "learning_rate": 0.0001, + "loss": 6.3744, + "loss/crossentropy": 2.5076987743377686, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.21362122148275375, + "step": 4288 + }, + { + "epoch": 0.1340625, + "grad_norm": 4.09375, + "grad_norm_var": 0.11110026041666667, + "learning_rate": 0.0001, + "loss": 6.7546, + "loss/crossentropy": 2.7254503965377808, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.23260141164064407, + "step": 4290 + }, + { + "epoch": 0.134125, + "grad_norm": 3.578125, + "grad_norm_var": 0.13523661295572917, + "learning_rate": 0.0001, + "loss": 6.1764, + "loss/crossentropy": 2.425421357154846, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.20986061543226242, + "step": 4292 + }, + { + "epoch": 0.1341875, + "grad_norm": 4.34375, + "grad_norm_var": 0.14474283854166667, + "learning_rate": 0.0001, + "loss": 6.5565, + "loss/crossentropy": 2.553962469100952, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.22681234776973724, + "step": 4294 + }, + { + "epoch": 0.13425, + "grad_norm": 3.734375, + "grad_norm_var": 0.11599833170572917, + "learning_rate": 0.0001, + "loss": 6.498, + "loss/crossentropy": 2.701680064201355, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.21400703489780426, + "step": 4296 + }, + { + "epoch": 0.1343125, + "grad_norm": 3.71875, + "grad_norm_var": 0.1501617431640625, + "learning_rate": 0.0001, + "loss": 6.1799, + "loss/crossentropy": 2.540942668914795, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.20256894826889038, + "step": 4298 + }, + { + "epoch": 0.134375, + "grad_norm": 4.0, + "grad_norm_var": 0.15080973307291667, + "learning_rate": 0.0001, + "loss": 6.5185, + "loss/crossentropy": 2.6064943075180054, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.21971440315246582, + "step": 4300 + }, + { + "epoch": 0.1344375, + "grad_norm": 3.53125, + "grad_norm_var": 0.14550374348958334, + "learning_rate": 0.0001, + "loss": 6.4625, + "loss/crossentropy": 2.6421563625335693, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.21367350220680237, + "step": 4302 + }, + { + "epoch": 0.1345, + "grad_norm": 4.4375, + "grad_norm_var": 0.16135660807291666, + "learning_rate": 0.0001, + "loss": 6.559, + "loss/crossentropy": 2.6035948991775513, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.2248389720916748, + "step": 4304 + }, + { + "epoch": 0.1345625, + "grad_norm": 3.890625, + "grad_norm_var": 0.09832255045572917, + "learning_rate": 0.0001, + "loss": 6.5766, + "loss/crossentropy": 2.636247396469116, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.22215821593999863, + "step": 4306 + }, + { + "epoch": 0.134625, + "grad_norm": 4.09375, + "grad_norm_var": 0.09346415201822916, + "learning_rate": 0.0001, + "loss": 6.3825, + "loss/crossentropy": 2.516153335571289, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.2174898460507393, + "step": 4308 + }, + { + "epoch": 0.1346875, + "grad_norm": 3.703125, + "grad_norm_var": 0.07429911295572916, + "learning_rate": 0.0001, + "loss": 6.1787, + "loss/crossentropy": 2.4977835416793823, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20247140526771545, + "step": 4310 + }, + { + "epoch": 0.13475, + "grad_norm": 3.640625, + "grad_norm_var": 0.07258707682291667, + "learning_rate": 0.0001, + "loss": 6.7702, + "loss/crossentropy": 2.7508914470672607, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.2304483950138092, + "step": 4312 + }, + { + "epoch": 0.1348125, + "grad_norm": 3.703125, + "grad_norm_var": 0.07841695149739583, + "learning_rate": 0.0001, + "loss": 6.3305, + "loss/crossentropy": 2.513437509536743, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.21295832097530365, + "step": 4314 + }, + { + "epoch": 0.134875, + "grad_norm": 3.71875, + "grad_norm_var": 0.07786051432291667, + "learning_rate": 0.0001, + "loss": 6.3279, + "loss/crossentropy": 2.533980965614319, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.21298405528068542, + "step": 4316 + }, + { + "epoch": 0.1349375, + "grad_norm": 3.765625, + "grad_norm_var": 33.87757059733073, + "learning_rate": 0.0001, + "loss": 6.8462, + "loss/crossentropy": 2.828759551048279, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.22947552800178528, + "step": 4318 + }, + { + "epoch": 0.135, + "grad_norm": 3.78125, + "grad_norm_var": 33.94041341145833, + "learning_rate": 0.0001, + "loss": 6.5928, + "loss/crossentropy": 2.7170915603637695, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.21686632186174393, + "step": 4320 + }, + { + "epoch": 0.1350625, + "grad_norm": 3.90625, + "grad_norm_var": 34.03020426432292, + "learning_rate": 0.0001, + "loss": 6.2161, + "loss/crossentropy": 2.4180595874786377, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.2118338942527771, + "step": 4322 + }, + { + "epoch": 0.135125, + "grad_norm": 3.984375, + "grad_norm_var": 34.01892903645833, + "learning_rate": 0.0001, + "loss": 6.4675, + "loss/crossentropy": 2.610173225402832, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.2158115953207016, + "step": 4324 + }, + { + "epoch": 0.1351875, + "grad_norm": 3.96875, + "grad_norm_var": 33.88642578125, + "learning_rate": 0.0001, + "loss": 6.534, + "loss/crossentropy": 2.64306640625, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.21721620857715607, + "step": 4326 + }, + { + "epoch": 0.13525, + "grad_norm": 3.5, + "grad_norm_var": 33.971805826822916, + "learning_rate": 0.0001, + "loss": 6.4392, + "loss/crossentropy": 2.6547141075134277, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.21243591606616974, + "step": 4328 + }, + { + "epoch": 0.1353125, + "grad_norm": 3.40625, + "grad_norm_var": 34.10946858723958, + "learning_rate": 0.0001, + "loss": 6.368, + "loss/crossentropy": 2.5513360500335693, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.21799319982528687, + "step": 4330 + }, + { + "epoch": 0.135375, + "grad_norm": 3.765625, + "grad_norm_var": 34.01167704264323, + "learning_rate": 0.0001, + "loss": 6.8963, + "loss/crossentropy": 2.9096730947494507, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.22952109575271606, + "step": 4332 + }, + { + "epoch": 0.1354375, + "grad_norm": 4.125, + "grad_norm_var": 0.07390034993489583, + "learning_rate": 0.0001, + "loss": 6.5352, + "loss/crossentropy": 2.6305580139160156, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.22366870939731598, + "step": 4334 + }, + { + "epoch": 0.1355, + "grad_norm": 3.828125, + "grad_norm_var": 0.0769927978515625, + "learning_rate": 0.0001, + "loss": 6.5557, + "loss/crossentropy": 2.622426748275757, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.2198849767446518, + "step": 4336 + }, + { + "epoch": 0.1355625, + "grad_norm": 4.125, + "grad_norm_var": 0.0683013916015625, + "learning_rate": 0.0001, + "loss": 6.7091, + "loss/crossentropy": 2.711685061454773, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.2290424406528473, + "step": 4338 + }, + { + "epoch": 0.135625, + "grad_norm": 4.5625, + "grad_norm_var": 0.09519856770833333, + "learning_rate": 0.0001, + "loss": 6.6465, + "loss/crossentropy": 2.6272358894348145, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.22731468826532364, + "step": 4340 + }, + { + "epoch": 0.1356875, + "grad_norm": 4.28125, + "grad_norm_var": 0.102099609375, + "learning_rate": 0.0001, + "loss": 6.796, + "loss/crossentropy": 2.6994833946228027, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.23465242981910706, + "step": 4342 + }, + { + "epoch": 0.13575, + "grad_norm": 3.921875, + "grad_norm_var": 0.08208719889322917, + "learning_rate": 0.0001, + "loss": 6.2886, + "loss/crossentropy": 2.5074750185012817, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.21014603972434998, + "step": 4344 + }, + { + "epoch": 0.1358125, + "grad_norm": 4.0625, + "grad_norm_var": 0.06747945149739583, + "learning_rate": 0.0001, + "loss": 6.5879, + "loss/crossentropy": 2.709128260612488, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.22108222544193268, + "step": 4346 + }, + { + "epoch": 0.135875, + "grad_norm": 3.734375, + "grad_norm_var": 0.05568745930989583, + "learning_rate": 0.0001, + "loss": 6.3625, + "loss/crossentropy": 2.6139813661575317, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.21040183305740356, + "step": 4348 + }, + { + "epoch": 0.1359375, + "grad_norm": 3.671875, + "grad_norm_var": 0.0558502197265625, + "learning_rate": 0.0001, + "loss": 6.5649, + "loss/crossentropy": 2.662161111831665, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.21800871938467026, + "step": 4350 + }, + { + "epoch": 0.136, + "grad_norm": 3.875, + "grad_norm_var": 0.0582672119140625, + "learning_rate": 0.0001, + "loss": 6.2154, + "loss/crossentropy": 2.428389310836792, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.2119050994515419, + "step": 4352 + }, + { + "epoch": 0.1360625, + "grad_norm": 4.0625, + "grad_norm_var": 0.07548828125, + "learning_rate": 0.0001, + "loss": 6.7486, + "loss/crossentropy": 2.7366960048675537, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.2293202206492424, + "step": 4354 + }, + { + "epoch": 0.136125, + "grad_norm": 3.9375, + "grad_norm_var": 0.0656890869140625, + "learning_rate": 0.0001, + "loss": 6.259, + "loss/crossentropy": 2.418584704399109, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.21295125782489777, + "step": 4356 + }, + { + "epoch": 0.1361875, + "grad_norm": 3.515625, + "grad_norm_var": 0.06562093098958334, + "learning_rate": 0.0001, + "loss": 6.5373, + "loss/crossentropy": 2.6187655925750732, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.22232334315776825, + "step": 4358 + }, + { + "epoch": 0.13625, + "grad_norm": 3.765625, + "grad_norm_var": 0.07109375, + "learning_rate": 0.0001, + "loss": 6.6928, + "loss/crossentropy": 2.7608814239501953, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.2228756546974182, + "step": 4360 + }, + { + "epoch": 0.1363125, + "grad_norm": 4.1875, + "grad_norm_var": 0.07258199055989584, + "learning_rate": 0.0001, + "loss": 6.7529, + "loss/crossentropy": 2.7711809873580933, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.22668741643428802, + "step": 4362 + }, + { + "epoch": 0.136375, + "grad_norm": 4.09375, + "grad_norm_var": 0.0748931884765625, + "learning_rate": 0.0001, + "loss": 6.5502, + "loss/crossentropy": 2.643489956855774, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.2211436778306961, + "step": 4364 + }, + { + "epoch": 0.1364375, + "grad_norm": 4.03125, + "grad_norm_var": 0.06730855305989583, + "learning_rate": 0.0001, + "loss": 6.4432, + "loss/crossentropy": 2.633753180503845, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.20984596014022827, + "step": 4366 + }, + { + "epoch": 0.1365, + "grad_norm": 3.90625, + "grad_norm_var": 0.0697906494140625, + "learning_rate": 0.0001, + "loss": 6.2885, + "loss/crossentropy": 2.534524083137512, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.2093779295682907, + "step": 4368 + }, + { + "epoch": 0.1365625, + "grad_norm": 3.625, + "grad_norm_var": 0.04158426920572917, + "learning_rate": 0.0001, + "loss": 6.0191, + "loss/crossentropy": 2.3582180738449097, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.20241554081439972, + "step": 4370 + }, + { + "epoch": 0.136625, + "grad_norm": 3.5, + "grad_norm_var": 0.04755452473958333, + "learning_rate": 0.0001, + "loss": 6.387, + "loss/crossentropy": 2.5663875341415405, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.2140897959470749, + "step": 4372 + }, + { + "epoch": 0.1366875, + "grad_norm": 3.6875, + "grad_norm_var": 0.04810282389322917, + "learning_rate": 0.0001, + "loss": 6.2828, + "loss/crossentropy": 2.5431841611862183, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20833703875541687, + "step": 4374 + }, + { + "epoch": 0.13675, + "grad_norm": 4.65625, + "grad_norm_var": 0.10110270182291667, + "learning_rate": 0.0001, + "loss": 6.6385, + "loss/crossentropy": 2.7397044897079468, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.22112590074539185, + "step": 4376 + }, + { + "epoch": 0.1368125, + "grad_norm": 4.0, + "grad_norm_var": 0.24049479166666668, + "learning_rate": 0.0001, + "loss": 6.3941, + "loss/crossentropy": 2.472692608833313, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.22261205315589905, + "step": 4378 + }, + { + "epoch": 0.136875, + "grad_norm": 4.03125, + "grad_norm_var": 0.23801676432291666, + "learning_rate": 0.0001, + "loss": 6.4296, + "loss/crossentropy": 2.52515184879303, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.21974552422761917, + "step": 4380 + }, + { + "epoch": 0.1369375, + "grad_norm": 3.796875, + "grad_norm_var": 0.23893229166666666, + "learning_rate": 0.0001, + "loss": 6.2777, + "loss/crossentropy": 2.4887278079986572, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.2101452499628067, + "step": 4382 + }, + { + "epoch": 0.137, + "grad_norm": 3.828125, + "grad_norm_var": 0.23479410807291667, + "learning_rate": 0.0001, + "loss": 6.4066, + "loss/crossentropy": 2.611871361732483, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.2107202634215355, + "step": 4384 + }, + { + "epoch": 0.1370625, + "grad_norm": 3.484375, + "grad_norm_var": 0.2543690999348958, + "learning_rate": 0.0001, + "loss": 6.4682, + "loss/crossentropy": 2.648727774620056, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.2163223773241043, + "step": 4386 + }, + { + "epoch": 0.137125, + "grad_norm": 3.875, + "grad_norm_var": 0.23957417805989584, + "learning_rate": 0.0001, + "loss": 6.3062, + "loss/crossentropy": 2.5285497903823853, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.20822982490062714, + "step": 4388 + }, + { + "epoch": 0.1371875, + "grad_norm": 3.5, + "grad_norm_var": 0.23983968098958333, + "learning_rate": 0.0001, + "loss": 6.2979, + "loss/crossentropy": 2.506072998046875, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.21238084137439728, + "step": 4390 + }, + { + "epoch": 0.13725, + "grad_norm": 3.8125, + "grad_norm_var": 0.19384765625, + "learning_rate": 0.0001, + "loss": 6.5237, + "loss/crossentropy": 2.7000246047973633, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.21361419558525085, + "step": 4392 + }, + { + "epoch": 0.1373125, + "grad_norm": 3.71875, + "grad_norm_var": 0.0458892822265625, + "learning_rate": 0.0001, + "loss": 6.4284, + "loss/crossentropy": 2.589784860610962, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.2162848636507988, + "step": 4394 + }, + { + "epoch": 0.137375, + "grad_norm": 3.59375, + "grad_norm_var": 0.044123331705729164, + "learning_rate": 0.0001, + "loss": 6.1978, + "loss/crossentropy": 2.410344958305359, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.20960881561040878, + "step": 4396 + }, + { + "epoch": 0.1374375, + "grad_norm": 4.375, + "grad_norm_var": 0.07614644368489583, + "learning_rate": 0.0001, + "loss": 6.5374, + "loss/crossentropy": 2.6627117395401, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.21676111966371536, + "step": 4398 + }, + { + "epoch": 0.1375, + "grad_norm": 3.921875, + "grad_norm_var": 0.07752278645833334, + "learning_rate": 0.0001, + "loss": 6.4063, + "loss/crossentropy": 2.6314616203308105, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.21381668746471405, + "step": 4400 + }, + { + "epoch": 0.1375625, + "grad_norm": 3.828125, + "grad_norm_var": 0.0466949462890625, + "learning_rate": 0.0001, + "loss": 6.4063, + "loss/crossentropy": 2.54606032371521, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.21532301604747772, + "step": 4402 + }, + { + "epoch": 0.137625, + "grad_norm": 4.03125, + "grad_norm_var": 0.06516825358072917, + "learning_rate": 0.0001, + "loss": 6.7664, + "loss/crossentropy": 2.695158004760742, + "loss/hidden": 1.74609375, + "loss/jsd": 0.0, + "loss/logits": 0.23251530528068542, + "step": 4404 + }, + { + "epoch": 0.1376875, + "grad_norm": 3.671875, + "grad_norm_var": 0.059178670247395836, + "learning_rate": 0.0001, + "loss": 6.5929, + "loss/crossentropy": 2.7052929401397705, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.21922975778579712, + "step": 4406 + }, + { + "epoch": 0.13775, + "grad_norm": 3.640625, + "grad_norm_var": 0.08063863118489584, + "learning_rate": 0.0001, + "loss": 6.2633, + "loss/crossentropy": 2.559746026992798, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20707596093416214, + "step": 4408 + }, + { + "epoch": 0.1378125, + "grad_norm": 4.59375, + "grad_norm_var": 0.1199127197265625, + "learning_rate": 0.0001, + "loss": 6.0566, + "loss/crossentropy": 2.4197880029678345, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.20157083868980408, + "step": 4410 + }, + { + "epoch": 0.137875, + "grad_norm": 3.796875, + "grad_norm_var": 0.12034403483072917, + "learning_rate": 0.0001, + "loss": 6.3774, + "loss/crossentropy": 2.6243380308151245, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.20772488415241241, + "step": 4412 + }, + { + "epoch": 0.1379375, + "grad_norm": 3.578125, + "grad_norm_var": 0.11555887858072916, + "learning_rate": 0.0001, + "loss": 6.6581, + "loss/crossentropy": 2.6697126626968384, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.22696784883737564, + "step": 4414 + }, + { + "epoch": 0.138, + "grad_norm": 3.921875, + "grad_norm_var": 0.1158599853515625, + "learning_rate": 0.0001, + "loss": 6.5487, + "loss/crossentropy": 2.6725574731826782, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.21378755569458008, + "step": 4416 + }, + { + "epoch": 0.1380625, + "grad_norm": 3.75, + "grad_norm_var": 0.11702067057291667, + "learning_rate": 0.0001, + "loss": 6.6902, + "loss/crossentropy": 2.743945837020874, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.2274349480867386, + "step": 4418 + }, + { + "epoch": 0.138125, + "grad_norm": 3.546875, + "grad_norm_var": 0.13110249837239582, + "learning_rate": 0.0001, + "loss": 6.7319, + "loss/crossentropy": 2.7539360523223877, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.22630860656499863, + "step": 4420 + }, + { + "epoch": 0.1381875, + "grad_norm": 3.828125, + "grad_norm_var": 0.1296783447265625, + "learning_rate": 0.0001, + "loss": 6.4803, + "loss/crossentropy": 2.604143738746643, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.21613523364067078, + "step": 4422 + }, + { + "epoch": 0.13825, + "grad_norm": 3.859375, + "grad_norm_var": 0.109326171875, + "learning_rate": 0.0001, + "loss": 6.6886, + "loss/crossentropy": 2.7972522974014282, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.22077278792858124, + "step": 4424 + }, + { + "epoch": 0.1383125, + "grad_norm": 4.15625, + "grad_norm_var": 0.080810546875, + "learning_rate": 0.0001, + "loss": 6.5401, + "loss/crossentropy": 2.657148241996765, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.22345105558633804, + "step": 4426 + }, + { + "epoch": 0.138375, + "grad_norm": 4.65625, + "grad_norm_var": 0.120458984375, + "learning_rate": 0.0001, + "loss": 6.5473, + "loss/crossentropy": 2.6493141651153564, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.22065355628728867, + "step": 4428 + }, + { + "epoch": 0.1384375, + "grad_norm": 3.65625, + "grad_norm_var": 0.09996744791666666, + "learning_rate": 0.0001, + "loss": 6.2785, + "loss/crossentropy": 2.490964412689209, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.21000496298074722, + "step": 4430 + }, + { + "epoch": 0.1385, + "grad_norm": 3.953125, + "grad_norm_var": 0.11620992024739583, + "learning_rate": 0.0001, + "loss": 6.6311, + "loss/crossentropy": 2.739372491836548, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.22003226727247238, + "step": 4432 + }, + { + "epoch": 0.1385625, + "grad_norm": 3.9375, + "grad_norm_var": 0.11472066243489583, + "learning_rate": 0.0001, + "loss": 6.4536, + "loss/crossentropy": 2.550296187400818, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.2161109298467636, + "step": 4434 + }, + { + "epoch": 0.138625, + "grad_norm": 3.828125, + "grad_norm_var": 0.0882476806640625, + "learning_rate": 0.0001, + "loss": 6.554, + "loss/crossentropy": 2.681362748146057, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.21929988265037537, + "step": 4436 + }, + { + "epoch": 0.1386875, + "grad_norm": 3.890625, + "grad_norm_var": 0.09313151041666666, + "learning_rate": 0.0001, + "loss": 6.4458, + "loss/crossentropy": 2.70820152759552, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.20500586926937103, + "step": 4438 + }, + { + "epoch": 0.13875, + "grad_norm": 4.15625, + "grad_norm_var": 0.0947418212890625, + "learning_rate": 0.0001, + "loss": 6.2504, + "loss/crossentropy": 2.5078264474868774, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.20941196382045746, + "step": 4440 + }, + { + "epoch": 0.1388125, + "grad_norm": 3.8125, + "grad_norm_var": 0.09263407389322917, + "learning_rate": 0.0001, + "loss": 5.8633, + "loss/crossentropy": 2.26975679397583, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.19295161217451096, + "step": 4442 + }, + { + "epoch": 0.138875, + "grad_norm": 3.59375, + "grad_norm_var": 0.05187174479166667, + "learning_rate": 0.0001, + "loss": 6.3014, + "loss/crossentropy": 2.5669431686401367, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.2074338048696518, + "step": 4444 + }, + { + "epoch": 0.1389375, + "grad_norm": 3.515625, + "grad_norm_var": 0.056737263997395836, + "learning_rate": 0.0001, + "loss": 6.2993, + "loss/crossentropy": 2.540955901145935, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.21098945289850235, + "step": 4446 + }, + { + "epoch": 0.139, + "grad_norm": 3.5, + "grad_norm_var": 0.03548075358072917, + "learning_rate": 0.0001, + "loss": 6.4083, + "loss/crossentropy": 2.60480535030365, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.21004024147987366, + "step": 4448 + }, + { + "epoch": 0.1390625, + "grad_norm": 3.796875, + "grad_norm_var": 0.03355712890625, + "learning_rate": 0.0001, + "loss": 6.3533, + "loss/crossentropy": 2.52720844745636, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.20994866639375687, + "step": 4450 + }, + { + "epoch": 0.139125, + "grad_norm": 3.65625, + "grad_norm_var": 0.0287506103515625, + "learning_rate": 0.0001, + "loss": 6.2883, + "loss/crossentropy": 2.4919263124465942, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.2108834832906723, + "step": 4452 + }, + { + "epoch": 0.1391875, + "grad_norm": 3.40625, + "grad_norm_var": 0.0337890625, + "learning_rate": 0.0001, + "loss": 6.3411, + "loss/crossentropy": 2.643371105194092, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20687970519065857, + "step": 4454 + }, + { + "epoch": 0.13925, + "grad_norm": 3.53125, + "grad_norm_var": 0.0250152587890625, + "learning_rate": 0.0001, + "loss": 6.5259, + "loss/crossentropy": 2.7266517877578735, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.21351400017738342, + "step": 4456 + }, + { + "epoch": 0.1393125, + "grad_norm": 3.53125, + "grad_norm_var": 0.024833170572916667, + "learning_rate": 0.0001, + "loss": 6.1921, + "loss/crossentropy": 2.515869140625, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.2019999995827675, + "step": 4458 + }, + { + "epoch": 0.139375, + "grad_norm": 3.6875, + "grad_norm_var": 0.026520792643229166, + "learning_rate": 0.0001, + "loss": 6.0831, + "loss/crossentropy": 2.4401614665985107, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20140717178583145, + "step": 4460 + }, + { + "epoch": 0.1394375, + "grad_norm": 3.78125, + "grad_norm_var": 0.026298014322916667, + "learning_rate": 0.0001, + "loss": 6.4829, + "loss/crossentropy": 2.7296160459518433, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.2124355137348175, + "step": 4462 + }, + { + "epoch": 0.1395, + "grad_norm": 3.515625, + "grad_norm_var": 0.025487263997395832, + "learning_rate": 0.0001, + "loss": 6.4402, + "loss/crossentropy": 2.608121871948242, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.21328146010637283, + "step": 4464 + }, + { + "epoch": 0.1395625, + "grad_norm": 3.6875, + "grad_norm_var": 0.024681599934895833, + "learning_rate": 0.0001, + "loss": 6.5292, + "loss/crossentropy": 2.684123992919922, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.2192751243710518, + "step": 4466 + }, + { + "epoch": 0.139625, + "grad_norm": 3.765625, + "grad_norm_var": 0.027620442708333335, + "learning_rate": 0.0001, + "loss": 6.6034, + "loss/crossentropy": 2.69321072101593, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.22227226942777634, + "step": 4468 + }, + { + "epoch": 0.1396875, + "grad_norm": 4.28125, + "grad_norm_var": 0.0414703369140625, + "learning_rate": 0.0001, + "loss": 6.5612, + "loss/crossentropy": 2.6232060194015503, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.22309193760156631, + "step": 4470 + }, + { + "epoch": 0.13975, + "grad_norm": 3.375, + "grad_norm_var": 0.044408162434895836, + "learning_rate": 0.0001, + "loss": 5.9948, + "loss/crossentropy": 2.433348774909973, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19599098712205887, + "step": 4472 + }, + { + "epoch": 0.1398125, + "grad_norm": 4.5, + "grad_norm_var": 0.08082275390625, + "learning_rate": 0.0001, + "loss": 6.6228, + "loss/crossentropy": 2.6278765201568604, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.22410084307193756, + "step": 4474 + }, + { + "epoch": 0.139875, + "grad_norm": 3.78125, + "grad_norm_var": 0.07467041015625, + "learning_rate": 0.0001, + "loss": 6.0025, + "loss/crossentropy": 2.3112982511520386, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.1991952434182167, + "step": 4476 + }, + { + "epoch": 0.1399375, + "grad_norm": 3.65625, + "grad_norm_var": 0.080810546875, + "learning_rate": 0.0001, + "loss": 6.1531, + "loss/crossentropy": 2.4080352783203125, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.20654088258743286, + "step": 4478 + }, + { + "epoch": 0.14, + "grad_norm": 3.671875, + "grad_norm_var": 0.08090718587239583, + "learning_rate": 0.0001, + "loss": 6.3801, + "loss/crossentropy": 2.57833468914032, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.21142880618572235, + "step": 4480 + }, + { + "epoch": 0.1400625, + "grad_norm": 4.28125, + "grad_norm_var": 0.09670817057291667, + "learning_rate": 0.0001, + "loss": 6.3799, + "loss/crossentropy": 2.578604817390442, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.20747198164463043, + "step": 4482 + }, + { + "epoch": 0.140125, + "grad_norm": 3.625, + "grad_norm_var": 0.11760965983072917, + "learning_rate": 0.0001, + "loss": 6.7525, + "loss/crossentropy": 2.8470464944839478, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.22023146599531174, + "step": 4484 + }, + { + "epoch": 0.1401875, + "grad_norm": 4.1875, + "grad_norm_var": 0.2569000244140625, + "learning_rate": 0.0001, + "loss": 6.3493, + "loss/crossentropy": 2.4870035648345947, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.21826012432575226, + "step": 4486 + }, + { + "epoch": 0.14025, + "grad_norm": 3.71875, + "grad_norm_var": 0.2294830322265625, + "learning_rate": 0.0001, + "loss": 6.3122, + "loss/crossentropy": 2.494701623916626, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.21456186473369598, + "step": 4488 + }, + { + "epoch": 0.1403125, + "grad_norm": 4.09375, + "grad_norm_var": 0.20968424479166667, + "learning_rate": 0.0001, + "loss": 6.4447, + "loss/crossentropy": 2.5576690435409546, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.218392476439476, + "step": 4490 + }, + { + "epoch": 0.140375, + "grad_norm": 3.890625, + "grad_norm_var": 0.20572509765625, + "learning_rate": 0.0001, + "loss": 6.6909, + "loss/crossentropy": 2.7482335567474365, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.2278623878955841, + "step": 4492 + }, + { + "epoch": 0.1404375, + "grad_norm": 3.71875, + "grad_norm_var": 0.18975321451822916, + "learning_rate": 0.0001, + "loss": 6.8476, + "loss/crossentropy": 2.8149173259735107, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.23099815845489502, + "step": 4494 + }, + { + "epoch": 0.1405, + "grad_norm": 3.828125, + "grad_norm_var": 0.1708648681640625, + "learning_rate": 0.0001, + "loss": 6.3192, + "loss/crossentropy": 2.4586331844329834, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.21535467356443405, + "step": 4496 + }, + { + "epoch": 0.1405625, + "grad_norm": 3.921875, + "grad_norm_var": 0.16803385416666666, + "learning_rate": 0.0001, + "loss": 6.4127, + "loss/crossentropy": 2.5607200860977173, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.21137110888957977, + "step": 4498 + }, + { + "epoch": 0.140625, + "grad_norm": 5.3125, + "grad_norm_var": 0.261767578125, + "learning_rate": 0.0001, + "loss": 6.9379, + "loss/crossentropy": 2.715990900993347, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.24641267955303192, + "step": 4500 + }, + { + "epoch": 0.1406875, + "grad_norm": 3.46875, + "grad_norm_var": 0.1669342041015625, + "learning_rate": 0.0001, + "loss": 6.0838, + "loss/crossentropy": 2.3416292667388916, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.20937299728393555, + "step": 4502 + }, + { + "epoch": 0.14075, + "grad_norm": 3.8125, + "grad_norm_var": 0.1842193603515625, + "learning_rate": 0.0001, + "loss": 6.1107, + "loss/crossentropy": 2.399893641471863, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20779887586832047, + "step": 4504 + }, + { + "epoch": 0.1408125, + "grad_norm": 3.578125, + "grad_norm_var": 0.1912506103515625, + "learning_rate": 0.0001, + "loss": 6.4685, + "loss/crossentropy": 2.614013910293579, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.21787265688180923, + "step": 4506 + }, + { + "epoch": 0.140875, + "grad_norm": 3.796875, + "grad_norm_var": 0.2802398681640625, + "learning_rate": 0.0001, + "loss": 6.7776, + "loss/crossentropy": 2.7103980779647827, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.2344505339860916, + "step": 4508 + }, + { + "epoch": 0.1409375, + "grad_norm": 3.90625, + "grad_norm_var": 0.268994140625, + "learning_rate": 0.0001, + "loss": 6.4618, + "loss/crossentropy": 2.5430240631103516, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.22156068682670593, + "step": 4510 + }, + { + "epoch": 0.141, + "grad_norm": 3.25, + "grad_norm_var": 0.3018544514973958, + "learning_rate": 0.0001, + "loss": 6.0656, + "loss/crossentropy": 2.3237478733062744, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.20582488179206848, + "step": 4512 + }, + { + "epoch": 0.1410625, + "grad_norm": 3.375, + "grad_norm_var": 0.3292877197265625, + "learning_rate": 0.0001, + "loss": 6.3944, + "loss/crossentropy": 2.6120007038116455, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.21261148154735565, + "step": 4514 + }, + { + "epoch": 0.141125, + "grad_norm": 4.03125, + "grad_norm_var": 0.19192301432291667, + "learning_rate": 0.0001, + "loss": 6.3112, + "loss/crossentropy": 2.4821159839630127, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.21532998234033585, + "step": 4516 + }, + { + "epoch": 0.1411875, + "grad_norm": 4.25, + "grad_norm_var": 0.20214742024739582, + "learning_rate": 0.0001, + "loss": 6.6308, + "loss/crossentropy": 2.641494393348694, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.22783225774765015, + "step": 4518 + }, + { + "epoch": 0.14125, + "grad_norm": 3.890625, + "grad_norm_var": 0.1877349853515625, + "learning_rate": 0.0001, + "loss": 6.083, + "loss/crossentropy": 2.351473093032837, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.20596114546060562, + "step": 4520 + }, + { + "epoch": 0.1413125, + "grad_norm": 3.640625, + "grad_norm_var": 0.18372294108072917, + "learning_rate": 0.0001, + "loss": 6.4145, + "loss/crossentropy": 2.6024839878082275, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.21128298342227936, + "step": 4522 + }, + { + "epoch": 0.141375, + "grad_norm": 3.40625, + "grad_norm_var": 0.09845377604166666, + "learning_rate": 0.0001, + "loss": 6.0818, + "loss/crossentropy": 2.4369860887527466, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.2015867829322815, + "step": 4524 + }, + { + "epoch": 0.1414375, + "grad_norm": 4.125, + "grad_norm_var": 0.10485026041666666, + "learning_rate": 0.0001, + "loss": 6.5281, + "loss/crossentropy": 2.6377745866775513, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.22028282284736633, + "step": 4526 + }, + { + "epoch": 0.1415, + "grad_norm": 3.921875, + "grad_norm_var": 0.075390625, + "learning_rate": 0.0001, + "loss": 6.1738, + "loss/crossentropy": 2.4429298639297485, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.2086314857006073, + "step": 4528 + }, + { + "epoch": 0.1415625, + "grad_norm": 4.28125, + "grad_norm_var": 0.08909098307291667, + "learning_rate": 0.0001, + "loss": 6.9688, + "loss/crossentropy": 2.8351988792419434, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.23914079368114471, + "step": 4530 + }, + { + "epoch": 0.141625, + "grad_norm": 3.703125, + "grad_norm_var": 0.09009501139322916, + "learning_rate": 0.0001, + "loss": 6.333, + "loss/crossentropy": 2.566150426864624, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.21066495776176453, + "step": 4532 + }, + { + "epoch": 0.1416875, + "grad_norm": 4.125, + "grad_norm_var": 0.07337137858072916, + "learning_rate": 0.0001, + "loss": 6.6482, + "loss/crossentropy": 2.7197688817977905, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.22448784857988358, + "step": 4534 + }, + { + "epoch": 0.14175, + "grad_norm": 3.921875, + "grad_norm_var": 0.0838531494140625, + "learning_rate": 0.0001, + "loss": 6.5358, + "loss/crossentropy": 2.7000629901885986, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.21873365342617035, + "step": 4536 + }, + { + "epoch": 0.1418125, + "grad_norm": 3.421875, + "grad_norm_var": 0.1019439697265625, + "learning_rate": 0.0001, + "loss": 6.2915, + "loss/crossentropy": 2.5986807346343994, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.20560834556818008, + "step": 4538 + }, + { + "epoch": 0.141875, + "grad_norm": 3.546875, + "grad_norm_var": 0.09635009765625, + "learning_rate": 0.0001, + "loss": 6.5066, + "loss/crossentropy": 2.612900137901306, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.22178903222084045, + "step": 4540 + }, + { + "epoch": 0.1419375, + "grad_norm": 3.6875, + "grad_norm_var": 0.09374898274739583, + "learning_rate": 0.0001, + "loss": 6.4313, + "loss/crossentropy": 2.623916268348694, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.21199119836091995, + "step": 4542 + }, + { + "epoch": 0.142, + "grad_norm": 3.84375, + "grad_norm_var": 0.10750223795572916, + "learning_rate": 0.0001, + "loss": 6.2772, + "loss/crossentropy": 2.52363121509552, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.20778058469295502, + "step": 4544 + }, + { + "epoch": 0.1420625, + "grad_norm": 3.671875, + "grad_norm_var": 0.06940104166666666, + "learning_rate": 0.0001, + "loss": 6.5497, + "loss/crossentropy": 2.559391736984253, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.22676189988851547, + "step": 4546 + }, + { + "epoch": 0.142125, + "grad_norm": 3.859375, + "grad_norm_var": 0.07356363932291667, + "learning_rate": 0.0001, + "loss": 6.2551, + "loss/crossentropy": 2.5050313472747803, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.2046976387500763, + "step": 4548 + }, + { + "epoch": 0.1421875, + "grad_norm": 4.46875, + "grad_norm_var": 0.10657552083333334, + "learning_rate": 0.0001, + "loss": 6.7595, + "loss/crossentropy": 2.6757744550704956, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.2353217825293541, + "step": 4550 + }, + { + "epoch": 0.14225, + "grad_norm": 4.46875, + "grad_norm_var": 0.12874247233072916, + "learning_rate": 0.0001, + "loss": 6.6681, + "loss/crossentropy": 2.7467669248580933, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.2179144099354744, + "step": 4552 + }, + { + "epoch": 0.1423125, + "grad_norm": 4.28125, + "grad_norm_var": 0.13662109375, + "learning_rate": 0.0001, + "loss": 6.3387, + "loss/crossentropy": 2.508995532989502, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.21773157268762589, + "step": 4554 + }, + { + "epoch": 0.142375, + "grad_norm": 3.59375, + "grad_norm_var": 0.135107421875, + "learning_rate": 0.0001, + "loss": 6.3024, + "loss/crossentropy": 2.5592448711395264, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.21025492250919342, + "step": 4556 + }, + { + "epoch": 0.1424375, + "grad_norm": 4.25, + "grad_norm_var": 0.25068359375, + "learning_rate": 0.0001, + "loss": 6.3025, + "loss/crossentropy": 2.3946605920791626, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.22242514044046402, + "step": 4558 + }, + { + "epoch": 0.1425, + "grad_norm": 4.0, + "grad_norm_var": 0.23028971354166666, + "learning_rate": 0.0001, + "loss": 6.4046, + "loss/crossentropy": 2.659841537475586, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.2076825425028801, + "step": 4560 + }, + { + "epoch": 0.1425625, + "grad_norm": 3.75, + "grad_norm_var": 0.2351226806640625, + "learning_rate": 0.0001, + "loss": 6.4888, + "loss/crossentropy": 2.6625062227249146, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.21622255444526672, + "step": 4562 + }, + { + "epoch": 0.142625, + "grad_norm": 3.4375, + "grad_norm_var": 0.25201416015625, + "learning_rate": 0.0001, + "loss": 6.1066, + "loss/crossentropy": 2.4543986320495605, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.20076348632574081, + "step": 4564 + }, + { + "epoch": 0.1426875, + "grad_norm": 3.515625, + "grad_norm_var": 0.2391998291015625, + "learning_rate": 0.0001, + "loss": 6.3698, + "loss/crossentropy": 2.6311250925064087, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20980965346097946, + "step": 4566 + }, + { + "epoch": 0.14275, + "grad_norm": 3.921875, + "grad_norm_var": 0.21142171223958334, + "learning_rate": 0.0001, + "loss": 6.4228, + "loss/crossentropy": 2.5519657135009766, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.21833696961402893, + "step": 4568 + }, + { + "epoch": 0.1428125, + "grad_norm": 3.75, + "grad_norm_var": 0.1912506103515625, + "learning_rate": 0.0001, + "loss": 6.3472, + "loss/crossentropy": 2.5183498859405518, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.2157001942396164, + "step": 4570 + }, + { + "epoch": 0.142875, + "grad_norm": 3.703125, + "grad_norm_var": 0.19122721354166666, + "learning_rate": 0.0001, + "loss": 6.2351, + "loss/crossentropy": 2.500234842300415, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.2113747000694275, + "step": 4572 + }, + { + "epoch": 0.1429375, + "grad_norm": 4.0, + "grad_norm_var": 0.037385050455729166, + "learning_rate": 0.0001, + "loss": 6.6098, + "loss/crossentropy": 2.7225894927978516, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.2203599065542221, + "step": 4574 + }, + { + "epoch": 0.143, + "grad_norm": 3.65625, + "grad_norm_var": 0.046662394205729166, + "learning_rate": 0.0001, + "loss": 6.3647, + "loss/crossentropy": 2.57330060005188, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.21117094904184341, + "step": 4576 + }, + { + "epoch": 0.1430625, + "grad_norm": 3.734375, + "grad_norm_var": 0.0470123291015625, + "learning_rate": 0.0001, + "loss": 6.621, + "loss/crossentropy": 2.7625339031219482, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.21905124187469482, + "step": 4578 + }, + { + "epoch": 0.143125, + "grad_norm": 3.890625, + "grad_norm_var": 0.0507720947265625, + "learning_rate": 0.0001, + "loss": 6.6931, + "loss/crossentropy": 2.7396358251571655, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.22463934868574142, + "step": 4580 + }, + { + "epoch": 0.1431875, + "grad_norm": 3.78125, + "grad_norm_var": 0.05603841145833333, + "learning_rate": 0.0001, + "loss": 6.3751, + "loss/crossentropy": 2.5076018571853638, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.21995484083890915, + "step": 4582 + }, + { + "epoch": 0.14325, + "grad_norm": 4.125, + "grad_norm_var": 0.060774739583333334, + "learning_rate": 0.0001, + "loss": 6.302, + "loss/crossentropy": 2.444732427597046, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.21190303564071655, + "step": 4584 + }, + { + "epoch": 0.1433125, + "grad_norm": 3.796875, + "grad_norm_var": 0.055597941080729164, + "learning_rate": 0.0001, + "loss": 6.6817, + "loss/crossentropy": 2.6860902309417725, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.22417040169239044, + "step": 4586 + }, + { + "epoch": 0.143375, + "grad_norm": 3.859375, + "grad_norm_var": 0.04543863932291667, + "learning_rate": 0.0001, + "loss": 6.3433, + "loss/crossentropy": 2.6071120500564575, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.20682349801063538, + "step": 4588 + }, + { + "epoch": 0.1434375, + "grad_norm": 3.96875, + "grad_norm_var": 0.04850972493489583, + "learning_rate": 0.0001, + "loss": 6.4867, + "loss/crossentropy": 2.5918766260147095, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.21916785836219788, + "step": 4590 + }, + { + "epoch": 0.1435, + "grad_norm": 3.59375, + "grad_norm_var": 0.04853413899739583, + "learning_rate": 0.0001, + "loss": 6.5322, + "loss/crossentropy": 2.661565065383911, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.21909621357917786, + "step": 4592 + }, + { + "epoch": 0.1435625, + "grad_norm": 3.5, + "grad_norm_var": 0.04719136555989583, + "learning_rate": 0.0001, + "loss": 6.1707, + "loss/crossentropy": 2.3326449394226074, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.21505358070135117, + "step": 4594 + }, + { + "epoch": 0.143625, + "grad_norm": 3.640625, + "grad_norm_var": 0.048876953125, + "learning_rate": 0.0001, + "loss": 6.1565, + "loss/crossentropy": 2.409466505050659, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.20829974859952927, + "step": 4596 + }, + { + "epoch": 0.1436875, + "grad_norm": 3.765625, + "grad_norm_var": 0.042210896809895836, + "learning_rate": 0.0001, + "loss": 6.5329, + "loss/crossentropy": 2.6367892026901245, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.21812911331653595, + "step": 4598 + }, + { + "epoch": 0.14375, + "grad_norm": 3.5625, + "grad_norm_var": 0.044896443684895836, + "learning_rate": 0.0001, + "loss": 6.4859, + "loss/crossentropy": 2.6948471069335938, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.21660198271274567, + "step": 4600 + }, + { + "epoch": 0.1438125, + "grad_norm": 3.84375, + "grad_norm_var": 0.03905843098958333, + "learning_rate": 0.0001, + "loss": 6.0582, + "loss/crossentropy": 2.3523894548416138, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.20027178525924683, + "step": 4602 + }, + { + "epoch": 0.143875, + "grad_norm": 3.6875, + "grad_norm_var": 0.0413970947265625, + "learning_rate": 0.0001, + "loss": 6.6664, + "loss/crossentropy": 2.769497036933899, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.22133028507232666, + "step": 4604 + }, + { + "epoch": 0.1439375, + "grad_norm": 3.84375, + "grad_norm_var": 0.032013956705729166, + "learning_rate": 0.0001, + "loss": 6.756, + "loss/crossentropy": 2.7452114820480347, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.23233232647180557, + "step": 4606 + }, + { + "epoch": 0.144, + "grad_norm": 4.5, + "grad_norm_var": 0.06319071451822916, + "learning_rate": 0.0001, + "loss": 6.6783, + "loss/crossentropy": 2.7978492975234985, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.21811985969543457, + "step": 4608 + }, + { + "epoch": 0.1440625, + "grad_norm": 3.90625, + "grad_norm_var": 0.0533203125, + "learning_rate": 0.0001, + "loss": 6.1483, + "loss/crossentropy": 2.4034503698349, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.20768387615680695, + "step": 4610 + }, + { + "epoch": 0.144125, + "grad_norm": 4.03125, + "grad_norm_var": 0.05178629557291667, + "learning_rate": 0.0001, + "loss": 6.3053, + "loss/crossentropy": 2.5535662174224854, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.20603449642658234, + "step": 4612 + }, + { + "epoch": 0.1441875, + "grad_norm": 3.90625, + "grad_norm_var": 0.059691365559895834, + "learning_rate": 0.0001, + "loss": 6.2352, + "loss/crossentropy": 2.566025137901306, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.19503897428512573, + "step": 4614 + }, + { + "epoch": 0.14425, + "grad_norm": 4.03125, + "grad_norm_var": 0.0542144775390625, + "learning_rate": 0.0001, + "loss": 6.5978, + "loss/crossentropy": 2.6479650735855103, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.22350239753723145, + "step": 4616 + }, + { + "epoch": 0.1443125, + "grad_norm": 3.78125, + "grad_norm_var": 0.0493560791015625, + "learning_rate": 0.0001, + "loss": 6.2562, + "loss/crossentropy": 2.5567132234573364, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20432090759277344, + "step": 4618 + }, + { + "epoch": 0.144375, + "grad_norm": 3.53125, + "grad_norm_var": 0.05799051920572917, + "learning_rate": 0.0001, + "loss": 6.1748, + "loss/crossentropy": 2.5087623596191406, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.2056633159518242, + "step": 4620 + }, + { + "epoch": 0.1444375, + "grad_norm": 3.90625, + "grad_norm_var": 0.0584136962890625, + "learning_rate": 0.0001, + "loss": 6.6676, + "loss/crossentropy": 2.757576823234558, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.21951927244663239, + "step": 4622 + }, + { + "epoch": 0.1445, + "grad_norm": 3.546875, + "grad_norm_var": 0.04504801432291667, + "learning_rate": 0.0001, + "loss": 6.4611, + "loss/crossentropy": 2.717313051223755, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.20914141833782196, + "step": 4624 + }, + { + "epoch": 0.1445625, + "grad_norm": 3.921875, + "grad_norm_var": 0.044722493489583334, + "learning_rate": 0.0001, + "loss": 6.5246, + "loss/crossentropy": 2.6257013082504272, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.22230887413024902, + "step": 4626 + }, + { + "epoch": 0.144625, + "grad_norm": 3.609375, + "grad_norm_var": 0.047652180989583334, + "learning_rate": 0.0001, + "loss": 6.5916, + "loss/crossentropy": 2.752093195915222, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.21911172568798065, + "step": 4628 + }, + { + "epoch": 0.1446875, + "grad_norm": 4.09375, + "grad_norm_var": 0.0403228759765625, + "learning_rate": 0.0001, + "loss": 6.4875, + "loss/crossentropy": 2.518881678581238, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.22381020337343216, + "step": 4630 + }, + { + "epoch": 0.14475, + "grad_norm": 3.5, + "grad_norm_var": 0.0376129150390625, + "learning_rate": 0.0001, + "loss": 6.2797, + "loss/crossentropy": 2.5216615200042725, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.2105671912431717, + "step": 4632 + }, + { + "epoch": 0.1448125, + "grad_norm": 4.75, + "grad_norm_var": 0.10459696451822917, + "learning_rate": 0.0001, + "loss": 6.6336, + "loss/crossentropy": 2.7041332721710205, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.22068022191524506, + "step": 4634 + }, + { + "epoch": 0.144875, + "grad_norm": 3.59375, + "grad_norm_var": 0.11144917805989583, + "learning_rate": 0.0001, + "loss": 6.2327, + "loss/crossentropy": 2.368388056755066, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.2153344452381134, + "step": 4636 + }, + { + "epoch": 0.1449375, + "grad_norm": 3.484375, + "grad_norm_var": 0.12156575520833333, + "learning_rate": 0.0001, + "loss": 5.9793, + "loss/crossentropy": 2.3809818029403687, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.19616173207759857, + "step": 4638 + }, + { + "epoch": 0.145, + "grad_norm": 4.125, + "grad_norm_var": 0.1231353759765625, + "learning_rate": 0.0001, + "loss": 6.7704, + "loss/crossentropy": 2.7761400938034058, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.22637692838907242, + "step": 4640 + }, + { + "epoch": 0.1450625, + "grad_norm": 4.21875, + "grad_norm_var": 0.1308990478515625, + "learning_rate": 0.0001, + "loss": 6.6359, + "loss/crossentropy": 2.667805314064026, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.2264966070652008, + "step": 4642 + }, + { + "epoch": 0.145125, + "grad_norm": 3.828125, + "grad_norm_var": 0.13297119140625, + "learning_rate": 0.0001, + "loss": 6.4757, + "loss/crossentropy": 2.7138952016830444, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.2097788080573082, + "step": 4644 + }, + { + "epoch": 0.1451875, + "grad_norm": 5.6875, + "grad_norm_var": 0.34869791666666666, + "learning_rate": 0.0001, + "loss": 6.4296, + "loss/crossentropy": 2.5671643018722534, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.2182699367403984, + "step": 4646 + }, + { + "epoch": 0.14525, + "grad_norm": 4.0625, + "grad_norm_var": 0.3304524739583333, + "learning_rate": 0.0001, + "loss": 6.3105, + "loss/crossentropy": 2.389971971511841, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.22017350792884827, + "step": 4648 + }, + { + "epoch": 0.1453125, + "grad_norm": 3.75, + "grad_norm_var": 0.2860514322916667, + "learning_rate": 0.0001, + "loss": 6.1683, + "loss/crossentropy": 2.426085114479065, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.20703301578760147, + "step": 4650 + }, + { + "epoch": 0.145375, + "grad_norm": 4.15625, + "grad_norm_var": 0.2794586181640625, + "learning_rate": 0.0001, + "loss": 6.3578, + "loss/crossentropy": 2.5354756116867065, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.2158234342932701, + "step": 4652 + }, + { + "epoch": 0.1454375, + "grad_norm": 3.921875, + "grad_norm_var": 0.26105855305989584, + "learning_rate": 0.0001, + "loss": 6.7593, + "loss/crossentropy": 2.630834221839905, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.2355065494775772, + "step": 4654 + }, + { + "epoch": 0.1455, + "grad_norm": 4.03125, + "grad_norm_var": 0.26048075358072914, + "learning_rate": 0.0001, + "loss": 6.8064, + "loss/crossentropy": 2.7368087768554688, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.23664291948080063, + "step": 4656 + }, + { + "epoch": 0.1455625, + "grad_norm": 4.6875, + "grad_norm_var": 0.40924479166666666, + "learning_rate": 0.0001, + "loss": 7.0822, + "loss/crossentropy": 2.8304264545440674, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.25134557485580444, + "step": 4658 + }, + { + "epoch": 0.145625, + "grad_norm": 3.9375, + "grad_norm_var": 0.4041015625, + "learning_rate": 0.0001, + "loss": 6.5154, + "loss/crossentropy": 2.655316472053528, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.21921440213918686, + "step": 4660 + }, + { + "epoch": 0.1456875, + "grad_norm": 4.0, + "grad_norm_var": 0.2482330322265625, + "learning_rate": 0.0001, + "loss": 6.4032, + "loss/crossentropy": 2.604591965675354, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.2158007100224495, + "step": 4662 + }, + { + "epoch": 0.14575, + "grad_norm": 3.703125, + "grad_norm_var": 0.2572550455729167, + "learning_rate": 0.0001, + "loss": 6.2784, + "loss/crossentropy": 2.573136806488037, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20646653324365616, + "step": 4664 + }, + { + "epoch": 0.1458125, + "grad_norm": 3.75, + "grad_norm_var": 0.244140625, + "learning_rate": 0.0001, + "loss": 6.6643, + "loss/crossentropy": 2.864218592643738, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.21437915414571762, + "step": 4666 + }, + { + "epoch": 0.145875, + "grad_norm": 3.796875, + "grad_norm_var": 0.23665364583333334, + "learning_rate": 0.0001, + "loss": 6.3439, + "loss/crossentropy": 2.5378594398498535, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.21380609273910522, + "step": 4668 + }, + { + "epoch": 0.1459375, + "grad_norm": 3.890625, + "grad_norm_var": 0.22810872395833334, + "learning_rate": 0.0001, + "loss": 6.4716, + "loss/crossentropy": 2.617180824279785, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.21786151826381683, + "step": 4670 + }, + { + "epoch": 0.146, + "grad_norm": 4.15625, + "grad_norm_var": 0.23333333333333334, + "learning_rate": 0.0001, + "loss": 6.6927, + "loss/crossentropy": 2.6598581075668335, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.22984379529953003, + "step": 4672 + }, + { + "epoch": 0.1460625, + "grad_norm": 4.03125, + "grad_norm_var": 0.0334136962890625, + "learning_rate": 0.0001, + "loss": 6.6473, + "loss/crossentropy": 2.679308295249939, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.22726821154356003, + "step": 4674 + }, + { + "epoch": 0.146125, + "grad_norm": 3.78125, + "grad_norm_var": 0.028499348958333334, + "learning_rate": 0.0001, + "loss": 6.4195, + "loss/crossentropy": 2.625803232192993, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.21217992156744003, + "step": 4676 + }, + { + "epoch": 0.1461875, + "grad_norm": 4.59375, + "grad_norm_var": 0.04967041015625, + "learning_rate": 0.0001, + "loss": 6.5957, + "loss/crossentropy": 2.7120320796966553, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.22117996215820312, + "step": 4678 + }, + { + "epoch": 0.14625, + "grad_norm": 3.515625, + "grad_norm_var": 0.0579254150390625, + "learning_rate": 0.0001, + "loss": 6.3614, + "loss/crossentropy": 2.6168389320373535, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.21117979288101196, + "step": 4680 + }, + { + "epoch": 0.1463125, + "grad_norm": 3.671875, + "grad_norm_var": 0.06334228515625, + "learning_rate": 0.0001, + "loss": 6.6885, + "loss/crossentropy": 2.751344323158264, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.2241879627108574, + "step": 4682 + }, + { + "epoch": 0.146375, + "grad_norm": 4.125, + "grad_norm_var": 0.08232421875, + "learning_rate": 0.0001, + "loss": 6.5906, + "loss/crossentropy": 2.7465096712112427, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.21292072534561157, + "step": 4684 + }, + { + "epoch": 0.1464375, + "grad_norm": 3.6875, + "grad_norm_var": 0.0863189697265625, + "learning_rate": 0.0001, + "loss": 6.5358, + "loss/crossentropy": 2.718324303627014, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.2133835405111313, + "step": 4686 + }, + { + "epoch": 0.1465, + "grad_norm": 3.40625, + "grad_norm_var": 0.09709370930989583, + "learning_rate": 0.0001, + "loss": 6.1771, + "loss/crossentropy": 2.439010500907898, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.20662237703800201, + "step": 4688 + }, + { + "epoch": 0.1465625, + "grad_norm": 3.90625, + "grad_norm_var": 0.10308837890625, + "learning_rate": 0.0001, + "loss": 6.5201, + "loss/crossentropy": 2.627174496650696, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.22014875710010529, + "step": 4690 + }, + { + "epoch": 0.146625, + "grad_norm": 3.765625, + "grad_norm_var": 0.10435791015625, + "learning_rate": 0.0001, + "loss": 6.6709, + "loss/crossentropy": 2.7912899255752563, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.21842972934246063, + "step": 4692 + }, + { + "epoch": 0.1466875, + "grad_norm": 3.65625, + "grad_norm_var": 0.070849609375, + "learning_rate": 0.0001, + "loss": 6.4942, + "loss/crossentropy": 2.630295991897583, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.21568869799375534, + "step": 4694 + }, + { + "epoch": 0.14675, + "grad_norm": 3.625, + "grad_norm_var": 0.08316650390625, + "learning_rate": 0.0001, + "loss": 6.0061, + "loss/crossentropy": 2.477609872817993, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19581755995750427, + "step": 4696 + }, + { + "epoch": 0.1468125, + "grad_norm": 3.5625, + "grad_norm_var": 0.07779541015625, + "learning_rate": 0.0001, + "loss": 6.6173, + "loss/crossentropy": 2.7599306106567383, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.21972202509641647, + "step": 4698 + }, + { + "epoch": 0.146875, + "grad_norm": 4.0, + "grad_norm_var": 0.06713765462239583, + "learning_rate": 0.0001, + "loss": 6.5485, + "loss/crossentropy": 2.7502459287643433, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.21458908915519714, + "step": 4700 + }, + { + "epoch": 0.1469375, + "grad_norm": 3.625, + "grad_norm_var": 0.0636871337890625, + "learning_rate": 0.0001, + "loss": 6.4994, + "loss/crossentropy": 2.6590317487716675, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.2160710096359253, + "step": 4702 + }, + { + "epoch": 0.147, + "grad_norm": 3.515625, + "grad_norm_var": 0.0592681884765625, + "learning_rate": 0.0001, + "loss": 6.2914, + "loss/crossentropy": 2.5735844373703003, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.20264513790607452, + "step": 4704 + }, + { + "epoch": 0.1470625, + "grad_norm": 3.625, + "grad_norm_var": 0.0452056884765625, + "learning_rate": 0.0001, + "loss": 6.3309, + "loss/crossentropy": 2.5724921226501465, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.20825912058353424, + "step": 4706 + }, + { + "epoch": 0.147125, + "grad_norm": 3.671875, + "grad_norm_var": 0.05728759765625, + "learning_rate": 0.0001, + "loss": 6.3636, + "loss/crossentropy": 2.558912515640259, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.20976246148347855, + "step": 4708 + }, + { + "epoch": 0.1471875, + "grad_norm": 3.96875, + "grad_norm_var": 0.055908203125, + "learning_rate": 0.0001, + "loss": 6.3413, + "loss/crossentropy": 2.588721752166748, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.21159029752016068, + "step": 4710 + }, + { + "epoch": 0.14725, + "grad_norm": 3.9375, + "grad_norm_var": 0.048981730143229166, + "learning_rate": 0.0001, + "loss": 6.4642, + "loss/crossentropy": 2.610605239868164, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.21856311708688736, + "step": 4712 + }, + { + "epoch": 0.1473125, + "grad_norm": 3.640625, + "grad_norm_var": 0.054963175455729166, + "learning_rate": 0.0001, + "loss": 6.4486, + "loss/crossentropy": 2.6900192499160767, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.21257588267326355, + "step": 4714 + }, + { + "epoch": 0.147375, + "grad_norm": 4.0, + "grad_norm_var": 0.05174051920572917, + "learning_rate": 0.0001, + "loss": 6.4907, + "loss/crossentropy": 2.6867319345474243, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.21203771233558655, + "step": 4716 + }, + { + "epoch": 0.1474375, + "grad_norm": 3.78125, + "grad_norm_var": 0.05563151041666667, + "learning_rate": 0.0001, + "loss": 5.9098, + "loss/crossentropy": 2.360137462615967, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.1932504028081894, + "step": 4718 + }, + { + "epoch": 0.1475, + "grad_norm": 3.671875, + "grad_norm_var": 0.05534566243489583, + "learning_rate": 0.0001, + "loss": 6.7295, + "loss/crossentropy": 2.795089840888977, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.2258637547492981, + "step": 4720 + }, + { + "epoch": 0.1475625, + "grad_norm": 4.53125, + "grad_norm_var": 0.32356669108072916, + "learning_rate": 0.0001, + "loss": 6.5211, + "loss/crossentropy": 2.617510437965393, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.21652990579605103, + "step": 4722 + }, + { + "epoch": 0.147625, + "grad_norm": 4.875, + "grad_norm_var": 0.36549072265625, + "learning_rate": 0.0001, + "loss": 6.7006, + "loss/crossentropy": 2.6605865955352783, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.23368670791387558, + "step": 4724 + }, + { + "epoch": 0.1476875, + "grad_norm": 4.125, + "grad_norm_var": 0.33739827473958334, + "learning_rate": 0.0001, + "loss": 6.5533, + "loss/crossentropy": 2.675244092941284, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.21476037055253983, + "step": 4726 + }, + { + "epoch": 0.14775, + "grad_norm": 3.625, + "grad_norm_var": 0.3437164306640625, + "learning_rate": 0.0001, + "loss": 6.3036, + "loss/crossentropy": 2.573203682899475, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.206244595348835, + "step": 4728 + }, + { + "epoch": 0.1478125, + "grad_norm": 3.390625, + "grad_norm_var": 0.3688629150390625, + "learning_rate": 0.0001, + "loss": 6.2489, + "loss/crossentropy": 2.5649033784866333, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.20082136988639832, + "step": 4730 + }, + { + "epoch": 0.147875, + "grad_norm": 3.90625, + "grad_norm_var": 0.3668853759765625, + "learning_rate": 0.0001, + "loss": 6.9884, + "loss/crossentropy": 2.994609832763672, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.22906502336263657, + "step": 4732 + }, + { + "epoch": 0.1479375, + "grad_norm": 3.78125, + "grad_norm_var": 0.3581451416015625, + "learning_rate": 0.0001, + "loss": 6.3501, + "loss/crossentropy": 2.52126944065094, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.2137421816587448, + "step": 4734 + }, + { + "epoch": 0.148, + "grad_norm": 3.828125, + "grad_norm_var": 0.3618316650390625, + "learning_rate": 0.0001, + "loss": 6.4362, + "loss/crossentropy": 2.6966384649276733, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.20872260630130768, + "step": 4736 + }, + { + "epoch": 0.1480625, + "grad_norm": 3.671875, + "grad_norm_var": 0.1119537353515625, + "learning_rate": 0.0001, + "loss": 6.4629, + "loss/crossentropy": 2.6376761198043823, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.214164137840271, + "step": 4738 + }, + { + "epoch": 0.148125, + "grad_norm": 3.625, + "grad_norm_var": 0.041792805989583334, + "learning_rate": 0.0001, + "loss": 6.1429, + "loss/crossentropy": 2.5099347829818726, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.19884376972913742, + "step": 4740 + }, + { + "epoch": 0.1481875, + "grad_norm": 4.125, + "grad_norm_var": 0.03687744140625, + "learning_rate": 0.0001, + "loss": 6.6726, + "loss/crossentropy": 2.6728895902633667, + "loss/hidden": 1.734375, + "loss/jsd": 0.0, + "loss/logits": 0.22653395682573318, + "step": 4742 + }, + { + "epoch": 0.14825, + "grad_norm": 3.578125, + "grad_norm_var": 0.03776753743489583, + "learning_rate": 0.0001, + "loss": 6.5611, + "loss/crossentropy": 2.78212308883667, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.21227127313613892, + "step": 4744 + }, + { + "epoch": 0.1483125, + "grad_norm": 3.765625, + "grad_norm_var": 0.029344685872395835, + "learning_rate": 0.0001, + "loss": 6.3416, + "loss/crossentropy": 2.6213202476501465, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.2087465226650238, + "step": 4746 + }, + { + "epoch": 0.148375, + "grad_norm": 3.484375, + "grad_norm_var": 0.030304972330729166, + "learning_rate": 0.0001, + "loss": 6.5042, + "loss/crossentropy": 2.7428064346313477, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.20972855389118195, + "step": 4748 + }, + { + "epoch": 0.1484375, + "grad_norm": 3.671875, + "grad_norm_var": 0.030304972330729166, + "learning_rate": 0.0001, + "loss": 6.6375, + "loss/crossentropy": 2.8026511669158936, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.2170819416642189, + "step": 4750 + }, + { + "epoch": 0.1485, + "grad_norm": 3.765625, + "grad_norm_var": 0.04433492024739583, + "learning_rate": 0.0001, + "loss": 6.5706, + "loss/crossentropy": 2.613227605819702, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.22503775358200073, + "step": 4752 + }, + { + "epoch": 0.1485625, + "grad_norm": 3.8125, + "grad_norm_var": 0.045491536458333336, + "learning_rate": 0.0001, + "loss": 6.5307, + "loss/crossentropy": 2.7017834186553955, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.2157081514596939, + "step": 4754 + }, + { + "epoch": 0.148625, + "grad_norm": 3.796875, + "grad_norm_var": 0.042822265625, + "learning_rate": 0.0001, + "loss": 6.6199, + "loss/crossentropy": 2.764488458633423, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.21756920218467712, + "step": 4756 + }, + { + "epoch": 0.1486875, + "grad_norm": 4.21875, + "grad_norm_var": 0.044921875, + "learning_rate": 0.0001, + "loss": 6.4397, + "loss/crossentropy": 2.6155864000320435, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.21835249662399292, + "step": 4758 + }, + { + "epoch": 0.14875, + "grad_norm": 3.9375, + "grad_norm_var": 0.06787821451822916, + "learning_rate": 0.0001, + "loss": 6.5694, + "loss/crossentropy": 2.645212173461914, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.2236703336238861, + "step": 4760 + }, + { + "epoch": 0.1488125, + "grad_norm": 3.65625, + "grad_norm_var": 0.148583984375, + "learning_rate": 0.0001, + "loss": 6.2737, + "loss/crossentropy": 2.5080801248550415, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.21133046597242355, + "step": 4762 + }, + { + "epoch": 0.148875, + "grad_norm": 5.125, + "grad_norm_var": 0.2266021728515625, + "learning_rate": 0.0001, + "loss": 6.5326, + "loss/crossentropy": 2.6351619958877563, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.22450867295265198, + "step": 4764 + }, + { + "epoch": 0.1489375, + "grad_norm": 4.21875, + "grad_norm_var": 0.21855061848958332, + "learning_rate": 0.0001, + "loss": 6.4739, + "loss/crossentropy": 2.570665121078491, + "loss/hidden": 1.75, + "loss/jsd": 0.0, + "loss/logits": 0.2153189554810524, + "step": 4766 + }, + { + "epoch": 0.149, + "grad_norm": 3.578125, + "grad_norm_var": 0.22913309733072917, + "learning_rate": 0.0001, + "loss": 6.3232, + "loss/crossentropy": 2.6073015928268433, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20831340551376343, + "step": 4768 + }, + { + "epoch": 0.1490625, + "grad_norm": 3.4375, + "grad_norm_var": 0.23277587890625, + "learning_rate": 0.0001, + "loss": 6.2837, + "loss/crossentropy": 2.4850372076034546, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.2072141021490097, + "step": 4770 + }, + { + "epoch": 0.149125, + "grad_norm": 3.859375, + "grad_norm_var": 0.22883707682291668, + "learning_rate": 0.0001, + "loss": 6.6157, + "loss/crossentropy": 2.6990227699279785, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.22292090207338333, + "step": 4772 + }, + { + "epoch": 0.1491875, + "grad_norm": 3.609375, + "grad_norm_var": 0.23316650390625, + "learning_rate": 0.0001, + "loss": 6.5108, + "loss/crossentropy": 2.7127143144607544, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.2157444953918457, + "step": 4774 + }, + { + "epoch": 0.14925, + "grad_norm": 4.53125, + "grad_norm_var": 0.25297749837239586, + "learning_rate": 0.0001, + "loss": 6.5796, + "loss/crossentropy": 2.648136258125305, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.2247898280620575, + "step": 4776 + }, + { + "epoch": 0.1493125, + "grad_norm": 3.84375, + "grad_norm_var": 0.19218648274739583, + "learning_rate": 0.0001, + "loss": 6.5274, + "loss/crossentropy": 2.7297213077545166, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.21101507544517517, + "step": 4778 + }, + { + "epoch": 0.149375, + "grad_norm": 3.953125, + "grad_norm_var": 0.10084635416666667, + "learning_rate": 0.0001, + "loss": 6.6353, + "loss/crossentropy": 2.7655104398727417, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.22174807637929916, + "step": 4780 + }, + { + "epoch": 0.1494375, + "grad_norm": 3.921875, + "grad_norm_var": 0.09071858723958333, + "learning_rate": 0.0001, + "loss": 6.5437, + "loss/crossentropy": 2.667617917060852, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.2176843211054802, + "step": 4782 + }, + { + "epoch": 0.1495, + "grad_norm": 4.0, + "grad_norm_var": 0.08983968098958334, + "learning_rate": 0.0001, + "loss": 6.3786, + "loss/crossentropy": 2.5850164890289307, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.21295025944709778, + "step": 4784 + }, + { + "epoch": 0.1495625, + "grad_norm": 3.703125, + "grad_norm_var": 0.09162495930989584, + "learning_rate": 0.0001, + "loss": 6.3162, + "loss/crossentropy": 2.6784101724624634, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.19736792147159576, + "step": 4786 + }, + { + "epoch": 0.149625, + "grad_norm": 4.4375, + "grad_norm_var": 0.11328837076822916, + "learning_rate": 0.0001, + "loss": 5.9967, + "loss/crossentropy": 2.2380075454711914, + "loss/hidden": 1.7578125, + "loss/jsd": 0.0, + "loss/logits": 0.20008385181427002, + "step": 4788 + }, + { + "epoch": 0.1496875, + "grad_norm": 3.8125, + "grad_norm_var": 0.1090728759765625, + "learning_rate": 0.0001, + "loss": 6.2627, + "loss/crossentropy": 2.4229001998901367, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.2160077840089798, + "step": 4790 + }, + { + "epoch": 0.14975, + "grad_norm": 3.765625, + "grad_norm_var": 0.07546284993489584, + "learning_rate": 0.0001, + "loss": 6.5056, + "loss/crossentropy": 2.6331783533096313, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.21693290770053864, + "step": 4792 + }, + { + "epoch": 0.1498125, + "grad_norm": 4.53125, + "grad_norm_var": 0.0869049072265625, + "learning_rate": 0.0001, + "loss": 6.306, + "loss/crossentropy": 2.5121694803237915, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.21297980844974518, + "step": 4794 + }, + { + "epoch": 0.149875, + "grad_norm": 4.03125, + "grad_norm_var": 0.08339436848958333, + "learning_rate": 0.0001, + "loss": 6.8366, + "loss/crossentropy": 2.779943823814392, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.22909872978925705, + "step": 4796 + }, + { + "epoch": 0.1499375, + "grad_norm": 7.3125, + "grad_norm_var": 0.8360677083333333, + "learning_rate": 0.0001, + "loss": 6.6624, + "loss/crossentropy": 2.6996039152145386, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.22987814247608185, + "step": 4798 + }, + { + "epoch": 0.15, + "grad_norm": 4.03125, + "grad_norm_var": 0.823681640625, + "learning_rate": 0.0001, + "loss": 6.5988, + "loss/crossentropy": 2.6779834032058716, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.2166868895292282, + "step": 4800 + }, + { + "epoch": 0.1500625, + "grad_norm": 3.40625, + "grad_norm_var": 0.8215159098307292, + "learning_rate": 0.0001, + "loss": 6.2327, + "loss/crossentropy": 2.5419305562973022, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20501716434955597, + "step": 4802 + }, + { + "epoch": 0.150125, + "grad_norm": 3.78125, + "grad_norm_var": 0.8072174072265625, + "learning_rate": 0.0001, + "loss": 6.3111, + "loss/crossentropy": 2.5460424423217773, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.21243851631879807, + "step": 4804 + }, + { + "epoch": 0.1501875, + "grad_norm": 3.875, + "grad_norm_var": 0.8091868082682292, + "learning_rate": 0.0001, + "loss": 6.3154, + "loss/crossentropy": 2.6478878259658813, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.19761145114898682, + "step": 4806 + }, + { + "epoch": 0.15025, + "grad_norm": 3.78125, + "grad_norm_var": 0.80947265625, + "learning_rate": 0.0001, + "loss": 6.159, + "loss/crossentropy": 2.370529294013977, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.2108783796429634, + "step": 4808 + }, + { + "epoch": 0.1503125, + "grad_norm": 3.75, + "grad_norm_var": 0.7909332275390625, + "learning_rate": 0.0001, + "loss": 6.3784, + "loss/crossentropy": 2.6217751502990723, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.213553749024868, + "step": 4810 + }, + { + "epoch": 0.150375, + "grad_norm": 4.34375, + "grad_norm_var": 0.8113515218098958, + "learning_rate": 0.0001, + "loss": 6.5287, + "loss/crossentropy": 2.6473978757858276, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.2217206507921219, + "step": 4812 + }, + { + "epoch": 0.1504375, + "grad_norm": 3.53125, + "grad_norm_var": 0.056640625, + "learning_rate": 0.0001, + "loss": 6.2518, + "loss/crossentropy": 2.496413826942444, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.21147903054952621, + "step": 4814 + }, + { + "epoch": 0.1505, + "grad_norm": 3.46875, + "grad_norm_var": 0.0618804931640625, + "learning_rate": 0.0001, + "loss": 6.4329, + "loss/crossentropy": 2.6252561807632446, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.21475353837013245, + "step": 4816 + }, + { + "epoch": 0.1505625, + "grad_norm": 3.75, + "grad_norm_var": 0.051493326822916664, + "learning_rate": 0.0001, + "loss": 6.3718, + "loss/crossentropy": 2.5250319242477417, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.2159259095788002, + "step": 4818 + }, + { + "epoch": 0.150625, + "grad_norm": 3.8125, + "grad_norm_var": 0.05698954264322917, + "learning_rate": 0.0001, + "loss": 6.0336, + "loss/crossentropy": 2.400040030479431, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.1992897242307663, + "step": 4820 + }, + { + "epoch": 0.1506875, + "grad_norm": 3.59375, + "grad_norm_var": 0.07403055826822917, + "learning_rate": 0.0001, + "loss": 6.4269, + "loss/crossentropy": 2.708932042121887, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20616790652275085, + "step": 4822 + }, + { + "epoch": 0.15075, + "grad_norm": 3.65625, + "grad_norm_var": 0.05953369140625, + "learning_rate": 0.0001, + "loss": 6.6234, + "loss/crossentropy": 2.6994398832321167, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.22286559641361237, + "step": 4824 + }, + { + "epoch": 0.1508125, + "grad_norm": 3.6875, + "grad_norm_var": 0.057840983072916664, + "learning_rate": 0.0001, + "loss": 6.4284, + "loss/crossentropy": 2.6506223678588867, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.21254274994134903, + "step": 4826 + }, + { + "epoch": 0.150875, + "grad_norm": 4.0625, + "grad_norm_var": 0.038248697916666664, + "learning_rate": 0.0001, + "loss": 6.4538, + "loss/crossentropy": 2.6030107736587524, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.21633025258779526, + "step": 4828 + }, + { + "epoch": 0.1509375, + "grad_norm": 4.125, + "grad_norm_var": 0.05157877604166667, + "learning_rate": 0.0001, + "loss": 6.5578, + "loss/crossentropy": 2.6313416957855225, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.22272682189941406, + "step": 4830 + }, + { + "epoch": 0.151, + "grad_norm": 4.3125, + "grad_norm_var": 0.07156575520833333, + "learning_rate": 0.0001, + "loss": 6.7107, + "loss/crossentropy": 2.744732975959778, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.22667010128498077, + "step": 4832 + }, + { + "epoch": 0.1510625, + "grad_norm": 3.84375, + "grad_norm_var": 0.07504781087239583, + "learning_rate": 0.0001, + "loss": 6.2812, + "loss/crossentropy": 2.538503885269165, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.20746999979019165, + "step": 4834 + }, + { + "epoch": 0.151125, + "grad_norm": 3.921875, + "grad_norm_var": 0.07955322265625, + "learning_rate": 0.0001, + "loss": 6.6262, + "loss/crossentropy": 2.7523258924484253, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.2178577333688736, + "step": 4836 + }, + { + "epoch": 0.1511875, + "grad_norm": 3.671875, + "grad_norm_var": 0.05794169108072917, + "learning_rate": 0.0001, + "loss": 6.544, + "loss/crossentropy": 2.660256505012512, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.21962185949087143, + "step": 4838 + }, + { + "epoch": 0.15125, + "grad_norm": 6.84375, + "grad_norm_var": 0.6006022135416667, + "learning_rate": 0.0001, + "loss": 6.2568, + "loss/crossentropy": 2.4511998891830444, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.21102909743785858, + "step": 4840 + }, + { + "epoch": 0.1513125, + "grad_norm": 3.5, + "grad_norm_var": 0.5956858317057292, + "learning_rate": 0.0001, + "loss": 6.4881, + "loss/crossentropy": 2.6805083751678467, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.21435106545686722, + "step": 4842 + }, + { + "epoch": 0.151375, + "grad_norm": 3.84375, + "grad_norm_var": 0.5873046875, + "learning_rate": 0.0001, + "loss": 6.1579, + "loss/crossentropy": 2.474646806716919, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20426137745380402, + "step": 4844 + }, + { + "epoch": 0.1514375, + "grad_norm": 3.515625, + "grad_norm_var": 0.6108561197916667, + "learning_rate": 0.0001, + "loss": 6.2489, + "loss/crossentropy": 2.508758306503296, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.2076030820608139, + "step": 4846 + }, + { + "epoch": 0.1515, + "grad_norm": 3.71875, + "grad_norm_var": 0.5954060872395833, + "learning_rate": 0.0001, + "loss": 6.6277, + "loss/crossentropy": 2.7554014921188354, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.2192564159631729, + "step": 4848 + }, + { + "epoch": 0.1515625, + "grad_norm": 3.671875, + "grad_norm_var": 0.6114491780598958, + "learning_rate": 0.0001, + "loss": 6.5457, + "loss/crossentropy": 2.6733874082565308, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.21731121093034744, + "step": 4850 + }, + { + "epoch": 0.151625, + "grad_norm": 3.578125, + "grad_norm_var": 0.6405181884765625, + "learning_rate": 0.0001, + "loss": 6.3359, + "loss/crossentropy": 2.6059813499450684, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.20932254940271378, + "step": 4852 + }, + { + "epoch": 0.1516875, + "grad_norm": 3.640625, + "grad_norm_var": 0.6582509358723958, + "learning_rate": 0.0001, + "loss": 6.4843, + "loss/crossentropy": 2.637516736984253, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.21631476283073425, + "step": 4854 + }, + { + "epoch": 0.15175, + "grad_norm": 5.5, + "grad_norm_var": 0.26500244140625, + "learning_rate": 0.0001, + "loss": 6.7869, + "loss/crossentropy": 2.763762354850769, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.2323966547846794, + "step": 4856 + }, + { + "epoch": 0.1518125, + "grad_norm": 4.125, + "grad_norm_var": 0.2546620686848958, + "learning_rate": 0.0001, + "loss": 6.5614, + "loss/crossentropy": 2.6742827892303467, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.22269795089960098, + "step": 4858 + }, + { + "epoch": 0.151875, + "grad_norm": 3.484375, + "grad_norm_var": 0.2902303059895833, + "learning_rate": 0.0001, + "loss": 6.5024, + "loss/crossentropy": 2.5040030479431152, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.22678908705711365, + "step": 4860 + }, + { + "epoch": 0.1519375, + "grad_norm": 3.859375, + "grad_norm_var": 0.28025716145833335, + "learning_rate": 0.0001, + "loss": 6.2587, + "loss/crossentropy": 2.4296271800994873, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.21181800961494446, + "step": 4862 + }, + { + "epoch": 0.152, + "grad_norm": 3.453125, + "grad_norm_var": 0.29153544108072915, + "learning_rate": 0.0001, + "loss": 6.3599, + "loss/crossentropy": 2.5853668451309204, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.21612919867038727, + "step": 4864 + }, + { + "epoch": 0.1520625, + "grad_norm": 4.09375, + "grad_norm_var": 0.2879842122395833, + "learning_rate": 0.0001, + "loss": 6.4013, + "loss/crossentropy": 2.6143546104431152, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.21033377200365067, + "step": 4866 + }, + { + "epoch": 0.152125, + "grad_norm": 3.6875, + "grad_norm_var": 0.26902567545572914, + "learning_rate": 0.0001, + "loss": 6.7236, + "loss/crossentropy": 2.8559582233428955, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.21606221795082092, + "step": 4868 + }, + { + "epoch": 0.1521875, + "grad_norm": 3.5, + "grad_norm_var": 0.2936920166015625, + "learning_rate": 0.0001, + "loss": 6.0866, + "loss/crossentropy": 2.5235257148742676, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.1965421736240387, + "step": 4870 + }, + { + "epoch": 0.15225, + "grad_norm": 3.734375, + "grad_norm_var": 0.101416015625, + "learning_rate": 0.0001, + "loss": 6.4072, + "loss/crossentropy": 2.604854464530945, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.21265756338834763, + "step": 4872 + }, + { + "epoch": 0.1523125, + "grad_norm": 3.734375, + "grad_norm_var": 0.09153238932291667, + "learning_rate": 0.0001, + "loss": 6.3704, + "loss/crossentropy": 2.5827693939208984, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.21430625021457672, + "step": 4874 + }, + { + "epoch": 0.152375, + "grad_norm": 3.78125, + "grad_norm_var": 0.0570953369140625, + "learning_rate": 0.0001, + "loss": 6.623, + "loss/crossentropy": 2.838082194328308, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.21247129142284393, + "step": 4876 + }, + { + "epoch": 0.1524375, + "grad_norm": 3.765625, + "grad_norm_var": 0.05123291015625, + "learning_rate": 0.0001, + "loss": 6.3011, + "loss/crossentropy": 2.579153895378113, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.21086709201335907, + "step": 4878 + }, + { + "epoch": 0.1525, + "grad_norm": 3.53125, + "grad_norm_var": 0.0462066650390625, + "learning_rate": 0.0001, + "loss": 6.1631, + "loss/crossentropy": 2.5194358825683594, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20147526264190674, + "step": 4880 + }, + { + "epoch": 0.1525625, + "grad_norm": 3.75, + "grad_norm_var": 0.037333170572916664, + "learning_rate": 0.0001, + "loss": 6.4245, + "loss/crossentropy": 2.5760059356689453, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.21532009541988373, + "step": 4882 + }, + { + "epoch": 0.152625, + "grad_norm": 3.6875, + "grad_norm_var": 0.03874409993489583, + "learning_rate": 0.0001, + "loss": 6.3237, + "loss/crossentropy": 2.5853145122528076, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.2058662474155426, + "step": 4884 + }, + { + "epoch": 0.1526875, + "grad_norm": 3.453125, + "grad_norm_var": 0.023014322916666666, + "learning_rate": 0.0001, + "loss": 6.1755, + "loss/crossentropy": 2.4950112104415894, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.20789121091365814, + "step": 4886 + }, + { + "epoch": 0.15275, + "grad_norm": 3.6875, + "grad_norm_var": 0.025788370768229166, + "learning_rate": 0.0001, + "loss": 6.2782, + "loss/crossentropy": 2.464964509010315, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.2090589702129364, + "step": 4888 + }, + { + "epoch": 0.1528125, + "grad_norm": 3.703125, + "grad_norm_var": 0.027586873372395834, + "learning_rate": 0.0001, + "loss": 6.72, + "loss/crossentropy": 2.7551660537719727, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.22772932052612305, + "step": 4890 + }, + { + "epoch": 0.152875, + "grad_norm": 3.546875, + "grad_norm_var": 0.020832316080729166, + "learning_rate": 0.0001, + "loss": 6.2148, + "loss/crossentropy": 2.508195996284485, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.2073819264769554, + "step": 4892 + }, + { + "epoch": 0.1529375, + "grad_norm": 3.8125, + "grad_norm_var": 0.022587076822916666, + "learning_rate": 0.0001, + "loss": 6.1928, + "loss/crossentropy": 2.421853542327881, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.21419940888881683, + "step": 4894 + }, + { + "epoch": 0.153, + "grad_norm": 3.921875, + "grad_norm_var": 0.0239654541015625, + "learning_rate": 0.0001, + "loss": 5.6587, + "loss/crossentropy": 2.0499314069747925, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.18783257901668549, + "step": 4896 + }, + { + "epoch": 0.1530625, + "grad_norm": 3.65625, + "grad_norm_var": 0.0221343994140625, + "learning_rate": 0.0001, + "loss": 6.3923, + "loss/crossentropy": 2.6016801595687866, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.2122626230120659, + "step": 4898 + }, + { + "epoch": 0.153125, + "grad_norm": 3.78125, + "grad_norm_var": 0.018407185872395832, + "learning_rate": 0.0001, + "loss": 6.5129, + "loss/crossentropy": 2.709871530532837, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.21819671988487244, + "step": 4900 + }, + { + "epoch": 0.1531875, + "grad_norm": 4.03125, + "grad_norm_var": 0.018163045247395832, + "learning_rate": 0.0001, + "loss": 6.4619, + "loss/crossentropy": 2.638934850692749, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.2151099443435669, + "step": 4902 + }, + { + "epoch": 0.15325, + "grad_norm": 4.3125, + "grad_norm_var": 0.03599853515625, + "learning_rate": 0.0001, + "loss": 6.5957, + "loss/crossentropy": 2.6319879293441772, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.22644442319869995, + "step": 4904 + }, + { + "epoch": 0.1533125, + "grad_norm": 3.734375, + "grad_norm_var": 0.07101236979166667, + "learning_rate": 0.0001, + "loss": 6.3658, + "loss/crossentropy": 2.5488661527633667, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.21215952187776566, + "step": 4906 + }, + { + "epoch": 0.153375, + "grad_norm": 3.546875, + "grad_norm_var": 0.0849761962890625, + "learning_rate": 0.0001, + "loss": 6.3781, + "loss/crossentropy": 2.5926159620285034, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.2129197046160698, + "step": 4908 + }, + { + "epoch": 0.1534375, + "grad_norm": 3.8125, + "grad_norm_var": 0.08209228515625, + "learning_rate": 0.0001, + "loss": 6.0521, + "loss/crossentropy": 2.3596653938293457, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.20322907716035843, + "step": 4910 + }, + { + "epoch": 0.1535, + "grad_norm": 3.578125, + "grad_norm_var": 0.09072265625, + "learning_rate": 0.0001, + "loss": 6.2831, + "loss/crossentropy": 2.5288031101226807, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20980124175548553, + "step": 4912 + }, + { + "epoch": 0.1535625, + "grad_norm": 3.90625, + "grad_norm_var": 0.08870442708333333, + "learning_rate": 0.0001, + "loss": 6.9371, + "loss/crossentropy": 2.849328398704529, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.23651330918073654, + "step": 4914 + }, + { + "epoch": 0.153625, + "grad_norm": 4.09375, + "grad_norm_var": 0.09208577473958333, + "learning_rate": 0.0001, + "loss": 6.5117, + "loss/crossentropy": 2.6502633094787598, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.21622441709041595, + "step": 4916 + }, + { + "epoch": 0.1536875, + "grad_norm": 4.0625, + "grad_norm_var": 0.09516499837239584, + "learning_rate": 0.0001, + "loss": 6.632, + "loss/crossentropy": 2.7815967798233032, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.21316128224134445, + "step": 4918 + }, + { + "epoch": 0.15375, + "grad_norm": 4.09375, + "grad_norm_var": 0.08580729166666666, + "learning_rate": 0.0001, + "loss": 6.5193, + "loss/crossentropy": 2.6187355518341064, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.22287128120660782, + "step": 4920 + }, + { + "epoch": 0.1538125, + "grad_norm": 3.890625, + "grad_norm_var": 0.0621978759765625, + "learning_rate": 0.0001, + "loss": 6.3034, + "loss/crossentropy": 2.581501007080078, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.21047167479991913, + "step": 4922 + }, + { + "epoch": 0.153875, + "grad_norm": 3.640625, + "grad_norm_var": 0.0472564697265625, + "learning_rate": 0.0001, + "loss": 6.4875, + "loss/crossentropy": 2.6864192485809326, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.21448803693056107, + "step": 4924 + }, + { + "epoch": 0.1539375, + "grad_norm": 5.53125, + "grad_norm_var": 0.24671223958333333, + "learning_rate": 0.0001, + "loss": 6.4715, + "loss/crossentropy": 2.5524847507476807, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.22275861352682114, + "step": 4926 + }, + { + "epoch": 0.154, + "grad_norm": 4.125, + "grad_norm_var": 0.2464019775390625, + "learning_rate": 0.0001, + "loss": 6.572, + "loss/crossentropy": 2.698154926300049, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.2194170281291008, + "step": 4928 + }, + { + "epoch": 0.1540625, + "grad_norm": 3.90625, + "grad_norm_var": 0.24670817057291666, + "learning_rate": 0.0001, + "loss": 6.699, + "loss/crossentropy": 2.7247055768966675, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.23062902688980103, + "step": 4930 + }, + { + "epoch": 0.154125, + "grad_norm": 3.578125, + "grad_norm_var": 0.24951070149739582, + "learning_rate": 0.0001, + "loss": 6.404, + "loss/crossentropy": 2.587713122367859, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.21404717862606049, + "step": 4932 + }, + { + "epoch": 0.1541875, + "grad_norm": 5.5, + "grad_norm_var": 0.43082275390625, + "learning_rate": 0.0001, + "loss": 6.9871, + "loss/crossentropy": 2.9728357791900635, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.22994327545166016, + "step": 4934 + }, + { + "epoch": 0.15425, + "grad_norm": 3.78125, + "grad_norm_var": 0.42596028645833334, + "learning_rate": 0.0001, + "loss": 6.3586, + "loss/crossentropy": 2.537160277366638, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.21651742607355118, + "step": 4936 + }, + { + "epoch": 0.1543125, + "grad_norm": 3.5, + "grad_norm_var": 0.4275716145833333, + "learning_rate": 0.0001, + "loss": 6.2248, + "loss/crossentropy": 2.5505727529525757, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.20569919794797897, + "step": 4938 + }, + { + "epoch": 0.154375, + "grad_norm": 3.6875, + "grad_norm_var": 0.4137603759765625, + "learning_rate": 0.0001, + "loss": 6.3691, + "loss/crossentropy": 2.5592960119247437, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.21262313425540924, + "step": 4940 + }, + { + "epoch": 0.1544375, + "grad_norm": 3.9375, + "grad_norm_var": 0.23670247395833333, + "learning_rate": 0.0001, + "loss": 6.1737, + "loss/crossentropy": 2.476612091064453, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20408708602190018, + "step": 4942 + }, + { + "epoch": 0.1545, + "grad_norm": 3.8125, + "grad_norm_var": 0.2263824462890625, + "learning_rate": 0.0001, + "loss": 6.6774, + "loss/crossentropy": 2.7931333780288696, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.2188974693417549, + "step": 4944 + }, + { + "epoch": 0.1545625, + "grad_norm": 3.59375, + "grad_norm_var": 0.23065999348958333, + "learning_rate": 0.0001, + "loss": 6.6406, + "loss/crossentropy": 2.8075019121170044, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.21455729007720947, + "step": 4946 + }, + { + "epoch": 0.154625, + "grad_norm": 3.28125, + "grad_norm_var": 0.2508697509765625, + "learning_rate": 0.0001, + "loss": 6.2334, + "loss/crossentropy": 2.5928040742874146, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.2019491121172905, + "step": 4948 + }, + { + "epoch": 0.1546875, + "grad_norm": 3.59375, + "grad_norm_var": 0.04607747395833333, + "learning_rate": 0.0001, + "loss": 6.4066, + "loss/crossentropy": 2.6578075885772705, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.21042943745851517, + "step": 4950 + }, + { + "epoch": 0.15475, + "grad_norm": 3.765625, + "grad_norm_var": 0.053564453125, + "learning_rate": 0.0001, + "loss": 6.5547, + "loss/crossentropy": 2.6699572801589966, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.2216789573431015, + "step": 4952 + }, + { + "epoch": 0.1548125, + "grad_norm": 3.1875, + "grad_norm_var": 0.069580078125, + "learning_rate": 0.0001, + "loss": 6.1558, + "loss/crossentropy": 2.5289593935012817, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20018472522497177, + "step": 4954 + }, + { + "epoch": 0.154875, + "grad_norm": 3.84375, + "grad_norm_var": 0.061498006184895836, + "learning_rate": 0.0001, + "loss": 6.4415, + "loss/crossentropy": 2.6500484943389893, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.21507825702428818, + "step": 4956 + }, + { + "epoch": 0.1549375, + "grad_norm": 3.953125, + "grad_norm_var": 0.09381103515625, + "learning_rate": 0.0001, + "loss": 6.7442, + "loss/crossentropy": 2.700336456298828, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.2379828542470932, + "step": 4958 + }, + { + "epoch": 0.155, + "grad_norm": 3.8125, + "grad_norm_var": 0.088623046875, + "learning_rate": 0.0001, + "loss": 6.3364, + "loss/crossentropy": 2.595617890357971, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.20767300575971603, + "step": 4960 + }, + { + "epoch": 0.1550625, + "grad_norm": 3.9375, + "grad_norm_var": 0.0892974853515625, + "learning_rate": 0.0001, + "loss": 6.4777, + "loss/crossentropy": 2.6588134765625, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.21509408205747604, + "step": 4962 + }, + { + "epoch": 0.155125, + "grad_norm": 4.875, + "grad_norm_var": 0.15546468098958333, + "learning_rate": 0.0001, + "loss": 6.0125, + "loss/crossentropy": 2.2977291345596313, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.20546399056911469, + "step": 4964 + }, + { + "epoch": 0.1551875, + "grad_norm": 3.578125, + "grad_norm_var": 0.15156148274739584, + "learning_rate": 0.0001, + "loss": 6.2089, + "loss/crossentropy": 2.4438865184783936, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.2104882299900055, + "step": 4966 + }, + { + "epoch": 0.15525, + "grad_norm": 3.84375, + "grad_norm_var": 0.15627848307291667, + "learning_rate": 0.0001, + "loss": 6.6539, + "loss/crossentropy": 2.769439458847046, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.22008167952299118, + "step": 4968 + }, + { + "epoch": 0.1553125, + "grad_norm": 3.375, + "grad_norm_var": 0.13815104166666667, + "learning_rate": 0.0001, + "loss": 6.3455, + "loss/crossentropy": 2.6334996223449707, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.202054463326931, + "step": 4970 + }, + { + "epoch": 0.155375, + "grad_norm": 3.515625, + "grad_norm_var": 0.15764058430989583, + "learning_rate": 0.0001, + "loss": 6.1166, + "loss/crossentropy": 2.5081863403320312, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19794577360153198, + "step": 4972 + }, + { + "epoch": 0.1554375, + "grad_norm": 3.640625, + "grad_norm_var": 0.13384501139322916, + "learning_rate": 0.0001, + "loss": 6.3389, + "loss/crossentropy": 2.5470728874206543, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.21004494279623032, + "step": 4974 + }, + { + "epoch": 0.1555, + "grad_norm": 3.4375, + "grad_norm_var": 0.13826395670572916, + "learning_rate": 0.0001, + "loss": 6.344, + "loss/crossentropy": 2.6228668689727783, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.20609334856271744, + "step": 4976 + }, + { + "epoch": 0.1555625, + "grad_norm": 3.796875, + "grad_norm_var": 0.13590494791666666, + "learning_rate": 0.0001, + "loss": 6.6796, + "loss/crossentropy": 2.734183669090271, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.22305253148078918, + "step": 4978 + }, + { + "epoch": 0.155625, + "grad_norm": 3.515625, + "grad_norm_var": 0.05121968587239583, + "learning_rate": 0.0001, + "loss": 6.2023, + "loss/crossentropy": 2.4566444158554077, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.21010775864124298, + "step": 4980 + }, + { + "epoch": 0.1556875, + "grad_norm": 3.875, + "grad_norm_var": 0.05627848307291667, + "learning_rate": 0.0001, + "loss": 6.1007, + "loss/crossentropy": 2.3971216678619385, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.20434706658124924, + "step": 4982 + }, + { + "epoch": 0.15575, + "grad_norm": 4.25, + "grad_norm_var": 0.062841796875, + "learning_rate": 0.0001, + "loss": 6.3932, + "loss/crossentropy": 2.6078414916992188, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.20979001373052597, + "step": 4984 + }, + { + "epoch": 0.1558125, + "grad_norm": 3.6875, + "grad_norm_var": 0.05384012858072917, + "learning_rate": 0.0001, + "loss": 6.1757, + "loss/crossentropy": 2.5649302005767822, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.19623598456382751, + "step": 4986 + }, + { + "epoch": 0.155875, + "grad_norm": 3.5625, + "grad_norm_var": 0.05051981608072917, + "learning_rate": 0.0001, + "loss": 6.3125, + "loss/crossentropy": 2.5719149112701416, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.21155717223882675, + "step": 4988 + }, + { + "epoch": 0.1559375, + "grad_norm": 4.09375, + "grad_norm_var": 0.058649698893229164, + "learning_rate": 0.0001, + "loss": 6.2429, + "loss/crossentropy": 2.6092617511749268, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.2024226039648056, + "step": 4990 + }, + { + "epoch": 0.156, + "grad_norm": 4.3125, + "grad_norm_var": 0.07617899576822916, + "learning_rate": 0.0001, + "loss": 6.5649, + "loss/crossentropy": 2.692357659339905, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.21616382896900177, + "step": 4992 + }, + { + "epoch": 0.1560625, + "grad_norm": 3.53125, + "grad_norm_var": 0.0810699462890625, + "learning_rate": 0.0001, + "loss": 6.5541, + "loss/crossentropy": 2.740652918815613, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.21416035294532776, + "step": 4994 + }, + { + "epoch": 0.156125, + "grad_norm": 3.640625, + "grad_norm_var": 0.0776763916015625, + "learning_rate": 0.0001, + "loss": 6.0529, + "loss/crossentropy": 2.3745936155319214, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.1990818828344345, + "step": 4996 + }, + { + "epoch": 0.1561875, + "grad_norm": 3.484375, + "grad_norm_var": 0.07605794270833334, + "learning_rate": 0.0001, + "loss": 6.4075, + "loss/crossentropy": 2.6630691289901733, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.2076416164636612, + "step": 4998 + }, + { + "epoch": 0.15625, + "grad_norm": 3.421875, + "grad_norm_var": 0.061848958333333336, + "learning_rate": 0.0001, + "loss": 6.417, + "loss/crossentropy": 2.7289209365844727, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.20670168101787567, + "step": 5000 + }, + { + "epoch": 0.1563125, + "grad_norm": 3.625, + "grad_norm_var": 0.06197916666666667, + "learning_rate": 0.0001, + "loss": 6.3584, + "loss/crossentropy": 2.621297597885132, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.2092597633600235, + "step": 5002 + }, + { + "epoch": 0.156375, + "grad_norm": 3.890625, + "grad_norm_var": 0.12735087076822918, + "learning_rate": 0.0001, + "loss": 6.5332, + "loss/crossentropy": 2.6060941219329834, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.2196609079837799, + "step": 5004 + }, + { + "epoch": 0.1564375, + "grad_norm": 3.734375, + "grad_norm_var": 0.12423502604166667, + "learning_rate": 0.0001, + "loss": 6.1773, + "loss/crossentropy": 2.4523247480392456, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.2115599289536476, + "step": 5006 + }, + { + "epoch": 0.1565, + "grad_norm": 3.765625, + "grad_norm_var": 0.09846903483072916, + "learning_rate": 0.0001, + "loss": 6.41, + "loss/crossentropy": 2.568955659866333, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.21886961162090302, + "step": 5008 + }, + { + "epoch": 0.1565625, + "grad_norm": 3.484375, + "grad_norm_var": 0.09791666666666667, + "learning_rate": 0.0001, + "loss": 6.6024, + "loss/crossentropy": 2.722938656806946, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.21763822436332703, + "step": 5010 + }, + { + "epoch": 0.156625, + "grad_norm": 3.8125, + "grad_norm_var": 0.10121968587239584, + "learning_rate": 0.0001, + "loss": 6.335, + "loss/crossentropy": 2.561442732810974, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.207045778632164, + "step": 5012 + }, + { + "epoch": 0.1566875, + "grad_norm": 4.625, + "grad_norm_var": 0.14866129557291666, + "learning_rate": 0.0001, + "loss": 6.5044, + "loss/crossentropy": 2.5820696353912354, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.22582750767469406, + "step": 5014 + }, + { + "epoch": 0.15675, + "grad_norm": 6.03125, + "grad_norm_var": 0.44075419108072916, + "learning_rate": 0.0001, + "loss": 6.296, + "loss/crossentropy": 2.5115894079208374, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.20851974934339523, + "step": 5016 + }, + { + "epoch": 0.1568125, + "grad_norm": 3.9375, + "grad_norm_var": 0.4282297770182292, + "learning_rate": 0.0001, + "loss": 6.2359, + "loss/crossentropy": 2.5803922414779663, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.1971878930926323, + "step": 5018 + }, + { + "epoch": 0.156875, + "grad_norm": 4.15625, + "grad_norm_var": 1.7147420247395833, + "learning_rate": 0.0001, + "loss": 6.9218, + "loss/crossentropy": 2.7316008806228638, + "loss/hidden": 1.7890625, + "loss/jsd": 0.0, + "loss/logits": 0.24011238664388657, + "step": 5020 + }, + { + "epoch": 0.1569375, + "grad_norm": 3.78125, + "grad_norm_var": 1.6871907552083334, + "learning_rate": 0.0001, + "loss": 6.3541, + "loss/crossentropy": 2.5209031105041504, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.21574316918849945, + "step": 5022 + }, + { + "epoch": 0.157, + "grad_norm": 3.734375, + "grad_norm_var": 1.6809967041015625, + "learning_rate": 0.0001, + "loss": 6.4448, + "loss/crossentropy": 2.6470454931259155, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.21337257325649261, + "step": 5024 + }, + { + "epoch": 0.1570625, + "grad_norm": 3.75, + "grad_norm_var": 1.649006144205729, + "learning_rate": 0.0001, + "loss": 6.1475, + "loss/crossentropy": 2.424108862876892, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.2047611102461815, + "step": 5026 + }, + { + "epoch": 0.157125, + "grad_norm": 3.546875, + "grad_norm_var": 1.6721750895182292, + "learning_rate": 0.0001, + "loss": 6.2396, + "loss/crossentropy": 2.609367609024048, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.20130402594804764, + "step": 5028 + }, + { + "epoch": 0.1571875, + "grad_norm": 3.65625, + "grad_norm_var": 1.6649698893229166, + "learning_rate": 0.0001, + "loss": 6.5154, + "loss/crossentropy": 2.649414300918579, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.21628418564796448, + "step": 5030 + }, + { + "epoch": 0.15725, + "grad_norm": 3.34375, + "grad_norm_var": 1.4771443684895833, + "learning_rate": 0.0001, + "loss": 6.2335, + "loss/crossentropy": 2.515960216522217, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.20495706796646118, + "step": 5032 + }, + { + "epoch": 0.1573125, + "grad_norm": 3.84375, + "grad_norm_var": 1.50572509765625, + "learning_rate": 0.0001, + "loss": 6.6099, + "loss/crossentropy": 2.7881150245666504, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.2169434353709221, + "step": 5034 + }, + { + "epoch": 0.157375, + "grad_norm": 3.40625, + "grad_norm_var": 0.04423726399739583, + "learning_rate": 0.0001, + "loss": 6.3489, + "loss/crossentropy": 2.5991029739379883, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.20974069833755493, + "step": 5036 + }, + { + "epoch": 0.1574375, + "grad_norm": 3.3125, + "grad_norm_var": 0.05195210774739583, + "learning_rate": 0.0001, + "loss": 6.2484, + "loss/crossentropy": 2.5991371870040894, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.20281457901000977, + "step": 5038 + }, + { + "epoch": 0.1575, + "grad_norm": 3.703125, + "grad_norm_var": 0.048079427083333334, + "learning_rate": 0.0001, + "loss": 6.5411, + "loss/crossentropy": 2.719546914100647, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.21183981746435165, + "step": 5040 + }, + { + "epoch": 0.1575625, + "grad_norm": 3.890625, + "grad_norm_var": 0.043187459309895836, + "learning_rate": 0.0001, + "loss": 6.4863, + "loss/crossentropy": 2.7351890802383423, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.21221739053726196, + "step": 5042 + }, + { + "epoch": 0.157625, + "grad_norm": 3.5625, + "grad_norm_var": 0.0495758056640625, + "learning_rate": 0.0001, + "loss": 6.2946, + "loss/crossentropy": 2.523933529853821, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.2137833908200264, + "step": 5044 + }, + { + "epoch": 0.1576875, + "grad_norm": 3.4375, + "grad_norm_var": 0.04592692057291667, + "learning_rate": 0.0001, + "loss": 6.0445, + "loss/crossentropy": 2.4377795457839966, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.19817539304494858, + "step": 5046 + }, + { + "epoch": 0.15775, + "grad_norm": 3.96875, + "grad_norm_var": 0.05283915201822917, + "learning_rate": 0.0001, + "loss": 5.9443, + "loss/crossentropy": 2.4297016859054565, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.184275820851326, + "step": 5048 + }, + { + "epoch": 0.1578125, + "grad_norm": 3.65625, + "grad_norm_var": 0.04575907389322917, + "learning_rate": 0.0001, + "loss": 6.0538, + "loss/crossentropy": 2.4386450052261353, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.19511238485574722, + "step": 5050 + }, + { + "epoch": 0.157875, + "grad_norm": 3.390625, + "grad_norm_var": 0.06670633951822917, + "learning_rate": 0.0001, + "loss": 6.4905, + "loss/crossentropy": 2.6591763496398926, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.21672768890857697, + "step": 5052 + }, + { + "epoch": 0.1579375, + "grad_norm": 3.71875, + "grad_norm_var": 0.06217041015625, + "learning_rate": 0.0001, + "loss": 6.4167, + "loss/crossentropy": 2.7084654569625854, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.20637103915214539, + "step": 5054 + }, + { + "epoch": 0.158, + "grad_norm": 3.375, + "grad_norm_var": 0.0797027587890625, + "learning_rate": 0.0001, + "loss": 6.4931, + "loss/crossentropy": 2.6809680461883545, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.21324515342712402, + "step": 5056 + }, + { + "epoch": 0.1580625, + "grad_norm": 4.0625, + "grad_norm_var": 0.12431233723958333, + "learning_rate": 0.0001, + "loss": 6.6414, + "loss/crossentropy": 2.652488112449646, + "loss/hidden": 1.75390625, + "loss/jsd": 0.0, + "loss/logits": 0.22349800169467926, + "step": 5058 + }, + { + "epoch": 0.158125, + "grad_norm": 3.46875, + "grad_norm_var": 0.12433980305989584, + "learning_rate": 0.0001, + "loss": 6.3064, + "loss/crossentropy": 2.605587124824524, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20680193603038788, + "step": 5060 + }, + { + "epoch": 0.1581875, + "grad_norm": 3.5, + "grad_norm_var": 0.12500712076822917, + "learning_rate": 0.0001, + "loss": 5.9345, + "loss/crossentropy": 2.387421131134033, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19493824988603592, + "step": 5062 + }, + { + "epoch": 0.15825, + "grad_norm": 3.84375, + "grad_norm_var": 0.12653706868489584, + "learning_rate": 0.0001, + "loss": 6.6495, + "loss/crossentropy": 2.7843059301376343, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.21698807924985886, + "step": 5064 + }, + { + "epoch": 0.1583125, + "grad_norm": 3.6875, + "grad_norm_var": 0.13125712076822918, + "learning_rate": 0.0001, + "loss": 6.1758, + "loss/crossentropy": 2.5535526275634766, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.19894598424434662, + "step": 5066 + }, + { + "epoch": 0.158375, + "grad_norm": 3.9375, + "grad_norm_var": 0.11034749348958334, + "learning_rate": 0.0001, + "loss": 6.1638, + "loss/crossentropy": 2.449795961380005, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.20772428065538406, + "step": 5068 + }, + { + "epoch": 0.1584375, + "grad_norm": 4.125, + "grad_norm_var": 0.10965067545572917, + "learning_rate": 0.0001, + "loss": 6.7174, + "loss/crossentropy": 2.7422256469726562, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.22837365418672562, + "step": 5070 + }, + { + "epoch": 0.1585, + "grad_norm": 3.34375, + "grad_norm_var": 0.10690104166666667, + "learning_rate": 0.0001, + "loss": 6.3429, + "loss/crossentropy": 2.657220244407654, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.20646213740110397, + "step": 5072 + }, + { + "epoch": 0.1585625, + "grad_norm": 3.921875, + "grad_norm_var": 0.074755859375, + "learning_rate": 0.0001, + "loss": 6.6273, + "loss/crossentropy": 2.8043344020843506, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.2178436666727066, + "step": 5074 + }, + { + "epoch": 0.158625, + "grad_norm": 3.359375, + "grad_norm_var": 0.07821858723958333, + "learning_rate": 0.0001, + "loss": 5.8211, + "loss/crossentropy": 2.3485565185546875, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.18475934863090515, + "step": 5076 + }, + { + "epoch": 0.1586875, + "grad_norm": 3.28125, + "grad_norm_var": 0.08338114420572916, + "learning_rate": 0.0001, + "loss": 6.1521, + "loss/crossentropy": 2.469932198524475, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.2014240026473999, + "step": 5078 + }, + { + "epoch": 0.15875, + "grad_norm": 4.375, + "grad_norm_var": 0.1011871337890625, + "learning_rate": 0.0001, + "loss": 6.8118, + "loss/crossentropy": 2.7964015007019043, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.23201094567775726, + "step": 5080 + }, + { + "epoch": 0.1588125, + "grad_norm": 3.609375, + "grad_norm_var": 0.10099283854166667, + "learning_rate": 0.0001, + "loss": 6.3316, + "loss/crossentropy": 2.656257152557373, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20347124338150024, + "step": 5082 + }, + { + "epoch": 0.158875, + "grad_norm": 3.640625, + "grad_norm_var": 0.10268452962239584, + "learning_rate": 0.0001, + "loss": 6.2924, + "loss/crossentropy": 2.5780434608459473, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20854273438453674, + "step": 5084 + }, + { + "epoch": 0.1589375, + "grad_norm": 3.9375, + "grad_norm_var": 0.08918863932291667, + "learning_rate": 0.0001, + "loss": 6.2844, + "loss/crossentropy": 2.5688424110412598, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.2070987969636917, + "step": 5086 + }, + { + "epoch": 0.159, + "grad_norm": 3.484375, + "grad_norm_var": 0.081298828125, + "learning_rate": 0.0001, + "loss": 6.4102, + "loss/crossentropy": 2.6852883100509644, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20686575770378113, + "step": 5088 + }, + { + "epoch": 0.1590625, + "grad_norm": 3.75, + "grad_norm_var": 0.28347981770833336, + "learning_rate": 0.0001, + "loss": 6.2168, + "loss/crossentropy": 2.4014971256256104, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.21551689505577087, + "step": 5090 + }, + { + "epoch": 0.159125, + "grad_norm": 3.84375, + "grad_norm_var": 0.2693684895833333, + "learning_rate": 0.0001, + "loss": 6.2445, + "loss/crossentropy": 2.539799928665161, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.2087467536330223, + "step": 5092 + }, + { + "epoch": 0.1591875, + "grad_norm": 4.15625, + "grad_norm_var": 0.25432942708333334, + "learning_rate": 0.0001, + "loss": 6.5523, + "loss/crossentropy": 2.720983147621155, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.21946312487125397, + "step": 5094 + }, + { + "epoch": 0.15925, + "grad_norm": 3.625, + "grad_norm_var": 0.23621419270833333, + "learning_rate": 0.0001, + "loss": 6.062, + "loss/crossentropy": 2.464852809906006, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.19800060987472534, + "step": 5096 + }, + { + "epoch": 0.1593125, + "grad_norm": 3.921875, + "grad_norm_var": 0.22683919270833333, + "learning_rate": 0.0001, + "loss": 6.3407, + "loss/crossentropy": 2.5605088472366333, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.21160940825939178, + "step": 5098 + }, + { + "epoch": 0.159375, + "grad_norm": 4.0, + "grad_norm_var": 0.22457275390625, + "learning_rate": 0.0001, + "loss": 6.0553, + "loss/crossentropy": 2.394958972930908, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.2047068253159523, + "step": 5100 + }, + { + "epoch": 0.1594375, + "grad_norm": 3.515625, + "grad_norm_var": 0.238427734375, + "learning_rate": 0.0001, + "loss": 6.1042, + "loss/crossentropy": 2.592382788658142, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.1890685334801674, + "step": 5102 + }, + { + "epoch": 0.1595, + "grad_norm": 3.875, + "grad_norm_var": 0.23775634765625, + "learning_rate": 0.0001, + "loss": 6.3744, + "loss/crossentropy": 2.544821858406067, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.2165486440062523, + "step": 5104 + }, + { + "epoch": 0.1595625, + "grad_norm": 3.875, + "grad_norm_var": 0.042023722330729166, + "learning_rate": 0.0001, + "loss": 6.4397, + "loss/crossentropy": 2.6532318592071533, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.20872486382722855, + "step": 5106 + }, + { + "epoch": 0.159625, + "grad_norm": 3.625, + "grad_norm_var": 0.042723592122395834, + "learning_rate": 0.0001, + "loss": 6.248, + "loss/crossentropy": 2.629621386528015, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.1934749037027359, + "step": 5108 + }, + { + "epoch": 0.1596875, + "grad_norm": 3.3125, + "grad_norm_var": 0.0379791259765625, + "learning_rate": 0.0001, + "loss": 6.0829, + "loss/crossentropy": 2.38829243183136, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20539657771587372, + "step": 5110 + }, + { + "epoch": 0.15975, + "grad_norm": 3.96875, + "grad_norm_var": 0.040934244791666664, + "learning_rate": 0.0001, + "loss": 6.3522, + "loss/crossentropy": 2.6986857652664185, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.200894795358181, + "step": 5112 + }, + { + "epoch": 0.1598125, + "grad_norm": 3.5, + "grad_norm_var": 0.04039713541666667, + "learning_rate": 0.0001, + "loss": 6.328, + "loss/crossentropy": 2.610170006752014, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20928016304969788, + "step": 5114 + }, + { + "epoch": 0.159875, + "grad_norm": 3.25, + "grad_norm_var": 0.04309794108072917, + "learning_rate": 0.0001, + "loss": 6.1676, + "loss/crossentropy": 2.5283877849578857, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.2021999955177307, + "step": 5116 + }, + { + "epoch": 0.1599375, + "grad_norm": 3.625, + "grad_norm_var": 0.04107666015625, + "learning_rate": 0.0001, + "loss": 6.0732, + "loss/crossentropy": 2.3941776752471924, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.1999344378709793, + "step": 5118 + }, + { + "epoch": 0.16, + "grad_norm": 3.84375, + "grad_norm_var": 0.04444071451822917, + "learning_rate": 0.0001, + "loss": 6.2364, + "loss/crossentropy": 2.4882354736328125, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.20841020345687866, + "step": 5120 + }, + { + "epoch": 0.1600625, + "grad_norm": 3.9375, + "grad_norm_var": 0.04726460774739583, + "learning_rate": 0.0001, + "loss": 6.0552, + "loss/crossentropy": 2.391435742378235, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.19997309893369675, + "step": 5122 + }, + { + "epoch": 0.160125, + "grad_norm": 4.1875, + "grad_norm_var": 0.06616109212239583, + "learning_rate": 0.0001, + "loss": 6.3829, + "loss/crossentropy": 2.4768353700637817, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.218342125415802, + "step": 5124 + }, + { + "epoch": 0.1601875, + "grad_norm": 3.953125, + "grad_norm_var": 0.060301717122395834, + "learning_rate": 0.0001, + "loss": 6.5424, + "loss/crossentropy": 2.626850724220276, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.22007066011428833, + "step": 5126 + }, + { + "epoch": 0.16025, + "grad_norm": 4.0625, + "grad_norm_var": 0.07681884765625, + "learning_rate": 0.0001, + "loss": 6.2302, + "loss/crossentropy": 2.40298068523407, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.21827350556850433, + "step": 5128 + }, + { + "epoch": 0.1603125, + "grad_norm": 3.640625, + "grad_norm_var": 0.0725494384765625, + "learning_rate": 0.0001, + "loss": 6.5084, + "loss/crossentropy": 2.6761317253112793, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.21799086779356003, + "step": 5130 + }, + { + "epoch": 0.160375, + "grad_norm": 4.375, + "grad_norm_var": 0.061432902018229166, + "learning_rate": 0.0001, + "loss": 6.4369, + "loss/crossentropy": 2.5701708793640137, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.22065787762403488, + "step": 5132 + }, + { + "epoch": 0.1604375, + "grad_norm": 3.640625, + "grad_norm_var": 0.05025634765625, + "learning_rate": 0.0001, + "loss": 6.3835, + "loss/crossentropy": 2.5875306129455566, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.2104513794183731, + "step": 5134 + }, + { + "epoch": 0.1605, + "grad_norm": 3.609375, + "grad_norm_var": 0.05621337890625, + "learning_rate": 0.0001, + "loss": 6.4553, + "loss/crossentropy": 2.675415873527527, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.20962948352098465, + "step": 5136 + }, + { + "epoch": 0.1605625, + "grad_norm": 4.0625, + "grad_norm_var": 0.0486328125, + "learning_rate": 0.0001, + "loss": 6.6254, + "loss/crossentropy": 2.7770951986312866, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.2231074795126915, + "step": 5138 + }, + { + "epoch": 0.160625, + "grad_norm": 4.15625, + "grad_norm_var": 0.0473297119140625, + "learning_rate": 0.0001, + "loss": 6.4635, + "loss/crossentropy": 2.665207266807556, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.21224810183048248, + "step": 5140 + }, + { + "epoch": 0.1606875, + "grad_norm": 3.921875, + "grad_norm_var": 0.07489827473958334, + "learning_rate": 0.0001, + "loss": 6.5802, + "loss/crossentropy": 2.6273059844970703, + "loss/hidden": 1.765625, + "loss/jsd": 0.0, + "loss/logits": 0.2187241166830063, + "step": 5142 + }, + { + "epoch": 0.16075, + "grad_norm": 3.625, + "grad_norm_var": 0.07668863932291667, + "learning_rate": 0.0001, + "loss": 6.4066, + "loss/crossentropy": 2.621706962585449, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.2089575156569481, + "step": 5144 + }, + { + "epoch": 0.1608125, + "grad_norm": 3.390625, + "grad_norm_var": 0.09207255045572917, + "learning_rate": 0.0001, + "loss": 6.0105, + "loss/crossentropy": 2.5059956312179565, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.18990148603916168, + "step": 5146 + }, + { + "epoch": 0.160875, + "grad_norm": 4.0, + "grad_norm_var": 0.08333231608072916, + "learning_rate": 0.0001, + "loss": 6.1947, + "loss/crossentropy": 2.434074878692627, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.20887398719787598, + "step": 5148 + }, + { + "epoch": 0.1609375, + "grad_norm": 3.953125, + "grad_norm_var": 0.11112874348958333, + "learning_rate": 0.0001, + "loss": 6.348, + "loss/crossentropy": 2.608154535293579, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.2048449069261551, + "step": 5150 + }, + { + "epoch": 0.161, + "grad_norm": 3.578125, + "grad_norm_var": 0.12499593098958334, + "learning_rate": 0.0001, + "loss": 6.1689, + "loss/crossentropy": 2.5595574378967285, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.1984364241361618, + "step": 5152 + }, + { + "epoch": 0.1610625, + "grad_norm": 3.578125, + "grad_norm_var": 0.1310699462890625, + "learning_rate": 0.0001, + "loss": 6.2864, + "loss/crossentropy": 2.62160587310791, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.2051507607102394, + "step": 5154 + }, + { + "epoch": 0.161125, + "grad_norm": 4.09375, + "grad_norm_var": 0.18584696451822916, + "learning_rate": 0.0001, + "loss": 6.5913, + "loss/crossentropy": 2.7094404697418213, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.2155286818742752, + "step": 5156 + }, + { + "epoch": 0.1611875, + "grad_norm": 3.75, + "grad_norm_var": 0.1341217041015625, + "learning_rate": 0.0001, + "loss": 6.2557, + "loss/crossentropy": 2.498765468597412, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.21201671659946442, + "step": 5158 + }, + { + "epoch": 0.16125, + "grad_norm": 3.6875, + "grad_norm_var": 0.16846415201822917, + "learning_rate": 0.0001, + "loss": 6.4407, + "loss/crossentropy": 2.5842690467834473, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.21884340792894363, + "step": 5160 + }, + { + "epoch": 0.1613125, + "grad_norm": 3.625, + "grad_norm_var": 0.16340230305989584, + "learning_rate": 0.0001, + "loss": 6.2249, + "loss/crossentropy": 2.6013705730438232, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.2014119178056717, + "step": 5162 + }, + { + "epoch": 0.161375, + "grad_norm": 3.890625, + "grad_norm_var": 0.16494038899739583, + "learning_rate": 0.0001, + "loss": 6.4383, + "loss/crossentropy": 2.6550703048706055, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.21465323865413666, + "step": 5164 + }, + { + "epoch": 0.1614375, + "grad_norm": 3.9375, + "grad_norm_var": 0.1437896728515625, + "learning_rate": 0.0001, + "loss": 6.5286, + "loss/crossentropy": 2.627516508102417, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.2197948545217514, + "step": 5166 + }, + { + "epoch": 0.1615, + "grad_norm": 4.25, + "grad_norm_var": 0.13238525390625, + "learning_rate": 0.0001, + "loss": 6.5927, + "loss/crossentropy": 2.7336513996124268, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.21871402859687805, + "step": 5168 + }, + { + "epoch": 0.1615625, + "grad_norm": 3.875, + "grad_norm_var": 0.10852762858072916, + "learning_rate": 0.0001, + "loss": 6.4115, + "loss/crossentropy": 2.640623927116394, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.20834068953990936, + "step": 5170 + }, + { + "epoch": 0.161625, + "grad_norm": 3.96875, + "grad_norm_var": 0.07009175618489584, + "learning_rate": 0.0001, + "loss": 6.6146, + "loss/crossentropy": 2.7761088609695435, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.21548492461442947, + "step": 5172 + }, + { + "epoch": 0.1616875, + "grad_norm": 3.65625, + "grad_norm_var": 0.07420247395833333, + "learning_rate": 0.0001, + "loss": 6.3679, + "loss/crossentropy": 2.681834101676941, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20571385324001312, + "step": 5174 + }, + { + "epoch": 0.16175, + "grad_norm": 3.640625, + "grad_norm_var": 0.054743448893229164, + "learning_rate": 0.0001, + "loss": 6.4309, + "loss/crossentropy": 2.570289731025696, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.21769776940345764, + "step": 5176 + }, + { + "epoch": 0.1618125, + "grad_norm": 3.625, + "grad_norm_var": 0.06123758951822917, + "learning_rate": 0.0001, + "loss": 6.4081, + "loss/crossentropy": 2.6298660039901733, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.20868495106697083, + "step": 5178 + }, + { + "epoch": 0.161875, + "grad_norm": 3.859375, + "grad_norm_var": 0.05553385416666667, + "learning_rate": 0.0001, + "loss": 6.1179, + "loss/crossentropy": 2.452068328857422, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20329831540584564, + "step": 5180 + }, + { + "epoch": 0.1619375, + "grad_norm": 3.71875, + "grad_norm_var": 0.05422770182291667, + "learning_rate": 0.0001, + "loss": 6.2349, + "loss/crossentropy": 2.5371674299240112, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20415294915437698, + "step": 5182 + }, + { + "epoch": 0.162, + "grad_norm": 3.984375, + "grad_norm_var": 0.07447916666666667, + "learning_rate": 0.0001, + "loss": 6.5218, + "loss/crossentropy": 2.6421661376953125, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.21570029109716415, + "step": 5184 + }, + { + "epoch": 0.1620625, + "grad_norm": 3.703125, + "grad_norm_var": 0.07503153483072916, + "learning_rate": 0.0001, + "loss": 6.3243, + "loss/crossentropy": 2.581998825073242, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.208997443318367, + "step": 5186 + }, + { + "epoch": 0.162125, + "grad_norm": 3.703125, + "grad_norm_var": 0.068115234375, + "learning_rate": 0.0001, + "loss": 6.564, + "loss/crossentropy": 2.727616548538208, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.21722905337810516, + "step": 5188 + }, + { + "epoch": 0.1621875, + "grad_norm": 3.640625, + "grad_norm_var": 0.0645172119140625, + "learning_rate": 0.0001, + "loss": 6.8096, + "loss/crossentropy": 2.9005852937698364, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.22176381200551987, + "step": 5190 + }, + { + "epoch": 0.16225, + "grad_norm": 3.46875, + "grad_norm_var": 0.07733968098958334, + "learning_rate": 0.0001, + "loss": 6.0591, + "loss/crossentropy": 2.4665474891662598, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19909808039665222, + "step": 5192 + }, + { + "epoch": 0.1623125, + "grad_norm": 3.703125, + "grad_norm_var": 0.06539306640625, + "learning_rate": 0.0001, + "loss": 6.5161, + "loss/crossentropy": 2.7396236658096313, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.21124304085969925, + "step": 5194 + }, + { + "epoch": 0.162375, + "grad_norm": 3.78125, + "grad_norm_var": 0.06278889973958333, + "learning_rate": 0.0001, + "loss": 6.2922, + "loss/crossentropy": 2.588027238845825, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20713649690151215, + "step": 5196 + }, + { + "epoch": 0.1624375, + "grad_norm": 3.390625, + "grad_norm_var": 0.21730855305989583, + "learning_rate": 0.0001, + "loss": 6.4155, + "loss/crossentropy": 2.5007861852645874, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.22232603281736374, + "step": 5198 + }, + { + "epoch": 0.1625, + "grad_norm": 3.5, + "grad_norm_var": 0.18430989583333332, + "learning_rate": 0.0001, + "loss": 6.3138, + "loss/crossentropy": 2.7006181478500366, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19842666387557983, + "step": 5200 + }, + { + "epoch": 0.1625625, + "grad_norm": 3.578125, + "grad_norm_var": 0.37224019368489586, + "learning_rate": 0.0001, + "loss": 6.209, + "loss/crossentropy": 2.4328094720840454, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.20613687485456467, + "step": 5202 + }, + { + "epoch": 0.162625, + "grad_norm": 3.484375, + "grad_norm_var": 0.37862955729166664, + "learning_rate": 0.0001, + "loss": 6.245, + "loss/crossentropy": 2.5239570140838623, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.20570280402898788, + "step": 5204 + }, + { + "epoch": 0.1626875, + "grad_norm": 3.78125, + "grad_norm_var": 0.3766103108723958, + "learning_rate": 0.0001, + "loss": 6.2682, + "loss/crossentropy": 2.5990031957626343, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.20051166415214539, + "step": 5206 + }, + { + "epoch": 0.16275, + "grad_norm": 3.4375, + "grad_norm_var": 0.3717447916666667, + "learning_rate": 0.0001, + "loss": 5.9836, + "loss/crossentropy": 2.4379926919937134, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19362393021583557, + "step": 5208 + }, + { + "epoch": 0.1628125, + "grad_norm": 3.9375, + "grad_norm_var": 0.3742472330729167, + "learning_rate": 0.0001, + "loss": 6.0679, + "loss/crossentropy": 2.390724301338196, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.20013543963432312, + "step": 5210 + }, + { + "epoch": 0.162875, + "grad_norm": 3.671875, + "grad_norm_var": 0.3865559895833333, + "learning_rate": 0.0001, + "loss": 6.5876, + "loss/crossentropy": 2.7420979738235474, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.21463260054588318, + "step": 5212 + }, + { + "epoch": 0.1629375, + "grad_norm": 3.984375, + "grad_norm_var": 0.24397684733072916, + "learning_rate": 0.0001, + "loss": 6.2138, + "loss/crossentropy": 2.4656132459640503, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.20880325138568878, + "step": 5214 + }, + { + "epoch": 0.163, + "grad_norm": 3.484375, + "grad_norm_var": 0.25559895833333335, + "learning_rate": 0.0001, + "loss": 6.0856, + "loss/crossentropy": 2.4168217182159424, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.19968654215335846, + "step": 5216 + }, + { + "epoch": 0.1630625, + "grad_norm": 3.875, + "grad_norm_var": 0.08619791666666667, + "learning_rate": 0.0001, + "loss": 6.3584, + "loss/crossentropy": 2.638114094734192, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20875079184770584, + "step": 5218 + }, + { + "epoch": 0.163125, + "grad_norm": 3.703125, + "grad_norm_var": 0.08138020833333333, + "learning_rate": 0.0001, + "loss": 6.441, + "loss/crossentropy": 2.6618345975875854, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.21033572405576706, + "step": 5220 + }, + { + "epoch": 0.1631875, + "grad_norm": 3.921875, + "grad_norm_var": 0.08289388020833334, + "learning_rate": 0.0001, + "loss": 6.6087, + "loss/crossentropy": 2.7304168939590454, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.216347374022007, + "step": 5222 + }, + { + "epoch": 0.16325, + "grad_norm": 3.3125, + "grad_norm_var": 0.08680013020833334, + "learning_rate": 0.0001, + "loss": 5.8067, + "loss/crossentropy": 2.2770395278930664, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19398200511932373, + "step": 5224 + }, + { + "epoch": 0.1633125, + "grad_norm": 4.1875, + "grad_norm_var": 0.16366780598958333, + "learning_rate": 0.0001, + "loss": 6.5923, + "loss/crossentropy": 2.704433560371399, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.22121261805295944, + "step": 5226 + }, + { + "epoch": 0.163375, + "grad_norm": 3.984375, + "grad_norm_var": 0.16862691243489583, + "learning_rate": 0.0001, + "loss": 6.3544, + "loss/crossentropy": 2.5782910585403442, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.2112007588148117, + "step": 5228 + }, + { + "epoch": 0.1634375, + "grad_norm": 3.8125, + "grad_norm_var": 0.17488606770833334, + "learning_rate": 0.0001, + "loss": 6.3675, + "loss/crossentropy": 2.6073272228240967, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.21234365552663803, + "step": 5230 + }, + { + "epoch": 0.1635, + "grad_norm": 3.828125, + "grad_norm_var": 0.15170796712239584, + "learning_rate": 0.0001, + "loss": 6.3838, + "loss/crossentropy": 2.715991497039795, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20349497348070145, + "step": 5232 + }, + { + "epoch": 0.1635625, + "grad_norm": 4.1875, + "grad_norm_var": 0.15120442708333334, + "learning_rate": 0.0001, + "loss": 6.2745, + "loss/crossentropy": 2.5514438152313232, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.205508753657341, + "step": 5234 + }, + { + "epoch": 0.163625, + "grad_norm": 3.4375, + "grad_norm_var": 0.1604156494140625, + "learning_rate": 0.0001, + "loss": 6.3004, + "loss/crossentropy": 2.616993546485901, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.20662517845630646, + "step": 5236 + }, + { + "epoch": 0.1636875, + "grad_norm": 3.75, + "grad_norm_var": 0.159228515625, + "learning_rate": 0.0001, + "loss": 6.4741, + "loss/crossentropy": 2.6319591999053955, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.21546736359596252, + "step": 5238 + }, + { + "epoch": 0.16375, + "grad_norm": 3.59375, + "grad_norm_var": 0.146484375, + "learning_rate": 0.0001, + "loss": 6.5345, + "loss/crossentropy": 2.6985357999801636, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.21602191030979156, + "step": 5240 + }, + { + "epoch": 0.1638125, + "grad_norm": 4.21875, + "grad_norm_var": 0.07890523274739583, + "learning_rate": 0.0001, + "loss": 6.5487, + "loss/crossentropy": 2.69356632232666, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.2179398387670517, + "step": 5242 + }, + { + "epoch": 0.163875, + "grad_norm": 3.421875, + "grad_norm_var": 0.06830952962239584, + "learning_rate": 0.0001, + "loss": 6.4129, + "loss/crossentropy": 2.6797508001327515, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20768603682518005, + "step": 5244 + }, + { + "epoch": 0.1639375, + "grad_norm": 3.65625, + "grad_norm_var": 0.0629058837890625, + "learning_rate": 0.0001, + "loss": 6.3771, + "loss/crossentropy": 2.670736074447632, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.20345058292150497, + "step": 5246 + }, + { + "epoch": 0.164, + "grad_norm": 3.4375, + "grad_norm_var": 0.06568603515625, + "learning_rate": 0.0001, + "loss": 6.4829, + "loss/crossentropy": 2.721261501312256, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.21287819743156433, + "step": 5248 + }, + { + "epoch": 0.1640625, + "grad_norm": 3.671875, + "grad_norm_var": 0.04943033854166667, + "learning_rate": 0.0001, + "loss": 6.2365, + "loss/crossentropy": 2.5345875024795532, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.20651615411043167, + "step": 5250 + }, + { + "epoch": 0.164125, + "grad_norm": 3.421875, + "grad_norm_var": 0.0500152587890625, + "learning_rate": 0.0001, + "loss": 6.1669, + "loss/crossentropy": 2.5575015544891357, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.198830708861351, + "step": 5252 + }, + { + "epoch": 0.1641875, + "grad_norm": 3.6875, + "grad_norm_var": 0.050446573893229166, + "learning_rate": 0.0001, + "loss": 6.2607, + "loss/crossentropy": 2.505549192428589, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.2075466513633728, + "step": 5254 + }, + { + "epoch": 0.16425, + "grad_norm": 3.9375, + "grad_norm_var": 0.051667277018229166, + "learning_rate": 0.0001, + "loss": 6.5641, + "loss/crossentropy": 2.744532585144043, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.2194598913192749, + "step": 5256 + }, + { + "epoch": 0.1643125, + "grad_norm": 3.671875, + "grad_norm_var": 0.028511555989583333, + "learning_rate": 0.0001, + "loss": 6.049, + "loss/crossentropy": 2.42891788482666, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.20029054582118988, + "step": 5258 + }, + { + "epoch": 0.164375, + "grad_norm": 3.59375, + "grad_norm_var": 0.027750651041666668, + "learning_rate": 0.0001, + "loss": 6.0197, + "loss/crossentropy": 2.432560443878174, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19933748990297318, + "step": 5260 + }, + { + "epoch": 0.1644375, + "grad_norm": 3.96875, + "grad_norm_var": 0.03818359375, + "learning_rate": 0.0001, + "loss": 6.2531, + "loss/crossentropy": 2.4191304445266724, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.21308691799640656, + "step": 5262 + }, + { + "epoch": 0.1645, + "grad_norm": 3.75, + "grad_norm_var": 0.038834635416666666, + "learning_rate": 0.0001, + "loss": 6.062, + "loss/crossentropy": 2.460596442222595, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.19295437633991241, + "step": 5264 + }, + { + "epoch": 0.1645625, + "grad_norm": 3.4375, + "grad_norm_var": 0.04149983723958333, + "learning_rate": 0.0001, + "loss": 6.1671, + "loss/crossentropy": 2.5348631143569946, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.20229025930166245, + "step": 5266 + }, + { + "epoch": 0.164625, + "grad_norm": 4.03125, + "grad_norm_var": 0.04888916015625, + "learning_rate": 0.0001, + "loss": 5.7182, + "loss/crossentropy": 2.207819700241089, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.18619763106107712, + "step": 5268 + }, + { + "epoch": 0.1646875, + "grad_norm": 3.703125, + "grad_norm_var": 0.04625651041666667, + "learning_rate": 0.0001, + "loss": 6.2923, + "loss/crossentropy": 2.5302654504776, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.21252676844596863, + "step": 5270 + }, + { + "epoch": 0.16475, + "grad_norm": 3.453125, + "grad_norm_var": 0.0417144775390625, + "learning_rate": 0.0001, + "loss": 5.8896, + "loss/crossentropy": 2.3639479875564575, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.1896733045578003, + "step": 5272 + }, + { + "epoch": 0.1648125, + "grad_norm": 3.78125, + "grad_norm_var": 0.03972066243489583, + "learning_rate": 0.0001, + "loss": 6.4644, + "loss/crossentropy": 2.6995153427124023, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.2116451859474182, + "step": 5274 + }, + { + "epoch": 0.164875, + "grad_norm": 5.1875, + "grad_norm_var": 0.18012593587239584, + "learning_rate": 0.0001, + "loss": 6.0782, + "loss/crossentropy": 2.4377158880233765, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.20232883840799332, + "step": 5276 + }, + { + "epoch": 0.1649375, + "grad_norm": 3.953125, + "grad_norm_var": 0.18073628743489584, + "learning_rate": 0.0001, + "loss": 6.1884, + "loss/crossentropy": 2.6104766130447388, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19763468205928802, + "step": 5278 + }, + { + "epoch": 0.165, + "grad_norm": 3.765625, + "grad_norm_var": 0.16936442057291667, + "learning_rate": 0.0001, + "loss": 6.457, + "loss/crossentropy": 2.747946619987488, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.20879191160202026, + "step": 5280 + }, + { + "epoch": 0.1650625, + "grad_norm": 3.765625, + "grad_norm_var": 0.16105855305989583, + "learning_rate": 0.0001, + "loss": 6.2794, + "loss/crossentropy": 2.5800029039382935, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20665637403726578, + "step": 5282 + }, + { + "epoch": 0.165125, + "grad_norm": 3.40625, + "grad_norm_var": 0.1702056884765625, + "learning_rate": 0.0001, + "loss": 6.0868, + "loss/crossentropy": 2.5235323905944824, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.19108830392360687, + "step": 5284 + }, + { + "epoch": 0.1651875, + "grad_norm": 3.75, + "grad_norm_var": 0.17115478515625, + "learning_rate": 0.0001, + "loss": 6.2818, + "loss/crossentropy": 2.5826101303100586, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.20624583214521408, + "step": 5286 + }, + { + "epoch": 0.16525, + "grad_norm": 3.484375, + "grad_norm_var": 0.17464192708333334, + "learning_rate": 0.0001, + "loss": 6.1176, + "loss/crossentropy": 2.4823321104049683, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20063431560993195, + "step": 5288 + }, + { + "epoch": 0.1653125, + "grad_norm": 3.515625, + "grad_norm_var": 0.18733622233072916, + "learning_rate": 0.0001, + "loss": 6.4095, + "loss/crossentropy": 2.626147985458374, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.21544121205806732, + "step": 5290 + }, + { + "epoch": 0.165375, + "grad_norm": 3.046875, + "grad_norm_var": 0.06989644368489584, + "learning_rate": 0.0001, + "loss": 6.0643, + "loss/crossentropy": 2.466831922531128, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.19450916349887848, + "step": 5292 + }, + { + "epoch": 0.1654375, + "grad_norm": 4.03125, + "grad_norm_var": 0.074072265625, + "learning_rate": 0.0001, + "loss": 6.5792, + "loss/crossentropy": 2.6792465448379517, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.22397875040769577, + "step": 5294 + }, + { + "epoch": 0.1655, + "grad_norm": 3.78125, + "grad_norm_var": 0.07503153483072916, + "learning_rate": 0.0001, + "loss": 6.2, + "loss/crossentropy": 2.5096585750579834, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.20536701381206512, + "step": 5296 + }, + { + "epoch": 0.1655625, + "grad_norm": 4.0, + "grad_norm_var": 0.08166910807291666, + "learning_rate": 0.0001, + "loss": 6.4394, + "loss/crossentropy": 2.628980278968811, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.2122943252325058, + "step": 5298 + }, + { + "epoch": 0.165625, + "grad_norm": 3.28125, + "grad_norm_var": 0.08350321451822916, + "learning_rate": 0.0001, + "loss": 5.8707, + "loss/crossentropy": 2.3820395469665527, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.18636663258075714, + "step": 5300 + }, + { + "epoch": 0.1656875, + "grad_norm": 3.4375, + "grad_norm_var": 0.08725484212239583, + "learning_rate": 0.0001, + "loss": 6.0521, + "loss/crossentropy": 2.4212061166763306, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.19551438838243484, + "step": 5302 + }, + { + "epoch": 0.16575, + "grad_norm": 3.765625, + "grad_norm_var": 0.09051106770833334, + "learning_rate": 0.0001, + "loss": 6.2196, + "loss/crossentropy": 2.6344507932662964, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19992156326770782, + "step": 5304 + }, + { + "epoch": 0.1658125, + "grad_norm": 4.03125, + "grad_norm_var": 0.08406575520833333, + "learning_rate": 0.0001, + "loss": 6.1123, + "loss/crossentropy": 2.4072346687316895, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.2044946402311325, + "step": 5306 + }, + { + "epoch": 0.165875, + "grad_norm": 3.609375, + "grad_norm_var": 0.06939697265625, + "learning_rate": 0.0001, + "loss": 5.9202, + "loss/crossentropy": 2.3986281156539917, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.18965810537338257, + "step": 5308 + }, + { + "epoch": 0.1659375, + "grad_norm": 3.5625, + "grad_norm_var": 0.05939839680989583, + "learning_rate": 0.0001, + "loss": 6.329, + "loss/crossentropy": 2.6033272743225098, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.21045982837677002, + "step": 5310 + }, + { + "epoch": 0.166, + "grad_norm": 3.75, + "grad_norm_var": 0.05845947265625, + "learning_rate": 0.0001, + "loss": 6.139, + "loss/crossentropy": 2.564116358757019, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1988985240459442, + "step": 5312 + }, + { + "epoch": 0.1660625, + "grad_norm": 4.03125, + "grad_norm_var": 0.06773173014322917, + "learning_rate": 0.0001, + "loss": 6.5924, + "loss/crossentropy": 2.6761828660964966, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.2252115160226822, + "step": 5314 + }, + { + "epoch": 0.166125, + "grad_norm": 3.59375, + "grad_norm_var": 0.06974283854166667, + "learning_rate": 0.0001, + "loss": 6.094, + "loss/crossentropy": 2.426016926765442, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.2011687308549881, + "step": 5316 + }, + { + "epoch": 0.1661875, + "grad_norm": 3.9375, + "grad_norm_var": 0.06330464680989584, + "learning_rate": 0.0001, + "loss": 6.3091, + "loss/crossentropy": 2.5874863862991333, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.204196497797966, + "step": 5318 + }, + { + "epoch": 0.16625, + "grad_norm": 5.84375, + "grad_norm_var": 0.32942606608072916, + "learning_rate": 0.0001, + "loss": 6.6528, + "loss/crossentropy": 2.6687638759613037, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.2292676791548729, + "step": 5320 + }, + { + "epoch": 0.1663125, + "grad_norm": 3.71875, + "grad_norm_var": 0.35510660807291666, + "learning_rate": 0.0001, + "loss": 6.4089, + "loss/crossentropy": 2.679196834564209, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20734111219644547, + "step": 5322 + }, + { + "epoch": 0.166375, + "grad_norm": 3.546875, + "grad_norm_var": 0.33088785807291665, + "learning_rate": 0.0001, + "loss": 6.3873, + "loss/crossentropy": 2.6827419996261597, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.20405340194702148, + "step": 5324 + }, + { + "epoch": 0.1664375, + "grad_norm": 3.609375, + "grad_norm_var": 0.35852457682291666, + "learning_rate": 0.0001, + "loss": 6.5107, + "loss/crossentropy": 2.693418025970459, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.21258943527936935, + "step": 5326 + }, + { + "epoch": 0.1665, + "grad_norm": 3.59375, + "grad_norm_var": 0.394287109375, + "learning_rate": 0.0001, + "loss": 6.0726, + "loss/crossentropy": 2.532777190208435, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.1934373378753662, + "step": 5328 + }, + { + "epoch": 0.1665625, + "grad_norm": 3.734375, + "grad_norm_var": 0.40081380208333334, + "learning_rate": 0.0001, + "loss": 6.4014, + "loss/crossentropy": 2.661211609840393, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20996029675006866, + "step": 5330 + }, + { + "epoch": 0.166625, + "grad_norm": 4.25, + "grad_norm_var": 0.4056630452473958, + "learning_rate": 0.0001, + "loss": 6.4706, + "loss/crossentropy": 2.697288155555725, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.2132657915353775, + "step": 5332 + }, + { + "epoch": 0.1666875, + "grad_norm": 4.0, + "grad_norm_var": 0.415185546875, + "learning_rate": 0.0001, + "loss": 6.4788, + "loss/crossentropy": 2.6174012422561646, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.21699899435043335, + "step": 5334 + }, + { + "epoch": 0.16675, + "grad_norm": 4.09375, + "grad_norm_var": 0.1821197509765625, + "learning_rate": 0.0001, + "loss": 6.0464, + "loss/crossentropy": 2.4147592782974243, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.2037886083126068, + "step": 5336 + }, + { + "epoch": 0.1668125, + "grad_norm": 3.546875, + "grad_norm_var": 0.15432535807291667, + "learning_rate": 0.0001, + "loss": 6.4814, + "loss/crossentropy": 2.7996264696121216, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20528528094291687, + "step": 5338 + }, + { + "epoch": 0.166875, + "grad_norm": 5.625, + "grad_norm_var": 0.34579671223958336, + "learning_rate": 0.0001, + "loss": 6.5194, + "loss/crossentropy": 2.6324565410614014, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.220335453748703, + "step": 5340 + }, + { + "epoch": 0.1669375, + "grad_norm": 3.703125, + "grad_norm_var": 0.30339253743489586, + "learning_rate": 0.0001, + "loss": 6.2926, + "loss/crossentropy": 2.5119271278381348, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.20970982313156128, + "step": 5342 + }, + { + "epoch": 0.167, + "grad_norm": 4.03125, + "grad_norm_var": 0.2597564697265625, + "learning_rate": 0.0001, + "loss": 6.6131, + "loss/crossentropy": 2.618421196937561, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.23266763985157013, + "step": 5344 + }, + { + "epoch": 0.1670625, + "grad_norm": 4.0625, + "grad_norm_var": 0.2544097900390625, + "learning_rate": 0.0001, + "loss": 6.7084, + "loss/crossentropy": 2.812577247619629, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.22004784643650055, + "step": 5346 + }, + { + "epoch": 0.167125, + "grad_norm": 4.0625, + "grad_norm_var": 0.2555491129557292, + "learning_rate": 0.0001, + "loss": 6.5053, + "loss/crossentropy": 2.6363308429718018, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.2201043888926506, + "step": 5348 + }, + { + "epoch": 0.1671875, + "grad_norm": 4.03125, + "grad_norm_var": 0.24338785807291666, + "learning_rate": 0.0001, + "loss": 6.0258, + "loss/crossentropy": 2.3339359760284424, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.20707960426807404, + "step": 5350 + }, + { + "epoch": 0.16725, + "grad_norm": 3.671875, + "grad_norm_var": 0.24612223307291667, + "learning_rate": 0.0001, + "loss": 6.2252, + "loss/crossentropy": 2.5705204010009766, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.20062048733234406, + "step": 5352 + }, + { + "epoch": 0.1673125, + "grad_norm": 3.484375, + "grad_norm_var": 0.2734039306640625, + "learning_rate": 0.0001, + "loss": 6.248, + "loss/crossentropy": 2.613363027572632, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20096845924854279, + "step": 5354 + }, + { + "epoch": 0.167375, + "grad_norm": 3.375, + "grad_norm_var": 0.07454020182291667, + "learning_rate": 0.0001, + "loss": 6.1435, + "loss/crossentropy": 2.6340157985687256, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19157454371452332, + "step": 5356 + }, + { + "epoch": 0.1674375, + "grad_norm": 3.75, + "grad_norm_var": 0.12720947265625, + "learning_rate": 0.0001, + "loss": 6.2857, + "loss/crossentropy": 2.5772162675857544, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.20639867335557938, + "step": 5358 + }, + { + "epoch": 0.1675, + "grad_norm": 3.328125, + "grad_norm_var": 0.12574462890625, + "learning_rate": 0.0001, + "loss": 6.0702, + "loss/crossentropy": 2.4975491762161255, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19788971543312073, + "step": 5360 + }, + { + "epoch": 0.1675625, + "grad_norm": 4.5625, + "grad_norm_var": 0.1691802978515625, + "learning_rate": 0.0001, + "loss": 6.5493, + "loss/crossentropy": 2.58541202545166, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.2241220772266388, + "step": 5362 + }, + { + "epoch": 0.167625, + "grad_norm": 3.875, + "grad_norm_var": 0.16763916015625, + "learning_rate": 0.0001, + "loss": 6.3549, + "loss/crossentropy": 2.555784583091736, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.20998644083738327, + "step": 5364 + }, + { + "epoch": 0.1676875, + "grad_norm": 3.28125, + "grad_norm_var": 0.1792144775390625, + "learning_rate": 0.0001, + "loss": 6.1039, + "loss/crossentropy": 2.540518641471863, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.19383682310581207, + "step": 5366 + }, + { + "epoch": 0.16775, + "grad_norm": 3.40625, + "grad_norm_var": 0.18572489420572916, + "learning_rate": 0.0001, + "loss": 6.1902, + "loss/crossentropy": 2.560936450958252, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.2035495787858963, + "step": 5368 + }, + { + "epoch": 0.1678125, + "grad_norm": 3.734375, + "grad_norm_var": 0.17398173014322918, + "learning_rate": 0.0001, + "loss": 6.389, + "loss/crossentropy": 2.6606903076171875, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20720500499010086, + "step": 5370 + }, + { + "epoch": 0.167875, + "grad_norm": 3.59375, + "grad_norm_var": 0.16502278645833332, + "learning_rate": 0.0001, + "loss": 6.5097, + "loss/crossentropy": 2.69782817363739, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.21634666621685028, + "step": 5372 + }, + { + "epoch": 0.1679375, + "grad_norm": 3.640625, + "grad_norm_var": 0.10647684733072917, + "learning_rate": 0.0001, + "loss": 6.4707, + "loss/crossentropy": 2.744077205657959, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20859896391630173, + "step": 5374 + }, + { + "epoch": 0.168, + "grad_norm": 3.65625, + "grad_norm_var": 0.17427469889322916, + "learning_rate": 0.0001, + "loss": 6.645, + "loss/crossentropy": 2.8004655838012695, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.21219252049922943, + "step": 5376 + }, + { + "epoch": 0.1680625, + "grad_norm": 4.0, + "grad_norm_var": 0.13902079264322917, + "learning_rate": 0.0001, + "loss": 6.3748, + "loss/crossentropy": 2.616547465324402, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.20785995572805405, + "step": 5378 + }, + { + "epoch": 0.168125, + "grad_norm": 3.65625, + "grad_norm_var": 0.12819010416666668, + "learning_rate": 0.0001, + "loss": 6.1065, + "loss/crossentropy": 2.487843632698059, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.19858013838529587, + "step": 5380 + }, + { + "epoch": 0.1681875, + "grad_norm": 3.5, + "grad_norm_var": 0.13079020182291667, + "learning_rate": 0.0001, + "loss": 6.2677, + "loss/crossentropy": 2.6592295169830322, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.20108597725629807, + "step": 5382 + }, + { + "epoch": 0.16825, + "grad_norm": 3.65625, + "grad_norm_var": 0.1249664306640625, + "learning_rate": 0.0001, + "loss": 6.2955, + "loss/crossentropy": 2.5784114599227905, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20608504116535187, + "step": 5384 + }, + { + "epoch": 0.1683125, + "grad_norm": 3.46875, + "grad_norm_var": 0.13178609212239584, + "learning_rate": 0.0001, + "loss": 6.3715, + "loss/crossentropy": 2.714709997177124, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.20435550063848495, + "step": 5386 + }, + { + "epoch": 0.168375, + "grad_norm": 3.453125, + "grad_norm_var": 0.14839579264322916, + "learning_rate": 0.0001, + "loss": 6.3324, + "loss/crossentropy": 2.7200835943222046, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.20146213471889496, + "step": 5388 + }, + { + "epoch": 0.1684375, + "grad_norm": 3.59375, + "grad_norm_var": 0.14973958333333334, + "learning_rate": 0.0001, + "loss": 6.4809, + "loss/crossentropy": 2.7606624364852905, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.2099115252494812, + "step": 5390 + }, + { + "epoch": 0.1685, + "grad_norm": 3.640625, + "grad_norm_var": 0.03892822265625, + "learning_rate": 0.0001, + "loss": 6.3505, + "loss/crossentropy": 2.65522837638855, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.203899547457695, + "step": 5392 + }, + { + "epoch": 0.1685625, + "grad_norm": 3.765625, + "grad_norm_var": 0.0453033447265625, + "learning_rate": 0.0001, + "loss": 6.5379, + "loss/crossentropy": 2.6739238500595093, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.2195974886417389, + "step": 5394 + }, + { + "epoch": 0.168625, + "grad_norm": 3.4375, + "grad_norm_var": 0.04299723307291667, + "learning_rate": 0.0001, + "loss": 6.0921, + "loss/crossentropy": 2.4353628158569336, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.2047354057431221, + "step": 5396 + }, + { + "epoch": 0.1686875, + "grad_norm": 3.828125, + "grad_norm_var": 0.05764058430989583, + "learning_rate": 0.0001, + "loss": 6.3689, + "loss/crossentropy": 2.521478533744812, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.2144312784075737, + "step": 5398 + }, + { + "epoch": 0.16875, + "grad_norm": 3.625, + "grad_norm_var": 0.05908915201822917, + "learning_rate": 0.0001, + "loss": 6.0522, + "loss/crossentropy": 2.4441404342651367, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.19596174359321594, + "step": 5400 + }, + { + "epoch": 0.1688125, + "grad_norm": 3.5625, + "grad_norm_var": 0.05773111979166667, + "learning_rate": 0.0001, + "loss": 6.3178, + "loss/crossentropy": 2.64383864402771, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.20528355240821838, + "step": 5402 + }, + { + "epoch": 0.168875, + "grad_norm": 4.1875, + "grad_norm_var": 0.06798502604166666, + "learning_rate": 0.0001, + "loss": 6.4363, + "loss/crossentropy": 2.61694872379303, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.21474573761224747, + "step": 5404 + }, + { + "epoch": 0.1689375, + "grad_norm": 3.625, + "grad_norm_var": 0.06555582682291666, + "learning_rate": 0.0001, + "loss": 6.3585, + "loss/crossentropy": 2.6157970428466797, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.2141156867146492, + "step": 5406 + }, + { + "epoch": 0.169, + "grad_norm": 3.734375, + "grad_norm_var": 0.0596343994140625, + "learning_rate": 0.0001, + "loss": 6.7139, + "loss/crossentropy": 2.8296098709106445, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.22124291211366653, + "step": 5408 + }, + { + "epoch": 0.1690625, + "grad_norm": 3.78125, + "grad_norm_var": 0.0520660400390625, + "learning_rate": 0.0001, + "loss": 6.5764, + "loss/crossentropy": 2.770159125328064, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.21304430067539215, + "step": 5410 + }, + { + "epoch": 0.169125, + "grad_norm": 3.796875, + "grad_norm_var": 0.05730794270833333, + "learning_rate": 0.0001, + "loss": 6.1421, + "loss/crossentropy": 2.5159448385238647, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.19933024793863297, + "step": 5412 + }, + { + "epoch": 0.1691875, + "grad_norm": 3.5625, + "grad_norm_var": 0.0895904541015625, + "learning_rate": 0.0001, + "loss": 6.8257, + "loss/crossentropy": 2.857306122779846, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.22925830632448196, + "step": 5414 + }, + { + "epoch": 0.16925, + "grad_norm": 4.4375, + "grad_norm_var": 0.1106842041015625, + "learning_rate": 0.0001, + "loss": 6.3079, + "loss/crossentropy": 2.5683765411376953, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.20754961669445038, + "step": 5416 + }, + { + "epoch": 0.1693125, + "grad_norm": 3.984375, + "grad_norm_var": 0.09905192057291666, + "learning_rate": 0.0001, + "loss": 6.2138, + "loss/crossentropy": 2.4839487075805664, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20735560357570648, + "step": 5418 + }, + { + "epoch": 0.169375, + "grad_norm": 3.4375, + "grad_norm_var": 0.10004781087239584, + "learning_rate": 0.0001, + "loss": 6.2687, + "loss/crossentropy": 2.6493914127349854, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.1986526921391487, + "step": 5420 + }, + { + "epoch": 0.1694375, + "grad_norm": 4.03125, + "grad_norm_var": 0.10148111979166667, + "learning_rate": 0.0001, + "loss": 6.5061, + "loss/crossentropy": 2.7358455657958984, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.21296165138483047, + "step": 5422 + }, + { + "epoch": 0.1695, + "grad_norm": 3.484375, + "grad_norm_var": 0.10832926432291666, + "learning_rate": 0.0001, + "loss": 6.2646, + "loss/crossentropy": 2.6034480333328247, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.20518138259649277, + "step": 5424 + }, + { + "epoch": 0.1695625, + "grad_norm": 3.46875, + "grad_norm_var": 0.11578369140625, + "learning_rate": 0.0001, + "loss": 6.1431, + "loss/crossentropy": 2.5414342880249023, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19923234730958939, + "step": 5426 + }, + { + "epoch": 0.169625, + "grad_norm": 3.890625, + "grad_norm_var": 0.10565999348958334, + "learning_rate": 0.0001, + "loss": 5.8812, + "loss/crossentropy": 2.3314428329467773, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.19013582915067673, + "step": 5428 + }, + { + "epoch": 0.1696875, + "grad_norm": 3.8125, + "grad_norm_var": 0.07024637858072917, + "learning_rate": 0.0001, + "loss": 6.293, + "loss/crossentropy": 2.6051303148269653, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.2031620442867279, + "step": 5430 + }, + { + "epoch": 0.16975, + "grad_norm": 3.75, + "grad_norm_var": 0.038655598958333336, + "learning_rate": 0.0001, + "loss": 6.2182, + "loss/crossentropy": 2.517228841781616, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.20330332219600677, + "step": 5432 + }, + { + "epoch": 0.1698125, + "grad_norm": 3.9375, + "grad_norm_var": 0.0454254150390625, + "learning_rate": 0.0001, + "loss": 6.0247, + "loss/crossentropy": 2.445191740989685, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.20052997767925262, + "step": 5434 + }, + { + "epoch": 0.169875, + "grad_norm": 4.09375, + "grad_norm_var": 0.05201416015625, + "learning_rate": 0.0001, + "loss": 6.6324, + "loss/crossentropy": 2.6764878034591675, + "loss/hidden": 1.7265625, + "loss/jsd": 0.0, + "loss/logits": 0.22293312847614288, + "step": 5436 + }, + { + "epoch": 0.1699375, + "grad_norm": 3.515625, + "grad_norm_var": 0.05042215983072917, + "learning_rate": 0.0001, + "loss": 6.4418, + "loss/crossentropy": 2.747052788734436, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20541198551654816, + "step": 5438 + }, + { + "epoch": 0.17, + "grad_norm": 3.609375, + "grad_norm_var": 0.0805084228515625, + "learning_rate": 0.0001, + "loss": 5.9118, + "loss/crossentropy": 2.369445323944092, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19603674113750458, + "step": 5440 + }, + { + "epoch": 0.1700625, + "grad_norm": 3.296875, + "grad_norm_var": 0.08769124348958333, + "learning_rate": 0.0001, + "loss": 5.7587, + "loss/crossentropy": 2.2850255966186523, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19267499446868896, + "step": 5442 + }, + { + "epoch": 0.170125, + "grad_norm": 3.625, + "grad_norm_var": 0.08445536295572917, + "learning_rate": 0.0001, + "loss": 6.3702, + "loss/crossentropy": 2.642918348312378, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20944388210773468, + "step": 5444 + }, + { + "epoch": 0.1701875, + "grad_norm": 3.265625, + "grad_norm_var": 0.0998931884765625, + "learning_rate": 0.0001, + "loss": 6.0806, + "loss/crossentropy": 2.5002808570861816, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.19435935467481613, + "step": 5446 + }, + { + "epoch": 0.17025, + "grad_norm": 3.609375, + "grad_norm_var": 0.10025634765625, + "learning_rate": 0.0001, + "loss": 6.2508, + "loss/crossentropy": 2.54501211643219, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20495309680700302, + "step": 5448 + }, + { + "epoch": 0.1703125, + "grad_norm": 3.703125, + "grad_norm_var": 0.0860015869140625, + "learning_rate": 0.0001, + "loss": 6.3207, + "loss/crossentropy": 2.633958101272583, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20305413007736206, + "step": 5450 + }, + { + "epoch": 0.170375, + "grad_norm": 4.28125, + "grad_norm_var": 0.10237223307291667, + "learning_rate": 0.0001, + "loss": 6.4857, + "loss/crossentropy": 2.7109899520874023, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.21419066935777664, + "step": 5452 + }, + { + "epoch": 0.1704375, + "grad_norm": 3.40625, + "grad_norm_var": 0.10229390462239583, + "learning_rate": 0.0001, + "loss": 5.651, + "loss/crossentropy": 2.245222568511963, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.17690138518810272, + "step": 5454 + }, + { + "epoch": 0.1705, + "grad_norm": 3.46875, + "grad_norm_var": 0.05877278645833333, + "learning_rate": 0.0001, + "loss": 6.2159, + "loss/crossentropy": 2.6513789892196655, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19668704271316528, + "step": 5456 + }, + { + "epoch": 0.1705625, + "grad_norm": 3.578125, + "grad_norm_var": 0.059992472330729164, + "learning_rate": 0.0001, + "loss": 6.543, + "loss/crossentropy": 2.70147442817688, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.21969977766275406, + "step": 5458 + }, + { + "epoch": 0.170625, + "grad_norm": 3.96875, + "grad_norm_var": 0.06984049479166667, + "learning_rate": 0.0001, + "loss": 6.4508, + "loss/crossentropy": 2.6886109113693237, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.21020089089870453, + "step": 5460 + }, + { + "epoch": 0.1706875, + "grad_norm": 3.46875, + "grad_norm_var": 0.061644490559895834, + "learning_rate": 0.0001, + "loss": 5.9927, + "loss/crossentropy": 2.4410237073898315, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19228195399045944, + "step": 5462 + }, + { + "epoch": 0.17075, + "grad_norm": 3.5625, + "grad_norm_var": 0.0674957275390625, + "learning_rate": 0.0001, + "loss": 6.5238, + "loss/crossentropy": 2.766111969947815, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.2109237089753151, + "step": 5464 + }, + { + "epoch": 0.1708125, + "grad_norm": 3.5, + "grad_norm_var": 0.06883138020833333, + "learning_rate": 0.0001, + "loss": 6.3399, + "loss/crossentropy": 2.627722144126892, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20832887291908264, + "step": 5466 + }, + { + "epoch": 0.170875, + "grad_norm": 3.5625, + "grad_norm_var": 0.03599853515625, + "learning_rate": 0.0001, + "loss": 6.2697, + "loss/crossentropy": 2.5772327184677124, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20675109326839447, + "step": 5468 + }, + { + "epoch": 0.1709375, + "grad_norm": 3.375, + "grad_norm_var": 0.03992513020833333, + "learning_rate": 0.0001, + "loss": 6.4087, + "loss/crossentropy": 2.600441336631775, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.21754159033298492, + "step": 5470 + }, + { + "epoch": 0.171, + "grad_norm": 3.5, + "grad_norm_var": 0.03908589680989583, + "learning_rate": 0.0001, + "loss": 6.1616, + "loss/crossentropy": 2.4985276460647583, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20302552729845047, + "step": 5472 + }, + { + "epoch": 0.1710625, + "grad_norm": 3.859375, + "grad_norm_var": 0.036181640625, + "learning_rate": 0.0001, + "loss": 6.4449, + "loss/crossentropy": 2.6403703689575195, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.21521900594234467, + "step": 5474 + }, + { + "epoch": 0.171125, + "grad_norm": 3.359375, + "grad_norm_var": 0.0319732666015625, + "learning_rate": 0.0001, + "loss": 6.4397, + "loss/crossentropy": 2.7072731256484985, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.21074757725000381, + "step": 5476 + }, + { + "epoch": 0.1711875, + "grad_norm": 4.09375, + "grad_norm_var": 0.0464752197265625, + "learning_rate": 0.0001, + "loss": 6.5197, + "loss/crossentropy": 2.655811905860901, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.21529193222522736, + "step": 5478 + }, + { + "epoch": 0.17125, + "grad_norm": 3.4375, + "grad_norm_var": 0.0478515625, + "learning_rate": 0.0001, + "loss": 5.8979, + "loss/crossentropy": 2.374879837036133, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.18628836423158646, + "step": 5480 + }, + { + "epoch": 0.1713125, + "grad_norm": 3.96875, + "grad_norm_var": 0.058226521809895834, + "learning_rate": 0.0001, + "loss": 6.4394, + "loss/crossentropy": 2.685877799987793, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.21206961572170258, + "step": 5482 + }, + { + "epoch": 0.171375, + "grad_norm": 3.6875, + "grad_norm_var": 0.05786844889322917, + "learning_rate": 0.0001, + "loss": 6.1926, + "loss/crossentropy": 2.487221598625183, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.20686902850866318, + "step": 5484 + }, + { + "epoch": 0.1714375, + "grad_norm": 4.15625, + "grad_norm_var": 0.06420796712239583, + "learning_rate": 0.0001, + "loss": 6.6209, + "loss/crossentropy": 2.7182233333587646, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.2215162143111229, + "step": 5486 + }, + { + "epoch": 0.1715, + "grad_norm": 3.890625, + "grad_norm_var": 0.06687723795572917, + "learning_rate": 0.0001, + "loss": 6.2613, + "loss/crossentropy": 2.6063839197158813, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.19869530200958252, + "step": 5488 + }, + { + "epoch": 0.1715625, + "grad_norm": 3.875, + "grad_norm_var": 0.0705230712890625, + "learning_rate": 0.0001, + "loss": 6.401, + "loss/crossentropy": 2.596443295478821, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.21131177991628647, + "step": 5490 + }, + { + "epoch": 0.171625, + "grad_norm": 3.75, + "grad_norm_var": 0.06787821451822916, + "learning_rate": 0.0001, + "loss": 6.3697, + "loss/crossentropy": 2.6126943826675415, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.21008026599884033, + "step": 5492 + }, + { + "epoch": 0.1716875, + "grad_norm": 3.734375, + "grad_norm_var": 0.061701456705729164, + "learning_rate": 0.0001, + "loss": 6.1955, + "loss/crossentropy": 2.5357481241226196, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20269111543893814, + "step": 5494 + }, + { + "epoch": 0.17175, + "grad_norm": 3.890625, + "grad_norm_var": 0.048173014322916666, + "learning_rate": 0.0001, + "loss": 6.3698, + "loss/crossentropy": 2.6355448961257935, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.2042831853032112, + "step": 5496 + }, + { + "epoch": 0.1718125, + "grad_norm": 3.40625, + "grad_norm_var": 0.04729410807291667, + "learning_rate": 0.0001, + "loss": 6.0, + "loss/crossentropy": 2.480632781982422, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1952923834323883, + "step": 5498 + }, + { + "epoch": 0.171875, + "grad_norm": 3.578125, + "grad_norm_var": 0.057373046875, + "learning_rate": 0.0001, + "loss": 6.3511, + "loss/crossentropy": 2.5731500387191772, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.20787642896175385, + "step": 5500 + }, + { + "epoch": 0.1719375, + "grad_norm": 3.296875, + "grad_norm_var": 0.0629791259765625, + "learning_rate": 0.0001, + "loss": 5.9312, + "loss/crossentropy": 2.383143901824951, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.1895749568939209, + "step": 5502 + }, + { + "epoch": 0.172, + "grad_norm": 3.65625, + "grad_norm_var": 0.05706380208333333, + "learning_rate": 0.0001, + "loss": 6.5475, + "loss/crossentropy": 2.9158960580825806, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.20261601358652115, + "step": 5504 + }, + { + "epoch": 0.1720625, + "grad_norm": 4.03125, + "grad_norm_var": 0.05901692708333333, + "learning_rate": 0.0001, + "loss": 6.1549, + "loss/crossentropy": 2.452110528945923, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.19996604323387146, + "step": 5506 + }, + { + "epoch": 0.172125, + "grad_norm": 3.734375, + "grad_norm_var": 0.04944559733072917, + "learning_rate": 0.0001, + "loss": 6.2734, + "loss/crossentropy": 2.584115147590637, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.20526093244552612, + "step": 5508 + }, + { + "epoch": 0.1721875, + "grad_norm": 3.796875, + "grad_norm_var": 0.04934488932291667, + "learning_rate": 0.0001, + "loss": 6.5578, + "loss/crossentropy": 2.7004928588867188, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.2134695053100586, + "step": 5510 + }, + { + "epoch": 0.17225, + "grad_norm": 3.734375, + "grad_norm_var": 0.04513346354166667, + "learning_rate": 0.0001, + "loss": 5.8912, + "loss/crossentropy": 2.4137511253356934, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.18602566421031952, + "step": 5512 + }, + { + "epoch": 0.1723125, + "grad_norm": 3.9375, + "grad_norm_var": 0.04523824055989583, + "learning_rate": 0.0001, + "loss": 5.9679, + "loss/crossentropy": 2.368557929992676, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19899508357048035, + "step": 5514 + }, + { + "epoch": 0.172375, + "grad_norm": 3.703125, + "grad_norm_var": 0.03235677083333333, + "learning_rate": 0.0001, + "loss": 6.2388, + "loss/crossentropy": 2.5534307956695557, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.20759540796279907, + "step": 5516 + }, + { + "epoch": 0.1724375, + "grad_norm": 3.640625, + "grad_norm_var": 0.021776326497395835, + "learning_rate": 0.0001, + "loss": 6.3551, + "loss/crossentropy": 2.6560505628585815, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.2089691162109375, + "step": 5518 + }, + { + "epoch": 0.1725, + "grad_norm": 4.03125, + "grad_norm_var": 0.031183878580729168, + "learning_rate": 0.0001, + "loss": 6.4113, + "loss/crossentropy": 2.628575325012207, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.20991557836532593, + "step": 5520 + }, + { + "epoch": 0.1725625, + "grad_norm": 3.71875, + "grad_norm_var": 0.025716145833333332, + "learning_rate": 0.0001, + "loss": 6.3448, + "loss/crossentropy": 2.7013977766036987, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20144866406917572, + "step": 5522 + }, + { + "epoch": 0.172625, + "grad_norm": 3.75, + "grad_norm_var": 0.027242024739583332, + "learning_rate": 0.0001, + "loss": 6.1257, + "loss/crossentropy": 2.425819158554077, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20592305809259415, + "step": 5524 + }, + { + "epoch": 0.1726875, + "grad_norm": 3.71875, + "grad_norm_var": 0.0305572509765625, + "learning_rate": 0.0001, + "loss": 6.1834, + "loss/crossentropy": 2.5644673109054565, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.20251353085041046, + "step": 5526 + }, + { + "epoch": 0.17275, + "grad_norm": 3.34375, + "grad_norm_var": 0.0370513916015625, + "learning_rate": 0.0001, + "loss": 6.5402, + "loss/crossentropy": 2.778851270675659, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.21363477408885956, + "step": 5528 + }, + { + "epoch": 0.1728125, + "grad_norm": 3.84375, + "grad_norm_var": 0.07545166015625, + "learning_rate": 0.0001, + "loss": 6.4929, + "loss/crossentropy": 2.754807710647583, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.20779237151145935, + "step": 5530 + }, + { + "epoch": 0.172875, + "grad_norm": 3.609375, + "grad_norm_var": 0.07896728515625, + "learning_rate": 0.0001, + "loss": 6.3329, + "loss/crossentropy": 2.6094292402267456, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.2078896090388298, + "step": 5532 + }, + { + "epoch": 0.1729375, + "grad_norm": 3.734375, + "grad_norm_var": 0.08041890462239583, + "learning_rate": 0.0001, + "loss": 6.5027, + "loss/crossentropy": 2.8108277320861816, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20668402314186096, + "step": 5534 + }, + { + "epoch": 0.173, + "grad_norm": 3.703125, + "grad_norm_var": 0.07682291666666667, + "learning_rate": 0.0001, + "loss": 6.4419, + "loss/crossentropy": 2.603666305541992, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.21390238404273987, + "step": 5536 + }, + { + "epoch": 0.1730625, + "grad_norm": 3.484375, + "grad_norm_var": 0.08290608723958333, + "learning_rate": 0.0001, + "loss": 6.4747, + "loss/crossentropy": 2.6804850101470947, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.21692615002393723, + "step": 5538 + }, + { + "epoch": 0.173125, + "grad_norm": 3.21875, + "grad_norm_var": 0.09759012858072917, + "learning_rate": 0.0001, + "loss": 6.3413, + "loss/crossentropy": 2.7084202766418457, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20000549405813217, + "step": 5540 + }, + { + "epoch": 0.1731875, + "grad_norm": 3.4375, + "grad_norm_var": 0.10080973307291667, + "learning_rate": 0.0001, + "loss": 6.3795, + "loss/crossentropy": 2.7111209630966187, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.2043348103761673, + "step": 5542 + }, + { + "epoch": 0.17325, + "grad_norm": 3.328125, + "grad_norm_var": 0.10380452473958333, + "learning_rate": 0.0001, + "loss": 5.9665, + "loss/crossentropy": 2.4299083948135376, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19233398884534836, + "step": 5544 + }, + { + "epoch": 0.1733125, + "grad_norm": 3.1875, + "grad_norm_var": 0.0641021728515625, + "learning_rate": 0.0001, + "loss": 6.2057, + "loss/crossentropy": 2.600328803062439, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.19881782680749893, + "step": 5546 + }, + { + "epoch": 0.173375, + "grad_norm": 3.34375, + "grad_norm_var": 0.09570210774739583, + "learning_rate": 0.0001, + "loss": 6.2745, + "loss/crossentropy": 2.575212240219116, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.2023472636938095, + "step": 5548 + }, + { + "epoch": 0.1734375, + "grad_norm": 4.1875, + "grad_norm_var": 0.11731669108072916, + "learning_rate": 0.0001, + "loss": 6.2189, + "loss/crossentropy": 2.5943928956985474, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.1976061761379242, + "step": 5550 + }, + { + "epoch": 0.1735, + "grad_norm": 3.84375, + "grad_norm_var": 0.10994364420572916, + "learning_rate": 0.0001, + "loss": 6.5701, + "loss/crossentropy": 2.767845034599304, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.2130359187722206, + "step": 5552 + }, + { + "epoch": 0.1735625, + "grad_norm": 3.453125, + "grad_norm_var": 0.11357014973958333, + "learning_rate": 0.0001, + "loss": 6.1909, + "loss/crossentropy": 2.653822660446167, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19238030910491943, + "step": 5554 + }, + { + "epoch": 0.173625, + "grad_norm": 3.78125, + "grad_norm_var": 0.10829671223958333, + "learning_rate": 0.0001, + "loss": 6.2698, + "loss/crossentropy": 2.593658685684204, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.1996469348669052, + "step": 5556 + }, + { + "epoch": 0.1736875, + "grad_norm": 3.28125, + "grad_norm_var": 0.11754150390625, + "learning_rate": 0.0001, + "loss": 6.1297, + "loss/crossentropy": 2.5261536836624146, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19746138155460358, + "step": 5558 + }, + { + "epoch": 0.17375, + "grad_norm": 3.828125, + "grad_norm_var": 0.1182037353515625, + "learning_rate": 0.0001, + "loss": 6.0964, + "loss/crossentropy": 2.4176089763641357, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.1987377628684044, + "step": 5560 + }, + { + "epoch": 0.1738125, + "grad_norm": 3.765625, + "grad_norm_var": 0.10011393229166667, + "learning_rate": 0.0001, + "loss": 6.3642, + "loss/crossentropy": 2.759734869003296, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19989798218011856, + "step": 5562 + }, + { + "epoch": 0.173875, + "grad_norm": 3.5, + "grad_norm_var": 0.06797587076822917, + "learning_rate": 0.0001, + "loss": 6.0124, + "loss/crossentropy": 2.4658912420272827, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19527988135814667, + "step": 5564 + }, + { + "epoch": 0.1739375, + "grad_norm": 3.71875, + "grad_norm_var": 0.05498046875, + "learning_rate": 0.0001, + "loss": 6.0655, + "loss/crossentropy": 2.4361329078674316, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.19965960085391998, + "step": 5566 + }, + { + "epoch": 0.174, + "grad_norm": 3.640625, + "grad_norm_var": 0.0514068603515625, + "learning_rate": 0.0001, + "loss": 6.3428, + "loss/crossentropy": 2.624956250190735, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.208501935005188, + "step": 5568 + }, + { + "epoch": 0.1740625, + "grad_norm": 3.859375, + "grad_norm_var": 0.03737691243489583, + "learning_rate": 0.0001, + "loss": 6.4201, + "loss/crossentropy": 2.6422702074050903, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.21371586620807648, + "step": 5570 + }, + { + "epoch": 0.174125, + "grad_norm": 4.125, + "grad_norm_var": 0.0541015625, + "learning_rate": 0.0001, + "loss": 6.4237, + "loss/crossentropy": 2.668924331665039, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.210244320333004, + "step": 5572 + }, + { + "epoch": 0.1741875, + "grad_norm": 3.75, + "grad_norm_var": 0.039697265625, + "learning_rate": 0.0001, + "loss": 6.43, + "loss/crossentropy": 2.765849232673645, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20313312858343124, + "step": 5574 + }, + { + "epoch": 0.17425, + "grad_norm": 3.421875, + "grad_norm_var": 0.0411773681640625, + "learning_rate": 0.0001, + "loss": 6.4027, + "loss/crossentropy": 2.7064108848571777, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.20517880469560623, + "step": 5576 + }, + { + "epoch": 0.1743125, + "grad_norm": 3.8125, + "grad_norm_var": 0.042378743489583336, + "learning_rate": 0.0001, + "loss": 6.5003, + "loss/crossentropy": 2.7742602825164795, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.2101084440946579, + "step": 5578 + }, + { + "epoch": 0.174375, + "grad_norm": 3.421875, + "grad_norm_var": 0.05230712890625, + "learning_rate": 0.0001, + "loss": 6.4819, + "loss/crossentropy": 2.7062528133392334, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.21194252371788025, + "step": 5580 + }, + { + "epoch": 0.1744375, + "grad_norm": 3.6875, + "grad_norm_var": 0.047948201497395836, + "learning_rate": 0.0001, + "loss": 6.3452, + "loss/crossentropy": 2.660622000694275, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.20400027930736542, + "step": 5582 + }, + { + "epoch": 0.1745, + "grad_norm": 4.09375, + "grad_norm_var": 0.0604400634765625, + "learning_rate": 0.0001, + "loss": 6.6467, + "loss/crossentropy": 2.7296407222747803, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.22529703378677368, + "step": 5584 + }, + { + "epoch": 0.1745625, + "grad_norm": 3.671875, + "grad_norm_var": 0.059912109375, + "learning_rate": 0.0001, + "loss": 6.3289, + "loss/crossentropy": 2.6632198095321655, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.2071952521800995, + "step": 5586 + }, + { + "epoch": 0.174625, + "grad_norm": 3.359375, + "grad_norm_var": 0.04794921875, + "learning_rate": 0.0001, + "loss": 6.2706, + "loss/crossentropy": 2.6442915201187134, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20012595504522324, + "step": 5588 + }, + { + "epoch": 0.1746875, + "grad_norm": 3.734375, + "grad_norm_var": 0.055501302083333336, + "learning_rate": 0.0001, + "loss": 6.4411, + "loss/crossentropy": 2.7457441091537476, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.2035154029726982, + "step": 5590 + }, + { + "epoch": 0.17475, + "grad_norm": 3.40625, + "grad_norm_var": 0.07525634765625, + "learning_rate": 0.0001, + "loss": 5.8117, + "loss/crossentropy": 2.4168955087661743, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.17971421033143997, + "step": 5592 + }, + { + "epoch": 0.1748125, + "grad_norm": 3.578125, + "grad_norm_var": 0.07383524576822917, + "learning_rate": 0.0001, + "loss": 6.0586, + "loss/crossentropy": 2.48819899559021, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19414632767438889, + "step": 5594 + }, + { + "epoch": 0.174875, + "grad_norm": 3.75, + "grad_norm_var": 0.22345377604166666, + "learning_rate": 0.0001, + "loss": 6.9743, + "loss/crossentropy": 2.888335704803467, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.23632864654064178, + "step": 5596 + }, + { + "epoch": 0.1749375, + "grad_norm": 3.4375, + "grad_norm_var": 0.22860921223958333, + "learning_rate": 0.0001, + "loss": 6.0856, + "loss/crossentropy": 2.4923453330993652, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19994857162237167, + "step": 5598 + }, + { + "epoch": 0.175, + "grad_norm": 3.828125, + "grad_norm_var": 0.21950581868489583, + "learning_rate": 0.0001, + "loss": 5.9782, + "loss/crossentropy": 2.332379460334778, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20052067935466766, + "step": 5600 + }, + { + "epoch": 0.1750625, + "grad_norm": 3.78125, + "grad_norm_var": 0.218701171875, + "learning_rate": 0.0001, + "loss": 6.4891, + "loss/crossentropy": 2.630122423171997, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.21871009469032288, + "step": 5602 + }, + { + "epoch": 0.175125, + "grad_norm": 3.546875, + "grad_norm_var": 0.2118072509765625, + "learning_rate": 0.0001, + "loss": 6.1111, + "loss/crossentropy": 2.5449728965759277, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.19449857622385025, + "step": 5604 + }, + { + "epoch": 0.1751875, + "grad_norm": 3.6875, + "grad_norm_var": 0.20869140625, + "learning_rate": 0.0001, + "loss": 6.1819, + "loss/crossentropy": 2.5370291471481323, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20042233169078827, + "step": 5606 + }, + { + "epoch": 0.17525, + "grad_norm": 3.5625, + "grad_norm_var": 0.17384440104166668, + "learning_rate": 0.0001, + "loss": 6.2695, + "loss/crossentropy": 2.5881470441818237, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.19899416714906693, + "step": 5608 + }, + { + "epoch": 0.1753125, + "grad_norm": 3.515625, + "grad_norm_var": 0.17097066243489584, + "learning_rate": 0.0001, + "loss": 6.4462, + "loss/crossentropy": 2.672497034072876, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.20510167628526688, + "step": 5610 + }, + { + "epoch": 0.175375, + "grad_norm": 3.40625, + "grad_norm_var": 0.0383941650390625, + "learning_rate": 0.0001, + "loss": 6.0375, + "loss/crossentropy": 2.4991201162338257, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19368354231119156, + "step": 5612 + }, + { + "epoch": 0.1754375, + "grad_norm": 3.875, + "grad_norm_var": 0.03714192708333333, + "learning_rate": 0.0001, + "loss": 6.4138, + "loss/crossentropy": 2.6649649143218994, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.2076936662197113, + "step": 5614 + }, + { + "epoch": 0.1755, + "grad_norm": 3.90625, + "grad_norm_var": 0.0385406494140625, + "learning_rate": 0.0001, + "loss": 5.8119, + "loss/crossentropy": 2.332767963409424, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19087842851877213, + "step": 5616 + }, + { + "epoch": 0.1755625, + "grad_norm": 3.796875, + "grad_norm_var": 0.0342926025390625, + "learning_rate": 0.0001, + "loss": 6.3032, + "loss/crossentropy": 2.5783437490463257, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.2084260806441307, + "step": 5618 + }, + { + "epoch": 0.175625, + "grad_norm": 3.671875, + "grad_norm_var": 0.049235026041666664, + "learning_rate": 0.0001, + "loss": 6.737, + "loss/crossentropy": 2.8179370164871216, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.22589459270238876, + "step": 5620 + }, + { + "epoch": 0.1756875, + "grad_norm": 3.96875, + "grad_norm_var": 0.054833984375, + "learning_rate": 0.0001, + "loss": 6.3994, + "loss/crossentropy": 2.7480050325393677, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.20303140580654144, + "step": 5622 + }, + { + "epoch": 0.17575, + "grad_norm": 3.828125, + "grad_norm_var": 0.05921223958333333, + "learning_rate": 0.0001, + "loss": 6.2438, + "loss/crossentropy": 2.5771526098251343, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.1998675912618637, + "step": 5624 + }, + { + "epoch": 0.1758125, + "grad_norm": 3.921875, + "grad_norm_var": 0.060114542643229164, + "learning_rate": 0.0001, + "loss": 6.4994, + "loss/crossentropy": 2.608477830886841, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.2187797725200653, + "step": 5626 + }, + { + "epoch": 0.175875, + "grad_norm": 4.09375, + "grad_norm_var": 0.04951070149739583, + "learning_rate": 0.0001, + "loss": 6.3209, + "loss/crossentropy": 2.6283583641052246, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.20871003717184067, + "step": 5628 + }, + { + "epoch": 0.1759375, + "grad_norm": 3.96875, + "grad_norm_var": 0.05357157389322917, + "learning_rate": 0.0001, + "loss": 6.298, + "loss/crossentropy": 2.627431631088257, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20377814769744873, + "step": 5630 + }, + { + "epoch": 0.176, + "grad_norm": 3.96875, + "grad_norm_var": 0.0513671875, + "learning_rate": 0.0001, + "loss": 6.4348, + "loss/crossentropy": 2.6045050621032715, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.21545372158288956, + "step": 5632 + }, + { + "epoch": 0.1760625, + "grad_norm": 3.734375, + "grad_norm_var": 0.03843994140625, + "learning_rate": 0.0001, + "loss": 6.4334, + "loss/crossentropy": 2.734872579574585, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.2057913839817047, + "step": 5634 + }, + { + "epoch": 0.176125, + "grad_norm": 3.875, + "grad_norm_var": 0.032124837239583336, + "learning_rate": 0.0001, + "loss": 6.2559, + "loss/crossentropy": 2.5179080963134766, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.2097330242395401, + "step": 5636 + }, + { + "epoch": 0.1761875, + "grad_norm": 4.375, + "grad_norm_var": 0.04289957682291667, + "learning_rate": 0.0001, + "loss": 6.3522, + "loss/crossentropy": 2.6638708114624023, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.20399095118045807, + "step": 5638 + }, + { + "epoch": 0.17625, + "grad_norm": 3.65625, + "grad_norm_var": 0.047419230143229164, + "learning_rate": 0.0001, + "loss": 6.1607, + "loss/crossentropy": 2.555658221244812, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.20073561370372772, + "step": 5640 + }, + { + "epoch": 0.1763125, + "grad_norm": 3.328125, + "grad_norm_var": 0.06232808430989583, + "learning_rate": 0.0001, + "loss": 6.2548, + "loss/crossentropy": 2.580946207046509, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.20527862012386322, + "step": 5642 + }, + { + "epoch": 0.176375, + "grad_norm": 3.90625, + "grad_norm_var": 0.07954813639322916, + "learning_rate": 0.0001, + "loss": 5.9941, + "loss/crossentropy": 2.4849324226379395, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19505522400140762, + "step": 5644 + }, + { + "epoch": 0.1764375, + "grad_norm": 3.75, + "grad_norm_var": 0.07453511555989584, + "learning_rate": 0.0001, + "loss": 6.4735, + "loss/crossentropy": 2.701940655708313, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.21113985031843185, + "step": 5646 + }, + { + "epoch": 0.1765, + "grad_norm": 3.34375, + "grad_norm_var": 0.077490234375, + "learning_rate": 0.0001, + "loss": 6.0421, + "loss/crossentropy": 2.433351516723633, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.1968122348189354, + "step": 5648 + }, + { + "epoch": 0.1765625, + "grad_norm": 3.6875, + "grad_norm_var": 0.0806060791015625, + "learning_rate": 0.0001, + "loss": 6.4999, + "loss/crossentropy": 2.7361565828323364, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.20840420573949814, + "step": 5650 + }, + { + "epoch": 0.176625, + "grad_norm": 4.09375, + "grad_norm_var": 0.0964263916015625, + "learning_rate": 0.0001, + "loss": 6.1257, + "loss/crossentropy": 2.467520236968994, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.20097361505031586, + "step": 5652 + }, + { + "epoch": 0.1766875, + "grad_norm": 3.546875, + "grad_norm_var": 0.09436442057291666, + "learning_rate": 0.0001, + "loss": 6.2714, + "loss/crossentropy": 2.6138484477996826, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.19973890483379364, + "step": 5654 + }, + { + "epoch": 0.17675, + "grad_norm": 3.640625, + "grad_norm_var": 0.09977925618489583, + "learning_rate": 0.0001, + "loss": 6.6365, + "loss/crossentropy": 2.814689874649048, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.21421615034341812, + "step": 5656 + }, + { + "epoch": 0.1768125, + "grad_norm": 3.546875, + "grad_norm_var": 0.0920318603515625, + "learning_rate": 0.0001, + "loss": 6.3915, + "loss/crossentropy": 2.6593785285949707, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20915207266807556, + "step": 5658 + }, + { + "epoch": 0.176875, + "grad_norm": 3.375, + "grad_norm_var": 0.074462890625, + "learning_rate": 0.0001, + "loss": 6.0323, + "loss/crossentropy": 2.47067928314209, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.1921006441116333, + "step": 5660 + }, + { + "epoch": 0.1769375, + "grad_norm": 3.71875, + "grad_norm_var": 0.07395426432291667, + "learning_rate": 0.0001, + "loss": 6.0652, + "loss/crossentropy": 2.50555682182312, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.197368785738945, + "step": 5662 + }, + { + "epoch": 0.177, + "grad_norm": 2.9375, + "grad_norm_var": 0.11296284993489583, + "learning_rate": 0.0001, + "loss": 5.5988, + "loss/crossentropy": 2.294847249984741, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17531780153512955, + "step": 5664 + }, + { + "epoch": 0.1770625, + "grad_norm": 3.375, + "grad_norm_var": 0.1178863525390625, + "learning_rate": 0.0001, + "loss": 6.3314, + "loss/crossentropy": 2.6526646614074707, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.20341810584068298, + "step": 5666 + }, + { + "epoch": 0.177125, + "grad_norm": 3.6875, + "grad_norm_var": 0.1009674072265625, + "learning_rate": 0.0001, + "loss": 6.3754, + "loss/crossentropy": 2.6954914331436157, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.20822450518608093, + "step": 5668 + }, + { + "epoch": 0.1771875, + "grad_norm": 3.96875, + "grad_norm_var": 0.0833984375, + "learning_rate": 0.0001, + "loss": 6.4037, + "loss/crossentropy": 2.563956379890442, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.2171739861369133, + "step": 5670 + }, + { + "epoch": 0.17725, + "grad_norm": 3.84375, + "grad_norm_var": 0.09482320149739583, + "learning_rate": 0.0001, + "loss": 6.0265, + "loss/crossentropy": 2.4230778217315674, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.19276423752307892, + "step": 5672 + }, + { + "epoch": 0.1773125, + "grad_norm": 3.59375, + "grad_norm_var": 0.10392964680989583, + "learning_rate": 0.0001, + "loss": 6.2188, + "loss/crossentropy": 2.607333183288574, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.20059751719236374, + "step": 5674 + }, + { + "epoch": 0.177375, + "grad_norm": 3.515625, + "grad_norm_var": 0.10070699055989583, + "learning_rate": 0.0001, + "loss": 5.9932, + "loss/crossentropy": 2.5008485317230225, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.18634529411792755, + "step": 5676 + }, + { + "epoch": 0.1774375, + "grad_norm": 3.765625, + "grad_norm_var": 0.10190327962239583, + "learning_rate": 0.0001, + "loss": 6.1233, + "loss/crossentropy": 2.4628396034240723, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20198483765125275, + "step": 5678 + }, + { + "epoch": 0.1775, + "grad_norm": 3.390625, + "grad_norm_var": 0.06603902180989583, + "learning_rate": 0.0001, + "loss": 5.9784, + "loss/crossentropy": 2.521420478820801, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18866854161024094, + "step": 5680 + }, + { + "epoch": 0.1775625, + "grad_norm": 3.609375, + "grad_norm_var": 0.059065755208333334, + "learning_rate": 0.0001, + "loss": 6.1665, + "loss/crossentropy": 2.536958336830139, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20045578479766846, + "step": 5682 + }, + { + "epoch": 0.177625, + "grad_norm": 3.921875, + "grad_norm_var": 0.06405843098958333, + "learning_rate": 0.0001, + "loss": 6.0364, + "loss/crossentropy": 2.4570053815841675, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.1911417543888092, + "step": 5684 + }, + { + "epoch": 0.1776875, + "grad_norm": 3.3125, + "grad_norm_var": 0.05449117024739583, + "learning_rate": 0.0001, + "loss": 6.3008, + "loss/crossentropy": 2.6369824409484863, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20388375967741013, + "step": 5686 + }, + { + "epoch": 0.17775, + "grad_norm": 3.59375, + "grad_norm_var": 0.0405670166015625, + "learning_rate": 0.0001, + "loss": 6.5723, + "loss/crossentropy": 2.806073546409607, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.21021592617034912, + "step": 5688 + }, + { + "epoch": 0.1778125, + "grad_norm": 3.765625, + "grad_norm_var": 0.03326822916666667, + "learning_rate": 0.0001, + "loss": 6.6962, + "loss/crossentropy": 2.8453763723373413, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.21906623244285583, + "step": 5690 + }, + { + "epoch": 0.177875, + "grad_norm": 3.546875, + "grad_norm_var": 0.034830729166666664, + "learning_rate": 0.0001, + "loss": 6.2838, + "loss/crossentropy": 2.645614504814148, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.20483769476413727, + "step": 5692 + }, + { + "epoch": 0.1779375, + "grad_norm": 3.34375, + "grad_norm_var": 0.035416666666666666, + "learning_rate": 0.0001, + "loss": 6.1371, + "loss/crossentropy": 2.5817540884017944, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.1961633712053299, + "step": 5694 + }, + { + "epoch": 0.178, + "grad_norm": 3.5, + "grad_norm_var": 0.04157613118489583, + "learning_rate": 0.0001, + "loss": 6.7531, + "loss/crossentropy": 2.898721218109131, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.21942297369241714, + "step": 5696 + }, + { + "epoch": 0.1780625, + "grad_norm": 3.96875, + "grad_norm_var": 0.0493560791015625, + "learning_rate": 0.0001, + "loss": 6.3258, + "loss/crossentropy": 2.5990471839904785, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20939195156097412, + "step": 5698 + }, + { + "epoch": 0.178125, + "grad_norm": 3.515625, + "grad_norm_var": 0.0461090087890625, + "learning_rate": 0.0001, + "loss": 6.1863, + "loss/crossentropy": 2.567986488342285, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.1997183933854103, + "step": 5700 + }, + { + "epoch": 0.1781875, + "grad_norm": 3.859375, + "grad_norm_var": 0.041943359375, + "learning_rate": 0.0001, + "loss": 6.2774, + "loss/crossentropy": 2.6630423069000244, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.194635771214962, + "step": 5702 + }, + { + "epoch": 0.17825, + "grad_norm": 4.15625, + "grad_norm_var": 0.05338134765625, + "learning_rate": 0.0001, + "loss": 6.0809, + "loss/crossentropy": 2.3729244470596313, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.20947062969207764, + "step": 5704 + }, + { + "epoch": 0.1783125, + "grad_norm": 3.515625, + "grad_norm_var": 0.058080037434895836, + "learning_rate": 0.0001, + "loss": 6.1394, + "loss/crossentropy": 2.588572144508362, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19414159655570984, + "step": 5706 + }, + { + "epoch": 0.178375, + "grad_norm": 3.265625, + "grad_norm_var": 0.0641510009765625, + "learning_rate": 0.0001, + "loss": 6.1624, + "loss/crossentropy": 2.5588879585266113, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.2029315009713173, + "step": 5708 + }, + { + "epoch": 0.1784375, + "grad_norm": 3.546875, + "grad_norm_var": 0.057835896809895836, + "learning_rate": 0.0001, + "loss": 6.1616, + "loss/crossentropy": 2.531624436378479, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.20166686177253723, + "step": 5710 + }, + { + "epoch": 0.1785, + "grad_norm": 3.421875, + "grad_norm_var": 0.0538970947265625, + "learning_rate": 0.0001, + "loss": 6.3368, + "loss/crossentropy": 2.6661752462387085, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.20690400898456573, + "step": 5712 + }, + { + "epoch": 0.1785625, + "grad_norm": 3.78125, + "grad_norm_var": 0.04736328125, + "learning_rate": 0.0001, + "loss": 6.4346, + "loss/crossentropy": 2.690172791481018, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.2088213488459587, + "step": 5714 + }, + { + "epoch": 0.178625, + "grad_norm": 3.65625, + "grad_norm_var": 0.051268513997395834, + "learning_rate": 0.0001, + "loss": 6.414, + "loss/crossentropy": 2.7049120664596558, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.20645985007286072, + "step": 5716 + }, + { + "epoch": 0.1786875, + "grad_norm": 3.703125, + "grad_norm_var": 0.04967447916666667, + "learning_rate": 0.0001, + "loss": 6.2708, + "loss/crossentropy": 2.523758292198181, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.2079075500369072, + "step": 5718 + }, + { + "epoch": 0.17875, + "grad_norm": 4.0, + "grad_norm_var": 0.03925679524739583, + "learning_rate": 0.0001, + "loss": 6.4607, + "loss/crossentropy": 2.631307601928711, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.21418674290180206, + "step": 5720 + }, + { + "epoch": 0.1788125, + "grad_norm": 3.625, + "grad_norm_var": 0.03351949055989583, + "learning_rate": 0.0001, + "loss": 6.0575, + "loss/crossentropy": 2.4578261375427246, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.19473493099212646, + "step": 5722 + }, + { + "epoch": 0.178875, + "grad_norm": 3.765625, + "grad_norm_var": 0.022184244791666665, + "learning_rate": 0.0001, + "loss": 6.1044, + "loss/crossentropy": 2.4846503734588623, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.20025886595249176, + "step": 5724 + }, + { + "epoch": 0.1789375, + "grad_norm": 3.578125, + "grad_norm_var": 0.022459920247395834, + "learning_rate": 0.0001, + "loss": 6.212, + "loss/crossentropy": 2.6433242559432983, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.19319302588701248, + "step": 5726 + }, + { + "epoch": 0.179, + "grad_norm": 3.546875, + "grad_norm_var": 0.0189453125, + "learning_rate": 0.0001, + "loss": 5.9439, + "loss/crossentropy": 2.4513763189315796, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.18948928266763687, + "step": 5728 + }, + { + "epoch": 0.1790625, + "grad_norm": 3.84375, + "grad_norm_var": 0.027765909830729168, + "learning_rate": 0.0001, + "loss": 6.5101, + "loss/crossentropy": 2.668804407119751, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.21616463363170624, + "step": 5730 + }, + { + "epoch": 0.179125, + "grad_norm": 4.28125, + "grad_norm_var": 0.04680989583333333, + "learning_rate": 0.0001, + "loss": 6.1737, + "loss/crossentropy": 2.4736764430999756, + "loss/hidden": 1.72265625, + "loss/jsd": 0.0, + "loss/logits": 0.1977366879582405, + "step": 5732 + }, + { + "epoch": 0.1791875, + "grad_norm": 3.671875, + "grad_norm_var": 0.047652180989583334, + "learning_rate": 0.0001, + "loss": 6.4047, + "loss/crossentropy": 2.67950177192688, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.2072872519493103, + "step": 5734 + }, + { + "epoch": 0.17925, + "grad_norm": 3.578125, + "grad_norm_var": 0.06891988118489584, + "learning_rate": 0.0001, + "loss": 6.2035, + "loss/crossentropy": 2.63262677192688, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.19537082314491272, + "step": 5736 + }, + { + "epoch": 0.1793125, + "grad_norm": 3.515625, + "grad_norm_var": 0.07222391764322916, + "learning_rate": 0.0001, + "loss": 6.2492, + "loss/crossentropy": 2.5669682025909424, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20493736863136292, + "step": 5738 + }, + { + "epoch": 0.179375, + "grad_norm": 3.90625, + "grad_norm_var": 0.07522786458333333, + "learning_rate": 0.0001, + "loss": 6.2924, + "loss/crossentropy": 2.6394450664520264, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20279543846845627, + "step": 5740 + }, + { + "epoch": 0.1794375, + "grad_norm": 3.5625, + "grad_norm_var": 0.082666015625, + "learning_rate": 0.0001, + "loss": 6.4208, + "loss/crossentropy": 2.7411283254623413, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.2066434845328331, + "step": 5742 + }, + { + "epoch": 0.1795, + "grad_norm": 3.4375, + "grad_norm_var": 0.0856353759765625, + "learning_rate": 0.0001, + "loss": 6.3208, + "loss/crossentropy": 2.672483444213867, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.2019430547952652, + "step": 5744 + }, + { + "epoch": 0.1795625, + "grad_norm": 3.859375, + "grad_norm_var": 0.07745768229166666, + "learning_rate": 0.0001, + "loss": 6.4814, + "loss/crossentropy": 2.6624244451522827, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.21197732537984848, + "step": 5746 + }, + { + "epoch": 0.179625, + "grad_norm": 3.5625, + "grad_norm_var": 0.0529449462890625, + "learning_rate": 0.0001, + "loss": 6.3465, + "loss/crossentropy": 2.6245813369750977, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20969115942716599, + "step": 5748 + }, + { + "epoch": 0.1796875, + "grad_norm": 3.390625, + "grad_norm_var": 0.0566802978515625, + "learning_rate": 0.0001, + "loss": 6.2782, + "loss/crossentropy": 2.6000006198883057, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.2045365795493126, + "step": 5750 + }, + { + "epoch": 0.17975, + "grad_norm": 3.515625, + "grad_norm_var": 0.038037109375, + "learning_rate": 0.0001, + "loss": 6.1203, + "loss/crossentropy": 2.5339865684509277, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.19495518505573273, + "step": 5752 + }, + { + "epoch": 0.1798125, + "grad_norm": 3.75, + "grad_norm_var": 0.03528238932291667, + "learning_rate": 0.0001, + "loss": 6.0857, + "loss/crossentropy": 2.468671679496765, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.1999809518456459, + "step": 5754 + }, + { + "epoch": 0.179875, + "grad_norm": 3.28125, + "grad_norm_var": 0.04231669108072917, + "learning_rate": 0.0001, + "loss": 6.4099, + "loss/crossentropy": 2.736938714981079, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.2055758759379387, + "step": 5756 + }, + { + "epoch": 0.1799375, + "grad_norm": 3.875, + "grad_norm_var": 0.03561197916666667, + "learning_rate": 0.0001, + "loss": 6.3897, + "loss/crossentropy": 2.660236954689026, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.2104458436369896, + "step": 5758 + }, + { + "epoch": 0.18, + "grad_norm": 3.0625, + "grad_norm_var": 0.0550445556640625, + "learning_rate": 0.0001, + "loss": 5.5766, + "loss/crossentropy": 2.194609224796295, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.17882372438907623, + "step": 5760 + }, + { + "epoch": 0.1800625, + "grad_norm": 3.65625, + "grad_norm_var": 0.05657145182291667, + "learning_rate": 0.0001, + "loss": 6.1178, + "loss/crossentropy": 2.506523847579956, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.20019135624170303, + "step": 5762 + }, + { + "epoch": 0.180125, + "grad_norm": 4.0, + "grad_norm_var": 0.06211649576822917, + "learning_rate": 0.0001, + "loss": 6.2018, + "loss/crossentropy": 2.5049558877944946, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.2103073075413704, + "step": 5764 + }, + { + "epoch": 0.1801875, + "grad_norm": 4.34375, + "grad_norm_var": 0.08708394368489583, + "learning_rate": 0.0001, + "loss": 6.4528, + "loss/crossentropy": 2.705584406852722, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.21144528687000275, + "step": 5766 + }, + { + "epoch": 0.18025, + "grad_norm": 3.78125, + "grad_norm_var": 0.16002604166666667, + "learning_rate": 0.0001, + "loss": 6.464, + "loss/crossentropy": 2.6017357110977173, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.22060427069664001, + "step": 5768 + }, + { + "epoch": 0.1803125, + "grad_norm": 3.8125, + "grad_norm_var": 0.16955973307291666, + "learning_rate": 0.0001, + "loss": 6.2266, + "loss/crossentropy": 2.54715359210968, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.20583520829677582, + "step": 5770 + }, + { + "epoch": 0.180375, + "grad_norm": 3.453125, + "grad_norm_var": 0.15359700520833333, + "learning_rate": 0.0001, + "loss": 6.2142, + "loss/crossentropy": 2.560963273048401, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.19970254600048065, + "step": 5772 + }, + { + "epoch": 0.1804375, + "grad_norm": 3.59375, + "grad_norm_var": 0.161083984375, + "learning_rate": 0.0001, + "loss": 6.3225, + "loss/crossentropy": 2.578567624092102, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.20603880286216736, + "step": 5774 + }, + { + "epoch": 0.1805, + "grad_norm": 3.640625, + "grad_norm_var": 1.7318033854166666, + "learning_rate": 0.0001, + "loss": 6.5207, + "loss/crossentropy": 2.654413104057312, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.2202201560139656, + "step": 5776 + }, + { + "epoch": 0.1805625, + "grad_norm": 3.515625, + "grad_norm_var": 1.7427642822265625, + "learning_rate": 0.0001, + "loss": 6.3629, + "loss/crossentropy": 2.5971418619155884, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.208999365568161, + "step": 5778 + }, + { + "epoch": 0.180625, + "grad_norm": 3.75, + "grad_norm_var": 1.759130859375, + "learning_rate": 0.0001, + "loss": 6.1511, + "loss/crossentropy": 2.503427267074585, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.2022705376148224, + "step": 5780 + }, + { + "epoch": 0.1806875, + "grad_norm": 3.71875, + "grad_norm_var": 1.7509724934895834, + "learning_rate": 0.0001, + "loss": 6.2061, + "loss/crossentropy": 2.415714144706726, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.21223796904087067, + "step": 5782 + }, + { + "epoch": 0.18075, + "grad_norm": 3.46875, + "grad_norm_var": 1.7370402018229167, + "learning_rate": 0.0001, + "loss": 6.2435, + "loss/crossentropy": 2.522855043411255, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.2103436440229416, + "step": 5784 + }, + { + "epoch": 0.1808125, + "grad_norm": 3.5, + "grad_norm_var": 1.7345011393229166, + "learning_rate": 0.0001, + "loss": 6.2004, + "loss/crossentropy": 2.580862522125244, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.20140428841114044, + "step": 5786 + }, + { + "epoch": 0.180875, + "grad_norm": 3.5, + "grad_norm_var": 1.755329386393229, + "learning_rate": 0.0001, + "loss": 5.9218, + "loss/crossentropy": 2.3795002698898315, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19134049117565155, + "step": 5788 + }, + { + "epoch": 0.1809375, + "grad_norm": 4.375, + "grad_norm_var": 1.745368448893229, + "learning_rate": 0.0001, + "loss": 6.3283, + "loss/crossentropy": 2.644761562347412, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20507150143384933, + "step": 5790 + }, + { + "epoch": 0.181, + "grad_norm": 3.40625, + "grad_norm_var": 0.07665608723958334, + "learning_rate": 0.0001, + "loss": 6.5596, + "loss/crossentropy": 2.7409855127334595, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.21584705263376236, + "step": 5792 + }, + { + "epoch": 0.1810625, + "grad_norm": 3.453125, + "grad_norm_var": 0.0763671875, + "learning_rate": 0.0001, + "loss": 5.9313, + "loss/crossentropy": 2.430976629257202, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19105058908462524, + "step": 5794 + }, + { + "epoch": 0.181125, + "grad_norm": 3.515625, + "grad_norm_var": 0.07681376139322917, + "learning_rate": 0.0001, + "loss": 6.2093, + "loss/crossentropy": 2.5864611864089966, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19939538091421127, + "step": 5796 + }, + { + "epoch": 0.1811875, + "grad_norm": 3.78125, + "grad_norm_var": 0.07241109212239584, + "learning_rate": 0.0001, + "loss": 6.3301, + "loss/crossentropy": 2.6111371517181396, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.203924298286438, + "step": 5798 + }, + { + "epoch": 0.18125, + "grad_norm": 3.75, + "grad_norm_var": 0.07019856770833334, + "learning_rate": 0.0001, + "loss": 6.1891, + "loss/crossentropy": 2.4769375324249268, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.2055877298116684, + "step": 5800 + }, + { + "epoch": 0.1813125, + "grad_norm": 3.328125, + "grad_norm_var": 0.07612202962239584, + "learning_rate": 0.0001, + "loss": 6.229, + "loss/crossentropy": 2.6561849117279053, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.19556637108325958, + "step": 5802 + }, + { + "epoch": 0.181375, + "grad_norm": 3.546875, + "grad_norm_var": 0.06879781087239584, + "learning_rate": 0.0001, + "loss": 6.2133, + "loss/crossentropy": 2.5491530895233154, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.20156937837600708, + "step": 5804 + }, + { + "epoch": 0.1814375, + "grad_norm": 3.703125, + "grad_norm_var": 0.037083943684895836, + "learning_rate": 0.0001, + "loss": 6.4403, + "loss/crossentropy": 2.6249520778656006, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.2116159126162529, + "step": 5806 + }, + { + "epoch": 0.1815, + "grad_norm": 3.65625, + "grad_norm_var": 0.02398681640625, + "learning_rate": 0.0001, + "loss": 6.0739, + "loss/crossentropy": 2.522126793861389, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.1922825202345848, + "step": 5808 + }, + { + "epoch": 0.1815625, + "grad_norm": 4.46875, + "grad_norm_var": 0.06580403645833334, + "learning_rate": 0.0001, + "loss": 6.8166, + "loss/crossentropy": 2.8532909154891968, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.22562817484140396, + "step": 5810 + }, + { + "epoch": 0.181625, + "grad_norm": 3.6875, + "grad_norm_var": 0.12453511555989584, + "learning_rate": 0.0001, + "loss": 6.6679, + "loss/crossentropy": 2.8089990615844727, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.21948668360710144, + "step": 5812 + }, + { + "epoch": 0.1816875, + "grad_norm": 3.703125, + "grad_norm_var": 1.460643513997396, + "learning_rate": 0.0001, + "loss": 6.3886, + "loss/crossentropy": 2.6490007638931274, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.2098982334136963, + "step": 5814 + }, + { + "epoch": 0.18175, + "grad_norm": 3.671875, + "grad_norm_var": 1.4552480061848958, + "learning_rate": 0.0001, + "loss": 6.1484, + "loss/crossentropy": 2.4922189712524414, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.1999887079000473, + "step": 5816 + }, + { + "epoch": 0.1818125, + "grad_norm": 3.828125, + "grad_norm_var": 1.4140625, + "learning_rate": 0.0001, + "loss": 6.5733, + "loss/crossentropy": 2.758327007293701, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.21665561199188232, + "step": 5818 + }, + { + "epoch": 0.181875, + "grad_norm": 3.5625, + "grad_norm_var": 1.4061197916666666, + "learning_rate": 0.0001, + "loss": 6.2049, + "loss/crossentropy": 2.554853320121765, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.20055203139781952, + "step": 5820 + }, + { + "epoch": 0.1819375, + "grad_norm": 3.734375, + "grad_norm_var": 1.4133290608723958, + "learning_rate": 0.0001, + "loss": 6.5205, + "loss/crossentropy": 2.6887972354888916, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.21519909799098969, + "step": 5822 + }, + { + "epoch": 0.182, + "grad_norm": 6.40625, + "grad_norm_var": 1.7002237955729167, + "learning_rate": 0.0001, + "loss": 6.4641, + "loss/crossentropy": 2.568132162094116, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.22943571954965591, + "step": 5824 + }, + { + "epoch": 0.1820625, + "grad_norm": 3.5625, + "grad_norm_var": 1.7548177083333334, + "learning_rate": 0.0001, + "loss": 6.3732, + "loss/crossentropy": 2.7769051790237427, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.19713202863931656, + "step": 5826 + }, + { + "epoch": 0.182125, + "grad_norm": 3.96875, + "grad_norm_var": 1.7350545247395834, + "learning_rate": 0.0001, + "loss": 6.5541, + "loss/crossentropy": 2.8054513931274414, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.209628663957119, + "step": 5828 + }, + { + "epoch": 0.1821875, + "grad_norm": 3.65625, + "grad_norm_var": 0.49374593098958336, + "learning_rate": 0.0001, + "loss": 6.231, + "loss/crossentropy": 2.5347152948379517, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20673973113298416, + "step": 5830 + }, + { + "epoch": 0.18225, + "grad_norm": 3.6875, + "grad_norm_var": 0.49521077473958336, + "learning_rate": 0.0001, + "loss": 6.0622, + "loss/crossentropy": 2.431654691696167, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.19938179105520248, + "step": 5832 + }, + { + "epoch": 0.1823125, + "grad_norm": 3.890625, + "grad_norm_var": 0.49771219889322915, + "learning_rate": 0.0001, + "loss": 6.4294, + "loss/crossentropy": 2.6954740285873413, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.20581210404634476, + "step": 5834 + }, + { + "epoch": 0.182375, + "grad_norm": 3.5, + "grad_norm_var": 0.5013417561848958, + "learning_rate": 0.0001, + "loss": 6.3685, + "loss/crossentropy": 2.5816421508789062, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.21266847103834152, + "step": 5836 + }, + { + "epoch": 0.1824375, + "grad_norm": 3.484375, + "grad_norm_var": 0.5062733968098958, + "learning_rate": 0.0001, + "loss": 6.3101, + "loss/crossentropy": 2.661076307296753, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20084363222122192, + "step": 5838 + }, + { + "epoch": 0.1825, + "grad_norm": 3.34375, + "grad_norm_var": 0.031346638997395836, + "learning_rate": 0.0001, + "loss": 6.1729, + "loss/crossentropy": 2.5334683656692505, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.20027121901512146, + "step": 5840 + }, + { + "epoch": 0.1825625, + "grad_norm": 3.71875, + "grad_norm_var": 0.0342193603515625, + "learning_rate": 0.0001, + "loss": 5.9912, + "loss/crossentropy": 2.4580332040786743, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.18847763538360596, + "step": 5842 + }, + { + "epoch": 0.182625, + "grad_norm": 3.625, + "grad_norm_var": 0.022826131184895834, + "learning_rate": 0.0001, + "loss": 6.1967, + "loss/crossentropy": 2.510142683982849, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.20498262345790863, + "step": 5844 + }, + { + "epoch": 0.1826875, + "grad_norm": 3.453125, + "grad_norm_var": 0.02457275390625, + "learning_rate": 0.0001, + "loss": 5.8873, + "loss/crossentropy": 2.379484176635742, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.192183256149292, + "step": 5846 + }, + { + "epoch": 0.18275, + "grad_norm": 3.5625, + "grad_norm_var": 0.05812886555989583, + "learning_rate": 0.0001, + "loss": 6.1376, + "loss/crossentropy": 2.4847766160964966, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.2043425291776657, + "step": 5848 + }, + { + "epoch": 0.1828125, + "grad_norm": 3.484375, + "grad_norm_var": 0.055052693684895834, + "learning_rate": 0.0001, + "loss": 6.0785, + "loss/crossentropy": 2.466696262359619, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.20063114166259766, + "step": 5850 + }, + { + "epoch": 0.182875, + "grad_norm": 3.40625, + "grad_norm_var": 0.05804036458333333, + "learning_rate": 0.0001, + "loss": 5.8681, + "loss/crossentropy": 2.3853559494018555, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1912408024072647, + "step": 5852 + }, + { + "epoch": 0.1829375, + "grad_norm": 3.59375, + "grad_norm_var": 0.0567291259765625, + "learning_rate": 0.0001, + "loss": 6.2438, + "loss/crossentropy": 2.5789082050323486, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.2028198093175888, + "step": 5854 + }, + { + "epoch": 0.183, + "grad_norm": 3.203125, + "grad_norm_var": 0.06461181640625, + "learning_rate": 0.0001, + "loss": 5.9171, + "loss/crossentropy": 2.3911736011505127, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1955569088459015, + "step": 5856 + }, + { + "epoch": 0.1830625, + "grad_norm": 3.4375, + "grad_norm_var": 0.0631988525390625, + "learning_rate": 0.0001, + "loss": 5.8855, + "loss/crossentropy": 2.3705540895462036, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.18938453495502472, + "step": 5858 + }, + { + "epoch": 0.183125, + "grad_norm": 3.734375, + "grad_norm_var": 0.06767578125, + "learning_rate": 0.0001, + "loss": 6.2594, + "loss/crossentropy": 2.577322840690613, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.20453617721796036, + "step": 5860 + }, + { + "epoch": 0.1831875, + "grad_norm": 3.421875, + "grad_norm_var": 0.06760660807291667, + "learning_rate": 0.0001, + "loss": 5.9125, + "loss/crossentropy": 2.3466345071792603, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.19018106907606125, + "step": 5862 + }, + { + "epoch": 0.18325, + "grad_norm": 3.875, + "grad_norm_var": 0.0389312744140625, + "learning_rate": 0.0001, + "loss": 6.6228, + "loss/crossentropy": 2.7842180728912354, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.21862202137708664, + "step": 5864 + }, + { + "epoch": 0.1833125, + "grad_norm": 3.359375, + "grad_norm_var": 0.049592081705729166, + "learning_rate": 0.0001, + "loss": 6.4839, + "loss/crossentropy": 2.7767481803894043, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.2054857388138771, + "step": 5866 + }, + { + "epoch": 0.183375, + "grad_norm": 4.21875, + "grad_norm_var": 0.07108968098958333, + "learning_rate": 0.0001, + "loss": 6.4285, + "loss/crossentropy": 2.7161409854888916, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.2036587819457054, + "step": 5868 + }, + { + "epoch": 0.1834375, + "grad_norm": 3.71875, + "grad_norm_var": 0.07155659993489584, + "learning_rate": 0.0001, + "loss": 6.3881, + "loss/crossentropy": 2.6537606716156006, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.20976366102695465, + "step": 5870 + }, + { + "epoch": 0.1835, + "grad_norm": 3.578125, + "grad_norm_var": 0.06264546712239584, + "learning_rate": 0.0001, + "loss": 6.5392, + "loss/crossentropy": 2.760679841041565, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.21339457482099533, + "step": 5872 + }, + { + "epoch": 0.1835625, + "grad_norm": 3.59375, + "grad_norm_var": 0.05198465983072917, + "learning_rate": 0.0001, + "loss": 6.4479, + "loss/crossentropy": 2.747857093811035, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20672143250703812, + "step": 5874 + }, + { + "epoch": 0.183625, + "grad_norm": 3.46875, + "grad_norm_var": 0.05926106770833333, + "learning_rate": 0.0001, + "loss": 6.1602, + "loss/crossentropy": 2.582856297492981, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19680150598287582, + "step": 5876 + }, + { + "epoch": 0.1836875, + "grad_norm": 3.65625, + "grad_norm_var": 0.05678609212239583, + "learning_rate": 0.0001, + "loss": 6.4322, + "loss/crossentropy": 2.755292773246765, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20363199710845947, + "step": 5878 + }, + { + "epoch": 0.18375, + "grad_norm": 3.609375, + "grad_norm_var": 0.0533599853515625, + "learning_rate": 0.0001, + "loss": 6.3255, + "loss/crossentropy": 2.6330443620681763, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20635131001472473, + "step": 5880 + }, + { + "epoch": 0.1838125, + "grad_norm": 3.453125, + "grad_norm_var": 0.043701171875, + "learning_rate": 0.0001, + "loss": 6.3948, + "loss/crossentropy": 2.711233615875244, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20507829636335373, + "step": 5882 + }, + { + "epoch": 0.183875, + "grad_norm": 3.40625, + "grad_norm_var": 0.022175089518229166, + "learning_rate": 0.0001, + "loss": 6.1855, + "loss/crossentropy": 2.6189242601394653, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19532468914985657, + "step": 5884 + }, + { + "epoch": 0.1839375, + "grad_norm": 3.421875, + "grad_norm_var": 0.022233072916666666, + "learning_rate": 0.0001, + "loss": 6.3397, + "loss/crossentropy": 2.698093295097351, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.20478975027799606, + "step": 5886 + }, + { + "epoch": 0.184, + "grad_norm": 3.296875, + "grad_norm_var": 0.018708292643229166, + "learning_rate": 0.0001, + "loss": 5.7069, + "loss/crossentropy": 2.351386785507202, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.17383243143558502, + "step": 5888 + }, + { + "epoch": 0.1840625, + "grad_norm": 3.296875, + "grad_norm_var": 0.022728474934895833, + "learning_rate": 0.0001, + "loss": 5.966, + "loss/crossentropy": 2.510135769844055, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18855902552604675, + "step": 5890 + }, + { + "epoch": 0.184125, + "grad_norm": 3.765625, + "grad_norm_var": 0.033935546875, + "learning_rate": 0.0001, + "loss": 6.1203, + "loss/crossentropy": 2.476384997367859, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.20696952939033508, + "step": 5892 + }, + { + "epoch": 0.1841875, + "grad_norm": 3.28125, + "grad_norm_var": 0.0362213134765625, + "learning_rate": 0.0001, + "loss": 5.9829, + "loss/crossentropy": 2.4397428035736084, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.192990280687809, + "step": 5894 + }, + { + "epoch": 0.18425, + "grad_norm": 4.34375, + "grad_norm_var": 0.08364969889322917, + "learning_rate": 0.0001, + "loss": 5.778, + "loss/crossentropy": 2.3311071395874023, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.17945268750190735, + "step": 5896 + }, + { + "epoch": 0.1843125, + "grad_norm": 3.390625, + "grad_norm_var": 0.0907135009765625, + "learning_rate": 0.0001, + "loss": 6.2392, + "loss/crossentropy": 2.6726242303848267, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.2004098892211914, + "step": 5898 + }, + { + "epoch": 0.184375, + "grad_norm": 3.734375, + "grad_norm_var": 0.0914215087890625, + "learning_rate": 0.0001, + "loss": 6.2737, + "loss/crossentropy": 2.6352880001068115, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.20017390698194504, + "step": 5900 + }, + { + "epoch": 0.1844375, + "grad_norm": 3.484375, + "grad_norm_var": 0.09117431640625, + "learning_rate": 0.0001, + "loss": 6.3768, + "loss/crossentropy": 2.702660083770752, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20491401851177216, + "step": 5902 + }, + { + "epoch": 0.1845, + "grad_norm": 3.625, + "grad_norm_var": 0.08385009765625, + "learning_rate": 0.0001, + "loss": 6.3654, + "loss/crossentropy": 2.5936564207077026, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.21232753992080688, + "step": 5904 + }, + { + "epoch": 0.1845625, + "grad_norm": 3.296875, + "grad_norm_var": 0.0859771728515625, + "learning_rate": 0.0001, + "loss": 5.9536, + "loss/crossentropy": 2.4742748737335205, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1905067041516304, + "step": 5906 + }, + { + "epoch": 0.184625, + "grad_norm": 3.609375, + "grad_norm_var": 0.08317057291666667, + "learning_rate": 0.0001, + "loss": 6.1984, + "loss/crossentropy": 2.583977222442627, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.2040199413895607, + "step": 5908 + }, + { + "epoch": 0.1846875, + "grad_norm": 3.734375, + "grad_norm_var": 0.0803375244140625, + "learning_rate": 0.0001, + "loss": 6.5871, + "loss/crossentropy": 2.709283709526062, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.2194249927997589, + "step": 5910 + }, + { + "epoch": 0.18475, + "grad_norm": 3.5, + "grad_norm_var": 0.04755859375, + "learning_rate": 0.0001, + "loss": 5.9032, + "loss/crossentropy": 2.3166788816452026, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.1953662857413292, + "step": 5912 + }, + { + "epoch": 0.1848125, + "grad_norm": 3.859375, + "grad_norm_var": 0.04663798014322917, + "learning_rate": 0.0001, + "loss": 6.475, + "loss/crossentropy": 2.7360684871673584, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.20944271236658096, + "step": 5914 + }, + { + "epoch": 0.184875, + "grad_norm": 4.0625, + "grad_norm_var": 0.0570465087890625, + "learning_rate": 0.0001, + "loss": 6.0926, + "loss/crossentropy": 2.427862048149109, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.2039765566587448, + "step": 5916 + }, + { + "epoch": 0.1849375, + "grad_norm": 3.8125, + "grad_norm_var": 0.05185546875, + "learning_rate": 0.0001, + "loss": 6.5126, + "loss/crossentropy": 2.7930887937545776, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.20515745878219604, + "step": 5918 + }, + { + "epoch": 0.185, + "grad_norm": 4.8125, + "grad_norm_var": 0.13144124348958333, + "learning_rate": 0.0001, + "loss": 6.4902, + "loss/crossentropy": 2.752415418624878, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.2085418626666069, + "step": 5920 + }, + { + "epoch": 0.1850625, + "grad_norm": 3.5625, + "grad_norm_var": 0.09431864420572916, + "learning_rate": 0.0001, + "loss": 6.0656, + "loss/crossentropy": 2.5158064365386963, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19677375257015228, + "step": 5922 + }, + { + "epoch": 0.185125, + "grad_norm": 4.09375, + "grad_norm_var": 0.09625244140625, + "learning_rate": 0.0001, + "loss": 6.4938, + "loss/crossentropy": 2.683521032333374, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.21579697728157043, + "step": 5924 + }, + { + "epoch": 0.1851875, + "grad_norm": 3.609375, + "grad_norm_var": 0.1140045166015625, + "learning_rate": 0.0001, + "loss": 5.6919, + "loss/crossentropy": 2.217368483543396, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.182216115295887, + "step": 5926 + }, + { + "epoch": 0.18525, + "grad_norm": 3.34375, + "grad_norm_var": 0.12662353515625, + "learning_rate": 0.0001, + "loss": 5.9032, + "loss/crossentropy": 2.399585723876953, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.1882522851228714, + "step": 5928 + }, + { + "epoch": 0.1853125, + "grad_norm": 4.15625, + "grad_norm_var": 0.1364654541015625, + "learning_rate": 0.0001, + "loss": 6.5129, + "loss/crossentropy": 2.6554906368255615, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.21972205489873886, + "step": 5930 + }, + { + "epoch": 0.185375, + "grad_norm": 3.203125, + "grad_norm_var": 0.1580718994140625, + "learning_rate": 0.0001, + "loss": 6.1766, + "loss/crossentropy": 2.5827823877334595, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19922207295894623, + "step": 5932 + }, + { + "epoch": 0.1854375, + "grad_norm": 3.921875, + "grad_norm_var": 0.161376953125, + "learning_rate": 0.0001, + "loss": 6.298, + "loss/crossentropy": 2.5829977989196777, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.20508920401334763, + "step": 5934 + }, + { + "epoch": 0.1855, + "grad_norm": 3.484375, + "grad_norm_var": 0.13147684733072917, + "learning_rate": 0.0001, + "loss": 6.2714, + "loss/crossentropy": 2.52670419216156, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.21040621399879456, + "step": 5936 + }, + { + "epoch": 0.1855625, + "grad_norm": 3.609375, + "grad_norm_var": 0.14094645182291668, + "learning_rate": 0.0001, + "loss": 6.1793, + "loss/crossentropy": 2.5923471450805664, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.1989283263683319, + "step": 5938 + }, + { + "epoch": 0.185625, + "grad_norm": 3.296875, + "grad_norm_var": 0.15126546223958334, + "learning_rate": 0.0001, + "loss": 6.0703, + "loss/crossentropy": 2.5271341800689697, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.1937677189707756, + "step": 5940 + }, + { + "epoch": 0.1856875, + "grad_norm": 3.625, + "grad_norm_var": 0.1473541259765625, + "learning_rate": 0.0001, + "loss": 6.6569, + "loss/crossentropy": 2.8995094299316406, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.21206432580947876, + "step": 5942 + }, + { + "epoch": 0.18575, + "grad_norm": 3.3125, + "grad_norm_var": 0.16066080729166668, + "learning_rate": 0.0001, + "loss": 5.5161, + "loss/crossentropy": 2.213390350341797, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.17207244038581848, + "step": 5944 + }, + { + "epoch": 0.1858125, + "grad_norm": 3.75, + "grad_norm_var": 0.14329427083333332, + "learning_rate": 0.0001, + "loss": 6.0463, + "loss/crossentropy": 2.470386028289795, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.19392302632331848, + "step": 5946 + }, + { + "epoch": 0.185875, + "grad_norm": 3.5625, + "grad_norm_var": 0.17496744791666666, + "learning_rate": 0.0001, + "loss": 6.4909, + "loss/crossentropy": 2.6137181520462036, + "loss/hidden": 1.73046875, + "loss/jsd": 0.0, + "loss/logits": 0.21466894447803497, + "step": 5948 + }, + { + "epoch": 0.1859375, + "grad_norm": 3.828125, + "grad_norm_var": 0.17626546223958334, + "learning_rate": 0.0001, + "loss": 6.1844, + "loss/crossentropy": 2.5273345708847046, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.2047644555568695, + "step": 5950 + }, + { + "epoch": 0.186, + "grad_norm": 3.65625, + "grad_norm_var": 0.10972391764322917, + "learning_rate": 0.0001, + "loss": 6.133, + "loss/crossentropy": 2.5080472230911255, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.2011718973517418, + "step": 5952 + }, + { + "epoch": 0.1860625, + "grad_norm": 3.890625, + "grad_norm_var": 0.1380035400390625, + "learning_rate": 0.0001, + "loss": 6.3812, + "loss/crossentropy": 2.6199456453323364, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.20932674407958984, + "step": 5954 + }, + { + "epoch": 0.186125, + "grad_norm": 3.625, + "grad_norm_var": 0.12379150390625, + "learning_rate": 0.0001, + "loss": 6.1097, + "loss/crossentropy": 2.495076298713684, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.20013613998889923, + "step": 5956 + }, + { + "epoch": 0.1861875, + "grad_norm": 3.421875, + "grad_norm_var": 0.12910868326822916, + "learning_rate": 0.0001, + "loss": 6.1547, + "loss/crossentropy": 2.533836007118225, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19919445365667343, + "step": 5958 + }, + { + "epoch": 0.18625, + "grad_norm": 3.5, + "grad_norm_var": 0.10904032389322917, + "learning_rate": 0.0001, + "loss": 6.2077, + "loss/crossentropy": 2.6468350887298584, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19710253924131393, + "step": 5960 + }, + { + "epoch": 0.1863125, + "grad_norm": 5.8125, + "grad_norm_var": 0.39912821451822916, + "learning_rate": 0.0001, + "loss": 6.6428, + "loss/crossentropy": 2.8047144412994385, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.21544451266527176, + "step": 5962 + }, + { + "epoch": 0.186375, + "grad_norm": 3.453125, + "grad_norm_var": 0.38775634765625, + "learning_rate": 0.0001, + "loss": 5.9039, + "loss/crossentropy": 2.4518396854400635, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1866111382842064, + "step": 5964 + }, + { + "epoch": 0.1864375, + "grad_norm": 3.703125, + "grad_norm_var": 0.4022125244140625, + "learning_rate": 0.0001, + "loss": 6.3249, + "loss/crossentropy": 2.555612087249756, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.20856502652168274, + "step": 5966 + }, + { + "epoch": 0.1865, + "grad_norm": 3.4375, + "grad_norm_var": 0.41238606770833336, + "learning_rate": 0.0001, + "loss": 6.0578, + "loss/crossentropy": 2.496393322944641, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.19246415048837662, + "step": 5968 + }, + { + "epoch": 0.1865625, + "grad_norm": 3.625, + "grad_norm_var": 0.3786417643229167, + "learning_rate": 0.0001, + "loss": 5.716, + "loss/crossentropy": 2.2444005012512207, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.1823199763894081, + "step": 5970 + }, + { + "epoch": 0.186625, + "grad_norm": 3.921875, + "grad_norm_var": 0.3862050374348958, + "learning_rate": 0.0001, + "loss": 6.1808, + "loss/crossentropy": 2.5879788398742676, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.19756118953227997, + "step": 5972 + }, + { + "epoch": 0.1866875, + "grad_norm": 3.515625, + "grad_norm_var": 0.38194071451822914, + "learning_rate": 0.0001, + "loss": 6.16, + "loss/crossentropy": 2.476559638977051, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.20388895273208618, + "step": 5974 + }, + { + "epoch": 0.18675, + "grad_norm": 3.703125, + "grad_norm_var": 0.380712890625, + "learning_rate": 0.0001, + "loss": 6.3285, + "loss/crossentropy": 2.6764557361602783, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.20426716655492783, + "step": 5976 + }, + { + "epoch": 0.1868125, + "grad_norm": 3.78125, + "grad_norm_var": 0.06890869140625, + "learning_rate": 0.0001, + "loss": 6.1595, + "loss/crossentropy": 2.5331965684890747, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.20560278743505478, + "step": 5978 + }, + { + "epoch": 0.186875, + "grad_norm": 3.71875, + "grad_norm_var": 0.061620076497395836, + "learning_rate": 0.0001, + "loss": 6.2729, + "loss/crossentropy": 2.617649555206299, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.20341891050338745, + "step": 5980 + }, + { + "epoch": 0.1869375, + "grad_norm": 3.53125, + "grad_norm_var": 0.031151326497395833, + "learning_rate": 0.0001, + "loss": 6.3184, + "loss/crossentropy": 2.657251715660095, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.20166247338056564, + "step": 5982 + }, + { + "epoch": 0.187, + "grad_norm": 14.6875, + "grad_norm_var": 7.671647135416666, + "learning_rate": 0.0001, + "loss": 6.4626, + "loss/crossentropy": 2.511157751083374, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.22912870347499847, + "step": 5984 + }, + { + "epoch": 0.1870625, + "grad_norm": 3.671875, + "grad_norm_var": 7.643797810872396, + "learning_rate": 0.0001, + "loss": 6.1287, + "loss/crossentropy": 2.533773183822632, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.19543274492025375, + "step": 5986 + }, + { + "epoch": 0.187125, + "grad_norm": 3.75, + "grad_norm_var": 7.625886027018229, + "learning_rate": 0.0001, + "loss": 6.2694, + "loss/crossentropy": 2.6192766427993774, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.20055712759494781, + "step": 5988 + }, + { + "epoch": 0.1871875, + "grad_norm": 3.5625, + "grad_norm_var": 7.604227701822917, + "learning_rate": 0.0001, + "loss": 6.4918, + "loss/crossentropy": 2.6744190454483032, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.213771253824234, + "step": 5990 + }, + { + "epoch": 0.18725, + "grad_norm": 3.78125, + "grad_norm_var": 7.5575103759765625, + "learning_rate": 0.0001, + "loss": 6.3335, + "loss/crossentropy": 2.694438338279724, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.19984393566846848, + "step": 5992 + }, + { + "epoch": 0.1873125, + "grad_norm": 3.671875, + "grad_norm_var": 7.572516886393229, + "learning_rate": 0.0001, + "loss": 6.2238, + "loss/crossentropy": 2.52706241607666, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.20600418001413345, + "step": 5994 + }, + { + "epoch": 0.187375, + "grad_norm": 3.515625, + "grad_norm_var": 7.656012980143229, + "learning_rate": 0.0001, + "loss": 5.9532, + "loss/crossentropy": 2.426245331764221, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19214607775211334, + "step": 5996 + }, + { + "epoch": 0.1874375, + "grad_norm": 3.796875, + "grad_norm_var": 7.616178385416666, + "learning_rate": 0.0001, + "loss": 6.4061, + "loss/crossentropy": 2.657272458076477, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20925995707511902, + "step": 5998 + }, + { + "epoch": 0.1875, + "grad_norm": 3.5, + "grad_norm_var": 0.03191731770833333, + "learning_rate": 0.0001, + "loss": 6.4515, + "loss/crossentropy": 2.7046492099761963, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.21218468993902206, + "step": 6000 + }, + { + "epoch": 0.1875625, + "grad_norm": 3.328125, + "grad_norm_var": 0.034521484375, + "learning_rate": 0.0001, + "loss": 6.2953, + "loss/crossentropy": 2.7289458513259888, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.1945292055606842, + "step": 6002 + }, + { + "epoch": 0.187625, + "grad_norm": 3.90625, + "grad_norm_var": 0.03873291015625, + "learning_rate": 0.0001, + "loss": 6.2382, + "loss/crossentropy": 2.5910305976867676, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.20260342955589294, + "step": 6004 + }, + { + "epoch": 0.1876875, + "grad_norm": 3.703125, + "grad_norm_var": 0.034407552083333334, + "learning_rate": 0.0001, + "loss": 6.4952, + "loss/crossentropy": 2.707619071006775, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.21039412915706635, + "step": 6006 + }, + { + "epoch": 0.18775, + "grad_norm": 3.734375, + "grad_norm_var": 0.038863118489583334, + "learning_rate": 0.0001, + "loss": 6.1359, + "loss/crossentropy": 2.426607131958008, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.20881590247154236, + "step": 6008 + }, + { + "epoch": 0.1878125, + "grad_norm": 3.703125, + "grad_norm_var": 0.03825581868489583, + "learning_rate": 0.0001, + "loss": 6.6067, + "loss/crossentropy": 2.7720266580581665, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.2154972180724144, + "step": 6010 + }, + { + "epoch": 0.187875, + "grad_norm": 3.84375, + "grad_norm_var": 0.03859761555989583, + "learning_rate": 0.0001, + "loss": 6.7914, + "loss/crossentropy": 2.8925548791885376, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.2219136357307434, + "step": 6012 + }, + { + "epoch": 0.1879375, + "grad_norm": 3.453125, + "grad_norm_var": 0.04293212890625, + "learning_rate": 0.0001, + "loss": 6.502, + "loss/crossentropy": 2.8628028631210327, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.2010301947593689, + "step": 6014 + }, + { + "epoch": 0.188, + "grad_norm": 3.359375, + "grad_norm_var": 0.05241597493489583, + "learning_rate": 0.0001, + "loss": 6.1368, + "loss/crossentropy": 2.570828080177307, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19839782267808914, + "step": 6016 + }, + { + "epoch": 0.1880625, + "grad_norm": 3.5, + "grad_norm_var": 0.04946187337239583, + "learning_rate": 0.0001, + "loss": 6.0992, + "loss/crossentropy": 2.5494972467422485, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19520193338394165, + "step": 6018 + }, + { + "epoch": 0.188125, + "grad_norm": 3.703125, + "grad_norm_var": 0.04431864420572917, + "learning_rate": 0.0001, + "loss": 6.274, + "loss/crossentropy": 2.5650126934051514, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.20722636580467224, + "step": 6020 + }, + { + "epoch": 0.1881875, + "grad_norm": 4.0625, + "grad_norm_var": 0.054076131184895834, + "learning_rate": 0.0001, + "loss": 6.8115, + "loss/crossentropy": 2.8891549110412598, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.2234850898385048, + "step": 6022 + }, + { + "epoch": 0.18825, + "grad_norm": 3.4375, + "grad_norm_var": 0.06729227701822917, + "learning_rate": 0.0001, + "loss": 5.7411, + "loss/crossentropy": 2.296898603439331, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.1795775294303894, + "step": 6024 + }, + { + "epoch": 0.1883125, + "grad_norm": 3.484375, + "grad_norm_var": 0.06712137858072917, + "learning_rate": 0.0001, + "loss": 6.3372, + "loss/crossentropy": 2.705489993095398, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.2037922739982605, + "step": 6026 + }, + { + "epoch": 0.188375, + "grad_norm": 3.5625, + "grad_norm_var": 0.04289449055989583, + "learning_rate": 0.0001, + "loss": 6.5295, + "loss/crossentropy": 2.7792210578918457, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.20705441385507584, + "step": 6028 + }, + { + "epoch": 0.1884375, + "grad_norm": 3.5625, + "grad_norm_var": 0.049738566080729164, + "learning_rate": 0.0001, + "loss": 6.4451, + "loss/crossentropy": 2.713812470436096, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.2071167230606079, + "step": 6030 + }, + { + "epoch": 0.1885, + "grad_norm": 3.578125, + "grad_norm_var": 0.048493448893229166, + "learning_rate": 0.0001, + "loss": 6.602, + "loss/crossentropy": 2.7990516424179077, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.21662269532680511, + "step": 6032 + }, + { + "epoch": 0.1885625, + "grad_norm": 3.546875, + "grad_norm_var": 0.046117146809895836, + "learning_rate": 0.0001, + "loss": 6.1163, + "loss/crossentropy": 2.664048671722412, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.18624288588762283, + "step": 6034 + }, + { + "epoch": 0.188625, + "grad_norm": 6.375, + "grad_norm_var": 0.5297688802083333, + "learning_rate": 0.0001, + "loss": 6.7929, + "loss/crossentropy": 2.8639910221099854, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.22219152748584747, + "step": 6036 + }, + { + "epoch": 0.1886875, + "grad_norm": 3.6875, + "grad_norm_var": 0.5204823811848959, + "learning_rate": 0.0001, + "loss": 6.279, + "loss/crossentropy": 2.6860432624816895, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.20031289756298065, + "step": 6038 + }, + { + "epoch": 0.18875, + "grad_norm": 3.421875, + "grad_norm_var": 0.49435221354166664, + "learning_rate": 0.0001, + "loss": 6.1542, + "loss/crossentropy": 2.5659446716308594, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.19710734486579895, + "step": 6040 + }, + { + "epoch": 0.1888125, + "grad_norm": 3.703125, + "grad_norm_var": 0.48899637858072914, + "learning_rate": 0.0001, + "loss": 6.3601, + "loss/crossentropy": 2.688350558280945, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.20194531977176666, + "step": 6042 + }, + { + "epoch": 0.188875, + "grad_norm": 3.578125, + "grad_norm_var": 0.49302978515625, + "learning_rate": 0.0001, + "loss": 6.3056, + "loss/crossentropy": 2.6706860065460205, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.204509399831295, + "step": 6044 + }, + { + "epoch": 0.1889375, + "grad_norm": 3.734375, + "grad_norm_var": 0.48799540201822916, + "learning_rate": 0.0001, + "loss": 6.5712, + "loss/crossentropy": 2.746452808380127, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.21528411656618118, + "step": 6046 + }, + { + "epoch": 0.189, + "grad_norm": 4.375, + "grad_norm_var": 0.5116373697916666, + "learning_rate": 0.0001, + "loss": 6.3722, + "loss/crossentropy": 2.6199615001678467, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.21076759696006775, + "step": 6048 + }, + { + "epoch": 0.1890625, + "grad_norm": 3.5625, + "grad_norm_var": 0.5173248291015625, + "learning_rate": 0.0001, + "loss": 5.7291, + "loss/crossentropy": 2.251525402069092, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.18135468661785126, + "step": 6050 + }, + { + "epoch": 0.189125, + "grad_norm": 3.5, + "grad_norm_var": 0.05324605305989583, + "learning_rate": 0.0001, + "loss": 6.3893, + "loss/crossentropy": 2.6776301860809326, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.2086646556854248, + "step": 6052 + }, + { + "epoch": 0.1891875, + "grad_norm": 4.8125, + "grad_norm_var": 0.18164774576822917, + "learning_rate": 0.0001, + "loss": 6.5803, + "loss/crossentropy": 2.852060914039612, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.2040734738111496, + "step": 6054 + }, + { + "epoch": 0.18925, + "grad_norm": 3.625, + "grad_norm_var": 0.17443033854166667, + "learning_rate": 0.0001, + "loss": 6.489, + "loss/crossentropy": 2.769283890724182, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.2090795561671257, + "step": 6056 + }, + { + "epoch": 0.1893125, + "grad_norm": 3.421875, + "grad_norm_var": 0.17888895670572916, + "learning_rate": 0.0001, + "loss": 5.935, + "loss/crossentropy": 2.397734045982361, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.1927870362997055, + "step": 6058 + }, + { + "epoch": 0.189375, + "grad_norm": 3.84375, + "grad_norm_var": 0.1787261962890625, + "learning_rate": 0.0001, + "loss": 6.323, + "loss/crossentropy": 2.716283082962036, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.19816960394382477, + "step": 6060 + }, + { + "epoch": 0.1894375, + "grad_norm": 3.46875, + "grad_norm_var": 0.1854888916015625, + "learning_rate": 0.0001, + "loss": 6.3036, + "loss/crossentropy": 2.6464054584503174, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20244264602661133, + "step": 6062 + }, + { + "epoch": 0.1895, + "grad_norm": 3.5625, + "grad_norm_var": 0.16020406087239583, + "learning_rate": 0.0001, + "loss": 6.1906, + "loss/crossentropy": 2.612483024597168, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.1976574957370758, + "step": 6064 + }, + { + "epoch": 0.1895625, + "grad_norm": 3.703125, + "grad_norm_var": 0.1543121337890625, + "learning_rate": 0.0001, + "loss": 6.1622, + "loss/crossentropy": 2.5031535625457764, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.20418957620859146, + "step": 6066 + }, + { + "epoch": 0.189625, + "grad_norm": 3.96875, + "grad_norm_var": 0.1566802978515625, + "learning_rate": 0.0001, + "loss": 6.4214, + "loss/crossentropy": 2.6766852140426636, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.20415638387203217, + "step": 6068 + }, + { + "epoch": 0.1896875, + "grad_norm": 3.546875, + "grad_norm_var": 0.026349894205729165, + "learning_rate": 0.0001, + "loss": 6.3106, + "loss/crossentropy": 2.7238508462905884, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19851642102003098, + "step": 6070 + }, + { + "epoch": 0.18975, + "grad_norm": 4.0, + "grad_norm_var": 0.038232421875, + "learning_rate": 0.0001, + "loss": 6.3525, + "loss/crossentropy": 2.5249571800231934, + "loss/hidden": 1.71875, + "loss/jsd": 0.0, + "loss/logits": 0.2108766883611679, + "step": 6072 + }, + { + "epoch": 0.1898125, + "grad_norm": 4.09375, + "grad_norm_var": 0.04902242024739583, + "learning_rate": 0.0001, + "loss": 6.2244, + "loss/crossentropy": 2.585625648498535, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20138001441955566, + "step": 6074 + }, + { + "epoch": 0.189875, + "grad_norm": 3.6875, + "grad_norm_var": 0.043488566080729166, + "learning_rate": 0.0001, + "loss": 6.2864, + "loss/crossentropy": 2.6281604766845703, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20293108373880386, + "step": 6076 + }, + { + "epoch": 0.1899375, + "grad_norm": 4.375, + "grad_norm_var": 0.0712554931640625, + "learning_rate": 0.0001, + "loss": 6.1245, + "loss/crossentropy": 2.405423879623413, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.20823708176612854, + "step": 6078 + }, + { + "epoch": 0.19, + "grad_norm": 3.5, + "grad_norm_var": 0.07091471354166666, + "learning_rate": 0.0001, + "loss": 5.8131, + "loss/crossentropy": 2.343820095062256, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18950790911912918, + "step": 6080 + }, + { + "epoch": 0.1900625, + "grad_norm": 3.734375, + "grad_norm_var": 0.06970926920572916, + "learning_rate": 0.0001, + "loss": 6.6922, + "loss/crossentropy": 2.852641463279724, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.2148171365261078, + "step": 6082 + }, + { + "epoch": 0.190125, + "grad_norm": 3.796875, + "grad_norm_var": 0.06372782389322916, + "learning_rate": 0.0001, + "loss": 6.4927, + "loss/crossentropy": 2.714944005012512, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.21137383580207825, + "step": 6084 + }, + { + "epoch": 0.1901875, + "grad_norm": 3.765625, + "grad_norm_var": 0.05829671223958333, + "learning_rate": 0.0001, + "loss": 5.8325, + "loss/crossentropy": 2.2950823307037354, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.1877269148826599, + "step": 6086 + }, + { + "epoch": 0.19025, + "grad_norm": 3.578125, + "grad_norm_var": 0.06189676920572917, + "learning_rate": 0.0001, + "loss": 6.3545, + "loss/crossentropy": 2.7253220081329346, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.19611873477697372, + "step": 6088 + }, + { + "epoch": 0.1903125, + "grad_norm": 4.125, + "grad_norm_var": 0.0606109619140625, + "learning_rate": 0.0001, + "loss": 6.5246, + "loss/crossentropy": 2.7045950889587402, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.21950286626815796, + "step": 6090 + }, + { + "epoch": 0.190375, + "grad_norm": 4.03125, + "grad_norm_var": 0.06428934733072916, + "learning_rate": 0.0001, + "loss": 6.361, + "loss/crossentropy": 2.581110715866089, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.20885251462459564, + "step": 6092 + }, + { + "epoch": 0.1904375, + "grad_norm": 3.53125, + "grad_norm_var": 0.03723856608072917, + "learning_rate": 0.0001, + "loss": 6.28, + "loss/crossentropy": 2.647969961166382, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20030918717384338, + "step": 6094 + }, + { + "epoch": 0.1905, + "grad_norm": 3.828125, + "grad_norm_var": 0.03548075358072917, + "learning_rate": 0.0001, + "loss": 6.3454, + "loss/crossentropy": 2.6669753789901733, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.20261073857545853, + "step": 6096 + }, + { + "epoch": 0.1905625, + "grad_norm": 3.53125, + "grad_norm_var": 0.04225972493489583, + "learning_rate": 0.0001, + "loss": 6.2998, + "loss/crossentropy": 2.621742844581604, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.20764455944299698, + "step": 6098 + }, + { + "epoch": 0.190625, + "grad_norm": 3.59375, + "grad_norm_var": 0.04215087890625, + "learning_rate": 0.0001, + "loss": 6.0253, + "loss/crossentropy": 2.3852285146713257, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.19916538894176483, + "step": 6100 + }, + { + "epoch": 0.1906875, + "grad_norm": 3.5625, + "grad_norm_var": 0.04202473958333333, + "learning_rate": 0.0001, + "loss": 6.3239, + "loss/crossentropy": 2.6609463691711426, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20379143953323364, + "step": 6102 + }, + { + "epoch": 0.19075, + "grad_norm": 3.90625, + "grad_norm_var": 0.04533589680989583, + "learning_rate": 0.0001, + "loss": 6.1169, + "loss/crossentropy": 2.599799633026123, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.19663503021001816, + "step": 6104 + }, + { + "epoch": 0.1908125, + "grad_norm": 3.5, + "grad_norm_var": 0.031201171875, + "learning_rate": 0.0001, + "loss": 6.0963, + "loss/crossentropy": 2.561492085456848, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19332732260227203, + "step": 6106 + }, + { + "epoch": 0.190875, + "grad_norm": 3.3125, + "grad_norm_var": 0.022362263997395833, + "learning_rate": 0.0001, + "loss": 6.1243, + "loss/crossentropy": 2.561802625656128, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.19258107244968414, + "step": 6108 + }, + { + "epoch": 0.1909375, + "grad_norm": 3.46875, + "grad_norm_var": 0.04573465983072917, + "learning_rate": 0.0001, + "loss": 6.4606, + "loss/crossentropy": 2.772518277168274, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.20982690900564194, + "step": 6110 + }, + { + "epoch": 0.191, + "grad_norm": 3.78125, + "grad_norm_var": 0.05339253743489583, + "learning_rate": 0.0001, + "loss": 6.2575, + "loss/crossentropy": 2.608256459236145, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.2043812796473503, + "step": 6112 + }, + { + "epoch": 0.1910625, + "grad_norm": 3.53125, + "grad_norm_var": 0.054488118489583334, + "learning_rate": 0.0001, + "loss": 6.3426, + "loss/crossentropy": 2.754101514816284, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.2022107094526291, + "step": 6114 + }, + { + "epoch": 0.191125, + "grad_norm": 3.46875, + "grad_norm_var": 0.05672098795572917, + "learning_rate": 0.0001, + "loss": 6.064, + "loss/crossentropy": 2.5370287895202637, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.1929335743188858, + "step": 6116 + }, + { + "epoch": 0.1911875, + "grad_norm": 3.46875, + "grad_norm_var": 0.07105712890625, + "learning_rate": 0.0001, + "loss": 6.3428, + "loss/crossentropy": 2.7027775049209595, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.20228593051433563, + "step": 6118 + }, + { + "epoch": 0.19125, + "grad_norm": 4.15625, + "grad_norm_var": 0.08713277180989583, + "learning_rate": 0.0001, + "loss": 6.389, + "loss/crossentropy": 2.6226966381073, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.2102195918560028, + "step": 6120 + }, + { + "epoch": 0.1913125, + "grad_norm": 3.46875, + "grad_norm_var": 0.0928863525390625, + "learning_rate": 0.0001, + "loss": 6.2733, + "loss/crossentropy": 2.6209421157836914, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.2019570991396904, + "step": 6122 + }, + { + "epoch": 0.191375, + "grad_norm": 3.265625, + "grad_norm_var": 0.09683329264322917, + "learning_rate": 0.0001, + "loss": 6.0993, + "loss/crossentropy": 2.5557941198349, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1977071538567543, + "step": 6124 + }, + { + "epoch": 0.1914375, + "grad_norm": 4.125, + "grad_norm_var": 0.11159566243489584, + "learning_rate": 0.0001, + "loss": 6.5564, + "loss/crossentropy": 2.638274908065796, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.22267445921897888, + "step": 6126 + }, + { + "epoch": 0.1915, + "grad_norm": 3.296875, + "grad_norm_var": 0.10690816243489583, + "learning_rate": 0.0001, + "loss": 5.9419, + "loss/crossentropy": 2.3865491151809692, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.1918674185872078, + "step": 6128 + }, + { + "epoch": 0.1915625, + "grad_norm": 3.890625, + "grad_norm_var": 0.10657145182291666, + "learning_rate": 0.0001, + "loss": 6.2456, + "loss/crossentropy": 2.4741748571395874, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.2115134298801422, + "step": 6130 + }, + { + "epoch": 0.191625, + "grad_norm": 3.515625, + "grad_norm_var": 0.0994140625, + "learning_rate": 0.0001, + "loss": 6.3467, + "loss/crossentropy": 2.683540105819702, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.203819178044796, + "step": 6132 + }, + { + "epoch": 0.1916875, + "grad_norm": 3.546875, + "grad_norm_var": 0.10526936848958333, + "learning_rate": 0.0001, + "loss": 6.2246, + "loss/crossentropy": 2.635378360748291, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.1968080848455429, + "step": 6134 + }, + { + "epoch": 0.19175, + "grad_norm": 3.40625, + "grad_norm_var": 0.08983968098958334, + "learning_rate": 0.0001, + "loss": 5.9543, + "loss/crossentropy": 2.4498454332351685, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19184674322605133, + "step": 6136 + }, + { + "epoch": 0.1918125, + "grad_norm": 4.0, + "grad_norm_var": 0.09337565104166666, + "learning_rate": 0.0001, + "loss": 6.1546, + "loss/crossentropy": 2.5229196548461914, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.2030087485909462, + "step": 6138 + }, + { + "epoch": 0.191875, + "grad_norm": 4.71875, + "grad_norm_var": 0.14919331868489583, + "learning_rate": 0.0001, + "loss": 6.5918, + "loss/crossentropy": 2.786103367805481, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.21572455763816833, + "step": 6140 + }, + { + "epoch": 0.1919375, + "grad_norm": 4.4375, + "grad_norm_var": 0.1832427978515625, + "learning_rate": 0.0001, + "loss": 6.7382, + "loss/crossentropy": 2.689463257789612, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.23808030039072037, + "step": 6142 + }, + { + "epoch": 0.192, + "grad_norm": 3.390625, + "grad_norm_var": 0.18919169108072917, + "learning_rate": 0.0001, + "loss": 5.9059, + "loss/crossentropy": 2.4961681365966797, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18550093472003937, + "step": 6144 + }, + { + "epoch": 0.1920625, + "grad_norm": 3.71875, + "grad_norm_var": 0.20533854166666668, + "learning_rate": 0.0001, + "loss": 5.8333, + "loss/crossentropy": 2.3912068605422974, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18639901280403137, + "step": 6146 + }, + { + "epoch": 0.192125, + "grad_norm": 3.21875, + "grad_norm_var": 0.21966044108072916, + "learning_rate": 0.0001, + "loss": 6.0744, + "loss/crossentropy": 2.5090500116348267, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19559796899557114, + "step": 6148 + }, + { + "epoch": 0.1921875, + "grad_norm": 3.65625, + "grad_norm_var": 0.20067952473958334, + "learning_rate": 0.0001, + "loss": 6.3968, + "loss/crossentropy": 2.7094013690948486, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20545856654644012, + "step": 6150 + }, + { + "epoch": 0.19225, + "grad_norm": 4.1875, + "grad_norm_var": 0.1937164306640625, + "learning_rate": 0.0001, + "loss": 6.2988, + "loss/crossentropy": 2.616774797439575, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.20609114319086075, + "step": 6152 + }, + { + "epoch": 0.1923125, + "grad_norm": 3.21875, + "grad_norm_var": 0.214013671875, + "learning_rate": 0.0001, + "loss": 6.1391, + "loss/crossentropy": 2.506002187728882, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.20237066596746445, + "step": 6154 + }, + { + "epoch": 0.192375, + "grad_norm": 5.25, + "grad_norm_var": 0.3080963134765625, + "learning_rate": 0.0001, + "loss": 6.3327, + "loss/crossentropy": 2.64454448223114, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.20358256250619888, + "step": 6156 + }, + { + "epoch": 0.1924375, + "grad_norm": 3.796875, + "grad_norm_var": 0.25627339680989586, + "learning_rate": 0.0001, + "loss": 6.125, + "loss/crossentropy": 2.4693968296051025, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.2038370817899704, + "step": 6158 + }, + { + "epoch": 0.1925, + "grad_norm": 3.859375, + "grad_norm_var": 0.23990478515625, + "learning_rate": 0.0001, + "loss": 6.5249, + "loss/crossentropy": 2.726282238960266, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.21697460114955902, + "step": 6160 + }, + { + "epoch": 0.1925625, + "grad_norm": 3.578125, + "grad_norm_var": 0.22180989583333333, + "learning_rate": 0.0001, + "loss": 6.0866, + "loss/crossentropy": 2.3971667289733887, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.20370658487081528, + "step": 6162 + }, + { + "epoch": 0.192625, + "grad_norm": 4.0, + "grad_norm_var": 0.19635009765625, + "learning_rate": 0.0001, + "loss": 6.451, + "loss/crossentropy": 2.5855096578598022, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.21897225081920624, + "step": 6164 + }, + { + "epoch": 0.1926875, + "grad_norm": 3.875, + "grad_norm_var": 0.19466044108072916, + "learning_rate": 0.0001, + "loss": 6.4751, + "loss/crossentropy": 2.7665354013442993, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.20328181236982346, + "step": 6166 + }, + { + "epoch": 0.19275, + "grad_norm": 3.328125, + "grad_norm_var": 0.20995992024739582, + "learning_rate": 0.0001, + "loss": 6.134, + "loss/crossentropy": 2.5601810216903687, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.19292522966861725, + "step": 6168 + }, + { + "epoch": 0.1928125, + "grad_norm": 3.84375, + "grad_norm_var": 0.20896809895833332, + "learning_rate": 0.0001, + "loss": 6.3857, + "loss/crossentropy": 2.703435182571411, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20494025945663452, + "step": 6170 + }, + { + "epoch": 0.192875, + "grad_norm": 4.3125, + "grad_norm_var": 0.0761871337890625, + "learning_rate": 0.0001, + "loss": 6.5079, + "loss/crossentropy": 2.6246687173843384, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.2199653834104538, + "step": 6172 + }, + { + "epoch": 0.1929375, + "grad_norm": 3.671875, + "grad_norm_var": 0.07942301432291667, + "learning_rate": 0.0001, + "loss": 5.8911, + "loss/crossentropy": 2.304368019104004, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19813121110200882, + "step": 6174 + }, + { + "epoch": 0.193, + "grad_norm": 4.09375, + "grad_norm_var": 0.09829813639322917, + "learning_rate": 0.0001, + "loss": 5.9767, + "loss/crossentropy": 2.4628864526748657, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.1896635890007019, + "step": 6176 + }, + { + "epoch": 0.1930625, + "grad_norm": 3.84375, + "grad_norm_var": 0.10165608723958333, + "learning_rate": 0.0001, + "loss": 6.2344, + "loss/crossentropy": 2.622955799102783, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.198647640645504, + "step": 6178 + }, + { + "epoch": 0.193125, + "grad_norm": 3.25, + "grad_norm_var": 0.10637919108072917, + "learning_rate": 0.0001, + "loss": 6.2863, + "loss/crossentropy": 2.7121388912200928, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19725629687309265, + "step": 6180 + }, + { + "epoch": 0.1931875, + "grad_norm": 3.6875, + "grad_norm_var": 0.10224507649739584, + "learning_rate": 0.0001, + "loss": 5.8827, + "loss/crossentropy": 2.3777220249176025, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.18956278264522552, + "step": 6182 + }, + { + "epoch": 0.19325, + "grad_norm": 3.640625, + "grad_norm_var": 0.104443359375, + "learning_rate": 0.0001, + "loss": 6.2903, + "loss/crossentropy": 2.6881426572799683, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19927627593278885, + "step": 6184 + }, + { + "epoch": 0.1933125, + "grad_norm": 3.890625, + "grad_norm_var": 0.09348856608072917, + "learning_rate": 0.0001, + "loss": 6.3074, + "loss/crossentropy": 2.5851895809173584, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.20620650053024292, + "step": 6186 + }, + { + "epoch": 0.193375, + "grad_norm": 3.609375, + "grad_norm_var": 0.057291666666666664, + "learning_rate": 0.0001, + "loss": 6.0688, + "loss/crossentropy": 2.531674385070801, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.1935586929321289, + "step": 6188 + }, + { + "epoch": 0.1934375, + "grad_norm": 3.671875, + "grad_norm_var": 0.0791015625, + "learning_rate": 0.0001, + "loss": 6.2447, + "loss/crossentropy": 2.599445343017578, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.20085646957159042, + "step": 6190 + }, + { + "epoch": 0.1935, + "grad_norm": 3.609375, + "grad_norm_var": 0.05747782389322917, + "learning_rate": 0.0001, + "loss": 6.2174, + "loss/crossentropy": 2.591778039932251, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.20044782757759094, + "step": 6192 + }, + { + "epoch": 0.1935625, + "grad_norm": 3.453125, + "grad_norm_var": 0.05000712076822917, + "learning_rate": 0.0001, + "loss": 5.9738, + "loss/crossentropy": 2.4809694290161133, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19146671891212463, + "step": 6194 + }, + { + "epoch": 0.193625, + "grad_norm": 3.4375, + "grad_norm_var": 0.04241129557291667, + "learning_rate": 0.0001, + "loss": 5.8573, + "loss/crossentropy": 2.316272735595703, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.1896507441997528, + "step": 6196 + }, + { + "epoch": 0.1936875, + "grad_norm": 3.5, + "grad_norm_var": 0.04509175618489583, + "learning_rate": 0.0001, + "loss": 5.8686, + "loss/crossentropy": 2.430908203125, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1859612688422203, + "step": 6198 + }, + { + "epoch": 0.19375, + "grad_norm": 3.359375, + "grad_norm_var": 0.04146219889322917, + "learning_rate": 0.0001, + "loss": 6.1377, + "loss/crossentropy": 2.570114254951477, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.1985529363155365, + "step": 6200 + }, + { + "epoch": 0.1938125, + "grad_norm": 3.59375, + "grad_norm_var": 0.03762613932291667, + "learning_rate": 0.0001, + "loss": 6.412, + "loss/crossentropy": 2.7270954847335815, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.20169344544410706, + "step": 6202 + }, + { + "epoch": 0.193875, + "grad_norm": 3.703125, + "grad_norm_var": 0.03767801920572917, + "learning_rate": 0.0001, + "loss": 5.8985, + "loss/crossentropy": 2.3919687271118164, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1920596957206726, + "step": 6204 + }, + { + "epoch": 0.1939375, + "grad_norm": 3.796875, + "grad_norm_var": 0.017699178059895834, + "learning_rate": 0.0001, + "loss": 6.5405, + "loss/crossentropy": 2.784037232398987, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.20846372842788696, + "step": 6206 + }, + { + "epoch": 0.194, + "grad_norm": 3.671875, + "grad_norm_var": 0.018387858072916666, + "learning_rate": 0.0001, + "loss": 5.9835, + "loss/crossentropy": 2.3724876642227173, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.2005559653043747, + "step": 6208 + }, + { + "epoch": 0.1940625, + "grad_norm": 3.515625, + "grad_norm_var": 0.03406473795572917, + "learning_rate": 0.0001, + "loss": 6.2921, + "loss/crossentropy": 2.662288188934326, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.19813786447048187, + "step": 6210 + }, + { + "epoch": 0.194125, + "grad_norm": 3.921875, + "grad_norm_var": 0.03808186848958333, + "learning_rate": 0.0001, + "loss": 6.4928, + "loss/crossentropy": 2.73208487033844, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.21122856438159943, + "step": 6212 + }, + { + "epoch": 0.1941875, + "grad_norm": 3.328125, + "grad_norm_var": 0.0408843994140625, + "learning_rate": 0.0001, + "loss": 5.8068, + "loss/crossentropy": 2.2671496868133545, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19107583165168762, + "step": 6214 + }, + { + "epoch": 0.19425, + "grad_norm": 4.8125, + "grad_norm_var": 0.1188140869140625, + "learning_rate": 0.0001, + "loss": 6.0706, + "loss/crossentropy": 2.461466670036316, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.19880161434412003, + "step": 6216 + }, + { + "epoch": 0.1943125, + "grad_norm": 3.71875, + "grad_norm_var": 0.11562093098958333, + "learning_rate": 0.0001, + "loss": 6.2875, + "loss/crossentropy": 2.588753581047058, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20659077912569046, + "step": 6218 + }, + { + "epoch": 0.194375, + "grad_norm": 4.09375, + "grad_norm_var": 0.12668863932291666, + "learning_rate": 0.0001, + "loss": 6.0093, + "loss/crossentropy": 2.498833656311035, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19205859303474426, + "step": 6220 + }, + { + "epoch": 0.1944375, + "grad_norm": 3.796875, + "grad_norm_var": 0.13118082682291668, + "learning_rate": 0.0001, + "loss": 5.934, + "loss/crossentropy": 2.3816803693771362, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.1927303969860077, + "step": 6222 + }, + { + "epoch": 0.1945, + "grad_norm": 3.59375, + "grad_norm_var": 0.13492431640625, + "learning_rate": 0.0001, + "loss": 6.3904, + "loss/crossentropy": 2.7171014547348022, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.2071734443306923, + "step": 6224 + }, + { + "epoch": 0.1945625, + "grad_norm": 3.484375, + "grad_norm_var": 0.13240559895833334, + "learning_rate": 0.0001, + "loss": 6.0352, + "loss/crossentropy": 2.5309075117111206, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.18909665942192078, + "step": 6226 + }, + { + "epoch": 0.194625, + "grad_norm": 4.21875, + "grad_norm_var": 0.18277994791666666, + "learning_rate": 0.0001, + "loss": 6.4397, + "loss/crossentropy": 2.879347324371338, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.19080226868391037, + "step": 6228 + }, + { + "epoch": 0.1946875, + "grad_norm": 3.53125, + "grad_norm_var": 0.17288004557291667, + "learning_rate": 0.0001, + "loss": 6.0098, + "loss/crossentropy": 2.470539927482605, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19103093445301056, + "step": 6230 + }, + { + "epoch": 0.19475, + "grad_norm": 3.40625, + "grad_norm_var": 0.09817301432291667, + "learning_rate": 0.0001, + "loss": 6.6131, + "loss/crossentropy": 2.857435464859009, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.21111512184143066, + "step": 6232 + }, + { + "epoch": 0.1948125, + "grad_norm": 3.90625, + "grad_norm_var": 0.11474507649739583, + "learning_rate": 0.0001, + "loss": 6.1691, + "loss/crossentropy": 2.5498597621917725, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.20059340447187424, + "step": 6234 + }, + { + "epoch": 0.194875, + "grad_norm": 3.390625, + "grad_norm_var": 0.11454671223958333, + "learning_rate": 0.0001, + "loss": 5.9912, + "loss/crossentropy": 2.5641114711761475, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1864565759897232, + "step": 6236 + }, + { + "epoch": 0.1949375, + "grad_norm": 3.421875, + "grad_norm_var": 0.1144927978515625, + "learning_rate": 0.0001, + "loss": 6.1785, + "loss/crossentropy": 2.6006263494491577, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19684504717588425, + "step": 6238 + }, + { + "epoch": 0.195, + "grad_norm": 3.65625, + "grad_norm_var": 0.11428629557291667, + "learning_rate": 0.0001, + "loss": 6.4466, + "loss/crossentropy": 2.6832586526870728, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.21032319962978363, + "step": 6240 + }, + { + "epoch": 0.1950625, + "grad_norm": 3.765625, + "grad_norm_var": 0.10955301920572917, + "learning_rate": 0.0001, + "loss": 6.2545, + "loss/crossentropy": 2.583243250846863, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20423732697963715, + "step": 6242 + }, + { + "epoch": 0.195125, + "grad_norm": 4.15625, + "grad_norm_var": 0.05838216145833333, + "learning_rate": 0.0001, + "loss": 6.3258, + "loss/crossentropy": 2.6092876195907593, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.2032882571220398, + "step": 6244 + }, + { + "epoch": 0.1951875, + "grad_norm": 3.40625, + "grad_norm_var": 0.06337483723958333, + "learning_rate": 0.0001, + "loss": 6.0453, + "loss/crossentropy": 2.4458929300308228, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.19626961648464203, + "step": 6246 + }, + { + "epoch": 0.19525, + "grad_norm": 3.40625, + "grad_norm_var": 0.0630523681640625, + "learning_rate": 0.0001, + "loss": 6.0131, + "loss/crossentropy": 2.524987578392029, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1898244246840477, + "step": 6248 + }, + { + "epoch": 0.1953125, + "grad_norm": 3.671875, + "grad_norm_var": 0.04722900390625, + "learning_rate": 0.0001, + "loss": 6.153, + "loss/crossentropy": 2.53132426738739, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.2008429765701294, + "step": 6250 + }, + { + "epoch": 0.195375, + "grad_norm": 4.03125, + "grad_norm_var": 0.0478424072265625, + "learning_rate": 0.0001, + "loss": 6.2777, + "loss/crossentropy": 2.6375958919525146, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.2046319842338562, + "step": 6252 + }, + { + "epoch": 0.1954375, + "grad_norm": 3.5625, + "grad_norm_var": 0.04595947265625, + "learning_rate": 0.0001, + "loss": 6.8056, + "loss/crossentropy": 3.0053709745407104, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.2128308191895485, + "step": 6254 + }, + { + "epoch": 0.1955, + "grad_norm": 3.71875, + "grad_norm_var": 0.04680989583333333, + "learning_rate": 0.0001, + "loss": 6.1446, + "loss/crossentropy": 2.5195276737213135, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20000825077295303, + "step": 6256 + }, + { + "epoch": 0.1955625, + "grad_norm": 3.484375, + "grad_norm_var": 0.04716695149739583, + "learning_rate": 0.0001, + "loss": 6.2884, + "loss/crossentropy": 2.652225613594055, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.2030748426914215, + "step": 6258 + }, + { + "epoch": 0.195625, + "grad_norm": 3.703125, + "grad_norm_var": 0.0280181884765625, + "learning_rate": 0.0001, + "loss": 6.3109, + "loss/crossentropy": 2.706356406211853, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.19834348559379578, + "step": 6260 + }, + { + "epoch": 0.1956875, + "grad_norm": 3.65625, + "grad_norm_var": 0.0787506103515625, + "learning_rate": 0.0001, + "loss": 6.0637, + "loss/crossentropy": 2.5506874322891235, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.18645452708005905, + "step": 6262 + }, + { + "epoch": 0.19575, + "grad_norm": 3.390625, + "grad_norm_var": 0.090283203125, + "learning_rate": 0.0001, + "loss": 6.2095, + "loss/crossentropy": 2.659890294075012, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19363542646169662, + "step": 6264 + }, + { + "epoch": 0.1958125, + "grad_norm": 3.5625, + "grad_norm_var": 0.16773681640625, + "learning_rate": 0.0001, + "loss": 6.1357, + "loss/crossentropy": 2.4744845628738403, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.19814816117286682, + "step": 6266 + }, + { + "epoch": 0.195875, + "grad_norm": 3.328125, + "grad_norm_var": 0.1708404541015625, + "learning_rate": 0.0001, + "loss": 6.2998, + "loss/crossentropy": 2.6317743062973022, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20274505764245987, + "step": 6268 + }, + { + "epoch": 0.1959375, + "grad_norm": 3.640625, + "grad_norm_var": 0.16910400390625, + "learning_rate": 0.0001, + "loss": 6.0523, + "loss/crossentropy": 2.4604722261428833, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.1931646391749382, + "step": 6270 + }, + { + "epoch": 0.196, + "grad_norm": 3.78125, + "grad_norm_var": 0.16767171223958333, + "learning_rate": 0.0001, + "loss": 6.2577, + "loss/crossentropy": 2.5092904567718506, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.20765485614538193, + "step": 6272 + }, + { + "epoch": 0.1960625, + "grad_norm": 3.625, + "grad_norm_var": 0.16330973307291666, + "learning_rate": 0.0001, + "loss": 6.2526, + "loss/crossentropy": 2.607978105545044, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.20313771069049835, + "step": 6274 + }, + { + "epoch": 0.196125, + "grad_norm": 3.265625, + "grad_norm_var": 0.19286702473958334, + "learning_rate": 0.0001, + "loss": 6.0585, + "loss/crossentropy": 2.600584626197815, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18876295536756516, + "step": 6276 + }, + { + "epoch": 0.1961875, + "grad_norm": 3.390625, + "grad_norm_var": 0.14591471354166666, + "learning_rate": 0.0001, + "loss": 6.0282, + "loss/crossentropy": 2.3694067001342773, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.19908495992422104, + "step": 6278 + }, + { + "epoch": 0.19625, + "grad_norm": 3.4375, + "grad_norm_var": 0.1331451416015625, + "learning_rate": 0.0001, + "loss": 6.2367, + "loss/crossentropy": 2.596962571144104, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.2049897238612175, + "step": 6280 + }, + { + "epoch": 0.1963125, + "grad_norm": 4.125, + "grad_norm_var": 0.06413472493489583, + "learning_rate": 0.0001, + "loss": 6.5489, + "loss/crossentropy": 2.7599679231643677, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.21326513588428497, + "step": 6282 + }, + { + "epoch": 0.196375, + "grad_norm": 3.53125, + "grad_norm_var": 0.05855712890625, + "learning_rate": 0.0001, + "loss": 6.1641, + "loss/crossentropy": 2.602334141731262, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.1979694366455078, + "step": 6284 + }, + { + "epoch": 0.1964375, + "grad_norm": 3.34375, + "grad_norm_var": 0.062409464518229166, + "learning_rate": 0.0001, + "loss": 5.9995, + "loss/crossentropy": 2.484236478805542, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.1905846670269966, + "step": 6286 + }, + { + "epoch": 0.1965, + "grad_norm": 4.0625, + "grad_norm_var": 0.06709696451822916, + "learning_rate": 0.0001, + "loss": 5.9537, + "loss/crossentropy": 2.2918028831481934, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20212748646736145, + "step": 6288 + }, + { + "epoch": 0.1965625, + "grad_norm": 3.5625, + "grad_norm_var": 0.06467997233072917, + "learning_rate": 0.0001, + "loss": 6.3358, + "loss/crossentropy": 2.6319711208343506, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.21062206476926804, + "step": 6290 + }, + { + "epoch": 0.196625, + "grad_norm": 3.375, + "grad_norm_var": 0.054703776041666666, + "learning_rate": 0.0001, + "loss": 6.0362, + "loss/crossentropy": 2.4948049783706665, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1951577365398407, + "step": 6292 + }, + { + "epoch": 0.1966875, + "grad_norm": 3.578125, + "grad_norm_var": 0.049714152018229166, + "learning_rate": 0.0001, + "loss": 6.3416, + "loss/crossentropy": 2.634096145629883, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.20981720089912415, + "step": 6294 + }, + { + "epoch": 0.19675, + "grad_norm": 3.546875, + "grad_norm_var": 0.05196024576822917, + "learning_rate": 0.0001, + "loss": 6.1901, + "loss/crossentropy": 2.591532826423645, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.19462481141090393, + "step": 6296 + }, + { + "epoch": 0.1968125, + "grad_norm": 3.578125, + "grad_norm_var": 0.10296223958333334, + "learning_rate": 0.0001, + "loss": 6.3022, + "loss/crossentropy": 2.592622399330139, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.20650336146354675, + "step": 6298 + }, + { + "epoch": 0.196875, + "grad_norm": 3.671875, + "grad_norm_var": 0.10568745930989583, + "learning_rate": 0.0001, + "loss": 6.5115, + "loss/crossentropy": 2.7239500284194946, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.2103961780667305, + "step": 6300 + }, + { + "epoch": 0.1969375, + "grad_norm": 3.46875, + "grad_norm_var": 0.09684244791666667, + "learning_rate": 0.0001, + "loss": 6.5188, + "loss/crossentropy": 2.784137725830078, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.20627786219120026, + "step": 6302 + }, + { + "epoch": 0.197, + "grad_norm": 3.640625, + "grad_norm_var": 0.09217122395833334, + "learning_rate": 0.0001, + "loss": 6.2574, + "loss/crossentropy": 2.6546131372451782, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.20090806484222412, + "step": 6304 + }, + { + "epoch": 0.1970625, + "grad_norm": 3.625, + "grad_norm_var": 0.09218648274739584, + "learning_rate": 0.0001, + "loss": 6.459, + "loss/crossentropy": 2.7477740049362183, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20822839438915253, + "step": 6306 + }, + { + "epoch": 0.197125, + "grad_norm": 3.671875, + "grad_norm_var": 0.10091044108072916, + "learning_rate": 0.0001, + "loss": 6.1576, + "loss/crossentropy": 2.6425379514694214, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19252388179302216, + "step": 6308 + }, + { + "epoch": 0.1971875, + "grad_norm": 3.5, + "grad_norm_var": 0.101220703125, + "learning_rate": 0.0001, + "loss": 6.1944, + "loss/crossentropy": 2.570142388343811, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.20227423310279846, + "step": 6310 + }, + { + "epoch": 0.19725, + "grad_norm": 3.8125, + "grad_norm_var": 59.4859364827474, + "learning_rate": 0.0001, + "loss": 7.034, + "loss/crossentropy": 2.753095507621765, + "loss/hidden": 2.0546875, + "loss/jsd": 0.0, + "loss/logits": 0.22262004762887955, + "step": 6312 + }, + { + "epoch": 0.1973125, + "grad_norm": 3.875, + "grad_norm_var": 59.61048075358073, + "learning_rate": 0.0001, + "loss": 6.3162, + "loss/crossentropy": 2.640398144721985, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.20039722323417664, + "step": 6314 + }, + { + "epoch": 0.197375, + "grad_norm": 3.765625, + "grad_norm_var": 59.516988118489586, + "learning_rate": 0.0001, + "loss": 6.306, + "loss/crossentropy": 2.629085659980774, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.207536019384861, + "step": 6316 + }, + { + "epoch": 0.1974375, + "grad_norm": 3.796875, + "grad_norm_var": 59.381900024414065, + "learning_rate": 0.0001, + "loss": 6.1296, + "loss/crossentropy": 2.5182682275772095, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.1978505775332451, + "step": 6318 + }, + { + "epoch": 0.1975, + "grad_norm": 3.90625, + "grad_norm_var": 59.28876953125, + "learning_rate": 0.0001, + "loss": 6.2356, + "loss/crossentropy": 2.5801738500595093, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20303912460803986, + "step": 6320 + }, + { + "epoch": 0.1975625, + "grad_norm": 3.765625, + "grad_norm_var": 59.175113932291666, + "learning_rate": 0.0001, + "loss": 5.987, + "loss/crossentropy": 2.393300414085388, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.19686954468488693, + "step": 6322 + }, + { + "epoch": 0.197625, + "grad_norm": 3.625, + "grad_norm_var": 59.05861002604167, + "learning_rate": 0.0001, + "loss": 6.3308, + "loss/crossentropy": 2.6749523878097534, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.20504015684127808, + "step": 6324 + }, + { + "epoch": 0.1976875, + "grad_norm": 3.734375, + "grad_norm_var": 59.07056884765625, + "learning_rate": 0.0001, + "loss": 6.1818, + "loss/crossentropy": 2.6099843978881836, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19741416722536087, + "step": 6326 + }, + { + "epoch": 0.19775, + "grad_norm": 3.703125, + "grad_norm_var": 0.048844401041666666, + "learning_rate": 0.0001, + "loss": 6.1622, + "loss/crossentropy": 2.540986657142639, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.19727765768766403, + "step": 6328 + }, + { + "epoch": 0.1978125, + "grad_norm": 3.9375, + "grad_norm_var": 0.04920145670572917, + "learning_rate": 0.0001, + "loss": 6.5252, + "loss/crossentropy": 2.7225914001464844, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.21346324682235718, + "step": 6330 + }, + { + "epoch": 0.197875, + "grad_norm": 3.5, + "grad_norm_var": 0.033447265625, + "learning_rate": 0.0001, + "loss": 6.5429, + "loss/crossentropy": 2.8141279220581055, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.21155305206775665, + "step": 6332 + }, + { + "epoch": 0.1979375, + "grad_norm": 3.640625, + "grad_norm_var": 0.025861612955729165, + "learning_rate": 0.0001, + "loss": 6.1337, + "loss/crossentropy": 2.5853216648101807, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19389677047729492, + "step": 6334 + }, + { + "epoch": 0.198, + "grad_norm": 3.5, + "grad_norm_var": 0.022614542643229166, + "learning_rate": 0.0001, + "loss": 5.9631, + "loss/crossentropy": 2.4103667736053467, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.1916046440601349, + "step": 6336 + }, + { + "epoch": 0.1980625, + "grad_norm": 3.71875, + "grad_norm_var": 0.025324503580729168, + "learning_rate": 0.0001, + "loss": 6.2243, + "loss/crossentropy": 2.612160801887512, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.20261822640895844, + "step": 6338 + }, + { + "epoch": 0.198125, + "grad_norm": 3.328125, + "grad_norm_var": 0.031103515625, + "learning_rate": 0.0001, + "loss": 6.0515, + "loss/crossentropy": 2.571484923362732, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19018612802028656, + "step": 6340 + }, + { + "epoch": 0.1981875, + "grad_norm": 3.328125, + "grad_norm_var": 0.047200520833333336, + "learning_rate": 0.0001, + "loss": 5.8307, + "loss/crossentropy": 2.4519015550613403, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18397627025842667, + "step": 6342 + }, + { + "epoch": 0.19825, + "grad_norm": 3.6875, + "grad_norm_var": 0.04722391764322917, + "learning_rate": 0.0001, + "loss": 6.2341, + "loss/crossentropy": 2.633862018585205, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.2002565562725067, + "step": 6344 + }, + { + "epoch": 0.1983125, + "grad_norm": 3.859375, + "grad_norm_var": 0.03828125, + "learning_rate": 0.0001, + "loss": 6.365, + "loss/crossentropy": 2.621291160583496, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.20913610607385635, + "step": 6346 + }, + { + "epoch": 0.198375, + "grad_norm": 3.296875, + "grad_norm_var": 0.0399810791015625, + "learning_rate": 0.0001, + "loss": 6.2709, + "loss/crossentropy": 2.7081961631774902, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19533439725637436, + "step": 6348 + }, + { + "epoch": 0.1984375, + "grad_norm": 3.484375, + "grad_norm_var": 0.0437408447265625, + "learning_rate": 0.0001, + "loss": 6.3223, + "loss/crossentropy": 2.6249715089797974, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20723501592874527, + "step": 6350 + }, + { + "epoch": 0.1985, + "grad_norm": 3.65625, + "grad_norm_var": 0.049462890625, + "learning_rate": 0.0001, + "loss": 6.4, + "loss/crossentropy": 2.7535150051116943, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.20253852754831314, + "step": 6352 + }, + { + "epoch": 0.1985625, + "grad_norm": 3.421875, + "grad_norm_var": 0.04804280598958333, + "learning_rate": 0.0001, + "loss": 6.5861, + "loss/crossentropy": 2.8564473390579224, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.21241725236177444, + "step": 6354 + }, + { + "epoch": 0.198625, + "grad_norm": 3.609375, + "grad_norm_var": 0.044676717122395834, + "learning_rate": 0.0001, + "loss": 6.5185, + "loss/crossentropy": 2.7504091262817383, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.2150934562087059, + "step": 6356 + }, + { + "epoch": 0.1986875, + "grad_norm": 3.484375, + "grad_norm_var": 0.03127339680989583, + "learning_rate": 0.0001, + "loss": 6.1064, + "loss/crossentropy": 2.5444202423095703, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.19291561096906662, + "step": 6358 + }, + { + "epoch": 0.19875, + "grad_norm": 4.75, + "grad_norm_var": 0.11840718587239583, + "learning_rate": 0.0001, + "loss": 6.6099, + "loss/crossentropy": 2.734977602958679, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.21952679008245468, + "step": 6360 + }, + { + "epoch": 0.1988125, + "grad_norm": 4.46875, + "grad_norm_var": 0.17399088541666666, + "learning_rate": 0.0001, + "loss": 6.2625, + "loss/crossentropy": 2.4923490285873413, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.21412266045808792, + "step": 6362 + }, + { + "epoch": 0.198875, + "grad_norm": 3.578125, + "grad_norm_var": 0.17430013020833332, + "learning_rate": 0.0001, + "loss": 6.154, + "loss/crossentropy": 2.5593817234039307, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.2004818245768547, + "step": 6364 + }, + { + "epoch": 0.1989375, + "grad_norm": 4.03125, + "grad_norm_var": 0.17472330729166666, + "learning_rate": 0.0001, + "loss": 6.0191, + "loss/crossentropy": 2.418843388557434, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.19518518447875977, + "step": 6366 + }, + { + "epoch": 0.199, + "grad_norm": 3.875, + "grad_norm_var": 0.19568684895833333, + "learning_rate": 0.0001, + "loss": 6.4857, + "loss/crossentropy": 2.751247763633728, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20782289654016495, + "step": 6368 + }, + { + "epoch": 0.1990625, + "grad_norm": 3.59375, + "grad_norm_var": 0.19297587076822917, + "learning_rate": 0.0001, + "loss": 6.0458, + "loss/crossentropy": 2.521559238433838, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.196170873939991, + "step": 6370 + }, + { + "epoch": 0.199125, + "grad_norm": 3.25, + "grad_norm_var": 0.21787821451822917, + "learning_rate": 0.0001, + "loss": 5.827, + "loss/crossentropy": 2.4283812046051025, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18360844254493713, + "step": 6372 + }, + { + "epoch": 0.1991875, + "grad_norm": 3.4375, + "grad_norm_var": 0.20514322916666666, + "learning_rate": 0.0001, + "loss": 5.9552, + "loss/crossentropy": 2.408576011657715, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19568176567554474, + "step": 6374 + }, + { + "epoch": 0.19925, + "grad_norm": 3.890625, + "grad_norm_var": 0.13240559895833334, + "learning_rate": 0.0001, + "loss": 6.0052, + "loss/crossentropy": 2.449387311935425, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.193084217607975, + "step": 6376 + }, + { + "epoch": 0.1993125, + "grad_norm": 3.59375, + "grad_norm_var": 0.079541015625, + "learning_rate": 0.0001, + "loss": 6.3974, + "loss/crossentropy": 2.7193866968154907, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.2068602815270424, + "step": 6378 + }, + { + "epoch": 0.199375, + "grad_norm": 3.625, + "grad_norm_var": 0.07008056640625, + "learning_rate": 0.0001, + "loss": 6.4284, + "loss/crossentropy": 2.7452560663223267, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.20386269688606262, + "step": 6380 + }, + { + "epoch": 0.1994375, + "grad_norm": 3.5, + "grad_norm_var": 0.0810455322265625, + "learning_rate": 0.0001, + "loss": 5.7566, + "loss/crossentropy": 2.3827916383743286, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1819118708372116, + "step": 6382 + }, + { + "epoch": 0.1995, + "grad_norm": 3.734375, + "grad_norm_var": 0.04063212076822917, + "learning_rate": 0.0001, + "loss": 6.378, + "loss/crossentropy": 2.6957045793533325, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20417062938213348, + "step": 6384 + }, + { + "epoch": 0.1995625, + "grad_norm": 3.5625, + "grad_norm_var": 0.0408355712890625, + "learning_rate": 0.0001, + "loss": 6.1671, + "loss/crossentropy": 2.603260636329651, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.19466431438922882, + "step": 6386 + }, + { + "epoch": 0.199625, + "grad_norm": 3.65625, + "grad_norm_var": 0.04121805826822917, + "learning_rate": 0.0001, + "loss": 6.5784, + "loss/crossentropy": 2.73874568939209, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.21717103570699692, + "step": 6388 + }, + { + "epoch": 0.1996875, + "grad_norm": 3.28125, + "grad_norm_var": 0.04345703125, + "learning_rate": 0.0001, + "loss": 6.2663, + "loss/crossentropy": 2.6854538917541504, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19793018698692322, + "step": 6390 + }, + { + "epoch": 0.19975, + "grad_norm": 6.25, + "grad_norm_var": 0.6866770426432292, + "learning_rate": 0.0001, + "loss": 6.5006, + "loss/crossentropy": 2.59263277053833, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.22321807593107224, + "step": 6392 + }, + { + "epoch": 0.1998125, + "grad_norm": 3.578125, + "grad_norm_var": 0.6817535400390625, + "learning_rate": 0.0001, + "loss": 6.0462, + "loss/crossentropy": 2.4960540533065796, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19408106058835983, + "step": 6394 + }, + { + "epoch": 0.199875, + "grad_norm": 3.625, + "grad_norm_var": 0.696044921875, + "learning_rate": 0.0001, + "loss": 6.0056, + "loss/crossentropy": 2.483731508255005, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19280751049518585, + "step": 6396 + }, + { + "epoch": 0.1999375, + "grad_norm": 4.0, + "grad_norm_var": 0.6753000895182292, + "learning_rate": 0.0001, + "loss": 6.3558, + "loss/crossentropy": 2.6343945264816284, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.20612553507089615, + "step": 6398 + }, + { + "epoch": 0.2, + "grad_norm": 3.75, + "grad_norm_var": 0.6785959879557292, + "learning_rate": 0.0001, + "loss": 6.0777, + "loss/crossentropy": 2.5519295930862427, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19711043685674667, + "step": 6400 + }, + { + "epoch": 0.2000625, + "grad_norm": 3.609375, + "grad_norm_var": 0.687744140625, + "learning_rate": 0.0001, + "loss": 6.034, + "loss/crossentropy": 2.49414598941803, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.1938270404934883, + "step": 6402 + }, + { + "epoch": 0.200125, + "grad_norm": 3.640625, + "grad_norm_var": 0.6919911702473959, + "learning_rate": 0.0001, + "loss": 6.2959, + "loss/crossentropy": 2.6305042505264282, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.2020825818181038, + "step": 6404 + }, + { + "epoch": 0.2001875, + "grad_norm": 6.0625, + "grad_norm_var": 0.9452952067057292, + "learning_rate": 0.0001, + "loss": 6.4622, + "loss/crossentropy": 2.616268277168274, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.21389038115739822, + "step": 6406 + }, + { + "epoch": 0.20025, + "grad_norm": 3.375, + "grad_norm_var": 0.4216542561848958, + "learning_rate": 0.0001, + "loss": 6.1744, + "loss/crossentropy": 2.6161030530929565, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19450364261865616, + "step": 6408 + }, + { + "epoch": 0.2003125, + "grad_norm": 3.5625, + "grad_norm_var": 0.43029683430989585, + "learning_rate": 0.0001, + "loss": 6.2095, + "loss/crossentropy": 2.6182576417922974, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19779501855373383, + "step": 6410 + }, + { + "epoch": 0.200375, + "grad_norm": 3.4375, + "grad_norm_var": 0.4272369384765625, + "learning_rate": 0.0001, + "loss": 6.2618, + "loss/crossentropy": 2.6078569889068604, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.20093682408332825, + "step": 6412 + }, + { + "epoch": 0.2004375, + "grad_norm": 3.46875, + "grad_norm_var": 0.41306966145833335, + "learning_rate": 0.0001, + "loss": 5.821, + "loss/crossentropy": 2.303962230682373, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.19662676006555557, + "step": 6414 + }, + { + "epoch": 0.2005, + "grad_norm": 3.328125, + "grad_norm_var": 0.44433186848958334, + "learning_rate": 0.0001, + "loss": 5.5651, + "loss/crossentropy": 2.257362961769104, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17413020133972168, + "step": 6416 + }, + { + "epoch": 0.2005625, + "grad_norm": 3.546875, + "grad_norm_var": 0.43925679524739586, + "learning_rate": 0.0001, + "loss": 6.4127, + "loss/crossentropy": 2.721003532409668, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.2070593237876892, + "step": 6418 + }, + { + "epoch": 0.200625, + "grad_norm": 4.625, + "grad_norm_var": 0.49986572265625, + "learning_rate": 0.0001, + "loss": 6.561, + "loss/crossentropy": 2.606340169906616, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.22593571245670319, + "step": 6420 + }, + { + "epoch": 0.2006875, + "grad_norm": 3.34375, + "grad_norm_var": 0.1508697509765625, + "learning_rate": 0.0001, + "loss": 6.0928, + "loss/crossentropy": 2.532419443130493, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19510090351104736, + "step": 6422 + }, + { + "epoch": 0.20075, + "grad_norm": 3.609375, + "grad_norm_var": 0.13479715983072918, + "learning_rate": 0.0001, + "loss": 5.6919, + "loss/crossentropy": 2.1905429363250732, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.18880540132522583, + "step": 6424 + }, + { + "epoch": 0.2008125, + "grad_norm": 3.453125, + "grad_norm_var": 0.13567301432291667, + "learning_rate": 0.0001, + "loss": 6.0427, + "loss/crossentropy": 2.5184231996536255, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19227512180805206, + "step": 6426 + }, + { + "epoch": 0.200875, + "grad_norm": 3.421875, + "grad_norm_var": 0.137060546875, + "learning_rate": 0.0001, + "loss": 6.1531, + "loss/crossentropy": 2.416158676147461, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20806624740362167, + "step": 6428 + }, + { + "epoch": 0.2009375, + "grad_norm": 3.53125, + "grad_norm_var": 0.13623758951822917, + "learning_rate": 0.0001, + "loss": 6.4346, + "loss/crossentropy": 2.746420979499817, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20592492073774338, + "step": 6430 + }, + { + "epoch": 0.201, + "grad_norm": 3.53125, + "grad_norm_var": 0.11658426920572916, + "learning_rate": 0.0001, + "loss": 6.1585, + "loss/crossentropy": 2.525734782218933, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.19882269948720932, + "step": 6432 + }, + { + "epoch": 0.2010625, + "grad_norm": 3.625, + "grad_norm_var": 0.1169342041015625, + "learning_rate": 0.0001, + "loss": 5.9987, + "loss/crossentropy": 2.3857542276382446, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.19606362283229828, + "step": 6434 + }, + { + "epoch": 0.201125, + "grad_norm": 3.765625, + "grad_norm_var": 0.019449869791666668, + "learning_rate": 0.0001, + "loss": 6.494, + "loss/crossentropy": 2.761850357055664, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.20289798080921173, + "step": 6436 + }, + { + "epoch": 0.2011875, + "grad_norm": 3.875, + "grad_norm_var": 0.025260416666666667, + "learning_rate": 0.0001, + "loss": 6.5977, + "loss/crossentropy": 2.7616851329803467, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.21602296084165573, + "step": 6438 + }, + { + "epoch": 0.20125, + "grad_norm": 3.5, + "grad_norm_var": 0.028076171875, + "learning_rate": 0.0001, + "loss": 6.1706, + "loss/crossentropy": 2.5454601049423218, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19962288439273834, + "step": 6440 + }, + { + "epoch": 0.2013125, + "grad_norm": 3.5, + "grad_norm_var": 0.03411051432291667, + "learning_rate": 0.0001, + "loss": 5.9664, + "loss/crossentropy": 2.4520124197006226, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19050417840480804, + "step": 6442 + }, + { + "epoch": 0.201375, + "grad_norm": 3.4375, + "grad_norm_var": 0.03203023274739583, + "learning_rate": 0.0001, + "loss": 6.3257, + "loss/crossentropy": 2.726597547531128, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.2028745710849762, + "step": 6444 + }, + { + "epoch": 0.2014375, + "grad_norm": 3.046875, + "grad_norm_var": 0.05325113932291667, + "learning_rate": 0.0001, + "loss": 5.8921, + "loss/crossentropy": 2.405794858932495, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.18925213813781738, + "step": 6446 + }, + { + "epoch": 0.2015, + "grad_norm": 3.421875, + "grad_norm_var": 0.05666910807291667, + "learning_rate": 0.0001, + "loss": 6.0421, + "loss/crossentropy": 2.5933274030685425, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.18394114822149277, + "step": 6448 + }, + { + "epoch": 0.2015625, + "grad_norm": 4.25, + "grad_norm_var": 0.08600260416666666, + "learning_rate": 0.0001, + "loss": 6.291, + "loss/crossentropy": 2.6293972730636597, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.2028760313987732, + "step": 6450 + }, + { + "epoch": 0.201625, + "grad_norm": 3.484375, + "grad_norm_var": 0.08571675618489584, + "learning_rate": 0.0001, + "loss": 6.3673, + "loss/crossentropy": 2.68838632106781, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.20578349381685257, + "step": 6452 + }, + { + "epoch": 0.2016875, + "grad_norm": 3.53125, + "grad_norm_var": 0.06887613932291667, + "learning_rate": 0.0001, + "loss": 6.2586, + "loss/crossentropy": 2.695120930671692, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.1934536248445511, + "step": 6454 + }, + { + "epoch": 0.20175, + "grad_norm": 3.34375, + "grad_norm_var": 0.07627665201822917, + "learning_rate": 0.0001, + "loss": 5.6712, + "loss/crossentropy": 2.2751861810684204, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.18022499233484268, + "step": 6456 + }, + { + "epoch": 0.2018125, + "grad_norm": 3.421875, + "grad_norm_var": 0.07322489420572917, + "learning_rate": 0.0001, + "loss": 5.9988, + "loss/crossentropy": 2.5843334197998047, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1875448003411293, + "step": 6458 + }, + { + "epoch": 0.201875, + "grad_norm": 3.421875, + "grad_norm_var": 0.07529195149739583, + "learning_rate": 0.0001, + "loss": 5.957, + "loss/crossentropy": 2.453627586364746, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19135772436857224, + "step": 6460 + }, + { + "epoch": 0.2019375, + "grad_norm": 3.53125, + "grad_norm_var": 0.05598856608072917, + "learning_rate": 0.0001, + "loss": 5.9872, + "loss/crossentropy": 2.428037643432617, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1969291865825653, + "step": 6462 + }, + { + "epoch": 0.202, + "grad_norm": 3.34375, + "grad_norm_var": 0.05963541666666667, + "learning_rate": 0.0001, + "loss": 6.4439, + "loss/crossentropy": 2.763149619102478, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20479683578014374, + "step": 6464 + }, + { + "epoch": 0.2020625, + "grad_norm": 3.484375, + "grad_norm_var": 0.024706013997395835, + "learning_rate": 0.0001, + "loss": 6.3375, + "loss/crossentropy": 2.755637764930725, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.198811337351799, + "step": 6466 + }, + { + "epoch": 0.202125, + "grad_norm": 4.21875, + "grad_norm_var": 0.1077056884765625, + "learning_rate": 0.0001, + "loss": 6.7012, + "loss/crossentropy": 2.681010603904724, + "loss/hidden": 1.7421875, + "loss/jsd": 0.0, + "loss/logits": 0.22779762744903564, + "step": 6468 + }, + { + "epoch": 0.2021875, + "grad_norm": 3.609375, + "grad_norm_var": 0.10625, + "learning_rate": 0.0001, + "loss": 5.7852, + "loss/crossentropy": 2.30839204788208, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.1871296763420105, + "step": 6470 + }, + { + "epoch": 0.20225, + "grad_norm": 3.546875, + "grad_norm_var": 0.09915262858072917, + "learning_rate": 0.0001, + "loss": 6.3085, + "loss/crossentropy": 2.6761258840560913, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.2030799761414528, + "step": 6472 + }, + { + "epoch": 0.2023125, + "grad_norm": 3.609375, + "grad_norm_var": 0.10269266764322917, + "learning_rate": 0.0001, + "loss": 6.0303, + "loss/crossentropy": 2.5599225759506226, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18922977149486542, + "step": 6474 + }, + { + "epoch": 0.202375, + "grad_norm": 3.546875, + "grad_norm_var": 0.1067779541015625, + "learning_rate": 0.0001, + "loss": 6.3538, + "loss/crossentropy": 2.677946925163269, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.207425557076931, + "step": 6476 + }, + { + "epoch": 0.2024375, + "grad_norm": 3.703125, + "grad_norm_var": 0.1154937744140625, + "learning_rate": 0.0001, + "loss": 6.4047, + "loss/crossentropy": 2.7720776796340942, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.1999780759215355, + "step": 6478 + }, + { + "epoch": 0.2025, + "grad_norm": 3.1875, + "grad_norm_var": 0.12296549479166667, + "learning_rate": 0.0001, + "loss": 6.2042, + "loss/crossentropy": 2.692052960395813, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19379380345344543, + "step": 6480 + }, + { + "epoch": 0.2025625, + "grad_norm": 3.390625, + "grad_norm_var": 0.12428385416666667, + "learning_rate": 0.0001, + "loss": 6.1168, + "loss/crossentropy": 2.5561152696609497, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19669246673583984, + "step": 6482 + }, + { + "epoch": 0.202625, + "grad_norm": 4.03125, + "grad_norm_var": 0.063720703125, + "learning_rate": 0.0001, + "loss": 6.1133, + "loss/crossentropy": 2.533989191055298, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.19464991986751556, + "step": 6484 + }, + { + "epoch": 0.2026875, + "grad_norm": 3.46875, + "grad_norm_var": 0.06373291015625, + "learning_rate": 0.0001, + "loss": 6.3021, + "loss/crossentropy": 2.685084342956543, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.20037659257650375, + "step": 6486 + }, + { + "epoch": 0.20275, + "grad_norm": 3.625, + "grad_norm_var": 0.06504618326822917, + "learning_rate": 0.0001, + "loss": 6.115, + "loss/crossentropy": 2.609553813934326, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19429339468479156, + "step": 6488 + }, + { + "epoch": 0.2028125, + "grad_norm": 3.140625, + "grad_norm_var": 0.06892801920572916, + "learning_rate": 0.0001, + "loss": 5.9493, + "loss/crossentropy": 2.5031588077545166, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18836626410484314, + "step": 6490 + }, + { + "epoch": 0.202875, + "grad_norm": 3.28125, + "grad_norm_var": 0.051590983072916666, + "learning_rate": 0.0001, + "loss": 5.8351, + "loss/crossentropy": 2.389310359954834, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18677020817995071, + "step": 6492 + }, + { + "epoch": 0.2029375, + "grad_norm": 3.34375, + "grad_norm_var": 0.04875895182291667, + "learning_rate": 0.0001, + "loss": 6.2967, + "loss/crossentropy": 2.6834553480148315, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.19530673325061798, + "step": 6494 + }, + { + "epoch": 0.203, + "grad_norm": 3.40625, + "grad_norm_var": 0.053544108072916666, + "learning_rate": 0.0001, + "loss": 6.0616, + "loss/crossentropy": 2.5159674882888794, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.19284315407276154, + "step": 6496 + }, + { + "epoch": 0.2030625, + "grad_norm": 3.421875, + "grad_norm_var": 0.05363667805989583, + "learning_rate": 0.0001, + "loss": 6.157, + "loss/crossentropy": 2.5118918418884277, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20123399794101715, + "step": 6498 + }, + { + "epoch": 0.203125, + "grad_norm": 3.671875, + "grad_norm_var": 0.03837890625, + "learning_rate": 0.0001, + "loss": 6.1266, + "loss/crossentropy": 2.590605854988098, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19656459987163544, + "step": 6500 + }, + { + "epoch": 0.2031875, + "grad_norm": 3.453125, + "grad_norm_var": 0.04551493326822917, + "learning_rate": 0.0001, + "loss": 6.005, + "loss/crossentropy": 2.442551851272583, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.1960894912481308, + "step": 6502 + }, + { + "epoch": 0.20325, + "grad_norm": 3.875, + "grad_norm_var": 0.056050618489583336, + "learning_rate": 0.0001, + "loss": 6.0774, + "loss/crossentropy": 2.550527811050415, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19448236376047134, + "step": 6504 + }, + { + "epoch": 0.2033125, + "grad_norm": 3.59375, + "grad_norm_var": 0.05068359375, + "learning_rate": 0.0001, + "loss": 6.252, + "loss/crossentropy": 2.6041879653930664, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.20423421263694763, + "step": 6506 + }, + { + "epoch": 0.203375, + "grad_norm": 3.359375, + "grad_norm_var": 0.045947265625, + "learning_rate": 0.0001, + "loss": 6.4221, + "loss/crossentropy": 2.733941078186035, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.2027966007590294, + "step": 6508 + }, + { + "epoch": 0.2034375, + "grad_norm": 3.3125, + "grad_norm_var": 0.04585673014322917, + "learning_rate": 0.0001, + "loss": 6.0533, + "loss/crossentropy": 2.4923083782196045, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.1920376867055893, + "step": 6510 + }, + { + "epoch": 0.2035, + "grad_norm": 4.09375, + "grad_norm_var": 0.06238505045572917, + "learning_rate": 0.0001, + "loss": 6.2568, + "loss/crossentropy": 2.6752275228500366, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19799719750881195, + "step": 6512 + }, + { + "epoch": 0.2035625, + "grad_norm": 3.609375, + "grad_norm_var": 0.06461181640625, + "learning_rate": 0.0001, + "loss": 6.1931, + "loss/crossentropy": 2.5532758235931396, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.201869398355484, + "step": 6514 + }, + { + "epoch": 0.203625, + "grad_norm": 3.578125, + "grad_norm_var": 0.0533355712890625, + "learning_rate": 0.0001, + "loss": 6.2844, + "loss/crossentropy": 2.6496084928512573, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.19628706574440002, + "step": 6516 + }, + { + "epoch": 0.2036875, + "grad_norm": 3.921875, + "grad_norm_var": 0.0578765869140625, + "learning_rate": 0.0001, + "loss": 6.2643, + "loss/crossentropy": 2.5256234407424927, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.20784791558980942, + "step": 6518 + }, + { + "epoch": 0.20375, + "grad_norm": 3.3125, + "grad_norm_var": 0.052685546875, + "learning_rate": 0.0001, + "loss": 6.0125, + "loss/crossentropy": 2.6128029823303223, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18567069619894028, + "step": 6520 + }, + { + "epoch": 0.2038125, + "grad_norm": 3.46875, + "grad_norm_var": 0.054850260416666664, + "learning_rate": 0.0001, + "loss": 6.0551, + "loss/crossentropy": 2.523378372192383, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19458089023828506, + "step": 6522 + }, + { + "epoch": 0.203875, + "grad_norm": 3.328125, + "grad_norm_var": 0.06409505208333334, + "learning_rate": 0.0001, + "loss": 5.6911, + "loss/crossentropy": 2.242918074131012, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.1819288358092308, + "step": 6524 + }, + { + "epoch": 0.2039375, + "grad_norm": 3.375, + "grad_norm_var": 0.06256103515625, + "learning_rate": 0.0001, + "loss": 6.0178, + "loss/crossentropy": 2.5325701236724854, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.1879783421754837, + "step": 6526 + }, + { + "epoch": 0.204, + "grad_norm": 3.1875, + "grad_norm_var": 0.0496246337890625, + "learning_rate": 0.0001, + "loss": 5.8456, + "loss/crossentropy": 2.405721068382263, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18773876875638962, + "step": 6528 + }, + { + "epoch": 0.2040625, + "grad_norm": 3.53125, + "grad_norm_var": 0.0484039306640625, + "learning_rate": 0.0001, + "loss": 6.0804, + "loss/crossentropy": 2.5674500465393066, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1942674219608307, + "step": 6530 + }, + { + "epoch": 0.204125, + "grad_norm": 3.8125, + "grad_norm_var": 0.0522125244140625, + "learning_rate": 0.0001, + "loss": 5.9899, + "loss/crossentropy": 2.3522396087646484, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.20165231823921204, + "step": 6532 + }, + { + "epoch": 0.2041875, + "grad_norm": 3.640625, + "grad_norm_var": 0.05276285807291667, + "learning_rate": 0.0001, + "loss": 5.9671, + "loss/crossentropy": 2.5224697589874268, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1866544932126999, + "step": 6534 + }, + { + "epoch": 0.20425, + "grad_norm": 3.46875, + "grad_norm_var": 0.04951883951822917, + "learning_rate": 0.0001, + "loss": 6.3238, + "loss/crossentropy": 2.7269598245620728, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.1967899575829506, + "step": 6536 + }, + { + "epoch": 0.2043125, + "grad_norm": 3.46875, + "grad_norm_var": 0.04924214680989583, + "learning_rate": 0.0001, + "loss": 6.0685, + "loss/crossentropy": 2.5368363857269287, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19534919410943985, + "step": 6538 + }, + { + "epoch": 0.204375, + "grad_norm": 3.953125, + "grad_norm_var": 0.046873982747395834, + "learning_rate": 0.0001, + "loss": 6.089, + "loss/crossentropy": 2.502684712409973, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19925415515899658, + "step": 6540 + }, + { + "epoch": 0.2044375, + "grad_norm": 3.65625, + "grad_norm_var": 0.0486480712890625, + "learning_rate": 0.0001, + "loss": 6.0269, + "loss/crossentropy": 2.5080604553222656, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19367945939302444, + "step": 6542 + }, + { + "epoch": 0.2045, + "grad_norm": 3.796875, + "grad_norm_var": 0.04801025390625, + "learning_rate": 0.0001, + "loss": 6.2834, + "loss/crossentropy": 2.530786395072937, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.2108120620250702, + "step": 6544 + }, + { + "epoch": 0.2045625, + "grad_norm": 3.40625, + "grad_norm_var": 0.06304931640625, + "learning_rate": 0.0001, + "loss": 5.9298, + "loss/crossentropy": 2.4919790029525757, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.18440379947423935, + "step": 6546 + }, + { + "epoch": 0.204625, + "grad_norm": 4.9375, + "grad_norm_var": 0.20811258951822917, + "learning_rate": 0.0001, + "loss": 5.7784, + "loss/crossentropy": 2.259275794029236, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.183159738779068, + "step": 6548 + }, + { + "epoch": 0.2046875, + "grad_norm": 3.671875, + "grad_norm_var": 0.20513916015625, + "learning_rate": 0.0001, + "loss": 6.3012, + "loss/crossentropy": 2.639529585838318, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.20600835978984833, + "step": 6550 + }, + { + "epoch": 0.20475, + "grad_norm": 3.78125, + "grad_norm_var": 0.2039947509765625, + "learning_rate": 0.0001, + "loss": 6.209, + "loss/crossentropy": 2.7606594562530518, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18819257616996765, + "step": 6552 + }, + { + "epoch": 0.2048125, + "grad_norm": 3.484375, + "grad_norm_var": 0.19899800618489583, + "learning_rate": 0.0001, + "loss": 6.2039, + "loss/crossentropy": 2.7157788276672363, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.191785030066967, + "step": 6554 + }, + { + "epoch": 0.204875, + "grad_norm": 3.328125, + "grad_norm_var": 0.2031158447265625, + "learning_rate": 0.0001, + "loss": 5.9576, + "loss/crossentropy": 2.3542600870132446, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.2005726769566536, + "step": 6556 + }, + { + "epoch": 0.2049375, + "grad_norm": 3.515625, + "grad_norm_var": 0.19844462076822916, + "learning_rate": 0.0001, + "loss": 6.4571, + "loss/crossentropy": 2.7662460803985596, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20501954853534698, + "step": 6558 + }, + { + "epoch": 0.205, + "grad_norm": 3.984375, + "grad_norm_var": 0.20293680826822916, + "learning_rate": 0.0001, + "loss": 6.4058, + "loss/crossentropy": 2.7039828300476074, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.20846319943666458, + "step": 6560 + }, + { + "epoch": 0.2050625, + "grad_norm": 3.328125, + "grad_norm_var": 0.1826171875, + "learning_rate": 0.0001, + "loss": 6.3746, + "loss/crossentropy": 2.7657452821731567, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.20150689780712128, + "step": 6562 + }, + { + "epoch": 0.205125, + "grad_norm": 3.84375, + "grad_norm_var": 0.045633951822916664, + "learning_rate": 0.0001, + "loss": 6.2148, + "loss/crossentropy": 2.551281690597534, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.20697413384914398, + "step": 6564 + }, + { + "epoch": 0.2051875, + "grad_norm": 3.5, + "grad_norm_var": 0.0443756103515625, + "learning_rate": 0.0001, + "loss": 5.8608, + "loss/crossentropy": 2.430768132209778, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1867513507604599, + "step": 6566 + }, + { + "epoch": 0.20525, + "grad_norm": 3.234375, + "grad_norm_var": 0.05994466145833333, + "learning_rate": 0.0001, + "loss": 6.2358, + "loss/crossentropy": 2.7113460302352905, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19267980754375458, + "step": 6568 + }, + { + "epoch": 0.2053125, + "grad_norm": 3.515625, + "grad_norm_var": 0.05836181640625, + "learning_rate": 0.0001, + "loss": 5.6412, + "loss/crossentropy": 2.3279199600219727, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17859302461147308, + "step": 6570 + }, + { + "epoch": 0.205375, + "grad_norm": 3.46875, + "grad_norm_var": 0.04892578125, + "learning_rate": 0.0001, + "loss": 6.2538, + "loss/crossentropy": 2.665827751159668, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19942322373390198, + "step": 6572 + }, + { + "epoch": 0.2054375, + "grad_norm": 3.796875, + "grad_norm_var": 0.05757548014322917, + "learning_rate": 0.0001, + "loss": 6.3836, + "loss/crossentropy": 2.6853175163269043, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.21006380766630173, + "step": 6574 + }, + { + "epoch": 0.2055, + "grad_norm": 3.5625, + "grad_norm_var": 0.05373942057291667, + "learning_rate": 0.0001, + "loss": 6.2879, + "loss/crossentropy": 2.569413185119629, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.20466507971286774, + "step": 6576 + }, + { + "epoch": 0.2055625, + "grad_norm": 3.6875, + "grad_norm_var": 0.0574859619140625, + "learning_rate": 0.0001, + "loss": 6.2565, + "loss/crossentropy": 2.6899293661117554, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19572219252586365, + "step": 6578 + }, + { + "epoch": 0.205625, + "grad_norm": 3.796875, + "grad_norm_var": 0.0571197509765625, + "learning_rate": 0.0001, + "loss": 6.3495, + "loss/crossentropy": 2.7121264934539795, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20084993541240692, + "step": 6580 + }, + { + "epoch": 0.2056875, + "grad_norm": 3.40625, + "grad_norm_var": 0.052294921875, + "learning_rate": 0.0001, + "loss": 6.0029, + "loss/crossentropy": 2.565826892852783, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18589099496603012, + "step": 6582 + }, + { + "epoch": 0.20575, + "grad_norm": 3.1875, + "grad_norm_var": 0.048173014322916666, + "learning_rate": 0.0001, + "loss": 6.0348, + "loss/crossentropy": 2.5720603466033936, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.18338802456855774, + "step": 6584 + }, + { + "epoch": 0.2058125, + "grad_norm": 3.765625, + "grad_norm_var": 0.0484375, + "learning_rate": 0.0001, + "loss": 6.2553, + "loss/crossentropy": 2.6761839389801025, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19697345048189163, + "step": 6586 + }, + { + "epoch": 0.205875, + "grad_norm": 3.40625, + "grad_norm_var": 0.05034077962239583, + "learning_rate": 0.0001, + "loss": 6.2078, + "loss/crossentropy": 2.63966965675354, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.1986144557595253, + "step": 6588 + }, + { + "epoch": 0.2059375, + "grad_norm": 3.890625, + "grad_norm_var": 0.04778238932291667, + "learning_rate": 0.0001, + "loss": 5.9383, + "loss/crossentropy": 2.411689281463623, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.19055413454771042, + "step": 6590 + }, + { + "epoch": 0.206, + "grad_norm": 3.5625, + "grad_norm_var": 0.0485260009765625, + "learning_rate": 0.0001, + "loss": 5.9474, + "loss/crossentropy": 2.4322092533111572, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19136208295822144, + "step": 6592 + }, + { + "epoch": 0.2060625, + "grad_norm": 3.5, + "grad_norm_var": 0.04551493326822917, + "learning_rate": 0.0001, + "loss": 5.9522, + "loss/crossentropy": 2.410311698913574, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.1905139610171318, + "step": 6594 + }, + { + "epoch": 0.206125, + "grad_norm": 3.453125, + "grad_norm_var": 0.04211832682291667, + "learning_rate": 0.0001, + "loss": 6.1695, + "loss/crossentropy": 2.6207791566848755, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19510557502508163, + "step": 6596 + }, + { + "epoch": 0.2061875, + "grad_norm": 3.515625, + "grad_norm_var": 0.040608723958333336, + "learning_rate": 0.0001, + "loss": 6.1398, + "loss/crossentropy": 2.588352918624878, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1961626261472702, + "step": 6598 + }, + { + "epoch": 0.20625, + "grad_norm": 3.328125, + "grad_norm_var": 0.038557942708333334, + "learning_rate": 0.0001, + "loss": 6.0676, + "loss/crossentropy": 2.5641703605651855, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.18940068036317825, + "step": 6600 + }, + { + "epoch": 0.2063125, + "grad_norm": 3.578125, + "grad_norm_var": 0.03681640625, + "learning_rate": 0.0001, + "loss": 6.0762, + "loss/crossentropy": 2.5501731634140015, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19166115671396255, + "step": 6602 + }, + { + "epoch": 0.206375, + "grad_norm": 3.546875, + "grad_norm_var": 0.03290913899739583, + "learning_rate": 0.0001, + "loss": 6.0994, + "loss/crossentropy": 2.5292768478393555, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19958586245775223, + "step": 6604 + }, + { + "epoch": 0.2064375, + "grad_norm": 3.578125, + "grad_norm_var": 0.02412109375, + "learning_rate": 0.0001, + "loss": 6.0442, + "loss/crossentropy": 2.487869143486023, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19586338847875595, + "step": 6606 + }, + { + "epoch": 0.2065, + "grad_norm": 3.75, + "grad_norm_var": 0.016877237955729166, + "learning_rate": 0.0001, + "loss": 6.1839, + "loss/crossentropy": 2.5478591918945312, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20071035623550415, + "step": 6608 + }, + { + "epoch": 0.2065625, + "grad_norm": 3.578125, + "grad_norm_var": 0.014192708333333333, + "learning_rate": 0.0001, + "loss": 5.9695, + "loss/crossentropy": 2.489307165145874, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1894274652004242, + "step": 6610 + }, + { + "epoch": 0.206625, + "grad_norm": 3.484375, + "grad_norm_var": 0.013570149739583334, + "learning_rate": 0.0001, + "loss": 5.9812, + "loss/crossentropy": 2.469808340072632, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1948893666267395, + "step": 6612 + }, + { + "epoch": 0.2066875, + "grad_norm": 3.546875, + "grad_norm_var": 0.014777628580729167, + "learning_rate": 0.0001, + "loss": 5.9901, + "loss/crossentropy": 2.4591652154922485, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1941061168909073, + "step": 6614 + }, + { + "epoch": 0.20675, + "grad_norm": 3.546875, + "grad_norm_var": 0.0080963134765625, + "learning_rate": 0.0001, + "loss": 6.3745, + "loss/crossentropy": 2.7677459716796875, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.20168904960155487, + "step": 6616 + }, + { + "epoch": 0.2068125, + "grad_norm": 3.59375, + "grad_norm_var": 0.009326171875, + "learning_rate": 0.0001, + "loss": 6.331, + "loss/crossentropy": 2.570394515991211, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.2116076424717903, + "step": 6618 + }, + { + "epoch": 0.206875, + "grad_norm": 3.75, + "grad_norm_var": 0.04052327473958333, + "learning_rate": 0.0001, + "loss": 6.5496, + "loss/crossentropy": 2.781446933746338, + "loss/hidden": 1.6953125, + "loss/jsd": 0.0, + "loss/logits": 0.20728357881307602, + "step": 6620 + }, + { + "epoch": 0.2069375, + "grad_norm": 3.546875, + "grad_norm_var": 0.039957682291666664, + "learning_rate": 0.0001, + "loss": 6.1231, + "loss/crossentropy": 2.6173206567764282, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.190808467566967, + "step": 6622 + }, + { + "epoch": 0.207, + "grad_norm": 3.640625, + "grad_norm_var": 0.042601521809895834, + "learning_rate": 0.0001, + "loss": 5.8668, + "loss/crossentropy": 2.445883631706238, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.18232329189777374, + "step": 6624 + }, + { + "epoch": 0.2070625, + "grad_norm": 3.40625, + "grad_norm_var": 0.0454742431640625, + "learning_rate": 0.0001, + "loss": 6.1287, + "loss/crossentropy": 2.62809419631958, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19342082738876343, + "step": 6626 + }, + { + "epoch": 0.207125, + "grad_norm": 3.5625, + "grad_norm_var": 0.04563700358072917, + "learning_rate": 0.0001, + "loss": 6.4298, + "loss/crossentropy": 2.7382125854492188, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.20744048058986664, + "step": 6628 + }, + { + "epoch": 0.2071875, + "grad_norm": 3.546875, + "grad_norm_var": 0.04147135416666667, + "learning_rate": 0.0001, + "loss": 6.1929, + "loss/crossentropy": 2.6155550479888916, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19679277390241623, + "step": 6630 + }, + { + "epoch": 0.20725, + "grad_norm": 3.5, + "grad_norm_var": 0.042740885416666666, + "learning_rate": 0.0001, + "loss": 6.0664, + "loss/crossentropy": 2.5981115102767944, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19096802175045013, + "step": 6632 + }, + { + "epoch": 0.2073125, + "grad_norm": 3.53125, + "grad_norm_var": 0.0436431884765625, + "learning_rate": 0.0001, + "loss": 6.5656, + "loss/crossentropy": 2.8172494173049927, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.20881778746843338, + "step": 6634 + }, + { + "epoch": 0.207375, + "grad_norm": 3.609375, + "grad_norm_var": 0.013671875, + "learning_rate": 0.0001, + "loss": 6.4479, + "loss/crossentropy": 2.655908226966858, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.2104528844356537, + "step": 6636 + }, + { + "epoch": 0.2074375, + "grad_norm": 3.484375, + "grad_norm_var": 0.024149576822916668, + "learning_rate": 0.0001, + "loss": 5.8728, + "loss/crossentropy": 2.4703177213668823, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1851721778512001, + "step": 6638 + }, + { + "epoch": 0.2075, + "grad_norm": 3.53125, + "grad_norm_var": 0.022098795572916666, + "learning_rate": 0.0001, + "loss": 5.8662, + "loss/crossentropy": 2.426244616508484, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1908668577671051, + "step": 6640 + }, + { + "epoch": 0.2075625, + "grad_norm": 3.671875, + "grad_norm_var": 0.0309478759765625, + "learning_rate": 0.0001, + "loss": 6.0628, + "loss/crossentropy": 2.6135430335998535, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18984804302453995, + "step": 6642 + }, + { + "epoch": 0.207625, + "grad_norm": 3.5625, + "grad_norm_var": 0.03248291015625, + "learning_rate": 0.0001, + "loss": 5.9551, + "loss/crossentropy": 2.4342517852783203, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19075235724449158, + "step": 6644 + }, + { + "epoch": 0.2076875, + "grad_norm": 3.546875, + "grad_norm_var": 0.03248291015625, + "learning_rate": 0.0001, + "loss": 6.0009, + "loss/crossentropy": 2.528058171272278, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18986602127552032, + "step": 6646 + }, + { + "epoch": 0.20775, + "grad_norm": 3.5625, + "grad_norm_var": 0.03463134765625, + "learning_rate": 0.0001, + "loss": 5.8214, + "loss/crossentropy": 2.3128613233566284, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19030648469924927, + "step": 6648 + }, + { + "epoch": 0.2078125, + "grad_norm": 3.59375, + "grad_norm_var": 0.0339019775390625, + "learning_rate": 0.0001, + "loss": 6.2097, + "loss/crossentropy": 2.593241810798645, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.19992893934249878, + "step": 6650 + }, + { + "epoch": 0.207875, + "grad_norm": 3.375, + "grad_norm_var": 0.0320465087890625, + "learning_rate": 0.0001, + "loss": 6.085, + "loss/crossentropy": 2.586739182472229, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1924077644944191, + "step": 6652 + }, + { + "epoch": 0.2079375, + "grad_norm": 3.453125, + "grad_norm_var": 0.025275675455729167, + "learning_rate": 0.0001, + "loss": 6.1803, + "loss/crossentropy": 2.6167575120925903, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19541189074516296, + "step": 6654 + }, + { + "epoch": 0.208, + "grad_norm": 3.625, + "grad_norm_var": 0.0313140869140625, + "learning_rate": 0.0001, + "loss": 5.8731, + "loss/crossentropy": 2.3873835802078247, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19076339900493622, + "step": 6656 + }, + { + "epoch": 0.2080625, + "grad_norm": 3.453125, + "grad_norm_var": 0.021800740559895834, + "learning_rate": 0.0001, + "loss": 6.1023, + "loss/crossentropy": 2.521125078201294, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19679167866706848, + "step": 6658 + }, + { + "epoch": 0.208125, + "grad_norm": 3.625, + "grad_norm_var": 0.021219889322916668, + "learning_rate": 0.0001, + "loss": 5.9835, + "loss/crossentropy": 2.5526716709136963, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1891724169254303, + "step": 6660 + }, + { + "epoch": 0.2081875, + "grad_norm": 3.5625, + "grad_norm_var": 0.022526041666666666, + "learning_rate": 0.0001, + "loss": 5.9984, + "loss/crossentropy": 2.4385485649108887, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19465378671884537, + "step": 6662 + }, + { + "epoch": 0.20825, + "grad_norm": 3.65625, + "grad_norm_var": 0.023542277018229165, + "learning_rate": 0.0001, + "loss": 6.0496, + "loss/crossentropy": 2.517141580581665, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1970003843307495, + "step": 6664 + }, + { + "epoch": 0.2083125, + "grad_norm": 3.671875, + "grad_norm_var": 0.029264322916666665, + "learning_rate": 0.0001, + "loss": 6.3474, + "loss/crossentropy": 2.6856855154037476, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20327741652727127, + "step": 6666 + }, + { + "epoch": 0.208375, + "grad_norm": 3.4375, + "grad_norm_var": 0.027619425455729166, + "learning_rate": 0.0001, + "loss": 6.2244, + "loss/crossentropy": 2.67505943775177, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1971169114112854, + "step": 6668 + }, + { + "epoch": 0.2084375, + "grad_norm": 3.828125, + "grad_norm_var": 0.031412760416666664, + "learning_rate": 0.0001, + "loss": 6.0871, + "loss/crossentropy": 2.5097700357437134, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.2022606059908867, + "step": 6670 + }, + { + "epoch": 0.2085, + "grad_norm": 4.90625, + "grad_norm_var": 0.14741923014322916, + "learning_rate": 0.0001, + "loss": 6.5034, + "loss/crossentropy": 2.736795425415039, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.21454696357250214, + "step": 6672 + }, + { + "epoch": 0.2085625, + "grad_norm": 3.484375, + "grad_norm_var": 0.14700113932291667, + "learning_rate": 0.0001, + "loss": 5.9703, + "loss/crossentropy": 2.4670923948287964, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.18821196258068085, + "step": 6674 + }, + { + "epoch": 0.208625, + "grad_norm": 3.578125, + "grad_norm_var": 0.14710184733072917, + "learning_rate": 0.0001, + "loss": 6.1057, + "loss/crossentropy": 2.564454197883606, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19592129439115524, + "step": 6676 + }, + { + "epoch": 0.2086875, + "grad_norm": 3.46875, + "grad_norm_var": 0.14792378743489584, + "learning_rate": 0.0001, + "loss": 6.044, + "loss/crossentropy": 2.5001375675201416, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1977410465478897, + "step": 6678 + }, + { + "epoch": 0.20875, + "grad_norm": 4.53125, + "grad_norm_var": 0.1970123291015625, + "learning_rate": 0.0001, + "loss": 6.3306, + "loss/crossentropy": 2.623522400856018, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.2101605385541916, + "step": 6680 + }, + { + "epoch": 0.2088125, + "grad_norm": 3.453125, + "grad_norm_var": 0.1873931884765625, + "learning_rate": 0.0001, + "loss": 6.2546, + "loss/crossentropy": 2.6927119493484497, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.1968143731355667, + "step": 6682 + }, + { + "epoch": 0.208875, + "grad_norm": 3.65625, + "grad_norm_var": 0.1829498291015625, + "learning_rate": 0.0001, + "loss": 6.3133, + "loss/crossentropy": 2.665427803993225, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.20346005260944366, + "step": 6684 + }, + { + "epoch": 0.2089375, + "grad_norm": 4.125, + "grad_norm_var": 0.18936258951822918, + "learning_rate": 0.0001, + "loss": 6.4817, + "loss/crossentropy": 2.6795389652252197, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.21693264693021774, + "step": 6686 + }, + { + "epoch": 0.209, + "grad_norm": 3.703125, + "grad_norm_var": 0.09228515625, + "learning_rate": 0.0001, + "loss": 6.2551, + "loss/crossentropy": 2.697451591491699, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19599945843219757, + "step": 6688 + }, + { + "epoch": 0.2090625, + "grad_norm": 3.671875, + "grad_norm_var": 0.09664713541666667, + "learning_rate": 0.0001, + "loss": 6.3445, + "loss/crossentropy": 2.641369342803955, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.20429710298776627, + "step": 6690 + }, + { + "epoch": 0.209125, + "grad_norm": 3.25, + "grad_norm_var": 0.09938151041666667, + "learning_rate": 0.0001, + "loss": 5.7877, + "loss/crossentropy": 2.3258358240127563, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.1817334219813347, + "step": 6692 + }, + { + "epoch": 0.2091875, + "grad_norm": 3.5625, + "grad_norm_var": 0.10060933430989584, + "learning_rate": 0.0001, + "loss": 6.271, + "loss/crossentropy": 2.6425410509109497, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.2015206143260002, + "step": 6694 + }, + { + "epoch": 0.20925, + "grad_norm": 3.359375, + "grad_norm_var": 0.05495503743489583, + "learning_rate": 0.0001, + "loss": 6.0879, + "loss/crossentropy": 2.579468846321106, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19107597321271896, + "step": 6696 + }, + { + "epoch": 0.2093125, + "grad_norm": 3.453125, + "grad_norm_var": 0.0579986572265625, + "learning_rate": 0.0001, + "loss": 6.3796, + "loss/crossentropy": 2.787188410758972, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.2006485015153885, + "step": 6698 + }, + { + "epoch": 0.209375, + "grad_norm": 3.734375, + "grad_norm_var": 0.05845947265625, + "learning_rate": 0.0001, + "loss": 6.3861, + "loss/crossentropy": 2.7306629419326782, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.20460304617881775, + "step": 6700 + }, + { + "epoch": 0.2094375, + "grad_norm": 3.765625, + "grad_norm_var": 0.03974609375, + "learning_rate": 0.0001, + "loss": 6.3023, + "loss/crossentropy": 2.5314093828201294, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.21107427775859833, + "step": 6702 + }, + { + "epoch": 0.2095, + "grad_norm": 3.25, + "grad_norm_var": 0.044657389322916664, + "learning_rate": 0.0001, + "loss": 6.0491, + "loss/crossentropy": 2.4721285104751587, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.1963651329278946, + "step": 6704 + }, + { + "epoch": 0.2095625, + "grad_norm": 3.8125, + "grad_norm_var": 0.03984375, + "learning_rate": 0.0001, + "loss": 6.136, + "loss/crossentropy": 2.4893136024475098, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.2010004222393036, + "step": 6706 + }, + { + "epoch": 0.209625, + "grad_norm": 3.609375, + "grad_norm_var": 0.0373046875, + "learning_rate": 0.0001, + "loss": 6.322, + "loss/crossentropy": 2.671256422996521, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.20062114298343658, + "step": 6708 + }, + { + "epoch": 0.2096875, + "grad_norm": 3.875, + "grad_norm_var": 0.0443511962890625, + "learning_rate": 0.0001, + "loss": 6.3426, + "loss/crossentropy": 2.7482519149780273, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19967231899499893, + "step": 6710 + }, + { + "epoch": 0.20975, + "grad_norm": 4.0, + "grad_norm_var": 0.0500396728515625, + "learning_rate": 0.0001, + "loss": 6.3674, + "loss/crossentropy": 2.688132405281067, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.19918164610862732, + "step": 6712 + }, + { + "epoch": 0.2098125, + "grad_norm": 3.546875, + "grad_norm_var": 0.041356404622395836, + "learning_rate": 0.0001, + "loss": 6.3835, + "loss/crossentropy": 2.7763701677322388, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.19860640913248062, + "step": 6714 + }, + { + "epoch": 0.209875, + "grad_norm": 3.5, + "grad_norm_var": 0.04519856770833333, + "learning_rate": 0.0001, + "loss": 6.347, + "loss/crossentropy": 2.749025583267212, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19925252348184586, + "step": 6716 + }, + { + "epoch": 0.2099375, + "grad_norm": 3.34375, + "grad_norm_var": 0.05291341145833333, + "learning_rate": 0.0001, + "loss": 5.7526, + "loss/crossentropy": 2.431371331214905, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.17275211960077286, + "step": 6718 + }, + { + "epoch": 0.21, + "grad_norm": 3.578125, + "grad_norm_var": 0.050023396809895836, + "learning_rate": 0.0001, + "loss": 6.2584, + "loss/crossentropy": 2.6580491065979004, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.1998780071735382, + "step": 6720 + }, + { + "epoch": 0.2100625, + "grad_norm": 3.3125, + "grad_norm_var": 0.045882161458333334, + "learning_rate": 0.0001, + "loss": 5.9866, + "loss/crossentropy": 2.57085919380188, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.18298456072807312, + "step": 6722 + }, + { + "epoch": 0.210125, + "grad_norm": 3.984375, + "grad_norm_var": 0.05683492024739583, + "learning_rate": 0.0001, + "loss": 6.2644, + "loss/crossentropy": 2.5806660652160645, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20430614054203033, + "step": 6724 + }, + { + "epoch": 0.2101875, + "grad_norm": 3.4375, + "grad_norm_var": 0.04755859375, + "learning_rate": 0.0001, + "loss": 6.1806, + "loss/crossentropy": 2.649932026863098, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19525517523288727, + "step": 6726 + }, + { + "epoch": 0.21025, + "grad_norm": 3.53125, + "grad_norm_var": 0.034130859375, + "learning_rate": 0.0001, + "loss": 6.4122, + "loss/crossentropy": 2.719170093536377, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.21109655499458313, + "step": 6728 + }, + { + "epoch": 0.2103125, + "grad_norm": 3.90625, + "grad_norm_var": 0.05331624348958333, + "learning_rate": 0.0001, + "loss": 6.2589, + "loss/crossentropy": 2.509567618370056, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.21282700449228287, + "step": 6730 + }, + { + "epoch": 0.210375, + "grad_norm": 3.765625, + "grad_norm_var": 0.053515625, + "learning_rate": 0.0001, + "loss": 6.3033, + "loss/crossentropy": 2.68427836894989, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.1994030922651291, + "step": 6732 + }, + { + "epoch": 0.2104375, + "grad_norm": 3.734375, + "grad_norm_var": 3.7589508056640626, + "learning_rate": 0.0001, + "loss": 6.1585, + "loss/crossentropy": 2.346200704574585, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.21678119897842407, + "step": 6734 + }, + { + "epoch": 0.2105, + "grad_norm": 3.546875, + "grad_norm_var": 3.731297810872396, + "learning_rate": 0.0001, + "loss": 6.4917, + "loss/crossentropy": 2.8597121238708496, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.19523271918296814, + "step": 6736 + }, + { + "epoch": 0.2105625, + "grad_norm": 4.34375, + "grad_norm_var": 3.691844685872396, + "learning_rate": 0.0001, + "loss": 6.3788, + "loss/crossentropy": 2.7760846614837646, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.2008993998169899, + "step": 6738 + }, + { + "epoch": 0.210625, + "grad_norm": 3.75, + "grad_norm_var": 3.670995076497396, + "learning_rate": 0.0001, + "loss": 6.342, + "loss/crossentropy": 2.637627124786377, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.21184614300727844, + "step": 6740 + }, + { + "epoch": 0.2106875, + "grad_norm": 3.796875, + "grad_norm_var": 3.6266886393229165, + "learning_rate": 0.0001, + "loss": 6.2531, + "loss/crossentropy": 2.6286277770996094, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.2018960416316986, + "step": 6742 + }, + { + "epoch": 0.21075, + "grad_norm": 3.8125, + "grad_norm_var": 3.5985677083333334, + "learning_rate": 0.0001, + "loss": 6.3415, + "loss/crossentropy": 2.7454168796539307, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.19633187353610992, + "step": 6744 + }, + { + "epoch": 0.2108125, + "grad_norm": 3.609375, + "grad_norm_var": 3.647419230143229, + "learning_rate": 0.0001, + "loss": 6.4109, + "loss/crossentropy": 2.7401084899902344, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20418954640626907, + "step": 6746 + }, + { + "epoch": 0.210875, + "grad_norm": 3.609375, + "grad_norm_var": 3.637495930989583, + "learning_rate": 0.0001, + "loss": 5.882, + "loss/crossentropy": 2.24249267578125, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.1975444257259369, + "step": 6748 + }, + { + "epoch": 0.2109375, + "grad_norm": 3.15625, + "grad_norm_var": 0.09569905598958334, + "learning_rate": 0.0001, + "loss": 6.2418, + "loss/crossentropy": 2.7148282527923584, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19527536630630493, + "step": 6750 + }, + { + "epoch": 0.211, + "grad_norm": 3.125, + "grad_norm_var": 0.12573140462239582, + "learning_rate": 0.0001, + "loss": 6.1224, + "loss/crossentropy": 2.6875683069229126, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1876225247979164, + "step": 6752 + }, + { + "epoch": 0.2110625, + "grad_norm": 4.0, + "grad_norm_var": 0.09426676432291667, + "learning_rate": 0.0001, + "loss": 6.4024, + "loss/crossentropy": 2.6420962810516357, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.213921919465065, + "step": 6754 + }, + { + "epoch": 0.211125, + "grad_norm": 4.34375, + "grad_norm_var": 0.12241923014322917, + "learning_rate": 0.0001, + "loss": 6.0016, + "loss/crossentropy": 2.3534570932388306, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.19879579544067383, + "step": 6756 + }, + { + "epoch": 0.2111875, + "grad_norm": 4.21875, + "grad_norm_var": 0.15220438639322917, + "learning_rate": 0.0001, + "loss": 6.4019, + "loss/crossentropy": 2.689331889152527, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.2079753801226616, + "step": 6758 + }, + { + "epoch": 0.21125, + "grad_norm": 3.484375, + "grad_norm_var": 0.15225321451822918, + "learning_rate": 0.0001, + "loss": 6.2664, + "loss/crossentropy": 2.6862618923187256, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.1982519030570984, + "step": 6760 + }, + { + "epoch": 0.2113125, + "grad_norm": 3.21875, + "grad_norm_var": 0.16969401041666668, + "learning_rate": 0.0001, + "loss": 5.9312, + "loss/crossentropy": 2.5212395191192627, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1870882213115692, + "step": 6762 + }, + { + "epoch": 0.211375, + "grad_norm": 4.1875, + "grad_norm_var": 0.15653889973958332, + "learning_rate": 0.0001, + "loss": 6.5337, + "loss/crossentropy": 2.787313938140869, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.20979921519756317, + "step": 6764 + }, + { + "epoch": 0.2114375, + "grad_norm": 3.609375, + "grad_norm_var": 0.14332682291666668, + "learning_rate": 0.0001, + "loss": 6.1694, + "loss/crossentropy": 2.5976446866989136, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19584590196609497, + "step": 6766 + }, + { + "epoch": 0.2115, + "grad_norm": 3.203125, + "grad_norm_var": 0.13141988118489584, + "learning_rate": 0.0001, + "loss": 6.1991, + "loss/crossentropy": 2.71761953830719, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.18799372017383575, + "step": 6768 + }, + { + "epoch": 0.2115625, + "grad_norm": 3.734375, + "grad_norm_var": 0.12600504557291667, + "learning_rate": 0.0001, + "loss": 6.2506, + "loss/crossentropy": 2.704631805419922, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19678643345832825, + "step": 6770 + }, + { + "epoch": 0.211625, + "grad_norm": 3.90625, + "grad_norm_var": 0.10442708333333334, + "learning_rate": 0.0001, + "loss": 6.2673, + "loss/crossentropy": 2.6605230569839478, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.1977902352809906, + "step": 6772 + }, + { + "epoch": 0.2116875, + "grad_norm": 3.90625, + "grad_norm_var": 0.07746480305989584, + "learning_rate": 0.0001, + "loss": 6.6312, + "loss/crossentropy": 2.835526466369629, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.21550609171390533, + "step": 6774 + }, + { + "epoch": 0.21175, + "grad_norm": 3.78125, + "grad_norm_var": 0.07994384765625, + "learning_rate": 0.0001, + "loss": 6.1223, + "loss/crossentropy": 2.536879062652588, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.19603867828845978, + "step": 6776 + }, + { + "epoch": 0.2118125, + "grad_norm": 3.796875, + "grad_norm_var": 0.11632486979166666, + "learning_rate": 0.0001, + "loss": 6.2197, + "loss/crossentropy": 2.4974231719970703, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.20777669548988342, + "step": 6778 + }, + { + "epoch": 0.211875, + "grad_norm": 3.90625, + "grad_norm_var": 0.1041015625, + "learning_rate": 0.0001, + "loss": 6.1775, + "loss/crossentropy": 2.4850826263427734, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.2067461460828781, + "step": 6780 + }, + { + "epoch": 0.2119375, + "grad_norm": 3.875, + "grad_norm_var": 0.1038970947265625, + "learning_rate": 0.0001, + "loss": 6.2258, + "loss/crossentropy": 2.639601707458496, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19963591545820236, + "step": 6782 + }, + { + "epoch": 0.212, + "grad_norm": 3.421875, + "grad_norm_var": 0.09138895670572916, + "learning_rate": 0.0001, + "loss": 6.0989, + "loss/crossentropy": 2.51748263835907, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19915585964918137, + "step": 6784 + }, + { + "epoch": 0.2120625, + "grad_norm": 3.359375, + "grad_norm_var": 0.10368550618489583, + "learning_rate": 0.0001, + "loss": 6.1392, + "loss/crossentropy": 2.6376984119415283, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.19507159292697906, + "step": 6786 + }, + { + "epoch": 0.212125, + "grad_norm": 4.09375, + "grad_norm_var": 0.10632222493489583, + "learning_rate": 0.0001, + "loss": 6.1884, + "loss/crossentropy": 2.629759669303894, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.1960998997092247, + "step": 6788 + }, + { + "epoch": 0.2121875, + "grad_norm": 3.5625, + "grad_norm_var": 0.10808817545572917, + "learning_rate": 0.0001, + "loss": 6.6151, + "loss/crossentropy": 2.7972121238708496, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.21615905314683914, + "step": 6790 + }, + { + "epoch": 0.21225, + "grad_norm": 3.6875, + "grad_norm_var": 0.10956929524739584, + "learning_rate": 0.0001, + "loss": 6.3118, + "loss/crossentropy": 2.690866470336914, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19920150190591812, + "step": 6792 + }, + { + "epoch": 0.2123125, + "grad_norm": 3.515625, + "grad_norm_var": 0.06291910807291666, + "learning_rate": 0.0001, + "loss": 6.4656, + "loss/crossentropy": 2.7472859621047974, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.2085518091917038, + "step": 6794 + }, + { + "epoch": 0.212375, + "grad_norm": 3.421875, + "grad_norm_var": 0.05852457682291667, + "learning_rate": 0.0001, + "loss": 6.8297, + "loss/crossentropy": 3.120318293571472, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20766117423772812, + "step": 6796 + }, + { + "epoch": 0.2124375, + "grad_norm": 3.140625, + "grad_norm_var": 0.06950581868489583, + "learning_rate": 0.0001, + "loss": 6.0367, + "loss/crossentropy": 2.587893486022949, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1901942640542984, + "step": 6798 + }, + { + "epoch": 0.2125, + "grad_norm": 3.53125, + "grad_norm_var": 0.083251953125, + "learning_rate": 0.0001, + "loss": 5.9106, + "loss/crossentropy": 2.4920233488082886, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18404901027679443, + "step": 6800 + }, + { + "epoch": 0.2125625, + "grad_norm": 4.09375, + "grad_norm_var": 0.09627176920572916, + "learning_rate": 0.0001, + "loss": 6.1437, + "loss/crossentropy": 2.584474802017212, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19498298317193985, + "step": 6802 + }, + { + "epoch": 0.212625, + "grad_norm": 3.703125, + "grad_norm_var": 0.19191792805989583, + "learning_rate": 0.0001, + "loss": 6.2582, + "loss/crossentropy": 2.5794581174850464, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.20303219556808472, + "step": 6804 + }, + { + "epoch": 0.2126875, + "grad_norm": 3.4375, + "grad_norm_var": 0.19745686848958333, + "learning_rate": 0.0001, + "loss": 6.0408, + "loss/crossentropy": 2.5239760875701904, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.1919121965765953, + "step": 6806 + }, + { + "epoch": 0.21275, + "grad_norm": 3.734375, + "grad_norm_var": 0.19284566243489584, + "learning_rate": 0.0001, + "loss": 6.6782, + "loss/crossentropy": 2.947417378425598, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.2113574743270874, + "step": 6808 + }, + { + "epoch": 0.2128125, + "grad_norm": 3.34375, + "grad_norm_var": 0.2046051025390625, + "learning_rate": 0.0001, + "loss": 5.7501, + "loss/crossentropy": 2.3616881370544434, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.17907945811748505, + "step": 6810 + }, + { + "epoch": 0.212875, + "grad_norm": 3.375, + "grad_norm_var": 0.2069488525390625, + "learning_rate": 0.0001, + "loss": 6.2424, + "loss/crossentropy": 2.6910303831100464, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19224313646554947, + "step": 6812 + }, + { + "epoch": 0.2129375, + "grad_norm": 4.15625, + "grad_norm_var": 0.20847066243489584, + "learning_rate": 0.0001, + "loss": 6.5023, + "loss/crossentropy": 2.777039647102356, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.21041592955589294, + "step": 6814 + }, + { + "epoch": 0.213, + "grad_norm": 4.5625, + "grad_norm_var": 0.23484700520833332, + "learning_rate": 0.0001, + "loss": 5.8413, + "loss/crossentropy": 2.267647624015808, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19603300094604492, + "step": 6816 + }, + { + "epoch": 0.2130625, + "grad_norm": 3.609375, + "grad_norm_var": 0.23153889973958333, + "learning_rate": 0.0001, + "loss": 6.3894, + "loss/crossentropy": 2.82620370388031, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19576887786388397, + "step": 6818 + }, + { + "epoch": 0.213125, + "grad_norm": 3.53125, + "grad_norm_var": 0.14072977701822917, + "learning_rate": 0.0001, + "loss": 6.4969, + "loss/crossentropy": 2.78303325176239, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.2135787233710289, + "step": 6820 + }, + { + "epoch": 0.2131875, + "grad_norm": 3.109375, + "grad_norm_var": 0.15377197265625, + "learning_rate": 0.0001, + "loss": 6.1589, + "loss/crossentropy": 2.6030240058898926, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19465148448944092, + "step": 6822 + }, + { + "epoch": 0.21325, + "grad_norm": 3.359375, + "grad_norm_var": 0.16599833170572917, + "learning_rate": 0.0001, + "loss": 6.0984, + "loss/crossentropy": 2.5625205039978027, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19538921862840652, + "step": 6824 + }, + { + "epoch": 0.2133125, + "grad_norm": 3.6875, + "grad_norm_var": 0.14905192057291666, + "learning_rate": 0.0001, + "loss": 6.186, + "loss/crossentropy": 2.593506693840027, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19987186789512634, + "step": 6826 + }, + { + "epoch": 0.213375, + "grad_norm": 3.671875, + "grad_norm_var": 0.1437164306640625, + "learning_rate": 0.0001, + "loss": 6.2935, + "loss/crossentropy": 2.6629804372787476, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.20250384509563446, + "step": 6828 + }, + { + "epoch": 0.2134375, + "grad_norm": 3.203125, + "grad_norm_var": 0.14336649576822916, + "learning_rate": 0.0001, + "loss": 6.0402, + "loss/crossentropy": 2.5341413021087646, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.18966825306415558, + "step": 6830 + }, + { + "epoch": 0.2135, + "grad_norm": 4.03125, + "grad_norm_var": 0.09088134765625, + "learning_rate": 0.0001, + "loss": 6.2665, + "loss/crossentropy": 2.608793616294861, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20171182602643967, + "step": 6832 + }, + { + "epoch": 0.2135625, + "grad_norm": 3.4375, + "grad_norm_var": 0.08474934895833333, + "learning_rate": 0.0001, + "loss": 6.3257, + "loss/crossentropy": 2.6409130096435547, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.2075371891260147, + "step": 6834 + }, + { + "epoch": 0.213625, + "grad_norm": 3.46875, + "grad_norm_var": 0.06357014973958333, + "learning_rate": 0.0001, + "loss": 6.1442, + "loss/crossentropy": 2.496389865875244, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.2026677504181862, + "step": 6836 + }, + { + "epoch": 0.2136875, + "grad_norm": 3.6875, + "grad_norm_var": 0.0514801025390625, + "learning_rate": 0.0001, + "loss": 6.1189, + "loss/crossentropy": 2.5822921991348267, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19311807304620743, + "step": 6838 + }, + { + "epoch": 0.21375, + "grad_norm": 3.78125, + "grad_norm_var": 0.0456939697265625, + "learning_rate": 0.0001, + "loss": 6.2426, + "loss/crossentropy": 2.5685629844665527, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.20763982832431793, + "step": 6840 + }, + { + "epoch": 0.2138125, + "grad_norm": 3.359375, + "grad_norm_var": 0.0572418212890625, + "learning_rate": 0.0001, + "loss": 6.0465, + "loss/crossentropy": 2.61044180393219, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.19047950953245163, + "step": 6842 + }, + { + "epoch": 0.213875, + "grad_norm": 3.203125, + "grad_norm_var": 0.0666015625, + "learning_rate": 0.0001, + "loss": 5.6172, + "loss/crossentropy": 2.2282965183258057, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.1775653213262558, + "step": 6844 + }, + { + "epoch": 0.2139375, + "grad_norm": 3.609375, + "grad_norm_var": 0.059375, + "learning_rate": 0.0001, + "loss": 6.2345, + "loss/crossentropy": 2.593433976173401, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.2019970864057541, + "step": 6846 + }, + { + "epoch": 0.214, + "grad_norm": 3.515625, + "grad_norm_var": 0.04716695149739583, + "learning_rate": 0.0001, + "loss": 6.2703, + "loss/crossentropy": 2.7207247018814087, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.193631112575531, + "step": 6848 + }, + { + "epoch": 0.2140625, + "grad_norm": 3.25, + "grad_norm_var": 0.051350911458333336, + "learning_rate": 0.0001, + "loss": 5.868, + "loss/crossentropy": 2.4845513105392456, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18209190666675568, + "step": 6850 + }, + { + "epoch": 0.214125, + "grad_norm": 3.453125, + "grad_norm_var": 0.051813761393229164, + "learning_rate": 0.0001, + "loss": 6.0807, + "loss/crossentropy": 2.618214249610901, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19077570736408234, + "step": 6852 + }, + { + "epoch": 0.2141875, + "grad_norm": 3.578125, + "grad_norm_var": 0.03345947265625, + "learning_rate": 0.0001, + "loss": 6.0171, + "loss/crossentropy": 2.489748954772949, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.1910119280219078, + "step": 6854 + }, + { + "epoch": 0.21425, + "grad_norm": 3.5, + "grad_norm_var": 0.02760009765625, + "learning_rate": 0.0001, + "loss": 6.0291, + "loss/crossentropy": 2.6054465770721436, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18924039602279663, + "step": 6856 + }, + { + "epoch": 0.2143125, + "grad_norm": 3.515625, + "grad_norm_var": 0.035676066080729166, + "learning_rate": 0.0001, + "loss": 6.2084, + "loss/crossentropy": 2.5486491918563843, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.201907716691494, + "step": 6858 + }, + { + "epoch": 0.214375, + "grad_norm": 3.9375, + "grad_norm_var": 0.048981730143229166, + "learning_rate": 0.0001, + "loss": 6.027, + "loss/crossentropy": 2.454265594482422, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.19242677092552185, + "step": 6860 + }, + { + "epoch": 0.2144375, + "grad_norm": 3.390625, + "grad_norm_var": 0.0418365478515625, + "learning_rate": 0.0001, + "loss": 5.9576, + "loss/crossentropy": 2.468759536743164, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19107331335544586, + "step": 6862 + }, + { + "epoch": 0.2145, + "grad_norm": 3.34375, + "grad_norm_var": 0.0423980712890625, + "learning_rate": 0.0001, + "loss": 5.91, + "loss/crossentropy": 2.439316511154175, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.18730410933494568, + "step": 6864 + }, + { + "epoch": 0.2145625, + "grad_norm": 3.453125, + "grad_norm_var": 0.03689778645833333, + "learning_rate": 0.0001, + "loss": 5.901, + "loss/crossentropy": 2.439536452293396, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18872268497943878, + "step": 6866 + }, + { + "epoch": 0.214625, + "grad_norm": 3.875, + "grad_norm_var": 0.046549479166666664, + "learning_rate": 0.0001, + "loss": 6.036, + "loss/crossentropy": 2.5354151725769043, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.19497878849506378, + "step": 6868 + }, + { + "epoch": 0.2146875, + "grad_norm": 3.921875, + "grad_norm_var": 0.0598541259765625, + "learning_rate": 0.0001, + "loss": 6.3898, + "loss/crossentropy": 2.7419973611831665, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.20345129817724228, + "step": 6870 + }, + { + "epoch": 0.21475, + "grad_norm": 6.0, + "grad_norm_var": 0.43945210774739585, + "learning_rate": 0.0001, + "loss": 6.9018, + "loss/crossentropy": 2.9109256267547607, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.22916782647371292, + "step": 6872 + }, + { + "epoch": 0.2148125, + "grad_norm": 3.59375, + "grad_norm_var": 0.436181640625, + "learning_rate": 0.0001, + "loss": 6.228, + "loss/crossentropy": 2.65897274017334, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1994800567626953, + "step": 6874 + }, + { + "epoch": 0.214875, + "grad_norm": 4.125, + "grad_norm_var": 0.43635660807291665, + "learning_rate": 0.0001, + "loss": 6.2568, + "loss/crossentropy": 2.5532950162887573, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.2059016078710556, + "step": 6876 + }, + { + "epoch": 0.2149375, + "grad_norm": 3.46875, + "grad_norm_var": 0.43700764973958334, + "learning_rate": 0.0001, + "loss": 5.8958, + "loss/crossentropy": 2.4443269968032837, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.18655820935964584, + "step": 6878 + }, + { + "epoch": 0.215, + "grad_norm": 3.28125, + "grad_norm_var": 0.4430816650390625, + "learning_rate": 0.0001, + "loss": 6.0938, + "loss/crossentropy": 2.662695527076721, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.18373503535985947, + "step": 6880 + }, + { + "epoch": 0.2150625, + "grad_norm": 3.34375, + "grad_norm_var": 0.45083719889322915, + "learning_rate": 0.0001, + "loss": 6.3018, + "loss/crossentropy": 2.7749582529067993, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19487353414297104, + "step": 6882 + }, + { + "epoch": 0.215125, + "grad_norm": 3.671875, + "grad_norm_var": 0.44015299479166664, + "learning_rate": 0.0001, + "loss": 6.3429, + "loss/crossentropy": 2.737243890762329, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.1984574869275093, + "step": 6884 + }, + { + "epoch": 0.2151875, + "grad_norm": 3.578125, + "grad_norm_var": 0.42986653645833334, + "learning_rate": 0.0001, + "loss": 6.3917, + "loss/crossentropy": 2.7898651361465454, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.20119955390691757, + "step": 6886 + }, + { + "epoch": 0.21525, + "grad_norm": 3.46875, + "grad_norm_var": 0.054032389322916666, + "learning_rate": 0.0001, + "loss": 5.9249, + "loss/crossentropy": 2.4453309774398804, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.192485474050045, + "step": 6888 + }, + { + "epoch": 0.2153125, + "grad_norm": 3.140625, + "grad_norm_var": 0.057648722330729166, + "learning_rate": 0.0001, + "loss": 6.1452, + "loss/crossentropy": 2.6766172647476196, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19138825684785843, + "step": 6890 + }, + { + "epoch": 0.215375, + "grad_norm": 3.5625, + "grad_norm_var": 0.027567545572916668, + "learning_rate": 0.0001, + "loss": 6.2641, + "loss/crossentropy": 2.6406314373016357, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.19906429201364517, + "step": 6892 + }, + { + "epoch": 0.2154375, + "grad_norm": 3.90625, + "grad_norm_var": 0.04077860514322917, + "learning_rate": 0.0001, + "loss": 6.4015, + "loss/crossentropy": 2.7206833362579346, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.2067534253001213, + "step": 6894 + }, + { + "epoch": 0.2155, + "grad_norm": 3.546875, + "grad_norm_var": 0.04058837890625, + "learning_rate": 0.0001, + "loss": 5.9721, + "loss/crossentropy": 2.5484172105789185, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1861155927181244, + "step": 6896 + }, + { + "epoch": 0.2155625, + "grad_norm": 3.59375, + "grad_norm_var": 0.03752339680989583, + "learning_rate": 0.0001, + "loss": 6.0761, + "loss/crossentropy": 2.551543116569519, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19502922147512436, + "step": 6898 + }, + { + "epoch": 0.215625, + "grad_norm": 3.625, + "grad_norm_var": 0.036942545572916666, + "learning_rate": 0.0001, + "loss": 6.0563, + "loss/crossentropy": 2.5941959619522095, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18995870649814606, + "step": 6900 + }, + { + "epoch": 0.2156875, + "grad_norm": 3.421875, + "grad_norm_var": 0.03948567708333333, + "learning_rate": 0.0001, + "loss": 5.8376, + "loss/crossentropy": 2.4501700401306152, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18366114050149918, + "step": 6902 + }, + { + "epoch": 0.21575, + "grad_norm": 3.734375, + "grad_norm_var": 0.0407135009765625, + "learning_rate": 0.0001, + "loss": 6.1904, + "loss/crossentropy": 2.533998727798462, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.1996278166770935, + "step": 6904 + }, + { + "epoch": 0.2158125, + "grad_norm": 3.890625, + "grad_norm_var": 0.03933919270833333, + "learning_rate": 0.0001, + "loss": 6.2574, + "loss/crossentropy": 2.6609431505203247, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.19753922522068024, + "step": 6906 + }, + { + "epoch": 0.215875, + "grad_norm": 3.71875, + "grad_norm_var": 0.10784098307291666, + "learning_rate": 0.0001, + "loss": 6.211, + "loss/crossentropy": 2.457728385925293, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.21009515970945358, + "step": 6908 + }, + { + "epoch": 0.2159375, + "grad_norm": 4.5, + "grad_norm_var": 0.15359598795572918, + "learning_rate": 0.0001, + "loss": 6.5915, + "loss/crossentropy": 2.7991663217544556, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.21477922797203064, + "step": 6910 + }, + { + "epoch": 0.216, + "grad_norm": 3.453125, + "grad_norm_var": 0.14563395182291666, + "learning_rate": 0.0001, + "loss": 6.4119, + "loss/crossentropy": 2.751275420188904, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.20707565546035767, + "step": 6912 + }, + { + "epoch": 0.2160625, + "grad_norm": 3.421875, + "grad_norm_var": 0.15192769368489584, + "learning_rate": 0.0001, + "loss": 6.0199, + "loss/crossentropy": 2.5404539108276367, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1924763321876526, + "step": 6914 + }, + { + "epoch": 0.216125, + "grad_norm": 3.4375, + "grad_norm_var": 0.15031636555989583, + "learning_rate": 0.0001, + "loss": 6.4459, + "loss/crossentropy": 2.7554389238357544, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20654761791229248, + "step": 6916 + }, + { + "epoch": 0.2161875, + "grad_norm": 4.0, + "grad_norm_var": 0.13528645833333333, + "learning_rate": 0.0001, + "loss": 6.662, + "loss/crossentropy": 2.817233920097351, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.21885564178228378, + "step": 6918 + }, + { + "epoch": 0.21625, + "grad_norm": 3.546875, + "grad_norm_var": 0.13834635416666666, + "learning_rate": 0.0001, + "loss": 6.2551, + "loss/crossentropy": 2.618025064468384, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.20003525912761688, + "step": 6920 + }, + { + "epoch": 0.2163125, + "grad_norm": 3.53125, + "grad_norm_var": 0.15013020833333332, + "learning_rate": 0.0001, + "loss": 5.8261, + "loss/crossentropy": 2.4295458793640137, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18496494740247726, + "step": 6922 + }, + { + "epoch": 0.216375, + "grad_norm": 3.421875, + "grad_norm_var": 0.0981353759765625, + "learning_rate": 0.0001, + "loss": 6.0448, + "loss/crossentropy": 2.54641056060791, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19007324427366257, + "step": 6924 + }, + { + "epoch": 0.2164375, + "grad_norm": 3.421875, + "grad_norm_var": 0.0444488525390625, + "learning_rate": 0.0001, + "loss": 6.3752, + "loss/crossentropy": 2.781325340270996, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.1976737380027771, + "step": 6926 + }, + { + "epoch": 0.2165, + "grad_norm": 3.65625, + "grad_norm_var": 0.046923828125, + "learning_rate": 0.0001, + "loss": 6.2686, + "loss/crossentropy": 2.6794657707214355, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19875822216272354, + "step": 6928 + }, + { + "epoch": 0.2165625, + "grad_norm": 6.3125, + "grad_norm_var": 0.52379150390625, + "learning_rate": 0.0001, + "loss": 6.0096, + "loss/crossentropy": 2.478939652442932, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.1917371302843094, + "step": 6930 + }, + { + "epoch": 0.216625, + "grad_norm": 3.75, + "grad_norm_var": 0.5166005452473958, + "learning_rate": 0.0001, + "loss": 6.2988, + "loss/crossentropy": 2.561856746673584, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.20572340488433838, + "step": 6932 + }, + { + "epoch": 0.2166875, + "grad_norm": 3.359375, + "grad_norm_var": 0.5197550455729166, + "learning_rate": 0.0001, + "loss": 6.2968, + "loss/crossentropy": 2.6517215967178345, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20200955122709274, + "step": 6934 + }, + { + "epoch": 0.21675, + "grad_norm": 3.28125, + "grad_norm_var": 0.5318023681640625, + "learning_rate": 0.0001, + "loss": 5.8895, + "loss/crossentropy": 2.4591037034988403, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18561290949583054, + "step": 6936 + }, + { + "epoch": 0.2168125, + "grad_norm": 3.359375, + "grad_norm_var": 0.5356730143229167, + "learning_rate": 0.0001, + "loss": 5.8998, + "loss/crossentropy": 2.508857011795044, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1859700232744217, + "step": 6938 + }, + { + "epoch": 0.216875, + "grad_norm": 3.40625, + "grad_norm_var": 0.5375935872395833, + "learning_rate": 0.0001, + "loss": 6.1059, + "loss/crossentropy": 2.579146146774292, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19368895143270493, + "step": 6940 + }, + { + "epoch": 0.2169375, + "grad_norm": 4.28125, + "grad_norm_var": 0.5642649332682291, + "learning_rate": 0.0001, + "loss": 6.336, + "loss/crossentropy": 2.79544997215271, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1978062242269516, + "step": 6942 + }, + { + "epoch": 0.217, + "grad_norm": 4.5625, + "grad_norm_var": 0.59810791015625, + "learning_rate": 0.0001, + "loss": 6.2227, + "loss/crossentropy": 2.620998740196228, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.19688712805509567, + "step": 6944 + }, + { + "epoch": 0.2170625, + "grad_norm": 3.375, + "grad_norm_var": 0.14003804524739583, + "learning_rate": 0.0001, + "loss": 6.1529, + "loss/crossentropy": 2.6646175384521484, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19413743168115616, + "step": 6946 + }, + { + "epoch": 0.217125, + "grad_norm": 3.859375, + "grad_norm_var": 0.14371744791666666, + "learning_rate": 0.0001, + "loss": 6.4361, + "loss/crossentropy": 2.7347337007522583, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.2095884531736374, + "step": 6948 + }, + { + "epoch": 0.2171875, + "grad_norm": 4.0, + "grad_norm_var": 0.14880269368489582, + "learning_rate": 0.0001, + "loss": 6.2403, + "loss/crossentropy": 2.5746344327926636, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20328227430582047, + "step": 6950 + }, + { + "epoch": 0.21725, + "grad_norm": 3.484375, + "grad_norm_var": 0.1431549072265625, + "learning_rate": 0.0001, + "loss": 6.3584, + "loss/crossentropy": 2.7034659385681152, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20260456204414368, + "step": 6952 + }, + { + "epoch": 0.2173125, + "grad_norm": 3.15625, + "grad_norm_var": 0.15107014973958333, + "learning_rate": 0.0001, + "loss": 6.0041, + "loss/crossentropy": 2.5409141778945923, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19006993621587753, + "step": 6954 + }, + { + "epoch": 0.217375, + "grad_norm": 4.125, + "grad_norm_var": 0.16077473958333333, + "learning_rate": 0.0001, + "loss": 6.1016, + "loss/crossentropy": 2.4567782878875732, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.20276756584644318, + "step": 6956 + }, + { + "epoch": 0.2174375, + "grad_norm": 4.53125, + "grad_norm_var": 0.172412109375, + "learning_rate": 0.0001, + "loss": 6.2712, + "loss/crossentropy": 2.551652431488037, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.2063276246190071, + "step": 6958 + }, + { + "epoch": 0.2175, + "grad_norm": 3.5625, + "grad_norm_var": 0.1279937744140625, + "learning_rate": 0.0001, + "loss": 6.1276, + "loss/crossentropy": 2.6114827394485474, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.1934124454855919, + "step": 6960 + }, + { + "epoch": 0.2175625, + "grad_norm": 4.03125, + "grad_norm_var": 0.15374247233072916, + "learning_rate": 0.0001, + "loss": 6.3278, + "loss/crossentropy": 2.5706037282943726, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.20931678265333176, + "step": 6962 + }, + { + "epoch": 0.217625, + "grad_norm": 3.25, + "grad_norm_var": 0.173583984375, + "learning_rate": 0.0001, + "loss": 6.1378, + "loss/crossentropy": 2.5814859867095947, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.20016635954380035, + "step": 6964 + }, + { + "epoch": 0.2176875, + "grad_norm": 3.90625, + "grad_norm_var": 0.1806060791015625, + "learning_rate": 0.0001, + "loss": 6.5318, + "loss/crossentropy": 2.6615020036697388, + "loss/hidden": 1.70703125, + "loss/jsd": 0.0, + "loss/logits": 0.21632423251867294, + "step": 6966 + }, + { + "epoch": 0.21775, + "grad_norm": 3.671875, + "grad_norm_var": 0.17083333333333334, + "learning_rate": 0.0001, + "loss": 6.1315, + "loss/crossentropy": 2.544968843460083, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.20005465298891068, + "step": 6968 + }, + { + "epoch": 0.2178125, + "grad_norm": 4.09375, + "grad_norm_var": 0.13464253743489582, + "learning_rate": 0.0001, + "loss": 6.7397, + "loss/crossentropy": 2.9408375024795532, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.21113675087690353, + "step": 6970 + }, + { + "epoch": 0.217875, + "grad_norm": 3.390625, + "grad_norm_var": 0.1431640625, + "learning_rate": 0.0001, + "loss": 6.1237, + "loss/crossentropy": 2.596482753753662, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19256656616926193, + "step": 6972 + }, + { + "epoch": 0.2179375, + "grad_norm": 3.484375, + "grad_norm_var": 0.11965230305989584, + "learning_rate": 0.0001, + "loss": 6.2645, + "loss/crossentropy": 2.7255892753601074, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19803231209516525, + "step": 6974 + }, + { + "epoch": 0.218, + "grad_norm": 3.03125, + "grad_norm_var": 0.1577301025390625, + "learning_rate": 0.0001, + "loss": 5.6588, + "loss/crossentropy": 2.37584125995636, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17439134418964386, + "step": 6976 + }, + { + "epoch": 0.2180625, + "grad_norm": 3.59375, + "grad_norm_var": 0.11507161458333333, + "learning_rate": 0.0001, + "loss": 6.0221, + "loss/crossentropy": 2.602111339569092, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18770533800125122, + "step": 6978 + }, + { + "epoch": 0.218125, + "grad_norm": 3.34375, + "grad_norm_var": 0.11194559733072916, + "learning_rate": 0.0001, + "loss": 5.8931, + "loss/crossentropy": 2.467549204826355, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.18161573261022568, + "step": 6980 + }, + { + "epoch": 0.2181875, + "grad_norm": 3.484375, + "grad_norm_var": 0.07280985514322917, + "learning_rate": 0.0001, + "loss": 6.1829, + "loss/crossentropy": 2.5902645587921143, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19793671369552612, + "step": 6982 + }, + { + "epoch": 0.21825, + "grad_norm": 3.59375, + "grad_norm_var": 0.07154541015625, + "learning_rate": 0.0001, + "loss": 6.2253, + "loss/crossentropy": 2.642678380012512, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.19654599577188492, + "step": 6984 + }, + { + "epoch": 0.2183125, + "grad_norm": 3.609375, + "grad_norm_var": 0.030134073893229165, + "learning_rate": 0.0001, + "loss": 6.1163, + "loss/crossentropy": 2.5914446115493774, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19467195123434067, + "step": 6986 + }, + { + "epoch": 0.218375, + "grad_norm": 3.671875, + "grad_norm_var": 0.03561197916666667, + "learning_rate": 0.0001, + "loss": 5.5533, + "loss/crossentropy": 2.160017430782318, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18190325796604156, + "step": 6988 + }, + { + "epoch": 0.2184375, + "grad_norm": 3.734375, + "grad_norm_var": 0.043619791666666664, + "learning_rate": 0.0001, + "loss": 6.1725, + "loss/crossentropy": 2.5211633443832397, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.19989459216594696, + "step": 6990 + }, + { + "epoch": 0.2185, + "grad_norm": 3.359375, + "grad_norm_var": 0.031981404622395834, + "learning_rate": 0.0001, + "loss": 6.0217, + "loss/crossentropy": 2.500148296356201, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19590522348880768, + "step": 6992 + }, + { + "epoch": 0.2185625, + "grad_norm": 3.5, + "grad_norm_var": 0.025764973958333333, + "learning_rate": 0.0001, + "loss": 6.2181, + "loss/crossentropy": 2.735214591026306, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19008450955152512, + "step": 6994 + }, + { + "epoch": 0.218625, + "grad_norm": 4.21875, + "grad_norm_var": 0.04881083170572917, + "learning_rate": 0.0001, + "loss": 6.3318, + "loss/crossentropy": 2.646474242210388, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.2083786502480507, + "step": 6996 + }, + { + "epoch": 0.2186875, + "grad_norm": 3.484375, + "grad_norm_var": 0.05103759765625, + "learning_rate": 0.0001, + "loss": 6.0873, + "loss/crossentropy": 2.5000009536743164, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19584060460329056, + "step": 6998 + }, + { + "epoch": 0.21875, + "grad_norm": 8.0625, + "grad_norm_var": 1.3062825520833334, + "learning_rate": 0.0001, + "loss": 6.2512, + "loss/crossentropy": 2.6957199573516846, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19656287878751755, + "step": 7000 + }, + { + "epoch": 0.2188125, + "grad_norm": 4.34375, + "grad_norm_var": 1.2825592041015625, + "learning_rate": 0.0001, + "loss": 6.5302, + "loss/crossentropy": 2.7317248582839966, + "loss/hidden": 1.6875, + "loss/jsd": 0.0, + "loss/logits": 0.2110992819070816, + "step": 7002 + }, + { + "epoch": 0.218875, + "grad_norm": 3.40625, + "grad_norm_var": 11.030924479166666, + "learning_rate": 0.0001, + "loss": 6.7224, + "loss/crossentropy": 2.6649667024612427, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.24284875392913818, + "step": 7004 + }, + { + "epoch": 0.2189375, + "grad_norm": 3.953125, + "grad_norm_var": 11.0185546875, + "learning_rate": 0.0001, + "loss": 6.2481, + "loss/crossentropy": 2.5178329944610596, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.21130909025669098, + "step": 7006 + }, + { + "epoch": 0.219, + "grad_norm": 3.53125, + "grad_norm_var": 10.85338134765625, + "learning_rate": 0.0001, + "loss": 6.3382, + "loss/crossentropy": 2.545018792152405, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.21486765146255493, + "step": 7008 + }, + { + "epoch": 0.2190625, + "grad_norm": 3.609375, + "grad_norm_var": 10.790208943684895, + "learning_rate": 0.0001, + "loss": 6.0454, + "loss/crossentropy": 2.5478323698043823, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19077441096305847, + "step": 7010 + }, + { + "epoch": 0.219125, + "grad_norm": 3.796875, + "grad_norm_var": 10.765526326497396, + "learning_rate": 0.0001, + "loss": 6.4756, + "loss/crossentropy": 2.6919052600860596, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.21547965705394745, + "step": 7012 + }, + { + "epoch": 0.2191875, + "grad_norm": 3.359375, + "grad_norm_var": 10.698778279622395, + "learning_rate": 0.0001, + "loss": 6.6294, + "loss/crossentropy": 2.824127435684204, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.21451301872730255, + "step": 7014 + }, + { + "epoch": 0.21925, + "grad_norm": 3.1875, + "grad_norm_var": 10.271480305989583, + "learning_rate": 0.0001, + "loss": 5.8119, + "loss/crossentropy": 2.401167631149292, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18638114631175995, + "step": 7016 + }, + { + "epoch": 0.2193125, + "grad_norm": 3.40625, + "grad_norm_var": 10.422298177083333, + "learning_rate": 0.0001, + "loss": 6.2321, + "loss/crossentropy": 2.6169735193252563, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.20330799371004105, + "step": 7018 + }, + { + "epoch": 0.219375, + "grad_norm": 3.53125, + "grad_norm_var": 0.21865234375, + "learning_rate": 0.0001, + "loss": 6.2863, + "loss/crossentropy": 2.6988236904144287, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19937185943126678, + "step": 7020 + }, + { + "epoch": 0.2194375, + "grad_norm": 5.09375, + "grad_norm_var": 0.33622639973958335, + "learning_rate": 0.0001, + "loss": 6.0942, + "loss/crossentropy": 2.487640380859375, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.20284540206193924, + "step": 7022 + }, + { + "epoch": 0.2195, + "grad_norm": 3.953125, + "grad_norm_var": 0.2670806884765625, + "learning_rate": 0.0001, + "loss": 6.4727, + "loss/crossentropy": 2.6979743242263794, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.212629072368145, + "step": 7024 + }, + { + "epoch": 0.2195625, + "grad_norm": 3.6875, + "grad_norm_var": 0.26594645182291665, + "learning_rate": 0.0001, + "loss": 6.6105, + "loss/crossentropy": 2.811803102493286, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.2138504534959793, + "step": 7026 + }, + { + "epoch": 0.219625, + "grad_norm": 3.296875, + "grad_norm_var": 0.27529296875, + "learning_rate": 0.0001, + "loss": 6.16, + "loss/crossentropy": 2.5398292541503906, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.20225611329078674, + "step": 7028 + }, + { + "epoch": 0.2196875, + "grad_norm": 3.1875, + "grad_norm_var": 0.2148101806640625, + "learning_rate": 0.0001, + "loss": 5.8806, + "loss/crossentropy": 2.467816948890686, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18659129738807678, + "step": 7030 + }, + { + "epoch": 0.21975, + "grad_norm": 3.59375, + "grad_norm_var": 0.22088216145833334, + "learning_rate": 0.0001, + "loss": 6.1715, + "loss/crossentropy": 2.4942638874053955, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.201707124710083, + "step": 7032 + }, + { + "epoch": 0.2198125, + "grad_norm": 3.3125, + "grad_norm_var": 0.22753499348958334, + "learning_rate": 0.0001, + "loss": 5.9798, + "loss/crossentropy": 2.5174331665039062, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19038093090057373, + "step": 7034 + }, + { + "epoch": 0.219875, + "grad_norm": 3.25, + "grad_norm_var": 0.2456207275390625, + "learning_rate": 0.0001, + "loss": 6.161, + "loss/crossentropy": 2.651417851448059, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1939275935292244, + "step": 7036 + }, + { + "epoch": 0.2199375, + "grad_norm": 3.015625, + "grad_norm_var": 0.1223297119140625, + "learning_rate": 0.0001, + "loss": 6.1559, + "loss/crossentropy": 2.6518659591674805, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.18985295295715332, + "step": 7038 + }, + { + "epoch": 0.22, + "grad_norm": 3.671875, + "grad_norm_var": 0.11574605305989584, + "learning_rate": 0.0001, + "loss": 5.7382, + "loss/crossentropy": 2.3332555294036865, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.17877569794654846, + "step": 7040 + }, + { + "epoch": 0.2200625, + "grad_norm": 3.390625, + "grad_norm_var": 0.10338134765625, + "learning_rate": 0.0001, + "loss": 5.9545, + "loss/crossentropy": 2.5254757404327393, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1839166134595871, + "step": 7042 + }, + { + "epoch": 0.220125, + "grad_norm": 3.359375, + "grad_norm_var": 0.09316304524739584, + "learning_rate": 0.0001, + "loss": 5.7698, + "loss/crossentropy": 2.3414753675460815, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18501769751310349, + "step": 7044 + }, + { + "epoch": 0.2201875, + "grad_norm": 3.203125, + "grad_norm_var": 0.09169921875, + "learning_rate": 0.0001, + "loss": 6.0464, + "loss/crossentropy": 2.503966808319092, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.1929159015417099, + "step": 7046 + }, + { + "epoch": 0.22025, + "grad_norm": 3.40625, + "grad_norm_var": 0.0310943603515625, + "learning_rate": 0.0001, + "loss": 5.8648, + "loss/crossentropy": 2.4673224687576294, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.178032785654068, + "step": 7048 + }, + { + "epoch": 0.2203125, + "grad_norm": 3.703125, + "grad_norm_var": 0.03764546712239583, + "learning_rate": 0.0001, + "loss": 6.4858, + "loss/crossentropy": 2.8963719606399536, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19605228304862976, + "step": 7050 + }, + { + "epoch": 0.220375, + "grad_norm": 3.390625, + "grad_norm_var": 0.041291300455729166, + "learning_rate": 0.0001, + "loss": 6.3703, + "loss/crossentropy": 2.7580485343933105, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.20107314735651016, + "step": 7052 + }, + { + "epoch": 0.2204375, + "grad_norm": 3.59375, + "grad_norm_var": 0.03156636555989583, + "learning_rate": 0.0001, + "loss": 6.1383, + "loss/crossentropy": 2.636568784713745, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19196929037570953, + "step": 7054 + }, + { + "epoch": 0.2205, + "grad_norm": 3.890625, + "grad_norm_var": 0.03916015625, + "learning_rate": 0.0001, + "loss": 6.3322, + "loss/crossentropy": 2.6917834281921387, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.19997604191303253, + "step": 7056 + }, + { + "epoch": 0.2205625, + "grad_norm": 3.609375, + "grad_norm_var": 0.043366495768229166, + "learning_rate": 0.0001, + "loss": 6.0525, + "loss/crossentropy": 2.5482877492904663, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.188310407102108, + "step": 7058 + }, + { + "epoch": 0.220625, + "grad_norm": 3.53125, + "grad_norm_var": 0.06220296223958333, + "learning_rate": 0.0001, + "loss": 6.1149, + "loss/crossentropy": 2.491082549095154, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.20183787494897842, + "step": 7060 + }, + { + "epoch": 0.2206875, + "grad_norm": 3.90625, + "grad_norm_var": 0.06349283854166667, + "learning_rate": 0.0001, + "loss": 6.5998, + "loss/crossentropy": 2.760351061820984, + "loss/hidden": 1.69921875, + "loss/jsd": 0.0, + "loss/logits": 0.21402332931756973, + "step": 7062 + }, + { + "epoch": 0.22075, + "grad_norm": 3.4375, + "grad_norm_var": 0.05373433430989583, + "learning_rate": 0.0001, + "loss": 6.11, + "loss/crossentropy": 2.5462125539779663, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19739294797182083, + "step": 7064 + }, + { + "epoch": 0.2208125, + "grad_norm": 3.3125, + "grad_norm_var": 0.061376953125, + "learning_rate": 0.0001, + "loss": 6.0224, + "loss/crossentropy": 2.554285407066345, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18977734446525574, + "step": 7066 + }, + { + "epoch": 0.220875, + "grad_norm": 3.546875, + "grad_norm_var": 0.05858968098958333, + "learning_rate": 0.0001, + "loss": 6.373, + "loss/crossentropy": 2.7668060064315796, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.20319905132055283, + "step": 7068 + }, + { + "epoch": 0.2209375, + "grad_norm": 3.265625, + "grad_norm_var": 0.060301717122395834, + "learning_rate": 0.0001, + "loss": 6.1924, + "loss/crossentropy": 2.679903984069824, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19226178526878357, + "step": 7070 + }, + { + "epoch": 0.221, + "grad_norm": 3.25, + "grad_norm_var": 0.054255167643229164, + "learning_rate": 0.0001, + "loss": 6.0998, + "loss/crossentropy": 2.584157705307007, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19062745571136475, + "step": 7072 + }, + { + "epoch": 0.2210625, + "grad_norm": 3.5, + "grad_norm_var": 0.051935831705729164, + "learning_rate": 0.0001, + "loss": 6.153, + "loss/crossentropy": 2.6092441082000732, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19539161026477814, + "step": 7074 + }, + { + "epoch": 0.221125, + "grad_norm": 3.625, + "grad_norm_var": 0.03181864420572917, + "learning_rate": 0.0001, + "loss": 6.1599, + "loss/crossentropy": 2.556912899017334, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.19624081999063492, + "step": 7076 + }, + { + "epoch": 0.2211875, + "grad_norm": 3.515625, + "grad_norm_var": 0.018355305989583334, + "learning_rate": 0.0001, + "loss": 6.3478, + "loss/crossentropy": 2.6942012310028076, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.20754989236593246, + "step": 7078 + }, + { + "epoch": 0.22125, + "grad_norm": 3.625, + "grad_norm_var": 0.022102864583333333, + "learning_rate": 0.0001, + "loss": 6.1846, + "loss/crossentropy": 2.581602454185486, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.20092745125293732, + "step": 7080 + }, + { + "epoch": 0.2213125, + "grad_norm": 3.546875, + "grad_norm_var": 0.03290608723958333, + "learning_rate": 0.0001, + "loss": 5.9697, + "loss/crossentropy": 2.4440085887908936, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19787990301847458, + "step": 7082 + }, + { + "epoch": 0.221375, + "grad_norm": 3.96875, + "grad_norm_var": 0.04449462890625, + "learning_rate": 0.0001, + "loss": 6.4231, + "loss/crossentropy": 2.677451968193054, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.21167093515396118, + "step": 7084 + }, + { + "epoch": 0.2214375, + "grad_norm": 3.921875, + "grad_norm_var": 0.0378326416015625, + "learning_rate": 0.0001, + "loss": 6.5638, + "loss/crossentropy": 2.8133574724197388, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.20863892138004303, + "step": 7086 + }, + { + "epoch": 0.2215, + "grad_norm": 3.65625, + "grad_norm_var": 0.0268218994140625, + "learning_rate": 0.0001, + "loss": 6.1669, + "loss/crossentropy": 2.544797658920288, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.20283634215593338, + "step": 7088 + }, + { + "epoch": 0.2215625, + "grad_norm": 3.5625, + "grad_norm_var": 0.032124837239583336, + "learning_rate": 0.0001, + "loss": 5.9956, + "loss/crossentropy": 2.562207341194153, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1878679394721985, + "step": 7090 + }, + { + "epoch": 0.221625, + "grad_norm": 3.65625, + "grad_norm_var": 0.73599853515625, + "learning_rate": 0.0001, + "loss": 6.2003, + "loss/crossentropy": 2.454803466796875, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.20970892161130905, + "step": 7092 + }, + { + "epoch": 0.2216875, + "grad_norm": 3.71875, + "grad_norm_var": 0.7364217122395833, + "learning_rate": 0.0001, + "loss": 6.4424, + "loss/crossentropy": 2.7469085454940796, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20665527135133743, + "step": 7094 + }, + { + "epoch": 0.22175, + "grad_norm": 3.453125, + "grad_norm_var": 0.7474843343098958, + "learning_rate": 0.0001, + "loss": 5.8923, + "loss/crossentropy": 2.3767290115356445, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.19647910445928574, + "step": 7096 + }, + { + "epoch": 0.2218125, + "grad_norm": 3.40625, + "grad_norm_var": 0.7739491780598958, + "learning_rate": 0.0001, + "loss": 5.7138, + "loss/crossentropy": 2.3209710121154785, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18537429720163345, + "step": 7098 + }, + { + "epoch": 0.221875, + "grad_norm": 3.953125, + "grad_norm_var": 0.7713175455729167, + "learning_rate": 0.0001, + "loss": 6.4804, + "loss/crossentropy": 2.6829233169555664, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.21334099769592285, + "step": 7100 + }, + { + "epoch": 0.2219375, + "grad_norm": 3.6875, + "grad_norm_var": 0.7812001546223958, + "learning_rate": 0.0001, + "loss": 6.408, + "loss/crossentropy": 2.7748000621795654, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.20159757882356644, + "step": 7102 + }, + { + "epoch": 0.222, + "grad_norm": 3.328125, + "grad_norm_var": 0.8007771809895833, + "learning_rate": 0.0001, + "loss": 6.3518, + "loss/crossentropy": 2.754671096801758, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.19643227010965347, + "step": 7104 + }, + { + "epoch": 0.2220625, + "grad_norm": 3.359375, + "grad_norm_var": 0.795263671875, + "learning_rate": 0.0001, + "loss": 5.9993, + "loss/crossentropy": 2.408530831336975, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.19658169895410538, + "step": 7106 + }, + { + "epoch": 0.222125, + "grad_norm": 3.453125, + "grad_norm_var": 0.0819244384765625, + "learning_rate": 0.0001, + "loss": 6.3234, + "loss/crossentropy": 2.7514824867248535, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19742165505886078, + "step": 7108 + }, + { + "epoch": 0.2221875, + "grad_norm": 3.5, + "grad_norm_var": 0.058381144205729166, + "learning_rate": 0.0001, + "loss": 5.6316, + "loss/crossentropy": 2.305321216583252, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17950443178415298, + "step": 7110 + }, + { + "epoch": 0.22225, + "grad_norm": 3.125, + "grad_norm_var": 0.06800028483072916, + "learning_rate": 0.0001, + "loss": 5.7636, + "loss/crossentropy": 2.4000433683395386, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.1769847720861435, + "step": 7112 + }, + { + "epoch": 0.2223125, + "grad_norm": 4.0, + "grad_norm_var": 0.07986551920572917, + "learning_rate": 0.0001, + "loss": 6.4122, + "loss/crossentropy": 2.672022819519043, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.20683379471302032, + "step": 7114 + }, + { + "epoch": 0.222375, + "grad_norm": 3.828125, + "grad_norm_var": 0.0757720947265625, + "learning_rate": 0.0001, + "loss": 6.4042, + "loss/crossentropy": 2.7689915895462036, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.2037556767463684, + "step": 7116 + }, + { + "epoch": 0.2224375, + "grad_norm": 4.0625, + "grad_norm_var": 0.091796875, + "learning_rate": 0.0001, + "loss": 6.0827, + "loss/crossentropy": 2.5164963006973267, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19842083007097244, + "step": 7118 + }, + { + "epoch": 0.2225, + "grad_norm": 3.578125, + "grad_norm_var": 0.0732421875, + "learning_rate": 0.0001, + "loss": 5.9162, + "loss/crossentropy": 2.453752040863037, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19038930535316467, + "step": 7120 + }, + { + "epoch": 0.2225625, + "grad_norm": 3.578125, + "grad_norm_var": 0.07088216145833333, + "learning_rate": 0.0001, + "loss": 6.3235, + "loss/crossentropy": 2.7482842206954956, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.1981423869729042, + "step": 7122 + }, + { + "epoch": 0.222625, + "grad_norm": 4.125, + "grad_norm_var": 0.09062093098958333, + "learning_rate": 0.0001, + "loss": 6.4008, + "loss/crossentropy": 2.5867438316345215, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.21460527181625366, + "step": 7124 + }, + { + "epoch": 0.2226875, + "grad_norm": 3.59375, + "grad_norm_var": 0.08170166015625, + "learning_rate": 0.0001, + "loss": 5.7468, + "loss/crossentropy": 2.335509777069092, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18683339655399323, + "step": 7126 + }, + { + "epoch": 0.22275, + "grad_norm": 3.453125, + "grad_norm_var": 0.06715087890625, + "learning_rate": 0.0001, + "loss": 5.9505, + "loss/crossentropy": 2.484445810317993, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.1860543116927147, + "step": 7128 + }, + { + "epoch": 0.2228125, + "grad_norm": 3.859375, + "grad_norm_var": 0.06148681640625, + "learning_rate": 0.0001, + "loss": 6.167, + "loss/crossentropy": 2.599080204963684, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19624672830104828, + "step": 7130 + }, + { + "epoch": 0.222875, + "grad_norm": 3.078125, + "grad_norm_var": 0.0745269775390625, + "learning_rate": 0.0001, + "loss": 6.1566, + "loss/crossentropy": 2.5842623710632324, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19941814243793488, + "step": 7132 + }, + { + "epoch": 0.2229375, + "grad_norm": 3.421875, + "grad_norm_var": 0.06288655598958333, + "learning_rate": 0.0001, + "loss": 5.9316, + "loss/crossentropy": 2.4942493438720703, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18983107805252075, + "step": 7134 + }, + { + "epoch": 0.223, + "grad_norm": 3.40625, + "grad_norm_var": 0.058512369791666664, + "learning_rate": 0.0001, + "loss": 6.0632, + "loss/crossentropy": 2.4725465774536133, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.19735118746757507, + "step": 7136 + }, + { + "epoch": 0.2230625, + "grad_norm": 3.390625, + "grad_norm_var": 0.06085611979166667, + "learning_rate": 0.0001, + "loss": 6.1609, + "loss/crossentropy": 2.6143925189971924, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19840151816606522, + "step": 7138 + }, + { + "epoch": 0.223125, + "grad_norm": 5.5625, + "grad_norm_var": 0.3148722330729167, + "learning_rate": 0.0001, + "loss": 6.2831, + "loss/crossentropy": 2.7380794286727905, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19746720790863037, + "step": 7140 + }, + { + "epoch": 0.2231875, + "grad_norm": 3.359375, + "grad_norm_var": 0.31502176920572916, + "learning_rate": 0.0001, + "loss": 5.8714, + "loss/crossentropy": 2.4498631954193115, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1855173036456108, + "step": 7142 + }, + { + "epoch": 0.22325, + "grad_norm": 3.515625, + "grad_norm_var": 0.3158925374348958, + "learning_rate": 0.0001, + "loss": 6.1672, + "loss/crossentropy": 2.544526696205139, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.19742655754089355, + "step": 7144 + }, + { + "epoch": 0.2233125, + "grad_norm": 3.625, + "grad_norm_var": 0.31091206868489585, + "learning_rate": 0.0001, + "loss": 6.4244, + "loss/crossentropy": 2.779419422149658, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.20395386964082718, + "step": 7146 + }, + { + "epoch": 0.223375, + "grad_norm": 3.921875, + "grad_norm_var": 0.30429280598958336, + "learning_rate": 0.0001, + "loss": 6.1574, + "loss/crossentropy": 2.537745952606201, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.2002440243959427, + "step": 7148 + }, + { + "epoch": 0.2234375, + "grad_norm": 4.375, + "grad_norm_var": 0.3318837483723958, + "learning_rate": 0.0001, + "loss": 6.4352, + "loss/crossentropy": 2.7549798488616943, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.2063073366880417, + "step": 7150 + }, + { + "epoch": 0.2235, + "grad_norm": 3.5, + "grad_norm_var": 0.327783203125, + "learning_rate": 0.0001, + "loss": 6.1109, + "loss/crossentropy": 2.532162070274353, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.20044952630996704, + "step": 7152 + }, + { + "epoch": 0.2235625, + "grad_norm": 3.65625, + "grad_norm_var": 0.3145172119140625, + "learning_rate": 0.0001, + "loss": 6.241, + "loss/crossentropy": 2.542296290397644, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20698314160108566, + "step": 7154 + }, + { + "epoch": 0.223625, + "grad_norm": 3.5, + "grad_norm_var": 0.09064127604166666, + "learning_rate": 0.0001, + "loss": 6.3842, + "loss/crossentropy": 2.649972081184387, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20779529958963394, + "step": 7156 + }, + { + "epoch": 0.2236875, + "grad_norm": 3.515625, + "grad_norm_var": 0.07971598307291666, + "learning_rate": 0.0001, + "loss": 6.0456, + "loss/crossentropy": 2.5300976037979126, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1949128583073616, + "step": 7158 + }, + { + "epoch": 0.22375, + "grad_norm": 3.90625, + "grad_norm_var": 0.0878082275390625, + "learning_rate": 0.0001, + "loss": 6.2176, + "loss/crossentropy": 2.521161913871765, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20636696368455887, + "step": 7160 + }, + { + "epoch": 0.2238125, + "grad_norm": 3.609375, + "grad_norm_var": 0.08700764973958333, + "learning_rate": 0.0001, + "loss": 6.2207, + "loss/crossentropy": 2.7186752557754517, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.1919991374015808, + "step": 7162 + }, + { + "epoch": 0.223875, + "grad_norm": 3.484375, + "grad_norm_var": 0.07935791015625, + "learning_rate": 0.0001, + "loss": 6.0661, + "loss/crossentropy": 2.516534924507141, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1987050324678421, + "step": 7164 + }, + { + "epoch": 0.2239375, + "grad_norm": 3.375, + "grad_norm_var": 0.05379130045572917, + "learning_rate": 0.0001, + "loss": 6.3041, + "loss/crossentropy": 2.608791708946228, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20624708384275436, + "step": 7166 + }, + { + "epoch": 0.224, + "grad_norm": 3.671875, + "grad_norm_var": 0.0555816650390625, + "learning_rate": 0.0001, + "loss": 6.1641, + "loss/crossentropy": 2.6959011554718018, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1878363937139511, + "step": 7168 + }, + { + "epoch": 0.2240625, + "grad_norm": 3.40625, + "grad_norm_var": 0.06803385416666667, + "learning_rate": 0.0001, + "loss": 6.0807, + "loss/crossentropy": 2.5687514543533325, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.1918240785598755, + "step": 7170 + }, + { + "epoch": 0.224125, + "grad_norm": 3.453125, + "grad_norm_var": 0.044169108072916664, + "learning_rate": 0.0001, + "loss": 6.3498, + "loss/crossentropy": 2.717841625213623, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20069392770528793, + "step": 7172 + }, + { + "epoch": 0.2241875, + "grad_norm": 3.40625, + "grad_norm_var": 0.045849609375, + "learning_rate": 0.0001, + "loss": 5.9979, + "loss/crossentropy": 2.548419952392578, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1879129856824875, + "step": 7174 + }, + { + "epoch": 0.22425, + "grad_norm": 3.65625, + "grad_norm_var": 0.043717447916666666, + "learning_rate": 0.0001, + "loss": 6.0759, + "loss/crossentropy": 2.5514897108078003, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19267520308494568, + "step": 7176 + }, + { + "epoch": 0.2243125, + "grad_norm": 3.484375, + "grad_norm_var": 0.046126302083333334, + "learning_rate": 0.0001, + "loss": 6.05, + "loss/crossentropy": 2.5378646850585938, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1945752575993538, + "step": 7178 + }, + { + "epoch": 0.224375, + "grad_norm": 3.78125, + "grad_norm_var": 0.04854227701822917, + "learning_rate": 0.0001, + "loss": 5.9998, + "loss/crossentropy": 2.5784002542495728, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.18316055834293365, + "step": 7180 + }, + { + "epoch": 0.2244375, + "grad_norm": 3.25, + "grad_norm_var": 0.047932942708333336, + "learning_rate": 0.0001, + "loss": 6.2681, + "loss/crossentropy": 2.7137949466705322, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19644546508789062, + "step": 7182 + }, + { + "epoch": 0.2245, + "grad_norm": 3.078125, + "grad_norm_var": 0.0605133056640625, + "learning_rate": 0.0001, + "loss": 5.9173, + "loss/crossentropy": 2.4534000158309937, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.18466929346323013, + "step": 7184 + }, + { + "epoch": 0.2245625, + "grad_norm": 3.921875, + "grad_norm_var": 0.06454976399739583, + "learning_rate": 0.0001, + "loss": 6.1779, + "loss/crossentropy": 2.5641443729400635, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.20044228434562683, + "step": 7186 + }, + { + "epoch": 0.224625, + "grad_norm": 4.1875, + "grad_norm_var": 0.0912109375, + "learning_rate": 0.0001, + "loss": 5.9642, + "loss/crossentropy": 2.539321780204773, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.1811591386795044, + "step": 7188 + }, + { + "epoch": 0.2246875, + "grad_norm": 3.3125, + "grad_norm_var": 0.09397379557291667, + "learning_rate": 0.0001, + "loss": 5.7936, + "loss/crossentropy": 2.4275606870651245, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1842569038271904, + "step": 7190 + }, + { + "epoch": 0.22475, + "grad_norm": 3.46875, + "grad_norm_var": 0.07342122395833334, + "learning_rate": 0.0001, + "loss": 6.1311, + "loss/crossentropy": 2.6554477214813232, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19209395349025726, + "step": 7192 + }, + { + "epoch": 0.2248125, + "grad_norm": 3.484375, + "grad_norm_var": 0.073291015625, + "learning_rate": 0.0001, + "loss": 6.0223, + "loss/crossentropy": 2.511997103691101, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19126948714256287, + "step": 7194 + }, + { + "epoch": 0.224875, + "grad_norm": 3.359375, + "grad_norm_var": 0.07537333170572917, + "learning_rate": 0.0001, + "loss": 6.327, + "loss/crossentropy": 2.757696270942688, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1979505866765976, + "step": 7196 + }, + { + "epoch": 0.2249375, + "grad_norm": 3.5, + "grad_norm_var": 0.07089436848958333, + "learning_rate": 0.0001, + "loss": 5.9437, + "loss/crossentropy": 2.507505178451538, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1858089566230774, + "step": 7198 + }, + { + "epoch": 0.225, + "grad_norm": 4.0, + "grad_norm_var": 0.08759663899739584, + "learning_rate": 0.0001, + "loss": 6.2874, + "loss/crossentropy": 2.633690595626831, + "loss/hidden": 1.7109375, + "loss/jsd": 0.0, + "loss/logits": 0.1942805051803589, + "step": 7200 + }, + { + "epoch": 0.2250625, + "grad_norm": 3.296875, + "grad_norm_var": 0.0906646728515625, + "learning_rate": 0.0001, + "loss": 6.237, + "loss/crossentropy": 2.6839088201522827, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19710107892751694, + "step": 7202 + }, + { + "epoch": 0.225125, + "grad_norm": 3.3125, + "grad_norm_var": 0.06440327962239584, + "learning_rate": 0.0001, + "loss": 6.0258, + "loss/crossentropy": 2.5818649530410767, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18736477941274643, + "step": 7204 + }, + { + "epoch": 0.2251875, + "grad_norm": 3.296875, + "grad_norm_var": 0.06496480305989584, + "learning_rate": 0.0001, + "loss": 6.2243, + "loss/crossentropy": 2.69213604927063, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19540417194366455, + "step": 7206 + }, + { + "epoch": 0.22525, + "grad_norm": 3.390625, + "grad_norm_var": 0.06997782389322917, + "learning_rate": 0.0001, + "loss": 6.408, + "loss/crossentropy": 2.735700845718384, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.20863628387451172, + "step": 7208 + }, + { + "epoch": 0.2253125, + "grad_norm": 3.59375, + "grad_norm_var": 0.10693257649739583, + "learning_rate": 0.0001, + "loss": 6.252, + "loss/crossentropy": 2.6026411056518555, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.20008762180805206, + "step": 7210 + }, + { + "epoch": 0.225375, + "grad_norm": 3.625, + "grad_norm_var": 0.10064697265625, + "learning_rate": 0.0001, + "loss": 5.8114, + "loss/crossentropy": 2.366856098175049, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.19054561108350754, + "step": 7212 + }, + { + "epoch": 0.2254375, + "grad_norm": 3.59375, + "grad_norm_var": 0.11060791015625, + "learning_rate": 0.0001, + "loss": 5.9851, + "loss/crossentropy": 2.4928311109542847, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1918041706085205, + "step": 7214 + }, + { + "epoch": 0.2255, + "grad_norm": 3.3125, + "grad_norm_var": 0.08677469889322917, + "learning_rate": 0.0001, + "loss": 6.1356, + "loss/crossentropy": 2.576792359352112, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1992373839020729, + "step": 7216 + }, + { + "epoch": 0.2255625, + "grad_norm": 3.4375, + "grad_norm_var": 0.08844401041666666, + "learning_rate": 0.0001, + "loss": 6.0178, + "loss/crossentropy": 2.3962244987487793, + "loss/hidden": 1.6796875, + "loss/jsd": 0.0, + "loss/logits": 0.1941882222890854, + "step": 7218 + }, + { + "epoch": 0.225625, + "grad_norm": 3.765625, + "grad_norm_var": 0.0850006103515625, + "learning_rate": 0.0001, + "loss": 6.0182, + "loss/crossentropy": 2.485282301902771, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19274737685918808, + "step": 7220 + }, + { + "epoch": 0.2256875, + "grad_norm": 3.5625, + "grad_norm_var": 0.08580322265625, + "learning_rate": 0.0001, + "loss": 6.042, + "loss/crossentropy": 2.4804413318634033, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19521985948085785, + "step": 7222 + }, + { + "epoch": 0.22575, + "grad_norm": 3.53125, + "grad_norm_var": 0.0804107666015625, + "learning_rate": 0.0001, + "loss": 6.2497, + "loss/crossentropy": 2.649003744125366, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19991525262594223, + "step": 7224 + }, + { + "epoch": 0.2258125, + "grad_norm": 3.375, + "grad_norm_var": 0.04898681640625, + "learning_rate": 0.0001, + "loss": 6.1027, + "loss/crossentropy": 2.5942927598953247, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19264158606529236, + "step": 7226 + }, + { + "epoch": 0.225875, + "grad_norm": 3.53125, + "grad_norm_var": 0.0519439697265625, + "learning_rate": 0.0001, + "loss": 6.1449, + "loss/crossentropy": 2.6132737398147583, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.1926158145070076, + "step": 7228 + }, + { + "epoch": 0.2259375, + "grad_norm": 3.859375, + "grad_norm_var": 0.04111328125, + "learning_rate": 0.0001, + "loss": 6.0757, + "loss/crossentropy": 2.5855209827423096, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.18846654891967773, + "step": 7230 + }, + { + "epoch": 0.226, + "grad_norm": 3.546875, + "grad_norm_var": 0.07340087890625, + "learning_rate": 0.0001, + "loss": 6.1571, + "loss/crossentropy": 2.5730226039886475, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.1947377249598503, + "step": 7232 + }, + { + "epoch": 0.2260625, + "grad_norm": 3.53125, + "grad_norm_var": 0.06357421875, + "learning_rate": 0.0001, + "loss": 6.0368, + "loss/crossentropy": 2.53850257396698, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19045089930295944, + "step": 7234 + }, + { + "epoch": 0.226125, + "grad_norm": 3.515625, + "grad_norm_var": 0.057616170247395834, + "learning_rate": 0.0001, + "loss": 5.9541, + "loss/crossentropy": 2.423492431640625, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19446807354688644, + "step": 7236 + }, + { + "epoch": 0.2261875, + "grad_norm": 3.9375, + "grad_norm_var": 0.060042317708333334, + "learning_rate": 0.0001, + "loss": 5.933, + "loss/crossentropy": 2.3947328329086304, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.18937600404024124, + "step": 7238 + }, + { + "epoch": 0.22625, + "grad_norm": 3.734375, + "grad_norm_var": 0.0597808837890625, + "learning_rate": 0.0001, + "loss": 6.074, + "loss/crossentropy": 2.458932042121887, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.19744067639112473, + "step": 7240 + }, + { + "epoch": 0.2263125, + "grad_norm": 3.796875, + "grad_norm_var": 0.0525299072265625, + "learning_rate": 0.0001, + "loss": 6.1446, + "loss/crossentropy": 2.521939754486084, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.20210901647806168, + "step": 7242 + }, + { + "epoch": 0.226375, + "grad_norm": 3.625, + "grad_norm_var": 0.05084228515625, + "learning_rate": 0.0001, + "loss": 6.3326, + "loss/crossentropy": 2.70308518409729, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.20357725769281387, + "step": 7244 + }, + { + "epoch": 0.2264375, + "grad_norm": 3.265625, + "grad_norm_var": 0.07063700358072916, + "learning_rate": 0.0001, + "loss": 6.008, + "loss/crossentropy": 2.571186661720276, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.188599094748497, + "step": 7246 + }, + { + "epoch": 0.2265, + "grad_norm": 3.4375, + "grad_norm_var": 0.0380767822265625, + "learning_rate": 0.0001, + "loss": 6.2089, + "loss/crossentropy": 2.70067298412323, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1918356642127037, + "step": 7248 + }, + { + "epoch": 0.2265625, + "grad_norm": 3.734375, + "grad_norm_var": 0.05286051432291667, + "learning_rate": 0.0001, + "loss": 5.8106, + "loss/crossentropy": 2.393430471420288, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18429815769195557, + "step": 7250 + }, + { + "epoch": 0.226625, + "grad_norm": 3.4375, + "grad_norm_var": 0.05406494140625, + "learning_rate": 0.0001, + "loss": 6.1698, + "loss/crossentropy": 2.624249815940857, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19557499140501022, + "step": 7252 + }, + { + "epoch": 0.2266875, + "grad_norm": 3.3125, + "grad_norm_var": 0.046296183268229166, + "learning_rate": 0.0001, + "loss": 5.6594, + "loss/crossentropy": 2.2593475580215454, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18844523280858994, + "step": 7254 + }, + { + "epoch": 0.22675, + "grad_norm": 3.453125, + "grad_norm_var": 0.043473307291666666, + "learning_rate": 0.0001, + "loss": 6.0688, + "loss/crossentropy": 2.633441209793091, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.188069187104702, + "step": 7256 + }, + { + "epoch": 0.2268125, + "grad_norm": 3.75, + "grad_norm_var": 0.03833719889322917, + "learning_rate": 0.0001, + "loss": 6.0438, + "loss/crossentropy": 2.435559034347534, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.19480594247579575, + "step": 7258 + }, + { + "epoch": 0.226875, + "grad_norm": 3.375, + "grad_norm_var": 0.03524983723958333, + "learning_rate": 0.0001, + "loss": 6.0048, + "loss/crossentropy": 2.555377721786499, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18791264295578003, + "step": 7260 + }, + { + "epoch": 0.2269375, + "grad_norm": 3.109375, + "grad_norm_var": 0.0385162353515625, + "learning_rate": 0.0001, + "loss": 5.9651, + "loss/crossentropy": 2.540740966796875, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18813879787921906, + "step": 7262 + }, + { + "epoch": 0.227, + "grad_norm": 3.1875, + "grad_norm_var": 0.0409088134765625, + "learning_rate": 0.0001, + "loss": 5.6813, + "loss/crossentropy": 2.3397140502929688, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17908480763435364, + "step": 7264 + }, + { + "epoch": 0.2270625, + "grad_norm": 3.265625, + "grad_norm_var": 0.03850911458333333, + "learning_rate": 0.0001, + "loss": 6.601, + "loss/crossentropy": 3.0177700519561768, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1997295320034027, + "step": 7266 + }, + { + "epoch": 0.227125, + "grad_norm": 3.75, + "grad_norm_var": 0.0453521728515625, + "learning_rate": 0.0001, + "loss": 6.1002, + "loss/crossentropy": 2.540258288383484, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19309942424297333, + "step": 7268 + }, + { + "epoch": 0.2271875, + "grad_norm": 3.8125, + "grad_norm_var": 0.08054097493489583, + "learning_rate": 0.0001, + "loss": 6.1415, + "loss/crossentropy": 2.4717822074890137, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20447231829166412, + "step": 7270 + }, + { + "epoch": 0.22725, + "grad_norm": 3.328125, + "grad_norm_var": 0.07685445149739584, + "learning_rate": 0.0001, + "loss": 6.0849, + "loss/crossentropy": 2.57514226436615, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1939399093389511, + "step": 7272 + }, + { + "epoch": 0.2273125, + "grad_norm": 3.578125, + "grad_norm_var": 0.07429097493489584, + "learning_rate": 0.0001, + "loss": 5.7311, + "loss/crossentropy": 2.333707571029663, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18543951958417892, + "step": 7274 + }, + { + "epoch": 0.227375, + "grad_norm": 3.234375, + "grad_norm_var": 0.07571614583333333, + "learning_rate": 0.0001, + "loss": 6.0251, + "loss/crossentropy": 2.5862536430358887, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18607349693775177, + "step": 7276 + }, + { + "epoch": 0.2274375, + "grad_norm": 3.0, + "grad_norm_var": 0.10493876139322916, + "learning_rate": 0.0001, + "loss": 5.9968, + "loss/crossentropy": 2.556631088256836, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1865924820303917, + "step": 7278 + }, + { + "epoch": 0.2275, + "grad_norm": 3.6875, + "grad_norm_var": 0.09657796223958333, + "learning_rate": 0.0001, + "loss": 6.216, + "loss/crossentropy": 2.6764732599258423, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.194186270236969, + "step": 7280 + }, + { + "epoch": 0.2275625, + "grad_norm": 3.40625, + "grad_norm_var": 0.09251302083333333, + "learning_rate": 0.0001, + "loss": 6.2562, + "loss/crossentropy": 2.716831684112549, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19495396316051483, + "step": 7282 + }, + { + "epoch": 0.227625, + "grad_norm": 4.25, + "grad_norm_var": 0.13001302083333333, + "learning_rate": 0.0001, + "loss": 5.9113, + "loss/crossentropy": 2.297808527946472, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.19689101725816727, + "step": 7284 + }, + { + "epoch": 0.2276875, + "grad_norm": 3.359375, + "grad_norm_var": 0.1010894775390625, + "learning_rate": 0.0001, + "loss": 6.109, + "loss/crossentropy": 2.637886643409729, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19047501683235168, + "step": 7286 + }, + { + "epoch": 0.22775, + "grad_norm": 3.421875, + "grad_norm_var": 0.0988433837890625, + "learning_rate": 0.0001, + "loss": 6.1929, + "loss/crossentropy": 2.7201133966445923, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.19336767494678497, + "step": 7288 + }, + { + "epoch": 0.2278125, + "grad_norm": 3.140625, + "grad_norm_var": 0.10550028483072917, + "learning_rate": 0.0001, + "loss": 5.6638, + "loss/crossentropy": 2.355941653251648, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1776605248451233, + "step": 7290 + }, + { + "epoch": 0.227875, + "grad_norm": 3.21875, + "grad_norm_var": 0.10982666015625, + "learning_rate": 0.0001, + "loss": 5.7562, + "loss/crossentropy": 2.4231557846069336, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18056906759738922, + "step": 7292 + }, + { + "epoch": 0.2279375, + "grad_norm": 3.515625, + "grad_norm_var": 0.0697174072265625, + "learning_rate": 0.0001, + "loss": 6.1144, + "loss/crossentropy": 2.5779067277908325, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19544732570648193, + "step": 7294 + }, + { + "epoch": 0.228, + "grad_norm": 3.421875, + "grad_norm_var": 0.06889546712239583, + "learning_rate": 0.0001, + "loss": 6.4275, + "loss/crossentropy": 2.780030369758606, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.20459024608135223, + "step": 7296 + }, + { + "epoch": 0.2280625, + "grad_norm": 3.859375, + "grad_norm_var": 0.08276265462239583, + "learning_rate": 0.0001, + "loss": 6.0204, + "loss/crossentropy": 2.416700005531311, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.1931861937046051, + "step": 7298 + }, + { + "epoch": 0.228125, + "grad_norm": 3.375, + "grad_norm_var": 0.03961181640625, + "learning_rate": 0.0001, + "loss": 5.8088, + "loss/crossentropy": 2.481619954109192, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.181547611951828, + "step": 7300 + }, + { + "epoch": 0.2281875, + "grad_norm": 3.25, + "grad_norm_var": 0.04090067545572917, + "learning_rate": 0.0001, + "loss": 5.8149, + "loss/crossentropy": 2.395822048187256, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1879977211356163, + "step": 7302 + }, + { + "epoch": 0.22825, + "grad_norm": 3.5625, + "grad_norm_var": 0.04132486979166667, + "learning_rate": 0.0001, + "loss": 6.3389, + "loss/crossentropy": 2.743680715560913, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.20171266049146652, + "step": 7304 + }, + { + "epoch": 0.2283125, + "grad_norm": 3.875, + "grad_norm_var": 0.04804280598958333, + "learning_rate": 0.0001, + "loss": 5.9588, + "loss/crossentropy": 2.5179975032806396, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1870514154434204, + "step": 7306 + }, + { + "epoch": 0.228375, + "grad_norm": 3.8125, + "grad_norm_var": 0.0456207275390625, + "learning_rate": 0.0001, + "loss": 6.2987, + "loss/crossentropy": 2.748790740966797, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19640059769153595, + "step": 7308 + }, + { + "epoch": 0.2284375, + "grad_norm": 3.484375, + "grad_norm_var": 0.0541900634765625, + "learning_rate": 0.0001, + "loss": 6.0278, + "loss/crossentropy": 2.588564395904541, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1872790977358818, + "step": 7310 + }, + { + "epoch": 0.2285, + "grad_norm": 3.40625, + "grad_norm_var": 0.05211588541666667, + "learning_rate": 0.0001, + "loss": 6.1007, + "loss/crossentropy": 2.5295733213424683, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19696111977100372, + "step": 7312 + }, + { + "epoch": 0.2285625, + "grad_norm": 3.28125, + "grad_norm_var": 0.039891560872395836, + "learning_rate": 0.0001, + "loss": 5.849, + "loss/crossentropy": 2.4458121061325073, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18289253860712051, + "step": 7314 + }, + { + "epoch": 0.228625, + "grad_norm": 3.5, + "grad_norm_var": 0.04163004557291667, + "learning_rate": 0.0001, + "loss": 6.2824, + "loss/crossentropy": 2.627694010734558, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.20530951023101807, + "step": 7316 + }, + { + "epoch": 0.2286875, + "grad_norm": 3.484375, + "grad_norm_var": 0.03870035807291667, + "learning_rate": 0.0001, + "loss": 5.8016, + "loss/crossentropy": 2.4144967794418335, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18480688333511353, + "step": 7318 + }, + { + "epoch": 0.22875, + "grad_norm": 3.546875, + "grad_norm_var": 0.04283447265625, + "learning_rate": 0.0001, + "loss": 5.8909, + "loss/crossentropy": 2.421323776245117, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19070785492658615, + "step": 7320 + }, + { + "epoch": 0.2288125, + "grad_norm": 3.484375, + "grad_norm_var": 0.029182942708333333, + "learning_rate": 0.0001, + "loss": 5.9303, + "loss/crossentropy": 2.5196995735168457, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1859780177474022, + "step": 7322 + }, + { + "epoch": 0.228875, + "grad_norm": 3.140625, + "grad_norm_var": 0.023583984375, + "learning_rate": 0.0001, + "loss": 6.0889, + "loss/crossentropy": 2.652615547180176, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18815699964761734, + "step": 7324 + }, + { + "epoch": 0.2289375, + "grad_norm": 6.5, + "grad_norm_var": 0.6192860921223958, + "learning_rate": 0.0001, + "loss": 6.1446, + "loss/crossentropy": 2.5927183628082275, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.196204774081707, + "step": 7326 + }, + { + "epoch": 0.229, + "grad_norm": 3.859375, + "grad_norm_var": 0.6470611572265625, + "learning_rate": 0.0001, + "loss": 6.395, + "loss/crossentropy": 2.6915029287338257, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.20862893015146255, + "step": 7328 + }, + { + "epoch": 0.2290625, + "grad_norm": 3.984375, + "grad_norm_var": 0.6263580322265625, + "learning_rate": 0.0001, + "loss": 6.2523, + "loss/crossentropy": 2.586848020553589, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20248162746429443, + "step": 7330 + }, + { + "epoch": 0.229125, + "grad_norm": 3.65625, + "grad_norm_var": 0.6242095947265625, + "learning_rate": 0.0001, + "loss": 6.3837, + "loss/crossentropy": 2.72838294506073, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.203810915350914, + "step": 7332 + }, + { + "epoch": 0.2291875, + "grad_norm": 3.78125, + "grad_norm_var": 0.6142079671223958, + "learning_rate": 0.0001, + "loss": 6.22, + "loss/crossentropy": 2.5156002044677734, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.20325710624456406, + "step": 7334 + }, + { + "epoch": 0.22925, + "grad_norm": 3.40625, + "grad_norm_var": 0.6004628499348958, + "learning_rate": 0.0001, + "loss": 6.2168, + "loss/crossentropy": 2.610624313354492, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19967524707317352, + "step": 7336 + }, + { + "epoch": 0.2293125, + "grad_norm": 3.203125, + "grad_norm_var": 0.6215810139973958, + "learning_rate": 0.0001, + "loss": 6.0493, + "loss/crossentropy": 2.615975260734558, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1870839074254036, + "step": 7338 + }, + { + "epoch": 0.229375, + "grad_norm": 3.546875, + "grad_norm_var": 0.5929758707682292, + "learning_rate": 0.0001, + "loss": 6.0301, + "loss/crossentropy": 2.5628278255462646, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19204376637935638, + "step": 7340 + }, + { + "epoch": 0.2294375, + "grad_norm": 3.796875, + "grad_norm_var": 0.09176432291666667, + "learning_rate": 0.0001, + "loss": 6.1977, + "loss/crossentropy": 2.5626055002212524, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.19866493344306946, + "step": 7342 + }, + { + "epoch": 0.2295, + "grad_norm": 3.3125, + "grad_norm_var": 0.06845296223958333, + "learning_rate": 0.0001, + "loss": 5.7414, + "loss/crossentropy": 2.365588426589966, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18407006561756134, + "step": 7344 + }, + { + "epoch": 0.2295625, + "grad_norm": 3.625, + "grad_norm_var": 0.07990620930989584, + "learning_rate": 0.0001, + "loss": 6.1658, + "loss/crossentropy": 2.662602186203003, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1960218995809555, + "step": 7346 + }, + { + "epoch": 0.229625, + "grad_norm": 3.375, + "grad_norm_var": 0.09516499837239584, + "learning_rate": 0.0001, + "loss": 5.9993, + "loss/crossentropy": 2.550138235092163, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.19100836664438248, + "step": 7348 + }, + { + "epoch": 0.2296875, + "grad_norm": 3.71875, + "grad_norm_var": 0.09342041015625, + "learning_rate": 0.0001, + "loss": 6.0154, + "loss/crossentropy": 2.5097841024398804, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19235394895076752, + "step": 7350 + }, + { + "epoch": 0.22975, + "grad_norm": 3.53125, + "grad_norm_var": 0.09157613118489584, + "learning_rate": 0.0001, + "loss": 5.976, + "loss/crossentropy": 2.540569543838501, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.1810412034392357, + "step": 7352 + }, + { + "epoch": 0.2298125, + "grad_norm": 3.515625, + "grad_norm_var": 0.07981363932291667, + "learning_rate": 0.0001, + "loss": 6.1652, + "loss/crossentropy": 2.666202664375305, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19325844943523407, + "step": 7354 + }, + { + "epoch": 0.229875, + "grad_norm": 3.53125, + "grad_norm_var": 0.0746490478515625, + "learning_rate": 0.0001, + "loss": 6.2743, + "loss/crossentropy": 2.694416880607605, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.1966572105884552, + "step": 7356 + }, + { + "epoch": 0.2299375, + "grad_norm": 3.5625, + "grad_norm_var": 0.0745269775390625, + "learning_rate": 0.0001, + "loss": 5.9192, + "loss/crossentropy": 2.5005931854248047, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1879514679312706, + "step": 7358 + }, + { + "epoch": 0.23, + "grad_norm": 3.6875, + "grad_norm_var": 0.07662353515625, + "learning_rate": 0.0001, + "loss": 6.2571, + "loss/crossentropy": 2.651940941810608, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.20192715525627136, + "step": 7360 + }, + { + "epoch": 0.2300625, + "grad_norm": 5.03125, + "grad_norm_var": 0.18541259765625, + "learning_rate": 0.0001, + "loss": 6.1134, + "loss/crossentropy": 2.456774592399597, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20238465070724487, + "step": 7362 + }, + { + "epoch": 0.230125, + "grad_norm": 6.8125, + "grad_norm_var": 0.78668212890625, + "learning_rate": 0.0001, + "loss": 6.1916, + "loss/crossentropy": 2.581822395324707, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.1941842883825302, + "step": 7364 + }, + { + "epoch": 0.2301875, + "grad_norm": 3.453125, + "grad_norm_var": 0.7692372639973958, + "learning_rate": 0.0001, + "loss": 6.2382, + "loss/crossentropy": 2.6422585248947144, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.2045135721564293, + "step": 7366 + }, + { + "epoch": 0.23025, + "grad_norm": 4.0625, + "grad_norm_var": 0.7674112955729167, + "learning_rate": 0.0001, + "loss": 6.6114, + "loss/crossentropy": 2.907786726951599, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.2078600376844406, + "step": 7368 + }, + { + "epoch": 0.2303125, + "grad_norm": 3.28125, + "grad_norm_var": 0.7768717447916667, + "learning_rate": 0.0001, + "loss": 6.1037, + "loss/crossentropy": 2.5247615575790405, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.19578395783901215, + "step": 7370 + }, + { + "epoch": 0.230375, + "grad_norm": 3.390625, + "grad_norm_var": 0.8033274332682292, + "learning_rate": 0.0001, + "loss": 6.1342, + "loss/crossentropy": 2.605314254760742, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19156111776828766, + "step": 7372 + }, + { + "epoch": 0.2304375, + "grad_norm": 3.390625, + "grad_norm_var": 0.8084869384765625, + "learning_rate": 0.0001, + "loss": 5.7904, + "loss/crossentropy": 2.444626212120056, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1775500327348709, + "step": 7374 + }, + { + "epoch": 0.2305, + "grad_norm": 3.3125, + "grad_norm_var": 0.85035400390625, + "learning_rate": 0.0001, + "loss": 5.8499, + "loss/crossentropy": 2.4502416849136353, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18762332946062088, + "step": 7376 + }, + { + "epoch": 0.2305625, + "grad_norm": 3.484375, + "grad_norm_var": 0.744580078125, + "learning_rate": 0.0001, + "loss": 5.9014, + "loss/crossentropy": 2.4991562366485596, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18710073083639145, + "step": 7378 + }, + { + "epoch": 0.230625, + "grad_norm": 3.828125, + "grad_norm_var": 0.062398274739583336, + "learning_rate": 0.0001, + "loss": 6.2592, + "loss/crossentropy": 2.6415544748306274, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.20395462214946747, + "step": 7380 + }, + { + "epoch": 0.2306875, + "grad_norm": 4.1875, + "grad_norm_var": 0.08961181640625, + "learning_rate": 0.0001, + "loss": 6.4175, + "loss/crossentropy": 2.7125109434127808, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.2076052576303482, + "step": 7382 + }, + { + "epoch": 0.23075, + "grad_norm": 3.515625, + "grad_norm_var": 0.07551167805989584, + "learning_rate": 0.0001, + "loss": 6.259, + "loss/crossentropy": 2.6772814989089966, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19801807403564453, + "step": 7384 + }, + { + "epoch": 0.2308125, + "grad_norm": 3.375, + "grad_norm_var": 0.06308186848958333, + "learning_rate": 0.0001, + "loss": 5.962, + "loss/crossentropy": 2.497159481048584, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1890670359134674, + "step": 7386 + }, + { + "epoch": 0.230875, + "grad_norm": 3.671875, + "grad_norm_var": 0.0703765869140625, + "learning_rate": 0.0001, + "loss": 6.1034, + "loss/crossentropy": 2.4412357807159424, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.20137407630681992, + "step": 7388 + }, + { + "epoch": 0.2309375, + "grad_norm": 3.46875, + "grad_norm_var": 0.0634674072265625, + "learning_rate": 0.0001, + "loss": 6.2355, + "loss/crossentropy": 2.7218462228775024, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19511879980564117, + "step": 7390 + }, + { + "epoch": 0.231, + "grad_norm": 3.703125, + "grad_norm_var": 0.053376261393229166, + "learning_rate": 0.0001, + "loss": 5.8255, + "loss/crossentropy": 2.4146286249160767, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18561378866434097, + "step": 7392 + }, + { + "epoch": 0.2310625, + "grad_norm": 3.359375, + "grad_norm_var": 0.06270243326822916, + "learning_rate": 0.0001, + "loss": 5.9418, + "loss/crossentropy": 2.549205780029297, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18496041744947433, + "step": 7394 + }, + { + "epoch": 0.231125, + "grad_norm": 3.390625, + "grad_norm_var": 0.0640625, + "learning_rate": 0.0001, + "loss": 5.9342, + "loss/crossentropy": 2.497208595275879, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18862471729516983, + "step": 7396 + }, + { + "epoch": 0.2311875, + "grad_norm": 3.3125, + "grad_norm_var": 0.046751912434895834, + "learning_rate": 0.0001, + "loss": 6.2351, + "loss/crossentropy": 2.6447510719299316, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.1957508847117424, + "step": 7398 + }, + { + "epoch": 0.23125, + "grad_norm": 3.171875, + "grad_norm_var": 0.051123046875, + "learning_rate": 0.0001, + "loss": 5.8249, + "loss/crossentropy": 2.4528943300247192, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18251455575227737, + "step": 7400 + }, + { + "epoch": 0.2313125, + "grad_norm": 3.734375, + "grad_norm_var": 0.05332743326822917, + "learning_rate": 0.0001, + "loss": 6.1517, + "loss/crossentropy": 2.573247790336609, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.1984696239233017, + "step": 7402 + }, + { + "epoch": 0.231375, + "grad_norm": 3.390625, + "grad_norm_var": 0.10120035807291666, + "learning_rate": 0.0001, + "loss": 6.4598, + "loss/crossentropy": 2.763832688331604, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.20474957674741745, + "step": 7404 + }, + { + "epoch": 0.2314375, + "grad_norm": 3.484375, + "grad_norm_var": 0.1046783447265625, + "learning_rate": 0.0001, + "loss": 6.0386, + "loss/crossentropy": 2.639525532722473, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1840517669916153, + "step": 7406 + }, + { + "epoch": 0.2315, + "grad_norm": 4.09375, + "grad_norm_var": 0.12578837076822916, + "learning_rate": 0.0001, + "loss": 6.2328, + "loss/crossentropy": 2.7072668075561523, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19512664526700974, + "step": 7408 + }, + { + "epoch": 0.2315625, + "grad_norm": 3.296875, + "grad_norm_var": 0.12578837076822916, + "learning_rate": 0.0001, + "loss": 5.8393, + "loss/crossentropy": 2.417103886604309, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1875307485461235, + "step": 7410 + }, + { + "epoch": 0.231625, + "grad_norm": 3.515625, + "grad_norm_var": 0.12415364583333334, + "learning_rate": 0.0001, + "loss": 6.296, + "loss/crossentropy": 2.6419787406921387, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.2025139182806015, + "step": 7412 + }, + { + "epoch": 0.2316875, + "grad_norm": 3.5, + "grad_norm_var": 0.12319234212239584, + "learning_rate": 0.0001, + "loss": 5.9212, + "loss/crossentropy": 2.5086781978607178, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1857873946428299, + "step": 7414 + }, + { + "epoch": 0.23175, + "grad_norm": 3.625, + "grad_norm_var": 0.12553609212239583, + "learning_rate": 0.0001, + "loss": 6.3267, + "loss/crossentropy": 2.7113449573516846, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.19669626653194427, + "step": 7416 + }, + { + "epoch": 0.2318125, + "grad_norm": 3.625, + "grad_norm_var": 0.14147135416666667, + "learning_rate": 0.0001, + "loss": 6.3766, + "loss/crossentropy": 2.7021708488464355, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.2061111181974411, + "step": 7418 + }, + { + "epoch": 0.231875, + "grad_norm": 3.4375, + "grad_norm_var": 0.09260965983072916, + "learning_rate": 0.0001, + "loss": 5.9801, + "loss/crossentropy": 2.5316959619522095, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.1854703575372696, + "step": 7420 + }, + { + "epoch": 0.2319375, + "grad_norm": 3.421875, + "grad_norm_var": 0.10178934733072917, + "learning_rate": 0.0001, + "loss": 5.8133, + "loss/crossentropy": 2.450483798980713, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18081633746623993, + "step": 7422 + }, + { + "epoch": 0.232, + "grad_norm": 3.65625, + "grad_norm_var": 0.08230692545572917, + "learning_rate": 0.0001, + "loss": 5.9437, + "loss/crossentropy": 2.411531090736389, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19189123809337616, + "step": 7424 + }, + { + "epoch": 0.2320625, + "grad_norm": 3.71875, + "grad_norm_var": 0.07639058430989583, + "learning_rate": 0.0001, + "loss": 5.9129, + "loss/crossentropy": 2.5023571252822876, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18285023421049118, + "step": 7426 + }, + { + "epoch": 0.232125, + "grad_norm": 3.71875, + "grad_norm_var": 0.12104390462239584, + "learning_rate": 0.0001, + "loss": 6.1384, + "loss/crossentropy": 2.497291088104248, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.2043498232960701, + "step": 7428 + }, + { + "epoch": 0.2321875, + "grad_norm": 3.359375, + "grad_norm_var": 0.10891927083333333, + "learning_rate": 0.0001, + "loss": 6.2714, + "loss/crossentropy": 2.755240559577942, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19185104221105576, + "step": 7430 + }, + { + "epoch": 0.23225, + "grad_norm": 3.6875, + "grad_norm_var": 0.09778645833333334, + "learning_rate": 0.0001, + "loss": 6.3849, + "loss/crossentropy": 2.709605097770691, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.20268186926841736, + "step": 7432 + }, + { + "epoch": 0.2323125, + "grad_norm": 3.5, + "grad_norm_var": 0.0864166259765625, + "learning_rate": 0.0001, + "loss": 5.7756, + "loss/crossentropy": 2.337099075317383, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18994461745023727, + "step": 7434 + }, + { + "epoch": 0.232375, + "grad_norm": 3.9375, + "grad_norm_var": 0.09278055826822916, + "learning_rate": 0.0001, + "loss": 6.1802, + "loss/crossentropy": 2.614351749420166, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19760002940893173, + "step": 7436 + }, + { + "epoch": 0.2324375, + "grad_norm": 3.5, + "grad_norm_var": 0.07560933430989583, + "learning_rate": 0.0001, + "loss": 6.6144, + "loss/crossentropy": 2.9351630210876465, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.20620250701904297, + "step": 7438 + }, + { + "epoch": 0.2325, + "grad_norm": 3.375, + "grad_norm_var": 0.07434488932291666, + "learning_rate": 0.0001, + "loss": 6.2547, + "loss/crossentropy": 2.697936177253723, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1986447423696518, + "step": 7440 + }, + { + "epoch": 0.2325625, + "grad_norm": 3.78125, + "grad_norm_var": 0.081005859375, + "learning_rate": 0.0001, + "loss": 6.3051, + "loss/crossentropy": 2.68693745136261, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.20361657440662384, + "step": 7442 + }, + { + "epoch": 0.232625, + "grad_norm": 3.59375, + "grad_norm_var": 0.04145406087239583, + "learning_rate": 0.0001, + "loss": 5.9403, + "loss/crossentropy": 2.4055399894714355, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19800888746976852, + "step": 7444 + }, + { + "epoch": 0.2326875, + "grad_norm": 3.171875, + "grad_norm_var": 0.04772135416666667, + "learning_rate": 0.0001, + "loss": 6.2272, + "loss/crossentropy": 2.7289352416992188, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19162369519472122, + "step": 7446 + }, + { + "epoch": 0.23275, + "grad_norm": 3.515625, + "grad_norm_var": 0.04728902180989583, + "learning_rate": 0.0001, + "loss": 6.1197, + "loss/crossentropy": 2.5941803455352783, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19629882276058197, + "step": 7448 + }, + { + "epoch": 0.2328125, + "grad_norm": 3.65625, + "grad_norm_var": 0.04666341145833333, + "learning_rate": 0.0001, + "loss": 6.1391, + "loss/crossentropy": 2.5506038665771484, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.19713414460420609, + "step": 7450 + }, + { + "epoch": 0.232875, + "grad_norm": 3.265625, + "grad_norm_var": 0.0388336181640625, + "learning_rate": 0.0001, + "loss": 6.0142, + "loss/crossentropy": 2.5255450010299683, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19105535745620728, + "step": 7452 + }, + { + "epoch": 0.2329375, + "grad_norm": 3.59375, + "grad_norm_var": 0.04072265625, + "learning_rate": 0.0001, + "loss": 6.4106, + "loss/crossentropy": 2.724580407142639, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.20023799687623978, + "step": 7454 + }, + { + "epoch": 0.233, + "grad_norm": 3.109375, + "grad_norm_var": 0.0539947509765625, + "learning_rate": 0.0001, + "loss": 5.9726, + "loss/crossentropy": 2.5376689434051514, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18997538089752197, + "step": 7456 + }, + { + "epoch": 0.2330625, + "grad_norm": 3.21875, + "grad_norm_var": 0.04094645182291667, + "learning_rate": 0.0001, + "loss": 5.9383, + "loss/crossentropy": 2.5750547647476196, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18320411443710327, + "step": 7458 + }, + { + "epoch": 0.233125, + "grad_norm": 3.28125, + "grad_norm_var": 0.0402740478515625, + "learning_rate": 0.0001, + "loss": 5.826, + "loss/crossentropy": 2.477864980697632, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17816954106092453, + "step": 7460 + }, + { + "epoch": 0.2331875, + "grad_norm": 3.359375, + "grad_norm_var": 0.03504231770833333, + "learning_rate": 0.0001, + "loss": 5.7815, + "loss/crossentropy": 2.437563419342041, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1797105148434639, + "step": 7462 + }, + { + "epoch": 0.23325, + "grad_norm": 3.46875, + "grad_norm_var": 0.03479817708333333, + "learning_rate": 0.0001, + "loss": 6.4018, + "loss/crossentropy": 2.7117398977279663, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.2080647200345993, + "step": 7464 + }, + { + "epoch": 0.2333125, + "grad_norm": 3.765625, + "grad_norm_var": 0.03961181640625, + "learning_rate": 0.0001, + "loss": 6.1329, + "loss/crossentropy": 2.7160139083862305, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1838773414492607, + "step": 7466 + }, + { + "epoch": 0.233375, + "grad_norm": 3.203125, + "grad_norm_var": 0.04114583333333333, + "learning_rate": 0.0001, + "loss": 6.0136, + "loss/crossentropy": 2.598839044570923, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18405775725841522, + "step": 7468 + }, + { + "epoch": 0.2334375, + "grad_norm": 3.203125, + "grad_norm_var": 0.03804423014322917, + "learning_rate": 0.0001, + "loss": 6.104, + "loss/crossentropy": 2.6647852659225464, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18571925908327103, + "step": 7470 + }, + { + "epoch": 0.2335, + "grad_norm": 3.15625, + "grad_norm_var": 0.0279693603515625, + "learning_rate": 0.0001, + "loss": 5.6951, + "loss/crossentropy": 2.384377956390381, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17794862389564514, + "step": 7472 + }, + { + "epoch": 0.2335625, + "grad_norm": 3.5, + "grad_norm_var": 0.028156534830729166, + "learning_rate": 0.0001, + "loss": 5.9524, + "loss/crossentropy": 2.49960196018219, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18785890191793442, + "step": 7474 + }, + { + "epoch": 0.233625, + "grad_norm": 3.546875, + "grad_norm_var": 0.029683430989583332, + "learning_rate": 0.0001, + "loss": 6.1427, + "loss/crossentropy": 2.557058572769165, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19762682914733887, + "step": 7476 + }, + { + "epoch": 0.2336875, + "grad_norm": 3.40625, + "grad_norm_var": 0.03368733723958333, + "learning_rate": 0.0001, + "loss": 6.0381, + "loss/crossentropy": 2.534654378890991, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19487891346216202, + "step": 7478 + }, + { + "epoch": 0.23375, + "grad_norm": 3.359375, + "grad_norm_var": 0.036188761393229164, + "learning_rate": 0.0001, + "loss": 5.8262, + "loss/crossentropy": 2.4826525449752808, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18162458389997482, + "step": 7480 + }, + { + "epoch": 0.2338125, + "grad_norm": 3.65625, + "grad_norm_var": 0.033600870768229166, + "learning_rate": 0.0001, + "loss": 5.8905, + "loss/crossentropy": 2.4829264879226685, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1868496686220169, + "step": 7482 + }, + { + "epoch": 0.233875, + "grad_norm": 3.390625, + "grad_norm_var": 0.04011942545572917, + "learning_rate": 0.0001, + "loss": 6.1066, + "loss/crossentropy": 2.5453397035598755, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19479379057884216, + "step": 7484 + }, + { + "epoch": 0.2339375, + "grad_norm": 3.46875, + "grad_norm_var": 0.043229166666666666, + "learning_rate": 0.0001, + "loss": 5.7595, + "loss/crossentropy": 2.438984513282776, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17931944131851196, + "step": 7486 + }, + { + "epoch": 0.234, + "grad_norm": 3.625, + "grad_norm_var": 0.04342447916666667, + "learning_rate": 0.0001, + "loss": 6.3522, + "loss/crossentropy": 2.6873581409454346, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.2055514082312584, + "step": 7488 + }, + { + "epoch": 0.2340625, + "grad_norm": 3.40625, + "grad_norm_var": 0.040848795572916666, + "learning_rate": 0.0001, + "loss": 6.0623, + "loss/crossentropy": 2.6052039861679077, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.18516145646572113, + "step": 7490 + }, + { + "epoch": 0.234125, + "grad_norm": 3.390625, + "grad_norm_var": 0.04080403645833333, + "learning_rate": 0.0001, + "loss": 6.2604, + "loss/crossentropy": 2.753083825111389, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19213388115167618, + "step": 7492 + }, + { + "epoch": 0.2341875, + "grad_norm": 3.625, + "grad_norm_var": 0.03994038899739583, + "learning_rate": 0.0001, + "loss": 6.1461, + "loss/crossentropy": 2.5053790807724, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.19688421487808228, + "step": 7494 + }, + { + "epoch": 0.23425, + "grad_norm": 3.25, + "grad_norm_var": 0.0356109619140625, + "learning_rate": 0.0001, + "loss": 6.1289, + "loss/crossentropy": 2.672441601753235, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18939807265996933, + "step": 7496 + }, + { + "epoch": 0.2343125, + "grad_norm": 3.25, + "grad_norm_var": 0.0393218994140625, + "learning_rate": 0.0001, + "loss": 5.8123, + "loss/crossentropy": 2.453073740005493, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1788911297917366, + "step": 7498 + }, + { + "epoch": 0.234375, + "grad_norm": 3.234375, + "grad_norm_var": 0.0416412353515625, + "learning_rate": 0.0001, + "loss": 6.2689, + "loss/crossentropy": 2.692493438720703, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.198652982711792, + "step": 7500 + }, + { + "epoch": 0.2344375, + "grad_norm": 3.21875, + "grad_norm_var": 0.037158203125, + "learning_rate": 0.0001, + "loss": 6.0308, + "loss/crossentropy": 2.57742440700531, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18869930505752563, + "step": 7502 + }, + { + "epoch": 0.2345, + "grad_norm": 3.28125, + "grad_norm_var": 0.030973307291666665, + "learning_rate": 0.0001, + "loss": 5.7385, + "loss/crossentropy": 2.3825695514678955, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.17816990613937378, + "step": 7504 + }, + { + "epoch": 0.2345625, + "grad_norm": 3.515625, + "grad_norm_var": 0.08776041666666666, + "learning_rate": 0.0001, + "loss": 6.2344, + "loss/crossentropy": 2.630827307701111, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.19785429537296295, + "step": 7506 + }, + { + "epoch": 0.234625, + "grad_norm": 3.5625, + "grad_norm_var": 0.1081207275390625, + "learning_rate": 0.0001, + "loss": 5.9919, + "loss/crossentropy": 2.440385937690735, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19225788116455078, + "step": 7508 + }, + { + "epoch": 0.2346875, + "grad_norm": 3.484375, + "grad_norm_var": 0.10650634765625, + "learning_rate": 0.0001, + "loss": 5.8651, + "loss/crossentropy": 2.441224455833435, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1896503046154976, + "step": 7510 + }, + { + "epoch": 0.23475, + "grad_norm": 3.65625, + "grad_norm_var": 0.10657145182291666, + "learning_rate": 0.0001, + "loss": 6.3991, + "loss/crossentropy": 2.7317874431610107, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.2073518931865692, + "step": 7512 + }, + { + "epoch": 0.2348125, + "grad_norm": 3.453125, + "grad_norm_var": 0.09326070149739583, + "learning_rate": 0.0001, + "loss": 6.1689, + "loss/crossentropy": 2.6027718782424927, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.1956755369901657, + "step": 7514 + }, + { + "epoch": 0.234875, + "grad_norm": 3.546875, + "grad_norm_var": 0.08765869140625, + "learning_rate": 0.0001, + "loss": 6.0401, + "loss/crossentropy": 2.4928770065307617, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19182778894901276, + "step": 7516 + }, + { + "epoch": 0.2349375, + "grad_norm": 3.390625, + "grad_norm_var": 0.07895406087239583, + "learning_rate": 0.0001, + "loss": 6.2329, + "loss/crossentropy": 2.7454384565353394, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19132785499095917, + "step": 7518 + }, + { + "epoch": 0.235, + "grad_norm": 3.40625, + "grad_norm_var": 0.07144775390625, + "learning_rate": 0.0001, + "loss": 5.9715, + "loss/crossentropy": 2.5534324645996094, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18360119313001633, + "step": 7520 + }, + { + "epoch": 0.2350625, + "grad_norm": 3.5, + "grad_norm_var": 0.032933553059895836, + "learning_rate": 0.0001, + "loss": 6.2586, + "loss/crossentropy": 2.6766849756240845, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.2007722333073616, + "step": 7522 + }, + { + "epoch": 0.235125, + "grad_norm": 3.40625, + "grad_norm_var": 0.0172271728515625, + "learning_rate": 0.0001, + "loss": 6.083, + "loss/crossentropy": 2.5798628330230713, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.18898801505565643, + "step": 7524 + }, + { + "epoch": 0.2351875, + "grad_norm": 3.765625, + "grad_norm_var": 0.021370442708333333, + "learning_rate": 0.0001, + "loss": 6.1985, + "loss/crossentropy": 2.5955700874328613, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.20208538323640823, + "step": 7526 + }, + { + "epoch": 0.23525, + "grad_norm": 3.359375, + "grad_norm_var": 0.023176066080729165, + "learning_rate": 0.0001, + "loss": 6.181, + "loss/crossentropy": 2.667165517807007, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19552771747112274, + "step": 7528 + }, + { + "epoch": 0.2353125, + "grad_norm": 3.234375, + "grad_norm_var": 0.03821614583333333, + "learning_rate": 0.0001, + "loss": 5.8642, + "loss/crossentropy": 2.4298404455184937, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18601436913013458, + "step": 7530 + }, + { + "epoch": 0.235375, + "grad_norm": 3.78125, + "grad_norm_var": 0.040327962239583334, + "learning_rate": 0.0001, + "loss": 6.4814, + "loss/crossentropy": 2.8501453399658203, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.19828537851572037, + "step": 7532 + }, + { + "epoch": 0.2354375, + "grad_norm": 3.5625, + "grad_norm_var": 0.04738667805989583, + "learning_rate": 0.0001, + "loss": 6.0067, + "loss/crossentropy": 2.5832111835479736, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18453294783830643, + "step": 7534 + }, + { + "epoch": 0.2355, + "grad_norm": 3.25, + "grad_norm_var": 0.04707743326822917, + "learning_rate": 0.0001, + "loss": 6.0754, + "loss/crossentropy": 2.593501329421997, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19154705107212067, + "step": 7536 + }, + { + "epoch": 0.2355625, + "grad_norm": 3.625, + "grad_norm_var": 0.04840494791666667, + "learning_rate": 0.0001, + "loss": 6.1885, + "loss/crossentropy": 2.6841200590133667, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1941901594400406, + "step": 7538 + }, + { + "epoch": 0.235625, + "grad_norm": 3.6875, + "grad_norm_var": 0.050226847330729164, + "learning_rate": 0.0001, + "loss": 5.701, + "loss/crossentropy": 2.362775444984436, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.17288699746131897, + "step": 7540 + }, + { + "epoch": 0.2356875, + "grad_norm": 3.546875, + "grad_norm_var": 0.046418253580729166, + "learning_rate": 0.0001, + "loss": 5.9492, + "loss/crossentropy": 2.4750373363494873, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18921273946762085, + "step": 7542 + }, + { + "epoch": 0.23575, + "grad_norm": 3.09375, + "grad_norm_var": 0.05595601399739583, + "learning_rate": 0.0001, + "loss": 5.785, + "loss/crossentropy": 2.453348994255066, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17652620375156403, + "step": 7544 + }, + { + "epoch": 0.2358125, + "grad_norm": 5.84375, + "grad_norm_var": 0.39527994791666665, + "learning_rate": 0.0001, + "loss": 5.9636, + "loss/crossentropy": 2.440808653831482, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.19055798649787903, + "step": 7546 + }, + { + "epoch": 0.235875, + "grad_norm": 3.5625, + "grad_norm_var": 0.3976715087890625, + "learning_rate": 0.0001, + "loss": 5.9528, + "loss/crossentropy": 2.4879361391067505, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1886746659874916, + "step": 7548 + }, + { + "epoch": 0.2359375, + "grad_norm": 3.390625, + "grad_norm_var": 0.38782145182291666, + "learning_rate": 0.0001, + "loss": 6.2643, + "loss/crossentropy": 2.6960970163345337, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19666466116905212, + "step": 7550 + }, + { + "epoch": 0.236, + "grad_norm": 3.1875, + "grad_norm_var": 0.3906402587890625, + "learning_rate": 0.0001, + "loss": 5.9808, + "loss/crossentropy": 2.5641645193099976, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18463445454835892, + "step": 7552 + }, + { + "epoch": 0.2360625, + "grad_norm": 3.9375, + "grad_norm_var": 0.4043121337890625, + "learning_rate": 0.0001, + "loss": 6.465, + "loss/crossentropy": 2.766203284263611, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20581817626953125, + "step": 7554 + }, + { + "epoch": 0.236125, + "grad_norm": 3.609375, + "grad_norm_var": 0.4017405192057292, + "learning_rate": 0.0001, + "loss": 6.0331, + "loss/crossentropy": 2.4767733812332153, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19821280241012573, + "step": 7556 + }, + { + "epoch": 0.2361875, + "grad_norm": 3.40625, + "grad_norm_var": 0.4049763997395833, + "learning_rate": 0.0001, + "loss": 6.0414, + "loss/crossentropy": 2.5478110313415527, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.18841978907585144, + "step": 7558 + }, + { + "epoch": 0.23625, + "grad_norm": 3.296875, + "grad_norm_var": 0.3839670817057292, + "learning_rate": 0.0001, + "loss": 5.8884, + "loss/crossentropy": 2.4817367792129517, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.17855586111545563, + "step": 7560 + }, + { + "epoch": 0.2363125, + "grad_norm": 3.5625, + "grad_norm_var": 0.03857014973958333, + "learning_rate": 0.0001, + "loss": 5.6971, + "loss/crossentropy": 2.261039435863495, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18852870911359787, + "step": 7562 + }, + { + "epoch": 0.236375, + "grad_norm": 3.171875, + "grad_norm_var": 0.0419830322265625, + "learning_rate": 0.0001, + "loss": 5.9997, + "loss/crossentropy": 2.6095385551452637, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18550091981887817, + "step": 7564 + }, + { + "epoch": 0.2364375, + "grad_norm": 3.4375, + "grad_norm_var": 0.0416015625, + "learning_rate": 0.0001, + "loss": 6.0966, + "loss/crossentropy": 2.6820207834243774, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18715624511241913, + "step": 7566 + }, + { + "epoch": 0.2365, + "grad_norm": 3.59375, + "grad_norm_var": 0.043294270833333336, + "learning_rate": 0.0001, + "loss": 6.2628, + "loss/crossentropy": 2.7443941831588745, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19442252069711685, + "step": 7568 + }, + { + "epoch": 0.2365625, + "grad_norm": 3.53125, + "grad_norm_var": 0.027440388997395832, + "learning_rate": 0.0001, + "loss": 6.3244, + "loss/crossentropy": 2.7657259702682495, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19532336294651031, + "step": 7570 + }, + { + "epoch": 0.236625, + "grad_norm": 3.15625, + "grad_norm_var": 0.025423177083333335, + "learning_rate": 0.0001, + "loss": 5.5643, + "loss/crossentropy": 2.2630890607833862, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1738731563091278, + "step": 7572 + }, + { + "epoch": 0.2366875, + "grad_norm": 3.625, + "grad_norm_var": 0.028180948893229165, + "learning_rate": 0.0001, + "loss": 6.4293, + "loss/crossentropy": 2.828166961669922, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.2015225887298584, + "step": 7574 + }, + { + "epoch": 0.23675, + "grad_norm": 3.265625, + "grad_norm_var": 0.024153645833333334, + "learning_rate": 0.0001, + "loss": 5.8275, + "loss/crossentropy": 2.4868907928466797, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18054601550102234, + "step": 7576 + }, + { + "epoch": 0.2368125, + "grad_norm": 4.34375, + "grad_norm_var": 0.08116861979166666, + "learning_rate": 0.0001, + "loss": 6.1035, + "loss/crossentropy": 2.4918447732925415, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.1963195875287056, + "step": 7578 + }, + { + "epoch": 0.236875, + "grad_norm": 3.734375, + "grad_norm_var": 0.08313802083333334, + "learning_rate": 0.0001, + "loss": 5.715, + "loss/crossentropy": 2.3960211277008057, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1799403429031372, + "step": 7580 + }, + { + "epoch": 0.2369375, + "grad_norm": 3.28125, + "grad_norm_var": 0.08834228515625, + "learning_rate": 0.0001, + "loss": 5.9184, + "loss/crossentropy": 2.5109978914260864, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18644145131111145, + "step": 7582 + }, + { + "epoch": 0.237, + "grad_norm": 3.109375, + "grad_norm_var": 0.0928619384765625, + "learning_rate": 0.0001, + "loss": 5.6529, + "loss/crossentropy": 2.3433854579925537, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1770445704460144, + "step": 7584 + }, + { + "epoch": 0.2370625, + "grad_norm": 3.609375, + "grad_norm_var": 0.09990234375, + "learning_rate": 0.0001, + "loss": 6.3634, + "loss/crossentropy": 2.692499876022339, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20458614826202393, + "step": 7586 + }, + { + "epoch": 0.237125, + "grad_norm": 3.625, + "grad_norm_var": 0.10044657389322917, + "learning_rate": 0.0001, + "loss": 6.2058, + "loss/crossentropy": 2.702503204345703, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1917399913072586, + "step": 7588 + }, + { + "epoch": 0.2371875, + "grad_norm": 3.75, + "grad_norm_var": 0.10458882649739583, + "learning_rate": 0.0001, + "loss": 6.1442, + "loss/crossentropy": 2.5761271715164185, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.19469749182462692, + "step": 7590 + }, + { + "epoch": 0.23725, + "grad_norm": 3.78125, + "grad_norm_var": 0.10366109212239584, + "learning_rate": 0.0001, + "loss": 6.0064, + "loss/crossentropy": 2.5359818935394287, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1907963976264, + "step": 7592 + }, + { + "epoch": 0.2373125, + "grad_norm": 3.390625, + "grad_norm_var": 0.055475870768229164, + "learning_rate": 0.0001, + "loss": 6.1073, + "loss/crossentropy": 2.6080822944641113, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1932821273803711, + "step": 7594 + }, + { + "epoch": 0.237375, + "grad_norm": 3.4375, + "grad_norm_var": 0.048371378580729166, + "learning_rate": 0.0001, + "loss": 5.8831, + "loss/crossentropy": 2.4336295127868652, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18713627755641937, + "step": 7596 + }, + { + "epoch": 0.2374375, + "grad_norm": 3.296875, + "grad_norm_var": 0.04436442057291667, + "learning_rate": 0.0001, + "loss": 5.7833, + "loss/crossentropy": 2.2970622777938843, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1935415342450142, + "step": 7598 + }, + { + "epoch": 0.2375, + "grad_norm": 3.390625, + "grad_norm_var": 0.03173421223958333, + "learning_rate": 0.0001, + "loss": 5.8934, + "loss/crossentropy": 2.5146186351776123, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18475797772407532, + "step": 7600 + }, + { + "epoch": 0.2375625, + "grad_norm": 3.59375, + "grad_norm_var": 0.03154195149739583, + "learning_rate": 0.0001, + "loss": 6.3939, + "loss/crossentropy": 2.7156718969345093, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.20805390179157257, + "step": 7602 + }, + { + "epoch": 0.237625, + "grad_norm": 3.546875, + "grad_norm_var": 0.0245025634765625, + "learning_rate": 0.0001, + "loss": 6.2358, + "loss/crossentropy": 2.6067934036254883, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.20508348941802979, + "step": 7604 + }, + { + "epoch": 0.2376875, + "grad_norm": 3.71875, + "grad_norm_var": 0.0274566650390625, + "learning_rate": 0.0001, + "loss": 6.0435, + "loss/crossentropy": 2.4407546520233154, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.19855981320142746, + "step": 7606 + }, + { + "epoch": 0.23775, + "grad_norm": 3.34375, + "grad_norm_var": 0.0262115478515625, + "learning_rate": 0.0001, + "loss": 6.3214, + "loss/crossentropy": 2.7076494693756104, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.1996530145406723, + "step": 7608 + }, + { + "epoch": 0.2378125, + "grad_norm": 3.5, + "grad_norm_var": 0.028197224934895834, + "learning_rate": 0.0001, + "loss": 6.2234, + "loss/crossentropy": 2.6549901962280273, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1978522315621376, + "step": 7610 + }, + { + "epoch": 0.237875, + "grad_norm": 3.421875, + "grad_norm_var": 0.0282135009765625, + "learning_rate": 0.0001, + "loss": 6.0882, + "loss/crossentropy": 2.6244239807128906, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1920793578028679, + "step": 7612 + }, + { + "epoch": 0.2379375, + "grad_norm": 4.75, + "grad_norm_var": 0.11799723307291667, + "learning_rate": 0.0001, + "loss": 6.0148, + "loss/crossentropy": 2.5746607780456543, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1901114359498024, + "step": 7614 + }, + { + "epoch": 0.238, + "grad_norm": 3.4375, + "grad_norm_var": 0.11655985514322917, + "learning_rate": 0.0001, + "loss": 5.8119, + "loss/crossentropy": 2.3741531372070312, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1875242292881012, + "step": 7616 + }, + { + "epoch": 0.2380625, + "grad_norm": 3.265625, + "grad_norm_var": 0.1244293212890625, + "learning_rate": 0.0001, + "loss": 6.1934, + "loss/crossentropy": 2.69778573513031, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19097129255533218, + "step": 7618 + }, + { + "epoch": 0.238125, + "grad_norm": 3.578125, + "grad_norm_var": 0.12808329264322918, + "learning_rate": 0.0001, + "loss": 6.1297, + "loss/crossentropy": 2.5959811210632324, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19751714169979095, + "step": 7620 + }, + { + "epoch": 0.2381875, + "grad_norm": 5.75, + "grad_norm_var": 0.43488667805989584, + "learning_rate": 0.0001, + "loss": 6.3296, + "loss/crossentropy": 2.6300052404403687, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.20628874748945236, + "step": 7622 + }, + { + "epoch": 0.23825, + "grad_norm": 3.390625, + "grad_norm_var": 0.4568430582682292, + "learning_rate": 0.0001, + "loss": 5.5186, + "loss/crossentropy": 2.2347458600997925, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.17096569389104843, + "step": 7624 + }, + { + "epoch": 0.2383125, + "grad_norm": 3.8125, + "grad_norm_var": 0.4613596598307292, + "learning_rate": 0.0001, + "loss": 6.3385, + "loss/crossentropy": 2.8292057514190674, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.18921514600515366, + "step": 7626 + }, + { + "epoch": 0.238375, + "grad_norm": 3.359375, + "grad_norm_var": 0.46288655598958334, + "learning_rate": 0.0001, + "loss": 5.9493, + "loss/crossentropy": 2.499138593673706, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18915607780218124, + "step": 7628 + }, + { + "epoch": 0.2384375, + "grad_norm": 3.453125, + "grad_norm_var": 0.5111480712890625, + "learning_rate": 0.0001, + "loss": 6.0143, + "loss/crossentropy": 2.4007151126861572, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.19572950899600983, + "step": 7630 + }, + { + "epoch": 0.2385, + "grad_norm": 3.1875, + "grad_norm_var": 0.5228342692057292, + "learning_rate": 0.0001, + "loss": 6.1161, + "loss/crossentropy": 2.6703180074691772, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1902800351381302, + "step": 7632 + }, + { + "epoch": 0.2385625, + "grad_norm": 3.921875, + "grad_norm_var": 0.539404296875, + "learning_rate": 0.0001, + "loss": 5.9397, + "loss/crossentropy": 2.48610520362854, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.18442490696907043, + "step": 7634 + }, + { + "epoch": 0.238625, + "grad_norm": 3.90625, + "grad_norm_var": 0.5344034830729166, + "learning_rate": 0.0001, + "loss": 6.1724, + "loss/crossentropy": 2.52171790599823, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.20334511995315552, + "step": 7636 + }, + { + "epoch": 0.2386875, + "grad_norm": 3.515625, + "grad_norm_var": 0.23290608723958334, + "learning_rate": 0.0001, + "loss": 5.7379, + "loss/crossentropy": 2.3531194925308228, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18340447545051575, + "step": 7638 + }, + { + "epoch": 0.23875, + "grad_norm": 3.453125, + "grad_norm_var": 0.21437886555989583, + "learning_rate": 0.0001, + "loss": 6.1048, + "loss/crossentropy": 2.5537298917770386, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19456487894058228, + "step": 7640 + }, + { + "epoch": 0.2388125, + "grad_norm": 3.3125, + "grad_norm_var": 0.21537984212239583, + "learning_rate": 0.0001, + "loss": 6.0609, + "loss/crossentropy": 2.5692453384399414, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19096451997756958, + "step": 7642 + }, + { + "epoch": 0.238875, + "grad_norm": 3.40625, + "grad_norm_var": 0.21619466145833333, + "learning_rate": 0.0001, + "loss": 6.1539, + "loss/crossentropy": 2.62418270111084, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19672146439552307, + "step": 7644 + }, + { + "epoch": 0.2389375, + "grad_norm": 3.421875, + "grad_norm_var": 0.057450358072916666, + "learning_rate": 0.0001, + "loss": 6.0482, + "loss/crossentropy": 2.5824190378189087, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.19149936735630035, + "step": 7646 + }, + { + "epoch": 0.239, + "grad_norm": 3.078125, + "grad_norm_var": 0.06647847493489584, + "learning_rate": 0.0001, + "loss": 5.9723, + "loss/crossentropy": 2.5797115564346313, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.17832079529762268, + "step": 7648 + }, + { + "epoch": 0.2390625, + "grad_norm": 3.4375, + "grad_norm_var": 0.04267171223958333, + "learning_rate": 0.0001, + "loss": 6.286, + "loss/crossentropy": 2.747970700263977, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.1936463937163353, + "step": 7650 + }, + { + "epoch": 0.239125, + "grad_norm": 3.5625, + "grad_norm_var": 0.030777994791666666, + "learning_rate": 0.0001, + "loss": 5.9269, + "loss/crossentropy": 2.3935400247573853, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19278503954410553, + "step": 7652 + }, + { + "epoch": 0.2391875, + "grad_norm": 3.796875, + "grad_norm_var": 0.0376861572265625, + "learning_rate": 0.0001, + "loss": 6.1596, + "loss/crossentropy": 2.6391823291778564, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19462397694587708, + "step": 7654 + }, + { + "epoch": 0.23925, + "grad_norm": 3.4375, + "grad_norm_var": 0.0478668212890625, + "learning_rate": 0.0001, + "loss": 5.9814, + "loss/crossentropy": 2.4334522485733032, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19463550299406052, + "step": 7656 + }, + { + "epoch": 0.2393125, + "grad_norm": 4.0625, + "grad_norm_var": 0.06364644368489583, + "learning_rate": 0.0001, + "loss": 6.2801, + "loss/crossentropy": 2.685059428215027, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.197001650929451, + "step": 7658 + }, + { + "epoch": 0.239375, + "grad_norm": 3.140625, + "grad_norm_var": 0.07219950358072917, + "learning_rate": 0.0001, + "loss": 6.1138, + "loss/crossentropy": 2.6333361864089966, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19102010130882263, + "step": 7660 + }, + { + "epoch": 0.2394375, + "grad_norm": 3.5, + "grad_norm_var": 0.06955464680989583, + "learning_rate": 0.0001, + "loss": 6.0346, + "loss/crossentropy": 2.51070237159729, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19575046002864838, + "step": 7662 + }, + { + "epoch": 0.2395, + "grad_norm": 3.359375, + "grad_norm_var": 0.05730794270833333, + "learning_rate": 0.0001, + "loss": 6.019, + "loss/crossentropy": 2.5976483821868896, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1893959939479828, + "step": 7664 + }, + { + "epoch": 0.2395625, + "grad_norm": 3.21875, + "grad_norm_var": 0.06433919270833334, + "learning_rate": 0.0001, + "loss": 5.9062, + "loss/crossentropy": 2.5433908700942993, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1796429306268692, + "step": 7666 + }, + { + "epoch": 0.239625, + "grad_norm": 3.375, + "grad_norm_var": 0.0742095947265625, + "learning_rate": 0.0001, + "loss": 6.44, + "loss/crossentropy": 2.791746497154236, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.2038840353488922, + "step": 7668 + }, + { + "epoch": 0.2396875, + "grad_norm": 3.59375, + "grad_norm_var": 0.090673828125, + "learning_rate": 0.0001, + "loss": 5.9489, + "loss/crossentropy": 2.4165494441986084, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19034140557050705, + "step": 7670 + }, + { + "epoch": 0.23975, + "grad_norm": 3.5, + "grad_norm_var": 0.0778961181640625, + "learning_rate": 0.0001, + "loss": 6.1395, + "loss/crossentropy": 2.613227963447571, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.18934263288974762, + "step": 7672 + }, + { + "epoch": 0.2398125, + "grad_norm": 4.09375, + "grad_norm_var": 0.08218994140625, + "learning_rate": 0.0001, + "loss": 6.4134, + "loss/crossentropy": 2.7667269706726074, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.2033383771777153, + "step": 7674 + }, + { + "epoch": 0.239875, + "grad_norm": 3.5, + "grad_norm_var": 0.08207906087239583, + "learning_rate": 0.0001, + "loss": 5.8618, + "loss/crossentropy": 2.4750369787216187, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.17969341576099396, + "step": 7676 + }, + { + "epoch": 0.2399375, + "grad_norm": 3.375, + "grad_norm_var": 0.08401285807291667, + "learning_rate": 0.0001, + "loss": 6.0187, + "loss/crossentropy": 2.508861541748047, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19200244545936584, + "step": 7678 + }, + { + "epoch": 0.24, + "grad_norm": 3.328125, + "grad_norm_var": 0.08294270833333334, + "learning_rate": 0.0001, + "loss": 6.365, + "loss/crossentropy": 2.7623302936553955, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.19815552234649658, + "step": 7680 + }, + { + "epoch": 0.2400625, + "grad_norm": 3.625, + "grad_norm_var": 0.07265523274739584, + "learning_rate": 0.0001, + "loss": 6.0794, + "loss/crossentropy": 2.5649800300598145, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1936321035027504, + "step": 7682 + }, + { + "epoch": 0.240125, + "grad_norm": 3.40625, + "grad_norm_var": 0.07997945149739584, + "learning_rate": 0.0001, + "loss": 5.8903, + "loss/crossentropy": 2.523725152015686, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18274738639593124, + "step": 7684 + }, + { + "epoch": 0.2401875, + "grad_norm": 3.5, + "grad_norm_var": 0.05571187337239583, + "learning_rate": 0.0001, + "loss": 6.3296, + "loss/crossentropy": 2.7412075996398926, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19907841831445694, + "step": 7686 + }, + { + "epoch": 0.24025, + "grad_norm": 3.828125, + "grad_norm_var": 0.06308186848958333, + "learning_rate": 0.0001, + "loss": 6.2066, + "loss/crossentropy": 2.6145856380462646, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.20021675527095795, + "step": 7688 + }, + { + "epoch": 0.2403125, + "grad_norm": 3.265625, + "grad_norm_var": 0.03584696451822917, + "learning_rate": 0.0001, + "loss": 6.1771, + "loss/crossentropy": 2.6702972650527954, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19169631600379944, + "step": 7690 + }, + { + "epoch": 0.240375, + "grad_norm": 3.71875, + "grad_norm_var": 0.03361714680989583, + "learning_rate": 0.0001, + "loss": 6.3533, + "loss/crossentropy": 2.7222691774368286, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.20255489647388458, + "step": 7692 + }, + { + "epoch": 0.2404375, + "grad_norm": 3.1875, + "grad_norm_var": 0.0396636962890625, + "learning_rate": 0.0001, + "loss": 5.9519, + "loss/crossentropy": 2.548743963241577, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1848517507314682, + "step": 7694 + }, + { + "epoch": 0.2405, + "grad_norm": 3.546875, + "grad_norm_var": 0.04666341145833333, + "learning_rate": 0.0001, + "loss": 6.2107, + "loss/crossentropy": 2.63839328289032, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19668738543987274, + "step": 7696 + }, + { + "epoch": 0.2405625, + "grad_norm": 3.9375, + "grad_norm_var": 0.05812886555989583, + "learning_rate": 0.0001, + "loss": 6.2819, + "loss/crossentropy": 2.62613844871521, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.20737700909376144, + "step": 7698 + }, + { + "epoch": 0.240625, + "grad_norm": 4.03125, + "grad_norm_var": 0.059056599934895836, + "learning_rate": 0.0001, + "loss": 6.3754, + "loss/crossentropy": 2.713670492172241, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.20484429597854614, + "step": 7700 + }, + { + "epoch": 0.2406875, + "grad_norm": 3.3125, + "grad_norm_var": 0.060595703125, + "learning_rate": 0.0001, + "loss": 6.2529, + "loss/crossentropy": 2.7103840112686157, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19448387622833252, + "step": 7702 + }, + { + "epoch": 0.24075, + "grad_norm": 3.328125, + "grad_norm_var": 0.06695556640625, + "learning_rate": 0.0001, + "loss": 5.7694, + "loss/crossentropy": 2.4169265031814575, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17977654933929443, + "step": 7704 + }, + { + "epoch": 0.2408125, + "grad_norm": 4.28125, + "grad_norm_var": 0.10196940104166667, + "learning_rate": 0.0001, + "loss": 6.2524, + "loss/crossentropy": 2.4799392223358154, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.21396034210920334, + "step": 7706 + }, + { + "epoch": 0.240875, + "grad_norm": 3.28125, + "grad_norm_var": 0.11464436848958333, + "learning_rate": 0.0001, + "loss": 5.6104, + "loss/crossentropy": 2.3208820819854736, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17504741251468658, + "step": 7708 + }, + { + "epoch": 0.2409375, + "grad_norm": 3.078125, + "grad_norm_var": 0.12100321451822917, + "learning_rate": 0.0001, + "loss": 5.7849, + "loss/crossentropy": 2.4717557430267334, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17740823328495026, + "step": 7710 + }, + { + "epoch": 0.241, + "grad_norm": 3.46875, + "grad_norm_var": 0.12170308430989583, + "learning_rate": 0.0001, + "loss": 5.9905, + "loss/crossentropy": 2.655658006668091, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1795746013522148, + "step": 7712 + }, + { + "epoch": 0.2410625, + "grad_norm": 41.0, + "grad_norm_var": 88.06312561035156, + "learning_rate": 0.0001, + "loss": 7.0637, + "loss/crossentropy": 2.5289785861968994, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.2831588163971901, + "step": 7714 + }, + { + "epoch": 0.241125, + "grad_norm": 3.890625, + "grad_norm_var": 87.94371744791667, + "learning_rate": 0.0001, + "loss": 6.1867, + "loss/crossentropy": 2.650020718574524, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19585387408733368, + "step": 7716 + }, + { + "epoch": 0.2411875, + "grad_norm": 3.875, + "grad_norm_var": 87.73944396972657, + "learning_rate": 0.0001, + "loss": 6.2797, + "loss/crossentropy": 2.657504081726074, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.20167139172554016, + "step": 7718 + }, + { + "epoch": 0.24125, + "grad_norm": 3.625, + "grad_norm_var": 87.48702799479166, + "learning_rate": 0.0001, + "loss": 6.3552, + "loss/crossentropy": 2.6477073431015015, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.20590269565582275, + "step": 7720 + }, + { + "epoch": 0.2413125, + "grad_norm": 3.34375, + "grad_norm_var": 87.8251454671224, + "learning_rate": 0.0001, + "loss": 6.2522, + "loss/crossentropy": 2.693089723587036, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19692938029766083, + "step": 7722 + }, + { + "epoch": 0.241375, + "grad_norm": 3.046875, + "grad_norm_var": 87.7824208577474, + "learning_rate": 0.0001, + "loss": 5.8971, + "loss/crossentropy": 2.472025990486145, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1870427280664444, + "step": 7724 + }, + { + "epoch": 0.2414375, + "grad_norm": 3.328125, + "grad_norm_var": 87.68208719889323, + "learning_rate": 0.0001, + "loss": 6.0263, + "loss/crossentropy": 2.6007825136184692, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.18083418905735016, + "step": 7726 + }, + { + "epoch": 0.2415, + "grad_norm": 3.359375, + "grad_norm_var": 87.63302408854166, + "learning_rate": 0.0001, + "loss": 6.2469, + "loss/crossentropy": 2.689045548439026, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19797423481941223, + "step": 7728 + }, + { + "epoch": 0.2415625, + "grad_norm": 3.171875, + "grad_norm_var": 0.08767903645833333, + "learning_rate": 0.0001, + "loss": 5.662, + "loss/crossentropy": 2.308548331260681, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.17558036744594574, + "step": 7730 + }, + { + "epoch": 0.241625, + "grad_norm": 3.5625, + "grad_norm_var": 0.058568318684895836, + "learning_rate": 0.0001, + "loss": 6.1883, + "loss/crossentropy": 2.5779424905776978, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.2024448812007904, + "step": 7732 + }, + { + "epoch": 0.2416875, + "grad_norm": 3.359375, + "grad_norm_var": 0.05073140462239583, + "learning_rate": 0.0001, + "loss": 6.0548, + "loss/crossentropy": 2.5732924938201904, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.1883901134133339, + "step": 7734 + }, + { + "epoch": 0.24175, + "grad_norm": 3.65625, + "grad_norm_var": 0.050455729166666664, + "learning_rate": 0.0001, + "loss": 5.964, + "loss/crossentropy": 2.459815263748169, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19221822917461395, + "step": 7736 + }, + { + "epoch": 0.2418125, + "grad_norm": 3.34375, + "grad_norm_var": 0.0482330322265625, + "learning_rate": 0.0001, + "loss": 6.1748, + "loss/crossentropy": 2.7405179738998413, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18718285858631134, + "step": 7738 + }, + { + "epoch": 0.241875, + "grad_norm": 3.765625, + "grad_norm_var": 0.04411519368489583, + "learning_rate": 0.0001, + "loss": 6.0505, + "loss/crossentropy": 2.4628392457962036, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19783060252666473, + "step": 7740 + }, + { + "epoch": 0.2419375, + "grad_norm": 3.609375, + "grad_norm_var": 0.04273173014322917, + "learning_rate": 0.0001, + "loss": 6.2276, + "loss/crossentropy": 2.6094053983688354, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.20048777014017105, + "step": 7742 + }, + { + "epoch": 0.242, + "grad_norm": 3.71875, + "grad_norm_var": 0.04364827473958333, + "learning_rate": 0.0001, + "loss": 6.2317, + "loss/crossentropy": 2.5628308057785034, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.2047729715704918, + "step": 7744 + }, + { + "epoch": 0.2420625, + "grad_norm": 3.5, + "grad_norm_var": 0.032486979166666666, + "learning_rate": 0.0001, + "loss": 6.142, + "loss/crossentropy": 2.5770037174224854, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.1943896859884262, + "step": 7746 + }, + { + "epoch": 0.242125, + "grad_norm": 3.1875, + "grad_norm_var": 0.031083170572916666, + "learning_rate": 0.0001, + "loss": 5.9243, + "loss/crossentropy": 2.5422744750976562, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18546995520591736, + "step": 7748 + }, + { + "epoch": 0.2421875, + "grad_norm": 3.515625, + "grad_norm_var": 0.028514607747395834, + "learning_rate": 0.0001, + "loss": 5.6252, + "loss/crossentropy": 2.283834457397461, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.17437529563903809, + "step": 7750 + }, + { + "epoch": 0.24225, + "grad_norm": 3.578125, + "grad_norm_var": 0.025634765625, + "learning_rate": 0.0001, + "loss": 6.4153, + "loss/crossentropy": 2.743710160255432, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20465648919343948, + "step": 7752 + }, + { + "epoch": 0.2423125, + "grad_norm": 3.484375, + "grad_norm_var": 0.0237213134765625, + "learning_rate": 0.0001, + "loss": 6.0075, + "loss/crossentropy": 2.5681275129318237, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1876872330904007, + "step": 7754 + }, + { + "epoch": 0.242375, + "grad_norm": 3.828125, + "grad_norm_var": 0.025690714518229168, + "learning_rate": 0.0001, + "loss": 6.4399, + "loss/crossentropy": 2.7721768617630005, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20271114259958267, + "step": 7756 + }, + { + "epoch": 0.2424375, + "grad_norm": 3.359375, + "grad_norm_var": 0.026545206705729168, + "learning_rate": 0.0001, + "loss": 6.0202, + "loss/crossentropy": 2.6353739500045776, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18223249912261963, + "step": 7758 + }, + { + "epoch": 0.2425, + "grad_norm": 3.203125, + "grad_norm_var": 0.026981608072916666, + "learning_rate": 0.0001, + "loss": 6.1179, + "loss/crossentropy": 2.664836049079895, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1863185241818428, + "step": 7760 + }, + { + "epoch": 0.2425625, + "grad_norm": 3.53125, + "grad_norm_var": 0.027958170572916666, + "learning_rate": 0.0001, + "loss": 6.085, + "loss/crossentropy": 2.597185730934143, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19331426173448563, + "step": 7762 + }, + { + "epoch": 0.242625, + "grad_norm": 3.59375, + "grad_norm_var": 0.04345703125, + "learning_rate": 0.0001, + "loss": 6.1179, + "loss/crossentropy": 2.5563639402389526, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19325853884220123, + "step": 7764 + }, + { + "epoch": 0.2426875, + "grad_norm": 3.25, + "grad_norm_var": 0.053343709309895834, + "learning_rate": 0.0001, + "loss": 5.9444, + "loss/crossentropy": 2.4918466806411743, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.18510272353887558, + "step": 7766 + }, + { + "epoch": 0.24275, + "grad_norm": 3.578125, + "grad_norm_var": 0.05282796223958333, + "learning_rate": 0.0001, + "loss": 6.0905, + "loss/crossentropy": 2.6600621938705444, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18601295351982117, + "step": 7768 + }, + { + "epoch": 0.2428125, + "grad_norm": 3.453125, + "grad_norm_var": 0.0508697509765625, + "learning_rate": 0.0001, + "loss": 5.9313, + "loss/crossentropy": 2.4641988277435303, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19046224653720856, + "step": 7770 + }, + { + "epoch": 0.242875, + "grad_norm": 3.484375, + "grad_norm_var": 0.0777740478515625, + "learning_rate": 0.0001, + "loss": 6.0365, + "loss/crossentropy": 2.5514219999313354, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.18952420353889465, + "step": 7772 + }, + { + "epoch": 0.2429375, + "grad_norm": 4.0, + "grad_norm_var": 0.08321024576822916, + "learning_rate": 0.0001, + "loss": 6.1001, + "loss/crossentropy": 2.650992274284363, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18944354355335236, + "step": 7774 + }, + { + "epoch": 0.243, + "grad_norm": 3.453125, + "grad_norm_var": 0.08057352701822916, + "learning_rate": 0.0001, + "loss": 6.1831, + "loss/crossentropy": 2.6317412853240967, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.19302934408187866, + "step": 7776 + }, + { + "epoch": 0.2430625, + "grad_norm": 3.140625, + "grad_norm_var": 0.10263570149739583, + "learning_rate": 0.0001, + "loss": 5.764, + "loss/crossentropy": 2.448301076889038, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17492999881505966, + "step": 7778 + }, + { + "epoch": 0.243125, + "grad_norm": 3.34375, + "grad_norm_var": 0.09371744791666667, + "learning_rate": 0.0001, + "loss": 6.1999, + "loss/crossentropy": 2.648719549179077, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19691617041826248, + "step": 7780 + }, + { + "epoch": 0.2431875, + "grad_norm": 3.09375, + "grad_norm_var": 0.09827473958333334, + "learning_rate": 0.0001, + "loss": 5.8564, + "loss/crossentropy": 2.5009610652923584, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1824200451374054, + "step": 7782 + }, + { + "epoch": 0.24325, + "grad_norm": 3.53125, + "grad_norm_var": 0.10519917805989583, + "learning_rate": 0.0001, + "loss": 5.8692, + "loss/crossentropy": 2.490937113761902, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18235694617033005, + "step": 7784 + }, + { + "epoch": 0.2433125, + "grad_norm": 3.390625, + "grad_norm_var": 0.10624593098958333, + "learning_rate": 0.0001, + "loss": 5.8675, + "loss/crossentropy": 2.485131859779358, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1804227977991104, + "step": 7786 + }, + { + "epoch": 0.243375, + "grad_norm": 3.515625, + "grad_norm_var": 0.0654205322265625, + "learning_rate": 0.0001, + "loss": 6.1414, + "loss/crossentropy": 2.639013171195984, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.1885160133242607, + "step": 7788 + }, + { + "epoch": 0.2434375, + "grad_norm": 3.53125, + "grad_norm_var": 0.049046834309895836, + "learning_rate": 0.0001, + "loss": 5.9523, + "loss/crossentropy": 2.5348676443099976, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.18275688588619232, + "step": 7790 + }, + { + "epoch": 0.2435, + "grad_norm": 4.5, + "grad_norm_var": 0.11607666015625, + "learning_rate": 0.0001, + "loss": 5.8492, + "loss/crossentropy": 2.330635905265808, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19521702826023102, + "step": 7792 + }, + { + "epoch": 0.2435625, + "grad_norm": 3.609375, + "grad_norm_var": 0.11142476399739583, + "learning_rate": 0.0001, + "loss": 6.1485, + "loss/crossentropy": 2.628572463989258, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.19769364595413208, + "step": 7794 + }, + { + "epoch": 0.243625, + "grad_norm": 3.5625, + "grad_norm_var": 0.11405843098958333, + "learning_rate": 0.0001, + "loss": 5.9074, + "loss/crossentropy": 2.4764528274536133, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1891844943165779, + "step": 7796 + }, + { + "epoch": 0.2436875, + "grad_norm": 3.203125, + "grad_norm_var": 0.107421875, + "learning_rate": 0.0001, + "loss": 6.0289, + "loss/crossentropy": 2.5855778455734253, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18730523437261581, + "step": 7798 + }, + { + "epoch": 0.24375, + "grad_norm": 3.25, + "grad_norm_var": 0.1046295166015625, + "learning_rate": 0.0001, + "loss": 6.4235, + "loss/crossentropy": 2.844739079475403, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19888964295387268, + "step": 7800 + }, + { + "epoch": 0.2438125, + "grad_norm": 3.265625, + "grad_norm_var": 0.1085113525390625, + "learning_rate": 0.0001, + "loss": 5.7583, + "loss/crossentropy": 2.369624972343445, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18457041680812836, + "step": 7802 + }, + { + "epoch": 0.243875, + "grad_norm": 3.328125, + "grad_norm_var": 0.119482421875, + "learning_rate": 0.0001, + "loss": 6.0802, + "loss/crossentropy": 2.5831106901168823, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.18916666507720947, + "step": 7804 + }, + { + "epoch": 0.2439375, + "grad_norm": 3.296875, + "grad_norm_var": 0.11793619791666667, + "learning_rate": 0.0001, + "loss": 6.0391, + "loss/crossentropy": 2.6238601207733154, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18683332204818726, + "step": 7806 + }, + { + "epoch": 0.244, + "grad_norm": 3.546875, + "grad_norm_var": 0.0329254150390625, + "learning_rate": 0.0001, + "loss": 5.9501, + "loss/crossentropy": 2.5450098514556885, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18465009331703186, + "step": 7808 + }, + { + "epoch": 0.2440625, + "grad_norm": 3.71875, + "grad_norm_var": 0.03805338541666667, + "learning_rate": 0.0001, + "loss": 6.1418, + "loss/crossentropy": 2.5412479639053345, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19911602139472961, + "step": 7810 + }, + { + "epoch": 0.244125, + "grad_norm": 3.015625, + "grad_norm_var": 0.04783426920572917, + "learning_rate": 0.0001, + "loss": 6.091, + "loss/crossentropy": 2.619031071662903, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19134248048067093, + "step": 7812 + }, + { + "epoch": 0.2441875, + "grad_norm": 3.484375, + "grad_norm_var": 0.059798177083333334, + "learning_rate": 0.0001, + "loss": 6.2009, + "loss/crossentropy": 2.6912925243377686, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19471533596515656, + "step": 7814 + }, + { + "epoch": 0.24425, + "grad_norm": 3.234375, + "grad_norm_var": 0.06214090983072917, + "learning_rate": 0.0001, + "loss": 6.1676, + "loss/crossentropy": 2.704859495162964, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.19119124859571457, + "step": 7816 + }, + { + "epoch": 0.2443125, + "grad_norm": 3.359375, + "grad_norm_var": 0.06272684733072917, + "learning_rate": 0.0001, + "loss": 6.2454, + "loss/crossentropy": 2.7171125411987305, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1989237368106842, + "step": 7818 + }, + { + "epoch": 0.244375, + "grad_norm": 3.390625, + "grad_norm_var": 0.0551910400390625, + "learning_rate": 0.0001, + "loss": 5.6902, + "loss/crossentropy": 2.326253294944763, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1793610379099846, + "step": 7820 + }, + { + "epoch": 0.2444375, + "grad_norm": 3.421875, + "grad_norm_var": 0.06988525390625, + "learning_rate": 0.0001, + "loss": 6.2151, + "loss/crossentropy": 2.7847646474838257, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18756647408008575, + "step": 7822 + }, + { + "epoch": 0.2445, + "grad_norm": 3.421875, + "grad_norm_var": 0.07024739583333334, + "learning_rate": 0.0001, + "loss": 5.9772, + "loss/crossentropy": 2.487933397293091, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.1883789598941803, + "step": 7824 + }, + { + "epoch": 0.2445625, + "grad_norm": 3.34375, + "grad_norm_var": 0.06451822916666666, + "learning_rate": 0.0001, + "loss": 6.0661, + "loss/crossentropy": 2.6811541318893433, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1826338768005371, + "step": 7826 + }, + { + "epoch": 0.244625, + "grad_norm": 3.640625, + "grad_norm_var": 0.05382486979166667, + "learning_rate": 0.0001, + "loss": 6.3543, + "loss/crossentropy": 2.798715114593506, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19578896462917328, + "step": 7828 + }, + { + "epoch": 0.2446875, + "grad_norm": 3.46875, + "grad_norm_var": 0.03632405598958333, + "learning_rate": 0.0001, + "loss": 5.4193, + "loss/crossentropy": 2.237901747226715, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.16267180442810059, + "step": 7830 + }, + { + "epoch": 0.24475, + "grad_norm": 5.375, + "grad_norm_var": 0.27489827473958334, + "learning_rate": 0.0001, + "loss": 6.3888, + "loss/crossentropy": 2.6911579370498657, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.2025742381811142, + "step": 7832 + }, + { + "epoch": 0.2448125, + "grad_norm": 3.359375, + "grad_norm_var": 0.26678059895833334, + "learning_rate": 0.0001, + "loss": 5.8585, + "loss/crossentropy": 2.5180585384368896, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1817043051123619, + "step": 7834 + }, + { + "epoch": 0.244875, + "grad_norm": 3.71875, + "grad_norm_var": 0.2519765218098958, + "learning_rate": 0.0001, + "loss": 6.376, + "loss/crossentropy": 2.7115944623947144, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.20745471864938736, + "step": 7836 + }, + { + "epoch": 0.2449375, + "grad_norm": 3.359375, + "grad_norm_var": 0.25478515625, + "learning_rate": 0.0001, + "loss": 6.0173, + "loss/crossentropy": 2.4863085746765137, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19684798270463943, + "step": 7838 + }, + { + "epoch": 0.245, + "grad_norm": 3.53125, + "grad_norm_var": 0.25222880045572915, + "learning_rate": 0.0001, + "loss": 6.3938, + "loss/crossentropy": 2.752587676048279, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.20552918314933777, + "step": 7840 + }, + { + "epoch": 0.2450625, + "grad_norm": 3.34375, + "grad_norm_var": 0.2506174723307292, + "learning_rate": 0.0001, + "loss": 6.0401, + "loss/crossentropy": 2.57265305519104, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1920556202530861, + "step": 7842 + }, + { + "epoch": 0.245125, + "grad_norm": 3.65625, + "grad_norm_var": 0.2801920572916667, + "learning_rate": 0.0001, + "loss": 6.6359, + "loss/crossentropy": 2.7755597829818726, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.22002330422401428, + "step": 7844 + }, + { + "epoch": 0.2451875, + "grad_norm": 3.640625, + "grad_norm_var": 0.272119140625, + "learning_rate": 0.0001, + "loss": 5.9087, + "loss/crossentropy": 2.4002764225006104, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19029289484024048, + "step": 7846 + }, + { + "epoch": 0.24525, + "grad_norm": 3.6875, + "grad_norm_var": 0.07565816243489583, + "learning_rate": 0.0001, + "loss": 6.2504, + "loss/crossentropy": 2.630833864212036, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.20297091454267502, + "step": 7848 + }, + { + "epoch": 0.2453125, + "grad_norm": 3.5625, + "grad_norm_var": 0.0699127197265625, + "learning_rate": 0.0001, + "loss": 6.2238, + "loss/crossentropy": 2.7208290100097656, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19326822459697723, + "step": 7850 + }, + { + "epoch": 0.245375, + "grad_norm": 3.453125, + "grad_norm_var": 0.07089742024739583, + "learning_rate": 0.0001, + "loss": 6.0403, + "loss/crossentropy": 2.524513602256775, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19532479345798492, + "step": 7852 + }, + { + "epoch": 0.2454375, + "grad_norm": 3.390625, + "grad_norm_var": 0.06756083170572917, + "learning_rate": 0.0001, + "loss": 6.0687, + "loss/crossentropy": 2.5335192680358887, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1957024410367012, + "step": 7854 + }, + { + "epoch": 0.2455, + "grad_norm": 3.703125, + "grad_norm_var": 0.07585347493489583, + "learning_rate": 0.0001, + "loss": 6.5999, + "loss/crossentropy": 2.828696131706238, + "loss/hidden": 1.68359375, + "loss/jsd": 0.0, + "loss/logits": 0.20876126736402512, + "step": 7856 + }, + { + "epoch": 0.2455625, + "grad_norm": 3.25, + "grad_norm_var": 0.0802886962890625, + "learning_rate": 0.0001, + "loss": 5.5055, + "loss/crossentropy": 2.0957219004631042, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.17769645899534225, + "step": 7858 + }, + { + "epoch": 0.245625, + "grad_norm": 3.65625, + "grad_norm_var": 0.0358551025390625, + "learning_rate": 0.0001, + "loss": 6.1333, + "loss/crossentropy": 2.5687731504440308, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19512902200222015, + "step": 7860 + }, + { + "epoch": 0.2456875, + "grad_norm": 3.5, + "grad_norm_var": 0.03453369140625, + "learning_rate": 0.0001, + "loss": 6.2233, + "loss/crossentropy": 2.69309139251709, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19481310993433, + "step": 7862 + }, + { + "epoch": 0.24575, + "grad_norm": 3.53125, + "grad_norm_var": 0.036253865559895834, + "learning_rate": 0.0001, + "loss": 6.0009, + "loss/crossentropy": 2.590876817703247, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18435797840356827, + "step": 7864 + }, + { + "epoch": 0.2458125, + "grad_norm": 3.46875, + "grad_norm_var": 0.0365142822265625, + "learning_rate": 0.0001, + "loss": 5.9163, + "loss/crossentropy": 2.5039474964141846, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18577133119106293, + "step": 7866 + }, + { + "epoch": 0.245875, + "grad_norm": 3.609375, + "grad_norm_var": 0.033600870768229166, + "learning_rate": 0.0001, + "loss": 6.214, + "loss/crossentropy": 2.5934181213378906, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.19603876769542694, + "step": 7868 + }, + { + "epoch": 0.2459375, + "grad_norm": 4.09375, + "grad_norm_var": 0.06715494791666667, + "learning_rate": 0.0001, + "loss": 6.5384, + "loss/crossentropy": 2.8254576921463013, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.20566459000110626, + "step": 7870 + }, + { + "epoch": 0.246, + "grad_norm": 3.234375, + "grad_norm_var": 0.07030843098958334, + "learning_rate": 0.0001, + "loss": 6.0556, + "loss/crossentropy": 2.5720643997192383, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1917138248682022, + "step": 7872 + }, + { + "epoch": 0.2460625, + "grad_norm": 4.90625, + "grad_norm_var": 0.17058003743489583, + "learning_rate": 0.0001, + "loss": 6.1044, + "loss/crossentropy": 2.4462668895721436, + "loss/hidden": 1.703125, + "loss/jsd": 0.0, + "loss/logits": 0.1955041065812111, + "step": 7874 + }, + { + "epoch": 0.246125, + "grad_norm": 3.578125, + "grad_norm_var": 0.1717193603515625, + "learning_rate": 0.0001, + "loss": 5.9914, + "loss/crossentropy": 2.463874340057373, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.18713021278381348, + "step": 7876 + }, + { + "epoch": 0.2461875, + "grad_norm": 3.828125, + "grad_norm_var": 0.19169514973958332, + "learning_rate": 0.0001, + "loss": 5.8841, + "loss/crossentropy": 2.4507832527160645, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18825646489858627, + "step": 7878 + }, + { + "epoch": 0.24625, + "grad_norm": 3.9375, + "grad_norm_var": 0.19039713541666667, + "learning_rate": 0.0001, + "loss": 6.4666, + "loss/crossentropy": 2.8597030639648438, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.20014318823814392, + "step": 7880 + }, + { + "epoch": 0.2463125, + "grad_norm": 3.546875, + "grad_norm_var": 0.18128153483072917, + "learning_rate": 0.0001, + "loss": 6.3076, + "loss/crossentropy": 2.747925639152527, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.1977677345275879, + "step": 7882 + }, + { + "epoch": 0.246375, + "grad_norm": 3.265625, + "grad_norm_var": 0.19488932291666666, + "learning_rate": 0.0001, + "loss": 6.0471, + "loss/crossentropy": 2.5719133615493774, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.18892011791467667, + "step": 7884 + }, + { + "epoch": 0.2464375, + "grad_norm": 3.59375, + "grad_norm_var": 0.18992513020833332, + "learning_rate": 0.0001, + "loss": 6.5645, + "loss/crossentropy": 2.784387230873108, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.21199924498796463, + "step": 7886 + }, + { + "epoch": 0.2465, + "grad_norm": 4.09375, + "grad_norm_var": 0.17419331868489582, + "learning_rate": 0.0001, + "loss": 6.6613, + "loss/crossentropy": 2.934768319129944, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.20742224156856537, + "step": 7888 + }, + { + "epoch": 0.2465625, + "grad_norm": 3.703125, + "grad_norm_var": 0.09788411458333333, + "learning_rate": 0.0001, + "loss": 5.9526, + "loss/crossentropy": 2.491362690925598, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.18752900511026382, + "step": 7890 + }, + { + "epoch": 0.246625, + "grad_norm": 3.359375, + "grad_norm_var": 0.11057535807291667, + "learning_rate": 0.0001, + "loss": 5.7986, + "loss/crossentropy": 2.3962568044662476, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18476200103759766, + "step": 7892 + }, + { + "epoch": 0.2466875, + "grad_norm": 3.21875, + "grad_norm_var": 0.09982096354166667, + "learning_rate": 0.0001, + "loss": 5.9733, + "loss/crossentropy": 2.5740636587142944, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18484096974134445, + "step": 7894 + }, + { + "epoch": 0.24675, + "grad_norm": 3.609375, + "grad_norm_var": 0.1060943603515625, + "learning_rate": 0.0001, + "loss": 5.8972, + "loss/crossentropy": 2.5059421062469482, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1832689419388771, + "step": 7896 + }, + { + "epoch": 0.2468125, + "grad_norm": 3.375, + "grad_norm_var": 0.10416259765625, + "learning_rate": 0.0001, + "loss": 6.1949, + "loss/crossentropy": 2.745757818222046, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18944872915744781, + "step": 7898 + }, + { + "epoch": 0.246875, + "grad_norm": 3.21875, + "grad_norm_var": 0.10508524576822917, + "learning_rate": 0.0001, + "loss": 6.2038, + "loss/crossentropy": 2.656543731689453, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19612840563058853, + "step": 7900 + }, + { + "epoch": 0.2469375, + "grad_norm": 3.140625, + "grad_norm_var": 0.07021382649739584, + "learning_rate": 0.0001, + "loss": 6.0582, + "loss/crossentropy": 2.552359104156494, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.18652494251728058, + "step": 7902 + }, + { + "epoch": 0.247, + "grad_norm": 3.421875, + "grad_norm_var": 0.03429361979166667, + "learning_rate": 0.0001, + "loss": 6.0395, + "loss/crossentropy": 2.514822244644165, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.18957258015871048, + "step": 7904 + }, + { + "epoch": 0.2470625, + "grad_norm": 3.765625, + "grad_norm_var": 0.0343902587890625, + "learning_rate": 0.0001, + "loss": 5.9636, + "loss/crossentropy": 2.5300296545028687, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.18164195120334625, + "step": 7906 + }, + { + "epoch": 0.247125, + "grad_norm": 3.125, + "grad_norm_var": 0.0420562744140625, + "learning_rate": 0.0001, + "loss": 5.3254, + "loss/crossentropy": 2.123539924621582, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.16041796654462814, + "step": 7908 + }, + { + "epoch": 0.2471875, + "grad_norm": 3.375, + "grad_norm_var": 0.04210611979166667, + "learning_rate": 0.0001, + "loss": 6.2468, + "loss/crossentropy": 2.684428095817566, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19803011417388916, + "step": 7910 + }, + { + "epoch": 0.24725, + "grad_norm": 3.5625, + "grad_norm_var": 0.03721415201822917, + "learning_rate": 0.0001, + "loss": 6.1013, + "loss/crossentropy": 2.5559555292129517, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19750570505857468, + "step": 7912 + }, + { + "epoch": 0.2473125, + "grad_norm": 3.359375, + "grad_norm_var": 0.0376953125, + "learning_rate": 0.0001, + "loss": 6.2057, + "loss/crossentropy": 2.7045116424560547, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1942594200372696, + "step": 7914 + }, + { + "epoch": 0.247375, + "grad_norm": 4.0625, + "grad_norm_var": 0.060986328125, + "learning_rate": 0.0001, + "loss": 6.1003, + "loss/crossentropy": 2.5861334800720215, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19282116740942, + "step": 7916 + }, + { + "epoch": 0.2474375, + "grad_norm": 3.984375, + "grad_norm_var": 0.1520904541015625, + "learning_rate": 0.0001, + "loss": 5.808, + "loss/crossentropy": 2.408286929130554, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.1802079826593399, + "step": 7918 + }, + { + "epoch": 0.2475, + "grad_norm": 3.59375, + "grad_norm_var": 0.15461832682291668, + "learning_rate": 0.0001, + "loss": 6.177, + "loss/crossentropy": 2.6627827882766724, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19244200736284256, + "step": 7920 + }, + { + "epoch": 0.2475625, + "grad_norm": 3.0, + "grad_norm_var": 0.17177327473958334, + "learning_rate": 0.0001, + "loss": 5.4008, + "loss/crossentropy": 2.206899642944336, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.16626481711864471, + "step": 7922 + }, + { + "epoch": 0.247625, + "grad_norm": 3.328125, + "grad_norm_var": 0.14933268229166666, + "learning_rate": 0.0001, + "loss": 6.0001, + "loss/crossentropy": 2.5021921396255493, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19315312057733536, + "step": 7924 + }, + { + "epoch": 0.2476875, + "grad_norm": 3.46875, + "grad_norm_var": 0.15714518229166666, + "learning_rate": 0.0001, + "loss": 5.9853, + "loss/crossentropy": 2.622257351875305, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18161460012197495, + "step": 7926 + }, + { + "epoch": 0.24775, + "grad_norm": 3.578125, + "grad_norm_var": 0.15847066243489583, + "learning_rate": 0.0001, + "loss": 5.9218, + "loss/crossentropy": 2.493217349052429, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.18348479270935059, + "step": 7928 + }, + { + "epoch": 0.2478125, + "grad_norm": 3.640625, + "grad_norm_var": 0.16197001139322917, + "learning_rate": 0.0001, + "loss": 5.9928, + "loss/crossentropy": 2.5236148834228516, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19027671217918396, + "step": 7930 + }, + { + "epoch": 0.247875, + "grad_norm": 3.671875, + "grad_norm_var": 0.14403889973958334, + "learning_rate": 0.0001, + "loss": 5.8449, + "loss/crossentropy": 2.3914425373077393, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18831348419189453, + "step": 7932 + }, + { + "epoch": 0.2479375, + "grad_norm": 3.3125, + "grad_norm_var": 0.0432281494140625, + "learning_rate": 0.0001, + "loss": 6.0737, + "loss/crossentropy": 2.6326550245285034, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18941866606473923, + "step": 7934 + }, + { + "epoch": 0.248, + "grad_norm": 3.546875, + "grad_norm_var": 0.04403889973958333, + "learning_rate": 0.0001, + "loss": 6.5346, + "loss/crossentropy": 2.8211774826049805, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.20649944990873337, + "step": 7936 + }, + { + "epoch": 0.2480625, + "grad_norm": 3.453125, + "grad_norm_var": 0.02926025390625, + "learning_rate": 0.0001, + "loss": 6.2807, + "loss/crossentropy": 2.684471607208252, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.2018081322312355, + "step": 7938 + }, + { + "epoch": 0.248125, + "grad_norm": 3.546875, + "grad_norm_var": 0.032567342122395836, + "learning_rate": 0.0001, + "loss": 6.2865, + "loss/crossentropy": 2.6986790895462036, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.1970643773674965, + "step": 7940 + }, + { + "epoch": 0.2481875, + "grad_norm": 3.171875, + "grad_norm_var": 0.039850870768229164, + "learning_rate": 0.0001, + "loss": 5.8474, + "loss/crossentropy": 2.4854358434677124, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18385069072246552, + "step": 7942 + }, + { + "epoch": 0.24825, + "grad_norm": 3.546875, + "grad_norm_var": 0.039208984375, + "learning_rate": 0.0001, + "loss": 6.1029, + "loss/crossentropy": 2.4998894929885864, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.1962408572435379, + "step": 7944 + }, + { + "epoch": 0.2483125, + "grad_norm": 4.5625, + "grad_norm_var": 0.10787353515625, + "learning_rate": 0.0001, + "loss": 6.2775, + "loss/crossentropy": 2.5435431003570557, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.21441563218832016, + "step": 7946 + }, + { + "epoch": 0.248375, + "grad_norm": 3.40625, + "grad_norm_var": 0.10576070149739583, + "learning_rate": 0.0001, + "loss": 6.0459, + "loss/crossentropy": 2.5619404315948486, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19292648136615753, + "step": 7948 + }, + { + "epoch": 0.2484375, + "grad_norm": 3.671875, + "grad_norm_var": 0.1029296875, + "learning_rate": 0.0001, + "loss": 6.0509, + "loss/crossentropy": 2.510421395301819, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19545641541481018, + "step": 7950 + }, + { + "epoch": 0.2485, + "grad_norm": 3.515625, + "grad_norm_var": 0.10670166015625, + "learning_rate": 0.0001, + "loss": 6.0985, + "loss/crossentropy": 2.620503783226013, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19310956448316574, + "step": 7952 + }, + { + "epoch": 0.2485625, + "grad_norm": 3.375, + "grad_norm_var": 0.10779622395833334, + "learning_rate": 0.0001, + "loss": 5.8262, + "loss/crossentropy": 2.4311505556106567, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18715716898441315, + "step": 7954 + }, + { + "epoch": 0.248625, + "grad_norm": 3.328125, + "grad_norm_var": 0.10303446451822916, + "learning_rate": 0.0001, + "loss": 6.1145, + "loss/crossentropy": 2.6362648010253906, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19001401215791702, + "step": 7956 + }, + { + "epoch": 0.2486875, + "grad_norm": 4.5, + "grad_norm_var": 0.15091145833333333, + "learning_rate": 0.0001, + "loss": 6.2873, + "loss/crossentropy": 2.619986891746521, + "loss/hidden": 1.7734375, + "loss/jsd": 0.0, + "loss/logits": 0.1893879696726799, + "step": 7958 + }, + { + "epoch": 0.24875, + "grad_norm": 3.390625, + "grad_norm_var": 0.15543212890625, + "learning_rate": 0.0001, + "loss": 5.8951, + "loss/crossentropy": 2.4806383848190308, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1855819895863533, + "step": 7960 + }, + { + "epoch": 0.2488125, + "grad_norm": 3.203125, + "grad_norm_var": 0.08589579264322916, + "learning_rate": 0.0001, + "loss": 6.2061, + "loss/crossentropy": 2.761041522026062, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18747299909591675, + "step": 7962 + }, + { + "epoch": 0.248875, + "grad_norm": 3.453125, + "grad_norm_var": 0.08677978515625, + "learning_rate": 0.0001, + "loss": 5.8637, + "loss/crossentropy": 2.4276249408721924, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18891657888889313, + "step": 7964 + }, + { + "epoch": 0.2489375, + "grad_norm": 3.5, + "grad_norm_var": 0.08857014973958334, + "learning_rate": 0.0001, + "loss": 6.0556, + "loss/crossentropy": 2.5081448554992676, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19771240651607513, + "step": 7966 + }, + { + "epoch": 0.249, + "grad_norm": 3.765625, + "grad_norm_var": 0.09461263020833334, + "learning_rate": 0.0001, + "loss": 6.067, + "loss/crossentropy": 2.5315128564834595, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19651669263839722, + "step": 7968 + }, + { + "epoch": 0.2490625, + "grad_norm": 3.140625, + "grad_norm_var": 0.10575764973958333, + "learning_rate": 0.0001, + "loss": 5.6147, + "loss/crossentropy": 2.4189276695251465, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16683895140886307, + "step": 7970 + }, + { + "epoch": 0.249125, + "grad_norm": 3.359375, + "grad_norm_var": 0.10559488932291666, + "learning_rate": 0.0001, + "loss": 6.2059, + "loss/crossentropy": 2.5469354391098022, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.20105034857988358, + "step": 7972 + }, + { + "epoch": 0.2491875, + "grad_norm": 7.96875, + "grad_norm_var": 1.3269816080729167, + "learning_rate": 0.0001, + "loss": 6.3564, + "loss/crossentropy": 2.734826445579529, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.19575071334838867, + "step": 7974 + }, + { + "epoch": 0.24925, + "grad_norm": 4.375, + "grad_norm_var": 1.3368123372395833, + "learning_rate": 0.0001, + "loss": 6.09, + "loss/crossentropy": 2.45013964176178, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.19718587398529053, + "step": 7976 + }, + { + "epoch": 0.2493125, + "grad_norm": 3.65625, + "grad_norm_var": 1.3228505452473958, + "learning_rate": 0.0001, + "loss": 6.1342, + "loss/crossentropy": 2.6453663110733032, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19185029715299606, + "step": 7978 + }, + { + "epoch": 0.249375, + "grad_norm": 3.921875, + "grad_norm_var": 1.3178670247395834, + "learning_rate": 0.0001, + "loss": 6.3624, + "loss/crossentropy": 2.704474449157715, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20289774239063263, + "step": 7980 + }, + { + "epoch": 0.2494375, + "grad_norm": 3.203125, + "grad_norm_var": 1.3607584635416667, + "learning_rate": 0.0001, + "loss": 5.942, + "loss/crossentropy": 2.601536989212036, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17857421189546585, + "step": 7982 + }, + { + "epoch": 0.2495, + "grad_norm": 3.28125, + "grad_norm_var": 1.383137003580729, + "learning_rate": 0.0001, + "loss": 5.8629, + "loss/crossentropy": 2.492098808288574, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18082839250564575, + "step": 7984 + }, + { + "epoch": 0.2495625, + "grad_norm": 3.28125, + "grad_norm_var": 1.3790273030598958, + "learning_rate": 0.0001, + "loss": 5.709, + "loss/crossentropy": 2.4098531007766724, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17835478484630585, + "step": 7986 + }, + { + "epoch": 0.249625, + "grad_norm": 3.78125, + "grad_norm_var": 1.3669759114583333, + "learning_rate": 0.0001, + "loss": 6.3246, + "loss/crossentropy": 2.687322974205017, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.20435378700494766, + "step": 7988 + }, + { + "epoch": 0.2496875, + "grad_norm": 3.5625, + "grad_norm_var": 0.13305562337239582, + "learning_rate": 0.0001, + "loss": 6.3689, + "loss/crossentropy": 2.691186547279358, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.2083970531821251, + "step": 7990 + }, + { + "epoch": 0.24975, + "grad_norm": 3.578125, + "grad_norm_var": 0.09887593587239583, + "learning_rate": 0.0001, + "loss": 6.2701, + "loss/crossentropy": 2.7293598651885986, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19704613834619522, + "step": 7992 + }, + { + "epoch": 0.2498125, + "grad_norm": 3.5, + "grad_norm_var": 0.09706624348958333, + "learning_rate": 0.0001, + "loss": 5.952, + "loss/crossentropy": 2.6070988178253174, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.17863477021455765, + "step": 7994 + }, + { + "epoch": 0.249875, + "grad_norm": 3.375, + "grad_norm_var": 0.07883199055989583, + "learning_rate": 0.0001, + "loss": 5.6068, + "loss/crossentropy": 2.2992024421691895, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.17372917383909225, + "step": 7996 + }, + { + "epoch": 0.2499375, + "grad_norm": 3.578125, + "grad_norm_var": 0.07669270833333333, + "learning_rate": 0.0001, + "loss": 6.232, + "loss/crossentropy": 2.7200305461883545, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19260654598474503, + "step": 7998 + }, + { + "epoch": 0.25, + "grad_norm": 3.6875, + "grad_norm_var": 0.0800445556640625, + "learning_rate": 0.0001, + "loss": 6.0694, + "loss/crossentropy": 2.5788803100585938, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.1892903596162796, + "step": 8000 + }, + { + "epoch": 0.2500625, + "grad_norm": 3.640625, + "grad_norm_var": 0.07529195149739583, + "learning_rate": 0.0001, + "loss": 6.1261, + "loss/crossentropy": 2.6562572717666626, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.18760987371206284, + "step": 8002 + }, + { + "epoch": 0.250125, + "grad_norm": 3.328125, + "grad_norm_var": 0.0545806884765625, + "learning_rate": 0.0001, + "loss": 5.9428, + "loss/crossentropy": 2.517930746078491, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18740666657686234, + "step": 8004 + }, + { + "epoch": 0.2501875, + "grad_norm": 3.765625, + "grad_norm_var": 0.04888916015625, + "learning_rate": 0.0001, + "loss": 5.9677, + "loss/crossentropy": 2.528059959411621, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.1838068664073944, + "step": 8006 + }, + { + "epoch": 0.25025, + "grad_norm": 3.3125, + "grad_norm_var": 0.033055623372395836, + "learning_rate": 0.0001, + "loss": 6.0525, + "loss/crossentropy": 2.603206753730774, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19024566560983658, + "step": 8008 + }, + { + "epoch": 0.2503125, + "grad_norm": 3.484375, + "grad_norm_var": 0.0303375244140625, + "learning_rate": 0.0001, + "loss": 6.0339, + "loss/crossentropy": 2.5453351736068726, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.19494757801294327, + "step": 8010 + }, + { + "epoch": 0.250375, + "grad_norm": 3.421875, + "grad_norm_var": 0.03229166666666667, + "learning_rate": 0.0001, + "loss": 5.7867, + "loss/crossentropy": 2.459004521369934, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17808642238378525, + "step": 8012 + }, + { + "epoch": 0.2504375, + "grad_norm": 3.578125, + "grad_norm_var": 0.030403645833333333, + "learning_rate": 0.0001, + "loss": 5.9464, + "loss/crossentropy": 2.4782962799072266, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.18431393802165985, + "step": 8014 + }, + { + "epoch": 0.2505, + "grad_norm": 3.609375, + "grad_norm_var": 0.027360026041666666, + "learning_rate": 0.0001, + "loss": 6.2003, + "loss/crossentropy": 2.765717029571533, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1879941076040268, + "step": 8016 + }, + { + "epoch": 0.2505625, + "grad_norm": 3.25, + "grad_norm_var": 0.024344889322916667, + "learning_rate": 0.0001, + "loss": 6.0431, + "loss/crossentropy": 2.6034449338912964, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18967097252607346, + "step": 8018 + }, + { + "epoch": 0.250625, + "grad_norm": 3.375, + "grad_norm_var": 0.023661295572916668, + "learning_rate": 0.0001, + "loss": 5.9184, + "loss/crossentropy": 2.5038918256759644, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1848083809018135, + "step": 8020 + }, + { + "epoch": 0.2506875, + "grad_norm": 3.25, + "grad_norm_var": 0.015787760416666668, + "learning_rate": 0.0001, + "loss": 5.6977, + "loss/crossentropy": 2.339107394218445, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.17492011189460754, + "step": 8022 + }, + { + "epoch": 0.25075, + "grad_norm": 3.28125, + "grad_norm_var": 0.0177886962890625, + "learning_rate": 0.0001, + "loss": 5.8359, + "loss/crossentropy": 2.505526900291443, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.17522908002138138, + "step": 8024 + }, + { + "epoch": 0.2508125, + "grad_norm": 3.015625, + "grad_norm_var": 0.023828125, + "learning_rate": 0.0001, + "loss": 5.6513, + "loss/crossentropy": 2.377779960632324, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17188209295272827, + "step": 8026 + }, + { + "epoch": 0.250875, + "grad_norm": 3.53125, + "grad_norm_var": 0.025243123372395832, + "learning_rate": 0.0001, + "loss": 6.1452, + "loss/crossentropy": 2.6347309350967407, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1951827108860016, + "step": 8028 + }, + { + "epoch": 0.2509375, + "grad_norm": 3.40625, + "grad_norm_var": 0.021870930989583332, + "learning_rate": 0.0001, + "loss": 5.9928, + "loss/crossentropy": 2.6351139545440674, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18107682466506958, + "step": 8030 + }, + { + "epoch": 0.251, + "grad_norm": 3.484375, + "grad_norm_var": 0.02720947265625, + "learning_rate": 0.0001, + "loss": 6.2226, + "loss/crossentropy": 2.5201927423477173, + "loss/hidden": 1.67578125, + "loss/jsd": 0.0, + "loss/logits": 0.2026629075407982, + "step": 8032 + }, + { + "epoch": 0.2510625, + "grad_norm": 3.265625, + "grad_norm_var": 0.0265045166015625, + "learning_rate": 0.0001, + "loss": 6.0213, + "loss/crossentropy": 2.5705147981643677, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18687426298856735, + "step": 8034 + }, + { + "epoch": 0.251125, + "grad_norm": 3.34375, + "grad_norm_var": 0.029035441080729165, + "learning_rate": 0.0001, + "loss": 6.0548, + "loss/crossentropy": 2.6310404539108276, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18886138498783112, + "step": 8036 + }, + { + "epoch": 0.2511875, + "grad_norm": 3.484375, + "grad_norm_var": 0.029255167643229166, + "learning_rate": 0.0001, + "loss": 5.8158, + "loss/crossentropy": 2.3926368951797485, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18684671074151993, + "step": 8038 + }, + { + "epoch": 0.25125, + "grad_norm": 3.609375, + "grad_norm_var": 0.030908203125, + "learning_rate": 0.0001, + "loss": 6.2981, + "loss/crossentropy": 2.7984174489974976, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19215591996908188, + "step": 8040 + }, + { + "epoch": 0.2513125, + "grad_norm": 3.3125, + "grad_norm_var": 0.020865885416666667, + "learning_rate": 0.0001, + "loss": 5.9793, + "loss/crossentropy": 2.5516382455825806, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18690845370292664, + "step": 8042 + }, + { + "epoch": 0.251375, + "grad_norm": 3.40625, + "grad_norm_var": 0.030711873372395834, + "learning_rate": 0.0001, + "loss": 5.9134, + "loss/crossentropy": 2.5644371509552, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1806023046374321, + "step": 8044 + }, + { + "epoch": 0.2514375, + "grad_norm": 3.21875, + "grad_norm_var": 0.03463541666666667, + "learning_rate": 0.0001, + "loss": 6.101, + "loss/crossentropy": 2.622431755065918, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1927769035100937, + "step": 8046 + }, + { + "epoch": 0.2515, + "grad_norm": 3.5, + "grad_norm_var": 0.026590983072916668, + "learning_rate": 0.0001, + "loss": 6.0714, + "loss/crossentropy": 2.4856817722320557, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19880184531211853, + "step": 8048 + }, + { + "epoch": 0.2515625, + "grad_norm": 3.40625, + "grad_norm_var": 0.025764973958333333, + "learning_rate": 0.0001, + "loss": 5.9678, + "loss/crossentropy": 2.541865348815918, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18478521704673767, + "step": 8050 + }, + { + "epoch": 0.251625, + "grad_norm": 3.296875, + "grad_norm_var": 0.027757771809895835, + "learning_rate": 0.0001, + "loss": 5.6594, + "loss/crossentropy": 2.3731764554977417, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1751052439212799, + "step": 8052 + }, + { + "epoch": 0.2516875, + "grad_norm": 3.234375, + "grad_norm_var": 0.03154296875, + "learning_rate": 0.0001, + "loss": 6.2624, + "loss/crossentropy": 2.677454948425293, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19795053452253342, + "step": 8054 + }, + { + "epoch": 0.25175, + "grad_norm": 3.46875, + "grad_norm_var": 0.027692667643229165, + "learning_rate": 0.0001, + "loss": 6.2567, + "loss/crossentropy": 2.74212908744812, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1936415657401085, + "step": 8056 + }, + { + "epoch": 0.2518125, + "grad_norm": 3.421875, + "grad_norm_var": 0.0286773681640625, + "learning_rate": 0.0001, + "loss": 6.046, + "loss/crossentropy": 2.5982284545898438, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18891958892345428, + "step": 8058 + }, + { + "epoch": 0.251875, + "grad_norm": 3.25, + "grad_norm_var": 0.021480305989583334, + "learning_rate": 0.0001, + "loss": 6.0877, + "loss/crossentropy": 2.6000083684921265, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19095581769943237, + "step": 8060 + }, + { + "epoch": 0.2519375, + "grad_norm": 3.421875, + "grad_norm_var": 0.017625935872395835, + "learning_rate": 0.0001, + "loss": 5.7538, + "loss/crossentropy": 2.3270821571350098, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.18134818971157074, + "step": 8062 + }, + { + "epoch": 0.252, + "grad_norm": 3.703125, + "grad_norm_var": 0.02535400390625, + "learning_rate": 0.0001, + "loss": 6.3668, + "loss/crossentropy": 2.758496880531311, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.2030191347002983, + "step": 8064 + }, + { + "epoch": 0.2520625, + "grad_norm": 3.46875, + "grad_norm_var": 0.026707967122395832, + "learning_rate": 0.0001, + "loss": 6.171, + "loss/crossentropy": 2.5974239110946655, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1999397724866867, + "step": 8066 + }, + { + "epoch": 0.252125, + "grad_norm": 3.828125, + "grad_norm_var": 0.028804524739583334, + "learning_rate": 0.0001, + "loss": 6.304, + "loss/crossentropy": 2.7676830291748047, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19660098105669022, + "step": 8068 + }, + { + "epoch": 0.2521875, + "grad_norm": 3.9375, + "grad_norm_var": 0.04177144368489583, + "learning_rate": 0.0001, + "loss": 6.123, + "loss/crossentropy": 2.5446542501449585, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.19415876269340515, + "step": 8070 + }, + { + "epoch": 0.25225, + "grad_norm": 3.078125, + "grad_norm_var": 0.05226949055989583, + "learning_rate": 0.0001, + "loss": 5.8428, + "loss/crossentropy": 2.4725788831710815, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.17881622165441513, + "step": 8072 + }, + { + "epoch": 0.2523125, + "grad_norm": 3.171875, + "grad_norm_var": 0.059601847330729166, + "learning_rate": 0.0001, + "loss": 5.8556, + "loss/crossentropy": 2.4939451217651367, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18382671475410461, + "step": 8074 + }, + { + "epoch": 0.252375, + "grad_norm": 3.34375, + "grad_norm_var": 0.058283487955729164, + "learning_rate": 0.0001, + "loss": 6.2737, + "loss/crossentropy": 2.7139326333999634, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19855286180973053, + "step": 8076 + }, + { + "epoch": 0.2524375, + "grad_norm": 3.140625, + "grad_norm_var": 0.06663411458333333, + "learning_rate": 0.0001, + "loss": 5.8327, + "loss/crossentropy": 2.42052161693573, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.18262653052806854, + "step": 8078 + }, + { + "epoch": 0.2525, + "grad_norm": 3.6875, + "grad_norm_var": 0.0643463134765625, + "learning_rate": 0.0001, + "loss": 6.1268, + "loss/crossentropy": 2.6724756956100464, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18879260122776031, + "step": 8080 + }, + { + "epoch": 0.2525625, + "grad_norm": 3.328125, + "grad_norm_var": 0.06493733723958334, + "learning_rate": 0.0001, + "loss": 6.1529, + "loss/crossentropy": 2.6575675010681152, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19406761974096298, + "step": 8082 + }, + { + "epoch": 0.252625, + "grad_norm": 3.15625, + "grad_norm_var": 0.056982421875, + "learning_rate": 0.0001, + "loss": 6.2375, + "loss/crossentropy": 2.7514657974243164, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.19353008270263672, + "step": 8084 + }, + { + "epoch": 0.2526875, + "grad_norm": 3.515625, + "grad_norm_var": 0.0350738525390625, + "learning_rate": 0.0001, + "loss": 6.1594, + "loss/crossentropy": 2.6690210103988647, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19200769811868668, + "step": 8086 + }, + { + "epoch": 0.25275, + "grad_norm": 3.453125, + "grad_norm_var": 0.029832967122395835, + "learning_rate": 0.0001, + "loss": 5.8653, + "loss/crossentropy": 2.4479297399520874, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18587347120046616, + "step": 8088 + }, + { + "epoch": 0.2528125, + "grad_norm": 4.03125, + "grad_norm_var": 0.050414021809895834, + "learning_rate": 0.0001, + "loss": 6.3817, + "loss/crossentropy": 2.7035313844680786, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.20571190118789673, + "step": 8090 + }, + { + "epoch": 0.252875, + "grad_norm": 3.15625, + "grad_norm_var": 0.0542144775390625, + "learning_rate": 0.0001, + "loss": 6.0223, + "loss/crossentropy": 2.5742950439453125, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.18347414582967758, + "step": 8092 + }, + { + "epoch": 0.2529375, + "grad_norm": 2.96875, + "grad_norm_var": 0.06529032389322917, + "learning_rate": 0.0001, + "loss": 5.7376, + "loss/crossentropy": 2.3434239625930786, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18160177022218704, + "step": 8094 + }, + { + "epoch": 0.253, + "grad_norm": 4.125, + "grad_norm_var": 0.0930328369140625, + "learning_rate": 0.0001, + "loss": 6.1727, + "loss/crossentropy": 2.617506980895996, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.1957532986998558, + "step": 8096 + }, + { + "epoch": 0.2530625, + "grad_norm": 3.3125, + "grad_norm_var": 0.09023030598958333, + "learning_rate": 0.0001, + "loss": 5.7503, + "loss/crossentropy": 2.399343729019165, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17962299287319183, + "step": 8098 + }, + { + "epoch": 0.253125, + "grad_norm": 3.3125, + "grad_norm_var": 0.0849029541015625, + "learning_rate": 0.0001, + "loss": 6.1124, + "loss/crossentropy": 2.61958384513855, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1902993693947792, + "step": 8100 + }, + { + "epoch": 0.2531875, + "grad_norm": 3.296875, + "grad_norm_var": 0.08745829264322917, + "learning_rate": 0.0001, + "loss": 5.8804, + "loss/crossentropy": 2.463230609893799, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18859213590621948, + "step": 8102 + }, + { + "epoch": 0.25325, + "grad_norm": 3.421875, + "grad_norm_var": 0.09225972493489583, + "learning_rate": 0.0001, + "loss": 5.7748, + "loss/crossentropy": 2.4880319833755493, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17516177147626877, + "step": 8104 + }, + { + "epoch": 0.2533125, + "grad_norm": 3.421875, + "grad_norm_var": 0.08961181640625, + "learning_rate": 0.0001, + "loss": 6.4198, + "loss/crossentropy": 2.817080855369568, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.1997276097536087, + "step": 8106 + }, + { + "epoch": 0.253375, + "grad_norm": 3.578125, + "grad_norm_var": 0.0907623291015625, + "learning_rate": 0.0001, + "loss": 6.4113, + "loss/crossentropy": 2.7963486909866333, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.19743558764457703, + "step": 8108 + }, + { + "epoch": 0.2534375, + "grad_norm": 3.84375, + "grad_norm_var": 0.07965087890625, + "learning_rate": 0.0001, + "loss": 6.1079, + "loss/crossentropy": 2.680260419845581, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1888602152466774, + "step": 8110 + }, + { + "epoch": 0.2535, + "grad_norm": 3.203125, + "grad_norm_var": 0.05621337890625, + "learning_rate": 0.0001, + "loss": 6.0002, + "loss/crossentropy": 2.531370520591736, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1910264864563942, + "step": 8112 + }, + { + "epoch": 0.2535625, + "grad_norm": 3.640625, + "grad_norm_var": 0.05748291015625, + "learning_rate": 0.0001, + "loss": 6.0763, + "loss/crossentropy": 2.5369983911514282, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.19182298332452774, + "step": 8114 + }, + { + "epoch": 0.253625, + "grad_norm": 3.71875, + "grad_norm_var": 0.06272379557291667, + "learning_rate": 0.0001, + "loss": 5.6069, + "loss/crossentropy": 2.2425425052642822, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18252773582935333, + "step": 8116 + }, + { + "epoch": 0.2536875, + "grad_norm": 3.6875, + "grad_norm_var": 0.0618804931640625, + "learning_rate": 0.0001, + "loss": 6.3295, + "loss/crossentropy": 2.6767349243164062, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.20473266392946243, + "step": 8118 + }, + { + "epoch": 0.25375, + "grad_norm": 3.6875, + "grad_norm_var": 0.05396219889322917, + "learning_rate": 0.0001, + "loss": 6.0133, + "loss/crossentropy": 2.5724856853485107, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18705421686172485, + "step": 8120 + }, + { + "epoch": 0.2538125, + "grad_norm": 3.28125, + "grad_norm_var": 0.04700520833333333, + "learning_rate": 0.0001, + "loss": 5.8223, + "loss/crossentropy": 2.4687927961349487, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.17909876257181168, + "step": 8122 + }, + { + "epoch": 0.253875, + "grad_norm": 3.53125, + "grad_norm_var": 0.04430338541666667, + "learning_rate": 0.0001, + "loss": 5.9073, + "loss/crossentropy": 2.532727599143982, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1847231239080429, + "step": 8124 + }, + { + "epoch": 0.2539375, + "grad_norm": 3.578125, + "grad_norm_var": 0.044759114583333336, + "learning_rate": 0.0001, + "loss": 6.082, + "loss/crossentropy": 2.6451234817504883, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.18236370384693146, + "step": 8126 + }, + { + "epoch": 0.254, + "grad_norm": 3.5625, + "grad_norm_var": 0.04051005045572917, + "learning_rate": 0.0001, + "loss": 6.1677, + "loss/crossentropy": 2.591571807861328, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19628168642520905, + "step": 8128 + }, + { + "epoch": 0.2540625, + "grad_norm": 3.453125, + "grad_norm_var": 0.03752848307291667, + "learning_rate": 0.0001, + "loss": 6.2562, + "loss/crossentropy": 2.6917589902877808, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.199021115899086, + "step": 8130 + }, + { + "epoch": 0.254125, + "grad_norm": 3.453125, + "grad_norm_var": 0.032124837239583336, + "learning_rate": 0.0001, + "loss": 5.9864, + "loss/crossentropy": 2.5716795921325684, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18639299273490906, + "step": 8132 + }, + { + "epoch": 0.2541875, + "grad_norm": 3.671875, + "grad_norm_var": 0.029520670572916668, + "learning_rate": 0.0001, + "loss": 5.8128, + "loss/crossentropy": 2.4251999855041504, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.17977185547351837, + "step": 8134 + }, + { + "epoch": 0.25425, + "grad_norm": 3.46875, + "grad_norm_var": 0.02769775390625, + "learning_rate": 0.0001, + "loss": 5.9771, + "loss/crossentropy": 2.446664810180664, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19327668100595474, + "step": 8136 + }, + { + "epoch": 0.2543125, + "grad_norm": 3.3125, + "grad_norm_var": 0.03144124348958333, + "learning_rate": 0.0001, + "loss": 6.1304, + "loss/crossentropy": 2.6993672847747803, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18920022994279861, + "step": 8138 + }, + { + "epoch": 0.254375, + "grad_norm": 3.9375, + "grad_norm_var": 0.06031494140625, + "learning_rate": 0.0001, + "loss": 6.2245, + "loss/crossentropy": 2.6362926959991455, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19749093800783157, + "step": 8140 + }, + { + "epoch": 0.2544375, + "grad_norm": 4.0625, + "grad_norm_var": 0.06389872233072917, + "learning_rate": 0.0001, + "loss": 6.1924, + "loss/crossentropy": 2.6543272733688354, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19326184689998627, + "step": 8142 + }, + { + "epoch": 0.2545, + "grad_norm": 3.65625, + "grad_norm_var": 8.84273681640625, + "learning_rate": 0.0001, + "loss": 6.4958, + "loss/crossentropy": 2.622679352760315, + "loss/hidden": 1.69140625, + "loss/jsd": 0.0, + "loss/logits": 0.21816815435886383, + "step": 8144 + }, + { + "epoch": 0.2545625, + "grad_norm": 3.15625, + "grad_norm_var": 8.895556640625, + "learning_rate": 0.0001, + "loss": 5.8055, + "loss/crossentropy": 2.5427088737487793, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17549703270196915, + "step": 8146 + }, + { + "epoch": 0.254625, + "grad_norm": 3.1875, + "grad_norm_var": 8.953531901041666, + "learning_rate": 0.0001, + "loss": 5.8557, + "loss/crossentropy": 2.5127452611923218, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18038783222436905, + "step": 8148 + }, + { + "epoch": 0.2546875, + "grad_norm": 3.34375, + "grad_norm_var": 8.972587076822917, + "learning_rate": 0.0001, + "loss": 5.8703, + "loss/crossentropy": 2.4678895473480225, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1859426498413086, + "step": 8150 + }, + { + "epoch": 0.25475, + "grad_norm": 3.84375, + "grad_norm_var": 8.950553385416667, + "learning_rate": 0.0001, + "loss": 6.3494, + "loss/crossentropy": 2.7035274505615234, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.202867791056633, + "step": 8152 + }, + { + "epoch": 0.2548125, + "grad_norm": 3.671875, + "grad_norm_var": 8.850846354166666, + "learning_rate": 0.0001, + "loss": 6.0579, + "loss/crossentropy": 2.4627203941345215, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.20209801942110062, + "step": 8154 + }, + { + "epoch": 0.254875, + "grad_norm": 3.59375, + "grad_norm_var": 8.912621053059896, + "learning_rate": 0.0001, + "loss": 5.9049, + "loss/crossentropy": 2.4870606660842896, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18358328938484192, + "step": 8156 + }, + { + "epoch": 0.2549375, + "grad_norm": 3.75, + "grad_norm_var": 8.94732666015625, + "learning_rate": 0.0001, + "loss": 5.8499, + "loss/crossentropy": 2.37395977973938, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.18860754370689392, + "step": 8158 + }, + { + "epoch": 0.255, + "grad_norm": 3.546875, + "grad_norm_var": 0.04077860514322917, + "learning_rate": 0.0001, + "loss": 5.7366, + "loss/crossentropy": 2.353061556816101, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1805398240685463, + "step": 8160 + }, + { + "epoch": 0.2550625, + "grad_norm": 3.671875, + "grad_norm_var": 0.037760416666666664, + "learning_rate": 0.0001, + "loss": 6.0241, + "loss/crossentropy": 2.4962236881256104, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19302287697792053, + "step": 8162 + }, + { + "epoch": 0.255125, + "grad_norm": 3.609375, + "grad_norm_var": 0.031208292643229166, + "learning_rate": 0.0001, + "loss": 5.9913, + "loss/crossentropy": 2.4920365810394287, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19132770597934723, + "step": 8164 + }, + { + "epoch": 0.2551875, + "grad_norm": 3.671875, + "grad_norm_var": 0.029264322916666665, + "learning_rate": 0.0001, + "loss": 6.1592, + "loss/crossentropy": 2.6284236907958984, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19605131447315216, + "step": 8166 + }, + { + "epoch": 0.25525, + "grad_norm": 3.15625, + "grad_norm_var": 0.04185791015625, + "learning_rate": 0.0001, + "loss": 5.895, + "loss/crossentropy": 2.559125542640686, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17889637500047684, + "step": 8168 + }, + { + "epoch": 0.2553125, + "grad_norm": 3.359375, + "grad_norm_var": 0.03865458170572917, + "learning_rate": 0.0001, + "loss": 5.877, + "loss/crossentropy": 2.508115768432617, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1837659329175949, + "step": 8170 + }, + { + "epoch": 0.255375, + "grad_norm": 3.84375, + "grad_norm_var": 0.047749837239583336, + "learning_rate": 0.0001, + "loss": 6.2866, + "loss/crossentropy": 2.6549712419509888, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.20144866406917572, + "step": 8172 + }, + { + "epoch": 0.2554375, + "grad_norm": 3.4375, + "grad_norm_var": 0.042041015625, + "learning_rate": 0.0001, + "loss": 5.8874, + "loss/crossentropy": 2.5244874954223633, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18238522112369537, + "step": 8174 + }, + { + "epoch": 0.2555, + "grad_norm": 3.359375, + "grad_norm_var": 0.0411773681640625, + "learning_rate": 0.0001, + "loss": 6.2863, + "loss/crossentropy": 2.7426899671554565, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19459190964698792, + "step": 8176 + }, + { + "epoch": 0.2555625, + "grad_norm": 3.578125, + "grad_norm_var": 0.03765869140625, + "learning_rate": 0.0001, + "loss": 6.5763, + "loss/crossentropy": 2.919371247291565, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.2055361270904541, + "step": 8178 + }, + { + "epoch": 0.255625, + "grad_norm": 3.671875, + "grad_norm_var": 0.04023335774739583, + "learning_rate": 0.0001, + "loss": 5.7059, + "loss/crossentropy": 2.3707518577575684, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.17570346593856812, + "step": 8180 + }, + { + "epoch": 0.2556875, + "grad_norm": 3.3125, + "grad_norm_var": 0.0366119384765625, + "learning_rate": 0.0001, + "loss": 5.9558, + "loss/crossentropy": 2.569222569465637, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18749002367258072, + "step": 8182 + }, + { + "epoch": 0.25575, + "grad_norm": 3.546875, + "grad_norm_var": 0.029911295572916666, + "learning_rate": 0.0001, + "loss": 6.395, + "loss/crossentropy": 2.7556833028793335, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20065077394247055, + "step": 8184 + }, + { + "epoch": 0.2558125, + "grad_norm": 3.234375, + "grad_norm_var": 0.05172526041666667, + "learning_rate": 0.0001, + "loss": 6.2051, + "loss/crossentropy": 2.6984463930130005, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19480326026678085, + "step": 8186 + }, + { + "epoch": 0.255875, + "grad_norm": 3.1875, + "grad_norm_var": 0.048371378580729166, + "learning_rate": 0.0001, + "loss": 6.2935, + "loss/crossentropy": 2.731417417526245, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19526979327201843, + "step": 8188 + }, + { + "epoch": 0.2559375, + "grad_norm": 3.75, + "grad_norm_var": 0.0550445556640625, + "learning_rate": 0.0001, + "loss": 6.1322, + "loss/crossentropy": 2.5600863695144653, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.19354426860809326, + "step": 8190 + }, + { + "epoch": 0.256, + "grad_norm": 3.25, + "grad_norm_var": 0.06477762858072916, + "learning_rate": 0.0001, + "loss": 5.8101, + "loss/crossentropy": 2.5173254013061523, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1773252859711647, + "step": 8192 + }, + { + "epoch": 0.2560625, + "grad_norm": 3.515625, + "grad_norm_var": 0.0637847900390625, + "learning_rate": 0.0001, + "loss": 6.308, + "loss/crossentropy": 2.842012047767639, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18917309492826462, + "step": 8194 + }, + { + "epoch": 0.256125, + "grad_norm": 3.53125, + "grad_norm_var": 0.05677083333333333, + "learning_rate": 0.0001, + "loss": 6.0664, + "loss/crossentropy": 2.609717845916748, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1902015060186386, + "step": 8196 + }, + { + "epoch": 0.2561875, + "grad_norm": 3.6875, + "grad_norm_var": 0.057145182291666666, + "learning_rate": 0.0001, + "loss": 6.1134, + "loss/crossentropy": 2.550622820854187, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.1980718970298767, + "step": 8198 + }, + { + "epoch": 0.25625, + "grad_norm": 3.25, + "grad_norm_var": 0.0616363525390625, + "learning_rate": 0.0001, + "loss": 5.7814, + "loss/crossentropy": 2.3334431648254395, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.1811218336224556, + "step": 8200 + }, + { + "epoch": 0.2563125, + "grad_norm": 3.46875, + "grad_norm_var": 0.03873291015625, + "learning_rate": 0.0001, + "loss": 6.2308, + "loss/crossentropy": 2.691083073616028, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19733324646949768, + "step": 8202 + }, + { + "epoch": 0.256375, + "grad_norm": 3.421875, + "grad_norm_var": 0.0377593994140625, + "learning_rate": 0.0001, + "loss": 5.8558, + "loss/crossentropy": 2.4827888011932373, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18417687714099884, + "step": 8204 + }, + { + "epoch": 0.2564375, + "grad_norm": 3.640625, + "grad_norm_var": 0.03142903645833333, + "learning_rate": 0.0001, + "loss": 6.041, + "loss/crossentropy": 2.623255968093872, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18552595376968384, + "step": 8206 + }, + { + "epoch": 0.2565, + "grad_norm": 3.453125, + "grad_norm_var": 0.020524088541666666, + "learning_rate": 0.0001, + "loss": 6.0687, + "loss/crossentropy": 2.516445279121399, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19624106585979462, + "step": 8208 + }, + { + "epoch": 0.2565625, + "grad_norm": 3.546875, + "grad_norm_var": 0.020873006184895834, + "learning_rate": 0.0001, + "loss": 6.0083, + "loss/crossentropy": 2.499703049659729, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19187762588262558, + "step": 8210 + }, + { + "epoch": 0.256625, + "grad_norm": 3.21875, + "grad_norm_var": 0.02584228515625, + "learning_rate": 0.0001, + "loss": 5.8727, + "loss/crossentropy": 2.5297656059265137, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1792202666401863, + "step": 8212 + }, + { + "epoch": 0.2566875, + "grad_norm": 3.34375, + "grad_norm_var": 0.0221343994140625, + "learning_rate": 0.0001, + "loss": 5.938, + "loss/crossentropy": 2.5417758226394653, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1868831068277359, + "step": 8214 + }, + { + "epoch": 0.25675, + "grad_norm": 4.3125, + "grad_norm_var": 0.09054361979166667, + "learning_rate": 0.0001, + "loss": 5.9652, + "loss/crossentropy": 2.501801609992981, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.18736010044813156, + "step": 8216 + }, + { + "epoch": 0.2568125, + "grad_norm": 3.296875, + "grad_norm_var": 0.09519755045572917, + "learning_rate": 0.0001, + "loss": 6.1688, + "loss/crossentropy": 2.674699306488037, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1919931024312973, + "step": 8218 + }, + { + "epoch": 0.256875, + "grad_norm": 3.765625, + "grad_norm_var": 0.0919097900390625, + "learning_rate": 0.0001, + "loss": 6.0433, + "loss/crossentropy": 2.5629888772964478, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19022127240896225, + "step": 8220 + }, + { + "epoch": 0.2569375, + "grad_norm": 3.4375, + "grad_norm_var": 0.0944000244140625, + "learning_rate": 0.0001, + "loss": 6.1382, + "loss/crossentropy": 2.676474690437317, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18953000009059906, + "step": 8222 + }, + { + "epoch": 0.257, + "grad_norm": 3.453125, + "grad_norm_var": 0.0957427978515625, + "learning_rate": 0.0001, + "loss": 6.3055, + "loss/crossentropy": 2.6694082021713257, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.20383962988853455, + "step": 8224 + }, + { + "epoch": 0.2570625, + "grad_norm": 3.421875, + "grad_norm_var": 0.09700520833333333, + "learning_rate": 0.0001, + "loss": 6.3182, + "loss/crossentropy": 2.7566806077957153, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19755762815475464, + "step": 8226 + }, + { + "epoch": 0.257125, + "grad_norm": 3.59375, + "grad_norm_var": 0.0902984619140625, + "learning_rate": 0.0001, + "loss": 6.2823, + "loss/crossentropy": 2.7556703090667725, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.19016651809215546, + "step": 8228 + }, + { + "epoch": 0.2571875, + "grad_norm": 3.3125, + "grad_norm_var": 0.08502604166666666, + "learning_rate": 0.0001, + "loss": 5.947, + "loss/crossentropy": 2.4731982946395874, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.18878469616174698, + "step": 8230 + }, + { + "epoch": 0.25725, + "grad_norm": 3.28125, + "grad_norm_var": 0.0430084228515625, + "learning_rate": 0.0001, + "loss": 5.9901, + "loss/crossentropy": 2.4843109846115112, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.1869049295783043, + "step": 8232 + }, + { + "epoch": 0.2573125, + "grad_norm": 3.734375, + "grad_norm_var": 0.04150390625, + "learning_rate": 0.0001, + "loss": 6.1095, + "loss/crossentropy": 2.643258213996887, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1880340278148651, + "step": 8234 + }, + { + "epoch": 0.257375, + "grad_norm": 3.53125, + "grad_norm_var": 0.046875, + "learning_rate": 0.0001, + "loss": 6.121, + "loss/crossentropy": 2.6990444660186768, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18829254806041718, + "step": 8236 + }, + { + "epoch": 0.2574375, + "grad_norm": 3.296875, + "grad_norm_var": 0.04801025390625, + "learning_rate": 0.0001, + "loss": 6.135, + "loss/crossentropy": 2.7081637382507324, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18604490160942078, + "step": 8238 + }, + { + "epoch": 0.2575, + "grad_norm": 3.484375, + "grad_norm_var": 0.04849853515625, + "learning_rate": 0.0001, + "loss": 6.2395, + "loss/crossentropy": 2.634418249130249, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.19644414633512497, + "step": 8240 + }, + { + "epoch": 0.2575625, + "grad_norm": 3.59375, + "grad_norm_var": 0.0493560791015625, + "learning_rate": 0.0001, + "loss": 6.2743, + "loss/crossentropy": 2.7220499515533447, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19740818440914154, + "step": 8242 + }, + { + "epoch": 0.257625, + "grad_norm": 3.375, + "grad_norm_var": 0.04008687337239583, + "learning_rate": 0.0001, + "loss": 5.913, + "loss/crossentropy": 2.4557689428329468, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.18478545546531677, + "step": 8244 + }, + { + "epoch": 0.2576875, + "grad_norm": 3.96875, + "grad_norm_var": 0.0530181884765625, + "learning_rate": 0.0001, + "loss": 5.7819, + "loss/crossentropy": 2.339992046356201, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.1828642264008522, + "step": 8246 + }, + { + "epoch": 0.25775, + "grad_norm": 3.265625, + "grad_norm_var": 0.043375651041666664, + "learning_rate": 0.0001, + "loss": 5.7539, + "loss/crossentropy": 2.4117339849472046, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17874816805124283, + "step": 8248 + }, + { + "epoch": 0.2578125, + "grad_norm": 3.859375, + "grad_norm_var": 0.048273722330729164, + "learning_rate": 0.0001, + "loss": 6.4533, + "loss/crossentropy": 2.731229305267334, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.20892907679080963, + "step": 8250 + }, + { + "epoch": 0.257875, + "grad_norm": 3.140625, + "grad_norm_var": 0.04885152180989583, + "learning_rate": 0.0001, + "loss": 6.0605, + "loss/crossentropy": 2.6448343992233276, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18882764875888824, + "step": 8252 + }, + { + "epoch": 0.2579375, + "grad_norm": 3.671875, + "grad_norm_var": 0.053792317708333336, + "learning_rate": 0.0001, + "loss": 6.0896, + "loss/crossentropy": 2.6585968732833862, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18762676417827606, + "step": 8254 + }, + { + "epoch": 0.258, + "grad_norm": 3.453125, + "grad_norm_var": 0.056550089518229166, + "learning_rate": 0.0001, + "loss": 5.9864, + "loss/crossentropy": 2.599988579750061, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18277861922979355, + "step": 8256 + }, + { + "epoch": 0.2580625, + "grad_norm": 3.15625, + "grad_norm_var": 0.059911092122395836, + "learning_rate": 0.0001, + "loss": 6.1234, + "loss/crossentropy": 2.659175992012024, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1901729479432106, + "step": 8258 + }, + { + "epoch": 0.258125, + "grad_norm": 3.140625, + "grad_norm_var": 0.06591389973958334, + "learning_rate": 0.0001, + "loss": 5.8625, + "loss/crossentropy": 2.4908353090286255, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18092092871665955, + "step": 8260 + }, + { + "epoch": 0.2581875, + "grad_norm": 3.5, + "grad_norm_var": 0.04632161458333333, + "learning_rate": 0.0001, + "loss": 6.1232, + "loss/crossentropy": 2.56347119808197, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19855140149593353, + "step": 8262 + }, + { + "epoch": 0.25825, + "grad_norm": 3.75, + "grad_norm_var": 0.0597320556640625, + "learning_rate": 0.0001, + "loss": 6.3446, + "loss/crossentropy": 2.7170121669769287, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.19752085208892822, + "step": 8264 + }, + { + "epoch": 0.2583125, + "grad_norm": 3.71875, + "grad_norm_var": 0.0683258056640625, + "learning_rate": 0.0001, + "loss": 6.0703, + "loss/crossentropy": 2.4784377813339233, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.20058993995189667, + "step": 8266 + }, + { + "epoch": 0.258375, + "grad_norm": 3.765625, + "grad_norm_var": 0.06558837890625, + "learning_rate": 0.0001, + "loss": 6.2707, + "loss/crossentropy": 2.6991453170776367, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.1950492411851883, + "step": 8268 + }, + { + "epoch": 0.2584375, + "grad_norm": 3.15625, + "grad_norm_var": 0.068017578125, + "learning_rate": 0.0001, + "loss": 5.8645, + "loss/crossentropy": 2.5384750366210938, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1786932647228241, + "step": 8270 + }, + { + "epoch": 0.2585, + "grad_norm": 3.234375, + "grad_norm_var": 0.071875, + "learning_rate": 0.0001, + "loss": 5.983, + "loss/crossentropy": 2.5552178621292114, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18692362308502197, + "step": 8272 + }, + { + "epoch": 0.2585625, + "grad_norm": 3.453125, + "grad_norm_var": 0.07221577962239584, + "learning_rate": 0.0001, + "loss": 6.3375, + "loss/crossentropy": 2.727352499961853, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.19929222017526627, + "step": 8274 + }, + { + "epoch": 0.258625, + "grad_norm": 3.25, + "grad_norm_var": 0.06682535807291666, + "learning_rate": 0.0001, + "loss": 6.0953, + "loss/crossentropy": 2.6253907680511475, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.18722385168075562, + "step": 8276 + }, + { + "epoch": 0.2586875, + "grad_norm": 3.234375, + "grad_norm_var": 0.07351888020833333, + "learning_rate": 0.0001, + "loss": 6.2028, + "loss/crossentropy": 2.692871928215027, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19396165758371353, + "step": 8278 + }, + { + "epoch": 0.25875, + "grad_norm": 3.59375, + "grad_norm_var": 0.06799723307291666, + "learning_rate": 0.0001, + "loss": 6.2305, + "loss/crossentropy": 2.7118791341781616, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19639146327972412, + "step": 8280 + }, + { + "epoch": 0.2588125, + "grad_norm": 3.359375, + "grad_norm_var": 0.0605133056640625, + "learning_rate": 0.0001, + "loss": 5.5454, + "loss/crossentropy": 2.3294434547424316, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1684712991118431, + "step": 8282 + }, + { + "epoch": 0.258875, + "grad_norm": 3.25, + "grad_norm_var": 0.054442342122395834, + "learning_rate": 0.0001, + "loss": 5.8204, + "loss/crossentropy": 2.476076364517212, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1820843368768692, + "step": 8284 + }, + { + "epoch": 0.2589375, + "grad_norm": 3.875, + "grad_norm_var": 0.071826171875, + "learning_rate": 0.0001, + "loss": 6.3151, + "loss/crossentropy": 2.664148688316345, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.20064260065555573, + "step": 8286 + }, + { + "epoch": 0.259, + "grad_norm": 3.109375, + "grad_norm_var": 0.08062744140625, + "learning_rate": 0.0001, + "loss": 5.9039, + "loss/crossentropy": 2.5980091094970703, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17824992537498474, + "step": 8288 + }, + { + "epoch": 0.2590625, + "grad_norm": 3.3125, + "grad_norm_var": 0.08085835774739583, + "learning_rate": 0.0001, + "loss": 5.8166, + "loss/crossentropy": 2.4984443187713623, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17829637974500656, + "step": 8290 + }, + { + "epoch": 0.259125, + "grad_norm": 3.53125, + "grad_norm_var": 0.08088277180989584, + "learning_rate": 0.0001, + "loss": 6.2211, + "loss/crossentropy": 2.633698344230652, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.1970238834619522, + "step": 8292 + }, + { + "epoch": 0.2591875, + "grad_norm": 3.59375, + "grad_norm_var": 0.09090067545572916, + "learning_rate": 0.0001, + "loss": 6.4699, + "loss/crossentropy": 2.7759658098220825, + "loss/hidden": 1.65234375, + "loss/jsd": 0.0, + "loss/logits": 0.2041609138250351, + "step": 8294 + }, + { + "epoch": 0.25925, + "grad_norm": 3.71875, + "grad_norm_var": 0.09553629557291667, + "learning_rate": 0.0001, + "loss": 6.0969, + "loss/crossentropy": 2.5544604063034058, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19408855587244034, + "step": 8296 + }, + { + "epoch": 0.2593125, + "grad_norm": 3.359375, + "grad_norm_var": 0.08961588541666667, + "learning_rate": 0.0001, + "loss": 5.9955, + "loss/crossentropy": 2.543579578399658, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18816332519054413, + "step": 8298 + }, + { + "epoch": 0.259375, + "grad_norm": 3.578125, + "grad_norm_var": 0.08601888020833333, + "learning_rate": 0.0001, + "loss": 6.2063, + "loss/crossentropy": 2.6519927978515625, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.1952776163816452, + "step": 8300 + }, + { + "epoch": 0.2594375, + "grad_norm": 3.109375, + "grad_norm_var": 0.06621805826822917, + "learning_rate": 0.0001, + "loss": 5.6115, + "loss/crossentropy": 2.3252283334732056, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.16924922168254852, + "step": 8302 + }, + { + "epoch": 0.2595, + "grad_norm": 3.75, + "grad_norm_var": 0.05898335774739583, + "learning_rate": 0.0001, + "loss": 6.0397, + "loss/crossentropy": 2.5799871683120728, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18855176120996475, + "step": 8304 + }, + { + "epoch": 0.2595625, + "grad_norm": 3.5, + "grad_norm_var": 0.036844889322916664, + "learning_rate": 0.0001, + "loss": 5.945, + "loss/crossentropy": 2.486521363258362, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19115658104419708, + "step": 8306 + }, + { + "epoch": 0.259625, + "grad_norm": 3.4375, + "grad_norm_var": 0.03909505208333333, + "learning_rate": 0.0001, + "loss": 5.9135, + "loss/crossentropy": 2.5399292707443237, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1842341423034668, + "step": 8308 + }, + { + "epoch": 0.2596875, + "grad_norm": 3.28125, + "grad_norm_var": 0.028962198893229166, + "learning_rate": 0.0001, + "loss": 5.8404, + "loss/crossentropy": 2.4623615741729736, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1831134930253029, + "step": 8310 + }, + { + "epoch": 0.25975, + "grad_norm": 3.421875, + "grad_norm_var": 0.0230865478515625, + "learning_rate": 0.0001, + "loss": 6.1338, + "loss/crossentropy": 2.6255195140838623, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19340698421001434, + "step": 8312 + }, + { + "epoch": 0.2598125, + "grad_norm": 3.59375, + "grad_norm_var": 0.023681640625, + "learning_rate": 0.0001, + "loss": 6.0778, + "loss/crossentropy": 2.566820979118347, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.18976758420467377, + "step": 8314 + }, + { + "epoch": 0.259875, + "grad_norm": 3.40625, + "grad_norm_var": 0.022931925455729165, + "learning_rate": 0.0001, + "loss": 5.856, + "loss/crossentropy": 2.480563521385193, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18402428925037384, + "step": 8316 + }, + { + "epoch": 0.2599375, + "grad_norm": 3.265625, + "grad_norm_var": 0.02086181640625, + "learning_rate": 0.0001, + "loss": 6.0715, + "loss/crossentropy": 2.6022223234176636, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1906745731830597, + "step": 8318 + }, + { + "epoch": 0.26, + "grad_norm": 4.53125, + "grad_norm_var": 0.08355712890625, + "learning_rate": 0.0001, + "loss": 5.9294, + "loss/crossentropy": 2.46099591255188, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.18551427125930786, + "step": 8320 + }, + { + "epoch": 0.2600625, + "grad_norm": 3.625, + "grad_norm_var": 0.10201822916666667, + "learning_rate": 0.0001, + "loss": 6.2567, + "loss/crossentropy": 2.639232039451599, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.20159168541431427, + "step": 8322 + }, + { + "epoch": 0.260125, + "grad_norm": 3.453125, + "grad_norm_var": 0.09713134765625, + "learning_rate": 0.0001, + "loss": 6.1552, + "loss/crossentropy": 2.626811981201172, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.19072547554969788, + "step": 8324 + }, + { + "epoch": 0.2601875, + "grad_norm": 3.4375, + "grad_norm_var": 0.09269205729166667, + "learning_rate": 0.0001, + "loss": 6.1227, + "loss/crossentropy": 2.5807541608810425, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19754991680383682, + "step": 8326 + }, + { + "epoch": 0.26025, + "grad_norm": 3.3125, + "grad_norm_var": 0.099755859375, + "learning_rate": 0.0001, + "loss": 6.0594, + "loss/crossentropy": 2.5863711833953857, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1926172822713852, + "step": 8328 + }, + { + "epoch": 0.2603125, + "grad_norm": 3.953125, + "grad_norm_var": 0.10774637858072916, + "learning_rate": 0.0001, + "loss": 6.2198, + "loss/crossentropy": 2.6828631162643433, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19509944319725037, + "step": 8330 + }, + { + "epoch": 0.260375, + "grad_norm": 3.25, + "grad_norm_var": 0.11334228515625, + "learning_rate": 0.0001, + "loss": 5.6538, + "loss/crossentropy": 2.307045340538025, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.17764659970998764, + "step": 8332 + }, + { + "epoch": 0.2604375, + "grad_norm": 3.53125, + "grad_norm_var": 0.10630594889322917, + "learning_rate": 0.0001, + "loss": 6.075, + "loss/crossentropy": 2.556526780128479, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.19755026698112488, + "step": 8334 + }, + { + "epoch": 0.2605, + "grad_norm": 3.09375, + "grad_norm_var": 0.060358683268229164, + "learning_rate": 0.0001, + "loss": 5.6987, + "loss/crossentropy": 2.3782535791397095, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.17188557982444763, + "step": 8336 + }, + { + "epoch": 0.2605625, + "grad_norm": 3.546875, + "grad_norm_var": 0.079541015625, + "learning_rate": 0.0001, + "loss": 5.9846, + "loss/crossentropy": 2.5467023849487305, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1851910725235939, + "step": 8338 + }, + { + "epoch": 0.260625, + "grad_norm": 3.515625, + "grad_norm_var": 0.08170572916666667, + "learning_rate": 0.0001, + "loss": 6.1645, + "loss/crossentropy": 2.6724393367767334, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19217563420534134, + "step": 8340 + }, + { + "epoch": 0.2606875, + "grad_norm": 3.40625, + "grad_norm_var": 0.0935943603515625, + "learning_rate": 0.0001, + "loss": 5.9087, + "loss/crossentropy": 2.4909695386886597, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.1816122606396675, + "step": 8342 + }, + { + "epoch": 0.26075, + "grad_norm": 3.296875, + "grad_norm_var": 0.10429280598958333, + "learning_rate": 0.0001, + "loss": 5.9806, + "loss/crossentropy": 2.591573119163513, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18617041409015656, + "step": 8344 + }, + { + "epoch": 0.2608125, + "grad_norm": 3.390625, + "grad_norm_var": 0.09718424479166667, + "learning_rate": 0.0001, + "loss": 6.2064, + "loss/crossentropy": 2.6100746393203735, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.19791245460510254, + "step": 8346 + }, + { + "epoch": 0.260875, + "grad_norm": 3.421875, + "grad_norm_var": 0.09492085774739584, + "learning_rate": 0.0001, + "loss": 6.1266, + "loss/crossentropy": 2.613708972930908, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1946462318301201, + "step": 8348 + }, + { + "epoch": 0.2609375, + "grad_norm": 3.640625, + "grad_norm_var": 0.10991109212239583, + "learning_rate": 0.0001, + "loss": 5.5421, + "loss/crossentropy": 2.3060216903686523, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17361095547676086, + "step": 8350 + }, + { + "epoch": 0.261, + "grad_norm": 3.359375, + "grad_norm_var": 0.10115458170572916, + "learning_rate": 0.0001, + "loss": 5.9834, + "loss/crossentropy": 2.5177823305130005, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.189922496676445, + "step": 8352 + }, + { + "epoch": 0.2610625, + "grad_norm": 3.34375, + "grad_norm_var": 0.056494140625, + "learning_rate": 0.0001, + "loss": 6.0121, + "loss/crossentropy": 2.5313172340393066, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18987558782100677, + "step": 8354 + }, + { + "epoch": 0.261125, + "grad_norm": 3.375, + "grad_norm_var": 0.062939453125, + "learning_rate": 0.0001, + "loss": 6.2673, + "loss/crossentropy": 2.6749621629714966, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19947275519371033, + "step": 8356 + }, + { + "epoch": 0.2611875, + "grad_norm": 3.515625, + "grad_norm_var": 0.054833984375, + "learning_rate": 0.0001, + "loss": 5.7834, + "loss/crossentropy": 2.427981734275818, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.1761658638715744, + "step": 8358 + }, + { + "epoch": 0.26125, + "grad_norm": 3.640625, + "grad_norm_var": 0.04914449055989583, + "learning_rate": 0.0001, + "loss": 6.0362, + "loss/crossentropy": 2.5841875076293945, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1877766028046608, + "step": 8360 + }, + { + "epoch": 0.2613125, + "grad_norm": 3.328125, + "grad_norm_var": 0.04364827473958333, + "learning_rate": 0.0001, + "loss": 6.4257, + "loss/crossentropy": 2.832598328590393, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.2011021375656128, + "step": 8362 + }, + { + "epoch": 0.261375, + "grad_norm": 3.40625, + "grad_norm_var": 0.047379557291666666, + "learning_rate": 0.0001, + "loss": 6.0373, + "loss/crossentropy": 2.547619581222534, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.18919818103313446, + "step": 8364 + }, + { + "epoch": 0.2614375, + "grad_norm": 3.140625, + "grad_norm_var": 0.039567057291666666, + "learning_rate": 0.0001, + "loss": 5.865, + "loss/crossentropy": 2.556501626968384, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17577163130044937, + "step": 8366 + }, + { + "epoch": 0.2615, + "grad_norm": 3.171875, + "grad_norm_var": 0.04397379557291667, + "learning_rate": 0.0001, + "loss": 5.9708, + "loss/crossentropy": 2.5391035079956055, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18926027417182922, + "step": 8368 + }, + { + "epoch": 0.2615625, + "grad_norm": 3.15625, + "grad_norm_var": 0.04334309895833333, + "learning_rate": 0.0001, + "loss": 6.0405, + "loss/crossentropy": 2.5592786073684692, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19070356339216232, + "step": 8370 + }, + { + "epoch": 0.261625, + "grad_norm": 3.28125, + "grad_norm_var": 0.03657938639322917, + "learning_rate": 0.0001, + "loss": 5.7534, + "loss/crossentropy": 2.413809657096863, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18200141936540604, + "step": 8372 + }, + { + "epoch": 0.2616875, + "grad_norm": 3.328125, + "grad_norm_var": 0.03467508951822917, + "learning_rate": 0.0001, + "loss": 6.3139, + "loss/crossentropy": 2.706895589828491, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19976375997066498, + "step": 8374 + }, + { + "epoch": 0.26175, + "grad_norm": 3.5, + "grad_norm_var": 0.029474894205729168, + "learning_rate": 0.0001, + "loss": 5.9654, + "loss/crossentropy": 2.599493980407715, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18073485046625137, + "step": 8376 + }, + { + "epoch": 0.2618125, + "grad_norm": 3.234375, + "grad_norm_var": 0.0298004150390625, + "learning_rate": 0.0001, + "loss": 5.7526, + "loss/crossentropy": 2.4448423385620117, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.18077139556407928, + "step": 8378 + }, + { + "epoch": 0.261875, + "grad_norm": 3.25, + "grad_norm_var": 0.0318023681640625, + "learning_rate": 0.0001, + "loss": 6.0073, + "loss/crossentropy": 2.585577368736267, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1867080181837082, + "step": 8380 + }, + { + "epoch": 0.2619375, + "grad_norm": 3.1875, + "grad_norm_var": 0.03072509765625, + "learning_rate": 0.0001, + "loss": 5.9465, + "loss/crossentropy": 2.610453486442566, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18126288056373596, + "step": 8382 + }, + { + "epoch": 0.262, + "grad_norm": 3.375, + "grad_norm_var": 0.0347808837890625, + "learning_rate": 0.0001, + "loss": 6.1832, + "loss/crossentropy": 2.772537350654602, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18403807282447815, + "step": 8384 + }, + { + "epoch": 0.2620625, + "grad_norm": 3.1875, + "grad_norm_var": 0.033056640625, + "learning_rate": 0.0001, + "loss": 6.0056, + "loss/crossentropy": 2.648902654647827, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18293480575084686, + "step": 8386 + }, + { + "epoch": 0.262125, + "grad_norm": 3.53125, + "grad_norm_var": 0.0350982666015625, + "learning_rate": 0.0001, + "loss": 5.7087, + "loss/crossentropy": 2.3789840936660767, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1751616671681404, + "step": 8388 + }, + { + "epoch": 0.2621875, + "grad_norm": 3.4375, + "grad_norm_var": 0.030304972330729166, + "learning_rate": 0.0001, + "loss": 5.8585, + "loss/crossentropy": 2.4742425680160522, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18257129937410355, + "step": 8390 + }, + { + "epoch": 0.26225, + "grad_norm": 3.34375, + "grad_norm_var": 0.028392537434895834, + "learning_rate": 0.0001, + "loss": 5.8659, + "loss/crossentropy": 2.4556645154953003, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18360427021980286, + "step": 8392 + }, + { + "epoch": 0.2623125, + "grad_norm": 3.1875, + "grad_norm_var": 0.0290924072265625, + "learning_rate": 0.0001, + "loss": 6.0201, + "loss/crossentropy": 2.5338116884231567, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19159843772649765, + "step": 8394 + }, + { + "epoch": 0.262375, + "grad_norm": 3.609375, + "grad_norm_var": 0.025764973958333333, + "learning_rate": 0.0001, + "loss": 6.0123, + "loss/crossentropy": 2.4817099571228027, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19290197640657425, + "step": 8396 + }, + { + "epoch": 0.2624375, + "grad_norm": 3.515625, + "grad_norm_var": 0.02603759765625, + "learning_rate": 0.0001, + "loss": 6.006, + "loss/crossentropy": 2.598557710647583, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1852778196334839, + "step": 8398 + }, + { + "epoch": 0.2625, + "grad_norm": 3.515625, + "grad_norm_var": 0.018619791666666666, + "learning_rate": 0.0001, + "loss": 6.082, + "loss/crossentropy": 2.567653179168701, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.18932054936885834, + "step": 8400 + }, + { + "epoch": 0.2625625, + "grad_norm": 3.296875, + "grad_norm_var": 0.018773396809895832, + "learning_rate": 0.0001, + "loss": 5.8867, + "loss/crossentropy": 2.5691102743148804, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1817583665251732, + "step": 8402 + }, + { + "epoch": 0.262625, + "grad_norm": 3.453125, + "grad_norm_var": 0.0141998291015625, + "learning_rate": 0.0001, + "loss": 6.1406, + "loss/crossentropy": 2.613145351409912, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19610608369112015, + "step": 8404 + }, + { + "epoch": 0.2626875, + "grad_norm": 3.359375, + "grad_norm_var": 0.020018513997395834, + "learning_rate": 0.0001, + "loss": 6.4031, + "loss/crossentropy": 2.8511613607406616, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.20050807297229767, + "step": 8406 + }, + { + "epoch": 0.26275, + "grad_norm": 3.46875, + "grad_norm_var": 0.027904256184895834, + "learning_rate": 0.0001, + "loss": 5.8355, + "loss/crossentropy": 2.463037610054016, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1782667636871338, + "step": 8408 + }, + { + "epoch": 0.2628125, + "grad_norm": 3.25, + "grad_norm_var": 0.027958170572916666, + "learning_rate": 0.0001, + "loss": 6.201, + "loss/crossentropy": 2.721705675125122, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19090045243501663, + "step": 8410 + }, + { + "epoch": 0.262875, + "grad_norm": 3.28125, + "grad_norm_var": 0.031647745768229166, + "learning_rate": 0.0001, + "loss": 5.5934, + "loss/crossentropy": 2.385154128074646, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17199848592281342, + "step": 8412 + }, + { + "epoch": 0.2629375, + "grad_norm": 3.671875, + "grad_norm_var": 0.03550516764322917, + "learning_rate": 0.0001, + "loss": 6.2027, + "loss/crossentropy": 2.6594830751419067, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1953330710530281, + "step": 8414 + }, + { + "epoch": 0.263, + "grad_norm": 3.03125, + "grad_norm_var": 0.04192301432291667, + "learning_rate": 0.0001, + "loss": 5.7108, + "loss/crossentropy": 2.443622589111328, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17828471958637238, + "step": 8416 + }, + { + "epoch": 0.2630625, + "grad_norm": 3.734375, + "grad_norm_var": 0.0504302978515625, + "learning_rate": 0.0001, + "loss": 6.1936, + "loss/crossentropy": 2.6475489139556885, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19522744417190552, + "step": 8418 + }, + { + "epoch": 0.263125, + "grad_norm": 3.5625, + "grad_norm_var": 0.0546295166015625, + "learning_rate": 0.0001, + "loss": 5.9959, + "loss/crossentropy": 2.6391217708587646, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18294747918844223, + "step": 8420 + }, + { + "epoch": 0.2631875, + "grad_norm": 3.21875, + "grad_norm_var": 0.0515533447265625, + "learning_rate": 0.0001, + "loss": 6.053, + "loss/crossentropy": 2.62371289730072, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18785394728183746, + "step": 8422 + }, + { + "epoch": 0.26325, + "grad_norm": 3.28125, + "grad_norm_var": 0.04903971354166667, + "learning_rate": 0.0001, + "loss": 6.0535, + "loss/crossentropy": 2.6555440425872803, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18471328914165497, + "step": 8424 + }, + { + "epoch": 0.2633125, + "grad_norm": 3.171875, + "grad_norm_var": 0.04951070149739583, + "learning_rate": 0.0001, + "loss": 5.9066, + "loss/crossentropy": 2.5810846090316772, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1805986762046814, + "step": 8426 + }, + { + "epoch": 0.263375, + "grad_norm": 3.0625, + "grad_norm_var": 0.0529296875, + "learning_rate": 0.0001, + "loss": 5.9898, + "loss/crossentropy": 2.6199209690093994, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.17878857254981995, + "step": 8428 + }, + { + "epoch": 0.2634375, + "grad_norm": 4.1875, + "grad_norm_var": 0.08944905598958333, + "learning_rate": 0.0001, + "loss": 6.1658, + "loss/crossentropy": 2.6054818630218506, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19900210201740265, + "step": 8430 + }, + { + "epoch": 0.2635, + "grad_norm": 3.140625, + "grad_norm_var": 0.1032135009765625, + "learning_rate": 0.0001, + "loss": 6.1958, + "loss/crossentropy": 2.714025855064392, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19154001772403717, + "step": 8432 + }, + { + "epoch": 0.2635625, + "grad_norm": 3.46875, + "grad_norm_var": 0.18465169270833334, + "learning_rate": 0.0001, + "loss": 6.0181, + "loss/crossentropy": 2.5759806632995605, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18757275491952896, + "step": 8434 + }, + { + "epoch": 0.263625, + "grad_norm": 3.40625, + "grad_norm_var": 0.18899332682291667, + "learning_rate": 0.0001, + "loss": 5.8798, + "loss/crossentropy": 2.510910749435425, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18219773471355438, + "step": 8436 + }, + { + "epoch": 0.2636875, + "grad_norm": 3.390625, + "grad_norm_var": 0.18684794108072916, + "learning_rate": 0.0001, + "loss": 5.8096, + "loss/crossentropy": 2.4002357721328735, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18468758463859558, + "step": 8438 + }, + { + "epoch": 0.26375, + "grad_norm": 3.21875, + "grad_norm_var": 0.18179931640625, + "learning_rate": 0.0001, + "loss": 5.8787, + "loss/crossentropy": 2.54973042011261, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17860250920057297, + "step": 8440 + }, + { + "epoch": 0.2638125, + "grad_norm": 3.25, + "grad_norm_var": 0.17497456868489583, + "learning_rate": 0.0001, + "loss": 5.9215, + "loss/crossentropy": 2.4767297506332397, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1886201873421669, + "step": 8442 + }, + { + "epoch": 0.263875, + "grad_norm": 3.453125, + "grad_norm_var": 0.3054270426432292, + "learning_rate": 0.0001, + "loss": 6.3098, + "loss/crossentropy": 2.6348717212677, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.20577538013458252, + "step": 8444 + }, + { + "epoch": 0.2639375, + "grad_norm": 3.453125, + "grad_norm_var": 0.28345947265625, + "learning_rate": 0.0001, + "loss": 6.1107, + "loss/crossentropy": 2.671947479248047, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1891855150461197, + "step": 8446 + }, + { + "epoch": 0.264, + "grad_norm": 3.5, + "grad_norm_var": 0.2637603759765625, + "learning_rate": 0.0001, + "loss": 5.9479, + "loss/crossentropy": 2.5242542028427124, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1892443746328354, + "step": 8448 + }, + { + "epoch": 0.2640625, + "grad_norm": 3.296875, + "grad_norm_var": 0.19389546712239583, + "learning_rate": 0.0001, + "loss": 5.9163, + "loss/crossentropy": 2.5203553438186646, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18491075187921524, + "step": 8450 + }, + { + "epoch": 0.264125, + "grad_norm": 3.40625, + "grad_norm_var": 0.19278055826822918, + "learning_rate": 0.0001, + "loss": 5.8552, + "loss/crossentropy": 2.513572573661804, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18298938870429993, + "step": 8452 + }, + { + "epoch": 0.2641875, + "grad_norm": 3.5, + "grad_norm_var": 0.19021708170572918, + "learning_rate": 0.0001, + "loss": 5.97, + "loss/crossentropy": 2.4746936559677124, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1944495290517807, + "step": 8454 + }, + { + "epoch": 0.26425, + "grad_norm": 3.78125, + "grad_norm_var": 0.18772684733072917, + "learning_rate": 0.0001, + "loss": 6.2641, + "loss/crossentropy": 2.6473084688186646, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.2023027092218399, + "step": 8456 + }, + { + "epoch": 0.2643125, + "grad_norm": 3.234375, + "grad_norm_var": 0.18791910807291667, + "learning_rate": 0.0001, + "loss": 5.9522, + "loss/crossentropy": 2.6063324213027954, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18067849427461624, + "step": 8458 + }, + { + "epoch": 0.264375, + "grad_norm": 3.46875, + "grad_norm_var": 0.02222900390625, + "learning_rate": 0.0001, + "loss": 5.8842, + "loss/crossentropy": 2.526416301727295, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18186820298433304, + "step": 8460 + }, + { + "epoch": 0.2644375, + "grad_norm": 3.59375, + "grad_norm_var": 0.0293609619140625, + "learning_rate": 0.0001, + "loss": 5.7632, + "loss/crossentropy": 2.3774088621139526, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1862344667315483, + "step": 8462 + }, + { + "epoch": 0.2645, + "grad_norm": 3.28125, + "grad_norm_var": 0.03349609375, + "learning_rate": 0.0001, + "loss": 6.1811, + "loss/crossentropy": 2.741239309310913, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.18265872448682785, + "step": 8464 + }, + { + "epoch": 0.2645625, + "grad_norm": 3.609375, + "grad_norm_var": 0.035456339518229164, + "learning_rate": 0.0001, + "loss": 5.8993, + "loss/crossentropy": 2.5158239603042603, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18209965527057648, + "step": 8466 + }, + { + "epoch": 0.264625, + "grad_norm": 3.09375, + "grad_norm_var": 0.040257771809895836, + "learning_rate": 0.0001, + "loss": 5.747, + "loss/crossentropy": 2.390213966369629, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1805977299809456, + "step": 8468 + }, + { + "epoch": 0.2646875, + "grad_norm": 3.484375, + "grad_norm_var": 0.0401031494140625, + "learning_rate": 0.0001, + "loss": 6.2295, + "loss/crossentropy": 2.6879630088806152, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19361046701669693, + "step": 8470 + }, + { + "epoch": 0.26475, + "grad_norm": 3.328125, + "grad_norm_var": 0.030631510416666667, + "learning_rate": 0.0001, + "loss": 6.0514, + "loss/crossentropy": 2.625853419303894, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18865232914686203, + "step": 8472 + }, + { + "epoch": 0.2648125, + "grad_norm": 3.515625, + "grad_norm_var": 0.03129781087239583, + "learning_rate": 0.0001, + "loss": 6.0974, + "loss/crossentropy": 2.589285969734192, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19416602700948715, + "step": 8474 + }, + { + "epoch": 0.264875, + "grad_norm": 3.296875, + "grad_norm_var": 0.029996744791666665, + "learning_rate": 0.0001, + "loss": 5.8653, + "loss/crossentropy": 2.503763794898987, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1783452183008194, + "step": 8476 + }, + { + "epoch": 0.2649375, + "grad_norm": 3.4375, + "grad_norm_var": 0.0326171875, + "learning_rate": 0.0001, + "loss": 6.2003, + "loss/crossentropy": 2.6542402505874634, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19601517915725708, + "step": 8478 + }, + { + "epoch": 0.265, + "grad_norm": 3.734375, + "grad_norm_var": 0.048094685872395834, + "learning_rate": 0.0001, + "loss": 5.7677, + "loss/crossentropy": 2.405746579170227, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18384867906570435, + "step": 8480 + }, + { + "epoch": 0.2650625, + "grad_norm": 3.40625, + "grad_norm_var": 0.04792378743489583, + "learning_rate": 0.0001, + "loss": 6.0084, + "loss/crossentropy": 2.685820698738098, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1779606118798256, + "step": 8482 + }, + { + "epoch": 0.265125, + "grad_norm": 9.75, + "grad_norm_var": 2.5621327718098956, + "learning_rate": 0.0001, + "loss": 6.2049, + "loss/crossentropy": 2.587186574935913, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.20474126935005188, + "step": 8484 + }, + { + "epoch": 0.2651875, + "grad_norm": 3.734375, + "grad_norm_var": 2.559129842122396, + "learning_rate": 0.0001, + "loss": 5.8343, + "loss/crossentropy": 2.442618250846863, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1833084225654602, + "step": 8486 + }, + { + "epoch": 0.26525, + "grad_norm": 3.25, + "grad_norm_var": 2.552848307291667, + "learning_rate": 0.0001, + "loss": 6.4387, + "loss/crossentropy": 2.8887436389923096, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.19327892363071442, + "step": 8488 + }, + { + "epoch": 0.2653125, + "grad_norm": 3.296875, + "grad_norm_var": 2.5823964436848956, + "learning_rate": 0.0001, + "loss": 5.7949, + "loss/crossentropy": 2.453776001930237, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1833271160721779, + "step": 8490 + }, + { + "epoch": 0.265375, + "grad_norm": 3.296875, + "grad_norm_var": 2.581656901041667, + "learning_rate": 0.0001, + "loss": 6.0074, + "loss/crossentropy": 2.468347191810608, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.18945540487766266, + "step": 8492 + }, + { + "epoch": 0.2654375, + "grad_norm": 3.609375, + "grad_norm_var": 2.5796712239583335, + "learning_rate": 0.0001, + "loss": 6.1462, + "loss/crossentropy": 2.676905393600464, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19067463278770447, + "step": 8494 + }, + { + "epoch": 0.2655, + "grad_norm": 3.65625, + "grad_norm_var": 2.5346832275390625, + "learning_rate": 0.0001, + "loss": 6.2797, + "loss/crossentropy": 2.7150908708572388, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1978655308485031, + "step": 8496 + }, + { + "epoch": 0.2655625, + "grad_norm": 3.671875, + "grad_norm_var": 2.5151041666666667, + "learning_rate": 0.0001, + "loss": 6.0652, + "loss/crossentropy": 2.5959692001342773, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18989138305187225, + "step": 8498 + }, + { + "epoch": 0.265625, + "grad_norm": 3.734375, + "grad_norm_var": 0.04973551432291667, + "learning_rate": 0.0001, + "loss": 6.1024, + "loss/crossentropy": 2.5476499795913696, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19609753042459488, + "step": 8500 + }, + { + "epoch": 0.2656875, + "grad_norm": 4.125, + "grad_norm_var": 0.07014058430989584, + "learning_rate": 0.0001, + "loss": 6.267, + "loss/crossentropy": 2.6953723430633545, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19583825767040253, + "step": 8502 + }, + { + "epoch": 0.26575, + "grad_norm": 3.125, + "grad_norm_var": 0.06415913899739584, + "learning_rate": 0.0001, + "loss": 5.935, + "loss/crossentropy": 2.5268644094467163, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1853414699435234, + "step": 8504 + }, + { + "epoch": 0.2658125, + "grad_norm": 4.875, + "grad_norm_var": 0.16070048014322916, + "learning_rate": 0.0001, + "loss": 6.208, + "loss/crossentropy": 2.6290266513824463, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.20164984464645386, + "step": 8506 + }, + { + "epoch": 0.265875, + "grad_norm": 3.53125, + "grad_norm_var": 0.1534820556640625, + "learning_rate": 0.0001, + "loss": 6.1913, + "loss/crossentropy": 2.6603872776031494, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19371574372053146, + "step": 8508 + }, + { + "epoch": 0.2659375, + "grad_norm": 3.78125, + "grad_norm_var": 0.6705963134765625, + "learning_rate": 0.0001, + "loss": 6.1024, + "loss/crossentropy": 2.6138943433761597, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1918160319328308, + "step": 8510 + }, + { + "epoch": 0.266, + "grad_norm": 3.171875, + "grad_norm_var": 0.7031534830729167, + "learning_rate": 0.0001, + "loss": 6.0797, + "loss/crossentropy": 2.6256297826766968, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1915031224489212, + "step": 8512 + }, + { + "epoch": 0.2660625, + "grad_norm": 3.4375, + "grad_norm_var": 0.7205729166666667, + "learning_rate": 0.0001, + "loss": 6.1179, + "loss/crossentropy": 2.6821320056915283, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18693740665912628, + "step": 8514 + }, + { + "epoch": 0.266125, + "grad_norm": 3.5, + "grad_norm_var": 0.7332102457682291, + "learning_rate": 0.0001, + "loss": 5.5921, + "loss/crossentropy": 2.170780062675476, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.17963345348834991, + "step": 8516 + }, + { + "epoch": 0.2661875, + "grad_norm": 3.71875, + "grad_norm_var": 0.7375396728515625, + "learning_rate": 0.0001, + "loss": 6.2086, + "loss/crossentropy": 2.6892930269241333, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19255327433347702, + "step": 8518 + }, + { + "epoch": 0.26625, + "grad_norm": 3.390625, + "grad_norm_var": 0.7311808268229166, + "learning_rate": 0.0001, + "loss": 5.8271, + "loss/crossentropy": 2.4138920307159424, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1846773475408554, + "step": 8520 + }, + { + "epoch": 0.2663125, + "grad_norm": 3.5625, + "grad_norm_var": 0.6611979166666667, + "learning_rate": 0.0001, + "loss": 5.7981, + "loss/crossentropy": 2.438520908355713, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18283716589212418, + "step": 8522 + }, + { + "epoch": 0.266375, + "grad_norm": 3.421875, + "grad_norm_var": 0.6628743489583333, + "learning_rate": 0.0001, + "loss": 5.8994, + "loss/crossentropy": 2.4681202173233032, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18649118393659592, + "step": 8524 + }, + { + "epoch": 0.2664375, + "grad_norm": 3.765625, + "grad_norm_var": 0.07416890462239584, + "learning_rate": 0.0001, + "loss": 6.0627, + "loss/crossentropy": 2.5536561012268066, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.19855762273073196, + "step": 8526 + }, + { + "epoch": 0.2665, + "grad_norm": 3.734375, + "grad_norm_var": 0.07022196451822917, + "learning_rate": 0.0001, + "loss": 5.6656, + "loss/crossentropy": 2.3667540550231934, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1755838543176651, + "step": 8528 + }, + { + "epoch": 0.2665625, + "grad_norm": 3.359375, + "grad_norm_var": 0.06329752604166666, + "learning_rate": 0.0001, + "loss": 6.0194, + "loss/crossentropy": 2.491975426673889, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19297584146261215, + "step": 8530 + }, + { + "epoch": 0.266625, + "grad_norm": 3.625, + "grad_norm_var": 0.08082682291666667, + "learning_rate": 0.0001, + "loss": 5.9617, + "loss/crossentropy": 2.532854199409485, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18819521367549896, + "step": 8532 + }, + { + "epoch": 0.2666875, + "grad_norm": 3.515625, + "grad_norm_var": 0.07795817057291667, + "learning_rate": 0.0001, + "loss": 6.2831, + "loss/crossentropy": 2.7350999116897583, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19385851174592972, + "step": 8534 + }, + { + "epoch": 0.26675, + "grad_norm": 3.9375, + "grad_norm_var": 0.6849273681640625, + "learning_rate": 0.0001, + "loss": 5.9346, + "loss/crossentropy": 2.4325190782546997, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.1838061511516571, + "step": 8536 + }, + { + "epoch": 0.2668125, + "grad_norm": 3.5625, + "grad_norm_var": 0.6939849853515625, + "learning_rate": 0.0001, + "loss": 5.8413, + "loss/crossentropy": 2.4846941232681274, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.17862877994775772, + "step": 8538 + }, + { + "epoch": 0.266875, + "grad_norm": 3.875, + "grad_norm_var": 0.68277587890625, + "learning_rate": 0.0001, + "loss": 6.2013, + "loss/crossentropy": 2.639085292816162, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19528042525053024, + "step": 8540 + }, + { + "epoch": 0.2669375, + "grad_norm": 3.375, + "grad_norm_var": 0.7116282145182292, + "learning_rate": 0.0001, + "loss": 6.0734, + "loss/crossentropy": 2.645695686340332, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18847399204969406, + "step": 8542 + }, + { + "epoch": 0.267, + "grad_norm": 3.34375, + "grad_norm_var": 0.7354237874348958, + "learning_rate": 0.0001, + "loss": 6.0328, + "loss/crossentropy": 2.622828960418701, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1851336508989334, + "step": 8544 + }, + { + "epoch": 0.2670625, + "grad_norm": 3.578125, + "grad_norm_var": 0.7755849202473958, + "learning_rate": 0.0001, + "loss": 6.0321, + "loss/crossentropy": 2.5674968957901, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19177460670471191, + "step": 8546 + }, + { + "epoch": 0.267125, + "grad_norm": 3.421875, + "grad_norm_var": 0.7624094645182292, + "learning_rate": 0.0001, + "loss": 6.1345, + "loss/crossentropy": 2.6102665662765503, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19578158855438232, + "step": 8548 + }, + { + "epoch": 0.2671875, + "grad_norm": 3.875, + "grad_norm_var": 0.7626291910807291, + "learning_rate": 0.0001, + "loss": 5.8333, + "loss/crossentropy": 2.478430151939392, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.17806804925203323, + "step": 8550 + }, + { + "epoch": 0.26725, + "grad_norm": 3.421875, + "grad_norm_var": 0.13746337890625, + "learning_rate": 0.0001, + "loss": 5.9301, + "loss/crossentropy": 2.470288872718811, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.18699821829795837, + "step": 8552 + }, + { + "epoch": 0.2673125, + "grad_norm": 3.359375, + "grad_norm_var": 0.13751627604166666, + "learning_rate": 0.0001, + "loss": 5.7851, + "loss/crossentropy": 2.3833465576171875, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.18158571422100067, + "step": 8554 + }, + { + "epoch": 0.267375, + "grad_norm": 3.5, + "grad_norm_var": 0.13147684733072917, + "learning_rate": 0.0001, + "loss": 5.8517, + "loss/crossentropy": 2.461923360824585, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18311707675457, + "step": 8556 + }, + { + "epoch": 0.2674375, + "grad_norm": 3.203125, + "grad_norm_var": 0.13958333333333334, + "learning_rate": 0.0001, + "loss": 5.9528, + "loss/crossentropy": 2.553870439529419, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18715743720531464, + "step": 8558 + }, + { + "epoch": 0.2675, + "grad_norm": 3.515625, + "grad_norm_var": 0.13875223795572916, + "learning_rate": 0.0001, + "loss": 5.6426, + "loss/crossentropy": 2.277732491493225, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.17672587931156158, + "step": 8560 + }, + { + "epoch": 0.2675625, + "grad_norm": 3.125, + "grad_norm_var": 0.0509674072265625, + "learning_rate": 0.0001, + "loss": 6.0759, + "loss/crossentropy": 2.677735924720764, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18357118964195251, + "step": 8562 + }, + { + "epoch": 0.267625, + "grad_norm": 3.34375, + "grad_norm_var": 0.24551493326822918, + "learning_rate": 0.0001, + "loss": 6.1003, + "loss/crossentropy": 2.658802032470703, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1875048652291298, + "step": 8564 + }, + { + "epoch": 0.2676875, + "grad_norm": 3.3125, + "grad_norm_var": 0.23691304524739584, + "learning_rate": 0.0001, + "loss": 6.104, + "loss/crossentropy": 2.683103084564209, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18818234652280807, + "step": 8566 + }, + { + "epoch": 0.26775, + "grad_norm": 4.0625, + "grad_norm_var": 0.260107421875, + "learning_rate": 0.0001, + "loss": 5.8088, + "loss/crossentropy": 2.433120369911194, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18366563320159912, + "step": 8568 + }, + { + "epoch": 0.2678125, + "grad_norm": 3.46875, + "grad_norm_var": 0.26452534993489585, + "learning_rate": 0.0001, + "loss": 6.098, + "loss/crossentropy": 2.537542939186096, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.19158834964036942, + "step": 8570 + }, + { + "epoch": 0.267875, + "grad_norm": 3.5, + "grad_norm_var": 0.26738179524739586, + "learning_rate": 0.0001, + "loss": 6.4872, + "loss/crossentropy": 2.7899457216262817, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.20918139815330505, + "step": 8572 + }, + { + "epoch": 0.2679375, + "grad_norm": 3.265625, + "grad_norm_var": 0.2688385009765625, + "learning_rate": 0.0001, + "loss": 5.6195, + "loss/crossentropy": 2.3636358976364136, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17441652715206146, + "step": 8574 + }, + { + "epoch": 0.268, + "grad_norm": 3.90625, + "grad_norm_var": 0.26971028645833334, + "learning_rate": 0.0001, + "loss": 5.9691, + "loss/crossentropy": 2.4868744611740112, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.18845638632774353, + "step": 8576 + }, + { + "epoch": 0.2680625, + "grad_norm": 3.515625, + "grad_norm_var": 0.8212076822916666, + "learning_rate": 0.0001, + "loss": 6.2685, + "loss/crossentropy": 2.60793399810791, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.20472849905490875, + "step": 8578 + }, + { + "epoch": 0.268125, + "grad_norm": 3.234375, + "grad_norm_var": 0.70054931640625, + "learning_rate": 0.0001, + "loss": 6.0464, + "loss/crossentropy": 2.6345239877700806, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.18219953030347824, + "step": 8580 + }, + { + "epoch": 0.2681875, + "grad_norm": 3.453125, + "grad_norm_var": 1.0117177327473958, + "learning_rate": 0.0001, + "loss": 6.4786, + "loss/crossentropy": 2.73080575466156, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.211110420525074, + "step": 8582 + }, + { + "epoch": 0.26825, + "grad_norm": 3.78125, + "grad_norm_var": 1.0106404622395833, + "learning_rate": 0.0001, + "loss": 6.2299, + "loss/crossentropy": 2.610520124435425, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.19982445240020752, + "step": 8584 + }, + { + "epoch": 0.2683125, + "grad_norm": 3.484375, + "grad_norm_var": 1.0197499593098958, + "learning_rate": 0.0001, + "loss": 6.3412, + "loss/crossentropy": 2.7998993396759033, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1978771984577179, + "step": 8586 + }, + { + "epoch": 0.268375, + "grad_norm": 3.796875, + "grad_norm_var": 1.0201812744140626, + "learning_rate": 0.0001, + "loss": 5.8564, + "loss/crossentropy": 2.38976788520813, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.18689440190792084, + "step": 8588 + }, + { + "epoch": 0.2684375, + "grad_norm": 3.3125, + "grad_norm_var": 0.9783162434895833, + "learning_rate": 0.0001, + "loss": 6.1383, + "loss/crossentropy": 2.5660598278045654, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1982373148202896, + "step": 8590 + }, + { + "epoch": 0.2685, + "grad_norm": 3.34375, + "grad_norm_var": 1.0193644205729167, + "learning_rate": 0.0001, + "loss": 5.9395, + "loss/crossentropy": 2.569070339202881, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18196505308151245, + "step": 8592 + }, + { + "epoch": 0.2685625, + "grad_norm": 3.328125, + "grad_norm_var": 0.485791015625, + "learning_rate": 0.0001, + "loss": 6.0061, + "loss/crossentropy": 2.6035884618759155, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18477965891361237, + "step": 8594 + }, + { + "epoch": 0.268625, + "grad_norm": 3.765625, + "grad_norm_var": 0.4724609375, + "learning_rate": 0.0001, + "loss": 6.2822, + "loss/crossentropy": 2.7133415937423706, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19594413042068481, + "step": 8596 + }, + { + "epoch": 0.2686875, + "grad_norm": 3.234375, + "grad_norm_var": 0.0524810791015625, + "learning_rate": 0.0001, + "loss": 5.9493, + "loss/crossentropy": 2.4889795780181885, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.18743963539600372, + "step": 8598 + }, + { + "epoch": 0.26875, + "grad_norm": 3.375, + "grad_norm_var": 0.0525299072265625, + "learning_rate": 0.0001, + "loss": 6.0522, + "loss/crossentropy": 2.6539541482925415, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18514089286327362, + "step": 8600 + }, + { + "epoch": 0.2688125, + "grad_norm": 3.234375, + "grad_norm_var": 0.05427958170572917, + "learning_rate": 0.0001, + "loss": 5.7099, + "loss/crossentropy": 2.2864630222320557, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.188829205930233, + "step": 8602 + }, + { + "epoch": 0.268875, + "grad_norm": 3.109375, + "grad_norm_var": 0.06525065104166666, + "learning_rate": 0.0001, + "loss": 6.2145, + "loss/crossentropy": 2.717995524406433, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19378919154405594, + "step": 8604 + }, + { + "epoch": 0.2689375, + "grad_norm": 3.125, + "grad_norm_var": 0.0459625244140625, + "learning_rate": 0.0001, + "loss": 5.758, + "loss/crossentropy": 2.4573293924331665, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17771954834461212, + "step": 8606 + }, + { + "epoch": 0.269, + "grad_norm": 3.578125, + "grad_norm_var": 0.05129801432291667, + "learning_rate": 0.0001, + "loss": 5.9259, + "loss/crossentropy": 2.485554337501526, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18817304819822311, + "step": 8608 + }, + { + "epoch": 0.2690625, + "grad_norm": 3.390625, + "grad_norm_var": 0.07372639973958334, + "learning_rate": 0.0001, + "loss": 6.209, + "loss/crossentropy": 2.6714975833892822, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19632639735937119, + "step": 8610 + }, + { + "epoch": 0.269125, + "grad_norm": 3.046875, + "grad_norm_var": 0.07276102701822916, + "learning_rate": 0.0001, + "loss": 5.8628, + "loss/crossentropy": 2.6209064722061157, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17301487177610397, + "step": 8612 + }, + { + "epoch": 0.2691875, + "grad_norm": 3.828125, + "grad_norm_var": 0.09257405598958333, + "learning_rate": 0.0001, + "loss": 6.4746, + "loss/crossentropy": 2.7915139198303223, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.20659378916025162, + "step": 8614 + }, + { + "epoch": 0.26925, + "grad_norm": 3.5625, + "grad_norm_var": 0.10926005045572916, + "learning_rate": 0.0001, + "loss": 6.1453, + "loss/crossentropy": 2.718366861343384, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.18370791524648666, + "step": 8616 + }, + { + "epoch": 0.2693125, + "grad_norm": 4.3125, + "grad_norm_var": 0.144970703125, + "learning_rate": 0.0001, + "loss": 6.0593, + "loss/crossentropy": 2.50391161441803, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19264744967222214, + "step": 8618 + }, + { + "epoch": 0.269375, + "grad_norm": 3.109375, + "grad_norm_var": 0.13963114420572917, + "learning_rate": 0.0001, + "loss": 5.8813, + "loss/crossentropy": 2.5312806367874146, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18343793600797653, + "step": 8620 + }, + { + "epoch": 0.2694375, + "grad_norm": 3.421875, + "grad_norm_var": 0.12897135416666666, + "learning_rate": 0.0001, + "loss": 6.264, + "loss/crossentropy": 2.7507938146591187, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1954570934176445, + "step": 8622 + }, + { + "epoch": 0.2695, + "grad_norm": 3.234375, + "grad_norm_var": 0.1293853759765625, + "learning_rate": 0.0001, + "loss": 5.8537, + "loss/crossentropy": 2.525124430656433, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1809079274535179, + "step": 8624 + }, + { + "epoch": 0.2695625, + "grad_norm": 3.234375, + "grad_norm_var": 0.12226460774739584, + "learning_rate": 0.0001, + "loss": 6.1066, + "loss/crossentropy": 2.6248010396957397, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1934967190027237, + "step": 8626 + }, + { + "epoch": 0.269625, + "grad_norm": 3.34375, + "grad_norm_var": 0.10802408854166666, + "learning_rate": 0.0001, + "loss": 6.0722, + "loss/crossentropy": 2.6180055141448975, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18917328119277954, + "step": 8628 + }, + { + "epoch": 0.2696875, + "grad_norm": 3.421875, + "grad_norm_var": 0.10091044108072916, + "learning_rate": 0.0001, + "loss": 5.994, + "loss/crossentropy": 2.541751265525818, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18936219811439514, + "step": 8630 + }, + { + "epoch": 0.26975, + "grad_norm": 3.46875, + "grad_norm_var": 0.0783843994140625, + "learning_rate": 0.0001, + "loss": 6.3035, + "loss/crossentropy": 2.8230782747268677, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18983669579029083, + "step": 8632 + }, + { + "epoch": 0.2698125, + "grad_norm": 3.796875, + "grad_norm_var": 0.0278717041015625, + "learning_rate": 0.0001, + "loss": 6.2003, + "loss/crossentropy": 2.559078574180603, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.2012312412261963, + "step": 8634 + }, + { + "epoch": 0.269875, + "grad_norm": 3.453125, + "grad_norm_var": 0.0256500244140625, + "learning_rate": 0.0001, + "loss": 5.8106, + "loss/crossentropy": 2.497360348701477, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17976141721010208, + "step": 8636 + }, + { + "epoch": 0.2699375, + "grad_norm": 3.65625, + "grad_norm_var": 0.0294830322265625, + "learning_rate": 0.0001, + "loss": 6.3131, + "loss/crossentropy": 2.727493166923523, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.1984044834971428, + "step": 8638 + }, + { + "epoch": 0.27, + "grad_norm": 3.25, + "grad_norm_var": 0.0282379150390625, + "learning_rate": 0.0001, + "loss": 5.8564, + "loss/crossentropy": 2.4807841777801514, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1840466484427452, + "step": 8640 + }, + { + "epoch": 0.2700625, + "grad_norm": 3.25, + "grad_norm_var": 0.024470011393229168, + "learning_rate": 0.0001, + "loss": 5.8187, + "loss/crossentropy": 2.4412097930908203, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18267560750246048, + "step": 8642 + }, + { + "epoch": 0.270125, + "grad_norm": 3.703125, + "grad_norm_var": 0.03683268229166667, + "learning_rate": 0.0001, + "loss": 6.5103, + "loss/crossentropy": 2.903472065925598, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.20131144672632217, + "step": 8644 + }, + { + "epoch": 0.2701875, + "grad_norm": 3.21875, + "grad_norm_var": 0.04053446451822917, + "learning_rate": 0.0001, + "loss": 6.2487, + "loss/crossentropy": 2.7037285566329956, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.18965797871351242, + "step": 8646 + }, + { + "epoch": 0.27025, + "grad_norm": 3.203125, + "grad_norm_var": 0.04619038899739583, + "learning_rate": 0.0001, + "loss": 5.8646, + "loss/crossentropy": 2.4815542697906494, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18361923843622208, + "step": 8648 + }, + { + "epoch": 0.2703125, + "grad_norm": 3.859375, + "grad_norm_var": 0.0555572509765625, + "learning_rate": 0.0001, + "loss": 5.7107, + "loss/crossentropy": 2.400591015815735, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17944909632205963, + "step": 8650 + }, + { + "epoch": 0.270375, + "grad_norm": 3.5625, + "grad_norm_var": 0.05408528645833333, + "learning_rate": 0.0001, + "loss": 6.0948, + "loss/crossentropy": 2.658158540725708, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1862383633852005, + "step": 8652 + }, + { + "epoch": 0.2704375, + "grad_norm": 3.234375, + "grad_norm_var": 0.052144368489583336, + "learning_rate": 0.0001, + "loss": 5.8797, + "loss/crossentropy": 2.499776244163513, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18056650459766388, + "step": 8654 + }, + { + "epoch": 0.2705, + "grad_norm": 3.609375, + "grad_norm_var": 0.05791015625, + "learning_rate": 0.0001, + "loss": 6.1491, + "loss/crossentropy": 2.69377064704895, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18849724531173706, + "step": 8656 + }, + { + "epoch": 0.2705625, + "grad_norm": 3.296875, + "grad_norm_var": 0.057047526041666664, + "learning_rate": 0.0001, + "loss": 6.3627, + "loss/crossentropy": 2.8331875801086426, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19318635761737823, + "step": 8658 + }, + { + "epoch": 0.270625, + "grad_norm": 3.265625, + "grad_norm_var": 0.04064127604166667, + "learning_rate": 0.0001, + "loss": 6.2608, + "loss/crossentropy": 2.7417465448379517, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19448032975196838, + "step": 8660 + }, + { + "epoch": 0.2706875, + "grad_norm": 3.4375, + "grad_norm_var": 0.03798828125, + "learning_rate": 0.0001, + "loss": 6.0342, + "loss/crossentropy": 2.577946186065674, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.19172074645757675, + "step": 8662 + }, + { + "epoch": 0.27075, + "grad_norm": 3.234375, + "grad_norm_var": 0.035868326822916664, + "learning_rate": 0.0001, + "loss": 5.8113, + "loss/crossentropy": 2.4585071802139282, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18137074261903763, + "step": 8664 + }, + { + "epoch": 0.2708125, + "grad_norm": 3.03125, + "grad_norm_var": 0.025972493489583335, + "learning_rate": 0.0001, + "loss": 5.7591, + "loss/crossentropy": 2.4618247747421265, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1773819923400879, + "step": 8666 + }, + { + "epoch": 0.270875, + "grad_norm": 3.625, + "grad_norm_var": 0.028490193684895835, + "learning_rate": 0.0001, + "loss": 6.1608, + "loss/crossentropy": 2.6863608360290527, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.18884887546300888, + "step": 8668 + }, + { + "epoch": 0.2709375, + "grad_norm": 3.5, + "grad_norm_var": 0.02926025390625, + "learning_rate": 0.0001, + "loss": 5.778, + "loss/crossentropy": 2.4676570892333984, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17751803994178772, + "step": 8670 + }, + { + "epoch": 0.271, + "grad_norm": 3.1875, + "grad_norm_var": 0.02369384765625, + "learning_rate": 0.0001, + "loss": 6.1098, + "loss/crossentropy": 2.6976585388183594, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18300720304250717, + "step": 8672 + }, + { + "epoch": 0.2710625, + "grad_norm": 3.625, + "grad_norm_var": 0.0282867431640625, + "learning_rate": 0.0001, + "loss": 6.1249, + "loss/crossentropy": 2.682486414909363, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.19112010300159454, + "step": 8674 + }, + { + "epoch": 0.271125, + "grad_norm": 3.78125, + "grad_norm_var": 0.08658447265625, + "learning_rate": 0.0001, + "loss": 6.3226, + "loss/crossentropy": 2.64150333404541, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.20756685733795166, + "step": 8676 + }, + { + "epoch": 0.2711875, + "grad_norm": 3.5625, + "grad_norm_var": 0.08772786458333333, + "learning_rate": 0.0001, + "loss": 6.1881, + "loss/crossentropy": 2.605919599533081, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19922953099012375, + "step": 8678 + }, + { + "epoch": 0.27125, + "grad_norm": 6.375, + "grad_norm_var": 0.6093251546223958, + "learning_rate": 0.0001, + "loss": 6.0575, + "loss/crossentropy": 2.4938093423843384, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19347576051950455, + "step": 8680 + }, + { + "epoch": 0.2713125, + "grad_norm": 3.625, + "grad_norm_var": 0.5719716389973958, + "learning_rate": 0.0001, + "loss": 6.2066, + "loss/crossentropy": 2.7196247577667236, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.18736734986305237, + "step": 8682 + }, + { + "epoch": 0.271375, + "grad_norm": 3.984375, + "grad_norm_var": 0.561279296875, + "learning_rate": 0.0001, + "loss": 6.3675, + "loss/crossentropy": 2.715699076652527, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.20502400398254395, + "step": 8684 + }, + { + "epoch": 0.2714375, + "grad_norm": 3.484375, + "grad_norm_var": 0.5424550374348959, + "learning_rate": 0.0001, + "loss": 6.0175, + "loss/crossentropy": 2.6258652210235596, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1836940422654152, + "step": 8686 + }, + { + "epoch": 0.2715, + "grad_norm": 3.65625, + "grad_norm_var": 0.530419921875, + "learning_rate": 0.0001, + "loss": 5.9491, + "loss/crossentropy": 2.454306483268738, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19049323350191116, + "step": 8688 + }, + { + "epoch": 0.2715625, + "grad_norm": 3.734375, + "grad_norm_var": 0.5486979166666667, + "learning_rate": 0.0001, + "loss": 6.4043, + "loss/crossentropy": 2.7677533626556396, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.2042810544371605, + "step": 8690 + }, + { + "epoch": 0.271625, + "grad_norm": 3.5625, + "grad_norm_var": 0.5486165364583333, + "learning_rate": 0.0001, + "loss": 5.8867, + "loss/crossentropy": 2.4385541677474976, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.18543724715709686, + "step": 8692 + }, + { + "epoch": 0.2716875, + "grad_norm": 4.125, + "grad_norm_var": 0.5405588785807292, + "learning_rate": 0.0001, + "loss": 6.4892, + "loss/crossentropy": 2.7224197387695312, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.20988348871469498, + "step": 8694 + }, + { + "epoch": 0.27175, + "grad_norm": 3.6875, + "grad_norm_var": 0.07668863932291667, + "learning_rate": 0.0001, + "loss": 6.1692, + "loss/crossentropy": 2.6610034704208374, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.18948668986558914, + "step": 8696 + }, + { + "epoch": 0.2718125, + "grad_norm": 3.609375, + "grad_norm_var": 0.07870686848958333, + "learning_rate": 0.0001, + "loss": 6.0613, + "loss/crossentropy": 2.5653761625289917, + "loss/hidden": 1.6171875, + "loss/jsd": 0.0, + "loss/logits": 0.1878705695271492, + "step": 8698 + }, + { + "epoch": 0.271875, + "grad_norm": 3.546875, + "grad_norm_var": 0.0612213134765625, + "learning_rate": 0.0001, + "loss": 5.9579, + "loss/crossentropy": 2.5688871145248413, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18382221460342407, + "step": 8700 + }, + { + "epoch": 0.2719375, + "grad_norm": 3.28125, + "grad_norm_var": 0.05758056640625, + "learning_rate": 0.0001, + "loss": 5.8379, + "loss/crossentropy": 2.452753782272339, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18109457194805145, + "step": 8702 + }, + { + "epoch": 0.272, + "grad_norm": 3.3125, + "grad_norm_var": 0.05315348307291667, + "learning_rate": 0.0001, + "loss": 5.9339, + "loss/crossentropy": 2.5359824895858765, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18471135199069977, + "step": 8704 + }, + { + "epoch": 0.2720625, + "grad_norm": 3.34375, + "grad_norm_var": 0.04827067057291667, + "learning_rate": 0.0001, + "loss": 5.9526, + "loss/crossentropy": 2.540789246559143, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18844721466302872, + "step": 8706 + }, + { + "epoch": 0.272125, + "grad_norm": 3.671875, + "grad_norm_var": 0.048949178059895834, + "learning_rate": 0.0001, + "loss": 6.377, + "loss/crossentropy": 2.760253071784973, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.20386086404323578, + "step": 8708 + }, + { + "epoch": 0.2721875, + "grad_norm": 3.171875, + "grad_norm_var": 0.021610514322916666, + "learning_rate": 0.0001, + "loss": 5.8678, + "loss/crossentropy": 2.484999179840088, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18554507941007614, + "step": 8710 + }, + { + "epoch": 0.27225, + "grad_norm": 3.375, + "grad_norm_var": 0.018375651041666666, + "learning_rate": 0.0001, + "loss": 5.7152, + "loss/crossentropy": 2.4064172506332397, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17736196517944336, + "step": 8712 + }, + { + "epoch": 0.2723125, + "grad_norm": 3.328125, + "grad_norm_var": 0.016304524739583333, + "learning_rate": 0.0001, + "loss": 6.194, + "loss/crossentropy": 2.6760975122451782, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1951456367969513, + "step": 8714 + }, + { + "epoch": 0.272375, + "grad_norm": 3.3125, + "grad_norm_var": 0.015327962239583333, + "learning_rate": 0.0001, + "loss": 6.0213, + "loss/crossentropy": 2.6049975156784058, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18694640696048737, + "step": 8716 + }, + { + "epoch": 0.2724375, + "grad_norm": 3.53125, + "grad_norm_var": 0.01611328125, + "learning_rate": 0.0001, + "loss": 6.2175, + "loss/crossentropy": 2.626902222633362, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.19265294820070267, + "step": 8718 + }, + { + "epoch": 0.2725, + "grad_norm": 3.296875, + "grad_norm_var": 0.019466145833333334, + "learning_rate": 0.0001, + "loss": 6.3469, + "loss/crossentropy": 2.8148353099823, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1957806497812271, + "step": 8720 + }, + { + "epoch": 0.2725625, + "grad_norm": 3.328125, + "grad_norm_var": 0.020319620768229168, + "learning_rate": 0.0001, + "loss": 5.6954, + "loss/crossentropy": 2.363893508911133, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17806947231292725, + "step": 8722 + }, + { + "epoch": 0.272625, + "grad_norm": 3.359375, + "grad_norm_var": 0.015653483072916665, + "learning_rate": 0.0001, + "loss": 5.9333, + "loss/crossentropy": 2.547742247581482, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18308507651090622, + "step": 8724 + }, + { + "epoch": 0.2726875, + "grad_norm": 3.8125, + "grad_norm_var": 0.030744425455729165, + "learning_rate": 0.0001, + "loss": 6.2602, + "loss/crossentropy": 2.6908657550811768, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19951613247394562, + "step": 8726 + }, + { + "epoch": 0.27275, + "grad_norm": 3.5, + "grad_norm_var": 0.029955037434895835, + "learning_rate": 0.0001, + "loss": 6.0924, + "loss/crossentropy": 2.683358073234558, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18660413473844528, + "step": 8728 + }, + { + "epoch": 0.2728125, + "grad_norm": 3.921875, + "grad_norm_var": 0.04309794108072917, + "learning_rate": 0.0001, + "loss": 6.1644, + "loss/crossentropy": 2.6175061464309692, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1972716897726059, + "step": 8730 + }, + { + "epoch": 0.272875, + "grad_norm": 4.09375, + "grad_norm_var": 0.06542561848958334, + "learning_rate": 0.0001, + "loss": 6.2474, + "loss/crossentropy": 2.5593132972717285, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.2043544426560402, + "step": 8732 + }, + { + "epoch": 0.2729375, + "grad_norm": 3.640625, + "grad_norm_var": 0.06306864420572916, + "learning_rate": 0.0001, + "loss": 6.2188, + "loss/crossentropy": 2.6943942308425903, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1946251168847084, + "step": 8734 + }, + { + "epoch": 0.273, + "grad_norm": 3.359375, + "grad_norm_var": 0.0703125, + "learning_rate": 0.0001, + "loss": 5.8123, + "loss/crossentropy": 2.492477774620056, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.17495298385620117, + "step": 8736 + }, + { + "epoch": 0.2730625, + "grad_norm": 3.890625, + "grad_norm_var": 0.06643778483072917, + "learning_rate": 0.0001, + "loss": 6.3913, + "loss/crossentropy": 2.8301221132278442, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19869162142276764, + "step": 8738 + }, + { + "epoch": 0.273125, + "grad_norm": 3.671875, + "grad_norm_var": 0.0551177978515625, + "learning_rate": 0.0001, + "loss": 6.3365, + "loss/crossentropy": 2.775020718574524, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19950658828020096, + "step": 8740 + }, + { + "epoch": 0.2731875, + "grad_norm": 3.140625, + "grad_norm_var": 0.06675516764322917, + "learning_rate": 0.0001, + "loss": 5.8037, + "loss/crossentropy": 2.5151455402374268, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1757265403866768, + "step": 8742 + }, + { + "epoch": 0.27325, + "grad_norm": 3.203125, + "grad_norm_var": 0.1047027587890625, + "learning_rate": 0.0001, + "loss": 5.8188, + "loss/crossentropy": 2.563611149787903, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17200451344251633, + "step": 8744 + }, + { + "epoch": 0.2733125, + "grad_norm": 3.40625, + "grad_norm_var": 0.1, + "learning_rate": 0.0001, + "loss": 6.1039, + "loss/crossentropy": 2.6742992401123047, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18905840814113617, + "step": 8746 + }, + { + "epoch": 0.273375, + "grad_norm": 3.5, + "grad_norm_var": 0.070947265625, + "learning_rate": 0.0001, + "loss": 5.9399, + "loss/crossentropy": 2.424111247062683, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19102928042411804, + "step": 8748 + }, + { + "epoch": 0.2734375, + "grad_norm": 3.640625, + "grad_norm_var": 0.0660797119140625, + "learning_rate": 0.0001, + "loss": 6.22, + "loss/crossentropy": 2.6243419647216797, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.20409469306468964, + "step": 8750 + }, + { + "epoch": 0.2735, + "grad_norm": 3.71875, + "grad_norm_var": 0.0675201416015625, + "learning_rate": 0.0001, + "loss": 6.0798, + "loss/crossentropy": 2.552013397216797, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19223305583000183, + "step": 8752 + }, + { + "epoch": 0.2735625, + "grad_norm": 3.5, + "grad_norm_var": 0.057103474934895836, + "learning_rate": 0.0001, + "loss": 6.2181, + "loss/crossentropy": 2.641934394836426, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19745796918869019, + "step": 8754 + }, + { + "epoch": 0.273625, + "grad_norm": 3.390625, + "grad_norm_var": 0.052652994791666664, + "learning_rate": 0.0001, + "loss": 6.222, + "loss/crossentropy": 2.711361527442932, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19051231443881989, + "step": 8756 + }, + { + "epoch": 0.2736875, + "grad_norm": 3.421875, + "grad_norm_var": 0.0475982666015625, + "learning_rate": 0.0001, + "loss": 6.0285, + "loss/crossentropy": 2.663419246673584, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18182026594877243, + "step": 8758 + }, + { + "epoch": 0.27375, + "grad_norm": 4.65625, + "grad_norm_var": 0.10269775390625, + "learning_rate": 0.0001, + "loss": 6.1043, + "loss/crossentropy": 2.649004578590393, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18811038881540298, + "step": 8760 + }, + { + "epoch": 0.2738125, + "grad_norm": 3.4375, + "grad_norm_var": 0.0944732666015625, + "learning_rate": 0.0001, + "loss": 6.08, + "loss/crossentropy": 2.5890932083129883, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1920555904507637, + "step": 8762 + }, + { + "epoch": 0.273875, + "grad_norm": 3.46875, + "grad_norm_var": 0.097216796875, + "learning_rate": 0.0001, + "loss": 6.2831, + "loss/crossentropy": 2.7520138025283813, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19490370899438858, + "step": 8764 + }, + { + "epoch": 0.2739375, + "grad_norm": 3.390625, + "grad_norm_var": 0.10485026041666666, + "learning_rate": 0.0001, + "loss": 5.9451, + "loss/crossentropy": 2.5493834018707275, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18566906452178955, + "step": 8766 + }, + { + "epoch": 0.274, + "grad_norm": 3.625, + "grad_norm_var": 0.10478108723958333, + "learning_rate": 0.0001, + "loss": 6.0873, + "loss/crossentropy": 2.5950188636779785, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19141746312379837, + "step": 8768 + }, + { + "epoch": 0.2740625, + "grad_norm": 3.765625, + "grad_norm_var": 0.1172027587890625, + "learning_rate": 0.0001, + "loss": 6.4435, + "loss/crossentropy": 2.8003259897232056, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.19986136257648468, + "step": 8770 + }, + { + "epoch": 0.274125, + "grad_norm": 3.328125, + "grad_norm_var": 0.12132059733072917, + "learning_rate": 0.0001, + "loss": 5.8654, + "loss/crossentropy": 2.447899103164673, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18784713745117188, + "step": 8772 + }, + { + "epoch": 0.2741875, + "grad_norm": 3.4375, + "grad_norm_var": 0.438037109375, + "learning_rate": 0.0001, + "loss": 6.2964, + "loss/crossentropy": 2.7445706129074097, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19385666400194168, + "step": 8774 + }, + { + "epoch": 0.27425, + "grad_norm": 3.53125, + "grad_norm_var": 0.3990559895833333, + "learning_rate": 0.0001, + "loss": 5.6377, + "loss/crossentropy": 2.3796085119247437, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17463870346546173, + "step": 8776 + }, + { + "epoch": 0.2743125, + "grad_norm": 3.484375, + "grad_norm_var": 0.40067952473958335, + "learning_rate": 0.0001, + "loss": 5.7103, + "loss/crossentropy": 2.4123687744140625, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17666452378034592, + "step": 8778 + }, + { + "epoch": 0.274375, + "grad_norm": 3.6875, + "grad_norm_var": 0.404345703125, + "learning_rate": 0.0001, + "loss": 6.1997, + "loss/crossentropy": 2.716852307319641, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19359584152698517, + "step": 8780 + }, + { + "epoch": 0.2744375, + "grad_norm": 2.921875, + "grad_norm_var": 0.42649637858072914, + "learning_rate": 0.0001, + "loss": 6.1726, + "loss/crossentropy": 2.763651132583618, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1865966022014618, + "step": 8782 + }, + { + "epoch": 0.2745, + "grad_norm": 3.0625, + "grad_norm_var": 0.44247639973958336, + "learning_rate": 0.0001, + "loss": 5.8291, + "loss/crossentropy": 2.481606960296631, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18435516953468323, + "step": 8784 + }, + { + "epoch": 0.2745625, + "grad_norm": 3.21875, + "grad_norm_var": 0.43795572916666664, + "learning_rate": 0.0001, + "loss": 5.8875, + "loss/crossentropy": 2.5274109840393066, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18053767830133438, + "step": 8786 + }, + { + "epoch": 0.274625, + "grad_norm": 3.28125, + "grad_norm_var": 0.43502197265625, + "learning_rate": 0.0001, + "loss": 6.3651, + "loss/crossentropy": 2.8572980165481567, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19218553602695465, + "step": 8788 + }, + { + "epoch": 0.2746875, + "grad_norm": 3.421875, + "grad_norm_var": 0.06050516764322917, + "learning_rate": 0.0001, + "loss": 5.8025, + "loss/crossentropy": 2.4208855628967285, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.1799629107117653, + "step": 8790 + }, + { + "epoch": 0.27475, + "grad_norm": 3.828125, + "grad_norm_var": 0.06599833170572916, + "learning_rate": 0.0001, + "loss": 5.9873, + "loss/crossentropy": 2.390637755393982, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.19404463469982147, + "step": 8792 + }, + { + "epoch": 0.2748125, + "grad_norm": 3.421875, + "grad_norm_var": 0.06614481608072917, + "learning_rate": 0.0001, + "loss": 6.2452, + "loss/crossentropy": 2.7461918592453003, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19520880281925201, + "step": 8794 + }, + { + "epoch": 0.274875, + "grad_norm": 3.421875, + "grad_norm_var": 0.059794108072916664, + "learning_rate": 0.0001, + "loss": 6.1468, + "loss/crossentropy": 2.60414457321167, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.1945016235113144, + "step": 8796 + }, + { + "epoch": 0.2749375, + "grad_norm": 3.109375, + "grad_norm_var": 0.06798502604166666, + "learning_rate": 0.0001, + "loss": 5.2521, + "loss/crossentropy": 2.1276236176490784, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16166242957115173, + "step": 8798 + }, + { + "epoch": 0.275, + "grad_norm": 3.265625, + "grad_norm_var": 0.06199442545572917, + "learning_rate": 0.0001, + "loss": 5.9335, + "loss/crossentropy": 2.6193655729293823, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17672650516033173, + "step": 8800 + }, + { + "epoch": 0.2750625, + "grad_norm": 3.671875, + "grad_norm_var": 0.0619140625, + "learning_rate": 0.0001, + "loss": 5.8253, + "loss/crossentropy": 2.409619092941284, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18688050657510757, + "step": 8802 + }, + { + "epoch": 0.275125, + "grad_norm": 3.328125, + "grad_norm_var": 0.0607818603515625, + "learning_rate": 0.0001, + "loss": 5.8032, + "loss/crossentropy": 2.3988406658172607, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18301154673099518, + "step": 8804 + }, + { + "epoch": 0.2751875, + "grad_norm": 3.3125, + "grad_norm_var": 0.05152079264322917, + "learning_rate": 0.0001, + "loss": 5.8565, + "loss/crossentropy": 2.544641375541687, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17649392038583755, + "step": 8806 + }, + { + "epoch": 0.27525, + "grad_norm": 3.203125, + "grad_norm_var": 0.043578084309895834, + "learning_rate": 0.0001, + "loss": 5.813, + "loss/crossentropy": 2.5152865648269653, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17820528894662857, + "step": 8808 + }, + { + "epoch": 0.2753125, + "grad_norm": 3.15625, + "grad_norm_var": 0.04828999837239583, + "learning_rate": 0.0001, + "loss": 5.7696, + "loss/crossentropy": 2.489772319793701, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17680658400058746, + "step": 8810 + }, + { + "epoch": 0.275375, + "grad_norm": 3.59375, + "grad_norm_var": 0.051301066080729166, + "learning_rate": 0.0001, + "loss": 6.1193, + "loss/crossentropy": 2.623705267906189, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.18940023332834244, + "step": 8812 + }, + { + "epoch": 0.2754375, + "grad_norm": 3.4375, + "grad_norm_var": 0.024609375, + "learning_rate": 0.0001, + "loss": 6.0475, + "loss/crossentropy": 2.624902129173279, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18718025833368301, + "step": 8814 + }, + { + "epoch": 0.2755, + "grad_norm": 3.453125, + "grad_norm_var": 0.034440104166666666, + "learning_rate": 0.0001, + "loss": 5.7325, + "loss/crossentropy": 2.4753358364105225, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17297816276550293, + "step": 8816 + }, + { + "epoch": 0.2755625, + "grad_norm": 3.421875, + "grad_norm_var": 0.028645833333333332, + "learning_rate": 0.0001, + "loss": 5.5818, + "loss/crossentropy": 2.259931802749634, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.1712503433227539, + "step": 8818 + }, + { + "epoch": 0.275625, + "grad_norm": 3.34375, + "grad_norm_var": 0.027391560872395835, + "learning_rate": 0.0001, + "loss": 5.6535, + "loss/crossentropy": 2.3533644676208496, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1788463592529297, + "step": 8820 + }, + { + "epoch": 0.2756875, + "grad_norm": 3.546875, + "grad_norm_var": 0.045308430989583336, + "learning_rate": 0.0001, + "loss": 6.2962, + "loss/crossentropy": 2.6124351024627686, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.20549017190933228, + "step": 8822 + }, + { + "epoch": 0.27575, + "grad_norm": 3.234375, + "grad_norm_var": 0.043732706705729166, + "learning_rate": 0.0001, + "loss": 6.3325, + "loss/crossentropy": 2.8760937452316284, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1901729702949524, + "step": 8824 + }, + { + "epoch": 0.2758125, + "grad_norm": 7.46875, + "grad_norm_var": 1.0603678385416666, + "learning_rate": 0.0001, + "loss": 6.0266, + "loss/crossentropy": 2.5222177505493164, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.191059410572052, + "step": 8826 + }, + { + "epoch": 0.275875, + "grad_norm": 3.515625, + "grad_norm_var": 1.0563140869140626, + "learning_rate": 0.0001, + "loss": 6.0066, + "loss/crossentropy": 2.600649833679199, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18512310087680817, + "step": 8828 + }, + { + "epoch": 0.2759375, + "grad_norm": 3.671875, + "grad_norm_var": 1.0502675374348958, + "learning_rate": 0.0001, + "loss": 6.071, + "loss/crossentropy": 2.4670032262802124, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.20375903695821762, + "step": 8830 + }, + { + "epoch": 0.276, + "grad_norm": 3.265625, + "grad_norm_var": 1.0273834228515626, + "learning_rate": 0.0001, + "loss": 5.8144, + "loss/crossentropy": 2.4422956705093384, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.183305524289608, + "step": 8832 + }, + { + "epoch": 0.2760625, + "grad_norm": 3.265625, + "grad_norm_var": 1.03642578125, + "learning_rate": 0.0001, + "loss": 6.1734, + "loss/crossentropy": 2.7296916246414185, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1869504600763321, + "step": 8834 + }, + { + "epoch": 0.276125, + "grad_norm": 3.53125, + "grad_norm_var": 1.035163370768229, + "learning_rate": 0.0001, + "loss": 5.7405, + "loss/crossentropy": 2.4141600131988525, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.179900124669075, + "step": 8836 + }, + { + "epoch": 0.2761875, + "grad_norm": 3.296875, + "grad_norm_var": 1.0578277587890625, + "learning_rate": 0.0001, + "loss": 5.9823, + "loss/crossentropy": 2.6309428215026855, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18005498498678207, + "step": 8838 + }, + { + "epoch": 0.27625, + "grad_norm": 3.734375, + "grad_norm_var": 1.0456451416015624, + "learning_rate": 0.0001, + "loss": 6.1293, + "loss/crossentropy": 2.5688467025756836, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19628392159938812, + "step": 8840 + }, + { + "epoch": 0.2763125, + "grad_norm": 3.375, + "grad_norm_var": 0.04659830729166667, + "learning_rate": 0.0001, + "loss": 6.0435, + "loss/crossentropy": 2.640480637550354, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18639644235372543, + "step": 8842 + }, + { + "epoch": 0.276375, + "grad_norm": 3.390625, + "grad_norm_var": 0.047200520833333336, + "learning_rate": 0.0001, + "loss": 6.2727, + "loss/crossentropy": 2.7400788068771362, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19193562865257263, + "step": 8844 + }, + { + "epoch": 0.2764375, + "grad_norm": 3.46875, + "grad_norm_var": 0.0207916259765625, + "learning_rate": 0.0001, + "loss": 6.1126, + "loss/crossentropy": 2.6935667991638184, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18643878400325775, + "step": 8846 + }, + { + "epoch": 0.2765, + "grad_norm": 4.0625, + "grad_norm_var": 0.0489654541015625, + "learning_rate": 0.0001, + "loss": 6.1042, + "loss/crossentropy": 2.595457911491394, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1918850839138031, + "step": 8848 + }, + { + "epoch": 0.2765625, + "grad_norm": 3.125, + "grad_norm_var": 0.05380452473958333, + "learning_rate": 0.0001, + "loss": 5.7562, + "loss/crossentropy": 2.459462285041809, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1761576235294342, + "step": 8850 + }, + { + "epoch": 0.276625, + "grad_norm": 3.25, + "grad_norm_var": 0.055150349934895836, + "learning_rate": 0.0001, + "loss": 5.8443, + "loss/crossentropy": 2.4820131063461304, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1799747496843338, + "step": 8852 + }, + { + "epoch": 0.2766875, + "grad_norm": 3.1875, + "grad_norm_var": 0.05754292805989583, + "learning_rate": 0.0001, + "loss": 5.8885, + "loss/crossentropy": 2.467939019203186, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18658292293548584, + "step": 8854 + }, + { + "epoch": 0.27675, + "grad_norm": 3.390625, + "grad_norm_var": 0.04973551432291667, + "learning_rate": 0.0001, + "loss": 6.0669, + "loss/crossentropy": 2.6163350343704224, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18998120725154877, + "step": 8856 + }, + { + "epoch": 0.2768125, + "grad_norm": 3.140625, + "grad_norm_var": 0.0527984619140625, + "learning_rate": 0.0001, + "loss": 5.7258, + "loss/crossentropy": 2.452614665031433, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17732173204421997, + "step": 8858 + }, + { + "epoch": 0.276875, + "grad_norm": 4.5, + "grad_norm_var": 0.13489583333333333, + "learning_rate": 0.0001, + "loss": 6.393, + "loss/crossentropy": 2.8383991718292236, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19726187735795975, + "step": 8860 + }, + { + "epoch": 0.2769375, + "grad_norm": 3.6875, + "grad_norm_var": 0.13736063639322918, + "learning_rate": 0.0001, + "loss": 6.1063, + "loss/crossentropy": 2.6258177757263184, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19257892668247223, + "step": 8862 + }, + { + "epoch": 0.277, + "grad_norm": 3.484375, + "grad_norm_var": 0.11110026041666667, + "learning_rate": 0.0001, + "loss": 5.8231, + "loss/crossentropy": 2.3822438716888428, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.18197131156921387, + "step": 8864 + }, + { + "epoch": 0.2770625, + "grad_norm": 3.390625, + "grad_norm_var": 0.10546875, + "learning_rate": 0.0001, + "loss": 5.7566, + "loss/crossentropy": 2.384195327758789, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18294084072113037, + "step": 8866 + }, + { + "epoch": 0.277125, + "grad_norm": 3.21875, + "grad_norm_var": 0.10819905598958333, + "learning_rate": 0.0001, + "loss": 5.9445, + "loss/crossentropy": 2.611143112182617, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17903496325016022, + "step": 8868 + }, + { + "epoch": 0.2771875, + "grad_norm": 3.375, + "grad_norm_var": 0.1090972900390625, + "learning_rate": 0.0001, + "loss": 6.1364, + "loss/crossentropy": 2.7580084800720215, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18236715346574783, + "step": 8870 + }, + { + "epoch": 0.27725, + "grad_norm": 3.390625, + "grad_norm_var": 0.10901285807291666, + "learning_rate": 0.0001, + "loss": 5.88, + "loss/crossentropy": 2.5484120845794678, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18042747676372528, + "step": 8872 + }, + { + "epoch": 0.2773125, + "grad_norm": 4.375, + "grad_norm_var": 0.16637369791666667, + "learning_rate": 0.0001, + "loss": 6.0186, + "loss/crossentropy": 2.544276237487793, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19157471507787704, + "step": 8874 + }, + { + "epoch": 0.277375, + "grad_norm": 3.46875, + "grad_norm_var": 0.09169514973958333, + "learning_rate": 0.0001, + "loss": 5.927, + "loss/crossentropy": 2.526648759841919, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18495458364486694, + "step": 8876 + }, + { + "epoch": 0.2774375, + "grad_norm": 3.234375, + "grad_norm_var": 0.09654541015625, + "learning_rate": 0.0001, + "loss": 5.9058, + "loss/crossentropy": 2.490079164505005, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.18298199772834778, + "step": 8878 + }, + { + "epoch": 0.2775, + "grad_norm": 3.140625, + "grad_norm_var": 0.0988189697265625, + "learning_rate": 0.0001, + "loss": 5.924, + "loss/crossentropy": 2.631035327911377, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17694874107837677, + "step": 8880 + }, + { + "epoch": 0.2775625, + "grad_norm": 2.984375, + "grad_norm_var": 0.14885660807291667, + "learning_rate": 0.0001, + "loss": 5.9504, + "loss/crossentropy": 2.489716410636902, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.18356838822364807, + "step": 8882 + }, + { + "epoch": 0.277625, + "grad_norm": 3.171875, + "grad_norm_var": 0.15318603515625, + "learning_rate": 0.0001, + "loss": 6.0393, + "loss/crossentropy": 2.6960405111312866, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.17495335638523102, + "step": 8884 + }, + { + "epoch": 0.2776875, + "grad_norm": 3.0, + "grad_norm_var": 0.15896809895833333, + "learning_rate": 0.0001, + "loss": 5.8487, + "loss/crossentropy": 2.581865906715393, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17394591867923737, + "step": 8886 + }, + { + "epoch": 0.27775, + "grad_norm": 3.546875, + "grad_norm_var": 0.16142476399739583, + "learning_rate": 0.0001, + "loss": 6.2924, + "loss/crossentropy": 2.6841617822647095, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.20144400000572205, + "step": 8888 + }, + { + "epoch": 0.2778125, + "grad_norm": 3.125, + "grad_norm_var": 0.09556884765625, + "learning_rate": 0.0001, + "loss": 5.8561, + "loss/crossentropy": 2.5051841735839844, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.17806024849414825, + "step": 8890 + }, + { + "epoch": 0.277875, + "grad_norm": 4.03125, + "grad_norm_var": 0.1261383056640625, + "learning_rate": 0.0001, + "loss": 6.0474, + "loss/crossentropy": 2.5743348598480225, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.19379350543022156, + "step": 8892 + }, + { + "epoch": 0.2779375, + "grad_norm": 3.15625, + "grad_norm_var": 0.12121988932291666, + "learning_rate": 0.0001, + "loss": 6.1704, + "loss/crossentropy": 2.7337318658828735, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.18507727980613708, + "step": 8894 + }, + { + "epoch": 0.278, + "grad_norm": 3.109375, + "grad_norm_var": 0.1221343994140625, + "learning_rate": 0.0001, + "loss": 5.8408, + "loss/crossentropy": 2.5032352209091187, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17828834056854248, + "step": 8896 + }, + { + "epoch": 0.2780625, + "grad_norm": 3.21875, + "grad_norm_var": 0.0725006103515625, + "learning_rate": 0.0001, + "loss": 5.7559, + "loss/crossentropy": 2.4360796213150024, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1765114590525627, + "step": 8898 + }, + { + "epoch": 0.278125, + "grad_norm": 3.765625, + "grad_norm_var": 0.08477274576822917, + "learning_rate": 0.0001, + "loss": 6.2151, + "loss/crossentropy": 2.651309370994568, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19661091268062592, + "step": 8900 + }, + { + "epoch": 0.2781875, + "grad_norm": 3.453125, + "grad_norm_var": 0.06966044108072916, + "learning_rate": 0.0001, + "loss": 5.8337, + "loss/crossentropy": 2.427757143974304, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18590442836284637, + "step": 8902 + }, + { + "epoch": 0.27825, + "grad_norm": 3.890625, + "grad_norm_var": 0.08192952473958333, + "learning_rate": 0.0001, + "loss": 5.9187, + "loss/crossentropy": 2.3886170387268066, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19714532792568207, + "step": 8904 + }, + { + "epoch": 0.2783125, + "grad_norm": 3.625, + "grad_norm_var": 0.06999409993489583, + "learning_rate": 0.0001, + "loss": 5.7418, + "loss/crossentropy": 2.3733723163604736, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18254968523979187, + "step": 8906 + }, + { + "epoch": 0.278375, + "grad_norm": 3.359375, + "grad_norm_var": 0.0604400634765625, + "learning_rate": 0.0001, + "loss": 5.7892, + "loss/crossentropy": 2.49438738822937, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1767505556344986, + "step": 8908 + }, + { + "epoch": 0.2784375, + "grad_norm": 3.28125, + "grad_norm_var": 0.061400349934895834, + "learning_rate": 0.0001, + "loss": 5.5431, + "loss/crossentropy": 2.2899755239486694, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17140889167785645, + "step": 8910 + }, + { + "epoch": 0.2785, + "grad_norm": 3.421875, + "grad_norm_var": 0.05349019368489583, + "learning_rate": 0.0001, + "loss": 6.0516, + "loss/crossentropy": 2.625260591506958, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.18404033035039902, + "step": 8912 + }, + { + "epoch": 0.2785625, + "grad_norm": 3.34375, + "grad_norm_var": 0.051610310872395836, + "learning_rate": 0.0001, + "loss": 6.0681, + "loss/crossentropy": 2.603209972381592, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.19336361438035965, + "step": 8914 + }, + { + "epoch": 0.278625, + "grad_norm": 4.125, + "grad_norm_var": 0.069189453125, + "learning_rate": 0.0001, + "loss": 6.155, + "loss/crossentropy": 2.629045248031616, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19204367697238922, + "step": 8916 + }, + { + "epoch": 0.2786875, + "grad_norm": 3.078125, + "grad_norm_var": 0.08277587890625, + "learning_rate": 0.0001, + "loss": 5.9272, + "loss/crossentropy": 2.633598208427429, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17935702204704285, + "step": 8918 + }, + { + "epoch": 0.27875, + "grad_norm": 3.734375, + "grad_norm_var": 0.0746246337890625, + "learning_rate": 0.0001, + "loss": 5.9349, + "loss/crossentropy": 2.489575743675232, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18788864463567734, + "step": 8920 + }, + { + "epoch": 0.2788125, + "grad_norm": 4.59375, + "grad_norm_var": 0.16741434733072916, + "learning_rate": 0.0001, + "loss": 5.6964, + "loss/crossentropy": 2.37763512134552, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1783628612756729, + "step": 8922 + }, + { + "epoch": 0.278875, + "grad_norm": 3.203125, + "grad_norm_var": 0.16574605305989584, + "learning_rate": 0.0001, + "loss": 6.0553, + "loss/crossentropy": 2.676714062690735, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18395595252513885, + "step": 8924 + }, + { + "epoch": 0.2789375, + "grad_norm": 3.796875, + "grad_norm_var": 0.16239827473958332, + "learning_rate": 0.0001, + "loss": 5.8915, + "loss/crossentropy": 2.429540514945984, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.18604381382465363, + "step": 8926 + }, + { + "epoch": 0.279, + "grad_norm": 3.359375, + "grad_norm_var": 0.16198628743489582, + "learning_rate": 0.0001, + "loss": 5.7803, + "loss/crossentropy": 2.4657455682754517, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1752079725265503, + "step": 8928 + }, + { + "epoch": 0.2790625, + "grad_norm": 3.5, + "grad_norm_var": 0.16420796712239583, + "learning_rate": 0.0001, + "loss": 6.0748, + "loss/crossentropy": 2.6903953552246094, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1825784221291542, + "step": 8930 + }, + { + "epoch": 0.279125, + "grad_norm": 3.53125, + "grad_norm_var": 0.13433837890625, + "learning_rate": 0.0001, + "loss": 6.1758, + "loss/crossentropy": 2.631464958190918, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19739867746829987, + "step": 8932 + }, + { + "epoch": 0.2791875, + "grad_norm": 3.171875, + "grad_norm_var": 0.12526041666666668, + "learning_rate": 0.0001, + "loss": 6.0468, + "loss/crossentropy": 2.5245360136032104, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1936342790722847, + "step": 8934 + }, + { + "epoch": 0.27925, + "grad_norm": 3.078125, + "grad_norm_var": 0.1358795166015625, + "learning_rate": 0.0001, + "loss": 5.7535, + "loss/crossentropy": 2.496535301208496, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17335310578346252, + "step": 8936 + }, + { + "epoch": 0.2793125, + "grad_norm": 3.1875, + "grad_norm_var": 0.05370686848958333, + "learning_rate": 0.0001, + "loss": 5.8803, + "loss/crossentropy": 2.5169789791107178, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18320399522781372, + "step": 8938 + }, + { + "epoch": 0.279375, + "grad_norm": 3.484375, + "grad_norm_var": 0.05538736979166667, + "learning_rate": 0.0001, + "loss": 5.5435, + "loss/crossentropy": 2.28670871257782, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1678697019815445, + "step": 8940 + }, + { + "epoch": 0.2794375, + "grad_norm": 3.578125, + "grad_norm_var": 0.0518707275390625, + "learning_rate": 0.0001, + "loss": 5.9636, + "loss/crossentropy": 2.5089820623397827, + "loss/hidden": 1.6328125, + "loss/jsd": 0.0, + "loss/logits": 0.18218537420034409, + "step": 8942 + }, + { + "epoch": 0.2795, + "grad_norm": 3.71875, + "grad_norm_var": 0.05826416015625, + "learning_rate": 0.0001, + "loss": 6.0677, + "loss/crossentropy": 2.6015191078186035, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1919344812631607, + "step": 8944 + }, + { + "epoch": 0.2795625, + "grad_norm": 4.125, + "grad_norm_var": 0.08690999348958334, + "learning_rate": 0.0001, + "loss": 6.2675, + "loss/crossentropy": 2.6956677436828613, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19663256406784058, + "step": 8946 + }, + { + "epoch": 0.279625, + "grad_norm": 3.328125, + "grad_norm_var": 0.09688212076822916, + "learning_rate": 0.0001, + "loss": 5.8046, + "loss/crossentropy": 2.519374966621399, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1773502230644226, + "step": 8948 + }, + { + "epoch": 0.2796875, + "grad_norm": 3.671875, + "grad_norm_var": 0.0864898681640625, + "learning_rate": 0.0001, + "loss": 6.0677, + "loss/crossentropy": 2.6085387468338013, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18927831947803497, + "step": 8950 + }, + { + "epoch": 0.27975, + "grad_norm": 3.234375, + "grad_norm_var": 0.07963765462239583, + "learning_rate": 0.0001, + "loss": 5.9959, + "loss/crossentropy": 2.5420197248458862, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18953076004981995, + "step": 8952 + }, + { + "epoch": 0.2798125, + "grad_norm": 3.40625, + "grad_norm_var": 0.07415262858072917, + "learning_rate": 0.0001, + "loss": 6.1113, + "loss/crossentropy": 2.6709654331207275, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18778185546398163, + "step": 8954 + }, + { + "epoch": 0.279875, + "grad_norm": 3.390625, + "grad_norm_var": 0.07258707682291667, + "learning_rate": 0.0001, + "loss": 5.9649, + "loss/crossentropy": 2.6230790615081787, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.179103784263134, + "step": 8956 + }, + { + "epoch": 0.2799375, + "grad_norm": 3.328125, + "grad_norm_var": 0.0714752197265625, + "learning_rate": 0.0001, + "loss": 5.973, + "loss/crossentropy": 2.5649073123931885, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18572864681482315, + "step": 8958 + }, + { + "epoch": 0.28, + "grad_norm": 3.53125, + "grad_norm_var": 0.06720377604166666, + "learning_rate": 0.0001, + "loss": 5.8986, + "loss/crossentropy": 2.5165066719055176, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18429826945066452, + "step": 8960 + }, + { + "epoch": 0.2800625, + "grad_norm": 3.5, + "grad_norm_var": 0.034501139322916666, + "learning_rate": 0.0001, + "loss": 6.0107, + "loss/crossentropy": 2.5791239738464355, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.1829991489648819, + "step": 8962 + }, + { + "epoch": 0.280125, + "grad_norm": 3.390625, + "grad_norm_var": 0.026688639322916666, + "learning_rate": 0.0001, + "loss": 6.2176, + "loss/crossentropy": 2.7475950717926025, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1895764321088791, + "step": 8964 + }, + { + "epoch": 0.2801875, + "grad_norm": 4.375, + "grad_norm_var": 0.08095703125, + "learning_rate": 0.0001, + "loss": 5.7247, + "loss/crossentropy": 2.4407023191452026, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1725417673587799, + "step": 8966 + }, + { + "epoch": 0.28025, + "grad_norm": 3.46875, + "grad_norm_var": 0.072265625, + "learning_rate": 0.0001, + "loss": 6.1318, + "loss/crossentropy": 2.6310908794403076, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.1903063729405403, + "step": 8968 + }, + { + "epoch": 0.2803125, + "grad_norm": 3.109375, + "grad_norm_var": 0.07995503743489583, + "learning_rate": 0.0001, + "loss": 5.8537, + "loss/crossentropy": 2.465729832649231, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18567556142807007, + "step": 8970 + }, + { + "epoch": 0.280375, + "grad_norm": 3.3125, + "grad_norm_var": 0.07912495930989584, + "learning_rate": 0.0001, + "loss": 5.6157, + "loss/crossentropy": 2.361966848373413, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17459086328744888, + "step": 8972 + }, + { + "epoch": 0.2804375, + "grad_norm": 3.625, + "grad_norm_var": 0.08082275390625, + "learning_rate": 0.0001, + "loss": 6.1712, + "loss/crossentropy": 2.601422905921936, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.198383167386055, + "step": 8974 + }, + { + "epoch": 0.2805, + "grad_norm": 3.84375, + "grad_norm_var": 0.08948567708333334, + "learning_rate": 0.0001, + "loss": 6.1741, + "loss/crossentropy": 2.692523241043091, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.19307806342840195, + "step": 8976 + }, + { + "epoch": 0.2805625, + "grad_norm": 3.46875, + "grad_norm_var": 0.0899566650390625, + "learning_rate": 0.0001, + "loss": 5.9848, + "loss/crossentropy": 2.4888601303100586, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19139544665813446, + "step": 8978 + }, + { + "epoch": 0.280625, + "grad_norm": 3.546875, + "grad_norm_var": 0.09114481608072916, + "learning_rate": 0.0001, + "loss": 6.094, + "loss/crossentropy": 2.6560696363449097, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18871255218982697, + "step": 8980 + }, + { + "epoch": 0.2806875, + "grad_norm": 3.4375, + "grad_norm_var": 0.04195048014322917, + "learning_rate": 0.0001, + "loss": 6.1607, + "loss/crossentropy": 2.6694579124450684, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19365518540143967, + "step": 8982 + }, + { + "epoch": 0.28075, + "grad_norm": 3.375, + "grad_norm_var": 0.03999735514322917, + "learning_rate": 0.0001, + "loss": 6.0601, + "loss/crossentropy": 2.6100698709487915, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1926591694355011, + "step": 8984 + }, + { + "epoch": 0.2808125, + "grad_norm": 3.703125, + "grad_norm_var": 0.0390289306640625, + "learning_rate": 0.0001, + "loss": 5.95, + "loss/crossentropy": 2.567933201789856, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1796116977930069, + "step": 8986 + }, + { + "epoch": 0.280875, + "grad_norm": 3.328125, + "grad_norm_var": 0.0328765869140625, + "learning_rate": 0.0001, + "loss": 6.0463, + "loss/crossentropy": 2.65181303024292, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18242233991622925, + "step": 8988 + }, + { + "epoch": 0.2809375, + "grad_norm": 3.09375, + "grad_norm_var": 0.04934488932291667, + "learning_rate": 0.0001, + "loss": 5.4396, + "loss/crossentropy": 2.2871644496917725, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16602488607168198, + "step": 8990 + }, + { + "epoch": 0.281, + "grad_norm": 3.015625, + "grad_norm_var": 0.04771728515625, + "learning_rate": 0.0001, + "loss": 6.077, + "loss/crossentropy": 2.5936484336853027, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.19403835386037827, + "step": 8992 + }, + { + "epoch": 0.2810625, + "grad_norm": 3.578125, + "grad_norm_var": 0.04986572265625, + "learning_rate": 0.0001, + "loss": 6.0082, + "loss/crossentropy": 2.5718302726745605, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18933695554733276, + "step": 8994 + }, + { + "epoch": 0.281125, + "grad_norm": 3.5, + "grad_norm_var": 0.04856669108072917, + "learning_rate": 0.0001, + "loss": 6.1828, + "loss/crossentropy": 2.682897448539734, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1925681158900261, + "step": 8996 + }, + { + "epoch": 0.2811875, + "grad_norm": 3.671875, + "grad_norm_var": 0.04409077962239583, + "learning_rate": 0.0001, + "loss": 5.9279, + "loss/crossentropy": 2.5359108448028564, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18567855656147003, + "step": 8998 + }, + { + "epoch": 0.28125, + "grad_norm": 3.234375, + "grad_norm_var": 0.04519856770833333, + "learning_rate": 0.0001, + "loss": 6.0286, + "loss/crossentropy": 2.624733328819275, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.18140491843223572, + "step": 9000 + }, + { + "epoch": 0.2813125, + "grad_norm": 3.28125, + "grad_norm_var": 0.04487202962239583, + "learning_rate": 0.0001, + "loss": 6.2065, + "loss/crossentropy": 2.7221689224243164, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19375034421682358, + "step": 9002 + }, + { + "epoch": 0.281375, + "grad_norm": 3.5, + "grad_norm_var": 0.0489898681640625, + "learning_rate": 0.0001, + "loss": 6.0372, + "loss/crossentropy": 2.5447323322296143, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1953364461660385, + "step": 9004 + }, + { + "epoch": 0.2814375, + "grad_norm": 5.0, + "grad_norm_var": 0.19611002604166666, + "learning_rate": 0.0001, + "loss": 6.2313, + "loss/crossentropy": 2.6949377059936523, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1993364840745926, + "step": 9006 + }, + { + "epoch": 0.2815, + "grad_norm": 3.25, + "grad_norm_var": 0.18479817708333332, + "learning_rate": 0.0001, + "loss": 6.228, + "loss/crossentropy": 2.7894601821899414, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1883883774280548, + "step": 9008 + }, + { + "epoch": 0.2815625, + "grad_norm": 3.625, + "grad_norm_var": 0.18494466145833333, + "learning_rate": 0.0001, + "loss": 6.3061, + "loss/crossentropy": 2.7123767137527466, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.20273320376873016, + "step": 9010 + }, + { + "epoch": 0.281625, + "grad_norm": 3.09375, + "grad_norm_var": 0.1956695556640625, + "learning_rate": 0.0001, + "loss": 5.9871, + "loss/crossentropy": 2.632554531097412, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18233323842287064, + "step": 9012 + }, + { + "epoch": 0.2816875, + "grad_norm": 3.171875, + "grad_norm_var": 0.21155192057291666, + "learning_rate": 0.0001, + "loss": 5.5732, + "loss/crossentropy": 2.3073713779449463, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.17033526301383972, + "step": 9014 + }, + { + "epoch": 0.28175, + "grad_norm": 3.984375, + "grad_norm_var": 0.2232574462890625, + "learning_rate": 0.0001, + "loss": 6.193, + "loss/crossentropy": 2.6605632305145264, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19542711228132248, + "step": 9016 + }, + { + "epoch": 0.2818125, + "grad_norm": 3.40625, + "grad_norm_var": 0.22009989420572917, + "learning_rate": 0.0001, + "loss": 5.9053, + "loss/crossentropy": 2.5774831771850586, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18083104491233826, + "step": 9018 + }, + { + "epoch": 0.281875, + "grad_norm": 3.421875, + "grad_norm_var": 0.22069905598958334, + "learning_rate": 0.0001, + "loss": 6.3092, + "loss/crossentropy": 2.8267698287963867, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1927720308303833, + "step": 9020 + }, + { + "epoch": 0.2819375, + "grad_norm": 3.28125, + "grad_norm_var": 0.06379292805989584, + "learning_rate": 0.0001, + "loss": 6.0883, + "loss/crossentropy": 2.6888206005096436, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18642932176589966, + "step": 9022 + }, + { + "epoch": 0.282, + "grad_norm": 3.109375, + "grad_norm_var": 0.06591389973958334, + "learning_rate": 0.0001, + "loss": 5.9574, + "loss/crossentropy": 2.5752652883529663, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18469508737325668, + "step": 9024 + }, + { + "epoch": 0.2820625, + "grad_norm": 3.40625, + "grad_norm_var": 0.06319986979166667, + "learning_rate": 0.0001, + "loss": 6.0361, + "loss/crossentropy": 2.5644443035125732, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18895837664604187, + "step": 9026 + }, + { + "epoch": 0.282125, + "grad_norm": 3.5, + "grad_norm_var": 0.061986287434895836, + "learning_rate": 0.0001, + "loss": 6.1833, + "loss/crossentropy": 2.6032562255859375, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.20019075274467468, + "step": 9028 + }, + { + "epoch": 0.2821875, + "grad_norm": 3.203125, + "grad_norm_var": 0.046875, + "learning_rate": 0.0001, + "loss": 5.8594, + "loss/crossentropy": 2.4787466526031494, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18571703881025314, + "step": 9030 + }, + { + "epoch": 0.28225, + "grad_norm": 3.171875, + "grad_norm_var": 0.024909464518229167, + "learning_rate": 0.0001, + "loss": 6.064, + "loss/crossentropy": 2.6514443159103394, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1842246949672699, + "step": 9032 + }, + { + "epoch": 0.2823125, + "grad_norm": 3.296875, + "grad_norm_var": 0.024803670247395833, + "learning_rate": 0.0001, + "loss": 5.7463, + "loss/crossentropy": 2.3479292392730713, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18398087471723557, + "step": 9034 + }, + { + "epoch": 0.282375, + "grad_norm": 3.046875, + "grad_norm_var": 0.024388631184895832, + "learning_rate": 0.0001, + "loss": 5.8337, + "loss/crossentropy": 2.4997901916503906, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18104806542396545, + "step": 9036 + }, + { + "epoch": 0.2824375, + "grad_norm": 2.984375, + "grad_norm_var": 0.030485026041666665, + "learning_rate": 0.0001, + "loss": 5.6793, + "loss/crossentropy": 2.4415680170059204, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17455822974443436, + "step": 9038 + }, + { + "epoch": 0.2825, + "grad_norm": 3.15625, + "grad_norm_var": 0.0307525634765625, + "learning_rate": 0.0001, + "loss": 6.0256, + "loss/crossentropy": 2.7107661962509155, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17601224780082703, + "step": 9040 + }, + { + "epoch": 0.2825625, + "grad_norm": 3.421875, + "grad_norm_var": 0.030931599934895835, + "learning_rate": 0.0001, + "loss": 5.9807, + "loss/crossentropy": 2.551282525062561, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18864601850509644, + "step": 9042 + }, + { + "epoch": 0.282625, + "grad_norm": 3.203125, + "grad_norm_var": 0.030248006184895832, + "learning_rate": 0.0001, + "loss": 5.8746, + "loss/crossentropy": 2.461247444152832, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1889927163720131, + "step": 9044 + }, + { + "epoch": 0.2826875, + "grad_norm": 3.65625, + "grad_norm_var": 0.04267171223958333, + "learning_rate": 0.0001, + "loss": 6.2772, + "loss/crossentropy": 2.6784560680389404, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.20088715851306915, + "step": 9046 + }, + { + "epoch": 0.28275, + "grad_norm": 3.328125, + "grad_norm_var": 0.041552734375, + "learning_rate": 0.0001, + "loss": 5.7114, + "loss/crossentropy": 2.465242028236389, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17305027693510056, + "step": 9048 + }, + { + "epoch": 0.2828125, + "grad_norm": 3.40625, + "grad_norm_var": 0.04221903483072917, + "learning_rate": 0.0001, + "loss": 5.8207, + "loss/crossentropy": 2.4869046211242676, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18103093653917313, + "step": 9050 + }, + { + "epoch": 0.282875, + "grad_norm": 3.515625, + "grad_norm_var": 0.040192667643229166, + "learning_rate": 0.0001, + "loss": 5.8688, + "loss/crossentropy": 2.471032738685608, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18821820616722107, + "step": 9052 + }, + { + "epoch": 0.2829375, + "grad_norm": 3.21875, + "grad_norm_var": 0.06074930826822917, + "learning_rate": 0.0001, + "loss": 5.9442, + "loss/crossentropy": 2.5340031385421753, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18828360736370087, + "step": 9054 + }, + { + "epoch": 0.283, + "grad_norm": 3.25, + "grad_norm_var": 0.05743815104166667, + "learning_rate": 0.0001, + "loss": 5.7985, + "loss/crossentropy": 2.535214900970459, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17632829397916794, + "step": 9056 + }, + { + "epoch": 0.2830625, + "grad_norm": 4.0625, + "grad_norm_var": 0.08642171223958334, + "learning_rate": 0.0001, + "loss": 5.8255, + "loss/crossentropy": 2.485495924949646, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17735742032527924, + "step": 9058 + }, + { + "epoch": 0.283125, + "grad_norm": 3.21875, + "grad_norm_var": 0.1017242431640625, + "learning_rate": 0.0001, + "loss": 6.0565, + "loss/crossentropy": 2.626773238182068, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18710918724536896, + "step": 9060 + }, + { + "epoch": 0.2831875, + "grad_norm": 3.40625, + "grad_norm_var": 0.09986572265625, + "learning_rate": 0.0001, + "loss": 5.8211, + "loss/crossentropy": 2.5458565950393677, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1759604886174202, + "step": 9062 + }, + { + "epoch": 0.28325, + "grad_norm": 3.484375, + "grad_norm_var": 0.09824930826822917, + "learning_rate": 0.0001, + "loss": 5.7519, + "loss/crossentropy": 2.442685604095459, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.17466949671506882, + "step": 9064 + }, + { + "epoch": 0.2833125, + "grad_norm": 3.375, + "grad_norm_var": 0.10879618326822917, + "learning_rate": 0.0001, + "loss": 6.2599, + "loss/crossentropy": 2.6942391395568848, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19718880206346512, + "step": 9066 + }, + { + "epoch": 0.283375, + "grad_norm": 3.359375, + "grad_norm_var": 0.13450113932291666, + "learning_rate": 0.0001, + "loss": 5.7228, + "loss/crossentropy": 2.4559353590011597, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17434197664260864, + "step": 9068 + }, + { + "epoch": 0.2834375, + "grad_norm": 3.703125, + "grad_norm_var": 0.16128641764322918, + "learning_rate": 0.0001, + "loss": 6.3024, + "loss/crossentropy": 2.7098337411880493, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19870896637439728, + "step": 9070 + }, + { + "epoch": 0.2835, + "grad_norm": 3.5, + "grad_norm_var": 0.1585601806640625, + "learning_rate": 0.0001, + "loss": 5.9719, + "loss/crossentropy": 2.5387319326400757, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1882343515753746, + "step": 9072 + }, + { + "epoch": 0.2835625, + "grad_norm": 3.28125, + "grad_norm_var": 0.1357818603515625, + "learning_rate": 0.0001, + "loss": 5.8944, + "loss/crossentropy": 2.5297415256500244, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18216697871685028, + "step": 9074 + }, + { + "epoch": 0.283625, + "grad_norm": 3.171875, + "grad_norm_var": 0.11490478515625, + "learning_rate": 0.0001, + "loss": 5.7764, + "loss/crossentropy": 2.4699500799179077, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.177521213889122, + "step": 9076 + }, + { + "epoch": 0.2836875, + "grad_norm": 3.53125, + "grad_norm_var": 0.1197265625, + "learning_rate": 0.0001, + "loss": 5.6463, + "loss/crossentropy": 2.3882386684417725, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1707303747534752, + "step": 9078 + }, + { + "epoch": 0.28375, + "grad_norm": 4.125, + "grad_norm_var": 0.167724609375, + "learning_rate": 0.0001, + "loss": 6.1195, + "loss/crossentropy": 2.611040472984314, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19068588316440582, + "step": 9080 + }, + { + "epoch": 0.2838125, + "grad_norm": 3.40625, + "grad_norm_var": 0.16392822265625, + "learning_rate": 0.0001, + "loss": 5.9801, + "loss/crossentropy": 2.6328080892562866, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1812119111418724, + "step": 9082 + }, + { + "epoch": 0.283875, + "grad_norm": 3.265625, + "grad_norm_var": 0.14704488118489584, + "learning_rate": 0.0001, + "loss": 6.4703, + "loss/crossentropy": 2.8104302883148193, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.20817676931619644, + "step": 9084 + }, + { + "epoch": 0.2839375, + "grad_norm": 3.5, + "grad_norm_var": 0.09387105305989583, + "learning_rate": 0.0001, + "loss": 6.104, + "loss/crossentropy": 2.558880567550659, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19631367921829224, + "step": 9086 + }, + { + "epoch": 0.284, + "grad_norm": 3.546875, + "grad_norm_var": 0.09000651041666667, + "learning_rate": 0.0001, + "loss": 6.2526, + "loss/crossentropy": 2.8148038387298584, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18752646446228027, + "step": 9088 + }, + { + "epoch": 0.2840625, + "grad_norm": 3.546875, + "grad_norm_var": 0.08503316243489584, + "learning_rate": 0.0001, + "loss": 5.5647, + "loss/crossentropy": 2.253652572631836, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.17329681664705276, + "step": 9090 + }, + { + "epoch": 0.284125, + "grad_norm": 4.15625, + "grad_norm_var": 0.11277669270833333, + "learning_rate": 0.0001, + "loss": 6.383, + "loss/crossentropy": 2.6728352308273315, + "loss/hidden": 1.671875, + "loss/jsd": 0.0, + "loss/logits": 0.20383010804653168, + "step": 9092 + }, + { + "epoch": 0.2841875, + "grad_norm": 3.59375, + "grad_norm_var": 0.1004302978515625, + "learning_rate": 0.0001, + "loss": 5.9191, + "loss/crossentropy": 2.4468226432800293, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19253655523061752, + "step": 9094 + }, + { + "epoch": 0.28425, + "grad_norm": 3.53125, + "grad_norm_var": 0.06933186848958334, + "learning_rate": 0.0001, + "loss": 6.0211, + "loss/crossentropy": 2.636566638946533, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18259137123823166, + "step": 9096 + }, + { + "epoch": 0.2843125, + "grad_norm": 3.234375, + "grad_norm_var": 0.06502176920572916, + "learning_rate": 0.0001, + "loss": 5.822, + "loss/crossentropy": 2.5147953033447266, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.18071867525577545, + "step": 9098 + }, + { + "epoch": 0.284375, + "grad_norm": 3.484375, + "grad_norm_var": 0.05845438639322917, + "learning_rate": 0.0001, + "loss": 6.1502, + "loss/crossentropy": 2.6123939752578735, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19557856768369675, + "step": 9100 + }, + { + "epoch": 0.2844375, + "grad_norm": 3.640625, + "grad_norm_var": 0.0567291259765625, + "learning_rate": 0.0001, + "loss": 6.3179, + "loss/crossentropy": 2.7712206840515137, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19568248838186264, + "step": 9102 + }, + { + "epoch": 0.2845, + "grad_norm": 3.15625, + "grad_norm_var": 0.0625152587890625, + "learning_rate": 0.0001, + "loss": 5.7568, + "loss/crossentropy": 2.3733925819396973, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18052639067173004, + "step": 9104 + }, + { + "epoch": 0.2845625, + "grad_norm": 3.53125, + "grad_norm_var": 0.08746337890625, + "learning_rate": 0.0001, + "loss": 6.2083, + "loss/crossentropy": 2.762609839439392, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18948964029550552, + "step": 9106 + }, + { + "epoch": 0.284625, + "grad_norm": 3.859375, + "grad_norm_var": 0.06048177083333333, + "learning_rate": 0.0001, + "loss": 5.7611, + "loss/crossentropy": 2.3383185863494873, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.18094955384731293, + "step": 9108 + }, + { + "epoch": 0.2846875, + "grad_norm": 3.28125, + "grad_norm_var": 0.0689605712890625, + "learning_rate": 0.0001, + "loss": 5.8825, + "loss/crossentropy": 2.542226552963257, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1801212877035141, + "step": 9110 + }, + { + "epoch": 0.28475, + "grad_norm": 3.109375, + "grad_norm_var": 0.07508036295572916, + "learning_rate": 0.0001, + "loss": 5.6944, + "loss/crossentropy": 2.4106396436691284, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1779869645833969, + "step": 9112 + }, + { + "epoch": 0.2848125, + "grad_norm": 3.796875, + "grad_norm_var": 0.0863922119140625, + "learning_rate": 0.0001, + "loss": 6.0255, + "loss/crossentropy": 2.6151647567749023, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18673612922430038, + "step": 9114 + }, + { + "epoch": 0.284875, + "grad_norm": 3.3125, + "grad_norm_var": 0.0794921875, + "learning_rate": 0.0001, + "loss": 5.9964, + "loss/crossentropy": 2.5844658613204956, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1861146315932274, + "step": 9116 + }, + { + "epoch": 0.2849375, + "grad_norm": 3.375, + "grad_norm_var": 0.07571207682291667, + "learning_rate": 0.0001, + "loss": 5.947, + "loss/crossentropy": 2.5303101539611816, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18971359729766846, + "step": 9118 + }, + { + "epoch": 0.285, + "grad_norm": 3.359375, + "grad_norm_var": 0.07317301432291666, + "learning_rate": 0.0001, + "loss": 6.0363, + "loss/crossentropy": 2.655127763748169, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18343449383974075, + "step": 9120 + }, + { + "epoch": 0.2850625, + "grad_norm": 3.796875, + "grad_norm_var": 0.09767964680989584, + "learning_rate": 0.0001, + "loss": 6.018, + "loss/crossentropy": 2.493555426597595, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.1926787868142128, + "step": 9122 + }, + { + "epoch": 0.285125, + "grad_norm": 3.546875, + "grad_norm_var": 0.075439453125, + "learning_rate": 0.0001, + "loss": 6.0001, + "loss/crossentropy": 2.5280349254608154, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1901702806353569, + "step": 9124 + }, + { + "epoch": 0.2851875, + "grad_norm": 4.59375, + "grad_norm_var": 0.16013895670572917, + "learning_rate": 0.0001, + "loss": 6.3609, + "loss/crossentropy": 2.7163779735565186, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.2078113555908203, + "step": 9126 + }, + { + "epoch": 0.28525, + "grad_norm": 3.59375, + "grad_norm_var": 0.14339192708333334, + "learning_rate": 0.0001, + "loss": 5.9869, + "loss/crossentropy": 2.551315426826477, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18730486929416656, + "step": 9128 + }, + { + "epoch": 0.2853125, + "grad_norm": 3.375, + "grad_norm_var": 0.13235270182291667, + "learning_rate": 0.0001, + "loss": 5.9906, + "loss/crossentropy": 2.602681517601013, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1860525906085968, + "step": 9130 + }, + { + "epoch": 0.285375, + "grad_norm": 4.0, + "grad_norm_var": 0.134619140625, + "learning_rate": 0.0001, + "loss": 6.3725, + "loss/crossentropy": 2.7989598512649536, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19719856977462769, + "step": 9132 + }, + { + "epoch": 0.2854375, + "grad_norm": 3.984375, + "grad_norm_var": 35.59182535807292, + "learning_rate": 0.0001, + "loss": 7.1315, + "loss/crossentropy": 2.71242094039917, + "loss/hidden": 1.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.27745749801397324, + "step": 9134 + }, + { + "epoch": 0.2855, + "grad_norm": 4.125, + "grad_norm_var": 35.21113993326823, + "learning_rate": 0.0001, + "loss": 6.6899, + "loss/crossentropy": 2.899548292160034, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.21496783941984177, + "step": 9136 + }, + { + "epoch": 0.2855625, + "grad_norm": 3.40625, + "grad_norm_var": 35.45763346354167, + "learning_rate": 0.0001, + "loss": 5.8573, + "loss/crossentropy": 2.4658334255218506, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18640770018100739, + "step": 9138 + }, + { + "epoch": 0.285625, + "grad_norm": 3.375, + "grad_norm_var": 35.547672526041666, + "learning_rate": 0.0001, + "loss": 5.8011, + "loss/crossentropy": 2.4256393909454346, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1820792406797409, + "step": 9140 + }, + { + "epoch": 0.2856875, + "grad_norm": 3.359375, + "grad_norm_var": 35.784830729166664, + "learning_rate": 0.0001, + "loss": 6.0857, + "loss/crossentropy": 2.64946711063385, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1889318972826004, + "step": 9142 + }, + { + "epoch": 0.28575, + "grad_norm": 3.296875, + "grad_norm_var": 35.78233947753906, + "learning_rate": 0.0001, + "loss": 5.8765, + "loss/crossentropy": 2.4886363744735718, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.185664564371109, + "step": 9144 + }, + { + "epoch": 0.2858125, + "grad_norm": 3.234375, + "grad_norm_var": 35.79916076660156, + "learning_rate": 0.0001, + "loss": 5.7125, + "loss/crossentropy": 2.361438512802124, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18354769051074982, + "step": 9146 + }, + { + "epoch": 0.285875, + "grad_norm": 3.703125, + "grad_norm_var": 35.96471252441406, + "learning_rate": 0.0001, + "loss": 5.9112, + "loss/crossentropy": 2.548967123031616, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1823192909359932, + "step": 9148 + }, + { + "epoch": 0.2859375, + "grad_norm": 7.34375, + "grad_norm_var": 1.03726806640625, + "learning_rate": 0.0001, + "loss": 6.0803, + "loss/crossentropy": 2.5867608785629272, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19037093222141266, + "step": 9150 + }, + { + "epoch": 0.286, + "grad_norm": 3.875, + "grad_norm_var": 0.9989217122395834, + "learning_rate": 0.0001, + "loss": 6.0382, + "loss/crossentropy": 2.55404531955719, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19059911370277405, + "step": 9152 + }, + { + "epoch": 0.2860625, + "grad_norm": 3.140625, + "grad_norm_var": 1.0123931884765625, + "learning_rate": 0.0001, + "loss": 5.9966, + "loss/crossentropy": 2.560911774635315, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18809889256954193, + "step": 9154 + }, + { + "epoch": 0.286125, + "grad_norm": 3.53125, + "grad_norm_var": 1.009968058268229, + "learning_rate": 0.0001, + "loss": 5.5887, + "loss/crossentropy": 2.3073805570602417, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1726599782705307, + "step": 9156 + }, + { + "epoch": 0.2861875, + "grad_norm": 3.390625, + "grad_norm_var": 1.0194488525390626, + "learning_rate": 0.0001, + "loss": 5.7854, + "loss/crossentropy": 2.393547296524048, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18527594208717346, + "step": 9158 + }, + { + "epoch": 0.28625, + "grad_norm": 3.640625, + "grad_norm_var": 1.0142242431640625, + "learning_rate": 0.0001, + "loss": 6.0094, + "loss/crossentropy": 2.545302629470825, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18820396065711975, + "step": 9160 + }, + { + "epoch": 0.2863125, + "grad_norm": 4.125, + "grad_norm_var": 1.0132161458333333, + "learning_rate": 0.0001, + "loss": 6.3975, + "loss/crossentropy": 2.7841343879699707, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.20117808878421783, + "step": 9162 + }, + { + "epoch": 0.286375, + "grad_norm": 3.515625, + "grad_norm_var": 0.9988840738932292, + "learning_rate": 0.0001, + "loss": 6.0892, + "loss/crossentropy": 2.603915810585022, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.18954448401927948, + "step": 9164 + }, + { + "epoch": 0.2864375, + "grad_norm": 3.484375, + "grad_norm_var": 0.07822977701822917, + "learning_rate": 0.0001, + "loss": 6.1461, + "loss/crossentropy": 2.6508954763412476, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19170917570590973, + "step": 9166 + }, + { + "epoch": 0.2865, + "grad_norm": 3.421875, + "grad_norm_var": 0.065185546875, + "learning_rate": 0.0001, + "loss": 5.8764, + "loss/crossentropy": 2.5029672384262085, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18344075232744217, + "step": 9168 + }, + { + "epoch": 0.2865625, + "grad_norm": 3.3125, + "grad_norm_var": 0.06448160807291667, + "learning_rate": 0.0001, + "loss": 6.1846, + "loss/crossentropy": 2.7582833766937256, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1879480704665184, + "step": 9170 + }, + { + "epoch": 0.286625, + "grad_norm": 3.28125, + "grad_norm_var": 0.06555582682291666, + "learning_rate": 0.0001, + "loss": 5.7316, + "loss/crossentropy": 2.3871891498565674, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18209929764270782, + "step": 9172 + }, + { + "epoch": 0.2866875, + "grad_norm": 3.546875, + "grad_norm_var": 0.0596588134765625, + "learning_rate": 0.0001, + "loss": 6.0071, + "loss/crossentropy": 2.5322988033294678, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19279064238071442, + "step": 9174 + }, + { + "epoch": 0.28675, + "grad_norm": 3.328125, + "grad_norm_var": 0.06018778483072917, + "learning_rate": 0.0001, + "loss": 6.0731, + "loss/crossentropy": 2.6345635652542114, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18916398286819458, + "step": 9176 + }, + { + "epoch": 0.2868125, + "grad_norm": 3.03125, + "grad_norm_var": 0.0293853759765625, + "learning_rate": 0.0001, + "loss": 5.8183, + "loss/crossentropy": 2.5042537450790405, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17906411737203598, + "step": 9178 + }, + { + "epoch": 0.286875, + "grad_norm": 3.390625, + "grad_norm_var": 0.025397745768229167, + "learning_rate": 0.0001, + "loss": 5.807, + "loss/crossentropy": 2.5237934589385986, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17675773054361343, + "step": 9180 + }, + { + "epoch": 0.2869375, + "grad_norm": 4.9375, + "grad_norm_var": 0.17666727701822918, + "learning_rate": 0.0001, + "loss": 5.7583, + "loss/crossentropy": 2.3460735082626343, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18458520621061325, + "step": 9182 + }, + { + "epoch": 0.287, + "grad_norm": 3.296875, + "grad_norm_var": 0.18108317057291667, + "learning_rate": 0.0001, + "loss": 6.0849, + "loss/crossentropy": 2.655609369277954, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1851135641336441, + "step": 9184 + }, + { + "epoch": 0.2870625, + "grad_norm": 3.265625, + "grad_norm_var": 0.17948811848958332, + "learning_rate": 0.0001, + "loss": 6.1004, + "loss/crossentropy": 2.6337010860443115, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1904214471578598, + "step": 9186 + }, + { + "epoch": 0.287125, + "grad_norm": 3.21875, + "grad_norm_var": 0.18765869140625, + "learning_rate": 0.0001, + "loss": 5.7994, + "loss/crossentropy": 2.459021210670471, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18091781437397003, + "step": 9188 + }, + { + "epoch": 0.2871875, + "grad_norm": 3.25, + "grad_norm_var": 0.19498697916666666, + "learning_rate": 0.0001, + "loss": 6.0276, + "loss/crossentropy": 2.621256709098816, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18634165823459625, + "step": 9190 + }, + { + "epoch": 0.28725, + "grad_norm": 3.328125, + "grad_norm_var": 0.19714253743489582, + "learning_rate": 0.0001, + "loss": 6.3783, + "loss/crossentropy": 2.853867769241333, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1969754844903946, + "step": 9192 + }, + { + "epoch": 0.2873125, + "grad_norm": 3.15625, + "grad_norm_var": 0.1912994384765625, + "learning_rate": 0.0001, + "loss": 5.6215, + "loss/crossentropy": 2.3387062549591064, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17164014279842377, + "step": 9194 + }, + { + "epoch": 0.287375, + "grad_norm": 3.375, + "grad_norm_var": 0.19104410807291666, + "learning_rate": 0.0001, + "loss": 6.1026, + "loss/crossentropy": 2.6131699085235596, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19152169674634933, + "step": 9196 + }, + { + "epoch": 0.2874375, + "grad_norm": 3.6875, + "grad_norm_var": 0.044709269205729166, + "learning_rate": 0.0001, + "loss": 5.9232, + "loss/crossentropy": 2.5079824924468994, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1868300512433052, + "step": 9198 + }, + { + "epoch": 0.2875, + "grad_norm": 3.71875, + "grad_norm_var": 0.048111979166666666, + "learning_rate": 0.0001, + "loss": 6.3274, + "loss/crossentropy": 2.782606601715088, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19588365405797958, + "step": 9200 + }, + { + "epoch": 0.2875625, + "grad_norm": 3.671875, + "grad_norm_var": 0.04052327473958333, + "learning_rate": 0.0001, + "loss": 6.1545, + "loss/crossentropy": 2.6817389726638794, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.18751223385334015, + "step": 9202 + }, + { + "epoch": 0.287625, + "grad_norm": 3.34375, + "grad_norm_var": 0.03462626139322917, + "learning_rate": 0.0001, + "loss": 5.9146, + "loss/crossentropy": 2.536959648132324, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18385609984397888, + "step": 9204 + }, + { + "epoch": 0.2876875, + "grad_norm": 3.671875, + "grad_norm_var": 0.031754557291666666, + "learning_rate": 0.0001, + "loss": 6.1421, + "loss/crossentropy": 2.664365768432617, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1930903196334839, + "step": 9206 + }, + { + "epoch": 0.28775, + "grad_norm": 3.40625, + "grad_norm_var": 0.027253214518229166, + "learning_rate": 0.0001, + "loss": 5.6488, + "loss/crossentropy": 2.3551712036132812, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17546115815639496, + "step": 9208 + }, + { + "epoch": 0.2878125, + "grad_norm": 3.09375, + "grad_norm_var": 0.03013916015625, + "learning_rate": 0.0001, + "loss": 5.8193, + "loss/crossentropy": 2.4081833362579346, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18486540019512177, + "step": 9210 + }, + { + "epoch": 0.287875, + "grad_norm": 3.609375, + "grad_norm_var": 0.041666666666666664, + "learning_rate": 0.0001, + "loss": 5.6278, + "loss/crossentropy": 2.3689277172088623, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1723714917898178, + "step": 9212 + }, + { + "epoch": 0.2879375, + "grad_norm": 3.46875, + "grad_norm_var": 0.046468098958333336, + "learning_rate": 0.0001, + "loss": 5.9088, + "loss/crossentropy": 2.5543618202209473, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1830964982509613, + "step": 9214 + }, + { + "epoch": 0.288, + "grad_norm": 3.453125, + "grad_norm_var": 0.04487202962239583, + "learning_rate": 0.0001, + "loss": 6.1735, + "loss/crossentropy": 2.5532714128494263, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.20225465297698975, + "step": 9216 + }, + { + "epoch": 0.2880625, + "grad_norm": 3.546875, + "grad_norm_var": 0.05185139973958333, + "learning_rate": 0.0001, + "loss": 6.1338, + "loss/crossentropy": 2.6418739557266235, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.19567591696977615, + "step": 9218 + }, + { + "epoch": 0.288125, + "grad_norm": 3.40625, + "grad_norm_var": 0.0592437744140625, + "learning_rate": 0.0001, + "loss": 5.7424, + "loss/crossentropy": 2.3867040872573853, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1824459806084633, + "step": 9220 + }, + { + "epoch": 0.2881875, + "grad_norm": 3.703125, + "grad_norm_var": 0.06051432291666667, + "learning_rate": 0.0001, + "loss": 6.0869, + "loss/crossentropy": 2.5976165533065796, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.1883801445364952, + "step": 9222 + }, + { + "epoch": 0.28825, + "grad_norm": 3.546875, + "grad_norm_var": 0.06036783854166667, + "learning_rate": 0.0001, + "loss": 6.3511, + "loss/crossentropy": 2.787461757659912, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19698838144540787, + "step": 9224 + }, + { + "epoch": 0.2883125, + "grad_norm": 3.21875, + "grad_norm_var": 0.055419921875, + "learning_rate": 0.0001, + "loss": 5.9473, + "loss/crossentropy": 2.564436197280884, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18554943799972534, + "step": 9226 + }, + { + "epoch": 0.288375, + "grad_norm": 3.640625, + "grad_norm_var": 0.055029296875, + "learning_rate": 0.0001, + "loss": 6.043, + "loss/crossentropy": 2.5392333269119263, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19334959238767624, + "step": 9228 + }, + { + "epoch": 0.2884375, + "grad_norm": 3.609375, + "grad_norm_var": 0.04179585774739583, + "learning_rate": 0.0001, + "loss": 5.7817, + "loss/crossentropy": 2.423188805580139, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18077274411916733, + "step": 9230 + }, + { + "epoch": 0.2885, + "grad_norm": 3.15625, + "grad_norm_var": 0.0576568603515625, + "learning_rate": 0.0001, + "loss": 5.9619, + "loss/crossentropy": 2.6595394611358643, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17867840826511383, + "step": 9232 + }, + { + "epoch": 0.2885625, + "grad_norm": 4.25, + "grad_norm_var": 0.08697509765625, + "learning_rate": 0.0001, + "loss": 6.3437, + "loss/crossentropy": 2.8047486543655396, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19607874006032944, + "step": 9234 + }, + { + "epoch": 0.288625, + "grad_norm": 3.375, + "grad_norm_var": 0.08189697265625, + "learning_rate": 0.0001, + "loss": 5.814, + "loss/crossentropy": 2.499058723449707, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17759249359369278, + "step": 9236 + }, + { + "epoch": 0.2886875, + "grad_norm": 4.09375, + "grad_norm_var": 0.10056966145833333, + "learning_rate": 0.0001, + "loss": 6.0368, + "loss/crossentropy": 2.552868962287903, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.18627947568893433, + "step": 9238 + }, + { + "epoch": 0.28875, + "grad_norm": 3.484375, + "grad_norm_var": 0.10156962076822916, + "learning_rate": 0.0001, + "loss": 5.9558, + "loss/crossentropy": 2.547224760055542, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1881256103515625, + "step": 9240 + }, + { + "epoch": 0.2888125, + "grad_norm": 3.28125, + "grad_norm_var": 0.09879557291666667, + "learning_rate": 0.0001, + "loss": 5.9994, + "loss/crossentropy": 2.5107314586639404, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19340085983276367, + "step": 9242 + }, + { + "epoch": 0.288875, + "grad_norm": 3.46875, + "grad_norm_var": 0.09177958170572917, + "learning_rate": 0.0001, + "loss": 5.7584, + "loss/crossentropy": 2.4198479652404785, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.18385396897792816, + "step": 9244 + }, + { + "epoch": 0.2889375, + "grad_norm": 3.03125, + "grad_norm_var": 0.10918680826822917, + "learning_rate": 0.0001, + "loss": 5.8078, + "loss/crossentropy": 2.5677201747894287, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17283860594034195, + "step": 9246 + }, + { + "epoch": 0.289, + "grad_norm": 3.28125, + "grad_norm_var": 0.10030924479166667, + "learning_rate": 0.0001, + "loss": 6.0421, + "loss/crossentropy": 2.5909671783447266, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1880839392542839, + "step": 9248 + }, + { + "epoch": 0.2890625, + "grad_norm": 3.671875, + "grad_norm_var": 0.121337890625, + "learning_rate": 0.0001, + "loss": 6.1022, + "loss/crossentropy": 2.5477112531661987, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19764020293951035, + "step": 9250 + }, + { + "epoch": 0.289125, + "grad_norm": 3.28125, + "grad_norm_var": 0.122705078125, + "learning_rate": 0.0001, + "loss": 5.8505, + "loss/crossentropy": 2.532976508140564, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17784690856933594, + "step": 9252 + }, + { + "epoch": 0.2891875, + "grad_norm": 3.625, + "grad_norm_var": 0.105078125, + "learning_rate": 0.0001, + "loss": 6.1819, + "loss/crossentropy": 2.5791308879852295, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.20207322388887405, + "step": 9254 + }, + { + "epoch": 0.28925, + "grad_norm": 3.515625, + "grad_norm_var": 0.10579020182291667, + "learning_rate": 0.0001, + "loss": 5.9808, + "loss/crossentropy": 2.5386255979537964, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18718945980072021, + "step": 9256 + }, + { + "epoch": 0.2893125, + "grad_norm": 3.203125, + "grad_norm_var": 0.11177978515625, + "learning_rate": 0.0001, + "loss": 5.9823, + "loss/crossentropy": 2.648008704185486, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.179526224732399, + "step": 9258 + }, + { + "epoch": 0.289375, + "grad_norm": 3.09375, + "grad_norm_var": 0.11738179524739584, + "learning_rate": 0.0001, + "loss": 5.5612, + "loss/crossentropy": 2.2775819301605225, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17680228501558304, + "step": 9260 + }, + { + "epoch": 0.2894375, + "grad_norm": 3.21875, + "grad_norm_var": 0.10325113932291667, + "learning_rate": 0.0001, + "loss": 5.8594, + "loss/crossentropy": 2.468423366546631, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18128884583711624, + "step": 9262 + }, + { + "epoch": 0.2895, + "grad_norm": 3.671875, + "grad_norm_var": 0.1106109619140625, + "learning_rate": 0.0001, + "loss": 5.9303, + "loss/crossentropy": 2.5634262561798096, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1831684410572052, + "step": 9264 + }, + { + "epoch": 0.2895625, + "grad_norm": 3.28125, + "grad_norm_var": 0.0459869384765625, + "learning_rate": 0.0001, + "loss": 5.953, + "loss/crossentropy": 2.5027319192886353, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.19189921766519547, + "step": 9266 + }, + { + "epoch": 0.289625, + "grad_norm": 3.46875, + "grad_norm_var": 0.04576416015625, + "learning_rate": 0.0001, + "loss": 5.9283, + "loss/crossentropy": 2.5710513591766357, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.17986471205949783, + "step": 9268 + }, + { + "epoch": 0.2896875, + "grad_norm": 3.265625, + "grad_norm_var": 0.03170166015625, + "learning_rate": 0.0001, + "loss": 5.9576, + "loss/crossentropy": 2.578773617744446, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18319019675254822, + "step": 9270 + }, + { + "epoch": 0.28975, + "grad_norm": 4.09375, + "grad_norm_var": 0.06482645670572916, + "learning_rate": 0.0001, + "loss": 5.906, + "loss/crossentropy": 2.3595499992370605, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.19214613735675812, + "step": 9272 + }, + { + "epoch": 0.2898125, + "grad_norm": 3.46875, + "grad_norm_var": 0.05940653483072917, + "learning_rate": 0.0001, + "loss": 5.8633, + "loss/crossentropy": 2.495652437210083, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.17817071825265884, + "step": 9274 + }, + { + "epoch": 0.289875, + "grad_norm": 3.53125, + "grad_norm_var": 0.05322977701822917, + "learning_rate": 0.0001, + "loss": 5.7203, + "loss/crossentropy": 2.4016173481941223, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.17562279850244522, + "step": 9276 + }, + { + "epoch": 0.2899375, + "grad_norm": 3.65625, + "grad_norm_var": 0.05185139973958333, + "learning_rate": 0.0001, + "loss": 5.9694, + "loss/crossentropy": 2.572734236717224, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18420225381851196, + "step": 9278 + }, + { + "epoch": 0.29, + "grad_norm": 3.96875, + "grad_norm_var": 0.06738993326822916, + "learning_rate": 0.0001, + "loss": 6.0559, + "loss/crossentropy": 2.6163170337677, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.19044649600982666, + "step": 9280 + }, + { + "epoch": 0.2900625, + "grad_norm": 3.609375, + "grad_norm_var": 0.06609700520833334, + "learning_rate": 0.0001, + "loss": 6.3712, + "loss/crossentropy": 2.8518699407577515, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19099697470664978, + "step": 9282 + }, + { + "epoch": 0.290125, + "grad_norm": 3.46875, + "grad_norm_var": 0.06609700520833334, + "learning_rate": 0.0001, + "loss": 6.0108, + "loss/crossentropy": 2.5253738164901733, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19151408970355988, + "step": 9284 + }, + { + "epoch": 0.2901875, + "grad_norm": 3.671875, + "grad_norm_var": 0.08382161458333333, + "learning_rate": 0.0001, + "loss": 6.1044, + "loss/crossentropy": 2.62210476398468, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19041306525468826, + "step": 9286 + }, + { + "epoch": 0.29025, + "grad_norm": 6.1875, + "grad_norm_var": 0.5259928385416667, + "learning_rate": 0.0001, + "loss": 5.704, + "loss/crossentropy": 2.3728272914886475, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.17530959844589233, + "step": 9288 + }, + { + "epoch": 0.2903125, + "grad_norm": 3.359375, + "grad_norm_var": 0.5331044514973958, + "learning_rate": 0.0001, + "loss": 6.0333, + "loss/crossentropy": 2.5850698947906494, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1870090439915657, + "step": 9290 + }, + { + "epoch": 0.290375, + "grad_norm": 3.21875, + "grad_norm_var": 0.5549763997395833, + "learning_rate": 0.0001, + "loss": 5.9617, + "loss/crossentropy": 2.6056989431381226, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1824745461344719, + "step": 9292 + }, + { + "epoch": 0.2904375, + "grad_norm": 3.578125, + "grad_norm_var": 0.5628214518229167, + "learning_rate": 0.0001, + "loss": 6.0058, + "loss/crossentropy": 2.5911813974380493, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18911613523960114, + "step": 9294 + }, + { + "epoch": 0.2905, + "grad_norm": 4.5625, + "grad_norm_var": 0.60439453125, + "learning_rate": 0.0001, + "loss": 6.025, + "loss/crossentropy": 2.558809995651245, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1919320896267891, + "step": 9296 + }, + { + "epoch": 0.2905625, + "grad_norm": 3.765625, + "grad_norm_var": 0.6037923177083333, + "learning_rate": 0.0001, + "loss": 6.0465, + "loss/crossentropy": 2.5537161827087402, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19029832631349564, + "step": 9298 + }, + { + "epoch": 0.290625, + "grad_norm": 3.015625, + "grad_norm_var": 0.6330800374348958, + "learning_rate": 0.0001, + "loss": 5.8275, + "loss/crossentropy": 2.5661016702651978, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1761380210518837, + "step": 9300 + }, + { + "epoch": 0.2906875, + "grad_norm": 6.5625, + "grad_norm_var": 1.161327107747396, + "learning_rate": 0.0001, + "loss": 6.7729, + "loss/crossentropy": 2.9322030544281006, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.22586390376091003, + "step": 9302 + }, + { + "epoch": 0.29075, + "grad_norm": 3.46875, + "grad_norm_var": 0.7327870686848958, + "learning_rate": 0.0001, + "loss": 6.2271, + "loss/crossentropy": 2.728728771209717, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.1916360929608345, + "step": 9304 + }, + { + "epoch": 0.2908125, + "grad_norm": 4.3125, + "grad_norm_var": 0.74677734375, + "learning_rate": 0.0001, + "loss": 6.2302, + "loss/crossentropy": 2.59408700466156, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.20267153531312943, + "step": 9306 + }, + { + "epoch": 0.290875, + "grad_norm": 3.390625, + "grad_norm_var": 0.7129547119140625, + "learning_rate": 0.0001, + "loss": 5.8737, + "loss/crossentropy": 2.5048424005508423, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18337111920118332, + "step": 9308 + }, + { + "epoch": 0.2909375, + "grad_norm": 3.625, + "grad_norm_var": 0.701904296875, + "learning_rate": 0.0001, + "loss": 6.0379, + "loss/crossentropy": 2.601284146308899, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1866263747215271, + "step": 9310 + }, + { + "epoch": 0.291, + "grad_norm": 3.203125, + "grad_norm_var": 0.6597615559895833, + "learning_rate": 0.0001, + "loss": 6.0365, + "loss/crossentropy": 2.6188138723373413, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18629661947488785, + "step": 9312 + }, + { + "epoch": 0.2910625, + "grad_norm": 7.46875, + "grad_norm_var": 1.5605753580729167, + "learning_rate": 0.0001, + "loss": 6.462, + "loss/crossentropy": 2.6669247150421143, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.21700634062290192, + "step": 9314 + }, + { + "epoch": 0.291125, + "grad_norm": 3.65625, + "grad_norm_var": 1.5083160400390625, + "learning_rate": 0.0001, + "loss": 5.9701, + "loss/crossentropy": 2.56630277633667, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18685922026634216, + "step": 9316 + }, + { + "epoch": 0.2911875, + "grad_norm": 3.265625, + "grad_norm_var": 1.0509592692057292, + "learning_rate": 0.0001, + "loss": 5.8683, + "loss/crossentropy": 2.4687705039978027, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1833105981349945, + "step": 9318 + }, + { + "epoch": 0.29125, + "grad_norm": 3.6875, + "grad_norm_var": 1.0473917643229167, + "learning_rate": 0.0001, + "loss": 6.094, + "loss/crossentropy": 2.5667370557785034, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19530849903821945, + "step": 9320 + }, + { + "epoch": 0.2913125, + "grad_norm": 3.6875, + "grad_norm_var": 1.0302154541015625, + "learning_rate": 0.0001, + "loss": 5.9883, + "loss/crossentropy": 2.5635939836502075, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18778348714113235, + "step": 9322 + }, + { + "epoch": 0.291375, + "grad_norm": 3.875, + "grad_norm_var": 1.03828125, + "learning_rate": 0.0001, + "loss": 6.1287, + "loss/crossentropy": 2.6045225858688354, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.19108834862709045, + "step": 9324 + }, + { + "epoch": 0.2914375, + "grad_norm": 3.328125, + "grad_norm_var": 1.0485514322916667, + "learning_rate": 0.0001, + "loss": 5.77, + "loss/crossentropy": 2.5001951456069946, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17659078538417816, + "step": 9326 + }, + { + "epoch": 0.2915, + "grad_norm": 3.65625, + "grad_norm_var": 1.0400217692057292, + "learning_rate": 0.0001, + "loss": 5.6163, + "loss/crossentropy": 2.339695930480957, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17688129842281342, + "step": 9328 + }, + { + "epoch": 0.2915625, + "grad_norm": 3.015625, + "grad_norm_var": 0.05122782389322917, + "learning_rate": 0.0001, + "loss": 5.4758, + "loss/crossentropy": 2.3356382846832275, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1620657593011856, + "step": 9330 + }, + { + "epoch": 0.291625, + "grad_norm": 7.5, + "grad_norm_var": 1.0894694010416666, + "learning_rate": 0.0001, + "loss": 6.4588, + "loss/crossentropy": 2.841578722000122, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.20351407676935196, + "step": 9332 + }, + { + "epoch": 0.2916875, + "grad_norm": 3.734375, + "grad_norm_var": 1.0865397135416666, + "learning_rate": 0.0001, + "loss": 6.1103, + "loss/crossentropy": 2.682422637939453, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.190447598695755, + "step": 9334 + }, + { + "epoch": 0.29175, + "grad_norm": 3.1875, + "grad_norm_var": 1.1200846354166667, + "learning_rate": 0.0001, + "loss": 5.6275, + "loss/crossentropy": 2.3950765132904053, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1720711588859558, + "step": 9336 + }, + { + "epoch": 0.2918125, + "grad_norm": 3.203125, + "grad_norm_var": 1.1268229166666666, + "learning_rate": 0.0001, + "loss": 5.6068, + "loss/crossentropy": 2.337676167488098, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17222560197114944, + "step": 9338 + }, + { + "epoch": 0.291875, + "grad_norm": 3.34375, + "grad_norm_var": 1.1230428059895834, + "learning_rate": 0.0001, + "loss": 6.1973, + "loss/crossentropy": 2.795773148536682, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18624678999185562, + "step": 9340 + }, + { + "epoch": 0.2919375, + "grad_norm": 2.859375, + "grad_norm_var": 1.1498769124348958, + "learning_rate": 0.0001, + "loss": 5.594, + "loss/crossentropy": 2.4307150840759277, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1643746793270111, + "step": 9342 + }, + { + "epoch": 0.292, + "grad_norm": 3.484375, + "grad_norm_var": 2.4121419270833333, + "learning_rate": 0.0001, + "loss": 6.8335, + "loss/crossentropy": 2.7818437814712524, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.24501432478427887, + "step": 9344 + }, + { + "epoch": 0.2920625, + "grad_norm": 3.28125, + "grad_norm_var": 2.3807281494140624, + "learning_rate": 0.0001, + "loss": 5.7506, + "loss/crossentropy": 2.4875333309173584, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17786619812250137, + "step": 9346 + }, + { + "epoch": 0.292125, + "grad_norm": 3.359375, + "grad_norm_var": 1.44351806640625, + "learning_rate": 0.0001, + "loss": 5.8438, + "loss/crossentropy": 2.517126202583313, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18189110606908798, + "step": 9348 + }, + { + "epoch": 0.2921875, + "grad_norm": 3.265625, + "grad_norm_var": 1.450202433268229, + "learning_rate": 0.0001, + "loss": 5.8565, + "loss/crossentropy": 2.475774884223938, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1790889948606491, + "step": 9350 + }, + { + "epoch": 0.29225, + "grad_norm": 3.234375, + "grad_norm_var": 1.4333892822265626, + "learning_rate": 0.0001, + "loss": 5.771, + "loss/crossentropy": 2.3728095293045044, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18551874160766602, + "step": 9352 + }, + { + "epoch": 0.2923125, + "grad_norm": 3.921875, + "grad_norm_var": 1.4277628580729167, + "learning_rate": 0.0001, + "loss": 6.1051, + "loss/crossentropy": 2.6263909339904785, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1924026682972908, + "step": 9354 + }, + { + "epoch": 0.292375, + "grad_norm": 3.84375, + "grad_norm_var": 1.4276692708333334, + "learning_rate": 0.0001, + "loss": 6.1152, + "loss/crossentropy": 2.5191601514816284, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19905493408441544, + "step": 9356 + }, + { + "epoch": 0.2924375, + "grad_norm": 3.265625, + "grad_norm_var": 1.3986480712890625, + "learning_rate": 0.0001, + "loss": 5.7928, + "loss/crossentropy": 2.454358458518982, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1815009042620659, + "step": 9358 + }, + { + "epoch": 0.2925, + "grad_norm": 3.234375, + "grad_norm_var": 0.05308837890625, + "learning_rate": 0.0001, + "loss": 5.6278, + "loss/crossentropy": 2.4135148525238037, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16869229823350906, + "step": 9360 + }, + { + "epoch": 0.2925625, + "grad_norm": 3.515625, + "grad_norm_var": 0.05319010416666667, + "learning_rate": 0.0001, + "loss": 5.9325, + "loss/crossentropy": 2.5693756341934204, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18279489129781723, + "step": 9362 + }, + { + "epoch": 0.292625, + "grad_norm": 3.484375, + "grad_norm_var": 0.07939351399739583, + "learning_rate": 0.0001, + "loss": 6.2678, + "loss/crossentropy": 2.7371314764022827, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19564001262187958, + "step": 9364 + }, + { + "epoch": 0.2926875, + "grad_norm": 3.1875, + "grad_norm_var": 0.1599517822265625, + "learning_rate": 0.0001, + "loss": 6.1241, + "loss/crossentropy": 2.58701753616333, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19745878875255585, + "step": 9366 + }, + { + "epoch": 0.29275, + "grad_norm": 3.90625, + "grad_norm_var": 0.1594390869140625, + "learning_rate": 0.0001, + "loss": 6.3299, + "loss/crossentropy": 2.775208592414856, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19882483780384064, + "step": 9368 + }, + { + "epoch": 0.2928125, + "grad_norm": 3.3125, + "grad_norm_var": 0.15263570149739583, + "learning_rate": 0.0001, + "loss": 5.8915, + "loss/crossentropy": 2.4790087938308716, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18656082451343536, + "step": 9370 + }, + { + "epoch": 0.292875, + "grad_norm": 3.046875, + "grad_norm_var": 0.16703999837239583, + "learning_rate": 0.0001, + "loss": 5.6431, + "loss/crossentropy": 2.39720356464386, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1726379320025444, + "step": 9372 + }, + { + "epoch": 0.2929375, + "grad_norm": 3.5, + "grad_norm_var": 0.1646392822265625, + "learning_rate": 0.0001, + "loss": 6.112, + "loss/crossentropy": 2.69320011138916, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18719422817230225, + "step": 9374 + }, + { + "epoch": 0.293, + "grad_norm": 3.46875, + "grad_norm_var": 0.166943359375, + "learning_rate": 0.0001, + "loss": 5.4966, + "loss/crossentropy": 2.2177973985671997, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.17085294425487518, + "step": 9376 + }, + { + "epoch": 0.2930625, + "grad_norm": 3.28125, + "grad_norm_var": 0.1681793212890625, + "learning_rate": 0.0001, + "loss": 6.1634, + "loss/crossentropy": 2.6918656826019287, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.18817061185836792, + "step": 9378 + }, + { + "epoch": 0.293125, + "grad_norm": 3.546875, + "grad_norm_var": 0.14599507649739582, + "learning_rate": 0.0001, + "loss": 6.1192, + "loss/crossentropy": 2.6786845922470093, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18702397495508194, + "step": 9380 + }, + { + "epoch": 0.2931875, + "grad_norm": 3.5, + "grad_norm_var": 0.05211588541666667, + "learning_rate": 0.0001, + "loss": 5.9031, + "loss/crossentropy": 2.4648101329803467, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1848449409008026, + "step": 9382 + }, + { + "epoch": 0.29325, + "grad_norm": 3.390625, + "grad_norm_var": 0.031672159830729164, + "learning_rate": 0.0001, + "loss": 5.9042, + "loss/crossentropy": 2.516110062599182, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1833401843905449, + "step": 9384 + }, + { + "epoch": 0.2933125, + "grad_norm": 3.75, + "grad_norm_var": 0.06259358723958333, + "learning_rate": 0.0001, + "loss": 6.1422, + "loss/crossentropy": 2.5483232736587524, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.20118191838264465, + "step": 9386 + }, + { + "epoch": 0.293375, + "grad_norm": 3.203125, + "grad_norm_var": 0.053857421875, + "learning_rate": 0.0001, + "loss": 5.6446, + "loss/crossentropy": 2.4846439361572266, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16365066170692444, + "step": 9388 + }, + { + "epoch": 0.2934375, + "grad_norm": 3.34375, + "grad_norm_var": 0.05217692057291667, + "learning_rate": 0.0001, + "loss": 5.8954, + "loss/crossentropy": 2.557347536087036, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18145860731601715, + "step": 9390 + }, + { + "epoch": 0.2935, + "grad_norm": 3.46875, + "grad_norm_var": 0.04788411458333333, + "learning_rate": 0.0001, + "loss": 5.7785, + "loss/crossentropy": 2.4624106884002686, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17770367860794067, + "step": 9392 + }, + { + "epoch": 0.2935625, + "grad_norm": 3.265625, + "grad_norm_var": 0.04924723307291667, + "learning_rate": 0.0001, + "loss": 6.0135, + "loss/crossentropy": 2.5998064279556274, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18512319773435593, + "step": 9394 + }, + { + "epoch": 0.293625, + "grad_norm": 3.265625, + "grad_norm_var": 0.05273030598958333, + "learning_rate": 0.0001, + "loss": 6.0434, + "loss/crossentropy": 2.7213690280914307, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17673882097005844, + "step": 9396 + }, + { + "epoch": 0.2936875, + "grad_norm": 3.59375, + "grad_norm_var": 0.055418904622395834, + "learning_rate": 0.0001, + "loss": 6.0422, + "loss/crossentropy": 2.630928635597229, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1879998818039894, + "step": 9398 + }, + { + "epoch": 0.29375, + "grad_norm": 3.140625, + "grad_norm_var": 0.0619537353515625, + "learning_rate": 0.0001, + "loss": 5.794, + "loss/crossentropy": 2.47506582736969, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17798515409231186, + "step": 9400 + }, + { + "epoch": 0.2938125, + "grad_norm": 3.65625, + "grad_norm_var": 0.02877197265625, + "learning_rate": 0.0001, + "loss": 5.7255, + "loss/crossentropy": 2.420718193054199, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17852778732776642, + "step": 9402 + }, + { + "epoch": 0.293875, + "grad_norm": 5.0625, + "grad_norm_var": 0.20722249348958333, + "learning_rate": 0.0001, + "loss": 6.1786, + "loss/crossentropy": 2.5539212226867676, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.19997169077396393, + "step": 9404 + }, + { + "epoch": 0.2939375, + "grad_norm": 4.125, + "grad_norm_var": 0.23736572265625, + "learning_rate": 0.0001, + "loss": 6.1939, + "loss/crossentropy": 2.6862411499023438, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19490952044725418, + "step": 9406 + }, + { + "epoch": 0.294, + "grad_norm": 3.28125, + "grad_norm_var": 0.23877665201822917, + "learning_rate": 0.0001, + "loss": 5.8797, + "loss/crossentropy": 2.5766749382019043, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17756423354148865, + "step": 9408 + }, + { + "epoch": 0.2940625, + "grad_norm": 3.0625, + "grad_norm_var": 0.24751688639322916, + "learning_rate": 0.0001, + "loss": 5.9456, + "loss/crossentropy": 2.650808095932007, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17869983613491058, + "step": 9410 + }, + { + "epoch": 0.294125, + "grad_norm": 3.234375, + "grad_norm_var": 0.24697977701822918, + "learning_rate": 0.0001, + "loss": 5.9725, + "loss/crossentropy": 2.5413256883621216, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18765229731798172, + "step": 9412 + }, + { + "epoch": 0.2941875, + "grad_norm": 3.25, + "grad_norm_var": 0.24722900390625, + "learning_rate": 0.0001, + "loss": 5.8193, + "loss/crossentropy": 2.443260073661804, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18018130958080292, + "step": 9414 + }, + { + "epoch": 0.29425, + "grad_norm": 3.40625, + "grad_norm_var": 0.24661051432291667, + "learning_rate": 0.0001, + "loss": 5.8545, + "loss/crossentropy": 2.491973638534546, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18312648683786392, + "step": 9416 + }, + { + "epoch": 0.2943125, + "grad_norm": 3.53125, + "grad_norm_var": 0.23987630208333333, + "learning_rate": 0.0001, + "loss": 6.0687, + "loss/crossentropy": 2.5530205965042114, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.1918068453669548, + "step": 9418 + }, + { + "epoch": 0.294375, + "grad_norm": 3.65625, + "grad_norm_var": 0.07014973958333333, + "learning_rate": 0.0001, + "loss": 5.5433, + "loss/crossentropy": 2.3218941688537598, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1686219573020935, + "step": 9420 + }, + { + "epoch": 0.2944375, + "grad_norm": 3.15625, + "grad_norm_var": 0.035741170247395836, + "learning_rate": 0.0001, + "loss": 5.8188, + "loss/crossentropy": 2.5462976694107056, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17217408120632172, + "step": 9422 + }, + { + "epoch": 0.2945, + "grad_norm": 3.359375, + "grad_norm_var": 0.035319010416666664, + "learning_rate": 0.0001, + "loss": 5.9661, + "loss/crossentropy": 2.5954443216323853, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18276644498109818, + "step": 9424 + }, + { + "epoch": 0.2945625, + "grad_norm": 3.328125, + "grad_norm_var": 0.03476155598958333, + "learning_rate": 0.0001, + "loss": 5.9103, + "loss/crossentropy": 2.6398624181747437, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17587270587682724, + "step": 9426 + }, + { + "epoch": 0.294625, + "grad_norm": 3.4375, + "grad_norm_var": 0.03241780598958333, + "learning_rate": 0.0001, + "loss": 5.9045, + "loss/crossentropy": 2.5071613788604736, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18387354165315628, + "step": 9428 + }, + { + "epoch": 0.2946875, + "grad_norm": 5.21875, + "grad_norm_var": 0.25580952962239584, + "learning_rate": 0.0001, + "loss": 6.5772, + "loss/crossentropy": 2.8044530153274536, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.21243342012166977, + "step": 9430 + }, + { + "epoch": 0.29475, + "grad_norm": 3.625, + "grad_norm_var": 0.24859619140625, + "learning_rate": 0.0001, + "loss": 5.8637, + "loss/crossentropy": 2.4671213626861572, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18223374336957932, + "step": 9432 + }, + { + "epoch": 0.2948125, + "grad_norm": 3.4375, + "grad_norm_var": 0.24726460774739584, + "learning_rate": 0.0001, + "loss": 6.168, + "loss/crossentropy": 2.750429153442383, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18667643517255783, + "step": 9434 + }, + { + "epoch": 0.294875, + "grad_norm": 3.234375, + "grad_norm_var": 0.24963277180989582, + "learning_rate": 0.0001, + "loss": 5.7225, + "loss/crossentropy": 2.441996455192566, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1784442812204361, + "step": 9436 + }, + { + "epoch": 0.2949375, + "grad_norm": 3.53125, + "grad_norm_var": 0.24006245930989584, + "learning_rate": 0.0001, + "loss": 5.7917, + "loss/crossentropy": 2.450874924659729, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1782272458076477, + "step": 9438 + }, + { + "epoch": 0.295, + "grad_norm": 3.5625, + "grad_norm_var": 0.26676025390625, + "learning_rate": 0.0001, + "loss": 6.0014, + "loss/crossentropy": 2.551122784614563, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19034453481435776, + "step": 9440 + }, + { + "epoch": 0.2950625, + "grad_norm": 3.4375, + "grad_norm_var": 0.2520904541015625, + "learning_rate": 0.0001, + "loss": 5.8832, + "loss/crossentropy": 2.4829729795455933, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18689806014299393, + "step": 9442 + }, + { + "epoch": 0.295125, + "grad_norm": 3.46875, + "grad_norm_var": 0.2420318603515625, + "learning_rate": 0.0001, + "loss": 6.3832, + "loss/crossentropy": 2.75757896900177, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.20045582950115204, + "step": 9444 + }, + { + "epoch": 0.2951875, + "grad_norm": 3.28125, + "grad_norm_var": 0.064453125, + "learning_rate": 0.0001, + "loss": 6.0769, + "loss/crossentropy": 2.629858613014221, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.19157715141773224, + "step": 9446 + }, + { + "epoch": 0.29525, + "grad_norm": 3.296875, + "grad_norm_var": 0.0658111572265625, + "learning_rate": 0.0001, + "loss": 5.7368, + "loss/crossentropy": 2.4494065046310425, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17952421307563782, + "step": 9448 + }, + { + "epoch": 0.2953125, + "grad_norm": 3.515625, + "grad_norm_var": 0.0683258056640625, + "learning_rate": 0.0001, + "loss": 6.2172, + "loss/crossentropy": 2.6646794080734253, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.1927492320537567, + "step": 9450 + }, + { + "epoch": 0.295375, + "grad_norm": 3.09375, + "grad_norm_var": 0.07465718587239584, + "learning_rate": 0.0001, + "loss": 5.9107, + "loss/crossentropy": 2.565911889076233, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1833091378211975, + "step": 9452 + }, + { + "epoch": 0.2954375, + "grad_norm": 3.171875, + "grad_norm_var": 0.0834136962890625, + "learning_rate": 0.0001, + "loss": 5.9376, + "loss/crossentropy": 2.5572386980056763, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18178445845842361, + "step": 9454 + }, + { + "epoch": 0.2955, + "grad_norm": 3.28125, + "grad_norm_var": 0.04449462890625, + "learning_rate": 0.0001, + "loss": 5.9452, + "loss/crossentropy": 2.6706295013427734, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17628566920757294, + "step": 9456 + }, + { + "epoch": 0.2955625, + "grad_norm": 3.5, + "grad_norm_var": 0.04592692057291667, + "learning_rate": 0.0001, + "loss": 6.1368, + "loss/crossentropy": 2.710642457008362, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18793027102947235, + "step": 9458 + }, + { + "epoch": 0.295625, + "grad_norm": 3.375, + "grad_norm_var": 0.02685546875, + "learning_rate": 0.0001, + "loss": 6.0779, + "loss/crossentropy": 2.7107990980148315, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18397362530231476, + "step": 9460 + }, + { + "epoch": 0.2956875, + "grad_norm": 3.359375, + "grad_norm_var": 0.02685546875, + "learning_rate": 0.0001, + "loss": 5.858, + "loss/crossentropy": 2.5094329118728638, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18016769737005234, + "step": 9462 + }, + { + "epoch": 0.29575, + "grad_norm": 3.234375, + "grad_norm_var": 0.03330790201822917, + "learning_rate": 0.0001, + "loss": 5.9281, + "loss/crossentropy": 2.5578246116638184, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18507373332977295, + "step": 9464 + }, + { + "epoch": 0.2958125, + "grad_norm": 3.625, + "grad_norm_var": 0.0293853759765625, + "learning_rate": 0.0001, + "loss": 6.177, + "loss/crossentropy": 2.6653488874435425, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19569653272628784, + "step": 9466 + }, + { + "epoch": 0.295875, + "grad_norm": 3.203125, + "grad_norm_var": 0.0265777587890625, + "learning_rate": 0.0001, + "loss": 5.5547, + "loss/crossentropy": 2.340910792350769, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.16630034893751144, + "step": 9468 + }, + { + "epoch": 0.2959375, + "grad_norm": 3.0625, + "grad_norm_var": 0.028343709309895833, + "learning_rate": 0.0001, + "loss": 6.0088, + "loss/crossentropy": 2.661539316177368, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18238188326358795, + "step": 9470 + }, + { + "epoch": 0.296, + "grad_norm": 3.328125, + "grad_norm_var": 0.024267578125, + "learning_rate": 0.0001, + "loss": 6.0804, + "loss/crossentropy": 2.6197402477264404, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.18668701499700546, + "step": 9472 + }, + { + "epoch": 0.2960625, + "grad_norm": 3.953125, + "grad_norm_var": 0.05462239583333333, + "learning_rate": 0.0001, + "loss": 5.6989, + "loss/crossentropy": 2.4417498111724854, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17102348804473877, + "step": 9474 + }, + { + "epoch": 0.296125, + "grad_norm": 3.203125, + "grad_norm_var": 0.056441243489583334, + "learning_rate": 0.0001, + "loss": 5.9945, + "loss/crossentropy": 2.5656116008758545, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.19054538756608963, + "step": 9476 + }, + { + "epoch": 0.2961875, + "grad_norm": 3.203125, + "grad_norm_var": 0.0676177978515625, + "learning_rate": 0.0001, + "loss": 5.9758, + "loss/crossentropy": 2.5559715032577515, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18690335750579834, + "step": 9478 + }, + { + "epoch": 0.29625, + "grad_norm": 3.359375, + "grad_norm_var": 0.06379292805989584, + "learning_rate": 0.0001, + "loss": 6.15, + "loss/crossentropy": 2.7207692861557007, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18823497742414474, + "step": 9480 + }, + { + "epoch": 0.2963125, + "grad_norm": 3.203125, + "grad_norm_var": 0.06122639973958333, + "learning_rate": 0.0001, + "loss": 5.8091, + "loss/crossentropy": 2.492751121520996, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.17538242787122726, + "step": 9482 + }, + { + "epoch": 0.296375, + "grad_norm": 3.078125, + "grad_norm_var": 0.06492513020833333, + "learning_rate": 0.0001, + "loss": 5.736, + "loss/crossentropy": 2.543042540550232, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16890213638544083, + "step": 9484 + }, + { + "epoch": 0.2964375, + "grad_norm": 3.53125, + "grad_norm_var": 0.061799112955729166, + "learning_rate": 0.0001, + "loss": 5.9448, + "loss/crossentropy": 2.458292841911316, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.1892717480659485, + "step": 9486 + }, + { + "epoch": 0.2965, + "grad_norm": 3.46875, + "grad_norm_var": 0.06148681640625, + "learning_rate": 0.0001, + "loss": 6.4874, + "loss/crossentropy": 2.889983057975769, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.20232471078634262, + "step": 9488 + }, + { + "epoch": 0.2965625, + "grad_norm": 3.25, + "grad_norm_var": 0.03206278483072917, + "learning_rate": 0.0001, + "loss": 6.2168, + "loss/crossentropy": 2.751617908477783, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.19378162175416946, + "step": 9490 + }, + { + "epoch": 0.296625, + "grad_norm": 3.375, + "grad_norm_var": 0.027864583333333335, + "learning_rate": 0.0001, + "loss": 6.0141, + "loss/crossentropy": 2.566988706588745, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18807435780763626, + "step": 9492 + }, + { + "epoch": 0.2966875, + "grad_norm": 3.15625, + "grad_norm_var": 0.016600545247395834, + "learning_rate": 0.0001, + "loss": 5.8184, + "loss/crossentropy": 2.528294086456299, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17744722217321396, + "step": 9494 + }, + { + "epoch": 0.29675, + "grad_norm": 3.375, + "grad_norm_var": 0.017529296875, + "learning_rate": 0.0001, + "loss": 6.3648, + "loss/crossentropy": 2.832018256187439, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19468458741903305, + "step": 9496 + }, + { + "epoch": 0.2968125, + "grad_norm": 3.25, + "grad_norm_var": 0.0201568603515625, + "learning_rate": 0.0001, + "loss": 6.061, + "loss/crossentropy": 2.7335082292556763, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.18274761736392975, + "step": 9498 + }, + { + "epoch": 0.296875, + "grad_norm": 3.046875, + "grad_norm_var": 0.020699055989583333, + "learning_rate": 0.0001, + "loss": 6.0026, + "loss/crossentropy": 2.644629120826721, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18111416697502136, + "step": 9500 + }, + { + "epoch": 0.2969375, + "grad_norm": 3.40625, + "grad_norm_var": 0.018387858072916666, + "learning_rate": 0.0001, + "loss": 5.9254, + "loss/crossentropy": 2.5114939212799072, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18553601205348969, + "step": 9502 + }, + { + "epoch": 0.297, + "grad_norm": 3.6875, + "grad_norm_var": 0.025537109375, + "learning_rate": 0.0001, + "loss": 6.0737, + "loss/crossentropy": 2.6292479038238525, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1885829046368599, + "step": 9504 + }, + { + "epoch": 0.2970625, + "grad_norm": 3.171875, + "grad_norm_var": 0.024217732747395835, + "learning_rate": 0.0001, + "loss": 5.829, + "loss/crossentropy": 2.529231548309326, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17763758450746536, + "step": 9506 + }, + { + "epoch": 0.297125, + "grad_norm": 3.75, + "grad_norm_var": 0.03554585774739583, + "learning_rate": 0.0001, + "loss": 6.2838, + "loss/crossentropy": 2.723105311393738, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19864670932292938, + "step": 9508 + }, + { + "epoch": 0.2971875, + "grad_norm": 3.328125, + "grad_norm_var": 0.033934529622395834, + "learning_rate": 0.0001, + "loss": 6.1941, + "loss/crossentropy": 2.7853564023971558, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1842387244105339, + "step": 9510 + }, + { + "epoch": 0.29725, + "grad_norm": 3.28125, + "grad_norm_var": 0.033568318684895834, + "learning_rate": 0.0001, + "loss": 5.5186, + "loss/crossentropy": 2.222517728805542, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1757029891014099, + "step": 9512 + }, + { + "epoch": 0.2973125, + "grad_norm": 3.515625, + "grad_norm_var": 0.03264058430989583, + "learning_rate": 0.0001, + "loss": 5.5155, + "loss/crossentropy": 2.2605106830596924, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17237187176942825, + "step": 9514 + }, + { + "epoch": 0.297375, + "grad_norm": 3.40625, + "grad_norm_var": 0.027925618489583335, + "learning_rate": 0.0001, + "loss": 6.0786, + "loss/crossentropy": 2.588716983795166, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19274143874645233, + "step": 9516 + }, + { + "epoch": 0.2974375, + "grad_norm": 3.21875, + "grad_norm_var": 0.04374898274739583, + "learning_rate": 0.0001, + "loss": 5.8546, + "loss/crossentropy": 2.580670118331909, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17465391755104065, + "step": 9518 + }, + { + "epoch": 0.2975, + "grad_norm": 3.859375, + "grad_norm_var": 0.05211181640625, + "learning_rate": 0.0001, + "loss": 5.7883, + "loss/crossentropy": 2.466141700744629, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17870307713747025, + "step": 9520 + }, + { + "epoch": 0.2975625, + "grad_norm": 3.609375, + "grad_norm_var": 0.04810791015625, + "learning_rate": 0.0001, + "loss": 5.8819, + "loss/crossentropy": 2.5266683101654053, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.1773185133934021, + "step": 9522 + }, + { + "epoch": 0.297625, + "grad_norm": 4.6875, + "grad_norm_var": 0.14081929524739584, + "learning_rate": 0.0001, + "loss": 5.888, + "loss/crossentropy": 2.44830322265625, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1873292475938797, + "step": 9524 + }, + { + "epoch": 0.2976875, + "grad_norm": 3.125, + "grad_norm_var": 0.15488179524739584, + "learning_rate": 0.0001, + "loss": 5.7312, + "loss/crossentropy": 2.435327410697937, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17880625277757645, + "step": 9526 + }, + { + "epoch": 0.29775, + "grad_norm": 3.203125, + "grad_norm_var": 0.16428120930989584, + "learning_rate": 0.0001, + "loss": 5.7866, + "loss/crossentropy": 2.4521981477737427, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1818797066807747, + "step": 9528 + }, + { + "epoch": 0.2978125, + "grad_norm": 3.796875, + "grad_norm_var": 0.17026265462239584, + "learning_rate": 0.0001, + "loss": 6.0854, + "loss/crossentropy": 2.559659242630005, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19632402807474136, + "step": 9530 + }, + { + "epoch": 0.297875, + "grad_norm": 3.078125, + "grad_norm_var": 0.17836812337239583, + "learning_rate": 0.0001, + "loss": 5.5406, + "loss/crossentropy": 2.393701434135437, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1639113500714302, + "step": 9532 + }, + { + "epoch": 0.2979375, + "grad_norm": 3.34375, + "grad_norm_var": 0.16140848795572918, + "learning_rate": 0.0001, + "loss": 5.8569, + "loss/crossentropy": 2.4648091793060303, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18452439457178116, + "step": 9534 + }, + { + "epoch": 0.298, + "grad_norm": 3.296875, + "grad_norm_var": 0.15240885416666666, + "learning_rate": 0.0001, + "loss": 5.9575, + "loss/crossentropy": 2.516305923461914, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18670186400413513, + "step": 9536 + }, + { + "epoch": 0.2980625, + "grad_norm": 3.015625, + "grad_norm_var": 0.16092122395833333, + "learning_rate": 0.0001, + "loss": 5.7747, + "loss/crossentropy": 2.584937572479248, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1666332334280014, + "step": 9538 + }, + { + "epoch": 0.298125, + "grad_norm": 3.546875, + "grad_norm_var": 0.04719136555989583, + "learning_rate": 0.0001, + "loss": 6.1296, + "loss/crossentropy": 2.6019665002822876, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19651742279529572, + "step": 9540 + }, + { + "epoch": 0.2981875, + "grad_norm": 3.6875, + "grad_norm_var": 0.048140462239583334, + "learning_rate": 0.0001, + "loss": 5.9211, + "loss/crossentropy": 2.519709587097168, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18545471131801605, + "step": 9542 + }, + { + "epoch": 0.29825, + "grad_norm": 3.46875, + "grad_norm_var": 0.040913899739583336, + "learning_rate": 0.0001, + "loss": 6.135, + "loss/crossentropy": 2.707599401473999, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1880495250225067, + "step": 9544 + }, + { + "epoch": 0.2983125, + "grad_norm": 3.125, + "grad_norm_var": 0.039891560872395836, + "learning_rate": 0.0001, + "loss": 5.8902, + "loss/crossentropy": 2.584674119949341, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17938528209924698, + "step": 9546 + }, + { + "epoch": 0.298375, + "grad_norm": 3.328125, + "grad_norm_var": 0.07604878743489583, + "learning_rate": 0.0001, + "loss": 6.1498, + "loss/crossentropy": 2.6279720067977905, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19554487615823746, + "step": 9548 + }, + { + "epoch": 0.2984375, + "grad_norm": 4.0625, + "grad_norm_var": 0.10044657389322917, + "learning_rate": 0.0001, + "loss": 6.1114, + "loss/crossentropy": 2.615634799003601, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1941043734550476, + "step": 9550 + }, + { + "epoch": 0.2985, + "grad_norm": 3.625, + "grad_norm_var": 0.10454813639322917, + "learning_rate": 0.0001, + "loss": 5.8584, + "loss/crossentropy": 2.473676919937134, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18105436861515045, + "step": 9552 + }, + { + "epoch": 0.2985625, + "grad_norm": 3.40625, + "grad_norm_var": 0.09113667805989584, + "learning_rate": 0.0001, + "loss": 5.8849, + "loss/crossentropy": 2.549417018890381, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1772959977388382, + "step": 9554 + }, + { + "epoch": 0.298625, + "grad_norm": 3.53125, + "grad_norm_var": 0.09916890462239583, + "learning_rate": 0.0001, + "loss": 5.8984, + "loss/crossentropy": 2.5247409343719482, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18267476558685303, + "step": 9556 + }, + { + "epoch": 0.2986875, + "grad_norm": 3.5625, + "grad_norm_var": 0.09558003743489583, + "learning_rate": 0.0001, + "loss": 6.3896, + "loss/crossentropy": 2.9074044227600098, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19197405874729156, + "step": 9558 + }, + { + "epoch": 0.29875, + "grad_norm": 3.640625, + "grad_norm_var": 0.10263264973958333, + "learning_rate": 0.0001, + "loss": 5.951, + "loss/crossentropy": 2.599033832550049, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18089497834444046, + "step": 9560 + }, + { + "epoch": 0.2988125, + "grad_norm": 3.546875, + "grad_norm_var": 0.08199869791666667, + "learning_rate": 0.0001, + "loss": 6.0215, + "loss/crossentropy": 2.534602165222168, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.18853811919689178, + "step": 9562 + }, + { + "epoch": 0.298875, + "grad_norm": 3.3125, + "grad_norm_var": 0.0502105712890625, + "learning_rate": 0.0001, + "loss": 5.9781, + "loss/crossentropy": 2.5743662118911743, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18216802924871445, + "step": 9564 + }, + { + "epoch": 0.2989375, + "grad_norm": 4.0, + "grad_norm_var": 0.07532450358072916, + "learning_rate": 0.0001, + "loss": 6.0835, + "loss/crossentropy": 2.565767288208008, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19083568453788757, + "step": 9566 + }, + { + "epoch": 0.299, + "grad_norm": 3.46875, + "grad_norm_var": 0.068994140625, + "learning_rate": 0.0001, + "loss": 5.9021, + "loss/crossentropy": 2.493484139442444, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1830536648631096, + "step": 9568 + }, + { + "epoch": 0.2990625, + "grad_norm": 3.671875, + "grad_norm_var": 0.06985677083333333, + "learning_rate": 0.0001, + "loss": 6.2318, + "loss/crossentropy": 2.683763265609741, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19386757910251617, + "step": 9570 + }, + { + "epoch": 0.299125, + "grad_norm": 3.28125, + "grad_norm_var": 0.06435445149739584, + "learning_rate": 0.0001, + "loss": 5.9197, + "loss/crossentropy": 2.583989977836609, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18045029789209366, + "step": 9572 + }, + { + "epoch": 0.2991875, + "grad_norm": 4.09375, + "grad_norm_var": 0.08567301432291667, + "learning_rate": 0.0001, + "loss": 6.0451, + "loss/crossentropy": 2.5369582176208496, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19339578598737717, + "step": 9574 + }, + { + "epoch": 0.29925, + "grad_norm": 3.390625, + "grad_norm_var": 0.07838134765625, + "learning_rate": 0.0001, + "loss": 5.6927, + "loss/crossentropy": 2.391352415084839, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1742788478732109, + "step": 9576 + }, + { + "epoch": 0.2993125, + "grad_norm": 3.453125, + "grad_norm_var": 0.07647196451822917, + "learning_rate": 0.0001, + "loss": 6.1259, + "loss/crossentropy": 2.701521873474121, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18853048235177994, + "step": 9578 + }, + { + "epoch": 0.299375, + "grad_norm": 3.421875, + "grad_norm_var": 0.07623291015625, + "learning_rate": 0.0001, + "loss": 6.2207, + "loss/crossentropy": 2.770782947540283, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18952109664678574, + "step": 9580 + }, + { + "epoch": 0.2994375, + "grad_norm": 3.453125, + "grad_norm_var": 0.03580322265625, + "learning_rate": 0.0001, + "loss": 5.908, + "loss/crossentropy": 2.5371055603027344, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18357133120298386, + "step": 9582 + }, + { + "epoch": 0.2995, + "grad_norm": 3.625, + "grad_norm_var": 0.0381988525390625, + "learning_rate": 0.0001, + "loss": 5.7893, + "loss/crossentropy": 2.3868966102600098, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18242916464805603, + "step": 9584 + }, + { + "epoch": 0.2995625, + "grad_norm": 3.640625, + "grad_norm_var": 0.03821614583333333, + "learning_rate": 0.0001, + "loss": 5.8449, + "loss/crossentropy": 2.508471131324768, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17895280569791794, + "step": 9586 + }, + { + "epoch": 0.299625, + "grad_norm": 3.4375, + "grad_norm_var": 0.03559468587239583, + "learning_rate": 0.0001, + "loss": 5.6706, + "loss/crossentropy": 2.353018879890442, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17629433423280716, + "step": 9588 + }, + { + "epoch": 0.2996875, + "grad_norm": 3.21875, + "grad_norm_var": 0.017096964518229167, + "learning_rate": 0.0001, + "loss": 5.8188, + "loss/crossentropy": 2.514328956604004, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17771629244089127, + "step": 9590 + }, + { + "epoch": 0.29975, + "grad_norm": 3.453125, + "grad_norm_var": 0.02197265625, + "learning_rate": 0.0001, + "loss": 5.9967, + "loss/crossentropy": 2.6237692832946777, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18416789174079895, + "step": 9592 + }, + { + "epoch": 0.2998125, + "grad_norm": 3.078125, + "grad_norm_var": 0.0258941650390625, + "learning_rate": 0.0001, + "loss": 5.948, + "loss/crossentropy": 2.5956236124038696, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18289864808321, + "step": 9594 + }, + { + "epoch": 0.299875, + "grad_norm": 3.390625, + "grad_norm_var": 0.02568359375, + "learning_rate": 0.0001, + "loss": 6.041, + "loss/crossentropy": 2.6824913024902344, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18272147327661514, + "step": 9596 + }, + { + "epoch": 0.2999375, + "grad_norm": 3.40625, + "grad_norm_var": 0.0229888916015625, + "learning_rate": 0.0001, + "loss": 5.926, + "loss/crossentropy": 2.5956075191497803, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17991703003644943, + "step": 9598 + }, + { + "epoch": 0.3, + "grad_norm": 3.5, + "grad_norm_var": 0.019660441080729167, + "learning_rate": 0.0001, + "loss": 6.218, + "loss/crossentropy": 2.779041051864624, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1864701434969902, + "step": 9600 + }, + { + "epoch": 0.3000625, + "grad_norm": 3.40625, + "grad_norm_var": 0.016486612955729167, + "learning_rate": 0.0001, + "loss": 6.0231, + "loss/crossentropy": 2.605224609375, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18710007518529892, + "step": 9602 + }, + { + "epoch": 0.300125, + "grad_norm": 3.453125, + "grad_norm_var": 0.020588175455729166, + "learning_rate": 0.0001, + "loss": 5.9393, + "loss/crossentropy": 2.516131639480591, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.187236025929451, + "step": 9604 + }, + { + "epoch": 0.3001875, + "grad_norm": 3.125, + "grad_norm_var": 0.022297159830729166, + "learning_rate": 0.0001, + "loss": 5.7019, + "loss/crossentropy": 2.4160910844802856, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17584580928087234, + "step": 9606 + }, + { + "epoch": 0.30025, + "grad_norm": 3.34375, + "grad_norm_var": 0.05022379557291667, + "learning_rate": 0.0001, + "loss": 6.1495, + "loss/crossentropy": 2.7719966173171997, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18384820222854614, + "step": 9608 + }, + { + "epoch": 0.3003125, + "grad_norm": 3.09375, + "grad_norm_var": 0.049681599934895834, + "learning_rate": 0.0001, + "loss": 5.9063, + "loss/crossentropy": 2.548715829849243, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18185021728277206, + "step": 9610 + }, + { + "epoch": 0.300375, + "grad_norm": 3.171875, + "grad_norm_var": 0.05271708170572917, + "learning_rate": 0.0001, + "loss": 5.9881, + "loss/crossentropy": 2.6455163955688477, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.17839647084474564, + "step": 9612 + }, + { + "epoch": 0.3004375, + "grad_norm": 3.484375, + "grad_norm_var": 0.05388081868489583, + "learning_rate": 0.0001, + "loss": 5.8598, + "loss/crossentropy": 2.5083526372909546, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.178897887468338, + "step": 9614 + }, + { + "epoch": 0.3005, + "grad_norm": 3.390625, + "grad_norm_var": 0.05284830729166667, + "learning_rate": 0.0001, + "loss": 6.1027, + "loss/crossentropy": 2.705984592437744, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18693287670612335, + "step": 9616 + }, + { + "epoch": 0.3005625, + "grad_norm": 3.25, + "grad_norm_var": 0.05530192057291667, + "learning_rate": 0.0001, + "loss": 6.0881, + "loss/crossentropy": 2.582324504852295, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19158920645713806, + "step": 9618 + }, + { + "epoch": 0.300625, + "grad_norm": 3.40625, + "grad_norm_var": 0.05628255208333333, + "learning_rate": 0.0001, + "loss": 5.9898, + "loss/crossentropy": 2.5595717430114746, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.189116433262825, + "step": 9620 + }, + { + "epoch": 0.3006875, + "grad_norm": 3.28125, + "grad_norm_var": 0.053644816080729164, + "learning_rate": 0.0001, + "loss": 5.9982, + "loss/crossentropy": 2.6964563131332397, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17900234460830688, + "step": 9622 + }, + { + "epoch": 0.30075, + "grad_norm": 3.046875, + "grad_norm_var": 0.03076171875, + "learning_rate": 0.0001, + "loss": 5.859, + "loss/crossentropy": 2.5678685903549194, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17911336570978165, + "step": 9624 + }, + { + "epoch": 0.3008125, + "grad_norm": 3.046875, + "grad_norm_var": 0.03389383951822917, + "learning_rate": 0.0001, + "loss": 5.7959, + "loss/crossentropy": 2.503232955932617, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17653512209653854, + "step": 9626 + }, + { + "epoch": 0.300875, + "grad_norm": 3.359375, + "grad_norm_var": 0.03497721354166667, + "learning_rate": 0.0001, + "loss": 5.7962, + "loss/crossentropy": 2.52774715423584, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1745016649365425, + "step": 9628 + }, + { + "epoch": 0.3009375, + "grad_norm": 3.421875, + "grad_norm_var": 0.03589579264322917, + "learning_rate": 0.0001, + "loss": 5.8252, + "loss/crossentropy": 2.4879744052886963, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.17746925354003906, + "step": 9630 + }, + { + "epoch": 0.301, + "grad_norm": 3.1875, + "grad_norm_var": 0.0363922119140625, + "learning_rate": 0.0001, + "loss": 5.9173, + "loss/crossentropy": 2.5328809022903442, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18727260828018188, + "step": 9632 + }, + { + "epoch": 0.3010625, + "grad_norm": 3.59375, + "grad_norm_var": 0.03642578125, + "learning_rate": 0.0001, + "loss": 5.9692, + "loss/crossentropy": 2.6109448671340942, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18230891227722168, + "step": 9634 + }, + { + "epoch": 0.301125, + "grad_norm": 3.25, + "grad_norm_var": 0.023323567708333333, + "learning_rate": 0.0001, + "loss": 5.9591, + "loss/crossentropy": 2.6024105548858643, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18058867007493973, + "step": 9636 + }, + { + "epoch": 0.3011875, + "grad_norm": 3.484375, + "grad_norm_var": 0.02896728515625, + "learning_rate": 0.0001, + "loss": 5.5176, + "loss/crossentropy": 2.2790863513946533, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17345691472291946, + "step": 9638 + }, + { + "epoch": 0.30125, + "grad_norm": 3.40625, + "grad_norm_var": 0.030790201822916665, + "learning_rate": 0.0001, + "loss": 6.1989, + "loss/crossentropy": 2.7583065032958984, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18858934193849564, + "step": 9640 + }, + { + "epoch": 0.3013125, + "grad_norm": 3.078125, + "grad_norm_var": 0.03411458333333333, + "learning_rate": 0.0001, + "loss": 5.7386, + "loss/crossentropy": 2.461578130722046, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17535844445228577, + "step": 9642 + }, + { + "epoch": 0.301375, + "grad_norm": 3.1875, + "grad_norm_var": 0.03196512858072917, + "learning_rate": 0.0001, + "loss": 5.9512, + "loss/crossentropy": 2.6277549266815186, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1807790920138359, + "step": 9644 + }, + { + "epoch": 0.3014375, + "grad_norm": 3.21875, + "grad_norm_var": 0.029076131184895833, + "learning_rate": 0.0001, + "loss": 5.4921, + "loss/crossentropy": 2.221467673778534, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1723794862627983, + "step": 9646 + }, + { + "epoch": 0.3015, + "grad_norm": 3.015625, + "grad_norm_var": 0.03277994791666667, + "learning_rate": 0.0001, + "loss": 5.6586, + "loss/crossentropy": 2.457500696182251, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17206186801195145, + "step": 9648 + }, + { + "epoch": 0.3015625, + "grad_norm": 5.8125, + "grad_norm_var": 0.4381022135416667, + "learning_rate": 0.0001, + "loss": 5.787, + "loss/crossentropy": 2.481991767883301, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17971954494714737, + "step": 9650 + }, + { + "epoch": 0.301625, + "grad_norm": 3.453125, + "grad_norm_var": 0.4443023681640625, + "learning_rate": 0.0001, + "loss": 6.5569, + "loss/crossentropy": 2.9203284978866577, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.20232653617858887, + "step": 9652 + }, + { + "epoch": 0.3016875, + "grad_norm": 3.53125, + "grad_norm_var": 0.4389556884765625, + "learning_rate": 0.0001, + "loss": 5.9993, + "loss/crossentropy": 2.6400113105773926, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18006744980812073, + "step": 9654 + }, + { + "epoch": 0.30175, + "grad_norm": 3.390625, + "grad_norm_var": 0.4326985677083333, + "learning_rate": 0.0001, + "loss": 5.972, + "loss/crossentropy": 2.6356629133224487, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18129239231348038, + "step": 9656 + }, + { + "epoch": 0.3018125, + "grad_norm": 3.515625, + "grad_norm_var": 0.42558186848958335, + "learning_rate": 0.0001, + "loss": 5.9715, + "loss/crossentropy": 2.6084052324295044, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18435966968536377, + "step": 9658 + }, + { + "epoch": 0.301875, + "grad_norm": 3.203125, + "grad_norm_var": 0.42740478515625, + "learning_rate": 0.0001, + "loss": 5.5919, + "loss/crossentropy": 2.396050214767456, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16958267986774445, + "step": 9660 + }, + { + "epoch": 0.3019375, + "grad_norm": 3.46875, + "grad_norm_var": 0.42538655598958336, + "learning_rate": 0.0001, + "loss": 6.0327, + "loss/crossentropy": 2.6803042888641357, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18367833644151688, + "step": 9662 + }, + { + "epoch": 0.302, + "grad_norm": 3.46875, + "grad_norm_var": 0.40513916015625, + "learning_rate": 0.0001, + "loss": 5.9193, + "loss/crossentropy": 2.616178512573242, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1775795891880989, + "step": 9664 + }, + { + "epoch": 0.3020625, + "grad_norm": 3.125, + "grad_norm_var": 0.03400777180989583, + "learning_rate": 0.0001, + "loss": 6.0945, + "loss/crossentropy": 2.6753381490707397, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18918544054031372, + "step": 9666 + }, + { + "epoch": 0.302125, + "grad_norm": 3.453125, + "grad_norm_var": 0.023921712239583334, + "learning_rate": 0.0001, + "loss": 5.76, + "loss/crossentropy": 2.3625649213790894, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.18076029419898987, + "step": 9668 + }, + { + "epoch": 0.3021875, + "grad_norm": 3.21875, + "grad_norm_var": 0.021468098958333334, + "learning_rate": 0.0001, + "loss": 5.9876, + "loss/crossentropy": 2.6000994443893433, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18445394933223724, + "step": 9670 + }, + { + "epoch": 0.30225, + "grad_norm": 3.234375, + "grad_norm_var": 0.019596354166666666, + "learning_rate": 0.0001, + "loss": 6.0433, + "loss/crossentropy": 2.628748297691345, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18911347538232803, + "step": 9672 + }, + { + "epoch": 0.3023125, + "grad_norm": 3.34375, + "grad_norm_var": 0.018583170572916665, + "learning_rate": 0.0001, + "loss": 6.0157, + "loss/crossentropy": 2.6911391019821167, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17972269654273987, + "step": 9674 + }, + { + "epoch": 0.302375, + "grad_norm": 3.421875, + "grad_norm_var": 0.018919881184895834, + "learning_rate": 0.0001, + "loss": 5.9976, + "loss/crossentropy": 2.540882706642151, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19098274409770966, + "step": 9676 + }, + { + "epoch": 0.3024375, + "grad_norm": 3.0, + "grad_norm_var": 0.0226470947265625, + "learning_rate": 0.0001, + "loss": 5.8145, + "loss/crossentropy": 2.524951934814453, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1727074533700943, + "step": 9678 + }, + { + "epoch": 0.3025, + "grad_norm": 3.078125, + "grad_norm_var": 0.024901326497395834, + "learning_rate": 0.0001, + "loss": 5.9706, + "loss/crossentropy": 2.6175925731658936, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18256500363349915, + "step": 9680 + }, + { + "epoch": 0.3025625, + "grad_norm": 3.328125, + "grad_norm_var": 0.029931640625, + "learning_rate": 0.0001, + "loss": 6.1773, + "loss/crossentropy": 2.6699429750442505, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19526726007461548, + "step": 9682 + }, + { + "epoch": 0.302625, + "grad_norm": 3.21875, + "grad_norm_var": 0.024828084309895835, + "learning_rate": 0.0001, + "loss": 5.7396, + "loss/crossentropy": 2.4560199975967407, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17757853120565414, + "step": 9684 + }, + { + "epoch": 0.3026875, + "grad_norm": 3.28125, + "grad_norm_var": 0.0289459228515625, + "learning_rate": 0.0001, + "loss": 5.706, + "loss/crossentropy": 2.4104011058807373, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.17370350658893585, + "step": 9686 + }, + { + "epoch": 0.30275, + "grad_norm": 3.21875, + "grad_norm_var": 0.03212788899739583, + "learning_rate": 0.0001, + "loss": 6.059, + "loss/crossentropy": 2.6281578540802, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18957317620515823, + "step": 9688 + }, + { + "epoch": 0.3028125, + "grad_norm": 3.390625, + "grad_norm_var": 0.030077107747395835, + "learning_rate": 0.0001, + "loss": 6.0764, + "loss/crossentropy": 2.6625046730041504, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18787209689617157, + "step": 9690 + }, + { + "epoch": 0.302875, + "grad_norm": 3.546875, + "grad_norm_var": 0.031110636393229165, + "learning_rate": 0.0001, + "loss": 6.0875, + "loss/crossentropy": 2.630816340446472, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18746767193078995, + "step": 9692 + }, + { + "epoch": 0.3029375, + "grad_norm": 2.90625, + "grad_norm_var": 0.03593648274739583, + "learning_rate": 0.0001, + "loss": 5.7976, + "loss/crossentropy": 2.48416268825531, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1789965257048607, + "step": 9694 + }, + { + "epoch": 0.303, + "grad_norm": 3.109375, + "grad_norm_var": 0.037255859375, + "learning_rate": 0.0001, + "loss": 5.6082, + "loss/crossentropy": 2.3885338306427, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17118359357118607, + "step": 9696 + }, + { + "epoch": 0.3030625, + "grad_norm": 5.125, + "grad_norm_var": 0.23954671223958332, + "learning_rate": 0.0001, + "loss": 5.8096, + "loss/crossentropy": 2.481685996055603, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18044928461313248, + "step": 9698 + }, + { + "epoch": 0.303125, + "grad_norm": 3.640625, + "grad_norm_var": 0.24023335774739582, + "learning_rate": 0.0001, + "loss": 5.9725, + "loss/crossentropy": 2.5380191802978516, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.18485242128372192, + "step": 9700 + }, + { + "epoch": 0.3031875, + "grad_norm": 4.21875, + "grad_norm_var": 0.28283589680989585, + "learning_rate": 0.0001, + "loss": 6.1457, + "loss/crossentropy": 2.648195743560791, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1931135207414627, + "step": 9702 + }, + { + "epoch": 0.30325, + "grad_norm": 3.40625, + "grad_norm_var": 0.31652730305989585, + "learning_rate": 0.0001, + "loss": 6.2557, + "loss/crossentropy": 2.6659653186798096, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.20076920092105865, + "step": 9704 + }, + { + "epoch": 0.3033125, + "grad_norm": 3.359375, + "grad_norm_var": 0.3236968994140625, + "learning_rate": 0.0001, + "loss": 5.774, + "loss/crossentropy": 2.449228048324585, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17779207974672318, + "step": 9706 + }, + { + "epoch": 0.303375, + "grad_norm": 3.328125, + "grad_norm_var": 0.32242431640625, + "learning_rate": 0.0001, + "loss": 5.9802, + "loss/crossentropy": 2.4985880851745605, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19269168376922607, + "step": 9708 + }, + { + "epoch": 0.3034375, + "grad_norm": 3.265625, + "grad_norm_var": 0.29934488932291664, + "learning_rate": 0.0001, + "loss": 5.7041, + "loss/crossentropy": 2.4116644859313965, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17455655336380005, + "step": 9710 + }, + { + "epoch": 0.3035, + "grad_norm": 3.5625, + "grad_norm_var": 0.27971089680989586, + "learning_rate": 0.0001, + "loss": 5.8835, + "loss/crossentropy": 2.5367835760116577, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17920752614736557, + "step": 9712 + }, + { + "epoch": 0.3035625, + "grad_norm": 3.546875, + "grad_norm_var": 0.1057525634765625, + "learning_rate": 0.0001, + "loss": 6.2456, + "loss/crossentropy": 2.7077924013137817, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19480124861001968, + "step": 9714 + }, + { + "epoch": 0.303625, + "grad_norm": 3.59375, + "grad_norm_var": 0.10676676432291667, + "learning_rate": 0.0001, + "loss": 5.8065, + "loss/crossentropy": 2.430699110031128, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18054787814617157, + "step": 9716 + }, + { + "epoch": 0.3036875, + "grad_norm": 3.71875, + "grad_norm_var": 0.06737874348958334, + "learning_rate": 0.0001, + "loss": 6.3493, + "loss/crossentropy": 2.7686712741851807, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.1971207708120346, + "step": 9718 + }, + { + "epoch": 0.30375, + "grad_norm": 3.390625, + "grad_norm_var": 0.024442545572916665, + "learning_rate": 0.0001, + "loss": 6.152, + "loss/crossentropy": 2.6613842248916626, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.19554419070482254, + "step": 9720 + }, + { + "epoch": 0.3038125, + "grad_norm": 3.765625, + "grad_norm_var": 0.026448567708333332, + "learning_rate": 0.0001, + "loss": 6.0501, + "loss/crossentropy": 2.6510229110717773, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.1817050278186798, + "step": 9722 + }, + { + "epoch": 0.303875, + "grad_norm": 3.40625, + "grad_norm_var": 0.025194295247395835, + "learning_rate": 0.0001, + "loss": 5.992, + "loss/crossentropy": 2.553894519805908, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18639186024665833, + "step": 9724 + }, + { + "epoch": 0.3039375, + "grad_norm": 3.265625, + "grad_norm_var": 0.026276652018229166, + "learning_rate": 0.0001, + "loss": 5.9109, + "loss/crossentropy": 2.565396308898926, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17986725270748138, + "step": 9726 + }, + { + "epoch": 0.304, + "grad_norm": 3.265625, + "grad_norm_var": 0.0246490478515625, + "learning_rate": 0.0001, + "loss": 5.7018, + "loss/crossentropy": 2.4221383333206177, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17249853909015656, + "step": 9728 + }, + { + "epoch": 0.3040625, + "grad_norm": 3.140625, + "grad_norm_var": 0.02939453125, + "learning_rate": 0.0001, + "loss": 5.7047, + "loss/crossentropy": 2.3192784786224365, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.17955958098173141, + "step": 9730 + }, + { + "epoch": 0.304125, + "grad_norm": 3.125, + "grad_norm_var": 0.03310546875, + "learning_rate": 0.0001, + "loss": 5.7375, + "loss/crossentropy": 2.4046072959899902, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1797727271914482, + "step": 9732 + }, + { + "epoch": 0.3041875, + "grad_norm": 3.578125, + "grad_norm_var": 0.0322174072265625, + "learning_rate": 0.0001, + "loss": 5.862, + "loss/crossentropy": 2.5601195096969604, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17745760083198547, + "step": 9734 + }, + { + "epoch": 0.30425, + "grad_norm": 4.15625, + "grad_norm_var": 0.07295633951822916, + "learning_rate": 0.0001, + "loss": 6.1264, + "loss/crossentropy": 2.6302343606948853, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19140934944152832, + "step": 9736 + }, + { + "epoch": 0.3043125, + "grad_norm": 3.4375, + "grad_norm_var": 0.06372782389322916, + "learning_rate": 0.0001, + "loss": 5.9925, + "loss/crossentropy": 2.5693787336349487, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18528062105178833, + "step": 9738 + }, + { + "epoch": 0.304375, + "grad_norm": 3.0, + "grad_norm_var": 0.076953125, + "learning_rate": 0.0001, + "loss": 5.7311, + "loss/crossentropy": 2.5311845541000366, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16843144595623016, + "step": 9740 + }, + { + "epoch": 0.3044375, + "grad_norm": 3.296875, + "grad_norm_var": 0.07720947265625, + "learning_rate": 0.0001, + "loss": 5.6622, + "loss/crossentropy": 2.4350874423980713, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.16606903076171875, + "step": 9742 + }, + { + "epoch": 0.3045, + "grad_norm": 3.515625, + "grad_norm_var": 0.07876688639322917, + "learning_rate": 0.0001, + "loss": 5.9038, + "loss/crossentropy": 2.5257811546325684, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18389403074979782, + "step": 9744 + }, + { + "epoch": 0.3045625, + "grad_norm": 3.21875, + "grad_norm_var": 0.0848297119140625, + "learning_rate": 0.0001, + "loss": 6.0084, + "loss/crossentropy": 2.6016011238098145, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18403909355401993, + "step": 9746 + }, + { + "epoch": 0.304625, + "grad_norm": 3.5625, + "grad_norm_var": 0.08259989420572916, + "learning_rate": 0.0001, + "loss": 5.8981, + "loss/crossentropy": 2.4193954467773438, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.18771403282880783, + "step": 9748 + }, + { + "epoch": 0.3046875, + "grad_norm": 3.25, + "grad_norm_var": 0.07692057291666667, + "learning_rate": 0.0001, + "loss": 6.047, + "loss/crossentropy": 2.6089885234832764, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.19145718216896057, + "step": 9750 + }, + { + "epoch": 0.30475, + "grad_norm": 3.359375, + "grad_norm_var": 0.034764607747395836, + "learning_rate": 0.0001, + "loss": 5.7729, + "loss/crossentropy": 2.414872884750366, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.17838361859321594, + "step": 9752 + }, + { + "epoch": 0.3048125, + "grad_norm": 3.21875, + "grad_norm_var": 0.034891764322916664, + "learning_rate": 0.0001, + "loss": 5.8067, + "loss/crossentropy": 2.4757325649261475, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18075258284807205, + "step": 9754 + }, + { + "epoch": 0.304875, + "grad_norm": 3.421875, + "grad_norm_var": 0.024079386393229166, + "learning_rate": 0.0001, + "loss": 5.8794, + "loss/crossentropy": 2.59158194065094, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17644038051366806, + "step": 9756 + }, + { + "epoch": 0.3049375, + "grad_norm": 3.1875, + "grad_norm_var": 0.025667317708333335, + "learning_rate": 0.0001, + "loss": 5.903, + "loss/crossentropy": 2.588863968849182, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17750480771064758, + "step": 9758 + }, + { + "epoch": 0.305, + "grad_norm": 3.40625, + "grad_norm_var": 0.02877197265625, + "learning_rate": 0.0001, + "loss": 6.0948, + "loss/crossentropy": 2.6731438636779785, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18747824430465698, + "step": 9760 + }, + { + "epoch": 0.3050625, + "grad_norm": 3.1875, + "grad_norm_var": 0.019896443684895834, + "learning_rate": 0.0001, + "loss": 6.1542, + "loss/crossentropy": 2.7394288778305054, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1852274313569069, + "step": 9762 + }, + { + "epoch": 0.305125, + "grad_norm": 4.15625, + "grad_norm_var": 0.0641510009765625, + "learning_rate": 0.0001, + "loss": 6.1654, + "loss/crossentropy": 2.600125551223755, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.19441720843315125, + "step": 9764 + }, + { + "epoch": 0.3051875, + "grad_norm": 3.15625, + "grad_norm_var": 0.06757405598958334, + "learning_rate": 0.0001, + "loss": 5.8276, + "loss/crossentropy": 2.552868127822876, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17473676800727844, + "step": 9766 + }, + { + "epoch": 0.30525, + "grad_norm": 3.796875, + "grad_norm_var": 0.08151753743489583, + "learning_rate": 0.0001, + "loss": 5.7432, + "loss/crossentropy": 2.376441240310669, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.17808697372674942, + "step": 9768 + }, + { + "epoch": 0.3053125, + "grad_norm": 3.890625, + "grad_norm_var": 0.10416259765625, + "learning_rate": 0.0001, + "loss": 6.2732, + "loss/crossentropy": 2.801830768585205, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1897130012512207, + "step": 9770 + }, + { + "epoch": 0.305375, + "grad_norm": 3.578125, + "grad_norm_var": 0.10221354166666667, + "learning_rate": 0.0001, + "loss": 6.282, + "loss/crossentropy": 2.7058520317077637, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.20019535720348358, + "step": 9772 + }, + { + "epoch": 0.3054375, + "grad_norm": 3.5625, + "grad_norm_var": 0.09934895833333333, + "learning_rate": 0.0001, + "loss": 5.9044, + "loss/crossentropy": 2.424149751663208, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1913842111825943, + "step": 9774 + }, + { + "epoch": 0.3055, + "grad_norm": 3.59375, + "grad_norm_var": 0.09735921223958334, + "learning_rate": 0.0001, + "loss": 5.8527, + "loss/crossentropy": 2.4528133869171143, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.182962104678154, + "step": 9776 + }, + { + "epoch": 0.3055625, + "grad_norm": 3.109375, + "grad_norm_var": 0.09986063639322916, + "learning_rate": 0.0001, + "loss": 6.0973, + "loss/crossentropy": 2.6982662677764893, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18638630211353302, + "step": 9778 + }, + { + "epoch": 0.305625, + "grad_norm": 3.625, + "grad_norm_var": 0.0629791259765625, + "learning_rate": 0.0001, + "loss": 6.3929, + "loss/crossentropy": 2.798943281173706, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.2000209465622902, + "step": 9780 + }, + { + "epoch": 0.3056875, + "grad_norm": 3.0, + "grad_norm_var": 0.069580078125, + "learning_rate": 0.0001, + "loss": 5.7924, + "loss/crossentropy": 2.5172841548919678, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1779046207666397, + "step": 9782 + }, + { + "epoch": 0.30575, + "grad_norm": 3.40625, + "grad_norm_var": 0.06213785807291667, + "learning_rate": 0.0001, + "loss": 5.8359, + "loss/crossentropy": 2.506075620651245, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1810283660888672, + "step": 9784 + }, + { + "epoch": 0.3058125, + "grad_norm": 3.484375, + "grad_norm_var": 0.0404449462890625, + "learning_rate": 0.0001, + "loss": 5.9368, + "loss/crossentropy": 2.524527072906494, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18770781904459, + "step": 9786 + }, + { + "epoch": 0.305875, + "grad_norm": 3.234375, + "grad_norm_var": 0.0729888916015625, + "learning_rate": 0.0001, + "loss": 5.6651, + "loss/crossentropy": 2.3186328411102295, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1811298280954361, + "step": 9788 + }, + { + "epoch": 0.3059375, + "grad_norm": 3.3125, + "grad_norm_var": 0.09324544270833333, + "learning_rate": 0.0001, + "loss": 5.9769, + "loss/crossentropy": 2.6126078367233276, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18212811648845673, + "step": 9790 + }, + { + "epoch": 0.306, + "grad_norm": 3.40625, + "grad_norm_var": 0.08733317057291666, + "learning_rate": 0.0001, + "loss": 5.843, + "loss/crossentropy": 2.464438796043396, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18395350873470306, + "step": 9792 + }, + { + "epoch": 0.3060625, + "grad_norm": 3.84375, + "grad_norm_var": 0.0983306884765625, + "learning_rate": 0.0001, + "loss": 5.8246, + "loss/crossentropy": 2.483186721801758, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17905943095684052, + "step": 9794 + }, + { + "epoch": 0.306125, + "grad_norm": 3.15625, + "grad_norm_var": 0.10074462890625, + "learning_rate": 0.0001, + "loss": 6.034, + "loss/crossentropy": 2.705387234687805, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18129978328943253, + "step": 9796 + }, + { + "epoch": 0.3061875, + "grad_norm": 3.421875, + "grad_norm_var": 0.08580322265625, + "learning_rate": 0.0001, + "loss": 5.8784, + "loss/crossentropy": 2.4730982780456543, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18428245186805725, + "step": 9798 + }, + { + "epoch": 0.30625, + "grad_norm": 3.25, + "grad_norm_var": 0.09385477701822917, + "learning_rate": 0.0001, + "loss": 5.5357, + "loss/crossentropy": 2.307369589805603, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.16892365366220474, + "step": 9800 + }, + { + "epoch": 0.3063125, + "grad_norm": 3.3125, + "grad_norm_var": 0.0942291259765625, + "learning_rate": 0.0001, + "loss": 5.7947, + "loss/crossentropy": 2.520161747932434, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17433323711156845, + "step": 9802 + }, + { + "epoch": 0.306375, + "grad_norm": 3.28125, + "grad_norm_var": 0.0646484375, + "learning_rate": 0.0001, + "loss": 6.0379, + "loss/crossentropy": 2.6151548624038696, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18641389906406403, + "step": 9804 + }, + { + "epoch": 0.3064375, + "grad_norm": 3.421875, + "grad_norm_var": 0.05175374348958333, + "learning_rate": 0.0001, + "loss": 5.8846, + "loss/crossentropy": 2.3822693824768066, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.187337726354599, + "step": 9806 + }, + { + "epoch": 0.3065, + "grad_norm": 3.59375, + "grad_norm_var": 0.05625712076822917, + "learning_rate": 0.0001, + "loss": 6.2549, + "loss/crossentropy": 2.7366974353790283, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19322381913661957, + "step": 9808 + }, + { + "epoch": 0.3065625, + "grad_norm": 3.046875, + "grad_norm_var": 0.06360270182291666, + "learning_rate": 0.0001, + "loss": 5.6028, + "loss/crossentropy": 2.431782364845276, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16788671165704727, + "step": 9810 + }, + { + "epoch": 0.306625, + "grad_norm": 3.421875, + "grad_norm_var": 0.05806884765625, + "learning_rate": 0.0001, + "loss": 6.0663, + "loss/crossentropy": 2.6174486875534058, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18863768130540848, + "step": 9812 + }, + { + "epoch": 0.3066875, + "grad_norm": 3.421875, + "grad_norm_var": 0.0593658447265625, + "learning_rate": 0.0001, + "loss": 5.8863, + "loss/crossentropy": 2.484455108642578, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18393176048994064, + "step": 9814 + }, + { + "epoch": 0.30675, + "grad_norm": 3.359375, + "grad_norm_var": 0.061909993489583336, + "learning_rate": 0.0001, + "loss": 5.8589, + "loss/crossentropy": 2.4574899673461914, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18271709978580475, + "step": 9816 + }, + { + "epoch": 0.3068125, + "grad_norm": 3.609375, + "grad_norm_var": 0.06285400390625, + "learning_rate": 0.0001, + "loss": 6.0258, + "loss/crossentropy": 2.581871509552002, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18775159865617752, + "step": 9818 + }, + { + "epoch": 0.306875, + "grad_norm": 3.53125, + "grad_norm_var": 0.0616363525390625, + "learning_rate": 0.0001, + "loss": 6.1466, + "loss/crossentropy": 2.6562764644622803, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.19473157078027725, + "step": 9820 + }, + { + "epoch": 0.3069375, + "grad_norm": 3.78125, + "grad_norm_var": 0.06469624837239583, + "learning_rate": 0.0001, + "loss": 6.2948, + "loss/crossentropy": 2.640057325363159, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.20141346007585526, + "step": 9822 + }, + { + "epoch": 0.307, + "grad_norm": 3.484375, + "grad_norm_var": 0.06384175618489583, + "learning_rate": 0.0001, + "loss": 6.0037, + "loss/crossentropy": 2.6254125833511353, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18391861766576767, + "step": 9824 + }, + { + "epoch": 0.3070625, + "grad_norm": 3.0, + "grad_norm_var": 0.051732381184895836, + "learning_rate": 0.0001, + "loss": 6.1341, + "loss/crossentropy": 2.7805886268615723, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1853499636054039, + "step": 9826 + }, + { + "epoch": 0.307125, + "grad_norm": 4.09375, + "grad_norm_var": 0.07996317545572916, + "learning_rate": 0.0001, + "loss": 6.1145, + "loss/crossentropy": 2.549388289451599, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19869714230298996, + "step": 9828 + }, + { + "epoch": 0.3071875, + "grad_norm": 3.25, + "grad_norm_var": 0.08531901041666666, + "learning_rate": 0.0001, + "loss": 5.7362, + "loss/crossentropy": 2.4683566093444824, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17131523042917252, + "step": 9830 + }, + { + "epoch": 0.30725, + "grad_norm": 3.4375, + "grad_norm_var": 0.0760162353515625, + "learning_rate": 0.0001, + "loss": 5.9847, + "loss/crossentropy": 2.5591347217559814, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1886463165283203, + "step": 9832 + }, + { + "epoch": 0.3073125, + "grad_norm": 3.34375, + "grad_norm_var": 0.08001302083333334, + "learning_rate": 0.0001, + "loss": 6.272, + "loss/crossentropy": 2.7357157468795776, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19659461081027985, + "step": 9834 + }, + { + "epoch": 0.307375, + "grad_norm": 3.484375, + "grad_norm_var": 0.079541015625, + "learning_rate": 0.0001, + "loss": 5.9386, + "loss/crossentropy": 2.547179102897644, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18289538472890854, + "step": 9836 + }, + { + "epoch": 0.3074375, + "grad_norm": 3.515625, + "grad_norm_var": 0.07056884765625, + "learning_rate": 0.0001, + "loss": 6.2855, + "loss/crossentropy": 2.701116442680359, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1998470351099968, + "step": 9838 + }, + { + "epoch": 0.3075, + "grad_norm": 3.421875, + "grad_norm_var": 0.07183837890625, + "learning_rate": 0.0001, + "loss": 6.0312, + "loss/crossentropy": 2.6321617364883423, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1867804378271103, + "step": 9840 + }, + { + "epoch": 0.3075625, + "grad_norm": 3.296875, + "grad_norm_var": 0.06365458170572917, + "learning_rate": 0.0001, + "loss": 6.1738, + "loss/crossentropy": 2.7372519969940186, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18662403523921967, + "step": 9842 + }, + { + "epoch": 0.307625, + "grad_norm": 3.3125, + "grad_norm_var": 0.03885091145833333, + "learning_rate": 0.0001, + "loss": 5.7245, + "loss/crossentropy": 2.4566760063171387, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1756145879626274, + "step": 9844 + }, + { + "epoch": 0.3076875, + "grad_norm": 3.4375, + "grad_norm_var": 0.07322591145833333, + "learning_rate": 0.0001, + "loss": 6.1802, + "loss/crossentropy": 2.640401840209961, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19421062618494034, + "step": 9846 + }, + { + "epoch": 0.30775, + "grad_norm": 3.3125, + "grad_norm_var": 0.07102457682291667, + "learning_rate": 0.0001, + "loss": 5.9244, + "loss/crossentropy": 2.5438748598098755, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18297865241765976, + "step": 9848 + }, + { + "epoch": 0.3078125, + "grad_norm": 3.765625, + "grad_norm_var": 0.07532450358072916, + "learning_rate": 0.0001, + "loss": 6.1063, + "loss/crossentropy": 2.6450347900390625, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18909262120723724, + "step": 9850 + }, + { + "epoch": 0.307875, + "grad_norm": 3.53125, + "grad_norm_var": 0.07706705729166667, + "learning_rate": 0.0001, + "loss": 6.0794, + "loss/crossentropy": 2.6155248880386353, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18857572972774506, + "step": 9852 + }, + { + "epoch": 0.3079375, + "grad_norm": 3.28125, + "grad_norm_var": 0.0739898681640625, + "learning_rate": 0.0001, + "loss": 5.7991, + "loss/crossentropy": 2.467707872390747, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17844978719949722, + "step": 9854 + }, + { + "epoch": 0.308, + "grad_norm": 3.328125, + "grad_norm_var": 0.0732574462890625, + "learning_rate": 0.0001, + "loss": 6.2447, + "loss/crossentropy": 2.705179214477539, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1976974532008171, + "step": 9856 + }, + { + "epoch": 0.3080625, + "grad_norm": 3.328125, + "grad_norm_var": 0.06796773274739583, + "learning_rate": 0.0001, + "loss": 6.0257, + "loss/crossentropy": 2.636121988296509, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18465761840343475, + "step": 9858 + }, + { + "epoch": 0.308125, + "grad_norm": 3.34375, + "grad_norm_var": 0.0575103759765625, + "learning_rate": 0.0001, + "loss": 6.0515, + "loss/crossentropy": 2.5803788900375366, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19047530740499496, + "step": 9860 + }, + { + "epoch": 0.3081875, + "grad_norm": 3.234375, + "grad_norm_var": 0.05416666666666667, + "learning_rate": 0.0001, + "loss": 6.1671, + "loss/crossentropy": 2.685998320579529, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19029683619737625, + "step": 9862 + }, + { + "epoch": 0.30825, + "grad_norm": 3.28125, + "grad_norm_var": 0.05315348307291667, + "learning_rate": 0.0001, + "loss": 6.1954, + "loss/crossentropy": 2.747362732887268, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18738098442554474, + "step": 9864 + }, + { + "epoch": 0.3083125, + "grad_norm": 3.375, + "grad_norm_var": 0.0454254150390625, + "learning_rate": 0.0001, + "loss": 6.2392, + "loss/crossentropy": 2.808570146560669, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18759103119373322, + "step": 9866 + }, + { + "epoch": 0.308375, + "grad_norm": 3.171875, + "grad_norm_var": 0.05003153483072917, + "learning_rate": 0.0001, + "loss": 6.049, + "loss/crossentropy": 2.623955488204956, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18586328625679016, + "step": 9868 + }, + { + "epoch": 0.3084375, + "grad_norm": 3.0625, + "grad_norm_var": 0.056864420572916664, + "learning_rate": 0.0001, + "loss": 5.7197, + "loss/crossentropy": 2.475876808166504, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1732119396328926, + "step": 9870 + }, + { + "epoch": 0.3085, + "grad_norm": 3.515625, + "grad_norm_var": 0.05852457682291667, + "learning_rate": 0.0001, + "loss": 5.8995, + "loss/crossentropy": 2.5887688398361206, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17756131291389465, + "step": 9872 + }, + { + "epoch": 0.3085625, + "grad_norm": 4.78125, + "grad_norm_var": 0.18848368326822917, + "learning_rate": 0.0001, + "loss": 6.5997, + "loss/crossentropy": 2.8490360975265503, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.21413087844848633, + "step": 9874 + }, + { + "epoch": 0.308625, + "grad_norm": 3.3125, + "grad_norm_var": 0.18660481770833334, + "learning_rate": 0.0001, + "loss": 6.2266, + "loss/crossentropy": 2.7796058654785156, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18650047481060028, + "step": 9876 + }, + { + "epoch": 0.3086875, + "grad_norm": 3.4375, + "grad_norm_var": 0.1646148681640625, + "learning_rate": 0.0001, + "loss": 6.1877, + "loss/crossentropy": 2.775872588157654, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18610799312591553, + "step": 9878 + }, + { + "epoch": 0.30875, + "grad_norm": 3.734375, + "grad_norm_var": 0.16564127604166667, + "learning_rate": 0.0001, + "loss": 5.9442, + "loss/crossentropy": 2.5594289302825928, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.181057408452034, + "step": 9880 + }, + { + "epoch": 0.3088125, + "grad_norm": 3.53125, + "grad_norm_var": 0.16448465983072916, + "learning_rate": 0.0001, + "loss": 6.1045, + "loss/crossentropy": 2.648719549179077, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.19128568470478058, + "step": 9882 + }, + { + "epoch": 0.308875, + "grad_norm": 3.15625, + "grad_norm_var": 0.16286519368489583, + "learning_rate": 0.0001, + "loss": 5.9229, + "loss/crossentropy": 2.567718982696533, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18161438405513763, + "step": 9884 + }, + { + "epoch": 0.3089375, + "grad_norm": 3.359375, + "grad_norm_var": 0.1539703369140625, + "learning_rate": 0.0001, + "loss": 6.0521, + "loss/crossentropy": 2.6140952110290527, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.18481512367725372, + "step": 9886 + }, + { + "epoch": 0.309, + "grad_norm": 3.171875, + "grad_norm_var": 0.1518218994140625, + "learning_rate": 0.0001, + "loss": 5.9709, + "loss/crossentropy": 2.589396595954895, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18580633401870728, + "step": 9888 + }, + { + "epoch": 0.3090625, + "grad_norm": 3.265625, + "grad_norm_var": 0.028251139322916667, + "learning_rate": 0.0001, + "loss": 5.9705, + "loss/crossentropy": 2.5676733255386353, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.1805213838815689, + "step": 9890 + }, + { + "epoch": 0.309125, + "grad_norm": 3.203125, + "grad_norm_var": 0.024120076497395834, + "learning_rate": 0.0001, + "loss": 6.1128, + "loss/crossentropy": 2.677307605743408, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1904277577996254, + "step": 9892 + }, + { + "epoch": 0.3091875, + "grad_norm": 3.109375, + "grad_norm_var": 0.027083333333333334, + "learning_rate": 0.0001, + "loss": 5.3723, + "loss/crossentropy": 2.256300449371338, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16081786900758743, + "step": 9894 + }, + { + "epoch": 0.30925, + "grad_norm": 3.703125, + "grad_norm_var": 0.0312408447265625, + "learning_rate": 0.0001, + "loss": 6.5875, + "loss/crossentropy": 2.90229868888855, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.2087506353855133, + "step": 9896 + }, + { + "epoch": 0.3093125, + "grad_norm": 3.421875, + "grad_norm_var": 0.029133097330729166, + "learning_rate": 0.0001, + "loss": 5.5702, + "loss/crossentropy": 2.324537992477417, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17299997806549072, + "step": 9898 + }, + { + "epoch": 0.309375, + "grad_norm": 3.59375, + "grad_norm_var": 0.032648722330729164, + "learning_rate": 0.0001, + "loss": 5.9227, + "loss/crossentropy": 2.5770009756088257, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1810528114438057, + "step": 9900 + }, + { + "epoch": 0.3094375, + "grad_norm": 2.890625, + "grad_norm_var": 0.046930948893229164, + "learning_rate": 0.0001, + "loss": 5.6693, + "loss/crossentropy": 2.446715235710144, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17264599353075027, + "step": 9902 + }, + { + "epoch": 0.3095, + "grad_norm": 3.359375, + "grad_norm_var": 0.04419657389322917, + "learning_rate": 0.0001, + "loss": 5.8134, + "loss/crossentropy": 2.5081379413604736, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17661748826503754, + "step": 9904 + }, + { + "epoch": 0.3095625, + "grad_norm": 3.109375, + "grad_norm_var": 0.044352213541666664, + "learning_rate": 0.0001, + "loss": 5.749, + "loss/crossentropy": 2.468726873397827, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17646163702011108, + "step": 9906 + }, + { + "epoch": 0.309625, + "grad_norm": 3.953125, + "grad_norm_var": 0.08619384765625, + "learning_rate": 0.0001, + "loss": 5.7259, + "loss/crossentropy": 2.3587589263916016, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.17968729883432388, + "step": 9908 + }, + { + "epoch": 0.3096875, + "grad_norm": 3.75, + "grad_norm_var": 0.09411519368489583, + "learning_rate": 0.0001, + "loss": 6.1154, + "loss/crossentropy": 2.744936943054199, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18274830281734467, + "step": 9910 + }, + { + "epoch": 0.30975, + "grad_norm": 3.453125, + "grad_norm_var": 0.09482421875, + "learning_rate": 0.0001, + "loss": 5.834, + "loss/crossentropy": 2.5946810245513916, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1715928465127945, + "step": 9912 + }, + { + "epoch": 0.3098125, + "grad_norm": 3.609375, + "grad_norm_var": 0.6234527587890625, + "learning_rate": 0.0001, + "loss": 6.0723, + "loss/crossentropy": 2.575463891029358, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19421055167913437, + "step": 9914 + }, + { + "epoch": 0.309875, + "grad_norm": 3.3125, + "grad_norm_var": 0.6217437744140625, + "learning_rate": 0.0001, + "loss": 5.7882, + "loss/crossentropy": 2.528978943824768, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1731920763850212, + "step": 9916 + }, + { + "epoch": 0.3099375, + "grad_norm": 3.640625, + "grad_norm_var": 0.5905914306640625, + "learning_rate": 0.0001, + "loss": 6.0089, + "loss/crossentropy": 2.5800297260284424, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18546050786972046, + "step": 9918 + }, + { + "epoch": 0.31, + "grad_norm": 3.921875, + "grad_norm_var": 0.5864166259765625, + "learning_rate": 0.0001, + "loss": 6.1517, + "loss/crossentropy": 2.6178401708602905, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.19283895194530487, + "step": 9920 + }, + { + "epoch": 0.3100625, + "grad_norm": 3.359375, + "grad_norm_var": 0.5754709879557292, + "learning_rate": 0.0001, + "loss": 6.3124, + "loss/crossentropy": 2.8379873037338257, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19118830561637878, + "step": 9922 + }, + { + "epoch": 0.310125, + "grad_norm": 3.46875, + "grad_norm_var": 0.6920206705729167, + "learning_rate": 0.0001, + "loss": 5.9962, + "loss/crossentropy": 2.508086919784546, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1960764080286026, + "step": 9924 + }, + { + "epoch": 0.3101875, + "grad_norm": 3.515625, + "grad_norm_var": 0.6780832926432292, + "learning_rate": 0.0001, + "loss": 6.1674, + "loss/crossentropy": 2.7152515649795532, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.18661946803331375, + "step": 9926 + }, + { + "epoch": 0.31025, + "grad_norm": 3.40625, + "grad_norm_var": 0.6502024332682291, + "learning_rate": 0.0001, + "loss": 5.6761, + "loss/crossentropy": 2.3589260578155518, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17508075386285782, + "step": 9928 + }, + { + "epoch": 0.3103125, + "grad_norm": 3.5625, + "grad_norm_var": 0.2156646728515625, + "learning_rate": 0.0001, + "loss": 5.8203, + "loss/crossentropy": 2.5374714136123657, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17515962570905685, + "step": 9930 + }, + { + "epoch": 0.310375, + "grad_norm": 3.15625, + "grad_norm_var": 0.219921875, + "learning_rate": 0.0001, + "loss": 6.037, + "loss/crossentropy": 2.6640161275863647, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18416845053434372, + "step": 9932 + }, + { + "epoch": 0.3104375, + "grad_norm": 3.609375, + "grad_norm_var": 0.21574605305989583, + "learning_rate": 0.0001, + "loss": 6.198, + "loss/crossentropy": 2.705440402030945, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19104811549186707, + "step": 9934 + }, + { + "epoch": 0.3105, + "grad_norm": 3.453125, + "grad_norm_var": 0.2122467041015625, + "learning_rate": 0.0001, + "loss": 5.8265, + "loss/crossentropy": 2.546779990196228, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17953650653362274, + "step": 9936 + }, + { + "epoch": 0.3105625, + "grad_norm": 3.4375, + "grad_norm_var": 0.20606180826822917, + "learning_rate": 0.0001, + "loss": 6.1075, + "loss/crossentropy": 2.6210367679595947, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1900544911623001, + "step": 9938 + }, + { + "epoch": 0.310625, + "grad_norm": 3.3125, + "grad_norm_var": 0.06210530598958333, + "learning_rate": 0.0001, + "loss": 5.8099, + "loss/crossentropy": 2.5792685747146606, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1734534054994583, + "step": 9940 + }, + { + "epoch": 0.3106875, + "grad_norm": 7.65625, + "grad_norm_var": 1.1744466145833334, + "learning_rate": 0.0001, + "loss": 6.0235, + "loss/crossentropy": 2.4303317070007324, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19994131475687027, + "step": 9942 + }, + { + "epoch": 0.31075, + "grad_norm": 3.09375, + "grad_norm_var": 1.1827799479166667, + "learning_rate": 0.0001, + "loss": 5.732, + "loss/crossentropy": 2.526008129119873, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17098946869373322, + "step": 9944 + }, + { + "epoch": 0.3108125, + "grad_norm": 3.65625, + "grad_norm_var": 1.1689198811848958, + "learning_rate": 0.0001, + "loss": 6.0426, + "loss/crossentropy": 2.6789000034332275, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18128886818885803, + "step": 9946 + }, + { + "epoch": 0.310875, + "grad_norm": 3.0625, + "grad_norm_var": 1.18121337890625, + "learning_rate": 0.0001, + "loss": 5.8011, + "loss/crossentropy": 2.4889053106307983, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17770235240459442, + "step": 9948 + }, + { + "epoch": 0.3109375, + "grad_norm": 3.390625, + "grad_norm_var": 1.1945963541666667, + "learning_rate": 0.0001, + "loss": 5.963, + "loss/crossentropy": 2.5693660974502563, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18780048191547394, + "step": 9950 + }, + { + "epoch": 0.311, + "grad_norm": 3.515625, + "grad_norm_var": 1.1877604166666667, + "learning_rate": 0.0001, + "loss": 6.3526, + "loss/crossentropy": 2.8071502447128296, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.1936059445142746, + "step": 9952 + }, + { + "epoch": 0.3110625, + "grad_norm": 3.625, + "grad_norm_var": 1.1951243082682292, + "learning_rate": 0.0001, + "loss": 6.1133, + "loss/crossentropy": 2.6762869358062744, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18627797067165375, + "step": 9954 + }, + { + "epoch": 0.311125, + "grad_norm": 3.296875, + "grad_norm_var": 1.17275390625, + "learning_rate": 0.0001, + "loss": 6.275, + "loss/crossentropy": 2.8558825254440308, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18410159647464752, + "step": 9956 + }, + { + "epoch": 0.3111875, + "grad_norm": 3.5, + "grad_norm_var": 0.055597941080729164, + "learning_rate": 0.0001, + "loss": 6.21, + "loss/crossentropy": 2.735555052757263, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1915881484746933, + "step": 9958 + }, + { + "epoch": 0.31125, + "grad_norm": 3.1875, + "grad_norm_var": 0.05237223307291667, + "learning_rate": 0.0001, + "loss": 5.8453, + "loss/crossentropy": 2.5318996906280518, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1762586608529091, + "step": 9960 + }, + { + "epoch": 0.3113125, + "grad_norm": 3.328125, + "grad_norm_var": 0.049051920572916664, + "learning_rate": 0.0001, + "loss": 6.0751, + "loss/crossentropy": 2.6993168592453003, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1832793802022934, + "step": 9962 + }, + { + "epoch": 0.311375, + "grad_norm": 3.40625, + "grad_norm_var": 0.0489654541015625, + "learning_rate": 0.0001, + "loss": 5.998, + "loss/crossentropy": 2.5605742931365967, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1874968707561493, + "step": 9964 + }, + { + "epoch": 0.3114375, + "grad_norm": 3.859375, + "grad_norm_var": 0.056396484375, + "learning_rate": 0.0001, + "loss": 6.1923, + "loss/crossentropy": 2.628121256828308, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19743662327528, + "step": 9966 + }, + { + "epoch": 0.3115, + "grad_norm": 3.296875, + "grad_norm_var": 0.052294921875, + "learning_rate": 0.0001, + "loss": 5.9873, + "loss/crossentropy": 2.6812620162963867, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17787063866853714, + "step": 9968 + }, + { + "epoch": 0.3115625, + "grad_norm": 3.609375, + "grad_norm_var": 0.04845377604166667, + "learning_rate": 0.0001, + "loss": 6.0452, + "loss/crossentropy": 2.5701335668563843, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.18813063204288483, + "step": 9970 + }, + { + "epoch": 0.311625, + "grad_norm": 3.328125, + "grad_norm_var": 0.0431060791015625, + "learning_rate": 0.0001, + "loss": 5.6299, + "loss/crossentropy": 2.377366542816162, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17446747422218323, + "step": 9972 + }, + { + "epoch": 0.3116875, + "grad_norm": 3.671875, + "grad_norm_var": 0.048974609375, + "learning_rate": 0.0001, + "loss": 5.7221, + "loss/crossentropy": 2.362430214881897, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.17659056186676025, + "step": 9974 + }, + { + "epoch": 0.31175, + "grad_norm": 3.75, + "grad_norm_var": 0.05611572265625, + "learning_rate": 0.0001, + "loss": 5.8213, + "loss/crossentropy": 2.462308406829834, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.17808612436056137, + "step": 9976 + }, + { + "epoch": 0.3118125, + "grad_norm": 3.3125, + "grad_norm_var": 0.05877278645833333, + "learning_rate": 0.0001, + "loss": 6.1144, + "loss/crossentropy": 2.7456939220428467, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18335268646478653, + "step": 9978 + }, + { + "epoch": 0.311875, + "grad_norm": 3.390625, + "grad_norm_var": 0.0531890869140625, + "learning_rate": 0.0001, + "loss": 5.8751, + "loss/crossentropy": 2.599093437194824, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1733005940914154, + "step": 9980 + }, + { + "epoch": 0.3119375, + "grad_norm": 3.390625, + "grad_norm_var": 0.03828125, + "learning_rate": 0.0001, + "loss": 6.3225, + "loss/crossentropy": 2.8240467309951782, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19437789916992188, + "step": 9982 + }, + { + "epoch": 0.312, + "grad_norm": 3.609375, + "grad_norm_var": 0.03931884765625, + "learning_rate": 0.0001, + "loss": 6.072, + "loss/crossentropy": 2.614315390586853, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.19069334864616394, + "step": 9984 + }, + { + "epoch": 0.3120625, + "grad_norm": 3.453125, + "grad_norm_var": 0.04952799479166667, + "learning_rate": 0.0001, + "loss": 6.1526, + "loss/crossentropy": 2.637527585029602, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19369181990623474, + "step": 9986 + }, + { + "epoch": 0.312125, + "grad_norm": 3.390625, + "grad_norm_var": 0.04692281087239583, + "learning_rate": 0.0001, + "loss": 6.1406, + "loss/crossentropy": 2.7079596519470215, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18857339769601822, + "step": 9988 + }, + { + "epoch": 0.3121875, + "grad_norm": 3.578125, + "grad_norm_var": 0.04166259765625, + "learning_rate": 0.0001, + "loss": 5.921, + "loss/crossentropy": 2.5391063690185547, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18310847878456116, + "step": 9990 + }, + { + "epoch": 0.31225, + "grad_norm": 3.1875, + "grad_norm_var": 0.032242838541666666, + "learning_rate": 0.0001, + "loss": 6.4327, + "loss/crossentropy": 2.9867708683013916, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1902955174446106, + "step": 9992 + }, + { + "epoch": 0.3123125, + "grad_norm": 3.34375, + "grad_norm_var": 0.03570963541666667, + "learning_rate": 0.0001, + "loss": 5.8569, + "loss/crossentropy": 2.498446226119995, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18232624232769012, + "step": 9994 + }, + { + "epoch": 0.312375, + "grad_norm": 3.34375, + "grad_norm_var": 0.031245930989583334, + "learning_rate": 0.0001, + "loss": 6.0615, + "loss/crossentropy": 2.6084243059158325, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.19100607931613922, + "step": 9996 + }, + { + "epoch": 0.3124375, + "grad_norm": 3.3125, + "grad_norm_var": 0.0321441650390625, + "learning_rate": 0.0001, + "loss": 5.9178, + "loss/crossentropy": 2.620732069015503, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17580141127109528, + "step": 9998 + }, + { + "epoch": 0.3125, + "grad_norm": 3.421875, + "grad_norm_var": 0.03232014973958333, + "learning_rate": 0.0001, + "loss": 5.748, + "loss/crossentropy": 2.4111841917037964, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18055960536003113, + "step": 10000 + }, + { + "epoch": 0.3125625, + "grad_norm": 3.359375, + "grad_norm_var": 0.018407185872395832, + "learning_rate": 0.0001, + "loss": 5.8249, + "loss/crossentropy": 2.4341362714767456, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1863442361354828, + "step": 10002 + }, + { + "epoch": 0.312625, + "grad_norm": 3.921875, + "grad_norm_var": 0.03772379557291667, + "learning_rate": 0.0001, + "loss": 5.9725, + "loss/crossentropy": 2.524761438369751, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.18384020030498505, + "step": 10004 + }, + { + "epoch": 0.3126875, + "grad_norm": 3.359375, + "grad_norm_var": 0.035481770833333336, + "learning_rate": 0.0001, + "loss": 5.8406, + "loss/crossentropy": 2.4397616386413574, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1822672188282013, + "step": 10006 + }, + { + "epoch": 0.31275, + "grad_norm": 3.359375, + "grad_norm_var": 0.03580322265625, + "learning_rate": 0.0001, + "loss": 5.9844, + "loss/crossentropy": 2.6379276514053345, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17995747178792953, + "step": 10008 + }, + { + "epoch": 0.3128125, + "grad_norm": 3.609375, + "grad_norm_var": 0.034521484375, + "learning_rate": 0.0001, + "loss": 5.9061, + "loss/crossentropy": 2.4900271892547607, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18457267433404922, + "step": 10010 + }, + { + "epoch": 0.312875, + "grad_norm": 3.015625, + "grad_norm_var": 0.04472554524739583, + "learning_rate": 0.0001, + "loss": 6.0315, + "loss/crossentropy": 2.5978981256484985, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1859375536441803, + "step": 10012 + }, + { + "epoch": 0.3129375, + "grad_norm": 3.34375, + "grad_norm_var": 0.04940999348958333, + "learning_rate": 0.0001, + "loss": 5.9851, + "loss/crossentropy": 2.6621599197387695, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17995379120111465, + "step": 10014 + }, + { + "epoch": 0.313, + "grad_norm": 3.546875, + "grad_norm_var": 0.0474609375, + "learning_rate": 0.0001, + "loss": 6.3342, + "loss/crossentropy": 2.760565996170044, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.19525590538978577, + "step": 10016 + }, + { + "epoch": 0.3130625, + "grad_norm": 3.5, + "grad_norm_var": 0.04726460774739583, + "learning_rate": 0.0001, + "loss": 5.8966, + "loss/crossentropy": 2.5175243616104126, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18517116457223892, + "step": 10018 + }, + { + "epoch": 0.313125, + "grad_norm": 3.484375, + "grad_norm_var": 0.029548136393229167, + "learning_rate": 0.0001, + "loss": 6.05, + "loss/crossentropy": 2.631057024002075, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1875997707247734, + "step": 10020 + }, + { + "epoch": 0.3131875, + "grad_norm": 3.65625, + "grad_norm_var": 0.0339263916015625, + "learning_rate": 0.0001, + "loss": 5.9978, + "loss/crossentropy": 2.4847946166992188, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.1930968165397644, + "step": 10022 + }, + { + "epoch": 0.31325, + "grad_norm": 3.171875, + "grad_norm_var": 0.0322418212890625, + "learning_rate": 0.0001, + "loss": 6.1395, + "loss/crossentropy": 2.7118492126464844, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18964150547981262, + "step": 10024 + }, + { + "epoch": 0.3133125, + "grad_norm": 3.515625, + "grad_norm_var": 0.03418680826822917, + "learning_rate": 0.0001, + "loss": 6.3217, + "loss/crossentropy": 2.7653316259384155, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19860395044088364, + "step": 10026 + }, + { + "epoch": 0.313375, + "grad_norm": 3.265625, + "grad_norm_var": 0.024592081705729168, + "learning_rate": 0.0001, + "loss": 6.0078, + "loss/crossentropy": 2.6814316511154175, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18146147578954697, + "step": 10028 + }, + { + "epoch": 0.3134375, + "grad_norm": 3.484375, + "grad_norm_var": 0.02001953125, + "learning_rate": 0.0001, + "loss": 6.0967, + "loss/crossentropy": 2.6082998514175415, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19298158586025238, + "step": 10030 + }, + { + "epoch": 0.3135, + "grad_norm": 3.28125, + "grad_norm_var": 0.020995076497395834, + "learning_rate": 0.0001, + "loss": 5.9659, + "loss/crossentropy": 2.502140998840332, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18855996429920197, + "step": 10032 + }, + { + "epoch": 0.3135625, + "grad_norm": 3.328125, + "grad_norm_var": 0.02412109375, + "learning_rate": 0.0001, + "loss": 5.9479, + "loss/crossentropy": 2.6646758317947388, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17714965343475342, + "step": 10034 + }, + { + "epoch": 0.313625, + "grad_norm": 3.359375, + "grad_norm_var": 0.02877197265625, + "learning_rate": 0.0001, + "loss": 5.7889, + "loss/crossentropy": 2.542953610420227, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17225146293640137, + "step": 10036 + }, + { + "epoch": 0.3136875, + "grad_norm": 3.109375, + "grad_norm_var": 0.025446573893229168, + "learning_rate": 0.0001, + "loss": 6.0174, + "loss/crossentropy": 2.647829055786133, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1834377571940422, + "step": 10038 + }, + { + "epoch": 0.31375, + "grad_norm": 3.390625, + "grad_norm_var": 0.023921712239583334, + "learning_rate": 0.0001, + "loss": 6.0348, + "loss/crossentropy": 2.7294058799743652, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17897269129753113, + "step": 10040 + }, + { + "epoch": 0.3138125, + "grad_norm": 3.421875, + "grad_norm_var": 0.012523396809895834, + "learning_rate": 0.0001, + "loss": 5.7022, + "loss/crossentropy": 2.3868244886398315, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1737213134765625, + "step": 10042 + }, + { + "epoch": 0.313875, + "grad_norm": 3.34375, + "grad_norm_var": 0.012723795572916667, + "learning_rate": 0.0001, + "loss": 5.6612, + "loss/crossentropy": 2.4281164407730103, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17330902069807053, + "step": 10044 + }, + { + "epoch": 0.3139375, + "grad_norm": 3.46875, + "grad_norm_var": 0.020210774739583333, + "learning_rate": 0.0001, + "loss": 5.7424, + "loss/crossentropy": 2.547224283218384, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1652233824133873, + "step": 10046 + }, + { + "epoch": 0.314, + "grad_norm": 3.46875, + "grad_norm_var": 0.024559529622395833, + "learning_rate": 0.0001, + "loss": 5.9676, + "loss/crossentropy": 2.5773009061813354, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18824848532676697, + "step": 10048 + }, + { + "epoch": 0.3140625, + "grad_norm": 4.03125, + "grad_norm_var": 6.336457316080729, + "learning_rate": 0.0001, + "loss": 6.2463, + "loss/crossentropy": 2.6703532934188843, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.1993865668773651, + "step": 10050 + }, + { + "epoch": 0.314125, + "grad_norm": 3.4375, + "grad_norm_var": 6.288590494791666, + "learning_rate": 0.0001, + "loss": 6.0822, + "loss/crossentropy": 2.74504292011261, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17824263870716095, + "step": 10052 + }, + { + "epoch": 0.3141875, + "grad_norm": 3.359375, + "grad_norm_var": 6.258088175455729, + "learning_rate": 0.0001, + "loss": 5.9835, + "loss/crossentropy": 2.5916759967803955, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1848854348063469, + "step": 10054 + }, + { + "epoch": 0.31425, + "grad_norm": 3.265625, + "grad_norm_var": 6.292967732747396, + "learning_rate": 0.0001, + "loss": 5.7399, + "loss/crossentropy": 2.497455358505249, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17229175567626953, + "step": 10056 + }, + { + "epoch": 0.3143125, + "grad_norm": 3.625, + "grad_norm_var": 6.293008422851562, + "learning_rate": 0.0001, + "loss": 5.9707, + "loss/crossentropy": 2.6046427488327026, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1803584024310112, + "step": 10058 + }, + { + "epoch": 0.314375, + "grad_norm": 3.75, + "grad_norm_var": 6.237691243489583, + "learning_rate": 0.0001, + "loss": 5.9682, + "loss/crossentropy": 2.5612787008285522, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18444325774908066, + "step": 10060 + }, + { + "epoch": 0.3144375, + "grad_norm": 3.265625, + "grad_norm_var": 6.182445271809896, + "learning_rate": 0.0001, + "loss": 5.8448, + "loss/crossentropy": 2.4632593393325806, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18190263211727142, + "step": 10062 + }, + { + "epoch": 0.3145, + "grad_norm": 3.203125, + "grad_norm_var": 6.199735514322916, + "learning_rate": 0.0001, + "loss": 5.9259, + "loss/crossentropy": 2.5135574340820312, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18303150683641434, + "step": 10064 + }, + { + "epoch": 0.3145625, + "grad_norm": 3.609375, + "grad_norm_var": 0.03591206868489583, + "learning_rate": 0.0001, + "loss": 5.9092, + "loss/crossentropy": 2.597083568572998, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17964644730091095, + "step": 10066 + }, + { + "epoch": 0.314625, + "grad_norm": 3.390625, + "grad_norm_var": 0.03460286458333333, + "learning_rate": 0.0001, + "loss": 5.7752, + "loss/crossentropy": 2.3850090503692627, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18081103265285492, + "step": 10068 + }, + { + "epoch": 0.3146875, + "grad_norm": 3.8125, + "grad_norm_var": 0.07229410807291667, + "learning_rate": 0.0001, + "loss": 6.113, + "loss/crossentropy": 2.440373659133911, + "loss/hidden": 1.71484375, + "loss/jsd": 0.0, + "loss/logits": 0.1957770213484764, + "step": 10070 + }, + { + "epoch": 0.31475, + "grad_norm": 3.828125, + "grad_norm_var": 0.06702067057291666, + "learning_rate": 0.0001, + "loss": 6.0065, + "loss/crossentropy": 2.573858380317688, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1877976655960083, + "step": 10072 + }, + { + "epoch": 0.3148125, + "grad_norm": 3.46875, + "grad_norm_var": 0.07527567545572916, + "learning_rate": 0.0001, + "loss": 5.9498, + "loss/crossentropy": 2.566790461540222, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18361780047416687, + "step": 10074 + }, + { + "epoch": 0.314875, + "grad_norm": 4.0625, + "grad_norm_var": 0.09072265625, + "learning_rate": 0.0001, + "loss": 6.2288, + "loss/crossentropy": 2.6813313961029053, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19654232263565063, + "step": 10076 + }, + { + "epoch": 0.3149375, + "grad_norm": 3.71875, + "grad_norm_var": 0.0894927978515625, + "learning_rate": 0.0001, + "loss": 6.2026, + "loss/crossentropy": 2.6554404497146606, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19768338650465012, + "step": 10078 + }, + { + "epoch": 0.315, + "grad_norm": 3.453125, + "grad_norm_var": 0.0810699462890625, + "learning_rate": 0.0001, + "loss": 5.9567, + "loss/crossentropy": 2.5344364643096924, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18792706727981567, + "step": 10080 + }, + { + "epoch": 0.3150625, + "grad_norm": 3.3125, + "grad_norm_var": 0.08329671223958333, + "learning_rate": 0.0001, + "loss": 5.9447, + "loss/crossentropy": 2.5786601305007935, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18503838777542114, + "step": 10082 + }, + { + "epoch": 0.315125, + "grad_norm": 3.28125, + "grad_norm_var": 0.08629150390625, + "learning_rate": 0.0001, + "loss": 6.0267, + "loss/crossentropy": 2.615303635597229, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.188794806599617, + "step": 10084 + }, + { + "epoch": 0.3151875, + "grad_norm": 3.03125, + "grad_norm_var": 0.09729410807291666, + "learning_rate": 0.0001, + "loss": 5.6881, + "loss/crossentropy": 2.497081160545349, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17105385661125183, + "step": 10086 + }, + { + "epoch": 0.31525, + "grad_norm": 3.59375, + "grad_norm_var": 0.09480692545572916, + "learning_rate": 0.0001, + "loss": 6.041, + "loss/crossentropy": 2.6185524463653564, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18833978474140167, + "step": 10088 + }, + { + "epoch": 0.3153125, + "grad_norm": 4.34375, + "grad_norm_var": 0.1343414306640625, + "learning_rate": 0.0001, + "loss": 5.7858, + "loss/crossentropy": 2.496425151824951, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17620240151882172, + "step": 10090 + }, + { + "epoch": 0.315375, + "grad_norm": 3.484375, + "grad_norm_var": 0.11458231608072916, + "learning_rate": 0.0001, + "loss": 5.837, + "loss/crossentropy": 2.5144152641296387, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17874548584222794, + "step": 10092 + }, + { + "epoch": 0.3154375, + "grad_norm": 3.59375, + "grad_norm_var": 0.1048004150390625, + "learning_rate": 0.0001, + "loss": 5.8075, + "loss/crossentropy": 2.473135471343994, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17914441972970963, + "step": 10094 + }, + { + "epoch": 0.3155, + "grad_norm": 3.34375, + "grad_norm_var": 0.10533854166666666, + "learning_rate": 0.0001, + "loss": 6.141, + "loss/crossentropy": 2.6939806938171387, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1872761845588684, + "step": 10096 + }, + { + "epoch": 0.3155625, + "grad_norm": 3.234375, + "grad_norm_var": 0.0986968994140625, + "learning_rate": 0.0001, + "loss": 5.4468, + "loss/crossentropy": 2.274045705795288, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16727469116449356, + "step": 10098 + }, + { + "epoch": 0.315625, + "grad_norm": 3.03125, + "grad_norm_var": 0.105078125, + "learning_rate": 0.0001, + "loss": 6.0835, + "loss/crossentropy": 2.676869750022888, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18910247087478638, + "step": 10100 + }, + { + "epoch": 0.3156875, + "grad_norm": 3.0625, + "grad_norm_var": 0.0931640625, + "learning_rate": 0.0001, + "loss": 5.7976, + "loss/crossentropy": 2.5403417348861694, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17455856502056122, + "step": 10102 + }, + { + "epoch": 0.31575, + "grad_norm": 3.25, + "grad_norm_var": 0.08815104166666667, + "learning_rate": 0.0001, + "loss": 5.8803, + "loss/crossentropy": 2.5339393615722656, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.17721383273601532, + "step": 10104 + }, + { + "epoch": 0.3158125, + "grad_norm": 3.25, + "grad_norm_var": 0.022705078125, + "learning_rate": 0.0001, + "loss": 5.8428, + "loss/crossentropy": 2.582388997077942, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17447850108146667, + "step": 10106 + }, + { + "epoch": 0.315875, + "grad_norm": 3.5, + "grad_norm_var": 0.9008748372395833, + "learning_rate": 0.0001, + "loss": 6.0148, + "loss/crossentropy": 2.483462333679199, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.19024056941270828, + "step": 10108 + }, + { + "epoch": 0.3159375, + "grad_norm": 3.234375, + "grad_norm_var": 0.9079264322916667, + "learning_rate": 0.0001, + "loss": 5.6382, + "loss/crossentropy": 2.321375846862793, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17738066613674164, + "step": 10110 + }, + { + "epoch": 0.316, + "grad_norm": 3.421875, + "grad_norm_var": 0.9083292643229167, + "learning_rate": 0.0001, + "loss": 6.0126, + "loss/crossentropy": 2.585088610649109, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18845763057470322, + "step": 10112 + }, + { + "epoch": 0.3160625, + "grad_norm": 3.609375, + "grad_norm_var": 0.8969390869140625, + "learning_rate": 0.0001, + "loss": 5.8656, + "loss/crossentropy": 2.4846831560134888, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18340729176998138, + "step": 10114 + }, + { + "epoch": 0.316125, + "grad_norm": 3.328125, + "grad_norm_var": 0.8772420247395833, + "learning_rate": 0.0001, + "loss": 6.2419, + "loss/crossentropy": 2.7272180318832397, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1928756982088089, + "step": 10116 + }, + { + "epoch": 0.3161875, + "grad_norm": 3.3125, + "grad_norm_var": 0.860009765625, + "learning_rate": 0.0001, + "loss": 6.0352, + "loss/crossentropy": 2.6085673570632935, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18601831048727036, + "step": 10118 + }, + { + "epoch": 0.31625, + "grad_norm": 3.625, + "grad_norm_var": 0.8505930582682292, + "learning_rate": 0.0001, + "loss": 5.9706, + "loss/crossentropy": 2.563066244125366, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18918965011835098, + "step": 10120 + }, + { + "epoch": 0.3163125, + "grad_norm": 3.1875, + "grad_norm_var": 0.86148681640625, + "learning_rate": 0.0001, + "loss": 5.9185, + "loss/crossentropy": 2.5656826496124268, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18020518869161606, + "step": 10122 + }, + { + "epoch": 0.316375, + "grad_norm": 3.28125, + "grad_norm_var": 0.027269490559895835, + "learning_rate": 0.0001, + "loss": 6.2062, + "loss/crossentropy": 2.751970887184143, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18839187920093536, + "step": 10124 + }, + { + "epoch": 0.3164375, + "grad_norm": 3.53125, + "grad_norm_var": 0.033492024739583334, + "learning_rate": 0.0001, + "loss": 6.0275, + "loss/crossentropy": 2.6042853593826294, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18685470521450043, + "step": 10126 + }, + { + "epoch": 0.3165, + "grad_norm": 3.734375, + "grad_norm_var": 0.0392730712890625, + "learning_rate": 0.0001, + "loss": 6.4755, + "loss/crossentropy": 2.923148512840271, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.1942974030971527, + "step": 10128 + }, + { + "epoch": 0.3165625, + "grad_norm": 3.421875, + "grad_norm_var": 0.03551025390625, + "learning_rate": 0.0001, + "loss": 6.4908, + "loss/crossentropy": 2.9329782724380493, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19796500355005264, + "step": 10130 + }, + { + "epoch": 0.316625, + "grad_norm": 3.0625, + "grad_norm_var": 0.04715169270833333, + "learning_rate": 0.0001, + "loss": 5.8301, + "loss/crossentropy": 2.6023718118667603, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1727747619152069, + "step": 10132 + }, + { + "epoch": 0.3166875, + "grad_norm": 2.984375, + "grad_norm_var": 0.062333170572916666, + "learning_rate": 0.0001, + "loss": 5.7403, + "loss/crossentropy": 2.5091053247451782, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17312300950288773, + "step": 10134 + }, + { + "epoch": 0.31675, + "grad_norm": 3.546875, + "grad_norm_var": 0.05915425618489583, + "learning_rate": 0.0001, + "loss": 5.4448, + "loss/crossentropy": 2.1704102754592896, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1704123616218567, + "step": 10136 + }, + { + "epoch": 0.3168125, + "grad_norm": 3.453125, + "grad_norm_var": 0.05845947265625, + "learning_rate": 0.0001, + "loss": 6.0383, + "loss/crossentropy": 2.6399617195129395, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18436075747013092, + "step": 10138 + }, + { + "epoch": 0.316875, + "grad_norm": 3.3125, + "grad_norm_var": 0.0577789306640625, + "learning_rate": 0.0001, + "loss": 6.2752, + "loss/crossentropy": 2.7846546173095703, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1935882568359375, + "step": 10140 + }, + { + "epoch": 0.3169375, + "grad_norm": 3.671875, + "grad_norm_var": 0.06409098307291666, + "learning_rate": 0.0001, + "loss": 5.9632, + "loss/crossentropy": 2.5568896532058716, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18750131130218506, + "step": 10142 + }, + { + "epoch": 0.317, + "grad_norm": 3.578125, + "grad_norm_var": 0.058470662434895834, + "learning_rate": 0.0001, + "loss": 6.0904, + "loss/crossentropy": 2.6694942712783813, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18897011131048203, + "step": 10144 + }, + { + "epoch": 0.3170625, + "grad_norm": 3.21875, + "grad_norm_var": 0.08661702473958334, + "learning_rate": 0.0001, + "loss": 6.068, + "loss/crossentropy": 2.649327874183655, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18718105554580688, + "step": 10146 + }, + { + "epoch": 0.317125, + "grad_norm": 3.3125, + "grad_norm_var": 0.07558492024739584, + "learning_rate": 0.0001, + "loss": 5.599, + "loss/crossentropy": 2.344407320022583, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17507170885801315, + "step": 10148 + }, + { + "epoch": 0.3171875, + "grad_norm": 3.46875, + "grad_norm_var": 0.055078125, + "learning_rate": 0.0001, + "loss": 5.8624, + "loss/crossentropy": 2.5185035467147827, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18165487796068192, + "step": 10150 + }, + { + "epoch": 0.31725, + "grad_norm": 3.46875, + "grad_norm_var": 0.050169881184895834, + "learning_rate": 0.0001, + "loss": 5.8706, + "loss/crossentropy": 2.4989309310913086, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18326275050640106, + "step": 10152 + }, + { + "epoch": 0.3173125, + "grad_norm": 3.609375, + "grad_norm_var": 0.050553385416666666, + "learning_rate": 0.0001, + "loss": 6.2412, + "loss/crossentropy": 2.77180016040802, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.19381748884916306, + "step": 10154 + }, + { + "epoch": 0.317375, + "grad_norm": 3.5625, + "grad_norm_var": 0.05331624348958333, + "learning_rate": 0.0001, + "loss": 5.8568, + "loss/crossentropy": 2.56472384929657, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17686709016561508, + "step": 10156 + }, + { + "epoch": 0.3174375, + "grad_norm": 3.296875, + "grad_norm_var": 0.04781494140625, + "learning_rate": 0.0001, + "loss": 5.5883, + "loss/crossentropy": 2.3735530376434326, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17225316166877747, + "step": 10158 + }, + { + "epoch": 0.3175, + "grad_norm": 3.6875, + "grad_norm_var": 0.058447265625, + "learning_rate": 0.0001, + "loss": 6.2468, + "loss/crossentropy": 2.710073947906494, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19585628807544708, + "step": 10160 + }, + { + "epoch": 0.3175625, + "grad_norm": 3.359375, + "grad_norm_var": 0.032938639322916664, + "learning_rate": 0.0001, + "loss": 5.4335, + "loss/crossentropy": 2.1610593795776367, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1721634864807129, + "step": 10162 + }, + { + "epoch": 0.317625, + "grad_norm": 3.109375, + "grad_norm_var": 0.03828837076822917, + "learning_rate": 0.0001, + "loss": 6.2081, + "loss/crossentropy": 2.792189121246338, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18768785893917084, + "step": 10164 + }, + { + "epoch": 0.3176875, + "grad_norm": 3.265625, + "grad_norm_var": 0.04657796223958333, + "learning_rate": 0.0001, + "loss": 5.8877, + "loss/crossentropy": 2.6299341917037964, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17616848647594452, + "step": 10166 + }, + { + "epoch": 0.31775, + "grad_norm": 3.546875, + "grad_norm_var": 0.052708943684895836, + "learning_rate": 0.0001, + "loss": 6.1424, + "loss/crossentropy": 2.760109543800354, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1827574521303177, + "step": 10168 + }, + { + "epoch": 0.3178125, + "grad_norm": 3.671875, + "grad_norm_var": 0.059992472330729164, + "learning_rate": 0.0001, + "loss": 5.8406, + "loss/crossentropy": 2.468233346939087, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18216142058372498, + "step": 10170 + }, + { + "epoch": 0.317875, + "grad_norm": 3.25, + "grad_norm_var": 0.055516560872395836, + "learning_rate": 0.0001, + "loss": 6.0491, + "loss/crossentropy": 2.6257340908050537, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1895984634757042, + "step": 10172 + }, + { + "epoch": 0.3179375, + "grad_norm": 3.484375, + "grad_norm_var": 0.053254191080729166, + "learning_rate": 0.0001, + "loss": 6.0024, + "loss/crossentropy": 2.6600505113601685, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18228283524513245, + "step": 10174 + }, + { + "epoch": 0.318, + "grad_norm": 4.4375, + "grad_norm_var": 0.11450093587239583, + "learning_rate": 0.0001, + "loss": 5.8633, + "loss/crossentropy": 2.492933988571167, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18430470675230026, + "step": 10176 + }, + { + "epoch": 0.3180625, + "grad_norm": 3.265625, + "grad_norm_var": 0.11482645670572916, + "learning_rate": 0.0001, + "loss": 5.48, + "loss/crossentropy": 2.2273969650268555, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17291437089443207, + "step": 10178 + }, + { + "epoch": 0.318125, + "grad_norm": 3.5, + "grad_norm_var": 0.11145426432291666, + "learning_rate": 0.0001, + "loss": 6.0786, + "loss/crossentropy": 2.6331560611724854, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1918104961514473, + "step": 10180 + }, + { + "epoch": 0.3181875, + "grad_norm": 3.359375, + "grad_norm_var": 0.10051676432291666, + "learning_rate": 0.0001, + "loss": 6.0399, + "loss/crossentropy": 2.6284717321395874, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.185287743806839, + "step": 10182 + }, + { + "epoch": 0.31825, + "grad_norm": 3.1875, + "grad_norm_var": 0.10147196451822917, + "learning_rate": 0.0001, + "loss": 5.739, + "loss/crossentropy": 2.525644898414612, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1740656942129135, + "step": 10184 + }, + { + "epoch": 0.3183125, + "grad_norm": 3.296875, + "grad_norm_var": 0.09661458333333334, + "learning_rate": 0.0001, + "loss": 5.705, + "loss/crossentropy": 2.35819411277771, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17960207164287567, + "step": 10186 + }, + { + "epoch": 0.318375, + "grad_norm": 3.1875, + "grad_norm_var": 0.09866129557291667, + "learning_rate": 0.0001, + "loss": 5.9201, + "loss/crossentropy": 2.6530667543411255, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1763104498386383, + "step": 10188 + }, + { + "epoch": 0.3184375, + "grad_norm": 4.25, + "grad_norm_var": 0.14165751139322916, + "learning_rate": 0.0001, + "loss": 5.7726, + "loss/crossentropy": 2.3765740394592285, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18217645585536957, + "step": 10190 + }, + { + "epoch": 0.3185, + "grad_norm": 3.3125, + "grad_norm_var": 0.072509765625, + "learning_rate": 0.0001, + "loss": 5.9997, + "loss/crossentropy": 2.617422342300415, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18666477501392365, + "step": 10192 + }, + { + "epoch": 0.3185625, + "grad_norm": 3.515625, + "grad_norm_var": 0.07060139973958333, + "learning_rate": 0.0001, + "loss": 6.0119, + "loss/crossentropy": 2.6309128999710083, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18614888191223145, + "step": 10194 + }, + { + "epoch": 0.318625, + "grad_norm": 3.3125, + "grad_norm_var": 0.07162984212239583, + "learning_rate": 0.0001, + "loss": 6.0888, + "loss/crossentropy": 2.712532877922058, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1876249685883522, + "step": 10196 + }, + { + "epoch": 0.3186875, + "grad_norm": 3.28125, + "grad_norm_var": 0.1028228759765625, + "learning_rate": 0.0001, + "loss": 6.3096, + "loss/crossentropy": 2.7521666288375854, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.1951950639486313, + "step": 10198 + }, + { + "epoch": 0.31875, + "grad_norm": 3.453125, + "grad_norm_var": 0.08837483723958334, + "learning_rate": 0.0001, + "loss": 5.8462, + "loss/crossentropy": 2.4042731523513794, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.19028404355049133, + "step": 10200 + }, + { + "epoch": 0.3188125, + "grad_norm": 4.71875, + "grad_norm_var": 0.18262430826822917, + "learning_rate": 0.0001, + "loss": 5.8477, + "loss/crossentropy": 2.4556018114089966, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.17670895904302597, + "step": 10202 + }, + { + "epoch": 0.318875, + "grad_norm": 3.9375, + "grad_norm_var": 0.17320556640625, + "learning_rate": 0.0001, + "loss": 6.0425, + "loss/crossentropy": 2.592575192451477, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18679186701774597, + "step": 10204 + }, + { + "epoch": 0.3189375, + "grad_norm": 4.625, + "grad_norm_var": 0.22554931640625, + "learning_rate": 0.0001, + "loss": 6.1813, + "loss/crossentropy": 2.7186423540115356, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1892346814274788, + "step": 10206 + }, + { + "epoch": 0.319, + "grad_norm": 3.265625, + "grad_norm_var": 0.23151041666666666, + "learning_rate": 0.0001, + "loss": 5.8299, + "loss/crossentropy": 2.483436942100525, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18112580478191376, + "step": 10208 + }, + { + "epoch": 0.3190625, + "grad_norm": 3.71875, + "grad_norm_var": 0.23720296223958334, + "learning_rate": 0.0001, + "loss": 6.1856, + "loss/crossentropy": 2.73944354057312, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18875248730182648, + "step": 10210 + }, + { + "epoch": 0.319125, + "grad_norm": 3.21875, + "grad_norm_var": 0.24117431640625, + "learning_rate": 0.0001, + "loss": 5.8642, + "loss/crossentropy": 2.4610995054244995, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18523633480072021, + "step": 10212 + }, + { + "epoch": 0.3191875, + "grad_norm": 3.71875, + "grad_norm_var": 0.22043355305989584, + "learning_rate": 0.0001, + "loss": 5.7804, + "loss/crossentropy": 2.481137990951538, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17562881857156754, + "step": 10214 + }, + { + "epoch": 0.31925, + "grad_norm": 3.53125, + "grad_norm_var": 0.22139383951822916, + "learning_rate": 0.0001, + "loss": 6.2475, + "loss/crossentropy": 2.704582691192627, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.1917915642261505, + "step": 10216 + }, + { + "epoch": 0.3193125, + "grad_norm": 4.625, + "grad_norm_var": 0.20972391764322917, + "learning_rate": 0.0001, + "loss": 5.7252, + "loss/crossentropy": 2.357407569885254, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18209270387887955, + "step": 10218 + }, + { + "epoch": 0.319375, + "grad_norm": 3.03125, + "grad_norm_var": 0.223779296875, + "learning_rate": 0.0001, + "loss": 5.6514, + "loss/crossentropy": 2.3752524852752686, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17409750074148178, + "step": 10220 + }, + { + "epoch": 0.3194375, + "grad_norm": 3.96875, + "grad_norm_var": 0.15097554524739584, + "learning_rate": 0.0001, + "loss": 6.2729, + "loss/crossentropy": 2.6497918367385864, + "loss/hidden": 1.66796875, + "loss/jsd": 0.0, + "loss/logits": 0.19551673531532288, + "step": 10222 + }, + { + "epoch": 0.3195, + "grad_norm": 2.984375, + "grad_norm_var": 0.17255757649739584, + "learning_rate": 0.0001, + "loss": 5.8602, + "loss/crossentropy": 2.5426958799362183, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1809680014848709, + "step": 10224 + }, + { + "epoch": 0.3195625, + "grad_norm": 3.3125, + "grad_norm_var": 0.17149149576822917, + "learning_rate": 0.0001, + "loss": 5.8726, + "loss/crossentropy": 2.609194755554199, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17633941769599915, + "step": 10226 + }, + { + "epoch": 0.319625, + "grad_norm": 3.328125, + "grad_norm_var": 0.16856180826822917, + "learning_rate": 0.0001, + "loss": 6.0416, + "loss/crossentropy": 2.5963852405548096, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18749169260263443, + "step": 10228 + }, + { + "epoch": 0.3196875, + "grad_norm": 3.5625, + "grad_norm_var": 0.16494038899739583, + "learning_rate": 0.0001, + "loss": 6.0253, + "loss/crossentropy": 2.5735563039779663, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18892797082662582, + "step": 10230 + }, + { + "epoch": 0.31975, + "grad_norm": 3.0625, + "grad_norm_var": 0.18153889973958334, + "learning_rate": 0.0001, + "loss": 5.8955, + "loss/crossentropy": 2.6234103441238403, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17604166269302368, + "step": 10232 + }, + { + "epoch": 0.3198125, + "grad_norm": 4.0625, + "grad_norm_var": 0.10650126139322917, + "learning_rate": 0.0001, + "loss": 6.102, + "loss/crossentropy": 2.589194655418396, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1950305849313736, + "step": 10234 + }, + { + "epoch": 0.319875, + "grad_norm": 3.421875, + "grad_norm_var": 0.09208882649739583, + "learning_rate": 0.0001, + "loss": 6.0808, + "loss/crossentropy": 2.5738414525985718, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1963956654071808, + "step": 10236 + }, + { + "epoch": 0.3199375, + "grad_norm": 3.1875, + "grad_norm_var": 0.0952789306640625, + "learning_rate": 0.0001, + "loss": 5.8711, + "loss/crossentropy": 2.5417840480804443, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1766820102930069, + "step": 10238 + }, + { + "epoch": 0.32, + "grad_norm": 3.28125, + "grad_norm_var": 0.08124593098958334, + "learning_rate": 0.0001, + "loss": 5.8375, + "loss/crossentropy": 2.502975821495056, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17954809218645096, + "step": 10240 + }, + { + "epoch": 0.3200625, + "grad_norm": 3.015625, + "grad_norm_var": 0.09079488118489583, + "learning_rate": 0.0001, + "loss": 6.0292, + "loss/crossentropy": 2.710159420967102, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1795569211244583, + "step": 10242 + }, + { + "epoch": 0.320125, + "grad_norm": 3.390625, + "grad_norm_var": 0.0902740478515625, + "learning_rate": 0.0001, + "loss": 6.0092, + "loss/crossentropy": 2.625326156616211, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18369631469249725, + "step": 10244 + }, + { + "epoch": 0.3201875, + "grad_norm": 3.390625, + "grad_norm_var": 0.2173980712890625, + "learning_rate": 0.0001, + "loss": 5.8772, + "loss/crossentropy": 2.528961658477783, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.17349748313426971, + "step": 10246 + }, + { + "epoch": 0.32025, + "grad_norm": 3.375, + "grad_norm_var": 0.1926177978515625, + "learning_rate": 0.0001, + "loss": 6.0081, + "loss/crossentropy": 2.6666018962860107, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18024158477783203, + "step": 10248 + }, + { + "epoch": 0.3203125, + "grad_norm": 3.921875, + "grad_norm_var": 0.1835357666015625, + "learning_rate": 0.0001, + "loss": 6.2809, + "loss/crossentropy": 2.6556284427642822, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.20627307891845703, + "step": 10250 + }, + { + "epoch": 0.320375, + "grad_norm": 3.546875, + "grad_norm_var": 0.18212890625, + "learning_rate": 0.0001, + "loss": 6.0027, + "loss/crossentropy": 2.710022449493408, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17575644701719284, + "step": 10252 + }, + { + "epoch": 0.3204375, + "grad_norm": 3.828125, + "grad_norm_var": 0.172119140625, + "learning_rate": 0.0001, + "loss": 6.0304, + "loss/crossentropy": 2.572240948677063, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.18682856857776642, + "step": 10254 + }, + { + "epoch": 0.3205, + "grad_norm": 4.28125, + "grad_norm_var": 0.19068094889322917, + "learning_rate": 0.0001, + "loss": 5.9118, + "loss/crossentropy": 2.43207323551178, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.18937978148460388, + "step": 10256 + }, + { + "epoch": 0.3205625, + "grad_norm": 3.125, + "grad_norm_var": 0.18294270833333334, + "learning_rate": 0.0001, + "loss": 6.0053, + "loss/crossentropy": 2.6684393882751465, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1805582493543625, + "step": 10258 + }, + { + "epoch": 0.320625, + "grad_norm": 3.078125, + "grad_norm_var": 0.19446614583333333, + "learning_rate": 0.0001, + "loss": 5.8534, + "loss/crossentropy": 2.5321428775787354, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17939115315675735, + "step": 10260 + }, + { + "epoch": 0.3206875, + "grad_norm": 3.21875, + "grad_norm_var": 0.0946685791015625, + "learning_rate": 0.0001, + "loss": 5.935, + "loss/crossentropy": 2.6070172786712646, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18084676563739777, + "step": 10262 + }, + { + "epoch": 0.32075, + "grad_norm": 3.53125, + "grad_norm_var": 0.10591532389322916, + "learning_rate": 0.0001, + "loss": 5.7859, + "loss/crossentropy": 2.498133897781372, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17604473233222961, + "step": 10264 + }, + { + "epoch": 0.3208125, + "grad_norm": 3.28125, + "grad_norm_var": 0.09695638020833333, + "learning_rate": 0.0001, + "loss": 5.8425, + "loss/crossentropy": 2.546578049659729, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1776386946439743, + "step": 10266 + }, + { + "epoch": 0.320875, + "grad_norm": 3.25, + "grad_norm_var": 0.10373433430989583, + "learning_rate": 0.0001, + "loss": 6.0768, + "loss/crossentropy": 2.699702501296997, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18419396877288818, + "step": 10268 + }, + { + "epoch": 0.3209375, + "grad_norm": 3.234375, + "grad_norm_var": 0.09401041666666667, + "learning_rate": 0.0001, + "loss": 5.9828, + "loss/crossentropy": 2.5811959505081177, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18703202903270721, + "step": 10270 + }, + { + "epoch": 0.321, + "grad_norm": 3.125, + "grad_norm_var": 0.043017578125, + "learning_rate": 0.0001, + "loss": 5.7115, + "loss/crossentropy": 2.476984977722168, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17384564876556396, + "step": 10272 + }, + { + "epoch": 0.3210625, + "grad_norm": 3.296875, + "grad_norm_var": 0.0375396728515625, + "learning_rate": 0.0001, + "loss": 6.1687, + "loss/crossentropy": 2.780236601829529, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18338096141815186, + "step": 10274 + }, + { + "epoch": 0.321125, + "grad_norm": 3.296875, + "grad_norm_var": 0.03824462890625, + "learning_rate": 0.0001, + "loss": 6.0086, + "loss/crossentropy": 2.6080679893493652, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18693005293607712, + "step": 10276 + }, + { + "epoch": 0.3211875, + "grad_norm": 3.40625, + "grad_norm_var": 0.0389556884765625, + "learning_rate": 0.0001, + "loss": 6.034, + "loss/crossentropy": 2.629909634590149, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1849387288093567, + "step": 10278 + }, + { + "epoch": 0.32125, + "grad_norm": 3.234375, + "grad_norm_var": 0.04850972493489583, + "learning_rate": 0.0001, + "loss": 6.0483, + "loss/crossentropy": 2.610661268234253, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18985499441623688, + "step": 10280 + }, + { + "epoch": 0.3213125, + "grad_norm": 3.65625, + "grad_norm_var": 0.054784138997395836, + "learning_rate": 0.0001, + "loss": 5.9443, + "loss/crossentropy": 2.598568916320801, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1841811239719391, + "step": 10282 + }, + { + "epoch": 0.321375, + "grad_norm": 3.0, + "grad_norm_var": 0.05699869791666667, + "learning_rate": 0.0001, + "loss": 5.8561, + "loss/crossentropy": 2.504266142845154, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18322645872831345, + "step": 10284 + }, + { + "epoch": 0.3214375, + "grad_norm": 3.78125, + "grad_norm_var": 0.06689453125, + "learning_rate": 0.0001, + "loss": 5.8586, + "loss/crossentropy": 2.497063636779785, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18068519979715347, + "step": 10286 + }, + { + "epoch": 0.3215, + "grad_norm": 3.453125, + "grad_norm_var": 0.0549713134765625, + "learning_rate": 0.0001, + "loss": 5.9398, + "loss/crossentropy": 2.5412371158599854, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18595405668020248, + "step": 10288 + }, + { + "epoch": 0.3215625, + "grad_norm": 3.4375, + "grad_norm_var": 0.054833984375, + "learning_rate": 0.0001, + "loss": 6.2372, + "loss/crossentropy": 2.7960572242736816, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18982186168432236, + "step": 10290 + }, + { + "epoch": 0.321625, + "grad_norm": 3.390625, + "grad_norm_var": 0.06021728515625, + "learning_rate": 0.0001, + "loss": 6.0493, + "loss/crossentropy": 2.7268223762512207, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18107493221759796, + "step": 10292 + }, + { + "epoch": 0.3216875, + "grad_norm": 3.3125, + "grad_norm_var": 0.0586822509765625, + "learning_rate": 0.0001, + "loss": 5.9802, + "loss/crossentropy": 2.5619630813598633, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1898685246706009, + "step": 10294 + }, + { + "epoch": 0.32175, + "grad_norm": 3.125, + "grad_norm_var": 0.04859619140625, + "learning_rate": 0.0001, + "loss": 5.959, + "loss/crossentropy": 2.6295331716537476, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18021129816770554, + "step": 10296 + }, + { + "epoch": 0.3218125, + "grad_norm": 3.3125, + "grad_norm_var": 0.046484375, + "learning_rate": 0.0001, + "loss": 6.1336, + "loss/crossentropy": 2.800679564476013, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18172965943813324, + "step": 10298 + }, + { + "epoch": 0.321875, + "grad_norm": 3.140625, + "grad_norm_var": 0.038309733072916664, + "learning_rate": 0.0001, + "loss": 5.9261, + "loss/crossentropy": 2.616189479827881, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18059717118740082, + "step": 10300 + }, + { + "epoch": 0.3219375, + "grad_norm": 3.46875, + "grad_norm_var": 0.060179646809895834, + "learning_rate": 0.0001, + "loss": 6.3866, + "loss/crossentropy": 2.903854727745056, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19163402915000916, + "step": 10302 + }, + { + "epoch": 0.322, + "grad_norm": 3.21875, + "grad_norm_var": 0.060933430989583336, + "learning_rate": 0.0001, + "loss": 5.8206, + "loss/crossentropy": 2.4988759756088257, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17787177860736847, + "step": 10304 + }, + { + "epoch": 0.3220625, + "grad_norm": 3.109375, + "grad_norm_var": 0.06848042805989583, + "learning_rate": 0.0001, + "loss": 5.943, + "loss/crossentropy": 2.5184680223464966, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.18386346101760864, + "step": 10306 + }, + { + "epoch": 0.322125, + "grad_norm": 3.21875, + "grad_norm_var": 0.06252339680989584, + "learning_rate": 0.0001, + "loss": 5.9365, + "loss/crossentropy": 2.560743570327759, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18444863706827164, + "step": 10308 + }, + { + "epoch": 0.3221875, + "grad_norm": 3.171875, + "grad_norm_var": 0.06266988118489583, + "learning_rate": 0.0001, + "loss": 5.896, + "loss/crossentropy": 2.492717981338501, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1879885271191597, + "step": 10310 + }, + { + "epoch": 0.32225, + "grad_norm": 3.359375, + "grad_norm_var": 0.08522847493489584, + "learning_rate": 0.0001, + "loss": 6.0193, + "loss/crossentropy": 2.629201889038086, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18197664618492126, + "step": 10312 + }, + { + "epoch": 0.3223125, + "grad_norm": 3.296875, + "grad_norm_var": 0.08351949055989584, + "learning_rate": 0.0001, + "loss": 5.6949, + "loss/crossentropy": 2.4657797813415527, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17134519666433334, + "step": 10314 + }, + { + "epoch": 0.322375, + "grad_norm": 3.5625, + "grad_norm_var": 0.0806549072265625, + "learning_rate": 0.0001, + "loss": 6.1845, + "loss/crossentropy": 2.742559552192688, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1852065622806549, + "step": 10316 + }, + { + "epoch": 0.3224375, + "grad_norm": 3.53125, + "grad_norm_var": 0.0503570556640625, + "learning_rate": 0.0001, + "loss": 5.5001, + "loss/crossentropy": 2.2537004947662354, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1726863980293274, + "step": 10318 + }, + { + "epoch": 0.3225, + "grad_norm": 3.453125, + "grad_norm_var": 0.04840087890625, + "learning_rate": 0.0001, + "loss": 6.0665, + "loss/crossentropy": 2.6084266901016235, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18799513578414917, + "step": 10320 + }, + { + "epoch": 0.3225625, + "grad_norm": 4.15625, + "grad_norm_var": 0.08440348307291666, + "learning_rate": 0.0001, + "loss": 5.7407, + "loss/crossentropy": 2.47977352142334, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1698397845029831, + "step": 10322 + }, + { + "epoch": 0.322625, + "grad_norm": 3.40625, + "grad_norm_var": 0.08185933430989584, + "learning_rate": 0.0001, + "loss": 5.7587, + "loss/crossentropy": 2.4150086641311646, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1800706535577774, + "step": 10324 + }, + { + "epoch": 0.3226875, + "grad_norm": 3.5625, + "grad_norm_var": 0.08221028645833334, + "learning_rate": 0.0001, + "loss": 5.5886, + "loss/crossentropy": 2.318339467048645, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1758519634604454, + "step": 10326 + }, + { + "epoch": 0.32275, + "grad_norm": 3.34375, + "grad_norm_var": 0.0698883056640625, + "learning_rate": 0.0001, + "loss": 5.5275, + "loss/crossentropy": 2.342996120452881, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1664949432015419, + "step": 10328 + }, + { + "epoch": 0.3228125, + "grad_norm": 3.21875, + "grad_norm_var": 0.06549072265625, + "learning_rate": 0.0001, + "loss": 6.0932, + "loss/crossentropy": 2.6780357360839844, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18683306872844696, + "step": 10330 + }, + { + "epoch": 0.322875, + "grad_norm": 3.53125, + "grad_norm_var": 0.0680816650390625, + "learning_rate": 0.0001, + "loss": 5.9287, + "loss/crossentropy": 2.5059865713119507, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1852380484342575, + "step": 10332 + }, + { + "epoch": 0.3229375, + "grad_norm": 3.359375, + "grad_norm_var": 0.06879781087239584, + "learning_rate": 0.0001, + "loss": 6.0371, + "loss/crossentropy": 2.7091665267944336, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1816255897283554, + "step": 10334 + }, + { + "epoch": 0.323, + "grad_norm": 3.140625, + "grad_norm_var": 0.07222391764322916, + "learning_rate": 0.0001, + "loss": 6.0552, + "loss/crossentropy": 2.6606714725494385, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1851576864719391, + "step": 10336 + }, + { + "epoch": 0.3230625, + "grad_norm": 3.203125, + "grad_norm_var": 0.028539021809895832, + "learning_rate": 0.0001, + "loss": 6.1826, + "loss/crossentropy": 2.755558729171753, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18450582027435303, + "step": 10338 + }, + { + "epoch": 0.323125, + "grad_norm": 3.484375, + "grad_norm_var": 0.03092041015625, + "learning_rate": 0.0001, + "loss": 5.6844, + "loss/crossentropy": 2.4298194646835327, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1707700565457344, + "step": 10340 + }, + { + "epoch": 0.3231875, + "grad_norm": 3.171875, + "grad_norm_var": 0.0276275634765625, + "learning_rate": 0.0001, + "loss": 6.0102, + "loss/crossentropy": 2.654059410095215, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18131651729345322, + "step": 10342 + }, + { + "epoch": 0.32325, + "grad_norm": 3.234375, + "grad_norm_var": 0.025121053059895832, + "learning_rate": 0.0001, + "loss": 5.7932, + "loss/crossentropy": 2.459676742553711, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.177101731300354, + "step": 10344 + }, + { + "epoch": 0.3233125, + "grad_norm": 3.28125, + "grad_norm_var": 0.025439453125, + "learning_rate": 0.0001, + "loss": 6.0264, + "loss/crossentropy": 2.617383599281311, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18972976505756378, + "step": 10346 + }, + { + "epoch": 0.323375, + "grad_norm": 3.46875, + "grad_norm_var": 0.018062337239583334, + "learning_rate": 0.0001, + "loss": 5.8327, + "loss/crossentropy": 2.4840651750564575, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18407892435789108, + "step": 10348 + }, + { + "epoch": 0.3234375, + "grad_norm": 3.203125, + "grad_norm_var": 0.0202301025390625, + "learning_rate": 0.0001, + "loss": 5.74, + "loss/crossentropy": 2.3936723470687866, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18033472448587418, + "step": 10350 + }, + { + "epoch": 0.3235, + "grad_norm": 3.390625, + "grad_norm_var": 0.018260701497395834, + "learning_rate": 0.0001, + "loss": 6.2428, + "loss/crossentropy": 2.749495029449463, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1915174424648285, + "step": 10352 + }, + { + "epoch": 0.3235625, + "grad_norm": 3.328125, + "grad_norm_var": 0.0174468994140625, + "learning_rate": 0.0001, + "loss": 5.4822, + "loss/crossentropy": 2.28355073928833, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.16634956747293472, + "step": 10354 + }, + { + "epoch": 0.323625, + "grad_norm": 3.40625, + "grad_norm_var": 0.016747029622395833, + "learning_rate": 0.0001, + "loss": 5.8807, + "loss/crossentropy": 2.597448945045471, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17598330974578857, + "step": 10356 + }, + { + "epoch": 0.3236875, + "grad_norm": 3.09375, + "grad_norm_var": 0.018903605143229165, + "learning_rate": 0.0001, + "loss": 5.9361, + "loss/crossentropy": 2.642319917678833, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1762532889842987, + "step": 10358 + }, + { + "epoch": 0.32375, + "grad_norm": 3.125, + "grad_norm_var": 0.020164998372395833, + "learning_rate": 0.0001, + "loss": 6.2062, + "loss/crossentropy": 2.7625609636306763, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18889529258012772, + "step": 10360 + }, + { + "epoch": 0.3238125, + "grad_norm": 3.3125, + "grad_norm_var": 0.017606608072916665, + "learning_rate": 0.0001, + "loss": 5.6625, + "loss/crossentropy": 2.37093186378479, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17876386642456055, + "step": 10362 + }, + { + "epoch": 0.323875, + "grad_norm": 3.046875, + "grad_norm_var": 0.020865885416666667, + "learning_rate": 0.0001, + "loss": 5.6234, + "loss/crossentropy": 2.4135403633117676, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1698107272386551, + "step": 10364 + }, + { + "epoch": 0.3239375, + "grad_norm": 3.21875, + "grad_norm_var": 0.021122233072916666, + "learning_rate": 0.0001, + "loss": 6.103, + "loss/crossentropy": 2.6791166067123413, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18613450974225998, + "step": 10366 + }, + { + "epoch": 0.324, + "grad_norm": 3.546875, + "grad_norm_var": 0.0278717041015625, + "learning_rate": 0.0001, + "loss": 6.0993, + "loss/crossentropy": 2.6597228050231934, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18770888447761536, + "step": 10368 + }, + { + "epoch": 0.3240625, + "grad_norm": 3.53125, + "grad_norm_var": 0.032731119791666666, + "learning_rate": 0.0001, + "loss": 6.1056, + "loss/crossentropy": 2.677014470100403, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1870013177394867, + "step": 10370 + }, + { + "epoch": 0.324125, + "grad_norm": 3.390625, + "grad_norm_var": 0.03764546712239583, + "learning_rate": 0.0001, + "loss": 5.7593, + "loss/crossentropy": 2.435485005378723, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18003712594509125, + "step": 10372 + }, + { + "epoch": 0.3241875, + "grad_norm": 3.25, + "grad_norm_var": 0.036408487955729166, + "learning_rate": 0.0001, + "loss": 6.0285, + "loss/crossentropy": 2.649515151977539, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18320664763450623, + "step": 10374 + }, + { + "epoch": 0.32425, + "grad_norm": 3.125, + "grad_norm_var": 0.043778483072916666, + "learning_rate": 0.0001, + "loss": 5.9564, + "loss/crossentropy": 2.605233073234558, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1812061071395874, + "step": 10376 + }, + { + "epoch": 0.3243125, + "grad_norm": 3.453125, + "grad_norm_var": 0.0540191650390625, + "learning_rate": 0.0001, + "loss": 5.8441, + "loss/crossentropy": 2.578033208847046, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.17075205594301224, + "step": 10378 + }, + { + "epoch": 0.324375, + "grad_norm": 3.109375, + "grad_norm_var": 0.05322265625, + "learning_rate": 0.0001, + "loss": 5.6596, + "loss/crossentropy": 2.445371389389038, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1725914552807808, + "step": 10380 + }, + { + "epoch": 0.3244375, + "grad_norm": 3.15625, + "grad_norm_var": 0.052229817708333334, + "learning_rate": 0.0001, + "loss": 5.9566, + "loss/crossentropy": 2.6489113569259644, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17686673998832703, + "step": 10382 + }, + { + "epoch": 0.3245, + "grad_norm": 3.15625, + "grad_norm_var": 0.04602762858072917, + "learning_rate": 0.0001, + "loss": 5.9523, + "loss/crossentropy": 2.565733313560486, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18435630202293396, + "step": 10384 + }, + { + "epoch": 0.3245625, + "grad_norm": 4.15625, + "grad_norm_var": 0.08870035807291667, + "learning_rate": 0.0001, + "loss": 5.6202, + "loss/crossentropy": 2.3419684171676636, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.17196565866470337, + "step": 10386 + }, + { + "epoch": 0.324625, + "grad_norm": 3.34375, + "grad_norm_var": 0.08421223958333333, + "learning_rate": 0.0001, + "loss": 5.8697, + "loss/crossentropy": 2.485108971595764, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1865016222000122, + "step": 10388 + }, + { + "epoch": 0.3246875, + "grad_norm": 3.5, + "grad_norm_var": 0.08633524576822917, + "learning_rate": 0.0001, + "loss": 6.2263, + "loss/crossentropy": 2.757042169570923, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1894996240735054, + "step": 10390 + }, + { + "epoch": 0.32475, + "grad_norm": 3.390625, + "grad_norm_var": 0.07402242024739583, + "learning_rate": 0.0001, + "loss": 5.8389, + "loss/crossentropy": 2.444661259651184, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18629730492830276, + "step": 10392 + }, + { + "epoch": 0.3248125, + "grad_norm": 3.609375, + "grad_norm_var": 0.07128804524739583, + "learning_rate": 0.0001, + "loss": 6.3191, + "loss/crossentropy": 2.7633039951324463, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1977638155221939, + "step": 10394 + }, + { + "epoch": 0.324875, + "grad_norm": 3.234375, + "grad_norm_var": 0.0710601806640625, + "learning_rate": 0.0001, + "loss": 5.8646, + "loss/crossentropy": 2.640702486038208, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17316973209381104, + "step": 10396 + }, + { + "epoch": 0.3249375, + "grad_norm": 3.4375, + "grad_norm_var": 0.06949462890625, + "learning_rate": 0.0001, + "loss": 5.791, + "loss/crossentropy": 2.443985104560852, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18001803755760193, + "step": 10398 + }, + { + "epoch": 0.325, + "grad_norm": 3.21875, + "grad_norm_var": 0.06363525390625, + "learning_rate": 0.0001, + "loss": 6.0377, + "loss/crossentropy": 2.6358437538146973, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1823723390698433, + "step": 10400 + }, + { + "epoch": 0.3250625, + "grad_norm": 3.296875, + "grad_norm_var": 0.017528279622395834, + "learning_rate": 0.0001, + "loss": 5.4682, + "loss/crossentropy": 2.22607159614563, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17265252023935318, + "step": 10402 + }, + { + "epoch": 0.325125, + "grad_norm": 3.3125, + "grad_norm_var": 0.025581868489583333, + "learning_rate": 0.0001, + "loss": 6.1358, + "loss/crossentropy": 2.7480798959732056, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18291470408439636, + "step": 10404 + }, + { + "epoch": 0.3251875, + "grad_norm": 4.03125, + "grad_norm_var": 0.051656087239583336, + "learning_rate": 0.0001, + "loss": 5.8878, + "loss/crossentropy": 2.5838598012924194, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17922098189592361, + "step": 10406 + }, + { + "epoch": 0.32525, + "grad_norm": 3.125, + "grad_norm_var": 0.05635477701822917, + "learning_rate": 0.0001, + "loss": 5.8983, + "loss/crossentropy": 2.529516100883484, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18218666315078735, + "step": 10408 + }, + { + "epoch": 0.3253125, + "grad_norm": 2.984375, + "grad_norm_var": 0.06767578125, + "learning_rate": 0.0001, + "loss": 5.6475, + "loss/crossentropy": 2.3896443843841553, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17344091832637787, + "step": 10410 + }, + { + "epoch": 0.325375, + "grad_norm": 3.09375, + "grad_norm_var": 0.06689453125, + "learning_rate": 0.0001, + "loss": 5.9586, + "loss/crossentropy": 2.6563864946365356, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1790490746498108, + "step": 10412 + }, + { + "epoch": 0.3254375, + "grad_norm": 3.5, + "grad_norm_var": 0.06845703125, + "learning_rate": 0.0001, + "loss": 6.1893, + "loss/crossentropy": 2.7665140628814697, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18915075063705444, + "step": 10414 + }, + { + "epoch": 0.3255, + "grad_norm": 3.484375, + "grad_norm_var": 0.12932027180989583, + "learning_rate": 0.0001, + "loss": 6.155, + "loss/crossentropy": 2.6996634006500244, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.1861589252948761, + "step": 10416 + }, + { + "epoch": 0.3255625, + "grad_norm": 4.28125, + "grad_norm_var": 0.1626617431640625, + "learning_rate": 0.0001, + "loss": 5.4045, + "loss/crossentropy": 2.205993890762329, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.16594918072223663, + "step": 10418 + }, + { + "epoch": 0.325625, + "grad_norm": 3.265625, + "grad_norm_var": 0.16363525390625, + "learning_rate": 0.0001, + "loss": 6.1563, + "loss/crossentropy": 2.702183246612549, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18955016881227493, + "step": 10420 + }, + { + "epoch": 0.3256875, + "grad_norm": 3.0, + "grad_norm_var": 0.1684722900390625, + "learning_rate": 0.0001, + "loss": 5.5715, + "loss/crossentropy": 2.3634437322616577, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17510025948286057, + "step": 10422 + }, + { + "epoch": 0.32575, + "grad_norm": 3.1875, + "grad_norm_var": 0.16743876139322916, + "learning_rate": 0.0001, + "loss": 5.9613, + "loss/crossentropy": 2.613492727279663, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.17852627485990524, + "step": 10424 + }, + { + "epoch": 0.3258125, + "grad_norm": 3.234375, + "grad_norm_var": 0.159619140625, + "learning_rate": 0.0001, + "loss": 6.1792, + "loss/crossentropy": 2.718513250350952, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19059736281633377, + "step": 10426 + }, + { + "epoch": 0.325875, + "grad_norm": 3.796875, + "grad_norm_var": 0.1602691650390625, + "learning_rate": 0.0001, + "loss": 5.9991, + "loss/crossentropy": 2.6648871898651123, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1795169711112976, + "step": 10428 + }, + { + "epoch": 0.3259375, + "grad_norm": 3.109375, + "grad_norm_var": 0.1678131103515625, + "learning_rate": 0.0001, + "loss": 5.809, + "loss/crossentropy": 2.539751172065735, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1761474683880806, + "step": 10430 + }, + { + "epoch": 0.326, + "grad_norm": 3.328125, + "grad_norm_var": 0.10966695149739583, + "learning_rate": 0.0001, + "loss": 6.1314, + "loss/crossentropy": 2.715307593345642, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.19004476815462112, + "step": 10432 + }, + { + "epoch": 0.3260625, + "grad_norm": 3.3125, + "grad_norm_var": 0.05855204264322917, + "learning_rate": 0.0001, + "loss": 6.1432, + "loss/crossentropy": 2.6461070775985718, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19385036826133728, + "step": 10434 + }, + { + "epoch": 0.326125, + "grad_norm": 3.3125, + "grad_norm_var": 0.05244140625, + "learning_rate": 0.0001, + "loss": 5.8139, + "loss/crossentropy": 2.513677954673767, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17338525503873825, + "step": 10436 + }, + { + "epoch": 0.3261875, + "grad_norm": 3.5625, + "grad_norm_var": 0.04198811848958333, + "learning_rate": 0.0001, + "loss": 5.869, + "loss/crossentropy": 2.5315924882888794, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18335412442684174, + "step": 10438 + }, + { + "epoch": 0.32625, + "grad_norm": 3.328125, + "grad_norm_var": 0.046418253580729166, + "learning_rate": 0.0001, + "loss": 5.8026, + "loss/crossentropy": 2.502164602279663, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17965249717235565, + "step": 10440 + }, + { + "epoch": 0.3263125, + "grad_norm": 3.25, + "grad_norm_var": 0.037495930989583336, + "learning_rate": 0.0001, + "loss": 6.0665, + "loss/crossentropy": 2.6249698400497437, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18907830864191055, + "step": 10442 + }, + { + "epoch": 0.326375, + "grad_norm": 3.875, + "grad_norm_var": 0.040257771809895836, + "learning_rate": 0.0001, + "loss": 6.3294, + "loss/crossentropy": 2.8112963438034058, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1947823166847229, + "step": 10444 + }, + { + "epoch": 0.3264375, + "grad_norm": 3.46875, + "grad_norm_var": 0.03654683430989583, + "learning_rate": 0.0001, + "loss": 6.0778, + "loss/crossentropy": 2.722153425216675, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18165677040815353, + "step": 10446 + }, + { + "epoch": 0.3265, + "grad_norm": 3.21875, + "grad_norm_var": 0.04531148274739583, + "learning_rate": 0.0001, + "loss": 5.9552, + "loss/crossentropy": 2.666857600212097, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17804977297782898, + "step": 10448 + }, + { + "epoch": 0.3265625, + "grad_norm": 3.703125, + "grad_norm_var": 0.05487874348958333, + "learning_rate": 0.0001, + "loss": 6.0745, + "loss/crossentropy": 2.7147200107574463, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18285037577152252, + "step": 10450 + }, + { + "epoch": 0.326625, + "grad_norm": 3.25, + "grad_norm_var": 0.055399576822916664, + "learning_rate": 0.0001, + "loss": 5.8654, + "loss/crossentropy": 2.5688791275024414, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1765284687280655, + "step": 10452 + }, + { + "epoch": 0.3266875, + "grad_norm": 4.46875, + "grad_norm_var": 0.12888895670572917, + "learning_rate": 0.0001, + "loss": 6.2283, + "loss/crossentropy": 2.7135192155838013, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1948346346616745, + "step": 10454 + }, + { + "epoch": 0.32675, + "grad_norm": 3.1875, + "grad_norm_var": 0.12759501139322918, + "learning_rate": 0.0001, + "loss": 5.8369, + "loss/crossentropy": 2.5518386363983154, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17654848098754883, + "step": 10456 + }, + { + "epoch": 0.3268125, + "grad_norm": 3.484375, + "grad_norm_var": 0.12392171223958333, + "learning_rate": 0.0001, + "loss": 5.9143, + "loss/crossentropy": 2.50605046749115, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.18222897499799728, + "step": 10458 + }, + { + "epoch": 0.326875, + "grad_norm": 3.34375, + "grad_norm_var": 0.11468098958333334, + "learning_rate": 0.0001, + "loss": 5.886, + "loss/crossentropy": 2.5006728172302246, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.183840811252594, + "step": 10460 + }, + { + "epoch": 0.3269375, + "grad_norm": 3.015625, + "grad_norm_var": 0.13113505045572918, + "learning_rate": 0.0001, + "loss": 5.5897, + "loss/crossentropy": 2.428275942802429, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16692692786455154, + "step": 10462 + }, + { + "epoch": 0.327, + "grad_norm": 3.328125, + "grad_norm_var": 0.12127278645833334, + "learning_rate": 0.0001, + "loss": 5.9769, + "loss/crossentropy": 2.618067502975464, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18197977542877197, + "step": 10464 + }, + { + "epoch": 0.3270625, + "grad_norm": 2.9375, + "grad_norm_var": 0.12903645833333333, + "learning_rate": 0.0001, + "loss": 5.5058, + "loss/crossentropy": 2.329106092453003, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16766852140426636, + "step": 10466 + }, + { + "epoch": 0.327125, + "grad_norm": 3.234375, + "grad_norm_var": 0.13027242024739583, + "learning_rate": 0.0001, + "loss": 5.8744, + "loss/crossentropy": 2.5502312183380127, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17851175367832184, + "step": 10468 + }, + { + "epoch": 0.3271875, + "grad_norm": 3.09375, + "grad_norm_var": 0.055322265625, + "learning_rate": 0.0001, + "loss": 5.9023, + "loss/crossentropy": 2.62544047832489, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17808130383491516, + "step": 10470 + }, + { + "epoch": 0.32725, + "grad_norm": 3.71875, + "grad_norm_var": 0.05129801432291667, + "learning_rate": 0.0001, + "loss": 6.1697, + "loss/crossentropy": 2.612215995788574, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19754952192306519, + "step": 10472 + }, + { + "epoch": 0.3273125, + "grad_norm": 3.390625, + "grad_norm_var": 0.06692301432291667, + "learning_rate": 0.0001, + "loss": 5.3695, + "loss/crossentropy": 2.3111432790756226, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15778601169586182, + "step": 10474 + }, + { + "epoch": 0.327375, + "grad_norm": 3.078125, + "grad_norm_var": 0.069921875, + "learning_rate": 0.0001, + "loss": 6.0333, + "loss/crossentropy": 2.6295695304870605, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18725257366895676, + "step": 10476 + }, + { + "epoch": 0.3274375, + "grad_norm": 3.453125, + "grad_norm_var": 0.06411844889322917, + "learning_rate": 0.0001, + "loss": 5.8327, + "loss/crossentropy": 2.553055167198181, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1764000654220581, + "step": 10478 + }, + { + "epoch": 0.3275, + "grad_norm": 3.25, + "grad_norm_var": 0.0619140625, + "learning_rate": 0.0001, + "loss": 5.8046, + "loss/crossentropy": 2.449642062187195, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1815905198454857, + "step": 10480 + }, + { + "epoch": 0.3275625, + "grad_norm": 3.234375, + "grad_norm_var": 0.11783447265625, + "learning_rate": 0.0001, + "loss": 6.0743, + "loss/crossentropy": 2.6281604766845703, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.19148899614810944, + "step": 10482 + }, + { + "epoch": 0.327625, + "grad_norm": 3.40625, + "grad_norm_var": 0.11571858723958334, + "learning_rate": 0.0001, + "loss": 5.8511, + "loss/crossentropy": 2.5006006956100464, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18231221288442612, + "step": 10484 + }, + { + "epoch": 0.3276875, + "grad_norm": 3.34375, + "grad_norm_var": 0.110205078125, + "learning_rate": 0.0001, + "loss": 5.6347, + "loss/crossentropy": 2.3496744632720947, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1730307787656784, + "step": 10486 + }, + { + "epoch": 0.32775, + "grad_norm": 3.3125, + "grad_norm_var": 0.10273030598958334, + "learning_rate": 0.0001, + "loss": 5.9948, + "loss/crossentropy": 2.6307467222213745, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18444740772247314, + "step": 10488 + }, + { + "epoch": 0.3278125, + "grad_norm": 3.40625, + "grad_norm_var": 0.091015625, + "learning_rate": 0.0001, + "loss": 5.7624, + "loss/crossentropy": 2.5877468585968018, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16746732592582703, + "step": 10490 + }, + { + "epoch": 0.327875, + "grad_norm": 3.171875, + "grad_norm_var": 0.089111328125, + "learning_rate": 0.0001, + "loss": 5.8677, + "loss/crossentropy": 2.573415517807007, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1766989678144455, + "step": 10492 + }, + { + "epoch": 0.3279375, + "grad_norm": 3.09375, + "grad_norm_var": 0.0910797119140625, + "learning_rate": 0.0001, + "loss": 5.6794, + "loss/crossentropy": 2.364428162574768, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17602647095918655, + "step": 10494 + }, + { + "epoch": 0.328, + "grad_norm": 4.0625, + "grad_norm_var": 0.12268473307291666, + "learning_rate": 0.0001, + "loss": 6.2771, + "loss/crossentropy": 2.719959020614624, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19751346856355667, + "step": 10496 + }, + { + "epoch": 0.3280625, + "grad_norm": 3.375, + "grad_norm_var": 0.056428019205729166, + "learning_rate": 0.0001, + "loss": 5.7902, + "loss/crossentropy": 2.4194256067276, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1851213350892067, + "step": 10498 + }, + { + "epoch": 0.328125, + "grad_norm": 3.390625, + "grad_norm_var": 0.08354390462239583, + "learning_rate": 0.0001, + "loss": 6.115, + "loss/crossentropy": 2.686237335205078, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1870184689760208, + "step": 10500 + }, + { + "epoch": 0.3281875, + "grad_norm": 3.484375, + "grad_norm_var": 0.089990234375, + "learning_rate": 0.0001, + "loss": 5.9201, + "loss/crossentropy": 2.5260117053985596, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18276835978031158, + "step": 10502 + }, + { + "epoch": 0.32825, + "grad_norm": 3.390625, + "grad_norm_var": 0.08671468098958333, + "learning_rate": 0.0001, + "loss": 5.8161, + "loss/crossentropy": 2.5687613487243652, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17356525361537933, + "step": 10504 + }, + { + "epoch": 0.3283125, + "grad_norm": 3.140625, + "grad_norm_var": 0.07932942708333333, + "learning_rate": 0.0001, + "loss": 5.592, + "loss/crossentropy": 2.33124041557312, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17099381238222122, + "step": 10506 + }, + { + "epoch": 0.328375, + "grad_norm": 3.390625, + "grad_norm_var": 0.07315165201822917, + "learning_rate": 0.0001, + "loss": 5.967, + "loss/crossentropy": 2.6727588176727295, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17903530597686768, + "step": 10508 + }, + { + "epoch": 0.3284375, + "grad_norm": 3.5625, + "grad_norm_var": 0.06545817057291667, + "learning_rate": 0.0001, + "loss": 5.9038, + "loss/crossentropy": 2.4932072162628174, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18363633006811142, + "step": 10510 + }, + { + "epoch": 0.3285, + "grad_norm": 3.078125, + "grad_norm_var": 0.053132120768229166, + "learning_rate": 0.0001, + "loss": 6.0411, + "loss/crossentropy": 2.7488632202148438, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1776644065976143, + "step": 10512 + }, + { + "epoch": 0.3285625, + "grad_norm": 3.1875, + "grad_norm_var": 0.06106770833333333, + "learning_rate": 0.0001, + "loss": 6.4665, + "loss/crossentropy": 2.885968565940857, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19789214432239532, + "step": 10514 + }, + { + "epoch": 0.328625, + "grad_norm": 3.109375, + "grad_norm_var": 0.03954671223958333, + "learning_rate": 0.0001, + "loss": 5.824, + "loss/crossentropy": 2.562509536743164, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17380927503108978, + "step": 10516 + }, + { + "epoch": 0.3286875, + "grad_norm": 3.046875, + "grad_norm_var": 0.0358551025390625, + "learning_rate": 0.0001, + "loss": 5.5661, + "loss/crossentropy": 2.369008183479309, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.170881487429142, + "step": 10518 + }, + { + "epoch": 0.32875, + "grad_norm": 3.296875, + "grad_norm_var": 0.040576171875, + "learning_rate": 0.0001, + "loss": 5.9116, + "loss/crossentropy": 2.5701723098754883, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1794508770108223, + "step": 10520 + }, + { + "epoch": 0.3288125, + "grad_norm": 3.609375, + "grad_norm_var": 0.044677734375, + "learning_rate": 0.0001, + "loss": 6.1161, + "loss/crossentropy": 2.674402117729187, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1894790381193161, + "step": 10522 + }, + { + "epoch": 0.328875, + "grad_norm": 3.265625, + "grad_norm_var": 0.044287109375, + "learning_rate": 0.0001, + "loss": 5.9051, + "loss/crossentropy": 2.544066548347473, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.17907274514436722, + "step": 10524 + }, + { + "epoch": 0.3289375, + "grad_norm": 3.421875, + "grad_norm_var": 0.0453125, + "learning_rate": 0.0001, + "loss": 5.8596, + "loss/crossentropy": 2.5954864025115967, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17484527081251144, + "step": 10526 + }, + { + "epoch": 0.329, + "grad_norm": 3.265625, + "grad_norm_var": 0.04072977701822917, + "learning_rate": 0.0001, + "loss": 5.9282, + "loss/crossentropy": 2.6108763217926025, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.18172959238290787, + "step": 10528 + }, + { + "epoch": 0.3290625, + "grad_norm": 3.546875, + "grad_norm_var": 0.034928385416666666, + "learning_rate": 0.0001, + "loss": 5.976, + "loss/crossentropy": 2.6122357845306396, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18520929664373398, + "step": 10530 + }, + { + "epoch": 0.329125, + "grad_norm": 3.046875, + "grad_norm_var": 0.03798828125, + "learning_rate": 0.0001, + "loss": 6.1419, + "loss/crossentropy": 2.7869744300842285, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18510252982378006, + "step": 10532 + }, + { + "epoch": 0.3291875, + "grad_norm": 3.1875, + "grad_norm_var": 0.03351949055989583, + "learning_rate": 0.0001, + "loss": 6.0635, + "loss/crossentropy": 2.7528659105300903, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1775524988770485, + "step": 10534 + }, + { + "epoch": 0.32925, + "grad_norm": 3.140625, + "grad_norm_var": 0.03192952473958333, + "learning_rate": 0.0001, + "loss": 5.9812, + "loss/crossentropy": 2.60650634765625, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18083249777555466, + "step": 10536 + }, + { + "epoch": 0.3293125, + "grad_norm": 3.0, + "grad_norm_var": 0.030524698893229167, + "learning_rate": 0.0001, + "loss": 5.8681, + "loss/crossentropy": 2.6220571994781494, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1742141991853714, + "step": 10538 + }, + { + "epoch": 0.329375, + "grad_norm": 3.234375, + "grad_norm_var": 0.037653605143229164, + "learning_rate": 0.0001, + "loss": 5.7579, + "loss/crossentropy": 2.4170820713043213, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1786084622144699, + "step": 10540 + }, + { + "epoch": 0.3294375, + "grad_norm": 3.171875, + "grad_norm_var": 0.038361612955729166, + "learning_rate": 0.0001, + "loss": 5.5535, + "loss/crossentropy": 2.4177643060684204, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16865624487400055, + "step": 10542 + }, + { + "epoch": 0.3295, + "grad_norm": 4.21875, + "grad_norm_var": 0.0998199462890625, + "learning_rate": 0.0001, + "loss": 5.5978, + "loss/crossentropy": 2.239539384841919, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.17879444360733032, + "step": 10544 + }, + { + "epoch": 0.3295625, + "grad_norm": 3.28125, + "grad_norm_var": 0.09480692545572916, + "learning_rate": 0.0001, + "loss": 5.6869, + "loss/crossentropy": 2.414452075958252, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17412185668945312, + "step": 10546 + }, + { + "epoch": 0.329625, + "grad_norm": 3.28125, + "grad_norm_var": 0.09277242024739583, + "learning_rate": 0.0001, + "loss": 5.908, + "loss/crossentropy": 2.638763189315796, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17496571689844131, + "step": 10548 + }, + { + "epoch": 0.3296875, + "grad_norm": 3.09375, + "grad_norm_var": 0.09472249348958334, + "learning_rate": 0.0001, + "loss": 5.8715, + "loss/crossentropy": 2.533913731575012, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17712035030126572, + "step": 10550 + }, + { + "epoch": 0.32975, + "grad_norm": 3.171875, + "grad_norm_var": 0.09904683430989583, + "learning_rate": 0.0001, + "loss": 5.953, + "loss/crossentropy": 2.5308799743652344, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1844041869044304, + "step": 10552 + }, + { + "epoch": 0.3298125, + "grad_norm": 3.28125, + "grad_norm_var": 0.09582926432291666, + "learning_rate": 0.0001, + "loss": 5.2914, + "loss/crossentropy": 2.190713882446289, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1588962972164154, + "step": 10554 + }, + { + "epoch": 0.329875, + "grad_norm": 3.1875, + "grad_norm_var": 0.09474283854166667, + "learning_rate": 0.0001, + "loss": 5.7447, + "loss/crossentropy": 2.421551823616028, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1780192330479622, + "step": 10556 + }, + { + "epoch": 0.3299375, + "grad_norm": 3.765625, + "grad_norm_var": 0.09556884765625, + "learning_rate": 0.0001, + "loss": 5.8952, + "loss/crossentropy": 2.55362606048584, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17869000136852264, + "step": 10558 + }, + { + "epoch": 0.33, + "grad_norm": 3.234375, + "grad_norm_var": 0.047200520833333336, + "learning_rate": 0.0001, + "loss": 6.1143, + "loss/crossentropy": 2.75285267829895, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18067169189453125, + "step": 10560 + }, + { + "epoch": 0.3300625, + "grad_norm": 3.375, + "grad_norm_var": 0.048140462239583334, + "learning_rate": 0.0001, + "loss": 5.7913, + "loss/crossentropy": 2.5640674829483032, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17116037011146545, + "step": 10562 + }, + { + "epoch": 0.330125, + "grad_norm": 3.296875, + "grad_norm_var": 0.042708333333333334, + "learning_rate": 0.0001, + "loss": 5.7565, + "loss/crossentropy": 2.4372029304504395, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17763124406337738, + "step": 10564 + }, + { + "epoch": 0.3301875, + "grad_norm": 3.265625, + "grad_norm_var": 0.09617411295572917, + "learning_rate": 0.0001, + "loss": 5.9864, + "loss/crossentropy": 2.4982370138168335, + "loss/hidden": 1.62109375, + "loss/jsd": 0.0, + "loss/logits": 0.18670214712619781, + "step": 10566 + }, + { + "epoch": 0.33025, + "grad_norm": 3.265625, + "grad_norm_var": 0.09566141764322916, + "learning_rate": 0.0001, + "loss": 6.2147, + "loss/crossentropy": 2.682075023651123, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19662075489759445, + "step": 10568 + }, + { + "epoch": 0.3303125, + "grad_norm": 3.515625, + "grad_norm_var": 0.08302408854166667, + "learning_rate": 0.0001, + "loss": 6.034, + "loss/crossentropy": 2.6168935298919678, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18624671548604965, + "step": 10570 + }, + { + "epoch": 0.330375, + "grad_norm": 3.171875, + "grad_norm_var": 0.08391927083333334, + "learning_rate": 0.0001, + "loss": 6.1638, + "loss/crossentropy": 2.775294542312622, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.185729518532753, + "step": 10572 + }, + { + "epoch": 0.3304375, + "grad_norm": 3.3125, + "grad_norm_var": 0.07730712890625, + "learning_rate": 0.0001, + "loss": 5.9852, + "loss/crossentropy": 2.5786255598068237, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18675360083580017, + "step": 10574 + }, + { + "epoch": 0.3305, + "grad_norm": 3.140625, + "grad_norm_var": 0.0783599853515625, + "learning_rate": 0.0001, + "loss": 5.3715, + "loss/crossentropy": 2.2789233922958374, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.15887174755334854, + "step": 10576 + }, + { + "epoch": 0.3305625, + "grad_norm": 3.1875, + "grad_norm_var": 0.07822265625, + "learning_rate": 0.0001, + "loss": 5.6954, + "loss/crossentropy": 2.4184963703155518, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17612414807081223, + "step": 10578 + }, + { + "epoch": 0.330625, + "grad_norm": 3.0, + "grad_norm_var": 0.08936258951822916, + "learning_rate": 0.0001, + "loss": 6.0551, + "loss/crossentropy": 2.6684863567352295, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18397579342126846, + "step": 10580 + }, + { + "epoch": 0.3306875, + "grad_norm": 3.5625, + "grad_norm_var": 0.05384114583333333, + "learning_rate": 0.0001, + "loss": 6.3382, + "loss/crossentropy": 2.7625722885131836, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.2020929455757141, + "step": 10582 + }, + { + "epoch": 0.33075, + "grad_norm": 3.234375, + "grad_norm_var": 0.0497467041015625, + "learning_rate": 0.0001, + "loss": 5.7447, + "loss/crossentropy": 2.435757040977478, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.17191199213266373, + "step": 10584 + }, + { + "epoch": 0.3308125, + "grad_norm": 4.03125, + "grad_norm_var": 0.0761871337890625, + "learning_rate": 0.0001, + "loss": 6.0752, + "loss/crossentropy": 2.595419764518738, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19095022231340408, + "step": 10586 + }, + { + "epoch": 0.330875, + "grad_norm": 3.15625, + "grad_norm_var": 0.07667643229166667, + "learning_rate": 0.0001, + "loss": 5.9915, + "loss/crossentropy": 2.5594364404678345, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18657050281763077, + "step": 10588 + }, + { + "epoch": 0.3309375, + "grad_norm": 3.15625, + "grad_norm_var": 0.0802734375, + "learning_rate": 0.0001, + "loss": 5.8781, + "loss/crossentropy": 2.589942455291748, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17803217470645905, + "step": 10590 + }, + { + "epoch": 0.331, + "grad_norm": 2.859375, + "grad_norm_var": 0.09575907389322917, + "learning_rate": 0.0001, + "loss": 6.007, + "loss/crossentropy": 2.692532777786255, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1798829883337021, + "step": 10592 + }, + { + "epoch": 0.3310625, + "grad_norm": 3.125, + "grad_norm_var": 0.09853413899739584, + "learning_rate": 0.0001, + "loss": 5.9402, + "loss/crossentropy": 2.6700247526168823, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1758483648300171, + "step": 10594 + }, + { + "epoch": 0.331125, + "grad_norm": 3.5, + "grad_norm_var": 0.09239908854166666, + "learning_rate": 0.0001, + "loss": 5.8785, + "loss/crossentropy": 2.488546133041382, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.17845331132411957, + "step": 10596 + }, + { + "epoch": 0.3311875, + "grad_norm": 3.515625, + "grad_norm_var": 0.07979227701822916, + "learning_rate": 0.0001, + "loss": 6.2341, + "loss/crossentropy": 2.7175732851028442, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19422803819179535, + "step": 10598 + }, + { + "epoch": 0.33125, + "grad_norm": 3.203125, + "grad_norm_var": 0.08053385416666667, + "learning_rate": 0.0001, + "loss": 5.8776, + "loss/crossentropy": 2.5200968980789185, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18262987583875656, + "step": 10600 + }, + { + "epoch": 0.3313125, + "grad_norm": 3.328125, + "grad_norm_var": 0.05237223307291667, + "learning_rate": 0.0001, + "loss": 5.669, + "loss/crossentropy": 2.410857081413269, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17425625771284103, + "step": 10602 + }, + { + "epoch": 0.331375, + "grad_norm": 3.4375, + "grad_norm_var": 0.05281473795572917, + "learning_rate": 0.0001, + "loss": 5.9284, + "loss/crossentropy": 2.5856266021728516, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1834932491183281, + "step": 10604 + }, + { + "epoch": 0.3314375, + "grad_norm": 3.171875, + "grad_norm_var": 0.052229817708333334, + "learning_rate": 0.0001, + "loss": 5.7939, + "loss/crossentropy": 2.5548208951950073, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17195909470319748, + "step": 10606 + }, + { + "epoch": 0.3315, + "grad_norm": 3.265625, + "grad_norm_var": 0.03737691243489583, + "learning_rate": 0.0001, + "loss": 5.943, + "loss/crossentropy": 2.641461133956909, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17663607746362686, + "step": 10608 + }, + { + "epoch": 0.3315625, + "grad_norm": 3.046875, + "grad_norm_var": 0.03974202473958333, + "learning_rate": 0.0001, + "loss": 5.666, + "loss/crossentropy": 2.4323424100875854, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.16984786093235016, + "step": 10610 + }, + { + "epoch": 0.331625, + "grad_norm": 3.09375, + "grad_norm_var": 0.033935546875, + "learning_rate": 0.0001, + "loss": 5.4376, + "loss/crossentropy": 2.272653341293335, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16844987869262695, + "step": 10612 + }, + { + "epoch": 0.3316875, + "grad_norm": 3.125, + "grad_norm_var": 0.011617024739583334, + "learning_rate": 0.0001, + "loss": 5.8562, + "loss/crossentropy": 2.58320415019989, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1765137016773224, + "step": 10614 + }, + { + "epoch": 0.33175, + "grad_norm": 3.0625, + "grad_norm_var": 0.012035115559895834, + "learning_rate": 0.0001, + "loss": 5.8355, + "loss/crossentropy": 2.6183416843414307, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17289245128631592, + "step": 10616 + }, + { + "epoch": 0.3318125, + "grad_norm": 3.5625, + "grad_norm_var": 0.019367472330729166, + "learning_rate": 0.0001, + "loss": 6.1285, + "loss/crossentropy": 2.7610244750976562, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.17893724143505096, + "step": 10618 + }, + { + "epoch": 0.331875, + "grad_norm": 3.234375, + "grad_norm_var": 0.017281087239583333, + "learning_rate": 0.0001, + "loss": 5.8985, + "loss/crossentropy": 2.5375760793685913, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18101774901151657, + "step": 10620 + }, + { + "epoch": 0.3319375, + "grad_norm": 4.09375, + "grad_norm_var": 0.06529947916666666, + "learning_rate": 0.0001, + "loss": 5.8052, + "loss/crossentropy": 2.487018585205078, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.17596260458230972, + "step": 10622 + }, + { + "epoch": 0.332, + "grad_norm": 3.203125, + "grad_norm_var": 0.06450093587239583, + "learning_rate": 0.0001, + "loss": 5.8771, + "loss/crossentropy": 2.591071367263794, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17821310460567474, + "step": 10624 + }, + { + "epoch": 0.3320625, + "grad_norm": 3.484375, + "grad_norm_var": 0.06321614583333333, + "learning_rate": 0.0001, + "loss": 5.7008, + "loss/crossentropy": 2.428701877593994, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17056617140769958, + "step": 10626 + }, + { + "epoch": 0.332125, + "grad_norm": 3.578125, + "grad_norm_var": 0.06451822916666666, + "learning_rate": 0.0001, + "loss": 5.9604, + "loss/crossentropy": 2.5602476596832275, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18844837695360184, + "step": 10628 + }, + { + "epoch": 0.3321875, + "grad_norm": 3.34375, + "grad_norm_var": 0.061823527018229164, + "learning_rate": 0.0001, + "loss": 5.652, + "loss/crossentropy": 2.4678162336349487, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16881363093852997, + "step": 10630 + }, + { + "epoch": 0.33225, + "grad_norm": 3.640625, + "grad_norm_var": 0.0569732666015625, + "learning_rate": 0.0001, + "loss": 6.0892, + "loss/crossentropy": 2.691482424736023, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18703413009643555, + "step": 10632 + }, + { + "epoch": 0.3323125, + "grad_norm": 3.640625, + "grad_norm_var": 0.058958943684895834, + "learning_rate": 0.0001, + "loss": 6.0169, + "loss/crossentropy": 2.605046033859253, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18610990047454834, + "step": 10634 + }, + { + "epoch": 0.332375, + "grad_norm": 4.1875, + "grad_norm_var": 0.10077718098958334, + "learning_rate": 0.0001, + "loss": 6.2816, + "loss/crossentropy": 2.7226985692977905, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19924483448266983, + "step": 10636 + }, + { + "epoch": 0.3324375, + "grad_norm": 3.5625, + "grad_norm_var": 0.0700836181640625, + "learning_rate": 0.0001, + "loss": 5.8493, + "loss/crossentropy": 2.4939684867858887, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.17928729951381683, + "step": 10638 + }, + { + "epoch": 0.3325, + "grad_norm": 3.828125, + "grad_norm_var": 0.10445963541666667, + "learning_rate": 0.0001, + "loss": 6.0624, + "loss/crossentropy": 2.5568013191223145, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.19626538455486298, + "step": 10640 + }, + { + "epoch": 0.3325625, + "grad_norm": 3.265625, + "grad_norm_var": 0.11070556640625, + "learning_rate": 0.0001, + "loss": 5.8763, + "loss/crossentropy": 2.5293742418289185, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18391429632902145, + "step": 10642 + }, + { + "epoch": 0.332625, + "grad_norm": 3.015625, + "grad_norm_var": 0.12675374348958332, + "learning_rate": 0.0001, + "loss": 5.6411, + "loss/crossentropy": 2.470715045928955, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16704195737838745, + "step": 10644 + }, + { + "epoch": 0.3326875, + "grad_norm": 3.6875, + "grad_norm_var": 0.12403971354166667, + "learning_rate": 0.0001, + "loss": 6.0903, + "loss/crossentropy": 2.5943169593811035, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19100331515073776, + "step": 10646 + }, + { + "epoch": 0.33275, + "grad_norm": 3.25, + "grad_norm_var": 0.12360738118489584, + "learning_rate": 0.0001, + "loss": 5.8859, + "loss/crossentropy": 2.4839388132095337, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18237964808940887, + "step": 10648 + }, + { + "epoch": 0.3328125, + "grad_norm": 2.96875, + "grad_norm_var": 0.13709309895833333, + "learning_rate": 0.0001, + "loss": 5.7542, + "loss/crossentropy": 2.485421061515808, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17414316534996033, + "step": 10650 + }, + { + "epoch": 0.332875, + "grad_norm": 3.8125, + "grad_norm_var": 0.11015218098958333, + "learning_rate": 0.0001, + "loss": 5.6255, + "loss/crossentropy": 2.3566641807556152, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17219743877649307, + "step": 10652 + }, + { + "epoch": 0.3329375, + "grad_norm": 3.59375, + "grad_norm_var": 0.11064046223958333, + "learning_rate": 0.0001, + "loss": 5.9665, + "loss/crossentropy": 2.5395361185073853, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18566076457500458, + "step": 10654 + }, + { + "epoch": 0.333, + "grad_norm": 3.671875, + "grad_norm_var": 0.08931884765625, + "learning_rate": 0.0001, + "loss": 6.1111, + "loss/crossentropy": 2.56496000289917, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19485173374414444, + "step": 10656 + }, + { + "epoch": 0.3330625, + "grad_norm": 3.40625, + "grad_norm_var": 0.08658447265625, + "learning_rate": 0.0001, + "loss": 5.7782, + "loss/crossentropy": 2.532205820083618, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17459736764431, + "step": 10658 + }, + { + "epoch": 0.333125, + "grad_norm": 3.421875, + "grad_norm_var": 0.06955973307291667, + "learning_rate": 0.0001, + "loss": 5.6623, + "loss/crossentropy": 2.408676028251648, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1734139770269394, + "step": 10660 + }, + { + "epoch": 0.3331875, + "grad_norm": 3.265625, + "grad_norm_var": 0.06777242024739584, + "learning_rate": 0.0001, + "loss": 5.9523, + "loss/crossentropy": 2.6525381803512573, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17763584852218628, + "step": 10662 + }, + { + "epoch": 0.33325, + "grad_norm": 2.96875, + "grad_norm_var": 0.08220113118489583, + "learning_rate": 0.0001, + "loss": 5.8732, + "loss/crossentropy": 2.58855938911438, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1784590780735016, + "step": 10664 + }, + { + "epoch": 0.3333125, + "grad_norm": 3.359375, + "grad_norm_var": 0.08573811848958333, + "learning_rate": 0.0001, + "loss": 5.8772, + "loss/crossentropy": 2.512405276298523, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1813981756567955, + "step": 10666 + }, + { + "epoch": 0.333375, + "grad_norm": 3.25, + "grad_norm_var": 0.07273763020833333, + "learning_rate": 0.0001, + "loss": 5.7278, + "loss/crossentropy": 2.418931484222412, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1816667690873146, + "step": 10668 + }, + { + "epoch": 0.3334375, + "grad_norm": 3.171875, + "grad_norm_var": 0.0744140625, + "learning_rate": 0.0001, + "loss": 5.9827, + "loss/crossentropy": 2.6172374486923218, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1802959442138672, + "step": 10670 + }, + { + "epoch": 0.3335, + "grad_norm": 3.234375, + "grad_norm_var": 0.0423980712890625, + "learning_rate": 0.0001, + "loss": 5.8085, + "loss/crossentropy": 2.5476096868515015, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1753033846616745, + "step": 10672 + }, + { + "epoch": 0.3335625, + "grad_norm": 3.59375, + "grad_norm_var": 0.04898681640625, + "learning_rate": 0.0001, + "loss": 5.8135, + "loss/crossentropy": 2.5202016830444336, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17893477529287338, + "step": 10674 + }, + { + "epoch": 0.333625, + "grad_norm": 3.109375, + "grad_norm_var": 0.051070149739583334, + "learning_rate": 0.0001, + "loss": 5.966, + "loss/crossentropy": 2.663010001182556, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17600010335445404, + "step": 10676 + }, + { + "epoch": 0.3336875, + "grad_norm": 3.421875, + "grad_norm_var": 0.052408854166666664, + "learning_rate": 0.0001, + "loss": 5.8813, + "loss/crossentropy": 2.593448281288147, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1764383167028427, + "step": 10678 + }, + { + "epoch": 0.33375, + "grad_norm": 3.28125, + "grad_norm_var": 0.04797770182291667, + "learning_rate": 0.0001, + "loss": 6.1149, + "loss/crossentropy": 2.6398794651031494, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19281642884016037, + "step": 10680 + }, + { + "epoch": 0.3338125, + "grad_norm": 3.71875, + "grad_norm_var": 0.0342926025390625, + "learning_rate": 0.0001, + "loss": 5.8726, + "loss/crossentropy": 2.5157653093338013, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1802167147397995, + "step": 10682 + }, + { + "epoch": 0.333875, + "grad_norm": 3.296875, + "grad_norm_var": 0.0352203369140625, + "learning_rate": 0.0001, + "loss": 5.8488, + "loss/crossentropy": 2.5493035316467285, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17799723148345947, + "step": 10684 + }, + { + "epoch": 0.3339375, + "grad_norm": 3.484375, + "grad_norm_var": 0.0407867431640625, + "learning_rate": 0.0001, + "loss": 5.8222, + "loss/crossentropy": 2.5019291639328003, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1777304857969284, + "step": 10686 + }, + { + "epoch": 0.334, + "grad_norm": 3.4375, + "grad_norm_var": 0.0457183837890625, + "learning_rate": 0.0001, + "loss": 6.1355, + "loss/crossentropy": 2.6443079710006714, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19364753365516663, + "step": 10688 + }, + { + "epoch": 0.3340625, + "grad_norm": 3.75, + "grad_norm_var": 0.06055399576822917, + "learning_rate": 0.0001, + "loss": 5.6513, + "loss/crossentropy": 2.3860379457473755, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17340559512376785, + "step": 10690 + }, + { + "epoch": 0.334125, + "grad_norm": 3.640625, + "grad_norm_var": 0.055859375, + "learning_rate": 0.0001, + "loss": 6.0432, + "loss/crossentropy": 2.6361632347106934, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18523555994033813, + "step": 10692 + }, + { + "epoch": 0.3341875, + "grad_norm": 3.84375, + "grad_norm_var": 0.058592732747395834, + "learning_rate": 0.0001, + "loss": 5.6446, + "loss/crossentropy": 2.3392274379730225, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17975673079490662, + "step": 10694 + }, + { + "epoch": 0.33425, + "grad_norm": 3.515625, + "grad_norm_var": 0.056103515625, + "learning_rate": 0.0001, + "loss": 6.1306, + "loss/crossentropy": 2.662463068962097, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.19407868385314941, + "step": 10696 + }, + { + "epoch": 0.3343125, + "grad_norm": 3.203125, + "grad_norm_var": 0.06316630045572917, + "learning_rate": 0.0001, + "loss": 5.8663, + "loss/crossentropy": 2.55662739276886, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.18174849450588226, + "step": 10698 + }, + { + "epoch": 0.334375, + "grad_norm": 3.453125, + "grad_norm_var": 0.054423014322916664, + "learning_rate": 0.0001, + "loss": 6.0766, + "loss/crossentropy": 2.595309257507324, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19148609787225723, + "step": 10700 + }, + { + "epoch": 0.3344375, + "grad_norm": 3.125, + "grad_norm_var": 0.0628570556640625, + "learning_rate": 0.0001, + "loss": 6.0151, + "loss/crossentropy": 2.636073112487793, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18009155988693237, + "step": 10702 + }, + { + "epoch": 0.3345, + "grad_norm": 3.453125, + "grad_norm_var": 0.0578521728515625, + "learning_rate": 0.0001, + "loss": 5.9708, + "loss/crossentropy": 2.5911970138549805, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18444080650806427, + "step": 10704 + }, + { + "epoch": 0.3345625, + "grad_norm": 4.15625, + "grad_norm_var": 0.07268778483072917, + "learning_rate": 0.0001, + "loss": 6.0049, + "loss/crossentropy": 2.5935776233673096, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18839357793331146, + "step": 10706 + }, + { + "epoch": 0.334625, + "grad_norm": 3.421875, + "grad_norm_var": 0.07278645833333333, + "learning_rate": 0.0001, + "loss": 5.6842, + "loss/crossentropy": 2.3627558946609497, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.17394591867923737, + "step": 10708 + }, + { + "epoch": 0.3346875, + "grad_norm": 3.21875, + "grad_norm_var": 0.06808268229166667, + "learning_rate": 0.0001, + "loss": 6.0557, + "loss/crossentropy": 2.6889145374298096, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1796456128358841, + "step": 10710 + }, + { + "epoch": 0.33475, + "grad_norm": 3.390625, + "grad_norm_var": 0.06531473795572916, + "learning_rate": 0.0001, + "loss": 5.8758, + "loss/crossentropy": 2.5156023502349854, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.183290034532547, + "step": 10712 + }, + { + "epoch": 0.3348125, + "grad_norm": 3.203125, + "grad_norm_var": 0.0652496337890625, + "learning_rate": 0.0001, + "loss": 5.9407, + "loss/crossentropy": 2.618725299835205, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18063638359308243, + "step": 10714 + }, + { + "epoch": 0.334875, + "grad_norm": 3.609375, + "grad_norm_var": 0.07017822265625, + "learning_rate": 0.0001, + "loss": 5.5902, + "loss/crossentropy": 2.3678793907165527, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.16559426486492157, + "step": 10716 + }, + { + "epoch": 0.3349375, + "grad_norm": 3.234375, + "grad_norm_var": 0.06812235514322916, + "learning_rate": 0.0001, + "loss": 5.7844, + "loss/crossentropy": 2.527473211288452, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17490901052951813, + "step": 10718 + }, + { + "epoch": 0.335, + "grad_norm": 3.234375, + "grad_norm_var": 0.0790191650390625, + "learning_rate": 0.0001, + "loss": 5.5054, + "loss/crossentropy": 2.298014998435974, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17073433101177216, + "step": 10720 + }, + { + "epoch": 0.3350625, + "grad_norm": 3.515625, + "grad_norm_var": 0.041015625, + "learning_rate": 0.0001, + "loss": 5.8837, + "loss/crossentropy": 2.5489251613616943, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18308701366186142, + "step": 10722 + }, + { + "epoch": 0.335125, + "grad_norm": 3.53125, + "grad_norm_var": 0.03801167805989583, + "learning_rate": 0.0001, + "loss": 5.8824, + "loss/crossentropy": 2.590053081512451, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17493685334920883, + "step": 10724 + }, + { + "epoch": 0.3351875, + "grad_norm": 3.703125, + "grad_norm_var": 0.04772135416666667, + "learning_rate": 0.0001, + "loss": 6.0159, + "loss/crossentropy": 2.5686169862747192, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1876990720629692, + "step": 10726 + }, + { + "epoch": 0.33525, + "grad_norm": 3.296875, + "grad_norm_var": 0.05120035807291667, + "learning_rate": 0.0001, + "loss": 5.5758, + "loss/crossentropy": 2.423282265663147, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16720261424779892, + "step": 10728 + }, + { + "epoch": 0.3353125, + "grad_norm": 3.203125, + "grad_norm_var": 0.041731770833333334, + "learning_rate": 0.0001, + "loss": 5.8786, + "loss/crossentropy": 2.5165220499038696, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18073506653308868, + "step": 10730 + }, + { + "epoch": 0.335375, + "grad_norm": 3.1875, + "grad_norm_var": 0.03559468587239583, + "learning_rate": 0.0001, + "loss": 5.459, + "loss/crossentropy": 2.332701325416565, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16380292922258377, + "step": 10732 + }, + { + "epoch": 0.3354375, + "grad_norm": 3.25, + "grad_norm_var": 0.035542805989583336, + "learning_rate": 0.0001, + "loss": 6.101, + "loss/crossentropy": 2.673590302467346, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18766145408153534, + "step": 10734 + }, + { + "epoch": 0.3355, + "grad_norm": 3.0625, + "grad_norm_var": 0.03414306640625, + "learning_rate": 0.0001, + "loss": 5.8438, + "loss/crossentropy": 2.531806230545044, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1784651130437851, + "step": 10736 + }, + { + "epoch": 0.3355625, + "grad_norm": 3.515625, + "grad_norm_var": 0.030517578125, + "learning_rate": 0.0001, + "loss": 5.9511, + "loss/crossentropy": 2.5333372354507446, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1859154775738716, + "step": 10738 + }, + { + "epoch": 0.335625, + "grad_norm": 3.390625, + "grad_norm_var": 0.02564697265625, + "learning_rate": 0.0001, + "loss": 5.6016, + "loss/crossentropy": 2.356564521789551, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17645235359668732, + "step": 10740 + }, + { + "epoch": 0.3356875, + "grad_norm": 3.1875, + "grad_norm_var": 0.03204752604166667, + "learning_rate": 0.0001, + "loss": 5.8296, + "loss/crossentropy": 2.5035078525543213, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18104194849729538, + "step": 10742 + }, + { + "epoch": 0.33575, + "grad_norm": 3.71875, + "grad_norm_var": 0.045929972330729166, + "learning_rate": 0.0001, + "loss": 5.9766, + "loss/crossentropy": 2.483887195587158, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1961456835269928, + "step": 10744 + }, + { + "epoch": 0.3358125, + "grad_norm": 3.734375, + "grad_norm_var": 0.08267822265625, + "learning_rate": 0.0001, + "loss": 6.4994, + "loss/crossentropy": 2.8418978452682495, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.2028629183769226, + "step": 10746 + }, + { + "epoch": 0.335875, + "grad_norm": 3.109375, + "grad_norm_var": 0.08565165201822916, + "learning_rate": 0.0001, + "loss": 5.9268, + "loss/crossentropy": 2.621497869491577, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17896533757448196, + "step": 10748 + }, + { + "epoch": 0.3359375, + "grad_norm": 2.9375, + "grad_norm_var": 0.1017486572265625, + "learning_rate": 0.0001, + "loss": 5.7917, + "loss/crossentropy": 2.494469165802002, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17855554819107056, + "step": 10750 + }, + { + "epoch": 0.336, + "grad_norm": 3.625, + "grad_norm_var": 0.10789388020833333, + "learning_rate": 0.0001, + "loss": 5.9852, + "loss/crossentropy": 2.5950719118118286, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18471822887659073, + "step": 10752 + }, + { + "epoch": 0.3360625, + "grad_norm": 3.625, + "grad_norm_var": 0.10816650390625, + "learning_rate": 0.0001, + "loss": 5.991, + "loss/crossentropy": 2.512368321418762, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18966437131166458, + "step": 10754 + }, + { + "epoch": 0.336125, + "grad_norm": 3.3125, + "grad_norm_var": 0.10845438639322917, + "learning_rate": 0.0001, + "loss": 6.11, + "loss/crossentropy": 2.673190951347351, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18665232509374619, + "step": 10756 + }, + { + "epoch": 0.3361875, + "grad_norm": 3.25, + "grad_norm_var": 0.10006103515625, + "learning_rate": 0.0001, + "loss": 6.2482, + "loss/crossentropy": 2.7868168354034424, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19027435779571533, + "step": 10758 + }, + { + "epoch": 0.33625, + "grad_norm": 3.1875, + "grad_norm_var": 0.0986480712890625, + "learning_rate": 0.0001, + "loss": 5.9804, + "loss/crossentropy": 2.595295548439026, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18694473803043365, + "step": 10760 + }, + { + "epoch": 0.3363125, + "grad_norm": 3.4375, + "grad_norm_var": 0.067529296875, + "learning_rate": 0.0001, + "loss": 5.9568, + "loss/crossentropy": 2.5836589336395264, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18340512365102768, + "step": 10762 + }, + { + "epoch": 0.336375, + "grad_norm": 3.03125, + "grad_norm_var": 0.06737874348958334, + "learning_rate": 0.0001, + "loss": 5.7095, + "loss/crossentropy": 2.5286333560943604, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16848018765449524, + "step": 10764 + }, + { + "epoch": 0.3364375, + "grad_norm": 3.546875, + "grad_norm_var": 0.0657867431640625, + "learning_rate": 0.0001, + "loss": 5.6757, + "loss/crossentropy": 2.3952642679214478, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.17179232090711594, + "step": 10766 + }, + { + "epoch": 0.3365, + "grad_norm": 3.296875, + "grad_norm_var": 0.051318359375, + "learning_rate": 0.0001, + "loss": 5.8813, + "loss/crossentropy": 2.6146652698516846, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1770579069852829, + "step": 10768 + }, + { + "epoch": 0.3365625, + "grad_norm": 3.375, + "grad_norm_var": 0.04949544270833333, + "learning_rate": 0.0001, + "loss": 6.2394, + "loss/crossentropy": 2.711554527282715, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19184448570013046, + "step": 10770 + }, + { + "epoch": 0.336625, + "grad_norm": 3.171875, + "grad_norm_var": 0.05370686848958333, + "learning_rate": 0.0001, + "loss": 5.9321, + "loss/crossentropy": 2.6609376668930054, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17711444944143295, + "step": 10772 + }, + { + "epoch": 0.3366875, + "grad_norm": 3.546875, + "grad_norm_var": 0.0553619384765625, + "learning_rate": 0.0001, + "loss": 6.0884, + "loss/crossentropy": 2.6715911626815796, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1854308694601059, + "step": 10774 + }, + { + "epoch": 0.33675, + "grad_norm": 3.28125, + "grad_norm_var": 0.049609375, + "learning_rate": 0.0001, + "loss": 5.9226, + "loss/crossentropy": 2.57106876373291, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1789025068283081, + "step": 10776 + }, + { + "epoch": 0.3368125, + "grad_norm": 3.0625, + "grad_norm_var": 0.06587626139322916, + "learning_rate": 0.0001, + "loss": 6.0601, + "loss/crossentropy": 2.6637685298919678, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1865086406469345, + "step": 10778 + }, + { + "epoch": 0.336875, + "grad_norm": 3.1875, + "grad_norm_var": 0.06286519368489583, + "learning_rate": 0.0001, + "loss": 5.9621, + "loss/crossentropy": 2.669995427131653, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17764340341091156, + "step": 10780 + }, + { + "epoch": 0.3369375, + "grad_norm": 3.1875, + "grad_norm_var": 0.0591796875, + "learning_rate": 0.0001, + "loss": 5.7788, + "loss/crossentropy": 2.5435677766799927, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17664822190999985, + "step": 10782 + }, + { + "epoch": 0.337, + "grad_norm": 3.34375, + "grad_norm_var": 0.0614654541015625, + "learning_rate": 0.0001, + "loss": 5.8965, + "loss/crossentropy": 2.6301380395889282, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1758502647280693, + "step": 10784 + }, + { + "epoch": 0.3370625, + "grad_norm": 3.296875, + "grad_norm_var": 0.054215494791666666, + "learning_rate": 0.0001, + "loss": 5.6843, + "loss/crossentropy": 2.3977192640304565, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.17084172368049622, + "step": 10786 + }, + { + "epoch": 0.337125, + "grad_norm": 3.1875, + "grad_norm_var": 0.0639068603515625, + "learning_rate": 0.0001, + "loss": 6.0451, + "loss/crossentropy": 2.5708560943603516, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1923437863588333, + "step": 10788 + }, + { + "epoch": 0.3371875, + "grad_norm": 3.453125, + "grad_norm_var": 0.05845947265625, + "learning_rate": 0.0001, + "loss": 5.6234, + "loss/crossentropy": 2.28852117061615, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17684955149888992, + "step": 10790 + }, + { + "epoch": 0.33725, + "grad_norm": 3.296875, + "grad_norm_var": 0.058394368489583334, + "learning_rate": 0.0001, + "loss": 5.7565, + "loss/crossentropy": 2.5005563497543335, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1728557050228119, + "step": 10792 + }, + { + "epoch": 0.3373125, + "grad_norm": 3.15625, + "grad_norm_var": 0.0307037353515625, + "learning_rate": 0.0001, + "loss": 5.3686, + "loss/crossentropy": 2.236561596393585, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1596895009279251, + "step": 10794 + }, + { + "epoch": 0.337375, + "grad_norm": 3.546875, + "grad_norm_var": 0.04013264973958333, + "learning_rate": 0.0001, + "loss": 5.9064, + "loss/crossentropy": 2.5232553482055664, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18323633074760437, + "step": 10796 + }, + { + "epoch": 0.3374375, + "grad_norm": 3.046875, + "grad_norm_var": 0.03885091145833333, + "learning_rate": 0.0001, + "loss": 5.8346, + "loss/crossentropy": 2.624903440475464, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17097139358520508, + "step": 10798 + }, + { + "epoch": 0.3375, + "grad_norm": 3.375, + "grad_norm_var": 0.037694295247395836, + "learning_rate": 0.0001, + "loss": 5.5798, + "loss/crossentropy": 2.3961304426193237, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16797547787427902, + "step": 10800 + }, + { + "epoch": 0.3375625, + "grad_norm": 3.34375, + "grad_norm_var": 0.039159138997395836, + "learning_rate": 0.0001, + "loss": 5.981, + "loss/crossentropy": 2.683085799217224, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1786186248064041, + "step": 10802 + }, + { + "epoch": 0.337625, + "grad_norm": 3.265625, + "grad_norm_var": 0.02857666015625, + "learning_rate": 0.0001, + "loss": 5.9571, + "loss/crossentropy": 2.624199628829956, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1821201890707016, + "step": 10804 + }, + { + "epoch": 0.3376875, + "grad_norm": 3.28125, + "grad_norm_var": 0.02808837890625, + "learning_rate": 0.0001, + "loss": 5.924, + "loss/crossentropy": 2.570906400680542, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18257682770490646, + "step": 10806 + }, + { + "epoch": 0.33775, + "grad_norm": 3.390625, + "grad_norm_var": 0.028694661458333333, + "learning_rate": 0.0001, + "loss": 6.0505, + "loss/crossentropy": 2.7018431425094604, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18291601538658142, + "step": 10808 + }, + { + "epoch": 0.3378125, + "grad_norm": 4.21875, + "grad_norm_var": 0.0766510009765625, + "learning_rate": 0.0001, + "loss": 5.7848, + "loss/crossentropy": 2.3851877450942993, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18840241432189941, + "step": 10810 + }, + { + "epoch": 0.337875, + "grad_norm": 3.8125, + "grad_norm_var": 0.0820709228515625, + "learning_rate": 0.0001, + "loss": 6.1217, + "loss/crossentropy": 2.7344181537628174, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18638356775045395, + "step": 10812 + }, + { + "epoch": 0.3379375, + "grad_norm": 3.671875, + "grad_norm_var": 0.07802632649739584, + "learning_rate": 0.0001, + "loss": 5.7403, + "loss/crossentropy": 2.4325857162475586, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17608315497636795, + "step": 10814 + }, + { + "epoch": 0.338, + "grad_norm": 3.21875, + "grad_norm_var": 0.07598368326822917, + "learning_rate": 0.0001, + "loss": 5.9573, + "loss/crossentropy": 2.6354763507843018, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17983748018741608, + "step": 10816 + }, + { + "epoch": 0.3380625, + "grad_norm": 2.875, + "grad_norm_var": 0.0956207275390625, + "learning_rate": 0.0001, + "loss": 5.738, + "loss/crossentropy": 2.550377368927002, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1668103039264679, + "step": 10818 + }, + { + "epoch": 0.338125, + "grad_norm": 3.25, + "grad_norm_var": 0.09594624837239583, + "learning_rate": 0.0001, + "loss": 5.5867, + "loss/crossentropy": 2.396926999092102, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16741415858268738, + "step": 10820 + }, + { + "epoch": 0.3381875, + "grad_norm": 4.03125, + "grad_norm_var": 0.11691792805989583, + "learning_rate": 0.0001, + "loss": 5.6129, + "loss/crossentropy": 2.3473883867263794, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1749899983406067, + "step": 10822 + }, + { + "epoch": 0.33825, + "grad_norm": 3.109375, + "grad_norm_var": 0.12259012858072917, + "learning_rate": 0.0001, + "loss": 5.8787, + "loss/crossentropy": 2.5520554780960083, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18071406334638596, + "step": 10824 + }, + { + "epoch": 0.3383125, + "grad_norm": 3.28125, + "grad_norm_var": 0.07906494140625, + "learning_rate": 0.0001, + "loss": 5.7556, + "loss/crossentropy": 2.445315718650818, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1802482008934021, + "step": 10826 + }, + { + "epoch": 0.338375, + "grad_norm": 3.25, + "grad_norm_var": 0.061356608072916666, + "learning_rate": 0.0001, + "loss": 6.0558, + "loss/crossentropy": 2.675871968269348, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18213185667991638, + "step": 10828 + }, + { + "epoch": 0.3384375, + "grad_norm": 3.28125, + "grad_norm_var": 0.05349934895833333, + "learning_rate": 0.0001, + "loss": 5.763, + "loss/crossentropy": 2.541442632675171, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17332328110933304, + "step": 10830 + }, + { + "epoch": 0.3385, + "grad_norm": 3.390625, + "grad_norm_var": 0.057763671875, + "learning_rate": 0.0001, + "loss": 5.7538, + "loss/crossentropy": 2.457796812057495, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17764590680599213, + "step": 10832 + }, + { + "epoch": 0.3385625, + "grad_norm": 3.671875, + "grad_norm_var": 0.04993082682291667, + "learning_rate": 0.0001, + "loss": 5.7388, + "loss/crossentropy": 2.423627495765686, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17956066131591797, + "step": 10834 + }, + { + "epoch": 0.338625, + "grad_norm": 3.40625, + "grad_norm_var": 0.05520426432291667, + "learning_rate": 0.0001, + "loss": 5.8473, + "loss/crossentropy": 2.5403659343719482, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1767827644944191, + "step": 10836 + }, + { + "epoch": 0.3386875, + "grad_norm": 3.953125, + "grad_norm_var": 0.04716796875, + "learning_rate": 0.0001, + "loss": 5.8963, + "loss/crossentropy": 2.445417642593384, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.19196118414402008, + "step": 10838 + }, + { + "epoch": 0.33875, + "grad_norm": 3.34375, + "grad_norm_var": 0.04111328125, + "learning_rate": 0.0001, + "loss": 6.0339, + "loss/crossentropy": 2.66239595413208, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18362993001937866, + "step": 10840 + }, + { + "epoch": 0.3388125, + "grad_norm": 3.25, + "grad_norm_var": 0.042919921875, + "learning_rate": 0.0001, + "loss": 6.0567, + "loss/crossentropy": 2.661539673805237, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1836584359407425, + "step": 10842 + }, + { + "epoch": 0.338875, + "grad_norm": 3.3125, + "grad_norm_var": 0.040583292643229164, + "learning_rate": 0.0001, + "loss": 5.7153, + "loss/crossentropy": 2.4426932334899902, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17491213977336884, + "step": 10844 + }, + { + "epoch": 0.3389375, + "grad_norm": 3.375, + "grad_norm_var": 0.04104410807291667, + "learning_rate": 0.0001, + "loss": 6.1584, + "loss/crossentropy": 2.711267828941345, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1880771815776825, + "step": 10846 + }, + { + "epoch": 0.339, + "grad_norm": 3.609375, + "grad_norm_var": 0.0428375244140625, + "learning_rate": 0.0001, + "loss": 6.1288, + "loss/crossentropy": 2.648341417312622, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19062668830156326, + "step": 10848 + }, + { + "epoch": 0.3390625, + "grad_norm": 3.078125, + "grad_norm_var": 0.0487945556640625, + "learning_rate": 0.0001, + "loss": 5.5299, + "loss/crossentropy": 2.352954149246216, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17082266509532928, + "step": 10850 + }, + { + "epoch": 0.339125, + "grad_norm": 4.0625, + "grad_norm_var": 0.0731597900390625, + "learning_rate": 0.0001, + "loss": 5.5417, + "loss/crossentropy": 2.261208415031433, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.17024008929729462, + "step": 10852 + }, + { + "epoch": 0.3391875, + "grad_norm": 3.671875, + "grad_norm_var": 2.5806925455729166, + "learning_rate": 0.0001, + "loss": 5.8908, + "loss/crossentropy": 2.4909229278564453, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1868659257888794, + "step": 10854 + }, + { + "epoch": 0.33925, + "grad_norm": 3.265625, + "grad_norm_var": 2.587532552083333, + "learning_rate": 0.0001, + "loss": 6.0128, + "loss/crossentropy": 2.71052086353302, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17983968555927277, + "step": 10856 + }, + { + "epoch": 0.3393125, + "grad_norm": 3.171875, + "grad_norm_var": 2.620417277018229, + "learning_rate": 0.0001, + "loss": 6.1523, + "loss/crossentropy": 2.8265068531036377, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18179305642843246, + "step": 10858 + }, + { + "epoch": 0.339375, + "grad_norm": 3.671875, + "grad_norm_var": 2.6018300374348957, + "learning_rate": 0.0001, + "loss": 5.8658, + "loss/crossentropy": 2.5263173580169678, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1804347187280655, + "step": 10860 + }, + { + "epoch": 0.3394375, + "grad_norm": 3.40625, + "grad_norm_var": 2.62666015625, + "learning_rate": 0.0001, + "loss": 6.0053, + "loss/crossentropy": 2.6483066082000732, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.17945124208927155, + "step": 10862 + }, + { + "epoch": 0.3395, + "grad_norm": 3.28125, + "grad_norm_var": 2.6412923177083334, + "learning_rate": 0.0001, + "loss": 5.9145, + "loss/crossentropy": 2.5227582454681396, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18292731046676636, + "step": 10864 + }, + { + "epoch": 0.3395625, + "grad_norm": 3.5625, + "grad_norm_var": 2.609358723958333, + "learning_rate": 0.0001, + "loss": 5.919, + "loss/crossentropy": 2.577438712120056, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18258970230817795, + "step": 10866 + }, + { + "epoch": 0.339625, + "grad_norm": 3.140625, + "grad_norm_var": 2.622119140625, + "learning_rate": 0.0001, + "loss": 5.9416, + "loss/crossentropy": 2.5979727506637573, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17889202386140823, + "step": 10868 + }, + { + "epoch": 0.3396875, + "grad_norm": 3.109375, + "grad_norm_var": 0.0297760009765625, + "learning_rate": 0.0001, + "loss": 6.0314, + "loss/crossentropy": 2.663851022720337, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.17933289706707, + "step": 10870 + }, + { + "epoch": 0.33975, + "grad_norm": 3.609375, + "grad_norm_var": 0.09868062337239583, + "learning_rate": 0.0001, + "loss": 6.1448, + "loss/crossentropy": 2.641571283340454, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1940707564353943, + "step": 10872 + }, + { + "epoch": 0.3398125, + "grad_norm": 3.546875, + "grad_norm_var": 0.09877827962239584, + "learning_rate": 0.0001, + "loss": 6.0617, + "loss/crossentropy": 2.6150325536727905, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.19153756648302078, + "step": 10874 + }, + { + "epoch": 0.339875, + "grad_norm": 3.40625, + "grad_norm_var": 0.10087483723958333, + "learning_rate": 0.0001, + "loss": 5.9674, + "loss/crossentropy": 2.6188477277755737, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18329627811908722, + "step": 10876 + }, + { + "epoch": 0.3399375, + "grad_norm": 3.390625, + "grad_norm_var": 0.09728902180989583, + "learning_rate": 0.0001, + "loss": 6.1462, + "loss/crossentropy": 2.661772608757019, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19141025841236115, + "step": 10878 + }, + { + "epoch": 0.34, + "grad_norm": 2.9375, + "grad_norm_var": 0.11197916666666667, + "learning_rate": 0.0001, + "loss": 5.636, + "loss/crossentropy": 2.44494891166687, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16871945559978485, + "step": 10880 + }, + { + "epoch": 0.3400625, + "grad_norm": 3.53125, + "grad_norm_var": 0.10836181640625, + "learning_rate": 0.0001, + "loss": 6.0516, + "loss/crossentropy": 2.627223014831543, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18775425106287003, + "step": 10882 + }, + { + "epoch": 0.340125, + "grad_norm": 3.0625, + "grad_norm_var": 0.1162261962890625, + "learning_rate": 0.0001, + "loss": 5.6793, + "loss/crossentropy": 2.494239091873169, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1716337651014328, + "step": 10884 + }, + { + "epoch": 0.3401875, + "grad_norm": 3.109375, + "grad_norm_var": 0.11667378743489583, + "learning_rate": 0.0001, + "loss": 5.4698, + "loss/crossentropy": 2.2702760696411133, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1652650088071823, + "step": 10886 + }, + { + "epoch": 0.34025, + "grad_norm": 3.3125, + "grad_norm_var": 0.04019775390625, + "learning_rate": 0.0001, + "loss": 5.7117, + "loss/crossentropy": 2.4544767141342163, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17181452363729477, + "step": 10888 + }, + { + "epoch": 0.3403125, + "grad_norm": 3.265625, + "grad_norm_var": 0.03330078125, + "learning_rate": 0.0001, + "loss": 5.9917, + "loss/crossentropy": 2.7106869220733643, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17380309849977493, + "step": 10890 + }, + { + "epoch": 0.340375, + "grad_norm": 3.15625, + "grad_norm_var": 0.031981404622395834, + "learning_rate": 0.0001, + "loss": 5.9661, + "loss/crossentropy": 2.6236428022384644, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18268528580665588, + "step": 10892 + }, + { + "epoch": 0.3404375, + "grad_norm": 3.0625, + "grad_norm_var": 0.029124959309895834, + "learning_rate": 0.0001, + "loss": 5.6988, + "loss/crossentropy": 2.512023091316223, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16985370963811874, + "step": 10894 + }, + { + "epoch": 0.3405, + "grad_norm": 3.046875, + "grad_norm_var": 0.0265289306640625, + "learning_rate": 0.0001, + "loss": 5.6424, + "loss/crossentropy": 2.4638726711273193, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16511869430541992, + "step": 10896 + }, + { + "epoch": 0.3405625, + "grad_norm": 3.53125, + "grad_norm_var": 0.024372355143229166, + "learning_rate": 0.0001, + "loss": 6.0874, + "loss/crossentropy": 2.6857768297195435, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18860473483800888, + "step": 10898 + }, + { + "epoch": 0.340625, + "grad_norm": 3.46875, + "grad_norm_var": 0.025516764322916666, + "learning_rate": 0.0001, + "loss": 5.4173, + "loss/crossentropy": 2.2630616426467896, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16893823444843292, + "step": 10900 + }, + { + "epoch": 0.3406875, + "grad_norm": 3.359375, + "grad_norm_var": 0.03925374348958333, + "learning_rate": 0.0001, + "loss": 5.8756, + "loss/crossentropy": 2.5280606746673584, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18162433803081512, + "step": 10902 + }, + { + "epoch": 0.34075, + "grad_norm": 3.109375, + "grad_norm_var": 0.0426910400390625, + "learning_rate": 0.0001, + "loss": 5.9101, + "loss/crossentropy": 2.5396711826324463, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18352441489696503, + "step": 10904 + }, + { + "epoch": 0.3408125, + "grad_norm": 3.0625, + "grad_norm_var": 0.0494537353515625, + "learning_rate": 0.0001, + "loss": 5.6406, + "loss/crossentropy": 2.461556911468506, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1694711446762085, + "step": 10906 + }, + { + "epoch": 0.340875, + "grad_norm": 3.375, + "grad_norm_var": 0.04871317545572917, + "learning_rate": 0.0001, + "loss": 5.9681, + "loss/crossentropy": 2.568337917327881, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18645568192005157, + "step": 10908 + }, + { + "epoch": 0.3409375, + "grad_norm": 3.375, + "grad_norm_var": 0.05250244140625, + "learning_rate": 0.0001, + "loss": 5.7098, + "loss/crossentropy": 2.4167133569717407, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1777435913681984, + "step": 10910 + }, + { + "epoch": 0.341, + "grad_norm": 3.359375, + "grad_norm_var": 0.04472554524739583, + "learning_rate": 0.0001, + "loss": 5.9249, + "loss/crossentropy": 2.5556130409240723, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18380820006132126, + "step": 10912 + }, + { + "epoch": 0.3410625, + "grad_norm": 3.296875, + "grad_norm_var": 0.0411529541015625, + "learning_rate": 0.0001, + "loss": 5.6859, + "loss/crossentropy": 2.3107681274414062, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18086906522512436, + "step": 10914 + }, + { + "epoch": 0.341125, + "grad_norm": 3.25, + "grad_norm_var": 0.040425618489583336, + "learning_rate": 0.0001, + "loss": 5.9311, + "loss/crossentropy": 2.6373904943466187, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.179367333650589, + "step": 10916 + }, + { + "epoch": 0.3411875, + "grad_norm": 3.453125, + "grad_norm_var": 0.03131510416666667, + "learning_rate": 0.0001, + "loss": 5.9823, + "loss/crossentropy": 2.6373625993728638, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.181370347738266, + "step": 10918 + }, + { + "epoch": 0.34125, + "grad_norm": 3.078125, + "grad_norm_var": 0.0240234375, + "learning_rate": 0.0001, + "loss": 5.7492, + "loss/crossentropy": 2.4768972396850586, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1764535903930664, + "step": 10920 + }, + { + "epoch": 0.3413125, + "grad_norm": 3.15625, + "grad_norm_var": 0.0212890625, + "learning_rate": 0.0001, + "loss": 5.8238, + "loss/crossentropy": 2.574054479598999, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1730186939239502, + "step": 10922 + }, + { + "epoch": 0.341375, + "grad_norm": 3.296875, + "grad_norm_var": 0.020719401041666665, + "learning_rate": 0.0001, + "loss": 5.8977, + "loss/crossentropy": 2.562409281730652, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1811838299036026, + "step": 10924 + }, + { + "epoch": 0.3414375, + "grad_norm": 3.359375, + "grad_norm_var": 0.014557902018229167, + "learning_rate": 0.0001, + "loss": 5.9327, + "loss/crossentropy": 2.5949437618255615, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18103672564029694, + "step": 10926 + }, + { + "epoch": 0.3415, + "grad_norm": 3.375, + "grad_norm_var": 0.022102864583333333, + "learning_rate": 0.0001, + "loss": 6.2892, + "loss/crossentropy": 2.779433012008667, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.19238261878490448, + "step": 10928 + }, + { + "epoch": 0.3415625, + "grad_norm": 3.125, + "grad_norm_var": 0.024397786458333334, + "learning_rate": 0.0001, + "loss": 5.7626, + "loss/crossentropy": 2.495705723762512, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17395368218421936, + "step": 10930 + }, + { + "epoch": 0.341625, + "grad_norm": 3.390625, + "grad_norm_var": 0.0246978759765625, + "learning_rate": 0.0001, + "loss": 6.0376, + "loss/crossentropy": 2.7212945222854614, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18123944103717804, + "step": 10932 + }, + { + "epoch": 0.3416875, + "grad_norm": 3.265625, + "grad_norm_var": 0.020539347330729166, + "learning_rate": 0.0001, + "loss": 6.0254, + "loss/crossentropy": 2.6789186000823975, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18347567319869995, + "step": 10934 + }, + { + "epoch": 0.34175, + "grad_norm": 2.84375, + "grad_norm_var": 0.0332672119140625, + "learning_rate": 0.0001, + "loss": 5.7685, + "loss/crossentropy": 2.5864064693450928, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17016415297985077, + "step": 10936 + }, + { + "epoch": 0.3418125, + "grad_norm": 3.328125, + "grad_norm_var": 0.03037109375, + "learning_rate": 0.0001, + "loss": 5.7732, + "loss/crossentropy": 2.477833867073059, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17836246639490128, + "step": 10938 + }, + { + "epoch": 0.341875, + "grad_norm": 3.375, + "grad_norm_var": 0.0384429931640625, + "learning_rate": 0.0001, + "loss": 5.9246, + "loss/crossentropy": 2.5930657386779785, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17807789146900177, + "step": 10940 + }, + { + "epoch": 0.3419375, + "grad_norm": 3.28125, + "grad_norm_var": 0.3951080322265625, + "learning_rate": 0.0001, + "loss": 6.2484, + "loss/crossentropy": 2.8314849138259888, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18895969539880753, + "step": 10942 + }, + { + "epoch": 0.342, + "grad_norm": 3.265625, + "grad_norm_var": 0.39583231608072916, + "learning_rate": 0.0001, + "loss": 5.6654, + "loss/crossentropy": 2.41261088848114, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17371421307325363, + "step": 10944 + }, + { + "epoch": 0.3420625, + "grad_norm": 3.21875, + "grad_norm_var": 0.38682352701822914, + "learning_rate": 0.0001, + "loss": 6.0481, + "loss/crossentropy": 2.647688627243042, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18301014602184296, + "step": 10946 + }, + { + "epoch": 0.342125, + "grad_norm": 3.390625, + "grad_norm_var": 0.38798828125, + "learning_rate": 0.0001, + "loss": 5.9545, + "loss/crossentropy": 2.6143709421157837, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1832272633910179, + "step": 10948 + }, + { + "epoch": 0.3421875, + "grad_norm": 3.234375, + "grad_norm_var": 0.3848297119140625, + "learning_rate": 0.0001, + "loss": 6.2824, + "loss/crossentropy": 2.8192338943481445, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18850116431713104, + "step": 10950 + }, + { + "epoch": 0.34225, + "grad_norm": 4.03125, + "grad_norm_var": 0.37916666666666665, + "learning_rate": 0.0001, + "loss": 5.6463, + "loss/crossentropy": 2.3457504510879517, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17341547459363937, + "step": 10952 + }, + { + "epoch": 0.3423125, + "grad_norm": 3.359375, + "grad_norm_var": 0.38039957682291664, + "learning_rate": 0.0001, + "loss": 5.9078, + "loss/crossentropy": 2.5608900785446167, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18312767893075943, + "step": 10954 + }, + { + "epoch": 0.342375, + "grad_norm": 3.5, + "grad_norm_var": 0.39982808430989586, + "learning_rate": 0.0001, + "loss": 5.9286, + "loss/crossentropy": 2.5840686559677124, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18054546415805817, + "step": 10956 + }, + { + "epoch": 0.3424375, + "grad_norm": 3.3125, + "grad_norm_var": 0.052277628580729166, + "learning_rate": 0.0001, + "loss": 5.7118, + "loss/crossentropy": 2.4785114526748657, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17332758009433746, + "step": 10958 + }, + { + "epoch": 0.3425, + "grad_norm": 3.5, + "grad_norm_var": 0.05308837890625, + "learning_rate": 0.0001, + "loss": 6.422, + "loss/crossentropy": 2.9245123863220215, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.19467034935951233, + "step": 10960 + }, + { + "epoch": 0.3425625, + "grad_norm": 3.21875, + "grad_norm_var": 0.05732320149739583, + "learning_rate": 0.0001, + "loss": 5.6495, + "loss/crossentropy": 2.456456422805786, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16930365562438965, + "step": 10962 + }, + { + "epoch": 0.342625, + "grad_norm": 3.34375, + "grad_norm_var": 0.05842183430989583, + "learning_rate": 0.0001, + "loss": 5.9639, + "loss/crossentropy": 2.673986077308655, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17586512863636017, + "step": 10964 + }, + { + "epoch": 0.3426875, + "grad_norm": 3.34375, + "grad_norm_var": 0.0572265625, + "learning_rate": 0.0001, + "loss": 5.6662, + "loss/crossentropy": 2.3765084743499756, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17701995372772217, + "step": 10966 + }, + { + "epoch": 0.34275, + "grad_norm": 2.984375, + "grad_norm_var": 0.0366119384765625, + "learning_rate": 0.0001, + "loss": 5.7426, + "loss/crossentropy": 2.4702529907226562, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17176322638988495, + "step": 10968 + }, + { + "epoch": 0.3428125, + "grad_norm": 3.0, + "grad_norm_var": 0.042496744791666666, + "learning_rate": 0.0001, + "loss": 6.0317, + "loss/crossentropy": 2.765709638595581, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17386630922555923, + "step": 10970 + }, + { + "epoch": 0.342875, + "grad_norm": 3.078125, + "grad_norm_var": 0.036229451497395836, + "learning_rate": 0.0001, + "loss": 5.8305, + "loss/crossentropy": 2.593336582183838, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17293524742126465, + "step": 10972 + }, + { + "epoch": 0.3429375, + "grad_norm": 3.390625, + "grad_norm_var": 0.037287394205729164, + "learning_rate": 0.0001, + "loss": 5.6782, + "loss/crossentropy": 2.459639310836792, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17107924818992615, + "step": 10974 + }, + { + "epoch": 0.343, + "grad_norm": 3.09375, + "grad_norm_var": 0.03460286458333333, + "learning_rate": 0.0001, + "loss": 5.9007, + "loss/crossentropy": 2.568490982055664, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1808793842792511, + "step": 10976 + }, + { + "epoch": 0.3430625, + "grad_norm": 3.015625, + "grad_norm_var": 0.0361968994140625, + "learning_rate": 0.0001, + "loss": 5.9303, + "loss/crossentropy": 2.644846558570862, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1793234720826149, + "step": 10978 + }, + { + "epoch": 0.343125, + "grad_norm": 2.96875, + "grad_norm_var": 0.0394439697265625, + "learning_rate": 0.0001, + "loss": 5.7084, + "loss/crossentropy": 2.4656275510787964, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17545323073863983, + "step": 10980 + }, + { + "epoch": 0.3431875, + "grad_norm": 3.1875, + "grad_norm_var": 0.03463134765625, + "learning_rate": 0.0001, + "loss": 5.4337, + "loss/crossentropy": 2.3236976861953735, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16255810111761093, + "step": 10982 + }, + { + "epoch": 0.34325, + "grad_norm": 3.75, + "grad_norm_var": 0.0402008056640625, + "learning_rate": 0.0001, + "loss": 5.7879, + "loss/crossentropy": 2.6042559146881104, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1687503159046173, + "step": 10984 + }, + { + "epoch": 0.3433125, + "grad_norm": 3.328125, + "grad_norm_var": 0.0443756103515625, + "learning_rate": 0.0001, + "loss": 5.9716, + "loss/crossentropy": 2.5806045532226562, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1867561936378479, + "step": 10986 + }, + { + "epoch": 0.343375, + "grad_norm": 3.34375, + "grad_norm_var": 0.04302978515625, + "learning_rate": 0.0001, + "loss": 5.8281, + "loss/crossentropy": 2.579994797706604, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17559535056352615, + "step": 10988 + }, + { + "epoch": 0.3434375, + "grad_norm": 3.46875, + "grad_norm_var": 0.0450836181640625, + "learning_rate": 0.0001, + "loss": 6.1501, + "loss/crossentropy": 2.6620121002197266, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19216465950012207, + "step": 10990 + }, + { + "epoch": 0.3435, + "grad_norm": 3.15625, + "grad_norm_var": 0.0447662353515625, + "learning_rate": 0.0001, + "loss": 5.6643, + "loss/crossentropy": 2.463992714881897, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1719801351428032, + "step": 10992 + }, + { + "epoch": 0.3435625, + "grad_norm": 3.34375, + "grad_norm_var": 0.042577107747395836, + "learning_rate": 0.0001, + "loss": 5.8199, + "loss/crossentropy": 2.567610740661621, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1748414933681488, + "step": 10994 + }, + { + "epoch": 0.343625, + "grad_norm": 3.5, + "grad_norm_var": 0.04566650390625, + "learning_rate": 0.0001, + "loss": 6.0374, + "loss/crossentropy": 2.708420515060425, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1813330352306366, + "step": 10996 + }, + { + "epoch": 0.3436875, + "grad_norm": 3.3125, + "grad_norm_var": 0.040314737955729166, + "learning_rate": 0.0001, + "loss": 6.0973, + "loss/crossentropy": 2.697722911834717, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18683434277772903, + "step": 10998 + }, + { + "epoch": 0.34375, + "grad_norm": 3.453125, + "grad_norm_var": 0.023802693684895834, + "learning_rate": 0.0001, + "loss": 5.7794, + "loss/crossentropy": 2.438982844352722, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18325923383235931, + "step": 11000 + }, + { + "epoch": 0.3438125, + "grad_norm": 3.4375, + "grad_norm_var": 0.0238922119140625, + "learning_rate": 0.0001, + "loss": 5.9612, + "loss/crossentropy": 2.6043819189071655, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18177300691604614, + "step": 11002 + }, + { + "epoch": 0.343875, + "grad_norm": 3.34375, + "grad_norm_var": 0.02119140625, + "learning_rate": 0.0001, + "loss": 6.027, + "loss/crossentropy": 2.7204278707504272, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1763620674610138, + "step": 11004 + }, + { + "epoch": 0.3439375, + "grad_norm": 3.421875, + "grad_norm_var": 0.021483357747395834, + "learning_rate": 0.0001, + "loss": 6.086, + "loss/crossentropy": 2.653968095779419, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18539541214704514, + "step": 11006 + }, + { + "epoch": 0.344, + "grad_norm": 2.953125, + "grad_norm_var": 0.025951131184895834, + "learning_rate": 0.0001, + "loss": 5.6879, + "loss/crossentropy": 2.4831135272979736, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17243118584156036, + "step": 11008 + }, + { + "epoch": 0.3440625, + "grad_norm": 3.421875, + "grad_norm_var": 0.026688639322916666, + "learning_rate": 0.0001, + "loss": 5.9452, + "loss/crossentropy": 2.528921961784363, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18616057932376862, + "step": 11010 + }, + { + "epoch": 0.344125, + "grad_norm": 3.21875, + "grad_norm_var": 0.024833170572916667, + "learning_rate": 0.0001, + "loss": 5.7475, + "loss/crossentropy": 2.4258735179901123, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18099413812160492, + "step": 11012 + }, + { + "epoch": 0.3441875, + "grad_norm": 3.015625, + "grad_norm_var": 0.031050618489583334, + "learning_rate": 0.0001, + "loss": 5.8515, + "loss/crossentropy": 2.5265121459960938, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1821117326617241, + "step": 11014 + }, + { + "epoch": 0.34425, + "grad_norm": 3.328125, + "grad_norm_var": 0.0298248291015625, + "learning_rate": 0.0001, + "loss": 5.9604, + "loss/crossentropy": 2.6057002544403076, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18156321346759796, + "step": 11016 + }, + { + "epoch": 0.3443125, + "grad_norm": 2.875, + "grad_norm_var": 0.038248697916666664, + "learning_rate": 0.0001, + "loss": 5.8424, + "loss/crossentropy": 2.6125999689102173, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17493344843387604, + "step": 11018 + }, + { + "epoch": 0.344375, + "grad_norm": 2.921875, + "grad_norm_var": 0.045441691080729166, + "learning_rate": 0.0001, + "loss": 5.5205, + "loss/crossentropy": 2.342848062515259, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16542082279920578, + "step": 11020 + }, + { + "epoch": 0.3444375, + "grad_norm": 3.15625, + "grad_norm_var": 0.04234619140625, + "learning_rate": 0.0001, + "loss": 5.702, + "loss/crossentropy": 2.473252773284912, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17209185659885406, + "step": 11022 + }, + { + "epoch": 0.3445, + "grad_norm": 3.484375, + "grad_norm_var": 0.04439697265625, + "learning_rate": 0.0001, + "loss": 5.9239, + "loss/crossentropy": 2.5366417169570923, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1824759989976883, + "step": 11024 + }, + { + "epoch": 0.3445625, + "grad_norm": 3.140625, + "grad_norm_var": 0.0422515869140625, + "learning_rate": 0.0001, + "loss": 5.8792, + "loss/crossentropy": 2.5807461738586426, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17866993695497513, + "step": 11026 + }, + { + "epoch": 0.344625, + "grad_norm": 3.234375, + "grad_norm_var": 0.028511555989583333, + "learning_rate": 0.0001, + "loss": 5.5309, + "loss/crossentropy": 2.296918511390686, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1718367487192154, + "step": 11028 + }, + { + "epoch": 0.3446875, + "grad_norm": 3.34375, + "grad_norm_var": 0.026463826497395832, + "learning_rate": 0.0001, + "loss": 6.0491, + "loss/crossentropy": 2.704776167869568, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18326442688703537, + "step": 11030 + }, + { + "epoch": 0.34475, + "grad_norm": 3.1875, + "grad_norm_var": 0.025162760416666666, + "learning_rate": 0.0001, + "loss": 6.0549, + "loss/crossentropy": 2.7294983863830566, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17941391468048096, + "step": 11032 + }, + { + "epoch": 0.3448125, + "grad_norm": 3.203125, + "grad_norm_var": 0.01793212890625, + "learning_rate": 0.0001, + "loss": 6.1078, + "loss/crossentropy": 2.755513548851013, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.18522819131612778, + "step": 11034 + }, + { + "epoch": 0.344875, + "grad_norm": 3.0, + "grad_norm_var": 0.013016764322916667, + "learning_rate": 0.0001, + "loss": 5.6999, + "loss/crossentropy": 2.4434242248535156, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17330323159694672, + "step": 11036 + }, + { + "epoch": 0.3449375, + "grad_norm": 3.71875, + "grad_norm_var": 0.031217447916666665, + "learning_rate": 0.0001, + "loss": 6.0105, + "loss/crossentropy": 2.668699264526367, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18301181495189667, + "step": 11038 + }, + { + "epoch": 0.345, + "grad_norm": 3.046875, + "grad_norm_var": 0.0318756103515625, + "learning_rate": 0.0001, + "loss": 5.3591, + "loss/crossentropy": 2.254475712776184, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16046421229839325, + "step": 11040 + }, + { + "epoch": 0.3450625, + "grad_norm": 3.09375, + "grad_norm_var": 0.03238525390625, + "learning_rate": 0.0001, + "loss": 5.6728, + "loss/crossentropy": 2.4203919172286987, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17914804816246033, + "step": 11042 + }, + { + "epoch": 0.345125, + "grad_norm": 3.4375, + "grad_norm_var": 0.30448811848958335, + "learning_rate": 0.0001, + "loss": 5.9763, + "loss/crossentropy": 2.5770797729492188, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18367434293031693, + "step": 11044 + }, + { + "epoch": 0.3451875, + "grad_norm": 3.078125, + "grad_norm_var": 0.3071523030598958, + "learning_rate": 0.0001, + "loss": 5.7715, + "loss/crossentropy": 2.4922112226486206, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1771462932229042, + "step": 11046 + }, + { + "epoch": 0.34525, + "grad_norm": 3.515625, + "grad_norm_var": 0.3077138264973958, + "learning_rate": 0.0001, + "loss": 5.9733, + "loss/crossentropy": 2.598994731903076, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18039784580469131, + "step": 11048 + }, + { + "epoch": 0.3453125, + "grad_norm": 6.21875, + "grad_norm_var": 0.8429107666015625, + "learning_rate": 0.0001, + "loss": 6.5324, + "loss/crossentropy": 2.894778251647949, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.20126450061798096, + "step": 11050 + }, + { + "epoch": 0.345375, + "grad_norm": 3.390625, + "grad_norm_var": 0.8036946614583333, + "learning_rate": 0.0001, + "loss": 6.0874, + "loss/crossentropy": 2.665947198867798, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18980085104703903, + "step": 11052 + }, + { + "epoch": 0.3454375, + "grad_norm": 3.40625, + "grad_norm_var": 0.7823069254557292, + "learning_rate": 0.0001, + "loss": 5.9363, + "loss/crossentropy": 2.5732173919677734, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18317849189043045, + "step": 11054 + }, + { + "epoch": 0.3455, + "grad_norm": 3.28125, + "grad_norm_var": 0.7324045817057292, + "learning_rate": 0.0001, + "loss": 5.8919, + "loss/crossentropy": 2.570802092552185, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17547086626291275, + "step": 11056 + }, + { + "epoch": 0.3455625, + "grad_norm": 3.40625, + "grad_norm_var": 0.7168121337890625, + "learning_rate": 0.0001, + "loss": 6.2789, + "loss/crossentropy": 2.7660707235336304, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19425665587186813, + "step": 11058 + }, + { + "epoch": 0.345625, + "grad_norm": 4.03125, + "grad_norm_var": 0.5719553629557291, + "learning_rate": 0.0001, + "loss": 6.3759, + "loss/crossentropy": 2.8103519678115845, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19640257209539413, + "step": 11060 + }, + { + "epoch": 0.3456875, + "grad_norm": 3.71875, + "grad_norm_var": 0.5447336832682291, + "learning_rate": 0.0001, + "loss": 6.3307, + "loss/crossentropy": 2.857063055038452, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18955209106206894, + "step": 11062 + }, + { + "epoch": 0.34575, + "grad_norm": 3.203125, + "grad_norm_var": 0.5674967447916667, + "learning_rate": 0.0001, + "loss": 6.0962, + "loss/crossentropy": 2.7202454805374146, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18485981225967407, + "step": 11064 + }, + { + "epoch": 0.3458125, + "grad_norm": 4.0, + "grad_norm_var": 0.07200113932291667, + "learning_rate": 0.0001, + "loss": 5.8249, + "loss/crossentropy": 2.524648666381836, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1776851788163185, + "step": 11066 + }, + { + "epoch": 0.345875, + "grad_norm": 3.484375, + "grad_norm_var": 0.06721903483072916, + "learning_rate": 0.0001, + "loss": 6.1936, + "loss/crossentropy": 2.8223618268966675, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18243858963251114, + "step": 11068 + }, + { + "epoch": 0.3459375, + "grad_norm": 3.5625, + "grad_norm_var": 0.08352457682291667, + "learning_rate": 0.0001, + "loss": 5.9817, + "loss/crossentropy": 2.6173232793807983, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18057696521282196, + "step": 11070 + }, + { + "epoch": 0.346, + "grad_norm": 3.953125, + "grad_norm_var": 0.1038970947265625, + "learning_rate": 0.0001, + "loss": 5.9832, + "loss/crossentropy": 2.5125160217285156, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19120532274246216, + "step": 11072 + }, + { + "epoch": 0.3460625, + "grad_norm": 3.46875, + "grad_norm_var": 0.1014312744140625, + "learning_rate": 0.0001, + "loss": 5.9291, + "loss/crossentropy": 2.5588573217391968, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1823352500796318, + "step": 11074 + }, + { + "epoch": 0.346125, + "grad_norm": 3.53125, + "grad_norm_var": 0.08059794108072917, + "learning_rate": 0.0001, + "loss": 5.9067, + "loss/crossentropy": 2.497497797012329, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.1807660534977913, + "step": 11076 + }, + { + "epoch": 0.3461875, + "grad_norm": 2.953125, + "grad_norm_var": 0.09683329264322917, + "learning_rate": 0.0001, + "loss": 5.4389, + "loss/crossentropy": 2.295078158378601, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1636056751012802, + "step": 11078 + }, + { + "epoch": 0.34625, + "grad_norm": 3.40625, + "grad_norm_var": 0.09576822916666666, + "learning_rate": 0.0001, + "loss": 6.0454, + "loss/crossentropy": 2.6146689653396606, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.18213783204555511, + "step": 11080 + }, + { + "epoch": 0.3463125, + "grad_norm": 2.96875, + "grad_norm_var": 0.08359273274739583, + "learning_rate": 0.0001, + "loss": 5.8011, + "loss/crossentropy": 2.588598370552063, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16929897665977478, + "step": 11082 + }, + { + "epoch": 0.346375, + "grad_norm": 3.296875, + "grad_norm_var": 0.08227437337239583, + "learning_rate": 0.0001, + "loss": 5.735, + "loss/crossentropy": 2.449593663215637, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17893542349338531, + "step": 11084 + }, + { + "epoch": 0.3464375, + "grad_norm": 3.28125, + "grad_norm_var": 0.07458394368489583, + "learning_rate": 0.0001, + "loss": 6.0398, + "loss/crossentropy": 2.669768452644348, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1858329400420189, + "step": 11086 + }, + { + "epoch": 0.3465, + "grad_norm": 3.28125, + "grad_norm_var": 0.056639607747395834, + "learning_rate": 0.0001, + "loss": 5.6926, + "loss/crossentropy": 2.4738898277282715, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16952742636203766, + "step": 11088 + }, + { + "epoch": 0.3465625, + "grad_norm": 3.390625, + "grad_norm_var": 0.056559244791666664, + "learning_rate": 0.0001, + "loss": 5.934, + "loss/crossentropy": 2.592034935951233, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17989840358495712, + "step": 11090 + }, + { + "epoch": 0.346625, + "grad_norm": 3.25, + "grad_norm_var": 0.044189453125, + "learning_rate": 0.0001, + "loss": 6.0853, + "loss/crossentropy": 2.683542847633362, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18665538728237152, + "step": 11092 + }, + { + "epoch": 0.3466875, + "grad_norm": 3.03125, + "grad_norm_var": 0.039403279622395836, + "learning_rate": 0.0001, + "loss": 5.8121, + "loss/crossentropy": 2.5736337900161743, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17345602065324783, + "step": 11094 + }, + { + "epoch": 0.34675, + "grad_norm": 3.09375, + "grad_norm_var": 0.03297119140625, + "learning_rate": 0.0001, + "loss": 6.08, + "loss/crossentropy": 2.7127881050109863, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18320278823375702, + "step": 11096 + }, + { + "epoch": 0.3468125, + "grad_norm": 3.390625, + "grad_norm_var": 0.02799072265625, + "learning_rate": 0.0001, + "loss": 5.8826, + "loss/crossentropy": 2.57907497882843, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17957322299480438, + "step": 11098 + }, + { + "epoch": 0.346875, + "grad_norm": 3.265625, + "grad_norm_var": 0.027587890625, + "learning_rate": 0.0001, + "loss": 5.502, + "loss/crossentropy": 2.2725518941879272, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.16981849074363708, + "step": 11100 + }, + { + "epoch": 0.3469375, + "grad_norm": 3.1875, + "grad_norm_var": 0.016893513997395835, + "learning_rate": 0.0001, + "loss": 5.6568, + "loss/crossentropy": 2.4335155487060547, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17311019450426102, + "step": 11102 + }, + { + "epoch": 0.347, + "grad_norm": 3.296875, + "grad_norm_var": 0.015478515625, + "learning_rate": 0.0001, + "loss": 6.0785, + "loss/crossentropy": 2.749638795852661, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.18288739770650864, + "step": 11104 + }, + { + "epoch": 0.3470625, + "grad_norm": 3.265625, + "grad_norm_var": 0.0157623291015625, + "learning_rate": 0.0001, + "loss": 5.9349, + "loss/crossentropy": 2.621717691421509, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17819301784038544, + "step": 11106 + }, + { + "epoch": 0.347125, + "grad_norm": 3.140625, + "grad_norm_var": 0.011872355143229167, + "learning_rate": 0.0001, + "loss": 5.6888, + "loss/crossentropy": 2.4195992946624756, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17730767279863358, + "step": 11108 + }, + { + "epoch": 0.3471875, + "grad_norm": 3.078125, + "grad_norm_var": 0.010965983072916666, + "learning_rate": 0.0001, + "loss": 5.7349, + "loss/crossentropy": 2.514296054840088, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17010922729969025, + "step": 11110 + }, + { + "epoch": 0.34725, + "grad_norm": 3.125, + "grad_norm_var": 0.0102691650390625, + "learning_rate": 0.0001, + "loss": 5.4622, + "loss/crossentropy": 2.3277621269226074, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16422312706708908, + "step": 11112 + }, + { + "epoch": 0.3473125, + "grad_norm": 3.53125, + "grad_norm_var": 0.036530558268229166, + "learning_rate": 0.0001, + "loss": 5.9148, + "loss/crossentropy": 2.5701433420181274, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18095263093709946, + "step": 11114 + }, + { + "epoch": 0.347375, + "grad_norm": 3.421875, + "grad_norm_var": 0.0496734619140625, + "learning_rate": 0.0001, + "loss": 5.6771, + "loss/crossentropy": 2.4177253246307373, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17281214892864227, + "step": 11116 + }, + { + "epoch": 0.3474375, + "grad_norm": 3.578125, + "grad_norm_var": 0.057027180989583336, + "learning_rate": 0.0001, + "loss": 6.002, + "loss/crossentropy": 2.5987123250961304, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1844726875424385, + "step": 11118 + }, + { + "epoch": 0.3475, + "grad_norm": 2.953125, + "grad_norm_var": 0.061295572916666666, + "learning_rate": 0.0001, + "loss": 5.6562, + "loss/crossentropy": 2.4043514728546143, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17127804458141327, + "step": 11120 + }, + { + "epoch": 0.3475625, + "grad_norm": 3.328125, + "grad_norm_var": 0.05907796223958333, + "learning_rate": 0.0001, + "loss": 6.0375, + "loss/crossentropy": 2.6463372707366943, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18403904139995575, + "step": 11122 + }, + { + "epoch": 0.347625, + "grad_norm": 3.53125, + "grad_norm_var": 0.0587554931640625, + "learning_rate": 0.0001, + "loss": 5.7592, + "loss/crossentropy": 2.4847307205200195, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1758800819516182, + "step": 11124 + }, + { + "epoch": 0.3476875, + "grad_norm": 3.25, + "grad_norm_var": 0.0590240478515625, + "learning_rate": 0.0001, + "loss": 6.1016, + "loss/crossentropy": 2.643786668777466, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1910940557718277, + "step": 11126 + }, + { + "epoch": 0.34775, + "grad_norm": 3.015625, + "grad_norm_var": 0.06352437337239583, + "learning_rate": 0.0001, + "loss": 5.7997, + "loss/crossentropy": 2.6193262338638306, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17116473615169525, + "step": 11128 + }, + { + "epoch": 0.3478125, + "grad_norm": 3.328125, + "grad_norm_var": 0.05071207682291667, + "learning_rate": 0.0001, + "loss": 5.9549, + "loss/crossentropy": 2.670639753341675, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17764153331518173, + "step": 11130 + }, + { + "epoch": 0.347875, + "grad_norm": 3.21875, + "grad_norm_var": 0.03535868326822917, + "learning_rate": 0.0001, + "loss": 5.7988, + "loss/crossentropy": 2.5086302757263184, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17940445989370346, + "step": 11132 + }, + { + "epoch": 0.3479375, + "grad_norm": 3.515625, + "grad_norm_var": 0.03955790201822917, + "learning_rate": 0.0001, + "loss": 6.3464, + "loss/crossentropy": 2.8006097078323364, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19793908298015594, + "step": 11134 + }, + { + "epoch": 0.348, + "grad_norm": 3.078125, + "grad_norm_var": 0.03473307291666667, + "learning_rate": 0.0001, + "loss": 5.4672, + "loss/crossentropy": 2.351517081260681, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16547825932502747, + "step": 11136 + }, + { + "epoch": 0.3480625, + "grad_norm": 3.03125, + "grad_norm_var": 0.0445953369140625, + "learning_rate": 0.0001, + "loss": 5.7162, + "loss/crossentropy": 2.5065797567367554, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17213500291109085, + "step": 11138 + }, + { + "epoch": 0.348125, + "grad_norm": 3.234375, + "grad_norm_var": 0.05287984212239583, + "learning_rate": 0.0001, + "loss": 5.7799, + "loss/crossentropy": 2.4705816507339478, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.17233440279960632, + "step": 11140 + }, + { + "epoch": 0.3481875, + "grad_norm": 3.296875, + "grad_norm_var": 0.04781901041666667, + "learning_rate": 0.0001, + "loss": 6.0974, + "loss/crossentropy": 2.680031180381775, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18783296644687653, + "step": 11142 + }, + { + "epoch": 0.34825, + "grad_norm": 3.5625, + "grad_norm_var": 0.04947509765625, + "learning_rate": 0.0001, + "loss": 6.0563, + "loss/crossentropy": 2.6336668729782104, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18405906111001968, + "step": 11144 + }, + { + "epoch": 0.3483125, + "grad_norm": 2.96875, + "grad_norm_var": 0.0525787353515625, + "learning_rate": 0.0001, + "loss": 5.5608, + "loss/crossentropy": 2.3576338291168213, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1714879646897316, + "step": 11146 + }, + { + "epoch": 0.348375, + "grad_norm": 3.296875, + "grad_norm_var": 0.05322977701822917, + "learning_rate": 0.0001, + "loss": 5.7678, + "loss/crossentropy": 2.5067286491394043, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17493688315153122, + "step": 11148 + }, + { + "epoch": 0.3484375, + "grad_norm": 3.25, + "grad_norm_var": 0.054459635416666666, + "learning_rate": 0.0001, + "loss": 5.9017, + "loss/crossentropy": 2.5168557167053223, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18418489396572113, + "step": 11150 + }, + { + "epoch": 0.3485, + "grad_norm": 3.546875, + "grad_norm_var": 0.05573628743489583, + "learning_rate": 0.0001, + "loss": 5.9392, + "loss/crossentropy": 2.6039544343948364, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18195894360542297, + "step": 11152 + }, + { + "epoch": 0.3485625, + "grad_norm": 3.296875, + "grad_norm_var": 0.04540608723958333, + "learning_rate": 0.0001, + "loss": 6.2064, + "loss/crossentropy": 2.760994553565979, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18789827078580856, + "step": 11154 + }, + { + "epoch": 0.348625, + "grad_norm": 3.828125, + "grad_norm_var": 0.06337483723958333, + "learning_rate": 0.0001, + "loss": 6.1188, + "loss/crossentropy": 2.54991352558136, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.19790344685316086, + "step": 11156 + }, + { + "epoch": 0.3486875, + "grad_norm": 3.34375, + "grad_norm_var": 0.0599761962890625, + "learning_rate": 0.0001, + "loss": 5.7801, + "loss/crossentropy": 2.4646570682525635, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17724335938692093, + "step": 11158 + }, + { + "epoch": 0.34875, + "grad_norm": 4.40625, + "grad_norm_var": 0.114208984375, + "learning_rate": 0.0001, + "loss": 6.0655, + "loss/crossentropy": 2.5859330892562866, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19014528393745422, + "step": 11160 + }, + { + "epoch": 0.3488125, + "grad_norm": 3.96875, + "grad_norm_var": 0.10943094889322917, + "learning_rate": 0.0001, + "loss": 6.1698, + "loss/crossentropy": 2.6557672023773193, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.19163348525762558, + "step": 11162 + }, + { + "epoch": 0.348875, + "grad_norm": 3.859375, + "grad_norm_var": 0.11806233723958333, + "learning_rate": 0.0001, + "loss": 6.1783, + "loss/crossentropy": 2.7903772592544556, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1844942718744278, + "step": 11164 + }, + { + "epoch": 0.3489375, + "grad_norm": 4.15625, + "grad_norm_var": 0.13076070149739583, + "learning_rate": 0.0001, + "loss": 5.9133, + "loss/crossentropy": 2.4920116662979126, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1901763305068016, + "step": 11166 + }, + { + "epoch": 0.349, + "grad_norm": 3.359375, + "grad_norm_var": 0.13056233723958333, + "learning_rate": 0.0001, + "loss": 5.9955, + "loss/crossentropy": 2.6160982847213745, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.17973970621824265, + "step": 11168 + }, + { + "epoch": 0.3490625, + "grad_norm": 3.546875, + "grad_norm_var": 0.14177958170572916, + "learning_rate": 0.0001, + "loss": 5.8252, + "loss/crossentropy": 2.562760353088379, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1758510321378708, + "step": 11170 + }, + { + "epoch": 0.349125, + "grad_norm": 3.109375, + "grad_norm_var": 0.16461181640625, + "learning_rate": 0.0001, + "loss": 5.5451, + "loss/crossentropy": 2.348025679588318, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17048463970422745, + "step": 11172 + }, + { + "epoch": 0.3491875, + "grad_norm": 3.390625, + "grad_norm_var": 0.172119140625, + "learning_rate": 0.0001, + "loss": 5.9302, + "loss/crossentropy": 2.6501015424728394, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17722531408071518, + "step": 11174 + }, + { + "epoch": 0.34925, + "grad_norm": 3.28125, + "grad_norm_var": 0.1240142822265625, + "learning_rate": 0.0001, + "loss": 5.4685, + "loss/crossentropy": 2.325492262840271, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16390928626060486, + "step": 11176 + }, + { + "epoch": 0.3493125, + "grad_norm": 3.203125, + "grad_norm_var": 0.11184794108072917, + "learning_rate": 0.0001, + "loss": 6.2105, + "loss/crossentropy": 2.7702953815460205, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1889393851161003, + "step": 11178 + }, + { + "epoch": 0.349375, + "grad_norm": 3.546875, + "grad_norm_var": 0.0923980712890625, + "learning_rate": 0.0001, + "loss": 5.8614, + "loss/crossentropy": 2.550577998161316, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1818677932024002, + "step": 11180 + }, + { + "epoch": 0.3494375, + "grad_norm": 3.453125, + "grad_norm_var": 0.051493326822916664, + "learning_rate": 0.0001, + "loss": 5.9858, + "loss/crossentropy": 2.600010871887207, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18467383831739426, + "step": 11182 + }, + { + "epoch": 0.3495, + "grad_norm": 3.578125, + "grad_norm_var": 0.06024983723958333, + "learning_rate": 0.0001, + "loss": 6.2648, + "loss/crossentropy": 2.7478842735290527, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19387764483690262, + "step": 11184 + }, + { + "epoch": 0.3495625, + "grad_norm": 3.5625, + "grad_norm_var": 0.056864420572916664, + "learning_rate": 0.0001, + "loss": 5.759, + "loss/crossentropy": 2.4200514554977417, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1753026694059372, + "step": 11186 + }, + { + "epoch": 0.349625, + "grad_norm": 3.34375, + "grad_norm_var": 0.047728474934895834, + "learning_rate": 0.0001, + "loss": 5.8962, + "loss/crossentropy": 2.652923583984375, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17393695563077927, + "step": 11188 + }, + { + "epoch": 0.3496875, + "grad_norm": 3.453125, + "grad_norm_var": 0.04353739420572917, + "learning_rate": 0.0001, + "loss": 5.4828, + "loss/crossentropy": 2.2370325326919556, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.16676458716392517, + "step": 11190 + }, + { + "epoch": 0.34975, + "grad_norm": 3.234375, + "grad_norm_var": 0.04783426920572917, + "learning_rate": 0.0001, + "loss": 5.9645, + "loss/crossentropy": 2.518808126449585, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1894880086183548, + "step": 11192 + }, + { + "epoch": 0.3498125, + "grad_norm": 3.46875, + "grad_norm_var": 0.038426717122395836, + "learning_rate": 0.0001, + "loss": 5.8168, + "loss/crossentropy": 2.4844318628311157, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17894314229488373, + "step": 11194 + }, + { + "epoch": 0.349875, + "grad_norm": 3.578125, + "grad_norm_var": 0.03642578125, + "learning_rate": 0.0001, + "loss": 6.2959, + "loss/crossentropy": 2.748618245124817, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19730991125106812, + "step": 11196 + }, + { + "epoch": 0.3499375, + "grad_norm": 5.78125, + "grad_norm_var": 0.3701568603515625, + "learning_rate": 0.0001, + "loss": 5.8812, + "loss/crossentropy": 2.458068370819092, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18528541177511215, + "step": 11198 + }, + { + "epoch": 0.35, + "grad_norm": 3.484375, + "grad_norm_var": 0.37374674479166664, + "learning_rate": 0.0001, + "loss": 5.9547, + "loss/crossentropy": 2.5829252004623413, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1863955408334732, + "step": 11200 + }, + { + "epoch": 0.3500625, + "grad_norm": 4.125, + "grad_norm_var": 0.38059895833333335, + "learning_rate": 0.0001, + "loss": 6.2833, + "loss/crossentropy": 2.7533146142959595, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19557969272136688, + "step": 11202 + }, + { + "epoch": 0.350125, + "grad_norm": 3.4375, + "grad_norm_var": 0.363525390625, + "learning_rate": 0.0001, + "loss": 5.8647, + "loss/crossentropy": 2.588066577911377, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17375587671995163, + "step": 11204 + }, + { + "epoch": 0.3501875, + "grad_norm": 3.25, + "grad_norm_var": 0.37858784993489586, + "learning_rate": 0.0001, + "loss": 6.1644, + "loss/crossentropy": 2.754800796508789, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18470770865678787, + "step": 11206 + }, + { + "epoch": 0.35025, + "grad_norm": 3.515625, + "grad_norm_var": 0.36643778483072914, + "learning_rate": 0.0001, + "loss": 5.9204, + "loss/crossentropy": 2.622808814048767, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1793663278222084, + "step": 11208 + }, + { + "epoch": 0.3503125, + "grad_norm": 3.265625, + "grad_norm_var": 0.3826568603515625, + "learning_rate": 0.0001, + "loss": 5.8128, + "loss/crossentropy": 2.534000873565674, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17632145434617996, + "step": 11210 + }, + { + "epoch": 0.350375, + "grad_norm": 3.421875, + "grad_norm_var": 0.38606363932291665, + "learning_rate": 0.0001, + "loss": 6.2683, + "loss/crossentropy": 2.803617000579834, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19177913665771484, + "step": 11212 + }, + { + "epoch": 0.3504375, + "grad_norm": 4.15625, + "grad_norm_var": 0.07792561848958333, + "learning_rate": 0.0001, + "loss": 5.7733, + "loss/crossentropy": 2.41003954410553, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.17499523609876633, + "step": 11214 + }, + { + "epoch": 0.3505, + "grad_norm": 3.359375, + "grad_norm_var": 0.08134663899739583, + "learning_rate": 0.0001, + "loss": 6.0462, + "loss/crossentropy": 2.616556763648987, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18906167149543762, + "step": 11216 + }, + { + "epoch": 0.3505625, + "grad_norm": 3.234375, + "grad_norm_var": 0.05584309895833333, + "learning_rate": 0.0001, + "loss": 5.9943, + "loss/crossentropy": 2.5787479877471924, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1853073462843895, + "step": 11218 + }, + { + "epoch": 0.350625, + "grad_norm": 3.328125, + "grad_norm_var": 0.0581451416015625, + "learning_rate": 0.0001, + "loss": 5.8301, + "loss/crossentropy": 2.56949520111084, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17606385052204132, + "step": 11220 + }, + { + "epoch": 0.3506875, + "grad_norm": 3.4375, + "grad_norm_var": 0.062483723958333334, + "learning_rate": 0.0001, + "loss": 5.8622, + "loss/crossentropy": 2.6186574697494507, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17240316420793533, + "step": 11222 + }, + { + "epoch": 0.35075, + "grad_norm": 3.140625, + "grad_norm_var": 0.06483968098958333, + "learning_rate": 0.0001, + "loss": 5.9472, + "loss/crossentropy": 2.6057368516921997, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17946264147758484, + "step": 11224 + }, + { + "epoch": 0.3508125, + "grad_norm": 3.078125, + "grad_norm_var": 0.06871337890625, + "learning_rate": 0.0001, + "loss": 6.3552, + "loss/crossentropy": 2.9567378759384155, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18516208976507187, + "step": 11226 + }, + { + "epoch": 0.350875, + "grad_norm": 4.0625, + "grad_norm_var": 0.09927978515625, + "learning_rate": 0.0001, + "loss": 5.7293, + "loss/crossentropy": 2.478781580924988, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17505527287721634, + "step": 11228 + }, + { + "epoch": 0.3509375, + "grad_norm": 3.234375, + "grad_norm_var": 0.06197001139322917, + "learning_rate": 0.0001, + "loss": 5.6721, + "loss/crossentropy": 2.4796438217163086, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16728852689266205, + "step": 11230 + }, + { + "epoch": 0.351, + "grad_norm": 3.3125, + "grad_norm_var": 0.0557281494140625, + "learning_rate": 0.0001, + "loss": 5.7428, + "loss/crossentropy": 2.528485894203186, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1686977818608284, + "step": 11232 + }, + { + "epoch": 0.3510625, + "grad_norm": 3.203125, + "grad_norm_var": 0.0560211181640625, + "learning_rate": 0.0001, + "loss": 5.9358, + "loss/crossentropy": 2.5955309867858887, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18168071657419205, + "step": 11234 + }, + { + "epoch": 0.351125, + "grad_norm": 2.921875, + "grad_norm_var": 0.06412353515625, + "learning_rate": 0.0001, + "loss": 5.6136, + "loss/crossentropy": 2.4391366243362427, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16861621290445328, + "step": 11236 + }, + { + "epoch": 0.3511875, + "grad_norm": 3.1875, + "grad_norm_var": 0.06402587890625, + "learning_rate": 0.0001, + "loss": 5.7673, + "loss/crossentropy": 2.608486533164978, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1686142310500145, + "step": 11238 + }, + { + "epoch": 0.35125, + "grad_norm": 3.5625, + "grad_norm_var": 0.06883036295572917, + "learning_rate": 0.0001, + "loss": 5.9049, + "loss/crossentropy": 2.573819637298584, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17998070269823074, + "step": 11240 + }, + { + "epoch": 0.3513125, + "grad_norm": 3.546875, + "grad_norm_var": 0.07077534993489583, + "learning_rate": 0.0001, + "loss": 5.8188, + "loss/crossentropy": 2.440657377243042, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18117434531450272, + "step": 11242 + }, + { + "epoch": 0.351375, + "grad_norm": 3.546875, + "grad_norm_var": 0.034830729166666664, + "learning_rate": 0.0001, + "loss": 6.0702, + "loss/crossentropy": 2.7077648639678955, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18272359669208527, + "step": 11244 + }, + { + "epoch": 0.3514375, + "grad_norm": 3.390625, + "grad_norm_var": 0.0373199462890625, + "learning_rate": 0.0001, + "loss": 5.7257, + "loss/crossentropy": 2.380261778831482, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18180926144123077, + "step": 11246 + }, + { + "epoch": 0.3515, + "grad_norm": 3.234375, + "grad_norm_var": 0.03427327473958333, + "learning_rate": 0.0001, + "loss": 6.2033, + "loss/crossentropy": 2.7489737272262573, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.189968541264534, + "step": 11248 + }, + { + "epoch": 0.3515625, + "grad_norm": 3.40625, + "grad_norm_var": 0.034154256184895836, + "learning_rate": 0.0001, + "loss": 5.8867, + "loss/crossentropy": 2.6347585916519165, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1720653623342514, + "step": 11250 + }, + { + "epoch": 0.351625, + "grad_norm": 3.65625, + "grad_norm_var": 0.033610026041666664, + "learning_rate": 0.0001, + "loss": 5.5141, + "loss/crossentropy": 2.294405937194824, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16962312906980515, + "step": 11252 + }, + { + "epoch": 0.3516875, + "grad_norm": 3.46875, + "grad_norm_var": 0.020588175455729166, + "learning_rate": 0.0001, + "loss": 5.6743, + "loss/crossentropy": 2.416317582130432, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1695522665977478, + "step": 11254 + }, + { + "epoch": 0.35175, + "grad_norm": 3.34375, + "grad_norm_var": 0.0200347900390625, + "learning_rate": 0.0001, + "loss": 5.6698, + "loss/crossentropy": 2.437352418899536, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17363091558218002, + "step": 11256 + }, + { + "epoch": 0.3518125, + "grad_norm": 2.90625, + "grad_norm_var": 0.04293212890625, + "learning_rate": 0.0001, + "loss": 5.5666, + "loss/crossentropy": 2.447490692138672, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16581828147172928, + "step": 11258 + }, + { + "epoch": 0.351875, + "grad_norm": 3.34375, + "grad_norm_var": 0.04511311848958333, + "learning_rate": 0.0001, + "loss": 5.714, + "loss/crossentropy": 2.4743552207946777, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17201179265975952, + "step": 11260 + }, + { + "epoch": 0.3519375, + "grad_norm": 3.359375, + "grad_norm_var": 0.042780558268229164, + "learning_rate": 0.0001, + "loss": 5.7087, + "loss/crossentropy": 2.4358839988708496, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17533152550458908, + "step": 11262 + }, + { + "epoch": 0.352, + "grad_norm": 3.40625, + "grad_norm_var": 0.04302978515625, + "learning_rate": 0.0001, + "loss": 5.9783, + "loss/crossentropy": 2.613458275794983, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18062026798725128, + "step": 11264 + }, + { + "epoch": 0.3520625, + "grad_norm": 3.484375, + "grad_norm_var": 0.0494781494140625, + "learning_rate": 0.0001, + "loss": 6.0169, + "loss/crossentropy": 2.6738606691360474, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18235048651695251, + "step": 11266 + }, + { + "epoch": 0.352125, + "grad_norm": 3.25, + "grad_norm_var": 0.030757649739583334, + "learning_rate": 0.0001, + "loss": 5.932, + "loss/crossentropy": 2.573430895805359, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17921525239944458, + "step": 11268 + }, + { + "epoch": 0.3521875, + "grad_norm": 3.078125, + "grad_norm_var": 0.02935791015625, + "learning_rate": 0.0001, + "loss": 6.0359, + "loss/crossentropy": 2.6839070320129395, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1812937706708908, + "step": 11270 + }, + { + "epoch": 0.35225, + "grad_norm": 3.421875, + "grad_norm_var": 0.030907185872395833, + "learning_rate": 0.0001, + "loss": 6.0565, + "loss/crossentropy": 2.701889157295227, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18155021220445633, + "step": 11272 + }, + { + "epoch": 0.3523125, + "grad_norm": 3.21875, + "grad_norm_var": 0.0376617431640625, + "learning_rate": 0.0001, + "loss": 5.5597, + "loss/crossentropy": 2.294955849647522, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17335163056850433, + "step": 11274 + }, + { + "epoch": 0.352375, + "grad_norm": 3.1875, + "grad_norm_var": 0.03697509765625, + "learning_rate": 0.0001, + "loss": 5.8427, + "loss/crossentropy": 2.4607044458389282, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18351703882217407, + "step": 11276 + }, + { + "epoch": 0.3524375, + "grad_norm": 3.28125, + "grad_norm_var": 0.036604817708333334, + "learning_rate": 0.0001, + "loss": 5.9594, + "loss/crossentropy": 2.536848306655884, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18873773515224457, + "step": 11278 + }, + { + "epoch": 0.3525, + "grad_norm": 3.46875, + "grad_norm_var": 0.037385050455729166, + "learning_rate": 0.0001, + "loss": 6.1033, + "loss/crossentropy": 2.7329437732696533, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1827412247657776, + "step": 11280 + }, + { + "epoch": 0.3525625, + "grad_norm": 3.65625, + "grad_norm_var": 0.0367095947265625, + "learning_rate": 0.0001, + "loss": 6.0466, + "loss/crossentropy": 2.6824004650115967, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.17860594391822815, + "step": 11282 + }, + { + "epoch": 0.352625, + "grad_norm": 3.453125, + "grad_norm_var": 0.0357574462890625, + "learning_rate": 0.0001, + "loss": 5.8863, + "loss/crossentropy": 2.61265230178833, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1738537773489952, + "step": 11284 + }, + { + "epoch": 0.3526875, + "grad_norm": 3.40625, + "grad_norm_var": 0.04616597493489583, + "learning_rate": 0.0001, + "loss": 6.1326, + "loss/crossentropy": 2.674401044845581, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.18683121353387833, + "step": 11286 + }, + { + "epoch": 0.35275, + "grad_norm": 3.359375, + "grad_norm_var": 0.04273681640625, + "learning_rate": 0.0001, + "loss": 6.1229, + "loss/crossentropy": 2.6818618774414062, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.19020012766122818, + "step": 11288 + }, + { + "epoch": 0.3528125, + "grad_norm": 3.140625, + "grad_norm_var": 0.04355061848958333, + "learning_rate": 0.0001, + "loss": 5.7223, + "loss/crossentropy": 2.535930633544922, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16863670200109482, + "step": 11290 + }, + { + "epoch": 0.352875, + "grad_norm": 3.515625, + "grad_norm_var": 0.05386962890625, + "learning_rate": 0.0001, + "loss": 5.8413, + "loss/crossentropy": 2.5034278631210327, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1818310022354126, + "step": 11292 + }, + { + "epoch": 0.3529375, + "grad_norm": 3.515625, + "grad_norm_var": 0.05308329264322917, + "learning_rate": 0.0001, + "loss": 5.9111, + "loss/crossentropy": 2.5392953157424927, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1813190057873726, + "step": 11294 + }, + { + "epoch": 0.353, + "grad_norm": 3.359375, + "grad_norm_var": 0.05386962890625, + "learning_rate": 0.0001, + "loss": 5.6253, + "loss/crossentropy": 2.42446768283844, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17008384317159653, + "step": 11296 + }, + { + "epoch": 0.3530625, + "grad_norm": 3.0625, + "grad_norm_var": 0.05767822265625, + "learning_rate": 0.0001, + "loss": 5.7541, + "loss/crossentropy": 2.5158231258392334, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17578310519456863, + "step": 11298 + }, + { + "epoch": 0.353125, + "grad_norm": 3.171875, + "grad_norm_var": 0.057417805989583334, + "learning_rate": 0.0001, + "loss": 6.1198, + "loss/crossentropy": 2.7350213527679443, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18262308090925217, + "step": 11300 + }, + { + "epoch": 0.3531875, + "grad_norm": 3.234375, + "grad_norm_var": 0.02818603515625, + "learning_rate": 0.0001, + "loss": 5.8617, + "loss/crossentropy": 2.554205536842346, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17722924798727036, + "step": 11302 + }, + { + "epoch": 0.35325, + "grad_norm": 3.578125, + "grad_norm_var": 0.03394775390625, + "learning_rate": 0.0001, + "loss": 6.1269, + "loss/crossentropy": 2.652440905570984, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19080623984336853, + "step": 11304 + }, + { + "epoch": 0.3533125, + "grad_norm": 3.234375, + "grad_norm_var": 0.0314117431640625, + "learning_rate": 0.0001, + "loss": 5.8889, + "loss/crossentropy": 2.5206410884857178, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1809711903333664, + "step": 11306 + }, + { + "epoch": 0.353375, + "grad_norm": 3.296875, + "grad_norm_var": 0.021610514322916666, + "learning_rate": 0.0001, + "loss": 5.8617, + "loss/crossentropy": 2.5516124963760376, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1786694973707199, + "step": 11308 + }, + { + "epoch": 0.3534375, + "grad_norm": 3.328125, + "grad_norm_var": 0.017545572916666665, + "learning_rate": 0.0001, + "loss": 5.9227, + "loss/crossentropy": 2.5877593755722046, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18154260516166687, + "step": 11310 + }, + { + "epoch": 0.3535, + "grad_norm": 3.25, + "grad_norm_var": 0.0334136962890625, + "learning_rate": 0.0001, + "loss": 6.1391, + "loss/crossentropy": 2.717615008354187, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18902809917926788, + "step": 11312 + }, + { + "epoch": 0.3535625, + "grad_norm": 3.28125, + "grad_norm_var": 0.0310699462890625, + "learning_rate": 0.0001, + "loss": 5.949, + "loss/crossentropy": 2.5918458700180054, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18336959183216095, + "step": 11314 + }, + { + "epoch": 0.353625, + "grad_norm": 3.3125, + "grad_norm_var": 0.029752604166666665, + "learning_rate": 0.0001, + "loss": 5.7579, + "loss/crossentropy": 2.500713586807251, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1753242462873459, + "step": 11316 + }, + { + "epoch": 0.3536875, + "grad_norm": 3.4375, + "grad_norm_var": 0.03313802083333333, + "learning_rate": 0.0001, + "loss": 5.6947, + "loss/crossentropy": 2.4850414991378784, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.170186847448349, + "step": 11318 + }, + { + "epoch": 0.35375, + "grad_norm": 3.703125, + "grad_norm_var": 0.033919270833333334, + "learning_rate": 0.0001, + "loss": 5.7845, + "loss/crossentropy": 2.454130530357361, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17952131479978561, + "step": 11320 + }, + { + "epoch": 0.3538125, + "grad_norm": 3.5625, + "grad_norm_var": 0.04283447265625, + "learning_rate": 0.0001, + "loss": 5.8565, + "loss/crossentropy": 2.5047308206558228, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1820513680577278, + "step": 11322 + }, + { + "epoch": 0.353875, + "grad_norm": 3.0625, + "grad_norm_var": 0.04586181640625, + "learning_rate": 0.0001, + "loss": 5.7061, + "loss/crossentropy": 2.4678107500076294, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17343752086162567, + "step": 11324 + }, + { + "epoch": 0.3539375, + "grad_norm": 3.140625, + "grad_norm_var": 0.04939676920572917, + "learning_rate": 0.0001, + "loss": 5.8189, + "loss/crossentropy": 2.5566846132278442, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1758304387331009, + "step": 11326 + }, + { + "epoch": 0.354, + "grad_norm": 3.34375, + "grad_norm_var": 0.03601786295572917, + "learning_rate": 0.0001, + "loss": 6.1119, + "loss/crossentropy": 2.7197126150131226, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1864854320883751, + "step": 11328 + }, + { + "epoch": 0.3540625, + "grad_norm": 3.296875, + "grad_norm_var": 0.04797770182291667, + "learning_rate": 0.0001, + "loss": 5.9145, + "loss/crossentropy": 2.634163022041321, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17569313943386078, + "step": 11330 + }, + { + "epoch": 0.354125, + "grad_norm": 3.140625, + "grad_norm_var": 0.050146484375, + "learning_rate": 0.0001, + "loss": 5.8728, + "loss/crossentropy": 2.5763657093048096, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17691173404455185, + "step": 11332 + }, + { + "epoch": 0.3541875, + "grad_norm": 3.234375, + "grad_norm_var": 0.04676106770833333, + "learning_rate": 0.0001, + "loss": 6.14, + "loss/crossentropy": 2.7341185808181763, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1866796761751175, + "step": 11334 + }, + { + "epoch": 0.35425, + "grad_norm": 3.234375, + "grad_norm_var": 0.03681233723958333, + "learning_rate": 0.0001, + "loss": 5.9644, + "loss/crossentropy": 2.674552321434021, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17703347653150558, + "step": 11336 + }, + { + "epoch": 0.3543125, + "grad_norm": 3.328125, + "grad_norm_var": 2.8357167561848957, + "learning_rate": 0.0001, + "loss": 5.8163, + "loss/crossentropy": 2.3537384271621704, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18922463059425354, + "step": 11338 + }, + { + "epoch": 0.354375, + "grad_norm": 3.140625, + "grad_norm_var": 2.81578369140625, + "learning_rate": 0.0001, + "loss": 5.97, + "loss/crossentropy": 2.622086763381958, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.18674640357494354, + "step": 11340 + }, + { + "epoch": 0.3544375, + "grad_norm": 3.0625, + "grad_norm_var": 2.811994425455729, + "learning_rate": 0.0001, + "loss": 5.8829, + "loss/crossentropy": 2.6072442531585693, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17404572665691376, + "step": 11342 + }, + { + "epoch": 0.3545, + "grad_norm": 3.765625, + "grad_norm_var": 2.8266886393229167, + "learning_rate": 0.0001, + "loss": 5.7161, + "loss/crossentropy": 2.516643524169922, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17112035304307938, + "step": 11344 + }, + { + "epoch": 0.3545625, + "grad_norm": 3.046875, + "grad_norm_var": 2.8582916259765625, + "learning_rate": 0.0001, + "loss": 5.7841, + "loss/crossentropy": 2.5356485843658447, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17562797665596008, + "step": 11346 + }, + { + "epoch": 0.354625, + "grad_norm": 3.15625, + "grad_norm_var": 2.874381510416667, + "learning_rate": 0.0001, + "loss": 5.5878, + "loss/crossentropy": 2.4020326137542725, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1681831330060959, + "step": 11348 + }, + { + "epoch": 0.3546875, + "grad_norm": 3.453125, + "grad_norm_var": 2.861034138997396, + "learning_rate": 0.0001, + "loss": 5.9719, + "loss/crossentropy": 2.6108627319335938, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1853175312280655, + "step": 11350 + }, + { + "epoch": 0.35475, + "grad_norm": 3.34375, + "grad_norm_var": 2.86441650390625, + "learning_rate": 0.0001, + "loss": 5.8816, + "loss/crossentropy": 2.545454263687134, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18048831820487976, + "step": 11352 + }, + { + "epoch": 0.3548125, + "grad_norm": 3.828125, + "grad_norm_var": 0.07869364420572916, + "learning_rate": 0.0001, + "loss": 6.3151, + "loss/crossentropy": 2.843619704246521, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19011834263801575, + "step": 11354 + }, + { + "epoch": 0.354875, + "grad_norm": 3.203125, + "grad_norm_var": 0.05623270670572917, + "learning_rate": 0.0001, + "loss": 5.6727, + "loss/crossentropy": 2.3813884258270264, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17522287368774414, + "step": 11356 + }, + { + "epoch": 0.3549375, + "grad_norm": 3.296875, + "grad_norm_var": 0.0600494384765625, + "learning_rate": 0.0001, + "loss": 5.7162, + "loss/crossentropy": 2.4047796726226807, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1768423169851303, + "step": 11358 + }, + { + "epoch": 0.355, + "grad_norm": 3.53125, + "grad_norm_var": 0.0487945556640625, + "learning_rate": 0.0001, + "loss": 5.6889, + "loss/crossentropy": 2.43475878238678, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17385368049144745, + "step": 11360 + }, + { + "epoch": 0.3550625, + "grad_norm": 3.28125, + "grad_norm_var": 0.04837239583333333, + "learning_rate": 0.0001, + "loss": 5.7149, + "loss/crossentropy": 2.4121944904327393, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17792222648859024, + "step": 11362 + }, + { + "epoch": 0.355125, + "grad_norm": 3.375, + "grad_norm_var": 0.038309733072916664, + "learning_rate": 0.0001, + "loss": 5.8767, + "loss/crossentropy": 2.5436421632766724, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1762741208076477, + "step": 11364 + }, + { + "epoch": 0.3551875, + "grad_norm": 3.5625, + "grad_norm_var": 0.44999593098958335, + "learning_rate": 0.0001, + "loss": 6.2411, + "loss/crossentropy": 2.7405037879943848, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.19888553023338318, + "step": 11366 + }, + { + "epoch": 0.35525, + "grad_norm": 3.234375, + "grad_norm_var": 0.46381734212239584, + "learning_rate": 0.0001, + "loss": 5.6813, + "loss/crossentropy": 2.51580548286438, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16928555816411972, + "step": 11368 + }, + { + "epoch": 0.3553125, + "grad_norm": 3.421875, + "grad_norm_var": 0.45828348795572915, + "learning_rate": 0.0001, + "loss": 5.8859, + "loss/crossentropy": 2.5942893028259277, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17525933682918549, + "step": 11370 + }, + { + "epoch": 0.355375, + "grad_norm": 3.25, + "grad_norm_var": 0.46943359375, + "learning_rate": 0.0001, + "loss": 5.867, + "loss/crossentropy": 2.6768126487731934, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17019033432006836, + "step": 11372 + }, + { + "epoch": 0.3554375, + "grad_norm": 3.0625, + "grad_norm_var": 0.47509765625, + "learning_rate": 0.0001, + "loss": 5.6296, + "loss/crossentropy": 2.4368897676467896, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16771049797534943, + "step": 11374 + }, + { + "epoch": 0.3555, + "grad_norm": 3.515625, + "grad_norm_var": 0.47421875, + "learning_rate": 0.0001, + "loss": 5.7442, + "loss/crossentropy": 2.4130361080169678, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1745217740535736, + "step": 11376 + }, + { + "epoch": 0.3555625, + "grad_norm": 3.453125, + "grad_norm_var": 0.4614410400390625, + "learning_rate": 0.0001, + "loss": 5.7711, + "loss/crossentropy": 2.423428177833557, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17969273030757904, + "step": 11378 + }, + { + "epoch": 0.355625, + "grad_norm": 3.796875, + "grad_norm_var": 0.6179758707682291, + "learning_rate": 0.0001, + "loss": 5.9717, + "loss/crossentropy": 2.619338870048523, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17977125942707062, + "step": 11380 + }, + { + "epoch": 0.3556875, + "grad_norm": 3.203125, + "grad_norm_var": 0.24495442708333334, + "learning_rate": 0.0001, + "loss": 6.052, + "loss/crossentropy": 2.606391429901123, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18830808997154236, + "step": 11382 + }, + { + "epoch": 0.35575, + "grad_norm": 3.21875, + "grad_norm_var": 0.2530426025390625, + "learning_rate": 0.0001, + "loss": 5.6287, + "loss/crossentropy": 2.441736340522766, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1694810837507248, + "step": 11384 + }, + { + "epoch": 0.3558125, + "grad_norm": 3.4375, + "grad_norm_var": 0.27371317545572915, + "learning_rate": 0.0001, + "loss": 5.8177, + "loss/crossentropy": 2.563707709312439, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17344237864017487, + "step": 11386 + }, + { + "epoch": 0.355875, + "grad_norm": 3.15625, + "grad_norm_var": 0.2701334635416667, + "learning_rate": 0.0001, + "loss": 5.8076, + "loss/crossentropy": 2.5102453231811523, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1742628961801529, + "step": 11388 + }, + { + "epoch": 0.3559375, + "grad_norm": 3.390625, + "grad_norm_var": 0.26374409993489584, + "learning_rate": 0.0001, + "loss": 5.8307, + "loss/crossentropy": 2.5091181993484497, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17981696128845215, + "step": 11390 + }, + { + "epoch": 0.356, + "grad_norm": 3.78125, + "grad_norm_var": 0.2713704427083333, + "learning_rate": 0.0001, + "loss": 5.6833, + "loss/crossentropy": 2.3582528829574585, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17821084707975388, + "step": 11392 + }, + { + "epoch": 0.3560625, + "grad_norm": 3.640625, + "grad_norm_var": 0.27932840983072915, + "learning_rate": 0.0001, + "loss": 5.9741, + "loss/crossentropy": 2.55366849899292, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18618155270814896, + "step": 11394 + }, + { + "epoch": 0.356125, + "grad_norm": 3.03125, + "grad_norm_var": 0.07580464680989583, + "learning_rate": 0.0001, + "loss": 5.8154, + "loss/crossentropy": 2.4842296838760376, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18037919700145721, + "step": 11396 + }, + { + "epoch": 0.3561875, + "grad_norm": 3.4375, + "grad_norm_var": 0.0642242431640625, + "learning_rate": 0.0001, + "loss": 6.0091, + "loss/crossentropy": 2.615309715270996, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1885964423418045, + "step": 11398 + }, + { + "epoch": 0.35625, + "grad_norm": 3.25, + "grad_norm_var": 0.058568318684895836, + "learning_rate": 0.0001, + "loss": 5.5899, + "loss/crossentropy": 2.429373621940613, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16722938418388367, + "step": 11400 + }, + { + "epoch": 0.3563125, + "grad_norm": 4.5625, + "grad_norm_var": 0.14453125, + "learning_rate": 0.0001, + "loss": 6.0887, + "loss/crossentropy": 2.6522120237350464, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18935363739728928, + "step": 11402 + }, + { + "epoch": 0.356375, + "grad_norm": 3.40625, + "grad_norm_var": 0.13702799479166666, + "learning_rate": 0.0001, + "loss": 6.0122, + "loss/crossentropy": 2.635712504386902, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18452216684818268, + "step": 11404 + }, + { + "epoch": 0.3564375, + "grad_norm": 3.25, + "grad_norm_var": 0.13648681640625, + "learning_rate": 0.0001, + "loss": 5.8178, + "loss/crossentropy": 2.485180974006653, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1777970865368843, + "step": 11406 + }, + { + "epoch": 0.3565, + "grad_norm": 3.3125, + "grad_norm_var": 0.12939453125, + "learning_rate": 0.0001, + "loss": 5.8076, + "loss/crossentropy": 2.5296874046325684, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17544680833816528, + "step": 11408 + }, + { + "epoch": 0.3565625, + "grad_norm": 3.21875, + "grad_norm_var": 0.12379150390625, + "learning_rate": 0.0001, + "loss": 5.9481, + "loss/crossentropy": 2.6227307319641113, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1809762418270111, + "step": 11410 + }, + { + "epoch": 0.356625, + "grad_norm": 3.0625, + "grad_norm_var": 0.1189849853515625, + "learning_rate": 0.0001, + "loss": 5.8063, + "loss/crossentropy": 2.5536731481552124, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17096972465515137, + "step": 11412 + }, + { + "epoch": 0.3566875, + "grad_norm": 3.40625, + "grad_norm_var": 0.1249908447265625, + "learning_rate": 0.0001, + "loss": 6.0507, + "loss/crossentropy": 2.648218274116516, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18946433812379837, + "step": 11414 + }, + { + "epoch": 0.35675, + "grad_norm": 3.078125, + "grad_norm_var": 0.1256744384765625, + "learning_rate": 0.0001, + "loss": 5.9262, + "loss/crossentropy": 2.59994637966156, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17950069904327393, + "step": 11416 + }, + { + "epoch": 0.3568125, + "grad_norm": 3.296875, + "grad_norm_var": 0.022294108072916666, + "learning_rate": 0.0001, + "loss": 6.0349, + "loss/crossentropy": 2.742589235305786, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1776665523648262, + "step": 11418 + }, + { + "epoch": 0.356875, + "grad_norm": 3.15625, + "grad_norm_var": 0.020685831705729168, + "learning_rate": 0.0001, + "loss": 5.7908, + "loss/crossentropy": 2.485624313354492, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1750485599040985, + "step": 11420 + }, + { + "epoch": 0.3569375, + "grad_norm": 3.234375, + "grad_norm_var": 0.0204498291015625, + "learning_rate": 0.0001, + "loss": 5.9228, + "loss/crossentropy": 2.6894630193710327, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.172164686024189, + "step": 11422 + }, + { + "epoch": 0.357, + "grad_norm": 3.171875, + "grad_norm_var": 0.02359619140625, + "learning_rate": 0.0001, + "loss": 6.0369, + "loss/crossentropy": 2.658891439437866, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18428927659988403, + "step": 11424 + }, + { + "epoch": 0.3570625, + "grad_norm": 3.4375, + "grad_norm_var": 0.0354400634765625, + "learning_rate": 0.0001, + "loss": 5.8742, + "loss/crossentropy": 2.530714273452759, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18005235493183136, + "step": 11426 + }, + { + "epoch": 0.357125, + "grad_norm": 3.40625, + "grad_norm_var": 0.037890625, + "learning_rate": 0.0001, + "loss": 6.1503, + "loss/crossentropy": 2.7310314178466797, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1864599734544754, + "step": 11428 + }, + { + "epoch": 0.3571875, + "grad_norm": 3.0625, + "grad_norm_var": 0.0415679931640625, + "learning_rate": 0.0001, + "loss": 5.6779, + "loss/crossentropy": 2.4858238697052, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1688140332698822, + "step": 11430 + }, + { + "epoch": 0.35725, + "grad_norm": 3.140625, + "grad_norm_var": 0.040192667643229166, + "learning_rate": 0.0001, + "loss": 5.7823, + "loss/crossentropy": 2.5231053829193115, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17475000023841858, + "step": 11432 + }, + { + "epoch": 0.3573125, + "grad_norm": 3.1875, + "grad_norm_var": 0.041890462239583336, + "learning_rate": 0.0001, + "loss": 5.9383, + "loss/crossentropy": 2.6606061458587646, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.175423301756382, + "step": 11434 + }, + { + "epoch": 0.357375, + "grad_norm": 3.734375, + "grad_norm_var": 0.05413411458333333, + "learning_rate": 0.0001, + "loss": 5.7, + "loss/crossentropy": 2.3909060955047607, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17543992400169373, + "step": 11436 + }, + { + "epoch": 0.3574375, + "grad_norm": 3.0, + "grad_norm_var": 0.06560872395833334, + "learning_rate": 0.0001, + "loss": 5.6324, + "loss/crossentropy": 2.520796775817871, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16311749815940857, + "step": 11438 + }, + { + "epoch": 0.3575, + "grad_norm": 3.28125, + "grad_norm_var": 0.06467692057291667, + "learning_rate": 0.0001, + "loss": 5.521, + "loss/crossentropy": 2.355831027030945, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16808204352855682, + "step": 11440 + }, + { + "epoch": 0.3575625, + "grad_norm": 3.53125, + "grad_norm_var": 0.05696512858072917, + "learning_rate": 0.0001, + "loss": 5.8167, + "loss/crossentropy": 2.5602900981903076, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17251737415790558, + "step": 11442 + }, + { + "epoch": 0.357625, + "grad_norm": 3.46875, + "grad_norm_var": 0.0509765625, + "learning_rate": 0.0001, + "loss": 5.8388, + "loss/crossentropy": 2.6111416816711426, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17237364500761032, + "step": 11444 + }, + { + "epoch": 0.3576875, + "grad_norm": 3.609375, + "grad_norm_var": 0.0578765869140625, + "learning_rate": 0.0001, + "loss": 6.075, + "loss/crossentropy": 2.681327700614929, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18428415060043335, + "step": 11446 + }, + { + "epoch": 0.35775, + "grad_norm": 3.09375, + "grad_norm_var": 0.0592193603515625, + "learning_rate": 0.0001, + "loss": 5.8754, + "loss/crossentropy": 2.6480273008346558, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1723475679755211, + "step": 11448 + }, + { + "epoch": 0.3578125, + "grad_norm": 3.375, + "grad_norm_var": 0.055501302083333336, + "learning_rate": 0.0001, + "loss": 6.0198, + "loss/crossentropy": 2.7274798154830933, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1784529834985733, + "step": 11450 + }, + { + "epoch": 0.357875, + "grad_norm": 3.484375, + "grad_norm_var": 0.04488525390625, + "learning_rate": 0.0001, + "loss": 6.2585, + "loss/crossentropy": 2.8555736541748047, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18521306663751602, + "step": 11452 + }, + { + "epoch": 0.3579375, + "grad_norm": 3.203125, + "grad_norm_var": 0.03599853515625, + "learning_rate": 0.0001, + "loss": 5.8051, + "loss/crossentropy": 2.4731528759002686, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1800696775317192, + "step": 11454 + }, + { + "epoch": 0.358, + "grad_norm": 3.4375, + "grad_norm_var": 0.0372467041015625, + "learning_rate": 0.0001, + "loss": 6.0278, + "loss/crossentropy": 2.633829951286316, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18431830406188965, + "step": 11456 + }, + { + "epoch": 0.3580625, + "grad_norm": 3.375, + "grad_norm_var": 0.032450358072916664, + "learning_rate": 0.0001, + "loss": 6.1159, + "loss/crossentropy": 2.7020633220672607, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1851382553577423, + "step": 11458 + }, + { + "epoch": 0.358125, + "grad_norm": 3.046875, + "grad_norm_var": 0.029523722330729165, + "learning_rate": 0.0001, + "loss": 5.5757, + "loss/crossentropy": 2.4457303285598755, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1661197990179062, + "step": 11460 + }, + { + "epoch": 0.3581875, + "grad_norm": 3.328125, + "grad_norm_var": 0.021923828125, + "learning_rate": 0.0001, + "loss": 6.0214, + "loss/crossentropy": 2.771443009376526, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17265425622463226, + "step": 11462 + }, + { + "epoch": 0.35825, + "grad_norm": 3.15625, + "grad_norm_var": 0.019331868489583334, + "learning_rate": 0.0001, + "loss": 5.6269, + "loss/crossentropy": 2.448089599609375, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16787806153297424, + "step": 11464 + }, + { + "epoch": 0.3583125, + "grad_norm": 3.71875, + "grad_norm_var": 0.035481770833333336, + "learning_rate": 0.0001, + "loss": 5.9265, + "loss/crossentropy": 2.513920307159424, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18305665999650955, + "step": 11466 + }, + { + "epoch": 0.358375, + "grad_norm": 3.578125, + "grad_norm_var": 0.333203125, + "learning_rate": 0.0001, + "loss": 5.9521, + "loss/crossentropy": 2.5828335285186768, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18028418719768524, + "step": 11468 + }, + { + "epoch": 0.3584375, + "grad_norm": 3.46875, + "grad_norm_var": 0.33818359375, + "learning_rate": 0.0001, + "loss": 5.839, + "loss/crossentropy": 2.5756973028182983, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17281711846590042, + "step": 11470 + }, + { + "epoch": 0.3585, + "grad_norm": 3.46875, + "grad_norm_var": 0.32499593098958335, + "learning_rate": 0.0001, + "loss": 5.5674, + "loss/crossentropy": 2.346805453300476, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1712806150317192, + "step": 11472 + }, + { + "epoch": 0.3585625, + "grad_norm": 3.890625, + "grad_norm_var": 0.33482666015625, + "learning_rate": 0.0001, + "loss": 6.0567, + "loss/crossentropy": 2.6429061889648438, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1859058141708374, + "step": 11474 + }, + { + "epoch": 0.358625, + "grad_norm": 3.0625, + "grad_norm_var": 0.3302154541015625, + "learning_rate": 0.0001, + "loss": 5.918, + "loss/crossentropy": 2.584723472595215, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18176878988742828, + "step": 11476 + }, + { + "epoch": 0.3586875, + "grad_norm": 3.328125, + "grad_norm_var": 0.322705078125, + "learning_rate": 0.0001, + "loss": 6.003, + "loss/crossentropy": 2.609718918800354, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1862044632434845, + "step": 11478 + }, + { + "epoch": 0.35875, + "grad_norm": 3.375, + "grad_norm_var": 0.30917561848958336, + "learning_rate": 0.0001, + "loss": 6.4142, + "loss/crossentropy": 2.867684483528137, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.1933259814977646, + "step": 11480 + }, + { + "epoch": 0.3588125, + "grad_norm": 3.21875, + "grad_norm_var": 0.330126953125, + "learning_rate": 0.0001, + "loss": 5.7085, + "loss/crossentropy": 2.4196025133132935, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17732993513345718, + "step": 11482 + }, + { + "epoch": 0.358875, + "grad_norm": 3.375, + "grad_norm_var": 0.06323140462239583, + "learning_rate": 0.0001, + "loss": 5.9215, + "loss/crossentropy": 2.6263914108276367, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17716874182224274, + "step": 11484 + }, + { + "epoch": 0.3589375, + "grad_norm": 3.578125, + "grad_norm_var": 0.05761311848958333, + "learning_rate": 0.0001, + "loss": 5.6976, + "loss/crossentropy": 2.416483521461487, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17655231058597565, + "step": 11486 + }, + { + "epoch": 0.359, + "grad_norm": 7.46875, + "grad_norm_var": 1.0815714518229167, + "learning_rate": 0.0001, + "loss": 6.3502, + "loss/crossentropy": 2.7684093713760376, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.1987995207309723, + "step": 11488 + }, + { + "epoch": 0.3590625, + "grad_norm": 3.65625, + "grad_norm_var": 1.0758453369140626, + "learning_rate": 0.0001, + "loss": 6.0211, + "loss/crossentropy": 2.5561872720718384, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1890682429075241, + "step": 11490 + }, + { + "epoch": 0.359125, + "grad_norm": 3.375, + "grad_norm_var": 1.0589752197265625, + "learning_rate": 0.0001, + "loss": 5.9378, + "loss/crossentropy": 2.5617023706436157, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18253500759601593, + "step": 11492 + }, + { + "epoch": 0.3591875, + "grad_norm": 3.328125, + "grad_norm_var": 1.06138916015625, + "learning_rate": 0.0001, + "loss": 6.02, + "loss/crossentropy": 2.614338517189026, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18744519352912903, + "step": 11494 + }, + { + "epoch": 0.35925, + "grad_norm": 3.09375, + "grad_norm_var": 1.0776763916015626, + "learning_rate": 0.0001, + "loss": 6.0142, + "loss/crossentropy": 2.7154823541641235, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17830651253461838, + "step": 11496 + }, + { + "epoch": 0.3593125, + "grad_norm": 3.6875, + "grad_norm_var": 1.07730712890625, + "learning_rate": 0.0001, + "loss": 5.9975, + "loss/crossentropy": 2.6306092739105225, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18395738303661346, + "step": 11498 + }, + { + "epoch": 0.359375, + "grad_norm": 3.265625, + "grad_norm_var": 1.0700510660807292, + "learning_rate": 0.0001, + "loss": 5.5256, + "loss/crossentropy": 2.3245222568511963, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16854803264141083, + "step": 11500 + }, + { + "epoch": 0.3594375, + "grad_norm": 3.28125, + "grad_norm_var": 1.07838134765625, + "learning_rate": 0.0001, + "loss": 5.9712, + "loss/crossentropy": 2.6609901189804077, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1724277064204216, + "step": 11502 + }, + { + "epoch": 0.3595, + "grad_norm": 3.15625, + "grad_norm_var": 0.0346343994140625, + "learning_rate": 0.0001, + "loss": 5.9798, + "loss/crossentropy": 2.6193424463272095, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18292488902807236, + "step": 11504 + }, + { + "epoch": 0.3595625, + "grad_norm": 3.65625, + "grad_norm_var": 0.03511962890625, + "learning_rate": 0.0001, + "loss": 6.0716, + "loss/crossentropy": 2.686578392982483, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18185945600271225, + "step": 11506 + }, + { + "epoch": 0.359625, + "grad_norm": 3.09375, + "grad_norm_var": 0.0396881103515625, + "learning_rate": 0.0001, + "loss": 5.779, + "loss/crossentropy": 2.5633209943771362, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17195884883403778, + "step": 11508 + }, + { + "epoch": 0.3596875, + "grad_norm": 3.28125, + "grad_norm_var": 0.03990478515625, + "learning_rate": 0.0001, + "loss": 6.0202, + "loss/crossentropy": 2.6731693744659424, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18157903850078583, + "step": 11510 + }, + { + "epoch": 0.35975, + "grad_norm": 3.109375, + "grad_norm_var": 0.04692281087239583, + "learning_rate": 0.0001, + "loss": 5.5234, + "loss/crossentropy": 2.385671377182007, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16103439033031464, + "step": 11512 + }, + { + "epoch": 0.3598125, + "grad_norm": 3.1875, + "grad_norm_var": 0.032201131184895836, + "learning_rate": 0.0001, + "loss": 6.1492, + "loss/crossentropy": 2.7553439140319824, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18274714052677155, + "step": 11514 + }, + { + "epoch": 0.359875, + "grad_norm": 3.390625, + "grad_norm_var": 0.03298238118489583, + "learning_rate": 0.0001, + "loss": 5.9644, + "loss/crossentropy": 2.6135441064834595, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17961689829826355, + "step": 11516 + }, + { + "epoch": 0.3599375, + "grad_norm": 3.015625, + "grad_norm_var": 0.03629150390625, + "learning_rate": 0.0001, + "loss": 5.5116, + "loss/crossentropy": 2.3820388317108154, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16452037543058395, + "step": 11518 + }, + { + "epoch": 0.36, + "grad_norm": 3.140625, + "grad_norm_var": 0.040827433268229164, + "learning_rate": 0.0001, + "loss": 5.9463, + "loss/crossentropy": 2.649800658226013, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.178085595369339, + "step": 11520 + }, + { + "epoch": 0.3600625, + "grad_norm": 3.578125, + "grad_norm_var": 0.03927408854166667, + "learning_rate": 0.0001, + "loss": 6.2177, + "loss/crossentropy": 2.772137999534607, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.19104167073965073, + "step": 11522 + }, + { + "epoch": 0.360125, + "grad_norm": 3.140625, + "grad_norm_var": 0.037821451822916664, + "learning_rate": 0.0001, + "loss": 6.0252, + "loss/crossentropy": 2.7171050310134888, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18002324551343918, + "step": 11524 + }, + { + "epoch": 0.3601875, + "grad_norm": 3.234375, + "grad_norm_var": 0.03765869140625, + "learning_rate": 0.0001, + "loss": 6.0706, + "loss/crossentropy": 2.757015585899353, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.18174485862255096, + "step": 11526 + }, + { + "epoch": 0.36025, + "grad_norm": 3.203125, + "grad_norm_var": 0.03736572265625, + "learning_rate": 0.0001, + "loss": 5.4559, + "loss/crossentropy": 2.3074045181274414, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16524454951286316, + "step": 11528 + }, + { + "epoch": 0.3603125, + "grad_norm": 3.40625, + "grad_norm_var": 0.03972066243489583, + "learning_rate": 0.0001, + "loss": 5.7828, + "loss/crossentropy": 2.4054633378982544, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18382655084133148, + "step": 11530 + }, + { + "epoch": 0.360375, + "grad_norm": 3.296875, + "grad_norm_var": 0.03899332682291667, + "learning_rate": 0.0001, + "loss": 5.729, + "loss/crossentropy": 2.433566451072693, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1756378412246704, + "step": 11532 + }, + { + "epoch": 0.3604375, + "grad_norm": 3.203125, + "grad_norm_var": 0.03413798014322917, + "learning_rate": 0.0001, + "loss": 6.1159, + "loss/crossentropy": 2.7514678239822388, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1833144798874855, + "step": 11534 + }, + { + "epoch": 0.3605, + "grad_norm": 3.21875, + "grad_norm_var": 0.0281158447265625, + "learning_rate": 0.0001, + "loss": 6.1491, + "loss/crossentropy": 2.819941759109497, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1821325495839119, + "step": 11536 + }, + { + "epoch": 0.3605625, + "grad_norm": 3.234375, + "grad_norm_var": 0.053742472330729166, + "learning_rate": 0.0001, + "loss": 5.5643, + "loss/crossentropy": 2.311951160430908, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1717149093747139, + "step": 11538 + }, + { + "epoch": 0.360625, + "grad_norm": 3.03125, + "grad_norm_var": 0.061644490559895834, + "learning_rate": 0.0001, + "loss": 5.3732, + "loss/crossentropy": 2.313949942588806, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1574876829981804, + "step": 11540 + }, + { + "epoch": 0.3606875, + "grad_norm": 3.25, + "grad_norm_var": 0.0590240478515625, + "learning_rate": 0.0001, + "loss": 5.8614, + "loss/crossentropy": 2.5384668111801147, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1783868819475174, + "step": 11542 + }, + { + "epoch": 0.36075, + "grad_norm": 3.015625, + "grad_norm_var": 0.059000651041666664, + "learning_rate": 0.0001, + "loss": 5.7281, + "loss/crossentropy": 2.412795901298523, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.18231303244829178, + "step": 11544 + }, + { + "epoch": 0.3608125, + "grad_norm": 3.75, + "grad_norm_var": 0.08050130208333334, + "learning_rate": 0.0001, + "loss": 6.1654, + "loss/crossentropy": 2.6934750080108643, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18977203965187073, + "step": 11546 + }, + { + "epoch": 0.360875, + "grad_norm": 3.390625, + "grad_norm_var": 0.08046468098958333, + "learning_rate": 0.0001, + "loss": 6.0536, + "loss/crossentropy": 2.67894446849823, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18395385891199112, + "step": 11548 + }, + { + "epoch": 0.3609375, + "grad_norm": 3.390625, + "grad_norm_var": 0.12551167805989583, + "learning_rate": 0.0001, + "loss": 5.9916, + "loss/crossentropy": 2.415665626525879, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.20173435658216476, + "step": 11550 + }, + { + "epoch": 0.361, + "grad_norm": 3.375, + "grad_norm_var": 0.11824442545572916, + "learning_rate": 0.0001, + "loss": 5.992, + "loss/crossentropy": 2.6645971536636353, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18039774894714355, + "step": 11552 + }, + { + "epoch": 0.3610625, + "grad_norm": 3.1875, + "grad_norm_var": 0.0986724853515625, + "learning_rate": 0.0001, + "loss": 6.0679, + "loss/crossentropy": 2.702301025390625, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18304389715194702, + "step": 11554 + }, + { + "epoch": 0.361125, + "grad_norm": 3.484375, + "grad_norm_var": 0.07206624348958333, + "learning_rate": 0.0001, + "loss": 6.174, + "loss/crossentropy": 2.7713425159454346, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18792008608579636, + "step": 11556 + }, + { + "epoch": 0.3611875, + "grad_norm": 3.734375, + "grad_norm_var": 0.07440999348958334, + "learning_rate": 0.0001, + "loss": 5.9285, + "loss/crossentropy": 2.554310441017151, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18585404008626938, + "step": 11558 + }, + { + "epoch": 0.36125, + "grad_norm": 3.40625, + "grad_norm_var": 0.06679280598958333, + "learning_rate": 0.0001, + "loss": 6.3119, + "loss/crossentropy": 2.803977608680725, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19298001378774643, + "step": 11560 + }, + { + "epoch": 0.3613125, + "grad_norm": 3.203125, + "grad_norm_var": 0.06648661295572916, + "learning_rate": 0.0001, + "loss": 5.8028, + "loss/crossentropy": 2.4768543243408203, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17946702241897583, + "step": 11562 + }, + { + "epoch": 0.361375, + "grad_norm": 3.0625, + "grad_norm_var": 0.07428385416666666, + "learning_rate": 0.0001, + "loss": 5.909, + "loss/crossentropy": 2.573415517807007, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1796480119228363, + "step": 11564 + }, + { + "epoch": 0.3614375, + "grad_norm": 3.421875, + "grad_norm_var": 0.04191792805989583, + "learning_rate": 0.0001, + "loss": 6.1032, + "loss/crossentropy": 2.708019971847534, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18404622375965118, + "step": 11566 + }, + { + "epoch": 0.3615, + "grad_norm": 3.34375, + "grad_norm_var": 0.04479166666666667, + "learning_rate": 0.0001, + "loss": 5.5103, + "loss/crossentropy": 2.3572477102279663, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16803553700447083, + "step": 11568 + }, + { + "epoch": 0.3615625, + "grad_norm": 3.421875, + "grad_norm_var": 0.044234212239583334, + "learning_rate": 0.0001, + "loss": 5.8691, + "loss/crossentropy": 2.554656982421875, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.18222244083881378, + "step": 11570 + }, + { + "epoch": 0.361625, + "grad_norm": 3.234375, + "grad_norm_var": 0.04781901041666667, + "learning_rate": 0.0001, + "loss": 5.8138, + "loss/crossentropy": 2.56459379196167, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1741422712802887, + "step": 11572 + }, + { + "epoch": 0.3616875, + "grad_norm": 3.046875, + "grad_norm_var": 0.04560546875, + "learning_rate": 0.0001, + "loss": 5.7767, + "loss/crossentropy": 2.5425853729248047, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1753685027360916, + "step": 11574 + }, + { + "epoch": 0.36175, + "grad_norm": 3.15625, + "grad_norm_var": 0.028831990559895833, + "learning_rate": 0.0001, + "loss": 5.6042, + "loss/crossentropy": 2.3334556818008423, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1731710433959961, + "step": 11576 + }, + { + "epoch": 0.3618125, + "grad_norm": 3.421875, + "grad_norm_var": 0.028678385416666667, + "learning_rate": 0.0001, + "loss": 6.0177, + "loss/crossentropy": 2.638232946395874, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1813022494316101, + "step": 11578 + }, + { + "epoch": 0.361875, + "grad_norm": 2.96875, + "grad_norm_var": 0.03369140625, + "learning_rate": 0.0001, + "loss": 5.3485, + "loss/crossentropy": 2.2655311822891235, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.15712259709835052, + "step": 11580 + }, + { + "epoch": 0.3619375, + "grad_norm": 3.375, + "grad_norm_var": 0.0190582275390625, + "learning_rate": 0.0001, + "loss": 6.0837, + "loss/crossentropy": 2.672060251235962, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18803488463163376, + "step": 11582 + }, + { + "epoch": 0.362, + "grad_norm": 2.984375, + "grad_norm_var": 0.0309722900390625, + "learning_rate": 0.0001, + "loss": 5.4147, + "loss/crossentropy": 2.373745918273926, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.15409798175096512, + "step": 11584 + }, + { + "epoch": 0.3620625, + "grad_norm": 3.421875, + "grad_norm_var": 0.031050618489583334, + "learning_rate": 0.0001, + "loss": 6.0394, + "loss/crossentropy": 2.607384204864502, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.19164405763149261, + "step": 11586 + }, + { + "epoch": 0.362125, + "grad_norm": 3.375, + "grad_norm_var": 0.034601847330729164, + "learning_rate": 0.0001, + "loss": 5.8228, + "loss/crossentropy": 2.6524970531463623, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16819876432418823, + "step": 11588 + }, + { + "epoch": 0.3621875, + "grad_norm": 3.078125, + "grad_norm_var": 0.0370758056640625, + "learning_rate": 0.0001, + "loss": 5.7826, + "loss/crossentropy": 2.526271104812622, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1760205775499344, + "step": 11590 + }, + { + "epoch": 0.36225, + "grad_norm": 3.1875, + "grad_norm_var": 0.04140523274739583, + "learning_rate": 0.0001, + "loss": 5.73, + "loss/crossentropy": 2.4259228706359863, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.174160897731781, + "step": 11592 + }, + { + "epoch": 0.3623125, + "grad_norm": 3.28125, + "grad_norm_var": 0.03748372395833333, + "learning_rate": 0.0001, + "loss": 5.8936, + "loss/crossentropy": 2.581678628921509, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.17494133859872818, + "step": 11594 + }, + { + "epoch": 0.362375, + "grad_norm": 3.390625, + "grad_norm_var": 0.043553670247395836, + "learning_rate": 0.0001, + "loss": 5.814, + "loss/crossentropy": 2.466444492340088, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1824108511209488, + "step": 11596 + }, + { + "epoch": 0.3624375, + "grad_norm": 3.78125, + "grad_norm_var": 0.0593902587890625, + "learning_rate": 0.0001, + "loss": 5.8607, + "loss/crossentropy": 2.518047571182251, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.181922048330307, + "step": 11598 + }, + { + "epoch": 0.3625, + "grad_norm": 3.375, + "grad_norm_var": 0.034228515625, + "learning_rate": 0.0001, + "loss": 5.5955, + "loss/crossentropy": 2.3988256454467773, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17396844923496246, + "step": 11600 + }, + { + "epoch": 0.3625625, + "grad_norm": 3.203125, + "grad_norm_var": 0.0350250244140625, + "learning_rate": 0.0001, + "loss": 5.93, + "loss/crossentropy": 2.633981227874756, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17647858709096909, + "step": 11602 + }, + { + "epoch": 0.362625, + "grad_norm": 3.1875, + "grad_norm_var": 0.042578125, + "learning_rate": 0.0001, + "loss": 5.5269, + "loss/crossentropy": 2.3514821529388428, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17027592658996582, + "step": 11604 + }, + { + "epoch": 0.3626875, + "grad_norm": 3.578125, + "grad_norm_var": 0.04189351399739583, + "learning_rate": 0.0001, + "loss": 5.5546, + "loss/crossentropy": 2.203986644744873, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1788102239370346, + "step": 11606 + }, + { + "epoch": 0.36275, + "grad_norm": 3.609375, + "grad_norm_var": 0.05056050618489583, + "learning_rate": 0.0001, + "loss": 5.7688, + "loss/crossentropy": 2.4843231439590454, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17649301886558533, + "step": 11608 + }, + { + "epoch": 0.3628125, + "grad_norm": 3.078125, + "grad_norm_var": 0.059260050455729164, + "learning_rate": 0.0001, + "loss": 6.075, + "loss/crossentropy": 2.6833078861236572, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1864389181137085, + "step": 11610 + }, + { + "epoch": 0.362875, + "grad_norm": 3.4375, + "grad_norm_var": 0.05190327962239583, + "learning_rate": 0.0001, + "loss": 6.0866, + "loss/crossentropy": 2.5845061540603638, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.19200821965932846, + "step": 11612 + }, + { + "epoch": 0.3629375, + "grad_norm": 3.171875, + "grad_norm_var": 0.03876546223958333, + "learning_rate": 0.0001, + "loss": 5.9458, + "loss/crossentropy": 2.663218140602112, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1790422722697258, + "step": 11614 + }, + { + "epoch": 0.363, + "grad_norm": 3.359375, + "grad_norm_var": 0.03864644368489583, + "learning_rate": 0.0001, + "loss": 6.1841, + "loss/crossentropy": 2.820657968521118, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18126174062490463, + "step": 11616 + }, + { + "epoch": 0.3630625, + "grad_norm": 3.375, + "grad_norm_var": 0.041764322916666666, + "learning_rate": 0.0001, + "loss": 5.7947, + "loss/crossentropy": 2.549742341041565, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17098423838615417, + "step": 11618 + }, + { + "epoch": 0.363125, + "grad_norm": 3.546875, + "grad_norm_var": 0.04560445149739583, + "learning_rate": 0.0001, + "loss": 5.6737, + "loss/crossentropy": 2.3581948280334473, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17491018027067184, + "step": 11620 + }, + { + "epoch": 0.3631875, + "grad_norm": 3.296875, + "grad_norm_var": 0.045210774739583334, + "learning_rate": 0.0001, + "loss": 5.8093, + "loss/crossentropy": 2.5261915922164917, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1783144325017929, + "step": 11622 + }, + { + "epoch": 0.36325, + "grad_norm": 3.25, + "grad_norm_var": 0.0349273681640625, + "learning_rate": 0.0001, + "loss": 5.8163, + "loss/crossentropy": 2.544545292854309, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17678914219141006, + "step": 11624 + }, + { + "epoch": 0.3633125, + "grad_norm": 3.234375, + "grad_norm_var": 0.026252237955729167, + "learning_rate": 0.0001, + "loss": 5.7587, + "loss/crossentropy": 2.469446539878845, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.17267972975969315, + "step": 11626 + }, + { + "epoch": 0.363375, + "grad_norm": 3.640625, + "grad_norm_var": 0.03076171875, + "learning_rate": 0.0001, + "loss": 6.3727, + "loss/crossentropy": 2.7683225870132446, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.1991046443581581, + "step": 11628 + }, + { + "epoch": 0.3634375, + "grad_norm": 3.5, + "grad_norm_var": 0.028251139322916667, + "learning_rate": 0.0001, + "loss": 6.1533, + "loss/crossentropy": 2.645849823951721, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19449244439601898, + "step": 11630 + }, + { + "epoch": 0.3635, + "grad_norm": 3.1875, + "grad_norm_var": 0.031571451822916666, + "learning_rate": 0.0001, + "loss": 5.7111, + "loss/crossentropy": 2.488775372505188, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17223234474658966, + "step": 11632 + }, + { + "epoch": 0.3635625, + "grad_norm": 3.71875, + "grad_norm_var": 1.595905558268229, + "learning_rate": 0.0001, + "loss": 6.3605, + "loss/crossentropy": 2.544732093811035, + "loss/hidden": 1.62890625, + "loss/jsd": 0.0, + "loss/logits": 0.2186906337738037, + "step": 11634 + }, + { + "epoch": 0.363625, + "grad_norm": 3.203125, + "grad_norm_var": 1.64381103515625, + "learning_rate": 0.0001, + "loss": 5.8801, + "loss/crossentropy": 2.657926321029663, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17182911932468414, + "step": 11636 + }, + { + "epoch": 0.3636875, + "grad_norm": 3.34375, + "grad_norm_var": 1.6316691080729167, + "learning_rate": 0.0001, + "loss": 5.957, + "loss/crossentropy": 2.5820019245147705, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18203197419643402, + "step": 11638 + }, + { + "epoch": 0.36375, + "grad_norm": 3.078125, + "grad_norm_var": 1.6466471354166667, + "learning_rate": 0.0001, + "loss": 5.8765, + "loss/crossentropy": 2.6228911876678467, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17418856918811798, + "step": 11640 + }, + { + "epoch": 0.3638125, + "grad_norm": 3.078125, + "grad_norm_var": 1.6611480712890625, + "learning_rate": 0.0001, + "loss": 5.718, + "loss/crossentropy": 2.4857271909713745, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17166081070899963, + "step": 11642 + }, + { + "epoch": 0.363875, + "grad_norm": 3.546875, + "grad_norm_var": 1.7013092041015625, + "learning_rate": 0.0001, + "loss": 6.1816, + "loss/crossentropy": 2.5950502157211304, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19928032159805298, + "step": 11644 + }, + { + "epoch": 0.3639375, + "grad_norm": 3.078125, + "grad_norm_var": 1.7250315348307292, + "learning_rate": 0.0001, + "loss": 6.0275, + "loss/crossentropy": 2.6586222648620605, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1841554269194603, + "step": 11646 + }, + { + "epoch": 0.364, + "grad_norm": 3.359375, + "grad_norm_var": 1.7169748942057292, + "learning_rate": 0.0001, + "loss": 6.0681, + "loss/crossentropy": 2.6864718198776245, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18503419309854507, + "step": 11648 + }, + { + "epoch": 0.3640625, + "grad_norm": 3.140625, + "grad_norm_var": 0.13170572916666667, + "learning_rate": 0.0001, + "loss": 5.9131, + "loss/crossentropy": 2.6158591508865356, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17894399911165237, + "step": 11650 + }, + { + "epoch": 0.364125, + "grad_norm": 3.78125, + "grad_norm_var": 0.13697509765625, + "learning_rate": 0.0001, + "loss": 5.522, + "loss/crossentropy": 2.3324047327041626, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1646653339266777, + "step": 11652 + }, + { + "epoch": 0.3641875, + "grad_norm": 3.40625, + "grad_norm_var": 0.14096577962239584, + "learning_rate": 0.0001, + "loss": 5.791, + "loss/crossentropy": 2.4745148420333862, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.18203992396593094, + "step": 11654 + }, + { + "epoch": 0.36425, + "grad_norm": 3.0625, + "grad_norm_var": 0.14129130045572916, + "learning_rate": 0.0001, + "loss": 5.5313, + "loss/crossentropy": 2.3341383934020996, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17166809737682343, + "step": 11656 + }, + { + "epoch": 0.3643125, + "grad_norm": 3.328125, + "grad_norm_var": 0.1370025634765625, + "learning_rate": 0.0001, + "loss": 5.5383, + "loss/crossentropy": 2.3614786863327026, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1692415550351143, + "step": 11658 + }, + { + "epoch": 0.364375, + "grad_norm": 2.890625, + "grad_norm_var": 0.05548502604166667, + "learning_rate": 0.0001, + "loss": 5.4776, + "loss/crossentropy": 2.327902913093567, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1641886830329895, + "step": 11660 + }, + { + "epoch": 0.3644375, + "grad_norm": 4.125, + "grad_norm_var": 0.0938140869140625, + "learning_rate": 0.0001, + "loss": 5.8242, + "loss/crossentropy": 2.3535810708999634, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.190810889005661, + "step": 11662 + }, + { + "epoch": 0.3645, + "grad_norm": 3.25, + "grad_norm_var": 0.09388020833333334, + "learning_rate": 0.0001, + "loss": 5.8349, + "loss/crossentropy": 2.6038748025894165, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17388267070055008, + "step": 11664 + }, + { + "epoch": 0.3645625, + "grad_norm": 3.28125, + "grad_norm_var": 0.09176025390625, + "learning_rate": 0.0001, + "loss": 5.7459, + "loss/crossentropy": 2.4753239154815674, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1743217408657074, + "step": 11666 + }, + { + "epoch": 0.364625, + "grad_norm": 3.59375, + "grad_norm_var": 0.0791412353515625, + "learning_rate": 0.0001, + "loss": 6.2747, + "loss/crossentropy": 2.79960834980011, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19282344728708267, + "step": 11668 + }, + { + "epoch": 0.3646875, + "grad_norm": 3.828125, + "grad_norm_var": 0.09381103515625, + "learning_rate": 0.0001, + "loss": 5.9164, + "loss/crossentropy": 2.572916865348816, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.17809413373470306, + "step": 11670 + }, + { + "epoch": 0.36475, + "grad_norm": 3.1875, + "grad_norm_var": 0.08993733723958333, + "learning_rate": 0.0001, + "loss": 5.8124, + "loss/crossentropy": 2.5034350156784058, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.17386088520288467, + "step": 11672 + }, + { + "epoch": 0.3648125, + "grad_norm": 3.171875, + "grad_norm_var": 0.0907379150390625, + "learning_rate": 0.0001, + "loss": 6.0899, + "loss/crossentropy": 2.7098684310913086, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18527349829673767, + "step": 11674 + }, + { + "epoch": 0.364875, + "grad_norm": 3.328125, + "grad_norm_var": 0.07219136555989583, + "learning_rate": 0.0001, + "loss": 5.8037, + "loss/crossentropy": 2.4899182319641113, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.17356415838003159, + "step": 11676 + }, + { + "epoch": 0.3649375, + "grad_norm": 3.375, + "grad_norm_var": 0.030013020833333334, + "learning_rate": 0.0001, + "loss": 6.0627, + "loss/crossentropy": 2.7277345657348633, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18271364271640778, + "step": 11678 + }, + { + "epoch": 0.365, + "grad_norm": 3.734375, + "grad_norm_var": 0.040299479166666666, + "learning_rate": 0.0001, + "loss": 6.0736, + "loss/crossentropy": 2.6390098333358765, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18916141241788864, + "step": 11680 + }, + { + "epoch": 0.3650625, + "grad_norm": 3.0, + "grad_norm_var": 0.04865620930989583, + "learning_rate": 0.0001, + "loss": 5.5582, + "loss/crossentropy": 2.3566737174987793, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1681957244873047, + "step": 11682 + }, + { + "epoch": 0.365125, + "grad_norm": 3.25, + "grad_norm_var": 0.0457672119140625, + "learning_rate": 0.0001, + "loss": 5.7941, + "loss/crossentropy": 2.5121636390686035, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17780755460262299, + "step": 11684 + }, + { + "epoch": 0.3651875, + "grad_norm": 3.96875, + "grad_norm_var": 0.0543853759765625, + "learning_rate": 0.0001, + "loss": 6.0005, + "loss/crossentropy": 2.5540707111358643, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.18605080246925354, + "step": 11686 + }, + { + "epoch": 0.36525, + "grad_norm": 3.03125, + "grad_norm_var": 0.059691365559895834, + "learning_rate": 0.0001, + "loss": 5.9282, + "loss/crossentropy": 2.586628556251526, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.17712734639644623, + "step": 11688 + }, + { + "epoch": 0.3653125, + "grad_norm": 3.5, + "grad_norm_var": 0.05976155598958333, + "learning_rate": 0.0001, + "loss": 5.9325, + "loss/crossentropy": 2.536140561103821, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1876792460680008, + "step": 11690 + }, + { + "epoch": 0.365375, + "grad_norm": 3.34375, + "grad_norm_var": 0.06415608723958334, + "learning_rate": 0.0001, + "loss": 5.9249, + "loss/crossentropy": 2.6262542009353638, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17908621579408646, + "step": 11692 + }, + { + "epoch": 0.3654375, + "grad_norm": 3.53125, + "grad_norm_var": 0.0690093994140625, + "learning_rate": 0.0001, + "loss": 5.7679, + "loss/crossentropy": 2.517289161682129, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17428414523601532, + "step": 11694 + }, + { + "epoch": 0.3655, + "grad_norm": 3.21875, + "grad_norm_var": 0.06822916666666666, + "learning_rate": 0.0001, + "loss": 5.8793, + "loss/crossentropy": 2.5547901391983032, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18010495603084564, + "step": 11696 + }, + { + "epoch": 0.3655625, + "grad_norm": 3.359375, + "grad_norm_var": 0.060774739583333334, + "learning_rate": 0.0001, + "loss": 5.7699, + "loss/crossentropy": 2.498196601867676, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17561272531747818, + "step": 11698 + }, + { + "epoch": 0.365625, + "grad_norm": 3.5, + "grad_norm_var": 0.07692057291666667, + "learning_rate": 0.0001, + "loss": 6.0204, + "loss/crossentropy": 2.573630452156067, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18920981138944626, + "step": 11700 + }, + { + "epoch": 0.3656875, + "grad_norm": 2.9375, + "grad_norm_var": 0.07570699055989584, + "learning_rate": 0.0001, + "loss": 5.524, + "loss/crossentropy": 2.4320229291915894, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1638863906264305, + "step": 11702 + }, + { + "epoch": 0.36575, + "grad_norm": 3.0625, + "grad_norm_var": 0.074658203125, + "learning_rate": 0.0001, + "loss": 5.3621, + "loss/crossentropy": 2.2419254183769226, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1627953201532364, + "step": 11704 + }, + { + "epoch": 0.3658125, + "grad_norm": 3.03125, + "grad_norm_var": 0.07830403645833334, + "learning_rate": 0.0001, + "loss": 5.7887, + "loss/crossentropy": 2.5559518337249756, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17327089607715607, + "step": 11706 + }, + { + "epoch": 0.365875, + "grad_norm": 3.09375, + "grad_norm_var": 0.07842508951822917, + "learning_rate": 0.0001, + "loss": 5.7768, + "loss/crossentropy": 2.544148802757263, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17405012249946594, + "step": 11708 + }, + { + "epoch": 0.3659375, + "grad_norm": 3.265625, + "grad_norm_var": 0.07327473958333333, + "learning_rate": 0.0001, + "loss": 5.8207, + "loss/crossentropy": 2.5839486122131348, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17094473540782928, + "step": 11710 + }, + { + "epoch": 0.366, + "grad_norm": 3.359375, + "grad_norm_var": 0.06852925618489583, + "learning_rate": 0.0001, + "loss": 6.0067, + "loss/crossentropy": 2.642910599708557, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18208573758602142, + "step": 11712 + }, + { + "epoch": 0.3660625, + "grad_norm": 3.359375, + "grad_norm_var": 0.06663411458333333, + "learning_rate": 0.0001, + "loss": 6.038, + "loss/crossentropy": 2.672016978263855, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18269260227680206, + "step": 11714 + }, + { + "epoch": 0.366125, + "grad_norm": 3.3125, + "grad_norm_var": 0.028392537434895834, + "learning_rate": 0.0001, + "loss": 5.82, + "loss/crossentropy": 2.5538461208343506, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17856648564338684, + "step": 11716 + }, + { + "epoch": 0.3661875, + "grad_norm": 3.15625, + "grad_norm_var": 0.02291259765625, + "learning_rate": 0.0001, + "loss": 5.836, + "loss/crossentropy": 2.6154959201812744, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1712726205587387, + "step": 11718 + }, + { + "epoch": 0.36625, + "grad_norm": 3.203125, + "grad_norm_var": 0.0215728759765625, + "learning_rate": 0.0001, + "loss": 5.932, + "loss/crossentropy": 2.6341400146484375, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1809549182653427, + "step": 11720 + }, + { + "epoch": 0.3663125, + "grad_norm": 3.3125, + "grad_norm_var": 0.019205729166666668, + "learning_rate": 0.0001, + "loss": 6.1397, + "loss/crossentropy": 2.791115164756775, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18173208832740784, + "step": 11722 + }, + { + "epoch": 0.366375, + "grad_norm": 3.265625, + "grad_norm_var": 0.019921875, + "learning_rate": 0.0001, + "loss": 5.9471, + "loss/crossentropy": 2.646299362182617, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17812715470790863, + "step": 11724 + }, + { + "epoch": 0.3664375, + "grad_norm": 3.3125, + "grad_norm_var": 0.0202056884765625, + "learning_rate": 0.0001, + "loss": 5.9564, + "loss/crossentropy": 2.6312177181243896, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18056128174066544, + "step": 11726 + }, + { + "epoch": 0.3665, + "grad_norm": 3.421875, + "grad_norm_var": 0.0116363525390625, + "learning_rate": 0.0001, + "loss": 5.8562, + "loss/crossentropy": 2.532650113105774, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18156994879245758, + "step": 11728 + }, + { + "epoch": 0.3665625, + "grad_norm": 3.28125, + "grad_norm_var": 0.009566243489583333, + "learning_rate": 0.0001, + "loss": 6.1817, + "loss/crossentropy": 2.808872938156128, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18416258692741394, + "step": 11730 + }, + { + "epoch": 0.366625, + "grad_norm": 3.109375, + "grad_norm_var": 0.009358723958333334, + "learning_rate": 0.0001, + "loss": 5.7934, + "loss/crossentropy": 2.5109959840774536, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1743355542421341, + "step": 11732 + }, + { + "epoch": 0.3666875, + "grad_norm": 3.265625, + "grad_norm_var": 0.030859375, + "learning_rate": 0.0001, + "loss": 6.2513, + "loss/crossentropy": 2.783187747001648, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.19368354231119156, + "step": 11734 + }, + { + "epoch": 0.36675, + "grad_norm": 2.984375, + "grad_norm_var": 0.0400543212890625, + "learning_rate": 0.0001, + "loss": 5.7834, + "loss/crossentropy": 2.5999754667282104, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16912198066711426, + "step": 11736 + }, + { + "epoch": 0.3668125, + "grad_norm": 3.390625, + "grad_norm_var": 0.04117431640625, + "learning_rate": 0.0001, + "loss": 5.9766, + "loss/crossentropy": 2.683970332145691, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17809562385082245, + "step": 11738 + }, + { + "epoch": 0.366875, + "grad_norm": 3.265625, + "grad_norm_var": 0.054182942708333334, + "learning_rate": 0.0001, + "loss": 6.0108, + "loss/crossentropy": 2.56476891040802, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.18366805464029312, + "step": 11740 + }, + { + "epoch": 0.3669375, + "grad_norm": 3.15625, + "grad_norm_var": 0.057763671875, + "learning_rate": 0.0001, + "loss": 5.6899, + "loss/crossentropy": 2.467342257499695, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17264722287654877, + "step": 11742 + }, + { + "epoch": 0.367, + "grad_norm": 3.296875, + "grad_norm_var": 0.0559234619140625, + "learning_rate": 0.0001, + "loss": 5.7186, + "loss/crossentropy": 2.435905694961548, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17710177600383759, + "step": 11744 + }, + { + "epoch": 0.3670625, + "grad_norm": 3.296875, + "grad_norm_var": 0.055924479166666666, + "learning_rate": 0.0001, + "loss": 5.6426, + "loss/crossentropy": 2.3441178798675537, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17633193731307983, + "step": 11746 + }, + { + "epoch": 0.367125, + "grad_norm": 3.265625, + "grad_norm_var": 0.05445556640625, + "learning_rate": 0.0001, + "loss": 5.7862, + "loss/crossentropy": 2.5711851119995117, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17189423739910126, + "step": 11748 + }, + { + "epoch": 0.3671875, + "grad_norm": 3.34375, + "grad_norm_var": 0.03928934733072917, + "learning_rate": 0.0001, + "loss": 5.7056, + "loss/crossentropy": 2.3876869678497314, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17944584786891937, + "step": 11750 + }, + { + "epoch": 0.36725, + "grad_norm": 3.484375, + "grad_norm_var": 0.027912394205729166, + "learning_rate": 0.0001, + "loss": 6.0769, + "loss/crossentropy": 2.682102680206299, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18322941660881042, + "step": 11752 + }, + { + "epoch": 0.3673125, + "grad_norm": 3.15625, + "grad_norm_var": 0.029850260416666666, + "learning_rate": 0.0001, + "loss": 6.0325, + "loss/crossentropy": 2.6718112230300903, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18177196383476257, + "step": 11754 + }, + { + "epoch": 0.367375, + "grad_norm": 3.21875, + "grad_norm_var": 0.01611328125, + "learning_rate": 0.0001, + "loss": 5.8179, + "loss/crossentropy": 2.515088200569153, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1787201538681984, + "step": 11756 + }, + { + "epoch": 0.3674375, + "grad_norm": 3.40625, + "grad_norm_var": 0.013036092122395834, + "learning_rate": 0.0001, + "loss": 5.9501, + "loss/crossentropy": 2.6708627939224243, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17753348499536514, + "step": 11758 + }, + { + "epoch": 0.3675, + "grad_norm": 3.328125, + "grad_norm_var": 0.015851847330729165, + "learning_rate": 0.0001, + "loss": 5.7554, + "loss/crossentropy": 2.5031509399414062, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17796175181865692, + "step": 11760 + }, + { + "epoch": 0.3675625, + "grad_norm": 2.90625, + "grad_norm_var": 0.026005045572916666, + "learning_rate": 0.0001, + "loss": 5.5039, + "loss/crossentropy": 2.3185973167419434, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1685332953929901, + "step": 11762 + }, + { + "epoch": 0.367625, + "grad_norm": 3.4375, + "grad_norm_var": 0.02691650390625, + "learning_rate": 0.0001, + "loss": 6.066, + "loss/crossentropy": 2.6969223022460938, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18534033745527267, + "step": 11764 + }, + { + "epoch": 0.3676875, + "grad_norm": 3.0, + "grad_norm_var": 0.026204427083333332, + "learning_rate": 0.0001, + "loss": 6.0345, + "loss/crossentropy": 2.7123241424560547, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.181439608335495, + "step": 11766 + }, + { + "epoch": 0.36775, + "grad_norm": 3.3125, + "grad_norm_var": 0.021773274739583334, + "learning_rate": 0.0001, + "loss": 5.5845, + "loss/crossentropy": 2.3817098140716553, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17106405645608902, + "step": 11768 + }, + { + "epoch": 0.3678125, + "grad_norm": 3.328125, + "grad_norm_var": 0.029520670572916668, + "learning_rate": 0.0001, + "loss": 5.9256, + "loss/crossentropy": 2.521042823791504, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.18108432739973068, + "step": 11770 + }, + { + "epoch": 0.367875, + "grad_norm": 3.265625, + "grad_norm_var": 0.029255167643229166, + "learning_rate": 0.0001, + "loss": 5.6482, + "loss/crossentropy": 2.3615509271621704, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17514477670192719, + "step": 11772 + }, + { + "epoch": 0.3679375, + "grad_norm": 3.1875, + "grad_norm_var": 0.027880859375, + "learning_rate": 0.0001, + "loss": 6.0203, + "loss/crossentropy": 2.6792795658111572, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1809818148612976, + "step": 11774 + }, + { + "epoch": 0.368, + "grad_norm": 3.234375, + "grad_norm_var": 0.02574462890625, + "learning_rate": 0.0001, + "loss": 5.7908, + "loss/crossentropy": 2.4552351236343384, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17691315710544586, + "step": 11776 + }, + { + "epoch": 0.3680625, + "grad_norm": 3.09375, + "grad_norm_var": 0.018879191080729166, + "learning_rate": 0.0001, + "loss": 5.8327, + "loss/crossentropy": 2.4948712587356567, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1775280386209488, + "step": 11778 + }, + { + "epoch": 0.368125, + "grad_norm": 3.5, + "grad_norm_var": 0.020116170247395832, + "learning_rate": 0.0001, + "loss": 5.9142, + "loss/crossentropy": 2.597373604774475, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17777980864048004, + "step": 11780 + }, + { + "epoch": 0.3681875, + "grad_norm": 3.40625, + "grad_norm_var": 0.014567057291666666, + "learning_rate": 0.0001, + "loss": 5.594, + "loss/crossentropy": 2.3891193866729736, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17009887844324112, + "step": 11782 + }, + { + "epoch": 0.36825, + "grad_norm": 3.171875, + "grad_norm_var": 0.015511067708333333, + "learning_rate": 0.0001, + "loss": 5.81, + "loss/crossentropy": 2.645570397377014, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16409894824028015, + "step": 11784 + }, + { + "epoch": 0.3683125, + "grad_norm": 3.296875, + "grad_norm_var": 0.0119049072265625, + "learning_rate": 0.0001, + "loss": 5.6802, + "loss/crossentropy": 2.476786255836487, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.168389730155468, + "step": 11786 + }, + { + "epoch": 0.368375, + "grad_norm": 3.703125, + "grad_norm_var": 0.370654296875, + "learning_rate": 0.0001, + "loss": 5.9516, + "loss/crossentropy": 2.423331618309021, + "loss/hidden": 1.65625, + "loss/jsd": 0.0, + "loss/logits": 0.1872054487466812, + "step": 11788 + }, + { + "epoch": 0.3684375, + "grad_norm": 3.125, + "grad_norm_var": 0.3765289306640625, + "learning_rate": 0.0001, + "loss": 5.709, + "loss/crossentropy": 2.542636752128601, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1697589010000229, + "step": 11790 + }, + { + "epoch": 0.3685, + "grad_norm": 3.625, + "grad_norm_var": 0.37255452473958334, + "learning_rate": 0.0001, + "loss": 6.1027, + "loss/crossentropy": 2.6770007610321045, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1878843605518341, + "step": 11792 + }, + { + "epoch": 0.3685625, + "grad_norm": 2.875, + "grad_norm_var": 0.3907389322916667, + "learning_rate": 0.0001, + "loss": 5.5742, + "loss/crossentropy": 2.499484658241272, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16294343024492264, + "step": 11794 + }, + { + "epoch": 0.368625, + "grad_norm": 3.40625, + "grad_norm_var": 0.3915201822916667, + "learning_rate": 0.0001, + "loss": 5.8344, + "loss/crossentropy": 2.538533926010132, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17880485951900482, + "step": 11796 + }, + { + "epoch": 0.3686875, + "grad_norm": 3.40625, + "grad_norm_var": 0.39090067545572915, + "learning_rate": 0.0001, + "loss": 6.1114, + "loss/crossentropy": 2.6758971214294434, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1861327812075615, + "step": 11798 + }, + { + "epoch": 0.36875, + "grad_norm": 3.515625, + "grad_norm_var": 0.3773834228515625, + "learning_rate": 0.0001, + "loss": 6.3307, + "loss/crossentropy": 2.873810648918152, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.19256100058555603, + "step": 11800 + }, + { + "epoch": 0.3688125, + "grad_norm": 3.234375, + "grad_norm_var": 0.3807525634765625, + "learning_rate": 0.0001, + "loss": 5.6037, + "loss/crossentropy": 2.340093493461609, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17205937206745148, + "step": 11802 + }, + { + "epoch": 0.368875, + "grad_norm": 3.359375, + "grad_norm_var": 0.046174112955729166, + "learning_rate": 0.0001, + "loss": 5.8077, + "loss/crossentropy": 2.5863677263259888, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17096296697854996, + "step": 11804 + }, + { + "epoch": 0.3689375, + "grad_norm": 3.1875, + "grad_norm_var": 0.04405924479166667, + "learning_rate": 0.0001, + "loss": 5.8759, + "loss/crossentropy": 2.5780467987060547, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1790032461285591, + "step": 11806 + }, + { + "epoch": 0.369, + "grad_norm": 3.484375, + "grad_norm_var": 0.07014567057291667, + "learning_rate": 0.0001, + "loss": 5.6983, + "loss/crossentropy": 2.3743369579315186, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17927594482898712, + "step": 11808 + }, + { + "epoch": 0.3690625, + "grad_norm": 3.078125, + "grad_norm_var": 0.0919921875, + "learning_rate": 0.0001, + "loss": 5.9331, + "loss/crossentropy": 2.5354260206222534, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18741943687200546, + "step": 11810 + }, + { + "epoch": 0.369125, + "grad_norm": 3.40625, + "grad_norm_var": 0.09038798014322917, + "learning_rate": 0.0001, + "loss": 5.9963, + "loss/crossentropy": 2.649405837059021, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17805065214633942, + "step": 11812 + }, + { + "epoch": 0.3691875, + "grad_norm": 3.4375, + "grad_norm_var": 0.10051676432291666, + "learning_rate": 0.0001, + "loss": 5.9888, + "loss/crossentropy": 2.676456570625305, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1777212768793106, + "step": 11814 + }, + { + "epoch": 0.36925, + "grad_norm": 3.296875, + "grad_norm_var": 0.10239156087239583, + "learning_rate": 0.0001, + "loss": 5.9971, + "loss/crossentropy": 2.6781569719314575, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18072041869163513, + "step": 11816 + }, + { + "epoch": 0.3693125, + "grad_norm": 3.09375, + "grad_norm_var": 0.10413004557291666, + "learning_rate": 0.0001, + "loss": 6.1823, + "loss/crossentropy": 2.8319051265716553, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1807391345500946, + "step": 11818 + }, + { + "epoch": 0.369375, + "grad_norm": 3.03125, + "grad_norm_var": 0.11757405598958333, + "learning_rate": 0.0001, + "loss": 5.6943, + "loss/crossentropy": 2.5074106454849243, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16947129368782043, + "step": 11820 + }, + { + "epoch": 0.3694375, + "grad_norm": 3.1875, + "grad_norm_var": 0.12092692057291667, + "learning_rate": 0.0001, + "loss": 6.0688, + "loss/crossentropy": 2.6527081727981567, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1896558776497841, + "step": 11822 + }, + { + "epoch": 0.3695, + "grad_norm": 3.5625, + "grad_norm_var": 0.10622456868489584, + "learning_rate": 0.0001, + "loss": 6.4309, + "loss/crossentropy": 2.8148432970046997, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.20184143632650375, + "step": 11824 + }, + { + "epoch": 0.3695625, + "grad_norm": 3.15625, + "grad_norm_var": 0.05650634765625, + "learning_rate": 0.0001, + "loss": 5.8168, + "loss/crossentropy": 2.520147204399109, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.18044225126504898, + "step": 11826 + }, + { + "epoch": 0.369625, + "grad_norm": 3.359375, + "grad_norm_var": 0.05172119140625, + "learning_rate": 0.0001, + "loss": 5.8837, + "loss/crossentropy": 2.5915223360061646, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1780419945716858, + "step": 11828 + }, + { + "epoch": 0.3696875, + "grad_norm": 3.296875, + "grad_norm_var": 0.0499908447265625, + "learning_rate": 0.0001, + "loss": 5.6589, + "loss/crossentropy": 2.4262943267822266, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.16857632994651794, + "step": 11830 + }, + { + "epoch": 0.36975, + "grad_norm": 3.203125, + "grad_norm_var": 0.0506988525390625, + "learning_rate": 0.0001, + "loss": 6.0, + "loss/crossentropy": 2.713708758354187, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1762854903936386, + "step": 11832 + }, + { + "epoch": 0.3698125, + "grad_norm": 3.203125, + "grad_norm_var": 0.04698893229166667, + "learning_rate": 0.0001, + "loss": 5.6122, + "loss/crossentropy": 2.363717198371887, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17445889115333557, + "step": 11834 + }, + { + "epoch": 0.369875, + "grad_norm": 3.328125, + "grad_norm_var": 0.0415191650390625, + "learning_rate": 0.0001, + "loss": 5.7302, + "loss/crossentropy": 2.459938406944275, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17624631524085999, + "step": 11836 + }, + { + "epoch": 0.3699375, + "grad_norm": 3.421875, + "grad_norm_var": 0.0331451416015625, + "learning_rate": 0.0001, + "loss": 5.8738, + "loss/crossentropy": 2.522003412246704, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18127425760030746, + "step": 11838 + }, + { + "epoch": 0.37, + "grad_norm": 3.078125, + "grad_norm_var": 0.009891764322916666, + "learning_rate": 0.0001, + "loss": 5.664, + "loss/crossentropy": 2.4958502054214478, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1675918996334076, + "step": 11840 + }, + { + "epoch": 0.3700625, + "grad_norm": 3.578125, + "grad_norm_var": 0.016437784830729166, + "learning_rate": 0.0001, + "loss": 6.1671, + "loss/crossentropy": 2.7934341430664062, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1811162754893303, + "step": 11842 + }, + { + "epoch": 0.370125, + "grad_norm": 3.25, + "grad_norm_var": 0.016893513997395835, + "learning_rate": 0.0001, + "loss": 5.8931, + "loss/crossentropy": 2.596327543258667, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17889384925365448, + "step": 11844 + }, + { + "epoch": 0.3701875, + "grad_norm": 3.15625, + "grad_norm_var": 0.016304524739583333, + "learning_rate": 0.0001, + "loss": 5.938, + "loss/crossentropy": 2.633580446243286, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17887794226408005, + "step": 11846 + }, + { + "epoch": 0.37025, + "grad_norm": 3.046875, + "grad_norm_var": 0.021187337239583333, + "learning_rate": 0.0001, + "loss": 5.9855, + "loss/crossentropy": 2.6642227172851562, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18134354054927826, + "step": 11848 + }, + { + "epoch": 0.3703125, + "grad_norm": 3.140625, + "grad_norm_var": 0.022294108072916666, + "learning_rate": 0.0001, + "loss": 5.7663, + "loss/crossentropy": 2.5717811584472656, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16749581694602966, + "step": 11850 + }, + { + "epoch": 0.370375, + "grad_norm": 3.140625, + "grad_norm_var": 0.021451822916666665, + "learning_rate": 0.0001, + "loss": 5.8715, + "loss/crossentropy": 2.627884268760681, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17474838346242905, + "step": 11852 + }, + { + "epoch": 0.3704375, + "grad_norm": 3.234375, + "grad_norm_var": 0.0182281494140625, + "learning_rate": 0.0001, + "loss": 5.5614, + "loss/crossentropy": 2.3951979875564575, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.16310697048902512, + "step": 11854 + }, + { + "epoch": 0.3705, + "grad_norm": 3.21875, + "grad_norm_var": 0.017281087239583333, + "learning_rate": 0.0001, + "loss": 5.9406, + "loss/crossentropy": 2.6456116437911987, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17950201779603958, + "step": 11856 + }, + { + "epoch": 0.3705625, + "grad_norm": 3.3125, + "grad_norm_var": 0.008918253580729167, + "learning_rate": 0.0001, + "loss": 5.8216, + "loss/crossentropy": 2.5380886793136597, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17678741365671158, + "step": 11858 + }, + { + "epoch": 0.370625, + "grad_norm": 3.3125, + "grad_norm_var": 0.0146484375, + "learning_rate": 0.0001, + "loss": 5.8239, + "loss/crossentropy": 2.5890220403671265, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17348285764455795, + "step": 11860 + }, + { + "epoch": 0.3706875, + "grad_norm": 3.03125, + "grad_norm_var": 0.01617431640625, + "learning_rate": 0.0001, + "loss": 5.8439, + "loss/crossentropy": 2.6098623275756836, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17457354068756104, + "step": 11862 + }, + { + "epoch": 0.37075, + "grad_norm": 3.296875, + "grad_norm_var": 0.0123199462890625, + "learning_rate": 0.0001, + "loss": 5.7248, + "loss/crossentropy": 2.4863415956497192, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.16994335502386093, + "step": 11864 + }, + { + "epoch": 0.3708125, + "grad_norm": 3.234375, + "grad_norm_var": 0.012548828125, + "learning_rate": 0.0001, + "loss": 5.8885, + "loss/crossentropy": 2.595168352127075, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1777665838599205, + "step": 11866 + }, + { + "epoch": 0.370875, + "grad_norm": 3.59375, + "grad_norm_var": 0.021187337239583333, + "learning_rate": 0.0001, + "loss": 5.8157, + "loss/crossentropy": 2.5132588148117065, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.18023964762687683, + "step": 11868 + }, + { + "epoch": 0.3709375, + "grad_norm": 3.515625, + "grad_norm_var": 0.026244099934895834, + "learning_rate": 0.0001, + "loss": 5.5236, + "loss/crossentropy": 2.3587870597839355, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1688261479139328, + "step": 11870 + }, + { + "epoch": 0.371, + "grad_norm": 3.15625, + "grad_norm_var": 0.028929646809895834, + "learning_rate": 0.0001, + "loss": 5.6618, + "loss/crossentropy": 2.5152169466018677, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1634834259748459, + "step": 11872 + }, + { + "epoch": 0.3710625, + "grad_norm": 3.65625, + "grad_norm_var": 0.04599202473958333, + "learning_rate": 0.0001, + "loss": 5.8343, + "loss/crossentropy": 2.5197051763534546, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1779429167509079, + "step": 11874 + }, + { + "epoch": 0.371125, + "grad_norm": 3.40625, + "grad_norm_var": 0.0421783447265625, + "learning_rate": 0.0001, + "loss": 5.9015, + "loss/crossentropy": 2.619465470314026, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17663691192865372, + "step": 11876 + }, + { + "epoch": 0.3711875, + "grad_norm": 3.28125, + "grad_norm_var": 0.046826171875, + "learning_rate": 0.0001, + "loss": 5.7655, + "loss/crossentropy": 2.504304528236389, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1761169210076332, + "step": 11878 + }, + { + "epoch": 0.37125, + "grad_norm": 3.53125, + "grad_norm_var": 0.04921773274739583, + "learning_rate": 0.0001, + "loss": 6.141, + "loss/crossentropy": 2.733517646789551, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18567375093698502, + "step": 11880 + }, + { + "epoch": 0.3713125, + "grad_norm": 3.4375, + "grad_norm_var": 0.0548736572265625, + "learning_rate": 0.0001, + "loss": 5.9319, + "loss/crossentropy": 2.5222357511520386, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18354719132184982, + "step": 11882 + }, + { + "epoch": 0.371375, + "grad_norm": 3.1875, + "grad_norm_var": 0.050104777018229164, + "learning_rate": 0.0001, + "loss": 6.1632, + "loss/crossentropy": 2.722404360771179, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.19056753814220428, + "step": 11884 + }, + { + "epoch": 0.3714375, + "grad_norm": 3.390625, + "grad_norm_var": 0.046052042643229166, + "learning_rate": 0.0001, + "loss": 6.281, + "loss/crossentropy": 2.844220280647278, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1893763542175293, + "step": 11886 + }, + { + "epoch": 0.3715, + "grad_norm": 3.359375, + "grad_norm_var": 0.038426717122395836, + "learning_rate": 0.0001, + "loss": 5.9486, + "loss/crossentropy": 2.6783348321914673, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17663230001926422, + "step": 11888 + }, + { + "epoch": 0.3715625, + "grad_norm": 3.28125, + "grad_norm_var": 0.03242085774739583, + "learning_rate": 0.0001, + "loss": 5.7972, + "loss/crossentropy": 2.472720503807068, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1793181523680687, + "step": 11890 + }, + { + "epoch": 0.371625, + "grad_norm": 3.390625, + "grad_norm_var": 0.024995930989583335, + "learning_rate": 0.0001, + "loss": 5.859, + "loss/crossentropy": 2.5683202743530273, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17515835911035538, + "step": 11892 + }, + { + "epoch": 0.3716875, + "grad_norm": 3.09375, + "grad_norm_var": 0.019505818684895832, + "learning_rate": 0.0001, + "loss": 5.8766, + "loss/crossentropy": 2.563071131706238, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18018251657485962, + "step": 11894 + }, + { + "epoch": 0.37175, + "grad_norm": 3.40625, + "grad_norm_var": 0.016893513997395835, + "learning_rate": 0.0001, + "loss": 5.803, + "loss/crossentropy": 2.605095624923706, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17174078524112701, + "step": 11896 + }, + { + "epoch": 0.3718125, + "grad_norm": 3.328125, + "grad_norm_var": 0.01591796875, + "learning_rate": 0.0001, + "loss": 5.4955, + "loss/crossentropy": 2.3684515953063965, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16699732095003128, + "step": 11898 + }, + { + "epoch": 0.371875, + "grad_norm": 3.0625, + "grad_norm_var": 0.021678670247395834, + "learning_rate": 0.0001, + "loss": 5.8702, + "loss/crossentropy": 2.6153483390808105, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1739197000861168, + "step": 11900 + }, + { + "epoch": 0.3719375, + "grad_norm": 3.34375, + "grad_norm_var": 0.02447509765625, + "learning_rate": 0.0001, + "loss": 5.5087, + "loss/crossentropy": 2.327476739883423, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1657761111855507, + "step": 11902 + }, + { + "epoch": 0.372, + "grad_norm": 3.8125, + "grad_norm_var": 0.04108072916666667, + "learning_rate": 0.0001, + "loss": 6.0579, + "loss/crossentropy": 2.683075189590454, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1831829622387886, + "step": 11904 + }, + { + "epoch": 0.3720625, + "grad_norm": 3.1875, + "grad_norm_var": 0.03892822265625, + "learning_rate": 0.0001, + "loss": 5.8588, + "loss/crossentropy": 2.5925426483154297, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17467505484819412, + "step": 11906 + }, + { + "epoch": 0.372125, + "grad_norm": 3.171875, + "grad_norm_var": 0.0461822509765625, + "learning_rate": 0.0001, + "loss": 5.984, + "loss/crossentropy": 2.646338939666748, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18103424459695816, + "step": 11908 + }, + { + "epoch": 0.3721875, + "grad_norm": 3.828125, + "grad_norm_var": 0.0583404541015625, + "learning_rate": 0.0001, + "loss": 5.8657, + "loss/crossentropy": 2.5094738006591797, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18406447768211365, + "step": 11910 + }, + { + "epoch": 0.37225, + "grad_norm": 3.375, + "grad_norm_var": 0.06013081868489583, + "learning_rate": 0.0001, + "loss": 5.5741, + "loss/crossentropy": 2.288855195045471, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.17070835828781128, + "step": 11912 + }, + { + "epoch": 0.3723125, + "grad_norm": 3.296875, + "grad_norm_var": 0.07337137858072916, + "learning_rate": 0.0001, + "loss": 5.9294, + "loss/crossentropy": 2.5085290670394897, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18505970388650894, + "step": 11914 + }, + { + "epoch": 0.372375, + "grad_norm": 3.40625, + "grad_norm_var": 0.06606343587239584, + "learning_rate": 0.0001, + "loss": 6.045, + "loss/crossentropy": 2.6599591970443726, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18576952815055847, + "step": 11916 + }, + { + "epoch": 0.3724375, + "grad_norm": 3.0, + "grad_norm_var": 0.07512105305989583, + "learning_rate": 0.0001, + "loss": 5.859, + "loss/crossentropy": 2.6257758140563965, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17527589201927185, + "step": 11918 + }, + { + "epoch": 0.3725, + "grad_norm": 3.1875, + "grad_norm_var": 0.06702067057291666, + "learning_rate": 0.0001, + "loss": 5.622, + "loss/crossentropy": 2.420231819152832, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17290856689214706, + "step": 11920 + }, + { + "epoch": 0.3725625, + "grad_norm": 3.234375, + "grad_norm_var": 0.06565348307291667, + "learning_rate": 0.0001, + "loss": 5.8418, + "loss/crossentropy": 2.585193395614624, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.175659641623497, + "step": 11922 + }, + { + "epoch": 0.372625, + "grad_norm": 3.515625, + "grad_norm_var": 1.6833821614583333, + "learning_rate": 0.0001, + "loss": 6.0385, + "loss/crossentropy": 2.539443612098694, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19248707592487335, + "step": 11924 + }, + { + "epoch": 0.3726875, + "grad_norm": 3.15625, + "grad_norm_var": 1.6886138916015625, + "learning_rate": 0.0001, + "loss": 6.0132, + "loss/crossentropy": 2.6566983461380005, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18252985924482346, + "step": 11926 + }, + { + "epoch": 0.37275, + "grad_norm": 3.15625, + "grad_norm_var": 1.7012115478515626, + "learning_rate": 0.0001, + "loss": 5.6475, + "loss/crossentropy": 2.4480196237564087, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1722874492406845, + "step": 11928 + }, + { + "epoch": 0.3728125, + "grad_norm": 3.171875, + "grad_norm_var": 1.701123046875, + "learning_rate": 0.0001, + "loss": 5.7602, + "loss/crossentropy": 2.413305878639221, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18078355491161346, + "step": 11930 + }, + { + "epoch": 0.372875, + "grad_norm": 3.609375, + "grad_norm_var": 1.6905344645182292, + "learning_rate": 0.0001, + "loss": 5.789, + "loss/crossentropy": 2.392745018005371, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18180835992097855, + "step": 11932 + }, + { + "epoch": 0.3729375, + "grad_norm": 3.015625, + "grad_norm_var": 1.6818196614583334, + "learning_rate": 0.0001, + "loss": 5.7455, + "loss/crossentropy": 2.50241756439209, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17000866681337357, + "step": 11934 + }, + { + "epoch": 0.373, + "grad_norm": 3.265625, + "grad_norm_var": 1.6605133056640624, + "learning_rate": 0.0001, + "loss": 5.9344, + "loss/crossentropy": 2.5978814363479614, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18013402074575424, + "step": 11936 + }, + { + "epoch": 0.3730625, + "grad_norm": 3.21875, + "grad_norm_var": 1.6770579020182292, + "learning_rate": 0.0001, + "loss": 5.7267, + "loss/crossentropy": 2.4426097869873047, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17684456706047058, + "step": 11938 + }, + { + "epoch": 0.373125, + "grad_norm": 3.34375, + "grad_norm_var": 0.0436676025390625, + "learning_rate": 0.0001, + "loss": 6.013, + "loss/crossentropy": 2.625849723815918, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18558523058891296, + "step": 11940 + }, + { + "epoch": 0.3731875, + "grad_norm": 3.921875, + "grad_norm_var": 0.0610015869140625, + "learning_rate": 0.0001, + "loss": 6.1125, + "loss/crossentropy": 2.7566566467285156, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1808934286236763, + "step": 11942 + }, + { + "epoch": 0.37325, + "grad_norm": 3.296875, + "grad_norm_var": 0.05533447265625, + "learning_rate": 0.0001, + "loss": 5.7133, + "loss/crossentropy": 2.4668716192245483, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17229923605918884, + "step": 11944 + }, + { + "epoch": 0.3733125, + "grad_norm": 3.234375, + "grad_norm_var": 0.05533447265625, + "learning_rate": 0.0001, + "loss": 5.5595, + "loss/crossentropy": 2.3742669820785522, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17047683149576187, + "step": 11946 + }, + { + "epoch": 0.373375, + "grad_norm": 3.453125, + "grad_norm_var": 0.04828999837239583, + "learning_rate": 0.0001, + "loss": 5.9406, + "loss/crossentropy": 2.5744495391845703, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18388355523347855, + "step": 11948 + }, + { + "epoch": 0.3734375, + "grad_norm": 3.09375, + "grad_norm_var": 0.0493072509765625, + "learning_rate": 0.0001, + "loss": 5.7761, + "loss/crossentropy": 2.529895782470703, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17344459891319275, + "step": 11950 + }, + { + "epoch": 0.3735, + "grad_norm": 3.421875, + "grad_norm_var": 0.05340169270833333, + "learning_rate": 0.0001, + "loss": 5.783, + "loss/crossentropy": 2.5650538206100464, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17218929529190063, + "step": 11952 + }, + { + "epoch": 0.3735625, + "grad_norm": 3.484375, + "grad_norm_var": 0.050618489583333336, + "learning_rate": 0.0001, + "loss": 5.6553, + "loss/crossentropy": 2.396847724914551, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17623884230852127, + "step": 11954 + }, + { + "epoch": 0.373625, + "grad_norm": 2.96875, + "grad_norm_var": 0.06588134765625, + "learning_rate": 0.0001, + "loss": 5.986, + "loss/crossentropy": 2.6252458095550537, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18216659128665924, + "step": 11956 + }, + { + "epoch": 0.3736875, + "grad_norm": 3.296875, + "grad_norm_var": 0.041844685872395836, + "learning_rate": 0.0001, + "loss": 5.8041, + "loss/crossentropy": 2.5305240154266357, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17228445410728455, + "step": 11958 + }, + { + "epoch": 0.37375, + "grad_norm": 3.046875, + "grad_norm_var": 0.049637858072916666, + "learning_rate": 0.0001, + "loss": 5.6102, + "loss/crossentropy": 2.4852755069732666, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16288577020168304, + "step": 11960 + }, + { + "epoch": 0.3738125, + "grad_norm": 3.234375, + "grad_norm_var": 0.29588216145833335, + "learning_rate": 0.0001, + "loss": 5.9325, + "loss/crossentropy": 2.562421679496765, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.18740053474903107, + "step": 11962 + }, + { + "epoch": 0.373875, + "grad_norm": 4.59375, + "grad_norm_var": 0.4058990478515625, + "learning_rate": 0.0001, + "loss": 6.2446, + "loss/crossentropy": 2.6552056074142456, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.20503675192594528, + "step": 11964 + }, + { + "epoch": 0.3739375, + "grad_norm": 3.34375, + "grad_norm_var": 0.39443359375, + "learning_rate": 0.0001, + "loss": 5.7876, + "loss/crossentropy": 2.4652920961380005, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17988931387662888, + "step": 11966 + }, + { + "epoch": 0.374, + "grad_norm": 2.984375, + "grad_norm_var": 0.39947509765625, + "learning_rate": 0.0001, + "loss": 5.7098, + "loss/crossentropy": 2.407015323638916, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1748071238398552, + "step": 11968 + }, + { + "epoch": 0.3740625, + "grad_norm": 3.546875, + "grad_norm_var": 0.39192301432291665, + "learning_rate": 0.0001, + "loss": 5.859, + "loss/crossentropy": 2.541646957397461, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17861248552799225, + "step": 11970 + }, + { + "epoch": 0.374125, + "grad_norm": 3.34375, + "grad_norm_var": 0.37522786458333335, + "learning_rate": 0.0001, + "loss": 6.0544, + "loss/crossentropy": 2.697180986404419, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1841549649834633, + "step": 11972 + }, + { + "epoch": 0.3741875, + "grad_norm": 3.65625, + "grad_norm_var": 0.363623046875, + "learning_rate": 0.0001, + "loss": 6.1215, + "loss/crossentropy": 2.690246343612671, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1841374933719635, + "step": 11974 + }, + { + "epoch": 0.37425, + "grad_norm": 2.796875, + "grad_norm_var": 0.39421284993489586, + "learning_rate": 0.0001, + "loss": 5.6795, + "loss/crossentropy": 2.4888908863067627, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17101595550775528, + "step": 11976 + }, + { + "epoch": 0.3743125, + "grad_norm": 3.3125, + "grad_norm_var": 0.19384358723958334, + "learning_rate": 0.0001, + "loss": 5.9827, + "loss/crossentropy": 2.6535253524780273, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18213209509849548, + "step": 11978 + }, + { + "epoch": 0.374375, + "grad_norm": 3.390625, + "grad_norm_var": 0.062027994791666666, + "learning_rate": 0.0001, + "loss": 6.015, + "loss/crossentropy": 2.711008906364441, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17922794818878174, + "step": 11980 + }, + { + "epoch": 0.3744375, + "grad_norm": 3.015625, + "grad_norm_var": 0.06271158854166667, + "learning_rate": 0.0001, + "loss": 5.7833, + "loss/crossentropy": 2.5675182342529297, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17314188182353973, + "step": 11982 + }, + { + "epoch": 0.3745, + "grad_norm": 3.203125, + "grad_norm_var": 0.04506734212239583, + "learning_rate": 0.0001, + "loss": 5.6987, + "loss/crossentropy": 2.4805713891983032, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.16751296073198318, + "step": 11984 + }, + { + "epoch": 0.3745625, + "grad_norm": 3.359375, + "grad_norm_var": 0.04267171223958333, + "learning_rate": 0.0001, + "loss": 5.9659, + "loss/crossentropy": 2.6091278791427612, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18177400529384613, + "step": 11986 + }, + { + "epoch": 0.374625, + "grad_norm": 3.296875, + "grad_norm_var": 0.043017578125, + "learning_rate": 0.0001, + "loss": 5.8874, + "loss/crossentropy": 2.5380669832229614, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1849372684955597, + "step": 11988 + }, + { + "epoch": 0.3746875, + "grad_norm": 3.1875, + "grad_norm_var": 0.0345123291015625, + "learning_rate": 0.0001, + "loss": 5.87, + "loss/crossentropy": 2.6897950172424316, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16996879875659943, + "step": 11990 + }, + { + "epoch": 0.37475, + "grad_norm": 3.78125, + "grad_norm_var": 0.03989969889322917, + "learning_rate": 0.0001, + "loss": 5.6236, + "loss/crossentropy": 2.408652186393738, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1664126068353653, + "step": 11992 + }, + { + "epoch": 0.3748125, + "grad_norm": 3.3125, + "grad_norm_var": 0.04254150390625, + "learning_rate": 0.0001, + "loss": 5.955, + "loss/crossentropy": 2.6086472272872925, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1850292831659317, + "step": 11994 + }, + { + "epoch": 0.374875, + "grad_norm": 3.34375, + "grad_norm_var": 0.0419830322265625, + "learning_rate": 0.0001, + "loss": 5.8619, + "loss/crossentropy": 2.5171536207199097, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17978684604167938, + "step": 11996 + }, + { + "epoch": 0.3749375, + "grad_norm": 3.09375, + "grad_norm_var": 0.048563639322916664, + "learning_rate": 0.0001, + "loss": 6.1212, + "loss/crossentropy": 2.7672228813171387, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18227529525756836, + "step": 11998 + }, + { + "epoch": 0.375, + "grad_norm": 3.328125, + "grad_norm_var": 0.0486328125, + "learning_rate": 0.0001, + "loss": 5.5444, + "loss/crossentropy": 2.3668113946914673, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16619888693094254, + "step": 12000 + }, + { + "epoch": 0.3750625, + "grad_norm": 3.515625, + "grad_norm_var": 0.05091044108072917, + "learning_rate": 0.0001, + "loss": 5.8255, + "loss/crossentropy": 2.539885640144348, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1730951964855194, + "step": 12002 + }, + { + "epoch": 0.375125, + "grad_norm": 3.546875, + "grad_norm_var": 0.053304036458333336, + "learning_rate": 0.0001, + "loss": 5.977, + "loss/crossentropy": 2.6519832611083984, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.17547233402729034, + "step": 12004 + }, + { + "epoch": 0.3751875, + "grad_norm": 3.296875, + "grad_norm_var": 0.0476959228515625, + "learning_rate": 0.0001, + "loss": 6.1005, + "loss/crossentropy": 2.726112723350525, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.185099795460701, + "step": 12006 + }, + { + "epoch": 0.37525, + "grad_norm": 3.5625, + "grad_norm_var": 0.0339019775390625, + "learning_rate": 0.0001, + "loss": 5.9259, + "loss/crossentropy": 2.5760785341262817, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18108032643795013, + "step": 12008 + }, + { + "epoch": 0.3753125, + "grad_norm": 3.1875, + "grad_norm_var": 0.03004150390625, + "learning_rate": 0.0001, + "loss": 5.536, + "loss/crossentropy": 2.247123122215271, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.177718847990036, + "step": 12010 + }, + { + "epoch": 0.375375, + "grad_norm": 3.1875, + "grad_norm_var": 0.028986612955729168, + "learning_rate": 0.0001, + "loss": 5.6847, + "loss/crossentropy": 2.4444741010665894, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17246388643980026, + "step": 12012 + }, + { + "epoch": 0.3754375, + "grad_norm": 3.125, + "grad_norm_var": 0.022191365559895832, + "learning_rate": 0.0001, + "loss": 5.6309, + "loss/crossentropy": 2.492947816848755, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16419032961130142, + "step": 12014 + }, + { + "epoch": 0.3755, + "grad_norm": 3.34375, + "grad_norm_var": 0.021024576822916665, + "learning_rate": 0.0001, + "loss": 5.9138, + "loss/crossentropy": 2.5885088443756104, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18174774199724197, + "step": 12016 + }, + { + "epoch": 0.3755625, + "grad_norm": 3.3125, + "grad_norm_var": 0.020555623372395835, + "learning_rate": 0.0001, + "loss": 6.0689, + "loss/crossentropy": 2.5820603370666504, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1932116001844406, + "step": 12018 + }, + { + "epoch": 0.375625, + "grad_norm": 3.203125, + "grad_norm_var": 0.021272786458333335, + "learning_rate": 0.0001, + "loss": 5.7684, + "loss/crossentropy": 2.509916305541992, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1778009980916977, + "step": 12020 + }, + { + "epoch": 0.3756875, + "grad_norm": 3.390625, + "grad_norm_var": 0.021708170572916668, + "learning_rate": 0.0001, + "loss": 5.6891, + "loss/crossentropy": 2.4291462898254395, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.174431212246418, + "step": 12022 + }, + { + "epoch": 0.37575, + "grad_norm": 3.0625, + "grad_norm_var": 0.0208404541015625, + "learning_rate": 0.0001, + "loss": 5.9232, + "loss/crossentropy": 2.6144766807556152, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18009580671787262, + "step": 12024 + }, + { + "epoch": 0.3758125, + "grad_norm": 3.234375, + "grad_norm_var": 0.0223541259765625, + "learning_rate": 0.0001, + "loss": 5.6589, + "loss/crossentropy": 2.4320074319839478, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17034152150154114, + "step": 12026 + }, + { + "epoch": 0.375875, + "grad_norm": 3.640625, + "grad_norm_var": 0.028694661458333333, + "learning_rate": 0.0001, + "loss": 5.7392, + "loss/crossentropy": 2.4564120769500732, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17866551876068115, + "step": 12028 + }, + { + "epoch": 0.3759375, + "grad_norm": 3.1875, + "grad_norm_var": 0.02779541015625, + "learning_rate": 0.0001, + "loss": 5.8896, + "loss/crossentropy": 2.589442491531372, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17806587368249893, + "step": 12030 + }, + { + "epoch": 0.376, + "grad_norm": 3.3125, + "grad_norm_var": 0.0276519775390625, + "learning_rate": 0.0001, + "loss": 6.0676, + "loss/crossentropy": 2.648952603340149, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18990715593099594, + "step": 12032 + }, + { + "epoch": 0.3760625, + "grad_norm": 3.296875, + "grad_norm_var": 0.023762003580729166, + "learning_rate": 0.0001, + "loss": 6.137, + "loss/crossentropy": 2.6550445556640625, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19233429431915283, + "step": 12034 + }, + { + "epoch": 0.376125, + "grad_norm": 3.28125, + "grad_norm_var": 0.02525634765625, + "learning_rate": 0.0001, + "loss": 5.4149, + "loss/crossentropy": 2.3415744304656982, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15967915207147598, + "step": 12036 + }, + { + "epoch": 0.3761875, + "grad_norm": 3.34375, + "grad_norm_var": 0.03355712890625, + "learning_rate": 0.0001, + "loss": 5.77, + "loss/crossentropy": 2.580352306365967, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16701454669237137, + "step": 12038 + }, + { + "epoch": 0.37625, + "grad_norm": 3.140625, + "grad_norm_var": 0.03664957682291667, + "learning_rate": 0.0001, + "loss": 5.5584, + "loss/crossentropy": 2.40367329120636, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16391241550445557, + "step": 12040 + }, + { + "epoch": 0.3763125, + "grad_norm": 3.296875, + "grad_norm_var": 0.0327056884765625, + "learning_rate": 0.0001, + "loss": 6.1674, + "loss/crossentropy": 2.7630926370620728, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18300750851631165, + "step": 12042 + }, + { + "epoch": 0.376375, + "grad_norm": 3.328125, + "grad_norm_var": 0.0356353759765625, + "learning_rate": 0.0001, + "loss": 6.0534, + "loss/crossentropy": 2.6944353580474854, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18199177831411362, + "step": 12044 + }, + { + "epoch": 0.3764375, + "grad_norm": 3.046875, + "grad_norm_var": 0.0447174072265625, + "learning_rate": 0.0001, + "loss": 5.8027, + "loss/crossentropy": 2.645112633705139, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16810278594493866, + "step": 12046 + }, + { + "epoch": 0.3765, + "grad_norm": 3.078125, + "grad_norm_var": 0.05136311848958333, + "learning_rate": 0.0001, + "loss": 5.7176, + "loss/crossentropy": 2.5695072412490845, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16676388680934906, + "step": 12048 + }, + { + "epoch": 0.3765625, + "grad_norm": 3.015625, + "grad_norm_var": 0.05413004557291667, + "learning_rate": 0.0001, + "loss": 6.038, + "loss/crossentropy": 2.7702596187591553, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17638515681028366, + "step": 12050 + }, + { + "epoch": 0.376625, + "grad_norm": 3.140625, + "grad_norm_var": 0.05302632649739583, + "learning_rate": 0.0001, + "loss": 5.9717, + "loss/crossentropy": 2.59650194644928, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18243810534477234, + "step": 12052 + }, + { + "epoch": 0.3766875, + "grad_norm": 3.0625, + "grad_norm_var": 0.05816650390625, + "learning_rate": 0.0001, + "loss": 5.5906, + "loss/crossentropy": 2.4401824474334717, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16699735075235367, + "step": 12054 + }, + { + "epoch": 0.37675, + "grad_norm": 3.84375, + "grad_norm_var": 0.07298177083333333, + "learning_rate": 0.0001, + "loss": 5.9168, + "loss/crossentropy": 2.5050649642944336, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1868806630373001, + "step": 12056 + }, + { + "epoch": 0.3768125, + "grad_norm": 4.9375, + "grad_norm_var": 0.24763081868489584, + "learning_rate": 0.0001, + "loss": 5.6702, + "loss/crossentropy": 2.490068197250366, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1684013083577156, + "step": 12058 + }, + { + "epoch": 0.376875, + "grad_norm": 3.203125, + "grad_norm_var": 0.24306233723958334, + "learning_rate": 0.0001, + "loss": 6.1612, + "loss/crossentropy": 2.7433427572250366, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18982917815446854, + "step": 12060 + }, + { + "epoch": 0.3769375, + "grad_norm": 3.328125, + "grad_norm_var": 0.23471577962239584, + "learning_rate": 0.0001, + "loss": 5.4492, + "loss/crossentropy": 2.3737224340438843, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16184395551681519, + "step": 12062 + }, + { + "epoch": 0.377, + "grad_norm": 3.421875, + "grad_norm_var": 0.22488606770833333, + "learning_rate": 0.0001, + "loss": 5.816, + "loss/crossentropy": 2.5514299869537354, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17684289067983627, + "step": 12064 + }, + { + "epoch": 0.3770625, + "grad_norm": 3.4375, + "grad_norm_var": 0.20998433430989583, + "learning_rate": 0.0001, + "loss": 6.035, + "loss/crossentropy": 2.5798012018203735, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19005408138036728, + "step": 12066 + }, + { + "epoch": 0.377125, + "grad_norm": 3.3125, + "grad_norm_var": 0.20474853515625, + "learning_rate": 0.0001, + "loss": 5.929, + "loss/crossentropy": 2.568007707595825, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18258086591959, + "step": 12068 + }, + { + "epoch": 0.3771875, + "grad_norm": 3.078125, + "grad_norm_var": 0.2058746337890625, + "learning_rate": 0.0001, + "loss": 5.6861, + "loss/crossentropy": 2.453046202659607, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1748638078570366, + "step": 12070 + }, + { + "epoch": 0.37725, + "grad_norm": 3.328125, + "grad_norm_var": 0.19586588541666666, + "learning_rate": 0.0001, + "loss": 5.5062, + "loss/crossentropy": 2.371487021446228, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1623035967350006, + "step": 12072 + }, + { + "epoch": 0.3773125, + "grad_norm": 3.0, + "grad_norm_var": 0.028929646809895834, + "learning_rate": 0.0001, + "loss": 5.6704, + "loss/crossentropy": 2.464169502258301, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17023424059152603, + "step": 12074 + }, + { + "epoch": 0.377375, + "grad_norm": 3.25, + "grad_norm_var": 0.025536092122395833, + "learning_rate": 0.0001, + "loss": 5.7449, + "loss/crossentropy": 2.4666662216186523, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17704083770513535, + "step": 12076 + }, + { + "epoch": 0.3774375, + "grad_norm": 3.171875, + "grad_norm_var": 0.025223795572916666, + "learning_rate": 0.0001, + "loss": 5.6234, + "loss/crossentropy": 2.421693801879883, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17212750762701035, + "step": 12078 + }, + { + "epoch": 0.3775, + "grad_norm": 3.359375, + "grad_norm_var": 0.025544230143229166, + "learning_rate": 0.0001, + "loss": 5.6774, + "loss/crossentropy": 2.476585626602173, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16890597343444824, + "step": 12080 + }, + { + "epoch": 0.3775625, + "grad_norm": 3.375, + "grad_norm_var": 0.017406209309895834, + "learning_rate": 0.0001, + "loss": 5.5387, + "loss/crossentropy": 2.3387359380722046, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16921505331993103, + "step": 12082 + }, + { + "epoch": 0.377625, + "grad_norm": 3.21875, + "grad_norm_var": 0.015485636393229167, + "learning_rate": 0.0001, + "loss": 5.5181, + "loss/crossentropy": 2.3846631050109863, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16529268771409988, + "step": 12084 + }, + { + "epoch": 0.3776875, + "grad_norm": 3.421875, + "grad_norm_var": 0.016950480143229165, + "learning_rate": 0.0001, + "loss": 5.8004, + "loss/crossentropy": 2.52835750579834, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17485986649990082, + "step": 12086 + }, + { + "epoch": 0.37775, + "grad_norm": 3.140625, + "grad_norm_var": 0.018131510416666666, + "learning_rate": 0.0001, + "loss": 5.7243, + "loss/crossentropy": 2.447743535041809, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17452579736709595, + "step": 12088 + }, + { + "epoch": 0.3778125, + "grad_norm": 3.125, + "grad_norm_var": 0.014839680989583333, + "learning_rate": 0.0001, + "loss": 5.776, + "loss/crossentropy": 2.5936180353164673, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16941224038600922, + "step": 12090 + }, + { + "epoch": 0.377875, + "grad_norm": 3.421875, + "grad_norm_var": 0.01910400390625, + "learning_rate": 0.0001, + "loss": 5.9988, + "loss/crossentropy": 2.608113169670105, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1851622387766838, + "step": 12092 + }, + { + "epoch": 0.3779375, + "grad_norm": 3.3125, + "grad_norm_var": 0.021540323893229168, + "learning_rate": 0.0001, + "loss": 5.6652, + "loss/crossentropy": 2.426708698272705, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.16915695369243622, + "step": 12094 + }, + { + "epoch": 0.378, + "grad_norm": 3.0, + "grad_norm_var": 0.025316365559895835, + "learning_rate": 0.0001, + "loss": 5.6704, + "loss/crossentropy": 2.3982508182525635, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17408864200115204, + "step": 12096 + }, + { + "epoch": 0.3780625, + "grad_norm": 3.296875, + "grad_norm_var": 0.0919586181640625, + "learning_rate": 0.0001, + "loss": 6.2858, + "loss/crossentropy": 2.751635193824768, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19716878235340118, + "step": 12098 + }, + { + "epoch": 0.378125, + "grad_norm": 3.078125, + "grad_norm_var": 0.0958984375, + "learning_rate": 0.0001, + "loss": 5.8983, + "loss/crossentropy": 2.6188371181488037, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17834113538265228, + "step": 12100 + }, + { + "epoch": 0.3781875, + "grad_norm": 2.90625, + "grad_norm_var": 0.10329488118489584, + "learning_rate": 0.0001, + "loss": 5.251, + "loss/crossentropy": 2.2201396226882935, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.15269850939512253, + "step": 12102 + }, + { + "epoch": 0.37825, + "grad_norm": 3.3125, + "grad_norm_var": 0.10086263020833333, + "learning_rate": 0.0001, + "loss": 5.7502, + "loss/crossentropy": 2.4805989265441895, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1765652298927307, + "step": 12104 + }, + { + "epoch": 0.3783125, + "grad_norm": 3.28125, + "grad_norm_var": 0.12730204264322917, + "learning_rate": 0.0001, + "loss": 6.1588, + "loss/crossentropy": 2.686942934989929, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18976256251335144, + "step": 12106 + }, + { + "epoch": 0.378375, + "grad_norm": 3.359375, + "grad_norm_var": 0.12505594889322916, + "learning_rate": 0.0001, + "loss": 5.7162, + "loss/crossentropy": 2.430199146270752, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17469412833452225, + "step": 12108 + }, + { + "epoch": 0.3784375, + "grad_norm": 3.3125, + "grad_norm_var": 0.12639872233072916, + "learning_rate": 0.0001, + "loss": 5.9854, + "loss/crossentropy": 2.623427152633667, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18189743161201477, + "step": 12110 + }, + { + "epoch": 0.3785, + "grad_norm": 3.203125, + "grad_norm_var": 0.11840718587239583, + "learning_rate": 0.0001, + "loss": 5.8371, + "loss/crossentropy": 2.538592576980591, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17945991456508636, + "step": 12112 + }, + { + "epoch": 0.3785625, + "grad_norm": 3.15625, + "grad_norm_var": 0.06402079264322917, + "learning_rate": 0.0001, + "loss": 6.0307, + "loss/crossentropy": 2.703576922416687, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18192638456821442, + "step": 12114 + }, + { + "epoch": 0.378625, + "grad_norm": 3.359375, + "grad_norm_var": 0.06735026041666667, + "learning_rate": 0.0001, + "loss": 5.8, + "loss/crossentropy": 2.606787919998169, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.17439626157283783, + "step": 12116 + }, + { + "epoch": 0.3786875, + "grad_norm": 3.328125, + "grad_norm_var": 0.059015909830729164, + "learning_rate": 0.0001, + "loss": 6.1803, + "loss/crossentropy": 2.748610496520996, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1880939081311226, + "step": 12118 + }, + { + "epoch": 0.37875, + "grad_norm": 3.453125, + "grad_norm_var": 0.06272684733072917, + "learning_rate": 0.0001, + "loss": 6.186, + "loss/crossentropy": 2.7390146255493164, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18962519615888596, + "step": 12120 + }, + { + "epoch": 0.3788125, + "grad_norm": 2.875, + "grad_norm_var": 0.0482330322265625, + "learning_rate": 0.0001, + "loss": 5.7026, + "loss/crossentropy": 2.450590491294861, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1736360266804695, + "step": 12122 + }, + { + "epoch": 0.378875, + "grad_norm": 3.265625, + "grad_norm_var": 0.0556060791015625, + "learning_rate": 0.0001, + "loss": 5.9319, + "loss/crossentropy": 2.6200908422470093, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17961391806602478, + "step": 12124 + }, + { + "epoch": 0.3789375, + "grad_norm": 3.140625, + "grad_norm_var": 0.05188395182291667, + "learning_rate": 0.0001, + "loss": 5.8957, + "loss/crossentropy": 2.6624832153320312, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1721498966217041, + "step": 12126 + }, + { + "epoch": 0.379, + "grad_norm": 3.640625, + "grad_norm_var": 0.06915690104166666, + "learning_rate": 0.0001, + "loss": 5.8722, + "loss/crossentropy": 2.6058125495910645, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17468710243701935, + "step": 12128 + }, + { + "epoch": 0.3790625, + "grad_norm": 3.21875, + "grad_norm_var": 0.06897786458333334, + "learning_rate": 0.0001, + "loss": 5.9679, + "loss/crossentropy": 2.654877543449402, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17896226793527603, + "step": 12130 + }, + { + "epoch": 0.379125, + "grad_norm": 3.4375, + "grad_norm_var": 0.06750386555989583, + "learning_rate": 0.0001, + "loss": 5.7398, + "loss/crossentropy": 2.4703577756881714, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1749955117702484, + "step": 12132 + }, + { + "epoch": 0.3791875, + "grad_norm": 3.125, + "grad_norm_var": 0.05586649576822917, + "learning_rate": 0.0001, + "loss": 5.7071, + "loss/crossentropy": 2.4364266395568848, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17589226365089417, + "step": 12134 + }, + { + "epoch": 0.37925, + "grad_norm": 3.0625, + "grad_norm_var": 0.05611572265625, + "learning_rate": 0.0001, + "loss": 5.7527, + "loss/crossentropy": 2.5519464015960693, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17007731646299362, + "step": 12136 + }, + { + "epoch": 0.3793125, + "grad_norm": 3.53125, + "grad_norm_var": 0.06360677083333334, + "learning_rate": 0.0001, + "loss": 5.9415, + "loss/crossentropy": 2.526289939880371, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18800681084394455, + "step": 12138 + }, + { + "epoch": 0.379375, + "grad_norm": 3.265625, + "grad_norm_var": 0.0621246337890625, + "learning_rate": 0.0001, + "loss": 5.956, + "loss/crossentropy": 2.6125783920288086, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18316560983657837, + "step": 12140 + }, + { + "epoch": 0.3794375, + "grad_norm": 3.5, + "grad_norm_var": 0.056722005208333336, + "learning_rate": 0.0001, + "loss": 5.7223, + "loss/crossentropy": 2.5177252292633057, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17241264134645462, + "step": 12142 + }, + { + "epoch": 0.3795, + "grad_norm": 4.5, + "grad_norm_var": 0.1311187744140625, + "learning_rate": 0.0001, + "loss": 6.0227, + "loss/crossentropy": 2.5970641374588013, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1909969076514244, + "step": 12144 + }, + { + "epoch": 0.3795625, + "grad_norm": 3.625, + "grad_norm_var": 0.13193257649739584, + "learning_rate": 0.0001, + "loss": 5.93, + "loss/crossentropy": 2.5664255619049072, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18322807550430298, + "step": 12146 + }, + { + "epoch": 0.379625, + "grad_norm": 3.375, + "grad_norm_var": 0.13154195149739584, + "learning_rate": 0.0001, + "loss": 6.1476, + "loss/crossentropy": 2.797860860824585, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17989974468946457, + "step": 12148 + }, + { + "epoch": 0.3796875, + "grad_norm": 3.171875, + "grad_norm_var": 0.13027242024739583, + "learning_rate": 0.0001, + "loss": 5.6635, + "loss/crossentropy": 2.4914366006851196, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1683766320347786, + "step": 12150 + }, + { + "epoch": 0.37975, + "grad_norm": 3.21875, + "grad_norm_var": 0.10907796223958334, + "learning_rate": 0.0001, + "loss": 5.8303, + "loss/crossentropy": 2.5784735679626465, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1728389412164688, + "step": 12152 + }, + { + "epoch": 0.3798125, + "grad_norm": 3.765625, + "grad_norm_var": 0.11542561848958334, + "learning_rate": 0.0001, + "loss": 6.2311, + "loss/crossentropy": 2.6795923709869385, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.1969432532787323, + "step": 12154 + }, + { + "epoch": 0.379875, + "grad_norm": 3.34375, + "grad_norm_var": 0.11448160807291667, + "learning_rate": 0.0001, + "loss": 5.8681, + "loss/crossentropy": 2.557464122772217, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1759844273328781, + "step": 12156 + }, + { + "epoch": 0.3799375, + "grad_norm": 3.4375, + "grad_norm_var": 0.11199442545572917, + "learning_rate": 0.0001, + "loss": 5.7149, + "loss/crossentropy": 2.522140145301819, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.169661745429039, + "step": 12158 + }, + { + "epoch": 0.38, + "grad_norm": 4.1875, + "grad_norm_var": 0.0751861572265625, + "learning_rate": 0.0001, + "loss": 5.8308, + "loss/crossentropy": 2.591986656188965, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1762208491563797, + "step": 12160 + }, + { + "epoch": 0.3800625, + "grad_norm": 3.078125, + "grad_norm_var": 0.07493489583333333, + "learning_rate": 0.0001, + "loss": 5.7254, + "loss/crossentropy": 2.4571229219436646, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17604468762874603, + "step": 12162 + }, + { + "epoch": 0.380125, + "grad_norm": 3.171875, + "grad_norm_var": 0.08089090983072916, + "learning_rate": 0.0001, + "loss": 5.9118, + "loss/crossentropy": 2.6147637367248535, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17970652133226395, + "step": 12164 + }, + { + "epoch": 0.3801875, + "grad_norm": 3.125, + "grad_norm_var": 0.08205973307291667, + "learning_rate": 0.0001, + "loss": 5.7955, + "loss/crossentropy": 2.5248990058898926, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17549820244312286, + "step": 12166 + }, + { + "epoch": 0.38025, + "grad_norm": 3.078125, + "grad_norm_var": 0.08730061848958333, + "learning_rate": 0.0001, + "loss": 5.4255, + "loss/crossentropy": 2.3403602838516235, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1608593687415123, + "step": 12168 + }, + { + "epoch": 0.3803125, + "grad_norm": 3.234375, + "grad_norm_var": 0.07319234212239584, + "learning_rate": 0.0001, + "loss": 5.9311, + "loss/crossentropy": 2.554044485092163, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18340571224689484, + "step": 12170 + }, + { + "epoch": 0.380375, + "grad_norm": 3.703125, + "grad_norm_var": 0.08367411295572917, + "learning_rate": 0.0001, + "loss": 5.8228, + "loss/crossentropy": 2.5055052042007446, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.18290476500988007, + "step": 12172 + }, + { + "epoch": 0.3804375, + "grad_norm": 3.40625, + "grad_norm_var": 0.08108622233072917, + "learning_rate": 0.0001, + "loss": 6.1499, + "loss/crossentropy": 2.7790268659591675, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1839602366089821, + "step": 12174 + }, + { + "epoch": 0.3805, + "grad_norm": 3.578125, + "grad_norm_var": 0.03209228515625, + "learning_rate": 0.0001, + "loss": 6.0808, + "loss/crossentropy": 2.6841509342193604, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18732508271932602, + "step": 12176 + }, + { + "epoch": 0.3805625, + "grad_norm": 3.421875, + "grad_norm_var": 0.0468170166015625, + "learning_rate": 0.0001, + "loss": 5.9809, + "loss/crossentropy": 2.6901607513427734, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17555983364582062, + "step": 12178 + }, + { + "epoch": 0.380625, + "grad_norm": 3.15625, + "grad_norm_var": 0.05035400390625, + "learning_rate": 0.0001, + "loss": 5.7499, + "loss/crossentropy": 2.541025757789612, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1728431135416031, + "step": 12180 + }, + { + "epoch": 0.3806875, + "grad_norm": 2.953125, + "grad_norm_var": 0.05526936848958333, + "learning_rate": 0.0001, + "loss": 5.6726, + "loss/crossentropy": 2.44266676902771, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.16752055287361145, + "step": 12182 + }, + { + "epoch": 0.38075, + "grad_norm": 3.75, + "grad_norm_var": 0.07165425618489583, + "learning_rate": 0.0001, + "loss": 5.8804, + "loss/crossentropy": 2.521677017211914, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.17649737745523453, + "step": 12184 + }, + { + "epoch": 0.3808125, + "grad_norm": 3.4375, + "grad_norm_var": 0.0736328125, + "learning_rate": 0.0001, + "loss": 5.8637, + "loss/crossentropy": 2.5367971658706665, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18073730170726776, + "step": 12186 + }, + { + "epoch": 0.380875, + "grad_norm": 3.390625, + "grad_norm_var": 0.06292215983072917, + "learning_rate": 0.0001, + "loss": 5.9977, + "loss/crossentropy": 2.5861297845840454, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.186076782643795, + "step": 12188 + }, + { + "epoch": 0.3809375, + "grad_norm": 3.28125, + "grad_norm_var": 0.06717122395833333, + "learning_rate": 0.0001, + "loss": 5.9649, + "loss/crossentropy": 2.567418694496155, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1842784881591797, + "step": 12190 + }, + { + "epoch": 0.381, + "grad_norm": 3.125, + "grad_norm_var": 0.061375935872395836, + "learning_rate": 0.0001, + "loss": 5.7625, + "loss/crossentropy": 2.532058596611023, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.16874880343675613, + "step": 12192 + }, + { + "epoch": 0.3810625, + "grad_norm": 3.21875, + "grad_norm_var": 0.04898681640625, + "learning_rate": 0.0001, + "loss": 5.867, + "loss/crossentropy": 2.5616434812545776, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17741339653730392, + "step": 12194 + }, + { + "epoch": 0.381125, + "grad_norm": 3.0625, + "grad_norm_var": 0.046263631184895834, + "learning_rate": 0.0001, + "loss": 5.9946, + "loss/crossentropy": 2.7013275623321533, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17893613129854202, + "step": 12196 + }, + { + "epoch": 0.3811875, + "grad_norm": 3.40625, + "grad_norm_var": 0.039362589518229164, + "learning_rate": 0.0001, + "loss": 6.0828, + "loss/crossentropy": 2.731987476348877, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1835208311676979, + "step": 12198 + }, + { + "epoch": 0.38125, + "grad_norm": 3.3125, + "grad_norm_var": 0.02398681640625, + "learning_rate": 0.0001, + "loss": 5.8494, + "loss/crossentropy": 2.5146725177764893, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1838630512356758, + "step": 12200 + }, + { + "epoch": 0.3813125, + "grad_norm": 3.09375, + "grad_norm_var": 0.027701822916666667, + "learning_rate": 0.0001, + "loss": 5.7736, + "loss/crossentropy": 2.570136785507202, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17073464393615723, + "step": 12202 + }, + { + "epoch": 0.381375, + "grad_norm": 3.609375, + "grad_norm_var": 0.032103474934895834, + "learning_rate": 0.0001, + "loss": 5.9705, + "loss/crossentropy": 2.6945348978042603, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17642514407634735, + "step": 12204 + }, + { + "epoch": 0.3814375, + "grad_norm": 3.015625, + "grad_norm_var": 0.0372467041015625, + "learning_rate": 0.0001, + "loss": 5.908, + "loss/crossentropy": 2.632769465446472, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17478998005390167, + "step": 12206 + }, + { + "epoch": 0.3815, + "grad_norm": 2.890625, + "grad_norm_var": 0.04537353515625, + "learning_rate": 0.0001, + "loss": 5.46, + "loss/crossentropy": 2.406898617744446, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1568731889128685, + "step": 12208 + }, + { + "epoch": 0.3815625, + "grad_norm": 3.421875, + "grad_norm_var": 0.0423248291015625, + "learning_rate": 0.0001, + "loss": 6.0608, + "loss/crossentropy": 2.70103120803833, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18206676840782166, + "step": 12210 + }, + { + "epoch": 0.381625, + "grad_norm": 3.03125, + "grad_norm_var": 0.043257649739583334, + "learning_rate": 0.0001, + "loss": 5.6974, + "loss/crossentropy": 2.4659173488616943, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17080692946910858, + "step": 12212 + }, + { + "epoch": 0.3816875, + "grad_norm": 3.203125, + "grad_norm_var": 0.0397125244140625, + "learning_rate": 0.0001, + "loss": 5.9125, + "loss/crossentropy": 2.652961015701294, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17556791007518768, + "step": 12214 + }, + { + "epoch": 0.38175, + "grad_norm": 8.9375, + "grad_norm_var": 2.0692535400390626, + "learning_rate": 0.0001, + "loss": 5.9109, + "loss/crossentropy": 2.546905755996704, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1785893440246582, + "step": 12216 + }, + { + "epoch": 0.3818125, + "grad_norm": 3.015625, + "grad_norm_var": 2.058552042643229, + "learning_rate": 0.0001, + "loss": 5.6968, + "loss/crossentropy": 2.577237844467163, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16312456130981445, + "step": 12218 + }, + { + "epoch": 0.381875, + "grad_norm": 3.203125, + "grad_norm_var": 2.06337890625, + "learning_rate": 0.0001, + "loss": 5.8515, + "loss/crossentropy": 2.5166709423065186, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17996849119663239, + "step": 12220 + }, + { + "epoch": 0.3819375, + "grad_norm": 3.234375, + "grad_norm_var": 2.081761678059896, + "learning_rate": 0.0001, + "loss": 5.7943, + "loss/crossentropy": 2.5752391815185547, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17229846864938736, + "step": 12222 + }, + { + "epoch": 0.382, + "grad_norm": 3.515625, + "grad_norm_var": 2.059928385416667, + "learning_rate": 0.0001, + "loss": 5.6823, + "loss/crossentropy": 2.489313840866089, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16851232945919037, + "step": 12224 + }, + { + "epoch": 0.3820625, + "grad_norm": 3.484375, + "grad_norm_var": 2.0624664306640623, + "learning_rate": 0.0001, + "loss": 5.9289, + "loss/crossentropy": 2.5973278284072876, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18120825290679932, + "step": 12226 + }, + { + "epoch": 0.382125, + "grad_norm": 3.234375, + "grad_norm_var": 2.0595987955729167, + "learning_rate": 0.0001, + "loss": 5.9501, + "loss/crossentropy": 2.6300965547561646, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17809592932462692, + "step": 12228 + }, + { + "epoch": 0.3821875, + "grad_norm": 3.5625, + "grad_norm_var": 2.0440388997395833, + "learning_rate": 0.0001, + "loss": 5.8267, + "loss/crossentropy": 2.6522780656814575, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16744433343410492, + "step": 12230 + }, + { + "epoch": 0.38225, + "grad_norm": 3.421875, + "grad_norm_var": 0.07775065104166666, + "learning_rate": 0.0001, + "loss": 5.998, + "loss/crossentropy": 2.5721473693847656, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18477017432451248, + "step": 12232 + }, + { + "epoch": 0.3823125, + "grad_norm": 3.078125, + "grad_norm_var": 0.078662109375, + "learning_rate": 0.0001, + "loss": 5.6932, + "loss/crossentropy": 2.521741032600403, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16909783333539963, + "step": 12234 + }, + { + "epoch": 0.382375, + "grad_norm": 3.140625, + "grad_norm_var": 0.0667388916015625, + "learning_rate": 0.0001, + "loss": 6.0465, + "loss/crossentropy": 2.657966375350952, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18377594649791718, + "step": 12236 + }, + { + "epoch": 0.3824375, + "grad_norm": 3.890625, + "grad_norm_var": 0.0851959228515625, + "learning_rate": 0.0001, + "loss": 5.3831, + "loss/crossentropy": 2.246523380279541, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1605309322476387, + "step": 12238 + }, + { + "epoch": 0.3825, + "grad_norm": 3.5625, + "grad_norm_var": 0.0819000244140625, + "learning_rate": 0.0001, + "loss": 5.9532, + "loss/crossentropy": 2.6257444620132446, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.18274663388729095, + "step": 12240 + }, + { + "epoch": 0.3825625, + "grad_norm": 3.390625, + "grad_norm_var": 0.079443359375, + "learning_rate": 0.0001, + "loss": 5.8184, + "loss/crossentropy": 2.523483991622925, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17519061267375946, + "step": 12242 + }, + { + "epoch": 0.382625, + "grad_norm": 3.40625, + "grad_norm_var": 0.07519124348958334, + "learning_rate": 0.0001, + "loss": 5.916, + "loss/crossentropy": 2.5331852436065674, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1808634251356125, + "step": 12244 + }, + { + "epoch": 0.3826875, + "grad_norm": 3.046875, + "grad_norm_var": 0.08105061848958334, + "learning_rate": 0.0001, + "loss": 5.8447, + "loss/crossentropy": 2.6363556385040283, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16927223652601242, + "step": 12246 + }, + { + "epoch": 0.38275, + "grad_norm": 3.328125, + "grad_norm_var": 0.05364583333333333, + "learning_rate": 0.0001, + "loss": 5.4834, + "loss/crossentropy": 2.31623113155365, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16554533690214157, + "step": 12248 + }, + { + "epoch": 0.3828125, + "grad_norm": 3.390625, + "grad_norm_var": 0.050959269205729164, + "learning_rate": 0.0001, + "loss": 5.7755, + "loss/crossentropy": 2.4709372520446777, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17888887971639633, + "step": 12250 + }, + { + "epoch": 0.382875, + "grad_norm": 3.625, + "grad_norm_var": 0.0599761962890625, + "learning_rate": 0.0001, + "loss": 5.9009, + "loss/crossentropy": 2.617013931274414, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17682691663503647, + "step": 12252 + }, + { + "epoch": 0.3829375, + "grad_norm": 3.6875, + "grad_norm_var": 0.040913899739583336, + "learning_rate": 0.0001, + "loss": 6.0086, + "loss/crossentropy": 2.671997904777527, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1836603283882141, + "step": 12254 + }, + { + "epoch": 0.383, + "grad_norm": 3.390625, + "grad_norm_var": 0.038914998372395836, + "learning_rate": 0.0001, + "loss": 5.8406, + "loss/crossentropy": 2.6281083822250366, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1720341593027115, + "step": 12256 + }, + { + "epoch": 0.3830625, + "grad_norm": 3.421875, + "grad_norm_var": 0.04355061848958333, + "learning_rate": 0.0001, + "loss": 5.7652, + "loss/crossentropy": 2.478159785270691, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1751842424273491, + "step": 12258 + }, + { + "epoch": 0.383125, + "grad_norm": 3.328125, + "grad_norm_var": 0.04267578125, + "learning_rate": 0.0001, + "loss": 5.8508, + "loss/crossentropy": 2.61394727230072, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17290370166301727, + "step": 12260 + }, + { + "epoch": 0.3831875, + "grad_norm": 3.1875, + "grad_norm_var": 0.0467681884765625, + "learning_rate": 0.0001, + "loss": 6.0011, + "loss/crossentropy": 2.6651010513305664, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17969100177288055, + "step": 12262 + }, + { + "epoch": 0.38325, + "grad_norm": 3.015625, + "grad_norm_var": 0.05367431640625, + "learning_rate": 0.0001, + "loss": 5.8139, + "loss/crossentropy": 2.531054735183716, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17555169016122818, + "step": 12264 + }, + { + "epoch": 0.3833125, + "grad_norm": 2.96875, + "grad_norm_var": 0.06116434733072917, + "learning_rate": 0.0001, + "loss": 5.7743, + "loss/crossentropy": 2.5854746103286743, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17044856399297714, + "step": 12266 + }, + { + "epoch": 0.383375, + "grad_norm": 3.359375, + "grad_norm_var": 0.05109049479166667, + "learning_rate": 0.0001, + "loss": 5.6732, + "loss/crossentropy": 2.37837016582489, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1732364371418953, + "step": 12268 + }, + { + "epoch": 0.3834375, + "grad_norm": 3.140625, + "grad_norm_var": 0.04208984375, + "learning_rate": 0.0001, + "loss": 5.9049, + "loss/crossentropy": 2.6091073751449585, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17567677795886993, + "step": 12270 + }, + { + "epoch": 0.3835, + "grad_norm": 2.90625, + "grad_norm_var": 0.04729817708333333, + "learning_rate": 0.0001, + "loss": 5.5178, + "loss/crossentropy": 2.4287716150283813, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16397615522146225, + "step": 12272 + }, + { + "epoch": 0.3835625, + "grad_norm": 3.140625, + "grad_norm_var": 0.042464192708333334, + "learning_rate": 0.0001, + "loss": 5.8446, + "loss/crossentropy": 2.570552706718445, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1766263246536255, + "step": 12274 + }, + { + "epoch": 0.383625, + "grad_norm": 3.34375, + "grad_norm_var": 0.041845703125, + "learning_rate": 0.0001, + "loss": 6.1636, + "loss/crossentropy": 2.717664122581482, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.19225004315376282, + "step": 12276 + }, + { + "epoch": 0.3836875, + "grad_norm": 3.640625, + "grad_norm_var": 0.04117431640625, + "learning_rate": 0.0001, + "loss": 5.6497, + "loss/crossentropy": 2.3928192853927612, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17412814497947693, + "step": 12278 + }, + { + "epoch": 0.38375, + "grad_norm": 3.609375, + "grad_norm_var": 0.04153238932291667, + "learning_rate": 0.0001, + "loss": 5.757, + "loss/crossentropy": 2.556597352027893, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1719910204410553, + "step": 12280 + }, + { + "epoch": 0.3838125, + "grad_norm": 3.421875, + "grad_norm_var": 0.035791015625, + "learning_rate": 0.0001, + "loss": 6.1173, + "loss/crossentropy": 2.8058717250823975, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17684178799390793, + "step": 12282 + }, + { + "epoch": 0.383875, + "grad_norm": 3.453125, + "grad_norm_var": 0.037886555989583334, + "learning_rate": 0.0001, + "loss": 5.7644, + "loss/crossentropy": 2.4213374853134155, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1819588765501976, + "step": 12284 + }, + { + "epoch": 0.3839375, + "grad_norm": 3.484375, + "grad_norm_var": 0.03873291015625, + "learning_rate": 0.0001, + "loss": 5.9828, + "loss/crossentropy": 2.639641284942627, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18197622895240784, + "step": 12286 + }, + { + "epoch": 0.384, + "grad_norm": 4.65625, + "grad_norm_var": 0.13775634765625, + "learning_rate": 0.0001, + "loss": 5.9036, + "loss/crossentropy": 2.542542815208435, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18063607811927795, + "step": 12288 + }, + { + "epoch": 0.3840625, + "grad_norm": 3.3125, + "grad_norm_var": 0.13239644368489584, + "learning_rate": 0.0001, + "loss": 5.9213, + "loss/crossentropy": 2.6110751628875732, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17906443774700165, + "step": 12290 + }, + { + "epoch": 0.384125, + "grad_norm": 3.15625, + "grad_norm_var": 0.14096577962239584, + "learning_rate": 0.0001, + "loss": 5.5237, + "loss/crossentropy": 2.42322838306427, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16083312034606934, + "step": 12292 + }, + { + "epoch": 0.3841875, + "grad_norm": 3.65625, + "grad_norm_var": 0.1385406494140625, + "learning_rate": 0.0001, + "loss": 5.839, + "loss/crossentropy": 2.524837851524353, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17750893533229828, + "step": 12294 + }, + { + "epoch": 0.38425, + "grad_norm": 3.359375, + "grad_norm_var": 0.1338531494140625, + "learning_rate": 0.0001, + "loss": 6.0998, + "loss/crossentropy": 2.6787259578704834, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1889820396900177, + "step": 12296 + }, + { + "epoch": 0.3843125, + "grad_norm": 3.140625, + "grad_norm_var": 0.1303375244140625, + "learning_rate": 0.0001, + "loss": 5.4848, + "loss/crossentropy": 2.3015941381454468, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16558706760406494, + "step": 12298 + }, + { + "epoch": 0.384375, + "grad_norm": 3.453125, + "grad_norm_var": 0.13454488118489583, + "learning_rate": 0.0001, + "loss": 5.5566, + "loss/crossentropy": 2.385546326637268, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16671882569789886, + "step": 12300 + }, + { + "epoch": 0.3844375, + "grad_norm": 3.109375, + "grad_norm_var": 0.13991597493489583, + "learning_rate": 0.0001, + "loss": 6.0241, + "loss/crossentropy": 2.754414200782776, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17579607665538788, + "step": 12302 + }, + { + "epoch": 0.3845, + "grad_norm": 3.3125, + "grad_norm_var": 0.03919270833333333, + "learning_rate": 0.0001, + "loss": 5.9473, + "loss/crossentropy": 2.5974284410476685, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1799042969942093, + "step": 12304 + }, + { + "epoch": 0.3845625, + "grad_norm": 3.53125, + "grad_norm_var": 0.041991170247395834, + "learning_rate": 0.0001, + "loss": 5.6848, + "loss/crossentropy": 2.4203197956085205, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17567094415426254, + "step": 12306 + }, + { + "epoch": 0.384625, + "grad_norm": 3.421875, + "grad_norm_var": 0.03603413899739583, + "learning_rate": 0.0001, + "loss": 5.9538, + "loss/crossentropy": 2.602110505104065, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.17930720001459122, + "step": 12308 + }, + { + "epoch": 0.3846875, + "grad_norm": 4.9375, + "grad_norm_var": 0.17834879557291666, + "learning_rate": 0.0001, + "loss": 5.6919, + "loss/crossentropy": 2.3790918588638306, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1777629777789116, + "step": 12310 + }, + { + "epoch": 0.38475, + "grad_norm": 3.921875, + "grad_norm_var": 0.19153238932291666, + "learning_rate": 0.0001, + "loss": 5.7206, + "loss/crossentropy": 2.4854986667633057, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.173123300075531, + "step": 12312 + }, + { + "epoch": 0.3848125, + "grad_norm": 3.75, + "grad_norm_var": 0.28059488932291665, + "learning_rate": 0.0001, + "loss": 6.2935, + "loss/crossentropy": 2.72293484210968, + "loss/hidden": 1.63671875, + "loss/jsd": 0.0, + "loss/logits": 0.1933843046426773, + "step": 12314 + }, + { + "epoch": 0.384875, + "grad_norm": 3.796875, + "grad_norm_var": 0.29862874348958335, + "learning_rate": 0.0001, + "loss": 5.9117, + "loss/crossentropy": 2.5335875749588013, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18077735602855682, + "step": 12316 + }, + { + "epoch": 0.3849375, + "grad_norm": 3.34375, + "grad_norm_var": 0.28352762858072916, + "learning_rate": 0.0001, + "loss": 6.0196, + "loss/crossentropy": 2.642233729362488, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18461138755083084, + "step": 12318 + }, + { + "epoch": 0.385, + "grad_norm": 3.203125, + "grad_norm_var": 0.2954915364583333, + "learning_rate": 0.0001, + "loss": 5.907, + "loss/crossentropy": 2.630492329597473, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17648132890462875, + "step": 12320 + }, + { + "epoch": 0.3850625, + "grad_norm": 3.453125, + "grad_norm_var": 0.28582356770833334, + "learning_rate": 0.0001, + "loss": 5.9095, + "loss/crossentropy": 2.679234027862549, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1726335808634758, + "step": 12322 + }, + { + "epoch": 0.385125, + "grad_norm": 2.984375, + "grad_norm_var": 0.343896484375, + "learning_rate": 0.0001, + "loss": 5.3557, + "loss/crossentropy": 2.304378032684326, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1559125781059265, + "step": 12324 + }, + { + "epoch": 0.3851875, + "grad_norm": 3.1875, + "grad_norm_var": 0.21656901041666668, + "learning_rate": 0.0001, + "loss": 5.8262, + "loss/crossentropy": 2.606326460838318, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17003396153450012, + "step": 12326 + }, + { + "epoch": 0.38525, + "grad_norm": 3.328125, + "grad_norm_var": 0.20341796875, + "learning_rate": 0.0001, + "loss": 5.5562, + "loss/crossentropy": 2.372411847114563, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16916058957576752, + "step": 12328 + }, + { + "epoch": 0.3853125, + "grad_norm": 3.359375, + "grad_norm_var": 0.05090738932291667, + "learning_rate": 0.0001, + "loss": 5.7291, + "loss/crossentropy": 2.5212844610214233, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17000123113393784, + "step": 12330 + }, + { + "epoch": 0.385375, + "grad_norm": 3.46875, + "grad_norm_var": 0.034468587239583334, + "learning_rate": 0.0001, + "loss": 5.8163, + "loss/crossentropy": 2.578205704689026, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17654545605182648, + "step": 12332 + }, + { + "epoch": 0.3854375, + "grad_norm": 3.5625, + "grad_norm_var": 0.0312896728515625, + "learning_rate": 0.0001, + "loss": 5.9139, + "loss/crossentropy": 2.620287299156189, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17623500525951385, + "step": 12334 + }, + { + "epoch": 0.3855, + "grad_norm": 3.703125, + "grad_norm_var": 0.04426167805989583, + "learning_rate": 0.0001, + "loss": 5.6922, + "loss/crossentropy": 2.4255547523498535, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17588192969560623, + "step": 12336 + }, + { + "epoch": 0.3855625, + "grad_norm": 3.453125, + "grad_norm_var": 0.04471028645833333, + "learning_rate": 0.0001, + "loss": 5.6718, + "loss/crossentropy": 2.5147881507873535, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1641385480761528, + "step": 12338 + }, + { + "epoch": 0.385625, + "grad_norm": 3.71875, + "grad_norm_var": 0.040120442708333336, + "learning_rate": 0.0001, + "loss": 5.4351, + "loss/crossentropy": 2.3117510080337524, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16584917902946472, + "step": 12340 + }, + { + "epoch": 0.3856875, + "grad_norm": 3.6875, + "grad_norm_var": 0.05225321451822917, + "learning_rate": 0.0001, + "loss": 5.3687, + "loss/crossentropy": 2.2538561820983887, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1583578959107399, + "step": 12342 + }, + { + "epoch": 0.38575, + "grad_norm": 3.546875, + "grad_norm_var": 0.053206380208333334, + "learning_rate": 0.0001, + "loss": 5.7736, + "loss/crossentropy": 2.422629475593567, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1827506199479103, + "step": 12344 + }, + { + "epoch": 0.3858125, + "grad_norm": 3.296875, + "grad_norm_var": 0.06763407389322916, + "learning_rate": 0.0001, + "loss": 5.7576, + "loss/crossentropy": 2.3514147996902466, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18398155272006989, + "step": 12346 + }, + { + "epoch": 0.385875, + "grad_norm": 3.609375, + "grad_norm_var": 0.07009175618489584, + "learning_rate": 0.0001, + "loss": 5.8983, + "loss/crossentropy": 2.591655969619751, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17832036316394806, + "step": 12348 + }, + { + "epoch": 0.3859375, + "grad_norm": 3.109375, + "grad_norm_var": 0.08994852701822917, + "learning_rate": 0.0001, + "loss": 5.9475, + "loss/crossentropy": 2.7168256044387817, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1722814068198204, + "step": 12350 + }, + { + "epoch": 0.386, + "grad_norm": 3.078125, + "grad_norm_var": 0.0876861572265625, + "learning_rate": 0.0001, + "loss": 5.7991, + "loss/crossentropy": 2.5579906702041626, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17254658043384552, + "step": 12352 + }, + { + "epoch": 0.3860625, + "grad_norm": 4.78125, + "grad_norm_var": 0.2190338134765625, + "learning_rate": 0.0001, + "loss": 6.342, + "loss/crossentropy": 2.74264657497406, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.2029033899307251, + "step": 12354 + }, + { + "epoch": 0.386125, + "grad_norm": 3.15625, + "grad_norm_var": 0.2100982666015625, + "learning_rate": 0.0001, + "loss": 5.7182, + "loss/crossentropy": 2.5077556371688843, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17377915978431702, + "step": 12356 + }, + { + "epoch": 0.3861875, + "grad_norm": 3.15625, + "grad_norm_var": 0.19944254557291666, + "learning_rate": 0.0001, + "loss": 6.0767, + "loss/crossentropy": 2.7694251537323, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1795564666390419, + "step": 12358 + }, + { + "epoch": 0.38625, + "grad_norm": 3.515625, + "grad_norm_var": 0.19931640625, + "learning_rate": 0.0001, + "loss": 5.6196, + "loss/crossentropy": 2.38739013671875, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17126502841711044, + "step": 12360 + }, + { + "epoch": 0.3863125, + "grad_norm": 3.046875, + "grad_norm_var": 0.19107666015625, + "learning_rate": 0.0001, + "loss": 5.7025, + "loss/crossentropy": 2.5682406425476074, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16732843965291977, + "step": 12362 + }, + { + "epoch": 0.386375, + "grad_norm": 3.203125, + "grad_norm_var": 0.1846099853515625, + "learning_rate": 0.0001, + "loss": 5.7853, + "loss/crossentropy": 2.5417308807373047, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17239879816770554, + "step": 12364 + }, + { + "epoch": 0.3864375, + "grad_norm": 3.109375, + "grad_norm_var": 0.17350972493489583, + "learning_rate": 0.0001, + "loss": 5.945, + "loss/crossentropy": 2.6591559648513794, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17819470912218094, + "step": 12366 + }, + { + "epoch": 0.3865, + "grad_norm": 3.125, + "grad_norm_var": 0.17265523274739583, + "learning_rate": 0.0001, + "loss": 5.8818, + "loss/crossentropy": 2.5998687744140625, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1742846518754959, + "step": 12368 + }, + { + "epoch": 0.3865625, + "grad_norm": 3.375, + "grad_norm_var": 0.022508748372395835, + "learning_rate": 0.0001, + "loss": 6.0897, + "loss/crossentropy": 2.7611477375030518, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18207243829965591, + "step": 12370 + }, + { + "epoch": 0.386625, + "grad_norm": 3.40625, + "grad_norm_var": 0.0267974853515625, + "learning_rate": 0.0001, + "loss": 5.6273, + "loss/crossentropy": 2.4154202938079834, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17314515262842178, + "step": 12372 + }, + { + "epoch": 0.3866875, + "grad_norm": 3.59375, + "grad_norm_var": 0.033568318684895834, + "learning_rate": 0.0001, + "loss": 6.3509, + "loss/crossentropy": 2.8199591636657715, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1952815130352974, + "step": 12374 + }, + { + "epoch": 0.38675, + "grad_norm": 3.53125, + "grad_norm_var": 0.03404032389322917, + "learning_rate": 0.0001, + "loss": 6.0306, + "loss/crossentropy": 2.5840632915496826, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18879874050617218, + "step": 12376 + }, + { + "epoch": 0.3868125, + "grad_norm": 2.984375, + "grad_norm_var": 0.030497233072916668, + "learning_rate": 0.0001, + "loss": 5.7335, + "loss/crossentropy": 2.499807357788086, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17219502478837967, + "step": 12378 + }, + { + "epoch": 0.386875, + "grad_norm": 2.984375, + "grad_norm_var": 0.040608723958333336, + "learning_rate": 0.0001, + "loss": 5.4897, + "loss/crossentropy": 2.382007360458374, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16232822835445404, + "step": 12380 + }, + { + "epoch": 0.3869375, + "grad_norm": 3.203125, + "grad_norm_var": 0.0439849853515625, + "learning_rate": 0.0001, + "loss": 6.0852, + "loss/crossentropy": 2.6731148958206177, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18730565905570984, + "step": 12382 + }, + { + "epoch": 0.387, + "grad_norm": 2.8125, + "grad_norm_var": 0.0530426025390625, + "learning_rate": 0.0001, + "loss": 5.3265, + "loss/crossentropy": 2.2830283641815186, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.15513179451227188, + "step": 12384 + }, + { + "epoch": 0.3870625, + "grad_norm": 3.71875, + "grad_norm_var": 0.0671051025390625, + "learning_rate": 0.0001, + "loss": 6.0105, + "loss/crossentropy": 2.5999969244003296, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18284514546394348, + "step": 12386 + }, + { + "epoch": 0.387125, + "grad_norm": 3.453125, + "grad_norm_var": 0.066015625, + "learning_rate": 0.0001, + "loss": 5.861, + "loss/crossentropy": 2.5838682651519775, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17653851211071014, + "step": 12388 + }, + { + "epoch": 0.3871875, + "grad_norm": 3.140625, + "grad_norm_var": 0.06604410807291666, + "learning_rate": 0.0001, + "loss": 6.0443, + "loss/crossentropy": 2.650669813156128, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1823267638683319, + "step": 12390 + }, + { + "epoch": 0.38725, + "grad_norm": 3.5, + "grad_norm_var": 0.06659749348958334, + "learning_rate": 0.0001, + "loss": 6.1211, + "loss/crossentropy": 2.624216675758362, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.1907046064734459, + "step": 12392 + }, + { + "epoch": 0.3873125, + "grad_norm": 3.453125, + "grad_norm_var": 0.06513264973958334, + "learning_rate": 0.0001, + "loss": 5.732, + "loss/crossentropy": 2.4991012811660767, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17524397373199463, + "step": 12394 + }, + { + "epoch": 0.387375, + "grad_norm": 5.21875, + "grad_norm_var": 0.2813639322916667, + "learning_rate": 0.0001, + "loss": 6.0215, + "loss/crossentropy": 2.6776946783065796, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18281908333301544, + "step": 12396 + }, + { + "epoch": 0.3874375, + "grad_norm": 3.53125, + "grad_norm_var": 0.27920633951822915, + "learning_rate": 0.0001, + "loss": 5.7708, + "loss/crossentropy": 2.5114076137542725, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17359807342290878, + "step": 12398 + }, + { + "epoch": 0.3875, + "grad_norm": 3.046875, + "grad_norm_var": 0.26319986979166665, + "learning_rate": 0.0001, + "loss": 5.7417, + "loss/crossentropy": 2.5564173460006714, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17009221762418747, + "step": 12400 + }, + { + "epoch": 0.3875625, + "grad_norm": 3.28125, + "grad_norm_var": 0.2624664306640625, + "learning_rate": 0.0001, + "loss": 6.2902, + "loss/crossentropy": 2.812481641769409, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19152524322271347, + "step": 12402 + }, + { + "epoch": 0.387625, + "grad_norm": 3.421875, + "grad_norm_var": 0.25524800618489585, + "learning_rate": 0.0001, + "loss": 6.0524, + "loss/crossentropy": 2.710444688796997, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1826329007744789, + "step": 12404 + }, + { + "epoch": 0.3876875, + "grad_norm": 3.125, + "grad_norm_var": 0.25578511555989586, + "learning_rate": 0.0001, + "loss": 5.929, + "loss/crossentropy": 2.6516133546829224, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17812985181808472, + "step": 12406 + }, + { + "epoch": 0.38775, + "grad_norm": 3.515625, + "grad_norm_var": 0.25614827473958335, + "learning_rate": 0.0001, + "loss": 5.8416, + "loss/crossentropy": 2.535792112350464, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.175498329102993, + "step": 12408 + }, + { + "epoch": 0.3878125, + "grad_norm": 3.9375, + "grad_norm_var": 0.26713765462239586, + "learning_rate": 0.0001, + "loss": 5.8101, + "loss/crossentropy": 2.5195395946502686, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1802251636981964, + "step": 12410 + }, + { + "epoch": 0.387875, + "grad_norm": 5.6875, + "grad_norm_var": 0.38483784993489584, + "learning_rate": 0.0001, + "loss": 6.088, + "loss/crossentropy": 2.5359402894973755, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19778630137443542, + "step": 12412 + }, + { + "epoch": 0.3879375, + "grad_norm": 3.484375, + "grad_norm_var": 0.38273824055989586, + "learning_rate": 0.0001, + "loss": 5.8851, + "loss/crossentropy": 2.563393473625183, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18021686375141144, + "step": 12414 + }, + { + "epoch": 0.388, + "grad_norm": 3.40625, + "grad_norm_var": 0.36657613118489585, + "learning_rate": 0.0001, + "loss": 5.6905, + "loss/crossentropy": 2.400371789932251, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17510996758937836, + "step": 12416 + }, + { + "epoch": 0.3880625, + "grad_norm": 3.0, + "grad_norm_var": 0.3878814697265625, + "learning_rate": 0.0001, + "loss": 5.8078, + "loss/crossentropy": 2.5770890712738037, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17150652408599854, + "step": 12418 + }, + { + "epoch": 0.388125, + "grad_norm": 2.9375, + "grad_norm_var": 0.41765950520833334, + "learning_rate": 0.0001, + "loss": 5.6815, + "loss/crossentropy": 2.564165711402893, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1679873764514923, + "step": 12420 + }, + { + "epoch": 0.3881875, + "grad_norm": 14.6875, + "grad_norm_var": 8.300706990559895, + "learning_rate": 0.0001, + "loss": 6.268, + "loss/crossentropy": 2.609584927558899, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.21232140809297562, + "step": 12422 + }, + { + "epoch": 0.38825, + "grad_norm": 3.453125, + "grad_norm_var": 8.298563639322916, + "learning_rate": 0.0001, + "loss": 6.045, + "loss/crossentropy": 2.7234071493148804, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17942225188016891, + "step": 12424 + }, + { + "epoch": 0.3883125, + "grad_norm": 3.625, + "grad_norm_var": 8.25523681640625, + "learning_rate": 0.0001, + "loss": 6.2657, + "loss/crossentropy": 2.8496131896972656, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1857447624206543, + "step": 12426 + }, + { + "epoch": 0.388375, + "grad_norm": 3.234375, + "grad_norm_var": 8.137954711914062, + "learning_rate": 0.0001, + "loss": 6.019, + "loss/crossentropy": 2.729101300239563, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17938295006752014, + "step": 12428 + }, + { + "epoch": 0.3884375, + "grad_norm": 3.484375, + "grad_norm_var": 8.197554524739584, + "learning_rate": 0.0001, + "loss": 6.0574, + "loss/crossentropy": 2.804273843765259, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17414409667253494, + "step": 12430 + }, + { + "epoch": 0.3885, + "grad_norm": 3.328125, + "grad_norm_var": 8.200804646809896, + "learning_rate": 0.0001, + "loss": 5.9274, + "loss/crossentropy": 2.677392601966858, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17461467534303665, + "step": 12432 + }, + { + "epoch": 0.3885625, + "grad_norm": 3.328125, + "grad_norm_var": 8.151171875, + "learning_rate": 0.0001, + "loss": 6.0823, + "loss/crossentropy": 2.739627718925476, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17919403314590454, + "step": 12434 + }, + { + "epoch": 0.388625, + "grad_norm": 3.1875, + "grad_norm_var": 8.069559733072916, + "learning_rate": 0.0001, + "loss": 5.5287, + "loss/crossentropy": 2.3251763582229614, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16918141394853592, + "step": 12436 + }, + { + "epoch": 0.3886875, + "grad_norm": 3.234375, + "grad_norm_var": 0.051985677083333334, + "learning_rate": 0.0001, + "loss": 5.8059, + "loss/crossentropy": 2.5358848571777344, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17544230818748474, + "step": 12438 + }, + { + "epoch": 0.38875, + "grad_norm": 3.40625, + "grad_norm_var": 0.062093098958333336, + "learning_rate": 0.0001, + "loss": 5.8158, + "loss/crossentropy": 2.5627267360687256, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17413048446178436, + "step": 12440 + }, + { + "epoch": 0.3888125, + "grad_norm": 3.078125, + "grad_norm_var": 0.028913370768229165, + "learning_rate": 0.0001, + "loss": 5.9131, + "loss/crossentropy": 2.66182541847229, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17434658855199814, + "step": 12442 + }, + { + "epoch": 0.388875, + "grad_norm": 2.953125, + "grad_norm_var": 0.0419830322265625, + "learning_rate": 0.0001, + "loss": 5.5846, + "loss/crossentropy": 2.480395793914795, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16433017700910568, + "step": 12444 + }, + { + "epoch": 0.3889375, + "grad_norm": 3.40625, + "grad_norm_var": 0.04179585774739583, + "learning_rate": 0.0001, + "loss": 5.9598, + "loss/crossentropy": 2.5799955129623413, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1829030141234398, + "step": 12446 + }, + { + "epoch": 0.389, + "grad_norm": 3.671875, + "grad_norm_var": 0.05797119140625, + "learning_rate": 0.0001, + "loss": 5.8662, + "loss/crossentropy": 2.4446297883987427, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18551705032587051, + "step": 12448 + }, + { + "epoch": 0.3890625, + "grad_norm": 3.15625, + "grad_norm_var": 0.0632232666015625, + "learning_rate": 0.0001, + "loss": 5.5757, + "loss/crossentropy": 2.4247324466705322, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16666096448898315, + "step": 12450 + }, + { + "epoch": 0.389125, + "grad_norm": 3.15625, + "grad_norm_var": 0.054976399739583334, + "learning_rate": 0.0001, + "loss": 5.6293, + "loss/crossentropy": 2.4501971006393433, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1667427271604538, + "step": 12452 + }, + { + "epoch": 0.3891875, + "grad_norm": 3.375, + "grad_norm_var": 0.0585357666015625, + "learning_rate": 0.0001, + "loss": 5.9373, + "loss/crossentropy": 2.6698057651519775, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1779187098145485, + "step": 12454 + }, + { + "epoch": 0.38925, + "grad_norm": 3.296875, + "grad_norm_var": 0.05213114420572917, + "learning_rate": 0.0001, + "loss": 6.0682, + "loss/crossentropy": 2.7355445623397827, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.18404972553253174, + "step": 12456 + }, + { + "epoch": 0.3893125, + "grad_norm": 3.203125, + "grad_norm_var": 0.05105794270833333, + "learning_rate": 0.0001, + "loss": 5.8755, + "loss/crossentropy": 2.621545433998108, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17344574630260468, + "step": 12458 + }, + { + "epoch": 0.389375, + "grad_norm": 3.984375, + "grad_norm_var": 0.082177734375, + "learning_rate": 0.0001, + "loss": 5.8256, + "loss/crossentropy": 2.565748691558838, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17285622656345367, + "step": 12460 + }, + { + "epoch": 0.3894375, + "grad_norm": 4.0625, + "grad_norm_var": 0.11726888020833333, + "learning_rate": 0.0001, + "loss": 6.4138, + "loss/crossentropy": 2.834993362426758, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.20397429913282394, + "step": 12462 + }, + { + "epoch": 0.3895, + "grad_norm": 3.171875, + "grad_norm_var": 0.17669169108072916, + "learning_rate": 0.0001, + "loss": 5.9481, + "loss/crossentropy": 2.6729438304901123, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1786840558052063, + "step": 12464 + }, + { + "epoch": 0.3895625, + "grad_norm": 3.640625, + "grad_norm_var": 0.17158203125, + "learning_rate": 0.0001, + "loss": 6.2031, + "loss/crossentropy": 2.658798575401306, + "loss/hidden": 1.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.18958371877670288, + "step": 12466 + }, + { + "epoch": 0.389625, + "grad_norm": 3.1875, + "grad_norm_var": 0.17887369791666666, + "learning_rate": 0.0001, + "loss": 5.4703, + "loss/crossentropy": 2.3624590635299683, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1603964865207672, + "step": 12468 + }, + { + "epoch": 0.3896875, + "grad_norm": 3.109375, + "grad_norm_var": 0.17981363932291666, + "learning_rate": 0.0001, + "loss": 5.5185, + "loss/crossentropy": 2.313608765602112, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16970573365688324, + "step": 12470 + }, + { + "epoch": 0.38975, + "grad_norm": 3.390625, + "grad_norm_var": 0.17998758951822916, + "learning_rate": 0.0001, + "loss": 5.3998, + "loss/crossentropy": 2.1840202808380127, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.16533156484365463, + "step": 12472 + }, + { + "epoch": 0.3898125, + "grad_norm": 3.28125, + "grad_norm_var": 0.1732330322265625, + "learning_rate": 0.0001, + "loss": 5.7885, + "loss/crossentropy": 2.514266848564148, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17312682420015335, + "step": 12474 + }, + { + "epoch": 0.389875, + "grad_norm": 3.03125, + "grad_norm_var": 0.14644775390625, + "learning_rate": 0.0001, + "loss": 5.6845, + "loss/crossentropy": 2.5720447301864624, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1604689359664917, + "step": 12476 + }, + { + "epoch": 0.3899375, + "grad_norm": 3.1875, + "grad_norm_var": 0.117431640625, + "learning_rate": 0.0001, + "loss": 5.9237, + "loss/crossentropy": 2.688705086708069, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17271683365106583, + "step": 12478 + }, + { + "epoch": 0.39, + "grad_norm": 3.28125, + "grad_norm_var": 0.045287068684895834, + "learning_rate": 0.0001, + "loss": 6.0534, + "loss/crossentropy": 2.643076777458191, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18556687235832214, + "step": 12480 + }, + { + "epoch": 0.3900625, + "grad_norm": 3.078125, + "grad_norm_var": 0.3818105061848958, + "learning_rate": 0.0001, + "loss": 5.7111, + "loss/crossentropy": 2.463478922843933, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1739843562245369, + "step": 12482 + }, + { + "epoch": 0.390125, + "grad_norm": 3.46875, + "grad_norm_var": 0.3636627197265625, + "learning_rate": 0.0001, + "loss": 5.9466, + "loss/crossentropy": 2.5659433603286743, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18260101974010468, + "step": 12484 + }, + { + "epoch": 0.3901875, + "grad_norm": 3.34375, + "grad_norm_var": 0.35659891764322915, + "learning_rate": 0.0001, + "loss": 5.9332, + "loss/crossentropy": 2.6430987119674683, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17940440773963928, + "step": 12486 + }, + { + "epoch": 0.39025, + "grad_norm": 3.390625, + "grad_norm_var": 0.3578114827473958, + "learning_rate": 0.0001, + "loss": 5.8222, + "loss/crossentropy": 2.4864721298217773, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.17732400447130203, + "step": 12488 + }, + { + "epoch": 0.3903125, + "grad_norm": 3.421875, + "grad_norm_var": 0.3553700764973958, + "learning_rate": 0.0001, + "loss": 5.7043, + "loss/crossentropy": 2.460012197494507, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1709115281701088, + "step": 12490 + }, + { + "epoch": 0.390375, + "grad_norm": 3.640625, + "grad_norm_var": 0.3351064046223958, + "learning_rate": 0.0001, + "loss": 5.8912, + "loss/crossentropy": 2.4892873764038086, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18393903970718384, + "step": 12492 + }, + { + "epoch": 0.3904375, + "grad_norm": 4.84375, + "grad_norm_var": 0.4280181884765625, + "learning_rate": 0.0001, + "loss": 5.8677, + "loss/crossentropy": 2.4914904832839966, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18332628905773163, + "step": 12494 + }, + { + "epoch": 0.3905, + "grad_norm": 3.59375, + "grad_norm_var": 0.44505208333333335, + "learning_rate": 0.0001, + "loss": 6.0272, + "loss/crossentropy": 2.6169928312301636, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1859443485736847, + "step": 12496 + }, + { + "epoch": 0.3905625, + "grad_norm": 4.3125, + "grad_norm_var": 0.19702860514322917, + "learning_rate": 0.0001, + "loss": 5.7736, + "loss/crossentropy": 2.447673797607422, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.17634569108486176, + "step": 12498 + }, + { + "epoch": 0.390625, + "grad_norm": 3.25, + "grad_norm_var": 0.2016754150390625, + "learning_rate": 0.0001, + "loss": 6.1153, + "loss/crossentropy": 2.724039673805237, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1859992891550064, + "step": 12500 + }, + { + "epoch": 0.3906875, + "grad_norm": 3.46875, + "grad_norm_var": 0.1912017822265625, + "learning_rate": 0.0001, + "loss": 5.985, + "loss/crossentropy": 2.615624785423279, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1845911666750908, + "step": 12502 + }, + { + "epoch": 0.39075, + "grad_norm": 3.796875, + "grad_norm_var": 0.20871988932291666, + "learning_rate": 0.0001, + "loss": 5.9715, + "loss/crossentropy": 2.6195785999298096, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1808907762169838, + "step": 12504 + }, + { + "epoch": 0.3908125, + "grad_norm": 3.28125, + "grad_norm_var": 0.21756184895833333, + "learning_rate": 0.0001, + "loss": 5.8925, + "loss/crossentropy": 2.5929884910583496, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17487546801567078, + "step": 12506 + }, + { + "epoch": 0.390875, + "grad_norm": 3.421875, + "grad_norm_var": 0.24308980305989583, + "learning_rate": 0.0001, + "loss": 5.6525, + "loss/crossentropy": 2.4756404161453247, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1673000156879425, + "step": 12508 + }, + { + "epoch": 0.3909375, + "grad_norm": 3.484375, + "grad_norm_var": 0.12542317708333334, + "learning_rate": 0.0001, + "loss": 5.937, + "loss/crossentropy": 2.653487205505371, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17600330710411072, + "step": 12510 + }, + { + "epoch": 0.391, + "grad_norm": 3.28125, + "grad_norm_var": 0.12543843587239584, + "learning_rate": 0.0001, + "loss": 5.4535, + "loss/crossentropy": 2.368133783340454, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1561921015381813, + "step": 12512 + }, + { + "epoch": 0.3910625, + "grad_norm": 3.515625, + "grad_norm_var": 0.07339579264322917, + "learning_rate": 0.0001, + "loss": 5.4631, + "loss/crossentropy": 2.319524049758911, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.16044697165489197, + "step": 12514 + }, + { + "epoch": 0.391125, + "grad_norm": 3.125, + "grad_norm_var": 0.07093098958333334, + "learning_rate": 0.0001, + "loss": 5.8847, + "loss/crossentropy": 2.6295299530029297, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17668668925762177, + "step": 12516 + }, + { + "epoch": 0.3911875, + "grad_norm": 3.0, + "grad_norm_var": 0.07001546223958334, + "learning_rate": 0.0001, + "loss": 5.7337, + "loss/crossentropy": 2.4934409856796265, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17051269114017487, + "step": 12518 + }, + { + "epoch": 0.39125, + "grad_norm": 3.234375, + "grad_norm_var": 0.04545796712239583, + "learning_rate": 0.0001, + "loss": 5.9634, + "loss/crossentropy": 2.6827075481414795, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1765061467885971, + "step": 12520 + }, + { + "epoch": 0.3913125, + "grad_norm": 3.46875, + "grad_norm_var": 0.05082906087239583, + "learning_rate": 0.0001, + "loss": 5.8172, + "loss/crossentropy": 2.511690855026245, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1789904236793518, + "step": 12522 + }, + { + "epoch": 0.391375, + "grad_norm": 3.0625, + "grad_norm_var": 0.044188435872395834, + "learning_rate": 0.0001, + "loss": 5.6367, + "loss/crossentropy": 2.4518083333969116, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17044229060411453, + "step": 12524 + }, + { + "epoch": 0.3914375, + "grad_norm": 3.125, + "grad_norm_var": 0.03808186848958333, + "learning_rate": 0.0001, + "loss": 5.77, + "loss/crossentropy": 2.6033570766448975, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.16315263509750366, + "step": 12526 + }, + { + "epoch": 0.3915, + "grad_norm": 2.875, + "grad_norm_var": 0.038386027018229164, + "learning_rate": 0.0001, + "loss": 5.6457, + "loss/crossentropy": 2.487342357635498, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16974316537380219, + "step": 12528 + }, + { + "epoch": 0.3915625, + "grad_norm": 2.84375, + "grad_norm_var": 0.032225545247395834, + "learning_rate": 0.0001, + "loss": 5.8537, + "loss/crossentropy": 2.6897761821746826, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16716928780078888, + "step": 12530 + }, + { + "epoch": 0.391625, + "grad_norm": 3.015625, + "grad_norm_var": 0.03245340983072917, + "learning_rate": 0.0001, + "loss": 5.6876, + "loss/crossentropy": 2.4564136266708374, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17116591334342957, + "step": 12532 + }, + { + "epoch": 0.3916875, + "grad_norm": 3.125, + "grad_norm_var": 0.02880859375, + "learning_rate": 0.0001, + "loss": 6.1001, + "loss/crossentropy": 2.800950288772583, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17913557589054108, + "step": 12534 + }, + { + "epoch": 0.39175, + "grad_norm": 3.578125, + "grad_norm_var": 0.053954060872395834, + "learning_rate": 0.0001, + "loss": 5.9839, + "loss/crossentropy": 2.5637909173965454, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1884925439953804, + "step": 12536 + }, + { + "epoch": 0.3918125, + "grad_norm": 3.125, + "grad_norm_var": 0.04442952473958333, + "learning_rate": 0.0001, + "loss": 5.656, + "loss/crossentropy": 2.512777805328369, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1674426794052124, + "step": 12538 + }, + { + "epoch": 0.391875, + "grad_norm": 3.109375, + "grad_norm_var": 0.04358622233072917, + "learning_rate": 0.0001, + "loss": 5.2536, + "loss/crossentropy": 2.2287850379943848, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15521272271871567, + "step": 12540 + }, + { + "epoch": 0.3919375, + "grad_norm": 3.515625, + "grad_norm_var": 0.05670166015625, + "learning_rate": 0.0001, + "loss": 6.172, + "loss/crossentropy": 2.7422330379486084, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18789854645729065, + "step": 12542 + }, + { + "epoch": 0.392, + "grad_norm": 3.203125, + "grad_norm_var": 0.0508453369140625, + "learning_rate": 0.0001, + "loss": 5.7974, + "loss/crossentropy": 2.490816831588745, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17753690481185913, + "step": 12544 + }, + { + "epoch": 0.3920625, + "grad_norm": 3.34375, + "grad_norm_var": 0.03681640625, + "learning_rate": 0.0001, + "loss": 5.9525, + "loss/crossentropy": 2.657665252685547, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1783137023448944, + "step": 12546 + }, + { + "epoch": 0.392125, + "grad_norm": 3.25, + "grad_norm_var": 0.030387369791666667, + "learning_rate": 0.0001, + "loss": 6.3394, + "loss/crossentropy": 2.913390874862671, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18947432935237885, + "step": 12548 + }, + { + "epoch": 0.3921875, + "grad_norm": 2.9375, + "grad_norm_var": 0.038605753580729166, + "learning_rate": 0.0001, + "loss": 5.7545, + "loss/crossentropy": 2.521004557609558, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17413241416215897, + "step": 12550 + }, + { + "epoch": 0.39225, + "grad_norm": 3.0625, + "grad_norm_var": 0.027176920572916666, + "learning_rate": 0.0001, + "loss": 5.6198, + "loss/crossentropy": 2.4493943452835083, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1689920276403427, + "step": 12552 + }, + { + "epoch": 0.3923125, + "grad_norm": 3.359375, + "grad_norm_var": 0.02763671875, + "learning_rate": 0.0001, + "loss": 6.0448, + "loss/crossentropy": 2.728785991668701, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18042971938848495, + "step": 12554 + }, + { + "epoch": 0.392375, + "grad_norm": 3.890625, + "grad_norm_var": 0.06608784993489583, + "learning_rate": 0.0001, + "loss": 6.0623, + "loss/crossentropy": 2.7216707468032837, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17937306314706802, + "step": 12556 + }, + { + "epoch": 0.3924375, + "grad_norm": 4.4375, + "grad_norm_var": 0.14260660807291667, + "learning_rate": 0.0001, + "loss": 6.114, + "loss/crossentropy": 2.7082772254943848, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18705706298351288, + "step": 12558 + }, + { + "epoch": 0.3925, + "grad_norm": 3.4375, + "grad_norm_var": 0.14175516764322918, + "learning_rate": 0.0001, + "loss": 6.0443, + "loss/crossentropy": 2.6535454988479614, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1836022362112999, + "step": 12560 + }, + { + "epoch": 0.3925625, + "grad_norm": 3.296875, + "grad_norm_var": 0.14643452962239584, + "learning_rate": 0.0001, + "loss": 5.8386, + "loss/crossentropy": 2.5863006114959717, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17288491874933243, + "step": 12562 + }, + { + "epoch": 0.392625, + "grad_norm": 3.46875, + "grad_norm_var": 0.14233296712239582, + "learning_rate": 0.0001, + "loss": 6.2355, + "loss/crossentropy": 2.804616689682007, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1868431344628334, + "step": 12564 + }, + { + "epoch": 0.3926875, + "grad_norm": 3.796875, + "grad_norm_var": 0.12805074055989582, + "learning_rate": 0.0001, + "loss": 5.9621, + "loss/crossentropy": 2.629786491394043, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.18401210010051727, + "step": 12566 + }, + { + "epoch": 0.39275, + "grad_norm": 3.28125, + "grad_norm_var": 0.11767171223958334, + "learning_rate": 0.0001, + "loss": 5.7638, + "loss/crossentropy": 2.5153621435165405, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17366864532232285, + "step": 12568 + }, + { + "epoch": 0.3928125, + "grad_norm": 3.078125, + "grad_norm_var": 0.12672119140625, + "learning_rate": 0.0001, + "loss": 5.7299, + "loss/crossentropy": 2.584002733230591, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16575895249843597, + "step": 12570 + }, + { + "epoch": 0.392875, + "grad_norm": 3.25, + "grad_norm_var": 0.11220296223958333, + "learning_rate": 0.0001, + "loss": 5.9635, + "loss/crossentropy": 2.6303478479385376, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18175004422664642, + "step": 12572 + }, + { + "epoch": 0.3929375, + "grad_norm": 3.15625, + "grad_norm_var": 0.0424468994140625, + "learning_rate": 0.0001, + "loss": 5.6407, + "loss/crossentropy": 2.4312771558761597, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17328844219446182, + "step": 12574 + }, + { + "epoch": 0.393, + "grad_norm": 3.28125, + "grad_norm_var": 0.07740478515625, + "learning_rate": 0.0001, + "loss": 6.1336, + "loss/crossentropy": 2.7003642320632935, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18629637360572815, + "step": 12576 + }, + { + "epoch": 0.3930625, + "grad_norm": 2.890625, + "grad_norm_var": 0.08684895833333334, + "learning_rate": 0.0001, + "loss": 5.4049, + "loss/crossentropy": 2.3360401391983032, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15961584448814392, + "step": 12578 + }, + { + "epoch": 0.393125, + "grad_norm": 3.203125, + "grad_norm_var": 0.08376363118489584, + "learning_rate": 0.0001, + "loss": 5.8556, + "loss/crossentropy": 2.511664032936096, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17892113327980042, + "step": 12580 + }, + { + "epoch": 0.3931875, + "grad_norm": 3.3125, + "grad_norm_var": 0.06633199055989583, + "learning_rate": 0.0001, + "loss": 5.8833, + "loss/crossentropy": 2.581635594367981, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.18055399507284164, + "step": 12582 + }, + { + "epoch": 0.39325, + "grad_norm": 3.375, + "grad_norm_var": 0.06669514973958333, + "learning_rate": 0.0001, + "loss": 5.5154, + "loss/crossentropy": 2.326522707939148, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1700570210814476, + "step": 12584 + }, + { + "epoch": 0.3933125, + "grad_norm": 3.296875, + "grad_norm_var": 0.06213785807291667, + "learning_rate": 0.0001, + "loss": 5.904, + "loss/crossentropy": 2.644391179084778, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17518238723278046, + "step": 12586 + }, + { + "epoch": 0.393375, + "grad_norm": 3.625, + "grad_norm_var": 0.06856180826822916, + "learning_rate": 0.0001, + "loss": 6.2582, + "loss/crossentropy": 2.8179484605789185, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18856094777584076, + "step": 12588 + }, + { + "epoch": 0.3934375, + "grad_norm": 3.328125, + "grad_norm_var": 0.0644439697265625, + "learning_rate": 0.0001, + "loss": 5.8571, + "loss/crossentropy": 2.6284478902816772, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17482053488492966, + "step": 12590 + }, + { + "epoch": 0.3935, + "grad_norm": 3.140625, + "grad_norm_var": 0.028218587239583332, + "learning_rate": 0.0001, + "loss": 5.86, + "loss/crossentropy": 2.5996965169906616, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1752540022134781, + "step": 12592 + }, + { + "epoch": 0.3935625, + "grad_norm": 3.375, + "grad_norm_var": 0.015771484375, + "learning_rate": 0.0001, + "loss": 5.6939, + "loss/crossentropy": 2.366678476333618, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17842572182416916, + "step": 12594 + }, + { + "epoch": 0.393625, + "grad_norm": 2.984375, + "grad_norm_var": 0.02666015625, + "learning_rate": 0.0001, + "loss": 5.558, + "loss/crossentropy": 2.4497467279434204, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16512507945299149, + "step": 12596 + }, + { + "epoch": 0.3936875, + "grad_norm": 3.078125, + "grad_norm_var": 0.028587849934895833, + "learning_rate": 0.0001, + "loss": 6.0952, + "loss/crossentropy": 2.6856324672698975, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18509763479232788, + "step": 12598 + }, + { + "epoch": 0.39375, + "grad_norm": 3.09375, + "grad_norm_var": 0.0327545166015625, + "learning_rate": 0.0001, + "loss": 5.5648, + "loss/crossentropy": 2.4074909687042236, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1653430312871933, + "step": 12600 + }, + { + "epoch": 0.3938125, + "grad_norm": 3.046875, + "grad_norm_var": 0.031636555989583336, + "learning_rate": 0.0001, + "loss": 5.7825, + "loss/crossentropy": 2.528868794441223, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1745782196521759, + "step": 12602 + }, + { + "epoch": 0.393875, + "grad_norm": 3.3125, + "grad_norm_var": 0.020466105143229166, + "learning_rate": 0.0001, + "loss": 5.9446, + "loss/crossentropy": 2.685313105583191, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1767057254910469, + "step": 12604 + }, + { + "epoch": 0.3939375, + "grad_norm": 3.46875, + "grad_norm_var": 0.025651041666666666, + "learning_rate": 0.0001, + "loss": 6.0904, + "loss/crossentropy": 2.719091296195984, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1847868636250496, + "step": 12606 + }, + { + "epoch": 0.394, + "grad_norm": 3.453125, + "grad_norm_var": 0.032013956705729166, + "learning_rate": 0.0001, + "loss": 5.8556, + "loss/crossentropy": 2.664482593536377, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16872593015432358, + "step": 12608 + }, + { + "epoch": 0.3940625, + "grad_norm": 3.25, + "grad_norm_var": 0.0301910400390625, + "learning_rate": 0.0001, + "loss": 5.7742, + "loss/crossentropy": 2.536603331565857, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17298102378845215, + "step": 12610 + }, + { + "epoch": 0.394125, + "grad_norm": 3.21875, + "grad_norm_var": 0.02486572265625, + "learning_rate": 0.0001, + "loss": 5.664, + "loss/crossentropy": 2.4420113563537598, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1690772995352745, + "step": 12612 + }, + { + "epoch": 0.3941875, + "grad_norm": 3.5, + "grad_norm_var": 0.027708943684895834, + "learning_rate": 0.0001, + "loss": 6.0206, + "loss/crossentropy": 2.643946409225464, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18648922443389893, + "step": 12614 + }, + { + "epoch": 0.39425, + "grad_norm": 3.390625, + "grad_norm_var": 0.03137613932291667, + "learning_rate": 0.0001, + "loss": 6.2392, + "loss/crossentropy": 2.7503719329833984, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19185277819633484, + "step": 12616 + }, + { + "epoch": 0.3943125, + "grad_norm": 3.203125, + "grad_norm_var": 0.040999348958333334, + "learning_rate": 0.0001, + "loss": 6.1137, + "loss/crossentropy": 2.66534686088562, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18741349875926971, + "step": 12618 + }, + { + "epoch": 0.394375, + "grad_norm": 3.125, + "grad_norm_var": 0.037398274739583334, + "learning_rate": 0.0001, + "loss": 5.9342, + "loss/crossentropy": 2.627368450164795, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17638636380434036, + "step": 12620 + }, + { + "epoch": 0.3944375, + "grad_norm": 3.84375, + "grad_norm_var": 0.06957906087239583, + "learning_rate": 0.0001, + "loss": 5.4455, + "loss/crossentropy": 2.38720965385437, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15699703991413116, + "step": 12622 + }, + { + "epoch": 0.3945, + "grad_norm": 3.15625, + "grad_norm_var": 0.0674224853515625, + "learning_rate": 0.0001, + "loss": 5.952, + "loss/crossentropy": 2.627716302871704, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1789139211177826, + "step": 12624 + }, + { + "epoch": 0.3945625, + "grad_norm": 3.0625, + "grad_norm_var": 0.0775787353515625, + "learning_rate": 0.0001, + "loss": 5.6675, + "loss/crossentropy": 2.4872137308120728, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16880901902914047, + "step": 12626 + }, + { + "epoch": 0.394625, + "grad_norm": 3.4375, + "grad_norm_var": 0.07729390462239584, + "learning_rate": 0.0001, + "loss": 5.8755, + "loss/crossentropy": 2.627228617668152, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17365680634975433, + "step": 12628 + }, + { + "epoch": 0.3946875, + "grad_norm": 3.5625, + "grad_norm_var": 0.07883199055989583, + "learning_rate": 0.0001, + "loss": 6.046, + "loss/crossentropy": 2.7390414476394653, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1763944998383522, + "step": 12630 + }, + { + "epoch": 0.39475, + "grad_norm": 4.59375, + "grad_norm_var": 0.1789215087890625, + "learning_rate": 0.0001, + "loss": 5.8115, + "loss/crossentropy": 2.4511373043060303, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17939364165067673, + "step": 12632 + }, + { + "epoch": 0.3948125, + "grad_norm": 3.625, + "grad_norm_var": 0.170458984375, + "learning_rate": 0.0001, + "loss": 5.8712, + "loss/crossentropy": 2.5700401067733765, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17777566611766815, + "step": 12634 + }, + { + "epoch": 0.394875, + "grad_norm": 3.234375, + "grad_norm_var": 0.16705729166666666, + "learning_rate": 0.0001, + "loss": 5.5774, + "loss/crossentropy": 2.346840262413025, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.16876373440027237, + "step": 12636 + }, + { + "epoch": 0.3949375, + "grad_norm": 3.046875, + "grad_norm_var": 0.14108784993489584, + "learning_rate": 0.0001, + "loss": 5.632, + "loss/crossentropy": 2.4182028770446777, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.16825632005929947, + "step": 12638 + }, + { + "epoch": 0.395, + "grad_norm": 2.984375, + "grad_norm_var": 0.14585673014322917, + "learning_rate": 0.0001, + "loss": 5.8294, + "loss/crossentropy": 2.660908341407776, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16997504979372025, + "step": 12640 + }, + { + "epoch": 0.3950625, + "grad_norm": 3.40625, + "grad_norm_var": 0.1327545166015625, + "learning_rate": 0.0001, + "loss": 5.6972, + "loss/crossentropy": 2.4525365829467773, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17602672427892685, + "step": 12642 + }, + { + "epoch": 0.395125, + "grad_norm": 3.28125, + "grad_norm_var": 0.13359375, + "learning_rate": 0.0001, + "loss": 5.8437, + "loss/crossentropy": 2.584797978401184, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17628318071365356, + "step": 12644 + }, + { + "epoch": 0.3951875, + "grad_norm": 3.125, + "grad_norm_var": 0.13804423014322917, + "learning_rate": 0.0001, + "loss": 5.8704, + "loss/crossentropy": 2.6155022382736206, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17392294108867645, + "step": 12646 + }, + { + "epoch": 0.39525, + "grad_norm": 2.96875, + "grad_norm_var": 0.03209228515625, + "learning_rate": 0.0001, + "loss": 5.3547, + "loss/crossentropy": 2.2829012870788574, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16147667169570923, + "step": 12648 + }, + { + "epoch": 0.3953125, + "grad_norm": 3.21875, + "grad_norm_var": 0.02261962890625, + "learning_rate": 0.0001, + "loss": 5.5823, + "loss/crossentropy": 2.4431118965148926, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16587400436401367, + "step": 12650 + }, + { + "epoch": 0.395375, + "grad_norm": 3.359375, + "grad_norm_var": 0.021577962239583335, + "learning_rate": 0.0001, + "loss": 5.9593, + "loss/crossentropy": 2.669771671295166, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17777632921934128, + "step": 12652 + }, + { + "epoch": 0.3954375, + "grad_norm": 3.140625, + "grad_norm_var": 0.020699055989583333, + "learning_rate": 0.0001, + "loss": 5.5884, + "loss/crossentropy": 2.387265920639038, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1724553257226944, + "step": 12654 + }, + { + "epoch": 0.3955, + "grad_norm": 3.4375, + "grad_norm_var": 0.02095947265625, + "learning_rate": 0.0001, + "loss": 5.5649, + "loss/crossentropy": 2.4451918601989746, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.15923383086919785, + "step": 12656 + }, + { + "epoch": 0.3955625, + "grad_norm": 2.921875, + "grad_norm_var": 0.02203369140625, + "learning_rate": 0.0001, + "loss": 5.5627, + "loss/crossentropy": 2.475408911705017, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16341856867074966, + "step": 12658 + }, + { + "epoch": 0.395625, + "grad_norm": 3.390625, + "grad_norm_var": 0.023856608072916667, + "learning_rate": 0.0001, + "loss": 5.9212, + "loss/crossentropy": 2.67451274394989, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17388500273227692, + "step": 12660 + }, + { + "epoch": 0.3956875, + "grad_norm": 3.140625, + "grad_norm_var": 0.02467041015625, + "learning_rate": 0.0001, + "loss": 5.924, + "loss/crossentropy": 2.6317321062088013, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.18118055164813995, + "step": 12662 + }, + { + "epoch": 0.39575, + "grad_norm": 3.546875, + "grad_norm_var": 0.028776041666666665, + "learning_rate": 0.0001, + "loss": 5.7894, + "loss/crossentropy": 2.5545856952667236, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17348483204841614, + "step": 12664 + }, + { + "epoch": 0.3958125, + "grad_norm": 3.4375, + "grad_norm_var": 0.030692545572916667, + "learning_rate": 0.0001, + "loss": 5.8091, + "loss/crossentropy": 2.508827805519104, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1796346753835678, + "step": 12666 + }, + { + "epoch": 0.395875, + "grad_norm": 3.40625, + "grad_norm_var": 0.03242899576822917, + "learning_rate": 0.0001, + "loss": 5.7779, + "loss/crossentropy": 2.5303611755371094, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17280302941799164, + "step": 12668 + }, + { + "epoch": 0.3959375, + "grad_norm": 3.640625, + "grad_norm_var": 0.03943583170572917, + "learning_rate": 0.0001, + "loss": 6.0169, + "loss/crossentropy": 2.6776299476623535, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1831415742635727, + "step": 12670 + }, + { + "epoch": 0.396, + "grad_norm": 3.125, + "grad_norm_var": 0.03638407389322917, + "learning_rate": 0.0001, + "loss": 5.5852, + "loss/crossentropy": 2.3899872303009033, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17030571401119232, + "step": 12672 + }, + { + "epoch": 0.3960625, + "grad_norm": 3.40625, + "grad_norm_var": 0.025016276041666667, + "learning_rate": 0.0001, + "loss": 5.9688, + "loss/crossentropy": 2.6896661520004272, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17908131331205368, + "step": 12674 + }, + { + "epoch": 0.396125, + "grad_norm": 3.21875, + "grad_norm_var": 0.0358062744140625, + "learning_rate": 0.0001, + "loss": 6.1225, + "loss/crossentropy": 2.7160264253616333, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18322856724262238, + "step": 12676 + }, + { + "epoch": 0.3961875, + "grad_norm": 3.078125, + "grad_norm_var": 0.038374837239583334, + "learning_rate": 0.0001, + "loss": 6.0282, + "loss/crossentropy": 2.6944602727890015, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18063852190971375, + "step": 12678 + }, + { + "epoch": 0.39625, + "grad_norm": 3.21875, + "grad_norm_var": 0.03828023274739583, + "learning_rate": 0.0001, + "loss": 5.5322, + "loss/crossentropy": 2.3815308809280396, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16272065788507462, + "step": 12680 + }, + { + "epoch": 0.3963125, + "grad_norm": 2.953125, + "grad_norm_var": 0.049072265625, + "learning_rate": 0.0001, + "loss": 5.7624, + "loss/crossentropy": 2.572110176086426, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17059581726789474, + "step": 12682 + }, + { + "epoch": 0.396375, + "grad_norm": 3.171875, + "grad_norm_var": 0.0523834228515625, + "learning_rate": 0.0001, + "loss": 5.963, + "loss/crossentropy": 2.7088600397109985, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17540912330150604, + "step": 12684 + }, + { + "epoch": 0.3964375, + "grad_norm": 3.09375, + "grad_norm_var": 0.041337076822916666, + "learning_rate": 0.0001, + "loss": 5.7898, + "loss/crossentropy": 2.542935609817505, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17547084391117096, + "step": 12686 + }, + { + "epoch": 0.3965, + "grad_norm": 3.328125, + "grad_norm_var": 0.0449859619140625, + "learning_rate": 0.0001, + "loss": 5.9827, + "loss/crossentropy": 2.6294325590133667, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1821979507803917, + "step": 12688 + }, + { + "epoch": 0.3965625, + "grad_norm": 3.25, + "grad_norm_var": 0.042529296875, + "learning_rate": 0.0001, + "loss": 5.5309, + "loss/crossentropy": 2.3679721355438232, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16746362298727036, + "step": 12690 + }, + { + "epoch": 0.396625, + "grad_norm": 2.96875, + "grad_norm_var": 0.022977701822916665, + "learning_rate": 0.0001, + "loss": 5.819, + "loss/crossentropy": 2.605017900466919, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17178434878587723, + "step": 12692 + }, + { + "epoch": 0.3966875, + "grad_norm": 3.296875, + "grad_norm_var": 0.019759114583333334, + "learning_rate": 0.0001, + "loss": 5.899, + "loss/crossentropy": 2.6208107471466064, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17821407318115234, + "step": 12694 + }, + { + "epoch": 0.39675, + "grad_norm": 3.0, + "grad_norm_var": 0.020279947916666666, + "learning_rate": 0.0001, + "loss": 5.5962, + "loss/crossentropy": 2.4298471212387085, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16663925349712372, + "step": 12696 + }, + { + "epoch": 0.3968125, + "grad_norm": 3.234375, + "grad_norm_var": 0.020051066080729166, + "learning_rate": 0.0001, + "loss": 5.482, + "loss/crossentropy": 2.359502673149109, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16342318803071976, + "step": 12698 + }, + { + "epoch": 0.396875, + "grad_norm": 3.40625, + "grad_norm_var": 0.023102823893229166, + "learning_rate": 0.0001, + "loss": 6.0644, + "loss/crossentropy": 2.720403790473938, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17932403087615967, + "step": 12700 + }, + { + "epoch": 0.3969375, + "grad_norm": 3.1875, + "grad_norm_var": 0.023680623372395834, + "learning_rate": 0.0001, + "loss": 5.8073, + "loss/crossentropy": 2.5173072814941406, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1747017800807953, + "step": 12702 + }, + { + "epoch": 0.397, + "grad_norm": 3.078125, + "grad_norm_var": 0.018001302083333334, + "learning_rate": 0.0001, + "loss": 5.785, + "loss/crossentropy": 2.5444486141204834, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17483435571193695, + "step": 12704 + }, + { + "epoch": 0.3970625, + "grad_norm": 3.234375, + "grad_norm_var": 0.019334920247395835, + "learning_rate": 0.0001, + "loss": 5.9081, + "loss/crossentropy": 2.5757994651794434, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18128123879432678, + "step": 12706 + }, + { + "epoch": 0.397125, + "grad_norm": 3.125, + "grad_norm_var": 0.017365519205729166, + "learning_rate": 0.0001, + "loss": 5.716, + "loss/crossentropy": 2.5148154497146606, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17207133769989014, + "step": 12708 + }, + { + "epoch": 0.3971875, + "grad_norm": 3.171875, + "grad_norm_var": 0.025504557291666667, + "learning_rate": 0.0001, + "loss": 6.4185, + "loss/crossentropy": 2.932085871696472, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1908249631524086, + "step": 12710 + }, + { + "epoch": 0.39725, + "grad_norm": 3.203125, + "grad_norm_var": 0.0242095947265625, + "learning_rate": 0.0001, + "loss": 5.9251, + "loss/crossentropy": 2.657052993774414, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17719236016273499, + "step": 12712 + }, + { + "epoch": 0.3973125, + "grad_norm": 3.484375, + "grad_norm_var": 0.03722330729166667, + "learning_rate": 0.0001, + "loss": 6.0949, + "loss/crossentropy": 2.6813771724700928, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18510235846042633, + "step": 12714 + }, + { + "epoch": 0.397375, + "grad_norm": 3.375, + "grad_norm_var": 0.0348297119140625, + "learning_rate": 0.0001, + "loss": 5.9062, + "loss/crossentropy": 2.5811485052108765, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17742551118135452, + "step": 12716 + }, + { + "epoch": 0.3974375, + "grad_norm": 3.078125, + "grad_norm_var": 0.04065755208333333, + "learning_rate": 0.0001, + "loss": 5.4382, + "loss/crossentropy": 2.3304866552352905, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1631132811307907, + "step": 12718 + }, + { + "epoch": 0.3975, + "grad_norm": 3.09375, + "grad_norm_var": 0.0389068603515625, + "learning_rate": 0.0001, + "loss": 5.613, + "loss/crossentropy": 2.435179114341736, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16504327952861786, + "step": 12720 + }, + { + "epoch": 0.3975625, + "grad_norm": 3.171875, + "grad_norm_var": 0.040755208333333334, + "learning_rate": 0.0001, + "loss": 5.7467, + "loss/crossentropy": 2.530525803565979, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17318245768547058, + "step": 12722 + }, + { + "epoch": 0.397625, + "grad_norm": 3.328125, + "grad_norm_var": 0.04049072265625, + "learning_rate": 0.0001, + "loss": 5.8345, + "loss/crossentropy": 2.524799942970276, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17823348939418793, + "step": 12724 + }, + { + "epoch": 0.3976875, + "grad_norm": 3.171875, + "grad_norm_var": 0.0399078369140625, + "learning_rate": 0.0001, + "loss": 5.778, + "loss/crossentropy": 2.4941515922546387, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17877789586782455, + "step": 12726 + }, + { + "epoch": 0.39775, + "grad_norm": 3.0625, + "grad_norm_var": 0.0445709228515625, + "learning_rate": 0.0001, + "loss": 5.7333, + "loss/crossentropy": 2.5802040100097656, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1684320867061615, + "step": 12728 + }, + { + "epoch": 0.3978125, + "grad_norm": 3.890625, + "grad_norm_var": 0.05390218098958333, + "learning_rate": 0.0001, + "loss": 5.8855, + "loss/crossentropy": 2.5856465101242065, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17763860523700714, + "step": 12730 + }, + { + "epoch": 0.397875, + "grad_norm": 3.234375, + "grad_norm_var": 0.06889546712239583, + "learning_rate": 0.0001, + "loss": 5.846, + "loss/crossentropy": 2.570215344429016, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1724957674741745, + "step": 12732 + }, + { + "epoch": 0.3979375, + "grad_norm": 2.984375, + "grad_norm_var": 0.06931966145833333, + "learning_rate": 0.0001, + "loss": 5.7952, + "loss/crossentropy": 2.5533066987991333, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17457794398069382, + "step": 12734 + }, + { + "epoch": 0.398, + "grad_norm": 3.203125, + "grad_norm_var": 0.06868082682291667, + "learning_rate": 0.0001, + "loss": 5.5556, + "loss/crossentropy": 2.3609848022460938, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16985399276018143, + "step": 12736 + }, + { + "epoch": 0.3980625, + "grad_norm": 3.125, + "grad_norm_var": 0.06868489583333333, + "learning_rate": 0.0001, + "loss": 5.675, + "loss/crossentropy": 2.453263998031616, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17256765067577362, + "step": 12738 + }, + { + "epoch": 0.398125, + "grad_norm": 3.203125, + "grad_norm_var": 0.06577860514322917, + "learning_rate": 0.0001, + "loss": 5.8025, + "loss/crossentropy": 2.5349299907684326, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17637009173631668, + "step": 12740 + }, + { + "epoch": 0.3981875, + "grad_norm": 3.25, + "grad_norm_var": 0.06015218098958333, + "learning_rate": 0.0001, + "loss": 5.8391, + "loss/crossentropy": 2.591480016708374, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17437315732240677, + "step": 12742 + }, + { + "epoch": 0.39825, + "grad_norm": 3.109375, + "grad_norm_var": 0.05719401041666667, + "learning_rate": 0.0001, + "loss": 5.7709, + "loss/crossentropy": 2.547766089439392, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17231663316488266, + "step": 12744 + }, + { + "epoch": 0.3983125, + "grad_norm": 3.21875, + "grad_norm_var": 0.029832967122395835, + "learning_rate": 0.0001, + "loss": 5.6614, + "loss/crossentropy": 2.5061652660369873, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16434767842292786, + "step": 12746 + }, + { + "epoch": 0.398375, + "grad_norm": 3.109375, + "grad_norm_var": 0.00699462890625, + "learning_rate": 0.0001, + "loss": 5.6085, + "loss/crossentropy": 2.4205719232559204, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17113681882619858, + "step": 12748 + }, + { + "epoch": 0.3984375, + "grad_norm": 3.125, + "grad_norm_var": 0.005760701497395834, + "learning_rate": 0.0001, + "loss": 5.6337, + "loss/crossentropy": 2.4922345876693726, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16922497749328613, + "step": 12750 + }, + { + "epoch": 0.3985, + "grad_norm": 3.265625, + "grad_norm_var": 0.006734212239583333, + "learning_rate": 0.0001, + "loss": 5.8577, + "loss/crossentropy": 2.6219236850738525, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17319178581237793, + "step": 12752 + }, + { + "epoch": 0.3985625, + "grad_norm": 3.3125, + "grad_norm_var": 0.00982666015625, + "learning_rate": 0.0001, + "loss": 5.772, + "loss/crossentropy": 2.5100075006484985, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1738506257534027, + "step": 12754 + }, + { + "epoch": 0.398625, + "grad_norm": 3.328125, + "grad_norm_var": 0.0154296875, + "learning_rate": 0.0001, + "loss": 5.8406, + "loss/crossentropy": 2.6387826204299927, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17135730385780334, + "step": 12756 + }, + { + "epoch": 0.3986875, + "grad_norm": 3.109375, + "grad_norm_var": 0.020601399739583335, + "learning_rate": 0.0001, + "loss": 5.8364, + "loss/crossentropy": 2.608380675315857, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17358743399381638, + "step": 12758 + }, + { + "epoch": 0.39875, + "grad_norm": 3.046875, + "grad_norm_var": 0.021873982747395833, + "learning_rate": 0.0001, + "loss": 5.8293, + "loss/crossentropy": 2.6063863039016724, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17150786519050598, + "step": 12760 + }, + { + "epoch": 0.3988125, + "grad_norm": 3.375, + "grad_norm_var": 0.044041951497395836, + "learning_rate": 0.0001, + "loss": 6.2794, + "loss/crossentropy": 2.8802947998046875, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18639102578163147, + "step": 12762 + }, + { + "epoch": 0.398875, + "grad_norm": 3.265625, + "grad_norm_var": 0.044188435872395834, + "learning_rate": 0.0001, + "loss": 5.4637, + "loss/crossentropy": 2.2918306589126587, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1683613732457161, + "step": 12764 + }, + { + "epoch": 0.3989375, + "grad_norm": 3.078125, + "grad_norm_var": 0.04299214680989583, + "learning_rate": 0.0001, + "loss": 5.8856, + "loss/crossentropy": 2.683797240257263, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17095889896154404, + "step": 12766 + }, + { + "epoch": 0.399, + "grad_norm": 3.078125, + "grad_norm_var": 0.044352213541666664, + "learning_rate": 0.0001, + "loss": 5.9134, + "loss/crossentropy": 2.6380010843276978, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17714772373437881, + "step": 12768 + }, + { + "epoch": 0.3990625, + "grad_norm": 3.5, + "grad_norm_var": 0.049820963541666666, + "learning_rate": 0.0001, + "loss": 5.9807, + "loss/crossentropy": 2.6644222736358643, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1804550215601921, + "step": 12770 + }, + { + "epoch": 0.399125, + "grad_norm": 3.078125, + "grad_norm_var": 0.042704264322916664, + "learning_rate": 0.0001, + "loss": 5.4594, + "loss/crossentropy": 2.3213056325912476, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1661573201417923, + "step": 12772 + }, + { + "epoch": 0.3991875, + "grad_norm": 2.96875, + "grad_norm_var": 0.043024698893229164, + "learning_rate": 0.0001, + "loss": 5.6452, + "loss/crossentropy": 2.530721426010132, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1630120500922203, + "step": 12774 + }, + { + "epoch": 0.39925, + "grad_norm": 3.125, + "grad_norm_var": 0.04690348307291667, + "learning_rate": 0.0001, + "loss": 5.8038, + "loss/crossentropy": 2.4875781536102295, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1788884699344635, + "step": 12776 + }, + { + "epoch": 0.3993125, + "grad_norm": 3.125, + "grad_norm_var": 0.026292928059895835, + "learning_rate": 0.0001, + "loss": 5.6844, + "loss/crossentropy": 2.4839333295822144, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16848218441009521, + "step": 12778 + }, + { + "epoch": 0.399375, + "grad_norm": 2.9375, + "grad_norm_var": 0.028058878580729165, + "learning_rate": 0.0001, + "loss": 5.8555, + "loss/crossentropy": 2.6576311588287354, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17173823714256287, + "step": 12780 + }, + { + "epoch": 0.3994375, + "grad_norm": 3.46875, + "grad_norm_var": 0.03469645182291667, + "learning_rate": 0.0001, + "loss": 5.7478, + "loss/crossentropy": 2.508821964263916, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17663374543190002, + "step": 12782 + }, + { + "epoch": 0.3995, + "grad_norm": 3.28125, + "grad_norm_var": 0.03271077473958333, + "learning_rate": 0.0001, + "loss": 5.9926, + "loss/crossentropy": 2.6725634336471558, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1788768768310547, + "step": 12784 + }, + { + "epoch": 0.3995625, + "grad_norm": 3.265625, + "grad_norm_var": 0.025731404622395832, + "learning_rate": 0.0001, + "loss": 5.7898, + "loss/crossentropy": 2.5669726133346558, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1730625331401825, + "step": 12786 + }, + { + "epoch": 0.399625, + "grad_norm": 3.171875, + "grad_norm_var": 0.028571573893229167, + "learning_rate": 0.0001, + "loss": 6.1232, + "loss/crossentropy": 2.745548963546753, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.184643916785717, + "step": 12788 + }, + { + "epoch": 0.3996875, + "grad_norm": 3.90625, + "grad_norm_var": 0.04778238932291667, + "learning_rate": 0.0001, + "loss": 6.1705, + "loss/crossentropy": 2.769607663154602, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18853065371513367, + "step": 12790 + }, + { + "epoch": 0.39975, + "grad_norm": 3.0, + "grad_norm_var": 0.0531646728515625, + "learning_rate": 0.0001, + "loss": 5.6306, + "loss/crossentropy": 2.5244375467300415, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16374072432518005, + "step": 12792 + }, + { + "epoch": 0.3998125, + "grad_norm": 2.90625, + "grad_norm_var": 0.06271158854166667, + "learning_rate": 0.0001, + "loss": 5.5619, + "loss/crossentropy": 2.397638440132141, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16681698709726334, + "step": 12794 + }, + { + "epoch": 0.399875, + "grad_norm": 3.03125, + "grad_norm_var": 0.05999348958333333, + "learning_rate": 0.0001, + "loss": 5.6963, + "loss/crossentropy": 2.5068334341049194, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.16504260897636414, + "step": 12796 + }, + { + "epoch": 0.3999375, + "grad_norm": 3.171875, + "grad_norm_var": 0.0570953369140625, + "learning_rate": 0.0001, + "loss": 5.694, + "loss/crossentropy": 2.4424139261245728, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1739911586046219, + "step": 12798 + }, + { + "epoch": 0.4, + "grad_norm": 3.359375, + "grad_norm_var": 0.07810872395833333, + "learning_rate": 0.0001, + "loss": 6.0521, + "loss/crossentropy": 2.644740581512451, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18995042890310287, + "step": 12800 + }, + { + "epoch": 0.4000625, + "grad_norm": 3.0625, + "grad_norm_var": 0.08280843098958333, + "learning_rate": 0.0001, + "loss": 6.062, + "loss/crossentropy": 2.752319812774658, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17979149520397186, + "step": 12802 + }, + { + "epoch": 0.400125, + "grad_norm": 3.796875, + "grad_norm_var": 0.09868062337239583, + "learning_rate": 0.0001, + "loss": 5.8939, + "loss/crossentropy": 2.7276742458343506, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16466771811246872, + "step": 12804 + }, + { + "epoch": 0.4001875, + "grad_norm": 3.53125, + "grad_norm_var": 0.08255208333333333, + "learning_rate": 0.0001, + "loss": 5.9977, + "loss/crossentropy": 2.6202635765075684, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1830565556883812, + "step": 12806 + }, + { + "epoch": 0.40025, + "grad_norm": 3.078125, + "grad_norm_var": 0.07848307291666666, + "learning_rate": 0.0001, + "loss": 5.8544, + "loss/crossentropy": 2.6176319122314453, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17563194781541824, + "step": 12808 + }, + { + "epoch": 0.4003125, + "grad_norm": 3.59375, + "grad_norm_var": 0.06526692708333333, + "learning_rate": 0.0001, + "loss": 6.0423, + "loss/crossentropy": 2.7191877365112305, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18075303733348846, + "step": 12810 + }, + { + "epoch": 0.400375, + "grad_norm": 3.28125, + "grad_norm_var": 0.05510660807291667, + "learning_rate": 0.0001, + "loss": 6.155, + "loss/crossentropy": 2.797845959663391, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18414809554815292, + "step": 12812 + }, + { + "epoch": 0.4004375, + "grad_norm": 3.28125, + "grad_norm_var": 0.05431315104166667, + "learning_rate": 0.0001, + "loss": 5.5796, + "loss/crossentropy": 2.437540650367737, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16694405674934387, + "step": 12814 + }, + { + "epoch": 0.4005, + "grad_norm": 3.0, + "grad_norm_var": 0.052057902018229164, + "learning_rate": 0.0001, + "loss": 5.9384, + "loss/crossentropy": 2.726389765739441, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17237205058336258, + "step": 12816 + }, + { + "epoch": 0.4005625, + "grad_norm": 3.1875, + "grad_norm_var": 0.05452372233072917, + "learning_rate": 0.0001, + "loss": 6.0321, + "loss/crossentropy": 2.7516602277755737, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17999598383903503, + "step": 12818 + }, + { + "epoch": 0.400625, + "grad_norm": 3.046875, + "grad_norm_var": 0.04016520182291667, + "learning_rate": 0.0001, + "loss": 5.9035, + "loss/crossentropy": 2.5985175371170044, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17971674352884293, + "step": 12820 + }, + { + "epoch": 0.4006875, + "grad_norm": 3.515625, + "grad_norm_var": 0.03284403483072917, + "learning_rate": 0.0001, + "loss": 6.0141, + "loss/crossentropy": 2.634552836418152, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18131476640701294, + "step": 12822 + }, + { + "epoch": 0.40075, + "grad_norm": 3.34375, + "grad_norm_var": 0.0257232666015625, + "learning_rate": 0.0001, + "loss": 5.7685, + "loss/crossentropy": 2.4657139778137207, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17871953547000885, + "step": 12824 + }, + { + "epoch": 0.4008125, + "grad_norm": 3.46875, + "grad_norm_var": 0.025519816080729167, + "learning_rate": 0.0001, + "loss": 5.9018, + "loss/crossentropy": 2.6673959493637085, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17422594875097275, + "step": 12826 + }, + { + "epoch": 0.400875, + "grad_norm": 3.4375, + "grad_norm_var": 0.042878214518229166, + "learning_rate": 0.0001, + "loss": 5.9004, + "loss/crossentropy": 2.428491711616516, + "loss/hidden": 1.66015625, + "loss/jsd": 0.0, + "loss/logits": 0.1811741143465042, + "step": 12828 + }, + { + "epoch": 0.4009375, + "grad_norm": 3.34375, + "grad_norm_var": 0.0453521728515625, + "learning_rate": 0.0001, + "loss": 5.9111, + "loss/crossentropy": 2.6243066787719727, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17711268365383148, + "step": 12830 + }, + { + "epoch": 0.401, + "grad_norm": 3.296875, + "grad_norm_var": 0.041291300455729166, + "learning_rate": 0.0001, + "loss": 5.9383, + "loss/crossentropy": 2.6939502954483032, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17169564962387085, + "step": 12832 + }, + { + "epoch": 0.4010625, + "grad_norm": 3.015625, + "grad_norm_var": 0.04114176432291667, + "learning_rate": 0.0001, + "loss": 5.6311, + "loss/crossentropy": 2.4560784101486206, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16984651237726212, + "step": 12834 + }, + { + "epoch": 0.401125, + "grad_norm": 3.84375, + "grad_norm_var": 0.0587554931640625, + "learning_rate": 0.0001, + "loss": 5.9381, + "loss/crossentropy": 2.6205928325653076, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18097268044948578, + "step": 12836 + }, + { + "epoch": 0.4011875, + "grad_norm": 3.125, + "grad_norm_var": 0.06116434733072917, + "learning_rate": 0.0001, + "loss": 5.8675, + "loss/crossentropy": 2.715549349784851, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16793277859687805, + "step": 12838 + }, + { + "epoch": 0.40125, + "grad_norm": 3.21875, + "grad_norm_var": 0.060628255208333336, + "learning_rate": 0.0001, + "loss": 5.7584, + "loss/crossentropy": 2.447048544883728, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18074850738048553, + "step": 12840 + }, + { + "epoch": 0.4013125, + "grad_norm": 3.125, + "grad_norm_var": 0.055817667643229166, + "learning_rate": 0.0001, + "loss": 5.7538, + "loss/crossentropy": 2.580907702445984, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17040975391864777, + "step": 12842 + }, + { + "epoch": 0.401375, + "grad_norm": 3.1875, + "grad_norm_var": 0.03987528483072917, + "learning_rate": 0.0001, + "loss": 5.7061, + "loss/crossentropy": 2.4553611278533936, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.16803889721632004, + "step": 12844 + }, + { + "epoch": 0.4014375, + "grad_norm": 3.234375, + "grad_norm_var": 0.03857014973958333, + "learning_rate": 0.0001, + "loss": 5.7958, + "loss/crossentropy": 2.4673973321914673, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18089058995246887, + "step": 12846 + }, + { + "epoch": 0.4015, + "grad_norm": 3.765625, + "grad_norm_var": 0.058003743489583336, + "learning_rate": 0.0001, + "loss": 6.0188, + "loss/crossentropy": 2.632445812225342, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18121369928121567, + "step": 12848 + }, + { + "epoch": 0.4015625, + "grad_norm": 3.125, + "grad_norm_var": 0.05601806640625, + "learning_rate": 0.0001, + "loss": 5.7952, + "loss/crossentropy": 2.564656615257263, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17383144050836563, + "step": 12850 + }, + { + "epoch": 0.401625, + "grad_norm": 3.421875, + "grad_norm_var": 0.0319488525390625, + "learning_rate": 0.0001, + "loss": 5.7028, + "loss/crossentropy": 2.5033124685287476, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16721860319375992, + "step": 12852 + }, + { + "epoch": 0.4016875, + "grad_norm": 3.28125, + "grad_norm_var": 0.032770792643229164, + "learning_rate": 0.0001, + "loss": 6.0856, + "loss/crossentropy": 2.7508046627044678, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1823054924607277, + "step": 12854 + }, + { + "epoch": 0.40175, + "grad_norm": 2.90625, + "grad_norm_var": 0.040379842122395836, + "learning_rate": 0.0001, + "loss": 5.7225, + "loss/crossentropy": 2.5294690132141113, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16851916164159775, + "step": 12856 + }, + { + "epoch": 0.4018125, + "grad_norm": 3.203125, + "grad_norm_var": 0.25660400390625, + "learning_rate": 0.0001, + "loss": 6.2473, + "loss/crossentropy": 2.7770293951034546, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.19663351774215698, + "step": 12858 + }, + { + "epoch": 0.401875, + "grad_norm": 3.046875, + "grad_norm_var": 0.262939453125, + "learning_rate": 0.0001, + "loss": 5.4946, + "loss/crossentropy": 2.327062249183655, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1667497232556343, + "step": 12860 + }, + { + "epoch": 0.4019375, + "grad_norm": 3.1875, + "grad_norm_var": 0.26521708170572916, + "learning_rate": 0.0001, + "loss": 5.7023, + "loss/crossentropy": 2.474923253059387, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.16531268507242203, + "step": 12862 + }, + { + "epoch": 0.402, + "grad_norm": 3.328125, + "grad_norm_var": 0.26431376139322915, + "learning_rate": 0.0001, + "loss": 6.5668, + "loss/crossentropy": 2.958469271659851, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.20262643694877625, + "step": 12864 + }, + { + "epoch": 0.4020625, + "grad_norm": 3.59375, + "grad_norm_var": 0.26106363932291665, + "learning_rate": 0.0001, + "loss": 6.1842, + "loss/crossentropy": 2.7213059663772583, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19003939628601074, + "step": 12866 + }, + { + "epoch": 0.402125, + "grad_norm": 3.109375, + "grad_norm_var": 0.2629191080729167, + "learning_rate": 0.0001, + "loss": 6.1548, + "loss/crossentropy": 2.7594075202941895, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18680787086486816, + "step": 12868 + }, + { + "epoch": 0.4021875, + "grad_norm": 3.078125, + "grad_norm_var": 0.269140625, + "learning_rate": 0.0001, + "loss": 5.7794, + "loss/crossentropy": 2.5408273935317993, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17268949002027512, + "step": 12870 + }, + { + "epoch": 0.40225, + "grad_norm": 3.3125, + "grad_norm_var": 0.25554097493489586, + "learning_rate": 0.0001, + "loss": 5.8066, + "loss/crossentropy": 2.6045267581939697, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1721557453274727, + "step": 12872 + }, + { + "epoch": 0.4023125, + "grad_norm": 3.1875, + "grad_norm_var": 0.7239908854166667, + "learning_rate": 0.0001, + "loss": 6.1699, + "loss/crossentropy": 2.70488178730011, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1879124790430069, + "step": 12874 + }, + { + "epoch": 0.402375, + "grad_norm": 3.140625, + "grad_norm_var": 0.71588134765625, + "learning_rate": 0.0001, + "loss": 5.6145, + "loss/crossentropy": 2.437716007232666, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1688496172428131, + "step": 12876 + }, + { + "epoch": 0.4024375, + "grad_norm": 3.578125, + "grad_norm_var": 0.7010091145833334, + "learning_rate": 0.0001, + "loss": 6.0098, + "loss/crossentropy": 2.6502280235290527, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18361777067184448, + "step": 12878 + }, + { + "epoch": 0.4025, + "grad_norm": 3.4375, + "grad_norm_var": 0.7035115559895834, + "learning_rate": 0.0001, + "loss": 5.7489, + "loss/crossentropy": 2.478968024253845, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17621035873889923, + "step": 12880 + }, + { + "epoch": 0.4025625, + "grad_norm": 3.0625, + "grad_norm_var": 0.7146148681640625, + "learning_rate": 0.0001, + "loss": 5.8506, + "loss/crossentropy": 2.605592966079712, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17332439869642258, + "step": 12882 + }, + { + "epoch": 0.402625, + "grad_norm": 3.171875, + "grad_norm_var": 0.7188639322916667, + "learning_rate": 0.0001, + "loss": 5.5365, + "loss/crossentropy": 2.4114911556243896, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16211073100566864, + "step": 12884 + }, + { + "epoch": 0.4026875, + "grad_norm": 3.3125, + "grad_norm_var": 0.712255859375, + "learning_rate": 0.0001, + "loss": 5.7428, + "loss/crossentropy": 2.4702759981155396, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17334362864494324, + "step": 12886 + }, + { + "epoch": 0.40275, + "grad_norm": 3.421875, + "grad_norm_var": 0.7189280192057291, + "learning_rate": 0.0001, + "loss": 5.6523, + "loss/crossentropy": 2.398773670196533, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17222969233989716, + "step": 12888 + }, + { + "epoch": 0.4028125, + "grad_norm": 3.0, + "grad_norm_var": 0.030171712239583332, + "learning_rate": 0.0001, + "loss": 5.6355, + "loss/crossentropy": 2.4634926319122314, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16485454887151718, + "step": 12890 + }, + { + "epoch": 0.402875, + "grad_norm": 3.6875, + "grad_norm_var": 0.04164937337239583, + "learning_rate": 0.0001, + "loss": 6.3441, + "loss/crossentropy": 2.9000874757766724, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1873697191476822, + "step": 12892 + }, + { + "epoch": 0.4029375, + "grad_norm": 3.421875, + "grad_norm_var": 0.035807291666666664, + "learning_rate": 0.0001, + "loss": 6.0954, + "loss/crossentropy": 2.7477036714553833, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1839851438999176, + "step": 12894 + }, + { + "epoch": 0.403, + "grad_norm": 2.984375, + "grad_norm_var": 0.04275716145833333, + "learning_rate": 0.0001, + "loss": 5.8582, + "loss/crossentropy": 2.6724019050598145, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1674080416560173, + "step": 12896 + }, + { + "epoch": 0.4030625, + "grad_norm": 3.28125, + "grad_norm_var": 0.040608723958333336, + "learning_rate": 0.0001, + "loss": 5.2831, + "loss/crossentropy": 2.202645778656006, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.15726906061172485, + "step": 12898 + }, + { + "epoch": 0.403125, + "grad_norm": 3.03125, + "grad_norm_var": 0.04267578125, + "learning_rate": 0.0001, + "loss": 5.6854, + "loss/crossentropy": 2.4447492361068726, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17523667216300964, + "step": 12900 + }, + { + "epoch": 0.4031875, + "grad_norm": 3.109375, + "grad_norm_var": 0.04366861979166667, + "learning_rate": 0.0001, + "loss": 5.9955, + "loss/crossentropy": 2.6723486185073853, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17958541214466095, + "step": 12902 + }, + { + "epoch": 0.40325, + "grad_norm": 3.0625, + "grad_norm_var": 0.04013264973958333, + "learning_rate": 0.0001, + "loss": 5.924, + "loss/crossentropy": 2.620428204536438, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17957250773906708, + "step": 12904 + }, + { + "epoch": 0.4033125, + "grad_norm": 3.125, + "grad_norm_var": 0.0362945556640625, + "learning_rate": 0.0001, + "loss": 5.8185, + "loss/crossentropy": 2.5143805742263794, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.178853839635849, + "step": 12906 + }, + { + "epoch": 0.403375, + "grad_norm": 3.0, + "grad_norm_var": 0.024925740559895833, + "learning_rate": 0.0001, + "loss": 5.6459, + "loss/crossentropy": 2.4812533855438232, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16880392283201218, + "step": 12908 + }, + { + "epoch": 0.4034375, + "grad_norm": 3.0625, + "grad_norm_var": 0.021581013997395832, + "learning_rate": 0.0001, + "loss": 5.7469, + "loss/crossentropy": 2.550856828689575, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.17467769235372543, + "step": 12910 + }, + { + "epoch": 0.4035, + "grad_norm": 2.859375, + "grad_norm_var": 0.016630045572916665, + "learning_rate": 0.0001, + "loss": 5.5039, + "loss/crossentropy": 2.4234888553619385, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1615532785654068, + "step": 12912 + }, + { + "epoch": 0.4035625, + "grad_norm": 3.09375, + "grad_norm_var": 0.023639933268229166, + "learning_rate": 0.0001, + "loss": 5.9952, + "loss/crossentropy": 2.6943411827087402, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1781327947974205, + "step": 12914 + }, + { + "epoch": 0.403625, + "grad_norm": 3.5625, + "grad_norm_var": 0.0354644775390625, + "learning_rate": 0.0001, + "loss": 5.8373, + "loss/crossentropy": 2.5322425365448, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17660397291183472, + "step": 12916 + }, + { + "epoch": 0.4036875, + "grad_norm": 2.953125, + "grad_norm_var": 0.038004557291666664, + "learning_rate": 0.0001, + "loss": 5.5832, + "loss/crossentropy": 2.4476633071899414, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16394364833831787, + "step": 12918 + }, + { + "epoch": 0.40375, + "grad_norm": 3.390625, + "grad_norm_var": 0.049332682291666666, + "learning_rate": 0.0001, + "loss": 5.9082, + "loss/crossentropy": 2.593148946762085, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17876767367124557, + "step": 12920 + }, + { + "epoch": 0.4038125, + "grad_norm": 3.234375, + "grad_norm_var": 0.0494293212890625, + "learning_rate": 0.0001, + "loss": 5.9072, + "loss/crossentropy": 2.629413366317749, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17738816887140274, + "step": 12922 + }, + { + "epoch": 0.403875, + "grad_norm": 3.375, + "grad_norm_var": 0.048779296875, + "learning_rate": 0.0001, + "loss": 5.9481, + "loss/crossentropy": 2.5518592596054077, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18571743369102478, + "step": 12924 + }, + { + "epoch": 0.4039375, + "grad_norm": 3.125, + "grad_norm_var": 0.04592692057291667, + "learning_rate": 0.0001, + "loss": 5.6959, + "loss/crossentropy": 2.442676544189453, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.16985361278057098, + "step": 12926 + }, + { + "epoch": 0.404, + "grad_norm": 3.171875, + "grad_norm_var": 0.05259501139322917, + "learning_rate": 0.0001, + "loss": 5.7413, + "loss/crossentropy": 2.4280115365982056, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.18210766464471817, + "step": 12928 + }, + { + "epoch": 0.4040625, + "grad_norm": 2.90625, + "grad_norm_var": 0.057860310872395834, + "learning_rate": 0.0001, + "loss": 5.5295, + "loss/crossentropy": 2.4299877882003784, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.163080595433712, + "step": 12930 + }, + { + "epoch": 0.404125, + "grad_norm": 3.390625, + "grad_norm_var": 0.046483357747395836, + "learning_rate": 0.0001, + "loss": 5.8573, + "loss/crossentropy": 2.609510898590088, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17087089270353317, + "step": 12932 + }, + { + "epoch": 0.4041875, + "grad_norm": 3.625, + "grad_norm_var": 0.04343973795572917, + "learning_rate": 0.0001, + "loss": 6.0956, + "loss/crossentropy": 2.6327909231185913, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19081494212150574, + "step": 12934 + }, + { + "epoch": 0.40425, + "grad_norm": 3.46875, + "grad_norm_var": 0.042985026041666666, + "learning_rate": 0.0001, + "loss": 5.8734, + "loss/crossentropy": 2.5685410499572754, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1773630529642105, + "step": 12936 + }, + { + "epoch": 0.4043125, + "grad_norm": 3.203125, + "grad_norm_var": 0.0441314697265625, + "learning_rate": 0.0001, + "loss": 6.2464, + "loss/crossentropy": 2.921871542930603, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1812841147184372, + "step": 12938 + }, + { + "epoch": 0.404375, + "grad_norm": 3.234375, + "grad_norm_var": 0.04390360514322917, + "learning_rate": 0.0001, + "loss": 5.889, + "loss/crossentropy": 2.590018391609192, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17560270428657532, + "step": 12940 + }, + { + "epoch": 0.4044375, + "grad_norm": 3.171875, + "grad_norm_var": 0.043290201822916666, + "learning_rate": 0.0001, + "loss": 5.9632, + "loss/crossentropy": 2.6677803993225098, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17719509452581406, + "step": 12942 + }, + { + "epoch": 0.4045, + "grad_norm": 3.359375, + "grad_norm_var": 0.030338541666666666, + "learning_rate": 0.0001, + "loss": 5.5436, + "loss/crossentropy": 2.4174834489822388, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16221749037504196, + "step": 12944 + }, + { + "epoch": 0.4045625, + "grad_norm": 3.359375, + "grad_norm_var": 0.020149739583333333, + "learning_rate": 0.0001, + "loss": 5.697, + "loss/crossentropy": 2.472835898399353, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1708502173423767, + "step": 12946 + }, + { + "epoch": 0.404625, + "grad_norm": 3.359375, + "grad_norm_var": 0.019928995768229166, + "learning_rate": 0.0001, + "loss": 5.8782, + "loss/crossentropy": 2.58474600315094, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17582834511995316, + "step": 12948 + }, + { + "epoch": 0.4046875, + "grad_norm": 3.234375, + "grad_norm_var": 0.011881510416666666, + "learning_rate": 0.0001, + "loss": 5.87, + "loss/crossentropy": 2.5309667587280273, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18155810236930847, + "step": 12950 + }, + { + "epoch": 0.40475, + "grad_norm": 3.0625, + "grad_norm_var": 0.012528483072916667, + "learning_rate": 0.0001, + "loss": 5.9152, + "loss/crossentropy": 2.634434938430786, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17729829996824265, + "step": 12952 + }, + { + "epoch": 0.4048125, + "grad_norm": 3.15625, + "grad_norm_var": 0.013963826497395833, + "learning_rate": 0.0001, + "loss": 5.9207, + "loss/crossentropy": 2.6336101293563843, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17558201402425766, + "step": 12954 + }, + { + "epoch": 0.404875, + "grad_norm": 3.296875, + "grad_norm_var": 0.013996378580729166, + "learning_rate": 0.0001, + "loss": 5.6376, + "loss/crossentropy": 2.3832263946533203, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17309315502643585, + "step": 12956 + }, + { + "epoch": 0.4049375, + "grad_norm": 3.328125, + "grad_norm_var": 0.0151275634765625, + "learning_rate": 0.0001, + "loss": 5.7511, + "loss/crossentropy": 2.565544009208679, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17011910676956177, + "step": 12958 + }, + { + "epoch": 0.405, + "grad_norm": 3.453125, + "grad_norm_var": 0.0159332275390625, + "learning_rate": 0.0001, + "loss": 5.6195, + "loss/crossentropy": 2.4714592695236206, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1644178256392479, + "step": 12960 + }, + { + "epoch": 0.4050625, + "grad_norm": 3.234375, + "grad_norm_var": 0.01646728515625, + "learning_rate": 0.0001, + "loss": 5.9175, + "loss/crossentropy": 2.6698936223983765, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17437398433685303, + "step": 12962 + }, + { + "epoch": 0.405125, + "grad_norm": 3.375, + "grad_norm_var": 0.0189117431640625, + "learning_rate": 0.0001, + "loss": 5.839, + "loss/crossentropy": 2.6037967205047607, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17118076980113983, + "step": 12964 + }, + { + "epoch": 0.4051875, + "grad_norm": 3.46875, + "grad_norm_var": 0.020524088541666666, + "learning_rate": 0.0001, + "loss": 6.0013, + "loss/crossentropy": 2.6123119592666626, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.17991715669631958, + "step": 12966 + }, + { + "epoch": 0.40525, + "grad_norm": 3.171875, + "grad_norm_var": 0.017561848958333334, + "learning_rate": 0.0001, + "loss": 5.9125, + "loss/crossentropy": 2.702871561050415, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.16783539205789566, + "step": 12968 + }, + { + "epoch": 0.4053125, + "grad_norm": 3.328125, + "grad_norm_var": 0.021393839518229166, + "learning_rate": 0.0001, + "loss": 5.6707, + "loss/crossentropy": 2.442600131034851, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17046774923801422, + "step": 12970 + }, + { + "epoch": 0.405375, + "grad_norm": 4.40625, + "grad_norm_var": 0.11380106608072917, + "learning_rate": 0.0001, + "loss": 6.1768, + "loss/crossentropy": 2.6450765132904053, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.19223621487617493, + "step": 12972 + }, + { + "epoch": 0.4054375, + "grad_norm": 3.25, + "grad_norm_var": 0.11139322916666666, + "learning_rate": 0.0001, + "loss": 5.9965, + "loss/crossentropy": 2.73725688457489, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1770923212170601, + "step": 12974 + }, + { + "epoch": 0.4055, + "grad_norm": 3.40625, + "grad_norm_var": 0.11028238932291666, + "learning_rate": 0.0001, + "loss": 6.0203, + "loss/crossentropy": 2.6498407125473022, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1819629818201065, + "step": 12976 + }, + { + "epoch": 0.4055625, + "grad_norm": 3.609375, + "grad_norm_var": 0.11162821451822917, + "learning_rate": 0.0001, + "loss": 5.9902, + "loss/crossentropy": 2.645597815513611, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17977623641490936, + "step": 12978 + }, + { + "epoch": 0.405625, + "grad_norm": 3.046875, + "grad_norm_var": 0.12702534993489584, + "learning_rate": 0.0001, + "loss": 5.3293, + "loss/crossentropy": 2.323233962059021, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1560777723789215, + "step": 12980 + }, + { + "epoch": 0.4056875, + "grad_norm": 3.53125, + "grad_norm_var": 0.13032938639322916, + "learning_rate": 0.0001, + "loss": 5.9522, + "loss/crossentropy": 2.6342997550964355, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17944678664207458, + "step": 12982 + }, + { + "epoch": 0.40575, + "grad_norm": 3.234375, + "grad_norm_var": 0.12976888020833333, + "learning_rate": 0.0001, + "loss": 6.1011, + "loss/crossentropy": 2.750393033027649, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.18545960634946823, + "step": 12984 + }, + { + "epoch": 0.4058125, + "grad_norm": 3.375, + "grad_norm_var": 0.12395833333333334, + "learning_rate": 0.0001, + "loss": 6.0156, + "loss/crossentropy": 2.686494469642639, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1805633381009102, + "step": 12986 + }, + { + "epoch": 0.405875, + "grad_norm": 3.03125, + "grad_norm_var": 0.03619791666666667, + "learning_rate": 0.0001, + "loss": 5.7684, + "loss/crossentropy": 2.537803530693054, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17345496267080307, + "step": 12988 + }, + { + "epoch": 0.4059375, + "grad_norm": 3.15625, + "grad_norm_var": 0.044596354166666664, + "learning_rate": 0.0001, + "loss": 5.7914, + "loss/crossentropy": 2.5131624937057495, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17626352608203888, + "step": 12990 + }, + { + "epoch": 0.406, + "grad_norm": 3.265625, + "grad_norm_var": 0.041657511393229166, + "learning_rate": 0.0001, + "loss": 5.7201, + "loss/crossentropy": 2.47638475894928, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17163395136594772, + "step": 12992 + }, + { + "epoch": 0.4060625, + "grad_norm": 3.359375, + "grad_norm_var": 0.034566243489583336, + "learning_rate": 0.0001, + "loss": 5.7366, + "loss/crossentropy": 2.4082270860671997, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18049810826778412, + "step": 12994 + }, + { + "epoch": 0.406125, + "grad_norm": 3.15625, + "grad_norm_var": 0.048258463541666664, + "learning_rate": 0.0001, + "loss": 5.2884, + "loss/crossentropy": 2.2167803049087524, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.15638333559036255, + "step": 12996 + }, + { + "epoch": 0.4061875, + "grad_norm": 3.25, + "grad_norm_var": 0.04304911295572917, + "learning_rate": 0.0001, + "loss": 5.9594, + "loss/crossentropy": 2.6436110734939575, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.177668496966362, + "step": 12998 + }, + { + "epoch": 0.40625, + "grad_norm": 2.9375, + "grad_norm_var": 0.04846903483072917, + "learning_rate": 0.0001, + "loss": 5.648, + "loss/crossentropy": 2.5511425733566284, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1643727794289589, + "step": 13000 + }, + { + "epoch": 0.4063125, + "grad_norm": 3.1875, + "grad_norm_var": 0.04716796875, + "learning_rate": 0.0001, + "loss": 6.078, + "loss/crossentropy": 2.709423780441284, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1856864020228386, + "step": 13002 + }, + { + "epoch": 0.406375, + "grad_norm": 3.140625, + "grad_norm_var": 0.043146769205729164, + "learning_rate": 0.0001, + "loss": 5.9485, + "loss/crossentropy": 2.7473580837249756, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17089996486902237, + "step": 13004 + }, + { + "epoch": 0.4064375, + "grad_norm": 3.234375, + "grad_norm_var": 0.0306793212890625, + "learning_rate": 0.0001, + "loss": 6.011, + "loss/crossentropy": 2.718340754508972, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17848655581474304, + "step": 13006 + }, + { + "epoch": 0.4065, + "grad_norm": 3.3125, + "grad_norm_var": 0.03271077473958333, + "learning_rate": 0.0001, + "loss": 5.9108, + "loss/crossentropy": 2.67184054851532, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1735036000609398, + "step": 13008 + }, + { + "epoch": 0.4065625, + "grad_norm": 3.671875, + "grad_norm_var": 0.049658203125, + "learning_rate": 0.0001, + "loss": 5.9981, + "loss/crossentropy": 2.624893307685852, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1834184154868126, + "step": 13010 + }, + { + "epoch": 0.406625, + "grad_norm": 3.140625, + "grad_norm_var": 0.036942545572916666, + "learning_rate": 0.0001, + "loss": 5.404, + "loss/crossentropy": 2.3047443628311157, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1599244475364685, + "step": 13012 + }, + { + "epoch": 0.4066875, + "grad_norm": 4.34375, + "grad_norm_var": 0.12011311848958334, + "learning_rate": 0.0001, + "loss": 6.179, + "loss/crossentropy": 2.733541250228882, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18907804042100906, + "step": 13014 + }, + { + "epoch": 0.40675, + "grad_norm": 3.484375, + "grad_norm_var": 0.11612040201822917, + "learning_rate": 0.0001, + "loss": 5.8605, + "loss/crossentropy": 2.537650942802429, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18189211934804916, + "step": 13016 + }, + { + "epoch": 0.4068125, + "grad_norm": 4.1875, + "grad_norm_var": 0.55986328125, + "learning_rate": 0.0001, + "loss": 5.989, + "loss/crossentropy": 2.51317822933197, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.19601739943027496, + "step": 13018 + }, + { + "epoch": 0.406875, + "grad_norm": 3.328125, + "grad_norm_var": 0.5401519775390625, + "learning_rate": 0.0001, + "loss": 6.1557, + "loss/crossentropy": 2.7317371368408203, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18848735094070435, + "step": 13020 + }, + { + "epoch": 0.4069375, + "grad_norm": 3.34375, + "grad_norm_var": 0.5370107014973958, + "learning_rate": 0.0001, + "loss": 5.7906, + "loss/crossentropy": 2.585814952850342, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17086688429117203, + "step": 13022 + }, + { + "epoch": 0.407, + "grad_norm": 3.3125, + "grad_norm_var": 0.55888671875, + "learning_rate": 0.0001, + "loss": 5.6192, + "loss/crossentropy": 2.5052181482315063, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16140136122703552, + "step": 13024 + }, + { + "epoch": 0.4070625, + "grad_norm": 3.078125, + "grad_norm_var": 0.5541341145833333, + "learning_rate": 0.0001, + "loss": 5.8651, + "loss/crossentropy": 2.6415683031082153, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17234836518764496, + "step": 13026 + }, + { + "epoch": 0.407125, + "grad_norm": 3.109375, + "grad_norm_var": 0.5558878580729166, + "learning_rate": 0.0001, + "loss": 5.8399, + "loss/crossentropy": 2.6774927377700806, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1658504456281662, + "step": 13028 + }, + { + "epoch": 0.4071875, + "grad_norm": 3.328125, + "grad_norm_var": 0.5191884358723958, + "learning_rate": 0.0001, + "loss": 5.7787, + "loss/crossentropy": 2.552108645439148, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1714826375246048, + "step": 13030 + }, + { + "epoch": 0.40725, + "grad_norm": 3.359375, + "grad_norm_var": 0.52529296875, + "learning_rate": 0.0001, + "loss": 5.8789, + "loss/crossentropy": 2.6156972646713257, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17476215958595276, + "step": 13032 + }, + { + "epoch": 0.4073125, + "grad_norm": 3.296875, + "grad_norm_var": 0.03331705729166667, + "learning_rate": 0.0001, + "loss": 5.8981, + "loss/crossentropy": 2.586290955543518, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1811782419681549, + "step": 13034 + }, + { + "epoch": 0.407375, + "grad_norm": 3.296875, + "grad_norm_var": 0.024925740559895833, + "learning_rate": 0.0001, + "loss": 5.954, + "loss/crossentropy": 2.6261991262435913, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18082567304372787, + "step": 13036 + }, + { + "epoch": 0.4074375, + "grad_norm": 3.625, + "grad_norm_var": 0.03329671223958333, + "learning_rate": 0.0001, + "loss": 6.3336, + "loss/crossentropy": 2.8870203495025635, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18645723164081573, + "step": 13038 + }, + { + "epoch": 0.4075, + "grad_norm": 3.5625, + "grad_norm_var": 0.052179972330729164, + "learning_rate": 0.0001, + "loss": 6.3924, + "loss/crossentropy": 2.899696469306946, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.19497000426054, + "step": 13040 + }, + { + "epoch": 0.4075625, + "grad_norm": 3.4375, + "grad_norm_var": 0.0502349853515625, + "learning_rate": 0.0001, + "loss": 6.004, + "loss/crossentropy": 2.6903072595596313, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17902319878339767, + "step": 13042 + }, + { + "epoch": 0.407625, + "grad_norm": 3.09375, + "grad_norm_var": 0.04241536458333333, + "learning_rate": 0.0001, + "loss": 5.4394, + "loss/crossentropy": 2.2938272953033447, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16417094320058823, + "step": 13044 + }, + { + "epoch": 0.4076875, + "grad_norm": 2.953125, + "grad_norm_var": 0.05730794270833333, + "learning_rate": 0.0001, + "loss": 5.5693, + "loss/crossentropy": 2.452745795249939, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16634270548820496, + "step": 13046 + }, + { + "epoch": 0.40775, + "grad_norm": 3.8125, + "grad_norm_var": 0.073779296875, + "learning_rate": 0.0001, + "loss": 5.7333, + "loss/crossentropy": 2.4640876054763794, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17496556788682938, + "step": 13048 + }, + { + "epoch": 0.4078125, + "grad_norm": 3.28125, + "grad_norm_var": 0.12746988932291667, + "learning_rate": 0.0001, + "loss": 6.1964, + "loss/crossentropy": 2.707811117172241, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19143328070640564, + "step": 13050 + }, + { + "epoch": 0.407875, + "grad_norm": 3.171875, + "grad_norm_var": 0.13976236979166667, + "learning_rate": 0.0001, + "loss": 5.8796, + "loss/crossentropy": 2.644796133041382, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17309431731700897, + "step": 13052 + }, + { + "epoch": 0.4079375, + "grad_norm": 3.21875, + "grad_norm_var": 0.13677978515625, + "learning_rate": 0.0001, + "loss": 5.5999, + "loss/crossentropy": 2.475613832473755, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16711590439081192, + "step": 13054 + }, + { + "epoch": 0.408, + "grad_norm": 3.140625, + "grad_norm_var": 0.11959228515625, + "learning_rate": 0.0001, + "loss": 6.0233, + "loss/crossentropy": 2.7193907499313354, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.17452675104141235, + "step": 13056 + }, + { + "epoch": 0.4080625, + "grad_norm": 3.03125, + "grad_norm_var": 0.151953125, + "learning_rate": 0.0001, + "loss": 5.8764, + "loss/crossentropy": 2.55754816532135, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18032243102788925, + "step": 13058 + }, + { + "epoch": 0.408125, + "grad_norm": 3.28125, + "grad_norm_var": 0.14645894368489584, + "learning_rate": 0.0001, + "loss": 5.8501, + "loss/crossentropy": 2.5090755224227905, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18020044267177582, + "step": 13060 + }, + { + "epoch": 0.4081875, + "grad_norm": 3.25, + "grad_norm_var": 0.1266998291015625, + "learning_rate": 0.0001, + "loss": 5.6616, + "loss/crossentropy": 2.4366965293884277, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17405778169631958, + "step": 13062 + }, + { + "epoch": 0.40825, + "grad_norm": 3.453125, + "grad_norm_var": 0.11357014973958333, + "learning_rate": 0.0001, + "loss": 5.9129, + "loss/crossentropy": 2.611645460128784, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.18051308393478394, + "step": 13064 + }, + { + "epoch": 0.4083125, + "grad_norm": 3.15625, + "grad_norm_var": 0.053511555989583334, + "learning_rate": 0.0001, + "loss": 5.7801, + "loss/crossentropy": 2.5562463998794556, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17199376970529556, + "step": 13066 + }, + { + "epoch": 0.408375, + "grad_norm": 2.90625, + "grad_norm_var": 0.05780843098958333, + "learning_rate": 0.0001, + "loss": 5.6068, + "loss/crossentropy": 2.465224266052246, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1637713611125946, + "step": 13068 + }, + { + "epoch": 0.4084375, + "grad_norm": 3.203125, + "grad_norm_var": 0.0585601806640625, + "learning_rate": 0.0001, + "loss": 5.959, + "loss/crossentropy": 2.712372303009033, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.175831601023674, + "step": 13070 + }, + { + "epoch": 0.4085, + "grad_norm": 3.40625, + "grad_norm_var": 0.056428019205729166, + "learning_rate": 0.0001, + "loss": 6.1658, + "loss/crossentropy": 2.6972309350967407, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1906116008758545, + "step": 13072 + }, + { + "epoch": 0.4085625, + "grad_norm": 3.3125, + "grad_norm_var": 0.023014322916666666, + "learning_rate": 0.0001, + "loss": 6.0673, + "loss/crossentropy": 2.7165971994400024, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1835119128227234, + "step": 13074 + }, + { + "epoch": 0.408625, + "grad_norm": 3.390625, + "grad_norm_var": 0.0250640869140625, + "learning_rate": 0.0001, + "loss": 5.8489, + "loss/crossentropy": 2.573718786239624, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17438985407352448, + "step": 13076 + }, + { + "epoch": 0.4086875, + "grad_norm": 3.359375, + "grad_norm_var": 0.025275675455729167, + "learning_rate": 0.0001, + "loss": 6.0729, + "loss/crossentropy": 2.7444902658462524, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.18361777812242508, + "step": 13078 + }, + { + "epoch": 0.40875, + "grad_norm": 3.0, + "grad_norm_var": 0.02769775390625, + "learning_rate": 0.0001, + "loss": 5.6211, + "loss/crossentropy": 2.511841297149658, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16405534744262695, + "step": 13080 + }, + { + "epoch": 0.4088125, + "grad_norm": 4.59375, + "grad_norm_var": 0.14172770182291666, + "learning_rate": 0.0001, + "loss": 6.2488, + "loss/crossentropy": 2.8049396276474, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1885310783982277, + "step": 13082 + }, + { + "epoch": 0.408875, + "grad_norm": 3.484375, + "grad_norm_var": 0.12698160807291667, + "learning_rate": 0.0001, + "loss": 5.9688, + "loss/crossentropy": 2.6219193935394287, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17960664629936218, + "step": 13084 + }, + { + "epoch": 0.4089375, + "grad_norm": 3.484375, + "grad_norm_var": 0.12097880045572916, + "learning_rate": 0.0001, + "loss": 6.251, + "loss/crossentropy": 2.896095871925354, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1819765493273735, + "step": 13086 + }, + { + "epoch": 0.409, + "grad_norm": 3.265625, + "grad_norm_var": 0.13064676920572918, + "learning_rate": 0.0001, + "loss": 6.0631, + "loss/crossentropy": 2.682563066482544, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.183364599943161, + "step": 13088 + }, + { + "epoch": 0.4090625, + "grad_norm": 3.25, + "grad_norm_var": 0.13582255045572916, + "learning_rate": 0.0001, + "loss": 5.8131, + "loss/crossentropy": 2.4506473541259766, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1792103797197342, + "step": 13090 + }, + { + "epoch": 0.409125, + "grad_norm": 3.140625, + "grad_norm_var": 0.14191080729166666, + "learning_rate": 0.0001, + "loss": 6.05, + "loss/crossentropy": 2.672413468360901, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18463655561208725, + "step": 13092 + }, + { + "epoch": 0.4091875, + "grad_norm": 3.078125, + "grad_norm_var": 0.15699462890625, + "learning_rate": 0.0001, + "loss": 5.4334, + "loss/crossentropy": 2.3502501249313354, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16143521666526794, + "step": 13094 + }, + { + "epoch": 0.40925, + "grad_norm": 3.28125, + "grad_norm_var": 0.1360504150390625, + "learning_rate": 0.0001, + "loss": 5.8177, + "loss/crossentropy": 2.548230767250061, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1777261570096016, + "step": 13096 + }, + { + "epoch": 0.4093125, + "grad_norm": 3.375, + "grad_norm_var": 0.05133056640625, + "learning_rate": 0.0001, + "loss": 5.6474, + "loss/crossentropy": 2.5038857460021973, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16512799263000488, + "step": 13098 + }, + { + "epoch": 0.409375, + "grad_norm": 3.265625, + "grad_norm_var": 0.04692281087239583, + "learning_rate": 0.0001, + "loss": 5.8479, + "loss/crossentropy": 2.573588013648987, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17430279403924942, + "step": 13100 + }, + { + "epoch": 0.4094375, + "grad_norm": 3.828125, + "grad_norm_var": 0.06464436848958334, + "learning_rate": 0.0001, + "loss": 5.7626, + "loss/crossentropy": 2.4751532077789307, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17444677650928497, + "step": 13102 + }, + { + "epoch": 0.4095, + "grad_norm": 3.421875, + "grad_norm_var": 0.050837198893229164, + "learning_rate": 0.0001, + "loss": 5.7975, + "loss/crossentropy": 2.5101910829544067, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17795388400554657, + "step": 13104 + }, + { + "epoch": 0.4095625, + "grad_norm": 3.59375, + "grad_norm_var": 0.05718994140625, + "learning_rate": 0.0001, + "loss": 5.7223, + "loss/crossentropy": 2.4912636280059814, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1719299554824829, + "step": 13106 + }, + { + "epoch": 0.409625, + "grad_norm": 3.03125, + "grad_norm_var": 0.05935872395833333, + "learning_rate": 0.0001, + "loss": 5.8821, + "loss/crossentropy": 2.6286439895629883, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17495766282081604, + "step": 13108 + }, + { + "epoch": 0.4096875, + "grad_norm": 2.875, + "grad_norm_var": 0.0646484375, + "learning_rate": 0.0001, + "loss": 5.6972, + "loss/crossentropy": 2.5340648889541626, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.17100301384925842, + "step": 13110 + }, + { + "epoch": 0.40975, + "grad_norm": 3.15625, + "grad_norm_var": 0.06648661295572916, + "learning_rate": 0.0001, + "loss": 5.6622, + "loss/crossentropy": 2.534527063369751, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1682344302535057, + "step": 13112 + }, + { + "epoch": 0.4098125, + "grad_norm": 3.234375, + "grad_norm_var": 0.07119140625, + "learning_rate": 0.0001, + "loss": 5.444, + "loss/crossentropy": 2.3773000240325928, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15901631116867065, + "step": 13114 + }, + { + "epoch": 0.409875, + "grad_norm": 3.265625, + "grad_norm_var": 0.06913655598958333, + "learning_rate": 0.0001, + "loss": 5.7806, + "loss/crossentropy": 2.476312279701233, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.18160535395145416, + "step": 13116 + }, + { + "epoch": 0.4099375, + "grad_norm": 2.875, + "grad_norm_var": 0.051122029622395836, + "learning_rate": 0.0001, + "loss": 5.6636, + "loss/crossentropy": 2.5202556848526, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1658995822072029, + "step": 13118 + }, + { + "epoch": 0.41, + "grad_norm": 3.34375, + "grad_norm_var": 0.048746744791666664, + "learning_rate": 0.0001, + "loss": 5.9608, + "loss/crossentropy": 2.648427128791809, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18084564805030823, + "step": 13120 + }, + { + "epoch": 0.4100625, + "grad_norm": 3.359375, + "grad_norm_var": 0.034619140625, + "learning_rate": 0.0001, + "loss": 5.8285, + "loss/crossentropy": 2.50955867767334, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18032754957675934, + "step": 13122 + }, + { + "epoch": 0.410125, + "grad_norm": 3.4375, + "grad_norm_var": 0.03951416015625, + "learning_rate": 0.0001, + "loss": 5.8366, + "loss/crossentropy": 2.528256058692932, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.18278292566537857, + "step": 13124 + }, + { + "epoch": 0.4101875, + "grad_norm": 2.953125, + "grad_norm_var": 0.0361236572265625, + "learning_rate": 0.0001, + "loss": 5.7716, + "loss/crossentropy": 2.579535961151123, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17076754570007324, + "step": 13126 + }, + { + "epoch": 0.41025, + "grad_norm": 3.46875, + "grad_norm_var": 0.04003499348958333, + "learning_rate": 0.0001, + "loss": 5.9524, + "loss/crossentropy": 2.6309748888015747, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1805766299366951, + "step": 13128 + }, + { + "epoch": 0.4103125, + "grad_norm": 5.34375, + "grad_norm_var": 0.3024648030598958, + "learning_rate": 0.0001, + "loss": 6.2001, + "loss/crossentropy": 2.7218735218048096, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19196175783872604, + "step": 13130 + }, + { + "epoch": 0.410375, + "grad_norm": 3.40625, + "grad_norm_var": 0.30048828125, + "learning_rate": 0.0001, + "loss": 5.9113, + "loss/crossentropy": 2.632519841194153, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17670893669128418, + "step": 13132 + }, + { + "epoch": 0.4104375, + "grad_norm": 2.953125, + "grad_norm_var": 0.30227457682291664, + "learning_rate": 0.0001, + "loss": 5.7421, + "loss/crossentropy": 2.5946797132492065, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.17098701000213623, + "step": 13134 + }, + { + "epoch": 0.4105, + "grad_norm": 3.234375, + "grad_norm_var": 0.31245829264322916, + "learning_rate": 0.0001, + "loss": 5.7763, + "loss/crossentropy": 2.5710290670394897, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17091825604438782, + "step": 13136 + }, + { + "epoch": 0.4105625, + "grad_norm": 3.59375, + "grad_norm_var": 0.32151590983072914, + "learning_rate": 0.0001, + "loss": 6.1374, + "loss/crossentropy": 2.688127636909485, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19023503363132477, + "step": 13138 + }, + { + "epoch": 0.410625, + "grad_norm": 3.40625, + "grad_norm_var": 0.32194722493489586, + "learning_rate": 0.0001, + "loss": 5.8991, + "loss/crossentropy": 2.6207317113876343, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17588258534669876, + "step": 13140 + }, + { + "epoch": 0.4106875, + "grad_norm": 3.5625, + "grad_norm_var": 0.30554097493489585, + "learning_rate": 0.0001, + "loss": 5.9693, + "loss/crossentropy": 2.6185660362243652, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1819438636302948, + "step": 13142 + }, + { + "epoch": 0.41075, + "grad_norm": 3.484375, + "grad_norm_var": 0.3094716389973958, + "learning_rate": 0.0001, + "loss": 5.6599, + "loss/crossentropy": 2.396067976951599, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17325875163078308, + "step": 13144 + }, + { + "epoch": 0.4108125, + "grad_norm": 3.296875, + "grad_norm_var": 0.0578765869140625, + "learning_rate": 0.0001, + "loss": 6.0223, + "loss/crossentropy": 2.7240883111953735, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17708660662174225, + "step": 13146 + }, + { + "epoch": 0.410875, + "grad_norm": 3.015625, + "grad_norm_var": 0.06599934895833333, + "learning_rate": 0.0001, + "loss": 5.8849, + "loss/crossentropy": 2.707707166671753, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1681085005402565, + "step": 13148 + }, + { + "epoch": 0.4109375, + "grad_norm": 3.359375, + "grad_norm_var": 0.05093994140625, + "learning_rate": 0.0001, + "loss": 5.9366, + "loss/crossentropy": 2.665693759918213, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17747745662927628, + "step": 13150 + }, + { + "epoch": 0.411, + "grad_norm": 3.078125, + "grad_norm_var": 0.058137003580729166, + "learning_rate": 0.0001, + "loss": 5.3523, + "loss/crossentropy": 2.3129165172576904, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16018399596214294, + "step": 13152 + }, + { + "epoch": 0.4110625, + "grad_norm": 3.34375, + "grad_norm_var": 0.164404296875, + "learning_rate": 0.0001, + "loss": 5.7427, + "loss/crossentropy": 2.3883581161499023, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18270309269428253, + "step": 13154 + }, + { + "epoch": 0.411125, + "grad_norm": 3.1875, + "grad_norm_var": 0.16544596354166666, + "learning_rate": 0.0001, + "loss": 5.5893, + "loss/crossentropy": 2.4184921979904175, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1682557836174965, + "step": 13156 + }, + { + "epoch": 0.4111875, + "grad_norm": 4.0, + "grad_norm_var": 0.19062398274739584, + "learning_rate": 0.0001, + "loss": 5.793, + "loss/crossentropy": 2.558933734893799, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1710628792643547, + "step": 13158 + }, + { + "epoch": 0.41125, + "grad_norm": 3.34375, + "grad_norm_var": 0.18707275390625, + "learning_rate": 0.0001, + "loss": 5.912, + "loss/crossentropy": 2.6660226583480835, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17577102780342102, + "step": 13160 + }, + { + "epoch": 0.4113125, + "grad_norm": 3.390625, + "grad_norm_var": 0.19097900390625, + "learning_rate": 0.0001, + "loss": 5.8179, + "loss/crossentropy": 2.5855531692504883, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17089534550905228, + "step": 13162 + }, + { + "epoch": 0.411375, + "grad_norm": 3.359375, + "grad_norm_var": 0.18017578125, + "learning_rate": 0.0001, + "loss": 6.1325, + "loss/crossentropy": 2.7981022596359253, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1810910701751709, + "step": 13164 + }, + { + "epoch": 0.4114375, + "grad_norm": 4.75, + "grad_norm_var": 0.2921549479166667, + "learning_rate": 0.0001, + "loss": 6.0312, + "loss/crossentropy": 2.654681921005249, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1841331049799919, + "step": 13166 + }, + { + "epoch": 0.4115, + "grad_norm": 3.203125, + "grad_norm_var": 0.2732086181640625, + "learning_rate": 0.0001, + "loss": 5.6075, + "loss/crossentropy": 2.474108576774597, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16568316519260406, + "step": 13168 + }, + { + "epoch": 0.4115625, + "grad_norm": 2.96875, + "grad_norm_var": 0.18518473307291666, + "learning_rate": 0.0001, + "loss": 5.9398, + "loss/crossentropy": 2.6023415327072144, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17827922105789185, + "step": 13170 + }, + { + "epoch": 0.411625, + "grad_norm": 3.328125, + "grad_norm_var": 0.18005269368489582, + "learning_rate": 0.0001, + "loss": 6.2196, + "loss/crossentropy": 2.844622850418091, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1832045540213585, + "step": 13172 + }, + { + "epoch": 0.4116875, + "grad_norm": 3.265625, + "grad_norm_var": 0.15696512858072917, + "learning_rate": 0.0001, + "loss": 6.0967, + "loss/crossentropy": 2.647633671760559, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.19255923479795456, + "step": 13174 + }, + { + "epoch": 0.41175, + "grad_norm": 3.3125, + "grad_norm_var": 0.15987955729166667, + "learning_rate": 0.0001, + "loss": 5.9396, + "loss/crossentropy": 2.614232897758484, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.18292462080717087, + "step": 13176 + }, + { + "epoch": 0.4118125, + "grad_norm": 3.140625, + "grad_norm_var": 0.1589019775390625, + "learning_rate": 0.0001, + "loss": 5.5491, + "loss/crossentropy": 2.373845338821411, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16595923900604248, + "step": 13178 + }, + { + "epoch": 0.411875, + "grad_norm": 3.234375, + "grad_norm_var": 0.16457417805989583, + "learning_rate": 0.0001, + "loss": 5.872, + "loss/crossentropy": 2.5679699182510376, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17571911960840225, + "step": 13180 + }, + { + "epoch": 0.4119375, + "grad_norm": 3.328125, + "grad_norm_var": 0.023075358072916666, + "learning_rate": 0.0001, + "loss": 6.1259, + "loss/crossentropy": 2.7394293546676636, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18317710608243942, + "step": 13182 + }, + { + "epoch": 0.412, + "grad_norm": 3.328125, + "grad_norm_var": 0.022587076822916666, + "learning_rate": 0.0001, + "loss": 5.9327, + "loss/crossentropy": 2.6078226566314697, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.17662370204925537, + "step": 13184 + }, + { + "epoch": 0.4120625, + "grad_norm": 3.15625, + "grad_norm_var": 0.015913899739583334, + "learning_rate": 0.0001, + "loss": 5.8175, + "loss/crossentropy": 2.5503119230270386, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17554441094398499, + "step": 13186 + }, + { + "epoch": 0.412125, + "grad_norm": 3.515625, + "grad_norm_var": 0.018550618489583334, + "learning_rate": 0.0001, + "loss": 6.1154, + "loss/crossentropy": 2.7444063425064087, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18436866998672485, + "step": 13188 + }, + { + "epoch": 0.4121875, + "grad_norm": 3.3125, + "grad_norm_var": 0.016109212239583334, + "learning_rate": 0.0001, + "loss": 5.5464, + "loss/crossentropy": 2.332979202270508, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17055802047252655, + "step": 13190 + }, + { + "epoch": 0.41225, + "grad_norm": 3.09375, + "grad_norm_var": 0.01793212890625, + "learning_rate": 0.0001, + "loss": 5.7799, + "loss/crossentropy": 2.530150532722473, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17536704242229462, + "step": 13192 + }, + { + "epoch": 0.4123125, + "grad_norm": 3.1875, + "grad_norm_var": 0.017317708333333334, + "learning_rate": 0.0001, + "loss": 5.5503, + "loss/crossentropy": 2.3143259286880493, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17476805299520493, + "step": 13194 + }, + { + "epoch": 0.412375, + "grad_norm": 4.53125, + "grad_norm_var": 0.114111328125, + "learning_rate": 0.0001, + "loss": 6.0064, + "loss/crossentropy": 2.6239311695098877, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18238778412342072, + "step": 13196 + }, + { + "epoch": 0.4124375, + "grad_norm": 3.359375, + "grad_norm_var": 0.11399637858072917, + "learning_rate": 0.0001, + "loss": 6.1141, + "loss/crossentropy": 2.669206738471985, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.19214162975549698, + "step": 13198 + }, + { + "epoch": 0.4125, + "grad_norm": 3.078125, + "grad_norm_var": 0.11969401041666666, + "learning_rate": 0.0001, + "loss": 5.491, + "loss/crossentropy": 2.3696892261505127, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1629132702946663, + "step": 13200 + }, + { + "epoch": 0.4125625, + "grad_norm": 3.046875, + "grad_norm_var": 0.11653544108072916, + "learning_rate": 0.0001, + "loss": 5.5503, + "loss/crossentropy": 2.4425567388534546, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16076939553022385, + "step": 13202 + }, + { + "epoch": 0.412625, + "grad_norm": 3.125, + "grad_norm_var": 0.1280181884765625, + "learning_rate": 0.0001, + "loss": 5.6681, + "loss/crossentropy": 2.4140864610671997, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17501512169837952, + "step": 13204 + }, + { + "epoch": 0.4126875, + "grad_norm": 3.3125, + "grad_norm_var": 0.12833658854166666, + "learning_rate": 0.0001, + "loss": 5.7862, + "loss/crossentropy": 2.585092306137085, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16854554414749146, + "step": 13206 + }, + { + "epoch": 0.41275, + "grad_norm": 3.015625, + "grad_norm_var": 0.1355621337890625, + "learning_rate": 0.0001, + "loss": 5.9727, + "loss/crossentropy": 2.6454169750213623, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18116343766450882, + "step": 13208 + }, + { + "epoch": 0.4128125, + "grad_norm": 3.0625, + "grad_norm_var": 0.15442301432291666, + "learning_rate": 0.0001, + "loss": 5.6759, + "loss/crossentropy": 2.4061704874038696, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1718936413526535, + "step": 13210 + }, + { + "epoch": 0.412875, + "grad_norm": 3.453125, + "grad_norm_var": 0.06181538899739583, + "learning_rate": 0.0001, + "loss": 6.1976, + "loss/crossentropy": 2.7651203870773315, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.19011814892292023, + "step": 13212 + }, + { + "epoch": 0.4129375, + "grad_norm": 3.296875, + "grad_norm_var": 0.061930338541666664, + "learning_rate": 0.0001, + "loss": 5.996, + "loss/crossentropy": 2.701950192451477, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17784158140420914, + "step": 13214 + }, + { + "epoch": 0.413, + "grad_norm": 3.109375, + "grad_norm_var": 0.06635640462239584, + "learning_rate": 0.0001, + "loss": 5.6419, + "loss/crossentropy": 2.512939691543579, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1621171534061432, + "step": 13216 + }, + { + "epoch": 0.4130625, + "grad_norm": 3.59375, + "grad_norm_var": 0.07097066243489583, + "learning_rate": 0.0001, + "loss": 5.8349, + "loss/crossentropy": 2.527301073074341, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17724359780550003, + "step": 13218 + }, + { + "epoch": 0.413125, + "grad_norm": 3.40625, + "grad_norm_var": 0.057249959309895834, + "learning_rate": 0.0001, + "loss": 6.2121, + "loss/crossentropy": 2.728484869003296, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19289369881153107, + "step": 13220 + }, + { + "epoch": 0.4131875, + "grad_norm": 3.484375, + "grad_norm_var": 0.060770670572916664, + "learning_rate": 0.0001, + "loss": 5.9931, + "loss/crossentropy": 2.6184688806533813, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18395137786865234, + "step": 13222 + }, + { + "epoch": 0.41325, + "grad_norm": 3.1875, + "grad_norm_var": 0.06220296223958333, + "learning_rate": 0.0001, + "loss": 5.6879, + "loss/crossentropy": 2.5066728591918945, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16772903501987457, + "step": 13224 + }, + { + "epoch": 0.4133125, + "grad_norm": 3.21875, + "grad_norm_var": 0.04133199055989583, + "learning_rate": 0.0001, + "loss": 5.925, + "loss/crossentropy": 2.6199041604995728, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1742590218782425, + "step": 13226 + }, + { + "epoch": 0.413375, + "grad_norm": 3.484375, + "grad_norm_var": 0.03858133951822917, + "learning_rate": 0.0001, + "loss": 5.825, + "loss/crossentropy": 2.541438341140747, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17405883967876434, + "step": 13228 + }, + { + "epoch": 0.4134375, + "grad_norm": 3.171875, + "grad_norm_var": 0.04065348307291667, + "learning_rate": 0.0001, + "loss": 5.7935, + "loss/crossentropy": 2.605634570121765, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17074382305145264, + "step": 13230 + }, + { + "epoch": 0.4135, + "grad_norm": 3.1875, + "grad_norm_var": 0.03902587890625, + "learning_rate": 0.0001, + "loss": 6.3648, + "loss/crossentropy": 2.9099632501602173, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.19235942512750626, + "step": 13232 + }, + { + "epoch": 0.4135625, + "grad_norm": 3.03125, + "grad_norm_var": 0.03572489420572917, + "learning_rate": 0.0001, + "loss": 5.8881, + "loss/crossentropy": 2.6659469604492188, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17104150354862213, + "step": 13234 + }, + { + "epoch": 0.413625, + "grad_norm": 2.96875, + "grad_norm_var": 0.04006245930989583, + "learning_rate": 0.0001, + "loss": 5.4022, + "loss/crossentropy": 2.2921340465545654, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1602272465825081, + "step": 13236 + }, + { + "epoch": 0.4136875, + "grad_norm": 3.125, + "grad_norm_var": 0.03375244140625, + "learning_rate": 0.0001, + "loss": 5.7986, + "loss/crossentropy": 2.5155210494995117, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17713217437267303, + "step": 13238 + }, + { + "epoch": 0.41375, + "grad_norm": 3.15625, + "grad_norm_var": 0.02906494140625, + "learning_rate": 0.0001, + "loss": 5.7667, + "loss/crossentropy": 2.5459882020950317, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17519887536764145, + "step": 13240 + }, + { + "epoch": 0.4138125, + "grad_norm": 3.40625, + "grad_norm_var": 0.02672119140625, + "learning_rate": 0.0001, + "loss": 5.9134, + "loss/crossentropy": 2.598848342895508, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17911456525325775, + "step": 13242 + }, + { + "epoch": 0.413875, + "grad_norm": 3.359375, + "grad_norm_var": 0.026204427083333332, + "learning_rate": 0.0001, + "loss": 5.7157, + "loss/crossentropy": 2.5025155544281006, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1736644208431244, + "step": 13244 + }, + { + "epoch": 0.4139375, + "grad_norm": 3.0625, + "grad_norm_var": 0.0280426025390625, + "learning_rate": 0.0001, + "loss": 5.8131, + "loss/crossentropy": 2.613903760910034, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17187371850013733, + "step": 13246 + }, + { + "epoch": 0.414, + "grad_norm": 3.0625, + "grad_norm_var": 0.022150675455729168, + "learning_rate": 0.0001, + "loss": 5.5436, + "loss/crossentropy": 2.4482144117355347, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.162273108959198, + "step": 13248 + }, + { + "epoch": 0.4140625, + "grad_norm": 3.015625, + "grad_norm_var": 0.022411092122395834, + "learning_rate": 0.0001, + "loss": 5.9598, + "loss/crossentropy": 2.663753390312195, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17960526049137115, + "step": 13250 + }, + { + "epoch": 0.414125, + "grad_norm": 2.96875, + "grad_norm_var": 0.0251953125, + "learning_rate": 0.0001, + "loss": 5.511, + "loss/crossentropy": 2.4860671758651733, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15561538934707642, + "step": 13252 + }, + { + "epoch": 0.4141875, + "grad_norm": 3.078125, + "grad_norm_var": 0.020979817708333334, + "learning_rate": 0.0001, + "loss": 5.5277, + "loss/crossentropy": 2.4098732471466064, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1629585400223732, + "step": 13254 + }, + { + "epoch": 0.41425, + "grad_norm": 3.203125, + "grad_norm_var": 0.022998046875, + "learning_rate": 0.0001, + "loss": 6.0683, + "loss/crossentropy": 2.7509347200393677, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18095697462558746, + "step": 13256 + }, + { + "epoch": 0.4143125, + "grad_norm": 3.265625, + "grad_norm_var": 0.017463175455729167, + "learning_rate": 0.0001, + "loss": 5.9674, + "loss/crossentropy": 2.6672979593276978, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17844686657190323, + "step": 13258 + }, + { + "epoch": 0.414375, + "grad_norm": 3.265625, + "grad_norm_var": 0.015331013997395834, + "learning_rate": 0.0001, + "loss": 5.6512, + "loss/crossentropy": 2.4591602087020874, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16842012852430344, + "step": 13260 + }, + { + "epoch": 0.4144375, + "grad_norm": 3.265625, + "grad_norm_var": 0.019791666666666666, + "learning_rate": 0.0001, + "loss": 5.7192, + "loss/crossentropy": 2.5459213256835938, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16615993529558182, + "step": 13262 + }, + { + "epoch": 0.4145, + "grad_norm": 3.203125, + "grad_norm_var": 0.017039998372395834, + "learning_rate": 0.0001, + "loss": 5.7789, + "loss/crossentropy": 2.60288667678833, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1707262471318245, + "step": 13264 + }, + { + "epoch": 0.4145625, + "grad_norm": 3.046875, + "grad_norm_var": 0.017113240559895833, + "learning_rate": 0.0001, + "loss": 5.8823, + "loss/crossentropy": 2.6379607915878296, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1728760376572609, + "step": 13266 + }, + { + "epoch": 0.414625, + "grad_norm": 3.5625, + "grad_norm_var": 0.017464192708333333, + "learning_rate": 0.0001, + "loss": 6.1463, + "loss/crossentropy": 2.7377195358276367, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1869480088353157, + "step": 13268 + }, + { + "epoch": 0.4146875, + "grad_norm": 221249536.0, + "grad_norm_var": 3059459732353980.5, + "learning_rate": 0.0001, + "loss": 8.1439, + "loss/crossentropy": 2.4181610345840454, + "loss/hidden": 2.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.332338847219944, + "step": 13270 + }, + { + "epoch": 0.41475, + "grad_norm": 3.953125, + "grad_norm_var": 3059459730020489.5, + "learning_rate": 0.0001, + "loss": 6.1215, + "loss/crossentropy": 2.6964603662490845, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18469253927469254, + "step": 13272 + }, + { + "epoch": 0.4148125, + "grad_norm": 2.96875, + "grad_norm_var": 3059459729905255.0, + "learning_rate": 0.0001, + "loss": 5.6707, + "loss/crossentropy": 2.3926303386688232, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1719427853822708, + "step": 13274 + }, + { + "epoch": 0.414875, + "grad_norm": 3.1875, + "grad_norm_var": 3059459729501935.5, + "learning_rate": 0.0001, + "loss": 5.7719, + "loss/crossentropy": 2.4425740242004395, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17785116285085678, + "step": 13276 + }, + { + "epoch": 0.4149375, + "grad_norm": 3.109375, + "grad_norm_var": 3059459729790021.0, + "learning_rate": 0.0001, + "loss": 5.8638, + "loss/crossentropy": 2.6428544521331787, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17248062789440155, + "step": 13278 + }, + { + "epoch": 0.415, + "grad_norm": 3.5, + "grad_norm_var": 3059459729645978.5, + "learning_rate": 0.0001, + "loss": 6.1344, + "loss/crossentropy": 2.7529033422470093, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.18854503333568573, + "step": 13280 + }, + { + "epoch": 0.4150625, + "grad_norm": 3.296875, + "grad_norm_var": 3059459728954573.5, + "learning_rate": 0.0001, + "loss": 5.7005, + "loss/crossentropy": 2.411180257797241, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17502257227897644, + "step": 13282 + }, + { + "epoch": 0.415125, + "grad_norm": 5.09375, + "grad_norm_var": 3059459726966785.0, + "learning_rate": 0.0001, + "loss": 5.3624, + "loss/crossentropy": 2.238004684448242, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1600944846868515, + "step": 13284 + }, + { + "epoch": 0.4151875, + "grad_norm": 3.328125, + "grad_norm_var": 0.27325846354166666, + "learning_rate": 0.0001, + "loss": 5.8273, + "loss/crossentropy": 2.4858503341674805, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1786753088235855, + "step": 13286 + }, + { + "epoch": 0.41525, + "grad_norm": 3.125, + "grad_norm_var": 0.2527496337890625, + "learning_rate": 0.0001, + "loss": 5.8173, + "loss/crossentropy": 2.5401535034179688, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1765388399362564, + "step": 13288 + }, + { + "epoch": 0.4153125, + "grad_norm": 3.40625, + "grad_norm_var": 0.23998921712239582, + "learning_rate": 0.0001, + "loss": 5.8668, + "loss/crossentropy": 2.60434091091156, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17507880926132202, + "step": 13290 + }, + { + "epoch": 0.415375, + "grad_norm": 3.40625, + "grad_norm_var": 0.2409088134765625, + "learning_rate": 0.0001, + "loss": 5.8486, + "loss/crossentropy": 2.604510545730591, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17284392565488815, + "step": 13292 + }, + { + "epoch": 0.4154375, + "grad_norm": 4.53125, + "grad_norm_var": 0.3153635660807292, + "learning_rate": 0.0001, + "loss": 5.9829, + "loss/crossentropy": 2.625877618789673, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1810181587934494, + "step": 13294 + }, + { + "epoch": 0.4155, + "grad_norm": 3.703125, + "grad_norm_var": 0.3130279541015625, + "learning_rate": 0.0001, + "loss": 5.9803, + "loss/crossentropy": 2.6242889165878296, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18247459828853607, + "step": 13296 + }, + { + "epoch": 0.4155625, + "grad_norm": 3.046875, + "grad_norm_var": 0.3250071207682292, + "learning_rate": 0.0001, + "loss": 5.6778, + "loss/crossentropy": 2.4567201137542725, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17054059356451035, + "step": 13298 + }, + { + "epoch": 0.415625, + "grad_norm": 3.359375, + "grad_norm_var": 0.142041015625, + "learning_rate": 0.0001, + "loss": 5.6596, + "loss/crossentropy": 2.473236560821533, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16746355593204498, + "step": 13300 + }, + { + "epoch": 0.4156875, + "grad_norm": 3.234375, + "grad_norm_var": 0.14356180826822917, + "learning_rate": 0.0001, + "loss": 5.7374, + "loss/crossentropy": 2.4581050872802734, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.17090317606925964, + "step": 13302 + }, + { + "epoch": 0.41575, + "grad_norm": 4.09375, + "grad_norm_var": 0.333984375, + "learning_rate": 0.0001, + "loss": 6.5727, + "loss/crossentropy": 2.818481206893921, + "loss/hidden": 1.640625, + "loss/jsd": 0.0, + "loss/logits": 0.2113558128476143, + "step": 13304 + }, + { + "epoch": 0.4158125, + "grad_norm": 3.390625, + "grad_norm_var": 0.374267578125, + "learning_rate": 0.0001, + "loss": 5.9868, + "loss/crossentropy": 2.625677466392517, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18337451666593552, + "step": 13306 + }, + { + "epoch": 0.415875, + "grad_norm": 3.15625, + "grad_norm_var": 0.3784739176432292, + "learning_rate": 0.0001, + "loss": 5.8802, + "loss/crossentropy": 2.577824592590332, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17985156923532486, + "step": 13308 + }, + { + "epoch": 0.4159375, + "grad_norm": 3.046875, + "grad_norm_var": 0.34845377604166666, + "learning_rate": 0.0001, + "loss": 5.8672, + "loss/crossentropy": 2.6495500802993774, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17372021824121475, + "step": 13310 + }, + { + "epoch": 0.416, + "grad_norm": 3.265625, + "grad_norm_var": 0.34324544270833335, + "learning_rate": 0.0001, + "loss": 5.8939, + "loss/crossentropy": 2.5945329666137695, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17720364034175873, + "step": 13312 + }, + { + "epoch": 0.4160625, + "grad_norm": 3.265625, + "grad_norm_var": 0.3314127604166667, + "learning_rate": 0.0001, + "loss": 6.0797, + "loss/crossentropy": 2.772068738937378, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17959268391132355, + "step": 13314 + }, + { + "epoch": 0.416125, + "grad_norm": 3.578125, + "grad_norm_var": 0.32737528483072914, + "learning_rate": 0.0001, + "loss": 5.9208, + "loss/crossentropy": 2.5992554426193237, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17902611941099167, + "step": 13316 + }, + { + "epoch": 0.4161875, + "grad_norm": 3.234375, + "grad_norm_var": 0.3333160400390625, + "learning_rate": 0.0001, + "loss": 6.0433, + "loss/crossentropy": 2.6992595195770264, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18244589865207672, + "step": 13318 + }, + { + "epoch": 0.41625, + "grad_norm": 3.078125, + "grad_norm_var": 0.12014058430989584, + "learning_rate": 0.0001, + "loss": 6.0013, + "loss/crossentropy": 2.6596258878707886, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17987225949764252, + "step": 13320 + }, + { + "epoch": 0.4163125, + "grad_norm": 3.3125, + "grad_norm_var": 0.022835286458333333, + "learning_rate": 0.0001, + "loss": 5.7675, + "loss/crossentropy": 2.4878780841827393, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17562340945005417, + "step": 13322 + }, + { + "epoch": 0.416375, + "grad_norm": 3.09375, + "grad_norm_var": 0.024117024739583333, + "learning_rate": 0.0001, + "loss": 5.6204, + "loss/crossentropy": 2.3957300186157227, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1724710911512375, + "step": 13324 + }, + { + "epoch": 0.4164375, + "grad_norm": 3.609375, + "grad_norm_var": 0.030855305989583335, + "learning_rate": 0.0001, + "loss": 5.8249, + "loss/crossentropy": 2.5164222717285156, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17537520080804825, + "step": 13326 + }, + { + "epoch": 0.4165, + "grad_norm": 3.09375, + "grad_norm_var": 0.03662109375, + "learning_rate": 0.0001, + "loss": 5.6166, + "loss/crossentropy": 2.4366323947906494, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1652638539671898, + "step": 13328 + }, + { + "epoch": 0.4165625, + "grad_norm": 3.125, + "grad_norm_var": 0.04049072265625, + "learning_rate": 0.0001, + "loss": 5.9916, + "loss/crossentropy": 2.7603050470352173, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17508035898208618, + "step": 13330 + }, + { + "epoch": 0.416625, + "grad_norm": 3.375, + "grad_norm_var": 0.03472900390625, + "learning_rate": 0.0001, + "loss": 5.9878, + "loss/crossentropy": 2.6948909759521484, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1773415356874466, + "step": 13332 + }, + { + "epoch": 0.4166875, + "grad_norm": 3.078125, + "grad_norm_var": 0.0405426025390625, + "learning_rate": 0.0001, + "loss": 5.7725, + "loss/crossentropy": 2.5620919466018677, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1694791242480278, + "step": 13334 + }, + { + "epoch": 0.41675, + "grad_norm": 3.296875, + "grad_norm_var": 0.0394439697265625, + "learning_rate": 0.0001, + "loss": 5.5998, + "loss/crossentropy": 2.45719838142395, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16699153184890747, + "step": 13336 + }, + { + "epoch": 0.4168125, + "grad_norm": 3.703125, + "grad_norm_var": 0.053221638997395834, + "learning_rate": 0.0001, + "loss": 6.1197, + "loss/crossentropy": 2.694188714027405, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.188251793384552, + "step": 13338 + }, + { + "epoch": 0.416875, + "grad_norm": 3.1875, + "grad_norm_var": 0.05523681640625, + "learning_rate": 0.0001, + "loss": 6.0335, + "loss/crossentropy": 2.7140097618103027, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18156076222658157, + "step": 13340 + }, + { + "epoch": 0.4169375, + "grad_norm": 3.234375, + "grad_norm_var": 0.0443511962890625, + "learning_rate": 0.0001, + "loss": 6.0226, + "loss/crossentropy": 2.661650538444519, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18414033949375153, + "step": 13342 + }, + { + "epoch": 0.417, + "grad_norm": 3.046875, + "grad_norm_var": 0.0461334228515625, + "learning_rate": 0.0001, + "loss": 5.7235, + "loss/crossentropy": 2.478771209716797, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1717337816953659, + "step": 13344 + }, + { + "epoch": 0.4170625, + "grad_norm": 3.4375, + "grad_norm_var": 0.0412506103515625, + "learning_rate": 0.0001, + "loss": 5.8778, + "loss/crossentropy": 2.632124662399292, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17495673149824142, + "step": 13346 + }, + { + "epoch": 0.417125, + "grad_norm": 3.0625, + "grad_norm_var": 0.043115234375, + "learning_rate": 0.0001, + "loss": 6.0772, + "loss/crossentropy": 2.775555968284607, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17899131774902344, + "step": 13348 + }, + { + "epoch": 0.4171875, + "grad_norm": 3.1875, + "grad_norm_var": 0.042601521809895834, + "learning_rate": 0.0001, + "loss": 6.0198, + "loss/crossentropy": 2.7797752618789673, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1736101284623146, + "step": 13350 + }, + { + "epoch": 0.41725, + "grad_norm": 3.0625, + "grad_norm_var": 0.038863118489583334, + "learning_rate": 0.0001, + "loss": 5.4021, + "loss/crossentropy": 2.3128799200057983, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1608761101961136, + "step": 13352 + }, + { + "epoch": 0.4173125, + "grad_norm": 2.90625, + "grad_norm_var": 0.0328033447265625, + "learning_rate": 0.0001, + "loss": 5.488, + "loss/crossentropy": 2.4243158102035522, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15753886848688126, + "step": 13354 + }, + { + "epoch": 0.417375, + "grad_norm": 3.28125, + "grad_norm_var": 1.1357981363932292, + "learning_rate": 0.0001, + "loss": 6.0172, + "loss/crossentropy": 2.6375356912612915, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18289029598236084, + "step": 13356 + }, + { + "epoch": 0.4174375, + "grad_norm": 3.15625, + "grad_norm_var": 1.153076171875, + "learning_rate": 0.0001, + "loss": 5.5547, + "loss/crossentropy": 2.4211915731430054, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16296488046646118, + "step": 13358 + }, + { + "epoch": 0.4175, + "grad_norm": 3.125, + "grad_norm_var": 1.1563873291015625, + "learning_rate": 0.0001, + "loss": 5.7951, + "loss/crossentropy": 2.5717642307281494, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17428386211395264, + "step": 13360 + }, + { + "epoch": 0.4175625, + "grad_norm": 3.609375, + "grad_norm_var": 1.15572509765625, + "learning_rate": 0.0001, + "loss": 5.9661, + "loss/crossentropy": 2.6595340967178345, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.18261125683784485, + "step": 13362 + }, + { + "epoch": 0.417625, + "grad_norm": 3.015625, + "grad_norm_var": 1.1632232666015625, + "learning_rate": 0.0001, + "loss": 5.9179, + "loss/crossentropy": 2.6462502479553223, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17364852130413055, + "step": 13364 + }, + { + "epoch": 0.4176875, + "grad_norm": 3.25, + "grad_norm_var": 1.153343709309896, + "learning_rate": 0.0001, + "loss": 5.8498, + "loss/crossentropy": 2.6918177604675293, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16501691937446594, + "step": 13366 + }, + { + "epoch": 0.41775, + "grad_norm": 3.3125, + "grad_norm_var": 1.14615478515625, + "learning_rate": 0.0001, + "loss": 5.804, + "loss/crossentropy": 2.536333441734314, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17598675191402435, + "step": 13368 + }, + { + "epoch": 0.4178125, + "grad_norm": 3.234375, + "grad_norm_var": 1.122021484375, + "learning_rate": 0.0001, + "loss": 5.3244, + "loss/crossentropy": 2.227112650871277, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.15777958929538727, + "step": 13370 + }, + { + "epoch": 0.417875, + "grad_norm": 4.71875, + "grad_norm_var": 0.17509765625, + "learning_rate": 0.0001, + "loss": 5.9836, + "loss/crossentropy": 2.636080026626587, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.18553803116083145, + "step": 13372 + }, + { + "epoch": 0.4179375, + "grad_norm": 3.328125, + "grad_norm_var": 0.16098531087239584, + "learning_rate": 0.0001, + "loss": 5.9206, + "loss/crossentropy": 2.621453285217285, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17913124710321426, + "step": 13374 + }, + { + "epoch": 0.418, + "grad_norm": 3.265625, + "grad_norm_var": 0.15097249348958333, + "learning_rate": 0.0001, + "loss": 6.2312, + "loss/crossentropy": 2.796077847480774, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18843501061201096, + "step": 13376 + }, + { + "epoch": 0.4180625, + "grad_norm": 3.171875, + "grad_norm_var": 0.15077718098958334, + "learning_rate": 0.0001, + "loss": 5.9163, + "loss/crossentropy": 2.6079468727111816, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17770981043577194, + "step": 13378 + }, + { + "epoch": 0.418125, + "grad_norm": 3.390625, + "grad_norm_var": 0.13793843587239582, + "learning_rate": 0.0001, + "loss": 6.1217, + "loss/crossentropy": 2.783895492553711, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.181822769343853, + "step": 13380 + }, + { + "epoch": 0.4181875, + "grad_norm": 3.328125, + "grad_norm_var": 0.13452860514322917, + "learning_rate": 0.0001, + "loss": 5.8822, + "loss/crossentropy": 2.6041117906570435, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17976077646017075, + "step": 13382 + }, + { + "epoch": 0.41825, + "grad_norm": 3.1875, + "grad_norm_var": 0.1415679931640625, + "learning_rate": 0.0001, + "loss": 5.7405, + "loss/crossentropy": 2.4632670879364014, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17733129858970642, + "step": 13384 + }, + { + "epoch": 0.4183125, + "grad_norm": 3.0625, + "grad_norm_var": 0.14594624837239584, + "learning_rate": 0.0001, + "loss": 6.1735, + "loss/crossentropy": 2.747955083847046, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18903613835573196, + "step": 13386 + }, + { + "epoch": 0.418375, + "grad_norm": 3.578125, + "grad_norm_var": 0.034891764322916664, + "learning_rate": 0.0001, + "loss": 5.926, + "loss/crossentropy": 2.628903031349182, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1824403628706932, + "step": 13388 + }, + { + "epoch": 0.4184375, + "grad_norm": 6.25, + "grad_norm_var": 0.5953196207682292, + "learning_rate": 0.0001, + "loss": 5.9103, + "loss/crossentropy": 2.48799729347229, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.1840263083577156, + "step": 13390 + }, + { + "epoch": 0.4185, + "grad_norm": 3.078125, + "grad_norm_var": 0.6135243733723958, + "learning_rate": 0.0001, + "loss": 5.9143, + "loss/crossentropy": 2.6407530307769775, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17696483433246613, + "step": 13392 + }, + { + "epoch": 0.4185625, + "grad_norm": 3.8125, + "grad_norm_var": 0.61724853515625, + "learning_rate": 0.0001, + "loss": 5.8641, + "loss/crossentropy": 2.538280963897705, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17945598810911179, + "step": 13394 + }, + { + "epoch": 0.418625, + "grad_norm": 3.578125, + "grad_norm_var": 0.6085245768229167, + "learning_rate": 0.0001, + "loss": 5.8976, + "loss/crossentropy": 2.600240468978882, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17778422683477402, + "step": 13396 + }, + { + "epoch": 0.4186875, + "grad_norm": 3.296875, + "grad_norm_var": 0.6098866780598958, + "learning_rate": 0.0001, + "loss": 5.8737, + "loss/crossentropy": 2.6107362508773804, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17668740451335907, + "step": 13398 + }, + { + "epoch": 0.41875, + "grad_norm": 8.625, + "grad_norm_var": 2.196703084309896, + "learning_rate": 0.0001, + "loss": 5.7575, + "loss/crossentropy": 2.415872573852539, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.17674195766448975, + "step": 13400 + }, + { + "epoch": 0.4188125, + "grad_norm": 3.390625, + "grad_norm_var": 2.171947224934896, + "learning_rate": 0.0001, + "loss": 5.953, + "loss/crossentropy": 2.647102952003479, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17707321792840958, + "step": 13402 + }, + { + "epoch": 0.418875, + "grad_norm": 3.015625, + "grad_norm_var": 2.161229451497396, + "learning_rate": 0.0001, + "loss": 5.7712, + "loss/crossentropy": 2.5194567441940308, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17673862725496292, + "step": 13404 + }, + { + "epoch": 0.4189375, + "grad_norm": 3.046875, + "grad_norm_var": 1.7971588134765626, + "learning_rate": 0.0001, + "loss": 6.0203, + "loss/crossentropy": 2.7287405729293823, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17564278095960617, + "step": 13406 + }, + { + "epoch": 0.419, + "grad_norm": 3.234375, + "grad_norm_var": 1.8368448893229166, + "learning_rate": 0.0001, + "loss": 5.8314, + "loss/crossentropy": 2.470117211341858, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18105123937129974, + "step": 13408 + }, + { + "epoch": 0.4190625, + "grad_norm": 3.328125, + "grad_norm_var": 1.8598917643229167, + "learning_rate": 0.0001, + "loss": 5.6467, + "loss/crossentropy": 2.48598575592041, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1656765565276146, + "step": 13410 + }, + { + "epoch": 0.419125, + "grad_norm": 3.109375, + "grad_norm_var": 1.8889638264973958, + "learning_rate": 0.0001, + "loss": 5.4564, + "loss/crossentropy": 2.2907443046569824, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16578485071659088, + "step": 13412 + }, + { + "epoch": 0.4191875, + "grad_norm": 3.078125, + "grad_norm_var": 1.9190012613932292, + "learning_rate": 0.0001, + "loss": 5.713, + "loss/crossentropy": 2.5517455339431763, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16808252781629562, + "step": 13414 + }, + { + "epoch": 0.41925, + "grad_norm": 3.078125, + "grad_norm_var": 0.14417317708333333, + "learning_rate": 0.0001, + "loss": 5.7764, + "loss/crossentropy": 2.537129521369934, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17587868124246597, + "step": 13416 + }, + { + "epoch": 0.4193125, + "grad_norm": 3.25, + "grad_norm_var": 0.14276936848958333, + "learning_rate": 0.0001, + "loss": 5.7669, + "loss/crossentropy": 2.5112040042877197, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1700960397720337, + "step": 13418 + }, + { + "epoch": 0.419375, + "grad_norm": 3.25, + "grad_norm_var": 0.13907877604166666, + "learning_rate": 0.0001, + "loss": 5.8926, + "loss/crossentropy": 2.6581804752349854, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17187707126140594, + "step": 13420 + }, + { + "epoch": 0.4194375, + "grad_norm": 3.734375, + "grad_norm_var": 0.15826416015625, + "learning_rate": 0.0001, + "loss": 5.9235, + "loss/crossentropy": 2.537745952606201, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.17998237162828445, + "step": 13422 + }, + { + "epoch": 0.4195, + "grad_norm": 3.421875, + "grad_norm_var": 0.0461578369140625, + "learning_rate": 0.0001, + "loss": 5.7226, + "loss/crossentropy": 2.4406378269195557, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1789797767996788, + "step": 13424 + }, + { + "epoch": 0.4195625, + "grad_norm": 3.09375, + "grad_norm_var": 0.045849609375, + "learning_rate": 0.0001, + "loss": 5.6016, + "loss/crossentropy": 2.383212089538574, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1726173758506775, + "step": 13426 + }, + { + "epoch": 0.419625, + "grad_norm": 3.25, + "grad_norm_var": 0.04413655598958333, + "learning_rate": 0.0001, + "loss": 6.0146, + "loss/crossentropy": 2.7026796340942383, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1796249970793724, + "step": 13428 + }, + { + "epoch": 0.4196875, + "grad_norm": 3.4375, + "grad_norm_var": 0.0380767822265625, + "learning_rate": 0.0001, + "loss": 5.8797, + "loss/crossentropy": 2.6174843311309814, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17661139369010925, + "step": 13430 + }, + { + "epoch": 0.41975, + "grad_norm": 3.4375, + "grad_norm_var": 0.0279693603515625, + "learning_rate": 0.0001, + "loss": 5.9371, + "loss/crossentropy": 2.576013684272766, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18376551568508148, + "step": 13432 + }, + { + "epoch": 0.4198125, + "grad_norm": 3.09375, + "grad_norm_var": 0.03082275390625, + "learning_rate": 0.0001, + "loss": 5.7509, + "loss/crossentropy": 2.544191360473633, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17184053361415863, + "step": 13434 + }, + { + "epoch": 0.419875, + "grad_norm": 3.421875, + "grad_norm_var": 0.03209228515625, + "learning_rate": 0.0001, + "loss": 5.5981, + "loss/crossentropy": 2.3723961114883423, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17061317712068558, + "step": 13436 + }, + { + "epoch": 0.4199375, + "grad_norm": 3.46875, + "grad_norm_var": 0.018550618489583334, + "learning_rate": 0.0001, + "loss": 5.8003, + "loss/crossentropy": 2.6000717878341675, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1711946427822113, + "step": 13438 + }, + { + "epoch": 0.42, + "grad_norm": 3.328125, + "grad_norm_var": 0.017513020833333334, + "learning_rate": 0.0001, + "loss": 6.0101, + "loss/crossentropy": 2.6064292192459106, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18763728439807892, + "step": 13440 + }, + { + "epoch": 0.4200625, + "grad_norm": 3.703125, + "grad_norm_var": 0.024828084309895835, + "learning_rate": 0.0001, + "loss": 5.8854, + "loss/crossentropy": 2.6590187549591064, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17458895593881607, + "step": 13442 + }, + { + "epoch": 0.420125, + "grad_norm": 3.46875, + "grad_norm_var": 0.025191243489583334, + "learning_rate": 0.0001, + "loss": 5.918, + "loss/crossentropy": 2.607555866241455, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17752625793218613, + "step": 13444 + }, + { + "epoch": 0.4201875, + "grad_norm": 3.234375, + "grad_norm_var": 0.026195271809895834, + "learning_rate": 0.0001, + "loss": 5.7567, + "loss/crossentropy": 2.554731249809265, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1725441962480545, + "step": 13446 + }, + { + "epoch": 0.42025, + "grad_norm": 3.203125, + "grad_norm_var": 0.08779195149739584, + "learning_rate": 0.0001, + "loss": 5.9279, + "loss/crossentropy": 2.6013470888137817, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18148837983608246, + "step": 13448 + }, + { + "epoch": 0.4203125, + "grad_norm": 3.25, + "grad_norm_var": 0.08702799479166666, + "learning_rate": 0.0001, + "loss": 5.9229, + "loss/crossentropy": 2.625003695487976, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17979059368371964, + "step": 13450 + }, + { + "epoch": 0.420375, + "grad_norm": 2.921875, + "grad_norm_var": 0.1002593994140625, + "learning_rate": 0.0001, + "loss": 5.879, + "loss/crossentropy": 2.624898314476013, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.16915686428546906, + "step": 13452 + }, + { + "epoch": 0.4204375, + "grad_norm": 3.1875, + "grad_norm_var": 0.1020172119140625, + "learning_rate": 0.0001, + "loss": 5.7025, + "loss/crossentropy": 2.572711944580078, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1661021187901497, + "step": 13454 + }, + { + "epoch": 0.4205, + "grad_norm": 3.15625, + "grad_norm_var": 0.11406962076822917, + "learning_rate": 0.0001, + "loss": 5.6847, + "loss/crossentropy": 2.509779691696167, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1663188487291336, + "step": 13456 + }, + { + "epoch": 0.4205625, + "grad_norm": 2.953125, + "grad_norm_var": 0.11355692545572917, + "learning_rate": 0.0001, + "loss": 5.5047, + "loss/crossentropy": 2.4106862545013428, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1613505482673645, + "step": 13458 + }, + { + "epoch": 0.420625, + "grad_norm": 3.25, + "grad_norm_var": 0.11013895670572917, + "learning_rate": 0.0001, + "loss": 5.619, + "loss/crossentropy": 2.399704098701477, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1633327305316925, + "step": 13460 + }, + { + "epoch": 0.4206875, + "grad_norm": 3.28125, + "grad_norm_var": 0.10957743326822916, + "learning_rate": 0.0001, + "loss": 6.0299, + "loss/crossentropy": 2.7502713203430176, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17757584154605865, + "step": 13462 + }, + { + "epoch": 0.42075, + "grad_norm": 3.265625, + "grad_norm_var": 0.04217122395833333, + "learning_rate": 0.0001, + "loss": 5.9929, + "loss/crossentropy": 2.682234048843384, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18028826266527176, + "step": 13464 + }, + { + "epoch": 0.4208125, + "grad_norm": 3.28125, + "grad_norm_var": 0.0327789306640625, + "learning_rate": 0.0001, + "loss": 5.8913, + "loss/crossentropy": 2.646457552909851, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17252662032842636, + "step": 13466 + }, + { + "epoch": 0.420875, + "grad_norm": 3.859375, + "grad_norm_var": 0.0522857666015625, + "learning_rate": 0.0001, + "loss": 5.9075, + "loss/crossentropy": 2.5467541217803955, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.17982478439807892, + "step": 13468 + }, + { + "epoch": 0.4209375, + "grad_norm": 3.578125, + "grad_norm_var": 0.0559478759765625, + "learning_rate": 0.0001, + "loss": 6.1718, + "loss/crossentropy": 2.7906744480133057, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18498337268829346, + "step": 13470 + }, + { + "epoch": 0.421, + "grad_norm": 3.78125, + "grad_norm_var": 0.059723917643229166, + "learning_rate": 0.0001, + "loss": 6.0537, + "loss/crossentropy": 2.6973766088485718, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.18602560460567474, + "step": 13472 + }, + { + "epoch": 0.4210625, + "grad_norm": 3.359375, + "grad_norm_var": 0.044169108072916664, + "learning_rate": 0.0001, + "loss": 5.7862, + "loss/crossentropy": 2.57953679561615, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17145118862390518, + "step": 13474 + }, + { + "epoch": 0.421125, + "grad_norm": 3.109375, + "grad_norm_var": 0.05371805826822917, + "learning_rate": 0.0001, + "loss": 5.7416, + "loss/crossentropy": 2.556352138519287, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16735686361789703, + "step": 13476 + }, + { + "epoch": 0.4211875, + "grad_norm": 3.515625, + "grad_norm_var": 0.05446675618489583, + "learning_rate": 0.0001, + "loss": 6.0581, + "loss/crossentropy": 2.733359932899475, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18012653291225433, + "step": 13478 + }, + { + "epoch": 0.42125, + "grad_norm": 3.203125, + "grad_norm_var": 0.06277669270833333, + "learning_rate": 0.0001, + "loss": 6.0782, + "loss/crossentropy": 2.7694530487060547, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17970608174800873, + "step": 13480 + }, + { + "epoch": 0.4213125, + "grad_norm": 3.125, + "grad_norm_var": 0.06507161458333334, + "learning_rate": 0.0001, + "loss": 5.9773, + "loss/crossentropy": 2.702518939971924, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1763092428445816, + "step": 13482 + }, + { + "epoch": 0.421375, + "grad_norm": 3.5, + "grad_norm_var": 0.04702046712239583, + "learning_rate": 0.0001, + "loss": 5.7623, + "loss/crossentropy": 2.4824819564819336, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17329394072294235, + "step": 13484 + }, + { + "epoch": 0.4214375, + "grad_norm": 3.34375, + "grad_norm_var": 0.04290364583333333, + "learning_rate": 0.0001, + "loss": 5.6789, + "loss/crossentropy": 2.4971306324005127, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1693483293056488, + "step": 13486 + }, + { + "epoch": 0.4215, + "grad_norm": 3.390625, + "grad_norm_var": 0.02711181640625, + "learning_rate": 0.0001, + "loss": 6.0181, + "loss/crossentropy": 2.667072296142578, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18236906826496124, + "step": 13488 + }, + { + "epoch": 0.4215625, + "grad_norm": 3.375, + "grad_norm_var": 0.356689453125, + "learning_rate": 0.0001, + "loss": 5.7587, + "loss/crossentropy": 2.497342586517334, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17339970916509628, + "step": 13490 + }, + { + "epoch": 0.421625, + "grad_norm": 3.484375, + "grad_norm_var": 0.40539449055989585, + "learning_rate": 0.0001, + "loss": 5.9602, + "loss/crossentropy": 2.5742835998535156, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18351581692695618, + "step": 13492 + }, + { + "epoch": 0.4216875, + "grad_norm": 3.40625, + "grad_norm_var": 0.41343994140625, + "learning_rate": 0.0001, + "loss": 5.6152, + "loss/crossentropy": 2.4358010292053223, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.169895239174366, + "step": 13494 + }, + { + "epoch": 0.42175, + "grad_norm": 3.53125, + "grad_norm_var": 0.3828938802083333, + "learning_rate": 0.0001, + "loss": 6.0907, + "loss/crossentropy": 2.7139753103256226, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18298854678869247, + "step": 13496 + }, + { + "epoch": 0.4218125, + "grad_norm": 2.984375, + "grad_norm_var": 0.39010416666666664, + "learning_rate": 0.0001, + "loss": 5.7839, + "loss/crossentropy": 2.626868724822998, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16609349101781845, + "step": 13498 + }, + { + "epoch": 0.421875, + "grad_norm": 3.0625, + "grad_norm_var": 0.4120076497395833, + "learning_rate": 0.0001, + "loss": 5.9938, + "loss/crossentropy": 2.759929895401001, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17456068098545074, + "step": 13500 + }, + { + "epoch": 0.4219375, + "grad_norm": 3.46875, + "grad_norm_var": 0.4039998372395833, + "learning_rate": 0.0001, + "loss": 5.9093, + "loss/crossentropy": 2.6714051961898804, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17417631298303604, + "step": 13502 + }, + { + "epoch": 0.422, + "grad_norm": 3.15625, + "grad_norm_var": 0.41015218098958334, + "learning_rate": 0.0001, + "loss": 5.9514, + "loss/crossentropy": 2.6693791151046753, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17742042243480682, + "step": 13504 + }, + { + "epoch": 0.4220625, + "grad_norm": 3.28125, + "grad_norm_var": 0.11609700520833334, + "learning_rate": 0.0001, + "loss": 6.0173, + "loss/crossentropy": 2.7742620706558228, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17352618277072906, + "step": 13506 + }, + { + "epoch": 0.422125, + "grad_norm": 2.9375, + "grad_norm_var": 0.04129231770833333, + "learning_rate": 0.0001, + "loss": 5.8235, + "loss/crossentropy": 2.619171619415283, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17238953709602356, + "step": 13508 + }, + { + "epoch": 0.4221875, + "grad_norm": 3.0625, + "grad_norm_var": 0.04148661295572917, + "learning_rate": 0.0001, + "loss": 5.8389, + "loss/crossentropy": 2.6312637329101562, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17115691304206848, + "step": 13510 + }, + { + "epoch": 0.42225, + "grad_norm": 3.234375, + "grad_norm_var": 0.0308013916015625, + "learning_rate": 0.0001, + "loss": 5.9009, + "loss/crossentropy": 2.6351054906845093, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17657799273729324, + "step": 13512 + }, + { + "epoch": 0.4223125, + "grad_norm": 3.34375, + "grad_norm_var": 0.024430338541666666, + "learning_rate": 0.0001, + "loss": 5.832, + "loss/crossentropy": 2.595235824584961, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1748461276292801, + "step": 13514 + }, + { + "epoch": 0.422375, + "grad_norm": 3.421875, + "grad_norm_var": 0.026188151041666666, + "learning_rate": 0.0001, + "loss": 5.8888, + "loss/crossentropy": 2.6085236072540283, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1764683797955513, + "step": 13516 + }, + { + "epoch": 0.4224375, + "grad_norm": 3.125, + "grad_norm_var": 0.023779296875, + "learning_rate": 0.0001, + "loss": 5.5742, + "loss/crossentropy": 2.4499523639678955, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16671674698591232, + "step": 13518 + }, + { + "epoch": 0.4225, + "grad_norm": 2.875, + "grad_norm_var": 0.03146158854166667, + "learning_rate": 0.0001, + "loss": 5.4037, + "loss/crossentropy": 2.3098702430725098, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1605541855096817, + "step": 13520 + }, + { + "epoch": 0.4225625, + "grad_norm": 3.390625, + "grad_norm_var": 0.03340555826822917, + "learning_rate": 0.0001, + "loss": 5.9726, + "loss/crossentropy": 2.64448082447052, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1781274378299713, + "step": 13522 + }, + { + "epoch": 0.422625, + "grad_norm": 3.03125, + "grad_norm_var": 0.034821573893229166, + "learning_rate": 0.0001, + "loss": 5.6807, + "loss/crossentropy": 2.45465350151062, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17338386923074722, + "step": 13524 + }, + { + "epoch": 0.4226875, + "grad_norm": 3.390625, + "grad_norm_var": 0.032957967122395834, + "learning_rate": 0.0001, + "loss": 6.0195, + "loss/crossentropy": 2.72080659866333, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17518477141857147, + "step": 13526 + }, + { + "epoch": 0.42275, + "grad_norm": 3.09375, + "grad_norm_var": 0.03427734375, + "learning_rate": 0.0001, + "loss": 5.3524, + "loss/crossentropy": 2.244433879852295, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16235922276973724, + "step": 13528 + }, + { + "epoch": 0.4228125, + "grad_norm": 3.046875, + "grad_norm_var": 0.03691304524739583, + "learning_rate": 0.0001, + "loss": 6.0795, + "loss/crossentropy": 2.715553045272827, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1817065104842186, + "step": 13530 + }, + { + "epoch": 0.422875, + "grad_norm": 3.03125, + "grad_norm_var": 0.03583984375, + "learning_rate": 0.0001, + "loss": 5.6236, + "loss/crossentropy": 2.480729341506958, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16858422756195068, + "step": 13532 + }, + { + "epoch": 0.4229375, + "grad_norm": 3.421875, + "grad_norm_var": 0.04533589680989583, + "learning_rate": 0.0001, + "loss": 5.8396, + "loss/crossentropy": 2.5761055946350098, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17518050968647003, + "step": 13534 + }, + { + "epoch": 0.423, + "grad_norm": 3.1875, + "grad_norm_var": 0.044140625, + "learning_rate": 0.0001, + "loss": 5.7705, + "loss/crossentropy": 2.5773677825927734, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1689225435256958, + "step": 13536 + }, + { + "epoch": 0.4230625, + "grad_norm": 3.0625, + "grad_norm_var": 0.04312744140625, + "learning_rate": 0.0001, + "loss": 5.5529, + "loss/crossentropy": 2.4116233587265015, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16373953968286514, + "step": 13538 + }, + { + "epoch": 0.423125, + "grad_norm": 3.4375, + "grad_norm_var": 0.047118123372395834, + "learning_rate": 0.0001, + "loss": 6.0036, + "loss/crossentropy": 2.6386624574661255, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18571265786886215, + "step": 13540 + }, + { + "epoch": 0.4231875, + "grad_norm": 3.25, + "grad_norm_var": 0.04478759765625, + "learning_rate": 0.0001, + "loss": 5.9893, + "loss/crossentropy": 2.6201233863830566, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18535902351140976, + "step": 13542 + }, + { + "epoch": 0.42325, + "grad_norm": 3.5625, + "grad_norm_var": 0.05511067708333333, + "learning_rate": 0.0001, + "loss": 6.3525, + "loss/crossentropy": 2.8163682222366333, + "loss/hidden": 1.609375, + "loss/jsd": 0.0, + "loss/logits": 0.1926744133234024, + "step": 13544 + }, + { + "epoch": 0.4233125, + "grad_norm": 3.796875, + "grad_norm_var": 0.06813151041666667, + "learning_rate": 0.0001, + "loss": 6.1174, + "loss/crossentropy": 2.648329496383667, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19027109444141388, + "step": 13546 + }, + { + "epoch": 0.423375, + "grad_norm": 3.734375, + "grad_norm_var": 0.07858784993489583, + "learning_rate": 0.0001, + "loss": 5.7509, + "loss/crossentropy": 2.5591460466384888, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17151930928230286, + "step": 13548 + }, + { + "epoch": 0.4234375, + "grad_norm": 3.28125, + "grad_norm_var": 0.06431884765625, + "learning_rate": 0.0001, + "loss": 5.9321, + "loss/crossentropy": 2.6350157260894775, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17932235449552536, + "step": 13550 + }, + { + "epoch": 0.4235, + "grad_norm": 3.046875, + "grad_norm_var": 0.05556640625, + "learning_rate": 0.0001, + "loss": 5.7378, + "loss/crossentropy": 2.443291425704956, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17905839532613754, + "step": 13552 + }, + { + "epoch": 0.4235625, + "grad_norm": 3.65625, + "grad_norm_var": 0.11454976399739583, + "learning_rate": 0.0001, + "loss": 5.7831, + "loss/crossentropy": 2.4910799264907837, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17451247572898865, + "step": 13554 + }, + { + "epoch": 0.423625, + "grad_norm": 3.15625, + "grad_norm_var": 0.13596598307291666, + "learning_rate": 0.0001, + "loss": 5.5742, + "loss/crossentropy": 2.452877163887024, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1644795536994934, + "step": 13556 + }, + { + "epoch": 0.4236875, + "grad_norm": 2.953125, + "grad_norm_var": 0.15886942545572916, + "learning_rate": 0.0001, + "loss": 5.665, + "loss/crossentropy": 2.4376211166381836, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1696086823940277, + "step": 13558 + }, + { + "epoch": 0.42375, + "grad_norm": 3.265625, + "grad_norm_var": 0.15921122233072918, + "learning_rate": 0.0001, + "loss": 5.8832, + "loss/crossentropy": 2.618662476539612, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17449791729450226, + "step": 13560 + }, + { + "epoch": 0.4238125, + "grad_norm": 3.265625, + "grad_norm_var": 0.14875386555989584, + "learning_rate": 0.0001, + "loss": 5.8942, + "loss/crossentropy": 2.562021017074585, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.17422987520694733, + "step": 13562 + }, + { + "epoch": 0.423875, + "grad_norm": 3.625, + "grad_norm_var": 0.13917643229166668, + "learning_rate": 0.0001, + "loss": 6.0478, + "loss/crossentropy": 2.6333521604537964, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18987825512886047, + "step": 13564 + }, + { + "epoch": 0.4239375, + "grad_norm": 3.125, + "grad_norm_var": 0.14609375, + "learning_rate": 0.0001, + "loss": 5.9004, + "loss/crossentropy": 2.6945817470550537, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17018862068653107, + "step": 13566 + }, + { + "epoch": 0.424, + "grad_norm": 3.6875, + "grad_norm_var": 0.14438374837239584, + "learning_rate": 0.0001, + "loss": 5.7261, + "loss/crossentropy": 2.4810198545455933, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17333311587572098, + "step": 13568 + }, + { + "epoch": 0.4240625, + "grad_norm": 3.234375, + "grad_norm_var": 0.0650390625, + "learning_rate": 0.0001, + "loss": 5.8039, + "loss/crossentropy": 2.582778811454773, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17132743448019028, + "step": 13570 + }, + { + "epoch": 0.424125, + "grad_norm": 3.3125, + "grad_norm_var": 0.053132120768229166, + "learning_rate": 0.0001, + "loss": 5.9895, + "loss/crossentropy": 2.6580461263656616, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1827528029680252, + "step": 13572 + }, + { + "epoch": 0.4241875, + "grad_norm": 3.21875, + "grad_norm_var": 0.0322418212890625, + "learning_rate": 0.0001, + "loss": 5.5213, + "loss/crossentropy": 2.3348710536956787, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1674690619111061, + "step": 13574 + }, + { + "epoch": 0.42425, + "grad_norm": 2.734375, + "grad_norm_var": 0.05172526041666667, + "learning_rate": 0.0001, + "loss": 5.2827, + "loss/crossentropy": 2.324481725692749, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1469968482851982, + "step": 13576 + }, + { + "epoch": 0.4243125, + "grad_norm": 3.171875, + "grad_norm_var": 0.0509674072265625, + "learning_rate": 0.0001, + "loss": 5.7213, + "loss/crossentropy": 2.6081173419952393, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1660090982913971, + "step": 13578 + }, + { + "epoch": 0.424375, + "grad_norm": 3.203125, + "grad_norm_var": 0.04109598795572917, + "learning_rate": 0.0001, + "loss": 5.5337, + "loss/crossentropy": 2.4117451906204224, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16219858080148697, + "step": 13580 + }, + { + "epoch": 0.4244375, + "grad_norm": 3.828125, + "grad_norm_var": 0.06424051920572917, + "learning_rate": 0.0001, + "loss": 5.3408, + "loss/crossentropy": 2.2714133262634277, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15850047767162323, + "step": 13582 + }, + { + "epoch": 0.4245, + "grad_norm": 3.203125, + "grad_norm_var": 0.0598052978515625, + "learning_rate": 0.0001, + "loss": 5.9355, + "loss/crossentropy": 2.5937576293945312, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1818263828754425, + "step": 13584 + }, + { + "epoch": 0.4245625, + "grad_norm": 4.875, + "grad_norm_var": 0.2268951416015625, + "learning_rate": 0.0001, + "loss": 6.0587, + "loss/crossentropy": 2.6613736152648926, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1854352280497551, + "step": 13586 + }, + { + "epoch": 0.424625, + "grad_norm": 3.234375, + "grad_norm_var": 0.22893880208333334, + "learning_rate": 0.0001, + "loss": 5.7986, + "loss/crossentropy": 2.533471941947937, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17534331232309341, + "step": 13588 + }, + { + "epoch": 0.4246875, + "grad_norm": 3.171875, + "grad_norm_var": 0.22678629557291666, + "learning_rate": 0.0001, + "loss": 5.7789, + "loss/crossentropy": 2.5065789222717285, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1737145110964775, + "step": 13590 + }, + { + "epoch": 0.42475, + "grad_norm": 3.171875, + "grad_norm_var": 0.2024810791015625, + "learning_rate": 0.0001, + "loss": 5.8493, + "loss/crossentropy": 2.5877480506896973, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1745925396680832, + "step": 13592 + }, + { + "epoch": 0.4248125, + "grad_norm": 3.53125, + "grad_norm_var": 0.19434305826822917, + "learning_rate": 0.0001, + "loss": 5.9073, + "loss/crossentropy": 2.6121160984039307, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1759987249970436, + "step": 13594 + }, + { + "epoch": 0.424875, + "grad_norm": 4.59375, + "grad_norm_var": 0.2897420247395833, + "learning_rate": 0.0001, + "loss": 5.9084, + "loss/crossentropy": 2.6313605308532715, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17730838060379028, + "step": 13596 + }, + { + "epoch": 0.4249375, + "grad_norm": 3.125, + "grad_norm_var": 0.28772379557291666, + "learning_rate": 0.0001, + "loss": 5.9081, + "loss/crossentropy": 2.660796046257019, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17355673015117645, + "step": 13598 + }, + { + "epoch": 0.425, + "grad_norm": 3.578125, + "grad_norm_var": 0.28989969889322914, + "learning_rate": 0.0001, + "loss": 5.8516, + "loss/crossentropy": 2.5417560338974, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17981155961751938, + "step": 13600 + }, + { + "epoch": 0.4250625, + "grad_norm": 3.4375, + "grad_norm_var": 0.14089253743489583, + "learning_rate": 0.0001, + "loss": 5.9886, + "loss/crossentropy": 2.612521529197693, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18447883427143097, + "step": 13602 + }, + { + "epoch": 0.425125, + "grad_norm": 3.453125, + "grad_norm_var": 0.14060770670572917, + "learning_rate": 0.0001, + "loss": 5.7486, + "loss/crossentropy": 2.529898762702942, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.16757732629776, + "step": 13604 + }, + { + "epoch": 0.4251875, + "grad_norm": 3.359375, + "grad_norm_var": 0.13560791015625, + "learning_rate": 0.0001, + "loss": 5.9274, + "loss/crossentropy": 2.588602662086487, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17997673153877258, + "step": 13606 + }, + { + "epoch": 0.42525, + "grad_norm": 3.265625, + "grad_norm_var": 0.15074869791666667, + "learning_rate": 0.0001, + "loss": 5.9698, + "loss/crossentropy": 2.599562168121338, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18311382085084915, + "step": 13608 + }, + { + "epoch": 0.4253125, + "grad_norm": 3.1875, + "grad_norm_var": 0.14986063639322916, + "learning_rate": 0.0001, + "loss": 5.7757, + "loss/crossentropy": 2.4895899295806885, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17665836960077286, + "step": 13610 + }, + { + "epoch": 0.425375, + "grad_norm": 3.421875, + "grad_norm_var": 0.04376627604166667, + "learning_rate": 0.0001, + "loss": 5.9007, + "loss/crossentropy": 2.615707755088806, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17654824256896973, + "step": 13612 + }, + { + "epoch": 0.4254375, + "grad_norm": 3.359375, + "grad_norm_var": 0.0394683837890625, + "learning_rate": 0.0001, + "loss": 5.9093, + "loss/crossentropy": 2.6024194955825806, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17755922675132751, + "step": 13614 + }, + { + "epoch": 0.4255, + "grad_norm": 3.34375, + "grad_norm_var": 0.037007649739583336, + "learning_rate": 0.0001, + "loss": 5.3853, + "loss/crossentropy": 2.3523961305618286, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15563051402568817, + "step": 13616 + }, + { + "epoch": 0.4255625, + "grad_norm": 3.453125, + "grad_norm_var": 0.03515218098958333, + "learning_rate": 0.0001, + "loss": 5.8712, + "loss/crossentropy": 2.51068115234375, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18449315428733826, + "step": 13618 + }, + { + "epoch": 0.425625, + "grad_norm": 3.140625, + "grad_norm_var": 0.03655598958333333, + "learning_rate": 0.0001, + "loss": 5.9174, + "loss/crossentropy": 2.7071765661239624, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17102541029453278, + "step": 13620 + }, + { + "epoch": 0.4256875, + "grad_norm": 3.546875, + "grad_norm_var": 0.03996988932291667, + "learning_rate": 0.0001, + "loss": 6.1444, + "loss/crossentropy": 2.7265379428863525, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18710123747587204, + "step": 13622 + }, + { + "epoch": 0.42575, + "grad_norm": 3.078125, + "grad_norm_var": 0.020068359375, + "learning_rate": 0.0001, + "loss": 5.9351, + "loss/crossentropy": 2.638815402984619, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17767088115215302, + "step": 13624 + }, + { + "epoch": 0.4258125, + "grad_norm": 3.21875, + "grad_norm_var": 0.0225006103515625, + "learning_rate": 0.0001, + "loss": 5.8705, + "loss/crossentropy": 2.6077572107315063, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17237266153097153, + "step": 13626 + }, + { + "epoch": 0.425875, + "grad_norm": 3.078125, + "grad_norm_var": 0.026334635416666665, + "learning_rate": 0.0001, + "loss": 5.6404, + "loss/crossentropy": 2.4760019779205322, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16761603951454163, + "step": 13628 + }, + { + "epoch": 0.4259375, + "grad_norm": 3.390625, + "grad_norm_var": 0.0270660400390625, + "learning_rate": 0.0001, + "loss": 5.8097, + "loss/crossentropy": 2.5245094299316406, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17343804985284805, + "step": 13630 + }, + { + "epoch": 0.426, + "grad_norm": 3.109375, + "grad_norm_var": 0.026764933268229166, + "learning_rate": 0.0001, + "loss": 5.7181, + "loss/crossentropy": 2.5386550426483154, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1714557781815529, + "step": 13632 + }, + { + "epoch": 0.4260625, + "grad_norm": 3.34375, + "grad_norm_var": 0.027123006184895833, + "learning_rate": 0.0001, + "loss": 5.4877, + "loss/crossentropy": 2.36991286277771, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1641227975487709, + "step": 13634 + }, + { + "epoch": 0.426125, + "grad_norm": 3.25, + "grad_norm_var": 0.0275787353515625, + "learning_rate": 0.0001, + "loss": 5.488, + "loss/crossentropy": 2.3451461791992188, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1635008379817009, + "step": 13636 + }, + { + "epoch": 0.4261875, + "grad_norm": 3.046875, + "grad_norm_var": 0.023021443684895834, + "learning_rate": 0.0001, + "loss": 5.7511, + "loss/crossentropy": 2.6217762231826782, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16527977585792542, + "step": 13638 + }, + { + "epoch": 0.42625, + "grad_norm": 3.0625, + "grad_norm_var": 0.02301025390625, + "learning_rate": 0.0001, + "loss": 5.3194, + "loss/crossentropy": 2.2426563501358032, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16235796362161636, + "step": 13640 + }, + { + "epoch": 0.4263125, + "grad_norm": 3.09375, + "grad_norm_var": 0.01685791015625, + "learning_rate": 0.0001, + "loss": 5.8931, + "loss/crossentropy": 2.6649584770202637, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17476899176836014, + "step": 13642 + }, + { + "epoch": 0.426375, + "grad_norm": 3.5625, + "grad_norm_var": 0.0351226806640625, + "learning_rate": 0.0001, + "loss": 5.7322, + "loss/crossentropy": 2.4845662117004395, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17242056876420975, + "step": 13644 + }, + { + "epoch": 0.4264375, + "grad_norm": 3.375, + "grad_norm_var": 0.03449605305989583, + "learning_rate": 0.0001, + "loss": 5.7222, + "loss/crossentropy": 2.536791205406189, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16971704363822937, + "step": 13646 + }, + { + "epoch": 0.4265, + "grad_norm": 3.09375, + "grad_norm_var": 0.034566243489583336, + "learning_rate": 0.0001, + "loss": 5.5455, + "loss/crossentropy": 2.3979681730270386, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1678829789161682, + "step": 13648 + }, + { + "epoch": 0.4265625, + "grad_norm": 3.109375, + "grad_norm_var": 0.03430989583333333, + "learning_rate": 0.0001, + "loss": 5.6397, + "loss/crossentropy": 2.4746124744415283, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1704118475317955, + "step": 13650 + }, + { + "epoch": 0.426625, + "grad_norm": 3.25, + "grad_norm_var": 0.03469645182291667, + "learning_rate": 0.0001, + "loss": 5.5564, + "loss/crossentropy": 2.4087108373641968, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16281092911958694, + "step": 13652 + }, + { + "epoch": 0.4266875, + "grad_norm": 3.328125, + "grad_norm_var": 0.03352762858072917, + "learning_rate": 0.0001, + "loss": 5.5027, + "loss/crossentropy": 2.326142907142639, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1660897359251976, + "step": 13654 + }, + { + "epoch": 0.42675, + "grad_norm": 3.390625, + "grad_norm_var": 0.033421834309895836, + "learning_rate": 0.0001, + "loss": 5.7803, + "loss/crossentropy": 2.5783458948135376, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16863028705120087, + "step": 13656 + }, + { + "epoch": 0.4268125, + "grad_norm": 3.125, + "grad_norm_var": 0.06815999348958333, + "learning_rate": 0.0001, + "loss": 5.732, + "loss/crossentropy": 2.4865012168884277, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17220691591501236, + "step": 13658 + }, + { + "epoch": 0.426875, + "grad_norm": 3.0625, + "grad_norm_var": 0.060212198893229166, + "learning_rate": 0.0001, + "loss": 5.7956, + "loss/crossentropy": 2.5870524644851685, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16968437284231186, + "step": 13660 + }, + { + "epoch": 0.4269375, + "grad_norm": 3.6875, + "grad_norm_var": 0.10939127604166667, + "learning_rate": 0.0001, + "loss": 6.2719, + "loss/crossentropy": 2.813697934150696, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.18722982704639435, + "step": 13662 + }, + { + "epoch": 0.427, + "grad_norm": 3.40625, + "grad_norm_var": 0.10535380045572916, + "learning_rate": 0.0001, + "loss": 5.8451, + "loss/crossentropy": 2.5655994415283203, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17716550827026367, + "step": 13664 + }, + { + "epoch": 0.4270625, + "grad_norm": 3.625, + "grad_norm_var": 0.1028961181640625, + "learning_rate": 0.0001, + "loss": 5.9091, + "loss/crossentropy": 2.5500409603118896, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1835578829050064, + "step": 13666 + }, + { + "epoch": 0.427125, + "grad_norm": 3.53125, + "grad_norm_var": 0.09569905598958334, + "learning_rate": 0.0001, + "loss": 6.0471, + "loss/crossentropy": 2.759128451347351, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1768459901213646, + "step": 13668 + }, + { + "epoch": 0.4271875, + "grad_norm": 3.234375, + "grad_norm_var": 0.09733072916666667, + "learning_rate": 0.0001, + "loss": 5.8049, + "loss/crossentropy": 2.5630141496658325, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17458105832338333, + "step": 13670 + }, + { + "epoch": 0.42725, + "grad_norm": 3.171875, + "grad_norm_var": 0.10494384765625, + "learning_rate": 0.0001, + "loss": 5.4886, + "loss/crossentropy": 2.2683770656585693, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.16693967580795288, + "step": 13672 + }, + { + "epoch": 0.4273125, + "grad_norm": 3.921875, + "grad_norm_var": 0.1036529541015625, + "learning_rate": 0.0001, + "loss": 5.9961, + "loss/crossentropy": 2.6692570447921753, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18072886765003204, + "step": 13674 + }, + { + "epoch": 0.427375, + "grad_norm": 3.4375, + "grad_norm_var": 0.0935455322265625, + "learning_rate": 0.0001, + "loss": 5.93, + "loss/crossentropy": 2.6348601579666138, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17873502522706985, + "step": 13676 + }, + { + "epoch": 0.4274375, + "grad_norm": 2.875, + "grad_norm_var": 0.08173421223958334, + "learning_rate": 0.0001, + "loss": 5.6902, + "loss/crossentropy": 2.5685064792633057, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16686025261878967, + "step": 13678 + }, + { + "epoch": 0.4275, + "grad_norm": 3.515625, + "grad_norm_var": 0.0789459228515625, + "learning_rate": 0.0001, + "loss": 5.6396, + "loss/crossentropy": 2.4428157806396484, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.168898805975914, + "step": 13680 + }, + { + "epoch": 0.4275625, + "grad_norm": 3.171875, + "grad_norm_var": 0.07375386555989584, + "learning_rate": 0.0001, + "loss": 6.0415, + "loss/crossentropy": 2.750417947769165, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17598801851272583, + "step": 13682 + }, + { + "epoch": 0.427625, + "grad_norm": 3.140625, + "grad_norm_var": 0.07376302083333333, + "learning_rate": 0.0001, + "loss": 5.6226, + "loss/crossentropy": 2.4173182249069214, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17052601277828217, + "step": 13684 + }, + { + "epoch": 0.4276875, + "grad_norm": 3.125, + "grad_norm_var": 0.0793609619140625, + "learning_rate": 0.0001, + "loss": 5.9497, + "loss/crossentropy": 2.714414358139038, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17587494105100632, + "step": 13686 + }, + { + "epoch": 0.42775, + "grad_norm": 3.09375, + "grad_norm_var": 0.060302734375, + "learning_rate": 0.0001, + "loss": 5.7912, + "loss/crossentropy": 2.5054643154144287, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17935198545455933, + "step": 13688 + }, + { + "epoch": 0.4278125, + "grad_norm": 3.125, + "grad_norm_var": 0.22333882649739584, + "learning_rate": 0.0001, + "loss": 5.4972, + "loss/crossentropy": 2.2655035257339478, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.16731233149766922, + "step": 13690 + }, + { + "epoch": 0.427875, + "grad_norm": 3.3125, + "grad_norm_var": 0.22379150390625, + "learning_rate": 0.0001, + "loss": 6.0416, + "loss/crossentropy": 2.7287049293518066, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17933955043554306, + "step": 13692 + }, + { + "epoch": 0.4279375, + "grad_norm": 3.375, + "grad_norm_var": 0.21106363932291666, + "learning_rate": 0.0001, + "loss": 5.8759, + "loss/crossentropy": 2.5507595539093018, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18017400801181793, + "step": 13694 + }, + { + "epoch": 0.428, + "grad_norm": 3.09375, + "grad_norm_var": 0.21228841145833333, + "learning_rate": 0.0001, + "loss": 5.5248, + "loss/crossentropy": 2.3603193759918213, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16996591538190842, + "step": 13696 + }, + { + "epoch": 0.4280625, + "grad_norm": 3.359375, + "grad_norm_var": 0.21627197265625, + "learning_rate": 0.0001, + "loss": 5.6145, + "loss/crossentropy": 2.377763271331787, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.172113299369812, + "step": 13698 + }, + { + "epoch": 0.428125, + "grad_norm": 3.03125, + "grad_norm_var": 0.22115885416666667, + "learning_rate": 0.0001, + "loss": 5.6002, + "loss/crossentropy": 2.4868147373199463, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16211627423763275, + "step": 13700 + }, + { + "epoch": 0.4281875, + "grad_norm": 2.8125, + "grad_norm_var": 0.2318267822265625, + "learning_rate": 0.0001, + "loss": 5.4648, + "loss/crossentropy": 2.353792190551758, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16656972467899323, + "step": 13702 + }, + { + "epoch": 0.42825, + "grad_norm": 3.203125, + "grad_norm_var": 0.23946024576822916, + "learning_rate": 0.0001, + "loss": 5.869, + "loss/crossentropy": 2.623986840248108, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1741149052977562, + "step": 13704 + }, + { + "epoch": 0.4283125, + "grad_norm": 3.125, + "grad_norm_var": 0.024869791666666665, + "learning_rate": 0.0001, + "loss": 6.2232, + "loss/crossentropy": 2.8240978717803955, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.186787448823452, + "step": 13706 + }, + { + "epoch": 0.428375, + "grad_norm": 3.203125, + "grad_norm_var": 0.0230865478515625, + "learning_rate": 0.0001, + "loss": 5.7839, + "loss/crossentropy": 2.646933913230896, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16642916202545166, + "step": 13708 + }, + { + "epoch": 0.4284375, + "grad_norm": 3.21875, + "grad_norm_var": 0.0195465087890625, + "learning_rate": 0.0001, + "loss": 5.8334, + "loss/crossentropy": 2.564917206764221, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1764616221189499, + "step": 13710 + }, + { + "epoch": 0.4285, + "grad_norm": 3.40625, + "grad_norm_var": 0.024388631184895832, + "learning_rate": 0.0001, + "loss": 6.0885, + "loss/crossentropy": 2.7025904655456543, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18312708288431168, + "step": 13712 + }, + { + "epoch": 0.4285625, + "grad_norm": 3.78125, + "grad_norm_var": 0.050455729166666664, + "learning_rate": 0.0001, + "loss": 6.1528, + "loss/crossentropy": 2.6861952543258667, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.19236107915639877, + "step": 13714 + }, + { + "epoch": 0.428625, + "grad_norm": 3.171875, + "grad_norm_var": 0.04816792805989583, + "learning_rate": 0.0001, + "loss": 5.8196, + "loss/crossentropy": 2.617398262023926, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16943687200546265, + "step": 13716 + }, + { + "epoch": 0.4286875, + "grad_norm": 3.859375, + "grad_norm_var": 0.06177978515625, + "learning_rate": 0.0001, + "loss": 5.4724, + "loss/crossentropy": 2.3168063163757324, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16361089795827866, + "step": 13718 + }, + { + "epoch": 0.42875, + "grad_norm": 3.4375, + "grad_norm_var": 0.05182291666666667, + "learning_rate": 0.0001, + "loss": 6.1111, + "loss/crossentropy": 2.6841464042663574, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18801257759332657, + "step": 13720 + }, + { + "epoch": 0.4288125, + "grad_norm": 3.078125, + "grad_norm_var": 0.05181884765625, + "learning_rate": 0.0001, + "loss": 5.8708, + "loss/crossentropy": 2.6361746788024902, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17190248519182205, + "step": 13722 + }, + { + "epoch": 0.428875, + "grad_norm": 3.515625, + "grad_norm_var": 0.052632649739583336, + "learning_rate": 0.0001, + "loss": 5.9087, + "loss/crossentropy": 2.6201218366622925, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1761198490858078, + "step": 13724 + }, + { + "epoch": 0.4289375, + "grad_norm": 3.171875, + "grad_norm_var": 0.05117085774739583, + "learning_rate": 0.0001, + "loss": 5.8039, + "loss/crossentropy": 2.626819133758545, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1688845530152321, + "step": 13726 + }, + { + "epoch": 0.429, + "grad_norm": 13.0, + "grad_norm_var": 5.851806640625, + "learning_rate": 0.0001, + "loss": 6.1558, + "loss/crossentropy": 2.5434751510620117, + "loss/hidden": 1.625, + "loss/jsd": 0.0, + "loss/logits": 0.19873183220624924, + "step": 13728 + }, + { + "epoch": 0.4290625, + "grad_norm": 3.15625, + "grad_norm_var": 5.901416015625, + "learning_rate": 0.0001, + "loss": 5.6204, + "loss/crossentropy": 2.4339829683303833, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1713765189051628, + "step": 13730 + }, + { + "epoch": 0.429125, + "grad_norm": 3.734375, + "grad_norm_var": 5.85650634765625, + "learning_rate": 0.0001, + "loss": 6.4488, + "loss/crossentropy": 2.9593294858932495, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19269901514053345, + "step": 13732 + }, + { + "epoch": 0.4291875, + "grad_norm": 3.375, + "grad_norm_var": 5.8624013264973955, + "learning_rate": 0.0001, + "loss": 5.7938, + "loss/crossentropy": 2.544976592063904, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17527136206626892, + "step": 13734 + }, + { + "epoch": 0.42925, + "grad_norm": 3.203125, + "grad_norm_var": 5.8963368733723955, + "learning_rate": 0.0001, + "loss": 5.7754, + "loss/crossentropy": 2.520832061767578, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17546138912439346, + "step": 13736 + }, + { + "epoch": 0.4293125, + "grad_norm": 3.1875, + "grad_norm_var": 5.9157053629557295, + "learning_rate": 0.0001, + "loss": 5.6156, + "loss/crossentropy": 2.471334457397461, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1659916192293167, + "step": 13738 + }, + { + "epoch": 0.429375, + "grad_norm": 3.90625, + "grad_norm_var": 5.92720947265625, + "learning_rate": 0.0001, + "loss": 5.8394, + "loss/crossentropy": 2.611321210861206, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17359286546707153, + "step": 13740 + }, + { + "epoch": 0.4294375, + "grad_norm": 3.375, + "grad_norm_var": 5.902311197916666, + "learning_rate": 0.0001, + "loss": 5.7597, + "loss/crossentropy": 2.5149848461151123, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17017000913619995, + "step": 13742 + }, + { + "epoch": 0.4295, + "grad_norm": 5.28125, + "grad_norm_var": 0.31319071451822916, + "learning_rate": 0.0001, + "loss": 5.6272, + "loss/crossentropy": 2.2962993383407593, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.17684400081634521, + "step": 13744 + }, + { + "epoch": 0.4295625, + "grad_norm": 3.640625, + "grad_norm_var": 0.3112701416015625, + "learning_rate": 0.0001, + "loss": 5.9082, + "loss/crossentropy": 2.5763291120529175, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1777186393737793, + "step": 13746 + }, + { + "epoch": 0.429625, + "grad_norm": 3.359375, + "grad_norm_var": 0.330859375, + "learning_rate": 0.0001, + "loss": 5.8902, + "loss/crossentropy": 2.600212574005127, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17899896204471588, + "step": 13748 + }, + { + "epoch": 0.4296875, + "grad_norm": 3.328125, + "grad_norm_var": 0.328955078125, + "learning_rate": 0.0001, + "loss": 5.7874, + "loss/crossentropy": 2.4550464153289795, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17659784853458405, + "step": 13750 + }, + { + "epoch": 0.42975, + "grad_norm": 3.5, + "grad_norm_var": 0.325732421875, + "learning_rate": 0.0001, + "loss": 5.7718, + "loss/crossentropy": 2.4952975511550903, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17569301277399063, + "step": 13752 + }, + { + "epoch": 0.4298125, + "grad_norm": 3.21875, + "grad_norm_var": 0.32275390625, + "learning_rate": 0.0001, + "loss": 5.5028, + "loss/crossentropy": 2.33465039730072, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16642434149980545, + "step": 13754 + }, + { + "epoch": 0.429875, + "grad_norm": 3.390625, + "grad_norm_var": 0.29573160807291665, + "learning_rate": 0.0001, + "loss": 5.9297, + "loss/crossentropy": 2.6052504777908325, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1777571439743042, + "step": 13756 + }, + { + "epoch": 0.4299375, + "grad_norm": 3.40625, + "grad_norm_var": 0.2967437744140625, + "learning_rate": 0.0001, + "loss": 5.8089, + "loss/crossentropy": 2.5966118574142456, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17435576766729355, + "step": 13758 + }, + { + "epoch": 0.43, + "grad_norm": 3.171875, + "grad_norm_var": 0.056591796875, + "learning_rate": 0.0001, + "loss": 5.8061, + "loss/crossentropy": 2.5896633863449097, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17164431512355804, + "step": 13760 + }, + { + "epoch": 0.4300625, + "grad_norm": 3.25, + "grad_norm_var": 0.053120930989583336, + "learning_rate": 0.0001, + "loss": 5.6903, + "loss/crossentropy": 2.4845727682113647, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1697913035750389, + "step": 13762 + }, + { + "epoch": 0.430125, + "grad_norm": 3.1875, + "grad_norm_var": 0.024007161458333332, + "learning_rate": 0.0001, + "loss": 5.3803, + "loss/crossentropy": 2.3315229415893555, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1548776477575302, + "step": 13764 + }, + { + "epoch": 0.4301875, + "grad_norm": 4.21875, + "grad_norm_var": 0.08062235514322917, + "learning_rate": 0.0001, + "loss": 5.8792, + "loss/crossentropy": 2.590985655784607, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.17296196520328522, + "step": 13766 + }, + { + "epoch": 0.43025, + "grad_norm": 3.984375, + "grad_norm_var": 0.11121317545572916, + "learning_rate": 0.0001, + "loss": 6.0749, + "loss/crossentropy": 2.653734803199768, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18586856126785278, + "step": 13768 + }, + { + "epoch": 0.4303125, + "grad_norm": 4.125, + "grad_norm_var": 0.15481770833333333, + "learning_rate": 0.0001, + "loss": 5.9805, + "loss/crossentropy": 2.76421320438385, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16967318952083588, + "step": 13770 + }, + { + "epoch": 0.430375, + "grad_norm": 3.375, + "grad_norm_var": 0.15419514973958334, + "learning_rate": 0.0001, + "loss": 5.8072, + "loss/crossentropy": 2.493858814239502, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.17352117598056793, + "step": 13772 + }, + { + "epoch": 0.4304375, + "grad_norm": 3.3125, + "grad_norm_var": 0.20327046712239583, + "learning_rate": 0.0001, + "loss": 6.1747, + "loss/crossentropy": 2.7048277854919434, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.19307807832956314, + "step": 13774 + }, + { + "epoch": 0.4305, + "grad_norm": 3.4375, + "grad_norm_var": 0.1990386962890625, + "learning_rate": 0.0001, + "loss": 5.6902, + "loss/crossentropy": 2.3923569917678833, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17431984096765518, + "step": 13776 + }, + { + "epoch": 0.4305625, + "grad_norm": 3.234375, + "grad_norm_var": 0.20152079264322917, + "learning_rate": 0.0001, + "loss": 5.7667, + "loss/crossentropy": 2.490602493286133, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1768304854631424, + "step": 13778 + }, + { + "epoch": 0.430625, + "grad_norm": 3.203125, + "grad_norm_var": 0.18127848307291666, + "learning_rate": 0.0001, + "loss": 5.8033, + "loss/crossentropy": 2.576148271560669, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17310161888599396, + "step": 13780 + }, + { + "epoch": 0.4306875, + "grad_norm": 3.078125, + "grad_norm_var": 0.1714019775390625, + "learning_rate": 0.0001, + "loss": 5.3822, + "loss/crossentropy": 2.3225895166397095, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15713541209697723, + "step": 13782 + }, + { + "epoch": 0.43075, + "grad_norm": 3.0, + "grad_norm_var": 0.15918680826822917, + "learning_rate": 0.0001, + "loss": 5.7832, + "loss/crossentropy": 2.639627456665039, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16670116037130356, + "step": 13784 + }, + { + "epoch": 0.4308125, + "grad_norm": 3.0625, + "grad_norm_var": 0.11142171223958333, + "learning_rate": 0.0001, + "loss": 5.7224, + "loss/crossentropy": 2.518721342086792, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1719333603978157, + "step": 13786 + }, + { + "epoch": 0.430875, + "grad_norm": 3.296875, + "grad_norm_var": 0.10881754557291666, + "learning_rate": 0.0001, + "loss": 5.9192, + "loss/crossentropy": 2.5601470470428467, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18082885444164276, + "step": 13788 + }, + { + "epoch": 0.4309375, + "grad_norm": 3.359375, + "grad_norm_var": 0.028336588541666666, + "learning_rate": 0.0001, + "loss": 5.8847, + "loss/crossentropy": 2.6607024669647217, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1751309037208557, + "step": 13790 + }, + { + "epoch": 0.431, + "grad_norm": 3.28125, + "grad_norm_var": 0.024372355143229166, + "learning_rate": 0.0001, + "loss": 5.5683, + "loss/crossentropy": 2.3886818885803223, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16678636521100998, + "step": 13792 + }, + { + "epoch": 0.4310625, + "grad_norm": 3.34375, + "grad_norm_var": 0.02666015625, + "learning_rate": 0.0001, + "loss": 5.7791, + "loss/crossentropy": 2.5440547466278076, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17311306297779083, + "step": 13794 + }, + { + "epoch": 0.431125, + "grad_norm": 3.703125, + "grad_norm_var": 0.0519439697265625, + "learning_rate": 0.0001, + "loss": 6.1354, + "loss/crossentropy": 2.6821444034576416, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.19298520684242249, + "step": 13796 + }, + { + "epoch": 0.4311875, + "grad_norm": 3.171875, + "grad_norm_var": 0.043196614583333334, + "learning_rate": 0.0001, + "loss": 6.1425, + "loss/crossentropy": 2.775768518447876, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1835494562983513, + "step": 13798 + }, + { + "epoch": 0.43125, + "grad_norm": 3.625, + "grad_norm_var": 0.04099833170572917, + "learning_rate": 0.0001, + "loss": 6.102, + "loss/crossentropy": 2.7383395433425903, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18050390481948853, + "step": 13800 + }, + { + "epoch": 0.4313125, + "grad_norm": 3.515625, + "grad_norm_var": 0.0325836181640625, + "learning_rate": 0.0001, + "loss": 6.2694, + "loss/crossentropy": 2.8019193410873413, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19206029176712036, + "step": 13802 + }, + { + "epoch": 0.431375, + "grad_norm": 3.015625, + "grad_norm_var": 0.03837890625, + "learning_rate": 0.0001, + "loss": 5.3785, + "loss/crossentropy": 2.2824034690856934, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.15765908360481262, + "step": 13804 + }, + { + "epoch": 0.4314375, + "grad_norm": 3.359375, + "grad_norm_var": 0.03726806640625, + "learning_rate": 0.0001, + "loss": 5.7639, + "loss/crossentropy": 2.5160328149795532, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1763525754213333, + "step": 13806 + }, + { + "epoch": 0.4315, + "grad_norm": 3.03125, + "grad_norm_var": 0.05715738932291667, + "learning_rate": 0.0001, + "loss": 6.078, + "loss/crossentropy": 2.739522695541382, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18189027160406113, + "step": 13808 + }, + { + "epoch": 0.4315625, + "grad_norm": 3.375, + "grad_norm_var": 0.059798177083333334, + "learning_rate": 0.0001, + "loss": 5.948, + "loss/crossentropy": 2.6395925283432007, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1800587698817253, + "step": 13810 + }, + { + "epoch": 0.431625, + "grad_norm": 2.9375, + "grad_norm_var": 0.06549072265625, + "learning_rate": 0.0001, + "loss": 5.8187, + "loss/crossentropy": 2.598936915397644, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17197246104478836, + "step": 13812 + }, + { + "epoch": 0.4316875, + "grad_norm": 3.125, + "grad_norm_var": 0.0686187744140625, + "learning_rate": 0.0001, + "loss": 5.8044, + "loss/crossentropy": 2.5577696561813354, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1734897345304489, + "step": 13814 + }, + { + "epoch": 0.43175, + "grad_norm": 3.171875, + "grad_norm_var": 0.06845703125, + "learning_rate": 0.0001, + "loss": 5.9794, + "loss/crossentropy": 2.642100691795349, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1829443871974945, + "step": 13816 + }, + { + "epoch": 0.4318125, + "grad_norm": 3.203125, + "grad_norm_var": 0.07056376139322916, + "learning_rate": 0.0001, + "loss": 6.0588, + "loss/crossentropy": 2.7387092113494873, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18083222210407257, + "step": 13818 + }, + { + "epoch": 0.431875, + "grad_norm": 3.234375, + "grad_norm_var": 0.08374735514322916, + "learning_rate": 0.0001, + "loss": 5.8616, + "loss/crossentropy": 2.754805088043213, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16341445595026016, + "step": 13820 + }, + { + "epoch": 0.4319375, + "grad_norm": 3.1875, + "grad_norm_var": 0.08114827473958333, + "learning_rate": 0.0001, + "loss": 5.974, + "loss/crossentropy": 2.6396480798721313, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18265436589717865, + "step": 13822 + }, + { + "epoch": 0.432, + "grad_norm": 3.171875, + "grad_norm_var": 0.05879618326822917, + "learning_rate": 0.0001, + "loss": 5.713, + "loss/crossentropy": 2.43161678314209, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.17071346193552017, + "step": 13824 + }, + { + "epoch": 0.4320625, + "grad_norm": 3.21875, + "grad_norm_var": 0.039534505208333334, + "learning_rate": 0.0001, + "loss": 5.8704, + "loss/crossentropy": 2.607662320137024, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17822878062725067, + "step": 13826 + }, + { + "epoch": 0.432125, + "grad_norm": 2.984375, + "grad_norm_var": 0.04830322265625, + "learning_rate": 0.0001, + "loss": 5.929, + "loss/crossentropy": 2.6019667387008667, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18114198744297028, + "step": 13828 + }, + { + "epoch": 0.4321875, + "grad_norm": 3.046875, + "grad_norm_var": 0.0496002197265625, + "learning_rate": 0.0001, + "loss": 5.9722, + "loss/crossentropy": 2.7419170141220093, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17263995110988617, + "step": 13830 + }, + { + "epoch": 0.43225, + "grad_norm": 3.046875, + "grad_norm_var": 0.04319559733072917, + "learning_rate": 0.0001, + "loss": 5.9201, + "loss/crossentropy": 2.6870416402816772, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17173901200294495, + "step": 13832 + }, + { + "epoch": 0.4323125, + "grad_norm": 3.140625, + "grad_norm_var": 0.0340972900390625, + "learning_rate": 0.0001, + "loss": 5.7739, + "loss/crossentropy": 2.5142263174057007, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17518573999404907, + "step": 13834 + }, + { + "epoch": 0.432375, + "grad_norm": 3.1875, + "grad_norm_var": 0.023274739583333332, + "learning_rate": 0.0001, + "loss": 6.1575, + "loss/crossentropy": 2.7912551164627075, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18232276290655136, + "step": 13836 + }, + { + "epoch": 0.4324375, + "grad_norm": 3.140625, + "grad_norm_var": 0.02359619140625, + "learning_rate": 0.0001, + "loss": 5.8468, + "loss/crossentropy": 2.5685415267944336, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17782345414161682, + "step": 13838 + }, + { + "epoch": 0.4325, + "grad_norm": 2.984375, + "grad_norm_var": 0.024169921875, + "learning_rate": 0.0001, + "loss": 5.8359, + "loss/crossentropy": 2.544231414794922, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1772119104862213, + "step": 13840 + }, + { + "epoch": 0.4325625, + "grad_norm": 3.296875, + "grad_norm_var": 2.9762603759765627, + "learning_rate": 0.0001, + "loss": 6.0882, + "loss/crossentropy": 2.6641210317611694, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.19084730744361877, + "step": 13842 + }, + { + "epoch": 0.432625, + "grad_norm": 3.5625, + "grad_norm_var": 2.9530924479166667, + "learning_rate": 0.0001, + "loss": 5.9587, + "loss/crossentropy": 2.6440919637680054, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1814604476094246, + "step": 13844 + }, + { + "epoch": 0.4326875, + "grad_norm": 2.953125, + "grad_norm_var": 2.985309855143229, + "learning_rate": 0.0001, + "loss": 5.3481, + "loss/crossentropy": 2.33569598197937, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15827567875385284, + "step": 13846 + }, + { + "epoch": 0.43275, + "grad_norm": 3.546875, + "grad_norm_var": 2.9599273681640623, + "learning_rate": 0.0001, + "loss": 6.0096, + "loss/crossentropy": 2.7038800716400146, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1770564764738083, + "step": 13848 + }, + { + "epoch": 0.4328125, + "grad_norm": 3.46875, + "grad_norm_var": 2.9489898681640625, + "learning_rate": 0.0001, + "loss": 5.9135, + "loss/crossentropy": 2.6132771968841553, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1776796653866768, + "step": 13850 + }, + { + "epoch": 0.432875, + "grad_norm": 3.234375, + "grad_norm_var": 2.94381103515625, + "learning_rate": 0.0001, + "loss": 5.9411, + "loss/crossentropy": 2.643177628517151, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17510050535202026, + "step": 13852 + }, + { + "epoch": 0.4329375, + "grad_norm": 3.15625, + "grad_norm_var": 2.954215494791667, + "learning_rate": 0.0001, + "loss": 5.6373, + "loss/crossentropy": 2.4669450521469116, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1643032804131508, + "step": 13854 + }, + { + "epoch": 0.433, + "grad_norm": 3.328125, + "grad_norm_var": 2.9218658447265624, + "learning_rate": 0.0001, + "loss": 6.0053, + "loss/crossentropy": 2.64568555355072, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.17971544712781906, + "step": 13856 + }, + { + "epoch": 0.4330625, + "grad_norm": 2.984375, + "grad_norm_var": 0.05982666015625, + "learning_rate": 0.0001, + "loss": 5.7653, + "loss/crossentropy": 2.6104692220687866, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16391621530056, + "step": 13858 + }, + { + "epoch": 0.433125, + "grad_norm": 3.609375, + "grad_norm_var": 0.06252848307291667, + "learning_rate": 0.0001, + "loss": 5.8163, + "loss/crossentropy": 2.574515700340271, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17417685687541962, + "step": 13860 + }, + { + "epoch": 0.4331875, + "grad_norm": 3.171875, + "grad_norm_var": 0.049169921875, + "learning_rate": 0.0001, + "loss": 5.5406, + "loss/crossentropy": 2.363754153251648, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1712006852030754, + "step": 13862 + }, + { + "epoch": 0.43325, + "grad_norm": 2.859375, + "grad_norm_var": 0.05364176432291667, + "learning_rate": 0.0001, + "loss": 5.7121, + "loss/crossentropy": 2.590463161468506, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16099364310503006, + "step": 13864 + }, + { + "epoch": 0.4333125, + "grad_norm": 3.21875, + "grad_norm_var": 0.0495025634765625, + "learning_rate": 0.0001, + "loss": 5.747, + "loss/crossentropy": 2.486981987953186, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1732637882232666, + "step": 13866 + }, + { + "epoch": 0.433375, + "grad_norm": 3.8125, + "grad_norm_var": 0.07224833170572917, + "learning_rate": 0.0001, + "loss": 5.7717, + "loss/crossentropy": 2.4394524097442627, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17658094316720963, + "step": 13868 + }, + { + "epoch": 0.4334375, + "grad_norm": 3.53125, + "grad_norm_var": 0.07399088541666667, + "learning_rate": 0.0001, + "loss": 5.9521, + "loss/crossentropy": 2.632522225379944, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17844469100236893, + "step": 13870 + }, + { + "epoch": 0.4335, + "grad_norm": 3.0, + "grad_norm_var": 0.06363016764322917, + "learning_rate": 0.0001, + "loss": 5.8005, + "loss/crossentropy": 2.62562096118927, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1682659387588501, + "step": 13872 + }, + { + "epoch": 0.4335625, + "grad_norm": 3.0, + "grad_norm_var": 0.060347493489583334, + "learning_rate": 0.0001, + "loss": 5.378, + "loss/crossentropy": 2.2950897216796875, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16102531552314758, + "step": 13874 + }, + { + "epoch": 0.433625, + "grad_norm": 3.328125, + "grad_norm_var": 0.05243733723958333, + "learning_rate": 0.0001, + "loss": 5.6193, + "loss/crossentropy": 2.465804696083069, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16768839210271835, + "step": 13876 + }, + { + "epoch": 0.4336875, + "grad_norm": 3.296875, + "grad_norm_var": 0.05286356608072917, + "learning_rate": 0.0001, + "loss": 5.928, + "loss/crossentropy": 2.5896536111831665, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18305715918540955, + "step": 13878 + }, + { + "epoch": 0.43375, + "grad_norm": 3.34375, + "grad_norm_var": 0.0444244384765625, + "learning_rate": 0.0001, + "loss": 5.7915, + "loss/crossentropy": 2.517315983772278, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17077286541461945, + "step": 13880 + }, + { + "epoch": 0.4338125, + "grad_norm": 3.421875, + "grad_norm_var": 0.047240193684895834, + "learning_rate": 0.0001, + "loss": 5.7765, + "loss/crossentropy": 2.536103844642639, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17326051741838455, + "step": 13882 + }, + { + "epoch": 0.433875, + "grad_norm": 3.296875, + "grad_norm_var": 0.03373921712239583, + "learning_rate": 0.0001, + "loss": 5.7424, + "loss/crossentropy": 2.558171033859253, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.17349782586097717, + "step": 13884 + }, + { + "epoch": 0.4339375, + "grad_norm": 3.390625, + "grad_norm_var": 0.03803609212239583, + "learning_rate": 0.0001, + "loss": 5.9657, + "loss/crossentropy": 2.720614433288574, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17333794385194778, + "step": 13886 + }, + { + "epoch": 0.434, + "grad_norm": 3.09375, + "grad_norm_var": 0.0366851806640625, + "learning_rate": 0.0001, + "loss": 5.6914, + "loss/crossentropy": 2.441069483757019, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17424984276294708, + "step": 13888 + }, + { + "epoch": 0.4340625, + "grad_norm": 2.984375, + "grad_norm_var": 0.03701070149739583, + "learning_rate": 0.0001, + "loss": 5.7065, + "loss/crossentropy": 2.589089870452881, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16603338718414307, + "step": 13890 + }, + { + "epoch": 0.434125, + "grad_norm": 3.140625, + "grad_norm_var": 0.03532613118489583, + "learning_rate": 0.0001, + "loss": 5.6098, + "loss/crossentropy": 2.442548394203186, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16789712011814117, + "step": 13892 + }, + { + "epoch": 0.4341875, + "grad_norm": 3.109375, + "grad_norm_var": 0.03786519368489583, + "learning_rate": 0.0001, + "loss": 5.9175, + "loss/crossentropy": 2.6187649965286255, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17558104544878006, + "step": 13894 + }, + { + "epoch": 0.43425, + "grad_norm": 3.0625, + "grad_norm_var": 0.03863525390625, + "learning_rate": 0.0001, + "loss": 5.9075, + "loss/crossentropy": 2.6415693759918213, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1789368912577629, + "step": 13896 + }, + { + "epoch": 0.4343125, + "grad_norm": 3.484375, + "grad_norm_var": 0.03986002604166667, + "learning_rate": 0.0001, + "loss": 5.9529, + "loss/crossentropy": 2.667291522026062, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17777517437934875, + "step": 13898 + }, + { + "epoch": 0.434375, + "grad_norm": 3.21875, + "grad_norm_var": 0.0394439697265625, + "learning_rate": 0.0001, + "loss": 5.9658, + "loss/crossentropy": 2.699573516845703, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17467273771762848, + "step": 13900 + }, + { + "epoch": 0.4344375, + "grad_norm": 3.1875, + "grad_norm_var": 0.0342681884765625, + "learning_rate": 0.0001, + "loss": 5.3276, + "loss/crossentropy": 2.216606616973877, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.15836957097053528, + "step": 13902 + }, + { + "epoch": 0.4345, + "grad_norm": 3.28125, + "grad_norm_var": 0.03125712076822917, + "learning_rate": 0.0001, + "loss": 5.7911, + "loss/crossentropy": 2.6023541688919067, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16887789964675903, + "step": 13904 + }, + { + "epoch": 0.4345625, + "grad_norm": 3.03125, + "grad_norm_var": 0.0301910400390625, + "learning_rate": 0.0001, + "loss": 5.6791, + "loss/crossentropy": 2.4348760843276978, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1705169975757599, + "step": 13906 + }, + { + "epoch": 0.434625, + "grad_norm": 3.15625, + "grad_norm_var": 0.029832967122395835, + "learning_rate": 0.0001, + "loss": 5.8906, + "loss/crossentropy": 2.6364128589630127, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17581136524677277, + "step": 13908 + }, + { + "epoch": 0.4346875, + "grad_norm": 3.078125, + "grad_norm_var": 0.029195149739583332, + "learning_rate": 0.0001, + "loss": 5.6061, + "loss/crossentropy": 2.473349452018738, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16522951424121857, + "step": 13910 + }, + { + "epoch": 0.43475, + "grad_norm": 4.15625, + "grad_norm_var": 0.084033203125, + "learning_rate": 0.0001, + "loss": 5.5508, + "loss/crossentropy": 2.3882246017456055, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16586287319660187, + "step": 13912 + }, + { + "epoch": 0.4348125, + "grad_norm": 3.0, + "grad_norm_var": 0.08943684895833333, + "learning_rate": 0.0001, + "loss": 5.652, + "loss/crossentropy": 2.467076301574707, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16810613870620728, + "step": 13914 + }, + { + "epoch": 0.434875, + "grad_norm": 3.390625, + "grad_norm_var": 0.0906890869140625, + "learning_rate": 0.0001, + "loss": 6.3171, + "loss/crossentropy": 2.87043559551239, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.19076044112443924, + "step": 13916 + }, + { + "epoch": 0.4349375, + "grad_norm": 3.46875, + "grad_norm_var": 0.09966532389322917, + "learning_rate": 0.0001, + "loss": 5.7529, + "loss/crossentropy": 2.58914315700531, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1694972664117813, + "step": 13918 + }, + { + "epoch": 0.435, + "grad_norm": 3.3125, + "grad_norm_var": 0.10134175618489584, + "learning_rate": 0.0001, + "loss": 6.0511, + "loss/crossentropy": 2.7517701387405396, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1783655434846878, + "step": 13920 + }, + { + "epoch": 0.4350625, + "grad_norm": 3.265625, + "grad_norm_var": 0.0958160400390625, + "learning_rate": 0.0001, + "loss": 5.9012, + "loss/crossentropy": 2.6555097103118896, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17730756103992462, + "step": 13922 + }, + { + "epoch": 0.435125, + "grad_norm": 3.328125, + "grad_norm_var": 0.09462890625, + "learning_rate": 0.0001, + "loss": 5.8087, + "loss/crossentropy": 2.492259979248047, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18047624081373215, + "step": 13924 + }, + { + "epoch": 0.4351875, + "grad_norm": 3.171875, + "grad_norm_var": 0.094482421875, + "learning_rate": 0.0001, + "loss": 6.0533, + "loss/crossentropy": 2.752696990966797, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17654091119766235, + "step": 13926 + }, + { + "epoch": 0.43525, + "grad_norm": 3.03125, + "grad_norm_var": 0.04117431640625, + "learning_rate": 0.0001, + "loss": 5.7251, + "loss/crossentropy": 2.494315981864929, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1715133786201477, + "step": 13928 + }, + { + "epoch": 0.4353125, + "grad_norm": 3.203125, + "grad_norm_var": 0.03540751139322917, + "learning_rate": 0.0001, + "loss": 5.6916, + "loss/crossentropy": 2.5007740259170532, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16869062930345535, + "step": 13930 + }, + { + "epoch": 0.435375, + "grad_norm": 3.109375, + "grad_norm_var": 0.026025390625, + "learning_rate": 0.0001, + "loss": 5.4062, + "loss/crossentropy": 2.295772910118103, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16299156844615936, + "step": 13932 + }, + { + "epoch": 0.4354375, + "grad_norm": 3.28125, + "grad_norm_var": 0.0167144775390625, + "learning_rate": 0.0001, + "loss": 5.891, + "loss/crossentropy": 2.620903968811035, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17427542805671692, + "step": 13934 + }, + { + "epoch": 0.4355, + "grad_norm": 3.1875, + "grad_norm_var": 0.018236287434895835, + "learning_rate": 0.0001, + "loss": 5.9248, + "loss/crossentropy": 2.7304844856262207, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17059976607561111, + "step": 13936 + }, + { + "epoch": 0.4355625, + "grad_norm": 3.734375, + "grad_norm_var": 0.03484700520833333, + "learning_rate": 0.0001, + "loss": 5.937, + "loss/crossentropy": 2.4570083618164062, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1894073486328125, + "step": 13938 + }, + { + "epoch": 0.435625, + "grad_norm": 3.4375, + "grad_norm_var": 0.0402252197265625, + "learning_rate": 0.0001, + "loss": 6.0185, + "loss/crossentropy": 2.646772623062134, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1848280280828476, + "step": 13940 + }, + { + "epoch": 0.4356875, + "grad_norm": 3.28125, + "grad_norm_var": 0.037886555989583334, + "learning_rate": 0.0001, + "loss": 5.8972, + "loss/crossentropy": 2.5509214401245117, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18033506721258163, + "step": 13942 + }, + { + "epoch": 0.43575, + "grad_norm": 3.5, + "grad_norm_var": 0.03338216145833333, + "learning_rate": 0.0001, + "loss": 6.2292, + "loss/crossentropy": 2.7814353704452515, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.18657396733760834, + "step": 13944 + }, + { + "epoch": 0.4358125, + "grad_norm": 3.25, + "grad_norm_var": 0.0400299072265625, + "learning_rate": 0.0001, + "loss": 6.0215, + "loss/crossentropy": 2.6906360387802124, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17918318510055542, + "step": 13946 + }, + { + "epoch": 0.435875, + "grad_norm": 3.125, + "grad_norm_var": 0.04468994140625, + "learning_rate": 0.0001, + "loss": 5.8619, + "loss/crossentropy": 2.4934970140457153, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1813701018691063, + "step": 13948 + }, + { + "epoch": 0.4359375, + "grad_norm": 3.421875, + "grad_norm_var": 0.0441314697265625, + "learning_rate": 0.0001, + "loss": 5.8171, + "loss/crossentropy": 2.6028374433517456, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17259834706783295, + "step": 13950 + }, + { + "epoch": 0.436, + "grad_norm": 3.015625, + "grad_norm_var": 0.043211873372395834, + "learning_rate": 0.0001, + "loss": 5.6416, + "loss/crossentropy": 2.4892324209213257, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1668020561337471, + "step": 13952 + }, + { + "epoch": 0.4360625, + "grad_norm": 2.953125, + "grad_norm_var": 0.045556640625, + "learning_rate": 0.0001, + "loss": 5.6439, + "loss/crossentropy": 2.487300753593445, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16761772334575653, + "step": 13954 + }, + { + "epoch": 0.436125, + "grad_norm": 2.96875, + "grad_norm_var": 0.052179972330729164, + "learning_rate": 0.0001, + "loss": 5.7655, + "loss/crossentropy": 2.6785272359848022, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16104034334421158, + "step": 13956 + }, + { + "epoch": 0.4361875, + "grad_norm": 3.203125, + "grad_norm_var": 0.05458984375, + "learning_rate": 0.0001, + "loss": 5.9288, + "loss/crossentropy": 2.6830239295959473, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17340965569019318, + "step": 13958 + }, + { + "epoch": 0.43625, + "grad_norm": 3.1875, + "grad_norm_var": 0.05006103515625, + "learning_rate": 0.0001, + "loss": 5.8653, + "loss/crossentropy": 2.5887218713760376, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1772654503583908, + "step": 13960 + }, + { + "epoch": 0.4363125, + "grad_norm": 3.546875, + "grad_norm_var": 0.045807902018229166, + "learning_rate": 0.0001, + "loss": 5.4811, + "loss/crossentropy": 2.3509578704833984, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16262295097112656, + "step": 13962 + }, + { + "epoch": 0.436375, + "grad_norm": 2.953125, + "grad_norm_var": 0.033980305989583334, + "learning_rate": 0.0001, + "loss": 5.5103, + "loss/crossentropy": 2.366196870803833, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16948788613080978, + "step": 13964 + }, + { + "epoch": 0.4364375, + "grad_norm": 3.21875, + "grad_norm_var": 0.03258463541666667, + "learning_rate": 0.0001, + "loss": 5.7932, + "loss/crossentropy": 2.5165878534317017, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17727329581975937, + "step": 13966 + }, + { + "epoch": 0.4365, + "grad_norm": 3.046875, + "grad_norm_var": 0.03167317708333333, + "learning_rate": 0.0001, + "loss": 5.8628, + "loss/crossentropy": 2.569228172302246, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17583833634853363, + "step": 13968 + }, + { + "epoch": 0.4365625, + "grad_norm": 3.453125, + "grad_norm_var": 0.03478190104166667, + "learning_rate": 0.0001, + "loss": 5.7704, + "loss/crossentropy": 2.4731861352920532, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1773790717124939, + "step": 13970 + }, + { + "epoch": 0.436625, + "grad_norm": 2.84375, + "grad_norm_var": 0.04010009765625, + "learning_rate": 0.0001, + "loss": 5.7873, + "loss/crossentropy": 2.5951253175735474, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1696045771241188, + "step": 13972 + }, + { + "epoch": 0.4366875, + "grad_norm": 3.046875, + "grad_norm_var": 0.040095011393229164, + "learning_rate": 0.0001, + "loss": 5.8076, + "loss/crossentropy": 2.5616809129714966, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1741999313235283, + "step": 13974 + }, + { + "epoch": 0.43675, + "grad_norm": 3.125, + "grad_norm_var": 0.04112040201822917, + "learning_rate": 0.0001, + "loss": 5.4678, + "loss/crossentropy": 2.3087103366851807, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16864745318889618, + "step": 13976 + }, + { + "epoch": 0.4368125, + "grad_norm": 3.046875, + "grad_norm_var": 0.027936808268229165, + "learning_rate": 0.0001, + "loss": 5.9896, + "loss/crossentropy": 2.6676132678985596, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1802445352077484, + "step": 13978 + }, + { + "epoch": 0.436875, + "grad_norm": 3.078125, + "grad_norm_var": 0.027586873372395834, + "learning_rate": 0.0001, + "loss": 5.5338, + "loss/crossentropy": 2.3940389156341553, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16241838037967682, + "step": 13980 + }, + { + "epoch": 0.4369375, + "grad_norm": 3.453125, + "grad_norm_var": 0.10838216145833333, + "learning_rate": 0.0001, + "loss": 6.1542, + "loss/crossentropy": 2.6889700889587402, + "loss/hidden": 1.61328125, + "loss/jsd": 0.0, + "loss/logits": 0.18519629538059235, + "step": 13982 + }, + { + "epoch": 0.437, + "grad_norm": 3.34375, + "grad_norm_var": 0.10601298014322917, + "learning_rate": 0.0001, + "loss": 5.766, + "loss/crossentropy": 2.4458248615264893, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17967405915260315, + "step": 13984 + }, + { + "epoch": 0.4370625, + "grad_norm": 2.953125, + "grad_norm_var": 0.10950419108072916, + "learning_rate": 0.0001, + "loss": 5.7802, + "loss/crossentropy": 2.6169904470443726, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1674969717860222, + "step": 13986 + }, + { + "epoch": 0.437125, + "grad_norm": 3.1875, + "grad_norm_var": 0.09851888020833334, + "learning_rate": 0.0001, + "loss": 5.6014, + "loss/crossentropy": 2.446366548538208, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1658981293439865, + "step": 13988 + }, + { + "epoch": 0.4371875, + "grad_norm": 3.234375, + "grad_norm_var": 0.09902242024739584, + "learning_rate": 0.0001, + "loss": 5.7611, + "loss/crossentropy": 2.5728660821914673, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16999761760234833, + "step": 13990 + }, + { + "epoch": 0.43725, + "grad_norm": 2.984375, + "grad_norm_var": 0.10242411295572916, + "learning_rate": 0.0001, + "loss": 5.7782, + "loss/crossentropy": 2.63112211227417, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16548720002174377, + "step": 13992 + }, + { + "epoch": 0.4373125, + "grad_norm": 3.3125, + "grad_norm_var": 0.0985015869140625, + "learning_rate": 0.0001, + "loss": 5.9244, + "loss/crossentropy": 2.6902605295181274, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17263738065958023, + "step": 13994 + }, + { + "epoch": 0.437375, + "grad_norm": 4.09375, + "grad_norm_var": 0.14097900390625, + "learning_rate": 0.0001, + "loss": 5.7846, + "loss/crossentropy": 2.5803415775299072, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17238248139619827, + "step": 13996 + }, + { + "epoch": 0.4374375, + "grad_norm": 3.15625, + "grad_norm_var": 0.06813151041666667, + "learning_rate": 0.0001, + "loss": 5.7821, + "loss/crossentropy": 2.5158464908599854, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17740444093942642, + "step": 13998 + }, + { + "epoch": 0.4375, + "grad_norm": 3.53125, + "grad_norm_var": 0.8107493082682292, + "learning_rate": 0.0001, + "loss": 6.6197, + "loss/crossentropy": 3.0875765085220337, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19695878773927689, + "step": 14000 + }, + { + "epoch": 0.4375625, + "grad_norm": 3.40625, + "grad_norm_var": 0.7908854166666667, + "learning_rate": 0.0001, + "loss": 5.6925, + "loss/crossentropy": 2.4908525943756104, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.171725332736969, + "step": 14002 + }, + { + "epoch": 0.437625, + "grad_norm": 3.484375, + "grad_norm_var": 0.7793853759765625, + "learning_rate": 0.0001, + "loss": 5.9493, + "loss/crossentropy": 2.6585350036621094, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1779026985168457, + "step": 14004 + }, + { + "epoch": 0.4376875, + "grad_norm": 3.890625, + "grad_norm_var": 0.7587961832682292, + "learning_rate": 0.0001, + "loss": 6.1833, + "loss/crossentropy": 2.781018376350403, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1839783489704132, + "step": 14006 + }, + { + "epoch": 0.43775, + "grad_norm": 3.453125, + "grad_norm_var": 0.73349609375, + "learning_rate": 0.0001, + "loss": 5.854, + "loss/crossentropy": 2.599832057952881, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17541275918483734, + "step": 14008 + }, + { + "epoch": 0.4378125, + "grad_norm": 3.015625, + "grad_norm_var": 0.7720937093098958, + "learning_rate": 0.0001, + "loss": 5.5801, + "loss/crossentropy": 2.4539239406585693, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16652580350637436, + "step": 14010 + }, + { + "epoch": 0.437875, + "grad_norm": 4.90625, + "grad_norm_var": 0.84169921875, + "learning_rate": 0.0001, + "loss": 5.8686, + "loss/crossentropy": 2.5402169227600098, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1820555478334427, + "step": 14012 + }, + { + "epoch": 0.4379375, + "grad_norm": 3.421875, + "grad_norm_var": 0.83336181640625, + "learning_rate": 0.0001, + "loss": 5.8299, + "loss/crossentropy": 2.538370966911316, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17992941290140152, + "step": 14014 + }, + { + "epoch": 0.438, + "grad_norm": 3.125, + "grad_norm_var": 0.2145172119140625, + "learning_rate": 0.0001, + "loss": 5.5102, + "loss/crossentropy": 2.3528844118118286, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16729706525802612, + "step": 14016 + }, + { + "epoch": 0.4380625, + "grad_norm": 3.171875, + "grad_norm_var": 0.2233795166015625, + "learning_rate": 0.0001, + "loss": 6.0016, + "loss/crossentropy": 2.729480028152466, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17760110646486282, + "step": 14018 + }, + { + "epoch": 0.438125, + "grad_norm": 3.109375, + "grad_norm_var": 0.2264312744140625, + "learning_rate": 0.0001, + "loss": 6.0135, + "loss/crossentropy": 2.694047689437866, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.18233928829431534, + "step": 14020 + }, + { + "epoch": 0.4381875, + "grad_norm": 3.4375, + "grad_norm_var": 0.19953511555989584, + "learning_rate": 0.0001, + "loss": 5.8586, + "loss/crossentropy": 2.595989942550659, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17586620151996613, + "step": 14022 + }, + { + "epoch": 0.43825, + "grad_norm": 3.15625, + "grad_norm_var": 0.196533203125, + "learning_rate": 0.0001, + "loss": 5.7336, + "loss/crossentropy": 2.4948946237564087, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17074289917945862, + "step": 14024 + }, + { + "epoch": 0.4383125, + "grad_norm": 3.046875, + "grad_norm_var": 0.19375, + "learning_rate": 0.0001, + "loss": 5.9275, + "loss/crossentropy": 2.739194631576538, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1711711436510086, + "step": 14026 + }, + { + "epoch": 0.438375, + "grad_norm": 3.03125, + "grad_norm_var": 0.0367095947265625, + "learning_rate": 0.0001, + "loss": 5.82, + "loss/crossentropy": 2.520078420639038, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17842947691679, + "step": 14028 + }, + { + "epoch": 0.4384375, + "grad_norm": 3.34375, + "grad_norm_var": 0.03531494140625, + "learning_rate": 0.0001, + "loss": 6.1359, + "loss/crossentropy": 2.726751685142517, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1838875114917755, + "step": 14030 + }, + { + "epoch": 0.4385, + "grad_norm": 3.15625, + "grad_norm_var": 0.03815104166666667, + "learning_rate": 0.0001, + "loss": 5.6123, + "loss/crossentropy": 2.5084946155548096, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16624469310045242, + "step": 14032 + }, + { + "epoch": 0.4385625, + "grad_norm": 3.46875, + "grad_norm_var": 0.04306233723958333, + "learning_rate": 0.0001, + "loss": 5.9164, + "loss/crossentropy": 2.6273417472839355, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17733992636203766, + "step": 14034 + }, + { + "epoch": 0.438625, + "grad_norm": 3.265625, + "grad_norm_var": 0.04011128743489583, + "learning_rate": 0.0001, + "loss": 5.9573, + "loss/crossentropy": 2.7191213369369507, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17616549879312515, + "step": 14036 + }, + { + "epoch": 0.4386875, + "grad_norm": 3.75, + "grad_norm_var": 0.05353190104166667, + "learning_rate": 0.0001, + "loss": 5.9259, + "loss/crossentropy": 2.5204232931137085, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1866377666592598, + "step": 14038 + }, + { + "epoch": 0.43875, + "grad_norm": 3.046875, + "grad_norm_var": 0.0557525634765625, + "learning_rate": 0.0001, + "loss": 5.9545, + "loss/crossentropy": 2.701447010040283, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17374404519796371, + "step": 14040 + }, + { + "epoch": 0.4388125, + "grad_norm": 3.328125, + "grad_norm_var": 0.04988505045572917, + "learning_rate": 0.0001, + "loss": 6.0738, + "loss/crossentropy": 2.8018620014190674, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17641377449035645, + "step": 14042 + }, + { + "epoch": 0.438875, + "grad_norm": 3.59375, + "grad_norm_var": 0.04192301432291667, + "learning_rate": 0.0001, + "loss": 6.0156, + "loss/crossentropy": 2.6663641929626465, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1814088374376297, + "step": 14044 + }, + { + "epoch": 0.4389375, + "grad_norm": 3.0, + "grad_norm_var": 0.04879557291666667, + "learning_rate": 0.0001, + "loss": 5.7216, + "loss/crossentropy": 2.509984850883484, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17545349150896072, + "step": 14046 + }, + { + "epoch": 0.439, + "grad_norm": 3.21875, + "grad_norm_var": 0.05074462890625, + "learning_rate": 0.0001, + "loss": 6.3435, + "loss/crossentropy": 2.942991256713867, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18770598620176315, + "step": 14048 + }, + { + "epoch": 0.4390625, + "grad_norm": 3.203125, + "grad_norm_var": 0.04761962890625, + "learning_rate": 0.0001, + "loss": 5.902, + "loss/crossentropy": 2.616852045059204, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17616866528987885, + "step": 14050 + }, + { + "epoch": 0.439125, + "grad_norm": 2.921875, + "grad_norm_var": 0.05896809895833333, + "learning_rate": 0.0001, + "loss": 5.6839, + "loss/crossentropy": 2.5626214742660522, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1625198796391487, + "step": 14052 + }, + { + "epoch": 0.4391875, + "grad_norm": 3.078125, + "grad_norm_var": 0.04194234212239583, + "learning_rate": 0.0001, + "loss": 5.4755, + "loss/crossentropy": 2.345886468887329, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16491156816482544, + "step": 14054 + }, + { + "epoch": 0.43925, + "grad_norm": 3.359375, + "grad_norm_var": 0.04426167805989583, + "learning_rate": 0.0001, + "loss": 5.6413, + "loss/crossentropy": 2.5013986825942993, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16555754095315933, + "step": 14056 + }, + { + "epoch": 0.4393125, + "grad_norm": 3.515625, + "grad_norm_var": 0.04892171223958333, + "learning_rate": 0.0001, + "loss": 5.8352, + "loss/crossentropy": 2.5604896545410156, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17903056740760803, + "step": 14058 + }, + { + "epoch": 0.439375, + "grad_norm": 3.03125, + "grad_norm_var": 0.03905843098958333, + "learning_rate": 0.0001, + "loss": 5.7855, + "loss/crossentropy": 2.551244020462036, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17264288663864136, + "step": 14060 + }, + { + "epoch": 0.4394375, + "grad_norm": 3.203125, + "grad_norm_var": 0.038630167643229164, + "learning_rate": 0.0001, + "loss": 6.0331, + "loss/crossentropy": 2.6743807792663574, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18313482403755188, + "step": 14062 + }, + { + "epoch": 0.4395, + "grad_norm": 3.21875, + "grad_norm_var": 1.5511881510416667, + "learning_rate": 0.0001, + "loss": 5.5925, + "loss/crossentropy": 2.3944915533065796, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17097366601228714, + "step": 14064 + }, + { + "epoch": 0.4395625, + "grad_norm": 3.484375, + "grad_norm_var": 1.5429026285807292, + "learning_rate": 0.0001, + "loss": 5.6763, + "loss/crossentropy": 2.4504220485687256, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1698511689901352, + "step": 14066 + }, + { + "epoch": 0.439625, + "grad_norm": 3.59375, + "grad_norm_var": 1.515087890625, + "learning_rate": 0.0001, + "loss": 5.8929, + "loss/crossentropy": 2.578840494155884, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18023841083049774, + "step": 14068 + }, + { + "epoch": 0.4396875, + "grad_norm": 3.265625, + "grad_norm_var": 1.5004140218098958, + "learning_rate": 0.0001, + "loss": 6.1559, + "loss/crossentropy": 2.7392576932907104, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1881519854068756, + "step": 14070 + }, + { + "epoch": 0.43975, + "grad_norm": 3.625, + "grad_norm_var": 1.4768218994140625, + "learning_rate": 0.0001, + "loss": 5.8832, + "loss/crossentropy": 2.6005676984786987, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1755259484052658, + "step": 14072 + }, + { + "epoch": 0.4398125, + "grad_norm": 3.015625, + "grad_norm_var": 1.5114095052083334, + "learning_rate": 0.0001, + "loss": 5.5797, + "loss/crossentropy": 2.4586753845214844, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.162494458258152, + "step": 14074 + }, + { + "epoch": 0.439875, + "grad_norm": 3.125, + "grad_norm_var": 1.5263417561848958, + "learning_rate": 0.0001, + "loss": 5.5643, + "loss/crossentropy": 2.388434648513794, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16563333570957184, + "step": 14076 + }, + { + "epoch": 0.4399375, + "grad_norm": 2.953125, + "grad_norm_var": 1.5548828125, + "learning_rate": 0.0001, + "loss": 5.6168, + "loss/crossentropy": 2.4416269063949585, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16986381262540817, + "step": 14078 + }, + { + "epoch": 0.44, + "grad_norm": 3.296875, + "grad_norm_var": 0.05517578125, + "learning_rate": 0.0001, + "loss": 5.8422, + "loss/crossentropy": 2.558874249458313, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17950692027807236, + "step": 14080 + }, + { + "epoch": 0.4400625, + "grad_norm": 3.03125, + "grad_norm_var": 0.045832316080729164, + "learning_rate": 0.0001, + "loss": 5.7236, + "loss/crossentropy": 2.5704649686813354, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16687601059675217, + "step": 14082 + }, + { + "epoch": 0.440125, + "grad_norm": 3.25, + "grad_norm_var": 0.033446248372395834, + "learning_rate": 0.0001, + "loss": 5.7626, + "loss/crossentropy": 2.5022090673446655, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17448122054338455, + "step": 14084 + }, + { + "epoch": 0.4401875, + "grad_norm": 3.03125, + "grad_norm_var": 0.03964436848958333, + "learning_rate": 0.0001, + "loss": 6.0418, + "loss/crossentropy": 2.786649227142334, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17238830029964447, + "step": 14086 + }, + { + "epoch": 0.44025, + "grad_norm": 3.390625, + "grad_norm_var": 0.028571573893229167, + "learning_rate": 0.0001, + "loss": 5.7734, + "loss/crossentropy": 2.6143399477005005, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16786056756973267, + "step": 14088 + }, + { + "epoch": 0.4403125, + "grad_norm": 2.96875, + "grad_norm_var": 0.028962198893229166, + "learning_rate": 0.0001, + "loss": 5.5703, + "loss/crossentropy": 2.4524561166763306, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16178082674741745, + "step": 14090 + }, + { + "epoch": 0.440375, + "grad_norm": 3.0, + "grad_norm_var": 0.030304972330729166, + "learning_rate": 0.0001, + "loss": 5.7567, + "loss/crossentropy": 2.584386944770813, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17074551433324814, + "step": 14092 + }, + { + "epoch": 0.4404375, + "grad_norm": 3.03125, + "grad_norm_var": 0.028238932291666668, + "learning_rate": 0.0001, + "loss": 5.9168, + "loss/crossentropy": 2.6656646728515625, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17472387850284576, + "step": 14094 + }, + { + "epoch": 0.4405, + "grad_norm": 3.375, + "grad_norm_var": 0.026318359375, + "learning_rate": 0.0001, + "loss": 5.6788, + "loss/crossentropy": 2.457118511199951, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1737259030342102, + "step": 14096 + }, + { + "epoch": 0.4405625, + "grad_norm": 3.09375, + "grad_norm_var": 0.029222615559895835, + "learning_rate": 0.0001, + "loss": 5.9826, + "loss/crossentropy": 2.6728047132492065, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17746330052614212, + "step": 14098 + }, + { + "epoch": 0.440625, + "grad_norm": 3.484375, + "grad_norm_var": 0.04126688639322917, + "learning_rate": 0.0001, + "loss": 5.787, + "loss/crossentropy": 2.6073368787765503, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17069777846336365, + "step": 14100 + }, + { + "epoch": 0.4406875, + "grad_norm": 3.390625, + "grad_norm_var": 0.03775634765625, + "learning_rate": 0.0001, + "loss": 5.9069, + "loss/crossentropy": 2.6937869787216187, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16896523535251617, + "step": 14102 + }, + { + "epoch": 0.44075, + "grad_norm": 3.0625, + "grad_norm_var": 0.0427734375, + "learning_rate": 0.0001, + "loss": 5.7835, + "loss/crossentropy": 2.6282999515533447, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16864388436079025, + "step": 14104 + }, + { + "epoch": 0.4408125, + "grad_norm": 3.09375, + "grad_norm_var": 0.04303385416666667, + "learning_rate": 0.0001, + "loss": 5.7754, + "loss/crossentropy": 2.611765742301941, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16558589041233063, + "step": 14106 + }, + { + "epoch": 0.440875, + "grad_norm": 3.125, + "grad_norm_var": 0.039697265625, + "learning_rate": 0.0001, + "loss": 5.9349, + "loss/crossentropy": 2.622216582298279, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.18244481831789017, + "step": 14108 + }, + { + "epoch": 0.4409375, + "grad_norm": 3.15625, + "grad_norm_var": 0.037984212239583336, + "learning_rate": 0.0001, + "loss": 5.7041, + "loss/crossentropy": 2.4544249773025513, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17418567091226578, + "step": 14110 + }, + { + "epoch": 0.441, + "grad_norm": 3.21875, + "grad_norm_var": 0.03574930826822917, + "learning_rate": 0.0001, + "loss": 6.0332, + "loss/crossentropy": 2.6911864280700684, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1795118898153305, + "step": 14112 + }, + { + "epoch": 0.4410625, + "grad_norm": 3.328125, + "grad_norm_var": 0.0308258056640625, + "learning_rate": 0.0001, + "loss": 5.8747, + "loss/crossentropy": 2.593514323234558, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17772328853607178, + "step": 14114 + }, + { + "epoch": 0.441125, + "grad_norm": 3.59375, + "grad_norm_var": 0.04641011555989583, + "learning_rate": 0.0001, + "loss": 6.148, + "loss/crossentropy": 2.7083226442337036, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.18459460884332657, + "step": 14116 + }, + { + "epoch": 0.4411875, + "grad_norm": 3.328125, + "grad_norm_var": 0.04452718098958333, + "learning_rate": 0.0001, + "loss": 5.6375, + "loss/crossentropy": 2.4524585008621216, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16733208298683167, + "step": 14118 + }, + { + "epoch": 0.44125, + "grad_norm": 3.375, + "grad_norm_var": 0.0410064697265625, + "learning_rate": 0.0001, + "loss": 5.6476, + "loss/crossentropy": 2.442119598388672, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17016087472438812, + "step": 14120 + }, + { + "epoch": 0.4413125, + "grad_norm": 3.28125, + "grad_norm_var": 0.033935546875, + "learning_rate": 0.0001, + "loss": 5.9103, + "loss/crossentropy": 2.598555088043213, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1815623864531517, + "step": 14122 + }, + { + "epoch": 0.441375, + "grad_norm": 3.8125, + "grad_norm_var": 0.04830322265625, + "learning_rate": 0.0001, + "loss": 6.1144, + "loss/crossentropy": 2.685201048851013, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18784663081169128, + "step": 14124 + }, + { + "epoch": 0.4414375, + "grad_norm": 3.78125, + "grad_norm_var": 0.0636871337890625, + "learning_rate": 0.0001, + "loss": 6.1894, + "loss/crossentropy": 2.7855113744735718, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18647854775190353, + "step": 14126 + }, + { + "epoch": 0.4415, + "grad_norm": 3.1875, + "grad_norm_var": 0.06370035807291667, + "learning_rate": 0.0001, + "loss": 5.7917, + "loss/crossentropy": 2.6137852668762207, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16817715764045715, + "step": 14128 + }, + { + "epoch": 0.4415625, + "grad_norm": 3.484375, + "grad_norm_var": 0.0630035400390625, + "learning_rate": 0.0001, + "loss": 5.7611, + "loss/crossentropy": 2.582552433013916, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17059174180030823, + "step": 14130 + }, + { + "epoch": 0.441625, + "grad_norm": 3.28125, + "grad_norm_var": 0.05446675618489583, + "learning_rate": 0.0001, + "loss": 5.768, + "loss/crossentropy": 2.574445605278015, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.169355146586895, + "step": 14132 + }, + { + "epoch": 0.4416875, + "grad_norm": 3.765625, + "grad_norm_var": 0.06409098307291666, + "learning_rate": 0.0001, + "loss": 6.4481, + "loss/crossentropy": 2.8273682594299316, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.20699873566627502, + "step": 14134 + }, + { + "epoch": 0.44175, + "grad_norm": 3.28125, + "grad_norm_var": 0.06486002604166667, + "learning_rate": 0.0001, + "loss": 5.859, + "loss/crossentropy": 2.5572317838668823, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1786102056503296, + "step": 14136 + }, + { + "epoch": 0.4418125, + "grad_norm": 3.390625, + "grad_norm_var": 0.06030985514322917, + "learning_rate": 0.0001, + "loss": 5.9828, + "loss/crossentropy": 2.6731334924697876, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17706193029880524, + "step": 14138 + }, + { + "epoch": 0.441875, + "grad_norm": 2.921875, + "grad_norm_var": 0.05513407389322917, + "learning_rate": 0.0001, + "loss": 5.558, + "loss/crossentropy": 2.442023754119873, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1639430969953537, + "step": 14140 + }, + { + "epoch": 0.4419375, + "grad_norm": 3.25, + "grad_norm_var": 0.03472900390625, + "learning_rate": 0.0001, + "loss": 5.9016, + "loss/crossentropy": 2.5875269174575806, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1806221604347229, + "step": 14142 + }, + { + "epoch": 0.442, + "grad_norm": 3.171875, + "grad_norm_var": 0.0385162353515625, + "learning_rate": 0.0001, + "loss": 5.7166, + "loss/crossentropy": 2.5313161611557007, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17087139934301376, + "step": 14144 + }, + { + "epoch": 0.4420625, + "grad_norm": 3.078125, + "grad_norm_var": 0.0388336181640625, + "learning_rate": 0.0001, + "loss": 5.8014, + "loss/crossentropy": 2.5723941326141357, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17251383513212204, + "step": 14146 + }, + { + "epoch": 0.442125, + "grad_norm": 2.890625, + "grad_norm_var": 0.04780171712239583, + "learning_rate": 0.0001, + "loss": 5.5317, + "loss/crossentropy": 2.4141929149627686, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16487937420606613, + "step": 14148 + }, + { + "epoch": 0.4421875, + "grad_norm": 3.15625, + "grad_norm_var": 0.029195149739583332, + "learning_rate": 0.0001, + "loss": 6.1061, + "loss/crossentropy": 2.7535077333450317, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18135527521371841, + "step": 14150 + }, + { + "epoch": 0.44225, + "grad_norm": 3.3125, + "grad_norm_var": 0.025126139322916668, + "learning_rate": 0.0001, + "loss": 6.0045, + "loss/crossentropy": 2.7140774726867676, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17904505133628845, + "step": 14152 + }, + { + "epoch": 0.4423125, + "grad_norm": 3.390625, + "grad_norm_var": 0.023053995768229165, + "learning_rate": 0.0001, + "loss": 5.8206, + "loss/crossentropy": 2.5800344944000244, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1760055422782898, + "step": 14154 + }, + { + "epoch": 0.442375, + "grad_norm": 3.28125, + "grad_norm_var": 0.025130208333333334, + "learning_rate": 0.0001, + "loss": 5.9944, + "loss/crossentropy": 2.6406712532043457, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1779462918639183, + "step": 14156 + }, + { + "epoch": 0.4424375, + "grad_norm": 3.09375, + "grad_norm_var": 0.029271443684895832, + "learning_rate": 0.0001, + "loss": 5.7035, + "loss/crossentropy": 2.5728673934936523, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.166965052485466, + "step": 14158 + }, + { + "epoch": 0.4425, + "grad_norm": 4.375, + "grad_norm_var": 0.11597900390625, + "learning_rate": 0.0001, + "loss": 5.3905, + "loss/crossentropy": 2.2556103467941284, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.16036295890808105, + "step": 14160 + }, + { + "epoch": 0.4425625, + "grad_norm": 2.984375, + "grad_norm_var": 0.12603759765625, + "learning_rate": 0.0001, + "loss": 5.8177, + "loss/crossentropy": 2.564350128173828, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17455732077360153, + "step": 14162 + }, + { + "epoch": 0.442625, + "grad_norm": 3.109375, + "grad_norm_var": 0.11891276041666667, + "learning_rate": 0.0001, + "loss": 5.6852, + "loss/crossentropy": 2.5463815927505493, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16583866626024246, + "step": 14164 + }, + { + "epoch": 0.4426875, + "grad_norm": 3.3125, + "grad_norm_var": 0.11741434733072917, + "learning_rate": 0.0001, + "loss": 5.7215, + "loss/crossentropy": 2.581761956214905, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1620202511548996, + "step": 14166 + }, + { + "epoch": 0.44275, + "grad_norm": 3.3125, + "grad_norm_var": 0.12092997233072916, + "learning_rate": 0.0001, + "loss": 5.8324, + "loss/crossentropy": 2.656448006629944, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16837306320667267, + "step": 14168 + }, + { + "epoch": 0.4428125, + "grad_norm": 4.03125, + "grad_norm_var": 0.17285054524739582, + "learning_rate": 0.0001, + "loss": 6.0773, + "loss/crossentropy": 2.6987791061401367, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18590284883975983, + "step": 14170 + }, + { + "epoch": 0.442875, + "grad_norm": 3.375, + "grad_norm_var": 0.17138264973958334, + "learning_rate": 0.0001, + "loss": 5.8663, + "loss/crossentropy": 2.5615211725234985, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17579397559165955, + "step": 14172 + }, + { + "epoch": 0.4429375, + "grad_norm": 2.984375, + "grad_norm_var": 0.165185546875, + "learning_rate": 0.0001, + "loss": 5.4595, + "loss/crossentropy": 2.3379613161087036, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1648847609758377, + "step": 14174 + }, + { + "epoch": 0.443, + "grad_norm": 3.03125, + "grad_norm_var": 0.10237223307291667, + "learning_rate": 0.0001, + "loss": 5.8391, + "loss/crossentropy": 2.6235986948013306, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17272279411554337, + "step": 14176 + }, + { + "epoch": 0.4430625, + "grad_norm": 3.40625, + "grad_norm_var": 0.10217692057291666, + "learning_rate": 0.0001, + "loss": 5.7728, + "loss/crossentropy": 2.5508469343185425, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17219547182321548, + "step": 14178 + }, + { + "epoch": 0.443125, + "grad_norm": 3.125, + "grad_norm_var": 0.10484110514322917, + "learning_rate": 0.0001, + "loss": 5.6887, + "loss/crossentropy": 2.5842883586883545, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1651291698217392, + "step": 14180 + }, + { + "epoch": 0.4431875, + "grad_norm": 3.28125, + "grad_norm_var": 0.10510152180989583, + "learning_rate": 0.0001, + "loss": 5.6395, + "loss/crossentropy": 2.3743157386779785, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17573385685682297, + "step": 14182 + }, + { + "epoch": 0.44325, + "grad_norm": 3.078125, + "grad_norm_var": 0.10654195149739583, + "learning_rate": 0.0001, + "loss": 5.9045, + "loss/crossentropy": 2.655468225479126, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1764613389968872, + "step": 14184 + }, + { + "epoch": 0.4433125, + "grad_norm": 3.390625, + "grad_norm_var": 0.032938639322916664, + "learning_rate": 0.0001, + "loss": 5.9218, + "loss/crossentropy": 2.6279489994049072, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17703726887702942, + "step": 14186 + }, + { + "epoch": 0.443375, + "grad_norm": 3.234375, + "grad_norm_var": 0.024560546875, + "learning_rate": 0.0001, + "loss": 5.5574, + "loss/crossentropy": 2.424659490585327, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1656167060136795, + "step": 14188 + }, + { + "epoch": 0.4434375, + "grad_norm": 3.109375, + "grad_norm_var": 0.026334635416666665, + "learning_rate": 0.0001, + "loss": 5.778, + "loss/crossentropy": 2.502086043357849, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17289915680885315, + "step": 14190 + }, + { + "epoch": 0.4435, + "grad_norm": 3.484375, + "grad_norm_var": 0.030475870768229166, + "learning_rate": 0.0001, + "loss": 5.7241, + "loss/crossentropy": 2.526078701019287, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17097116261720657, + "step": 14192 + }, + { + "epoch": 0.4435625, + "grad_norm": 3.53125, + "grad_norm_var": 0.030777994791666666, + "learning_rate": 0.0001, + "loss": 5.8012, + "loss/crossentropy": 2.555241823196411, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17498234659433365, + "step": 14194 + }, + { + "epoch": 0.443625, + "grad_norm": 3.46875, + "grad_norm_var": 0.0344146728515625, + "learning_rate": 0.0001, + "loss": 6.0615, + "loss/crossentropy": 2.6597427129745483, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.1827550157904625, + "step": 14196 + }, + { + "epoch": 0.4436875, + "grad_norm": 3.328125, + "grad_norm_var": 0.03463134765625, + "learning_rate": 0.0001, + "loss": 5.9326, + "loss/crossentropy": 2.5371713638305664, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18329720944166183, + "step": 14198 + }, + { + "epoch": 0.44375, + "grad_norm": 3.328125, + "grad_norm_var": 0.0299957275390625, + "learning_rate": 0.0001, + "loss": 5.8034, + "loss/crossentropy": 2.5516685247421265, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17477956414222717, + "step": 14200 + }, + { + "epoch": 0.4438125, + "grad_norm": 3.015625, + "grad_norm_var": 0.035302734375, + "learning_rate": 0.0001, + "loss": 5.8454, + "loss/crossentropy": 2.634779691696167, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1726226583123207, + "step": 14202 + }, + { + "epoch": 0.443875, + "grad_norm": 3.09375, + "grad_norm_var": 0.0335357666015625, + "learning_rate": 0.0001, + "loss": 6.1434, + "loss/crossentropy": 2.7896809577941895, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18146494776010513, + "step": 14204 + }, + { + "epoch": 0.4439375, + "grad_norm": 3.40625, + "grad_norm_var": 0.03239644368489583, + "learning_rate": 0.0001, + "loss": 5.9345, + "loss/crossentropy": 2.648725152015686, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17779573053121567, + "step": 14206 + }, + { + "epoch": 0.444, + "grad_norm": 3.5, + "grad_norm_var": 0.034886678059895836, + "learning_rate": 0.0001, + "loss": 6.4137, + "loss/crossentropy": 3.0079147815704346, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18511473387479782, + "step": 14208 + }, + { + "epoch": 0.4440625, + "grad_norm": 3.1875, + "grad_norm_var": 0.0277984619140625, + "learning_rate": 0.0001, + "loss": 5.8442, + "loss/crossentropy": 2.6448980569839478, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1699315309524536, + "step": 14210 + }, + { + "epoch": 0.444125, + "grad_norm": 3.03125, + "grad_norm_var": 0.031078084309895834, + "learning_rate": 0.0001, + "loss": 5.5926, + "loss/crossentropy": 2.4751064777374268, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.165263831615448, + "step": 14212 + }, + { + "epoch": 0.4441875, + "grad_norm": 3.015625, + "grad_norm_var": 0.031037394205729166, + "learning_rate": 0.0001, + "loss": 5.5775, + "loss/crossentropy": 2.4614042043685913, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16707490384578705, + "step": 14214 + }, + { + "epoch": 0.44425, + "grad_norm": 3.421875, + "grad_norm_var": 0.20391337076822916, + "learning_rate": 0.0001, + "loss": 6.1049, + "loss/crossentropy": 2.6063095331192017, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.19205011427402496, + "step": 14216 + }, + { + "epoch": 0.4443125, + "grad_norm": 3.234375, + "grad_norm_var": 0.19436442057291667, + "learning_rate": 0.0001, + "loss": 5.9834, + "loss/crossentropy": 2.730865478515625, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17330465465784073, + "step": 14218 + }, + { + "epoch": 0.444375, + "grad_norm": 3.125, + "grad_norm_var": 0.22955322265625, + "learning_rate": 0.0001, + "loss": 5.673, + "loss/crossentropy": 2.423761010169983, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.173749141395092, + "step": 14220 + }, + { + "epoch": 0.4444375, + "grad_norm": 3.234375, + "grad_norm_var": 0.22981363932291668, + "learning_rate": 0.0001, + "loss": 5.8368, + "loss/crossentropy": 2.6157952547073364, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1728796511888504, + "step": 14222 + }, + { + "epoch": 0.4445, + "grad_norm": 3.1875, + "grad_norm_var": 0.23242899576822917, + "learning_rate": 0.0001, + "loss": 5.4949, + "loss/crossentropy": 2.4254437685012817, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.156550794839859, + "step": 14224 + }, + { + "epoch": 0.4445625, + "grad_norm": 3.484375, + "grad_norm_var": 0.22834879557291668, + "learning_rate": 0.0001, + "loss": 6.1521, + "loss/crossentropy": 2.844159722328186, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17884384095668793, + "step": 14226 + }, + { + "epoch": 0.444625, + "grad_norm": 3.40625, + "grad_norm_var": 0.22198893229166666, + "learning_rate": 0.0001, + "loss": 5.5103, + "loss/crossentropy": 2.2973939180374146, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.16777722537517548, + "step": 14228 + }, + { + "epoch": 0.4446875, + "grad_norm": 4.0625, + "grad_norm_var": 0.2211334228515625, + "learning_rate": 0.0001, + "loss": 5.6075, + "loss/crossentropy": 2.361604690551758, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17575780302286148, + "step": 14230 + }, + { + "epoch": 0.44475, + "grad_norm": 3.203125, + "grad_norm_var": 0.10288798014322917, + "learning_rate": 0.0001, + "loss": 5.7361, + "loss/crossentropy": 2.5314559936523438, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1708575189113617, + "step": 14232 + }, + { + "epoch": 0.4448125, + "grad_norm": 3.171875, + "grad_norm_var": 0.11469624837239584, + "learning_rate": 0.0001, + "loss": 5.8871, + "loss/crossentropy": 2.6339739561080933, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17453377693891525, + "step": 14234 + }, + { + "epoch": 0.444875, + "grad_norm": 3.359375, + "grad_norm_var": 0.07746480305989584, + "learning_rate": 0.0001, + "loss": 5.7914, + "loss/crossentropy": 2.5673142671585083, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17202017456293106, + "step": 14236 + }, + { + "epoch": 0.4449375, + "grad_norm": 3.25, + "grad_norm_var": 0.08157145182291667, + "learning_rate": 0.0001, + "loss": 5.7828, + "loss/crossentropy": 2.604298233985901, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1666785031557083, + "step": 14238 + }, + { + "epoch": 0.445, + "grad_norm": 3.953125, + "grad_norm_var": 0.29511617024739584, + "learning_rate": 0.0001, + "loss": 5.6492, + "loss/crossentropy": 2.318596124649048, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.17250914126634598, + "step": 14240 + }, + { + "epoch": 0.4450625, + "grad_norm": 3.40625, + "grad_norm_var": 0.30865478515625, + "learning_rate": 0.0001, + "loss": 5.6779, + "loss/crossentropy": 2.496907114982605, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16926919668912888, + "step": 14242 + }, + { + "epoch": 0.445125, + "grad_norm": 3.546875, + "grad_norm_var": 0.2950103759765625, + "learning_rate": 0.0001, + "loss": 5.8451, + "loss/crossentropy": 2.5783416032791138, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17706424742937088, + "step": 14244 + }, + { + "epoch": 0.4451875, + "grad_norm": 3.390625, + "grad_norm_var": 0.27116597493489586, + "learning_rate": 0.0001, + "loss": 6.032, + "loss/crossentropy": 2.7022446393966675, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18102285265922546, + "step": 14246 + }, + { + "epoch": 0.44525, + "grad_norm": 3.296875, + "grad_norm_var": 0.268603515625, + "learning_rate": 0.0001, + "loss": 5.6267, + "loss/crossentropy": 2.4022055864334106, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.16854028403759003, + "step": 14248 + }, + { + "epoch": 0.4453125, + "grad_norm": 3.078125, + "grad_norm_var": 0.2626261393229167, + "learning_rate": 0.0001, + "loss": 5.7496, + "loss/crossentropy": 2.60064959526062, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16724015772342682, + "step": 14250 + }, + { + "epoch": 0.445375, + "grad_norm": 3.40625, + "grad_norm_var": 0.26174723307291664, + "learning_rate": 0.0001, + "loss": 5.8662, + "loss/crossentropy": 2.637366533279419, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17093206942081451, + "step": 14252 + }, + { + "epoch": 0.4454375, + "grad_norm": 3.328125, + "grad_norm_var": 0.24931538899739583, + "learning_rate": 0.0001, + "loss": 6.0655, + "loss/crossentropy": 2.6863538026809692, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18322216719388962, + "step": 14254 + }, + { + "epoch": 0.4455, + "grad_norm": 3.140625, + "grad_norm_var": 0.029230753580729168, + "learning_rate": 0.0001, + "loss": 5.8473, + "loss/crossentropy": 2.6832687854766846, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16796649247407913, + "step": 14256 + }, + { + "epoch": 0.4455625, + "grad_norm": 3.296875, + "grad_norm_var": 0.0343414306640625, + "learning_rate": 0.0001, + "loss": 5.6471, + "loss/crossentropy": 2.428324341773987, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.16835793107748032, + "step": 14258 + }, + { + "epoch": 0.445625, + "grad_norm": 3.09375, + "grad_norm_var": 0.03557535807291667, + "learning_rate": 0.0001, + "loss": 5.6099, + "loss/crossentropy": 2.470323920249939, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16512752324342728, + "step": 14260 + }, + { + "epoch": 0.4456875, + "grad_norm": 3.296875, + "grad_norm_var": 0.039460245768229166, + "learning_rate": 0.0001, + "loss": 5.7145, + "loss/crossentropy": 2.581887722015381, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16717243194580078, + "step": 14262 + }, + { + "epoch": 0.44575, + "grad_norm": 3.140625, + "grad_norm_var": 0.039915974934895834, + "learning_rate": 0.0001, + "loss": 5.6021, + "loss/crossentropy": 2.4398629665374756, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16818110644817352, + "step": 14264 + }, + { + "epoch": 0.4458125, + "grad_norm": 3.140625, + "grad_norm_var": 0.03868815104166667, + "learning_rate": 0.0001, + "loss": 5.7567, + "loss/crossentropy": 2.552677869796753, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17118756473064423, + "step": 14266 + }, + { + "epoch": 0.445875, + "grad_norm": 3.046875, + "grad_norm_var": 0.04176025390625, + "learning_rate": 0.0001, + "loss": 5.9116, + "loss/crossentropy": 2.7044728994369507, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17032155394554138, + "step": 14268 + }, + { + "epoch": 0.4459375, + "grad_norm": 3.140625, + "grad_norm_var": 0.03943684895833333, + "learning_rate": 0.0001, + "loss": 5.7551, + "loss/crossentropy": 2.523730158805847, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17352360486984253, + "step": 14270 + }, + { + "epoch": 0.446, + "grad_norm": 3.109375, + "grad_norm_var": 0.03864644368489583, + "learning_rate": 0.0001, + "loss": 5.7731, + "loss/crossentropy": 2.597599744796753, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16911684721708298, + "step": 14272 + }, + { + "epoch": 0.4460625, + "grad_norm": 3.0625, + "grad_norm_var": 0.019319661458333335, + "learning_rate": 0.0001, + "loss": 5.6792, + "loss/crossentropy": 2.4801100492477417, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1699041873216629, + "step": 14274 + }, + { + "epoch": 0.446125, + "grad_norm": 3.125, + "grad_norm_var": 0.019090779622395835, + "learning_rate": 0.0001, + "loss": 5.739, + "loss/crossentropy": 2.5322024822235107, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1695106029510498, + "step": 14276 + }, + { + "epoch": 0.4461875, + "grad_norm": 3.46875, + "grad_norm_var": 0.023758951822916666, + "learning_rate": 0.0001, + "loss": 6.1333, + "loss/crossentropy": 2.7846380472183228, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1817398965358734, + "step": 14278 + }, + { + "epoch": 0.44625, + "grad_norm": 3.40625, + "grad_norm_var": 0.0288238525390625, + "learning_rate": 0.0001, + "loss": 5.7624, + "loss/crossentropy": 2.5464032888412476, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17277182638645172, + "step": 14280 + }, + { + "epoch": 0.4463125, + "grad_norm": 3.3125, + "grad_norm_var": 0.0303863525390625, + "learning_rate": 0.0001, + "loss": 6.0514, + "loss/crossentropy": 2.7622212171554565, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17579232156276703, + "step": 14282 + }, + { + "epoch": 0.446375, + "grad_norm": 3.015625, + "grad_norm_var": 0.04712626139322917, + "learning_rate": 0.0001, + "loss": 5.8248, + "loss/crossentropy": 2.565149188041687, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17713207006454468, + "step": 14284 + }, + { + "epoch": 0.4464375, + "grad_norm": 3.0625, + "grad_norm_var": 0.039599609375, + "learning_rate": 0.0001, + "loss": 5.6326, + "loss/crossentropy": 2.4806195497512817, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16832613199949265, + "step": 14286 + }, + { + "epoch": 0.4465, + "grad_norm": 3.21875, + "grad_norm_var": 0.0372467041015625, + "learning_rate": 0.0001, + "loss": 5.7442, + "loss/crossentropy": 2.5185790061950684, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17217542231082916, + "step": 14288 + }, + { + "epoch": 0.4465625, + "grad_norm": 3.0, + "grad_norm_var": 0.03753255208333333, + "learning_rate": 0.0001, + "loss": 5.4591, + "loss/crossentropy": 2.3245084285736084, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16423750668764114, + "step": 14290 + }, + { + "epoch": 0.446625, + "grad_norm": 3.1875, + "grad_norm_var": 0.03570963541666667, + "learning_rate": 0.0001, + "loss": 5.8494, + "loss/crossentropy": 2.6422178745269775, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17149612307548523, + "step": 14292 + }, + { + "epoch": 0.4466875, + "grad_norm": 3.5, + "grad_norm_var": 0.03876953125, + "learning_rate": 0.0001, + "loss": 5.6575, + "loss/crossentropy": 2.4422398805618286, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16957762837409973, + "step": 14294 + }, + { + "epoch": 0.44675, + "grad_norm": 2.984375, + "grad_norm_var": 0.038792928059895836, + "learning_rate": 0.0001, + "loss": 5.6236, + "loss/crossentropy": 2.4454299211502075, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16898469626903534, + "step": 14296 + }, + { + "epoch": 0.4468125, + "grad_norm": 3.671875, + "grad_norm_var": 0.0551910400390625, + "learning_rate": 0.0001, + "loss": 6.2519, + "loss/crossentropy": 2.8622608184814453, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1850590854883194, + "step": 14298 + }, + { + "epoch": 0.446875, + "grad_norm": 3.21875, + "grad_norm_var": 0.0386627197265625, + "learning_rate": 0.0001, + "loss": 5.6949, + "loss/crossentropy": 2.469943046569824, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1736626774072647, + "step": 14300 + }, + { + "epoch": 0.4469375, + "grad_norm": 3.578125, + "grad_norm_var": 0.0585357666015625, + "learning_rate": 0.0001, + "loss": 6.1595, + "loss/crossentropy": 2.8455876111984253, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1767076700925827, + "step": 14302 + }, + { + "epoch": 0.447, + "grad_norm": 3.0625, + "grad_norm_var": 0.0621490478515625, + "learning_rate": 0.0001, + "loss": 5.379, + "loss/crossentropy": 2.3277957439422607, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.15551482141017914, + "step": 14304 + }, + { + "epoch": 0.4470625, + "grad_norm": 3.484375, + "grad_norm_var": 0.06354166666666666, + "learning_rate": 0.0001, + "loss": 6.1249, + "loss/crossentropy": 2.7531386613845825, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18014255166053772, + "step": 14306 + }, + { + "epoch": 0.447125, + "grad_norm": 3.375, + "grad_norm_var": 0.06317952473958334, + "learning_rate": 0.0001, + "loss": 5.9383, + "loss/crossentropy": 2.638308882713318, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1784343644976616, + "step": 14308 + }, + { + "epoch": 0.4471875, + "grad_norm": 3.0625, + "grad_norm_var": 0.0624420166015625, + "learning_rate": 0.0001, + "loss": 5.4067, + "loss/crossentropy": 2.3553584814071655, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15981775522232056, + "step": 14310 + }, + { + "epoch": 0.44725, + "grad_norm": 3.453125, + "grad_norm_var": 0.059619140625, + "learning_rate": 0.0001, + "loss": 5.899, + "loss/crossentropy": 2.617624044418335, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17931316792964935, + "step": 14312 + }, + { + "epoch": 0.4473125, + "grad_norm": 3.015625, + "grad_norm_var": 0.05097554524739583, + "learning_rate": 0.0001, + "loss": 5.591, + "loss/crossentropy": 2.4188079833984375, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16916995495557785, + "step": 14314 + }, + { + "epoch": 0.447375, + "grad_norm": 3.609375, + "grad_norm_var": 0.05291341145833333, + "learning_rate": 0.0001, + "loss": 6.1025, + "loss/crossentropy": 2.644850015640259, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.1852177530527115, + "step": 14316 + }, + { + "epoch": 0.4474375, + "grad_norm": 3.1875, + "grad_norm_var": 0.0364654541015625, + "learning_rate": 0.0001, + "loss": 5.6787, + "loss/crossentropy": 2.505561351776123, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1673169583082199, + "step": 14318 + }, + { + "epoch": 0.4475, + "grad_norm": 4.1875, + "grad_norm_var": 0.08759358723958334, + "learning_rate": 0.0001, + "loss": 6.1003, + "loss/crossentropy": 2.6329729557037354, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19009672850370407, + "step": 14320 + }, + { + "epoch": 0.4475625, + "grad_norm": 3.34375, + "grad_norm_var": 0.08820699055989584, + "learning_rate": 0.0001, + "loss": 5.4881, + "loss/crossentropy": 2.3465631008148193, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1645406112074852, + "step": 14322 + }, + { + "epoch": 0.447625, + "grad_norm": 3.046875, + "grad_norm_var": 0.18772379557291666, + "learning_rate": 0.0001, + "loss": 5.7843, + "loss/crossentropy": 2.445573091506958, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.17684581130743027, + "step": 14324 + }, + { + "epoch": 0.4476875, + "grad_norm": 3.109375, + "grad_norm_var": 0.18263346354166668, + "learning_rate": 0.0001, + "loss": 5.9039, + "loss/crossentropy": 2.651862382888794, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1755986288189888, + "step": 14326 + }, + { + "epoch": 0.44775, + "grad_norm": 3.015625, + "grad_norm_var": 0.1947662353515625, + "learning_rate": 0.0001, + "loss": 5.6506, + "loss/crossentropy": 2.5213125944137573, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1641010344028473, + "step": 14328 + }, + { + "epoch": 0.4478125, + "grad_norm": 3.3125, + "grad_norm_var": 0.18586832682291668, + "learning_rate": 0.0001, + "loss": 5.4415, + "loss/crossentropy": 2.296280264854431, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16491208970546722, + "step": 14330 + }, + { + "epoch": 0.447875, + "grad_norm": 3.65625, + "grad_norm_var": 0.18569234212239583, + "learning_rate": 0.0001, + "loss": 5.7038, + "loss/crossentropy": 2.455007314682007, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17409540712833405, + "step": 14332 + }, + { + "epoch": 0.4479375, + "grad_norm": 3.375, + "grad_norm_var": 0.17942301432291666, + "learning_rate": 0.0001, + "loss": 5.9901, + "loss/crossentropy": 2.6195307970046997, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18393173068761826, + "step": 14334 + }, + { + "epoch": 0.448, + "grad_norm": 4.84375, + "grad_norm_var": 0.2781646728515625, + "learning_rate": 0.0001, + "loss": 6.1805, + "loss/crossentropy": 2.726317882537842, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.18682335317134857, + "step": 14336 + }, + { + "epoch": 0.4480625, + "grad_norm": 3.359375, + "grad_norm_var": 0.2736612955729167, + "learning_rate": 0.0001, + "loss": 5.9181, + "loss/crossentropy": 2.6938018798828125, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17165353149175644, + "step": 14338 + }, + { + "epoch": 0.448125, + "grad_norm": 3.609375, + "grad_norm_var": 0.18782145182291668, + "learning_rate": 0.0001, + "loss": 5.8565, + "loss/crossentropy": 2.4743123054504395, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18587610125541687, + "step": 14340 + }, + { + "epoch": 0.4481875, + "grad_norm": 3.1875, + "grad_norm_var": 0.1931060791015625, + "learning_rate": 0.0001, + "loss": 5.6905, + "loss/crossentropy": 2.4889925718307495, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17405406385660172, + "step": 14342 + }, + { + "epoch": 0.44825, + "grad_norm": 3.375, + "grad_norm_var": 0.17135009765625, + "learning_rate": 0.0001, + "loss": 5.8152, + "loss/crossentropy": 2.4915691614151, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1768980622291565, + "step": 14344 + }, + { + "epoch": 0.4483125, + "grad_norm": 3.09375, + "grad_norm_var": 0.17325846354166666, + "learning_rate": 0.0001, + "loss": 6.1619, + "loss/crossentropy": 2.8394335508346558, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1802908331155777, + "step": 14346 + }, + { + "epoch": 0.448375, + "grad_norm": 3.28125, + "grad_norm_var": 0.18095703125, + "learning_rate": 0.0001, + "loss": 5.775, + "loss/crossentropy": 2.5895841121673584, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17049019038677216, + "step": 14348 + }, + { + "epoch": 0.4484375, + "grad_norm": 3.015625, + "grad_norm_var": 0.2006011962890625, + "learning_rate": 0.0001, + "loss": 5.4213, + "loss/crossentropy": 2.3961212635040283, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.15330223739147186, + "step": 14350 + }, + { + "epoch": 0.4485, + "grad_norm": 3.125, + "grad_norm_var": 0.0506744384765625, + "learning_rate": 0.0001, + "loss": 5.909, + "loss/crossentropy": 2.655321955680847, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17575698345899582, + "step": 14352 + }, + { + "epoch": 0.4485625, + "grad_norm": 3.15625, + "grad_norm_var": 0.05048828125, + "learning_rate": 0.0001, + "loss": 5.601, + "loss/crossentropy": 2.3765926361083984, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17400510609149933, + "step": 14354 + }, + { + "epoch": 0.448625, + "grad_norm": 3.25, + "grad_norm_var": 0.04390360514322917, + "learning_rate": 0.0001, + "loss": 5.6403, + "loss/crossentropy": 2.4661710262298584, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1674160286784172, + "step": 14356 + }, + { + "epoch": 0.4486875, + "grad_norm": 3.90625, + "grad_norm_var": 0.07631734212239584, + "learning_rate": 0.0001, + "loss": 6.0283, + "loss/crossentropy": 2.7284250259399414, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.178815595805645, + "step": 14358 + }, + { + "epoch": 0.44875, + "grad_norm": 3.484375, + "grad_norm_var": 0.07952473958333334, + "learning_rate": 0.0001, + "loss": 5.9497, + "loss/crossentropy": 2.676850438117981, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17611093819141388, + "step": 14360 + }, + { + "epoch": 0.4488125, + "grad_norm": 3.640625, + "grad_norm_var": 0.14666239420572916, + "learning_rate": 0.0001, + "loss": 6.2483, + "loss/crossentropy": 2.6903653144836426, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.19564007222652435, + "step": 14362 + }, + { + "epoch": 0.448875, + "grad_norm": 2.984375, + "grad_norm_var": 0.1464263916015625, + "learning_rate": 0.0001, + "loss": 5.3807, + "loss/crossentropy": 2.291722297668457, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.15889935940504074, + "step": 14364 + }, + { + "epoch": 0.4489375, + "grad_norm": 3.0, + "grad_norm_var": 0.13606363932291668, + "learning_rate": 0.0001, + "loss": 5.8725, + "loss/crossentropy": 2.586019277572632, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1763058304786682, + "step": 14366 + }, + { + "epoch": 0.449, + "grad_norm": 3.328125, + "grad_norm_var": 0.127197265625, + "learning_rate": 0.0001, + "loss": 6.1236, + "loss/crossentropy": 2.698602795600891, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18781539052724838, + "step": 14368 + }, + { + "epoch": 0.4490625, + "grad_norm": 3.15625, + "grad_norm_var": 0.1311920166015625, + "learning_rate": 0.0001, + "loss": 5.7755, + "loss/crossentropy": 2.5386215448379517, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17212378978729248, + "step": 14370 + }, + { + "epoch": 0.449125, + "grad_norm": 3.21875, + "grad_norm_var": 0.12361551920572916, + "learning_rate": 0.0001, + "loss": 5.7539, + "loss/crossentropy": 2.5283440351486206, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17450423538684845, + "step": 14372 + }, + { + "epoch": 0.4491875, + "grad_norm": 3.328125, + "grad_norm_var": 0.10881754557291666, + "learning_rate": 0.0001, + "loss": 5.8751, + "loss/crossentropy": 2.707939386367798, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16827435791492462, + "step": 14374 + }, + { + "epoch": 0.44925, + "grad_norm": 3.078125, + "grad_norm_var": 0.12314351399739583, + "learning_rate": 0.0001, + "loss": 5.5668, + "loss/crossentropy": 2.465551018714905, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16169127821922302, + "step": 14376 + }, + { + "epoch": 0.4493125, + "grad_norm": 3.328125, + "grad_norm_var": 0.043017578125, + "learning_rate": 0.0001, + "loss": 5.874, + "loss/crossentropy": 2.6843870878219604, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16817626357078552, + "step": 14378 + }, + { + "epoch": 0.449375, + "grad_norm": 2.953125, + "grad_norm_var": 0.044189453125, + "learning_rate": 0.0001, + "loss": 5.7486, + "loss/crossentropy": 2.5721516609191895, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1688157320022583, + "step": 14380 + }, + { + "epoch": 0.4494375, + "grad_norm": 2.984375, + "grad_norm_var": 0.03626302083333333, + "learning_rate": 0.0001, + "loss": 5.5518, + "loss/crossentropy": 2.4162875413894653, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16706334054470062, + "step": 14382 + }, + { + "epoch": 0.4495, + "grad_norm": 3.421875, + "grad_norm_var": 0.03835347493489583, + "learning_rate": 0.0001, + "loss": 6.1768, + "loss/crossentropy": 2.718705654144287, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.19034168124198914, + "step": 14384 + }, + { + "epoch": 0.4495625, + "grad_norm": 3.125, + "grad_norm_var": 0.038374837239583334, + "learning_rate": 0.0001, + "loss": 5.7778, + "loss/crossentropy": 2.5771600008010864, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1735822558403015, + "step": 14386 + }, + { + "epoch": 0.449625, + "grad_norm": 3.3125, + "grad_norm_var": 0.04042867024739583, + "learning_rate": 0.0001, + "loss": 5.9021, + "loss/crossentropy": 2.5540201663970947, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18128766119480133, + "step": 14388 + }, + { + "epoch": 0.4496875, + "grad_norm": 3.03125, + "grad_norm_var": 0.0366607666015625, + "learning_rate": 0.0001, + "loss": 5.7921, + "loss/crossentropy": 2.554555296897888, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.174534872174263, + "step": 14390 + }, + { + "epoch": 0.44975, + "grad_norm": 3.21875, + "grad_norm_var": 0.02828369140625, + "learning_rate": 0.0001, + "loss": 5.5898, + "loss/crossentropy": 2.409899115562439, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1640855148434639, + "step": 14392 + }, + { + "epoch": 0.4498125, + "grad_norm": 2.90625, + "grad_norm_var": 0.027994791666666668, + "learning_rate": 0.0001, + "loss": 5.6268, + "loss/crossentropy": 2.5423814058303833, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16039558500051498, + "step": 14394 + }, + { + "epoch": 0.449875, + "grad_norm": 3.40625, + "grad_norm_var": 0.026594034830729165, + "learning_rate": 0.0001, + "loss": 5.8761, + "loss/crossentropy": 2.5717968940734863, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17613419890403748, + "step": 14396 + }, + { + "epoch": 0.4499375, + "grad_norm": 3.4375, + "grad_norm_var": 0.02593994140625, + "learning_rate": 0.0001, + "loss": 6.0095, + "loss/crossentropy": 2.6781710386276245, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18118468672037125, + "step": 14398 + }, + { + "epoch": 0.45, + "grad_norm": 3.03125, + "grad_norm_var": 0.021940104166666665, + "learning_rate": 0.0001, + "loss": 5.9548, + "loss/crossentropy": 2.715733528137207, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17351695150136948, + "step": 14400 + }, + { + "epoch": 0.4500625, + "grad_norm": 2.875, + "grad_norm_var": 0.029938761393229166, + "learning_rate": 0.0001, + "loss": 5.5577, + "loss/crossentropy": 2.4219359159469604, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16474615037441254, + "step": 14402 + }, + { + "epoch": 0.450125, + "grad_norm": 3.5, + "grad_norm_var": 0.0500152587890625, + "learning_rate": 0.0001, + "loss": 5.6615, + "loss/crossentropy": 2.498612403869629, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1705838069319725, + "step": 14404 + }, + { + "epoch": 0.4501875, + "grad_norm": 3.171875, + "grad_norm_var": 0.05266520182291667, + "learning_rate": 0.0001, + "loss": 5.7041, + "loss/crossentropy": 2.556857466697693, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16901937872171402, + "step": 14406 + }, + { + "epoch": 0.45025, + "grad_norm": 3.234375, + "grad_norm_var": 0.10281473795572917, + "learning_rate": 0.0001, + "loss": 5.8924, + "loss/crossentropy": 2.53371000289917, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18508557230234146, + "step": 14408 + }, + { + "epoch": 0.4503125, + "grad_norm": 3.015625, + "grad_norm_var": 0.0964752197265625, + "learning_rate": 0.0001, + "loss": 5.8211, + "loss/crossentropy": 2.5924088954925537, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17208484560251236, + "step": 14410 + }, + { + "epoch": 0.450375, + "grad_norm": 3.171875, + "grad_norm_var": 0.09641825358072917, + "learning_rate": 0.0001, + "loss": 5.8779, + "loss/crossentropy": 2.6541298627853394, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1723785549402237, + "step": 14412 + }, + { + "epoch": 0.4504375, + "grad_norm": 3.3125, + "grad_norm_var": 0.10357666015625, + "learning_rate": 0.0001, + "loss": 5.9689, + "loss/crossentropy": 2.6106170415878296, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18426914513111115, + "step": 14414 + }, + { + "epoch": 0.4505, + "grad_norm": 3.296875, + "grad_norm_var": 0.09934794108072917, + "learning_rate": 0.0001, + "loss": 5.7839, + "loss/crossentropy": 2.536181330680847, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17437804490327835, + "step": 14416 + }, + { + "epoch": 0.4505625, + "grad_norm": 3.25, + "grad_norm_var": 0.08518473307291667, + "learning_rate": 0.0001, + "loss": 5.5786, + "loss/crossentropy": 2.436997413635254, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1645461916923523, + "step": 14418 + }, + { + "epoch": 0.450625, + "grad_norm": 3.34375, + "grad_norm_var": 0.074755859375, + "learning_rate": 0.0001, + "loss": 6.0638, + "loss/crossentropy": 2.716366171836853, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18161842226982117, + "step": 14420 + }, + { + "epoch": 0.4506875, + "grad_norm": 3.578125, + "grad_norm_var": 0.06943359375, + "learning_rate": 0.0001, + "loss": 6.0411, + "loss/crossentropy": 2.7519595623016357, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17852392047643661, + "step": 14422 + }, + { + "epoch": 0.45075, + "grad_norm": 3.375, + "grad_norm_var": 0.028271484375, + "learning_rate": 0.0001, + "loss": 6.0271, + "loss/crossentropy": 2.6504757404327393, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18297728896141052, + "step": 14424 + }, + { + "epoch": 0.4508125, + "grad_norm": 3.03125, + "grad_norm_var": 0.0276031494140625, + "learning_rate": 0.0001, + "loss": 5.7918, + "loss/crossentropy": 2.558031678199768, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1741553172469139, + "step": 14426 + }, + { + "epoch": 0.450875, + "grad_norm": 3.015625, + "grad_norm_var": 0.031998697916666666, + "learning_rate": 0.0001, + "loss": 5.8355, + "loss/crossentropy": 2.606409192085266, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17485878616571426, + "step": 14428 + }, + { + "epoch": 0.4509375, + "grad_norm": 3.078125, + "grad_norm_var": 0.0318756103515625, + "learning_rate": 0.0001, + "loss": 5.6906, + "loss/crossentropy": 2.5467242002487183, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16516976058483124, + "step": 14430 + }, + { + "epoch": 0.451, + "grad_norm": 3.21875, + "grad_norm_var": 0.028587849934895833, + "learning_rate": 0.0001, + "loss": 6.0052, + "loss/crossentropy": 2.616430401802063, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1857563853263855, + "step": 14432 + }, + { + "epoch": 0.4510625, + "grad_norm": 2.96875, + "grad_norm_var": 0.03583882649739583, + "learning_rate": 0.0001, + "loss": 5.4648, + "loss/crossentropy": 2.3175641298294067, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16315650194883347, + "step": 14434 + }, + { + "epoch": 0.451125, + "grad_norm": 3.25, + "grad_norm_var": 0.039484659830729164, + "learning_rate": 0.0001, + "loss": 5.7707, + "loss/crossentropy": 2.580053687095642, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17062243074178696, + "step": 14436 + }, + { + "epoch": 0.4511875, + "grad_norm": 3.015625, + "grad_norm_var": 0.030464680989583333, + "learning_rate": 0.0001, + "loss": 5.638, + "loss/crossentropy": 2.5637532472610474, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1617240235209465, + "step": 14438 + }, + { + "epoch": 0.45125, + "grad_norm": 3.234375, + "grad_norm_var": 0.02828369140625, + "learning_rate": 0.0001, + "loss": 5.7075, + "loss/crossentropy": 2.5479713678359985, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16946709156036377, + "step": 14440 + }, + { + "epoch": 0.4513125, + "grad_norm": 3.140625, + "grad_norm_var": 0.025927734375, + "learning_rate": 0.0001, + "loss": 5.6683, + "loss/crossentropy": 2.513846278190613, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1701308637857437, + "step": 14442 + }, + { + "epoch": 0.451375, + "grad_norm": 3.046875, + "grad_norm_var": 0.03837890625, + "learning_rate": 0.0001, + "loss": 5.8887, + "loss/crossentropy": 2.6817877292633057, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.16756878793239594, + "step": 14444 + }, + { + "epoch": 0.4514375, + "grad_norm": 3.046875, + "grad_norm_var": 0.03560791015625, + "learning_rate": 0.0001, + "loss": 5.4545, + "loss/crossentropy": 2.336892008781433, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.164496548473835, + "step": 14446 + }, + { + "epoch": 0.4515, + "grad_norm": 3.625, + "grad_norm_var": 0.05224202473958333, + "learning_rate": 0.0001, + "loss": 5.8803, + "loss/crossentropy": 2.680983781814575, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17149503529071808, + "step": 14448 + }, + { + "epoch": 0.4515625, + "grad_norm": 4.25, + "grad_norm_var": 0.10862223307291667, + "learning_rate": 0.0001, + "loss": 5.9772, + "loss/crossentropy": 2.5449445247650146, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1881500631570816, + "step": 14450 + }, + { + "epoch": 0.451625, + "grad_norm": 3.3125, + "grad_norm_var": 0.09728190104166666, + "learning_rate": 0.0001, + "loss": 5.9131, + "loss/crossentropy": 2.637911319732666, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17634259909391403, + "step": 14452 + }, + { + "epoch": 0.4516875, + "grad_norm": 3.328125, + "grad_norm_var": 0.17082926432291667, + "learning_rate": 0.0001, + "loss": 6.0235, + "loss/crossentropy": 2.6543599367141724, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18496477603912354, + "step": 14454 + }, + { + "epoch": 0.45175, + "grad_norm": 3.1875, + "grad_norm_var": 0.17464192708333334, + "learning_rate": 0.0001, + "loss": 5.9006, + "loss/crossentropy": 2.6123616695404053, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17726413905620575, + "step": 14456 + }, + { + "epoch": 0.4518125, + "grad_norm": 3.109375, + "grad_norm_var": 0.17293294270833334, + "learning_rate": 0.0001, + "loss": 5.566, + "loss/crossentropy": 2.388457775115967, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1669737845659256, + "step": 14458 + }, + { + "epoch": 0.451875, + "grad_norm": 2.859375, + "grad_norm_var": 0.18325093587239583, + "learning_rate": 0.0001, + "loss": 5.6086, + "loss/crossentropy": 2.465468168258667, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16470561921596527, + "step": 14460 + }, + { + "epoch": 0.4519375, + "grad_norm": 2.890625, + "grad_norm_var": 0.203955078125, + "learning_rate": 0.0001, + "loss": 5.5544, + "loss/crossentropy": 2.454023003578186, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16394591331481934, + "step": 14462 + }, + { + "epoch": 0.452, + "grad_norm": 3.046875, + "grad_norm_var": 0.2122222900390625, + "learning_rate": 0.0001, + "loss": 5.9464, + "loss/crossentropy": 2.717406749725342, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1760205179452896, + "step": 14464 + }, + { + "epoch": 0.4520625, + "grad_norm": 3.015625, + "grad_norm_var": 0.15450846354166667, + "learning_rate": 0.0001, + "loss": 5.7106, + "loss/crossentropy": 2.5401346683502197, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1674344539642334, + "step": 14466 + }, + { + "epoch": 0.452125, + "grad_norm": 3.296875, + "grad_norm_var": 0.15325419108072916, + "learning_rate": 0.0001, + "loss": 5.7594, + "loss/crossentropy": 2.5550618171691895, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1720004305243492, + "step": 14468 + }, + { + "epoch": 0.4521875, + "grad_norm": 3.03125, + "grad_norm_var": 0.025145467122395834, + "learning_rate": 0.0001, + "loss": 5.714, + "loss/crossentropy": 2.555497169494629, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16545791178941727, + "step": 14470 + }, + { + "epoch": 0.45225, + "grad_norm": 3.5625, + "grad_norm_var": 0.03733622233072917, + "learning_rate": 0.0001, + "loss": 5.6401, + "loss/crossentropy": 2.42263126373291, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16979046165943146, + "step": 14472 + }, + { + "epoch": 0.4523125, + "grad_norm": 3.3125, + "grad_norm_var": 0.04156494140625, + "learning_rate": 0.0001, + "loss": 5.4641, + "loss/crossentropy": 2.412516951560974, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1578877568244934, + "step": 14474 + }, + { + "epoch": 0.452375, + "grad_norm": 3.234375, + "grad_norm_var": 0.03616129557291667, + "learning_rate": 0.0001, + "loss": 5.9363, + "loss/crossentropy": 2.5886703729629517, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1820259764790535, + "step": 14476 + }, + { + "epoch": 0.4524375, + "grad_norm": 3.546875, + "grad_norm_var": 0.04942118326822917, + "learning_rate": 0.0001, + "loss": 6.0269, + "loss/crossentropy": 2.652520537376404, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1835315078496933, + "step": 14478 + }, + { + "epoch": 0.4525, + "grad_norm": 3.4375, + "grad_norm_var": 0.0476715087890625, + "learning_rate": 0.0001, + "loss": 6.018, + "loss/crossentropy": 2.7243393659591675, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1797575280070305, + "step": 14480 + }, + { + "epoch": 0.4525625, + "grad_norm": 2.90625, + "grad_norm_var": 0.053173828125, + "learning_rate": 0.0001, + "loss": 5.7898, + "loss/crossentropy": 2.6540274620056152, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16631050407886505, + "step": 14482 + }, + { + "epoch": 0.452625, + "grad_norm": 3.34375, + "grad_norm_var": 0.053873697916666664, + "learning_rate": 0.0001, + "loss": 5.7773, + "loss/crossentropy": 2.558873176574707, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17457697540521622, + "step": 14484 + }, + { + "epoch": 0.4526875, + "grad_norm": 2.984375, + "grad_norm_var": 0.05115458170572917, + "learning_rate": 0.0001, + "loss": 5.7279, + "loss/crossentropy": 2.600783586502075, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1646689623594284, + "step": 14486 + }, + { + "epoch": 0.45275, + "grad_norm": 3.5625, + "grad_norm_var": 0.11155192057291667, + "learning_rate": 0.0001, + "loss": 5.9461, + "loss/crossentropy": 2.5118407011032104, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.18287695944309235, + "step": 14488 + }, + { + "epoch": 0.4528125, + "grad_norm": 3.390625, + "grad_norm_var": 0.10028889973958334, + "learning_rate": 0.0001, + "loss": 6.1353, + "loss/crossentropy": 2.701886773109436, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1933390125632286, + "step": 14490 + }, + { + "epoch": 0.452875, + "grad_norm": 3.0, + "grad_norm_var": 0.11120503743489583, + "learning_rate": 0.0001, + "loss": 5.2824, + "loss/crossentropy": 2.273401975631714, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.15051215142011642, + "step": 14492 + }, + { + "epoch": 0.4529375, + "grad_norm": 2.984375, + "grad_norm_var": 0.10927327473958333, + "learning_rate": 0.0001, + "loss": 5.662, + "loss/crossentropy": 2.533039927482605, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16446173936128616, + "step": 14494 + }, + { + "epoch": 0.453, + "grad_norm": 3.15625, + "grad_norm_var": 0.1080230712890625, + "learning_rate": 0.0001, + "loss": 5.9995, + "loss/crossentropy": 2.7052998542785645, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17629235982894897, + "step": 14496 + }, + { + "epoch": 0.4530625, + "grad_norm": 3.3125, + "grad_norm_var": 0.12870992024739583, + "learning_rate": 0.0001, + "loss": 5.9878, + "loss/crossentropy": 2.7074246406555176, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17999430745840073, + "step": 14498 + }, + { + "epoch": 0.453125, + "grad_norm": 3.15625, + "grad_norm_var": 0.135986328125, + "learning_rate": 0.0001, + "loss": 5.9737, + "loss/crossentropy": 2.7000943422317505, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17579935491085052, + "step": 14500 + }, + { + "epoch": 0.4531875, + "grad_norm": 3.15625, + "grad_norm_var": 0.13108723958333332, + "learning_rate": 0.0001, + "loss": 5.727, + "loss/crossentropy": 2.5456565618515015, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16696364432573318, + "step": 14502 + }, + { + "epoch": 0.45325, + "grad_norm": 2.984375, + "grad_norm_var": 0.0724029541015625, + "learning_rate": 0.0001, + "loss": 5.6147, + "loss/crossentropy": 2.488164782524109, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1673370823264122, + "step": 14504 + }, + { + "epoch": 0.4533125, + "grad_norm": 3.4375, + "grad_norm_var": 0.07317606608072917, + "learning_rate": 0.0001, + "loss": 6.1609, + "loss/crossentropy": 2.7739768028259277, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18713120371103287, + "step": 14506 + }, + { + "epoch": 0.453375, + "grad_norm": 3.40625, + "grad_norm_var": 0.06917317708333333, + "learning_rate": 0.0001, + "loss": 5.7904, + "loss/crossentropy": 2.6002981662750244, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17252818495035172, + "step": 14508 + }, + { + "epoch": 0.4534375, + "grad_norm": 3.421875, + "grad_norm_var": 0.06503804524739583, + "learning_rate": 0.0001, + "loss": 5.9902, + "loss/crossentropy": 2.698809862136841, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1775732785463333, + "step": 14510 + }, + { + "epoch": 0.4535, + "grad_norm": 4.25, + "grad_norm_var": 0.11190999348958333, + "learning_rate": 0.0001, + "loss": 5.6646, + "loss/crossentropy": 2.4125418663024902, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17246823012828827, + "step": 14512 + }, + { + "epoch": 0.4535625, + "grad_norm": 3.28125, + "grad_norm_var": 0.09343159993489583, + "learning_rate": 0.0001, + "loss": 5.3442, + "loss/crossentropy": 2.2412840127944946, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.15990018844604492, + "step": 14514 + }, + { + "epoch": 0.453625, + "grad_norm": 3.265625, + "grad_norm_var": 0.08662821451822916, + "learning_rate": 0.0001, + "loss": 5.4618, + "loss/crossentropy": 2.2628493309020996, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16755583137273788, + "step": 14516 + }, + { + "epoch": 0.4536875, + "grad_norm": 3.234375, + "grad_norm_var": 0.08622945149739583, + "learning_rate": 0.0001, + "loss": 5.6243, + "loss/crossentropy": 2.4937922954559326, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16461055725812912, + "step": 14518 + }, + { + "epoch": 0.45375, + "grad_norm": 3.0625, + "grad_norm_var": 0.16317952473958333, + "learning_rate": 0.0001, + "loss": 5.894, + "loss/crossentropy": 2.5887582302093506, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17857546359300613, + "step": 14520 + }, + { + "epoch": 0.4538125, + "grad_norm": 5.03125, + "grad_norm_var": 0.34032796223958334, + "learning_rate": 0.0001, + "loss": 5.8418, + "loss/crossentropy": 2.476287841796875, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18108688294887543, + "step": 14522 + }, + { + "epoch": 0.453875, + "grad_norm": 3.1875, + "grad_norm_var": 0.3337961832682292, + "learning_rate": 0.0001, + "loss": 5.6447, + "loss/crossentropy": 2.4071391820907593, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17180682718753815, + "step": 14524 + }, + { + "epoch": 0.4539375, + "grad_norm": 3.078125, + "grad_norm_var": 0.3443349202473958, + "learning_rate": 0.0001, + "loss": 5.7284, + "loss/crossentropy": 2.589739680290222, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16582194715738297, + "step": 14526 + }, + { + "epoch": 0.454, + "grad_norm": 3.59375, + "grad_norm_var": 0.3094390869140625, + "learning_rate": 0.0001, + "loss": 5.3112, + "loss/crossentropy": 2.1507176756858826, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16487611830234528, + "step": 14528 + }, + { + "epoch": 0.4540625, + "grad_norm": 3.140625, + "grad_norm_var": 0.30414937337239584, + "learning_rate": 0.0001, + "loss": 5.7803, + "loss/crossentropy": 2.494012713432312, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1778479814529419, + "step": 14530 + }, + { + "epoch": 0.454125, + "grad_norm": 3.0, + "grad_norm_var": 0.31660868326822916, + "learning_rate": 0.0001, + "loss": 5.9288, + "loss/crossentropy": 2.6846877336502075, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17401766777038574, + "step": 14532 + }, + { + "epoch": 0.4541875, + "grad_norm": 3.203125, + "grad_norm_var": 0.3118560791015625, + "learning_rate": 0.0001, + "loss": 5.9263, + "loss/crossentropy": 2.62195360660553, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17965682595968246, + "step": 14534 + }, + { + "epoch": 0.45425, + "grad_norm": 3.09375, + "grad_norm_var": 0.24228108723958333, + "learning_rate": 0.0001, + "loss": 5.7171, + "loss/crossentropy": 2.461188793182373, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17481359094381332, + "step": 14536 + }, + { + "epoch": 0.4543125, + "grad_norm": 3.46875, + "grad_norm_var": 0.0307281494140625, + "learning_rate": 0.0001, + "loss": 5.9262, + "loss/crossentropy": 2.575995922088623, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18189138174057007, + "step": 14538 + }, + { + "epoch": 0.454375, + "grad_norm": 3.484375, + "grad_norm_var": 0.028857421875, + "learning_rate": 0.0001, + "loss": 5.6891, + "loss/crossentropy": 2.3918073177337646, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17659986019134521, + "step": 14540 + }, + { + "epoch": 0.4544375, + "grad_norm": 2.90625, + "grad_norm_var": 0.033568318684895834, + "learning_rate": 0.0001, + "loss": 6.0145, + "loss/crossentropy": 2.788806915283203, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17179163545370102, + "step": 14542 + }, + { + "epoch": 0.4545, + "grad_norm": 3.1875, + "grad_norm_var": 0.026839192708333334, + "learning_rate": 0.0001, + "loss": 5.9726, + "loss/crossentropy": 2.686835527420044, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17584455758333206, + "step": 14544 + }, + { + "epoch": 0.4545625, + "grad_norm": 3.578125, + "grad_norm_var": 0.03339742024739583, + "learning_rate": 0.0001, + "loss": 6.1558, + "loss/crossentropy": 2.770237684249878, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18816396594047546, + "step": 14546 + }, + { + "epoch": 0.454625, + "grad_norm": 3.265625, + "grad_norm_var": 0.03225911458333333, + "learning_rate": 0.0001, + "loss": 6.1124, + "loss/crossentropy": 2.7354531288146973, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18222268670797348, + "step": 14548 + }, + { + "epoch": 0.4546875, + "grad_norm": 3.21875, + "grad_norm_var": 0.032648722330729164, + "learning_rate": 0.0001, + "loss": 5.8928, + "loss/crossentropy": 2.6034258604049683, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17580794543027878, + "step": 14550 + }, + { + "epoch": 0.45475, + "grad_norm": 3.046875, + "grad_norm_var": 0.0453033447265625, + "learning_rate": 0.0001, + "loss": 5.6005, + "loss/crossentropy": 2.529695749282837, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15942486375570297, + "step": 14552 + }, + { + "epoch": 0.4548125, + "grad_norm": 3.25, + "grad_norm_var": 0.05146077473958333, + "learning_rate": 0.0001, + "loss": 6.0578, + "loss/crossentropy": 2.69464647769928, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18475749343633652, + "step": 14554 + }, + { + "epoch": 0.454875, + "grad_norm": 3.0625, + "grad_norm_var": 0.05123291015625, + "learning_rate": 0.0001, + "loss": 6.2026, + "loss/crossentropy": 2.8169463872909546, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18778616189956665, + "step": 14556 + }, + { + "epoch": 0.4549375, + "grad_norm": 3.515625, + "grad_norm_var": 0.047900390625, + "learning_rate": 0.0001, + "loss": 6.048, + "loss/crossentropy": 2.7339634895324707, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18062249571084976, + "step": 14558 + }, + { + "epoch": 0.455, + "grad_norm": 3.71875, + "grad_norm_var": 0.0690582275390625, + "learning_rate": 0.0001, + "loss": 5.85, + "loss/crossentropy": 2.6157747507095337, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17224732041358948, + "step": 14560 + }, + { + "epoch": 0.4550625, + "grad_norm": 2.90625, + "grad_norm_var": 0.07171122233072917, + "learning_rate": 0.0001, + "loss": 6.0783, + "loss/crossentropy": 2.7599072456359863, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18027833104133606, + "step": 14562 + }, + { + "epoch": 0.455125, + "grad_norm": 3.484375, + "grad_norm_var": 0.07132161458333333, + "learning_rate": 0.0001, + "loss": 5.8077, + "loss/crossentropy": 2.5338187217712402, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17660734057426453, + "step": 14564 + }, + { + "epoch": 0.4551875, + "grad_norm": 4.1875, + "grad_norm_var": 0.12392578125, + "learning_rate": 0.0001, + "loss": 5.8379, + "loss/crossentropy": 2.4399008750915527, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.17925576120615005, + "step": 14566 + }, + { + "epoch": 0.45525, + "grad_norm": 3.234375, + "grad_norm_var": 0.10426432291666667, + "learning_rate": 0.0001, + "loss": 6.3463, + "loss/crossentropy": 2.948116660118103, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18590866029262543, + "step": 14568 + }, + { + "epoch": 0.4553125, + "grad_norm": 3.15625, + "grad_norm_var": 0.10764567057291667, + "learning_rate": 0.0001, + "loss": 5.6641, + "loss/crossentropy": 2.558690071105957, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16014690697193146, + "step": 14570 + }, + { + "epoch": 0.455375, + "grad_norm": 10.5, + "grad_norm_var": 3.2926666259765627, + "learning_rate": 0.0001, + "loss": 6.0711, + "loss/crossentropy": 2.5426149368286133, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.19894441962242126, + "step": 14572 + }, + { + "epoch": 0.4554375, + "grad_norm": 3.25, + "grad_norm_var": 3.319172159830729, + "learning_rate": 0.0001, + "loss": 5.8357, + "loss/crossentropy": 2.600016236305237, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1762988045811653, + "step": 14574 + }, + { + "epoch": 0.4555, + "grad_norm": 3.0625, + "grad_norm_var": 3.32607421875, + "learning_rate": 0.0001, + "loss": 5.6709, + "loss/crossentropy": 2.523502826690674, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16708385944366455, + "step": 14576 + }, + { + "epoch": 0.4555625, + "grad_norm": 2.9375, + "grad_norm_var": 3.338719685872396, + "learning_rate": 0.0001, + "loss": 5.8305, + "loss/crossentropy": 2.6271212100982666, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16994474828243256, + "step": 14578 + }, + { + "epoch": 0.455625, + "grad_norm": 3.1875, + "grad_norm_var": 3.348705037434896, + "learning_rate": 0.0001, + "loss": 5.6505, + "loss/crossentropy": 2.4964014291763306, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16423358023166656, + "step": 14580 + }, + { + "epoch": 0.4556875, + "grad_norm": 3.375, + "grad_norm_var": 3.341120402018229, + "learning_rate": 0.0001, + "loss": 5.7585, + "loss/crossentropy": 2.471139073371887, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1787339448928833, + "step": 14582 + }, + { + "epoch": 0.45575, + "grad_norm": 3.515625, + "grad_norm_var": 3.3541575113932294, + "learning_rate": 0.0001, + "loss": 5.7238, + "loss/crossentropy": 2.5060452222824097, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17333626747131348, + "step": 14584 + }, + { + "epoch": 0.4558125, + "grad_norm": 3.234375, + "grad_norm_var": 3.33756103515625, + "learning_rate": 0.0001, + "loss": 5.4606, + "loss/crossentropy": 2.30147123336792, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.16239578276872635, + "step": 14586 + }, + { + "epoch": 0.455875, + "grad_norm": 3.140625, + "grad_norm_var": 0.07457275390625, + "learning_rate": 0.0001, + "loss": 5.4625, + "loss/crossentropy": 2.429797410964966, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15561362355947495, + "step": 14588 + }, + { + "epoch": 0.4559375, + "grad_norm": 3.21875, + "grad_norm_var": 0.0742828369140625, + "learning_rate": 0.0001, + "loss": 5.3762, + "loss/crossentropy": 2.2654976844787598, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16223956644535065, + "step": 14590 + }, + { + "epoch": 0.456, + "grad_norm": 3.390625, + "grad_norm_var": 0.0739898681640625, + "learning_rate": 0.0001, + "loss": 5.8523, + "loss/crossentropy": 2.6187981367111206, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17452352494001389, + "step": 14592 + }, + { + "epoch": 0.4560625, + "grad_norm": 3.5, + "grad_norm_var": 0.06933492024739583, + "learning_rate": 0.0001, + "loss": 5.955, + "loss/crossentropy": 2.7187896966934204, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17322969436645508, + "step": 14594 + }, + { + "epoch": 0.456125, + "grad_norm": 3.375, + "grad_norm_var": 0.06941731770833333, + "learning_rate": 0.0001, + "loss": 5.8254, + "loss/crossentropy": 2.5588265657424927, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17705295979976654, + "step": 14596 + }, + { + "epoch": 0.4561875, + "grad_norm": 3.28125, + "grad_norm_var": 0.07266337076822917, + "learning_rate": 0.0001, + "loss": 5.932, + "loss/crossentropy": 2.7176828384399414, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1686951220035553, + "step": 14598 + }, + { + "epoch": 0.45625, + "grad_norm": 3.078125, + "grad_norm_var": 0.07768452962239583, + "learning_rate": 0.0001, + "loss": 5.8074, + "loss/crossentropy": 2.5815467834472656, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17610573023557663, + "step": 14600 + }, + { + "epoch": 0.4563125, + "grad_norm": 3.515625, + "grad_norm_var": 0.036864217122395834, + "learning_rate": 0.0001, + "loss": 5.6907, + "loss/crossentropy": 2.534337639808655, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16524504125118256, + "step": 14602 + }, + { + "epoch": 0.456375, + "grad_norm": 3.109375, + "grad_norm_var": 0.0374908447265625, + "learning_rate": 0.0001, + "loss": 5.6976, + "loss/crossentropy": 2.4520140886306763, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17416337877511978, + "step": 14604 + }, + { + "epoch": 0.4564375, + "grad_norm": 2.96875, + "grad_norm_var": 0.041552734375, + "learning_rate": 0.0001, + "loss": 5.7777, + "loss/crossentropy": 2.6158251762390137, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16775169968605042, + "step": 14606 + }, + { + "epoch": 0.4565, + "grad_norm": 3.3125, + "grad_norm_var": 0.04243876139322917, + "learning_rate": 0.0001, + "loss": 5.9935, + "loss/crossentropy": 2.687050700187683, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1806405782699585, + "step": 14608 + }, + { + "epoch": 0.4565625, + "grad_norm": 3.046875, + "grad_norm_var": 0.03892822265625, + "learning_rate": 0.0001, + "loss": 5.5589, + "loss/crossentropy": 2.4694937467575073, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1593277007341385, + "step": 14610 + }, + { + "epoch": 0.456625, + "grad_norm": 3.484375, + "grad_norm_var": 0.03748270670572917, + "learning_rate": 0.0001, + "loss": 5.725, + "loss/crossentropy": 2.4947859048843384, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17536167800426483, + "step": 14612 + }, + { + "epoch": 0.4566875, + "grad_norm": 3.375, + "grad_norm_var": 0.040461222330729164, + "learning_rate": 0.0001, + "loss": 5.6171, + "loss/crossentropy": 2.421966314315796, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17303290963172913, + "step": 14614 + }, + { + "epoch": 0.45675, + "grad_norm": 3.328125, + "grad_norm_var": 0.034403483072916664, + "learning_rate": 0.0001, + "loss": 5.5022, + "loss/crossentropy": 2.274025797843933, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1712556630373001, + "step": 14616 + }, + { + "epoch": 0.4568125, + "grad_norm": 3.1875, + "grad_norm_var": 0.020731608072916668, + "learning_rate": 0.0001, + "loss": 5.809, + "loss/crossentropy": 2.603486657142639, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16859738528728485, + "step": 14618 + }, + { + "epoch": 0.456875, + "grad_norm": 3.0, + "grad_norm_var": 0.029618326822916666, + "learning_rate": 0.0001, + "loss": 5.6869, + "loss/crossentropy": 2.578454375267029, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16280003637075424, + "step": 14620 + }, + { + "epoch": 0.4569375, + "grad_norm": 3.140625, + "grad_norm_var": 0.026659138997395835, + "learning_rate": 0.0001, + "loss": 5.7511, + "loss/crossentropy": 2.5166012048721313, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17188695818185806, + "step": 14622 + }, + { + "epoch": 0.457, + "grad_norm": 3.21875, + "grad_norm_var": 0.024144490559895832, + "learning_rate": 0.0001, + "loss": 5.8898, + "loss/crossentropy": 2.6979739665985107, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17035824060440063, + "step": 14624 + }, + { + "epoch": 0.4570625, + "grad_norm": 2.96875, + "grad_norm_var": 0.025419108072916665, + "learning_rate": 0.0001, + "loss": 5.438, + "loss/crossentropy": 2.3709373474121094, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16139668226242065, + "step": 14626 + }, + { + "epoch": 0.457125, + "grad_norm": 3.171875, + "grad_norm_var": 0.019391886393229165, + "learning_rate": 0.0001, + "loss": 5.6868, + "loss/crossentropy": 2.4807026386260986, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1694331392645836, + "step": 14628 + }, + { + "epoch": 0.4571875, + "grad_norm": 3.21875, + "grad_norm_var": 0.0139801025390625, + "learning_rate": 0.0001, + "loss": 5.8062, + "loss/crossentropy": 2.5614261627197266, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17330555617809296, + "step": 14630 + }, + { + "epoch": 0.45725, + "grad_norm": 3.09375, + "grad_norm_var": 0.012255859375, + "learning_rate": 0.0001, + "loss": 5.8374, + "loss/crossentropy": 2.601043462753296, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1740272492170334, + "step": 14632 + }, + { + "epoch": 0.4573125, + "grad_norm": 3.296875, + "grad_norm_var": 0.0140289306640625, + "learning_rate": 0.0001, + "loss": 5.9327, + "loss/crossentropy": 2.6570407152175903, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17912764102220535, + "step": 14634 + }, + { + "epoch": 0.457375, + "grad_norm": 2.953125, + "grad_norm_var": 0.014264933268229167, + "learning_rate": 0.0001, + "loss": 5.7822, + "loss/crossentropy": 2.581598401069641, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.170454740524292, + "step": 14636 + }, + { + "epoch": 0.4574375, + "grad_norm": 3.609375, + "grad_norm_var": 0.0278228759765625, + "learning_rate": 0.0001, + "loss": 5.8602, + "loss/crossentropy": 2.5793548822402954, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17457029223442078, + "step": 14638 + }, + { + "epoch": 0.4575, + "grad_norm": 3.5625, + "grad_norm_var": 0.04016520182291667, + "learning_rate": 0.0001, + "loss": 5.8721, + "loss/crossentropy": 2.624797224998474, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1727820411324501, + "step": 14640 + }, + { + "epoch": 0.4575625, + "grad_norm": 3.171875, + "grad_norm_var": 0.044098917643229166, + "learning_rate": 0.0001, + "loss": 5.7531, + "loss/crossentropy": 2.534020185470581, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17268630117177963, + "step": 14642 + }, + { + "epoch": 0.457625, + "grad_norm": 3.125, + "grad_norm_var": 0.05367431640625, + "learning_rate": 0.0001, + "loss": 5.7927, + "loss/crossentropy": 2.6541357040405273, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16464103013277054, + "step": 14644 + }, + { + "epoch": 0.4576875, + "grad_norm": 3.28125, + "grad_norm_var": 0.06168212890625, + "learning_rate": 0.0001, + "loss": 5.6557, + "loss/crossentropy": 2.514155864715576, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16806161403656006, + "step": 14646 + }, + { + "epoch": 0.45775, + "grad_norm": 3.171875, + "grad_norm_var": 0.0607086181640625, + "learning_rate": 0.0001, + "loss": 5.8374, + "loss/crossentropy": 2.6226083040237427, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17382187396287918, + "step": 14648 + }, + { + "epoch": 0.4578125, + "grad_norm": 3.96875, + "grad_norm_var": 0.097412109375, + "learning_rate": 0.0001, + "loss": 5.2514, + "loss/crossentropy": 2.1894924640655518, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1569766104221344, + "step": 14650 + }, + { + "epoch": 0.457875, + "grad_norm": 2.984375, + "grad_norm_var": 0.09517822265625, + "learning_rate": 0.0001, + "loss": 5.3859, + "loss/crossentropy": 2.330040216445923, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15871459245681763, + "step": 14652 + }, + { + "epoch": 0.4579375, + "grad_norm": 3.609375, + "grad_norm_var": 0.1093414306640625, + "learning_rate": 0.0001, + "loss": 5.9374, + "loss/crossentropy": 2.528441071510315, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1881653070449829, + "step": 14654 + }, + { + "epoch": 0.458, + "grad_norm": 3.359375, + "grad_norm_var": 0.09735921223958334, + "learning_rate": 0.0001, + "loss": 5.8961, + "loss/crossentropy": 2.661376118659973, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1715143546462059, + "step": 14656 + }, + { + "epoch": 0.4580625, + "grad_norm": 3.03125, + "grad_norm_var": 0.09684244791666667, + "learning_rate": 0.0001, + "loss": 5.9745, + "loss/crossentropy": 2.7906899452209473, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16994313150644302, + "step": 14658 + }, + { + "epoch": 0.458125, + "grad_norm": 3.234375, + "grad_norm_var": 0.0846343994140625, + "learning_rate": 0.0001, + "loss": 5.9035, + "loss/crossentropy": 2.65447998046875, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17568230628967285, + "step": 14660 + }, + { + "epoch": 0.4581875, + "grad_norm": 3.015625, + "grad_norm_var": 0.08594462076822916, + "learning_rate": 0.0001, + "loss": 5.9461, + "loss/crossentropy": 2.699332356452942, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17624330520629883, + "step": 14662 + }, + { + "epoch": 0.45825, + "grad_norm": 3.203125, + "grad_norm_var": 0.09119466145833334, + "learning_rate": 0.0001, + "loss": 5.7475, + "loss/crossentropy": 2.514883279800415, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17247988283634186, + "step": 14664 + }, + { + "epoch": 0.4583125, + "grad_norm": 3.28125, + "grad_norm_var": 0.0596343994140625, + "learning_rate": 0.0001, + "loss": 5.7979, + "loss/crossentropy": 2.582164764404297, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1704052984714508, + "step": 14666 + }, + { + "epoch": 0.458375, + "grad_norm": 2.9375, + "grad_norm_var": 0.06144917805989583, + "learning_rate": 0.0001, + "loss": 5.9131, + "loss/crossentropy": 2.6844900846481323, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17286403477191925, + "step": 14668 + }, + { + "epoch": 0.4584375, + "grad_norm": 3.40625, + "grad_norm_var": 0.03411458333333333, + "learning_rate": 0.0001, + "loss": 5.4473, + "loss/crossentropy": 2.341725468635559, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.15821301192045212, + "step": 14670 + }, + { + "epoch": 0.4585, + "grad_norm": 3.15625, + "grad_norm_var": 0.03733622233072917, + "learning_rate": 0.0001, + "loss": 5.5986, + "loss/crossentropy": 2.470059871673584, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16675861179828644, + "step": 14672 + }, + { + "epoch": 0.4585625, + "grad_norm": 3.015625, + "grad_norm_var": 0.03853759765625, + "learning_rate": 0.0001, + "loss": 5.3315, + "loss/crossentropy": 2.319731116294861, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15976974368095398, + "step": 14674 + }, + { + "epoch": 0.458625, + "grad_norm": 3.03125, + "grad_norm_var": 0.040266927083333334, + "learning_rate": 0.0001, + "loss": 5.8873, + "loss/crossentropy": 2.588124394416809, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17679382860660553, + "step": 14676 + }, + { + "epoch": 0.4586875, + "grad_norm": 3.109375, + "grad_norm_var": 0.024312337239583332, + "learning_rate": 0.0001, + "loss": 5.9772, + "loss/crossentropy": 2.825955390930176, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1678621917963028, + "step": 14678 + }, + { + "epoch": 0.45875, + "grad_norm": 3.390625, + "grad_norm_var": 0.028287760416666665, + "learning_rate": 0.0001, + "loss": 5.6411, + "loss/crossentropy": 2.525596022605896, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16389373689889908, + "step": 14680 + }, + { + "epoch": 0.4588125, + "grad_norm": 3.3125, + "grad_norm_var": 0.028857421875, + "learning_rate": 0.0001, + "loss": 5.6639, + "loss/crossentropy": 2.4533002376556396, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16910212486982346, + "step": 14682 + }, + { + "epoch": 0.458875, + "grad_norm": 3.15625, + "grad_norm_var": 0.02818603515625, + "learning_rate": 0.0001, + "loss": 5.8839, + "loss/crossentropy": 2.547485113143921, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17894960939884186, + "step": 14684 + }, + { + "epoch": 0.4589375, + "grad_norm": 3.0625, + "grad_norm_var": 0.026008097330729167, + "learning_rate": 0.0001, + "loss": 5.5148, + "loss/crossentropy": 2.419428825378418, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16500765085220337, + "step": 14686 + }, + { + "epoch": 0.459, + "grad_norm": 3.296875, + "grad_norm_var": 0.020563761393229168, + "learning_rate": 0.0001, + "loss": 5.8017, + "loss/crossentropy": 2.47537899017334, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1826336607336998, + "step": 14688 + }, + { + "epoch": 0.4590625, + "grad_norm": 3.21875, + "grad_norm_var": 0.02017822265625, + "learning_rate": 0.0001, + "loss": 5.6329, + "loss/crossentropy": 2.4895445108413696, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16785429418087006, + "step": 14690 + }, + { + "epoch": 0.459125, + "grad_norm": 3.21875, + "grad_norm_var": 0.025194295247395835, + "learning_rate": 0.0001, + "loss": 5.7113, + "loss/crossentropy": 2.4924451112747192, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16992872953414917, + "step": 14692 + }, + { + "epoch": 0.4591875, + "grad_norm": 3.25, + "grad_norm_var": 0.026057942708333334, + "learning_rate": 0.0001, + "loss": 5.5704, + "loss/crossentropy": 2.432380795478821, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16224291920661926, + "step": 14694 + }, + { + "epoch": 0.45925, + "grad_norm": 3.03125, + "grad_norm_var": 0.021708170572916668, + "learning_rate": 0.0001, + "loss": 5.8647, + "loss/crossentropy": 2.6404428482055664, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17320983111858368, + "step": 14696 + }, + { + "epoch": 0.4593125, + "grad_norm": 3.703125, + "grad_norm_var": 0.04390869140625, + "learning_rate": 0.0001, + "loss": 5.6257, + "loss/crossentropy": 2.459041118621826, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.17174628376960754, + "step": 14698 + }, + { + "epoch": 0.459375, + "grad_norm": 3.0625, + "grad_norm_var": 0.04267171223958333, + "learning_rate": 0.0001, + "loss": 5.7677, + "loss/crossentropy": 2.531997799873352, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1751360446214676, + "step": 14700 + }, + { + "epoch": 0.4594375, + "grad_norm": 3.171875, + "grad_norm_var": 0.0443023681640625, + "learning_rate": 0.0001, + "loss": 5.8761, + "loss/crossentropy": 2.7251086235046387, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16588325053453445, + "step": 14702 + }, + { + "epoch": 0.4595, + "grad_norm": 3.078125, + "grad_norm_var": 0.06440327962239584, + "learning_rate": 0.0001, + "loss": 5.958, + "loss/crossentropy": 2.6849414110183716, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17652680724859238, + "step": 14704 + }, + { + "epoch": 0.4595625, + "grad_norm": 3.140625, + "grad_norm_var": 0.06448465983072917, + "learning_rate": 0.0001, + "loss": 5.7037, + "loss/crossentropy": 2.5385085344314575, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16964833438396454, + "step": 14706 + }, + { + "epoch": 0.459625, + "grad_norm": 3.109375, + "grad_norm_var": 0.05878499348958333, + "learning_rate": 0.0001, + "loss": 5.9176, + "loss/crossentropy": 2.7037012577056885, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17139053344726562, + "step": 14708 + }, + { + "epoch": 0.4596875, + "grad_norm": 3.296875, + "grad_norm_var": 0.058089192708333334, + "learning_rate": 0.0001, + "loss": 5.9178, + "loss/crossentropy": 2.68429696559906, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17451756447553635, + "step": 14710 + }, + { + "epoch": 0.45975, + "grad_norm": 3.5625, + "grad_norm_var": 0.06498921712239583, + "learning_rate": 0.0001, + "loss": 6.1829, + "loss/crossentropy": 2.8171669244766235, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18501022458076477, + "step": 14712 + }, + { + "epoch": 0.4598125, + "grad_norm": 3.25, + "grad_norm_var": 0.048460896809895834, + "learning_rate": 0.0001, + "loss": 5.7204, + "loss/crossentropy": 2.5563507080078125, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16992372274398804, + "step": 14714 + }, + { + "epoch": 0.459875, + "grad_norm": 3.578125, + "grad_norm_var": 0.05468343098958333, + "learning_rate": 0.0001, + "loss": 5.8308, + "loss/crossentropy": 2.4868799448013306, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18204709142446518, + "step": 14716 + }, + { + "epoch": 0.4599375, + "grad_norm": 2.96875, + "grad_norm_var": 0.05761311848958333, + "learning_rate": 0.0001, + "loss": 5.7521, + "loss/crossentropy": 2.5840706825256348, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16758275777101517, + "step": 14718 + }, + { + "epoch": 0.46, + "grad_norm": 3.125, + "grad_norm_var": 0.04110921223958333, + "learning_rate": 0.0001, + "loss": 5.8617, + "loss/crossentropy": 2.6252224445343018, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1752118468284607, + "step": 14720 + }, + { + "epoch": 0.4600625, + "grad_norm": 3.140625, + "grad_norm_var": 0.04031575520833333, + "learning_rate": 0.0001, + "loss": 5.2771, + "loss/crossentropy": 2.2195135354995728, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1557537019252777, + "step": 14722 + }, + { + "epoch": 0.460125, + "grad_norm": 2.953125, + "grad_norm_var": 0.04318033854166667, + "learning_rate": 0.0001, + "loss": 5.4511, + "loss/crossentropy": 2.3469650745391846, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1666633039712906, + "step": 14724 + }, + { + "epoch": 0.4601875, + "grad_norm": 3.1875, + "grad_norm_var": 0.0434722900390625, + "learning_rate": 0.0001, + "loss": 5.6526, + "loss/crossentropy": 2.4295217990875244, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1742624267935753, + "step": 14726 + }, + { + "epoch": 0.46025, + "grad_norm": 3.3125, + "grad_norm_var": 0.03411051432291667, + "learning_rate": 0.0001, + "loss": 5.916, + "loss/crossentropy": 2.593013882637024, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17760702222585678, + "step": 14728 + }, + { + "epoch": 0.4603125, + "grad_norm": 3.265625, + "grad_norm_var": 0.03209635416666667, + "learning_rate": 0.0001, + "loss": 5.7603, + "loss/crossentropy": 2.5226577520370483, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1749354526400566, + "step": 14730 + }, + { + "epoch": 0.460375, + "grad_norm": 3.109375, + "grad_norm_var": 0.0208160400390625, + "learning_rate": 0.0001, + "loss": 5.7309, + "loss/crossentropy": 2.4947710037231445, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17400462925434113, + "step": 14732 + }, + { + "epoch": 0.4604375, + "grad_norm": 3.59375, + "grad_norm_var": 0.027034505208333334, + "learning_rate": 0.0001, + "loss": 5.8304, + "loss/crossentropy": 2.486048698425293, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18130739778280258, + "step": 14734 + }, + { + "epoch": 0.4605, + "grad_norm": 3.375, + "grad_norm_var": 0.04313863118489583, + "learning_rate": 0.0001, + "loss": 5.9464, + "loss/crossentropy": 2.545111060142517, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18973546475172043, + "step": 14736 + }, + { + "epoch": 0.4605625, + "grad_norm": 3.390625, + "grad_norm_var": 0.0491119384765625, + "learning_rate": 0.0001, + "loss": 5.6791, + "loss/crossentropy": 2.593599557876587, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16518951207399368, + "step": 14738 + }, + { + "epoch": 0.460625, + "grad_norm": 3.15625, + "grad_norm_var": 0.04248758951822917, + "learning_rate": 0.0001, + "loss": 5.8006, + "loss/crossentropy": 2.6278897523880005, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16844641417264938, + "step": 14740 + }, + { + "epoch": 0.4606875, + "grad_norm": 3.3125, + "grad_norm_var": 0.040816243489583334, + "learning_rate": 0.0001, + "loss": 5.9979, + "loss/crossentropy": 2.687938690185547, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17942996323108673, + "step": 14742 + }, + { + "epoch": 0.46075, + "grad_norm": 3.015625, + "grad_norm_var": 0.0473297119140625, + "learning_rate": 0.0001, + "loss": 5.3834, + "loss/crossentropy": 2.2958441972732544, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16579102724790573, + "step": 14744 + }, + { + "epoch": 0.4608125, + "grad_norm": 3.109375, + "grad_norm_var": 0.055882771809895836, + "learning_rate": 0.0001, + "loss": 5.9585, + "loss/crossentropy": 2.646650195121765, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18079080432653427, + "step": 14746 + }, + { + "epoch": 0.460875, + "grad_norm": 3.421875, + "grad_norm_var": 0.07305399576822917, + "learning_rate": 0.0001, + "loss": 6.0442, + "loss/crossentropy": 2.5205646753311157, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19767975062131882, + "step": 14748 + }, + { + "epoch": 0.4609375, + "grad_norm": 3.03125, + "grad_norm_var": 0.06814676920572917, + "learning_rate": 0.0001, + "loss": 6.0111, + "loss/crossentropy": 2.8023595809936523, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17126596719026566, + "step": 14750 + }, + { + "epoch": 0.461, + "grad_norm": 3.359375, + "grad_norm_var": 0.066357421875, + "learning_rate": 0.0001, + "loss": 5.493, + "loss/crossentropy": 2.3870071172714233, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1625504121184349, + "step": 14752 + }, + { + "epoch": 0.4610625, + "grad_norm": 3.28125, + "grad_norm_var": 0.0614166259765625, + "learning_rate": 0.0001, + "loss": 5.4415, + "loss/crossentropy": 2.3839242458343506, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15849259495735168, + "step": 14754 + }, + { + "epoch": 0.461125, + "grad_norm": 3.34375, + "grad_norm_var": 0.06119384765625, + "learning_rate": 0.0001, + "loss": 5.6472, + "loss/crossentropy": 2.4545260667800903, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16652970761060715, + "step": 14756 + }, + { + "epoch": 0.4611875, + "grad_norm": 3.15625, + "grad_norm_var": 0.061474609375, + "learning_rate": 0.0001, + "loss": 5.7819, + "loss/crossentropy": 2.6248152256011963, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16297141462564468, + "step": 14758 + }, + { + "epoch": 0.46125, + "grad_norm": 3.609375, + "grad_norm_var": 0.06609598795572917, + "learning_rate": 0.0001, + "loss": 5.7236, + "loss/crossentropy": 2.5196938514709473, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17195553332567215, + "step": 14760 + }, + { + "epoch": 0.4613125, + "grad_norm": 3.046875, + "grad_norm_var": 0.062300618489583334, + "learning_rate": 0.0001, + "loss": 5.5641, + "loss/crossentropy": 2.48823082447052, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15875942260026932, + "step": 14762 + }, + { + "epoch": 0.461375, + "grad_norm": 3.34375, + "grad_norm_var": 0.032835896809895834, + "learning_rate": 0.0001, + "loss": 5.5414, + "loss/crossentropy": 2.413806200027466, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1615920066833496, + "step": 14764 + }, + { + "epoch": 0.4614375, + "grad_norm": 3.328125, + "grad_norm_var": 0.032160441080729164, + "learning_rate": 0.0001, + "loss": 5.926, + "loss/crossentropy": 2.5423930883407593, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18289238214492798, + "step": 14766 + }, + { + "epoch": 0.4615, + "grad_norm": 3.109375, + "grad_norm_var": 0.0253326416015625, + "learning_rate": 0.0001, + "loss": 5.624, + "loss/crossentropy": 2.4361913204193115, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16955722868442535, + "step": 14768 + }, + { + "epoch": 0.4615625, + "grad_norm": 3.0625, + "grad_norm_var": 0.0267486572265625, + "learning_rate": 0.0001, + "loss": 5.6569, + "loss/crossentropy": 2.519798755645752, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1676197052001953, + "step": 14770 + }, + { + "epoch": 0.461625, + "grad_norm": 3.390625, + "grad_norm_var": 0.028304036458333334, + "learning_rate": 0.0001, + "loss": 5.7614, + "loss/crossentropy": 2.5674301385879517, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16979040205478668, + "step": 14772 + }, + { + "epoch": 0.4616875, + "grad_norm": 4.34375, + "grad_norm_var": 0.1083984375, + "learning_rate": 0.0001, + "loss": 5.8391, + "loss/crossentropy": 2.525059461593628, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.18178994953632355, + "step": 14774 + }, + { + "epoch": 0.46175, + "grad_norm": 2.953125, + "grad_norm_var": 0.10592041015625, + "learning_rate": 0.0001, + "loss": 5.4408, + "loss/crossentropy": 2.348313093185425, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16081024706363678, + "step": 14776 + }, + { + "epoch": 0.4618125, + "grad_norm": 3.21875, + "grad_norm_var": 0.09839579264322916, + "learning_rate": 0.0001, + "loss": 5.9184, + "loss/crossentropy": 2.6475237607955933, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17552414536476135, + "step": 14778 + }, + { + "epoch": 0.461875, + "grad_norm": 5.03125, + "grad_norm_var": 0.29124348958333335, + "learning_rate": 0.0001, + "loss": 5.7123, + "loss/crossentropy": 2.5059620141983032, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17219389975070953, + "step": 14780 + }, + { + "epoch": 0.4619375, + "grad_norm": 3.78125, + "grad_norm_var": 0.30024312337239584, + "learning_rate": 0.0001, + "loss": 5.5394, + "loss/crossentropy": 2.3543527126312256, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16655007749795914, + "step": 14782 + }, + { + "epoch": 0.462, + "grad_norm": 3.203125, + "grad_norm_var": 0.2855143229166667, + "learning_rate": 0.0001, + "loss": 5.9099, + "loss/crossentropy": 2.586949944496155, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18151184171438217, + "step": 14784 + }, + { + "epoch": 0.4620625, + "grad_norm": 4.03125, + "grad_norm_var": 0.29341532389322916, + "learning_rate": 0.0001, + "loss": 5.8707, + "loss/crossentropy": 2.5481022596359253, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1826496571302414, + "step": 14786 + }, + { + "epoch": 0.462125, + "grad_norm": 3.265625, + "grad_norm_var": 0.2879058837890625, + "learning_rate": 0.0001, + "loss": 5.948, + "loss/crossentropy": 2.6165642738342285, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17962933331727982, + "step": 14788 + }, + { + "epoch": 0.4621875, + "grad_norm": 3.3125, + "grad_norm_var": 0.23883056640625, + "learning_rate": 0.0001, + "loss": 5.9596, + "loss/crossentropy": 2.6575080156326294, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1809903010725975, + "step": 14790 + }, + { + "epoch": 0.46225, + "grad_norm": 2.9375, + "grad_norm_var": 0.24378255208333333, + "learning_rate": 0.0001, + "loss": 5.0272, + "loss/crossentropy": 2.03986132144928, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.14991004019975662, + "step": 14792 + }, + { + "epoch": 0.4623125, + "grad_norm": 4.34375, + "grad_norm_var": 0.3038238525390625, + "learning_rate": 0.0001, + "loss": 6.2764, + "loss/crossentropy": 2.893642544746399, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18476461619138718, + "step": 14794 + }, + { + "epoch": 0.462375, + "grad_norm": 2.953125, + "grad_norm_var": 0.15435791015625, + "learning_rate": 0.0001, + "loss": 5.7914, + "loss/crossentropy": 2.607421875, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16761763393878937, + "step": 14796 + }, + { + "epoch": 0.4624375, + "grad_norm": 3.015625, + "grad_norm_var": 0.14687093098958334, + "learning_rate": 0.0001, + "loss": 5.4684, + "loss/crossentropy": 2.319623112678528, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.15980049967765808, + "step": 14798 + }, + { + "epoch": 0.4625, + "grad_norm": 3.203125, + "grad_norm_var": 0.15110270182291666, + "learning_rate": 0.0001, + "loss": 5.6973, + "loss/crossentropy": 2.511590003967285, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16895774006843567, + "step": 14800 + }, + { + "epoch": 0.4625625, + "grad_norm": 3.390625, + "grad_norm_var": 0.11250712076822916, + "learning_rate": 0.0001, + "loss": 5.9353, + "loss/crossentropy": 2.6513490676879883, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17722097784280777, + "step": 14802 + }, + { + "epoch": 0.462625, + "grad_norm": 3.59375, + "grad_norm_var": 0.12161051432291667, + "learning_rate": 0.0001, + "loss": 6.3433, + "loss/crossentropy": 2.8445650339126587, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19244921952486038, + "step": 14804 + }, + { + "epoch": 0.4626875, + "grad_norm": 3.15625, + "grad_norm_var": 0.12209370930989584, + "learning_rate": 0.0001, + "loss": 5.7802, + "loss/crossentropy": 2.4950926303863525, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17733414471149445, + "step": 14806 + }, + { + "epoch": 0.46275, + "grad_norm": 3.265625, + "grad_norm_var": 0.11503499348958333, + "learning_rate": 0.0001, + "loss": 6.1628, + "loss/crossentropy": 2.861618399620056, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17738831788301468, + "step": 14808 + }, + { + "epoch": 0.4628125, + "grad_norm": 3.171875, + "grad_norm_var": 0.0292144775390625, + "learning_rate": 0.0001, + "loss": 5.9535, + "loss/crossentropy": 2.7600247859954834, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1716940999031067, + "step": 14810 + }, + { + "epoch": 0.462875, + "grad_norm": 3.484375, + "grad_norm_var": 0.07835184733072917, + "learning_rate": 0.0001, + "loss": 6.0486, + "loss/crossentropy": 2.697732090950012, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.17570797353982925, + "step": 14812 + }, + { + "epoch": 0.4629375, + "grad_norm": 3.53125, + "grad_norm_var": 0.5511301676432292, + "learning_rate": 0.0001, + "loss": 6.3706, + "loss/crossentropy": 2.7113730907440186, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.21123894304037094, + "step": 14814 + }, + { + "epoch": 0.463, + "grad_norm": 3.09375, + "grad_norm_var": 0.5447092692057292, + "learning_rate": 0.0001, + "loss": 5.9407, + "loss/crossentropy": 2.696148991584778, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17640455812215805, + "step": 14816 + }, + { + "epoch": 0.4630625, + "grad_norm": 3.203125, + "grad_norm_var": 0.5544881184895833, + "learning_rate": 0.0001, + "loss": 5.9524, + "loss/crossentropy": 2.7013330459594727, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17628022283315659, + "step": 14818 + }, + { + "epoch": 0.463125, + "grad_norm": 3.140625, + "grad_norm_var": 0.5630767822265625, + "learning_rate": 0.0001, + "loss": 5.4809, + "loss/crossentropy": 2.3524303436279297, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16128921508789062, + "step": 14820 + }, + { + "epoch": 0.4631875, + "grad_norm": 3.140625, + "grad_norm_var": 0.5612050374348958, + "learning_rate": 0.0001, + "loss": 5.9576, + "loss/crossentropy": 2.6361982822418213, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17705771327018738, + "step": 14822 + }, + { + "epoch": 0.46325, + "grad_norm": 3.03125, + "grad_norm_var": 0.5830078125, + "learning_rate": 0.0001, + "loss": 5.3445, + "loss/crossentropy": 2.308445453643799, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15634051710367203, + "step": 14824 + }, + { + "epoch": 0.4633125, + "grad_norm": 2.953125, + "grad_norm_var": 0.5953928629557291, + "learning_rate": 0.0001, + "loss": 5.4586, + "loss/crossentropy": 2.328156590461731, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16304896026849747, + "step": 14826 + }, + { + "epoch": 0.463375, + "grad_norm": 3.15625, + "grad_norm_var": 0.56376953125, + "learning_rate": 0.0001, + "loss": 6.092, + "loss/crossentropy": 2.772488594055176, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18077776581048965, + "step": 14828 + }, + { + "epoch": 0.4634375, + "grad_norm": 3.078125, + "grad_norm_var": 0.03418680826822917, + "learning_rate": 0.0001, + "loss": 5.293, + "loss/crossentropy": 2.2395306825637817, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1553502231836319, + "step": 14830 + }, + { + "epoch": 0.4635, + "grad_norm": 13.3125, + "grad_norm_var": 6.447184244791667, + "learning_rate": 0.0001, + "loss": 6.6271, + "loss/crossentropy": 2.7424668073654175, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.22869876772165298, + "step": 14832 + }, + { + "epoch": 0.4635625, + "grad_norm": 3.703125, + "grad_norm_var": 6.374535115559896, + "learning_rate": 0.0001, + "loss": 5.7731, + "loss/crossentropy": 2.4544259309768677, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18147774040699005, + "step": 14834 + }, + { + "epoch": 0.463625, + "grad_norm": 3.4375, + "grad_norm_var": 6.3384765625, + "learning_rate": 0.0001, + "loss": 5.9261, + "loss/crossentropy": 2.574429750442505, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18047606199979782, + "step": 14836 + }, + { + "epoch": 0.4636875, + "grad_norm": 3.640625, + "grad_norm_var": 6.321174112955729, + "learning_rate": 0.0001, + "loss": 5.7131, + "loss/crossentropy": 2.3872469663619995, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18141291290521622, + "step": 14838 + }, + { + "epoch": 0.46375, + "grad_norm": 3.171875, + "grad_norm_var": 6.293912760416666, + "learning_rate": 0.0001, + "loss": 5.5187, + "loss/crossentropy": 2.3648808002471924, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1665530502796173, + "step": 14840 + }, + { + "epoch": 0.4638125, + "grad_norm": 3.328125, + "grad_norm_var": 6.237385050455729, + "learning_rate": 0.0001, + "loss": 5.4878, + "loss/crossentropy": 2.4001606702804565, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16071591526269913, + "step": 14842 + }, + { + "epoch": 0.463875, + "grad_norm": 3.265625, + "grad_norm_var": 6.230231730143229, + "learning_rate": 0.0001, + "loss": 6.2012, + "loss/crossentropy": 2.899673342704773, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17897935211658478, + "step": 14844 + }, + { + "epoch": 0.4639375, + "grad_norm": 3.234375, + "grad_norm_var": 6.202144368489583, + "learning_rate": 0.0001, + "loss": 5.7554, + "loss/crossentropy": 2.5932239294052124, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16504772007465363, + "step": 14846 + }, + { + "epoch": 0.464, + "grad_norm": 3.140625, + "grad_norm_var": 0.06018473307291667, + "learning_rate": 0.0001, + "loss": 5.7055, + "loss/crossentropy": 2.6065553426742554, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16380469501018524, + "step": 14848 + }, + { + "epoch": 0.4640625, + "grad_norm": 3.078125, + "grad_norm_var": 0.03425191243489583, + "learning_rate": 0.0001, + "loss": 5.9366, + "loss/crossentropy": 2.713751435279846, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17111469060182571, + "step": 14850 + }, + { + "epoch": 0.464125, + "grad_norm": 3.171875, + "grad_norm_var": 0.03703511555989583, + "learning_rate": 0.0001, + "loss": 5.7265, + "loss/crossentropy": 2.6059426069259644, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16205142438411713, + "step": 14852 + }, + { + "epoch": 0.4641875, + "grad_norm": 3.953125, + "grad_norm_var": 0.06829427083333334, + "learning_rate": 0.0001, + "loss": 5.8085, + "loss/crossentropy": 2.6018357276916504, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1706642284989357, + "step": 14854 + }, + { + "epoch": 0.46425, + "grad_norm": 3.625, + "grad_norm_var": 0.08379618326822917, + "learning_rate": 0.0001, + "loss": 6.0674, + "loss/crossentropy": 2.7422547340393066, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18212289363145828, + "step": 14856 + }, + { + "epoch": 0.4643125, + "grad_norm": 3.375, + "grad_norm_var": 0.08560791015625, + "learning_rate": 0.0001, + "loss": 5.7673, + "loss/crossentropy": 2.5514076948165894, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17120198160409927, + "step": 14858 + }, + { + "epoch": 0.464375, + "grad_norm": 3.125, + "grad_norm_var": 0.0886627197265625, + "learning_rate": 0.0001, + "loss": 5.8941, + "loss/crossentropy": 2.735658288002014, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1670130491256714, + "step": 14860 + }, + { + "epoch": 0.4644375, + "grad_norm": 3.234375, + "grad_norm_var": 0.0883941650390625, + "learning_rate": 0.0001, + "loss": 5.5859, + "loss/crossentropy": 2.4559171199798584, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16690906137228012, + "step": 14862 + }, + { + "epoch": 0.4645, + "grad_norm": 3.140625, + "grad_norm_var": 0.08950093587239584, + "learning_rate": 0.0001, + "loss": 5.7883, + "loss/crossentropy": 2.468541145324707, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17689593136310577, + "step": 14864 + }, + { + "epoch": 0.4645625, + "grad_norm": 3.296875, + "grad_norm_var": 0.0905670166015625, + "learning_rate": 0.0001, + "loss": 6.0488, + "loss/crossentropy": 2.794520854949951, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17347898334264755, + "step": 14866 + }, + { + "epoch": 0.464625, + "grad_norm": 3.796875, + "grad_norm_var": 0.10972391764322917, + "learning_rate": 0.0001, + "loss": 5.8122, + "loss/crossentropy": 2.4630837440490723, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18413476645946503, + "step": 14868 + }, + { + "epoch": 0.4646875, + "grad_norm": 3.125, + "grad_norm_var": 0.06687723795572917, + "learning_rate": 0.0001, + "loss": 5.662, + "loss/crossentropy": 2.499110698699951, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16941003501415253, + "step": 14870 + }, + { + "epoch": 0.46475, + "grad_norm": 3.015625, + "grad_norm_var": 0.05271708170572917, + "learning_rate": 0.0001, + "loss": 5.6792, + "loss/crossentropy": 2.492180109024048, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16908857226371765, + "step": 14872 + }, + { + "epoch": 0.4648125, + "grad_norm": 3.421875, + "grad_norm_var": 0.05548502604166667, + "learning_rate": 0.0001, + "loss": 5.9703, + "loss/crossentropy": 2.632256269454956, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18107140809297562, + "step": 14874 + }, + { + "epoch": 0.464875, + "grad_norm": 3.40625, + "grad_norm_var": 0.052958170572916664, + "learning_rate": 0.0001, + "loss": 5.7635, + "loss/crossentropy": 2.4615299701690674, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17785103619098663, + "step": 14876 + }, + { + "epoch": 0.4649375, + "grad_norm": 3.1875, + "grad_norm_var": 0.04986979166666667, + "learning_rate": 0.0001, + "loss": 5.6528, + "loss/crossentropy": 2.455379843711853, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16818365454673767, + "step": 14878 + }, + { + "epoch": 0.465, + "grad_norm": 3.21875, + "grad_norm_var": 0.055174763997395834, + "learning_rate": 0.0001, + "loss": 5.8532, + "loss/crossentropy": 2.6770154237747192, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.171913243830204, + "step": 14880 + }, + { + "epoch": 0.4650625, + "grad_norm": 3.46875, + "grad_norm_var": 0.05263570149739583, + "learning_rate": 0.0001, + "loss": 5.9324, + "loss/crossentropy": 2.648526191711426, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1795620173215866, + "step": 14882 + }, + { + "epoch": 0.465125, + "grad_norm": 3.125, + "grad_norm_var": 0.028316243489583334, + "learning_rate": 0.0001, + "loss": 5.6334, + "loss/crossentropy": 2.516292929649353, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16366364061832428, + "step": 14884 + }, + { + "epoch": 0.4651875, + "grad_norm": 3.09375, + "grad_norm_var": 0.028873697916666666, + "learning_rate": 0.0001, + "loss": 5.5178, + "loss/crossentropy": 2.356394052505493, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16887901723384857, + "step": 14886 + }, + { + "epoch": 0.46525, + "grad_norm": 2.953125, + "grad_norm_var": 0.03427734375, + "learning_rate": 0.0001, + "loss": 5.7019, + "loss/crossentropy": 2.5004215240478516, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16936583817005157, + "step": 14888 + }, + { + "epoch": 0.4653125, + "grad_norm": 3.015625, + "grad_norm_var": 0.034211222330729166, + "learning_rate": 0.0001, + "loss": 5.8795, + "loss/crossentropy": 2.6740177869796753, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17093826830387115, + "step": 14890 + }, + { + "epoch": 0.465375, + "grad_norm": 2.90625, + "grad_norm_var": 0.03658854166666667, + "learning_rate": 0.0001, + "loss": 5.6259, + "loss/crossentropy": 2.4603389501571655, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16694767773151398, + "step": 14892 + }, + { + "epoch": 0.4654375, + "grad_norm": 3.859375, + "grad_norm_var": 0.06480204264322917, + "learning_rate": 0.0001, + "loss": 5.9331, + "loss/crossentropy": 2.5701653957366943, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.17652342468500137, + "step": 14894 + }, + { + "epoch": 0.4655, + "grad_norm": 3.3125, + "grad_norm_var": 0.05777587890625, + "learning_rate": 0.0001, + "loss": 5.598, + "loss/crossentropy": 2.4629119634628296, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1635095402598381, + "step": 14896 + }, + { + "epoch": 0.4655625, + "grad_norm": 2.9375, + "grad_norm_var": 0.05530192057291667, + "learning_rate": 0.0001, + "loss": 5.7099, + "loss/crossentropy": 2.5170661211013794, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16928022354841232, + "step": 14898 + }, + { + "epoch": 0.465625, + "grad_norm": 2.90625, + "grad_norm_var": 0.059370930989583334, + "learning_rate": 0.0001, + "loss": 5.7232, + "loss/crossentropy": 2.577171802520752, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16733480244874954, + "step": 14900 + }, + { + "epoch": 0.4656875, + "grad_norm": 3.484375, + "grad_norm_var": 0.06769917805989584, + "learning_rate": 0.0001, + "loss": 5.8252, + "loss/crossentropy": 2.646600842475891, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.16473273932933807, + "step": 14902 + }, + { + "epoch": 0.46575, + "grad_norm": 3.1875, + "grad_norm_var": 0.058592732747395834, + "learning_rate": 0.0001, + "loss": 6.0117, + "loss/crossentropy": 2.7387936115264893, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17650548368692398, + "step": 14904 + }, + { + "epoch": 0.4658125, + "grad_norm": 3.359375, + "grad_norm_var": 0.05969950358072917, + "learning_rate": 0.0001, + "loss": 5.9957, + "loss/crossentropy": 2.75613534450531, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1743444800376892, + "step": 14906 + }, + { + "epoch": 0.465875, + "grad_norm": 3.234375, + "grad_norm_var": 0.05598856608072917, + "learning_rate": 0.0001, + "loss": 5.8286, + "loss/crossentropy": 2.555168867111206, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17577717453241348, + "step": 14908 + }, + { + "epoch": 0.4659375, + "grad_norm": 3.203125, + "grad_norm_var": 0.026188151041666666, + "learning_rate": 0.0001, + "loss": 5.7309, + "loss/crossentropy": 2.5268471240997314, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1711895540356636, + "step": 14910 + }, + { + "epoch": 0.466, + "grad_norm": 3.109375, + "grad_norm_var": 0.0243560791015625, + "learning_rate": 0.0001, + "loss": 5.4166, + "loss/crossentropy": 2.2632104754447937, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1649443358182907, + "step": 14912 + }, + { + "epoch": 0.4660625, + "grad_norm": 3.40625, + "grad_norm_var": 0.0246002197265625, + "learning_rate": 0.0001, + "loss": 6.0711, + "loss/crossentropy": 2.8264983892440796, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17251016944646835, + "step": 14914 + }, + { + "epoch": 0.466125, + "grad_norm": 3.5, + "grad_norm_var": 0.02340087890625, + "learning_rate": 0.0001, + "loss": 5.7918, + "loss/crossentropy": 2.5308409929275513, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17218975722789764, + "step": 14916 + }, + { + "epoch": 0.4661875, + "grad_norm": 3.234375, + "grad_norm_var": 0.015314737955729166, + "learning_rate": 0.0001, + "loss": 5.788, + "loss/crossentropy": 2.5696581602096558, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1702689602971077, + "step": 14918 + }, + { + "epoch": 0.46625, + "grad_norm": 3.484375, + "grad_norm_var": 0.025581868489583333, + "learning_rate": 0.0001, + "loss": 5.6231, + "loss/crossentropy": 2.4615012407302856, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16576935350894928, + "step": 14920 + }, + { + "epoch": 0.4663125, + "grad_norm": 3.171875, + "grad_norm_var": 0.028465779622395833, + "learning_rate": 0.0001, + "loss": 5.8206, + "loss/crossentropy": 2.502258539199829, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18104856461286545, + "step": 14922 + }, + { + "epoch": 0.466375, + "grad_norm": 3.09375, + "grad_norm_var": 0.041869099934895834, + "learning_rate": 0.0001, + "loss": 5.7827, + "loss/crossentropy": 2.507362484931946, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17401570081710815, + "step": 14924 + }, + { + "epoch": 0.4664375, + "grad_norm": 3.265625, + "grad_norm_var": 0.04195556640625, + "learning_rate": 0.0001, + "loss": 5.8868, + "loss/crossentropy": 2.6286935806274414, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17620167881250381, + "step": 14926 + }, + { + "epoch": 0.4665, + "grad_norm": 3.328125, + "grad_norm_var": 0.044759114583333336, + "learning_rate": 0.0001, + "loss": 5.6895, + "loss/crossentropy": 2.4355950355529785, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17031612992286682, + "step": 14928 + }, + { + "epoch": 0.4665625, + "grad_norm": 3.34375, + "grad_norm_var": 0.04107666015625, + "learning_rate": 0.0001, + "loss": 6.2603, + "loss/crossentropy": 2.83007276058197, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18872267752885818, + "step": 14930 + }, + { + "epoch": 0.466625, + "grad_norm": 3.078125, + "grad_norm_var": 0.05322265625, + "learning_rate": 0.0001, + "loss": 5.9574, + "loss/crossentropy": 2.6480225324630737, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17976664006710052, + "step": 14932 + }, + { + "epoch": 0.4666875, + "grad_norm": 3.25, + "grad_norm_var": 0.05519917805989583, + "learning_rate": 0.0001, + "loss": 5.4475, + "loss/crossentropy": 2.3893600702285767, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15971647202968597, + "step": 14934 + }, + { + "epoch": 0.46675, + "grad_norm": 3.0625, + "grad_norm_var": 0.04731343587239583, + "learning_rate": 0.0001, + "loss": 5.7363, + "loss/crossentropy": 2.5933868885040283, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16585280001163483, + "step": 14936 + }, + { + "epoch": 0.4668125, + "grad_norm": 3.140625, + "grad_norm_var": 0.045735677083333336, + "learning_rate": 0.0001, + "loss": 5.8121, + "loss/crossentropy": 2.619690418243408, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17197076976299286, + "step": 14938 + }, + { + "epoch": 0.466875, + "grad_norm": 3.015625, + "grad_norm_var": 0.03780924479166667, + "learning_rate": 0.0001, + "loss": 5.7452, + "loss/crossentropy": 2.5262022018432617, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17268161475658417, + "step": 14940 + }, + { + "epoch": 0.4669375, + "grad_norm": 2.875, + "grad_norm_var": 0.051789347330729166, + "learning_rate": 0.0001, + "loss": 5.4068, + "loss/crossentropy": 2.4165834188461304, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1529240757226944, + "step": 14942 + }, + { + "epoch": 0.467, + "grad_norm": 3.1875, + "grad_norm_var": 0.0452301025390625, + "learning_rate": 0.0001, + "loss": 5.609, + "loss/crossentropy": 2.5023202896118164, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16535700857639313, + "step": 14944 + }, + { + "epoch": 0.4670625, + "grad_norm": 3.15625, + "grad_norm_var": 0.03922119140625, + "learning_rate": 0.0001, + "loss": 5.5997, + "loss/crossentropy": 2.428265929222107, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1722167730331421, + "step": 14946 + }, + { + "epoch": 0.467125, + "grad_norm": 3.125, + "grad_norm_var": 0.01822509765625, + "learning_rate": 0.0001, + "loss": 5.8591, + "loss/crossentropy": 2.618662476539612, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17599669098854065, + "step": 14948 + }, + { + "epoch": 0.4671875, + "grad_norm": 3.171875, + "grad_norm_var": 0.05751953125, + "learning_rate": 0.0001, + "loss": 5.8571, + "loss/crossentropy": 2.6167280673980713, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17168885469436646, + "step": 14950 + }, + { + "epoch": 0.46725, + "grad_norm": 3.03125, + "grad_norm_var": 0.05782877604166667, + "learning_rate": 0.0001, + "loss": 5.8178, + "loss/crossentropy": 2.5427498817443848, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.17125794291496277, + "step": 14952 + }, + { + "epoch": 0.4673125, + "grad_norm": 3.34375, + "grad_norm_var": 0.05614827473958333, + "learning_rate": 0.0001, + "loss": 6.0896, + "loss/crossentropy": 2.736640214920044, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18373557925224304, + "step": 14954 + }, + { + "epoch": 0.467375, + "grad_norm": 3.03125, + "grad_norm_var": 0.0591796875, + "learning_rate": 0.0001, + "loss": 5.4814, + "loss/crossentropy": 2.4086307287216187, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16353166848421097, + "step": 14956 + }, + { + "epoch": 0.4674375, + "grad_norm": 3.0, + "grad_norm_var": 0.05366109212239583, + "learning_rate": 0.0001, + "loss": 5.6276, + "loss/crossentropy": 2.5093424320220947, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16611898690462112, + "step": 14958 + }, + { + "epoch": 0.4675, + "grad_norm": 3.515625, + "grad_norm_var": 0.06289774576822917, + "learning_rate": 0.0001, + "loss": 6.0632, + "loss/crossentropy": 2.6605218648910522, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1847984567284584, + "step": 14960 + }, + { + "epoch": 0.4675625, + "grad_norm": 3.421875, + "grad_norm_var": 0.06642964680989584, + "learning_rate": 0.0001, + "loss": 5.999, + "loss/crossentropy": 2.737818717956543, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1718187928199768, + "step": 14962 + }, + { + "epoch": 0.467625, + "grad_norm": 3.328125, + "grad_norm_var": 0.0684722900390625, + "learning_rate": 0.0001, + "loss": 5.6779, + "loss/crossentropy": 2.5076217651367188, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17054229974746704, + "step": 14964 + }, + { + "epoch": 0.4676875, + "grad_norm": 3.265625, + "grad_norm_var": 0.03534749348958333, + "learning_rate": 0.0001, + "loss": 5.6204, + "loss/crossentropy": 2.4288675785064697, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1675926074385643, + "step": 14966 + }, + { + "epoch": 0.46775, + "grad_norm": 3.296875, + "grad_norm_var": 0.032868448893229166, + "learning_rate": 0.0001, + "loss": 5.8905, + "loss/crossentropy": 2.590886354446411, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.18308492004871368, + "step": 14968 + }, + { + "epoch": 0.4678125, + "grad_norm": 2.921875, + "grad_norm_var": 0.04488932291666667, + "learning_rate": 0.0001, + "loss": 5.7074, + "loss/crossentropy": 2.486487627029419, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17443692684173584, + "step": 14970 + }, + { + "epoch": 0.467875, + "grad_norm": 3.125, + "grad_norm_var": 0.03582356770833333, + "learning_rate": 0.0001, + "loss": 5.7548, + "loss/crossentropy": 2.5663185119628906, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16962523013353348, + "step": 14972 + }, + { + "epoch": 0.4679375, + "grad_norm": 3.34375, + "grad_norm_var": 0.031201171875, + "learning_rate": 0.0001, + "loss": 5.7835, + "loss/crossentropy": 2.5746841430664062, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17088166624307632, + "step": 14974 + }, + { + "epoch": 0.468, + "grad_norm": 3.015625, + "grad_norm_var": 0.029264322916666665, + "learning_rate": 0.0001, + "loss": 5.6593, + "loss/crossentropy": 2.5325201749801636, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16619150340557098, + "step": 14976 + }, + { + "epoch": 0.4680625, + "grad_norm": 3.109375, + "grad_norm_var": 0.025992838541666667, + "learning_rate": 0.0001, + "loss": 5.7783, + "loss/crossentropy": 2.621680974960327, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16956928372383118, + "step": 14978 + }, + { + "epoch": 0.468125, + "grad_norm": 2.875, + "grad_norm_var": 0.028685506184895834, + "learning_rate": 0.0001, + "loss": 5.1982, + "loss/crossentropy": 2.177245855331421, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1599075049161911, + "step": 14980 + }, + { + "epoch": 0.4681875, + "grad_norm": 3.3125, + "grad_norm_var": 0.028804524739583334, + "learning_rate": 0.0001, + "loss": 5.7523, + "loss/crossentropy": 2.594241499900818, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1669737994670868, + "step": 14982 + }, + { + "epoch": 0.46825, + "grad_norm": 3.234375, + "grad_norm_var": 0.028841145833333335, + "learning_rate": 0.0001, + "loss": 5.9412, + "loss/crossentropy": 2.71023952960968, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1734899953007698, + "step": 14984 + }, + { + "epoch": 0.4683125, + "grad_norm": 3.296875, + "grad_norm_var": 0.0262847900390625, + "learning_rate": 0.0001, + "loss": 5.8847, + "loss/crossentropy": 2.6082526445388794, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17686737328767776, + "step": 14986 + }, + { + "epoch": 0.468375, + "grad_norm": 3.171875, + "grad_norm_var": 0.026178995768229168, + "learning_rate": 0.0001, + "loss": 5.9113, + "loss/crossentropy": 2.6365636587142944, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17747656255960464, + "step": 14988 + }, + { + "epoch": 0.4684375, + "grad_norm": 3.171875, + "grad_norm_var": 0.0247711181640625, + "learning_rate": 0.0001, + "loss": 5.586, + "loss/crossentropy": 2.374174952507019, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1719595566391945, + "step": 14990 + }, + { + "epoch": 0.4685, + "grad_norm": 3.296875, + "grad_norm_var": 0.18365478515625, + "learning_rate": 0.0001, + "loss": 5.7412, + "loss/crossentropy": 2.4613447189331055, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1748618260025978, + "step": 14992 + }, + { + "epoch": 0.4685625, + "grad_norm": 4.875, + "grad_norm_var": 0.3292307535807292, + "learning_rate": 0.0001, + "loss": 6.2036, + "loss/crossentropy": 2.71132493019104, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19454240053892136, + "step": 14994 + }, + { + "epoch": 0.468625, + "grad_norm": 7.25, + "grad_norm_var": 1.204295857747396, + "learning_rate": 0.0001, + "loss": 5.7628, + "loss/crossentropy": 2.4733160734176636, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17621204257011414, + "step": 14996 + }, + { + "epoch": 0.4686875, + "grad_norm": 3.65625, + "grad_norm_var": 1.1716217041015624, + "learning_rate": 0.0001, + "loss": 5.9932, + "loss/crossentropy": 2.6372264623641968, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18247324228286743, + "step": 14998 + }, + { + "epoch": 0.46875, + "grad_norm": 3.546875, + "grad_norm_var": 1.1736317952473958, + "learning_rate": 0.0001, + "loss": 6.1257, + "loss/crossentropy": 2.808167815208435, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1801864132285118, + "step": 15000 + }, + { + "epoch": 0.4688125, + "grad_norm": 3.65625, + "grad_norm_var": 1.1867472330729167, + "learning_rate": 0.0001, + "loss": 6.0516, + "loss/crossentropy": 2.6355226039886475, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18809112906455994, + "step": 15002 + }, + { + "epoch": 0.468875, + "grad_norm": 3.109375, + "grad_norm_var": 1.1732747395833334, + "learning_rate": 0.0001, + "loss": 5.7178, + "loss/crossentropy": 2.509262204170227, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17241171002388, + "step": 15004 + }, + { + "epoch": 0.4689375, + "grad_norm": 3.078125, + "grad_norm_var": 1.15576171875, + "learning_rate": 0.0001, + "loss": 5.5251, + "loss/crossentropy": 2.393805503845215, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.15844451636075974, + "step": 15006 + }, + { + "epoch": 0.469, + "grad_norm": 3.203125, + "grad_norm_var": 1.1028879801432292, + "learning_rate": 0.0001, + "loss": 5.8695, + "loss/crossentropy": 2.580573797225952, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17967334389686584, + "step": 15008 + }, + { + "epoch": 0.4690625, + "grad_norm": 3.21875, + "grad_norm_var": 1.020637003580729, + "learning_rate": 0.0001, + "loss": 5.9801, + "loss/crossentropy": 2.7248635292053223, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17786447703838348, + "step": 15010 + }, + { + "epoch": 0.469125, + "grad_norm": 3.296875, + "grad_norm_var": 0.04663798014322917, + "learning_rate": 0.0001, + "loss": 5.9036, + "loss/crossentropy": 2.589953064918518, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1762884259223938, + "step": 15012 + }, + { + "epoch": 0.4691875, + "grad_norm": 3.015625, + "grad_norm_var": 0.03866780598958333, + "learning_rate": 0.0001, + "loss": 5.6274, + "loss/crossentropy": 2.503264307975769, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16553644835948944, + "step": 15014 + }, + { + "epoch": 0.46925, + "grad_norm": 3.4375, + "grad_norm_var": 6.729605102539063, + "learning_rate": 0.0001, + "loss": 6.1971, + "loss/crossentropy": 2.5574769973754883, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.21240271627902985, + "step": 15016 + }, + { + "epoch": 0.4693125, + "grad_norm": 3.265625, + "grad_norm_var": 6.725877888997396, + "learning_rate": 0.0001, + "loss": 5.8184, + "loss/crossentropy": 2.5202722549438477, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17747237533330917, + "step": 15018 + }, + { + "epoch": 0.469375, + "grad_norm": 3.1875, + "grad_norm_var": 6.75064697265625, + "learning_rate": 0.0001, + "loss": 5.5763, + "loss/crossentropy": 2.4177215099334717, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16859114915132523, + "step": 15020 + }, + { + "epoch": 0.4694375, + "grad_norm": 3.3125, + "grad_norm_var": 6.739330037434896, + "learning_rate": 0.0001, + "loss": 6.071, + "loss/crossentropy": 2.7344648838043213, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1801370605826378, + "step": 15022 + }, + { + "epoch": 0.4695, + "grad_norm": 3.515625, + "grad_norm_var": 6.714892578125, + "learning_rate": 0.0001, + "loss": 5.757, + "loss/crossentropy": 2.531996011734009, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16976594924926758, + "step": 15024 + }, + { + "epoch": 0.4695625, + "grad_norm": 2.96875, + "grad_norm_var": 6.714094034830729, + "learning_rate": 0.0001, + "loss": 5.5923, + "loss/crossentropy": 2.4961259365081787, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.15961505472660065, + "step": 15026 + }, + { + "epoch": 0.469625, + "grad_norm": 2.9375, + "grad_norm_var": 6.713212076822916, + "learning_rate": 0.0001, + "loss": 5.7165, + "loss/crossentropy": 2.5338306427001953, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16748927533626556, + "step": 15028 + }, + { + "epoch": 0.4696875, + "grad_norm": 3.125, + "grad_norm_var": 6.718001302083334, + "learning_rate": 0.0001, + "loss": 5.3329, + "loss/crossentropy": 2.2626683712005615, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16093085706233978, + "step": 15030 + }, + { + "epoch": 0.46975, + "grad_norm": 3.328125, + "grad_norm_var": 0.06539306640625, + "learning_rate": 0.0001, + "loss": 5.6058, + "loss/crossentropy": 2.3245774507522583, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1765628531575203, + "step": 15032 + }, + { + "epoch": 0.4698125, + "grad_norm": 3.40625, + "grad_norm_var": 0.0667388916015625, + "learning_rate": 0.0001, + "loss": 5.8241, + "loss/crossentropy": 2.5087594985961914, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1784081682562828, + "step": 15034 + }, + { + "epoch": 0.469875, + "grad_norm": 3.078125, + "grad_norm_var": 0.06494852701822916, + "learning_rate": 0.0001, + "loss": 5.7196, + "loss/crossentropy": 2.504445433616638, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17112022638320923, + "step": 15036 + }, + { + "epoch": 0.4699375, + "grad_norm": 3.296875, + "grad_norm_var": 0.06573893229166666, + "learning_rate": 0.0001, + "loss": 5.812, + "loss/crossentropy": 2.5174232721328735, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17828761786222458, + "step": 15038 + }, + { + "epoch": 0.47, + "grad_norm": 3.078125, + "grad_norm_var": 0.06652018229166666, + "learning_rate": 0.0001, + "loss": 5.6692, + "loss/crossentropy": 2.443945288658142, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.16940295696258545, + "step": 15040 + }, + { + "epoch": 0.4700625, + "grad_norm": 3.125, + "grad_norm_var": 0.06179097493489583, + "learning_rate": 0.0001, + "loss": 5.6077, + "loss/crossentropy": 2.446028470993042, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16499093174934387, + "step": 15042 + }, + { + "epoch": 0.470125, + "grad_norm": 3.25, + "grad_norm_var": 0.05515848795572917, + "learning_rate": 0.0001, + "loss": 5.7399, + "loss/crossentropy": 2.582988977432251, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1680392622947693, + "step": 15044 + }, + { + "epoch": 0.4701875, + "grad_norm": 3.234375, + "grad_norm_var": 0.04488932291666667, + "learning_rate": 0.0001, + "loss": 5.629, + "loss/crossentropy": 2.4428768157958984, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1697836071252823, + "step": 15046 + }, + { + "epoch": 0.47025, + "grad_norm": 3.078125, + "grad_norm_var": 0.028905232747395832, + "learning_rate": 0.0001, + "loss": 6.1088, + "loss/crossentropy": 2.7833809852600098, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18136659264564514, + "step": 15048 + }, + { + "epoch": 0.4703125, + "grad_norm": 3.375, + "grad_norm_var": 0.0274322509765625, + "learning_rate": 0.0001, + "loss": 5.8243, + "loss/crossentropy": 2.5886348485946655, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17473383247852325, + "step": 15050 + }, + { + "epoch": 0.470375, + "grad_norm": 3.21875, + "grad_norm_var": 0.0244537353515625, + "learning_rate": 0.0001, + "loss": 5.6175, + "loss/crossentropy": 2.433435082435608, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1703553944826126, + "step": 15052 + }, + { + "epoch": 0.4704375, + "grad_norm": 3.1875, + "grad_norm_var": 0.023095703125, + "learning_rate": 0.0001, + "loss": 5.849, + "loss/crossentropy": 2.6403539180755615, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1700805500149727, + "step": 15054 + }, + { + "epoch": 0.4705, + "grad_norm": 3.0, + "grad_norm_var": 0.026200358072916666, + "learning_rate": 0.0001, + "loss": 5.5189, + "loss/crossentropy": 2.4523452520370483, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1605592966079712, + "step": 15056 + }, + { + "epoch": 0.4705625, + "grad_norm": 3.171875, + "grad_norm_var": 0.025830078125, + "learning_rate": 0.0001, + "loss": 5.6711, + "loss/crossentropy": 2.48645281791687, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16689835488796234, + "step": 15058 + }, + { + "epoch": 0.470625, + "grad_norm": 3.203125, + "grad_norm_var": 0.0253814697265625, + "learning_rate": 0.0001, + "loss": 5.9489, + "loss/crossentropy": 2.6477524042129517, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17816638201475143, + "step": 15060 + }, + { + "epoch": 0.4706875, + "grad_norm": 3.390625, + "grad_norm_var": 0.03052978515625, + "learning_rate": 0.0001, + "loss": 5.6189, + "loss/crossentropy": 2.463808298110962, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16511918604373932, + "step": 15062 + }, + { + "epoch": 0.47075, + "grad_norm": 3.140625, + "grad_norm_var": 0.023582967122395833, + "learning_rate": 0.0001, + "loss": 5.7961, + "loss/crossentropy": 2.570745825767517, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17331210523843765, + "step": 15064 + }, + { + "epoch": 0.4708125, + "grad_norm": 3.171875, + "grad_norm_var": 0.024833170572916667, + "learning_rate": 0.0001, + "loss": 6.0957, + "loss/crossentropy": 2.7117077112197876, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18527565896511078, + "step": 15066 + }, + { + "epoch": 0.470875, + "grad_norm": 3.5625, + "grad_norm_var": 0.03326416015625, + "learning_rate": 0.0001, + "loss": 6.104, + "loss/crossentropy": 2.7489144802093506, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18277819454669952, + "step": 15068 + }, + { + "epoch": 0.4709375, + "grad_norm": 3.34375, + "grad_norm_var": 0.0377349853515625, + "learning_rate": 0.0001, + "loss": 6.0843, + "loss/crossentropy": 2.7256009578704834, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18079321086406708, + "step": 15070 + }, + { + "epoch": 0.471, + "grad_norm": 3.3125, + "grad_norm_var": 0.027567545572916668, + "learning_rate": 0.0001, + "loss": 5.9187, + "loss/crossentropy": 2.6594446897506714, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1763150617480278, + "step": 15072 + }, + { + "epoch": 0.4710625, + "grad_norm": 3.3125, + "grad_norm_var": 0.028498331705729168, + "learning_rate": 0.0001, + "loss": 5.8383, + "loss/crossentropy": 2.5661813020706177, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17838828265666962, + "step": 15074 + }, + { + "epoch": 0.471125, + "grad_norm": 3.234375, + "grad_norm_var": 0.02818603515625, + "learning_rate": 0.0001, + "loss": 6.0568, + "loss/crossentropy": 2.731650710105896, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18056457489728928, + "step": 15076 + }, + { + "epoch": 0.4711875, + "grad_norm": 3.4375, + "grad_norm_var": 0.0237701416015625, + "learning_rate": 0.0001, + "loss": 5.9464, + "loss/crossentropy": 2.658777952194214, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17642329633235931, + "step": 15078 + }, + { + "epoch": 0.47125, + "grad_norm": 3.03125, + "grad_norm_var": 0.0229156494140625, + "learning_rate": 0.0001, + "loss": 6.0087, + "loss/crossentropy": 2.726006269454956, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17943864315748215, + "step": 15080 + }, + { + "epoch": 0.4713125, + "grad_norm": 3.140625, + "grad_norm_var": 0.02261962890625, + "learning_rate": 0.0001, + "loss": 5.8073, + "loss/crossentropy": 2.5403780937194824, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17278371006250381, + "step": 15082 + }, + { + "epoch": 0.471375, + "grad_norm": 3.390625, + "grad_norm_var": 0.019017537434895832, + "learning_rate": 0.0001, + "loss": 5.9824, + "loss/crossentropy": 2.713652014732361, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17648645490407944, + "step": 15084 + }, + { + "epoch": 0.4714375, + "grad_norm": 3.328125, + "grad_norm_var": 0.016624959309895833, + "learning_rate": 0.0001, + "loss": 5.7174, + "loss/crossentropy": 2.4842076301574707, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17175430804491043, + "step": 15086 + }, + { + "epoch": 0.4715, + "grad_norm": 3.8125, + "grad_norm_var": 0.03528544108072917, + "learning_rate": 0.0001, + "loss": 5.7058, + "loss/crossentropy": 2.510892391204834, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1655847132205963, + "step": 15088 + }, + { + "epoch": 0.4715625, + "grad_norm": 3.125, + "grad_norm_var": 0.0415435791015625, + "learning_rate": 0.0001, + "loss": 5.5088, + "loss/crossentropy": 2.3916221857070923, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16327939927577972, + "step": 15090 + }, + { + "epoch": 0.471625, + "grad_norm": 3.390625, + "grad_norm_var": 0.046891276041666666, + "learning_rate": 0.0001, + "loss": 5.5516, + "loss/crossentropy": 2.492325186729431, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.15436577051877975, + "step": 15092 + }, + { + "epoch": 0.4716875, + "grad_norm": 3.15625, + "grad_norm_var": 0.04752197265625, + "learning_rate": 0.0001, + "loss": 5.7036, + "loss/crossentropy": 2.5581146478652954, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16377197206020355, + "step": 15094 + }, + { + "epoch": 0.47175, + "grad_norm": 3.390625, + "grad_norm_var": 0.0471343994140625, + "learning_rate": 0.0001, + "loss": 5.6875, + "loss/crossentropy": 2.494322180747986, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1704912930727005, + "step": 15096 + }, + { + "epoch": 0.4718125, + "grad_norm": 3.015625, + "grad_norm_var": 0.0501861572265625, + "learning_rate": 0.0001, + "loss": 5.633, + "loss/crossentropy": 2.4340046644210815, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1679450124502182, + "step": 15098 + }, + { + "epoch": 0.471875, + "grad_norm": 3.40625, + "grad_norm_var": 0.048884073893229164, + "learning_rate": 0.0001, + "loss": 5.916, + "loss/crossentropy": 2.6575233936309814, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17545659095048904, + "step": 15100 + }, + { + "epoch": 0.4719375, + "grad_norm": 3.078125, + "grad_norm_var": 0.049540201822916664, + "learning_rate": 0.0001, + "loss": 5.5774, + "loss/crossentropy": 2.433680295944214, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16437102854251862, + "step": 15102 + }, + { + "epoch": 0.472, + "grad_norm": 4.09375, + "grad_norm_var": 0.07652994791666666, + "learning_rate": 0.0001, + "loss": 5.8867, + "loss/crossentropy": 2.4997910261154175, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18361079692840576, + "step": 15104 + }, + { + "epoch": 0.4720625, + "grad_norm": 3.15625, + "grad_norm_var": 0.07822977701822917, + "learning_rate": 0.0001, + "loss": 5.6591, + "loss/crossentropy": 2.4471428394317627, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16885577887296677, + "step": 15106 + }, + { + "epoch": 0.472125, + "grad_norm": 3.0625, + "grad_norm_var": 0.07629801432291666, + "learning_rate": 0.0001, + "loss": 5.5201, + "loss/crossentropy": 2.4350301027297974, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.15929025411605835, + "step": 15108 + }, + { + "epoch": 0.4721875, + "grad_norm": 2.90625, + "grad_norm_var": 0.0806793212890625, + "learning_rate": 0.0001, + "loss": 5.6057, + "loss/crossentropy": 2.4947516918182373, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1634358912706375, + "step": 15110 + }, + { + "epoch": 0.47225, + "grad_norm": 3.03125, + "grad_norm_var": 0.08280843098958333, + "learning_rate": 0.0001, + "loss": 5.9371, + "loss/crossentropy": 2.7836095094680786, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16691502183675766, + "step": 15112 + }, + { + "epoch": 0.4723125, + "grad_norm": 3.25, + "grad_norm_var": 0.07932942708333333, + "learning_rate": 0.0001, + "loss": 5.7497, + "loss/crossentropy": 2.557365298271179, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17235562205314636, + "step": 15114 + }, + { + "epoch": 0.472375, + "grad_norm": 3.078125, + "grad_norm_var": 0.07916666666666666, + "learning_rate": 0.0001, + "loss": 5.6172, + "loss/crossentropy": 2.493856430053711, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1642913818359375, + "step": 15116 + }, + { + "epoch": 0.4724375, + "grad_norm": 3.046875, + "grad_norm_var": 0.07940165201822917, + "learning_rate": 0.0001, + "loss": 5.4265, + "loss/crossentropy": 2.3340771198272705, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16392983496189117, + "step": 15118 + }, + { + "epoch": 0.4725, + "grad_norm": 3.0, + "grad_norm_var": 0.024507649739583335, + "learning_rate": 0.0001, + "loss": 5.7192, + "loss/crossentropy": 2.53964900970459, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16873925179243088, + "step": 15120 + }, + { + "epoch": 0.4725625, + "grad_norm": 3.34375, + "grad_norm_var": 0.013179524739583334, + "learning_rate": 0.0001, + "loss": 6.1749, + "loss/crossentropy": 2.7930883169174194, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18466253578662872, + "step": 15122 + }, + { + "epoch": 0.472625, + "grad_norm": 3.0625, + "grad_norm_var": 0.013525390625, + "learning_rate": 0.0001, + "loss": 6.1091, + "loss/crossentropy": 2.835660219192505, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17851269245147705, + "step": 15124 + }, + { + "epoch": 0.4726875, + "grad_norm": 3.328125, + "grad_norm_var": 0.019384765625, + "learning_rate": 0.0001, + "loss": 5.7592, + "loss/crossentropy": 2.4408318996429443, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.17480113357305527, + "step": 15126 + }, + { + "epoch": 0.47275, + "grad_norm": 3.140625, + "grad_norm_var": 0.017964680989583332, + "learning_rate": 0.0001, + "loss": 5.4661, + "loss/crossentropy": 2.359795093536377, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16101963818073273, + "step": 15128 + }, + { + "epoch": 0.4728125, + "grad_norm": 3.15625, + "grad_norm_var": 0.0249908447265625, + "learning_rate": 0.0001, + "loss": 5.9525, + "loss/crossentropy": 2.5907492637634277, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18226782232522964, + "step": 15130 + }, + { + "epoch": 0.472875, + "grad_norm": 3.5, + "grad_norm_var": 0.0279296875, + "learning_rate": 0.0001, + "loss": 5.68, + "loss/crossentropy": 2.4544483423233032, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.16825765371322632, + "step": 15132 + }, + { + "epoch": 0.4729375, + "grad_norm": 3.296875, + "grad_norm_var": 0.027179972330729166, + "learning_rate": 0.0001, + "loss": 5.8343, + "loss/crossentropy": 2.6822274923324585, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1663799062371254, + "step": 15134 + }, + { + "epoch": 0.473, + "grad_norm": 3.0625, + "grad_norm_var": 0.025324503580729168, + "learning_rate": 0.0001, + "loss": 5.4519, + "loss/crossentropy": 2.3754748106002808, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16116300970315933, + "step": 15136 + }, + { + "epoch": 0.4730625, + "grad_norm": 3.09375, + "grad_norm_var": 0.026871744791666666, + "learning_rate": 0.0001, + "loss": 5.6349, + "loss/crossentropy": 2.5355676412582397, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.161496102809906, + "step": 15138 + }, + { + "epoch": 0.473125, + "grad_norm": 3.359375, + "grad_norm_var": 0.027057902018229166, + "learning_rate": 0.0001, + "loss": 5.8073, + "loss/crossentropy": 2.50444233417511, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17598942667245865, + "step": 15140 + }, + { + "epoch": 0.4731875, + "grad_norm": 3.0625, + "grad_norm_var": 0.024544270833333333, + "learning_rate": 0.0001, + "loss": 5.787, + "loss/crossentropy": 2.6119871139526367, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1671072095632553, + "step": 15142 + }, + { + "epoch": 0.47325, + "grad_norm": 3.390625, + "grad_norm_var": 0.025877888997395834, + "learning_rate": 0.0001, + "loss": 5.7882, + "loss/crossentropy": 2.571872115135193, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1751495897769928, + "step": 15144 + }, + { + "epoch": 0.4733125, + "grad_norm": 3.09375, + "grad_norm_var": 0.022907511393229166, + "learning_rate": 0.0001, + "loss": 5.8456, + "loss/crossentropy": 2.6580671072006226, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1707022786140442, + "step": 15146 + }, + { + "epoch": 0.473375, + "grad_norm": 3.28125, + "grad_norm_var": 0.02828369140625, + "learning_rate": 0.0001, + "loss": 5.836, + "loss/crossentropy": 2.5903728008270264, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17651164531707764, + "step": 15148 + }, + { + "epoch": 0.4734375, + "grad_norm": 2.984375, + "grad_norm_var": 0.03052978515625, + "learning_rate": 0.0001, + "loss": 5.7862, + "loss/crossentropy": 2.5991785526275635, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17182902991771698, + "step": 15150 + }, + { + "epoch": 0.4735, + "grad_norm": 3.140625, + "grad_norm_var": 0.0308502197265625, + "learning_rate": 0.0001, + "loss": 5.8214, + "loss/crossentropy": 2.528180241584778, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17620117962360382, + "step": 15152 + }, + { + "epoch": 0.4735625, + "grad_norm": 3.5625, + "grad_norm_var": 0.03492431640625, + "learning_rate": 0.0001, + "loss": 5.9033, + "loss/crossentropy": 2.655681610107422, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17437532544136047, + "step": 15154 + }, + { + "epoch": 0.473625, + "grad_norm": 3.296875, + "grad_norm_var": 0.03551025390625, + "learning_rate": 0.0001, + "loss": 5.62, + "loss/crossentropy": 2.4807881116867065, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16977840662002563, + "step": 15156 + }, + { + "epoch": 0.4736875, + "grad_norm": 2.828125, + "grad_norm_var": 0.044417317708333334, + "learning_rate": 0.0001, + "loss": 6.0188, + "loss/crossentropy": 2.7204389572143555, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17827719449996948, + "step": 15158 + }, + { + "epoch": 0.47375, + "grad_norm": 3.421875, + "grad_norm_var": 0.0440826416015625, + "learning_rate": 0.0001, + "loss": 6.0667, + "loss/crossentropy": 2.8044852018356323, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1738826036453247, + "step": 15160 + }, + { + "epoch": 0.4738125, + "grad_norm": 2.90625, + "grad_norm_var": 0.04897359212239583, + "learning_rate": 0.0001, + "loss": 5.7891, + "loss/crossentropy": 2.6122967004776, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1735389232635498, + "step": 15162 + }, + { + "epoch": 0.473875, + "grad_norm": 3.28125, + "grad_norm_var": 0.036454264322916666, + "learning_rate": 0.0001, + "loss": 5.895, + "loss/crossentropy": 2.642680287361145, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17523349821567535, + "step": 15164 + }, + { + "epoch": 0.4739375, + "grad_norm": 3.1875, + "grad_norm_var": 0.0365142822265625, + "learning_rate": 0.0001, + "loss": 5.6857, + "loss/crossentropy": 2.474525213241577, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17150677740573883, + "step": 15166 + }, + { + "epoch": 0.474, + "grad_norm": 3.0625, + "grad_norm_var": 0.0347320556640625, + "learning_rate": 0.0001, + "loss": 5.3874, + "loss/crossentropy": 2.324982166290283, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.15663722157478333, + "step": 15168 + }, + { + "epoch": 0.4740625, + "grad_norm": 3.15625, + "grad_norm_var": 0.0246246337890625, + "learning_rate": 0.0001, + "loss": 6.0049, + "loss/crossentropy": 2.7491393089294434, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1755783036351204, + "step": 15170 + }, + { + "epoch": 0.474125, + "grad_norm": 3.0, + "grad_norm_var": 0.0246978759765625, + "learning_rate": 0.0001, + "loss": 5.8553, + "loss/crossentropy": 2.628808856010437, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17538384348154068, + "step": 15172 + }, + { + "epoch": 0.4741875, + "grad_norm": 3.25, + "grad_norm_var": 0.017577107747395834, + "learning_rate": 0.0001, + "loss": 5.636, + "loss/crossentropy": 2.4430015087127686, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17203166335821152, + "step": 15174 + }, + { + "epoch": 0.47425, + "grad_norm": 4.6875, + "grad_norm_var": 0.16422119140625, + "learning_rate": 0.0001, + "loss": 5.3546, + "loss/crossentropy": 2.3191999197006226, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1535409614443779, + "step": 15176 + }, + { + "epoch": 0.4743125, + "grad_norm": 3.3125, + "grad_norm_var": 0.15623372395833332, + "learning_rate": 0.0001, + "loss": 5.6493, + "loss/crossentropy": 2.5304505825042725, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16266535967588425, + "step": 15178 + }, + { + "epoch": 0.474375, + "grad_norm": 3.4375, + "grad_norm_var": 0.16245829264322917, + "learning_rate": 0.0001, + "loss": 5.6971, + "loss/crossentropy": 2.4235081672668457, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17658285051584244, + "step": 15180 + }, + { + "epoch": 0.4744375, + "grad_norm": 3.3125, + "grad_norm_var": 0.171240234375, + "learning_rate": 0.0001, + "loss": 5.6314, + "loss/crossentropy": 2.447352647781372, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16645319014787674, + "step": 15182 + }, + { + "epoch": 0.4745, + "grad_norm": 3.359375, + "grad_norm_var": 0.16711832682291666, + "learning_rate": 0.0001, + "loss": 5.7577, + "loss/crossentropy": 2.500947952270508, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1752820387482643, + "step": 15184 + }, + { + "epoch": 0.4745625, + "grad_norm": 8.5, + "grad_norm_var": 1.8448527018229166, + "learning_rate": 0.0001, + "loss": 6.1857, + "loss/crossentropy": 2.762804865837097, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18838287889957428, + "step": 15186 + }, + { + "epoch": 0.474625, + "grad_norm": 4.3125, + "grad_norm_var": 1.8286692301432292, + "learning_rate": 0.0001, + "loss": 5.9227, + "loss/crossentropy": 2.6097878217697144, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17973001301288605, + "step": 15188 + }, + { + "epoch": 0.4746875, + "grad_norm": 3.765625, + "grad_norm_var": 1.8067667643229166, + "learning_rate": 0.0001, + "loss": 5.8835, + "loss/crossentropy": 2.567626476287842, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1788562387228012, + "step": 15190 + }, + { + "epoch": 0.47475, + "grad_norm": 3.375, + "grad_norm_var": 1.7167144775390626, + "learning_rate": 0.0001, + "loss": 6.1341, + "loss/crossentropy": 2.7327338457107544, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18740259855985641, + "step": 15192 + }, + { + "epoch": 0.4748125, + "grad_norm": 3.375, + "grad_norm_var": 1.7120402018229166, + "learning_rate": 0.0001, + "loss": 5.6488, + "loss/crossentropy": 2.49296772480011, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16597580909729004, + "step": 15194 + }, + { + "epoch": 0.474875, + "grad_norm": 3.3125, + "grad_norm_var": 1.7203450520833334, + "learning_rate": 0.0001, + "loss": 5.5357, + "loss/crossentropy": 2.325214982032776, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17026569694280624, + "step": 15196 + }, + { + "epoch": 0.4749375, + "grad_norm": 3.21875, + "grad_norm_var": 1.6799763997395833, + "learning_rate": 0.0001, + "loss": 5.7957, + "loss/crossentropy": 2.4964864253997803, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17796631157398224, + "step": 15198 + }, + { + "epoch": 0.475, + "grad_norm": 3.390625, + "grad_norm_var": 1.69111328125, + "learning_rate": 0.0001, + "loss": 5.965, + "loss/crossentropy": 2.682921886444092, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1789863184094429, + "step": 15200 + }, + { + "epoch": 0.4750625, + "grad_norm": 3.015625, + "grad_norm_var": 0.123583984375, + "learning_rate": 0.0001, + "loss": 5.8367, + "loss/crossentropy": 2.5980740785598755, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1758170649409294, + "step": 15202 + }, + { + "epoch": 0.475125, + "grad_norm": 2.8125, + "grad_norm_var": 0.10071207682291666, + "learning_rate": 0.0001, + "loss": 5.5116, + "loss/crossentropy": 2.4391099214553833, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.15724600106477737, + "step": 15204 + }, + { + "epoch": 0.4751875, + "grad_norm": 2.96875, + "grad_norm_var": 0.8183664957682292, + "learning_rate": 0.0001, + "loss": 5.5985, + "loss/crossentropy": 2.442660093307495, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16323567926883698, + "step": 15206 + }, + { + "epoch": 0.47525, + "grad_norm": 3.140625, + "grad_norm_var": 0.8284830729166667, + "learning_rate": 0.0001, + "loss": 5.78, + "loss/crossentropy": 2.541997790336609, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1749734953045845, + "step": 15208 + }, + { + "epoch": 0.4753125, + "grad_norm": 3.15625, + "grad_norm_var": 0.84752197265625, + "learning_rate": 0.0001, + "loss": 5.2352, + "loss/crossentropy": 2.1886537075042725, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.14839966222643852, + "step": 15210 + }, + { + "epoch": 0.475375, + "grad_norm": 3.53125, + "grad_norm_var": 0.8497792561848958, + "learning_rate": 0.0001, + "loss": 6.0524, + "loss/crossentropy": 2.6989437341690063, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18378519266843796, + "step": 15212 + }, + { + "epoch": 0.4754375, + "grad_norm": 3.046875, + "grad_norm_var": 0.8433502197265625, + "learning_rate": 0.0001, + "loss": 5.9131, + "loss/crossentropy": 2.6584397554397583, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1746854931116104, + "step": 15214 + }, + { + "epoch": 0.4755, + "grad_norm": 3.0625, + "grad_norm_var": 0.848291015625, + "learning_rate": 0.0001, + "loss": 5.7146, + "loss/crossentropy": 2.4808326959609985, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17181871086359024, + "step": 15216 + }, + { + "epoch": 0.4755625, + "grad_norm": 3.296875, + "grad_norm_var": 0.8222076416015625, + "learning_rate": 0.0001, + "loss": 5.8577, + "loss/crossentropy": 2.61141300201416, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1734614074230194, + "step": 15218 + }, + { + "epoch": 0.475625, + "grad_norm": 3.3125, + "grad_norm_var": 0.7963175455729167, + "learning_rate": 0.0001, + "loss": 5.8515, + "loss/crossentropy": 2.571282982826233, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17333849519491196, + "step": 15220 + }, + { + "epoch": 0.4756875, + "grad_norm": 3.15625, + "grad_norm_var": 0.029227701822916667, + "learning_rate": 0.0001, + "loss": 5.3423, + "loss/crossentropy": 2.299667716026306, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1527012139558792, + "step": 15222 + }, + { + "epoch": 0.47575, + "grad_norm": 3.59375, + "grad_norm_var": 0.03821614583333333, + "learning_rate": 0.0001, + "loss": 5.9217, + "loss/crossentropy": 2.603001117706299, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17952166497707367, + "step": 15224 + }, + { + "epoch": 0.4758125, + "grad_norm": 3.25, + "grad_norm_var": 0.029325358072916665, + "learning_rate": 0.0001, + "loss": 5.6838, + "loss/crossentropy": 2.5082921981811523, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1687181293964386, + "step": 15226 + }, + { + "epoch": 0.475875, + "grad_norm": 2.8125, + "grad_norm_var": 0.03948160807291667, + "learning_rate": 0.0001, + "loss": 5.6393, + "loss/crossentropy": 2.6041629314422607, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15547103434801102, + "step": 15228 + }, + { + "epoch": 0.4759375, + "grad_norm": 3.140625, + "grad_norm_var": 0.03443094889322917, + "learning_rate": 0.0001, + "loss": 6.0314, + "loss/crossentropy": 2.716339945793152, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17759885638952255, + "step": 15230 + }, + { + "epoch": 0.476, + "grad_norm": 3.25, + "grad_norm_var": 0.03528238932291667, + "learning_rate": 0.0001, + "loss": 5.9443, + "loss/crossentropy": 2.6709004640579224, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17617063224315643, + "step": 15232 + }, + { + "epoch": 0.4760625, + "grad_norm": 3.21875, + "grad_norm_var": 0.03494466145833333, + "learning_rate": 0.0001, + "loss": 5.6875, + "loss/crossentropy": 2.4924468994140625, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16794048994779587, + "step": 15234 + }, + { + "epoch": 0.476125, + "grad_norm": 3.15625, + "grad_norm_var": 0.03433837890625, + "learning_rate": 0.0001, + "loss": 5.7679, + "loss/crossentropy": 2.523940920829773, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17243968695402145, + "step": 15236 + }, + { + "epoch": 0.4761875, + "grad_norm": 3.171875, + "grad_norm_var": 0.0352935791015625, + "learning_rate": 0.0001, + "loss": 5.9944, + "loss/crossentropy": 2.714240550994873, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1784021183848381, + "step": 15238 + }, + { + "epoch": 0.47625, + "grad_norm": 3.25, + "grad_norm_var": 0.030663045247395833, + "learning_rate": 0.0001, + "loss": 5.6462, + "loss/crossentropy": 2.5538183450698853, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16510005295276642, + "step": 15240 + }, + { + "epoch": 0.4763125, + "grad_norm": 3.609375, + "grad_norm_var": 0.05555013020833333, + "learning_rate": 0.0001, + "loss": 6.2182, + "loss/crossentropy": 2.7652846574783325, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1913825199007988, + "step": 15242 + }, + { + "epoch": 0.476375, + "grad_norm": 2.890625, + "grad_norm_var": 0.05009663899739583, + "learning_rate": 0.0001, + "loss": 5.8987, + "loss/crossentropy": 2.6722806692123413, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17147146910429, + "step": 15244 + }, + { + "epoch": 0.4764375, + "grad_norm": 3.09375, + "grad_norm_var": 0.05126546223958333, + "learning_rate": 0.0001, + "loss": 5.6072, + "loss/crossentropy": 2.4855659008026123, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16411474347114563, + "step": 15246 + }, + { + "epoch": 0.4765, + "grad_norm": 3.3125, + "grad_norm_var": 0.04439697265625, + "learning_rate": 0.0001, + "loss": 5.7495, + "loss/crossentropy": 2.5743420124053955, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.167519249022007, + "step": 15248 + }, + { + "epoch": 0.4765625, + "grad_norm": 3.25, + "grad_norm_var": 0.04524637858072917, + "learning_rate": 0.0001, + "loss": 5.7153, + "loss/crossentropy": 2.43125057220459, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.17137471586465836, + "step": 15250 + }, + { + "epoch": 0.476625, + "grad_norm": 3.421875, + "grad_norm_var": 0.04830729166666667, + "learning_rate": 0.0001, + "loss": 6.0371, + "loss/crossentropy": 2.599075436592102, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1852112114429474, + "step": 15252 + }, + { + "epoch": 0.4766875, + "grad_norm": 3.21875, + "grad_norm_var": 0.04539286295572917, + "learning_rate": 0.0001, + "loss": 5.8265, + "loss/crossentropy": 2.5461827516555786, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1725672110915184, + "step": 15254 + }, + { + "epoch": 0.47675, + "grad_norm": 3.265625, + "grad_norm_var": 0.04045817057291667, + "learning_rate": 0.0001, + "loss": 5.7316, + "loss/crossentropy": 2.562483310699463, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16886383295059204, + "step": 15256 + }, + { + "epoch": 0.4768125, + "grad_norm": 3.25, + "grad_norm_var": 0.02314453125, + "learning_rate": 0.0001, + "loss": 6.0071, + "loss/crossentropy": 2.7447198629379272, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17545223236083984, + "step": 15258 + }, + { + "epoch": 0.476875, + "grad_norm": 3.546875, + "grad_norm_var": 0.02427978515625, + "learning_rate": 0.0001, + "loss": 5.7061, + "loss/crossentropy": 2.5041333436965942, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16941587626934052, + "step": 15260 + }, + { + "epoch": 0.4769375, + "grad_norm": 3.28125, + "grad_norm_var": 0.022516886393229168, + "learning_rate": 0.0001, + "loss": 5.55, + "loss/crossentropy": 2.3615355491638184, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16806236654520035, + "step": 15262 + }, + { + "epoch": 0.477, + "grad_norm": 3.015625, + "grad_norm_var": 0.02578125, + "learning_rate": 0.0001, + "loss": 5.5874, + "loss/crossentropy": 2.4267687797546387, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17036070674657822, + "step": 15264 + }, + { + "epoch": 0.4770625, + "grad_norm": 3.28125, + "grad_norm_var": 0.029035441080729165, + "learning_rate": 0.0001, + "loss": 5.7918, + "loss/crossentropy": 2.6040912866592407, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1707252711057663, + "step": 15266 + }, + { + "epoch": 0.477125, + "grad_norm": 3.375, + "grad_norm_var": 0.025983683268229165, + "learning_rate": 0.0001, + "loss": 5.7662, + "loss/crossentropy": 2.5044026374816895, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1707111820578575, + "step": 15268 + }, + { + "epoch": 0.4771875, + "grad_norm": 3.0625, + "grad_norm_var": 0.0467193603515625, + "learning_rate": 0.0001, + "loss": 5.9211, + "loss/crossentropy": 2.6799111366271973, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17450731992721558, + "step": 15270 + }, + { + "epoch": 0.47725, + "grad_norm": 2.828125, + "grad_norm_var": 0.05400390625, + "learning_rate": 0.0001, + "loss": 5.502, + "loss/crossentropy": 2.43990296125412, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1562146171927452, + "step": 15272 + }, + { + "epoch": 0.4773125, + "grad_norm": 3.34375, + "grad_norm_var": 0.0623443603515625, + "learning_rate": 0.0001, + "loss": 6.0116, + "loss/crossentropy": 2.6366543769836426, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.1812412366271019, + "step": 15274 + }, + { + "epoch": 0.477375, + "grad_norm": 3.1875, + "grad_norm_var": 0.056029256184895834, + "learning_rate": 0.0001, + "loss": 5.9316, + "loss/crossentropy": 2.6097456216812134, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18179281800985336, + "step": 15276 + }, + { + "epoch": 0.4774375, + "grad_norm": 3.09375, + "grad_norm_var": 0.058394368489583334, + "learning_rate": 0.0001, + "loss": 5.4876, + "loss/crossentropy": 2.3936734199523926, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16485852003097534, + "step": 15278 + }, + { + "epoch": 0.4775, + "grad_norm": 2.9375, + "grad_norm_var": 0.0631011962890625, + "learning_rate": 0.0001, + "loss": 5.9462, + "loss/crossentropy": 2.745453357696533, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17007002234458923, + "step": 15280 + }, + { + "epoch": 0.4775625, + "grad_norm": 5.1875, + "grad_norm_var": 0.29146728515625, + "learning_rate": 0.0001, + "loss": 5.9661, + "loss/crossentropy": 2.5787419080734253, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1875605657696724, + "step": 15282 + }, + { + "epoch": 0.477625, + "grad_norm": 3.53125, + "grad_norm_var": 0.3244374593098958, + "learning_rate": 0.0001, + "loss": 5.8584, + "loss/crossentropy": 2.4698829650878906, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18143144994974136, + "step": 15284 + }, + { + "epoch": 0.4776875, + "grad_norm": 2.78125, + "grad_norm_var": 0.34965718587239586, + "learning_rate": 0.0001, + "loss": 5.3, + "loss/crossentropy": 2.2809072732925415, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15385840088129044, + "step": 15286 + }, + { + "epoch": 0.47775, + "grad_norm": 3.0, + "grad_norm_var": 0.34578348795572916, + "learning_rate": 0.0001, + "loss": 5.6377, + "loss/crossentropy": 2.440936803817749, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16928477585315704, + "step": 15288 + }, + { + "epoch": 0.4778125, + "grad_norm": 3.078125, + "grad_norm_var": 0.3564117431640625, + "learning_rate": 0.0001, + "loss": 5.6101, + "loss/crossentropy": 2.4943089485168457, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1619659587740898, + "step": 15290 + }, + { + "epoch": 0.477875, + "grad_norm": 3.203125, + "grad_norm_var": 0.35675455729166666, + "learning_rate": 0.0001, + "loss": 5.8208, + "loss/crossentropy": 2.6396723985671997, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17162998020648956, + "step": 15292 + }, + { + "epoch": 0.4779375, + "grad_norm": 3.1875, + "grad_norm_var": 0.3532786051432292, + "learning_rate": 0.0001, + "loss": 5.7517, + "loss/crossentropy": 2.5214877128601074, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1730218604207039, + "step": 15294 + }, + { + "epoch": 0.478, + "grad_norm": 3.234375, + "grad_norm_var": 0.46142578125, + "learning_rate": 0.0001, + "loss": 6.0902, + "loss/crossentropy": 2.7099214792251587, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18412108719348907, + "step": 15296 + }, + { + "epoch": 0.4780625, + "grad_norm": 3.265625, + "grad_norm_var": 0.2379547119140625, + "learning_rate": 0.0001, + "loss": 5.9388, + "loss/crossentropy": 2.6545175313949585, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17725564539432526, + "step": 15298 + }, + { + "epoch": 0.478125, + "grad_norm": 3.546875, + "grad_norm_var": 0.19612528483072916, + "learning_rate": 0.0001, + "loss": 5.7419, + "loss/crossentropy": 2.5023709535598755, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1708322986960411, + "step": 15300 + }, + { + "epoch": 0.4781875, + "grad_norm": 3.375, + "grad_norm_var": 0.17860921223958334, + "learning_rate": 0.0001, + "loss": 5.8065, + "loss/crossentropy": 2.561031699180603, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1753268912434578, + "step": 15302 + }, + { + "epoch": 0.47825, + "grad_norm": 3.421875, + "grad_norm_var": 0.1693023681640625, + "learning_rate": 0.0001, + "loss": 6.1622, + "loss/crossentropy": 2.7797582149505615, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18433526903390884, + "step": 15304 + }, + { + "epoch": 0.4783125, + "grad_norm": 3.03125, + "grad_norm_var": 0.1610260009765625, + "learning_rate": 0.0001, + "loss": 5.8765, + "loss/crossentropy": 2.6523128747940063, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1712425947189331, + "step": 15306 + }, + { + "epoch": 0.478375, + "grad_norm": 3.125, + "grad_norm_var": 0.15771077473958334, + "learning_rate": 0.0001, + "loss": 6.0212, + "loss/crossentropy": 2.723678708076477, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.18053758889436722, + "step": 15308 + }, + { + "epoch": 0.4784375, + "grad_norm": 3.0625, + "grad_norm_var": 0.16161702473958334, + "learning_rate": 0.0001, + "loss": 5.5002, + "loss/crossentropy": 2.4254177808761597, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.15826012194156647, + "step": 15310 + }, + { + "epoch": 0.4785, + "grad_norm": 2.984375, + "grad_norm_var": 0.05628153483072917, + "learning_rate": 0.0001, + "loss": 5.9301, + "loss/crossentropy": 2.665627956390381, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1776166409254074, + "step": 15312 + }, + { + "epoch": 0.4785625, + "grad_norm": 3.25, + "grad_norm_var": 0.06303609212239583, + "learning_rate": 0.0001, + "loss": 6.1482, + "loss/crossentropy": 2.777933359146118, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1823379099369049, + "step": 15314 + }, + { + "epoch": 0.478625, + "grad_norm": 3.15625, + "grad_norm_var": 0.05572916666666667, + "learning_rate": 0.0001, + "loss": 5.8695, + "loss/crossentropy": 2.57357656955719, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17842179536819458, + "step": 15316 + }, + { + "epoch": 0.4786875, + "grad_norm": 2.875, + "grad_norm_var": 0.0773101806640625, + "learning_rate": 0.0001, + "loss": 5.8675, + "loss/crossentropy": 2.563399314880371, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.17377082258462906, + "step": 15318 + }, + { + "epoch": 0.47875, + "grad_norm": 4.125, + "grad_norm_var": 0.1182037353515625, + "learning_rate": 0.0001, + "loss": 6.0154, + "loss/crossentropy": 2.6596556901931763, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1789350062608719, + "step": 15320 + }, + { + "epoch": 0.4788125, + "grad_norm": 3.0, + "grad_norm_var": 0.1208404541015625, + "learning_rate": 0.0001, + "loss": 5.5908, + "loss/crossentropy": 2.445460319519043, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1680455207824707, + "step": 15322 + }, + { + "epoch": 0.478875, + "grad_norm": 3.171875, + "grad_norm_var": 0.12068583170572916, + "learning_rate": 0.0001, + "loss": 5.8791, + "loss/crossentropy": 2.6736626625061035, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17171691358089447, + "step": 15324 + }, + { + "epoch": 0.4789375, + "grad_norm": 3.53125, + "grad_norm_var": 0.12881571451822918, + "learning_rate": 0.0001, + "loss": 5.6332, + "loss/crossentropy": 2.4836422204971313, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16534702479839325, + "step": 15326 + }, + { + "epoch": 0.479, + "grad_norm": 2.953125, + "grad_norm_var": 0.1141754150390625, + "learning_rate": 0.0001, + "loss": 5.9135, + "loss/crossentropy": 2.58274507522583, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.17409475147724152, + "step": 15328 + }, + { + "epoch": 0.4790625, + "grad_norm": 3.03125, + "grad_norm_var": 0.11633199055989583, + "learning_rate": 0.0001, + "loss": 5.881, + "loss/crossentropy": 2.6876858472824097, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17049873620271683, + "step": 15330 + }, + { + "epoch": 0.479125, + "grad_norm": 3.390625, + "grad_norm_var": 0.1166015625, + "learning_rate": 0.0001, + "loss": 5.6899, + "loss/crossentropy": 2.501584529876709, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17000600695610046, + "step": 15332 + }, + { + "epoch": 0.4791875, + "grad_norm": 3.234375, + "grad_norm_var": 0.09429931640625, + "learning_rate": 0.0001, + "loss": 5.7534, + "loss/crossentropy": 2.568631649017334, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17120973765850067, + "step": 15334 + }, + { + "epoch": 0.47925, + "grad_norm": 3.1875, + "grad_norm_var": 0.035107421875, + "learning_rate": 0.0001, + "loss": 5.8323, + "loss/crossentropy": 2.5619494915008545, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17625130712985992, + "step": 15336 + }, + { + "epoch": 0.4793125, + "grad_norm": 3.453125, + "grad_norm_var": 0.03955078125, + "learning_rate": 0.0001, + "loss": 5.796, + "loss/crossentropy": 2.5548356771469116, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17489810287952423, + "step": 15338 + }, + { + "epoch": 0.479375, + "grad_norm": 3.09375, + "grad_norm_var": 0.04029541015625, + "learning_rate": 0.0001, + "loss": 5.6436, + "loss/crossentropy": 2.48819100856781, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16671043634414673, + "step": 15340 + }, + { + "epoch": 0.4794375, + "grad_norm": 3.15625, + "grad_norm_var": 0.029059855143229167, + "learning_rate": 0.0001, + "loss": 5.5445, + "loss/crossentropy": 2.449354887008667, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16107948124408722, + "step": 15342 + }, + { + "epoch": 0.4795, + "grad_norm": 2.921875, + "grad_norm_var": 0.02486572265625, + "learning_rate": 0.0001, + "loss": 6.0028, + "loss/crossentropy": 2.7391287088394165, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17715124040842056, + "step": 15344 + }, + { + "epoch": 0.4795625, + "grad_norm": 3.09375, + "grad_norm_var": 0.029313151041666666, + "learning_rate": 0.0001, + "loss": 5.7572, + "loss/crossentropy": 2.537302613258362, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17238043248653412, + "step": 15346 + }, + { + "epoch": 0.479625, + "grad_norm": 2.90625, + "grad_norm_var": 0.027079264322916668, + "learning_rate": 0.0001, + "loss": 5.3147, + "loss/crossentropy": 2.316564679145813, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15332921594381332, + "step": 15348 + }, + { + "epoch": 0.4796875, + "grad_norm": 2.984375, + "grad_norm_var": 0.028544108072916668, + "learning_rate": 0.0001, + "loss": 5.5347, + "loss/crossentropy": 2.40766978263855, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1607503741979599, + "step": 15350 + }, + { + "epoch": 0.47975, + "grad_norm": 3.484375, + "grad_norm_var": 0.036295572916666664, + "learning_rate": 0.0001, + "loss": 5.9035, + "loss/crossentropy": 2.718092679977417, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17010437697172165, + "step": 15352 + }, + { + "epoch": 0.4798125, + "grad_norm": 3.046875, + "grad_norm_var": 0.02984619140625, + "learning_rate": 0.0001, + "loss": 5.7388, + "loss/crossentropy": 2.571472644805908, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17024526000022888, + "step": 15354 + }, + { + "epoch": 0.479875, + "grad_norm": 3.265625, + "grad_norm_var": 0.03858133951822917, + "learning_rate": 0.0001, + "loss": 5.9969, + "loss/crossentropy": 2.6315516233444214, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.1783294677734375, + "step": 15356 + }, + { + "epoch": 0.4799375, + "grad_norm": 3.0, + "grad_norm_var": 0.04397379557291667, + "learning_rate": 0.0001, + "loss": 5.5347, + "loss/crossentropy": 2.4775781631469727, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1537545621395111, + "step": 15358 + }, + { + "epoch": 0.48, + "grad_norm": 3.203125, + "grad_norm_var": 0.040934244791666664, + "learning_rate": 0.0001, + "loss": 5.9967, + "loss/crossentropy": 2.784665107727051, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1719825714826584, + "step": 15360 + }, + { + "epoch": 0.4800625, + "grad_norm": 3.53125, + "grad_norm_var": 0.04403889973958333, + "learning_rate": 0.0001, + "loss": 6.1269, + "loss/crossentropy": 2.7253435850143433, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18508252501487732, + "step": 15362 + }, + { + "epoch": 0.480125, + "grad_norm": 3.21875, + "grad_norm_var": 0.04531148274739583, + "learning_rate": 0.0001, + "loss": 5.9595, + "loss/crossentropy": 2.6731619834899902, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17472746223211288, + "step": 15364 + }, + { + "epoch": 0.4801875, + "grad_norm": 2.78125, + "grad_norm_var": 0.0546539306640625, + "learning_rate": 0.0001, + "loss": 5.3532, + "loss/crossentropy": 2.348812699317932, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15473972260951996, + "step": 15366 + }, + { + "epoch": 0.48025, + "grad_norm": 3.078125, + "grad_norm_var": 0.054911295572916664, + "learning_rate": 0.0001, + "loss": 5.9021, + "loss/crossentropy": 2.6510828733444214, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17431634664535522, + "step": 15368 + }, + { + "epoch": 0.4803125, + "grad_norm": 3.296875, + "grad_norm_var": 0.05706380208333333, + "learning_rate": 0.0001, + "loss": 5.7006, + "loss/crossentropy": 2.522355556488037, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1693916618824005, + "step": 15370 + }, + { + "epoch": 0.480375, + "grad_norm": 3.21875, + "grad_norm_var": 0.0513580322265625, + "learning_rate": 0.0001, + "loss": 5.6771, + "loss/crossentropy": 2.5551689863204956, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16414868086576462, + "step": 15372 + }, + { + "epoch": 0.4804375, + "grad_norm": 3.5, + "grad_norm_var": 0.046284993489583336, + "learning_rate": 0.0001, + "loss": 5.7002, + "loss/crossentropy": 2.41007137298584, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1782316491007805, + "step": 15374 + }, + { + "epoch": 0.4805, + "grad_norm": 3.578125, + "grad_norm_var": 0.0570220947265625, + "learning_rate": 0.0001, + "loss": 5.9039, + "loss/crossentropy": 2.639471411705017, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17839555442333221, + "step": 15376 + }, + { + "epoch": 0.4805625, + "grad_norm": 3.5, + "grad_norm_var": 0.060578409830729166, + "learning_rate": 0.0001, + "loss": 5.7466, + "loss/crossentropy": 2.51907479763031, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17313816398382187, + "step": 15378 + }, + { + "epoch": 0.480625, + "grad_norm": 3.171875, + "grad_norm_var": 0.05147196451822917, + "learning_rate": 0.0001, + "loss": 5.8164, + "loss/crossentropy": 2.5738741159439087, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17464550584554672, + "step": 15380 + }, + { + "epoch": 0.4806875, + "grad_norm": 3.015625, + "grad_norm_var": 0.042985026041666666, + "learning_rate": 0.0001, + "loss": 5.4853, + "loss/crossentropy": 2.392674684524536, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16121415793895721, + "step": 15382 + }, + { + "epoch": 0.48075, + "grad_norm": 3.03125, + "grad_norm_var": 0.04097900390625, + "learning_rate": 0.0001, + "loss": 5.9724, + "loss/crossentropy": 2.710559129714966, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17696435749530792, + "step": 15384 + }, + { + "epoch": 0.4808125, + "grad_norm": 3.25, + "grad_norm_var": 0.07803446451822917, + "learning_rate": 0.0001, + "loss": 5.907, + "loss/crossentropy": 2.601228952407837, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17706552147865295, + "step": 15386 + }, + { + "epoch": 0.480875, + "grad_norm": 3.15625, + "grad_norm_var": 0.077294921875, + "learning_rate": 0.0001, + "loss": 5.4973, + "loss/crossentropy": 2.385049819946289, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1588803306221962, + "step": 15388 + }, + { + "epoch": 0.4809375, + "grad_norm": 3.296875, + "grad_norm_var": 0.07291259765625, + "learning_rate": 0.0001, + "loss": 5.738, + "loss/crossentropy": 2.5566126108169556, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1677480787038803, + "step": 15390 + }, + { + "epoch": 0.481, + "grad_norm": 3.328125, + "grad_norm_var": 0.06599019368489584, + "learning_rate": 0.0001, + "loss": 5.8847, + "loss/crossentropy": 2.61326801776886, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17285117506980896, + "step": 15392 + }, + { + "epoch": 0.4810625, + "grad_norm": 3.15625, + "grad_norm_var": 0.05821024576822917, + "learning_rate": 0.0001, + "loss": 5.9517, + "loss/crossentropy": 2.667181134223938, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1776692196726799, + "step": 15394 + }, + { + "epoch": 0.481125, + "grad_norm": 3.0, + "grad_norm_var": 0.07560221354166667, + "learning_rate": 0.0001, + "loss": 5.7601, + "loss/crossentropy": 2.661770462989807, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16452480852603912, + "step": 15396 + }, + { + "epoch": 0.4811875, + "grad_norm": 3.25, + "grad_norm_var": 0.07385660807291666, + "learning_rate": 0.0001, + "loss": 5.8791, + "loss/crossentropy": 2.5926543474197388, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1778632253408432, + "step": 15398 + }, + { + "epoch": 0.48125, + "grad_norm": 3.109375, + "grad_norm_var": 0.075537109375, + "learning_rate": 0.0001, + "loss": 5.461, + "loss/crossentropy": 2.3669419288635254, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1633107140660286, + "step": 15400 + }, + { + "epoch": 0.4813125, + "grad_norm": 3.546875, + "grad_norm_var": 0.04103902180989583, + "learning_rate": 0.0001, + "loss": 6.2042, + "loss/crossentropy": 2.851514220237732, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18058335036039352, + "step": 15402 + }, + { + "epoch": 0.481375, + "grad_norm": 3.15625, + "grad_norm_var": 0.04088134765625, + "learning_rate": 0.0001, + "loss": 5.8117, + "loss/crossentropy": 2.5952255725860596, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1700821816921234, + "step": 15404 + }, + { + "epoch": 0.4814375, + "grad_norm": 3.25, + "grad_norm_var": 0.04527587890625, + "learning_rate": 0.0001, + "loss": 5.6987, + "loss/crossentropy": 2.4063332080841064, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1741556078195572, + "step": 15406 + }, + { + "epoch": 0.4815, + "grad_norm": 3.171875, + "grad_norm_var": 0.0398101806640625, + "learning_rate": 0.0001, + "loss": 5.7181, + "loss/crossentropy": 2.535059690475464, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.168694369494915, + "step": 15408 + }, + { + "epoch": 0.4815625, + "grad_norm": 3.4375, + "grad_norm_var": 0.04527587890625, + "learning_rate": 0.0001, + "loss": 5.4814, + "loss/crossentropy": 2.3332561254501343, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16598624736070633, + "step": 15410 + }, + { + "epoch": 0.481625, + "grad_norm": 3.125, + "grad_norm_var": 0.028352864583333335, + "learning_rate": 0.0001, + "loss": 5.8816, + "loss/crossentropy": 2.6249091625213623, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17410919815301895, + "step": 15412 + }, + { + "epoch": 0.4816875, + "grad_norm": 3.4375, + "grad_norm_var": 0.035380045572916664, + "learning_rate": 0.0001, + "loss": 5.6343, + "loss/crossentropy": 2.45553982257843, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16709962487220764, + "step": 15414 + }, + { + "epoch": 0.48175, + "grad_norm": 3.125, + "grad_norm_var": 0.03248291015625, + "learning_rate": 0.0001, + "loss": 6.0957, + "loss/crossentropy": 2.7755789756774902, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.18201275169849396, + "step": 15416 + }, + { + "epoch": 0.4818125, + "grad_norm": 3.375, + "grad_norm_var": 0.0263336181640625, + "learning_rate": 0.0001, + "loss": 6.1827, + "loss/crossentropy": 2.8892905712127686, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17700207233428955, + "step": 15418 + }, + { + "epoch": 0.481875, + "grad_norm": 3.421875, + "grad_norm_var": 0.028473917643229166, + "learning_rate": 0.0001, + "loss": 6.0133, + "loss/crossentropy": 2.846737265586853, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16470354795455933, + "step": 15420 + }, + { + "epoch": 0.4819375, + "grad_norm": 3.40625, + "grad_norm_var": 0.025837198893229166, + "learning_rate": 0.0001, + "loss": 5.8385, + "loss/crossentropy": 2.6165854930877686, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17297424376010895, + "step": 15422 + }, + { + "epoch": 0.482, + "grad_norm": 2.953125, + "grad_norm_var": 0.030855305989583335, + "learning_rate": 0.0001, + "loss": 5.78, + "loss/crossentropy": 2.587552547454834, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1704176440834999, + "step": 15424 + }, + { + "epoch": 0.4820625, + "grad_norm": 3.15625, + "grad_norm_var": 0.025275675455729167, + "learning_rate": 0.0001, + "loss": 5.9748, + "loss/crossentropy": 2.706058979034424, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17570267617702484, + "step": 15426 + }, + { + "epoch": 0.482125, + "grad_norm": 3.171875, + "grad_norm_var": 0.03673502604166667, + "learning_rate": 0.0001, + "loss": 5.7973, + "loss/crossentropy": 2.5178266763687134, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17834125459194183, + "step": 15428 + }, + { + "epoch": 0.4821875, + "grad_norm": 3.25, + "grad_norm_var": 0.026200358072916666, + "learning_rate": 0.0001, + "loss": 5.9385, + "loss/crossentropy": 2.6587525606155396, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17719413340091705, + "step": 15430 + }, + { + "epoch": 0.48225, + "grad_norm": 3.453125, + "grad_norm_var": 0.026203409830729166, + "learning_rate": 0.0001, + "loss": 5.7926, + "loss/crossentropy": 2.520912289619446, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17678218334913254, + "step": 15432 + }, + { + "epoch": 0.4823125, + "grad_norm": 3.109375, + "grad_norm_var": 0.0277008056640625, + "learning_rate": 0.0001, + "loss": 5.4106, + "loss/crossentropy": 2.2114129066467285, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16952737420797348, + "step": 15434 + }, + { + "epoch": 0.482375, + "grad_norm": 2.984375, + "grad_norm_var": 0.030887858072916666, + "learning_rate": 0.0001, + "loss": 5.9596, + "loss/crossentropy": 2.7331149578094482, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1718701273202896, + "step": 15436 + }, + { + "epoch": 0.4824375, + "grad_norm": 3.28125, + "grad_norm_var": 0.0292144775390625, + "learning_rate": 0.0001, + "loss": 6.0251, + "loss/crossentropy": 2.7384718656539917, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17865867912769318, + "step": 15438 + }, + { + "epoch": 0.4825, + "grad_norm": 3.21875, + "grad_norm_var": 0.02349853515625, + "learning_rate": 0.0001, + "loss": 5.7592, + "loss/crossentropy": 2.54393470287323, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1734769567847252, + "step": 15440 + }, + { + "epoch": 0.4825625, + "grad_norm": 3.078125, + "grad_norm_var": 0.02476806640625, + "learning_rate": 0.0001, + "loss": 5.7093, + "loss/crossentropy": 2.558498978614807, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16976485401391983, + "step": 15442 + }, + { + "epoch": 0.482625, + "grad_norm": 3.125, + "grad_norm_var": 0.0173004150390625, + "learning_rate": 0.0001, + "loss": 5.95, + "loss/crossentropy": 2.627052903175354, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17682897299528122, + "step": 15444 + }, + { + "epoch": 0.4826875, + "grad_norm": 3.328125, + "grad_norm_var": 0.01865234375, + "learning_rate": 0.0001, + "loss": 6.0299, + "loss/crossentropy": 2.693580389022827, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.18441692739725113, + "step": 15446 + }, + { + "epoch": 0.48275, + "grad_norm": 2.84375, + "grad_norm_var": 0.021402994791666668, + "learning_rate": 0.0001, + "loss": 5.655, + "loss/crossentropy": 2.5437878370285034, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16424334794282913, + "step": 15448 + }, + { + "epoch": 0.4828125, + "grad_norm": 3.25, + "grad_norm_var": 0.020536295572916665, + "learning_rate": 0.0001, + "loss": 5.804, + "loss/crossentropy": 2.4936158657073975, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17596448212862015, + "step": 15450 + }, + { + "epoch": 0.482875, + "grad_norm": 3.4375, + "grad_norm_var": 0.021256510416666666, + "learning_rate": 0.0001, + "loss": 5.6857, + "loss/crossentropy": 2.4962233304977417, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1677766889333725, + "step": 15452 + }, + { + "epoch": 0.4829375, + "grad_norm": 3.1875, + "grad_norm_var": 0.020926920572916667, + "learning_rate": 0.0001, + "loss": 5.6113, + "loss/crossentropy": 2.497285485267639, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16062213480472565, + "step": 15454 + }, + { + "epoch": 0.483, + "grad_norm": 3.5, + "grad_norm_var": 0.029694620768229166, + "learning_rate": 0.0001, + "loss": 5.8732, + "loss/crossentropy": 2.5101137161254883, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1835731416940689, + "step": 15456 + }, + { + "epoch": 0.4830625, + "grad_norm": 3.5625, + "grad_norm_var": 0.034077962239583336, + "learning_rate": 0.0001, + "loss": 5.853, + "loss/crossentropy": 2.649649739265442, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1668233871459961, + "step": 15458 + }, + { + "epoch": 0.483125, + "grad_norm": 3.796875, + "grad_norm_var": 0.061572265625, + "learning_rate": 0.0001, + "loss": 5.4413, + "loss/crossentropy": 2.3667826652526855, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1578378528356552, + "step": 15460 + }, + { + "epoch": 0.4831875, + "grad_norm": 3.125, + "grad_norm_var": 0.07208658854166666, + "learning_rate": 0.0001, + "loss": 5.8166, + "loss/crossentropy": 2.555495262145996, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1729813516139984, + "step": 15462 + }, + { + "epoch": 0.48325, + "grad_norm": 3.171875, + "grad_norm_var": 0.06331278483072916, + "learning_rate": 0.0001, + "loss": 5.6925, + "loss/crossentropy": 2.530726909637451, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16891532391309738, + "step": 15464 + }, + { + "epoch": 0.4833125, + "grad_norm": 3.671875, + "grad_norm_var": 0.0723052978515625, + "learning_rate": 0.0001, + "loss": 5.7791, + "loss/crossentropy": 2.5951521396636963, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1695646271109581, + "step": 15466 + }, + { + "epoch": 0.483375, + "grad_norm": 3.25, + "grad_norm_var": 0.0817047119140625, + "learning_rate": 0.0001, + "loss": 5.9382, + "loss/crossentropy": 2.709382176399231, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.16819705814123154, + "step": 15468 + }, + { + "epoch": 0.4834375, + "grad_norm": 2.9375, + "grad_norm_var": 0.09140625, + "learning_rate": 0.0001, + "loss": 5.6721, + "loss/crossentropy": 2.4807130098342896, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1718704253435135, + "step": 15470 + }, + { + "epoch": 0.4835, + "grad_norm": 3.421875, + "grad_norm_var": 0.0961822509765625, + "learning_rate": 0.0001, + "loss": 5.6021, + "loss/crossentropy": 2.491200089454651, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16499503701925278, + "step": 15472 + }, + { + "epoch": 0.4835625, + "grad_norm": 3.375, + "grad_norm_var": 0.09434305826822917, + "learning_rate": 0.0001, + "loss": 5.7448, + "loss/crossentropy": 2.526549220085144, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17026139795780182, + "step": 15474 + }, + { + "epoch": 0.483625, + "grad_norm": 3.046875, + "grad_norm_var": 0.0687652587890625, + "learning_rate": 0.0001, + "loss": 5.3924, + "loss/crossentropy": 2.311138868331909, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16281364113092422, + "step": 15476 + }, + { + "epoch": 0.4836875, + "grad_norm": 3.09375, + "grad_norm_var": 0.06520894368489584, + "learning_rate": 0.0001, + "loss": 5.7145, + "loss/crossentropy": 2.522248387336731, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16883626580238342, + "step": 15478 + }, + { + "epoch": 0.48375, + "grad_norm": 3.359375, + "grad_norm_var": 0.06409098307291666, + "learning_rate": 0.0001, + "loss": 5.7127, + "loss/crossentropy": 2.506693124771118, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17059792578220367, + "step": 15480 + }, + { + "epoch": 0.4838125, + "grad_norm": 2.875, + "grad_norm_var": 0.06145833333333333, + "learning_rate": 0.0001, + "loss": 5.7323, + "loss/crossentropy": 2.513804316520691, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17106656730175018, + "step": 15482 + }, + { + "epoch": 0.483875, + "grad_norm": 3.3125, + "grad_norm_var": 0.045182291666666666, + "learning_rate": 0.0001, + "loss": 5.866, + "loss/crossentropy": 2.5672656297683716, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.18338841944932938, + "step": 15484 + }, + { + "epoch": 0.4839375, + "grad_norm": 3.34375, + "grad_norm_var": 0.0447662353515625, + "learning_rate": 0.0001, + "loss": 5.8856, + "loss/crossentropy": 2.5848830938339233, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17616067826747894, + "step": 15486 + }, + { + "epoch": 0.484, + "grad_norm": 3.453125, + "grad_norm_var": 0.0412994384765625, + "learning_rate": 0.0001, + "loss": 6.0439, + "loss/crossentropy": 2.7313040494918823, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17852747440338135, + "step": 15488 + }, + { + "epoch": 0.4840625, + "grad_norm": 3.28125, + "grad_norm_var": 0.03629150390625, + "learning_rate": 0.0001, + "loss": 5.8479, + "loss/crossentropy": 2.5692058801651, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.176309272646904, + "step": 15490 + }, + { + "epoch": 0.484125, + "grad_norm": 3.90625, + "grad_norm_var": 0.06212565104166667, + "learning_rate": 0.0001, + "loss": 6.0231, + "loss/crossentropy": 2.691379427909851, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18238692730665207, + "step": 15492 + }, + { + "epoch": 0.4841875, + "grad_norm": 3.546875, + "grad_norm_var": 0.04924214680989583, + "learning_rate": 0.0001, + "loss": 5.8521, + "loss/crossentropy": 2.636283278465271, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1711912304162979, + "step": 15494 + }, + { + "epoch": 0.48425, + "grad_norm": 3.453125, + "grad_norm_var": 0.045393880208333334, + "learning_rate": 0.0001, + "loss": 5.7119, + "loss/crossentropy": 2.4659664630889893, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1765500009059906, + "step": 15496 + }, + { + "epoch": 0.4843125, + "grad_norm": 2.84375, + "grad_norm_var": 0.0512359619140625, + "learning_rate": 0.0001, + "loss": 5.7075, + "loss/crossentropy": 2.5716251134872437, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.165537491440773, + "step": 15498 + }, + { + "epoch": 0.484375, + "grad_norm": 3.390625, + "grad_norm_var": 0.0505859375, + "learning_rate": 0.0001, + "loss": 5.9251, + "loss/crossentropy": 2.612492561340332, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17579318583011627, + "step": 15500 + }, + { + "epoch": 0.4844375, + "grad_norm": 3.09375, + "grad_norm_var": 0.05455729166666667, + "learning_rate": 0.0001, + "loss": 5.7712, + "loss/crossentropy": 2.5376791954040527, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17256756126880646, + "step": 15502 + }, + { + "epoch": 0.4845, + "grad_norm": 3.234375, + "grad_norm_var": 0.06252339680989584, + "learning_rate": 0.0001, + "loss": 5.7173, + "loss/crossentropy": 2.570461869239807, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16546591371297836, + "step": 15504 + }, + { + "epoch": 0.4845625, + "grad_norm": 3.0625, + "grad_norm_var": 0.06642964680989584, + "learning_rate": 0.0001, + "loss": 5.9016, + "loss/crossentropy": 2.675860047340393, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17452280223369598, + "step": 15506 + }, + { + "epoch": 0.484625, + "grad_norm": 3.203125, + "grad_norm_var": 0.03267313639322917, + "learning_rate": 0.0001, + "loss": 5.925, + "loss/crossentropy": 2.6444398164749146, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17609969526529312, + "step": 15508 + }, + { + "epoch": 0.4846875, + "grad_norm": 3.171875, + "grad_norm_var": 0.027469889322916666, + "learning_rate": 0.0001, + "loss": 5.6911, + "loss/crossentropy": 2.57095468044281, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1635763719677925, + "step": 15510 + }, + { + "epoch": 0.48475, + "grad_norm": 3.078125, + "grad_norm_var": 0.021629842122395833, + "learning_rate": 0.0001, + "loss": 5.8716, + "loss/crossentropy": 2.681228756904602, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17255127429962158, + "step": 15512 + }, + { + "epoch": 0.4848125, + "grad_norm": 3.359375, + "grad_norm_var": 0.0184967041015625, + "learning_rate": 0.0001, + "loss": 5.5519, + "loss/crossentropy": 2.4044448137283325, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.16162271797657013, + "step": 15514 + }, + { + "epoch": 0.484875, + "grad_norm": 3.390625, + "grad_norm_var": 0.0159332275390625, + "learning_rate": 0.0001, + "loss": 5.7978, + "loss/crossentropy": 2.5779892206192017, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17471708357334137, + "step": 15516 + }, + { + "epoch": 0.4849375, + "grad_norm": 3.4375, + "grad_norm_var": 0.024214680989583334, + "learning_rate": 0.0001, + "loss": 5.7523, + "loss/crossentropy": 2.6253888607025146, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16699223220348358, + "step": 15518 + }, + { + "epoch": 0.485, + "grad_norm": 3.140625, + "grad_norm_var": 0.031053670247395835, + "learning_rate": 0.0001, + "loss": 6.0675, + "loss/crossentropy": 2.764855742454529, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17675137519836426, + "step": 15520 + }, + { + "epoch": 0.4850625, + "grad_norm": 4.65625, + "grad_norm_var": 0.16469624837239583, + "learning_rate": 0.0001, + "loss": 6.1342, + "loss/crossentropy": 2.719378113746643, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.19070102274417877, + "step": 15522 + }, + { + "epoch": 0.485125, + "grad_norm": 3.328125, + "grad_norm_var": 0.16519266764322918, + "learning_rate": 0.0001, + "loss": 5.7085, + "loss/crossentropy": 2.5239862203598022, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1696275919675827, + "step": 15524 + }, + { + "epoch": 0.4851875, + "grad_norm": 3.53125, + "grad_norm_var": 0.16188151041666668, + "learning_rate": 0.0001, + "loss": 5.9615, + "loss/crossentropy": 2.628399610519409, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1821391060948372, + "step": 15526 + }, + { + "epoch": 0.48525, + "grad_norm": 3.3125, + "grad_norm_var": 0.15627848307291667, + "learning_rate": 0.0001, + "loss": 5.2108, + "loss/crossentropy": 2.182291626930237, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.15363489091396332, + "step": 15528 + }, + { + "epoch": 0.4853125, + "grad_norm": 3.1875, + "grad_norm_var": 0.15288798014322916, + "learning_rate": 0.0001, + "loss": 5.855, + "loss/crossentropy": 2.5341769456863403, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18130408972501755, + "step": 15530 + }, + { + "epoch": 0.485375, + "grad_norm": 3.25, + "grad_norm_var": 0.15269775390625, + "learning_rate": 0.0001, + "loss": 5.9362, + "loss/crossentropy": 2.680835485458374, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17866097390651703, + "step": 15532 + }, + { + "epoch": 0.4854375, + "grad_norm": 3.203125, + "grad_norm_var": 0.14075113932291666, + "learning_rate": 0.0001, + "loss": 5.983, + "loss/crossentropy": 2.7550711631774902, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1743575856089592, + "step": 15534 + }, + { + "epoch": 0.4855, + "grad_norm": 3.1875, + "grad_norm_var": 0.1528961181640625, + "learning_rate": 0.0001, + "loss": 5.4632, + "loss/crossentropy": 2.4080125093460083, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16176702082157135, + "step": 15536 + }, + { + "epoch": 0.4855625, + "grad_norm": 3.046875, + "grad_norm_var": 0.025487263997395832, + "learning_rate": 0.0001, + "loss": 5.5596, + "loss/crossentropy": 2.4649598598480225, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1590685397386551, + "step": 15538 + }, + { + "epoch": 0.485625, + "grad_norm": 4.25, + "grad_norm_var": 0.0929107666015625, + "learning_rate": 0.0001, + "loss": 5.7771, + "loss/crossentropy": 2.542220115661621, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17427068948745728, + "step": 15540 + }, + { + "epoch": 0.4856875, + "grad_norm": 3.5625, + "grad_norm_var": 0.09619140625, + "learning_rate": 0.0001, + "loss": 5.8391, + "loss/crossentropy": 2.641697645187378, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1705191656947136, + "step": 15542 + }, + { + "epoch": 0.48575, + "grad_norm": 3.25, + "grad_norm_var": 0.09487202962239584, + "learning_rate": 0.0001, + "loss": 5.3877, + "loss/crossentropy": 2.2709479331970215, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1640223264694214, + "step": 15544 + }, + { + "epoch": 0.4858125, + "grad_norm": 2.828125, + "grad_norm_var": 0.1052398681640625, + "learning_rate": 0.0001, + "loss": 5.6644, + "loss/crossentropy": 2.567691445350647, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16474910080432892, + "step": 15546 + }, + { + "epoch": 0.485875, + "grad_norm": 3.078125, + "grad_norm_var": 0.11311848958333333, + "learning_rate": 0.0001, + "loss": 5.6234, + "loss/crossentropy": 2.503561854362488, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16706478595733643, + "step": 15548 + }, + { + "epoch": 0.4859375, + "grad_norm": 2.953125, + "grad_norm_var": 0.11679585774739583, + "learning_rate": 0.0001, + "loss": 5.585, + "loss/crossentropy": 2.456661343574524, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16557209193706512, + "step": 15550 + }, + { + "epoch": 0.486, + "grad_norm": 2.984375, + "grad_norm_var": 0.11770426432291667, + "learning_rate": 0.0001, + "loss": 5.8205, + "loss/crossentropy": 2.609832286834717, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17068011313676834, + "step": 15552 + }, + { + "epoch": 0.4860625, + "grad_norm": 3.109375, + "grad_norm_var": 0.11530659993489584, + "learning_rate": 0.0001, + "loss": 5.7301, + "loss/crossentropy": 2.4726040363311768, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17301952093839645, + "step": 15554 + }, + { + "epoch": 0.486125, + "grad_norm": 2.765625, + "grad_norm_var": 0.04895426432291667, + "learning_rate": 0.0001, + "loss": 5.5239, + "loss/crossentropy": 2.456711530685425, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16297296434640884, + "step": 15556 + }, + { + "epoch": 0.4861875, + "grad_norm": 3.203125, + "grad_norm_var": 0.03626302083333333, + "learning_rate": 0.0001, + "loss": 5.5375, + "loss/crossentropy": 2.369423985481262, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.17188247293233871, + "step": 15558 + }, + { + "epoch": 0.48625, + "grad_norm": 3.046875, + "grad_norm_var": 0.03292643229166667, + "learning_rate": 0.0001, + "loss": 6.0091, + "loss/crossentropy": 2.796201467514038, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17090045660734177, + "step": 15560 + }, + { + "epoch": 0.4863125, + "grad_norm": 3.109375, + "grad_norm_var": 0.02896728515625, + "learning_rate": 0.0001, + "loss": 5.3326, + "loss/crossentropy": 2.25577449798584, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15924570709466934, + "step": 15562 + }, + { + "epoch": 0.486375, + "grad_norm": 3.015625, + "grad_norm_var": 0.027977498372395833, + "learning_rate": 0.0001, + "loss": 5.806, + "loss/crossentropy": 2.6025267839431763, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16995244473218918, + "step": 15564 + }, + { + "epoch": 0.4864375, + "grad_norm": 3.015625, + "grad_norm_var": 0.027197265625, + "learning_rate": 0.0001, + "loss": 5.7417, + "loss/crossentropy": 2.589355945587158, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16601653397083282, + "step": 15566 + }, + { + "epoch": 0.4865, + "grad_norm": 2.984375, + "grad_norm_var": 0.0175689697265625, + "learning_rate": 0.0001, + "loss": 5.7621, + "loss/crossentropy": 2.5684362649917603, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16975650191307068, + "step": 15568 + }, + { + "epoch": 0.4865625, + "grad_norm": 3.140625, + "grad_norm_var": 0.037507120768229166, + "learning_rate": 0.0001, + "loss": 5.9495, + "loss/crossentropy": 2.5972514152526855, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1813228577375412, + "step": 15570 + }, + { + "epoch": 0.486625, + "grad_norm": 3.21875, + "grad_norm_var": 0.03493550618489583, + "learning_rate": 0.0001, + "loss": 5.4156, + "loss/crossentropy": 2.384309411048889, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15625690668821335, + "step": 15572 + }, + { + "epoch": 0.4866875, + "grad_norm": 3.375, + "grad_norm_var": 0.03961588541666667, + "learning_rate": 0.0001, + "loss": 5.5536, + "loss/crossentropy": 2.380679130554199, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1669015809893608, + "step": 15574 + }, + { + "epoch": 0.48675, + "grad_norm": 3.4375, + "grad_norm_var": 0.04234619140625, + "learning_rate": 0.0001, + "loss": 5.7799, + "loss/crossentropy": 2.593273162841797, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1710069254040718, + "step": 15576 + }, + { + "epoch": 0.4868125, + "grad_norm": 2.953125, + "grad_norm_var": 0.04585673014322917, + "learning_rate": 0.0001, + "loss": 5.5643, + "loss/crossentropy": 2.4406670331954956, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16510066390037537, + "step": 15578 + }, + { + "epoch": 0.486875, + "grad_norm": 3.515625, + "grad_norm_var": 0.05426432291666667, + "learning_rate": 0.0001, + "loss": 5.8131, + "loss/crossentropy": 2.6028060913085938, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17063555121421814, + "step": 15580 + }, + { + "epoch": 0.4869375, + "grad_norm": 2.90625, + "grad_norm_var": 0.06106770833333333, + "learning_rate": 0.0001, + "loss": 5.4776, + "loss/crossentropy": 2.4742895364761353, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1573660522699356, + "step": 15582 + }, + { + "epoch": 0.487, + "grad_norm": 3.15625, + "grad_norm_var": 0.0585357666015625, + "learning_rate": 0.0001, + "loss": 5.3708, + "loss/crossentropy": 2.273473858833313, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16090303659439087, + "step": 15584 + }, + { + "epoch": 0.4870625, + "grad_norm": 3.515625, + "grad_norm_var": 0.050048828125, + "learning_rate": 0.0001, + "loss": 6.0934, + "loss/crossentropy": 2.688918948173523, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18576036393642426, + "step": 15586 + }, + { + "epoch": 0.487125, + "grad_norm": 3.015625, + "grad_norm_var": 0.04457906087239583, + "learning_rate": 0.0001, + "loss": 5.6411, + "loss/crossentropy": 2.4709302186965942, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16819019615650177, + "step": 15588 + }, + { + "epoch": 0.4871875, + "grad_norm": 3.125, + "grad_norm_var": 0.039351399739583334, + "learning_rate": 0.0001, + "loss": 5.7386, + "loss/crossentropy": 2.539092183113098, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17190296202898026, + "step": 15590 + }, + { + "epoch": 0.48725, + "grad_norm": 3.203125, + "grad_norm_var": 0.047200520833333336, + "learning_rate": 0.0001, + "loss": 5.5852, + "loss/crossentropy": 2.337055206298828, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.171297125518322, + "step": 15592 + }, + { + "epoch": 0.4873125, + "grad_norm": 3.390625, + "grad_norm_var": 0.05415751139322917, + "learning_rate": 0.0001, + "loss": 5.8436, + "loss/crossentropy": 2.5291141271591187, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17597944289445877, + "step": 15594 + }, + { + "epoch": 0.487375, + "grad_norm": 3.171875, + "grad_norm_var": 0.046708170572916666, + "learning_rate": 0.0001, + "loss": 5.7844, + "loss/crossentropy": 2.5487102270126343, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17357081174850464, + "step": 15596 + }, + { + "epoch": 0.4874375, + "grad_norm": 3.296875, + "grad_norm_var": 0.04097900390625, + "learning_rate": 0.0001, + "loss": 5.4457, + "loss/crossentropy": 2.3209153413772583, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16287311166524887, + "step": 15598 + }, + { + "epoch": 0.4875, + "grad_norm": 3.078125, + "grad_norm_var": 0.040262858072916664, + "learning_rate": 0.0001, + "loss": 5.5744, + "loss/crossentropy": 2.4439727067947388, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1642143875360489, + "step": 15600 + }, + { + "epoch": 0.4875625, + "grad_norm": 3.28125, + "grad_norm_var": 0.03516337076822917, + "learning_rate": 0.0001, + "loss": 5.4231, + "loss/crossentropy": 2.3545764684677124, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.15567777305841446, + "step": 15602 + }, + { + "epoch": 0.487625, + "grad_norm": 3.28125, + "grad_norm_var": 0.9217112223307292, + "learning_rate": 0.0001, + "loss": 6.1146, + "loss/crossentropy": 2.6810855865478516, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1855430081486702, + "step": 15604 + }, + { + "epoch": 0.4876875, + "grad_norm": 3.921875, + "grad_norm_var": 0.9090779622395834, + "learning_rate": 0.0001, + "loss": 5.4459, + "loss/crossentropy": 2.262246608734131, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.16523557156324387, + "step": 15606 + }, + { + "epoch": 0.48775, + "grad_norm": 3.046875, + "grad_norm_var": 1.2462565104166667, + "learning_rate": 0.0001, + "loss": 5.3636, + "loss/crossentropy": 2.2482502460479736, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16387996822595596, + "step": 15608 + }, + { + "epoch": 0.4878125, + "grad_norm": 3.984375, + "grad_norm_var": 1.25670166015625, + "learning_rate": 0.0001, + "loss": 5.7457, + "loss/crossentropy": 2.5216753482818604, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17200765758752823, + "step": 15610 + }, + { + "epoch": 0.487875, + "grad_norm": 3.09375, + "grad_norm_var": 1.2897206624348958, + "learning_rate": 0.0001, + "loss": 5.3269, + "loss/crossentropy": 2.31750226020813, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.14976944029331207, + "step": 15612 + }, + { + "epoch": 0.4879375, + "grad_norm": 3.09375, + "grad_norm_var": 1.2784464518229166, + "learning_rate": 0.0001, + "loss": 5.7985, + "loss/crossentropy": 2.6566789150238037, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.161838561296463, + "step": 15614 + }, + { + "epoch": 0.488, + "grad_norm": 3.09375, + "grad_norm_var": 1.2850423177083334, + "learning_rate": 0.0001, + "loss": 5.3373, + "loss/crossentropy": 2.2665454149246216, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.15667995065450668, + "step": 15616 + }, + { + "epoch": 0.4880625, + "grad_norm": 3.125, + "grad_norm_var": 1.3047190348307292, + "learning_rate": 0.0001, + "loss": 5.7071, + "loss/crossentropy": 2.507651925086975, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1695544570684433, + "step": 15618 + }, + { + "epoch": 0.488125, + "grad_norm": 3.046875, + "grad_norm_var": 0.5093495686848958, + "learning_rate": 0.0001, + "loss": 5.7736, + "loss/crossentropy": 2.583544969558716, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.17369394749403, + "step": 15620 + }, + { + "epoch": 0.4881875, + "grad_norm": 2.8125, + "grad_norm_var": 0.51334228515625, + "learning_rate": 0.0001, + "loss": 5.623, + "loss/crossentropy": 2.515665292739868, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16581125557422638, + "step": 15622 + }, + { + "epoch": 0.48825, + "grad_norm": 3.65625, + "grad_norm_var": 0.1669097900390625, + "learning_rate": 0.0001, + "loss": 6.2384, + "loss/crossentropy": 2.799090266227722, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.19041062146425247, + "step": 15624 + }, + { + "epoch": 0.4883125, + "grad_norm": 3.34375, + "grad_norm_var": 0.1380859375, + "learning_rate": 0.0001, + "loss": 5.7315, + "loss/crossentropy": 2.5976243019104004, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16612115502357483, + "step": 15626 + }, + { + "epoch": 0.488375, + "grad_norm": 3.125, + "grad_norm_var": 0.13386942545572916, + "learning_rate": 0.0001, + "loss": 5.6682, + "loss/crossentropy": 2.4756577014923096, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16847358644008636, + "step": 15628 + }, + { + "epoch": 0.4884375, + "grad_norm": 3.625, + "grad_norm_var": 0.14802144368489584, + "learning_rate": 0.0001, + "loss": 5.7986, + "loss/crossentropy": 2.55096173286438, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17359618842601776, + "step": 15630 + }, + { + "epoch": 0.4885, + "grad_norm": 3.109375, + "grad_norm_var": 0.15494384765625, + "learning_rate": 0.0001, + "loss": 5.8165, + "loss/crossentropy": 2.722593665122986, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16212552040815353, + "step": 15632 + }, + { + "epoch": 0.4885625, + "grad_norm": 3.078125, + "grad_norm_var": 0.15543212890625, + "learning_rate": 0.0001, + "loss": 5.7039, + "loss/crossentropy": 2.5893197059631348, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16497574001550674, + "step": 15634 + }, + { + "epoch": 0.488625, + "grad_norm": 3.0, + "grad_norm_var": 0.1569976806640625, + "learning_rate": 0.0001, + "loss": 5.6041, + "loss/crossentropy": 2.4363603591918945, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16716646403074265, + "step": 15636 + }, + { + "epoch": 0.4886875, + "grad_norm": 3.09375, + "grad_norm_var": 0.14580790201822916, + "learning_rate": 0.0001, + "loss": 5.6451, + "loss/crossentropy": 2.409726619720459, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1719701960682869, + "step": 15638 + }, + { + "epoch": 0.48875, + "grad_norm": 3.421875, + "grad_norm_var": 0.03974507649739583, + "learning_rate": 0.0001, + "loss": 5.859, + "loss/crossentropy": 2.5570164918899536, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1762908697128296, + "step": 15640 + }, + { + "epoch": 0.4888125, + "grad_norm": 3.8125, + "grad_norm_var": 0.05964253743489583, + "learning_rate": 0.0001, + "loss": 6.009, + "loss/crossentropy": 2.591892957687378, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18936841934919357, + "step": 15642 + }, + { + "epoch": 0.488875, + "grad_norm": 3.3125, + "grad_norm_var": 0.06513570149739584, + "learning_rate": 0.0001, + "loss": 5.835, + "loss/crossentropy": 2.527828335762024, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1736869290471077, + "step": 15644 + }, + { + "epoch": 0.4889375, + "grad_norm": 3.34375, + "grad_norm_var": 0.051102701822916666, + "learning_rate": 0.0001, + "loss": 5.7208, + "loss/crossentropy": 2.525538444519043, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1706969365477562, + "step": 15646 + }, + { + "epoch": 0.489, + "grad_norm": 2.84375, + "grad_norm_var": 0.051953125, + "learning_rate": 0.0001, + "loss": 5.7361, + "loss/crossentropy": 2.5371170043945312, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17380855977535248, + "step": 15648 + }, + { + "epoch": 0.4890625, + "grad_norm": 3.234375, + "grad_norm_var": 0.05137430826822917, + "learning_rate": 0.0001, + "loss": 5.8679, + "loss/crossentropy": 2.589292883872986, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17747006565332413, + "step": 15650 + }, + { + "epoch": 0.489125, + "grad_norm": 3.234375, + "grad_norm_var": 0.05194905598958333, + "learning_rate": 0.0001, + "loss": 5.6713, + "loss/crossentropy": 2.5229251384735107, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16327373683452606, + "step": 15652 + }, + { + "epoch": 0.4891875, + "grad_norm": 2.96875, + "grad_norm_var": 0.06328023274739583, + "learning_rate": 0.0001, + "loss": 5.6995, + "loss/crossentropy": 2.5932918787002563, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16296438872814178, + "step": 15654 + }, + { + "epoch": 0.48925, + "grad_norm": 3.3125, + "grad_norm_var": 0.06113993326822917, + "learning_rate": 0.0001, + "loss": 5.7168, + "loss/crossentropy": 2.4919549226760864, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1709190458059311, + "step": 15656 + }, + { + "epoch": 0.4893125, + "grad_norm": 3.03125, + "grad_norm_var": 0.07034098307291667, + "learning_rate": 0.0001, + "loss": 5.9377, + "loss/crossentropy": 2.6713995933532715, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.17077427357435226, + "step": 15658 + }, + { + "epoch": 0.489375, + "grad_norm": 3.140625, + "grad_norm_var": 0.0634918212890625, + "learning_rate": 0.0001, + "loss": 5.8308, + "loss/crossentropy": 2.574105978012085, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1756741628050804, + "step": 15660 + }, + { + "epoch": 0.4894375, + "grad_norm": 3.140625, + "grad_norm_var": 0.0641265869140625, + "learning_rate": 0.0001, + "loss": 5.6464, + "loss/crossentropy": 2.481296420097351, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1680774688720703, + "step": 15662 + }, + { + "epoch": 0.4895, + "grad_norm": 3.4375, + "grad_norm_var": 0.06000874837239583, + "learning_rate": 0.0001, + "loss": 5.8343, + "loss/crossentropy": 2.5626578330993652, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1756017655134201, + "step": 15664 + }, + { + "epoch": 0.4895625, + "grad_norm": 2.984375, + "grad_norm_var": 0.06164957682291667, + "learning_rate": 0.0001, + "loss": 6.038, + "loss/crossentropy": 2.7693047523498535, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1753101423382759, + "step": 15666 + }, + { + "epoch": 0.489625, + "grad_norm": 3.34375, + "grad_norm_var": 0.06169331868489583, + "learning_rate": 0.0001, + "loss": 5.7044, + "loss/crossentropy": 2.4952261447906494, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16896343231201172, + "step": 15668 + }, + { + "epoch": 0.4896875, + "grad_norm": 3.171875, + "grad_norm_var": 0.05386962890625, + "learning_rate": 0.0001, + "loss": 5.799, + "loss/crossentropy": 2.552024006843567, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17508617788553238, + "step": 15670 + }, + { + "epoch": 0.48975, + "grad_norm": 3.375, + "grad_norm_var": 0.06438700358072917, + "learning_rate": 0.0001, + "loss": 5.6874, + "loss/crossentropy": 2.5851725339889526, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16491178423166275, + "step": 15672 + }, + { + "epoch": 0.4898125, + "grad_norm": 3.453125, + "grad_norm_var": 0.03469645182291667, + "learning_rate": 0.0001, + "loss": 5.9859, + "loss/crossentropy": 2.67172908782959, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18102586269378662, + "step": 15674 + }, + { + "epoch": 0.489875, + "grad_norm": 3.046875, + "grad_norm_var": 0.036164347330729166, + "learning_rate": 0.0001, + "loss": 5.4261, + "loss/crossentropy": 2.4043819904327393, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15763608366250992, + "step": 15676 + }, + { + "epoch": 0.4899375, + "grad_norm": 3.15625, + "grad_norm_var": 0.03381754557291667, + "learning_rate": 0.0001, + "loss": 5.4145, + "loss/crossentropy": 2.3068490028381348, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16310925781726837, + "step": 15678 + }, + { + "epoch": 0.49, + "grad_norm": 2.953125, + "grad_norm_var": 0.04502665201822917, + "learning_rate": 0.0001, + "loss": 5.9619, + "loss/crossentropy": 2.7321821451187134, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17258545756340027, + "step": 15680 + }, + { + "epoch": 0.4900625, + "grad_norm": 3.28125, + "grad_norm_var": 0.04262593587239583, + "learning_rate": 0.0001, + "loss": 5.8138, + "loss/crossentropy": 2.5558485984802246, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17384207993745804, + "step": 15682 + }, + { + "epoch": 0.490125, + "grad_norm": 3.046875, + "grad_norm_var": 0.043115234375, + "learning_rate": 0.0001, + "loss": 5.7928, + "loss/crossentropy": 2.6558672189712524, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1687743440270424, + "step": 15684 + }, + { + "epoch": 0.4901875, + "grad_norm": 3.03125, + "grad_norm_var": 0.0478912353515625, + "learning_rate": 0.0001, + "loss": 5.9135, + "loss/crossentropy": 2.7171976566314697, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17041613161563873, + "step": 15686 + }, + { + "epoch": 0.49025, + "grad_norm": 3.3125, + "grad_norm_var": 0.03931376139322917, + "learning_rate": 0.0001, + "loss": 6.1663, + "loss/crossentropy": 2.863266110420227, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17756742984056473, + "step": 15688 + }, + { + "epoch": 0.4903125, + "grad_norm": 3.109375, + "grad_norm_var": 0.032177734375, + "learning_rate": 0.0001, + "loss": 6.1456, + "loss/crossentropy": 2.817554473876953, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.18437139689922333, + "step": 15690 + }, + { + "epoch": 0.490375, + "grad_norm": 3.234375, + "grad_norm_var": 0.03543294270833333, + "learning_rate": 0.0001, + "loss": 5.9821, + "loss/crossentropy": 2.5803717374801636, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1878328025341034, + "step": 15692 + }, + { + "epoch": 0.4904375, + "grad_norm": 3.09375, + "grad_norm_var": 0.040511067708333334, + "learning_rate": 0.0001, + "loss": 5.8106, + "loss/crossentropy": 2.566185235977173, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1724904626607895, + "step": 15694 + }, + { + "epoch": 0.4905, + "grad_norm": 3.28125, + "grad_norm_var": 0.025679524739583334, + "learning_rate": 0.0001, + "loss": 6.3179, + "loss/crossentropy": 2.9657928943634033, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18443161249160767, + "step": 15696 + }, + { + "epoch": 0.4905625, + "grad_norm": 3.0625, + "grad_norm_var": 0.02681884765625, + "learning_rate": 0.0001, + "loss": 5.7591, + "loss/crossentropy": 2.622571587562561, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16716735810041428, + "step": 15698 + }, + { + "epoch": 0.490625, + "grad_norm": 2.953125, + "grad_norm_var": 0.0323394775390625, + "learning_rate": 0.0001, + "loss": 5.1729, + "loss/crossentropy": 2.186972200870514, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15054922550916672, + "step": 15700 + }, + { + "epoch": 0.4906875, + "grad_norm": 3.703125, + "grad_norm_var": 0.04387613932291667, + "learning_rate": 0.0001, + "loss": 6.3602, + "loss/crossentropy": 2.898533582687378, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1922570914030075, + "step": 15702 + }, + { + "epoch": 0.49075, + "grad_norm": 3.25, + "grad_norm_var": 0.048095703125, + "learning_rate": 0.0001, + "loss": 5.5322, + "loss/crossentropy": 2.4021382331848145, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16612635552883148, + "step": 15704 + }, + { + "epoch": 0.4908125, + "grad_norm": 3.234375, + "grad_norm_var": 0.0469879150390625, + "learning_rate": 0.0001, + "loss": 5.8965, + "loss/crossentropy": 2.679166316986084, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17056386172771454, + "step": 15706 + }, + { + "epoch": 0.490875, + "grad_norm": 2.765625, + "grad_norm_var": 0.05694986979166667, + "learning_rate": 0.0001, + "loss": 5.495, + "loss/crossentropy": 2.380611300468445, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16339337080717087, + "step": 15708 + }, + { + "epoch": 0.4909375, + "grad_norm": 3.109375, + "grad_norm_var": 0.05315348307291667, + "learning_rate": 0.0001, + "loss": 5.9962, + "loss/crossentropy": 2.701938509941101, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.18255415558815002, + "step": 15710 + }, + { + "epoch": 0.491, + "grad_norm": 3.03125, + "grad_norm_var": 0.05447591145833333, + "learning_rate": 0.0001, + "loss": 5.9492, + "loss/crossentropy": 2.657499313354492, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17917373776435852, + "step": 15712 + }, + { + "epoch": 0.4910625, + "grad_norm": 2.9375, + "grad_norm_var": 0.057428995768229164, + "learning_rate": 0.0001, + "loss": 5.8335, + "loss/crossentropy": 2.63913357257843, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17060479521751404, + "step": 15714 + }, + { + "epoch": 0.491125, + "grad_norm": 3.171875, + "grad_norm_var": 0.0566070556640625, + "learning_rate": 0.0001, + "loss": 6.0182, + "loss/crossentropy": 2.71595299243927, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.18139967322349548, + "step": 15716 + }, + { + "epoch": 0.4911875, + "grad_norm": 2.984375, + "grad_norm_var": 0.03808186848958333, + "learning_rate": 0.0001, + "loss": 5.0515, + "loss/crossentropy": 2.112143397331238, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1470591500401497, + "step": 15718 + }, + { + "epoch": 0.49125, + "grad_norm": 3.03125, + "grad_norm_var": 0.0357421875, + "learning_rate": 0.0001, + "loss": 5.4537, + "loss/crossentropy": 2.3805789947509766, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1631671041250229, + "step": 15720 + }, + { + "epoch": 0.4913125, + "grad_norm": 3.265625, + "grad_norm_var": 0.0476226806640625, + "learning_rate": 0.0001, + "loss": 5.9306, + "loss/crossentropy": 2.6870051622390747, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.16967561841011047, + "step": 15722 + }, + { + "epoch": 0.491375, + "grad_norm": 3.765625, + "grad_norm_var": 0.057356770833333334, + "learning_rate": 0.0001, + "loss": 5.511, + "loss/crossentropy": 2.3568965196609497, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16892804205417633, + "step": 15724 + }, + { + "epoch": 0.4914375, + "grad_norm": 3.6875, + "grad_norm_var": 0.06994527180989583, + "learning_rate": 0.0001, + "loss": 6.0452, + "loss/crossentropy": 2.7377495765686035, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17956935614347458, + "step": 15726 + }, + { + "epoch": 0.4915, + "grad_norm": 3.359375, + "grad_norm_var": 0.07031962076822916, + "learning_rate": 0.0001, + "loss": 5.8133, + "loss/crossentropy": 2.5982768535614014, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.16681692004203796, + "step": 15728 + }, + { + "epoch": 0.4915625, + "grad_norm": 3.71875, + "grad_norm_var": 0.06838785807291667, + "learning_rate": 0.0001, + "loss": 6.1405, + "loss/crossentropy": 2.72856867313385, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1865089237689972, + "step": 15730 + }, + { + "epoch": 0.491625, + "grad_norm": 3.34375, + "grad_norm_var": 0.0654449462890625, + "learning_rate": 0.0001, + "loss": 5.7988, + "loss/crossentropy": 2.4939388036727905, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17814233899116516, + "step": 15732 + }, + { + "epoch": 0.4916875, + "grad_norm": 3.515625, + "grad_norm_var": 0.051285807291666666, + "learning_rate": 0.0001, + "loss": 6.1611, + "loss/crossentropy": 2.8457610607147217, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18036296963691711, + "step": 15734 + }, + { + "epoch": 0.49175, + "grad_norm": 3.3125, + "grad_norm_var": 0.041304524739583334, + "learning_rate": 0.0001, + "loss": 5.8231, + "loss/crossentropy": 2.62145733833313, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17055156826972961, + "step": 15736 + }, + { + "epoch": 0.4918125, + "grad_norm": 2.90625, + "grad_norm_var": 0.05705464680989583, + "learning_rate": 0.0001, + "loss": 5.7634, + "loss/crossentropy": 2.5695502758026123, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.17407134175300598, + "step": 15738 + }, + { + "epoch": 0.491875, + "grad_norm": 3.0625, + "grad_norm_var": 0.05419514973958333, + "learning_rate": 0.0001, + "loss": 5.8107, + "loss/crossentropy": 2.575039505958557, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17434308677911758, + "step": 15740 + }, + { + "epoch": 0.4919375, + "grad_norm": 3.109375, + "grad_norm_var": 0.05912984212239583, + "learning_rate": 0.0001, + "loss": 5.9301, + "loss/crossentropy": 2.6661219596862793, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1767871379852295, + "step": 15742 + }, + { + "epoch": 0.492, + "grad_norm": 3.25, + "grad_norm_var": 0.058527628580729164, + "learning_rate": 0.0001, + "loss": 5.585, + "loss/crossentropy": 2.4229283332824707, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16894365847110748, + "step": 15744 + }, + { + "epoch": 0.4920625, + "grad_norm": 3.1875, + "grad_norm_var": 0.0479644775390625, + "learning_rate": 0.0001, + "loss": 5.8786, + "loss/crossentropy": 2.6585851907730103, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17473089694976807, + "step": 15746 + }, + { + "epoch": 0.492125, + "grad_norm": 3.328125, + "grad_norm_var": 0.0489898681640625, + "learning_rate": 0.0001, + "loss": 5.791, + "loss/crossentropy": 2.6436983346939087, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16511915624141693, + "step": 15748 + }, + { + "epoch": 0.4921875, + "grad_norm": 3.0, + "grad_norm_var": 0.04908447265625, + "learning_rate": 0.0001, + "loss": 5.7434, + "loss/crossentropy": 2.5909314155578613, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16680838912725449, + "step": 15750 + }, + { + "epoch": 0.49225, + "grad_norm": 3.25, + "grad_norm_var": 0.0529449462890625, + "learning_rate": 0.0001, + "loss": 5.4595, + "loss/crossentropy": 2.351040482521057, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16436628252267838, + "step": 15752 + }, + { + "epoch": 0.4923125, + "grad_norm": 3.125, + "grad_norm_var": 0.048151652018229164, + "learning_rate": 0.0001, + "loss": 5.8182, + "loss/crossentropy": 2.605093002319336, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1732684075832367, + "step": 15754 + }, + { + "epoch": 0.492375, + "grad_norm": 2.984375, + "grad_norm_var": 0.03340555826822917, + "learning_rate": 0.0001, + "loss": 5.8776, + "loss/crossentropy": 2.666801691055298, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1730295717716217, + "step": 15756 + }, + { + "epoch": 0.4924375, + "grad_norm": 3.296875, + "grad_norm_var": 0.014436848958333333, + "learning_rate": 0.0001, + "loss": 5.8782, + "loss/crossentropy": 2.508259654045105, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1838727369904518, + "step": 15758 + }, + { + "epoch": 0.4925, + "grad_norm": 3.203125, + "grad_norm_var": 0.021654256184895835, + "learning_rate": 0.0001, + "loss": 6.0234, + "loss/crossentropy": 2.7083660364151, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17837554216384888, + "step": 15760 + }, + { + "epoch": 0.4925625, + "grad_norm": 3.03125, + "grad_norm_var": 0.02467041015625, + "learning_rate": 0.0001, + "loss": 5.6442, + "loss/crossentropy": 2.5146384239196777, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16412509232759476, + "step": 15762 + }, + { + "epoch": 0.492625, + "grad_norm": 3.0625, + "grad_norm_var": 0.023563639322916666, + "learning_rate": 0.0001, + "loss": 5.8087, + "loss/crossentropy": 2.599011540412903, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17136258631944656, + "step": 15764 + }, + { + "epoch": 0.4926875, + "grad_norm": 3.140625, + "grad_norm_var": 0.02135009765625, + "learning_rate": 0.0001, + "loss": 6.1348, + "loss/crossentropy": 2.864608883857727, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17545898258686066, + "step": 15766 + }, + { + "epoch": 0.49275, + "grad_norm": 3.21875, + "grad_norm_var": 0.017682902018229165, + "learning_rate": 0.0001, + "loss": 5.5526, + "loss/crossentropy": 2.442115902900696, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16808071732521057, + "step": 15768 + }, + { + "epoch": 0.4928125, + "grad_norm": 3.0, + "grad_norm_var": 0.019391886393229165, + "learning_rate": 0.0001, + "loss": 5.5731, + "loss/crossentropy": 2.4782867431640625, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16299203038215637, + "step": 15770 + }, + { + "epoch": 0.492875, + "grad_norm": 3.125, + "grad_norm_var": 0.016852823893229167, + "learning_rate": 0.0001, + "loss": 5.6081, + "loss/crossentropy": 2.4362006187438965, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16719000041484833, + "step": 15772 + }, + { + "epoch": 0.4929375, + "grad_norm": 3.390625, + "grad_norm_var": 0.01842041015625, + "learning_rate": 0.0001, + "loss": 5.9624, + "loss/crossentropy": 2.607534646987915, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18509627133607864, + "step": 15774 + }, + { + "epoch": 0.493, + "grad_norm": 2.875, + "grad_norm_var": 0.04576416015625, + "learning_rate": 0.0001, + "loss": 5.6579, + "loss/crossentropy": 2.476379632949829, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16541379690170288, + "step": 15776 + }, + { + "epoch": 0.4930625, + "grad_norm": 3.375, + "grad_norm_var": 0.049055989583333334, + "learning_rate": 0.0001, + "loss": 6.1386, + "loss/crossentropy": 2.7492098808288574, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1865941658616066, + "step": 15778 + }, + { + "epoch": 0.493125, + "grad_norm": 2.9375, + "grad_norm_var": 0.05415751139322917, + "learning_rate": 0.0001, + "loss": 5.5255, + "loss/crossentropy": 2.4384251832962036, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16143856197595596, + "step": 15780 + }, + { + "epoch": 0.4931875, + "grad_norm": 3.125, + "grad_norm_var": 0.05900777180989583, + "learning_rate": 0.0001, + "loss": 5.5614, + "loss/crossentropy": 2.5072191953659058, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16088951379060745, + "step": 15782 + }, + { + "epoch": 0.49325, + "grad_norm": 3.296875, + "grad_norm_var": 0.0617340087890625, + "learning_rate": 0.0001, + "loss": 5.8466, + "loss/crossentropy": 2.5994954109191895, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17510396987199783, + "step": 15784 + }, + { + "epoch": 0.4933125, + "grad_norm": 3.796875, + "grad_norm_var": 0.07994384765625, + "learning_rate": 0.0001, + "loss": 5.7199, + "loss/crossentropy": 2.497785806655884, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1722072958946228, + "step": 15786 + }, + { + "epoch": 0.493375, + "grad_norm": 3.21875, + "grad_norm_var": 0.07884012858072917, + "learning_rate": 0.0001, + "loss": 5.8925, + "loss/crossentropy": 2.6982651948928833, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17059308290481567, + "step": 15788 + }, + { + "epoch": 0.4934375, + "grad_norm": 3.34375, + "grad_norm_var": 0.07744852701822917, + "learning_rate": 0.0001, + "loss": 6.1106, + "loss/crossentropy": 2.75845205783844, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18365000188350677, + "step": 15790 + }, + { + "epoch": 0.4935, + "grad_norm": 3.203125, + "grad_norm_var": 0.04768778483072917, + "learning_rate": 0.0001, + "loss": 5.853, + "loss/crossentropy": 2.569619655609131, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17911946028470993, + "step": 15792 + }, + { + "epoch": 0.4935625, + "grad_norm": 3.265625, + "grad_norm_var": 0.043553670247395836, + "learning_rate": 0.0001, + "loss": 5.9874, + "loss/crossentropy": 2.724018096923828, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1771218106150627, + "step": 15794 + }, + { + "epoch": 0.493625, + "grad_norm": 3.171875, + "grad_norm_var": 0.036539713541666664, + "learning_rate": 0.0001, + "loss": 5.7951, + "loss/crossentropy": 2.6281638145446777, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1686425358057022, + "step": 15796 + }, + { + "epoch": 0.4936875, + "grad_norm": 3.25, + "grad_norm_var": 0.027827962239583334, + "learning_rate": 0.0001, + "loss": 5.8517, + "loss/crossentropy": 2.591265320777893, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17408807575702667, + "step": 15798 + }, + { + "epoch": 0.49375, + "grad_norm": 3.15625, + "grad_norm_var": 0.03145243326822917, + "learning_rate": 0.0001, + "loss": 5.8795, + "loss/crossentropy": 2.5996209383010864, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17720826715230942, + "step": 15800 + }, + { + "epoch": 0.4938125, + "grad_norm": 2.859375, + "grad_norm_var": 0.02047119140625, + "learning_rate": 0.0001, + "loss": 5.4859, + "loss/crossentropy": 2.459252953529358, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1589120328426361, + "step": 15802 + }, + { + "epoch": 0.493875, + "grad_norm": 3.09375, + "grad_norm_var": 0.02164306640625, + "learning_rate": 0.0001, + "loss": 5.4081, + "loss/crossentropy": 2.3998864889144897, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.1601930558681488, + "step": 15804 + }, + { + "epoch": 0.4939375, + "grad_norm": 3.171875, + "grad_norm_var": 0.14748942057291667, + "learning_rate": 0.0001, + "loss": 6.1105, + "loss/crossentropy": 2.7312912940979004, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18128246814012527, + "step": 15806 + }, + { + "epoch": 0.494, + "grad_norm": 3.375, + "grad_norm_var": 0.14765218098958333, + "learning_rate": 0.0001, + "loss": 5.9199, + "loss/crossentropy": 2.6582727432250977, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17694677412509918, + "step": 15808 + }, + { + "epoch": 0.4940625, + "grad_norm": 12.0, + "grad_norm_var": 4.86480712890625, + "learning_rate": 0.0001, + "loss": 6.2311, + "loss/crossentropy": 2.7658541202545166, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19027357548475266, + "step": 15810 + }, + { + "epoch": 0.494125, + "grad_norm": 3.234375, + "grad_norm_var": 4.840543619791666, + "learning_rate": 0.0001, + "loss": 5.6745, + "loss/crossentropy": 2.480656147003174, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17172466963529587, + "step": 15812 + }, + { + "epoch": 0.4941875, + "grad_norm": 4.0625, + "grad_norm_var": 4.833177693684896, + "learning_rate": 0.0001, + "loss": 5.394, + "loss/crossentropy": 2.290852904319763, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16305189579725266, + "step": 15814 + }, + { + "epoch": 0.49425, + "grad_norm": 3.140625, + "grad_norm_var": 4.824479166666666, + "learning_rate": 0.0001, + "loss": 6.2618, + "loss/crossentropy": 2.8281394243240356, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1875031590461731, + "step": 15816 + }, + { + "epoch": 0.4943125, + "grad_norm": 3.3125, + "grad_norm_var": 4.766731770833333, + "learning_rate": 0.0001, + "loss": 5.9952, + "loss/crossentropy": 2.7113758325576782, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1775989830493927, + "step": 15818 + }, + { + "epoch": 0.494375, + "grad_norm": 3.171875, + "grad_norm_var": 4.751414998372396, + "learning_rate": 0.0001, + "loss": 5.7191, + "loss/crossentropy": 2.561946988105774, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16610819846391678, + "step": 15820 + }, + { + "epoch": 0.4944375, + "grad_norm": 3.21875, + "grad_norm_var": 4.773551432291667, + "learning_rate": 0.0001, + "loss": 5.7712, + "loss/crossentropy": 2.569547414779663, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17250573635101318, + "step": 15822 + }, + { + "epoch": 0.4945, + "grad_norm": 3.578125, + "grad_norm_var": 4.76539306640625, + "learning_rate": 0.0001, + "loss": 5.8216, + "loss/crossentropy": 2.549016833305359, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17452429980039597, + "step": 15824 + }, + { + "epoch": 0.4945625, + "grad_norm": 3.015625, + "grad_norm_var": 0.10800679524739583, + "learning_rate": 0.0001, + "loss": 5.2381, + "loss/crossentropy": 2.2993987798690796, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15246036648750305, + "step": 15826 + }, + { + "epoch": 0.494625, + "grad_norm": 3.359375, + "grad_norm_var": 0.1213287353515625, + "learning_rate": 0.0001, + "loss": 5.6697, + "loss/crossentropy": 2.5171492099761963, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16290698945522308, + "step": 15828 + }, + { + "epoch": 0.4946875, + "grad_norm": 2.984375, + "grad_norm_var": 0.0838043212890625, + "learning_rate": 0.0001, + "loss": 5.5798, + "loss/crossentropy": 2.442649006843567, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1656683087348938, + "step": 15830 + }, + { + "epoch": 0.49475, + "grad_norm": 3.03125, + "grad_norm_var": 0.0387603759765625, + "learning_rate": 0.0001, + "loss": 5.7172, + "loss/crossentropy": 2.546152353286743, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16945255547761917, + "step": 15832 + }, + { + "epoch": 0.4948125, + "grad_norm": 3.328125, + "grad_norm_var": 0.03957926432291667, + "learning_rate": 0.0001, + "loss": 5.9672, + "loss/crossentropy": 2.719525694847107, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1778969019651413, + "step": 15834 + }, + { + "epoch": 0.494875, + "grad_norm": 3.296875, + "grad_norm_var": 0.04096577962239583, + "learning_rate": 0.0001, + "loss": 5.966, + "loss/crossentropy": 2.706810235977173, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1759188175201416, + "step": 15836 + }, + { + "epoch": 0.4949375, + "grad_norm": 3.125, + "grad_norm_var": 0.03937886555989583, + "learning_rate": 0.0001, + "loss": 5.4779, + "loss/crossentropy": 2.4548873901367188, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15620262920856476, + "step": 15838 + }, + { + "epoch": 0.495, + "grad_norm": 3.328125, + "grad_norm_var": 0.024637858072916668, + "learning_rate": 0.0001, + "loss": 5.7715, + "loss/crossentropy": 2.557394862174988, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17102333158254623, + "step": 15840 + }, + { + "epoch": 0.4950625, + "grad_norm": 3.28125, + "grad_norm_var": 0.03892313639322917, + "learning_rate": 0.0001, + "loss": 6.1132, + "loss/crossentropy": 2.7207332849502563, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18377604335546494, + "step": 15842 + }, + { + "epoch": 0.495125, + "grad_norm": 3.421875, + "grad_norm_var": 0.03394775390625, + "learning_rate": 0.0001, + "loss": 5.9958, + "loss/crossentropy": 2.731584906578064, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17642249166965485, + "step": 15844 + }, + { + "epoch": 0.4951875, + "grad_norm": 3.0625, + "grad_norm_var": 0.04205322265625, + "learning_rate": 0.0001, + "loss": 5.6099, + "loss/crossentropy": 2.4367045164108276, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17083469778299332, + "step": 15846 + }, + { + "epoch": 0.49525, + "grad_norm": 3.109375, + "grad_norm_var": 0.030985514322916668, + "learning_rate": 0.0001, + "loss": 5.7844, + "loss/crossentropy": 2.558732748031616, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17257042229175568, + "step": 15848 + }, + { + "epoch": 0.4953125, + "grad_norm": 3.109375, + "grad_norm_var": 0.03092041015625, + "learning_rate": 0.0001, + "loss": 6.1288, + "loss/crossentropy": 2.851402997970581, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17617689073085785, + "step": 15850 + }, + { + "epoch": 0.495375, + "grad_norm": 3.15625, + "grad_norm_var": 0.029889933268229165, + "learning_rate": 0.0001, + "loss": 5.5517, + "loss/crossentropy": 2.4140695333480835, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16610509157180786, + "step": 15852 + }, + { + "epoch": 0.4954375, + "grad_norm": 2.90625, + "grad_norm_var": 0.0359375, + "learning_rate": 0.0001, + "loss": 5.8073, + "loss/crossentropy": 2.63225257396698, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1671120896935463, + "step": 15854 + }, + { + "epoch": 0.4955, + "grad_norm": 3.5, + "grad_norm_var": 0.0517486572265625, + "learning_rate": 0.0001, + "loss": 5.9302, + "loss/crossentropy": 2.7441340684890747, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17134114354848862, + "step": 15856 + }, + { + "epoch": 0.4955625, + "grad_norm": 3.390625, + "grad_norm_var": 0.05275777180989583, + "learning_rate": 0.0001, + "loss": 5.7635, + "loss/crossentropy": 2.5725748538970947, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16752752661705017, + "step": 15858 + }, + { + "epoch": 0.495625, + "grad_norm": 3.21875, + "grad_norm_var": 0.049702962239583336, + "learning_rate": 0.0001, + "loss": 5.8945, + "loss/crossentropy": 2.5877639055252075, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1802806630730629, + "step": 15860 + }, + { + "epoch": 0.4956875, + "grad_norm": 3.25, + "grad_norm_var": 0.04422200520833333, + "learning_rate": 0.0001, + "loss": 6.1524, + "loss/crossentropy": 2.797191023826599, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1808350756764412, + "step": 15862 + }, + { + "epoch": 0.49575, + "grad_norm": 3.390625, + "grad_norm_var": 0.046849568684895836, + "learning_rate": 0.0001, + "loss": 5.7258, + "loss/crossentropy": 2.5211658477783203, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17007212340831757, + "step": 15864 + }, + { + "epoch": 0.4958125, + "grad_norm": 3.671875, + "grad_norm_var": 0.06403706868489584, + "learning_rate": 0.0001, + "loss": 5.9797, + "loss/crossentropy": 2.576448082923889, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.18055523186922073, + "step": 15866 + }, + { + "epoch": 0.495875, + "grad_norm": 3.171875, + "grad_norm_var": 0.06292215983072917, + "learning_rate": 0.0001, + "loss": 5.8199, + "loss/crossentropy": 2.5816270112991333, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1746114268898964, + "step": 15868 + }, + { + "epoch": 0.4959375, + "grad_norm": 2.984375, + "grad_norm_var": 0.0600250244140625, + "learning_rate": 0.0001, + "loss": 5.9041, + "loss/crossentropy": 2.7266438007354736, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16891635954380035, + "step": 15870 + }, + { + "epoch": 0.496, + "grad_norm": 3.90625, + "grad_norm_var": 0.06288655598958333, + "learning_rate": 0.0001, + "loss": 5.9347, + "loss/crossentropy": 2.6614303588867188, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1714666336774826, + "step": 15872 + }, + { + "epoch": 0.4960625, + "grad_norm": 2.890625, + "grad_norm_var": 0.07214253743489583, + "learning_rate": 0.0001, + "loss": 5.5861, + "loss/crossentropy": 2.4560959339141846, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16495682299137115, + "step": 15874 + }, + { + "epoch": 0.496125, + "grad_norm": 3.078125, + "grad_norm_var": 0.07622782389322917, + "learning_rate": 0.0001, + "loss": 5.4371, + "loss/crossentropy": 2.36905038356781, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15953627973794937, + "step": 15876 + }, + { + "epoch": 0.4961875, + "grad_norm": 3.125, + "grad_norm_var": 0.07359110514322917, + "learning_rate": 0.0001, + "loss": 5.869, + "loss/crossentropy": 2.6835721731185913, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1720564141869545, + "step": 15878 + }, + { + "epoch": 0.49625, + "grad_norm": 3.3125, + "grad_norm_var": 0.07060546875, + "learning_rate": 0.0001, + "loss": 5.8529, + "loss/crossentropy": 2.643310546875, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1725200042128563, + "step": 15880 + }, + { + "epoch": 0.4963125, + "grad_norm": 3.03125, + "grad_norm_var": 0.0490386962890625, + "learning_rate": 0.0001, + "loss": 5.9573, + "loss/crossentropy": 2.751884341239929, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17327306419610977, + "step": 15882 + }, + { + "epoch": 0.496375, + "grad_norm": 3.3125, + "grad_norm_var": 0.051611328125, + "learning_rate": 0.0001, + "loss": 5.7362, + "loss/crossentropy": 2.4810686111450195, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17434068024158478, + "step": 15884 + }, + { + "epoch": 0.4964375, + "grad_norm": 3.15625, + "grad_norm_var": 0.0598785400390625, + "learning_rate": 0.0001, + "loss": 5.8693, + "loss/crossentropy": 2.649235248565674, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.170439213514328, + "step": 15886 + }, + { + "epoch": 0.4965, + "grad_norm": 3.34375, + "grad_norm_var": 0.03590087890625, + "learning_rate": 0.0001, + "loss": 5.965, + "loss/crossentropy": 2.570220947265625, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18401052057743073, + "step": 15888 + }, + { + "epoch": 0.4965625, + "grad_norm": 2.953125, + "grad_norm_var": 0.03332926432291667, + "learning_rate": 0.0001, + "loss": 5.635, + "loss/crossentropy": 2.4804067611694336, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16819289326667786, + "step": 15890 + }, + { + "epoch": 0.496625, + "grad_norm": 3.265625, + "grad_norm_var": 0.0297515869140625, + "learning_rate": 0.0001, + "loss": 5.7789, + "loss/crossentropy": 2.5864343643188477, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1684667244553566, + "step": 15892 + }, + { + "epoch": 0.4966875, + "grad_norm": 2.84375, + "grad_norm_var": 0.0404449462890625, + "learning_rate": 0.0001, + "loss": 5.695, + "loss/crossentropy": 2.6350271701812744, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15951280295848846, + "step": 15894 + }, + { + "epoch": 0.49675, + "grad_norm": 3.15625, + "grad_norm_var": 0.041357421875, + "learning_rate": 0.0001, + "loss": 5.5124, + "loss/crossentropy": 2.411333203315735, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16049957275390625, + "step": 15896 + }, + { + "epoch": 0.4968125, + "grad_norm": 3.53125, + "grad_norm_var": 0.04854227701822917, + "learning_rate": 0.0001, + "loss": 5.5888, + "loss/crossentropy": 2.4651507139205933, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16431937366724014, + "step": 15898 + }, + { + "epoch": 0.496875, + "grad_norm": 5.125, + "grad_norm_var": 0.2789459228515625, + "learning_rate": 0.0001, + "loss": 5.4275, + "loss/crossentropy": 2.292418360710144, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16116555780172348, + "step": 15900 + }, + { + "epoch": 0.4969375, + "grad_norm": 3.3125, + "grad_norm_var": 0.26962890625, + "learning_rate": 0.0001, + "loss": 5.5919, + "loss/crossentropy": 2.468536615371704, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16780667752027512, + "step": 15902 + }, + { + "epoch": 0.497, + "grad_norm": 3.328125, + "grad_norm_var": 0.27049051920572914, + "learning_rate": 0.0001, + "loss": 6.0847, + "loss/crossentropy": 2.765723466873169, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17994843423366547, + "step": 15904 + }, + { + "epoch": 0.4970625, + "grad_norm": 3.328125, + "grad_norm_var": 0.26126200358072915, + "learning_rate": 0.0001, + "loss": 5.9942, + "loss/crossentropy": 2.714383363723755, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17758838087320328, + "step": 15906 + }, + { + "epoch": 0.497125, + "grad_norm": 3.125, + "grad_norm_var": 0.2636789957682292, + "learning_rate": 0.0001, + "loss": 5.866, + "loss/crossentropy": 2.56651508808136, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1779971495270729, + "step": 15908 + }, + { + "epoch": 0.4971875, + "grad_norm": 3.125, + "grad_norm_var": 0.2510650634765625, + "learning_rate": 0.0001, + "loss": 5.8023, + "loss/crossentropy": 2.5108169317245483, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1744590848684311, + "step": 15910 + }, + { + "epoch": 0.49725, + "grad_norm": 3.53125, + "grad_norm_var": 0.24410400390625, + "learning_rate": 0.0001, + "loss": 5.6003, + "loss/crossentropy": 2.420087218284607, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17114916443824768, + "step": 15912 + }, + { + "epoch": 0.4973125, + "grad_norm": 3.296875, + "grad_norm_var": 0.24005533854166666, + "learning_rate": 0.0001, + "loss": 5.8849, + "loss/crossentropy": 2.6445670127868652, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1732504740357399, + "step": 15914 + }, + { + "epoch": 0.497375, + "grad_norm": 3.015625, + "grad_norm_var": 0.03037109375, + "learning_rate": 0.0001, + "loss": 5.7636, + "loss/crossentropy": 2.649904727935791, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1625397726893425, + "step": 15916 + }, + { + "epoch": 0.4974375, + "grad_norm": 3.125, + "grad_norm_var": 0.03775634765625, + "learning_rate": 0.0001, + "loss": 5.9128, + "loss/crossentropy": 2.6058311462402344, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17757585644721985, + "step": 15918 + }, + { + "epoch": 0.4975, + "grad_norm": 3.59375, + "grad_norm_var": 0.0398590087890625, + "learning_rate": 0.0001, + "loss": 5.9503, + "loss/crossentropy": 2.6391966342926025, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.18228397518396378, + "step": 15920 + }, + { + "epoch": 0.4975625, + "grad_norm": 3.234375, + "grad_norm_var": 0.04195556640625, + "learning_rate": 0.0001, + "loss": 5.6969, + "loss/crossentropy": 2.5219361782073975, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16827717423439026, + "step": 15922 + }, + { + "epoch": 0.497625, + "grad_norm": 3.375, + "grad_norm_var": 0.04351806640625, + "learning_rate": 0.0001, + "loss": 5.883, + "loss/crossentropy": 2.6181185245513916, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17570406198501587, + "step": 15924 + }, + { + "epoch": 0.4976875, + "grad_norm": 3.40625, + "grad_norm_var": 0.03619791666666667, + "learning_rate": 0.0001, + "loss": 5.5938, + "loss/crossentropy": 2.4653561115264893, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.164012610912323, + "step": 15926 + }, + { + "epoch": 0.49775, + "grad_norm": 3.53125, + "grad_norm_var": 0.03616129557291667, + "learning_rate": 0.0001, + "loss": 5.8606, + "loss/crossentropy": 2.5704281330108643, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1794053092598915, + "step": 15928 + }, + { + "epoch": 0.4978125, + "grad_norm": 3.09375, + "grad_norm_var": 0.036356608072916664, + "learning_rate": 0.0001, + "loss": 5.6433, + "loss/crossentropy": 2.4557920694351196, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17031186819076538, + "step": 15930 + }, + { + "epoch": 0.497875, + "grad_norm": 3.359375, + "grad_norm_var": 0.036009724934895834, + "learning_rate": 0.0001, + "loss": 5.849, + "loss/crossentropy": 2.6339017152786255, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17190106213092804, + "step": 15932 + }, + { + "epoch": 0.4979375, + "grad_norm": 3.234375, + "grad_norm_var": 0.0318511962890625, + "learning_rate": 0.0001, + "loss": 5.8076, + "loss/crossentropy": 2.5940616130828857, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17330337315797806, + "step": 15934 + }, + { + "epoch": 0.498, + "grad_norm": 3.09375, + "grad_norm_var": 0.0240142822265625, + "learning_rate": 0.0001, + "loss": 5.7265, + "loss/crossentropy": 2.5966960191726685, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16571569442749023, + "step": 15936 + }, + { + "epoch": 0.4980625, + "grad_norm": 3.0, + "grad_norm_var": 0.03152669270833333, + "learning_rate": 0.0001, + "loss": 5.542, + "loss/crossentropy": 2.4944313764572144, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1602211520075798, + "step": 15938 + }, + { + "epoch": 0.498125, + "grad_norm": 3.03125, + "grad_norm_var": 0.029645792643229165, + "learning_rate": 0.0001, + "loss": 5.7389, + "loss/crossentropy": 2.618900418281555, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16434358060359955, + "step": 15940 + }, + { + "epoch": 0.4981875, + "grad_norm": 3.234375, + "grad_norm_var": 0.0260894775390625, + "learning_rate": 0.0001, + "loss": 5.4484, + "loss/crossentropy": 2.3067928552627563, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.15985964238643646, + "step": 15942 + }, + { + "epoch": 0.49825, + "grad_norm": 3.53125, + "grad_norm_var": 0.031005859375, + "learning_rate": 0.0001, + "loss": 5.6273, + "loss/crossentropy": 2.4584217071533203, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1680569276213646, + "step": 15944 + }, + { + "epoch": 0.4983125, + "grad_norm": 3.09375, + "grad_norm_var": 0.031672159830729164, + "learning_rate": 0.0001, + "loss": 5.8246, + "loss/crossentropy": 2.6650729179382324, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.17220134288072586, + "step": 15946 + }, + { + "epoch": 0.498375, + "grad_norm": 3.09375, + "grad_norm_var": 0.028669230143229165, + "learning_rate": 0.0001, + "loss": 5.8149, + "loss/crossentropy": 2.5920504331588745, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17346028238534927, + "step": 15948 + }, + { + "epoch": 0.4984375, + "grad_norm": 3.609375, + "grad_norm_var": 0.04453837076822917, + "learning_rate": 0.0001, + "loss": 6.1388, + "loss/crossentropy": 2.79826557636261, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17975997924804688, + "step": 15950 + }, + { + "epoch": 0.4985, + "grad_norm": 3.265625, + "grad_norm_var": 0.046873982747395834, + "learning_rate": 0.0001, + "loss": 5.8, + "loss/crossentropy": 2.5743913650512695, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17138755321502686, + "step": 15952 + }, + { + "epoch": 0.4985625, + "grad_norm": 2.953125, + "grad_norm_var": 0.05056050618489583, + "learning_rate": 0.0001, + "loss": 5.7426, + "loss/crossentropy": 2.596100926399231, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1665981337428093, + "step": 15954 + }, + { + "epoch": 0.498625, + "grad_norm": 3.15625, + "grad_norm_var": 0.049193318684895834, + "learning_rate": 0.0001, + "loss": 5.7905, + "loss/crossentropy": 2.6612913608551025, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16370096802711487, + "step": 15956 + }, + { + "epoch": 0.4986875, + "grad_norm": 3.578125, + "grad_norm_var": 0.060042317708333334, + "learning_rate": 0.0001, + "loss": 6.1546, + "loss/crossentropy": 2.6566189527511597, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.19237946718931198, + "step": 15958 + }, + { + "epoch": 0.49875, + "grad_norm": 3.09375, + "grad_norm_var": 0.04508056640625, + "learning_rate": 0.0001, + "loss": 5.8987, + "loss/crossentropy": 2.6189379692077637, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17797543853521347, + "step": 15960 + }, + { + "epoch": 0.4988125, + "grad_norm": 3.28125, + "grad_norm_var": 0.049250284830729164, + "learning_rate": 0.0001, + "loss": 5.6761, + "loss/crossentropy": 2.514289379119873, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1689145490527153, + "step": 15962 + }, + { + "epoch": 0.498875, + "grad_norm": 3.421875, + "grad_norm_var": 0.048974609375, + "learning_rate": 0.0001, + "loss": 6.1333, + "loss/crossentropy": 2.835245370864868, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1794164776802063, + "step": 15964 + }, + { + "epoch": 0.4989375, + "grad_norm": 3.359375, + "grad_norm_var": 0.0409820556640625, + "learning_rate": 0.0001, + "loss": 5.9531, + "loss/crossentropy": 2.6926647424697876, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1756543666124344, + "step": 15966 + }, + { + "epoch": 0.499, + "grad_norm": 2.953125, + "grad_norm_var": 0.040558878580729166, + "learning_rate": 0.0001, + "loss": 6.0007, + "loss/crossentropy": 2.7710163593292236, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1725805401802063, + "step": 15968 + }, + { + "epoch": 0.4990625, + "grad_norm": 3.578125, + "grad_norm_var": 0.041829427083333336, + "learning_rate": 0.0001, + "loss": 5.8248, + "loss/crossentropy": 2.563231348991394, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17420003563165665, + "step": 15970 + }, + { + "epoch": 0.499125, + "grad_norm": 3.359375, + "grad_norm_var": 0.042952473958333334, + "learning_rate": 0.0001, + "loss": 5.7934, + "loss/crossentropy": 2.5948245525360107, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17337030172348022, + "step": 15972 + }, + { + "epoch": 0.4991875, + "grad_norm": 3.0625, + "grad_norm_var": 0.03693745930989583, + "learning_rate": 0.0001, + "loss": 6.0875, + "loss/crossentropy": 2.794007182121277, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17739754915237427, + "step": 15974 + }, + { + "epoch": 0.49925, + "grad_norm": 2.9375, + "grad_norm_var": 0.04419657389322917, + "learning_rate": 0.0001, + "loss": 5.8663, + "loss/crossentropy": 2.6902471780776978, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16839084029197693, + "step": 15976 + }, + { + "epoch": 0.4993125, + "grad_norm": 3.171875, + "grad_norm_var": 0.0392730712890625, + "learning_rate": 0.0001, + "loss": 5.6043, + "loss/crossentropy": 2.4186110496520996, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16818278282880783, + "step": 15978 + }, + { + "epoch": 0.499375, + "grad_norm": 3.171875, + "grad_norm_var": 0.0382965087890625, + "learning_rate": 0.0001, + "loss": 5.893, + "loss/crossentropy": 2.639996647834778, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1721796840429306, + "step": 15980 + }, + { + "epoch": 0.4994375, + "grad_norm": 3.03125, + "grad_norm_var": 0.03852437337239583, + "learning_rate": 0.0001, + "loss": 5.8705, + "loss/crossentropy": 2.6478593349456787, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1726580560207367, + "step": 15982 + }, + { + "epoch": 0.4995, + "grad_norm": 3.125, + "grad_norm_var": 0.03477274576822917, + "learning_rate": 0.0001, + "loss": 5.9556, + "loss/crossentropy": 2.6546578407287598, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1757965162396431, + "step": 15984 + }, + { + "epoch": 0.4995625, + "grad_norm": 3.328125, + "grad_norm_var": 0.0257476806640625, + "learning_rate": 0.0001, + "loss": 5.8515, + "loss/crossentropy": 2.7225167751312256, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1652454286813736, + "step": 15986 + }, + { + "epoch": 0.499625, + "grad_norm": 3.140625, + "grad_norm_var": 0.024860636393229166, + "learning_rate": 0.0001, + "loss": 5.7254, + "loss/crossentropy": 2.6186875104904175, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16575387120246887, + "step": 15988 + }, + { + "epoch": 0.4996875, + "grad_norm": 3.25, + "grad_norm_var": 0.025309244791666668, + "learning_rate": 0.0001, + "loss": 5.7961, + "loss/crossentropy": 2.5866594314575195, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17172956466674805, + "step": 15990 + }, + { + "epoch": 0.49975, + "grad_norm": 2.984375, + "grad_norm_var": 0.02496337890625, + "learning_rate": 0.0001, + "loss": 5.6234, + "loss/crossentropy": 2.483269691467285, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16713418066501617, + "step": 15992 + }, + { + "epoch": 0.4998125, + "grad_norm": 3.234375, + "grad_norm_var": 0.029491170247395834, + "learning_rate": 0.0001, + "loss": 5.9615, + "loss/crossentropy": 2.7531800270080566, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17239541560411453, + "step": 15994 + }, + { + "epoch": 0.499875, + "grad_norm": 2.953125, + "grad_norm_var": 0.03139546712239583, + "learning_rate": 0.0001, + "loss": 5.5012, + "loss/crossentropy": 2.471090793609619, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15848200023174286, + "step": 15996 + }, + { + "epoch": 0.4999375, + "grad_norm": 3.03125, + "grad_norm_var": 0.03134765625, + "learning_rate": 0.0001, + "loss": 5.8642, + "loss/crossentropy": 2.6671937704086304, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17360534518957138, + "step": 15998 + }, + { + "epoch": 0.5, + "grad_norm": 3.109375, + "grad_norm_var": 0.03413798014322917, + "learning_rate": 0.0001, + "loss": 5.8482, + "loss/crossentropy": 2.6166086196899414, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1719907894730568, + "step": 16000 + }, + { + "epoch": 0.5000625, + "grad_norm": 3.421875, + "grad_norm_var": 0.03278706868489583, + "learning_rate": 0.0001, + "loss": 5.891, + "loss/crossentropy": 2.64286208152771, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17442382872104645, + "step": 16002 + }, + { + "epoch": 0.500125, + "grad_norm": 3.859375, + "grad_norm_var": 0.06376546223958333, + "learning_rate": 0.0001, + "loss": 5.7983, + "loss/crossentropy": 2.474073648452759, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.17655911296606064, + "step": 16004 + }, + { + "epoch": 0.5001875, + "grad_norm": 3.421875, + "grad_norm_var": 0.06347249348958334, + "learning_rate": 0.0001, + "loss": 5.4582, + "loss/crossentropy": 2.3265300989151, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16473408788442612, + "step": 16006 + }, + { + "epoch": 0.50025, + "grad_norm": 3.328125, + "grad_norm_var": 0.0589508056640625, + "learning_rate": 0.0001, + "loss": 5.7411, + "loss/crossentropy": 2.614622473716736, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16382355988025665, + "step": 16008 + }, + { + "epoch": 0.5003125, + "grad_norm": 3.09375, + "grad_norm_var": 0.05357666015625, + "learning_rate": 0.0001, + "loss": 5.5548, + "loss/crossentropy": 2.4665409326553345, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16273151338100433, + "step": 16010 + }, + { + "epoch": 0.500375, + "grad_norm": 3.078125, + "grad_norm_var": 0.048726399739583336, + "learning_rate": 0.0001, + "loss": 5.9993, + "loss/crossentropy": 2.73794686794281, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17730379849672318, + "step": 16012 + }, + { + "epoch": 0.5004375, + "grad_norm": 3.375, + "grad_norm_var": 0.04503580729166667, + "learning_rate": 0.0001, + "loss": 5.8427, + "loss/crossentropy": 2.6063257455825806, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17285971343517303, + "step": 16014 + }, + { + "epoch": 0.5005, + "grad_norm": 3.15625, + "grad_norm_var": 0.08115234375, + "learning_rate": 0.0001, + "loss": 5.8164, + "loss/crossentropy": 2.5140546560287476, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17984529584646225, + "step": 16016 + }, + { + "epoch": 0.5005625, + "grad_norm": 7.5625, + "grad_norm_var": 1.1968912760416666, + "learning_rate": 0.0001, + "loss": 6.0672, + "loss/crossentropy": 2.604338765144348, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.19550367444753647, + "step": 16018 + }, + { + "epoch": 0.500625, + "grad_norm": 3.671875, + "grad_norm_var": 1.1863444010416666, + "learning_rate": 0.0001, + "loss": 5.759, + "loss/crossentropy": 2.476668357849121, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1766715720295906, + "step": 16020 + }, + { + "epoch": 0.5006875, + "grad_norm": 3.296875, + "grad_norm_var": 1.1625162760416667, + "learning_rate": 0.0001, + "loss": 6.1761, + "loss/crossentropy": 2.6990877389907837, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.19144807755947113, + "step": 16022 + }, + { + "epoch": 0.50075, + "grad_norm": 3.140625, + "grad_norm_var": 1.1527506510416667, + "learning_rate": 0.0001, + "loss": 5.9818, + "loss/crossentropy": 2.6940040588378906, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1791740357875824, + "step": 16024 + }, + { + "epoch": 0.5008125, + "grad_norm": 3.3125, + "grad_norm_var": 1.1218251546223958, + "learning_rate": 0.0001, + "loss": 6.1611, + "loss/crossentropy": 2.806161403656006, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18314571678638458, + "step": 16026 + }, + { + "epoch": 0.500875, + "grad_norm": 3.046875, + "grad_norm_var": 1.1304921468098958, + "learning_rate": 0.0001, + "loss": 5.9907, + "loss/crossentropy": 2.7044265270233154, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17706524580717087, + "step": 16028 + }, + { + "epoch": 0.5009375, + "grad_norm": 3.0625, + "grad_norm_var": 1.1634999593098958, + "learning_rate": 0.0001, + "loss": 5.6703, + "loss/crossentropy": 2.5244998931884766, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16379740089178085, + "step": 16030 + }, + { + "epoch": 0.501, + "grad_norm": 4.90625, + "grad_norm_var": 1.232494099934896, + "learning_rate": 0.0001, + "loss": 6.0419, + "loss/crossentropy": 2.602308511734009, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18927372992038727, + "step": 16032 + }, + { + "epoch": 0.5010625, + "grad_norm": 3.375, + "grad_norm_var": 0.19861551920572917, + "learning_rate": 0.0001, + "loss": 5.9137, + "loss/crossentropy": 2.627810001373291, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1770292967557907, + "step": 16034 + }, + { + "epoch": 0.501125, + "grad_norm": 3.078125, + "grad_norm_var": 0.2118072509765625, + "learning_rate": 0.0001, + "loss": 5.4875, + "loss/crossentropy": 2.417228937149048, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1597566306591034, + "step": 16036 + }, + { + "epoch": 0.5011875, + "grad_norm": 2.984375, + "grad_norm_var": 0.2126129150390625, + "learning_rate": 0.0001, + "loss": 5.6537, + "loss/crossentropy": 2.5580382347106934, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16151735931634903, + "step": 16038 + }, + { + "epoch": 0.50125, + "grad_norm": 3.03125, + "grad_norm_var": 0.20921223958333332, + "learning_rate": 0.0001, + "loss": 5.3003, + "loss/crossentropy": 2.281244397163391, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1538594588637352, + "step": 16040 + }, + { + "epoch": 0.5013125, + "grad_norm": 3.625, + "grad_norm_var": 1.855304972330729, + "learning_rate": 0.0001, + "loss": 6.4356, + "loss/crossentropy": 2.6569602489471436, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.2173164114356041, + "step": 16042 + }, + { + "epoch": 0.501375, + "grad_norm": 3.640625, + "grad_norm_var": 1.8310292561848958, + "learning_rate": 0.0001, + "loss": 6.0169, + "loss/crossentropy": 2.6723943948745728, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18327917158603668, + "step": 16044 + }, + { + "epoch": 0.5014375, + "grad_norm": 3.359375, + "grad_norm_var": 1.8175944010416667, + "learning_rate": 0.0001, + "loss": 6.2204, + "loss/crossentropy": 2.9426517486572266, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1765984445810318, + "step": 16046 + }, + { + "epoch": 0.5015, + "grad_norm": 3.40625, + "grad_norm_var": 1.7128733317057292, + "learning_rate": 0.0001, + "loss": 5.8617, + "loss/crossentropy": 2.5231932401657104, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17994019389152527, + "step": 16048 + }, + { + "epoch": 0.5015625, + "grad_norm": 3.359375, + "grad_norm_var": 1.7375396728515624, + "learning_rate": 0.0001, + "loss": 5.6578, + "loss/crossentropy": 2.5435420274734497, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16767632216215134, + "step": 16050 + }, + { + "epoch": 0.501625, + "grad_norm": 3.078125, + "grad_norm_var": 1.7257232666015625, + "learning_rate": 0.0001, + "loss": 5.7299, + "loss/crossentropy": 2.5690709352493286, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16999147087335587, + "step": 16052 + }, + { + "epoch": 0.5016875, + "grad_norm": 3.734375, + "grad_norm_var": 1.7012115478515626, + "learning_rate": 0.0001, + "loss": 5.9279, + "loss/crossentropy": 2.4675475358963013, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18900148570537567, + "step": 16054 + }, + { + "epoch": 0.50175, + "grad_norm": 3.390625, + "grad_norm_var": 1.6786946614583333, + "learning_rate": 0.0001, + "loss": 5.8318, + "loss/crossentropy": 2.571017861366272, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17919930070638657, + "step": 16056 + }, + { + "epoch": 0.5018125, + "grad_norm": 3.234375, + "grad_norm_var": 0.047998046875, + "learning_rate": 0.0001, + "loss": 5.297, + "loss/crossentropy": 2.2698299884796143, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16052843630313873, + "step": 16058 + }, + { + "epoch": 0.501875, + "grad_norm": 3.171875, + "grad_norm_var": 0.051512654622395834, + "learning_rate": 0.0001, + "loss": 5.459, + "loss/crossentropy": 2.4018373489379883, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16001027822494507, + "step": 16060 + }, + { + "epoch": 0.5019375, + "grad_norm": 3.21875, + "grad_norm_var": 0.05315755208333333, + "learning_rate": 0.0001, + "loss": 5.6102, + "loss/crossentropy": 2.4685033559799194, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1688537895679474, + "step": 16062 + }, + { + "epoch": 0.502, + "grad_norm": 3.171875, + "grad_norm_var": 0.04537353515625, + "learning_rate": 0.0001, + "loss": 5.9411, + "loss/crossentropy": 2.6571015119552612, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17761804163455963, + "step": 16064 + }, + { + "epoch": 0.5020625, + "grad_norm": 2.953125, + "grad_norm_var": 0.04302978515625, + "learning_rate": 0.0001, + "loss": 5.4804, + "loss/crossentropy": 2.413459300994873, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16138406842947006, + "step": 16066 + }, + { + "epoch": 0.502125, + "grad_norm": 2.953125, + "grad_norm_var": 0.043309529622395836, + "learning_rate": 0.0001, + "loss": 5.562, + "loss/crossentropy": 2.447237491607666, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16498787701129913, + "step": 16068 + }, + { + "epoch": 0.5021875, + "grad_norm": 3.265625, + "grad_norm_var": 0.019977823893229166, + "learning_rate": 0.0001, + "loss": 5.8691, + "loss/crossentropy": 2.575567126274109, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1789635792374611, + "step": 16070 + }, + { + "epoch": 0.50225, + "grad_norm": 3.65625, + "grad_norm_var": 0.03778889973958333, + "learning_rate": 0.0001, + "loss": 5.7838, + "loss/crossentropy": 2.5296674966812134, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17462927103042603, + "step": 16072 + }, + { + "epoch": 0.5023125, + "grad_norm": 3.375, + "grad_norm_var": 0.042561848958333336, + "learning_rate": 0.0001, + "loss": 5.8879, + "loss/crossentropy": 2.6592808961868286, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17403249442577362, + "step": 16074 + }, + { + "epoch": 0.502375, + "grad_norm": 3.03125, + "grad_norm_var": 0.03680013020833333, + "learning_rate": 0.0001, + "loss": 5.9504, + "loss/crossentropy": 2.6783300638198853, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17564692348241806, + "step": 16076 + }, + { + "epoch": 0.5024375, + "grad_norm": 3.171875, + "grad_norm_var": 0.03975321451822917, + "learning_rate": 0.0001, + "loss": 5.6857, + "loss/crossentropy": 2.4860860109329224, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1676216870546341, + "step": 16078 + }, + { + "epoch": 0.5025, + "grad_norm": 3.21875, + "grad_norm_var": 0.039876302083333336, + "learning_rate": 0.0001, + "loss": 5.7396, + "loss/crossentropy": 2.6158945560455322, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1647173911333084, + "step": 16080 + }, + { + "epoch": 0.5025625, + "grad_norm": 2.90625, + "grad_norm_var": 0.045042928059895834, + "learning_rate": 0.0001, + "loss": 5.8268, + "loss/crossentropy": 2.5639734268188477, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1774589642882347, + "step": 16082 + }, + { + "epoch": 0.502625, + "grad_norm": 3.234375, + "grad_norm_var": 0.064697265625, + "learning_rate": 0.0001, + "loss": 5.4842, + "loss/crossentropy": 2.3451225757598877, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16429540514945984, + "step": 16084 + }, + { + "epoch": 0.5026875, + "grad_norm": 3.21875, + "grad_norm_var": 0.06734110514322916, + "learning_rate": 0.0001, + "loss": 5.8722, + "loss/crossentropy": 2.6091232299804688, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17552156001329422, + "step": 16086 + }, + { + "epoch": 0.50275, + "grad_norm": 3.078125, + "grad_norm_var": 0.07672119140625, + "learning_rate": 0.0001, + "loss": 5.5526, + "loss/crossentropy": 2.481239438056946, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15948085486888885, + "step": 16088 + }, + { + "epoch": 0.5028125, + "grad_norm": 3.15625, + "grad_norm_var": 0.07282613118489584, + "learning_rate": 0.0001, + "loss": 5.8227, + "loss/crossentropy": 2.5843098163604736, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1730574518442154, + "step": 16090 + }, + { + "epoch": 0.502875, + "grad_norm": 3.125, + "grad_norm_var": 0.0718170166015625, + "learning_rate": 0.0001, + "loss": 5.8152, + "loss/crossentropy": 2.566161870956421, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17646662145853043, + "step": 16092 + }, + { + "epoch": 0.5029375, + "grad_norm": 2.921875, + "grad_norm_var": 0.07415364583333334, + "learning_rate": 0.0001, + "loss": 5.346, + "loss/crossentropy": 2.282988429069519, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15864654630422592, + "step": 16094 + }, + { + "epoch": 0.503, + "grad_norm": 3.671875, + "grad_norm_var": 0.09406636555989584, + "learning_rate": 0.0001, + "loss": 5.6663, + "loss/crossentropy": 2.429767608642578, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17170406877994537, + "step": 16096 + }, + { + "epoch": 0.5030625, + "grad_norm": 3.4375, + "grad_norm_var": 0.09127197265625, + "learning_rate": 0.0001, + "loss": 5.8715, + "loss/crossentropy": 2.6788880825042725, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17121805995702744, + "step": 16098 + }, + { + "epoch": 0.503125, + "grad_norm": 3.15625, + "grad_norm_var": 0.06845703125, + "learning_rate": 0.0001, + "loss": 5.3718, + "loss/crossentropy": 2.3410524129867554, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16010761260986328, + "step": 16100 + }, + { + "epoch": 0.5031875, + "grad_norm": 2.734375, + "grad_norm_var": 0.07044169108072916, + "learning_rate": 0.0001, + "loss": 5.3351, + "loss/crossentropy": 2.381006360054016, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15399903804063797, + "step": 16102 + }, + { + "epoch": 0.50325, + "grad_norm": 2.890625, + "grad_norm_var": 0.06923421223958333, + "learning_rate": 0.0001, + "loss": 5.5096, + "loss/crossentropy": 2.404729962348938, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.15736467391252518, + "step": 16104 + }, + { + "epoch": 0.5033125, + "grad_norm": 3.3125, + "grad_norm_var": 0.07305399576822917, + "learning_rate": 0.0001, + "loss": 6.1712, + "loss/crossentropy": 2.8202677965164185, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.17923591285943985, + "step": 16106 + }, + { + "epoch": 0.503375, + "grad_norm": 3.375, + "grad_norm_var": 0.0749664306640625, + "learning_rate": 0.0001, + "loss": 6.0594, + "loss/crossentropy": 2.7203365564346313, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18273458629846573, + "step": 16108 + }, + { + "epoch": 0.5034375, + "grad_norm": 3.15625, + "grad_norm_var": 0.0721099853515625, + "learning_rate": 0.0001, + "loss": 5.546, + "loss/crossentropy": 2.3975898027420044, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1636715978384018, + "step": 16110 + }, + { + "epoch": 0.5035, + "grad_norm": 3.75, + "grad_norm_var": 0.07496337890625, + "learning_rate": 0.0001, + "loss": 5.8336, + "loss/crossentropy": 2.59959077835083, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.16871649026870728, + "step": 16112 + }, + { + "epoch": 0.5035625, + "grad_norm": 3.078125, + "grad_norm_var": 0.06832682291666667, + "learning_rate": 0.0001, + "loss": 5.6162, + "loss/crossentropy": 2.442840099334717, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16733119636774063, + "step": 16114 + }, + { + "epoch": 0.503625, + "grad_norm": 3.140625, + "grad_norm_var": 0.05947265625, + "learning_rate": 0.0001, + "loss": 5.7089, + "loss/crossentropy": 2.5029762983322144, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1733296662569046, + "step": 16116 + }, + { + "epoch": 0.5036875, + "grad_norm": 2.890625, + "grad_norm_var": 0.0436187744140625, + "learning_rate": 0.0001, + "loss": 5.5841, + "loss/crossentropy": 2.5070003271102905, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16161343455314636, + "step": 16118 + }, + { + "epoch": 0.50375, + "grad_norm": 3.234375, + "grad_norm_var": 0.03753255208333333, + "learning_rate": 0.0001, + "loss": 5.943, + "loss/crossentropy": 2.713122248649597, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17103470861911774, + "step": 16120 + }, + { + "epoch": 0.5038125, + "grad_norm": 3.140625, + "grad_norm_var": 0.03677978515625, + "learning_rate": 0.0001, + "loss": 5.6773, + "loss/crossentropy": 2.4745407104492188, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1722284033894539, + "step": 16122 + }, + { + "epoch": 0.503875, + "grad_norm": 2.890625, + "grad_norm_var": 0.04394124348958333, + "learning_rate": 0.0001, + "loss": 5.5866, + "loss/crossentropy": 2.469696044921875, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1632496565580368, + "step": 16124 + }, + { + "epoch": 0.5039375, + "grad_norm": 2.9375, + "grad_norm_var": 0.0474273681640625, + "learning_rate": 0.0001, + "loss": 5.536, + "loss/crossentropy": 2.4524052143096924, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1622677817940712, + "step": 16126 + }, + { + "epoch": 0.504, + "grad_norm": 3.21875, + "grad_norm_var": 0.022347005208333333, + "learning_rate": 0.0001, + "loss": 5.8276, + "loss/crossentropy": 2.58161997795105, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17655570805072784, + "step": 16128 + }, + { + "epoch": 0.5040625, + "grad_norm": 3.203125, + "grad_norm_var": 0.0209625244140625, + "learning_rate": 0.0001, + "loss": 5.6437, + "loss/crossentropy": 2.453909993171692, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16937338560819626, + "step": 16130 + }, + { + "epoch": 0.504125, + "grad_norm": 3.0, + "grad_norm_var": 0.021675618489583333, + "learning_rate": 0.0001, + "loss": 5.7802, + "loss/crossentropy": 2.6307448148727417, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1672848016023636, + "step": 16132 + }, + { + "epoch": 0.5041875, + "grad_norm": 3.171875, + "grad_norm_var": 0.018944295247395833, + "learning_rate": 0.0001, + "loss": 5.4908, + "loss/crossentropy": 2.3591588735580444, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1639464944601059, + "step": 16134 + }, + { + "epoch": 0.50425, + "grad_norm": 3.109375, + "grad_norm_var": 0.021198527018229166, + "learning_rate": 0.0001, + "loss": 5.7267, + "loss/crossentropy": 2.5237497091293335, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17303133010864258, + "step": 16136 + }, + { + "epoch": 0.5043125, + "grad_norm": 3.375, + "grad_norm_var": 0.024332682291666668, + "learning_rate": 0.0001, + "loss": 5.936, + "loss/crossentropy": 2.6650710105895996, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17983216047286987, + "step": 16138 + }, + { + "epoch": 0.504375, + "grad_norm": 3.171875, + "grad_norm_var": 0.014094034830729166, + "learning_rate": 0.0001, + "loss": 5.7133, + "loss/crossentropy": 2.5225549936294556, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16829188913106918, + "step": 16140 + }, + { + "epoch": 0.5044375, + "grad_norm": 3.3125, + "grad_norm_var": 0.011970011393229167, + "learning_rate": 0.0001, + "loss": 5.9035, + "loss/crossentropy": 2.6384806632995605, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1757199615240097, + "step": 16142 + }, + { + "epoch": 0.5045, + "grad_norm": 3.328125, + "grad_norm_var": 0.013590494791666666, + "learning_rate": 0.0001, + "loss": 6.0729, + "loss/crossentropy": 2.7575117349624634, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1749001070857048, + "step": 16144 + }, + { + "epoch": 0.5045625, + "grad_norm": 3.3125, + "grad_norm_var": 0.016120402018229167, + "learning_rate": 0.0001, + "loss": 6.0004, + "loss/crossentropy": 2.742537498474121, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17773668467998505, + "step": 16146 + }, + { + "epoch": 0.504625, + "grad_norm": 3.046875, + "grad_norm_var": 0.0156158447265625, + "learning_rate": 0.0001, + "loss": 5.4671, + "loss/crossentropy": 2.3550872802734375, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16588659584522247, + "step": 16148 + }, + { + "epoch": 0.5046875, + "grad_norm": 3.140625, + "grad_norm_var": 0.014069620768229167, + "learning_rate": 0.0001, + "loss": 5.8368, + "loss/crossentropy": 2.667310953140259, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1716412603855133, + "step": 16150 + }, + { + "epoch": 0.50475, + "grad_norm": 3.046875, + "grad_norm_var": 0.0194732666015625, + "learning_rate": 0.0001, + "loss": 5.8639, + "loss/crossentropy": 2.605831503868103, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17034223675727844, + "step": 16152 + }, + { + "epoch": 0.5048125, + "grad_norm": 3.15625, + "grad_norm_var": 0.018745930989583333, + "learning_rate": 0.0001, + "loss": 5.7254, + "loss/crossentropy": 2.5661193132400513, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16670657694339752, + "step": 16154 + }, + { + "epoch": 0.504875, + "grad_norm": 3.28125, + "grad_norm_var": 0.0747955322265625, + "learning_rate": 0.0001, + "loss": 5.4511, + "loss/crossentropy": 2.3625839948654175, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.15963409841060638, + "step": 16156 + }, + { + "epoch": 0.5049375, + "grad_norm": 3.453125, + "grad_norm_var": 0.07726236979166666, + "learning_rate": 0.0001, + "loss": 5.7982, + "loss/crossentropy": 2.584004282951355, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17376431822776794, + "step": 16158 + }, + { + "epoch": 0.505, + "grad_norm": 3.03125, + "grad_norm_var": 0.08076070149739584, + "learning_rate": 0.0001, + "loss": 5.7408, + "loss/crossentropy": 2.628161072731018, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1651678830385208, + "step": 16160 + }, + { + "epoch": 0.5050625, + "grad_norm": 2.984375, + "grad_norm_var": 0.08547261555989584, + "learning_rate": 0.0001, + "loss": 6.0497, + "loss/crossentropy": 2.759259819984436, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1786520555615425, + "step": 16162 + }, + { + "epoch": 0.505125, + "grad_norm": 2.890625, + "grad_norm_var": 0.0950347900390625, + "learning_rate": 0.0001, + "loss": 5.7571, + "loss/crossentropy": 2.6068798303604126, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16697701811790466, + "step": 16164 + }, + { + "epoch": 0.5051875, + "grad_norm": 3.375, + "grad_norm_var": 0.09643452962239583, + "learning_rate": 0.0001, + "loss": 5.7515, + "loss/crossentropy": 2.4896483421325684, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17736003547906876, + "step": 16166 + }, + { + "epoch": 0.50525, + "grad_norm": 2.859375, + "grad_norm_var": 0.09853413899739584, + "learning_rate": 0.0001, + "loss": 5.7608, + "loss/crossentropy": 2.5751978158950806, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1693381741642952, + "step": 16168 + }, + { + "epoch": 0.5053125, + "grad_norm": 2.953125, + "grad_norm_var": 0.10435282389322917, + "learning_rate": 0.0001, + "loss": 5.7404, + "loss/crossentropy": 2.627490282058716, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16598249971866608, + "step": 16170 + }, + { + "epoch": 0.505375, + "grad_norm": 2.96875, + "grad_norm_var": 0.03892822265625, + "learning_rate": 0.0001, + "loss": 5.3349, + "loss/crossentropy": 2.360503673553467, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.1556451991200447, + "step": 16172 + }, + { + "epoch": 0.5054375, + "grad_norm": 2.921875, + "grad_norm_var": 0.032225545247395834, + "learning_rate": 0.0001, + "loss": 5.1259, + "loss/crossentropy": 2.2279410362243652, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.14838526397943497, + "step": 16174 + }, + { + "epoch": 0.5055, + "grad_norm": 3.15625, + "grad_norm_var": 0.0465240478515625, + "learning_rate": 0.0001, + "loss": 5.7103, + "loss/crossentropy": 2.4705700874328613, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17162708193063736, + "step": 16176 + }, + { + "epoch": 0.5055625, + "grad_norm": 3.046875, + "grad_norm_var": 0.039713541666666664, + "learning_rate": 0.0001, + "loss": 5.7458, + "loss/crossentropy": 2.562490940093994, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.170286163687706, + "step": 16178 + }, + { + "epoch": 0.505625, + "grad_norm": 4.28125, + "grad_norm_var": 0.12094624837239583, + "learning_rate": 0.0001, + "loss": 6.1504, + "loss/crossentropy": 2.6295450925827026, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.19856493920087814, + "step": 16180 + }, + { + "epoch": 0.5056875, + "grad_norm": 3.5625, + "grad_norm_var": 0.12541402180989583, + "learning_rate": 0.0001, + "loss": 6.0624, + "loss/crossentropy": 2.729319453239441, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18253156542778015, + "step": 16182 + }, + { + "epoch": 0.50575, + "grad_norm": 3.328125, + "grad_norm_var": 0.11858622233072917, + "learning_rate": 0.0001, + "loss": 5.6087, + "loss/crossentropy": 2.4301421642303467, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17059285938739777, + "step": 16184 + }, + { + "epoch": 0.5058125, + "grad_norm": 3.203125, + "grad_norm_var": 0.1106597900390625, + "learning_rate": 0.0001, + "loss": 5.8512, + "loss/crossentropy": 2.5526453256607056, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1767268180847168, + "step": 16186 + }, + { + "epoch": 0.505875, + "grad_norm": 3.0625, + "grad_norm_var": 0.11020406087239583, + "learning_rate": 0.0001, + "loss": 5.6925, + "loss/crossentropy": 2.5582053661346436, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1665545403957367, + "step": 16188 + }, + { + "epoch": 0.5059375, + "grad_norm": 3.078125, + "grad_norm_var": 0.10537109375, + "learning_rate": 0.0001, + "loss": 5.6352, + "loss/crossentropy": 2.477321147918701, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16696136444807053, + "step": 16190 + }, + { + "epoch": 0.506, + "grad_norm": 3.265625, + "grad_norm_var": 0.11269429524739584, + "learning_rate": 0.0001, + "loss": 6.1459, + "loss/crossentropy": 2.7620056867599487, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18604668229818344, + "step": 16192 + }, + { + "epoch": 0.5060625, + "grad_norm": 3.234375, + "grad_norm_var": 0.11366780598958333, + "learning_rate": 0.0001, + "loss": 5.8388, + "loss/crossentropy": 2.644774913787842, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17174355685710907, + "step": 16194 + }, + { + "epoch": 0.506125, + "grad_norm": 3.0, + "grad_norm_var": 0.044733683268229164, + "learning_rate": 0.0001, + "loss": 5.802, + "loss/crossentropy": 2.6588616371154785, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16900236904621124, + "step": 16196 + }, + { + "epoch": 0.5061875, + "grad_norm": 3.0625, + "grad_norm_var": 0.03515218098958333, + "learning_rate": 0.0001, + "loss": 5.7907, + "loss/crossentropy": 2.557256817817688, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17295359075069427, + "step": 16198 + }, + { + "epoch": 0.50625, + "grad_norm": 3.0625, + "grad_norm_var": 0.06510416666666667, + "learning_rate": 0.0001, + "loss": 6.0178, + "loss/crossentropy": 2.6430050134658813, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.18162298202514648, + "step": 16200 + }, + { + "epoch": 0.5063125, + "grad_norm": 3.046875, + "grad_norm_var": 0.06559956868489583, + "learning_rate": 0.0001, + "loss": 5.839, + "loss/crossentropy": 2.69626522064209, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16622426360845566, + "step": 16202 + }, + { + "epoch": 0.506375, + "grad_norm": 3.328125, + "grad_norm_var": 0.06236063639322917, + "learning_rate": 0.0001, + "loss": 5.6734, + "loss/crossentropy": 2.517215371131897, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1671859174966812, + "step": 16204 + }, + { + "epoch": 0.5064375, + "grad_norm": 3.34375, + "grad_norm_var": 0.06251627604166667, + "learning_rate": 0.0001, + "loss": 5.733, + "loss/crossentropy": 2.5226190090179443, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1726020723581314, + "step": 16206 + }, + { + "epoch": 0.5065, + "grad_norm": 3.0, + "grad_norm_var": 0.04901936848958333, + "learning_rate": 0.0001, + "loss": 5.6748, + "loss/crossentropy": 2.5151621103286743, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16869834065437317, + "step": 16208 + }, + { + "epoch": 0.5065625, + "grad_norm": 3.09375, + "grad_norm_var": 0.0471832275390625, + "learning_rate": 0.0001, + "loss": 5.7949, + "loss/crossentropy": 2.5869085788726807, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1696242317557335, + "step": 16210 + }, + { + "epoch": 0.506625, + "grad_norm": 2.875, + "grad_norm_var": 0.052392578125, + "learning_rate": 0.0001, + "loss": 6.018, + "loss/crossentropy": 2.7579660415649414, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17404796928167343, + "step": 16212 + }, + { + "epoch": 0.5066875, + "grad_norm": 3.21875, + "grad_norm_var": 0.0533111572265625, + "learning_rate": 0.0001, + "loss": 5.3567, + "loss/crossentropy": 2.330523729324341, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15456832945346832, + "step": 16214 + }, + { + "epoch": 0.50675, + "grad_norm": 3.203125, + "grad_norm_var": 0.0234771728515625, + "learning_rate": 0.0001, + "loss": 5.8171, + "loss/crossentropy": 2.6442378759384155, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16806329041719437, + "step": 16216 + }, + { + "epoch": 0.5068125, + "grad_norm": 3.21875, + "grad_norm_var": 0.023908487955729165, + "learning_rate": 0.0001, + "loss": 5.834, + "loss/crossentropy": 2.5568329095840454, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.18006138503551483, + "step": 16218 + }, + { + "epoch": 0.506875, + "grad_norm": 3.234375, + "grad_norm_var": 0.0215728759765625, + "learning_rate": 0.0001, + "loss": 5.9564, + "loss/crossentropy": 2.6573452949523926, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.18029828369617462, + "step": 16220 + }, + { + "epoch": 0.5069375, + "grad_norm": 3.171875, + "grad_norm_var": 0.023844401041666668, + "learning_rate": 0.0001, + "loss": 5.593, + "loss/crossentropy": 2.477585196495056, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1638849675655365, + "step": 16222 + }, + { + "epoch": 0.507, + "grad_norm": 3.046875, + "grad_norm_var": 0.0238922119140625, + "learning_rate": 0.0001, + "loss": 5.7869, + "loss/crossentropy": 2.584786057472229, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1690370962023735, + "step": 16224 + }, + { + "epoch": 0.5070625, + "grad_norm": 3.375, + "grad_norm_var": 0.025462849934895834, + "learning_rate": 0.0001, + "loss": 6.0407, + "loss/crossentropy": 2.7082419395446777, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1797330230474472, + "step": 16226 + }, + { + "epoch": 0.507125, + "grad_norm": 3.390625, + "grad_norm_var": 0.019527180989583334, + "learning_rate": 0.0001, + "loss": 5.6953, + "loss/crossentropy": 2.4971712827682495, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17020606994628906, + "step": 16228 + }, + { + "epoch": 0.5071875, + "grad_norm": 3.3125, + "grad_norm_var": 0.017964680989583332, + "learning_rate": 0.0001, + "loss": 5.7703, + "loss/crossentropy": 2.5960510969161987, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16898278892040253, + "step": 16230 + }, + { + "epoch": 0.50725, + "grad_norm": 3.84375, + "grad_norm_var": 0.048046875, + "learning_rate": 0.0001, + "loss": 5.6682, + "loss/crossentropy": 2.416340708732605, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.16854875534772873, + "step": 16232 + }, + { + "epoch": 0.5073125, + "grad_norm": 3.203125, + "grad_norm_var": 0.04859619140625, + "learning_rate": 0.0001, + "loss": 5.8382, + "loss/crossentropy": 2.6066319942474365, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17589052021503448, + "step": 16234 + }, + { + "epoch": 0.507375, + "grad_norm": 2.859375, + "grad_norm_var": 0.07297770182291667, + "learning_rate": 0.0001, + "loss": 5.7389, + "loss/crossentropy": 2.500886917114258, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17418906837701797, + "step": 16236 + }, + { + "epoch": 0.5074375, + "grad_norm": 3.015625, + "grad_norm_var": 0.06868489583333333, + "learning_rate": 0.0001, + "loss": 5.6729, + "loss/crossentropy": 2.4839404821395874, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16733496636152267, + "step": 16238 + }, + { + "epoch": 0.5075, + "grad_norm": 3.125, + "grad_norm_var": 0.06521809895833333, + "learning_rate": 0.0001, + "loss": 5.6014, + "loss/crossentropy": 2.413141131401062, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1699993684887886, + "step": 16240 + }, + { + "epoch": 0.5075625, + "grad_norm": 3.140625, + "grad_norm_var": 0.06571858723958333, + "learning_rate": 0.0001, + "loss": 5.5904, + "loss/crossentropy": 2.5172938108444214, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16277482360601425, + "step": 16242 + }, + { + "epoch": 0.507625, + "grad_norm": 3.375, + "grad_norm_var": 0.06553446451822917, + "learning_rate": 0.0001, + "loss": 5.5444, + "loss/crossentropy": 2.3586833477020264, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.16466347128152847, + "step": 16244 + }, + { + "epoch": 0.5076875, + "grad_norm": 3.21875, + "grad_norm_var": 0.06585184733072917, + "learning_rate": 0.0001, + "loss": 5.8779, + "loss/crossentropy": 2.6398749351501465, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17497644573450089, + "step": 16246 + }, + { + "epoch": 0.50775, + "grad_norm": 2.890625, + "grad_norm_var": 0.04908447265625, + "learning_rate": 0.0001, + "loss": 5.7304, + "loss/crossentropy": 2.6413955688476562, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1628054976463318, + "step": 16248 + }, + { + "epoch": 0.5078125, + "grad_norm": 3.84375, + "grad_norm_var": 0.0750885009765625, + "learning_rate": 0.0001, + "loss": 5.7429, + "loss/crossentropy": 2.519925117492676, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1715191975235939, + "step": 16250 + }, + { + "epoch": 0.507875, + "grad_norm": 3.0625, + "grad_norm_var": 0.0544921875, + "learning_rate": 0.0001, + "loss": 5.3402, + "loss/crossentropy": 2.3426631689071655, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15404711663722992, + "step": 16252 + }, + { + "epoch": 0.5079375, + "grad_norm": 3.3125, + "grad_norm_var": 0.0565826416015625, + "learning_rate": 0.0001, + "loss": 5.7011, + "loss/crossentropy": 2.489269971847534, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1746997907757759, + "step": 16254 + }, + { + "epoch": 0.508, + "grad_norm": 3.578125, + "grad_norm_var": 0.06616923014322916, + "learning_rate": 0.0001, + "loss": 5.5958, + "loss/crossentropy": 2.401709198951721, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16979826986789703, + "step": 16256 + }, + { + "epoch": 0.5080625, + "grad_norm": 3.546875, + "grad_norm_var": 0.07195536295572917, + "learning_rate": 0.0001, + "loss": 5.8205, + "loss/crossentropy": 2.554845094680786, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17617210000753403, + "step": 16258 + }, + { + "epoch": 0.508125, + "grad_norm": 3.046875, + "grad_norm_var": 0.0723541259765625, + "learning_rate": 0.0001, + "loss": 5.5145, + "loss/crossentropy": 2.38235604763031, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16712379455566406, + "step": 16260 + }, + { + "epoch": 0.5081875, + "grad_norm": 3.375, + "grad_norm_var": 0.07903544108072917, + "learning_rate": 0.0001, + "loss": 5.5439, + "loss/crossentropy": 2.417195439338684, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1654004082083702, + "step": 16262 + }, + { + "epoch": 0.50825, + "grad_norm": 2.984375, + "grad_norm_var": 0.06752827962239584, + "learning_rate": 0.0001, + "loss": 5.3385, + "loss/crossentropy": 2.28506600856781, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1596416011452675, + "step": 16264 + }, + { + "epoch": 0.5083125, + "grad_norm": 3.234375, + "grad_norm_var": 0.04104410807291667, + "learning_rate": 0.0001, + "loss": 5.9388, + "loss/crossentropy": 2.6258240938186646, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1805119439959526, + "step": 16266 + }, + { + "epoch": 0.508375, + "grad_norm": 3.984375, + "grad_norm_var": 0.07288004557291666, + "learning_rate": 0.0001, + "loss": 5.7033, + "loss/crossentropy": 2.517643094062805, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16778413206338882, + "step": 16268 + }, + { + "epoch": 0.5084375, + "grad_norm": 3.125, + "grad_norm_var": 0.08831380208333334, + "learning_rate": 0.0001, + "loss": 5.906, + "loss/crossentropy": 2.567445397377014, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18346509337425232, + "step": 16270 + }, + { + "epoch": 0.5085, + "grad_norm": 3.546875, + "grad_norm_var": 0.08516337076822916, + "learning_rate": 0.0001, + "loss": 6.2019, + "loss/crossentropy": 2.753313660621643, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1901737079024315, + "step": 16272 + }, + { + "epoch": 0.5085625, + "grad_norm": 3.21875, + "grad_norm_var": 0.08279520670572917, + "learning_rate": 0.0001, + "loss": 5.8652, + "loss/crossentropy": 2.6811606884002686, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17191822826862335, + "step": 16274 + }, + { + "epoch": 0.508625, + "grad_norm": 3.0625, + "grad_norm_var": 0.08159891764322917, + "learning_rate": 0.0001, + "loss": 5.9666, + "loss/crossentropy": 2.694961190223694, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17715920507907867, + "step": 16276 + }, + { + "epoch": 0.5086875, + "grad_norm": 3.546875, + "grad_norm_var": 0.0837310791015625, + "learning_rate": 0.0001, + "loss": 5.9456, + "loss/crossentropy": 2.748932719230652, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17005664110183716, + "step": 16278 + }, + { + "epoch": 0.50875, + "grad_norm": 3.09375, + "grad_norm_var": 0.07838541666666667, + "learning_rate": 0.0001, + "loss": 5.7193, + "loss/crossentropy": 2.5305447578430176, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17122021317481995, + "step": 16280 + }, + { + "epoch": 0.5088125, + "grad_norm": 3.1875, + "grad_norm_var": 0.084375, + "learning_rate": 0.0001, + "loss": 5.5414, + "loss/crossentropy": 2.4110606908798218, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16459402441978455, + "step": 16282 + }, + { + "epoch": 0.508875, + "grad_norm": 3.328125, + "grad_norm_var": 0.048859659830729166, + "learning_rate": 0.0001, + "loss": 5.8926, + "loss/crossentropy": 2.5525397062301636, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1793188825249672, + "step": 16284 + }, + { + "epoch": 0.5089375, + "grad_norm": 3.359375, + "grad_norm_var": 0.032145182291666664, + "learning_rate": 0.0001, + "loss": 5.8081, + "loss/crossentropy": 2.6179057359695435, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16901445388793945, + "step": 16286 + }, + { + "epoch": 0.509, + "grad_norm": 3.078125, + "grad_norm_var": 0.028058878580729165, + "learning_rate": 0.0001, + "loss": 5.7786, + "loss/crossentropy": 2.521502375602722, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17531976848840714, + "step": 16288 + }, + { + "epoch": 0.5090625, + "grad_norm": 3.171875, + "grad_norm_var": 0.034684244791666666, + "learning_rate": 0.0001, + "loss": 5.9084, + "loss/crossentropy": 2.615760326385498, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17809134721755981, + "step": 16290 + }, + { + "epoch": 0.509125, + "grad_norm": 2.90625, + "grad_norm_var": 0.04004618326822917, + "learning_rate": 0.0001, + "loss": 5.8047, + "loss/crossentropy": 2.584061026573181, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1716773360967636, + "step": 16292 + }, + { + "epoch": 0.5091875, + "grad_norm": 3.28125, + "grad_norm_var": 0.03495992024739583, + "learning_rate": 0.0001, + "loss": 5.5849, + "loss/crossentropy": 2.4501447677612305, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1603526473045349, + "step": 16294 + }, + { + "epoch": 0.50925, + "grad_norm": 3.234375, + "grad_norm_var": 0.03400777180989583, + "learning_rate": 0.0001, + "loss": 5.839, + "loss/crossentropy": 2.6217808723449707, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17289509624242783, + "step": 16296 + }, + { + "epoch": 0.5093125, + "grad_norm": 3.125, + "grad_norm_var": 0.030745442708333334, + "learning_rate": 0.0001, + "loss": 5.7943, + "loss/crossentropy": 2.6022061109542847, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16647806018590927, + "step": 16298 + }, + { + "epoch": 0.509375, + "grad_norm": 3.453125, + "grad_norm_var": 0.033951822916666666, + "learning_rate": 0.0001, + "loss": 5.6414, + "loss/crossentropy": 2.5401326417922974, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16363850235939026, + "step": 16300 + }, + { + "epoch": 0.5094375, + "grad_norm": 3.34375, + "grad_norm_var": 0.033690388997395834, + "learning_rate": 0.0001, + "loss": 5.5292, + "loss/crossentropy": 2.426029086112976, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16265857219696045, + "step": 16302 + }, + { + "epoch": 0.5095, + "grad_norm": 3.140625, + "grad_norm_var": 0.028662109375, + "learning_rate": 0.0001, + "loss": 5.5658, + "loss/crossentropy": 2.4209630489349365, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1695614457130432, + "step": 16304 + }, + { + "epoch": 0.5095625, + "grad_norm": 3.0, + "grad_norm_var": 0.022542317708333332, + "learning_rate": 0.0001, + "loss": 5.6216, + "loss/crossentropy": 2.4654128551483154, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16601119190454483, + "step": 16306 + }, + { + "epoch": 0.509625, + "grad_norm": 3.28125, + "grad_norm_var": 0.017215983072916666, + "learning_rate": 0.0001, + "loss": 5.6048, + "loss/crossentropy": 2.4579662084579468, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16468004882335663, + "step": 16308 + }, + { + "epoch": 0.5096875, + "grad_norm": 3.015625, + "grad_norm_var": 0.014142862955729167, + "learning_rate": 0.0001, + "loss": 5.7639, + "loss/crossentropy": 2.6069202423095703, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1703835353255272, + "step": 16310 + }, + { + "epoch": 0.50975, + "grad_norm": 3.265625, + "grad_norm_var": 0.014241536458333334, + "learning_rate": 0.0001, + "loss": 5.9887, + "loss/crossentropy": 2.771964192390442, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1712872013449669, + "step": 16312 + }, + { + "epoch": 0.5098125, + "grad_norm": 3.25, + "grad_norm_var": 0.0157623291015625, + "learning_rate": 0.0001, + "loss": 5.5799, + "loss/crossentropy": 2.3990046977996826, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17160802334547043, + "step": 16314 + }, + { + "epoch": 0.509875, + "grad_norm": 3.46875, + "grad_norm_var": 0.015165201822916667, + "learning_rate": 0.0001, + "loss": 5.7934, + "loss/crossentropy": 2.5308748483657837, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17312680184841156, + "step": 16316 + }, + { + "epoch": 0.5099375, + "grad_norm": 3.34375, + "grad_norm_var": 0.016112263997395834, + "learning_rate": 0.0001, + "loss": 5.8915, + "loss/crossentropy": 2.6198123693466187, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17560896277427673, + "step": 16318 + }, + { + "epoch": 0.51, + "grad_norm": 2.984375, + "grad_norm_var": 0.028425089518229165, + "learning_rate": 0.0001, + "loss": 5.6471, + "loss/crossentropy": 2.49073326587677, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1640724316239357, + "step": 16320 + }, + { + "epoch": 0.5100625, + "grad_norm": 3.265625, + "grad_norm_var": 0.02587890625, + "learning_rate": 0.0001, + "loss": 5.9094, + "loss/crossentropy": 2.6102280616760254, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1783544421195984, + "step": 16322 + }, + { + "epoch": 0.510125, + "grad_norm": 2.96875, + "grad_norm_var": 0.030826822916666666, + "learning_rate": 0.0001, + "loss": 5.9036, + "loss/crossentropy": 2.6985751390457153, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17245478183031082, + "step": 16324 + }, + { + "epoch": 0.5101875, + "grad_norm": 3.34375, + "grad_norm_var": 0.0293121337890625, + "learning_rate": 0.0001, + "loss": 5.574, + "loss/crossentropy": 2.410964608192444, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16630808264017105, + "step": 16326 + }, + { + "epoch": 0.51025, + "grad_norm": 2.921875, + "grad_norm_var": 0.034764607747395836, + "learning_rate": 0.0001, + "loss": 5.5465, + "loss/crossentropy": 2.4809629917144775, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1628047674894333, + "step": 16328 + }, + { + "epoch": 0.5103125, + "grad_norm": 3.515625, + "grad_norm_var": 0.043000284830729166, + "learning_rate": 0.0001, + "loss": 5.8497, + "loss/crossentropy": 2.472634196281433, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.18106115609407425, + "step": 16330 + }, + { + "epoch": 0.510375, + "grad_norm": 3.078125, + "grad_norm_var": 0.050324503580729166, + "learning_rate": 0.0001, + "loss": 5.3682, + "loss/crossentropy": 2.3815958499908447, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1572549194097519, + "step": 16332 + }, + { + "epoch": 0.5104375, + "grad_norm": 3.296875, + "grad_norm_var": 0.052652994791666664, + "learning_rate": 0.0001, + "loss": 5.8791, + "loss/crossentropy": 2.515404224395752, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1816837266087532, + "step": 16334 + }, + { + "epoch": 0.5105, + "grad_norm": 3.1875, + "grad_norm_var": 0.0408111572265625, + "learning_rate": 0.0001, + "loss": 5.6377, + "loss/crossentropy": 2.485397219657898, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16796302050352097, + "step": 16336 + }, + { + "epoch": 0.5105625, + "grad_norm": 3.109375, + "grad_norm_var": 0.040576171875, + "learning_rate": 0.0001, + "loss": 5.6999, + "loss/crossentropy": 2.5654290914535522, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16579565405845642, + "step": 16338 + }, + { + "epoch": 0.510625, + "grad_norm": 3.125, + "grad_norm_var": 0.03681538899739583, + "learning_rate": 0.0001, + "loss": 5.9165, + "loss/crossentropy": 2.7041677236557007, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17357587069272995, + "step": 16340 + }, + { + "epoch": 0.5106875, + "grad_norm": 3.203125, + "grad_norm_var": 0.03600260416666667, + "learning_rate": 0.0001, + "loss": 5.8067, + "loss/crossentropy": 2.5463136434555054, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1740807741880417, + "step": 16342 + }, + { + "epoch": 0.51075, + "grad_norm": 2.96875, + "grad_norm_var": 0.03497721354166667, + "learning_rate": 0.0001, + "loss": 5.6408, + "loss/crossentropy": 2.499354839324951, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16765828430652618, + "step": 16344 + }, + { + "epoch": 0.5108125, + "grad_norm": 3.078125, + "grad_norm_var": 0.08043619791666666, + "learning_rate": 0.0001, + "loss": 6.1104, + "loss/crossentropy": 2.729995846748352, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1845257505774498, + "step": 16346 + }, + { + "epoch": 0.510875, + "grad_norm": 3.171875, + "grad_norm_var": 0.07063395182291667, + "learning_rate": 0.0001, + "loss": 5.9884, + "loss/crossentropy": 2.6871135234832764, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17856331914663315, + "step": 16348 + }, + { + "epoch": 0.5109375, + "grad_norm": 3.53125, + "grad_norm_var": 0.07433980305989583, + "learning_rate": 0.0001, + "loss": 6.2738, + "loss/crossentropy": 2.9303488731384277, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18083380907773972, + "step": 16350 + }, + { + "epoch": 0.511, + "grad_norm": 3.203125, + "grad_norm_var": 0.08007405598958334, + "learning_rate": 0.0001, + "loss": 5.7575, + "loss/crossentropy": 2.6469838619232178, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16339480131864548, + "step": 16352 + }, + { + "epoch": 0.5110625, + "grad_norm": 3.1875, + "grad_norm_var": 0.07778218587239584, + "learning_rate": 0.0001, + "loss": 5.962, + "loss/crossentropy": 2.700568914413452, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1769229918718338, + "step": 16354 + }, + { + "epoch": 0.511125, + "grad_norm": 2.859375, + "grad_norm_var": 0.08749593098958333, + "learning_rate": 0.0001, + "loss": 5.657, + "loss/crossentropy": 2.526244282722473, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16658874601125717, + "step": 16356 + }, + { + "epoch": 0.5111875, + "grad_norm": 3.0, + "grad_norm_var": 0.08982747395833333, + "learning_rate": 0.0001, + "loss": 5.4606, + "loss/crossentropy": 2.337547183036804, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16191042214632034, + "step": 16358 + }, + { + "epoch": 0.51125, + "grad_norm": 3.203125, + "grad_norm_var": 0.09379781087239583, + "learning_rate": 0.0001, + "loss": 5.814, + "loss/crossentropy": 2.6115304231643677, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16829267889261246, + "step": 16360 + }, + { + "epoch": 0.5113125, + "grad_norm": 3.28125, + "grad_norm_var": 0.04244791666666667, + "learning_rate": 0.0001, + "loss": 5.8401, + "loss/crossentropy": 2.6677355766296387, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16997096687555313, + "step": 16362 + }, + { + "epoch": 0.511375, + "grad_norm": 3.4375, + "grad_norm_var": 0.04417215983072917, + "learning_rate": 0.0001, + "loss": 5.9794, + "loss/crossentropy": 2.653276562690735, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18105435371398926, + "step": 16364 + }, + { + "epoch": 0.5114375, + "grad_norm": 3.203125, + "grad_norm_var": 0.03186442057291667, + "learning_rate": 0.0001, + "loss": 5.9588, + "loss/crossentropy": 2.6859607696533203, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.177672877907753, + "step": 16366 + }, + { + "epoch": 0.5115, + "grad_norm": 2.96875, + "grad_norm_var": 0.03181864420572917, + "learning_rate": 0.0001, + "loss": 5.5616, + "loss/crossentropy": 2.3682419061660767, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1720716878771782, + "step": 16368 + }, + { + "epoch": 0.5115625, + "grad_norm": 3.015625, + "grad_norm_var": 0.030549112955729166, + "learning_rate": 0.0001, + "loss": 5.9132, + "loss/crossentropy": 2.689664125442505, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1719593182206154, + "step": 16370 + }, + { + "epoch": 0.511625, + "grad_norm": 3.21875, + "grad_norm_var": 0.027457682291666667, + "learning_rate": 0.0001, + "loss": 5.1835, + "loss/crossentropy": 2.225640296936035, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15359832346439362, + "step": 16372 + }, + { + "epoch": 0.5116875, + "grad_norm": 3.03125, + "grad_norm_var": 0.034235636393229164, + "learning_rate": 0.0001, + "loss": 5.9472, + "loss/crossentropy": 2.6530286073684692, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1759035363793373, + "step": 16374 + }, + { + "epoch": 0.51175, + "grad_norm": 3.25, + "grad_norm_var": 0.0293853759765625, + "learning_rate": 0.0001, + "loss": 5.7623, + "loss/crossentropy": 2.4916436672210693, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17745761573314667, + "step": 16376 + }, + { + "epoch": 0.5118125, + "grad_norm": 2.984375, + "grad_norm_var": 0.026463826497395832, + "learning_rate": 0.0001, + "loss": 5.8484, + "loss/crossentropy": 2.694764733314514, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1645837500691414, + "step": 16378 + }, + { + "epoch": 0.511875, + "grad_norm": 3.234375, + "grad_norm_var": 0.022847493489583332, + "learning_rate": 0.0001, + "loss": 5.9255, + "loss/crossentropy": 2.6192431449890137, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17945070564746857, + "step": 16380 + }, + { + "epoch": 0.5119375, + "grad_norm": 3.109375, + "grad_norm_var": 0.0229156494140625, + "learning_rate": 0.0001, + "loss": 5.9855, + "loss/crossentropy": 2.7059956789016724, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1771649792790413, + "step": 16382 + }, + { + "epoch": 0.512, + "grad_norm": 2.9375, + "grad_norm_var": 0.022688802083333334, + "learning_rate": 0.0001, + "loss": 5.6971, + "loss/crossentropy": 2.569319248199463, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1659058779478073, + "step": 16384 + }, + { + "epoch": 0.5120625, + "grad_norm": 2.890625, + "grad_norm_var": 0.025341796875, + "learning_rate": 0.0001, + "loss": 5.7879, + "loss/crossentropy": 2.6588436365127563, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16564378887414932, + "step": 16386 + }, + { + "epoch": 0.512125, + "grad_norm": 3.09375, + "grad_norm_var": 0.024507649739583335, + "learning_rate": 0.0001, + "loss": 5.4127, + "loss/crossentropy": 2.313828706741333, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16105583310127258, + "step": 16388 + }, + { + "epoch": 0.5121875, + "grad_norm": 3.25, + "grad_norm_var": 0.0187896728515625, + "learning_rate": 0.0001, + "loss": 6.0885, + "loss/crossentropy": 2.743655562400818, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1844824254512787, + "step": 16390 + }, + { + "epoch": 0.51225, + "grad_norm": 3.046875, + "grad_norm_var": 0.018115234375, + "learning_rate": 0.0001, + "loss": 5.7395, + "loss/crossentropy": 2.5687350034713745, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16511885076761246, + "step": 16392 + }, + { + "epoch": 0.5123125, + "grad_norm": 3.375, + "grad_norm_var": 0.020308430989583334, + "learning_rate": 0.0001, + "loss": 5.714, + "loss/crossentropy": 2.506616234779358, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16917835175991058, + "step": 16394 + }, + { + "epoch": 0.512375, + "grad_norm": 3.0, + "grad_norm_var": 0.0195953369140625, + "learning_rate": 0.0001, + "loss": 5.8435, + "loss/crossentropy": 2.6784114837646484, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.169635571539402, + "step": 16396 + }, + { + "epoch": 0.5124375, + "grad_norm": 2.875, + "grad_norm_var": 0.0249664306640625, + "learning_rate": 0.0001, + "loss": 5.5176, + "loss/crossentropy": 2.4275470972061157, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16134928166866302, + "step": 16398 + }, + { + "epoch": 0.5125, + "grad_norm": 3.40625, + "grad_norm_var": 0.033447265625, + "learning_rate": 0.0001, + "loss": 5.7455, + "loss/crossentropy": 2.469146490097046, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17255249619483948, + "step": 16400 + }, + { + "epoch": 0.5125625, + "grad_norm": 3.09375, + "grad_norm_var": 0.029645792643229165, + "learning_rate": 0.0001, + "loss": 5.749, + "loss/crossentropy": 2.5529110431671143, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16999481618404388, + "step": 16402 + }, + { + "epoch": 0.512625, + "grad_norm": 3.390625, + "grad_norm_var": 0.030924479166666668, + "learning_rate": 0.0001, + "loss": 5.6468, + "loss/crossentropy": 2.4244545698165894, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16988955438137054, + "step": 16404 + }, + { + "epoch": 0.5126875, + "grad_norm": 3.671875, + "grad_norm_var": 0.04410400390625, + "learning_rate": 0.0001, + "loss": 5.8922, + "loss/crossentropy": 2.6311975717544556, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17922505736351013, + "step": 16406 + }, + { + "epoch": 0.51275, + "grad_norm": 3.0625, + "grad_norm_var": 0.0494140625, + "learning_rate": 0.0001, + "loss": 5.3777, + "loss/crossentropy": 2.3718478679656982, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15527284890413284, + "step": 16408 + }, + { + "epoch": 0.5128125, + "grad_norm": 3.078125, + "grad_norm_var": 0.047684733072916666, + "learning_rate": 0.0001, + "loss": 5.9091, + "loss/crossentropy": 2.7013646364212036, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17506567388772964, + "step": 16410 + }, + { + "epoch": 0.512875, + "grad_norm": 3.0625, + "grad_norm_var": 0.046662394205729166, + "learning_rate": 0.0001, + "loss": 5.9451, + "loss/crossentropy": 2.7101930379867554, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1734907478094101, + "step": 16412 + }, + { + "epoch": 0.5129375, + "grad_norm": 3.28125, + "grad_norm_var": 0.04045817057291667, + "learning_rate": 0.0001, + "loss": 5.7463, + "loss/crossentropy": 2.600379467010498, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16654404252767563, + "step": 16414 + }, + { + "epoch": 0.513, + "grad_norm": 3.0, + "grad_norm_var": 0.035008748372395836, + "learning_rate": 0.0001, + "loss": 5.331, + "loss/crossentropy": 2.3208197355270386, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15649104118347168, + "step": 16416 + }, + { + "epoch": 0.5130625, + "grad_norm": 4.3125, + "grad_norm_var": 0.11590169270833334, + "learning_rate": 0.0001, + "loss": 5.5822, + "loss/crossentropy": 2.4566595554351807, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16372348368167877, + "step": 16418 + }, + { + "epoch": 0.513125, + "grad_norm": 3.0, + "grad_norm_var": 0.11842447916666667, + "learning_rate": 0.0001, + "loss": 5.7917, + "loss/crossentropy": 2.640213131904602, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1663249060511589, + "step": 16420 + }, + { + "epoch": 0.5131875, + "grad_norm": 3.21875, + "grad_norm_var": 0.11492513020833334, + "learning_rate": 0.0001, + "loss": 6.2495, + "loss/crossentropy": 2.9241329431533813, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.18409772962331772, + "step": 16422 + }, + { + "epoch": 0.51325, + "grad_norm": 3.25, + "grad_norm_var": 0.10549214680989584, + "learning_rate": 0.0001, + "loss": 5.7499, + "loss/crossentropy": 2.5274248123168945, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17497973889112473, + "step": 16424 + }, + { + "epoch": 0.5133125, + "grad_norm": 3.265625, + "grad_norm_var": 0.10448811848958334, + "learning_rate": 0.0001, + "loss": 5.734, + "loss/crossentropy": 2.4651782512664795, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17453866451978683, + "step": 16426 + }, + { + "epoch": 0.513375, + "grad_norm": 3.046875, + "grad_norm_var": 0.2105133056640625, + "learning_rate": 0.0001, + "loss": 5.7713, + "loss/crossentropy": 2.5959800481796265, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.167924702167511, + "step": 16428 + }, + { + "epoch": 0.5134375, + "grad_norm": 3.015625, + "grad_norm_var": 0.209814453125, + "learning_rate": 0.0001, + "loss": 5.6805, + "loss/crossentropy": 2.50624418258667, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16859985888004303, + "step": 16430 + }, + { + "epoch": 0.5135, + "grad_norm": 3.1875, + "grad_norm_var": 0.20103251139322917, + "learning_rate": 0.0001, + "loss": 5.8464, + "loss/crossentropy": 2.643206238746643, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17188185453414917, + "step": 16432 + }, + { + "epoch": 0.5135625, + "grad_norm": 2.953125, + "grad_norm_var": 0.1538238525390625, + "learning_rate": 0.0001, + "loss": 5.4847, + "loss/crossentropy": 2.4751126766204834, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15759844332933426, + "step": 16434 + }, + { + "epoch": 0.513625, + "grad_norm": 2.796875, + "grad_norm_var": 0.16464742024739584, + "learning_rate": 0.0001, + "loss": 5.5015, + "loss/crossentropy": 2.44723117351532, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1585516333580017, + "step": 16436 + }, + { + "epoch": 0.5136875, + "grad_norm": 3.390625, + "grad_norm_var": 0.16023763020833334, + "learning_rate": 0.0001, + "loss": 5.8765, + "loss/crossentropy": 2.7064484357833862, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16895991563796997, + "step": 16438 + }, + { + "epoch": 0.51375, + "grad_norm": 3.109375, + "grad_norm_var": 0.16331380208333332, + "learning_rate": 0.0001, + "loss": 5.7598, + "loss/crossentropy": 2.588004469871521, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16834890842437744, + "step": 16440 + }, + { + "epoch": 0.5138125, + "grad_norm": 3.296875, + "grad_norm_var": 0.16309305826822917, + "learning_rate": 0.0001, + "loss": 5.699, + "loss/crossentropy": 2.5069788694381714, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1656866893172264, + "step": 16442 + }, + { + "epoch": 0.513875, + "grad_norm": 3.125, + "grad_norm_var": 0.0258697509765625, + "learning_rate": 0.0001, + "loss": 5.9672, + "loss/crossentropy": 2.789791703224182, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16930486261844635, + "step": 16444 + }, + { + "epoch": 0.5139375, + "grad_norm": 3.140625, + "grad_norm_var": 0.0309967041015625, + "learning_rate": 0.0001, + "loss": 5.55, + "loss/crossentropy": 2.3457083702087402, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.16652101278305054, + "step": 16446 + }, + { + "epoch": 0.514, + "grad_norm": 2.953125, + "grad_norm_var": 0.0341796875, + "learning_rate": 0.0001, + "loss": 5.8116, + "loss/crossentropy": 2.6447486877441406, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17020239681005478, + "step": 16448 + }, + { + "epoch": 0.5140625, + "grad_norm": 3.421875, + "grad_norm_var": 0.03357747395833333, + "learning_rate": 0.0001, + "loss": 5.6166, + "loss/crossentropy": 2.412296772003174, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1692551001906395, + "step": 16450 + }, + { + "epoch": 0.514125, + "grad_norm": 3.015625, + "grad_norm_var": 0.027437337239583335, + "learning_rate": 0.0001, + "loss": 5.6995, + "loss/crossentropy": 2.474939227104187, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17284496128559113, + "step": 16452 + }, + { + "epoch": 0.5141875, + "grad_norm": 2.953125, + "grad_norm_var": 0.0287261962890625, + "learning_rate": 0.0001, + "loss": 5.7438, + "loss/crossentropy": 2.4855151176452637, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17583084106445312, + "step": 16454 + }, + { + "epoch": 0.51425, + "grad_norm": 3.171875, + "grad_norm_var": 0.028327433268229167, + "learning_rate": 0.0001, + "loss": 5.8503, + "loss/crossentropy": 2.6569454669952393, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.174027219414711, + "step": 16456 + }, + { + "epoch": 0.5143125, + "grad_norm": 3.734375, + "grad_norm_var": 0.04450581868489583, + "learning_rate": 0.0001, + "loss": 5.8813, + "loss/crossentropy": 2.5460811853408813, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17805617302656174, + "step": 16458 + }, + { + "epoch": 0.514375, + "grad_norm": 3.140625, + "grad_norm_var": 0.0438629150390625, + "learning_rate": 0.0001, + "loss": 5.8218, + "loss/crossentropy": 2.6440343856811523, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17051490396261215, + "step": 16460 + }, + { + "epoch": 0.5144375, + "grad_norm": 3.125, + "grad_norm_var": 0.04116109212239583, + "learning_rate": 0.0001, + "loss": 5.5685, + "loss/crossentropy": 2.430554509162903, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16965046525001526, + "step": 16462 + }, + { + "epoch": 0.5145, + "grad_norm": 3.078125, + "grad_norm_var": 0.0409088134765625, + "learning_rate": 0.0001, + "loss": 5.8054, + "loss/crossentropy": 2.6475865840911865, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.167339488863945, + "step": 16464 + }, + { + "epoch": 0.5145625, + "grad_norm": 3.171875, + "grad_norm_var": 0.0390045166015625, + "learning_rate": 0.0001, + "loss": 5.7273, + "loss/crossentropy": 2.575814366340637, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16905974596738815, + "step": 16466 + }, + { + "epoch": 0.514625, + "grad_norm": 3.203125, + "grad_norm_var": 0.035054524739583336, + "learning_rate": 0.0001, + "loss": 5.819, + "loss/crossentropy": 2.6294543743133545, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17012812942266464, + "step": 16468 + }, + { + "epoch": 0.5146875, + "grad_norm": 2.921875, + "grad_norm_var": 0.03357645670572917, + "learning_rate": 0.0001, + "loss": 5.7404, + "loss/crossentropy": 2.5590656995773315, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17203693836927414, + "step": 16470 + }, + { + "epoch": 0.51475, + "grad_norm": 3.3125, + "grad_norm_var": 0.03512369791666667, + "learning_rate": 0.0001, + "loss": 5.9409, + "loss/crossentropy": 2.639412760734558, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17820008099079132, + "step": 16472 + }, + { + "epoch": 0.5148125, + "grad_norm": 2.984375, + "grad_norm_var": 0.016552734375, + "learning_rate": 0.0001, + "loss": 5.6727, + "loss/crossentropy": 2.60652494430542, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16208262741565704, + "step": 16474 + }, + { + "epoch": 0.514875, + "grad_norm": 3.296875, + "grad_norm_var": 0.020113118489583335, + "learning_rate": 0.0001, + "loss": 5.7368, + "loss/crossentropy": 2.5230515003204346, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1713699847459793, + "step": 16476 + }, + { + "epoch": 0.5149375, + "grad_norm": 3.15625, + "grad_norm_var": 0.0196685791015625, + "learning_rate": 0.0001, + "loss": 6.1404, + "loss/crossentropy": 2.8043954372406006, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18165170401334763, + "step": 16478 + }, + { + "epoch": 0.515, + "grad_norm": 3.0, + "grad_norm_var": 0.019627888997395832, + "learning_rate": 0.0001, + "loss": 5.7294, + "loss/crossentropy": 2.607026219367981, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1661418303847313, + "step": 16480 + }, + { + "epoch": 0.5150625, + "grad_norm": 3.203125, + "grad_norm_var": 0.025288899739583332, + "learning_rate": 0.0001, + "loss": 5.8784, + "loss/crossentropy": 2.613901972770691, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17645075917243958, + "step": 16482 + }, + { + "epoch": 0.515125, + "grad_norm": 2.84375, + "grad_norm_var": 0.030696614583333334, + "learning_rate": 0.0001, + "loss": 5.8019, + "loss/crossentropy": 2.6880619525909424, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16450658440589905, + "step": 16484 + }, + { + "epoch": 0.5151875, + "grad_norm": 3.15625, + "grad_norm_var": 0.027782185872395834, + "learning_rate": 0.0001, + "loss": 5.6577, + "loss/crossentropy": 2.528021812438965, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1657046154141426, + "step": 16486 + }, + { + "epoch": 0.51525, + "grad_norm": 2.96875, + "grad_norm_var": 0.027099609375, + "learning_rate": 0.0001, + "loss": 5.7887, + "loss/crossentropy": 2.5553163290023804, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17450721561908722, + "step": 16488 + }, + { + "epoch": 0.5153125, + "grad_norm": 3.0625, + "grad_norm_var": 0.0278472900390625, + "learning_rate": 0.0001, + "loss": 5.9336, + "loss/crossentropy": 2.7285076379776, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1708962619304657, + "step": 16490 + }, + { + "epoch": 0.515375, + "grad_norm": 3.40625, + "grad_norm_var": 0.03681233723958333, + "learning_rate": 0.0001, + "loss": 5.5904, + "loss/crossentropy": 2.3793158531188965, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17071735113859177, + "step": 16492 + }, + { + "epoch": 0.5154375, + "grad_norm": 2.953125, + "grad_norm_var": 0.039404296875, + "learning_rate": 0.0001, + "loss": 5.7508, + "loss/crossentropy": 2.6006277799606323, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16697286069393158, + "step": 16494 + }, + { + "epoch": 0.5155, + "grad_norm": 3.109375, + "grad_norm_var": 0.039118448893229164, + "learning_rate": 0.0001, + "loss": 5.6516, + "loss/crossentropy": 2.445363163948059, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16827547550201416, + "step": 16496 + }, + { + "epoch": 0.5155625, + "grad_norm": 3.15625, + "grad_norm_var": 0.037775675455729164, + "learning_rate": 0.0001, + "loss": 5.6302, + "loss/crossentropy": 2.4687918424606323, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16887445747852325, + "step": 16498 + }, + { + "epoch": 0.515625, + "grad_norm": 3.53125, + "grad_norm_var": 0.2933878580729167, + "learning_rate": 0.0001, + "loss": 6.2691, + "loss/crossentropy": 2.774771213531494, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.1924007385969162, + "step": 16500 + }, + { + "epoch": 0.5156875, + "grad_norm": 3.421875, + "grad_norm_var": 0.44541727701822914, + "learning_rate": 0.0001, + "loss": 6.2772, + "loss/crossentropy": 2.7692209482192993, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19377101957798004, + "step": 16502 + }, + { + "epoch": 0.51575, + "grad_norm": 3.40625, + "grad_norm_var": 0.4248860677083333, + "learning_rate": 0.0001, + "loss": 5.8903, + "loss/crossentropy": 2.6010122299194336, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17775394767522812, + "step": 16504 + }, + { + "epoch": 0.5158125, + "grad_norm": 2.859375, + "grad_norm_var": 0.44664306640625, + "learning_rate": 0.0001, + "loss": 5.7192, + "loss/crossentropy": 2.5332034826278687, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16977231949567795, + "step": 16506 + }, + { + "epoch": 0.515875, + "grad_norm": 3.09375, + "grad_norm_var": 0.45901285807291664, + "learning_rate": 0.0001, + "loss": 5.5482, + "loss/crossentropy": 2.365523934364319, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1690482795238495, + "step": 16508 + }, + { + "epoch": 0.5159375, + "grad_norm": 3.328125, + "grad_norm_var": 111.96536356608073, + "learning_rate": 0.0001, + "loss": 6.7191, + "loss/crossentropy": 2.786882162094116, + "loss/hidden": 1.6640625, + "loss/jsd": 0.0, + "loss/logits": 0.22681649774312973, + "step": 16510 + }, + { + "epoch": 0.516, + "grad_norm": 3.09375, + "grad_norm_var": 112.07008056640625, + "learning_rate": 0.0001, + "loss": 6.0112, + "loss/crossentropy": 2.7462023496627808, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17689534276723862, + "step": 16512 + }, + { + "epoch": 0.5160625, + "grad_norm": 3.234375, + "grad_norm_var": 112.233447265625, + "learning_rate": 0.0001, + "loss": 5.6887, + "loss/crossentropy": 2.5591888427734375, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16490060836076736, + "step": 16514 + }, + { + "epoch": 0.516125, + "grad_norm": 3.109375, + "grad_norm_var": 112.88609110514322, + "learning_rate": 0.0001, + "loss": 6.0241, + "loss/crossentropy": 2.7814793586730957, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17504484206438065, + "step": 16516 + }, + { + "epoch": 0.5161875, + "grad_norm": 3.046875, + "grad_norm_var": 113.45001525878907, + "learning_rate": 0.0001, + "loss": 5.9697, + "loss/crossentropy": 2.6906793117523193, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17555953562259674, + "step": 16518 + }, + { + "epoch": 0.51625, + "grad_norm": 3.0, + "grad_norm_var": 113.58023986816406, + "learning_rate": 0.0001, + "loss": 5.9051, + "loss/crossentropy": 2.697845458984375, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17345809936523438, + "step": 16520 + }, + { + "epoch": 0.5163125, + "grad_norm": 3.0625, + "grad_norm_var": 113.61848958333333, + "learning_rate": 0.0001, + "loss": 5.4842, + "loss/crossentropy": 2.4003665447235107, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1622883379459381, + "step": 16522 + }, + { + "epoch": 0.516375, + "grad_norm": 3.03125, + "grad_norm_var": 113.70017801920572, + "learning_rate": 0.0001, + "loss": 6.0258, + "loss/crossentropy": 2.810461163520813, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17153704166412354, + "step": 16524 + }, + { + "epoch": 0.5164375, + "grad_norm": 3.21875, + "grad_norm_var": 0.0197662353515625, + "learning_rate": 0.0001, + "loss": 5.4931, + "loss/crossentropy": 2.37578809261322, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16485276818275452, + "step": 16526 + }, + { + "epoch": 0.5165, + "grad_norm": 3.171875, + "grad_norm_var": 0.020905558268229166, + "learning_rate": 0.0001, + "loss": 5.5587, + "loss/crossentropy": 2.4451950788497925, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1636950746178627, + "step": 16528 + }, + { + "epoch": 0.5165625, + "grad_norm": 2.953125, + "grad_norm_var": 0.018082682291666666, + "learning_rate": 0.0001, + "loss": 5.6793, + "loss/crossentropy": 2.548762559890747, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16149181127548218, + "step": 16530 + }, + { + "epoch": 0.516625, + "grad_norm": 3.078125, + "grad_norm_var": 0.018049112955729165, + "learning_rate": 0.0001, + "loss": 5.7365, + "loss/crossentropy": 2.585257411003113, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16629429906606674, + "step": 16532 + }, + { + "epoch": 0.5166875, + "grad_norm": 2.96875, + "grad_norm_var": 0.0184722900390625, + "learning_rate": 0.0001, + "loss": 5.5841, + "loss/crossentropy": 2.5218945741653442, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16012202203273773, + "step": 16534 + }, + { + "epoch": 0.51675, + "grad_norm": 3.34375, + "grad_norm_var": 0.019364420572916666, + "learning_rate": 0.0001, + "loss": 5.6886, + "loss/crossentropy": 2.5698297023773193, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16695934534072876, + "step": 16536 + }, + { + "epoch": 0.5168125, + "grad_norm": 3.0625, + "grad_norm_var": 0.020783487955729166, + "learning_rate": 0.0001, + "loss": 5.8641, + "loss/crossentropy": 2.625547170639038, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.173465296626091, + "step": 16538 + }, + { + "epoch": 0.516875, + "grad_norm": 3.390625, + "grad_norm_var": 0.025031534830729167, + "learning_rate": 0.0001, + "loss": 5.9813, + "loss/crossentropy": 2.717432975769043, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17677276581525803, + "step": 16540 + }, + { + "epoch": 0.5169375, + "grad_norm": 3.203125, + "grad_norm_var": 0.020807902018229168, + "learning_rate": 0.0001, + "loss": 5.7959, + "loss/crossentropy": 2.5902841091156006, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16938591748476028, + "step": 16542 + }, + { + "epoch": 0.517, + "grad_norm": 3.109375, + "grad_norm_var": 0.05287984212239583, + "learning_rate": 0.0001, + "loss": 6.015, + "loss/crossentropy": 2.6680736541748047, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18039973080158234, + "step": 16544 + }, + { + "epoch": 0.5170625, + "grad_norm": 2.984375, + "grad_norm_var": 0.05230712890625, + "learning_rate": 0.0001, + "loss": 5.6888, + "loss/crossentropy": 2.6125682592391968, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16308893263339996, + "step": 16546 + }, + { + "epoch": 0.517125, + "grad_norm": 3.125, + "grad_norm_var": 0.0503814697265625, + "learning_rate": 0.0001, + "loss": 5.532, + "loss/crossentropy": 2.4145188331604004, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16877657175064087, + "step": 16548 + }, + { + "epoch": 0.5171875, + "grad_norm": 3.125, + "grad_norm_var": 0.045361328125, + "learning_rate": 0.0001, + "loss": 5.7908, + "loss/crossentropy": 2.5912692546844482, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17034027725458145, + "step": 16550 + }, + { + "epoch": 0.51725, + "grad_norm": 3.09375, + "grad_norm_var": 0.04692281087239583, + "learning_rate": 0.0001, + "loss": 5.7964, + "loss/crossentropy": 2.607316017150879, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17281020432710648, + "step": 16552 + }, + { + "epoch": 0.5173125, + "grad_norm": 3.015625, + "grad_norm_var": 0.045638020833333334, + "learning_rate": 0.0001, + "loss": 5.7138, + "loss/crossentropy": 2.5501874685287476, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16597293317317963, + "step": 16554 + }, + { + "epoch": 0.517375, + "grad_norm": 2.796875, + "grad_norm_var": 0.052083333333333336, + "learning_rate": 0.0001, + "loss": 5.7712, + "loss/crossentropy": 2.677625060081482, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.162484772503376, + "step": 16556 + }, + { + "epoch": 0.5174375, + "grad_norm": 3.6875, + "grad_norm_var": 0.07293192545572917, + "learning_rate": 0.0001, + "loss": 5.9425, + "loss/crossentropy": 2.6705286502838135, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1748487800359726, + "step": 16558 + }, + { + "epoch": 0.5175, + "grad_norm": 2.984375, + "grad_norm_var": 0.03701070149739583, + "learning_rate": 0.0001, + "loss": 5.6548, + "loss/crossentropy": 2.520450472831726, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1681206300854683, + "step": 16560 + }, + { + "epoch": 0.5175625, + "grad_norm": 3.1875, + "grad_norm_var": 0.040201822916666664, + "learning_rate": 0.0001, + "loss": 5.8293, + "loss/crossentropy": 2.5363374948501587, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17616698145866394, + "step": 16562 + }, + { + "epoch": 0.517625, + "grad_norm": 3.015625, + "grad_norm_var": 0.04423828125, + "learning_rate": 0.0001, + "loss": 5.6035, + "loss/crossentropy": 2.4566866159439087, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16233892738819122, + "step": 16564 + }, + { + "epoch": 0.5176875, + "grad_norm": 3.171875, + "grad_norm_var": 0.04421284993489583, + "learning_rate": 0.0001, + "loss": 5.8993, + "loss/crossentropy": 2.6745909452438354, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17051292955875397, + "step": 16566 + }, + { + "epoch": 0.51775, + "grad_norm": 5.4375, + "grad_norm_var": 0.37661844889322915, + "learning_rate": 0.0001, + "loss": 6.1697, + "loss/crossentropy": 2.627561092376709, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1995277926325798, + "step": 16568 + }, + { + "epoch": 0.5178125, + "grad_norm": 3.203125, + "grad_norm_var": 0.38728739420572916, + "learning_rate": 0.0001, + "loss": 5.5743, + "loss/crossentropy": 2.4943583011627197, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16424017399549484, + "step": 16570 + }, + { + "epoch": 0.517875, + "grad_norm": 3.3125, + "grad_norm_var": 0.3738352457682292, + "learning_rate": 0.0001, + "loss": 6.0507, + "loss/crossentropy": 2.6343544721603394, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1869424656033516, + "step": 16572 + }, + { + "epoch": 0.5179375, + "grad_norm": 3.484375, + "grad_norm_var": 0.3589680989583333, + "learning_rate": 0.0001, + "loss": 5.8843, + "loss/crossentropy": 2.6096925735473633, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1758989542722702, + "step": 16574 + }, + { + "epoch": 0.518, + "grad_norm": 3.9375, + "grad_norm_var": 24.849951171875, + "learning_rate": 0.0001, + "loss": 6.2974, + "loss/crossentropy": 2.713488817214966, + "loss/hidden": 1.59375, + "loss/jsd": 0.0, + "loss/logits": 0.19901321083307266, + "step": 16576 + }, + { + "epoch": 0.5180625, + "grad_norm": 3.375, + "grad_norm_var": 24.806192016601564, + "learning_rate": 0.0001, + "loss": 5.9657, + "loss/crossentropy": 2.6843067407608032, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17735348641872406, + "step": 16578 + }, + { + "epoch": 0.518125, + "grad_norm": 3.234375, + "grad_norm_var": 24.819852701822917, + "learning_rate": 0.0001, + "loss": 5.4799, + "loss/crossentropy": 2.402770161628723, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.15810411423444748, + "step": 16580 + }, + { + "epoch": 0.5181875, + "grad_norm": 3.71875, + "grad_norm_var": 24.695731608072915, + "learning_rate": 0.0001, + "loss": 5.8796, + "loss/crossentropy": 2.625036835670471, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17506562173366547, + "step": 16582 + }, + { + "epoch": 0.51825, + "grad_norm": 2.953125, + "grad_norm_var": 24.863863118489583, + "learning_rate": 0.0001, + "loss": 5.6303, + "loss/crossentropy": 2.524322748184204, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16176536679267883, + "step": 16584 + }, + { + "epoch": 0.5183125, + "grad_norm": 3.125, + "grad_norm_var": 24.795340983072915, + "learning_rate": 0.0001, + "loss": 5.6144, + "loss/crossentropy": 2.438259243965149, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17034796625375748, + "step": 16586 + }, + { + "epoch": 0.518375, + "grad_norm": 2.921875, + "grad_norm_var": 25.035846964518228, + "learning_rate": 0.0001, + "loss": 5.2784, + "loss/crossentropy": 2.328306198120117, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15125726163387299, + "step": 16588 + }, + { + "epoch": 0.5184375, + "grad_norm": 2.96875, + "grad_norm_var": 25.15708719889323, + "learning_rate": 0.0001, + "loss": 5.8799, + "loss/crossentropy": 2.670527696609497, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17250019311904907, + "step": 16590 + }, + { + "epoch": 0.5185, + "grad_norm": 3.125, + "grad_norm_var": 0.05158589680989583, + "learning_rate": 0.0001, + "loss": 5.8831, + "loss/crossentropy": 2.6243882179260254, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17665047943592072, + "step": 16592 + }, + { + "epoch": 0.5185625, + "grad_norm": 3.265625, + "grad_norm_var": 0.06926676432291666, + "learning_rate": 0.0001, + "loss": 5.6799, + "loss/crossentropy": 2.4813830852508545, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16868449747562408, + "step": 16594 + }, + { + "epoch": 0.518625, + "grad_norm": 3.296875, + "grad_norm_var": 0.06985270182291667, + "learning_rate": 0.0001, + "loss": 5.6264, + "loss/crossentropy": 2.480758786201477, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16495513916015625, + "step": 16596 + }, + { + "epoch": 0.5186875, + "grad_norm": 3.125, + "grad_norm_var": 0.050658162434895834, + "learning_rate": 0.0001, + "loss": 5.8854, + "loss/crossentropy": 2.6669927835464478, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17223212867975235, + "step": 16598 + }, + { + "epoch": 0.51875, + "grad_norm": 2.875, + "grad_norm_var": 0.049267578125, + "learning_rate": 0.0001, + "loss": 5.5787, + "loss/crossentropy": 2.5182470083236694, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16229919344186783, + "step": 16600 + }, + { + "epoch": 0.5188125, + "grad_norm": 3.171875, + "grad_norm_var": 0.050614420572916666, + "learning_rate": 0.0001, + "loss": 5.6494, + "loss/crossentropy": 2.543129086494446, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1645297035574913, + "step": 16602 + }, + { + "epoch": 0.518875, + "grad_norm": 3.34375, + "grad_norm_var": 0.04119466145833333, + "learning_rate": 0.0001, + "loss": 5.7711, + "loss/crossentropy": 2.5774060487747192, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17054499685764313, + "step": 16604 + }, + { + "epoch": 0.5189375, + "grad_norm": 3.40625, + "grad_norm_var": 0.0501373291015625, + "learning_rate": 0.0001, + "loss": 6.0801, + "loss/crossentropy": 2.738303065299988, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18144899606704712, + "step": 16606 + }, + { + "epoch": 0.519, + "grad_norm": 3.15625, + "grad_norm_var": 0.05136311848958333, + "learning_rate": 0.0001, + "loss": 5.7269, + "loss/crossentropy": 2.525917887687683, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1700976938009262, + "step": 16608 + }, + { + "epoch": 0.5190625, + "grad_norm": 3.046875, + "grad_norm_var": 0.0331695556640625, + "learning_rate": 0.0001, + "loss": 5.8912, + "loss/crossentropy": 2.6941726207733154, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1736096292734146, + "step": 16610 + }, + { + "epoch": 0.519125, + "grad_norm": 3.484375, + "grad_norm_var": 0.0389801025390625, + "learning_rate": 0.0001, + "loss": 5.8374, + "loss/crossentropy": 2.6021909713745117, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17352460324764252, + "step": 16612 + }, + { + "epoch": 0.5191875, + "grad_norm": 3.1875, + "grad_norm_var": 0.03847554524739583, + "learning_rate": 0.0001, + "loss": 5.8745, + "loss/crossentropy": 2.602517247200012, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1783716008067131, + "step": 16614 + }, + { + "epoch": 0.51925, + "grad_norm": 3.09375, + "grad_norm_var": 0.03681538899739583, + "learning_rate": 0.0001, + "loss": 5.9364, + "loss/crossentropy": 2.758184313774109, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16938883066177368, + "step": 16616 + }, + { + "epoch": 0.5193125, + "grad_norm": 3.203125, + "grad_norm_var": 0.03352762858072917, + "learning_rate": 0.0001, + "loss": 6.0031, + "loss/crossentropy": 2.6977959871292114, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1813122108578682, + "step": 16618 + }, + { + "epoch": 0.519375, + "grad_norm": 2.828125, + "grad_norm_var": 0.04378153483072917, + "learning_rate": 0.0001, + "loss": 5.5473, + "loss/crossentropy": 2.456685423851013, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16179471462965012, + "step": 16620 + }, + { + "epoch": 0.5194375, + "grad_norm": 3.265625, + "grad_norm_var": 0.032242838541666666, + "learning_rate": 0.0001, + "loss": 5.5945, + "loss/crossentropy": 2.4643516540527344, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16379916667938232, + "step": 16622 + }, + { + "epoch": 0.5195, + "grad_norm": 3.609375, + "grad_norm_var": 0.05373942057291667, + "learning_rate": 0.0001, + "loss": 5.9985, + "loss/crossentropy": 2.665659546852112, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18016141653060913, + "step": 16624 + }, + { + "epoch": 0.5195625, + "grad_norm": 3.328125, + "grad_norm_var": 0.054352823893229166, + "learning_rate": 0.0001, + "loss": 5.7673, + "loss/crossentropy": 2.585843801498413, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1697111800312996, + "step": 16626 + }, + { + "epoch": 0.519625, + "grad_norm": 3.09375, + "grad_norm_var": 0.05849609375, + "learning_rate": 0.0001, + "loss": 5.9058, + "loss/crossentropy": 2.567280411720276, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.17682481557130814, + "step": 16628 + }, + { + "epoch": 0.5196875, + "grad_norm": 3.421875, + "grad_norm_var": 0.0635406494140625, + "learning_rate": 0.0001, + "loss": 6.013, + "loss/crossentropy": 2.672019124031067, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18214056640863419, + "step": 16630 + }, + { + "epoch": 0.51975, + "grad_norm": 2.90625, + "grad_norm_var": 0.06884358723958334, + "learning_rate": 0.0001, + "loss": 5.392, + "loss/crossentropy": 2.3738889694213867, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15923285484313965, + "step": 16632 + }, + { + "epoch": 0.5198125, + "grad_norm": 3.21875, + "grad_norm_var": 0.0807525634765625, + "learning_rate": 0.0001, + "loss": 5.65, + "loss/crossentropy": 2.488644242286682, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16886603087186813, + "step": 16634 + }, + { + "epoch": 0.519875, + "grad_norm": 3.203125, + "grad_norm_var": 0.06968994140625, + "learning_rate": 0.0001, + "loss": 5.7624, + "loss/crossentropy": 2.5380845069885254, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1736041009426117, + "step": 16636 + }, + { + "epoch": 0.5199375, + "grad_norm": 3.359375, + "grad_norm_var": 0.07164306640625, + "learning_rate": 0.0001, + "loss": 5.7852, + "loss/crossentropy": 2.5422178506851196, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1727387011051178, + "step": 16638 + }, + { + "epoch": 0.52, + "grad_norm": 3.078125, + "grad_norm_var": 0.075830078125, + "learning_rate": 0.0001, + "loss": 5.6866, + "loss/crossentropy": 2.5098516941070557, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16923755407333374, + "step": 16640 + }, + { + "epoch": 0.5200625, + "grad_norm": 3.359375, + "grad_norm_var": 0.07096354166666667, + "learning_rate": 0.0001, + "loss": 5.7984, + "loss/crossentropy": 2.5298749208450317, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17372490465641022, + "step": 16642 + }, + { + "epoch": 0.520125, + "grad_norm": 3.671875, + "grad_norm_var": 0.07226155598958334, + "learning_rate": 0.0001, + "loss": 5.6965, + "loss/crossentropy": 2.485423445701599, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17150159925222397, + "step": 16644 + }, + { + "epoch": 0.5201875, + "grad_norm": 3.34375, + "grad_norm_var": 0.06940816243489584, + "learning_rate": 0.0001, + "loss": 5.8126, + "loss/crossentropy": 2.626734972000122, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17132389545440674, + "step": 16646 + }, + { + "epoch": 0.52025, + "grad_norm": 3.0, + "grad_norm_var": 0.057738240559895834, + "learning_rate": 0.0001, + "loss": 5.7702, + "loss/crossentropy": 2.619166374206543, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1650991588830948, + "step": 16648 + }, + { + "epoch": 0.5203125, + "grad_norm": 3.59375, + "grad_norm_var": 0.046483357747395836, + "learning_rate": 0.0001, + "loss": 6.0174, + "loss/crossentropy": 2.705695629119873, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17765065282583237, + "step": 16650 + }, + { + "epoch": 0.520375, + "grad_norm": 3.109375, + "grad_norm_var": 0.0499664306640625, + "learning_rate": 0.0001, + "loss": 5.7881, + "loss/crossentropy": 2.581183910369873, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17107804864645004, + "step": 16652 + }, + { + "epoch": 0.5204375, + "grad_norm": 3.015625, + "grad_norm_var": 0.0509674072265625, + "learning_rate": 0.0001, + "loss": 5.7454, + "loss/crossentropy": 2.608821749687195, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.165218323469162, + "step": 16654 + }, + { + "epoch": 0.5205, + "grad_norm": 3.4375, + "grad_norm_var": 0.03501688639322917, + "learning_rate": 0.0001, + "loss": 5.79, + "loss/crossentropy": 2.5277384519577026, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1770046204328537, + "step": 16656 + }, + { + "epoch": 0.5205625, + "grad_norm": 2.890625, + "grad_norm_var": 0.04708658854166667, + "learning_rate": 0.0001, + "loss": 5.7761, + "loss/crossentropy": 2.6142302751541138, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16345254331827164, + "step": 16658 + }, + { + "epoch": 0.520625, + "grad_norm": 2.921875, + "grad_norm_var": 0.04256083170572917, + "learning_rate": 0.0001, + "loss": 5.3146, + "loss/crossentropy": 2.3108972311019897, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15740341693162918, + "step": 16660 + }, + { + "epoch": 0.5206875, + "grad_norm": 2.984375, + "grad_norm_var": 0.05091145833333333, + "learning_rate": 0.0001, + "loss": 5.6101, + "loss/crossentropy": 2.5152639150619507, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16065338253974915, + "step": 16662 + }, + { + "epoch": 0.52075, + "grad_norm": 3.046875, + "grad_norm_var": 0.05803629557291667, + "learning_rate": 0.0001, + "loss": 5.2244, + "loss/crossentropy": 2.2537713050842285, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.14979984611272812, + "step": 16664 + }, + { + "epoch": 0.5208125, + "grad_norm": 3.28125, + "grad_norm_var": 0.05217692057291667, + "learning_rate": 0.0001, + "loss": 5.8836, + "loss/crossentropy": 2.58497154712677, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.18025048077106476, + "step": 16666 + }, + { + "epoch": 0.520875, + "grad_norm": 3.359375, + "grad_norm_var": 0.05827534993489583, + "learning_rate": 0.0001, + "loss": 5.5951, + "loss/crossentropy": 2.4491108655929565, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16889246553182602, + "step": 16668 + }, + { + "epoch": 0.5209375, + "grad_norm": 3.390625, + "grad_norm_var": 0.06366780598958334, + "learning_rate": 0.0001, + "loss": 6.0348, + "loss/crossentropy": 2.7149627208709717, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18158919364213943, + "step": 16670 + }, + { + "epoch": 0.521, + "grad_norm": 3.359375, + "grad_norm_var": 0.060445149739583336, + "learning_rate": 0.0001, + "loss": 5.8633, + "loss/crossentropy": 2.554919481277466, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18005279451608658, + "step": 16672 + }, + { + "epoch": 0.5210625, + "grad_norm": 3.21875, + "grad_norm_var": 0.048094685872395834, + "learning_rate": 0.0001, + "loss": 5.8456, + "loss/crossentropy": 2.6079788208007812, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17493051290512085, + "step": 16674 + }, + { + "epoch": 0.521125, + "grad_norm": 3.125, + "grad_norm_var": 0.0436431884765625, + "learning_rate": 0.0001, + "loss": 5.5138, + "loss/crossentropy": 2.4231492280960083, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16335897147655487, + "step": 16676 + }, + { + "epoch": 0.5211875, + "grad_norm": 3.203125, + "grad_norm_var": 0.03212890625, + "learning_rate": 0.0001, + "loss": 5.5832, + "loss/crossentropy": 2.4611895084381104, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1633726805448532, + "step": 16678 + }, + { + "epoch": 0.52125, + "grad_norm": 3.640625, + "grad_norm_var": 0.030464680989583333, + "learning_rate": 0.0001, + "loss": 5.9662, + "loss/crossentropy": 2.601937174797058, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18173735588788986, + "step": 16680 + }, + { + "epoch": 0.5213125, + "grad_norm": 3.1875, + "grad_norm_var": 0.03173421223958333, + "learning_rate": 0.0001, + "loss": 5.6479, + "loss/crossentropy": 2.5193867683410645, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16792523860931396, + "step": 16682 + }, + { + "epoch": 0.521375, + "grad_norm": 2.96875, + "grad_norm_var": 0.030451456705729168, + "learning_rate": 0.0001, + "loss": 5.9514, + "loss/crossentropy": 2.780367136001587, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16827665269374847, + "step": 16684 + }, + { + "epoch": 0.5214375, + "grad_norm": 2.9375, + "grad_norm_var": 0.033589680989583336, + "learning_rate": 0.0001, + "loss": 5.5207, + "loss/crossentropy": 2.4640052318573, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15723436325788498, + "step": 16686 + }, + { + "epoch": 0.5215, + "grad_norm": 2.90625, + "grad_norm_var": 0.037495930989583336, + "learning_rate": 0.0001, + "loss": 5.5658, + "loss/crossentropy": 2.5082110166549683, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15770812332630157, + "step": 16688 + }, + { + "epoch": 0.5215625, + "grad_norm": 3.203125, + "grad_norm_var": 0.03766988118489583, + "learning_rate": 0.0001, + "loss": 5.6946, + "loss/crossentropy": 2.541201949119568, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16573123633861542, + "step": 16690 + }, + { + "epoch": 0.521625, + "grad_norm": 3.546875, + "grad_norm_var": 0.0573394775390625, + "learning_rate": 0.0001, + "loss": 5.836, + "loss/crossentropy": 2.496829390525818, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18118096888065338, + "step": 16692 + }, + { + "epoch": 0.5216875, + "grad_norm": 2.96875, + "grad_norm_var": 0.0611968994140625, + "learning_rate": 0.0001, + "loss": 5.6866, + "loss/crossentropy": 2.536230444908142, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16659681499004364, + "step": 16694 + }, + { + "epoch": 0.52175, + "grad_norm": 3.078125, + "grad_norm_var": 0.039449055989583336, + "learning_rate": 0.0001, + "loss": 5.572, + "loss/crossentropy": 2.4573066234588623, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16068410873413086, + "step": 16696 + }, + { + "epoch": 0.5218125, + "grad_norm": 2.78125, + "grad_norm_var": 0.0487213134765625, + "learning_rate": 0.0001, + "loss": 5.5547, + "loss/crossentropy": 2.421579599380493, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1660444363951683, + "step": 16698 + }, + { + "epoch": 0.521875, + "grad_norm": 3.203125, + "grad_norm_var": 0.04829813639322917, + "learning_rate": 0.0001, + "loss": 5.7851, + "loss/crossentropy": 2.5321544408798218, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1745123714208603, + "step": 16700 + }, + { + "epoch": 0.5219375, + "grad_norm": 2.921875, + "grad_norm_var": 0.04954325358072917, + "learning_rate": 0.0001, + "loss": 5.6871, + "loss/crossentropy": 2.6101166009902954, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16160931438207626, + "step": 16702 + }, + { + "epoch": 0.522, + "grad_norm": 3.125, + "grad_norm_var": 0.04578348795572917, + "learning_rate": 0.0001, + "loss": 5.9245, + "loss/crossentropy": 2.632740616798401, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1776122897863388, + "step": 16704 + }, + { + "epoch": 0.5220625, + "grad_norm": 3.375, + "grad_norm_var": 0.04827473958333333, + "learning_rate": 0.0001, + "loss": 5.657, + "loss/crossentropy": 2.476766586303711, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17075669765472412, + "step": 16706 + }, + { + "epoch": 0.522125, + "grad_norm": 3.0625, + "grad_norm_var": 0.028889973958333332, + "learning_rate": 0.0001, + "loss": 5.6833, + "loss/crossentropy": 2.533569812774658, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16810289025306702, + "step": 16708 + }, + { + "epoch": 0.5221875, + "grad_norm": 3.328125, + "grad_norm_var": 0.030785115559895833, + "learning_rate": 0.0001, + "loss": 5.6766, + "loss/crossentropy": 2.5180299282073975, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16781431436538696, + "step": 16710 + }, + { + "epoch": 0.52225, + "grad_norm": 3.0625, + "grad_norm_var": 0.11048177083333334, + "learning_rate": 0.0001, + "loss": 5.8755, + "loss/crossentropy": 2.620678663253784, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17352984845638275, + "step": 16712 + }, + { + "epoch": 0.5223125, + "grad_norm": 3.140625, + "grad_norm_var": 0.0986480712890625, + "learning_rate": 0.0001, + "loss": 5.7624, + "loss/crossentropy": 2.5677719116210938, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17024896293878555, + "step": 16714 + }, + { + "epoch": 0.522375, + "grad_norm": 3.265625, + "grad_norm_var": 0.09802958170572916, + "learning_rate": 0.0001, + "loss": 5.9484, + "loss/crossentropy": 2.703026533126831, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17531873285770416, + "step": 16716 + }, + { + "epoch": 0.5224375, + "grad_norm": 3.25, + "grad_norm_var": 0.088720703125, + "learning_rate": 0.0001, + "loss": 5.9424, + "loss/crossentropy": 2.7067244052886963, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17318104952573776, + "step": 16718 + }, + { + "epoch": 0.5225, + "grad_norm": 3.15625, + "grad_norm_var": 0.08799540201822917, + "learning_rate": 0.0001, + "loss": 5.5793, + "loss/crossentropy": 2.460513114929199, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16656684130430222, + "step": 16720 + }, + { + "epoch": 0.5225625, + "grad_norm": 3.171875, + "grad_norm_var": 0.08570556640625, + "learning_rate": 0.0001, + "loss": 5.6503, + "loss/crossentropy": 2.5076273679733276, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1670023426413536, + "step": 16722 + }, + { + "epoch": 0.522625, + "grad_norm": 2.875, + "grad_norm_var": 0.0902740478515625, + "learning_rate": 0.0001, + "loss": 5.8403, + "loss/crossentropy": 2.643712043762207, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17044231295585632, + "step": 16724 + }, + { + "epoch": 0.5226875, + "grad_norm": 3.4375, + "grad_norm_var": 0.09388020833333334, + "learning_rate": 0.0001, + "loss": 5.9001, + "loss/crossentropy": 2.6782734394073486, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17296595126390457, + "step": 16726 + }, + { + "epoch": 0.52275, + "grad_norm": 3.09375, + "grad_norm_var": 0.01949462890625, + "learning_rate": 0.0001, + "loss": 5.2923, + "loss/crossentropy": 2.2812613248825073, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15813816338777542, + "step": 16728 + }, + { + "epoch": 0.5228125, + "grad_norm": 3.40625, + "grad_norm_var": 0.024153645833333334, + "learning_rate": 0.0001, + "loss": 5.9144, + "loss/crossentropy": 2.666238784790039, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17325755208730698, + "step": 16730 + }, + { + "epoch": 0.522875, + "grad_norm": 3.296875, + "grad_norm_var": 0.023998006184895834, + "learning_rate": 0.0001, + "loss": 5.896, + "loss/crossentropy": 2.6160322427749634, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17643069475889206, + "step": 16732 + }, + { + "epoch": 0.5229375, + "grad_norm": 3.328125, + "grad_norm_var": 0.025016276041666667, + "learning_rate": 0.0001, + "loss": 5.8807, + "loss/crossentropy": 2.6244956254959106, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17562486976385117, + "step": 16734 + }, + { + "epoch": 0.523, + "grad_norm": 3.0625, + "grad_norm_var": 0.025999959309895834, + "learning_rate": 0.0001, + "loss": 5.6844, + "loss/crossentropy": 2.478963017463684, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1728852093219757, + "step": 16736 + }, + { + "epoch": 0.5230625, + "grad_norm": 3.296875, + "grad_norm_var": 0.030659993489583332, + "learning_rate": 0.0001, + "loss": 5.5548, + "loss/crossentropy": 2.334673762321472, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17005780339241028, + "step": 16738 + }, + { + "epoch": 0.523125, + "grad_norm": 3.390625, + "grad_norm_var": 0.026155598958333335, + "learning_rate": 0.0001, + "loss": 6.0049, + "loss/crossentropy": 2.6568034887313843, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17973309755325317, + "step": 16740 + }, + { + "epoch": 0.5231875, + "grad_norm": 2.890625, + "grad_norm_var": 0.030757649739583334, + "learning_rate": 0.0001, + "loss": 5.7537, + "loss/crossentropy": 2.5928655862808228, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16529954969882965, + "step": 16742 + }, + { + "epoch": 0.52325, + "grad_norm": 3.015625, + "grad_norm_var": 0.027489217122395833, + "learning_rate": 0.0001, + "loss": 5.6691, + "loss/crossentropy": 2.510942578315735, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16816120594739914, + "step": 16744 + }, + { + "epoch": 0.5233125, + "grad_norm": 3.015625, + "grad_norm_var": 0.02994384765625, + "learning_rate": 0.0001, + "loss": 5.6242, + "loss/crossentropy": 2.524855375289917, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16384124755859375, + "step": 16746 + }, + { + "epoch": 0.523375, + "grad_norm": 3.015625, + "grad_norm_var": 0.033203125, + "learning_rate": 0.0001, + "loss": 5.5681, + "loss/crossentropy": 2.483372688293457, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16238310188055038, + "step": 16748 + }, + { + "epoch": 0.5234375, + "grad_norm": 3.359375, + "grad_norm_var": 0.0370269775390625, + "learning_rate": 0.0001, + "loss": 5.6439, + "loss/crossentropy": 2.466872215270996, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1688736230134964, + "step": 16750 + }, + { + "epoch": 0.5235, + "grad_norm": 3.625, + "grad_norm_var": 0.048628743489583334, + "learning_rate": 0.0001, + "loss": 5.9759, + "loss/crossentropy": 2.658264398574829, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17942135781049728, + "step": 16752 + }, + { + "epoch": 0.5235625, + "grad_norm": 2.921875, + "grad_norm_var": 0.0514801025390625, + "learning_rate": 0.0001, + "loss": 5.3525, + "loss/crossentropy": 2.3306466341018677, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1584375649690628, + "step": 16754 + }, + { + "epoch": 0.523625, + "grad_norm": 3.25, + "grad_norm_var": 0.04947509765625, + "learning_rate": 0.0001, + "loss": 5.7338, + "loss/crossentropy": 2.5754432678222656, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16778850555419922, + "step": 16756 + }, + { + "epoch": 0.5236875, + "grad_norm": 2.859375, + "grad_norm_var": 0.05592041015625, + "learning_rate": 0.0001, + "loss": 5.8624, + "loss/crossentropy": 2.63477885723114, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17237010598182678, + "step": 16758 + }, + { + "epoch": 0.52375, + "grad_norm": 3.203125, + "grad_norm_var": 0.053999837239583334, + "learning_rate": 0.0001, + "loss": 5.9421, + "loss/crossentropy": 2.7633711099624634, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17021772265434265, + "step": 16760 + }, + { + "epoch": 0.5238125, + "grad_norm": 3.203125, + "grad_norm_var": 0.0580474853515625, + "learning_rate": 0.0001, + "loss": 5.4875, + "loss/crossentropy": 2.3752094507217407, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1600557640194893, + "step": 16762 + }, + { + "epoch": 0.523875, + "grad_norm": 3.265625, + "grad_norm_var": 0.056929524739583334, + "learning_rate": 0.0001, + "loss": 5.988, + "loss/crossentropy": 2.7755390405654907, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17202361673116684, + "step": 16764 + }, + { + "epoch": 0.5239375, + "grad_norm": 3.328125, + "grad_norm_var": 0.05338134765625, + "learning_rate": 0.0001, + "loss": 5.7225, + "loss/crossentropy": 2.537709593772888, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16691192239522934, + "step": 16766 + }, + { + "epoch": 0.524, + "grad_norm": 2.75, + "grad_norm_var": 0.0473785400390625, + "learning_rate": 0.0001, + "loss": 5.4379, + "loss/crossentropy": 2.393734335899353, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15558606386184692, + "step": 16768 + }, + { + "epoch": 0.5240625, + "grad_norm": 3.171875, + "grad_norm_var": 0.04442952473958333, + "learning_rate": 0.0001, + "loss": 5.7923, + "loss/crossentropy": 2.632172107696533, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16796287894248962, + "step": 16770 + }, + { + "epoch": 0.524125, + "grad_norm": 3.84375, + "grad_norm_var": 0.07353413899739583, + "learning_rate": 0.0001, + "loss": 6.0387, + "loss/crossentropy": 2.7371329069137573, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17469213902950287, + "step": 16772 + }, + { + "epoch": 0.5241875, + "grad_norm": 3.265625, + "grad_norm_var": 0.05758463541666667, + "learning_rate": 0.0001, + "loss": 5.7492, + "loss/crossentropy": 2.611035943031311, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16967833042144775, + "step": 16774 + }, + { + "epoch": 0.52425, + "grad_norm": 3.109375, + "grad_norm_var": 0.05725911458333333, + "learning_rate": 0.0001, + "loss": 5.6682, + "loss/crossentropy": 2.5155742168426514, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16682402789592743, + "step": 16776 + }, + { + "epoch": 0.5243125, + "grad_norm": 3.40625, + "grad_norm_var": 0.05660400390625, + "learning_rate": 0.0001, + "loss": 5.4848, + "loss/crossentropy": 2.426730751991272, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1538565456867218, + "step": 16778 + }, + { + "epoch": 0.524375, + "grad_norm": 3.4375, + "grad_norm_var": 0.06402587890625, + "learning_rate": 0.0001, + "loss": 5.8914, + "loss/crossentropy": 2.554630398750305, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.17782258987426758, + "step": 16780 + }, + { + "epoch": 0.5244375, + "grad_norm": 3.046875, + "grad_norm_var": 0.06392822265625, + "learning_rate": 0.0001, + "loss": 5.8384, + "loss/crossentropy": 2.6723839044570923, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17012126743793488, + "step": 16782 + }, + { + "epoch": 0.5245, + "grad_norm": 3.0, + "grad_norm_var": 0.05237223307291667, + "learning_rate": 0.0001, + "loss": 5.9261, + "loss/crossentropy": 2.7359548807144165, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17253194004297256, + "step": 16784 + }, + { + "epoch": 0.5245625, + "grad_norm": 3.03125, + "grad_norm_var": 0.05284830729166667, + "learning_rate": 0.0001, + "loss": 5.629, + "loss/crossentropy": 2.4921926259994507, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16328564286231995, + "step": 16786 + }, + { + "epoch": 0.524625, + "grad_norm": 2.921875, + "grad_norm_var": 0.03338216145833333, + "learning_rate": 0.0001, + "loss": 5.4287, + "loss/crossentropy": 2.439249873161316, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15754255652427673, + "step": 16788 + }, + { + "epoch": 0.5246875, + "grad_norm": 3.25, + "grad_norm_var": 0.031151326497395833, + "learning_rate": 0.0001, + "loss": 5.4842, + "loss/crossentropy": 2.3997395038604736, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.15805823355913162, + "step": 16790 + }, + { + "epoch": 0.52475, + "grad_norm": 4.0, + "grad_norm_var": 0.07854715983072917, + "learning_rate": 0.0001, + "loss": 5.832, + "loss/crossentropy": 2.4776930809020996, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18035707622766495, + "step": 16792 + }, + { + "epoch": 0.5248125, + "grad_norm": 3.171875, + "grad_norm_var": 0.1131744384765625, + "learning_rate": 0.0001, + "loss": 6.0571, + "loss/crossentropy": 2.649075508117676, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.18298634141683578, + "step": 16794 + }, + { + "epoch": 0.524875, + "grad_norm": 3.0625, + "grad_norm_var": 0.10982666015625, + "learning_rate": 0.0001, + "loss": 6.1231, + "loss/crossentropy": 2.89070200920105, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17402183264493942, + "step": 16796 + }, + { + "epoch": 0.5249375, + "grad_norm": 2.984375, + "grad_norm_var": 0.11135965983072917, + "learning_rate": 0.0001, + "loss": 5.7474, + "loss/crossentropy": 2.6006139516830444, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16663673520088196, + "step": 16798 + }, + { + "epoch": 0.525, + "grad_norm": 3.125, + "grad_norm_var": 0.1128326416015625, + "learning_rate": 0.0001, + "loss": 5.4162, + "loss/crossentropy": 2.363920211791992, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15991061180830002, + "step": 16800 + }, + { + "epoch": 0.5250625, + "grad_norm": 3.265625, + "grad_norm_var": 0.1113433837890625, + "learning_rate": 0.0001, + "loss": 6.0503, + "loss/crossentropy": 2.7622867822647095, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.18036288022994995, + "step": 16802 + }, + { + "epoch": 0.525125, + "grad_norm": 3.015625, + "grad_norm_var": 0.10338134765625, + "learning_rate": 0.0001, + "loss": 5.6703, + "loss/crossentropy": 2.585463047027588, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16160530596971512, + "step": 16804 + }, + { + "epoch": 0.5251875, + "grad_norm": 3.203125, + "grad_norm_var": 0.10568033854166667, + "learning_rate": 0.0001, + "loss": 5.2132, + "loss/crossentropy": 2.212006449699402, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.14972937107086182, + "step": 16806 + }, + { + "epoch": 0.52525, + "grad_norm": 3.0625, + "grad_norm_var": 0.062272135416666666, + "learning_rate": 0.0001, + "loss": 5.5782, + "loss/crossentropy": 2.5070362091064453, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16102229803800583, + "step": 16808 + }, + { + "epoch": 0.5253125, + "grad_norm": 3.0, + "grad_norm_var": 0.01578369140625, + "learning_rate": 0.0001, + "loss": 5.6242, + "loss/crossentropy": 2.506625771522522, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1676207035779953, + "step": 16810 + }, + { + "epoch": 0.525375, + "grad_norm": 2.96875, + "grad_norm_var": 0.017220052083333333, + "learning_rate": 0.0001, + "loss": 5.5629, + "loss/crossentropy": 2.4584254026412964, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16162265837192535, + "step": 16812 + }, + { + "epoch": 0.5254375, + "grad_norm": 3.0625, + "grad_norm_var": 0.016722615559895834, + "learning_rate": 0.0001, + "loss": 5.7189, + "loss/crossentropy": 2.5626300573349, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16523746401071548, + "step": 16814 + }, + { + "epoch": 0.5255, + "grad_norm": 2.828125, + "grad_norm_var": 0.020531209309895833, + "learning_rate": 0.0001, + "loss": 5.4088, + "loss/crossentropy": 2.378592610359192, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1584884375333786, + "step": 16816 + }, + { + "epoch": 0.5255625, + "grad_norm": 3.09375, + "grad_norm_var": 0.018480428059895835, + "learning_rate": 0.0001, + "loss": 5.7678, + "loss/crossentropy": 2.639050841331482, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16444016993045807, + "step": 16818 + }, + { + "epoch": 0.525625, + "grad_norm": 3.0, + "grad_norm_var": 0.0238189697265625, + "learning_rate": 0.0001, + "loss": 5.579, + "loss/crossentropy": 2.5458027124404907, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15801066905260086, + "step": 16820 + }, + { + "epoch": 0.5256875, + "grad_norm": 2.96875, + "grad_norm_var": 0.012548828125, + "learning_rate": 0.0001, + "loss": 5.7504, + "loss/crossentropy": 2.607507109642029, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.17131800949573517, + "step": 16822 + }, + { + "epoch": 0.52575, + "grad_norm": 3.453125, + "grad_norm_var": 0.029801432291666666, + "learning_rate": 0.0001, + "loss": 5.8954, + "loss/crossentropy": 2.6303540468215942, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17572374641895294, + "step": 16824 + }, + { + "epoch": 0.5258125, + "grad_norm": 3.4375, + "grad_norm_var": 0.03870035807291667, + "learning_rate": 0.0001, + "loss": 5.826, + "loss/crossentropy": 2.6341618299484253, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16879215836524963, + "step": 16826 + }, + { + "epoch": 0.525875, + "grad_norm": 3.1875, + "grad_norm_var": 0.04209696451822917, + "learning_rate": 0.0001, + "loss": 5.7755, + "loss/crossentropy": 2.6447147130966187, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1673756092786789, + "step": 16828 + }, + { + "epoch": 0.5259375, + "grad_norm": 3.21875, + "grad_norm_var": 0.042769368489583334, + "learning_rate": 0.0001, + "loss": 5.7975, + "loss/crossentropy": 2.59163498878479, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16941649466753006, + "step": 16830 + }, + { + "epoch": 0.526, + "grad_norm": 3.15625, + "grad_norm_var": 0.038374837239583334, + "learning_rate": 0.0001, + "loss": 5.4617, + "loss/crossentropy": 2.412532925605774, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16195210069417953, + "step": 16832 + }, + { + "epoch": 0.5260625, + "grad_norm": 3.078125, + "grad_norm_var": 0.03772684733072917, + "learning_rate": 0.0001, + "loss": 5.5503, + "loss/crossentropy": 2.4097806215286255, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16287972778081894, + "step": 16834 + }, + { + "epoch": 0.526125, + "grad_norm": 3.234375, + "grad_norm_var": 0.03179931640625, + "learning_rate": 0.0001, + "loss": 5.5965, + "loss/crossentropy": 2.3854483366012573, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.16680347174406052, + "step": 16836 + }, + { + "epoch": 0.5261875, + "grad_norm": 3.359375, + "grad_norm_var": 0.0328125, + "learning_rate": 0.0001, + "loss": 5.6514, + "loss/crossentropy": 2.467614769935608, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16837888211011887, + "step": 16838 + }, + { + "epoch": 0.52625, + "grad_norm": 3.40625, + "grad_norm_var": 0.0302734375, + "learning_rate": 0.0001, + "loss": 5.9642, + "loss/crossentropy": 2.715559482574463, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1744699776172638, + "step": 16840 + }, + { + "epoch": 0.5263125, + "grad_norm": 2.953125, + "grad_norm_var": 0.025715128580729166, + "learning_rate": 0.0001, + "loss": 5.478, + "loss/crossentropy": 2.337415099143982, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16523519158363342, + "step": 16842 + }, + { + "epoch": 0.526375, + "grad_norm": 2.984375, + "grad_norm_var": 0.022581990559895834, + "learning_rate": 0.0001, + "loss": 5.3061, + "loss/crossentropy": 2.248517870903015, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.14989487826824188, + "step": 16844 + }, + { + "epoch": 0.5264375, + "grad_norm": 3.171875, + "grad_norm_var": 0.022630818684895835, + "learning_rate": 0.0001, + "loss": 5.8628, + "loss/crossentropy": 2.62682843208313, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17477399110794067, + "step": 16846 + }, + { + "epoch": 0.5265, + "grad_norm": 3.84375, + "grad_norm_var": 0.05113525390625, + "learning_rate": 0.0001, + "loss": 5.6703, + "loss/crossentropy": 2.4922419786453247, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16898052394390106, + "step": 16848 + }, + { + "epoch": 0.5265625, + "grad_norm": 2.84375, + "grad_norm_var": 0.05813802083333333, + "learning_rate": 0.0001, + "loss": 5.5643, + "loss/crossentropy": 2.472880244255066, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16109106689691544, + "step": 16850 + }, + { + "epoch": 0.526625, + "grad_norm": 3.234375, + "grad_norm_var": 0.06179097493489583, + "learning_rate": 0.0001, + "loss": 5.5819, + "loss/crossentropy": 2.484124779701233, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16446161270141602, + "step": 16852 + }, + { + "epoch": 0.5266875, + "grad_norm": 3.28125, + "grad_norm_var": 0.06026102701822917, + "learning_rate": 0.0001, + "loss": 5.7263, + "loss/crossentropy": 2.5401129722595215, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1697906032204628, + "step": 16854 + }, + { + "epoch": 0.52675, + "grad_norm": 2.953125, + "grad_norm_var": 0.0576812744140625, + "learning_rate": 0.0001, + "loss": 5.68, + "loss/crossentropy": 2.5771374702453613, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16302035003900528, + "step": 16856 + }, + { + "epoch": 0.5268125, + "grad_norm": 3.34375, + "grad_norm_var": 0.05762430826822917, + "learning_rate": 0.0001, + "loss": 5.5129, + "loss/crossentropy": 2.3532347679138184, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16440674662590027, + "step": 16858 + }, + { + "epoch": 0.526875, + "grad_norm": 3.15625, + "grad_norm_var": 0.060700480143229166, + "learning_rate": 0.0001, + "loss": 5.8512, + "loss/crossentropy": 2.6577740907669067, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17246952652931213, + "step": 16860 + }, + { + "epoch": 0.5269375, + "grad_norm": 2.921875, + "grad_norm_var": 0.06601155598958333, + "learning_rate": 0.0001, + "loss": 5.8364, + "loss/crossentropy": 2.6040685176849365, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17401274293661118, + "step": 16862 + }, + { + "epoch": 0.527, + "grad_norm": 2.9375, + "grad_norm_var": 0.04531148274739583, + "learning_rate": 0.0001, + "loss": 5.5698, + "loss/crossentropy": 2.5193625688552856, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15972767025232315, + "step": 16864 + }, + { + "epoch": 0.5270625, + "grad_norm": 2.984375, + "grad_norm_var": 0.04179585774739583, + "learning_rate": 0.0001, + "loss": 5.6884, + "loss/crossentropy": 2.616241216659546, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16385561227798462, + "step": 16866 + }, + { + "epoch": 0.527125, + "grad_norm": 3.15625, + "grad_norm_var": 0.03840230305989583, + "learning_rate": 0.0001, + "loss": 5.867, + "loss/crossentropy": 2.661938190460205, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17012009769678116, + "step": 16868 + }, + { + "epoch": 0.5271875, + "grad_norm": 3.21875, + "grad_norm_var": 0.03469645182291667, + "learning_rate": 0.0001, + "loss": 5.7668, + "loss/crossentropy": 2.5564876794815063, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1702459156513214, + "step": 16870 + }, + { + "epoch": 0.52725, + "grad_norm": 3.015625, + "grad_norm_var": 0.033610026041666664, + "learning_rate": 0.0001, + "loss": 5.7368, + "loss/crossentropy": 2.615280032157898, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16293253749608994, + "step": 16872 + }, + { + "epoch": 0.5273125, + "grad_norm": 3.171875, + "grad_norm_var": 0.028766886393229166, + "learning_rate": 0.0001, + "loss": 5.7585, + "loss/crossentropy": 2.5623830556869507, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1684436872601509, + "step": 16874 + }, + { + "epoch": 0.527375, + "grad_norm": 2.953125, + "grad_norm_var": 0.01353759765625, + "learning_rate": 0.0001, + "loss": 5.7008, + "loss/crossentropy": 2.6023114919662476, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16375557333230972, + "step": 16876 + }, + { + "epoch": 0.5274375, + "grad_norm": 3.546875, + "grad_norm_var": 0.0321929931640625, + "learning_rate": 0.0001, + "loss": 6.1932, + "loss/crossentropy": 2.7963041067123413, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18616915494203568, + "step": 16878 + }, + { + "epoch": 0.5275, + "grad_norm": 3.359375, + "grad_norm_var": 0.0278472900390625, + "learning_rate": 0.0001, + "loss": 6.2065, + "loss/crossentropy": 2.8618983030319214, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1821155846118927, + "step": 16880 + }, + { + "epoch": 0.5275625, + "grad_norm": 3.203125, + "grad_norm_var": 0.024723307291666666, + "learning_rate": 0.0001, + "loss": 5.7974, + "loss/crossentropy": 2.694758176803589, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16143349558115005, + "step": 16882 + }, + { + "epoch": 0.527625, + "grad_norm": 3.421875, + "grad_norm_var": 0.026334635416666665, + "learning_rate": 0.0001, + "loss": 5.7665, + "loss/crossentropy": 2.538394570350647, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17007964849472046, + "step": 16884 + }, + { + "epoch": 0.5276875, + "grad_norm": 3.515625, + "grad_norm_var": 0.0408111572265625, + "learning_rate": 0.0001, + "loss": 6.3388, + "loss/crossentropy": 2.9788248538970947, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18209479749202728, + "step": 16886 + }, + { + "epoch": 0.52775, + "grad_norm": 3.984375, + "grad_norm_var": 0.06877848307291666, + "learning_rate": 0.0001, + "loss": 5.7101, + "loss/crossentropy": 2.516141653060913, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1682264357805252, + "step": 16888 + }, + { + "epoch": 0.5278125, + "grad_norm": 2.921875, + "grad_norm_var": 0.08279520670572917, + "learning_rate": 0.0001, + "loss": 5.5814, + "loss/crossentropy": 2.5133888721466064, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1610962450504303, + "step": 16890 + }, + { + "epoch": 0.527875, + "grad_norm": 3.265625, + "grad_norm_var": 0.079541015625, + "learning_rate": 0.0001, + "loss": 5.6858, + "loss/crossentropy": 2.463269591331482, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1714681014418602, + "step": 16892 + }, + { + "epoch": 0.5279375, + "grad_norm": 3.21875, + "grad_norm_var": 0.07891337076822917, + "learning_rate": 0.0001, + "loss": 5.8659, + "loss/crossentropy": 2.5936213731765747, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17683736234903336, + "step": 16894 + }, + { + "epoch": 0.528, + "grad_norm": 3.53125, + "grad_norm_var": 0.07958984375, + "learning_rate": 0.0001, + "loss": 5.6168, + "loss/crossentropy": 2.5288150310516357, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.15879766643047333, + "step": 16896 + }, + { + "epoch": 0.5280625, + "grad_norm": 3.453125, + "grad_norm_var": 0.08035481770833333, + "learning_rate": 0.0001, + "loss": 5.8487, + "loss/crossentropy": 2.5781877040863037, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17627078294754028, + "step": 16898 + }, + { + "epoch": 0.528125, + "grad_norm": 2.96875, + "grad_norm_var": 0.09530843098958333, + "learning_rate": 0.0001, + "loss": 5.5579, + "loss/crossentropy": 2.4117249250411987, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1614910438656807, + "step": 16900 + }, + { + "epoch": 0.5281875, + "grad_norm": 4.84375, + "grad_norm_var": 0.23356831868489583, + "learning_rate": 0.0001, + "loss": 5.8338, + "loss/crossentropy": 2.5839866399765015, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17420267313718796, + "step": 16902 + }, + { + "epoch": 0.52825, + "grad_norm": 3.234375, + "grad_norm_var": 0.21035868326822918, + "learning_rate": 0.0001, + "loss": 5.8998, + "loss/crossentropy": 2.666812777519226, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17329959571361542, + "step": 16904 + }, + { + "epoch": 0.5283125, + "grad_norm": 3.375, + "grad_norm_var": 0.19847005208333332, + "learning_rate": 0.0001, + "loss": 5.8801, + "loss/crossentropy": 2.608286142349243, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17523258924484253, + "step": 16906 + }, + { + "epoch": 0.528375, + "grad_norm": 3.09375, + "grad_norm_var": 0.19133199055989583, + "learning_rate": 0.0001, + "loss": 5.5642, + "loss/crossentropy": 2.443881034851074, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16515463590621948, + "step": 16908 + }, + { + "epoch": 0.5284375, + "grad_norm": 5.78125, + "grad_norm_var": 0.5673004150390625, + "learning_rate": 0.0001, + "loss": 6.2954, + "loss/crossentropy": 2.78745698928833, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.1941521167755127, + "step": 16910 + }, + { + "epoch": 0.5285, + "grad_norm": 3.671875, + "grad_norm_var": 0.5599894205729167, + "learning_rate": 0.0001, + "loss": 5.8464, + "loss/crossentropy": 2.616065263748169, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17342819273471832, + "step": 16912 + }, + { + "epoch": 0.5285625, + "grad_norm": 2.96875, + "grad_norm_var": 0.5651692708333333, + "learning_rate": 0.0001, + "loss": 5.5398, + "loss/crossentropy": 2.3426021337509155, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1685510277748108, + "step": 16914 + }, + { + "epoch": 0.528625, + "grad_norm": 3.125, + "grad_norm_var": 0.5444244384765625, + "learning_rate": 0.0001, + "loss": 5.67, + "loss/crossentropy": 2.447251796722412, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17149558663368225, + "step": 16916 + }, + { + "epoch": 0.5286875, + "grad_norm": 3.125, + "grad_norm_var": 0.43374735514322915, + "learning_rate": 0.0001, + "loss": 5.9925, + "loss/crossentropy": 2.736096978187561, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17564402520656586, + "step": 16918 + }, + { + "epoch": 0.52875, + "grad_norm": 3.328125, + "grad_norm_var": 0.4401519775390625, + "learning_rate": 0.0001, + "loss": 5.7838, + "loss/crossentropy": 2.542062520980835, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.16987399011850357, + "step": 16920 + }, + { + "epoch": 0.5288125, + "grad_norm": 3.1875, + "grad_norm_var": 0.43704427083333336, + "learning_rate": 0.0001, + "loss": 5.9564, + "loss/crossentropy": 2.6412583589553833, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18073027580976486, + "step": 16922 + }, + { + "epoch": 0.528875, + "grad_norm": 3.21875, + "grad_norm_var": 0.43429361979166664, + "learning_rate": 0.0001, + "loss": 5.828, + "loss/crossentropy": 2.6030869483947754, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17366065829992294, + "step": 16924 + }, + { + "epoch": 0.5289375, + "grad_norm": 3.296875, + "grad_norm_var": 0.04169820149739583, + "learning_rate": 0.0001, + "loss": 5.8671, + "loss/crossentropy": 2.557239294052124, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1782548651099205, + "step": 16926 + }, + { + "epoch": 0.529, + "grad_norm": 3.5, + "grad_norm_var": 0.0314605712890625, + "learning_rate": 0.0001, + "loss": 5.5674, + "loss/crossentropy": 2.3005136251449585, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17278698831796646, + "step": 16928 + }, + { + "epoch": 0.5290625, + "grad_norm": 3.0625, + "grad_norm_var": 0.02236328125, + "learning_rate": 0.0001, + "loss": 5.8136, + "loss/crossentropy": 2.6265273094177246, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17183005809783936, + "step": 16930 + }, + { + "epoch": 0.529125, + "grad_norm": 2.8125, + "grad_norm_var": 0.0405670166015625, + "learning_rate": 0.0001, + "loss": 5.3265, + "loss/crossentropy": 2.387045741081238, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15019110590219498, + "step": 16932 + }, + { + "epoch": 0.5291875, + "grad_norm": 4.09375, + "grad_norm_var": 0.09065348307291667, + "learning_rate": 0.0001, + "loss": 6.0541, + "loss/crossentropy": 2.6851927042007446, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.18689000606536865, + "step": 16934 + }, + { + "epoch": 0.52925, + "grad_norm": 5.4375, + "grad_norm_var": 0.3789621988932292, + "learning_rate": 0.0001, + "loss": 5.7548, + "loss/crossentropy": 2.4790419340133667, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1728922575712204, + "step": 16936 + }, + { + "epoch": 0.5293125, + "grad_norm": 3.265625, + "grad_norm_var": 0.37795817057291664, + "learning_rate": 0.0001, + "loss": 5.8875, + "loss/crossentropy": 2.6745702028274536, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17128994315862656, + "step": 16938 + }, + { + "epoch": 0.529375, + "grad_norm": 4.15625, + "grad_norm_var": 0.42385965983072915, + "learning_rate": 0.0001, + "loss": 5.409, + "loss/crossentropy": 2.327757477760315, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.15695005655288696, + "step": 16940 + }, + { + "epoch": 0.5294375, + "grad_norm": 3.546875, + "grad_norm_var": 0.42753499348958335, + "learning_rate": 0.0001, + "loss": 5.8836, + "loss/crossentropy": 2.6485486030578613, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17623953521251678, + "step": 16942 + }, + { + "epoch": 0.5295, + "grad_norm": 3.125, + "grad_norm_var": 0.4352447509765625, + "learning_rate": 0.0001, + "loss": 5.9013, + "loss/crossentropy": 2.6693369150161743, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.173587866127491, + "step": 16944 + }, + { + "epoch": 0.5295625, + "grad_norm": 2.921875, + "grad_norm_var": 0.45735270182291665, + "learning_rate": 0.0001, + "loss": 5.7236, + "loss/crossentropy": 2.646336793899536, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1616320163011551, + "step": 16946 + }, + { + "epoch": 0.529625, + "grad_norm": 3.234375, + "grad_norm_var": 0.4293131510416667, + "learning_rate": 0.0001, + "loss": 5.6899, + "loss/crossentropy": 2.526552438735962, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1682843267917633, + "step": 16948 + }, + { + "epoch": 0.5296875, + "grad_norm": 3.390625, + "grad_norm_var": 0.40546773274739584, + "learning_rate": 0.0001, + "loss": 5.7597, + "loss/crossentropy": 2.555445075035095, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17003649473190308, + "step": 16950 + }, + { + "epoch": 0.52975, + "grad_norm": 3.171875, + "grad_norm_var": 0.0985015869140625, + "learning_rate": 0.0001, + "loss": 5.9148, + "loss/crossentropy": 2.7017569541931152, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16896067559719086, + "step": 16952 + }, + { + "epoch": 0.5298125, + "grad_norm": 3.34375, + "grad_norm_var": 0.09973551432291666, + "learning_rate": 0.0001, + "loss": 5.9579, + "loss/crossentropy": 2.7303154468536377, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17432095855474472, + "step": 16954 + }, + { + "epoch": 0.529875, + "grad_norm": 3.71875, + "grad_norm_var": 0.05175374348958333, + "learning_rate": 0.0001, + "loss": 5.6725, + "loss/crossentropy": 2.458512544631958, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16944526880979538, + "step": 16956 + }, + { + "epoch": 0.5299375, + "grad_norm": 3.4375, + "grad_norm_var": 0.04744364420572917, + "learning_rate": 0.0001, + "loss": 5.8231, + "loss/crossentropy": 2.6536834239959717, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17085148394107819, + "step": 16958 + }, + { + "epoch": 0.53, + "grad_norm": 3.390625, + "grad_norm_var": 0.05022786458333333, + "learning_rate": 0.0001, + "loss": 5.8461, + "loss/crossentropy": 2.6176013946533203, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17207121849060059, + "step": 16960 + }, + { + "epoch": 0.5300625, + "grad_norm": 3.265625, + "grad_norm_var": 0.04282938639322917, + "learning_rate": 0.0001, + "loss": 5.5728, + "loss/crossentropy": 2.491735577583313, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16006263345479965, + "step": 16962 + }, + { + "epoch": 0.530125, + "grad_norm": 3.359375, + "grad_norm_var": 0.044920857747395834, + "learning_rate": 0.0001, + "loss": 5.9665, + "loss/crossentropy": 2.6180880069732666, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18133027851581573, + "step": 16964 + }, + { + "epoch": 0.5301875, + "grad_norm": 2.953125, + "grad_norm_var": 0.047591145833333334, + "learning_rate": 0.0001, + "loss": 5.6772, + "loss/crossentropy": 2.62038254737854, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15881171077489853, + "step": 16966 + }, + { + "epoch": 0.53025, + "grad_norm": 3.15625, + "grad_norm_var": 0.04568684895833333, + "learning_rate": 0.0001, + "loss": 5.8876, + "loss/crossentropy": 2.7255882024765015, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16971950232982635, + "step": 16968 + }, + { + "epoch": 0.5303125, + "grad_norm": 3.53125, + "grad_norm_var": 0.05624593098958333, + "learning_rate": 0.0001, + "loss": 5.7136, + "loss/crossentropy": 2.551610231399536, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16736791282892227, + "step": 16970 + }, + { + "epoch": 0.530375, + "grad_norm": 3.296875, + "grad_norm_var": 0.03951416015625, + "learning_rate": 0.0001, + "loss": 5.7962, + "loss/crossentropy": 2.5625109672546387, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17180463671684265, + "step": 16972 + }, + { + "epoch": 0.5304375, + "grad_norm": 3.15625, + "grad_norm_var": 0.035868326822916664, + "learning_rate": 0.0001, + "loss": 5.8789, + "loss/crossentropy": 2.62811541557312, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17625083029270172, + "step": 16974 + }, + { + "epoch": 0.5305, + "grad_norm": 3.0625, + "grad_norm_var": 0.03430887858072917, + "learning_rate": 0.0001, + "loss": 5.856, + "loss/crossentropy": 2.7236850261688232, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.164009727537632, + "step": 16976 + }, + { + "epoch": 0.5305625, + "grad_norm": 3.421875, + "grad_norm_var": 0.03540751139322917, + "learning_rate": 0.0001, + "loss": 5.8104, + "loss/crossentropy": 2.5462101697921753, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17173375189304352, + "step": 16978 + }, + { + "epoch": 0.530625, + "grad_norm": 3.421875, + "grad_norm_var": 0.0314605712890625, + "learning_rate": 0.0001, + "loss": 5.8743, + "loss/crossentropy": 2.651816487312317, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1687367781996727, + "step": 16980 + }, + { + "epoch": 0.5306875, + "grad_norm": 3.1875, + "grad_norm_var": 0.02447509765625, + "learning_rate": 0.0001, + "loss": 5.624, + "loss/crossentropy": 2.5051662921905518, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.161881722509861, + "step": 16982 + }, + { + "epoch": 0.53075, + "grad_norm": 2.9375, + "grad_norm_var": 0.0282623291015625, + "learning_rate": 0.0001, + "loss": 5.5397, + "loss/crossentropy": 2.4485573768615723, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16380631178617477, + "step": 16984 + }, + { + "epoch": 0.5308125, + "grad_norm": 3.078125, + "grad_norm_var": 0.018050130208333334, + "learning_rate": 0.0001, + "loss": 5.605, + "loss/crossentropy": 2.442605137825012, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16818874329328537, + "step": 16986 + }, + { + "epoch": 0.530875, + "grad_norm": 3.0625, + "grad_norm_var": 0.0190093994140625, + "learning_rate": 0.0001, + "loss": 5.8612, + "loss/crossentropy": 2.669662117958069, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17150143533945084, + "step": 16988 + }, + { + "epoch": 0.5309375, + "grad_norm": 2.875, + "grad_norm_var": 0.025830078125, + "learning_rate": 0.0001, + "loss": 5.7293, + "loss/crossentropy": 2.5571272373199463, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1683904081583023, + "step": 16990 + }, + { + "epoch": 0.531, + "grad_norm": 3.15625, + "grad_norm_var": 0.0263092041015625, + "learning_rate": 0.0001, + "loss": 5.5736, + "loss/crossentropy": 2.401633381843567, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16875667124986649, + "step": 16992 + }, + { + "epoch": 0.5310625, + "grad_norm": 3.46875, + "grad_norm_var": 0.03228759765625, + "learning_rate": 0.0001, + "loss": 5.6485, + "loss/crossentropy": 2.5339184999465942, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16262532770633698, + "step": 16994 + }, + { + "epoch": 0.531125, + "grad_norm": 3.265625, + "grad_norm_var": 0.029878743489583335, + "learning_rate": 0.0001, + "loss": 5.3957, + "loss/crossentropy": 2.365266442298889, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15421508252620697, + "step": 16996 + }, + { + "epoch": 0.5311875, + "grad_norm": 3.21875, + "grad_norm_var": 0.05676676432291667, + "learning_rate": 0.0001, + "loss": 5.9016, + "loss/crossentropy": 2.7077423334121704, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17251364141702652, + "step": 16998 + }, + { + "epoch": 0.53125, + "grad_norm": 3.109375, + "grad_norm_var": 0.052953084309895836, + "learning_rate": 0.0001, + "loss": 5.5384, + "loss/crossentropy": 2.4351983070373535, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16070988774299622, + "step": 17000 + }, + { + "epoch": 0.5313125, + "grad_norm": 3.28125, + "grad_norm_var": 0.05916239420572917, + "learning_rate": 0.0001, + "loss": 6.0018, + "loss/crossentropy": 2.6615259647369385, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.18441465497016907, + "step": 17002 + }, + { + "epoch": 0.531375, + "grad_norm": 3.09375, + "grad_norm_var": 0.0683013916015625, + "learning_rate": 0.0001, + "loss": 5.7757, + "loss/crossentropy": 2.6376312971115112, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16653899103403091, + "step": 17004 + }, + { + "epoch": 0.5314375, + "grad_norm": 3.140625, + "grad_norm_var": 0.06551106770833333, + "learning_rate": 0.0001, + "loss": 5.7551, + "loss/crossentropy": 2.6392234563827515, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1654955893754959, + "step": 17006 + }, + { + "epoch": 0.5315, + "grad_norm": 3.671875, + "grad_norm_var": 0.0814605712890625, + "learning_rate": 0.0001, + "loss": 5.8047, + "loss/crossentropy": 2.5538251399993896, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17469869554042816, + "step": 17008 + }, + { + "epoch": 0.5315625, + "grad_norm": 3.21875, + "grad_norm_var": 0.07419331868489583, + "learning_rate": 0.0001, + "loss": 5.7475, + "loss/crossentropy": 2.5746891498565674, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1680661141872406, + "step": 17010 + }, + { + "epoch": 0.531625, + "grad_norm": 2.984375, + "grad_norm_var": 0.06934305826822916, + "learning_rate": 0.0001, + "loss": 5.3914, + "loss/crossentropy": 2.348442792892456, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15858762711286545, + "step": 17012 + }, + { + "epoch": 0.5316875, + "grad_norm": 3.0625, + "grad_norm_var": 0.0548492431640625, + "learning_rate": 0.0001, + "loss": 5.6595, + "loss/crossentropy": 2.5538713932037354, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16563858091831207, + "step": 17014 + }, + { + "epoch": 0.53175, + "grad_norm": 3.328125, + "grad_norm_var": 0.05650634765625, + "learning_rate": 0.0001, + "loss": 6.0582, + "loss/crossentropy": 2.853422522544861, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17047683149576187, + "step": 17016 + }, + { + "epoch": 0.5318125, + "grad_norm": 3.078125, + "grad_norm_var": 0.049153645833333336, + "learning_rate": 0.0001, + "loss": 5.9466, + "loss/crossentropy": 2.690248489379883, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1748548224568367, + "step": 17018 + }, + { + "epoch": 0.531875, + "grad_norm": 3.125, + "grad_norm_var": 0.040999348958333334, + "learning_rate": 0.0001, + "loss": 5.9512, + "loss/crossentropy": 2.7441134452819824, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16797682642936707, + "step": 17020 + }, + { + "epoch": 0.5319375, + "grad_norm": 3.15625, + "grad_norm_var": 0.03736063639322917, + "learning_rate": 0.0001, + "loss": 5.7569, + "loss/crossentropy": 2.612870931625366, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16557930409908295, + "step": 17022 + }, + { + "epoch": 0.532, + "grad_norm": 2.78125, + "grad_norm_var": 0.029654947916666667, + "learning_rate": 0.0001, + "loss": 5.234, + "loss/crossentropy": 2.329947352409363, + "loss/hidden": 1.390625, + "loss/jsd": 0.0, + "loss/logits": 0.15133944153785706, + "step": 17024 + }, + { + "epoch": 0.5320625, + "grad_norm": 3.46875, + "grad_norm_var": 0.030671183268229166, + "learning_rate": 0.0001, + "loss": 6.0237, + "loss/crossentropy": 2.7970874309539795, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17343772947788239, + "step": 17026 + }, + { + "epoch": 0.532125, + "grad_norm": 3.03125, + "grad_norm_var": 0.03131103515625, + "learning_rate": 0.0001, + "loss": 5.8274, + "loss/crossentropy": 2.6069366931915283, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1732211410999298, + "step": 17028 + }, + { + "epoch": 0.5321875, + "grad_norm": 3.0, + "grad_norm_var": 0.027082316080729165, + "learning_rate": 0.0001, + "loss": 5.7569, + "loss/crossentropy": 2.6359177827835083, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16639314591884613, + "step": 17030 + }, + { + "epoch": 0.53225, + "grad_norm": 3.1875, + "grad_norm_var": 0.025813802083333334, + "learning_rate": 0.0001, + "loss": 5.764, + "loss/crossentropy": 2.572619676589966, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1710943728685379, + "step": 17032 + }, + { + "epoch": 0.5323125, + "grad_norm": 3.546875, + "grad_norm_var": 0.033219401041666666, + "learning_rate": 0.0001, + "loss": 5.9794, + "loss/crossentropy": 2.7105066776275635, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1749315857887268, + "step": 17034 + }, + { + "epoch": 0.532375, + "grad_norm": 3.046875, + "grad_norm_var": 0.03469645182291667, + "learning_rate": 0.0001, + "loss": 5.5228, + "loss/crossentropy": 2.423842668533325, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16106973588466644, + "step": 17036 + }, + { + "epoch": 0.5324375, + "grad_norm": 3.046875, + "grad_norm_var": 0.03502197265625, + "learning_rate": 0.0001, + "loss": 5.6784, + "loss/crossentropy": 2.511347532272339, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16904564201831818, + "step": 17038 + }, + { + "epoch": 0.5325, + "grad_norm": 2.859375, + "grad_norm_var": 0.0315582275390625, + "learning_rate": 0.0001, + "loss": 5.4597, + "loss/crossentropy": 2.4301384687423706, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15881867706775665, + "step": 17040 + }, + { + "epoch": 0.5325625, + "grad_norm": 3.0, + "grad_norm_var": 0.027925618489583335, + "learning_rate": 0.0001, + "loss": 5.5839, + "loss/crossentropy": 2.4871020317077637, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16085658222436905, + "step": 17042 + }, + { + "epoch": 0.532625, + "grad_norm": 2.984375, + "grad_norm_var": 0.024300130208333333, + "learning_rate": 0.0001, + "loss": 5.6928, + "loss/crossentropy": 2.5313451290130615, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16770461946725845, + "step": 17044 + }, + { + "epoch": 0.5326875, + "grad_norm": 3.0, + "grad_norm_var": 0.0280426025390625, + "learning_rate": 0.0001, + "loss": 5.8872, + "loss/crossentropy": 2.642845630645752, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17209375649690628, + "step": 17046 + }, + { + "epoch": 0.53275, + "grad_norm": 3.15625, + "grad_norm_var": 0.0261871337890625, + "learning_rate": 0.0001, + "loss": 5.7036, + "loss/crossentropy": 2.4725565910339355, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17310353368520737, + "step": 17048 + }, + { + "epoch": 0.5328125, + "grad_norm": 3.0, + "grad_norm_var": 0.012645467122395834, + "learning_rate": 0.0001, + "loss": 5.7822, + "loss/crossentropy": 2.6032555103302, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17219020426273346, + "step": 17050 + }, + { + "epoch": 0.532875, + "grad_norm": 3.359375, + "grad_norm_var": 0.018097941080729166, + "learning_rate": 0.0001, + "loss": 5.8159, + "loss/crossentropy": 2.618359684944153, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17092972993850708, + "step": 17052 + }, + { + "epoch": 0.5329375, + "grad_norm": 3.140625, + "grad_norm_var": 0.021532185872395835, + "learning_rate": 0.0001, + "loss": 5.6609, + "loss/crossentropy": 2.5374127626419067, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.163516603410244, + "step": 17054 + }, + { + "epoch": 0.533, + "grad_norm": 3.46875, + "grad_norm_var": 0.030296834309895833, + "learning_rate": 0.0001, + "loss": 5.6632, + "loss/crossentropy": 2.448095440864563, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.16838687658309937, + "step": 17056 + }, + { + "epoch": 0.5330625, + "grad_norm": 3.125, + "grad_norm_var": 0.028971354166666668, + "learning_rate": 0.0001, + "loss": 5.7122, + "loss/crossentropy": 2.5715551376342773, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1671854928135872, + "step": 17058 + }, + { + "epoch": 0.533125, + "grad_norm": 3.265625, + "grad_norm_var": 0.02720947265625, + "learning_rate": 0.0001, + "loss": 5.9343, + "loss/crossentropy": 2.7259186506271362, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17240135371685028, + "step": 17060 + }, + { + "epoch": 0.5331875, + "grad_norm": 3.21875, + "grad_norm_var": 0.023273722330729166, + "learning_rate": 0.0001, + "loss": 5.949, + "loss/crossentropy": 2.677462577819824, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17598193883895874, + "step": 17062 + }, + { + "epoch": 0.53325, + "grad_norm": 3.265625, + "grad_norm_var": 0.021805826822916666, + "learning_rate": 0.0001, + "loss": 5.4291, + "loss/crossentropy": 2.3025020360946655, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16343628615140915, + "step": 17064 + }, + { + "epoch": 0.5333125, + "grad_norm": 3.234375, + "grad_norm_var": 0.018550618489583334, + "learning_rate": 0.0001, + "loss": 5.9242, + "loss/crossentropy": 2.6051729917526245, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.18229009956121445, + "step": 17066 + }, + { + "epoch": 0.533375, + "grad_norm": 3.3125, + "grad_norm_var": 0.022379557291666668, + "learning_rate": 0.0001, + "loss": 5.6842, + "loss/crossentropy": 2.5065290927886963, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16815276443958282, + "step": 17068 + }, + { + "epoch": 0.5334375, + "grad_norm": 3.28125, + "grad_norm_var": 0.013313802083333333, + "learning_rate": 0.0001, + "loss": 5.8038, + "loss/crossentropy": 2.5707377195358276, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1733020544052124, + "step": 17070 + }, + { + "epoch": 0.5335, + "grad_norm": 3.34375, + "grad_norm_var": 0.014867146809895834, + "learning_rate": 0.0001, + "loss": 5.9117, + "loss/crossentropy": 2.6048630475997925, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17911715060472488, + "step": 17072 + }, + { + "epoch": 0.5335625, + "grad_norm": 3.34375, + "grad_norm_var": 0.15386962890625, + "learning_rate": 0.0001, + "loss": 6.1994, + "loss/crossentropy": 2.790488123893738, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18893983960151672, + "step": 17074 + }, + { + "epoch": 0.533625, + "grad_norm": 3.15625, + "grad_norm_var": 0.15868733723958334, + "learning_rate": 0.0001, + "loss": 5.6514, + "loss/crossentropy": 2.46434485912323, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16792075335979462, + "step": 17076 + }, + { + "epoch": 0.5336875, + "grad_norm": 3.0, + "grad_norm_var": 0.16470438639322918, + "learning_rate": 0.0001, + "loss": 5.5249, + "loss/crossentropy": 2.329636335372925, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1656169816851616, + "step": 17078 + }, + { + "epoch": 0.53375, + "grad_norm": 2.84375, + "grad_norm_var": 0.18338216145833333, + "learning_rate": 0.0001, + "loss": 5.5114, + "loss/crossentropy": 2.4661262035369873, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16039124131202698, + "step": 17080 + }, + { + "epoch": 0.5338125, + "grad_norm": 3.140625, + "grad_norm_var": 0.18459370930989583, + "learning_rate": 0.0001, + "loss": 5.7988, + "loss/crossentropy": 2.580346941947937, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17614497244358063, + "step": 17082 + }, + { + "epoch": 0.533875, + "grad_norm": 3.03125, + "grad_norm_var": 0.1836822509765625, + "learning_rate": 0.0001, + "loss": 5.4874, + "loss/crossentropy": 2.4446455240249634, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15662309527397156, + "step": 17084 + }, + { + "epoch": 0.5339375, + "grad_norm": 3.328125, + "grad_norm_var": 0.19029032389322917, + "learning_rate": 0.0001, + "loss": 5.8172, + "loss/crossentropy": 2.540899634361267, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17762987315654755, + "step": 17086 + }, + { + "epoch": 0.534, + "grad_norm": 2.921875, + "grad_norm_var": 0.19588216145833334, + "learning_rate": 0.0001, + "loss": 5.2604, + "loss/crossentropy": 2.3293145895004272, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15092363953590393, + "step": 17088 + }, + { + "epoch": 0.5340625, + "grad_norm": 3.140625, + "grad_norm_var": 0.052464803059895836, + "learning_rate": 0.0001, + "loss": 5.7763, + "loss/crossentropy": 2.595832109451294, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16999783366918564, + "step": 17090 + }, + { + "epoch": 0.534125, + "grad_norm": 2.9375, + "grad_norm_var": 0.05552469889322917, + "learning_rate": 0.0001, + "loss": 5.5164, + "loss/crossentropy": 2.4288792610168457, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16070610284805298, + "step": 17092 + }, + { + "epoch": 0.5341875, + "grad_norm": 3.125, + "grad_norm_var": 0.05373942057291667, + "learning_rate": 0.0001, + "loss": 5.6126, + "loss/crossentropy": 2.517060399055481, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1618964746594429, + "step": 17094 + }, + { + "epoch": 0.53425, + "grad_norm": 3.421875, + "grad_norm_var": 0.0662261962890625, + "learning_rate": 0.0001, + "loss": 6.0459, + "loss/crossentropy": 2.7155656814575195, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18224790692329407, + "step": 17096 + }, + { + "epoch": 0.5343125, + "grad_norm": 3.171875, + "grad_norm_var": 0.06583658854166667, + "learning_rate": 0.0001, + "loss": 6.0407, + "loss/crossentropy": 2.8346915245056152, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17255517095327377, + "step": 17098 + }, + { + "epoch": 0.534375, + "grad_norm": 3.125, + "grad_norm_var": 0.06780192057291666, + "learning_rate": 0.0001, + "loss": 5.7816, + "loss/crossentropy": 2.6422271728515625, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1670575961470604, + "step": 17100 + }, + { + "epoch": 0.5344375, + "grad_norm": 3.171875, + "grad_norm_var": 0.05739644368489583, + "learning_rate": 0.0001, + "loss": 5.8924, + "loss/crossentropy": 2.6804498434066772, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16884919255971909, + "step": 17102 + }, + { + "epoch": 0.5345, + "grad_norm": 3.453125, + "grad_norm_var": 0.055257161458333336, + "learning_rate": 0.0001, + "loss": 5.9318, + "loss/crossentropy": 2.646846652030945, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1784997135400772, + "step": 17104 + }, + { + "epoch": 0.5345625, + "grad_norm": 3.515625, + "grad_norm_var": 0.04442952473958333, + "learning_rate": 0.0001, + "loss": 5.5162, + "loss/crossentropy": 2.4204429388046265, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.162314772605896, + "step": 17106 + }, + { + "epoch": 0.534625, + "grad_norm": 3.609375, + "grad_norm_var": 0.05807291666666667, + "learning_rate": 0.0001, + "loss": 5.9085, + "loss/crossentropy": 2.5476585626602173, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18139329552650452, + "step": 17108 + }, + { + "epoch": 0.5346875, + "grad_norm": 2.9375, + "grad_norm_var": 0.06311442057291666, + "learning_rate": 0.0001, + "loss": 5.525, + "loss/crossentropy": 2.4291800260543823, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16036456823349, + "step": 17110 + }, + { + "epoch": 0.53475, + "grad_norm": 3.265625, + "grad_norm_var": 0.05123697916666667, + "learning_rate": 0.0001, + "loss": 5.8608, + "loss/crossentropy": 2.63621985912323, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1716729998588562, + "step": 17112 + }, + { + "epoch": 0.5348125, + "grad_norm": 2.84375, + "grad_norm_var": 0.062418619791666664, + "learning_rate": 0.0001, + "loss": 5.8591, + "loss/crossentropy": 2.6378746032714844, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17485301196575165, + "step": 17114 + }, + { + "epoch": 0.534875, + "grad_norm": 3.46875, + "grad_norm_var": 0.057470703125, + "learning_rate": 0.0001, + "loss": 5.8118, + "loss/crossentropy": 2.553487181663513, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17583421617746353, + "step": 17116 + }, + { + "epoch": 0.5349375, + "grad_norm": 2.859375, + "grad_norm_var": 0.0693359375, + "learning_rate": 0.0001, + "loss": 5.8919, + "loss/crossentropy": 2.6541894674301147, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17611850798130035, + "step": 17118 + }, + { + "epoch": 0.535, + "grad_norm": 2.890625, + "grad_norm_var": 0.07334696451822917, + "learning_rate": 0.0001, + "loss": 5.3697, + "loss/crossentropy": 2.3797430992126465, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15446283668279648, + "step": 17120 + }, + { + "epoch": 0.5350625, + "grad_norm": 3.140625, + "grad_norm_var": 0.07300516764322916, + "learning_rate": 0.0001, + "loss": 5.486, + "loss/crossentropy": 2.3793132305145264, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16222813725471497, + "step": 17122 + }, + { + "epoch": 0.535125, + "grad_norm": 3.0, + "grad_norm_var": 0.05357157389322917, + "learning_rate": 0.0001, + "loss": 5.9188, + "loss/crossentropy": 2.6369359493255615, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17427990585565567, + "step": 17124 + }, + { + "epoch": 0.5351875, + "grad_norm": 3.359375, + "grad_norm_var": 0.055150349934895836, + "learning_rate": 0.0001, + "loss": 6.0767, + "loss/crossentropy": 2.722614288330078, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18111039698123932, + "step": 17126 + }, + { + "epoch": 0.53525, + "grad_norm": 3.203125, + "grad_norm_var": 0.05728759765625, + "learning_rate": 0.0001, + "loss": 5.8847, + "loss/crossentropy": 2.6280394792556763, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1780138835310936, + "step": 17128 + }, + { + "epoch": 0.5353125, + "grad_norm": 3.140625, + "grad_norm_var": 0.04834696451822917, + "learning_rate": 0.0001, + "loss": 5.7332, + "loss/crossentropy": 2.53939950466156, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17132992297410965, + "step": 17130 + }, + { + "epoch": 0.535375, + "grad_norm": 2.921875, + "grad_norm_var": 0.049193318684895834, + "learning_rate": 0.0001, + "loss": 5.5148, + "loss/crossentropy": 2.465831995010376, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1627052202820778, + "step": 17132 + }, + { + "epoch": 0.5354375, + "grad_norm": 3.21875, + "grad_norm_var": 0.04259440104166667, + "learning_rate": 0.0001, + "loss": 5.5225, + "loss/crossentropy": 2.4369946718215942, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1644097939133644, + "step": 17134 + }, + { + "epoch": 0.5355, + "grad_norm": 3.109375, + "grad_norm_var": 0.03935546875, + "learning_rate": 0.0001, + "loss": 5.6981, + "loss/crossentropy": 2.542572021484375, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16672320663928986, + "step": 17136 + }, + { + "epoch": 0.5355625, + "grad_norm": 3.03125, + "grad_norm_var": 0.037328084309895836, + "learning_rate": 0.0001, + "loss": 5.6913, + "loss/crossentropy": 2.5692667961120605, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16571664810180664, + "step": 17138 + }, + { + "epoch": 0.535625, + "grad_norm": 2.96875, + "grad_norm_var": 0.030257161458333334, + "learning_rate": 0.0001, + "loss": 5.5473, + "loss/crossentropy": 2.5101238489151, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15840435028076172, + "step": 17140 + }, + { + "epoch": 0.5356875, + "grad_norm": 2.75, + "grad_norm_var": 0.02789306640625, + "learning_rate": 0.0001, + "loss": 5.5962, + "loss/crossentropy": 2.5291870832443237, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1598297655582428, + "step": 17142 + }, + { + "epoch": 0.53575, + "grad_norm": 3.140625, + "grad_norm_var": 0.025275675455729167, + "learning_rate": 0.0001, + "loss": 5.574, + "loss/crossentropy": 2.5178639888763428, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16029997169971466, + "step": 17144 + }, + { + "epoch": 0.5358125, + "grad_norm": 3.140625, + "grad_norm_var": 0.04504801432291667, + "learning_rate": 0.0001, + "loss": 5.8875, + "loss/crossentropy": 2.584782123565674, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17909906804561615, + "step": 17146 + }, + { + "epoch": 0.535875, + "grad_norm": 3.171875, + "grad_norm_var": 0.04586181640625, + "learning_rate": 0.0001, + "loss": 5.8854, + "loss/crossentropy": 2.757336974143982, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16671134531497955, + "step": 17148 + }, + { + "epoch": 0.5359375, + "grad_norm": 3.203125, + "grad_norm_var": 0.04638264973958333, + "learning_rate": 0.0001, + "loss": 5.4471, + "loss/crossentropy": 2.356896758079529, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16331280767917633, + "step": 17150 + }, + { + "epoch": 0.536, + "grad_norm": 2.984375, + "grad_norm_var": 0.047749837239583336, + "learning_rate": 0.0001, + "loss": 5.4821, + "loss/crossentropy": 2.4455692768096924, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15990079194307327, + "step": 17152 + }, + { + "epoch": 0.5360625, + "grad_norm": 3.375, + "grad_norm_var": 0.05371805826822917, + "learning_rate": 0.0001, + "loss": 5.7659, + "loss/crossentropy": 2.558240294456482, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16920633614063263, + "step": 17154 + }, + { + "epoch": 0.536125, + "grad_norm": 3.21875, + "grad_norm_var": 0.0527252197265625, + "learning_rate": 0.0001, + "loss": 6.0016, + "loss/crossentropy": 2.7091336250305176, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1788558065891266, + "step": 17156 + }, + { + "epoch": 0.5361875, + "grad_norm": 3.28125, + "grad_norm_var": 0.04584859212239583, + "learning_rate": 0.0001, + "loss": 5.8222, + "loss/crossentropy": 2.5406280755996704, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17580938339233398, + "step": 17158 + }, + { + "epoch": 0.53625, + "grad_norm": 3.09375, + "grad_norm_var": 0.07486063639322917, + "learning_rate": 0.0001, + "loss": 5.6154, + "loss/crossentropy": 2.43525767326355, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16996947675943375, + "step": 17160 + }, + { + "epoch": 0.5363125, + "grad_norm": 3.15625, + "grad_norm_var": 0.07149149576822916, + "learning_rate": 0.0001, + "loss": 5.6441, + "loss/crossentropy": 2.510724902153015, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16607657074928284, + "step": 17162 + }, + { + "epoch": 0.536375, + "grad_norm": 3.015625, + "grad_norm_var": 0.0732818603515625, + "learning_rate": 0.0001, + "loss": 5.4909, + "loss/crossentropy": 2.316679298877716, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1662546843290329, + "step": 17164 + }, + { + "epoch": 0.5364375, + "grad_norm": 3.421875, + "grad_norm_var": 0.0728668212890625, + "learning_rate": 0.0001, + "loss": 5.8561, + "loss/crossentropy": 2.56137752532959, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17595582455396652, + "step": 17166 + }, + { + "epoch": 0.5365, + "grad_norm": 3.09375, + "grad_norm_var": 0.0694976806640625, + "learning_rate": 0.0001, + "loss": 5.638, + "loss/crossentropy": 2.509562849998474, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1624525785446167, + "step": 17168 + }, + { + "epoch": 0.5365625, + "grad_norm": 3.234375, + "grad_norm_var": 0.06886393229166667, + "learning_rate": 0.0001, + "loss": 5.6331, + "loss/crossentropy": 2.5476927757263184, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1604909971356392, + "step": 17170 + }, + { + "epoch": 0.536625, + "grad_norm": 2.9375, + "grad_norm_var": 0.07893473307291667, + "learning_rate": 0.0001, + "loss": 5.5707, + "loss/crossentropy": 2.5319403409957886, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15700259804725647, + "step": 17172 + }, + { + "epoch": 0.5366875, + "grad_norm": 2.890625, + "grad_norm_var": 0.08466389973958334, + "learning_rate": 0.0001, + "loss": 5.5913, + "loss/crossentropy": 2.496548056602478, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1622052937746048, + "step": 17174 + }, + { + "epoch": 0.53675, + "grad_norm": 3.765625, + "grad_norm_var": 0.06318257649739584, + "learning_rate": 0.0001, + "loss": 5.779, + "loss/crossentropy": 2.561243414878845, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1721685826778412, + "step": 17176 + }, + { + "epoch": 0.5368125, + "grad_norm": 3.1875, + "grad_norm_var": 0.061644490559895834, + "learning_rate": 0.0001, + "loss": 5.7119, + "loss/crossentropy": 2.5556975603103638, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.17030397802591324, + "step": 17178 + }, + { + "epoch": 0.536875, + "grad_norm": 3.203125, + "grad_norm_var": 0.06154683430989583, + "learning_rate": 0.0001, + "loss": 5.8396, + "loss/crossentropy": 2.5738922357559204, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1777389496564865, + "step": 17180 + }, + { + "epoch": 0.5369375, + "grad_norm": 3.265625, + "grad_norm_var": 0.052668253580729164, + "learning_rate": 0.0001, + "loss": 5.898, + "loss/crossentropy": 2.605117678642273, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17850348353385925, + "step": 17182 + }, + { + "epoch": 0.537, + "grad_norm": 3.5, + "grad_norm_var": 0.0634918212890625, + "learning_rate": 0.0001, + "loss": 5.6326, + "loss/crossentropy": 2.5269027948379517, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16408953815698624, + "step": 17184 + }, + { + "epoch": 0.5370625, + "grad_norm": 3.0, + "grad_norm_var": 0.07398681640625, + "learning_rate": 0.0001, + "loss": 5.385, + "loss/crossentropy": 2.3865991830825806, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15608975291252136, + "step": 17186 + }, + { + "epoch": 0.537125, + "grad_norm": 2.90625, + "grad_norm_var": 0.07485249837239584, + "learning_rate": 0.0001, + "loss": 5.3913, + "loss/crossentropy": 2.3155194520950317, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16187888383865356, + "step": 17188 + }, + { + "epoch": 0.5371875, + "grad_norm": 3.140625, + "grad_norm_var": 0.07071024576822917, + "learning_rate": 0.0001, + "loss": 5.6339, + "loss/crossentropy": 2.4698076248168945, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1675824075937271, + "step": 17190 + }, + { + "epoch": 0.53725, + "grad_norm": 3.609375, + "grad_norm_var": 0.0535064697265625, + "learning_rate": 0.0001, + "loss": 5.6603, + "loss/crossentropy": 2.5393755435943604, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16755837947130203, + "step": 17192 + }, + { + "epoch": 0.5373125, + "grad_norm": 3.265625, + "grad_norm_var": 0.05611572265625, + "learning_rate": 0.0001, + "loss": 5.8479, + "loss/crossentropy": 2.6371690034866333, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1745872050523758, + "step": 17194 + }, + { + "epoch": 0.537375, + "grad_norm": 3.375, + "grad_norm_var": 0.06249593098958333, + "learning_rate": 0.0001, + "loss": 5.7458, + "loss/crossentropy": 2.528953194618225, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.16817187517881393, + "step": 17196 + }, + { + "epoch": 0.5374375, + "grad_norm": 3.171875, + "grad_norm_var": 0.06855367024739584, + "learning_rate": 0.0001, + "loss": 5.7345, + "loss/crossentropy": 2.479103684425354, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17436964064836502, + "step": 17198 + }, + { + "epoch": 0.5375, + "grad_norm": 3.0625, + "grad_norm_var": 0.05607808430989583, + "learning_rate": 0.0001, + "loss": 5.8815, + "loss/crossentropy": 2.6622745990753174, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17231357842683792, + "step": 17200 + }, + { + "epoch": 0.5375625, + "grad_norm": 2.9375, + "grad_norm_var": 0.0782135009765625, + "learning_rate": 0.0001, + "loss": 5.8238, + "loss/crossentropy": 2.6184778213500977, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16974829882383347, + "step": 17202 + }, + { + "epoch": 0.537625, + "grad_norm": 3.0, + "grad_norm_var": 0.0746490478515625, + "learning_rate": 0.0001, + "loss": 5.5295, + "loss/crossentropy": 2.4015225172042847, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16591988503932953, + "step": 17204 + }, + { + "epoch": 0.5376875, + "grad_norm": 3.6875, + "grad_norm_var": 0.08297526041666667, + "learning_rate": 0.0001, + "loss": 6.0323, + "loss/crossentropy": 2.739021420478821, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.18011116981506348, + "step": 17206 + }, + { + "epoch": 0.53775, + "grad_norm": 3.1875, + "grad_norm_var": 0.07071940104166667, + "learning_rate": 0.0001, + "loss": 5.8506, + "loss/crossentropy": 2.689476251602173, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16885115206241608, + "step": 17208 + }, + { + "epoch": 0.5378125, + "grad_norm": 3.421875, + "grad_norm_var": 0.06568094889322916, + "learning_rate": 0.0001, + "loss": 6.0803, + "loss/crossentropy": 2.750085473060608, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18145929276943207, + "step": 17210 + }, + { + "epoch": 0.537875, + "grad_norm": 2.75, + "grad_norm_var": 0.08286031087239583, + "learning_rate": 0.0001, + "loss": 5.185, + "loss/crossentropy": 2.2621421813964844, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.14932042360305786, + "step": 17212 + }, + { + "epoch": 0.5379375, + "grad_norm": 3.09375, + "grad_norm_var": 0.0863433837890625, + "learning_rate": 0.0001, + "loss": 5.6305, + "loss/crossentropy": 2.48822021484375, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.17047961056232452, + "step": 17214 + }, + { + "epoch": 0.538, + "grad_norm": 2.9375, + "grad_norm_var": 0.0932037353515625, + "learning_rate": 0.0001, + "loss": 5.6364, + "loss/crossentropy": 2.4415475130081177, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16909129917621613, + "step": 17216 + }, + { + "epoch": 0.5380625, + "grad_norm": 3.53125, + "grad_norm_var": 0.061278279622395834, + "learning_rate": 0.0001, + "loss": 6.0746, + "loss/crossentropy": 2.7563360929489136, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17830930650234222, + "step": 17218 + }, + { + "epoch": 0.538125, + "grad_norm": 3.171875, + "grad_norm_var": 0.06055399576822917, + "learning_rate": 0.0001, + "loss": 5.4381, + "loss/crossentropy": 2.384289860725403, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16085246950387955, + "step": 17220 + }, + { + "epoch": 0.5381875, + "grad_norm": 3.078125, + "grad_norm_var": 0.04368387858072917, + "learning_rate": 0.0001, + "loss": 5.5677, + "loss/crossentropy": 2.4485658407211304, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1638677716255188, + "step": 17222 + }, + { + "epoch": 0.53825, + "grad_norm": 3.28125, + "grad_norm_var": 0.045670572916666666, + "learning_rate": 0.0001, + "loss": 5.7756, + "loss/crossentropy": 2.669786214828491, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16409556567668915, + "step": 17224 + }, + { + "epoch": 0.5383125, + "grad_norm": 3.140625, + "grad_norm_var": 0.041890462239583336, + "learning_rate": 0.0001, + "loss": 5.7154, + "loss/crossentropy": 2.5550438165664673, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16603317111730576, + "step": 17226 + }, + { + "epoch": 0.538375, + "grad_norm": 3.3125, + "grad_norm_var": 0.033299763997395836, + "learning_rate": 0.0001, + "loss": 5.7385, + "loss/crossentropy": 2.474865198135376, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1767580509185791, + "step": 17228 + }, + { + "epoch": 0.5384375, + "grad_norm": 2.9375, + "grad_norm_var": 0.036942545572916666, + "learning_rate": 0.0001, + "loss": 5.6299, + "loss/crossentropy": 2.5964972972869873, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1603715643286705, + "step": 17230 + }, + { + "epoch": 0.5385, + "grad_norm": 3.265625, + "grad_norm_var": 0.031103515625, + "learning_rate": 0.0001, + "loss": 5.5366, + "loss/crossentropy": 2.4126774072647095, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1659061163663864, + "step": 17232 + }, + { + "epoch": 0.5385625, + "grad_norm": 3.0625, + "grad_norm_var": 0.02265625, + "learning_rate": 0.0001, + "loss": 5.6391, + "loss/crossentropy": 2.5527502298355103, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1609746515750885, + "step": 17234 + }, + { + "epoch": 0.538625, + "grad_norm": 3.328125, + "grad_norm_var": 0.024559529622395833, + "learning_rate": 0.0001, + "loss": 5.9366, + "loss/crossentropy": 2.702728271484375, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17690350860357285, + "step": 17236 + }, + { + "epoch": 0.5386875, + "grad_norm": 2.953125, + "grad_norm_var": 0.027171834309895834, + "learning_rate": 0.0001, + "loss": 5.7457, + "loss/crossentropy": 2.6052632331848145, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1671736240386963, + "step": 17238 + }, + { + "epoch": 0.53875, + "grad_norm": 3.046875, + "grad_norm_var": 0.025706990559895834, + "learning_rate": 0.0001, + "loss": 5.8378, + "loss/crossentropy": 2.632414698600769, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.17600620537996292, + "step": 17240 + }, + { + "epoch": 0.5388125, + "grad_norm": 3.40625, + "grad_norm_var": 0.027372233072916665, + "learning_rate": 0.0001, + "loss": 5.9147, + "loss/crossentropy": 2.611965537071228, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.18144212663173676, + "step": 17242 + }, + { + "epoch": 0.538875, + "grad_norm": 3.0625, + "grad_norm_var": 0.027372233072916665, + "learning_rate": 0.0001, + "loss": 5.3358, + "loss/crossentropy": 2.3390179872512817, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15436232835054398, + "step": 17244 + }, + { + "epoch": 0.5389375, + "grad_norm": 3.25, + "grad_norm_var": 0.024193318684895833, + "learning_rate": 0.0001, + "loss": 6.0388, + "loss/crossentropy": 2.7092678546905518, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17904328554868698, + "step": 17246 + }, + { + "epoch": 0.539, + "grad_norm": 2.875, + "grad_norm_var": 0.027815755208333334, + "learning_rate": 0.0001, + "loss": 5.5042, + "loss/crossentropy": 2.4605921506881714, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1621706485748291, + "step": 17248 + }, + { + "epoch": 0.5390625, + "grad_norm": 3.109375, + "grad_norm_var": 0.03173828125, + "learning_rate": 0.0001, + "loss": 6.1756, + "loss/crossentropy": 2.8003345727920532, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18284225463867188, + "step": 17250 + }, + { + "epoch": 0.539125, + "grad_norm": 3.453125, + "grad_norm_var": 0.03570556640625, + "learning_rate": 0.0001, + "loss": 6.1073, + "loss/crossentropy": 2.734965682029724, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1833227127790451, + "step": 17252 + }, + { + "epoch": 0.5391875, + "grad_norm": 3.140625, + "grad_norm_var": 0.031201171875, + "learning_rate": 0.0001, + "loss": 5.496, + "loss/crossentropy": 2.426445722579956, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15929921716451645, + "step": 17254 + }, + { + "epoch": 0.53925, + "grad_norm": 3.125, + "grad_norm_var": 0.03459370930989583, + "learning_rate": 0.0001, + "loss": 5.4814, + "loss/crossentropy": 2.4185190200805664, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15902447700500488, + "step": 17256 + }, + { + "epoch": 0.5393125, + "grad_norm": 3.046875, + "grad_norm_var": 0.031208292643229166, + "learning_rate": 0.0001, + "loss": 5.7525, + "loss/crossentropy": 2.623996615409851, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16402489691972733, + "step": 17258 + }, + { + "epoch": 0.539375, + "grad_norm": 3.265625, + "grad_norm_var": 0.030394490559895834, + "learning_rate": 0.0001, + "loss": 5.3717, + "loss/crossentropy": 2.283425211906433, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16546471416950226, + "step": 17260 + }, + { + "epoch": 0.5394375, + "grad_norm": 3.375, + "grad_norm_var": 0.12984619140625, + "learning_rate": 0.0001, + "loss": 5.9672, + "loss/crossentropy": 2.675995349884033, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17794667184352875, + "step": 17262 + }, + { + "epoch": 0.5395, + "grad_norm": 3.046875, + "grad_norm_var": 0.1250640869140625, + "learning_rate": 0.0001, + "loss": 5.7718, + "loss/crossentropy": 2.583772897720337, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16880057007074356, + "step": 17264 + }, + { + "epoch": 0.5395625, + "grad_norm": 3.21875, + "grad_norm_var": 0.12254231770833333, + "learning_rate": 0.0001, + "loss": 6.3176, + "loss/crossentropy": 2.9289597272872925, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18808703124523163, + "step": 17266 + }, + { + "epoch": 0.539625, + "grad_norm": 3.734375, + "grad_norm_var": 0.13996480305989584, + "learning_rate": 0.0001, + "loss": 5.7947, + "loss/crossentropy": 2.5368372201919556, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17618079483509064, + "step": 17268 + }, + { + "epoch": 0.5396875, + "grad_norm": 3.40625, + "grad_norm_var": 0.14003804524739583, + "learning_rate": 0.0001, + "loss": 5.8048, + "loss/crossentropy": 2.4929646253585815, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18001490086317062, + "step": 17270 + }, + { + "epoch": 0.53975, + "grad_norm": 3.171875, + "grad_norm_var": 0.1302642822265625, + "learning_rate": 0.0001, + "loss": 5.8942, + "loss/crossentropy": 2.701943278312683, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16961456090211868, + "step": 17272 + }, + { + "epoch": 0.5398125, + "grad_norm": 3.015625, + "grad_norm_var": 0.1318511962890625, + "learning_rate": 0.0001, + "loss": 5.328, + "loss/crossentropy": 2.232863187789917, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.15755825489759445, + "step": 17274 + }, + { + "epoch": 0.539875, + "grad_norm": 3.5, + "grad_norm_var": 0.132177734375, + "learning_rate": 0.0001, + "loss": 5.6895, + "loss/crossentropy": 2.518621563911438, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16747979819774628, + "step": 17276 + }, + { + "epoch": 0.5399375, + "grad_norm": 3.3125, + "grad_norm_var": 0.0429595947265625, + "learning_rate": 0.0001, + "loss": 5.5796, + "loss/crossentropy": 2.4978870153427124, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1624642163515091, + "step": 17278 + }, + { + "epoch": 0.54, + "grad_norm": 2.828125, + "grad_norm_var": 0.04932352701822917, + "learning_rate": 0.0001, + "loss": 5.3894, + "loss/crossentropy": 2.368393898010254, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1567913517355919, + "step": 17280 + }, + { + "epoch": 0.5400625, + "grad_norm": 3.328125, + "grad_norm_var": 0.049738566080729164, + "learning_rate": 0.0001, + "loss": 5.7971, + "loss/crossentropy": 2.4818142652511597, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17879413068294525, + "step": 17282 + }, + { + "epoch": 0.540125, + "grad_norm": 3.046875, + "grad_norm_var": 0.03406473795572917, + "learning_rate": 0.0001, + "loss": 5.7518, + "loss/crossentropy": 2.5515873432159424, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.16650700569152832, + "step": 17284 + }, + { + "epoch": 0.5401875, + "grad_norm": 3.828125, + "grad_norm_var": 0.07256571451822917, + "learning_rate": 0.0001, + "loss": 6.216, + "loss/crossentropy": 2.8061646223068237, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.18395615369081497, + "step": 17286 + }, + { + "epoch": 0.54025, + "grad_norm": 3.5625, + "grad_norm_var": 0.09384663899739583, + "learning_rate": 0.0001, + "loss": 5.8815, + "loss/crossentropy": 2.5948829650878906, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17319124191999435, + "step": 17288 + }, + { + "epoch": 0.5403125, + "grad_norm": 3.015625, + "grad_norm_var": 0.092529296875, + "learning_rate": 0.0001, + "loss": 5.5779, + "loss/crossentropy": 2.507999897003174, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.160506471991539, + "step": 17290 + }, + { + "epoch": 0.540375, + "grad_norm": 3.265625, + "grad_norm_var": 0.09810791015625, + "learning_rate": 0.0001, + "loss": 5.6686, + "loss/crossentropy": 2.5252314805984497, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16785302013158798, + "step": 17292 + }, + { + "epoch": 0.5404375, + "grad_norm": 3.203125, + "grad_norm_var": 0.09640299479166667, + "learning_rate": 0.0001, + "loss": 5.9182, + "loss/crossentropy": 2.676273226737976, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1741974800825119, + "step": 17294 + }, + { + "epoch": 0.5405, + "grad_norm": 2.859375, + "grad_norm_var": 0.09888916015625, + "learning_rate": 0.0001, + "loss": 5.1875, + "loss/crossentropy": 2.3228524923324585, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1442803516983986, + "step": 17296 + }, + { + "epoch": 0.5405625, + "grad_norm": 3.21875, + "grad_norm_var": 0.0994293212890625, + "learning_rate": 0.0001, + "loss": 5.5537, + "loss/crossentropy": 2.42146372795105, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16400451213121414, + "step": 17298 + }, + { + "epoch": 0.540625, + "grad_norm": 3.0625, + "grad_norm_var": 0.09726460774739583, + "learning_rate": 0.0001, + "loss": 5.7194, + "loss/crossentropy": 2.5954188108444214, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16473744064569473, + "step": 17300 + }, + { + "epoch": 0.5406875, + "grad_norm": 3.15625, + "grad_norm_var": 0.05632222493489583, + "learning_rate": 0.0001, + "loss": 5.8397, + "loss/crossentropy": 2.7246214151382446, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16580431908369064, + "step": 17302 + }, + { + "epoch": 0.54075, + "grad_norm": 3.078125, + "grad_norm_var": 0.015672810872395835, + "learning_rate": 0.0001, + "loss": 5.3303, + "loss/crossentropy": 2.2647517919540405, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15889877825975418, + "step": 17304 + }, + { + "epoch": 0.5408125, + "grad_norm": 3.125, + "grad_norm_var": 0.013841756184895833, + "learning_rate": 0.0001, + "loss": 5.6368, + "loss/crossentropy": 2.5183417797088623, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16379909217357635, + "step": 17306 + }, + { + "epoch": 0.540875, + "grad_norm": 3.21875, + "grad_norm_var": 0.017113240559895833, + "learning_rate": 0.0001, + "loss": 6.0625, + "loss/crossentropy": 2.7601245641708374, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17906256020069122, + "step": 17308 + }, + { + "epoch": 0.5409375, + "grad_norm": 3.0625, + "grad_norm_var": 0.016487630208333333, + "learning_rate": 0.0001, + "loss": 5.7098, + "loss/crossentropy": 2.5399783849716187, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1704939901828766, + "step": 17310 + }, + { + "epoch": 0.541, + "grad_norm": 4.0, + "grad_norm_var": 0.05878499348958333, + "learning_rate": 0.0001, + "loss": 5.8959, + "loss/crossentropy": 2.6711167097091675, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1713021695613861, + "step": 17312 + }, + { + "epoch": 0.5410625, + "grad_norm": 3.0, + "grad_norm_var": 0.11057027180989583, + "learning_rate": 0.0001, + "loss": 6.0762, + "loss/crossentropy": 2.7620147466659546, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18024317920207977, + "step": 17314 + }, + { + "epoch": 0.541125, + "grad_norm": 3.3125, + "grad_norm_var": 0.10852762858072916, + "learning_rate": 0.0001, + "loss": 5.5332, + "loss/crossentropy": 2.3659234046936035, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16594360023736954, + "step": 17316 + }, + { + "epoch": 0.5411875, + "grad_norm": 3.265625, + "grad_norm_var": 0.10676167805989584, + "learning_rate": 0.0001, + "loss": 6.0635, + "loss/crossentropy": 2.782996892929077, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17726892232894897, + "step": 17318 + }, + { + "epoch": 0.54125, + "grad_norm": 3.28125, + "grad_norm_var": 0.10407613118489584, + "learning_rate": 0.0001, + "loss": 5.6753, + "loss/crossentropy": 2.587460994720459, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.15722176432609558, + "step": 17320 + }, + { + "epoch": 0.5413125, + "grad_norm": 3.03125, + "grad_norm_var": 0.10534566243489583, + "learning_rate": 0.0001, + "loss": 5.5398, + "loss/crossentropy": 2.4110758304595947, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16013644635677338, + "step": 17322 + }, + { + "epoch": 0.541375, + "grad_norm": 3.421875, + "grad_norm_var": 0.1089508056640625, + "learning_rate": 0.0001, + "loss": 5.8649, + "loss/crossentropy": 2.5966124534606934, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17526501417160034, + "step": 17324 + }, + { + "epoch": 0.5414375, + "grad_norm": 3.3125, + "grad_norm_var": 0.11116434733072916, + "learning_rate": 0.0001, + "loss": 5.8719, + "loss/crossentropy": 2.535356879234314, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17935575544834137, + "step": 17326 + }, + { + "epoch": 0.5415, + "grad_norm": 3.3125, + "grad_norm_var": 0.0928863525390625, + "learning_rate": 0.0001, + "loss": 5.3556, + "loss/crossentropy": 2.3893096446990967, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15521960705518723, + "step": 17328 + }, + { + "epoch": 0.5415625, + "grad_norm": 2.921875, + "grad_norm_var": 0.06511942545572917, + "learning_rate": 0.0001, + "loss": 5.9359, + "loss/crossentropy": 2.7159135341644287, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17239267379045486, + "step": 17330 + }, + { + "epoch": 0.541625, + "grad_norm": 3.078125, + "grad_norm_var": 0.07737223307291667, + "learning_rate": 0.0001, + "loss": 5.7177, + "loss/crossentropy": 2.5383517742156982, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1675461083650589, + "step": 17332 + }, + { + "epoch": 0.5416875, + "grad_norm": 3.34375, + "grad_norm_var": 0.07959696451822916, + "learning_rate": 0.0001, + "loss": 5.5848, + "loss/crossentropy": 2.3904402256011963, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17217034846544266, + "step": 17334 + }, + { + "epoch": 0.54175, + "grad_norm": 3.03125, + "grad_norm_var": 0.080126953125, + "learning_rate": 0.0001, + "loss": 5.7477, + "loss/crossentropy": 2.5591678619384766, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17041853815317154, + "step": 17336 + }, + { + "epoch": 0.5418125, + "grad_norm": 2.9375, + "grad_norm_var": 0.090869140625, + "learning_rate": 0.0001, + "loss": 6.0124, + "loss/crossentropy": 2.7603390216827393, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17364423722028732, + "step": 17338 + }, + { + "epoch": 0.541875, + "grad_norm": 3.25, + "grad_norm_var": 0.08847249348958333, + "learning_rate": 0.0001, + "loss": 5.8721, + "loss/crossentropy": 2.6525124311447144, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17196206003427505, + "step": 17340 + }, + { + "epoch": 0.5419375, + "grad_norm": 3.140625, + "grad_norm_var": 0.071923828125, + "learning_rate": 0.0001, + "loss": 5.7777, + "loss/crossentropy": 2.570147156715393, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16997401416301727, + "step": 17342 + }, + { + "epoch": 0.542, + "grad_norm": 2.9375, + "grad_norm_var": 0.07389322916666667, + "learning_rate": 0.0001, + "loss": 5.7201, + "loss/crossentropy": 2.595253825187683, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16365204751491547, + "step": 17344 + }, + { + "epoch": 0.5420625, + "grad_norm": 2.890625, + "grad_norm_var": 0.06341044108072917, + "learning_rate": 0.0001, + "loss": 5.6749, + "loss/crossentropy": 2.5351758003234863, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1635787934064865, + "step": 17346 + }, + { + "epoch": 0.542125, + "grad_norm": 3.078125, + "grad_norm_var": 0.04690348307291667, + "learning_rate": 0.0001, + "loss": 5.8137, + "loss/crossentropy": 2.6436641216278076, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17012985795736313, + "step": 17348 + }, + { + "epoch": 0.5421875, + "grad_norm": 3.140625, + "grad_norm_var": 0.044066365559895834, + "learning_rate": 0.0001, + "loss": 5.544, + "loss/crossentropy": 2.4790529012680054, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1600080132484436, + "step": 17350 + }, + { + "epoch": 0.54225, + "grad_norm": 2.890625, + "grad_norm_var": 0.046533203125, + "learning_rate": 0.0001, + "loss": 5.5663, + "loss/crossentropy": 2.4919902086257935, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1636843979358673, + "step": 17352 + }, + { + "epoch": 0.5423125, + "grad_norm": 3.15625, + "grad_norm_var": 0.0270660400390625, + "learning_rate": 0.0001, + "loss": 5.7964, + "loss/crossentropy": 2.619598388671875, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16807474195957184, + "step": 17354 + }, + { + "epoch": 0.542375, + "grad_norm": 3.1875, + "grad_norm_var": 0.014045206705729167, + "learning_rate": 0.0001, + "loss": 5.5497, + "loss/crossentropy": 2.488608479499817, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1557212769985199, + "step": 17356 + }, + { + "epoch": 0.5424375, + "grad_norm": 2.921875, + "grad_norm_var": 0.014058430989583334, + "learning_rate": 0.0001, + "loss": 5.4728, + "loss/crossentropy": 2.4885571002960205, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15467827767133713, + "step": 17358 + }, + { + "epoch": 0.5425, + "grad_norm": 3.328125, + "grad_norm_var": 0.015941365559895834, + "learning_rate": 0.0001, + "loss": 5.8309, + "loss/crossentropy": 2.5929763317108154, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17457586526870728, + "step": 17360 + }, + { + "epoch": 0.5425625, + "grad_norm": 2.859375, + "grad_norm_var": 0.01705322265625, + "learning_rate": 0.0001, + "loss": 5.6502, + "loss/crossentropy": 2.5163358449935913, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1653357744216919, + "step": 17362 + }, + { + "epoch": 0.542625, + "grad_norm": 3.046875, + "grad_norm_var": 0.014969889322916667, + "learning_rate": 0.0001, + "loss": 5.9868, + "loss/crossentropy": 2.7236123085021973, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17475664615631104, + "step": 17364 + }, + { + "epoch": 0.5426875, + "grad_norm": 3.703125, + "grad_norm_var": 0.04247639973958333, + "learning_rate": 0.0001, + "loss": 5.7169, + "loss/crossentropy": 2.5115528106689453, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17404936999082565, + "step": 17366 + }, + { + "epoch": 0.54275, + "grad_norm": 3.328125, + "grad_norm_var": 0.04132486979166667, + "learning_rate": 0.0001, + "loss": 5.4933, + "loss/crossentropy": 2.438134789466858, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1574733927845955, + "step": 17368 + }, + { + "epoch": 0.5428125, + "grad_norm": 2.96875, + "grad_norm_var": 0.04158426920572917, + "learning_rate": 0.0001, + "loss": 5.662, + "loss/crossentropy": 2.606270432472229, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16260183602571487, + "step": 17370 + }, + { + "epoch": 0.542875, + "grad_norm": 3.5, + "grad_norm_var": 0.0514068603515625, + "learning_rate": 0.0001, + "loss": 5.7725, + "loss/crossentropy": 2.52018940448761, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1717132329940796, + "step": 17372 + }, + { + "epoch": 0.5429375, + "grad_norm": 3.078125, + "grad_norm_var": 0.0446929931640625, + "learning_rate": 0.0001, + "loss": 5.8875, + "loss/crossentropy": 2.6899309158325195, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1717073693871498, + "step": 17374 + }, + { + "epoch": 0.543, + "grad_norm": 3.078125, + "grad_norm_var": 0.043431599934895836, + "learning_rate": 0.0001, + "loss": 5.5822, + "loss/crossentropy": 2.4854485988616943, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1604522168636322, + "step": 17376 + }, + { + "epoch": 0.5430625, + "grad_norm": 3.15625, + "grad_norm_var": 0.03583577473958333, + "learning_rate": 0.0001, + "loss": 5.6836, + "loss/crossentropy": 2.4923148155212402, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17029929906129837, + "step": 17378 + }, + { + "epoch": 0.543125, + "grad_norm": 3.0625, + "grad_norm_var": 0.03453369140625, + "learning_rate": 0.0001, + "loss": 5.6612, + "loss/crossentropy": 2.5236387252807617, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16571299731731415, + "step": 17380 + }, + { + "epoch": 0.5431875, + "grad_norm": 3.0, + "grad_norm_var": 0.0330963134765625, + "learning_rate": 0.0001, + "loss": 5.8322, + "loss/crossentropy": 2.6035473346710205, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.171689473092556, + "step": 17382 + }, + { + "epoch": 0.54325, + "grad_norm": 3.21875, + "grad_norm_var": 0.0328033447265625, + "learning_rate": 0.0001, + "loss": 5.7477, + "loss/crossentropy": 2.5189948081970215, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1748279556632042, + "step": 17384 + }, + { + "epoch": 0.5433125, + "grad_norm": 3.03125, + "grad_norm_var": 0.032389322916666664, + "learning_rate": 0.0001, + "loss": 5.5113, + "loss/crossentropy": 2.313473343849182, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17251309007406235, + "step": 17386 + }, + { + "epoch": 0.543375, + "grad_norm": 2.921875, + "grad_norm_var": 0.029150390625, + "learning_rate": 0.0001, + "loss": 5.7159, + "loss/crossentropy": 2.5559000968933105, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16756704449653625, + "step": 17388 + }, + { + "epoch": 0.5434375, + "grad_norm": 2.9375, + "grad_norm_var": 0.0356842041015625, + "learning_rate": 0.0001, + "loss": 5.5085, + "loss/crossentropy": 2.4819912910461426, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15811563283205032, + "step": 17390 + }, + { + "epoch": 0.5435, + "grad_norm": 3.484375, + "grad_norm_var": 0.0461090087890625, + "learning_rate": 0.0001, + "loss": 5.5871, + "loss/crossentropy": 2.3881219625473022, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16755874454975128, + "step": 17392 + }, + { + "epoch": 0.5435625, + "grad_norm": 2.84375, + "grad_norm_var": 0.05329488118489583, + "learning_rate": 0.0001, + "loss": 5.2546, + "loss/crossentropy": 2.2453823685646057, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.15053172409534454, + "step": 17394 + }, + { + "epoch": 0.543625, + "grad_norm": 3.21875, + "grad_norm_var": 0.052489217122395834, + "learning_rate": 0.0001, + "loss": 5.8638, + "loss/crossentropy": 2.674225926399231, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17091196030378342, + "step": 17396 + }, + { + "epoch": 0.5436875, + "grad_norm": 3.78125, + "grad_norm_var": 0.06104227701822917, + "learning_rate": 0.0001, + "loss": 5.9071, + "loss/crossentropy": 2.685681104660034, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1733183041214943, + "step": 17398 + }, + { + "epoch": 0.54375, + "grad_norm": 3.734375, + "grad_norm_var": 0.10305582682291667, + "learning_rate": 0.0001, + "loss": 5.8124, + "loss/crossentropy": 2.4964096546173096, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17925040423870087, + "step": 17400 + }, + { + "epoch": 0.5438125, + "grad_norm": 2.984375, + "grad_norm_var": 0.1107574462890625, + "learning_rate": 0.0001, + "loss": 5.6789, + "loss/crossentropy": 2.5261625051498413, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16878822445869446, + "step": 17402 + }, + { + "epoch": 0.543875, + "grad_norm": 3.25, + "grad_norm_var": 0.10347900390625, + "learning_rate": 0.0001, + "loss": 5.7427, + "loss/crossentropy": 2.527507185935974, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1726917028427124, + "step": 17404 + }, + { + "epoch": 0.5439375, + "grad_norm": 3.359375, + "grad_norm_var": 0.089404296875, + "learning_rate": 0.0001, + "loss": 5.6494, + "loss/crossentropy": 2.5336248874664307, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16626831889152527, + "step": 17406 + }, + { + "epoch": 0.544, + "grad_norm": 3.1875, + "grad_norm_var": 0.08929036458333334, + "learning_rate": 0.0001, + "loss": 5.783, + "loss/crossentropy": 2.623347759246826, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16908900439739227, + "step": 17408 + }, + { + "epoch": 0.5440625, + "grad_norm": 2.90625, + "grad_norm_var": 0.08642476399739583, + "learning_rate": 0.0001, + "loss": 5.5582, + "loss/crossentropy": 2.471962809562683, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16526072472333908, + "step": 17410 + }, + { + "epoch": 0.544125, + "grad_norm": 3.0, + "grad_norm_var": 0.0893707275390625, + "learning_rate": 0.0001, + "loss": 5.6715, + "loss/crossentropy": 2.559972047805786, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1650635376572609, + "step": 17412 + }, + { + "epoch": 0.5441875, + "grad_norm": 2.859375, + "grad_norm_var": 0.075146484375, + "learning_rate": 0.0001, + "loss": 5.6099, + "loss/crossentropy": 2.588362455368042, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15762421488761902, + "step": 17414 + }, + { + "epoch": 0.54425, + "grad_norm": 2.953125, + "grad_norm_var": 0.023193359375, + "learning_rate": 0.0001, + "loss": 5.4951, + "loss/crossentropy": 2.460350275039673, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15972907841205597, + "step": 17416 + }, + { + "epoch": 0.5443125, + "grad_norm": 3.234375, + "grad_norm_var": 0.022412109375, + "learning_rate": 0.0001, + "loss": 5.9098, + "loss/crossentropy": 2.674430251121521, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17353634536266327, + "step": 17418 + }, + { + "epoch": 0.544375, + "grad_norm": 3.265625, + "grad_norm_var": 0.026447550455729166, + "learning_rate": 0.0001, + "loss": 5.7901, + "loss/crossentropy": 2.562321186065674, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17160680145025253, + "step": 17420 + }, + { + "epoch": 0.5444375, + "grad_norm": 3.203125, + "grad_norm_var": 0.022980753580729166, + "learning_rate": 0.0001, + "loss": 5.9997, + "loss/crossentropy": 2.7496341466903687, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17305376380681992, + "step": 17422 + }, + { + "epoch": 0.5445, + "grad_norm": 3.125, + "grad_norm_var": 0.025715128580729166, + "learning_rate": 0.0001, + "loss": 5.5739, + "loss/crossentropy": 2.4427828788757324, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1674105003476143, + "step": 17424 + }, + { + "epoch": 0.5445625, + "grad_norm": 3.203125, + "grad_norm_var": 0.025, + "learning_rate": 0.0001, + "loss": 5.6343, + "loss/crossentropy": 2.4660485982894897, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16799865663051605, + "step": 17426 + }, + { + "epoch": 0.544625, + "grad_norm": 3.25, + "grad_norm_var": 0.026883951822916665, + "learning_rate": 0.0001, + "loss": 5.8551, + "loss/crossentropy": 2.63783597946167, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1736757457256317, + "step": 17428 + }, + { + "epoch": 0.5446875, + "grad_norm": 3.578125, + "grad_norm_var": 0.0374420166015625, + "learning_rate": 0.0001, + "loss": 5.9258, + "loss/crossentropy": 2.6419161558151245, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1744867041707039, + "step": 17430 + }, + { + "epoch": 0.54475, + "grad_norm": 3.0625, + "grad_norm_var": 0.028548177083333334, + "learning_rate": 0.0001, + "loss": 5.9458, + "loss/crossentropy": 2.711817502975464, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17339904606342316, + "step": 17432 + }, + { + "epoch": 0.5448125, + "grad_norm": 3.1875, + "grad_norm_var": 0.0311431884765625, + "learning_rate": 0.0001, + "loss": 5.771, + "loss/crossentropy": 2.637289881706238, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16493095457553864, + "step": 17434 + }, + { + "epoch": 0.544875, + "grad_norm": 3.265625, + "grad_norm_var": 0.0318756103515625, + "learning_rate": 0.0001, + "loss": 6.0217, + "loss/crossentropy": 2.7659993171691895, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17478535324335098, + "step": 17436 + }, + { + "epoch": 0.5449375, + "grad_norm": 3.0625, + "grad_norm_var": 0.034765625, + "learning_rate": 0.0001, + "loss": 5.7704, + "loss/crossentropy": 2.5579363107681274, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17124298959970474, + "step": 17438 + }, + { + "epoch": 0.545, + "grad_norm": 3.3125, + "grad_norm_var": 0.029130045572916666, + "learning_rate": 0.0001, + "loss": 5.8219, + "loss/crossentropy": 2.633441209793091, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16884766519069672, + "step": 17440 + }, + { + "epoch": 0.5450625, + "grad_norm": 2.890625, + "grad_norm_var": 0.036116536458333334, + "learning_rate": 0.0001, + "loss": 5.746, + "loss/crossentropy": 2.5905871391296387, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16749078035354614, + "step": 17442 + }, + { + "epoch": 0.545125, + "grad_norm": 3.0625, + "grad_norm_var": 0.03466389973958333, + "learning_rate": 0.0001, + "loss": 5.4613, + "loss/crossentropy": 2.308797836303711, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1679813116788864, + "step": 17444 + }, + { + "epoch": 0.5451875, + "grad_norm": 3.328125, + "grad_norm_var": 0.024344889322916667, + "learning_rate": 0.0001, + "loss": 6.0682, + "loss/crossentropy": 2.7605719566345215, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17880941927433014, + "step": 17446 + }, + { + "epoch": 0.54525, + "grad_norm": 3.0625, + "grad_norm_var": 0.02330322265625, + "learning_rate": 0.0001, + "loss": 5.7359, + "loss/crossentropy": 2.619320511817932, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1667317971587181, + "step": 17448 + }, + { + "epoch": 0.5453125, + "grad_norm": 3.09375, + "grad_norm_var": 0.022900390625, + "learning_rate": 0.0001, + "loss": 5.8526, + "loss/crossentropy": 2.616609811782837, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17672811448574066, + "step": 17450 + }, + { + "epoch": 0.545375, + "grad_norm": 3.09375, + "grad_norm_var": 0.019563802083333335, + "learning_rate": 0.0001, + "loss": 5.7392, + "loss/crossentropy": 2.5301653146743774, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17090477049350739, + "step": 17452 + }, + { + "epoch": 0.5454375, + "grad_norm": 3.125, + "grad_norm_var": 0.016402180989583334, + "learning_rate": 0.0001, + "loss": 5.8981, + "loss/crossentropy": 2.665869116783142, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17166230827569962, + "step": 17454 + }, + { + "epoch": 0.5455, + "grad_norm": 3.21875, + "grad_norm_var": 0.013899739583333333, + "learning_rate": 0.0001, + "loss": 5.4047, + "loss/crossentropy": 2.3639109134674072, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15798667073249817, + "step": 17456 + }, + { + "epoch": 0.5455625, + "grad_norm": 2.90625, + "grad_norm_var": 0.015583292643229166, + "learning_rate": 0.0001, + "loss": 5.3614, + "loss/crossentropy": 2.4397586584091187, + "loss/hidden": 1.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.1519285961985588, + "step": 17458 + }, + { + "epoch": 0.545625, + "grad_norm": 3.125, + "grad_norm_var": 0.014937337239583333, + "learning_rate": 0.0001, + "loss": 5.5904, + "loss/crossentropy": 2.44307541847229, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1662917137145996, + "step": 17460 + }, + { + "epoch": 0.5456875, + "grad_norm": 3.265625, + "grad_norm_var": 0.012418619791666667, + "learning_rate": 0.0001, + "loss": 5.7021, + "loss/crossentropy": 2.5171843767166138, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17122726887464523, + "step": 17462 + }, + { + "epoch": 0.54575, + "grad_norm": 3.1875, + "grad_norm_var": 0.011128743489583334, + "learning_rate": 0.0001, + "loss": 5.7291, + "loss/crossentropy": 2.5654937028884888, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1679239571094513, + "step": 17464 + }, + { + "epoch": 0.5458125, + "grad_norm": 3.125, + "grad_norm_var": 0.0106842041015625, + "learning_rate": 0.0001, + "loss": 5.6877, + "loss/crossentropy": 2.58706271648407, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1651432141661644, + "step": 17466 + }, + { + "epoch": 0.545875, + "grad_norm": 3.140625, + "grad_norm_var": 0.0110504150390625, + "learning_rate": 0.0001, + "loss": 6.1347, + "loss/crossentropy": 2.8856621980667114, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17646629363298416, + "step": 17468 + }, + { + "epoch": 0.5459375, + "grad_norm": 2.90625, + "grad_norm_var": 0.016141764322916665, + "learning_rate": 0.0001, + "loss": 5.7553, + "loss/crossentropy": 2.582043766975403, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16732431203126907, + "step": 17470 + }, + { + "epoch": 0.546, + "grad_norm": 3.171875, + "grad_norm_var": 0.01513671875, + "learning_rate": 0.0001, + "loss": 5.7062, + "loss/crossentropy": 2.5217323303222656, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1696169525384903, + "step": 17472 + }, + { + "epoch": 0.5460625, + "grad_norm": 4.0625, + "grad_norm_var": 0.0642486572265625, + "learning_rate": 0.0001, + "loss": 5.6209, + "loss/crossentropy": 2.4642462730407715, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1656637042760849, + "step": 17474 + }, + { + "epoch": 0.546125, + "grad_norm": 3.15625, + "grad_norm_var": 0.1303375244140625, + "learning_rate": 0.0001, + "loss": 5.8618, + "loss/crossentropy": 2.577812671661377, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17878559976816177, + "step": 17476 + }, + { + "epoch": 0.5461875, + "grad_norm": 3.0, + "grad_norm_var": 0.13186442057291667, + "learning_rate": 0.0001, + "loss": 5.5949, + "loss/crossentropy": 2.417052149772644, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16817185282707214, + "step": 17478 + }, + { + "epoch": 0.54625, + "grad_norm": 2.890625, + "grad_norm_var": 0.14026692708333333, + "learning_rate": 0.0001, + "loss": 5.5622, + "loss/crossentropy": 2.492171883583069, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16169288754463196, + "step": 17480 + }, + { + "epoch": 0.5463125, + "grad_norm": 3.078125, + "grad_norm_var": 0.14573567708333332, + "learning_rate": 0.0001, + "loss": 5.331, + "loss/crossentropy": 2.395987033843994, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15288100391626358, + "step": 17482 + }, + { + "epoch": 0.546375, + "grad_norm": 3.3125, + "grad_norm_var": 0.14401041666666667, + "learning_rate": 0.0001, + "loss": 5.749, + "loss/crossentropy": 2.5485790967941284, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17004591971635818, + "step": 17484 + }, + { + "epoch": 0.5464375, + "grad_norm": 3.28125, + "grad_norm_var": 0.14600321451822917, + "learning_rate": 0.0001, + "loss": 5.4656, + "loss/crossentropy": 2.3657922744750977, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16193493455648422, + "step": 17486 + }, + { + "epoch": 0.5465, + "grad_norm": 3.359375, + "grad_norm_var": 0.14504801432291667, + "learning_rate": 0.0001, + "loss": 5.671, + "loss/crossentropy": 2.5214210748672485, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16691534966230392, + "step": 17488 + }, + { + "epoch": 0.5465625, + "grad_norm": 8.6875, + "grad_norm_var": 1.9765370686848958, + "learning_rate": 0.0001, + "loss": 6.0524, + "loss/crossentropy": 2.622617244720459, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1921955570578575, + "step": 17490 + }, + { + "epoch": 0.546625, + "grad_norm": 2.921875, + "grad_norm_var": 1.9584309895833334, + "learning_rate": 0.0001, + "loss": 5.6667, + "loss/crossentropy": 2.5299049615859985, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16797561943531036, + "step": 17492 + }, + { + "epoch": 0.5466875, + "grad_norm": 3.46875, + "grad_norm_var": 1.9653635660807292, + "learning_rate": 0.0001, + "loss": 5.5908, + "loss/crossentropy": 2.408536672592163, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16705283522605896, + "step": 17494 + }, + { + "epoch": 0.54675, + "grad_norm": 3.03125, + "grad_norm_var": 1.96470947265625, + "learning_rate": 0.0001, + "loss": 5.8037, + "loss/crossentropy": 2.660837769508362, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16663309931755066, + "step": 17496 + }, + { + "epoch": 0.5468125, + "grad_norm": 3.046875, + "grad_norm_var": 1.948583984375, + "learning_rate": 0.0001, + "loss": 5.9658, + "loss/crossentropy": 2.700629472732544, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17573725432157516, + "step": 17498 + }, + { + "epoch": 0.546875, + "grad_norm": 3.15625, + "grad_norm_var": 1.9444173177083333, + "learning_rate": 0.0001, + "loss": 5.8598, + "loss/crossentropy": 2.583473563194275, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17802652716636658, + "step": 17500 + }, + { + "epoch": 0.5469375, + "grad_norm": 2.875, + "grad_norm_var": 1.9409820556640625, + "learning_rate": 0.0001, + "loss": 5.2422, + "loss/crossentropy": 2.2244415283203125, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15724655985832214, + "step": 17502 + }, + { + "epoch": 0.547, + "grad_norm": 3.171875, + "grad_norm_var": 1.9528961181640625, + "learning_rate": 0.0001, + "loss": 5.6346, + "loss/crossentropy": 2.4827910661697388, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1694793701171875, + "step": 17504 + }, + { + "epoch": 0.5470625, + "grad_norm": 5.96875, + "grad_norm_var": 0.5597808837890625, + "learning_rate": 0.0001, + "loss": 5.4604, + "loss/crossentropy": 2.2561005353927612, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.16730080544948578, + "step": 17506 + }, + { + "epoch": 0.547125, + "grad_norm": 3.046875, + "grad_norm_var": 0.5482574462890625, + "learning_rate": 0.0001, + "loss": 5.7717, + "loss/crossentropy": 2.5395383834838867, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.16931431740522385, + "step": 17508 + }, + { + "epoch": 0.5471875, + "grad_norm": 3.140625, + "grad_norm_var": 0.5327107747395833, + "learning_rate": 0.0001, + "loss": 5.752, + "loss/crossentropy": 2.5654181241989136, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16944094002246857, + "step": 17510 + }, + { + "epoch": 0.54725, + "grad_norm": 3.515625, + "grad_norm_var": 0.5152333577473959, + "learning_rate": 0.0001, + "loss": 5.5605, + "loss/crossentropy": 2.41715931892395, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16355209052562714, + "step": 17512 + }, + { + "epoch": 0.5473125, + "grad_norm": 3.25, + "grad_norm_var": 0.5193684895833334, + "learning_rate": 0.0001, + "loss": 5.6707, + "loss/crossentropy": 2.544505476951599, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1634012535214424, + "step": 17514 + }, + { + "epoch": 0.547375, + "grad_norm": 3.21875, + "grad_norm_var": 0.5138997395833333, + "learning_rate": 0.0001, + "loss": 5.59, + "loss/crossentropy": 2.4491783380508423, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16720939427614212, + "step": 17516 + }, + { + "epoch": 0.5474375, + "grad_norm": 3.59375, + "grad_norm_var": 0.4985260009765625, + "learning_rate": 0.0001, + "loss": 5.9674, + "loss/crossentropy": 2.7069283723831177, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17330986261367798, + "step": 17518 + }, + { + "epoch": 0.5475, + "grad_norm": 3.34375, + "grad_norm_var": 0.4915924072265625, + "learning_rate": 0.0001, + "loss": 5.9962, + "loss/crossentropy": 2.5872387886047363, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18660012632608414, + "step": 17520 + }, + { + "epoch": 0.5475625, + "grad_norm": 2.984375, + "grad_norm_var": 0.061766560872395834, + "learning_rate": 0.0001, + "loss": 5.476, + "loss/crossentropy": 2.3968299627304077, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1598661169409752, + "step": 17522 + }, + { + "epoch": 0.547625, + "grad_norm": 3.0, + "grad_norm_var": 0.06337483723958333, + "learning_rate": 0.0001, + "loss": 5.5755, + "loss/crossentropy": 2.5098626613616943, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15812503546476364, + "step": 17524 + }, + { + "epoch": 0.5476875, + "grad_norm": 3.140625, + "grad_norm_var": 0.0638580322265625, + "learning_rate": 0.0001, + "loss": 5.9852, + "loss/crossentropy": 2.7504621744155884, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17621267586946487, + "step": 17526 + }, + { + "epoch": 0.54775, + "grad_norm": 3.09375, + "grad_norm_var": 0.05088602701822917, + "learning_rate": 0.0001, + "loss": 5.5887, + "loss/crossentropy": 2.4907697439193726, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1613566130399704, + "step": 17528 + }, + { + "epoch": 0.5478125, + "grad_norm": 2.859375, + "grad_norm_var": 0.055985514322916666, + "learning_rate": 0.0001, + "loss": 5.7495, + "loss/crossentropy": 2.6151968240737915, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1669449657201767, + "step": 17530 + }, + { + "epoch": 0.547875, + "grad_norm": 3.296875, + "grad_norm_var": 0.058014933268229166, + "learning_rate": 0.0001, + "loss": 5.8629, + "loss/crossentropy": 2.6741613149642944, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17083033919334412, + "step": 17532 + }, + { + "epoch": 0.5479375, + "grad_norm": 3.328125, + "grad_norm_var": 0.049051920572916664, + "learning_rate": 0.0001, + "loss": 5.7614, + "loss/crossentropy": 2.511942148208618, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1769026666879654, + "step": 17534 + }, + { + "epoch": 0.548, + "grad_norm": 3.375, + "grad_norm_var": 0.026302083333333334, + "learning_rate": 0.0001, + "loss": 5.8136, + "loss/crossentropy": 2.5612308979034424, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17211096733808517, + "step": 17536 + }, + { + "epoch": 0.5480625, + "grad_norm": 3.0625, + "grad_norm_var": 0.022477213541666666, + "learning_rate": 0.0001, + "loss": 5.6572, + "loss/crossentropy": 2.5014984607696533, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.17182088643312454, + "step": 17538 + }, + { + "epoch": 0.548125, + "grad_norm": 3.15625, + "grad_norm_var": 0.0195220947265625, + "learning_rate": 0.0001, + "loss": 5.8609, + "loss/crossentropy": 2.6298829317092896, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.171540766954422, + "step": 17540 + }, + { + "epoch": 0.5481875, + "grad_norm": 3.15625, + "grad_norm_var": 0.020068359375, + "learning_rate": 0.0001, + "loss": 5.7179, + "loss/crossentropy": 2.565890073776245, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16480782628059387, + "step": 17542 + }, + { + "epoch": 0.54825, + "grad_norm": 3.15625, + "grad_norm_var": 0.026985677083333333, + "learning_rate": 0.0001, + "loss": 5.6059, + "loss/crossentropy": 2.543424367904663, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16249486058950424, + "step": 17544 + }, + { + "epoch": 0.5483125, + "grad_norm": 3.265625, + "grad_norm_var": 0.029325358072916665, + "learning_rate": 0.0001, + "loss": 5.135, + "loss/crossentropy": 2.2710988521575928, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.13678062707185745, + "step": 17546 + }, + { + "epoch": 0.548375, + "grad_norm": 3.28125, + "grad_norm_var": 0.0256256103515625, + "learning_rate": 0.0001, + "loss": 5.9581, + "loss/crossentropy": 2.6833196878433228, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17669300734996796, + "step": 17548 + }, + { + "epoch": 0.5484375, + "grad_norm": 3.328125, + "grad_norm_var": 0.027783203125, + "learning_rate": 0.0001, + "loss": 5.3984, + "loss/crossentropy": 2.224676489830017, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1650291532278061, + "step": 17550 + }, + { + "epoch": 0.5485, + "grad_norm": 3.0625, + "grad_norm_var": 0.025325520833333334, + "learning_rate": 0.0001, + "loss": 5.5222, + "loss/crossentropy": 2.4809380769729614, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1607636660337448, + "step": 17552 + }, + { + "epoch": 0.5485625, + "grad_norm": 3.328125, + "grad_norm_var": 0.03638916015625, + "learning_rate": 0.0001, + "loss": 5.8775, + "loss/crossentropy": 2.6138750314712524, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17480115592479706, + "step": 17554 + }, + { + "epoch": 0.548625, + "grad_norm": 3.203125, + "grad_norm_var": 0.037255859375, + "learning_rate": 0.0001, + "loss": 5.6924, + "loss/crossentropy": 2.468183398246765, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1708623617887497, + "step": 17556 + }, + { + "epoch": 0.5486875, + "grad_norm": 3.28125, + "grad_norm_var": 0.03777669270833333, + "learning_rate": 0.0001, + "loss": 5.633, + "loss/crossentropy": 2.5118530988693237, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1617267206311226, + "step": 17558 + }, + { + "epoch": 0.54875, + "grad_norm": 3.109375, + "grad_norm_var": 0.03179931640625, + "learning_rate": 0.0001, + "loss": 5.7145, + "loss/crossentropy": 2.5721532106399536, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16774551570415497, + "step": 17560 + }, + { + "epoch": 0.5488125, + "grad_norm": 2.9375, + "grad_norm_var": 0.026471964518229165, + "learning_rate": 0.0001, + "loss": 5.6571, + "loss/crossentropy": 2.5563547611236572, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1651500016450882, + "step": 17562 + }, + { + "epoch": 0.548875, + "grad_norm": 3.3125, + "grad_norm_var": 0.02672119140625, + "learning_rate": 0.0001, + "loss": 5.8953, + "loss/crossentropy": 2.671082019805908, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1712455302476883, + "step": 17564 + }, + { + "epoch": 0.5489375, + "grad_norm": 3.1875, + "grad_norm_var": 0.02349853515625, + "learning_rate": 0.0001, + "loss": 5.7482, + "loss/crossentropy": 2.5788270235061646, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1688936948776245, + "step": 17566 + }, + { + "epoch": 0.549, + "grad_norm": 3.390625, + "grad_norm_var": 0.024364217122395834, + "learning_rate": 0.0001, + "loss": 5.7864, + "loss/crossentropy": 2.586372137069702, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17000386118888855, + "step": 17568 + }, + { + "epoch": 0.5490625, + "grad_norm": 3.171875, + "grad_norm_var": 0.0194488525390625, + "learning_rate": 0.0001, + "loss": 5.5739, + "loss/crossentropy": 2.394267201423645, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16718415170907974, + "step": 17570 + }, + { + "epoch": 0.549125, + "grad_norm": 3.328125, + "grad_norm_var": 0.021728515625, + "learning_rate": 0.0001, + "loss": 5.8179, + "loss/crossentropy": 2.5978699922561646, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17239080369472504, + "step": 17572 + }, + { + "epoch": 0.5491875, + "grad_norm": 2.96875, + "grad_norm_var": 0.026219685872395832, + "learning_rate": 0.0001, + "loss": 6.0583, + "loss/crossentropy": 2.7638529539108276, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17866214364767075, + "step": 17574 + }, + { + "epoch": 0.54925, + "grad_norm": 3.109375, + "grad_norm_var": 0.032633463541666664, + "learning_rate": 0.0001, + "loss": 5.7033, + "loss/crossentropy": 2.573278069496155, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16456648707389832, + "step": 17576 + }, + { + "epoch": 0.5493125, + "grad_norm": 3.265625, + "grad_norm_var": 0.03645731608072917, + "learning_rate": 0.0001, + "loss": 6.0102, + "loss/crossentropy": 2.7705767154693604, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1759153977036476, + "step": 17578 + }, + { + "epoch": 0.549375, + "grad_norm": 2.96875, + "grad_norm_var": 0.041764322916666666, + "learning_rate": 0.0001, + "loss": 5.6084, + "loss/crossentropy": 2.516671657562256, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16229360550642014, + "step": 17580 + }, + { + "epoch": 0.5494375, + "grad_norm": 3.359375, + "grad_norm_var": 0.04462890625, + "learning_rate": 0.0001, + "loss": 5.8639, + "loss/crossentropy": 2.664319157600403, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16878651082515717, + "step": 17582 + }, + { + "epoch": 0.5495, + "grad_norm": 3.09375, + "grad_norm_var": 0.040848795572916666, + "learning_rate": 0.0001, + "loss": 5.6724, + "loss/crossentropy": 2.5199772119522095, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16758862137794495, + "step": 17584 + }, + { + "epoch": 0.5495625, + "grad_norm": 3.046875, + "grad_norm_var": 0.03671468098958333, + "learning_rate": 0.0001, + "loss": 5.9524, + "loss/crossentropy": 2.77493953704834, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16930725425481796, + "step": 17586 + }, + { + "epoch": 0.549625, + "grad_norm": 3.1875, + "grad_norm_var": 0.03520406087239583, + "learning_rate": 0.0001, + "loss": 5.6959, + "loss/crossentropy": 2.5064406394958496, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1693325936794281, + "step": 17588 + }, + { + "epoch": 0.5496875, + "grad_norm": 3.234375, + "grad_norm_var": 0.028999837239583333, + "learning_rate": 0.0001, + "loss": 5.7136, + "loss/crossentropy": 2.5963058471679688, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16602276265621185, + "step": 17590 + }, + { + "epoch": 0.54975, + "grad_norm": 3.171875, + "grad_norm_var": 0.025028483072916666, + "learning_rate": 0.0001, + "loss": 5.5308, + "loss/crossentropy": 2.449121594429016, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.15855589509010315, + "step": 17592 + }, + { + "epoch": 0.5498125, + "grad_norm": 3.015625, + "grad_norm_var": 0.021321614583333332, + "learning_rate": 0.0001, + "loss": 5.6914, + "loss/crossentropy": 2.521979570388794, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16733740270137787, + "step": 17594 + }, + { + "epoch": 0.549875, + "grad_norm": 3.15625, + "grad_norm_var": 0.018260701497395834, + "learning_rate": 0.0001, + "loss": 5.7296, + "loss/crossentropy": 2.521511197090149, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1700238361954689, + "step": 17596 + }, + { + "epoch": 0.5499375, + "grad_norm": 3.078125, + "grad_norm_var": 0.021849568684895834, + "learning_rate": 0.0001, + "loss": 5.9507, + "loss/crossentropy": 2.6598910093307495, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.18025202304124832, + "step": 17598 + }, + { + "epoch": 0.55, + "grad_norm": 2.921875, + "grad_norm_var": 0.025960286458333332, + "learning_rate": 0.0001, + "loss": 5.5521, + "loss/crossentropy": 2.464334011077881, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1638558954000473, + "step": 17600 + }, + { + "epoch": 0.5500625, + "grad_norm": 3.359375, + "grad_norm_var": 0.022835286458333333, + "learning_rate": 0.0001, + "loss": 5.364, + "loss/crossentropy": 2.2182083129882812, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16418644040822983, + "step": 17602 + }, + { + "epoch": 0.550125, + "grad_norm": 3.109375, + "grad_norm_var": 0.022184244791666665, + "learning_rate": 0.0001, + "loss": 5.8673, + "loss/crossentropy": 2.7186864614486694, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16447259485721588, + "step": 17604 + }, + { + "epoch": 0.5501875, + "grad_norm": 3.140625, + "grad_norm_var": 0.0223297119140625, + "learning_rate": 0.0001, + "loss": 5.8666, + "loss/crossentropy": 2.61619770526886, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1762106567621231, + "step": 17606 + }, + { + "epoch": 0.55025, + "grad_norm": 3.140625, + "grad_norm_var": 0.021712239583333334, + "learning_rate": 0.0001, + "loss": 5.8196, + "loss/crossentropy": 2.6020315885543823, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17448948323726654, + "step": 17608 + }, + { + "epoch": 0.5503125, + "grad_norm": 2.984375, + "grad_norm_var": 0.02174072265625, + "learning_rate": 0.0001, + "loss": 5.6747, + "loss/crossentropy": 2.512550115585327, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16895052045583725, + "step": 17610 + }, + { + "epoch": 0.550375, + "grad_norm": 3.390625, + "grad_norm_var": 0.024637858072916668, + "learning_rate": 0.0001, + "loss": 5.6881, + "loss/crossentropy": 2.540887713432312, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1686241254210472, + "step": 17612 + }, + { + "epoch": 0.5504375, + "grad_norm": 3.0625, + "grad_norm_var": 0.018626912434895834, + "learning_rate": 0.0001, + "loss": 5.7581, + "loss/crossentropy": 2.664351224899292, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16328338533639908, + "step": 17614 + }, + { + "epoch": 0.5505, + "grad_norm": 3.140625, + "grad_norm_var": 0.014867146809895834, + "learning_rate": 0.0001, + "loss": 5.7321, + "loss/crossentropy": 2.6070908308029175, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16718365252017975, + "step": 17616 + }, + { + "epoch": 0.5505625, + "grad_norm": 3.203125, + "grad_norm_var": 0.013374837239583333, + "learning_rate": 0.0001, + "loss": 5.6804, + "loss/crossentropy": 2.495343804359436, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16967647522687912, + "step": 17618 + }, + { + "epoch": 0.550625, + "grad_norm": 3.15625, + "grad_norm_var": 0.009761555989583334, + "learning_rate": 0.0001, + "loss": 5.6772, + "loss/crossentropy": 2.551109790802002, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16338741779327393, + "step": 17620 + }, + { + "epoch": 0.5506875, + "grad_norm": 3.421875, + "grad_norm_var": 0.01929931640625, + "learning_rate": 0.0001, + "loss": 5.6067, + "loss/crossentropy": 2.561010241508484, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16042368113994598, + "step": 17622 + }, + { + "epoch": 0.55075, + "grad_norm": 3.0, + "grad_norm_var": 0.020361328125, + "learning_rate": 0.0001, + "loss": 5.658, + "loss/crossentropy": 2.5929129123687744, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1604117974638939, + "step": 17624 + }, + { + "epoch": 0.5508125, + "grad_norm": 3.125, + "grad_norm_var": 0.023746744791666666, + "learning_rate": 0.0001, + "loss": 5.3172, + "loss/crossentropy": 2.3189679384231567, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15099874138832092, + "step": 17626 + }, + { + "epoch": 0.550875, + "grad_norm": 3.109375, + "grad_norm_var": 0.018192545572916666, + "learning_rate": 0.0001, + "loss": 5.8461, + "loss/crossentropy": 2.6823806762695312, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.165588840842247, + "step": 17628 + }, + { + "epoch": 0.5509375, + "grad_norm": 3.328125, + "grad_norm_var": 0.028303019205729165, + "learning_rate": 0.0001, + "loss": 5.2909, + "loss/crossentropy": 2.2500622868537903, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.15213175117969513, + "step": 17630 + }, + { + "epoch": 0.551, + "grad_norm": 2.8125, + "grad_norm_var": 0.03333231608072917, + "learning_rate": 0.0001, + "loss": 5.613, + "loss/crossentropy": 2.534808039665222, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.155868798494339, + "step": 17632 + }, + { + "epoch": 0.5510625, + "grad_norm": 3.0, + "grad_norm_var": 0.0326324462890625, + "learning_rate": 0.0001, + "loss": 5.9002, + "loss/crossentropy": 2.683477759361267, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1744086742401123, + "step": 17634 + }, + { + "epoch": 0.551125, + "grad_norm": 3.015625, + "grad_norm_var": 0.034566243489583336, + "learning_rate": 0.0001, + "loss": 6.0321, + "loss/crossentropy": 2.7725576162338257, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17478548735380173, + "step": 17636 + }, + { + "epoch": 0.5511875, + "grad_norm": 3.15625, + "grad_norm_var": 0.024507649739583335, + "learning_rate": 0.0001, + "loss": 5.6146, + "loss/crossentropy": 2.4677406549453735, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16976484656333923, + "step": 17638 + }, + { + "epoch": 0.55125, + "grad_norm": 3.6875, + "grad_norm_var": 0.04956766764322917, + "learning_rate": 0.0001, + "loss": 6.2732, + "loss/crossentropy": 2.845934271812439, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18882165104150772, + "step": 17640 + }, + { + "epoch": 0.5513125, + "grad_norm": 3.46875, + "grad_norm_var": 0.054198201497395834, + "learning_rate": 0.0001, + "loss": 5.4354, + "loss/crossentropy": 2.3915497064590454, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15594977885484695, + "step": 17642 + }, + { + "epoch": 0.551375, + "grad_norm": 3.328125, + "grad_norm_var": 0.056050618489583336, + "learning_rate": 0.0001, + "loss": 5.8491, + "loss/crossentropy": 2.6383293867111206, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17186033725738525, + "step": 17644 + }, + { + "epoch": 0.5514375, + "grad_norm": 2.9375, + "grad_norm_var": 0.049117024739583334, + "learning_rate": 0.0001, + "loss": 5.7848, + "loss/crossentropy": 2.616939425468445, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16795941442251205, + "step": 17646 + }, + { + "epoch": 0.5515, + "grad_norm": 3.171875, + "grad_norm_var": 0.04019775390625, + "learning_rate": 0.0001, + "loss": 6.0911, + "loss/crossentropy": 2.8558987379074097, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1750853955745697, + "step": 17648 + }, + { + "epoch": 0.5515625, + "grad_norm": 2.890625, + "grad_norm_var": 0.04482320149739583, + "learning_rate": 0.0001, + "loss": 5.4487, + "loss/crossentropy": 2.3902167081832886, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1609266996383667, + "step": 17650 + }, + { + "epoch": 0.551625, + "grad_norm": 2.984375, + "grad_norm_var": 0.048323567708333334, + "learning_rate": 0.0001, + "loss": 5.7564, + "loss/crossentropy": 2.5943437814712524, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16933320462703705, + "step": 17652 + }, + { + "epoch": 0.5516875, + "grad_norm": 3.078125, + "grad_norm_var": 0.049616495768229164, + "learning_rate": 0.0001, + "loss": 6.0053, + "loss/crossentropy": 2.7250771522521973, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17723697423934937, + "step": 17654 + }, + { + "epoch": 0.55175, + "grad_norm": 3.109375, + "grad_norm_var": 0.031901041666666664, + "learning_rate": 0.0001, + "loss": 5.3705, + "loss/crossentropy": 2.387295126914978, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1545664295554161, + "step": 17656 + }, + { + "epoch": 0.5518125, + "grad_norm": 3.125, + "grad_norm_var": 0.028125, + "learning_rate": 0.0001, + "loss": 5.6115, + "loss/crossentropy": 2.4386359453201294, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17080183327198029, + "step": 17658 + }, + { + "epoch": 0.551875, + "grad_norm": 3.109375, + "grad_norm_var": 0.024898274739583334, + "learning_rate": 0.0001, + "loss": 5.6052, + "loss/crossentropy": 2.5384548902511597, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16371016204357147, + "step": 17660 + }, + { + "epoch": 0.5519375, + "grad_norm": 3.234375, + "grad_norm_var": 0.0279449462890625, + "learning_rate": 0.0001, + "loss": 5.7308, + "loss/crossentropy": 2.529237985610962, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17249634116888046, + "step": 17662 + }, + { + "epoch": 0.552, + "grad_norm": 3.28125, + "grad_norm_var": 0.04268290201822917, + "learning_rate": 0.0001, + "loss": 5.6703, + "loss/crossentropy": 2.4546802043914795, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17351430654525757, + "step": 17664 + }, + { + "epoch": 0.5520625, + "grad_norm": 3.15625, + "grad_norm_var": 0.03536783854166667, + "learning_rate": 0.0001, + "loss": 5.7763, + "loss/crossentropy": 2.566525936126709, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1737072914838791, + "step": 17666 + }, + { + "epoch": 0.552125, + "grad_norm": 3.359375, + "grad_norm_var": 0.0426666259765625, + "learning_rate": 0.0001, + "loss": 5.8261, + "loss/crossentropy": 2.5591336488723755, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17552430927753448, + "step": 17668 + }, + { + "epoch": 0.5521875, + "grad_norm": 3.28125, + "grad_norm_var": 0.03679097493489583, + "learning_rate": 0.0001, + "loss": 5.5321, + "loss/crossentropy": 2.3835878372192383, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16563591361045837, + "step": 17670 + }, + { + "epoch": 0.55225, + "grad_norm": 3.484375, + "grad_norm_var": 0.028685506184895834, + "learning_rate": 0.0001, + "loss": 5.8604, + "loss/crossentropy": 2.6192610263824463, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1709924191236496, + "step": 17672 + }, + { + "epoch": 0.5523125, + "grad_norm": 3.453125, + "grad_norm_var": 0.028857421875, + "learning_rate": 0.0001, + "loss": 5.9271, + "loss/crossentropy": 2.6670114994049072, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17795803397893906, + "step": 17674 + }, + { + "epoch": 0.552375, + "grad_norm": 3.046875, + "grad_norm_var": 0.027586873372395834, + "learning_rate": 0.0001, + "loss": 5.7105, + "loss/crossentropy": 2.5654743909835815, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16606971621513367, + "step": 17676 + }, + { + "epoch": 0.5524375, + "grad_norm": 3.0625, + "grad_norm_var": 0.03631184895833333, + "learning_rate": 0.0001, + "loss": 5.7238, + "loss/crossentropy": 2.6403234004974365, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16225726157426834, + "step": 17678 + }, + { + "epoch": 0.5525, + "grad_norm": 3.078125, + "grad_norm_var": 0.037962849934895834, + "learning_rate": 0.0001, + "loss": 5.8579, + "loss/crossentropy": 2.6207019090652466, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1760634407401085, + "step": 17680 + }, + { + "epoch": 0.5525625, + "grad_norm": 3.40625, + "grad_norm_var": 0.03909098307291667, + "learning_rate": 0.0001, + "loss": 5.9807, + "loss/crossentropy": 2.8148363828659058, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1681457757949829, + "step": 17682 + }, + { + "epoch": 0.552625, + "grad_norm": 3.21875, + "grad_norm_var": 0.03428446451822917, + "learning_rate": 0.0001, + "loss": 5.4105, + "loss/crossentropy": 2.3150410652160645, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16032294929027557, + "step": 17684 + }, + { + "epoch": 0.5526875, + "grad_norm": 3.375, + "grad_norm_var": 0.039774576822916664, + "learning_rate": 0.0001, + "loss": 5.6244, + "loss/crossentropy": 2.53757905960083, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1625930666923523, + "step": 17686 + }, + { + "epoch": 0.55275, + "grad_norm": 3.28125, + "grad_norm_var": 0.03984375, + "learning_rate": 0.0001, + "loss": 6.06, + "loss/crossentropy": 2.7653326988220215, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.18063924461603165, + "step": 17688 + }, + { + "epoch": 0.5528125, + "grad_norm": 22.0, + "grad_norm_var": 22.19509989420573, + "learning_rate": 0.0001, + "loss": 5.8548, + "loss/crossentropy": 2.3556020259857178, + "loss/hidden": 1.73828125, + "loss/jsd": 0.0, + "loss/logits": 0.17609627544879913, + "step": 17690 + }, + { + "epoch": 0.552875, + "grad_norm": 3.078125, + "grad_norm_var": 22.14544169108073, + "learning_rate": 0.0001, + "loss": 5.9341, + "loss/crossentropy": 2.6883466243743896, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1734030321240425, + "step": 17692 + }, + { + "epoch": 0.5529375, + "grad_norm": 3.28125, + "grad_norm_var": 22.014655558268228, + "learning_rate": 0.0001, + "loss": 6.1103, + "loss/crossentropy": 2.818718194961548, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1776002198457718, + "step": 17694 + }, + { + "epoch": 0.553, + "grad_norm": 3.046875, + "grad_norm_var": 21.980288696289062, + "learning_rate": 0.0001, + "loss": 5.9822, + "loss/crossentropy": 2.687151551246643, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17794059216976166, + "step": 17696 + }, + { + "epoch": 0.5530625, + "grad_norm": 3.125, + "grad_norm_var": 21.997394816080728, + "learning_rate": 0.0001, + "loss": 5.9328, + "loss/crossentropy": 2.6805999279022217, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17600084841251373, + "step": 17698 + }, + { + "epoch": 0.553125, + "grad_norm": 2.890625, + "grad_norm_var": 22.028034464518228, + "learning_rate": 0.0001, + "loss": 5.6446, + "loss/crossentropy": 2.5476465225219727, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16203974187374115, + "step": 17700 + }, + { + "epoch": 0.5531875, + "grad_norm": 3.328125, + "grad_norm_var": 21.904325358072917, + "learning_rate": 0.0001, + "loss": 5.8891, + "loss/crossentropy": 2.6642894744873047, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17248596251010895, + "step": 17702 + }, + { + "epoch": 0.55325, + "grad_norm": 3.15625, + "grad_norm_var": 21.82415262858073, + "learning_rate": 0.0001, + "loss": 5.5586, + "loss/crossentropy": 2.3502798080444336, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.16770578175783157, + "step": 17704 + }, + { + "epoch": 0.5533125, + "grad_norm": 2.9375, + "grad_norm_var": 0.11916910807291667, + "learning_rate": 0.0001, + "loss": 5.7012, + "loss/crossentropy": 2.5814989805221558, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16236452758312225, + "step": 17706 + }, + { + "epoch": 0.553375, + "grad_norm": 3.03125, + "grad_norm_var": 0.11990458170572917, + "learning_rate": 0.0001, + "loss": 5.5415, + "loss/crossentropy": 2.53207266330719, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15797212719917297, + "step": 17708 + }, + { + "epoch": 0.5534375, + "grad_norm": 3.484375, + "grad_norm_var": 0.1141510009765625, + "learning_rate": 0.0001, + "loss": 5.7966, + "loss/crossentropy": 2.583231806755066, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16977283358573914, + "step": 17710 + }, + { + "epoch": 0.5535, + "grad_norm": 3.078125, + "grad_norm_var": 0.07235921223958333, + "learning_rate": 0.0001, + "loss": 5.8542, + "loss/crossentropy": 2.654178738594055, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1688312143087387, + "step": 17712 + }, + { + "epoch": 0.5535625, + "grad_norm": 3.25, + "grad_norm_var": 0.07957356770833333, + "learning_rate": 0.0001, + "loss": 5.3742, + "loss/crossentropy": 2.416073203086853, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15205883234739304, + "step": 17714 + }, + { + "epoch": 0.553625, + "grad_norm": 3.3125, + "grad_norm_var": 0.08026936848958334, + "learning_rate": 0.0001, + "loss": 5.6411, + "loss/crossentropy": 2.4960005283355713, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16724785417318344, + "step": 17716 + }, + { + "epoch": 0.5536875, + "grad_norm": 3.046875, + "grad_norm_var": 0.0369781494140625, + "learning_rate": 0.0001, + "loss": 5.8712, + "loss/crossentropy": 2.614146113395691, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17336326092481613, + "step": 17718 + }, + { + "epoch": 0.55375, + "grad_norm": 2.984375, + "grad_norm_var": 0.026691691080729166, + "learning_rate": 0.0001, + "loss": 5.5363, + "loss/crossentropy": 2.46210777759552, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16328229010105133, + "step": 17720 + }, + { + "epoch": 0.5538125, + "grad_norm": 4.6875, + "grad_norm_var": 0.18032938639322918, + "learning_rate": 0.0001, + "loss": 5.5073, + "loss/crossentropy": 2.404494285583496, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16145338118076324, + "step": 17722 + }, + { + "epoch": 0.553875, + "grad_norm": 3.765625, + "grad_norm_var": 0.196728515625, + "learning_rate": 0.0001, + "loss": 5.4868, + "loss/crossentropy": 2.4248130321502686, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.15658820420503616, + "step": 17724 + }, + { + "epoch": 0.5539375, + "grad_norm": 3.6875, + "grad_norm_var": 0.20271809895833334, + "learning_rate": 0.0001, + "loss": 6.0828, + "loss/crossentropy": 2.714733123779297, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1852409541606903, + "step": 17726 + }, + { + "epoch": 0.554, + "grad_norm": 3.28125, + "grad_norm_var": 0.20032450358072917, + "learning_rate": 0.0001, + "loss": 6.1021, + "loss/crossentropy": 2.7352226972579956, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1839490830898285, + "step": 17728 + }, + { + "epoch": 0.5540625, + "grad_norm": 3.921875, + "grad_norm_var": 0.22157796223958334, + "learning_rate": 0.0001, + "loss": 5.6849, + "loss/crossentropy": 2.5687484741210938, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1623956859111786, + "step": 17730 + }, + { + "epoch": 0.554125, + "grad_norm": 3.328125, + "grad_norm_var": 0.2098297119140625, + "learning_rate": 0.0001, + "loss": 5.8104, + "loss/crossentropy": 2.620088815689087, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16824733465909958, + "step": 17732 + }, + { + "epoch": 0.5541875, + "grad_norm": 3.15625, + "grad_norm_var": 0.20611979166666666, + "learning_rate": 0.0001, + "loss": 5.8186, + "loss/crossentropy": 2.5815773010253906, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1740909218788147, + "step": 17734 + }, + { + "epoch": 0.55425, + "grad_norm": 3.078125, + "grad_norm_var": 0.19973856608072918, + "learning_rate": 0.0001, + "loss": 5.7036, + "loss/crossentropy": 2.53547203540802, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1687656044960022, + "step": 17736 + }, + { + "epoch": 0.5543125, + "grad_norm": 3.25, + "grad_norm_var": 0.07545572916666667, + "learning_rate": 0.0001, + "loss": 5.7654, + "loss/crossentropy": 2.5381171703338623, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17155338823795319, + "step": 17738 + }, + { + "epoch": 0.554375, + "grad_norm": 3.046875, + "grad_norm_var": 0.06968485514322917, + "learning_rate": 0.0001, + "loss": 5.4443, + "loss/crossentropy": 2.498365879058838, + "loss/hidden": 1.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.1563076302409172, + "step": 17740 + }, + { + "epoch": 0.5544375, + "grad_norm": 3.296875, + "grad_norm_var": 0.0581695556640625, + "learning_rate": 0.0001, + "loss": 6.1023, + "loss/crossentropy": 2.746156334877014, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18405093252658844, + "step": 17742 + }, + { + "epoch": 0.5545, + "grad_norm": 3.3125, + "grad_norm_var": 0.06363525390625, + "learning_rate": 0.0001, + "loss": 5.8252, + "loss/crossentropy": 2.6532708406448364, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16641514748334885, + "step": 17744 + }, + { + "epoch": 0.5545625, + "grad_norm": 3.03125, + "grad_norm_var": 0.025830078125, + "learning_rate": 0.0001, + "loss": 5.5518, + "loss/crossentropy": 2.4523158073425293, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16229548305273056, + "step": 17746 + }, + { + "epoch": 0.554625, + "grad_norm": 3.0625, + "grad_norm_var": 0.027701822916666667, + "learning_rate": 0.0001, + "loss": 5.5929, + "loss/crossentropy": 2.389325499534607, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17113812267780304, + "step": 17748 + }, + { + "epoch": 0.5546875, + "grad_norm": 3.1875, + "grad_norm_var": 0.027620442708333335, + "learning_rate": 0.0001, + "loss": 5.7688, + "loss/crossentropy": 2.618277668952942, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16935007274150848, + "step": 17750 + }, + { + "epoch": 0.55475, + "grad_norm": 3.3125, + "grad_norm_var": 0.027953084309895834, + "learning_rate": 0.0001, + "loss": 5.9271, + "loss/crossentropy": 2.619473695755005, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.18154089152812958, + "step": 17752 + }, + { + "epoch": 0.5548125, + "grad_norm": 3.25, + "grad_norm_var": 0.03378499348958333, + "learning_rate": 0.0001, + "loss": 5.9803, + "loss/crossentropy": 2.542687177658081, + "loss/hidden": 1.58984375, + "loss/jsd": 0.0, + "loss/logits": 0.18477710336446762, + "step": 17754 + }, + { + "epoch": 0.554875, + "grad_norm": 3.109375, + "grad_norm_var": 0.03287353515625, + "learning_rate": 0.0001, + "loss": 5.561, + "loss/crossentropy": 2.477379322052002, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16226952522993088, + "step": 17756 + }, + { + "epoch": 0.5549375, + "grad_norm": 3.171875, + "grad_norm_var": 0.028473917643229166, + "learning_rate": 0.0001, + "loss": 5.9849, + "loss/crossentropy": 2.737338662147522, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1731894165277481, + "step": 17758 + }, + { + "epoch": 0.555, + "grad_norm": 3.34375, + "grad_norm_var": 0.0525054931640625, + "learning_rate": 0.0001, + "loss": 6.1035, + "loss/crossentropy": 2.6326135396957397, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.19240082055330276, + "step": 17760 + }, + { + "epoch": 0.5550625, + "grad_norm": 3.125, + "grad_norm_var": 0.0496490478515625, + "learning_rate": 0.0001, + "loss": 5.6892, + "loss/crossentropy": 2.5344449281692505, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16430585086345673, + "step": 17762 + }, + { + "epoch": 0.555125, + "grad_norm": 3.171875, + "grad_norm_var": 0.050568644205729166, + "learning_rate": 0.0001, + "loss": 5.4346, + "loss/crossentropy": 2.487262725830078, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.14786113798618317, + "step": 17764 + }, + { + "epoch": 0.5551875, + "grad_norm": 3.0625, + "grad_norm_var": 0.0554840087890625, + "learning_rate": 0.0001, + "loss": 5.7453, + "loss/crossentropy": 2.5581740140914917, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16871501505374908, + "step": 17766 + }, + { + "epoch": 0.55525, + "grad_norm": 3.171875, + "grad_norm_var": 0.054585774739583336, + "learning_rate": 0.0001, + "loss": 5.9073, + "loss/crossentropy": 2.76697313785553, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1675439178943634, + "step": 17768 + }, + { + "epoch": 0.5553125, + "grad_norm": 3.078125, + "grad_norm_var": 0.048628743489583334, + "learning_rate": 0.0001, + "loss": 5.9461, + "loss/crossentropy": 2.757845640182495, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1703917160630226, + "step": 17770 + }, + { + "epoch": 0.555375, + "grad_norm": 3.28125, + "grad_norm_var": 0.0402252197265625, + "learning_rate": 0.0001, + "loss": 5.7144, + "loss/crossentropy": 2.5307306051254272, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16680067032575607, + "step": 17772 + }, + { + "epoch": 0.5554375, + "grad_norm": 3.0, + "grad_norm_var": 0.0438140869140625, + "learning_rate": 0.0001, + "loss": 5.7311, + "loss/crossentropy": 2.5602293014526367, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1690380722284317, + "step": 17774 + }, + { + "epoch": 0.5555, + "grad_norm": 3.390625, + "grad_norm_var": 0.01959228515625, + "learning_rate": 0.0001, + "loss": 5.6756, + "loss/crossentropy": 2.5320883989334106, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16786563396453857, + "step": 17776 + }, + { + "epoch": 0.5555625, + "grad_norm": 3.25, + "grad_norm_var": 0.02301025390625, + "learning_rate": 0.0001, + "loss": 5.8345, + "loss/crossentropy": 2.6048797369003296, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17062260955572128, + "step": 17778 + }, + { + "epoch": 0.555625, + "grad_norm": 3.171875, + "grad_norm_var": 0.02261962890625, + "learning_rate": 0.0001, + "loss": 5.8017, + "loss/crossentropy": 2.54030978679657, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17496530711650848, + "step": 17780 + }, + { + "epoch": 0.5556875, + "grad_norm": 3.03125, + "grad_norm_var": 0.020210774739583333, + "learning_rate": 0.0001, + "loss": 5.9023, + "loss/crossentropy": 2.7037198543548584, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16985435783863068, + "step": 17782 + }, + { + "epoch": 0.55575, + "grad_norm": 3.015625, + "grad_norm_var": 0.05103759765625, + "learning_rate": 0.0001, + "loss": 6.0906, + "loss/crossentropy": 2.771214723587036, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18115680664777756, + "step": 17784 + }, + { + "epoch": 0.5558125, + "grad_norm": 3.328125, + "grad_norm_var": 0.05745340983072917, + "learning_rate": 0.0001, + "loss": 5.5596, + "loss/crossentropy": 2.4504921436309814, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16052187979221344, + "step": 17786 + }, + { + "epoch": 0.555875, + "grad_norm": 3.234375, + "grad_norm_var": 0.0606109619140625, + "learning_rate": 0.0001, + "loss": 5.8293, + "loss/crossentropy": 2.6419315338134766, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1730317920446396, + "step": 17788 + }, + { + "epoch": 0.5559375, + "grad_norm": 2.96875, + "grad_norm_var": 0.061324055989583334, + "learning_rate": 0.0001, + "loss": 5.8703, + "loss/crossentropy": 2.636107921600342, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1769353449344635, + "step": 17790 + }, + { + "epoch": 0.556, + "grad_norm": 3.296875, + "grad_norm_var": 0.060933430989583336, + "learning_rate": 0.0001, + "loss": 6.014, + "loss/crossentropy": 2.7061779499053955, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17805054038763046, + "step": 17792 + }, + { + "epoch": 0.5560625, + "grad_norm": 4.9375, + "grad_norm_var": 0.23993733723958333, + "learning_rate": 0.0001, + "loss": 5.6304, + "loss/crossentropy": 2.3990488052368164, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.16688557714223862, + "step": 17794 + }, + { + "epoch": 0.556125, + "grad_norm": 3.34375, + "grad_norm_var": 0.24044596354166667, + "learning_rate": 0.0001, + "loss": 5.5713, + "loss/crossentropy": 2.4327298402786255, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16463829576969147, + "step": 17796 + }, + { + "epoch": 0.5561875, + "grad_norm": 2.875, + "grad_norm_var": 0.24844462076822918, + "learning_rate": 0.0001, + "loss": 5.8767, + "loss/crossentropy": 2.7174357175827026, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16748591512441635, + "step": 17798 + }, + { + "epoch": 0.55625, + "grad_norm": 2.921875, + "grad_norm_var": 0.23054097493489584, + "learning_rate": 0.0001, + "loss": 5.7476, + "loss/crossentropy": 2.667687773704529, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16463583707809448, + "step": 17800 + }, + { + "epoch": 0.5563125, + "grad_norm": 3.484375, + "grad_norm_var": 0.22349344889322917, + "learning_rate": 0.0001, + "loss": 5.9538, + "loss/crossentropy": 2.688518762588501, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17574457824230194, + "step": 17802 + }, + { + "epoch": 0.556375, + "grad_norm": 3.5, + "grad_norm_var": 0.22744852701822918, + "learning_rate": 0.0001, + "loss": 5.4751, + "loss/crossentropy": 2.3377124071121216, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16374120116233826, + "step": 17804 + }, + { + "epoch": 0.5564375, + "grad_norm": 3.234375, + "grad_norm_var": 0.28604227701822915, + "learning_rate": 0.0001, + "loss": 5.9729, + "loss/crossentropy": 2.7133841514587402, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17477981001138687, + "step": 17806 + }, + { + "epoch": 0.5565, + "grad_norm": 3.890625, + "grad_norm_var": 0.3018870035807292, + "learning_rate": 0.0001, + "loss": 5.9125, + "loss/crossentropy": 2.6736056804656982, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17154546082019806, + "step": 17808 + }, + { + "epoch": 0.5565625, + "grad_norm": 2.90625, + "grad_norm_var": 0.15346577962239583, + "learning_rate": 0.0001, + "loss": 5.8151, + "loss/crossentropy": 2.6831527948379517, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1702243611216545, + "step": 17810 + }, + { + "epoch": 0.556625, + "grad_norm": 3.484375, + "grad_norm_var": 0.15243733723958333, + "learning_rate": 0.0001, + "loss": 6.1999, + "loss/crossentropy": 2.8121283054351807, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18603944778442383, + "step": 17812 + }, + { + "epoch": 0.5566875, + "grad_norm": 3.09375, + "grad_norm_var": 0.15840555826822916, + "learning_rate": 0.0001, + "loss": 5.6435, + "loss/crossentropy": 2.5500491857528687, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16441978514194489, + "step": 17814 + }, + { + "epoch": 0.55675, + "grad_norm": 3.421875, + "grad_norm_var": 0.15227457682291667, + "learning_rate": 0.0001, + "loss": 6.2562, + "loss/crossentropy": 2.7869977951049805, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19105692207813263, + "step": 17816 + }, + { + "epoch": 0.5568125, + "grad_norm": 3.21875, + "grad_norm_var": 0.15562744140625, + "learning_rate": 0.0001, + "loss": 5.6101, + "loss/crossentropy": 2.5214409828186035, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16277530789375305, + "step": 17818 + }, + { + "epoch": 0.556875, + "grad_norm": 3.140625, + "grad_norm_var": 0.1571685791015625, + "learning_rate": 0.0001, + "loss": 6.0065, + "loss/crossentropy": 2.8121590614318848, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17099380493164062, + "step": 17820 + }, + { + "epoch": 0.5569375, + "grad_norm": 3.078125, + "grad_norm_var": 0.08584696451822917, + "learning_rate": 0.0001, + "loss": 5.6346, + "loss/crossentropy": 2.5494213104248047, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16554423421621323, + "step": 17822 + }, + { + "epoch": 0.557, + "grad_norm": 3.515625, + "grad_norm_var": 0.07345377604166667, + "learning_rate": 0.0001, + "loss": 6.0422, + "loss/crossentropy": 2.7481939792633057, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17900924384593964, + "step": 17824 + }, + { + "epoch": 0.5570625, + "grad_norm": 3.578125, + "grad_norm_var": 0.07337137858072916, + "learning_rate": 0.0001, + "loss": 6.1145, + "loss/crossentropy": 2.726052403450012, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18728207796812057, + "step": 17826 + }, + { + "epoch": 0.557125, + "grad_norm": 3.09375, + "grad_norm_var": 0.07099609375, + "learning_rate": 0.0001, + "loss": 5.7471, + "loss/crossentropy": 2.601917862892151, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16490618139505386, + "step": 17828 + }, + { + "epoch": 0.5571875, + "grad_norm": 3.3125, + "grad_norm_var": 0.05976155598958333, + "learning_rate": 0.0001, + "loss": 5.961, + "loss/crossentropy": 2.741401195526123, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17234960198402405, + "step": 17830 + }, + { + "epoch": 0.55725, + "grad_norm": 3.109375, + "grad_norm_var": 0.045458984375, + "learning_rate": 0.0001, + "loss": 5.6013, + "loss/crossentropy": 2.4385952949523926, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16705621778964996, + "step": 17832 + }, + { + "epoch": 0.5573125, + "grad_norm": 3.109375, + "grad_norm_var": 0.0469146728515625, + "learning_rate": 0.0001, + "loss": 5.979, + "loss/crossentropy": 2.74137020111084, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17181239277124405, + "step": 17834 + }, + { + "epoch": 0.557375, + "grad_norm": 3.046875, + "grad_norm_var": 0.0437164306640625, + "learning_rate": 0.0001, + "loss": 5.7128, + "loss/crossentropy": 2.541237950325012, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1698867455124855, + "step": 17836 + }, + { + "epoch": 0.5574375, + "grad_norm": 2.953125, + "grad_norm_var": 0.047135416666666666, + "learning_rate": 0.0001, + "loss": 5.4935, + "loss/crossentropy": 2.3597596883773804, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16532307863235474, + "step": 17838 + }, + { + "epoch": 0.5575, + "grad_norm": 2.953125, + "grad_norm_var": 0.03694559733072917, + "learning_rate": 0.0001, + "loss": 5.5403, + "loss/crossentropy": 2.467650294303894, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16390115767717361, + "step": 17840 + }, + { + "epoch": 0.5575625, + "grad_norm": 3.078125, + "grad_norm_var": 0.029097493489583334, + "learning_rate": 0.0001, + "loss": 5.6512, + "loss/crossentropy": 2.5703890323638916, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16121027618646622, + "step": 17842 + }, + { + "epoch": 0.557625, + "grad_norm": 3.015625, + "grad_norm_var": 0.030768839518229167, + "learning_rate": 0.0001, + "loss": 5.4493, + "loss/crossentropy": 2.3899370431900024, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16218648850917816, + "step": 17844 + }, + { + "epoch": 0.5576875, + "grad_norm": 3.1875, + "grad_norm_var": 0.030338541666666666, + "learning_rate": 0.0001, + "loss": 5.6036, + "loss/crossentropy": 2.4563435316085815, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16550210118293762, + "step": 17846 + }, + { + "epoch": 0.55775, + "grad_norm": 3.015625, + "grad_norm_var": 0.030973307291666665, + "learning_rate": 0.0001, + "loss": 5.7082, + "loss/crossentropy": 2.557474374771118, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.17014601081609726, + "step": 17848 + }, + { + "epoch": 0.5578125, + "grad_norm": 2.953125, + "grad_norm_var": 0.029427083333333333, + "learning_rate": 0.0001, + "loss": 5.5945, + "loss/crossentropy": 2.4838316440582275, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16653349995613098, + "step": 17850 + }, + { + "epoch": 0.557875, + "grad_norm": 3.109375, + "grad_norm_var": 0.028609212239583334, + "learning_rate": 0.0001, + "loss": 5.3799, + "loss/crossentropy": 2.3153923749923706, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16152680665254593, + "step": 17852 + }, + { + "epoch": 0.5579375, + "grad_norm": 3.0625, + "grad_norm_var": 0.010563151041666666, + "learning_rate": 0.0001, + "loss": 5.7112, + "loss/crossentropy": 2.5469366312026978, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16642998158931732, + "step": 17854 + }, + { + "epoch": 0.558, + "grad_norm": 3.265625, + "grad_norm_var": 0.009479777018229166, + "learning_rate": 0.0001, + "loss": 5.6037, + "loss/crossentropy": 2.4705650806427, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16369910538196564, + "step": 17856 + }, + { + "epoch": 0.5580625, + "grad_norm": 3.140625, + "grad_norm_var": 0.010270182291666667, + "learning_rate": 0.0001, + "loss": 5.7583, + "loss/crossentropy": 2.622636079788208, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.166298508644104, + "step": 17858 + }, + { + "epoch": 0.558125, + "grad_norm": 3.296875, + "grad_norm_var": 0.011506144205729167, + "learning_rate": 0.0001, + "loss": 5.7669, + "loss/crossentropy": 2.583187699317932, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17032814025878906, + "step": 17860 + }, + { + "epoch": 0.5581875, + "grad_norm": 4.0, + "grad_norm_var": 0.07049051920572917, + "learning_rate": 0.0001, + "loss": 6.0502, + "loss/crossentropy": 2.632668972015381, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18433555960655212, + "step": 17862 + }, + { + "epoch": 0.55825, + "grad_norm": 3.203125, + "grad_norm_var": 0.06819254557291667, + "learning_rate": 0.0001, + "loss": 5.5807, + "loss/crossentropy": 2.4633195400238037, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16525230556726456, + "step": 17864 + }, + { + "epoch": 0.5583125, + "grad_norm": 3.1875, + "grad_norm_var": 0.06282450358072916, + "learning_rate": 0.0001, + "loss": 5.2431, + "loss/crossentropy": 2.2175941467285156, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15606901794672012, + "step": 17866 + }, + { + "epoch": 0.558375, + "grad_norm": 3.25, + "grad_norm_var": 0.05722249348958333, + "learning_rate": 0.0001, + "loss": 5.9187, + "loss/crossentropy": 2.6782848834991455, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17248325049877167, + "step": 17868 + }, + { + "epoch": 0.5584375, + "grad_norm": 2.953125, + "grad_norm_var": 0.06363525390625, + "learning_rate": 0.0001, + "loss": 5.1332, + "loss/crossentropy": 2.3006467819213867, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.14184421300888062, + "step": 17870 + }, + { + "epoch": 0.5585, + "grad_norm": 3.375, + "grad_norm_var": 0.0641754150390625, + "learning_rate": 0.0001, + "loss": 5.8978, + "loss/crossentropy": 2.6230685710906982, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1778615415096283, + "step": 17872 + }, + { + "epoch": 0.5585625, + "grad_norm": 3.28125, + "grad_norm_var": 0.0650543212890625, + "learning_rate": 0.0001, + "loss": 5.8439, + "loss/crossentropy": 2.6285284757614136, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1715351864695549, + "step": 17874 + }, + { + "epoch": 0.558625, + "grad_norm": 3.1875, + "grad_norm_var": 0.07255757649739583, + "learning_rate": 0.0001, + "loss": 5.6397, + "loss/crossentropy": 2.5209524631500244, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.163827583193779, + "step": 17876 + }, + { + "epoch": 0.5586875, + "grad_norm": 3.28125, + "grad_norm_var": 0.018196614583333333, + "learning_rate": 0.0001, + "loss": 5.662, + "loss/crossentropy": 2.5259251594543457, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1639961451292038, + "step": 17878 + }, + { + "epoch": 0.55875, + "grad_norm": 3.09375, + "grad_norm_var": 0.017878214518229168, + "learning_rate": 0.0001, + "loss": 6.034, + "loss/crossentropy": 2.708481788635254, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17786332964897156, + "step": 17880 + }, + { + "epoch": 0.5588125, + "grad_norm": 2.859375, + "grad_norm_var": 0.029423014322916666, + "learning_rate": 0.0001, + "loss": 5.523, + "loss/crossentropy": 2.5217885971069336, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15715662389993668, + "step": 17882 + }, + { + "epoch": 0.558875, + "grad_norm": 3.5625, + "grad_norm_var": 0.037581380208333334, + "learning_rate": 0.0001, + "loss": 6.0094, + "loss/crossentropy": 2.653617262840271, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1824529990553856, + "step": 17884 + }, + { + "epoch": 0.5589375, + "grad_norm": 3.078125, + "grad_norm_var": 0.03511962890625, + "learning_rate": 0.0001, + "loss": 5.7433, + "loss/crossentropy": 2.633055090904236, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16414683312177658, + "step": 17886 + }, + { + "epoch": 0.559, + "grad_norm": 3.1875, + "grad_norm_var": 0.03013916015625, + "learning_rate": 0.0001, + "loss": 5.8918, + "loss/crossentropy": 2.649180054664612, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1738693118095398, + "step": 17888 + }, + { + "epoch": 0.5590625, + "grad_norm": 3.140625, + "grad_norm_var": 0.0300445556640625, + "learning_rate": 0.0001, + "loss": 5.9866, + "loss/crossentropy": 2.6964234113693237, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17784736305475235, + "step": 17890 + }, + { + "epoch": 0.559125, + "grad_norm": 2.984375, + "grad_norm_var": 0.029227701822916667, + "learning_rate": 0.0001, + "loss": 5.8944, + "loss/crossentropy": 2.665500283241272, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17210885137319565, + "step": 17892 + }, + { + "epoch": 0.5591875, + "grad_norm": 3.03125, + "grad_norm_var": 0.030887858072916666, + "learning_rate": 0.0001, + "loss": 5.6217, + "loss/crossentropy": 2.572103977203369, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16042818129062653, + "step": 17894 + }, + { + "epoch": 0.55925, + "grad_norm": 3.078125, + "grad_norm_var": 0.0301910400390625, + "learning_rate": 0.0001, + "loss": 5.9123, + "loss/crossentropy": 2.677404046058655, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17582855373620987, + "step": 17896 + }, + { + "epoch": 0.5593125, + "grad_norm": 3.109375, + "grad_norm_var": 0.020881144205729167, + "learning_rate": 0.0001, + "loss": 5.8686, + "loss/crossentropy": 2.6037358045578003, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17687822878360748, + "step": 17898 + }, + { + "epoch": 0.559375, + "grad_norm": 3.28125, + "grad_norm_var": 0.0111724853515625, + "learning_rate": 0.0001, + "loss": 5.7929, + "loss/crossentropy": 2.577305555343628, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1723366603255272, + "step": 17900 + }, + { + "epoch": 0.5594375, + "grad_norm": 2.984375, + "grad_norm_var": 0.0213043212890625, + "learning_rate": 0.0001, + "loss": 5.7368, + "loss/crossentropy": 2.5786736011505127, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1685493439435959, + "step": 17902 + }, + { + "epoch": 0.5595, + "grad_norm": 3.21875, + "grad_norm_var": 0.02183837890625, + "learning_rate": 0.0001, + "loss": 5.9743, + "loss/crossentropy": 2.698343873023987, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17915450781583786, + "step": 17904 + }, + { + "epoch": 0.5595625, + "grad_norm": 3.21875, + "grad_norm_var": 0.023421223958333334, + "learning_rate": 0.0001, + "loss": 5.6588, + "loss/crossentropy": 2.5257489681243896, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16681639850139618, + "step": 17906 + }, + { + "epoch": 0.559625, + "grad_norm": 3.09375, + "grad_norm_var": 0.0233551025390625, + "learning_rate": 0.0001, + "loss": 5.7083, + "loss/crossentropy": 2.522016763687134, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16980086266994476, + "step": 17908 + }, + { + "epoch": 0.5596875, + "grad_norm": 2.84375, + "grad_norm_var": 0.025414021809895833, + "learning_rate": 0.0001, + "loss": 5.5409, + "loss/crossentropy": 2.454635977745056, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16252947598695755, + "step": 17910 + }, + { + "epoch": 0.55975, + "grad_norm": 2.8125, + "grad_norm_var": 0.03297119140625, + "learning_rate": 0.0001, + "loss": 5.3777, + "loss/crossentropy": 2.4028183221817017, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.152959406375885, + "step": 17912 + }, + { + "epoch": 0.5598125, + "grad_norm": 3.21875, + "grad_norm_var": 0.034468587239583334, + "learning_rate": 0.0001, + "loss": 5.9517, + "loss/crossentropy": 2.686766028404236, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17844261229038239, + "step": 17914 + }, + { + "epoch": 0.559875, + "grad_norm": 3.28125, + "grad_norm_var": 0.03355204264322917, + "learning_rate": 0.0001, + "loss": 5.9998, + "loss/crossentropy": 2.801062822341919, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17182545363903046, + "step": 17916 + }, + { + "epoch": 0.5599375, + "grad_norm": 3.078125, + "grad_norm_var": 0.03743387858072917, + "learning_rate": 0.0001, + "loss": 5.9728, + "loss/crossentropy": 2.7022135257720947, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17236721515655518, + "step": 17918 + }, + { + "epoch": 0.56, + "grad_norm": 3.1875, + "grad_norm_var": 0.0369537353515625, + "learning_rate": 0.0001, + "loss": 5.8791, + "loss/crossentropy": 2.6077463626861572, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17674753814935684, + "step": 17920 + }, + { + "epoch": 0.5600625, + "grad_norm": 2.96875, + "grad_norm_var": 0.0353912353515625, + "learning_rate": 0.0001, + "loss": 5.7588, + "loss/crossentropy": 2.6787171363830566, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16191715747117996, + "step": 17922 + }, + { + "epoch": 0.560125, + "grad_norm": 3.09375, + "grad_norm_var": 0.033812459309895834, + "learning_rate": 0.0001, + "loss": 5.5304, + "loss/crossentropy": 2.386778473854065, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16592152416706085, + "step": 17924 + }, + { + "epoch": 0.5601875, + "grad_norm": 3.109375, + "grad_norm_var": 0.030985514322916668, + "learning_rate": 0.0001, + "loss": 6.1054, + "loss/crossentropy": 2.827863335609436, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1765836700797081, + "step": 17926 + }, + { + "epoch": 0.56025, + "grad_norm": 4.0625, + "grad_norm_var": 0.08624674479166666, + "learning_rate": 0.0001, + "loss": 6.5417, + "loss/crossentropy": 2.92037570476532, + "loss/hidden": 1.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.20158959925174713, + "step": 17928 + }, + { + "epoch": 0.5603125, + "grad_norm": 3.484375, + "grad_norm_var": 0.1042633056640625, + "learning_rate": 0.0001, + "loss": 5.62, + "loss/crossentropy": 2.3516818284988403, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17214687913656235, + "step": 17930 + }, + { + "epoch": 0.560375, + "grad_norm": 3.125, + "grad_norm_var": 0.10723368326822917, + "learning_rate": 0.0001, + "loss": 5.599, + "loss/crossentropy": 2.457643985748291, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16530482470989227, + "step": 17932 + }, + { + "epoch": 0.5604375, + "grad_norm": 3.421875, + "grad_norm_var": 0.09968973795572916, + "learning_rate": 0.0001, + "loss": 5.6505, + "loss/crossentropy": 2.549486994743347, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1655661016702652, + "step": 17934 + }, + { + "epoch": 0.5605, + "grad_norm": 3.125, + "grad_norm_var": 0.10122782389322917, + "learning_rate": 0.0001, + "loss": 5.8692, + "loss/crossentropy": 2.6171056032180786, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17521066218614578, + "step": 17936 + }, + { + "epoch": 0.5605625, + "grad_norm": 3.0625, + "grad_norm_var": 0.09503580729166666, + "learning_rate": 0.0001, + "loss": 5.907, + "loss/crossentropy": 2.6747989654541016, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17439532279968262, + "step": 17938 + }, + { + "epoch": 0.560625, + "grad_norm": 3.296875, + "grad_norm_var": 0.08818359375, + "learning_rate": 0.0001, + "loss": 5.9037, + "loss/crossentropy": 2.6498863697052, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17733769863843918, + "step": 17940 + }, + { + "epoch": 0.5606875, + "grad_norm": 3.015625, + "grad_norm_var": 0.1028472900390625, + "learning_rate": 0.0001, + "loss": 5.7041, + "loss/crossentropy": 2.5769314765930176, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16389025747776031, + "step": 17942 + }, + { + "epoch": 0.56075, + "grad_norm": 2.890625, + "grad_norm_var": 0.0528472900390625, + "learning_rate": 0.0001, + "loss": 5.5264, + "loss/crossentropy": 2.438861846923828, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1603192836046219, + "step": 17944 + }, + { + "epoch": 0.5608125, + "grad_norm": 4.0625, + "grad_norm_var": 0.07902018229166667, + "learning_rate": 0.0001, + "loss": 5.6449, + "loss/crossentropy": 2.5336352586746216, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16385617852210999, + "step": 17946 + }, + { + "epoch": 0.560875, + "grad_norm": 3.484375, + "grad_norm_var": 0.3541493733723958, + "learning_rate": 0.0001, + "loss": 5.8157, + "loss/crossentropy": 2.4811586141586304, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1822827011346817, + "step": 17948 + }, + { + "epoch": 0.5609375, + "grad_norm": 3.265625, + "grad_norm_var": 0.3592732747395833, + "learning_rate": 0.0001, + "loss": 5.6334, + "loss/crossentropy": 2.538175344467163, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16108077764511108, + "step": 17950 + }, + { + "epoch": 0.561, + "grad_norm": 3.4375, + "grad_norm_var": 0.36804097493489585, + "learning_rate": 0.0001, + "loss": 5.8501, + "loss/crossentropy": 2.6697648763656616, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16998320072889328, + "step": 17952 + }, + { + "epoch": 0.5610625, + "grad_norm": 2.921875, + "grad_norm_var": 0.3748982747395833, + "learning_rate": 0.0001, + "loss": 5.6979, + "loss/crossentropy": 2.6105103492736816, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16029831767082214, + "step": 17954 + }, + { + "epoch": 0.561125, + "grad_norm": 4.75, + "grad_norm_var": 0.5028483072916666, + "learning_rate": 0.0001, + "loss": 5.7485, + "loss/crossentropy": 2.5200387239456177, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17089248448610306, + "step": 17956 + }, + { + "epoch": 0.5611875, + "grad_norm": 3.25, + "grad_norm_var": 0.47515869140625, + "learning_rate": 0.0001, + "loss": 6.0636, + "loss/crossentropy": 2.766772508621216, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.18007465451955795, + "step": 17958 + }, + { + "epoch": 0.56125, + "grad_norm": 3.578125, + "grad_norm_var": 0.4588826497395833, + "learning_rate": 0.0001, + "loss": 5.9404, + "loss/crossentropy": 2.672636032104492, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17794343829154968, + "step": 17960 + }, + { + "epoch": 0.5613125, + "grad_norm": 3.125, + "grad_norm_var": 0.4217732747395833, + "learning_rate": 0.0001, + "loss": 5.8003, + "loss/crossentropy": 2.610080599784851, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.16511627286672592, + "step": 17962 + }, + { + "epoch": 0.561375, + "grad_norm": 3.28125, + "grad_norm_var": 0.19097900390625, + "learning_rate": 0.0001, + "loss": 5.6038, + "loss/crossentropy": 2.4610944986343384, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16622376441955566, + "step": 17964 + }, + { + "epoch": 0.5614375, + "grad_norm": 3.140625, + "grad_norm_var": 0.19611714680989584, + "learning_rate": 0.0001, + "loss": 5.5325, + "loss/crossentropy": 2.4628101587295532, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16165287792682648, + "step": 17966 + }, + { + "epoch": 0.5615, + "grad_norm": 3.453125, + "grad_norm_var": 0.18842671712239584, + "learning_rate": 0.0001, + "loss": 5.8332, + "loss/crossentropy": 2.6182034015655518, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17267628014087677, + "step": 17968 + }, + { + "epoch": 0.5615625, + "grad_norm": 3.203125, + "grad_norm_var": 0.17791341145833334, + "learning_rate": 0.0001, + "loss": 5.9445, + "loss/crossentropy": 2.6934006214141846, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1743318811058998, + "step": 17970 + }, + { + "epoch": 0.561625, + "grad_norm": 3.625, + "grad_norm_var": 0.043929036458333334, + "learning_rate": 0.0001, + "loss": 5.7551, + "loss/crossentropy": 2.579498291015625, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16755972057580948, + "step": 17972 + }, + { + "epoch": 0.5616875, + "grad_norm": 3.5625, + "grad_norm_var": 0.04993387858072917, + "learning_rate": 0.0001, + "loss": 5.9624, + "loss/crossentropy": 2.709150195121765, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17571967840194702, + "step": 17974 + }, + { + "epoch": 0.56175, + "grad_norm": 3.5, + "grad_norm_var": 0.05461324055989583, + "learning_rate": 0.0001, + "loss": 5.5391, + "loss/crossentropy": 2.4214595556259155, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16449732333421707, + "step": 17976 + }, + { + "epoch": 0.5618125, + "grad_norm": 3.59375, + "grad_norm_var": 0.07040608723958333, + "learning_rate": 0.0001, + "loss": 6.1091, + "loss/crossentropy": 2.7031766176223755, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18746396899223328, + "step": 17978 + }, + { + "epoch": 0.561875, + "grad_norm": 3.25, + "grad_norm_var": 0.0666015625, + "learning_rate": 0.0001, + "loss": 5.5015, + "loss/crossentropy": 2.404158592224121, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16207338869571686, + "step": 17980 + }, + { + "epoch": 0.5619375, + "grad_norm": 3.078125, + "grad_norm_var": 0.06629130045572916, + "learning_rate": 0.0001, + "loss": 5.6953, + "loss/crossentropy": 2.560208320617676, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16585327684879303, + "step": 17982 + }, + { + "epoch": 0.562, + "grad_norm": 3.09375, + "grad_norm_var": 0.06707356770833334, + "learning_rate": 0.0001, + "loss": 5.4656, + "loss/crossentropy": 2.439324140548706, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15653540194034576, + "step": 17984 + }, + { + "epoch": 0.5620625, + "grad_norm": 3.078125, + "grad_norm_var": 0.07502339680989584, + "learning_rate": 0.0001, + "loss": 5.6782, + "loss/crossentropy": 2.511128067970276, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16944289952516556, + "step": 17986 + }, + { + "epoch": 0.562125, + "grad_norm": 3.046875, + "grad_norm_var": 0.06309305826822917, + "learning_rate": 0.0001, + "loss": 5.772, + "loss/crossentropy": 2.579297661781311, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1692730113863945, + "step": 17988 + }, + { + "epoch": 0.5621875, + "grad_norm": 3.046875, + "grad_norm_var": 0.0550201416015625, + "learning_rate": 0.0001, + "loss": 5.5482, + "loss/crossentropy": 2.4625638723373413, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16129494458436966, + "step": 17990 + }, + { + "epoch": 0.56225, + "grad_norm": 3.140625, + "grad_norm_var": 0.0456451416015625, + "learning_rate": 0.0001, + "loss": 5.5685, + "loss/crossentropy": 2.4083163738250732, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16602043062448502, + "step": 17992 + }, + { + "epoch": 0.5623125, + "grad_norm": 3.15625, + "grad_norm_var": 0.0220367431640625, + "learning_rate": 0.0001, + "loss": 5.7701, + "loss/crossentropy": 2.5844286680221558, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16973651200532913, + "step": 17994 + }, + { + "epoch": 0.562375, + "grad_norm": 3.09375, + "grad_norm_var": 0.0264556884765625, + "learning_rate": 0.0001, + "loss": 6.0729, + "loss/crossentropy": 2.7710838317871094, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17784149944782257, + "step": 17996 + }, + { + "epoch": 0.5624375, + "grad_norm": 3.046875, + "grad_norm_var": 0.026009114583333333, + "learning_rate": 0.0001, + "loss": 5.6528, + "loss/crossentropy": 2.5517423152923584, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16479262709617615, + "step": 17998 + }, + { + "epoch": 0.5625, + "grad_norm": 3.109375, + "grad_norm_var": 0.024543253580729167, + "learning_rate": 0.0001, + "loss": 5.6744, + "loss/crossentropy": 2.527396082878113, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1670411452651024, + "step": 18000 + }, + { + "epoch": 0.5625625, + "grad_norm": 2.953125, + "grad_norm_var": 0.026707967122395832, + "learning_rate": 0.0001, + "loss": 5.4326, + "loss/crossentropy": 2.3805371522903442, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1598893702030182, + "step": 18002 + }, + { + "epoch": 0.562625, + "grad_norm": 3.203125, + "grad_norm_var": 0.01998291015625, + "learning_rate": 0.0001, + "loss": 5.5445, + "loss/crossentropy": 2.423843502998352, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1671416386961937, + "step": 18004 + }, + { + "epoch": 0.5626875, + "grad_norm": 3.34375, + "grad_norm_var": 0.0218170166015625, + "learning_rate": 0.0001, + "loss": 6.0422, + "loss/crossentropy": 2.7827892303466797, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17320791631937027, + "step": 18006 + }, + { + "epoch": 0.56275, + "grad_norm": 3.328125, + "grad_norm_var": 0.017894490559895834, + "learning_rate": 0.0001, + "loss": 5.5495, + "loss/crossentropy": 2.460228204727173, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16322024911642075, + "step": 18008 + }, + { + "epoch": 0.5628125, + "grad_norm": 3.203125, + "grad_norm_var": 0.020308430989583334, + "learning_rate": 0.0001, + "loss": 5.5649, + "loss/crossentropy": 2.450485348701477, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16378730535507202, + "step": 18010 + }, + { + "epoch": 0.562875, + "grad_norm": 3.78125, + "grad_norm_var": 0.04619140625, + "learning_rate": 0.0001, + "loss": 5.4887, + "loss/crossentropy": 2.4171236753463745, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1591072902083397, + "step": 18012 + }, + { + "epoch": 0.5629375, + "grad_norm": 3.0, + "grad_norm_var": 0.04705403645833333, + "learning_rate": 0.0001, + "loss": 5.3726, + "loss/crossentropy": 2.2885377407073975, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.15801254659891129, + "step": 18014 + }, + { + "epoch": 0.563, + "grad_norm": 3.015625, + "grad_norm_var": 0.04920145670572917, + "learning_rate": 0.0001, + "loss": 5.6008, + "loss/crossentropy": 2.5071990489959717, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16052991151809692, + "step": 18016 + }, + { + "epoch": 0.5630625, + "grad_norm": 3.09375, + "grad_norm_var": 0.042801920572916666, + "learning_rate": 0.0001, + "loss": 5.8623, + "loss/crossentropy": 2.7066714763641357, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1702457219362259, + "step": 18018 + }, + { + "epoch": 0.563125, + "grad_norm": 3.09375, + "grad_norm_var": 0.04732666015625, + "learning_rate": 0.0001, + "loss": 6.058, + "loss/crossentropy": 2.8239424228668213, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17106150835752487, + "step": 18020 + }, + { + "epoch": 0.5631875, + "grad_norm": 3.0625, + "grad_norm_var": 0.0458984375, + "learning_rate": 0.0001, + "loss": 5.6258, + "loss/crossentropy": 2.5434820652008057, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1637033000588417, + "step": 18022 + }, + { + "epoch": 0.56325, + "grad_norm": 3.25, + "grad_norm_var": 0.0510406494140625, + "learning_rate": 0.0001, + "loss": 5.5696, + "loss/crossentropy": 2.4420065879821777, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16666939854621887, + "step": 18024 + }, + { + "epoch": 0.5633125, + "grad_norm": 3.953125, + "grad_norm_var": 0.0898101806640625, + "learning_rate": 0.0001, + "loss": 5.8895, + "loss/crossentropy": 2.6028175354003906, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17671451717615128, + "step": 18026 + }, + { + "epoch": 0.563375, + "grad_norm": 3.296875, + "grad_norm_var": 0.08720296223958333, + "learning_rate": 0.0001, + "loss": 5.7647, + "loss/crossentropy": 2.6395243406295776, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16290399432182312, + "step": 18028 + }, + { + "epoch": 0.5634375, + "grad_norm": 3.375, + "grad_norm_var": 0.08398030598958334, + "learning_rate": 0.0001, + "loss": 5.6807, + "loss/crossentropy": 2.485153317451477, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16877098381519318, + "step": 18030 + }, + { + "epoch": 0.5635, + "grad_norm": 3.71875, + "grad_norm_var": 0.0890289306640625, + "learning_rate": 0.0001, + "loss": 5.8858, + "loss/crossentropy": 2.686255097389221, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17073087394237518, + "step": 18032 + }, + { + "epoch": 0.5635625, + "grad_norm": 3.0625, + "grad_norm_var": 0.09696858723958333, + "learning_rate": 0.0001, + "loss": 5.639, + "loss/crossentropy": 2.5935487747192383, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16040372848510742, + "step": 18034 + }, + { + "epoch": 0.563625, + "grad_norm": 3.234375, + "grad_norm_var": 0.09919331868489584, + "learning_rate": 0.0001, + "loss": 5.5149, + "loss/crossentropy": 2.442253351211548, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16273579001426697, + "step": 18036 + }, + { + "epoch": 0.5636875, + "grad_norm": 3.15625, + "grad_norm_var": 0.0981353759765625, + "learning_rate": 0.0001, + "loss": 5.7802, + "loss/crossentropy": 2.650180459022522, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1661262959241867, + "step": 18038 + }, + { + "epoch": 0.56375, + "grad_norm": 3.59375, + "grad_norm_var": 0.09782613118489583, + "learning_rate": 0.0001, + "loss": 5.7985, + "loss/crossentropy": 2.612444758415222, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17055834829807281, + "step": 18040 + }, + { + "epoch": 0.5638125, + "grad_norm": 3.375, + "grad_norm_var": 0.06830952962239584, + "learning_rate": 0.0001, + "loss": 5.4155, + "loss/crossentropy": 2.27899968624115, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16404490172863007, + "step": 18042 + }, + { + "epoch": 0.563875, + "grad_norm": 3.078125, + "grad_norm_var": 0.04676106770833333, + "learning_rate": 0.0001, + "loss": 5.6563, + "loss/crossentropy": 2.4841731786727905, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1707303375005722, + "step": 18044 + }, + { + "epoch": 0.5639375, + "grad_norm": 2.9375, + "grad_norm_var": 0.04976298014322917, + "learning_rate": 0.0001, + "loss": 5.3333, + "loss/crossentropy": 2.3177614212036133, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15624137222766876, + "step": 18046 + }, + { + "epoch": 0.564, + "grad_norm": 2.984375, + "grad_norm_var": 0.05961812337239583, + "learning_rate": 0.0001, + "loss": 5.7262, + "loss/crossentropy": 2.5988024473190308, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16312655061483383, + "step": 18048 + }, + { + "epoch": 0.5640625, + "grad_norm": 3.203125, + "grad_norm_var": 0.0648345947265625, + "learning_rate": 0.0001, + "loss": 5.5478, + "loss/crossentropy": 2.4362375736236572, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16584526747465134, + "step": 18050 + }, + { + "epoch": 0.564125, + "grad_norm": 3.125, + "grad_norm_var": 0.06805013020833334, + "learning_rate": 0.0001, + "loss": 5.7547, + "loss/crossentropy": 2.570965528488159, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16837147623300552, + "step": 18052 + }, + { + "epoch": 0.5641875, + "grad_norm": 3.90625, + "grad_norm_var": 0.09541727701822916, + "learning_rate": 0.0001, + "loss": 5.905, + "loss/crossentropy": 2.674785614013672, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17458803951740265, + "step": 18054 + }, + { + "epoch": 0.56425, + "grad_norm": 3.0625, + "grad_norm_var": 0.08625895182291667, + "learning_rate": 0.0001, + "loss": 5.9082, + "loss/crossentropy": 2.652057647705078, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17444629222154617, + "step": 18056 + }, + { + "epoch": 0.5643125, + "grad_norm": 2.8125, + "grad_norm_var": 0.09990946451822917, + "learning_rate": 0.0001, + "loss": 5.784, + "loss/crossentropy": 2.6740591526031494, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1664658635854721, + "step": 18058 + }, + { + "epoch": 0.564375, + "grad_norm": 3.28125, + "grad_norm_var": 0.10211181640625, + "learning_rate": 0.0001, + "loss": 5.8103, + "loss/crossentropy": 2.5839916467666626, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17185447365045547, + "step": 18060 + }, + { + "epoch": 0.5644375, + "grad_norm": 3.125, + "grad_norm_var": 0.0981842041015625, + "learning_rate": 0.0001, + "loss": 5.5052, + "loss/crossentropy": 2.3681094646453857, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1641031578183174, + "step": 18062 + }, + { + "epoch": 0.5645, + "grad_norm": 3.21875, + "grad_norm_var": 0.07350260416666667, + "learning_rate": 0.0001, + "loss": 5.9768, + "loss/crossentropy": 2.7085739374160767, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17760029435157776, + "step": 18064 + }, + { + "epoch": 0.5645625, + "grad_norm": 3.03125, + "grad_norm_var": 0.06754150390625, + "learning_rate": 0.0001, + "loss": 5.6523, + "loss/crossentropy": 2.599989175796509, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16304685175418854, + "step": 18066 + }, + { + "epoch": 0.564625, + "grad_norm": 3.34375, + "grad_norm_var": 0.0619140625, + "learning_rate": 0.0001, + "loss": 5.6176, + "loss/crossentropy": 2.507688045501709, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16021274030208588, + "step": 18068 + }, + { + "epoch": 0.5646875, + "grad_norm": 3.171875, + "grad_norm_var": 0.025516764322916666, + "learning_rate": 0.0001, + "loss": 5.6256, + "loss/crossentropy": 2.464373230934143, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1711968332529068, + "step": 18070 + }, + { + "epoch": 0.56475, + "grad_norm": 2.90625, + "grad_norm_var": 0.029280598958333334, + "learning_rate": 0.0001, + "loss": 5.476, + "loss/crossentropy": 2.4423896074295044, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16078495979309082, + "step": 18072 + }, + { + "epoch": 0.5648125, + "grad_norm": 3.09375, + "grad_norm_var": 0.021833292643229165, + "learning_rate": 0.0001, + "loss": 6.0034, + "loss/crossentropy": 2.7145031690597534, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17889142036437988, + "step": 18074 + }, + { + "epoch": 0.564875, + "grad_norm": 3.15625, + "grad_norm_var": 0.01295166015625, + "learning_rate": 0.0001, + "loss": 5.9951, + "loss/crossentropy": 2.786222457885742, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17284026741981506, + "step": 18076 + }, + { + "epoch": 0.5649375, + "grad_norm": 3.15625, + "grad_norm_var": 0.013411458333333333, + "learning_rate": 0.0001, + "loss": 5.8743, + "loss/crossentropy": 2.638497829437256, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17514370381832123, + "step": 18078 + }, + { + "epoch": 0.565, + "grad_norm": 3.546875, + "grad_norm_var": 0.022166951497395834, + "learning_rate": 0.0001, + "loss": 5.9054, + "loss/crossentropy": 2.6657952070236206, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.172395721077919, + "step": 18080 + }, + { + "epoch": 0.5650625, + "grad_norm": 2.875, + "grad_norm_var": 0.026883951822916665, + "learning_rate": 0.0001, + "loss": 5.417, + "loss/crossentropy": 2.3664300441741943, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15935391187667847, + "step": 18082 + }, + { + "epoch": 0.565125, + "grad_norm": 3.296875, + "grad_norm_var": 0.024332682291666668, + "learning_rate": 0.0001, + "loss": 5.7181, + "loss/crossentropy": 2.5044695138931274, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17058125138282776, + "step": 18084 + }, + { + "epoch": 0.5651875, + "grad_norm": 2.953125, + "grad_norm_var": 0.027928670247395832, + "learning_rate": 0.0001, + "loss": 5.6824, + "loss/crossentropy": 2.5775527954101562, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16126485168933868, + "step": 18086 + }, + { + "epoch": 0.56525, + "grad_norm": 3.1875, + "grad_norm_var": 0.0307769775390625, + "learning_rate": 0.0001, + "loss": 5.8573, + "loss/crossentropy": 2.610924482345581, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1703444868326187, + "step": 18088 + }, + { + "epoch": 0.5653125, + "grad_norm": 3.375, + "grad_norm_var": 0.031722005208333334, + "learning_rate": 0.0001, + "loss": 6.1673, + "loss/crossentropy": 2.812220335006714, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.18551187962293625, + "step": 18090 + }, + { + "epoch": 0.565375, + "grad_norm": 3.15625, + "grad_norm_var": 0.0330963134765625, + "learning_rate": 0.0001, + "loss": 5.709, + "loss/crossentropy": 2.5566617250442505, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16836372017860413, + "step": 18092 + }, + { + "epoch": 0.5654375, + "grad_norm": 3.0625, + "grad_norm_var": 0.03693033854166667, + "learning_rate": 0.0001, + "loss": 5.6252, + "loss/crossentropy": 2.5410051345825195, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1635015681385994, + "step": 18094 + }, + { + "epoch": 0.5655, + "grad_norm": 3.21875, + "grad_norm_var": 0.03241780598958333, + "learning_rate": 0.0001, + "loss": 5.7779, + "loss/crossentropy": 2.598138451576233, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1687622368335724, + "step": 18096 + }, + { + "epoch": 0.5655625, + "grad_norm": 3.296875, + "grad_norm_var": 0.024739583333333332, + "learning_rate": 0.0001, + "loss": 5.8979, + "loss/crossentropy": 2.6277064085006714, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17662711441516876, + "step": 18098 + }, + { + "epoch": 0.565625, + "grad_norm": 3.609375, + "grad_norm_var": 0.033984375, + "learning_rate": 0.0001, + "loss": 5.5418, + "loss/crossentropy": 2.3807681798934937, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16727932542562485, + "step": 18100 + }, + { + "epoch": 0.5656875, + "grad_norm": 3.3125, + "grad_norm_var": 0.03699442545572917, + "learning_rate": 0.0001, + "loss": 5.6414, + "loss/crossentropy": 2.5439178943634033, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16287732124328613, + "step": 18102 + }, + { + "epoch": 0.56575, + "grad_norm": 3.234375, + "grad_norm_var": 0.04049479166666667, + "learning_rate": 0.0001, + "loss": 6.0229, + "loss/crossentropy": 2.7287250757217407, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17551034688949585, + "step": 18104 + }, + { + "epoch": 0.5658125, + "grad_norm": 3.140625, + "grad_norm_var": 0.04299723307291667, + "learning_rate": 0.0001, + "loss": 5.6857, + "loss/crossentropy": 2.589537262916565, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1643010824918747, + "step": 18106 + }, + { + "epoch": 0.565875, + "grad_norm": 3.375, + "grad_norm_var": 0.04282938639322917, + "learning_rate": 0.0001, + "loss": 5.4694, + "loss/crossentropy": 2.3726253509521484, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.15928775072097778, + "step": 18108 + }, + { + "epoch": 0.5659375, + "grad_norm": 3.5, + "grad_norm_var": 0.040299479166666666, + "learning_rate": 0.0001, + "loss": 6.0452, + "loss/crossentropy": 2.6985702514648438, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1842763051390648, + "step": 18110 + }, + { + "epoch": 0.566, + "grad_norm": 3.03125, + "grad_norm_var": 0.04254150390625, + "learning_rate": 0.0001, + "loss": 5.8419, + "loss/crossentropy": 2.572100520133972, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17620253562927246, + "step": 18112 + }, + { + "epoch": 0.5660625, + "grad_norm": 2.671875, + "grad_norm_var": 0.0853424072265625, + "learning_rate": 0.0001, + "loss": 5.4406, + "loss/crossentropy": 2.40678334236145, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15767619758844376, + "step": 18114 + }, + { + "epoch": 0.566125, + "grad_norm": 3.359375, + "grad_norm_var": 0.07928059895833334, + "learning_rate": 0.0001, + "loss": 5.9884, + "loss/crossentropy": 2.7045150995254517, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17838776856660843, + "step": 18116 + }, + { + "epoch": 0.5661875, + "grad_norm": 3.21875, + "grad_norm_var": 0.07405192057291667, + "learning_rate": 0.0001, + "loss": 5.853, + "loss/crossentropy": 2.690333843231201, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16548164188861847, + "step": 18118 + }, + { + "epoch": 0.56625, + "grad_norm": 2.765625, + "grad_norm_var": 0.0814849853515625, + "learning_rate": 0.0001, + "loss": 5.5055, + "loss/crossentropy": 2.431414484977722, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.162482850253582, + "step": 18120 + }, + { + "epoch": 0.5663125, + "grad_norm": 3.328125, + "grad_norm_var": 0.08037109375, + "learning_rate": 0.0001, + "loss": 5.6249, + "loss/crossentropy": 2.4767180681228638, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16638048738241196, + "step": 18122 + }, + { + "epoch": 0.566375, + "grad_norm": 3.0, + "grad_norm_var": 0.08516337076822916, + "learning_rate": 0.0001, + "loss": 5.6001, + "loss/crossentropy": 2.540703773498535, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16140630096197128, + "step": 18124 + }, + { + "epoch": 0.5664375, + "grad_norm": 2.890625, + "grad_norm_var": 0.07871805826822917, + "learning_rate": 0.0001, + "loss": 5.5986, + "loss/crossentropy": 2.5523539781570435, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1585291028022766, + "step": 18126 + }, + { + "epoch": 0.5665, + "grad_norm": 3.265625, + "grad_norm_var": 0.07947489420572916, + "learning_rate": 0.0001, + "loss": 5.6393, + "loss/crossentropy": 2.4613728523254395, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16778801381587982, + "step": 18128 + }, + { + "epoch": 0.5665625, + "grad_norm": 3.09375, + "grad_norm_var": 0.047998046875, + "learning_rate": 0.0001, + "loss": 5.7893, + "loss/crossentropy": 2.535299062728882, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17422835528850555, + "step": 18130 + }, + { + "epoch": 0.566625, + "grad_norm": 3.453125, + "grad_norm_var": 0.0640777587890625, + "learning_rate": 0.0001, + "loss": 5.9657, + "loss/crossentropy": 2.6000982522964478, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.17679427564144135, + "step": 18132 + }, + { + "epoch": 0.5666875, + "grad_norm": 3.28125, + "grad_norm_var": 0.06733296712239584, + "learning_rate": 0.0001, + "loss": 5.5046, + "loss/crossentropy": 2.4269354343414307, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16050058603286743, + "step": 18134 + }, + { + "epoch": 0.56675, + "grad_norm": 3.359375, + "grad_norm_var": 0.05455729166666667, + "learning_rate": 0.0001, + "loss": 6.1821, + "loss/crossentropy": 2.747152090072632, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18959320336580276, + "step": 18136 + }, + { + "epoch": 0.5668125, + "grad_norm": 3.234375, + "grad_norm_var": 0.05486653645833333, + "learning_rate": 0.0001, + "loss": 5.8644, + "loss/crossentropy": 2.642526865005493, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17297182232141495, + "step": 18138 + }, + { + "epoch": 0.566875, + "grad_norm": 3.046875, + "grad_norm_var": 0.048005167643229166, + "learning_rate": 0.0001, + "loss": 5.8307, + "loss/crossentropy": 2.650195360183716, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1684427112340927, + "step": 18140 + }, + { + "epoch": 0.5669375, + "grad_norm": 3.25, + "grad_norm_var": 0.0490234375, + "learning_rate": 0.0001, + "loss": 5.5932, + "loss/crossentropy": 2.5209991931915283, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.15604948997497559, + "step": 18142 + }, + { + "epoch": 0.567, + "grad_norm": 2.96875, + "grad_norm_var": 0.057291666666666664, + "learning_rate": 0.0001, + "loss": 5.6901, + "loss/crossentropy": 2.4706814289093018, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16960027068853378, + "step": 18144 + }, + { + "epoch": 0.5670625, + "grad_norm": 3.453125, + "grad_norm_var": 0.05049540201822917, + "learning_rate": 0.0001, + "loss": 6.0374, + "loss/crossentropy": 2.76907217502594, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17527476698160172, + "step": 18146 + }, + { + "epoch": 0.567125, + "grad_norm": 3.15625, + "grad_norm_var": 0.03345438639322917, + "learning_rate": 0.0001, + "loss": 5.6292, + "loss/crossentropy": 2.4950783252716064, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16536633670330048, + "step": 18148 + }, + { + "epoch": 0.5671875, + "grad_norm": 3.125, + "grad_norm_var": 0.029670206705729167, + "learning_rate": 0.0001, + "loss": 5.6338, + "loss/crossentropy": 2.4978891611099243, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16632362455129623, + "step": 18150 + }, + { + "epoch": 0.56725, + "grad_norm": 3.03125, + "grad_norm_var": 0.0312164306640625, + "learning_rate": 0.0001, + "loss": 5.4433, + "loss/crossentropy": 2.4270702600479126, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1586506888270378, + "step": 18152 + }, + { + "epoch": 0.5673125, + "grad_norm": 2.9375, + "grad_norm_var": 0.03443603515625, + "learning_rate": 0.0001, + "loss": 5.6759, + "loss/crossentropy": 2.593072295188904, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16570191085338593, + "step": 18154 + }, + { + "epoch": 0.567375, + "grad_norm": 3.59375, + "grad_norm_var": 0.049117024739583334, + "learning_rate": 0.0001, + "loss": 5.6579, + "loss/crossentropy": 2.4805086851119995, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1693010777235031, + "step": 18156 + }, + { + "epoch": 0.5674375, + "grad_norm": 3.46875, + "grad_norm_var": 0.05154622395833333, + "learning_rate": 0.0001, + "loss": 5.7272, + "loss/crossentropy": 2.6106998920440674, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1635986492037773, + "step": 18158 + }, + { + "epoch": 0.5675, + "grad_norm": 3.484375, + "grad_norm_var": 0.04954325358072917, + "learning_rate": 0.0001, + "loss": 5.8808, + "loss/crossentropy": 2.5786153078079224, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17865897715091705, + "step": 18160 + }, + { + "epoch": 0.5675625, + "grad_norm": 3.375, + "grad_norm_var": 0.04856669108072917, + "learning_rate": 0.0001, + "loss": 5.8635, + "loss/crossentropy": 2.6471441984176636, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16890473663806915, + "step": 18162 + }, + { + "epoch": 0.567625, + "grad_norm": 2.8125, + "grad_norm_var": 0.06799214680989583, + "learning_rate": 0.0001, + "loss": 5.5292, + "loss/crossentropy": 2.555854558944702, + "loss/hidden": 1.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.1570984572172165, + "step": 18164 + }, + { + "epoch": 0.5676875, + "grad_norm": 3.203125, + "grad_norm_var": 0.06944071451822917, + "learning_rate": 0.0001, + "loss": 5.7648, + "loss/crossentropy": 2.5097345113754272, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17394182085990906, + "step": 18166 + }, + { + "epoch": 0.56775, + "grad_norm": 3.296875, + "grad_norm_var": 0.06754150390625, + "learning_rate": 0.0001, + "loss": 5.915, + "loss/crossentropy": 2.5866525173187256, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18127139657735825, + "step": 18168 + }, + { + "epoch": 0.5678125, + "grad_norm": 3.078125, + "grad_norm_var": 0.0651031494140625, + "learning_rate": 0.0001, + "loss": 5.6082, + "loss/crossentropy": 2.573627233505249, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15931487083435059, + "step": 18170 + }, + { + "epoch": 0.567875, + "grad_norm": 2.8125, + "grad_norm_var": 0.06481831868489583, + "learning_rate": 0.0001, + "loss": 5.3138, + "loss/crossentropy": 2.2177451848983765, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.15608614683151245, + "step": 18172 + }, + { + "epoch": 0.5679375, + "grad_norm": 2.890625, + "grad_norm_var": 0.06100260416666667, + "learning_rate": 0.0001, + "loss": 5.4315, + "loss/crossentropy": 2.41492235660553, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.16024888306856155, + "step": 18174 + }, + { + "epoch": 0.568, + "grad_norm": 3.046875, + "grad_norm_var": 0.05423177083333333, + "learning_rate": 0.0001, + "loss": 5.7092, + "loss/crossentropy": 2.5794578790664673, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16609789431095123, + "step": 18176 + }, + { + "epoch": 0.5680625, + "grad_norm": 3.328125, + "grad_norm_var": 0.049153645833333336, + "learning_rate": 0.0001, + "loss": 5.723, + "loss/crossentropy": 2.566588878631592, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16798074543476105, + "step": 18178 + }, + { + "epoch": 0.568125, + "grad_norm": 3.671875, + "grad_norm_var": 0.053511555989583334, + "learning_rate": 0.0001, + "loss": 5.5526, + "loss/crossentropy": 2.4305126667022705, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16377465426921844, + "step": 18180 + }, + { + "epoch": 0.5681875, + "grad_norm": 3.421875, + "grad_norm_var": 0.053055826822916666, + "learning_rate": 0.0001, + "loss": 5.7102, + "loss/crossentropy": 2.4845412969589233, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17451970279216766, + "step": 18182 + }, + { + "epoch": 0.56825, + "grad_norm": 3.46875, + "grad_norm_var": 0.05862223307291667, + "learning_rate": 0.0001, + "loss": 5.7808, + "loss/crossentropy": 2.5853875875473022, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16837100684642792, + "step": 18184 + }, + { + "epoch": 0.5683125, + "grad_norm": 3.0625, + "grad_norm_var": 0.057249959309895834, + "learning_rate": 0.0001, + "loss": 5.764, + "loss/crossentropy": 2.5821412801742554, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17013757675886154, + "step": 18186 + }, + { + "epoch": 0.568375, + "grad_norm": 2.984375, + "grad_norm_var": 0.04781901041666667, + "learning_rate": 0.0001, + "loss": 5.4794, + "loss/crossentropy": 2.471649646759033, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1570231318473816, + "step": 18188 + }, + { + "epoch": 0.5684375, + "grad_norm": 2.96875, + "grad_norm_var": 0.04551493326822917, + "learning_rate": 0.0001, + "loss": 5.6079, + "loss/crossentropy": 2.527212381362915, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1604156643152237, + "step": 18190 + }, + { + "epoch": 0.5685, + "grad_norm": 3.671875, + "grad_norm_var": 0.06295572916666667, + "learning_rate": 0.0001, + "loss": 5.9374, + "loss/crossentropy": 2.703904390335083, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1725686490535736, + "step": 18192 + }, + { + "epoch": 0.5685625, + "grad_norm": 3.125, + "grad_norm_var": 0.060282389322916664, + "learning_rate": 0.0001, + "loss": 5.9282, + "loss/crossentropy": 2.6780800819396973, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17578843981027603, + "step": 18194 + }, + { + "epoch": 0.568625, + "grad_norm": 3.328125, + "grad_norm_var": 0.04529520670572917, + "learning_rate": 0.0001, + "loss": 5.8886, + "loss/crossentropy": 2.6965173482894897, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16764166951179504, + "step": 18196 + }, + { + "epoch": 0.5686875, + "grad_norm": 3.046875, + "grad_norm_var": 0.045328776041666664, + "learning_rate": 0.0001, + "loss": 5.5994, + "loss/crossentropy": 2.541398763656616, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16126679629087448, + "step": 18198 + }, + { + "epoch": 0.56875, + "grad_norm": 3.3125, + "grad_norm_var": 0.040461222330729164, + "learning_rate": 0.0001, + "loss": 5.799, + "loss/crossentropy": 2.585677981376648, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1709369719028473, + "step": 18200 + }, + { + "epoch": 0.5688125, + "grad_norm": 2.90625, + "grad_norm_var": 0.05640360514322917, + "learning_rate": 0.0001, + "loss": 5.595, + "loss/crossentropy": 2.4870870113372803, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16391517221927643, + "step": 18202 + }, + { + "epoch": 0.568875, + "grad_norm": 3.109375, + "grad_norm_var": 0.049470011393229166, + "learning_rate": 0.0001, + "loss": 5.6932, + "loss/crossentropy": 2.5428810119628906, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16933035850524902, + "step": 18204 + }, + { + "epoch": 0.5689375, + "grad_norm": 3.203125, + "grad_norm_var": 0.04586181640625, + "learning_rate": 0.0001, + "loss": 5.7163, + "loss/crossentropy": 2.5672882795333862, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.17036515474319458, + "step": 18206 + }, + { + "epoch": 0.569, + "grad_norm": 2.984375, + "grad_norm_var": 0.029857381184895834, + "learning_rate": 0.0001, + "loss": 5.6548, + "loss/crossentropy": 2.4710731506347656, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1695476919412613, + "step": 18208 + }, + { + "epoch": 0.5690625, + "grad_norm": 3.046875, + "grad_norm_var": 0.030793253580729166, + "learning_rate": 0.0001, + "loss": 5.9761, + "loss/crossentropy": 2.7513391971588135, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17638200521469116, + "step": 18210 + }, + { + "epoch": 0.569125, + "grad_norm": 3.671875, + "grad_norm_var": 0.0537506103515625, + "learning_rate": 0.0001, + "loss": 5.7616, + "loss/crossentropy": 2.5401495695114136, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1705845668911934, + "step": 18212 + }, + { + "epoch": 0.5691875, + "grad_norm": 3.09375, + "grad_norm_var": 0.0755859375, + "learning_rate": 0.0001, + "loss": 5.4031, + "loss/crossentropy": 2.347140312194824, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15754767507314682, + "step": 18214 + }, + { + "epoch": 0.56925, + "grad_norm": 3.015625, + "grad_norm_var": 0.08241780598958333, + "learning_rate": 0.0001, + "loss": 5.6426, + "loss/crossentropy": 2.508566737174988, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16965033113956451, + "step": 18216 + }, + { + "epoch": 0.5693125, + "grad_norm": 2.984375, + "grad_norm_var": 0.07366434733072917, + "learning_rate": 0.0001, + "loss": 5.7891, + "loss/crossentropy": 2.6187134981155396, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16899175941944122, + "step": 18218 + }, + { + "epoch": 0.569375, + "grad_norm": 3.015625, + "grad_norm_var": 0.07834879557291667, + "learning_rate": 0.0001, + "loss": 5.4317, + "loss/crossentropy": 2.37195086479187, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.15557928383350372, + "step": 18220 + }, + { + "epoch": 0.5694375, + "grad_norm": 3.09375, + "grad_norm_var": 0.08283589680989584, + "learning_rate": 0.0001, + "loss": 5.7955, + "loss/crossentropy": 2.6413161754608154, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16776670515537262, + "step": 18222 + }, + { + "epoch": 0.5695, + "grad_norm": 3.078125, + "grad_norm_var": 0.0821685791015625, + "learning_rate": 0.0001, + "loss": 5.5111, + "loss/crossentropy": 2.4040359258651733, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1626596450805664, + "step": 18224 + }, + { + "epoch": 0.5695625, + "grad_norm": 3.015625, + "grad_norm_var": 0.08507486979166666, + "learning_rate": 0.0001, + "loss": 6.0612, + "loss/crossentropy": 2.7181628942489624, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18117854744195938, + "step": 18226 + }, + { + "epoch": 0.569625, + "grad_norm": 2.703125, + "grad_norm_var": 0.07452799479166666, + "learning_rate": 0.0001, + "loss": 5.2521, + "loss/crossentropy": 2.3347413539886475, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15111136436462402, + "step": 18228 + }, + { + "epoch": 0.5696875, + "grad_norm": 3.5625, + "grad_norm_var": 0.04504801432291667, + "learning_rate": 0.0001, + "loss": 5.963, + "loss/crossentropy": 2.7114120721817017, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17554765939712524, + "step": 18230 + }, + { + "epoch": 0.56975, + "grad_norm": 3.203125, + "grad_norm_var": 0.038407389322916666, + "learning_rate": 0.0001, + "loss": 5.815, + "loss/crossentropy": 2.5795810222625732, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17354222387075424, + "step": 18232 + }, + { + "epoch": 0.5698125, + "grad_norm": 3.0, + "grad_norm_var": 0.04634501139322917, + "learning_rate": 0.0001, + "loss": 5.6635, + "loss/crossentropy": 2.581420063972473, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16093791276216507, + "step": 18234 + }, + { + "epoch": 0.569875, + "grad_norm": 2.828125, + "grad_norm_var": 0.053694661458333334, + "learning_rate": 0.0001, + "loss": 5.7541, + "loss/crossentropy": 2.6179747581481934, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16947119683027267, + "step": 18236 + }, + { + "epoch": 0.5699375, + "grad_norm": 3.28125, + "grad_norm_var": 0.057184855143229164, + "learning_rate": 0.0001, + "loss": 6.0011, + "loss/crossentropy": 2.693076252937317, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17767733335494995, + "step": 18238 + }, + { + "epoch": 0.57, + "grad_norm": 3.1875, + "grad_norm_var": 0.0557037353515625, + "learning_rate": 0.0001, + "loss": 5.4824, + "loss/crossentropy": 2.4300020933151245, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15836870670318604, + "step": 18240 + }, + { + "epoch": 0.5700625, + "grad_norm": 2.984375, + "grad_norm_var": 0.054255167643229164, + "learning_rate": 0.0001, + "loss": 5.9279, + "loss/crossentropy": 2.6520771980285645, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17524328082799911, + "step": 18242 + }, + { + "epoch": 0.570125, + "grad_norm": 3.296875, + "grad_norm_var": 0.03718973795572917, + "learning_rate": 0.0001, + "loss": 5.6965, + "loss/crossentropy": 2.4949214458465576, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1732865646481514, + "step": 18244 + }, + { + "epoch": 0.5701875, + "grad_norm": 3.1875, + "grad_norm_var": 0.029296875, + "learning_rate": 0.0001, + "loss": 5.475, + "loss/crossentropy": 2.4005489349365234, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1621282771229744, + "step": 18246 + }, + { + "epoch": 0.57025, + "grad_norm": 2.984375, + "grad_norm_var": 0.03469645182291667, + "learning_rate": 0.0001, + "loss": 5.9914, + "loss/crossentropy": 2.6719961166381836, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1819431409239769, + "step": 18248 + }, + { + "epoch": 0.5703125, + "grad_norm": 2.953125, + "grad_norm_var": 0.03255106608072917, + "learning_rate": 0.0001, + "loss": 5.7835, + "loss/crossentropy": 2.5861481428146362, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17246808856725693, + "step": 18250 + }, + { + "epoch": 0.570375, + "grad_norm": 3.203125, + "grad_norm_var": 0.022977701822916665, + "learning_rate": 0.0001, + "loss": 5.6208, + "loss/crossentropy": 2.5238444805145264, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16165026277303696, + "step": 18252 + }, + { + "epoch": 0.5704375, + "grad_norm": 3.625, + "grad_norm_var": 0.03359375, + "learning_rate": 0.0001, + "loss": 5.9077, + "loss/crossentropy": 2.633137583732605, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1747267246246338, + "step": 18254 + }, + { + "epoch": 0.5705, + "grad_norm": 3.21875, + "grad_norm_var": 0.03357747395833333, + "learning_rate": 0.0001, + "loss": 5.8805, + "loss/crossentropy": 2.618165612220764, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17584676295518875, + "step": 18256 + }, + { + "epoch": 0.5705625, + "grad_norm": 3.3125, + "grad_norm_var": 0.030745442708333334, + "learning_rate": 0.0001, + "loss": 5.8216, + "loss/crossentropy": 2.597626805305481, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17278896272182465, + "step": 18258 + }, + { + "epoch": 0.570625, + "grad_norm": 3.109375, + "grad_norm_var": 0.030858357747395832, + "learning_rate": 0.0001, + "loss": 5.8035, + "loss/crossentropy": 2.5978556871414185, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16822044551372528, + "step": 18260 + }, + { + "epoch": 0.5706875, + "grad_norm": 3.265625, + "grad_norm_var": 0.029832967122395835, + "learning_rate": 0.0001, + "loss": 5.8593, + "loss/crossentropy": 2.6246622800827026, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17268268764019012, + "step": 18262 + }, + { + "epoch": 0.57075, + "grad_norm": 3.21875, + "grad_norm_var": 0.026558430989583333, + "learning_rate": 0.0001, + "loss": 5.7571, + "loss/crossentropy": 2.5931451320648193, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16717618703842163, + "step": 18264 + }, + { + "epoch": 0.5708125, + "grad_norm": 3.28125, + "grad_norm_var": 0.034505208333333336, + "learning_rate": 0.0001, + "loss": 6.2368, + "loss/crossentropy": 2.871632218360901, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18261412531137466, + "step": 18266 + }, + { + "epoch": 0.570875, + "grad_norm": 3.109375, + "grad_norm_var": 0.035074869791666664, + "learning_rate": 0.0001, + "loss": 5.8242, + "loss/crossentropy": 2.6054731607437134, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.16796521097421646, + "step": 18268 + }, + { + "epoch": 0.5709375, + "grad_norm": 3.125, + "grad_norm_var": 0.026041666666666668, + "learning_rate": 0.0001, + "loss": 5.5439, + "loss/crossentropy": 2.4600071907043457, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16151829808950424, + "step": 18270 + }, + { + "epoch": 0.571, + "grad_norm": 3.234375, + "grad_norm_var": 0.026102701822916668, + "learning_rate": 0.0001, + "loss": 5.733, + "loss/crossentropy": 2.5170669555664062, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1704181507229805, + "step": 18272 + }, + { + "epoch": 0.5710625, + "grad_norm": 3.15625, + "grad_norm_var": 0.02740478515625, + "learning_rate": 0.0001, + "loss": 5.5909, + "loss/crossentropy": 2.5140548944473267, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15885886549949646, + "step": 18274 + }, + { + "epoch": 0.571125, + "grad_norm": 3.1875, + "grad_norm_var": 0.026904296875, + "learning_rate": 0.0001, + "loss": 5.587, + "loss/crossentropy": 2.5062899589538574, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1596328616142273, + "step": 18276 + }, + { + "epoch": 0.5711875, + "grad_norm": 3.375, + "grad_norm_var": 0.03625895182291667, + "learning_rate": 0.0001, + "loss": 5.6553, + "loss/crossentropy": 2.5024070739746094, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16763748228549957, + "step": 18278 + }, + { + "epoch": 0.57125, + "grad_norm": 2.96875, + "grad_norm_var": 0.03748372395833333, + "learning_rate": 0.0001, + "loss": 5.7911, + "loss/crossentropy": 2.642797827720642, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16873770952224731, + "step": 18280 + }, + { + "epoch": 0.5713125, + "grad_norm": 3.28125, + "grad_norm_var": 0.029710896809895835, + "learning_rate": 0.0001, + "loss": 5.9137, + "loss/crossentropy": 2.618430018424988, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17640304565429688, + "step": 18282 + }, + { + "epoch": 0.571375, + "grad_norm": 3.546875, + "grad_norm_var": 0.03865458170572917, + "learning_rate": 0.0001, + "loss": 5.6919, + "loss/crossentropy": 2.452351212501526, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.16691983491182327, + "step": 18284 + }, + { + "epoch": 0.5714375, + "grad_norm": 3.140625, + "grad_norm_var": 0.03557942708333333, + "learning_rate": 0.0001, + "loss": 5.7971, + "loss/crossentropy": 2.5840392112731934, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17091979831457138, + "step": 18286 + }, + { + "epoch": 0.5715, + "grad_norm": 3.09375, + "grad_norm_var": 0.0437164306640625, + "learning_rate": 0.0001, + "loss": 5.5782, + "loss/crossentropy": 2.480460524559021, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1621190384030342, + "step": 18288 + }, + { + "epoch": 0.5715625, + "grad_norm": 3.125, + "grad_norm_var": 0.04248758951822917, + "learning_rate": 0.0001, + "loss": 5.649, + "loss/crossentropy": 2.4713305234909058, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1665937900543213, + "step": 18290 + }, + { + "epoch": 0.571625, + "grad_norm": 2.875, + "grad_norm_var": 0.0480621337890625, + "learning_rate": 0.0001, + "loss": 5.4988, + "loss/crossentropy": 2.4572752714157104, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1596246212720871, + "step": 18292 + }, + { + "epoch": 0.5716875, + "grad_norm": 3.3125, + "grad_norm_var": 0.03931884765625, + "learning_rate": 0.0001, + "loss": 5.6017, + "loss/crossentropy": 2.4361897706985474, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16694504022598267, + "step": 18294 + }, + { + "epoch": 0.57175, + "grad_norm": 3.140625, + "grad_norm_var": 0.04073893229166667, + "learning_rate": 0.0001, + "loss": 5.6611, + "loss/crossentropy": 2.462522864341736, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17141558229923248, + "step": 18296 + }, + { + "epoch": 0.5718125, + "grad_norm": 2.953125, + "grad_norm_var": 0.035868326822916664, + "learning_rate": 0.0001, + "loss": 5.5558, + "loss/crossentropy": 2.4553279876708984, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16317609697580338, + "step": 18298 + }, + { + "epoch": 0.571875, + "grad_norm": 3.09375, + "grad_norm_var": 0.024079386393229166, + "learning_rate": 0.0001, + "loss": 5.6954, + "loss/crossentropy": 2.598434805870056, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16399026662111282, + "step": 18300 + }, + { + "epoch": 0.5719375, + "grad_norm": 3.0625, + "grad_norm_var": 0.024300130208333333, + "learning_rate": 0.0001, + "loss": 6.0432, + "loss/crossentropy": 2.7587474584579468, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.18001211434602737, + "step": 18302 + }, + { + "epoch": 0.572, + "grad_norm": 3.03125, + "grad_norm_var": 0.021024576822916665, + "learning_rate": 0.0001, + "loss": 5.8149, + "loss/crossentropy": 2.6901882886886597, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1655968725681305, + "step": 18304 + }, + { + "epoch": 0.5720625, + "grad_norm": 3.140625, + "grad_norm_var": 0.018994140625, + "learning_rate": 0.0001, + "loss": 6.0247, + "loss/crossentropy": 2.765766978263855, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17511239647865295, + "step": 18306 + }, + { + "epoch": 0.572125, + "grad_norm": 3.296875, + "grad_norm_var": 0.016792805989583333, + "learning_rate": 0.0001, + "loss": 5.7581, + "loss/crossentropy": 2.6202938556671143, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1645662784576416, + "step": 18308 + }, + { + "epoch": 0.5721875, + "grad_norm": 3.375, + "grad_norm_var": 0.032942708333333334, + "learning_rate": 0.0001, + "loss": 5.7976, + "loss/crossentropy": 2.501075863838196, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17848410457372665, + "step": 18310 + }, + { + "epoch": 0.57225, + "grad_norm": 3.015625, + "grad_norm_var": 0.02906494140625, + "learning_rate": 0.0001, + "loss": 5.4869, + "loss/crossentropy": 2.4833608865737915, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1534803882241249, + "step": 18312 + }, + { + "epoch": 0.5723125, + "grad_norm": 2.90625, + "grad_norm_var": 0.037886555989583334, + "learning_rate": 0.0001, + "loss": 5.358, + "loss/crossentropy": 2.4097559452056885, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1522429808974266, + "step": 18314 + }, + { + "epoch": 0.572375, + "grad_norm": 3.265625, + "grad_norm_var": 0.040160115559895834, + "learning_rate": 0.0001, + "loss": 5.926, + "loss/crossentropy": 2.712649703025818, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17094045877456665, + "step": 18316 + }, + { + "epoch": 0.5724375, + "grad_norm": 3.1875, + "grad_norm_var": 0.06760660807291667, + "learning_rate": 0.0001, + "loss": 5.6137, + "loss/crossentropy": 2.382579803466797, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1727178767323494, + "step": 18318 + }, + { + "epoch": 0.5725, + "grad_norm": 3.078125, + "grad_norm_var": 0.064208984375, + "learning_rate": 0.0001, + "loss": 5.9288, + "loss/crossentropy": 2.7321243286132812, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16928109526634216, + "step": 18320 + }, + { + "epoch": 0.5725625, + "grad_norm": 3.375, + "grad_norm_var": 0.0653228759765625, + "learning_rate": 0.0001, + "loss": 5.541, + "loss/crossentropy": 2.371983051300049, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16612089425325394, + "step": 18322 + }, + { + "epoch": 0.572625, + "grad_norm": 3.296875, + "grad_norm_var": 0.06648661295572916, + "learning_rate": 0.0001, + "loss": 6.204, + "loss/crossentropy": 2.8039320707321167, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.19000670313835144, + "step": 18324 + }, + { + "epoch": 0.5726875, + "grad_norm": 3.046875, + "grad_norm_var": 0.055887858072916664, + "learning_rate": 0.0001, + "loss": 5.7852, + "loss/crossentropy": 2.6040135622024536, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1704602614045143, + "step": 18326 + }, + { + "epoch": 0.57275, + "grad_norm": 3.109375, + "grad_norm_var": 0.053446451822916664, + "learning_rate": 0.0001, + "loss": 5.8453, + "loss/crossentropy": 2.6023638248443604, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17624496668577194, + "step": 18328 + }, + { + "epoch": 0.5728125, + "grad_norm": 3.0, + "grad_norm_var": 0.03758036295572917, + "learning_rate": 0.0001, + "loss": 5.4056, + "loss/crossentropy": 2.3769582509994507, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15520767122507095, + "step": 18330 + }, + { + "epoch": 0.572875, + "grad_norm": 3.34375, + "grad_norm_var": 0.038134765625, + "learning_rate": 0.0001, + "loss": 5.8596, + "loss/crossentropy": 2.6480783224105835, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16959280520677567, + "step": 18332 + }, + { + "epoch": 0.5729375, + "grad_norm": 2.875, + "grad_norm_var": 0.022801717122395832, + "learning_rate": 0.0001, + "loss": 5.8033, + "loss/crossentropy": 2.5881412029266357, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1719089150428772, + "step": 18334 + }, + { + "epoch": 0.573, + "grad_norm": 3.28125, + "grad_norm_var": 0.02490234375, + "learning_rate": 0.0001, + "loss": 5.7223, + "loss/crossentropy": 2.5636401176452637, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16976793110370636, + "step": 18336 + }, + { + "epoch": 0.5730625, + "grad_norm": 2.953125, + "grad_norm_var": 0.03193359375, + "learning_rate": 0.0001, + "loss": 5.3448, + "loss/crossentropy": 2.340338110923767, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1566973328590393, + "step": 18338 + }, + { + "epoch": 0.573125, + "grad_norm": 3.265625, + "grad_norm_var": 0.02744140625, + "learning_rate": 0.0001, + "loss": 5.7583, + "loss/crossentropy": 2.6048583984375, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16924548894166946, + "step": 18340 + }, + { + "epoch": 0.5731875, + "grad_norm": 3.03125, + "grad_norm_var": 0.028620402018229168, + "learning_rate": 0.0001, + "loss": 5.3011, + "loss/crossentropy": 2.3738588094711304, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1505402997136116, + "step": 18342 + }, + { + "epoch": 0.57325, + "grad_norm": 3.015625, + "grad_norm_var": 0.028629557291666666, + "learning_rate": 0.0001, + "loss": 5.6385, + "loss/crossentropy": 2.5262138843536377, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16240377724170685, + "step": 18344 + }, + { + "epoch": 0.5733125, + "grad_norm": 2.9375, + "grad_norm_var": 0.0296875, + "learning_rate": 0.0001, + "loss": 5.7259, + "loss/crossentropy": 2.621517777442932, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16629406809806824, + "step": 18346 + }, + { + "epoch": 0.573375, + "grad_norm": 4.40625, + "grad_norm_var": 0.13762919108072916, + "learning_rate": 0.0001, + "loss": 6.3429, + "loss/crossentropy": 2.801121711730957, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.19714495539665222, + "step": 18348 + }, + { + "epoch": 0.5734375, + "grad_norm": 3.265625, + "grad_norm_var": 0.16383463541666668, + "learning_rate": 0.0001, + "loss": 6.0113, + "loss/crossentropy": 2.6030811071395874, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18340133130550385, + "step": 18350 + }, + { + "epoch": 0.5735, + "grad_norm": 3.421875, + "grad_norm_var": 0.16230061848958333, + "learning_rate": 0.0001, + "loss": 5.8881, + "loss/crossentropy": 2.6851214170455933, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17225381731987, + "step": 18352 + }, + { + "epoch": 0.5735625, + "grad_norm": 3.921875, + "grad_norm_var": 0.1701568603515625, + "learning_rate": 0.0001, + "loss": 5.8568, + "loss/crossentropy": 2.567447066307068, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17659613490104675, + "step": 18354 + }, + { + "epoch": 0.573625, + "grad_norm": 2.953125, + "grad_norm_var": 0.18293863932291668, + "learning_rate": 0.0001, + "loss": 5.1646, + "loss/crossentropy": 2.254288613796234, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1457211896777153, + "step": 18356 + }, + { + "epoch": 0.5736875, + "grad_norm": 3.21875, + "grad_norm_var": 0.17039286295572917, + "learning_rate": 0.0001, + "loss": 5.6014, + "loss/crossentropy": 2.5073583126068115, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16213755309581757, + "step": 18358 + }, + { + "epoch": 0.57375, + "grad_norm": 3.078125, + "grad_norm_var": 0.16236572265625, + "learning_rate": 0.0001, + "loss": 5.8935, + "loss/crossentropy": 2.6835970878601074, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17294839024543762, + "step": 18360 + }, + { + "epoch": 0.5738125, + "grad_norm": 3.375, + "grad_norm_var": 0.15689697265625, + "learning_rate": 0.0001, + "loss": 5.2427, + "loss/crossentropy": 2.2930054664611816, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15044043213129044, + "step": 18362 + }, + { + "epoch": 0.573875, + "grad_norm": 3.328125, + "grad_norm_var": 0.08193359375, + "learning_rate": 0.0001, + "loss": 5.9014, + "loss/crossentropy": 2.6530340909957886, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17171156406402588, + "step": 18364 + }, + { + "epoch": 0.5739375, + "grad_norm": 3.03125, + "grad_norm_var": 0.06304931640625, + "learning_rate": 0.0001, + "loss": 5.107, + "loss/crossentropy": 2.146529793739319, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1523013487458229, + "step": 18366 + }, + { + "epoch": 0.574, + "grad_norm": 3.40625, + "grad_norm_var": 0.06550191243489584, + "learning_rate": 0.0001, + "loss": 5.7736, + "loss/crossentropy": 2.613413095474243, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16914102435112, + "step": 18368 + }, + { + "epoch": 0.5740625, + "grad_norm": 3.15625, + "grad_norm_var": 0.0372222900390625, + "learning_rate": 0.0001, + "loss": 5.4298, + "loss/crossentropy": 2.3642795085906982, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15850616991519928, + "step": 18370 + }, + { + "epoch": 0.574125, + "grad_norm": 3.046875, + "grad_norm_var": 0.029182942708333333, + "learning_rate": 0.0001, + "loss": 5.69, + "loss/crossentropy": 2.529383659362793, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16645421832799911, + "step": 18372 + }, + { + "epoch": 0.5741875, + "grad_norm": 3.140625, + "grad_norm_var": 0.027391560872395835, + "learning_rate": 0.0001, + "loss": 5.8107, + "loss/crossentropy": 2.658630132675171, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16794230043888092, + "step": 18374 + }, + { + "epoch": 0.57425, + "grad_norm": 3.03125, + "grad_norm_var": 0.024266560872395832, + "learning_rate": 0.0001, + "loss": 5.745, + "loss/crossentropy": 2.672680377960205, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16192089766263962, + "step": 18376 + }, + { + "epoch": 0.5743125, + "grad_norm": 3.09375, + "grad_norm_var": 0.0211090087890625, + "learning_rate": 0.0001, + "loss": 5.4341, + "loss/crossentropy": 2.433584213256836, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15864913165569305, + "step": 18378 + }, + { + "epoch": 0.574375, + "grad_norm": 3.140625, + "grad_norm_var": 0.018342081705729166, + "learning_rate": 0.0001, + "loss": 5.5019, + "loss/crossentropy": 2.3593112230300903, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16269293427467346, + "step": 18380 + }, + { + "epoch": 0.5744375, + "grad_norm": 2.96875, + "grad_norm_var": 0.018097941080729166, + "learning_rate": 0.0001, + "loss": 5.9276, + "loss/crossentropy": 2.6684865951538086, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17708152532577515, + "step": 18382 + }, + { + "epoch": 0.5745, + "grad_norm": 3.25, + "grad_norm_var": 0.01402587890625, + "learning_rate": 0.0001, + "loss": 5.6939, + "loss/crossentropy": 2.51468563079834, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16986984014511108, + "step": 18384 + }, + { + "epoch": 0.5745625, + "grad_norm": 3.296875, + "grad_norm_var": 0.013863118489583333, + "learning_rate": 0.0001, + "loss": 5.8839, + "loss/crossentropy": 2.6714022159576416, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17280860990285873, + "step": 18386 + }, + { + "epoch": 0.574625, + "grad_norm": 2.96875, + "grad_norm_var": 0.015230305989583333, + "learning_rate": 0.0001, + "loss": 5.5828, + "loss/crossentropy": 2.4910420179367065, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1638609543442726, + "step": 18388 + }, + { + "epoch": 0.5746875, + "grad_norm": 2.984375, + "grad_norm_var": 0.029100545247395835, + "learning_rate": 0.0001, + "loss": 5.7003, + "loss/crossentropy": 2.552237868309021, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1675410270690918, + "step": 18390 + }, + { + "epoch": 0.57475, + "grad_norm": 2.9375, + "grad_norm_var": 0.03125, + "learning_rate": 0.0001, + "loss": 5.8232, + "loss/crossentropy": 2.698289394378662, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16600815951824188, + "step": 18392 + }, + { + "epoch": 0.5748125, + "grad_norm": 3.140625, + "grad_norm_var": 0.03033447265625, + "learning_rate": 0.0001, + "loss": 5.7958, + "loss/crossentropy": 2.6374276876449585, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.17091414332389832, + "step": 18394 + }, + { + "epoch": 0.574875, + "grad_norm": 3.359375, + "grad_norm_var": 0.033036295572916666, + "learning_rate": 0.0001, + "loss": 5.9827, + "loss/crossentropy": 2.7636018991470337, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17386632412672043, + "step": 18396 + }, + { + "epoch": 0.5749375, + "grad_norm": 3.9375, + "grad_norm_var": 0.061930338541666664, + "learning_rate": 0.0001, + "loss": 5.4679, + "loss/crossentropy": 2.3220553398132324, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16341203451156616, + "step": 18398 + }, + { + "epoch": 0.575, + "grad_norm": 3.078125, + "grad_norm_var": 0.06453348795572916, + "learning_rate": 0.0001, + "loss": 5.5277, + "loss/crossentropy": 2.4370415210723877, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1629718393087387, + "step": 18400 + }, + { + "epoch": 0.5750625, + "grad_norm": 3.25, + "grad_norm_var": 0.06345926920572917, + "learning_rate": 0.0001, + "loss": 5.7709, + "loss/crossentropy": 2.5731245279312134, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16899394243955612, + "step": 18402 + }, + { + "epoch": 0.575125, + "grad_norm": 3.53125, + "grad_norm_var": 0.0649078369140625, + "learning_rate": 0.0001, + "loss": 5.5094, + "loss/crossentropy": 2.4272419214248657, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15977945178747177, + "step": 18404 + }, + { + "epoch": 0.5751875, + "grad_norm": 3.09375, + "grad_norm_var": 0.05481363932291667, + "learning_rate": 0.0001, + "loss": 5.8179, + "loss/crossentropy": 2.6184180974960327, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17189937829971313, + "step": 18406 + }, + { + "epoch": 0.57525, + "grad_norm": 3.296875, + "grad_norm_var": 0.05034077962239583, + "learning_rate": 0.0001, + "loss": 5.7, + "loss/crossentropy": 2.5761481523513794, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16317031532526016, + "step": 18408 + }, + { + "epoch": 0.5753125, + "grad_norm": 3.234375, + "grad_norm_var": 0.0506744384765625, + "learning_rate": 0.0001, + "loss": 5.7413, + "loss/crossentropy": 2.5246126651763916, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17089063674211502, + "step": 18410 + }, + { + "epoch": 0.575375, + "grad_norm": 3.140625, + "grad_norm_var": 0.04970703125, + "learning_rate": 0.0001, + "loss": 5.8667, + "loss/crossentropy": 2.5869481563568115, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1764107272028923, + "step": 18412 + }, + { + "epoch": 0.5754375, + "grad_norm": 3.109375, + "grad_norm_var": 0.19708658854166666, + "learning_rate": 0.0001, + "loss": 5.6101, + "loss/crossentropy": 2.4800440073013306, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16651704907417297, + "step": 18414 + }, + { + "epoch": 0.5755, + "grad_norm": 3.0625, + "grad_norm_var": 0.19455973307291666, + "learning_rate": 0.0001, + "loss": 5.7655, + "loss/crossentropy": 2.6151453256607056, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1654304414987564, + "step": 18416 + }, + { + "epoch": 0.5755625, + "grad_norm": 2.890625, + "grad_norm_var": 0.20732014973958332, + "learning_rate": 0.0001, + "loss": 5.6632, + "loss/crossentropy": 2.492995023727417, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1666259467601776, + "step": 18418 + }, + { + "epoch": 0.575625, + "grad_norm": 3.609375, + "grad_norm_var": 0.2154296875, + "learning_rate": 0.0001, + "loss": 5.7008, + "loss/crossentropy": 2.5649964809417725, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16826294362545013, + "step": 18420 + }, + { + "epoch": 0.5756875, + "grad_norm": 3.109375, + "grad_norm_var": 0.21497395833333333, + "learning_rate": 0.0001, + "loss": 5.5362, + "loss/crossentropy": 2.4699827432632446, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16209031641483307, + "step": 18422 + }, + { + "epoch": 0.57575, + "grad_norm": 3.5, + "grad_norm_var": 0.21179911295572917, + "learning_rate": 0.0001, + "loss": 5.9305, + "loss/crossentropy": 2.666316509246826, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.176804319024086, + "step": 18424 + }, + { + "epoch": 0.5758125, + "grad_norm": 3.203125, + "grad_norm_var": 0.2167633056640625, + "learning_rate": 0.0001, + "loss": 5.6947, + "loss/crossentropy": 2.5294259786605835, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16535243391990662, + "step": 18426 + }, + { + "epoch": 0.575875, + "grad_norm": 3.359375, + "grad_norm_var": 0.21601460774739584, + "learning_rate": 0.0001, + "loss": 5.8707, + "loss/crossentropy": 2.6436872482299805, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1730949506163597, + "step": 18428 + }, + { + "epoch": 0.5759375, + "grad_norm": 2.96875, + "grad_norm_var": 0.04572652180989583, + "learning_rate": 0.0001, + "loss": 5.5814, + "loss/crossentropy": 2.4956637620925903, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16482633352279663, + "step": 18430 + }, + { + "epoch": 0.576, + "grad_norm": 3.1875, + "grad_norm_var": 0.0509918212890625, + "learning_rate": 0.0001, + "loss": 5.5559, + "loss/crossentropy": 2.48967969417572, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.159743033349514, + "step": 18432 + }, + { + "epoch": 0.5760625, + "grad_norm": 3.09375, + "grad_norm_var": 0.0450347900390625, + "learning_rate": 0.0001, + "loss": 5.5264, + "loss/crossentropy": 2.4949833154678345, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15587330609560013, + "step": 18434 + }, + { + "epoch": 0.576125, + "grad_norm": 3.3125, + "grad_norm_var": 0.0343658447265625, + "learning_rate": 0.0001, + "loss": 5.5187, + "loss/crossentropy": 2.3616496324539185, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16804824024438858, + "step": 18436 + }, + { + "epoch": 0.5761875, + "grad_norm": 3.171875, + "grad_norm_var": 0.03341471354166667, + "learning_rate": 0.0001, + "loss": 5.8019, + "loss/crossentropy": 2.589166522026062, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17400363087654114, + "step": 18438 + }, + { + "epoch": 0.57625, + "grad_norm": 2.984375, + "grad_norm_var": 0.0271881103515625, + "learning_rate": 0.0001, + "loss": 5.901, + "loss/crossentropy": 2.666108012199402, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17310288548469543, + "step": 18440 + }, + { + "epoch": 0.5763125, + "grad_norm": 3.28125, + "grad_norm_var": 0.028327433268229167, + "learning_rate": 0.0001, + "loss": 5.9678, + "loss/crossentropy": 2.8197555541992188, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16714906692504883, + "step": 18442 + }, + { + "epoch": 0.576375, + "grad_norm": 2.984375, + "grad_norm_var": 0.028473917643229166, + "learning_rate": 0.0001, + "loss": 5.3827, + "loss/crossentropy": 2.303182363510132, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16264119744300842, + "step": 18444 + }, + { + "epoch": 0.5764375, + "grad_norm": 3.15625, + "grad_norm_var": 0.10107014973958334, + "learning_rate": 0.0001, + "loss": 6.1568, + "loss/crossentropy": 2.79031503200531, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18430818617343903, + "step": 18446 + }, + { + "epoch": 0.5765, + "grad_norm": 3.234375, + "grad_norm_var": 0.132275390625, + "learning_rate": 0.0001, + "loss": 5.8918, + "loss/crossentropy": 2.7167197465896606, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1706361472606659, + "step": 18448 + }, + { + "epoch": 0.5765625, + "grad_norm": 3.46875, + "grad_norm_var": 0.12077534993489583, + "learning_rate": 0.0001, + "loss": 5.7644, + "loss/crossentropy": 2.5743154287338257, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16939499974250793, + "step": 18450 + }, + { + "epoch": 0.576625, + "grad_norm": 3.1875, + "grad_norm_var": 0.11385091145833333, + "learning_rate": 0.0001, + "loss": 6.1072, + "loss/crossentropy": 2.811020851135254, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17922312766313553, + "step": 18452 + }, + { + "epoch": 0.5766875, + "grad_norm": 3.984375, + "grad_norm_var": 0.14557291666666666, + "learning_rate": 0.0001, + "loss": 5.8725, + "loss/crossentropy": 2.609155774116516, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17164289951324463, + "step": 18454 + }, + { + "epoch": 0.57675, + "grad_norm": 3.09375, + "grad_norm_var": 0.14801432291666666, + "learning_rate": 0.0001, + "loss": 5.6958, + "loss/crossentropy": 2.572449564933777, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16623719781637192, + "step": 18456 + }, + { + "epoch": 0.5768125, + "grad_norm": 3.03125, + "grad_norm_var": 0.15652669270833333, + "learning_rate": 0.0001, + "loss": 5.8234, + "loss/crossentropy": 2.6459431648254395, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17087049037218094, + "step": 18458 + }, + { + "epoch": 0.576875, + "grad_norm": 3.09375, + "grad_norm_var": 0.15507405598958332, + "learning_rate": 0.0001, + "loss": 5.7833, + "loss/crossentropy": 2.5941377878189087, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16930782049894333, + "step": 18460 + }, + { + "epoch": 0.5769375, + "grad_norm": 3.703125, + "grad_norm_var": 0.11103108723958334, + "learning_rate": 0.0001, + "loss": 5.9248, + "loss/crossentropy": 2.6534959077835083, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17478252947330475, + "step": 18462 + }, + { + "epoch": 0.577, + "grad_norm": 3.359375, + "grad_norm_var": 0.07961832682291667, + "learning_rate": 0.0001, + "loss": 5.7831, + "loss/crossentropy": 2.5179115533828735, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17339365184307098, + "step": 18464 + }, + { + "epoch": 0.5770625, + "grad_norm": 3.296875, + "grad_norm_var": 0.08166910807291666, + "learning_rate": 0.0001, + "loss": 5.5137, + "loss/crossentropy": 2.4228307008743286, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.15752895176410675, + "step": 18466 + }, + { + "epoch": 0.577125, + "grad_norm": 2.9375, + "grad_norm_var": 0.08742574055989584, + "learning_rate": 0.0001, + "loss": 5.3917, + "loss/crossentropy": 2.4382351636886597, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.14769209921360016, + "step": 18468 + }, + { + "epoch": 0.5771875, + "grad_norm": 2.9375, + "grad_norm_var": 0.04976298014322917, + "learning_rate": 0.0001, + "loss": 5.2428, + "loss/crossentropy": 2.2560296058654785, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15570665895938873, + "step": 18470 + }, + { + "epoch": 0.57725, + "grad_norm": 3.375, + "grad_norm_var": 0.05047200520833333, + "learning_rate": 0.0001, + "loss": 5.824, + "loss/crossentropy": 2.6376614570617676, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17058780789375305, + "step": 18472 + }, + { + "epoch": 0.5773125, + "grad_norm": 2.96875, + "grad_norm_var": 0.05113525390625, + "learning_rate": 0.0001, + "loss": 5.8204, + "loss/crossentropy": 2.6507813930511475, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16969439387321472, + "step": 18474 + }, + { + "epoch": 0.577375, + "grad_norm": 3.0, + "grad_norm_var": 0.0600250244140625, + "learning_rate": 0.0001, + "loss": 5.7476, + "loss/crossentropy": 2.5414364337921143, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.17530686408281326, + "step": 18476 + }, + { + "epoch": 0.5774375, + "grad_norm": 3.375, + "grad_norm_var": 0.045221964518229164, + "learning_rate": 0.0001, + "loss": 5.8855, + "loss/crossentropy": 2.636993646621704, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17758085578680038, + "step": 18478 + }, + { + "epoch": 0.5775, + "grad_norm": 3.9375, + "grad_norm_var": 0.07720947265625, + "learning_rate": 0.0001, + "loss": 6.3393, + "loss/crossentropy": 2.9106377363204956, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18544892221689224, + "step": 18480 + }, + { + "epoch": 0.5775625, + "grad_norm": 2.84375, + "grad_norm_var": 0.08156636555989584, + "learning_rate": 0.0001, + "loss": 5.5022, + "loss/crossentropy": 2.3790981769561768, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16348543763160706, + "step": 18482 + }, + { + "epoch": 0.577625, + "grad_norm": 2.84375, + "grad_norm_var": 0.08564046223958334, + "learning_rate": 0.0001, + "loss": 5.8974, + "loss/crossentropy": 2.7403557300567627, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1680453196167946, + "step": 18484 + }, + { + "epoch": 0.5776875, + "grad_norm": 3.734375, + "grad_norm_var": 0.09696858723958333, + "learning_rate": 0.0001, + "loss": 5.8688, + "loss/crossentropy": 2.5689350366592407, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1749121993780136, + "step": 18486 + }, + { + "epoch": 0.57775, + "grad_norm": 3.453125, + "grad_norm_var": 0.09922587076822917, + "learning_rate": 0.0001, + "loss": 5.7181, + "loss/crossentropy": 2.4852821826934814, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1721075251698494, + "step": 18488 + }, + { + "epoch": 0.5778125, + "grad_norm": 2.828125, + "grad_norm_var": 0.11477457682291667, + "learning_rate": 0.0001, + "loss": 5.6, + "loss/crossentropy": 2.5532870292663574, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15935859829187393, + "step": 18490 + }, + { + "epoch": 0.577875, + "grad_norm": 2.9375, + "grad_norm_var": 0.11077372233072917, + "learning_rate": 0.0001, + "loss": 5.7528, + "loss/crossentropy": 2.609677314758301, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1662694588303566, + "step": 18492 + }, + { + "epoch": 0.5779375, + "grad_norm": 3.421875, + "grad_norm_var": 0.10816650390625, + "learning_rate": 0.0001, + "loss": 5.5932, + "loss/crossentropy": 2.4124221801757812, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1676826924085617, + "step": 18494 + }, + { + "epoch": 0.578, + "grad_norm": 3.125, + "grad_norm_var": 0.07392578125, + "learning_rate": 0.0001, + "loss": 5.4063, + "loss/crossentropy": 2.3695526123046875, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1575787216424942, + "step": 18496 + }, + { + "epoch": 0.5780625, + "grad_norm": 2.984375, + "grad_norm_var": 0.068359375, + "learning_rate": 0.0001, + "loss": 5.419, + "loss/crossentropy": 2.3370147943496704, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16210711747407913, + "step": 18498 + }, + { + "epoch": 0.578125, + "grad_norm": 3.046875, + "grad_norm_var": 0.0612213134765625, + "learning_rate": 0.0001, + "loss": 5.8758, + "loss/crossentropy": 2.6599076986312866, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1700289249420166, + "step": 18500 + }, + { + "epoch": 0.5781875, + "grad_norm": 3.1875, + "grad_norm_var": 0.0375396728515625, + "learning_rate": 0.0001, + "loss": 5.64, + "loss/crossentropy": 2.544037103652954, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16194117069244385, + "step": 18502 + }, + { + "epoch": 0.57825, + "grad_norm": 3.203125, + "grad_norm_var": 0.029572550455729166, + "learning_rate": 0.0001, + "loss": 5.6184, + "loss/crossentropy": 2.5240252017974854, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16099613159894943, + "step": 18504 + }, + { + "epoch": 0.5783125, + "grad_norm": 3.265625, + "grad_norm_var": 0.022330729166666667, + "learning_rate": 0.0001, + "loss": 5.52, + "loss/crossentropy": 2.436471462249756, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16264820843935013, + "step": 18506 + }, + { + "epoch": 0.578375, + "grad_norm": 3.140625, + "grad_norm_var": 0.015946451822916666, + "learning_rate": 0.0001, + "loss": 5.7601, + "loss/crossentropy": 2.595950961112976, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1718810498714447, + "step": 18508 + }, + { + "epoch": 0.5784375, + "grad_norm": 3.03125, + "grad_norm_var": 0.010758463541666667, + "learning_rate": 0.0001, + "loss": 5.4356, + "loss/crossentropy": 2.3870849609375, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16032517701387405, + "step": 18510 + }, + { + "epoch": 0.5785, + "grad_norm": 3.453125, + "grad_norm_var": 0.017887369791666666, + "learning_rate": 0.0001, + "loss": 5.7266, + "loss/crossentropy": 2.575134754180908, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16944736242294312, + "step": 18512 + }, + { + "epoch": 0.5785625, + "grad_norm": 2.75, + "grad_norm_var": 0.026423136393229168, + "learning_rate": 0.0001, + "loss": 5.4494, + "loss/crossentropy": 2.3964990377426147, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16036802530288696, + "step": 18514 + }, + { + "epoch": 0.578625, + "grad_norm": 3.171875, + "grad_norm_var": 0.026611328125, + "learning_rate": 0.0001, + "loss": 5.8836, + "loss/crossentropy": 2.674792528152466, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17361968755722046, + "step": 18516 + }, + { + "epoch": 0.5786875, + "grad_norm": 3.0, + "grad_norm_var": 0.025755818684895834, + "learning_rate": 0.0001, + "loss": 5.7156, + "loss/crossentropy": 2.582158923149109, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16646727174520493, + "step": 18518 + }, + { + "epoch": 0.57875, + "grad_norm": 2.953125, + "grad_norm_var": 0.026903279622395835, + "learning_rate": 0.0001, + "loss": 5.6374, + "loss/crossentropy": 2.5500084161758423, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16420669108629227, + "step": 18520 + }, + { + "epoch": 0.5788125, + "grad_norm": 3.234375, + "grad_norm_var": 0.025846354166666665, + "learning_rate": 0.0001, + "loss": 5.4583, + "loss/crossentropy": 2.3921940326690674, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1538735032081604, + "step": 18522 + }, + { + "epoch": 0.578875, + "grad_norm": 3.078125, + "grad_norm_var": 0.025712076822916666, + "learning_rate": 0.0001, + "loss": 5.6155, + "loss/crossentropy": 2.5623749494552612, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15687866508960724, + "step": 18524 + }, + { + "epoch": 0.5789375, + "grad_norm": 3.015625, + "grad_norm_var": 0.026106770833333334, + "learning_rate": 0.0001, + "loss": 5.2689, + "loss/crossentropy": 2.3232126235961914, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15316598117351532, + "step": 18526 + }, + { + "epoch": 0.579, + "grad_norm": 3.03125, + "grad_norm_var": 0.0162506103515625, + "learning_rate": 0.0001, + "loss": 5.6008, + "loss/crossentropy": 2.5733802318573, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16055817902088165, + "step": 18528 + }, + { + "epoch": 0.5790625, + "grad_norm": 3.171875, + "grad_norm_var": 0.00826416015625, + "learning_rate": 0.0001, + "loss": 5.7381, + "loss/crossentropy": 2.586323618888855, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16830172389745712, + "step": 18530 + }, + { + "epoch": 0.579125, + "grad_norm": 3.390625, + "grad_norm_var": 0.014774576822916666, + "learning_rate": 0.0001, + "loss": 5.6249, + "loss/crossentropy": 2.4533214569091797, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16754812002182007, + "step": 18532 + }, + { + "epoch": 0.5791875, + "grad_norm": 2.96875, + "grad_norm_var": 0.0162261962890625, + "learning_rate": 0.0001, + "loss": 5.7429, + "loss/crossentropy": 2.59567129611969, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16550730913877487, + "step": 18534 + }, + { + "epoch": 0.57925, + "grad_norm": 3.09375, + "grad_norm_var": 0.014860026041666667, + "learning_rate": 0.0001, + "loss": 5.5887, + "loss/crossentropy": 2.4598299264907837, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16561666131019592, + "step": 18536 + }, + { + "epoch": 0.5793125, + "grad_norm": 3.21875, + "grad_norm_var": 0.018050130208333334, + "learning_rate": 0.0001, + "loss": 6.103, + "loss/crossentropy": 2.82500159740448, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17818892747163773, + "step": 18538 + }, + { + "epoch": 0.579375, + "grad_norm": 3.34375, + "grad_norm_var": 0.6931477864583333, + "learning_rate": 0.0001, + "loss": 5.702, + "loss/crossentropy": 2.4253724813461304, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.16750602424144745, + "step": 18540 + }, + { + "epoch": 0.5794375, + "grad_norm": 2.90625, + "grad_norm_var": 0.6881256103515625, + "learning_rate": 0.0001, + "loss": 5.4788, + "loss/crossentropy": 2.4489500522613525, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15728431940078735, + "step": 18542 + }, + { + "epoch": 0.5795, + "grad_norm": 3.0, + "grad_norm_var": 0.6911855061848958, + "learning_rate": 0.0001, + "loss": 5.536, + "loss/crossentropy": 2.4353054761886597, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1647573560476303, + "step": 18544 + }, + { + "epoch": 0.5795625, + "grad_norm": 3.171875, + "grad_norm_var": 0.6865468343098958, + "learning_rate": 0.0001, + "loss": 5.6885, + "loss/crossentropy": 2.5016510486602783, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.16556347906589508, + "step": 18546 + }, + { + "epoch": 0.579625, + "grad_norm": 2.921875, + "grad_norm_var": 0.6917958577473958, + "learning_rate": 0.0001, + "loss": 5.8193, + "loss/crossentropy": 2.652044177055359, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1706322431564331, + "step": 18548 + }, + { + "epoch": 0.5796875, + "grad_norm": 3.1875, + "grad_norm_var": 0.6940500895182292, + "learning_rate": 0.0001, + "loss": 5.5408, + "loss/crossentropy": 2.447631359100342, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1585312932729721, + "step": 18550 + }, + { + "epoch": 0.57975, + "grad_norm": 3.4375, + "grad_norm_var": 0.6898427327473958, + "learning_rate": 0.0001, + "loss": 5.7557, + "loss/crossentropy": 2.596085786819458, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.169084370136261, + "step": 18552 + }, + { + "epoch": 0.5798125, + "grad_norm": 2.90625, + "grad_norm_var": 0.7029774983723959, + "learning_rate": 0.0001, + "loss": 5.7252, + "loss/crossentropy": 2.5617101192474365, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16557105630636215, + "step": 18554 + }, + { + "epoch": 0.579875, + "grad_norm": 3.0625, + "grad_norm_var": 0.03741861979166667, + "learning_rate": 0.0001, + "loss": 5.8683, + "loss/crossentropy": 2.6295604705810547, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17348121106624603, + "step": 18556 + }, + { + "epoch": 0.5799375, + "grad_norm": 3.140625, + "grad_norm_var": 0.03425191243489583, + "learning_rate": 0.0001, + "loss": 5.9799, + "loss/crossentropy": 2.7225042581558228, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1761271357536316, + "step": 18558 + }, + { + "epoch": 0.58, + "grad_norm": 3.375, + "grad_norm_var": 0.038304646809895836, + "learning_rate": 0.0001, + "loss": 5.7409, + "loss/crossentropy": 2.5768563747406006, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17070002853870392, + "step": 18560 + }, + { + "epoch": 0.5800625, + "grad_norm": 3.203125, + "grad_norm_var": 0.035741170247395836, + "learning_rate": 0.0001, + "loss": 5.5646, + "loss/crossentropy": 2.429896831512451, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16698598861694336, + "step": 18562 + }, + { + "epoch": 0.580125, + "grad_norm": 3.515625, + "grad_norm_var": 0.07639567057291667, + "learning_rate": 0.0001, + "loss": 5.6645, + "loss/crossentropy": 2.482678174972534, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1701330468058586, + "step": 18564 + }, + { + "epoch": 0.5801875, + "grad_norm": 3.265625, + "grad_norm_var": 0.07169596354166667, + "learning_rate": 0.0001, + "loss": 5.4908, + "loss/crossentropy": 2.3414900302886963, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.15985772758722305, + "step": 18566 + }, + { + "epoch": 0.58025, + "grad_norm": 3.625, + "grad_norm_var": 0.07626546223958333, + "learning_rate": 0.0001, + "loss": 5.7235, + "loss/crossentropy": 2.464827537536621, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17274148762226105, + "step": 18568 + }, + { + "epoch": 0.5803125, + "grad_norm": 3.359375, + "grad_norm_var": 0.06392313639322916, + "learning_rate": 0.0001, + "loss": 5.628, + "loss/crossentropy": 2.4404972791671753, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16640209406614304, + "step": 18570 + }, + { + "epoch": 0.580375, + "grad_norm": 3.515625, + "grad_norm_var": 0.0599609375, + "learning_rate": 0.0001, + "loss": 6.1257, + "loss/crossentropy": 2.8498064279556274, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1760292649269104, + "step": 18572 + }, + { + "epoch": 0.5804375, + "grad_norm": 3.046875, + "grad_norm_var": 0.0705078125, + "learning_rate": 0.0001, + "loss": 5.6626, + "loss/crossentropy": 2.5267767906188965, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16709468513727188, + "step": 18574 + }, + { + "epoch": 0.5805, + "grad_norm": 3.4375, + "grad_norm_var": 0.0618072509765625, + "learning_rate": 0.0001, + "loss": 5.7438, + "loss/crossentropy": 2.5630754232406616, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16533689945936203, + "step": 18576 + }, + { + "epoch": 0.5805625, + "grad_norm": 3.21875, + "grad_norm_var": 0.06129150390625, + "learning_rate": 0.0001, + "loss": 5.6433, + "loss/crossentropy": 2.4586691856384277, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16573145985603333, + "step": 18578 + }, + { + "epoch": 0.580625, + "grad_norm": 2.96875, + "grad_norm_var": 0.0387115478515625, + "learning_rate": 0.0001, + "loss": 5.5069, + "loss/crossentropy": 2.389492630958557, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16291730105876923, + "step": 18580 + }, + { + "epoch": 0.5806875, + "grad_norm": 3.15625, + "grad_norm_var": 0.03619791666666667, + "learning_rate": 0.0001, + "loss": 5.7239, + "loss/crossentropy": 2.5101183652877808, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1705961897969246, + "step": 18582 + }, + { + "epoch": 0.58075, + "grad_norm": 3.296875, + "grad_norm_var": 0.02613525390625, + "learning_rate": 0.0001, + "loss": 5.8409, + "loss/crossentropy": 2.562360644340515, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1778540313243866, + "step": 18584 + }, + { + "epoch": 0.5808125, + "grad_norm": 3.078125, + "grad_norm_var": 0.028173828125, + "learning_rate": 0.0001, + "loss": 5.6442, + "loss/crossentropy": 2.4837807416915894, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17033448815345764, + "step": 18586 + }, + { + "epoch": 0.580875, + "grad_norm": 3.328125, + "grad_norm_var": 0.0241119384765625, + "learning_rate": 0.0001, + "loss": 5.9073, + "loss/crossentropy": 2.6571329832077026, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17852745950222015, + "step": 18588 + }, + { + "epoch": 0.5809375, + "grad_norm": 3.359375, + "grad_norm_var": 0.020295206705729166, + "learning_rate": 0.0001, + "loss": 6.0707, + "loss/crossentropy": 2.778269648551941, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1796359047293663, + "step": 18590 + }, + { + "epoch": 0.581, + "grad_norm": 3.0, + "grad_norm_var": 0.023656209309895832, + "learning_rate": 0.0001, + "loss": 5.5964, + "loss/crossentropy": 2.5302563905715942, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15973998606204987, + "step": 18592 + }, + { + "epoch": 0.5810625, + "grad_norm": 3.15625, + "grad_norm_var": 0.023465983072916665, + "learning_rate": 0.0001, + "loss": 5.8044, + "loss/crossentropy": 2.656304359436035, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1667615845799446, + "step": 18594 + }, + { + "epoch": 0.581125, + "grad_norm": 3.78125, + "grad_norm_var": 0.036253865559895834, + "learning_rate": 0.0001, + "loss": 5.9787, + "loss/crossentropy": 2.6889857053756714, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.177021324634552, + "step": 18596 + }, + { + "epoch": 0.5811875, + "grad_norm": 3.046875, + "grad_norm_var": 0.039567057291666666, + "learning_rate": 0.0001, + "loss": 5.8973, + "loss/crossentropy": 2.671121835708618, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17379028350114822, + "step": 18598 + }, + { + "epoch": 0.58125, + "grad_norm": 3.03125, + "grad_norm_var": 0.0404205322265625, + "learning_rate": 0.0001, + "loss": 5.5099, + "loss/crossentropy": 2.3974192142486572, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16241814941167831, + "step": 18600 + }, + { + "epoch": 0.5813125, + "grad_norm": 3.1875, + "grad_norm_var": 0.03863525390625, + "learning_rate": 0.0001, + "loss": 5.924, + "loss/crossentropy": 2.6862300634384155, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1757260039448738, + "step": 18602 + }, + { + "epoch": 0.581375, + "grad_norm": 3.046875, + "grad_norm_var": 0.04350484212239583, + "learning_rate": 0.0001, + "loss": 5.5963, + "loss/crossentropy": 2.5258930921554565, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1582084447145462, + "step": 18604 + }, + { + "epoch": 0.5814375, + "grad_norm": 4.28125, + "grad_norm_var": 0.12067057291666666, + "learning_rate": 0.0001, + "loss": 5.7236, + "loss/crossentropy": 2.469976305961609, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17614026367664337, + "step": 18606 + }, + { + "epoch": 0.5815, + "grad_norm": 3.515625, + "grad_norm_var": 0.11647135416666667, + "learning_rate": 0.0001, + "loss": 5.7335, + "loss/crossentropy": 2.500939130783081, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1724713146686554, + "step": 18608 + }, + { + "epoch": 0.5815625, + "grad_norm": 3.015625, + "grad_norm_var": 0.12549540201822917, + "learning_rate": 0.0001, + "loss": 5.4021, + "loss/crossentropy": 2.464106321334839, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15122338384389877, + "step": 18610 + }, + { + "epoch": 0.581625, + "grad_norm": 4.09375, + "grad_norm_var": 0.1540191650390625, + "learning_rate": 0.0001, + "loss": 5.4409, + "loss/crossentropy": 2.3574678897857666, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1634194403886795, + "step": 18612 + }, + { + "epoch": 0.5816875, + "grad_norm": 3.125, + "grad_norm_var": 0.15129801432291667, + "learning_rate": 0.0001, + "loss": 5.9564, + "loss/crossentropy": 2.771583914756775, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17239146679639816, + "step": 18614 + }, + { + "epoch": 0.58175, + "grad_norm": 3.09375, + "grad_norm_var": 0.14811197916666666, + "learning_rate": 0.0001, + "loss": 5.5109, + "loss/crossentropy": 2.3945586681365967, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1647638976573944, + "step": 18616 + }, + { + "epoch": 0.5818125, + "grad_norm": 3.0625, + "grad_norm_var": 0.15637919108072917, + "learning_rate": 0.0001, + "loss": 5.7852, + "loss/crossentropy": 2.6670639514923096, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1668880581855774, + "step": 18618 + }, + { + "epoch": 0.581875, + "grad_norm": 2.984375, + "grad_norm_var": 0.14918619791666668, + "learning_rate": 0.0001, + "loss": 5.7533, + "loss/crossentropy": 2.5921837091445923, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16728005558252335, + "step": 18620 + }, + { + "epoch": 0.5819375, + "grad_norm": 3.203125, + "grad_norm_var": 0.0853515625, + "learning_rate": 0.0001, + "loss": 5.5894, + "loss/crossentropy": 2.445537567138672, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16594630479812622, + "step": 18622 + }, + { + "epoch": 0.582, + "grad_norm": 3.09375, + "grad_norm_var": 0.08701883951822917, + "learning_rate": 0.0001, + "loss": 5.8582, + "loss/crossentropy": 2.6308059692382812, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17000506818294525, + "step": 18624 + }, + { + "epoch": 0.5820625, + "grad_norm": 3.296875, + "grad_norm_var": 0.07776285807291666, + "learning_rate": 0.0001, + "loss": 6.0982, + "loss/crossentropy": 2.7778422832489014, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17969654500484467, + "step": 18626 + }, + { + "epoch": 0.582125, + "grad_norm": 3.109375, + "grad_norm_var": 0.03763020833333333, + "learning_rate": 0.0001, + "loss": 5.5696, + "loss/crossentropy": 2.4944108724594116, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16376770287752151, + "step": 18628 + }, + { + "epoch": 0.5821875, + "grad_norm": 3.09375, + "grad_norm_var": 0.0425933837890625, + "learning_rate": 0.0001, + "loss": 5.4778, + "loss/crossentropy": 2.459985852241516, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15568386018276215, + "step": 18630 + }, + { + "epoch": 0.58225, + "grad_norm": 3.390625, + "grad_norm_var": 0.0460601806640625, + "learning_rate": 0.0001, + "loss": 5.9489, + "loss/crossentropy": 2.643662929534912, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1785680204629898, + "step": 18632 + }, + { + "epoch": 0.5823125, + "grad_norm": 3.171875, + "grad_norm_var": 0.04049072265625, + "learning_rate": 0.0001, + "loss": 5.8371, + "loss/crossentropy": 2.6857060194015503, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1670950949192047, + "step": 18634 + }, + { + "epoch": 0.582375, + "grad_norm": 3.140625, + "grad_norm_var": 0.03424072265625, + "learning_rate": 0.0001, + "loss": 5.7002, + "loss/crossentropy": 2.5760804414749146, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16593117266893387, + "step": 18636 + }, + { + "epoch": 0.5824375, + "grad_norm": 3.375, + "grad_norm_var": 0.03526102701822917, + "learning_rate": 0.0001, + "loss": 5.7943, + "loss/crossentropy": 2.575034499168396, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1715373918414116, + "step": 18638 + }, + { + "epoch": 0.5825, + "grad_norm": 3.09375, + "grad_norm_var": 0.03228759765625, + "learning_rate": 0.0001, + "loss": 5.4874, + "loss/crossentropy": 2.4387048482894897, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16033842414617538, + "step": 18640 + }, + { + "epoch": 0.5825625, + "grad_norm": 3.078125, + "grad_norm_var": 0.027684529622395832, + "learning_rate": 0.0001, + "loss": 5.6826, + "loss/crossentropy": 2.5084009170532227, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.17210392653942108, + "step": 18642 + }, + { + "epoch": 0.582625, + "grad_norm": 3.0, + "grad_norm_var": 0.025861612955729165, + "learning_rate": 0.0001, + "loss": 5.8363, + "loss/crossentropy": 2.6931592226028442, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1670495942234993, + "step": 18644 + }, + { + "epoch": 0.5826875, + "grad_norm": 3.1875, + "grad_norm_var": 0.027469889322916666, + "learning_rate": 0.0001, + "loss": 5.3942, + "loss/crossentropy": 2.3630157709121704, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15467745065689087, + "step": 18646 + }, + { + "epoch": 0.58275, + "grad_norm": 3.5625, + "grad_norm_var": 0.033219401041666666, + "learning_rate": 0.0001, + "loss": 5.701, + "loss/crossentropy": 2.471379041671753, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17257267236709595, + "step": 18648 + }, + { + "epoch": 0.5828125, + "grad_norm": 2.890625, + "grad_norm_var": 0.03746744791666667, + "learning_rate": 0.0001, + "loss": 5.7769, + "loss/crossentropy": 2.5884225368499756, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16767475754022598, + "step": 18650 + }, + { + "epoch": 0.582875, + "grad_norm": 3.078125, + "grad_norm_var": 0.037581380208333334, + "learning_rate": 0.0001, + "loss": 5.5594, + "loss/crossentropy": 2.437865972518921, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16254276037216187, + "step": 18652 + }, + { + "epoch": 0.5829375, + "grad_norm": 2.96875, + "grad_norm_var": 0.026656087239583334, + "learning_rate": 0.0001, + "loss": 5.096, + "loss/crossentropy": 2.1688880920410156, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.14896288514137268, + "step": 18654 + }, + { + "epoch": 0.583, + "grad_norm": 3.21875, + "grad_norm_var": 0.028597005208333335, + "learning_rate": 0.0001, + "loss": 5.6605, + "loss/crossentropy": 2.4919804334640503, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17075835168361664, + "step": 18656 + }, + { + "epoch": 0.5830625, + "grad_norm": 3.03125, + "grad_norm_var": 0.028609212239583334, + "learning_rate": 0.0001, + "loss": 5.4257, + "loss/crossentropy": 2.4058111906051636, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15471865236759186, + "step": 18658 + }, + { + "epoch": 0.583125, + "grad_norm": 3.375, + "grad_norm_var": 0.28709208170572914, + "learning_rate": 0.0001, + "loss": 6.2958, + "loss/crossentropy": 2.9767050743103027, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1780048906803131, + "step": 18660 + }, + { + "epoch": 0.5831875, + "grad_norm": 3.0625, + "grad_norm_var": 0.2811431884765625, + "learning_rate": 0.0001, + "loss": 5.6223, + "loss/crossentropy": 2.5128010511398315, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1644660010933876, + "step": 18662 + }, + { + "epoch": 0.58325, + "grad_norm": 3.21875, + "grad_norm_var": 0.2728800455729167, + "learning_rate": 0.0001, + "loss": 5.7958, + "loss/crossentropy": 2.575974225997925, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1731526404619217, + "step": 18664 + }, + { + "epoch": 0.5833125, + "grad_norm": 3.296875, + "grad_norm_var": 0.2719065348307292, + "learning_rate": 0.0001, + "loss": 5.8324, + "loss/crossentropy": 2.730650544166565, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16603553295135498, + "step": 18666 + }, + { + "epoch": 0.583375, + "grad_norm": 3.359375, + "grad_norm_var": 0.2708821614583333, + "learning_rate": 0.0001, + "loss": 5.8089, + "loss/crossentropy": 2.5872223377227783, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17373275756835938, + "step": 18668 + }, + { + "epoch": 0.5834375, + "grad_norm": 3.515625, + "grad_norm_var": 0.2674794514973958, + "learning_rate": 0.0001, + "loss": 5.9221, + "loss/crossentropy": 2.6320900917053223, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17900218069553375, + "step": 18670 + }, + { + "epoch": 0.5835, + "grad_norm": 2.84375, + "grad_norm_var": 0.2742513020833333, + "learning_rate": 0.0001, + "loss": 5.5412, + "loss/crossentropy": 2.4675287008285522, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16361329704523087, + "step": 18672 + }, + { + "epoch": 0.5835625, + "grad_norm": 3.375, + "grad_norm_var": 0.32249247233072914, + "learning_rate": 0.0001, + "loss": 5.8521, + "loss/crossentropy": 2.5169920921325684, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18155356496572495, + "step": 18674 + }, + { + "epoch": 0.583625, + "grad_norm": 3.375, + "grad_norm_var": 0.10301005045572917, + "learning_rate": 0.0001, + "loss": 6.0169, + "loss/crossentropy": 2.6857441663742065, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1823333203792572, + "step": 18676 + }, + { + "epoch": 0.5836875, + "grad_norm": 2.96875, + "grad_norm_var": 0.11301676432291667, + "learning_rate": 0.0001, + "loss": 5.7099, + "loss/crossentropy": 2.6076717376708984, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16061626374721527, + "step": 18678 + }, + { + "epoch": 0.58375, + "grad_norm": 3.484375, + "grad_norm_var": 0.11795247395833333, + "learning_rate": 0.0001, + "loss": 5.7529, + "loss/crossentropy": 2.589252471923828, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.168318510055542, + "step": 18680 + }, + { + "epoch": 0.5838125, + "grad_norm": 3.5625, + "grad_norm_var": 0.11670633951822916, + "learning_rate": 0.0001, + "loss": 5.7674, + "loss/crossentropy": 2.5550804138183594, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17318449914455414, + "step": 18682 + }, + { + "epoch": 0.583875, + "grad_norm": 2.9375, + "grad_norm_var": 0.12859700520833334, + "learning_rate": 0.0001, + "loss": 5.8332, + "loss/crossentropy": 2.606408715248108, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17150401324033737, + "step": 18684 + }, + { + "epoch": 0.5839375, + "grad_norm": 3.078125, + "grad_norm_var": 0.12725321451822916, + "learning_rate": 0.0001, + "loss": 5.8097, + "loss/crossentropy": 2.6292680501937866, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1703866571187973, + "step": 18686 + }, + { + "epoch": 0.584, + "grad_norm": 3.09375, + "grad_norm_var": 0.12434488932291667, + "learning_rate": 0.0001, + "loss": 5.6515, + "loss/crossentropy": 2.5895968675613403, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1632196456193924, + "step": 18688 + }, + { + "epoch": 0.5840625, + "grad_norm": 3.53125, + "grad_norm_var": 0.06166890462239583, + "learning_rate": 0.0001, + "loss": 5.8398, + "loss/crossentropy": 2.5626882314682007, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17810668796300888, + "step": 18690 + }, + { + "epoch": 0.584125, + "grad_norm": 3.21875, + "grad_norm_var": 0.06065165201822917, + "learning_rate": 0.0001, + "loss": 5.407, + "loss/crossentropy": 2.3457969427108765, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15729539096355438, + "step": 18692 + }, + { + "epoch": 0.5841875, + "grad_norm": 3.140625, + "grad_norm_var": 0.049560546875, + "learning_rate": 0.0001, + "loss": 5.9334, + "loss/crossentropy": 2.7616426944732666, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17069139331579208, + "step": 18694 + }, + { + "epoch": 0.58425, + "grad_norm": 3.5625, + "grad_norm_var": 0.051813761393229164, + "learning_rate": 0.0001, + "loss": 5.8901, + "loss/crossentropy": 2.542791485786438, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18121101707220078, + "step": 18696 + }, + { + "epoch": 0.5843125, + "grad_norm": 3.125, + "grad_norm_var": 0.045182291666666666, + "learning_rate": 0.0001, + "loss": 5.6316, + "loss/crossentropy": 2.476907730102539, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16781611740589142, + "step": 18698 + }, + { + "epoch": 0.584375, + "grad_norm": 2.9375, + "grad_norm_var": 0.0344146728515625, + "learning_rate": 0.0001, + "loss": 5.6306, + "loss/crossentropy": 2.5584638118743896, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16268333792686462, + "step": 18700 + }, + { + "epoch": 0.5844375, + "grad_norm": 2.90625, + "grad_norm_var": 0.03758036295572917, + "learning_rate": 0.0001, + "loss": 5.6783, + "loss/crossentropy": 2.6262636184692383, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15989147126674652, + "step": 18702 + }, + { + "epoch": 0.5845, + "grad_norm": 4.40625, + "grad_norm_var": 0.12493082682291666, + "learning_rate": 0.0001, + "loss": 5.6156, + "loss/crossentropy": 2.4813411235809326, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16694270819425583, + "step": 18704 + }, + { + "epoch": 0.5845625, + "grad_norm": 3.3125, + "grad_norm_var": 0.1207427978515625, + "learning_rate": 0.0001, + "loss": 5.694, + "loss/crossentropy": 2.4842689037323, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17019431293010712, + "step": 18706 + }, + { + "epoch": 0.584625, + "grad_norm": 3.90625, + "grad_norm_var": 0.14440104166666667, + "learning_rate": 0.0001, + "loss": 5.9157, + "loss/crossentropy": 2.5731600522994995, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18308069556951523, + "step": 18708 + }, + { + "epoch": 0.5846875, + "grad_norm": 3.140625, + "grad_norm_var": 0.15906473795572917, + "learning_rate": 0.0001, + "loss": 5.6158, + "loss/crossentropy": 2.5464928150177, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16278600692749023, + "step": 18710 + }, + { + "epoch": 0.58475, + "grad_norm": 3.046875, + "grad_norm_var": 0.15641276041666666, + "learning_rate": 0.0001, + "loss": 5.8485, + "loss/crossentropy": 2.682703733444214, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16658055782318115, + "step": 18712 + }, + { + "epoch": 0.5848125, + "grad_norm": 2.890625, + "grad_norm_var": 0.16401265462239584, + "learning_rate": 0.0001, + "loss": 5.7586, + "loss/crossentropy": 2.5853028297424316, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1696712076663971, + "step": 18714 + }, + { + "epoch": 0.584875, + "grad_norm": 3.390625, + "grad_norm_var": 0.16638997395833333, + "learning_rate": 0.0001, + "loss": 5.644, + "loss/crossentropy": 2.5225616693496704, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16488178819417953, + "step": 18716 + }, + { + "epoch": 0.5849375, + "grad_norm": 3.4375, + "grad_norm_var": 0.16483968098958332, + "learning_rate": 0.0001, + "loss": 5.3215, + "loss/crossentropy": 2.2986279726028442, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15385031700134277, + "step": 18718 + }, + { + "epoch": 0.585, + "grad_norm": 3.296875, + "grad_norm_var": 0.0817535400390625, + "learning_rate": 0.0001, + "loss": 5.8051, + "loss/crossentropy": 2.5350691080093384, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17699886113405228, + "step": 18720 + }, + { + "epoch": 0.5850625, + "grad_norm": 3.671875, + "grad_norm_var": 0.09387919108072916, + "learning_rate": 0.0001, + "loss": 5.9967, + "loss/crossentropy": 2.71618390083313, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1780504211783409, + "step": 18722 + }, + { + "epoch": 0.585125, + "grad_norm": 2.9375, + "grad_norm_var": 0.06747945149739583, + "learning_rate": 0.0001, + "loss": 5.6457, + "loss/crossentropy": 2.52972674369812, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16354888677597046, + "step": 18724 + }, + { + "epoch": 0.5851875, + "grad_norm": 2.8125, + "grad_norm_var": 0.06922098795572916, + "learning_rate": 0.0001, + "loss": 5.4993, + "loss/crossentropy": 2.4842694997787476, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1600925326347351, + "step": 18726 + }, + { + "epoch": 0.58525, + "grad_norm": 3.140625, + "grad_norm_var": 0.067333984375, + "learning_rate": 0.0001, + "loss": 5.4522, + "loss/crossentropy": 2.377071976661682, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16024264693260193, + "step": 18728 + }, + { + "epoch": 0.5853125, + "grad_norm": 3.109375, + "grad_norm_var": 0.06289774576822917, + "learning_rate": 0.0001, + "loss": 5.5121, + "loss/crossentropy": 2.4099632501602173, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1574772521853447, + "step": 18730 + }, + { + "epoch": 0.585375, + "grad_norm": 3.546875, + "grad_norm_var": 0.064501953125, + "learning_rate": 0.0001, + "loss": 5.7326, + "loss/crossentropy": 2.510116219520569, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.16756585240364075, + "step": 18732 + }, + { + "epoch": 0.5854375, + "grad_norm": 3.125, + "grad_norm_var": 0.060074869791666666, + "learning_rate": 0.0001, + "loss": 5.6425, + "loss/crossentropy": 2.541126847267151, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16482357680797577, + "step": 18734 + }, + { + "epoch": 0.5855, + "grad_norm": 3.03125, + "grad_norm_var": 0.05244038899739583, + "learning_rate": 0.0001, + "loss": 5.8296, + "loss/crossentropy": 2.7355847358703613, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16252835094928741, + "step": 18736 + }, + { + "epoch": 0.5855625, + "grad_norm": 2.828125, + "grad_norm_var": 0.03176167805989583, + "learning_rate": 0.0001, + "loss": 5.3245, + "loss/crossentropy": 2.2850844860076904, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15589946508407593, + "step": 18738 + }, + { + "epoch": 0.585625, + "grad_norm": 3.25, + "grad_norm_var": 0.03371988932291667, + "learning_rate": 0.0001, + "loss": 5.8982, + "loss/crossentropy": 2.6071465015411377, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1783270686864853, + "step": 18740 + }, + { + "epoch": 0.5856875, + "grad_norm": 3.109375, + "grad_norm_var": 0.027327473958333334, + "learning_rate": 0.0001, + "loss": 5.7234, + "loss/crossentropy": 2.653437852859497, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.161686509847641, + "step": 18742 + }, + { + "epoch": 0.58575, + "grad_norm": 3.28125, + "grad_norm_var": 0.0290679931640625, + "learning_rate": 0.0001, + "loss": 5.8541, + "loss/crossentropy": 2.637198805809021, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17168515175580978, + "step": 18744 + }, + { + "epoch": 0.5858125, + "grad_norm": 3.28125, + "grad_norm_var": 0.032124837239583336, + "learning_rate": 0.0001, + "loss": 6.0422, + "loss/crossentropy": 2.6824125051498413, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1836380511522293, + "step": 18746 + }, + { + "epoch": 0.585875, + "grad_norm": 3.109375, + "grad_norm_var": 0.022196451822916668, + "learning_rate": 0.0001, + "loss": 5.7846, + "loss/crossentropy": 2.6274431943893433, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1653219535946846, + "step": 18748 + }, + { + "epoch": 0.5859375, + "grad_norm": 3.21875, + "grad_norm_var": 0.02320556640625, + "learning_rate": 0.0001, + "loss": 5.4851, + "loss/crossentropy": 2.4267383813858032, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15896280109882355, + "step": 18750 + }, + { + "epoch": 0.586, + "grad_norm": 3.09375, + "grad_norm_var": 0.020243326822916668, + "learning_rate": 0.0001, + "loss": 5.5005, + "loss/crossentropy": 2.467852473258972, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15912079811096191, + "step": 18752 + }, + { + "epoch": 0.5860625, + "grad_norm": 3.03125, + "grad_norm_var": 0.014574178059895833, + "learning_rate": 0.0001, + "loss": 5.7226, + "loss/crossentropy": 2.5836005210876465, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16702188551425934, + "step": 18754 + }, + { + "epoch": 0.586125, + "grad_norm": 2.8125, + "grad_norm_var": 0.020536295572916665, + "learning_rate": 0.0001, + "loss": 5.4636, + "loss/crossentropy": 2.427434802055359, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15986891835927963, + "step": 18756 + }, + { + "epoch": 0.5861875, + "grad_norm": 3.0, + "grad_norm_var": 0.026005045572916666, + "learning_rate": 0.0001, + "loss": 6.0944, + "loss/crossentropy": 2.8903247117996216, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17235716432332993, + "step": 18758 + }, + { + "epoch": 0.58625, + "grad_norm": 3.25, + "grad_norm_var": 0.030540974934895833, + "learning_rate": 0.0001, + "loss": 5.6706, + "loss/crossentropy": 2.4873905181884766, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16714636236429214, + "step": 18760 + }, + { + "epoch": 0.5863125, + "grad_norm": 3.484375, + "grad_norm_var": 0.0416015625, + "learning_rate": 0.0001, + "loss": 5.8718, + "loss/crossentropy": 2.551354169845581, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.174618661403656, + "step": 18762 + }, + { + "epoch": 0.586375, + "grad_norm": 3.0, + "grad_norm_var": 0.04360249837239583, + "learning_rate": 0.0001, + "loss": 5.7095, + "loss/crossentropy": 2.521346688270569, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.17506151646375656, + "step": 18764 + }, + { + "epoch": 0.5864375, + "grad_norm": 3.5625, + "grad_norm_var": 0.06917317708333333, + "learning_rate": 0.0001, + "loss": 5.7361, + "loss/crossentropy": 2.512497305870056, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17274749279022217, + "step": 18766 + }, + { + "epoch": 0.5865, + "grad_norm": 3.59375, + "grad_norm_var": 0.06972249348958333, + "learning_rate": 0.0001, + "loss": 5.7384, + "loss/crossentropy": 2.605650782585144, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16640214622020721, + "step": 18768 + }, + { + "epoch": 0.5865625, + "grad_norm": 3.125, + "grad_norm_var": 0.07118733723958333, + "learning_rate": 0.0001, + "loss": 5.721, + "loss/crossentropy": 2.5801135301589966, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1679932251572609, + "step": 18770 + }, + { + "epoch": 0.586625, + "grad_norm": 3.09375, + "grad_norm_var": 0.06012369791666667, + "learning_rate": 0.0001, + "loss": 5.7245, + "loss/crossentropy": 2.556220769882202, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17034516483545303, + "step": 18772 + }, + { + "epoch": 0.5866875, + "grad_norm": 3.40625, + "grad_norm_var": 0.0532867431640625, + "learning_rate": 0.0001, + "loss": 5.8482, + "loss/crossentropy": 2.669142723083496, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17142222821712494, + "step": 18774 + }, + { + "epoch": 0.58675, + "grad_norm": 3.296875, + "grad_norm_var": 0.0547515869140625, + "learning_rate": 0.0001, + "loss": 6.1194, + "loss/crossentropy": 2.789446711540222, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1806519329547882, + "step": 18776 + }, + { + "epoch": 0.5868125, + "grad_norm": 2.90625, + "grad_norm_var": 0.06288655598958333, + "learning_rate": 0.0001, + "loss": 5.6243, + "loss/crossentropy": 2.5601223707199097, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16032865643501282, + "step": 18778 + }, + { + "epoch": 0.586875, + "grad_norm": 3.125, + "grad_norm_var": 0.0595855712890625, + "learning_rate": 0.0001, + "loss": 5.6704, + "loss/crossentropy": 2.4958789348602295, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17057915776968002, + "step": 18780 + }, + { + "epoch": 0.5869375, + "grad_norm": 3.46875, + "grad_norm_var": 0.034398396809895836, + "learning_rate": 0.0001, + "loss": 5.7679, + "loss/crossentropy": 2.5094715356826782, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17584221065044403, + "step": 18782 + }, + { + "epoch": 0.587, + "grad_norm": 2.765625, + "grad_norm_var": 0.03583577473958333, + "learning_rate": 0.0001, + "loss": 5.5669, + "loss/crossentropy": 2.5259240865707397, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15839381515979767, + "step": 18784 + }, + { + "epoch": 0.5870625, + "grad_norm": 3.171875, + "grad_norm_var": 0.03515218098958333, + "learning_rate": 0.0001, + "loss": 5.6445, + "loss/crossentropy": 2.4920928478240967, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.17149509489536285, + "step": 18786 + }, + { + "epoch": 0.587125, + "grad_norm": 3.34375, + "grad_norm_var": 0.0355377197265625, + "learning_rate": 0.0001, + "loss": 5.4765, + "loss/crossentropy": 2.3797152042388916, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16007137298583984, + "step": 18788 + }, + { + "epoch": 0.5871875, + "grad_norm": 3.078125, + "grad_norm_var": 0.032421875, + "learning_rate": 0.0001, + "loss": 5.7126, + "loss/crossentropy": 2.66006600856781, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16189232468605042, + "step": 18790 + }, + { + "epoch": 0.58725, + "grad_norm": 3.4375, + "grad_norm_var": 0.03732096354166667, + "learning_rate": 0.0001, + "loss": 5.463, + "loss/crossentropy": 2.3955132961273193, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1551876962184906, + "step": 18792 + }, + { + "epoch": 0.5873125, + "grad_norm": 3.40625, + "grad_norm_var": 0.034012858072916666, + "learning_rate": 0.0001, + "loss": 5.5399, + "loss/crossentropy": 2.406830072402954, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1652606800198555, + "step": 18794 + }, + { + "epoch": 0.587375, + "grad_norm": 3.125, + "grad_norm_var": 0.03414306640625, + "learning_rate": 0.0001, + "loss": 5.7499, + "loss/crossentropy": 2.6117184162139893, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16928624361753464, + "step": 18796 + }, + { + "epoch": 0.5874375, + "grad_norm": 2.984375, + "grad_norm_var": 0.031916300455729164, + "learning_rate": 0.0001, + "loss": 5.6044, + "loss/crossentropy": 2.5122451782226562, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16272682696580887, + "step": 18798 + }, + { + "epoch": 0.5875, + "grad_norm": 3.46875, + "grad_norm_var": 0.0217926025390625, + "learning_rate": 0.0001, + "loss": 5.8524, + "loss/crossentropy": 2.638648271560669, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16942624002695084, + "step": 18800 + }, + { + "epoch": 0.5875625, + "grad_norm": 3.15625, + "grad_norm_var": 0.03274739583333333, + "learning_rate": 0.0001, + "loss": 5.7158, + "loss/crossentropy": 2.6581921577453613, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.15498338639736176, + "step": 18802 + }, + { + "epoch": 0.587625, + "grad_norm": 3.125, + "grad_norm_var": 0.031346638997395836, + "learning_rate": 0.0001, + "loss": 5.8972, + "loss/crossentropy": 2.6991318464279175, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17254384607076645, + "step": 18804 + }, + { + "epoch": 0.5876875, + "grad_norm": 3.0, + "grad_norm_var": 0.03234049479166667, + "learning_rate": 0.0001, + "loss": 5.5506, + "loss/crossentropy": 2.4636647701263428, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1618233397603035, + "step": 18806 + }, + { + "epoch": 0.58775, + "grad_norm": 3.09375, + "grad_norm_var": 0.026805623372395834, + "learning_rate": 0.0001, + "loss": 5.8945, + "loss/crossentropy": 2.715206265449524, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1690976768732071, + "step": 18808 + }, + { + "epoch": 0.5878125, + "grad_norm": 3.109375, + "grad_norm_var": 0.020849609375, + "learning_rate": 0.0001, + "loss": 5.7979, + "loss/crossentropy": 2.602890729904175, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17145641148090363, + "step": 18810 + }, + { + "epoch": 0.587875, + "grad_norm": 2.984375, + "grad_norm_var": 0.020685831705729168, + "learning_rate": 0.0001, + "loss": 5.3204, + "loss/crossentropy": 2.3325252532958984, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15621185302734375, + "step": 18812 + }, + { + "epoch": 0.5879375, + "grad_norm": 3.078125, + "grad_norm_var": 0.020099894205729166, + "learning_rate": 0.0001, + "loss": 5.7891, + "loss/crossentropy": 2.62211012840271, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17021489888429642, + "step": 18814 + }, + { + "epoch": 0.588, + "grad_norm": 3.265625, + "grad_norm_var": 0.011311848958333334, + "learning_rate": 0.0001, + "loss": 6.0024, + "loss/crossentropy": 2.6743029356002808, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.18320034444332123, + "step": 18816 + }, + { + "epoch": 0.5880625, + "grad_norm": 3.21875, + "grad_norm_var": 0.006884765625, + "learning_rate": 0.0001, + "loss": 5.7251, + "loss/crossentropy": 2.502592086791992, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1734180748462677, + "step": 18818 + }, + { + "epoch": 0.588125, + "grad_norm": 3.078125, + "grad_norm_var": 0.006966145833333334, + "learning_rate": 0.0001, + "loss": 5.7168, + "loss/crossentropy": 2.5643508434295654, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16953930258750916, + "step": 18820 + }, + { + "epoch": 0.5881875, + "grad_norm": 3.03125, + "grad_norm_var": 0.0085845947265625, + "learning_rate": 0.0001, + "loss": 5.652, + "loss/crossentropy": 2.526243805885315, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.15906336903572083, + "step": 18822 + }, + { + "epoch": 0.58825, + "grad_norm": 3.03125, + "grad_norm_var": 0.010138956705729167, + "learning_rate": 0.0001, + "loss": 5.8207, + "loss/crossentropy": 2.62376070022583, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1716514155268669, + "step": 18824 + }, + { + "epoch": 0.5883125, + "grad_norm": 3.09375, + "grad_norm_var": 0.010184733072916667, + "learning_rate": 0.0001, + "loss": 5.4301, + "loss/crossentropy": 2.3604893684387207, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16438043117523193, + "step": 18826 + }, + { + "epoch": 0.588375, + "grad_norm": 3.046875, + "grad_norm_var": 0.009956868489583333, + "learning_rate": 0.0001, + "loss": 5.6351, + "loss/crossentropy": 2.5153839588165283, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16353678703308105, + "step": 18828 + }, + { + "epoch": 0.5884375, + "grad_norm": 3.34375, + "grad_norm_var": 0.01256103515625, + "learning_rate": 0.0001, + "loss": 5.6013, + "loss/crossentropy": 2.491378426551819, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16138406842947006, + "step": 18830 + }, + { + "epoch": 0.5885, + "grad_norm": 3.265625, + "grad_norm_var": 0.013700358072916667, + "learning_rate": 0.0001, + "loss": 5.7877, + "loss/crossentropy": 2.5900150537490845, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17054571956396103, + "step": 18832 + }, + { + "epoch": 0.5885625, + "grad_norm": 2.875, + "grad_norm_var": 0.8783487955729167, + "learning_rate": 0.0001, + "loss": 5.6009, + "loss/crossentropy": 2.3893556594848633, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1676376461982727, + "step": 18834 + }, + { + "epoch": 0.588625, + "grad_norm": 3.453125, + "grad_norm_var": 0.8694620768229167, + "learning_rate": 0.0001, + "loss": 5.7512, + "loss/crossentropy": 2.4890904426574707, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17308519035577774, + "step": 18836 + }, + { + "epoch": 0.5886875, + "grad_norm": 3.125, + "grad_norm_var": 0.8741495768229167, + "learning_rate": 0.0001, + "loss": 5.4162, + "loss/crossentropy": 2.4170466661453247, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15929004549980164, + "step": 18838 + }, + { + "epoch": 0.58875, + "grad_norm": 3.484375, + "grad_norm_var": 0.8793253580729167, + "learning_rate": 0.0001, + "loss": 5.7454, + "loss/crossentropy": 2.6104096174240112, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16505949199199677, + "step": 18840 + }, + { + "epoch": 0.5888125, + "grad_norm": 2.984375, + "grad_norm_var": 0.8839752197265625, + "learning_rate": 0.0001, + "loss": 5.6596, + "loss/crossentropy": 2.524080991744995, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16433259844779968, + "step": 18842 + }, + { + "epoch": 0.588875, + "grad_norm": 3.21875, + "grad_norm_var": 0.8858357747395833, + "learning_rate": 0.0001, + "loss": 5.4786, + "loss/crossentropy": 2.4198418855667114, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15704571455717087, + "step": 18844 + }, + { + "epoch": 0.5889375, + "grad_norm": 3.125, + "grad_norm_var": 0.887255859375, + "learning_rate": 0.0001, + "loss": 5.6893, + "loss/crossentropy": 2.46872341632843, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17049581557512283, + "step": 18846 + }, + { + "epoch": 0.589, + "grad_norm": 3.09375, + "grad_norm_var": 0.8957427978515625, + "learning_rate": 0.0001, + "loss": 5.5962, + "loss/crossentropy": 2.497233748435974, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16224434226751328, + "step": 18848 + }, + { + "epoch": 0.5890625, + "grad_norm": 2.890625, + "grad_norm_var": 0.03479817708333333, + "learning_rate": 0.0001, + "loss": 5.5171, + "loss/crossentropy": 2.4555243253707886, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15889254212379456, + "step": 18850 + }, + { + "epoch": 0.589125, + "grad_norm": 2.859375, + "grad_norm_var": 0.032389322916666664, + "learning_rate": 0.0001, + "loss": 5.3986, + "loss/crossentropy": 2.3385857343673706, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15951615571975708, + "step": 18852 + }, + { + "epoch": 0.5891875, + "grad_norm": 3.234375, + "grad_norm_var": 0.03331705729166667, + "learning_rate": 0.0001, + "loss": 5.5947, + "loss/crossentropy": 2.479830026626587, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16656863689422607, + "step": 18854 + }, + { + "epoch": 0.58925, + "grad_norm": 3.1875, + "grad_norm_var": 0.02193603515625, + "learning_rate": 0.0001, + "loss": 5.4323, + "loss/crossentropy": 2.368161678314209, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1626622974872589, + "step": 18856 + }, + { + "epoch": 0.5893125, + "grad_norm": 3.015625, + "grad_norm_var": 0.0214752197265625, + "learning_rate": 0.0001, + "loss": 6.0137, + "loss/crossentropy": 2.783053994178772, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17267384380102158, + "step": 18858 + }, + { + "epoch": 0.589375, + "grad_norm": 3.09375, + "grad_norm_var": 0.0397613525390625, + "learning_rate": 0.0001, + "loss": 5.5611, + "loss/crossentropy": 2.2689337730407715, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1745278239250183, + "step": 18860 + }, + { + "epoch": 0.5894375, + "grad_norm": 3.234375, + "grad_norm_var": 0.0404937744140625, + "learning_rate": 0.0001, + "loss": 5.7192, + "loss/crossentropy": 2.5575358867645264, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16812192648649216, + "step": 18862 + }, + { + "epoch": 0.5895, + "grad_norm": 3.015625, + "grad_norm_var": 0.03795166015625, + "learning_rate": 0.0001, + "loss": 5.5239, + "loss/crossentropy": 2.498926877975464, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15718135237693787, + "step": 18864 + }, + { + "epoch": 0.5895625, + "grad_norm": 3.09375, + "grad_norm_var": 0.0373931884765625, + "learning_rate": 0.0001, + "loss": 6.078, + "loss/crossentropy": 2.8394166231155396, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1722913682460785, + "step": 18866 + }, + { + "epoch": 0.589625, + "grad_norm": 3.015625, + "grad_norm_var": 0.031689453125, + "learning_rate": 0.0001, + "loss": 5.9251, + "loss/crossentropy": 2.6486889123916626, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17803260684013367, + "step": 18868 + }, + { + "epoch": 0.5896875, + "grad_norm": 3.296875, + "grad_norm_var": 0.03420817057291667, + "learning_rate": 0.0001, + "loss": 5.2808, + "loss/crossentropy": 2.2919921875, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.15786871314048767, + "step": 18870 + }, + { + "epoch": 0.58975, + "grad_norm": 3.171875, + "grad_norm_var": 0.03389383951822917, + "learning_rate": 0.0001, + "loss": 5.8613, + "loss/crossentropy": 2.656334161758423, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1720557063817978, + "step": 18872 + }, + { + "epoch": 0.5898125, + "grad_norm": 3.296875, + "grad_norm_var": 0.041337076822916666, + "learning_rate": 0.0001, + "loss": 5.8043, + "loss/crossentropy": 2.6470354795455933, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.17080049961805344, + "step": 18874 + }, + { + "epoch": 0.589875, + "grad_norm": 2.890625, + "grad_norm_var": 0.0343902587890625, + "learning_rate": 0.0001, + "loss": 5.4058, + "loss/crossentropy": 2.4188259840011597, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15299608558416367, + "step": 18876 + }, + { + "epoch": 0.5899375, + "grad_norm": 3.046875, + "grad_norm_var": 0.033381144205729164, + "learning_rate": 0.0001, + "loss": 5.9038, + "loss/crossentropy": 2.736717104911804, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17100075632333755, + "step": 18878 + }, + { + "epoch": 0.59, + "grad_norm": 3.078125, + "grad_norm_var": 0.03313395182291667, + "learning_rate": 0.0001, + "loss": 5.8452, + "loss/crossentropy": 2.595299482345581, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.18007101863622665, + "step": 18880 + }, + { + "epoch": 0.5900625, + "grad_norm": 3.203125, + "grad_norm_var": 0.029051717122395834, + "learning_rate": 0.0001, + "loss": 6.0323, + "loss/crossentropy": 2.7330769300460815, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.183043472468853, + "step": 18882 + }, + { + "epoch": 0.590125, + "grad_norm": 3.390625, + "grad_norm_var": 0.03362223307291667, + "learning_rate": 0.0001, + "loss": 5.6244, + "loss/crossentropy": 2.4341530799865723, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1674579456448555, + "step": 18884 + }, + { + "epoch": 0.5901875, + "grad_norm": 3.171875, + "grad_norm_var": 0.031571451822916666, + "learning_rate": 0.0001, + "loss": 5.6893, + "loss/crossentropy": 2.525902032852173, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16634322702884674, + "step": 18886 + }, + { + "epoch": 0.59025, + "grad_norm": 3.21875, + "grad_norm_var": 0.03188374837239583, + "learning_rate": 0.0001, + "loss": 5.5344, + "loss/crossentropy": 2.4152660369873047, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16426139324903488, + "step": 18888 + }, + { + "epoch": 0.5903125, + "grad_norm": 3.1875, + "grad_norm_var": 0.023786417643229165, + "learning_rate": 0.0001, + "loss": 5.8936, + "loss/crossentropy": 2.721444606781006, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17073000222444534, + "step": 18890 + }, + { + "epoch": 0.590375, + "grad_norm": 3.1875, + "grad_norm_var": 0.014598592122395834, + "learning_rate": 0.0001, + "loss": 5.7217, + "loss/crossentropy": 2.588342785835266, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1621607542037964, + "step": 18892 + }, + { + "epoch": 0.5904375, + "grad_norm": 3.28125, + "grad_norm_var": 0.015478515625, + "learning_rate": 0.0001, + "loss": 5.6263, + "loss/crossentropy": 2.4897148609161377, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16639653593301773, + "step": 18894 + }, + { + "epoch": 0.5905, + "grad_norm": 3.390625, + "grad_norm_var": 0.019254557291666665, + "learning_rate": 0.0001, + "loss": 5.8017, + "loss/crossentropy": 2.6354910135269165, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17013906687498093, + "step": 18896 + }, + { + "epoch": 0.5905625, + "grad_norm": 2.890625, + "grad_norm_var": 0.026167805989583334, + "learning_rate": 0.0001, + "loss": 5.6968, + "loss/crossentropy": 2.6278244256973267, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16275885701179504, + "step": 18898 + }, + { + "epoch": 0.590625, + "grad_norm": 3.34375, + "grad_norm_var": 0.028304036458333334, + "learning_rate": 0.0001, + "loss": 5.5764, + "loss/crossentropy": 2.411203622817993, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16690727323293686, + "step": 18900 + }, + { + "epoch": 0.5906875, + "grad_norm": 3.203125, + "grad_norm_var": 0.026395670572916665, + "learning_rate": 0.0001, + "loss": 5.5561, + "loss/crossentropy": 2.3805559873580933, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1671639084815979, + "step": 18902 + }, + { + "epoch": 0.59075, + "grad_norm": 2.984375, + "grad_norm_var": 0.034765625, + "learning_rate": 0.0001, + "loss": 5.9054, + "loss/crossentropy": 2.646710991859436, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17391187697649002, + "step": 18904 + }, + { + "epoch": 0.5908125, + "grad_norm": 3.515625, + "grad_norm_var": 0.0384674072265625, + "learning_rate": 0.0001, + "loss": 5.7693, + "loss/crossentropy": 2.5989474058151245, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16742993146181107, + "step": 18906 + }, + { + "epoch": 0.590875, + "grad_norm": 3.453125, + "grad_norm_var": 0.042215983072916664, + "learning_rate": 0.0001, + "loss": 5.6919, + "loss/crossentropy": 2.455661177635193, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17167077958583832, + "step": 18908 + }, + { + "epoch": 0.5909375, + "grad_norm": 3.171875, + "grad_norm_var": 0.0461822509765625, + "learning_rate": 0.0001, + "loss": 5.801, + "loss/crossentropy": 2.648666024208069, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1683606579899788, + "step": 18910 + }, + { + "epoch": 0.591, + "grad_norm": 3.578125, + "grad_norm_var": 0.05367431640625, + "learning_rate": 0.0001, + "loss": 5.9001, + "loss/crossentropy": 2.6702526807785034, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17415423691272736, + "step": 18912 + }, + { + "epoch": 0.5910625, + "grad_norm": 2.921875, + "grad_norm_var": 0.0565093994140625, + "learning_rate": 0.0001, + "loss": 5.4425, + "loss/crossentropy": 2.3983428478240967, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1606631726026535, + "step": 18914 + }, + { + "epoch": 0.591125, + "grad_norm": 3.140625, + "grad_norm_var": 0.0539947509765625, + "learning_rate": 0.0001, + "loss": 5.8541, + "loss/crossentropy": 2.685070753097534, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17119833827018738, + "step": 18916 + }, + { + "epoch": 0.5911875, + "grad_norm": 2.96875, + "grad_norm_var": 0.06687825520833333, + "learning_rate": 0.0001, + "loss": 5.3403, + "loss/crossentropy": 2.359796643257141, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15000443160533905, + "step": 18918 + }, + { + "epoch": 0.59125, + "grad_norm": 4.15625, + "grad_norm_var": 0.11724344889322917, + "learning_rate": 0.0001, + "loss": 5.5051, + "loss/crossentropy": 2.360744833946228, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16404935717582703, + "step": 18920 + }, + { + "epoch": 0.5913125, + "grad_norm": 2.96875, + "grad_norm_var": 0.11398111979166667, + "learning_rate": 0.0001, + "loss": 5.4941, + "loss/crossentropy": 2.394318461418152, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16114606708288193, + "step": 18922 + }, + { + "epoch": 0.591375, + "grad_norm": 3.390625, + "grad_norm_var": 0.10827534993489583, + "learning_rate": 0.0001, + "loss": 5.4535, + "loss/crossentropy": 2.3116272687911987, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16340911388397217, + "step": 18924 + }, + { + "epoch": 0.5914375, + "grad_norm": 3.1875, + "grad_norm_var": 0.10829671223958333, + "learning_rate": 0.0001, + "loss": 5.7031, + "loss/crossentropy": 2.5502008199691772, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16645871102809906, + "step": 18926 + }, + { + "epoch": 0.5915, + "grad_norm": 3.390625, + "grad_norm_var": 0.09758707682291666, + "learning_rate": 0.0001, + "loss": 5.8017, + "loss/crossentropy": 2.6481685638427734, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16378618776798248, + "step": 18928 + }, + { + "epoch": 0.5915625, + "grad_norm": 3.109375, + "grad_norm_var": 0.09195048014322917, + "learning_rate": 0.0001, + "loss": 5.7804, + "loss/crossentropy": 2.582486391067505, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1725265383720398, + "step": 18930 + }, + { + "epoch": 0.591625, + "grad_norm": 3.640625, + "grad_norm_var": 0.10593159993489583, + "learning_rate": 0.0001, + "loss": 6.0065, + "loss/crossentropy": 2.7718145847320557, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17737869918346405, + "step": 18932 + }, + { + "epoch": 0.5916875, + "grad_norm": 3.0, + "grad_norm_var": 0.0904449462890625, + "learning_rate": 0.0001, + "loss": 5.7479, + "loss/crossentropy": 2.580349564552307, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16714490205049515, + "step": 18934 + }, + { + "epoch": 0.59175, + "grad_norm": 3.21875, + "grad_norm_var": 0.03564453125, + "learning_rate": 0.0001, + "loss": 5.8126, + "loss/crossentropy": 2.6517690420150757, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1707712784409523, + "step": 18936 + }, + { + "epoch": 0.5918125, + "grad_norm": 2.96875, + "grad_norm_var": 0.037007649739583336, + "learning_rate": 0.0001, + "loss": 5.6382, + "loss/crossentropy": 2.518024206161499, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1686599999666214, + "step": 18938 + }, + { + "epoch": 0.591875, + "grad_norm": 2.796875, + "grad_norm_var": 0.04729410807291667, + "learning_rate": 0.0001, + "loss": 5.2716, + "loss/crossentropy": 2.290798306465149, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15549713373184204, + "step": 18940 + }, + { + "epoch": 0.5919375, + "grad_norm": 2.859375, + "grad_norm_var": 0.05088602701822917, + "learning_rate": 0.0001, + "loss": 5.4848, + "loss/crossentropy": 2.317240595817566, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17105378210544586, + "step": 18942 + }, + { + "epoch": 0.592, + "grad_norm": 2.9375, + "grad_norm_var": 0.0511871337890625, + "learning_rate": 0.0001, + "loss": 5.3886, + "loss/crossentropy": 2.420640707015991, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15460366010665894, + "step": 18944 + }, + { + "epoch": 0.5920625, + "grad_norm": 2.984375, + "grad_norm_var": 0.05243733723958333, + "learning_rate": 0.0001, + "loss": 5.4536, + "loss/crossentropy": 2.412165403366089, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1541440710425377, + "step": 18946 + }, + { + "epoch": 0.592125, + "grad_norm": 3.25, + "grad_norm_var": 0.03235270182291667, + "learning_rate": 0.0001, + "loss": 5.3565, + "loss/crossentropy": 2.2599799633026123, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1612170711159706, + "step": 18948 + }, + { + "epoch": 0.5921875, + "grad_norm": 3.28125, + "grad_norm_var": 0.031819661458333336, + "learning_rate": 0.0001, + "loss": 6.1042, + "loss/crossentropy": 2.824833631515503, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17715388536453247, + "step": 18950 + }, + { + "epoch": 0.59225, + "grad_norm": 3.015625, + "grad_norm_var": 0.03179931640625, + "learning_rate": 0.0001, + "loss": 5.7712, + "loss/crossentropy": 2.6190898418426514, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1683368757367134, + "step": 18952 + }, + { + "epoch": 0.5923125, + "grad_norm": 3.296875, + "grad_norm_var": 0.03291015625, + "learning_rate": 0.0001, + "loss": 5.6267, + "loss/crossentropy": 2.531549572944641, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1606869176030159, + "step": 18954 + }, + { + "epoch": 0.592375, + "grad_norm": 3.0625, + "grad_norm_var": 0.027106730143229167, + "learning_rate": 0.0001, + "loss": 5.6599, + "loss/crossentropy": 2.516823410987854, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16587327420711517, + "step": 18956 + }, + { + "epoch": 0.5924375, + "grad_norm": 3.1875, + "grad_norm_var": 0.021708170572916668, + "learning_rate": 0.0001, + "loss": 5.5947, + "loss/crossentropy": 2.440278649330139, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16856301575899124, + "step": 18958 + }, + { + "epoch": 0.5925, + "grad_norm": 3.671875, + "grad_norm_var": 0.042378743489583336, + "learning_rate": 0.0001, + "loss": 5.9483, + "loss/crossentropy": 2.732330799102783, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17277376353740692, + "step": 18960 + }, + { + "epoch": 0.5925625, + "grad_norm": 3.0, + "grad_norm_var": 0.04168294270833333, + "learning_rate": 0.0001, + "loss": 5.5481, + "loss/crossentropy": 2.4707993268966675, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16241498291492462, + "step": 18962 + }, + { + "epoch": 0.592625, + "grad_norm": 2.9375, + "grad_norm_var": 0.0461334228515625, + "learning_rate": 0.0001, + "loss": 6.0532, + "loss/crossentropy": 2.7712173461914062, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17585937678813934, + "step": 18964 + }, + { + "epoch": 0.5926875, + "grad_norm": 3.125, + "grad_norm_var": 0.04332275390625, + "learning_rate": 0.0001, + "loss": 5.7558, + "loss/crossentropy": 2.6582868099212646, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16482648998498917, + "step": 18966 + }, + { + "epoch": 0.59275, + "grad_norm": 3.703125, + "grad_norm_var": 0.0842681884765625, + "learning_rate": 0.0001, + "loss": 5.9919, + "loss/crossentropy": 2.6734771728515625, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17637837678194046, + "step": 18968 + }, + { + "epoch": 0.5928125, + "grad_norm": 3.53125, + "grad_norm_var": 0.08127848307291667, + "learning_rate": 0.0001, + "loss": 5.8368, + "loss/crossentropy": 2.626417398452759, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1698618158698082, + "step": 18970 + }, + { + "epoch": 0.592875, + "grad_norm": 3.28125, + "grad_norm_var": 0.0812896728515625, + "learning_rate": 0.0001, + "loss": 6.1239, + "loss/crossentropy": 2.816824197769165, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17797277867794037, + "step": 18972 + }, + { + "epoch": 0.5929375, + "grad_norm": 3.078125, + "grad_norm_var": 0.09773763020833333, + "learning_rate": 0.0001, + "loss": 5.3799, + "loss/crossentropy": 2.425009846687317, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15252325683832169, + "step": 18974 + }, + { + "epoch": 0.593, + "grad_norm": 3.125, + "grad_norm_var": 0.08258463541666666, + "learning_rate": 0.0001, + "loss": 5.6301, + "loss/crossentropy": 2.5467501878738403, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16106735169887543, + "step": 18976 + }, + { + "epoch": 0.5930625, + "grad_norm": 3.0625, + "grad_norm_var": 0.07834879557291667, + "learning_rate": 0.0001, + "loss": 5.7481, + "loss/crossentropy": 2.5487849712371826, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1714925691485405, + "step": 18978 + }, + { + "epoch": 0.593125, + "grad_norm": 3.390625, + "grad_norm_var": 0.07740478515625, + "learning_rate": 0.0001, + "loss": 5.8858, + "loss/crossentropy": 2.6531916856765747, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17287417501211166, + "step": 18980 + }, + { + "epoch": 0.5931875, + "grad_norm": 3.15625, + "grad_norm_var": 0.07591145833333333, + "learning_rate": 0.0001, + "loss": 6.0652, + "loss/crossentropy": 2.7632105350494385, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17941667139530182, + "step": 18982 + }, + { + "epoch": 0.59325, + "grad_norm": 3.0625, + "grad_norm_var": 0.04586181640625, + "learning_rate": 0.0001, + "loss": 5.4704, + "loss/crossentropy": 2.4141980409622192, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16108528524637222, + "step": 18984 + }, + { + "epoch": 0.5933125, + "grad_norm": 2.875, + "grad_norm_var": 0.03755594889322917, + "learning_rate": 0.0001, + "loss": 5.5901, + "loss/crossentropy": 2.530953526496887, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16333574801683426, + "step": 18986 + }, + { + "epoch": 0.593375, + "grad_norm": 3.234375, + "grad_norm_var": 0.03426005045572917, + "learning_rate": 0.0001, + "loss": 6.0152, + "loss/crossentropy": 2.814469814300537, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17084968835115433, + "step": 18988 + }, + { + "epoch": 0.5934375, + "grad_norm": 3.109375, + "grad_norm_var": 0.027620442708333335, + "learning_rate": 0.0001, + "loss": 5.6947, + "loss/crossentropy": 2.5568608045578003, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16729601472616196, + "step": 18990 + }, + { + "epoch": 0.5935, + "grad_norm": 3.203125, + "grad_norm_var": 0.027057902018229166, + "learning_rate": 0.0001, + "loss": 5.7036, + "loss/crossentropy": 2.5921266078948975, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1654420644044876, + "step": 18992 + }, + { + "epoch": 0.5935625, + "grad_norm": 2.890625, + "grad_norm_var": 0.028563435872395834, + "learning_rate": 0.0001, + "loss": 5.4692, + "loss/crossentropy": 2.388649821281433, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1635209619998932, + "step": 18994 + }, + { + "epoch": 0.593625, + "grad_norm": 3.265625, + "grad_norm_var": 0.04140625, + "learning_rate": 0.0001, + "loss": 5.9893, + "loss/crossentropy": 2.656255602836609, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18291820585727692, + "step": 18996 + }, + { + "epoch": 0.5936875, + "grad_norm": 3.3125, + "grad_norm_var": 0.04641520182291667, + "learning_rate": 0.0001, + "loss": 5.6909, + "loss/crossentropy": 2.530544877052307, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16447626799345016, + "step": 18998 + }, + { + "epoch": 0.59375, + "grad_norm": 3.0625, + "grad_norm_var": 0.045572916666666664, + "learning_rate": 0.0001, + "loss": 5.7836, + "loss/crossentropy": 2.607883930206299, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16717688739299774, + "step": 19000 + }, + { + "epoch": 0.5938125, + "grad_norm": 3.171875, + "grad_norm_var": 0.04083658854166667, + "learning_rate": 0.0001, + "loss": 5.7664, + "loss/crossentropy": 2.6362565755844116, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1641901507973671, + "step": 19002 + }, + { + "epoch": 0.593875, + "grad_norm": 2.953125, + "grad_norm_var": 0.03870035807291667, + "learning_rate": 0.0001, + "loss": 5.5183, + "loss/crossentropy": 2.4436482191085815, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16293271631002426, + "step": 19004 + }, + { + "epoch": 0.5939375, + "grad_norm": 3.34375, + "grad_norm_var": 0.04023030598958333, + "learning_rate": 0.0001, + "loss": 5.9618, + "loss/crossentropy": 2.699216842651367, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17626157402992249, + "step": 19006 + }, + { + "epoch": 0.594, + "grad_norm": 3.078125, + "grad_norm_var": 3.042560831705729, + "learning_rate": 0.0001, + "loss": 5.9855, + "loss/crossentropy": 2.5044608116149902, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.19692841917276382, + "step": 19008 + }, + { + "epoch": 0.5940625, + "grad_norm": 3.15625, + "grad_norm_var": 3.050096638997396, + "learning_rate": 0.0001, + "loss": 5.7888, + "loss/crossentropy": 2.6429343223571777, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16810011863708496, + "step": 19010 + }, + { + "epoch": 0.594125, + "grad_norm": 3.125, + "grad_norm_var": 3.074803670247396, + "learning_rate": 0.0001, + "loss": 5.7232, + "loss/crossentropy": 2.5194497108459473, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1688118353486061, + "step": 19012 + }, + { + "epoch": 0.5941875, + "grad_norm": 3.25, + "grad_norm_var": 3.0759073893229165, + "learning_rate": 0.0001, + "loss": 5.6813, + "loss/crossentropy": 2.507745862007141, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16696728765964508, + "step": 19014 + }, + { + "epoch": 0.59425, + "grad_norm": 3.46875, + "grad_norm_var": 3.0719563802083334, + "learning_rate": 0.0001, + "loss": 5.793, + "loss/crossentropy": 2.6356258392333984, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1712021380662918, + "step": 19016 + }, + { + "epoch": 0.5943125, + "grad_norm": 3.15625, + "grad_norm_var": 3.075755818684896, + "learning_rate": 0.0001, + "loss": 5.8095, + "loss/crossentropy": 2.591075897216797, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17183955758810043, + "step": 19018 + }, + { + "epoch": 0.594375, + "grad_norm": 3.40625, + "grad_norm_var": 3.0625, + "learning_rate": 0.0001, + "loss": 5.5936, + "loss/crossentropy": 2.451937198638916, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.165337435901165, + "step": 19020 + }, + { + "epoch": 0.5944375, + "grad_norm": 3.28125, + "grad_norm_var": 3.083788045247396, + "learning_rate": 0.0001, + "loss": 5.4149, + "loss/crossentropy": 2.3058871030807495, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1628570780158043, + "step": 19022 + }, + { + "epoch": 0.5945, + "grad_norm": 3.375, + "grad_norm_var": 0.03297119140625, + "learning_rate": 0.0001, + "loss": 5.5628, + "loss/crossentropy": 2.423704147338867, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1674252226948738, + "step": 19024 + }, + { + "epoch": 0.5945625, + "grad_norm": 3.203125, + "grad_norm_var": 0.0283843994140625, + "learning_rate": 0.0001, + "loss": 6.0417, + "loss/crossentropy": 2.7136545181274414, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17733952403068542, + "step": 19026 + }, + { + "epoch": 0.594625, + "grad_norm": 3.328125, + "grad_norm_var": 0.0339508056640625, + "learning_rate": 0.0001, + "loss": 5.5453, + "loss/crossentropy": 2.377169370651245, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16642732918262482, + "step": 19028 + }, + { + "epoch": 0.5946875, + "grad_norm": 3.0, + "grad_norm_var": 0.0364898681640625, + "learning_rate": 0.0001, + "loss": 5.4178, + "loss/crossentropy": 2.413586974143982, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15159141272306442, + "step": 19030 + }, + { + "epoch": 0.59475, + "grad_norm": 3.109375, + "grad_norm_var": 0.045731608072916666, + "learning_rate": 0.0001, + "loss": 5.86, + "loss/crossentropy": 2.6076817512512207, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17796234786510468, + "step": 19032 + }, + { + "epoch": 0.5948125, + "grad_norm": 3.1875, + "grad_norm_var": 0.0462066650390625, + "learning_rate": 0.0001, + "loss": 5.8138, + "loss/crossentropy": 2.6422449350357056, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1691109985113144, + "step": 19034 + }, + { + "epoch": 0.594875, + "grad_norm": 3.0, + "grad_norm_var": 0.04415690104166667, + "learning_rate": 0.0001, + "loss": 5.6882, + "loss/crossentropy": 2.6311895847320557, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1603873297572136, + "step": 19036 + }, + { + "epoch": 0.5949375, + "grad_norm": 3.390625, + "grad_norm_var": 0.052506510416666666, + "learning_rate": 0.0001, + "loss": 5.4202, + "loss/crossentropy": 2.306910276412964, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16054462641477585, + "step": 19038 + }, + { + "epoch": 0.595, + "grad_norm": 3.359375, + "grad_norm_var": 0.0525787353515625, + "learning_rate": 0.0001, + "loss": 5.8284, + "loss/crossentropy": 2.6306427717208862, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.168601892888546, + "step": 19040 + }, + { + "epoch": 0.5950625, + "grad_norm": 2.96875, + "grad_norm_var": 0.0587554931640625, + "learning_rate": 0.0001, + "loss": 5.8099, + "loss/crossentropy": 2.637050747871399, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17002219706773758, + "step": 19042 + }, + { + "epoch": 0.595125, + "grad_norm": 3.265625, + "grad_norm_var": 0.057942708333333336, + "learning_rate": 0.0001, + "loss": 5.9011, + "loss/crossentropy": 2.6271122694015503, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17739781737327576, + "step": 19044 + }, + { + "epoch": 0.5951875, + "grad_norm": 3.03125, + "grad_norm_var": 0.051878865559895834, + "learning_rate": 0.0001, + "loss": 5.8256, + "loss/crossentropy": 2.6426377296447754, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1667339876294136, + "step": 19046 + }, + { + "epoch": 0.59525, + "grad_norm": 3.140625, + "grad_norm_var": 0.053319295247395836, + "learning_rate": 0.0001, + "loss": 5.4951, + "loss/crossentropy": 2.320650339126587, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1654891073703766, + "step": 19048 + }, + { + "epoch": 0.5953125, + "grad_norm": 2.859375, + "grad_norm_var": 0.07265218098958333, + "learning_rate": 0.0001, + "loss": 5.5159, + "loss/crossentropy": 2.4054372310638428, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1602659523487091, + "step": 19050 + }, + { + "epoch": 0.595375, + "grad_norm": 3.3125, + "grad_norm_var": 0.0620513916015625, + "learning_rate": 0.0001, + "loss": 5.4535, + "loss/crossentropy": 2.3170766830444336, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16403310000896454, + "step": 19052 + }, + { + "epoch": 0.5954375, + "grad_norm": 3.0, + "grad_norm_var": 0.06348368326822916, + "learning_rate": 0.0001, + "loss": 5.584, + "loss/crossentropy": 2.4700220823287964, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16374646127223969, + "step": 19054 + }, + { + "epoch": 0.5955, + "grad_norm": 3.15625, + "grad_norm_var": 0.06272379557291667, + "learning_rate": 0.0001, + "loss": 5.7364, + "loss/crossentropy": 2.5879608392715454, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16914492845535278, + "step": 19056 + }, + { + "epoch": 0.5955625, + "grad_norm": 3.1875, + "grad_norm_var": 0.0620269775390625, + "learning_rate": 0.0001, + "loss": 5.7574, + "loss/crossentropy": 2.60457980632782, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1672329530119896, + "step": 19058 + }, + { + "epoch": 0.595625, + "grad_norm": 2.984375, + "grad_norm_var": 0.06207275390625, + "learning_rate": 0.0001, + "loss": 5.5583, + "loss/crossentropy": 2.3792275190353394, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16595008969306946, + "step": 19060 + }, + { + "epoch": 0.5956875, + "grad_norm": 3.296875, + "grad_norm_var": 0.0641021728515625, + "learning_rate": 0.0001, + "loss": 5.5144, + "loss/crossentropy": 2.4050134420394897, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1613277867436409, + "step": 19062 + }, + { + "epoch": 0.59575, + "grad_norm": 3.046875, + "grad_norm_var": 0.0496490478515625, + "learning_rate": 0.0001, + "loss": 5.7238, + "loss/crossentropy": 2.553224802017212, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17057671397924423, + "step": 19064 + }, + { + "epoch": 0.5958125, + "grad_norm": 3.171875, + "grad_norm_var": 0.025484212239583335, + "learning_rate": 0.0001, + "loss": 5.889, + "loss/crossentropy": 2.669556975364685, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1735069379210472, + "step": 19066 + }, + { + "epoch": 0.595875, + "grad_norm": 3.203125, + "grad_norm_var": 0.024088541666666668, + "learning_rate": 0.0001, + "loss": 6.0692, + "loss/crossentropy": 2.798303008079529, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1770864650607109, + "step": 19068 + }, + { + "epoch": 0.5959375, + "grad_norm": 3.234375, + "grad_norm_var": 0.0181793212890625, + "learning_rate": 0.0001, + "loss": 6.0967, + "loss/crossentropy": 2.8803699016571045, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1747601479291916, + "step": 19070 + }, + { + "epoch": 0.596, + "grad_norm": 3.046875, + "grad_norm_var": 0.0188629150390625, + "learning_rate": 0.0001, + "loss": 5.7132, + "loss/crossentropy": 2.6060245037078857, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1646246314048767, + "step": 19072 + }, + { + "epoch": 0.5960625, + "grad_norm": 2.921875, + "grad_norm_var": 0.023030598958333332, + "learning_rate": 0.0001, + "loss": 5.4366, + "loss/crossentropy": 2.3716466426849365, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16274771094322205, + "step": 19074 + }, + { + "epoch": 0.596125, + "grad_norm": 3.0, + "grad_norm_var": 0.024689737955729166, + "learning_rate": 0.0001, + "loss": 5.7495, + "loss/crossentropy": 2.6083240509033203, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1645103245973587, + "step": 19076 + }, + { + "epoch": 0.5961875, + "grad_norm": 2.96875, + "grad_norm_var": 0.022264607747395835, + "learning_rate": 0.0001, + "loss": 5.6416, + "loss/crossentropy": 2.4824129343032837, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.164356030523777, + "step": 19078 + }, + { + "epoch": 0.59625, + "grad_norm": 3.046875, + "grad_norm_var": 0.0171875, + "learning_rate": 0.0001, + "loss": 5.6659, + "loss/crossentropy": 2.5303527116775513, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16706840693950653, + "step": 19080 + }, + { + "epoch": 0.5963125, + "grad_norm": 3.0625, + "grad_norm_var": 0.022261555989583334, + "learning_rate": 0.0001, + "loss": 5.5689, + "loss/crossentropy": 2.5071014165878296, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1608661264181137, + "step": 19082 + }, + { + "epoch": 0.596375, + "grad_norm": 3.125, + "grad_norm_var": 0.027339680989583334, + "learning_rate": 0.0001, + "loss": 5.4138, + "loss/crossentropy": 2.3051012754440308, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16399822384119034, + "step": 19084 + }, + { + "epoch": 0.5964375, + "grad_norm": 2.875, + "grad_norm_var": 0.03173828125, + "learning_rate": 0.0001, + "loss": 5.2159, + "loss/crossentropy": 2.3455780744552612, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.14601540565490723, + "step": 19086 + }, + { + "epoch": 0.5965, + "grad_norm": 3.359375, + "grad_norm_var": 0.037409464518229164, + "learning_rate": 0.0001, + "loss": 5.7854, + "loss/crossentropy": 2.667971611022949, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16799738258123398, + "step": 19088 + }, + { + "epoch": 0.5965625, + "grad_norm": 3.34375, + "grad_norm_var": 0.058405558268229164, + "learning_rate": 0.0001, + "loss": 5.4464, + "loss/crossentropy": 2.3377362489700317, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16047269850969315, + "step": 19090 + }, + { + "epoch": 0.596625, + "grad_norm": 2.984375, + "grad_norm_var": 0.05292867024739583, + "learning_rate": 0.0001, + "loss": 5.4976, + "loss/crossentropy": 2.4580774307250977, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15707862377166748, + "step": 19092 + }, + { + "epoch": 0.5966875, + "grad_norm": 3.125, + "grad_norm_var": 0.05196024576822917, + "learning_rate": 0.0001, + "loss": 5.6936, + "loss/crossentropy": 2.589480757713318, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16197218000888824, + "step": 19094 + }, + { + "epoch": 0.59675, + "grad_norm": 3.375, + "grad_norm_var": 0.06636962890625, + "learning_rate": 0.0001, + "loss": 5.8982, + "loss/crossentropy": 2.621792435646057, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17842186987400055, + "step": 19096 + }, + { + "epoch": 0.5968125, + "grad_norm": 3.078125, + "grad_norm_var": 0.05995992024739583, + "learning_rate": 0.0001, + "loss": 5.9474, + "loss/crossentropy": 2.7037535905838013, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17397662997245789, + "step": 19098 + }, + { + "epoch": 0.596875, + "grad_norm": 3.125, + "grad_norm_var": 0.05679931640625, + "learning_rate": 0.0001, + "loss": 5.7846, + "loss/crossentropy": 2.606392741203308, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16860055178403854, + "step": 19100 + }, + { + "epoch": 0.5969375, + "grad_norm": 3.078125, + "grad_norm_var": 0.04195963541666667, + "learning_rate": 0.0001, + "loss": 5.8501, + "loss/crossentropy": 2.660889506340027, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17009187489748, + "step": 19102 + }, + { + "epoch": 0.597, + "grad_norm": 3.03125, + "grad_norm_var": 0.04391276041666667, + "learning_rate": 0.0001, + "loss": 5.7535, + "loss/crossentropy": 2.6644606590270996, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1612473577260971, + "step": 19104 + }, + { + "epoch": 0.5970625, + "grad_norm": 3.140625, + "grad_norm_var": 0.03188374837239583, + "learning_rate": 0.0001, + "loss": 5.5338, + "loss/crossentropy": 2.452639102935791, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16475605219602585, + "step": 19106 + }, + { + "epoch": 0.597125, + "grad_norm": 3.15625, + "grad_norm_var": 0.03176676432291667, + "learning_rate": 0.0001, + "loss": 5.6205, + "loss/crossentropy": 2.5299713611602783, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16296324133872986, + "step": 19108 + }, + { + "epoch": 0.5971875, + "grad_norm": 3.15625, + "grad_norm_var": 0.03809305826822917, + "learning_rate": 0.0001, + "loss": 5.9802, + "loss/crossentropy": 2.6791226863861084, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17698239535093307, + "step": 19110 + }, + { + "epoch": 0.59725, + "grad_norm": 3.5, + "grad_norm_var": 0.03294169108072917, + "learning_rate": 0.0001, + "loss": 5.7402, + "loss/crossentropy": 2.533842444419861, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17024310678243637, + "step": 19112 + }, + { + "epoch": 0.5973125, + "grad_norm": 2.90625, + "grad_norm_var": 0.0326812744140625, + "learning_rate": 0.0001, + "loss": 5.8323, + "loss/crossentropy": 2.661787986755371, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16861362755298615, + "step": 19114 + }, + { + "epoch": 0.597375, + "grad_norm": 3.03125, + "grad_norm_var": 0.0326171875, + "learning_rate": 0.0001, + "loss": 5.4835, + "loss/crossentropy": 2.3731298446655273, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1645573154091835, + "step": 19116 + }, + { + "epoch": 0.5974375, + "grad_norm": 3.75, + "grad_norm_var": 0.060456339518229166, + "learning_rate": 0.0001, + "loss": 5.7119, + "loss/crossentropy": 2.494303584098816, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1702006831765175, + "step": 19118 + }, + { + "epoch": 0.5975, + "grad_norm": 3.21875, + "grad_norm_var": 0.054230753580729166, + "learning_rate": 0.0001, + "loss": 6.0539, + "loss/crossentropy": 2.7265243530273438, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.18274109065532684, + "step": 19120 + }, + { + "epoch": 0.5975625, + "grad_norm": 3.0, + "grad_norm_var": 0.1051910400390625, + "learning_rate": 0.0001, + "loss": 5.2535, + "loss/crossentropy": 2.245564341545105, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.15040294080972672, + "step": 19122 + }, + { + "epoch": 0.597625, + "grad_norm": 3.0625, + "grad_norm_var": 0.10263264973958333, + "learning_rate": 0.0001, + "loss": 5.4162, + "loss/crossentropy": 2.3298370838165283, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1621474325656891, + "step": 19124 + }, + { + "epoch": 0.5976875, + "grad_norm": 3.21875, + "grad_norm_var": 0.1067779541015625, + "learning_rate": 0.0001, + "loss": 5.7115, + "loss/crossentropy": 2.5920151472091675, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16429658979177475, + "step": 19126 + }, + { + "epoch": 0.59775, + "grad_norm": 3.0625, + "grad_norm_var": 0.10891011555989584, + "learning_rate": 0.0001, + "loss": 5.9329, + "loss/crossentropy": 2.7269396781921387, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17177174985408783, + "step": 19128 + }, + { + "epoch": 0.5978125, + "grad_norm": 3.203125, + "grad_norm_var": 0.10945536295572916, + "learning_rate": 0.0001, + "loss": 5.626, + "loss/crossentropy": 2.475701689720154, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.17049477994441986, + "step": 19130 + }, + { + "epoch": 0.597875, + "grad_norm": 2.8125, + "grad_norm_var": 0.11669514973958334, + "learning_rate": 0.0001, + "loss": 5.428, + "loss/crossentropy": 2.3634350299835205, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1611437350511551, + "step": 19132 + }, + { + "epoch": 0.5979375, + "grad_norm": 3.25, + "grad_norm_var": 0.10496317545572917, + "learning_rate": 0.0001, + "loss": 5.3409, + "loss/crossentropy": 2.35762619972229, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15262891352176666, + "step": 19134 + }, + { + "epoch": 0.598, + "grad_norm": 3.28125, + "grad_norm_var": 0.1359771728515625, + "learning_rate": 0.0001, + "loss": 5.9168, + "loss/crossentropy": 2.6832181215286255, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1714015007019043, + "step": 19136 + }, + { + "epoch": 0.5980625, + "grad_norm": 3.171875, + "grad_norm_var": 0.0734283447265625, + "learning_rate": 0.0001, + "loss": 5.2945, + "loss/crossentropy": 2.269458770751953, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15523458272218704, + "step": 19138 + }, + { + "epoch": 0.598125, + "grad_norm": 3.234375, + "grad_norm_var": 0.07655843098958333, + "learning_rate": 0.0001, + "loss": 5.4908, + "loss/crossentropy": 2.4571874141693115, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15805184841156006, + "step": 19140 + }, + { + "epoch": 0.5981875, + "grad_norm": 3.296875, + "grad_norm_var": 0.08671875, + "learning_rate": 0.0001, + "loss": 5.9359, + "loss/crossentropy": 2.7241885662078857, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.16648481786251068, + "step": 19142 + }, + { + "epoch": 0.59825, + "grad_norm": 3.0625, + "grad_norm_var": 0.0853179931640625, + "learning_rate": 0.0001, + "loss": 5.9012, + "loss/crossentropy": 2.720415949821472, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17081619054079056, + "step": 19144 + }, + { + "epoch": 0.5983125, + "grad_norm": 3.03125, + "grad_norm_var": 0.07872721354166666, + "learning_rate": 0.0001, + "loss": 5.2981, + "loss/crossentropy": 2.2692290544509888, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16031364351511002, + "step": 19146 + }, + { + "epoch": 0.598375, + "grad_norm": 3.125, + "grad_norm_var": 0.07541910807291667, + "learning_rate": 0.0001, + "loss": 5.3256, + "loss/crossentropy": 2.3439310789108276, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15168164670467377, + "step": 19148 + }, + { + "epoch": 0.5984375, + "grad_norm": 3.234375, + "grad_norm_var": 0.06773681640625, + "learning_rate": 0.0001, + "loss": 5.7722, + "loss/crossentropy": 2.554852604866028, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17173956334590912, + "step": 19150 + }, + { + "epoch": 0.5985, + "grad_norm": 3.453125, + "grad_norm_var": 0.04127197265625, + "learning_rate": 0.0001, + "loss": 5.977, + "loss/crossentropy": 2.7384544610977173, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17346123605966568, + "step": 19152 + }, + { + "epoch": 0.5985625, + "grad_norm": 3.390625, + "grad_norm_var": 0.045947265625, + "learning_rate": 0.0001, + "loss": 5.8553, + "loss/crossentropy": 2.676303505897522, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16555601358413696, + "step": 19154 + }, + { + "epoch": 0.598625, + "grad_norm": 3.421875, + "grad_norm_var": 0.04696858723958333, + "learning_rate": 0.0001, + "loss": 6.0491, + "loss/crossentropy": 2.6892950534820557, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18089944124221802, + "step": 19156 + }, + { + "epoch": 0.5986875, + "grad_norm": 3.046875, + "grad_norm_var": 0.03271077473958333, + "learning_rate": 0.0001, + "loss": 5.5119, + "loss/crossentropy": 2.427427649497986, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16352641582489014, + "step": 19158 + }, + { + "epoch": 0.59875, + "grad_norm": 3.109375, + "grad_norm_var": 0.036519368489583336, + "learning_rate": 0.0001, + "loss": 5.3986, + "loss/crossentropy": 2.3395333290100098, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1586436927318573, + "step": 19160 + }, + { + "epoch": 0.5988125, + "grad_norm": 3.234375, + "grad_norm_var": 0.03778889973958333, + "learning_rate": 0.0001, + "loss": 5.7716, + "loss/crossentropy": 2.6539366245269775, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16489075124263763, + "step": 19162 + }, + { + "epoch": 0.598875, + "grad_norm": 3.5625, + "grad_norm_var": 0.03687744140625, + "learning_rate": 0.0001, + "loss": 5.8816, + "loss/crossentropy": 2.6859357357025146, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1719125434756279, + "step": 19164 + }, + { + "epoch": 0.5989375, + "grad_norm": 3.09375, + "grad_norm_var": 0.03909098307291667, + "learning_rate": 0.0001, + "loss": 5.7933, + "loss/crossentropy": 2.581337332725525, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1735384166240692, + "step": 19166 + }, + { + "epoch": 0.599, + "grad_norm": 3.015625, + "grad_norm_var": 0.04478759765625, + "learning_rate": 0.0001, + "loss": 5.6245, + "loss/crossentropy": 2.542153000831604, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16330838948488235, + "step": 19168 + }, + { + "epoch": 0.5990625, + "grad_norm": 3.265625, + "grad_norm_var": 0.04004618326822917, + "learning_rate": 0.0001, + "loss": 5.7736, + "loss/crossentropy": 2.6168237924575806, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.17193201184272766, + "step": 19170 + }, + { + "epoch": 0.599125, + "grad_norm": 3.03125, + "grad_norm_var": 0.06161702473958333, + "learning_rate": 0.0001, + "loss": 5.4472, + "loss/crossentropy": 2.2256386280059814, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.1639539822936058, + "step": 19172 + }, + { + "epoch": 0.5991875, + "grad_norm": 4.03125, + "grad_norm_var": 0.10725504557291667, + "learning_rate": 0.0001, + "loss": 5.8376, + "loss/crossentropy": 2.637717127799988, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16725322604179382, + "step": 19174 + }, + { + "epoch": 0.59925, + "grad_norm": 3.078125, + "grad_norm_var": 0.10352274576822916, + "learning_rate": 0.0001, + "loss": 5.6908, + "loss/crossentropy": 2.5423028469085693, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1664077639579773, + "step": 19176 + }, + { + "epoch": 0.5993125, + "grad_norm": 3.1875, + "grad_norm_var": 0.10003153483072917, + "learning_rate": 0.0001, + "loss": 5.725, + "loss/crossentropy": 2.5043609142303467, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17401903122663498, + "step": 19178 + }, + { + "epoch": 0.599375, + "grad_norm": 3.328125, + "grad_norm_var": 0.10322265625, + "learning_rate": 0.0001, + "loss": 5.2699, + "loss/crossentropy": 2.274636387825012, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1569497212767601, + "step": 19180 + }, + { + "epoch": 0.5994375, + "grad_norm": 3.046875, + "grad_norm_var": 0.10481669108072916, + "learning_rate": 0.0001, + "loss": 5.7336, + "loss/crossentropy": 2.5585625171661377, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17024116218090057, + "step": 19182 + }, + { + "epoch": 0.5995, + "grad_norm": 2.96875, + "grad_norm_var": 0.0996246337890625, + "learning_rate": 0.0001, + "loss": 5.6628, + "loss/crossentropy": 2.603795886039734, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16097746789455414, + "step": 19184 + }, + { + "epoch": 0.5995625, + "grad_norm": 3.328125, + "grad_norm_var": 0.09787495930989583, + "learning_rate": 0.0001, + "loss": 5.6364, + "loss/crossentropy": 2.4129215478897095, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1711808741092682, + "step": 19186 + }, + { + "epoch": 0.599625, + "grad_norm": 3.203125, + "grad_norm_var": 0.07096354166666667, + "learning_rate": 0.0001, + "loss": 5.5324, + "loss/crossentropy": 2.4042720794677734, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16750407218933105, + "step": 19188 + }, + { + "epoch": 0.5996875, + "grad_norm": 2.953125, + "grad_norm_var": 0.04303385416666667, + "learning_rate": 0.0001, + "loss": 5.7264, + "loss/crossentropy": 2.5236847400665283, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17417382448911667, + "step": 19190 + }, + { + "epoch": 0.59975, + "grad_norm": 3.390625, + "grad_norm_var": 0.04317118326822917, + "learning_rate": 0.0001, + "loss": 5.8526, + "loss/crossentropy": 2.5612668991088867, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1783522292971611, + "step": 19192 + }, + { + "epoch": 0.5998125, + "grad_norm": 3.078125, + "grad_norm_var": 0.04211832682291667, + "learning_rate": 0.0001, + "loss": 5.5537, + "loss/crossentropy": 2.4094767570495605, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16716086119413376, + "step": 19194 + }, + { + "epoch": 0.599875, + "grad_norm": 3.1875, + "grad_norm_var": 0.03567301432291667, + "learning_rate": 0.0001, + "loss": 5.527, + "loss/crossentropy": 2.483111023902893, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15908117592334747, + "step": 19196 + }, + { + "epoch": 0.5999375, + "grad_norm": 3.03125, + "grad_norm_var": 0.04003499348958333, + "learning_rate": 0.0001, + "loss": 5.4426, + "loss/crossentropy": 2.4285643100738525, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15414240956306458, + "step": 19198 + }, + { + "epoch": 0.6, + "grad_norm": 3.15625, + "grad_norm_var": 0.03810933430989583, + "learning_rate": 0.0001, + "loss": 5.7239, + "loss/crossentropy": 2.4994139671325684, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17167074233293533, + "step": 19200 + }, + { + "epoch": 0.6000625, + "grad_norm": 3.234375, + "grad_norm_var": 0.04234619140625, + "learning_rate": 0.0001, + "loss": 5.628, + "loss/crossentropy": 2.506988286972046, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16678385436534882, + "step": 19202 + }, + { + "epoch": 0.600125, + "grad_norm": 3.34375, + "grad_norm_var": 0.04420166015625, + "learning_rate": 0.0001, + "loss": 5.7679, + "loss/crossentropy": 2.5038286447525024, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17601903527975082, + "step": 19204 + }, + { + "epoch": 0.6001875, + "grad_norm": 3.203125, + "grad_norm_var": 0.027925618489583335, + "learning_rate": 0.0001, + "loss": 5.9747, + "loss/crossentropy": 2.714989423751831, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1775367259979248, + "step": 19206 + }, + { + "epoch": 0.60025, + "grad_norm": 2.90625, + "grad_norm_var": 0.03817952473958333, + "learning_rate": 0.0001, + "loss": 5.483, + "loss/crossentropy": 2.492974638938904, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.15798742324113846, + "step": 19208 + }, + { + "epoch": 0.6003125, + "grad_norm": 3.125, + "grad_norm_var": 0.03810933430989583, + "learning_rate": 0.0001, + "loss": 5.7218, + "loss/crossentropy": 2.5865061283111572, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1674380898475647, + "step": 19210 + }, + { + "epoch": 0.600375, + "grad_norm": 2.9375, + "grad_norm_var": 0.060399373372395836, + "learning_rate": 0.0001, + "loss": 5.2924, + "loss/crossentropy": 2.317363142967224, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15180116146802902, + "step": 19212 + }, + { + "epoch": 0.6004375, + "grad_norm": 2.96875, + "grad_norm_var": 0.06330973307291667, + "learning_rate": 0.0001, + "loss": 5.6514, + "loss/crossentropy": 2.6089274883270264, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1620563492178917, + "step": 19214 + }, + { + "epoch": 0.6005, + "grad_norm": 2.9375, + "grad_norm_var": 0.06081441243489583, + "learning_rate": 0.0001, + "loss": 5.1693, + "loss/crossentropy": 2.2365992069244385, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.15225821733474731, + "step": 19216 + }, + { + "epoch": 0.6005625, + "grad_norm": 3.125, + "grad_norm_var": 0.059309895833333334, + "learning_rate": 0.0001, + "loss": 5.582, + "loss/crossentropy": 2.494749903678894, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16341552883386612, + "step": 19218 + }, + { + "epoch": 0.600625, + "grad_norm": 3.171875, + "grad_norm_var": 0.05257059733072917, + "learning_rate": 0.0001, + "loss": 5.8487, + "loss/crossentropy": 2.6580334901809692, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16906992346048355, + "step": 19220 + }, + { + "epoch": 0.6006875, + "grad_norm": 3.171875, + "grad_norm_var": 0.04804585774739583, + "learning_rate": 0.0001, + "loss": 5.8858, + "loss/crossentropy": 2.6356533765792847, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1750139519572258, + "step": 19222 + }, + { + "epoch": 0.60075, + "grad_norm": 3.109375, + "grad_norm_var": 0.04234619140625, + "learning_rate": 0.0001, + "loss": 5.6525, + "loss/crossentropy": 2.5122116804122925, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1687157079577446, + "step": 19224 + }, + { + "epoch": 0.6008125, + "grad_norm": 3.125, + "grad_norm_var": 0.0424224853515625, + "learning_rate": 0.0001, + "loss": 5.8, + "loss/crossentropy": 2.5830377340316772, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1681763455271721, + "step": 19226 + }, + { + "epoch": 0.600875, + "grad_norm": 3.75, + "grad_norm_var": 0.0402252197265625, + "learning_rate": 0.0001, + "loss": 5.625, + "loss/crossentropy": 2.4458080530166626, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16908913850784302, + "step": 19228 + }, + { + "epoch": 0.6009375, + "grad_norm": 3.703125, + "grad_norm_var": 0.05497945149739583, + "learning_rate": 0.0001, + "loss": 6.1473, + "loss/crossentropy": 2.7635291814804077, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18720870465040207, + "step": 19230 + }, + { + "epoch": 0.601, + "grad_norm": 3.046875, + "grad_norm_var": 0.058329264322916664, + "learning_rate": 0.0001, + "loss": 5.9621, + "loss/crossentropy": 2.662600040435791, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.1760430634021759, + "step": 19232 + }, + { + "epoch": 0.6010625, + "grad_norm": 3.1875, + "grad_norm_var": 0.05292867024739583, + "learning_rate": 0.0001, + "loss": 5.597, + "loss/crossentropy": 2.387401819229126, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1678304523229599, + "step": 19234 + }, + { + "epoch": 0.601125, + "grad_norm": 3.484375, + "grad_norm_var": 0.05894775390625, + "learning_rate": 0.0001, + "loss": 5.6045, + "loss/crossentropy": 2.44003164768219, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1640988290309906, + "step": 19236 + }, + { + "epoch": 0.6011875, + "grad_norm": 3.109375, + "grad_norm_var": 0.062109375, + "learning_rate": 0.0001, + "loss": 5.542, + "loss/crossentropy": 2.467086672782898, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1641310602426529, + "step": 19238 + }, + { + "epoch": 0.60125, + "grad_norm": 3.1875, + "grad_norm_var": 0.06796875, + "learning_rate": 0.0001, + "loss": 5.7396, + "loss/crossentropy": 2.6073544025421143, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16752376407384872, + "step": 19240 + }, + { + "epoch": 0.6013125, + "grad_norm": 3.15625, + "grad_norm_var": 0.06892903645833333, + "learning_rate": 0.0001, + "loss": 5.461, + "loss/crossentropy": 2.3763049840927124, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16198724508285522, + "step": 19242 + }, + { + "epoch": 0.601375, + "grad_norm": 3.453125, + "grad_norm_var": 0.053873697916666664, + "learning_rate": 0.0001, + "loss": 5.9794, + "loss/crossentropy": 2.6908187866210938, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1780814528465271, + "step": 19244 + }, + { + "epoch": 0.6014375, + "grad_norm": 3.109375, + "grad_norm_var": 0.03385009765625, + "learning_rate": 0.0001, + "loss": 5.7021, + "loss/crossentropy": 2.5598464012145996, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16812731325626373, + "step": 19246 + }, + { + "epoch": 0.6015, + "grad_norm": 2.859375, + "grad_norm_var": 0.02711181640625, + "learning_rate": 0.0001, + "loss": 5.589, + "loss/crossentropy": 2.5304884910583496, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16054195910692215, + "step": 19248 + }, + { + "epoch": 0.6015625, + "grad_norm": 2.984375, + "grad_norm_var": 0.0286529541015625, + "learning_rate": 0.0001, + "loss": 5.917, + "loss/crossentropy": 2.7158541679382324, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17129085212945938, + "step": 19250 + }, + { + "epoch": 0.601625, + "grad_norm": 2.875, + "grad_norm_var": 0.024738566080729166, + "learning_rate": 0.0001, + "loss": 5.7501, + "loss/crossentropy": 2.6116241216659546, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1618928164243698, + "step": 19252 + }, + { + "epoch": 0.6016875, + "grad_norm": 3.0, + "grad_norm_var": 0.02626953125, + "learning_rate": 0.0001, + "loss": 5.5211, + "loss/crossentropy": 2.395193934440613, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16688557714223862, + "step": 19254 + }, + { + "epoch": 0.60175, + "grad_norm": 2.578125, + "grad_norm_var": 0.04390869140625, + "learning_rate": 0.0001, + "loss": 5.1934, + "loss/crossentropy": 2.285116672515869, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.14785685390233994, + "step": 19256 + }, + { + "epoch": 0.6018125, + "grad_norm": 3.078125, + "grad_norm_var": 0.06172587076822917, + "learning_rate": 0.0001, + "loss": 6.0258, + "loss/crossentropy": 2.773720860481262, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17637999355793, + "step": 19258 + }, + { + "epoch": 0.601875, + "grad_norm": 2.734375, + "grad_norm_var": 0.0645904541015625, + "learning_rate": 0.0001, + "loss": 5.4832, + "loss/crossentropy": 2.465247869491577, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15648344159126282, + "step": 19260 + }, + { + "epoch": 0.6019375, + "grad_norm": 2.984375, + "grad_norm_var": 0.06220601399739583, + "learning_rate": 0.0001, + "loss": 5.537, + "loss/crossentropy": 2.412294864654541, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1624700427055359, + "step": 19262 + }, + { + "epoch": 0.602, + "grad_norm": 3.015625, + "grad_norm_var": 0.0580963134765625, + "learning_rate": 0.0001, + "loss": 5.5669, + "loss/crossentropy": 2.4570724964141846, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16293827444314957, + "step": 19264 + }, + { + "epoch": 0.6020625, + "grad_norm": 3.234375, + "grad_norm_var": 0.06041259765625, + "learning_rate": 0.0001, + "loss": 5.6432, + "loss/crossentropy": 2.4601714611053467, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17103426903486252, + "step": 19266 + }, + { + "epoch": 0.602125, + "grad_norm": 3.46875, + "grad_norm_var": 0.06607666015625, + "learning_rate": 0.0001, + "loss": 5.8618, + "loss/crossentropy": 2.581000566482544, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17652061581611633, + "step": 19268 + }, + { + "epoch": 0.6021875, + "grad_norm": 3.015625, + "grad_norm_var": 0.06591796875, + "learning_rate": 0.0001, + "loss": 5.2412, + "loss/crossentropy": 2.2920453548431396, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.14804230630397797, + "step": 19270 + }, + { + "epoch": 0.60225, + "grad_norm": 2.953125, + "grad_norm_var": 0.058039347330729164, + "learning_rate": 0.0001, + "loss": 6.0194, + "loss/crossentropy": 2.7348662614822388, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1796286329627037, + "step": 19272 + }, + { + "epoch": 0.6023125, + "grad_norm": 3.109375, + "grad_norm_var": 0.04020894368489583, + "learning_rate": 0.0001, + "loss": 5.5369, + "loss/crossentropy": 2.4168903827667236, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16395603120326996, + "step": 19274 + }, + { + "epoch": 0.602375, + "grad_norm": 3.015625, + "grad_norm_var": 0.029195149739583332, + "learning_rate": 0.0001, + "loss": 5.7797, + "loss/crossentropy": 2.668266177177429, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16231407225131989, + "step": 19276 + }, + { + "epoch": 0.6024375, + "grad_norm": 3.125, + "grad_norm_var": 0.033543904622395836, + "learning_rate": 0.0001, + "loss": 5.6619, + "loss/crossentropy": 2.525829553604126, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16556084156036377, + "step": 19278 + }, + { + "epoch": 0.6025, + "grad_norm": 3.03125, + "grad_norm_var": 0.03453369140625, + "learning_rate": 0.0001, + "loss": 5.74, + "loss/crossentropy": 2.6013892889022827, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1658184453845024, + "step": 19280 + }, + { + "epoch": 0.6025625, + "grad_norm": 3.09375, + "grad_norm_var": 0.035542805989583336, + "learning_rate": 0.0001, + "loss": 5.8211, + "loss/crossentropy": 2.6150509119033813, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16982468217611313, + "step": 19282 + }, + { + "epoch": 0.602625, + "grad_norm": 3.125, + "grad_norm_var": 0.027765909830729168, + "learning_rate": 0.0001, + "loss": 5.8615, + "loss/crossentropy": 2.733675479888916, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16668817400932312, + "step": 19284 + }, + { + "epoch": 0.6026875, + "grad_norm": 3.140625, + "grad_norm_var": 0.025837198893229166, + "learning_rate": 0.0001, + "loss": 5.754, + "loss/crossentropy": 2.6141045093536377, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16711291670799255, + "step": 19286 + }, + { + "epoch": 0.60275, + "grad_norm": 2.921875, + "grad_norm_var": 0.24107666015625, + "learning_rate": 0.0001, + "loss": 5.8841, + "loss/crossentropy": 2.614741325378418, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.18123618513345718, + "step": 19288 + }, + { + "epoch": 0.6028125, + "grad_norm": 3.375, + "grad_norm_var": 0.23961588541666667, + "learning_rate": 0.0001, + "loss": 5.7736, + "loss/crossentropy": 2.566126227378845, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17270290851593018, + "step": 19290 + }, + { + "epoch": 0.602875, + "grad_norm": 3.1875, + "grad_norm_var": 0.2448150634765625, + "learning_rate": 0.0001, + "loss": 5.7644, + "loss/crossentropy": 2.664409875869751, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16351237148046494, + "step": 19292 + }, + { + "epoch": 0.6029375, + "grad_norm": 3.078125, + "grad_norm_var": 0.2454010009765625, + "learning_rate": 0.0001, + "loss": 5.8293, + "loss/crossentropy": 2.7215731143951416, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1646827906370163, + "step": 19294 + }, + { + "epoch": 0.603, + "grad_norm": 2.9375, + "grad_norm_var": 0.2463775634765625, + "learning_rate": 0.0001, + "loss": 5.7439, + "loss/crossentropy": 2.5724823474884033, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1690913736820221, + "step": 19296 + }, + { + "epoch": 0.6030625, + "grad_norm": 3.03125, + "grad_norm_var": 0.24879150390625, + "learning_rate": 0.0001, + "loss": 6.1137, + "loss/crossentropy": 2.854590892791748, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17747671157121658, + "step": 19298 + }, + { + "epoch": 0.603125, + "grad_norm": 3.359375, + "grad_norm_var": 0.25308329264322915, + "learning_rate": 0.0001, + "loss": 5.8245, + "loss/crossentropy": 2.6510233879089355, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1689062938094139, + "step": 19300 + }, + { + "epoch": 0.6031875, + "grad_norm": 3.359375, + "grad_norm_var": 0.25236714680989586, + "learning_rate": 0.0001, + "loss": 5.8015, + "loss/crossentropy": 2.5342386960983276, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17790137231349945, + "step": 19302 + }, + { + "epoch": 0.60325, + "grad_norm": 3.640625, + "grad_norm_var": 0.04108784993489583, + "learning_rate": 0.0001, + "loss": 5.9817, + "loss/crossentropy": 2.6479718685150146, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18142174929380417, + "step": 19304 + }, + { + "epoch": 0.6033125, + "grad_norm": 3.71875, + "grad_norm_var": 0.06306050618489584, + "learning_rate": 0.0001, + "loss": 5.5137, + "loss/crossentropy": 2.349372386932373, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16955320537090302, + "step": 19306 + }, + { + "epoch": 0.603375, + "grad_norm": 3.859375, + "grad_norm_var": 0.087744140625, + "learning_rate": 0.0001, + "loss": 6.2897, + "loss/crossentropy": 2.863835573196411, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.19102825969457626, + "step": 19308 + }, + { + "epoch": 0.6034375, + "grad_norm": 3.078125, + "grad_norm_var": 0.08430887858072916, + "learning_rate": 0.0001, + "loss": 5.752, + "loss/crossentropy": 2.6125062704086304, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16590477526187897, + "step": 19310 + }, + { + "epoch": 0.6035, + "grad_norm": 3.328125, + "grad_norm_var": 0.07696940104166666, + "learning_rate": 0.0001, + "loss": 5.7415, + "loss/crossentropy": 2.5720916986465454, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16889013350009918, + "step": 19312 + }, + { + "epoch": 0.6035625, + "grad_norm": 3.21875, + "grad_norm_var": 0.071240234375, + "learning_rate": 0.0001, + "loss": 5.4345, + "loss/crossentropy": 2.3364455699920654, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.162933811545372, + "step": 19314 + }, + { + "epoch": 0.603625, + "grad_norm": 3.109375, + "grad_norm_var": 0.06687723795572917, + "learning_rate": 0.0001, + "loss": 5.7682, + "loss/crossentropy": 2.6161619424819946, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16520489007234573, + "step": 19316 + }, + { + "epoch": 0.6036875, + "grad_norm": 3.3125, + "grad_norm_var": 0.07131754557291667, + "learning_rate": 0.0001, + "loss": 5.8675, + "loss/crossentropy": 2.642058491706848, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17489047348499298, + "step": 19318 + }, + { + "epoch": 0.60375, + "grad_norm": 2.921875, + "grad_norm_var": 0.06887919108072917, + "learning_rate": 0.0001, + "loss": 5.7439, + "loss/crossentropy": 2.6179521083831787, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.163767471909523, + "step": 19320 + }, + { + "epoch": 0.6038125, + "grad_norm": 3.125, + "grad_norm_var": 0.052652994791666664, + "learning_rate": 0.0001, + "loss": 5.4641, + "loss/crossentropy": 2.4575281143188477, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15573221445083618, + "step": 19322 + }, + { + "epoch": 0.603875, + "grad_norm": 3.46875, + "grad_norm_var": 0.025569661458333334, + "learning_rate": 0.0001, + "loss": 5.7645, + "loss/crossentropy": 2.5369192361831665, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17002779245376587, + "step": 19324 + }, + { + "epoch": 0.6039375, + "grad_norm": 3.9375, + "grad_norm_var": 0.062272135416666666, + "learning_rate": 0.0001, + "loss": 5.7283, + "loss/crossentropy": 2.4499664306640625, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1762692779302597, + "step": 19326 + }, + { + "epoch": 0.604, + "grad_norm": 3.140625, + "grad_norm_var": 0.062272135416666666, + "learning_rate": 0.0001, + "loss": 5.7353, + "loss/crossentropy": 2.638036370277405, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16480784863233566, + "step": 19328 + }, + { + "epoch": 0.6040625, + "grad_norm": 3.5, + "grad_norm_var": 0.07062886555989584, + "learning_rate": 0.0001, + "loss": 5.8974, + "loss/crossentropy": 2.7524850368499756, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1640986055135727, + "step": 19330 + }, + { + "epoch": 0.604125, + "grad_norm": 3.28125, + "grad_norm_var": 0.07642822265625, + "learning_rate": 0.0001, + "loss": 5.5819, + "loss/crossentropy": 2.381034731864929, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17047860473394394, + "step": 19332 + }, + { + "epoch": 0.6041875, + "grad_norm": 3.21875, + "grad_norm_var": 0.07183837890625, + "learning_rate": 0.0001, + "loss": 5.6297, + "loss/crossentropy": 2.5100589990615845, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16157501190900803, + "step": 19334 + }, + { + "epoch": 0.60425, + "grad_norm": 3.015625, + "grad_norm_var": 0.07021077473958333, + "learning_rate": 0.0001, + "loss": 5.4607, + "loss/crossentropy": 2.4219084978103638, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15778452903032303, + "step": 19336 + }, + { + "epoch": 0.6043125, + "grad_norm": 3.421875, + "grad_norm_var": 0.0674957275390625, + "learning_rate": 0.0001, + "loss": 5.3473, + "loss/crossentropy": 2.3017560243606567, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.15337897092103958, + "step": 19338 + }, + { + "epoch": 0.604375, + "grad_norm": 2.984375, + "grad_norm_var": 0.06689046223958334, + "learning_rate": 0.0001, + "loss": 5.4759, + "loss/crossentropy": 2.462526559829712, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1564170941710472, + "step": 19340 + }, + { + "epoch": 0.6044375, + "grad_norm": 3.203125, + "grad_norm_var": 0.03827718098958333, + "learning_rate": 0.0001, + "loss": 5.5193, + "loss/crossentropy": 2.4268585443496704, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.161198690533638, + "step": 19342 + }, + { + "epoch": 0.6045, + "grad_norm": 2.875, + "grad_norm_var": 0.04409077962239583, + "learning_rate": 0.0001, + "loss": 5.841, + "loss/crossentropy": 2.7531384229660034, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1634698063135147, + "step": 19344 + }, + { + "epoch": 0.6045625, + "grad_norm": 3.015625, + "grad_norm_var": 0.037007649739583336, + "learning_rate": 0.0001, + "loss": 5.8738, + "loss/crossentropy": 2.654186487197876, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17235031723976135, + "step": 19346 + }, + { + "epoch": 0.604625, + "grad_norm": 5.15625, + "grad_norm_var": 0.28866780598958336, + "learning_rate": 0.0001, + "loss": 5.8584, + "loss/crossentropy": 2.6439250707626343, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16988525539636612, + "step": 19348 + }, + { + "epoch": 0.6046875, + "grad_norm": 3.203125, + "grad_norm_var": 0.28967183430989585, + "learning_rate": 0.0001, + "loss": 5.4388, + "loss/crossentropy": 2.3536821603775024, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16319891065359116, + "step": 19350 + }, + { + "epoch": 0.60475, + "grad_norm": 3.09375, + "grad_norm_var": 0.28609619140625, + "learning_rate": 0.0001, + "loss": 6.0791, + "loss/crossentropy": 2.845702886581421, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1748986840248108, + "step": 19352 + }, + { + "epoch": 0.6048125, + "grad_norm": 3.546875, + "grad_norm_var": 0.2860636393229167, + "learning_rate": 0.0001, + "loss": 5.7825, + "loss/crossentropy": 2.5356805324554443, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17468440532684326, + "step": 19354 + }, + { + "epoch": 0.604875, + "grad_norm": 3.21875, + "grad_norm_var": 0.281982421875, + "learning_rate": 0.0001, + "loss": 5.6824, + "loss/crossentropy": 2.5392826795578003, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16899523884058, + "step": 19356 + }, + { + "epoch": 0.6049375, + "grad_norm": 3.25, + "grad_norm_var": 0.27551676432291666, + "learning_rate": 0.0001, + "loss": 5.9311, + "loss/crossentropy": 2.71337354183197, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17450948804616928, + "step": 19358 + }, + { + "epoch": 0.605, + "grad_norm": 3.28125, + "grad_norm_var": 0.2628082275390625, + "learning_rate": 0.0001, + "loss": 5.7855, + "loss/crossentropy": 2.6472429037094116, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16929297894239426, + "step": 19360 + }, + { + "epoch": 0.6050625, + "grad_norm": 3.109375, + "grad_norm_var": 0.2653605143229167, + "learning_rate": 0.0001, + "loss": 5.462, + "loss/crossentropy": 2.394970655441284, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16256201267242432, + "step": 19362 + }, + { + "epoch": 0.605125, + "grad_norm": 3.546875, + "grad_norm_var": 0.023786417643229165, + "learning_rate": 0.0001, + "loss": 6.01, + "loss/crossentropy": 2.829805612564087, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16685090214014053, + "step": 19364 + }, + { + "epoch": 0.6051875, + "grad_norm": 3.328125, + "grad_norm_var": 0.2587890625, + "learning_rate": 0.0001, + "loss": 6.0703, + "loss/crossentropy": 2.699275016784668, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18398147821426392, + "step": 19366 + }, + { + "epoch": 0.60525, + "grad_norm": 3.21875, + "grad_norm_var": 0.2541249593098958, + "learning_rate": 0.0001, + "loss": 5.8238, + "loss/crossentropy": 2.6740514039993286, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16692347824573517, + "step": 19368 + }, + { + "epoch": 0.6053125, + "grad_norm": 3.4375, + "grad_norm_var": 0.2579254150390625, + "learning_rate": 0.0001, + "loss": 6.0299, + "loss/crossentropy": 2.7693264484405518, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17722737044095993, + "step": 19370 + }, + { + "epoch": 0.605375, + "grad_norm": 3.15625, + "grad_norm_var": 0.25732014973958334, + "learning_rate": 0.0001, + "loss": 5.7676, + "loss/crossentropy": 2.6156188249588013, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16676147282123566, + "step": 19372 + }, + { + "epoch": 0.6054375, + "grad_norm": 2.953125, + "grad_norm_var": 0.2650227864583333, + "learning_rate": 0.0001, + "loss": 5.7391, + "loss/crossentropy": 2.6294628381729126, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16486700624227524, + "step": 19374 + }, + { + "epoch": 0.6055, + "grad_norm": 3.46875, + "grad_norm_var": 0.26396077473958335, + "learning_rate": 0.0001, + "loss": 5.8428, + "loss/crossentropy": 2.6216262578964233, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17250894010066986, + "step": 19376 + }, + { + "epoch": 0.6055625, + "grad_norm": 3.078125, + "grad_norm_var": 0.2618479410807292, + "learning_rate": 0.0001, + "loss": 5.4434, + "loss/crossentropy": 2.385737895965576, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16123860329389572, + "step": 19378 + }, + { + "epoch": 0.605625, + "grad_norm": 3.21875, + "grad_norm_var": 0.27021484375, + "learning_rate": 0.0001, + "loss": 5.8877, + "loss/crossentropy": 2.6616382598876953, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17573069781064987, + "step": 19380 + }, + { + "epoch": 0.6056875, + "grad_norm": 3.0625, + "grad_norm_var": 0.022443644205729165, + "learning_rate": 0.0001, + "loss": 5.3565, + "loss/crossentropy": 2.291105270385742, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.15732458233833313, + "step": 19382 + }, + { + "epoch": 0.60575, + "grad_norm": 3.140625, + "grad_norm_var": 0.020881144205729167, + "learning_rate": 0.0001, + "loss": 5.7053, + "loss/crossentropy": 2.6108888387680054, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16100048273801804, + "step": 19384 + }, + { + "epoch": 0.6058125, + "grad_norm": 3.0625, + "grad_norm_var": 0.0164947509765625, + "learning_rate": 0.0001, + "loss": 5.6552, + "loss/crossentropy": 2.6393449306488037, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15744362771511078, + "step": 19386 + }, + { + "epoch": 0.605875, + "grad_norm": 2.96875, + "grad_norm_var": 0.01754150390625, + "learning_rate": 0.0001, + "loss": 5.3549, + "loss/crossentropy": 2.325733184814453, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15799043327569962, + "step": 19388 + }, + { + "epoch": 0.6059375, + "grad_norm": 3.171875, + "grad_norm_var": 0.0178863525390625, + "learning_rate": 0.0001, + "loss": 5.6828, + "loss/crossentropy": 2.5999585390090942, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16531386226415634, + "step": 19390 + }, + { + "epoch": 0.606, + "grad_norm": 3.40625, + "grad_norm_var": 0.013036092122395834, + "learning_rate": 0.0001, + "loss": 5.783, + "loss/crossentropy": 2.5392236709594727, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1755482405424118, + "step": 19392 + }, + { + "epoch": 0.6060625, + "grad_norm": 3.34375, + "grad_norm_var": 0.019514973958333334, + "learning_rate": 0.0001, + "loss": 5.7082, + "loss/crossentropy": 2.5588942766189575, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1696172133088112, + "step": 19394 + }, + { + "epoch": 0.606125, + "grad_norm": 3.515625, + "grad_norm_var": 0.028050740559895832, + "learning_rate": 0.0001, + "loss": 5.8606, + "loss/crossentropy": 2.6143925189971924, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17813197523355484, + "step": 19396 + }, + { + "epoch": 0.6061875, + "grad_norm": 3.078125, + "grad_norm_var": 0.028294881184895832, + "learning_rate": 0.0001, + "loss": 5.9204, + "loss/crossentropy": 2.693058729171753, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17234336584806442, + "step": 19398 + }, + { + "epoch": 0.60625, + "grad_norm": 3.28125, + "grad_norm_var": 0.033544921875, + "learning_rate": 0.0001, + "loss": 5.7255, + "loss/crossentropy": 2.603538751602173, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16453826427459717, + "step": 19400 + }, + { + "epoch": 0.6063125, + "grad_norm": 3.46875, + "grad_norm_var": 0.0446685791015625, + "learning_rate": 0.0001, + "loss": 5.9881, + "loss/crossentropy": 2.7714216709136963, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1743982583284378, + "step": 19402 + }, + { + "epoch": 0.606375, + "grad_norm": 3.046875, + "grad_norm_var": 0.04474995930989583, + "learning_rate": 0.0001, + "loss": 5.8255, + "loss/crossentropy": 2.6851247549057007, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16872383654117584, + "step": 19404 + }, + { + "epoch": 0.6064375, + "grad_norm": 3.09375, + "grad_norm_var": 0.05657145182291667, + "learning_rate": 0.0001, + "loss": 6.0151, + "loss/crossentropy": 2.7455601692199707, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1812482625246048, + "step": 19406 + }, + { + "epoch": 0.6065, + "grad_norm": 3.46875, + "grad_norm_var": 0.05781962076822917, + "learning_rate": 0.0001, + "loss": 5.8734, + "loss/crossentropy": 2.6485707759857178, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17404945939779282, + "step": 19408 + }, + { + "epoch": 0.6065625, + "grad_norm": 3.828125, + "grad_norm_var": 0.08401692708333333, + "learning_rate": 0.0001, + "loss": 5.8872, + "loss/crossentropy": 2.6375534534454346, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17340198904275894, + "step": 19410 + }, + { + "epoch": 0.606625, + "grad_norm": 2.9375, + "grad_norm_var": 0.08517964680989583, + "learning_rate": 0.0001, + "loss": 5.3536, + "loss/crossentropy": 2.3068161010742188, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1593640148639679, + "step": 19412 + }, + { + "epoch": 0.6066875, + "grad_norm": 3.609375, + "grad_norm_var": 0.08928629557291666, + "learning_rate": 0.0001, + "loss": 5.8966, + "loss/crossentropy": 2.6097759008407593, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1810266673564911, + "step": 19414 + }, + { + "epoch": 0.60675, + "grad_norm": 3.359375, + "grad_norm_var": 0.07962239583333333, + "learning_rate": 0.0001, + "loss": 5.9006, + "loss/crossentropy": 2.6326334476470947, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1760202795267105, + "step": 19416 + }, + { + "epoch": 0.6068125, + "grad_norm": 3.234375, + "grad_norm_var": 0.0674957275390625, + "learning_rate": 0.0001, + "loss": 5.6606, + "loss/crossentropy": 2.4949105978012085, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16539839655160904, + "step": 19418 + }, + { + "epoch": 0.606875, + "grad_norm": 2.84375, + "grad_norm_var": 0.07154947916666667, + "learning_rate": 0.0001, + "loss": 5.4326, + "loss/crossentropy": 2.4538732767105103, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15646998584270477, + "step": 19420 + }, + { + "epoch": 0.6069375, + "grad_norm": 3.015625, + "grad_norm_var": 0.06676025390625, + "learning_rate": 0.0001, + "loss": 6.0018, + "loss/crossentropy": 2.836829662322998, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16845224797725677, + "step": 19422 + }, + { + "epoch": 0.607, + "grad_norm": 3.03125, + "grad_norm_var": 0.0700836181640625, + "learning_rate": 0.0001, + "loss": 5.7035, + "loss/crossentropy": 2.5772018432617188, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16341350972652435, + "step": 19424 + }, + { + "epoch": 0.6070625, + "grad_norm": 3.046875, + "grad_norm_var": 0.03340555826822917, + "learning_rate": 0.0001, + "loss": 5.6656, + "loss/crossentropy": 2.574127674102783, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16305193305015564, + "step": 19426 + }, + { + "epoch": 0.607125, + "grad_norm": 3.015625, + "grad_norm_var": 0.03184305826822917, + "learning_rate": 0.0001, + "loss": 5.4449, + "loss/crossentropy": 2.4027289152145386, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1569523811340332, + "step": 19428 + }, + { + "epoch": 0.6071875, + "grad_norm": 3.796875, + "grad_norm_var": 0.04780171712239583, + "learning_rate": 0.0001, + "loss": 5.4921, + "loss/crossentropy": 2.4096295833587646, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16410991549491882, + "step": 19430 + }, + { + "epoch": 0.60725, + "grad_norm": 2.84375, + "grad_norm_var": 0.05351460774739583, + "learning_rate": 0.0001, + "loss": 5.413, + "loss/crossentropy": 2.3793755769729614, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1596129685640335, + "step": 19432 + }, + { + "epoch": 0.6073125, + "grad_norm": 2.90625, + "grad_norm_var": 0.05611979166666667, + "learning_rate": 0.0001, + "loss": 5.4802, + "loss/crossentropy": 2.4252909421920776, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.158619225025177, + "step": 19434 + }, + { + "epoch": 0.607375, + "grad_norm": 2.921875, + "grad_norm_var": 0.053563435872395836, + "learning_rate": 0.0001, + "loss": 5.6961, + "loss/crossentropy": 2.5794930458068848, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16634531319141388, + "step": 19436 + }, + { + "epoch": 0.6074375, + "grad_norm": 3.25, + "grad_norm_var": 0.063720703125, + "learning_rate": 0.0001, + "loss": 5.7954, + "loss/crossentropy": 2.6061872243881226, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16931024938821793, + "step": 19438 + }, + { + "epoch": 0.6075, + "grad_norm": 3.109375, + "grad_norm_var": 0.06134440104166667, + "learning_rate": 0.0001, + "loss": 5.8521, + "loss/crossentropy": 2.6193517446517944, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1736624613404274, + "step": 19440 + }, + { + "epoch": 0.6075625, + "grad_norm": 2.90625, + "grad_norm_var": 0.06603190104166666, + "learning_rate": 0.0001, + "loss": 5.6158, + "loss/crossentropy": 2.553231120109558, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15898773074150085, + "step": 19442 + }, + { + "epoch": 0.607625, + "grad_norm": 2.90625, + "grad_norm_var": 0.06636454264322916, + "learning_rate": 0.0001, + "loss": 5.4121, + "loss/crossentropy": 2.3391834497451782, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16275667399168015, + "step": 19444 + }, + { + "epoch": 0.6076875, + "grad_norm": 2.796875, + "grad_norm_var": 0.0384185791015625, + "learning_rate": 0.0001, + "loss": 5.2162, + "loss/crossentropy": 2.29744815826416, + "loss/hidden": 1.38671875, + "loss/jsd": 0.0, + "loss/logits": 0.15320461988449097, + "step": 19446 + }, + { + "epoch": 0.60775, + "grad_norm": 3.078125, + "grad_norm_var": 0.029442342122395833, + "learning_rate": 0.0001, + "loss": 5.8668, + "loss/crossentropy": 2.694766879081726, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16837353259325027, + "step": 19448 + }, + { + "epoch": 0.6078125, + "grad_norm": 3.359375, + "grad_norm_var": 0.032868448893229166, + "learning_rate": 0.0001, + "loss": 5.5795, + "loss/crossentropy": 2.4977762699127197, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1624685749411583, + "step": 19450 + }, + { + "epoch": 0.607875, + "grad_norm": 3.171875, + "grad_norm_var": 0.030887858072916666, + "learning_rate": 0.0001, + "loss": 5.5431, + "loss/crossentropy": 2.395267367362976, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1651710569858551, + "step": 19452 + }, + { + "epoch": 0.6079375, + "grad_norm": 3.203125, + "grad_norm_var": 0.0245269775390625, + "learning_rate": 0.0001, + "loss": 5.8255, + "loss/crossentropy": 2.6514869928359985, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17013683915138245, + "step": 19454 + }, + { + "epoch": 0.608, + "grad_norm": 2.984375, + "grad_norm_var": 0.0296051025390625, + "learning_rate": 0.0001, + "loss": 5.6925, + "loss/crossentropy": 2.543419122695923, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16607911884784698, + "step": 19456 + }, + { + "epoch": 0.6080625, + "grad_norm": 3.3125, + "grad_norm_var": 0.0296875, + "learning_rate": 0.0001, + "loss": 5.9382, + "loss/crossentropy": 2.7204233407974243, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1733415350317955, + "step": 19458 + }, + { + "epoch": 0.608125, + "grad_norm": 2.984375, + "grad_norm_var": 0.027860514322916665, + "learning_rate": 0.0001, + "loss": 5.8821, + "loss/crossentropy": 2.807587146759033, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1613534614443779, + "step": 19460 + }, + { + "epoch": 0.6081875, + "grad_norm": 3.015625, + "grad_norm_var": 0.021870930989583332, + "learning_rate": 0.0001, + "loss": 5.7142, + "loss/crossentropy": 2.572507381439209, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16690758615732193, + "step": 19462 + }, + { + "epoch": 0.60825, + "grad_norm": 3.375, + "grad_norm_var": 0.030028279622395834, + "learning_rate": 0.0001, + "loss": 5.808, + "loss/crossentropy": 2.6362345218658447, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.17264685779809952, + "step": 19464 + }, + { + "epoch": 0.6083125, + "grad_norm": 3.21875, + "grad_norm_var": 0.027000935872395833, + "learning_rate": 0.0001, + "loss": 5.8405, + "loss/crossentropy": 2.657623291015625, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1678960844874382, + "step": 19466 + }, + { + "epoch": 0.608375, + "grad_norm": 2.828125, + "grad_norm_var": 0.0315582275390625, + "learning_rate": 0.0001, + "loss": 5.5147, + "loss/crossentropy": 2.504341959953308, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15767860412597656, + "step": 19468 + }, + { + "epoch": 0.6084375, + "grad_norm": 3.515625, + "grad_norm_var": 0.036909993489583334, + "learning_rate": 0.0001, + "loss": 5.6847, + "loss/crossentropy": 2.452295660972595, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17011572420597076, + "step": 19470 + }, + { + "epoch": 0.6085, + "grad_norm": 3.34375, + "grad_norm_var": 0.03620503743489583, + "learning_rate": 0.0001, + "loss": 5.598, + "loss/crossentropy": 2.4428714513778687, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1705893874168396, + "step": 19472 + }, + { + "epoch": 0.6085625, + "grad_norm": 3.125, + "grad_norm_var": 0.036116536458333334, + "learning_rate": 0.0001, + "loss": 5.6301, + "loss/crossentropy": 2.5319563150405884, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16254442185163498, + "step": 19474 + }, + { + "epoch": 0.608625, + "grad_norm": 3.015625, + "grad_norm_var": 0.04063212076822917, + "learning_rate": 0.0001, + "loss": 5.5087, + "loss/crossentropy": 2.482611656188965, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16003308445215225, + "step": 19476 + }, + { + "epoch": 0.6086875, + "grad_norm": 3.359375, + "grad_norm_var": 0.047379557291666666, + "learning_rate": 0.0001, + "loss": 5.677, + "loss/crossentropy": 2.4080413579940796, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1710318624973297, + "step": 19478 + }, + { + "epoch": 0.60875, + "grad_norm": 3.171875, + "grad_norm_var": 0.03925679524739583, + "learning_rate": 0.0001, + "loss": 5.915, + "loss/crossentropy": 2.721635103225708, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17129214107990265, + "step": 19480 + }, + { + "epoch": 0.6088125, + "grad_norm": 3.484375, + "grad_norm_var": 0.046773274739583336, + "learning_rate": 0.0001, + "loss": 5.8674, + "loss/crossentropy": 2.635382056236267, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1735890507698059, + "step": 19482 + }, + { + "epoch": 0.608875, + "grad_norm": 3.265625, + "grad_norm_var": 0.04179585774739583, + "learning_rate": 0.0001, + "loss": 5.649, + "loss/crossentropy": 2.55346143245697, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16268158704042435, + "step": 19484 + }, + { + "epoch": 0.6089375, + "grad_norm": 3.015625, + "grad_norm_var": 0.03560791015625, + "learning_rate": 0.0001, + "loss": 5.8329, + "loss/crossentropy": 2.656839370727539, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16994751244783401, + "step": 19486 + }, + { + "epoch": 0.609, + "grad_norm": 2.90625, + "grad_norm_var": 0.036839803059895836, + "learning_rate": 0.0001, + "loss": 5.7469, + "loss/crossentropy": 2.5906903743743896, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16874559968709946, + "step": 19488 + }, + { + "epoch": 0.6090625, + "grad_norm": 3.296875, + "grad_norm_var": 0.03396809895833333, + "learning_rate": 0.0001, + "loss": 5.9224, + "loss/crossentropy": 2.668254256248474, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17463570088148117, + "step": 19490 + }, + { + "epoch": 0.609125, + "grad_norm": 3.171875, + "grad_norm_var": 0.0238433837890625, + "learning_rate": 0.0001, + "loss": 5.596, + "loss/crossentropy": 2.4882742166519165, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16350454092025757, + "step": 19492 + }, + { + "epoch": 0.6091875, + "grad_norm": 3.328125, + "grad_norm_var": 0.0229644775390625, + "learning_rate": 0.0001, + "loss": 5.6077, + "loss/crossentropy": 2.4969935417175293, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16107328236103058, + "step": 19494 + }, + { + "epoch": 0.60925, + "grad_norm": 3.15625, + "grad_norm_var": 0.022981770833333335, + "learning_rate": 0.0001, + "loss": 5.8845, + "loss/crossentropy": 2.6997865438461304, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17042437195777893, + "step": 19496 + }, + { + "epoch": 0.6093125, + "grad_norm": 3.78125, + "grad_norm_var": 0.041162109375, + "learning_rate": 0.0001, + "loss": 5.8276, + "loss/crossentropy": 2.568711996078491, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17510397732257843, + "step": 19498 + }, + { + "epoch": 0.609375, + "grad_norm": 3.265625, + "grad_norm_var": 0.03935546875, + "learning_rate": 0.0001, + "loss": 5.4971, + "loss/crossentropy": 2.4446879625320435, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1568054035305977, + "step": 19500 + }, + { + "epoch": 0.6094375, + "grad_norm": 3.15625, + "grad_norm_var": 0.03697509765625, + "learning_rate": 0.0001, + "loss": 5.8709, + "loss/crossentropy": 2.6936148405075073, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16655559837818146, + "step": 19502 + }, + { + "epoch": 0.6095, + "grad_norm": 3.21875, + "grad_norm_var": 0.030192057291666668, + "learning_rate": 0.0001, + "loss": 5.5124, + "loss/crossentropy": 2.441460132598877, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16099561750888824, + "step": 19504 + }, + { + "epoch": 0.6095625, + "grad_norm": 3.171875, + "grad_norm_var": 0.029206339518229166, + "learning_rate": 0.0001, + "loss": 5.9482, + "loss/crossentropy": 2.732633948326111, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17429260909557343, + "step": 19506 + }, + { + "epoch": 0.609625, + "grad_norm": 3.203125, + "grad_norm_var": 0.028446451822916666, + "learning_rate": 0.0001, + "loss": 5.6926, + "loss/crossentropy": 2.580842614173889, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16430071741342545, + "step": 19508 + }, + { + "epoch": 0.6096875, + "grad_norm": 3.4375, + "grad_norm_var": 0.03033447265625, + "learning_rate": 0.0001, + "loss": 5.7759, + "loss/crossentropy": 2.5772647857666016, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17025792598724365, + "step": 19510 + }, + { + "epoch": 0.60975, + "grad_norm": 3.46875, + "grad_norm_var": 0.033707682291666666, + "learning_rate": 0.0001, + "loss": 5.4905, + "loss/crossentropy": 2.394649028778076, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.15880553424358368, + "step": 19512 + }, + { + "epoch": 0.6098125, + "grad_norm": 3.203125, + "grad_norm_var": 0.015620930989583334, + "learning_rate": 0.0001, + "loss": 5.4534, + "loss/crossentropy": 2.343334913253784, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1606125831604004, + "step": 19514 + }, + { + "epoch": 0.609875, + "grad_norm": 3.1875, + "grad_norm_var": 0.014481608072916667, + "learning_rate": 0.0001, + "loss": 5.4632, + "loss/crossentropy": 2.352441668510437, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16302981227636337, + "step": 19516 + }, + { + "epoch": 0.6099375, + "grad_norm": 3.015625, + "grad_norm_var": 0.016597493489583334, + "learning_rate": 0.0001, + "loss": 5.4485, + "loss/crossentropy": 2.38815176486969, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15720786154270172, + "step": 19518 + }, + { + "epoch": 0.61, + "grad_norm": 3.359375, + "grad_norm_var": 0.01773681640625, + "learning_rate": 0.0001, + "loss": 5.6109, + "loss/crossentropy": 2.381349205970764, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17256611585617065, + "step": 19520 + }, + { + "epoch": 0.6100625, + "grad_norm": 2.953125, + "grad_norm_var": 0.023810831705729167, + "learning_rate": 0.0001, + "loss": 5.4556, + "loss/crossentropy": 2.4127821922302246, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1589699685573578, + "step": 19522 + }, + { + "epoch": 0.610125, + "grad_norm": 3.203125, + "grad_norm_var": 0.024605305989583333, + "learning_rate": 0.0001, + "loss": 6.0556, + "loss/crossentropy": 2.782081723213196, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1761820837855339, + "step": 19524 + }, + { + "epoch": 0.6101875, + "grad_norm": 3.125, + "grad_norm_var": 0.0226715087890625, + "learning_rate": 0.0001, + "loss": 5.6567, + "loss/crossentropy": 2.574386239051819, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16018719971179962, + "step": 19526 + }, + { + "epoch": 0.61025, + "grad_norm": 3.625, + "grad_norm_var": 0.0297515869140625, + "learning_rate": 0.0001, + "loss": 5.7258, + "loss/crossentropy": 2.5339808464050293, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17152460664510727, + "step": 19528 + }, + { + "epoch": 0.6103125, + "grad_norm": 3.125, + "grad_norm_var": 0.0310211181640625, + "learning_rate": 0.0001, + "loss": 5.6892, + "loss/crossentropy": 2.562675952911377, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1689067929983139, + "step": 19530 + }, + { + "epoch": 0.610375, + "grad_norm": 3.0, + "grad_norm_var": 0.029833984375, + "learning_rate": 0.0001, + "loss": 5.506, + "loss/crossentropy": 2.4123008251190186, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1628851518034935, + "step": 19532 + }, + { + "epoch": 0.6104375, + "grad_norm": 3.078125, + "grad_norm_var": 0.0288482666015625, + "learning_rate": 0.0001, + "loss": 5.6306, + "loss/crossentropy": 2.4892683029174805, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16803808510303497, + "step": 19534 + }, + { + "epoch": 0.6105, + "grad_norm": 3.15625, + "grad_norm_var": 0.023856608072916667, + "learning_rate": 0.0001, + "loss": 5.7239, + "loss/crossentropy": 2.642979383468628, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1635577529668808, + "step": 19536 + }, + { + "epoch": 0.6105625, + "grad_norm": 3.1875, + "grad_norm_var": 0.02705078125, + "learning_rate": 0.0001, + "loss": 5.7529, + "loss/crossentropy": 2.5309290885925293, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17571166157722473, + "step": 19538 + }, + { + "epoch": 0.610625, + "grad_norm": 3.03125, + "grad_norm_var": 0.03396809895833333, + "learning_rate": 0.0001, + "loss": 5.7253, + "loss/crossentropy": 2.5540027618408203, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16908280551433563, + "step": 19540 + }, + { + "epoch": 0.6106875, + "grad_norm": 4.4375, + "grad_norm_var": 0.13192952473958333, + "learning_rate": 0.0001, + "loss": 6.0836, + "loss/crossentropy": 2.78000009059906, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1799660548567772, + "step": 19542 + }, + { + "epoch": 0.61075, + "grad_norm": 3.234375, + "grad_norm_var": 0.12216389973958333, + "learning_rate": 0.0001, + "loss": 5.4596, + "loss/crossentropy": 2.374886989593506, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1608191877603531, + "step": 19544 + }, + { + "epoch": 0.6108125, + "grad_norm": 3.265625, + "grad_norm_var": 0.11640625, + "learning_rate": 0.0001, + "loss": 5.5675, + "loss/crossentropy": 2.514106512069702, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15924306213855743, + "step": 19546 + }, + { + "epoch": 0.610875, + "grad_norm": 3.546875, + "grad_norm_var": 0.1162994384765625, + "learning_rate": 0.0001, + "loss": 5.8744, + "loss/crossentropy": 2.620888352394104, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17418219149112701, + "step": 19548 + }, + { + "epoch": 0.6109375, + "grad_norm": 3.125, + "grad_norm_var": 0.1143951416015625, + "learning_rate": 0.0001, + "loss": 5.7243, + "loss/crossentropy": 2.5614830255508423, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.17096445709466934, + "step": 19550 + }, + { + "epoch": 0.611, + "grad_norm": 3.015625, + "grad_norm_var": 0.12652587890625, + "learning_rate": 0.0001, + "loss": 5.2421, + "loss/crossentropy": 2.29048490524292, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.14828714728355408, + "step": 19552 + }, + { + "epoch": 0.6110625, + "grad_norm": 3.296875, + "grad_norm_var": 0.12499593098958334, + "learning_rate": 0.0001, + "loss": 6.0252, + "loss/crossentropy": 2.767704486846924, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1753620281815529, + "step": 19554 + }, + { + "epoch": 0.611125, + "grad_norm": 2.96875, + "grad_norm_var": 0.13213602701822916, + "learning_rate": 0.0001, + "loss": 5.7292, + "loss/crossentropy": 2.615246534347534, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16608589887619019, + "step": 19556 + }, + { + "epoch": 0.6111875, + "grad_norm": 3.125, + "grad_norm_var": 0.033600870768229166, + "learning_rate": 0.0001, + "loss": 5.8258, + "loss/crossentropy": 2.6491048336029053, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17039896547794342, + "step": 19558 + }, + { + "epoch": 0.61125, + "grad_norm": 3.03125, + "grad_norm_var": 0.03459879557291667, + "learning_rate": 0.0001, + "loss": 5.638, + "loss/crossentropy": 2.5442399978637695, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16132787615060806, + "step": 19560 + }, + { + "epoch": 0.6113125, + "grad_norm": 3.078125, + "grad_norm_var": 0.03526102701822917, + "learning_rate": 0.0001, + "loss": 5.4512, + "loss/crossentropy": 2.4072697162628174, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1598571613430977, + "step": 19562 + }, + { + "epoch": 0.611375, + "grad_norm": 3.171875, + "grad_norm_var": 0.025614420572916668, + "learning_rate": 0.0001, + "loss": 5.6302, + "loss/crossentropy": 2.457480549812317, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1664915680885315, + "step": 19564 + }, + { + "epoch": 0.6114375, + "grad_norm": 2.921875, + "grad_norm_var": 0.0220123291015625, + "learning_rate": 0.0001, + "loss": 5.7991, + "loss/crossentropy": 2.6218069791793823, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16812142729759216, + "step": 19566 + }, + { + "epoch": 0.6115, + "grad_norm": 3.34375, + "grad_norm_var": 0.020524088541666666, + "learning_rate": 0.0001, + "loss": 5.9472, + "loss/crossentropy": 2.6854790449142456, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17656263709068298, + "step": 19568 + }, + { + "epoch": 0.6115625, + "grad_norm": 3.171875, + "grad_norm_var": 0.01549072265625, + "learning_rate": 0.0001, + "loss": 5.8737, + "loss/crossentropy": 2.614681124687195, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17511707544326782, + "step": 19570 + }, + { + "epoch": 0.611625, + "grad_norm": 3.515625, + "grad_norm_var": 0.02037353515625, + "learning_rate": 0.0001, + "loss": 5.8561, + "loss/crossentropy": 2.5757195949554443, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1760895773768425, + "step": 19572 + }, + { + "epoch": 0.6116875, + "grad_norm": 3.0625, + "grad_norm_var": 0.024300130208333333, + "learning_rate": 0.0001, + "loss": 5.4319, + "loss/crossentropy": 2.2783864736557007, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16730409860610962, + "step": 19574 + }, + { + "epoch": 0.61175, + "grad_norm": 3.171875, + "grad_norm_var": 0.022948201497395834, + "learning_rate": 0.0001, + "loss": 5.9062, + "loss/crossentropy": 2.7144960165023804, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16956491768360138, + "step": 19576 + }, + { + "epoch": 0.6118125, + "grad_norm": 3.203125, + "grad_norm_var": 0.020210774739583333, + "learning_rate": 0.0001, + "loss": 5.4665, + "loss/crossentropy": 2.357220768928528, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16209646314382553, + "step": 19578 + }, + { + "epoch": 0.611875, + "grad_norm": 3.390625, + "grad_norm_var": 0.022005208333333335, + "learning_rate": 0.0001, + "loss": 6.0825, + "loss/crossentropy": 2.85296094417572, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.16904301196336746, + "step": 19580 + }, + { + "epoch": 0.6119375, + "grad_norm": 3.609375, + "grad_norm_var": 0.023631795247395834, + "learning_rate": 0.0001, + "loss": 5.735, + "loss/crossentropy": 2.5890785455703735, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1638134941458702, + "step": 19582 + }, + { + "epoch": 0.612, + "grad_norm": 3.15625, + "grad_norm_var": 0.024576822916666668, + "learning_rate": 0.0001, + "loss": 5.8232, + "loss/crossentropy": 2.627472996711731, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17074380815029144, + "step": 19584 + }, + { + "epoch": 0.6120625, + "grad_norm": 6.09375, + "grad_norm_var": 0.5181477864583334, + "learning_rate": 0.0001, + "loss": 5.7791, + "loss/crossentropy": 2.4958267211914062, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17597881704568863, + "step": 19586 + }, + { + "epoch": 0.612125, + "grad_norm": 3.28125, + "grad_norm_var": 0.5228352864583333, + "learning_rate": 0.0001, + "loss": 5.9044, + "loss/crossentropy": 2.673877239227295, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17265690118074417, + "step": 19588 + }, + { + "epoch": 0.6121875, + "grad_norm": 3.375, + "grad_norm_var": 0.5311848958333333, + "learning_rate": 0.0001, + "loss": 5.8418, + "loss/crossentropy": 2.652164936065674, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1720835268497467, + "step": 19590 + }, + { + "epoch": 0.61225, + "grad_norm": 2.96875, + "grad_norm_var": 0.5369099934895833, + "learning_rate": 0.0001, + "loss": 5.5793, + "loss/crossentropy": 2.507964611053467, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16065095365047455, + "step": 19592 + }, + { + "epoch": 0.6123125, + "grad_norm": 3.078125, + "grad_norm_var": 0.5409576416015625, + "learning_rate": 0.0001, + "loss": 5.7414, + "loss/crossentropy": 2.6736977100372314, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15911806374788284, + "step": 19594 + }, + { + "epoch": 0.612375, + "grad_norm": 3.609375, + "grad_norm_var": 0.5402333577473958, + "learning_rate": 0.0001, + "loss": 6.1378, + "loss/crossentropy": 2.7499048709869385, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.18254327028989792, + "step": 19596 + }, + { + "epoch": 0.6124375, + "grad_norm": 2.8125, + "grad_norm_var": 0.57115478515625, + "learning_rate": 0.0001, + "loss": 5.4608, + "loss/crossentropy": 2.487305521965027, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15203223377466202, + "step": 19598 + }, + { + "epoch": 0.6125, + "grad_norm": 3.078125, + "grad_norm_var": 0.5739095052083333, + "learning_rate": 0.0001, + "loss": 5.5002, + "loss/crossentropy": 2.383982539176941, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16123488545417786, + "step": 19600 + }, + { + "epoch": 0.6125625, + "grad_norm": 2.96875, + "grad_norm_var": 0.047215779622395836, + "learning_rate": 0.0001, + "loss": 5.7314, + "loss/crossentropy": 2.5770827531814575, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16816139966249466, + "step": 19602 + }, + { + "epoch": 0.612625, + "grad_norm": 3.171875, + "grad_norm_var": 0.05194905598958333, + "learning_rate": 0.0001, + "loss": 5.8007, + "loss/crossentropy": 2.6743478775024414, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16458549350500107, + "step": 19604 + }, + { + "epoch": 0.6126875, + "grad_norm": 2.671875, + "grad_norm_var": 0.06070556640625, + "learning_rate": 0.0001, + "loss": 5.4554, + "loss/crossentropy": 2.4700160026550293, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1551780253648758, + "step": 19606 + }, + { + "epoch": 0.61275, + "grad_norm": 3.25, + "grad_norm_var": 0.060770670572916664, + "learning_rate": 0.0001, + "loss": 5.5231, + "loss/crossentropy": 2.4005579948425293, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16577185690402985, + "step": 19608 + }, + { + "epoch": 0.6128125, + "grad_norm": 3.25, + "grad_norm_var": 0.06189778645833333, + "learning_rate": 0.0001, + "loss": 5.4559, + "loss/crossentropy": 2.332529306411743, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1662391945719719, + "step": 19610 + }, + { + "epoch": 0.612875, + "grad_norm": 2.953125, + "grad_norm_var": 0.04412434895833333, + "learning_rate": 0.0001, + "loss": 5.6385, + "loss/crossentropy": 2.499752402305603, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16113810241222382, + "step": 19612 + }, + { + "epoch": 0.6129375, + "grad_norm": 3.34375, + "grad_norm_var": 0.0521484375, + "learning_rate": 0.0001, + "loss": 5.2879, + "loss/crossentropy": 2.2084743976593018, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.161456897854805, + "step": 19614 + }, + { + "epoch": 0.613, + "grad_norm": 2.984375, + "grad_norm_var": 0.054011027018229164, + "learning_rate": 0.0001, + "loss": 5.3066, + "loss/crossentropy": 2.333135962486267, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.14734511077404022, + "step": 19616 + }, + { + "epoch": 0.6130625, + "grad_norm": 3.09375, + "grad_norm_var": 0.046826171875, + "learning_rate": 0.0001, + "loss": 5.5485, + "loss/crossentropy": 2.4865167140960693, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.16478903591632843, + "step": 19618 + }, + { + "epoch": 0.613125, + "grad_norm": 2.953125, + "grad_norm_var": 0.04421284993489583, + "learning_rate": 0.0001, + "loss": 5.7492, + "loss/crossentropy": 2.589370846748352, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16598235815763474, + "step": 19620 + }, + { + "epoch": 0.6131875, + "grad_norm": 3.21875, + "grad_norm_var": 0.0346099853515625, + "learning_rate": 0.0001, + "loss": 5.7248, + "loss/crossentropy": 2.5286474227905273, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16688184440135956, + "step": 19622 + }, + { + "epoch": 0.61325, + "grad_norm": 3.15625, + "grad_norm_var": 0.03664957682291667, + "learning_rate": 0.0001, + "loss": 6.0778, + "loss/crossentropy": 2.711007833480835, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18511471152305603, + "step": 19624 + }, + { + "epoch": 0.6133125, + "grad_norm": 3.5, + "grad_norm_var": 0.041071573893229164, + "learning_rate": 0.0001, + "loss": 5.8477, + "loss/crossentropy": 2.588347554206848, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17281106859445572, + "step": 19626 + }, + { + "epoch": 0.613375, + "grad_norm": 3.1875, + "grad_norm_var": 0.034089152018229166, + "learning_rate": 0.0001, + "loss": 6.1345, + "loss/crossentropy": 2.8718771934509277, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1770443245768547, + "step": 19628 + }, + { + "epoch": 0.6134375, + "grad_norm": 3.0625, + "grad_norm_var": 0.03790690104166667, + "learning_rate": 0.0001, + "loss": 5.5464, + "loss/crossentropy": 2.483364701271057, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1625508889555931, + "step": 19630 + }, + { + "epoch": 0.6135, + "grad_norm": 2.890625, + "grad_norm_var": 0.037919108072916666, + "learning_rate": 0.0001, + "loss": 5.6717, + "loss/crossentropy": 2.6128073930740356, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16135795414447784, + "step": 19632 + }, + { + "epoch": 0.6135625, + "grad_norm": 3.4375, + "grad_norm_var": 0.0441314697265625, + "learning_rate": 0.0001, + "loss": 5.9193, + "loss/crossentropy": 2.7125253677368164, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17106316983699799, + "step": 19634 + }, + { + "epoch": 0.613625, + "grad_norm": 3.265625, + "grad_norm_var": 0.0421783447265625, + "learning_rate": 0.0001, + "loss": 5.4595, + "loss/crossentropy": 2.4429343938827515, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15790626406669617, + "step": 19636 + }, + { + "epoch": 0.6136875, + "grad_norm": 3.296875, + "grad_norm_var": 0.03485921223958333, + "learning_rate": 0.0001, + "loss": 5.5245, + "loss/crossentropy": 2.5152522325515747, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1579592227935791, + "step": 19638 + }, + { + "epoch": 0.61375, + "grad_norm": 3.1875, + "grad_norm_var": 0.028902180989583335, + "learning_rate": 0.0001, + "loss": 5.4698, + "loss/crossentropy": 2.4052783250808716, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15957404673099518, + "step": 19640 + }, + { + "epoch": 0.6138125, + "grad_norm": 3.3125, + "grad_norm_var": 0.023631795247395834, + "learning_rate": 0.0001, + "loss": 5.5826, + "loss/crossentropy": 2.470545768737793, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1627647429704666, + "step": 19642 + }, + { + "epoch": 0.613875, + "grad_norm": 2.953125, + "grad_norm_var": 0.024559529622395833, + "learning_rate": 0.0001, + "loss": 5.8441, + "loss/crossentropy": 2.7033053636550903, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16798724234104156, + "step": 19644 + }, + { + "epoch": 0.6139375, + "grad_norm": 3.15625, + "grad_norm_var": 0.021284993489583334, + "learning_rate": 0.0001, + "loss": 5.8487, + "loss/crossentropy": 2.7112566232681274, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16882429271936417, + "step": 19646 + }, + { + "epoch": 0.614, + "grad_norm": 2.96875, + "grad_norm_var": 0.019872029622395832, + "learning_rate": 0.0001, + "loss": 5.363, + "loss/crossentropy": 2.3288105726242065, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16122935712337494, + "step": 19648 + }, + { + "epoch": 0.6140625, + "grad_norm": 3.265625, + "grad_norm_var": 0.014937337239583333, + "learning_rate": 0.0001, + "loss": 5.9147, + "loss/crossentropy": 2.6747225522994995, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17477816343307495, + "step": 19650 + }, + { + "epoch": 0.614125, + "grad_norm": 3.0625, + "grad_norm_var": 0.013752237955729166, + "learning_rate": 0.0001, + "loss": 5.7318, + "loss/crossentropy": 2.6186282634735107, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1675650253891945, + "step": 19652 + }, + { + "epoch": 0.6141875, + "grad_norm": 3.046875, + "grad_norm_var": 0.011970011393229167, + "learning_rate": 0.0001, + "loss": 5.6658, + "loss/crossentropy": 2.480633854866028, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16852006316184998, + "step": 19654 + }, + { + "epoch": 0.61425, + "grad_norm": 2.96875, + "grad_norm_var": 0.01298828125, + "learning_rate": 0.0001, + "loss": 5.4228, + "loss/crossentropy": 2.372126817703247, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15780331194400787, + "step": 19656 + }, + { + "epoch": 0.6143125, + "grad_norm": 3.125, + "grad_norm_var": 0.009406534830729167, + "learning_rate": 0.0001, + "loss": 5.9133, + "loss/crossentropy": 2.73931086063385, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1689605638384819, + "step": 19658 + }, + { + "epoch": 0.614375, + "grad_norm": 3.078125, + "grad_norm_var": 0.008821614583333333, + "learning_rate": 0.0001, + "loss": 5.4754, + "loss/crossentropy": 2.466127395629883, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15873520076274872, + "step": 19660 + }, + { + "epoch": 0.6144375, + "grad_norm": 2.875, + "grad_norm_var": 0.0175445556640625, + "learning_rate": 0.0001, + "loss": 5.8, + "loss/crossentropy": 2.5970553159713745, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16834472119808197, + "step": 19662 + }, + { + "epoch": 0.6145, + "grad_norm": 2.90625, + "grad_norm_var": 0.018583170572916665, + "learning_rate": 0.0001, + "loss": 5.4983, + "loss/crossentropy": 2.4853075742721558, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1555929183959961, + "step": 19664 + }, + { + "epoch": 0.6145625, + "grad_norm": 2.890625, + "grad_norm_var": 0.024739583333333332, + "learning_rate": 0.0001, + "loss": 5.8179, + "loss/crossentropy": 2.6911911964416504, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16422955691814423, + "step": 19666 + }, + { + "epoch": 0.614625, + "grad_norm": 3.25, + "grad_norm_var": 0.026851399739583334, + "learning_rate": 0.0001, + "loss": 5.5265, + "loss/crossentropy": 2.46109676361084, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16083980351686478, + "step": 19668 + }, + { + "epoch": 0.6146875, + "grad_norm": 3.421875, + "grad_norm_var": 0.036896769205729166, + "learning_rate": 0.0001, + "loss": 5.9072, + "loss/crossentropy": 2.6542869806289673, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17412163317203522, + "step": 19670 + }, + { + "epoch": 0.61475, + "grad_norm": 3.25, + "grad_norm_var": 0.0373687744140625, + "learning_rate": 0.0001, + "loss": 5.823, + "loss/crossentropy": 2.600062370300293, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1711195632815361, + "step": 19672 + }, + { + "epoch": 0.6148125, + "grad_norm": 3.125, + "grad_norm_var": 0.03710530598958333, + "learning_rate": 0.0001, + "loss": 5.551, + "loss/crossentropy": 2.4106554985046387, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16754940152168274, + "step": 19674 + }, + { + "epoch": 0.614875, + "grad_norm": 3.125, + "grad_norm_var": 0.0342681884765625, + "learning_rate": 0.0001, + "loss": 5.75, + "loss/crossentropy": 2.5887335538864136, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1688610017299652, + "step": 19676 + }, + { + "epoch": 0.6149375, + "grad_norm": 3.140625, + "grad_norm_var": 0.025715128580729166, + "learning_rate": 0.0001, + "loss": 5.8252, + "loss/crossentropy": 2.6288719177246094, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.17510154843330383, + "step": 19678 + }, + { + "epoch": 0.615, + "grad_norm": 3.0625, + "grad_norm_var": 0.027144368489583334, + "learning_rate": 0.0001, + "loss": 5.4602, + "loss/crossentropy": 2.4112237691879272, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1615402102470398, + "step": 19680 + }, + { + "epoch": 0.6150625, + "grad_norm": 3.0625, + "grad_norm_var": 0.021805826822916666, + "learning_rate": 0.0001, + "loss": 5.7387, + "loss/crossentropy": 2.6867090463638306, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1618417203426361, + "step": 19682 + }, + { + "epoch": 0.615125, + "grad_norm": 3.296875, + "grad_norm_var": 0.023053995768229165, + "learning_rate": 0.0001, + "loss": 5.6359, + "loss/crossentropy": 2.4958173036575317, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16439910233020782, + "step": 19684 + }, + { + "epoch": 0.6151875, + "grad_norm": 3.0, + "grad_norm_var": 0.025886027018229167, + "learning_rate": 0.0001, + "loss": 5.5855, + "loss/crossentropy": 2.500712513923645, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1639462411403656, + "step": 19686 + }, + { + "epoch": 0.61525, + "grad_norm": 3.1875, + "grad_norm_var": 0.022086588541666667, + "learning_rate": 0.0001, + "loss": 5.6299, + "loss/crossentropy": 2.445050597190857, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.168094664812088, + "step": 19688 + }, + { + "epoch": 0.6153125, + "grad_norm": 2.96875, + "grad_norm_var": 0.028873697916666666, + "learning_rate": 0.0001, + "loss": 5.48, + "loss/crossentropy": 2.4688230752944946, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1597161442041397, + "step": 19690 + }, + { + "epoch": 0.615375, + "grad_norm": 3.5625, + "grad_norm_var": 0.054076131184895834, + "learning_rate": 0.0001, + "loss": 5.9437, + "loss/crossentropy": 2.6283657550811768, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18075451254844666, + "step": 19692 + }, + { + "epoch": 0.6154375, + "grad_norm": 3.234375, + "grad_norm_var": 0.05478108723958333, + "learning_rate": 0.0001, + "loss": 5.7547, + "loss/crossentropy": 2.62972092628479, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1636713668704033, + "step": 19694 + }, + { + "epoch": 0.6155, + "grad_norm": 3.21875, + "grad_norm_var": 0.05471598307291667, + "learning_rate": 0.0001, + "loss": 6.1012, + "loss/crossentropy": 2.7816812992095947, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18156083673238754, + "step": 19696 + }, + { + "epoch": 0.6155625, + "grad_norm": 2.9375, + "grad_norm_var": 0.052571614583333336, + "learning_rate": 0.0001, + "loss": 5.6343, + "loss/crossentropy": 2.5516425371170044, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1590462327003479, + "step": 19698 + }, + { + "epoch": 0.615625, + "grad_norm": 2.921875, + "grad_norm_var": 0.05571187337239583, + "learning_rate": 0.0001, + "loss": 5.8999, + "loss/crossentropy": 2.6493369340896606, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17662305384874344, + "step": 19700 + }, + { + "epoch": 0.6156875, + "grad_norm": 3.203125, + "grad_norm_var": 0.04784749348958333, + "learning_rate": 0.0001, + "loss": 5.7, + "loss/crossentropy": 2.556334614753723, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16944894194602966, + "step": 19702 + }, + { + "epoch": 0.61575, + "grad_norm": 3.140625, + "grad_norm_var": 0.05321858723958333, + "learning_rate": 0.0001, + "loss": 5.7652, + "loss/crossentropy": 2.6158326864242554, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1704033985733986, + "step": 19704 + }, + { + "epoch": 0.6158125, + "grad_norm": 3.3125, + "grad_norm_var": 0.05077718098958333, + "learning_rate": 0.0001, + "loss": 5.587, + "loss/crossentropy": 2.5270044803619385, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1626434177160263, + "step": 19706 + }, + { + "epoch": 0.615875, + "grad_norm": 2.828125, + "grad_norm_var": 0.036530558268229166, + "learning_rate": 0.0001, + "loss": 5.2316, + "loss/crossentropy": 2.3332518339157104, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.14608360826969147, + "step": 19708 + }, + { + "epoch": 0.6159375, + "grad_norm": 3.015625, + "grad_norm_var": 0.03720296223958333, + "learning_rate": 0.0001, + "loss": 5.8753, + "loss/crossentropy": 2.7151904106140137, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.17147938162088394, + "step": 19710 + }, + { + "epoch": 0.616, + "grad_norm": 3.265625, + "grad_norm_var": 0.032307942708333336, + "learning_rate": 0.0001, + "loss": 5.6265, + "loss/crossentropy": 2.464982032775879, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16693414747714996, + "step": 19712 + }, + { + "epoch": 0.6160625, + "grad_norm": 2.921875, + "grad_norm_var": 0.031232706705729165, + "learning_rate": 0.0001, + "loss": 5.4135, + "loss/crossentropy": 2.3861416578292847, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1574278026819229, + "step": 19714 + }, + { + "epoch": 0.616125, + "grad_norm": 3.09375, + "grad_norm_var": 0.0286529541015625, + "learning_rate": 0.0001, + "loss": 5.4291, + "loss/crossentropy": 2.429054617881775, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15625176578760147, + "step": 19716 + }, + { + "epoch": 0.6161875, + "grad_norm": 3.109375, + "grad_norm_var": 0.04107666015625, + "learning_rate": 0.0001, + "loss": 5.8131, + "loss/crossentropy": 2.645476818084717, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1687166914343834, + "step": 19718 + }, + { + "epoch": 0.61625, + "grad_norm": 3.28125, + "grad_norm_var": 0.0397857666015625, + "learning_rate": 0.0001, + "loss": 5.9102, + "loss/crossentropy": 2.7409597635269165, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1696610003709793, + "step": 19720 + }, + { + "epoch": 0.6163125, + "grad_norm": 3.3125, + "grad_norm_var": 0.03638916015625, + "learning_rate": 0.0001, + "loss": 5.7133, + "loss/crossentropy": 2.5925413370132446, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16637563705444336, + "step": 19722 + }, + { + "epoch": 0.616375, + "grad_norm": 3.203125, + "grad_norm_var": 0.0343658447265625, + "learning_rate": 0.0001, + "loss": 5.7412, + "loss/crossentropy": 2.5285887718200684, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17126226425170898, + "step": 19724 + }, + { + "epoch": 0.6164375, + "grad_norm": 3.046875, + "grad_norm_var": 0.033447265625, + "learning_rate": 0.0001, + "loss": 5.5655, + "loss/crossentropy": 2.4696013927459717, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16349675506353378, + "step": 19726 + }, + { + "epoch": 0.6165, + "grad_norm": 3.046875, + "grad_norm_var": 0.03194071451822917, + "learning_rate": 0.0001, + "loss": 5.5142, + "loss/crossentropy": 2.450305223464966, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16224515438079834, + "step": 19728 + }, + { + "epoch": 0.6165625, + "grad_norm": 3.0625, + "grad_norm_var": 0.028092447916666666, + "learning_rate": 0.0001, + "loss": 5.632, + "loss/crossentropy": 2.5135504007339478, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1649661585688591, + "step": 19730 + }, + { + "epoch": 0.616625, + "grad_norm": 3.515625, + "grad_norm_var": 0.03697509765625, + "learning_rate": 0.0001, + "loss": 5.2826, + "loss/crossentropy": 2.243741989135742, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1531006470322609, + "step": 19732 + }, + { + "epoch": 0.6166875, + "grad_norm": 3.0625, + "grad_norm_var": 0.0291656494140625, + "learning_rate": 0.0001, + "loss": 6.0837, + "loss/crossentropy": 2.8326183557510376, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.175108902156353, + "step": 19734 + }, + { + "epoch": 0.61675, + "grad_norm": 2.953125, + "grad_norm_var": 0.0334625244140625, + "learning_rate": 0.0001, + "loss": 5.5365, + "loss/crossentropy": 2.565299153327942, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15649481862783432, + "step": 19736 + }, + { + "epoch": 0.6168125, + "grad_norm": 3.171875, + "grad_norm_var": 0.033154296875, + "learning_rate": 0.0001, + "loss": 5.5647, + "loss/crossentropy": 2.5372307300567627, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15743693709373474, + "step": 19738 + }, + { + "epoch": 0.616875, + "grad_norm": 3.046875, + "grad_norm_var": 0.024186197916666666, + "learning_rate": 0.0001, + "loss": 5.5928, + "loss/crossentropy": 2.488158345222473, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16085833311080933, + "step": 19740 + }, + { + "epoch": 0.6169375, + "grad_norm": 3.171875, + "grad_norm_var": 0.024901326497395834, + "learning_rate": 0.0001, + "loss": 5.7667, + "loss/crossentropy": 2.528436779975891, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17421747744083405, + "step": 19742 + }, + { + "epoch": 0.617, + "grad_norm": 3.15625, + "grad_norm_var": 0.025321451822916667, + "learning_rate": 0.0001, + "loss": 5.5162, + "loss/crossentropy": 2.4038294553756714, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16163229942321777, + "step": 19744 + }, + { + "epoch": 0.6170625, + "grad_norm": 3.15625, + "grad_norm_var": 0.026692708333333332, + "learning_rate": 0.0001, + "loss": 5.6065, + "loss/crossentropy": 2.4743977785110474, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16477020829916, + "step": 19746 + }, + { + "epoch": 0.617125, + "grad_norm": 2.953125, + "grad_norm_var": 0.015559895833333334, + "learning_rate": 0.0001, + "loss": 5.325, + "loss/crossentropy": 2.3060061931610107, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.150724358856678, + "step": 19748 + }, + { + "epoch": 0.6171875, + "grad_norm": 3.21875, + "grad_norm_var": 0.019986979166666665, + "learning_rate": 0.0001, + "loss": 5.96, + "loss/crossentropy": 2.808081030845642, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16831664741039276, + "step": 19750 + }, + { + "epoch": 0.61725, + "grad_norm": 3.203125, + "grad_norm_var": 0.020140584309895834, + "learning_rate": 0.0001, + "loss": 5.5748, + "loss/crossentropy": 2.39535653591156, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.16364478319883347, + "step": 19752 + }, + { + "epoch": 0.6173125, + "grad_norm": 3.109375, + "grad_norm_var": 0.032957967122395834, + "learning_rate": 0.0001, + "loss": 5.7134, + "loss/crossentropy": 2.51510488986969, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16748429089784622, + "step": 19754 + }, + { + "epoch": 0.617375, + "grad_norm": 3.234375, + "grad_norm_var": 0.03564046223958333, + "learning_rate": 0.0001, + "loss": 5.7733, + "loss/crossentropy": 2.5583337545394897, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.16798176616430283, + "step": 19756 + }, + { + "epoch": 0.6174375, + "grad_norm": 3.203125, + "grad_norm_var": 0.03906962076822917, + "learning_rate": 0.0001, + "loss": 5.4938, + "loss/crossentropy": 2.3841590881347656, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16447947174310684, + "step": 19758 + }, + { + "epoch": 0.6175, + "grad_norm": 3.15625, + "grad_norm_var": 0.040095011393229164, + "learning_rate": 0.0001, + "loss": 5.9566, + "loss/crossentropy": 2.7379831075668335, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1753723919391632, + "step": 19760 + }, + { + "epoch": 0.6175625, + "grad_norm": 2.890625, + "grad_norm_var": 0.04627176920572917, + "learning_rate": 0.0001, + "loss": 5.475, + "loss/crossentropy": 2.390742301940918, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15959543734788895, + "step": 19762 + }, + { + "epoch": 0.617625, + "grad_norm": 3.109375, + "grad_norm_var": 0.044661458333333334, + "learning_rate": 0.0001, + "loss": 5.532, + "loss/crossentropy": 2.553749918937683, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1540743187069893, + "step": 19764 + }, + { + "epoch": 0.6176875, + "grad_norm": 3.078125, + "grad_norm_var": 0.03704020182291667, + "learning_rate": 0.0001, + "loss": 5.752, + "loss/crossentropy": 2.596306085586548, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1663486734032631, + "step": 19766 + }, + { + "epoch": 0.61775, + "grad_norm": 3.296875, + "grad_norm_var": 0.037531534830729164, + "learning_rate": 0.0001, + "loss": 5.9308, + "loss/crossentropy": 2.742679238319397, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1707676202058792, + "step": 19768 + }, + { + "epoch": 0.6178125, + "grad_norm": 3.40625, + "grad_norm_var": 0.0276519775390625, + "learning_rate": 0.0001, + "loss": 5.6956, + "loss/crossentropy": 2.5400915145874023, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1655483990907669, + "step": 19770 + }, + { + "epoch": 0.617875, + "grad_norm": 3.484375, + "grad_norm_var": 0.028251139322916667, + "learning_rate": 0.0001, + "loss": 6.0722, + "loss/crossentropy": 2.741970181465149, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18067777156829834, + "step": 19772 + }, + { + "epoch": 0.6179375, + "grad_norm": 3.3125, + "grad_norm_var": 0.028743489583333334, + "learning_rate": 0.0001, + "loss": 5.9508, + "loss/crossentropy": 2.698033928871155, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17800699174404144, + "step": 19774 + }, + { + "epoch": 0.618, + "grad_norm": 3.296875, + "grad_norm_var": 0.025121053059895832, + "learning_rate": 0.0001, + "loss": 6.1399, + "loss/crossentropy": 2.8511316776275635, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.178090900182724, + "step": 19776 + }, + { + "epoch": 0.6180625, + "grad_norm": 3.09375, + "grad_norm_var": 0.024128214518229166, + "learning_rate": 0.0001, + "loss": 5.1792, + "loss/crossentropy": 2.223225235939026, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15301509201526642, + "step": 19778 + }, + { + "epoch": 0.618125, + "grad_norm": 3.0625, + "grad_norm_var": 0.02291259765625, + "learning_rate": 0.0001, + "loss": 5.7736, + "loss/crossentropy": 2.5856436491012573, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16958042234182358, + "step": 19780 + }, + { + "epoch": 0.6181875, + "grad_norm": 3.578125, + "grad_norm_var": 0.03052978515625, + "learning_rate": 0.0001, + "loss": 5.6399, + "loss/crossentropy": 2.5239371061325073, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16628500074148178, + "step": 19782 + }, + { + "epoch": 0.61825, + "grad_norm": 2.96875, + "grad_norm_var": 0.037450154622395836, + "learning_rate": 0.0001, + "loss": 5.5843, + "loss/crossentropy": 2.4954280853271484, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16435730457305908, + "step": 19784 + }, + { + "epoch": 0.6183125, + "grad_norm": 3.15625, + "grad_norm_var": 0.03564046223958333, + "learning_rate": 0.0001, + "loss": 5.9371, + "loss/crossentropy": 2.6663039922714233, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17590922117233276, + "step": 19786 + }, + { + "epoch": 0.618375, + "grad_norm": 2.84375, + "grad_norm_var": 0.0433258056640625, + "learning_rate": 0.0001, + "loss": 5.8278, + "loss/crossentropy": 2.6265984773635864, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17128776758909225, + "step": 19788 + }, + { + "epoch": 0.6184375, + "grad_norm": 3.5625, + "grad_norm_var": 0.07093098958333334, + "learning_rate": 0.0001, + "loss": 5.8409, + "loss/crossentropy": 2.5903788805007935, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17270878702402115, + "step": 19790 + }, + { + "epoch": 0.6185, + "grad_norm": 2.78125, + "grad_norm_var": 0.08190816243489583, + "learning_rate": 0.0001, + "loss": 5.3253, + "loss/crossentropy": 2.3577980995178223, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.14909353852272034, + "step": 19792 + }, + { + "epoch": 0.6185625, + "grad_norm": 3.078125, + "grad_norm_var": 0.0795562744140625, + "learning_rate": 0.0001, + "loss": 5.9199, + "loss/crossentropy": 2.7423676252365112, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16814511269330978, + "step": 19794 + }, + { + "epoch": 0.618625, + "grad_norm": 2.96875, + "grad_norm_var": 0.080517578125, + "learning_rate": 0.0001, + "loss": 5.51, + "loss/crossentropy": 2.4029040336608887, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16579217463731766, + "step": 19796 + }, + { + "epoch": 0.6186875, + "grad_norm": 3.25, + "grad_norm_var": 0.07124735514322916, + "learning_rate": 0.0001, + "loss": 5.8219, + "loss/crossentropy": 2.6036036014556885, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17144262790679932, + "step": 19798 + }, + { + "epoch": 0.61875, + "grad_norm": 3.203125, + "grad_norm_var": 0.06711324055989583, + "learning_rate": 0.0001, + "loss": 5.8348, + "loss/crossentropy": 2.702506184577942, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16400794684886932, + "step": 19800 + }, + { + "epoch": 0.6188125, + "grad_norm": 2.921875, + "grad_norm_var": 0.07254130045572917, + "learning_rate": 0.0001, + "loss": 5.8127, + "loss/crossentropy": 2.6946845054626465, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16531235724687576, + "step": 19802 + }, + { + "epoch": 0.618875, + "grad_norm": 3.28125, + "grad_norm_var": 0.061335245768229164, + "learning_rate": 0.0001, + "loss": 5.5924, + "loss/crossentropy": 2.483719825744629, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1651698350906372, + "step": 19804 + }, + { + "epoch": 0.6189375, + "grad_norm": 3.203125, + "grad_norm_var": 0.0256744384765625, + "learning_rate": 0.0001, + "loss": 5.5088, + "loss/crossentropy": 2.47348690032959, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15900491923093796, + "step": 19806 + }, + { + "epoch": 0.619, + "grad_norm": 3.46875, + "grad_norm_var": 0.03137613932291667, + "learning_rate": 0.0001, + "loss": 6.1446, + "loss/crossentropy": 2.730509877204895, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.18399201333522797, + "step": 19808 + }, + { + "epoch": 0.6190625, + "grad_norm": 3.140625, + "grad_norm_var": 0.030394490559895834, + "learning_rate": 0.0001, + "loss": 5.4351, + "loss/crossentropy": 2.3951356410980225, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16024568676948547, + "step": 19810 + }, + { + "epoch": 0.619125, + "grad_norm": 2.90625, + "grad_norm_var": 0.0328765869140625, + "learning_rate": 0.0001, + "loss": 5.8967, + "loss/crossentropy": 2.689175009727478, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1727081760764122, + "step": 19812 + }, + { + "epoch": 0.6191875, + "grad_norm": 3.0625, + "grad_norm_var": 0.03483784993489583, + "learning_rate": 0.0001, + "loss": 5.7387, + "loss/crossentropy": 2.6214596033096313, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16562539339065552, + "step": 19814 + }, + { + "epoch": 0.61925, + "grad_norm": 3.015625, + "grad_norm_var": 0.03827718098958333, + "learning_rate": 0.0001, + "loss": 5.4334, + "loss/crossentropy": 2.4262090921401978, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15345712006092072, + "step": 19816 + }, + { + "epoch": 0.6193125, + "grad_norm": 3.28125, + "grad_norm_var": 0.03860270182291667, + "learning_rate": 0.0001, + "loss": 5.8901, + "loss/crossentropy": 2.733052372932434, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16805174201726913, + "step": 19818 + }, + { + "epoch": 0.619375, + "grad_norm": 3.125, + "grad_norm_var": 0.0427886962890625, + "learning_rate": 0.0001, + "loss": 5.5381, + "loss/crossentropy": 2.5061721801757812, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1559237688779831, + "step": 19820 + }, + { + "epoch": 0.6194375, + "grad_norm": 3.4375, + "grad_norm_var": 0.058154296875, + "learning_rate": 0.0001, + "loss": 5.5695, + "loss/crossentropy": 2.4504482746124268, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16463561356067657, + "step": 19822 + }, + { + "epoch": 0.6195, + "grad_norm": 3.328125, + "grad_norm_var": 0.045068359375, + "learning_rate": 0.0001, + "loss": 5.9245, + "loss/crossentropy": 2.700040817260742, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1732223555445671, + "step": 19824 + }, + { + "epoch": 0.6195625, + "grad_norm": 3.171875, + "grad_norm_var": 0.0473052978515625, + "learning_rate": 0.0001, + "loss": 5.6389, + "loss/crossentropy": 2.5035756826400757, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16705185174942017, + "step": 19826 + }, + { + "epoch": 0.619625, + "grad_norm": 3.21875, + "grad_norm_var": 0.0452545166015625, + "learning_rate": 0.0001, + "loss": 5.7924, + "loss/crossentropy": 2.6258389949798584, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1697773039340973, + "step": 19828 + }, + { + "epoch": 0.6196875, + "grad_norm": 3.0625, + "grad_norm_var": 0.0395172119140625, + "learning_rate": 0.0001, + "loss": 5.7562, + "loss/crossentropy": 2.574925184249878, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.170865960419178, + "step": 19830 + }, + { + "epoch": 0.61975, + "grad_norm": 3.484375, + "grad_norm_var": 0.0346099853515625, + "learning_rate": 0.0001, + "loss": 5.5627, + "loss/crossentropy": 2.390936851501465, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16834847629070282, + "step": 19832 + }, + { + "epoch": 0.6198125, + "grad_norm": 3.28125, + "grad_norm_var": 0.0341796875, + "learning_rate": 0.0001, + "loss": 5.7697, + "loss/crossentropy": 2.6261430978775024, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16592053323984146, + "step": 19834 + }, + { + "epoch": 0.619875, + "grad_norm": 3.34375, + "grad_norm_var": 0.02789306640625, + "learning_rate": 0.0001, + "loss": 5.6812, + "loss/crossentropy": 2.529661774635315, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.17023325711488724, + "step": 19836 + }, + { + "epoch": 0.6199375, + "grad_norm": 2.671875, + "grad_norm_var": 0.03790690104166667, + "learning_rate": 0.0001, + "loss": 5.6216, + "loss/crossentropy": 2.5542296171188354, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16259898990392685, + "step": 19838 + }, + { + "epoch": 0.62, + "grad_norm": 3.328125, + "grad_norm_var": 0.053446451822916664, + "learning_rate": 0.0001, + "loss": 5.7378, + "loss/crossentropy": 2.6376454830169678, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1635262370109558, + "step": 19840 + }, + { + "epoch": 0.6200625, + "grad_norm": 3.40625, + "grad_norm_var": 0.060480753580729164, + "learning_rate": 0.0001, + "loss": 5.6424, + "loss/crossentropy": 2.4937418699264526, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16760383546352386, + "step": 19842 + }, + { + "epoch": 0.620125, + "grad_norm": 3.25, + "grad_norm_var": 0.06047261555989583, + "learning_rate": 0.0001, + "loss": 5.5447, + "loss/crossentropy": 2.4872806072235107, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15809065103530884, + "step": 19844 + }, + { + "epoch": 0.6201875, + "grad_norm": 3.09375, + "grad_norm_var": 0.0618316650390625, + "learning_rate": 0.0001, + "loss": 5.5921, + "loss/crossentropy": 2.5492920875549316, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15701929479837418, + "step": 19846 + }, + { + "epoch": 0.62025, + "grad_norm": 2.890625, + "grad_norm_var": 0.05341389973958333, + "learning_rate": 0.0001, + "loss": 5.5839, + "loss/crossentropy": 2.480614185333252, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16267366707324982, + "step": 19848 + }, + { + "epoch": 0.6203125, + "grad_norm": 2.921875, + "grad_norm_var": 0.05015869140625, + "learning_rate": 0.0001, + "loss": 5.5384, + "loss/crossentropy": 2.5023187398910522, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15985383093357086, + "step": 19850 + }, + { + "epoch": 0.620375, + "grad_norm": 2.984375, + "grad_norm_var": 0.05698954264322917, + "learning_rate": 0.0001, + "loss": 5.5993, + "loss/crossentropy": 2.4145087003707886, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16769680380821228, + "step": 19852 + }, + { + "epoch": 0.6204375, + "grad_norm": 2.90625, + "grad_norm_var": 0.049925740559895834, + "learning_rate": 0.0001, + "loss": 5.6951, + "loss/crossentropy": 2.605020761489868, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16486431658267975, + "step": 19854 + }, + { + "epoch": 0.6205, + "grad_norm": 3.265625, + "grad_norm_var": 0.03661702473958333, + "learning_rate": 0.0001, + "loss": 5.9839, + "loss/crossentropy": 2.7632559537887573, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1740158125758171, + "step": 19856 + }, + { + "epoch": 0.6205625, + "grad_norm": 3.046875, + "grad_norm_var": 0.026741536458333333, + "learning_rate": 0.0001, + "loss": 5.6185, + "loss/crossentropy": 2.530821681022644, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1599380522966385, + "step": 19858 + }, + { + "epoch": 0.620625, + "grad_norm": 3.109375, + "grad_norm_var": 0.024983723958333332, + "learning_rate": 0.0001, + "loss": 5.6764, + "loss/crossentropy": 2.5637917518615723, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1624293476343155, + "step": 19860 + }, + { + "epoch": 0.6206875, + "grad_norm": 3.015625, + "grad_norm_var": 0.025130208333333334, + "learning_rate": 0.0001, + "loss": 5.6027, + "loss/crossentropy": 2.503827214241028, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16106384992599487, + "step": 19862 + }, + { + "epoch": 0.62075, + "grad_norm": 3.109375, + "grad_norm_var": 0.0257720947265625, + "learning_rate": 0.0001, + "loss": 5.6813, + "loss/crossentropy": 2.6267412900924683, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16248583793640137, + "step": 19864 + }, + { + "epoch": 0.6208125, + "grad_norm": 3.109375, + "grad_norm_var": 0.0237945556640625, + "learning_rate": 0.0001, + "loss": 5.7132, + "loss/crossentropy": 2.620882511138916, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16196326166391373, + "step": 19866 + }, + { + "epoch": 0.620875, + "grad_norm": 2.984375, + "grad_norm_var": 0.014207967122395833, + "learning_rate": 0.0001, + "loss": 5.5756, + "loss/crossentropy": 2.514933943748474, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15919151902198792, + "step": 19868 + }, + { + "epoch": 0.6209375, + "grad_norm": 3.1875, + "grad_norm_var": 0.011937459309895834, + "learning_rate": 0.0001, + "loss": 5.509, + "loss/crossentropy": 2.417167901992798, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16543804854154587, + "step": 19870 + }, + { + "epoch": 0.621, + "grad_norm": 2.84375, + "grad_norm_var": 0.0136383056640625, + "learning_rate": 0.0001, + "loss": 5.5005, + "loss/crossentropy": 2.4885376691818237, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15823006629943848, + "step": 19872 + }, + { + "epoch": 0.6210625, + "grad_norm": 3.125, + "grad_norm_var": 0.013801066080729167, + "learning_rate": 0.0001, + "loss": 5.8575, + "loss/crossentropy": 2.6915894746780396, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.17362503707408905, + "step": 19874 + }, + { + "epoch": 0.621125, + "grad_norm": 3.15625, + "grad_norm_var": 0.041239420572916664, + "learning_rate": 0.0001, + "loss": 5.6771, + "loss/crossentropy": 2.500933289527893, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17113670706748962, + "step": 19876 + }, + { + "epoch": 0.6211875, + "grad_norm": 3.3125, + "grad_norm_var": 0.044417317708333334, + "learning_rate": 0.0001, + "loss": 5.5976, + "loss/crossentropy": 2.4560399055480957, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16571440547704697, + "step": 19878 + }, + { + "epoch": 0.62125, + "grad_norm": 2.984375, + "grad_norm_var": 0.04436848958333333, + "learning_rate": 0.0001, + "loss": 5.732, + "loss/crossentropy": 2.667479157447815, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16348617523908615, + "step": 19880 + }, + { + "epoch": 0.6213125, + "grad_norm": 3.328125, + "grad_norm_var": 0.04750874837239583, + "learning_rate": 0.0001, + "loss": 5.7578, + "loss/crossentropy": 2.521763563156128, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17164885997772217, + "step": 19882 + }, + { + "epoch": 0.621375, + "grad_norm": 3.15625, + "grad_norm_var": 0.046605428059895836, + "learning_rate": 0.0001, + "loss": 5.4672, + "loss/crossentropy": 2.3879551887512207, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16104530543088913, + "step": 19884 + }, + { + "epoch": 0.6214375, + "grad_norm": 3.015625, + "grad_norm_var": 0.0566314697265625, + "learning_rate": 0.0001, + "loss": 5.4134, + "loss/crossentropy": 2.279152035713196, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16108424216508865, + "step": 19886 + }, + { + "epoch": 0.6215, + "grad_norm": 2.953125, + "grad_norm_var": 0.04547119140625, + "learning_rate": 0.0001, + "loss": 5.7432, + "loss/crossentropy": 2.5680145025253296, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16946741938591003, + "step": 19888 + }, + { + "epoch": 0.6215625, + "grad_norm": 3.3125, + "grad_norm_var": 0.0491851806640625, + "learning_rate": 0.0001, + "loss": 5.8043, + "loss/crossentropy": 2.530237555503845, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17584840953350067, + "step": 19890 + }, + { + "epoch": 0.621625, + "grad_norm": 2.953125, + "grad_norm_var": 0.03766276041666667, + "learning_rate": 0.0001, + "loss": 5.6202, + "loss/crossentropy": 2.5808955430984497, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1578410044312477, + "step": 19892 + }, + { + "epoch": 0.6216875, + "grad_norm": 2.96875, + "grad_norm_var": 0.038960774739583336, + "learning_rate": 0.0001, + "loss": 5.8825, + "loss/crossentropy": 2.767555594444275, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16266612708568573, + "step": 19894 + }, + { + "epoch": 0.62175, + "grad_norm": 3.34375, + "grad_norm_var": 0.037886555989583334, + "learning_rate": 0.0001, + "loss": 5.9589, + "loss/crossentropy": 2.7096651792526245, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17609944194555283, + "step": 19896 + }, + { + "epoch": 0.6218125, + "grad_norm": 2.875, + "grad_norm_var": 0.04159749348958333, + "learning_rate": 0.0001, + "loss": 5.1233, + "loss/crossentropy": 2.2056552171707153, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.14371608197689056, + "step": 19898 + }, + { + "epoch": 0.621875, + "grad_norm": 3.046875, + "grad_norm_var": 0.0403228759765625, + "learning_rate": 0.0001, + "loss": 5.8057, + "loss/crossentropy": 2.621985077857971, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16837255656719208, + "step": 19900 + }, + { + "epoch": 0.6219375, + "grad_norm": 2.859375, + "grad_norm_var": 0.03504231770833333, + "learning_rate": 0.0001, + "loss": 5.5809, + "loss/crossentropy": 2.5222402811050415, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16133303195238113, + "step": 19902 + }, + { + "epoch": 0.622, + "grad_norm": 3.109375, + "grad_norm_var": 0.030810546875, + "learning_rate": 0.0001, + "loss": 5.6809, + "loss/crossentropy": 2.5464282035827637, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16734935343265533, + "step": 19904 + }, + { + "epoch": 0.6220625, + "grad_norm": 3.1875, + "grad_norm_var": 0.01968994140625, + "learning_rate": 0.0001, + "loss": 5.9077, + "loss/crossentropy": 2.7067201137542725, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17049284279346466, + "step": 19906 + }, + { + "epoch": 0.622125, + "grad_norm": 2.9375, + "grad_norm_var": 0.021076456705729166, + "learning_rate": 0.0001, + "loss": 5.5868, + "loss/crossentropy": 2.536835789680481, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16202454268932343, + "step": 19908 + }, + { + "epoch": 0.6221875, + "grad_norm": 2.9375, + "grad_norm_var": 0.021708170572916668, + "learning_rate": 0.0001, + "loss": 5.8591, + "loss/crossentropy": 2.721961736679077, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16723129153251648, + "step": 19910 + }, + { + "epoch": 0.62225, + "grad_norm": 3.015625, + "grad_norm_var": 0.016877237955729166, + "learning_rate": 0.0001, + "loss": 5.5046, + "loss/crossentropy": 2.4232596158981323, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.15891293436288834, + "step": 19912 + }, + { + "epoch": 0.6223125, + "grad_norm": 2.859375, + "grad_norm_var": 0.024605305989583333, + "learning_rate": 0.0001, + "loss": 5.6248, + "loss/crossentropy": 2.5254926681518555, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16344477981328964, + "step": 19914 + }, + { + "epoch": 0.622375, + "grad_norm": 3.703125, + "grad_norm_var": 0.04690755208333333, + "learning_rate": 0.0001, + "loss": 5.914, + "loss/crossentropy": 2.6902559995651245, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17237311601638794, + "step": 19916 + }, + { + "epoch": 0.6224375, + "grad_norm": 3.765625, + "grad_norm_var": 0.07075907389322916, + "learning_rate": 0.0001, + "loss": 6.0173, + "loss/crossentropy": 2.7229477167129517, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.18138906359672546, + "step": 19918 + }, + { + "epoch": 0.6225, + "grad_norm": 3.140625, + "grad_norm_var": 0.07154541015625, + "learning_rate": 0.0001, + "loss": 5.712, + "loss/crossentropy": 2.527737617492676, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17193929851055145, + "step": 19920 + }, + { + "epoch": 0.6225625, + "grad_norm": 3.203125, + "grad_norm_var": 0.07968343098958333, + "learning_rate": 0.0001, + "loss": 5.5652, + "loss/crossentropy": 2.5005797147750854, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.161932535469532, + "step": 19922 + }, + { + "epoch": 0.622625, + "grad_norm": 2.90625, + "grad_norm_var": 0.07846577962239583, + "learning_rate": 0.0001, + "loss": 5.5449, + "loss/crossentropy": 2.5181914567947388, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15969933569431305, + "step": 19924 + }, + { + "epoch": 0.6226875, + "grad_norm": 3.296875, + "grad_norm_var": 0.07704671223958333, + "learning_rate": 0.0001, + "loss": 5.7643, + "loss/crossentropy": 2.6106055974960327, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16771600395441055, + "step": 19926 + }, + { + "epoch": 0.62275, + "grad_norm": 2.9375, + "grad_norm_var": 0.0793365478515625, + "learning_rate": 0.0001, + "loss": 5.6237, + "loss/crossentropy": 2.566754460334778, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15998917818069458, + "step": 19928 + }, + { + "epoch": 0.6228125, + "grad_norm": 3.171875, + "grad_norm_var": 0.06936442057291667, + "learning_rate": 0.0001, + "loss": 5.6092, + "loss/crossentropy": 2.487126350402832, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16259347647428513, + "step": 19930 + }, + { + "epoch": 0.622875, + "grad_norm": 3.046875, + "grad_norm_var": 0.04719136555989583, + "learning_rate": 0.0001, + "loss": 5.7003, + "loss/crossentropy": 2.6059796810150146, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16333814710378647, + "step": 19932 + }, + { + "epoch": 0.6229375, + "grad_norm": 3.09375, + "grad_norm_var": 0.018159993489583335, + "learning_rate": 0.0001, + "loss": 5.8046, + "loss/crossentropy": 2.65829861164093, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16931987553834915, + "step": 19934 + }, + { + "epoch": 0.623, + "grad_norm": 3.0625, + "grad_norm_var": 0.016145833333333335, + "learning_rate": 0.0001, + "loss": 5.5101, + "loss/crossentropy": 2.448823094367981, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1596415713429451, + "step": 19936 + }, + { + "epoch": 0.6230625, + "grad_norm": 2.90625, + "grad_norm_var": 0.012450154622395833, + "learning_rate": 0.0001, + "loss": 5.3862, + "loss/crossentropy": 2.366100311279297, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.16021539270877838, + "step": 19938 + }, + { + "epoch": 0.623125, + "grad_norm": 3.109375, + "grad_norm_var": 0.0109375, + "learning_rate": 0.0001, + "loss": 5.7748, + "loss/crossentropy": 2.6478099822998047, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16738468408584595, + "step": 19940 + }, + { + "epoch": 0.6231875, + "grad_norm": 3.03125, + "grad_norm_var": 0.0072265625, + "learning_rate": 0.0001, + "loss": 5.3404, + "loss/crossentropy": 2.3103270530700684, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1545696258544922, + "step": 19942 + }, + { + "epoch": 0.62325, + "grad_norm": 3.125, + "grad_norm_var": 0.006004842122395834, + "learning_rate": 0.0001, + "loss": 5.681, + "loss/crossentropy": 2.5023328065872192, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16825802624225616, + "step": 19944 + }, + { + "epoch": 0.6233125, + "grad_norm": 3.125, + "grad_norm_var": 0.006184895833333333, + "learning_rate": 0.0001, + "loss": 5.6783, + "loss/crossentropy": 2.498602867126465, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16952957212924957, + "step": 19946 + }, + { + "epoch": 0.623375, + "grad_norm": 3.40625, + "grad_norm_var": 0.012108357747395833, + "learning_rate": 0.0001, + "loss": 5.7909, + "loss/crossentropy": 2.64833664894104, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16659605503082275, + "step": 19948 + }, + { + "epoch": 0.6234375, + "grad_norm": 3.015625, + "grad_norm_var": 0.0127593994140625, + "learning_rate": 0.0001, + "loss": 5.2735, + "loss/crossentropy": 2.2905895709991455, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15024419128894806, + "step": 19950 + }, + { + "epoch": 0.6235, + "grad_norm": 3.140625, + "grad_norm_var": 0.013505045572916667, + "learning_rate": 0.0001, + "loss": 5.6007, + "loss/crossentropy": 2.5294933319091797, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16024452447891235, + "step": 19952 + }, + { + "epoch": 0.6235625, + "grad_norm": 3.296875, + "grad_norm_var": 0.030659993489583332, + "learning_rate": 0.0001, + "loss": 5.9605, + "loss/crossentropy": 2.6646621227264404, + "loss/hidden": 1.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.17255229502916336, + "step": 19954 + }, + { + "epoch": 0.623625, + "grad_norm": 3.03125, + "grad_norm_var": 0.03189697265625, + "learning_rate": 0.0001, + "loss": 6.022, + "loss/crossentropy": 2.806672692298889, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17466050386428833, + "step": 19956 + }, + { + "epoch": 0.6236875, + "grad_norm": 3.15625, + "grad_norm_var": 0.03511454264322917, + "learning_rate": 0.0001, + "loss": 5.4811, + "loss/crossentropy": 2.452077865600586, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15836618095636368, + "step": 19958 + }, + { + "epoch": 0.62375, + "grad_norm": 3.4375, + "grad_norm_var": 0.038914998372395836, + "learning_rate": 0.0001, + "loss": 5.9342, + "loss/crossentropy": 2.6591310501098633, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.18024230748414993, + "step": 19960 + }, + { + "epoch": 0.6238125, + "grad_norm": 2.71875, + "grad_norm_var": 0.05577799479166667, + "learning_rate": 0.0001, + "loss": 5.6501, + "loss/crossentropy": 2.5616250038146973, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16196832805871964, + "step": 19962 + }, + { + "epoch": 0.623875, + "grad_norm": 3.0625, + "grad_norm_var": 0.052887980143229166, + "learning_rate": 0.0001, + "loss": 5.7081, + "loss/crossentropy": 2.468526005744934, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17317667603492737, + "step": 19964 + }, + { + "epoch": 0.6239375, + "grad_norm": 2.9375, + "grad_norm_var": 0.10784098307291666, + "learning_rate": 0.0001, + "loss": 5.8608, + "loss/crossentropy": 2.526008367538452, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.18347448855638504, + "step": 19966 + }, + { + "epoch": 0.624, + "grad_norm": 3.40625, + "grad_norm_var": 0.125048828125, + "learning_rate": 0.0001, + "loss": 6.0458, + "loss/crossentropy": 2.776844620704651, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1768995299935341, + "step": 19968 + }, + { + "epoch": 0.6240625, + "grad_norm": 3.203125, + "grad_norm_var": 0.11021728515625, + "learning_rate": 0.0001, + "loss": 5.6742, + "loss/crossentropy": 2.5258538722991943, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16835208982229233, + "step": 19970 + }, + { + "epoch": 0.624125, + "grad_norm": 3.125, + "grad_norm_var": 0.1097808837890625, + "learning_rate": 0.0001, + "loss": 5.7995, + "loss/crossentropy": 2.632531762123108, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16786815226078033, + "step": 19972 + }, + { + "epoch": 0.6241875, + "grad_norm": 3.03125, + "grad_norm_var": 0.10949605305989583, + "learning_rate": 0.0001, + "loss": 5.8556, + "loss/crossentropy": 2.727881669998169, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1678517833352089, + "step": 19974 + }, + { + "epoch": 0.62425, + "grad_norm": 3.125, + "grad_norm_var": 0.1069732666015625, + "learning_rate": 0.0001, + "loss": 5.5128, + "loss/crossentropy": 2.3756444454193115, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16332362592220306, + "step": 19976 + }, + { + "epoch": 0.6243125, + "grad_norm": 3.140625, + "grad_norm_var": 0.08775634765625, + "learning_rate": 0.0001, + "loss": 5.5475, + "loss/crossentropy": 2.406875252723694, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16875281929969788, + "step": 19978 + }, + { + "epoch": 0.624375, + "grad_norm": 3.078125, + "grad_norm_var": 0.0881988525390625, + "learning_rate": 0.0001, + "loss": 5.6718, + "loss/crossentropy": 2.52127742767334, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16700471192598343, + "step": 19980 + }, + { + "epoch": 0.6244375, + "grad_norm": 2.890625, + "grad_norm_var": 0.039453125, + "learning_rate": 0.0001, + "loss": 5.741, + "loss/crossentropy": 2.6321603059768677, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16439685225486755, + "step": 19982 + }, + { + "epoch": 0.6245, + "grad_norm": 3.25, + "grad_norm_var": 0.017243448893229166, + "learning_rate": 0.0001, + "loss": 5.8766, + "loss/crossentropy": 2.636662721633911, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17282648384571075, + "step": 19984 + }, + { + "epoch": 0.6245625, + "grad_norm": 3.0625, + "grad_norm_var": 0.017215983072916666, + "learning_rate": 0.0001, + "loss": 6.0626, + "loss/crossentropy": 2.7927119731903076, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1797192469239235, + "step": 19986 + }, + { + "epoch": 0.624625, + "grad_norm": 3.078125, + "grad_norm_var": 0.018701171875, + "learning_rate": 0.0001, + "loss": 5.7114, + "loss/crossentropy": 2.577378988265991, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16769851744174957, + "step": 19988 + }, + { + "epoch": 0.6246875, + "grad_norm": 2.984375, + "grad_norm_var": 0.019661458333333333, + "learning_rate": 0.0001, + "loss": 5.28, + "loss/crossentropy": 2.3434277772903442, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15108251571655273, + "step": 19990 + }, + { + "epoch": 0.62475, + "grad_norm": 3.046875, + "grad_norm_var": 0.019880167643229165, + "learning_rate": 0.0001, + "loss": 5.5554, + "loss/crossentropy": 2.4401341676712036, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16308779269456863, + "step": 19992 + }, + { + "epoch": 0.6248125, + "grad_norm": 3.09375, + "grad_norm_var": 0.021158854166666668, + "learning_rate": 0.0001, + "loss": 5.6507, + "loss/crossentropy": 2.5970619916915894, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15927314013242722, + "step": 19994 + }, + { + "epoch": 0.624875, + "grad_norm": 3.078125, + "grad_norm_var": 0.021317545572916666, + "learning_rate": 0.0001, + "loss": 5.7013, + "loss/crossentropy": 2.5487600564956665, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1730649545788765, + "step": 19996 + }, + { + "epoch": 0.6249375, + "grad_norm": 2.703125, + "grad_norm_var": 0.030085245768229168, + "learning_rate": 0.0001, + "loss": 5.2125, + "loss/crossentropy": 2.2816121578216553, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1481662392616272, + "step": 19998 + }, + { + "epoch": 0.625, + "grad_norm": 3.03125, + "grad_norm_var": 0.017333984375, + "learning_rate": 0.0001, + "loss": 5.8688, + "loss/crossentropy": 2.6770824193954468, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16878335177898407, + "step": 20000 + }, + { + "epoch": 0.6250625, + "grad_norm": 3.203125, + "grad_norm_var": 0.0190093994140625, + "learning_rate": 0.0001, + "loss": 5.6288, + "loss/crossentropy": 2.4984673261642456, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16225223988294601, + "step": 20002 + }, + { + "epoch": 0.625125, + "grad_norm": 2.890625, + "grad_norm_var": 0.0303375244140625, + "learning_rate": 0.0001, + "loss": 6.0646, + "loss/crossentropy": 2.8133655786514282, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1762980967760086, + "step": 20004 + }, + { + "epoch": 0.6251875, + "grad_norm": 2.9375, + "grad_norm_var": 0.03092041015625, + "learning_rate": 0.0001, + "loss": 5.4517, + "loss/crossentropy": 2.4188199043273926, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16109905391931534, + "step": 20006 + }, + { + "epoch": 0.62525, + "grad_norm": 3.140625, + "grad_norm_var": 0.036498006184895834, + "learning_rate": 0.0001, + "loss": 5.729, + "loss/crossentropy": 2.5297917127609253, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.16523268818855286, + "step": 20008 + }, + { + "epoch": 0.6253125, + "grad_norm": 2.875, + "grad_norm_var": 0.037516276041666664, + "learning_rate": 0.0001, + "loss": 5.8529, + "loss/crossentropy": 2.678296208381653, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16668106615543365, + "step": 20010 + }, + { + "epoch": 0.625375, + "grad_norm": 3.09375, + "grad_norm_var": 0.04182027180989583, + "learning_rate": 0.0001, + "loss": 5.8357, + "loss/crossentropy": 2.7086238861083984, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1666112244129181, + "step": 20012 + }, + { + "epoch": 0.6254375, + "grad_norm": 2.859375, + "grad_norm_var": 0.054032389322916666, + "learning_rate": 0.0001, + "loss": 5.7439, + "loss/crossentropy": 2.468640089035034, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17674177885055542, + "step": 20014 + }, + { + "epoch": 0.6255, + "grad_norm": 2.984375, + "grad_norm_var": 0.054621378580729164, + "learning_rate": 0.0001, + "loss": 5.5136, + "loss/crossentropy": 2.4705368280410767, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15977267920970917, + "step": 20016 + }, + { + "epoch": 0.6255625, + "grad_norm": 3.875, + "grad_norm_var": 0.08739827473958334, + "learning_rate": 0.0001, + "loss": 5.725, + "loss/crossentropy": 2.565088629722595, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16872488707304, + "step": 20018 + }, + { + "epoch": 0.625625, + "grad_norm": 3.0625, + "grad_norm_var": 0.07919514973958333, + "learning_rate": 0.0001, + "loss": 5.8745, + "loss/crossentropy": 2.7002429962158203, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1697673499584198, + "step": 20020 + }, + { + "epoch": 0.6256875, + "grad_norm": 3.09375, + "grad_norm_var": 0.07160542805989584, + "learning_rate": 0.0001, + "loss": 5.8948, + "loss/crossentropy": 2.6865261793136597, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17434507608413696, + "step": 20022 + }, + { + "epoch": 0.62575, + "grad_norm": 3.265625, + "grad_norm_var": 0.07259114583333333, + "learning_rate": 0.0001, + "loss": 5.5557, + "loss/crossentropy": 2.4680683612823486, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16149750351905823, + "step": 20024 + }, + { + "epoch": 0.6258125, + "grad_norm": 2.8125, + "grad_norm_var": 0.07628580729166666, + "learning_rate": 0.0001, + "loss": 5.5424, + "loss/crossentropy": 2.5413738489151, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15869881212711334, + "step": 20026 + }, + { + "epoch": 0.625875, + "grad_norm": 3.140625, + "grad_norm_var": 0.07774149576822917, + "learning_rate": 0.0001, + "loss": 5.6263, + "loss/crossentropy": 2.5399714708328247, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16448822617530823, + "step": 20028 + }, + { + "epoch": 0.6259375, + "grad_norm": 3.015625, + "grad_norm_var": 0.053938802083333334, + "learning_rate": 0.0001, + "loss": 5.8731, + "loss/crossentropy": 2.6865917444229126, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17138946801424026, + "step": 20030 + }, + { + "epoch": 0.626, + "grad_norm": 2.78125, + "grad_norm_var": 0.06081441243489583, + "learning_rate": 0.0001, + "loss": 5.5415, + "loss/crossentropy": 2.536372423171997, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15989230573177338, + "step": 20032 + }, + { + "epoch": 0.6260625, + "grad_norm": 3.296875, + "grad_norm_var": 0.021630859375, + "learning_rate": 0.0001, + "loss": 5.6821, + "loss/crossentropy": 2.553732395172119, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.169091135263443, + "step": 20034 + }, + { + "epoch": 0.626125, + "grad_norm": 2.953125, + "grad_norm_var": 0.0237213134765625, + "learning_rate": 0.0001, + "loss": 5.7535, + "loss/crossentropy": 2.5982415676116943, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1686534658074379, + "step": 20036 + }, + { + "epoch": 0.6261875, + "grad_norm": 2.859375, + "grad_norm_var": 0.0248046875, + "learning_rate": 0.0001, + "loss": 5.4695, + "loss/crossentropy": 2.371529698371887, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1625317856669426, + "step": 20038 + }, + { + "epoch": 0.62625, + "grad_norm": 3.125, + "grad_norm_var": 0.0400299072265625, + "learning_rate": 0.0001, + "loss": 5.8379, + "loss/crossentropy": 2.5744885206222534, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1767367273569107, + "step": 20040 + }, + { + "epoch": 0.6263125, + "grad_norm": 3.234375, + "grad_norm_var": 0.03654683430989583, + "learning_rate": 0.0001, + "loss": 5.5845, + "loss/crossentropy": 2.5264803171157837, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16009458154439926, + "step": 20042 + }, + { + "epoch": 0.626375, + "grad_norm": 3.140625, + "grad_norm_var": 0.034716796875, + "learning_rate": 0.0001, + "loss": 5.651, + "loss/crossentropy": 2.6154359579086304, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1543370485305786, + "step": 20044 + }, + { + "epoch": 0.6264375, + "grad_norm": 3.078125, + "grad_norm_var": 0.03862202962239583, + "learning_rate": 0.0001, + "loss": 5.3501, + "loss/crossentropy": 2.4048666954040527, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.14765171706676483, + "step": 20046 + }, + { + "epoch": 0.6265, + "grad_norm": 3.0625, + "grad_norm_var": 0.03323160807291667, + "learning_rate": 0.0001, + "loss": 6.1096, + "loss/crossentropy": 2.896267533302307, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17211604118347168, + "step": 20048 + }, + { + "epoch": 0.6265625, + "grad_norm": 3.09375, + "grad_norm_var": 0.03186442057291667, + "learning_rate": 0.0001, + "loss": 5.5006, + "loss/crossentropy": 2.4230915307998657, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16243375092744827, + "step": 20050 + }, + { + "epoch": 0.626625, + "grad_norm": 3.234375, + "grad_norm_var": 0.032275390625, + "learning_rate": 0.0001, + "loss": 6.0838, + "loss/crossentropy": 2.8078731298446655, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17915435135364532, + "step": 20052 + }, + { + "epoch": 0.6266875, + "grad_norm": 3.265625, + "grad_norm_var": 0.028416951497395832, + "learning_rate": 0.0001, + "loss": 5.7891, + "loss/crossentropy": 2.6495046615600586, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16747775673866272, + "step": 20054 + }, + { + "epoch": 0.62675, + "grad_norm": 3.15625, + "grad_norm_var": 0.015436808268229166, + "learning_rate": 0.0001, + "loss": 5.6559, + "loss/crossentropy": 2.511838912963867, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16636407375335693, + "step": 20056 + }, + { + "epoch": 0.6268125, + "grad_norm": 2.953125, + "grad_norm_var": 0.01988525390625, + "learning_rate": 0.0001, + "loss": 5.6049, + "loss/crossentropy": 2.5633102655410767, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.15454931557178497, + "step": 20058 + }, + { + "epoch": 0.626875, + "grad_norm": 3.1875, + "grad_norm_var": 0.020048014322916665, + "learning_rate": 0.0001, + "loss": 5.7108, + "loss/crossentropy": 2.6183091402053833, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16276396811008453, + "step": 20060 + }, + { + "epoch": 0.6269375, + "grad_norm": 3.140625, + "grad_norm_var": 0.018192545572916666, + "learning_rate": 0.0001, + "loss": 5.6007, + "loss/crossentropy": 2.537225842475891, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1637696623802185, + "step": 20062 + }, + { + "epoch": 0.627, + "grad_norm": 3.28125, + "grad_norm_var": 0.019041951497395834, + "learning_rate": 0.0001, + "loss": 5.8561, + "loss/crossentropy": 2.6517701148986816, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1719948723912239, + "step": 20064 + }, + { + "epoch": 0.6270625, + "grad_norm": 3.109375, + "grad_norm_var": 0.017137654622395835, + "learning_rate": 0.0001, + "loss": 5.4176, + "loss/crossentropy": 2.3762134313583374, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1603914126753807, + "step": 20066 + }, + { + "epoch": 0.627125, + "grad_norm": 3.328125, + "grad_norm_var": 0.018001302083333334, + "learning_rate": 0.0001, + "loss": 5.8595, + "loss/crossentropy": 2.690279245376587, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16887035220861435, + "step": 20068 + }, + { + "epoch": 0.6271875, + "grad_norm": 2.90625, + "grad_norm_var": 0.0216705322265625, + "learning_rate": 0.0001, + "loss": 5.4766, + "loss/crossentropy": 2.4541242122650146, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1565433144569397, + "step": 20070 + }, + { + "epoch": 0.62725, + "grad_norm": 3.171875, + "grad_norm_var": 0.02451171875, + "learning_rate": 0.0001, + "loss": 5.7288, + "loss/crossentropy": 2.5191383361816406, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17057938873767853, + "step": 20072 + }, + { + "epoch": 0.6273125, + "grad_norm": 2.765625, + "grad_norm_var": 0.026756795247395833, + "learning_rate": 0.0001, + "loss": 5.3662, + "loss/crossentropy": 2.4036494493484497, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.14859680831432343, + "step": 20074 + }, + { + "epoch": 0.627375, + "grad_norm": 3.046875, + "grad_norm_var": 0.030256144205729165, + "learning_rate": 0.0001, + "loss": 5.4422, + "loss/crossentropy": 2.383970260620117, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16051417589187622, + "step": 20076 + }, + { + "epoch": 0.6274375, + "grad_norm": 3.015625, + "grad_norm_var": 0.029059855143229167, + "learning_rate": 0.0001, + "loss": 6.0372, + "loss/crossentropy": 2.847605347633362, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1728636845946312, + "step": 20078 + }, + { + "epoch": 0.6275, + "grad_norm": 2.8125, + "grad_norm_var": 0.0399078369140625, + "learning_rate": 0.0001, + "loss": 5.3906, + "loss/crossentropy": 2.459246516227722, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.14899127185344696, + "step": 20080 + }, + { + "epoch": 0.6275625, + "grad_norm": 3.296875, + "grad_norm_var": 0.0457427978515625, + "learning_rate": 0.0001, + "loss": 5.4739, + "loss/crossentropy": 2.353138566017151, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1648089736700058, + "step": 20082 + }, + { + "epoch": 0.627625, + "grad_norm": 3.015625, + "grad_norm_var": 0.04138895670572917, + "learning_rate": 0.0001, + "loss": 5.3657, + "loss/crossentropy": 2.3784090280532837, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15536832809448242, + "step": 20084 + }, + { + "epoch": 0.6276875, + "grad_norm": 2.8125, + "grad_norm_var": 0.03606363932291667, + "learning_rate": 0.0001, + "loss": 5.4472, + "loss/crossentropy": 2.4753423929214478, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15421781688928604, + "step": 20086 + }, + { + "epoch": 0.62775, + "grad_norm": 3.46875, + "grad_norm_var": 0.043390909830729164, + "learning_rate": 0.0001, + "loss": 6.049, + "loss/crossentropy": 2.8337109088897705, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17153087258338928, + "step": 20088 + }, + { + "epoch": 0.6278125, + "grad_norm": 3.125, + "grad_norm_var": 0.04280497233072917, + "learning_rate": 0.0001, + "loss": 5.7366, + "loss/crossentropy": 2.600230097770691, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1687176153063774, + "step": 20090 + }, + { + "epoch": 0.627875, + "grad_norm": 3.28125, + "grad_norm_var": 0.045807902018229166, + "learning_rate": 0.0001, + "loss": 5.7429, + "loss/crossentropy": 2.6203036308288574, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16265041381120682, + "step": 20092 + }, + { + "epoch": 0.6279375, + "grad_norm": 3.09375, + "grad_norm_var": 0.0502838134765625, + "learning_rate": 0.0001, + "loss": 5.4523, + "loss/crossentropy": 2.459430456161499, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1570972502231598, + "step": 20094 + }, + { + "epoch": 0.628, + "grad_norm": 2.9375, + "grad_norm_var": 0.0337554931640625, + "learning_rate": 0.0001, + "loss": 5.7673, + "loss/crossentropy": 2.6559654474258423, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16464993357658386, + "step": 20096 + }, + { + "epoch": 0.6280625, + "grad_norm": 3.484375, + "grad_norm_var": 0.04077860514322917, + "learning_rate": 0.0001, + "loss": 5.7831, + "loss/crossentropy": 2.527703881263733, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17319798469543457, + "step": 20098 + }, + { + "epoch": 0.628125, + "grad_norm": 3.1875, + "grad_norm_var": 0.037923177083333336, + "learning_rate": 0.0001, + "loss": 5.8382, + "loss/crossentropy": 2.5846521854400635, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17535626888275146, + "step": 20100 + }, + { + "epoch": 0.6281875, + "grad_norm": 3.03125, + "grad_norm_var": 0.027099609375, + "learning_rate": 0.0001, + "loss": 5.629, + "loss/crossentropy": 2.563002824783325, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16090133786201477, + "step": 20102 + }, + { + "epoch": 0.62825, + "grad_norm": 3.75, + "grad_norm_var": 0.04318033854166667, + "learning_rate": 0.0001, + "loss": 6.3233, + "loss/crossentropy": 2.8549082279205322, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.19176553934812546, + "step": 20104 + }, + { + "epoch": 0.6283125, + "grad_norm": 3.328125, + "grad_norm_var": 0.0426422119140625, + "learning_rate": 0.0001, + "loss": 5.7861, + "loss/crossentropy": 2.5958460569381714, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17019325494766235, + "step": 20106 + }, + { + "epoch": 0.628375, + "grad_norm": 3.1875, + "grad_norm_var": 0.04341532389322917, + "learning_rate": 0.0001, + "loss": 5.7901, + "loss/crossentropy": 2.606391191482544, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1703239008784294, + "step": 20108 + }, + { + "epoch": 0.6284375, + "grad_norm": 2.890625, + "grad_norm_var": 0.06373697916666667, + "learning_rate": 0.0001, + "loss": 5.871, + "loss/crossentropy": 2.832258701324463, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15621717274188995, + "step": 20110 + }, + { + "epoch": 0.6285, + "grad_norm": 2.84375, + "grad_norm_var": 0.07620035807291667, + "learning_rate": 0.0001, + "loss": 5.3623, + "loss/crossentropy": 2.385128617286682, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15201535820960999, + "step": 20112 + }, + { + "epoch": 0.6285625, + "grad_norm": 3.609375, + "grad_norm_var": 0.08222554524739584, + "learning_rate": 0.0001, + "loss": 5.402, + "loss/crossentropy": 2.350269317626953, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15869221836328506, + "step": 20114 + }, + { + "epoch": 0.628625, + "grad_norm": 2.96875, + "grad_norm_var": 0.08289388020833334, + "learning_rate": 0.0001, + "loss": 5.6134, + "loss/crossentropy": 2.5118885040283203, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1621083915233612, + "step": 20116 + }, + { + "epoch": 0.6286875, + "grad_norm": 3.09375, + "grad_norm_var": 0.08203837076822916, + "learning_rate": 0.0001, + "loss": 5.6185, + "loss/crossentropy": 2.5212897062301636, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1644095480442047, + "step": 20118 + }, + { + "epoch": 0.62875, + "grad_norm": 2.984375, + "grad_norm_var": 0.05614827473958333, + "learning_rate": 0.0001, + "loss": 5.6847, + "loss/crossentropy": 2.5570132732391357, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1651124209165573, + "step": 20120 + }, + { + "epoch": 0.6288125, + "grad_norm": 3.171875, + "grad_norm_var": 0.05250244140625, + "learning_rate": 0.0001, + "loss": 5.2034, + "loss/crossentropy": 2.152095317840576, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1582512930035591, + "step": 20122 + }, + { + "epoch": 0.628875, + "grad_norm": 2.890625, + "grad_norm_var": 0.04940999348958333, + "learning_rate": 0.0001, + "loss": 5.5825, + "loss/crossentropy": 2.54826283454895, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1581067144870758, + "step": 20124 + }, + { + "epoch": 0.6289375, + "grad_norm": 3.078125, + "grad_norm_var": 0.043929036458333334, + "learning_rate": 0.0001, + "loss": 5.6659, + "loss/crossentropy": 2.5693126916885376, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16122236102819443, + "step": 20126 + }, + { + "epoch": 0.629, + "grad_norm": 3.390625, + "grad_norm_var": 0.03905843098958333, + "learning_rate": 0.0001, + "loss": 5.8873, + "loss/crossentropy": 2.71217143535614, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.17219892889261246, + "step": 20128 + }, + { + "epoch": 0.6290625, + "grad_norm": 3.3125, + "grad_norm_var": 0.025516764322916666, + "learning_rate": 0.0001, + "loss": 5.7538, + "loss/crossentropy": 2.5143548250198364, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17784936726093292, + "step": 20130 + }, + { + "epoch": 0.629125, + "grad_norm": 2.921875, + "grad_norm_var": 0.026496378580729167, + "learning_rate": 0.0001, + "loss": 5.6382, + "loss/crossentropy": 2.556102156639099, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16602304577827454, + "step": 20132 + }, + { + "epoch": 0.6291875, + "grad_norm": 3.265625, + "grad_norm_var": 0.0285064697265625, + "learning_rate": 0.0001, + "loss": 5.6289, + "loss/crossentropy": 2.4836350679397583, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1688230112195015, + "step": 20134 + }, + { + "epoch": 0.62925, + "grad_norm": 3.15625, + "grad_norm_var": 0.03432515462239583, + "learning_rate": 0.0001, + "loss": 5.8909, + "loss/crossentropy": 2.650188684463501, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17485381662845612, + "step": 20136 + }, + { + "epoch": 0.6293125, + "grad_norm": 3.234375, + "grad_norm_var": 0.033642578125, + "learning_rate": 0.0001, + "loss": 5.8458, + "loss/crossentropy": 2.7010639905929565, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16838444769382477, + "step": 20138 + }, + { + "epoch": 0.629375, + "grad_norm": 3.265625, + "grad_norm_var": 0.030594889322916666, + "learning_rate": 0.0001, + "loss": 6.0277, + "loss/crossentropy": 2.6885000467300415, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18314369022846222, + "step": 20140 + }, + { + "epoch": 0.6294375, + "grad_norm": 2.90625, + "grad_norm_var": 0.02880859375, + "learning_rate": 0.0001, + "loss": 5.4761, + "loss/crossentropy": 2.3749966621398926, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.15854676812887192, + "step": 20142 + }, + { + "epoch": 0.6295, + "grad_norm": 3.1875, + "grad_norm_var": 0.025340779622395834, + "learning_rate": 0.0001, + "loss": 5.7075, + "loss/crossentropy": 2.4974864721298218, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17334265261888504, + "step": 20144 + }, + { + "epoch": 0.6295625, + "grad_norm": 3.328125, + "grad_norm_var": 0.027318318684895832, + "learning_rate": 0.0001, + "loss": 5.7967, + "loss/crossentropy": 2.662586212158203, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.17200879007577896, + "step": 20146 + }, + { + "epoch": 0.629625, + "grad_norm": 3.1875, + "grad_norm_var": 0.022216796875, + "learning_rate": 0.0001, + "loss": 6.091, + "loss/crossentropy": 2.8375940322875977, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17338711023330688, + "step": 20148 + }, + { + "epoch": 0.6296875, + "grad_norm": 2.96875, + "grad_norm_var": 0.025748697916666667, + "learning_rate": 0.0001, + "loss": 5.5337, + "loss/crossentropy": 2.515775442123413, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15882223844528198, + "step": 20150 + }, + { + "epoch": 0.62975, + "grad_norm": 3.046875, + "grad_norm_var": 0.025926717122395835, + "learning_rate": 0.0001, + "loss": 5.4365, + "loss/crossentropy": 2.438557267189026, + "loss/hidden": 1.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.15956391394138336, + "step": 20152 + }, + { + "epoch": 0.6298125, + "grad_norm": 2.859375, + "grad_norm_var": 0.029979451497395834, + "learning_rate": 0.0001, + "loss": 5.4571, + "loss/crossentropy": 2.4664463996887207, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.15726518630981445, + "step": 20154 + }, + { + "epoch": 0.629875, + "grad_norm": 2.96875, + "grad_norm_var": 0.029271443684895832, + "learning_rate": 0.0001, + "loss": 5.7039, + "loss/crossentropy": 2.501569628715515, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17140649259090424, + "step": 20156 + }, + { + "epoch": 0.6299375, + "grad_norm": 3.125, + "grad_norm_var": 0.031712849934895836, + "learning_rate": 0.0001, + "loss": 5.7994, + "loss/crossentropy": 2.6657474040985107, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1649295687675476, + "step": 20158 + }, + { + "epoch": 0.63, + "grad_norm": 3.765625, + "grad_norm_var": 0.05890299479166667, + "learning_rate": 0.0001, + "loss": 5.5409, + "loss/crossentropy": 2.3929996490478516, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16830556094646454, + "step": 20160 + }, + { + "epoch": 0.6300625, + "grad_norm": 3.140625, + "grad_norm_var": 0.061066691080729166, + "learning_rate": 0.0001, + "loss": 5.5072, + "loss/crossentropy": 2.5050963163375854, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15489353984594345, + "step": 20162 + }, + { + "epoch": 0.630125, + "grad_norm": 3.125, + "grad_norm_var": 0.057938639322916666, + "learning_rate": 0.0001, + "loss": 5.6582, + "loss/crossentropy": 2.513243556022644, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1660541445016861, + "step": 20164 + }, + { + "epoch": 0.6301875, + "grad_norm": 3.1875, + "grad_norm_var": 0.0605621337890625, + "learning_rate": 0.0001, + "loss": 5.9565, + "loss/crossentropy": 2.725982189178467, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17110057175159454, + "step": 20166 + }, + { + "epoch": 0.63025, + "grad_norm": 3.015625, + "grad_norm_var": 0.05732014973958333, + "learning_rate": 0.0001, + "loss": 5.8055, + "loss/crossentropy": 2.648452043533325, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1672716811299324, + "step": 20168 + }, + { + "epoch": 0.6303125, + "grad_norm": 2.875, + "grad_norm_var": 0.05679931640625, + "learning_rate": 0.0001, + "loss": 5.4629, + "loss/crossentropy": 2.4996532201766968, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15413638204336166, + "step": 20170 + }, + { + "epoch": 0.630375, + "grad_norm": 2.609375, + "grad_norm_var": 0.07001851399739584, + "learning_rate": 0.0001, + "loss": 5.4531, + "loss/crossentropy": 2.355065703392029, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.15511402487754822, + "step": 20172 + }, + { + "epoch": 0.6304375, + "grad_norm": 3.40625, + "grad_norm_var": 0.07616780598958334, + "learning_rate": 0.0001, + "loss": 5.6556, + "loss/crossentropy": 2.4644635915756226, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17184355854988098, + "step": 20174 + }, + { + "epoch": 0.6305, + "grad_norm": 3.09375, + "grad_norm_var": 0.04218343098958333, + "learning_rate": 0.0001, + "loss": 5.7584, + "loss/crossentropy": 2.641898274421692, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1655595824122429, + "step": 20176 + }, + { + "epoch": 0.6305625, + "grad_norm": 3.15625, + "grad_norm_var": 0.0384674072265625, + "learning_rate": 0.0001, + "loss": 5.7657, + "loss/crossentropy": 2.5578166246414185, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.16765928268432617, + "step": 20178 + }, + { + "epoch": 0.630625, + "grad_norm": 2.984375, + "grad_norm_var": 0.03941650390625, + "learning_rate": 0.0001, + "loss": 5.695, + "loss/crossentropy": 2.5939255952835083, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16440007835626602, + "step": 20180 + }, + { + "epoch": 0.6306875, + "grad_norm": 3.25, + "grad_norm_var": 0.05777587890625, + "learning_rate": 0.0001, + "loss": 6.0959, + "loss/crossentropy": 2.768033504486084, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18239657580852509, + "step": 20182 + }, + { + "epoch": 0.63075, + "grad_norm": 3.0, + "grad_norm_var": 0.05871988932291667, + "learning_rate": 0.0001, + "loss": 5.8539, + "loss/crossentropy": 2.6732946634292603, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1696242392063141, + "step": 20184 + }, + { + "epoch": 0.6308125, + "grad_norm": 3.25, + "grad_norm_var": 0.057795206705729164, + "learning_rate": 0.0001, + "loss": 5.3569, + "loss/crossentropy": 2.3667612075805664, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15447886288166046, + "step": 20186 + }, + { + "epoch": 0.630875, + "grad_norm": 3.203125, + "grad_norm_var": 0.0368072509765625, + "learning_rate": 0.0001, + "loss": 5.6897, + "loss/crossentropy": 2.572500467300415, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.161333829164505, + "step": 20188 + }, + { + "epoch": 0.6309375, + "grad_norm": 2.796875, + "grad_norm_var": 0.04058837890625, + "learning_rate": 0.0001, + "loss": 5.6884, + "loss/crossentropy": 2.636229991912842, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16224461793899536, + "step": 20190 + }, + { + "epoch": 0.631, + "grad_norm": 3.078125, + "grad_norm_var": 0.043290201822916666, + "learning_rate": 0.0001, + "loss": 5.7374, + "loss/crossentropy": 2.5980584621429443, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16627852618694305, + "step": 20192 + }, + { + "epoch": 0.6310625, + "grad_norm": 2.90625, + "grad_norm_var": 0.04617513020833333, + "learning_rate": 0.0001, + "loss": 5.7323, + "loss/crossentropy": 2.6956746578216553, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15873641520738602, + "step": 20194 + }, + { + "epoch": 0.631125, + "grad_norm": 3.03125, + "grad_norm_var": 0.04560139973958333, + "learning_rate": 0.0001, + "loss": 5.5217, + "loss/crossentropy": 2.4796465635299683, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1581074669957161, + "step": 20196 + }, + { + "epoch": 0.6311875, + "grad_norm": 3.578125, + "grad_norm_var": 0.038960774739583336, + "learning_rate": 0.0001, + "loss": 5.6897, + "loss/crossentropy": 2.58573317527771, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1643029898405075, + "step": 20198 + }, + { + "epoch": 0.63125, + "grad_norm": 3.21875, + "grad_norm_var": 0.03801676432291667, + "learning_rate": 0.0001, + "loss": 5.8526, + "loss/crossentropy": 2.6438595056533813, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1704840511083603, + "step": 20200 + }, + { + "epoch": 0.6313125, + "grad_norm": 3.171875, + "grad_norm_var": 0.038427734375, + "learning_rate": 0.0001, + "loss": 6.0056, + "loss/crossentropy": 2.7956109046936035, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17373424023389816, + "step": 20202 + }, + { + "epoch": 0.631375, + "grad_norm": 3.015625, + "grad_norm_var": 0.041624959309895834, + "learning_rate": 0.0001, + "loss": 5.8579, + "loss/crossentropy": 2.740612268447876, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1672016829252243, + "step": 20204 + }, + { + "epoch": 0.6314375, + "grad_norm": 3.359375, + "grad_norm_var": 0.0406646728515625, + "learning_rate": 0.0001, + "loss": 5.7933, + "loss/crossentropy": 2.612033247947693, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17008215934038162, + "step": 20206 + }, + { + "epoch": 0.6315, + "grad_norm": 2.90625, + "grad_norm_var": 0.0406646728515625, + "learning_rate": 0.0001, + "loss": 5.4172, + "loss/crossentropy": 2.4040307998657227, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1563967764377594, + "step": 20208 + }, + { + "epoch": 0.6315625, + "grad_norm": 2.90625, + "grad_norm_var": 0.03931884765625, + "learning_rate": 0.0001, + "loss": 5.8952, + "loss/crossentropy": 2.786734104156494, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16553699225187302, + "step": 20210 + }, + { + "epoch": 0.631625, + "grad_norm": 3.484375, + "grad_norm_var": 0.04474283854166667, + "learning_rate": 0.0001, + "loss": 5.8679, + "loss/crossentropy": 2.6364437341690063, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17392823845148087, + "step": 20212 + }, + { + "epoch": 0.6316875, + "grad_norm": 3.515625, + "grad_norm_var": 0.0413970947265625, + "learning_rate": 0.0001, + "loss": 5.9131, + "loss/crossentropy": 2.738698720932007, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16665589809417725, + "step": 20214 + }, + { + "epoch": 0.63175, + "grad_norm": 2.90625, + "grad_norm_var": 0.0447418212890625, + "learning_rate": 0.0001, + "loss": 5.6964, + "loss/crossentropy": 2.573652744293213, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1610986292362213, + "step": 20216 + }, + { + "epoch": 0.6318125, + "grad_norm": 3.015625, + "grad_norm_var": 0.04342447916666667, + "learning_rate": 0.0001, + "loss": 5.6864, + "loss/crossentropy": 2.592434287071228, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1609569787979126, + "step": 20218 + }, + { + "epoch": 0.631875, + "grad_norm": 3.546875, + "grad_norm_var": 0.04836832682291667, + "learning_rate": 0.0001, + "loss": 5.7919, + "loss/crossentropy": 2.6078829765319824, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16957232356071472, + "step": 20220 + }, + { + "epoch": 0.6319375, + "grad_norm": 4.125, + "grad_norm_var": 0.11231180826822916, + "learning_rate": 0.0001, + "loss": 5.279, + "loss/crossentropy": 2.2583394050598145, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15323758870363235, + "step": 20222 + }, + { + "epoch": 0.632, + "grad_norm": 2.8125, + "grad_norm_var": 0.11551106770833333, + "learning_rate": 0.0001, + "loss": 5.7031, + "loss/crossentropy": 2.6227800846099854, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.158027783036232, + "step": 20224 + }, + { + "epoch": 0.6320625, + "grad_norm": 3.390625, + "grad_norm_var": 0.10192057291666666, + "learning_rate": 0.0001, + "loss": 5.8896, + "loss/crossentropy": 2.6651804447174072, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17517473548650742, + "step": 20226 + }, + { + "epoch": 0.632125, + "grad_norm": 3.109375, + "grad_norm_var": 0.1005279541015625, + "learning_rate": 0.0001, + "loss": 5.6459, + "loss/crossentropy": 2.5703917741775513, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1618465781211853, + "step": 20228 + }, + { + "epoch": 0.6321875, + "grad_norm": 3.109375, + "grad_norm_var": 0.096923828125, + "learning_rate": 0.0001, + "loss": 5.7883, + "loss/crossentropy": 2.613739013671875, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17057643830776215, + "step": 20230 + }, + { + "epoch": 0.63225, + "grad_norm": 2.828125, + "grad_norm_var": 0.10400390625, + "learning_rate": 0.0001, + "loss": 5.3312, + "loss/crossentropy": 2.3937498331069946, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1515582799911499, + "step": 20232 + }, + { + "epoch": 0.6323125, + "grad_norm": 3.453125, + "grad_norm_var": 0.10446675618489583, + "learning_rate": 0.0001, + "loss": 6.1037, + "loss/crossentropy": 2.8300225734710693, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17698094248771667, + "step": 20234 + }, + { + "epoch": 0.632375, + "grad_norm": 3.1875, + "grad_norm_var": 0.0999176025390625, + "learning_rate": 0.0001, + "loss": 5.8462, + "loss/crossentropy": 2.651261806488037, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.173788882791996, + "step": 20236 + }, + { + "epoch": 0.6324375, + "grad_norm": 3.15625, + "grad_norm_var": 0.03095703125, + "learning_rate": 0.0001, + "loss": 5.8181, + "loss/crossentropy": 2.6541812419891357, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16600612550973892, + "step": 20238 + }, + { + "epoch": 0.6325, + "grad_norm": 2.734375, + "grad_norm_var": 0.0330963134765625, + "learning_rate": 0.0001, + "loss": 5.4874, + "loss/crossentropy": 2.4278383255004883, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15986019372940063, + "step": 20240 + }, + { + "epoch": 0.6325625, + "grad_norm": 3.34375, + "grad_norm_var": 0.04148661295572917, + "learning_rate": 0.0001, + "loss": 5.4431, + "loss/crossentropy": 2.306453824043274, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.16054243594408035, + "step": 20242 + }, + { + "epoch": 0.632625, + "grad_norm": 3.140625, + "grad_norm_var": 0.039839680989583334, + "learning_rate": 0.0001, + "loss": 5.6074, + "loss/crossentropy": 2.5486620664596558, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15938962996006012, + "step": 20244 + }, + { + "epoch": 0.6326875, + "grad_norm": 3.28125, + "grad_norm_var": 0.0414459228515625, + "learning_rate": 0.0001, + "loss": 5.7226, + "loss/crossentropy": 2.6033793687820435, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16621871292591095, + "step": 20246 + }, + { + "epoch": 0.63275, + "grad_norm": 3.25, + "grad_norm_var": 0.0333984375, + "learning_rate": 0.0001, + "loss": 5.286, + "loss/crossentropy": 2.2284141778945923, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15848813951015472, + "step": 20248 + }, + { + "epoch": 0.6328125, + "grad_norm": 2.90625, + "grad_norm_var": 0.0353424072265625, + "learning_rate": 0.0001, + "loss": 5.4818, + "loss/crossentropy": 2.321297287940979, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1652720496058464, + "step": 20250 + }, + { + "epoch": 0.632875, + "grad_norm": 3.3125, + "grad_norm_var": 0.03580322265625, + "learning_rate": 0.0001, + "loss": 5.76, + "loss/crossentropy": 2.5480066537857056, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17198427021503448, + "step": 20252 + }, + { + "epoch": 0.6329375, + "grad_norm": 2.984375, + "grad_norm_var": 0.03824462890625, + "learning_rate": 0.0001, + "loss": 5.7146, + "loss/crossentropy": 2.5619568824768066, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16760719567537308, + "step": 20254 + }, + { + "epoch": 0.633, + "grad_norm": 3.046875, + "grad_norm_var": 0.03181050618489583, + "learning_rate": 0.0001, + "loss": 5.9623, + "loss/crossentropy": 2.7362542152404785, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17299064993858337, + "step": 20256 + }, + { + "epoch": 0.6330625, + "grad_norm": 3.234375, + "grad_norm_var": 0.023436482747395834, + "learning_rate": 0.0001, + "loss": 6.0276, + "loss/crossentropy": 2.796047568321228, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17432530224323273, + "step": 20258 + }, + { + "epoch": 0.633125, + "grad_norm": 2.65625, + "grad_norm_var": 0.0421051025390625, + "learning_rate": 0.0001, + "loss": 5.2651, + "loss/crossentropy": 2.3361037969589233, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.14837107062339783, + "step": 20260 + }, + { + "epoch": 0.6331875, + "grad_norm": 2.984375, + "grad_norm_var": 0.042073567708333336, + "learning_rate": 0.0001, + "loss": 5.3996, + "loss/crossentropy": 2.365097999572754, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1542360633611679, + "step": 20262 + }, + { + "epoch": 0.63325, + "grad_norm": 3.0, + "grad_norm_var": 0.03868815104166667, + "learning_rate": 0.0001, + "loss": 5.5728, + "loss/crossentropy": 2.448459506034851, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16672926396131516, + "step": 20264 + }, + { + "epoch": 0.6333125, + "grad_norm": 3.328125, + "grad_norm_var": 0.03535868326822917, + "learning_rate": 0.0001, + "loss": 5.9353, + "loss/crossentropy": 2.6578985452651978, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17735158652067184, + "step": 20266 + }, + { + "epoch": 0.633375, + "grad_norm": 3.203125, + "grad_norm_var": 0.033447265625, + "learning_rate": 0.0001, + "loss": 5.9451, + "loss/crossentropy": 2.7756892442703247, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16811110079288483, + "step": 20268 + }, + { + "epoch": 0.6334375, + "grad_norm": 3.09375, + "grad_norm_var": 0.03857421875, + "learning_rate": 0.0001, + "loss": 5.2494, + "loss/crossentropy": 2.253188133239746, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15196974575519562, + "step": 20270 + }, + { + "epoch": 0.6335, + "grad_norm": 3.234375, + "grad_norm_var": 0.0308746337890625, + "learning_rate": 0.0001, + "loss": 5.9143, + "loss/crossentropy": 2.7492603063583374, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16846046596765518, + "step": 20272 + }, + { + "epoch": 0.6335625, + "grad_norm": 3.375, + "grad_norm_var": 0.0353515625, + "learning_rate": 0.0001, + "loss": 5.8884, + "loss/crossentropy": 2.677353024482727, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1703251749277115, + "step": 20274 + }, + { + "epoch": 0.633625, + "grad_norm": 2.984375, + "grad_norm_var": 0.021629842122395833, + "learning_rate": 0.0001, + "loss": 5.7466, + "loss/crossentropy": 2.561260223388672, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16579557955265045, + "step": 20276 + }, + { + "epoch": 0.6336875, + "grad_norm": 3.0, + "grad_norm_var": 0.021219889322916668, + "learning_rate": 0.0001, + "loss": 5.4467, + "loss/crossentropy": 2.4198994636535645, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15893103182315826, + "step": 20278 + }, + { + "epoch": 0.63375, + "grad_norm": 3.046875, + "grad_norm_var": 0.03815104166666667, + "learning_rate": 0.0001, + "loss": 5.34, + "loss/crossentropy": 2.381423592567444, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15171539783477783, + "step": 20280 + }, + { + "epoch": 0.6338125, + "grad_norm": 3.75, + "grad_norm_var": 0.05452372233072917, + "learning_rate": 0.0001, + "loss": 6.1022, + "loss/crossentropy": 2.7580807209014893, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.18440742045640945, + "step": 20282 + }, + { + "epoch": 0.633875, + "grad_norm": 2.96875, + "grad_norm_var": 0.06870829264322917, + "learning_rate": 0.0001, + "loss": 5.993, + "loss/crossentropy": 2.768987536430359, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1700623482465744, + "step": 20284 + }, + { + "epoch": 0.6339375, + "grad_norm": 3.359375, + "grad_norm_var": 0.06580403645833334, + "learning_rate": 0.0001, + "loss": 5.8209, + "loss/crossentropy": 2.6002098321914673, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17050490528345108, + "step": 20286 + }, + { + "epoch": 0.634, + "grad_norm": 3.171875, + "grad_norm_var": 0.07008056640625, + "learning_rate": 0.0001, + "loss": 5.63, + "loss/crossentropy": 2.541434168815613, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.15768882632255554, + "step": 20288 + }, + { + "epoch": 0.6340625, + "grad_norm": 3.25, + "grad_norm_var": 0.07315165201822917, + "learning_rate": 0.0001, + "loss": 5.7851, + "loss/crossentropy": 2.6267735958099365, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16895414888858795, + "step": 20290 + }, + { + "epoch": 0.634125, + "grad_norm": 3.015625, + "grad_norm_var": 0.07481180826822917, + "learning_rate": 0.0001, + "loss": 5.2985, + "loss/crossentropy": 2.2350289821624756, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1622042953968048, + "step": 20292 + }, + { + "epoch": 0.6341875, + "grad_norm": 3.140625, + "grad_norm_var": 0.07447001139322916, + "learning_rate": 0.0001, + "loss": 5.6256, + "loss/crossentropy": 2.5115227699279785, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16101675480604172, + "step": 20294 + }, + { + "epoch": 0.63425, + "grad_norm": 3.1875, + "grad_norm_var": 0.05950520833333333, + "learning_rate": 0.0001, + "loss": 5.8101, + "loss/crossentropy": 2.6483267545700073, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17008645087480545, + "step": 20296 + }, + { + "epoch": 0.6343125, + "grad_norm": 3.140625, + "grad_norm_var": 0.03593648274739583, + "learning_rate": 0.0001, + "loss": 5.406, + "loss/crossentropy": 2.3410415649414062, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15844881534576416, + "step": 20298 + }, + { + "epoch": 0.634375, + "grad_norm": 3.421875, + "grad_norm_var": 0.023029581705729166, + "learning_rate": 0.0001, + "loss": 5.4763, + "loss/crossentropy": 2.400785207748413, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16223635524511337, + "step": 20300 + }, + { + "epoch": 0.6344375, + "grad_norm": 3.0, + "grad_norm_var": 0.0150543212890625, + "learning_rate": 0.0001, + "loss": 5.7139, + "loss/crossentropy": 2.5901176929473877, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16277092695236206, + "step": 20302 + }, + { + "epoch": 0.6345, + "grad_norm": 2.96875, + "grad_norm_var": 0.0147125244140625, + "learning_rate": 0.0001, + "loss": 5.6907, + "loss/crossentropy": 2.5511187314987183, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16825895011425018, + "step": 20304 + }, + { + "epoch": 0.6345625, + "grad_norm": 3.0625, + "grad_norm_var": 0.0141998291015625, + "learning_rate": 0.0001, + "loss": 5.4864, + "loss/crossentropy": 2.466139554977417, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15906154364347458, + "step": 20306 + }, + { + "epoch": 0.634625, + "grad_norm": 3.375, + "grad_norm_var": 0.01988525390625, + "learning_rate": 0.0001, + "loss": 5.4633, + "loss/crossentropy": 2.3712934255599976, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1588064432144165, + "step": 20308 + }, + { + "epoch": 0.6346875, + "grad_norm": 3.171875, + "grad_norm_var": 0.0225006103515625, + "learning_rate": 0.0001, + "loss": 5.9008, + "loss/crossentropy": 2.6653735637664795, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17314809560775757, + "step": 20310 + }, + { + "epoch": 0.63475, + "grad_norm": 3.15625, + "grad_norm_var": 0.020438639322916667, + "learning_rate": 0.0001, + "loss": 5.6956, + "loss/crossentropy": 2.5791908502578735, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16125231981277466, + "step": 20312 + }, + { + "epoch": 0.6348125, + "grad_norm": 3.015625, + "grad_norm_var": 0.023079427083333333, + "learning_rate": 0.0001, + "loss": 5.5867, + "loss/crossentropy": 2.458158254623413, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1651960387825966, + "step": 20314 + }, + { + "epoch": 0.634875, + "grad_norm": 3.15625, + "grad_norm_var": 0.019456990559895835, + "learning_rate": 0.0001, + "loss": 5.7762, + "loss/crossentropy": 2.637978434562683, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16929318010807037, + "step": 20316 + }, + { + "epoch": 0.6349375, + "grad_norm": 3.0625, + "grad_norm_var": 0.019559733072916665, + "learning_rate": 0.0001, + "loss": 5.7862, + "loss/crossentropy": 2.644035220146179, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16460420936346054, + "step": 20318 + }, + { + "epoch": 0.635, + "grad_norm": 3.078125, + "grad_norm_var": 0.018708292643229166, + "learning_rate": 0.0001, + "loss": 5.5519, + "loss/crossentropy": 2.3760015964508057, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16759207099676132, + "step": 20320 + }, + { + "epoch": 0.6350625, + "grad_norm": 3.203125, + "grad_norm_var": 0.016551717122395834, + "learning_rate": 0.0001, + "loss": 5.7103, + "loss/crossentropy": 2.6155636310577393, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1641618013381958, + "step": 20322 + }, + { + "epoch": 0.635125, + "grad_norm": 3.078125, + "grad_norm_var": 0.018504842122395834, + "learning_rate": 0.0001, + "loss": 5.5773, + "loss/crossentropy": 2.521718978881836, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15751251578330994, + "step": 20324 + }, + { + "epoch": 0.6351875, + "grad_norm": 3.046875, + "grad_norm_var": 0.014972941080729166, + "learning_rate": 0.0001, + "loss": 5.4521, + "loss/crossentropy": 2.381601095199585, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15978585928678513, + "step": 20326 + }, + { + "epoch": 0.63525, + "grad_norm": 3.09375, + "grad_norm_var": 0.013802083333333333, + "learning_rate": 0.0001, + "loss": 5.4768, + "loss/crossentropy": 2.364861845970154, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1658855304121971, + "step": 20328 + }, + { + "epoch": 0.6353125, + "grad_norm": 2.859375, + "grad_norm_var": 0.01451416015625, + "learning_rate": 0.0001, + "loss": 5.6741, + "loss/crossentropy": 2.612994074821472, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16157987713813782, + "step": 20330 + }, + { + "epoch": 0.635375, + "grad_norm": 3.125, + "grad_norm_var": 0.012760416666666666, + "learning_rate": 0.0001, + "loss": 5.718, + "loss/crossentropy": 2.6030982732772827, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16540135443210602, + "step": 20332 + }, + { + "epoch": 0.6354375, + "grad_norm": 3.078125, + "grad_norm_var": 0.012507120768229166, + "learning_rate": 0.0001, + "loss": 5.539, + "loss/crossentropy": 2.4375611543655396, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16287478804588318, + "step": 20334 + }, + { + "epoch": 0.6355, + "grad_norm": 3.34375, + "grad_norm_var": 0.01715087890625, + "learning_rate": 0.0001, + "loss": 5.7189, + "loss/crossentropy": 2.618451237678528, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16239001601934433, + "step": 20336 + }, + { + "epoch": 0.6355625, + "grad_norm": 3.6875, + "grad_norm_var": 0.046708170572916666, + "learning_rate": 0.0001, + "loss": 6.1406, + "loss/crossentropy": 2.780633568763733, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1848253831267357, + "step": 20338 + }, + { + "epoch": 0.635625, + "grad_norm": 3.453125, + "grad_norm_var": 0.056538899739583336, + "learning_rate": 0.0001, + "loss": 6.3322, + "loss/crossentropy": 2.917917490005493, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18908123672008514, + "step": 20340 + }, + { + "epoch": 0.6356875, + "grad_norm": 3.328125, + "grad_norm_var": 0.05495503743489583, + "learning_rate": 0.0001, + "loss": 5.5889, + "loss/crossentropy": 2.501395583152771, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16070107370615005, + "step": 20342 + }, + { + "epoch": 0.63575, + "grad_norm": 3.171875, + "grad_norm_var": 0.05390218098958333, + "learning_rate": 0.0001, + "loss": 5.5669, + "loss/crossentropy": 2.465433955192566, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1660088747739792, + "step": 20344 + }, + { + "epoch": 0.6358125, + "grad_norm": 3.125, + "grad_norm_var": 0.045628865559895836, + "learning_rate": 0.0001, + "loss": 5.4948, + "loss/crossentropy": 2.3818790912628174, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16480587422847748, + "step": 20346 + }, + { + "epoch": 0.635875, + "grad_norm": 3.140625, + "grad_norm_var": 0.04334208170572917, + "learning_rate": 0.0001, + "loss": 5.5542, + "loss/crossentropy": 2.4394543170928955, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16265016049146652, + "step": 20348 + }, + { + "epoch": 0.6359375, + "grad_norm": 3.1875, + "grad_norm_var": 0.0401763916015625, + "learning_rate": 0.0001, + "loss": 5.477, + "loss/crossentropy": 2.4162708520889282, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15880703181028366, + "step": 20350 + }, + { + "epoch": 0.636, + "grad_norm": 3.109375, + "grad_norm_var": 0.0600250244140625, + "learning_rate": 0.0001, + "loss": 5.037, + "loss/crossentropy": 2.1076958179473877, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1476190909743309, + "step": 20352 + }, + { + "epoch": 0.6360625, + "grad_norm": 3.28125, + "grad_norm_var": 0.042235310872395834, + "learning_rate": 0.0001, + "loss": 5.5061, + "loss/crossentropy": 2.429261088371277, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16549523919820786, + "step": 20354 + }, + { + "epoch": 0.636125, + "grad_norm": 3.75, + "grad_norm_var": 0.04729410807291667, + "learning_rate": 0.0001, + "loss": 5.7567, + "loss/crossentropy": 2.602792263031006, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16539494693279266, + "step": 20356 + }, + { + "epoch": 0.6361875, + "grad_norm": 2.8125, + "grad_norm_var": 0.0538482666015625, + "learning_rate": 0.0001, + "loss": 5.4911, + "loss/crossentropy": 2.491291046142578, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15427955985069275, + "step": 20358 + }, + { + "epoch": 0.63625, + "grad_norm": 3.21875, + "grad_norm_var": 0.061083984375, + "learning_rate": 0.0001, + "loss": 5.5776, + "loss/crossentropy": 2.5022329092025757, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16026996076107025, + "step": 20360 + }, + { + "epoch": 0.6363125, + "grad_norm": 3.515625, + "grad_norm_var": 0.06746317545572916, + "learning_rate": 0.0001, + "loss": 5.7371, + "loss/crossentropy": 2.6219738721847534, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16190487891435623, + "step": 20362 + }, + { + "epoch": 0.636375, + "grad_norm": 3.21875, + "grad_norm_var": 0.06974283854166667, + "learning_rate": 0.0001, + "loss": 5.859, + "loss/crossentropy": 2.569133162498474, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.178208127617836, + "step": 20364 + }, + { + "epoch": 0.6364375, + "grad_norm": 3.015625, + "grad_norm_var": 0.07108968098958333, + "learning_rate": 0.0001, + "loss": 5.7103, + "loss/crossentropy": 2.5686323642730713, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16728801280260086, + "step": 20366 + }, + { + "epoch": 0.6365, + "grad_norm": 2.65625, + "grad_norm_var": 0.07828776041666667, + "learning_rate": 0.0001, + "loss": 5.2009, + "loss/crossentropy": 2.3160207271575928, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.146303191781044, + "step": 20368 + }, + { + "epoch": 0.6365625, + "grad_norm": 2.859375, + "grad_norm_var": 0.0799224853515625, + "learning_rate": 0.0001, + "loss": 5.7433, + "loss/crossentropy": 2.633384346961975, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16294124722480774, + "step": 20370 + }, + { + "epoch": 0.636625, + "grad_norm": 2.890625, + "grad_norm_var": 0.051390584309895834, + "learning_rate": 0.0001, + "loss": 5.4866, + "loss/crossentropy": 2.462988018989563, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15704964846372604, + "step": 20372 + }, + { + "epoch": 0.6366875, + "grad_norm": 3.25, + "grad_norm_var": 0.051634724934895834, + "learning_rate": 0.0001, + "loss": 5.4468, + "loss/crossentropy": 2.3518134355545044, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16301118582487106, + "step": 20374 + }, + { + "epoch": 0.63675, + "grad_norm": 3.1875, + "grad_norm_var": 0.05366109212239583, + "learning_rate": 0.0001, + "loss": 5.9309, + "loss/crossentropy": 2.7629919052124023, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16952699422836304, + "step": 20376 + }, + { + "epoch": 0.6368125, + "grad_norm": 3.125, + "grad_norm_var": 0.0410797119140625, + "learning_rate": 0.0001, + "loss": 6.0113, + "loss/crossentropy": 2.8072842359542847, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17079093307256699, + "step": 20378 + }, + { + "epoch": 0.636875, + "grad_norm": 2.90625, + "grad_norm_var": 0.0366851806640625, + "learning_rate": 0.0001, + "loss": 5.7854, + "loss/crossentropy": 2.6800915002822876, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16443432867527008, + "step": 20380 + }, + { + "epoch": 0.6369375, + "grad_norm": 2.859375, + "grad_norm_var": 0.03655192057291667, + "learning_rate": 0.0001, + "loss": 5.6768, + "loss/crossentropy": 2.5980231761932373, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16256462782621384, + "step": 20382 + }, + { + "epoch": 0.637, + "grad_norm": 3.046875, + "grad_norm_var": 0.028107706705729166, + "learning_rate": 0.0001, + "loss": 5.5433, + "loss/crossentropy": 2.491496205329895, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16025366634130478, + "step": 20384 + }, + { + "epoch": 0.6370625, + "grad_norm": 3.40625, + "grad_norm_var": 0.03585611979166667, + "learning_rate": 0.0001, + "loss": 5.4569, + "loss/crossentropy": 2.4254097938537598, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.154323972761631, + "step": 20386 + }, + { + "epoch": 0.637125, + "grad_norm": 3.015625, + "grad_norm_var": 0.03494466145833333, + "learning_rate": 0.0001, + "loss": 5.5164, + "loss/crossentropy": 2.4353718757629395, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1616169586777687, + "step": 20388 + }, + { + "epoch": 0.6371875, + "grad_norm": 3.046875, + "grad_norm_var": 0.031061808268229168, + "learning_rate": 0.0001, + "loss": 5.886, + "loss/crossentropy": 2.7061909437179565, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16836927086114883, + "step": 20390 + }, + { + "epoch": 0.63725, + "grad_norm": 3.328125, + "grad_norm_var": 0.027180989583333332, + "learning_rate": 0.0001, + "loss": 5.7933, + "loss/crossentropy": 2.5726131200790405, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17128415405750275, + "step": 20392 + }, + { + "epoch": 0.6373125, + "grad_norm": 3.609375, + "grad_norm_var": 0.0496978759765625, + "learning_rate": 0.0001, + "loss": 5.3625, + "loss/crossentropy": 2.32717502117157, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1593942791223526, + "step": 20394 + }, + { + "epoch": 0.637375, + "grad_norm": 2.84375, + "grad_norm_var": 0.0496734619140625, + "learning_rate": 0.0001, + "loss": 5.6418, + "loss/crossentropy": 2.4990261793136597, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16700708121061325, + "step": 20396 + }, + { + "epoch": 0.6374375, + "grad_norm": 3.015625, + "grad_norm_var": 0.04729817708333333, + "learning_rate": 0.0001, + "loss": 5.7106, + "loss/crossentropy": 2.536941170692444, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16697710752487183, + "step": 20398 + }, + { + "epoch": 0.6375, + "grad_norm": 3.15625, + "grad_norm_var": 0.043603515625, + "learning_rate": 0.0001, + "loss": 5.7261, + "loss/crossentropy": 2.541267156600952, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17043745517730713, + "step": 20400 + }, + { + "epoch": 0.6375625, + "grad_norm": 2.84375, + "grad_norm_var": 0.04146728515625, + "learning_rate": 0.0001, + "loss": 5.7255, + "loss/crossentropy": 2.630297064781189, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.159913070499897, + "step": 20402 + }, + { + "epoch": 0.637625, + "grad_norm": 3.078125, + "grad_norm_var": 0.04138997395833333, + "learning_rate": 0.0001, + "loss": 5.7414, + "loss/crossentropy": 2.5599911212921143, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16892090439796448, + "step": 20404 + }, + { + "epoch": 0.6376875, + "grad_norm": 3.203125, + "grad_norm_var": 0.04202067057291667, + "learning_rate": 0.0001, + "loss": 5.7447, + "loss/crossentropy": 2.6057400703430176, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16858305037021637, + "step": 20406 + }, + { + "epoch": 0.63775, + "grad_norm": 2.9375, + "grad_norm_var": 0.039549763997395834, + "learning_rate": 0.0001, + "loss": 5.7431, + "loss/crossentropy": 2.6216236352920532, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16488446295261383, + "step": 20408 + }, + { + "epoch": 0.6378125, + "grad_norm": 3.0625, + "grad_norm_var": 0.018131510416666666, + "learning_rate": 0.0001, + "loss": 5.5229, + "loss/crossentropy": 2.42576265335083, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16557565331459045, + "step": 20410 + }, + { + "epoch": 0.637875, + "grad_norm": 3.5, + "grad_norm_var": 0.02515869140625, + "learning_rate": 0.0001, + "loss": 5.6878, + "loss/crossentropy": 2.5755895376205444, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1639513000845909, + "step": 20412 + }, + { + "epoch": 0.6379375, + "grad_norm": 3.671875, + "grad_norm_var": 0.04109598795572917, + "learning_rate": 0.0001, + "loss": 5.824, + "loss/crossentropy": 2.5592713356018066, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.172563798725605, + "step": 20414 + }, + { + "epoch": 0.638, + "grad_norm": 3.40625, + "grad_norm_var": 0.04631245930989583, + "learning_rate": 0.0001, + "loss": 5.8793, + "loss/crossentropy": 2.6633025407791138, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16964267194271088, + "step": 20416 + }, + { + "epoch": 0.6380625, + "grad_norm": 3.25, + "grad_norm_var": 0.039990234375, + "learning_rate": 0.0001, + "loss": 5.9089, + "loss/crossentropy": 2.712688684463501, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1688438281416893, + "step": 20418 + }, + { + "epoch": 0.638125, + "grad_norm": 2.90625, + "grad_norm_var": 0.05426025390625, + "learning_rate": 0.0001, + "loss": 5.3497, + "loss/crossentropy": 2.433741807937622, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.1505783349275589, + "step": 20420 + }, + { + "epoch": 0.6381875, + "grad_norm": 3.21875, + "grad_norm_var": 0.05386962890625, + "learning_rate": 0.0001, + "loss": 5.897, + "loss/crossentropy": 2.6967657804489136, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16767611354589462, + "step": 20422 + }, + { + "epoch": 0.63825, + "grad_norm": 3.140625, + "grad_norm_var": 0.0518707275390625, + "learning_rate": 0.0001, + "loss": 5.5642, + "loss/crossentropy": 2.3478355407714844, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17241621762514114, + "step": 20424 + }, + { + "epoch": 0.6383125, + "grad_norm": 3.359375, + "grad_norm_var": 0.058430989583333336, + "learning_rate": 0.0001, + "loss": 5.7552, + "loss/crossentropy": 2.456655979156494, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17516592890024185, + "step": 20426 + }, + { + "epoch": 0.638375, + "grad_norm": 3.3125, + "grad_norm_var": 0.06259358723958333, + "learning_rate": 0.0001, + "loss": 5.6379, + "loss/crossentropy": 2.5944056510925293, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15590814501047134, + "step": 20428 + }, + { + "epoch": 0.6384375, + "grad_norm": 3.171875, + "grad_norm_var": 0.05068359375, + "learning_rate": 0.0001, + "loss": 5.4455, + "loss/crossentropy": 2.4222077131271362, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15350348502397537, + "step": 20430 + }, + { + "epoch": 0.6385, + "grad_norm": 3.5625, + "grad_norm_var": 0.05671284993489583, + "learning_rate": 0.0001, + "loss": 5.7682, + "loss/crossentropy": 2.5905617475509644, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16971492767333984, + "step": 20432 + }, + { + "epoch": 0.6385625, + "grad_norm": 2.921875, + "grad_norm_var": 0.0601959228515625, + "learning_rate": 0.0001, + "loss": 5.7718, + "loss/crossentropy": 2.6473952531814575, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16751841455698013, + "step": 20434 + }, + { + "epoch": 0.638625, + "grad_norm": 3.578125, + "grad_norm_var": 0.05845947265625, + "learning_rate": 0.0001, + "loss": 5.902, + "loss/crossentropy": 2.6589845418930054, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1727406159043312, + "step": 20436 + }, + { + "epoch": 0.6386875, + "grad_norm": 3.09375, + "grad_norm_var": 0.05920817057291667, + "learning_rate": 0.0001, + "loss": 5.5358, + "loss/crossentropy": 2.4186350107192993, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16054274141788483, + "step": 20438 + }, + { + "epoch": 0.63875, + "grad_norm": 3.34375, + "grad_norm_var": 0.05964253743489583, + "learning_rate": 0.0001, + "loss": 5.7618, + "loss/crossentropy": 2.552546262741089, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17131836712360382, + "step": 20440 + }, + { + "epoch": 0.6388125, + "grad_norm": 3.203125, + "grad_norm_var": 0.0472808837890625, + "learning_rate": 0.0001, + "loss": 5.7838, + "loss/crossentropy": 2.559280514717102, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.1689380407333374, + "step": 20442 + }, + { + "epoch": 0.638875, + "grad_norm": 3.359375, + "grad_norm_var": 0.042399088541666664, + "learning_rate": 0.0001, + "loss": 5.5916, + "loss/crossentropy": 2.479541301727295, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.162373349070549, + "step": 20444 + }, + { + "epoch": 0.6389375, + "grad_norm": 2.921875, + "grad_norm_var": 0.045710245768229164, + "learning_rate": 0.0001, + "loss": 5.257, + "loss/crossentropy": 2.314408540725708, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1497253105044365, + "step": 20446 + }, + { + "epoch": 0.639, + "grad_norm": 3.328125, + "grad_norm_var": 0.039143880208333336, + "learning_rate": 0.0001, + "loss": 5.7524, + "loss/crossentropy": 2.600519895553589, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16792187094688416, + "step": 20448 + }, + { + "epoch": 0.6390625, + "grad_norm": 3.359375, + "grad_norm_var": 0.03867899576822917, + "learning_rate": 0.0001, + "loss": 5.8297, + "loss/crossentropy": 2.672330379486084, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.167303629219532, + "step": 20450 + }, + { + "epoch": 0.639125, + "grad_norm": 3.515625, + "grad_norm_var": 0.0659576416015625, + "learning_rate": 0.0001, + "loss": 5.675, + "loss/crossentropy": 2.4223849773406982, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1760462373495102, + "step": 20452 + }, + { + "epoch": 0.6391875, + "grad_norm": 3.703125, + "grad_norm_var": 0.08121337890625, + "learning_rate": 0.0001, + "loss": 5.605, + "loss/crossentropy": 2.5124597549438477, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1608152985572815, + "step": 20454 + }, + { + "epoch": 0.63925, + "grad_norm": 3.0625, + "grad_norm_var": 0.08369140625, + "learning_rate": 0.0001, + "loss": 5.82, + "loss/crossentropy": 2.6119704246520996, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17392588406801224, + "step": 20456 + }, + { + "epoch": 0.6393125, + "grad_norm": 3.265625, + "grad_norm_var": 0.0875152587890625, + "learning_rate": 0.0001, + "loss": 5.9441, + "loss/crossentropy": 2.8331947326660156, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16304653882980347, + "step": 20458 + }, + { + "epoch": 0.639375, + "grad_norm": 3.296875, + "grad_norm_var": 0.08245340983072917, + "learning_rate": 0.0001, + "loss": 5.7966, + "loss/crossentropy": 2.632740616798401, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16872943937778473, + "step": 20460 + }, + { + "epoch": 0.6394375, + "grad_norm": 3.1875, + "grad_norm_var": 0.068310546875, + "learning_rate": 0.0001, + "loss": 5.764, + "loss/crossentropy": 2.6361597776412964, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.17020221799612045, + "step": 20462 + }, + { + "epoch": 0.6395, + "grad_norm": 3.03125, + "grad_norm_var": 0.06526285807291667, + "learning_rate": 0.0001, + "loss": 5.9375, + "loss/crossentropy": 2.679958701133728, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17458638548851013, + "step": 20464 + }, + { + "epoch": 0.6395625, + "grad_norm": 2.875, + "grad_norm_var": 0.07730712890625, + "learning_rate": 0.0001, + "loss": 5.7443, + "loss/crossentropy": 2.6192407608032227, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1671948879957199, + "step": 20466 + }, + { + "epoch": 0.639625, + "grad_norm": 2.953125, + "grad_norm_var": 0.0546539306640625, + "learning_rate": 0.0001, + "loss": 5.5122, + "loss/crossentropy": 2.5090534687042236, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15812255442142487, + "step": 20468 + }, + { + "epoch": 0.6396875, + "grad_norm": 3.296875, + "grad_norm_var": 0.03459879557291667, + "learning_rate": 0.0001, + "loss": 5.4934, + "loss/crossentropy": 2.371386408805847, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1606387421488762, + "step": 20470 + }, + { + "epoch": 0.63975, + "grad_norm": 3.203125, + "grad_norm_var": 0.030985514322916668, + "learning_rate": 0.0001, + "loss": 5.862, + "loss/crossentropy": 2.667320728302002, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17064325511455536, + "step": 20472 + }, + { + "epoch": 0.6398125, + "grad_norm": 3.03125, + "grad_norm_var": 0.028815714518229167, + "learning_rate": 0.0001, + "loss": 5.8138, + "loss/crossentropy": 2.7414956092834473, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15996047109365463, + "step": 20474 + }, + { + "epoch": 0.639875, + "grad_norm": 3.59375, + "grad_norm_var": 0.1031890869140625, + "learning_rate": 0.0001, + "loss": 6.0073, + "loss/crossentropy": 2.7020500898361206, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17544400691986084, + "step": 20476 + }, + { + "epoch": 0.6399375, + "grad_norm": 2.953125, + "grad_norm_var": 0.1067047119140625, + "learning_rate": 0.0001, + "loss": 5.6185, + "loss/crossentropy": 2.5471861362457275, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15986835211515427, + "step": 20478 + }, + { + "epoch": 0.64, + "grad_norm": 3.15625, + "grad_norm_var": 0.10097249348958333, + "learning_rate": 0.0001, + "loss": 5.844, + "loss/crossentropy": 2.666524887084961, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.170479454100132, + "step": 20480 + }, + { + "epoch": 0.6400625, + "grad_norm": 3.65625, + "grad_norm_var": 0.11204020182291667, + "learning_rate": 0.0001, + "loss": 5.9971, + "loss/crossentropy": 2.670848250389099, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1771557331085205, + "step": 20482 + }, + { + "epoch": 0.640125, + "grad_norm": 3.125, + "grad_norm_var": 0.100830078125, + "learning_rate": 0.0001, + "loss": 5.4289, + "loss/crossentropy": 2.423728346824646, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15793531388044357, + "step": 20484 + }, + { + "epoch": 0.6401875, + "grad_norm": 3.09375, + "grad_norm_var": 0.10591532389322916, + "learning_rate": 0.0001, + "loss": 5.6005, + "loss/crossentropy": 2.5539534091949463, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15778449177742004, + "step": 20486 + }, + { + "epoch": 0.64025, + "grad_norm": 3.21875, + "grad_norm_var": 0.10319722493489583, + "learning_rate": 0.0001, + "loss": 5.6379, + "loss/crossentropy": 2.4964656829833984, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16765498369932175, + "step": 20488 + }, + { + "epoch": 0.6403125, + "grad_norm": 3.15625, + "grad_norm_var": 0.09967041015625, + "learning_rate": 0.0001, + "loss": 5.8429, + "loss/crossentropy": 2.6325289011001587, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17259898036718369, + "step": 20490 + }, + { + "epoch": 0.640375, + "grad_norm": 3.09375, + "grad_norm_var": 0.03677978515625, + "learning_rate": 0.0001, + "loss": 6.0761, + "loss/crossentropy": 2.8093377351760864, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17667745053768158, + "step": 20492 + }, + { + "epoch": 0.6404375, + "grad_norm": 3.0625, + "grad_norm_var": 0.03414306640625, + "learning_rate": 0.0001, + "loss": 5.988, + "loss/crossentropy": 2.711090922355652, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17847144603729248, + "step": 20494 + }, + { + "epoch": 0.6405, + "grad_norm": 3.71875, + "grad_norm_var": 0.05367431640625, + "learning_rate": 0.0001, + "loss": 5.4906, + "loss/crossentropy": 2.3758429288864136, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.161868117749691, + "step": 20496 + }, + { + "epoch": 0.6405625, + "grad_norm": 3.125, + "grad_norm_var": 0.03181864420572917, + "learning_rate": 0.0001, + "loss": 5.3742, + "loss/crossentropy": 2.3135257959365845, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15997402369976044, + "step": 20498 + }, + { + "epoch": 0.640625, + "grad_norm": 3.59375, + "grad_norm_var": 0.039078776041666666, + "learning_rate": 0.0001, + "loss": 5.4782, + "loss/crossentropy": 2.4464190006256104, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15825734287500381, + "step": 20500 + }, + { + "epoch": 0.6406875, + "grad_norm": 3.203125, + "grad_norm_var": 0.04088134765625, + "learning_rate": 0.0001, + "loss": 5.2256, + "loss/crossentropy": 2.296721577644348, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.14952635765075684, + "step": 20502 + }, + { + "epoch": 0.64075, + "grad_norm": 2.921875, + "grad_norm_var": 0.04540608723958333, + "learning_rate": 0.0001, + "loss": 5.4201, + "loss/crossentropy": 2.388209342956543, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15748750418424606, + "step": 20504 + }, + { + "epoch": 0.6408125, + "grad_norm": 2.984375, + "grad_norm_var": 0.050568644205729166, + "learning_rate": 0.0001, + "loss": 5.5847, + "loss/crossentropy": 2.496085286140442, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16120856255292892, + "step": 20506 + }, + { + "epoch": 0.640875, + "grad_norm": 3.296875, + "grad_norm_var": 0.052277628580729166, + "learning_rate": 0.0001, + "loss": 5.664, + "loss/crossentropy": 2.503101348876953, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16531158983707428, + "step": 20508 + }, + { + "epoch": 0.6409375, + "grad_norm": 2.828125, + "grad_norm_var": 0.06347249348958334, + "learning_rate": 0.0001, + "loss": 5.6787, + "loss/crossentropy": 2.5898317098617554, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16005895286798477, + "step": 20510 + }, + { + "epoch": 0.641, + "grad_norm": 2.875, + "grad_norm_var": 0.04778238932291667, + "learning_rate": 0.0001, + "loss": 5.8977, + "loss/crossentropy": 2.720097303390503, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.17361589521169662, + "step": 20512 + }, + { + "epoch": 0.6410625, + "grad_norm": 3.21875, + "grad_norm_var": 0.0487945556640625, + "learning_rate": 0.0001, + "loss": 5.8166, + "loss/crossentropy": 2.6614229679107666, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16708486527204514, + "step": 20514 + }, + { + "epoch": 0.641125, + "grad_norm": 3.1875, + "grad_norm_var": 0.03380533854166667, + "learning_rate": 0.0001, + "loss": 5.6232, + "loss/crossentropy": 2.4592409133911133, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16991379112005234, + "step": 20516 + }, + { + "epoch": 0.6411875, + "grad_norm": 3.15625, + "grad_norm_var": 0.031168619791666668, + "learning_rate": 0.0001, + "loss": 5.7806, + "loss/crossentropy": 2.6537065505981445, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1669885218143463, + "step": 20518 + }, + { + "epoch": 0.64125, + "grad_norm": 3.125, + "grad_norm_var": 0.028563435872395834, + "learning_rate": 0.0001, + "loss": 6.0185, + "loss/crossentropy": 2.7539583444595337, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17606395483016968, + "step": 20520 + }, + { + "epoch": 0.6413125, + "grad_norm": 2.921875, + "grad_norm_var": 0.025484212239583335, + "learning_rate": 0.0001, + "loss": 5.5576, + "loss/crossentropy": 2.5074567794799805, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16125962138175964, + "step": 20522 + }, + { + "epoch": 0.641375, + "grad_norm": 3.171875, + "grad_norm_var": 0.025389607747395834, + "learning_rate": 0.0001, + "loss": 5.7862, + "loss/crossentropy": 2.631861090660095, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16504361480474472, + "step": 20524 + }, + { + "epoch": 0.6414375, + "grad_norm": 4.21875, + "grad_norm_var": 0.09474283854166667, + "learning_rate": 0.0001, + "loss": 5.8544, + "loss/crossentropy": 2.5563753843307495, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17746364325284958, + "step": 20526 + }, + { + "epoch": 0.6415, + "grad_norm": 3.46875, + "grad_norm_var": 0.08902587890625, + "learning_rate": 0.0001, + "loss": 5.7431, + "loss/crossentropy": 2.5309959650039673, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16886798292398453, + "step": 20528 + }, + { + "epoch": 0.6415625, + "grad_norm": 2.75, + "grad_norm_var": 0.10445556640625, + "learning_rate": 0.0001, + "loss": 5.6452, + "loss/crossentropy": 2.6100722551345825, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15975908190011978, + "step": 20530 + }, + { + "epoch": 0.641625, + "grad_norm": 3.078125, + "grad_norm_var": 0.10625, + "learning_rate": 0.0001, + "loss": 5.6942, + "loss/crossentropy": 2.5716564655303955, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16342197358608246, + "step": 20532 + }, + { + "epoch": 0.6416875, + "grad_norm": 3.375, + "grad_norm_var": 0.10946858723958333, + "learning_rate": 0.0001, + "loss": 5.7065, + "loss/crossentropy": 2.552822470664978, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1688838005065918, + "step": 20534 + }, + { + "epoch": 0.64175, + "grad_norm": 3.125, + "grad_norm_var": 0.11346028645833334, + "learning_rate": 0.0001, + "loss": 5.8805, + "loss/crossentropy": 2.661011815071106, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17312116920948029, + "step": 20536 + }, + { + "epoch": 0.6418125, + "grad_norm": 3.25, + "grad_norm_var": 0.10871988932291667, + "learning_rate": 0.0001, + "loss": 5.8692, + "loss/crossentropy": 2.5903857946395874, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1778780147433281, + "step": 20538 + }, + { + "epoch": 0.641875, + "grad_norm": 3.40625, + "grad_norm_var": 0.10724283854166666, + "learning_rate": 0.0001, + "loss": 5.8436, + "loss/crossentropy": 2.5993826389312744, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1732523962855339, + "step": 20540 + }, + { + "epoch": 0.6419375, + "grad_norm": 3.203125, + "grad_norm_var": 0.0626373291015625, + "learning_rate": 0.0001, + "loss": 5.9778, + "loss/crossentropy": 2.727904796600342, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17498829215765, + "step": 20542 + }, + { + "epoch": 0.642, + "grad_norm": 3.078125, + "grad_norm_var": 0.0645416259765625, + "learning_rate": 0.0001, + "loss": 5.6195, + "loss/crossentropy": 2.527278184890747, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16507700830698013, + "step": 20544 + }, + { + "epoch": 0.6420625, + "grad_norm": 3.109375, + "grad_norm_var": 0.0537017822265625, + "learning_rate": 0.0001, + "loss": 5.6138, + "loss/crossentropy": 2.535588502883911, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16251179575920105, + "step": 20546 + }, + { + "epoch": 0.642125, + "grad_norm": 3.140625, + "grad_norm_var": 0.05260009765625, + "learning_rate": 0.0001, + "loss": 5.7288, + "loss/crossentropy": 2.600610852241516, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16515814512968063, + "step": 20548 + }, + { + "epoch": 0.6421875, + "grad_norm": 3.109375, + "grad_norm_var": 0.05413004557291667, + "learning_rate": 0.0001, + "loss": 5.6988, + "loss/crossentropy": 2.5278310775756836, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16435878723859787, + "step": 20550 + }, + { + "epoch": 0.64225, + "grad_norm": 2.953125, + "grad_norm_var": 0.05419514973958333, + "learning_rate": 0.0001, + "loss": 5.8973, + "loss/crossentropy": 2.7441052198410034, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16727203875780106, + "step": 20552 + }, + { + "epoch": 0.6423125, + "grad_norm": 2.890625, + "grad_norm_var": 0.05361328125, + "learning_rate": 0.0001, + "loss": 5.7992, + "loss/crossentropy": 2.6445915699005127, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16741015017032623, + "step": 20554 + }, + { + "epoch": 0.642375, + "grad_norm": 2.96875, + "grad_norm_var": 0.06968485514322917, + "learning_rate": 0.0001, + "loss": 5.2992, + "loss/crossentropy": 2.3941731452941895, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.14831183850765228, + "step": 20556 + }, + { + "epoch": 0.6424375, + "grad_norm": 2.90625, + "grad_norm_var": 0.03802083333333333, + "learning_rate": 0.0001, + "loss": 5.497, + "loss/crossentropy": 2.4603993892669678, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1599094420671463, + "step": 20558 + }, + { + "epoch": 0.6425, + "grad_norm": 3.03125, + "grad_norm_var": 0.03727925618489583, + "learning_rate": 0.0001, + "loss": 5.4223, + "loss/crossentropy": 2.4426238536834717, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1495300829410553, + "step": 20560 + }, + { + "epoch": 0.6425625, + "grad_norm": 3.015625, + "grad_norm_var": 0.041276041666666666, + "learning_rate": 0.0001, + "loss": 5.5665, + "loss/crossentropy": 2.448686718940735, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16412575542926788, + "step": 20562 + }, + { + "epoch": 0.642625, + "grad_norm": 3.484375, + "grad_norm_var": 0.05435791015625, + "learning_rate": 0.0001, + "loss": 5.9063, + "loss/crossentropy": 2.6836531162261963, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17069945484399796, + "step": 20564 + }, + { + "epoch": 0.6426875, + "grad_norm": 3.15625, + "grad_norm_var": 0.0501861572265625, + "learning_rate": 0.0001, + "loss": 5.6015, + "loss/crossentropy": 2.438377857208252, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16982437670230865, + "step": 20566 + }, + { + "epoch": 0.64275, + "grad_norm": 3.125, + "grad_norm_var": 0.04888916015625, + "learning_rate": 0.0001, + "loss": 5.6914, + "loss/crossentropy": 2.5879935026168823, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16424890607595444, + "step": 20568 + }, + { + "epoch": 0.6428125, + "grad_norm": 3.28125, + "grad_norm_var": 0.05414937337239583, + "learning_rate": 0.0001, + "loss": 6.0771, + "loss/crossentropy": 2.7591071128845215, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.18413875252008438, + "step": 20570 + }, + { + "epoch": 0.642875, + "grad_norm": 3.171875, + "grad_norm_var": 0.033234659830729166, + "learning_rate": 0.0001, + "loss": 5.6281, + "loss/crossentropy": 2.5197192430496216, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16435106843709946, + "step": 20572 + }, + { + "epoch": 0.6429375, + "grad_norm": 3.0, + "grad_norm_var": 0.02867431640625, + "learning_rate": 0.0001, + "loss": 5.4993, + "loss/crossentropy": 2.4176703691482544, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16245892643928528, + "step": 20574 + }, + { + "epoch": 0.643, + "grad_norm": 4.03125, + "grad_norm_var": 0.06992899576822917, + "learning_rate": 0.0001, + "loss": 5.7244, + "loss/crossentropy": 2.5312092304229736, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.16501778364181519, + "step": 20576 + }, + { + "epoch": 0.6430625, + "grad_norm": 4.125, + "grad_norm_var": 0.11340738932291666, + "learning_rate": 0.0001, + "loss": 5.7225, + "loss/crossentropy": 2.5450315475463867, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16696663200855255, + "step": 20578 + }, + { + "epoch": 0.643125, + "grad_norm": 3.078125, + "grad_norm_var": 0.11417643229166667, + "learning_rate": 0.0001, + "loss": 5.721, + "loss/crossentropy": 2.5756813287734985, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1664893478155136, + "step": 20580 + }, + { + "epoch": 0.6431875, + "grad_norm": 3.1875, + "grad_norm_var": 0.11578369140625, + "learning_rate": 0.0001, + "loss": 5.5566, + "loss/crossentropy": 2.5012201070785522, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1606113612651825, + "step": 20582 + }, + { + "epoch": 0.64325, + "grad_norm": 3.265625, + "grad_norm_var": 0.1085601806640625, + "learning_rate": 0.0001, + "loss": 5.8301, + "loss/crossentropy": 2.604956269264221, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17173685878515244, + "step": 20584 + }, + { + "epoch": 0.6433125, + "grad_norm": 3.359375, + "grad_norm_var": 0.11462300618489583, + "learning_rate": 0.0001, + "loss": 6.0687, + "loss/crossentropy": 2.6878198385238647, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18378955870866776, + "step": 20586 + }, + { + "epoch": 0.643375, + "grad_norm": 3.296875, + "grad_norm_var": 0.10331929524739583, + "learning_rate": 0.0001, + "loss": 5.8921, + "loss/crossentropy": 2.6400766372680664, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17637501657009125, + "step": 20588 + }, + { + "epoch": 0.6434375, + "grad_norm": 3.34375, + "grad_norm_var": 0.1000396728515625, + "learning_rate": 0.0001, + "loss": 5.7808, + "loss/crossentropy": 2.6643242835998535, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1628221496939659, + "step": 20590 + }, + { + "epoch": 0.6435, + "grad_norm": 3.5, + "grad_norm_var": 0.07281494140625, + "learning_rate": 0.0001, + "loss": 5.6668, + "loss/crossentropy": 2.597736358642578, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.15651683509349823, + "step": 20592 + }, + { + "epoch": 0.6435625, + "grad_norm": 2.890625, + "grad_norm_var": 0.03601786295572917, + "learning_rate": 0.0001, + "loss": 5.4977, + "loss/crossentropy": 2.4329657554626465, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15764636546373367, + "step": 20594 + }, + { + "epoch": 0.643625, + "grad_norm": 3.109375, + "grad_norm_var": 0.04039713541666667, + "learning_rate": 0.0001, + "loss": 5.8102, + "loss/crossentropy": 2.579249858856201, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17192527651786804, + "step": 20596 + }, + { + "epoch": 0.6436875, + "grad_norm": 3.1875, + "grad_norm_var": 0.04241129557291667, + "learning_rate": 0.0001, + "loss": 5.8364, + "loss/crossentropy": 2.60555100440979, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1719166487455368, + "step": 20598 + }, + { + "epoch": 0.64375, + "grad_norm": 3.359375, + "grad_norm_var": 0.04322509765625, + "learning_rate": 0.0001, + "loss": 5.4199, + "loss/crossentropy": 2.4294326305389404, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15607880055904388, + "step": 20600 + }, + { + "epoch": 0.6438125, + "grad_norm": 3.234375, + "grad_norm_var": 0.0437896728515625, + "learning_rate": 0.0001, + "loss": 5.6764, + "loss/crossentropy": 2.5371029376983643, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16979029774665833, + "step": 20602 + }, + { + "epoch": 0.643875, + "grad_norm": 3.15625, + "grad_norm_var": 0.04836832682291667, + "learning_rate": 0.0001, + "loss": 5.6754, + "loss/crossentropy": 2.4644941091537476, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17147650569677353, + "step": 20604 + }, + { + "epoch": 0.6439375, + "grad_norm": 3.0625, + "grad_norm_var": 0.051904296875, + "learning_rate": 0.0001, + "loss": 5.7433, + "loss/crossentropy": 2.5431348085403442, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17352993786334991, + "step": 20606 + }, + { + "epoch": 0.644, + "grad_norm": 2.796875, + "grad_norm_var": 0.059626261393229164, + "learning_rate": 0.0001, + "loss": 5.6671, + "loss/crossentropy": 2.5326255559921265, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16500743478536606, + "step": 20608 + }, + { + "epoch": 0.6440625, + "grad_norm": 3.171875, + "grad_norm_var": 0.06204020182291667, + "learning_rate": 0.0001, + "loss": 5.5522, + "loss/crossentropy": 2.481705665588379, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15900429338216782, + "step": 20610 + }, + { + "epoch": 0.644125, + "grad_norm": 3.171875, + "grad_norm_var": 0.062189737955729164, + "learning_rate": 0.0001, + "loss": 5.4313, + "loss/crossentropy": 2.461795449256897, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.154764823615551, + "step": 20612 + }, + { + "epoch": 0.6441875, + "grad_norm": 3.078125, + "grad_norm_var": 0.06052958170572917, + "learning_rate": 0.0001, + "loss": 5.7733, + "loss/crossentropy": 2.6286559104919434, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16837041825056076, + "step": 20614 + }, + { + "epoch": 0.64425, + "grad_norm": 3.015625, + "grad_norm_var": 0.06249593098958333, + "learning_rate": 0.0001, + "loss": 5.6556, + "loss/crossentropy": 2.5903440713882446, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16434288769960403, + "step": 20616 + }, + { + "epoch": 0.6443125, + "grad_norm": 3.0625, + "grad_norm_var": 0.056151326497395834, + "learning_rate": 0.0001, + "loss": 5.8679, + "loss/crossentropy": 2.6790761947631836, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16771499812602997, + "step": 20618 + }, + { + "epoch": 0.644375, + "grad_norm": 3.09375, + "grad_norm_var": 0.047362263997395834, + "learning_rate": 0.0001, + "loss": 5.7163, + "loss/crossentropy": 2.5803322792053223, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16398627310991287, + "step": 20620 + }, + { + "epoch": 0.6444375, + "grad_norm": 5.75, + "grad_norm_var": 0.48156636555989585, + "learning_rate": 0.0001, + "loss": 6.2044, + "loss/crossentropy": 2.9062217473983765, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1802084892988205, + "step": 20622 + }, + { + "epoch": 0.6445, + "grad_norm": 2.875, + "grad_norm_var": 0.4784820556640625, + "learning_rate": 0.0001, + "loss": 5.751, + "loss/crossentropy": 2.6165133714675903, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16774094849824905, + "step": 20624 + }, + { + "epoch": 0.6445625, + "grad_norm": 3.046875, + "grad_norm_var": 0.48323160807291665, + "learning_rate": 0.0001, + "loss": 5.5445, + "loss/crossentropy": 2.511154890060425, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.16153790056705475, + "step": 20626 + }, + { + "epoch": 0.644625, + "grad_norm": 2.96875, + "grad_norm_var": 0.4850819905598958, + "learning_rate": 0.0001, + "loss": 5.9264, + "loss/crossentropy": 2.789646625518799, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16758083552122116, + "step": 20628 + }, + { + "epoch": 0.6446875, + "grad_norm": 3.171875, + "grad_norm_var": 0.4852203369140625, + "learning_rate": 0.0001, + "loss": 5.5333, + "loss/crossentropy": 2.466044545173645, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1614159345626831, + "step": 20630 + }, + { + "epoch": 0.64475, + "grad_norm": 3.765625, + "grad_norm_var": 0.5007151285807292, + "learning_rate": 0.0001, + "loss": 5.4144, + "loss/crossentropy": 2.3553906679153442, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15824922919273376, + "step": 20632 + }, + { + "epoch": 0.6448125, + "grad_norm": 3.203125, + "grad_norm_var": 0.49524332682291666, + "learning_rate": 0.0001, + "loss": 5.7369, + "loss/crossentropy": 2.565440535545349, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17065993696451187, + "step": 20634 + }, + { + "epoch": 0.644875, + "grad_norm": 3.453125, + "grad_norm_var": 0.49905598958333336, + "learning_rate": 0.0001, + "loss": 6.0343, + "loss/crossentropy": 2.700536012649536, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18142636120319366, + "step": 20636 + }, + { + "epoch": 0.6449375, + "grad_norm": 3.3125, + "grad_norm_var": 0.06806233723958334, + "learning_rate": 0.0001, + "loss": 5.7073, + "loss/crossentropy": 2.539323091506958, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.17226801067590714, + "step": 20638 + }, + { + "epoch": 0.645, + "grad_norm": 3.296875, + "grad_norm_var": 0.06585286458333334, + "learning_rate": 0.0001, + "loss": 5.7115, + "loss/crossentropy": 2.510597586631775, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1681392937898636, + "step": 20640 + }, + { + "epoch": 0.6450625, + "grad_norm": 3.171875, + "grad_norm_var": 0.055985514322916666, + "learning_rate": 0.0001, + "loss": 5.7717, + "loss/crossentropy": 2.5721393823623657, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1703510284423828, + "step": 20642 + }, + { + "epoch": 0.645125, + "grad_norm": 3.09375, + "grad_norm_var": 0.05738016764322917, + "learning_rate": 0.0001, + "loss": 5.4939, + "loss/crossentropy": 2.4927202463150024, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15676119178533554, + "step": 20644 + }, + { + "epoch": 0.6451875, + "grad_norm": 3.265625, + "grad_norm_var": 0.05315348307291667, + "learning_rate": 0.0001, + "loss": 5.6851, + "loss/crossentropy": 2.594095230102539, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16261142492294312, + "step": 20646 + }, + { + "epoch": 0.64525, + "grad_norm": 2.96875, + "grad_norm_var": 0.029130045572916666, + "learning_rate": 0.0001, + "loss": 5.634, + "loss/crossentropy": 2.4565051794052124, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16891717910766602, + "step": 20648 + }, + { + "epoch": 0.6453125, + "grad_norm": 3.046875, + "grad_norm_var": 0.029195149739583332, + "learning_rate": 0.0001, + "loss": 5.451, + "loss/crossentropy": 2.455316662788391, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.1589415967464447, + "step": 20650 + }, + { + "epoch": 0.645375, + "grad_norm": 3.03125, + "grad_norm_var": 0.021654256184895835, + "learning_rate": 0.0001, + "loss": 6.0149, + "loss/crossentropy": 2.8132808208465576, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17406649887561798, + "step": 20652 + }, + { + "epoch": 0.6454375, + "grad_norm": 3.21875, + "grad_norm_var": 0.04731343587239583, + "learning_rate": 0.0001, + "loss": 5.8153, + "loss/crossentropy": 2.5763078927993774, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17506926506757736, + "step": 20654 + }, + { + "epoch": 0.6455, + "grad_norm": 3.140625, + "grad_norm_var": 0.04488525390625, + "learning_rate": 0.0001, + "loss": 5.5542, + "loss/crossentropy": 2.451761245727539, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16570903360843658, + "step": 20656 + }, + { + "epoch": 0.6455625, + "grad_norm": 2.953125, + "grad_norm_var": 0.04789937337239583, + "learning_rate": 0.0001, + "loss": 5.5099, + "loss/crossentropy": 2.5485183000564575, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15199314057826996, + "step": 20658 + }, + { + "epoch": 0.645625, + "grad_norm": 3.0625, + "grad_norm_var": 0.047053019205729164, + "learning_rate": 0.0001, + "loss": 5.8736, + "loss/crossentropy": 2.657061815261841, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17282658070325851, + "step": 20660 + }, + { + "epoch": 0.6456875, + "grad_norm": 3.296875, + "grad_norm_var": 0.046708170572916666, + "learning_rate": 0.0001, + "loss": 5.5583, + "loss/crossentropy": 2.428104043006897, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1657543182373047, + "step": 20662 + }, + { + "epoch": 0.64575, + "grad_norm": 3.171875, + "grad_norm_var": 0.08806864420572917, + "learning_rate": 0.0001, + "loss": 5.6182, + "loss/crossentropy": 2.4839119911193848, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16264984756708145, + "step": 20664 + }, + { + "epoch": 0.6458125, + "grad_norm": 3.078125, + "grad_norm_var": 0.2552571614583333, + "learning_rate": 0.0001, + "loss": 5.525, + "loss/crossentropy": 2.4292737245559692, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16465438157320023, + "step": 20666 + }, + { + "epoch": 0.645875, + "grad_norm": 2.9375, + "grad_norm_var": 0.2591623942057292, + "learning_rate": 0.0001, + "loss": 5.5494, + "loss/crossentropy": 2.532724976539612, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1575249433517456, + "step": 20668 + }, + { + "epoch": 0.6459375, + "grad_norm": 3.8125, + "grad_norm_var": 0.27838541666666666, + "learning_rate": 0.0001, + "loss": 5.5343, + "loss/crossentropy": 2.400562286376953, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16376598179340363, + "step": 20670 + }, + { + "epoch": 0.646, + "grad_norm": 2.859375, + "grad_norm_var": 0.2901529947916667, + "learning_rate": 0.0001, + "loss": 5.2208, + "loss/crossentropy": 2.2743862867355347, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15167373418807983, + "step": 20672 + }, + { + "epoch": 0.6460625, + "grad_norm": 2.984375, + "grad_norm_var": 0.2821248372395833, + "learning_rate": 0.0001, + "loss": 5.4874, + "loss/crossentropy": 2.4957116842269897, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1565897911787033, + "step": 20674 + }, + { + "epoch": 0.646125, + "grad_norm": 3.171875, + "grad_norm_var": 0.28769124348958336, + "learning_rate": 0.0001, + "loss": 5.7027, + "loss/crossentropy": 2.582889437675476, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16900938749313354, + "step": 20676 + }, + { + "epoch": 0.6461875, + "grad_norm": 2.9375, + "grad_norm_var": 0.29375, + "learning_rate": 0.0001, + "loss": 5.629, + "loss/crossentropy": 2.523389458656311, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16173437237739563, + "step": 20678 + }, + { + "epoch": 0.64625, + "grad_norm": 3.28125, + "grad_norm_var": 0.25565999348958335, + "learning_rate": 0.0001, + "loss": 5.7128, + "loss/crossentropy": 2.5503554344177246, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1674199253320694, + "step": 20680 + }, + { + "epoch": 0.6463125, + "grad_norm": 3.03125, + "grad_norm_var": 0.053955078125, + "learning_rate": 0.0001, + "loss": 5.697, + "loss/crossentropy": 2.5437591075897217, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16844742000102997, + "step": 20682 + }, + { + "epoch": 0.646375, + "grad_norm": 3.390625, + "grad_norm_var": 0.05681966145833333, + "learning_rate": 0.0001, + "loss": 5.8445, + "loss/crossentropy": 2.6223974227905273, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17065011709928513, + "step": 20684 + }, + { + "epoch": 0.6464375, + "grad_norm": 2.984375, + "grad_norm_var": 0.03388671875, + "learning_rate": 0.0001, + "loss": 5.9883, + "loss/crossentropy": 2.740772008895874, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17592400312423706, + "step": 20686 + }, + { + "epoch": 0.6465, + "grad_norm": 3.1875, + "grad_norm_var": 0.0319488525390625, + "learning_rate": 0.0001, + "loss": 5.8538, + "loss/crossentropy": 2.614986300468445, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1750514805316925, + "step": 20688 + }, + { + "epoch": 0.6465625, + "grad_norm": 3.15625, + "grad_norm_var": 0.04364827473958333, + "learning_rate": 0.0001, + "loss": 6.1904, + "loss/crossentropy": 2.8218883275985718, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.18294403702020645, + "step": 20690 + }, + { + "epoch": 0.646625, + "grad_norm": 3.28125, + "grad_norm_var": 0.041779581705729166, + "learning_rate": 0.0001, + "loss": 5.6118, + "loss/crossentropy": 2.48758864402771, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16672097146511078, + "step": 20692 + }, + { + "epoch": 0.6466875, + "grad_norm": 3.28125, + "grad_norm_var": 0.03966471354166667, + "learning_rate": 0.0001, + "loss": 5.8225, + "loss/crossentropy": 2.6528961658477783, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1700805202126503, + "step": 20694 + }, + { + "epoch": 0.64675, + "grad_norm": 3.15625, + "grad_norm_var": 0.05286051432291667, + "learning_rate": 0.0001, + "loss": 5.5818, + "loss/crossentropy": 2.5731911659240723, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15671736001968384, + "step": 20696 + }, + { + "epoch": 0.6468125, + "grad_norm": 3.0625, + "grad_norm_var": 0.052057902018229164, + "learning_rate": 0.0001, + "loss": 5.8673, + "loss/crossentropy": 2.653502941131592, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17294374108314514, + "step": 20698 + }, + { + "epoch": 0.646875, + "grad_norm": 2.984375, + "grad_norm_var": 0.05219624837239583, + "learning_rate": 0.0001, + "loss": 5.3281, + "loss/crossentropy": 2.2781084775924683, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16203365474939346, + "step": 20700 + }, + { + "epoch": 0.6469375, + "grad_norm": 3.265625, + "grad_norm_var": 0.039525349934895836, + "learning_rate": 0.0001, + "loss": 5.8599, + "loss/crossentropy": 2.6676172018051147, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1715681254863739, + "step": 20702 + }, + { + "epoch": 0.647, + "grad_norm": 3.015625, + "grad_norm_var": 0.036742146809895834, + "learning_rate": 0.0001, + "loss": 5.6922, + "loss/crossentropy": 2.5569517612457275, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16743438690900803, + "step": 20704 + }, + { + "epoch": 0.6470625, + "grad_norm": 2.953125, + "grad_norm_var": 0.018550618489583334, + "learning_rate": 0.0001, + "loss": 5.7862, + "loss/crossentropy": 2.641975998878479, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.17145609855651855, + "step": 20706 + }, + { + "epoch": 0.647125, + "grad_norm": 2.90625, + "grad_norm_var": 0.018001302083333334, + "learning_rate": 0.0001, + "loss": 5.5241, + "loss/crossentropy": 2.5126917362213135, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15387942641973495, + "step": 20708 + }, + { + "epoch": 0.6471875, + "grad_norm": 2.734375, + "grad_norm_var": 0.0216461181640625, + "learning_rate": 0.0001, + "loss": 5.4066, + "loss/crossentropy": 2.474587321281433, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.14944685995578766, + "step": 20710 + }, + { + "epoch": 0.64725, + "grad_norm": 3.0, + "grad_norm_var": 0.01705322265625, + "learning_rate": 0.0001, + "loss": 5.8376, + "loss/crossentropy": 2.6972298622131348, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1648198664188385, + "step": 20712 + }, + { + "epoch": 0.6473125, + "grad_norm": 2.921875, + "grad_norm_var": 0.0204010009765625, + "learning_rate": 0.0001, + "loss": 5.6284, + "loss/crossentropy": 2.530660629272461, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16329185664653778, + "step": 20714 + }, + { + "epoch": 0.647375, + "grad_norm": 3.359375, + "grad_norm_var": 0.03780924479166667, + "learning_rate": 0.0001, + "loss": 5.8959, + "loss/crossentropy": 2.6485893726348877, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17668356746435165, + "step": 20716 + }, + { + "epoch": 0.6474375, + "grad_norm": 2.890625, + "grad_norm_var": 0.03723958333333333, + "learning_rate": 0.0001, + "loss": 5.5549, + "loss/crossentropy": 2.5403462648391724, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15771017968654633, + "step": 20718 + }, + { + "epoch": 0.6475, + "grad_norm": 3.59375, + "grad_norm_var": 0.0558013916015625, + "learning_rate": 0.0001, + "loss": 5.9226, + "loss/crossentropy": 2.650268077850342, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17723120003938675, + "step": 20720 + }, + { + "epoch": 0.6475625, + "grad_norm": 7.3125, + "grad_norm_var": 1.1860097249348958, + "learning_rate": 0.0001, + "loss": 5.3982, + "loss/crossentropy": 2.3582738637924194, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15984924882650375, + "step": 20722 + }, + { + "epoch": 0.647625, + "grad_norm": 2.90625, + "grad_norm_var": 1.1778391520182292, + "learning_rate": 0.0001, + "loss": 5.6724, + "loss/crossentropy": 2.561447024345398, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16265869140625, + "step": 20724 + }, + { + "epoch": 0.6476875, + "grad_norm": 2.984375, + "grad_norm_var": 1.1653798421223958, + "learning_rate": 0.0001, + "loss": 5.4925, + "loss/crossentropy": 2.5229510068893433, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15633327513933182, + "step": 20726 + }, + { + "epoch": 0.64775, + "grad_norm": 3.359375, + "grad_norm_var": 1.1530232747395834, + "learning_rate": 0.0001, + "loss": 5.5759, + "loss/crossentropy": 2.5082759857177734, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1579337641596794, + "step": 20728 + }, + { + "epoch": 0.6478125, + "grad_norm": 3.203125, + "grad_norm_var": 1.1388671875, + "learning_rate": 0.0001, + "loss": 5.7815, + "loss/crossentropy": 2.631260395050049, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16736982762813568, + "step": 20730 + }, + { + "epoch": 0.647875, + "grad_norm": 3.109375, + "grad_norm_var": 1.1561024983723958, + "learning_rate": 0.0001, + "loss": 5.3087, + "loss/crossentropy": 2.279842734336853, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.158743254840374, + "step": 20732 + }, + { + "epoch": 0.6479375, + "grad_norm": 3.296875, + "grad_norm_var": 1.132941691080729, + "learning_rate": 0.0001, + "loss": 5.8052, + "loss/crossentropy": 2.63310444355011, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17072393000125885, + "step": 20734 + }, + { + "epoch": 0.648, + "grad_norm": 3.015625, + "grad_norm_var": 1.13355712890625, + "learning_rate": 0.0001, + "loss": 5.8553, + "loss/crossentropy": 2.7163825035095215, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1662379875779152, + "step": 20736 + }, + { + "epoch": 0.6480625, + "grad_norm": 3.21875, + "grad_norm_var": 0.030663045247395833, + "learning_rate": 0.0001, + "loss": 5.5855, + "loss/crossentropy": 2.453284978866577, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1632222756743431, + "step": 20738 + }, + { + "epoch": 0.648125, + "grad_norm": 3.4375, + "grad_norm_var": 0.03299153645833333, + "learning_rate": 0.0001, + "loss": 5.5864, + "loss/crossentropy": 2.5098971128463745, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16116367280483246, + "step": 20740 + }, + { + "epoch": 0.6481875, + "grad_norm": 3.25, + "grad_norm_var": 0.024006144205729166, + "learning_rate": 0.0001, + "loss": 5.5355, + "loss/crossentropy": 2.530406355857849, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15441171824932098, + "step": 20742 + }, + { + "epoch": 0.64825, + "grad_norm": 3.09375, + "grad_norm_var": 0.0184234619140625, + "learning_rate": 0.0001, + "loss": 5.6487, + "loss/crossentropy": 2.559556484222412, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16555877029895782, + "step": 20744 + }, + { + "epoch": 0.6483125, + "grad_norm": 3.140625, + "grad_norm_var": 0.018928019205729167, + "learning_rate": 0.0001, + "loss": 5.6332, + "loss/crossentropy": 2.507178783416748, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1669030785560608, + "step": 20746 + }, + { + "epoch": 0.648375, + "grad_norm": 3.46875, + "grad_norm_var": 0.02216796875, + "learning_rate": 0.0001, + "loss": 5.7268, + "loss/crossentropy": 2.5367095470428467, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16822326183319092, + "step": 20748 + }, + { + "epoch": 0.6484375, + "grad_norm": 3.390625, + "grad_norm_var": 0.03330078125, + "learning_rate": 0.0001, + "loss": 5.7544, + "loss/crossentropy": 2.697374939918518, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1588297113776207, + "step": 20750 + }, + { + "epoch": 0.6485, + "grad_norm": 3.25, + "grad_norm_var": 0.031371053059895834, + "learning_rate": 0.0001, + "loss": 5.7108, + "loss/crossentropy": 2.590539574623108, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16476020216941833, + "step": 20752 + }, + { + "epoch": 0.6485625, + "grad_norm": 3.390625, + "grad_norm_var": 0.036742146809895834, + "learning_rate": 0.0001, + "loss": 5.6735, + "loss/crossentropy": 2.5535773038864136, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1639494150876999, + "step": 20754 + }, + { + "epoch": 0.648625, + "grad_norm": 2.96875, + "grad_norm_var": 0.0344879150390625, + "learning_rate": 0.0001, + "loss": 5.6174, + "loss/crossentropy": 2.572333812713623, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1568552851676941, + "step": 20756 + }, + { + "epoch": 0.6486875, + "grad_norm": 2.953125, + "grad_norm_var": 0.034375, + "learning_rate": 0.0001, + "loss": 5.7969, + "loss/crossentropy": 2.663484573364258, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16490188986063004, + "step": 20758 + }, + { + "epoch": 0.64875, + "grad_norm": 3.328125, + "grad_norm_var": 0.0371734619140625, + "learning_rate": 0.0001, + "loss": 5.9749, + "loss/crossentropy": 2.6919151544570923, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17673945426940918, + "step": 20760 + }, + { + "epoch": 0.6488125, + "grad_norm": 2.90625, + "grad_norm_var": 0.04039306640625, + "learning_rate": 0.0001, + "loss": 5.5585, + "loss/crossentropy": 2.5666253566741943, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1554359495639801, + "step": 20762 + }, + { + "epoch": 0.648875, + "grad_norm": 2.9375, + "grad_norm_var": 0.03527018229166667, + "learning_rate": 0.0001, + "loss": 5.6896, + "loss/crossentropy": 2.591607093811035, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1644899696111679, + "step": 20764 + }, + { + "epoch": 0.6489375, + "grad_norm": 3.109375, + "grad_norm_var": 0.024593098958333334, + "learning_rate": 0.0001, + "loss": 5.5727, + "loss/crossentropy": 2.4673659801483154, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16053176671266556, + "step": 20766 + }, + { + "epoch": 0.649, + "grad_norm": 3.234375, + "grad_norm_var": 0.02509765625, + "learning_rate": 0.0001, + "loss": 5.4418, + "loss/crossentropy": 2.364788770675659, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16004639863967896, + "step": 20768 + }, + { + "epoch": 0.6490625, + "grad_norm": 3.1875, + "grad_norm_var": 0.019596354166666666, + "learning_rate": 0.0001, + "loss": 5.9013, + "loss/crossentropy": 2.7390103340148926, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.17091257125139236, + "step": 20770 + }, + { + "epoch": 0.649125, + "grad_norm": 3.03125, + "grad_norm_var": 0.019872029622395832, + "learning_rate": 0.0001, + "loss": 5.6071, + "loss/crossentropy": 2.577046036720276, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16081403940916061, + "step": 20772 + }, + { + "epoch": 0.6491875, + "grad_norm": 3.140625, + "grad_norm_var": 0.018805948893229167, + "learning_rate": 0.0001, + "loss": 5.4083, + "loss/crossentropy": 2.356221079826355, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1602884978055954, + "step": 20774 + }, + { + "epoch": 0.64925, + "grad_norm": 3.21875, + "grad_norm_var": 0.016999308268229166, + "learning_rate": 0.0001, + "loss": 5.6074, + "loss/crossentropy": 2.5159261226654053, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1591450348496437, + "step": 20776 + }, + { + "epoch": 0.6493125, + "grad_norm": 3.265625, + "grad_norm_var": 0.0179107666015625, + "learning_rate": 0.0001, + "loss": 5.6751, + "loss/crossentropy": 2.467453718185425, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17193461954593658, + "step": 20778 + }, + { + "epoch": 0.649375, + "grad_norm": 3.375, + "grad_norm_var": 0.018603515625, + "learning_rate": 0.0001, + "loss": 5.6841, + "loss/crossentropy": 2.5426772832870483, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1692201793193817, + "step": 20780 + }, + { + "epoch": 0.6494375, + "grad_norm": 2.953125, + "grad_norm_var": 0.019820149739583334, + "learning_rate": 0.0001, + "loss": 5.5734, + "loss/crossentropy": 2.5002176761627197, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16356637328863144, + "step": 20782 + }, + { + "epoch": 0.6495, + "grad_norm": 3.234375, + "grad_norm_var": 0.019254557291666665, + "learning_rate": 0.0001, + "loss": 5.7456, + "loss/crossentropy": 2.587036371231079, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.168982595205307, + "step": 20784 + }, + { + "epoch": 0.6495625, + "grad_norm": 2.875, + "grad_norm_var": 0.028351847330729166, + "learning_rate": 0.0001, + "loss": 5.7281, + "loss/crossentropy": 2.5481547117233276, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16994479298591614, + "step": 20786 + }, + { + "epoch": 0.649625, + "grad_norm": 2.890625, + "grad_norm_var": 0.029816691080729166, + "learning_rate": 0.0001, + "loss": 5.5235, + "loss/crossentropy": 2.586545944213867, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15072697401046753, + "step": 20788 + }, + { + "epoch": 0.6496875, + "grad_norm": 3.0625, + "grad_norm_var": 0.036473592122395836, + "learning_rate": 0.0001, + "loss": 5.3723, + "loss/crossentropy": 2.3707480430603027, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15483906120061874, + "step": 20790 + }, + { + "epoch": 0.64975, + "grad_norm": 3.046875, + "grad_norm_var": 0.03330790201822917, + "learning_rate": 0.0001, + "loss": 5.3547, + "loss/crossentropy": 2.3479338884353638, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1541936844587326, + "step": 20792 + }, + { + "epoch": 0.6498125, + "grad_norm": 3.0625, + "grad_norm_var": 0.03268229166666667, + "learning_rate": 0.0001, + "loss": 5.9684, + "loss/crossentropy": 2.7685261964797974, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.174676351249218, + "step": 20794 + }, + { + "epoch": 0.649875, + "grad_norm": 3.328125, + "grad_norm_var": 0.031037394205729166, + "learning_rate": 0.0001, + "loss": 5.4786, + "loss/crossentropy": 2.3692315816879272, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.15937399864196777, + "step": 20796 + }, + { + "epoch": 0.6499375, + "grad_norm": 3.4375, + "grad_norm_var": 0.0392974853515625, + "learning_rate": 0.0001, + "loss": 5.9443, + "loss/crossentropy": 2.654773235321045, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17894787341356277, + "step": 20798 + }, + { + "epoch": 0.65, + "grad_norm": 3.015625, + "grad_norm_var": 0.0437408447265625, + "learning_rate": 0.0001, + "loss": 5.5112, + "loss/crossentropy": 2.430831551551819, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16155192255973816, + "step": 20800 + }, + { + "epoch": 0.6500625, + "grad_norm": 2.9375, + "grad_norm_var": 0.0348297119140625, + "learning_rate": 0.0001, + "loss": 5.5582, + "loss/crossentropy": 2.5225579738616943, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1598166674375534, + "step": 20802 + }, + { + "epoch": 0.650125, + "grad_norm": 3.203125, + "grad_norm_var": 0.0316314697265625, + "learning_rate": 0.0001, + "loss": 5.5031, + "loss/crossentropy": 2.4108450412750244, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.15883759409189224, + "step": 20804 + }, + { + "epoch": 0.6501875, + "grad_norm": 3.125, + "grad_norm_var": 0.03200581868489583, + "learning_rate": 0.0001, + "loss": 5.887, + "loss/crossentropy": 2.6739065647125244, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17287658154964447, + "step": 20806 + }, + { + "epoch": 0.65025, + "grad_norm": 3.15625, + "grad_norm_var": 0.031087239583333332, + "learning_rate": 0.0001, + "loss": 5.7727, + "loss/crossentropy": 2.559706211090088, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17364367842674255, + "step": 20808 + }, + { + "epoch": 0.6503125, + "grad_norm": 3.0, + "grad_norm_var": 0.03287353515625, + "learning_rate": 0.0001, + "loss": 5.6713, + "loss/crossentropy": 2.5572668313980103, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16492042690515518, + "step": 20810 + }, + { + "epoch": 0.650375, + "grad_norm": 3.546875, + "grad_norm_var": 0.037923177083333336, + "learning_rate": 0.0001, + "loss": 5.7524, + "loss/crossentropy": 2.5259430408477783, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17264336347579956, + "step": 20812 + }, + { + "epoch": 0.6504375, + "grad_norm": 2.984375, + "grad_norm_var": 0.0379791259765625, + "learning_rate": 0.0001, + "loss": 5.7859, + "loss/crossentropy": 2.605214238166809, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17001979798078537, + "step": 20814 + }, + { + "epoch": 0.6505, + "grad_norm": 3.8125, + "grad_norm_var": 0.07247721354166667, + "learning_rate": 0.0001, + "loss": 5.7527, + "loss/crossentropy": 2.5045515298843384, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17012901604175568, + "step": 20816 + }, + { + "epoch": 0.6505625, + "grad_norm": 3.0, + "grad_norm_var": 0.07073160807291666, + "learning_rate": 0.0001, + "loss": 6.0264, + "loss/crossentropy": 2.8550941944122314, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16986792534589767, + "step": 20818 + }, + { + "epoch": 0.650625, + "grad_norm": 3.125, + "grad_norm_var": 0.07301025390625, + "learning_rate": 0.0001, + "loss": 6.1044, + "loss/crossentropy": 2.8505197763442993, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17734120786190033, + "step": 20820 + }, + { + "epoch": 0.6506875, + "grad_norm": 3.25, + "grad_norm_var": 0.06678059895833334, + "learning_rate": 0.0001, + "loss": 5.798, + "loss/crossentropy": 2.6078113317489624, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16863227635622025, + "step": 20822 + }, + { + "epoch": 0.65075, + "grad_norm": 3.15625, + "grad_norm_var": 0.06692606608072917, + "learning_rate": 0.0001, + "loss": 5.6749, + "loss/crossentropy": 2.5907870531082153, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1615317314863205, + "step": 20824 + }, + { + "epoch": 0.6508125, + "grad_norm": 3.03125, + "grad_norm_var": 0.06776936848958333, + "learning_rate": 0.0001, + "loss": 5.6762, + "loss/crossentropy": 2.572179913520813, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1654839813709259, + "step": 20826 + }, + { + "epoch": 0.650875, + "grad_norm": 2.890625, + "grad_norm_var": 0.06816304524739583, + "learning_rate": 0.0001, + "loss": 5.6502, + "loss/crossentropy": 2.5975730419158936, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1599457710981369, + "step": 20828 + }, + { + "epoch": 0.6509375, + "grad_norm": 3.4375, + "grad_norm_var": 0.08326822916666667, + "learning_rate": 0.0001, + "loss": 5.7692, + "loss/crossentropy": 2.684068441390991, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16202397644519806, + "step": 20830 + }, + { + "epoch": 0.651, + "grad_norm": 3.234375, + "grad_norm_var": 0.03372395833333333, + "learning_rate": 0.0001, + "loss": 5.7825, + "loss/crossentropy": 2.6395801305770874, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16819562017917633, + "step": 20832 + }, + { + "epoch": 0.6510625, + "grad_norm": 2.9375, + "grad_norm_var": 0.03487040201822917, + "learning_rate": 0.0001, + "loss": 5.9598, + "loss/crossentropy": 2.7669568061828613, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.17397551983594894, + "step": 20834 + }, + { + "epoch": 0.651125, + "grad_norm": 3.09375, + "grad_norm_var": 0.03615620930989583, + "learning_rate": 0.0001, + "loss": 5.758, + "loss/crossentropy": 2.595520853996277, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17015812546014786, + "step": 20836 + }, + { + "epoch": 0.6511875, + "grad_norm": 3.109375, + "grad_norm_var": 0.02965087890625, + "learning_rate": 0.0001, + "loss": 5.8458, + "loss/crossentropy": 2.6007357835769653, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17606915533542633, + "step": 20838 + }, + { + "epoch": 0.65125, + "grad_norm": 3.078125, + "grad_norm_var": 0.028831990559895833, + "learning_rate": 0.0001, + "loss": 5.5189, + "loss/crossentropy": 2.512834906578064, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1560702845454216, + "step": 20840 + }, + { + "epoch": 0.6513125, + "grad_norm": 2.953125, + "grad_norm_var": 0.030305989583333335, + "learning_rate": 0.0001, + "loss": 5.7302, + "loss/crossentropy": 2.6041375398635864, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1649482548236847, + "step": 20842 + }, + { + "epoch": 0.651375, + "grad_norm": 3.3125, + "grad_norm_var": 0.03330078125, + "learning_rate": 0.0001, + "loss": 5.7008, + "loss/crossentropy": 2.5407623052597046, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16717741638422012, + "step": 20844 + }, + { + "epoch": 0.6514375, + "grad_norm": 4.28125, + "grad_norm_var": 0.11002197265625, + "learning_rate": 0.0001, + "loss": 6.0975, + "loss/crossentropy": 2.60309374332428, + "loss/hidden": 1.578125, + "loss/jsd": 0.0, + "loss/logits": 0.1916261613368988, + "step": 20846 + }, + { + "epoch": 0.6515, + "grad_norm": 2.96875, + "grad_norm_var": 0.10901285807291666, + "learning_rate": 0.0001, + "loss": 5.6536, + "loss/crossentropy": 2.5152251720428467, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1653994396328926, + "step": 20848 + }, + { + "epoch": 0.6515625, + "grad_norm": 3.109375, + "grad_norm_var": 0.10564676920572917, + "learning_rate": 0.0001, + "loss": 5.5958, + "loss/crossentropy": 2.4631600379943848, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16756527870893478, + "step": 20850 + }, + { + "epoch": 0.651625, + "grad_norm": 3.15625, + "grad_norm_var": 0.10240478515625, + "learning_rate": 0.0001, + "loss": 5.6409, + "loss/crossentropy": 2.5698968172073364, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16140027344226837, + "step": 20852 + }, + { + "epoch": 0.6516875, + "grad_norm": 3.34375, + "grad_norm_var": 0.10437825520833334, + "learning_rate": 0.0001, + "loss": 5.8536, + "loss/crossentropy": 2.6735728979110718, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1726933866739273, + "step": 20854 + }, + { + "epoch": 0.65175, + "grad_norm": 2.8125, + "grad_norm_var": 0.11347554524739584, + "learning_rate": 0.0001, + "loss": 5.3547, + "loss/crossentropy": 2.397574305534363, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15274271368980408, + "step": 20856 + }, + { + "epoch": 0.6518125, + "grad_norm": 3.3125, + "grad_norm_var": 0.10851949055989583, + "learning_rate": 0.0001, + "loss": 5.766, + "loss/crossentropy": 2.6301355361938477, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16710658371448517, + "step": 20858 + }, + { + "epoch": 0.651875, + "grad_norm": 3.78125, + "grad_norm_var": 0.12888997395833332, + "learning_rate": 0.0001, + "loss": 5.9323, + "loss/crossentropy": 2.753417730331421, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16750143468379974, + "step": 20860 + }, + { + "epoch": 0.6519375, + "grad_norm": 3.1875, + "grad_norm_var": 0.04876302083333333, + "learning_rate": 0.0001, + "loss": 5.6276, + "loss/crossentropy": 2.5113483667373657, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16357584297657013, + "step": 20862 + }, + { + "epoch": 0.652, + "grad_norm": 2.953125, + "grad_norm_var": 0.04888916015625, + "learning_rate": 0.0001, + "loss": 5.4786, + "loss/crossentropy": 2.478987693786621, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15465336292982101, + "step": 20864 + }, + { + "epoch": 0.6520625, + "grad_norm": 3.21875, + "grad_norm_var": 0.05211181640625, + "learning_rate": 0.0001, + "loss": 5.8281, + "loss/crossentropy": 2.576219320297241, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1744033619761467, + "step": 20866 + }, + { + "epoch": 0.652125, + "grad_norm": 3.453125, + "grad_norm_var": 0.06768290201822917, + "learning_rate": 0.0001, + "loss": 5.7824, + "loss/crossentropy": 2.516385316848755, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.173087440431118, + "step": 20868 + }, + { + "epoch": 0.6521875, + "grad_norm": 3.34375, + "grad_norm_var": 0.06589253743489583, + "learning_rate": 0.0001, + "loss": 6.0909, + "loss/crossentropy": 2.820400357246399, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17743618041276932, + "step": 20870 + }, + { + "epoch": 0.65225, + "grad_norm": 3.21875, + "grad_norm_var": 0.04413655598958333, + "learning_rate": 0.0001, + "loss": 5.9355, + "loss/crossentropy": 2.644858717918396, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1798480600118637, + "step": 20872 + }, + { + "epoch": 0.6523125, + "grad_norm": 3.125, + "grad_norm_var": 0.04957275390625, + "learning_rate": 0.0001, + "loss": 5.4576, + "loss/crossentropy": 2.409852147102356, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1563372015953064, + "step": 20874 + }, + { + "epoch": 0.652375, + "grad_norm": 3.25, + "grad_norm_var": 0.028449503580729167, + "learning_rate": 0.0001, + "loss": 6.0919, + "loss/crossentropy": 2.7479069232940674, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1816682666540146, + "step": 20876 + }, + { + "epoch": 0.6524375, + "grad_norm": 2.984375, + "grad_norm_var": 0.03329671223958333, + "learning_rate": 0.0001, + "loss": 5.4214, + "loss/crossentropy": 2.3958070278167725, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15528929978609085, + "step": 20878 + }, + { + "epoch": 0.6525, + "grad_norm": 2.875, + "grad_norm_var": 0.046337890625, + "learning_rate": 0.0001, + "loss": 5.4317, + "loss/crossentropy": 2.4208797216415405, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.15030142664909363, + "step": 20880 + }, + { + "epoch": 0.6525625, + "grad_norm": 3.203125, + "grad_norm_var": 0.04453125, + "learning_rate": 0.0001, + "loss": 5.71, + "loss/crossentropy": 2.512806534767151, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17049754410982132, + "step": 20882 + }, + { + "epoch": 0.652625, + "grad_norm": 3.984375, + "grad_norm_var": 0.0686676025390625, + "learning_rate": 0.0001, + "loss": 5.8605, + "loss/crossentropy": 2.578509211540222, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.17351631820201874, + "step": 20884 + }, + { + "epoch": 0.6526875, + "grad_norm": 3.015625, + "grad_norm_var": 0.0745025634765625, + "learning_rate": 0.0001, + "loss": 5.3133, + "loss/crossentropy": 2.328048348426819, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15360206365585327, + "step": 20886 + }, + { + "epoch": 0.65275, + "grad_norm": 2.9375, + "grad_norm_var": 0.0755035400390625, + "learning_rate": 0.0001, + "loss": 5.5179, + "loss/crossentropy": 2.4790738821029663, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16169846802949905, + "step": 20888 + }, + { + "epoch": 0.6528125, + "grad_norm": 4.15625, + "grad_norm_var": 0.1389556884765625, + "learning_rate": 0.0001, + "loss": 5.443, + "loss/crossentropy": 2.366265058517456, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16352909058332443, + "step": 20890 + }, + { + "epoch": 0.652875, + "grad_norm": 3.15625, + "grad_norm_var": 0.13908589680989583, + "learning_rate": 0.0001, + "loss": 5.8661, + "loss/crossentropy": 2.628006100654602, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1730271428823471, + "step": 20892 + }, + { + "epoch": 0.6529375, + "grad_norm": 3.28125, + "grad_norm_var": 0.14055887858072916, + "learning_rate": 0.0001, + "loss": 5.6117, + "loss/crossentropy": 2.505952835083008, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1636989787220955, + "step": 20894 + }, + { + "epoch": 0.653, + "grad_norm": 3.171875, + "grad_norm_var": 0.12853190104166667, + "learning_rate": 0.0001, + "loss": 5.8703, + "loss/crossentropy": 2.658256769180298, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1751089245080948, + "step": 20896 + }, + { + "epoch": 0.6530625, + "grad_norm": 3.265625, + "grad_norm_var": 0.13076883951822918, + "learning_rate": 0.0001, + "loss": 5.5952, + "loss/crossentropy": 2.4552528858184814, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16360285133123398, + "step": 20898 + }, + { + "epoch": 0.653125, + "grad_norm": 3.21875, + "grad_norm_var": 0.08946024576822917, + "learning_rate": 0.0001, + "loss": 5.7229, + "loss/crossentropy": 2.575530529022217, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.17099007219076157, + "step": 20900 + }, + { + "epoch": 0.6531875, + "grad_norm": 3.03125, + "grad_norm_var": 0.08788655598958334, + "learning_rate": 0.0001, + "loss": 5.6576, + "loss/crossentropy": 2.6053110361099243, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.158348947763443, + "step": 20902 + }, + { + "epoch": 0.65325, + "grad_norm": 2.96875, + "grad_norm_var": 0.09016011555989584, + "learning_rate": 0.0001, + "loss": 5.535, + "loss/crossentropy": 2.453185796737671, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16247710585594177, + "step": 20904 + }, + { + "epoch": 0.6533125, + "grad_norm": 2.921875, + "grad_norm_var": 0.027099609375, + "learning_rate": 0.0001, + "loss": 5.5489, + "loss/crossentropy": 2.5232917070388794, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15919943153858185, + "step": 20906 + }, + { + "epoch": 0.653375, + "grad_norm": 3.21875, + "grad_norm_var": 0.029466756184895835, + "learning_rate": 0.0001, + "loss": 5.4208, + "loss/crossentropy": 2.498198390007019, + "loss/hidden": 1.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.1539750099182129, + "step": 20908 + }, + { + "epoch": 0.6534375, + "grad_norm": 3.015625, + "grad_norm_var": 0.03271484375, + "learning_rate": 0.0001, + "loss": 5.8123, + "loss/crossentropy": 2.6328794956207275, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17067986726760864, + "step": 20910 + }, + { + "epoch": 0.6535, + "grad_norm": 2.96875, + "grad_norm_var": 0.03424072265625, + "learning_rate": 0.0001, + "loss": 5.6506, + "loss/crossentropy": 2.5400296449661255, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16223189234733582, + "step": 20912 + }, + { + "epoch": 0.6535625, + "grad_norm": 3.46875, + "grad_norm_var": 0.03673502604166667, + "learning_rate": 0.0001, + "loss": 5.9651, + "loss/crossentropy": 2.7477434873580933, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17603643983602524, + "step": 20914 + }, + { + "epoch": 0.653625, + "grad_norm": 3.453125, + "grad_norm_var": 0.04446207682291667, + "learning_rate": 0.0001, + "loss": 5.4801, + "loss/crossentropy": 2.4584161043167114, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1568535938858986, + "step": 20916 + }, + { + "epoch": 0.6536875, + "grad_norm": 2.953125, + "grad_norm_var": 0.06629231770833334, + "learning_rate": 0.0001, + "loss": 5.6703, + "loss/crossentropy": 2.584798574447632, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16363202035427094, + "step": 20918 + }, + { + "epoch": 0.65375, + "grad_norm": 3.3125, + "grad_norm_var": 0.061995442708333334, + "learning_rate": 0.0001, + "loss": 5.775, + "loss/crossentropy": 2.5854105949401855, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17013494670391083, + "step": 20920 + }, + { + "epoch": 0.6538125, + "grad_norm": 3.078125, + "grad_norm_var": 0.06536356608072917, + "learning_rate": 0.0001, + "loss": 5.5909, + "loss/crossentropy": 2.5749701261520386, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.16057872772216797, + "step": 20922 + }, + { + "epoch": 0.653875, + "grad_norm": 3.21875, + "grad_norm_var": 0.060887654622395836, + "learning_rate": 0.0001, + "loss": 5.8097, + "loss/crossentropy": 2.593639850616455, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1743416041135788, + "step": 20924 + }, + { + "epoch": 0.6539375, + "grad_norm": 3.0, + "grad_norm_var": 0.057515462239583336, + "learning_rate": 0.0001, + "loss": 5.6364, + "loss/crossentropy": 2.555126428604126, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.164768286049366, + "step": 20926 + }, + { + "epoch": 0.654, + "grad_norm": 3.40625, + "grad_norm_var": 0.0628326416015625, + "learning_rate": 0.0001, + "loss": 5.8082, + "loss/crossentropy": 2.6723456382751465, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16553768515586853, + "step": 20928 + }, + { + "epoch": 0.6540625, + "grad_norm": 3.125, + "grad_norm_var": 0.060498046875, + "learning_rate": 0.0001, + "loss": 5.6475, + "loss/crossentropy": 2.4885549545288086, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16706141084432602, + "step": 20930 + }, + { + "epoch": 0.654125, + "grad_norm": 3.03125, + "grad_norm_var": 0.054488118489583334, + "learning_rate": 0.0001, + "loss": 5.5655, + "loss/crossentropy": 2.4763563871383667, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16476880759000778, + "step": 20932 + }, + { + "epoch": 0.6541875, + "grad_norm": 3.171875, + "grad_norm_var": 0.026590983072916668, + "learning_rate": 0.0001, + "loss": 5.3277, + "loss/crossentropy": 2.2567296028137207, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1609998345375061, + "step": 20934 + }, + { + "epoch": 0.65425, + "grad_norm": 3.109375, + "grad_norm_var": 0.029450480143229166, + "learning_rate": 0.0001, + "loss": 5.6583, + "loss/crossentropy": 2.573989987373352, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16507582366466522, + "step": 20936 + }, + { + "epoch": 0.6543125, + "grad_norm": 3.125, + "grad_norm_var": 0.027521769205729168, + "learning_rate": 0.0001, + "loss": 5.5077, + "loss/crossentropy": 2.466416597366333, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15881579369306564, + "step": 20938 + }, + { + "epoch": 0.654375, + "grad_norm": 3.40625, + "grad_norm_var": 0.03472900390625, + "learning_rate": 0.0001, + "loss": 5.7926, + "loss/crossentropy": 2.641322612762451, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16864470392465591, + "step": 20940 + }, + { + "epoch": 0.6544375, + "grad_norm": 3.296875, + "grad_norm_var": 0.037262980143229166, + "learning_rate": 0.0001, + "loss": 5.8734, + "loss/crossentropy": 2.6494545936584473, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17474105954170227, + "step": 20942 + }, + { + "epoch": 0.6545, + "grad_norm": 3.265625, + "grad_norm_var": 0.034912109375, + "learning_rate": 0.0001, + "loss": 5.6772, + "loss/crossentropy": 2.4440113306045532, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17370863258838654, + "step": 20944 + }, + { + "epoch": 0.6545625, + "grad_norm": 2.984375, + "grad_norm_var": 0.03351949055989583, + "learning_rate": 0.0001, + "loss": 5.6806, + "loss/crossentropy": 2.58296000957489, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16445454955101013, + "step": 20946 + }, + { + "epoch": 0.654625, + "grad_norm": 3.171875, + "grad_norm_var": 0.03394266764322917, + "learning_rate": 0.0001, + "loss": 5.5769, + "loss/crossentropy": 2.4426203966140747, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1673378050327301, + "step": 20948 + }, + { + "epoch": 0.6546875, + "grad_norm": 3.46875, + "grad_norm_var": 0.03850504557291667, + "learning_rate": 0.0001, + "loss": 6.0441, + "loss/crossentropy": 2.807307004928589, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17562832683324814, + "step": 20950 + }, + { + "epoch": 0.65475, + "grad_norm": 3.28125, + "grad_norm_var": 0.037206013997395836, + "learning_rate": 0.0001, + "loss": 5.6364, + "loss/crossentropy": 2.48821759223938, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16676755249500275, + "step": 20952 + }, + { + "epoch": 0.6548125, + "grad_norm": 3.015625, + "grad_norm_var": 0.041803995768229164, + "learning_rate": 0.0001, + "loss": 5.6989, + "loss/crossentropy": 2.59624445438385, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16495025157928467, + "step": 20954 + }, + { + "epoch": 0.654875, + "grad_norm": 3.171875, + "grad_norm_var": 0.03389383951822917, + "learning_rate": 0.0001, + "loss": 5.925, + "loss/crossentropy": 2.659188747406006, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17463178932666779, + "step": 20956 + }, + { + "epoch": 0.6549375, + "grad_norm": 3.578125, + "grad_norm_var": 0.04120686848958333, + "learning_rate": 0.0001, + "loss": 5.8806, + "loss/crossentropy": 2.624800443649292, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17401887476444244, + "step": 20958 + }, + { + "epoch": 0.655, + "grad_norm": 2.953125, + "grad_norm_var": 0.04192708333333333, + "learning_rate": 0.0001, + "loss": 5.5481, + "loss/crossentropy": 2.4357486963272095, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1616295427083969, + "step": 20960 + }, + { + "epoch": 0.6550625, + "grad_norm": 3.328125, + "grad_norm_var": 0.045685831705729166, + "learning_rate": 0.0001, + "loss": 5.7787, + "loss/crossentropy": 2.6577001810073853, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16483888775110245, + "step": 20962 + }, + { + "epoch": 0.655125, + "grad_norm": 3.09375, + "grad_norm_var": 0.047761027018229166, + "learning_rate": 0.0001, + "loss": 5.8686, + "loss/crossentropy": 2.726725220680237, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16770515590906143, + "step": 20964 + }, + { + "epoch": 0.6551875, + "grad_norm": 3.0625, + "grad_norm_var": 0.045563761393229166, + "learning_rate": 0.0001, + "loss": 5.5425, + "loss/crossentropy": 2.4814471006393433, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16001326590776443, + "step": 20966 + }, + { + "epoch": 0.65525, + "grad_norm": 3.109375, + "grad_norm_var": 0.0405181884765625, + "learning_rate": 0.0001, + "loss": 5.2998, + "loss/crossentropy": 2.2771515250205994, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.15265830606222153, + "step": 20968 + }, + { + "epoch": 0.6553125, + "grad_norm": 3.0625, + "grad_norm_var": 0.0336578369140625, + "learning_rate": 0.0001, + "loss": 5.7622, + "loss/crossentropy": 2.6117970943450928, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16660621017217636, + "step": 20970 + }, + { + "epoch": 0.655375, + "grad_norm": 3.296875, + "grad_norm_var": 0.03400065104166667, + "learning_rate": 0.0001, + "loss": 5.4569, + "loss/crossentropy": 2.395979404449463, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15999948978424072, + "step": 20972 + }, + { + "epoch": 0.6554375, + "grad_norm": 2.734375, + "grad_norm_var": 0.023729451497395835, + "learning_rate": 0.0001, + "loss": 5.1661, + "loss/crossentropy": 2.211157202720642, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.14588858932256699, + "step": 20974 + }, + { + "epoch": 0.6555, + "grad_norm": 2.84375, + "grad_norm_var": 0.024235026041666666, + "learning_rate": 0.0001, + "loss": 5.6514, + "loss/crossentropy": 2.6521668434143066, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15656233578920364, + "step": 20976 + }, + { + "epoch": 0.6555625, + "grad_norm": 3.234375, + "grad_norm_var": 0.023566691080729167, + "learning_rate": 0.0001, + "loss": 5.859, + "loss/crossentropy": 2.66953444480896, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17011483013629913, + "step": 20978 + }, + { + "epoch": 0.655625, + "grad_norm": 2.921875, + "grad_norm_var": 0.024507649739583335, + "learning_rate": 0.0001, + "loss": 5.3542, + "loss/crossentropy": 2.415980339050293, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.14890418201684952, + "step": 20980 + }, + { + "epoch": 0.6556875, + "grad_norm": 2.828125, + "grad_norm_var": 0.030757649739583334, + "learning_rate": 0.0001, + "loss": 5.6073, + "loss/crossentropy": 2.585160493850708, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1561153382062912, + "step": 20982 + }, + { + "epoch": 0.65575, + "grad_norm": 2.90625, + "grad_norm_var": 0.033707682291666666, + "learning_rate": 0.0001, + "loss": 5.8202, + "loss/crossentropy": 2.6771342754364014, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16859956085681915, + "step": 20984 + }, + { + "epoch": 0.6558125, + "grad_norm": 3.015625, + "grad_norm_var": 0.0550689697265625, + "learning_rate": 0.0001, + "loss": 5.3667, + "loss/crossentropy": 2.3741567134857178, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15276531130075455, + "step": 20986 + }, + { + "epoch": 0.655875, + "grad_norm": 3.234375, + "grad_norm_var": 0.054280598958333336, + "learning_rate": 0.0001, + "loss": 5.4782, + "loss/crossentropy": 2.4002526998519897, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1620914787054062, + "step": 20988 + }, + { + "epoch": 0.6559375, + "grad_norm": 2.90625, + "grad_norm_var": 0.06416015625, + "learning_rate": 0.0001, + "loss": 5.6154, + "loss/crossentropy": 2.4854971170425415, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16689575463533401, + "step": 20990 + }, + { + "epoch": 0.656, + "grad_norm": 3.4375, + "grad_norm_var": 0.06702067057291666, + "learning_rate": 0.0001, + "loss": 5.6022, + "loss/crossentropy": 2.4607324600219727, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16337022185325623, + "step": 20992 + }, + { + "epoch": 0.6560625, + "grad_norm": 3.296875, + "grad_norm_var": 0.06819254557291667, + "learning_rate": 0.0001, + "loss": 5.9668, + "loss/crossentropy": 2.7102184295654297, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1764424666762352, + "step": 20994 + }, + { + "epoch": 0.656125, + "grad_norm": 3.09375, + "grad_norm_var": 0.06874593098958333, + "learning_rate": 0.0001, + "loss": 5.852, + "loss/crossentropy": 2.7109466791152954, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16371320933103561, + "step": 20996 + }, + { + "epoch": 0.6561875, + "grad_norm": 3.140625, + "grad_norm_var": 0.05446675618489583, + "learning_rate": 0.0001, + "loss": 5.6789, + "loss/crossentropy": 2.5835018157958984, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1657932549715042, + "step": 20998 + }, + { + "epoch": 0.65625, + "grad_norm": 3.1875, + "grad_norm_var": 0.0583892822265625, + "learning_rate": 0.0001, + "loss": 5.8651, + "loss/crossentropy": 2.704118251800537, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1680501401424408, + "step": 21000 + }, + { + "epoch": 0.6563125, + "grad_norm": 3.28125, + "grad_norm_var": 0.0467681884765625, + "learning_rate": 0.0001, + "loss": 5.7122, + "loss/crossentropy": 2.5057101249694824, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17064709216356277, + "step": 21002 + }, + { + "epoch": 0.656375, + "grad_norm": 3.421875, + "grad_norm_var": 0.0434234619140625, + "learning_rate": 0.0001, + "loss": 6.0921, + "loss/crossentropy": 2.817593574523926, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17784057557582855, + "step": 21004 + }, + { + "epoch": 0.6564375, + "grad_norm": 3.34375, + "grad_norm_var": 0.030269368489583334, + "learning_rate": 0.0001, + "loss": 6.1687, + "loss/crossentropy": 2.834757924079895, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18300583958625793, + "step": 21006 + }, + { + "epoch": 0.6565, + "grad_norm": 2.953125, + "grad_norm_var": 0.028888956705729166, + "learning_rate": 0.0001, + "loss": 5.7256, + "loss/crossentropy": 2.686676263809204, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.16209668666124344, + "step": 21008 + }, + { + "epoch": 0.6565625, + "grad_norm": 2.8125, + "grad_norm_var": 0.042215983072916664, + "learning_rate": 0.0001, + "loss": 5.6487, + "loss/crossentropy": 2.5283303260803223, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16321031749248505, + "step": 21010 + }, + { + "epoch": 0.656625, + "grad_norm": 2.859375, + "grad_norm_var": 0.045572916666666664, + "learning_rate": 0.0001, + "loss": 5.4632, + "loss/crossentropy": 2.4296209812164307, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15960296988487244, + "step": 21012 + }, + { + "epoch": 0.6566875, + "grad_norm": 3.125, + "grad_norm_var": 0.04383036295572917, + "learning_rate": 0.0001, + "loss": 5.7991, + "loss/crossentropy": 2.6751062870025635, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16708219796419144, + "step": 21014 + }, + { + "epoch": 0.65675, + "grad_norm": 2.96875, + "grad_norm_var": 0.0384674072265625, + "learning_rate": 0.0001, + "loss": 5.6379, + "loss/crossentropy": 2.577240228652954, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16153688728809357, + "step": 21016 + }, + { + "epoch": 0.6568125, + "grad_norm": 2.890625, + "grad_norm_var": 0.038996378580729164, + "learning_rate": 0.0001, + "loss": 5.771, + "loss/crossentropy": 2.6858417987823486, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16086158901453018, + "step": 21018 + }, + { + "epoch": 0.656875, + "grad_norm": 2.765625, + "grad_norm_var": 0.053132120768229166, + "learning_rate": 0.0001, + "loss": 5.3949, + "loss/crossentropy": 2.346004366874695, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15918176621198654, + "step": 21020 + }, + { + "epoch": 0.6569375, + "grad_norm": 3.046875, + "grad_norm_var": 0.049641927083333336, + "learning_rate": 0.0001, + "loss": 5.5529, + "loss/crossentropy": 2.5203301906585693, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15950361639261246, + "step": 21022 + }, + { + "epoch": 0.657, + "grad_norm": 2.9375, + "grad_norm_var": 0.0576080322265625, + "learning_rate": 0.0001, + "loss": 5.0891, + "loss/crossentropy": 2.2516844272613525, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.14155269414186478, + "step": 21024 + }, + { + "epoch": 0.6570625, + "grad_norm": 3.46875, + "grad_norm_var": 0.0552886962890625, + "learning_rate": 0.0001, + "loss": 5.8603, + "loss/crossentropy": 2.5943716764450073, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1742505356669426, + "step": 21026 + }, + { + "epoch": 0.657125, + "grad_norm": 3.0, + "grad_norm_var": 0.05332743326822917, + "learning_rate": 0.0001, + "loss": 5.5413, + "loss/crossentropy": 2.4588228464126587, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16527670621871948, + "step": 21028 + }, + { + "epoch": 0.6571875, + "grad_norm": 3.203125, + "grad_norm_var": 0.05583394368489583, + "learning_rate": 0.0001, + "loss": 5.5807, + "loss/crossentropy": 2.4765560626983643, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16275492310523987, + "step": 21030 + }, + { + "epoch": 0.65725, + "grad_norm": 3.03125, + "grad_norm_var": 0.05660807291666667, + "learning_rate": 0.0001, + "loss": 5.5726, + "loss/crossentropy": 2.4483338594436646, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16594135016202927, + "step": 21032 + }, + { + "epoch": 0.6573125, + "grad_norm": 3.015625, + "grad_norm_var": 0.0526519775390625, + "learning_rate": 0.0001, + "loss": 5.4477, + "loss/crossentropy": 2.403680682182312, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15713957697153091, + "step": 21034 + }, + { + "epoch": 0.657375, + "grad_norm": 2.921875, + "grad_norm_var": 0.03738606770833333, + "learning_rate": 0.0001, + "loss": 5.7809, + "loss/crossentropy": 2.5883078575134277, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17082417011260986, + "step": 21036 + }, + { + "epoch": 0.6574375, + "grad_norm": 2.90625, + "grad_norm_var": 0.04119466145833333, + "learning_rate": 0.0001, + "loss": 5.3847, + "loss/crossentropy": 2.3743726015090942, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15650182962417603, + "step": 21038 + }, + { + "epoch": 0.6575, + "grad_norm": 3.265625, + "grad_norm_var": 0.031966145833333334, + "learning_rate": 0.0001, + "loss": 5.4838, + "loss/crossentropy": 2.399978518486023, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16150595992803574, + "step": 21040 + }, + { + "epoch": 0.6575625, + "grad_norm": 3.046875, + "grad_norm_var": 0.0249176025390625, + "learning_rate": 0.0001, + "loss": 5.5571, + "loss/crossentropy": 2.4963085651397705, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16154929995536804, + "step": 21042 + }, + { + "epoch": 0.657625, + "grad_norm": 3.125, + "grad_norm_var": 0.030589803059895834, + "learning_rate": 0.0001, + "loss": 5.7662, + "loss/crossentropy": 2.568527936935425, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17094258964061737, + "step": 21044 + }, + { + "epoch": 0.6576875, + "grad_norm": 3.265625, + "grad_norm_var": 0.03866780598958333, + "learning_rate": 0.0001, + "loss": 5.9367, + "loss/crossentropy": 2.7443350553512573, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17352941632270813, + "step": 21046 + }, + { + "epoch": 0.65775, + "grad_norm": 3.0625, + "grad_norm_var": 0.0400787353515625, + "learning_rate": 0.0001, + "loss": 5.4498, + "loss/crossentropy": 2.3897674083709717, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1599130928516388, + "step": 21048 + }, + { + "epoch": 0.6578125, + "grad_norm": 3.21875, + "grad_norm_var": 0.040070597330729166, + "learning_rate": 0.0001, + "loss": 5.6568, + "loss/crossentropy": 2.5966427326202393, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16070196777582169, + "step": 21050 + }, + { + "epoch": 0.657875, + "grad_norm": 3.03125, + "grad_norm_var": 0.036149088541666666, + "learning_rate": 0.0001, + "loss": 6.0151, + "loss/crossentropy": 2.810759663581848, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17200005799531937, + "step": 21052 + }, + { + "epoch": 0.6579375, + "grad_norm": 3.15625, + "grad_norm_var": 0.04091695149739583, + "learning_rate": 0.0001, + "loss": 5.6564, + "loss/crossentropy": 2.6177507638931274, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15933813899755478, + "step": 21054 + }, + { + "epoch": 0.658, + "grad_norm": 3.09375, + "grad_norm_var": 0.038426717122395836, + "learning_rate": 0.0001, + "loss": 5.5803, + "loss/crossentropy": 2.5462610721588135, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1600475162267685, + "step": 21056 + }, + { + "epoch": 0.6580625, + "grad_norm": 2.890625, + "grad_norm_var": 0.04216206868489583, + "learning_rate": 0.0001, + "loss": 5.3908, + "loss/crossentropy": 2.464584231376648, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.1508210301399231, + "step": 21058 + }, + { + "epoch": 0.658125, + "grad_norm": 2.984375, + "grad_norm_var": 0.04221903483072917, + "learning_rate": 0.0001, + "loss": 5.8657, + "loss/crossentropy": 2.6992465257644653, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17016176879405975, + "step": 21060 + }, + { + "epoch": 0.6581875, + "grad_norm": 3.65625, + "grad_norm_var": 0.051171875, + "learning_rate": 0.0001, + "loss": 5.7709, + "loss/crossentropy": 2.619776964187622, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16550444811582565, + "step": 21062 + }, + { + "epoch": 0.65825, + "grad_norm": 3.203125, + "grad_norm_var": 0.050537109375, + "learning_rate": 0.0001, + "loss": 5.9504, + "loss/crossentropy": 2.735660672187805, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17342735826969147, + "step": 21064 + }, + { + "epoch": 0.6583125, + "grad_norm": 2.984375, + "grad_norm_var": 0.054585774739583336, + "learning_rate": 0.0001, + "loss": 5.9137, + "loss/crossentropy": 2.6759467124938965, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1741655245423317, + "step": 21066 + }, + { + "epoch": 0.658375, + "grad_norm": 3.234375, + "grad_norm_var": 0.0538726806640625, + "learning_rate": 0.0001, + "loss": 5.8245, + "loss/crossentropy": 2.6125426292419434, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1739320084452629, + "step": 21068 + }, + { + "epoch": 0.6584375, + "grad_norm": 3.09375, + "grad_norm_var": 0.04413960774739583, + "learning_rate": 0.0001, + "loss": 5.6678, + "loss/crossentropy": 2.539334297180176, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16518954187631607, + "step": 21070 + }, + { + "epoch": 0.6585, + "grad_norm": 3.09375, + "grad_norm_var": 0.05071207682291667, + "learning_rate": 0.0001, + "loss": 5.5312, + "loss/crossentropy": 2.46273410320282, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.15685029327869415, + "step": 21072 + }, + { + "epoch": 0.6585625, + "grad_norm": 3.1875, + "grad_norm_var": 0.045099894205729164, + "learning_rate": 0.0001, + "loss": 5.8338, + "loss/crossentropy": 2.6176934242248535, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.173955999314785, + "step": 21074 + }, + { + "epoch": 0.658625, + "grad_norm": 3.15625, + "grad_norm_var": 0.04244384765625, + "learning_rate": 0.0001, + "loss": 5.5685, + "loss/crossentropy": 2.509552836418152, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16254013031721115, + "step": 21076 + }, + { + "epoch": 0.6586875, + "grad_norm": 3.046875, + "grad_norm_var": 0.03134765625, + "learning_rate": 0.0001, + "loss": 5.777, + "loss/crossentropy": 2.595310091972351, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1709018349647522, + "step": 21078 + }, + { + "epoch": 0.65875, + "grad_norm": 3.46875, + "grad_norm_var": 0.03997395833333333, + "learning_rate": 0.0001, + "loss": 5.8275, + "loss/crossentropy": 2.5864028930664062, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17333343625068665, + "step": 21080 + }, + { + "epoch": 0.6588125, + "grad_norm": 3.09375, + "grad_norm_var": 0.03824462890625, + "learning_rate": 0.0001, + "loss": 5.4689, + "loss/crossentropy": 2.441880226135254, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1554327756166458, + "step": 21082 + }, + { + "epoch": 0.658875, + "grad_norm": 2.765625, + "grad_norm_var": 0.05142822265625, + "learning_rate": 0.0001, + "loss": 5.6495, + "loss/crossentropy": 2.653690457344055, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.15973851829767227, + "step": 21084 + }, + { + "epoch": 0.6589375, + "grad_norm": 3.109375, + "grad_norm_var": 0.05142822265625, + "learning_rate": 0.0001, + "loss": 5.7854, + "loss/crossentropy": 2.6315271854400635, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16811668872833252, + "step": 21086 + }, + { + "epoch": 0.659, + "grad_norm": 3.171875, + "grad_norm_var": 0.04655659993489583, + "learning_rate": 0.0001, + "loss": 5.7797, + "loss/crossentropy": 2.6315035820007324, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16833917051553726, + "step": 21088 + }, + { + "epoch": 0.6590625, + "grad_norm": 3.1875, + "grad_norm_var": 0.040949503580729164, + "learning_rate": 0.0001, + "loss": 5.5435, + "loss/crossentropy": 2.438621759414673, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16400685161352158, + "step": 21090 + }, + { + "epoch": 0.659125, + "grad_norm": 3.40625, + "grad_norm_var": 0.046468098958333336, + "learning_rate": 0.0001, + "loss": 5.5214, + "loss/crossentropy": 2.4408299922943115, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1607937514781952, + "step": 21092 + }, + { + "epoch": 0.6591875, + "grad_norm": 3.09375, + "grad_norm_var": 0.04550374348958333, + "learning_rate": 0.0001, + "loss": 5.7973, + "loss/crossentropy": 2.6019203662872314, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.17539478838443756, + "step": 21094 + }, + { + "epoch": 0.65925, + "grad_norm": 2.984375, + "grad_norm_var": 0.03632710774739583, + "learning_rate": 0.0001, + "loss": 5.3967, + "loss/crossentropy": 2.3610873222351074, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15668457746505737, + "step": 21096 + }, + { + "epoch": 0.6593125, + "grad_norm": 3.15625, + "grad_norm_var": 0.03873291015625, + "learning_rate": 0.0001, + "loss": 5.3789, + "loss/crossentropy": 2.3296267986297607, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.158445805311203, + "step": 21098 + }, + { + "epoch": 0.659375, + "grad_norm": 2.953125, + "grad_norm_var": 0.026423136393229168, + "learning_rate": 0.0001, + "loss": 5.7804, + "loss/crossentropy": 2.701469898223877, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16413865983486176, + "step": 21100 + }, + { + "epoch": 0.6594375, + "grad_norm": 2.9375, + "grad_norm_var": 0.028902180989583335, + "learning_rate": 0.0001, + "loss": 5.4956, + "loss/crossentropy": 2.532857656478882, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15408175438642502, + "step": 21102 + }, + { + "epoch": 0.6595, + "grad_norm": 3.796875, + "grad_norm_var": 0.059382120768229164, + "learning_rate": 0.0001, + "loss": 5.8234, + "loss/crossentropy": 2.519111752510071, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17964977771043777, + "step": 21104 + }, + { + "epoch": 0.6595625, + "grad_norm": 3.015625, + "grad_norm_var": 0.06043294270833333, + "learning_rate": 0.0001, + "loss": 5.6892, + "loss/crossentropy": 2.55735445022583, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16669873893260956, + "step": 21106 + }, + { + "epoch": 0.659625, + "grad_norm": 2.890625, + "grad_norm_var": 0.06165364583333333, + "learning_rate": 0.0001, + "loss": 5.5278, + "loss/crossentropy": 2.5014939308166504, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.161612868309021, + "step": 21108 + }, + { + "epoch": 0.6596875, + "grad_norm": 3.296875, + "grad_norm_var": 0.056005859375, + "learning_rate": 0.0001, + "loss": 5.9416, + "loss/crossentropy": 2.7261011600494385, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17193977534770966, + "step": 21110 + }, + { + "epoch": 0.65975, + "grad_norm": 3.0625, + "grad_norm_var": 0.0574615478515625, + "learning_rate": 0.0001, + "loss": 5.6383, + "loss/crossentropy": 2.5475538969039917, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16415174305438995, + "step": 21112 + }, + { + "epoch": 0.6598125, + "grad_norm": 3.1875, + "grad_norm_var": 0.052958170572916664, + "learning_rate": 0.0001, + "loss": 5.7181, + "loss/crossentropy": 2.5600234270095825, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17010456323623657, + "step": 21114 + }, + { + "epoch": 0.659875, + "grad_norm": 3.0625, + "grad_norm_var": 0.05175374348958333, + "learning_rate": 0.0001, + "loss": 5.7516, + "loss/crossentropy": 2.628481864929199, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16621547937393188, + "step": 21116 + }, + { + "epoch": 0.6599375, + "grad_norm": 3.03125, + "grad_norm_var": 0.04990946451822917, + "learning_rate": 0.0001, + "loss": 6.2269, + "loss/crossentropy": 2.9299408197402954, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17774052917957306, + "step": 21118 + }, + { + "epoch": 0.66, + "grad_norm": 2.796875, + "grad_norm_var": 0.027632649739583334, + "learning_rate": 0.0001, + "loss": 5.7446, + "loss/crossentropy": 2.6586681604385376, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1636732593178749, + "step": 21120 + }, + { + "epoch": 0.6600625, + "grad_norm": 3.0625, + "grad_norm_var": 0.027046712239583333, + "learning_rate": 0.0001, + "loss": 5.4781, + "loss/crossentropy": 2.4577144384384155, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15946026891469955, + "step": 21122 + }, + { + "epoch": 0.660125, + "grad_norm": 2.953125, + "grad_norm_var": 0.026395670572916665, + "learning_rate": 0.0001, + "loss": 5.675, + "loss/crossentropy": 2.625125527381897, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16241003572940826, + "step": 21124 + }, + { + "epoch": 0.6601875, + "grad_norm": 4.03125, + "grad_norm_var": 0.09013671875, + "learning_rate": 0.0001, + "loss": 5.6091, + "loss/crossentropy": 2.228906750679016, + "loss/hidden": 1.6015625, + "loss/jsd": 0.0, + "loss/logits": 0.17785951495170593, + "step": 21126 + }, + { + "epoch": 0.66025, + "grad_norm": 3.3125, + "grad_norm_var": 0.08947652180989583, + "learning_rate": 0.0001, + "loss": 5.5693, + "loss/crossentropy": 2.491586923599243, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15894466638565063, + "step": 21128 + }, + { + "epoch": 0.6603125, + "grad_norm": 3.421875, + "grad_norm_var": 0.09631245930989583, + "learning_rate": 0.0001, + "loss": 5.9102, + "loss/crossentropy": 2.6011446714401245, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1777852401137352, + "step": 21130 + }, + { + "epoch": 0.660375, + "grad_norm": 3.171875, + "grad_norm_var": 0.09434305826822917, + "learning_rate": 0.0001, + "loss": 5.5044, + "loss/crossentropy": 2.491545796394348, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15597651898860931, + "step": 21132 + }, + { + "epoch": 0.6604375, + "grad_norm": 2.875, + "grad_norm_var": 0.10035807291666667, + "learning_rate": 0.0001, + "loss": 5.3615, + "loss/crossentropy": 2.4181984663009644, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15214426070451736, + "step": 21134 + }, + { + "epoch": 0.6605, + "grad_norm": 3.234375, + "grad_norm_var": 0.09345703125, + "learning_rate": 0.0001, + "loss": 5.8034, + "loss/crossentropy": 2.5535932779312134, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17302973568439484, + "step": 21136 + }, + { + "epoch": 0.6605625, + "grad_norm": 3.078125, + "grad_norm_var": 0.5128163655598958, + "learning_rate": 0.0001, + "loss": 5.6741, + "loss/crossentropy": 2.4165477752685547, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.17068202793598175, + "step": 21138 + }, + { + "epoch": 0.660625, + "grad_norm": 3.046875, + "grad_norm_var": 0.48266499837239585, + "learning_rate": 0.0001, + "loss": 5.8364, + "loss/crossentropy": 2.6450281143188477, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16953130811452866, + "step": 21140 + }, + { + "epoch": 0.6606875, + "grad_norm": 3.078125, + "grad_norm_var": 0.4637369791666667, + "learning_rate": 0.0001, + "loss": 5.6399, + "loss/crossentropy": 2.5641591548919678, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1607028692960739, + "step": 21142 + }, + { + "epoch": 0.66075, + "grad_norm": 4.5625, + "grad_norm_var": 0.5629140218098958, + "learning_rate": 0.0001, + "loss": 5.4791, + "loss/crossentropy": 2.4822245836257935, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.1590593308210373, + "step": 21144 + }, + { + "epoch": 0.6608125, + "grad_norm": 3.125, + "grad_norm_var": 0.5676717122395833, + "learning_rate": 0.0001, + "loss": 5.7843, + "loss/crossentropy": 2.622680902481079, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16772551089525223, + "step": 21146 + }, + { + "epoch": 0.660875, + "grad_norm": 3.046875, + "grad_norm_var": 0.5866495768229166, + "learning_rate": 0.0001, + "loss": 5.1549, + "loss/crossentropy": 2.2301281690597534, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.14716807007789612, + "step": 21148 + }, + { + "epoch": 0.6609375, + "grad_norm": 4.0, + "grad_norm_var": 0.5773111979166666, + "learning_rate": 0.0001, + "loss": 5.8639, + "loss/crossentropy": 2.5864195823669434, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17775243520736694, + "step": 21150 + }, + { + "epoch": 0.661, + "grad_norm": 3.09375, + "grad_norm_var": 0.5822906494140625, + "learning_rate": 0.0001, + "loss": 5.6977, + "loss/crossentropy": 2.5363423824310303, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1661316081881523, + "step": 21152 + }, + { + "epoch": 0.6610625, + "grad_norm": 2.84375, + "grad_norm_var": 0.20327046712239583, + "learning_rate": 0.0001, + "loss": 5.7573, + "loss/crossentropy": 2.723081111907959, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.159281924366951, + "step": 21154 + }, + { + "epoch": 0.661125, + "grad_norm": 3.375, + "grad_norm_var": 0.20532124837239582, + "learning_rate": 0.0001, + "loss": 5.9345, + "loss/crossentropy": 2.7293641567230225, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17246946692466736, + "step": 21156 + }, + { + "epoch": 0.6611875, + "grad_norm": 3.0625, + "grad_norm_var": 0.20722249348958333, + "learning_rate": 0.0001, + "loss": 5.5277, + "loss/crossentropy": 2.4667723178863525, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15999778360128403, + "step": 21158 + }, + { + "epoch": 0.66125, + "grad_norm": 3.203125, + "grad_norm_var": 0.08376363118489584, + "learning_rate": 0.0001, + "loss": 5.6088, + "loss/crossentropy": 2.5209559202194214, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16269070655107498, + "step": 21160 + }, + { + "epoch": 0.6613125, + "grad_norm": 3.109375, + "grad_norm_var": 0.0775787353515625, + "learning_rate": 0.0001, + "loss": 6.0978, + "loss/crossentropy": 2.811536431312561, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1755012571811676, + "step": 21162 + }, + { + "epoch": 0.661375, + "grad_norm": 3.40625, + "grad_norm_var": 0.0805816650390625, + "learning_rate": 0.0001, + "loss": 5.8546, + "loss/crossentropy": 2.5932637453079224, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17535630613565445, + "step": 21164 + }, + { + "epoch": 0.6614375, + "grad_norm": 3.53125, + "grad_norm_var": 0.044820149739583336, + "learning_rate": 0.0001, + "loss": 5.983, + "loss/crossentropy": 2.724538803100586, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17779680341482162, + "step": 21166 + }, + { + "epoch": 0.6615, + "grad_norm": 3.0, + "grad_norm_var": 0.04363606770833333, + "learning_rate": 0.0001, + "loss": 5.5289, + "loss/crossentropy": 2.4889990091323853, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15790055692195892, + "step": 21168 + }, + { + "epoch": 0.6615625, + "grad_norm": 2.796875, + "grad_norm_var": 0.04157613118489583, + "learning_rate": 0.0001, + "loss": 5.6187, + "loss/crossentropy": 2.5857560634613037, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.16189289093017578, + "step": 21170 + }, + { + "epoch": 0.661625, + "grad_norm": 3.15625, + "grad_norm_var": 0.0437896728515625, + "learning_rate": 0.0001, + "loss": 5.53, + "loss/crossentropy": 2.5200599431991577, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1588016375899315, + "step": 21172 + }, + { + "epoch": 0.6616875, + "grad_norm": 2.921875, + "grad_norm_var": 0.0469390869140625, + "learning_rate": 0.0001, + "loss": 5.8791, + "loss/crossentropy": 2.679272770881653, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1703711748123169, + "step": 21174 + }, + { + "epoch": 0.66175, + "grad_norm": 2.921875, + "grad_norm_var": 0.04895426432291667, + "learning_rate": 0.0001, + "loss": 5.7639, + "loss/crossentropy": 2.62574303150177, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16732646524906158, + "step": 21176 + }, + { + "epoch": 0.6618125, + "grad_norm": 3.296875, + "grad_norm_var": 0.052164713541666664, + "learning_rate": 0.0001, + "loss": 5.7762, + "loss/crossentropy": 2.6213289499282837, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16743764281272888, + "step": 21178 + }, + { + "epoch": 0.661875, + "grad_norm": 3.078125, + "grad_norm_var": 0.03671468098958333, + "learning_rate": 0.0001, + "loss": 5.7035, + "loss/crossentropy": 2.6381725072860718, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16083265841007233, + "step": 21180 + }, + { + "epoch": 0.6619375, + "grad_norm": 2.890625, + "grad_norm_var": 0.0255279541015625, + "learning_rate": 0.0001, + "loss": 5.9097, + "loss/crossentropy": 2.7510194778442383, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16899508982896805, + "step": 21182 + }, + { + "epoch": 0.662, + "grad_norm": 2.859375, + "grad_norm_var": 0.02769775390625, + "learning_rate": 0.0001, + "loss": 5.7053, + "loss/crossentropy": 2.687032103538513, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15768758207559586, + "step": 21184 + }, + { + "epoch": 0.6620625, + "grad_norm": 2.921875, + "grad_norm_var": 0.027534993489583333, + "learning_rate": 0.0001, + "loss": 5.5797, + "loss/crossentropy": 2.58084499835968, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15926294028759003, + "step": 21186 + }, + { + "epoch": 0.662125, + "grad_norm": 3.046875, + "grad_norm_var": 0.023860677083333334, + "learning_rate": 0.0001, + "loss": 5.7182, + "loss/crossentropy": 2.575597405433655, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1642555668950081, + "step": 21188 + }, + { + "epoch": 0.6621875, + "grad_norm": 3.21875, + "grad_norm_var": 0.021613566080729167, + "learning_rate": 0.0001, + "loss": 5.7413, + "loss/crossentropy": 2.6081173419952393, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1683983951807022, + "step": 21190 + }, + { + "epoch": 0.66225, + "grad_norm": 3.203125, + "grad_norm_var": 0.018973795572916667, + "learning_rate": 0.0001, + "loss": 5.7651, + "loss/crossentropy": 2.646545171737671, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1665422022342682, + "step": 21192 + }, + { + "epoch": 0.6623125, + "grad_norm": 3.46875, + "grad_norm_var": 0.025516764322916666, + "learning_rate": 0.0001, + "loss": 5.9382, + "loss/crossentropy": 2.613059163093567, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18134091049432755, + "step": 21194 + }, + { + "epoch": 0.662375, + "grad_norm": 3.328125, + "grad_norm_var": 0.029703776041666668, + "learning_rate": 0.0001, + "loss": 5.9396, + "loss/crossentropy": 2.7155404090881348, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17318322509527206, + "step": 21196 + }, + { + "epoch": 0.6624375, + "grad_norm": 2.9375, + "grad_norm_var": 0.0309234619140625, + "learning_rate": 0.0001, + "loss": 5.6522, + "loss/crossentropy": 2.5789085626602173, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16318871080875397, + "step": 21198 + }, + { + "epoch": 0.6625, + "grad_norm": 2.5, + "grad_norm_var": 0.051005045572916664, + "learning_rate": 0.0001, + "loss": 5.012, + "loss/crossentropy": 2.206539809703827, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.13991685956716537, + "step": 21200 + }, + { + "epoch": 0.6625625, + "grad_norm": 3.28125, + "grad_norm_var": 0.0497467041015625, + "learning_rate": 0.0001, + "loss": 5.5861, + "loss/crossentropy": 2.5295369625091553, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15800325572490692, + "step": 21202 + }, + { + "epoch": 0.662625, + "grad_norm": 2.96875, + "grad_norm_var": 0.052571614583333336, + "learning_rate": 0.0001, + "loss": 5.6726, + "loss/crossentropy": 2.5857043266296387, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16259709000587463, + "step": 21204 + }, + { + "epoch": 0.6626875, + "grad_norm": 3.453125, + "grad_norm_var": 0.061620076497395836, + "learning_rate": 0.0001, + "loss": 5.6861, + "loss/crossentropy": 2.527758479118347, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1677919551730156, + "step": 21206 + }, + { + "epoch": 0.66275, + "grad_norm": 2.828125, + "grad_norm_var": 0.06516927083333333, + "learning_rate": 0.0001, + "loss": 5.8011, + "loss/crossentropy": 2.6378647089004517, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16945039480924606, + "step": 21208 + }, + { + "epoch": 0.6628125, + "grad_norm": 3.53125, + "grad_norm_var": 0.06885477701822916, + "learning_rate": 0.0001, + "loss": 5.801, + "loss/crossentropy": 2.5858579874038696, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17542026937007904, + "step": 21210 + }, + { + "epoch": 0.662875, + "grad_norm": 3.078125, + "grad_norm_var": 0.06466471354166667, + "learning_rate": 0.0001, + "loss": 5.694, + "loss/crossentropy": 2.597716808319092, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1611892729997635, + "step": 21212 + }, + { + "epoch": 0.6629375, + "grad_norm": 3.25, + "grad_norm_var": 0.06177978515625, + "learning_rate": 0.0001, + "loss": 5.7917, + "loss/crossentropy": 2.6291065216064453, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16508817672729492, + "step": 21214 + }, + { + "epoch": 0.663, + "grad_norm": 3.515625, + "grad_norm_var": 0.04289449055989583, + "learning_rate": 0.0001, + "loss": 5.3654, + "loss/crossentropy": 2.2221158742904663, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16432448476552963, + "step": 21216 + }, + { + "epoch": 0.6630625, + "grad_norm": 3.15625, + "grad_norm_var": 0.0451812744140625, + "learning_rate": 0.0001, + "loss": 5.4499, + "loss/crossentropy": 2.4725834131240845, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.1567157581448555, + "step": 21218 + }, + { + "epoch": 0.663125, + "grad_norm": 3.5, + "grad_norm_var": 0.052668253580729164, + "learning_rate": 0.0001, + "loss": 5.8421, + "loss/crossentropy": 2.664140462875366, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16975214332342148, + "step": 21220 + }, + { + "epoch": 0.6631875, + "grad_norm": 3.0, + "grad_norm_var": 0.04563700358072917, + "learning_rate": 0.0001, + "loss": 5.558, + "loss/crossentropy": 2.504623770713806, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16236679255962372, + "step": 21222 + }, + { + "epoch": 0.66325, + "grad_norm": 3.015625, + "grad_norm_var": 0.04132486979166667, + "learning_rate": 0.0001, + "loss": 5.6483, + "loss/crossentropy": 2.5294255018234253, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16071248054504395, + "step": 21224 + }, + { + "epoch": 0.6633125, + "grad_norm": 3.359375, + "grad_norm_var": 0.034845987955729164, + "learning_rate": 0.0001, + "loss": 6.0104, + "loss/crossentropy": 2.7339723110198975, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1780320480465889, + "step": 21226 + }, + { + "epoch": 0.663375, + "grad_norm": 3.03125, + "grad_norm_var": 0.04215087890625, + "learning_rate": 0.0001, + "loss": 5.3252, + "loss/crossentropy": 2.380972385406494, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.1534043401479721, + "step": 21228 + }, + { + "epoch": 0.6634375, + "grad_norm": 3.828125, + "grad_norm_var": 0.07174479166666667, + "learning_rate": 0.0001, + "loss": 5.6034, + "loss/crossentropy": 2.4696223735809326, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16220373660326004, + "step": 21230 + }, + { + "epoch": 0.6635, + "grad_norm": 3.078125, + "grad_norm_var": 0.06929931640625, + "learning_rate": 0.0001, + "loss": 5.5928, + "loss/crossentropy": 2.5176429748535156, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15946735441684723, + "step": 21232 + }, + { + "epoch": 0.6635625, + "grad_norm": 3.0625, + "grad_norm_var": 0.06607157389322917, + "learning_rate": 0.0001, + "loss": 5.8498, + "loss/crossentropy": 2.645890712738037, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17273297160863876, + "step": 21234 + }, + { + "epoch": 0.663625, + "grad_norm": 3.484375, + "grad_norm_var": 0.06752827962239584, + "learning_rate": 0.0001, + "loss": 5.4285, + "loss/crossentropy": 2.3075203895568848, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16131679713726044, + "step": 21236 + }, + { + "epoch": 0.6636875, + "grad_norm": 3.25, + "grad_norm_var": 0.0632476806640625, + "learning_rate": 0.0001, + "loss": 5.6491, + "loss/crossentropy": 2.4615968465805054, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1722680702805519, + "step": 21238 + }, + { + "epoch": 0.66375, + "grad_norm": 2.9375, + "grad_norm_var": 0.08355712890625, + "learning_rate": 0.0001, + "loss": 5.6558, + "loss/crossentropy": 2.5240232944488525, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16435149312019348, + "step": 21240 + }, + { + "epoch": 0.6638125, + "grad_norm": 3.03125, + "grad_norm_var": 0.090478515625, + "learning_rate": 0.0001, + "loss": 5.7438, + "loss/crossentropy": 2.590863347053528, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16803047060966492, + "step": 21242 + }, + { + "epoch": 0.663875, + "grad_norm": 3.125, + "grad_norm_var": 0.08412984212239584, + "learning_rate": 0.0001, + "loss": 5.6828, + "loss/crossentropy": 2.592938184738159, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16602183133363724, + "step": 21244 + }, + { + "epoch": 0.6639375, + "grad_norm": 3.078125, + "grad_norm_var": 0.06193745930989583, + "learning_rate": 0.0001, + "loss": 5.8362, + "loss/crossentropy": 2.6598994731903076, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16880616545677185, + "step": 21246 + }, + { + "epoch": 0.664, + "grad_norm": 3.03125, + "grad_norm_var": 0.056477864583333336, + "learning_rate": 0.0001, + "loss": 5.5684, + "loss/crossentropy": 2.458032250404358, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16455282270908356, + "step": 21248 + }, + { + "epoch": 0.6640625, + "grad_norm": 2.75, + "grad_norm_var": 0.0675933837890625, + "learning_rate": 0.0001, + "loss": 5.2155, + "loss/crossentropy": 2.270583391189575, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.14840058982372284, + "step": 21250 + }, + { + "epoch": 0.664125, + "grad_norm": 2.859375, + "grad_norm_var": 0.055074055989583336, + "learning_rate": 0.0001, + "loss": 5.6114, + "loss/crossentropy": 2.515115261077881, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16470438987016678, + "step": 21252 + }, + { + "epoch": 0.6641875, + "grad_norm": 3.515625, + "grad_norm_var": 0.06201171875, + "learning_rate": 0.0001, + "loss": 5.9282, + "loss/crossentropy": 2.6761960983276367, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1740298494696617, + "step": 21254 + }, + { + "epoch": 0.66425, + "grad_norm": 3.234375, + "grad_norm_var": 0.031769816080729166, + "learning_rate": 0.0001, + "loss": 5.8655, + "loss/crossentropy": 2.6693637371063232, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1707821488380432, + "step": 21256 + }, + { + "epoch": 0.6643125, + "grad_norm": 3.03125, + "grad_norm_var": 0.031050618489583334, + "learning_rate": 0.0001, + "loss": 5.7682, + "loss/crossentropy": 2.620382785797119, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1694653034210205, + "step": 21258 + }, + { + "epoch": 0.664375, + "grad_norm": 3.3125, + "grad_norm_var": 0.032470703125, + "learning_rate": 0.0001, + "loss": 6.0041, + "loss/crossentropy": 2.6569840908050537, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18119489401578903, + "step": 21260 + }, + { + "epoch": 0.6644375, + "grad_norm": 3.171875, + "grad_norm_var": 0.034098307291666664, + "learning_rate": 0.0001, + "loss": 5.4112, + "loss/crossentropy": 2.4031245708465576, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15666569769382477, + "step": 21262 + }, + { + "epoch": 0.6645, + "grad_norm": 3.265625, + "grad_norm_var": 0.03453776041666667, + "learning_rate": 0.0001, + "loss": 5.9051, + "loss/crossentropy": 2.836301565170288, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16234861314296722, + "step": 21264 + }, + { + "epoch": 0.6645625, + "grad_norm": 2.890625, + "grad_norm_var": 0.0279937744140625, + "learning_rate": 0.0001, + "loss": 5.5214, + "loss/crossentropy": 2.517667055130005, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1577962040901184, + "step": 21266 + }, + { + "epoch": 0.664625, + "grad_norm": 3.015625, + "grad_norm_var": 0.026960245768229165, + "learning_rate": 0.0001, + "loss": 5.3337, + "loss/crossentropy": 2.3482940196990967, + "loss/hidden": 1.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.15830513089895248, + "step": 21268 + }, + { + "epoch": 0.6646875, + "grad_norm": 3.609375, + "grad_norm_var": 0.035319010416666664, + "learning_rate": 0.0001, + "loss": 5.9632, + "loss/crossentropy": 2.7076185941696167, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1747806891798973, + "step": 21270 + }, + { + "epoch": 0.66475, + "grad_norm": 3.125, + "grad_norm_var": 0.03606363932291667, + "learning_rate": 0.0001, + "loss": 5.7607, + "loss/crossentropy": 2.6146886348724365, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16811973601579666, + "step": 21272 + }, + { + "epoch": 0.6648125, + "grad_norm": 3.078125, + "grad_norm_var": 0.03766276041666667, + "learning_rate": 0.0001, + "loss": 5.7106, + "loss/crossentropy": 2.6125776767730713, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16448620706796646, + "step": 21274 + }, + { + "epoch": 0.664875, + "grad_norm": 2.859375, + "grad_norm_var": 0.04273681640625, + "learning_rate": 0.0001, + "loss": 5.4658, + "loss/crossentropy": 2.48010790348053, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15794584155082703, + "step": 21276 + }, + { + "epoch": 0.6649375, + "grad_norm": 2.9375, + "grad_norm_var": 0.04445699055989583, + "learning_rate": 0.0001, + "loss": 5.5905, + "loss/crossentropy": 2.5024571418762207, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16309750080108643, + "step": 21278 + }, + { + "epoch": 0.665, + "grad_norm": 3.078125, + "grad_norm_var": 0.04411519368489583, + "learning_rate": 0.0001, + "loss": 5.7933, + "loss/crossentropy": 2.650007486343384, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16745546460151672, + "step": 21280 + }, + { + "epoch": 0.6650625, + "grad_norm": 3.40625, + "grad_norm_var": 0.05068257649739583, + "learning_rate": 0.0001, + "loss": 5.9504, + "loss/crossentropy": 2.692716598510742, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1765463650226593, + "step": 21282 + }, + { + "epoch": 0.665125, + "grad_norm": 3.0625, + "grad_norm_var": 0.04860738118489583, + "learning_rate": 0.0001, + "loss": 5.4174, + "loss/crossentropy": 2.4194761514663696, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.15995173901319504, + "step": 21284 + }, + { + "epoch": 0.6651875, + "grad_norm": 2.765625, + "grad_norm_var": 0.03352762858072917, + "learning_rate": 0.0001, + "loss": 5.4512, + "loss/crossentropy": 2.5085253715515137, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15090972930192947, + "step": 21286 + }, + { + "epoch": 0.66525, + "grad_norm": 3.3125, + "grad_norm_var": 0.03857421875, + "learning_rate": 0.0001, + "loss": 5.4336, + "loss/crossentropy": 2.438277840614319, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1565670222043991, + "step": 21288 + }, + { + "epoch": 0.6653125, + "grad_norm": 3.296875, + "grad_norm_var": 0.043488566080729166, + "learning_rate": 0.0001, + "loss": 5.6838, + "loss/crossentropy": 2.546603798866272, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16840271651744843, + "step": 21290 + }, + { + "epoch": 0.665375, + "grad_norm": 3.140625, + "grad_norm_var": 0.09138895670572916, + "learning_rate": 0.0001, + "loss": 5.5774, + "loss/crossentropy": 2.3896507024765015, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16682545095682144, + "step": 21292 + }, + { + "epoch": 0.6654375, + "grad_norm": 2.96875, + "grad_norm_var": 0.0893218994140625, + "learning_rate": 0.0001, + "loss": 5.6017, + "loss/crossentropy": 2.497402787208557, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16277802735567093, + "step": 21294 + }, + { + "epoch": 0.6655, + "grad_norm": 3.015625, + "grad_norm_var": 0.09031575520833333, + "learning_rate": 0.0001, + "loss": 5.6609, + "loss/crossentropy": 2.578903317451477, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16600920259952545, + "step": 21296 + }, + { + "epoch": 0.6655625, + "grad_norm": 3.03125, + "grad_norm_var": 0.0870513916015625, + "learning_rate": 0.0001, + "loss": 5.4887, + "loss/crossentropy": 2.3647782802581787, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1612161248922348, + "step": 21298 + }, + { + "epoch": 0.665625, + "grad_norm": 3.0625, + "grad_norm_var": 0.08688863118489583, + "learning_rate": 0.0001, + "loss": 5.5759, + "loss/crossentropy": 2.519879937171936, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15951135009527206, + "step": 21300 + }, + { + "epoch": 0.6656875, + "grad_norm": 3.015625, + "grad_norm_var": 0.0798980712890625, + "learning_rate": 0.0001, + "loss": 5.3471, + "loss/crossentropy": 2.348939299583435, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1541113629937172, + "step": 21302 + }, + { + "epoch": 0.66575, + "grad_norm": 3.34375, + "grad_norm_var": 0.06988525390625, + "learning_rate": 0.0001, + "loss": 5.6263, + "loss/crossentropy": 2.4221882820129395, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1723650023341179, + "step": 21304 + }, + { + "epoch": 0.6658125, + "grad_norm": 3.046875, + "grad_norm_var": 0.06957906087239583, + "learning_rate": 0.0001, + "loss": 5.7512, + "loss/crossentropy": 2.606347441673279, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16487867385149002, + "step": 21306 + }, + { + "epoch": 0.665875, + "grad_norm": 3.3125, + "grad_norm_var": 0.024811808268229166, + "learning_rate": 0.0001, + "loss": 5.9485, + "loss/crossentropy": 2.7326987981796265, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17314036935567856, + "step": 21308 + }, + { + "epoch": 0.6659375, + "grad_norm": 2.9375, + "grad_norm_var": 0.025276692708333333, + "learning_rate": 0.0001, + "loss": 5.442, + "loss/crossentropy": 2.4333741664886475, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1547662913799286, + "step": 21310 + }, + { + "epoch": 0.666, + "grad_norm": 3.265625, + "grad_norm_var": 0.022151692708333334, + "learning_rate": 0.0001, + "loss": 5.7553, + "loss/crossentropy": 2.5214457511901855, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17260191589593887, + "step": 21312 + }, + { + "epoch": 0.6660625, + "grad_norm": 2.6875, + "grad_norm_var": 0.032389322916666664, + "learning_rate": 0.0001, + "loss": 5.4029, + "loss/crossentropy": 2.4361422061920166, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15135882049798965, + "step": 21314 + }, + { + "epoch": 0.666125, + "grad_norm": 2.859375, + "grad_norm_var": 0.04114481608072917, + "learning_rate": 0.0001, + "loss": 5.8165, + "loss/crossentropy": 2.668753743171692, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1690707951784134, + "step": 21316 + }, + { + "epoch": 0.6661875, + "grad_norm": 3.171875, + "grad_norm_var": 0.03762613932291667, + "learning_rate": 0.0001, + "loss": 5.7533, + "loss/crossentropy": 2.59453284740448, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16743647307157516, + "step": 21318 + }, + { + "epoch": 0.66625, + "grad_norm": 3.03125, + "grad_norm_var": 0.04049479166666667, + "learning_rate": 0.0001, + "loss": 5.5846, + "loss/crossentropy": 2.6005107164382935, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.155830517411232, + "step": 21320 + }, + { + "epoch": 0.6663125, + "grad_norm": 3.046875, + "grad_norm_var": 0.04830729166666667, + "learning_rate": 0.0001, + "loss": 5.7108, + "loss/crossentropy": 2.6046417951583862, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1660839170217514, + "step": 21322 + }, + { + "epoch": 0.666375, + "grad_norm": 3.15625, + "grad_norm_var": 0.04592692057291667, + "learning_rate": 0.0001, + "loss": 5.4813, + "loss/crossentropy": 2.4988549947738647, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.15840186923742294, + "step": 21324 + }, + { + "epoch": 0.6664375, + "grad_norm": 3.1875, + "grad_norm_var": 0.049779256184895836, + "learning_rate": 0.0001, + "loss": 5.369, + "loss/crossentropy": 2.350496530532837, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15888623148202896, + "step": 21326 + }, + { + "epoch": 0.6665, + "grad_norm": 3.078125, + "grad_norm_var": 0.053343709309895834, + "learning_rate": 0.0001, + "loss": 5.8258, + "loss/crossentropy": 2.6401236057281494, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1716877520084381, + "step": 21328 + }, + { + "epoch": 0.6665625, + "grad_norm": 3.078125, + "grad_norm_var": 0.04805399576822917, + "learning_rate": 0.0001, + "loss": 5.7729, + "loss/crossentropy": 2.562077760696411, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16835245490074158, + "step": 21330 + }, + { + "epoch": 0.666625, + "grad_norm": 3.15625, + "grad_norm_var": 0.03990478515625, + "learning_rate": 0.0001, + "loss": 5.4159, + "loss/crossentropy": 2.346079468727112, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15893695503473282, + "step": 21332 + }, + { + "epoch": 0.6666875, + "grad_norm": 2.84375, + "grad_norm_var": 0.04348958333333333, + "learning_rate": 0.0001, + "loss": 5.637, + "loss/crossentropy": 2.576659083366394, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16228371113538742, + "step": 21334 + }, + { + "epoch": 0.66675, + "grad_norm": 3.03125, + "grad_norm_var": 0.03827718098958333, + "learning_rate": 0.0001, + "loss": 5.9511, + "loss/crossentropy": 2.740101456642151, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1710963174700737, + "step": 21336 + }, + { + "epoch": 0.6668125, + "grad_norm": 2.953125, + "grad_norm_var": 0.032515462239583334, + "learning_rate": 0.0001, + "loss": 5.6354, + "loss/crossentropy": 2.5601917505264282, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16298599541187286, + "step": 21338 + }, + { + "epoch": 0.666875, + "grad_norm": 3.09375, + "grad_norm_var": 0.03551025390625, + "learning_rate": 0.0001, + "loss": 5.262, + "loss/crossentropy": 2.3531311750411987, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.14830512553453445, + "step": 21340 + }, + { + "epoch": 0.6669375, + "grad_norm": 3.015625, + "grad_norm_var": 0.03329671223958333, + "learning_rate": 0.0001, + "loss": 5.7388, + "loss/crossentropy": 2.5914554595947266, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16668959707021713, + "step": 21342 + }, + { + "epoch": 0.667, + "grad_norm": 2.96875, + "grad_norm_var": 0.02880859375, + "learning_rate": 0.0001, + "loss": 5.1932, + "loss/crossentropy": 2.2438119649887085, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15275021642446518, + "step": 21344 + }, + { + "epoch": 0.6670625, + "grad_norm": 3.453125, + "grad_norm_var": 0.029964192708333334, + "learning_rate": 0.0001, + "loss": 5.9101, + "loss/crossentropy": 2.704999804496765, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1736399382352829, + "step": 21346 + }, + { + "epoch": 0.667125, + "grad_norm": 3.1875, + "grad_norm_var": 0.030464680989583333, + "learning_rate": 0.0001, + "loss": 5.6993, + "loss/crossentropy": 2.557963013648987, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1641298159956932, + "step": 21348 + }, + { + "epoch": 0.6671875, + "grad_norm": 2.953125, + "grad_norm_var": 0.031233723958333334, + "learning_rate": 0.0001, + "loss": 5.6561, + "loss/crossentropy": 2.53562068939209, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16830183565616608, + "step": 21350 + }, + { + "epoch": 0.66725, + "grad_norm": 3.234375, + "grad_norm_var": 0.03280843098958333, + "learning_rate": 0.0001, + "loss": 5.3138, + "loss/crossentropy": 2.3396353721618652, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1513269543647766, + "step": 21352 + }, + { + "epoch": 0.6673125, + "grad_norm": 3.078125, + "grad_norm_var": 0.030973307291666665, + "learning_rate": 0.0001, + "loss": 6.12, + "loss/crossentropy": 2.8311607837677, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17771321535110474, + "step": 21354 + }, + { + "epoch": 0.667375, + "grad_norm": 3.84375, + "grad_norm_var": 0.05821024576822917, + "learning_rate": 0.0001, + "loss": 6.0085, + "loss/crossentropy": 2.7028005123138428, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.178620845079422, + "step": 21356 + }, + { + "epoch": 0.6674375, + "grad_norm": 3.28125, + "grad_norm_var": 0.05707906087239583, + "learning_rate": 0.0001, + "loss": 5.7083, + "loss/crossentropy": 2.558958649635315, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1700163409113884, + "step": 21358 + }, + { + "epoch": 0.6675, + "grad_norm": 3.21875, + "grad_norm_var": 0.05222880045572917, + "learning_rate": 0.0001, + "loss": 5.4728, + "loss/crossentropy": 2.4816389083862305, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1569308191537857, + "step": 21360 + }, + { + "epoch": 0.6675625, + "grad_norm": 3.0625, + "grad_norm_var": 0.04908447265625, + "learning_rate": 0.0001, + "loss": 5.5566, + "loss/crossentropy": 2.457713007926941, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16457998007535934, + "step": 21362 + }, + { + "epoch": 0.667625, + "grad_norm": 3.265625, + "grad_norm_var": 0.049267578125, + "learning_rate": 0.0001, + "loss": 5.8844, + "loss/crossentropy": 2.669553279876709, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.17617502063512802, + "step": 21364 + }, + { + "epoch": 0.6676875, + "grad_norm": 2.890625, + "grad_norm_var": 0.05087483723958333, + "learning_rate": 0.0001, + "loss": 5.5797, + "loss/crossentropy": 2.515458106994629, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16111420094966888, + "step": 21366 + }, + { + "epoch": 0.66775, + "grad_norm": 3.21875, + "grad_norm_var": 0.0529205322265625, + "learning_rate": 0.0001, + "loss": 6.0597, + "loss/crossentropy": 2.8077352046966553, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17558830231428146, + "step": 21368 + }, + { + "epoch": 0.6678125, + "grad_norm": 3.40625, + "grad_norm_var": 0.05445556640625, + "learning_rate": 0.0001, + "loss": 5.8566, + "loss/crossentropy": 2.696689248085022, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1687242016196251, + "step": 21370 + }, + { + "epoch": 0.667875, + "grad_norm": 3.375, + "grad_norm_var": 0.03394775390625, + "learning_rate": 0.0001, + "loss": 5.5609, + "loss/crossentropy": 2.50481379032135, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15755990892648697, + "step": 21372 + }, + { + "epoch": 0.6679375, + "grad_norm": 3.28125, + "grad_norm_var": 0.033299763997395836, + "learning_rate": 0.0001, + "loss": 5.705, + "loss/crossentropy": 2.565472960472107, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16473621875047684, + "step": 21374 + }, + { + "epoch": 0.668, + "grad_norm": 2.90625, + "grad_norm_var": 0.034566243489583336, + "learning_rate": 0.0001, + "loss": 5.2835, + "loss/crossentropy": 2.309640049934387, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15206913650035858, + "step": 21376 + }, + { + "epoch": 0.6680625, + "grad_norm": 3.171875, + "grad_norm_var": 0.035374959309895836, + "learning_rate": 0.0001, + "loss": 5.4765, + "loss/crossentropy": 2.4518284797668457, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15441888570785522, + "step": 21378 + }, + { + "epoch": 0.668125, + "grad_norm": 3.21875, + "grad_norm_var": 0.03623758951822917, + "learning_rate": 0.0001, + "loss": 5.6201, + "loss/crossentropy": 2.522407650947571, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16328418999910355, + "step": 21380 + }, + { + "epoch": 0.6681875, + "grad_norm": 3.125, + "grad_norm_var": 0.03209228515625, + "learning_rate": 0.0001, + "loss": 5.5553, + "loss/crossentropy": 2.4795387983322144, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16031239926815033, + "step": 21382 + }, + { + "epoch": 0.66825, + "grad_norm": 2.96875, + "grad_norm_var": 0.028059895833333334, + "learning_rate": 0.0001, + "loss": 5.7353, + "loss/crossentropy": 2.7149728536605835, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1582816243171692, + "step": 21384 + }, + { + "epoch": 0.6683125, + "grad_norm": 2.953125, + "grad_norm_var": 0.024836222330729168, + "learning_rate": 0.0001, + "loss": 5.6475, + "loss/crossentropy": 2.560445189476013, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.165740005671978, + "step": 21386 + }, + { + "epoch": 0.668375, + "grad_norm": 2.921875, + "grad_norm_var": 0.0182769775390625, + "learning_rate": 0.0001, + "loss": 5.4794, + "loss/crossentropy": 2.439791202545166, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16099288314580917, + "step": 21388 + }, + { + "epoch": 0.6684375, + "grad_norm": 2.875, + "grad_norm_var": 0.016706339518229165, + "learning_rate": 0.0001, + "loss": 5.2074, + "loss/crossentropy": 2.2957775592803955, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.14662948995828629, + "step": 21390 + }, + { + "epoch": 0.6685, + "grad_norm": 3.296875, + "grad_norm_var": 0.017985026041666668, + "learning_rate": 0.0001, + "loss": 5.5945, + "loss/crossentropy": 2.4474011659622192, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16509786248207092, + "step": 21392 + }, + { + "epoch": 0.6685625, + "grad_norm": 2.875, + "grad_norm_var": 0.020335896809895834, + "learning_rate": 0.0001, + "loss": 5.5399, + "loss/crossentropy": 2.48570454120636, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1608891859650612, + "step": 21394 + }, + { + "epoch": 0.668625, + "grad_norm": 3.140625, + "grad_norm_var": 0.019222005208333334, + "learning_rate": 0.0001, + "loss": 5.9927, + "loss/crossentropy": 2.7924585342407227, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.170022115111351, + "step": 21396 + }, + { + "epoch": 0.6686875, + "grad_norm": 3.703125, + "grad_norm_var": 0.04445699055989583, + "learning_rate": 0.0001, + "loss": 6.0819, + "loss/crossentropy": 2.764808773994446, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1817099004983902, + "step": 21398 + }, + { + "epoch": 0.66875, + "grad_norm": 3.0, + "grad_norm_var": 0.0437896728515625, + "learning_rate": 0.0001, + "loss": 5.5681, + "loss/crossentropy": 2.5108526945114136, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16080047190189362, + "step": 21400 + }, + { + "epoch": 0.6688125, + "grad_norm": 3.03125, + "grad_norm_var": 0.04038798014322917, + "learning_rate": 0.0001, + "loss": 5.7719, + "loss/crossentropy": 2.595277428627014, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17040060460567474, + "step": 21402 + }, + { + "epoch": 0.668875, + "grad_norm": 3.1875, + "grad_norm_var": 0.04690348307291667, + "learning_rate": 0.0001, + "loss": 5.7047, + "loss/crossentropy": 2.4800525903701782, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.172075092792511, + "step": 21404 + }, + { + "epoch": 0.6689375, + "grad_norm": 3.203125, + "grad_norm_var": 0.046708170572916666, + "learning_rate": 0.0001, + "loss": 6.2405, + "loss/crossentropy": 2.8728411197662354, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18325071781873703, + "step": 21406 + }, + { + "epoch": 0.669, + "grad_norm": 3.15625, + "grad_norm_var": 0.04472554524739583, + "learning_rate": 0.0001, + "loss": 5.6739, + "loss/crossentropy": 2.6061822175979614, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1630219966173172, + "step": 21408 + }, + { + "epoch": 0.6690625, + "grad_norm": 3.234375, + "grad_norm_var": 0.034520467122395836, + "learning_rate": 0.0001, + "loss": 5.4019, + "loss/crossentropy": 2.3656262159347534, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15831893682479858, + "step": 21410 + }, + { + "epoch": 0.669125, + "grad_norm": 3.171875, + "grad_norm_var": 0.03560791015625, + "learning_rate": 0.0001, + "loss": 5.614, + "loss/crossentropy": 2.4813379049301147, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16795629262924194, + "step": 21412 + }, + { + "epoch": 0.6691875, + "grad_norm": 3.21875, + "grad_norm_var": 0.019115193684895834, + "learning_rate": 0.0001, + "loss": 5.8957, + "loss/crossentropy": 2.6477352380752563, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17518383264541626, + "step": 21414 + }, + { + "epoch": 0.66925, + "grad_norm": 3.140625, + "grad_norm_var": 0.015152994791666667, + "learning_rate": 0.0001, + "loss": 5.8608, + "loss/crossentropy": 2.6706048250198364, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1705779880285263, + "step": 21416 + }, + { + "epoch": 0.6693125, + "grad_norm": 3.15625, + "grad_norm_var": 0.012767537434895834, + "learning_rate": 0.0001, + "loss": 5.9529, + "loss/crossentropy": 2.699122905731201, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17694467306137085, + "step": 21418 + }, + { + "epoch": 0.669375, + "grad_norm": 3.234375, + "grad_norm_var": 0.009105428059895834, + "learning_rate": 0.0001, + "loss": 5.7373, + "loss/crossentropy": 2.5856951475143433, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1675005704164505, + "step": 21420 + }, + { + "epoch": 0.6694375, + "grad_norm": 3.796875, + "grad_norm_var": 0.02802734375, + "learning_rate": 0.0001, + "loss": 5.5522, + "loss/crossentropy": 2.4567084312438965, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1614982932806015, + "step": 21422 + }, + { + "epoch": 0.6695, + "grad_norm": 2.59375, + "grad_norm_var": 0.061930338541666664, + "learning_rate": 0.0001, + "loss": 4.9384, + "loss/crossentropy": 2.1080212593078613, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.13928364217281342, + "step": 21424 + }, + { + "epoch": 0.6695625, + "grad_norm": 2.890625, + "grad_norm_var": 0.07755533854166667, + "learning_rate": 0.0001, + "loss": 5.5534, + "loss/crossentropy": 2.539226770401001, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1545458659529686, + "step": 21426 + }, + { + "epoch": 0.669625, + "grad_norm": 2.96875, + "grad_norm_var": 0.081494140625, + "learning_rate": 0.0001, + "loss": 5.442, + "loss/crossentropy": 2.441792845726013, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15432009100914001, + "step": 21428 + }, + { + "epoch": 0.6696875, + "grad_norm": 2.921875, + "grad_norm_var": 0.08239644368489583, + "learning_rate": 0.0001, + "loss": 5.705, + "loss/crossentropy": 2.6042795181274414, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16163183003664017, + "step": 21430 + }, + { + "epoch": 0.66975, + "grad_norm": 3.265625, + "grad_norm_var": 0.08306884765625, + "learning_rate": 0.0001, + "loss": 5.8226, + "loss/crossentropy": 2.6924946308135986, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16535554826259613, + "step": 21432 + }, + { + "epoch": 0.6698125, + "grad_norm": 3.015625, + "grad_norm_var": 0.07955729166666667, + "learning_rate": 0.0001, + "loss": 5.6431, + "loss/crossentropy": 2.638508439064026, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15709708631038666, + "step": 21434 + }, + { + "epoch": 0.669875, + "grad_norm": 3.15625, + "grad_norm_var": 0.07449442545572917, + "learning_rate": 0.0001, + "loss": 5.4824, + "loss/crossentropy": 2.4303663969039917, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.156769298017025, + "step": 21436 + }, + { + "epoch": 0.6699375, + "grad_norm": 2.765625, + "grad_norm_var": 0.031201171875, + "learning_rate": 0.0001, + "loss": 4.9212, + "loss/crossentropy": 2.09871244430542, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.14161968231201172, + "step": 21438 + }, + { + "epoch": 0.67, + "grad_norm": 2.859375, + "grad_norm_var": 0.03365885416666667, + "learning_rate": 0.0001, + "loss": 5.7513, + "loss/crossentropy": 2.6521228551864624, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16499435156583786, + "step": 21440 + }, + { + "epoch": 0.6700625, + "grad_norm": 3.21875, + "grad_norm_var": 0.03212890625, + "learning_rate": 0.0001, + "loss": 5.7794, + "loss/crossentropy": 2.672476291656494, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16499171406030655, + "step": 21442 + }, + { + "epoch": 0.670125, + "grad_norm": 3.140625, + "grad_norm_var": 0.03137613932291667, + "learning_rate": 0.0001, + "loss": 5.6225, + "loss/crossentropy": 2.4903299808502197, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16633928567171097, + "step": 21444 + }, + { + "epoch": 0.6701875, + "grad_norm": 3.0625, + "grad_norm_var": 0.0308502197265625, + "learning_rate": 0.0001, + "loss": 5.7248, + "loss/crossentropy": 2.611571788787842, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16522642225027084, + "step": 21446 + }, + { + "epoch": 0.67025, + "grad_norm": 3.203125, + "grad_norm_var": 0.029124959309895834, + "learning_rate": 0.0001, + "loss": 5.5744, + "loss/crossentropy": 2.4765546321868896, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16408215463161469, + "step": 21448 + }, + { + "epoch": 0.6703125, + "grad_norm": 3.125, + "grad_norm_var": 0.030159505208333333, + "learning_rate": 0.0001, + "loss": 5.5132, + "loss/crossentropy": 2.464064121246338, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1568623036146164, + "step": 21450 + }, + { + "epoch": 0.670375, + "grad_norm": 3.328125, + "grad_norm_var": 0.03284098307291667, + "learning_rate": 0.0001, + "loss": 5.7917, + "loss/crossentropy": 2.536476969718933, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1743551641702652, + "step": 21452 + }, + { + "epoch": 0.6704375, + "grad_norm": 3.265625, + "grad_norm_var": 0.021882120768229166, + "learning_rate": 0.0001, + "loss": 5.632, + "loss/crossentropy": 2.472257375717163, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1675385981798172, + "step": 21454 + }, + { + "epoch": 0.6705, + "grad_norm": 3.078125, + "grad_norm_var": 0.012776692708333334, + "learning_rate": 0.0001, + "loss": 5.5936, + "loss/crossentropy": 2.5181620121002197, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.15754467248916626, + "step": 21456 + }, + { + "epoch": 0.6705625, + "grad_norm": 3.375, + "grad_norm_var": 0.012970987955729167, + "learning_rate": 0.0001, + "loss": 5.408, + "loss/crossentropy": 2.344880700111389, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15865999460220337, + "step": 21458 + }, + { + "epoch": 0.670625, + "grad_norm": 3.078125, + "grad_norm_var": 0.046418253580729166, + "learning_rate": 0.0001, + "loss": 6.0235, + "loss/crossentropy": 2.6762707233428955, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1847182661294937, + "step": 21460 + }, + { + "epoch": 0.6706875, + "grad_norm": 2.921875, + "grad_norm_var": 0.0536041259765625, + "learning_rate": 0.0001, + "loss": 5.6682, + "loss/crossentropy": 2.5822503566741943, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16250474750995636, + "step": 21462 + }, + { + "epoch": 0.67075, + "grad_norm": 3.125, + "grad_norm_var": 0.05126953125, + "learning_rate": 0.0001, + "loss": 5.4845, + "loss/crossentropy": 2.3889511823654175, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16228647530078888, + "step": 21464 + }, + { + "epoch": 0.6708125, + "grad_norm": 3.3125, + "grad_norm_var": 0.1002593994140625, + "learning_rate": 0.0001, + "loss": 6.1805, + "loss/crossentropy": 2.7867921590805054, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18429580330848694, + "step": 21466 + }, + { + "epoch": 0.670875, + "grad_norm": 3.0625, + "grad_norm_var": 0.11489156087239584, + "learning_rate": 0.0001, + "loss": 6.0297, + "loss/crossentropy": 2.723998546600342, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17901095747947693, + "step": 21468 + }, + { + "epoch": 0.6709375, + "grad_norm": 2.9375, + "grad_norm_var": 0.1339508056640625, + "learning_rate": 0.0001, + "loss": 5.5777, + "loss/crossentropy": 2.58323073387146, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15725874155759811, + "step": 21470 + }, + { + "epoch": 0.671, + "grad_norm": 3.140625, + "grad_norm_var": 0.13498942057291666, + "learning_rate": 0.0001, + "loss": 5.7627, + "loss/crossentropy": 2.617396593093872, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1684407889842987, + "step": 21472 + }, + { + "epoch": 0.6710625, + "grad_norm": 3.15625, + "grad_norm_var": 0.13696187337239582, + "learning_rate": 0.0001, + "loss": 6.154, + "loss/crossentropy": 2.908242344856262, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17261794209480286, + "step": 21474 + }, + { + "epoch": 0.671125, + "grad_norm": 3.0625, + "grad_norm_var": 0.10506083170572916, + "learning_rate": 0.0001, + "loss": 5.4484, + "loss/crossentropy": 2.439389705657959, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15597765892744064, + "step": 21476 + }, + { + "epoch": 0.6711875, + "grad_norm": 3.21875, + "grad_norm_var": 0.09988505045572917, + "learning_rate": 0.0001, + "loss": 5.6118, + "loss/crossentropy": 2.499711036682129, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16199331730604172, + "step": 21478 + }, + { + "epoch": 0.67125, + "grad_norm": 3.109375, + "grad_norm_var": 0.10393778483072917, + "learning_rate": 0.0001, + "loss": 5.4711, + "loss/crossentropy": 2.4389195442199707, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15672899782657623, + "step": 21480 + }, + { + "epoch": 0.6713125, + "grad_norm": 3.28125, + "grad_norm_var": 0.04990234375, + "learning_rate": 0.0001, + "loss": 5.6451, + "loss/crossentropy": 2.5340847969055176, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16657179594039917, + "step": 21482 + }, + { + "epoch": 0.671375, + "grad_norm": 3.125, + "grad_norm_var": 0.030436197916666668, + "learning_rate": 0.0001, + "loss": 5.9978, + "loss/crossentropy": 2.721407890319824, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17646276205778122, + "step": 21484 + }, + { + "epoch": 0.6714375, + "grad_norm": 3.25, + "grad_norm_var": 0.024117024739583333, + "learning_rate": 0.0001, + "loss": 5.5812, + "loss/crossentropy": 2.4138031005859375, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16829850524663925, + "step": 21486 + }, + { + "epoch": 0.6715, + "grad_norm": 3.671875, + "grad_norm_var": 0.20581766764322917, + "learning_rate": 0.0001, + "loss": 5.9694, + "loss/crossentropy": 2.6807148456573486, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17769964784383774, + "step": 21488 + }, + { + "epoch": 0.6715625, + "grad_norm": 3.453125, + "grad_norm_var": 0.20328369140625, + "learning_rate": 0.0001, + "loss": 5.5746, + "loss/crossentropy": 2.3858500719070435, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1657535880804062, + "step": 21490 + }, + { + "epoch": 0.671625, + "grad_norm": 2.9375, + "grad_norm_var": 0.2178619384765625, + "learning_rate": 0.0001, + "loss": 5.9514, + "loss/crossentropy": 2.840806007385254, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16301733255386353, + "step": 21492 + }, + { + "epoch": 0.6716875, + "grad_norm": 2.984375, + "grad_norm_var": 0.22433980305989584, + "learning_rate": 0.0001, + "loss": 5.4522, + "loss/crossentropy": 2.5097970962524414, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15283824503421783, + "step": 21494 + }, + { + "epoch": 0.67175, + "grad_norm": 2.84375, + "grad_norm_var": 0.23854878743489583, + "learning_rate": 0.0001, + "loss": 5.1412, + "loss/crossentropy": 2.231008291244507, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.149219810962677, + "step": 21496 + }, + { + "epoch": 0.6718125, + "grad_norm": 2.890625, + "grad_norm_var": 0.24462788899739582, + "learning_rate": 0.0001, + "loss": 5.9064, + "loss/crossentropy": 2.7614312171936035, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1652786284685135, + "step": 21498 + }, + { + "epoch": 0.671875, + "grad_norm": 3.265625, + "grad_norm_var": 0.24353841145833333, + "learning_rate": 0.0001, + "loss": 5.7548, + "loss/crossentropy": 2.602095127105713, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16917718201875687, + "step": 21500 + }, + { + "epoch": 0.6719375, + "grad_norm": 3.375, + "grad_norm_var": 5.868277994791667, + "learning_rate": 0.0001, + "loss": 6.7616, + "loss/crossentropy": 2.907825231552124, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.23147086799144745, + "step": 21502 + }, + { + "epoch": 0.672, + "grad_norm": 4.03125, + "grad_norm_var": 5.832027180989583, + "learning_rate": 0.0001, + "loss": 5.7947, + "loss/crossentropy": 2.5979628562927246, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1708494797348976, + "step": 21504 + }, + { + "epoch": 0.6720625, + "grad_norm": 3.140625, + "grad_norm_var": 5.864997355143229, + "learning_rate": 0.0001, + "loss": 5.5542, + "loss/crossentropy": 2.5263938903808594, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1574731022119522, + "step": 21506 + }, + { + "epoch": 0.672125, + "grad_norm": 3.265625, + "grad_norm_var": 5.815843709309896, + "learning_rate": 0.0001, + "loss": 5.6225, + "loss/crossentropy": 2.5533034801483154, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16004548966884613, + "step": 21508 + }, + { + "epoch": 0.6721875, + "grad_norm": 2.875, + "grad_norm_var": 5.85592041015625, + "learning_rate": 0.0001, + "loss": 5.5983, + "loss/crossentropy": 2.50632381439209, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16270896047353745, + "step": 21510 + }, + { + "epoch": 0.67225, + "grad_norm": 3.15625, + "grad_norm_var": 5.825260416666667, + "learning_rate": 0.0001, + "loss": 5.7735, + "loss/crossentropy": 2.6349629163742065, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16659299284219742, + "step": 21512 + }, + { + "epoch": 0.6723125, + "grad_norm": 3.078125, + "grad_norm_var": 5.812165323893229, + "learning_rate": 0.0001, + "loss": 5.5054, + "loss/crossentropy": 2.469195604324341, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16065293550491333, + "step": 21514 + }, + { + "epoch": 0.672375, + "grad_norm": 2.765625, + "grad_norm_var": 5.850516764322917, + "learning_rate": 0.0001, + "loss": 5.7099, + "loss/crossentropy": 2.6085174083709717, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16287671774625778, + "step": 21516 + }, + { + "epoch": 0.6724375, + "grad_norm": 2.984375, + "grad_norm_var": 0.07402242024739583, + "learning_rate": 0.0001, + "loss": 5.4277, + "loss/crossentropy": 2.424909234046936, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15574946999549866, + "step": 21518 + }, + { + "epoch": 0.6725, + "grad_norm": 3.171875, + "grad_norm_var": 0.024299112955729167, + "learning_rate": 0.0001, + "loss": 5.9345, + "loss/crossentropy": 2.6294257640838623, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17972489446401596, + "step": 21520 + }, + { + "epoch": 0.6725625, + "grad_norm": 3.46875, + "grad_norm_var": 0.03630269368489583, + "learning_rate": 0.0001, + "loss": 5.7392, + "loss/crossentropy": 2.563357949256897, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1695333793759346, + "step": 21522 + }, + { + "epoch": 0.672625, + "grad_norm": 3.34375, + "grad_norm_var": 0.040511067708333334, + "learning_rate": 0.0001, + "loss": 5.7447, + "loss/crossentropy": 2.613749384880066, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16426987200975418, + "step": 21524 + }, + { + "epoch": 0.6726875, + "grad_norm": 3.015625, + "grad_norm_var": 0.03713277180989583, + "learning_rate": 0.0001, + "loss": 6.1799, + "loss/crossentropy": 2.9965966939926147, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1706780195236206, + "step": 21526 + }, + { + "epoch": 0.67275, + "grad_norm": 3.21875, + "grad_norm_var": 0.037840779622395834, + "learning_rate": 0.0001, + "loss": 5.8798, + "loss/crossentropy": 2.6346405744552612, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17412938922643661, + "step": 21528 + }, + { + "epoch": 0.6728125, + "grad_norm": 3.3125, + "grad_norm_var": 0.03998921712239583, + "learning_rate": 0.0001, + "loss": 5.8051, + "loss/crossentropy": 2.5978105068206787, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17073268443346024, + "step": 21530 + }, + { + "epoch": 0.672875, + "grad_norm": 3.28125, + "grad_norm_var": 0.027669270833333332, + "learning_rate": 0.0001, + "loss": 5.7746, + "loss/crossentropy": 2.630752921104431, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16594670712947845, + "step": 21532 + }, + { + "epoch": 0.6729375, + "grad_norm": 3.375, + "grad_norm_var": 0.03235270182291667, + "learning_rate": 0.0001, + "loss": 5.8934, + "loss/crossentropy": 2.652822256088257, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17288265377283096, + "step": 21534 + }, + { + "epoch": 0.673, + "grad_norm": 3.109375, + "grad_norm_var": 0.0338043212890625, + "learning_rate": 0.0001, + "loss": 5.6394, + "loss/crossentropy": 2.5622141361236572, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16240905970335007, + "step": 21536 + }, + { + "epoch": 0.6730625, + "grad_norm": 2.984375, + "grad_norm_var": 0.025126139322916668, + "learning_rate": 0.0001, + "loss": 5.2742, + "loss/crossentropy": 2.327731966972351, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15324432402849197, + "step": 21538 + }, + { + "epoch": 0.673125, + "grad_norm": 2.890625, + "grad_norm_var": 0.024168904622395834, + "learning_rate": 0.0001, + "loss": 5.4945, + "loss/crossentropy": 2.427959680557251, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15977810323238373, + "step": 21540 + }, + { + "epoch": 0.6731875, + "grad_norm": 3.28125, + "grad_norm_var": 0.025178019205729166, + "learning_rate": 0.0001, + "loss": 5.9594, + "loss/crossentropy": 2.7185678482055664, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1764236018061638, + "step": 21542 + }, + { + "epoch": 0.67325, + "grad_norm": 3.328125, + "grad_norm_var": 0.027632649739583334, + "learning_rate": 0.0001, + "loss": 5.5374, + "loss/crossentropy": 2.5168328285217285, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1567484438419342, + "step": 21544 + }, + { + "epoch": 0.6733125, + "grad_norm": 3.578125, + "grad_norm_var": 0.0830718994140625, + "learning_rate": 0.0001, + "loss": 5.3888, + "loss/crossentropy": 2.248305916786194, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.161318838596344, + "step": 21546 + }, + { + "epoch": 0.673375, + "grad_norm": 3.140625, + "grad_norm_var": 0.0821929931640625, + "learning_rate": 0.0001, + "loss": 5.8188, + "loss/crossentropy": 2.6391515731811523, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16991789638996124, + "step": 21548 + }, + { + "epoch": 0.6734375, + "grad_norm": 3.03125, + "grad_norm_var": 0.07672119140625, + "learning_rate": 0.0001, + "loss": 5.1313, + "loss/crossentropy": 2.2192198038101196, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.14667852967977524, + "step": 21550 + }, + { + "epoch": 0.6735, + "grad_norm": 2.734375, + "grad_norm_var": 0.08629150390625, + "learning_rate": 0.0001, + "loss": 5.2923, + "loss/crossentropy": 2.354954719543457, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15154416859149933, + "step": 21552 + }, + { + "epoch": 0.6735625, + "grad_norm": 3.140625, + "grad_norm_var": 0.08384501139322917, + "learning_rate": 0.0001, + "loss": 5.6326, + "loss/crossentropy": 2.539131999015808, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16286291182041168, + "step": 21554 + }, + { + "epoch": 0.673625, + "grad_norm": 3.046875, + "grad_norm_var": 0.07922770182291666, + "learning_rate": 0.0001, + "loss": 5.6455, + "loss/crossentropy": 2.562136173248291, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16263478249311447, + "step": 21556 + }, + { + "epoch": 0.6736875, + "grad_norm": 3.203125, + "grad_norm_var": 0.07893778483072916, + "learning_rate": 0.0001, + "loss": 5.7154, + "loss/crossentropy": 2.568955898284912, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16777236759662628, + "step": 21558 + }, + { + "epoch": 0.67375, + "grad_norm": 2.890625, + "grad_norm_var": 0.0842681884765625, + "learning_rate": 0.0001, + "loss": 5.5765, + "loss/crossentropy": 2.5343549251556396, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1585097759962082, + "step": 21560 + }, + { + "epoch": 0.6738125, + "grad_norm": 3.5625, + "grad_norm_var": 0.0351470947265625, + "learning_rate": 0.0001, + "loss": 5.6905, + "loss/crossentropy": 2.507950782775879, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16825632750988007, + "step": 21562 + }, + { + "epoch": 0.673875, + "grad_norm": 2.984375, + "grad_norm_var": 0.03794657389322917, + "learning_rate": 0.0001, + "loss": 5.4478, + "loss/crossentropy": 2.4567400217056274, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15614214539527893, + "step": 21564 + }, + { + "epoch": 0.6739375, + "grad_norm": 3.0625, + "grad_norm_var": 0.0377105712890625, + "learning_rate": 0.0001, + "loss": 5.8502, + "loss/crossentropy": 2.651641011238098, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.17610616981983185, + "step": 21566 + }, + { + "epoch": 0.674, + "grad_norm": 2.953125, + "grad_norm_var": 0.03033447265625, + "learning_rate": 0.0001, + "loss": 5.7605, + "loss/crossentropy": 2.601743459701538, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1701701581478119, + "step": 21568 + }, + { + "epoch": 0.6740625, + "grad_norm": 3.390625, + "grad_norm_var": 0.031298828125, + "learning_rate": 0.0001, + "loss": 5.5561, + "loss/crossentropy": 2.5366307497024536, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1589776575565338, + "step": 21570 + }, + { + "epoch": 0.674125, + "grad_norm": 3.078125, + "grad_norm_var": 0.030402628580729167, + "learning_rate": 0.0001, + "loss": 5.903, + "loss/crossentropy": 2.720766544342041, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17017146944999695, + "step": 21572 + }, + { + "epoch": 0.6741875, + "grad_norm": 2.71875, + "grad_norm_var": 0.07251688639322916, + "learning_rate": 0.0001, + "loss": 5.59, + "loss/crossentropy": 2.456699252128601, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16723531484603882, + "step": 21574 + }, + { + "epoch": 0.67425, + "grad_norm": 3.15625, + "grad_norm_var": 0.06866861979166666, + "learning_rate": 0.0001, + "loss": 5.5863, + "loss/crossentropy": 2.5174355506896973, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.16547858715057373, + "step": 21576 + }, + { + "epoch": 0.6743125, + "grad_norm": 3.34375, + "grad_norm_var": 0.060286458333333334, + "learning_rate": 0.0001, + "loss": 5.5883, + "loss/crossentropy": 2.590228796005249, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15722859650850296, + "step": 21578 + }, + { + "epoch": 0.674375, + "grad_norm": 3.5, + "grad_norm_var": 0.06468098958333333, + "learning_rate": 0.0001, + "loss": 5.7053, + "loss/crossentropy": 2.455095648765564, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17501623928546906, + "step": 21580 + }, + { + "epoch": 0.6744375, + "grad_norm": 3.59375, + "grad_norm_var": 0.08015950520833333, + "learning_rate": 0.0001, + "loss": 5.771, + "loss/crossentropy": 2.598675847053528, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16958113014698029, + "step": 21582 + }, + { + "epoch": 0.6745, + "grad_norm": 3.109375, + "grad_norm_var": 0.07649637858072916, + "learning_rate": 0.0001, + "loss": 5.4936, + "loss/crossentropy": 2.440723180770874, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1564580649137497, + "step": 21584 + }, + { + "epoch": 0.6745625, + "grad_norm": 3.15625, + "grad_norm_var": 0.07935282389322916, + "learning_rate": 0.0001, + "loss": 5.3376, + "loss/crossentropy": 2.354195713996887, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1530267298221588, + "step": 21586 + }, + { + "epoch": 0.674625, + "grad_norm": 2.9375, + "grad_norm_var": 0.08111063639322917, + "learning_rate": 0.0001, + "loss": 5.5999, + "loss/crossentropy": 2.4619847536087036, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1649627760052681, + "step": 21588 + }, + { + "epoch": 0.6746875, + "grad_norm": 2.984375, + "grad_norm_var": 0.04377339680989583, + "learning_rate": 0.0001, + "loss": 5.6582, + "loss/crossentropy": 2.5627092123031616, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16071902960538864, + "step": 21590 + }, + { + "epoch": 0.67475, + "grad_norm": 3.1875, + "grad_norm_var": 0.04342041015625, + "learning_rate": 0.0001, + "loss": 5.598, + "loss/crossentropy": 2.445231556892395, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1683986634016037, + "step": 21592 + }, + { + "epoch": 0.6748125, + "grad_norm": 3.0, + "grad_norm_var": 0.04360249837239583, + "learning_rate": 0.0001, + "loss": 5.7165, + "loss/crossentropy": 2.5691791772842407, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16941748559474945, + "step": 21594 + }, + { + "epoch": 0.674875, + "grad_norm": 3.0, + "grad_norm_var": 0.03290608723958333, + "learning_rate": 0.0001, + "loss": 5.2587, + "loss/crossentropy": 2.328010857105255, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.14658305048942566, + "step": 21596 + }, + { + "epoch": 0.6749375, + "grad_norm": 3.859375, + "grad_norm_var": 0.052733357747395834, + "learning_rate": 0.0001, + "loss": 5.5409, + "loss/crossentropy": 2.464159846305847, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1619701012969017, + "step": 21598 + }, + { + "epoch": 0.675, + "grad_norm": 3.796875, + "grad_norm_var": 0.08989969889322917, + "learning_rate": 0.0001, + "loss": 5.9731, + "loss/crossentropy": 2.5941025018692017, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.18829017877578735, + "step": 21600 + }, + { + "epoch": 0.6750625, + "grad_norm": 3.40625, + "grad_norm_var": 0.08860575358072917, + "learning_rate": 0.0001, + "loss": 5.9492, + "loss/crossentropy": 2.6869665384292603, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1805206462740898, + "step": 21602 + }, + { + "epoch": 0.675125, + "grad_norm": 3.0625, + "grad_norm_var": 0.0867828369140625, + "learning_rate": 0.0001, + "loss": 5.392, + "loss/crossentropy": 2.291312098503113, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16436392813920975, + "step": 21604 + }, + { + "epoch": 0.6751875, + "grad_norm": 3.484375, + "grad_norm_var": 0.08621317545572917, + "learning_rate": 0.0001, + "loss": 6.0364, + "loss/crossentropy": 2.701277017593384, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.17765580117702484, + "step": 21606 + }, + { + "epoch": 0.67525, + "grad_norm": 3.046875, + "grad_norm_var": 0.0879302978515625, + "learning_rate": 0.0001, + "loss": 5.9724, + "loss/crossentropy": 2.739699602127075, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1724887266755104, + "step": 21608 + }, + { + "epoch": 0.6753125, + "grad_norm": 2.984375, + "grad_norm_var": 0.08313700358072916, + "learning_rate": 0.0001, + "loss": 5.7393, + "loss/crossentropy": 2.6435153484344482, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1673872172832489, + "step": 21610 + }, + { + "epoch": 0.675375, + "grad_norm": 3.03125, + "grad_norm_var": 0.0766021728515625, + "learning_rate": 0.0001, + "loss": 5.6872, + "loss/crossentropy": 2.5578598976135254, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16566597670316696, + "step": 21612 + }, + { + "epoch": 0.6754375, + "grad_norm": 2.828125, + "grad_norm_var": 0.0662017822265625, + "learning_rate": 0.0001, + "loss": 5.5189, + "loss/crossentropy": 2.4635757207870483, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16178404539823532, + "step": 21614 + }, + { + "epoch": 0.6755, + "grad_norm": 3.296875, + "grad_norm_var": 0.033991495768229164, + "learning_rate": 0.0001, + "loss": 5.9125, + "loss/crossentropy": 2.6843149662017822, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1743776500225067, + "step": 21616 + }, + { + "epoch": 0.6755625, + "grad_norm": 2.9375, + "grad_norm_var": 0.038981119791666664, + "learning_rate": 0.0001, + "loss": 5.7046, + "loss/crossentropy": 2.483850121498108, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1701253280043602, + "step": 21618 + }, + { + "epoch": 0.675625, + "grad_norm": 3.203125, + "grad_norm_var": 0.04343973795572917, + "learning_rate": 0.0001, + "loss": 5.5076, + "loss/crossentropy": 2.4350651502609253, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1631159856915474, + "step": 21620 + }, + { + "epoch": 0.6756875, + "grad_norm": 3.046875, + "grad_norm_var": 0.028571573893229167, + "learning_rate": 0.0001, + "loss": 5.5037, + "loss/crossentropy": 2.5485366582870483, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15411410480737686, + "step": 21622 + }, + { + "epoch": 0.67575, + "grad_norm": 2.71875, + "grad_norm_var": 0.03479410807291667, + "learning_rate": 0.0001, + "loss": 5.4561, + "loss/crossentropy": 2.519161343574524, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15228858590126038, + "step": 21624 + }, + { + "epoch": 0.6758125, + "grad_norm": 3.171875, + "grad_norm_var": 0.034968058268229164, + "learning_rate": 0.0001, + "loss": 5.7888, + "loss/crossentropy": 2.658233880996704, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1669674590229988, + "step": 21626 + }, + { + "epoch": 0.675875, + "grad_norm": 3.203125, + "grad_norm_var": 0.0362457275390625, + "learning_rate": 0.0001, + "loss": 5.5435, + "loss/crossentropy": 2.451690912246704, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1638697236776352, + "step": 21628 + }, + { + "epoch": 0.6759375, + "grad_norm": 3.109375, + "grad_norm_var": 0.03424072265625, + "learning_rate": 0.0001, + "loss": 5.6949, + "loss/crossentropy": 2.565598487854004, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1641046106815338, + "step": 21630 + }, + { + "epoch": 0.676, + "grad_norm": 3.21875, + "grad_norm_var": 0.0347564697265625, + "learning_rate": 0.0001, + "loss": 5.9732, + "loss/crossentropy": 2.781027913093567, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17194978892803192, + "step": 21632 + }, + { + "epoch": 0.6760625, + "grad_norm": 3.25, + "grad_norm_var": 0.0332672119140625, + "learning_rate": 0.0001, + "loss": 5.6233, + "loss/crossentropy": 2.475935459136963, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16629965603351593, + "step": 21634 + }, + { + "epoch": 0.676125, + "grad_norm": 2.921875, + "grad_norm_var": 0.03291015625, + "learning_rate": 0.0001, + "loss": 5.4409, + "loss/crossentropy": 2.413808822631836, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15739503502845764, + "step": 21636 + }, + { + "epoch": 0.6761875, + "grad_norm": 3.453125, + "grad_norm_var": 0.03726806640625, + "learning_rate": 0.0001, + "loss": 5.7385, + "loss/crossentropy": 2.514286160469055, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17359095811843872, + "step": 21638 + }, + { + "epoch": 0.67625, + "grad_norm": 3.109375, + "grad_norm_var": 0.026667277018229168, + "learning_rate": 0.0001, + "loss": 5.6374, + "loss/crossentropy": 2.5960875749588013, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.1635073944926262, + "step": 21640 + }, + { + "epoch": 0.6763125, + "grad_norm": 3.421875, + "grad_norm_var": 0.0535552978515625, + "learning_rate": 0.0001, + "loss": 5.8778, + "loss/crossentropy": 2.601802349090576, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1776026487350464, + "step": 21642 + }, + { + "epoch": 0.676375, + "grad_norm": 3.171875, + "grad_norm_var": 0.05562744140625, + "learning_rate": 0.0001, + "loss": 5.4776, + "loss/crossentropy": 2.3748942613601685, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1614421010017395, + "step": 21644 + }, + { + "epoch": 0.6764375, + "grad_norm": 3.1875, + "grad_norm_var": 0.05523681640625, + "learning_rate": 0.0001, + "loss": 5.512, + "loss/crossentropy": 2.480241894721985, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15630466490983963, + "step": 21646 + }, + { + "epoch": 0.6765, + "grad_norm": 2.84375, + "grad_norm_var": 0.06116129557291667, + "learning_rate": 0.0001, + "loss": 5.5291, + "loss/crossentropy": 2.4708411693573, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1593453586101532, + "step": 21648 + }, + { + "epoch": 0.6765625, + "grad_norm": 3.328125, + "grad_norm_var": 0.061751302083333334, + "learning_rate": 0.0001, + "loss": 5.9215, + "loss/crossentropy": 2.7264283895492554, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1738065481185913, + "step": 21650 + }, + { + "epoch": 0.676625, + "grad_norm": 3.296875, + "grad_norm_var": 0.059342447916666666, + "learning_rate": 0.0001, + "loss": 6.0063, + "loss/crossentropy": 2.7244467735290527, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17779753357172012, + "step": 21652 + }, + { + "epoch": 0.6766875, + "grad_norm": 3.703125, + "grad_norm_var": 0.07317606608072917, + "learning_rate": 0.0001, + "loss": 5.5412, + "loss/crossentropy": 2.442210555076599, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1618504673242569, + "step": 21654 + }, + { + "epoch": 0.67675, + "grad_norm": 3.09375, + "grad_norm_var": 0.07888997395833333, + "learning_rate": 0.0001, + "loss": 5.6021, + "loss/crossentropy": 2.5033955574035645, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1637721210718155, + "step": 21656 + }, + { + "epoch": 0.6768125, + "grad_norm": 3.359375, + "grad_norm_var": 0.051416015625, + "learning_rate": 0.0001, + "loss": 6.0564, + "loss/crossentropy": 2.7717570066452026, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17611663043498993, + "step": 21658 + }, + { + "epoch": 0.676875, + "grad_norm": 3.640625, + "grad_norm_var": 0.0673828125, + "learning_rate": 0.0001, + "loss": 5.4322, + "loss/crossentropy": 2.362541079521179, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15852497518062592, + "step": 21660 + }, + { + "epoch": 0.6769375, + "grad_norm": 3.1875, + "grad_norm_var": 0.06842041015625, + "learning_rate": 0.0001, + "loss": 5.7047, + "loss/crossentropy": 2.586043357849121, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16889940202236176, + "step": 21662 + }, + { + "epoch": 0.677, + "grad_norm": 3.09375, + "grad_norm_var": 0.06282552083333333, + "learning_rate": 0.0001, + "loss": 5.4755, + "loss/crossentropy": 2.4535220861434937, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15375986695289612, + "step": 21664 + }, + { + "epoch": 0.6770625, + "grad_norm": 3.03125, + "grad_norm_var": 0.06164957682291667, + "learning_rate": 0.0001, + "loss": 5.6035, + "loss/crossentropy": 2.4998347759246826, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.15997806936502457, + "step": 21666 + }, + { + "epoch": 0.677125, + "grad_norm": 3.359375, + "grad_norm_var": 0.06210530598958333, + "learning_rate": 0.0001, + "loss": 6.0946, + "loss/crossentropy": 2.818961977958679, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17678148299455643, + "step": 21668 + }, + { + "epoch": 0.6771875, + "grad_norm": 3.171875, + "grad_norm_var": 0.06658528645833334, + "learning_rate": 0.0001, + "loss": 5.779, + "loss/crossentropy": 2.6024211645126343, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16805046051740646, + "step": 21670 + }, + { + "epoch": 0.67725, + "grad_norm": 3.0, + "grad_norm_var": 0.06057535807291667, + "learning_rate": 0.0001, + "loss": 5.9059, + "loss/crossentropy": 2.696392774581909, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17016559094190598, + "step": 21672 + }, + { + "epoch": 0.6773125, + "grad_norm": 3.078125, + "grad_norm_var": 0.06682840983072917, + "learning_rate": 0.0001, + "loss": 5.466, + "loss/crossentropy": 2.409417152404785, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15839356929063797, + "step": 21674 + }, + { + "epoch": 0.677375, + "grad_norm": 2.875, + "grad_norm_var": 0.060074869791666666, + "learning_rate": 0.0001, + "loss": 5.2729, + "loss/crossentropy": 2.3077032566070557, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15316389501094818, + "step": 21676 + }, + { + "epoch": 0.6774375, + "grad_norm": 2.71875, + "grad_norm_var": 0.06892903645833333, + "learning_rate": 0.0001, + "loss": 5.0794, + "loss/crossentropy": 2.1735494136810303, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.14761539548635483, + "step": 21678 + }, + { + "epoch": 0.6775, + "grad_norm": 3.015625, + "grad_norm_var": 0.0689453125, + "learning_rate": 0.0001, + "loss": 5.4218, + "loss/crossentropy": 2.421651840209961, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15860862284898758, + "step": 21680 + }, + { + "epoch": 0.6775625, + "grad_norm": 3.125, + "grad_norm_var": 0.066796875, + "learning_rate": 0.0001, + "loss": 5.5028, + "loss/crossentropy": 2.4240047931671143, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16295696049928665, + "step": 21682 + }, + { + "epoch": 0.677625, + "grad_norm": 2.6875, + "grad_norm_var": 0.07112223307291667, + "learning_rate": 0.0001, + "loss": 5.4404, + "loss/crossentropy": 2.4916775226593018, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15268395096063614, + "step": 21684 + }, + { + "epoch": 0.6776875, + "grad_norm": 3.296875, + "grad_norm_var": 0.0428619384765625, + "learning_rate": 0.0001, + "loss": 5.8948, + "loss/crossentropy": 2.6532318592071533, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17571566998958588, + "step": 21686 + }, + { + "epoch": 0.67775, + "grad_norm": 3.1875, + "grad_norm_var": 0.038330078125, + "learning_rate": 0.0001, + "loss": 5.5301, + "loss/crossentropy": 2.513607978820801, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15555134415626526, + "step": 21688 + }, + { + "epoch": 0.6778125, + "grad_norm": 3.234375, + "grad_norm_var": 0.039281209309895836, + "learning_rate": 0.0001, + "loss": 5.5822, + "loss/crossentropy": 2.5421735048294067, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1594759076833725, + "step": 21690 + }, + { + "epoch": 0.677875, + "grad_norm": 3.3125, + "grad_norm_var": 0.04052327473958333, + "learning_rate": 0.0001, + "loss": 5.4246, + "loss/crossentropy": 2.3655821084976196, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15942063927650452, + "step": 21692 + }, + { + "epoch": 0.6779375, + "grad_norm": 3.125, + "grad_norm_var": 0.037629191080729166, + "learning_rate": 0.0001, + "loss": 5.7691, + "loss/crossentropy": 2.653545618057251, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16506946086883545, + "step": 21694 + }, + { + "epoch": 0.678, + "grad_norm": 3.1875, + "grad_norm_var": 0.03615620930989583, + "learning_rate": 0.0001, + "loss": 5.9941, + "loss/crossentropy": 2.679787755012512, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18064585328102112, + "step": 21696 + }, + { + "epoch": 0.6780625, + "grad_norm": 3.03125, + "grad_norm_var": 0.038508097330729164, + "learning_rate": 0.0001, + "loss": 5.5408, + "loss/crossentropy": 2.5309895277023315, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1568371206521988, + "step": 21698 + }, + { + "epoch": 0.678125, + "grad_norm": 3.1875, + "grad_norm_var": 0.0251861572265625, + "learning_rate": 0.0001, + "loss": 5.9506, + "loss/crossentropy": 2.7247650623321533, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1741444393992424, + "step": 21700 + }, + { + "epoch": 0.6781875, + "grad_norm": 3.078125, + "grad_norm_var": 0.029295857747395834, + "learning_rate": 0.0001, + "loss": 5.77, + "loss/crossentropy": 2.7215741872787476, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16031458973884583, + "step": 21702 + }, + { + "epoch": 0.67825, + "grad_norm": 3.4375, + "grad_norm_var": 0.03905843098958333, + "learning_rate": 0.0001, + "loss": 5.6592, + "loss/crossentropy": 2.507062554359436, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16951168328523636, + "step": 21704 + }, + { + "epoch": 0.6783125, + "grad_norm": 2.875, + "grad_norm_var": 0.04080403645833333, + "learning_rate": 0.0001, + "loss": 5.3356, + "loss/crossentropy": 2.342239499092102, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15246369689702988, + "step": 21706 + }, + { + "epoch": 0.678375, + "grad_norm": 3.3125, + "grad_norm_var": 0.04680074055989583, + "learning_rate": 0.0001, + "loss": 5.6685, + "loss/crossentropy": 2.6061320304870605, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16209647804498672, + "step": 21708 + }, + { + "epoch": 0.6784375, + "grad_norm": 2.921875, + "grad_norm_var": 0.0479644775390625, + "learning_rate": 0.0001, + "loss": 5.4719, + "loss/crossentropy": 2.491135835647583, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.15706124156713486, + "step": 21710 + }, + { + "epoch": 0.6785, + "grad_norm": 3.03125, + "grad_norm_var": 0.04545796712239583, + "learning_rate": 0.0001, + "loss": 5.507, + "loss/crossentropy": 2.4627164602279663, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1614636331796646, + "step": 21712 + }, + { + "epoch": 0.6785625, + "grad_norm": 3.09375, + "grad_norm_var": 0.0504547119140625, + "learning_rate": 0.0001, + "loss": 5.5285, + "loss/crossentropy": 2.550132632255554, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.1560390442609787, + "step": 21714 + }, + { + "epoch": 0.678625, + "grad_norm": 2.890625, + "grad_norm_var": 0.04195963541666667, + "learning_rate": 0.0001, + "loss": 5.7092, + "loss/crossentropy": 2.5589277744293213, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16580668836832047, + "step": 21716 + }, + { + "epoch": 0.6786875, + "grad_norm": 3.15625, + "grad_norm_var": 0.043294270833333336, + "learning_rate": 0.0001, + "loss": 5.8755, + "loss/crossentropy": 2.6546236276626587, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17404113709926605, + "step": 21718 + }, + { + "epoch": 0.67875, + "grad_norm": 3.1875, + "grad_norm_var": 0.0341796875, + "learning_rate": 0.0001, + "loss": 5.6575, + "loss/crossentropy": 2.581426978111267, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16307304054498672, + "step": 21720 + }, + { + "epoch": 0.6788125, + "grad_norm": 3.0, + "grad_norm_var": 0.0287109375, + "learning_rate": 0.0001, + "loss": 5.7229, + "loss/crossentropy": 2.6428853273391724, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16424734890460968, + "step": 21722 + }, + { + "epoch": 0.678875, + "grad_norm": 3.265625, + "grad_norm_var": 0.024267578125, + "learning_rate": 0.0001, + "loss": 5.7864, + "loss/crossentropy": 2.600390076637268, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17133549600839615, + "step": 21724 + }, + { + "epoch": 0.6789375, + "grad_norm": 3.015625, + "grad_norm_var": 0.024348958333333334, + "learning_rate": 0.0001, + "loss": 5.5554, + "loss/crossentropy": 2.4865139722824097, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16157332062721252, + "step": 21726 + }, + { + "epoch": 0.679, + "grad_norm": 2.78125, + "grad_norm_var": 0.028450520833333333, + "learning_rate": 0.0001, + "loss": 5.5107, + "loss/crossentropy": 2.514100670814514, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1508304327726364, + "step": 21728 + }, + { + "epoch": 0.6790625, + "grad_norm": 3.5, + "grad_norm_var": 0.0369537353515625, + "learning_rate": 0.0001, + "loss": 5.5489, + "loss/crossentropy": 2.3980984687805176, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16507767140865326, + "step": 21730 + }, + { + "epoch": 0.679125, + "grad_norm": 3.140625, + "grad_norm_var": 0.035384114583333334, + "learning_rate": 0.0001, + "loss": 5.5816, + "loss/crossentropy": 2.4350985288619995, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16582182049751282, + "step": 21732 + }, + { + "epoch": 0.6791875, + "grad_norm": 3.046875, + "grad_norm_var": 0.03336181640625, + "learning_rate": 0.0001, + "loss": 5.4235, + "loss/crossentropy": 2.3969926834106445, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1592930406332016, + "step": 21734 + }, + { + "epoch": 0.67925, + "grad_norm": 3.078125, + "grad_norm_var": 0.029645792643229165, + "learning_rate": 0.0001, + "loss": 5.8289, + "loss/crossentropy": 2.715415120124817, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1636960431933403, + "step": 21736 + }, + { + "epoch": 0.6793125, + "grad_norm": 3.15625, + "grad_norm_var": 0.028889973958333332, + "learning_rate": 0.0001, + "loss": 5.8493, + "loss/crossentropy": 2.6418182849884033, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1734854057431221, + "step": 21738 + }, + { + "epoch": 0.679375, + "grad_norm": 3.03125, + "grad_norm_var": 0.02734375, + "learning_rate": 0.0001, + "loss": 5.4197, + "loss/crossentropy": 2.398454427719116, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15525156259536743, + "step": 21740 + }, + { + "epoch": 0.6794375, + "grad_norm": 3.3125, + "grad_norm_var": 0.027864583333333335, + "learning_rate": 0.0001, + "loss": 6.0178, + "loss/crossentropy": 2.7353005409240723, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1782461702823639, + "step": 21742 + }, + { + "epoch": 0.6795, + "grad_norm": 3.203125, + "grad_norm_var": 0.018504842122395834, + "learning_rate": 0.0001, + "loss": 5.674, + "loss/crossentropy": 2.527986168861389, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16459853947162628, + "step": 21744 + }, + { + "epoch": 0.6795625, + "grad_norm": 3.03125, + "grad_norm_var": 0.011421712239583333, + "learning_rate": 0.0001, + "loss": 5.5751, + "loss/crossentropy": 2.5177600383758545, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1615949645638466, + "step": 21746 + }, + { + "epoch": 0.679625, + "grad_norm": 3.125, + "grad_norm_var": 0.012646484375, + "learning_rate": 0.0001, + "loss": 5.8048, + "loss/crossentropy": 2.615201950073242, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17130304872989655, + "step": 21748 + }, + { + "epoch": 0.6796875, + "grad_norm": 2.953125, + "grad_norm_var": 0.0217193603515625, + "learning_rate": 0.0001, + "loss": 5.4275, + "loss/crossentropy": 2.363314151763916, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15875961631536484, + "step": 21750 + }, + { + "epoch": 0.67975, + "grad_norm": 3.265625, + "grad_norm_var": 0.0248443603515625, + "learning_rate": 0.0001, + "loss": 5.4695, + "loss/crossentropy": 2.4244298934936523, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1588074341416359, + "step": 21752 + }, + { + "epoch": 0.6798125, + "grad_norm": 3.28125, + "grad_norm_var": 0.03337300618489583, + "learning_rate": 0.0001, + "loss": 6.0363, + "loss/crossentropy": 2.735056757926941, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17895662039518356, + "step": 21754 + }, + { + "epoch": 0.679875, + "grad_norm": 2.8125, + "grad_norm_var": 0.03889872233072917, + "learning_rate": 0.0001, + "loss": 5.4128, + "loss/crossentropy": 2.4331005811691284, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1526607722043991, + "step": 21756 + }, + { + "epoch": 0.6799375, + "grad_norm": 3.328125, + "grad_norm_var": 0.0373199462890625, + "learning_rate": 0.0001, + "loss": 5.8498, + "loss/crossentropy": 2.6393890380859375, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.167527437210083, + "step": 21758 + }, + { + "epoch": 0.68, + "grad_norm": 2.984375, + "grad_norm_var": 0.039826456705729166, + "learning_rate": 0.0001, + "loss": 5.4507, + "loss/crossentropy": 2.4479658603668213, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15887079387903214, + "step": 21760 + }, + { + "epoch": 0.6800625, + "grad_norm": 3.296875, + "grad_norm_var": 0.03677978515625, + "learning_rate": 0.0001, + "loss": 5.9617, + "loss/crossentropy": 2.7392600774765015, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.173413947224617, + "step": 21762 + }, + { + "epoch": 0.680125, + "grad_norm": 3.109375, + "grad_norm_var": 0.03687744140625, + "learning_rate": 0.0001, + "loss": 5.6082, + "loss/crossentropy": 2.4955997467041016, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16555281728506088, + "step": 21764 + }, + { + "epoch": 0.6801875, + "grad_norm": 3.125, + "grad_norm_var": 0.028804524739583334, + "learning_rate": 0.0001, + "loss": 5.702, + "loss/crossentropy": 2.5557764768600464, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16657356917858124, + "step": 21766 + }, + { + "epoch": 0.68025, + "grad_norm": 3.140625, + "grad_norm_var": 0.026276652018229166, + "learning_rate": 0.0001, + "loss": 5.6545, + "loss/crossentropy": 2.5388453006744385, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16742068529129028, + "step": 21768 + }, + { + "epoch": 0.6803125, + "grad_norm": 3.25, + "grad_norm_var": 0.016434733072916666, + "learning_rate": 0.0001, + "loss": 5.7271, + "loss/crossentropy": 2.5621873140335083, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17079270631074905, + "step": 21770 + }, + { + "epoch": 0.680375, + "grad_norm": 3.453125, + "grad_norm_var": 0.016087849934895832, + "learning_rate": 0.0001, + "loss": 5.7628, + "loss/crossentropy": 2.5121124982833862, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17350760102272034, + "step": 21772 + }, + { + "epoch": 0.6804375, + "grad_norm": 2.890625, + "grad_norm_var": 0.021637980143229166, + "learning_rate": 0.0001, + "loss": 5.3457, + "loss/crossentropy": 2.324031352996826, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15684963762760162, + "step": 21774 + }, + { + "epoch": 0.6805, + "grad_norm": 3.109375, + "grad_norm_var": 0.023193359375, + "learning_rate": 0.0001, + "loss": 5.6466, + "loss/crossentropy": 2.584893584251404, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16241639107465744, + "step": 21776 + }, + { + "epoch": 0.6805625, + "grad_norm": 3.0625, + "grad_norm_var": 0.02105712890625, + "learning_rate": 0.0001, + "loss": 5.4492, + "loss/crossentropy": 2.3818461894989014, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15869245678186417, + "step": 21778 + }, + { + "epoch": 0.680625, + "grad_norm": 2.9375, + "grad_norm_var": 0.024396769205729165, + "learning_rate": 0.0001, + "loss": 5.6076, + "loss/crossentropy": 2.5006372928619385, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16616414487361908, + "step": 21780 + }, + { + "epoch": 0.6806875, + "grad_norm": 3.25, + "grad_norm_var": 0.04126688639322917, + "learning_rate": 0.0001, + "loss": 5.7623, + "loss/crossentropy": 2.6387823820114136, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16430340707302094, + "step": 21782 + }, + { + "epoch": 0.68075, + "grad_norm": 3.5, + "grad_norm_var": 0.0513092041015625, + "learning_rate": 0.0001, + "loss": 5.7836, + "loss/crossentropy": 2.6003860235214233, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17027339339256287, + "step": 21784 + }, + { + "epoch": 0.6808125, + "grad_norm": 2.84375, + "grad_norm_var": 0.0592926025390625, + "learning_rate": 0.0001, + "loss": 5.65, + "loss/crossentropy": 2.577357530593872, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16195328533649445, + "step": 21786 + }, + { + "epoch": 0.680875, + "grad_norm": 2.921875, + "grad_norm_var": 0.05533447265625, + "learning_rate": 0.0001, + "loss": 5.2702, + "loss/crossentropy": 2.31095814704895, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15295067429542542, + "step": 21788 + }, + { + "epoch": 0.6809375, + "grad_norm": 3.234375, + "grad_norm_var": 0.0498687744140625, + "learning_rate": 0.0001, + "loss": 5.8766, + "loss/crossentropy": 2.666735887527466, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17137304693460464, + "step": 21790 + }, + { + "epoch": 0.681, + "grad_norm": 3.390625, + "grad_norm_var": 0.0470367431640625, + "learning_rate": 0.0001, + "loss": 6.1725, + "loss/crossentropy": 2.964896559715271, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17075765132904053, + "step": 21792 + }, + { + "epoch": 0.6810625, + "grad_norm": 3.265625, + "grad_norm_var": 0.0511383056640625, + "learning_rate": 0.0001, + "loss": 5.6333, + "loss/crossentropy": 2.51242196559906, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16130948811769485, + "step": 21794 + }, + { + "epoch": 0.681125, + "grad_norm": 3.15625, + "grad_norm_var": 0.04722900390625, + "learning_rate": 0.0001, + "loss": 5.761, + "loss/crossentropy": 2.619884967803955, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16762836277484894, + "step": 21796 + }, + { + "epoch": 0.6811875, + "grad_norm": 3.1875, + "grad_norm_var": 0.03577473958333333, + "learning_rate": 0.0001, + "loss": 5.5565, + "loss/crossentropy": 2.514464259147644, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15928348153829575, + "step": 21798 + }, + { + "epoch": 0.68125, + "grad_norm": 2.9375, + "grad_norm_var": 0.028547159830729165, + "learning_rate": 0.0001, + "loss": 5.5508, + "loss/crossentropy": 2.459440588951111, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16343362629413605, + "step": 21800 + }, + { + "epoch": 0.6813125, + "grad_norm": 3.15625, + "grad_norm_var": 0.0224609375, + "learning_rate": 0.0001, + "loss": 6.0475, + "loss/crossentropy": 2.7599663734436035, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17719028145074844, + "step": 21802 + }, + { + "epoch": 0.681375, + "grad_norm": 3.234375, + "grad_norm_var": 0.018098958333333335, + "learning_rate": 0.0001, + "loss": 5.8111, + "loss/crossentropy": 2.6380518674850464, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16847773641347885, + "step": 21804 + }, + { + "epoch": 0.6814375, + "grad_norm": 3.265625, + "grad_norm_var": 0.018229166666666668, + "learning_rate": 0.0001, + "loss": 5.8106, + "loss/crossentropy": 2.6051825284957886, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.168195903301239, + "step": 21806 + }, + { + "epoch": 0.6815, + "grad_norm": 3.140625, + "grad_norm_var": 0.043229166666666666, + "learning_rate": 0.0001, + "loss": 5.7213, + "loss/crossentropy": 2.54864764213562, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16531335562467575, + "step": 21808 + }, + { + "epoch": 0.6815625, + "grad_norm": 3.078125, + "grad_norm_var": 0.07844136555989584, + "learning_rate": 0.0001, + "loss": 5.5756, + "loss/crossentropy": 2.446473240852356, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.1582270786166191, + "step": 21810 + }, + { + "epoch": 0.681625, + "grad_norm": 3.09375, + "grad_norm_var": 0.08017171223958333, + "learning_rate": 0.0001, + "loss": 5.7248, + "loss/crossentropy": 2.618484854698181, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16610309481620789, + "step": 21812 + }, + { + "epoch": 0.6816875, + "grad_norm": 3.34375, + "grad_norm_var": 0.08569234212239583, + "learning_rate": 0.0001, + "loss": 5.8648, + "loss/crossentropy": 2.748309850692749, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16360018402338028, + "step": 21814 + }, + { + "epoch": 0.68175, + "grad_norm": 2.953125, + "grad_norm_var": 0.08810221354166667, + "learning_rate": 0.0001, + "loss": 5.6683, + "loss/crossentropy": 2.60297167301178, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1604352742433548, + "step": 21816 + }, + { + "epoch": 0.6818125, + "grad_norm": 3.375, + "grad_norm_var": 0.09358317057291667, + "learning_rate": 0.0001, + "loss": 5.6063, + "loss/crossentropy": 2.4555102586746216, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1662549301981926, + "step": 21818 + }, + { + "epoch": 0.681875, + "grad_norm": 3.265625, + "grad_norm_var": 0.1030426025390625, + "learning_rate": 0.0001, + "loss": 5.8703, + "loss/crossentropy": 2.67289662361145, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17208616435527802, + "step": 21820 + }, + { + "epoch": 0.6819375, + "grad_norm": 3.0625, + "grad_norm_var": 0.10695699055989584, + "learning_rate": 0.0001, + "loss": 5.7378, + "loss/crossentropy": 2.6012319326400757, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16639358550310135, + "step": 21822 + }, + { + "epoch": 0.682, + "grad_norm": 3.09375, + "grad_norm_var": 0.07779541015625, + "learning_rate": 0.0001, + "loss": 5.6664, + "loss/crossentropy": 2.509590744972229, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16841749101877213, + "step": 21824 + }, + { + "epoch": 0.6820625, + "grad_norm": 2.953125, + "grad_norm_var": 0.02222900390625, + "learning_rate": 0.0001, + "loss": 5.6028, + "loss/crossentropy": 2.49535071849823, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1634838804602623, + "step": 21826 + }, + { + "epoch": 0.682125, + "grad_norm": 3.0625, + "grad_norm_var": 0.026883951822916665, + "learning_rate": 0.0001, + "loss": 5.5229, + "loss/crossentropy": 2.367142081260681, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.16244809329509735, + "step": 21828 + }, + { + "epoch": 0.6821875, + "grad_norm": 3.1875, + "grad_norm_var": 0.021898396809895835, + "learning_rate": 0.0001, + "loss": 5.4361, + "loss/crossentropy": 2.474857807159424, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15472085028886795, + "step": 21830 + }, + { + "epoch": 0.68225, + "grad_norm": 3.21875, + "grad_norm_var": 0.0219879150390625, + "learning_rate": 0.0001, + "loss": 5.8704, + "loss/crossentropy": 2.7120243310928345, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16857147961854935, + "step": 21832 + }, + { + "epoch": 0.6823125, + "grad_norm": 3.0625, + "grad_norm_var": 0.01715087890625, + "learning_rate": 0.0001, + "loss": 5.8478, + "loss/crossentropy": 2.694658398628235, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.171563059091568, + "step": 21834 + }, + { + "epoch": 0.682375, + "grad_norm": 3.578125, + "grad_norm_var": 0.0235992431640625, + "learning_rate": 0.0001, + "loss": 5.7727, + "loss/crossentropy": 2.5386829376220703, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17378805577754974, + "step": 21836 + }, + { + "epoch": 0.6824375, + "grad_norm": 2.796875, + "grad_norm_var": 0.037262980143229166, + "learning_rate": 0.0001, + "loss": 5.491, + "loss/crossentropy": 2.5196973085403442, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1541638895869255, + "step": 21838 + }, + { + "epoch": 0.6825, + "grad_norm": 2.984375, + "grad_norm_var": 0.041169230143229166, + "learning_rate": 0.0001, + "loss": 5.4463, + "loss/crossentropy": 2.4267587661743164, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15703026950359344, + "step": 21840 + }, + { + "epoch": 0.6825625, + "grad_norm": 3.40625, + "grad_norm_var": 0.04413655598958333, + "learning_rate": 0.0001, + "loss": 5.8878, + "loss/crossentropy": 2.619802474975586, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17953240871429443, + "step": 21842 + }, + { + "epoch": 0.682625, + "grad_norm": 3.15625, + "grad_norm_var": 0.04462483723958333, + "learning_rate": 0.0001, + "loss": 5.741, + "loss/crossentropy": 2.5955699682235718, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1692328080534935, + "step": 21844 + }, + { + "epoch": 0.6826875, + "grad_norm": 3.421875, + "grad_norm_var": 0.05447591145833333, + "learning_rate": 0.0001, + "loss": 5.4384, + "loss/crossentropy": 2.3693102598190308, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16120661795139313, + "step": 21846 + }, + { + "epoch": 0.68275, + "grad_norm": 2.9375, + "grad_norm_var": 0.05701497395833333, + "learning_rate": 0.0001, + "loss": 5.4267, + "loss/crossentropy": 2.371813416481018, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16095516085624695, + "step": 21848 + }, + { + "epoch": 0.6828125, + "grad_norm": 3.015625, + "grad_norm_var": 0.05681966145833333, + "learning_rate": 0.0001, + "loss": 5.8023, + "loss/crossentropy": 2.6406824588775635, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1661587730050087, + "step": 21850 + }, + { + "epoch": 0.682875, + "grad_norm": 3.390625, + "grad_norm_var": 0.04744364420572917, + "learning_rate": 0.0001, + "loss": 5.7604, + "loss/crossentropy": 2.5873711109161377, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16691134870052338, + "step": 21852 + }, + { + "epoch": 0.6829375, + "grad_norm": 3.125, + "grad_norm_var": 0.033600870768229166, + "learning_rate": 0.0001, + "loss": 5.4775, + "loss/crossentropy": 2.450773000717163, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.15227775275707245, + "step": 21854 + }, + { + "epoch": 0.683, + "grad_norm": 2.984375, + "grad_norm_var": 0.029052734375, + "learning_rate": 0.0001, + "loss": 5.589, + "loss/crossentropy": 2.4866209030151367, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16453347355127335, + "step": 21856 + }, + { + "epoch": 0.6830625, + "grad_norm": 2.828125, + "grad_norm_var": 0.033524576822916666, + "learning_rate": 0.0001, + "loss": 5.1388, + "loss/crossentropy": 2.2659223079681396, + "loss/hidden": 1.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.14900429546833038, + "step": 21858 + }, + { + "epoch": 0.683125, + "grad_norm": 3.09375, + "grad_norm_var": 0.031859334309895834, + "learning_rate": 0.0001, + "loss": 5.4301, + "loss/crossentropy": 2.3820217847824097, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15754248201847076, + "step": 21860 + }, + { + "epoch": 0.6831875, + "grad_norm": 3.140625, + "grad_norm_var": 0.021711222330729165, + "learning_rate": 0.0001, + "loss": 5.8392, + "loss/crossentropy": 2.633925437927246, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1709151640534401, + "step": 21862 + }, + { + "epoch": 0.68325, + "grad_norm": 3.03125, + "grad_norm_var": 0.019059244791666666, + "learning_rate": 0.0001, + "loss": 5.9715, + "loss/crossentropy": 2.7862902879714966, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17164774239063263, + "step": 21864 + }, + { + "epoch": 0.6833125, + "grad_norm": 3.125, + "grad_norm_var": 0.01978759765625, + "learning_rate": 0.0001, + "loss": 5.4585, + "loss/crossentropy": 2.4424999952316284, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1590244174003601, + "step": 21866 + }, + { + "epoch": 0.683375, + "grad_norm": 2.96875, + "grad_norm_var": 0.013895670572916666, + "learning_rate": 0.0001, + "loss": 5.4504, + "loss/crossentropy": 2.384376049041748, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15972968190908432, + "step": 21868 + }, + { + "epoch": 0.6834375, + "grad_norm": 3.3125, + "grad_norm_var": 0.0204254150390625, + "learning_rate": 0.0001, + "loss": 5.9383, + "loss/crossentropy": 2.7200969457626343, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17260020226240158, + "step": 21870 + }, + { + "epoch": 0.6835, + "grad_norm": 3.328125, + "grad_norm_var": 0.023291015625, + "learning_rate": 0.0001, + "loss": 5.5581, + "loss/crossentropy": 2.4979900121688843, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1583554670214653, + "step": 21872 + }, + { + "epoch": 0.6835625, + "grad_norm": 3.0, + "grad_norm_var": 0.017406209309895834, + "learning_rate": 0.0001, + "loss": 5.5118, + "loss/crossentropy": 2.4855817556381226, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15692057460546494, + "step": 21874 + }, + { + "epoch": 0.683625, + "grad_norm": 3.640625, + "grad_norm_var": 0.03599853515625, + "learning_rate": 0.0001, + "loss": 5.3984, + "loss/crossentropy": 2.402698278427124, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1562107652425766, + "step": 21876 + }, + { + "epoch": 0.6836875, + "grad_norm": 3.046875, + "grad_norm_var": 0.0371002197265625, + "learning_rate": 0.0001, + "loss": 5.8649, + "loss/crossentropy": 2.6718828678131104, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17046964913606644, + "step": 21878 + }, + { + "epoch": 0.68375, + "grad_norm": 3.265625, + "grad_norm_var": 0.03863525390625, + "learning_rate": 0.0001, + "loss": 5.8438, + "loss/crossentropy": 2.690075397491455, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16693761944770813, + "step": 21880 + }, + { + "epoch": 0.6838125, + "grad_norm": 3.15625, + "grad_norm_var": 0.036408487955729166, + "learning_rate": 0.0001, + "loss": 5.9486, + "loss/crossentropy": 2.7212469577789307, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17078034579753876, + "step": 21882 + }, + { + "epoch": 0.683875, + "grad_norm": 3.546875, + "grad_norm_var": 0.04589436848958333, + "learning_rate": 0.0001, + "loss": 5.8821, + "loss/crossentropy": 2.655408501625061, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17461851239204407, + "step": 21884 + }, + { + "epoch": 0.6839375, + "grad_norm": 3.359375, + "grad_norm_var": 0.047281901041666664, + "learning_rate": 0.0001, + "loss": 5.7517, + "loss/crossentropy": 2.567640542984009, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17191973328590393, + "step": 21886 + }, + { + "epoch": 0.684, + "grad_norm": 2.9375, + "grad_norm_var": 0.06629130045572916, + "learning_rate": 0.0001, + "loss": 5.7444, + "loss/crossentropy": 2.6137380599975586, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16619385033845901, + "step": 21888 + }, + { + "epoch": 0.6840625, + "grad_norm": 3.15625, + "grad_norm_var": 0.06210530598958333, + "learning_rate": 0.0001, + "loss": 5.7919, + "loss/crossentropy": 2.644736409187317, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16901639103889465, + "step": 21890 + }, + { + "epoch": 0.684125, + "grad_norm": 3.796875, + "grad_norm_var": 0.07151285807291667, + "learning_rate": 0.0001, + "loss": 5.6171, + "loss/crossentropy": 2.507060408592224, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16373561322689056, + "step": 21892 + }, + { + "epoch": 0.6841875, + "grad_norm": 3.046875, + "grad_norm_var": 0.07157796223958333, + "learning_rate": 0.0001, + "loss": 5.8277, + "loss/crossentropy": 2.6672521829605103, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1687830165028572, + "step": 21894 + }, + { + "epoch": 0.68425, + "grad_norm": 2.90625, + "grad_norm_var": 0.07733968098958334, + "learning_rate": 0.0001, + "loss": 5.7131, + "loss/crossentropy": 2.591132402420044, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16258373111486435, + "step": 21896 + }, + { + "epoch": 0.6843125, + "grad_norm": 3.296875, + "grad_norm_var": 0.07913004557291667, + "learning_rate": 0.0001, + "loss": 5.8509, + "loss/crossentropy": 2.5987366437911987, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17482291162014008, + "step": 21898 + }, + { + "epoch": 0.684375, + "grad_norm": 2.921875, + "grad_norm_var": 0.07366434733072917, + "learning_rate": 0.0001, + "loss": 5.3414, + "loss/crossentropy": 2.376563310623169, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1519482657313347, + "step": 21900 + }, + { + "epoch": 0.6844375, + "grad_norm": 2.90625, + "grad_norm_var": 0.074755859375, + "learning_rate": 0.0001, + "loss": 5.6567, + "loss/crossentropy": 2.512323498725891, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1671675145626068, + "step": 21902 + }, + { + "epoch": 0.6845, + "grad_norm": 3.296875, + "grad_norm_var": 0.05561421712239583, + "learning_rate": 0.0001, + "loss": 5.6793, + "loss/crossentropy": 2.6000667810440063, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16573521494865417, + "step": 21904 + }, + { + "epoch": 0.6845625, + "grad_norm": 3.0, + "grad_norm_var": 0.0597320556640625, + "learning_rate": 0.0001, + "loss": 5.2702, + "loss/crossentropy": 2.292046546936035, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15640486031770706, + "step": 21906 + }, + { + "epoch": 0.684625, + "grad_norm": 2.859375, + "grad_norm_var": 0.03037109375, + "learning_rate": 0.0001, + "loss": 5.7362, + "loss/crossentropy": 2.6720213890075684, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15836820751428604, + "step": 21908 + }, + { + "epoch": 0.6846875, + "grad_norm": 3.0625, + "grad_norm_var": 0.03447163899739583, + "learning_rate": 0.0001, + "loss": 5.5336, + "loss/crossentropy": 2.515222191810608, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15925846248865128, + "step": 21910 + }, + { + "epoch": 0.68475, + "grad_norm": 3.46875, + "grad_norm_var": 0.04980061848958333, + "learning_rate": 0.0001, + "loss": 5.3531, + "loss/crossentropy": 2.3858895301818848, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1525760367512703, + "step": 21912 + }, + { + "epoch": 0.6848125, + "grad_norm": 3.0, + "grad_norm_var": 0.04840087890625, + "learning_rate": 0.0001, + "loss": 5.4175, + "loss/crossentropy": 2.37613046169281, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16117016226053238, + "step": 21914 + }, + { + "epoch": 0.684875, + "grad_norm": 3.0625, + "grad_norm_var": 0.0574127197265625, + "learning_rate": 0.0001, + "loss": 5.7245, + "loss/crossentropy": 2.587957739830017, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16600116342306137, + "step": 21916 + }, + { + "epoch": 0.6849375, + "grad_norm": 2.96875, + "grad_norm_var": 0.0575836181640625, + "learning_rate": 0.0001, + "loss": 5.719, + "loss/crossentropy": 2.6305911540985107, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16352810710668564, + "step": 21918 + }, + { + "epoch": 0.685, + "grad_norm": 3.109375, + "grad_norm_var": 0.05487874348958333, + "learning_rate": 0.0001, + "loss": 5.7561, + "loss/crossentropy": 2.6287907361984253, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16468214243650436, + "step": 21920 + }, + { + "epoch": 0.6850625, + "grad_norm": 2.859375, + "grad_norm_var": 0.055464680989583334, + "learning_rate": 0.0001, + "loss": 5.233, + "loss/crossentropy": 2.2569140195846558, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15424616634845734, + "step": 21922 + }, + { + "epoch": 0.685125, + "grad_norm": 2.953125, + "grad_norm_var": 0.0605865478515625, + "learning_rate": 0.0001, + "loss": 5.7256, + "loss/crossentropy": 2.5466452836990356, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16907097399234772, + "step": 21924 + }, + { + "epoch": 0.6851875, + "grad_norm": 2.84375, + "grad_norm_var": 0.059178670247395836, + "learning_rate": 0.0001, + "loss": 5.4947, + "loss/crossentropy": 2.4349265098571777, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1571447104215622, + "step": 21926 + }, + { + "epoch": 0.68525, + "grad_norm": 3.140625, + "grad_norm_var": 0.04527587890625, + "learning_rate": 0.0001, + "loss": 5.649, + "loss/crossentropy": 2.5687440633773804, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16153889894485474, + "step": 21928 + }, + { + "epoch": 0.6853125, + "grad_norm": 2.9375, + "grad_norm_var": 0.0420318603515625, + "learning_rate": 0.0001, + "loss": 5.8262, + "loss/crossentropy": 2.7170010805130005, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16522109508514404, + "step": 21930 + }, + { + "epoch": 0.685375, + "grad_norm": 3.21875, + "grad_norm_var": 0.035986328125, + "learning_rate": 0.0001, + "loss": 5.5689, + "loss/crossentropy": 2.470358729362488, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16336709260940552, + "step": 21932 + }, + { + "epoch": 0.6854375, + "grad_norm": 3.078125, + "grad_norm_var": 0.03255106608072917, + "learning_rate": 0.0001, + "loss": 5.3914, + "loss/crossentropy": 2.3599261045455933, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15861255675554276, + "step": 21934 + }, + { + "epoch": 0.6855, + "grad_norm": 3.203125, + "grad_norm_var": 0.03178609212239583, + "learning_rate": 0.0001, + "loss": 5.6691, + "loss/crossentropy": 2.5072624683380127, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16579187661409378, + "step": 21936 + }, + { + "epoch": 0.6855625, + "grad_norm": 3.265625, + "grad_norm_var": 0.027197265625, + "learning_rate": 0.0001, + "loss": 5.6996, + "loss/crossentropy": 2.5683263540267944, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16429609060287476, + "step": 21938 + }, + { + "epoch": 0.685625, + "grad_norm": 2.96875, + "grad_norm_var": 0.022948201497395834, + "learning_rate": 0.0001, + "loss": 5.5415, + "loss/crossentropy": 2.510878562927246, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15774516761302948, + "step": 21940 + }, + { + "epoch": 0.6856875, + "grad_norm": 2.796875, + "grad_norm_var": 0.036351521809895836, + "learning_rate": 0.0001, + "loss": 5.285, + "loss/crossentropy": 2.3741862773895264, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1473342627286911, + "step": 21942 + }, + { + "epoch": 0.68575, + "grad_norm": 2.984375, + "grad_norm_var": 0.03428446451822917, + "learning_rate": 0.0001, + "loss": 5.5747, + "loss/crossentropy": 2.489713668823242, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16475097090005875, + "step": 21944 + }, + { + "epoch": 0.6858125, + "grad_norm": 2.875, + "grad_norm_var": 0.03524983723958333, + "learning_rate": 0.0001, + "loss": 5.269, + "loss/crossentropy": 2.2745388746261597, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15295787900686264, + "step": 21946 + }, + { + "epoch": 0.685875, + "grad_norm": 3.015625, + "grad_norm_var": 0.031168619791666668, + "learning_rate": 0.0001, + "loss": 6.0097, + "loss/crossentropy": 2.8562405109405518, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16691070795059204, + "step": 21948 + }, + { + "epoch": 0.6859375, + "grad_norm": 3.453125, + "grad_norm_var": 0.04331766764322917, + "learning_rate": 0.0001, + "loss": 5.9235, + "loss/crossentropy": 2.685052752494812, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17227753251791, + "step": 21950 + }, + { + "epoch": 0.686, + "grad_norm": 3.578125, + "grad_norm_var": 0.061812337239583334, + "learning_rate": 0.0001, + "loss": 5.5667, + "loss/crossentropy": 2.4415215253829956, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16290438175201416, + "step": 21952 + }, + { + "epoch": 0.6860625, + "grad_norm": 3.109375, + "grad_norm_var": 0.0598052978515625, + "learning_rate": 0.0001, + "loss": 5.7266, + "loss/crossentropy": 2.640066146850586, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16295120865106583, + "step": 21954 + }, + { + "epoch": 0.686125, + "grad_norm": 3.09375, + "grad_norm_var": 0.055436197916666666, + "learning_rate": 0.0001, + "loss": 5.4187, + "loss/crossentropy": 2.4268020391464233, + "loss/hidden": 1.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.1589563563466072, + "step": 21956 + }, + { + "epoch": 0.6861875, + "grad_norm": 3.296875, + "grad_norm_var": 0.032568359375, + "learning_rate": 0.0001, + "loss": 6.0263, + "loss/crossentropy": 2.7968064546585083, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.17802338302135468, + "step": 21958 + }, + { + "epoch": 0.68625, + "grad_norm": 2.96875, + "grad_norm_var": 0.030720011393229166, + "learning_rate": 0.0001, + "loss": 5.5455, + "loss/crossentropy": 2.4713134765625, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16211090236902237, + "step": 21960 + }, + { + "epoch": 0.6863125, + "grad_norm": 3.0, + "grad_norm_var": 0.028678385416666667, + "learning_rate": 0.0001, + "loss": 5.3926, + "loss/crossentropy": 2.370116949081421, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15810777992010117, + "step": 21962 + }, + { + "epoch": 0.686375, + "grad_norm": 3.21875, + "grad_norm_var": 0.027269490559895835, + "learning_rate": 0.0001, + "loss": 5.7012, + "loss/crossentropy": 2.512209415435791, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17045696079730988, + "step": 21964 + }, + { + "epoch": 0.6864375, + "grad_norm": 3.359375, + "grad_norm_var": 0.024788411458333333, + "learning_rate": 0.0001, + "loss": 5.8152, + "loss/crossentropy": 2.6030231714248657, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17472904175519943, + "step": 21966 + }, + { + "epoch": 0.6865, + "grad_norm": 2.984375, + "grad_norm_var": 0.0165924072265625, + "learning_rate": 0.0001, + "loss": 5.8785, + "loss/crossentropy": 2.6595598459243774, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1715015023946762, + "step": 21968 + }, + { + "epoch": 0.6865625, + "grad_norm": 3.09375, + "grad_norm_var": 0.016813151041666665, + "learning_rate": 0.0001, + "loss": 6.0522, + "loss/crossentropy": 2.818776845932007, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17529769986867905, + "step": 21970 + }, + { + "epoch": 0.686625, + "grad_norm": 2.875, + "grad_norm_var": 0.0226715087890625, + "learning_rate": 0.0001, + "loss": 6.0145, + "loss/crossentropy": 2.9263845682144165, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16193270683288574, + "step": 21972 + }, + { + "epoch": 0.6866875, + "grad_norm": 3.125, + "grad_norm_var": 0.022607421875, + "learning_rate": 0.0001, + "loss": 5.4597, + "loss/crossentropy": 2.4031002521514893, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1603483036160469, + "step": 21974 + }, + { + "epoch": 0.68675, + "grad_norm": 3.1875, + "grad_norm_var": 0.030858357747395832, + "learning_rate": 0.0001, + "loss": 5.9979, + "loss/crossentropy": 2.7784128189086914, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1746804192662239, + "step": 21976 + }, + { + "epoch": 0.6868125, + "grad_norm": 3.796875, + "grad_norm_var": 0.052079264322916666, + "learning_rate": 0.0001, + "loss": 5.9265, + "loss/crossentropy": 2.6443564891815186, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1786029413342476, + "step": 21978 + }, + { + "epoch": 0.686875, + "grad_norm": 3.296875, + "grad_norm_var": 0.05781148274739583, + "learning_rate": 0.0001, + "loss": 5.6427, + "loss/crossentropy": 2.613027572631836, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1599966436624527, + "step": 21980 + }, + { + "epoch": 0.6869375, + "grad_norm": 3.0, + "grad_norm_var": 0.057062784830729164, + "learning_rate": 0.0001, + "loss": 5.5465, + "loss/crossentropy": 2.536167621612549, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15767823159694672, + "step": 21982 + }, + { + "epoch": 0.687, + "grad_norm": 3.046875, + "grad_norm_var": 0.056868489583333334, + "learning_rate": 0.0001, + "loss": 5.7003, + "loss/crossentropy": 2.591855049133301, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16318461298942566, + "step": 21984 + }, + { + "epoch": 0.6870625, + "grad_norm": 2.96875, + "grad_norm_var": 0.06162821451822917, + "learning_rate": 0.0001, + "loss": 5.8229, + "loss/crossentropy": 2.7432347536087036, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16421236097812653, + "step": 21986 + }, + { + "epoch": 0.687125, + "grad_norm": 3.109375, + "grad_norm_var": 0.05715230305989583, + "learning_rate": 0.0001, + "loss": 5.7634, + "loss/crossentropy": 2.633246064186096, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1673073247075081, + "step": 21988 + }, + { + "epoch": 0.6871875, + "grad_norm": 3.015625, + "grad_norm_var": 0.0553863525390625, + "learning_rate": 0.0001, + "loss": 5.6576, + "loss/crossentropy": 2.5245636701583862, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1636907309293747, + "step": 21990 + }, + { + "epoch": 0.68725, + "grad_norm": 3.21875, + "grad_norm_var": 0.0464508056640625, + "learning_rate": 0.0001, + "loss": 5.8027, + "loss/crossentropy": 2.5923744440078735, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17025303095579147, + "step": 21992 + }, + { + "epoch": 0.6873125, + "grad_norm": 2.9375, + "grad_norm_var": 0.016109212239583334, + "learning_rate": 0.0001, + "loss": 5.6666, + "loss/crossentropy": 2.5845742225646973, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1648474708199501, + "step": 21994 + }, + { + "epoch": 0.687375, + "grad_norm": 3.09375, + "grad_norm_var": 0.0107421875, + "learning_rate": 0.0001, + "loss": 5.5381, + "loss/crossentropy": 2.4800325632095337, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16127480566501617, + "step": 21996 + }, + { + "epoch": 0.6874375, + "grad_norm": 3.09375, + "grad_norm_var": 0.013581339518229167, + "learning_rate": 0.0001, + "loss": 6.0703, + "loss/crossentropy": 2.813502311706543, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17646300047636032, + "step": 21998 + }, + { + "epoch": 0.6875, + "grad_norm": 3.1875, + "grad_norm_var": 0.013263956705729166, + "learning_rate": 0.0001, + "loss": 5.8116, + "loss/crossentropy": 2.689139246940613, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1669357866048813, + "step": 22000 + }, + { + "epoch": 0.6875625, + "grad_norm": 3.625, + "grad_norm_var": 0.028706868489583332, + "learning_rate": 0.0001, + "loss": 5.6819, + "loss/crossentropy": 2.4935706853866577, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16961680352687836, + "step": 22002 + }, + { + "epoch": 0.687625, + "grad_norm": 3.0, + "grad_norm_var": 0.0355621337890625, + "learning_rate": 0.0001, + "loss": 5.6342, + "loss/crossentropy": 2.5379968881607056, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1600138247013092, + "step": 22004 + }, + { + "epoch": 0.6876875, + "grad_norm": 3.015625, + "grad_norm_var": 0.03567708333333333, + "learning_rate": 0.0001, + "loss": 5.5031, + "loss/crossentropy": 2.484773635864258, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1549551784992218, + "step": 22006 + }, + { + "epoch": 0.68775, + "grad_norm": 3.453125, + "grad_norm_var": 0.0484283447265625, + "learning_rate": 0.0001, + "loss": 5.7447, + "loss/crossentropy": 2.6551584005355835, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1624705046415329, + "step": 22008 + }, + { + "epoch": 0.6878125, + "grad_norm": 2.953125, + "grad_norm_var": 0.053929646809895836, + "learning_rate": 0.0001, + "loss": 5.6424, + "loss/crossentropy": 2.5648895502090454, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15970075130462646, + "step": 22010 + }, + { + "epoch": 0.687875, + "grad_norm": 3.34375, + "grad_norm_var": 0.05709228515625, + "learning_rate": 0.0001, + "loss": 5.6356, + "loss/crossentropy": 2.454726457595825, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16730573028326035, + "step": 22012 + }, + { + "epoch": 0.6879375, + "grad_norm": 3.21875, + "grad_norm_var": 0.06226806640625, + "learning_rate": 0.0001, + "loss": 5.5746, + "loss/crossentropy": 2.499474883079529, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16219748556613922, + "step": 22014 + }, + { + "epoch": 0.688, + "grad_norm": 3.671875, + "grad_norm_var": 0.087548828125, + "learning_rate": 0.0001, + "loss": 5.9268, + "loss/crossentropy": 2.5879805088043213, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1831054538488388, + "step": 22016 + }, + { + "epoch": 0.6880625, + "grad_norm": 3.0625, + "grad_norm_var": 0.07528889973958333, + "learning_rate": 0.0001, + "loss": 5.6894, + "loss/crossentropy": 2.5440523624420166, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16492797434329987, + "step": 22018 + }, + { + "epoch": 0.688125, + "grad_norm": 3.21875, + "grad_norm_var": 0.07002665201822916, + "learning_rate": 0.0001, + "loss": 5.6702, + "loss/crossentropy": 2.543825387954712, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16459020972251892, + "step": 22020 + }, + { + "epoch": 0.6881875, + "grad_norm": 3.328125, + "grad_norm_var": 0.07581278483072916, + "learning_rate": 0.0001, + "loss": 5.8816, + "loss/crossentropy": 2.706055998802185, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17107157409191132, + "step": 22022 + }, + { + "epoch": 0.68825, + "grad_norm": 3.0625, + "grad_norm_var": 0.0632720947265625, + "learning_rate": 0.0001, + "loss": 5.5121, + "loss/crossentropy": 2.443795084953308, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1579991579055786, + "step": 22024 + }, + { + "epoch": 0.6883125, + "grad_norm": 2.875, + "grad_norm_var": 0.05939839680989583, + "learning_rate": 0.0001, + "loss": 5.6521, + "loss/crossentropy": 2.6103020906448364, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15964560210704803, + "step": 22026 + }, + { + "epoch": 0.688375, + "grad_norm": 3.203125, + "grad_norm_var": 0.058333333333333334, + "learning_rate": 0.0001, + "loss": 5.8219, + "loss/crossentropy": 2.6712719202041626, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16858209669589996, + "step": 22028 + }, + { + "epoch": 0.6884375, + "grad_norm": 3.28125, + "grad_norm_var": 0.06360677083333334, + "learning_rate": 0.0001, + "loss": 5.4321, + "loss/crossentropy": 2.389652729034424, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16010120511054993, + "step": 22030 + }, + { + "epoch": 0.6885, + "grad_norm": 3.34375, + "grad_norm_var": 0.03693033854166667, + "learning_rate": 0.0001, + "loss": 5.829, + "loss/crossentropy": 2.5845173597335815, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.17991462349891663, + "step": 22032 + }, + { + "epoch": 0.6885625, + "grad_norm": 3.46875, + "grad_norm_var": 0.043257649739583334, + "learning_rate": 0.0001, + "loss": 5.6335, + "loss/crossentropy": 2.524790644645691, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16321934014558792, + "step": 22034 + }, + { + "epoch": 0.688625, + "grad_norm": 3.078125, + "grad_norm_var": 0.040816243489583334, + "learning_rate": 0.0001, + "loss": 5.4435, + "loss/crossentropy": 2.342835545539856, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1628004088997841, + "step": 22036 + }, + { + "epoch": 0.6886875, + "grad_norm": 3.09375, + "grad_norm_var": 0.033812459309895834, + "learning_rate": 0.0001, + "loss": 5.8164, + "loss/crossentropy": 2.640567421913147, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16758081316947937, + "step": 22038 + }, + { + "epoch": 0.68875, + "grad_norm": 3.4375, + "grad_norm_var": 0.0361480712890625, + "learning_rate": 0.0001, + "loss": 5.5189, + "loss/crossentropy": 2.3515186309814453, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16752216964960098, + "step": 22040 + }, + { + "epoch": 0.6888125, + "grad_norm": 3.28125, + "grad_norm_var": 0.027546183268229166, + "learning_rate": 0.0001, + "loss": 6.0336, + "loss/crossentropy": 2.7967907190322876, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1756298840045929, + "step": 22042 + }, + { + "epoch": 0.688875, + "grad_norm": 4.0, + "grad_norm_var": 0.059691365559895834, + "learning_rate": 0.0001, + "loss": 5.8296, + "loss/crossentropy": 2.5987207889556885, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.173872709274292, + "step": 22044 + }, + { + "epoch": 0.6889375, + "grad_norm": 2.90625, + "grad_norm_var": 0.06129150390625, + "learning_rate": 0.0001, + "loss": 5.7161, + "loss/crossentropy": 2.584370255470276, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16551848500967026, + "step": 22046 + }, + { + "epoch": 0.689, + "grad_norm": 2.703125, + "grad_norm_var": 0.08826395670572916, + "learning_rate": 0.0001, + "loss": 5.3159, + "loss/crossentropy": 2.424273729324341, + "loss/hidden": 1.38671875, + "loss/jsd": 0.0, + "loss/logits": 0.15049034357070923, + "step": 22048 + }, + { + "epoch": 0.6890625, + "grad_norm": 3.078125, + "grad_norm_var": 0.08479715983072916, + "learning_rate": 0.0001, + "loss": 5.8019, + "loss/crossentropy": 2.6658477783203125, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16555969417095184, + "step": 22050 + }, + { + "epoch": 0.689125, + "grad_norm": 3.4375, + "grad_norm_var": 0.08819986979166666, + "learning_rate": 0.0001, + "loss": 6.0253, + "loss/crossentropy": 2.7549906969070435, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17742366343736649, + "step": 22052 + }, + { + "epoch": 0.6891875, + "grad_norm": 3.1875, + "grad_norm_var": 0.08970947265625, + "learning_rate": 0.0001, + "loss": 5.7482, + "loss/crossentropy": 2.583523154258728, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16842039674520493, + "step": 22054 + }, + { + "epoch": 0.68925, + "grad_norm": 3.0625, + "grad_norm_var": 0.087890625, + "learning_rate": 0.0001, + "loss": 5.0966, + "loss/crossentropy": 2.2068421840667725, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.14483093470335007, + "step": 22056 + }, + { + "epoch": 0.6893125, + "grad_norm": 2.96875, + "grad_norm_var": 0.09039306640625, + "learning_rate": 0.0001, + "loss": 5.3215, + "loss/crossentropy": 2.3375245332717896, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15464308857917786, + "step": 22058 + }, + { + "epoch": 0.689375, + "grad_norm": 2.921875, + "grad_norm_var": 0.046044921875, + "learning_rate": 0.0001, + "loss": 5.5924, + "loss/crossentropy": 2.555259585380554, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.16191434860229492, + "step": 22060 + }, + { + "epoch": 0.6894375, + "grad_norm": 2.96875, + "grad_norm_var": 0.04265034993489583, + "learning_rate": 0.0001, + "loss": 5.5317, + "loss/crossentropy": 2.476559638977051, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1602018103003502, + "step": 22062 + }, + { + "epoch": 0.6895, + "grad_norm": 4.21875, + "grad_norm_var": 0.10701395670572916, + "learning_rate": 0.0001, + "loss": 5.6194, + "loss/crossentropy": 2.500504970550537, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16618820279836655, + "step": 22064 + }, + { + "epoch": 0.6895625, + "grad_norm": 3.015625, + "grad_norm_var": 0.10738016764322916, + "learning_rate": 0.0001, + "loss": 5.649, + "loss/crossentropy": 2.625067114830017, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15903447568416595, + "step": 22066 + }, + { + "epoch": 0.689625, + "grad_norm": 2.890625, + "grad_norm_var": 0.10705973307291666, + "learning_rate": 0.0001, + "loss": 5.5305, + "loss/crossentropy": 2.4101606607437134, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16672495007514954, + "step": 22068 + }, + { + "epoch": 0.6896875, + "grad_norm": 3.265625, + "grad_norm_var": 0.11030171712239584, + "learning_rate": 0.0001, + "loss": 5.6851, + "loss/crossentropy": 2.5571197271347046, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1647544801235199, + "step": 22070 + }, + { + "epoch": 0.68975, + "grad_norm": 3.046875, + "grad_norm_var": 0.11188863118489584, + "learning_rate": 0.0001, + "loss": 5.5543, + "loss/crossentropy": 2.49932861328125, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1586238443851471, + "step": 22072 + }, + { + "epoch": 0.6898125, + "grad_norm": 3.203125, + "grad_norm_var": 0.11961263020833333, + "learning_rate": 0.0001, + "loss": 5.9969, + "loss/crossentropy": 2.7396615743637085, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1776818111538887, + "step": 22074 + }, + { + "epoch": 0.689875, + "grad_norm": 3.34375, + "grad_norm_var": 0.10357666015625, + "learning_rate": 0.0001, + "loss": 5.6957, + "loss/crossentropy": 2.5299636125564575, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16696564108133316, + "step": 22076 + }, + { + "epoch": 0.6899375, + "grad_norm": 3.0625, + "grad_norm_var": 0.10089518229166666, + "learning_rate": 0.0001, + "loss": 5.5318, + "loss/crossentropy": 2.5148743391036987, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15677031874656677, + "step": 22078 + }, + { + "epoch": 0.69, + "grad_norm": 3.078125, + "grad_norm_var": 0.05152587890625, + "learning_rate": 0.0001, + "loss": 5.2976, + "loss/crossentropy": 2.294735014438629, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15458299219608307, + "step": 22080 + }, + { + "epoch": 0.6900625, + "grad_norm": 3.46875, + "grad_norm_var": 0.0514801025390625, + "learning_rate": 0.0001, + "loss": 5.8681, + "loss/crossentropy": 2.6674221754074097, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1724105179309845, + "step": 22082 + }, + { + "epoch": 0.690125, + "grad_norm": 3.734375, + "grad_norm_var": 0.056315104166666664, + "learning_rate": 0.0001, + "loss": 5.8382, + "loss/crossentropy": 2.5484025478363037, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1766359657049179, + "step": 22084 + }, + { + "epoch": 0.6901875, + "grad_norm": 3.5, + "grad_norm_var": 0.049462890625, + "learning_rate": 0.0001, + "loss": 5.5799, + "loss/crossentropy": 2.4521220922470093, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16356316953897476, + "step": 22086 + }, + { + "epoch": 0.69025, + "grad_norm": 3.1875, + "grad_norm_var": 0.045654296875, + "learning_rate": 0.0001, + "loss": 5.8716, + "loss/crossentropy": 2.670419692993164, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1724613457918167, + "step": 22088 + }, + { + "epoch": 0.6903125, + "grad_norm": 3.140625, + "grad_norm_var": 0.0429595947265625, + "learning_rate": 0.0001, + "loss": 5.7759, + "loss/crossentropy": 2.6571903228759766, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1630413457751274, + "step": 22090 + }, + { + "epoch": 0.690375, + "grad_norm": 3.140625, + "grad_norm_var": 0.045775349934895834, + "learning_rate": 0.0001, + "loss": 5.6678, + "loss/crossentropy": 2.6170836687088013, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1589749976992607, + "step": 22092 + }, + { + "epoch": 0.6904375, + "grad_norm": 2.859375, + "grad_norm_var": 0.05419514973958333, + "learning_rate": 0.0001, + "loss": 5.3578, + "loss/crossentropy": 2.3640111684799194, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1532842516899109, + "step": 22094 + }, + { + "epoch": 0.6905, + "grad_norm": 2.890625, + "grad_norm_var": 0.04895833333333333, + "learning_rate": 0.0001, + "loss": 5.3275, + "loss/crossentropy": 2.372657537460327, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.14900419861078262, + "step": 22096 + }, + { + "epoch": 0.6905625, + "grad_norm": 2.765625, + "grad_norm_var": 0.0707916259765625, + "learning_rate": 0.0001, + "loss": 5.4045, + "loss/crossentropy": 2.5098663568496704, + "loss/hidden": 1.38671875, + "loss/jsd": 0.0, + "loss/logits": 0.1507904753088951, + "step": 22098 + }, + { + "epoch": 0.690625, + "grad_norm": 3.21875, + "grad_norm_var": 0.0517730712890625, + "learning_rate": 0.0001, + "loss": 6.0669, + "loss/crossentropy": 2.717814803123474, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1817857176065445, + "step": 22100 + }, + { + "epoch": 0.6906875, + "grad_norm": 3.15625, + "grad_norm_var": 0.0433013916015625, + "learning_rate": 0.0001, + "loss": 5.55, + "loss/crossentropy": 2.4737772941589355, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16387274861335754, + "step": 22102 + }, + { + "epoch": 0.69075, + "grad_norm": 3.203125, + "grad_norm_var": 0.0423492431640625, + "learning_rate": 0.0001, + "loss": 5.7436, + "loss/crossentropy": 2.639219284057617, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16239119321107864, + "step": 22104 + }, + { + "epoch": 0.6908125, + "grad_norm": 3.390625, + "grad_norm_var": 0.051595052083333336, + "learning_rate": 0.0001, + "loss": 5.8804, + "loss/crossentropy": 2.7018351554870605, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16903229802846909, + "step": 22106 + }, + { + "epoch": 0.690875, + "grad_norm": 3.046875, + "grad_norm_var": 0.05776265462239583, + "learning_rate": 0.0001, + "loss": 5.1394, + "loss/crossentropy": 2.168722927570343, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.14941561222076416, + "step": 22108 + }, + { + "epoch": 0.6909375, + "grad_norm": 2.9375, + "grad_norm_var": 0.052783203125, + "learning_rate": 0.0001, + "loss": 5.4124, + "loss/crossentropy": 2.3674614429473877, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15879163146018982, + "step": 22110 + }, + { + "epoch": 0.691, + "grad_norm": 3.0, + "grad_norm_var": 0.05085347493489583, + "learning_rate": 0.0001, + "loss": 5.9384, + "loss/crossentropy": 2.75888991355896, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.169517882168293, + "step": 22112 + }, + { + "epoch": 0.6910625, + "grad_norm": 3.078125, + "grad_norm_var": 0.0372467041015625, + "learning_rate": 0.0001, + "loss": 5.5238, + "loss/crossentropy": 2.486830711364746, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15916302800178528, + "step": 22114 + }, + { + "epoch": 0.691125, + "grad_norm": 3.171875, + "grad_norm_var": 0.029938761393229166, + "learning_rate": 0.0001, + "loss": 5.5068, + "loss/crossentropy": 2.4414623975753784, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16044388711452484, + "step": 22116 + }, + { + "epoch": 0.6911875, + "grad_norm": 3.296875, + "grad_norm_var": 0.03482666015625, + "learning_rate": 0.0001, + "loss": 5.4823, + "loss/crossentropy": 2.430028796195984, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15874332934617996, + "step": 22118 + }, + { + "epoch": 0.69125, + "grad_norm": 3.09375, + "grad_norm_var": 0.0339752197265625, + "learning_rate": 0.0001, + "loss": 5.8687, + "loss/crossentropy": 2.71201491355896, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16957851499319077, + "step": 22120 + }, + { + "epoch": 0.6913125, + "grad_norm": 3.171875, + "grad_norm_var": 0.022850545247395833, + "learning_rate": 0.0001, + "loss": 5.9176, + "loss/crossentropy": 2.667091488838196, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.177008718252182, + "step": 22122 + }, + { + "epoch": 0.691375, + "grad_norm": 3.640625, + "grad_norm_var": 0.03797200520833333, + "learning_rate": 0.0001, + "loss": 5.9542, + "loss/crossentropy": 2.7011945247650146, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1756882593035698, + "step": 22124 + }, + { + "epoch": 0.6914375, + "grad_norm": 3.203125, + "grad_norm_var": 0.041259765625, + "learning_rate": 0.0001, + "loss": 5.3941, + "loss/crossentropy": 2.4094094038009644, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.1578417792916298, + "step": 22126 + }, + { + "epoch": 0.6915, + "grad_norm": 3.1875, + "grad_norm_var": 0.04081929524739583, + "learning_rate": 0.0001, + "loss": 5.5588, + "loss/crossentropy": 2.509129285812378, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15613751113414764, + "step": 22128 + }, + { + "epoch": 0.6915625, + "grad_norm": 3.265625, + "grad_norm_var": 0.04101155598958333, + "learning_rate": 0.0001, + "loss": 5.7464, + "loss/crossentropy": 2.60485577583313, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16767163574695587, + "step": 22130 + }, + { + "epoch": 0.691625, + "grad_norm": 3.203125, + "grad_norm_var": 0.0484771728515625, + "learning_rate": 0.0001, + "loss": 5.4953, + "loss/crossentropy": 2.4983582496643066, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15828609466552734, + "step": 22132 + }, + { + "epoch": 0.6916875, + "grad_norm": 3.328125, + "grad_norm_var": 0.044596354166666664, + "learning_rate": 0.0001, + "loss": 6.045, + "loss/crossentropy": 2.789808750152588, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17512879520654678, + "step": 22134 + }, + { + "epoch": 0.69175, + "grad_norm": 3.109375, + "grad_norm_var": 0.04453125, + "learning_rate": 0.0001, + "loss": 5.9074, + "loss/crossentropy": 2.6833715438842773, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.16850131005048752, + "step": 22136 + }, + { + "epoch": 0.6918125, + "grad_norm": 3.4375, + "grad_norm_var": 0.04853108723958333, + "learning_rate": 0.0001, + "loss": 5.8736, + "loss/crossentropy": 2.608944535255432, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17607013881206512, + "step": 22138 + }, + { + "epoch": 0.691875, + "grad_norm": 3.078125, + "grad_norm_var": 0.0348785400390625, + "learning_rate": 0.0001, + "loss": 5.8523, + "loss/crossentropy": 2.6538909673690796, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.170235276222229, + "step": 22140 + }, + { + "epoch": 0.6919375, + "grad_norm": 3.125, + "grad_norm_var": 0.0294830322265625, + "learning_rate": 0.0001, + "loss": 5.8493, + "loss/crossentropy": 2.710782289505005, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1681513860821724, + "step": 22142 + }, + { + "epoch": 0.692, + "grad_norm": 3.515625, + "grad_norm_var": 0.03876546223958333, + "learning_rate": 0.0001, + "loss": 5.8845, + "loss/crossentropy": 2.6459481716156006, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17424651235342026, + "step": 22144 + }, + { + "epoch": 0.6920625, + "grad_norm": 3.328125, + "grad_norm_var": 0.03941650390625, + "learning_rate": 0.0001, + "loss": 5.6237, + "loss/crossentropy": 2.524052619934082, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1646539568901062, + "step": 22146 + }, + { + "epoch": 0.692125, + "grad_norm": 3.296875, + "grad_norm_var": 0.025, + "learning_rate": 0.0001, + "loss": 5.6005, + "loss/crossentropy": 2.4987930059432983, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16329386830329895, + "step": 22148 + }, + { + "epoch": 0.6921875, + "grad_norm": 3.75, + "grad_norm_var": 0.0445709228515625, + "learning_rate": 0.0001, + "loss": 6.3365, + "loss/crossentropy": 2.899762511253357, + "loss/hidden": 1.58203125, + "loss/jsd": 0.0, + "loss/logits": 0.1854744702577591, + "step": 22150 + }, + { + "epoch": 0.69225, + "grad_norm": 3.53125, + "grad_norm_var": 0.0510406494140625, + "learning_rate": 0.0001, + "loss": 5.7907, + "loss/crossentropy": 2.5561044216156006, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1765880137681961, + "step": 22152 + }, + { + "epoch": 0.6923125, + "grad_norm": 3.03125, + "grad_norm_var": 0.051366170247395836, + "learning_rate": 0.0001, + "loss": 5.5862, + "loss/crossentropy": 2.501978039741516, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15959802269935608, + "step": 22154 + }, + { + "epoch": 0.692375, + "grad_norm": 2.921875, + "grad_norm_var": 0.058919270833333336, + "learning_rate": 0.0001, + "loss": 5.5544, + "loss/crossentropy": 2.509579300880432, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15916655957698822, + "step": 22156 + }, + { + "epoch": 0.6924375, + "grad_norm": 3.671875, + "grad_norm_var": 0.066552734375, + "learning_rate": 0.0001, + "loss": 5.6933, + "loss/crossentropy": 2.5143656730651855, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1721944585442543, + "step": 22158 + }, + { + "epoch": 0.6925, + "grad_norm": 2.875, + "grad_norm_var": 0.07075093587239584, + "learning_rate": 0.0001, + "loss": 5.6441, + "loss/crossentropy": 2.6157257556915283, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16064903140068054, + "step": 22160 + }, + { + "epoch": 0.6925625, + "grad_norm": 3.265625, + "grad_norm_var": 0.07592671712239583, + "learning_rate": 0.0001, + "loss": 5.712, + "loss/crossentropy": 2.6100316047668457, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16722562909126282, + "step": 22162 + }, + { + "epoch": 0.692625, + "grad_norm": 2.96875, + "grad_norm_var": 0.07788798014322916, + "learning_rate": 0.0001, + "loss": 5.708, + "loss/crossentropy": 2.6058512926101685, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16607257723808289, + "step": 22164 + }, + { + "epoch": 0.6926875, + "grad_norm": 3.1875, + "grad_norm_var": 0.05210673014322917, + "learning_rate": 0.0001, + "loss": 5.8378, + "loss/crossentropy": 2.6714993715286255, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16858133673667908, + "step": 22166 + }, + { + "epoch": 0.69275, + "grad_norm": 3.1875, + "grad_norm_var": 0.040283203125, + "learning_rate": 0.0001, + "loss": 5.7096, + "loss/crossentropy": 2.5298666954040527, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16875465214252472, + "step": 22168 + }, + { + "epoch": 0.6928125, + "grad_norm": 3.0625, + "grad_norm_var": 0.04036356608072917, + "learning_rate": 0.0001, + "loss": 5.5792, + "loss/crossentropy": 2.5069416761398315, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16112709790468216, + "step": 22170 + }, + { + "epoch": 0.692875, + "grad_norm": 3.15625, + "grad_norm_var": 0.03577067057291667, + "learning_rate": 0.0001, + "loss": 5.5173, + "loss/crossentropy": 2.427741289138794, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16403573751449585, + "step": 22172 + }, + { + "epoch": 0.6929375, + "grad_norm": 2.9375, + "grad_norm_var": 0.02047119140625, + "learning_rate": 0.0001, + "loss": 5.5594, + "loss/crossentropy": 2.544328212738037, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15697290748357773, + "step": 22174 + }, + { + "epoch": 0.693, + "grad_norm": 3.15625, + "grad_norm_var": 0.025472005208333332, + "learning_rate": 0.0001, + "loss": 5.3211, + "loss/crossentropy": 2.3427733778953552, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15017352998256683, + "step": 22176 + }, + { + "epoch": 0.6930625, + "grad_norm": 3.25, + "grad_norm_var": 0.022614542643229166, + "learning_rate": 0.0001, + "loss": 5.8383, + "loss/crossentropy": 2.668258547782898, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16817117482423782, + "step": 22178 + }, + { + "epoch": 0.693125, + "grad_norm": 3.21875, + "grad_norm_var": 0.026416015625, + "learning_rate": 0.0001, + "loss": 6.0641, + "loss/crossentropy": 2.8303595781326294, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17259375751018524, + "step": 22180 + }, + { + "epoch": 0.6931875, + "grad_norm": 3.234375, + "grad_norm_var": 0.027372233072916665, + "learning_rate": 0.0001, + "loss": 5.6434, + "loss/crossentropy": 2.4964150190353394, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1670391485095024, + "step": 22182 + }, + { + "epoch": 0.69325, + "grad_norm": 2.96875, + "grad_norm_var": 0.03242899576822917, + "learning_rate": 0.0001, + "loss": 6.0384, + "loss/crossentropy": 2.809204339981079, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17331138253211975, + "step": 22184 + }, + { + "epoch": 0.6933125, + "grad_norm": 2.9375, + "grad_norm_var": 0.03474833170572917, + "learning_rate": 0.0001, + "loss": 5.2075, + "loss/crossentropy": 2.2899743914604187, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1503501981496811, + "step": 22186 + }, + { + "epoch": 0.693375, + "grad_norm": 3.109375, + "grad_norm_var": 0.0342681884765625, + "learning_rate": 0.0001, + "loss": 5.4312, + "loss/crossentropy": 2.44270658493042, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15627416968345642, + "step": 22188 + }, + { + "epoch": 0.6934375, + "grad_norm": 2.84375, + "grad_norm_var": 0.0326171875, + "learning_rate": 0.0001, + "loss": 5.389, + "loss/crossentropy": 2.4759762287139893, + "loss/hidden": 1.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.15184426307678223, + "step": 22190 + }, + { + "epoch": 0.6935, + "grad_norm": 3.140625, + "grad_norm_var": 0.022777303059895834, + "learning_rate": 0.0001, + "loss": 5.5177, + "loss/crossentropy": 2.4368534088134766, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16121289879083633, + "step": 22192 + }, + { + "epoch": 0.6935625, + "grad_norm": 2.953125, + "grad_norm_var": 0.0743072509765625, + "learning_rate": 0.0001, + "loss": 5.3237, + "loss/crossentropy": 2.30250883102417, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15368013083934784, + "step": 22194 + }, + { + "epoch": 0.693625, + "grad_norm": 3.234375, + "grad_norm_var": 0.07288411458333334, + "learning_rate": 0.0001, + "loss": 5.8712, + "loss/crossentropy": 2.6539297103881836, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1736777201294899, + "step": 22196 + }, + { + "epoch": 0.6936875, + "grad_norm": 3.0625, + "grad_norm_var": 0.07444559733072917, + "learning_rate": 0.0001, + "loss": 5.6884, + "loss/crossentropy": 2.590583086013794, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16173090040683746, + "step": 22198 + }, + { + "epoch": 0.69375, + "grad_norm": 3.21875, + "grad_norm_var": 0.06975809733072917, + "learning_rate": 0.0001, + "loss": 5.7269, + "loss/crossentropy": 2.5890592336654663, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1669059693813324, + "step": 22200 + }, + { + "epoch": 0.6938125, + "grad_norm": 3.390625, + "grad_norm_var": 0.0677154541015625, + "learning_rate": 0.0001, + "loss": 5.5632, + "loss/crossentropy": 2.403108596801758, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.17108941823244095, + "step": 22202 + }, + { + "epoch": 0.693875, + "grad_norm": 3.28125, + "grad_norm_var": 0.0819976806640625, + "learning_rate": 0.0001, + "loss": 5.7487, + "loss/crossentropy": 2.54872989654541, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16961067914962769, + "step": 22204 + }, + { + "epoch": 0.6939375, + "grad_norm": 2.859375, + "grad_norm_var": 0.08206380208333333, + "learning_rate": 0.0001, + "loss": 5.6329, + "loss/crossentropy": 2.557260513305664, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1626439094543457, + "step": 22206 + }, + { + "epoch": 0.694, + "grad_norm": 3.1875, + "grad_norm_var": 0.09238179524739583, + "learning_rate": 0.0001, + "loss": 5.9052, + "loss/crossentropy": 2.705229640007019, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1707761213183403, + "step": 22208 + }, + { + "epoch": 0.6940625, + "grad_norm": 3.046875, + "grad_norm_var": 0.0510650634765625, + "learning_rate": 0.0001, + "loss": 5.6836, + "loss/crossentropy": 2.6229788064956665, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.16465359181165695, + "step": 22210 + }, + { + "epoch": 0.694125, + "grad_norm": 3.28125, + "grad_norm_var": 0.052079264322916666, + "learning_rate": 0.0001, + "loss": 5.9075, + "loss/crossentropy": 2.6422927379608154, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1757410168647766, + "step": 22212 + }, + { + "epoch": 0.6941875, + "grad_norm": 3.78125, + "grad_norm_var": 0.070703125, + "learning_rate": 0.0001, + "loss": 5.4121, + "loss/crossentropy": 2.340365767478943, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1583406627178192, + "step": 22214 + }, + { + "epoch": 0.69425, + "grad_norm": 2.953125, + "grad_norm_var": 0.07559305826822917, + "learning_rate": 0.0001, + "loss": 5.659, + "loss/crossentropy": 2.5273908376693726, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1635541394352913, + "step": 22216 + }, + { + "epoch": 0.6943125, + "grad_norm": 3.34375, + "grad_norm_var": 0.07333882649739583, + "learning_rate": 0.0001, + "loss": 5.8834, + "loss/crossentropy": 2.6678963899612427, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17428787797689438, + "step": 22218 + }, + { + "epoch": 0.694375, + "grad_norm": 3.015625, + "grad_norm_var": 0.06564127604166667, + "learning_rate": 0.0001, + "loss": 5.7687, + "loss/crossentropy": 2.62005078792572, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1683787852525711, + "step": 22220 + }, + { + "epoch": 0.6944375, + "grad_norm": 3.265625, + "grad_norm_var": 0.05554911295572917, + "learning_rate": 0.0001, + "loss": 5.733, + "loss/crossentropy": 2.633288860321045, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1658271849155426, + "step": 22222 + }, + { + "epoch": 0.6945, + "grad_norm": 3.171875, + "grad_norm_var": 0.06530659993489583, + "learning_rate": 0.0001, + "loss": 4.906, + "loss/crossentropy": 2.101522386074066, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.1406061053276062, + "step": 22224 + }, + { + "epoch": 0.6945625, + "grad_norm": 2.890625, + "grad_norm_var": 0.06741536458333333, + "learning_rate": 0.0001, + "loss": 5.7833, + "loss/crossentropy": 2.6757954359054565, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16700387001037598, + "step": 22226 + }, + { + "epoch": 0.694625, + "grad_norm": 3.171875, + "grad_norm_var": 0.07576395670572916, + "learning_rate": 0.0001, + "loss": 5.7715, + "loss/crossentropy": 2.6445634365081787, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16426067054271698, + "step": 22228 + }, + { + "epoch": 0.6946875, + "grad_norm": 3.34375, + "grad_norm_var": 0.05032145182291667, + "learning_rate": 0.0001, + "loss": 5.9006, + "loss/crossentropy": 2.7169244289398193, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1683690845966339, + "step": 22230 + }, + { + "epoch": 0.69475, + "grad_norm": 2.984375, + "grad_norm_var": 0.047272745768229166, + "learning_rate": 0.0001, + "loss": 5.7788, + "loss/crossentropy": 2.602481245994568, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16997309029102325, + "step": 22232 + }, + { + "epoch": 0.6948125, + "grad_norm": 3.296875, + "grad_norm_var": 0.045003255208333336, + "learning_rate": 0.0001, + "loss": 5.9348, + "loss/crossentropy": 2.7773529291152954, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16418687999248505, + "step": 22234 + }, + { + "epoch": 0.694875, + "grad_norm": 3.03125, + "grad_norm_var": 0.04049479166666667, + "learning_rate": 0.0001, + "loss": 5.7328, + "loss/crossentropy": 2.6272052526474, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16798030585050583, + "step": 22236 + }, + { + "epoch": 0.6949375, + "grad_norm": 3.078125, + "grad_norm_var": 0.03769124348958333, + "learning_rate": 0.0001, + "loss": 5.8072, + "loss/crossentropy": 2.632036566734314, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16985852271318436, + "step": 22238 + }, + { + "epoch": 0.695, + "grad_norm": 3.1875, + "grad_norm_var": 0.02760009765625, + "learning_rate": 0.0001, + "loss": 5.7408, + "loss/crossentropy": 2.665483832359314, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16300511360168457, + "step": 22240 + }, + { + "epoch": 0.6950625, + "grad_norm": 3.265625, + "grad_norm_var": 0.02603759765625, + "learning_rate": 0.0001, + "loss": 5.6665, + "loss/crossentropy": 2.5961122512817383, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15977414697408676, + "step": 22242 + }, + { + "epoch": 0.695125, + "grad_norm": 3.203125, + "grad_norm_var": 0.020238240559895832, + "learning_rate": 0.0001, + "loss": 5.5223, + "loss/crossentropy": 2.467007637023926, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16255847364664078, + "step": 22244 + }, + { + "epoch": 0.6951875, + "grad_norm": 3.15625, + "grad_norm_var": 0.018257649739583333, + "learning_rate": 0.0001, + "loss": 5.5286, + "loss/crossentropy": 2.427953839302063, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16397497057914734, + "step": 22246 + }, + { + "epoch": 0.69525, + "grad_norm": 3.578125, + "grad_norm_var": 0.03287353515625, + "learning_rate": 0.0001, + "loss": 5.6832, + "loss/crossentropy": 2.5346686840057373, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.17071057856082916, + "step": 22248 + }, + { + "epoch": 0.6953125, + "grad_norm": 2.8125, + "grad_norm_var": 0.03791402180989583, + "learning_rate": 0.0001, + "loss": 5.6754, + "loss/crossentropy": 2.630379319190979, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1607513353228569, + "step": 22250 + }, + { + "epoch": 0.695375, + "grad_norm": 2.96875, + "grad_norm_var": 0.0409088134765625, + "learning_rate": 0.0001, + "loss": 5.7436, + "loss/crossentropy": 2.6902812719345093, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16002224385738373, + "step": 22252 + }, + { + "epoch": 0.6954375, + "grad_norm": 3.0625, + "grad_norm_var": 0.04243062337239583, + "learning_rate": 0.0001, + "loss": 5.5621, + "loss/crossentropy": 2.513045907020569, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1595899686217308, + "step": 22254 + }, + { + "epoch": 0.6955, + "grad_norm": 2.875, + "grad_norm_var": 0.04206441243489583, + "learning_rate": 0.0001, + "loss": 5.7661, + "loss/crossentropy": 2.732425332069397, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15765976905822754, + "step": 22256 + }, + { + "epoch": 0.6955625, + "grad_norm": 3.109375, + "grad_norm_var": 0.039713541666666664, + "learning_rate": 0.0001, + "loss": 5.5049, + "loss/crossentropy": 2.5390454530715942, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1536208540201187, + "step": 22258 + }, + { + "epoch": 0.695625, + "grad_norm": 2.828125, + "grad_norm_var": 0.05579020182291667, + "learning_rate": 0.0001, + "loss": 5.8636, + "loss/crossentropy": 2.620804190635681, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1742827668786049, + "step": 22260 + }, + { + "epoch": 0.6956875, + "grad_norm": 2.96875, + "grad_norm_var": 0.053278605143229164, + "learning_rate": 0.0001, + "loss": 5.7285, + "loss/crossentropy": 2.6134908199310303, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16775041818618774, + "step": 22262 + }, + { + "epoch": 0.69575, + "grad_norm": 3.265625, + "grad_norm_var": 0.036767578125, + "learning_rate": 0.0001, + "loss": 5.5856, + "loss/crossentropy": 2.5176788568496704, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1622568964958191, + "step": 22264 + }, + { + "epoch": 0.6958125, + "grad_norm": 3.046875, + "grad_norm_var": 0.03386128743489583, + "learning_rate": 0.0001, + "loss": 5.5092, + "loss/crossentropy": 2.406510591506958, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1665196791291237, + "step": 22266 + }, + { + "epoch": 0.695875, + "grad_norm": 3.078125, + "grad_norm_var": 0.03135477701822917, + "learning_rate": 0.0001, + "loss": 5.7848, + "loss/crossentropy": 2.6489561796188354, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16631732136011124, + "step": 22268 + }, + { + "epoch": 0.6959375, + "grad_norm": 3.25, + "grad_norm_var": 0.0601226806640625, + "learning_rate": 0.0001, + "loss": 5.8294, + "loss/crossentropy": 2.568448781967163, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1729675531387329, + "step": 22270 + }, + { + "epoch": 0.696, + "grad_norm": 2.953125, + "grad_norm_var": 0.0565338134765625, + "learning_rate": 0.0001, + "loss": 5.6264, + "loss/crossentropy": 2.542552947998047, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16502072662115097, + "step": 22272 + }, + { + "epoch": 0.6960625, + "grad_norm": 3.046875, + "grad_norm_var": 0.05479227701822917, + "learning_rate": 0.0001, + "loss": 5.6426, + "loss/crossentropy": 2.531023144721985, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16467513889074326, + "step": 22274 + }, + { + "epoch": 0.696125, + "grad_norm": 3.234375, + "grad_norm_var": 0.0438873291015625, + "learning_rate": 0.0001, + "loss": 5.811, + "loss/crossentropy": 2.658156991004944, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16763115674257278, + "step": 22276 + }, + { + "epoch": 0.6961875, + "grad_norm": 2.953125, + "grad_norm_var": 0.0424957275390625, + "learning_rate": 0.0001, + "loss": 5.6239, + "loss/crossentropy": 2.5519994497299194, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16266239434480667, + "step": 22278 + }, + { + "epoch": 0.69625, + "grad_norm": 2.984375, + "grad_norm_var": 0.03970947265625, + "learning_rate": 0.0001, + "loss": 5.5622, + "loss/crossentropy": 2.445394515991211, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16558968275785446, + "step": 22280 + }, + { + "epoch": 0.6963125, + "grad_norm": 3.109375, + "grad_norm_var": 0.039388020833333336, + "learning_rate": 0.0001, + "loss": 5.5865, + "loss/crossentropy": 2.5204278230667114, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16129783540964127, + "step": 22282 + }, + { + "epoch": 0.696375, + "grad_norm": 3.421875, + "grad_norm_var": 0.04601236979166667, + "learning_rate": 0.0001, + "loss": 5.9431, + "loss/crossentropy": 2.6833596229553223, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17479926347732544, + "step": 22284 + }, + { + "epoch": 0.6964375, + "grad_norm": 3.046875, + "grad_norm_var": 0.023270670572916666, + "learning_rate": 0.0001, + "loss": 5.8238, + "loss/crossentropy": 2.56610107421875, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17459864169359207, + "step": 22286 + }, + { + "epoch": 0.6965, + "grad_norm": 3.046875, + "grad_norm_var": 0.022899373372395834, + "learning_rate": 0.0001, + "loss": 5.546, + "loss/crossentropy": 2.441725015640259, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.15964962542057037, + "step": 22288 + }, + { + "epoch": 0.6965625, + "grad_norm": 2.984375, + "grad_norm_var": 0.025972493489583335, + "learning_rate": 0.0001, + "loss": 5.5882, + "loss/crossentropy": 2.5388636589050293, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15884168446063995, + "step": 22290 + }, + { + "epoch": 0.696625, + "grad_norm": 3.125, + "grad_norm_var": 0.021418253580729168, + "learning_rate": 0.0001, + "loss": 5.4641, + "loss/crossentropy": 2.4065194129943848, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15927237272262573, + "step": 22292 + }, + { + "epoch": 0.6966875, + "grad_norm": 3.140625, + "grad_norm_var": 0.019254557291666665, + "learning_rate": 0.0001, + "loss": 5.7093, + "loss/crossentropy": 2.559041976928711, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1673680618405342, + "step": 22294 + }, + { + "epoch": 0.69675, + "grad_norm": 3.09375, + "grad_norm_var": 0.018700154622395833, + "learning_rate": 0.0001, + "loss": 5.5445, + "loss/crossentropy": 2.5248336791992188, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15626318752765656, + "step": 22296 + }, + { + "epoch": 0.6968125, + "grad_norm": 3.046875, + "grad_norm_var": 0.019391886393229165, + "learning_rate": 0.0001, + "loss": 5.7617, + "loss/crossentropy": 2.5865273475646973, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17025582492351532, + "step": 22298 + }, + { + "epoch": 0.696875, + "grad_norm": 2.953125, + "grad_norm_var": 0.0148834228515625, + "learning_rate": 0.0001, + "loss": 5.693, + "loss/crossentropy": 2.6101256608963013, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1633646935224533, + "step": 22300 + }, + { + "epoch": 0.6969375, + "grad_norm": 3.15625, + "grad_norm_var": 0.01259765625, + "learning_rate": 0.0001, + "loss": 5.7081, + "loss/crossentropy": 2.6090636253356934, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16380954533815384, + "step": 22302 + }, + { + "epoch": 0.697, + "grad_norm": 4.28125, + "grad_norm_var": 0.10497945149739583, + "learning_rate": 0.0001, + "loss": 5.4828, + "loss/crossentropy": 2.45250141620636, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15810998529195786, + "step": 22304 + }, + { + "epoch": 0.6970625, + "grad_norm": 3.359375, + "grad_norm_var": 0.10263264973958333, + "learning_rate": 0.0001, + "loss": 6.1271, + "loss/crossentropy": 2.794241786003113, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18016577512025833, + "step": 22306 + }, + { + "epoch": 0.697125, + "grad_norm": 3.0625, + "grad_norm_var": 0.10592447916666667, + "learning_rate": 0.0001, + "loss": 5.6457, + "loss/crossentropy": 2.5611395835876465, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.165873683989048, + "step": 22308 + }, + { + "epoch": 0.6971875, + "grad_norm": 3.375, + "grad_norm_var": 0.11043192545572916, + "learning_rate": 0.0001, + "loss": 5.52, + "loss/crossentropy": 2.4050599336624146, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1650119423866272, + "step": 22310 + }, + { + "epoch": 0.69725, + "grad_norm": 3.109375, + "grad_norm_var": 0.11057535807291667, + "learning_rate": 0.0001, + "loss": 5.7917, + "loss/crossentropy": 2.6355658769607544, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16874302178621292, + "step": 22312 + }, + { + "epoch": 0.6973125, + "grad_norm": 3.21875, + "grad_norm_var": 0.1091461181640625, + "learning_rate": 0.0001, + "loss": 5.6542, + "loss/crossentropy": 2.6047236919403076, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16197961568832397, + "step": 22314 + }, + { + "epoch": 0.697375, + "grad_norm": 2.984375, + "grad_norm_var": 0.10572509765625, + "learning_rate": 0.0001, + "loss": 5.908, + "loss/crossentropy": 2.7273367643356323, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17080094665288925, + "step": 22316 + }, + { + "epoch": 0.6974375, + "grad_norm": 3.296875, + "grad_norm_var": 0.10445556640625, + "learning_rate": 0.0001, + "loss": 5.5467, + "loss/crossentropy": 2.4545209407806396, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16430046409368515, + "step": 22318 + }, + { + "epoch": 0.6975, + "grad_norm": 2.984375, + "grad_norm_var": 0.0266998291015625, + "learning_rate": 0.0001, + "loss": 5.8954, + "loss/crossentropy": 2.7398087978363037, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16790051758289337, + "step": 22320 + }, + { + "epoch": 0.6975625, + "grad_norm": 2.984375, + "grad_norm_var": 0.024507649739583335, + "learning_rate": 0.0001, + "loss": 5.4263, + "loss/crossentropy": 2.391477584838867, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15817295014858246, + "step": 22322 + }, + { + "epoch": 0.697625, + "grad_norm": 3.421875, + "grad_norm_var": 0.028815714518229167, + "learning_rate": 0.0001, + "loss": 6.1122, + "loss/crossentropy": 2.827970504760742, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17920669168233871, + "step": 22324 + }, + { + "epoch": 0.6976875, + "grad_norm": 3.125, + "grad_norm_var": 0.0246490478515625, + "learning_rate": 0.0001, + "loss": 5.5779, + "loss/crossentropy": 2.504065990447998, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1644120216369629, + "step": 22326 + }, + { + "epoch": 0.69775, + "grad_norm": 2.9375, + "grad_norm_var": 0.027684529622395832, + "learning_rate": 0.0001, + "loss": 5.3827, + "loss/crossentropy": 2.36915385723114, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15369880199432373, + "step": 22328 + }, + { + "epoch": 0.6978125, + "grad_norm": 3.296875, + "grad_norm_var": 0.029866536458333332, + "learning_rate": 0.0001, + "loss": 5.7214, + "loss/crossentropy": 2.5773812532424927, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16830333322286606, + "step": 22330 + }, + { + "epoch": 0.697875, + "grad_norm": 3.25, + "grad_norm_var": 0.028413899739583335, + "learning_rate": 0.0001, + "loss": 5.3243, + "loss/crossentropy": 2.2541534900665283, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1581818237900734, + "step": 22332 + }, + { + "epoch": 0.6979375, + "grad_norm": 2.9375, + "grad_norm_var": 0.0273590087890625, + "learning_rate": 0.0001, + "loss": 5.9036, + "loss/crossentropy": 2.73256778717041, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17023073136806488, + "step": 22334 + }, + { + "epoch": 0.698, + "grad_norm": 2.8125, + "grad_norm_var": 0.0284332275390625, + "learning_rate": 0.0001, + "loss": 5.5798, + "loss/crossentropy": 2.5840107202529907, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15582574903964996, + "step": 22336 + }, + { + "epoch": 0.6980625, + "grad_norm": 2.984375, + "grad_norm_var": 0.0275543212890625, + "learning_rate": 0.0001, + "loss": 5.7119, + "loss/crossentropy": 2.5801466703414917, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16668801009655, + "step": 22338 + }, + { + "epoch": 0.698125, + "grad_norm": 2.984375, + "grad_norm_var": 0.025853474934895832, + "learning_rate": 0.0001, + "loss": 5.2978, + "loss/crossentropy": 2.357309341430664, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1526389792561531, + "step": 22340 + }, + { + "epoch": 0.6981875, + "grad_norm": 3.1875, + "grad_norm_var": 0.025169881184895833, + "learning_rate": 0.0001, + "loss": 5.788, + "loss/crossentropy": 2.6856919527053833, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16569886356592178, + "step": 22342 + }, + { + "epoch": 0.69825, + "grad_norm": 3.015625, + "grad_norm_var": 0.030159505208333333, + "learning_rate": 0.0001, + "loss": 5.8538, + "loss/crossentropy": 2.76581346988678, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16075499355793, + "step": 22344 + }, + { + "epoch": 0.6983125, + "grad_norm": 3.34375, + "grad_norm_var": 0.031538899739583334, + "learning_rate": 0.0001, + "loss": 5.5739, + "loss/crossentropy": 2.4678162336349487, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16412169486284256, + "step": 22346 + }, + { + "epoch": 0.698375, + "grad_norm": 3.171875, + "grad_norm_var": 0.029963175455729168, + "learning_rate": 0.0001, + "loss": 5.3019, + "loss/crossentropy": 2.34357488155365, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1497362107038498, + "step": 22348 + }, + { + "epoch": 0.6984375, + "grad_norm": 7.46875, + "grad_norm_var": 1.2452056884765625, + "learning_rate": 0.0001, + "loss": 5.8482, + "loss/crossentropy": 2.4086620807647705, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.19160857051610947, + "step": 22350 + }, + { + "epoch": 0.6985, + "grad_norm": 3.46875, + "grad_norm_var": 1.22056884765625, + "learning_rate": 0.0001, + "loss": 6.0269, + "loss/crossentropy": 2.7019847631454468, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.18015021085739136, + "step": 22352 + }, + { + "epoch": 0.6985625, + "grad_norm": 3.234375, + "grad_norm_var": 1.2114898681640625, + "learning_rate": 0.0001, + "loss": 5.9025, + "loss/crossentropy": 2.6564154624938965, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17577841877937317, + "step": 22354 + }, + { + "epoch": 0.698625, + "grad_norm": 3.15625, + "grad_norm_var": 1.1758097330729167, + "learning_rate": 0.0001, + "loss": 5.9447, + "loss/crossentropy": 2.748743772506714, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16881877928972244, + "step": 22356 + }, + { + "epoch": 0.6986875, + "grad_norm": 3.09375, + "grad_norm_var": 1.162555948893229, + "learning_rate": 0.0001, + "loss": 5.8294, + "loss/crossentropy": 2.6740111112594604, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16866113990545273, + "step": 22358 + }, + { + "epoch": 0.69875, + "grad_norm": 2.859375, + "grad_norm_var": 1.172874959309896, + "learning_rate": 0.0001, + "loss": 5.6562, + "loss/crossentropy": 2.5262296199798584, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.164167582988739, + "step": 22360 + }, + { + "epoch": 0.6988125, + "grad_norm": 2.859375, + "grad_norm_var": 1.195563761393229, + "learning_rate": 0.0001, + "loss": 5.4895, + "loss/crossentropy": 2.496787905693054, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15590853989124298, + "step": 22362 + }, + { + "epoch": 0.698875, + "grad_norm": 3.125, + "grad_norm_var": 1.1972941080729167, + "learning_rate": 0.0001, + "loss": 5.6964, + "loss/crossentropy": 2.6194320917129517, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16394924372434616, + "step": 22364 + }, + { + "epoch": 0.6989375, + "grad_norm": 3.125, + "grad_norm_var": 0.032124837239583336, + "learning_rate": 0.0001, + "loss": 5.9581, + "loss/crossentropy": 2.741639733314514, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17593914270401, + "step": 22366 + }, + { + "epoch": 0.699, + "grad_norm": 3.109375, + "grad_norm_var": 0.22603759765625, + "learning_rate": 0.0001, + "loss": 5.4965, + "loss/crossentropy": 2.397615075111389, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.15715742111206055, + "step": 22368 + }, + { + "epoch": 0.6990625, + "grad_norm": 3.25, + "grad_norm_var": 0.2276763916015625, + "learning_rate": 0.0001, + "loss": 5.6063, + "loss/crossentropy": 2.5021533966064453, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16432107985019684, + "step": 22370 + }, + { + "epoch": 0.699125, + "grad_norm": 2.890625, + "grad_norm_var": 0.2720855712890625, + "learning_rate": 0.0001, + "loss": 5.752, + "loss/crossentropy": 2.6542662382125854, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16407115757465363, + "step": 22372 + }, + { + "epoch": 0.6991875, + "grad_norm": 2.984375, + "grad_norm_var": 0.27852274576822916, + "learning_rate": 0.0001, + "loss": 5.629, + "loss/crossentropy": 2.462101697921753, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1702076569199562, + "step": 22374 + }, + { + "epoch": 0.69925, + "grad_norm": 3.0625, + "grad_norm_var": 0.269140625, + "learning_rate": 0.0001, + "loss": 5.3237, + "loss/crossentropy": 2.40639591217041, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.15188443660736084, + "step": 22376 + }, + { + "epoch": 0.6993125, + "grad_norm": 3.234375, + "grad_norm_var": 0.2586090087890625, + "learning_rate": 0.0001, + "loss": 5.6065, + "loss/crossentropy": 2.5098297595977783, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1616167202591896, + "step": 22378 + }, + { + "epoch": 0.699375, + "grad_norm": 3.15625, + "grad_norm_var": 0.2508941650390625, + "learning_rate": 0.0001, + "loss": 5.6762, + "loss/crossentropy": 2.5527048110961914, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1674271747469902, + "step": 22380 + }, + { + "epoch": 0.6994375, + "grad_norm": 3.265625, + "grad_norm_var": 0.2538726806640625, + "learning_rate": 0.0001, + "loss": 5.3082, + "loss/crossentropy": 2.306580424308777, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1509408950805664, + "step": 22382 + }, + { + "epoch": 0.6995, + "grad_norm": 3.421875, + "grad_norm_var": 0.06692708333333333, + "learning_rate": 0.0001, + "loss": 5.734, + "loss/crossentropy": 2.5101386308670044, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17277875542640686, + "step": 22384 + }, + { + "epoch": 0.6995625, + "grad_norm": 3.015625, + "grad_norm_var": 0.07023824055989583, + "learning_rate": 0.0001, + "loss": 5.6629, + "loss/crossentropy": 2.5359352827072144, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.162301167845726, + "step": 22386 + }, + { + "epoch": 0.699625, + "grad_norm": 3.015625, + "grad_norm_var": 0.020873006184895834, + "learning_rate": 0.0001, + "loss": 5.9309, + "loss/crossentropy": 2.7789628505706787, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1679278016090393, + "step": 22388 + }, + { + "epoch": 0.6996875, + "grad_norm": 2.984375, + "grad_norm_var": 0.024214680989583334, + "learning_rate": 0.0001, + "loss": 5.6155, + "loss/crossentropy": 2.5941267013549805, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15995129197835922, + "step": 22390 + }, + { + "epoch": 0.69975, + "grad_norm": 3.265625, + "grad_norm_var": 0.0268707275390625, + "learning_rate": 0.0001, + "loss": 5.3302, + "loss/crossentropy": 2.297610640525818, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.15325400233268738, + "step": 22392 + }, + { + "epoch": 0.6998125, + "grad_norm": 3.0625, + "grad_norm_var": 0.026949055989583335, + "learning_rate": 0.0001, + "loss": 5.9984, + "loss/crossentropy": 2.773570418357849, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17170123010873795, + "step": 22394 + }, + { + "epoch": 0.699875, + "grad_norm": 3.125, + "grad_norm_var": 0.029548136393229167, + "learning_rate": 0.0001, + "loss": 5.5626, + "loss/crossentropy": 2.4833754301071167, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1633896306157112, + "step": 22396 + }, + { + "epoch": 0.6999375, + "grad_norm": 9.625, + "grad_norm_var": 2.6657623291015624, + "learning_rate": 0.0001, + "loss": 5.7952, + "loss/crossentropy": 2.604393482208252, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1683036834001541, + "step": 22398 + }, + { + "epoch": 0.7, + "grad_norm": 3.015625, + "grad_norm_var": 2.6793365478515625, + "learning_rate": 0.0001, + "loss": 5.9193, + "loss/crossentropy": 2.8233916759490967, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16271594911813736, + "step": 22400 + }, + { + "epoch": 0.7000625, + "grad_norm": 3.109375, + "grad_norm_var": 2.6681304931640626, + "learning_rate": 0.0001, + "loss": 5.5348, + "loss/crossentropy": 2.4189138412475586, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1678370013833046, + "step": 22402 + }, + { + "epoch": 0.700125, + "grad_norm": 3.09375, + "grad_norm_var": 2.6681304931640626, + "learning_rate": 0.0001, + "loss": 5.5848, + "loss/crossentropy": 2.53902804851532, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16239406913518906, + "step": 22404 + }, + { + "epoch": 0.7001875, + "grad_norm": 3.109375, + "grad_norm_var": 2.642625935872396, + "learning_rate": 0.0001, + "loss": 5.6684, + "loss/crossentropy": 2.503175377845764, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16886447370052338, + "step": 22406 + }, + { + "epoch": 0.70025, + "grad_norm": 2.875, + "grad_norm_var": 2.680101521809896, + "learning_rate": 0.0001, + "loss": 5.3362, + "loss/crossentropy": 2.391018033027649, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.14802981913089752, + "step": 22408 + }, + { + "epoch": 0.7003125, + "grad_norm": 3.296875, + "grad_norm_var": 2.6794230143229165, + "learning_rate": 0.0001, + "loss": 5.629, + "loss/crossentropy": 2.5314453840255737, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1624908298254013, + "step": 22410 + }, + { + "epoch": 0.700375, + "grad_norm": 2.9375, + "grad_norm_var": 2.6909464518229167, + "learning_rate": 0.0001, + "loss": 5.474, + "loss/crossentropy": 2.4601889848709106, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.1595887541770935, + "step": 22412 + }, + { + "epoch": 0.7004375, + "grad_norm": 3.09375, + "grad_norm_var": 0.010993448893229167, + "learning_rate": 0.0001, + "loss": 5.5297, + "loss/crossentropy": 2.3893154859542847, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1683400347828865, + "step": 22414 + }, + { + "epoch": 0.7005, + "grad_norm": 3.0625, + "grad_norm_var": 0.014387003580729167, + "learning_rate": 0.0001, + "loss": 5.9771, + "loss/crossentropy": 2.7257237434387207, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17396235466003418, + "step": 22416 + }, + { + "epoch": 0.7005625, + "grad_norm": 3.0, + "grad_norm_var": 0.015034993489583334, + "learning_rate": 0.0001, + "loss": 5.627, + "loss/crossentropy": 2.5427207946777344, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16233046352863312, + "step": 22418 + }, + { + "epoch": 0.700625, + "grad_norm": 3.1875, + "grad_norm_var": 0.015623982747395833, + "learning_rate": 0.0001, + "loss": 5.7423, + "loss/crossentropy": 2.5589810609817505, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16637687385082245, + "step": 22420 + }, + { + "epoch": 0.7006875, + "grad_norm": 2.96875, + "grad_norm_var": 0.017073567708333334, + "learning_rate": 0.0001, + "loss": 5.4075, + "loss/crossentropy": 2.446865677833557, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.15505224466323853, + "step": 22422 + }, + { + "epoch": 0.70075, + "grad_norm": 2.9375, + "grad_norm_var": 0.017039998372395834, + "learning_rate": 0.0001, + "loss": 5.6526, + "loss/crossentropy": 2.5972577333450317, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15748517215251923, + "step": 22424 + }, + { + "epoch": 0.7008125, + "grad_norm": 3.0625, + "grad_norm_var": 0.013179524739583334, + "learning_rate": 0.0001, + "loss": 5.575, + "loss/crossentropy": 2.4511340856552124, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16356197744607925, + "step": 22426 + }, + { + "epoch": 0.700875, + "grad_norm": 2.984375, + "grad_norm_var": 0.015501912434895833, + "learning_rate": 0.0001, + "loss": 5.5413, + "loss/crossentropy": 2.511778950691223, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15959448367357254, + "step": 22428 + }, + { + "epoch": 0.7009375, + "grad_norm": 3.03125, + "grad_norm_var": 0.015608723958333333, + "learning_rate": 0.0001, + "loss": 5.8105, + "loss/crossentropy": 2.707659602165222, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1669275015592575, + "step": 22430 + }, + { + "epoch": 0.701, + "grad_norm": 3.65625, + "grad_norm_var": 0.03448893229166667, + "learning_rate": 0.0001, + "loss": 5.8825, + "loss/crossentropy": 2.6640822887420654, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17496730387210846, + "step": 22432 + }, + { + "epoch": 0.7010625, + "grad_norm": 3.234375, + "grad_norm_var": 0.06253255208333333, + "learning_rate": 0.0001, + "loss": 5.8073, + "loss/crossentropy": 2.696197271347046, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1650131791830063, + "step": 22434 + }, + { + "epoch": 0.701125, + "grad_norm": 3.234375, + "grad_norm_var": 0.06267903645833334, + "learning_rate": 0.0001, + "loss": 5.815, + "loss/crossentropy": 2.612404942512512, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17299189418554306, + "step": 22436 + }, + { + "epoch": 0.7011875, + "grad_norm": 2.984375, + "grad_norm_var": 0.06051025390625, + "learning_rate": 0.0001, + "loss": 5.5862, + "loss/crossentropy": 2.5015735626220703, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16470873355865479, + "step": 22438 + }, + { + "epoch": 0.70125, + "grad_norm": 3.0, + "grad_norm_var": 0.058121744791666666, + "learning_rate": 0.0001, + "loss": 5.6935, + "loss/crossentropy": 2.5801044702529907, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16407185792922974, + "step": 22440 + }, + { + "epoch": 0.7013125, + "grad_norm": 3.078125, + "grad_norm_var": 0.07774149576822917, + "learning_rate": 0.0001, + "loss": 5.9704, + "loss/crossentropy": 2.6817033290863037, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1777021884918213, + "step": 22442 + }, + { + "epoch": 0.701375, + "grad_norm": 3.5, + "grad_norm_var": 0.06877848307291666, + "learning_rate": 0.0001, + "loss": 5.5508, + "loss/crossentropy": 2.5040571689605713, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16014103591442108, + "step": 22444 + }, + { + "epoch": 0.7014375, + "grad_norm": 3.234375, + "grad_norm_var": 0.06417643229166667, + "learning_rate": 0.0001, + "loss": 5.7439, + "loss/crossentropy": 2.5734212398529053, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16899675130844116, + "step": 22446 + }, + { + "epoch": 0.7015, + "grad_norm": 2.859375, + "grad_norm_var": 0.06323140462239583, + "learning_rate": 0.0001, + "loss": 5.7392, + "loss/crossentropy": 2.6356717348098755, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16348008811473846, + "step": 22448 + }, + { + "epoch": 0.7015625, + "grad_norm": 3.21875, + "grad_norm_var": 0.044189453125, + "learning_rate": 0.0001, + "loss": 5.6081, + "loss/crossentropy": 2.4915573596954346, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16478148102760315, + "step": 22450 + }, + { + "epoch": 0.701625, + "grad_norm": 3.046875, + "grad_norm_var": 0.0458648681640625, + "learning_rate": 0.0001, + "loss": 5.5288, + "loss/crossentropy": 2.4801501035690308, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15916209667921066, + "step": 22452 + }, + { + "epoch": 0.7016875, + "grad_norm": 2.890625, + "grad_norm_var": 0.0484375, + "learning_rate": 0.0001, + "loss": 5.6895, + "loss/crossentropy": 2.573846220970154, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16312579065561295, + "step": 22454 + }, + { + "epoch": 0.70175, + "grad_norm": 2.84375, + "grad_norm_var": 0.05082906087239583, + "learning_rate": 0.0001, + "loss": 5.5499, + "loss/crossentropy": 2.513450026512146, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.16263306885957718, + "step": 22456 + }, + { + "epoch": 0.7018125, + "grad_norm": 3.046875, + "grad_norm_var": 0.0286529541015625, + "learning_rate": 0.0001, + "loss": 5.8345, + "loss/crossentropy": 2.6493040323257446, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17281652241945267, + "step": 22458 + }, + { + "epoch": 0.701875, + "grad_norm": 3.09375, + "grad_norm_var": 0.019383748372395832, + "learning_rate": 0.0001, + "loss": 5.9764, + "loss/crossentropy": 2.7259128093719482, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17544399201869965, + "step": 22460 + }, + { + "epoch": 0.7019375, + "grad_norm": 3.03125, + "grad_norm_var": 0.019156901041666667, + "learning_rate": 0.0001, + "loss": 5.91, + "loss/crossentropy": 2.703311800956726, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17496934533119202, + "step": 22462 + }, + { + "epoch": 0.702, + "grad_norm": 3.09375, + "grad_norm_var": 0.023844401041666668, + "learning_rate": 0.0001, + "loss": 5.6215, + "loss/crossentropy": 2.57110595703125, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16128679364919662, + "step": 22464 + }, + { + "epoch": 0.7020625, + "grad_norm": 3.40625, + "grad_norm_var": 0.03418680826822917, + "learning_rate": 0.0001, + "loss": 5.9372, + "loss/crossentropy": 2.742884635925293, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17216527462005615, + "step": 22466 + }, + { + "epoch": 0.702125, + "grad_norm": 3.109375, + "grad_norm_var": 0.035542805989583336, + "learning_rate": 0.0001, + "loss": 5.5319, + "loss/crossentropy": 2.433596611022949, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16177868843078613, + "step": 22468 + }, + { + "epoch": 0.7021875, + "grad_norm": 3.1875, + "grad_norm_var": 0.0365875244140625, + "learning_rate": 0.0001, + "loss": 5.9063, + "loss/crossentropy": 2.8023040294647217, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16352402418851852, + "step": 22470 + }, + { + "epoch": 0.70225, + "grad_norm": 3.5, + "grad_norm_var": 0.043782552083333336, + "learning_rate": 0.0001, + "loss": 5.9647, + "loss/crossentropy": 2.6569935083389282, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17764723300933838, + "step": 22472 + }, + { + "epoch": 0.7023125, + "grad_norm": 2.859375, + "grad_norm_var": 0.0514801025390625, + "learning_rate": 0.0001, + "loss": 5.5043, + "loss/crossentropy": 2.5186818838119507, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1555919125676155, + "step": 22474 + }, + { + "epoch": 0.702375, + "grad_norm": 3.046875, + "grad_norm_var": 0.04978739420572917, + "learning_rate": 0.0001, + "loss": 5.7322, + "loss/crossentropy": 2.6452149152755737, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1633850336074829, + "step": 22476 + }, + { + "epoch": 0.7024375, + "grad_norm": 3.25, + "grad_norm_var": 0.048388671875, + "learning_rate": 0.0001, + "loss": 5.3693, + "loss/crossentropy": 2.3515241146087646, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1549069508910179, + "step": 22478 + }, + { + "epoch": 0.7025, + "grad_norm": 3.109375, + "grad_norm_var": 0.05031636555989583, + "learning_rate": 0.0001, + "loss": 5.9579, + "loss/crossentropy": 2.68467915058136, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17615192383527756, + "step": 22480 + }, + { + "epoch": 0.7025625, + "grad_norm": 2.8125, + "grad_norm_var": 0.0432769775390625, + "learning_rate": 0.0001, + "loss": 5.5088, + "loss/crossentropy": 2.465117931365967, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15866827964782715, + "step": 22482 + }, + { + "epoch": 0.702625, + "grad_norm": 3.296875, + "grad_norm_var": 0.04519856770833333, + "learning_rate": 0.0001, + "loss": 5.8039, + "loss/crossentropy": 2.592761993408203, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1722838431596756, + "step": 22484 + }, + { + "epoch": 0.7026875, + "grad_norm": 2.875, + "grad_norm_var": 0.058882649739583334, + "learning_rate": 0.0001, + "loss": 5.5698, + "loss/crossentropy": 2.515580892562866, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1577618569135666, + "step": 22486 + }, + { + "epoch": 0.70275, + "grad_norm": 3.34375, + "grad_norm_var": 0.05354715983072917, + "learning_rate": 0.0001, + "loss": 5.9011, + "loss/crossentropy": 2.8239874839782715, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1627890020608902, + "step": 22488 + }, + { + "epoch": 0.7028125, + "grad_norm": 3.28125, + "grad_norm_var": 0.046174112955729166, + "learning_rate": 0.0001, + "loss": 5.8753, + "loss/crossentropy": 2.7165865898132324, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.166265070438385, + "step": 22490 + }, + { + "epoch": 0.702875, + "grad_norm": 3.0, + "grad_norm_var": 0.04716796875, + "learning_rate": 0.0001, + "loss": 5.6852, + "loss/crossentropy": 2.6169623136520386, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16072610020637512, + "step": 22492 + }, + { + "epoch": 0.7029375, + "grad_norm": 3.078125, + "grad_norm_var": 0.06935933430989584, + "learning_rate": 0.0001, + "loss": 5.4155, + "loss/crossentropy": 2.2609105706214905, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1642833724617958, + "step": 22494 + }, + { + "epoch": 0.703, + "grad_norm": 3.25, + "grad_norm_var": 0.07418619791666667, + "learning_rate": 0.0001, + "loss": 5.6373, + "loss/crossentropy": 2.5342200994491577, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16460448503494263, + "step": 22496 + }, + { + "epoch": 0.7030625, + "grad_norm": 2.84375, + "grad_norm_var": 0.326904296875, + "learning_rate": 0.0001, + "loss": 5.7002, + "loss/crossentropy": 2.575178384780884, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16445918381214142, + "step": 22498 + }, + { + "epoch": 0.703125, + "grad_norm": 3.015625, + "grad_norm_var": 0.3385894775390625, + "learning_rate": 0.0001, + "loss": 5.6958, + "loss/crossentropy": 2.621804714202881, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16091208904981613, + "step": 22500 + }, + { + "epoch": 0.7031875, + "grad_norm": 2.9375, + "grad_norm_var": 0.34890950520833336, + "learning_rate": 0.0001, + "loss": 5.7022, + "loss/crossentropy": 2.5340776443481445, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1691538318991661, + "step": 22502 + }, + { + "epoch": 0.70325, + "grad_norm": 3.0, + "grad_norm_var": 0.34834696451822916, + "learning_rate": 0.0001, + "loss": 5.7394, + "loss/crossentropy": 2.6451767683029175, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16450387239456177, + "step": 22504 + }, + { + "epoch": 0.7033125, + "grad_norm": 3.34375, + "grad_norm_var": 0.3486328125, + "learning_rate": 0.0001, + "loss": 5.6377, + "loss/crossentropy": 2.47772753238678, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16834013909101486, + "step": 22506 + }, + { + "epoch": 0.703375, + "grad_norm": 2.859375, + "grad_norm_var": 0.36357320149739586, + "learning_rate": 0.0001, + "loss": 5.5959, + "loss/crossentropy": 2.5505971908569336, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15648002177476883, + "step": 22508 + }, + { + "epoch": 0.7034375, + "grad_norm": 3.796875, + "grad_norm_var": 0.38450520833333335, + "learning_rate": 0.0001, + "loss": 5.9153, + "loss/crossentropy": 2.672336220741272, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17273079603910446, + "step": 22510 + }, + { + "epoch": 0.7035, + "grad_norm": 3.0, + "grad_norm_var": 0.367822265625, + "learning_rate": 0.0001, + "loss": 5.4554, + "loss/crossentropy": 2.4188915491104126, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15833529829978943, + "step": 22512 + }, + { + "epoch": 0.7035625, + "grad_norm": 9.0625, + "grad_norm_var": 2.200902303059896, + "learning_rate": 0.0001, + "loss": 6.1889, + "loss/crossentropy": 2.603859305381775, + "loss/hidden": 1.57421875, + "loss/jsd": 0.0, + "loss/logits": 0.20108668506145477, + "step": 22514 + }, + { + "epoch": 0.703625, + "grad_norm": 3.515625, + "grad_norm_var": 2.1885080973307294, + "learning_rate": 0.0001, + "loss": 5.8481, + "loss/crossentropy": 2.6655837297439575, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16942200809717178, + "step": 22516 + }, + { + "epoch": 0.7036875, + "grad_norm": 3.078125, + "grad_norm_var": 2.178076171875, + "learning_rate": 0.0001, + "loss": 5.9334, + "loss/crossentropy": 2.6331652402877808, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1808079555630684, + "step": 22518 + }, + { + "epoch": 0.70375, + "grad_norm": 3.078125, + "grad_norm_var": 2.179638671875, + "learning_rate": 0.0001, + "loss": 5.8864, + "loss/crossentropy": 2.683835744857788, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1722119152545929, + "step": 22520 + }, + { + "epoch": 0.7038125, + "grad_norm": 3.296875, + "grad_norm_var": 2.2008046468098956, + "learning_rate": 0.0001, + "loss": 5.5452, + "loss/crossentropy": 2.4897459745407104, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16140858829021454, + "step": 22522 + }, + { + "epoch": 0.703875, + "grad_norm": 3.203125, + "grad_norm_var": 2.1633453369140625, + "learning_rate": 0.0001, + "loss": 5.5807, + "loss/crossentropy": 2.4016828536987305, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17141630500555038, + "step": 22524 + }, + { + "epoch": 0.7039375, + "grad_norm": 2.890625, + "grad_norm_var": 2.191389973958333, + "learning_rate": 0.0001, + "loss": 5.4236, + "loss/crossentropy": 2.412192225456238, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15543314814567566, + "step": 22526 + }, + { + "epoch": 0.704, + "grad_norm": 3.234375, + "grad_norm_var": 2.1904256184895834, + "learning_rate": 0.0001, + "loss": 5.7931, + "loss/crossentropy": 2.5884499549865723, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1716323122382164, + "step": 22528 + }, + { + "epoch": 0.7040625, + "grad_norm": 2.984375, + "grad_norm_var": 0.0510162353515625, + "learning_rate": 0.0001, + "loss": 5.7378, + "loss/crossentropy": 2.5632896423339844, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17174633592367172, + "step": 22530 + }, + { + "epoch": 0.704125, + "grad_norm": 3.34375, + "grad_norm_var": 0.048173014322916666, + "learning_rate": 0.0001, + "loss": 5.9122, + "loss/crossentropy": 2.7450913190841675, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16865945607423782, + "step": 22532 + }, + { + "epoch": 0.7041875, + "grad_norm": 3.5625, + "grad_norm_var": 0.0542633056640625, + "learning_rate": 0.0001, + "loss": 5.4997, + "loss/crossentropy": 2.4344513416290283, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1615985557436943, + "step": 22534 + }, + { + "epoch": 0.70425, + "grad_norm": 3.125, + "grad_norm_var": 0.05813395182291667, + "learning_rate": 0.0001, + "loss": 5.1764, + "loss/crossentropy": 2.20544570684433, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.14748192578554153, + "step": 22536 + }, + { + "epoch": 0.7043125, + "grad_norm": 3.5, + "grad_norm_var": 0.062132771809895834, + "learning_rate": 0.0001, + "loss": 5.5872, + "loss/crossentropy": 2.50904643535614, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15976479649543762, + "step": 22538 + }, + { + "epoch": 0.704375, + "grad_norm": 3.125, + "grad_norm_var": 0.06211649576822917, + "learning_rate": 0.0001, + "loss": 5.6799, + "loss/crossentropy": 2.5684973001480103, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16817133128643036, + "step": 22540 + }, + { + "epoch": 0.7044375, + "grad_norm": 3.5625, + "grad_norm_var": 0.062108357747395836, + "learning_rate": 0.0001, + "loss": 5.94, + "loss/crossentropy": 2.6243035793304443, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17883047461509705, + "step": 22542 + }, + { + "epoch": 0.7045, + "grad_norm": 2.6875, + "grad_norm_var": 0.07993876139322917, + "learning_rate": 0.0001, + "loss": 5.8652, + "loss/crossentropy": 2.7938671112060547, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16025998443365097, + "step": 22544 + }, + { + "epoch": 0.7045625, + "grad_norm": 3.078125, + "grad_norm_var": 0.0614166259765625, + "learning_rate": 0.0001, + "loss": 5.5639, + "loss/crossentropy": 2.563488721847534, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15668240934610367, + "step": 22546 + }, + { + "epoch": 0.704625, + "grad_norm": 3.078125, + "grad_norm_var": 0.05966796875, + "learning_rate": 0.0001, + "loss": 5.712, + "loss/crossentropy": 2.61086368560791, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16246207803487778, + "step": 22548 + }, + { + "epoch": 0.7046875, + "grad_norm": 3.203125, + "grad_norm_var": 0.05096028645833333, + "learning_rate": 0.0001, + "loss": 5.9104, + "loss/crossentropy": 2.6754400730133057, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17544656991958618, + "step": 22550 + }, + { + "epoch": 0.70475, + "grad_norm": 3.15625, + "grad_norm_var": 0.047587076822916664, + "learning_rate": 0.0001, + "loss": 5.9104, + "loss/crossentropy": 2.794792652130127, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1631246656179428, + "step": 22552 + }, + { + "epoch": 0.7048125, + "grad_norm": 2.921875, + "grad_norm_var": 0.040234375, + "learning_rate": 0.0001, + "loss": 5.755, + "loss/crossentropy": 2.629794120788574, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16799188405275345, + "step": 22554 + }, + { + "epoch": 0.704875, + "grad_norm": 2.953125, + "grad_norm_var": 0.04254557291666667, + "learning_rate": 0.0001, + "loss": 5.7324, + "loss/crossentropy": 2.6186139583587646, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16411759704351425, + "step": 22556 + }, + { + "epoch": 0.7049375, + "grad_norm": 3.171875, + "grad_norm_var": 0.030582682291666666, + "learning_rate": 0.0001, + "loss": 5.9094, + "loss/crossentropy": 2.634056806564331, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.176364503800869, + "step": 22558 + }, + { + "epoch": 0.705, + "grad_norm": 3.0, + "grad_norm_var": 0.019136555989583335, + "learning_rate": 0.0001, + "loss": 5.8174, + "loss/crossentropy": 2.7001943588256836, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.164451003074646, + "step": 22560 + }, + { + "epoch": 0.7050625, + "grad_norm": 3.03125, + "grad_norm_var": 0.03247782389322917, + "learning_rate": 0.0001, + "loss": 5.9079, + "loss/crossentropy": 2.684187173843384, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17237317562103271, + "step": 22562 + }, + { + "epoch": 0.705125, + "grad_norm": 4.03125, + "grad_norm_var": 0.07877197265625, + "learning_rate": 0.0001, + "loss": 5.9664, + "loss/crossentropy": 2.6163182258605957, + "loss/hidden": 1.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.1764097362756729, + "step": 22564 + }, + { + "epoch": 0.7051875, + "grad_norm": 2.921875, + "grad_norm_var": 0.08662007649739584, + "learning_rate": 0.0001, + "loss": 5.6746, + "loss/crossentropy": 2.6956058740615845, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15337137877941132, + "step": 22566 + }, + { + "epoch": 0.70525, + "grad_norm": 2.9375, + "grad_norm_var": 0.09000244140625, + "learning_rate": 0.0001, + "loss": 5.9217, + "loss/crossentropy": 2.71065890789032, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17501109093427658, + "step": 22568 + }, + { + "epoch": 0.7053125, + "grad_norm": 3.046875, + "grad_norm_var": 0.14767252604166667, + "learning_rate": 0.0001, + "loss": 5.6179, + "loss/crossentropy": 2.4611226320266724, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16763439029455185, + "step": 22570 + }, + { + "epoch": 0.705375, + "grad_norm": 2.96875, + "grad_norm_var": 0.14949442545572916, + "learning_rate": 0.0001, + "loss": 5.7302, + "loss/crossentropy": 2.598994493484497, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16390138864517212, + "step": 22572 + }, + { + "epoch": 0.7054375, + "grad_norm": 3.046875, + "grad_norm_var": 0.154541015625, + "learning_rate": 0.0001, + "loss": 5.1796, + "loss/crossentropy": 2.1796997785568237, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.14882108569145203, + "step": 22574 + }, + { + "epoch": 0.7055, + "grad_norm": 3.40625, + "grad_norm_var": 0.15673421223958334, + "learning_rate": 0.0001, + "loss": 5.9949, + "loss/crossentropy": 2.7754679918289185, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1719452291727066, + "step": 22576 + }, + { + "epoch": 0.7055625, + "grad_norm": 2.84375, + "grad_norm_var": 0.15115458170572918, + "learning_rate": 0.0001, + "loss": 5.7318, + "loss/crossentropy": 2.6822497844696045, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1623767912387848, + "step": 22578 + }, + { + "epoch": 0.705625, + "grad_norm": 3.046875, + "grad_norm_var": 0.10041910807291667, + "learning_rate": 0.0001, + "loss": 5.6734, + "loss/crossentropy": 2.65487802028656, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1580996885895729, + "step": 22580 + }, + { + "epoch": 0.7056875, + "grad_norm": 3.046875, + "grad_norm_var": 0.09967447916666666, + "learning_rate": 0.0001, + "loss": 5.6133, + "loss/crossentropy": 2.5546118021011353, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1605612114071846, + "step": 22582 + }, + { + "epoch": 0.70575, + "grad_norm": 3.75, + "grad_norm_var": 0.1258209228515625, + "learning_rate": 0.0001, + "loss": 5.7549, + "loss/crossentropy": 2.6321762800216675, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16734836995601654, + "step": 22584 + }, + { + "epoch": 0.7058125, + "grad_norm": 3.015625, + "grad_norm_var": 0.0508209228515625, + "learning_rate": 0.0001, + "loss": 5.6999, + "loss/crossentropy": 2.607453227043152, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16119518876075745, + "step": 22586 + }, + { + "epoch": 0.705875, + "grad_norm": 2.921875, + "grad_norm_var": 0.05927327473958333, + "learning_rate": 0.0001, + "loss": 5.8221, + "loss/crossentropy": 2.6366634368896484, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17049414664506912, + "step": 22588 + }, + { + "epoch": 0.7059375, + "grad_norm": 2.859375, + "grad_norm_var": 0.0621978759765625, + "learning_rate": 0.0001, + "loss": 5.625, + "loss/crossentropy": 2.556036114692688, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16197659075260162, + "step": 22590 + }, + { + "epoch": 0.706, + "grad_norm": 3.09375, + "grad_norm_var": 0.05572916666666667, + "learning_rate": 0.0001, + "loss": 5.6676, + "loss/crossentropy": 2.602640151977539, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16079703718423843, + "step": 22592 + }, + { + "epoch": 0.7060625, + "grad_norm": 2.703125, + "grad_norm_var": 0.06265360514322917, + "learning_rate": 0.0001, + "loss": 5.4918, + "loss/crossentropy": 2.4886010885238647, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15618036687374115, + "step": 22594 + }, + { + "epoch": 0.706125, + "grad_norm": 3.125, + "grad_norm_var": 0.05956624348958333, + "learning_rate": 0.0001, + "loss": 6.0395, + "loss/crossentropy": 2.781382203102112, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1773706078529358, + "step": 22596 + }, + { + "epoch": 0.7061875, + "grad_norm": 2.921875, + "grad_norm_var": 0.06730855305989583, + "learning_rate": 0.0001, + "loss": 5.4773, + "loss/crossentropy": 2.5254284143447876, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15222245454788208, + "step": 22598 + }, + { + "epoch": 0.70625, + "grad_norm": 2.96875, + "grad_norm_var": 0.03313395182291667, + "learning_rate": 0.0001, + "loss": 5.5662, + "loss/crossentropy": 2.493793249130249, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15880389511585236, + "step": 22600 + }, + { + "epoch": 0.7063125, + "grad_norm": 3.0, + "grad_norm_var": 0.034789021809895834, + "learning_rate": 0.0001, + "loss": 5.682, + "loss/crossentropy": 2.577637553215027, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16433850675821304, + "step": 22602 + }, + { + "epoch": 0.706375, + "grad_norm": 2.96875, + "grad_norm_var": 0.022541300455729166, + "learning_rate": 0.0001, + "loss": 5.4109, + "loss/crossentropy": 2.39993155002594, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.155780628323555, + "step": 22604 + }, + { + "epoch": 0.7064375, + "grad_norm": 9.0, + "grad_norm_var": 2.27955322265625, + "learning_rate": 0.0001, + "loss": 5.4753, + "loss/crossentropy": 2.225648522377014, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.18043851852416992, + "step": 22606 + }, + { + "epoch": 0.7065, + "grad_norm": 2.96875, + "grad_norm_var": 2.2759724934895833, + "learning_rate": 0.0001, + "loss": 5.4955, + "loss/crossentropy": 2.482828974723816, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15438725054264069, + "step": 22608 + }, + { + "epoch": 0.7065625, + "grad_norm": 3.25, + "grad_norm_var": 2.2589152018229166, + "learning_rate": 0.0001, + "loss": 5.6719, + "loss/crossentropy": 2.524471640586853, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16630669683218002, + "step": 22610 + }, + { + "epoch": 0.706625, + "grad_norm": 3.578125, + "grad_norm_var": 2.265062459309896, + "learning_rate": 0.0001, + "loss": 5.8865, + "loss/crossentropy": 2.6477824449539185, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17582054436206818, + "step": 22612 + }, + { + "epoch": 0.7066875, + "grad_norm": 2.953125, + "grad_norm_var": 2.2369049072265623, + "learning_rate": 0.0001, + "loss": 5.677, + "loss/crossentropy": 2.5583536624908447, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16382235288619995, + "step": 22614 + }, + { + "epoch": 0.70675, + "grad_norm": 2.84375, + "grad_norm_var": 2.2416951497395834, + "learning_rate": 0.0001, + "loss": 5.3948, + "loss/crossentropy": 2.3632187843322754, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15549806505441666, + "step": 22616 + }, + { + "epoch": 0.7068125, + "grad_norm": 2.9375, + "grad_norm_var": 2.2510650634765623, + "learning_rate": 0.0001, + "loss": 5.3167, + "loss/crossentropy": 2.351553440093994, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15432695299386978, + "step": 22618 + }, + { + "epoch": 0.706875, + "grad_norm": 3.0, + "grad_norm_var": 2.257664998372396, + "learning_rate": 0.0001, + "loss": 5.5772, + "loss/crossentropy": 2.5519754886627197, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15994001924991608, + "step": 22620 + }, + { + "epoch": 0.7069375, + "grad_norm": 2.703125, + "grad_norm_var": 0.03928934733072917, + "learning_rate": 0.0001, + "loss": 5.5584, + "loss/crossentropy": 2.647280216217041, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.14501510560512543, + "step": 22622 + }, + { + "epoch": 0.707, + "grad_norm": 2.8125, + "grad_norm_var": 0.04212239583333333, + "learning_rate": 0.0001, + "loss": 5.5797, + "loss/crossentropy": 2.595065474510193, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15432358533143997, + "step": 22624 + }, + { + "epoch": 0.7070625, + "grad_norm": 2.84375, + "grad_norm_var": 0.0416015625, + "learning_rate": 0.0001, + "loss": 5.7241, + "loss/crossentropy": 2.5783623456954956, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16809231787919998, + "step": 22626 + }, + { + "epoch": 0.707125, + "grad_norm": 3.21875, + "grad_norm_var": 0.13616536458333334, + "learning_rate": 0.0001, + "loss": 6.1164, + "loss/crossentropy": 2.7184011936187744, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.18667079508304596, + "step": 22628 + }, + { + "epoch": 0.7071875, + "grad_norm": 3.109375, + "grad_norm_var": 0.1376861572265625, + "learning_rate": 0.0001, + "loss": 5.3682, + "loss/crossentropy": 2.3315646648406982, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15912887454032898, + "step": 22630 + }, + { + "epoch": 0.70725, + "grad_norm": 3.171875, + "grad_norm_var": 0.13580322265625, + "learning_rate": 0.0001, + "loss": 5.6181, + "loss/crossentropy": 2.535012722015381, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16260423511266708, + "step": 22632 + }, + { + "epoch": 0.7073125, + "grad_norm": 5.1875, + "grad_norm_var": 0.40627848307291664, + "learning_rate": 0.0001, + "loss": 5.9121, + "loss/crossentropy": 2.619154453277588, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.18202994018793106, + "step": 22634 + }, + { + "epoch": 0.707375, + "grad_norm": 3.234375, + "grad_norm_var": 0.3915191650390625, + "learning_rate": 0.0001, + "loss": 5.699, + "loss/crossentropy": 2.5490050315856934, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16577870398759842, + "step": 22636 + }, + { + "epoch": 0.7074375, + "grad_norm": 2.9375, + "grad_norm_var": 0.37292378743489585, + "learning_rate": 0.0001, + "loss": 5.6291, + "loss/crossentropy": 2.544523000717163, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16509998589754105, + "step": 22638 + }, + { + "epoch": 0.7075, + "grad_norm": 2.828125, + "grad_norm_var": 0.358642578125, + "learning_rate": 0.0001, + "loss": 5.4892, + "loss/crossentropy": 2.431329131126404, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.15422901511192322, + "step": 22640 + }, + { + "epoch": 0.7075625, + "grad_norm": 3.03125, + "grad_norm_var": 0.3481353759765625, + "learning_rate": 0.0001, + "loss": 5.5501, + "loss/crossentropy": 2.4429216384887695, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16305910795927048, + "step": 22642 + }, + { + "epoch": 0.707625, + "grad_norm": 3.15625, + "grad_norm_var": 0.28142903645833334, + "learning_rate": 0.0001, + "loss": 5.5926, + "loss/crossentropy": 2.505157470703125, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16421548277139664, + "step": 22644 + }, + { + "epoch": 0.7076875, + "grad_norm": 2.96875, + "grad_norm_var": 0.29156494140625, + "learning_rate": 0.0001, + "loss": 5.4453, + "loss/crossentropy": 2.4603381156921387, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15552835166454315, + "step": 22646 + }, + { + "epoch": 0.70775, + "grad_norm": 3.046875, + "grad_norm_var": 0.2938629150390625, + "learning_rate": 0.0001, + "loss": 5.461, + "loss/crossentropy": 2.3692561388015747, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16113196313381195, + "step": 22648 + }, + { + "epoch": 0.7078125, + "grad_norm": 3.1875, + "grad_norm_var": 0.031233723958333334, + "learning_rate": 0.0001, + "loss": 5.5372, + "loss/crossentropy": 2.4094094038009644, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1682494729757309, + "step": 22650 + }, + { + "epoch": 0.707875, + "grad_norm": 3.296875, + "grad_norm_var": 0.03400777180989583, + "learning_rate": 0.0001, + "loss": 5.8546, + "loss/crossentropy": 2.647633194923401, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1706990748643875, + "step": 22652 + }, + { + "epoch": 0.7079375, + "grad_norm": 3.109375, + "grad_norm_var": 0.0291656494140625, + "learning_rate": 0.0001, + "loss": 5.6755, + "loss/crossentropy": 2.5117732286453247, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16715115308761597, + "step": 22654 + }, + { + "epoch": 0.708, + "grad_norm": 3.375, + "grad_norm_var": 0.0236480712890625, + "learning_rate": 0.0001, + "loss": 5.5235, + "loss/crossentropy": 2.3812849521636963, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16695304214954376, + "step": 22656 + }, + { + "epoch": 0.7080625, + "grad_norm": 3.359375, + "grad_norm_var": 0.023249308268229168, + "learning_rate": 0.0001, + "loss": 5.3083, + "loss/crossentropy": 2.313563823699951, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15494468063116074, + "step": 22658 + }, + { + "epoch": 0.708125, + "grad_norm": 2.953125, + "grad_norm_var": 0.030720011393229166, + "learning_rate": 0.0001, + "loss": 5.5461, + "loss/crossentropy": 2.483505964279175, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1617322638630867, + "step": 22660 + }, + { + "epoch": 0.7081875, + "grad_norm": 3.265625, + "grad_norm_var": 0.025423177083333335, + "learning_rate": 0.0001, + "loss": 5.7102, + "loss/crossentropy": 2.581833243370056, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16479483246803284, + "step": 22662 + }, + { + "epoch": 0.70825, + "grad_norm": 3.109375, + "grad_norm_var": 0.029613240559895834, + "learning_rate": 0.0001, + "loss": 5.5946, + "loss/crossentropy": 2.4872822761535645, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16737564653158188, + "step": 22664 + }, + { + "epoch": 0.7083125, + "grad_norm": 3.671875, + "grad_norm_var": 0.05006103515625, + "learning_rate": 0.0001, + "loss": 5.5572, + "loss/crossentropy": 2.428568482398987, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16716498136520386, + "step": 22666 + }, + { + "epoch": 0.708375, + "grad_norm": 3.203125, + "grad_norm_var": 0.049128214518229164, + "learning_rate": 0.0001, + "loss": 6.1102, + "loss/crossentropy": 2.7997725009918213, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1802627444267273, + "step": 22668 + }, + { + "epoch": 0.7084375, + "grad_norm": 2.953125, + "grad_norm_var": 0.08024088541666667, + "learning_rate": 0.0001, + "loss": 5.6215, + "loss/crossentropy": 2.4986475706100464, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16658470779657364, + "step": 22670 + }, + { + "epoch": 0.7085, + "grad_norm": 3.1875, + "grad_norm_var": 0.08087565104166666, + "learning_rate": 0.0001, + "loss": 5.6899, + "loss/crossentropy": 2.5867003202438354, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16501083225011826, + "step": 22672 + }, + { + "epoch": 0.7085625, + "grad_norm": 3.5, + "grad_norm_var": 0.092041015625, + "learning_rate": 0.0001, + "loss": 6.0346, + "loss/crossentropy": 2.792657971382141, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17731842398643494, + "step": 22674 + }, + { + "epoch": 0.708625, + "grad_norm": 3.34375, + "grad_norm_var": 0.08580322265625, + "learning_rate": 0.0001, + "loss": 5.501, + "loss/crossentropy": 2.4093992710113525, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16267625987529755, + "step": 22676 + }, + { + "epoch": 0.7086875, + "grad_norm": 2.921875, + "grad_norm_var": 0.09345296223958334, + "learning_rate": 0.0001, + "loss": 5.6387, + "loss/crossentropy": 2.5944132804870605, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1579430103302002, + "step": 22678 + }, + { + "epoch": 0.70875, + "grad_norm": 3.109375, + "grad_norm_var": 0.09018452962239583, + "learning_rate": 0.0001, + "loss": 5.9749, + "loss/crossentropy": 2.775382161140442, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1730796843767166, + "step": 22680 + }, + { + "epoch": 0.7088125, + "grad_norm": 2.96875, + "grad_norm_var": 0.06813863118489584, + "learning_rate": 0.0001, + "loss": 5.4292, + "loss/crossentropy": 2.382672429084778, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16089950501918793, + "step": 22682 + }, + { + "epoch": 0.708875, + "grad_norm": 2.8125, + "grad_norm_var": 0.07604166666666666, + "learning_rate": 0.0001, + "loss": 5.5873, + "loss/crossentropy": 2.5621368885040283, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15798881649971008, + "step": 22684 + }, + { + "epoch": 0.7089375, + "grad_norm": 3.203125, + "grad_norm_var": 0.037398274739583334, + "learning_rate": 0.0001, + "loss": 6.1326, + "loss/crossentropy": 2.880889654159546, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17556104063987732, + "step": 22686 + }, + { + "epoch": 0.709, + "grad_norm": 2.875, + "grad_norm_var": 0.03953348795572917, + "learning_rate": 0.0001, + "loss": 5.6047, + "loss/crossentropy": 2.578131675720215, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1596865952014923, + "step": 22688 + }, + { + "epoch": 0.7090625, + "grad_norm": 3.171875, + "grad_norm_var": 0.025202433268229168, + "learning_rate": 0.0001, + "loss": 5.7346, + "loss/crossentropy": 2.608475089073181, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1661328747868538, + "step": 22690 + }, + { + "epoch": 0.709125, + "grad_norm": 3.203125, + "grad_norm_var": 0.020807902018229168, + "learning_rate": 0.0001, + "loss": 5.7047, + "loss/crossentropy": 2.5614583492279053, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16705966740846634, + "step": 22692 + }, + { + "epoch": 0.7091875, + "grad_norm": 3.265625, + "grad_norm_var": 0.021565755208333332, + "learning_rate": 0.0001, + "loss": 5.7064, + "loss/crossentropy": 2.5638712644577026, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1705031916499138, + "step": 22694 + }, + { + "epoch": 0.70925, + "grad_norm": 3.0, + "grad_norm_var": 0.02838134765625, + "learning_rate": 0.0001, + "loss": 5.2853, + "loss/crossentropy": 2.359517216682434, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.1519572213292122, + "step": 22696 + }, + { + "epoch": 0.7093125, + "grad_norm": 3.0, + "grad_norm_var": 0.025484212239583335, + "learning_rate": 0.0001, + "loss": 5.6032, + "loss/crossentropy": 2.5778008699417114, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.159569650888443, + "step": 22698 + }, + { + "epoch": 0.709375, + "grad_norm": 3.171875, + "grad_norm_var": 0.022118123372395833, + "learning_rate": 0.0001, + "loss": 5.465, + "loss/crossentropy": 2.4808367490768433, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15466291457414627, + "step": 22700 + }, + { + "epoch": 0.7094375, + "grad_norm": 3.25, + "grad_norm_var": 0.024739583333333332, + "learning_rate": 0.0001, + "loss": 5.7297, + "loss/crossentropy": 2.6298859119415283, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16310980916023254, + "step": 22702 + }, + { + "epoch": 0.7095, + "grad_norm": 3.03125, + "grad_norm_var": 0.022379557291666668, + "learning_rate": 0.0001, + "loss": 5.6075, + "loss/crossentropy": 2.5181902647018433, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1624443531036377, + "step": 22704 + }, + { + "epoch": 0.7095625, + "grad_norm": 2.859375, + "grad_norm_var": 0.025731404622395832, + "learning_rate": 0.0001, + "loss": 5.5071, + "loss/crossentropy": 2.48038911819458, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15618938207626343, + "step": 22706 + }, + { + "epoch": 0.709625, + "grad_norm": 2.828125, + "grad_norm_var": 0.026236979166666667, + "learning_rate": 0.0001, + "loss": 5.639, + "loss/crossentropy": 2.5747874975204468, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1611132174730301, + "step": 22708 + }, + { + "epoch": 0.7096875, + "grad_norm": 3.375, + "grad_norm_var": 0.03178609212239583, + "learning_rate": 0.0001, + "loss": 5.6398, + "loss/crossentropy": 2.533713221549988, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1668551117181778, + "step": 22710 + }, + { + "epoch": 0.70975, + "grad_norm": 2.921875, + "grad_norm_var": 0.025202433268229168, + "learning_rate": 0.0001, + "loss": 5.7806, + "loss/crossentropy": 2.641892910003662, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1666097566485405, + "step": 22712 + }, + { + "epoch": 0.7098125, + "grad_norm": 3.484375, + "grad_norm_var": 0.03564046223958333, + "learning_rate": 0.0001, + "loss": 5.7634, + "loss/crossentropy": 2.483535051345825, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17407961189746857, + "step": 22714 + }, + { + "epoch": 0.709875, + "grad_norm": 2.8125, + "grad_norm_var": 0.03870442708333333, + "learning_rate": 0.0001, + "loss": 5.4512, + "loss/crossentropy": 2.4775872230529785, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15322404354810715, + "step": 22716 + }, + { + "epoch": 0.7099375, + "grad_norm": 2.890625, + "grad_norm_var": 0.0653961181640625, + "learning_rate": 0.0001, + "loss": 5.4074, + "loss/crossentropy": 2.2718788385391235, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16316396743059158, + "step": 22718 + }, + { + "epoch": 0.71, + "grad_norm": 3.265625, + "grad_norm_var": 0.06968994140625, + "learning_rate": 0.0001, + "loss": 5.4643, + "loss/crossentropy": 2.4302117824554443, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15927303582429886, + "step": 22720 + }, + { + "epoch": 0.7100625, + "grad_norm": 3.109375, + "grad_norm_var": 0.06345113118489583, + "learning_rate": 0.0001, + "loss": 5.4779, + "loss/crossentropy": 2.4662187099456787, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1581977978348732, + "step": 22722 + }, + { + "epoch": 0.710125, + "grad_norm": 3.453125, + "grad_norm_var": 0.12180989583333333, + "learning_rate": 0.0001, + "loss": 5.8428, + "loss/crossentropy": 2.6261080503463745, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17323270440101624, + "step": 22724 + }, + { + "epoch": 0.7101875, + "grad_norm": 3.125, + "grad_norm_var": 0.12086181640625, + "learning_rate": 0.0001, + "loss": 5.5741, + "loss/crossentropy": 2.5218567848205566, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15678295493125916, + "step": 22726 + }, + { + "epoch": 0.71025, + "grad_norm": 3.734375, + "grad_norm_var": 0.1319244384765625, + "learning_rate": 0.0001, + "loss": 6.0828, + "loss/crossentropy": 2.759239077568054, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18157602846622467, + "step": 22728 + }, + { + "epoch": 0.7103125, + "grad_norm": 2.90625, + "grad_norm_var": 0.14137369791666668, + "learning_rate": 0.0001, + "loss": 5.3898, + "loss/crossentropy": 2.4318522214889526, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.14813748002052307, + "step": 22730 + }, + { + "epoch": 0.710375, + "grad_norm": 2.953125, + "grad_norm_var": 0.13028971354166666, + "learning_rate": 0.0001, + "loss": 5.9064, + "loss/crossentropy": 2.769029974937439, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16646908223628998, + "step": 22732 + }, + { + "epoch": 0.7104375, + "grad_norm": 3.171875, + "grad_norm_var": 0.10664774576822916, + "learning_rate": 0.0001, + "loss": 5.6145, + "loss/crossentropy": 2.5253701210021973, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1600886434316635, + "step": 22734 + }, + { + "epoch": 0.7105, + "grad_norm": 2.90625, + "grad_norm_var": 0.10767822265625, + "learning_rate": 0.0001, + "loss": 5.7827, + "loss/crossentropy": 2.6507482528686523, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16319461911916733, + "step": 22736 + }, + { + "epoch": 0.7105625, + "grad_norm": 3.203125, + "grad_norm_var": 0.106640625, + "learning_rate": 0.0001, + "loss": 5.7554, + "loss/crossentropy": 2.5523555278778076, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17187045514583588, + "step": 22738 + }, + { + "epoch": 0.710625, + "grad_norm": 2.90625, + "grad_norm_var": 0.050634765625, + "learning_rate": 0.0001, + "loss": 5.6187, + "loss/crossentropy": 2.5167863368988037, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1617513820528984, + "step": 22740 + }, + { + "epoch": 0.7106875, + "grad_norm": 3.03125, + "grad_norm_var": 0.0516998291015625, + "learning_rate": 0.0001, + "loss": 5.4693, + "loss/crossentropy": 2.4850971698760986, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15467499941587448, + "step": 22742 + }, + { + "epoch": 0.71075, + "grad_norm": 2.890625, + "grad_norm_var": 0.01519775390625, + "learning_rate": 0.0001, + "loss": 5.7114, + "loss/crossentropy": 2.630742907524109, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1627499684691429, + "step": 22744 + }, + { + "epoch": 0.7108125, + "grad_norm": 3.03125, + "grad_norm_var": 0.016499837239583332, + "learning_rate": 0.0001, + "loss": 5.6352, + "loss/crossentropy": 2.5919694900512695, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.16252951323986053, + "step": 22746 + }, + { + "epoch": 0.710875, + "grad_norm": 2.9375, + "grad_norm_var": 0.015816243489583333, + "learning_rate": 0.0001, + "loss": 5.4342, + "loss/crossentropy": 2.4220200777053833, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.16059560328722, + "step": 22748 + }, + { + "epoch": 0.7109375, + "grad_norm": 2.9375, + "grad_norm_var": 0.013948567708333333, + "learning_rate": 0.0001, + "loss": 5.5684, + "loss/crossentropy": 2.493459463119507, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1653081774711609, + "step": 22750 + }, + { + "epoch": 0.711, + "grad_norm": 3.03125, + "grad_norm_var": 0.020555623372395835, + "learning_rate": 0.0001, + "loss": 5.845, + "loss/crossentropy": 2.6640223264694214, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17083007842302322, + "step": 22752 + }, + { + "epoch": 0.7110625, + "grad_norm": 4.375, + "grad_norm_var": 0.13097330729166667, + "learning_rate": 0.0001, + "loss": 5.9282, + "loss/crossentropy": 2.667199730873108, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.171409510076046, + "step": 22754 + }, + { + "epoch": 0.711125, + "grad_norm": 3.0, + "grad_norm_var": 0.12916259765625, + "learning_rate": 0.0001, + "loss": 5.7468, + "loss/crossentropy": 2.60368275642395, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16821421682834625, + "step": 22756 + }, + { + "epoch": 0.7111875, + "grad_norm": 3.265625, + "grad_norm_var": 0.12870992024739583, + "learning_rate": 0.0001, + "loss": 5.8475, + "loss/crossentropy": 2.6243757009506226, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.173094242811203, + "step": 22758 + }, + { + "epoch": 0.71125, + "grad_norm": 2.859375, + "grad_norm_var": 0.12905985514322918, + "learning_rate": 0.0001, + "loss": 5.4948, + "loss/crossentropy": 2.5094510316848755, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1559608057141304, + "step": 22760 + }, + { + "epoch": 0.7113125, + "grad_norm": 3.03125, + "grad_norm_var": 0.12356669108072917, + "learning_rate": 0.0001, + "loss": 5.4227, + "loss/crossentropy": 2.4355177879333496, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15731318295001984, + "step": 22762 + }, + { + "epoch": 0.711375, + "grad_norm": 3.15625, + "grad_norm_var": 0.16108296712239584, + "learning_rate": 0.0001, + "loss": 5.772, + "loss/crossentropy": 2.555185317993164, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1712891086935997, + "step": 22764 + }, + { + "epoch": 0.7114375, + "grad_norm": 3.359375, + "grad_norm_var": 0.15607096354166666, + "learning_rate": 0.0001, + "loss": 5.8258, + "loss/crossentropy": 2.6384923458099365, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.17381110787391663, + "step": 22766 + }, + { + "epoch": 0.7115, + "grad_norm": 3.03125, + "grad_norm_var": 0.16676432291666668, + "learning_rate": 0.0001, + "loss": 5.3878, + "loss/crossentropy": 2.346682071685791, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1568494439125061, + "step": 22768 + }, + { + "epoch": 0.7115625, + "grad_norm": 3.125, + "grad_norm_var": 0.08124898274739584, + "learning_rate": 0.0001, + "loss": 5.2037, + "loss/crossentropy": 2.2740384340286255, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.14999421685934067, + "step": 22770 + }, + { + "epoch": 0.711625, + "grad_norm": 3.375, + "grad_norm_var": 0.08825581868489583, + "learning_rate": 0.0001, + "loss": 6.1145, + "loss/crossentropy": 2.7673765420913696, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1823655590415001, + "step": 22772 + }, + { + "epoch": 0.7116875, + "grad_norm": 2.96875, + "grad_norm_var": 0.09159749348958333, + "learning_rate": 0.0001, + "loss": 5.3939, + "loss/crossentropy": 2.4127246141433716, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15437036752700806, + "step": 22774 + }, + { + "epoch": 0.71175, + "grad_norm": 2.828125, + "grad_norm_var": 0.09659830729166667, + "learning_rate": 0.0001, + "loss": 5.6882, + "loss/crossentropy": 2.651946783065796, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16104654967784882, + "step": 22776 + }, + { + "epoch": 0.7118125, + "grad_norm": 3.1875, + "grad_norm_var": 0.09666341145833333, + "learning_rate": 0.0001, + "loss": 5.9907, + "loss/crossentropy": 2.804213285446167, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17099642753601074, + "step": 22778 + }, + { + "epoch": 0.711875, + "grad_norm": 3.03125, + "grad_norm_var": 0.046923828125, + "learning_rate": 0.0001, + "loss": 5.7334, + "loss/crossentropy": 2.5786982774734497, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.17054951190948486, + "step": 22780 + }, + { + "epoch": 0.7119375, + "grad_norm": 3.578125, + "grad_norm_var": 0.057062784830729164, + "learning_rate": 0.0001, + "loss": 5.1693, + "loss/crossentropy": 2.170661151409149, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15143033862113953, + "step": 22782 + }, + { + "epoch": 0.712, + "grad_norm": 3.078125, + "grad_norm_var": 0.0527984619140625, + "learning_rate": 0.0001, + "loss": 5.762, + "loss/crossentropy": 2.667365312576294, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16376512497663498, + "step": 22784 + }, + { + "epoch": 0.7120625, + "grad_norm": 2.90625, + "grad_norm_var": 0.04910481770833333, + "learning_rate": 0.0001, + "loss": 5.5265, + "loss/crossentropy": 2.572754383087158, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1531859189271927, + "step": 22786 + }, + { + "epoch": 0.712125, + "grad_norm": 2.890625, + "grad_norm_var": 0.033430989583333334, + "learning_rate": 0.0001, + "loss": 5.5468, + "loss/crossentropy": 2.4839890003204346, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1625334694981575, + "step": 22788 + }, + { + "epoch": 0.7121875, + "grad_norm": 2.875, + "grad_norm_var": 0.034586588541666664, + "learning_rate": 0.0001, + "loss": 5.2954, + "loss/crossentropy": 2.3394761085510254, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15106205642223358, + "step": 22790 + }, + { + "epoch": 0.71225, + "grad_norm": 3.140625, + "grad_norm_var": 0.031722005208333334, + "learning_rate": 0.0001, + "loss": 5.7823, + "loss/crossentropy": 2.633091449737549, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16413924098014832, + "step": 22792 + }, + { + "epoch": 0.7123125, + "grad_norm": 3.015625, + "grad_norm_var": 0.0318511962890625, + "learning_rate": 0.0001, + "loss": 5.7476, + "loss/crossentropy": 2.5560721158981323, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16954263299703598, + "step": 22794 + }, + { + "epoch": 0.712375, + "grad_norm": 3.921875, + "grad_norm_var": 0.07866923014322917, + "learning_rate": 0.0001, + "loss": 5.5303, + "loss/crossentropy": 2.4877405166625977, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15621329098939896, + "step": 22796 + }, + { + "epoch": 0.7124375, + "grad_norm": 3.171875, + "grad_norm_var": 0.06638895670572917, + "learning_rate": 0.0001, + "loss": 5.9457, + "loss/crossentropy": 2.737141489982605, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17203232645988464, + "step": 22798 + }, + { + "epoch": 0.7125, + "grad_norm": 2.96875, + "grad_norm_var": 0.0666168212890625, + "learning_rate": 0.0001, + "loss": 5.7573, + "loss/crossentropy": 2.634291172027588, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1650400161743164, + "step": 22800 + }, + { + "epoch": 0.7125625, + "grad_norm": 3.3125, + "grad_norm_var": 0.08924051920572916, + "learning_rate": 0.0001, + "loss": 6.0942, + "loss/crossentropy": 2.8320658206939697, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17621354013681412, + "step": 22802 + }, + { + "epoch": 0.712625, + "grad_norm": 3.078125, + "grad_norm_var": 0.08406575520833333, + "learning_rate": 0.0001, + "loss": 5.4304, + "loss/crossentropy": 2.411654829978943, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15851899981498718, + "step": 22804 + }, + { + "epoch": 0.7126875, + "grad_norm": 2.9375, + "grad_norm_var": 0.08391011555989583, + "learning_rate": 0.0001, + "loss": 5.3661, + "loss/crossentropy": 2.352829337120056, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15562819689512253, + "step": 22806 + }, + { + "epoch": 0.71275, + "grad_norm": 2.953125, + "grad_norm_var": 0.0881256103515625, + "learning_rate": 0.0001, + "loss": 5.4315, + "loss/crossentropy": 2.429062843322754, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.1592329815030098, + "step": 22808 + }, + { + "epoch": 0.7128125, + "grad_norm": 2.859375, + "grad_norm_var": 0.09368489583333334, + "learning_rate": 0.0001, + "loss": 5.3305, + "loss/crossentropy": 2.345287322998047, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15555164217948914, + "step": 22810 + }, + { + "epoch": 0.712875, + "grad_norm": 3.125, + "grad_norm_var": 0.05252278645833333, + "learning_rate": 0.0001, + "loss": 5.8159, + "loss/crossentropy": 2.6428064107894897, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16926005482673645, + "step": 22812 + }, + { + "epoch": 0.7129375, + "grad_norm": 3.375, + "grad_norm_var": 0.05526936848958333, + "learning_rate": 0.0001, + "loss": 5.7927, + "loss/crossentropy": 2.6445947885513306, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16754399240016937, + "step": 22814 + }, + { + "epoch": 0.713, + "grad_norm": 3.390625, + "grad_norm_var": 0.11797587076822917, + "learning_rate": 0.0001, + "loss": 6.1274, + "loss/crossentropy": 2.7830978631973267, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.18716295063495636, + "step": 22816 + }, + { + "epoch": 0.7130625, + "grad_norm": 2.984375, + "grad_norm_var": 0.09731343587239584, + "learning_rate": 0.0001, + "loss": 5.7858, + "loss/crossentropy": 2.581252932548523, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1724056899547577, + "step": 22818 + }, + { + "epoch": 0.713125, + "grad_norm": 3.0, + "grad_norm_var": 0.1576324462890625, + "learning_rate": 0.0001, + "loss": 5.6434, + "loss/crossentropy": 2.4850512742996216, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.16231805831193924, + "step": 22820 + }, + { + "epoch": 0.7131875, + "grad_norm": 3.203125, + "grad_norm_var": 0.15260009765625, + "learning_rate": 0.0001, + "loss": 5.5542, + "loss/crossentropy": 2.5062475204467773, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16105009615421295, + "step": 22822 + }, + { + "epoch": 0.71325, + "grad_norm": 2.984375, + "grad_norm_var": 0.15286051432291667, + "learning_rate": 0.0001, + "loss": 5.4837, + "loss/crossentropy": 2.4561463594436646, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15549034625291824, + "step": 22824 + }, + { + "epoch": 0.7133125, + "grad_norm": 2.9375, + "grad_norm_var": 0.17031962076822918, + "learning_rate": 0.0001, + "loss": 5.3374, + "loss/crossentropy": 2.40144145488739, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.14945203065872192, + "step": 22826 + }, + { + "epoch": 0.713375, + "grad_norm": 3.703125, + "grad_norm_var": 0.19029541015625, + "learning_rate": 0.0001, + "loss": 5.7645, + "loss/crossentropy": 2.567002296447754, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17092538625001907, + "step": 22828 + }, + { + "epoch": 0.7134375, + "grad_norm": 3.140625, + "grad_norm_var": 0.18230692545572916, + "learning_rate": 0.0001, + "loss": 5.6136, + "loss/crossentropy": 2.457472801208496, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16835028678178787, + "step": 22830 + }, + { + "epoch": 0.7135, + "grad_norm": 3.296875, + "grad_norm_var": 0.12547098795572917, + "learning_rate": 0.0001, + "loss": 6.0906, + "loss/crossentropy": 2.879745602607727, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1738201305270195, + "step": 22832 + }, + { + "epoch": 0.7135625, + "grad_norm": 3.03125, + "grad_norm_var": 0.13761393229166666, + "learning_rate": 0.0001, + "loss": 5.8168, + "loss/crossentropy": 2.593244194984436, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17547693848609924, + "step": 22834 + }, + { + "epoch": 0.713625, + "grad_norm": 3.21875, + "grad_norm_var": 0.4930338541666667, + "learning_rate": 0.0001, + "loss": 6.4787, + "loss/crossentropy": 2.883357286453247, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.2052336409687996, + "step": 22836 + }, + { + "epoch": 0.7136875, + "grad_norm": 3.015625, + "grad_norm_var": 0.49075419108072915, + "learning_rate": 0.0001, + "loss": 5.6144, + "loss/crossentropy": 2.549429774284363, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1572815477848053, + "step": 22838 + }, + { + "epoch": 0.71375, + "grad_norm": 3.078125, + "grad_norm_var": 0.48004150390625, + "learning_rate": 0.0001, + "loss": 5.672, + "loss/crossentropy": 2.589589238166809, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16292711347341537, + "step": 22840 + }, + { + "epoch": 0.7138125, + "grad_norm": 2.953125, + "grad_norm_var": 0.45465087890625, + "learning_rate": 0.0001, + "loss": 5.4062, + "loss/crossentropy": 2.3374747037887573, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16000014543533325, + "step": 22842 + }, + { + "epoch": 0.713875, + "grad_norm": 2.796875, + "grad_norm_var": 0.45360921223958334, + "learning_rate": 0.0001, + "loss": 5.7364, + "loss/crossentropy": 2.690854072570801, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16237051039934158, + "step": 22844 + }, + { + "epoch": 0.7139375, + "grad_norm": 2.953125, + "grad_norm_var": 0.4587198893229167, + "learning_rate": 0.0001, + "loss": 5.636, + "loss/crossentropy": 2.5606768131256104, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16455943137407303, + "step": 22846 + }, + { + "epoch": 0.714, + "grad_norm": 3.015625, + "grad_norm_var": 0.4665323893229167, + "learning_rate": 0.0001, + "loss": 5.7475, + "loss/crossentropy": 2.6324750185012817, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1657993420958519, + "step": 22848 + }, + { + "epoch": 0.7140625, + "grad_norm": 3.265625, + "grad_norm_var": 0.4652303059895833, + "learning_rate": 0.0001, + "loss": 5.621, + "loss/crossentropy": 2.5494478940963745, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1606711894273758, + "step": 22850 + }, + { + "epoch": 0.714125, + "grad_norm": 3.265625, + "grad_norm_var": 0.0226470947265625, + "learning_rate": 0.0001, + "loss": 5.8994, + "loss/crossentropy": 2.7274543046951294, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.169148251414299, + "step": 22852 + }, + { + "epoch": 0.7141875, + "grad_norm": 2.875, + "grad_norm_var": 0.024095662434895835, + "learning_rate": 0.0001, + "loss": 5.6623, + "loss/crossentropy": 2.647777795791626, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15730953961610794, + "step": 22854 + }, + { + "epoch": 0.71425, + "grad_norm": 2.875, + "grad_norm_var": 0.026253255208333333, + "learning_rate": 0.0001, + "loss": 5.7908, + "loss/crossentropy": 2.6484906673431396, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1654057577252388, + "step": 22856 + }, + { + "epoch": 0.7143125, + "grad_norm": 3.15625, + "grad_norm_var": 0.034163411458333334, + "learning_rate": 0.0001, + "loss": 5.9103, + "loss/crossentropy": 2.688157796859741, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17572540789842606, + "step": 22858 + }, + { + "epoch": 0.714375, + "grad_norm": 3.125, + "grad_norm_var": 0.03681233723958333, + "learning_rate": 0.0001, + "loss": 6.0367, + "loss/crossentropy": 2.717068076133728, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.18313800543546677, + "step": 22860 + }, + { + "epoch": 0.7144375, + "grad_norm": 3.015625, + "grad_norm_var": 0.0355133056640625, + "learning_rate": 0.0001, + "loss": 5.9411, + "loss/crossentropy": 2.7882550954818726, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.169186569750309, + "step": 22862 + }, + { + "epoch": 0.7145, + "grad_norm": 2.8125, + "grad_norm_var": 0.041966756184895836, + "learning_rate": 0.0001, + "loss": 5.2297, + "loss/crossentropy": 2.3523730039596558, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.1478886529803276, + "step": 22864 + }, + { + "epoch": 0.7145625, + "grad_norm": 2.984375, + "grad_norm_var": 0.03498433430989583, + "learning_rate": 0.0001, + "loss": 6.05, + "loss/crossentropy": 2.833707571029663, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17592568695545197, + "step": 22866 + }, + { + "epoch": 0.714625, + "grad_norm": 3.15625, + "grad_norm_var": 0.038525390625, + "learning_rate": 0.0001, + "loss": 5.6687, + "loss/crossentropy": 2.6187517642974854, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15929661691188812, + "step": 22868 + }, + { + "epoch": 0.7146875, + "grad_norm": 3.25, + "grad_norm_var": 0.03892822265625, + "learning_rate": 0.0001, + "loss": 5.7494, + "loss/crossentropy": 2.5784850120544434, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.17217203229665756, + "step": 22870 + }, + { + "epoch": 0.71475, + "grad_norm": 2.984375, + "grad_norm_var": 0.0366363525390625, + "learning_rate": 0.0001, + "loss": 5.631, + "loss/crossentropy": 2.5323336124420166, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16182377189397812, + "step": 22872 + }, + { + "epoch": 0.7148125, + "grad_norm": 3.0, + "grad_norm_var": 0.031473795572916664, + "learning_rate": 0.0001, + "loss": 5.802, + "loss/crossentropy": 2.5953755378723145, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17105583101511002, + "step": 22874 + }, + { + "epoch": 0.714875, + "grad_norm": 3.1875, + "grad_norm_var": 0.021239217122395834, + "learning_rate": 0.0001, + "loss": 5.8375, + "loss/crossentropy": 2.65300977230072, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17118091881275177, + "step": 22876 + }, + { + "epoch": 0.7149375, + "grad_norm": 3.1875, + "grad_norm_var": 0.0223052978515625, + "learning_rate": 0.0001, + "loss": 5.865, + "loss/crossentropy": 2.6619762182235718, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17108601331710815, + "step": 22878 + }, + { + "epoch": 0.715, + "grad_norm": 2.96875, + "grad_norm_var": 0.018355305989583334, + "learning_rate": 0.0001, + "loss": 5.8736, + "loss/crossentropy": 2.695634603500366, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17130812257528305, + "step": 22880 + }, + { + "epoch": 0.7150625, + "grad_norm": 3.046875, + "grad_norm_var": 0.0179351806640625, + "learning_rate": 0.0001, + "loss": 5.5798, + "loss/crossentropy": 2.54266095161438, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16034962981939316, + "step": 22882 + }, + { + "epoch": 0.715125, + "grad_norm": 3.375, + "grad_norm_var": 0.019701131184895835, + "learning_rate": 0.0001, + "loss": 5.4497, + "loss/crossentropy": 2.444697618484497, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15636342763900757, + "step": 22884 + }, + { + "epoch": 0.7151875, + "grad_norm": 3.25, + "grad_norm_var": 0.017769368489583333, + "learning_rate": 0.0001, + "loss": 5.6857, + "loss/crossentropy": 2.6012312173843384, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16509142518043518, + "step": 22886 + }, + { + "epoch": 0.71525, + "grad_norm": 2.921875, + "grad_norm_var": 0.019091796875, + "learning_rate": 0.0001, + "loss": 5.6813, + "loss/crossentropy": 2.6111589670181274, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.16599716246128082, + "step": 22888 + }, + { + "epoch": 0.7153125, + "grad_norm": 3.234375, + "grad_norm_var": 0.017210896809895834, + "learning_rate": 0.0001, + "loss": 5.7435, + "loss/crossentropy": 2.613040566444397, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1657794564962387, + "step": 22890 + }, + { + "epoch": 0.715375, + "grad_norm": 3.421875, + "grad_norm_var": 0.022261555989583334, + "learning_rate": 0.0001, + "loss": 5.8041, + "loss/crossentropy": 2.6595606803894043, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16718725115060806, + "step": 22892 + }, + { + "epoch": 0.7154375, + "grad_norm": 3.25, + "grad_norm_var": 0.022077433268229165, + "learning_rate": 0.0001, + "loss": 6.1265, + "loss/crossentropy": 2.8601614236831665, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17545753717422485, + "step": 22894 + }, + { + "epoch": 0.7155, + "grad_norm": 3.28125, + "grad_norm_var": 0.024681599934895833, + "learning_rate": 0.0001, + "loss": 5.5148, + "loss/crossentropy": 2.471987009048462, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16092655807733536, + "step": 22896 + }, + { + "epoch": 0.7155625, + "grad_norm": 3.0625, + "grad_norm_var": 0.024168904622395834, + "learning_rate": 0.0001, + "loss": 5.6747, + "loss/crossentropy": 2.5920382738113403, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16412097215652466, + "step": 22898 + }, + { + "epoch": 0.715625, + "grad_norm": 3.234375, + "grad_norm_var": 0.018993123372395834, + "learning_rate": 0.0001, + "loss": 5.2929, + "loss/crossentropy": 2.325380504131317, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1479194536805153, + "step": 22900 + }, + { + "epoch": 0.7156875, + "grad_norm": 3.09375, + "grad_norm_var": 0.019075520833333335, + "learning_rate": 0.0001, + "loss": 5.4117, + "loss/crossentropy": 2.3158397674560547, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16075515002012253, + "step": 22902 + }, + { + "epoch": 0.71575, + "grad_norm": 2.984375, + "grad_norm_var": 0.0173492431640625, + "learning_rate": 0.0001, + "loss": 5.7636, + "loss/crossentropy": 2.6401582956314087, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16586346924304962, + "step": 22904 + }, + { + "epoch": 0.7158125, + "grad_norm": 2.921875, + "grad_norm_var": 0.020018513997395834, + "learning_rate": 0.0001, + "loss": 5.3527, + "loss/crossentropy": 2.3282387256622314, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1579195261001587, + "step": 22906 + }, + { + "epoch": 0.715875, + "grad_norm": 2.96875, + "grad_norm_var": 0.01734619140625, + "learning_rate": 0.0001, + "loss": 5.646, + "loss/crossentropy": 2.5214792490005493, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1651904061436653, + "step": 22908 + }, + { + "epoch": 0.7159375, + "grad_norm": 3.15625, + "grad_norm_var": 0.01617431640625, + "learning_rate": 0.0001, + "loss": 5.3251, + "loss/crossentropy": 2.353050470352173, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1550159454345703, + "step": 22910 + }, + { + "epoch": 0.716, + "grad_norm": 2.84375, + "grad_norm_var": 0.016402180989583334, + "learning_rate": 0.0001, + "loss": 5.5424, + "loss/crossentropy": 2.4919172525405884, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1605149209499359, + "step": 22912 + }, + { + "epoch": 0.7160625, + "grad_norm": 3.015625, + "grad_norm_var": 0.018040974934895832, + "learning_rate": 0.0001, + "loss": 5.8854, + "loss/crossentropy": 2.7260115146636963, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16827943176031113, + "step": 22914 + }, + { + "epoch": 0.716125, + "grad_norm": 3.0625, + "grad_norm_var": 0.016014607747395833, + "learning_rate": 0.0001, + "loss": 5.7663, + "loss/crossentropy": 2.635956048965454, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16811296343803406, + "step": 22916 + }, + { + "epoch": 0.7161875, + "grad_norm": 2.96875, + "grad_norm_var": 0.016792805989583333, + "learning_rate": 0.0001, + "loss": 5.7874, + "loss/crossentropy": 2.626030683517456, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1669207662343979, + "step": 22918 + }, + { + "epoch": 0.71625, + "grad_norm": 2.984375, + "grad_norm_var": 0.017064412434895832, + "learning_rate": 0.0001, + "loss": 5.6213, + "loss/crossentropy": 2.505921483039856, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16231827437877655, + "step": 22920 + }, + { + "epoch": 0.7163125, + "grad_norm": 2.859375, + "grad_norm_var": 0.0242340087890625, + "learning_rate": 0.0001, + "loss": 5.1162, + "loss/crossentropy": 2.253924250602722, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.14442609250545502, + "step": 22922 + }, + { + "epoch": 0.716375, + "grad_norm": 3.0625, + "grad_norm_var": 0.0188385009765625, + "learning_rate": 0.0001, + "loss": 5.6083, + "loss/crossentropy": 2.535650134086609, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16117332875728607, + "step": 22924 + }, + { + "epoch": 0.7164375, + "grad_norm": 3.09375, + "grad_norm_var": 0.0178863525390625, + "learning_rate": 0.0001, + "loss": 5.5294, + "loss/crossentropy": 2.52425217628479, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.16066953539848328, + "step": 22926 + }, + { + "epoch": 0.7165, + "grad_norm": 3.390625, + "grad_norm_var": 0.0256256103515625, + "learning_rate": 0.0001, + "loss": 5.922, + "loss/crossentropy": 2.7153269052505493, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17340506613254547, + "step": 22928 + }, + { + "epoch": 0.7165625, + "grad_norm": 3.140625, + "grad_norm_var": 0.024950154622395835, + "learning_rate": 0.0001, + "loss": 5.3528, + "loss/crossentropy": 2.3147772550582886, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16043871641159058, + "step": 22930 + }, + { + "epoch": 0.716625, + "grad_norm": 2.9375, + "grad_norm_var": 0.026200358072916666, + "learning_rate": 0.0001, + "loss": 5.3001, + "loss/crossentropy": 2.3875588178634644, + "loss/hidden": 1.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.15180308371782303, + "step": 22932 + }, + { + "epoch": 0.7166875, + "grad_norm": 2.71875, + "grad_norm_var": 0.03156636555989583, + "learning_rate": 0.0001, + "loss": 5.4529, + "loss/crossentropy": 2.507371425628662, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15080364048480988, + "step": 22934 + }, + { + "epoch": 0.71675, + "grad_norm": 3.0625, + "grad_norm_var": 0.0298492431640625, + "learning_rate": 0.0001, + "loss": 5.6018, + "loss/crossentropy": 2.554092526435852, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1586805135011673, + "step": 22936 + }, + { + "epoch": 0.7168125, + "grad_norm": 3.046875, + "grad_norm_var": 0.027392578125, + "learning_rate": 0.0001, + "loss": 5.5521, + "loss/crossentropy": 2.3645232915878296, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16602084040641785, + "step": 22938 + }, + { + "epoch": 0.716875, + "grad_norm": 3.0, + "grad_norm_var": 0.027757771809895835, + "learning_rate": 0.0001, + "loss": 5.5889, + "loss/crossentropy": 2.5736584663391113, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15855270624160767, + "step": 22940 + }, + { + "epoch": 0.7169375, + "grad_norm": 2.921875, + "grad_norm_var": 0.028180948893229165, + "learning_rate": 0.0001, + "loss": 5.7128, + "loss/crossentropy": 2.663956642150879, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15801136195659637, + "step": 22942 + }, + { + "epoch": 0.717, + "grad_norm": 2.65625, + "grad_norm_var": 0.025389607747395834, + "learning_rate": 0.0001, + "loss": 5.381, + "loss/crossentropy": 2.4947913885116577, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.14252623170614243, + "step": 22944 + }, + { + "epoch": 0.7170625, + "grad_norm": 2.75, + "grad_norm_var": 0.024409993489583334, + "learning_rate": 0.0001, + "loss": 5.5959, + "loss/crossentropy": 2.5649075508117676, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15934739261865616, + "step": 22946 + }, + { + "epoch": 0.717125, + "grad_norm": 3.5, + "grad_norm_var": 0.04482320149739583, + "learning_rate": 0.0001, + "loss": 5.7552, + "loss/crossentropy": 2.581146717071533, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1689656898379326, + "step": 22948 + }, + { + "epoch": 0.7171875, + "grad_norm": 2.90625, + "grad_norm_var": 0.04300130208333333, + "learning_rate": 0.0001, + "loss": 5.6355, + "loss/crossentropy": 2.5125614404678345, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16658750921487808, + "step": 22950 + }, + { + "epoch": 0.71725, + "grad_norm": 2.953125, + "grad_norm_var": 0.04597066243489583, + "learning_rate": 0.0001, + "loss": 5.5647, + "loss/crossentropy": 2.460248827934265, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16161371022462845, + "step": 22952 + }, + { + "epoch": 0.7173125, + "grad_norm": 3.0, + "grad_norm_var": 0.03937886555989583, + "learning_rate": 0.0001, + "loss": 5.6261, + "loss/crossentropy": 2.554978609085083, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16024113446474075, + "step": 22954 + }, + { + "epoch": 0.717375, + "grad_norm": 2.75, + "grad_norm_var": 0.043257649739583334, + "learning_rate": 0.0001, + "loss": 5.5287, + "loss/crossentropy": 2.538967490196228, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1560032069683075, + "step": 22956 + }, + { + "epoch": 0.7174375, + "grad_norm": 3.53125, + "grad_norm_var": 0.06214192708333333, + "learning_rate": 0.0001, + "loss": 5.5909, + "loss/crossentropy": 2.4516427516937256, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16470374912023544, + "step": 22958 + }, + { + "epoch": 0.7175, + "grad_norm": 3.25, + "grad_norm_var": 0.07893778483072916, + "learning_rate": 0.0001, + "loss": 5.7601, + "loss/crossentropy": 2.5798234939575195, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16998391598463058, + "step": 22960 + }, + { + "epoch": 0.7175625, + "grad_norm": 3.390625, + "grad_norm_var": 0.07330729166666666, + "learning_rate": 0.0001, + "loss": 5.9709, + "loss/crossentropy": 2.748919129371643, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1749340146780014, + "step": 22962 + }, + { + "epoch": 0.717625, + "grad_norm": 3.265625, + "grad_norm_var": 0.06301981608072917, + "learning_rate": 0.0001, + "loss": 5.6071, + "loss/crossentropy": 2.4429022073745728, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.16290469467639923, + "step": 22964 + }, + { + "epoch": 0.7176875, + "grad_norm": 2.890625, + "grad_norm_var": 0.0654937744140625, + "learning_rate": 0.0001, + "loss": 5.578, + "loss/crossentropy": 2.533684253692627, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16028668731451035, + "step": 22966 + }, + { + "epoch": 0.71775, + "grad_norm": 3.15625, + "grad_norm_var": 0.0646392822265625, + "learning_rate": 0.0001, + "loss": 5.2242, + "loss/crossentropy": 2.1830204725265503, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.15451082587242126, + "step": 22968 + }, + { + "epoch": 0.7178125, + "grad_norm": 2.890625, + "grad_norm_var": 0.0666015625, + "learning_rate": 0.0001, + "loss": 5.8123, + "loss/crossentropy": 2.733540177345276, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16178182512521744, + "step": 22970 + }, + { + "epoch": 0.717875, + "grad_norm": 2.875, + "grad_norm_var": 0.059845987955729166, + "learning_rate": 0.0001, + "loss": 5.5909, + "loss/crossentropy": 2.5439876317977905, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15820924937725067, + "step": 22972 + }, + { + "epoch": 0.7179375, + "grad_norm": 2.84375, + "grad_norm_var": 0.059178670247395836, + "learning_rate": 0.0001, + "loss": 5.5805, + "loss/crossentropy": 2.587586283683777, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15671729296445847, + "step": 22974 + }, + { + "epoch": 0.718, + "grad_norm": 3.0625, + "grad_norm_var": 0.04228413899739583, + "learning_rate": 0.0001, + "loss": 5.5181, + "loss/crossentropy": 2.4439518451690674, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16131775826215744, + "step": 22976 + }, + { + "epoch": 0.7180625, + "grad_norm": 3.078125, + "grad_norm_var": 0.0376861572265625, + "learning_rate": 0.0001, + "loss": 5.7114, + "loss/crossentropy": 2.6640597581863403, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15785618126392365, + "step": 22978 + }, + { + "epoch": 0.718125, + "grad_norm": 3.34375, + "grad_norm_var": 0.13805338541666667, + "learning_rate": 0.0001, + "loss": 6.0807, + "loss/crossentropy": 2.655805468559265, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18897271901369095, + "step": 22980 + }, + { + "epoch": 0.7181875, + "grad_norm": 3.21875, + "grad_norm_var": 0.1283355712890625, + "learning_rate": 0.0001, + "loss": 5.8266, + "loss/crossentropy": 2.6418944597244263, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16768986731767654, + "step": 22982 + }, + { + "epoch": 0.71825, + "grad_norm": 3.453125, + "grad_norm_var": 0.13191731770833334, + "learning_rate": 0.0001, + "loss": 5.8889, + "loss/crossentropy": 2.703049659729004, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16819187998771667, + "step": 22984 + }, + { + "epoch": 0.7183125, + "grad_norm": 3.515625, + "grad_norm_var": 0.12906901041666666, + "learning_rate": 0.0001, + "loss": 5.7941, + "loss/crossentropy": 2.600367307662964, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1717212051153183, + "step": 22986 + }, + { + "epoch": 0.718375, + "grad_norm": 3.34375, + "grad_norm_var": 0.12375895182291667, + "learning_rate": 0.0001, + "loss": 5.7472, + "loss/crossentropy": 2.6864311695098877, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15920286625623703, + "step": 22988 + }, + { + "epoch": 0.7184375, + "grad_norm": 3.09375, + "grad_norm_var": 0.11237691243489584, + "learning_rate": 0.0001, + "loss": 5.5005, + "loss/crossentropy": 2.42072856426239, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15992847084999084, + "step": 22990 + }, + { + "epoch": 0.7185, + "grad_norm": 3.234375, + "grad_norm_var": 0.11028645833333334, + "learning_rate": 0.0001, + "loss": 5.8857, + "loss/crossentropy": 2.6808758974075317, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17243708670139313, + "step": 22992 + }, + { + "epoch": 0.7185625, + "grad_norm": 3.234375, + "grad_norm_var": 0.14885965983072916, + "learning_rate": 0.0001, + "loss": 5.6984, + "loss/crossentropy": 2.490253210067749, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17159316688776016, + "step": 22994 + }, + { + "epoch": 0.718625, + "grad_norm": 2.921875, + "grad_norm_var": 0.09265848795572916, + "learning_rate": 0.0001, + "loss": 5.4208, + "loss/crossentropy": 2.4393712282180786, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15009599179029465, + "step": 22996 + }, + { + "epoch": 0.7186875, + "grad_norm": 3.0, + "grad_norm_var": 0.0980377197265625, + "learning_rate": 0.0001, + "loss": 5.4572, + "loss/crossentropy": 2.387829542160034, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16162166744470596, + "step": 22998 + }, + { + "epoch": 0.71875, + "grad_norm": 3.59375, + "grad_norm_var": 0.102587890625, + "learning_rate": 0.0001, + "loss": 6.041, + "loss/crossentropy": 2.757057785987854, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17761504650115967, + "step": 23000 + }, + { + "epoch": 0.7188125, + "grad_norm": 3.03125, + "grad_norm_var": 0.09999898274739584, + "learning_rate": 0.0001, + "loss": 5.6528, + "loss/crossentropy": 2.550535798072815, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16374579071998596, + "step": 23002 + }, + { + "epoch": 0.718875, + "grad_norm": 2.90625, + "grad_norm_var": 0.10453999837239583, + "learning_rate": 0.0001, + "loss": 5.5285, + "loss/crossentropy": 2.5141395330429077, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.158076211810112, + "step": 23004 + }, + { + "epoch": 0.7189375, + "grad_norm": 3.15625, + "grad_norm_var": 0.11061909993489584, + "learning_rate": 0.0001, + "loss": 5.5961, + "loss/crossentropy": 2.559632420539856, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16106736660003662, + "step": 23006 + }, + { + "epoch": 0.719, + "grad_norm": 3.3125, + "grad_norm_var": 0.11005859375, + "learning_rate": 0.0001, + "loss": 5.6422, + "loss/crossentropy": 2.5025556087493896, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16591913998126984, + "step": 23008 + }, + { + "epoch": 0.7190625, + "grad_norm": 3.078125, + "grad_norm_var": 0.045441691080729166, + "learning_rate": 0.0001, + "loss": 5.7458, + "loss/crossentropy": 2.616849422454834, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16602325439453125, + "step": 23010 + }, + { + "epoch": 0.719125, + "grad_norm": 3.046875, + "grad_norm_var": 0.039774576822916664, + "learning_rate": 0.0001, + "loss": 5.6429, + "loss/crossentropy": 2.5550752878189087, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16229917109012604, + "step": 23012 + }, + { + "epoch": 0.7191875, + "grad_norm": 3.234375, + "grad_norm_var": 0.033854166666666664, + "learning_rate": 0.0001, + "loss": 5.9789, + "loss/crossentropy": 2.7079319953918457, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17319077253341675, + "step": 23014 + }, + { + "epoch": 0.71925, + "grad_norm": 3.140625, + "grad_norm_var": 0.015501912434895833, + "learning_rate": 0.0001, + "loss": 5.7121, + "loss/crossentropy": 2.5766741037368774, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1705784946680069, + "step": 23016 + }, + { + "epoch": 0.7193125, + "grad_norm": 3.15625, + "grad_norm_var": 0.014557902018229167, + "learning_rate": 0.0001, + "loss": 5.5817, + "loss/crossentropy": 2.5128093957901, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1623556911945343, + "step": 23018 + }, + { + "epoch": 0.719375, + "grad_norm": 3.546875, + "grad_norm_var": 0.025047810872395833, + "learning_rate": 0.0001, + "loss": 5.7667, + "loss/crossentropy": 2.5243011713027954, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1718970388174057, + "step": 23020 + }, + { + "epoch": 0.7194375, + "grad_norm": 3.03125, + "grad_norm_var": 0.024290974934895834, + "learning_rate": 0.0001, + "loss": 5.6906, + "loss/crossentropy": 2.5249842405319214, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16655787080526352, + "step": 23022 + }, + { + "epoch": 0.7195, + "grad_norm": 3.171875, + "grad_norm_var": 0.023656209309895832, + "learning_rate": 0.0001, + "loss": 5.4646, + "loss/crossentropy": 2.458473801612854, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15803193300962448, + "step": 23024 + }, + { + "epoch": 0.7195625, + "grad_norm": 3.25, + "grad_norm_var": 0.03250325520833333, + "learning_rate": 0.0001, + "loss": 5.8432, + "loss/crossentropy": 2.659113883972168, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16996963322162628, + "step": 23026 + }, + { + "epoch": 0.719625, + "grad_norm": 2.96875, + "grad_norm_var": 0.0338287353515625, + "learning_rate": 0.0001, + "loss": 5.2914, + "loss/crossentropy": 2.321761965751648, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15204261988401413, + "step": 23028 + }, + { + "epoch": 0.7196875, + "grad_norm": 3.140625, + "grad_norm_var": 0.03336181640625, + "learning_rate": 0.0001, + "loss": 6.0239, + "loss/crossentropy": 2.8034214973449707, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17204715311527252, + "step": 23030 + }, + { + "epoch": 0.71975, + "grad_norm": 2.921875, + "grad_norm_var": 0.04973958333333333, + "learning_rate": 0.0001, + "loss": 5.131, + "loss/crossentropy": 2.263086676597595, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.14381912350654602, + "step": 23032 + }, + { + "epoch": 0.7198125, + "grad_norm": 3.296875, + "grad_norm_var": 0.0476226806640625, + "learning_rate": 0.0001, + "loss": 5.7328, + "loss/crossentropy": 2.5711982250213623, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16967156529426575, + "step": 23034 + }, + { + "epoch": 0.719875, + "grad_norm": 3.125, + "grad_norm_var": 0.038671875, + "learning_rate": 0.0001, + "loss": 6.0364, + "loss/crossentropy": 2.778636336326599, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17655359208583832, + "step": 23036 + }, + { + "epoch": 0.7199375, + "grad_norm": 3.25, + "grad_norm_var": 0.036962890625, + "learning_rate": 0.0001, + "loss": 5.7919, + "loss/crossentropy": 2.6048576831817627, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16909153014421463, + "step": 23038 + }, + { + "epoch": 0.72, + "grad_norm": 3.25, + "grad_norm_var": 0.039892578125, + "learning_rate": 0.0001, + "loss": 5.7048, + "loss/crossentropy": 2.5766351222991943, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1667202264070511, + "step": 23040 + }, + { + "epoch": 0.7200625, + "grad_norm": 3.453125, + "grad_norm_var": 0.03515625, + "learning_rate": 0.0001, + "loss": 5.8284, + "loss/crossentropy": 2.6341872215270996, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1705925315618515, + "step": 23042 + }, + { + "epoch": 0.720125, + "grad_norm": 3.15625, + "grad_norm_var": 0.033356730143229166, + "learning_rate": 0.0001, + "loss": 5.7009, + "loss/crossentropy": 2.5701510906219482, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16073225438594818, + "step": 23044 + }, + { + "epoch": 0.7201875, + "grad_norm": 3.21875, + "grad_norm_var": 0.0351226806640625, + "learning_rate": 0.0001, + "loss": 5.5986, + "loss/crossentropy": 2.5418559312820435, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1607527956366539, + "step": 23046 + }, + { + "epoch": 0.72025, + "grad_norm": 3.5, + "grad_norm_var": 0.035986328125, + "learning_rate": 0.0001, + "loss": 5.6231, + "loss/crossentropy": 2.5080316066741943, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16580697894096375, + "step": 23048 + }, + { + "epoch": 0.7203125, + "grad_norm": 2.984375, + "grad_norm_var": 0.036454264322916666, + "learning_rate": 0.0001, + "loss": 5.7042, + "loss/crossentropy": 2.5495556592941284, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16663891077041626, + "step": 23050 + }, + { + "epoch": 0.720375, + "grad_norm": 3.125, + "grad_norm_var": 0.03889872233072917, + "learning_rate": 0.0001, + "loss": 5.3949, + "loss/crossentropy": 2.3359018564224243, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16019276529550552, + "step": 23052 + }, + { + "epoch": 0.7204375, + "grad_norm": 3.0625, + "grad_norm_var": 0.04053446451822917, + "learning_rate": 0.0001, + "loss": 5.4801, + "loss/crossentropy": 2.403424859046936, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16079019010066986, + "step": 23054 + }, + { + "epoch": 0.7205, + "grad_norm": 3.1875, + "grad_norm_var": 0.0397125244140625, + "learning_rate": 0.0001, + "loss": 5.8981, + "loss/crossentropy": 2.72353732585907, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17019527405500412, + "step": 23056 + }, + { + "epoch": 0.7205625, + "grad_norm": 3.0, + "grad_norm_var": 0.033040364583333336, + "learning_rate": 0.0001, + "loss": 5.618, + "loss/crossentropy": 2.494266986846924, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1670648530125618, + "step": 23058 + }, + { + "epoch": 0.720625, + "grad_norm": 2.921875, + "grad_norm_var": 0.03543294270833333, + "learning_rate": 0.0001, + "loss": 5.8462, + "loss/crossentropy": 2.6992437839508057, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16703598201274872, + "step": 23060 + }, + { + "epoch": 0.7206875, + "grad_norm": 2.890625, + "grad_norm_var": 0.035033162434895834, + "learning_rate": 0.0001, + "loss": 5.5348, + "loss/crossentropy": 2.49991774559021, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15505250543355942, + "step": 23062 + }, + { + "epoch": 0.72075, + "grad_norm": 3.453125, + "grad_norm_var": 0.027684529622395832, + "learning_rate": 0.0001, + "loss": 5.797, + "loss/crossentropy": 2.56563663482666, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1754772886633873, + "step": 23064 + }, + { + "epoch": 0.7208125, + "grad_norm": 3.015625, + "grad_norm_var": 0.027033487955729168, + "learning_rate": 0.0001, + "loss": 5.7839, + "loss/crossentropy": 2.6267071962356567, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1696234866976738, + "step": 23066 + }, + { + "epoch": 0.720875, + "grad_norm": 3.1875, + "grad_norm_var": 0.024625651041666665, + "learning_rate": 0.0001, + "loss": 5.5554, + "loss/crossentropy": 2.4868232011795044, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15998029708862305, + "step": 23068 + }, + { + "epoch": 0.7209375, + "grad_norm": 3.109375, + "grad_norm_var": 0.023639933268229166, + "learning_rate": 0.0001, + "loss": 5.5957, + "loss/crossentropy": 2.4670718908309937, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16755016148090363, + "step": 23070 + }, + { + "epoch": 0.721, + "grad_norm": 3.296875, + "grad_norm_var": 0.07302144368489584, + "learning_rate": 0.0001, + "loss": 5.7273, + "loss/crossentropy": 2.5400205850601196, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1687304750084877, + "step": 23072 + }, + { + "epoch": 0.7210625, + "grad_norm": 3.203125, + "grad_norm_var": 0.08131103515625, + "learning_rate": 0.0001, + "loss": 5.4401, + "loss/crossentropy": 2.464422821998596, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.15577441453933716, + "step": 23074 + }, + { + "epoch": 0.721125, + "grad_norm": 3.4375, + "grad_norm_var": 0.08234049479166666, + "learning_rate": 0.0001, + "loss": 5.5466, + "loss/crossentropy": 2.4490153789520264, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1640603244304657, + "step": 23076 + }, + { + "epoch": 0.7211875, + "grad_norm": 3.5, + "grad_norm_var": 0.08513895670572917, + "learning_rate": 0.0001, + "loss": 5.6498, + "loss/crossentropy": 2.5793803930282593, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16094836592674255, + "step": 23078 + }, + { + "epoch": 0.72125, + "grad_norm": 3.203125, + "grad_norm_var": 0.08606669108072916, + "learning_rate": 0.0001, + "loss": 5.512, + "loss/crossentropy": 2.5071429014205933, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15517634898424149, + "step": 23080 + }, + { + "epoch": 0.7213125, + "grad_norm": 3.328125, + "grad_norm_var": 0.08509012858072916, + "learning_rate": 0.0001, + "loss": 5.8955, + "loss/crossentropy": 2.6934951543807983, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1713682785630226, + "step": 23082 + }, + { + "epoch": 0.721375, + "grad_norm": 3.40625, + "grad_norm_var": 0.0873687744140625, + "learning_rate": 0.0001, + "loss": 5.7627, + "loss/crossentropy": 2.5480767488479614, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.16755439341068268, + "step": 23084 + }, + { + "epoch": 0.7214375, + "grad_norm": 3.125, + "grad_norm_var": 0.09079488118489583, + "learning_rate": 0.0001, + "loss": 5.6522, + "loss/crossentropy": 2.550952911376953, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1648128256201744, + "step": 23086 + }, + { + "epoch": 0.7215, + "grad_norm": 3.0625, + "grad_norm_var": 0.041259765625, + "learning_rate": 0.0001, + "loss": 5.6769, + "loss/crossentropy": 2.6425377130508423, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16124935448169708, + "step": 23088 + }, + { + "epoch": 0.7215625, + "grad_norm": 2.890625, + "grad_norm_var": 0.038117472330729166, + "learning_rate": 0.0001, + "loss": 5.2548, + "loss/crossentropy": 2.326769709587097, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.14905112981796265, + "step": 23090 + }, + { + "epoch": 0.721625, + "grad_norm": 3.296875, + "grad_norm_var": 0.03504130045572917, + "learning_rate": 0.0001, + "loss": 5.8017, + "loss/crossentropy": 2.5980507135391235, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17114394903182983, + "step": 23092 + }, + { + "epoch": 0.7216875, + "grad_norm": 2.96875, + "grad_norm_var": 0.029173787434895834, + "learning_rate": 0.0001, + "loss": 5.8539, + "loss/crossentropy": 2.746086359024048, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16351471841335297, + "step": 23094 + }, + { + "epoch": 0.72175, + "grad_norm": 2.8125, + "grad_norm_var": 0.0394195556640625, + "learning_rate": 0.0001, + "loss": 5.6244, + "loss/crossentropy": 2.5009769201278687, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16703161597251892, + "step": 23096 + }, + { + "epoch": 0.7218125, + "grad_norm": 3.203125, + "grad_norm_var": 0.03814697265625, + "learning_rate": 0.0001, + "loss": 5.6159, + "loss/crossentropy": 2.5001531839370728, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16547824442386627, + "step": 23098 + }, + { + "epoch": 0.721875, + "grad_norm": 3.640625, + "grad_norm_var": 0.05012105305989583, + "learning_rate": 0.0001, + "loss": 6.0574, + "loss/crossentropy": 2.8432135581970215, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.17649948596954346, + "step": 23100 + }, + { + "epoch": 0.7219375, + "grad_norm": 3.09375, + "grad_norm_var": 0.048502604166666664, + "learning_rate": 0.0001, + "loss": 5.727, + "loss/crossentropy": 2.6021103858947754, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16678505390882492, + "step": 23102 + }, + { + "epoch": 0.722, + "grad_norm": 3.359375, + "grad_norm_var": 0.051102701822916666, + "learning_rate": 0.0001, + "loss": 5.8847, + "loss/crossentropy": 2.7236799001693726, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16610267758369446, + "step": 23104 + }, + { + "epoch": 0.7220625, + "grad_norm": 3.390625, + "grad_norm_var": 0.0484283447265625, + "learning_rate": 0.0001, + "loss": 5.8858, + "loss/crossentropy": 2.6859112977981567, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17311672121286392, + "step": 23106 + }, + { + "epoch": 0.722125, + "grad_norm": 3.234375, + "grad_norm_var": 0.05194905598958333, + "learning_rate": 0.0001, + "loss": 5.4255, + "loss/crossentropy": 2.386235475540161, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16017398983240128, + "step": 23108 + }, + { + "epoch": 0.7221875, + "grad_norm": 2.921875, + "grad_norm_var": 0.05156148274739583, + "learning_rate": 0.0001, + "loss": 5.7212, + "loss/crossentropy": 2.6586928367614746, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15937219560146332, + "step": 23110 + }, + { + "epoch": 0.72225, + "grad_norm": 3.3125, + "grad_norm_var": 0.042496744791666666, + "learning_rate": 0.0001, + "loss": 5.9033, + "loss/crossentropy": 2.7014816999435425, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17252995818853378, + "step": 23112 + }, + { + "epoch": 0.7223125, + "grad_norm": 3.296875, + "grad_norm_var": 0.04475809733072917, + "learning_rate": 0.0001, + "loss": 5.7803, + "loss/crossentropy": 2.649962306022644, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16420765966176987, + "step": 23114 + }, + { + "epoch": 0.722375, + "grad_norm": 3.046875, + "grad_norm_var": 0.030256144205729165, + "learning_rate": 0.0001, + "loss": 5.8306, + "loss/crossentropy": 2.587769389152527, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1754559651017189, + "step": 23116 + }, + { + "epoch": 0.7224375, + "grad_norm": 2.84375, + "grad_norm_var": 0.035546875, + "learning_rate": 0.0001, + "loss": 5.527, + "loss/crossentropy": 2.4696303606033325, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16003839671611786, + "step": 23118 + }, + { + "epoch": 0.7225, + "grad_norm": 3.03125, + "grad_norm_var": 0.03193359375, + "learning_rate": 0.0001, + "loss": 5.7403, + "loss/crossentropy": 2.585333228111267, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16783996671438217, + "step": 23120 + }, + { + "epoch": 0.7225625, + "grad_norm": 2.828125, + "grad_norm_var": 0.030224609375, + "learning_rate": 0.0001, + "loss": 5.2633, + "loss/crossentropy": 2.360242009162903, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.14812248200178146, + "step": 23122 + }, + { + "epoch": 0.722625, + "grad_norm": 3.09375, + "grad_norm_var": 0.02691650390625, + "learning_rate": 0.0001, + "loss": 5.5555, + "loss/crossentropy": 2.5515612363815308, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1554725095629692, + "step": 23124 + }, + { + "epoch": 0.7226875, + "grad_norm": 2.859375, + "grad_norm_var": 0.0286041259765625, + "learning_rate": 0.0001, + "loss": 5.5207, + "loss/crossentropy": 2.4835323095321655, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.16270030289888382, + "step": 23126 + }, + { + "epoch": 0.72275, + "grad_norm": 3.109375, + "grad_norm_var": 0.024934895833333335, + "learning_rate": 0.0001, + "loss": 5.8374, + "loss/crossentropy": 2.613251566886902, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17202463746070862, + "step": 23128 + }, + { + "epoch": 0.7228125, + "grad_norm": 3.265625, + "grad_norm_var": 0.0234771728515625, + "learning_rate": 0.0001, + "loss": 5.8513, + "loss/crossentropy": 2.652708888053894, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17063865065574646, + "step": 23130 + }, + { + "epoch": 0.722875, + "grad_norm": 3.78125, + "grad_norm_var": 0.07051493326822916, + "learning_rate": 0.0001, + "loss": 6.1522, + "loss/crossentropy": 2.759737491607666, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18807834386825562, + "step": 23132 + }, + { + "epoch": 0.7229375, + "grad_norm": 3.03125, + "grad_norm_var": 0.06780598958333334, + "learning_rate": 0.0001, + "loss": 5.2282, + "loss/crossentropy": 2.3027509450912476, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.1519230529665947, + "step": 23134 + }, + { + "epoch": 0.723, + "grad_norm": 2.96875, + "grad_norm_var": 0.07200419108072917, + "learning_rate": 0.0001, + "loss": 5.4546, + "loss/crossentropy": 2.4205862283706665, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1569124236702919, + "step": 23136 + }, + { + "epoch": 0.7230625, + "grad_norm": 2.96875, + "grad_norm_var": 0.06336263020833334, + "learning_rate": 0.0001, + "loss": 5.4985, + "loss/crossentropy": 2.4439892768859863, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1570119559764862, + "step": 23138 + }, + { + "epoch": 0.723125, + "grad_norm": 2.96875, + "grad_norm_var": 0.06421610514322916, + "learning_rate": 0.0001, + "loss": 5.3426, + "loss/crossentropy": 2.3728432655334473, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15283336490392685, + "step": 23140 + }, + { + "epoch": 0.7231875, + "grad_norm": 2.96875, + "grad_norm_var": 0.06144917805989583, + "learning_rate": 0.0001, + "loss": 5.6772, + "loss/crossentropy": 2.6229491233825684, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.164021298289299, + "step": 23142 + }, + { + "epoch": 0.72325, + "grad_norm": 3.0, + "grad_norm_var": 0.06145426432291667, + "learning_rate": 0.0001, + "loss": 5.6777, + "loss/crossentropy": 2.547545075416565, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16887624561786652, + "step": 23144 + }, + { + "epoch": 0.7233125, + "grad_norm": 3.25, + "grad_norm_var": 0.0609527587890625, + "learning_rate": 0.0001, + "loss": 5.4967, + "loss/crossentropy": 2.3959068059921265, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1608622595667839, + "step": 23146 + }, + { + "epoch": 0.723375, + "grad_norm": 3.140625, + "grad_norm_var": 0.0107818603515625, + "learning_rate": 0.0001, + "loss": 5.9147, + "loss/crossentropy": 2.743956446647644, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1705934852361679, + "step": 23148 + }, + { + "epoch": 0.7234375, + "grad_norm": 2.90625, + "grad_norm_var": 0.01343994140625, + "learning_rate": 0.0001, + "loss": 5.49, + "loss/crossentropy": 2.5214651823043823, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1531044840812683, + "step": 23150 + }, + { + "epoch": 0.7235, + "grad_norm": 3.015625, + "grad_norm_var": 0.012581380208333333, + "learning_rate": 0.0001, + "loss": 5.5244, + "loss/crossentropy": 2.451177477836609, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1584925651550293, + "step": 23152 + }, + { + "epoch": 0.7235625, + "grad_norm": 3.140625, + "grad_norm_var": 0.0198638916015625, + "learning_rate": 0.0001, + "loss": 5.9048, + "loss/crossentropy": 2.7149475812911987, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17132603377103806, + "step": 23154 + }, + { + "epoch": 0.723625, + "grad_norm": 3.234375, + "grad_norm_var": 0.023827107747395833, + "learning_rate": 0.0001, + "loss": 5.7513, + "loss/crossentropy": 2.6011768579483032, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1665768250823021, + "step": 23156 + }, + { + "epoch": 0.7236875, + "grad_norm": 2.828125, + "grad_norm_var": 0.027864583333333335, + "learning_rate": 0.0001, + "loss": 5.705, + "loss/crossentropy": 2.6476725339889526, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1592516303062439, + "step": 23158 + }, + { + "epoch": 0.72375, + "grad_norm": 3.125, + "grad_norm_var": 0.028791300455729165, + "learning_rate": 0.0001, + "loss": 5.6737, + "loss/crossentropy": 2.561895489692688, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16157186031341553, + "step": 23160 + }, + { + "epoch": 0.7238125, + "grad_norm": 3.203125, + "grad_norm_var": 0.028547159830729165, + "learning_rate": 0.0001, + "loss": 5.5481, + "loss/crossentropy": 2.5043612718582153, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1610122099518776, + "step": 23162 + }, + { + "epoch": 0.723875, + "grad_norm": 3.1875, + "grad_norm_var": 0.030028279622395834, + "learning_rate": 0.0001, + "loss": 5.5372, + "loss/crossentropy": 2.4577651023864746, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1618458554148674, + "step": 23164 + }, + { + "epoch": 0.7239375, + "grad_norm": 3.25, + "grad_norm_var": 0.026463826497395832, + "learning_rate": 0.0001, + "loss": 5.8778, + "loss/crossentropy": 2.692238926887512, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17090285569429398, + "step": 23166 + }, + { + "epoch": 0.724, + "grad_norm": 3.484375, + "grad_norm_var": 0.03661702473958333, + "learning_rate": 0.0001, + "loss": 5.5408, + "loss/crossentropy": 2.447907328605652, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1631990671157837, + "step": 23168 + }, + { + "epoch": 0.7240625, + "grad_norm": 3.125, + "grad_norm_var": 0.03411051432291667, + "learning_rate": 0.0001, + "loss": 5.4933, + "loss/crossentropy": 2.3692781925201416, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1655312478542328, + "step": 23170 + }, + { + "epoch": 0.724125, + "grad_norm": 3.4375, + "grad_norm_var": 0.03964436848958333, + "learning_rate": 0.0001, + "loss": 5.6876, + "loss/crossentropy": 2.4974652528762817, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1705765798687935, + "step": 23172 + }, + { + "epoch": 0.7241875, + "grad_norm": 2.921875, + "grad_norm_var": 0.03624674479166667, + "learning_rate": 0.0001, + "loss": 5.4289, + "loss/crossentropy": 2.3903775215148926, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15892648696899414, + "step": 23174 + }, + { + "epoch": 0.72425, + "grad_norm": 3.109375, + "grad_norm_var": 0.03912353515625, + "learning_rate": 0.0001, + "loss": 5.6177, + "loss/crossentropy": 2.5819114446640015, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15670160949230194, + "step": 23176 + }, + { + "epoch": 0.7243125, + "grad_norm": 3.65625, + "grad_norm_var": 0.053766886393229164, + "learning_rate": 0.0001, + "loss": 6.3302, + "loss/crossentropy": 2.925212264060974, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.18541866540908813, + "step": 23178 + }, + { + "epoch": 0.724375, + "grad_norm": 2.890625, + "grad_norm_var": 0.05998433430989583, + "learning_rate": 0.0001, + "loss": 5.3544, + "loss/crossentropy": 2.3588887453079224, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15540842711925507, + "step": 23180 + }, + { + "epoch": 0.7244375, + "grad_norm": 2.9375, + "grad_norm_var": 0.05816650390625, + "learning_rate": 0.0001, + "loss": 5.5821, + "loss/crossentropy": 2.573221445083618, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.15987595170736313, + "step": 23182 + }, + { + "epoch": 0.7245, + "grad_norm": 3.15625, + "grad_norm_var": 0.04605712890625, + "learning_rate": 0.0001, + "loss": 5.5574, + "loss/crossentropy": 2.514816403388977, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15933258831501007, + "step": 23184 + }, + { + "epoch": 0.7245625, + "grad_norm": 3.1875, + "grad_norm_var": 0.04547119140625, + "learning_rate": 0.0001, + "loss": 5.7732, + "loss/crossentropy": 2.653873920440674, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1650610715150833, + "step": 23186 + }, + { + "epoch": 0.724625, + "grad_norm": 3.203125, + "grad_norm_var": 0.036595662434895836, + "learning_rate": 0.0001, + "loss": 5.5357, + "loss/crossentropy": 2.4859979152679443, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16043585538864136, + "step": 23188 + }, + { + "epoch": 0.7246875, + "grad_norm": 2.890625, + "grad_norm_var": 0.038752237955729164, + "learning_rate": 0.0001, + "loss": 5.7962, + "loss/crossentropy": 2.7121152877807617, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16543519496917725, + "step": 23190 + }, + { + "epoch": 0.72475, + "grad_norm": 3.0625, + "grad_norm_var": 0.03465067545572917, + "learning_rate": 0.0001, + "loss": 5.5116, + "loss/crossentropy": 2.459586977958679, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15754922479391098, + "step": 23192 + }, + { + "epoch": 0.7248125, + "grad_norm": 3.28125, + "grad_norm_var": 0.0182525634765625, + "learning_rate": 0.0001, + "loss": 5.8184, + "loss/crossentropy": 2.695042848587036, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16624674201011658, + "step": 23194 + }, + { + "epoch": 0.724875, + "grad_norm": 3.28125, + "grad_norm_var": 0.0158843994140625, + "learning_rate": 0.0001, + "loss": 5.6659, + "loss/crossentropy": 2.5744264125823975, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1610967516899109, + "step": 23196 + }, + { + "epoch": 0.7249375, + "grad_norm": 3.171875, + "grad_norm_var": 0.01304931640625, + "learning_rate": 0.0001, + "loss": 5.9551, + "loss/crossentropy": 2.687638759613037, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1787019520998001, + "step": 23198 + }, + { + "epoch": 0.725, + "grad_norm": 2.890625, + "grad_norm_var": 0.0178375244140625, + "learning_rate": 0.0001, + "loss": 5.509, + "loss/crossentropy": 2.4411755800247192, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1587366759777069, + "step": 23200 + }, + { + "epoch": 0.7250625, + "grad_norm": 3.015625, + "grad_norm_var": 0.018244425455729168, + "learning_rate": 0.0001, + "loss": 5.7031, + "loss/crossentropy": 2.5218595266342163, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16733865439891815, + "step": 23202 + }, + { + "epoch": 0.725125, + "grad_norm": 3.0625, + "grad_norm_var": 0.03420308430989583, + "learning_rate": 0.0001, + "loss": 5.2764, + "loss/crossentropy": 2.259980320930481, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15359201282262802, + "step": 23204 + }, + { + "epoch": 0.7251875, + "grad_norm": 3.203125, + "grad_norm_var": 0.027534993489583333, + "learning_rate": 0.0001, + "loss": 5.5431, + "loss/crossentropy": 2.405370593070984, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16455841064453125, + "step": 23206 + }, + { + "epoch": 0.72525, + "grad_norm": 2.890625, + "grad_norm_var": 0.03427327473958333, + "learning_rate": 0.0001, + "loss": 5.5312, + "loss/crossentropy": 2.502629280090332, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15676090121269226, + "step": 23208 + }, + { + "epoch": 0.7253125, + "grad_norm": 2.90625, + "grad_norm_var": 0.04399312337239583, + "learning_rate": 0.0001, + "loss": 5.3974, + "loss/crossentropy": 2.4476083517074585, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.149665467441082, + "step": 23210 + }, + { + "epoch": 0.725375, + "grad_norm": 3.0, + "grad_norm_var": 0.042561848958333336, + "learning_rate": 0.0001, + "loss": 5.7794, + "loss/crossentropy": 2.648160219192505, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1685919389128685, + "step": 23212 + }, + { + "epoch": 0.7254375, + "grad_norm": 3.203125, + "grad_norm_var": 0.041910807291666664, + "learning_rate": 0.0001, + "loss": 5.6714, + "loss/crossentropy": 2.575753092765808, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16190358996391296, + "step": 23214 + }, + { + "epoch": 0.7255, + "grad_norm": 3.0625, + "grad_norm_var": 0.04676005045572917, + "learning_rate": 0.0001, + "loss": 5.3755, + "loss/crossentropy": 2.368937611579895, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15690668672323227, + "step": 23216 + }, + { + "epoch": 0.7255625, + "grad_norm": 2.953125, + "grad_norm_var": 0.05066630045572917, + "learning_rate": 0.0001, + "loss": 5.6621, + "loss/crossentropy": 2.5506527423858643, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16817519813776016, + "step": 23218 + }, + { + "epoch": 0.725625, + "grad_norm": 3.296875, + "grad_norm_var": 0.03635965983072917, + "learning_rate": 0.0001, + "loss": 5.6234, + "loss/crossentropy": 2.510103940963745, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16484995186328888, + "step": 23220 + }, + { + "epoch": 0.7256875, + "grad_norm": 2.9375, + "grad_norm_var": 0.0321929931640625, + "learning_rate": 0.0001, + "loss": 5.5839, + "loss/crossentropy": 2.5797786712646484, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1527549773454666, + "step": 23222 + }, + { + "epoch": 0.72575, + "grad_norm": 2.921875, + "grad_norm_var": 0.0326171875, + "learning_rate": 0.0001, + "loss": 5.7047, + "loss/crossentropy": 2.5932576656341553, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16504817456007004, + "step": 23224 + }, + { + "epoch": 0.7258125, + "grad_norm": 2.953125, + "grad_norm_var": 0.0533843994140625, + "learning_rate": 0.0001, + "loss": 6.1421, + "loss/crossentropy": 2.8035610914230347, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.18385039269924164, + "step": 23226 + }, + { + "epoch": 0.725875, + "grad_norm": 5.3125, + "grad_norm_var": 0.36940104166666665, + "learning_rate": 0.0001, + "loss": 5.7915, + "loss/crossentropy": 2.585119843482971, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.17689155787229538, + "step": 23228 + }, + { + "epoch": 0.7259375, + "grad_norm": 3.15625, + "grad_norm_var": 0.3672841389973958, + "learning_rate": 0.0001, + "loss": 5.5665, + "loss/crossentropy": 2.4747084379196167, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1634795442223549, + "step": 23230 + }, + { + "epoch": 0.726, + "grad_norm": 3.09375, + "grad_norm_var": 0.34823811848958336, + "learning_rate": 0.0001, + "loss": 5.7145, + "loss/crossentropy": 2.5751774311065674, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16861557960510254, + "step": 23232 + }, + { + "epoch": 0.7260625, + "grad_norm": 3.421875, + "grad_norm_var": 0.3407216389973958, + "learning_rate": 0.0001, + "loss": 5.8606, + "loss/crossentropy": 2.624040961265564, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1763881966471672, + "step": 23234 + }, + { + "epoch": 0.726125, + "grad_norm": 3.515625, + "grad_norm_var": 0.3234771728515625, + "learning_rate": 0.0001, + "loss": 5.5971, + "loss/crossentropy": 2.530275583267212, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1652732416987419, + "step": 23236 + }, + { + "epoch": 0.7261875, + "grad_norm": 2.796875, + "grad_norm_var": 0.3291656494140625, + "learning_rate": 0.0001, + "loss": 5.5069, + "loss/crossentropy": 2.53396999835968, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1558818519115448, + "step": 23238 + }, + { + "epoch": 0.72625, + "grad_norm": 3.21875, + "grad_norm_var": 0.32779541015625, + "learning_rate": 0.0001, + "loss": 5.6379, + "loss/crossentropy": 2.5906230211257935, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1594114750623703, + "step": 23240 + }, + { + "epoch": 0.7263125, + "grad_norm": 3.0, + "grad_norm_var": 0.3327301025390625, + "learning_rate": 0.0001, + "loss": 5.4127, + "loss/crossentropy": 2.444182872772217, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15310557186603546, + "step": 23242 + }, + { + "epoch": 0.726375, + "grad_norm": 3.28125, + "grad_norm_var": 0.045751953125, + "learning_rate": 0.0001, + "loss": 5.7193, + "loss/crossentropy": 2.512703776359558, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16948899626731873, + "step": 23244 + }, + { + "epoch": 0.7264375, + "grad_norm": 3.3125, + "grad_norm_var": 0.043473307291666666, + "learning_rate": 0.0001, + "loss": 5.3024, + "loss/crossentropy": 2.282405376434326, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.15082381665706635, + "step": 23246 + }, + { + "epoch": 0.7265, + "grad_norm": 3.109375, + "grad_norm_var": 0.0408843994140625, + "learning_rate": 0.0001, + "loss": 5.4735, + "loss/crossentropy": 2.376451849937439, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16009971499443054, + "step": 23248 + }, + { + "epoch": 0.7265625, + "grad_norm": 2.90625, + "grad_norm_var": 0.03740132649739583, + "learning_rate": 0.0001, + "loss": 5.4683, + "loss/crossentropy": 2.4280524253845215, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15870977938175201, + "step": 23250 + }, + { + "epoch": 0.726625, + "grad_norm": 3.375, + "grad_norm_var": 0.030269368489583334, + "learning_rate": 0.0001, + "loss": 5.5973, + "loss/crossentropy": 2.4845110177993774, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16245494782924652, + "step": 23252 + }, + { + "epoch": 0.7266875, + "grad_norm": 3.34375, + "grad_norm_var": 0.05828450520833333, + "learning_rate": 0.0001, + "loss": 5.7597, + "loss/crossentropy": 2.5968947410583496, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16823583096265793, + "step": 23254 + }, + { + "epoch": 0.72675, + "grad_norm": 3.265625, + "grad_norm_var": 0.055403645833333334, + "learning_rate": 0.0001, + "loss": 5.709, + "loss/crossentropy": 2.5472995042800903, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16929778456687927, + "step": 23256 + }, + { + "epoch": 0.7268125, + "grad_norm": 3.25, + "grad_norm_var": 0.04604390462239583, + "learning_rate": 0.0001, + "loss": 5.5923, + "loss/crossentropy": 2.472580909729004, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1639285385608673, + "step": 23258 + }, + { + "epoch": 0.726875, + "grad_norm": 3.234375, + "grad_norm_var": 0.044286092122395836, + "learning_rate": 0.0001, + "loss": 5.7344, + "loss/crossentropy": 2.5646402835845947, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17088643461465836, + "step": 23260 + }, + { + "epoch": 0.7269375, + "grad_norm": 3.09375, + "grad_norm_var": 0.04517313639322917, + "learning_rate": 0.0001, + "loss": 5.3972, + "loss/crossentropy": 2.3815526962280273, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15859290957450867, + "step": 23262 + }, + { + "epoch": 0.727, + "grad_norm": 3.5625, + "grad_norm_var": 0.05178629557291667, + "learning_rate": 0.0001, + "loss": 5.6886, + "loss/crossentropy": 2.453500509262085, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17272623628377914, + "step": 23264 + }, + { + "epoch": 0.7270625, + "grad_norm": 2.96875, + "grad_norm_var": 0.04517822265625, + "learning_rate": 0.0001, + "loss": 5.7463, + "loss/crossentropy": 2.564083695411682, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1674373894929886, + "step": 23266 + }, + { + "epoch": 0.727125, + "grad_norm": 3.171875, + "grad_norm_var": 0.0429595947265625, + "learning_rate": 0.0001, + "loss": 5.895, + "loss/crossentropy": 2.712197422981262, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17179730534553528, + "step": 23268 + }, + { + "epoch": 0.7271875, + "grad_norm": 3.203125, + "grad_norm_var": 0.024039713541666667, + "learning_rate": 0.0001, + "loss": 5.8484, + "loss/crossentropy": 2.683329939842224, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1672929972410202, + "step": 23270 + }, + { + "epoch": 0.72725, + "grad_norm": 3.140625, + "grad_norm_var": 0.03157450358072917, + "learning_rate": 0.0001, + "loss": 5.6134, + "loss/crossentropy": 2.6072771549224854, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1560782864689827, + "step": 23272 + }, + { + "epoch": 0.7273125, + "grad_norm": 3.0, + "grad_norm_var": 0.032892862955729164, + "learning_rate": 0.0001, + "loss": 5.6551, + "loss/crossentropy": 2.530093789100647, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16874957084655762, + "step": 23274 + }, + { + "epoch": 0.727375, + "grad_norm": 3.265625, + "grad_norm_var": 0.03328348795572917, + "learning_rate": 0.0001, + "loss": 5.6012, + "loss/crossentropy": 2.4686496257781982, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16286510229110718, + "step": 23276 + }, + { + "epoch": 0.7274375, + "grad_norm": 3.109375, + "grad_norm_var": 0.0338043212890625, + "learning_rate": 0.0001, + "loss": 5.5035, + "loss/crossentropy": 2.4162285327911377, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15989916026592255, + "step": 23278 + }, + { + "epoch": 0.7275, + "grad_norm": 2.875, + "grad_norm_var": 0.027197265625, + "learning_rate": 0.0001, + "loss": 5.7379, + "loss/crossentropy": 2.6118892431259155, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1657288670539856, + "step": 23280 + }, + { + "epoch": 0.7275625, + "grad_norm": 3.046875, + "grad_norm_var": 0.023193359375, + "learning_rate": 0.0001, + "loss": 5.563, + "loss/crossentropy": 2.5222445726394653, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.161496102809906, + "step": 23282 + }, + { + "epoch": 0.727625, + "grad_norm": 3.0625, + "grad_norm_var": 0.030663045247395833, + "learning_rate": 0.0001, + "loss": 5.8126, + "loss/crossentropy": 2.6276007890701294, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16889140009880066, + "step": 23284 + }, + { + "epoch": 0.7276875, + "grad_norm": 3.046875, + "grad_norm_var": 0.022728474934895833, + "learning_rate": 0.0001, + "loss": 5.7388, + "loss/crossentropy": 2.597113609313965, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16729465126991272, + "step": 23286 + }, + { + "epoch": 0.72775, + "grad_norm": 2.890625, + "grad_norm_var": 0.022752888997395835, + "learning_rate": 0.0001, + "loss": 5.9324, + "loss/crossentropy": 2.7669402360916138, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1669377014040947, + "step": 23288 + }, + { + "epoch": 0.7278125, + "grad_norm": 3.109375, + "grad_norm_var": 0.023924763997395834, + "learning_rate": 0.0001, + "loss": 5.6937, + "loss/crossentropy": 2.5009844303131104, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1708378866314888, + "step": 23290 + }, + { + "epoch": 0.727875, + "grad_norm": 3.1875, + "grad_norm_var": 7.334375, + "learning_rate": 0.0001, + "loss": 5.7008, + "loss/crossentropy": 2.488111734390259, + "loss/hidden": 1.59765625, + "loss/jsd": 0.0, + "loss/logits": 0.1614990085363388, + "step": 23292 + }, + { + "epoch": 0.7279375, + "grad_norm": 3.015625, + "grad_norm_var": 7.3345703125, + "learning_rate": 0.0001, + "loss": 5.703, + "loss/crossentropy": 2.5763041973114014, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16813740134239197, + "step": 23294 + }, + { + "epoch": 0.728, + "grad_norm": 3.140625, + "grad_norm_var": 7.324169921875, + "learning_rate": 0.0001, + "loss": 5.7199, + "loss/crossentropy": 2.588278889656067, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16589736938476562, + "step": 23296 + }, + { + "epoch": 0.7280625, + "grad_norm": 2.890625, + "grad_norm_var": 7.303023274739584, + "learning_rate": 0.0001, + "loss": 5.5376, + "loss/crossentropy": 2.4617420434951782, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.15797995775938034, + "step": 23298 + }, + { + "epoch": 0.728125, + "grad_norm": 3.421875, + "grad_norm_var": 7.318310546875, + "learning_rate": 0.0001, + "loss": 5.9341, + "loss/crossentropy": 2.7568886280059814, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16889671981334686, + "step": 23300 + }, + { + "epoch": 0.7281875, + "grad_norm": 3.0625, + "grad_norm_var": 7.294266764322916, + "learning_rate": 0.0001, + "loss": 6.0928, + "loss/crossentropy": 2.799323797225952, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17778201401233673, + "step": 23302 + }, + { + "epoch": 0.72825, + "grad_norm": 3.453125, + "grad_norm_var": 7.282840983072917, + "learning_rate": 0.0001, + "loss": 5.9203, + "loss/crossentropy": 2.6988720893859863, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17448870837688446, + "step": 23304 + }, + { + "epoch": 0.7283125, + "grad_norm": 3.203125, + "grad_norm_var": 7.276558430989583, + "learning_rate": 0.0001, + "loss": 6.1345, + "loss/crossentropy": 2.802777409553528, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1819995865225792, + "step": 23306 + }, + { + "epoch": 0.728375, + "grad_norm": 3.5625, + "grad_norm_var": 0.0599609375, + "learning_rate": 0.0001, + "loss": 5.6137, + "loss/crossentropy": 2.3838824033737183, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17298342287540436, + "step": 23308 + }, + { + "epoch": 0.7284375, + "grad_norm": 3.078125, + "grad_norm_var": 0.06024983723958333, + "learning_rate": 0.0001, + "loss": 5.7482, + "loss/crossentropy": 2.623828649520874, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16673285514116287, + "step": 23310 + }, + { + "epoch": 0.7285, + "grad_norm": 3.4375, + "grad_norm_var": 0.06101786295572917, + "learning_rate": 0.0001, + "loss": 5.9586, + "loss/crossentropy": 2.7022147178649902, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1752513349056244, + "step": 23312 + }, + { + "epoch": 0.7285625, + "grad_norm": 3.0, + "grad_norm_var": 0.050537109375, + "learning_rate": 0.0001, + "loss": 5.7232, + "loss/crossentropy": 2.609552264213562, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1660483181476593, + "step": 23314 + }, + { + "epoch": 0.728625, + "grad_norm": 3.09375, + "grad_norm_var": 0.0460357666015625, + "learning_rate": 0.0001, + "loss": 5.4811, + "loss/crossentropy": 2.451301693916321, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15844624489545822, + "step": 23316 + }, + { + "epoch": 0.7286875, + "grad_norm": 3.328125, + "grad_norm_var": 0.03616536458333333, + "learning_rate": 0.0001, + "loss": 5.6874, + "loss/crossentropy": 2.5450918674468994, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16501231491565704, + "step": 23318 + }, + { + "epoch": 0.72875, + "grad_norm": 3.09375, + "grad_norm_var": 0.048981730143229166, + "learning_rate": 0.0001, + "loss": 6.0266, + "loss/crossentropy": 2.7788890600204468, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17399199306964874, + "step": 23320 + }, + { + "epoch": 0.7288125, + "grad_norm": 3.125, + "grad_norm_var": 0.049779256184895836, + "learning_rate": 0.0001, + "loss": 5.6936, + "loss/crossentropy": 2.529011607170105, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.17231817543506622, + "step": 23322 + }, + { + "epoch": 0.728875, + "grad_norm": 3.328125, + "grad_norm_var": 0.04312744140625, + "learning_rate": 0.0001, + "loss": 5.8842, + "loss/crossentropy": 2.744863271713257, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16705384105443954, + "step": 23324 + }, + { + "epoch": 0.7289375, + "grad_norm": 3.140625, + "grad_norm_var": 0.040990193684895836, + "learning_rate": 0.0001, + "loss": 5.823, + "loss/crossentropy": 2.595468759536743, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1715800240635872, + "step": 23326 + }, + { + "epoch": 0.729, + "grad_norm": 3.546875, + "grad_norm_var": 0.045750935872395836, + "learning_rate": 0.0001, + "loss": 5.9501, + "loss/crossentropy": 2.647188663482666, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.18029607087373734, + "step": 23328 + }, + { + "epoch": 0.7290625, + "grad_norm": 2.921875, + "grad_norm_var": 0.051106770833333336, + "learning_rate": 0.0001, + "loss": 5.6685, + "loss/crossentropy": 2.650219440460205, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15885566920042038, + "step": 23330 + }, + { + "epoch": 0.729125, + "grad_norm": 3.046875, + "grad_norm_var": 0.15070699055989584, + "learning_rate": 0.0001, + "loss": 5.7041, + "loss/crossentropy": 2.5037001371383667, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.1680903658270836, + "step": 23332 + }, + { + "epoch": 0.7291875, + "grad_norm": 3.0, + "grad_norm_var": 0.15506184895833333, + "learning_rate": 0.0001, + "loss": 5.6046, + "loss/crossentropy": 2.51932156085968, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16399675607681274, + "step": 23334 + }, + { + "epoch": 0.72925, + "grad_norm": 2.96875, + "grad_norm_var": 0.19091389973958334, + "learning_rate": 0.0001, + "loss": 6.0395, + "loss/crossentropy": 2.728891968727112, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17871622741222382, + "step": 23336 + }, + { + "epoch": 0.7293125, + "grad_norm": 3.625, + "grad_norm_var": 0.20361226399739582, + "learning_rate": 0.0001, + "loss": 5.6939, + "loss/crossentropy": 2.4913874864578247, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17024771869182587, + "step": 23338 + }, + { + "epoch": 0.729375, + "grad_norm": 2.828125, + "grad_norm_var": 0.22463785807291667, + "learning_rate": 0.0001, + "loss": 5.5474, + "loss/crossentropy": 2.4907031059265137, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15801078081130981, + "step": 23340 + }, + { + "epoch": 0.7294375, + "grad_norm": 2.859375, + "grad_norm_var": 0.2403228759765625, + "learning_rate": 0.0001, + "loss": 5.5717, + "loss/crossentropy": 2.578675150871277, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1563306376338005, + "step": 23342 + }, + { + "epoch": 0.7295, + "grad_norm": 3.03125, + "grad_norm_var": 0.23493550618489584, + "learning_rate": 0.0001, + "loss": 5.4622, + "loss/crossentropy": 2.441346287727356, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.16028911620378494, + "step": 23344 + }, + { + "epoch": 0.7295625, + "grad_norm": 2.8125, + "grad_norm_var": 0.24150390625, + "learning_rate": 0.0001, + "loss": 5.6568, + "loss/crossentropy": 2.57526171207428, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16167286783456802, + "step": 23346 + }, + { + "epoch": 0.729625, + "grad_norm": 3.015625, + "grad_norm_var": 0.1181304931640625, + "learning_rate": 0.0001, + "loss": 5.8356, + "loss/crossentropy": 2.71990966796875, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16391023993492126, + "step": 23348 + }, + { + "epoch": 0.7296875, + "grad_norm": 4.21875, + "grad_norm_var": 0.20810139973958333, + "learning_rate": 0.0001, + "loss": 5.5542, + "loss/crossentropy": 2.486966371536255, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.16492543369531631, + "step": 23350 + }, + { + "epoch": 0.72975, + "grad_norm": 3.171875, + "grad_norm_var": 0.13358968098958332, + "learning_rate": 0.0001, + "loss": 5.5869, + "loss/crossentropy": 2.454864978790283, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16398197412490845, + "step": 23352 + }, + { + "epoch": 0.7298125, + "grad_norm": 2.9375, + "grad_norm_var": 0.11433817545572916, + "learning_rate": 0.0001, + "loss": 5.6214, + "loss/crossentropy": 2.5570112466812134, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1642509251832962, + "step": 23354 + }, + { + "epoch": 0.729875, + "grad_norm": 3.140625, + "grad_norm_var": 0.11079813639322916, + "learning_rate": 0.0001, + "loss": 5.3264, + "loss/crossentropy": 2.3819743394851685, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.14678432047367096, + "step": 23356 + }, + { + "epoch": 0.7299375, + "grad_norm": 3.34375, + "grad_norm_var": 0.11305338541666667, + "learning_rate": 0.0001, + "loss": 6.0956, + "loss/crossentropy": 2.7229779958724976, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.18179063498973846, + "step": 23358 + }, + { + "epoch": 0.73, + "grad_norm": 2.8125, + "grad_norm_var": 0.1189361572265625, + "learning_rate": 0.0001, + "loss": 5.5458, + "loss/crossentropy": 2.601516842842102, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1491125375032425, + "step": 23360 + }, + { + "epoch": 0.7300625, + "grad_norm": 3.703125, + "grad_norm_var": 0.13271484375, + "learning_rate": 0.0001, + "loss": 6.0007, + "loss/crossentropy": 2.764381527900696, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17246322333812714, + "step": 23362 + }, + { + "epoch": 0.730125, + "grad_norm": 3.265625, + "grad_norm_var": 0.12694905598958334, + "learning_rate": 0.0001, + "loss": 5.541, + "loss/crossentropy": 2.4775807857513428, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15946689993143082, + "step": 23364 + }, + { + "epoch": 0.7301875, + "grad_norm": 2.921875, + "grad_norm_var": 0.056761678059895834, + "learning_rate": 0.0001, + "loss": 5.8493, + "loss/crossentropy": 2.7315409183502197, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16567786782979965, + "step": 23366 + }, + { + "epoch": 0.73025, + "grad_norm": 3.203125, + "grad_norm_var": 0.054011027018229164, + "learning_rate": 0.0001, + "loss": 5.5005, + "loss/crossentropy": 2.474494218826294, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15455739945173264, + "step": 23368 + }, + { + "epoch": 0.7303125, + "grad_norm": 3.109375, + "grad_norm_var": 0.051146443684895834, + "learning_rate": 0.0001, + "loss": 5.5954, + "loss/crossentropy": 2.527936339378357, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15908553451299667, + "step": 23370 + }, + { + "epoch": 0.730375, + "grad_norm": 2.953125, + "grad_norm_var": 0.06220601399739583, + "learning_rate": 0.0001, + "loss": 5.682, + "loss/crossentropy": 2.630681872367859, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16059622168540955, + "step": 23372 + }, + { + "epoch": 0.7304375, + "grad_norm": 3.140625, + "grad_norm_var": 0.058203125, + "learning_rate": 0.0001, + "loss": 5.5371, + "loss/crossentropy": 2.4286952018737793, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1651330292224884, + "step": 23374 + }, + { + "epoch": 0.7305, + "grad_norm": 3.3125, + "grad_norm_var": 0.049657185872395836, + "learning_rate": 0.0001, + "loss": 5.9044, + "loss/crossentropy": 2.8086371421813965, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16504616290330887, + "step": 23376 + }, + { + "epoch": 0.7305625, + "grad_norm": 2.96875, + "grad_norm_var": 0.027098592122395834, + "learning_rate": 0.0001, + "loss": 5.4529, + "loss/crossentropy": 2.361377239227295, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16032811254262924, + "step": 23378 + }, + { + "epoch": 0.730625, + "grad_norm": 3.296875, + "grad_norm_var": 0.028218587239583332, + "learning_rate": 0.0001, + "loss": 5.5693, + "loss/crossentropy": 2.4290562868118286, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16558928787708282, + "step": 23380 + }, + { + "epoch": 0.7306875, + "grad_norm": 3.0625, + "grad_norm_var": 0.015404256184895833, + "learning_rate": 0.0001, + "loss": 5.4597, + "loss/crossentropy": 2.3813695907592773, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16213400661945343, + "step": 23382 + }, + { + "epoch": 0.73075, + "grad_norm": 2.8125, + "grad_norm_var": 0.0199859619140625, + "learning_rate": 0.0001, + "loss": 5.7522, + "loss/crossentropy": 2.689029335975647, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1610071212053299, + "step": 23384 + }, + { + "epoch": 0.7308125, + "grad_norm": 3.15625, + "grad_norm_var": 0.024689737955729166, + "learning_rate": 0.0001, + "loss": 5.808, + "loss/crossentropy": 2.6453691720962524, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16899652779102325, + "step": 23386 + }, + { + "epoch": 0.730875, + "grad_norm": 3.375, + "grad_norm_var": 0.022477213541666666, + "learning_rate": 0.0001, + "loss": 5.8856, + "loss/crossentropy": 2.7170687913894653, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16880813241004944, + "step": 23388 + }, + { + "epoch": 0.7309375, + "grad_norm": 3.203125, + "grad_norm_var": 0.13961181640625, + "learning_rate": 0.0001, + "loss": 5.5278, + "loss/crossentropy": 2.4707634449005127, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15843693166971207, + "step": 23390 + }, + { + "epoch": 0.731, + "grad_norm": 3.109375, + "grad_norm_var": 0.1425689697265625, + "learning_rate": 0.0001, + "loss": 5.2349, + "loss/crossentropy": 2.2763832807540894, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.14780624210834503, + "step": 23392 + }, + { + "epoch": 0.7310625, + "grad_norm": 3.125, + "grad_norm_var": 0.13893941243489583, + "learning_rate": 0.0001, + "loss": 5.5035, + "loss/crossentropy": 2.481665849685669, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15531041473150253, + "step": 23394 + }, + { + "epoch": 0.731125, + "grad_norm": 3.109375, + "grad_norm_var": 0.13613179524739583, + "learning_rate": 0.0001, + "loss": 5.7716, + "loss/crossentropy": 2.6283375024795532, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1682298630475998, + "step": 23396 + }, + { + "epoch": 0.7311875, + "grad_norm": 3.203125, + "grad_norm_var": 0.13552144368489583, + "learning_rate": 0.0001, + "loss": 5.5228, + "loss/crossentropy": 2.4387375116348267, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1619235798716545, + "step": 23398 + }, + { + "epoch": 0.73125, + "grad_norm": 3.0, + "grad_norm_var": 0.12942708333333333, + "learning_rate": 0.0001, + "loss": 5.7851, + "loss/crossentropy": 2.7239224910736084, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16197381168603897, + "step": 23400 + }, + { + "epoch": 0.7313125, + "grad_norm": 3.375, + "grad_norm_var": 0.13816630045572917, + "learning_rate": 0.0001, + "loss": 5.3917, + "loss/crossentropy": 2.3603626489639282, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15938717126846313, + "step": 23402 + }, + { + "epoch": 0.731375, + "grad_norm": 3.3125, + "grad_norm_var": 0.13778889973958333, + "learning_rate": 0.0001, + "loss": 5.7806, + "loss/crossentropy": 2.5797680616378784, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1739886999130249, + "step": 23404 + }, + { + "epoch": 0.7314375, + "grad_norm": 3.296875, + "grad_norm_var": 0.020091756184895834, + "learning_rate": 0.0001, + "loss": 5.5676, + "loss/crossentropy": 2.4198325872421265, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16438789665699005, + "step": 23406 + }, + { + "epoch": 0.7315, + "grad_norm": 2.921875, + "grad_norm_var": 0.025972493489583335, + "learning_rate": 0.0001, + "loss": 5.3688, + "loss/crossentropy": 2.426826000213623, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15357258170843124, + "step": 23408 + }, + { + "epoch": 0.7315625, + "grad_norm": 3.390625, + "grad_norm_var": 0.030855305989583335, + "learning_rate": 0.0001, + "loss": 5.9291, + "loss/crossentropy": 2.6948466300964355, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17616183310747147, + "step": 23410 + }, + { + "epoch": 0.731625, + "grad_norm": 3.078125, + "grad_norm_var": 0.035791015625, + "learning_rate": 0.0001, + "loss": 5.867, + "loss/crossentropy": 2.759811282157898, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1642349362373352, + "step": 23412 + }, + { + "epoch": 0.7316875, + "grad_norm": 3.375, + "grad_norm_var": 0.038960774739583336, + "learning_rate": 0.0001, + "loss": 5.2287, + "loss/crossentropy": 2.2166020274162292, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1519937738776207, + "step": 23414 + }, + { + "epoch": 0.73175, + "grad_norm": 2.859375, + "grad_norm_var": 0.04243062337239583, + "learning_rate": 0.0001, + "loss": 5.703, + "loss/crossentropy": 2.632703423500061, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16405853629112244, + "step": 23416 + }, + { + "epoch": 0.7318125, + "grad_norm": 2.96875, + "grad_norm_var": 0.036149088541666666, + "learning_rate": 0.0001, + "loss": 5.0297, + "loss/crossentropy": 2.118439018726349, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.14307677745819092, + "step": 23418 + }, + { + "epoch": 0.731875, + "grad_norm": 3.21875, + "grad_norm_var": 0.03445638020833333, + "learning_rate": 0.0001, + "loss": 5.8998, + "loss/crossentropy": 2.6661545038223267, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17297054082155228, + "step": 23420 + }, + { + "epoch": 0.7319375, + "grad_norm": 3.109375, + "grad_norm_var": 0.0317535400390625, + "learning_rate": 0.0001, + "loss": 5.8514, + "loss/crossentropy": 2.6101226806640625, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17647041380405426, + "step": 23422 + }, + { + "epoch": 0.732, + "grad_norm": 3.1875, + "grad_norm_var": 0.028023274739583333, + "learning_rate": 0.0001, + "loss": 5.6823, + "loss/crossentropy": 2.5592458248138428, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16425611078739166, + "step": 23424 + }, + { + "epoch": 0.7320625, + "grad_norm": 3.1875, + "grad_norm_var": 0.024095662434895835, + "learning_rate": 0.0001, + "loss": 5.7849, + "loss/crossentropy": 2.5620510578155518, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17423319816589355, + "step": 23426 + }, + { + "epoch": 0.732125, + "grad_norm": 2.90625, + "grad_norm_var": 0.0216461181640625, + "learning_rate": 0.0001, + "loss": 5.4815, + "loss/crossentropy": 2.4160321950912476, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16201941668987274, + "step": 23428 + }, + { + "epoch": 0.7321875, + "grad_norm": 3.03125, + "grad_norm_var": 0.0192291259765625, + "learning_rate": 0.0001, + "loss": 5.7321, + "loss/crossentropy": 2.6281731128692627, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1635151132941246, + "step": 23430 + }, + { + "epoch": 0.73225, + "grad_norm": 3.046875, + "grad_norm_var": 0.015380859375, + "learning_rate": 0.0001, + "loss": 5.6919, + "loss/crossentropy": 2.573588252067566, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16768812388181686, + "step": 23432 + }, + { + "epoch": 0.7323125, + "grad_norm": 3.171875, + "grad_norm_var": 0.0131744384765625, + "learning_rate": 0.0001, + "loss": 5.7698, + "loss/crossentropy": 2.588934898376465, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1665220931172371, + "step": 23434 + }, + { + "epoch": 0.732375, + "grad_norm": 3.234375, + "grad_norm_var": 0.01363525390625, + "learning_rate": 0.0001, + "loss": 5.7479, + "loss/crossentropy": 2.640743613243103, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16110744327306747, + "step": 23436 + }, + { + "epoch": 0.7324375, + "grad_norm": 3.40625, + "grad_norm_var": 0.020817057291666666, + "learning_rate": 0.0001, + "loss": 5.8297, + "loss/crossentropy": 2.5773215293884277, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17484461516141891, + "step": 23438 + }, + { + "epoch": 0.7325, + "grad_norm": 3.15625, + "grad_norm_var": 0.019652303059895834, + "learning_rate": 0.0001, + "loss": 5.6132, + "loss/crossentropy": 2.5538493394851685, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15906429290771484, + "step": 23440 + }, + { + "epoch": 0.7325625, + "grad_norm": 3.09375, + "grad_norm_var": 0.0196441650390625, + "learning_rate": 0.0001, + "loss": 5.5364, + "loss/crossentropy": 2.5850027799606323, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.14943711459636688, + "step": 23442 + }, + { + "epoch": 0.732625, + "grad_norm": 3.125, + "grad_norm_var": 0.016950480143229165, + "learning_rate": 0.0001, + "loss": 5.7022, + "loss/crossentropy": 2.5973565578460693, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16556359827518463, + "step": 23444 + }, + { + "epoch": 0.7326875, + "grad_norm": 2.96875, + "grad_norm_var": 0.0246978759765625, + "learning_rate": 0.0001, + "loss": 5.381, + "loss/crossentropy": 2.403241515159607, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15441212058067322, + "step": 23446 + }, + { + "epoch": 0.73275, + "grad_norm": 3.0, + "grad_norm_var": 0.025651041666666666, + "learning_rate": 0.0001, + "loss": 5.5208, + "loss/crossentropy": 2.5156280994415283, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15911425650119781, + "step": 23448 + }, + { + "epoch": 0.7328125, + "grad_norm": 3.421875, + "grad_norm_var": 0.033492024739583334, + "learning_rate": 0.0001, + "loss": 5.7593, + "loss/crossentropy": 2.604570508003235, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.17055577784776688, + "step": 23450 + }, + { + "epoch": 0.732875, + "grad_norm": 3.25, + "grad_norm_var": 0.03437093098958333, + "learning_rate": 0.0001, + "loss": 5.5476, + "loss/crossentropy": 2.461097002029419, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16099774837493896, + "step": 23452 + }, + { + "epoch": 0.7329375, + "grad_norm": 2.890625, + "grad_norm_var": 0.0237457275390625, + "learning_rate": 0.0001, + "loss": 5.507, + "loss/crossentropy": 2.5025230646133423, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15826108306646347, + "step": 23454 + }, + { + "epoch": 0.733, + "grad_norm": 3.46875, + "grad_norm_var": 0.07079671223958334, + "learning_rate": 0.0001, + "loss": 5.5568, + "loss/crossentropy": 2.3830480575561523, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.16268550604581833, + "step": 23456 + }, + { + "epoch": 0.7330625, + "grad_norm": 3.171875, + "grad_norm_var": 0.0978912353515625, + "learning_rate": 0.0001, + "loss": 5.6719, + "loss/crossentropy": 2.587242603302002, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1643211841583252, + "step": 23458 + }, + { + "epoch": 0.733125, + "grad_norm": 3.0625, + "grad_norm_var": 0.09876200358072916, + "learning_rate": 0.0001, + "loss": 5.4057, + "loss/crossentropy": 2.3633275032043457, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15892165899276733, + "step": 23460 + }, + { + "epoch": 0.7331875, + "grad_norm": 2.890625, + "grad_norm_var": 0.09299723307291667, + "learning_rate": 0.0001, + "loss": 5.7002, + "loss/crossentropy": 2.608008861541748, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16312935948371887, + "step": 23462 + }, + { + "epoch": 0.73325, + "grad_norm": 3.140625, + "grad_norm_var": 0.08854166666666667, + "learning_rate": 0.0001, + "loss": 5.7402, + "loss/crossentropy": 2.5507601499557495, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1720656156539917, + "step": 23464 + }, + { + "epoch": 0.7333125, + "grad_norm": 3.40625, + "grad_norm_var": 0.09127197265625, + "learning_rate": 0.0001, + "loss": 5.5159, + "loss/crossentropy": 2.4633994102478027, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15681590884923935, + "step": 23466 + }, + { + "epoch": 0.733375, + "grad_norm": 3.125, + "grad_norm_var": 0.09455973307291667, + "learning_rate": 0.0001, + "loss": 5.5678, + "loss/crossentropy": 2.502510190010071, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16239215433597565, + "step": 23468 + }, + { + "epoch": 0.7334375, + "grad_norm": 3.015625, + "grad_norm_var": 0.10484110514322917, + "learning_rate": 0.0001, + "loss": 5.421, + "loss/crossentropy": 2.4273658990859985, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15404889732599258, + "step": 23470 + }, + { + "epoch": 0.7335, + "grad_norm": 3.078125, + "grad_norm_var": 0.06075846354166667, + "learning_rate": 0.0001, + "loss": 5.4922, + "loss/crossentropy": 2.429620862007141, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1570415273308754, + "step": 23472 + }, + { + "epoch": 0.7335625, + "grad_norm": 2.9375, + "grad_norm_var": 0.022965494791666666, + "learning_rate": 0.0001, + "loss": 5.7666, + "loss/crossentropy": 2.5914634466171265, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16985638439655304, + "step": 23474 + }, + { + "epoch": 0.733625, + "grad_norm": 3.0625, + "grad_norm_var": 0.032990519205729166, + "learning_rate": 0.0001, + "loss": 5.7969, + "loss/crossentropy": 2.6145005226135254, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1701931208372116, + "step": 23476 + }, + { + "epoch": 0.7336875, + "grad_norm": 3.265625, + "grad_norm_var": 0.03289388020833333, + "learning_rate": 0.0001, + "loss": 5.5761, + "loss/crossentropy": 2.4450889825820923, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1709156483411789, + "step": 23478 + }, + { + "epoch": 0.73375, + "grad_norm": 3.015625, + "grad_norm_var": 0.033137003580729164, + "learning_rate": 0.0001, + "loss": 5.6349, + "loss/crossentropy": 2.5668121576309204, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16149134933948517, + "step": 23480 + }, + { + "epoch": 0.7338125, + "grad_norm": 3.203125, + "grad_norm_var": 0.0259185791015625, + "learning_rate": 0.0001, + "loss": 5.7231, + "loss/crossentropy": 2.5993869304656982, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16432444006204605, + "step": 23482 + }, + { + "epoch": 0.733875, + "grad_norm": 3.0625, + "grad_norm_var": 0.025223795572916666, + "learning_rate": 0.0001, + "loss": 5.8019, + "loss/crossentropy": 2.6502548456192017, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1675073504447937, + "step": 23484 + }, + { + "epoch": 0.7339375, + "grad_norm": 2.703125, + "grad_norm_var": 0.028693644205729167, + "learning_rate": 0.0001, + "loss": 5.6372, + "loss/crossentropy": 2.5472875833511353, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.15937808901071548, + "step": 23486 + }, + { + "epoch": 0.734, + "grad_norm": 3.0, + "grad_norm_var": 0.029069010416666666, + "learning_rate": 0.0001, + "loss": 5.5728, + "loss/crossentropy": 2.508591651916504, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16188612580299377, + "step": 23488 + }, + { + "epoch": 0.7340625, + "grad_norm": 2.875, + "grad_norm_var": 0.030549112955729166, + "learning_rate": 0.0001, + "loss": 5.4278, + "loss/crossentropy": 2.442284345626831, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15519032627344131, + "step": 23490 + }, + { + "epoch": 0.734125, + "grad_norm": 3.15625, + "grad_norm_var": 0.023270670572916666, + "learning_rate": 0.0001, + "loss": 5.5983, + "loss/crossentropy": 2.4656132459640503, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16366274654865265, + "step": 23492 + }, + { + "epoch": 0.7341875, + "grad_norm": 3.078125, + "grad_norm_var": 0.020531209309895833, + "learning_rate": 0.0001, + "loss": 5.8963, + "loss/crossentropy": 2.685925006866455, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17298579961061478, + "step": 23494 + }, + { + "epoch": 0.73425, + "grad_norm": 2.765625, + "grad_norm_var": 0.02945556640625, + "learning_rate": 0.0001, + "loss": 5.2837, + "loss/crossentropy": 2.4336854219436646, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.14163990318775177, + "step": 23496 + }, + { + "epoch": 0.7343125, + "grad_norm": 3.40625, + "grad_norm_var": 0.0365386962890625, + "learning_rate": 0.0001, + "loss": 5.9122, + "loss/crossentropy": 2.64986252784729, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17701907455921173, + "step": 23498 + }, + { + "epoch": 0.734375, + "grad_norm": 2.859375, + "grad_norm_var": 0.04112040201822917, + "learning_rate": 0.0001, + "loss": 5.616, + "loss/crossentropy": 2.5660492181777954, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16046810150146484, + "step": 23500 + }, + { + "epoch": 0.7344375, + "grad_norm": 2.796875, + "grad_norm_var": 0.03284098307291667, + "learning_rate": 0.0001, + "loss": 5.602, + "loss/crossentropy": 2.5677409172058105, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15733551979064941, + "step": 23502 + }, + { + "epoch": 0.7345, + "grad_norm": 3.09375, + "grad_norm_var": 0.034699503580729166, + "learning_rate": 0.0001, + "loss": 5.7832, + "loss/crossentropy": 2.6343828439712524, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16839541494846344, + "step": 23504 + }, + { + "epoch": 0.7345625, + "grad_norm": 2.859375, + "grad_norm_var": 0.04716695149739583, + "learning_rate": 0.0001, + "loss": 5.4971, + "loss/crossentropy": 2.4877257347106934, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15210432559251785, + "step": 23506 + }, + { + "epoch": 0.734625, + "grad_norm": 3.0625, + "grad_norm_var": 0.058089192708333334, + "learning_rate": 0.0001, + "loss": 5.9026, + "loss/crossentropy": 2.7096848487854004, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17124372720718384, + "step": 23508 + }, + { + "epoch": 0.7346875, + "grad_norm": 2.875, + "grad_norm_var": 0.07830403645833334, + "learning_rate": 0.0001, + "loss": 5.3596, + "loss/crossentropy": 2.4028228521347046, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15114928781986237, + "step": 23510 + }, + { + "epoch": 0.73475, + "grad_norm": 3.40625, + "grad_norm_var": 0.06862691243489584, + "learning_rate": 0.0001, + "loss": 5.9145, + "loss/crossentropy": 2.733505964279175, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16888293623924255, + "step": 23512 + }, + { + "epoch": 0.7348125, + "grad_norm": 2.796875, + "grad_norm_var": 0.07138570149739583, + "learning_rate": 0.0001, + "loss": 5.3005, + "loss/crossentropy": 2.3271204233169556, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15319299697875977, + "step": 23514 + }, + { + "epoch": 0.734875, + "grad_norm": 3.34375, + "grad_norm_var": 0.0735504150390625, + "learning_rate": 0.0001, + "loss": 5.3765, + "loss/crossentropy": 2.407612442970276, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1554863601922989, + "step": 23516 + }, + { + "epoch": 0.7349375, + "grad_norm": 2.796875, + "grad_norm_var": 0.08217671712239584, + "learning_rate": 0.0001, + "loss": 5.3266, + "loss/crossentropy": 2.4507007598876953, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.14501401782035828, + "step": 23518 + }, + { + "epoch": 0.735, + "grad_norm": 3.1875, + "grad_norm_var": 0.08747456868489584, + "learning_rate": 0.0001, + "loss": 5.6318, + "loss/crossentropy": 2.5163025856018066, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16271989792585373, + "step": 23520 + }, + { + "epoch": 0.7350625, + "grad_norm": 3.0, + "grad_norm_var": 0.07698160807291667, + "learning_rate": 0.0001, + "loss": 5.4528, + "loss/crossentropy": 2.4520334005355835, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1551515907049179, + "step": 23522 + }, + { + "epoch": 0.735125, + "grad_norm": 2.71875, + "grad_norm_var": 0.07219136555989583, + "learning_rate": 0.0001, + "loss": 5.6717, + "loss/crossentropy": 2.61397123336792, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16201938688755035, + "step": 23524 + }, + { + "epoch": 0.7351875, + "grad_norm": 3.078125, + "grad_norm_var": 0.050755818684895836, + "learning_rate": 0.0001, + "loss": 5.6804, + "loss/crossentropy": 2.5597420930862427, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.165585957467556, + "step": 23526 + }, + { + "epoch": 0.73525, + "grad_norm": 3.03125, + "grad_norm_var": 0.042512003580729166, + "learning_rate": 0.0001, + "loss": 5.5602, + "loss/crossentropy": 2.529832124710083, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15850237756967545, + "step": 23528 + }, + { + "epoch": 0.7353125, + "grad_norm": 3.234375, + "grad_norm_var": 0.044164021809895836, + "learning_rate": 0.0001, + "loss": 5.7882, + "loss/crossentropy": 2.6322215795516968, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.167157880961895, + "step": 23530 + }, + { + "epoch": 0.735375, + "grad_norm": 3.046875, + "grad_norm_var": 0.0363922119140625, + "learning_rate": 0.0001, + "loss": 6.0217, + "loss/crossentropy": 2.894606351852417, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16661609709262848, + "step": 23532 + }, + { + "epoch": 0.7354375, + "grad_norm": 2.921875, + "grad_norm_var": 0.0268707275390625, + "learning_rate": 0.0001, + "loss": 5.4766, + "loss/crossentropy": 2.4781914949417114, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.158429816365242, + "step": 23534 + }, + { + "epoch": 0.7355, + "grad_norm": 3.453125, + "grad_norm_var": 0.029002888997395834, + "learning_rate": 0.0001, + "loss": 5.9379, + "loss/crossentropy": 2.737681031227112, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17275556176900864, + "step": 23536 + }, + { + "epoch": 0.7355625, + "grad_norm": 2.921875, + "grad_norm_var": 0.02916259765625, + "learning_rate": 0.0001, + "loss": 5.6641, + "loss/crossentropy": 2.589012384414673, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1649332493543625, + "step": 23538 + }, + { + "epoch": 0.735625, + "grad_norm": 2.828125, + "grad_norm_var": 0.0323883056640625, + "learning_rate": 0.0001, + "loss": 5.6689, + "loss/crossentropy": 2.5991783142089844, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1612653061747551, + "step": 23540 + }, + { + "epoch": 0.7356875, + "grad_norm": 3.015625, + "grad_norm_var": 0.030289713541666666, + "learning_rate": 0.0001, + "loss": 5.3967, + "loss/crossentropy": 2.4203792810440063, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15544036030769348, + "step": 23542 + }, + { + "epoch": 0.73575, + "grad_norm": 2.859375, + "grad_norm_var": 0.031737263997395834, + "learning_rate": 0.0001, + "loss": 5.6952, + "loss/crossentropy": 2.635646939277649, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16142567247152328, + "step": 23544 + }, + { + "epoch": 0.7358125, + "grad_norm": 3.234375, + "grad_norm_var": 0.03383687337239583, + "learning_rate": 0.0001, + "loss": 5.4899, + "loss/crossentropy": 2.4383760690689087, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1606234908103943, + "step": 23546 + }, + { + "epoch": 0.735875, + "grad_norm": 2.859375, + "grad_norm_var": 0.03642578125, + "learning_rate": 0.0001, + "loss": 5.4974, + "loss/crossentropy": 2.5072513818740845, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15409240871667862, + "step": 23548 + }, + { + "epoch": 0.7359375, + "grad_norm": 3.140625, + "grad_norm_var": 0.05164286295572917, + "learning_rate": 0.0001, + "loss": 6.0929, + "loss/crossentropy": 2.7838547229766846, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.18050900846719742, + "step": 23550 + }, + { + "epoch": 0.736, + "grad_norm": 3.484375, + "grad_norm_var": 0.0539947509765625, + "learning_rate": 0.0001, + "loss": 5.9286, + "loss/crossentropy": 2.6969679594039917, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17433885484933853, + "step": 23552 + }, + { + "epoch": 0.7360625, + "grad_norm": 3.421875, + "grad_norm_var": 0.05956624348958333, + "learning_rate": 0.0001, + "loss": 5.9226, + "loss/crossentropy": 2.698677182197571, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17434575408697128, + "step": 23554 + }, + { + "epoch": 0.736125, + "grad_norm": 2.875, + "grad_norm_var": 0.05289306640625, + "learning_rate": 0.0001, + "loss": 5.7134, + "loss/crossentropy": 2.5809249877929688, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16597800701856613, + "step": 23556 + }, + { + "epoch": 0.7361875, + "grad_norm": 2.890625, + "grad_norm_var": 0.05805562337239583, + "learning_rate": 0.0001, + "loss": 5.5054, + "loss/crossentropy": 2.546760082244873, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.15406754612922668, + "step": 23558 + }, + { + "epoch": 0.73625, + "grad_norm": 3.53125, + "grad_norm_var": 0.06536051432291666, + "learning_rate": 0.0001, + "loss": 5.7136, + "loss/crossentropy": 2.630208373069763, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1626323238015175, + "step": 23560 + }, + { + "epoch": 0.7363125, + "grad_norm": 2.90625, + "grad_norm_var": 0.06128641764322917, + "learning_rate": 0.0001, + "loss": 5.6842, + "loss/crossentropy": 2.5686936378479004, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1638946235179901, + "step": 23562 + }, + { + "epoch": 0.736375, + "grad_norm": 3.125, + "grad_norm_var": 0.05694986979166667, + "learning_rate": 0.0001, + "loss": 5.7707, + "loss/crossentropy": 2.6414307355880737, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1668316125869751, + "step": 23564 + }, + { + "epoch": 0.7364375, + "grad_norm": 3.125, + "grad_norm_var": 0.075146484375, + "learning_rate": 0.0001, + "loss": 5.8567, + "loss/crossentropy": 2.614556074142456, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17343351244926453, + "step": 23566 + }, + { + "epoch": 0.7365, + "grad_norm": 3.09375, + "grad_norm_var": 0.06851806640625, + "learning_rate": 0.0001, + "loss": 5.6407, + "loss/crossentropy": 2.5622506141662598, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.163309745490551, + "step": 23568 + }, + { + "epoch": 0.7365625, + "grad_norm": 3.15625, + "grad_norm_var": 0.0625396728515625, + "learning_rate": 0.0001, + "loss": 6.0154, + "loss/crossentropy": 2.810123562812805, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1744309365749359, + "step": 23570 + }, + { + "epoch": 0.736625, + "grad_norm": 3.078125, + "grad_norm_var": 0.058610026041666666, + "learning_rate": 0.0001, + "loss": 5.5339, + "loss/crossentropy": 2.4425787925720215, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16225220263004303, + "step": 23572 + }, + { + "epoch": 0.7366875, + "grad_norm": 3.171875, + "grad_norm_var": 0.04915262858072917, + "learning_rate": 0.0001, + "loss": 5.6152, + "loss/crossentropy": 2.536233425140381, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16180337965488434, + "step": 23574 + }, + { + "epoch": 0.73675, + "grad_norm": 3.140625, + "grad_norm_var": 0.07273763020833333, + "learning_rate": 0.0001, + "loss": 5.4295, + "loss/crossentropy": 2.3130571842193604, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1616436317563057, + "step": 23576 + }, + { + "epoch": 0.7368125, + "grad_norm": 3.34375, + "grad_norm_var": 0.06689046223958334, + "learning_rate": 0.0001, + "loss": 5.4234, + "loss/crossentropy": 2.3387938737869263, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.15767645090818405, + "step": 23578 + }, + { + "epoch": 0.736875, + "grad_norm": 3.578125, + "grad_norm_var": 0.07784830729166667, + "learning_rate": 0.0001, + "loss": 5.7788, + "loss/crossentropy": 2.565428376197815, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16938666254281998, + "step": 23580 + }, + { + "epoch": 0.7369375, + "grad_norm": 3.09375, + "grad_norm_var": 0.059956868489583336, + "learning_rate": 0.0001, + "loss": 5.3839, + "loss/crossentropy": 2.338326930999756, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1615864410996437, + "step": 23582 + }, + { + "epoch": 0.737, + "grad_norm": 3.34375, + "grad_norm_var": 0.06139322916666667, + "learning_rate": 0.0001, + "loss": 5.6858, + "loss/crossentropy": 2.5333563089370728, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.17188771069049835, + "step": 23584 + }, + { + "epoch": 0.7370625, + "grad_norm": 2.796875, + "grad_norm_var": 0.07151285807291667, + "learning_rate": 0.0001, + "loss": 5.3732, + "loss/crossentropy": 2.3642250299453735, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15324292331933975, + "step": 23586 + }, + { + "epoch": 0.737125, + "grad_norm": 2.9375, + "grad_norm_var": 0.07737223307291667, + "learning_rate": 0.0001, + "loss": 5.6459, + "loss/crossentropy": 2.4873459339141846, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16780464351177216, + "step": 23588 + }, + { + "epoch": 0.7371875, + "grad_norm": 3.203125, + "grad_norm_var": 0.07736714680989583, + "learning_rate": 0.0001, + "loss": 5.8558, + "loss/crossentropy": 2.613994836807251, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17496132850646973, + "step": 23590 + }, + { + "epoch": 0.73725, + "grad_norm": 2.859375, + "grad_norm_var": 0.04999593098958333, + "learning_rate": 0.0001, + "loss": 5.6823, + "loss/crossentropy": 2.616868495941162, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16357173025608063, + "step": 23592 + }, + { + "epoch": 0.7373125, + "grad_norm": 2.71875, + "grad_norm_var": 0.05718994140625, + "learning_rate": 0.0001, + "loss": 5.3136, + "loss/crossentropy": 2.4151304960250854, + "loss/hidden": 1.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.14961008727550507, + "step": 23594 + }, + { + "epoch": 0.737375, + "grad_norm": 3.203125, + "grad_norm_var": 0.0427642822265625, + "learning_rate": 0.0001, + "loss": 5.7147, + "loss/crossentropy": 2.619183301925659, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16150399297475815, + "step": 23596 + }, + { + "epoch": 0.7374375, + "grad_norm": 3.46875, + "grad_norm_var": 0.05130208333333333, + "learning_rate": 0.0001, + "loss": 5.5533, + "loss/crossentropy": 2.520646572113037, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15599774569272995, + "step": 23598 + }, + { + "epoch": 0.7375, + "grad_norm": 3.734375, + "grad_norm_var": 0.07461649576822917, + "learning_rate": 0.0001, + "loss": 5.5009, + "loss/crossentropy": 2.389772653579712, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.164240300655365, + "step": 23600 + }, + { + "epoch": 0.7375625, + "grad_norm": 2.84375, + "grad_norm_var": 0.07599995930989584, + "learning_rate": 0.0001, + "loss": 5.3118, + "loss/crossentropy": 2.35556161403656, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15421681106090546, + "step": 23602 + }, + { + "epoch": 0.737625, + "grad_norm": 3.4375, + "grad_norm_var": 0.08384501139322917, + "learning_rate": 0.0001, + "loss": 5.653, + "loss/crossentropy": 2.506429433822632, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1689501628279686, + "step": 23604 + }, + { + "epoch": 0.7376875, + "grad_norm": 3.046875, + "grad_norm_var": 0.0824615478515625, + "learning_rate": 0.0001, + "loss": 5.4944, + "loss/crossentropy": 2.504259943962097, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15604222565889359, + "step": 23606 + }, + { + "epoch": 0.73775, + "grad_norm": 3.140625, + "grad_norm_var": 0.07965494791666666, + "learning_rate": 0.0001, + "loss": 5.7588, + "loss/crossentropy": 2.6146479845046997, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16949427127838135, + "step": 23608 + }, + { + "epoch": 0.7378125, + "grad_norm": 2.890625, + "grad_norm_var": 0.07296549479166667, + "learning_rate": 0.0001, + "loss": 5.3794, + "loss/crossentropy": 2.4034372568130493, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1561950296163559, + "step": 23610 + }, + { + "epoch": 0.737875, + "grad_norm": 2.984375, + "grad_norm_var": 0.07371317545572917, + "learning_rate": 0.0001, + "loss": 5.9049, + "loss/crossentropy": 2.7431684732437134, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16851452738046646, + "step": 23612 + }, + { + "epoch": 0.7379375, + "grad_norm": 3.140625, + "grad_norm_var": 0.10067952473958333, + "learning_rate": 0.0001, + "loss": 5.7946, + "loss/crossentropy": 2.633496046066284, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1692311391234398, + "step": 23614 + }, + { + "epoch": 0.738, + "grad_norm": 3.328125, + "grad_norm_var": 0.08065999348958333, + "learning_rate": 0.0001, + "loss": 5.5748, + "loss/crossentropy": 2.506125807762146, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15960664302110672, + "step": 23616 + }, + { + "epoch": 0.7380625, + "grad_norm": 3.203125, + "grad_norm_var": 0.07156575520833333, + "learning_rate": 0.0001, + "loss": 5.7517, + "loss/crossentropy": 2.6554681062698364, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1611897200345993, + "step": 23618 + }, + { + "epoch": 0.738125, + "grad_norm": 3.171875, + "grad_norm_var": 0.05950113932291667, + "learning_rate": 0.0001, + "loss": 5.713, + "loss/crossentropy": 2.6006507873535156, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16514024883508682, + "step": 23620 + }, + { + "epoch": 0.7381875, + "grad_norm": 2.84375, + "grad_norm_var": 0.0748931884765625, + "learning_rate": 0.0001, + "loss": 5.374, + "loss/crossentropy": 2.279668092727661, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16217041015625, + "step": 23622 + }, + { + "epoch": 0.73825, + "grad_norm": 2.90625, + "grad_norm_var": 0.0783203125, + "learning_rate": 0.0001, + "loss": 5.6124, + "loss/crossentropy": 2.5494585037231445, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15981171280145645, + "step": 23624 + }, + { + "epoch": 0.7383125, + "grad_norm": 2.9375, + "grad_norm_var": 0.0767974853515625, + "learning_rate": 0.0001, + "loss": 5.5042, + "loss/crossentropy": 2.494245767593384, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15646066516637802, + "step": 23626 + }, + { + "epoch": 0.738375, + "grad_norm": 3.015625, + "grad_norm_var": 0.08000895182291666, + "learning_rate": 0.0001, + "loss": 5.7017, + "loss/crossentropy": 2.626276969909668, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16456867009401321, + "step": 23628 + }, + { + "epoch": 0.7384375, + "grad_norm": 2.9375, + "grad_norm_var": 0.041337076822916666, + "learning_rate": 0.0001, + "loss": 5.5703, + "loss/crossentropy": 2.528649926185608, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16041024029254913, + "step": 23630 + }, + { + "epoch": 0.7385, + "grad_norm": 4.03125, + "grad_norm_var": 0.09514567057291666, + "learning_rate": 0.0001, + "loss": 5.9694, + "loss/crossentropy": 2.622844696044922, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18036100268363953, + "step": 23632 + }, + { + "epoch": 0.7385625, + "grad_norm": 2.859375, + "grad_norm_var": 0.09716695149739583, + "learning_rate": 0.0001, + "loss": 5.5787, + "loss/crossentropy": 2.520618438720703, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.16479463130235672, + "step": 23634 + }, + { + "epoch": 0.738625, + "grad_norm": 3.265625, + "grad_norm_var": 0.09853108723958333, + "learning_rate": 0.0001, + "loss": 5.481, + "loss/crossentropy": 2.4266551733016968, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16246679425239563, + "step": 23636 + }, + { + "epoch": 0.7386875, + "grad_norm": 3.046875, + "grad_norm_var": 0.08171284993489583, + "learning_rate": 0.0001, + "loss": 5.8134, + "loss/crossentropy": 2.6680673360824585, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16687805950641632, + "step": 23638 + }, + { + "epoch": 0.73875, + "grad_norm": 2.953125, + "grad_norm_var": 0.08269856770833334, + "learning_rate": 0.0001, + "loss": 5.2904, + "loss/crossentropy": 2.321548342704773, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1523580253124237, + "step": 23640 + }, + { + "epoch": 0.7388125, + "grad_norm": 2.9375, + "grad_norm_var": 0.08245442708333334, + "learning_rate": 0.0001, + "loss": 5.6741, + "loss/crossentropy": 2.5963666439056396, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16246125102043152, + "step": 23642 + }, + { + "epoch": 0.738875, + "grad_norm": 3.125, + "grad_norm_var": 0.07638346354166667, + "learning_rate": 0.0001, + "loss": 5.5873, + "loss/crossentropy": 2.5040605068206787, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16144713759422302, + "step": 23644 + }, + { + "epoch": 0.7389375, + "grad_norm": 3.5, + "grad_norm_var": 0.08185221354166666, + "learning_rate": 0.0001, + "loss": 5.3826, + "loss/crossentropy": 2.3296725749969482, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16037489473819733, + "step": 23646 + }, + { + "epoch": 0.739, + "grad_norm": 2.921875, + "grad_norm_var": 0.030760701497395834, + "learning_rate": 0.0001, + "loss": 5.6415, + "loss/crossentropy": 2.5903111696243286, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15863718837499619, + "step": 23648 + }, + { + "epoch": 0.7390625, + "grad_norm": 2.796875, + "grad_norm_var": 0.03705952962239583, + "learning_rate": 0.0001, + "loss": 5.3715, + "loss/crossentropy": 2.3111305236816406, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16033006459474564, + "step": 23650 + }, + { + "epoch": 0.739125, + "grad_norm": 3.03125, + "grad_norm_var": 0.03726806640625, + "learning_rate": 0.0001, + "loss": 5.36, + "loss/crossentropy": 2.398424506187439, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15240325778722763, + "step": 23652 + }, + { + "epoch": 0.7391875, + "grad_norm": 2.96875, + "grad_norm_var": 0.0329498291015625, + "learning_rate": 0.0001, + "loss": 5.8221, + "loss/crossentropy": 2.6814838647842407, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.164058618247509, + "step": 23654 + }, + { + "epoch": 0.73925, + "grad_norm": 2.859375, + "grad_norm_var": 0.0348297119140625, + "learning_rate": 0.0001, + "loss": 5.5606, + "loss/crossentropy": 2.570521354675293, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15682382881641388, + "step": 23656 + }, + { + "epoch": 0.7393125, + "grad_norm": 3.09375, + "grad_norm_var": 0.033014933268229164, + "learning_rate": 0.0001, + "loss": 5.5078, + "loss/crossentropy": 2.4352725744247437, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16155054420232773, + "step": 23658 + }, + { + "epoch": 0.739375, + "grad_norm": 2.96875, + "grad_norm_var": 0.034956868489583334, + "learning_rate": 0.0001, + "loss": 5.8403, + "loss/crossentropy": 2.633499503135681, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17263556271791458, + "step": 23660 + }, + { + "epoch": 0.7394375, + "grad_norm": 3.25, + "grad_norm_var": 0.0244537353515625, + "learning_rate": 0.0001, + "loss": 5.8884, + "loss/crossentropy": 2.6757001876831055, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17361054569482803, + "step": 23662 + }, + { + "epoch": 0.7395, + "grad_norm": 3.125, + "grad_norm_var": 0.026024373372395833, + "learning_rate": 0.0001, + "loss": 5.6931, + "loss/crossentropy": 2.592305064201355, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16594009846448898, + "step": 23664 + }, + { + "epoch": 0.7395625, + "grad_norm": 3.0, + "grad_norm_var": 0.016532389322916667, + "learning_rate": 0.0001, + "loss": 5.5975, + "loss/crossentropy": 2.524364471435547, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16161108016967773, + "step": 23666 + }, + { + "epoch": 0.739625, + "grad_norm": 3.328125, + "grad_norm_var": 0.027961222330729167, + "learning_rate": 0.0001, + "loss": 5.9427, + "loss/crossentropy": 2.7580608129501343, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17158909887075424, + "step": 23668 + }, + { + "epoch": 0.7396875, + "grad_norm": 3.203125, + "grad_norm_var": 0.028706868489583332, + "learning_rate": 0.0001, + "loss": 5.84, + "loss/crossentropy": 2.7155197858810425, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16635117679834366, + "step": 23670 + }, + { + "epoch": 0.73975, + "grad_norm": 3.375, + "grad_norm_var": 0.029781087239583334, + "learning_rate": 0.0001, + "loss": 5.8557, + "loss/crossentropy": 2.634499430656433, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17407264560461044, + "step": 23672 + }, + { + "epoch": 0.7398125, + "grad_norm": 3.46875, + "grad_norm_var": 0.039281209309895836, + "learning_rate": 0.0001, + "loss": 5.8832, + "loss/crossentropy": 2.57876718044281, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17810030281543732, + "step": 23674 + }, + { + "epoch": 0.739875, + "grad_norm": 3.125, + "grad_norm_var": 0.038411458333333336, + "learning_rate": 0.0001, + "loss": 5.7498, + "loss/crossentropy": 2.727282166481018, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15771931409835815, + "step": 23676 + }, + { + "epoch": 0.7399375, + "grad_norm": 3.046875, + "grad_norm_var": 0.03795572916666667, + "learning_rate": 0.0001, + "loss": 5.9869, + "loss/crossentropy": 2.7334845066070557, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17846722155809402, + "step": 23678 + }, + { + "epoch": 0.74, + "grad_norm": 2.953125, + "grad_norm_var": 0.035643513997395834, + "learning_rate": 0.0001, + "loss": 5.7289, + "loss/crossentropy": 2.726085901260376, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15379394590854645, + "step": 23680 + }, + { + "epoch": 0.7400625, + "grad_norm": 3.390625, + "grad_norm_var": 0.0536285400390625, + "learning_rate": 0.0001, + "loss": 5.4931, + "loss/crossentropy": 2.3859097957611084, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16697047650814056, + "step": 23682 + }, + { + "epoch": 0.740125, + "grad_norm": 3.25, + "grad_norm_var": 0.038263956705729164, + "learning_rate": 0.0001, + "loss": 5.6054, + "loss/crossentropy": 2.557660222053528, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16024072468280792, + "step": 23684 + }, + { + "epoch": 0.7401875, + "grad_norm": 3.140625, + "grad_norm_var": 0.03748270670572917, + "learning_rate": 0.0001, + "loss": 6.0288, + "loss/crossentropy": 2.8301045894622803, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1702597811818123, + "step": 23686 + }, + { + "epoch": 0.74025, + "grad_norm": 3.25, + "grad_norm_var": 0.0434478759765625, + "learning_rate": 0.0001, + "loss": 5.7256, + "loss/crossentropy": 2.661123752593994, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1627010926604271, + "step": 23688 + }, + { + "epoch": 0.7403125, + "grad_norm": 3.3125, + "grad_norm_var": 0.03803609212239583, + "learning_rate": 0.0001, + "loss": 5.8583, + "loss/crossentropy": 2.6228137016296387, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17159803211688995, + "step": 23690 + }, + { + "epoch": 0.740375, + "grad_norm": 3.234375, + "grad_norm_var": 0.03662821451822917, + "learning_rate": 0.0001, + "loss": 6.0075, + "loss/crossentropy": 2.808443069458008, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17146998643875122, + "step": 23692 + }, + { + "epoch": 0.7404375, + "grad_norm": 2.984375, + "grad_norm_var": 0.0408111572265625, + "learning_rate": 0.0001, + "loss": 5.7996, + "loss/crossentropy": 2.6491732597351074, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16621138155460358, + "step": 23694 + }, + { + "epoch": 0.7405, + "grad_norm": 3.328125, + "grad_norm_var": 0.0383697509765625, + "learning_rate": 0.0001, + "loss": 5.7836, + "loss/crossentropy": 2.545189380645752, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17696908861398697, + "step": 23696 + }, + { + "epoch": 0.7405625, + "grad_norm": 3.421875, + "grad_norm_var": 0.029320271809895833, + "learning_rate": 0.0001, + "loss": 5.5782, + "loss/crossentropy": 2.4729238748550415, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1652190387248993, + "step": 23698 + }, + { + "epoch": 0.740625, + "grad_norm": 3.15625, + "grad_norm_var": 0.028351847330729166, + "learning_rate": 0.0001, + "loss": 5.7948, + "loss/crossentropy": 2.632536292076111, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.17130043357610703, + "step": 23700 + }, + { + "epoch": 0.7406875, + "grad_norm": 2.921875, + "grad_norm_var": 0.030367024739583335, + "learning_rate": 0.0001, + "loss": 5.3192, + "loss/crossentropy": 2.3588616847991943, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.148768350481987, + "step": 23702 + }, + { + "epoch": 0.74075, + "grad_norm": 3.0, + "grad_norm_var": 0.029686482747395833, + "learning_rate": 0.0001, + "loss": 5.9918, + "loss/crossentropy": 2.700800657272339, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17910093069076538, + "step": 23704 + }, + { + "epoch": 0.7408125, + "grad_norm": 3.015625, + "grad_norm_var": 0.02974853515625, + "learning_rate": 0.0001, + "loss": 5.7756, + "loss/crossentropy": 2.7007559537887573, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16373586654663086, + "step": 23706 + }, + { + "epoch": 0.740875, + "grad_norm": 3.046875, + "grad_norm_var": 0.03736063639322917, + "learning_rate": 0.0001, + "loss": 5.5338, + "loss/crossentropy": 2.461254835128784, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16428812593221664, + "step": 23708 + }, + { + "epoch": 0.7409375, + "grad_norm": 3.015625, + "grad_norm_var": 0.039872233072916666, + "learning_rate": 0.0001, + "loss": 5.2952, + "loss/crossentropy": 2.367771625518799, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15134061872959137, + "step": 23710 + }, + { + "epoch": 0.741, + "grad_norm": 3.046875, + "grad_norm_var": 0.03721415201822917, + "learning_rate": 0.0001, + "loss": 5.6265, + "loss/crossentropy": 2.4913235902786255, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16663941740989685, + "step": 23712 + }, + { + "epoch": 0.7410625, + "grad_norm": 3.375, + "grad_norm_var": 0.03476155598958333, + "learning_rate": 0.0001, + "loss": 5.6103, + "loss/crossentropy": 2.4854609966278076, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1636602133512497, + "step": 23714 + }, + { + "epoch": 0.741125, + "grad_norm": 3.0, + "grad_norm_var": 0.03599853515625, + "learning_rate": 0.0001, + "loss": 5.7972, + "loss/crossentropy": 2.671536087989807, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16647516191005707, + "step": 23716 + }, + { + "epoch": 0.7411875, + "grad_norm": 2.984375, + "grad_norm_var": 0.04309895833333333, + "learning_rate": 0.0001, + "loss": 5.6616, + "loss/crossentropy": 2.609768509864807, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16221675276756287, + "step": 23718 + }, + { + "epoch": 0.74125, + "grad_norm": 3.265625, + "grad_norm_var": 0.03778889973958333, + "learning_rate": 0.0001, + "loss": 5.5797, + "loss/crossentropy": 2.4606499671936035, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1626831218600273, + "step": 23720 + }, + { + "epoch": 0.7413125, + "grad_norm": 3.34375, + "grad_norm_var": 0.039383951822916666, + "learning_rate": 0.0001, + "loss": 5.581, + "loss/crossentropy": 2.534591317176819, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1601075753569603, + "step": 23722 + }, + { + "epoch": 0.741375, + "grad_norm": 3.171875, + "grad_norm_var": 0.03013916015625, + "learning_rate": 0.0001, + "loss": 5.8479, + "loss/crossentropy": 2.680930733680725, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1698269248008728, + "step": 23724 + }, + { + "epoch": 0.7414375, + "grad_norm": 3.53125, + "grad_norm_var": 0.03504130045572917, + "learning_rate": 0.0001, + "loss": 5.7852, + "loss/crossentropy": 2.5099505186080933, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.175571970641613, + "step": 23726 + }, + { + "epoch": 0.7415, + "grad_norm": 3.171875, + "grad_norm_var": 0.0338287353515625, + "learning_rate": 0.0001, + "loss": 5.7857, + "loss/crossentropy": 2.6229697465896606, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16510528326034546, + "step": 23728 + }, + { + "epoch": 0.7415625, + "grad_norm": 3.046875, + "grad_norm_var": 0.0298004150390625, + "learning_rate": 0.0001, + "loss": 5.5974, + "loss/crossentropy": 2.504623770713806, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16006328165531158, + "step": 23730 + }, + { + "epoch": 0.741625, + "grad_norm": 3.46875, + "grad_norm_var": 0.0399810791015625, + "learning_rate": 0.0001, + "loss": 5.4548, + "loss/crossentropy": 2.3802608251571655, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16018792241811752, + "step": 23732 + }, + { + "epoch": 0.7416875, + "grad_norm": 3.140625, + "grad_norm_var": 0.0331451416015625, + "learning_rate": 0.0001, + "loss": 6.0567, + "loss/crossentropy": 2.829725742340088, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1758190467953682, + "step": 23734 + }, + { + "epoch": 0.74175, + "grad_norm": 3.0625, + "grad_norm_var": 0.03906962076822917, + "learning_rate": 0.0001, + "loss": 5.4737, + "loss/crossentropy": 2.4233882427215576, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15932518243789673, + "step": 23736 + }, + { + "epoch": 0.7418125, + "grad_norm": 3.1875, + "grad_norm_var": 0.03612874348958333, + "learning_rate": 0.0001, + "loss": 5.6637, + "loss/crossentropy": 2.5260519981384277, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16532868146896362, + "step": 23738 + }, + { + "epoch": 0.741875, + "grad_norm": 3.046875, + "grad_norm_var": 0.039839680989583334, + "learning_rate": 0.0001, + "loss": 5.5615, + "loss/crossentropy": 2.5807985067367554, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1551012247800827, + "step": 23740 + }, + { + "epoch": 0.7419375, + "grad_norm": 3.109375, + "grad_norm_var": 0.031966145833333334, + "learning_rate": 0.0001, + "loss": 5.6253, + "loss/crossentropy": 2.571129322052002, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16245053708553314, + "step": 23742 + }, + { + "epoch": 0.742, + "grad_norm": 2.984375, + "grad_norm_var": 0.0354400634765625, + "learning_rate": 0.0001, + "loss": 5.6157, + "loss/crossentropy": 2.533260226249695, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16175467520952225, + "step": 23744 + }, + { + "epoch": 0.7420625, + "grad_norm": 3.34375, + "grad_norm_var": 0.046875, + "learning_rate": 0.0001, + "loss": 6.0052, + "loss/crossentropy": 2.831053376197815, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1697554662823677, + "step": 23746 + }, + { + "epoch": 0.742125, + "grad_norm": 4.1875, + "grad_norm_var": 0.1024078369140625, + "learning_rate": 0.0001, + "loss": 5.6104, + "loss/crossentropy": 2.438306450843811, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16486083716154099, + "step": 23748 + }, + { + "epoch": 0.7421875, + "grad_norm": 3.25, + "grad_norm_var": 0.10673421223958333, + "learning_rate": 0.0001, + "loss": 5.8286, + "loss/crossentropy": 2.676417589187622, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1679568737745285, + "step": 23750 + }, + { + "epoch": 0.74225, + "grad_norm": 3.359375, + "grad_norm_var": 0.10194905598958333, + "learning_rate": 0.0001, + "loss": 5.7243, + "loss/crossentropy": 2.5188801288604736, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1717102751135826, + "step": 23752 + }, + { + "epoch": 0.7423125, + "grad_norm": 3.140625, + "grad_norm_var": 0.10461324055989583, + "learning_rate": 0.0001, + "loss": 5.5154, + "loss/crossentropy": 2.4078171253204346, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16466298699378967, + "step": 23754 + }, + { + "epoch": 0.742375, + "grad_norm": 2.90625, + "grad_norm_var": 0.11620992024739583, + "learning_rate": 0.0001, + "loss": 5.5563, + "loss/crossentropy": 2.47242271900177, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15955600142478943, + "step": 23756 + }, + { + "epoch": 0.7424375, + "grad_norm": 3.390625, + "grad_norm_var": 0.11027018229166667, + "learning_rate": 0.0001, + "loss": 5.5343, + "loss/crossentropy": 2.456767201423645, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1600937768816948, + "step": 23758 + }, + { + "epoch": 0.7425, + "grad_norm": 3.421875, + "grad_norm_var": 0.10869038899739583, + "learning_rate": 0.0001, + "loss": 5.8712, + "loss/crossentropy": 2.7310062646865845, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.17026840150356293, + "step": 23760 + }, + { + "epoch": 0.7425625, + "grad_norm": 3.0625, + "grad_norm_var": 0.09637044270833334, + "learning_rate": 0.0001, + "loss": 5.8395, + "loss/crossentropy": 2.7477962970733643, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16112085431814194, + "step": 23762 + }, + { + "epoch": 0.742625, + "grad_norm": 2.921875, + "grad_norm_var": 0.042455037434895836, + "learning_rate": 0.0001, + "loss": 5.8389, + "loss/crossentropy": 2.7224472761154175, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16633137315511703, + "step": 23764 + }, + { + "epoch": 0.7426875, + "grad_norm": 3.0, + "grad_norm_var": 0.04687093098958333, + "learning_rate": 0.0001, + "loss": 5.4884, + "loss/crossentropy": 2.4695459604263306, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15618199855089188, + "step": 23766 + }, + { + "epoch": 0.74275, + "grad_norm": 3.171875, + "grad_norm_var": 0.04413960774739583, + "learning_rate": 0.0001, + "loss": 5.8696, + "loss/crossentropy": 2.6840864419937134, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.17324209958314896, + "step": 23768 + }, + { + "epoch": 0.7428125, + "grad_norm": 3.15625, + "grad_norm_var": 0.04756571451822917, + "learning_rate": 0.0001, + "loss": 5.6721, + "loss/crossentropy": 2.6337684392929077, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15852554142475128, + "step": 23770 + }, + { + "epoch": 0.742875, + "grad_norm": 2.984375, + "grad_norm_var": 0.030516560872395834, + "learning_rate": 0.0001, + "loss": 5.485, + "loss/crossentropy": 2.437623381614685, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16060182452201843, + "step": 23772 + }, + { + "epoch": 0.7429375, + "grad_norm": 3.234375, + "grad_norm_var": 0.03082275390625, + "learning_rate": 0.0001, + "loss": 5.6025, + "loss/crossentropy": 2.4522485733032227, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.1618979573249817, + "step": 23774 + }, + { + "epoch": 0.743, + "grad_norm": 3.046875, + "grad_norm_var": 0.02603759765625, + "learning_rate": 0.0001, + "loss": 5.5423, + "loss/crossentropy": 2.5391552448272705, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.16047395020723343, + "step": 23776 + }, + { + "epoch": 0.7430625, + "grad_norm": 3.171875, + "grad_norm_var": 0.047587076822916664, + "learning_rate": 0.0001, + "loss": 5.678, + "loss/crossentropy": 2.538044571876526, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1667301505804062, + "step": 23778 + }, + { + "epoch": 0.743125, + "grad_norm": 3.015625, + "grad_norm_var": 0.045897420247395834, + "learning_rate": 0.0001, + "loss": 5.9731, + "loss/crossentropy": 2.809144616127014, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17030035704374313, + "step": 23780 + }, + { + "epoch": 0.7431875, + "grad_norm": 3.0625, + "grad_norm_var": 0.039061482747395834, + "learning_rate": 0.0001, + "loss": 5.7187, + "loss/crossentropy": 2.593548893928528, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1632988154888153, + "step": 23782 + }, + { + "epoch": 0.74325, + "grad_norm": 2.96875, + "grad_norm_var": 0.04285481770833333, + "learning_rate": 0.0001, + "loss": 5.5048, + "loss/crossentropy": 2.466431975364685, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1577415019273758, + "step": 23784 + }, + { + "epoch": 0.7433125, + "grad_norm": 2.859375, + "grad_norm_var": 0.044123331705729164, + "learning_rate": 0.0001, + "loss": 5.5427, + "loss/crossentropy": 2.5535603761672974, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15555761754512787, + "step": 23786 + }, + { + "epoch": 0.743375, + "grad_norm": 3.234375, + "grad_norm_var": 0.043257649739583334, + "learning_rate": 0.0001, + "loss": 5.8237, + "loss/crossentropy": 2.560186743736267, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17322541028261185, + "step": 23788 + }, + { + "epoch": 0.7434375, + "grad_norm": 3.125, + "grad_norm_var": 0.03844401041666667, + "learning_rate": 0.0001, + "loss": 5.8743, + "loss/crossentropy": 2.6807111501693726, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17131386697292328, + "step": 23790 + }, + { + "epoch": 0.7435, + "grad_norm": 3.0, + "grad_norm_var": 0.038752237955729164, + "learning_rate": 0.0001, + "loss": 5.7172, + "loss/crossentropy": 2.6006758213043213, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16243094950914383, + "step": 23792 + }, + { + "epoch": 0.7435625, + "grad_norm": 3.5625, + "grad_norm_var": 0.03437093098958333, + "learning_rate": 0.0001, + "loss": 6.0407, + "loss/crossentropy": 2.7483190298080444, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17728640139102936, + "step": 23794 + }, + { + "epoch": 0.743625, + "grad_norm": 3.28125, + "grad_norm_var": 0.03459879557291667, + "learning_rate": 0.0001, + "loss": 5.854, + "loss/crossentropy": 2.657385230064392, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17122593522071838, + "step": 23796 + }, + { + "epoch": 0.7436875, + "grad_norm": 3.203125, + "grad_norm_var": 0.03426005045572917, + "learning_rate": 0.0001, + "loss": 5.4711, + "loss/crossentropy": 2.4196746349334717, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15709207952022552, + "step": 23798 + }, + { + "epoch": 0.74375, + "grad_norm": 2.796875, + "grad_norm_var": 0.0406646728515625, + "learning_rate": 0.0001, + "loss": 5.457, + "loss/crossentropy": 2.4605520963668823, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15667151659727097, + "step": 23800 + }, + { + "epoch": 0.7438125, + "grad_norm": 3.453125, + "grad_norm_var": 0.0395416259765625, + "learning_rate": 0.0001, + "loss": 5.8347, + "loss/crossentropy": 2.6478604078292847, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16907228529453278, + "step": 23802 + }, + { + "epoch": 0.743875, + "grad_norm": 3.03125, + "grad_norm_var": 0.04052327473958333, + "learning_rate": 0.0001, + "loss": 5.7251, + "loss/crossentropy": 2.6099071502685547, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1662066951394081, + "step": 23804 + }, + { + "epoch": 0.7439375, + "grad_norm": 3.109375, + "grad_norm_var": 0.04078369140625, + "learning_rate": 0.0001, + "loss": 5.649, + "loss/crossentropy": 2.5908056497573853, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.160505473613739, + "step": 23806 + }, + { + "epoch": 0.744, + "grad_norm": 3.234375, + "grad_norm_var": 0.0348541259765625, + "learning_rate": 0.0001, + "loss": 5.6204, + "loss/crossentropy": 2.495275855064392, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1621190384030342, + "step": 23808 + }, + { + "epoch": 0.7440625, + "grad_norm": 3.234375, + "grad_norm_var": 0.0240631103515625, + "learning_rate": 0.0001, + "loss": 5.9735, + "loss/crossentropy": 2.73429536819458, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17626766860485077, + "step": 23810 + }, + { + "epoch": 0.744125, + "grad_norm": 2.984375, + "grad_norm_var": 0.02476806640625, + "learning_rate": 0.0001, + "loss": 5.9545, + "loss/crossentropy": 2.7888200283050537, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.170477956533432, + "step": 23812 + }, + { + "epoch": 0.7441875, + "grad_norm": 3.0, + "grad_norm_var": 0.026090494791666665, + "learning_rate": 0.0001, + "loss": 5.5622, + "loss/crossentropy": 2.5557327270507812, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15768138319253922, + "step": 23814 + }, + { + "epoch": 0.74425, + "grad_norm": 2.90625, + "grad_norm_var": 0.028385416666666666, + "learning_rate": 0.0001, + "loss": 5.5638, + "loss/crossentropy": 2.6190967559814453, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.15462665259838104, + "step": 23816 + }, + { + "epoch": 0.7443125, + "grad_norm": 3.09375, + "grad_norm_var": 0.021219889322916668, + "learning_rate": 0.0001, + "loss": 5.7951, + "loss/crossentropy": 2.600494861602783, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17023959755897522, + "step": 23818 + }, + { + "epoch": 0.744375, + "grad_norm": 3.125, + "grad_norm_var": 0.022175089518229166, + "learning_rate": 0.0001, + "loss": 5.365, + "loss/crossentropy": 2.3739324808120728, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15614111721515656, + "step": 23820 + }, + { + "epoch": 0.7444375, + "grad_norm": 3.234375, + "grad_norm_var": 0.027269490559895835, + "learning_rate": 0.0001, + "loss": 5.6678, + "loss/crossentropy": 2.529435157775879, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16384144872426987, + "step": 23822 + }, + { + "epoch": 0.7445, + "grad_norm": 3.21875, + "grad_norm_var": 0.03376363118489583, + "learning_rate": 0.0001, + "loss": 5.8025, + "loss/crossentropy": 2.5526143312454224, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17772244662046432, + "step": 23824 + }, + { + "epoch": 0.7445625, + "grad_norm": 3.203125, + "grad_norm_var": 0.042496744791666666, + "learning_rate": 0.0001, + "loss": 5.5248, + "loss/crossentropy": 2.4738471508026123, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15783115476369858, + "step": 23826 + }, + { + "epoch": 0.744625, + "grad_norm": 2.75, + "grad_norm_var": 0.05019429524739583, + "learning_rate": 0.0001, + "loss": 5.5796, + "loss/crossentropy": 2.524854302406311, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1566486656665802, + "step": 23828 + }, + { + "epoch": 0.7446875, + "grad_norm": 3.296875, + "grad_norm_var": 0.06464436848958334, + "learning_rate": 0.0001, + "loss": 5.839, + "loss/crossentropy": 2.54987108707428, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17930391430854797, + "step": 23830 + }, + { + "epoch": 0.74475, + "grad_norm": 3.140625, + "grad_norm_var": 0.05021870930989583, + "learning_rate": 0.0001, + "loss": 5.6872, + "loss/crossentropy": 2.557926297187805, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16840001940727234, + "step": 23832 + }, + { + "epoch": 0.7448125, + "grad_norm": 4.21875, + "grad_norm_var": 0.11629231770833333, + "learning_rate": 0.0001, + "loss": 5.6444, + "loss/crossentropy": 2.4489688873291016, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16680970042943954, + "step": 23834 + }, + { + "epoch": 0.744875, + "grad_norm": 2.875, + "grad_norm_var": 0.11988525390625, + "learning_rate": 0.0001, + "loss": 5.7966, + "loss/crossentropy": 2.7099976539611816, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1641286015510559, + "step": 23836 + }, + { + "epoch": 0.7449375, + "grad_norm": 3.078125, + "grad_norm_var": 0.12444661458333334, + "learning_rate": 0.0001, + "loss": 5.633, + "loss/crossentropy": 2.5509352684020996, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1609424576163292, + "step": 23838 + }, + { + "epoch": 0.745, + "grad_norm": 2.703125, + "grad_norm_var": 0.13852437337239584, + "learning_rate": 0.0001, + "loss": 5.4337, + "loss/crossentropy": 2.5036728382110596, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.14964640885591507, + "step": 23840 + }, + { + "epoch": 0.7450625, + "grad_norm": 3.25, + "grad_norm_var": 0.13062744140625, + "learning_rate": 0.0001, + "loss": 5.6479, + "loss/crossentropy": 2.545154333114624, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.15949159860610962, + "step": 23842 + }, + { + "epoch": 0.745125, + "grad_norm": 3.375, + "grad_norm_var": 0.12294514973958333, + "learning_rate": 0.0001, + "loss": 5.601, + "loss/crossentropy": 2.5171507596969604, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.15799562633037567, + "step": 23844 + }, + { + "epoch": 0.7451875, + "grad_norm": 2.796875, + "grad_norm_var": 0.1241363525390625, + "learning_rate": 0.0001, + "loss": 5.1171, + "loss/crossentropy": 2.2651237845420837, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.14535468816757202, + "step": 23846 + }, + { + "epoch": 0.74525, + "grad_norm": 2.796875, + "grad_norm_var": 0.13225911458333334, + "learning_rate": 0.0001, + "loss": 5.1535, + "loss/crossentropy": 2.32041072845459, + "loss/hidden": 1.390625, + "loss/jsd": 0.0, + "loss/logits": 0.14425078779459, + "step": 23848 + }, + { + "epoch": 0.7453125, + "grad_norm": 3.09375, + "grad_norm_var": 0.03590087890625, + "learning_rate": 0.0001, + "loss": 5.417, + "loss/crossentropy": 2.4762797355651855, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15071460604667664, + "step": 23850 + }, + { + "epoch": 0.745375, + "grad_norm": 2.96875, + "grad_norm_var": 0.03487040201822917, + "learning_rate": 0.0001, + "loss": 5.4015, + "loss/crossentropy": 2.346813201904297, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15742014348506927, + "step": 23852 + }, + { + "epoch": 0.7454375, + "grad_norm": 3.109375, + "grad_norm_var": 0.042821248372395836, + "learning_rate": 0.0001, + "loss": 5.8207, + "loss/crossentropy": 2.6550374031066895, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16969284415245056, + "step": 23854 + }, + { + "epoch": 0.7455, + "grad_norm": 2.875, + "grad_norm_var": 0.0358551025390625, + "learning_rate": 0.0001, + "loss": 5.4539, + "loss/crossentropy": 2.4543232917785645, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15699072182178497, + "step": 23856 + }, + { + "epoch": 0.7455625, + "grad_norm": 3.265625, + "grad_norm_var": 0.03863525390625, + "learning_rate": 0.0001, + "loss": 5.5597, + "loss/crossentropy": 2.4374698400497437, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16573935747146606, + "step": 23858 + }, + { + "epoch": 0.745625, + "grad_norm": 3.1875, + "grad_norm_var": 0.0354156494140625, + "learning_rate": 0.0001, + "loss": 5.8478, + "loss/crossentropy": 2.682563543319702, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.169651597738266, + "step": 23860 + }, + { + "epoch": 0.7456875, + "grad_norm": 3.234375, + "grad_norm_var": 0.030321248372395835, + "learning_rate": 0.0001, + "loss": 5.2931, + "loss/crossentropy": 2.266256332397461, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1569819077849388, + "step": 23862 + }, + { + "epoch": 0.74575, + "grad_norm": 2.890625, + "grad_norm_var": 0.024120076497395834, + "learning_rate": 0.0001, + "loss": 5.5981, + "loss/crossentropy": 2.5668485164642334, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1589885726571083, + "step": 23864 + }, + { + "epoch": 0.7458125, + "grad_norm": 3.34375, + "grad_norm_var": 0.028734334309895835, + "learning_rate": 0.0001, + "loss": 5.6187, + "loss/crossentropy": 2.527132987976074, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16462143510580063, + "step": 23866 + }, + { + "epoch": 0.745875, + "grad_norm": 2.96875, + "grad_norm_var": 0.0303619384765625, + "learning_rate": 0.0001, + "loss": 5.5882, + "loss/crossentropy": 2.5037399530410767, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16040148586034775, + "step": 23868 + }, + { + "epoch": 0.7459375, + "grad_norm": 3.25, + "grad_norm_var": 0.028547159830729165, + "learning_rate": 0.0001, + "loss": 5.4586, + "loss/crossentropy": 2.3871986865997314, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1590976044535637, + "step": 23870 + }, + { + "epoch": 0.746, + "grad_norm": 2.953125, + "grad_norm_var": 0.0265045166015625, + "learning_rate": 0.0001, + "loss": 5.6925, + "loss/crossentropy": 2.6443164348602295, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1610671430826187, + "step": 23872 + }, + { + "epoch": 0.7460625, + "grad_norm": 2.921875, + "grad_norm_var": 0.022606404622395833, + "learning_rate": 0.0001, + "loss": 5.7462, + "loss/crossentropy": 2.6622105836868286, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15956847369670868, + "step": 23874 + }, + { + "epoch": 0.746125, + "grad_norm": 3.375, + "grad_norm_var": 0.027887980143229168, + "learning_rate": 0.0001, + "loss": 5.7199, + "loss/crossentropy": 2.5624345541000366, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16652469336986542, + "step": 23876 + }, + { + "epoch": 0.7461875, + "grad_norm": 2.90625, + "grad_norm_var": 0.03357747395833333, + "learning_rate": 0.0001, + "loss": 5.5662, + "loss/crossentropy": 2.488021492958069, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.15859421342611313, + "step": 23878 + }, + { + "epoch": 0.74625, + "grad_norm": 3.28125, + "grad_norm_var": 0.03280843098958333, + "learning_rate": 0.0001, + "loss": 5.5614, + "loss/crossentropy": 2.439563274383545, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16257558017969131, + "step": 23880 + }, + { + "epoch": 0.7463125, + "grad_norm": 2.90625, + "grad_norm_var": 0.030517578125, + "learning_rate": 0.0001, + "loss": 5.7069, + "loss/crossentropy": 2.659354090690613, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16178329288959503, + "step": 23882 + }, + { + "epoch": 0.746375, + "grad_norm": 3.09375, + "grad_norm_var": 0.028450520833333333, + "learning_rate": 0.0001, + "loss": 5.8174, + "loss/crossentropy": 2.6543742418289185, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16864780336618423, + "step": 23884 + }, + { + "epoch": 0.7464375, + "grad_norm": 3.09375, + "grad_norm_var": 0.0263671875, + "learning_rate": 0.0001, + "loss": 5.6579, + "loss/crossentropy": 2.539881706237793, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16532030701637268, + "step": 23886 + }, + { + "epoch": 0.7465, + "grad_norm": 3.1875, + "grad_norm_var": 0.024925740559895833, + "learning_rate": 0.0001, + "loss": 5.7044, + "loss/crossentropy": 2.5410473346710205, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17024239152669907, + "step": 23888 + }, + { + "epoch": 0.7465625, + "grad_norm": 3.21875, + "grad_norm_var": 0.5532948811848958, + "learning_rate": 0.0001, + "loss": 6.1183, + "loss/crossentropy": 2.7540390491485596, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.18290842324495316, + "step": 23890 + }, + { + "epoch": 0.746625, + "grad_norm": 3.15625, + "grad_norm_var": 0.5614003499348958, + "learning_rate": 0.0001, + "loss": 5.4943, + "loss/crossentropy": 2.4996707439422607, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1553225964307785, + "step": 23892 + }, + { + "epoch": 0.7466875, + "grad_norm": 3.296875, + "grad_norm_var": 0.5522450764973958, + "learning_rate": 0.0001, + "loss": 5.5942, + "loss/crossentropy": 2.500562906265259, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1648305132985115, + "step": 23894 + }, + { + "epoch": 0.74675, + "grad_norm": 2.71875, + "grad_norm_var": 0.5752675374348958, + "learning_rate": 0.0001, + "loss": 5.5396, + "loss/crossentropy": 2.5759114027023315, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15417730808258057, + "step": 23896 + }, + { + "epoch": 0.7468125, + "grad_norm": 3.421875, + "grad_norm_var": 0.5693522135416667, + "learning_rate": 0.0001, + "loss": 5.8975, + "loss/crossentropy": 2.683432936668396, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17257541418075562, + "step": 23898 + }, + { + "epoch": 0.746875, + "grad_norm": 3.421875, + "grad_norm_var": 0.5674875895182292, + "learning_rate": 0.0001, + "loss": 5.7925, + "loss/crossentropy": 2.609610438346863, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17024364322423935, + "step": 23900 + }, + { + "epoch": 0.7469375, + "grad_norm": 3.515625, + "grad_norm_var": 0.5640462239583334, + "learning_rate": 0.0001, + "loss": 6.1357, + "loss/crossentropy": 2.832602620124817, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17952553182840347, + "step": 23902 + }, + { + "epoch": 0.747, + "grad_norm": 3.03125, + "grad_norm_var": 0.58209228515625, + "learning_rate": 0.0001, + "loss": 5.2262, + "loss/crossentropy": 2.287443995475769, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.14856766909360886, + "step": 23904 + }, + { + "epoch": 0.7470625, + "grad_norm": 3.484375, + "grad_norm_var": 0.06466471354166667, + "learning_rate": 0.0001, + "loss": 5.5386, + "loss/crossentropy": 2.4046937227249146, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1649523451924324, + "step": 23906 + }, + { + "epoch": 0.747125, + "grad_norm": 2.984375, + "grad_norm_var": 0.06090087890625, + "learning_rate": 0.0001, + "loss": 5.4569, + "loss/crossentropy": 2.4906638860702515, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15521354228258133, + "step": 23908 + }, + { + "epoch": 0.7471875, + "grad_norm": 3.0, + "grad_norm_var": 0.0646392822265625, + "learning_rate": 0.0001, + "loss": 5.4602, + "loss/crossentropy": 2.4620014429092407, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15723932534456253, + "step": 23910 + }, + { + "epoch": 0.74725, + "grad_norm": 2.734375, + "grad_norm_var": 0.06411844889322917, + "learning_rate": 0.0001, + "loss": 5.5213, + "loss/crossentropy": 2.5211902856826782, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15704332292079926, + "step": 23912 + }, + { + "epoch": 0.7473125, + "grad_norm": 3.359375, + "grad_norm_var": 0.062409464518229166, + "learning_rate": 0.0001, + "loss": 5.8967, + "loss/crossentropy": 2.6899460554122925, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17223425954580307, + "step": 23914 + }, + { + "epoch": 0.747375, + "grad_norm": 3.125, + "grad_norm_var": 0.050902303059895834, + "learning_rate": 0.0001, + "loss": 5.7158, + "loss/crossentropy": 2.600480079650879, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16504865884780884, + "step": 23916 + }, + { + "epoch": 0.7474375, + "grad_norm": 3.3125, + "grad_norm_var": 0.0417633056640625, + "learning_rate": 0.0001, + "loss": 5.8917, + "loss/crossentropy": 2.760825276374817, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16464951634407043, + "step": 23918 + }, + { + "epoch": 0.7475, + "grad_norm": 3.15625, + "grad_norm_var": 0.0383209228515625, + "learning_rate": 0.0001, + "loss": 5.6164, + "loss/crossentropy": 2.49069881439209, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.167646124958992, + "step": 23920 + }, + { + "epoch": 0.7475625, + "grad_norm": 3.03125, + "grad_norm_var": 0.0501953125, + "learning_rate": 0.0001, + "loss": 5.6284, + "loss/crossentropy": 2.4274455308914185, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.16697538644075394, + "step": 23922 + }, + { + "epoch": 0.747625, + "grad_norm": 3.046875, + "grad_norm_var": 0.0490142822265625, + "learning_rate": 0.0001, + "loss": 5.1744, + "loss/crossentropy": 2.2075916528701782, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15410692989826202, + "step": 23924 + }, + { + "epoch": 0.7476875, + "grad_norm": 3.234375, + "grad_norm_var": 0.05569254557291667, + "learning_rate": 0.0001, + "loss": 5.4039, + "loss/crossentropy": 2.415930151939392, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15426428616046906, + "step": 23926 + }, + { + "epoch": 0.74775, + "grad_norm": 3.34375, + "grad_norm_var": 0.05054423014322917, + "learning_rate": 0.0001, + "loss": 5.744, + "loss/crossentropy": 2.6106520891189575, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16646113991737366, + "step": 23928 + }, + { + "epoch": 0.7478125, + "grad_norm": 3.015625, + "grad_norm_var": 0.04761962890625, + "learning_rate": 0.0001, + "loss": 5.4452, + "loss/crossentropy": 2.40711510181427, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15966404229402542, + "step": 23930 + }, + { + "epoch": 0.747875, + "grad_norm": 4.21875, + "grad_norm_var": 0.12275390625, + "learning_rate": 0.0001, + "loss": 5.8813, + "loss/crossentropy": 2.5960358381271362, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.1742255985736847, + "step": 23932 + }, + { + "epoch": 0.7479375, + "grad_norm": 3.46875, + "grad_norm_var": 0.14806315104166667, + "learning_rate": 0.0001, + "loss": 5.5221, + "loss/crossentropy": 2.374852418899536, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.16120747476816177, + "step": 23934 + }, + { + "epoch": 0.748, + "grad_norm": 3.265625, + "grad_norm_var": 0.14806315104166667, + "learning_rate": 0.0001, + "loss": 5.6962, + "loss/crossentropy": 2.648604393005371, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15671633183956146, + "step": 23936 + }, + { + "epoch": 0.7480625, + "grad_norm": 3.171875, + "grad_norm_var": 0.1526519775390625, + "learning_rate": 0.0001, + "loss": 5.2132, + "loss/crossentropy": 2.3242075443267822, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.14671359956264496, + "step": 23938 + }, + { + "epoch": 0.748125, + "grad_norm": 2.921875, + "grad_norm_var": 0.16282145182291666, + "learning_rate": 0.0001, + "loss": 5.6767, + "loss/crossentropy": 2.611236333847046, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15967395156621933, + "step": 23940 + }, + { + "epoch": 0.7481875, + "grad_norm": 3.046875, + "grad_norm_var": 0.1510650634765625, + "learning_rate": 0.0001, + "loss": 5.8731, + "loss/crossentropy": 2.672127604484558, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17399899661540985, + "step": 23942 + }, + { + "epoch": 0.74825, + "grad_norm": 2.96875, + "grad_norm_var": 0.15441792805989582, + "learning_rate": 0.0001, + "loss": 5.5507, + "loss/crossentropy": 2.551435947418213, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15734337270259857, + "step": 23944 + }, + { + "epoch": 0.7483125, + "grad_norm": 3.390625, + "grad_norm_var": 0.15718485514322916, + "learning_rate": 0.0001, + "loss": 5.6394, + "loss/crossentropy": 2.5235499143600464, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16393250226974487, + "step": 23946 + }, + { + "epoch": 0.748375, + "grad_norm": 3.1875, + "grad_norm_var": 0.08142801920572916, + "learning_rate": 0.0001, + "loss": 5.7568, + "loss/crossentropy": 2.6940174102783203, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16408731043338776, + "step": 23948 + }, + { + "epoch": 0.7484375, + "grad_norm": 2.9375, + "grad_norm_var": 0.06448465983072917, + "learning_rate": 0.0001, + "loss": 5.6808, + "loss/crossentropy": 2.5052374601364136, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17029106616973877, + "step": 23950 + }, + { + "epoch": 0.7485, + "grad_norm": 3.40625, + "grad_norm_var": 0.0692535400390625, + "learning_rate": 0.0001, + "loss": 5.6076, + "loss/crossentropy": 2.44515597820282, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1674189642071724, + "step": 23952 + }, + { + "epoch": 0.7485625, + "grad_norm": 3.59375, + "grad_norm_var": 0.1969879150390625, + "learning_rate": 0.0001, + "loss": 5.9776, + "loss/crossentropy": 2.746953010559082, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1730656549334526, + "step": 23954 + }, + { + "epoch": 0.748625, + "grad_norm": 3.21875, + "grad_norm_var": 0.1804840087890625, + "learning_rate": 0.0001, + "loss": 5.5424, + "loss/crossentropy": 2.4625093936920166, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16267801821231842, + "step": 23956 + }, + { + "epoch": 0.7486875, + "grad_norm": 3.046875, + "grad_norm_var": 0.1767974853515625, + "learning_rate": 0.0001, + "loss": 5.5192, + "loss/crossentropy": 2.421678066253662, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1659986898303032, + "step": 23958 + }, + { + "epoch": 0.74875, + "grad_norm": 2.8125, + "grad_norm_var": 0.17277730305989583, + "learning_rate": 0.0001, + "loss": 5.3055, + "loss/crossentropy": 2.3576548099517822, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1502518653869629, + "step": 23960 + }, + { + "epoch": 0.7488125, + "grad_norm": 3.140625, + "grad_norm_var": 0.17234700520833332, + "learning_rate": 0.0001, + "loss": 5.7065, + "loss/crossentropy": 2.5937485694885254, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16517798602581024, + "step": 23962 + }, + { + "epoch": 0.748875, + "grad_norm": 3.390625, + "grad_norm_var": 0.16630859375, + "learning_rate": 0.0001, + "loss": 5.3547, + "loss/crossentropy": 2.3310216665267944, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15549761056900024, + "step": 23964 + }, + { + "epoch": 0.7489375, + "grad_norm": 3.234375, + "grad_norm_var": 0.1443359375, + "learning_rate": 0.0001, + "loss": 5.9194, + "loss/crossentropy": 2.7323626279830933, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1690945252776146, + "step": 23966 + }, + { + "epoch": 0.749, + "grad_norm": 2.96875, + "grad_norm_var": 0.15989481608072917, + "learning_rate": 0.0001, + "loss": 5.2454, + "loss/crossentropy": 2.316407322883606, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15227185189723969, + "step": 23968 + }, + { + "epoch": 0.7490625, + "grad_norm": 2.9375, + "grad_norm_var": 0.027457682291666667, + "learning_rate": 0.0001, + "loss": 5.5194, + "loss/crossentropy": 2.451360821723938, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16305788606405258, + "step": 23970 + }, + { + "epoch": 0.749125, + "grad_norm": 3.28125, + "grad_norm_var": 0.031956990559895836, + "learning_rate": 0.0001, + "loss": 5.4111, + "loss/crossentropy": 2.38328218460083, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15473628044128418, + "step": 23972 + }, + { + "epoch": 0.7491875, + "grad_norm": 3.40625, + "grad_norm_var": 0.036961873372395836, + "learning_rate": 0.0001, + "loss": 5.7902, + "loss/crossentropy": 2.6268088817596436, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16868211328983307, + "step": 23974 + }, + { + "epoch": 0.74925, + "grad_norm": 3.15625, + "grad_norm_var": 0.0292388916015625, + "learning_rate": 0.0001, + "loss": 5.7644, + "loss/crossentropy": 2.583236336708069, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17045753449201584, + "step": 23976 + }, + { + "epoch": 0.7493125, + "grad_norm": 2.796875, + "grad_norm_var": 0.03853251139322917, + "learning_rate": 0.0001, + "loss": 5.3857, + "loss/crossentropy": 2.4263709783554077, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1506212204694748, + "step": 23978 + }, + { + "epoch": 0.749375, + "grad_norm": 3.0625, + "grad_norm_var": 0.0349517822265625, + "learning_rate": 0.0001, + "loss": 5.8157, + "loss/crossentropy": 2.6879948377609253, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1662856861948967, + "step": 23980 + }, + { + "epoch": 0.7494375, + "grad_norm": 3.015625, + "grad_norm_var": 0.03521728515625, + "learning_rate": 0.0001, + "loss": 5.4518, + "loss/crossentropy": 2.350409746170044, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16052792966365814, + "step": 23982 + }, + { + "epoch": 0.7495, + "grad_norm": 3.046875, + "grad_norm_var": 0.0280181884765625, + "learning_rate": 0.0001, + "loss": 5.2946, + "loss/crossentropy": 2.3216720819473267, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15119792520999908, + "step": 23984 + }, + { + "epoch": 0.7495625, + "grad_norm": 3.046875, + "grad_norm_var": 0.026366170247395834, + "learning_rate": 0.0001, + "loss": 5.3652, + "loss/crossentropy": 2.35917329788208, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15685203671455383, + "step": 23986 + }, + { + "epoch": 0.749625, + "grad_norm": 3.03125, + "grad_norm_var": 0.02017822265625, + "learning_rate": 0.0001, + "loss": 5.545, + "loss/crossentropy": 2.5058690309524536, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15938085317611694, + "step": 23988 + }, + { + "epoch": 0.7496875, + "grad_norm": 3.296875, + "grad_norm_var": 0.013277180989583333, + "learning_rate": 0.0001, + "loss": 5.814, + "loss/crossentropy": 2.5942448377609253, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17197741568088531, + "step": 23990 + }, + { + "epoch": 0.74975, + "grad_norm": 3.09375, + "grad_norm_var": 0.013765462239583333, + "learning_rate": 0.0001, + "loss": 5.4701, + "loss/crossentropy": 2.4266786575317383, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1578565388917923, + "step": 23992 + }, + { + "epoch": 0.7498125, + "grad_norm": 3.109375, + "grad_norm_var": 0.011668904622395834, + "learning_rate": 0.0001, + "loss": 5.4139, + "loss/crossentropy": 2.3503929376602173, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1567407101392746, + "step": 23994 + }, + { + "epoch": 0.749875, + "grad_norm": 3.625, + "grad_norm_var": 0.026725260416666667, + "learning_rate": 0.0001, + "loss": 6.0104, + "loss/crossentropy": 2.7236788272857666, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17789245396852493, + "step": 23996 + }, + { + "epoch": 0.7499375, + "grad_norm": 2.953125, + "grad_norm_var": 0.027490234375, + "learning_rate": 0.0001, + "loss": 5.7012, + "loss/crossentropy": 2.6194682121276855, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16364064812660217, + "step": 23998 + }, + { + "epoch": 0.75, + "grad_norm": 3.4375, + "grad_norm_var": 0.03530171712239583, + "learning_rate": 0.0001, + "loss": 5.6843, + "loss/crossentropy": 2.554969072341919, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16254231333732605, + "step": 24000 + }, + { + "epoch": 0.7500625, + "grad_norm": 2.703125, + "grad_norm_var": 0.04576416015625, + "learning_rate": 0.0001, + "loss": 5.3964, + "loss/crossentropy": 2.4495993852615356, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15171610563993454, + "step": 24002 + }, + { + "epoch": 0.750125, + "grad_norm": 2.921875, + "grad_norm_var": 0.051081339518229164, + "learning_rate": 0.0001, + "loss": 5.7108, + "loss/crossentropy": 2.550310969352722, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1687837466597557, + "step": 24004 + }, + { + "epoch": 0.7501875, + "grad_norm": 3.5, + "grad_norm_var": 0.05676981608072917, + "learning_rate": 0.0001, + "loss": 5.9811, + "loss/crossentropy": 2.6983426809310913, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.17554175853729248, + "step": 24006 + }, + { + "epoch": 0.75025, + "grad_norm": 3.296875, + "grad_norm_var": 0.05952046712239583, + "learning_rate": 0.0001, + "loss": 5.6996, + "loss/crossentropy": 2.505595564842224, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17330221831798553, + "step": 24008 + }, + { + "epoch": 0.7503125, + "grad_norm": 3.203125, + "grad_norm_var": 0.05813802083333333, + "learning_rate": 0.0001, + "loss": 5.6695, + "loss/crossentropy": 2.5647228956222534, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.159697987139225, + "step": 24010 + }, + { + "epoch": 0.750375, + "grad_norm": 3.0625, + "grad_norm_var": 0.048567708333333334, + "learning_rate": 0.0001, + "loss": 5.7166, + "loss/crossentropy": 2.544753909111023, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17030484229326248, + "step": 24012 + }, + { + "epoch": 0.7504375, + "grad_norm": 3.078125, + "grad_norm_var": 0.052245076497395834, + "learning_rate": 0.0001, + "loss": 5.3793, + "loss/crossentropy": 2.4289255142211914, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15284773707389832, + "step": 24014 + }, + { + "epoch": 0.7505, + "grad_norm": 3.078125, + "grad_norm_var": 0.04543355305989583, + "learning_rate": 0.0001, + "loss": 5.5457, + "loss/crossentropy": 2.4863643646240234, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16062580794095993, + "step": 24016 + }, + { + "epoch": 0.7505625, + "grad_norm": 2.78125, + "grad_norm_var": 0.047607421875, + "learning_rate": 0.0001, + "loss": 5.5945, + "loss/crossentropy": 2.5885279178619385, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15763090550899506, + "step": 24018 + }, + { + "epoch": 0.750625, + "grad_norm": 2.9375, + "grad_norm_var": 0.04299723307291667, + "learning_rate": 0.0001, + "loss": 5.6919, + "loss/crossentropy": 2.659751534461975, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15985984355211258, + "step": 24020 + }, + { + "epoch": 0.7506875, + "grad_norm": 3.015625, + "grad_norm_var": 0.031966145833333334, + "learning_rate": 0.0001, + "loss": 5.7268, + "loss/crossentropy": 2.62524950504303, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16210521757602692, + "step": 24022 + }, + { + "epoch": 0.75075, + "grad_norm": 3.203125, + "grad_norm_var": 0.0222320556640625, + "learning_rate": 0.0001, + "loss": 5.5927, + "loss/crossentropy": 2.542557954788208, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1589244231581688, + "step": 24024 + }, + { + "epoch": 0.7508125, + "grad_norm": 3.359375, + "grad_norm_var": 0.028120930989583334, + "learning_rate": 0.0001, + "loss": 5.8997, + "loss/crossentropy": 2.760553002357483, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16665276139974594, + "step": 24026 + }, + { + "epoch": 0.750875, + "grad_norm": 3.0625, + "grad_norm_var": 0.0209625244140625, + "learning_rate": 0.0001, + "loss": 5.6023, + "loss/crossentropy": 2.5729691982269287, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1595735400915146, + "step": 24028 + }, + { + "epoch": 0.7509375, + "grad_norm": 3.203125, + "grad_norm_var": 0.021451822916666665, + "learning_rate": 0.0001, + "loss": 5.6939, + "loss/crossentropy": 2.6090248823165894, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16356821358203888, + "step": 24030 + }, + { + "epoch": 0.751, + "grad_norm": 2.828125, + "grad_norm_var": 0.023566691080729167, + "learning_rate": 0.0001, + "loss": 5.3827, + "loss/crossentropy": 2.3962953090667725, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15567557513713837, + "step": 24032 + }, + { + "epoch": 0.7510625, + "grad_norm": 3.1875, + "grad_norm_var": 0.018929036458333333, + "learning_rate": 0.0001, + "loss": 5.4211, + "loss/crossentropy": 2.427218198776245, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1536843404173851, + "step": 24034 + }, + { + "epoch": 0.751125, + "grad_norm": 3.109375, + "grad_norm_var": 0.018648274739583335, + "learning_rate": 0.0001, + "loss": 5.85, + "loss/crossentropy": 2.7219003438949585, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16593755781650543, + "step": 24036 + }, + { + "epoch": 0.7511875, + "grad_norm": 3.1875, + "grad_norm_var": 0.021735636393229167, + "learning_rate": 0.0001, + "loss": 5.9337, + "loss/crossentropy": 2.666556239128113, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17593314498662949, + "step": 24038 + }, + { + "epoch": 0.75125, + "grad_norm": 2.921875, + "grad_norm_var": 0.021903483072916667, + "learning_rate": 0.0001, + "loss": 5.3222, + "loss/crossentropy": 2.3676997423171997, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.1548285260796547, + "step": 24040 + }, + { + "epoch": 0.7513125, + "grad_norm": 2.9375, + "grad_norm_var": 0.015461222330729166, + "learning_rate": 0.0001, + "loss": 5.5964, + "loss/crossentropy": 2.4753365516662598, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16522684693336487, + "step": 24042 + }, + { + "epoch": 0.751375, + "grad_norm": 2.984375, + "grad_norm_var": 0.013981119791666666, + "learning_rate": 0.0001, + "loss": 5.5272, + "loss/crossentropy": 2.5083736181259155, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15735136717557907, + "step": 24044 + }, + { + "epoch": 0.7514375, + "grad_norm": 2.875, + "grad_norm_var": 0.014762369791666667, + "learning_rate": 0.0001, + "loss": 5.1681, + "loss/crossentropy": 2.299069046974182, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.14627868682146072, + "step": 24046 + }, + { + "epoch": 0.7515, + "grad_norm": 3.09375, + "grad_norm_var": 0.012007649739583333, + "learning_rate": 0.0001, + "loss": 5.7343, + "loss/crossentropy": 2.62216579914093, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16551008820533752, + "step": 24048 + }, + { + "epoch": 0.7515625, + "grad_norm": 3.140625, + "grad_norm_var": 0.013890584309895834, + "learning_rate": 0.0001, + "loss": 5.6754, + "loss/crossentropy": 2.5800259113311768, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.161882646381855, + "step": 24050 + }, + { + "epoch": 0.751625, + "grad_norm": 2.984375, + "grad_norm_var": 0.0150054931640625, + "learning_rate": 0.0001, + "loss": 5.5522, + "loss/crossentropy": 2.519499659538269, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16030453890562057, + "step": 24052 + }, + { + "epoch": 0.7516875, + "grad_norm": 3.046875, + "grad_norm_var": 0.0165435791015625, + "learning_rate": 0.0001, + "loss": 5.7906, + "loss/crossentropy": 2.6092609167099, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16735679656267166, + "step": 24054 + }, + { + "epoch": 0.75175, + "grad_norm": 3.140625, + "grad_norm_var": 0.0208984375, + "learning_rate": 0.0001, + "loss": 5.5616, + "loss/crossentropy": 2.5605705976486206, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15869764983654022, + "step": 24056 + }, + { + "epoch": 0.7518125, + "grad_norm": 2.953125, + "grad_norm_var": 0.0206451416015625, + "learning_rate": 0.0001, + "loss": 5.6485, + "loss/crossentropy": 2.631874203681946, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15752369165420532, + "step": 24058 + }, + { + "epoch": 0.751875, + "grad_norm": 2.75, + "grad_norm_var": 0.026497395833333333, + "learning_rate": 0.0001, + "loss": 5.5585, + "loss/crossentropy": 2.5394619703292847, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15971557796001434, + "step": 24060 + }, + { + "epoch": 0.7519375, + "grad_norm": 3.15625, + "grad_norm_var": 0.024409993489583334, + "learning_rate": 0.0001, + "loss": 5.7917, + "loss/crossentropy": 2.7013940811157227, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16293393820524216, + "step": 24062 + }, + { + "epoch": 0.752, + "grad_norm": 3.171875, + "grad_norm_var": 0.025153605143229167, + "learning_rate": 0.0001, + "loss": 5.8292, + "loss/crossentropy": 2.7043410539627075, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16717380285263062, + "step": 24064 + }, + { + "epoch": 0.7520625, + "grad_norm": 2.953125, + "grad_norm_var": 0.022777303059895834, + "learning_rate": 0.0001, + "loss": 5.4734, + "loss/crossentropy": 2.470320701599121, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1569492593407631, + "step": 24066 + }, + { + "epoch": 0.752125, + "grad_norm": 3.1875, + "grad_norm_var": 0.023631795247395834, + "learning_rate": 0.0001, + "loss": 5.554, + "loss/crossentropy": 2.4775390625, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16311536729335785, + "step": 24068 + }, + { + "epoch": 0.7521875, + "grad_norm": 3.390625, + "grad_norm_var": 0.027220662434895834, + "learning_rate": 0.0001, + "loss": 5.5499, + "loss/crossentropy": 2.4541794061660767, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.15956879407167435, + "step": 24070 + }, + { + "epoch": 0.75225, + "grad_norm": 3.59375, + "grad_norm_var": 0.0439605712890625, + "learning_rate": 0.0001, + "loss": 5.2566, + "loss/crossentropy": 2.3025163412094116, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.14736465364694595, + "step": 24072 + }, + { + "epoch": 0.7523125, + "grad_norm": 3.4375, + "grad_norm_var": 0.04894205729166667, + "learning_rate": 0.0001, + "loss": 5.9081, + "loss/crossentropy": 2.5796762704849243, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17971716821193695, + "step": 24074 + }, + { + "epoch": 0.752375, + "grad_norm": 3.234375, + "grad_norm_var": 0.04103902180989583, + "learning_rate": 0.0001, + "loss": 5.484, + "loss/crossentropy": 2.3495601415634155, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16579163074493408, + "step": 24076 + }, + { + "epoch": 0.7524375, + "grad_norm": 3.4375, + "grad_norm_var": 0.07107747395833333, + "learning_rate": 0.0001, + "loss": 5.9585, + "loss/crossentropy": 2.749734044075012, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.16814526170492172, + "step": 24078 + }, + { + "epoch": 0.7525, + "grad_norm": 3.1875, + "grad_norm_var": 0.07060445149739583, + "learning_rate": 0.0001, + "loss": 5.6366, + "loss/crossentropy": 2.507352113723755, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16760773211717606, + "step": 24080 + }, + { + "epoch": 0.7525625, + "grad_norm": 2.75, + "grad_norm_var": 0.09156494140625, + "learning_rate": 0.0001, + "loss": 5.5621, + "loss/crossentropy": 2.5038325786590576, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15777898579835892, + "step": 24082 + }, + { + "epoch": 0.752625, + "grad_norm": 2.96875, + "grad_norm_var": 0.09780171712239584, + "learning_rate": 0.0001, + "loss": 5.8253, + "loss/crossentropy": 2.680901050567627, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1683458387851715, + "step": 24084 + }, + { + "epoch": 0.7526875, + "grad_norm": 2.984375, + "grad_norm_var": 0.09470926920572917, + "learning_rate": 0.0001, + "loss": 5.4327, + "loss/crossentropy": 2.367120862007141, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16280563920736313, + "step": 24086 + }, + { + "epoch": 0.75275, + "grad_norm": 3.09375, + "grad_norm_var": 0.0748687744140625, + "learning_rate": 0.0001, + "loss": 5.2026, + "loss/crossentropy": 2.247442364692688, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.15371931344270706, + "step": 24088 + }, + { + "epoch": 0.7528125, + "grad_norm": 3.078125, + "grad_norm_var": 0.08723551432291667, + "learning_rate": 0.0001, + "loss": 5.3154, + "loss/crossentropy": 2.3488672971725464, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15368030965328217, + "step": 24090 + }, + { + "epoch": 0.752875, + "grad_norm": 2.90625, + "grad_norm_var": 0.08950093587239584, + "learning_rate": 0.0001, + "loss": 5.4821, + "loss/crossentropy": 2.4977235794067383, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15391092747449875, + "step": 24092 + }, + { + "epoch": 0.7529375, + "grad_norm": 3.21875, + "grad_norm_var": 0.04742431640625, + "learning_rate": 0.0001, + "loss": 5.6068, + "loss/crossentropy": 2.4802398681640625, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.166949562728405, + "step": 24094 + }, + { + "epoch": 0.753, + "grad_norm": 3.609375, + "grad_norm_var": 0.06437886555989583, + "learning_rate": 0.0001, + "loss": 5.3373, + "loss/crossentropy": 2.3183417320251465, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15814294666051865, + "step": 24096 + }, + { + "epoch": 0.7530625, + "grad_norm": 3.140625, + "grad_norm_var": 0.039449055989583336, + "learning_rate": 0.0001, + "loss": 5.7298, + "loss/crossentropy": 2.5677947998046875, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17049698531627655, + "step": 24098 + }, + { + "epoch": 0.753125, + "grad_norm": 2.96875, + "grad_norm_var": 0.0394927978515625, + "learning_rate": 0.0001, + "loss": 5.4705, + "loss/crossentropy": 2.3900952339172363, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16390328109264374, + "step": 24100 + }, + { + "epoch": 0.7531875, + "grad_norm": 2.890625, + "grad_norm_var": 0.04500223795572917, + "learning_rate": 0.0001, + "loss": 5.4324, + "loss/crossentropy": 2.4493796825408936, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.15650839358568192, + "step": 24102 + }, + { + "epoch": 0.75325, + "grad_norm": 3.578125, + "grad_norm_var": 0.05972900390625, + "learning_rate": 0.0001, + "loss": 5.455, + "loss/crossentropy": 2.44339120388031, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15428748726844788, + "step": 24104 + }, + { + "epoch": 0.7533125, + "grad_norm": 2.921875, + "grad_norm_var": 0.05113932291666667, + "learning_rate": 0.0001, + "loss": 5.6347, + "loss/crossentropy": 2.5348976850509644, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16389034688472748, + "step": 24106 + }, + { + "epoch": 0.753375, + "grad_norm": 3.515625, + "grad_norm_var": 0.05664774576822917, + "learning_rate": 0.0001, + "loss": 5.453, + "loss/crossentropy": 2.435604214668274, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1544729620218277, + "step": 24108 + }, + { + "epoch": 0.7534375, + "grad_norm": 3.25, + "grad_norm_var": 0.061864217122395836, + "learning_rate": 0.0001, + "loss": 5.7974, + "loss/crossentropy": 2.584768056869507, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1712661236524582, + "step": 24110 + }, + { + "epoch": 0.7535, + "grad_norm": 3.09375, + "grad_norm_var": 0.05432942708333333, + "learning_rate": 0.0001, + "loss": 5.8904, + "loss/crossentropy": 2.7961983680725098, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.163322813808918, + "step": 24112 + }, + { + "epoch": 0.7535625, + "grad_norm": 3.03125, + "grad_norm_var": 0.05172526041666667, + "learning_rate": 0.0001, + "loss": 5.516, + "loss/crossentropy": 2.4154030084609985, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16122816503047943, + "step": 24114 + }, + { + "epoch": 0.753625, + "grad_norm": 2.953125, + "grad_norm_var": 0.07306315104166666, + "learning_rate": 0.0001, + "loss": 5.6623, + "loss/crossentropy": 2.535654902458191, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16110121458768845, + "step": 24116 + }, + { + "epoch": 0.7536875, + "grad_norm": 3.25, + "grad_norm_var": 0.06221415201822917, + "learning_rate": 0.0001, + "loss": 6.0447, + "loss/crossentropy": 2.713833808898926, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17995687574148178, + "step": 24118 + }, + { + "epoch": 0.75375, + "grad_norm": 2.953125, + "grad_norm_var": 0.05815327962239583, + "learning_rate": 0.0001, + "loss": 5.6069, + "loss/crossentropy": 2.5649408102035522, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1569312885403633, + "step": 24120 + }, + { + "epoch": 0.7538125, + "grad_norm": 2.90625, + "grad_norm_var": 0.05870768229166667, + "learning_rate": 0.0001, + "loss": 5.3703, + "loss/crossentropy": 2.4001930952072144, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15130487829446793, + "step": 24122 + }, + { + "epoch": 0.753875, + "grad_norm": 3.296875, + "grad_norm_var": 0.053587849934895834, + "learning_rate": 0.0001, + "loss": 5.7407, + "loss/crossentropy": 2.560784935951233, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17228714376688004, + "step": 24124 + }, + { + "epoch": 0.7539375, + "grad_norm": 3.203125, + "grad_norm_var": 0.04698893229166667, + "learning_rate": 0.0001, + "loss": 5.7398, + "loss/crossentropy": 2.5797245502471924, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16640272736549377, + "step": 24126 + }, + { + "epoch": 0.754, + "grad_norm": 3.265625, + "grad_norm_var": 0.045344034830729164, + "learning_rate": 0.0001, + "loss": 5.8133, + "loss/crossentropy": 2.6464954614639282, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1682446300983429, + "step": 24128 + }, + { + "epoch": 0.7540625, + "grad_norm": 3.0625, + "grad_norm_var": 0.04700520833333333, + "learning_rate": 0.0001, + "loss": 5.791, + "loss/crossentropy": 2.6895121335983276, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1621066927909851, + "step": 24130 + }, + { + "epoch": 0.754125, + "grad_norm": 3.078125, + "grad_norm_var": 0.02320556640625, + "learning_rate": 0.0001, + "loss": 5.944, + "loss/crossentropy": 2.7760531902313232, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16796286404132843, + "step": 24132 + }, + { + "epoch": 0.7541875, + "grad_norm": 3.671875, + "grad_norm_var": 0.0392242431640625, + "learning_rate": 0.0001, + "loss": 5.3381, + "loss/crossentropy": 2.2714842557907104, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15861865878105164, + "step": 24134 + }, + { + "epoch": 0.75425, + "grad_norm": 3.0625, + "grad_norm_var": 0.0407379150390625, + "learning_rate": 0.0001, + "loss": 5.4312, + "loss/crossentropy": 2.505277156829834, + "loss/hidden": 1.390625, + "loss/jsd": 0.0, + "loss/logits": 0.15353471040725708, + "step": 24136 + }, + { + "epoch": 0.7543125, + "grad_norm": 3.171875, + "grad_norm_var": 0.0370025634765625, + "learning_rate": 0.0001, + "loss": 5.5424, + "loss/crossentropy": 2.5124454498291016, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15768658369779587, + "step": 24138 + }, + { + "epoch": 0.754375, + "grad_norm": 3.1875, + "grad_norm_var": 0.034012858072916666, + "learning_rate": 0.0001, + "loss": 5.7433, + "loss/crossentropy": 2.6044552326202393, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.165834940969944, + "step": 24140 + }, + { + "epoch": 0.7544375, + "grad_norm": 2.84375, + "grad_norm_var": 0.039183553059895834, + "learning_rate": 0.0001, + "loss": 5.6496, + "loss/crossentropy": 2.5876888036727905, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16127284616231918, + "step": 24142 + }, + { + "epoch": 0.7545, + "grad_norm": 3.25, + "grad_norm_var": 0.05074462890625, + "learning_rate": 0.0001, + "loss": 5.0513, + "loss/crossentropy": 2.2023468017578125, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.14427007734775543, + "step": 24144 + }, + { + "epoch": 0.7545625, + "grad_norm": 3.1875, + "grad_norm_var": 0.05077718098958333, + "learning_rate": 0.0001, + "loss": 5.8935, + "loss/crossentropy": 2.7609182596206665, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1679467335343361, + "step": 24146 + }, + { + "epoch": 0.754625, + "grad_norm": 3.203125, + "grad_norm_var": 0.0512603759765625, + "learning_rate": 0.0001, + "loss": 5.6736, + "loss/crossentropy": 2.554359197616577, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16622508317232132, + "step": 24148 + }, + { + "epoch": 0.7546875, + "grad_norm": 2.609375, + "grad_norm_var": 0.042708333333333334, + "learning_rate": 0.0001, + "loss": 5.3942, + "loss/crossentropy": 2.432632327079773, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15279417484998703, + "step": 24150 + }, + { + "epoch": 0.75475, + "grad_norm": 3.1875, + "grad_norm_var": 0.04178059895833333, + "learning_rate": 0.0001, + "loss": 5.6383, + "loss/crossentropy": 2.4687283039093018, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.1642264425754547, + "step": 24152 + }, + { + "epoch": 0.7548125, + "grad_norm": 3.171875, + "grad_norm_var": 0.041727701822916664, + "learning_rate": 0.0001, + "loss": 5.6806, + "loss/crossentropy": 2.546117663383484, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16383732110261917, + "step": 24154 + }, + { + "epoch": 0.754875, + "grad_norm": 3.515625, + "grad_norm_var": 0.05084228515625, + "learning_rate": 0.0001, + "loss": 5.9499, + "loss/crossentropy": 2.7119476795196533, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17652510851621628, + "step": 24156 + }, + { + "epoch": 0.7549375, + "grad_norm": 3.203125, + "grad_norm_var": 0.04735921223958333, + "learning_rate": 0.0001, + "loss": 5.6438, + "loss/crossentropy": 2.5351009368896484, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16360601782798767, + "step": 24158 + }, + { + "epoch": 0.755, + "grad_norm": 2.734375, + "grad_norm_var": 0.04263916015625, + "learning_rate": 0.0001, + "loss": 5.3956, + "loss/crossentropy": 2.4043564796447754, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1526353657245636, + "step": 24160 + }, + { + "epoch": 0.7550625, + "grad_norm": 3.140625, + "grad_norm_var": 0.0416168212890625, + "learning_rate": 0.0001, + "loss": 5.69, + "loss/crossentropy": 2.6212981939315796, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16077616065740585, + "step": 24162 + }, + { + "epoch": 0.755125, + "grad_norm": 2.9375, + "grad_norm_var": 0.045263671875, + "learning_rate": 0.0001, + "loss": 5.4384, + "loss/crossentropy": 2.402511715888977, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15710320323705673, + "step": 24164 + }, + { + "epoch": 0.7551875, + "grad_norm": 3.046875, + "grad_norm_var": 0.029520670572916668, + "learning_rate": 0.0001, + "loss": 5.6641, + "loss/crossentropy": 2.587397575378418, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16352634876966476, + "step": 24166 + }, + { + "epoch": 0.75525, + "grad_norm": 3.234375, + "grad_norm_var": 0.04068603515625, + "learning_rate": 0.0001, + "loss": 6.0794, + "loss/crossentropy": 2.8126444816589355, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17628030478954315, + "step": 24168 + }, + { + "epoch": 0.7553125, + "grad_norm": 3.40625, + "grad_norm_var": 0.0530426025390625, + "learning_rate": 0.0001, + "loss": 5.724, + "loss/crossentropy": 2.6390836238861084, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16162113845348358, + "step": 24170 + }, + { + "epoch": 0.755375, + "grad_norm": 3.1875, + "grad_norm_var": 0.04641011555989583, + "learning_rate": 0.0001, + "loss": 5.6136, + "loss/crossentropy": 2.4587961435317993, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16548041254281998, + "step": 24172 + }, + { + "epoch": 0.7554375, + "grad_norm": 3.203125, + "grad_norm_var": 0.046630859375, + "learning_rate": 0.0001, + "loss": 5.5038, + "loss/crossentropy": 2.445357918739319, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16209685802459717, + "step": 24174 + }, + { + "epoch": 0.7555, + "grad_norm": 3.328125, + "grad_norm_var": 0.036279296875, + "learning_rate": 0.0001, + "loss": 5.8233, + "loss/crossentropy": 2.671785831451416, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16632480919361115, + "step": 24176 + }, + { + "epoch": 0.7555625, + "grad_norm": 2.96875, + "grad_norm_var": 0.044774373372395836, + "learning_rate": 0.0001, + "loss": 5.543, + "loss/crossentropy": 2.4994109869003296, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15983228385448456, + "step": 24178 + }, + { + "epoch": 0.755625, + "grad_norm": 3.28125, + "grad_norm_var": 0.04189351399739583, + "learning_rate": 0.0001, + "loss": 5.7399, + "loss/crossentropy": 2.571989893913269, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.17148178815841675, + "step": 24180 + }, + { + "epoch": 0.7556875, + "grad_norm": 2.859375, + "grad_norm_var": 0.05201416015625, + "learning_rate": 0.0001, + "loss": 5.3768, + "loss/crossentropy": 2.479288935661316, + "loss/hidden": 1.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.15029726922512054, + "step": 24182 + }, + { + "epoch": 0.75575, + "grad_norm": 3.390625, + "grad_norm_var": 0.08507486979166666, + "learning_rate": 0.0001, + "loss": 5.5751, + "loss/crossentropy": 2.4374505281448364, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16337814927101135, + "step": 24184 + }, + { + "epoch": 0.7558125, + "grad_norm": 2.84375, + "grad_norm_var": 0.08245035807291666, + "learning_rate": 0.0001, + "loss": 5.8305, + "loss/crossentropy": 2.621421456336975, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1713024377822876, + "step": 24186 + }, + { + "epoch": 0.755875, + "grad_norm": 3.078125, + "grad_norm_var": 0.08056233723958334, + "learning_rate": 0.0001, + "loss": 5.6943, + "loss/crossentropy": 2.602727174758911, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1630595624446869, + "step": 24188 + }, + { + "epoch": 0.7559375, + "grad_norm": 3.3125, + "grad_norm_var": 0.08196614583333334, + "learning_rate": 0.0001, + "loss": 5.978, + "loss/crossentropy": 2.690164089202881, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.181515172123909, + "step": 24190 + }, + { + "epoch": 0.756, + "grad_norm": 3.25, + "grad_norm_var": 0.08019205729166666, + "learning_rate": 0.0001, + "loss": 5.5361, + "loss/crossentropy": 2.4419296979904175, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16370952129364014, + "step": 24192 + }, + { + "epoch": 0.7560625, + "grad_norm": 3.0625, + "grad_norm_var": 0.07509663899739584, + "learning_rate": 0.0001, + "loss": 5.7707, + "loss/crossentropy": 2.5682854652404785, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1702408790588379, + "step": 24194 + }, + { + "epoch": 0.756125, + "grad_norm": 2.984375, + "grad_norm_var": 0.0865386962890625, + "learning_rate": 0.0001, + "loss": 5.2196, + "loss/crossentropy": 2.2683480978012085, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1498098075389862, + "step": 24196 + }, + { + "epoch": 0.7561875, + "grad_norm": 6.875, + "grad_norm_var": 0.9197092692057292, + "learning_rate": 0.0001, + "loss": 5.7087, + "loss/crossentropy": 2.475526809692383, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1752721145749092, + "step": 24198 + }, + { + "epoch": 0.75625, + "grad_norm": 2.984375, + "grad_norm_var": 0.9122233072916667, + "learning_rate": 0.0001, + "loss": 5.7623, + "loss/crossentropy": 2.6058956384658813, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16954650729894638, + "step": 24200 + }, + { + "epoch": 0.7563125, + "grad_norm": 3.203125, + "grad_norm_var": 0.9079386393229166, + "learning_rate": 0.0001, + "loss": 5.544, + "loss/crossentropy": 2.4620473384857178, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16171350330114365, + "step": 24202 + }, + { + "epoch": 0.756375, + "grad_norm": 3.1875, + "grad_norm_var": 0.90953369140625, + "learning_rate": 0.0001, + "loss": 5.8217, + "loss/crossentropy": 2.6526798009872437, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16534314304590225, + "step": 24204 + }, + { + "epoch": 0.7564375, + "grad_norm": 3.078125, + "grad_norm_var": 0.9124837239583333, + "learning_rate": 0.0001, + "loss": 5.3978, + "loss/crossentropy": 2.3579283952713013, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1602371633052826, + "step": 24206 + }, + { + "epoch": 0.7565, + "grad_norm": 3.03125, + "grad_norm_var": 0.9220123291015625, + "learning_rate": 0.0001, + "loss": 5.357, + "loss/crossentropy": 2.4251667261123657, + "loss/hidden": 1.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.15295164287090302, + "step": 24208 + }, + { + "epoch": 0.7565625, + "grad_norm": 2.953125, + "grad_norm_var": 0.9258860270182292, + "learning_rate": 0.0001, + "loss": 5.611, + "loss/crossentropy": 2.51955783367157, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1642208844423294, + "step": 24210 + }, + { + "epoch": 0.756625, + "grad_norm": 3.125, + "grad_norm_var": 0.9121897379557292, + "learning_rate": 0.0001, + "loss": 5.5745, + "loss/crossentropy": 2.4795562028884888, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16067060083150864, + "step": 24212 + }, + { + "epoch": 0.7566875, + "grad_norm": 3.234375, + "grad_norm_var": 0.027424112955729166, + "learning_rate": 0.0001, + "loss": 5.6013, + "loss/crossentropy": 2.609973669052124, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15577159821987152, + "step": 24214 + }, + { + "epoch": 0.75675, + "grad_norm": 3.375, + "grad_norm_var": 0.030467732747395834, + "learning_rate": 0.0001, + "loss": 6.1636, + "loss/crossentropy": 2.8224769830703735, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.18567093461751938, + "step": 24216 + }, + { + "epoch": 0.7568125, + "grad_norm": 3.046875, + "grad_norm_var": 0.030192057291666668, + "learning_rate": 0.0001, + "loss": 5.4149, + "loss/crossentropy": 2.4502590894699097, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1542799025774002, + "step": 24218 + }, + { + "epoch": 0.756875, + "grad_norm": 3.203125, + "grad_norm_var": 0.030980428059895832, + "learning_rate": 0.0001, + "loss": 5.6365, + "loss/crossentropy": 2.582989811897278, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16082438826560974, + "step": 24220 + }, + { + "epoch": 0.7569375, + "grad_norm": 3.21875, + "grad_norm_var": 0.032689412434895836, + "learning_rate": 0.0001, + "loss": 5.5686, + "loss/crossentropy": 2.437768340110779, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1681620180606842, + "step": 24222 + }, + { + "epoch": 0.757, + "grad_norm": 2.984375, + "grad_norm_var": 0.037495930989583336, + "learning_rate": 0.0001, + "loss": 5.5878, + "loss/crossentropy": 2.5189844369888306, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16156598925590515, + "step": 24224 + }, + { + "epoch": 0.7570625, + "grad_norm": 3.234375, + "grad_norm_var": 0.03509114583333333, + "learning_rate": 0.0001, + "loss": 5.5288, + "loss/crossentropy": 2.4203603267669678, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16396590322256088, + "step": 24226 + }, + { + "epoch": 0.757125, + "grad_norm": 2.875, + "grad_norm_var": 0.036351521809895836, + "learning_rate": 0.0001, + "loss": 5.7988, + "loss/crossentropy": 2.723006010055542, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16305013000965118, + "step": 24228 + }, + { + "epoch": 0.7571875, + "grad_norm": 3.046875, + "grad_norm_var": 0.0280426025390625, + "learning_rate": 0.0001, + "loss": 5.783, + "loss/crossentropy": 2.657711148262024, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16253386437892914, + "step": 24230 + }, + { + "epoch": 0.75725, + "grad_norm": 3.359375, + "grad_norm_var": 0.025780232747395833, + "learning_rate": 0.0001, + "loss": 5.4359, + "loss/crossentropy": 2.420749545097351, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15542279183864594, + "step": 24232 + }, + { + "epoch": 0.7573125, + "grad_norm": 2.8125, + "grad_norm_var": 0.036214192708333336, + "learning_rate": 0.0001, + "loss": 5.6808, + "loss/crossentropy": 2.5912322998046875, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16247299313545227, + "step": 24234 + }, + { + "epoch": 0.757375, + "grad_norm": 2.984375, + "grad_norm_var": 0.03571675618489583, + "learning_rate": 0.0001, + "loss": 5.4814, + "loss/crossentropy": 2.444804072380066, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15600281953811646, + "step": 24236 + }, + { + "epoch": 0.7574375, + "grad_norm": 3.15625, + "grad_norm_var": 0.03280843098958333, + "learning_rate": 0.0001, + "loss": 5.7374, + "loss/crossentropy": 2.647869110107422, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16402629017829895, + "step": 24238 + }, + { + "epoch": 0.7575, + "grad_norm": 3.46875, + "grad_norm_var": 0.03557027180989583, + "learning_rate": 0.0001, + "loss": 5.5875, + "loss/crossentropy": 2.433461546897888, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1681414619088173, + "step": 24240 + }, + { + "epoch": 0.7575625, + "grad_norm": 2.984375, + "grad_norm_var": 0.04597880045572917, + "learning_rate": 0.0001, + "loss": 5.7412, + "loss/crossentropy": 2.6030138731002808, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1665511652827263, + "step": 24242 + }, + { + "epoch": 0.757625, + "grad_norm": 2.890625, + "grad_norm_var": 0.04523111979166667, + "learning_rate": 0.0001, + "loss": 5.8167, + "loss/crossentropy": 2.7053295373916626, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1685625985264778, + "step": 24244 + }, + { + "epoch": 0.7576875, + "grad_norm": 3.078125, + "grad_norm_var": 0.04425455729166667, + "learning_rate": 0.0001, + "loss": 5.9314, + "loss/crossentropy": 2.7330563068389893, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1729612499475479, + "step": 24246 + }, + { + "epoch": 0.75775, + "grad_norm": 2.796875, + "grad_norm_var": 0.04986572265625, + "learning_rate": 0.0001, + "loss": 5.514, + "loss/crossentropy": 2.4921834468841553, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15843446552753448, + "step": 24248 + }, + { + "epoch": 0.7578125, + "grad_norm": 2.9375, + "grad_norm_var": 0.0417388916015625, + "learning_rate": 0.0001, + "loss": 4.9194, + "loss/crossentropy": 2.052195191383362, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.14296723902225494, + "step": 24250 + }, + { + "epoch": 0.757875, + "grad_norm": 2.96875, + "grad_norm_var": 0.04306233723958333, + "learning_rate": 0.0001, + "loss": 5.4979, + "loss/crossentropy": 2.5355305671691895, + "loss/hidden": 1.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.15599776804447174, + "step": 24252 + }, + { + "epoch": 0.7579375, + "grad_norm": 3.25, + "grad_norm_var": 0.04788411458333333, + "learning_rate": 0.0001, + "loss": 5.8001, + "loss/crossentropy": 2.610070824623108, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17290769517421722, + "step": 24254 + }, + { + "epoch": 0.758, + "grad_norm": 3.171875, + "grad_norm_var": 0.050093587239583334, + "learning_rate": 0.0001, + "loss": 5.374, + "loss/crossentropy": 2.4149253368377686, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15137417614459991, + "step": 24256 + }, + { + "epoch": 0.7580625, + "grad_norm": 3.125, + "grad_norm_var": 0.03486226399739583, + "learning_rate": 0.0001, + "loss": 5.3875, + "loss/crossentropy": 2.3582857847213745, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15565423667430878, + "step": 24258 + }, + { + "epoch": 0.758125, + "grad_norm": 3.0625, + "grad_norm_var": 0.032698567708333334, + "learning_rate": 0.0001, + "loss": 5.7439, + "loss/crossentropy": 2.627885580062866, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16550765186548233, + "step": 24260 + }, + { + "epoch": 0.7581875, + "grad_norm": 2.953125, + "grad_norm_var": 0.02802734375, + "learning_rate": 0.0001, + "loss": 5.4714, + "loss/crossentropy": 2.4771320819854736, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15489110350608826, + "step": 24262 + }, + { + "epoch": 0.75825, + "grad_norm": 3.078125, + "grad_norm_var": 0.02252197265625, + "learning_rate": 0.0001, + "loss": 5.7646, + "loss/crossentropy": 2.6379897594451904, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1642233058810234, + "step": 24264 + }, + { + "epoch": 0.7583125, + "grad_norm": 3.125, + "grad_norm_var": 0.02232666015625, + "learning_rate": 0.0001, + "loss": 5.6799, + "loss/crossentropy": 2.560634732246399, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16818025708198547, + "step": 24266 + }, + { + "epoch": 0.758375, + "grad_norm": 2.828125, + "grad_norm_var": 0.024836222330729168, + "learning_rate": 0.0001, + "loss": 5.6186, + "loss/crossentropy": 2.5361427068710327, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16293242573738098, + "step": 24268 + }, + { + "epoch": 0.7584375, + "grad_norm": 2.984375, + "grad_norm_var": 0.024071248372395833, + "learning_rate": 0.0001, + "loss": 5.19, + "loss/crossentropy": 2.3382837772369385, + "loss/hidden": 1.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.1468924880027771, + "step": 24270 + }, + { + "epoch": 0.7585, + "grad_norm": 2.921875, + "grad_norm_var": 0.016405232747395835, + "learning_rate": 0.0001, + "loss": 5.9336, + "loss/crossentropy": 2.762654423713684, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16787448525428772, + "step": 24272 + }, + { + "epoch": 0.7585625, + "grad_norm": 3.09375, + "grad_norm_var": 0.014793904622395833, + "learning_rate": 0.0001, + "loss": 5.0658, + "loss/crossentropy": 2.1516292095184326, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.14649128168821335, + "step": 24274 + }, + { + "epoch": 0.758625, + "grad_norm": 3.265625, + "grad_norm_var": 0.023746744791666666, + "learning_rate": 0.0001, + "loss": 5.4255, + "loss/crossentropy": 2.3457034826278687, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16110370308160782, + "step": 24276 + }, + { + "epoch": 0.7586875, + "grad_norm": 3.046875, + "grad_norm_var": 0.03266499837239583, + "learning_rate": 0.0001, + "loss": 5.7083, + "loss/crossentropy": 2.626542091369629, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16325343400239944, + "step": 24278 + }, + { + "epoch": 0.75875, + "grad_norm": 2.796875, + "grad_norm_var": 0.039449055989583336, + "learning_rate": 0.0001, + "loss": 5.1111, + "loss/crossentropy": 2.238158345222473, + "loss/hidden": 1.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.1478414461016655, + "step": 24280 + }, + { + "epoch": 0.7588125, + "grad_norm": 3.0, + "grad_norm_var": 0.042333984375, + "learning_rate": 0.0001, + "loss": 5.4943, + "loss/crossentropy": 2.5649493932724, + "loss/hidden": 1.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.15464885532855988, + "step": 24282 + }, + { + "epoch": 0.758875, + "grad_norm": 3.3125, + "grad_norm_var": 0.04350484212239583, + "learning_rate": 0.0001, + "loss": 5.7973, + "loss/crossentropy": 2.6341344118118286, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16748441010713577, + "step": 24284 + }, + { + "epoch": 0.7589375, + "grad_norm": 3.125, + "grad_norm_var": 0.040339152018229164, + "learning_rate": 0.0001, + "loss": 5.641, + "loss/crossentropy": 2.560652017593384, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16506946831941605, + "step": 24286 + }, + { + "epoch": 0.759, + "grad_norm": 3.109375, + "grad_norm_var": 0.04024149576822917, + "learning_rate": 0.0001, + "loss": 5.9467, + "loss/crossentropy": 2.7992547750473022, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16669610887765884, + "step": 24288 + }, + { + "epoch": 0.7590625, + "grad_norm": 3.328125, + "grad_norm_var": 0.0446685791015625, + "learning_rate": 0.0001, + "loss": 5.6246, + "loss/crossentropy": 2.567845940589905, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1615363210439682, + "step": 24290 + }, + { + "epoch": 0.759125, + "grad_norm": 3.0625, + "grad_norm_var": 0.03837890625, + "learning_rate": 0.0001, + "loss": 5.9742, + "loss/crossentropy": 2.819923162460327, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16972891241312027, + "step": 24292 + }, + { + "epoch": 0.7591875, + "grad_norm": 3.109375, + "grad_norm_var": 0.029715983072916667, + "learning_rate": 0.0001, + "loss": 5.8447, + "loss/crossentropy": 2.6782902479171753, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16976940631866455, + "step": 24294 + }, + { + "epoch": 0.75925, + "grad_norm": 3.296875, + "grad_norm_var": 0.02779541015625, + "learning_rate": 0.0001, + "loss": 5.5734, + "loss/crossentropy": 2.5208466053009033, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15993858128786087, + "step": 24296 + }, + { + "epoch": 0.7593125, + "grad_norm": 3.65625, + "grad_norm_var": 0.04777730305989583, + "learning_rate": 0.0001, + "loss": 5.3045, + "loss/crossentropy": 2.2961453199386597, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15591763705015182, + "step": 24298 + }, + { + "epoch": 0.759375, + "grad_norm": 3.1875, + "grad_norm_var": 0.0461578369140625, + "learning_rate": 0.0001, + "loss": 6.1393, + "loss/crossentropy": 2.8317224979400635, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.18232179433107376, + "step": 24300 + }, + { + "epoch": 0.7594375, + "grad_norm": 3.0, + "grad_norm_var": 0.04163411458333333, + "learning_rate": 0.0001, + "loss": 5.648, + "loss/crossentropy": 2.5475919246673584, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16512297093868256, + "step": 24302 + }, + { + "epoch": 0.7595, + "grad_norm": 3.09375, + "grad_norm_var": 0.037043253580729164, + "learning_rate": 0.0001, + "loss": 5.5121, + "loss/crossentropy": 2.5225062370300293, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.15716056525707245, + "step": 24304 + }, + { + "epoch": 0.7595625, + "grad_norm": 3.0625, + "grad_norm_var": 0.0380279541015625, + "learning_rate": 0.0001, + "loss": 5.3988, + "loss/crossentropy": 2.4485208988189697, + "loss/hidden": 1.375, + "loss/jsd": 0.0, + "loss/logits": 0.1575288325548172, + "step": 24306 + }, + { + "epoch": 0.759625, + "grad_norm": 3.015625, + "grad_norm_var": 0.03650716145833333, + "learning_rate": 0.0001, + "loss": 5.7898, + "loss/crossentropy": 2.6098283529281616, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16877630352973938, + "step": 24308 + }, + { + "epoch": 0.7596875, + "grad_norm": 2.765625, + "grad_norm_var": 0.042704264322916664, + "learning_rate": 0.0001, + "loss": 5.5829, + "loss/crossentropy": 2.555821418762207, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15700319409370422, + "step": 24310 + }, + { + "epoch": 0.75975, + "grad_norm": 3.078125, + "grad_norm_var": 0.04006754557291667, + "learning_rate": 0.0001, + "loss": 5.6753, + "loss/crossentropy": 2.5307703018188477, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.17148318886756897, + "step": 24312 + }, + { + "epoch": 0.7598125, + "grad_norm": 3.125, + "grad_norm_var": 0.07297261555989583, + "learning_rate": 0.0001, + "loss": 5.6941, + "loss/crossentropy": 2.550258755683899, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16712124645709991, + "step": 24314 + }, + { + "epoch": 0.759875, + "grad_norm": 2.890625, + "grad_norm_var": 0.0743072509765625, + "learning_rate": 0.0001, + "loss": 5.3145, + "loss/crossentropy": 2.4267187118530273, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.14307159185409546, + "step": 24316 + }, + { + "epoch": 0.7599375, + "grad_norm": 3.25, + "grad_norm_var": 0.076611328125, + "learning_rate": 0.0001, + "loss": 5.7224, + "loss/crossentropy": 2.6389344930648804, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16107800602912903, + "step": 24318 + }, + { + "epoch": 0.76, + "grad_norm": 3.0625, + "grad_norm_var": 0.07613016764322916, + "learning_rate": 0.0001, + "loss": 5.6364, + "loss/crossentropy": 2.609094738960266, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.1609373688697815, + "step": 24320 + }, + { + "epoch": 0.7600625, + "grad_norm": 2.953125, + "grad_norm_var": 0.0740631103515625, + "learning_rate": 0.0001, + "loss": 5.7142, + "loss/crossentropy": 2.6537078619003296, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15995817631483078, + "step": 24322 + }, + { + "epoch": 0.760125, + "grad_norm": 2.921875, + "grad_norm_var": 0.075927734375, + "learning_rate": 0.0001, + "loss": 5.5217, + "loss/crossentropy": 2.4636749029159546, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15775848925113678, + "step": 24324 + }, + { + "epoch": 0.7601875, + "grad_norm": 3.15625, + "grad_norm_var": 0.06871337890625, + "learning_rate": 0.0001, + "loss": 5.5488, + "loss/crossentropy": 2.476245641708374, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1592119336128235, + "step": 24326 + }, + { + "epoch": 0.76025, + "grad_norm": 3.109375, + "grad_norm_var": 0.09335530598958333, + "learning_rate": 0.0001, + "loss": 5.8164, + "loss/crossentropy": 2.5851922035217285, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17585048079490662, + "step": 24328 + }, + { + "epoch": 0.7603125, + "grad_norm": 3.1875, + "grad_norm_var": 0.04155171712239583, + "learning_rate": 0.0001, + "loss": 5.8689, + "loss/crossentropy": 2.6981635093688965, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16902291029691696, + "step": 24330 + }, + { + "epoch": 0.760375, + "grad_norm": 3.0625, + "grad_norm_var": 0.04455973307291667, + "learning_rate": 0.0001, + "loss": 5.3983, + "loss/crossentropy": 2.4496692419052124, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1495552659034729, + "step": 24332 + }, + { + "epoch": 0.7604375, + "grad_norm": 3.046875, + "grad_norm_var": 0.04292704264322917, + "learning_rate": 0.0001, + "loss": 5.6176, + "loss/crossentropy": 2.527258038520813, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16137798130512238, + "step": 24334 + }, + { + "epoch": 0.7605, + "grad_norm": 2.890625, + "grad_norm_var": 0.0487701416015625, + "learning_rate": 0.0001, + "loss": 5.8402, + "loss/crossentropy": 2.702823281288147, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16725626587867737, + "step": 24336 + }, + { + "epoch": 0.7605625, + "grad_norm": 3.40625, + "grad_norm_var": 0.052490234375, + "learning_rate": 0.0001, + "loss": 5.5783, + "loss/crossentropy": 2.5081727504730225, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1593562588095665, + "step": 24338 + }, + { + "epoch": 0.760625, + "grad_norm": 2.75, + "grad_norm_var": 0.05944010416666667, + "learning_rate": 0.0001, + "loss": 5.4505, + "loss/crossentropy": 2.499588966369629, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15368393063545227, + "step": 24340 + }, + { + "epoch": 0.7606875, + "grad_norm": 3.296875, + "grad_norm_var": 0.06326395670572917, + "learning_rate": 0.0001, + "loss": 6.1718, + "loss/crossentropy": 2.8815900087356567, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1786285936832428, + "step": 24342 + }, + { + "epoch": 0.76075, + "grad_norm": 2.90625, + "grad_norm_var": 0.0376861572265625, + "learning_rate": 0.0001, + "loss": 5.8847, + "loss/crossentropy": 2.7646361589431763, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16473662108182907, + "step": 24344 + }, + { + "epoch": 0.7608125, + "grad_norm": 3.203125, + "grad_norm_var": 0.037385050455729166, + "learning_rate": 0.0001, + "loss": 5.756, + "loss/crossentropy": 2.6093757152557373, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16583281755447388, + "step": 24346 + }, + { + "epoch": 0.760875, + "grad_norm": 3.609375, + "grad_norm_var": 0.050126139322916666, + "learning_rate": 0.0001, + "loss": 5.9246, + "loss/crossentropy": 2.7277426719665527, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16890300810337067, + "step": 24348 + }, + { + "epoch": 0.7609375, + "grad_norm": 3.46875, + "grad_norm_var": 0.0551422119140625, + "learning_rate": 0.0001, + "loss": 5.805, + "loss/crossentropy": 2.618245482444763, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17141090333461761, + "step": 24350 + }, + { + "epoch": 0.761, + "grad_norm": 3.203125, + "grad_norm_var": 0.0514068603515625, + "learning_rate": 0.0001, + "loss": 5.6128, + "loss/crossentropy": 2.567072868347168, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1623883917927742, + "step": 24352 + }, + { + "epoch": 0.7610625, + "grad_norm": 2.921875, + "grad_norm_var": 0.049738566080729164, + "learning_rate": 0.0001, + "loss": 5.7429, + "loss/crossentropy": 2.6708853244781494, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16266701370477676, + "step": 24354 + }, + { + "epoch": 0.761125, + "grad_norm": 3.0625, + "grad_norm_var": 0.0439605712890625, + "learning_rate": 0.0001, + "loss": 5.3716, + "loss/crossentropy": 2.4212406873703003, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1528492048382759, + "step": 24356 + }, + { + "epoch": 0.7611875, + "grad_norm": 3.546875, + "grad_norm_var": 0.051070149739583334, + "learning_rate": 0.0001, + "loss": 6.0262, + "loss/crossentropy": 2.813529849052429, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17282679677009583, + "step": 24358 + }, + { + "epoch": 0.76125, + "grad_norm": 3.046875, + "grad_norm_var": 0.047102864583333334, + "learning_rate": 0.0001, + "loss": 5.9006, + "loss/crossentropy": 2.7431578636169434, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.168087400496006, + "step": 24360 + }, + { + "epoch": 0.7613125, + "grad_norm": 3.390625, + "grad_norm_var": 0.050093587239583334, + "learning_rate": 0.0001, + "loss": 5.7242, + "loss/crossentropy": 2.5602455139160156, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16756587475538254, + "step": 24362 + }, + { + "epoch": 0.761375, + "grad_norm": 2.953125, + "grad_norm_var": 0.03797200520833333, + "learning_rate": 0.0001, + "loss": 5.7465, + "loss/crossentropy": 2.664482831954956, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1613277718424797, + "step": 24364 + }, + { + "epoch": 0.7614375, + "grad_norm": 3.0625, + "grad_norm_var": 0.03612874348958333, + "learning_rate": 0.0001, + "loss": 5.672, + "loss/crossentropy": 2.642616391181946, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15997271239757538, + "step": 24366 + }, + { + "epoch": 0.7615, + "grad_norm": 2.984375, + "grad_norm_var": 0.035497029622395836, + "learning_rate": 0.0001, + "loss": 5.7899, + "loss/crossentropy": 2.6880706548690796, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16565346717834473, + "step": 24368 + }, + { + "epoch": 0.7615625, + "grad_norm": 3.140625, + "grad_norm_var": 0.034016927083333336, + "learning_rate": 0.0001, + "loss": 5.3715, + "loss/crossentropy": 2.3503127098083496, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15914569050073624, + "step": 24370 + }, + { + "epoch": 0.761625, + "grad_norm": 3.8125, + "grad_norm_var": 0.060155232747395836, + "learning_rate": 0.0001, + "loss": 5.5425, + "loss/crossentropy": 2.3365726470947266, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.16747110337018967, + "step": 24372 + }, + { + "epoch": 0.7616875, + "grad_norm": 2.96875, + "grad_norm_var": 0.05335286458333333, + "learning_rate": 0.0001, + "loss": 5.7142, + "loss/crossentropy": 2.5692873001098633, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16761834919452667, + "step": 24374 + }, + { + "epoch": 0.76175, + "grad_norm": 3.15625, + "grad_norm_var": 0.052912394205729164, + "learning_rate": 0.0001, + "loss": 5.6844, + "loss/crossentropy": 2.541516900062561, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1666291505098343, + "step": 24376 + }, + { + "epoch": 0.7618125, + "grad_norm": 3.09375, + "grad_norm_var": 0.049046834309895836, + "learning_rate": 0.0001, + "loss": 5.6735, + "loss/crossentropy": 2.5311325788497925, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16814269870519638, + "step": 24378 + }, + { + "epoch": 0.761875, + "grad_norm": 12.6875, + "grad_norm_var": 5.766681925455729, + "learning_rate": 0.0001, + "loss": 5.8868, + "loss/crossentropy": 2.381693482398987, + "loss/hidden": 1.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.19386768341064453, + "step": 24380 + }, + { + "epoch": 0.7619375, + "grad_norm": 3.015625, + "grad_norm_var": 5.729117838541667, + "learning_rate": 0.0001, + "loss": 5.5047, + "loss/crossentropy": 2.4462047815322876, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16366249322891235, + "step": 24382 + }, + { + "epoch": 0.762, + "grad_norm": 2.875, + "grad_norm_var": 5.736490885416667, + "learning_rate": 0.0001, + "loss": 5.4341, + "loss/crossentropy": 2.42786180973053, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1576521024107933, + "step": 24384 + }, + { + "epoch": 0.7620625, + "grad_norm": 3.171875, + "grad_norm_var": 5.711669921875, + "learning_rate": 0.0001, + "loss": 6.0457, + "loss/crossentropy": 2.748769998550415, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17813345044851303, + "step": 24386 + }, + { + "epoch": 0.762125, + "grad_norm": 3.40625, + "grad_norm_var": 5.703075154622396, + "learning_rate": 0.0001, + "loss": 5.6918, + "loss/crossentropy": 2.4644813537597656, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.17155954241752625, + "step": 24388 + }, + { + "epoch": 0.7621875, + "grad_norm": 3.125, + "grad_norm_var": 5.700065104166667, + "learning_rate": 0.0001, + "loss": 5.9113, + "loss/crossentropy": 2.708987832069397, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16906185448169708, + "step": 24390 + }, + { + "epoch": 0.76225, + "grad_norm": 3.0625, + "grad_norm_var": 5.743529256184896, + "learning_rate": 0.0001, + "loss": 5.4029, + "loss/crossentropy": 2.4219563007354736, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15395769476890564, + "step": 24392 + }, + { + "epoch": 0.7623125, + "grad_norm": 2.90625, + "grad_norm_var": 5.737987263997396, + "learning_rate": 0.0001, + "loss": 5.7952, + "loss/crossentropy": 2.6631529331207275, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1682787463068962, + "step": 24394 + }, + { + "epoch": 0.762375, + "grad_norm": 2.84375, + "grad_norm_var": 0.036742146809895834, + "learning_rate": 0.0001, + "loss": 5.8027, + "loss/crossentropy": 2.6886179447174072, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1657099425792694, + "step": 24396 + }, + { + "epoch": 0.7624375, + "grad_norm": 2.921875, + "grad_norm_var": 0.03632405598958333, + "learning_rate": 0.0001, + "loss": 5.5923, + "loss/crossentropy": 2.535095453262329, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1604101061820984, + "step": 24398 + }, + { + "epoch": 0.7625, + "grad_norm": 2.90625, + "grad_norm_var": 0.041178385416666664, + "learning_rate": 0.0001, + "loss": 5.3001, + "loss/crossentropy": 2.3773635625839233, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15086649358272552, + "step": 24400 + }, + { + "epoch": 0.7625625, + "grad_norm": 2.921875, + "grad_norm_var": 0.04439188639322917, + "learning_rate": 0.0001, + "loss": 5.4394, + "loss/crossentropy": 2.5263442993164062, + "loss/hidden": 1.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.1510726362466812, + "step": 24402 + }, + { + "epoch": 0.762625, + "grad_norm": 2.96875, + "grad_norm_var": 0.027046712239583333, + "learning_rate": 0.0001, + "loss": 5.49, + "loss/crossentropy": 2.4671220779418945, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15814447402954102, + "step": 24404 + }, + { + "epoch": 0.7626875, + "grad_norm": 3.359375, + "grad_norm_var": 0.0331451416015625, + "learning_rate": 0.0001, + "loss": 5.717, + "loss/crossentropy": 2.609921932220459, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16656387597322464, + "step": 24406 + }, + { + "epoch": 0.76275, + "grad_norm": 2.953125, + "grad_norm_var": 0.033186848958333334, + "learning_rate": 0.0001, + "loss": 5.5609, + "loss/crossentropy": 2.5512131452560425, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.15917153656482697, + "step": 24408 + }, + { + "epoch": 0.7628125, + "grad_norm": 3.265625, + "grad_norm_var": 0.03452860514322917, + "learning_rate": 0.0001, + "loss": 5.6542, + "loss/crossentropy": 2.482442259788513, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.17420323193073273, + "step": 24410 + }, + { + "epoch": 0.762875, + "grad_norm": 2.890625, + "grad_norm_var": 0.03711649576822917, + "learning_rate": 0.0001, + "loss": 5.6288, + "loss/crossentropy": 2.5039873123168945, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16482964158058167, + "step": 24412 + }, + { + "epoch": 0.7629375, + "grad_norm": 3.875, + "grad_norm_var": 0.09339090983072916, + "learning_rate": 0.0001, + "loss": 5.5329, + "loss/crossentropy": 2.4025429487228394, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16616224497556686, + "step": 24414 + }, + { + "epoch": 0.763, + "grad_norm": 3.171875, + "grad_norm_var": 0.0822418212890625, + "learning_rate": 0.0001, + "loss": 5.8005, + "loss/crossentropy": 2.6739391088485718, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16344071179628372, + "step": 24416 + }, + { + "epoch": 0.7630625, + "grad_norm": 3.078125, + "grad_norm_var": 0.06708984375, + "learning_rate": 0.0001, + "loss": 5.7469, + "loss/crossentropy": 2.596408247947693, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1677859202027321, + "step": 24418 + }, + { + "epoch": 0.763125, + "grad_norm": 3.078125, + "grad_norm_var": 0.06049702962239583, + "learning_rate": 0.0001, + "loss": 5.6792, + "loss/crossentropy": 2.595887303352356, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1626303270459175, + "step": 24420 + }, + { + "epoch": 0.7631875, + "grad_norm": 3.390625, + "grad_norm_var": 0.05894775390625, + "learning_rate": 0.0001, + "loss": 5.4664, + "loss/crossentropy": 2.448666214942932, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15880563855171204, + "step": 24422 + }, + { + "epoch": 0.76325, + "grad_norm": 3.25, + "grad_norm_var": 0.060139973958333336, + "learning_rate": 0.0001, + "loss": 5.6138, + "loss/crossentropy": 2.532195806503296, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16401486098766327, + "step": 24424 + }, + { + "epoch": 0.7633125, + "grad_norm": 3.140625, + "grad_norm_var": 0.06027018229166667, + "learning_rate": 0.0001, + "loss": 5.8564, + "loss/crossentropy": 2.7116355895996094, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16760292649269104, + "step": 24426 + }, + { + "epoch": 0.763375, + "grad_norm": 3.15625, + "grad_norm_var": 0.05575764973958333, + "learning_rate": 0.0001, + "loss": 5.8755, + "loss/crossentropy": 2.709297776222229, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16935163736343384, + "step": 24428 + }, + { + "epoch": 0.7634375, + "grad_norm": 3.015625, + "grad_norm_var": 0.020340983072916666, + "learning_rate": 0.0001, + "loss": 5.58, + "loss/crossentropy": 2.4961061477661133, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1591692715883255, + "step": 24430 + }, + { + "epoch": 0.7635, + "grad_norm": 3.171875, + "grad_norm_var": 0.025484212239583335, + "learning_rate": 0.0001, + "loss": 5.6587, + "loss/crossentropy": 2.598578453063965, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16069485992193222, + "step": 24432 + }, + { + "epoch": 0.7635625, + "grad_norm": 3.21875, + "grad_norm_var": 0.0270904541015625, + "learning_rate": 0.0001, + "loss": 5.4905, + "loss/crossentropy": 2.4428911209106445, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1567188948392868, + "step": 24434 + }, + { + "epoch": 0.763625, + "grad_norm": 2.953125, + "grad_norm_var": 0.029423014322916666, + "learning_rate": 0.0001, + "loss": 5.7569, + "loss/crossentropy": 2.7318053245544434, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15563537180423737, + "step": 24436 + }, + { + "epoch": 0.7636875, + "grad_norm": 3.34375, + "grad_norm_var": 0.029084269205729166, + "learning_rate": 0.0001, + "loss": 5.7455, + "loss/crossentropy": 2.58599317073822, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16868651658296585, + "step": 24438 + }, + { + "epoch": 0.76375, + "grad_norm": 3.09375, + "grad_norm_var": 0.025178019205729166, + "learning_rate": 0.0001, + "loss": 5.6417, + "loss/crossentropy": 2.529034972190857, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16633976995944977, + "step": 24440 + }, + { + "epoch": 0.7638125, + "grad_norm": 3.140625, + "grad_norm_var": 0.027762858072916667, + "learning_rate": 0.0001, + "loss": 5.7549, + "loss/crossentropy": 2.6712522506713867, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15992914140224457, + "step": 24442 + }, + { + "epoch": 0.763875, + "grad_norm": 2.9375, + "grad_norm_var": 0.0231353759765625, + "learning_rate": 0.0001, + "loss": 5.3658, + "loss/crossentropy": 2.390221357345581, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1487329974770546, + "step": 24444 + }, + { + "epoch": 0.7639375, + "grad_norm": 3.21875, + "grad_norm_var": 0.019904581705729167, + "learning_rate": 0.0001, + "loss": 5.7898, + "loss/crossentropy": 2.6352105140686035, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16975723206996918, + "step": 24446 + }, + { + "epoch": 0.764, + "grad_norm": 3.0625, + "grad_norm_var": 0.024051920572916666, + "learning_rate": 0.0001, + "loss": 5.8497, + "loss/crossentropy": 2.6421698331832886, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16879693418741226, + "step": 24448 + }, + { + "epoch": 0.7640625, + "grad_norm": 3.015625, + "grad_norm_var": 0.0232574462890625, + "learning_rate": 0.0001, + "loss": 5.5762, + "loss/crossentropy": 2.4971476793289185, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1657225862145424, + "step": 24450 + }, + { + "epoch": 0.764125, + "grad_norm": 2.890625, + "grad_norm_var": 0.026416015625, + "learning_rate": 0.0001, + "loss": 5.5338, + "loss/crossentropy": 2.509614109992981, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15788687020540237, + "step": 24452 + }, + { + "epoch": 0.7641875, + "grad_norm": 2.890625, + "grad_norm_var": 0.03954671223958333, + "learning_rate": 0.0001, + "loss": 5.5469, + "loss/crossentropy": 2.4922231435775757, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16171645373106003, + "step": 24454 + }, + { + "epoch": 0.76425, + "grad_norm": 3.109375, + "grad_norm_var": 6.216796875, + "learning_rate": 0.0001, + "loss": 5.7248, + "loss/crossentropy": 2.4789448976516724, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1765417903661728, + "step": 24456 + }, + { + "epoch": 0.7643125, + "grad_norm": 3.0, + "grad_norm_var": 6.195254516601563, + "learning_rate": 0.0001, + "loss": 6.0502, + "loss/crossentropy": 2.818184018135071, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17437320202589035, + "step": 24458 + }, + { + "epoch": 0.764375, + "grad_norm": 2.96875, + "grad_norm_var": 6.205427042643229, + "learning_rate": 0.0001, + "loss": 5.4949, + "loss/crossentropy": 2.4427298307418823, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1614658683538437, + "step": 24460 + }, + { + "epoch": 0.7644375, + "grad_norm": 3.171875, + "grad_norm_var": 6.226203409830729, + "learning_rate": 0.0001, + "loss": 5.5257, + "loss/crossentropy": 2.477227807044983, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15680351108312607, + "step": 24462 + }, + { + "epoch": 0.7645, + "grad_norm": 3.109375, + "grad_norm_var": 6.236197916666667, + "learning_rate": 0.0001, + "loss": 6.0102, + "loss/crossentropy": 2.7584580183029175, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.175169438123703, + "step": 24464 + }, + { + "epoch": 0.7645625, + "grad_norm": 3.125, + "grad_norm_var": 6.250234985351563, + "learning_rate": 0.0001, + "loss": 5.7546, + "loss/crossentropy": 2.706199288368225, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1583528071641922, + "step": 24466 + }, + { + "epoch": 0.764625, + "grad_norm": 3.09375, + "grad_norm_var": 6.216487630208333, + "learning_rate": 0.0001, + "loss": 5.6573, + "loss/crossentropy": 2.5890300273895264, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16033885627985, + "step": 24468 + }, + { + "epoch": 0.7646875, + "grad_norm": 3.203125, + "grad_norm_var": 6.203205362955729, + "learning_rate": 0.0001, + "loss": 5.8204, + "loss/crossentropy": 2.629197597503662, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1722457855939865, + "step": 24470 + }, + { + "epoch": 0.76475, + "grad_norm": 2.890625, + "grad_norm_var": 0.023531087239583335, + "learning_rate": 0.0001, + "loss": 5.1167, + "loss/crossentropy": 2.2669665813446045, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.14512494206428528, + "step": 24472 + }, + { + "epoch": 0.7648125, + "grad_norm": 3.203125, + "grad_norm_var": 0.018680826822916666, + "learning_rate": 0.0001, + "loss": 5.7604, + "loss/crossentropy": 2.639541268348694, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16677073389291763, + "step": 24474 + }, + { + "epoch": 0.764875, + "grad_norm": 2.921875, + "grad_norm_var": 0.01968994140625, + "learning_rate": 0.0001, + "loss": 5.3079, + "loss/crossentropy": 2.35752010345459, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15089724957942963, + "step": 24476 + }, + { + "epoch": 0.7649375, + "grad_norm": 3.625, + "grad_norm_var": 0.0392578125, + "learning_rate": 0.0001, + "loss": 5.6516, + "loss/crossentropy": 2.4392318725585938, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1731855645775795, + "step": 24478 + }, + { + "epoch": 0.765, + "grad_norm": 3.0625, + "grad_norm_var": 0.04045817057291667, + "learning_rate": 0.0001, + "loss": 5.5255, + "loss/crossentropy": 2.5260192155838013, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15853818506002426, + "step": 24480 + }, + { + "epoch": 0.7650625, + "grad_norm": 3.015625, + "grad_norm_var": 0.04094645182291667, + "learning_rate": 0.0001, + "loss": 5.4057, + "loss/crossentropy": 2.385251045227051, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15829485654830933, + "step": 24482 + }, + { + "epoch": 0.765125, + "grad_norm": 2.84375, + "grad_norm_var": 0.0445709228515625, + "learning_rate": 0.0001, + "loss": 5.6976, + "loss/crossentropy": 2.7166390419006348, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15473493933677673, + "step": 24484 + }, + { + "epoch": 0.7651875, + "grad_norm": 3.328125, + "grad_norm_var": 0.052099609375, + "learning_rate": 0.0001, + "loss": 5.4798, + "loss/crossentropy": 2.475341320037842, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.15865373611450195, + "step": 24486 + }, + { + "epoch": 0.76525, + "grad_norm": 3.03125, + "grad_norm_var": 0.20618489583333333, + "learning_rate": 0.0001, + "loss": 5.4679, + "loss/crossentropy": 2.3743693828582764, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.15779267251491547, + "step": 24488 + }, + { + "epoch": 0.7653125, + "grad_norm": 3.0625, + "grad_norm_var": 0.20689188639322917, + "learning_rate": 0.0001, + "loss": 5.447, + "loss/crossentropy": 2.4905470609664917, + "loss/hidden": 1.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.15736067295074463, + "step": 24490 + }, + { + "epoch": 0.765375, + "grad_norm": 3.46875, + "grad_norm_var": 0.20894775390625, + "learning_rate": 0.0001, + "loss": 5.5884, + "loss/crossentropy": 2.488085150718689, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1658872589468956, + "step": 24492 + }, + { + "epoch": 0.7654375, + "grad_norm": 3.03125, + "grad_norm_var": 0.20964253743489583, + "learning_rate": 0.0001, + "loss": 5.6334, + "loss/crossentropy": 2.4770509004592896, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16836615651845932, + "step": 24494 + }, + { + "epoch": 0.7655, + "grad_norm": 3.125, + "grad_norm_var": 0.21066080729166667, + "learning_rate": 0.0001, + "loss": 5.6307, + "loss/crossentropy": 2.5260634422302246, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16397515684366226, + "step": 24496 + }, + { + "epoch": 0.7655625, + "grad_norm": 2.921875, + "grad_norm_var": 0.2105377197265625, + "learning_rate": 0.0001, + "loss": 5.7065, + "loss/crossentropy": 2.653430461883545, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16311953961849213, + "step": 24498 + }, + { + "epoch": 0.765625, + "grad_norm": 3.171875, + "grad_norm_var": 0.20191141764322917, + "learning_rate": 0.0001, + "loss": 5.7151, + "loss/crossentropy": 2.553924083709717, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1653314083814621, + "step": 24500 + }, + { + "epoch": 0.7656875, + "grad_norm": 3.140625, + "grad_norm_var": 0.18728841145833333, + "learning_rate": 0.0001, + "loss": 5.5741, + "loss/crossentropy": 2.460555672645569, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1640918105840683, + "step": 24502 + }, + { + "epoch": 0.76575, + "grad_norm": 3.15625, + "grad_norm_var": 0.0445709228515625, + "learning_rate": 0.0001, + "loss": 5.7112, + "loss/crossentropy": 2.573582649230957, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1668897494673729, + "step": 24504 + }, + { + "epoch": 0.7658125, + "grad_norm": 3.15625, + "grad_norm_var": 0.04297587076822917, + "learning_rate": 0.0001, + "loss": 5.573, + "loss/crossentropy": 2.539095640182495, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15964529663324356, + "step": 24506 + }, + { + "epoch": 0.765875, + "grad_norm": 2.9375, + "grad_norm_var": 0.04067281087239583, + "learning_rate": 0.0001, + "loss": 5.6078, + "loss/crossentropy": 2.590778946876526, + "loss/hidden": 1.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.1622440367937088, + "step": 24508 + }, + { + "epoch": 0.7659375, + "grad_norm": 3.0, + "grad_norm_var": 0.0162017822265625, + "learning_rate": 0.0001, + "loss": 5.7587, + "loss/crossentropy": 2.689617156982422, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16237246990203857, + "step": 24510 + }, + { + "epoch": 0.766, + "grad_norm": 3.234375, + "grad_norm_var": 0.016634114583333335, + "learning_rate": 0.0001, + "loss": 5.6707, + "loss/crossentropy": 2.514092445373535, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1687850058078766, + "step": 24512 + }, + { + "epoch": 0.7660625, + "grad_norm": 2.9375, + "grad_norm_var": 0.019505818684895832, + "learning_rate": 0.0001, + "loss": 5.6159, + "loss/crossentropy": 2.5805495977401733, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.16173956543207169, + "step": 24514 + }, + { + "epoch": 0.766125, + "grad_norm": 3.09375, + "grad_norm_var": 0.023616536458333334, + "learning_rate": 0.0001, + "loss": 5.5594, + "loss/crossentropy": 2.513151168823242, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15696899592876434, + "step": 24516 + }, + { + "epoch": 0.7661875, + "grad_norm": 3.109375, + "grad_norm_var": 0.019123331705729166, + "learning_rate": 0.0001, + "loss": 6.0361, + "loss/crossentropy": 2.814682960510254, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1737068146467209, + "step": 24518 + }, + { + "epoch": 0.76625, + "grad_norm": 2.953125, + "grad_norm_var": 0.0194000244140625, + "learning_rate": 0.0001, + "loss": 5.6034, + "loss/crossentropy": 2.504035234451294, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16383955627679825, + "step": 24520 + }, + { + "epoch": 0.7663125, + "grad_norm": 2.921875, + "grad_norm_var": 0.0197174072265625, + "learning_rate": 0.0001, + "loss": 5.2152, + "loss/crossentropy": 2.2720115184783936, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1486142799258232, + "step": 24522 + }, + { + "epoch": 0.766375, + "grad_norm": 3.03125, + "grad_norm_var": 0.0179351806640625, + "learning_rate": 0.0001, + "loss": 5.4113, + "loss/crossentropy": 2.3895599842071533, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15647148340940475, + "step": 24524 + }, + { + "epoch": 0.7664375, + "grad_norm": 3.5625, + "grad_norm_var": 0.03140869140625, + "learning_rate": 0.0001, + "loss": 5.8155, + "loss/crossentropy": 2.6010197401046753, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1714508831501007, + "step": 24526 + }, + { + "epoch": 0.7665, + "grad_norm": 3.203125, + "grad_norm_var": 0.031473795572916664, + "learning_rate": 0.0001, + "loss": 5.6301, + "loss/crossentropy": 2.5225613117218018, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1673906147480011, + "step": 24528 + }, + { + "epoch": 0.7665625, + "grad_norm": 3.296875, + "grad_norm_var": 0.028661092122395832, + "learning_rate": 0.0001, + "loss": 5.5383, + "loss/crossentropy": 2.413439989089966, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1636573150753975, + "step": 24530 + }, + { + "epoch": 0.766625, + "grad_norm": 3.28125, + "grad_norm_var": 0.027464803059895834, + "learning_rate": 0.0001, + "loss": 5.8101, + "loss/crossentropy": 2.6230051517486572, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17066382616758347, + "step": 24532 + }, + { + "epoch": 0.7666875, + "grad_norm": 3.078125, + "grad_norm_var": 0.030134073893229165, + "learning_rate": 0.0001, + "loss": 5.9387, + "loss/crossentropy": 2.724544644355774, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17336948961019516, + "step": 24534 + }, + { + "epoch": 0.76675, + "grad_norm": 2.796875, + "grad_norm_var": 0.03655598958333333, + "learning_rate": 0.0001, + "loss": 5.5393, + "loss/crossentropy": 2.5030943155288696, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1551826074719429, + "step": 24536 + }, + { + "epoch": 0.7668125, + "grad_norm": 2.84375, + "grad_norm_var": 0.04495035807291667, + "learning_rate": 0.0001, + "loss": 5.5044, + "loss/crossentropy": 2.5110961198806763, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15636055171489716, + "step": 24538 + }, + { + "epoch": 0.766875, + "grad_norm": 3.5625, + "grad_norm_var": 0.05181884765625, + "learning_rate": 0.0001, + "loss": 5.6657, + "loss/crossentropy": 2.506834864616394, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1658850461244583, + "step": 24540 + }, + { + "epoch": 0.7669375, + "grad_norm": 3.046875, + "grad_norm_var": 0.046418253580729166, + "learning_rate": 0.0001, + "loss": 5.8044, + "loss/crossentropy": 2.661869168281555, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16620570421218872, + "step": 24542 + }, + { + "epoch": 0.767, + "grad_norm": 3.0625, + "grad_norm_var": 0.054638671875, + "learning_rate": 0.0001, + "loss": 5.6193, + "loss/crossentropy": 2.5758901834487915, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15746265649795532, + "step": 24544 + }, + { + "epoch": 0.7670625, + "grad_norm": 2.78125, + "grad_norm_var": 0.0587799072265625, + "learning_rate": 0.0001, + "loss": 5.1982, + "loss/crossentropy": 2.2970075607299805, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.14675837755203247, + "step": 24546 + }, + { + "epoch": 0.767125, + "grad_norm": 2.96875, + "grad_norm_var": 0.054133097330729164, + "learning_rate": 0.0001, + "loss": 5.6817, + "loss/crossentropy": 2.6156165599823, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15777605026960373, + "step": 24548 + }, + { + "epoch": 0.7671875, + "grad_norm": 3.0625, + "grad_norm_var": 0.074853515625, + "learning_rate": 0.0001, + "loss": 5.5262, + "loss/crossentropy": 2.412881851196289, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16249888390302658, + "step": 24550 + }, + { + "epoch": 0.76725, + "grad_norm": 3.09375, + "grad_norm_var": 0.07151285807291667, + "learning_rate": 0.0001, + "loss": 5.2193, + "loss/crossentropy": 2.2569509744644165, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15014218538999557, + "step": 24552 + }, + { + "epoch": 0.7673125, + "grad_norm": 3.0, + "grad_norm_var": 0.07652994791666666, + "learning_rate": 0.0001, + "loss": 5.469, + "loss/crossentropy": 2.3602248430252075, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16517900675535202, + "step": 24554 + }, + { + "epoch": 0.767375, + "grad_norm": 2.984375, + "grad_norm_var": 0.06404520670572916, + "learning_rate": 0.0001, + "loss": 5.5428, + "loss/crossentropy": 2.502559542655945, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16027280688285828, + "step": 24556 + }, + { + "epoch": 0.7674375, + "grad_norm": 3.171875, + "grad_norm_var": 0.059000651041666664, + "learning_rate": 0.0001, + "loss": 6.0277, + "loss/crossentropy": 2.832331895828247, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1695387214422226, + "step": 24558 + }, + { + "epoch": 0.7675, + "grad_norm": 3.25, + "grad_norm_var": 0.05788472493489583, + "learning_rate": 0.0001, + "loss": 6.0129, + "loss/crossentropy": 2.8243350982666016, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17237577587366104, + "step": 24560 + }, + { + "epoch": 0.7675625, + "grad_norm": 2.921875, + "grad_norm_var": 0.052408854166666664, + "learning_rate": 0.0001, + "loss": 5.5788, + "loss/crossentropy": 2.550336241722107, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16026712208986282, + "step": 24562 + }, + { + "epoch": 0.767625, + "grad_norm": 2.984375, + "grad_norm_var": 0.05190327962239583, + "learning_rate": 0.0001, + "loss": 5.4595, + "loss/crossentropy": 2.463630437850952, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1573958843946457, + "step": 24564 + }, + { + "epoch": 0.7676875, + "grad_norm": 3.0625, + "grad_norm_var": 0.03681233723958333, + "learning_rate": 0.0001, + "loss": 5.638, + "loss/crossentropy": 2.459873914718628, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16741740703582764, + "step": 24566 + }, + { + "epoch": 0.76775, + "grad_norm": 2.90625, + "grad_norm_var": 0.03730367024739583, + "learning_rate": 0.0001, + "loss": 5.5341, + "loss/crossentropy": 2.5051268339157104, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15836410969495773, + "step": 24568 + }, + { + "epoch": 0.7678125, + "grad_norm": 3.21875, + "grad_norm_var": 0.0269195556640625, + "learning_rate": 0.0001, + "loss": 6.196, + "loss/crossentropy": 2.882757544517517, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.18015281111001968, + "step": 24570 + }, + { + "epoch": 0.767875, + "grad_norm": 3.234375, + "grad_norm_var": 0.02672119140625, + "learning_rate": 0.0001, + "loss": 5.7163, + "loss/crossentropy": 2.5850621461868286, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16312387585639954, + "step": 24572 + }, + { + "epoch": 0.7679375, + "grad_norm": 2.984375, + "grad_norm_var": 0.029292805989583334, + "learning_rate": 0.0001, + "loss": 5.5689, + "loss/crossentropy": 2.532110333442688, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1599290817975998, + "step": 24574 + }, + { + "epoch": 0.768, + "grad_norm": 2.9375, + "grad_norm_var": 0.029182942708333333, + "learning_rate": 0.0001, + "loss": 5.6969, + "loss/crossentropy": 2.588170289993286, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1620495393872261, + "step": 24576 + }, + { + "epoch": 0.7680625, + "grad_norm": 2.890625, + "grad_norm_var": 0.029011027018229166, + "learning_rate": 0.0001, + "loss": 5.6838, + "loss/crossentropy": 2.591399908065796, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16431699693202972, + "step": 24578 + }, + { + "epoch": 0.768125, + "grad_norm": 2.90625, + "grad_norm_var": 0.0312652587890625, + "learning_rate": 0.0001, + "loss": 5.5217, + "loss/crossentropy": 2.458386540412903, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15984228253364563, + "step": 24580 + }, + { + "epoch": 0.7681875, + "grad_norm": 3.359375, + "grad_norm_var": 0.0259185791015625, + "learning_rate": 0.0001, + "loss": 5.6103, + "loss/crossentropy": 2.492032527923584, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16611865162849426, + "step": 24582 + }, + { + "epoch": 0.76825, + "grad_norm": 2.84375, + "grad_norm_var": 0.027131144205729166, + "learning_rate": 0.0001, + "loss": 5.6992, + "loss/crossentropy": 2.63890540599823, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16228335350751877, + "step": 24584 + }, + { + "epoch": 0.7683125, + "grad_norm": 3.640625, + "grad_norm_var": 0.04303385416666667, + "learning_rate": 0.0001, + "loss": 5.5841, + "loss/crossentropy": 2.472393035888672, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16390815377235413, + "step": 24586 + }, + { + "epoch": 0.768375, + "grad_norm": 2.796875, + "grad_norm_var": 0.049637858072916666, + "learning_rate": 0.0001, + "loss": 5.5688, + "loss/crossentropy": 2.5740370750427246, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.15963692218065262, + "step": 24588 + }, + { + "epoch": 0.7684375, + "grad_norm": 3.046875, + "grad_norm_var": 0.049788411458333334, + "learning_rate": 0.0001, + "loss": 5.3792, + "loss/crossentropy": 2.3682631254196167, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1569526493549347, + "step": 24590 + }, + { + "epoch": 0.7685, + "grad_norm": 2.796875, + "grad_norm_var": 0.04980061848958333, + "learning_rate": 0.0001, + "loss": 5.6177, + "loss/crossentropy": 2.562471032142639, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15826137363910675, + "step": 24592 + }, + { + "epoch": 0.7685625, + "grad_norm": 3.09375, + "grad_norm_var": 0.05293680826822917, + "learning_rate": 0.0001, + "loss": 5.431, + "loss/crossentropy": 2.345592975616455, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15971240401268005, + "step": 24594 + }, + { + "epoch": 0.768625, + "grad_norm": 3.3125, + "grad_norm_var": 0.053238932291666666, + "learning_rate": 0.0001, + "loss": 5.5878, + "loss/crossentropy": 2.4835760593414307, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16394073516130447, + "step": 24596 + }, + { + "epoch": 0.7686875, + "grad_norm": 3.234375, + "grad_norm_var": 0.05015360514322917, + "learning_rate": 0.0001, + "loss": 5.4102, + "loss/crossentropy": 2.3610429763793945, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15881942212581635, + "step": 24598 + }, + { + "epoch": 0.76875, + "grad_norm": 3.3125, + "grad_norm_var": 0.05064697265625, + "learning_rate": 0.0001, + "loss": 5.6839, + "loss/crossentropy": 2.5284552574157715, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16827517747879028, + "step": 24600 + }, + { + "epoch": 0.7688125, + "grad_norm": 2.921875, + "grad_norm_var": 0.0415924072265625, + "learning_rate": 0.0001, + "loss": 5.9036, + "loss/crossentropy": 2.785243272781372, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16417969018220901, + "step": 24602 + }, + { + "epoch": 0.768875, + "grad_norm": 2.96875, + "grad_norm_var": 0.03511962890625, + "learning_rate": 0.0001, + "loss": 5.427, + "loss/crossentropy": 2.4293742179870605, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15093691647052765, + "step": 24604 + }, + { + "epoch": 0.7689375, + "grad_norm": 3.015625, + "grad_norm_var": 0.044482421875, + "learning_rate": 0.0001, + "loss": 5.9005, + "loss/crossentropy": 2.66623592376709, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17576977610588074, + "step": 24606 + }, + { + "epoch": 0.769, + "grad_norm": 3.15625, + "grad_norm_var": 0.04067281087239583, + "learning_rate": 0.0001, + "loss": 5.6656, + "loss/crossentropy": 2.585243344306946, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1619393303990364, + "step": 24608 + }, + { + "epoch": 0.7690625, + "grad_norm": 3.390625, + "grad_norm_var": 0.051024373372395834, + "learning_rate": 0.0001, + "loss": 5.6106, + "loss/crossentropy": 2.5044806003570557, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16295979917049408, + "step": 24610 + }, + { + "epoch": 0.769125, + "grad_norm": 3.109375, + "grad_norm_var": 0.0509429931640625, + "learning_rate": 0.0001, + "loss": 5.8575, + "loss/crossentropy": 2.700343132019043, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1684456691145897, + "step": 24612 + }, + { + "epoch": 0.7691875, + "grad_norm": 3.234375, + "grad_norm_var": 0.0538238525390625, + "learning_rate": 0.0001, + "loss": 5.5381, + "loss/crossentropy": 2.435794472694397, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16413667798042297, + "step": 24614 + }, + { + "epoch": 0.76925, + "grad_norm": 2.78125, + "grad_norm_var": 0.057470703125, + "learning_rate": 0.0001, + "loss": 5.4452, + "loss/crossentropy": 2.4861371517181396, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15293768048286438, + "step": 24616 + }, + { + "epoch": 0.7693125, + "grad_norm": 3.140625, + "grad_norm_var": 0.04978739420572917, + "learning_rate": 0.0001, + "loss": 5.7474, + "loss/crossentropy": 2.54036545753479, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17070432007312775, + "step": 24618 + }, + { + "epoch": 0.769375, + "grad_norm": 3.0625, + "grad_norm_var": 0.05236002604166667, + "learning_rate": 0.0001, + "loss": 5.5705, + "loss/crossentropy": 2.5813162326812744, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15633995831012726, + "step": 24620 + }, + { + "epoch": 0.7694375, + "grad_norm": 3.09375, + "grad_norm_var": 0.04547526041666667, + "learning_rate": 0.0001, + "loss": 5.6974, + "loss/crossentropy": 2.709510087966919, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15660195797681808, + "step": 24622 + }, + { + "epoch": 0.7695, + "grad_norm": 3.09375, + "grad_norm_var": 0.043192545572916664, + "learning_rate": 0.0001, + "loss": 5.6082, + "loss/crossentropy": 2.563498377799988, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15837693214416504, + "step": 24624 + }, + { + "epoch": 0.7695625, + "grad_norm": 3.15625, + "grad_norm_var": 0.030866495768229165, + "learning_rate": 0.0001, + "loss": 5.7813, + "loss/crossentropy": 2.671633005142212, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16370201110839844, + "step": 24626 + }, + { + "epoch": 0.769625, + "grad_norm": 3.109375, + "grad_norm_var": 0.03408203125, + "learning_rate": 0.0001, + "loss": 5.5951, + "loss/crossentropy": 2.5170499086380005, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1605348140001297, + "step": 24628 + }, + { + "epoch": 0.7696875, + "grad_norm": 3.0, + "grad_norm_var": 0.028108723958333335, + "learning_rate": 0.0001, + "loss": 5.7315, + "loss/crossentropy": 2.6486425399780273, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16609492897987366, + "step": 24630 + }, + { + "epoch": 0.76975, + "grad_norm": 2.875, + "grad_norm_var": 0.033772786458333336, + "learning_rate": 0.0001, + "loss": 5.6552, + "loss/crossentropy": 2.550073027610779, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1655862033367157, + "step": 24632 + }, + { + "epoch": 0.7698125, + "grad_norm": 2.96875, + "grad_norm_var": 0.030594889322916666, + "learning_rate": 0.0001, + "loss": 5.8422, + "loss/crossentropy": 2.697143793106079, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16762875020503998, + "step": 24634 + }, + { + "epoch": 0.769875, + "grad_norm": 3.140625, + "grad_norm_var": 0.028718058268229166, + "learning_rate": 0.0001, + "loss": 5.4412, + "loss/crossentropy": 2.3402713537216187, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16438943147659302, + "step": 24636 + }, + { + "epoch": 0.7699375, + "grad_norm": 3.015625, + "grad_norm_var": 0.0243804931640625, + "learning_rate": 0.0001, + "loss": 5.2724, + "loss/crossentropy": 2.31243634223938, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15302617102861404, + "step": 24638 + }, + { + "epoch": 0.77, + "grad_norm": 3.109375, + "grad_norm_var": 0.0235748291015625, + "learning_rate": 0.0001, + "loss": 5.5695, + "loss/crossentropy": 2.542052388191223, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15860824286937714, + "step": 24640 + }, + { + "epoch": 0.7700625, + "grad_norm": 2.78125, + "grad_norm_var": 0.032124837239583336, + "learning_rate": 0.0001, + "loss": 5.4541, + "loss/crossentropy": 2.534891366958618, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15129749476909637, + "step": 24642 + }, + { + "epoch": 0.770125, + "grad_norm": 3.03125, + "grad_norm_var": 0.029637654622395832, + "learning_rate": 0.0001, + "loss": 5.4417, + "loss/crossentropy": 2.3925892114639282, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16037844121456146, + "step": 24644 + }, + { + "epoch": 0.7701875, + "grad_norm": 3.125, + "grad_norm_var": 0.03162333170572917, + "learning_rate": 0.0001, + "loss": 5.9961, + "loss/crossentropy": 2.7234139442443848, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1764841303229332, + "step": 24646 + }, + { + "epoch": 0.77025, + "grad_norm": 3.8125, + "grad_norm_var": 0.0557769775390625, + "learning_rate": 0.0001, + "loss": 5.5932, + "loss/crossentropy": 2.582900047302246, + "loss/hidden": 1.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.160794235765934, + "step": 24648 + }, + { + "epoch": 0.7703125, + "grad_norm": 2.96875, + "grad_norm_var": 0.05937398274739583, + "learning_rate": 0.0001, + "loss": 5.7556, + "loss/crossentropy": 2.619705557823181, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16437150537967682, + "step": 24650 + }, + { + "epoch": 0.770375, + "grad_norm": 3.359375, + "grad_norm_var": 0.414208984375, + "learning_rate": 0.0001, + "loss": 5.6751, + "loss/crossentropy": 2.449666142463684, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1760595217347145, + "step": 24652 + }, + { + "epoch": 0.7704375, + "grad_norm": 2.9375, + "grad_norm_var": 0.40784098307291666, + "learning_rate": 0.0001, + "loss": 5.5639, + "loss/crossentropy": 2.472114086151123, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.16855724155902863, + "step": 24654 + }, + { + "epoch": 0.7705, + "grad_norm": 3.046875, + "grad_norm_var": 0.41793212890625, + "learning_rate": 0.0001, + "loss": 5.8266, + "loss/crossentropy": 2.689388871192932, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16997243463993073, + "step": 24656 + }, + { + "epoch": 0.7705625, + "grad_norm": 3.015625, + "grad_norm_var": 0.39348958333333334, + "learning_rate": 0.0001, + "loss": 5.5236, + "loss/crossentropy": 2.5500930547714233, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15438473224639893, + "step": 24658 + }, + { + "epoch": 0.770625, + "grad_norm": 3.265625, + "grad_norm_var": 0.39765625, + "learning_rate": 0.0001, + "loss": 5.8364, + "loss/crossentropy": 2.690742611885071, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16729624569416046, + "step": 24660 + }, + { + "epoch": 0.7706875, + "grad_norm": 3.25, + "grad_norm_var": 0.39846598307291664, + "learning_rate": 0.0001, + "loss": 5.5172, + "loss/crossentropy": 2.4236879348754883, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16169168800115585, + "step": 24662 + }, + { + "epoch": 0.77075, + "grad_norm": 3.265625, + "grad_norm_var": 0.3762440999348958, + "learning_rate": 0.0001, + "loss": 6.1183, + "loss/crossentropy": 2.8352246284484863, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17635374516248703, + "step": 24664 + }, + { + "epoch": 0.7708125, + "grad_norm": 3.203125, + "grad_norm_var": 0.37018941243489584, + "learning_rate": 0.0001, + "loss": 5.7557, + "loss/crossentropy": 2.561169385910034, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16866753995418549, + "step": 24666 + }, + { + "epoch": 0.770875, + "grad_norm": 3.234375, + "grad_norm_var": 0.02154541015625, + "learning_rate": 0.0001, + "loss": 5.6417, + "loss/crossentropy": 2.5521970987319946, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16480600833892822, + "step": 24668 + }, + { + "epoch": 0.7709375, + "grad_norm": 3.1875, + "grad_norm_var": 0.018745930989583333, + "learning_rate": 0.0001, + "loss": 5.7403, + "loss/crossentropy": 2.6300442218780518, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16219773888587952, + "step": 24670 + }, + { + "epoch": 0.771, + "grad_norm": 3.09375, + "grad_norm_var": 0.012235514322916667, + "learning_rate": 0.0001, + "loss": 5.2951, + "loss/crossentropy": 2.3326436281204224, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1540607511997223, + "step": 24672 + }, + { + "epoch": 0.7710625, + "grad_norm": 3.34375, + "grad_norm_var": 0.014090983072916667, + "learning_rate": 0.0001, + "loss": 5.6972, + "loss/crossentropy": 2.546350121498108, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16743051260709763, + "step": 24674 + }, + { + "epoch": 0.771125, + "grad_norm": 3.0, + "grad_norm_var": 0.01383056640625, + "learning_rate": 0.0001, + "loss": 5.4417, + "loss/crossentropy": 2.4570369720458984, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15198008716106415, + "step": 24676 + }, + { + "epoch": 0.7711875, + "grad_norm": 3.53125, + "grad_norm_var": 0.0245513916015625, + "learning_rate": 0.0001, + "loss": 5.6782, + "loss/crossentropy": 2.674981474876404, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15735703706741333, + "step": 24678 + }, + { + "epoch": 0.77125, + "grad_norm": 3.03125, + "grad_norm_var": 0.025634765625, + "learning_rate": 0.0001, + "loss": 5.8005, + "loss/crossentropy": 2.643213987350464, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16924799233675003, + "step": 24680 + }, + { + "epoch": 0.7713125, + "grad_norm": 2.859375, + "grad_norm_var": 0.028369140625, + "learning_rate": 0.0001, + "loss": 5.821, + "loss/crossentropy": 2.7028998136520386, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16650206595659256, + "step": 24682 + }, + { + "epoch": 0.771375, + "grad_norm": 3.25, + "grad_norm_var": 0.03264058430989583, + "learning_rate": 0.0001, + "loss": 5.9664, + "loss/crossentropy": 2.768761396408081, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17328058928251266, + "step": 24684 + }, + { + "epoch": 0.7714375, + "grad_norm": 4.375, + "grad_norm_var": 0.1312164306640625, + "learning_rate": 0.0001, + "loss": 5.8115, + "loss/crossentropy": 2.6067157983779907, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1704786792397499, + "step": 24686 + }, + { + "epoch": 0.7715, + "grad_norm": 3.109375, + "grad_norm_var": 0.13782552083333333, + "learning_rate": 0.0001, + "loss": 5.8724, + "loss/crossentropy": 2.8215242624282837, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1605537086725235, + "step": 24688 + }, + { + "epoch": 0.7715625, + "grad_norm": 3.109375, + "grad_norm_var": 0.13571675618489584, + "learning_rate": 0.0001, + "loss": 5.9054, + "loss/crossentropy": 2.739312529563904, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.17129508405923843, + "step": 24690 + }, + { + "epoch": 0.771625, + "grad_norm": 3.078125, + "grad_norm_var": 0.1363677978515625, + "learning_rate": 0.0001, + "loss": 5.6326, + "loss/crossentropy": 2.5702801942825317, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.16482558846473694, + "step": 24692 + }, + { + "epoch": 0.7716875, + "grad_norm": 3.1875, + "grad_norm_var": 0.12546284993489584, + "learning_rate": 0.0001, + "loss": 5.5806, + "loss/crossentropy": 2.48318612575531, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16287126392126083, + "step": 24694 + }, + { + "epoch": 0.77175, + "grad_norm": 3.515625, + "grad_norm_var": 0.13491923014322918, + "learning_rate": 0.0001, + "loss": 5.9828, + "loss/crossentropy": 2.823164939880371, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16635094583034515, + "step": 24696 + }, + { + "epoch": 0.7718125, + "grad_norm": 2.90625, + "grad_norm_var": 0.13479715983072918, + "learning_rate": 0.0001, + "loss": 5.2987, + "loss/crossentropy": 2.3417288064956665, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.155068501830101, + "step": 24698 + }, + { + "epoch": 0.771875, + "grad_norm": 3.21875, + "grad_norm_var": 0.13093973795572916, + "learning_rate": 0.0001, + "loss": 5.5299, + "loss/crossentropy": 2.449816107749939, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16113300621509552, + "step": 24700 + }, + { + "epoch": 0.7719375, + "grad_norm": 3.296875, + "grad_norm_var": 0.037353515625, + "learning_rate": 0.0001, + "loss": 6.0161, + "loss/crossentropy": 2.7602035999298096, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17402522265911102, + "step": 24702 + }, + { + "epoch": 0.772, + "grad_norm": 3.09375, + "grad_norm_var": 0.03395894368489583, + "learning_rate": 0.0001, + "loss": 5.3871, + "loss/crossentropy": 2.3438072204589844, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16136392951011658, + "step": 24704 + }, + { + "epoch": 0.7720625, + "grad_norm": 3.125, + "grad_norm_var": 0.04055582682291667, + "learning_rate": 0.0001, + "loss": 5.8478, + "loss/crossentropy": 2.6474716663360596, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16964183002710342, + "step": 24706 + }, + { + "epoch": 0.772125, + "grad_norm": 3.28125, + "grad_norm_var": 0.037646484375, + "learning_rate": 0.0001, + "loss": 5.6259, + "loss/crossentropy": 2.491701126098633, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16185308247804642, + "step": 24708 + }, + { + "epoch": 0.7721875, + "grad_norm": 3.1875, + "grad_norm_var": 0.0439849853515625, + "learning_rate": 0.0001, + "loss": 5.6412, + "loss/crossentropy": 2.3866571187973022, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17232473194599152, + "step": 24710 + }, + { + "epoch": 0.77225, + "grad_norm": 2.984375, + "grad_norm_var": 0.0389068603515625, + "learning_rate": 0.0001, + "loss": 5.3108, + "loss/crossentropy": 2.3295599222183228, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15086036920547485, + "step": 24712 + }, + { + "epoch": 0.7723125, + "grad_norm": 2.921875, + "grad_norm_var": 0.03654683430989583, + "learning_rate": 0.0001, + "loss": 5.5482, + "loss/crossentropy": 2.5633562803268433, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15708180516958237, + "step": 24714 + }, + { + "epoch": 0.772375, + "grad_norm": 2.828125, + "grad_norm_var": 0.04842020670572917, + "learning_rate": 0.0001, + "loss": 5.7862, + "loss/crossentropy": 2.6539541482925415, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16868995130062103, + "step": 24716 + }, + { + "epoch": 0.7724375, + "grad_norm": 3.171875, + "grad_norm_var": 0.03962300618489583, + "learning_rate": 0.0001, + "loss": 5.6143, + "loss/crossentropy": 2.4918437004089355, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16146771609783173, + "step": 24718 + }, + { + "epoch": 0.7725, + "grad_norm": 3.0625, + "grad_norm_var": 0.04254150390625, + "learning_rate": 0.0001, + "loss": 5.3944, + "loss/crossentropy": 2.48309588432312, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.15128827840089798, + "step": 24720 + }, + { + "epoch": 0.7725625, + "grad_norm": 2.953125, + "grad_norm_var": 0.05237630208333333, + "learning_rate": 0.0001, + "loss": 5.4593, + "loss/crossentropy": 2.4418129920959473, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1568250209093094, + "step": 24722 + }, + { + "epoch": 0.772625, + "grad_norm": 3.046875, + "grad_norm_var": 0.055826822916666664, + "learning_rate": 0.0001, + "loss": 5.3066, + "loss/crossentropy": 2.3395392894744873, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15412385761737823, + "step": 24724 + }, + { + "epoch": 0.7726875, + "grad_norm": 3.21875, + "grad_norm_var": 0.044331868489583336, + "learning_rate": 0.0001, + "loss": 5.512, + "loss/crossentropy": 2.472067952156067, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16141273826360703, + "step": 24726 + }, + { + "epoch": 0.77275, + "grad_norm": 3.25, + "grad_norm_var": 0.04455973307291667, + "learning_rate": 0.0001, + "loss": 5.6816, + "loss/crossentropy": 2.6148258447647095, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16175460070371628, + "step": 24728 + }, + { + "epoch": 0.7728125, + "grad_norm": 3.359375, + "grad_norm_var": 0.0489654541015625, + "learning_rate": 0.0001, + "loss": 5.5001, + "loss/crossentropy": 2.4213614463806152, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16295599192380905, + "step": 24730 + }, + { + "epoch": 0.772875, + "grad_norm": 2.8125, + "grad_norm_var": 0.0503326416015625, + "learning_rate": 0.0001, + "loss": 5.649, + "loss/crossentropy": 2.5784149169921875, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16448526084423065, + "step": 24732 + }, + { + "epoch": 0.7729375, + "grad_norm": 3.109375, + "grad_norm_var": 0.0536529541015625, + "learning_rate": 0.0001, + "loss": 5.6552, + "loss/crossentropy": 2.6306252479553223, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16026630997657776, + "step": 24734 + }, + { + "epoch": 0.773, + "grad_norm": 2.90625, + "grad_norm_var": 0.062939453125, + "learning_rate": 0.0001, + "loss": 5.3572, + "loss/crossentropy": 2.4248207807540894, + "loss/hidden": 1.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.15378645062446594, + "step": 24736 + }, + { + "epoch": 0.7730625, + "grad_norm": 2.734375, + "grad_norm_var": 0.050048828125, + "learning_rate": 0.0001, + "loss": 5.5771, + "loss/crossentropy": 2.5675575733184814, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15759897232055664, + "step": 24738 + }, + { + "epoch": 0.773125, + "grad_norm": 3.0625, + "grad_norm_var": 0.0483795166015625, + "learning_rate": 0.0001, + "loss": 5.2249, + "loss/crossentropy": 2.2803955078125, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15304088592529297, + "step": 24740 + }, + { + "epoch": 0.7731875, + "grad_norm": 3.03125, + "grad_norm_var": 0.0474609375, + "learning_rate": 0.0001, + "loss": 5.3043, + "loss/crossentropy": 2.3219728469848633, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15448330342769623, + "step": 24742 + }, + { + "epoch": 0.77325, + "grad_norm": 3.15625, + "grad_norm_var": 0.0469635009765625, + "learning_rate": 0.0001, + "loss": 5.5888, + "loss/crossentropy": 2.528611898422241, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.16499833017587662, + "step": 24744 + }, + { + "epoch": 0.7733125, + "grad_norm": 2.90625, + "grad_norm_var": 0.04052327473958333, + "learning_rate": 0.0001, + "loss": 5.63, + "loss/crossentropy": 2.6092342138290405, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15989167988300323, + "step": 24746 + }, + { + "epoch": 0.773375, + "grad_norm": 3.421875, + "grad_norm_var": 0.03453369140625, + "learning_rate": 0.0001, + "loss": 5.5182, + "loss/crossentropy": 2.3983983993530273, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16314804553985596, + "step": 24748 + }, + { + "epoch": 0.7734375, + "grad_norm": 3.171875, + "grad_norm_var": 0.03707682291666667, + "learning_rate": 0.0001, + "loss": 5.7343, + "loss/crossentropy": 2.582399845123291, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16597573459148407, + "step": 24750 + }, + { + "epoch": 0.7735, + "grad_norm": 3.0, + "grad_norm_var": 0.027936808268229165, + "learning_rate": 0.0001, + "loss": 6.0781, + "loss/crossentropy": 2.916908025741577, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16611584275960922, + "step": 24752 + }, + { + "epoch": 0.7735625, + "grad_norm": 3.015625, + "grad_norm_var": 0.019269816080729165, + "learning_rate": 0.0001, + "loss": 5.567, + "loss/crossentropy": 2.5042638778686523, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15861467272043228, + "step": 24754 + }, + { + "epoch": 0.773625, + "grad_norm": 3.375, + "grad_norm_var": 0.0253082275390625, + "learning_rate": 0.0001, + "loss": 5.654, + "loss/crossentropy": 2.4719542264938354, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1666421741247177, + "step": 24756 + }, + { + "epoch": 0.7736875, + "grad_norm": 3.015625, + "grad_norm_var": 0.0220367431640625, + "learning_rate": 0.0001, + "loss": 5.8333, + "loss/crossentropy": 2.6948615312576294, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16696688532829285, + "step": 24758 + }, + { + "epoch": 0.77375, + "grad_norm": 2.890625, + "grad_norm_var": 0.025300089518229166, + "learning_rate": 0.0001, + "loss": 5.6459, + "loss/crossentropy": 2.5435097217559814, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1618061512708664, + "step": 24760 + }, + { + "epoch": 0.7738125, + "grad_norm": 3.03125, + "grad_norm_var": 0.024779256184895834, + "learning_rate": 0.0001, + "loss": 5.5149, + "loss/crossentropy": 2.5401512384414673, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.15567483007907867, + "step": 24762 + }, + { + "epoch": 0.773875, + "grad_norm": 3.203125, + "grad_norm_var": 0.01689453125, + "learning_rate": 0.0001, + "loss": 5.8176, + "loss/crossentropy": 2.6180412769317627, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.17542476952075958, + "step": 24764 + }, + { + "epoch": 0.7739375, + "grad_norm": 3.796875, + "grad_norm_var": 0.04742431640625, + "learning_rate": 0.0001, + "loss": 6.158, + "loss/crossentropy": 2.888972043991089, + "loss/hidden": 1.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.17338383942842484, + "step": 24766 + }, + { + "epoch": 0.774, + "grad_norm": 3.109375, + "grad_norm_var": 0.04649149576822917, + "learning_rate": 0.0001, + "loss": 5.9144, + "loss/crossentropy": 2.7113550901412964, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17303407192230225, + "step": 24768 + }, + { + "epoch": 0.7740625, + "grad_norm": 3.546875, + "grad_norm_var": 0.0568023681640625, + "learning_rate": 0.0001, + "loss": 5.7253, + "loss/crossentropy": 2.5738178491592407, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1671004444360733, + "step": 24770 + }, + { + "epoch": 0.774125, + "grad_norm": 3.484375, + "grad_norm_var": 0.06061197916666667, + "learning_rate": 0.0001, + "loss": 5.9538, + "loss/crossentropy": 2.709931254386902, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17360104620456696, + "step": 24772 + }, + { + "epoch": 0.7741875, + "grad_norm": 2.921875, + "grad_norm_var": 0.06789957682291667, + "learning_rate": 0.0001, + "loss": 5.4632, + "loss/crossentropy": 2.4295272827148438, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15883446484804153, + "step": 24774 + }, + { + "epoch": 0.77425, + "grad_norm": 3.03125, + "grad_norm_var": 0.06398824055989584, + "learning_rate": 0.0001, + "loss": 5.6834, + "loss/crossentropy": 2.581278085708618, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1641213744878769, + "step": 24776 + }, + { + "epoch": 0.7743125, + "grad_norm": 2.8125, + "grad_norm_var": 0.067626953125, + "learning_rate": 0.0001, + "loss": 5.5977, + "loss/crossentropy": 2.5667072534561157, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1585664376616478, + "step": 24778 + }, + { + "epoch": 0.774375, + "grad_norm": 3.21875, + "grad_norm_var": 0.0672027587890625, + "learning_rate": 0.0001, + "loss": 6.0705, + "loss/crossentropy": 2.816719889640808, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1753796860575676, + "step": 24780 + }, + { + "epoch": 0.7744375, + "grad_norm": 3.015625, + "grad_norm_var": 0.039143880208333336, + "learning_rate": 0.0001, + "loss": 5.2543, + "loss/crossentropy": 2.2697088718414307, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15118995308876038, + "step": 24782 + }, + { + "epoch": 0.7745, + "grad_norm": 3.296875, + "grad_norm_var": 0.04049072265625, + "learning_rate": 0.0001, + "loss": 5.8765, + "loss/crossentropy": 2.7213023900985718, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16435182094573975, + "step": 24784 + }, + { + "epoch": 0.7745625, + "grad_norm": 3.171875, + "grad_norm_var": 0.0284820556640625, + "learning_rate": 0.0001, + "loss": 5.5373, + "loss/crossentropy": 2.464460611343384, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16001853346824646, + "step": 24786 + }, + { + "epoch": 0.774625, + "grad_norm": 3.046875, + "grad_norm_var": 0.018195597330729167, + "learning_rate": 0.0001, + "loss": 5.7178, + "loss/crossentropy": 2.6299628019332886, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16112637519836426, + "step": 24788 + }, + { + "epoch": 0.7746875, + "grad_norm": 3.0, + "grad_norm_var": 0.014306640625, + "learning_rate": 0.0001, + "loss": 5.662, + "loss/crossentropy": 2.570648670196533, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.158745676279068, + "step": 24790 + }, + { + "epoch": 0.77475, + "grad_norm": 2.828125, + "grad_norm_var": 0.018798828125, + "learning_rate": 0.0001, + "loss": 5.5327, + "loss/crossentropy": 2.5075855255126953, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.16149257868528366, + "step": 24792 + }, + { + "epoch": 0.7748125, + "grad_norm": 2.90625, + "grad_norm_var": 0.015478515625, + "learning_rate": 0.0001, + "loss": 5.6924, + "loss/crossentropy": 2.586639642715454, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1625300496816635, + "step": 24794 + }, + { + "epoch": 0.774875, + "grad_norm": 3.09375, + "grad_norm_var": 0.015119425455729167, + "learning_rate": 0.0001, + "loss": 5.5558, + "loss/crossentropy": 2.5161207914352417, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.16217201948165894, + "step": 24796 + }, + { + "epoch": 0.7749375, + "grad_norm": 2.90625, + "grad_norm_var": 0.016109212239583334, + "learning_rate": 0.0001, + "loss": 5.5311, + "loss/crossentropy": 2.5070807933807373, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15865183621644974, + "step": 24798 + }, + { + "epoch": 0.775, + "grad_norm": 2.96875, + "grad_norm_var": 0.011507161458333333, + "learning_rate": 0.0001, + "loss": 5.5333, + "loss/crossentropy": 2.538558006286621, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15416593849658966, + "step": 24800 + }, + { + "epoch": 0.7750625, + "grad_norm": 3.09375, + "grad_norm_var": 0.010677083333333334, + "learning_rate": 0.0001, + "loss": 5.7173, + "loss/crossentropy": 2.6183621883392334, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16458562016487122, + "step": 24802 + }, + { + "epoch": 0.775125, + "grad_norm": 3.125, + "grad_norm_var": 0.013850911458333334, + "learning_rate": 0.0001, + "loss": 6.0465, + "loss/crossentropy": 2.82187819480896, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1736362800002098, + "step": 24804 + }, + { + "epoch": 0.7751875, + "grad_norm": 3.328125, + "grad_norm_var": 0.017073567708333334, + "learning_rate": 0.0001, + "loss": 5.8288, + "loss/crossentropy": 2.6323657035827637, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1727638989686966, + "step": 24806 + }, + { + "epoch": 0.77525, + "grad_norm": 2.828125, + "grad_norm_var": 0.017243448893229166, + "learning_rate": 0.0001, + "loss": 5.759, + "loss/crossentropy": 2.6707929372787476, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16038425266742706, + "step": 24808 + }, + { + "epoch": 0.7753125, + "grad_norm": 3.015625, + "grad_norm_var": 0.020970662434895832, + "learning_rate": 0.0001, + "loss": 5.672, + "loss/crossentropy": 2.5206936597824097, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16591551899909973, + "step": 24810 + }, + { + "epoch": 0.775375, + "grad_norm": 3.0625, + "grad_norm_var": 0.020751953125, + "learning_rate": 0.0001, + "loss": 5.6463, + "loss/crossentropy": 2.5930248498916626, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16157421469688416, + "step": 24812 + }, + { + "epoch": 0.7754375, + "grad_norm": 3.328125, + "grad_norm_var": 0.022233072916666666, + "learning_rate": 0.0001, + "loss": 5.7211, + "loss/crossentropy": 2.590226650238037, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16738758236169815, + "step": 24814 + }, + { + "epoch": 0.7755, + "grad_norm": 3.59375, + "grad_norm_var": 0.033503214518229164, + "learning_rate": 0.0001, + "loss": 5.7101, + "loss/crossentropy": 2.5524097681045532, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16850201040506363, + "step": 24816 + }, + { + "epoch": 0.7755625, + "grad_norm": 3.21875, + "grad_norm_var": 0.03428446451822917, + "learning_rate": 0.0001, + "loss": 5.4872, + "loss/crossentropy": 2.4411537647247314, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16007166355848312, + "step": 24818 + }, + { + "epoch": 0.775625, + "grad_norm": 3.34375, + "grad_norm_var": 0.037840779622395834, + "learning_rate": 0.0001, + "loss": 5.6216, + "loss/crossentropy": 2.4603710174560547, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16846421360969543, + "step": 24820 + }, + { + "epoch": 0.7756875, + "grad_norm": 2.984375, + "grad_norm_var": 0.03760477701822917, + "learning_rate": 0.0001, + "loss": 5.5557, + "loss/crossentropy": 2.583541512489319, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15385423600673676, + "step": 24822 + }, + { + "epoch": 0.77575, + "grad_norm": 3.0, + "grad_norm_var": 0.031962076822916664, + "learning_rate": 0.0001, + "loss": 5.96, + "loss/crossentropy": 2.7172566652297974, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17466983199119568, + "step": 24824 + }, + { + "epoch": 0.7758125, + "grad_norm": 3.078125, + "grad_norm_var": 0.030695597330729168, + "learning_rate": 0.0001, + "loss": 5.7281, + "loss/crossentropy": 2.6229422092437744, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16754963248968124, + "step": 24826 + }, + { + "epoch": 0.775875, + "grad_norm": 2.953125, + "grad_norm_var": 0.0330230712890625, + "learning_rate": 0.0001, + "loss": 5.57, + "loss/crossentropy": 2.596638798713684, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.15554015338420868, + "step": 24828 + }, + { + "epoch": 0.7759375, + "grad_norm": 2.890625, + "grad_norm_var": 7.518680826822917, + "learning_rate": 0.0001, + "loss": 5.2667, + "loss/crossentropy": 2.226096510887146, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.14898383617401123, + "step": 24830 + }, + { + "epoch": 0.776, + "grad_norm": 3.34375, + "grad_norm_var": 7.539273071289062, + "learning_rate": 0.0001, + "loss": 5.4205, + "loss/crossentropy": 2.4121206998825073, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15747754275798798, + "step": 24832 + }, + { + "epoch": 0.7760625, + "grad_norm": 3.59375, + "grad_norm_var": 7.495536295572917, + "learning_rate": 0.0001, + "loss": 6.0159, + "loss/crossentropy": 2.6774450540542603, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.17837996780872345, + "step": 24834 + }, + { + "epoch": 0.776125, + "grad_norm": 3.1875, + "grad_norm_var": 7.490347290039063, + "learning_rate": 0.0001, + "loss": 5.8412, + "loss/crossentropy": 2.6141446828842163, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17114035040140152, + "step": 24836 + }, + { + "epoch": 0.7761875, + "grad_norm": 3.25, + "grad_norm_var": 7.467740885416666, + "learning_rate": 0.0001, + "loss": 5.4981, + "loss/crossentropy": 2.413454294204712, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1604211926460266, + "step": 24838 + }, + { + "epoch": 0.77625, + "grad_norm": 3.21875, + "grad_norm_var": 7.442805989583333, + "learning_rate": 0.0001, + "loss": 5.7351, + "loss/crossentropy": 2.582782506942749, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1671806275844574, + "step": 24840 + }, + { + "epoch": 0.7763125, + "grad_norm": 3.71875, + "grad_norm_var": 7.39107666015625, + "learning_rate": 0.0001, + "loss": 5.5931, + "loss/crossentropy": 2.4901710748672485, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1614660620689392, + "step": 24842 + }, + { + "epoch": 0.776375, + "grad_norm": 2.9375, + "grad_norm_var": 7.373177083333333, + "learning_rate": 0.0001, + "loss": 5.6264, + "loss/crossentropy": 2.586179494857788, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15948884189128876, + "step": 24844 + }, + { + "epoch": 0.7764375, + "grad_norm": 3.109375, + "grad_norm_var": 0.0643707275390625, + "learning_rate": 0.0001, + "loss": 5.9235, + "loss/crossentropy": 2.758601188659668, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1680537387728691, + "step": 24846 + }, + { + "epoch": 0.7765, + "grad_norm": 3.453125, + "grad_norm_var": 0.08557942708333334, + "learning_rate": 0.0001, + "loss": 5.5767, + "loss/crossentropy": 2.444355845451355, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.16010946035385132, + "step": 24848 + }, + { + "epoch": 0.7765625, + "grad_norm": 3.0625, + "grad_norm_var": 0.07801005045572916, + "learning_rate": 0.0001, + "loss": 5.5247, + "loss/crossentropy": 2.4560447931289673, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16155368089675903, + "step": 24850 + }, + { + "epoch": 0.776625, + "grad_norm": 3.03125, + "grad_norm_var": 0.06894124348958333, + "learning_rate": 0.0001, + "loss": 5.8591, + "loss/crossentropy": 2.7664119005203247, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16317206621170044, + "step": 24852 + }, + { + "epoch": 0.7766875, + "grad_norm": 5.0625, + "grad_norm_var": 0.28684895833333335, + "learning_rate": 0.0001, + "loss": 5.5702, + "loss/crossentropy": 2.3672115802764893, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17381131649017334, + "step": 24854 + }, + { + "epoch": 0.77675, + "grad_norm": 3.15625, + "grad_norm_var": 0.2888671875, + "learning_rate": 0.0001, + "loss": 5.8842, + "loss/crossentropy": 2.6977726221084595, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.17333439737558365, + "step": 24856 + }, + { + "epoch": 0.7768125, + "grad_norm": 3.078125, + "grad_norm_var": 0.2814605712890625, + "learning_rate": 0.0001, + "loss": 5.4317, + "loss/crossentropy": 2.418553113937378, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1587332859635353, + "step": 24858 + }, + { + "epoch": 0.776875, + "grad_norm": 3.09375, + "grad_norm_var": 0.284814453125, + "learning_rate": 0.0001, + "loss": 5.505, + "loss/crossentropy": 2.3784608840942383, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1669526994228363, + "step": 24860 + }, + { + "epoch": 0.7769375, + "grad_norm": 3.078125, + "grad_norm_var": 0.28543192545572915, + "learning_rate": 0.0001, + "loss": 5.8212, + "loss/crossentropy": 2.6897518634796143, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1678316667675972, + "step": 24862 + }, + { + "epoch": 0.777, + "grad_norm": 3.140625, + "grad_norm_var": 0.2692616780598958, + "learning_rate": 0.0001, + "loss": 5.552, + "loss/crossentropy": 2.5030031204223633, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1588040366768837, + "step": 24864 + }, + { + "epoch": 0.7770625, + "grad_norm": 3.71875, + "grad_norm_var": 0.2734771728515625, + "learning_rate": 0.0001, + "loss": 6.067, + "loss/crossentropy": 2.8639168739318848, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1730470359325409, + "step": 24866 + }, + { + "epoch": 0.777125, + "grad_norm": 3.1875, + "grad_norm_var": 0.26049702962239585, + "learning_rate": 0.0001, + "loss": 5.8398, + "loss/crossentropy": 2.6833678483963013, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16408467292785645, + "step": 24868 + }, + { + "epoch": 0.7771875, + "grad_norm": 3.1875, + "grad_norm_var": 0.04606831868489583, + "learning_rate": 0.0001, + "loss": 5.9563, + "loss/crossentropy": 2.810359001159668, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16381268203258514, + "step": 24870 + }, + { + "epoch": 0.77725, + "grad_norm": 3.078125, + "grad_norm_var": 0.0539703369140625, + "learning_rate": 0.0001, + "loss": 5.4282, + "loss/crossentropy": 2.3819870948791504, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.15422788262367249, + "step": 24872 + }, + { + "epoch": 0.7773125, + "grad_norm": 2.90625, + "grad_norm_var": 0.05467020670572917, + "learning_rate": 0.0001, + "loss": 5.6281, + "loss/crossentropy": 2.5575016736984253, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16330711543560028, + "step": 24874 + }, + { + "epoch": 0.777375, + "grad_norm": 2.9375, + "grad_norm_var": 0.051806640625, + "learning_rate": 0.0001, + "loss": 5.7753, + "loss/crossentropy": 2.6280269622802734, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16707344353199005, + "step": 24876 + }, + { + "epoch": 0.7774375, + "grad_norm": 3.265625, + "grad_norm_var": 0.0498443603515625, + "learning_rate": 0.0001, + "loss": 6.1472, + "loss/crossentropy": 2.795683979988098, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.18241284787654877, + "step": 24878 + }, + { + "epoch": 0.7775, + "grad_norm": 3.8125, + "grad_norm_var": 0.0611724853515625, + "learning_rate": 0.0001, + "loss": 5.9611, + "loss/crossentropy": 2.6614965200424194, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.18074382841587067, + "step": 24880 + }, + { + "epoch": 0.7775625, + "grad_norm": 3.234375, + "grad_norm_var": 0.06274312337239583, + "learning_rate": 0.0001, + "loss": 5.7818, + "loss/crossentropy": 2.665590763092041, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16552364826202393, + "step": 24882 + }, + { + "epoch": 0.777625, + "grad_norm": 3.171875, + "grad_norm_var": 0.06363016764322917, + "learning_rate": 0.0001, + "loss": 5.6286, + "loss/crossentropy": 2.4945160150527954, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16653265058994293, + "step": 24884 + }, + { + "epoch": 0.7776875, + "grad_norm": 2.890625, + "grad_norm_var": 0.07089436848958333, + "learning_rate": 0.0001, + "loss": 5.6881, + "loss/crossentropy": 2.6458394527435303, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1569637879729271, + "step": 24886 + }, + { + "epoch": 0.77775, + "grad_norm": 2.953125, + "grad_norm_var": 0.06551106770833333, + "learning_rate": 0.0001, + "loss": 5.7243, + "loss/crossentropy": 2.640910267829895, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16068334877490997, + "step": 24888 + }, + { + "epoch": 0.7778125, + "grad_norm": 3.015625, + "grad_norm_var": 0.0659088134765625, + "learning_rate": 0.0001, + "loss": 5.549, + "loss/crossentropy": 2.4916951656341553, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.15651002526283264, + "step": 24890 + }, + { + "epoch": 0.777875, + "grad_norm": 3.359375, + "grad_norm_var": 0.061498006184895836, + "learning_rate": 0.0001, + "loss": 5.5354, + "loss/crossentropy": 2.4490002393722534, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16293669492006302, + "step": 24892 + }, + { + "epoch": 0.7779375, + "grad_norm": 3.453125, + "grad_norm_var": 0.06623942057291667, + "learning_rate": 0.0001, + "loss": 5.9291, + "loss/crossentropy": 2.730451464653015, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17181465774774551, + "step": 24894 + }, + { + "epoch": 0.778, + "grad_norm": 3.109375, + "grad_norm_var": 0.03559468587239583, + "learning_rate": 0.0001, + "loss": 5.9067, + "loss/crossentropy": 2.7338948249816895, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17040778696537018, + "step": 24896 + }, + { + "epoch": 0.7780625, + "grad_norm": 2.953125, + "grad_norm_var": 0.029588826497395835, + "learning_rate": 0.0001, + "loss": 5.7572, + "loss/crossentropy": 2.622227907180786, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.168186254799366, + "step": 24898 + }, + { + "epoch": 0.778125, + "grad_norm": 3.0625, + "grad_norm_var": 0.02750244140625, + "learning_rate": 0.0001, + "loss": 5.7004, + "loss/crossentropy": 2.5909247398376465, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16290107369422913, + "step": 24900 + }, + { + "epoch": 0.7781875, + "grad_norm": 3.171875, + "grad_norm_var": 0.0531890869140625, + "learning_rate": 0.0001, + "loss": 5.733, + "loss/crossentropy": 2.54416024684906, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1712314933538437, + "step": 24902 + }, + { + "epoch": 0.77825, + "grad_norm": 3.15625, + "grad_norm_var": 0.054361979166666664, + "learning_rate": 0.0001, + "loss": 5.6732, + "loss/crossentropy": 2.612813949584961, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1630677431821823, + "step": 24904 + }, + { + "epoch": 0.7783125, + "grad_norm": 3.5625, + "grad_norm_var": 0.05924072265625, + "learning_rate": 0.0001, + "loss": 5.705, + "loss/crossentropy": 2.6064003705978394, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1637658178806305, + "step": 24906 + }, + { + "epoch": 0.778375, + "grad_norm": 3.03125, + "grad_norm_var": 0.05500386555989583, + "learning_rate": 0.0001, + "loss": 5.5631, + "loss/crossentropy": 2.5389318466186523, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1555420458316803, + "step": 24908 + }, + { + "epoch": 0.7784375, + "grad_norm": 3.75, + "grad_norm_var": 0.08580322265625, + "learning_rate": 0.0001, + "loss": 5.8155, + "loss/crossentropy": 2.735503077507019, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16190160065889359, + "step": 24910 + }, + { + "epoch": 0.7785, + "grad_norm": 2.96875, + "grad_norm_var": 0.08651936848958333, + "learning_rate": 0.0001, + "loss": 5.6744, + "loss/crossentropy": 2.610866665840149, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1610436588525772, + "step": 24912 + }, + { + "epoch": 0.7785625, + "grad_norm": 3.234375, + "grad_norm_var": 0.08622639973958333, + "learning_rate": 0.0001, + "loss": 5.6425, + "loss/crossentropy": 2.6181299686431885, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15947112441062927, + "step": 24914 + }, + { + "epoch": 0.778625, + "grad_norm": 3.03125, + "grad_norm_var": 0.0912506103515625, + "learning_rate": 0.0001, + "loss": 5.6656, + "loss/crossentropy": 2.5940080881118774, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1618470698595047, + "step": 24916 + }, + { + "epoch": 0.7786875, + "grad_norm": 3.53125, + "grad_norm_var": 0.07492574055989583, + "learning_rate": 0.0001, + "loss": 5.7441, + "loss/crossentropy": 2.642926573753357, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16245951503515244, + "step": 24918 + }, + { + "epoch": 0.77875, + "grad_norm": 3.046875, + "grad_norm_var": 0.07492574055989583, + "learning_rate": 0.0001, + "loss": 5.7488, + "loss/crossentropy": 2.70668888092041, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16123943030834198, + "step": 24920 + }, + { + "epoch": 0.7788125, + "grad_norm": 3.15625, + "grad_norm_var": 0.06214090983072917, + "learning_rate": 0.0001, + "loss": 5.5348, + "loss/crossentropy": 2.4590747356414795, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16225910186767578, + "step": 24922 + }, + { + "epoch": 0.778875, + "grad_norm": 2.875, + "grad_norm_var": 0.0652008056640625, + "learning_rate": 0.0001, + "loss": 5.3856, + "loss/crossentropy": 2.4269120693206787, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15524086356163025, + "step": 24924 + }, + { + "epoch": 0.7789375, + "grad_norm": 3.21875, + "grad_norm_var": 0.036498006184895834, + "learning_rate": 0.0001, + "loss": 5.965, + "loss/crossentropy": 2.7144534587860107, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17505048215389252, + "step": 24926 + }, + { + "epoch": 0.779, + "grad_norm": 3.921875, + "grad_norm_var": 0.39690755208333334, + "learning_rate": 0.0001, + "loss": 6.2914, + "loss/crossentropy": 2.870009183883667, + "loss/hidden": 1.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.1862778663635254, + "step": 24928 + }, + { + "epoch": 0.7790625, + "grad_norm": 3.125, + "grad_norm_var": 0.3919097900390625, + "learning_rate": 0.0001, + "loss": 5.9061, + "loss/crossentropy": 2.698647975921631, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17113231867551804, + "step": 24930 + }, + { + "epoch": 0.779125, + "grad_norm": 3.65625, + "grad_norm_var": 0.3816151936848958, + "learning_rate": 0.0001, + "loss": 5.943, + "loss/crossentropy": 2.770224094390869, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16689125448465347, + "step": 24932 + }, + { + "epoch": 0.7791875, + "grad_norm": 3.328125, + "grad_norm_var": 0.37463785807291666, + "learning_rate": 0.0001, + "loss": 5.8719, + "loss/crossentropy": 2.6833068132400513, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16768933832645416, + "step": 24934 + }, + { + "epoch": 0.77925, + "grad_norm": 3.390625, + "grad_norm_var": 0.36253153483072914, + "learning_rate": 0.0001, + "loss": 5.7564, + "loss/crossentropy": 2.602460741996765, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16930370777845383, + "step": 24936 + }, + { + "epoch": 0.7793125, + "grad_norm": 3.171875, + "grad_norm_var": 0.36128641764322916, + "learning_rate": 0.0001, + "loss": 5.6362, + "loss/crossentropy": 2.526370644569397, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16449592262506485, + "step": 24938 + }, + { + "epoch": 0.779375, + "grad_norm": 2.75, + "grad_norm_var": 0.3646769205729167, + "learning_rate": 0.0001, + "loss": 5.496, + "loss/crossentropy": 2.463905930519104, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15868177264928818, + "step": 24940 + }, + { + "epoch": 0.7794375, + "grad_norm": 3.125, + "grad_norm_var": 0.3747955322265625, + "learning_rate": 0.0001, + "loss": 5.9716, + "loss/crossentropy": 2.7801040410995483, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16875829547643661, + "step": 24942 + }, + { + "epoch": 0.7795, + "grad_norm": 3.140625, + "grad_norm_var": 0.046370442708333334, + "learning_rate": 0.0001, + "loss": 5.9026, + "loss/crossentropy": 2.709265947341919, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.16776718199253082, + "step": 24944 + }, + { + "epoch": 0.7795625, + "grad_norm": 2.96875, + "grad_norm_var": 0.0492095947265625, + "learning_rate": 0.0001, + "loss": 5.5134, + "loss/crossentropy": 2.4592331647872925, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15736858546733856, + "step": 24946 + }, + { + "epoch": 0.779625, + "grad_norm": 3.1875, + "grad_norm_var": 0.03206278483072917, + "learning_rate": 0.0001, + "loss": 5.6377, + "loss/crossentropy": 2.5303972959518433, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1650300770998001, + "step": 24948 + }, + { + "epoch": 0.7796875, + "grad_norm": 3.296875, + "grad_norm_var": 0.021630859375, + "learning_rate": 0.0001, + "loss": 5.865, + "loss/crossentropy": 2.693328857421875, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1714666560292244, + "step": 24950 + }, + { + "epoch": 0.77975, + "grad_norm": 3.1875, + "grad_norm_var": 0.0161285400390625, + "learning_rate": 0.0001, + "loss": 5.9176, + "loss/crossentropy": 2.705698251724243, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17080193012952805, + "step": 24952 + }, + { + "epoch": 0.7798125, + "grad_norm": 3.234375, + "grad_norm_var": 0.019384765625, + "learning_rate": 0.0001, + "loss": 5.6606, + "loss/crossentropy": 2.5922571420669556, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16230715066194534, + "step": 24954 + }, + { + "epoch": 0.779875, + "grad_norm": 3.078125, + "grad_norm_var": 0.016087849934895832, + "learning_rate": 0.0001, + "loss": 5.4266, + "loss/crossentropy": 2.3955196142196655, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15467171370983124, + "step": 24956 + }, + { + "epoch": 0.7799375, + "grad_norm": 3.046875, + "grad_norm_var": 0.0160308837890625, + "learning_rate": 0.0001, + "loss": 5.2, + "loss/crossentropy": 2.201227366924286, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.15026813745498657, + "step": 24958 + }, + { + "epoch": 0.78, + "grad_norm": 3.1875, + "grad_norm_var": 0.02261962890625, + "learning_rate": 0.0001, + "loss": 5.5798, + "loss/crossentropy": 2.509858250617981, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1593395695090294, + "step": 24960 + }, + { + "epoch": 0.7800625, + "grad_norm": 3.0, + "grad_norm_var": 0.025777180989583332, + "learning_rate": 0.0001, + "loss": 5.3551, + "loss/crossentropy": 2.4357187747955322, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15053577721118927, + "step": 24962 + }, + { + "epoch": 0.780125, + "grad_norm": 3.203125, + "grad_norm_var": 0.0277984619140625, + "learning_rate": 0.0001, + "loss": 5.9254, + "loss/crossentropy": 2.688409209251404, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17214136570692062, + "step": 24964 + }, + { + "epoch": 0.7801875, + "grad_norm": 3.109375, + "grad_norm_var": 0.0298736572265625, + "learning_rate": 0.0001, + "loss": 5.7217, + "loss/crossentropy": 2.5876524448394775, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16847819089889526, + "step": 24966 + }, + { + "epoch": 0.78025, + "grad_norm": 3.03125, + "grad_norm_var": 0.031571451822916666, + "learning_rate": 0.0001, + "loss": 5.6469, + "loss/crossentropy": 2.6176631450653076, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15761353820562363, + "step": 24968 + }, + { + "epoch": 0.7803125, + "grad_norm": 3.109375, + "grad_norm_var": 0.0280670166015625, + "learning_rate": 0.0001, + "loss": 5.6812, + "loss/crossentropy": 2.5582680702209473, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16111940145492554, + "step": 24970 + }, + { + "epoch": 0.780375, + "grad_norm": 3.484375, + "grad_norm_var": 0.03092041015625, + "learning_rate": 0.0001, + "loss": 5.5374, + "loss/crossentropy": 2.415781617164612, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1633347123861313, + "step": 24972 + }, + { + "epoch": 0.7804375, + "grad_norm": 3.265625, + "grad_norm_var": 0.037337239583333334, + "learning_rate": 0.0001, + "loss": 5.8287, + "loss/crossentropy": 2.635912775993347, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17083972692489624, + "step": 24974 + }, + { + "epoch": 0.7805, + "grad_norm": 3.109375, + "grad_norm_var": 0.03524983723958333, + "learning_rate": 0.0001, + "loss": 5.4134, + "loss/crossentropy": 2.5047943592071533, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.14984039962291718, + "step": 24976 + }, + { + "epoch": 0.7805625, + "grad_norm": 3.796875, + "grad_norm_var": 0.06164957682291667, + "learning_rate": 0.0001, + "loss": 5.691, + "loss/crossentropy": 2.5033782720565796, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1671987697482109, + "step": 24978 + }, + { + "epoch": 0.780625, + "grad_norm": 3.25, + "grad_norm_var": 0.06101786295572917, + "learning_rate": 0.0001, + "loss": 5.477, + "loss/crossentropy": 2.4886432886123657, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1527436152100563, + "step": 24980 + }, + { + "epoch": 0.7806875, + "grad_norm": 3.0625, + "grad_norm_var": 0.05538736979166667, + "learning_rate": 0.0001, + "loss": 5.6513, + "loss/crossentropy": 2.586162567138672, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16393253952264786, + "step": 24982 + }, + { + "epoch": 0.78075, + "grad_norm": 3.1875, + "grad_norm_var": 0.05191650390625, + "learning_rate": 0.0001, + "loss": 5.6875, + "loss/crossentropy": 2.634481191635132, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15920409560203552, + "step": 24984 + }, + { + "epoch": 0.7808125, + "grad_norm": 3.03125, + "grad_norm_var": 0.05263671875, + "learning_rate": 0.0001, + "loss": 5.5731, + "loss/crossentropy": 2.5229707956314087, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16282780468463898, + "step": 24986 + }, + { + "epoch": 0.780875, + "grad_norm": 2.984375, + "grad_norm_var": 0.04582926432291667, + "learning_rate": 0.0001, + "loss": 5.64, + "loss/crossentropy": 2.545683741569519, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16138702630996704, + "step": 24988 + }, + { + "epoch": 0.7809375, + "grad_norm": 2.703125, + "grad_norm_var": 0.06370035807291667, + "learning_rate": 0.0001, + "loss": 5.0855, + "loss/crossentropy": 2.239551544189453, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.14162637293338776, + "step": 24990 + }, + { + "epoch": 0.781, + "grad_norm": 3.28125, + "grad_norm_var": 0.0631011962890625, + "learning_rate": 0.0001, + "loss": 5.53, + "loss/crossentropy": 2.4307796955108643, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16616912931203842, + "step": 24992 + }, + { + "epoch": 0.7810625, + "grad_norm": 3.375, + "grad_norm_var": 0.03509114583333333, + "learning_rate": 0.0001, + "loss": 5.839, + "loss/crossentropy": 2.6514469385147095, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1710943505167961, + "step": 24994 + }, + { + "epoch": 0.781125, + "grad_norm": 3.0, + "grad_norm_var": 0.03408101399739583, + "learning_rate": 0.0001, + "loss": 5.3903, + "loss/crossentropy": 2.3674404621124268, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.16165786236524582, + "step": 24996 + }, + { + "epoch": 0.7811875, + "grad_norm": 3.140625, + "grad_norm_var": 0.03472900390625, + "learning_rate": 0.0001, + "loss": 5.8161, + "loss/crossentropy": 2.677467703819275, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16816051304340363, + "step": 24998 + }, + { + "epoch": 0.78125, + "grad_norm": 3.25, + "grad_norm_var": 0.07089742024739583, + "learning_rate": 0.0001, + "loss": 5.7517, + "loss/crossentropy": 2.4565203189849854, + "loss/hidden": 1.5625, + "loss/jsd": 0.0, + "loss/logits": 0.17326726019382477, + "step": 25000 + }, + { + "epoch": 0.7813125, + "grad_norm": 3.296875, + "grad_norm_var": 0.07288004557291666, + "learning_rate": 0.0001, + "loss": 5.7661, + "loss/crossentropy": 2.6471210718154907, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16658222675323486, + "step": 25002 + }, + { + "epoch": 0.781375, + "grad_norm": 3.625, + "grad_norm_var": 0.08578999837239583, + "learning_rate": 0.0001, + "loss": 5.5766, + "loss/crossentropy": 2.4884352684020996, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16194364428520203, + "step": 25004 + }, + { + "epoch": 0.7814375, + "grad_norm": 2.859375, + "grad_norm_var": 0.07358296712239583, + "learning_rate": 0.0001, + "loss": 5.3731, + "loss/crossentropy": 2.4396510124206543, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.14959152787923813, + "step": 25006 + }, + { + "epoch": 0.7815, + "grad_norm": 3.375, + "grad_norm_var": 0.07263895670572916, + "learning_rate": 0.0001, + "loss": 5.7882, + "loss/crossentropy": 2.618377923965454, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16854405403137207, + "step": 25008 + }, + { + "epoch": 0.7815625, + "grad_norm": 3.046875, + "grad_norm_var": 0.0718902587890625, + "learning_rate": 0.0001, + "loss": 5.6774, + "loss/crossentropy": 2.5781971216201782, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16109603643417358, + "step": 25010 + }, + { + "epoch": 0.781625, + "grad_norm": 3.203125, + "grad_norm_var": 0.06816304524739583, + "learning_rate": 0.0001, + "loss": 5.9065, + "loss/crossentropy": 2.7340006828308105, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16685685515403748, + "step": 25012 + }, + { + "epoch": 0.7816875, + "grad_norm": 2.828125, + "grad_norm_var": 0.07422587076822916, + "learning_rate": 0.0001, + "loss": 5.8071, + "loss/crossentropy": 2.6172449588775635, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16937804222106934, + "step": 25014 + }, + { + "epoch": 0.78175, + "grad_norm": 2.90625, + "grad_norm_var": 0.04372456868489583, + "learning_rate": 0.0001, + "loss": 5.0782, + "loss/crossentropy": 2.183945894241333, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.14450307190418243, + "step": 25016 + }, + { + "epoch": 0.7818125, + "grad_norm": 3.359375, + "grad_norm_var": 0.04592692057291667, + "learning_rate": 0.0001, + "loss": 5.8498, + "loss/crossentropy": 2.7048791646957397, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16918237507343292, + "step": 25018 + }, + { + "epoch": 0.781875, + "grad_norm": 3.078125, + "grad_norm_var": 0.030094401041666666, + "learning_rate": 0.0001, + "loss": 5.2292, + "loss/crossentropy": 2.3088778257369995, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.15101177990436554, + "step": 25020 + }, + { + "epoch": 0.7819375, + "grad_norm": 3.296875, + "grad_norm_var": 0.026178995768229168, + "learning_rate": 0.0001, + "loss": 5.8802, + "loss/crossentropy": 2.7492371797561646, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1623164564371109, + "step": 25022 + }, + { + "epoch": 0.782, + "grad_norm": 2.9375, + "grad_norm_var": 0.025809733072916667, + "learning_rate": 0.0001, + "loss": 5.5296, + "loss/crossentropy": 2.482593536376953, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1578279659152031, + "step": 25024 + }, + { + "epoch": 0.7820625, + "grad_norm": 3.1875, + "grad_norm_var": 0.026949055989583335, + "learning_rate": 0.0001, + "loss": 5.3924, + "loss/crossentropy": 2.354622721672058, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15767978131771088, + "step": 25026 + }, + { + "epoch": 0.782125, + "grad_norm": 3.0625, + "grad_norm_var": 0.027632649739583334, + "learning_rate": 0.0001, + "loss": 5.397, + "loss/crossentropy": 2.4253294467926025, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15380343049764633, + "step": 25028 + }, + { + "epoch": 0.7821875, + "grad_norm": 3.4375, + "grad_norm_var": 0.03220926920572917, + "learning_rate": 0.0001, + "loss": 5.9257, + "loss/crossentropy": 2.655179262161255, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17783797532320023, + "step": 25030 + }, + { + "epoch": 0.78225, + "grad_norm": 3.0, + "grad_norm_var": 0.030134073893229165, + "learning_rate": 0.0001, + "loss": 5.4419, + "loss/crossentropy": 2.4656208753585815, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15543990582227707, + "step": 25032 + }, + { + "epoch": 0.7823125, + "grad_norm": 3.21875, + "grad_norm_var": 0.025373331705729165, + "learning_rate": 0.0001, + "loss": 5.1499, + "loss/crossentropy": 2.189010500907898, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.15507280081510544, + "step": 25034 + }, + { + "epoch": 0.782375, + "grad_norm": 2.953125, + "grad_norm_var": 0.0256500244140625, + "learning_rate": 0.0001, + "loss": 5.4865, + "loss/crossentropy": 2.499233603477478, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15575594455003738, + "step": 25036 + }, + { + "epoch": 0.7824375, + "grad_norm": 3.6875, + "grad_norm_var": 0.047826131184895836, + "learning_rate": 0.0001, + "loss": 5.6478, + "loss/crossentropy": 2.4892709255218506, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1654580757021904, + "step": 25038 + }, + { + "epoch": 0.7825, + "grad_norm": 3.359375, + "grad_norm_var": 0.04643452962239583, + "learning_rate": 0.0001, + "loss": 5.5592, + "loss/crossentropy": 2.4499223232269287, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16248713433742523, + "step": 25040 + }, + { + "epoch": 0.7825625, + "grad_norm": 3.171875, + "grad_norm_var": 0.051806640625, + "learning_rate": 0.0001, + "loss": 5.5163, + "loss/crossentropy": 2.44981050491333, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16133665293455124, + "step": 25042 + }, + { + "epoch": 0.782625, + "grad_norm": 3.109375, + "grad_norm_var": 0.0479156494140625, + "learning_rate": 0.0001, + "loss": 5.5172, + "loss/crossentropy": 2.4054744243621826, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1642991378903389, + "step": 25044 + }, + { + "epoch": 0.7826875, + "grad_norm": 3.109375, + "grad_norm_var": 0.04194234212239583, + "learning_rate": 0.0001, + "loss": 5.663, + "loss/crossentropy": 2.524221658706665, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.169351264834404, + "step": 25046 + }, + { + "epoch": 0.78275, + "grad_norm": 3.046875, + "grad_norm_var": 0.04138895670572917, + "learning_rate": 0.0001, + "loss": 5.5084, + "loss/crossentropy": 2.432820677757263, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.15755394101142883, + "step": 25048 + }, + { + "epoch": 0.7828125, + "grad_norm": 2.90625, + "grad_norm_var": 0.04365234375, + "learning_rate": 0.0001, + "loss": 5.8103, + "loss/crossentropy": 2.659608244895935, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16780192404985428, + "step": 25050 + }, + { + "epoch": 0.782875, + "grad_norm": 2.75, + "grad_norm_var": 0.049836222330729166, + "learning_rate": 0.0001, + "loss": 5.4689, + "loss/crossentropy": 2.4619545936584473, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.1596793532371521, + "step": 25052 + }, + { + "epoch": 0.7829375, + "grad_norm": 3.0625, + "grad_norm_var": 0.0306304931640625, + "learning_rate": 0.0001, + "loss": 5.394, + "loss/crossentropy": 2.407675623893738, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1560579463839531, + "step": 25054 + }, + { + "epoch": 0.783, + "grad_norm": 2.890625, + "grad_norm_var": 0.02506103515625, + "learning_rate": 0.0001, + "loss": 5.6879, + "loss/crossentropy": 2.6041946411132812, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16110891103744507, + "step": 25056 + }, + { + "epoch": 0.7830625, + "grad_norm": 3.140625, + "grad_norm_var": 0.02271728515625, + "learning_rate": 0.0001, + "loss": 5.401, + "loss/crossentropy": 2.3181673288345337, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1594579517841339, + "step": 25058 + }, + { + "epoch": 0.783125, + "grad_norm": 2.890625, + "grad_norm_var": 0.024755859375, + "learning_rate": 0.0001, + "loss": 5.7628, + "loss/crossentropy": 2.681661009788513, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16436689347028732, + "step": 25060 + }, + { + "epoch": 0.7831875, + "grad_norm": 3.0625, + "grad_norm_var": 0.0270416259765625, + "learning_rate": 0.0001, + "loss": 5.4755, + "loss/crossentropy": 2.4530845880508423, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1561483070254326, + "step": 25062 + }, + { + "epoch": 0.78325, + "grad_norm": 3.265625, + "grad_norm_var": 0.030549112955729166, + "learning_rate": 0.0001, + "loss": 5.6392, + "loss/crossentropy": 2.560398578643799, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16374336183071136, + "step": 25064 + }, + { + "epoch": 0.7833125, + "grad_norm": 3.09375, + "grad_norm_var": 0.0261138916015625, + "learning_rate": 0.0001, + "loss": 5.3838, + "loss/crossentropy": 2.349510431289673, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15928371995687485, + "step": 25066 + }, + { + "epoch": 0.783375, + "grad_norm": 3.171875, + "grad_norm_var": 0.0192535400390625, + "learning_rate": 0.0001, + "loss": 5.6535, + "loss/crossentropy": 2.5598082542419434, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.15976376086473465, + "step": 25068 + }, + { + "epoch": 0.7834375, + "grad_norm": 3.03125, + "grad_norm_var": 0.015397135416666667, + "learning_rate": 0.0001, + "loss": 5.8171, + "loss/crossentropy": 2.687534809112549, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16413022577762604, + "step": 25070 + }, + { + "epoch": 0.7835, + "grad_norm": 3.296875, + "grad_norm_var": 0.024344889322916667, + "learning_rate": 0.0001, + "loss": 5.0117, + "loss/crossentropy": 2.0827004313468933, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.14485756307840347, + "step": 25072 + }, + { + "epoch": 0.7835625, + "grad_norm": 3.109375, + "grad_norm_var": 0.029182942708333333, + "learning_rate": 0.0001, + "loss": 5.7212, + "loss/crossentropy": 2.6073025465011597, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16763654351234436, + "step": 25074 + }, + { + "epoch": 0.783625, + "grad_norm": 2.9375, + "grad_norm_var": 0.0277740478515625, + "learning_rate": 0.0001, + "loss": 5.8606, + "loss/crossentropy": 2.748585820198059, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16393591463565826, + "step": 25076 + }, + { + "epoch": 0.7836875, + "grad_norm": 3.40625, + "grad_norm_var": 0.031428019205729164, + "learning_rate": 0.0001, + "loss": 5.6233, + "loss/crossentropy": 2.511659026145935, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16429278999567032, + "step": 25078 + }, + { + "epoch": 0.78375, + "grad_norm": 3.203125, + "grad_norm_var": 0.0341217041015625, + "learning_rate": 0.0001, + "loss": 5.4974, + "loss/crossentropy": 2.5064194202423096, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15495362877845764, + "step": 25080 + }, + { + "epoch": 0.7838125, + "grad_norm": 2.875, + "grad_norm_var": 0.040283203125, + "learning_rate": 0.0001, + "loss": 5.511, + "loss/crossentropy": 2.525681257247925, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.15751774609088898, + "step": 25082 + }, + { + "epoch": 0.783875, + "grad_norm": 2.953125, + "grad_norm_var": 0.042455037434895836, + "learning_rate": 0.0001, + "loss": 5.5149, + "loss/crossentropy": 2.45516037940979, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1602674052119255, + "step": 25084 + }, + { + "epoch": 0.7839375, + "grad_norm": 3.40625, + "grad_norm_var": 0.04993489583333333, + "learning_rate": 0.0001, + "loss": 5.5154, + "loss/crossentropy": 2.417151927947998, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1613856852054596, + "step": 25086 + }, + { + "epoch": 0.784, + "grad_norm": 2.96875, + "grad_norm_var": 0.041413370768229166, + "learning_rate": 0.0001, + "loss": 5.7797, + "loss/crossentropy": 2.7250806093215942, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15858832746744156, + "step": 25088 + }, + { + "epoch": 0.7840625, + "grad_norm": 3.046875, + "grad_norm_var": 0.036421712239583334, + "learning_rate": 0.0001, + "loss": 5.4991, + "loss/crossentropy": 2.470256209373474, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1583540141582489, + "step": 25090 + }, + { + "epoch": 0.784125, + "grad_norm": 3.328125, + "grad_norm_var": 0.04127197265625, + "learning_rate": 0.0001, + "loss": 5.5174, + "loss/crossentropy": 2.43650221824646, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16316696256399155, + "step": 25092 + }, + { + "epoch": 0.7841875, + "grad_norm": 2.796875, + "grad_norm_var": 0.03912353515625, + "learning_rate": 0.0001, + "loss": 5.5422, + "loss/crossentropy": 2.5840858221054077, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15284348279237747, + "step": 25094 + }, + { + "epoch": 0.78425, + "grad_norm": 3.15625, + "grad_norm_var": 0.03687744140625, + "learning_rate": 0.0001, + "loss": 5.6881, + "loss/crossentropy": 2.564616560935974, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16703927516937256, + "step": 25096 + }, + { + "epoch": 0.7843125, + "grad_norm": 3.1875, + "grad_norm_var": 0.3617472330729167, + "learning_rate": 0.0001, + "loss": 5.8486, + "loss/crossentropy": 2.5736570358276367, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17358990013599396, + "step": 25098 + }, + { + "epoch": 0.784375, + "grad_norm": 2.859375, + "grad_norm_var": 0.36842041015625, + "learning_rate": 0.0001, + "loss": 5.7328, + "loss/crossentropy": 2.5624277591705322, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16859986633062363, + "step": 25100 + }, + { + "epoch": 0.7844375, + "grad_norm": 3.25, + "grad_norm_var": 0.36705322265625, + "learning_rate": 0.0001, + "loss": 5.5661, + "loss/crossentropy": 2.5077946186065674, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15778060257434845, + "step": 25102 + }, + { + "epoch": 0.7845, + "grad_norm": 3.265625, + "grad_norm_var": 0.3607706705729167, + "learning_rate": 0.0001, + "loss": 5.5609, + "loss/crossentropy": 2.513390064239502, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16099922358989716, + "step": 25104 + }, + { + "epoch": 0.7845625, + "grad_norm": 3.046875, + "grad_norm_var": 0.3536773681640625, + "learning_rate": 0.0001, + "loss": 5.5773, + "loss/crossentropy": 2.5185513496398926, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1617312878370285, + "step": 25106 + }, + { + "epoch": 0.784625, + "grad_norm": 3.265625, + "grad_norm_var": 0.3480631510416667, + "learning_rate": 0.0001, + "loss": 5.6833, + "loss/crossentropy": 2.5997354984283447, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16343086957931519, + "step": 25108 + }, + { + "epoch": 0.7846875, + "grad_norm": 3.390625, + "grad_norm_var": 0.32638346354166664, + "learning_rate": 0.0001, + "loss": 5.625, + "loss/crossentropy": 2.5467759370803833, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1617325320839882, + "step": 25110 + }, + { + "epoch": 0.78475, + "grad_norm": 3.25, + "grad_norm_var": 0.39422200520833334, + "learning_rate": 0.0001, + "loss": 6.0529, + "loss/crossentropy": 2.8289847373962402, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17200353741645813, + "step": 25112 + }, + { + "epoch": 0.7848125, + "grad_norm": 2.984375, + "grad_norm_var": 0.12730204264322917, + "learning_rate": 0.0001, + "loss": 5.471, + "loss/crossentropy": 2.35847806930542, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16515729576349258, + "step": 25114 + }, + { + "epoch": 0.784875, + "grad_norm": 3.3125, + "grad_norm_var": 0.11646219889322916, + "learning_rate": 0.0001, + "loss": 5.8522, + "loss/crossentropy": 2.6589726209640503, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17049360275268555, + "step": 25116 + }, + { + "epoch": 0.7849375, + "grad_norm": 3.40625, + "grad_norm_var": 0.1243804931640625, + "learning_rate": 0.0001, + "loss": 5.9077, + "loss/crossentropy": 2.679678797721863, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1724073886871338, + "step": 25118 + }, + { + "epoch": 0.785, + "grad_norm": 3.1875, + "grad_norm_var": 0.11791890462239583, + "learning_rate": 0.0001, + "loss": 5.9043, + "loss/crossentropy": 2.716698169708252, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17266739159822464, + "step": 25120 + }, + { + "epoch": 0.7850625, + "grad_norm": 3.046875, + "grad_norm_var": 0.12151692708333334, + "learning_rate": 0.0001, + "loss": 5.1997, + "loss/crossentropy": 2.222951889038086, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15314562618732452, + "step": 25122 + }, + { + "epoch": 0.785125, + "grad_norm": 2.921875, + "grad_norm_var": 0.14719950358072917, + "learning_rate": 0.0001, + "loss": 5.2749, + "loss/crossentropy": 2.384072422981262, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.1492406353354454, + "step": 25124 + }, + { + "epoch": 0.7851875, + "grad_norm": 3.25, + "grad_norm_var": 0.14053446451822918, + "learning_rate": 0.0001, + "loss": 5.7775, + "loss/crossentropy": 2.6351888179779053, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16813918203115463, + "step": 25126 + }, + { + "epoch": 0.78525, + "grad_norm": 3.015625, + "grad_norm_var": 0.057027180989583336, + "learning_rate": 0.0001, + "loss": 5.7459, + "loss/crossentropy": 2.6335253715515137, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16162623465061188, + "step": 25128 + }, + { + "epoch": 0.7853125, + "grad_norm": 3.625, + "grad_norm_var": 0.058714803059895834, + "learning_rate": 0.0001, + "loss": 5.5279, + "loss/crossentropy": 2.392796754837036, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16233666986227036, + "step": 25130 + }, + { + "epoch": 0.785375, + "grad_norm": 2.859375, + "grad_norm_var": 0.065234375, + "learning_rate": 0.0001, + "loss": 5.2826, + "loss/crossentropy": 2.327790141105652, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15250813215970993, + "step": 25132 + }, + { + "epoch": 0.7854375, + "grad_norm": 2.921875, + "grad_norm_var": 0.05224202473958333, + "learning_rate": 0.0001, + "loss": 5.3842, + "loss/crossentropy": 2.4696340560913086, + "loss/hidden": 1.38671875, + "loss/jsd": 0.0, + "loss/logits": 0.15278441458940506, + "step": 25134 + }, + { + "epoch": 0.7855, + "grad_norm": 3.0, + "grad_norm_var": 0.05273030598958333, + "learning_rate": 0.0001, + "loss": 5.482, + "loss/crossentropy": 2.44424831867218, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1584630385041237, + "step": 25136 + }, + { + "epoch": 0.7855625, + "grad_norm": 3.453125, + "grad_norm_var": 0.061945597330729164, + "learning_rate": 0.0001, + "loss": 5.9022, + "loss/crossentropy": 2.772692084312439, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1641230285167694, + "step": 25138 + }, + { + "epoch": 0.785625, + "grad_norm": 3.15625, + "grad_norm_var": 0.051122029622395836, + "learning_rate": 0.0001, + "loss": 5.7926, + "loss/crossentropy": 2.6483733654022217, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16793441772460938, + "step": 25140 + }, + { + "epoch": 0.7856875, + "grad_norm": 3.140625, + "grad_norm_var": 0.0496734619140625, + "learning_rate": 0.0001, + "loss": 5.8279, + "loss/crossentropy": 2.611177921295166, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17088709026575089, + "step": 25142 + }, + { + "epoch": 0.78575, + "grad_norm": 2.984375, + "grad_norm_var": 0.05120035807291667, + "learning_rate": 0.0001, + "loss": 5.8282, + "loss/crossentropy": 2.71530544757843, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16597488522529602, + "step": 25144 + }, + { + "epoch": 0.7858125, + "grad_norm": 2.984375, + "grad_norm_var": 0.0348541259765625, + "learning_rate": 0.0001, + "loss": 5.4036, + "loss/crossentropy": 2.425929307937622, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.157137930393219, + "step": 25146 + }, + { + "epoch": 0.785875, + "grad_norm": 2.96875, + "grad_norm_var": 0.035416666666666666, + "learning_rate": 0.0001, + "loss": 5.5936, + "loss/crossentropy": 2.464033603668213, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1684267818927765, + "step": 25148 + }, + { + "epoch": 0.7859375, + "grad_norm": 3.46875, + "grad_norm_var": 0.0369293212890625, + "learning_rate": 0.0001, + "loss": 5.8477, + "loss/crossentropy": 2.7339954376220703, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16645215451717377, + "step": 25150 + }, + { + "epoch": 0.786, + "grad_norm": 3.09375, + "grad_norm_var": 0.033935546875, + "learning_rate": 0.0001, + "loss": 5.7675, + "loss/crossentropy": 2.707942247390747, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16064704209566116, + "step": 25152 + }, + { + "epoch": 0.7860625, + "grad_norm": 3.171875, + "grad_norm_var": 0.0323883056640625, + "learning_rate": 0.0001, + "loss": 5.6805, + "loss/crossentropy": 2.600821614265442, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16187451779842377, + "step": 25154 + }, + { + "epoch": 0.786125, + "grad_norm": 3.0, + "grad_norm_var": 0.03359273274739583, + "learning_rate": 0.0001, + "loss": 5.7208, + "loss/crossentropy": 2.628736734390259, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16507025063037872, + "step": 25156 + }, + { + "epoch": 0.7861875, + "grad_norm": 2.8125, + "grad_norm_var": 0.036031087239583336, + "learning_rate": 0.0001, + "loss": 5.5739, + "loss/crossentropy": 2.5731139183044434, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15789590775966644, + "step": 25158 + }, + { + "epoch": 0.78625, + "grad_norm": 3.171875, + "grad_norm_var": 0.034891764322916664, + "learning_rate": 0.0001, + "loss": 5.7913, + "loss/crossentropy": 2.678464412689209, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16636203974485397, + "step": 25160 + }, + { + "epoch": 0.7863125, + "grad_norm": 3.03125, + "grad_norm_var": 0.033869425455729164, + "learning_rate": 0.0001, + "loss": 5.7663, + "loss/crossentropy": 2.6258822679519653, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16677294671535492, + "step": 25162 + }, + { + "epoch": 0.786375, + "grad_norm": 2.90625, + "grad_norm_var": 0.03193257649739583, + "learning_rate": 0.0001, + "loss": 5.6864, + "loss/crossentropy": 2.6077847480773926, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1648903787136078, + "step": 25164 + }, + { + "epoch": 0.7864375, + "grad_norm": 2.96875, + "grad_norm_var": 0.019172159830729167, + "learning_rate": 0.0001, + "loss": 5.6069, + "loss/crossentropy": 2.495919704437256, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16539262235164642, + "step": 25166 + }, + { + "epoch": 0.7865, + "grad_norm": 2.828125, + "grad_norm_var": 0.0217437744140625, + "learning_rate": 0.0001, + "loss": 5.5293, + "loss/crossentropy": 2.525461792945862, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1542908474802971, + "step": 25168 + }, + { + "epoch": 0.7865625, + "grad_norm": 3.265625, + "grad_norm_var": 0.022484334309895833, + "learning_rate": 0.0001, + "loss": 5.6908, + "loss/crossentropy": 2.5442755222320557, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16543786972761154, + "step": 25170 + }, + { + "epoch": 0.786625, + "grad_norm": 2.828125, + "grad_norm_var": 0.02564697265625, + "learning_rate": 0.0001, + "loss": 5.2421, + "loss/crossentropy": 2.3321306705474854, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1484227403998375, + "step": 25172 + }, + { + "epoch": 0.7866875, + "grad_norm": 3.125, + "grad_norm_var": 0.017756144205729168, + "learning_rate": 0.0001, + "loss": 5.4943, + "loss/crossentropy": 2.4257287979125977, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1599854677915573, + "step": 25174 + }, + { + "epoch": 0.78675, + "grad_norm": 3.21875, + "grad_norm_var": 0.018062337239583334, + "learning_rate": 0.0001, + "loss": 5.5799, + "loss/crossentropy": 2.5376850366592407, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15656720101833344, + "step": 25176 + }, + { + "epoch": 0.7868125, + "grad_norm": 3.0, + "grad_norm_var": 0.022705078125, + "learning_rate": 0.0001, + "loss": 5.348, + "loss/crossentropy": 2.39486563205719, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15313006937503815, + "step": 25178 + }, + { + "epoch": 0.786875, + "grad_norm": 3.140625, + "grad_norm_var": 0.0227691650390625, + "learning_rate": 0.0001, + "loss": 5.5245, + "loss/crossentropy": 2.5132004022598267, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1585562601685524, + "step": 25180 + }, + { + "epoch": 0.7869375, + "grad_norm": 2.84375, + "grad_norm_var": 0.025809733072916667, + "learning_rate": 0.0001, + "loss": 5.3467, + "loss/crossentropy": 2.390488862991333, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15304618328809738, + "step": 25182 + }, + { + "epoch": 0.787, + "grad_norm": 3.015625, + "grad_norm_var": 0.01959228515625, + "learning_rate": 0.0001, + "loss": 5.5321, + "loss/crossentropy": 2.4580490589141846, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16170482337474823, + "step": 25184 + }, + { + "epoch": 0.7870625, + "grad_norm": 3.3125, + "grad_norm_var": 0.020978800455729165, + "learning_rate": 0.0001, + "loss": 5.7807, + "loss/crossentropy": 2.6099783182144165, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16902922838926315, + "step": 25186 + }, + { + "epoch": 0.787125, + "grad_norm": 2.765625, + "grad_norm_var": 0.024800618489583332, + "learning_rate": 0.0001, + "loss": 5.602, + "loss/crossentropy": 2.617928385734558, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.15739066153764725, + "step": 25188 + }, + { + "epoch": 0.7871875, + "grad_norm": 3.140625, + "grad_norm_var": 0.026952107747395832, + "learning_rate": 0.0001, + "loss": 5.5574, + "loss/crossentropy": 2.4698808193206787, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16226434707641602, + "step": 25190 + }, + { + "epoch": 0.78725, + "grad_norm": 3.03125, + "grad_norm_var": 0.023737589518229168, + "learning_rate": 0.0001, + "loss": 5.5291, + "loss/crossentropy": 2.515956163406372, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1544433981180191, + "step": 25192 + }, + { + "epoch": 0.7873125, + "grad_norm": 3.375, + "grad_norm_var": 0.033003743489583334, + "learning_rate": 0.0001, + "loss": 5.7061, + "loss/crossentropy": 2.519581437110901, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17021550238132477, + "step": 25194 + }, + { + "epoch": 0.787375, + "grad_norm": 2.921875, + "grad_norm_var": 0.03264058430989583, + "learning_rate": 0.0001, + "loss": 5.4925, + "loss/crossentropy": 2.4986947774887085, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15524086356163025, + "step": 25196 + }, + { + "epoch": 0.7874375, + "grad_norm": 3.40625, + "grad_norm_var": 0.03798726399739583, + "learning_rate": 0.0001, + "loss": 5.6295, + "loss/crossentropy": 2.582840085029602, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15935365855693817, + "step": 25198 + }, + { + "epoch": 0.7875, + "grad_norm": 3.015625, + "grad_norm_var": 0.04254150390625, + "learning_rate": 0.0001, + "loss": 5.7632, + "loss/crossentropy": 2.6446453332901, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16303178668022156, + "step": 25200 + }, + { + "epoch": 0.7875625, + "grad_norm": 2.96875, + "grad_norm_var": 0.03829752604166667, + "learning_rate": 0.0001, + "loss": 5.2524, + "loss/crossentropy": 2.2934986352920532, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.1552659571170807, + "step": 25202 + }, + { + "epoch": 0.787625, + "grad_norm": 3.140625, + "grad_norm_var": 0.0299957275390625, + "learning_rate": 0.0001, + "loss": 5.6602, + "loss/crossentropy": 2.617307186126709, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15819956362247467, + "step": 25204 + }, + { + "epoch": 0.7876875, + "grad_norm": 3.109375, + "grad_norm_var": 0.03992411295572917, + "learning_rate": 0.0001, + "loss": 5.4376, + "loss/crossentropy": 2.359330892562866, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16134297847747803, + "step": 25206 + }, + { + "epoch": 0.78775, + "grad_norm": 3.578125, + "grad_norm_var": 0.055810546875, + "learning_rate": 0.0001, + "loss": 6.1531, + "loss/crossentropy": 2.840658664703369, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.18202606588602066, + "step": 25208 + }, + { + "epoch": 0.7878125, + "grad_norm": 3.03125, + "grad_norm_var": 0.053831990559895834, + "learning_rate": 0.0001, + "loss": 5.6484, + "loss/crossentropy": 2.5800265073776245, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1611306369304657, + "step": 25210 + }, + { + "epoch": 0.787875, + "grad_norm": 2.953125, + "grad_norm_var": 0.0541656494140625, + "learning_rate": 0.0001, + "loss": 5.6995, + "loss/crossentropy": 2.568533182144165, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1662180870771408, + "step": 25212 + }, + { + "epoch": 0.7879375, + "grad_norm": 3.15625, + "grad_norm_var": 0.0494049072265625, + "learning_rate": 0.0001, + "loss": 5.5971, + "loss/crossentropy": 2.4888023138046265, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16160811483860016, + "step": 25214 + }, + { + "epoch": 0.788, + "grad_norm": 3.234375, + "grad_norm_var": 0.042220052083333334, + "learning_rate": 0.0001, + "loss": 5.8117, + "loss/crossentropy": 2.6967689990997314, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16540372371673584, + "step": 25216 + }, + { + "epoch": 0.7880625, + "grad_norm": 2.84375, + "grad_norm_var": 0.04708658854166667, + "learning_rate": 0.0001, + "loss": 5.5943, + "loss/crossentropy": 2.5863274335861206, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.16016855835914612, + "step": 25218 + }, + { + "epoch": 0.788125, + "grad_norm": 3.53125, + "grad_norm_var": 0.061324055989583334, + "learning_rate": 0.0001, + "loss": 5.9415, + "loss/crossentropy": 2.6803689002990723, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1761108636856079, + "step": 25220 + }, + { + "epoch": 0.7881875, + "grad_norm": 2.953125, + "grad_norm_var": 0.05654296875, + "learning_rate": 0.0001, + "loss": 5.4505, + "loss/crossentropy": 2.423671245574951, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15736592561006546, + "step": 25222 + }, + { + "epoch": 0.78825, + "grad_norm": 3.078125, + "grad_norm_var": 0.037125651041666666, + "learning_rate": 0.0001, + "loss": 5.609, + "loss/crossentropy": 2.61668598651886, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1566566824913025, + "step": 25224 + }, + { + "epoch": 0.7883125, + "grad_norm": 3.203125, + "grad_norm_var": 0.04055989583333333, + "learning_rate": 0.0001, + "loss": 5.7241, + "loss/crossentropy": 2.6395206451416016, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16314589977264404, + "step": 25226 + }, + { + "epoch": 0.788375, + "grad_norm": 3.234375, + "grad_norm_var": 0.041064453125, + "learning_rate": 0.0001, + "loss": 5.7187, + "loss/crossentropy": 2.600376844406128, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16534972190856934, + "step": 25228 + }, + { + "epoch": 0.7884375, + "grad_norm": 3.234375, + "grad_norm_var": 0.04329325358072917, + "learning_rate": 0.0001, + "loss": 5.3233, + "loss/crossentropy": 2.3683305978775024, + "loss/hidden": 1.38671875, + "loss/jsd": 0.0, + "loss/logits": 0.15682659298181534, + "step": 25230 + }, + { + "epoch": 0.7885, + "grad_norm": 3.59375, + "grad_norm_var": 0.06189676920572917, + "learning_rate": 0.0001, + "loss": 5.7239, + "loss/crossentropy": 2.564695119857788, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1670876443386078, + "step": 25232 + }, + { + "epoch": 0.7885625, + "grad_norm": 3.046875, + "grad_norm_var": 0.05627848307291667, + "learning_rate": 0.0001, + "loss": 5.6439, + "loss/crossentropy": 2.560565233230591, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.15989135205745697, + "step": 25234 + }, + { + "epoch": 0.788625, + "grad_norm": 3.015625, + "grad_norm_var": 0.04110921223958333, + "learning_rate": 0.0001, + "loss": 5.4061, + "loss/crossentropy": 2.4623721837997437, + "loss/hidden": 1.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.1541389375925064, + "step": 25236 + }, + { + "epoch": 0.7886875, + "grad_norm": 3.390625, + "grad_norm_var": 0.04722900390625, + "learning_rate": 0.0001, + "loss": 5.5537, + "loss/crossentropy": 2.4472579956054688, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16494275629520416, + "step": 25238 + }, + { + "epoch": 0.78875, + "grad_norm": 3.296875, + "grad_norm_var": 0.046019490559895834, + "learning_rate": 0.0001, + "loss": 5.8679, + "loss/crossentropy": 2.6663858890533447, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17366329580545425, + "step": 25240 + }, + { + "epoch": 0.7888125, + "grad_norm": 2.953125, + "grad_norm_var": 0.04617513020833333, + "learning_rate": 0.0001, + "loss": 5.7008, + "loss/crossentropy": 2.6034947633743286, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1663685292005539, + "step": 25242 + }, + { + "epoch": 0.788875, + "grad_norm": 3.15625, + "grad_norm_var": 0.044798787434895834, + "learning_rate": 0.0001, + "loss": 5.7471, + "loss/crossentropy": 2.64049232006073, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1633920818567276, + "step": 25244 + }, + { + "epoch": 0.7889375, + "grad_norm": 3.125, + "grad_norm_var": 0.04024149576822917, + "learning_rate": 0.0001, + "loss": 5.7013, + "loss/crossentropy": 2.5922815799713135, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16441580653190613, + "step": 25246 + }, + { + "epoch": 0.789, + "grad_norm": 2.890625, + "grad_norm_var": 0.02486572265625, + "learning_rate": 0.0001, + "loss": 5.3338, + "loss/crossentropy": 2.324529767036438, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15444006025791168, + "step": 25248 + }, + { + "epoch": 0.7890625, + "grad_norm": 3.515625, + "grad_norm_var": 0.03499247233072917, + "learning_rate": 0.0001, + "loss": 5.8315, + "loss/crossentropy": 2.6474636793136597, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16996200382709503, + "step": 25250 + }, + { + "epoch": 0.789125, + "grad_norm": 3.484375, + "grad_norm_var": 0.0591217041015625, + "learning_rate": 0.0001, + "loss": 6.2729, + "loss/crossentropy": 2.8888838291168213, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.1833200380206108, + "step": 25252 + }, + { + "epoch": 0.7891875, + "grad_norm": 3.203125, + "grad_norm_var": 0.0658111572265625, + "learning_rate": 0.0001, + "loss": 5.5424, + "loss/crossentropy": 2.489980459213257, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1626640036702156, + "step": 25254 + }, + { + "epoch": 0.78925, + "grad_norm": 2.796875, + "grad_norm_var": 0.07440999348958334, + "learning_rate": 0.0001, + "loss": 5.6484, + "loss/crossentropy": 2.60439932346344, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16103915870189667, + "step": 25256 + }, + { + "epoch": 0.7893125, + "grad_norm": 3.15625, + "grad_norm_var": 0.0716705322265625, + "learning_rate": 0.0001, + "loss": 5.2819, + "loss/crossentropy": 2.246806025505066, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1558496356010437, + "step": 25258 + }, + { + "epoch": 0.789375, + "grad_norm": 3.203125, + "grad_norm_var": 0.07154541015625, + "learning_rate": 0.0001, + "loss": 5.7365, + "loss/crossentropy": 2.556297779083252, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17231497913599014, + "step": 25260 + }, + { + "epoch": 0.7894375, + "grad_norm": 3.421875, + "grad_norm_var": 0.07289937337239584, + "learning_rate": 0.0001, + "loss": 6.093, + "loss/crossentropy": 2.913823127746582, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16674679517745972, + "step": 25262 + }, + { + "epoch": 0.7895, + "grad_norm": 3.265625, + "grad_norm_var": 0.0753082275390625, + "learning_rate": 0.0001, + "loss": 5.7693, + "loss/crossentropy": 2.6205087900161743, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16722586750984192, + "step": 25264 + }, + { + "epoch": 0.7895625, + "grad_norm": 3.0, + "grad_norm_var": 0.0881256103515625, + "learning_rate": 0.0001, + "loss": 5.5436, + "loss/crossentropy": 2.519482970237732, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15905684977769852, + "step": 25266 + }, + { + "epoch": 0.789625, + "grad_norm": 3.0625, + "grad_norm_var": 0.0528228759765625, + "learning_rate": 0.0001, + "loss": 5.5977, + "loss/crossentropy": 2.558209180831909, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15902253985404968, + "step": 25268 + }, + { + "epoch": 0.7896875, + "grad_norm": 3.046875, + "grad_norm_var": 0.0469635009765625, + "learning_rate": 0.0001, + "loss": 5.8884, + "loss/crossentropy": 2.7374707460403442, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16665419191122055, + "step": 25270 + }, + { + "epoch": 0.78975, + "grad_norm": 3.078125, + "grad_norm_var": 0.045807902018229166, + "learning_rate": 0.0001, + "loss": 5.2376, + "loss/crossentropy": 2.2393096685409546, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15373366326093674, + "step": 25272 + }, + { + "epoch": 0.7898125, + "grad_norm": 3.015625, + "grad_norm_var": 0.04550374348958333, + "learning_rate": 0.0001, + "loss": 5.5198, + "loss/crossentropy": 2.484234571456909, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16019342839717865, + "step": 25274 + }, + { + "epoch": 0.789875, + "grad_norm": 2.859375, + "grad_norm_var": 0.048737589518229166, + "learning_rate": 0.0001, + "loss": 5.4838, + "loss/crossentropy": 2.489290952682495, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.1588217169046402, + "step": 25276 + }, + { + "epoch": 0.7899375, + "grad_norm": 2.796875, + "grad_norm_var": 0.04658203125, + "learning_rate": 0.0001, + "loss": 5.6814, + "loss/crossentropy": 2.6445512771606445, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1560288816690445, + "step": 25278 + }, + { + "epoch": 0.79, + "grad_norm": 3.125, + "grad_norm_var": 0.04098307291666667, + "learning_rate": 0.0001, + "loss": 5.7944, + "loss/crossentropy": 2.666891098022461, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16469985991716385, + "step": 25280 + }, + { + "epoch": 0.7900625, + "grad_norm": 3.046875, + "grad_norm_var": 0.017975870768229166, + "learning_rate": 0.0001, + "loss": 5.9708, + "loss/crossentropy": 2.8003090620040894, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16939659416675568, + "step": 25282 + }, + { + "epoch": 0.790125, + "grad_norm": 3.109375, + "grad_norm_var": 0.02193603515625, + "learning_rate": 0.0001, + "loss": 5.5681, + "loss/crossentropy": 2.465001106262207, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16226035356521606, + "step": 25284 + }, + { + "epoch": 0.7901875, + "grad_norm": 3.15625, + "grad_norm_var": 0.023363240559895835, + "learning_rate": 0.0001, + "loss": 5.7736, + "loss/crossentropy": 2.6344821453094482, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16782230883836746, + "step": 25286 + }, + { + "epoch": 0.79025, + "grad_norm": 2.96875, + "grad_norm_var": 0.027350870768229167, + "learning_rate": 0.0001, + "loss": 5.5039, + "loss/crossentropy": 2.5562633275985718, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15413692593574524, + "step": 25288 + }, + { + "epoch": 0.7903125, + "grad_norm": 2.765625, + "grad_norm_var": 0.03326416015625, + "learning_rate": 0.0001, + "loss": 5.391, + "loss/crossentropy": 2.3242186307907104, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16136524826288223, + "step": 25290 + }, + { + "epoch": 0.790375, + "grad_norm": 2.9375, + "grad_norm_var": 0.034032185872395836, + "learning_rate": 0.0001, + "loss": 5.6922, + "loss/crossentropy": 2.5918469429016113, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1639409139752388, + "step": 25292 + }, + { + "epoch": 0.7904375, + "grad_norm": 3.15625, + "grad_norm_var": 0.03150634765625, + "learning_rate": 0.0001, + "loss": 5.5406, + "loss/crossentropy": 2.4601529836654663, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1650724932551384, + "step": 25294 + }, + { + "epoch": 0.7905, + "grad_norm": 3.265625, + "grad_norm_var": 0.03427327473958333, + "learning_rate": 0.0001, + "loss": 5.56, + "loss/crossentropy": 2.5213820934295654, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15854918956756592, + "step": 25296 + }, + { + "epoch": 0.7905625, + "grad_norm": 3.15625, + "grad_norm_var": 0.029637654622395832, + "learning_rate": 0.0001, + "loss": 5.4759, + "loss/crossentropy": 2.4062318801879883, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1616571843624115, + "step": 25298 + }, + { + "epoch": 0.790625, + "grad_norm": 3.21875, + "grad_norm_var": 0.031029256184895833, + "learning_rate": 0.0001, + "loss": 5.9488, + "loss/crossentropy": 2.6806533336639404, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.177203968167305, + "step": 25300 + }, + { + "epoch": 0.7906875, + "grad_norm": 3.0625, + "grad_norm_var": 0.029198201497395833, + "learning_rate": 0.0001, + "loss": 5.8204, + "loss/crossentropy": 2.7386746406555176, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16285746544599533, + "step": 25302 + }, + { + "epoch": 0.79075, + "grad_norm": 3.421875, + "grad_norm_var": 0.029292805989583334, + "learning_rate": 0.0001, + "loss": 5.652, + "loss/crossentropy": 2.5269299745559692, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16250652819871902, + "step": 25304 + }, + { + "epoch": 0.7908125, + "grad_norm": 3.390625, + "grad_norm_var": 0.027962239583333333, + "learning_rate": 0.0001, + "loss": 5.6296, + "loss/crossentropy": 2.3718210458755493, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17265165597200394, + "step": 25306 + }, + { + "epoch": 0.790875, + "grad_norm": 3.203125, + "grad_norm_var": 0.018822224934895833, + "learning_rate": 0.0001, + "loss": 5.7265, + "loss/crossentropy": 2.5841060876846313, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16697849333286285, + "step": 25308 + }, + { + "epoch": 0.7909375, + "grad_norm": 2.828125, + "grad_norm_var": 0.0208984375, + "learning_rate": 0.0001, + "loss": 5.4832, + "loss/crossentropy": 2.473755359649658, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15954020619392395, + "step": 25310 + }, + { + "epoch": 0.791, + "grad_norm": 3.203125, + "grad_norm_var": 0.019188435872395833, + "learning_rate": 0.0001, + "loss": 5.6534, + "loss/crossentropy": 2.588550329208374, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16273347288370132, + "step": 25312 + }, + { + "epoch": 0.7910625, + "grad_norm": 3.109375, + "grad_norm_var": 0.026558430989583333, + "learning_rate": 0.0001, + "loss": 5.3421, + "loss/crossentropy": 2.397429347038269, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.14758940786123276, + "step": 25314 + }, + { + "epoch": 0.791125, + "grad_norm": 3.21875, + "grad_norm_var": 0.026009114583333333, + "learning_rate": 0.0001, + "loss": 5.5765, + "loss/crossentropy": 2.5067743062973022, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15970566868782043, + "step": 25316 + }, + { + "epoch": 0.7911875, + "grad_norm": 3.34375, + "grad_norm_var": 0.031217447916666665, + "learning_rate": 0.0001, + "loss": 5.7345, + "loss/crossentropy": 2.6668519973754883, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1622350960969925, + "step": 25318 + }, + { + "epoch": 0.79125, + "grad_norm": 3.234375, + "grad_norm_var": 0.030060831705729166, + "learning_rate": 0.0001, + "loss": 5.929, + "loss/crossentropy": 2.67513906955719, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17694664001464844, + "step": 25320 + }, + { + "epoch": 0.7913125, + "grad_norm": 2.84375, + "grad_norm_var": 0.040816243489583334, + "learning_rate": 0.0001, + "loss": 5.3198, + "loss/crossentropy": 2.40627658367157, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1487717479467392, + "step": 25322 + }, + { + "epoch": 0.791375, + "grad_norm": 3.28125, + "grad_norm_var": 0.055540974934895834, + "learning_rate": 0.0001, + "loss": 5.2989, + "loss/crossentropy": 2.361093282699585, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.14651653915643692, + "step": 25324 + }, + { + "epoch": 0.7914375, + "grad_norm": 4.75, + "grad_norm_var": 0.22847900390625, + "learning_rate": 0.0001, + "loss": 5.5822, + "loss/crossentropy": 2.4573179483413696, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16717679053544998, + "step": 25326 + }, + { + "epoch": 0.7915, + "grad_norm": 3.0, + "grad_norm_var": 0.23121337890625, + "learning_rate": 0.0001, + "loss": 5.9451, + "loss/crossentropy": 2.749058485031128, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17311637103557587, + "step": 25328 + }, + { + "epoch": 0.7915625, + "grad_norm": 2.984375, + "grad_norm_var": 0.23987223307291666, + "learning_rate": 0.0001, + "loss": 5.4095, + "loss/crossentropy": 2.459301233291626, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.1532263681292534, + "step": 25330 + }, + { + "epoch": 0.791625, + "grad_norm": 3.0625, + "grad_norm_var": 0.24000244140625, + "learning_rate": 0.0001, + "loss": 5.8423, + "loss/crossentropy": 2.5985885858535767, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17554303258657455, + "step": 25332 + }, + { + "epoch": 0.7916875, + "grad_norm": 3.15625, + "grad_norm_var": 0.24426167805989582, + "learning_rate": 0.0001, + "loss": 5.7257, + "loss/crossentropy": 2.593381404876709, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16479890793561935, + "step": 25334 + }, + { + "epoch": 0.79175, + "grad_norm": 3.234375, + "grad_norm_var": 0.24197591145833333, + "learning_rate": 0.0001, + "loss": 5.9831, + "loss/crossentropy": 2.7751917839050293, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17118044197559357, + "step": 25336 + }, + { + "epoch": 0.7918125, + "grad_norm": 3.03125, + "grad_norm_var": 0.22265218098958334, + "learning_rate": 0.0001, + "loss": 5.5792, + "loss/crossentropy": 2.529574751853943, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15964554250240326, + "step": 25338 + }, + { + "epoch": 0.791875, + "grad_norm": 2.953125, + "grad_norm_var": 0.20861714680989582, + "learning_rate": 0.0001, + "loss": 5.6736, + "loss/crossentropy": 2.5997570753097534, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16090020537376404, + "step": 25340 + }, + { + "epoch": 0.7919375, + "grad_norm": 3.15625, + "grad_norm_var": 0.043610636393229166, + "learning_rate": 0.0001, + "loss": 5.8498, + "loss/crossentropy": 2.7276843786239624, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16495086252689362, + "step": 25342 + }, + { + "epoch": 0.792, + "grad_norm": 3.203125, + "grad_norm_var": 0.0501129150390625, + "learning_rate": 0.0001, + "loss": 5.618, + "loss/crossentropy": 2.5446499586105347, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16201840341091156, + "step": 25344 + }, + { + "epoch": 0.7920625, + "grad_norm": 3.0, + "grad_norm_var": 0.037230428059895834, + "learning_rate": 0.0001, + "loss": 5.5496, + "loss/crossentropy": 2.499722480773926, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1588894948363304, + "step": 25346 + }, + { + "epoch": 0.792125, + "grad_norm": 3.1875, + "grad_norm_var": 0.04060770670572917, + "learning_rate": 0.0001, + "loss": 5.8169, + "loss/crossentropy": 2.7432724237442017, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16204586625099182, + "step": 25348 + }, + { + "epoch": 0.7921875, + "grad_norm": 2.90625, + "grad_norm_var": 0.032746378580729166, + "learning_rate": 0.0001, + "loss": 5.6696, + "loss/crossentropy": 2.6122941970825195, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16276054084300995, + "step": 25350 + }, + { + "epoch": 0.79225, + "grad_norm": 3.0625, + "grad_norm_var": 0.06959228515625, + "learning_rate": 0.0001, + "loss": 5.8806, + "loss/crossentropy": 2.6536483764648438, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17386756837368011, + "step": 25352 + }, + { + "epoch": 0.7923125, + "grad_norm": 3.296875, + "grad_norm_var": 0.07049153645833334, + "learning_rate": 0.0001, + "loss": 5.7874, + "loss/crossentropy": 2.648374080657959, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16585220396518707, + "step": 25354 + }, + { + "epoch": 0.792375, + "grad_norm": 3.21875, + "grad_norm_var": 0.06725260416666666, + "learning_rate": 0.0001, + "loss": 5.6055, + "loss/crossentropy": 2.5399500131607056, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16436853259801865, + "step": 25356 + }, + { + "epoch": 0.7924375, + "grad_norm": 3.1875, + "grad_norm_var": 0.07284749348958333, + "learning_rate": 0.0001, + "loss": 5.3793, + "loss/crossentropy": 2.5220072269439697, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.13924633711576462, + "step": 25358 + }, + { + "epoch": 0.7925, + "grad_norm": 3.109375, + "grad_norm_var": 0.06595052083333333, + "learning_rate": 0.0001, + "loss": 5.7704, + "loss/crossentropy": 2.6193450689315796, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.17057766020298004, + "step": 25360 + }, + { + "epoch": 0.7925625, + "grad_norm": 3.703125, + "grad_norm_var": 0.09200846354166667, + "learning_rate": 0.0001, + "loss": 5.6755, + "loss/crossentropy": 2.6169503927230835, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16054153442382812, + "step": 25362 + }, + { + "epoch": 0.792625, + "grad_norm": 3.78125, + "grad_norm_var": 0.11628316243489584, + "learning_rate": 0.0001, + "loss": 5.7137, + "loss/crossentropy": 2.4692364931106567, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.17210125178098679, + "step": 25364 + }, + { + "epoch": 0.7926875, + "grad_norm": 3.03125, + "grad_norm_var": 0.11047770182291666, + "learning_rate": 0.0001, + "loss": 5.4257, + "loss/crossentropy": 2.536818504333496, + "loss/hidden": 1.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.15060505270957947, + "step": 25366 + }, + { + "epoch": 0.79275, + "grad_norm": 3.0625, + "grad_norm_var": 0.08189188639322917, + "learning_rate": 0.0001, + "loss": 5.4759, + "loss/crossentropy": 2.4374356269836426, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16165433824062347, + "step": 25368 + }, + { + "epoch": 0.7928125, + "grad_norm": 3.15625, + "grad_norm_var": 0.08318684895833334, + "learning_rate": 0.0001, + "loss": 5.5178, + "loss/crossentropy": 2.4698901176452637, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1579151228070259, + "step": 25370 + }, + { + "epoch": 0.792875, + "grad_norm": 3.015625, + "grad_norm_var": 0.07545572916666667, + "learning_rate": 0.0001, + "loss": 5.4255, + "loss/crossentropy": 2.3560034036636353, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16085121780633926, + "step": 25372 + }, + { + "epoch": 0.7929375, + "grad_norm": 3.25, + "grad_norm_var": 0.0664459228515625, + "learning_rate": 0.0001, + "loss": 5.8468, + "loss/crossentropy": 2.631547451019287, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17113661766052246, + "step": 25374 + }, + { + "epoch": 0.793, + "grad_norm": 3.015625, + "grad_norm_var": 0.07095947265625, + "learning_rate": 0.0001, + "loss": 5.4377, + "loss/crossentropy": 2.4203755855560303, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15758804976940155, + "step": 25376 + }, + { + "epoch": 0.7930625, + "grad_norm": 3.125, + "grad_norm_var": 0.04597066243489583, + "learning_rate": 0.0001, + "loss": 5.7576, + "loss/crossentropy": 2.6211706399917603, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16402891278266907, + "step": 25378 + }, + { + "epoch": 0.793125, + "grad_norm": 2.875, + "grad_norm_var": 0.01806640625, + "learning_rate": 0.0001, + "loss": 5.7183, + "loss/crossentropy": 2.667958617210388, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16089429706335068, + "step": 25380 + }, + { + "epoch": 0.7931875, + "grad_norm": 3.0625, + "grad_norm_var": 0.016227213541666667, + "learning_rate": 0.0001, + "loss": 5.9311, + "loss/crossentropy": 2.689044237136841, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17732784152030945, + "step": 25382 + }, + { + "epoch": 0.79325, + "grad_norm": 3.140625, + "grad_norm_var": 0.016373697916666666, + "learning_rate": 0.0001, + "loss": 5.6614, + "loss/crossentropy": 2.6168936491012573, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.160306878387928, + "step": 25384 + }, + { + "epoch": 0.7933125, + "grad_norm": 3.21875, + "grad_norm_var": 0.01279296875, + "learning_rate": 0.0001, + "loss": 5.6141, + "loss/crossentropy": 2.4809409379959106, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16292868554592133, + "step": 25386 + }, + { + "epoch": 0.793375, + "grad_norm": 3.0625, + "grad_norm_var": 0.01578369140625, + "learning_rate": 0.0001, + "loss": 5.7743, + "loss/crossentropy": 2.720241069793701, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16126443445682526, + "step": 25388 + }, + { + "epoch": 0.7934375, + "grad_norm": 3.25, + "grad_norm_var": 0.01510009765625, + "learning_rate": 0.0001, + "loss": 5.4794, + "loss/crossentropy": 2.481152892112732, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.15881385654211044, + "step": 25390 + }, + { + "epoch": 0.7935, + "grad_norm": 3.0, + "grad_norm_var": 0.0148101806640625, + "learning_rate": 0.0001, + "loss": 5.4523, + "loss/crossentropy": 2.4096468687057495, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16051514446735382, + "step": 25392 + }, + { + "epoch": 0.7935625, + "grad_norm": 2.953125, + "grad_norm_var": 0.021361287434895834, + "learning_rate": 0.0001, + "loss": 5.4221, + "loss/crossentropy": 2.420169234275818, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1572248712182045, + "step": 25394 + }, + { + "epoch": 0.793625, + "grad_norm": 3.203125, + "grad_norm_var": 0.018114217122395835, + "learning_rate": 0.0001, + "loss": 5.9426, + "loss/crossentropy": 2.776470184326172, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17051556706428528, + "step": 25396 + }, + { + "epoch": 0.7936875, + "grad_norm": 3.15625, + "grad_norm_var": 0.0198394775390625, + "learning_rate": 0.0001, + "loss": 5.7185, + "loss/crossentropy": 2.651002049446106, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16143406927585602, + "step": 25398 + }, + { + "epoch": 0.79375, + "grad_norm": 3.625, + "grad_norm_var": 0.03655192057291667, + "learning_rate": 0.0001, + "loss": 5.673, + "loss/crossentropy": 2.528809070587158, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16793614625930786, + "step": 25400 + }, + { + "epoch": 0.7938125, + "grad_norm": 3.015625, + "grad_norm_var": 0.03650716145833333, + "learning_rate": 0.0001, + "loss": 5.8523, + "loss/crossentropy": 2.7354499101638794, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16871225088834763, + "step": 25402 + }, + { + "epoch": 0.793875, + "grad_norm": 3.046875, + "grad_norm_var": 0.0333892822265625, + "learning_rate": 0.0001, + "loss": 5.4758, + "loss/crossentropy": 2.4791252613067627, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15513592958450317, + "step": 25404 + }, + { + "epoch": 0.7939375, + "grad_norm": 3.1875, + "grad_norm_var": 0.033186848958333334, + "learning_rate": 0.0001, + "loss": 5.9236, + "loss/crossentropy": 2.7395559549331665, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16958092153072357, + "step": 25406 + }, + { + "epoch": 0.794, + "grad_norm": 3.3125, + "grad_norm_var": 0.037430826822916666, + "learning_rate": 0.0001, + "loss": 5.6847, + "loss/crossentropy": 2.5286134481430054, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.17107945680618286, + "step": 25408 + }, + { + "epoch": 0.7940625, + "grad_norm": 3.34375, + "grad_norm_var": 0.028580729166666666, + "learning_rate": 0.0001, + "loss": 5.5004, + "loss/crossentropy": 2.4793365001678467, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1567937731742859, + "step": 25410 + }, + { + "epoch": 0.794125, + "grad_norm": 3.0, + "grad_norm_var": 0.03483784993489583, + "learning_rate": 0.0001, + "loss": 5.4253, + "loss/crossentropy": 2.441056251525879, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15428167581558228, + "step": 25412 + }, + { + "epoch": 0.7941875, + "grad_norm": 3.15625, + "grad_norm_var": 0.034520467122395836, + "learning_rate": 0.0001, + "loss": 5.3587, + "loss/crossentropy": 2.388457775115967, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1513218656182289, + "step": 25414 + }, + { + "epoch": 0.79425, + "grad_norm": 3.046875, + "grad_norm_var": 0.0177734375, + "learning_rate": 0.0001, + "loss": 5.8201, + "loss/crossentropy": 2.6421027183532715, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16975290328264236, + "step": 25416 + }, + { + "epoch": 0.7943125, + "grad_norm": 3.21875, + "grad_norm_var": 0.02080078125, + "learning_rate": 0.0001, + "loss": 5.5715, + "loss/crossentropy": 2.463736057281494, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16077560186386108, + "step": 25418 + }, + { + "epoch": 0.794375, + "grad_norm": 3.515625, + "grad_norm_var": 0.027925618489583335, + "learning_rate": 0.0001, + "loss": 5.6985, + "loss/crossentropy": 2.555183529853821, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16628053784370422, + "step": 25420 + }, + { + "epoch": 0.7944375, + "grad_norm": 3.0625, + "grad_norm_var": 0.030809529622395835, + "learning_rate": 0.0001, + "loss": 5.5911, + "loss/crossentropy": 2.55877947807312, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15596868097782135, + "step": 25422 + }, + { + "epoch": 0.7945, + "grad_norm": 3.015625, + "grad_norm_var": 0.03922119140625, + "learning_rate": 0.0001, + "loss": 5.558, + "loss/crossentropy": 2.5130953788757324, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16035164892673492, + "step": 25424 + }, + { + "epoch": 0.7945625, + "grad_norm": 2.984375, + "grad_norm_var": 0.04362691243489583, + "learning_rate": 0.0001, + "loss": 6.1642, + "loss/crossentropy": 2.8820998668670654, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17898721247911453, + "step": 25426 + }, + { + "epoch": 0.794625, + "grad_norm": 3.09375, + "grad_norm_var": 0.0427734375, + "learning_rate": 0.0001, + "loss": 5.5461, + "loss/crossentropy": 2.5282377004623413, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15451814234256744, + "step": 25428 + }, + { + "epoch": 0.7946875, + "grad_norm": 3.203125, + "grad_norm_var": 0.045563761393229166, + "learning_rate": 0.0001, + "loss": 5.8231, + "loss/crossentropy": 2.67414653301239, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1668485924601555, + "step": 25430 + }, + { + "epoch": 0.79475, + "grad_norm": 3.203125, + "grad_norm_var": 0.0478424072265625, + "learning_rate": 0.0001, + "loss": 5.8406, + "loss/crossentropy": 2.715545177459717, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1656324788928032, + "step": 25432 + }, + { + "epoch": 0.7948125, + "grad_norm": 3.03125, + "grad_norm_var": 0.046647135416666666, + "learning_rate": 0.0001, + "loss": 5.8858, + "loss/crossentropy": 2.738201379776001, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16905217617750168, + "step": 25434 + }, + { + "epoch": 0.794875, + "grad_norm": 3.125, + "grad_norm_var": 0.03877665201822917, + "learning_rate": 0.0001, + "loss": 5.8226, + "loss/crossentropy": 2.615545630455017, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17265405505895615, + "step": 25436 + }, + { + "epoch": 0.7949375, + "grad_norm": 3.234375, + "grad_norm_var": 0.042512003580729166, + "learning_rate": 0.0001, + "loss": 5.3998, + "loss/crossentropy": 2.320631504058838, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16182444989681244, + "step": 25438 + }, + { + "epoch": 0.795, + "grad_norm": 3.28125, + "grad_norm_var": 0.029618326822916666, + "learning_rate": 0.0001, + "loss": 5.5821, + "loss/crossentropy": 2.484802722930908, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16403156518936157, + "step": 25440 + }, + { + "epoch": 0.7950625, + "grad_norm": 2.703125, + "grad_norm_var": 0.033014933268229164, + "learning_rate": 0.0001, + "loss": 5.7428, + "loss/crossentropy": 2.6623945236206055, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1607770398259163, + "step": 25442 + }, + { + "epoch": 0.795125, + "grad_norm": 2.796875, + "grad_norm_var": 0.04680989583333333, + "learning_rate": 0.0001, + "loss": 5.5889, + "loss/crossentropy": 2.497969150543213, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1579175516963005, + "step": 25444 + }, + { + "epoch": 0.7951875, + "grad_norm": 3.140625, + "grad_norm_var": 0.048094685872395834, + "learning_rate": 0.0001, + "loss": 5.5234, + "loss/crossentropy": 2.492112636566162, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1609383374452591, + "step": 25446 + }, + { + "epoch": 0.79525, + "grad_norm": 2.96875, + "grad_norm_var": 0.04838765462239583, + "learning_rate": 0.0001, + "loss": 5.746, + "loss/crossentropy": 2.713121175765991, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16109861433506012, + "step": 25448 + }, + { + "epoch": 0.7953125, + "grad_norm": 3.46875, + "grad_norm_var": 0.05676676432291667, + "learning_rate": 0.0001, + "loss": 6.0213, + "loss/crossentropy": 2.716894030570984, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17887406796216965, + "step": 25450 + }, + { + "epoch": 0.795375, + "grad_norm": 2.84375, + "grad_norm_var": 0.05751953125, + "learning_rate": 0.0001, + "loss": 5.5568, + "loss/crossentropy": 2.47613787651062, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16001805663108826, + "step": 25452 + }, + { + "epoch": 0.7954375, + "grad_norm": 3.15625, + "grad_norm_var": 0.0588531494140625, + "learning_rate": 0.0001, + "loss": 5.4774, + "loss/crossentropy": 2.3346649408340454, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1654464304447174, + "step": 25454 + }, + { + "epoch": 0.7955, + "grad_norm": 3.421875, + "grad_norm_var": 0.0620758056640625, + "learning_rate": 0.0001, + "loss": 5.8435, + "loss/crossentropy": 2.6950998306274414, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1675708219408989, + "step": 25456 + }, + { + "epoch": 0.7955625, + "grad_norm": 2.671875, + "grad_norm_var": 0.06360677083333334, + "learning_rate": 0.0001, + "loss": 5.4595, + "loss/crossentropy": 2.5493483543395996, + "loss/hidden": 1.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.15155760198831558, + "step": 25458 + }, + { + "epoch": 0.795625, + "grad_norm": 3.140625, + "grad_norm_var": 0.04726460774739583, + "learning_rate": 0.0001, + "loss": 5.712, + "loss/crossentropy": 2.5620031356811523, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.165386363863945, + "step": 25460 + }, + { + "epoch": 0.7956875, + "grad_norm": 2.859375, + "grad_norm_var": 0.047728474934895834, + "learning_rate": 0.0001, + "loss": 5.5077, + "loss/crossentropy": 2.5065815448760986, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1547996699810028, + "step": 25462 + }, + { + "epoch": 0.79575, + "grad_norm": 3.046875, + "grad_norm_var": 0.044482421875, + "learning_rate": 0.0001, + "loss": 5.7657, + "loss/crossentropy": 2.664130926132202, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1664118468761444, + "step": 25464 + }, + { + "epoch": 0.7958125, + "grad_norm": 2.890625, + "grad_norm_var": 0.03772379557291667, + "learning_rate": 0.0001, + "loss": 5.6806, + "loss/crossentropy": 2.6315428018569946, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1580308899283409, + "step": 25466 + }, + { + "epoch": 0.795875, + "grad_norm": 3.015625, + "grad_norm_var": 0.042845662434895834, + "learning_rate": 0.0001, + "loss": 6.0401, + "loss/crossentropy": 2.8397724628448486, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17237255722284317, + "step": 25468 + }, + { + "epoch": 0.7959375, + "grad_norm": 3.140625, + "grad_norm_var": 0.0360992431640625, + "learning_rate": 0.0001, + "loss": 5.9452, + "loss/crossentropy": 2.73575496673584, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1725090742111206, + "step": 25470 + }, + { + "epoch": 0.796, + "grad_norm": 3.609375, + "grad_norm_var": 0.04736328125, + "learning_rate": 0.0001, + "loss": 5.6821, + "loss/crossentropy": 2.530869245529175, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16902897506952286, + "step": 25472 + }, + { + "epoch": 0.7960625, + "grad_norm": 2.875, + "grad_norm_var": 0.03997395833333333, + "learning_rate": 0.0001, + "loss": 5.2324, + "loss/crossentropy": 2.2642139196395874, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15307102352380753, + "step": 25474 + }, + { + "epoch": 0.796125, + "grad_norm": 3.09375, + "grad_norm_var": 0.04102274576822917, + "learning_rate": 0.0001, + "loss": 5.7425, + "loss/crossentropy": 2.693701982498169, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16230234503746033, + "step": 25476 + }, + { + "epoch": 0.7961875, + "grad_norm": 3.109375, + "grad_norm_var": 0.04023030598958333, + "learning_rate": 0.0001, + "loss": 5.8272, + "loss/crossentropy": 2.7727891206741333, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.16481461375951767, + "step": 25478 + }, + { + "epoch": 0.79625, + "grad_norm": 3.078125, + "grad_norm_var": 0.0412994384765625, + "learning_rate": 0.0001, + "loss": 5.7098, + "loss/crossentropy": 2.5666359663009644, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1643167957663536, + "step": 25480 + }, + { + "epoch": 0.7963125, + "grad_norm": 3.0625, + "grad_norm_var": 0.03814697265625, + "learning_rate": 0.0001, + "loss": 5.6605, + "loss/crossentropy": 2.5249993801116943, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1639430746436119, + "step": 25482 + }, + { + "epoch": 0.796375, + "grad_norm": 2.984375, + "grad_norm_var": 0.03033447265625, + "learning_rate": 0.0001, + "loss": 5.702, + "loss/crossentropy": 2.6121197938919067, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1660180762410164, + "step": 25484 + }, + { + "epoch": 0.7964375, + "grad_norm": 3.359375, + "grad_norm_var": 0.03516337076822917, + "learning_rate": 0.0001, + "loss": 5.6047, + "loss/crossentropy": 2.5446722507476807, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16069257259368896, + "step": 25486 + }, + { + "epoch": 0.7965, + "grad_norm": 3.203125, + "grad_norm_var": 0.019465128580729168, + "learning_rate": 0.0001, + "loss": 5.7265, + "loss/crossentropy": 2.615384578704834, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16657806932926178, + "step": 25488 + }, + { + "epoch": 0.7965625, + "grad_norm": 3.078125, + "grad_norm_var": 0.017936197916666667, + "learning_rate": 0.0001, + "loss": 5.4358, + "loss/crossentropy": 2.3771530389785767, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16172577440738678, + "step": 25490 + }, + { + "epoch": 0.796625, + "grad_norm": 3.171875, + "grad_norm_var": 0.016405232747395835, + "learning_rate": 0.0001, + "loss": 5.66, + "loss/crossentropy": 2.5950770378112793, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16352280974388123, + "step": 25492 + }, + { + "epoch": 0.7966875, + "grad_norm": 3.0625, + "grad_norm_var": 0.013459269205729167, + "learning_rate": 0.0001, + "loss": 5.3398, + "loss/crossentropy": 2.3305797576904297, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1552230790257454, + "step": 25494 + }, + { + "epoch": 0.79675, + "grad_norm": 3.140625, + "grad_norm_var": 0.015592447916666667, + "learning_rate": 0.0001, + "loss": 5.5147, + "loss/crossentropy": 2.4388809204101562, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15953578054904938, + "step": 25496 + }, + { + "epoch": 0.7968125, + "grad_norm": 3.125, + "grad_norm_var": 0.0158355712890625, + "learning_rate": 0.0001, + "loss": 5.6602, + "loss/crossentropy": 2.576777219772339, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16459254920482635, + "step": 25498 + }, + { + "epoch": 0.796875, + "grad_norm": 3.0, + "grad_norm_var": 0.019017537434895832, + "learning_rate": 0.0001, + "loss": 5.369, + "loss/crossentropy": 2.4149060249328613, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15165583789348602, + "step": 25500 + }, + { + "epoch": 0.7969375, + "grad_norm": 2.9375, + "grad_norm_var": 0.0140289306640625, + "learning_rate": 0.0001, + "loss": 5.3528, + "loss/crossentropy": 2.387664318084717, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15158936381340027, + "step": 25502 + }, + { + "epoch": 0.797, + "grad_norm": 2.953125, + "grad_norm_var": 0.0192291259765625, + "learning_rate": 0.0001, + "loss": 5.3215, + "loss/crossentropy": 2.366239547729492, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.15372508019208908, + "step": 25504 + }, + { + "epoch": 0.7970625, + "grad_norm": 3.25, + "grad_norm_var": 0.024820963541666668, + "learning_rate": 0.0001, + "loss": 5.6459, + "loss/crossentropy": 2.630552649497986, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1589585244655609, + "step": 25506 + }, + { + "epoch": 0.797125, + "grad_norm": 3.359375, + "grad_norm_var": 0.16367085774739584, + "learning_rate": 0.0001, + "loss": 6.0366, + "loss/crossentropy": 2.8165512084960938, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17278824746608734, + "step": 25508 + }, + { + "epoch": 0.7971875, + "grad_norm": 3.171875, + "grad_norm_var": 0.1638824462890625, + "learning_rate": 0.0001, + "loss": 5.8457, + "loss/crossentropy": 2.689768671989441, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1710585355758667, + "step": 25510 + }, + { + "epoch": 0.79725, + "grad_norm": 2.828125, + "grad_norm_var": 0.16585286458333334, + "learning_rate": 0.0001, + "loss": 5.5155, + "loss/crossentropy": 2.4981210231781006, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.16033216565847397, + "step": 25512 + }, + { + "epoch": 0.7973125, + "grad_norm": 3.046875, + "grad_norm_var": 0.16577860514322917, + "learning_rate": 0.0001, + "loss": 5.4075, + "loss/crossentropy": 2.446346402168274, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15314766019582748, + "step": 25514 + }, + { + "epoch": 0.797375, + "grad_norm": 3.46875, + "grad_norm_var": 0.16868082682291666, + "learning_rate": 0.0001, + "loss": 5.8491, + "loss/crossentropy": 2.7132939100265503, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16748397052288055, + "step": 25516 + }, + { + "epoch": 0.7974375, + "grad_norm": 2.765625, + "grad_norm_var": 0.1755859375, + "learning_rate": 0.0001, + "loss": 5.3634, + "loss/crossentropy": 2.3772170543670654, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15174540877342224, + "step": 25518 + }, + { + "epoch": 0.7975, + "grad_norm": 3.125, + "grad_norm_var": 0.15921122233072918, + "learning_rate": 0.0001, + "loss": 5.4941, + "loss/crossentropy": 2.4577242136001587, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15598254650831223, + "step": 25520 + }, + { + "epoch": 0.7975625, + "grad_norm": 2.8125, + "grad_norm_var": 0.15374247233072916, + "learning_rate": 0.0001, + "loss": 5.605, + "loss/crossentropy": 2.6322391033172607, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15430255234241486, + "step": 25522 + }, + { + "epoch": 0.797625, + "grad_norm": 2.984375, + "grad_norm_var": 0.031183878580729168, + "learning_rate": 0.0001, + "loss": 5.4833, + "loss/crossentropy": 2.444926142692566, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15852493047714233, + "step": 25524 + }, + { + "epoch": 0.7976875, + "grad_norm": 2.953125, + "grad_norm_var": 0.030094401041666666, + "learning_rate": 0.0001, + "loss": 5.3054, + "loss/crossentropy": 2.328010082244873, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1528138443827629, + "step": 25526 + }, + { + "epoch": 0.79775, + "grad_norm": 3.4375, + "grad_norm_var": 0.03902587890625, + "learning_rate": 0.0001, + "loss": 5.9898, + "loss/crossentropy": 2.72395658493042, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17814859747886658, + "step": 25528 + }, + { + "epoch": 0.7978125, + "grad_norm": 2.828125, + "grad_norm_var": 0.04814453125, + "learning_rate": 0.0001, + "loss": 5.7455, + "loss/crossentropy": 2.7303361892700195, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15698431432247162, + "step": 25530 + }, + { + "epoch": 0.797875, + "grad_norm": 3.0625, + "grad_norm_var": 0.0395172119140625, + "learning_rate": 0.0001, + "loss": 5.6479, + "loss/crossentropy": 2.506553530693054, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16530776768922806, + "step": 25532 + }, + { + "epoch": 0.7979375, + "grad_norm": 2.796875, + "grad_norm_var": 0.039891560872395836, + "learning_rate": 0.0001, + "loss": 5.3655, + "loss/crossentropy": 2.433992028236389, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.14940283447504044, + "step": 25534 + }, + { + "epoch": 0.798, + "grad_norm": 2.90625, + "grad_norm_var": 0.0386138916015625, + "learning_rate": 0.0001, + "loss": 5.5089, + "loss/crossentropy": 2.4843519926071167, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15870817005634308, + "step": 25536 + }, + { + "epoch": 0.7980625, + "grad_norm": 3.109375, + "grad_norm_var": 0.0354400634765625, + "learning_rate": 0.0001, + "loss": 5.8525, + "loss/crossentropy": 2.7046704292297363, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16673485934734344, + "step": 25538 + }, + { + "epoch": 0.798125, + "grad_norm": 3.265625, + "grad_norm_var": 0.0436431884765625, + "learning_rate": 0.0001, + "loss": 5.1388, + "loss/crossentropy": 2.208470582962036, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.14537180960178375, + "step": 25540 + }, + { + "epoch": 0.7981875, + "grad_norm": 3.15625, + "grad_norm_var": 0.043309529622395836, + "learning_rate": 0.0001, + "loss": 5.5689, + "loss/crossentropy": 2.476935029029846, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1646619364619255, + "step": 25542 + }, + { + "epoch": 0.79825, + "grad_norm": 3.390625, + "grad_norm_var": 0.08238525390625, + "learning_rate": 0.0001, + "loss": 5.8632, + "loss/crossentropy": 2.6023300886154175, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17765138298273087, + "step": 25544 + }, + { + "epoch": 0.7983125, + "grad_norm": 3.28125, + "grad_norm_var": 0.06796875, + "learning_rate": 0.0001, + "loss": 5.5825, + "loss/crossentropy": 2.4996442794799805, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1606249436736107, + "step": 25546 + }, + { + "epoch": 0.798375, + "grad_norm": 3.515625, + "grad_norm_var": 0.07424723307291667, + "learning_rate": 0.0001, + "loss": 5.8853, + "loss/crossentropy": 2.6848857402801514, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1712096408009529, + "step": 25548 + }, + { + "epoch": 0.7984375, + "grad_norm": 2.984375, + "grad_norm_var": 0.06333719889322917, + "learning_rate": 0.0001, + "loss": 5.9677, + "loss/crossentropy": 2.8552619218826294, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1643703281879425, + "step": 25550 + }, + { + "epoch": 0.7985, + "grad_norm": 3.09375, + "grad_norm_var": 0.059228515625, + "learning_rate": 0.0001, + "loss": 5.8195, + "loss/crossentropy": 2.719208598136902, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16393586993217468, + "step": 25552 + }, + { + "epoch": 0.7985625, + "grad_norm": 3.125, + "grad_norm_var": 0.06682535807291666, + "learning_rate": 0.0001, + "loss": 5.5242, + "loss/crossentropy": 2.478301763534546, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16162413358688354, + "step": 25554 + }, + { + "epoch": 0.798625, + "grad_norm": 2.890625, + "grad_norm_var": 0.07568359375, + "learning_rate": 0.0001, + "loss": 5.6991, + "loss/crossentropy": 2.6302788257598877, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16235194355249405, + "step": 25556 + }, + { + "epoch": 0.7986875, + "grad_norm": 3.125, + "grad_norm_var": 0.07669169108072917, + "learning_rate": 0.0001, + "loss": 5.9282, + "loss/crossentropy": 2.741113543510437, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16909489780664444, + "step": 25558 + }, + { + "epoch": 0.79875, + "grad_norm": 2.984375, + "grad_norm_var": 0.0422760009765625, + "learning_rate": 0.0001, + "loss": 6.0116, + "loss/crossentropy": 2.803410530090332, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17354948073625565, + "step": 25560 + }, + { + "epoch": 0.7988125, + "grad_norm": 2.890625, + "grad_norm_var": 0.0452545166015625, + "learning_rate": 0.0001, + "loss": 5.1638, + "loss/crossentropy": 2.1904959678649902, + "loss/hidden": 1.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.1570926457643509, + "step": 25562 + }, + { + "epoch": 0.798875, + "grad_norm": 2.75, + "grad_norm_var": 0.04071858723958333, + "learning_rate": 0.0001, + "loss": 5.6028, + "loss/crossentropy": 2.54690682888031, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15988218784332275, + "step": 25564 + }, + { + "epoch": 0.7989375, + "grad_norm": 3.0625, + "grad_norm_var": 0.03173421223958333, + "learning_rate": 0.0001, + "loss": 5.8456, + "loss/crossentropy": 2.7087689638137817, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.164850115776062, + "step": 25566 + }, + { + "epoch": 0.799, + "grad_norm": 3.078125, + "grad_norm_var": 0.035563151041666664, + "learning_rate": 0.0001, + "loss": 5.6834, + "loss/crossentropy": 2.6643515825271606, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1565898060798645, + "step": 25568 + }, + { + "epoch": 0.7990625, + "grad_norm": 2.71875, + "grad_norm_var": 0.04633687337239583, + "learning_rate": 0.0001, + "loss": 5.2925, + "loss/crossentropy": 2.2745431661605835, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15609276294708252, + "step": 25570 + }, + { + "epoch": 0.799125, + "grad_norm": 2.953125, + "grad_norm_var": 0.04537353515625, + "learning_rate": 0.0001, + "loss": 5.4212, + "loss/crossentropy": 2.426013469696045, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15459664165973663, + "step": 25572 + }, + { + "epoch": 0.7991875, + "grad_norm": 2.90625, + "grad_norm_var": 0.045221964518229164, + "learning_rate": 0.0001, + "loss": 5.5825, + "loss/crossentropy": 2.662925362586975, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.15210948884487152, + "step": 25574 + }, + { + "epoch": 0.79925, + "grad_norm": 3.046875, + "grad_norm_var": 0.04396870930989583, + "learning_rate": 0.0001, + "loss": 6.1286, + "loss/crossentropy": 2.873689293861389, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17666620761156082, + "step": 25576 + }, + { + "epoch": 0.7993125, + "grad_norm": 3.09375, + "grad_norm_var": 0.072265625, + "learning_rate": 0.0001, + "loss": 5.6605, + "loss/crossentropy": 2.56427001953125, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16587091982364655, + "step": 25578 + }, + { + "epoch": 0.799375, + "grad_norm": 3.46875, + "grad_norm_var": 0.0746246337890625, + "learning_rate": 0.0001, + "loss": 5.6584, + "loss/crossentropy": 2.5480706691741943, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16181902587413788, + "step": 25580 + }, + { + "epoch": 0.7994375, + "grad_norm": 2.921875, + "grad_norm_var": 0.07457275390625, + "learning_rate": 0.0001, + "loss": 5.2879, + "loss/crossentropy": 2.3415573835372925, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.15362046658992767, + "step": 25582 + }, + { + "epoch": 0.7995, + "grad_norm": 3.1875, + "grad_norm_var": 0.0777984619140625, + "learning_rate": 0.0001, + "loss": 5.7093, + "loss/crossentropy": 2.651996612548828, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16275802999734879, + "step": 25584 + }, + { + "epoch": 0.7995625, + "grad_norm": 3.125, + "grad_norm_var": 0.06565348307291667, + "learning_rate": 0.0001, + "loss": 5.2063, + "loss/crossentropy": 2.2687193155288696, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.14844368398189545, + "step": 25586 + }, + { + "epoch": 0.799625, + "grad_norm": 3.3125, + "grad_norm_var": 0.0683258056640625, + "learning_rate": 0.0001, + "loss": 5.6074, + "loss/crossentropy": 2.521701693534851, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16051886975765228, + "step": 25588 + }, + { + "epoch": 0.7996875, + "grad_norm": 3.34375, + "grad_norm_var": 0.06256510416666666, + "learning_rate": 0.0001, + "loss": 5.8776, + "loss/crossentropy": 2.7814104557037354, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16156956553459167, + "step": 25590 + }, + { + "epoch": 0.79975, + "grad_norm": 2.96875, + "grad_norm_var": 0.06262613932291666, + "learning_rate": 0.0001, + "loss": 5.4677, + "loss/crossentropy": 2.461822032928467, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1576220542192459, + "step": 25592 + }, + { + "epoch": 0.7998125, + "grad_norm": 2.96875, + "grad_norm_var": 0.0400787353515625, + "learning_rate": 0.0001, + "loss": 5.7341, + "loss/crossentropy": 2.5990320444107056, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16663643717765808, + "step": 25594 + }, + { + "epoch": 0.799875, + "grad_norm": 31.625, + "grad_norm_var": 51.0037109375, + "learning_rate": 0.0001, + "loss": 6.0037, + "loss/crossentropy": 2.4253209829330444, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.20627228915691376, + "step": 25596 + }, + { + "epoch": 0.7999375, + "grad_norm": 3.046875, + "grad_norm_var": 50.943033854166664, + "learning_rate": 0.0001, + "loss": 5.5132, + "loss/crossentropy": 2.444235324859619, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16432026028633118, + "step": 25598 + }, + { + "epoch": 0.8, + "grad_norm": 3.28125, + "grad_norm_var": 50.81070556640625, + "learning_rate": 0.0001, + "loss": 5.6477, + "loss/crossentropy": 2.529646158218384, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1696132943034172, + "step": 25600 + }, + { + "epoch": 0.8000625, + "grad_norm": 3.28125, + "grad_norm_var": 50.59339090983073, + "learning_rate": 0.0001, + "loss": 5.6978, + "loss/crossentropy": 2.522778630256653, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17062946408987045, + "step": 25602 + }, + { + "epoch": 0.800125, + "grad_norm": 3.234375, + "grad_norm_var": 50.6152089436849, + "learning_rate": 0.0001, + "loss": 5.4611, + "loss/crossentropy": 2.4644765853881836, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15474426746368408, + "step": 25604 + }, + { + "epoch": 0.8001875, + "grad_norm": 2.921875, + "grad_norm_var": 50.69345296223958, + "learning_rate": 0.0001, + "loss": 5.6861, + "loss/crossentropy": 2.6215637922286987, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1619187593460083, + "step": 25606 + }, + { + "epoch": 0.80025, + "grad_norm": 3.578125, + "grad_norm_var": 50.59892171223958, + "learning_rate": 0.0001, + "loss": 5.742, + "loss/crossentropy": 2.6077487468719482, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16342793405056, + "step": 25608 + }, + { + "epoch": 0.8003125, + "grad_norm": 3.546875, + "grad_norm_var": 50.42652079264323, + "learning_rate": 0.0001, + "loss": 5.9068, + "loss/crossentropy": 2.6417452096939087, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17885131388902664, + "step": 25610 + }, + { + "epoch": 0.800375, + "grad_norm": 3.40625, + "grad_norm_var": 0.058991495768229166, + "learning_rate": 0.0001, + "loss": 5.9333, + "loss/crossentropy": 2.686414957046509, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17625177651643753, + "step": 25612 + }, + { + "epoch": 0.8004375, + "grad_norm": 3.390625, + "grad_norm_var": 0.05634663899739583, + "learning_rate": 0.0001, + "loss": 5.8919, + "loss/crossentropy": 2.6309632062911987, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1760942041873932, + "step": 25614 + }, + { + "epoch": 0.8005, + "grad_norm": 2.796875, + "grad_norm_var": 0.07636311848958334, + "learning_rate": 0.0001, + "loss": 5.5093, + "loss/crossentropy": 2.454187512397766, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16098280996084213, + "step": 25616 + }, + { + "epoch": 0.8005625, + "grad_norm": 3.0, + "grad_norm_var": 0.06408589680989583, + "learning_rate": 0.0001, + "loss": 5.6101, + "loss/crossentropy": 2.5555964708328247, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16169817745685577, + "step": 25618 + }, + { + "epoch": 0.800625, + "grad_norm": 2.96875, + "grad_norm_var": 0.06236063639322917, + "learning_rate": 0.0001, + "loss": 5.856, + "loss/crossentropy": 2.673764228820801, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1701762080192566, + "step": 25620 + }, + { + "epoch": 0.8006875, + "grad_norm": 3.0, + "grad_norm_var": 0.05994364420572917, + "learning_rate": 0.0001, + "loss": 5.6185, + "loss/crossentropy": 2.5303525924682617, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1642872840166092, + "step": 25622 + }, + { + "epoch": 0.80075, + "grad_norm": 3.25, + "grad_norm_var": 0.05245768229166667, + "learning_rate": 0.0001, + "loss": 5.9096, + "loss/crossentropy": 2.699185848236084, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17416246980428696, + "step": 25624 + }, + { + "epoch": 0.8008125, + "grad_norm": 3.296875, + "grad_norm_var": 0.04036356608072917, + "learning_rate": 0.0001, + "loss": 5.7008, + "loss/crossentropy": 2.6083273887634277, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1639333814382553, + "step": 25626 + }, + { + "epoch": 0.800875, + "grad_norm": 3.640625, + "grad_norm_var": 0.054520670572916666, + "learning_rate": 0.0001, + "loss": 6.2766, + "loss/crossentropy": 2.897311806678772, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.18636631220579147, + "step": 25628 + }, + { + "epoch": 0.8009375, + "grad_norm": 3.53125, + "grad_norm_var": 0.06113993326822917, + "learning_rate": 0.0001, + "loss": 5.8779, + "loss/crossentropy": 2.686615228652954, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17342127859592438, + "step": 25630 + }, + { + "epoch": 0.801, + "grad_norm": 3.0, + "grad_norm_var": 0.052750651041666666, + "learning_rate": 0.0001, + "loss": 5.7031, + "loss/crossentropy": 2.6328877210617065, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16210304200649261, + "step": 25632 + }, + { + "epoch": 0.8010625, + "grad_norm": 2.921875, + "grad_norm_var": 0.0535308837890625, + "learning_rate": 0.0001, + "loss": 5.3814, + "loss/crossentropy": 2.446844458580017, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.1516549289226532, + "step": 25634 + }, + { + "epoch": 0.801125, + "grad_norm": 3.015625, + "grad_norm_var": 0.0518463134765625, + "learning_rate": 0.0001, + "loss": 5.6919, + "loss/crossentropy": 2.631867289543152, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15913043916225433, + "step": 25636 + }, + { + "epoch": 0.8011875, + "grad_norm": 3.109375, + "grad_norm_var": 0.06028544108072917, + "learning_rate": 0.0001, + "loss": 5.039, + "loss/crossentropy": 2.216692805290222, + "loss/hidden": 1.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.14551055431365967, + "step": 25638 + }, + { + "epoch": 0.80125, + "grad_norm": 3.015625, + "grad_norm_var": 0.0585357666015625, + "learning_rate": 0.0001, + "loss": 5.7764, + "loss/crossentropy": 2.65237557888031, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16592242568731308, + "step": 25640 + }, + { + "epoch": 0.8013125, + "grad_norm": 3.203125, + "grad_norm_var": 0.06978759765625, + "learning_rate": 0.0001, + "loss": 4.9999, + "loss/crossentropy": 2.1383305191993713, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.13654698431491852, + "step": 25642 + }, + { + "epoch": 0.801375, + "grad_norm": 3.375, + "grad_norm_var": 0.052897135416666664, + "learning_rate": 0.0001, + "loss": 5.5595, + "loss/crossentropy": 2.56177020072937, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.15993209183216095, + "step": 25644 + }, + { + "epoch": 0.8014375, + "grad_norm": 2.984375, + "grad_norm_var": 0.040934244791666664, + "learning_rate": 0.0001, + "loss": 5.9067, + "loss/crossentropy": 2.7646554708480835, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1681097373366356, + "step": 25646 + }, + { + "epoch": 0.8015, + "grad_norm": 3.109375, + "grad_norm_var": 0.0415679931640625, + "learning_rate": 0.0001, + "loss": 5.586, + "loss/crossentropy": 2.5196781158447266, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15780264139175415, + "step": 25648 + }, + { + "epoch": 0.8015625, + "grad_norm": 2.90625, + "grad_norm_var": 0.04228413899739583, + "learning_rate": 0.0001, + "loss": 5.7445, + "loss/crossentropy": 2.6249492168426514, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1627407670021057, + "step": 25650 + }, + { + "epoch": 0.801625, + "grad_norm": 3.1875, + "grad_norm_var": 0.04625244140625, + "learning_rate": 0.0001, + "loss": 5.8244, + "loss/crossentropy": 2.7322680950164795, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16272947937250137, + "step": 25652 + }, + { + "epoch": 0.8016875, + "grad_norm": 3.234375, + "grad_norm_var": 0.04169514973958333, + "learning_rate": 0.0001, + "loss": 5.4287, + "loss/crossentropy": 2.3911421298980713, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.15375907719135284, + "step": 25654 + }, + { + "epoch": 0.80175, + "grad_norm": 3.4375, + "grad_norm_var": 0.04491780598958333, + "learning_rate": 0.0001, + "loss": 5.9986, + "loss/crossentropy": 2.765831232070923, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17366764694452286, + "step": 25656 + }, + { + "epoch": 0.8018125, + "grad_norm": 3.046875, + "grad_norm_var": 0.03046875, + "learning_rate": 0.0001, + "loss": 5.9062, + "loss/crossentropy": 2.736068844795227, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16897106915712357, + "step": 25658 + }, + { + "epoch": 0.801875, + "grad_norm": 3.40625, + "grad_norm_var": 0.04478251139322917, + "learning_rate": 0.0001, + "loss": 5.3685, + "loss/crossentropy": 2.4307442903518677, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1515898033976555, + "step": 25660 + }, + { + "epoch": 0.8019375, + "grad_norm": 3.171875, + "grad_norm_var": 0.0458648681640625, + "learning_rate": 0.0001, + "loss": 5.3073, + "loss/crossentropy": 2.3094619512557983, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1540796086192131, + "step": 25662 + }, + { + "epoch": 0.802, + "grad_norm": 3.296875, + "grad_norm_var": 0.04804280598958333, + "learning_rate": 0.0001, + "loss": 5.4717, + "loss/crossentropy": 2.4414368867874146, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15927442908287048, + "step": 25664 + }, + { + "epoch": 0.8020625, + "grad_norm": 3.0, + "grad_norm_var": 0.046126302083333334, + "learning_rate": 0.0001, + "loss": 5.4901, + "loss/crossentropy": 2.4707207679748535, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.158184215426445, + "step": 25666 + }, + { + "epoch": 0.802125, + "grad_norm": 3.234375, + "grad_norm_var": 0.039338175455729166, + "learning_rate": 0.0001, + "loss": 5.5646, + "loss/crossentropy": 2.598528742790222, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.1548113077878952, + "step": 25668 + }, + { + "epoch": 0.8021875, + "grad_norm": 2.953125, + "grad_norm_var": 0.050715128580729164, + "learning_rate": 0.0001, + "loss": 5.4259, + "loss/crossentropy": 2.482747793197632, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.15330323576927185, + "step": 25670 + }, + { + "epoch": 0.80225, + "grad_norm": 3.140625, + "grad_norm_var": 0.04548238118489583, + "learning_rate": 0.0001, + "loss": 5.7484, + "loss/crossentropy": 2.6026499271392822, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16613566130399704, + "step": 25672 + }, + { + "epoch": 0.8023125, + "grad_norm": 3.28125, + "grad_norm_var": 0.043309529622395836, + "learning_rate": 0.0001, + "loss": 5.8977, + "loss/crossentropy": 2.7315609455108643, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16661161929368973, + "step": 25674 + }, + { + "epoch": 0.802375, + "grad_norm": 3.25, + "grad_norm_var": 0.0368316650390625, + "learning_rate": 0.0001, + "loss": 5.4293, + "loss/crossentropy": 2.437578558921814, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15542569011449814, + "step": 25676 + }, + { + "epoch": 0.8024375, + "grad_norm": 3.0625, + "grad_norm_var": 0.03489481608072917, + "learning_rate": 0.0001, + "loss": 5.4747, + "loss/crossentropy": 2.4499117136001587, + "loss/hidden": 1.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.1622449979186058, + "step": 25678 + }, + { + "epoch": 0.8025, + "grad_norm": 3.1875, + "grad_norm_var": 0.06774088541666666, + "learning_rate": 0.0001, + "loss": 5.5693, + "loss/crossentropy": 2.462652325630188, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.160662479698658, + "step": 25680 + }, + { + "epoch": 0.8025625, + "grad_norm": 3.21875, + "grad_norm_var": 0.07193603515625, + "learning_rate": 0.0001, + "loss": 5.3556, + "loss/crossentropy": 2.336050271987915, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.15195493400096893, + "step": 25682 + }, + { + "epoch": 0.802625, + "grad_norm": 3.09375, + "grad_norm_var": 0.07487691243489583, + "learning_rate": 0.0001, + "loss": 5.4765, + "loss/crossentropy": 2.4275963306427, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1619264781475067, + "step": 25684 + }, + { + "epoch": 0.8026875, + "grad_norm": 3.03125, + "grad_norm_var": 0.0613677978515625, + "learning_rate": 0.0001, + "loss": 5.6656, + "loss/crossentropy": 2.613976240158081, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15828579664230347, + "step": 25686 + }, + { + "epoch": 0.80275, + "grad_norm": 2.890625, + "grad_norm_var": 0.06897786458333334, + "learning_rate": 0.0001, + "loss": 5.4136, + "loss/crossentropy": 2.4856724739074707, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15216396749019623, + "step": 25688 + }, + { + "epoch": 0.8028125, + "grad_norm": 3.296875, + "grad_norm_var": 0.07062886555989584, + "learning_rate": 0.0001, + "loss": 6.0441, + "loss/crossentropy": 2.7790746688842773, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17806822806596756, + "step": 25690 + }, + { + "epoch": 0.802875, + "grad_norm": 3.390625, + "grad_norm_var": 0.08241780598958333, + "learning_rate": 0.0001, + "loss": 5.4503, + "loss/crossentropy": 2.4067717790603638, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15669343620538712, + "step": 25692 + }, + { + "epoch": 0.8029375, + "grad_norm": 2.921875, + "grad_norm_var": 0.0868316650390625, + "learning_rate": 0.0001, + "loss": 5.4701, + "loss/crossentropy": 2.4876248836517334, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15449516475200653, + "step": 25694 + }, + { + "epoch": 0.803, + "grad_norm": 2.953125, + "grad_norm_var": 0.04761962890625, + "learning_rate": 0.0001, + "loss": 5.7045, + "loss/crossentropy": 2.689522862434387, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15657395124435425, + "step": 25696 + }, + { + "epoch": 0.8030625, + "grad_norm": 2.671875, + "grad_norm_var": 0.0401275634765625, + "learning_rate": 0.0001, + "loss": 5.4649, + "loss/crossentropy": 2.47347354888916, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15539731830358505, + "step": 25698 + }, + { + "epoch": 0.803125, + "grad_norm": 2.953125, + "grad_norm_var": 0.045633951822916664, + "learning_rate": 0.0001, + "loss": 5.3044, + "loss/crossentropy": 2.332689046859741, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15654505789279938, + "step": 25700 + }, + { + "epoch": 0.8031875, + "grad_norm": 3.0625, + "grad_norm_var": 0.045703125, + "learning_rate": 0.0001, + "loss": 5.4595, + "loss/crossentropy": 2.3924553394317627, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15866002440452576, + "step": 25702 + }, + { + "epoch": 0.80325, + "grad_norm": 3.40625, + "grad_norm_var": 0.057062784830729164, + "learning_rate": 0.0001, + "loss": 5.6308, + "loss/crossentropy": 2.5779011249542236, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16153736412525177, + "step": 25704 + }, + { + "epoch": 0.8033125, + "grad_norm": 3.25, + "grad_norm_var": 0.058690388997395836, + "learning_rate": 0.0001, + "loss": 5.7869, + "loss/crossentropy": 2.5727206468582153, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17219997197389603, + "step": 25706 + }, + { + "epoch": 0.803375, + "grad_norm": 3.09375, + "grad_norm_var": 0.0398590087890625, + "learning_rate": 0.0001, + "loss": 5.6147, + "loss/crossentropy": 2.514239192008972, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16355954110622406, + "step": 25708 + }, + { + "epoch": 0.8034375, + "grad_norm": 2.96875, + "grad_norm_var": 0.0429351806640625, + "learning_rate": 0.0001, + "loss": 5.7726, + "loss/crossentropy": 2.6182950735092163, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16777712106704712, + "step": 25710 + }, + { + "epoch": 0.8035, + "grad_norm": 3.296875, + "grad_norm_var": 0.04412434895833333, + "learning_rate": 0.0001, + "loss": 5.4248, + "loss/crossentropy": 2.3947904109954834, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1596369817852974, + "step": 25712 + }, + { + "epoch": 0.8035625, + "grad_norm": 3.046875, + "grad_norm_var": 0.03072509765625, + "learning_rate": 0.0001, + "loss": 5.796, + "loss/crossentropy": 2.6150922775268555, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16887310147285461, + "step": 25714 + }, + { + "epoch": 0.803625, + "grad_norm": 3.171875, + "grad_norm_var": 0.016209920247395832, + "learning_rate": 0.0001, + "loss": 5.6265, + "loss/crossentropy": 2.5220504999160767, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16513632237911224, + "step": 25716 + }, + { + "epoch": 0.8036875, + "grad_norm": 3.046875, + "grad_norm_var": 0.014256795247395834, + "learning_rate": 0.0001, + "loss": 6.0629, + "loss/crossentropy": 2.8565841913223267, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1729728877544403, + "step": 25718 + }, + { + "epoch": 0.80375, + "grad_norm": 3.09375, + "grad_norm_var": 0.017121378580729166, + "learning_rate": 0.0001, + "loss": 5.6883, + "loss/crossentropy": 2.6064772605895996, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16364658623933792, + "step": 25720 + }, + { + "epoch": 0.8038125, + "grad_norm": 3.046875, + "grad_norm_var": 0.015217081705729166, + "learning_rate": 0.0001, + "loss": 5.4772, + "loss/crossentropy": 2.4333605766296387, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.159462071955204, + "step": 25722 + }, + { + "epoch": 0.803875, + "grad_norm": 3.421875, + "grad_norm_var": 0.021415201822916667, + "learning_rate": 0.0001, + "loss": 5.907, + "loss/crossentropy": 2.6885892152786255, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17418118566274643, + "step": 25724 + }, + { + "epoch": 0.8039375, + "grad_norm": 3.234375, + "grad_norm_var": 0.017780558268229166, + "learning_rate": 0.0001, + "loss": 5.7036, + "loss/crossentropy": 2.620099186897278, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16147109866142273, + "step": 25726 + }, + { + "epoch": 0.804, + "grad_norm": 2.890625, + "grad_norm_var": 0.021385701497395833, + "learning_rate": 0.0001, + "loss": 5.1817, + "loss/crossentropy": 2.275924324989319, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1452612727880478, + "step": 25728 + }, + { + "epoch": 0.8040625, + "grad_norm": 3.265625, + "grad_norm_var": 0.15966389973958334, + "learning_rate": 0.0001, + "loss": 5.6325, + "loss/crossentropy": 2.5340161323547363, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1610231250524521, + "step": 25730 + }, + { + "epoch": 0.804125, + "grad_norm": 2.84375, + "grad_norm_var": 0.18509114583333333, + "learning_rate": 0.0001, + "loss": 5.7786, + "loss/crossentropy": 2.6004785299301147, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17093750834465027, + "step": 25732 + }, + { + "epoch": 0.8041875, + "grad_norm": 2.8125, + "grad_norm_var": 0.19388020833333333, + "learning_rate": 0.0001, + "loss": 5.781, + "loss/crossentropy": 2.6742039918899536, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16614725440740585, + "step": 25734 + }, + { + "epoch": 0.80425, + "grad_norm": 3.03125, + "grad_norm_var": 0.18931376139322917, + "learning_rate": 0.0001, + "loss": 5.3246, + "loss/crossentropy": 2.3369566202163696, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.156968355178833, + "step": 25736 + }, + { + "epoch": 0.8043125, + "grad_norm": 3.15625, + "grad_norm_var": 0.19371744791666667, + "learning_rate": 0.0001, + "loss": 5.9781, + "loss/crossentropy": 2.8178043365478516, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16993281990289688, + "step": 25738 + }, + { + "epoch": 0.804375, + "grad_norm": 3.078125, + "grad_norm_var": 0.19742431640625, + "learning_rate": 0.0001, + "loss": 5.588, + "loss/crossentropy": 2.5488440990448, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15860477834939957, + "step": 25740 + }, + { + "epoch": 0.8044375, + "grad_norm": 3.109375, + "grad_norm_var": 0.19734700520833334, + "learning_rate": 0.0001, + "loss": 5.6494, + "loss/crossentropy": 2.6056177616119385, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1590660735964775, + "step": 25742 + }, + { + "epoch": 0.8045, + "grad_norm": 3.390625, + "grad_norm_var": 0.18928629557291668, + "learning_rate": 0.0001, + "loss": 5.4437, + "loss/crossentropy": 2.3790863752365112, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15997406095266342, + "step": 25744 + }, + { + "epoch": 0.8045625, + "grad_norm": 3.015625, + "grad_norm_var": 0.06025390625, + "learning_rate": 0.0001, + "loss": 5.5304, + "loss/crossentropy": 2.4592326879501343, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1606363207101822, + "step": 25746 + }, + { + "epoch": 0.804625, + "grad_norm": 3.21875, + "grad_norm_var": 0.03284505208333333, + "learning_rate": 0.0001, + "loss": 5.3759, + "loss/crossentropy": 2.338041305541992, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.16199104487895966, + "step": 25748 + }, + { + "epoch": 0.8046875, + "grad_norm": 3.203125, + "grad_norm_var": 0.030980428059895832, + "learning_rate": 0.0001, + "loss": 5.5584, + "loss/crossentropy": 2.522215723991394, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1579131782054901, + "step": 25750 + }, + { + "epoch": 0.80475, + "grad_norm": 2.921875, + "grad_norm_var": 0.03166402180989583, + "learning_rate": 0.0001, + "loss": 5.2946, + "loss/crossentropy": 2.321294665336609, + "loss/hidden": 1.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.15710077434778214, + "step": 25752 + }, + { + "epoch": 0.8048125, + "grad_norm": 3.15625, + "grad_norm_var": 0.026839192708333334, + "learning_rate": 0.0001, + "loss": 5.8861, + "loss/crossentropy": 2.8006707429885864, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16128186881542206, + "step": 25754 + }, + { + "epoch": 0.804875, + "grad_norm": 3.34375, + "grad_norm_var": 0.022175089518229166, + "learning_rate": 0.0001, + "loss": 5.2714, + "loss/crossentropy": 2.295003652572632, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1542847454547882, + "step": 25756 + }, + { + "epoch": 0.8049375, + "grad_norm": 3.078125, + "grad_norm_var": 0.0223541259765625, + "learning_rate": 0.0001, + "loss": 5.5602, + "loss/crossentropy": 2.499534487724304, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16114644706249237, + "step": 25758 + }, + { + "epoch": 0.805, + "grad_norm": 3.046875, + "grad_norm_var": 0.018993123372395834, + "learning_rate": 0.0001, + "loss": 5.5152, + "loss/crossentropy": 2.5322351455688477, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1545419991016388, + "step": 25760 + }, + { + "epoch": 0.8050625, + "grad_norm": 2.96875, + "grad_norm_var": 0.017479451497395833, + "learning_rate": 0.0001, + "loss": 5.4664, + "loss/crossentropy": 2.490007162094116, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1550569236278534, + "step": 25762 + }, + { + "epoch": 0.805125, + "grad_norm": 2.828125, + "grad_norm_var": 0.0184967041015625, + "learning_rate": 0.0001, + "loss": 5.0951, + "loss/crossentropy": 2.164342999458313, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1493251919746399, + "step": 25764 + }, + { + "epoch": 0.8051875, + "grad_norm": 3.203125, + "grad_norm_var": 0.016162109375, + "learning_rate": 0.0001, + "loss": 5.5416, + "loss/crossentropy": 2.4952151775360107, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1608925759792328, + "step": 25766 + }, + { + "epoch": 0.80525, + "grad_norm": 3.03125, + "grad_norm_var": 0.015623982747395833, + "learning_rate": 0.0001, + "loss": 5.4543, + "loss/crossentropy": 2.396771192550659, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1580957993865013, + "step": 25768 + }, + { + "epoch": 0.8053125, + "grad_norm": 2.640625, + "grad_norm_var": 0.042215983072916664, + "learning_rate": 0.0001, + "loss": 5.3644, + "loss/crossentropy": 2.3864080905914307, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15444162487983704, + "step": 25770 + }, + { + "epoch": 0.805375, + "grad_norm": 3.109375, + "grad_norm_var": 0.037873331705729166, + "learning_rate": 0.0001, + "loss": 5.611, + "loss/crossentropy": 2.5382243394851685, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1619631052017212, + "step": 25772 + }, + { + "epoch": 0.8054375, + "grad_norm": 3.3125, + "grad_norm_var": 0.04263916015625, + "learning_rate": 0.0001, + "loss": 5.7934, + "loss/crossentropy": 2.6957108974456787, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16680126637220383, + "step": 25774 + }, + { + "epoch": 0.8055, + "grad_norm": 3.0, + "grad_norm_var": 0.042333984375, + "learning_rate": 0.0001, + "loss": 5.3557, + "loss/crossentropy": 2.397977590560913, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.15397745370864868, + "step": 25776 + }, + { + "epoch": 0.8055625, + "grad_norm": 3.1875, + "grad_norm_var": 0.042170206705729164, + "learning_rate": 0.0001, + "loss": 5.725, + "loss/crossentropy": 2.65224027633667, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16470235586166382, + "step": 25778 + }, + { + "epoch": 0.805625, + "grad_norm": 3.171875, + "grad_norm_var": 0.039281209309895836, + "learning_rate": 0.0001, + "loss": 5.4874, + "loss/crossentropy": 2.4238076210021973, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16026780009269714, + "step": 25780 + }, + { + "epoch": 0.8056875, + "grad_norm": 3.265625, + "grad_norm_var": 0.04670308430989583, + "learning_rate": 0.0001, + "loss": 5.2495, + "loss/crossentropy": 2.3126675486564636, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1472015157341957, + "step": 25782 + }, + { + "epoch": 0.80575, + "grad_norm": 3.046875, + "grad_norm_var": 0.046442667643229164, + "learning_rate": 0.0001, + "loss": 5.6502, + "loss/crossentropy": 2.618051767349243, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.162589430809021, + "step": 25784 + }, + { + "epoch": 0.8058125, + "grad_norm": 3.1875, + "grad_norm_var": 0.0198638916015625, + "learning_rate": 0.0001, + "loss": 5.6136, + "loss/crossentropy": 2.607123851776123, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15376880764961243, + "step": 25786 + }, + { + "epoch": 0.805875, + "grad_norm": 3.703125, + "grad_norm_var": 0.0438385009765625, + "learning_rate": 0.0001, + "loss": 5.8406, + "loss/crossentropy": 2.6182804107666016, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1745753288269043, + "step": 25788 + }, + { + "epoch": 0.8059375, + "grad_norm": 2.78125, + "grad_norm_var": 0.04641520182291667, + "learning_rate": 0.0001, + "loss": 5.623, + "loss/crossentropy": 2.5778337717056274, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15803135931491852, + "step": 25790 + }, + { + "epoch": 0.806, + "grad_norm": 2.96875, + "grad_norm_var": 0.05774637858072917, + "learning_rate": 0.0001, + "loss": 5.9335, + "loss/crossentropy": 2.7090498208999634, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1744006648659706, + "step": 25792 + }, + { + "epoch": 0.8060625, + "grad_norm": 3.125, + "grad_norm_var": 0.057917277018229164, + "learning_rate": 0.0001, + "loss": 5.5817, + "loss/crossentropy": 2.451915979385376, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1645418405532837, + "step": 25794 + }, + { + "epoch": 0.806125, + "grad_norm": 2.84375, + "grad_norm_var": 0.06393229166666667, + "learning_rate": 0.0001, + "loss": 5.7454, + "loss/crossentropy": 2.624258518218994, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16562572121620178, + "step": 25796 + }, + { + "epoch": 0.8061875, + "grad_norm": 3.140625, + "grad_norm_var": 0.0544097900390625, + "learning_rate": 0.0001, + "loss": 5.7204, + "loss/crossentropy": 2.6387758255004883, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16207094490528107, + "step": 25798 + }, + { + "epoch": 0.80625, + "grad_norm": 3.265625, + "grad_norm_var": 0.05766499837239583, + "learning_rate": 0.0001, + "loss": 5.9534, + "loss/crossentropy": 2.6982457637786865, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1743442565202713, + "step": 25800 + }, + { + "epoch": 0.8063125, + "grad_norm": 2.828125, + "grad_norm_var": 0.07045796712239584, + "learning_rate": 0.0001, + "loss": 5.5196, + "loss/crossentropy": 2.570582151412964, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.14724723994731903, + "step": 25802 + }, + { + "epoch": 0.806375, + "grad_norm": 3.15625, + "grad_norm_var": 0.04912109375, + "learning_rate": 0.0001, + "loss": 5.8479, + "loss/crossentropy": 2.696234703063965, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16556153446435928, + "step": 25804 + }, + { + "epoch": 0.8064375, + "grad_norm": 2.953125, + "grad_norm_var": 0.0493560791015625, + "learning_rate": 0.0001, + "loss": 5.2374, + "loss/crossentropy": 2.359342098236084, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.1479654163122177, + "step": 25806 + }, + { + "epoch": 0.8065, + "grad_norm": 3.140625, + "grad_norm_var": 0.03502197265625, + "learning_rate": 0.0001, + "loss": 5.8056, + "loss/crossentropy": 2.625843048095703, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16953448951244354, + "step": 25808 + }, + { + "epoch": 0.8065625, + "grad_norm": 2.890625, + "grad_norm_var": 0.03536783854166667, + "learning_rate": 0.0001, + "loss": 5.5963, + "loss/crossentropy": 2.5228800773620605, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16124501079320908, + "step": 25810 + }, + { + "epoch": 0.806625, + "grad_norm": 3.109375, + "grad_norm_var": 0.033177693684895836, + "learning_rate": 0.0001, + "loss": 5.6845, + "loss/crossentropy": 2.55816388130188, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16458744555711746, + "step": 25812 + }, + { + "epoch": 0.8066875, + "grad_norm": 3.125, + "grad_norm_var": 0.03134358723958333, + "learning_rate": 0.0001, + "loss": 5.527, + "loss/crossentropy": 2.5222166776657104, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1590726673603058, + "step": 25814 + }, + { + "epoch": 0.80675, + "grad_norm": 3.078125, + "grad_norm_var": 0.019222005208333334, + "learning_rate": 0.0001, + "loss": 5.728, + "loss/crossentropy": 2.6434502601623535, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16392720490694046, + "step": 25816 + }, + { + "epoch": 0.8068125, + "grad_norm": 3.359375, + "grad_norm_var": 0.02164306640625, + "learning_rate": 0.0001, + "loss": 5.6377, + "loss/crossentropy": 2.5822893381118774, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16061674058437347, + "step": 25818 + }, + { + "epoch": 0.806875, + "grad_norm": 3.140625, + "grad_norm_var": 0.02711181640625, + "learning_rate": 0.0001, + "loss": 6.0137, + "loss/crossentropy": 2.72200345993042, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17604146897792816, + "step": 25820 + }, + { + "epoch": 0.8069375, + "grad_norm": 2.984375, + "grad_norm_var": 0.020833333333333332, + "learning_rate": 0.0001, + "loss": 5.779, + "loss/crossentropy": 2.6549432277679443, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.17100276798009872, + "step": 25822 + }, + { + "epoch": 0.807, + "grad_norm": 3.09375, + "grad_norm_var": 0.024918619791666666, + "learning_rate": 0.0001, + "loss": 5.5089, + "loss/crossentropy": 2.4635682106018066, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1588340327143669, + "step": 25824 + }, + { + "epoch": 0.8070625, + "grad_norm": 3.1875, + "grad_norm_var": 0.0245758056640625, + "learning_rate": 0.0001, + "loss": 5.8354, + "loss/crossentropy": 2.7161245346069336, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16388526558876038, + "step": 25826 + }, + { + "epoch": 0.807125, + "grad_norm": 3.03125, + "grad_norm_var": 0.0297271728515625, + "learning_rate": 0.0001, + "loss": 5.5257, + "loss/crossentropy": 2.570802092552185, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15486504882574081, + "step": 25828 + }, + { + "epoch": 0.8071875, + "grad_norm": 3.21875, + "grad_norm_var": 0.030475870768229166, + "learning_rate": 0.0001, + "loss": 5.7724, + "loss/crossentropy": 2.628113031387329, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16912047564983368, + "step": 25830 + }, + { + "epoch": 0.80725, + "grad_norm": 2.921875, + "grad_norm_var": 0.03525390625, + "learning_rate": 0.0001, + "loss": 5.4256, + "loss/crossentropy": 2.4442425966262817, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.15633535385131836, + "step": 25832 + }, + { + "epoch": 0.8073125, + "grad_norm": 3.234375, + "grad_norm_var": 0.030003865559895832, + "learning_rate": 0.0001, + "loss": 5.7966, + "loss/crossentropy": 2.6853344440460205, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16620568186044693, + "step": 25834 + }, + { + "epoch": 0.807375, + "grad_norm": 3.3125, + "grad_norm_var": 0.02691650390625, + "learning_rate": 0.0001, + "loss": 5.6149, + "loss/crossentropy": 2.539078116416931, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16227243840694427, + "step": 25836 + }, + { + "epoch": 0.8074375, + "grad_norm": 3.15625, + "grad_norm_var": 0.03668212890625, + "learning_rate": 0.0001, + "loss": 5.6767, + "loss/crossentropy": 2.4889843463897705, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17150284349918365, + "step": 25838 + }, + { + "epoch": 0.8075, + "grad_norm": 2.90625, + "grad_norm_var": 0.03472900390625, + "learning_rate": 0.0001, + "loss": 5.4635, + "loss/crossentropy": 2.4544711112976074, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15636713802814484, + "step": 25840 + }, + { + "epoch": 0.8075625, + "grad_norm": 3.1875, + "grad_norm_var": 0.05281473795572917, + "learning_rate": 0.0001, + "loss": 5.8003, + "loss/crossentropy": 2.6237099170684814, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16883177310228348, + "step": 25842 + }, + { + "epoch": 0.807625, + "grad_norm": 2.796875, + "grad_norm_var": 0.0544830322265625, + "learning_rate": 0.0001, + "loss": 5.7551, + "loss/crossentropy": 2.6296327114105225, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16801517456769943, + "step": 25844 + }, + { + "epoch": 0.8076875, + "grad_norm": 2.65625, + "grad_norm_var": 0.07205301920572917, + "learning_rate": 0.0001, + "loss": 5.3143, + "loss/crossentropy": 2.3800086975097656, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1496814787387848, + "step": 25846 + }, + { + "epoch": 0.80775, + "grad_norm": 3.0625, + "grad_norm_var": 0.06725260416666666, + "learning_rate": 0.0001, + "loss": 5.5656, + "loss/crossentropy": 2.520174264907837, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15845265984535217, + "step": 25848 + }, + { + "epoch": 0.8078125, + "grad_norm": 2.875, + "grad_norm_var": 0.06857808430989583, + "learning_rate": 0.0001, + "loss": 5.4627, + "loss/crossentropy": 2.472960591316223, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15288261324167252, + "step": 25850 + }, + { + "epoch": 0.807875, + "grad_norm": 3.0, + "grad_norm_var": 0.06544596354166667, + "learning_rate": 0.0001, + "loss": 5.6572, + "loss/crossentropy": 2.5968352556228638, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1615081951022148, + "step": 25852 + }, + { + "epoch": 0.8079375, + "grad_norm": 3.21875, + "grad_norm_var": 0.06123046875, + "learning_rate": 0.0001, + "loss": 6.0538, + "loss/crossentropy": 2.8122987747192383, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17414553463459015, + "step": 25854 + }, + { + "epoch": 0.808, + "grad_norm": 3.0625, + "grad_norm_var": 0.05886128743489583, + "learning_rate": 0.0001, + "loss": 5.675, + "loss/crossentropy": 2.5798511505126953, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16576293855905533, + "step": 25856 + }, + { + "epoch": 0.8080625, + "grad_norm": 2.953125, + "grad_norm_var": 0.04052327473958333, + "learning_rate": 0.0001, + "loss": 5.4631, + "loss/crossentropy": 2.5038857460021973, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1498228833079338, + "step": 25858 + }, + { + "epoch": 0.808125, + "grad_norm": 3.25, + "grad_norm_var": 0.0347808837890625, + "learning_rate": 0.0001, + "loss": 5.5511, + "loss/crossentropy": 2.5348498821258545, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.16099654883146286, + "step": 25860 + }, + { + "epoch": 0.8081875, + "grad_norm": 3.046875, + "grad_norm_var": 0.018651326497395832, + "learning_rate": 0.0001, + "loss": 5.4165, + "loss/crossentropy": 2.404328227043152, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.158634215593338, + "step": 25862 + }, + { + "epoch": 0.80825, + "grad_norm": 3.1875, + "grad_norm_var": 0.019220987955729168, + "learning_rate": 0.0001, + "loss": 5.4641, + "loss/crossentropy": 2.364362955093384, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16349120438098907, + "step": 25864 + }, + { + "epoch": 0.8083125, + "grad_norm": 2.875, + "grad_norm_var": 0.019364420572916666, + "learning_rate": 0.0001, + "loss": 5.5132, + "loss/crossentropy": 2.410773754119873, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16258376836776733, + "step": 25866 + }, + { + "epoch": 0.808375, + "grad_norm": 3.296875, + "grad_norm_var": 0.021968587239583334, + "learning_rate": 0.0001, + "loss": 5.6447, + "loss/crossentropy": 2.5925387144088745, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16185922920703888, + "step": 25868 + }, + { + "epoch": 0.8084375, + "grad_norm": 3.0625, + "grad_norm_var": 0.0190338134765625, + "learning_rate": 0.0001, + "loss": 5.8386, + "loss/crossentropy": 2.6724647283554077, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16700299084186554, + "step": 25870 + }, + { + "epoch": 0.8085, + "grad_norm": 3.265625, + "grad_norm_var": 0.0214752197265625, + "learning_rate": 0.0001, + "loss": 5.7479, + "loss/crossentropy": 2.63448703289032, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16525235772132874, + "step": 25872 + }, + { + "epoch": 0.8085625, + "grad_norm": 2.78125, + "grad_norm_var": 0.02340087890625, + "learning_rate": 0.0001, + "loss": 5.2758, + "loss/crossentropy": 2.299230217933655, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1535119041800499, + "step": 25874 + }, + { + "epoch": 0.808625, + "grad_norm": 2.75, + "grad_norm_var": 0.03804931640625, + "learning_rate": 0.0001, + "loss": 5.5636, + "loss/crossentropy": 2.5843125581741333, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15730515867471695, + "step": 25876 + }, + { + "epoch": 0.8086875, + "grad_norm": 2.921875, + "grad_norm_var": 0.04053446451822917, + "learning_rate": 0.0001, + "loss": 5.6744, + "loss/crossentropy": 2.5767170190811157, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1652393788099289, + "step": 25878 + }, + { + "epoch": 0.80875, + "grad_norm": 3.875, + "grad_norm_var": 0.08639322916666667, + "learning_rate": 0.0001, + "loss": 5.8538, + "loss/crossentropy": 2.6336944103240967, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1739620342850685, + "step": 25880 + }, + { + "epoch": 0.8088125, + "grad_norm": 3.34375, + "grad_norm_var": 0.08307291666666666, + "learning_rate": 0.0001, + "loss": 5.8214, + "loss/crossentropy": 2.643694758415222, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17089402675628662, + "step": 25882 + }, + { + "epoch": 0.808875, + "grad_norm": 3.328125, + "grad_norm_var": 0.08222249348958334, + "learning_rate": 0.0001, + "loss": 5.3962, + "loss/crossentropy": 2.347903251647949, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15912645310163498, + "step": 25884 + }, + { + "epoch": 0.8089375, + "grad_norm": 3.265625, + "grad_norm_var": 0.08266499837239584, + "learning_rate": 0.0001, + "loss": 5.6709, + "loss/crossentropy": 2.528378963470459, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1650313213467598, + "step": 25886 + }, + { + "epoch": 0.809, + "grad_norm": 2.9375, + "grad_norm_var": 0.08391011555989583, + "learning_rate": 0.0001, + "loss": 5.7254, + "loss/crossentropy": 2.6138205528259277, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16584961116313934, + "step": 25888 + }, + { + "epoch": 0.8090625, + "grad_norm": 2.625, + "grad_norm_var": 0.09212239583333333, + "learning_rate": 0.0001, + "loss": 5.312, + "loss/crossentropy": 2.38472843170166, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.1521066278219223, + "step": 25890 + }, + { + "epoch": 0.809125, + "grad_norm": 2.96875, + "grad_norm_var": 0.07939046223958333, + "learning_rate": 0.0001, + "loss": 5.6212, + "loss/crossentropy": 2.601081967353821, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15942999720573425, + "step": 25892 + }, + { + "epoch": 0.8091875, + "grad_norm": 3.296875, + "grad_norm_var": 0.08157145182291667, + "learning_rate": 0.0001, + "loss": 5.8314, + "loss/crossentropy": 2.6287145614624023, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17183279991149902, + "step": 25894 + }, + { + "epoch": 0.80925, + "grad_norm": 2.921875, + "grad_norm_var": 0.05066731770833333, + "learning_rate": 0.0001, + "loss": 5.6516, + "loss/crossentropy": 2.633685350418091, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1568736582994461, + "step": 25896 + }, + { + "epoch": 0.8093125, + "grad_norm": 3.21875, + "grad_norm_var": 0.04893290201822917, + "learning_rate": 0.0001, + "loss": 5.9869, + "loss/crossentropy": 2.7616509199142456, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1721365600824356, + "step": 25898 + }, + { + "epoch": 0.809375, + "grad_norm": 3.03125, + "grad_norm_var": 0.0486236572265625, + "learning_rate": 0.0001, + "loss": 6.0606, + "loss/crossentropy": 2.7832703590393066, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.18007761240005493, + "step": 25900 + }, + { + "epoch": 0.8094375, + "grad_norm": 3.140625, + "grad_norm_var": 0.042723592122395834, + "learning_rate": 0.0001, + "loss": 5.6753, + "loss/crossentropy": 2.538694739341736, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16639665514230728, + "step": 25902 + }, + { + "epoch": 0.8095, + "grad_norm": 3.015625, + "grad_norm_var": 0.0408111572265625, + "learning_rate": 0.0001, + "loss": 5.3537, + "loss/crossentropy": 2.4291588068008423, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15104401111602783, + "step": 25904 + }, + { + "epoch": 0.8095625, + "grad_norm": 3.390625, + "grad_norm_var": 0.03291015625, + "learning_rate": 0.0001, + "loss": 5.6602, + "loss/crossentropy": 2.547026038169861, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1640501320362091, + "step": 25906 + }, + { + "epoch": 0.809625, + "grad_norm": 3.015625, + "grad_norm_var": 0.032450358072916664, + "learning_rate": 0.0001, + "loss": 5.4983, + "loss/crossentropy": 2.5117374658584595, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15646952390670776, + "step": 25908 + }, + { + "epoch": 0.8096875, + "grad_norm": 3.0625, + "grad_norm_var": 0.019587198893229168, + "learning_rate": 0.0001, + "loss": 5.5231, + "loss/crossentropy": 2.5682032108306885, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.154079370200634, + "step": 25910 + }, + { + "epoch": 0.80975, + "grad_norm": 3.0, + "grad_norm_var": 0.017801920572916668, + "learning_rate": 0.0001, + "loss": 5.83, + "loss/crossentropy": 2.6992393732070923, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16541622579097748, + "step": 25912 + }, + { + "epoch": 0.8098125, + "grad_norm": 3.0625, + "grad_norm_var": 0.016727701822916666, + "learning_rate": 0.0001, + "loss": 5.5044, + "loss/crossentropy": 2.5401517152786255, + "loss/hidden": 1.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.15696705877780914, + "step": 25914 + }, + { + "epoch": 0.809875, + "grad_norm": 3.234375, + "grad_norm_var": 0.014484659830729166, + "learning_rate": 0.0001, + "loss": 5.8543, + "loss/crossentropy": 2.7261210680007935, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1643773913383484, + "step": 25916 + }, + { + "epoch": 0.8099375, + "grad_norm": 3.1875, + "grad_norm_var": 0.015673828125, + "learning_rate": 0.0001, + "loss": 5.6174, + "loss/crossentropy": 2.6042429208755493, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15756148099899292, + "step": 25918 + }, + { + "epoch": 0.81, + "grad_norm": 3.0625, + "grad_norm_var": 0.0150299072265625, + "learning_rate": 0.0001, + "loss": 5.679, + "loss/crossentropy": 2.636616587638855, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16009552031755447, + "step": 25920 + }, + { + "epoch": 0.8100625, + "grad_norm": 3.296875, + "grad_norm_var": 0.013114420572916667, + "learning_rate": 0.0001, + "loss": 5.3148, + "loss/crossentropy": 2.367428779602051, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15176884829998016, + "step": 25922 + }, + { + "epoch": 0.810125, + "grad_norm": 3.4375, + "grad_norm_var": 0.020849609375, + "learning_rate": 0.0001, + "loss": 5.5218, + "loss/crossentropy": 2.435949444770813, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.158583365380764, + "step": 25924 + }, + { + "epoch": 0.8101875, + "grad_norm": 3.0625, + "grad_norm_var": 0.017854817708333335, + "learning_rate": 0.0001, + "loss": 5.5599, + "loss/crossentropy": 2.5772972106933594, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15607711672782898, + "step": 25926 + }, + { + "epoch": 0.81025, + "grad_norm": 3.09375, + "grad_norm_var": 0.017878214518229168, + "learning_rate": 0.0001, + "loss": 5.8299, + "loss/crossentropy": 2.6923950910568237, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16687103360891342, + "step": 25928 + }, + { + "epoch": 0.8103125, + "grad_norm": 2.875, + "grad_norm_var": 0.019831339518229168, + "learning_rate": 0.0001, + "loss": 5.726, + "loss/crossentropy": 2.693419575691223, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15911632776260376, + "step": 25930 + }, + { + "epoch": 0.810375, + "grad_norm": 2.796875, + "grad_norm_var": 0.024592081705729168, + "learning_rate": 0.0001, + "loss": 5.4774, + "loss/crossentropy": 2.5127739906311035, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15427283942699432, + "step": 25932 + }, + { + "epoch": 0.8104375, + "grad_norm": 3.140625, + "grad_norm_var": 0.025614420572916668, + "learning_rate": 0.0001, + "loss": 5.566, + "loss/crossentropy": 2.5519654750823975, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15921468287706375, + "step": 25934 + }, + { + "epoch": 0.8105, + "grad_norm": 3.421875, + "grad_norm_var": 0.03321024576822917, + "learning_rate": 0.0001, + "loss": 5.9783, + "loss/crossentropy": 2.8077282905578613, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17135663330554962, + "step": 25936 + }, + { + "epoch": 0.8105625, + "grad_norm": 2.921875, + "grad_norm_var": 0.030855305989583335, + "learning_rate": 0.0001, + "loss": 5.7128, + "loss/crossentropy": 2.6672173738479614, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1615942120552063, + "step": 25938 + }, + { + "epoch": 0.810625, + "grad_norm": 3.046875, + "grad_norm_var": 0.023856608072916667, + "learning_rate": 0.0001, + "loss": 5.7031, + "loss/crossentropy": 2.5845017433166504, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1677147075533867, + "step": 25940 + }, + { + "epoch": 0.8106875, + "grad_norm": 2.921875, + "grad_norm_var": 0.024413045247395834, + "learning_rate": 0.0001, + "loss": 5.7142, + "loss/crossentropy": 2.6113885641098022, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.164574533700943, + "step": 25942 + }, + { + "epoch": 0.81075, + "grad_norm": 2.96875, + "grad_norm_var": 0.03306376139322917, + "learning_rate": 0.0001, + "loss": 5.5484, + "loss/crossentropy": 2.4556208848953247, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16239862143993378, + "step": 25944 + }, + { + "epoch": 0.8108125, + "grad_norm": 2.96875, + "grad_norm_var": 0.031266276041666666, + "learning_rate": 0.0001, + "loss": 5.5104, + "loss/crossentropy": 2.480081081390381, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15771479159593582, + "step": 25946 + }, + { + "epoch": 0.810875, + "grad_norm": 2.875, + "grad_norm_var": 0.03152669270833333, + "learning_rate": 0.0001, + "loss": 5.7399, + "loss/crossentropy": 2.600390076637268, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16902713477611542, + "step": 25948 + }, + { + "epoch": 0.8109375, + "grad_norm": 2.9375, + "grad_norm_var": 0.0316314697265625, + "learning_rate": 0.0001, + "loss": 5.6332, + "loss/crossentropy": 2.538271427154541, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16418224573135376, + "step": 25950 + }, + { + "epoch": 0.811, + "grad_norm": 3.203125, + "grad_norm_var": 0.023469034830729166, + "learning_rate": 0.0001, + "loss": 5.6978, + "loss/crossentropy": 2.557962417602539, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16554241627454758, + "step": 25952 + }, + { + "epoch": 0.8110625, + "grad_norm": 3.296875, + "grad_norm_var": 0.04248758951822917, + "learning_rate": 0.0001, + "loss": 5.8858, + "loss/crossentropy": 2.6491034030914307, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1736716628074646, + "step": 25954 + }, + { + "epoch": 0.811125, + "grad_norm": 3.125, + "grad_norm_var": 0.04049479166666667, + "learning_rate": 0.0001, + "loss": 5.9979, + "loss/crossentropy": 2.8587846755981445, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16743197292089462, + "step": 25956 + }, + { + "epoch": 0.8111875, + "grad_norm": 3.40625, + "grad_norm_var": 0.0432281494140625, + "learning_rate": 0.0001, + "loss": 5.7378, + "loss/crossentropy": 2.593713879585266, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1636243313550949, + "step": 25958 + }, + { + "epoch": 0.81125, + "grad_norm": 3.140625, + "grad_norm_var": 0.041901652018229166, + "learning_rate": 0.0001, + "loss": 5.4866, + "loss/crossentropy": 2.398123025894165, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16119029372930527, + "step": 25960 + }, + { + "epoch": 0.8113125, + "grad_norm": 3.015625, + "grad_norm_var": 0.0491607666015625, + "learning_rate": 0.0001, + "loss": 5.4391, + "loss/crossentropy": 2.4695407152175903, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1539890244603157, + "step": 25962 + }, + { + "epoch": 0.811375, + "grad_norm": 3.15625, + "grad_norm_var": 0.04673563639322917, + "learning_rate": 0.0001, + "loss": 5.7556, + "loss/crossentropy": 2.649569034576416, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16451101005077362, + "step": 25964 + }, + { + "epoch": 0.8114375, + "grad_norm": 3.171875, + "grad_norm_var": 0.043701171875, + "learning_rate": 0.0001, + "loss": 5.697, + "loss/crossentropy": 2.580646514892578, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16866681724786758, + "step": 25966 + }, + { + "epoch": 0.8115, + "grad_norm": 2.828125, + "grad_norm_var": 0.04605712890625, + "learning_rate": 0.0001, + "loss": 5.5413, + "loss/crossentropy": 2.5573774576187134, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15503410249948502, + "step": 25968 + }, + { + "epoch": 0.8115625, + "grad_norm": 3.21875, + "grad_norm_var": 0.03156636555989583, + "learning_rate": 0.0001, + "loss": 5.9362, + "loss/crossentropy": 2.795621633529663, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16562466323375702, + "step": 25970 + }, + { + "epoch": 0.811625, + "grad_norm": 3.015625, + "grad_norm_var": 0.031794230143229164, + "learning_rate": 0.0001, + "loss": 5.5216, + "loss/crossentropy": 2.500490665435791, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.16109219193458557, + "step": 25972 + }, + { + "epoch": 0.8116875, + "grad_norm": 3.25, + "grad_norm_var": 0.027904256184895834, + "learning_rate": 0.0001, + "loss": 5.5786, + "loss/crossentropy": 2.542115569114685, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1595092937350273, + "step": 25974 + }, + { + "epoch": 0.81175, + "grad_norm": 3.0625, + "grad_norm_var": 0.018245442708333334, + "learning_rate": 0.0001, + "loss": 5.5512, + "loss/crossentropy": 2.539905309677124, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.16011208295822144, + "step": 25976 + }, + { + "epoch": 0.8118125, + "grad_norm": 2.90625, + "grad_norm_var": 0.015165201822916667, + "learning_rate": 0.0001, + "loss": 5.6823, + "loss/crossentropy": 2.6318790912628174, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.1632421314716339, + "step": 25978 + }, + { + "epoch": 0.811875, + "grad_norm": 2.75, + "grad_norm_var": 0.0322906494140625, + "learning_rate": 0.0001, + "loss": 5.6499, + "loss/crossentropy": 2.562356114387512, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16383300721645355, + "step": 25980 + }, + { + "epoch": 0.8119375, + "grad_norm": 3.09375, + "grad_norm_var": 0.032079060872395836, + "learning_rate": 0.0001, + "loss": 5.4906, + "loss/crossentropy": 2.507889151573181, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.15842348337173462, + "step": 25982 + }, + { + "epoch": 0.812, + "grad_norm": 2.859375, + "grad_norm_var": 0.03125, + "learning_rate": 0.0001, + "loss": 5.5122, + "loss/crossentropy": 2.4938517808914185, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1580801010131836, + "step": 25984 + }, + { + "epoch": 0.8120625, + "grad_norm": 3.171875, + "grad_norm_var": 0.0365386962890625, + "learning_rate": 0.0001, + "loss": 5.709, + "loss/crossentropy": 2.6493401527404785, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1637757271528244, + "step": 25986 + }, + { + "epoch": 0.812125, + "grad_norm": 3.375, + "grad_norm_var": 0.04219462076822917, + "learning_rate": 0.0001, + "loss": 5.5582, + "loss/crossentropy": 2.431844115257263, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16536659747362137, + "step": 25988 + }, + { + "epoch": 0.8121875, + "grad_norm": 3.0, + "grad_norm_var": 0.039013671875, + "learning_rate": 0.0001, + "loss": 5.3936, + "loss/crossentropy": 2.386796236038208, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.15888264775276184, + "step": 25990 + }, + { + "epoch": 0.81225, + "grad_norm": 2.921875, + "grad_norm_var": 0.04033203125, + "learning_rate": 0.0001, + "loss": 5.4206, + "loss/crossentropy": 2.414778470993042, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15996189415454865, + "step": 25992 + }, + { + "epoch": 0.8123125, + "grad_norm": 2.984375, + "grad_norm_var": 0.0384918212890625, + "learning_rate": 0.0001, + "loss": 5.3062, + "loss/crossentropy": 2.351076602935791, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1529332399368286, + "step": 25994 + }, + { + "epoch": 0.812375, + "grad_norm": 3.078125, + "grad_norm_var": 0.020319620768229168, + "learning_rate": 0.0001, + "loss": 5.3583, + "loss/crossentropy": 2.4164395332336426, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.1535656899213791, + "step": 25996 + }, + { + "epoch": 0.8124375, + "grad_norm": 3.03125, + "grad_norm_var": 0.021312459309895834, + "learning_rate": 0.0001, + "loss": 5.9078, + "loss/crossentropy": 2.7739639282226562, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16690263152122498, + "step": 25998 + }, + { + "epoch": 0.8125, + "grad_norm": 3.140625, + "grad_norm_var": 0.021305338541666666, + "learning_rate": 0.0001, + "loss": 5.6046, + "loss/crossentropy": 2.5973480939865112, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1550179421901703, + "step": 26000 + }, + { + "epoch": 0.8125625, + "grad_norm": 3.34375, + "grad_norm_var": 0.020601399739583335, + "learning_rate": 0.0001, + "loss": 5.6036, + "loss/crossentropy": 2.5490570068359375, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16209101676940918, + "step": 26002 + }, + { + "epoch": 0.812625, + "grad_norm": 3.40625, + "grad_norm_var": 0.023958333333333335, + "learning_rate": 0.0001, + "loss": 5.6212, + "loss/crossentropy": 2.565270185470581, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15949514508247375, + "step": 26004 + }, + { + "epoch": 0.8126875, + "grad_norm": 3.203125, + "grad_norm_var": 0.026537068684895835, + "learning_rate": 0.0001, + "loss": 5.4849, + "loss/crossentropy": 2.472626805305481, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15512911975383759, + "step": 26006 + }, + { + "epoch": 0.81275, + "grad_norm": 3.0, + "grad_norm_var": 0.031689453125, + "learning_rate": 0.0001, + "loss": 5.3621, + "loss/crossentropy": 2.368017077445984, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.14863138645887375, + "step": 26008 + }, + { + "epoch": 0.8128125, + "grad_norm": 2.953125, + "grad_norm_var": 0.03704427083333333, + "learning_rate": 0.0001, + "loss": 5.7561, + "loss/crossentropy": 2.593183994293213, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16981104016304016, + "step": 26010 + }, + { + "epoch": 0.812875, + "grad_norm": 3.21875, + "grad_norm_var": 0.03508199055989583, + "learning_rate": 0.0001, + "loss": 5.9786, + "loss/crossentropy": 2.7332016229629517, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17375550419092178, + "step": 26012 + }, + { + "epoch": 0.8129375, + "grad_norm": 3.1875, + "grad_norm_var": 0.035033162434895834, + "learning_rate": 0.0001, + "loss": 5.4169, + "loss/crossentropy": 2.36329984664917, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1596594601869583, + "step": 26014 + }, + { + "epoch": 0.813, + "grad_norm": 2.84375, + "grad_norm_var": 0.037385050455729166, + "learning_rate": 0.0001, + "loss": 5.3814, + "loss/crossentropy": 2.4783889055252075, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.14811166375875473, + "step": 26016 + }, + { + "epoch": 0.8130625, + "grad_norm": 3.25, + "grad_norm_var": 0.03621317545572917, + "learning_rate": 0.0001, + "loss": 5.9081, + "loss/crossentropy": 2.695285677909851, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17245175689458847, + "step": 26018 + }, + { + "epoch": 0.813125, + "grad_norm": 2.921875, + "grad_norm_var": 0.030354817708333332, + "learning_rate": 0.0001, + "loss": 5.761, + "loss/crossentropy": 2.6396515369415283, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1660393923521042, + "step": 26020 + }, + { + "epoch": 0.8131875, + "grad_norm": 3.234375, + "grad_norm_var": 0.03125712076822917, + "learning_rate": 0.0001, + "loss": 5.8516, + "loss/crossentropy": 2.702316403388977, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16882963478565216, + "step": 26022 + }, + { + "epoch": 0.81325, + "grad_norm": 3.171875, + "grad_norm_var": 0.030989583333333334, + "learning_rate": 0.0001, + "loss": 5.5435, + "loss/crossentropy": 2.5398088693618774, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15662306547164917, + "step": 26024 + }, + { + "epoch": 0.8133125, + "grad_norm": 3.59375, + "grad_norm_var": 0.04273681640625, + "learning_rate": 0.0001, + "loss": 5.7943, + "loss/crossentropy": 2.6339192390441895, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16525793820619583, + "step": 26026 + }, + { + "epoch": 0.813375, + "grad_norm": 3.125, + "grad_norm_var": 0.0432037353515625, + "learning_rate": 0.0001, + "loss": 5.9342, + "loss/crossentropy": 2.775349497795105, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17018358409404755, + "step": 26028 + }, + { + "epoch": 0.8134375, + "grad_norm": 3.0, + "grad_norm_var": 0.04478251139322917, + "learning_rate": 0.0001, + "loss": 5.6793, + "loss/crossentropy": 2.548333764076233, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1677834466099739, + "step": 26030 + }, + { + "epoch": 0.8135, + "grad_norm": 2.96875, + "grad_norm_var": 0.04029541015625, + "learning_rate": 0.0001, + "loss": 5.6844, + "loss/crossentropy": 2.603028178215027, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1612669825553894, + "step": 26032 + }, + { + "epoch": 0.8135625, + "grad_norm": 3.0, + "grad_norm_var": 0.03905843098958333, + "learning_rate": 0.0001, + "loss": 5.4034, + "loss/crossentropy": 2.386218786239624, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15953043848276138, + "step": 26034 + }, + { + "epoch": 0.813625, + "grad_norm": 3.046875, + "grad_norm_var": 0.039159138997395836, + "learning_rate": 0.0001, + "loss": 5.4987, + "loss/crossentropy": 2.523100256919861, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.157719187438488, + "step": 26036 + }, + { + "epoch": 0.8136875, + "grad_norm": 3.078125, + "grad_norm_var": 0.036946614583333336, + "learning_rate": 0.0001, + "loss": 5.7258, + "loss/crossentropy": 2.5546183586120605, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16711720824241638, + "step": 26038 + }, + { + "epoch": 0.81375, + "grad_norm": 3.140625, + "grad_norm_var": 0.02896728515625, + "learning_rate": 0.0001, + "loss": 5.867, + "loss/crossentropy": 2.7451947927474976, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16335587948560715, + "step": 26040 + }, + { + "epoch": 0.8138125, + "grad_norm": 2.859375, + "grad_norm_var": 0.016991170247395833, + "learning_rate": 0.0001, + "loss": 5.3652, + "loss/crossentropy": 2.373391032218933, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15542739629745483, + "step": 26042 + }, + { + "epoch": 0.813875, + "grad_norm": 2.796875, + "grad_norm_var": 0.020308430989583334, + "learning_rate": 0.0001, + "loss": 5.5794, + "loss/crossentropy": 2.56760573387146, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15977203845977783, + "step": 26044 + }, + { + "epoch": 0.8139375, + "grad_norm": 2.90625, + "grad_norm_var": 0.018094889322916665, + "learning_rate": 0.0001, + "loss": 5.7479, + "loss/crossentropy": 2.640484571456909, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1662125587463379, + "step": 26046 + }, + { + "epoch": 0.814, + "grad_norm": 3.0625, + "grad_norm_var": 0.017780558268229166, + "learning_rate": 0.0001, + "loss": 5.4578, + "loss/crossentropy": 2.455212354660034, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15494395792484283, + "step": 26048 + }, + { + "epoch": 0.8140625, + "grad_norm": 3.390625, + "grad_norm_var": 0.04582926432291667, + "learning_rate": 0.0001, + "loss": 5.8634, + "loss/crossentropy": 2.6449553966522217, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17145580798387527, + "step": 26050 + }, + { + "epoch": 0.814125, + "grad_norm": 3.53125, + "grad_norm_var": 0.053278605143229164, + "learning_rate": 0.0001, + "loss": 5.5121, + "loss/crossentropy": 2.373735785484314, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.16149048507213593, + "step": 26052 + }, + { + "epoch": 0.8141875, + "grad_norm": 3.265625, + "grad_norm_var": 0.05279541015625, + "learning_rate": 0.0001, + "loss": 5.3452, + "loss/crossentropy": 2.277380883693695, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1614658087491989, + "step": 26054 + }, + { + "epoch": 0.81425, + "grad_norm": 2.875, + "grad_norm_var": 0.058577473958333334, + "learning_rate": 0.0001, + "loss": 5.5921, + "loss/crossentropy": 2.530734062194824, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16238905489444733, + "step": 26056 + }, + { + "epoch": 0.8143125, + "grad_norm": 3.15625, + "grad_norm_var": 0.06541341145833333, + "learning_rate": 0.0001, + "loss": 5.2169, + "loss/crossentropy": 2.275343418121338, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15196342766284943, + "step": 26058 + }, + { + "epoch": 0.814375, + "grad_norm": 2.9375, + "grad_norm_var": 0.0645416259765625, + "learning_rate": 0.0001, + "loss": 5.4366, + "loss/crossentropy": 2.3882514238357544, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15952109545469284, + "step": 26060 + }, + { + "epoch": 0.8144375, + "grad_norm": 2.921875, + "grad_norm_var": 0.0640289306640625, + "learning_rate": 0.0001, + "loss": 5.5776, + "loss/crossentropy": 2.522711992263794, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1609606221318245, + "step": 26062 + }, + { + "epoch": 0.8145, + "grad_norm": 3.28125, + "grad_norm_var": 0.0617584228515625, + "learning_rate": 0.0001, + "loss": 5.7144, + "loss/crossentropy": 2.5757981538772583, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16659953445196152, + "step": 26064 + }, + { + "epoch": 0.8145625, + "grad_norm": 3.375, + "grad_norm_var": 0.05034077962239583, + "learning_rate": 0.0001, + "loss": 5.3892, + "loss/crossentropy": 2.3622301816940308, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1569949835538864, + "step": 26066 + }, + { + "epoch": 0.814625, + "grad_norm": 2.84375, + "grad_norm_var": 0.04413960774739583, + "learning_rate": 0.0001, + "loss": 5.6939, + "loss/crossentropy": 2.5889511108398438, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16440598666667938, + "step": 26068 + }, + { + "epoch": 0.8146875, + "grad_norm": 3.046875, + "grad_norm_var": 0.04318033854166667, + "learning_rate": 0.0001, + "loss": 6.1497, + "loss/crossentropy": 2.917039155960083, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17444155365228653, + "step": 26070 + }, + { + "epoch": 0.81475, + "grad_norm": 3.0625, + "grad_norm_var": 0.040751139322916664, + "learning_rate": 0.0001, + "loss": 5.2042, + "loss/crossentropy": 2.2803984880447388, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1458997204899788, + "step": 26072 + }, + { + "epoch": 0.8148125, + "grad_norm": 3.0625, + "grad_norm_var": 0.032591756184895834, + "learning_rate": 0.0001, + "loss": 5.5049, + "loss/crossentropy": 2.3861804008483887, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16265819221735, + "step": 26074 + }, + { + "epoch": 0.814875, + "grad_norm": 3.4375, + "grad_norm_var": 0.030915323893229166, + "learning_rate": 0.0001, + "loss": 5.6712, + "loss/crossentropy": 2.4557416439056396, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17115310579538345, + "step": 26076 + }, + { + "epoch": 0.8149375, + "grad_norm": 3.03125, + "grad_norm_var": 0.06640218098958334, + "learning_rate": 0.0001, + "loss": 5.8032, + "loss/crossentropy": 2.7046096324920654, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16533035784959793, + "step": 26078 + }, + { + "epoch": 0.815, + "grad_norm": 3.6875, + "grad_norm_var": 0.08331705729166666, + "learning_rate": 0.0001, + "loss": 5.6713, + "loss/crossentropy": 2.5506922006607056, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1683076173067093, + "step": 26080 + }, + { + "epoch": 0.8150625, + "grad_norm": 3.0625, + "grad_norm_var": 0.08404947916666666, + "learning_rate": 0.0001, + "loss": 5.6315, + "loss/crossentropy": 2.5789612531661987, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16189821809530258, + "step": 26082 + }, + { + "epoch": 0.815125, + "grad_norm": 3.40625, + "grad_norm_var": 0.07544657389322916, + "learning_rate": 0.0001, + "loss": 5.5345, + "loss/crossentropy": 2.4583956003189087, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16191180050373077, + "step": 26084 + }, + { + "epoch": 0.8151875, + "grad_norm": 3.0625, + "grad_norm_var": 0.07571207682291667, + "learning_rate": 0.0001, + "loss": 5.69, + "loss/crossentropy": 2.5922963619232178, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16367991268634796, + "step": 26086 + }, + { + "epoch": 0.81525, + "grad_norm": 3.0625, + "grad_norm_var": 0.07688395182291667, + "learning_rate": 0.0001, + "loss": 5.7573, + "loss/crossentropy": 2.63303804397583, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16437966376543045, + "step": 26088 + }, + { + "epoch": 0.8153125, + "grad_norm": 3.625, + "grad_norm_var": 0.08284403483072916, + "learning_rate": 0.0001, + "loss": 5.9082, + "loss/crossentropy": 2.746885895729065, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17004229128360748, + "step": 26090 + }, + { + "epoch": 0.815375, + "grad_norm": 3.03125, + "grad_norm_var": 0.09219462076822917, + "learning_rate": 0.0001, + "loss": 5.2256, + "loss/crossentropy": 2.322092294692993, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.15050920844078064, + "step": 26092 + }, + { + "epoch": 0.8154375, + "grad_norm": 3.125, + "grad_norm_var": 0.057291666666666664, + "learning_rate": 0.0001, + "loss": 5.364, + "loss/crossentropy": 2.370709538459778, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15519004315137863, + "step": 26094 + }, + { + "epoch": 0.8155, + "grad_norm": 3.171875, + "grad_norm_var": 0.03404032389322917, + "learning_rate": 0.0001, + "loss": 5.2883, + "loss/crossentropy": 2.3643643856048584, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.14551495760679245, + "step": 26096 + }, + { + "epoch": 0.8155625, + "grad_norm": 3.1875, + "grad_norm_var": 0.030321248372395835, + "learning_rate": 0.0001, + "loss": 5.8893, + "loss/crossentropy": 2.7373058795928955, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16989066451787949, + "step": 26098 + }, + { + "epoch": 0.815625, + "grad_norm": 3.15625, + "grad_norm_var": 0.02564697265625, + "learning_rate": 0.0001, + "loss": 5.6633, + "loss/crossentropy": 2.583295702934265, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16269079595804214, + "step": 26100 + }, + { + "epoch": 0.8156875, + "grad_norm": 2.9375, + "grad_norm_var": 0.027730305989583332, + "learning_rate": 0.0001, + "loss": 5.359, + "loss/crossentropy": 2.4208203554153442, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.14967238903045654, + "step": 26102 + }, + { + "epoch": 0.81575, + "grad_norm": 2.921875, + "grad_norm_var": 0.030589803059895834, + "learning_rate": 0.0001, + "loss": 5.5604, + "loss/crossentropy": 2.5293065309524536, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15701773017644882, + "step": 26104 + }, + { + "epoch": 0.8158125, + "grad_norm": 3.0, + "grad_norm_var": 0.010965983072916666, + "learning_rate": 0.0001, + "loss": 5.4673, + "loss/crossentropy": 2.474220871925354, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15516993403434753, + "step": 26106 + }, + { + "epoch": 0.815875, + "grad_norm": 3.046875, + "grad_norm_var": 0.0093658447265625, + "learning_rate": 0.0001, + "loss": 5.3832, + "loss/crossentropy": 2.3631885051727295, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1578560248017311, + "step": 26108 + }, + { + "epoch": 0.8159375, + "grad_norm": 3.296875, + "grad_norm_var": 0.018359375, + "learning_rate": 0.0001, + "loss": 4.843, + "loss/crossentropy": 2.020129442214966, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.14127205312252045, + "step": 26110 + }, + { + "epoch": 0.816, + "grad_norm": 3.359375, + "grad_norm_var": 0.0252838134765625, + "learning_rate": 0.0001, + "loss": 5.6984, + "loss/crossentropy": 2.670602560043335, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16058924794197083, + "step": 26112 + }, + { + "epoch": 0.8160625, + "grad_norm": 3.109375, + "grad_norm_var": 0.024583943684895835, + "learning_rate": 0.0001, + "loss": 5.6156, + "loss/crossentropy": 2.510475754737854, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16676156967878342, + "step": 26114 + }, + { + "epoch": 0.816125, + "grad_norm": 3.703125, + "grad_norm_var": 0.05015360514322917, + "learning_rate": 0.0001, + "loss": 5.7941, + "loss/crossentropy": 2.5985480546951294, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1715070903301239, + "step": 26116 + }, + { + "epoch": 0.8161875, + "grad_norm": 3.109375, + "grad_norm_var": 0.048924763997395836, + "learning_rate": 0.0001, + "loss": 5.9325, + "loss/crossentropy": 2.7597391605377197, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17001105099916458, + "step": 26118 + }, + { + "epoch": 0.81625, + "grad_norm": 2.96875, + "grad_norm_var": 0.050690714518229166, + "learning_rate": 0.0001, + "loss": 5.7365, + "loss/crossentropy": 2.629301905632019, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16267205029726028, + "step": 26120 + }, + { + "epoch": 0.8163125, + "grad_norm": 3.0, + "grad_norm_var": 0.048173014322916666, + "learning_rate": 0.0001, + "loss": 5.4366, + "loss/crossentropy": 2.4595502614974976, + "loss/hidden": 1.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.15746787190437317, + "step": 26122 + }, + { + "epoch": 0.816375, + "grad_norm": 2.828125, + "grad_norm_var": 0.05196024576822917, + "learning_rate": 0.0001, + "loss": 5.6512, + "loss/crossentropy": 2.585309863090515, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16362079232931137, + "step": 26124 + }, + { + "epoch": 0.8164375, + "grad_norm": 3.5, + "grad_norm_var": 0.05559488932291667, + "learning_rate": 0.0001, + "loss": 5.9476, + "loss/crossentropy": 2.749737501144409, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17212870717048645, + "step": 26126 + }, + { + "epoch": 0.8165, + "grad_norm": 2.96875, + "grad_norm_var": 0.05279541015625, + "learning_rate": 0.0001, + "loss": 5.5055, + "loss/crossentropy": 2.5037710666656494, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15407906472682953, + "step": 26128 + }, + { + "epoch": 0.8165625, + "grad_norm": 3.140625, + "grad_norm_var": 0.05406494140625, + "learning_rate": 0.0001, + "loss": 5.8976, + "loss/crossentropy": 2.7795106172561646, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16884027421474457, + "step": 26130 + }, + { + "epoch": 0.816625, + "grad_norm": 3.03125, + "grad_norm_var": 0.0371002197265625, + "learning_rate": 0.0001, + "loss": 5.8581, + "loss/crossentropy": 2.699077606201172, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16785462200641632, + "step": 26132 + }, + { + "epoch": 0.8166875, + "grad_norm": 3.265625, + "grad_norm_var": 0.03968098958333333, + "learning_rate": 0.0001, + "loss": 5.8573, + "loss/crossentropy": 2.6877238750457764, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16890715807676315, + "step": 26134 + }, + { + "epoch": 0.81675, + "grad_norm": 2.96875, + "grad_norm_var": 0.04016825358072917, + "learning_rate": 0.0001, + "loss": 5.4469, + "loss/crossentropy": 2.3794307708740234, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15947842597961426, + "step": 26136 + }, + { + "epoch": 0.8168125, + "grad_norm": 2.9375, + "grad_norm_var": 0.043440755208333334, + "learning_rate": 0.0001, + "loss": 5.3964, + "loss/crossentropy": 2.3684866428375244, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15787289291620255, + "step": 26138 + }, + { + "epoch": 0.816875, + "grad_norm": 3.015625, + "grad_norm_var": 0.03884175618489583, + "learning_rate": 0.0001, + "loss": 5.5438, + "loss/crossentropy": 2.532816171646118, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15656939148902893, + "step": 26140 + }, + { + "epoch": 0.8169375, + "grad_norm": 3.21875, + "grad_norm_var": 0.025288899739583332, + "learning_rate": 0.0001, + "loss": 5.4799, + "loss/crossentropy": 2.3790860176086426, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16203491389751434, + "step": 26142 + }, + { + "epoch": 0.817, + "grad_norm": 2.90625, + "grad_norm_var": 0.024348958333333334, + "learning_rate": 0.0001, + "loss": 5.6526, + "loss/crossentropy": 2.560869574546814, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16268550604581833, + "step": 26144 + }, + { + "epoch": 0.8170625, + "grad_norm": 3.296875, + "grad_norm_var": 0.031834920247395836, + "learning_rate": 0.0001, + "loss": 5.5227, + "loss/crossentropy": 2.5373018980026245, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15791461616754532, + "step": 26146 + }, + { + "epoch": 0.817125, + "grad_norm": 3.0625, + "grad_norm_var": 0.025316365559895835, + "learning_rate": 0.0001, + "loss": 5.4378, + "loss/crossentropy": 2.4222248792648315, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15741781145334244, + "step": 26148 + }, + { + "epoch": 0.8171875, + "grad_norm": 2.984375, + "grad_norm_var": 0.024470011393229168, + "learning_rate": 0.0001, + "loss": 5.6441, + "loss/crossentropy": 2.5666340589523315, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16516925394535065, + "step": 26150 + }, + { + "epoch": 0.81725, + "grad_norm": 3.015625, + "grad_norm_var": 0.021610514322916666, + "learning_rate": 0.0001, + "loss": 5.599, + "loss/crossentropy": 2.5882219076156616, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1530270129442215, + "step": 26152 + }, + { + "epoch": 0.8173125, + "grad_norm": 2.921875, + "grad_norm_var": 0.05354715983072917, + "learning_rate": 0.0001, + "loss": 5.3402, + "loss/crossentropy": 2.3540260791778564, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15291216224431992, + "step": 26154 + }, + { + "epoch": 0.817375, + "grad_norm": 3.046875, + "grad_norm_var": 0.054423014322916664, + "learning_rate": 0.0001, + "loss": 5.6191, + "loss/crossentropy": 2.5458675622940063, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1604481339454651, + "step": 26156 + }, + { + "epoch": 0.8174375, + "grad_norm": 3.0625, + "grad_norm_var": 0.053059895833333336, + "learning_rate": 0.0001, + "loss": 5.5605, + "loss/crossentropy": 2.513525128364563, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15899336338043213, + "step": 26158 + }, + { + "epoch": 0.8175, + "grad_norm": 3.265625, + "grad_norm_var": 0.0528228759765625, + "learning_rate": 0.0001, + "loss": 5.4506, + "loss/crossentropy": 2.376926302909851, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16010113060474396, + "step": 26160 + }, + { + "epoch": 0.8175625, + "grad_norm": 3.0625, + "grad_norm_var": 0.04463602701822917, + "learning_rate": 0.0001, + "loss": 5.7315, + "loss/crossentropy": 2.590888023376465, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1640632301568985, + "step": 26162 + }, + { + "epoch": 0.817625, + "grad_norm": 2.9375, + "grad_norm_var": 0.0499908447265625, + "learning_rate": 0.0001, + "loss": 5.3026, + "loss/crossentropy": 2.4341611862182617, + "loss/hidden": 1.38671875, + "loss/jsd": 0.0, + "loss/logits": 0.14817240089178085, + "step": 26164 + }, + { + "epoch": 0.8176875, + "grad_norm": 2.890625, + "grad_norm_var": 0.055322265625, + "learning_rate": 0.0001, + "loss": 5.4102, + "loss/crossentropy": 2.4750607013702393, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.1524970903992653, + "step": 26166 + }, + { + "epoch": 0.81775, + "grad_norm": 3.15625, + "grad_norm_var": 0.0527496337890625, + "learning_rate": 0.0001, + "loss": 5.691, + "loss/crossentropy": 2.6213563680648804, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1620451584458351, + "step": 26168 + }, + { + "epoch": 0.8178125, + "grad_norm": 3.34375, + "grad_norm_var": 0.022554524739583335, + "learning_rate": 0.0001, + "loss": 5.7447, + "loss/crossentropy": 2.6336199045181274, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16501329839229584, + "step": 26170 + }, + { + "epoch": 0.817875, + "grad_norm": 3.203125, + "grad_norm_var": 0.022606404622395833, + "learning_rate": 0.0001, + "loss": 5.7543, + "loss/crossentropy": 2.610470414161682, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16594266146421432, + "step": 26172 + }, + { + "epoch": 0.8179375, + "grad_norm": 3.0625, + "grad_norm_var": 0.023078409830729167, + "learning_rate": 0.0001, + "loss": 5.7982, + "loss/crossentropy": 2.6537013053894043, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16913538426160812, + "step": 26174 + }, + { + "epoch": 0.818, + "grad_norm": 3.28125, + "grad_norm_var": 0.026590983072916668, + "learning_rate": 0.0001, + "loss": 5.8476, + "loss/crossentropy": 2.6743801832199097, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1704486906528473, + "step": 26176 + }, + { + "epoch": 0.8180625, + "grad_norm": 3.078125, + "grad_norm_var": 0.026102701822916668, + "learning_rate": 0.0001, + "loss": 5.4365, + "loss/crossentropy": 2.379012107849121, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16200068593025208, + "step": 26178 + }, + { + "epoch": 0.818125, + "grad_norm": 2.96875, + "grad_norm_var": 0.021068318684895834, + "learning_rate": 0.0001, + "loss": 5.6285, + "loss/crossentropy": 2.5713045597076416, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1565016508102417, + "step": 26180 + }, + { + "epoch": 0.8181875, + "grad_norm": 2.78125, + "grad_norm_var": 0.024152628580729165, + "learning_rate": 0.0001, + "loss": 5.4027, + "loss/crossentropy": 2.5099167823791504, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.14865048229694366, + "step": 26182 + }, + { + "epoch": 0.81825, + "grad_norm": 3.140625, + "grad_norm_var": 0.0400054931640625, + "learning_rate": 0.0001, + "loss": 5.6903, + "loss/crossentropy": 2.5764319896698, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16334353387355804, + "step": 26184 + }, + { + "epoch": 0.8183125, + "grad_norm": 3.046875, + "grad_norm_var": 0.03680013020833333, + "learning_rate": 0.0001, + "loss": 5.3066, + "loss/crossentropy": 2.2904746532440186, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15474095940589905, + "step": 26186 + }, + { + "epoch": 0.818375, + "grad_norm": 3.328125, + "grad_norm_var": 0.06179097493489583, + "learning_rate": 0.0001, + "loss": 5.9701, + "loss/crossentropy": 2.688890814781189, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1785091534256935, + "step": 26188 + }, + { + "epoch": 0.8184375, + "grad_norm": 3.421875, + "grad_norm_var": 0.06430562337239583, + "learning_rate": 0.0001, + "loss": 5.6174, + "loss/crossentropy": 2.592252016067505, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16033123433589935, + "step": 26190 + }, + { + "epoch": 0.8185, + "grad_norm": 3.140625, + "grad_norm_var": 0.067333984375, + "learning_rate": 0.0001, + "loss": 5.6458, + "loss/crossentropy": 2.599801540374756, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16241006553173065, + "step": 26192 + }, + { + "epoch": 0.8185625, + "grad_norm": 3.640625, + "grad_norm_var": 0.08297119140625, + "learning_rate": 0.0001, + "loss": 5.5937, + "loss/crossentropy": 2.4990642070770264, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16376066952943802, + "step": 26194 + }, + { + "epoch": 0.818625, + "grad_norm": 3.125, + "grad_norm_var": 0.08010965983072917, + "learning_rate": 0.0001, + "loss": 5.5154, + "loss/crossentropy": 2.4857006072998047, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1592223197221756, + "step": 26196 + }, + { + "epoch": 0.8186875, + "grad_norm": 3.453125, + "grad_norm_var": 0.05854390462239583, + "learning_rate": 0.0001, + "loss": 5.7373, + "loss/crossentropy": 2.541812300682068, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17306101322174072, + "step": 26198 + }, + { + "epoch": 0.81875, + "grad_norm": 3.078125, + "grad_norm_var": 0.06412760416666667, + "learning_rate": 0.0001, + "loss": 5.3927, + "loss/crossentropy": 2.4223748445510864, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.1571846753358841, + "step": 26200 + }, + { + "epoch": 0.8188125, + "grad_norm": 3.4375, + "grad_norm_var": 0.06555989583333334, + "learning_rate": 0.0001, + "loss": 5.8268, + "loss/crossentropy": 2.6781102418899536, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16526120901107788, + "step": 26202 + }, + { + "epoch": 0.818875, + "grad_norm": 3.015625, + "grad_norm_var": 0.05287984212239583, + "learning_rate": 0.0001, + "loss": 5.3714, + "loss/crossentropy": 2.4879703521728516, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.14849692583084106, + "step": 26204 + }, + { + "epoch": 0.8189375, + "grad_norm": 2.828125, + "grad_norm_var": 0.05705973307291667, + "learning_rate": 0.0001, + "loss": 5.3034, + "loss/crossentropy": 2.3397125005722046, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.15652088820934296, + "step": 26206 + }, + { + "epoch": 0.819, + "grad_norm": 2.859375, + "grad_norm_var": 0.06199442545572917, + "learning_rate": 0.0001, + "loss": 5.4446, + "loss/crossentropy": 2.4718334674835205, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1503976359963417, + "step": 26208 + }, + { + "epoch": 0.8190625, + "grad_norm": 3.171875, + "grad_norm_var": 0.03916015625, + "learning_rate": 0.0001, + "loss": 5.6137, + "loss/crossentropy": 2.5755761861801147, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1600641831755638, + "step": 26210 + }, + { + "epoch": 0.819125, + "grad_norm": 2.984375, + "grad_norm_var": 0.038916015625, + "learning_rate": 0.0001, + "loss": 5.432, + "loss/crossentropy": 2.433050751686096, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15575602650642395, + "step": 26212 + }, + { + "epoch": 0.8191875, + "grad_norm": 3.0, + "grad_norm_var": 0.023714192708333335, + "learning_rate": 0.0001, + "loss": 5.5737, + "loss/crossentropy": 2.5103474855422974, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16297713667154312, + "step": 26214 + }, + { + "epoch": 0.81925, + "grad_norm": 4.40625, + "grad_norm_var": 0.14511311848958333, + "learning_rate": 0.0001, + "loss": 5.8492, + "loss/crossentropy": 2.643450140953064, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17096712440252304, + "step": 26216 + }, + { + "epoch": 0.8193125, + "grad_norm": 3.1875, + "grad_norm_var": 0.13736979166666666, + "learning_rate": 0.0001, + "loss": 5.703, + "loss/crossentropy": 2.5356518030166626, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16712727397680283, + "step": 26218 + }, + { + "epoch": 0.819375, + "grad_norm": 2.765625, + "grad_norm_var": 0.14334208170572918, + "learning_rate": 0.0001, + "loss": 5.5674, + "loss/crossentropy": 2.5801234245300293, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15732340514659882, + "step": 26220 + }, + { + "epoch": 0.8194375, + "grad_norm": 2.953125, + "grad_norm_var": 0.27905171712239585, + "learning_rate": 0.0001, + "loss": 5.5043, + "loss/crossentropy": 2.4334503412246704, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15903393179178238, + "step": 26222 + }, + { + "epoch": 0.8195, + "grad_norm": 2.859375, + "grad_norm_var": 0.27244364420572914, + "learning_rate": 0.0001, + "loss": 5.3334, + "loss/crossentropy": 2.3872623443603516, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15008265525102615, + "step": 26224 + }, + { + "epoch": 0.8195625, + "grad_norm": 3.296875, + "grad_norm_var": 0.271826171875, + "learning_rate": 0.0001, + "loss": 5.7474, + "loss/crossentropy": 2.575577139854431, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.17264865338802338, + "step": 26226 + }, + { + "epoch": 0.819625, + "grad_norm": 3.375, + "grad_norm_var": 0.27444254557291664, + "learning_rate": 0.0001, + "loss": 5.469, + "loss/crossentropy": 2.4319013357162476, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15761180222034454, + "step": 26228 + }, + { + "epoch": 0.8196875, + "grad_norm": 3.265625, + "grad_norm_var": 0.26929931640625, + "learning_rate": 0.0001, + "loss": 5.7769, + "loss/crossentropy": 2.641359329223633, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16746356338262558, + "step": 26230 + }, + { + "epoch": 0.81975, + "grad_norm": 3.0, + "grad_norm_var": 0.1694976806640625, + "learning_rate": 0.0001, + "loss": 5.689, + "loss/crossentropy": 2.6191372871398926, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1597168892621994, + "step": 26232 + }, + { + "epoch": 0.8198125, + "grad_norm": 2.96875, + "grad_norm_var": 0.1767730712890625, + "learning_rate": 0.0001, + "loss": 5.2413, + "loss/crossentropy": 2.310689330101013, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15087038278579712, + "step": 26234 + }, + { + "epoch": 0.819875, + "grad_norm": 3.0625, + "grad_norm_var": 0.16747639973958334, + "learning_rate": 0.0001, + "loss": 5.6303, + "loss/crossentropy": 2.503048062324524, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1658461093902588, + "step": 26236 + }, + { + "epoch": 0.8199375, + "grad_norm": 2.765625, + "grad_norm_var": 0.03764546712239583, + "learning_rate": 0.0001, + "loss": 5.5754, + "loss/crossentropy": 2.5644038915634155, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15774518996477127, + "step": 26238 + }, + { + "epoch": 0.82, + "grad_norm": 3.09375, + "grad_norm_var": 0.03371480305989583, + "learning_rate": 0.0001, + "loss": 5.5648, + "loss/crossentropy": 2.482118010520935, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16334488987922668, + "step": 26240 + }, + { + "epoch": 0.8200625, + "grad_norm": 3.109375, + "grad_norm_var": 0.033036295572916666, + "learning_rate": 0.0001, + "loss": 5.4303, + "loss/crossentropy": 2.4456228017807007, + "loss/hidden": 1.390625, + "loss/jsd": 0.0, + "loss/logits": 0.15940222144126892, + "step": 26242 + }, + { + "epoch": 0.820125, + "grad_norm": 2.859375, + "grad_norm_var": 0.030501302083333334, + "learning_rate": 0.0001, + "loss": 5.8435, + "loss/crossentropy": 2.747655987739563, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1646673008799553, + "step": 26244 + }, + { + "epoch": 0.8201875, + "grad_norm": 3.03125, + "grad_norm_var": 0.030562337239583334, + "learning_rate": 0.0001, + "loss": 5.5265, + "loss/crossentropy": 2.4170150756835938, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1629014015197754, + "step": 26246 + }, + { + "epoch": 0.82025, + "grad_norm": 2.78125, + "grad_norm_var": 0.0355865478515625, + "learning_rate": 0.0001, + "loss": 5.4901, + "loss/crossentropy": 2.5667566061019897, + "loss/hidden": 1.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.15405111759901047, + "step": 26248 + }, + { + "epoch": 0.8203125, + "grad_norm": 2.984375, + "grad_norm_var": 0.0397857666015625, + "learning_rate": 0.0001, + "loss": 5.4788, + "loss/crossentropy": 2.5163642168045044, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15484175086021423, + "step": 26250 + }, + { + "epoch": 0.820375, + "grad_norm": 3.046875, + "grad_norm_var": 0.034684244791666666, + "learning_rate": 0.0001, + "loss": 5.7484, + "loss/crossentropy": 2.7177315950393677, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15853740274906158, + "step": 26252 + }, + { + "epoch": 0.8204375, + "grad_norm": 2.828125, + "grad_norm_var": 0.031298828125, + "learning_rate": 0.0001, + "loss": 5.4406, + "loss/crossentropy": 2.4661972522735596, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15213170647621155, + "step": 26254 + }, + { + "epoch": 0.8205, + "grad_norm": 2.90625, + "grad_norm_var": 0.03665364583333333, + "learning_rate": 0.0001, + "loss": 5.4724, + "loss/crossentropy": 2.4164193868637085, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16067221015691757, + "step": 26256 + }, + { + "epoch": 0.8205625, + "grad_norm": 3.375, + "grad_norm_var": 0.04504801432291667, + "learning_rate": 0.0001, + "loss": 5.8206, + "loss/crossentropy": 2.6252888441085815, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1695270612835884, + "step": 26258 + }, + { + "epoch": 0.820625, + "grad_norm": 2.953125, + "grad_norm_var": 0.0408599853515625, + "learning_rate": 0.0001, + "loss": 5.2859, + "loss/crossentropy": 2.355344533920288, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.14696674793958664, + "step": 26260 + }, + { + "epoch": 0.8206875, + "grad_norm": 2.875, + "grad_norm_var": 0.04544270833333333, + "learning_rate": 0.0001, + "loss": 5.494, + "loss/crossentropy": 2.482057571411133, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15744779258966446, + "step": 26262 + }, + { + "epoch": 0.82075, + "grad_norm": 2.953125, + "grad_norm_var": 0.041380818684895834, + "learning_rate": 0.0001, + "loss": 5.4578, + "loss/crossentropy": 2.4409984350204468, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.16027343273162842, + "step": 26264 + }, + { + "epoch": 0.8208125, + "grad_norm": 3.125, + "grad_norm_var": 0.03461812337239583, + "learning_rate": 0.0001, + "loss": 5.5687, + "loss/crossentropy": 2.4897130727767944, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16415006667375565, + "step": 26266 + }, + { + "epoch": 0.820875, + "grad_norm": 2.875, + "grad_norm_var": 0.03427327473958333, + "learning_rate": 0.0001, + "loss": 5.6886, + "loss/crossentropy": 2.5870476961135864, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16445458680391312, + "step": 26268 + }, + { + "epoch": 0.8209375, + "grad_norm": 3.5, + "grad_norm_var": 0.040038045247395834, + "learning_rate": 0.0001, + "loss": 5.8109, + "loss/crossentropy": 2.6275731325149536, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17067311704158783, + "step": 26270 + }, + { + "epoch": 0.821, + "grad_norm": 3.078125, + "grad_norm_var": 0.036458333333333336, + "learning_rate": 0.0001, + "loss": 5.6087, + "loss/crossentropy": 2.5324538946151733, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16309446096420288, + "step": 26272 + }, + { + "epoch": 0.8210625, + "grad_norm": 3.078125, + "grad_norm_var": 0.0330078125, + "learning_rate": 0.0001, + "loss": 6.1243, + "loss/crossentropy": 2.8941376209259033, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1730210930109024, + "step": 26274 + }, + { + "epoch": 0.821125, + "grad_norm": 3.3125, + "grad_norm_var": 0.0441314697265625, + "learning_rate": 0.0001, + "loss": 5.3515, + "loss/crossentropy": 2.359967350959778, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1499393805861473, + "step": 26276 + }, + { + "epoch": 0.8211875, + "grad_norm": 2.734375, + "grad_norm_var": 0.044970703125, + "learning_rate": 0.0001, + "loss": 5.8185, + "loss/crossentropy": 2.7468066215515137, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16107572615146637, + "step": 26278 + }, + { + "epoch": 0.82125, + "grad_norm": 3.125, + "grad_norm_var": 0.044066365559895834, + "learning_rate": 0.0001, + "loss": 5.808, + "loss/crossentropy": 2.70841383934021, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1634780615568161, + "step": 26280 + }, + { + "epoch": 0.8213125, + "grad_norm": 2.90625, + "grad_norm_var": 0.04649149576822917, + "learning_rate": 0.0001, + "loss": 5.2681, + "loss/crossentropy": 2.286479353904724, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15206390619277954, + "step": 26282 + }, + { + "epoch": 0.821375, + "grad_norm": 2.96875, + "grad_norm_var": 0.04403889973958333, + "learning_rate": 0.0001, + "loss": 5.7495, + "loss/crossentropy": 2.649899482727051, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16543245315551758, + "step": 26284 + }, + { + "epoch": 0.8214375, + "grad_norm": 3.0, + "grad_norm_var": 0.03518880208333333, + "learning_rate": 0.0001, + "loss": 5.33, + "loss/crossentropy": 2.421257972717285, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.14829303324222565, + "step": 26286 + }, + { + "epoch": 0.8215, + "grad_norm": 2.90625, + "grad_norm_var": 0.03608296712239583, + "learning_rate": 0.0001, + "loss": 5.6065, + "loss/crossentropy": 2.5276122093200684, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16219016909599304, + "step": 26288 + }, + { + "epoch": 0.8215625, + "grad_norm": 2.84375, + "grad_norm_var": 0.03323160807291667, + "learning_rate": 0.0001, + "loss": 5.6477, + "loss/crossentropy": 2.637739658355713, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15568828582763672, + "step": 26290 + }, + { + "epoch": 0.821625, + "grad_norm": 2.84375, + "grad_norm_var": 0.022782389322916666, + "learning_rate": 0.0001, + "loss": 5.397, + "loss/crossentropy": 2.4032782316207886, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15444639325141907, + "step": 26292 + }, + { + "epoch": 0.8216875, + "grad_norm": 3.0625, + "grad_norm_var": 0.017634073893229168, + "learning_rate": 0.0001, + "loss": 5.5938, + "loss/crossentropy": 2.527271032333374, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16055792570114136, + "step": 26294 + }, + { + "epoch": 0.82175, + "grad_norm": 3.109375, + "grad_norm_var": 0.023958333333333335, + "learning_rate": 0.0001, + "loss": 5.5984, + "loss/crossentropy": 2.5177271366119385, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16041366010904312, + "step": 26296 + }, + { + "epoch": 0.8218125, + "grad_norm": 3.078125, + "grad_norm_var": 0.021808878580729166, + "learning_rate": 0.0001, + "loss": 5.6578, + "loss/crossentropy": 2.5493358373641968, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16397219896316528, + "step": 26298 + }, + { + "epoch": 0.821875, + "grad_norm": 2.953125, + "grad_norm_var": 0.015355428059895834, + "learning_rate": 0.0001, + "loss": 5.6355, + "loss/crossentropy": 2.5864739418029785, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15919510275125504, + "step": 26300 + }, + { + "epoch": 0.8219375, + "grad_norm": 2.953125, + "grad_norm_var": 0.017317708333333334, + "learning_rate": 0.0001, + "loss": 5.4856, + "loss/crossentropy": 2.3710405826568604, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16263197362422943, + "step": 26302 + }, + { + "epoch": 0.822, + "grad_norm": 3.421875, + "grad_norm_var": 0.027595011393229167, + "learning_rate": 0.0001, + "loss": 5.6385, + "loss/crossentropy": 2.4434726238250732, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17184783518314362, + "step": 26304 + }, + { + "epoch": 0.8220625, + "grad_norm": 3.4375, + "grad_norm_var": 0.05213114420572917, + "learning_rate": 0.0001, + "loss": 6.1962, + "loss/crossentropy": 2.876863956451416, + "loss/hidden": 1.53125, + "loss/jsd": 0.0, + "loss/logits": 0.17880384624004364, + "step": 26306 + }, + { + "epoch": 0.822125, + "grad_norm": 3.203125, + "grad_norm_var": 0.04628804524739583, + "learning_rate": 0.0001, + "loss": 5.8211, + "loss/crossentropy": 2.7134053707122803, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1658443659543991, + "step": 26308 + }, + { + "epoch": 0.8221875, + "grad_norm": 3.25, + "grad_norm_var": 0.04149983723958333, + "learning_rate": 0.0001, + "loss": 5.7838, + "loss/crossentropy": 2.677572727203369, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16062747687101364, + "step": 26310 + }, + { + "epoch": 0.82225, + "grad_norm": 3.15625, + "grad_norm_var": 0.04345296223958333, + "learning_rate": 0.0001, + "loss": 5.3967, + "loss/crossentropy": 2.3640542030334473, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15443439036607742, + "step": 26312 + }, + { + "epoch": 0.8223125, + "grad_norm": 3.0625, + "grad_norm_var": 0.04263916015625, + "learning_rate": 0.0001, + "loss": 5.7215, + "loss/crossentropy": 2.6260221004486084, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1626776158809662, + "step": 26314 + }, + { + "epoch": 0.822375, + "grad_norm": 3.171875, + "grad_norm_var": 0.0373046875, + "learning_rate": 0.0001, + "loss": 5.8018, + "loss/crossentropy": 2.510248303413391, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17876074463129044, + "step": 26316 + }, + { + "epoch": 0.8224375, + "grad_norm": 2.78125, + "grad_norm_var": 0.04501953125, + "learning_rate": 0.0001, + "loss": 5.2927, + "loss/crossentropy": 2.311138153076172, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1536274254322052, + "step": 26318 + }, + { + "epoch": 0.8225, + "grad_norm": 3.21875, + "grad_norm_var": 0.041747029622395834, + "learning_rate": 0.0001, + "loss": 5.9675, + "loss/crossentropy": 2.7409067153930664, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17421995103359222, + "step": 26320 + }, + { + "epoch": 0.8225625, + "grad_norm": 3.25, + "grad_norm_var": 0.01923828125, + "learning_rate": 0.0001, + "loss": 5.4253, + "loss/crossentropy": 2.4497876167297363, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1537996083498001, + "step": 26322 + }, + { + "epoch": 0.822625, + "grad_norm": 2.75, + "grad_norm_var": 0.027730305989583332, + "learning_rate": 0.0001, + "loss": 5.7434, + "loss/crossentropy": 2.674960494041443, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.162706196308136, + "step": 26324 + }, + { + "epoch": 0.8226875, + "grad_norm": 3.5625, + "grad_norm_var": 0.04254150390625, + "learning_rate": 0.0001, + "loss": 5.6512, + "loss/crossentropy": 2.4605530500411987, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17062649875879288, + "step": 26326 + }, + { + "epoch": 0.82275, + "grad_norm": 2.984375, + "grad_norm_var": 0.04202067057291667, + "learning_rate": 0.0001, + "loss": 5.3649, + "loss/crossentropy": 2.400713562965393, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15189073979854584, + "step": 26328 + }, + { + "epoch": 0.8228125, + "grad_norm": 3.09375, + "grad_norm_var": 0.052144368489583336, + "learning_rate": 0.0001, + "loss": 5.4342, + "loss/crossentropy": 2.393681764602661, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15756287425756454, + "step": 26330 + }, + { + "epoch": 0.822875, + "grad_norm": 3.078125, + "grad_norm_var": 0.0521148681640625, + "learning_rate": 0.0001, + "loss": 5.6294, + "loss/crossentropy": 2.5355751514434814, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16250969469547272, + "step": 26332 + }, + { + "epoch": 0.8229375, + "grad_norm": 3.171875, + "grad_norm_var": 0.043553670247395836, + "learning_rate": 0.0001, + "loss": 5.8098, + "loss/crossentropy": 2.636785864830017, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1688680425286293, + "step": 26334 + }, + { + "epoch": 0.823, + "grad_norm": 3.03125, + "grad_norm_var": 0.04490458170572917, + "learning_rate": 0.0001, + "loss": 5.359, + "loss/crossentropy": 2.3224642276763916, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15794599056243896, + "step": 26336 + }, + { + "epoch": 0.8230625, + "grad_norm": 3.140625, + "grad_norm_var": 0.044977823893229164, + "learning_rate": 0.0001, + "loss": 5.5461, + "loss/crossentropy": 2.468558430671692, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16048458218574524, + "step": 26338 + }, + { + "epoch": 0.823125, + "grad_norm": 3.03125, + "grad_norm_var": 0.03583577473958333, + "learning_rate": 0.0001, + "loss": 5.6669, + "loss/crossentropy": 2.578675150871277, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16273091733455658, + "step": 26340 + }, + { + "epoch": 0.8231875, + "grad_norm": 3.109375, + "grad_norm_var": 0.021415201822916667, + "learning_rate": 0.0001, + "loss": 5.6098, + "loss/crossentropy": 2.4914400577545166, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1610499694943428, + "step": 26342 + }, + { + "epoch": 0.82325, + "grad_norm": 3.140625, + "grad_norm_var": 0.019071451822916665, + "learning_rate": 0.0001, + "loss": 5.7348, + "loss/crossentropy": 2.608322024345398, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16342993825674057, + "step": 26344 + }, + { + "epoch": 0.8233125, + "grad_norm": 3.125, + "grad_norm_var": 0.008837890625, + "learning_rate": 0.0001, + "loss": 5.6239, + "loss/crossentropy": 2.4591116905212402, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16999010741710663, + "step": 26346 + }, + { + "epoch": 0.823375, + "grad_norm": 3.234375, + "grad_norm_var": 0.00728759765625, + "learning_rate": 0.0001, + "loss": 5.8587, + "loss/crossentropy": 2.7089649438858032, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16809836775064468, + "step": 26348 + }, + { + "epoch": 0.8234375, + "grad_norm": 3.03125, + "grad_norm_var": 0.0187408447265625, + "learning_rate": 0.0001, + "loss": 6.0662, + "loss/crossentropy": 2.854896903038025, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17152578383684158, + "step": 26350 + }, + { + "epoch": 0.8235, + "grad_norm": 3.328125, + "grad_norm_var": 0.018876139322916666, + "learning_rate": 0.0001, + "loss": 5.9049, + "loss/crossentropy": 2.756648540496826, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16951383650302887, + "step": 26352 + }, + { + "epoch": 0.8235625, + "grad_norm": 3.265625, + "grad_norm_var": 0.01802978515625, + "learning_rate": 0.0001, + "loss": 5.9413, + "loss/crossentropy": 2.757227897644043, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16880160570144653, + "step": 26354 + }, + { + "epoch": 0.823625, + "grad_norm": 3.21875, + "grad_norm_var": 0.016650390625, + "learning_rate": 0.0001, + "loss": 5.5885, + "loss/crossentropy": 2.4658830165863037, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16460354626178741, + "step": 26356 + }, + { + "epoch": 0.8236875, + "grad_norm": 2.734375, + "grad_norm_var": 0.0368316650390625, + "learning_rate": 0.0001, + "loss": 5.0838, + "loss/crossentropy": 2.2466323375701904, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.14387667924165726, + "step": 26358 + }, + { + "epoch": 0.82375, + "grad_norm": 3.3125, + "grad_norm_var": 0.04722900390625, + "learning_rate": 0.0001, + "loss": 5.7701, + "loss/crossentropy": 2.6672489643096924, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.16848907619714737, + "step": 26360 + }, + { + "epoch": 0.8238125, + "grad_norm": 6.21875, + "grad_norm_var": 0.64293212890625, + "learning_rate": 0.0001, + "loss": 5.5528, + "loss/crossentropy": 2.5347325801849365, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1580551490187645, + "step": 26362 + }, + { + "epoch": 0.823875, + "grad_norm": 2.765625, + "grad_norm_var": 0.6656565348307292, + "learning_rate": 0.0001, + "loss": 5.488, + "loss/crossentropy": 2.4814869165420532, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1557292342185974, + "step": 26364 + }, + { + "epoch": 0.8239375, + "grad_norm": 2.96875, + "grad_norm_var": 0.6638631184895833, + "learning_rate": 0.0001, + "loss": 5.6194, + "loss/crossentropy": 2.56308913230896, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16187691688537598, + "step": 26366 + }, + { + "epoch": 0.824, + "grad_norm": 3.234375, + "grad_norm_var": 0.6730784098307292, + "learning_rate": 0.0001, + "loss": 5.5467, + "loss/crossentropy": 2.48854660987854, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16323573142290115, + "step": 26368 + }, + { + "epoch": 0.8240625, + "grad_norm": 3.171875, + "grad_norm_var": 0.6733306884765625, + "learning_rate": 0.0001, + "loss": 5.8946, + "loss/crossentropy": 2.660510301589966, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17379587143659592, + "step": 26370 + }, + { + "epoch": 0.824125, + "grad_norm": 2.921875, + "grad_norm_var": 0.6805948893229167, + "learning_rate": 0.0001, + "loss": 5.6283, + "loss/crossentropy": 2.585998773574829, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1597011610865593, + "step": 26372 + }, + { + "epoch": 0.8241875, + "grad_norm": 3.109375, + "grad_norm_var": 0.6638010660807292, + "learning_rate": 0.0001, + "loss": 5.731, + "loss/crossentropy": 2.639747738838196, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16459402441978455, + "step": 26374 + }, + { + "epoch": 0.82425, + "grad_norm": 3.15625, + "grad_norm_var": 0.6514719645182292, + "learning_rate": 0.0001, + "loss": 5.5863, + "loss/crossentropy": 2.5839895009994507, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15882544219493866, + "step": 26376 + }, + { + "epoch": 0.8243125, + "grad_norm": 2.734375, + "grad_norm_var": 0.02548828125, + "learning_rate": 0.0001, + "loss": 5.4645, + "loss/crossentropy": 2.4918004274368286, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.1566426083445549, + "step": 26378 + }, + { + "epoch": 0.824375, + "grad_norm": 2.84375, + "grad_norm_var": 0.023697916666666666, + "learning_rate": 0.0001, + "loss": 5.4738, + "loss/crossentropy": 2.4789711236953735, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15729864686727524, + "step": 26380 + }, + { + "epoch": 0.8244375, + "grad_norm": 2.703125, + "grad_norm_var": 0.029899088541666667, + "learning_rate": 0.0001, + "loss": 5.4287, + "loss/crossentropy": 2.518850564956665, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.14879833906888962, + "step": 26382 + }, + { + "epoch": 0.8245, + "grad_norm": 3.09375, + "grad_norm_var": 0.10966389973958333, + "learning_rate": 0.0001, + "loss": 5.7521, + "loss/crossentropy": 2.5595513582229614, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.16456907987594604, + "step": 26384 + }, + { + "epoch": 0.8245625, + "grad_norm": 3.390625, + "grad_norm_var": 0.1145660400390625, + "learning_rate": 0.0001, + "loss": 6.0436, + "loss/crossentropy": 2.7219513654708862, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.18020762503147125, + "step": 26386 + }, + { + "epoch": 0.824625, + "grad_norm": 3.0625, + "grad_norm_var": 0.11431376139322917, + "learning_rate": 0.0001, + "loss": 5.7164, + "loss/crossentropy": 2.6323970556259155, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16113485395908356, + "step": 26388 + }, + { + "epoch": 0.8246875, + "grad_norm": 2.984375, + "grad_norm_var": 0.11272379557291666, + "learning_rate": 0.0001, + "loss": 5.6277, + "loss/crossentropy": 2.5497812032699585, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16013897210359573, + "step": 26390 + }, + { + "epoch": 0.82475, + "grad_norm": 3.046875, + "grad_norm_var": 0.11663004557291666, + "learning_rate": 0.0001, + "loss": 5.6706, + "loss/crossentropy": 2.611387848854065, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16022178530693054, + "step": 26392 + }, + { + "epoch": 0.8248125, + "grad_norm": 3.0625, + "grad_norm_var": 0.11233317057291667, + "learning_rate": 0.0001, + "loss": 5.9252, + "loss/crossentropy": 2.758017063140869, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17101654410362244, + "step": 26394 + }, + { + "epoch": 0.824875, + "grad_norm": 3.078125, + "grad_norm_var": 0.10614827473958334, + "learning_rate": 0.0001, + "loss": 5.4983, + "loss/crossentropy": 2.471684217453003, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1612573266029358, + "step": 26396 + }, + { + "epoch": 0.8249375, + "grad_norm": 3.515625, + "grad_norm_var": 0.10191650390625, + "learning_rate": 0.0001, + "loss": 5.9139, + "loss/crossentropy": 2.7498886585235596, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1667894497513771, + "step": 26398 + }, + { + "epoch": 0.825, + "grad_norm": 3.15625, + "grad_norm_var": 0.033426920572916664, + "learning_rate": 0.0001, + "loss": 5.7226, + "loss/crossentropy": 2.6924824714660645, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15847747027873993, + "step": 26400 + }, + { + "epoch": 0.8250625, + "grad_norm": 2.921875, + "grad_norm_var": 0.031305948893229164, + "learning_rate": 0.0001, + "loss": 5.6605, + "loss/crossentropy": 2.630774140357971, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.16235052794218063, + "step": 26402 + }, + { + "epoch": 0.825125, + "grad_norm": 7.4375, + "grad_norm_var": 1.2342274983723958, + "learning_rate": 0.0001, + "loss": 5.7267, + "loss/crossentropy": 2.5541563034057617, + "loss/hidden": 1.5546875, + "loss/jsd": 0.0, + "loss/logits": 0.1617891490459442, + "step": 26404 + }, + { + "epoch": 0.8251875, + "grad_norm": 2.9375, + "grad_norm_var": 1.2349680582682292, + "learning_rate": 0.0001, + "loss": 5.9378, + "loss/crossentropy": 2.739465355873108, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1733456403017044, + "step": 26406 + }, + { + "epoch": 0.82525, + "grad_norm": 3.546875, + "grad_norm_var": 1.213451131184896, + "learning_rate": 0.0001, + "loss": 5.4925, + "loss/crossentropy": 2.345843553543091, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16466321051120758, + "step": 26408 + }, + { + "epoch": 0.8253125, + "grad_norm": 2.984375, + "grad_norm_var": 1.2509185791015625, + "learning_rate": 0.0001, + "loss": 5.3811, + "loss/crossentropy": 2.4296364784240723, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1529592126607895, + "step": 26410 + }, + { + "epoch": 0.825375, + "grad_norm": 2.828125, + "grad_norm_var": 1.2613922119140626, + "learning_rate": 0.0001, + "loss": 5.5242, + "loss/crossentropy": 2.5729185342788696, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15450449287891388, + "step": 26412 + }, + { + "epoch": 0.8254375, + "grad_norm": 2.96875, + "grad_norm_var": 1.2567708333333334, + "learning_rate": 0.0001, + "loss": 5.7273, + "loss/crossentropy": 2.608040928840637, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16387560963630676, + "step": 26414 + }, + { + "epoch": 0.8255, + "grad_norm": 2.96875, + "grad_norm_var": 1.260326131184896, + "learning_rate": 0.0001, + "loss": 5.8647, + "loss/crossentropy": 2.699707269668579, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1661122441291809, + "step": 26416 + }, + { + "epoch": 0.8255625, + "grad_norm": 3.015625, + "grad_norm_var": 1.2520172119140625, + "learning_rate": 0.0001, + "loss": 5.7293, + "loss/crossentropy": 2.646604895591736, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16413111239671707, + "step": 26418 + }, + { + "epoch": 0.825625, + "grad_norm": 3.296875, + "grad_norm_var": 0.05634765625, + "learning_rate": 0.0001, + "loss": 5.6879, + "loss/crossentropy": 2.508752465248108, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1710352897644043, + "step": 26420 + }, + { + "epoch": 0.8256875, + "grad_norm": 3.015625, + "grad_norm_var": 0.04879557291666667, + "learning_rate": 0.0001, + "loss": 5.6641, + "loss/crossentropy": 2.589567542076111, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16136162728071213, + "step": 26422 + }, + { + "epoch": 0.82575, + "grad_norm": 3.71875, + "grad_norm_var": 0.06659749348958334, + "learning_rate": 0.0001, + "loss": 5.7171, + "loss/crossentropy": 2.5890969038009644, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16514231264591217, + "step": 26424 + }, + { + "epoch": 0.8258125, + "grad_norm": 3.234375, + "grad_norm_var": 0.0551177978515625, + "learning_rate": 0.0001, + "loss": 5.5481, + "loss/crossentropy": 2.4240739345550537, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1623988151550293, + "step": 26426 + }, + { + "epoch": 0.825875, + "grad_norm": 2.859375, + "grad_norm_var": 0.056916300455729166, + "learning_rate": 0.0001, + "loss": 5.7019, + "loss/crossentropy": 2.5907323360443115, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16307149827480316, + "step": 26428 + }, + { + "epoch": 0.8259375, + "grad_norm": 3.609375, + "grad_norm_var": 0.06676025390625, + "learning_rate": 0.0001, + "loss": 5.8064, + "loss/crossentropy": 2.57390820980072, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.16934466361999512, + "step": 26430 + }, + { + "epoch": 0.826, + "grad_norm": 3.125, + "grad_norm_var": 0.07798563639322917, + "learning_rate": 0.0001, + "loss": 5.3473, + "loss/crossentropy": 2.319072723388672, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1563357710838318, + "step": 26432 + }, + { + "epoch": 0.8260625, + "grad_norm": 3.46875, + "grad_norm_var": 0.07955729166666667, + "learning_rate": 0.0001, + "loss": 5.9472, + "loss/crossentropy": 2.8393198251724243, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16117888689041138, + "step": 26434 + }, + { + "epoch": 0.826125, + "grad_norm": 3.25, + "grad_norm_var": 0.0787506103515625, + "learning_rate": 0.0001, + "loss": 5.5096, + "loss/crossentropy": 2.4227101802825928, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16493766009807587, + "step": 26436 + }, + { + "epoch": 0.8261875, + "grad_norm": 3.03125, + "grad_norm_var": 0.0774078369140625, + "learning_rate": 0.0001, + "loss": 5.612, + "loss/crossentropy": 2.5441017150878906, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16030529141426086, + "step": 26438 + }, + { + "epoch": 0.82625, + "grad_norm": 3.28125, + "grad_norm_var": 0.05364583333333333, + "learning_rate": 0.0001, + "loss": 5.8929, + "loss/crossentropy": 2.7285382747650146, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17034100741147995, + "step": 26440 + }, + { + "epoch": 0.8263125, + "grad_norm": 3.234375, + "grad_norm_var": 0.05396728515625, + "learning_rate": 0.0001, + "loss": 5.9828, + "loss/crossentropy": 2.851389527320862, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1643085852265358, + "step": 26442 + }, + { + "epoch": 0.826375, + "grad_norm": 3.0, + "grad_norm_var": 0.04797770182291667, + "learning_rate": 0.0001, + "loss": 5.7981, + "loss/crossentropy": 2.68903386592865, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1667705997824669, + "step": 26444 + }, + { + "epoch": 0.8264375, + "grad_norm": 2.9375, + "grad_norm_var": 0.04982808430989583, + "learning_rate": 0.0001, + "loss": 5.9516, + "loss/crossentropy": 2.7996885776519775, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16909538954496384, + "step": 26446 + }, + { + "epoch": 0.8265, + "grad_norm": 3.375, + "grad_norm_var": 0.04019775390625, + "learning_rate": 0.0001, + "loss": 5.7933, + "loss/crossentropy": 2.629693865776062, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16987202316522598, + "step": 26448 + }, + { + "epoch": 0.8265625, + "grad_norm": 3.140625, + "grad_norm_var": 0.033589680989583336, + "learning_rate": 0.0001, + "loss": 5.864, + "loss/crossentropy": 2.723311424255371, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16797932237386703, + "step": 26450 + }, + { + "epoch": 0.826625, + "grad_norm": 3.03125, + "grad_norm_var": 0.033984375, + "learning_rate": 0.0001, + "loss": 5.6234, + "loss/crossentropy": 2.5940492153167725, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1576242372393608, + "step": 26452 + }, + { + "epoch": 0.8266875, + "grad_norm": 3.21875, + "grad_norm_var": 0.03339436848958333, + "learning_rate": 0.0001, + "loss": 5.561, + "loss/crossentropy": 2.5500476360321045, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15695472061634064, + "step": 26454 + }, + { + "epoch": 0.82675, + "grad_norm": 3.015625, + "grad_norm_var": 0.027978515625, + "learning_rate": 0.0001, + "loss": 5.3775, + "loss/crossentropy": 2.3505324125289917, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15621038526296616, + "step": 26456 + }, + { + "epoch": 0.8268125, + "grad_norm": 3.234375, + "grad_norm_var": 0.029206339518229166, + "learning_rate": 0.0001, + "loss": 6.0761, + "loss/crossentropy": 2.8190720081329346, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17570750415325165, + "step": 26458 + }, + { + "epoch": 0.826875, + "grad_norm": 3.0, + "grad_norm_var": 0.028620402018229168, + "learning_rate": 0.0001, + "loss": 5.5801, + "loss/crossentropy": 2.554157018661499, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15806709975004196, + "step": 26460 + }, + { + "epoch": 0.8269375, + "grad_norm": 2.96875, + "grad_norm_var": 0.015184529622395833, + "learning_rate": 0.0001, + "loss": 5.7697, + "loss/crossentropy": 2.625096917152405, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16876161843538284, + "step": 26462 + }, + { + "epoch": 0.827, + "grad_norm": 3.140625, + "grad_norm_var": 0.013916015625, + "learning_rate": 0.0001, + "loss": 5.8531, + "loss/crossentropy": 2.731651186943054, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16605433821678162, + "step": 26464 + }, + { + "epoch": 0.8270625, + "grad_norm": 3.234375, + "grad_norm_var": 0.014518229166666667, + "learning_rate": 0.0001, + "loss": 5.939, + "loss/crossentropy": 2.704955220222473, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17418386042118073, + "step": 26466 + }, + { + "epoch": 0.827125, + "grad_norm": 2.953125, + "grad_norm_var": 0.017699178059895834, + "learning_rate": 0.0001, + "loss": 5.872, + "loss/crossentropy": 2.667886972427368, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17041241377592087, + "step": 26468 + }, + { + "epoch": 0.8271875, + "grad_norm": 2.921875, + "grad_norm_var": 0.01943359375, + "learning_rate": 0.0001, + "loss": 5.6555, + "loss/crossentropy": 2.6146072149276733, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16034063696861267, + "step": 26470 + }, + { + "epoch": 0.82725, + "grad_norm": 2.90625, + "grad_norm_var": 0.0251373291015625, + "learning_rate": 0.0001, + "loss": 5.3293, + "loss/crossentropy": 2.3477360010147095, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15401406586170197, + "step": 26472 + }, + { + "epoch": 0.8273125, + "grad_norm": 3.359375, + "grad_norm_var": 0.0262603759765625, + "learning_rate": 0.0001, + "loss": 5.5267, + "loss/crossentropy": 2.5378081798553467, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15318354219198227, + "step": 26474 + }, + { + "epoch": 0.827375, + "grad_norm": 2.890625, + "grad_norm_var": 0.027765909830729168, + "learning_rate": 0.0001, + "loss": 5.2999, + "loss/crossentropy": 2.333868145942688, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1520739421248436, + "step": 26476 + }, + { + "epoch": 0.8274375, + "grad_norm": 3.328125, + "grad_norm_var": 0.0291900634765625, + "learning_rate": 0.0001, + "loss": 5.9343, + "loss/crossentropy": 2.6245245933532715, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.17707359045743942, + "step": 26478 + }, + { + "epoch": 0.8275, + "grad_norm": 3.1875, + "grad_norm_var": 0.03076171875, + "learning_rate": 0.0001, + "loss": 5.7936, + "loss/crossentropy": 2.589395761489868, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1704207882285118, + "step": 26480 + }, + { + "epoch": 0.8275625, + "grad_norm": 3.265625, + "grad_norm_var": 0.03173828125, + "learning_rate": 0.0001, + "loss": 5.6295, + "loss/crossentropy": 2.4367631673812866, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17122900485992432, + "step": 26482 + }, + { + "epoch": 0.827625, + "grad_norm": 2.9375, + "grad_norm_var": 0.03654683430989583, + "learning_rate": 0.0001, + "loss": 5.1427, + "loss/crossentropy": 2.2844330072402954, + "loss/hidden": 1.38671875, + "loss/jsd": 0.0, + "loss/logits": 0.1471521183848381, + "step": 26484 + }, + { + "epoch": 0.8276875, + "grad_norm": 2.9375, + "grad_norm_var": 0.036031087239583336, + "learning_rate": 0.0001, + "loss": 5.818, + "loss/crossentropy": 2.7138901948928833, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1666596755385399, + "step": 26486 + }, + { + "epoch": 0.82775, + "grad_norm": 3.25, + "grad_norm_var": 0.0419921875, + "learning_rate": 0.0001, + "loss": 5.8464, + "loss/crossentropy": 2.6777409315109253, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16608533263206482, + "step": 26488 + }, + { + "epoch": 0.8278125, + "grad_norm": 3.265625, + "grad_norm_var": 0.0384429931640625, + "learning_rate": 0.0001, + "loss": 5.7181, + "loss/crossentropy": 2.6251548528671265, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16398146003484726, + "step": 26490 + }, + { + "epoch": 0.827875, + "grad_norm": 2.859375, + "grad_norm_var": 0.04639383951822917, + "learning_rate": 0.0001, + "loss": 5.3811, + "loss/crossentropy": 2.5030300617218018, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.14717745035886765, + "step": 26492 + }, + { + "epoch": 0.8279375, + "grad_norm": 3.421875, + "grad_norm_var": 0.056396484375, + "learning_rate": 0.0001, + "loss": 5.5396, + "loss/crossentropy": 2.4696165323257446, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16090339422225952, + "step": 26494 + }, + { + "epoch": 0.828, + "grad_norm": 2.8125, + "grad_norm_var": 0.0555572509765625, + "learning_rate": 0.0001, + "loss": 5.5684, + "loss/crossentropy": 2.5611249208450317, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15658611059188843, + "step": 26496 + }, + { + "epoch": 0.8280625, + "grad_norm": 2.96875, + "grad_norm_var": 0.061962890625, + "learning_rate": 0.0001, + "loss": 5.8567, + "loss/crossentropy": 2.6480154991149902, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17243517935276031, + "step": 26498 + }, + { + "epoch": 0.828125, + "grad_norm": 2.890625, + "grad_norm_var": 0.05626627604166667, + "learning_rate": 0.0001, + "loss": 5.4199, + "loss/crossentropy": 2.3727545738220215, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16252897679805756, + "step": 26500 + }, + { + "epoch": 0.8281875, + "grad_norm": 2.9375, + "grad_norm_var": 0.057306925455729164, + "learning_rate": 0.0001, + "loss": 5.6952, + "loss/crossentropy": 2.6760060787200928, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15973235666751862, + "step": 26502 + }, + { + "epoch": 0.82825, + "grad_norm": 3.0625, + "grad_norm_var": 0.041901652018229166, + "learning_rate": 0.0001, + "loss": 5.6175, + "loss/crossentropy": 2.581863760948181, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16059322655200958, + "step": 26504 + }, + { + "epoch": 0.8283125, + "grad_norm": 2.859375, + "grad_norm_var": 0.03951822916666667, + "learning_rate": 0.0001, + "loss": 5.4037, + "loss/crossentropy": 2.4558279514312744, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.15299250930547714, + "step": 26506 + }, + { + "epoch": 0.828375, + "grad_norm": 3.15625, + "grad_norm_var": 0.04651692708333333, + "learning_rate": 0.0001, + "loss": 5.2724, + "loss/crossentropy": 2.3377411365509033, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15206074714660645, + "step": 26508 + }, + { + "epoch": 0.8284375, + "grad_norm": 2.921875, + "grad_norm_var": 0.0355621337890625, + "learning_rate": 0.0001, + "loss": 5.6686, + "loss/crossentropy": 2.570454478263855, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.165279820561409, + "step": 26510 + }, + { + "epoch": 0.8285, + "grad_norm": 2.578125, + "grad_norm_var": 0.044287109375, + "learning_rate": 0.0001, + "loss": 5.2783, + "loss/crossentropy": 2.3998725414276123, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1444854512810707, + "step": 26512 + }, + { + "epoch": 0.8285625, + "grad_norm": 3.28125, + "grad_norm_var": 0.0391754150390625, + "learning_rate": 0.0001, + "loss": 5.4749, + "loss/crossentropy": 2.476272702217102, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.156505286693573, + "step": 26514 + }, + { + "epoch": 0.828625, + "grad_norm": 3.296875, + "grad_norm_var": 0.04589436848958333, + "learning_rate": 0.0001, + "loss": 5.5233, + "loss/crossentropy": 2.439882516860962, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1614651456475258, + "step": 26516 + }, + { + "epoch": 0.8286875, + "grad_norm": 3.03125, + "grad_norm_var": 0.04641011555989583, + "learning_rate": 0.0001, + "loss": 5.5137, + "loss/crossentropy": 2.5422792434692383, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.15534193813800812, + "step": 26518 + }, + { + "epoch": 0.82875, + "grad_norm": 3.703125, + "grad_norm_var": 0.07837626139322916, + "learning_rate": 0.0001, + "loss": 5.7862, + "loss/crossentropy": 2.646515369415283, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1643586978316307, + "step": 26520 + }, + { + "epoch": 0.8288125, + "grad_norm": 3.109375, + "grad_norm_var": 0.0773101806640625, + "learning_rate": 0.0001, + "loss": 5.5665, + "loss/crossentropy": 2.519219160079956, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15824301540851593, + "step": 26522 + }, + { + "epoch": 0.828875, + "grad_norm": 3.03125, + "grad_norm_var": 0.06526590983072916, + "learning_rate": 0.0001, + "loss": 5.5823, + "loss/crossentropy": 2.5451369285583496, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15996336936950684, + "step": 26524 + }, + { + "epoch": 0.8289375, + "grad_norm": 2.90625, + "grad_norm_var": 0.0631988525390625, + "learning_rate": 0.0001, + "loss": 5.2962, + "loss/crossentropy": 2.3923909664154053, + "loss/hidden": 1.390625, + "loss/jsd": 0.0, + "loss/logits": 0.15131926536560059, + "step": 26526 + }, + { + "epoch": 0.829, + "grad_norm": 3.265625, + "grad_norm_var": 0.05148111979166667, + "learning_rate": 0.0001, + "loss": 5.4582, + "loss/crossentropy": 2.4119802713394165, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1597050204873085, + "step": 26528 + }, + { + "epoch": 0.8290625, + "grad_norm": 2.984375, + "grad_norm_var": 0.044482421875, + "learning_rate": 0.0001, + "loss": 5.573, + "loss/crossentropy": 2.570006012916565, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15811413526535034, + "step": 26530 + }, + { + "epoch": 0.829125, + "grad_norm": 3.078125, + "grad_norm_var": 0.04120686848958333, + "learning_rate": 0.0001, + "loss": 5.5177, + "loss/crossentropy": 2.4561526775360107, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.15615782141685486, + "step": 26532 + }, + { + "epoch": 0.8291875, + "grad_norm": 3.140625, + "grad_norm_var": 0.040339152018229164, + "learning_rate": 0.0001, + "loss": 5.7838, + "loss/crossentropy": 2.690218448638916, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16483154892921448, + "step": 26534 + }, + { + "epoch": 0.82925, + "grad_norm": 3.125, + "grad_norm_var": 0.014069620768229167, + "learning_rate": 0.0001, + "loss": 5.6097, + "loss/crossentropy": 2.4879987239837646, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.164124995470047, + "step": 26536 + }, + { + "epoch": 0.8293125, + "grad_norm": 2.890625, + "grad_norm_var": 0.0142730712890625, + "learning_rate": 0.0001, + "loss": 5.7149, + "loss/crossentropy": 2.6120272874832153, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16809770464897156, + "step": 26538 + }, + { + "epoch": 0.829375, + "grad_norm": 3.046875, + "grad_norm_var": 0.016402180989583334, + "learning_rate": 0.0001, + "loss": 5.7451, + "loss/crossentropy": 2.646128296852112, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16497544944286346, + "step": 26540 + }, + { + "epoch": 0.8294375, + "grad_norm": 2.90625, + "grad_norm_var": 0.016097005208333334, + "learning_rate": 0.0001, + "loss": 5.439, + "loss/crossentropy": 2.3879107236862183, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16096339374780655, + "step": 26542 + }, + { + "epoch": 0.8295, + "grad_norm": 2.921875, + "grad_norm_var": 0.015949503580729166, + "learning_rate": 0.0001, + "loss": 5.8367, + "loss/crossentropy": 2.6953009366989136, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16687103360891342, + "step": 26544 + }, + { + "epoch": 0.8295625, + "grad_norm": 3.03125, + "grad_norm_var": 0.014142862955729167, + "learning_rate": 0.0001, + "loss": 5.7148, + "loss/crossentropy": 2.690555214881897, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.16102024912834167, + "step": 26546 + }, + { + "epoch": 0.829625, + "grad_norm": 2.90625, + "grad_norm_var": 0.0148590087890625, + "learning_rate": 0.0001, + "loss": 5.6024, + "loss/crossentropy": 2.5594701766967773, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16014951467514038, + "step": 26548 + }, + { + "epoch": 0.8296875, + "grad_norm": 3.0, + "grad_norm_var": 0.011751302083333333, + "learning_rate": 0.0001, + "loss": 5.3773, + "loss/crossentropy": 2.4803178310394287, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.14829088747501373, + "step": 26550 + }, + { + "epoch": 0.82975, + "grad_norm": 2.890625, + "grad_norm_var": 0.013231404622395833, + "learning_rate": 0.0001, + "loss": 5.7185, + "loss/crossentropy": 2.6627864837646484, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16221345216035843, + "step": 26552 + }, + { + "epoch": 0.8298125, + "grad_norm": 3.078125, + "grad_norm_var": 0.012238566080729167, + "learning_rate": 0.0001, + "loss": 5.7239, + "loss/crossentropy": 2.6020004749298096, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16453495621681213, + "step": 26554 + }, + { + "epoch": 0.829875, + "grad_norm": 3.109375, + "grad_norm_var": 0.013395182291666667, + "learning_rate": 0.0001, + "loss": 5.7971, + "loss/crossentropy": 2.7000378370285034, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16322240978479385, + "step": 26556 + }, + { + "epoch": 0.8299375, + "grad_norm": 2.8125, + "grad_norm_var": 0.015494791666666667, + "learning_rate": 0.0001, + "loss": 5.3251, + "loss/crossentropy": 2.398852825164795, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15199558436870575, + "step": 26558 + }, + { + "epoch": 0.83, + "grad_norm": 2.78125, + "grad_norm_var": 0.014449055989583333, + "learning_rate": 0.0001, + "loss": 5.5648, + "loss/crossentropy": 2.6085457801818848, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15422050654888153, + "step": 26560 + }, + { + "epoch": 0.8300625, + "grad_norm": 3.171875, + "grad_norm_var": 0.015946451822916666, + "learning_rate": 0.0001, + "loss": 5.6234, + "loss/crossentropy": 2.5029183626174927, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16790633648633957, + "step": 26562 + }, + { + "epoch": 0.830125, + "grad_norm": 3.328125, + "grad_norm_var": 0.03160807291666667, + "learning_rate": 0.0001, + "loss": 5.4127, + "loss/crossentropy": 2.3239113092422485, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16278766095638275, + "step": 26564 + }, + { + "epoch": 0.8301875, + "grad_norm": 2.921875, + "grad_norm_var": 0.03394775390625, + "learning_rate": 0.0001, + "loss": 5.442, + "loss/crossentropy": 2.4539923667907715, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15426688641309738, + "step": 26566 + }, + { + "epoch": 0.83025, + "grad_norm": 2.8125, + "grad_norm_var": 0.035380045572916664, + "learning_rate": 0.0001, + "loss": 5.429, + "loss/crossentropy": 2.4697686433792114, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15373125672340393, + "step": 26568 + }, + { + "epoch": 0.8303125, + "grad_norm": 3.09375, + "grad_norm_var": 0.03853759765625, + "learning_rate": 0.0001, + "loss": 5.6651, + "loss/crossentropy": 2.653131365776062, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.16135179996490479, + "step": 26570 + }, + { + "epoch": 0.830375, + "grad_norm": 2.890625, + "grad_norm_var": 0.041422526041666664, + "learning_rate": 0.0001, + "loss": 5.437, + "loss/crossentropy": 2.4077842235565186, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16033972054719925, + "step": 26572 + }, + { + "epoch": 0.8304375, + "grad_norm": 3.265625, + "grad_norm_var": 0.062352498372395836, + "learning_rate": 0.0001, + "loss": 6.1193, + "loss/crossentropy": 2.7360039949417114, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18755102157592773, + "step": 26574 + }, + { + "epoch": 0.8305, + "grad_norm": 3.21875, + "grad_norm_var": 0.056029256184895834, + "learning_rate": 0.0001, + "loss": 5.4868, + "loss/crossentropy": 2.4786585569381714, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15784411132335663, + "step": 26576 + }, + { + "epoch": 0.8305625, + "grad_norm": 3.390625, + "grad_norm_var": 0.0599761962890625, + "learning_rate": 0.0001, + "loss": 5.7899, + "loss/crossentropy": 2.640892744064331, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.17076198756694794, + "step": 26578 + }, + { + "epoch": 0.830625, + "grad_norm": 3.140625, + "grad_norm_var": 0.05230712890625, + "learning_rate": 0.0001, + "loss": 5.9349, + "loss/crossentropy": 2.810156226158142, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16364441066980362, + "step": 26580 + }, + { + "epoch": 0.8306875, + "grad_norm": 2.9375, + "grad_norm_var": 0.04895426432291667, + "learning_rate": 0.0001, + "loss": 5.5483, + "loss/crossentropy": 2.5238953828811646, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15713290870189667, + "step": 26582 + }, + { + "epoch": 0.83075, + "grad_norm": 2.921875, + "grad_norm_var": 0.04453125, + "learning_rate": 0.0001, + "loss": 5.0923, + "loss/crossentropy": 2.2778791189193726, + "loss/hidden": 1.34375, + "loss/jsd": 0.0, + "loss/logits": 0.14706549048423767, + "step": 26584 + }, + { + "epoch": 0.8308125, + "grad_norm": 3.046875, + "grad_norm_var": 0.03824462890625, + "learning_rate": 0.0001, + "loss": 5.9693, + "loss/crossentropy": 2.776100277900696, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1708783060312271, + "step": 26586 + }, + { + "epoch": 0.830875, + "grad_norm": 3.0625, + "grad_norm_var": 0.034130859375, + "learning_rate": 0.0001, + "loss": 5.5534, + "loss/crossentropy": 2.4902278184890747, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15904773771762848, + "step": 26588 + }, + { + "epoch": 0.8309375, + "grad_norm": 3.53125, + "grad_norm_var": 0.04877827962239583, + "learning_rate": 0.0001, + "loss": 5.3848, + "loss/crossentropy": 2.412701368331909, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1515059694647789, + "step": 26590 + }, + { + "epoch": 0.831, + "grad_norm": 3.171875, + "grad_norm_var": 0.04833577473958333, + "learning_rate": 0.0001, + "loss": 5.7852, + "loss/crossentropy": 2.6917717456817627, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16129279136657715, + "step": 26592 + }, + { + "epoch": 0.8310625, + "grad_norm": 2.6875, + "grad_norm_var": 0.06445210774739583, + "learning_rate": 0.0001, + "loss": 5.378, + "loss/crossentropy": 2.3800243139266968, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15644137561321259, + "step": 26594 + }, + { + "epoch": 0.831125, + "grad_norm": 3.28125, + "grad_norm_var": 0.06321614583333333, + "learning_rate": 0.0001, + "loss": 5.6866, + "loss/crossentropy": 2.6062453985214233, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16350360214710236, + "step": 26596 + }, + { + "epoch": 0.8311875, + "grad_norm": 3.015625, + "grad_norm_var": 0.0645660400390625, + "learning_rate": 0.0001, + "loss": 5.4337, + "loss/crossentropy": 2.4476349353790283, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15368564426898956, + "step": 26598 + }, + { + "epoch": 0.83125, + "grad_norm": 3.1875, + "grad_norm_var": 0.06405843098958333, + "learning_rate": 0.0001, + "loss": 5.7728, + "loss/crossentropy": 2.641290545463562, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16823303699493408, + "step": 26600 + }, + { + "epoch": 0.8313125, + "grad_norm": 2.890625, + "grad_norm_var": 0.06389058430989583, + "learning_rate": 0.0001, + "loss": 5.6648, + "loss/crossentropy": 2.550861358642578, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16530445218086243, + "step": 26602 + }, + { + "epoch": 0.831375, + "grad_norm": 3.296875, + "grad_norm_var": 0.06747945149739583, + "learning_rate": 0.0001, + "loss": 5.9078, + "loss/crossentropy": 2.655044198036194, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1744927167892456, + "step": 26604 + }, + { + "epoch": 0.8314375, + "grad_norm": 3.09375, + "grad_norm_var": 0.03706766764322917, + "learning_rate": 0.0001, + "loss": 5.6888, + "loss/crossentropy": 2.5507737398147583, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16810215264558792, + "step": 26606 + }, + { + "epoch": 0.8315, + "grad_norm": 3.15625, + "grad_norm_var": 0.036774698893229166, + "learning_rate": 0.0001, + "loss": 5.6882, + "loss/crossentropy": 2.556751012802124, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16627078503370285, + "step": 26608 + }, + { + "epoch": 0.8315625, + "grad_norm": 2.921875, + "grad_norm_var": 0.018586222330729166, + "learning_rate": 0.0001, + "loss": 5.7332, + "loss/crossentropy": 2.6276328563690186, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16134114563465118, + "step": 26610 + }, + { + "epoch": 0.831625, + "grad_norm": 2.671875, + "grad_norm_var": 0.0405426025390625, + "learning_rate": 0.0001, + "loss": 5.5495, + "loss/crossentropy": 2.455042839050293, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16374597698450089, + "step": 26612 + }, + { + "epoch": 0.8316875, + "grad_norm": 3.046875, + "grad_norm_var": 0.03710530598958333, + "learning_rate": 0.0001, + "loss": 5.7492, + "loss/crossentropy": 2.6724430322647095, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15962416678667068, + "step": 26614 + }, + { + "epoch": 0.83175, + "grad_norm": 3.078125, + "grad_norm_var": 0.0346343994140625, + "learning_rate": 0.0001, + "loss": 5.3511, + "loss/crossentropy": 2.3925164937973022, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15132832527160645, + "step": 26616 + }, + { + "epoch": 0.8318125, + "grad_norm": 3.09375, + "grad_norm_var": 0.03276265462239583, + "learning_rate": 0.0001, + "loss": 6.0376, + "loss/crossentropy": 2.8824446201324463, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16942650824785233, + "step": 26618 + }, + { + "epoch": 0.831875, + "grad_norm": 3.046875, + "grad_norm_var": 0.0291412353515625, + "learning_rate": 0.0001, + "loss": 5.5934, + "loss/crossentropy": 2.4897799491882324, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16621798276901245, + "step": 26620 + }, + { + "epoch": 0.8319375, + "grad_norm": 3.65625, + "grad_norm_var": 0.05239156087239583, + "learning_rate": 0.0001, + "loss": 5.5129, + "loss/crossentropy": 2.4113610982894897, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16288983821868896, + "step": 26622 + }, + { + "epoch": 0.832, + "grad_norm": 3.109375, + "grad_norm_var": 0.05408528645833333, + "learning_rate": 0.0001, + "loss": 5.6785, + "loss/crossentropy": 2.59414279460907, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16586247086524963, + "step": 26624 + }, + { + "epoch": 0.8320625, + "grad_norm": 3.125, + "grad_norm_var": 0.05830790201822917, + "learning_rate": 0.0001, + "loss": 5.2388, + "loss/crossentropy": 2.319770097732544, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.14776460826396942, + "step": 26626 + }, + { + "epoch": 0.832125, + "grad_norm": 2.96875, + "grad_norm_var": 0.035399373372395834, + "learning_rate": 0.0001, + "loss": 5.6976, + "loss/crossentropy": 2.633196473121643, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16347172856330872, + "step": 26628 + }, + { + "epoch": 0.8321875, + "grad_norm": 3.21875, + "grad_norm_var": 0.03687744140625, + "learning_rate": 0.0001, + "loss": 5.8177, + "loss/crossentropy": 2.6326440572738647, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1708531677722931, + "step": 26630 + }, + { + "epoch": 0.83225, + "grad_norm": 3.234375, + "grad_norm_var": 0.04069722493489583, + "learning_rate": 0.0001, + "loss": 5.6339, + "loss/crossentropy": 2.51995050907135, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16451912373304367, + "step": 26632 + }, + { + "epoch": 0.8323125, + "grad_norm": 2.90625, + "grad_norm_var": 0.04394124348958333, + "learning_rate": 0.0001, + "loss": 5.7602, + "loss/crossentropy": 2.667533040046692, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16551541537046432, + "step": 26634 + }, + { + "epoch": 0.832375, + "grad_norm": 2.765625, + "grad_norm_var": 0.05290425618489583, + "learning_rate": 0.0001, + "loss": 5.5446, + "loss/crossentropy": 2.634292483329773, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.15001527965068817, + "step": 26636 + }, + { + "epoch": 0.8324375, + "grad_norm": 3.109375, + "grad_norm_var": 0.0281646728515625, + "learning_rate": 0.0001, + "loss": 5.6131, + "loss/crossentropy": 2.544378161430359, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16312159597873688, + "step": 26638 + }, + { + "epoch": 0.8325, + "grad_norm": 3.015625, + "grad_norm_var": 0.026741536458333333, + "learning_rate": 0.0001, + "loss": 5.4294, + "loss/crossentropy": 2.410798668861389, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15810702741146088, + "step": 26640 + }, + { + "epoch": 0.8325625, + "grad_norm": 3.15625, + "grad_norm_var": 0.027790323893229166, + "learning_rate": 0.0001, + "loss": 5.5651, + "loss/crossentropy": 2.5781432390213013, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1565045714378357, + "step": 26642 + }, + { + "epoch": 0.832625, + "grad_norm": 3.171875, + "grad_norm_var": 0.028544108072916668, + "learning_rate": 0.0001, + "loss": 5.5522, + "loss/crossentropy": 2.501611590385437, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16014138609170914, + "step": 26644 + }, + { + "epoch": 0.8326875, + "grad_norm": 3.453125, + "grad_norm_var": 0.038798014322916664, + "learning_rate": 0.0001, + "loss": 5.1769, + "loss/crossentropy": 2.2129558324813843, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15225522220134735, + "step": 26646 + }, + { + "epoch": 0.83275, + "grad_norm": 3.09375, + "grad_norm_var": 0.0319000244140625, + "learning_rate": 0.0001, + "loss": 5.5751, + "loss/crossentropy": 2.527201771736145, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15869136154651642, + "step": 26648 + }, + { + "epoch": 0.8328125, + "grad_norm": 3.03125, + "grad_norm_var": 0.038960774739583336, + "learning_rate": 0.0001, + "loss": 5.8576, + "loss/crossentropy": 2.6822643280029297, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16988150775432587, + "step": 26650 + }, + { + "epoch": 0.832875, + "grad_norm": 3.171875, + "grad_norm_var": 0.030615234375, + "learning_rate": 0.0001, + "loss": 5.4277, + "loss/crossentropy": 2.4621567726135254, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15593001246452332, + "step": 26652 + }, + { + "epoch": 0.8329375, + "grad_norm": 3.09375, + "grad_norm_var": 0.03381754557291667, + "learning_rate": 0.0001, + "loss": 5.7046, + "loss/crossentropy": 2.5616977214813232, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16781005263328552, + "step": 26654 + }, + { + "epoch": 0.833, + "grad_norm": 3.046875, + "grad_norm_var": 0.03369140625, + "learning_rate": 0.0001, + "loss": 5.8725, + "loss/crossentropy": 2.6961796283721924, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16958148777484894, + "step": 26656 + }, + { + "epoch": 0.8330625, + "grad_norm": 3.140625, + "grad_norm_var": 0.023851521809895835, + "learning_rate": 0.0001, + "loss": 5.6901, + "loss/crossentropy": 2.6127448081970215, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16125278174877167, + "step": 26658 + }, + { + "epoch": 0.833125, + "grad_norm": 3.421875, + "grad_norm_var": 0.029173787434895834, + "learning_rate": 0.0001, + "loss": 5.7964, + "loss/crossentropy": 2.625298857688904, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16749783605337143, + "step": 26660 + }, + { + "epoch": 0.8331875, + "grad_norm": 3.546875, + "grad_norm_var": 0.03072509765625, + "learning_rate": 0.0001, + "loss": 5.862, + "loss/crossentropy": 2.6517386436462402, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1729767546057701, + "step": 26662 + }, + { + "epoch": 0.83325, + "grad_norm": 3.109375, + "grad_norm_var": 0.0299713134765625, + "learning_rate": 0.0001, + "loss": 5.6328, + "loss/crossentropy": 2.574502468109131, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16363906115293503, + "step": 26664 + }, + { + "epoch": 0.8333125, + "grad_norm": 3.265625, + "grad_norm_var": 0.032469685872395834, + "learning_rate": 0.0001, + "loss": 5.4988, + "loss/crossentropy": 2.5313034057617188, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15612319856882095, + "step": 26666 + }, + { + "epoch": 0.833375, + "grad_norm": 3.140625, + "grad_norm_var": 0.03329671223958333, + "learning_rate": 0.0001, + "loss": 5.6757, + "loss/crossentropy": 2.530162215232849, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16651113331317902, + "step": 26668 + }, + { + "epoch": 0.8334375, + "grad_norm": 3.1875, + "grad_norm_var": 0.02779541015625, + "learning_rate": 0.0001, + "loss": 5.733, + "loss/crossentropy": 2.65896737575531, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16443108767271042, + "step": 26670 + }, + { + "epoch": 0.8335, + "grad_norm": 3.171875, + "grad_norm_var": 0.028083292643229167, + "learning_rate": 0.0001, + "loss": 5.6405, + "loss/crossentropy": 2.536733031272888, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16584675759077072, + "step": 26672 + }, + { + "epoch": 0.8335625, + "grad_norm": 3.21875, + "grad_norm_var": 0.03177083333333333, + "learning_rate": 0.0001, + "loss": 5.5154, + "loss/crossentropy": 2.423623204231262, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.15957237780094147, + "step": 26674 + }, + { + "epoch": 0.833625, + "grad_norm": 3.0, + "grad_norm_var": 0.028706868489583332, + "learning_rate": 0.0001, + "loss": 5.5164, + "loss/crossentropy": 2.5392919778823853, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15708287060260773, + "step": 26676 + }, + { + "epoch": 0.8336875, + "grad_norm": 3.25, + "grad_norm_var": 0.0214752197265625, + "learning_rate": 0.0001, + "loss": 5.4774, + "loss/crossentropy": 2.5430959463119507, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.14967641979455948, + "step": 26678 + }, + { + "epoch": 0.83375, + "grad_norm": 3.515625, + "grad_norm_var": 0.03196512858072917, + "learning_rate": 0.0001, + "loss": 5.4983, + "loss/crossentropy": 2.4303911924362183, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16030970215797424, + "step": 26680 + }, + { + "epoch": 0.8338125, + "grad_norm": 2.921875, + "grad_norm_var": 0.028400675455729166, + "learning_rate": 0.0001, + "loss": 5.5658, + "loss/crossentropy": 2.545443296432495, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1594526320695877, + "step": 26682 + }, + { + "epoch": 0.833875, + "grad_norm": 3.734375, + "grad_norm_var": 0.058268229166666664, + "learning_rate": 0.0001, + "loss": 5.4435, + "loss/crossentropy": 2.3132035732269287, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16458793729543686, + "step": 26684 + }, + { + "epoch": 0.8339375, + "grad_norm": 2.984375, + "grad_norm_var": 0.062174479166666664, + "learning_rate": 0.0001, + "loss": 5.279, + "loss/crossentropy": 2.3491846323013306, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.14766483008861542, + "step": 26686 + }, + { + "epoch": 0.834, + "grad_norm": 2.953125, + "grad_norm_var": 0.0600738525390625, + "learning_rate": 0.0001, + "loss": 5.3633, + "loss/crossentropy": 2.4419000148773193, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15151993930339813, + "step": 26688 + }, + { + "epoch": 0.8340625, + "grad_norm": 2.796875, + "grad_norm_var": 0.06399332682291667, + "learning_rate": 0.0001, + "loss": 5.5113, + "loss/crossentropy": 2.4870275259017944, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15945614874362946, + "step": 26690 + }, + { + "epoch": 0.834125, + "grad_norm": 2.640625, + "grad_norm_var": 0.0761383056640625, + "learning_rate": 0.0001, + "loss": 5.4755, + "loss/crossentropy": 2.4982056617736816, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15475623309612274, + "step": 26692 + }, + { + "epoch": 0.8341875, + "grad_norm": 3.09375, + "grad_norm_var": 0.07292378743489583, + "learning_rate": 0.0001, + "loss": 5.7732, + "loss/crossentropy": 2.6511059999465942, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16767705231904984, + "step": 26694 + }, + { + "epoch": 0.83425, + "grad_norm": 3.375, + "grad_norm_var": 0.06965230305989584, + "learning_rate": 0.0001, + "loss": 5.3915, + "loss/crossentropy": 2.3754031658172607, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15668748319149017, + "step": 26696 + }, + { + "epoch": 0.8343125, + "grad_norm": 2.6875, + "grad_norm_var": 0.07779541015625, + "learning_rate": 0.0001, + "loss": 5.2696, + "loss/crossentropy": 2.4492520093917847, + "loss/hidden": 1.375, + "loss/jsd": 0.0, + "loss/logits": 0.1445300504565239, + "step": 26698 + }, + { + "epoch": 0.834375, + "grad_norm": 3.125, + "grad_norm_var": 0.04003499348958333, + "learning_rate": 0.0001, + "loss": 5.5888, + "loss/crossentropy": 2.5078903436660767, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1604384109377861, + "step": 26700 + }, + { + "epoch": 0.8344375, + "grad_norm": 3.0625, + "grad_norm_var": 0.0409088134765625, + "learning_rate": 0.0001, + "loss": 5.5608, + "loss/crossentropy": 2.512549877166748, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15989995747804642, + "step": 26702 + }, + { + "epoch": 0.8345, + "grad_norm": 3.078125, + "grad_norm_var": 0.044432576497395834, + "learning_rate": 0.0001, + "loss": 5.9121, + "loss/crossentropy": 2.7601088285446167, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16832401603460312, + "step": 26704 + }, + { + "epoch": 0.8345625, + "grad_norm": 2.9375, + "grad_norm_var": 0.03886617024739583, + "learning_rate": 0.0001, + "loss": 5.6269, + "loss/crossentropy": 2.5564210414886475, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16329863667488098, + "step": 26706 + }, + { + "epoch": 0.834625, + "grad_norm": 3.09375, + "grad_norm_var": 0.031689453125, + "learning_rate": 0.0001, + "loss": 5.4439, + "loss/crossentropy": 2.488960862159729, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15096069872379303, + "step": 26708 + }, + { + "epoch": 0.8346875, + "grad_norm": 2.96875, + "grad_norm_var": 0.030402628580729167, + "learning_rate": 0.0001, + "loss": 5.5835, + "loss/crossentropy": 2.5353397130966187, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16067619621753693, + "step": 26710 + }, + { + "epoch": 0.83475, + "grad_norm": 3.09375, + "grad_norm_var": 0.05784403483072917, + "learning_rate": 0.0001, + "loss": 5.7989, + "loss/crossentropy": 2.5801597833633423, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17226530611515045, + "step": 26712 + }, + { + "epoch": 0.8348125, + "grad_norm": 3.234375, + "grad_norm_var": 0.04716796875, + "learning_rate": 0.0001, + "loss": 5.8157, + "loss/crossentropy": 2.6295214891433716, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1682291179895401, + "step": 26714 + }, + { + "epoch": 0.834875, + "grad_norm": 3.109375, + "grad_norm_var": 0.044091796875, + "learning_rate": 0.0001, + "loss": 5.8039, + "loss/crossentropy": 2.604735016822815, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17030832171440125, + "step": 26716 + }, + { + "epoch": 0.8349375, + "grad_norm": 3.09375, + "grad_norm_var": 0.03968098958333333, + "learning_rate": 0.0001, + "loss": 5.8926, + "loss/crossentropy": 2.7313610315322876, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1692444309592247, + "step": 26718 + }, + { + "epoch": 0.835, + "grad_norm": 2.765625, + "grad_norm_var": 0.04872945149739583, + "learning_rate": 0.0001, + "loss": 5.307, + "loss/crossentropy": 2.4017326831817627, + "loss/hidden": 1.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.1522488072514534, + "step": 26720 + }, + { + "epoch": 0.8350625, + "grad_norm": 3.171875, + "grad_norm_var": 0.047606404622395834, + "learning_rate": 0.0001, + "loss": 5.6059, + "loss/crossentropy": 2.5100516080856323, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16544535756111145, + "step": 26722 + }, + { + "epoch": 0.835125, + "grad_norm": 2.859375, + "grad_norm_var": 0.052473958333333334, + "learning_rate": 0.0001, + "loss": 5.5598, + "loss/crossentropy": 2.4760489463806152, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1626765951514244, + "step": 26724 + }, + { + "epoch": 0.8351875, + "grad_norm": 2.953125, + "grad_norm_var": 0.05281473795572917, + "learning_rate": 0.0001, + "loss": 5.3427, + "loss/crossentropy": 2.367329955101013, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15691465139389038, + "step": 26726 + }, + { + "epoch": 0.83525, + "grad_norm": 2.859375, + "grad_norm_var": 0.022587076822916666, + "learning_rate": 0.0001, + "loss": 5.3456, + "loss/crossentropy": 2.42109215259552, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15026235580444336, + "step": 26728 + }, + { + "epoch": 0.8353125, + "grad_norm": 2.875, + "grad_norm_var": 0.026366170247395834, + "learning_rate": 0.0001, + "loss": 5.4864, + "loss/crossentropy": 2.4277414083480835, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16212007403373718, + "step": 26730 + }, + { + "epoch": 0.835375, + "grad_norm": 3.046875, + "grad_norm_var": 0.026204427083333332, + "learning_rate": 0.0001, + "loss": 5.8032, + "loss/crossentropy": 2.7227463722229004, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16390687227249146, + "step": 26732 + }, + { + "epoch": 0.8354375, + "grad_norm": 3.859375, + "grad_norm_var": 0.0723785400390625, + "learning_rate": 0.0001, + "loss": 5.6879, + "loss/crossentropy": 2.5874462127685547, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16473715007305145, + "step": 26734 + }, + { + "epoch": 0.8355, + "grad_norm": 3.140625, + "grad_norm_var": 0.06676025390625, + "learning_rate": 0.0001, + "loss": 5.6737, + "loss/crossentropy": 2.6122844219207764, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15809710323810577, + "step": 26736 + }, + { + "epoch": 0.8355625, + "grad_norm": 2.9375, + "grad_norm_var": 0.06702067057291666, + "learning_rate": 0.0001, + "loss": 5.4637, + "loss/crossentropy": 2.457230806350708, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15767929702997208, + "step": 26738 + }, + { + "epoch": 0.835625, + "grad_norm": 3.03125, + "grad_norm_var": 0.063623046875, + "learning_rate": 0.0001, + "loss": 5.6956, + "loss/crossentropy": 2.612622022628784, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16299007087945938, + "step": 26740 + }, + { + "epoch": 0.8356875, + "grad_norm": 2.890625, + "grad_norm_var": 0.06609598795572917, + "learning_rate": 0.0001, + "loss": 5.9292, + "loss/crossentropy": 2.7688095569610596, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16916365921497345, + "step": 26742 + }, + { + "epoch": 0.83575, + "grad_norm": 3.0, + "grad_norm_var": 0.06223856608072917, + "learning_rate": 0.0001, + "loss": 5.5947, + "loss/crossentropy": 2.5085628032684326, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.15939905494451523, + "step": 26744 + }, + { + "epoch": 0.8358125, + "grad_norm": 2.96875, + "grad_norm_var": 0.05424702962239583, + "learning_rate": 0.0001, + "loss": 5.4353, + "loss/crossentropy": 2.3988637924194336, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15637993812561035, + "step": 26746 + }, + { + "epoch": 0.835875, + "grad_norm": 2.984375, + "grad_norm_var": 0.05304361979166667, + "learning_rate": 0.0001, + "loss": 5.4795, + "loss/crossentropy": 2.4259506464004517, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16160430014133453, + "step": 26748 + }, + { + "epoch": 0.8359375, + "grad_norm": 3.453125, + "grad_norm_var": 0.0195465087890625, + "learning_rate": 0.0001, + "loss": 5.7092, + "loss/crossentropy": 2.5474666357040405, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17007629573345184, + "step": 26750 + }, + { + "epoch": 0.836, + "grad_norm": 3.234375, + "grad_norm_var": 0.024787394205729167, + "learning_rate": 0.0001, + "loss": 5.5902, + "loss/crossentropy": 2.5298532247543335, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.15642627328634262, + "step": 26752 + }, + { + "epoch": 0.8360625, + "grad_norm": 3.09375, + "grad_norm_var": 0.0242584228515625, + "learning_rate": 0.0001, + "loss": 5.6481, + "loss/crossentropy": 2.5728485584259033, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15987379103899002, + "step": 26754 + }, + { + "epoch": 0.836125, + "grad_norm": 3.140625, + "grad_norm_var": 0.025223795572916666, + "learning_rate": 0.0001, + "loss": 5.5884, + "loss/crossentropy": 2.4491742849349976, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1627497524023056, + "step": 26756 + }, + { + "epoch": 0.8361875, + "grad_norm": 3.015625, + "grad_norm_var": 0.021857706705729167, + "learning_rate": 0.0001, + "loss": 5.7338, + "loss/crossentropy": 2.59290087223053, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16643692553043365, + "step": 26758 + }, + { + "epoch": 0.83625, + "grad_norm": 3.21875, + "grad_norm_var": 0.021773274739583334, + "learning_rate": 0.0001, + "loss": 5.6905, + "loss/crossentropy": 2.5556634664535522, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16387174278497696, + "step": 26760 + }, + { + "epoch": 0.8363125, + "grad_norm": 2.859375, + "grad_norm_var": 0.023824055989583332, + "learning_rate": 0.0001, + "loss": 5.83, + "loss/crossentropy": 2.696183443069458, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16533113270998, + "step": 26762 + }, + { + "epoch": 0.836375, + "grad_norm": 2.703125, + "grad_norm_var": 0.03271077473958333, + "learning_rate": 0.0001, + "loss": 5.1793, + "loss/crossentropy": 2.3029249906539917, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.14701150357723236, + "step": 26764 + }, + { + "epoch": 0.8364375, + "grad_norm": 3.21875, + "grad_norm_var": 0.02427978515625, + "learning_rate": 0.0001, + "loss": 5.9573, + "loss/crossentropy": 2.7467890977859497, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1733977273106575, + "step": 26766 + }, + { + "epoch": 0.8365, + "grad_norm": 3.0625, + "grad_norm_var": 0.021695963541666665, + "learning_rate": 0.0001, + "loss": 5.5861, + "loss/crossentropy": 2.5231176614761353, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15981249511241913, + "step": 26768 + }, + { + "epoch": 0.8365625, + "grad_norm": 6.875, + "grad_norm_var": 0.9330800374348959, + "learning_rate": 0.0001, + "loss": 5.8358, + "loss/crossentropy": 2.6720504760742188, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17027679085731506, + "step": 26770 + }, + { + "epoch": 0.836625, + "grad_norm": 3.0625, + "grad_norm_var": 0.94205322265625, + "learning_rate": 0.0001, + "loss": 5.6374, + "loss/crossentropy": 2.564761996269226, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16429024934768677, + "step": 26772 + }, + { + "epoch": 0.8366875, + "grad_norm": 3.28125, + "grad_norm_var": 0.9464192708333333, + "learning_rate": 0.0001, + "loss": 5.9044, + "loss/crossentropy": 2.7284083366394043, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17033323645591736, + "step": 26774 + }, + { + "epoch": 0.83675, + "grad_norm": 3.0, + "grad_norm_var": 0.9486979166666667, + "learning_rate": 0.0001, + "loss": 5.6843, + "loss/crossentropy": 2.6526867151260376, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1578490436077118, + "step": 26776 + }, + { + "epoch": 0.8368125, + "grad_norm": 3.0625, + "grad_norm_var": 0.9392161051432292, + "learning_rate": 0.0001, + "loss": 5.5223, + "loss/crossentropy": 2.4756247997283936, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15740208327770233, + "step": 26778 + }, + { + "epoch": 0.836875, + "grad_norm": 3.203125, + "grad_norm_var": 0.9165201822916667, + "learning_rate": 0.0001, + "loss": 5.5949, + "loss/crossentropy": 2.4976896047592163, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1651892215013504, + "step": 26780 + }, + { + "epoch": 0.8369375, + "grad_norm": 3.328125, + "grad_norm_var": 0.934912109375, + "learning_rate": 0.0001, + "loss": 5.6704, + "loss/crossentropy": 2.551721930503845, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16460269689559937, + "step": 26782 + }, + { + "epoch": 0.837, + "grad_norm": 3.265625, + "grad_norm_var": 0.9153472900390625, + "learning_rate": 0.0001, + "loss": 6.0132, + "loss/crossentropy": 2.706874132156372, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.18141163140535355, + "step": 26784 + }, + { + "epoch": 0.8370625, + "grad_norm": 3.015625, + "grad_norm_var": 0.038939412434895834, + "learning_rate": 0.0001, + "loss": 5.415, + "loss/crossentropy": 2.47431743144989, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.14953739196062088, + "step": 26786 + }, + { + "epoch": 0.837125, + "grad_norm": 2.984375, + "grad_norm_var": 0.0384674072265625, + "learning_rate": 0.0001, + "loss": 5.6568, + "loss/crossentropy": 2.5297670364379883, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16738921403884888, + "step": 26788 + }, + { + "epoch": 0.8371875, + "grad_norm": 2.921875, + "grad_norm_var": 0.03527018229166667, + "learning_rate": 0.0001, + "loss": 5.4379, + "loss/crossentropy": 2.397291421890259, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15875116735696793, + "step": 26790 + }, + { + "epoch": 0.83725, + "grad_norm": 2.890625, + "grad_norm_var": 0.03779296875, + "learning_rate": 0.0001, + "loss": 5.5707, + "loss/crossentropy": 2.5675047636032104, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1577436849474907, + "step": 26792 + }, + { + "epoch": 0.8373125, + "grad_norm": 2.96875, + "grad_norm_var": 0.03433837890625, + "learning_rate": 0.0001, + "loss": 5.673, + "loss/crossentropy": 2.5663158893585205, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16614055633544922, + "step": 26794 + }, + { + "epoch": 0.837375, + "grad_norm": 3.109375, + "grad_norm_var": 0.03267822265625, + "learning_rate": 0.0001, + "loss": 5.9705, + "loss/crossentropy": 2.836636543273926, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16690289974212646, + "step": 26796 + }, + { + "epoch": 0.8374375, + "grad_norm": 3.265625, + "grad_norm_var": 0.02998046875, + "learning_rate": 0.0001, + "loss": 5.7814, + "loss/crossentropy": 2.54093074798584, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1717006117105484, + "step": 26798 + }, + { + "epoch": 0.8375, + "grad_norm": 3.09375, + "grad_norm_var": 0.02125244140625, + "learning_rate": 0.0001, + "loss": 5.5185, + "loss/crossentropy": 2.5173208713531494, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.15090397745370865, + "step": 26800 + }, + { + "epoch": 0.8375625, + "grad_norm": 3.3125, + "grad_norm_var": 0.022370402018229166, + "learning_rate": 0.0001, + "loss": 5.5954, + "loss/crossentropy": 2.535297393798828, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16264691948890686, + "step": 26802 + }, + { + "epoch": 0.837625, + "grad_norm": 3.453125, + "grad_norm_var": 0.032373046875, + "learning_rate": 0.0001, + "loss": 5.8944, + "loss/crossentropy": 2.706671714782715, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16876784712076187, + "step": 26804 + }, + { + "epoch": 0.8376875, + "grad_norm": 2.84375, + "grad_norm_var": 0.042313639322916666, + "learning_rate": 0.0001, + "loss": 5.1955, + "loss/crossentropy": 2.3233484029769897, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1434631049633026, + "step": 26806 + }, + { + "epoch": 0.83775, + "grad_norm": 2.875, + "grad_norm_var": 0.0427398681640625, + "learning_rate": 0.0001, + "loss": 5.6275, + "loss/crossentropy": 2.5967692136764526, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16049602627754211, + "step": 26808 + }, + { + "epoch": 0.8378125, + "grad_norm": 3.59375, + "grad_norm_var": 0.058771769205729164, + "learning_rate": 0.0001, + "loss": 5.9516, + "loss/crossentropy": 2.7301191091537476, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.170590378344059, + "step": 26810 + }, + { + "epoch": 0.837875, + "grad_norm": 3.046875, + "grad_norm_var": 0.061909993489583336, + "learning_rate": 0.0001, + "loss": 5.1865, + "loss/crossentropy": 2.310048818588257, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.14702502638101578, + "step": 26812 + }, + { + "epoch": 0.8379375, + "grad_norm": 2.75, + "grad_norm_var": 0.06623942057291667, + "learning_rate": 0.0001, + "loss": 5.2667, + "loss/crossentropy": 2.4080445766448975, + "loss/hidden": 1.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.14914533495903015, + "step": 26814 + }, + { + "epoch": 0.838, + "grad_norm": 3.015625, + "grad_norm_var": 0.06544596354166667, + "learning_rate": 0.0001, + "loss": 5.9025, + "loss/crossentropy": 2.729759693145752, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17001153528690338, + "step": 26816 + }, + { + "epoch": 0.8380625, + "grad_norm": 3.296875, + "grad_norm_var": 0.0635406494140625, + "learning_rate": 0.0001, + "loss": 5.3389, + "loss/crossentropy": 2.3518441915512085, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15378347784280777, + "step": 26818 + }, + { + "epoch": 0.838125, + "grad_norm": 4.84375, + "grad_norm_var": 0.2645467122395833, + "learning_rate": 0.0001, + "loss": 5.9562, + "loss/crossentropy": 2.596237540245056, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18521567434072495, + "step": 26820 + }, + { + "epoch": 0.8381875, + "grad_norm": 2.890625, + "grad_norm_var": 0.2518056233723958, + "learning_rate": 0.0001, + "loss": 5.4601, + "loss/crossentropy": 2.451179027557373, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15519140660762787, + "step": 26822 + }, + { + "epoch": 0.83825, + "grad_norm": 2.90625, + "grad_norm_var": 0.25244038899739585, + "learning_rate": 0.0001, + "loss": 5.4229, + "loss/crossentropy": 2.4421818256378174, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15002215653657913, + "step": 26824 + }, + { + "epoch": 0.8383125, + "grad_norm": 2.859375, + "grad_norm_var": 0.24560445149739582, + "learning_rate": 0.0001, + "loss": 5.4341, + "loss/crossentropy": 2.4391945600509644, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1526143178343773, + "step": 26826 + }, + { + "epoch": 0.838375, + "grad_norm": 3.34375, + "grad_norm_var": 0.24294331868489583, + "learning_rate": 0.0001, + "loss": 5.8242, + "loss/crossentropy": 2.630326747894287, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17290284484624863, + "step": 26828 + }, + { + "epoch": 0.8384375, + "grad_norm": 3.953125, + "grad_norm_var": 0.24970296223958333, + "learning_rate": 0.0001, + "loss": 5.9347, + "loss/crossentropy": 2.5525777339935303, + "loss/hidden": 1.546875, + "loss/jsd": 0.0, + "loss/logits": 0.18352903425693512, + "step": 26830 + }, + { + "epoch": 0.8385, + "grad_norm": 2.90625, + "grad_norm_var": 0.28645731608072916, + "learning_rate": 0.0001, + "loss": 5.9471, + "loss/crossentropy": 2.721782088279724, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17682532221078873, + "step": 26832 + }, + { + "epoch": 0.8385625, + "grad_norm": 3.0, + "grad_norm_var": 0.30128580729166665, + "learning_rate": 0.0001, + "loss": 5.5491, + "loss/crossentropy": 2.541784167289734, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15502379834651947, + "step": 26834 + }, + { + "epoch": 0.838625, + "grad_norm": 3.3125, + "grad_norm_var": 0.15608622233072916, + "learning_rate": 0.0001, + "loss": 5.9573, + "loss/crossentropy": 2.6573015451431274, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.17570366710424423, + "step": 26836 + }, + { + "epoch": 0.8386875, + "grad_norm": 3.078125, + "grad_norm_var": 0.14849344889322916, + "learning_rate": 0.0001, + "loss": 5.4387, + "loss/crossentropy": 2.422448754310608, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15670091658830643, + "step": 26838 + }, + { + "epoch": 0.83875, + "grad_norm": 3.078125, + "grad_norm_var": 0.14223531087239583, + "learning_rate": 0.0001, + "loss": 5.3166, + "loss/crossentropy": 2.348435401916504, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1534605398774147, + "step": 26840 + }, + { + "epoch": 0.8388125, + "grad_norm": 2.96875, + "grad_norm_var": 0.139013671875, + "learning_rate": 0.0001, + "loss": 5.2335, + "loss/crossentropy": 2.250566601753235, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15376639366149902, + "step": 26842 + }, + { + "epoch": 0.838875, + "grad_norm": 3.59375, + "grad_norm_var": 0.14752197265625, + "learning_rate": 0.0001, + "loss": 5.7326, + "loss/crossentropy": 2.6011509895324707, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16470693796873093, + "step": 26844 + }, + { + "epoch": 0.8389375, + "grad_norm": 3.234375, + "grad_norm_var": 0.14312744140625, + "learning_rate": 0.0001, + "loss": 5.6885, + "loss/crossentropy": 2.4735031127929688, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17344865947961807, + "step": 26846 + }, + { + "epoch": 0.839, + "grad_norm": 3.0, + "grad_norm_var": 0.09752197265625, + "learning_rate": 0.0001, + "loss": 5.5867, + "loss/crossentropy": 2.5408254861831665, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16083333641290665, + "step": 26848 + }, + { + "epoch": 0.8390625, + "grad_norm": 2.890625, + "grad_norm_var": 0.10227457682291667, + "learning_rate": 0.0001, + "loss": 5.4898, + "loss/crossentropy": 2.5737812519073486, + "loss/hidden": 1.390625, + "loss/jsd": 0.0, + "loss/logits": 0.15253794938325882, + "step": 26850 + }, + { + "epoch": 0.839125, + "grad_norm": 3.296875, + "grad_norm_var": 0.07851155598958333, + "learning_rate": 0.0001, + "loss": 5.6971, + "loss/crossentropy": 2.6061235666275024, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16300751268863678, + "step": 26852 + }, + { + "epoch": 0.8391875, + "grad_norm": 3.140625, + "grad_norm_var": 0.07815348307291667, + "learning_rate": 0.0001, + "loss": 5.5509, + "loss/crossentropy": 2.4616470336914062, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16009312123060226, + "step": 26854 + }, + { + "epoch": 0.83925, + "grad_norm": 2.875, + "grad_norm_var": 0.08430074055989584, + "learning_rate": 0.0001, + "loss": 5.688, + "loss/crossentropy": 2.626276135444641, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1600799262523651, + "step": 26856 + }, + { + "epoch": 0.8393125, + "grad_norm": 3.046875, + "grad_norm_var": 0.08238016764322917, + "learning_rate": 0.0001, + "loss": 5.647, + "loss/crossentropy": 2.580414891242981, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16095992922782898, + "step": 26858 + }, + { + "epoch": 0.839375, + "grad_norm": 3.40625, + "grad_norm_var": 0.07401936848958333, + "learning_rate": 0.0001, + "loss": 5.7336, + "loss/crossentropy": 2.6484148502349854, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16242486238479614, + "step": 26860 + }, + { + "epoch": 0.8394375, + "grad_norm": 3.078125, + "grad_norm_var": 0.0338531494140625, + "learning_rate": 0.0001, + "loss": 5.7906, + "loss/crossentropy": 2.6796486377716064, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16500383615493774, + "step": 26862 + }, + { + "epoch": 0.8395, + "grad_norm": 3.0, + "grad_norm_var": 0.9651194254557292, + "learning_rate": 0.0001, + "loss": 5.2826, + "loss/crossentropy": 2.179580330848694, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.1591331586241722, + "step": 26864 + }, + { + "epoch": 0.8395625, + "grad_norm": 2.796875, + "grad_norm_var": 0.9588826497395834, + "learning_rate": 0.0001, + "loss": 5.5379, + "loss/crossentropy": 2.5491769313812256, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15316757559776306, + "step": 26866 + }, + { + "epoch": 0.839625, + "grad_norm": 2.890625, + "grad_norm_var": 0.9625, + "learning_rate": 0.0001, + "loss": 5.6079, + "loss/crossentropy": 2.5456382036209106, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16208944469690323, + "step": 26868 + }, + { + "epoch": 0.8396875, + "grad_norm": 3.125, + "grad_norm_var": 0.9618804931640625, + "learning_rate": 0.0001, + "loss": 5.6729, + "loss/crossentropy": 2.5196443796157837, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16493035852909088, + "step": 26870 + }, + { + "epoch": 0.83975, + "grad_norm": 2.90625, + "grad_norm_var": 0.9611328125, + "learning_rate": 0.0001, + "loss": 5.7829, + "loss/crossentropy": 2.6576212644577026, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16565701365470886, + "step": 26872 + }, + { + "epoch": 0.8398125, + "grad_norm": 3.15625, + "grad_norm_var": 0.9554921468098958, + "learning_rate": 0.0001, + "loss": 5.3669, + "loss/crossentropy": 2.3514479398727417, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15544924139976501, + "step": 26874 + }, + { + "epoch": 0.839875, + "grad_norm": 2.84375, + "grad_norm_var": 0.9740234375, + "learning_rate": 0.0001, + "loss": 5.6978, + "loss/crossentropy": 2.684528112411499, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15445105731487274, + "step": 26876 + }, + { + "epoch": 0.8399375, + "grad_norm": 2.90625, + "grad_norm_var": 0.9736887613932291, + "learning_rate": 0.0001, + "loss": 5.7286, + "loss/crossentropy": 2.690067172050476, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15932255238294601, + "step": 26878 + }, + { + "epoch": 0.84, + "grad_norm": 3.0625, + "grad_norm_var": 0.03420817057291667, + "learning_rate": 0.0001, + "loss": 5.3626, + "loss/crossentropy": 2.3932125568389893, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15358121693134308, + "step": 26880 + }, + { + "epoch": 0.8400625, + "grad_norm": 2.9375, + "grad_norm_var": 0.030744425455729165, + "learning_rate": 0.0001, + "loss": 5.8626, + "loss/crossentropy": 2.7045029401779175, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1669803336262703, + "step": 26882 + }, + { + "epoch": 0.840125, + "grad_norm": 3.15625, + "grad_norm_var": 0.024925740559895833, + "learning_rate": 0.0001, + "loss": 5.8938, + "loss/crossentropy": 2.766055107116699, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16316113620996475, + "step": 26884 + }, + { + "epoch": 0.8401875, + "grad_norm": 3.25, + "grad_norm_var": 0.0205718994140625, + "learning_rate": 0.0001, + "loss": 5.4507, + "loss/crossentropy": 2.528176784515381, + "loss/hidden": 1.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.15280170738697052, + "step": 26886 + }, + { + "epoch": 0.84025, + "grad_norm": 3.203125, + "grad_norm_var": 0.0192291259765625, + "learning_rate": 0.0001, + "loss": 5.4286, + "loss/crossentropy": 2.4476903676986694, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15668931603431702, + "step": 26888 + }, + { + "epoch": 0.8403125, + "grad_norm": 3.015625, + "grad_norm_var": 0.03310546875, + "learning_rate": 0.0001, + "loss": 5.9517, + "loss/crossentropy": 2.68823504447937, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17751407623291016, + "step": 26890 + }, + { + "epoch": 0.840375, + "grad_norm": 3.140625, + "grad_norm_var": 0.026688639322916666, + "learning_rate": 0.0001, + "loss": 5.8172, + "loss/crossentropy": 2.668814778327942, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16796687245368958, + "step": 26892 + }, + { + "epoch": 0.8404375, + "grad_norm": 3.0625, + "grad_norm_var": 0.0240142822265625, + "learning_rate": 0.0001, + "loss": 5.6673, + "loss/crossentropy": 2.572339177131653, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16262523084878922, + "step": 26894 + }, + { + "epoch": 0.8405, + "grad_norm": 3.21875, + "grad_norm_var": 0.023485310872395835, + "learning_rate": 0.0001, + "loss": 5.7972, + "loss/crossentropy": 2.6653146743774414, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16436417400836945, + "step": 26896 + }, + { + "epoch": 0.8405625, + "grad_norm": 3.1875, + "grad_norm_var": 0.019624837239583335, + "learning_rate": 0.0001, + "loss": 5.9304, + "loss/crossentropy": 2.7217326164245605, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1732107624411583, + "step": 26898 + }, + { + "epoch": 0.840625, + "grad_norm": 2.984375, + "grad_norm_var": 0.021337890625, + "learning_rate": 0.0001, + "loss": 5.7385, + "loss/crossentropy": 2.6471580266952515, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16382071375846863, + "step": 26900 + }, + { + "epoch": 0.8406875, + "grad_norm": 2.96875, + "grad_norm_var": 0.023395792643229166, + "learning_rate": 0.0001, + "loss": 5.707, + "loss/crossentropy": 2.6310672760009766, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1654074341058731, + "step": 26902 + }, + { + "epoch": 0.84075, + "grad_norm": 3.0625, + "grad_norm_var": 0.023517862955729166, + "learning_rate": 0.0001, + "loss": 5.5822, + "loss/crossentropy": 2.538162112236023, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16025841236114502, + "step": 26904 + }, + { + "epoch": 0.8408125, + "grad_norm": 2.953125, + "grad_norm_var": 0.011994425455729167, + "learning_rate": 0.0001, + "loss": 5.5235, + "loss/crossentropy": 2.5428719520568848, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1558748334646225, + "step": 26906 + }, + { + "epoch": 0.840875, + "grad_norm": 2.75, + "grad_norm_var": 0.017268880208333334, + "learning_rate": 0.0001, + "loss": 5.6886, + "loss/crossentropy": 2.6650702953338623, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15821386873722076, + "step": 26908 + }, + { + "epoch": 0.8409375, + "grad_norm": 2.734375, + "grad_norm_var": 0.03150634765625, + "learning_rate": 0.0001, + "loss": 5.5548, + "loss/crossentropy": 2.48689067363739, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1606953889131546, + "step": 26910 + }, + { + "epoch": 0.841, + "grad_norm": 3.265625, + "grad_norm_var": 0.03775634765625, + "learning_rate": 0.0001, + "loss": 5.8837, + "loss/crossentropy": 2.644471287727356, + "loss/hidden": 1.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.16884031891822815, + "step": 26912 + }, + { + "epoch": 0.8410625, + "grad_norm": 3.09375, + "grad_norm_var": 0.0431793212890625, + "learning_rate": 0.0001, + "loss": 5.7563, + "loss/crossentropy": 2.681451916694641, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.161394402384758, + "step": 26914 + }, + { + "epoch": 0.841125, + "grad_norm": 3.1875, + "grad_norm_var": 0.046971638997395836, + "learning_rate": 0.0001, + "loss": 5.7341, + "loss/crossentropy": 2.6222978830337524, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16625793278217316, + "step": 26916 + }, + { + "epoch": 0.8411875, + "grad_norm": 3.15625, + "grad_norm_var": 0.04518941243489583, + "learning_rate": 0.0001, + "loss": 5.8303, + "loss/crossentropy": 2.6999884843826294, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16577009856700897, + "step": 26918 + }, + { + "epoch": 0.84125, + "grad_norm": 3.15625, + "grad_norm_var": 0.04499409993489583, + "learning_rate": 0.0001, + "loss": 5.4822, + "loss/crossentropy": 2.466330647468567, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15275505185127258, + "step": 26920 + }, + { + "epoch": 0.8413125, + "grad_norm": 2.859375, + "grad_norm_var": 0.048094685872395834, + "learning_rate": 0.0001, + "loss": 5.5703, + "loss/crossentropy": 2.5702513456344604, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15664538741111755, + "step": 26922 + }, + { + "epoch": 0.841375, + "grad_norm": 3.015625, + "grad_norm_var": 0.04749247233072917, + "learning_rate": 0.0001, + "loss": 5.2683, + "loss/crossentropy": 2.370810389518738, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.1499096229672432, + "step": 26924 + }, + { + "epoch": 0.8414375, + "grad_norm": 3.0, + "grad_norm_var": 0.0365875244140625, + "learning_rate": 0.0001, + "loss": 5.2108, + "loss/crossentropy": 2.2897136211395264, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.14679966866970062, + "step": 26926 + }, + { + "epoch": 0.8415, + "grad_norm": 3.265625, + "grad_norm_var": 0.027880859375, + "learning_rate": 0.0001, + "loss": 5.4859, + "loss/crossentropy": 2.413581371307373, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16504260152578354, + "step": 26928 + }, + { + "epoch": 0.8415625, + "grad_norm": 2.90625, + "grad_norm_var": 0.02841796875, + "learning_rate": 0.0001, + "loss": 5.4686, + "loss/crossentropy": 2.5018755197525024, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.15565899014472961, + "step": 26930 + }, + { + "epoch": 0.841625, + "grad_norm": 3.109375, + "grad_norm_var": 0.019401041666666667, + "learning_rate": 0.0001, + "loss": 5.2209, + "loss/crossentropy": 2.2474820613861084, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15281382948160172, + "step": 26932 + }, + { + "epoch": 0.8416875, + "grad_norm": 2.78125, + "grad_norm_var": 0.023726399739583334, + "learning_rate": 0.0001, + "loss": 5.2421, + "loss/crossentropy": 2.306075096130371, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1498563215136528, + "step": 26934 + }, + { + "epoch": 0.84175, + "grad_norm": 3.140625, + "grad_norm_var": 0.024437459309895833, + "learning_rate": 0.0001, + "loss": 5.7416, + "loss/crossentropy": 2.6036221981048584, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1665356457233429, + "step": 26936 + }, + { + "epoch": 0.8418125, + "grad_norm": 3.0, + "grad_norm_var": 0.03424479166666667, + "learning_rate": 0.0001, + "loss": 5.5409, + "loss/crossentropy": 2.4251224994659424, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16508854925632477, + "step": 26938 + }, + { + "epoch": 0.841875, + "grad_norm": 2.96875, + "grad_norm_var": 0.030301920572916665, + "learning_rate": 0.0001, + "loss": 5.7345, + "loss/crossentropy": 2.6435906887054443, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16377675533294678, + "step": 26940 + }, + { + "epoch": 0.8419375, + "grad_norm": 3.8125, + "grad_norm_var": 0.06806538899739584, + "learning_rate": 0.0001, + "loss": 5.6693, + "loss/crossentropy": 2.5307010412216187, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16463905572891235, + "step": 26942 + }, + { + "epoch": 0.842, + "grad_norm": 3.109375, + "grad_norm_var": 0.06562398274739584, + "learning_rate": 0.0001, + "loss": 5.7085, + "loss/crossentropy": 2.601099133491516, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16542357206344604, + "step": 26944 + }, + { + "epoch": 0.8420625, + "grad_norm": 3.1875, + "grad_norm_var": 0.05486653645833333, + "learning_rate": 0.0001, + "loss": 5.8455, + "loss/crossentropy": 2.588975667953491, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17565176635980606, + "step": 26946 + }, + { + "epoch": 0.842125, + "grad_norm": 2.9375, + "grad_norm_var": 0.05418192545572917, + "learning_rate": 0.0001, + "loss": 5.7066, + "loss/crossentropy": 2.621884822845459, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16550437361001968, + "step": 26948 + }, + { + "epoch": 0.8421875, + "grad_norm": 3.265625, + "grad_norm_var": 0.04524637858072917, + "learning_rate": 0.0001, + "loss": 5.706, + "loss/crossentropy": 2.552005410194397, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16695862263441086, + "step": 26950 + }, + { + "epoch": 0.84225, + "grad_norm": 2.890625, + "grad_norm_var": 0.053609212239583336, + "learning_rate": 0.0001, + "loss": 5.6121, + "loss/crossentropy": 2.542546510696411, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.16555316001176834, + "step": 26952 + }, + { + "epoch": 0.8423125, + "grad_norm": 3.609375, + "grad_norm_var": 0.05915425618489583, + "learning_rate": 0.0001, + "loss": 5.7437, + "loss/crossentropy": 2.591462731361389, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.17030170559883118, + "step": 26954 + }, + { + "epoch": 0.842375, + "grad_norm": 2.84375, + "grad_norm_var": 0.06367085774739584, + "learning_rate": 0.0001, + "loss": 5.5539, + "loss/crossentropy": 2.514713168144226, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16016453504562378, + "step": 26956 + }, + { + "epoch": 0.8424375, + "grad_norm": 3.1875, + "grad_norm_var": 0.03808186848958333, + "learning_rate": 0.0001, + "loss": 5.8469, + "loss/crossentropy": 2.7059032917022705, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16526933759450912, + "step": 26958 + }, + { + "epoch": 0.8425, + "grad_norm": 3.0, + "grad_norm_var": 0.0410552978515625, + "learning_rate": 0.0001, + "loss": 5.1531, + "loss/crossentropy": 2.266539216041565, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.14725393056869507, + "step": 26960 + }, + { + "epoch": 0.8425625, + "grad_norm": 3.234375, + "grad_norm_var": 0.040087890625, + "learning_rate": 0.0001, + "loss": 5.8384, + "loss/crossentropy": 2.7500157356262207, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1635221764445305, + "step": 26962 + }, + { + "epoch": 0.842625, + "grad_norm": 3.171875, + "grad_norm_var": 0.038069661458333334, + "learning_rate": 0.0001, + "loss": 5.4276, + "loss/crossentropy": 2.4385040998458862, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15086626261472702, + "step": 26964 + }, + { + "epoch": 0.8426875, + "grad_norm": 3.265625, + "grad_norm_var": 0.04104410807291667, + "learning_rate": 0.0001, + "loss": 5.4244, + "loss/crossentropy": 2.385402202606201, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.156633660197258, + "step": 26966 + }, + { + "epoch": 0.84275, + "grad_norm": 2.890625, + "grad_norm_var": 0.03535868326822917, + "learning_rate": 0.0001, + "loss": 5.5479, + "loss/crossentropy": 2.523975968360901, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1598101630806923, + "step": 26968 + }, + { + "epoch": 0.8428125, + "grad_norm": 3.265625, + "grad_norm_var": 0.019383748372395832, + "learning_rate": 0.0001, + "loss": 5.4475, + "loss/crossentropy": 2.3999253511428833, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15866780281066895, + "step": 26970 + }, + { + "epoch": 0.842875, + "grad_norm": 3.546875, + "grad_norm_var": 0.06405843098958333, + "learning_rate": 0.0001, + "loss": 6.2068, + "loss/crossentropy": 2.8445491790771484, + "loss/hidden": 1.54296875, + "loss/jsd": 0.0, + "loss/logits": 0.18193239718675613, + "step": 26972 + }, + { + "epoch": 0.8429375, + "grad_norm": 3.0, + "grad_norm_var": 0.07138671875, + "learning_rate": 0.0001, + "loss": 5.268, + "loss/crossentropy": 2.3104175329208374, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15357424318790436, + "step": 26974 + }, + { + "epoch": 0.843, + "grad_norm": 3.109375, + "grad_norm_var": 0.07037353515625, + "learning_rate": 0.0001, + "loss": 5.7229, + "loss/crossentropy": 2.6200207471847534, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16497711837291718, + "step": 26976 + }, + { + "epoch": 0.8430625, + "grad_norm": 3.28125, + "grad_norm_var": 0.07506103515625, + "learning_rate": 0.0001, + "loss": 5.5779, + "loss/crossentropy": 2.4063092470169067, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.16637791693210602, + "step": 26978 + }, + { + "epoch": 0.843125, + "grad_norm": 3.015625, + "grad_norm_var": 0.07935791015625, + "learning_rate": 0.0001, + "loss": 5.3624, + "loss/crossentropy": 2.3784230947494507, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15543343126773834, + "step": 26980 + }, + { + "epoch": 0.8431875, + "grad_norm": 2.90625, + "grad_norm_var": 0.07746988932291667, + "learning_rate": 0.0001, + "loss": 5.5087, + "loss/crossentropy": 2.4464324712753296, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1632547602057457, + "step": 26982 + }, + { + "epoch": 0.84325, + "grad_norm": 3.296875, + "grad_norm_var": 0.07587788899739584, + "learning_rate": 0.0001, + "loss": 5.9727, + "loss/crossentropy": 2.7504212856292725, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17261478304862976, + "step": 26984 + }, + { + "epoch": 0.8433125, + "grad_norm": 2.765625, + "grad_norm_var": 0.08612874348958334, + "learning_rate": 0.0001, + "loss": 5.4549, + "loss/crossentropy": 2.4451680183410645, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15683747082948685, + "step": 26986 + }, + { + "epoch": 0.843375, + "grad_norm": 2.984375, + "grad_norm_var": 0.044041951497395836, + "learning_rate": 0.0001, + "loss": 5.6748, + "loss/crossentropy": 2.5268555879592896, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16791684925556183, + "step": 26988 + }, + { + "epoch": 0.8434375, + "grad_norm": 3.171875, + "grad_norm_var": 0.041341145833333336, + "learning_rate": 0.0001, + "loss": 5.5076, + "loss/crossentropy": 2.501393675804138, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1564769521355629, + "step": 26990 + }, + { + "epoch": 0.8435, + "grad_norm": 3.0625, + "grad_norm_var": 0.04091695149739583, + "learning_rate": 0.0001, + "loss": 5.4816, + "loss/crossentropy": 2.4734781980514526, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1523718759417534, + "step": 26992 + }, + { + "epoch": 0.8435625, + "grad_norm": 3.453125, + "grad_norm_var": 0.03901265462239583, + "learning_rate": 0.0001, + "loss": 5.9334, + "loss/crossentropy": 2.6941498517990112, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.17236074060201645, + "step": 26994 + }, + { + "epoch": 0.843625, + "grad_norm": 2.9375, + "grad_norm_var": 0.040013631184895836, + "learning_rate": 0.0001, + "loss": 5.3828, + "loss/crossentropy": 2.34332537651062, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15667743980884552, + "step": 26996 + }, + { + "epoch": 0.8436875, + "grad_norm": 3.4375, + "grad_norm_var": 0.05001627604166667, + "learning_rate": 0.0001, + "loss": 5.437, + "loss/crossentropy": 2.4447511434555054, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15665173530578613, + "step": 26998 + }, + { + "epoch": 0.84375, + "grad_norm": 3.984375, + "grad_norm_var": 0.0950347900390625, + "learning_rate": 0.0001, + "loss": 6.0429, + "loss/crossentropy": 2.723562240600586, + "loss/hidden": 1.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.1795891523361206, + "step": 27000 + }, + { + "epoch": 0.8438125, + "grad_norm": 3.09375, + "grad_norm_var": 0.08463134765625, + "learning_rate": 0.0001, + "loss": 5.2803, + "loss/crossentropy": 2.301753044128418, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1540997475385666, + "step": 27002 + }, + { + "epoch": 0.843875, + "grad_norm": 3.1875, + "grad_norm_var": 0.077197265625, + "learning_rate": 0.0001, + "loss": 5.842, + "loss/crossentropy": 2.7305915355682373, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1615283042192459, + "step": 27004 + }, + { + "epoch": 0.8439375, + "grad_norm": 3.59375, + "grad_norm_var": 0.08388264973958333, + "learning_rate": 0.0001, + "loss": 5.5253, + "loss/crossentropy": 2.4625903367996216, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16213227808475494, + "step": 27006 + }, + { + "epoch": 0.844, + "grad_norm": 2.828125, + "grad_norm_var": 0.08460286458333334, + "learning_rate": 0.0001, + "loss": 5.6316, + "loss/crossentropy": 2.5852612257003784, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16010281443595886, + "step": 27008 + }, + { + "epoch": 0.8440625, + "grad_norm": 3.03125, + "grad_norm_var": 0.08253580729166667, + "learning_rate": 0.0001, + "loss": 5.6831, + "loss/crossentropy": 2.568989872932434, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16336346417665482, + "step": 27010 + }, + { + "epoch": 0.844125, + "grad_norm": 3.234375, + "grad_norm_var": 0.08439839680989583, + "learning_rate": 0.0001, + "loss": 5.5188, + "loss/crossentropy": 2.4843783378601074, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15968778729438782, + "step": 27012 + }, + { + "epoch": 0.8441875, + "grad_norm": 3.53125, + "grad_norm_var": 0.08073628743489583, + "learning_rate": 0.0001, + "loss": 5.8848, + "loss/crossentropy": 2.715808629989624, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17119789123535156, + "step": 27014 + }, + { + "epoch": 0.84425, + "grad_norm": 2.96875, + "grad_norm_var": 0.04158426920572917, + "learning_rate": 0.0001, + "loss": 5.674, + "loss/crossentropy": 2.5980119705200195, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1618957370519638, + "step": 27016 + }, + { + "epoch": 0.8443125, + "grad_norm": 3.125, + "grad_norm_var": 0.046686808268229164, + "learning_rate": 0.0001, + "loss": 5.4329, + "loss/crossentropy": 2.3620887994766235, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16059454530477524, + "step": 27018 + }, + { + "epoch": 0.844375, + "grad_norm": 3.0625, + "grad_norm_var": 0.047053019205729164, + "learning_rate": 0.0001, + "loss": 5.5869, + "loss/crossentropy": 2.491005778312683, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16310331225395203, + "step": 27020 + }, + { + "epoch": 0.8444375, + "grad_norm": 3.03125, + "grad_norm_var": 0.036188761393229164, + "learning_rate": 0.0001, + "loss": 5.7232, + "loss/crossentropy": 2.5387020111083984, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16884002089500427, + "step": 27022 + }, + { + "epoch": 0.8445, + "grad_norm": 3.03125, + "grad_norm_var": 0.03412984212239583, + "learning_rate": 0.0001, + "loss": 5.6325, + "loss/crossentropy": 2.481870174407959, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16623012721538544, + "step": 27024 + }, + { + "epoch": 0.8445625, + "grad_norm": 2.921875, + "grad_norm_var": 0.038060506184895836, + "learning_rate": 0.0001, + "loss": 5.4642, + "loss/crossentropy": 2.429790735244751, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15968939661979675, + "step": 27026 + }, + { + "epoch": 0.844625, + "grad_norm": 3.046875, + "grad_norm_var": 0.0376861572265625, + "learning_rate": 0.0001, + "loss": 5.4997, + "loss/crossentropy": 2.5007166862487793, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15693041682243347, + "step": 27028 + }, + { + "epoch": 0.8446875, + "grad_norm": 4.125, + "grad_norm_var": 0.09543863932291667, + "learning_rate": 0.0001, + "loss": 5.7831, + "loss/crossentropy": 2.61792528629303, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16846979409456253, + "step": 27030 + }, + { + "epoch": 0.84475, + "grad_norm": 2.953125, + "grad_norm_var": 0.0945465087890625, + "learning_rate": 0.0001, + "loss": 5.874, + "loss/crossentropy": 2.761651039123535, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16709166765213013, + "step": 27032 + }, + { + "epoch": 0.8448125, + "grad_norm": 3.28125, + "grad_norm_var": 0.09298502604166667, + "learning_rate": 0.0001, + "loss": 5.5276, + "loss/crossentropy": 2.4779335260391235, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16043298691511154, + "step": 27034 + }, + { + "epoch": 0.844875, + "grad_norm": 2.953125, + "grad_norm_var": 0.09420166015625, + "learning_rate": 0.0001, + "loss": 5.4692, + "loss/crossentropy": 2.4889973402023315, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1550476774573326, + "step": 27036 + }, + { + "epoch": 0.8449375, + "grad_norm": 3.171875, + "grad_norm_var": 0.093798828125, + "learning_rate": 0.0001, + "loss": 5.534, + "loss/crossentropy": 2.4192898273468018, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16263999044895172, + "step": 27038 + }, + { + "epoch": 0.845, + "grad_norm": 3.328125, + "grad_norm_var": 0.09368082682291666, + "learning_rate": 0.0001, + "loss": 5.6515, + "loss/crossentropy": 2.518969416618347, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16715529561042786, + "step": 27040 + }, + { + "epoch": 0.8450625, + "grad_norm": 3.140625, + "grad_norm_var": 0.09722900390625, + "learning_rate": 0.0001, + "loss": 5.5178, + "loss/crossentropy": 2.487750291824341, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15808579325675964, + "step": 27042 + }, + { + "epoch": 0.845125, + "grad_norm": 3.25, + "grad_norm_var": 0.09236653645833333, + "learning_rate": 0.0001, + "loss": 5.8472, + "loss/crossentropy": 2.698338747024536, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1707422360777855, + "step": 27044 + }, + { + "epoch": 0.8451875, + "grad_norm": 2.875, + "grad_norm_var": 0.031168619791666668, + "learning_rate": 0.0001, + "loss": 5.4016, + "loss/crossentropy": 2.3835963010787964, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15570556372404099, + "step": 27046 + }, + { + "epoch": 0.84525, + "grad_norm": 3.03125, + "grad_norm_var": 0.030345662434895834, + "learning_rate": 0.0001, + "loss": 5.6156, + "loss/crossentropy": 2.5234217643737793, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16390936076641083, + "step": 27048 + }, + { + "epoch": 0.8453125, + "grad_norm": 3.09375, + "grad_norm_var": 0.027144368489583334, + "learning_rate": 0.0001, + "loss": 5.5738, + "loss/crossentropy": 2.48178231716156, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16466858237981796, + "step": 27050 + }, + { + "epoch": 0.845375, + "grad_norm": 2.953125, + "grad_norm_var": 0.027196248372395832, + "learning_rate": 0.0001, + "loss": 5.4942, + "loss/crossentropy": 2.4780293703079224, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15787175297737122, + "step": 27052 + }, + { + "epoch": 0.8454375, + "grad_norm": 3.1875, + "grad_norm_var": 0.022337849934895834, + "learning_rate": 0.0001, + "loss": 5.8641, + "loss/crossentropy": 2.692082405090332, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.17384406924247742, + "step": 27054 + }, + { + "epoch": 0.8455, + "grad_norm": 3.25, + "grad_norm_var": 0.02857666015625, + "learning_rate": 0.0001, + "loss": 5.7946, + "loss/crossentropy": 2.5548572540283203, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.17436685413122177, + "step": 27056 + }, + { + "epoch": 0.8455625, + "grad_norm": 3.0, + "grad_norm_var": 0.02008056640625, + "learning_rate": 0.0001, + "loss": 5.2714, + "loss/crossentropy": 2.349884033203125, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.14488628506660461, + "step": 27058 + }, + { + "epoch": 0.845625, + "grad_norm": 2.953125, + "grad_norm_var": 0.0199127197265625, + "learning_rate": 0.0001, + "loss": 5.511, + "loss/crossentropy": 2.5070693492889404, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1574266254901886, + "step": 27060 + }, + { + "epoch": 0.8456875, + "grad_norm": 3.0625, + "grad_norm_var": 0.0166412353515625, + "learning_rate": 0.0001, + "loss": 5.7185, + "loss/crossentropy": 2.5456626415252686, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1692354381084442, + "step": 27062 + }, + { + "epoch": 0.84575, + "grad_norm": 3.296875, + "grad_norm_var": 0.020637003580729167, + "learning_rate": 0.0001, + "loss": 5.7958, + "loss/crossentropy": 2.549852728843689, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17263749986886978, + "step": 27064 + }, + { + "epoch": 0.8458125, + "grad_norm": 2.890625, + "grad_norm_var": 0.0243316650390625, + "learning_rate": 0.0001, + "loss": 5.5744, + "loss/crossentropy": 2.5248301029205322, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16042758524417877, + "step": 27066 + }, + { + "epoch": 0.845875, + "grad_norm": 3.09375, + "grad_norm_var": 0.0237457275390625, + "learning_rate": 0.0001, + "loss": 5.8239, + "loss/crossentropy": 2.7303425073623657, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16403896361589432, + "step": 27068 + }, + { + "epoch": 0.8459375, + "grad_norm": 2.78125, + "grad_norm_var": 0.030367024739583335, + "learning_rate": 0.0001, + "loss": 5.5004, + "loss/crossentropy": 2.513803482055664, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15529655665159225, + "step": 27070 + }, + { + "epoch": 0.846, + "grad_norm": 2.9375, + "grad_norm_var": 0.020015462239583334, + "learning_rate": 0.0001, + "loss": 5.7383, + "loss/crossentropy": 2.7543208599090576, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.1577717885375023, + "step": 27072 + }, + { + "epoch": 0.8460625, + "grad_norm": 3.0625, + "grad_norm_var": 0.020992024739583334, + "learning_rate": 0.0001, + "loss": 5.9565, + "loss/crossentropy": 2.8108898401260376, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.17081285268068314, + "step": 27074 + }, + { + "epoch": 0.846125, + "grad_norm": 2.953125, + "grad_norm_var": 0.021068318684895834, + "learning_rate": 0.0001, + "loss": 5.6215, + "loss/crossentropy": 2.528372049331665, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1604895070195198, + "step": 27076 + }, + { + "epoch": 0.8461875, + "grad_norm": 2.875, + "grad_norm_var": 0.022835286458333333, + "learning_rate": 0.0001, + "loss": 5.5454, + "loss/crossentropy": 2.5339020490646362, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.157789446413517, + "step": 27078 + }, + { + "epoch": 0.84625, + "grad_norm": 3.0, + "grad_norm_var": 0.013704427083333333, + "learning_rate": 0.0001, + "loss": 5.4112, + "loss/crossentropy": 2.400560140609741, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15379934757947922, + "step": 27080 + }, + { + "epoch": 0.8463125, + "grad_norm": 3.0625, + "grad_norm_var": 0.012702433268229167, + "learning_rate": 0.0001, + "loss": 5.3947, + "loss/crossentropy": 2.4423404932022095, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.1534384861588478, + "step": 27082 + }, + { + "epoch": 0.846375, + "grad_norm": 3.109375, + "grad_norm_var": 0.0214508056640625, + "learning_rate": 0.0001, + "loss": 5.6499, + "loss/crossentropy": 2.490090489387512, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.167543925344944, + "step": 27084 + }, + { + "epoch": 0.8464375, + "grad_norm": 3.109375, + "grad_norm_var": 0.04205322265625, + "learning_rate": 0.0001, + "loss": 5.7125, + "loss/crossentropy": 2.5722068548202515, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16637550294399261, + "step": 27086 + }, + { + "epoch": 0.8465, + "grad_norm": 3.203125, + "grad_norm_var": 0.04016825358072917, + "learning_rate": 0.0001, + "loss": 5.7551, + "loss/crossentropy": 2.637491822242737, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1672314703464508, + "step": 27088 + }, + { + "epoch": 0.8465625, + "grad_norm": 3.015625, + "grad_norm_var": 0.04234110514322917, + "learning_rate": 0.0001, + "loss": 5.4526, + "loss/crossentropy": 2.3890881538391113, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16103775799274445, + "step": 27090 + }, + { + "epoch": 0.846625, + "grad_norm": 3.171875, + "grad_norm_var": 0.03996988932291667, + "learning_rate": 0.0001, + "loss": 5.6648, + "loss/crossentropy": 2.5229744911193848, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16848351806402206, + "step": 27092 + }, + { + "epoch": 0.8466875, + "grad_norm": 3.03125, + "grad_norm_var": 0.03902079264322917, + "learning_rate": 0.0001, + "loss": 5.8173, + "loss/crossentropy": 2.644046187400818, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1673254668712616, + "step": 27094 + }, + { + "epoch": 0.84675, + "grad_norm": 2.921875, + "grad_norm_var": 0.041258748372395834, + "learning_rate": 0.0001, + "loss": 5.7519, + "loss/crossentropy": 2.6347395181655884, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1664055809378624, + "step": 27096 + }, + { + "epoch": 0.8468125, + "grad_norm": 3.203125, + "grad_norm_var": 0.045653279622395834, + "learning_rate": 0.0001, + "loss": 5.6848, + "loss/crossentropy": 2.592831015586853, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16231727600097656, + "step": 27098 + }, + { + "epoch": 0.846875, + "grad_norm": 2.984375, + "grad_norm_var": 0.052000935872395834, + "learning_rate": 0.0001, + "loss": 5.5228, + "loss/crossentropy": 2.5616806745529175, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1500208005309105, + "step": 27100 + }, + { + "epoch": 0.8469375, + "grad_norm": 3.171875, + "grad_norm_var": 0.028739420572916667, + "learning_rate": 0.0001, + "loss": 5.8282, + "loss/crossentropy": 2.6943808794021606, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16572364419698715, + "step": 27102 + }, + { + "epoch": 0.847, + "grad_norm": 3.21875, + "grad_norm_var": 0.03095703125, + "learning_rate": 0.0001, + "loss": 5.7187, + "loss/crossentropy": 2.6862971782684326, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.16183152049779892, + "step": 27104 + }, + { + "epoch": 0.8470625, + "grad_norm": 3.0, + "grad_norm_var": 0.028425089518229165, + "learning_rate": 0.0001, + "loss": 5.6099, + "loss/crossentropy": 2.6098971366882324, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1535153165459633, + "step": 27106 + }, + { + "epoch": 0.847125, + "grad_norm": 3.15625, + "grad_norm_var": 0.032124837239583336, + "learning_rate": 0.0001, + "loss": 5.7344, + "loss/crossentropy": 2.5661354064941406, + "loss/hidden": 1.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1652621030807495, + "step": 27108 + }, + { + "epoch": 0.8471875, + "grad_norm": 2.953125, + "grad_norm_var": 0.025614420572916668, + "learning_rate": 0.0001, + "loss": 5.6601, + "loss/crossentropy": 2.5817039012908936, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16487447917461395, + "step": 27110 + }, + { + "epoch": 0.84725, + "grad_norm": 3.03125, + "grad_norm_var": 0.023958333333333335, + "learning_rate": 0.0001, + "loss": 5.8438, + "loss/crossentropy": 2.6846741437911987, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.17060133069753647, + "step": 27112 + }, + { + "epoch": 0.8473125, + "grad_norm": 3.015625, + "grad_norm_var": 0.019270833333333334, + "learning_rate": 0.0001, + "loss": 5.8774, + "loss/crossentropy": 2.711457371711731, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1685514897108078, + "step": 27114 + }, + { + "epoch": 0.847375, + "grad_norm": 3.15625, + "grad_norm_var": 0.018171183268229165, + "learning_rate": 0.0001, + "loss": 5.4803, + "loss/crossentropy": 2.421882748603821, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16052868217229843, + "step": 27116 + }, + { + "epoch": 0.8474375, + "grad_norm": 3.640625, + "grad_norm_var": 0.0386383056640625, + "learning_rate": 0.0001, + "loss": 5.5062, + "loss/crossentropy": 2.4428123235702515, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16220122575759888, + "step": 27118 + }, + { + "epoch": 0.8475, + "grad_norm": 2.96875, + "grad_norm_var": 0.0363189697265625, + "learning_rate": 0.0001, + "loss": 5.3899, + "loss/crossentropy": 2.4994590282440186, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.1491985023021698, + "step": 27120 + }, + { + "epoch": 0.8475625, + "grad_norm": 3.03125, + "grad_norm_var": 0.03611551920572917, + "learning_rate": 0.0001, + "loss": 5.9667, + "loss/crossentropy": 2.7980575561523438, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16842877864837646, + "step": 27122 + }, + { + "epoch": 0.847625, + "grad_norm": 3.140625, + "grad_norm_var": 0.03232014973958333, + "learning_rate": 0.0001, + "loss": 5.6818, + "loss/crossentropy": 2.566531181335449, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16425713896751404, + "step": 27124 + }, + { + "epoch": 0.8476875, + "grad_norm": 2.953125, + "grad_norm_var": 0.039839680989583334, + "learning_rate": 0.0001, + "loss": 5.1265, + "loss/crossentropy": 2.329495072364807, + "loss/hidden": 1.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.14141619950532913, + "step": 27126 + }, + { + "epoch": 0.84775, + "grad_norm": 3.171875, + "grad_norm_var": 0.04466044108072917, + "learning_rate": 0.0001, + "loss": 5.5252, + "loss/crossentropy": 2.5107333660125732, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15886925905942917, + "step": 27128 + }, + { + "epoch": 0.8478125, + "grad_norm": 2.921875, + "grad_norm_var": 0.048111979166666666, + "learning_rate": 0.0001, + "loss": 5.5582, + "loss/crossentropy": 2.5598950386047363, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15256302058696747, + "step": 27130 + }, + { + "epoch": 0.847875, + "grad_norm": 3.328125, + "grad_norm_var": 0.04785868326822917, + "learning_rate": 0.0001, + "loss": 5.7948, + "loss/crossentropy": 2.7138450145721436, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.15848314762115479, + "step": 27132 + }, + { + "epoch": 0.8479375, + "grad_norm": 3.140625, + "grad_norm_var": 0.03396809895833333, + "learning_rate": 0.0001, + "loss": 5.8073, + "loss/crossentropy": 2.602906346321106, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17277860641479492, + "step": 27134 + }, + { + "epoch": 0.848, + "grad_norm": 3.03125, + "grad_norm_var": 0.03310445149739583, + "learning_rate": 0.0001, + "loss": 5.6494, + "loss/crossentropy": 2.6044774055480957, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16113029420375824, + "step": 27136 + }, + { + "epoch": 0.8480625, + "grad_norm": 2.921875, + "grad_norm_var": 0.03605855305989583, + "learning_rate": 0.0001, + "loss": 5.5735, + "loss/crossentropy": 2.5667667388916016, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15926573425531387, + "step": 27138 + }, + { + "epoch": 0.848125, + "grad_norm": 2.984375, + "grad_norm_var": 0.038313802083333334, + "learning_rate": 0.0001, + "loss": 5.6048, + "loss/crossentropy": 2.5968477725982666, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15938495099544525, + "step": 27140 + }, + { + "epoch": 0.8481875, + "grad_norm": 3.203125, + "grad_norm_var": 0.030760701497395834, + "learning_rate": 0.0001, + "loss": 5.7919, + "loss/crossentropy": 2.604135513305664, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16956109553575516, + "step": 27142 + }, + { + "epoch": 0.84825, + "grad_norm": 2.859375, + "grad_norm_var": 0.031201171875, + "learning_rate": 0.0001, + "loss": 5.5577, + "loss/crossentropy": 2.6035202741622925, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.1543988510966301, + "step": 27144 + }, + { + "epoch": 0.8483125, + "grad_norm": 3.25, + "grad_norm_var": 0.03583577473958333, + "learning_rate": 0.0001, + "loss": 5.9122, + "loss/crossentropy": 2.7238636016845703, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16961700469255447, + "step": 27146 + }, + { + "epoch": 0.848375, + "grad_norm": 3.03125, + "grad_norm_var": 0.03459879557291667, + "learning_rate": 0.0001, + "loss": 5.6259, + "loss/crossentropy": 2.596389889717102, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15646573901176453, + "step": 27148 + }, + { + "epoch": 0.8484375, + "grad_norm": 3.375, + "grad_norm_var": 0.03264567057291667, + "learning_rate": 0.0001, + "loss": 6.0514, + "loss/crossentropy": 2.76579213142395, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17856386303901672, + "step": 27150 + }, + { + "epoch": 0.8485, + "grad_norm": 3.078125, + "grad_norm_var": 0.032698567708333334, + "learning_rate": 0.0001, + "loss": 5.6132, + "loss/crossentropy": 2.6267541646957397, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15645737200975418, + "step": 27152 + }, + { + "epoch": 0.8485625, + "grad_norm": 3.203125, + "grad_norm_var": 0.0305084228515625, + "learning_rate": 0.0001, + "loss": 5.5019, + "loss/crossentropy": 2.4569544792175293, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15918374061584473, + "step": 27154 + }, + { + "epoch": 0.848625, + "grad_norm": 3.25, + "grad_norm_var": 0.03181050618489583, + "learning_rate": 0.0001, + "loss": 5.7746, + "loss/crossentropy": 2.654120922088623, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16556436568498611, + "step": 27156 + }, + { + "epoch": 0.8486875, + "grad_norm": 3.25, + "grad_norm_var": 0.03177083333333333, + "learning_rate": 0.0001, + "loss": 5.6566, + "loss/crossentropy": 2.6050009727478027, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16062700748443604, + "step": 27158 + }, + { + "epoch": 0.84875, + "grad_norm": 3.140625, + "grad_norm_var": 0.019950358072916667, + "learning_rate": 0.0001, + "loss": 5.5895, + "loss/crossentropy": 2.555176854133606, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15890159457921982, + "step": 27160 + }, + { + "epoch": 0.8488125, + "grad_norm": 3.171875, + "grad_norm_var": 0.015555826822916667, + "learning_rate": 0.0001, + "loss": 5.6621, + "loss/crossentropy": 2.5718332529067993, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1644948124885559, + "step": 27162 + }, + { + "epoch": 0.848875, + "grad_norm": 2.96875, + "grad_norm_var": 0.020710245768229166, + "learning_rate": 0.0001, + "loss": 5.7778, + "loss/crossentropy": 2.684635043144226, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1647833213210106, + "step": 27164 + }, + { + "epoch": 0.8489375, + "grad_norm": 2.828125, + "grad_norm_var": 0.025569661458333334, + "learning_rate": 0.0001, + "loss": 5.5427, + "loss/crossentropy": 2.5394362211227417, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15579772740602493, + "step": 27166 + }, + { + "epoch": 0.849, + "grad_norm": 4.1875, + "grad_norm_var": 0.09032796223958334, + "learning_rate": 0.0001, + "loss": 5.6074, + "loss/crossentropy": 2.5032652616500854, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16158964484930038, + "step": 27168 + }, + { + "epoch": 0.8490625, + "grad_norm": 3.5, + "grad_norm_var": 0.097802734375, + "learning_rate": 0.0001, + "loss": 5.7246, + "loss/crossentropy": 2.668478012084961, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1575625315308571, + "step": 27170 + }, + { + "epoch": 0.849125, + "grad_norm": 3.0625, + "grad_norm_var": 0.10175374348958334, + "learning_rate": 0.0001, + "loss": 5.4272, + "loss/crossentropy": 2.40272319316864, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15674276649951935, + "step": 27172 + }, + { + "epoch": 0.8491875, + "grad_norm": 2.734375, + "grad_norm_var": 0.1138580322265625, + "learning_rate": 0.0001, + "loss": 5.4967, + "loss/crossentropy": 2.505965232849121, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15415487438440323, + "step": 27174 + }, + { + "epoch": 0.84925, + "grad_norm": 2.90625, + "grad_norm_var": 0.1227447509765625, + "learning_rate": 0.0001, + "loss": 5.1261, + "loss/crossentropy": 2.239875912666321, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1456497609615326, + "step": 27176 + }, + { + "epoch": 0.8493125, + "grad_norm": 2.96875, + "grad_norm_var": 0.1259429931640625, + "learning_rate": 0.0001, + "loss": 5.8369, + "loss/crossentropy": 2.739956498146057, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16242799162864685, + "step": 27178 + }, + { + "epoch": 0.849375, + "grad_norm": 2.8125, + "grad_norm_var": 0.12740478515625, + "learning_rate": 0.0001, + "loss": 5.671, + "loss/crossentropy": 2.7142287492752075, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1542678400874138, + "step": 27180 + }, + { + "epoch": 0.8494375, + "grad_norm": 2.9375, + "grad_norm_var": 0.1268707275390625, + "learning_rate": 0.0001, + "loss": 5.4992, + "loss/crossentropy": 2.5635247230529785, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1521565392613411, + "step": 27182 + }, + { + "epoch": 0.8495, + "grad_norm": 3.125, + "grad_norm_var": 0.039061482747395834, + "learning_rate": 0.0001, + "loss": 5.7256, + "loss/crossentropy": 2.661946415901184, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16300630569458008, + "step": 27184 + }, + { + "epoch": 0.8495625, + "grad_norm": 3.0625, + "grad_norm_var": 0.021061197916666666, + "learning_rate": 0.0001, + "loss": 5.6889, + "loss/crossentropy": 2.6285845041275024, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15954901278018951, + "step": 27186 + }, + { + "epoch": 0.849625, + "grad_norm": 3.078125, + "grad_norm_var": 0.0217437744140625, + "learning_rate": 0.0001, + "loss": 5.5159, + "loss/crossentropy": 2.4490103721618652, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16216076165437698, + "step": 27188 + }, + { + "epoch": 0.8496875, + "grad_norm": 3.21875, + "grad_norm_var": 0.029069010416666666, + "learning_rate": 0.0001, + "loss": 5.69, + "loss/crossentropy": 2.5842500925064087, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.1687745600938797, + "step": 27190 + }, + { + "epoch": 0.84975, + "grad_norm": 3.40625, + "grad_norm_var": 0.03280843098958333, + "learning_rate": 0.0001, + "loss": 5.6734, + "loss/crossentropy": 2.562857985496521, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16535142809152603, + "step": 27192 + }, + { + "epoch": 0.8498125, + "grad_norm": 3.25, + "grad_norm_var": 0.044041951497395836, + "learning_rate": 0.0001, + "loss": 5.8665, + "loss/crossentropy": 2.6680439710617065, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1710200533270836, + "step": 27194 + }, + { + "epoch": 0.849875, + "grad_norm": 3.015625, + "grad_norm_var": 0.0418853759765625, + "learning_rate": 0.0001, + "loss": 5.2359, + "loss/crossentropy": 2.295186996459961, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15266850590705872, + "step": 27196 + }, + { + "epoch": 0.8499375, + "grad_norm": 3.078125, + "grad_norm_var": 0.038863118489583334, + "learning_rate": 0.0001, + "loss": 5.7304, + "loss/crossentropy": 2.696526050567627, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15924210846424103, + "step": 27198 + }, + { + "epoch": 0.85, + "grad_norm": 3.28125, + "grad_norm_var": 0.042389933268229166, + "learning_rate": 0.0001, + "loss": 5.5092, + "loss/crossentropy": 2.518649458885193, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.15920644998550415, + "step": 27200 + }, + { + "epoch": 0.8500625, + "grad_norm": 2.96875, + "grad_norm_var": 0.04246317545572917, + "learning_rate": 0.0001, + "loss": 5.5849, + "loss/crossentropy": 2.47367525100708, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16385667771100998, + "step": 27202 + }, + { + "epoch": 0.850125, + "grad_norm": 3.078125, + "grad_norm_var": 0.04265034993489583, + "learning_rate": 0.0001, + "loss": 5.6853, + "loss/crossentropy": 2.605410099029541, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16306568682193756, + "step": 27204 + }, + { + "epoch": 0.8501875, + "grad_norm": 2.953125, + "grad_norm_var": 0.04045308430989583, + "learning_rate": 0.0001, + "loss": 5.6113, + "loss/crossentropy": 2.5274053812026978, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16502834856510162, + "step": 27206 + }, + { + "epoch": 0.85025, + "grad_norm": 3.0, + "grad_norm_var": 0.03517964680989583, + "learning_rate": 0.0001, + "loss": 5.6351, + "loss/crossentropy": 2.5945483446121216, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15679188072681427, + "step": 27208 + }, + { + "epoch": 0.8503125, + "grad_norm": 3.1875, + "grad_norm_var": 0.0193023681640625, + "learning_rate": 0.0001, + "loss": 5.7849, + "loss/crossentropy": 2.6564905643463135, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16557051241397858, + "step": 27210 + }, + { + "epoch": 0.850375, + "grad_norm": 3.21875, + "grad_norm_var": 0.019498697916666665, + "learning_rate": 0.0001, + "loss": 6.0189, + "loss/crossentropy": 2.771043300628662, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17595846205949783, + "step": 27212 + }, + { + "epoch": 0.8504375, + "grad_norm": 2.90625, + "grad_norm_var": 0.01832275390625, + "learning_rate": 0.0001, + "loss": 5.6694, + "loss/crossentropy": 2.6642444133758545, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15481116622686386, + "step": 27214 + }, + { + "epoch": 0.8505, + "grad_norm": 3.328125, + "grad_norm_var": 0.0170806884765625, + "learning_rate": 0.0001, + "loss": 5.733, + "loss/crossentropy": 2.5549402236938477, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16897690296173096, + "step": 27216 + }, + { + "epoch": 0.8505625, + "grad_norm": 2.921875, + "grad_norm_var": 0.0191558837890625, + "learning_rate": 0.0001, + "loss": 5.2818, + "loss/crossentropy": 2.38115918636322, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.15022436529397964, + "step": 27218 + }, + { + "epoch": 0.850625, + "grad_norm": 3.09375, + "grad_norm_var": 0.021219889322916668, + "learning_rate": 0.0001, + "loss": 6.0002, + "loss/crossentropy": 2.7196956872940063, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17727024108171463, + "step": 27220 + }, + { + "epoch": 0.8506875, + "grad_norm": 3.09375, + "grad_norm_var": 0.018680826822916666, + "learning_rate": 0.0001, + "loss": 5.5349, + "loss/crossentropy": 2.497388243675232, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16116993129253387, + "step": 27222 + }, + { + "epoch": 0.85075, + "grad_norm": 3.25, + "grad_norm_var": 0.017381795247395835, + "learning_rate": 0.0001, + "loss": 5.5134, + "loss/crossentropy": 2.506207227706909, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15735848993062973, + "step": 27224 + }, + { + "epoch": 0.8508125, + "grad_norm": 3.046875, + "grad_norm_var": 0.018538411458333334, + "learning_rate": 0.0001, + "loss": 5.5864, + "loss/crossentropy": 2.51308536529541, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16006861627101898, + "step": 27226 + }, + { + "epoch": 0.850875, + "grad_norm": 3.21875, + "grad_norm_var": 0.018485514322916667, + "learning_rate": 0.0001, + "loss": 5.6591, + "loss/crossentropy": 2.569067597389221, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16251851618289948, + "step": 27228 + }, + { + "epoch": 0.8509375, + "grad_norm": 3.109375, + "grad_norm_var": 0.018220011393229166, + "learning_rate": 0.0001, + "loss": 5.8602, + "loss/crossentropy": 2.6755404472351074, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17042145133018494, + "step": 27230 + }, + { + "epoch": 0.851, + "grad_norm": 3.25, + "grad_norm_var": 0.0253814697265625, + "learning_rate": 0.0001, + "loss": 5.5924, + "loss/crossentropy": 2.5000529289245605, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16197387874126434, + "step": 27232 + }, + { + "epoch": 0.8510625, + "grad_norm": 2.875, + "grad_norm_var": 0.0294921875, + "learning_rate": 0.0001, + "loss": 5.6575, + "loss/crossentropy": 2.6328113079071045, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.16066823154687881, + "step": 27234 + }, + { + "epoch": 0.851125, + "grad_norm": 2.890625, + "grad_norm_var": 0.02861328125, + "learning_rate": 0.0001, + "loss": 5.6378, + "loss/crossentropy": 2.6340949535369873, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1554490625858307, + "step": 27236 + }, + { + "epoch": 0.8511875, + "grad_norm": 3.078125, + "grad_norm_var": 0.028783162434895832, + "learning_rate": 0.0001, + "loss": 5.4258, + "loss/crossentropy": 2.425517439842224, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1527608558535576, + "step": 27238 + }, + { + "epoch": 0.85125, + "grad_norm": 2.953125, + "grad_norm_var": 0.02388916015625, + "learning_rate": 0.0001, + "loss": 5.5994, + "loss/crossentropy": 2.536966323852539, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.15976081043481827, + "step": 27240 + }, + { + "epoch": 0.8513125, + "grad_norm": 3.125, + "grad_norm_var": 0.020807902018229168, + "learning_rate": 0.0001, + "loss": 5.5573, + "loss/crossentropy": 2.535452723503113, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15843620896339417, + "step": 27242 + }, + { + "epoch": 0.851375, + "grad_norm": 3.078125, + "grad_norm_var": 0.018285115559895832, + "learning_rate": 0.0001, + "loss": 5.674, + "loss/crossentropy": 2.550739884376526, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16505959630012512, + "step": 27244 + }, + { + "epoch": 0.8514375, + "grad_norm": 3.53125, + "grad_norm_var": 0.03713785807291667, + "learning_rate": 0.0001, + "loss": 5.8234, + "loss/crossentropy": 2.652758240699768, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17096707224845886, + "step": 27246 + }, + { + "epoch": 0.8515, + "grad_norm": 3.140625, + "grad_norm_var": 0.03351949055989583, + "learning_rate": 0.0001, + "loss": 5.5629, + "loss/crossentropy": 2.5765405893325806, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15645325928926468, + "step": 27248 + }, + { + "epoch": 0.8515625, + "grad_norm": 3.1875, + "grad_norm_var": 0.030182902018229166, + "learning_rate": 0.0001, + "loss": 5.894, + "loss/crossentropy": 2.749528646469116, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16600635647773743, + "step": 27250 + }, + { + "epoch": 0.851625, + "grad_norm": 2.9375, + "grad_norm_var": 0.029166666666666667, + "learning_rate": 0.0001, + "loss": 5.3832, + "loss/crossentropy": 2.4195252656936646, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15183337777853012, + "step": 27252 + }, + { + "epoch": 0.8516875, + "grad_norm": 3.234375, + "grad_norm_var": 0.03183186848958333, + "learning_rate": 0.0001, + "loss": 5.7519, + "loss/crossentropy": 2.588433623313904, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16674208641052246, + "step": 27254 + }, + { + "epoch": 0.85175, + "grad_norm": 3.078125, + "grad_norm_var": 0.0295806884765625, + "learning_rate": 0.0001, + "loss": 5.6704, + "loss/crossentropy": 2.6264848709106445, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16220723092556, + "step": 27256 + }, + { + "epoch": 0.8518125, + "grad_norm": 2.984375, + "grad_norm_var": 0.03189697265625, + "learning_rate": 0.0001, + "loss": 5.6843, + "loss/crossentropy": 2.5776803493499756, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16535252332687378, + "step": 27258 + }, + { + "epoch": 0.851875, + "grad_norm": 3.34375, + "grad_norm_var": 0.045458984375, + "learning_rate": 0.0001, + "loss": 5.6823, + "loss/crossentropy": 2.604082226753235, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16173037886619568, + "step": 27260 + }, + { + "epoch": 0.8519375, + "grad_norm": 2.828125, + "grad_norm_var": 0.034684244791666666, + "learning_rate": 0.0001, + "loss": 5.6682, + "loss/crossentropy": 2.6651251316070557, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15889926254749298, + "step": 27262 + }, + { + "epoch": 0.852, + "grad_norm": 3.125, + "grad_norm_var": 0.0301177978515625, + "learning_rate": 0.0001, + "loss": 5.3971, + "loss/crossentropy": 2.373060882091522, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15943298488855362, + "step": 27264 + }, + { + "epoch": 0.8520625, + "grad_norm": 2.921875, + "grad_norm_var": 0.05545247395833333, + "learning_rate": 0.0001, + "loss": 5.702, + "loss/crossentropy": 2.537823438644409, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.17266496270895004, + "step": 27266 + }, + { + "epoch": 0.852125, + "grad_norm": 4.4375, + "grad_norm_var": 0.1651275634765625, + "learning_rate": 0.0001, + "loss": 5.6176, + "loss/crossentropy": 2.5497862100601196, + "loss/hidden": 1.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.15404419600963593, + "step": 27268 + }, + { + "epoch": 0.8521875, + "grad_norm": 3.03125, + "grad_norm_var": 0.16867574055989584, + "learning_rate": 0.0001, + "loss": 5.5756, + "loss/crossentropy": 2.4981300830841064, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1604846939444542, + "step": 27270 + }, + { + "epoch": 0.85225, + "grad_norm": 2.9375, + "grad_norm_var": 0.17483317057291667, + "learning_rate": 0.0001, + "loss": 5.4375, + "loss/crossentropy": 2.4712727069854736, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15443594008684158, + "step": 27272 + }, + { + "epoch": 0.8523125, + "grad_norm": 3.359375, + "grad_norm_var": 0.17137044270833332, + "learning_rate": 0.0001, + "loss": 5.6358, + "loss/crossentropy": 2.556638479232788, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16026213765144348, + "step": 27274 + }, + { + "epoch": 0.852375, + "grad_norm": 3.078125, + "grad_norm_var": 0.15822652180989583, + "learning_rate": 0.0001, + "loss": 5.5691, + "loss/crossentropy": 2.5130996704101562, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.15638145804405212, + "step": 27276 + }, + { + "epoch": 0.8524375, + "grad_norm": 2.828125, + "grad_norm_var": 0.15387369791666666, + "learning_rate": 0.0001, + "loss": 5.7646, + "loss/crossentropy": 2.7054131031036377, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16138488054275513, + "step": 27278 + }, + { + "epoch": 0.8525, + "grad_norm": 3.234375, + "grad_norm_var": 0.15914306640625, + "learning_rate": 0.0001, + "loss": 5.8122, + "loss/crossentropy": 2.655655264854431, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16917280107736588, + "step": 27280 + }, + { + "epoch": 0.8525625, + "grad_norm": 2.921875, + "grad_norm_var": 0.14888916015625, + "learning_rate": 0.0001, + "loss": 5.4202, + "loss/crossentropy": 2.392878293991089, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1562449336051941, + "step": 27282 + }, + { + "epoch": 0.852625, + "grad_norm": 2.703125, + "grad_norm_var": 0.0577789306640625, + "learning_rate": 0.0001, + "loss": 5.3767, + "loss/crossentropy": 2.54988694190979, + "loss/hidden": 1.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.14596494287252426, + "step": 27284 + }, + { + "epoch": 0.8526875, + "grad_norm": 2.75, + "grad_norm_var": 0.054133097330729164, + "learning_rate": 0.0001, + "loss": 5.3467, + "loss/crossentropy": 2.472472071647644, + "loss/hidden": 1.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.1471906527876854, + "step": 27286 + }, + { + "epoch": 0.85275, + "grad_norm": 3.171875, + "grad_norm_var": 0.058268229166666664, + "learning_rate": 0.0001, + "loss": 5.504, + "loss/crossentropy": 2.4292874336242676, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16098542511463165, + "step": 27288 + }, + { + "epoch": 0.8528125, + "grad_norm": 2.96875, + "grad_norm_var": 0.05670166015625, + "learning_rate": 0.0001, + "loss": 5.8281, + "loss/crossentropy": 2.7193890810012817, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16711793839931488, + "step": 27290 + }, + { + "epoch": 0.852875, + "grad_norm": 3.140625, + "grad_norm_var": 0.0574615478515625, + "learning_rate": 0.0001, + "loss": 5.5502, + "loss/crossentropy": 2.527142882347107, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15425408631563187, + "step": 27292 + }, + { + "epoch": 0.8529375, + "grad_norm": 3.421875, + "grad_norm_var": 0.0751129150390625, + "learning_rate": 0.0001, + "loss": 5.8299, + "loss/crossentropy": 2.71413791179657, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16352686285972595, + "step": 27294 + }, + { + "epoch": 0.853, + "grad_norm": 3.390625, + "grad_norm_var": 0.07781575520833334, + "learning_rate": 0.0001, + "loss": 5.8199, + "loss/crossentropy": 2.669779896736145, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.17087218165397644, + "step": 27296 + }, + { + "epoch": 0.8530625, + "grad_norm": 2.90625, + "grad_norm_var": 0.07008056640625, + "learning_rate": 0.0001, + "loss": 5.4575, + "loss/crossentropy": 2.4028143882751465, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16015125811100006, + "step": 27298 + }, + { + "epoch": 0.853125, + "grad_norm": 2.9375, + "grad_norm_var": 0.0460845947265625, + "learning_rate": 0.0001, + "loss": 5.5584, + "loss/crossentropy": 2.5087332725524902, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16199709475040436, + "step": 27300 + }, + { + "epoch": 0.8531875, + "grad_norm": 3.234375, + "grad_norm_var": 0.036214192708333336, + "learning_rate": 0.0001, + "loss": 5.3389, + "loss/crossentropy": 2.3470911979675293, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15582001954317093, + "step": 27302 + }, + { + "epoch": 0.85325, + "grad_norm": 3.40625, + "grad_norm_var": 0.042073567708333336, + "learning_rate": 0.0001, + "loss": 5.7984, + "loss/crossentropy": 2.6537342071533203, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16993840038776398, + "step": 27304 + }, + { + "epoch": 0.8533125, + "grad_norm": 3.140625, + "grad_norm_var": 0.037873331705729166, + "learning_rate": 0.0001, + "loss": 5.3756, + "loss/crossentropy": 2.4142991304397583, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15550129860639572, + "step": 27306 + }, + { + "epoch": 0.853375, + "grad_norm": 3.125, + "grad_norm_var": 0.03782450358072917, + "learning_rate": 0.0001, + "loss": 5.794, + "loss/crossentropy": 2.6065648794174194, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.16835353523492813, + "step": 27308 + }, + { + "epoch": 0.8534375, + "grad_norm": 3.09375, + "grad_norm_var": 0.03922119140625, + "learning_rate": 0.0001, + "loss": 5.3525, + "loss/crossentropy": 2.394999146461487, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.14965371787548065, + "step": 27310 + }, + { + "epoch": 0.8535, + "grad_norm": 2.875, + "grad_norm_var": 0.03906962076822917, + "learning_rate": 0.0001, + "loss": 5.536, + "loss/crossentropy": 2.5364619493484497, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1577666699886322, + "step": 27312 + }, + { + "epoch": 0.8535625, + "grad_norm": 2.984375, + "grad_norm_var": 0.04576822916666667, + "learning_rate": 0.0001, + "loss": 5.8537, + "loss/crossentropy": 2.6952576637268066, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16740810126066208, + "step": 27314 + }, + { + "epoch": 0.853625, + "grad_norm": 3.3125, + "grad_norm_var": 0.043635050455729164, + "learning_rate": 0.0001, + "loss": 5.8972, + "loss/crossentropy": 2.6425116062164307, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.17507479339838028, + "step": 27316 + }, + { + "epoch": 0.8536875, + "grad_norm": 3.203125, + "grad_norm_var": 0.04394124348958333, + "learning_rate": 0.0001, + "loss": 5.7404, + "loss/crossentropy": 2.5958350896835327, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1687539964914322, + "step": 27318 + }, + { + "epoch": 0.85375, + "grad_norm": 2.609375, + "grad_norm_var": 0.056966145833333336, + "learning_rate": 0.0001, + "loss": 5.0976, + "loss/crossentropy": 2.2704073190689087, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1381903663277626, + "step": 27320 + }, + { + "epoch": 0.8538125, + "grad_norm": 3.109375, + "grad_norm_var": 0.05510660807291667, + "learning_rate": 0.0001, + "loss": 5.3793, + "loss/crossentropy": 2.3639668226242065, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1558261588215828, + "step": 27322 + }, + { + "epoch": 0.853875, + "grad_norm": 3.40625, + "grad_norm_var": 8.250065104166667, + "learning_rate": 0.0001, + "loss": 5.5607, + "loss/crossentropy": 2.356520652770996, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17158974707126617, + "step": 27324 + }, + { + "epoch": 0.8539375, + "grad_norm": 3.328125, + "grad_norm_var": 8.179377237955729, + "learning_rate": 0.0001, + "loss": 6.0781, + "loss/crossentropy": 2.8043962717056274, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.177764892578125, + "step": 27326 + }, + { + "epoch": 0.854, + "grad_norm": 2.84375, + "grad_norm_var": 8.200804646809896, + "learning_rate": 0.0001, + "loss": 5.4275, + "loss/crossentropy": 2.44600248336792, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15322712063789368, + "step": 27328 + }, + { + "epoch": 0.8540625, + "grad_norm": 3.109375, + "grad_norm_var": 8.213004557291667, + "learning_rate": 0.0001, + "loss": 5.6881, + "loss/crossentropy": 2.5942888259887695, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16602341830730438, + "step": 27330 + }, + { + "epoch": 0.854125, + "grad_norm": 2.90625, + "grad_norm_var": 8.2279296875, + "learning_rate": 0.0001, + "loss": 5.4666, + "loss/crossentropy": 2.3722946643829346, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.1594339907169342, + "step": 27332 + }, + { + "epoch": 0.8541875, + "grad_norm": 2.828125, + "grad_norm_var": 8.229181925455729, + "learning_rate": 0.0001, + "loss": 5.5654, + "loss/crossentropy": 2.539860963821411, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15685267001390457, + "step": 27334 + }, + { + "epoch": 0.85425, + "grad_norm": 3.25, + "grad_norm_var": 8.126528930664062, + "learning_rate": 0.0001, + "loss": 5.5007, + "loss/crossentropy": 2.429807662963867, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16490648686885834, + "step": 27336 + }, + { + "epoch": 0.8543125, + "grad_norm": 3.28125, + "grad_norm_var": 8.13121337890625, + "learning_rate": 0.0001, + "loss": 5.7599, + "loss/crossentropy": 2.6894830465316772, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16133538633584976, + "step": 27338 + }, + { + "epoch": 0.854375, + "grad_norm": 2.875, + "grad_norm_var": 0.046320597330729164, + "learning_rate": 0.0001, + "loss": 5.709, + "loss/crossentropy": 2.6138765811920166, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16263822466135025, + "step": 27340 + }, + { + "epoch": 0.8544375, + "grad_norm": 2.953125, + "grad_norm_var": 0.040185546875, + "learning_rate": 0.0001, + "loss": 6.016, + "loss/crossentropy": 2.7757010459899902, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.17793571949005127, + "step": 27342 + }, + { + "epoch": 0.8545, + "grad_norm": 3.15625, + "grad_norm_var": 0.0419097900390625, + "learning_rate": 0.0001, + "loss": 5.25, + "loss/crossentropy": 2.3976263999938965, + "loss/hidden": 1.37890625, + "loss/jsd": 0.0, + "loss/logits": 0.1473466232419014, + "step": 27344 + }, + { + "epoch": 0.8545625, + "grad_norm": 3.515625, + "grad_norm_var": 0.05858968098958333, + "learning_rate": 0.0001, + "loss": 5.6017, + "loss/crossentropy": 2.5026453733444214, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16615450382232666, + "step": 27346 + }, + { + "epoch": 0.854625, + "grad_norm": 3.03125, + "grad_norm_var": 0.0456451416015625, + "learning_rate": 0.0001, + "loss": 5.6635, + "loss/crossentropy": 2.5777475833892822, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1613086760044098, + "step": 27348 + }, + { + "epoch": 0.8546875, + "grad_norm": 2.859375, + "grad_norm_var": 0.1615234375, + "learning_rate": 0.0001, + "loss": 5.2415, + "loss/crossentropy": 2.213112473487854, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1590876430273056, + "step": 27350 + }, + { + "epoch": 0.85475, + "grad_norm": 3.90625, + "grad_norm_var": 0.20972900390625, + "learning_rate": 0.0001, + "loss": 5.4069, + "loss/crossentropy": 2.318873167037964, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1599702313542366, + "step": 27352 + }, + { + "epoch": 0.8548125, + "grad_norm": 3.015625, + "grad_norm_var": 0.20634358723958332, + "learning_rate": 0.0001, + "loss": 5.8637, + "loss/crossentropy": 2.720026969909668, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16748729348182678, + "step": 27354 + }, + { + "epoch": 0.854875, + "grad_norm": 3.265625, + "grad_norm_var": 0.20331624348958333, + "learning_rate": 0.0001, + "loss": 5.7398, + "loss/crossentropy": 2.6489644050598145, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16337642818689346, + "step": 27356 + }, + { + "epoch": 0.8549375, + "grad_norm": 3.578125, + "grad_norm_var": 0.20748697916666667, + "learning_rate": 0.0001, + "loss": 5.9554, + "loss/crossentropy": 2.705081582069397, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1777697503566742, + "step": 27358 + }, + { + "epoch": 0.855, + "grad_norm": 2.96875, + "grad_norm_var": 0.198046875, + "learning_rate": 0.0001, + "loss": 5.7981, + "loss/crossentropy": 2.6675941944122314, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16773658990859985, + "step": 27360 + }, + { + "epoch": 0.8550625, + "grad_norm": 3.75, + "grad_norm_var": 0.19925130208333333, + "learning_rate": 0.0001, + "loss": 5.967, + "loss/crossentropy": 2.6779096126556396, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.17695285379886627, + "step": 27362 + }, + { + "epoch": 0.855125, + "grad_norm": 3.03125, + "grad_norm_var": 0.19638264973958333, + "learning_rate": 0.0001, + "loss": 5.6363, + "loss/crossentropy": 2.5565619468688965, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1642194539308548, + "step": 27364 + }, + { + "epoch": 0.8551875, + "grad_norm": 5.03125, + "grad_norm_var": 0.2955403645833333, + "learning_rate": 0.0001, + "loss": 5.9018, + "loss/crossentropy": 2.591494917869568, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1802462637424469, + "step": 27366 + }, + { + "epoch": 0.85525, + "grad_norm": 3.03125, + "grad_norm_var": 0.2580362955729167, + "learning_rate": 0.0001, + "loss": 5.5429, + "loss/crossentropy": 2.496594190597534, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16243939846754074, + "step": 27368 + }, + { + "epoch": 0.8553125, + "grad_norm": 3.125, + "grad_norm_var": 0.2589670817057292, + "learning_rate": 0.0001, + "loss": 5.6632, + "loss/crossentropy": 2.5484431982040405, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16187050938606262, + "step": 27370 + }, + { + "epoch": 0.855375, + "grad_norm": 2.84375, + "grad_norm_var": 0.2725494384765625, + "learning_rate": 0.0001, + "loss": 5.2668, + "loss/crossentropy": 2.3581648468971252, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1471143513917923, + "step": 27372 + }, + { + "epoch": 0.8554375, + "grad_norm": 2.953125, + "grad_norm_var": 0.348681640625, + "learning_rate": 0.0001, + "loss": 5.5454, + "loss/crossentropy": 2.467699885368347, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1663680225610733, + "step": 27374 + }, + { + "epoch": 0.8555, + "grad_norm": 2.953125, + "grad_norm_var": 0.35290425618489585, + "learning_rate": 0.0001, + "loss": 5.4983, + "loss/crossentropy": 2.4543418884277344, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1583036109805107, + "step": 27376 + }, + { + "epoch": 0.8555625, + "grad_norm": 3.375, + "grad_norm_var": 0.33666890462239585, + "learning_rate": 0.0001, + "loss": 5.8463, + "loss/crossentropy": 2.632842183113098, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17212244868278503, + "step": 27378 + }, + { + "epoch": 0.855625, + "grad_norm": 3.296875, + "grad_norm_var": 0.33943583170572916, + "learning_rate": 0.0001, + "loss": 5.4097, + "loss/crossentropy": 2.404983639717102, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15515553206205368, + "step": 27380 + }, + { + "epoch": 0.8556875, + "grad_norm": 3.125, + "grad_norm_var": 0.12312825520833333, + "learning_rate": 0.0001, + "loss": 5.2698, + "loss/crossentropy": 2.2714738845825195, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15646881610155106, + "step": 27382 + }, + { + "epoch": 0.85575, + "grad_norm": 3.078125, + "grad_norm_var": 0.12574869791666668, + "learning_rate": 0.0001, + "loss": 5.9026, + "loss/crossentropy": 2.703365683555603, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.17109481990337372, + "step": 27384 + }, + { + "epoch": 0.8558125, + "grad_norm": 3.140625, + "grad_norm_var": 0.12421773274739584, + "learning_rate": 0.0001, + "loss": 5.8108, + "loss/crossentropy": 2.725627303123474, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16242322325706482, + "step": 27386 + }, + { + "epoch": 0.855875, + "grad_norm": 2.9375, + "grad_norm_var": 0.11724344889322917, + "learning_rate": 0.0001, + "loss": 5.445, + "loss/crossentropy": 2.445341467857361, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15934404730796814, + "step": 27388 + }, + { + "epoch": 0.8559375, + "grad_norm": 3.03125, + "grad_norm_var": 0.024735514322916666, + "learning_rate": 0.0001, + "loss": 5.5114, + "loss/crossentropy": 2.426816940307617, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.161977618932724, + "step": 27390 + }, + { + "epoch": 0.856, + "grad_norm": 3.34375, + "grad_norm_var": 0.0250152587890625, + "learning_rate": 0.0001, + "loss": 5.7785, + "loss/crossentropy": 2.6386823654174805, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16749393939971924, + "step": 27392 + }, + { + "epoch": 0.8560625, + "grad_norm": 3.046875, + "grad_norm_var": 0.0224517822265625, + "learning_rate": 0.0001, + "loss": 5.9373, + "loss/crossentropy": 2.7545653581619263, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17257516831159592, + "step": 27394 + }, + { + "epoch": 0.856125, + "grad_norm": 2.875, + "grad_norm_var": 0.02451171875, + "learning_rate": 0.0001, + "loss": 5.7075, + "loss/crossentropy": 2.6612977981567383, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.1636067032814026, + "step": 27396 + }, + { + "epoch": 0.8561875, + "grad_norm": 2.84375, + "grad_norm_var": 0.03731180826822917, + "learning_rate": 0.0001, + "loss": 5.3092, + "loss/crossentropy": 2.444177985191345, + "loss/hidden": 1.375, + "loss/jsd": 0.0, + "loss/logits": 0.1489994302392006, + "step": 27398 + }, + { + "epoch": 0.85625, + "grad_norm": 3.078125, + "grad_norm_var": 0.02828369140625, + "learning_rate": 0.0001, + "loss": 5.2391, + "loss/crossentropy": 2.3072571754455566, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.15334443002939224, + "step": 27400 + }, + { + "epoch": 0.8563125, + "grad_norm": 3.171875, + "grad_norm_var": 0.028153483072916666, + "learning_rate": 0.0001, + "loss": 5.7229, + "loss/crossentropy": 2.60724675655365, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16507958620786667, + "step": 27402 + }, + { + "epoch": 0.856375, + "grad_norm": 3.03125, + "grad_norm_var": 0.0283355712890625, + "learning_rate": 0.0001, + "loss": 5.5167, + "loss/crossentropy": 2.4488946199417114, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16186027228832245, + "step": 27404 + }, + { + "epoch": 0.8564375, + "grad_norm": 2.890625, + "grad_norm_var": 0.0293609619140625, + "learning_rate": 0.0001, + "loss": 5.6154, + "loss/crossentropy": 2.5921465158462524, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15896372497081757, + "step": 27406 + }, + { + "epoch": 0.8565, + "grad_norm": 3.125, + "grad_norm_var": 0.022801717122395832, + "learning_rate": 0.0001, + "loss": 5.8056, + "loss/crossentropy": 2.7032735347747803, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16569754481315613, + "step": 27408 + }, + { + "epoch": 0.8565625, + "grad_norm": 3.296875, + "grad_norm_var": 0.024137369791666665, + "learning_rate": 0.0001, + "loss": 5.3409, + "loss/crossentropy": 2.3539106845855713, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15299847722053528, + "step": 27410 + }, + { + "epoch": 0.856625, + "grad_norm": 2.921875, + "grad_norm_var": 0.026627604166666666, + "learning_rate": 0.0001, + "loss": 5.541, + "loss/crossentropy": 2.4986627101898193, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16048026829957962, + "step": 27412 + }, + { + "epoch": 0.8566875, + "grad_norm": 3.171875, + "grad_norm_var": 0.018257649739583333, + "learning_rate": 0.0001, + "loss": 5.537, + "loss/crossentropy": 2.5085893869400024, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15752866864204407, + "step": 27414 + }, + { + "epoch": 0.85675, + "grad_norm": 3.625, + "grad_norm_var": 0.0424224853515625, + "learning_rate": 0.0001, + "loss": 5.4293, + "loss/crossentropy": 2.3147988319396973, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16184009611606598, + "step": 27416 + }, + { + "epoch": 0.8568125, + "grad_norm": 3.265625, + "grad_norm_var": 0.04510091145833333, + "learning_rate": 0.0001, + "loss": 5.7798, + "loss/crossentropy": 2.7156347036361694, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16266556829214096, + "step": 27418 + }, + { + "epoch": 0.856875, + "grad_norm": 3.265625, + "grad_norm_var": 0.0453277587890625, + "learning_rate": 0.0001, + "loss": 5.7641, + "loss/crossentropy": 2.704028010368347, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1587410494685173, + "step": 27420 + }, + { + "epoch": 0.8569375, + "grad_norm": 2.765625, + "grad_norm_var": 0.04970601399739583, + "learning_rate": 0.0001, + "loss": 5.2174, + "loss/crossentropy": 2.2957347631454468, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.15114806592464447, + "step": 27422 + }, + { + "epoch": 0.857, + "grad_norm": 2.984375, + "grad_norm_var": 0.06676025390625, + "learning_rate": 0.0001, + "loss": 5.4437, + "loss/crossentropy": 2.395915150642395, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1614210456609726, + "step": 27424 + }, + { + "epoch": 0.8570625, + "grad_norm": 2.9375, + "grad_norm_var": 0.06992899576822917, + "learning_rate": 0.0001, + "loss": 5.9027, + "loss/crossentropy": 2.7567888498306274, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16459564864635468, + "step": 27426 + }, + { + "epoch": 0.857125, + "grad_norm": 2.890625, + "grad_norm_var": 0.07231343587239583, + "learning_rate": 0.0001, + "loss": 5.4217, + "loss/crossentropy": 2.4296735525131226, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15857718884944916, + "step": 27428 + }, + { + "epoch": 0.8571875, + "grad_norm": 3.03125, + "grad_norm_var": 0.06971028645833334, + "learning_rate": 0.0001, + "loss": 5.8225, + "loss/crossentropy": 2.723418951034546, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16186387836933136, + "step": 27430 + }, + { + "epoch": 0.85725, + "grad_norm": 3.234375, + "grad_norm_var": 0.04876302083333333, + "learning_rate": 0.0001, + "loss": 5.795, + "loss/crossentropy": 2.605363130569458, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17169761657714844, + "step": 27432 + }, + { + "epoch": 0.8573125, + "grad_norm": 3.234375, + "grad_norm_var": 0.04739176432291667, + "learning_rate": 0.0001, + "loss": 5.399, + "loss/crossentropy": 2.381725788116455, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15485329180955887, + "step": 27434 + }, + { + "epoch": 0.857375, + "grad_norm": 3.109375, + "grad_norm_var": 0.047459920247395836, + "learning_rate": 0.0001, + "loss": 5.6726, + "loss/crossentropy": 2.600713849067688, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16461395472288132, + "step": 27436 + }, + { + "epoch": 0.8574375, + "grad_norm": 3.0, + "grad_norm_var": 0.04155171712239583, + "learning_rate": 0.0001, + "loss": 5.623, + "loss/crossentropy": 2.550519824028015, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16271229088306427, + "step": 27438 + }, + { + "epoch": 0.8575, + "grad_norm": 3.125, + "grad_norm_var": 0.027229817708333333, + "learning_rate": 0.0001, + "loss": 5.4119, + "loss/crossentropy": 2.4456595182418823, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15131648629903793, + "step": 27440 + }, + { + "epoch": 0.8575625, + "grad_norm": 2.921875, + "grad_norm_var": 0.0182525634765625, + "learning_rate": 0.0001, + "loss": 5.4175, + "loss/crossentropy": 2.394680976867676, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1585310995578766, + "step": 27442 + }, + { + "epoch": 0.857625, + "grad_norm": 2.859375, + "grad_norm_var": 0.019383748372395832, + "learning_rate": 0.0001, + "loss": 5.4036, + "loss/crossentropy": 2.468693494796753, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.14817877113819122, + "step": 27444 + }, + { + "epoch": 0.8576875, + "grad_norm": 3.515625, + "grad_norm_var": 0.033056640625, + "learning_rate": 0.0001, + "loss": 5.9536, + "loss/crossentropy": 2.7569260597229004, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17396005988121033, + "step": 27446 + }, + { + "epoch": 0.85775, + "grad_norm": 3.0, + "grad_norm_var": 0.0317535400390625, + "learning_rate": 0.0001, + "loss": 5.6816, + "loss/crossentropy": 2.5776615142822266, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16273970156908035, + "step": 27448 + }, + { + "epoch": 0.8578125, + "grad_norm": 3.890625, + "grad_norm_var": 0.073974609375, + "learning_rate": 0.0001, + "loss": 5.676, + "loss/crossentropy": 2.4964919090270996, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16912566870450974, + "step": 27450 + }, + { + "epoch": 0.857875, + "grad_norm": 3.453125, + "grad_norm_var": 0.07975972493489583, + "learning_rate": 0.0001, + "loss": 5.459, + "loss/crossentropy": 2.4694265127182007, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15481796860694885, + "step": 27452 + }, + { + "epoch": 0.8579375, + "grad_norm": 3.28125, + "grad_norm_var": 0.08210347493489584, + "learning_rate": 0.0001, + "loss": 5.7259, + "loss/crossentropy": 2.6806472539901733, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.16312392055988312, + "step": 27454 + }, + { + "epoch": 0.858, + "grad_norm": 2.921875, + "grad_norm_var": 0.08254292805989584, + "learning_rate": 0.0001, + "loss": 5.3031, + "loss/crossentropy": 2.4080389738082886, + "loss/hidden": 1.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.15122409909963608, + "step": 27456 + }, + { + "epoch": 0.8580625, + "grad_norm": 3.09375, + "grad_norm_var": 0.07893880208333333, + "learning_rate": 0.0001, + "loss": 5.6916, + "loss/crossentropy": 2.54345440864563, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16715845465660095, + "step": 27458 + }, + { + "epoch": 0.858125, + "grad_norm": 3.21875, + "grad_norm_var": 0.06770426432291667, + "learning_rate": 0.0001, + "loss": 5.7702, + "loss/crossentropy": 2.634869337081909, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1670462042093277, + "step": 27460 + }, + { + "epoch": 0.8581875, + "grad_norm": 3.15625, + "grad_norm_var": 0.06020406087239583, + "learning_rate": 0.0001, + "loss": 5.6032, + "loss/crossentropy": 2.5240001678466797, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1657312512397766, + "step": 27462 + }, + { + "epoch": 0.85825, + "grad_norm": 3.015625, + "grad_norm_var": 0.06500651041666666, + "learning_rate": 0.0001, + "loss": 5.5489, + "loss/crossentropy": 2.4676438570022583, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16280968487262726, + "step": 27464 + }, + { + "epoch": 0.8583125, + "grad_norm": 2.8125, + "grad_norm_var": 0.043375651041666664, + "learning_rate": 0.0001, + "loss": 5.8323, + "loss/crossentropy": 2.6945706605911255, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16728605329990387, + "step": 27466 + }, + { + "epoch": 0.858375, + "grad_norm": 3.34375, + "grad_norm_var": 0.1019683837890625, + "learning_rate": 0.0001, + "loss": 6.2779, + "loss/crossentropy": 2.962168574333191, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1823587492108345, + "step": 27468 + }, + { + "epoch": 0.8584375, + "grad_norm": 2.8125, + "grad_norm_var": 0.1092437744140625, + "learning_rate": 0.0001, + "loss": 5.3642, + "loss/crossentropy": 2.365685820579529, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1541442573070526, + "step": 27470 + }, + { + "epoch": 0.8585, + "grad_norm": 3.046875, + "grad_norm_var": 0.11240132649739583, + "learning_rate": 0.0001, + "loss": 5.3879, + "loss/crossentropy": 2.463380217552185, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15183178335428238, + "step": 27472 + }, + { + "epoch": 0.8585625, + "grad_norm": 3.09375, + "grad_norm_var": 0.11122639973958333, + "learning_rate": 0.0001, + "loss": 5.5478, + "loss/crossentropy": 2.5058140754699707, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.15732257813215256, + "step": 27474 + }, + { + "epoch": 0.858625, + "grad_norm": 2.984375, + "grad_norm_var": 0.11325581868489583, + "learning_rate": 0.0001, + "loss": 5.455, + "loss/crossentropy": 2.4045562744140625, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16129323840141296, + "step": 27476 + }, + { + "epoch": 0.8586875, + "grad_norm": 2.90625, + "grad_norm_var": 0.1173980712890625, + "learning_rate": 0.0001, + "loss": 5.6568, + "loss/crossentropy": 2.6127243041992188, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.16261501610279083, + "step": 27478 + }, + { + "epoch": 0.85875, + "grad_norm": 3.03125, + "grad_norm_var": 0.11155192057291667, + "learning_rate": 0.0001, + "loss": 5.6026, + "loss/crossentropy": 2.579832673072815, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15813258290290833, + "step": 27480 + }, + { + "epoch": 0.8588125, + "grad_norm": 3.4375, + "grad_norm_var": 0.10341389973958333, + "learning_rate": 0.0001, + "loss": 5.7871, + "loss/crossentropy": 2.5788873434066772, + "loss/hidden": 1.51171875, + "loss/jsd": 0.0, + "loss/logits": 0.16964523494243622, + "step": 27482 + }, + { + "epoch": 0.858875, + "grad_norm": 3.0625, + "grad_norm_var": 0.022484334309895833, + "learning_rate": 0.0001, + "loss": 5.4816, + "loss/crossentropy": 2.4420281648635864, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15864020586013794, + "step": 27484 + }, + { + "epoch": 0.8589375, + "grad_norm": 2.796875, + "grad_norm_var": 0.024128214518229166, + "learning_rate": 0.0001, + "loss": 5.6988, + "loss/crossentropy": 2.617542028427124, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16359714418649673, + "step": 27486 + }, + { + "epoch": 0.859, + "grad_norm": 3.140625, + "grad_norm_var": 0.021610514322916666, + "learning_rate": 0.0001, + "loss": 5.684, + "loss/crossentropy": 2.529650926589966, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16582664102315903, + "step": 27488 + }, + { + "epoch": 0.8590625, + "grad_norm": 3.609375, + "grad_norm_var": 0.046686808268229164, + "learning_rate": 0.0001, + "loss": 5.5575, + "loss/crossentropy": 2.4721691608428955, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16321710497140884, + "step": 27490 + }, + { + "epoch": 0.859125, + "grad_norm": 2.796875, + "grad_norm_var": 0.05162760416666667, + "learning_rate": 0.0001, + "loss": 5.5237, + "loss/crossentropy": 2.4864814281463623, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15645458549261093, + "step": 27492 + }, + { + "epoch": 0.8591875, + "grad_norm": 2.828125, + "grad_norm_var": 0.054052734375, + "learning_rate": 0.0001, + "loss": 5.3984, + "loss/crossentropy": 2.463224768638611, + "loss/hidden": 1.38671875, + "loss/jsd": 0.0, + "loss/logits": 0.15484201908111572, + "step": 27494 + }, + { + "epoch": 0.85925, + "grad_norm": 3.046875, + "grad_norm_var": 0.0532379150390625, + "learning_rate": 0.0001, + "loss": 5.3393, + "loss/crossentropy": 2.3829842805862427, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.1557861790060997, + "step": 27496 + }, + { + "epoch": 0.8593125, + "grad_norm": 3.25, + "grad_norm_var": 0.0466461181640625, + "learning_rate": 0.0001, + "loss": 5.5659, + "loss/crossentropy": 2.4582934379577637, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16271311789751053, + "step": 27498 + }, + { + "epoch": 0.859375, + "grad_norm": 2.953125, + "grad_norm_var": 0.0477935791015625, + "learning_rate": 0.0001, + "loss": 5.2957, + "loss/crossentropy": 2.3343403339385986, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15160369873046875, + "step": 27500 + }, + { + "epoch": 0.8594375, + "grad_norm": 3.25, + "grad_norm_var": 0.047118123372395834, + "learning_rate": 0.0001, + "loss": 5.8696, + "loss/crossentropy": 2.621949076652527, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17710597068071365, + "step": 27502 + }, + { + "epoch": 0.8595, + "grad_norm": 2.703125, + "grad_norm_var": 0.05353902180989583, + "learning_rate": 0.0001, + "loss": 5.6026, + "loss/crossentropy": 2.6189295053482056, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.15657124668359756, + "step": 27504 + }, + { + "epoch": 0.8595625, + "grad_norm": 3.28125, + "grad_norm_var": 0.035445149739583334, + "learning_rate": 0.0001, + "loss": 5.238, + "loss/crossentropy": 2.364015221595764, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.14677252620458603, + "step": 27506 + }, + { + "epoch": 0.859625, + "grad_norm": 3.0, + "grad_norm_var": 0.03430582682291667, + "learning_rate": 0.0001, + "loss": 5.9101, + "loss/crossentropy": 2.6949862241744995, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17463178932666779, + "step": 27508 + }, + { + "epoch": 0.8596875, + "grad_norm": 3.0625, + "grad_norm_var": 0.033219401041666666, + "learning_rate": 0.0001, + "loss": 5.8926, + "loss/crossentropy": 2.6635549068450928, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17368291318416595, + "step": 27510 + }, + { + "epoch": 0.85975, + "grad_norm": 3.015625, + "grad_norm_var": 0.03497721354166667, + "learning_rate": 0.0001, + "loss": 5.5182, + "loss/crossentropy": 2.4884071350097656, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16000645607709885, + "step": 27512 + }, + { + "epoch": 0.8598125, + "grad_norm": 2.984375, + "grad_norm_var": 0.033524576822916666, + "learning_rate": 0.0001, + "loss": 5.6427, + "loss/crossentropy": 2.646321177482605, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1566697359085083, + "step": 27514 + }, + { + "epoch": 0.859875, + "grad_norm": 2.953125, + "grad_norm_var": 0.03303934733072917, + "learning_rate": 0.0001, + "loss": 5.9071, + "loss/crossentropy": 2.816299080848694, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16142502427101135, + "step": 27516 + }, + { + "epoch": 0.8599375, + "grad_norm": 2.96875, + "grad_norm_var": 0.026883951822916665, + "learning_rate": 0.0001, + "loss": 5.5468, + "loss/crossentropy": 2.5224190950393677, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15517209470272064, + "step": 27518 + }, + { + "epoch": 0.86, + "grad_norm": 3.140625, + "grad_norm_var": 0.021317545572916666, + "learning_rate": 0.0001, + "loss": 5.3727, + "loss/crossentropy": 2.3843857049942017, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15742884576320648, + "step": 27520 + }, + { + "epoch": 0.8600625, + "grad_norm": 2.734375, + "grad_norm_var": 0.017585245768229167, + "learning_rate": 0.0001, + "loss": 5.6011, + "loss/crossentropy": 2.5978078842163086, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15774965286254883, + "step": 27522 + }, + { + "epoch": 0.860125, + "grad_norm": 2.90625, + "grad_norm_var": 0.015458170572916667, + "learning_rate": 0.0001, + "loss": 5.6141, + "loss/crossentropy": 2.585380434989929, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.162246972322464, + "step": 27524 + }, + { + "epoch": 0.8601875, + "grad_norm": 2.921875, + "grad_norm_var": 0.01259765625, + "learning_rate": 0.0001, + "loss": 5.6191, + "loss/crossentropy": 2.557377338409424, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16086407750844955, + "step": 27526 + }, + { + "epoch": 0.86025, + "grad_norm": 2.984375, + "grad_norm_var": 0.013792928059895833, + "learning_rate": 0.0001, + "loss": 5.6683, + "loss/crossentropy": 2.6066770553588867, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16280128806829453, + "step": 27528 + }, + { + "epoch": 0.8603125, + "grad_norm": 2.8125, + "grad_norm_var": 0.01500244140625, + "learning_rate": 0.0001, + "loss": 5.5504, + "loss/crossentropy": 2.544152617454529, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1592186689376831, + "step": 27530 + }, + { + "epoch": 0.860375, + "grad_norm": 3.3125, + "grad_norm_var": 0.02144775390625, + "learning_rate": 0.0001, + "loss": 5.6296, + "loss/crossentropy": 2.54948353767395, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16387441009283066, + "step": 27532 + }, + { + "epoch": 0.8604375, + "grad_norm": 3.328125, + "grad_norm_var": 0.0269927978515625, + "learning_rate": 0.0001, + "loss": 5.7507, + "loss/crossentropy": 2.6128547191619873, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16573945432901382, + "step": 27534 + }, + { + "epoch": 0.8605, + "grad_norm": 2.890625, + "grad_norm_var": 0.04712626139322917, + "learning_rate": 0.0001, + "loss": 5.9992, + "loss/crossentropy": 2.6797434091567993, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.1823408454656601, + "step": 27536 + }, + { + "epoch": 0.8605625, + "grad_norm": 3.421875, + "grad_norm_var": 0.04597066243489583, + "learning_rate": 0.0001, + "loss": 5.7349, + "loss/crossentropy": 2.512515425682068, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.17223601788282394, + "step": 27538 + }, + { + "epoch": 0.860625, + "grad_norm": 3.09375, + "grad_norm_var": 0.043431599934895836, + "learning_rate": 0.0001, + "loss": 5.6209, + "loss/crossentropy": 2.54488742351532, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1591637134552002, + "step": 27540 + }, + { + "epoch": 0.8606875, + "grad_norm": 3.0625, + "grad_norm_var": 0.040526326497395834, + "learning_rate": 0.0001, + "loss": 5.8191, + "loss/crossentropy": 2.6895852088928223, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16842305660247803, + "step": 27542 + }, + { + "epoch": 0.86075, + "grad_norm": 3.015625, + "grad_norm_var": 0.04138895670572917, + "learning_rate": 0.0001, + "loss": 5.655, + "loss/crossentropy": 2.6419804096221924, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.15793906152248383, + "step": 27544 + }, + { + "epoch": 0.8608125, + "grad_norm": 3.265625, + "grad_norm_var": 0.03638407389322917, + "learning_rate": 0.0001, + "loss": 5.7652, + "loss/crossentropy": 2.6107258796691895, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16583485901355743, + "step": 27546 + }, + { + "epoch": 0.860875, + "grad_norm": 3.0, + "grad_norm_var": 0.03551432291666667, + "learning_rate": 0.0001, + "loss": 5.3518, + "loss/crossentropy": 2.3826801776885986, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15472815185785294, + "step": 27548 + }, + { + "epoch": 0.8609375, + "grad_norm": 3.078125, + "grad_norm_var": 0.034912109375, + "learning_rate": 0.0001, + "loss": 5.7562, + "loss/crossentropy": 2.644647240638733, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16310906410217285, + "step": 27550 + }, + { + "epoch": 0.861, + "grad_norm": 3.25, + "grad_norm_var": 0.019115193684895834, + "learning_rate": 0.0001, + "loss": 5.51, + "loss/crossentropy": 2.437491297721863, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16037526726722717, + "step": 27552 + }, + { + "epoch": 0.8610625, + "grad_norm": 2.9375, + "grad_norm_var": 0.0150390625, + "learning_rate": 0.0001, + "loss": 5.6361, + "loss/crossentropy": 2.5380167961120605, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16606280207633972, + "step": 27554 + }, + { + "epoch": 0.861125, + "grad_norm": 3.0, + "grad_norm_var": 0.015327962239583333, + "learning_rate": 0.0001, + "loss": 5.5454, + "loss/crossentropy": 2.4664628505706787, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.16648901253938675, + "step": 27556 + }, + { + "epoch": 0.8611875, + "grad_norm": 2.890625, + "grad_norm_var": 0.020921834309895835, + "learning_rate": 0.0001, + "loss": 5.6024, + "loss/crossentropy": 2.569869041442871, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1598953977227211, + "step": 27558 + }, + { + "epoch": 0.86125, + "grad_norm": 3.453125, + "grad_norm_var": 0.03203023274739583, + "learning_rate": 0.0001, + "loss": 5.9652, + "loss/crossentropy": 2.7367547750473022, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1740185245871544, + "step": 27560 + }, + { + "epoch": 0.8613125, + "grad_norm": 2.953125, + "grad_norm_var": 0.0314117431640625, + "learning_rate": 0.0001, + "loss": 5.9583, + "loss/crossentropy": 2.7699532508850098, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1703985407948494, + "step": 27562 + }, + { + "epoch": 0.861375, + "grad_norm": 3.015625, + "grad_norm_var": 0.039383951822916666, + "learning_rate": 0.0001, + "loss": 5.4268, + "loss/crossentropy": 2.501665711402893, + "loss/hidden": 1.38671875, + "loss/jsd": 0.0, + "loss/logits": 0.1538463532924652, + "step": 27564 + }, + { + "epoch": 0.8614375, + "grad_norm": 3.203125, + "grad_norm_var": 0.038798014322916664, + "learning_rate": 0.0001, + "loss": 5.4313, + "loss/crossentropy": 2.372091054916382, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16100051999092102, + "step": 27566 + }, + { + "epoch": 0.8615, + "grad_norm": 2.9375, + "grad_norm_var": 0.041014607747395834, + "learning_rate": 0.0001, + "loss": 5.579, + "loss/crossentropy": 2.53856885433197, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1595095619559288, + "step": 27568 + }, + { + "epoch": 0.8615625, + "grad_norm": 3.203125, + "grad_norm_var": 0.040751139322916664, + "learning_rate": 0.0001, + "loss": 5.5701, + "loss/crossentropy": 2.5165361166000366, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16082197427749634, + "step": 27570 + }, + { + "epoch": 0.861625, + "grad_norm": 3.328125, + "grad_norm_var": 0.046610514322916664, + "learning_rate": 0.0001, + "loss": 5.542, + "loss/crossentropy": 2.5036321878433228, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15930962562561035, + "step": 27572 + }, + { + "epoch": 0.8616875, + "grad_norm": 3.140625, + "grad_norm_var": 0.04087626139322917, + "learning_rate": 0.0001, + "loss": 5.5689, + "loss/crossentropy": 2.4433289766311646, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16607220470905304, + "step": 27574 + }, + { + "epoch": 0.86175, + "grad_norm": 2.859375, + "grad_norm_var": 0.037775675455729164, + "learning_rate": 0.0001, + "loss": 5.8773, + "loss/crossentropy": 2.7035598754882812, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17167061567306519, + "step": 27576 + }, + { + "epoch": 0.8618125, + "grad_norm": 3.03125, + "grad_norm_var": 0.03623046875, + "learning_rate": 0.0001, + "loss": 5.4398, + "loss/crossentropy": 2.4577486515045166, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15289676189422607, + "step": 27578 + }, + { + "epoch": 0.861875, + "grad_norm": 3.09375, + "grad_norm_var": 0.040608723958333336, + "learning_rate": 0.0001, + "loss": 5.6029, + "loss/crossentropy": 2.5073471069335938, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.16267910599708557, + "step": 27580 + }, + { + "epoch": 0.8619375, + "grad_norm": 3.375, + "grad_norm_var": 0.04343973795572917, + "learning_rate": 0.0001, + "loss": 5.5533, + "loss/crossentropy": 2.4675310850143433, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.15896278619766235, + "step": 27582 + }, + { + "epoch": 0.862, + "grad_norm": 3.0, + "grad_norm_var": 0.035187784830729166, + "learning_rate": 0.0001, + "loss": 5.2987, + "loss/crossentropy": 2.3355921506881714, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1549062505364418, + "step": 27584 + }, + { + "epoch": 0.8620625, + "grad_norm": 3.140625, + "grad_norm_var": 0.0320220947265625, + "learning_rate": 0.0001, + "loss": 5.8557, + "loss/crossentropy": 2.7173553705215454, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16618236899375916, + "step": 27586 + }, + { + "epoch": 0.862125, + "grad_norm": 3.015625, + "grad_norm_var": 0.033202107747395834, + "learning_rate": 0.0001, + "loss": 5.6455, + "loss/crossentropy": 2.6528557538986206, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15629655122756958, + "step": 27588 + }, + { + "epoch": 0.8621875, + "grad_norm": 2.875, + "grad_norm_var": 0.03720703125, + "learning_rate": 0.0001, + "loss": 5.5023, + "loss/crossentropy": 2.517209768295288, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15553860366344452, + "step": 27590 + }, + { + "epoch": 0.86225, + "grad_norm": 2.8125, + "grad_norm_var": 0.03209228515625, + "learning_rate": 0.0001, + "loss": 5.3081, + "loss/crossentropy": 2.4127840995788574, + "loss/hidden": 1.38671875, + "loss/jsd": 0.0, + "loss/logits": 0.15086285769939423, + "step": 27592 + }, + { + "epoch": 0.8623125, + "grad_norm": 2.703125, + "grad_norm_var": 0.03828125, + "learning_rate": 0.0001, + "loss": 5.3493, + "loss/crossentropy": 2.449027419090271, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.14940007030963898, + "step": 27594 + }, + { + "epoch": 0.862375, + "grad_norm": 2.828125, + "grad_norm_var": 0.026595052083333334, + "learning_rate": 0.0001, + "loss": 5.4339, + "loss/crossentropy": 2.473372459411621, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1495700106024742, + "step": 27596 + }, + { + "epoch": 0.8624375, + "grad_norm": 3.03125, + "grad_norm_var": 0.014867146809895834, + "learning_rate": 0.0001, + "loss": 5.4278, + "loss/crossentropy": 2.488494396209717, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.1540842279791832, + "step": 27598 + }, + { + "epoch": 0.8625, + "grad_norm": 2.921875, + "grad_norm_var": 0.01197509765625, + "learning_rate": 0.0001, + "loss": 5.5219, + "loss/crossentropy": 2.4091190099716187, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16323351114988327, + "step": 27600 + }, + { + "epoch": 0.8625625, + "grad_norm": 3.453125, + "grad_norm_var": 0.027046712239583333, + "learning_rate": 0.0001, + "loss": 5.6252, + "loss/crossentropy": 2.429298520088196, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16998374462127686, + "step": 27602 + }, + { + "epoch": 0.862625, + "grad_norm": 2.90625, + "grad_norm_var": 0.027668253580729166, + "learning_rate": 0.0001, + "loss": 5.4972, + "loss/crossentropy": 2.4304704666137695, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16136115789413452, + "step": 27604 + }, + { + "epoch": 0.8626875, + "grad_norm": 3.03125, + "grad_norm_var": 0.0279296875, + "learning_rate": 0.0001, + "loss": 5.716, + "loss/crossentropy": 2.5851006507873535, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1650448888540268, + "step": 27606 + }, + { + "epoch": 0.86275, + "grad_norm": 3.0, + "grad_norm_var": 0.026488240559895834, + "learning_rate": 0.0001, + "loss": 5.627, + "loss/crossentropy": 2.5394445657730103, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.1634387969970703, + "step": 27608 + }, + { + "epoch": 0.8628125, + "grad_norm": 2.890625, + "grad_norm_var": 0.021956380208333334, + "learning_rate": 0.0001, + "loss": 5.5363, + "loss/crossentropy": 2.5663862228393555, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15401819348335266, + "step": 27610 + }, + { + "epoch": 0.862875, + "grad_norm": 2.96875, + "grad_norm_var": 0.023726399739583334, + "learning_rate": 0.0001, + "loss": 5.5966, + "loss/crossentropy": 2.5395649671554565, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15999913960695267, + "step": 27612 + }, + { + "epoch": 0.8629375, + "grad_norm": 3.34375, + "grad_norm_var": 0.0301422119140625, + "learning_rate": 0.0001, + "loss": 5.9448, + "loss/crossentropy": 2.744190216064453, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.17161927372217178, + "step": 27614 + }, + { + "epoch": 0.863, + "grad_norm": 3.359375, + "grad_norm_var": 0.03528645833333333, + "learning_rate": 0.0001, + "loss": 5.3661, + "loss/crossentropy": 2.300475001335144, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.15890540182590485, + "step": 27616 + }, + { + "epoch": 0.8630625, + "grad_norm": 3.203125, + "grad_norm_var": 0.027098592122395834, + "learning_rate": 0.0001, + "loss": 5.4225, + "loss/crossentropy": 2.4023449420928955, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15826784819364548, + "step": 27618 + }, + { + "epoch": 0.863125, + "grad_norm": 3.078125, + "grad_norm_var": 0.024974568684895834, + "learning_rate": 0.0001, + "loss": 5.7417, + "loss/crossentropy": 2.674994111061096, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1609710156917572, + "step": 27620 + }, + { + "epoch": 0.8631875, + "grad_norm": 2.859375, + "grad_norm_var": 0.0282135009765625, + "learning_rate": 0.0001, + "loss": 5.268, + "loss/crossentropy": 2.3086975812911987, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1529662385582924, + "step": 27622 + }, + { + "epoch": 0.86325, + "grad_norm": 3.484375, + "grad_norm_var": 0.037430826822916666, + "learning_rate": 0.0001, + "loss": 5.7246, + "loss/crossentropy": 2.5959867238998413, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16715864837169647, + "step": 27624 + }, + { + "epoch": 0.8633125, + "grad_norm": 2.921875, + "grad_norm_var": 0.03291015625, + "learning_rate": 0.0001, + "loss": 5.6348, + "loss/crossentropy": 2.5706393718719482, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1571996808052063, + "step": 27626 + }, + { + "epoch": 0.863375, + "grad_norm": 3.390625, + "grad_norm_var": 0.03599853515625, + "learning_rate": 0.0001, + "loss": 5.7237, + "loss/crossentropy": 2.5594701766967773, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.16877028346061707, + "step": 27628 + }, + { + "epoch": 0.8634375, + "grad_norm": 3.09375, + "grad_norm_var": 0.031151326497395833, + "learning_rate": 0.0001, + "loss": 5.5329, + "loss/crossentropy": 2.47390353679657, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16019625216722488, + "step": 27630 + }, + { + "epoch": 0.8635, + "grad_norm": 3.109375, + "grad_norm_var": 0.025316365559895835, + "learning_rate": 0.0001, + "loss": 5.4281, + "loss/crossentropy": 2.486311912536621, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.14847956597805023, + "step": 27632 + }, + { + "epoch": 0.8635625, + "grad_norm": 2.953125, + "grad_norm_var": 0.025755818684895834, + "learning_rate": 0.0001, + "loss": 5.5117, + "loss/crossentropy": 2.4831504821777344, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.1610625460743904, + "step": 27634 + }, + { + "epoch": 0.863625, + "grad_norm": 3.046875, + "grad_norm_var": 0.029938761393229166, + "learning_rate": 0.0001, + "loss": 5.7136, + "loss/crossentropy": 2.6036019325256348, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1672532856464386, + "step": 27636 + }, + { + "epoch": 0.8636875, + "grad_norm": 2.859375, + "grad_norm_var": 0.0309967041015625, + "learning_rate": 0.0001, + "loss": 5.1623, + "loss/crossentropy": 2.291244864463806, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1456953063607216, + "step": 27638 + }, + { + "epoch": 0.86375, + "grad_norm": 2.84375, + "grad_norm_var": 0.024803670247395833, + "learning_rate": 0.0001, + "loss": 5.4878, + "loss/crossentropy": 2.5077956914901733, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.1573728397488594, + "step": 27640 + }, + { + "epoch": 0.8638125, + "grad_norm": 2.9375, + "grad_norm_var": 0.024177042643229167, + "learning_rate": 0.0001, + "loss": 5.3704, + "loss/crossentropy": 2.4454660415649414, + "loss/hidden": 1.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.15420855581760406, + "step": 27642 + }, + { + "epoch": 0.863875, + "grad_norm": 3.109375, + "grad_norm_var": 0.017560831705729165, + "learning_rate": 0.0001, + "loss": 5.6064, + "loss/crossentropy": 2.6103018522262573, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15468335151672363, + "step": 27644 + }, + { + "epoch": 0.8639375, + "grad_norm": 3.015625, + "grad_norm_var": 0.01744384765625, + "learning_rate": 0.0001, + "loss": 5.5682, + "loss/crossentropy": 2.5905452966690063, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15284481644630432, + "step": 27646 + }, + { + "epoch": 0.864, + "grad_norm": 3.140625, + "grad_norm_var": 0.017267862955729168, + "learning_rate": 0.0001, + "loss": 5.6378, + "loss/crossentropy": 2.5828408002853394, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.161359004676342, + "step": 27648 + }, + { + "epoch": 0.8640625, + "grad_norm": 3.109375, + "grad_norm_var": 0.018244425455729168, + "learning_rate": 0.0001, + "loss": 5.549, + "loss/crossentropy": 2.5506070852279663, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.157648004591465, + "step": 27650 + }, + { + "epoch": 0.864125, + "grad_norm": 3.078125, + "grad_norm_var": 0.013109334309895833, + "learning_rate": 0.0001, + "loss": 5.7008, + "loss/crossentropy": 2.610270857810974, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16569805890321732, + "step": 27652 + }, + { + "epoch": 0.8641875, + "grad_norm": 3.125, + "grad_norm_var": 0.00875244140625, + "learning_rate": 0.0001, + "loss": 5.381, + "loss/crossentropy": 2.4877549409866333, + "loss/hidden": 1.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.1526072770357132, + "step": 27654 + }, + { + "epoch": 0.86425, + "grad_norm": 2.84375, + "grad_norm_var": 0.017838541666666666, + "learning_rate": 0.0001, + "loss": 5.8142, + "loss/crossentropy": 2.636727809906006, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17126110941171646, + "step": 27656 + }, + { + "epoch": 0.8643125, + "grad_norm": 2.96875, + "grad_norm_var": 0.024543253580729167, + "learning_rate": 0.0001, + "loss": 5.6218, + "loss/crossentropy": 2.5793492794036865, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16088585555553436, + "step": 27658 + }, + { + "epoch": 0.864375, + "grad_norm": 3.0, + "grad_norm_var": 0.030403645833333333, + "learning_rate": 0.0001, + "loss": 5.6904, + "loss/crossentropy": 2.505601406097412, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1704343557357788, + "step": 27660 + }, + { + "epoch": 0.8644375, + "grad_norm": 3.0, + "grad_norm_var": 0.029866536458333332, + "learning_rate": 0.0001, + "loss": 5.7843, + "loss/crossentropy": 2.6765681505203247, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16468166559934616, + "step": 27662 + }, + { + "epoch": 0.8645, + "grad_norm": 2.9375, + "grad_norm_var": 0.04508056640625, + "learning_rate": 0.0001, + "loss": 5.6697, + "loss/crossentropy": 2.5738131999969482, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16231893748044968, + "step": 27664 + }, + { + "epoch": 0.8645625, + "grad_norm": 3.203125, + "grad_norm_var": 0.04541015625, + "learning_rate": 0.0001, + "loss": 5.4086, + "loss/crossentropy": 2.4533698558807373, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15137922018766403, + "step": 27666 + }, + { + "epoch": 0.864625, + "grad_norm": 2.921875, + "grad_norm_var": 0.0484771728515625, + "learning_rate": 0.0001, + "loss": 5.7024, + "loss/crossentropy": 2.58325457572937, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16620948165655136, + "step": 27668 + }, + { + "epoch": 0.8646875, + "grad_norm": 3.125, + "grad_norm_var": 0.051691691080729164, + "learning_rate": 0.0001, + "loss": 5.7393, + "loss/crossentropy": 2.6752718687057495, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.16539110243320465, + "step": 27670 + }, + { + "epoch": 0.86475, + "grad_norm": 3.109375, + "grad_norm_var": 0.042578125, + "learning_rate": 0.0001, + "loss": 5.562, + "loss/crossentropy": 2.531635284423828, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15929072350263596, + "step": 27672 + }, + { + "epoch": 0.8648125, + "grad_norm": 3.0, + "grad_norm_var": 0.07111714680989584, + "learning_rate": 0.0001, + "loss": 5.6956, + "loss/crossentropy": 2.5755029916763306, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1674804836511612, + "step": 27674 + }, + { + "epoch": 0.864875, + "grad_norm": 3.125, + "grad_norm_var": 0.06580301920572916, + "learning_rate": 0.0001, + "loss": 5.3852, + "loss/crossentropy": 2.3932870626449585, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15778710693120956, + "step": 27676 + }, + { + "epoch": 0.8649375, + "grad_norm": 3.421875, + "grad_norm_var": 0.0716705322265625, + "learning_rate": 0.0001, + "loss": 5.8202, + "loss/crossentropy": 2.75151526927948, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16038886457681656, + "step": 27678 + }, + { + "epoch": 0.865, + "grad_norm": 2.984375, + "grad_norm_var": 0.06311442057291666, + "learning_rate": 0.0001, + "loss": 5.5359, + "loss/crossentropy": 2.5368924140930176, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15771739929914474, + "step": 27680 + }, + { + "epoch": 0.8650625, + "grad_norm": 3.28125, + "grad_norm_var": 0.06184794108072917, + "learning_rate": 0.0001, + "loss": 5.4507, + "loss/crossentropy": 2.4856276512145996, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15119660645723343, + "step": 27682 + }, + { + "epoch": 0.865125, + "grad_norm": 3.3125, + "grad_norm_var": 0.0642730712890625, + "learning_rate": 0.0001, + "loss": 5.6671, + "loss/crossentropy": 2.5770288705825806, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16369254887104034, + "step": 27684 + }, + { + "epoch": 0.8651875, + "grad_norm": 2.8125, + "grad_norm_var": 0.06990559895833333, + "learning_rate": 0.0001, + "loss": 5.2007, + "loss/crossentropy": 2.3456820249557495, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.14409735053777695, + "step": 27686 + }, + { + "epoch": 0.86525, + "grad_norm": 3.265625, + "grad_norm_var": 0.06848551432291666, + "learning_rate": 0.0001, + "loss": 5.7854, + "loss/crossentropy": 2.645024299621582, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16560453176498413, + "step": 27688 + }, + { + "epoch": 0.8653125, + "grad_norm": 3.0625, + "grad_norm_var": 0.03316141764322917, + "learning_rate": 0.0001, + "loss": 5.7614, + "loss/crossentropy": 2.680987596511841, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.16545983403921127, + "step": 27690 + }, + { + "epoch": 0.865375, + "grad_norm": 3.03125, + "grad_norm_var": 0.03179931640625, + "learning_rate": 0.0001, + "loss": 5.7494, + "loss/crossentropy": 2.634661316871643, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16537588089704514, + "step": 27692 + }, + { + "epoch": 0.8654375, + "grad_norm": 3.0625, + "grad_norm_var": 0.022249348958333335, + "learning_rate": 0.0001, + "loss": 5.6987, + "loss/crossentropy": 2.62981116771698, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16314272582530975, + "step": 27694 + }, + { + "epoch": 0.8655, + "grad_norm": 2.96875, + "grad_norm_var": 0.019806925455729166, + "learning_rate": 0.0001, + "loss": 5.5717, + "loss/crossentropy": 2.5568922758102417, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1573409140110016, + "step": 27696 + }, + { + "epoch": 0.8655625, + "grad_norm": 2.984375, + "grad_norm_var": 0.017756144205729168, + "learning_rate": 0.0001, + "loss": 5.3276, + "loss/crossentropy": 2.3811562061309814, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1532423198223114, + "step": 27698 + }, + { + "epoch": 0.865625, + "grad_norm": 3.078125, + "grad_norm_var": 0.012157185872395834, + "learning_rate": 0.0001, + "loss": 5.4508, + "loss/crossentropy": 2.5310704708099365, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.15212911367416382, + "step": 27700 + }, + { + "epoch": 0.8656875, + "grad_norm": 2.953125, + "grad_norm_var": 0.0090972900390625, + "learning_rate": 0.0001, + "loss": 5.7516, + "loss/crossentropy": 2.7092376947402954, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15853281319141388, + "step": 27702 + }, + { + "epoch": 0.86575, + "grad_norm": 3.015625, + "grad_norm_var": 0.0045806884765625, + "learning_rate": 0.0001, + "loss": 5.2488, + "loss/crossentropy": 2.309325337409973, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.14941205829381943, + "step": 27704 + }, + { + "epoch": 0.8658125, + "grad_norm": 2.9375, + "grad_norm_var": 0.004084269205729167, + "learning_rate": 0.0001, + "loss": 5.4029, + "loss/crossentropy": 2.487838864326477, + "loss/hidden": 1.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.15126828849315643, + "step": 27706 + }, + { + "epoch": 0.865875, + "grad_norm": 3.15625, + "grad_norm_var": 0.0103424072265625, + "learning_rate": 0.0001, + "loss": 5.8147, + "loss/crossentropy": 2.7337146997451782, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16474289447069168, + "step": 27708 + }, + { + "epoch": 0.8659375, + "grad_norm": 2.859375, + "grad_norm_var": 0.011139933268229167, + "learning_rate": 0.0001, + "loss": 5.6107, + "loss/crossentropy": 2.599394202232361, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15738170593976974, + "step": 27710 + }, + { + "epoch": 0.866, + "grad_norm": 2.828125, + "grad_norm_var": 0.022004191080729166, + "learning_rate": 0.0001, + "loss": 5.7284, + "loss/crossentropy": 2.6974366903305054, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16090577095746994, + "step": 27712 + }, + { + "epoch": 0.8660625, + "grad_norm": 3.0, + "grad_norm_var": 0.027164713541666666, + "learning_rate": 0.0001, + "loss": 5.2408, + "loss/crossentropy": 2.3260730504989624, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1485009863972664, + "step": 27714 + }, + { + "epoch": 0.866125, + "grad_norm": 3.234375, + "grad_norm_var": 0.03365478515625, + "learning_rate": 0.0001, + "loss": 5.843, + "loss/crossentropy": 2.6683355569839478, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.17176324129104614, + "step": 27716 + }, + { + "epoch": 0.8661875, + "grad_norm": 3.078125, + "grad_norm_var": 0.033991495768229164, + "learning_rate": 0.0001, + "loss": 5.5059, + "loss/crossentropy": 2.5150938034057617, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15416057407855988, + "step": 27718 + }, + { + "epoch": 0.86625, + "grad_norm": 3.03125, + "grad_norm_var": 0.0335601806640625, + "learning_rate": 0.0001, + "loss": 5.325, + "loss/crossentropy": 2.4102606773376465, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15084628015756607, + "step": 27720 + }, + { + "epoch": 0.8663125, + "grad_norm": 3.125, + "grad_norm_var": 0.03590087890625, + "learning_rate": 0.0001, + "loss": 5.6581, + "loss/crossentropy": 2.6280406713485718, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.1619897335767746, + "step": 27722 + }, + { + "epoch": 0.866375, + "grad_norm": 3.234375, + "grad_norm_var": 0.0326080322265625, + "learning_rate": 0.0001, + "loss": 5.4041, + "loss/crossentropy": 2.382719874382019, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15760424733161926, + "step": 27724 + }, + { + "epoch": 0.8664375, + "grad_norm": 3.125, + "grad_norm_var": 0.031184895833333334, + "learning_rate": 0.0001, + "loss": 5.4625, + "loss/crossentropy": 2.4307433366775513, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16020990163087845, + "step": 27726 + }, + { + "epoch": 0.8665, + "grad_norm": 3.125, + "grad_norm_var": 0.021907552083333334, + "learning_rate": 0.0001, + "loss": 5.6706, + "loss/crossentropy": 2.555809736251831, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1669429913163185, + "step": 27728 + }, + { + "epoch": 0.8665625, + "grad_norm": 2.765625, + "grad_norm_var": 0.019896443684895834, + "learning_rate": 0.0001, + "loss": 5.4588, + "loss/crossentropy": 2.4327945709228516, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.15728867053985596, + "step": 27730 + }, + { + "epoch": 0.866625, + "grad_norm": 3.125, + "grad_norm_var": 0.016600545247395834, + "learning_rate": 0.0001, + "loss": 5.6227, + "loss/crossentropy": 2.5396151542663574, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16025733202695847, + "step": 27732 + }, + { + "epoch": 0.8666875, + "grad_norm": 3.15625, + "grad_norm_var": 0.018871053059895834, + "learning_rate": 0.0001, + "loss": 5.9417, + "loss/crossentropy": 2.8445409536361694, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16518766433000565, + "step": 27734 + }, + { + "epoch": 0.86675, + "grad_norm": 2.875, + "grad_norm_var": 0.0204986572265625, + "learning_rate": 0.0001, + "loss": 5.5576, + "loss/crossentropy": 2.480849266052246, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16158322989940643, + "step": 27736 + }, + { + "epoch": 0.8668125, + "grad_norm": 3.140625, + "grad_norm_var": 0.027730305989583332, + "learning_rate": 0.0001, + "loss": 5.6466, + "loss/crossentropy": 2.617606520652771, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15915010124444962, + "step": 27738 + }, + { + "epoch": 0.866875, + "grad_norm": 3.34375, + "grad_norm_var": 0.030427042643229166, + "learning_rate": 0.0001, + "loss": 5.7358, + "loss/crossentropy": 2.6470601558685303, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16316870599985123, + "step": 27740 + }, + { + "epoch": 0.8669375, + "grad_norm": 3.203125, + "grad_norm_var": 0.03192952473958333, + "learning_rate": 0.0001, + "loss": 5.6234, + "loss/crossentropy": 2.565005898475647, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1620922088623047, + "step": 27742 + }, + { + "epoch": 0.867, + "grad_norm": 2.984375, + "grad_norm_var": 0.033528645833333336, + "learning_rate": 0.0001, + "loss": 5.4992, + "loss/crossentropy": 2.540012001991272, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.15607950091362, + "step": 27744 + }, + { + "epoch": 0.8670625, + "grad_norm": 3.21875, + "grad_norm_var": 0.028449503580729167, + "learning_rate": 0.0001, + "loss": 5.7067, + "loss/crossentropy": 2.6245768070220947, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16289737820625305, + "step": 27746 + }, + { + "epoch": 0.867125, + "grad_norm": 2.953125, + "grad_norm_var": 0.0289703369140625, + "learning_rate": 0.0001, + "loss": 5.7415, + "loss/crossentropy": 2.6854456663131714, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1606845110654831, + "step": 27748 + }, + { + "epoch": 0.8671875, + "grad_norm": 3.1875, + "grad_norm_var": 0.028841145833333335, + "learning_rate": 0.0001, + "loss": 5.7299, + "loss/crossentropy": 2.5976706743240356, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1671339049935341, + "step": 27750 + }, + { + "epoch": 0.86725, + "grad_norm": 2.9375, + "grad_norm_var": 0.0320953369140625, + "learning_rate": 0.0001, + "loss": 5.4988, + "loss/crossentropy": 2.3713111877441406, + "loss/hidden": 1.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.16079971939325333, + "step": 27752 + }, + { + "epoch": 0.8673125, + "grad_norm": 3.75, + "grad_norm_var": 0.04864908854166667, + "learning_rate": 0.0001, + "loss": 5.8481, + "loss/crossentropy": 2.6818984746932983, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1701388955116272, + "step": 27754 + }, + { + "epoch": 0.867375, + "grad_norm": 3.109375, + "grad_norm_var": 0.04512430826822917, + "learning_rate": 0.0001, + "loss": 5.6312, + "loss/crossentropy": 2.5963293313980103, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15974120795726776, + "step": 27756 + }, + { + "epoch": 0.8674375, + "grad_norm": 2.984375, + "grad_norm_var": 0.0453125, + "learning_rate": 0.0001, + "loss": 5.3357, + "loss/crossentropy": 2.417480707168579, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.1511980965733528, + "step": 27758 + }, + { + "epoch": 0.8675, + "grad_norm": 2.859375, + "grad_norm_var": 0.0462310791015625, + "learning_rate": 0.0001, + "loss": 5.4992, + "loss/crossentropy": 2.5086978673934937, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15608545392751694, + "step": 27760 + }, + { + "epoch": 0.8675625, + "grad_norm": 3.015625, + "grad_norm_var": 0.0462310791015625, + "learning_rate": 0.0001, + "loss": 5.3478, + "loss/crossentropy": 2.4178558588027954, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15237433463335037, + "step": 27762 + }, + { + "epoch": 0.867625, + "grad_norm": 3.078125, + "grad_norm_var": 0.04551493326822917, + "learning_rate": 0.0001, + "loss": 5.8964, + "loss/crossentropy": 2.801416039466858, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1634017676115036, + "step": 27764 + }, + { + "epoch": 0.8676875, + "grad_norm": 2.890625, + "grad_norm_var": 0.0498687744140625, + "learning_rate": 0.0001, + "loss": 5.5029, + "loss/crossentropy": 2.491839647293091, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.15930673480033875, + "step": 27766 + }, + { + "epoch": 0.86775, + "grad_norm": 3.15625, + "grad_norm_var": 0.05849507649739583, + "learning_rate": 0.0001, + "loss": 5.8077, + "loss/crossentropy": 2.6288340091705322, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.16789048165082932, + "step": 27768 + }, + { + "epoch": 0.8678125, + "grad_norm": 2.9375, + "grad_norm_var": 0.04075419108072917, + "learning_rate": 0.0001, + "loss": 5.6089, + "loss/crossentropy": 2.5784683227539062, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.16085676103830338, + "step": 27770 + }, + { + "epoch": 0.867875, + "grad_norm": 3.0, + "grad_norm_var": 0.0421295166015625, + "learning_rate": 0.0001, + "loss": 5.7072, + "loss/crossentropy": 2.6747682094573975, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.160274475812912, + "step": 27772 + }, + { + "epoch": 0.8679375, + "grad_norm": 3.0, + "grad_norm_var": 0.04052327473958333, + "learning_rate": 0.0001, + "loss": 5.783, + "loss/crossentropy": 2.699275851249695, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.16189026832580566, + "step": 27774 + }, + { + "epoch": 0.868, + "grad_norm": 3.21875, + "grad_norm_var": 0.035807291666666664, + "learning_rate": 0.0001, + "loss": 5.8626, + "loss/crossentropy": 2.6744813919067383, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.170765720307827, + "step": 27776 + }, + { + "epoch": 0.8680625, + "grad_norm": 4.0, + "grad_norm_var": 0.08801676432291666, + "learning_rate": 0.0001, + "loss": 5.8071, + "loss/crossentropy": 2.5578606128692627, + "loss/hidden": 1.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.17414429783821106, + "step": 27778 + }, + { + "epoch": 0.868125, + "grad_norm": 3.03125, + "grad_norm_var": 0.08368733723958334, + "learning_rate": 0.0001, + "loss": 5.1385, + "loss/crossentropy": 2.221524477005005, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.1518501415848732, + "step": 27780 + }, + { + "epoch": 0.8681875, + "grad_norm": 3.15625, + "grad_norm_var": 0.08029683430989583, + "learning_rate": 0.0001, + "loss": 5.5732, + "loss/crossentropy": 2.4585397243499756, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1634185016155243, + "step": 27782 + }, + { + "epoch": 0.86825, + "grad_norm": 3.15625, + "grad_norm_var": 0.07093098958333334, + "learning_rate": 0.0001, + "loss": 5.6945, + "loss/crossentropy": 2.5165436267852783, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.1697457879781723, + "step": 27784 + }, + { + "epoch": 0.8683125, + "grad_norm": 2.9375, + "grad_norm_var": 0.0662750244140625, + "learning_rate": 0.0001, + "loss": 5.8333, + "loss/crossentropy": 2.7102712392807007, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1677682101726532, + "step": 27786 + }, + { + "epoch": 0.868375, + "grad_norm": 3.0, + "grad_norm_var": 0.06702372233072916, + "learning_rate": 0.0001, + "loss": 5.7649, + "loss/crossentropy": 2.6999846696853638, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15922239422798157, + "step": 27788 + }, + { + "epoch": 0.8684375, + "grad_norm": 3.0625, + "grad_norm_var": 0.06606343587239584, + "learning_rate": 0.0001, + "loss": 5.8227, + "loss/crossentropy": 2.7487266063690186, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15935276448726654, + "step": 27790 + }, + { + "epoch": 0.8685, + "grad_norm": 3.421875, + "grad_norm_var": 0.07010091145833333, + "learning_rate": 0.0001, + "loss": 5.5804, + "loss/crossentropy": 2.501925468444824, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15902023017406464, + "step": 27792 + }, + { + "epoch": 0.8685625, + "grad_norm": 3.0625, + "grad_norm_var": 0.02359619140625, + "learning_rate": 0.0001, + "loss": 5.7561, + "loss/crossentropy": 2.677756905555725, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16213340312242508, + "step": 27794 + }, + { + "epoch": 0.868625, + "grad_norm": 2.71875, + "grad_norm_var": 0.0349609375, + "learning_rate": 0.0001, + "loss": 5.6271, + "loss/crossentropy": 2.5794237852096558, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1590612530708313, + "step": 27796 + }, + { + "epoch": 0.8686875, + "grad_norm": 2.890625, + "grad_norm_var": 0.03434956868489583, + "learning_rate": 0.0001, + "loss": 5.7891, + "loss/crossentropy": 2.701713800430298, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16303890198469162, + "step": 27798 + }, + { + "epoch": 0.86875, + "grad_norm": 3.1875, + "grad_norm_var": 0.03466389973958333, + "learning_rate": 0.0001, + "loss": 5.489, + "loss/crossentropy": 2.4837100505828857, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.15599625557661057, + "step": 27800 + }, + { + "epoch": 0.8688125, + "grad_norm": 3.046875, + "grad_norm_var": 0.032938639322916664, + "learning_rate": 0.0001, + "loss": 5.4444, + "loss/crossentropy": 2.425615072250366, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15812792629003525, + "step": 27802 + }, + { + "epoch": 0.868875, + "grad_norm": 2.96875, + "grad_norm_var": 0.04101155598958333, + "learning_rate": 0.0001, + "loss": 5.4726, + "loss/crossentropy": 2.5447142124176025, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15216126292943954, + "step": 27804 + }, + { + "epoch": 0.8689375, + "grad_norm": 3.109375, + "grad_norm_var": 0.041731770833333334, + "learning_rate": 0.0001, + "loss": 5.5897, + "loss/crossentropy": 2.5250545740127563, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1627161130309105, + "step": 27806 + }, + { + "epoch": 0.869, + "grad_norm": 2.859375, + "grad_norm_var": 0.03648681640625, + "learning_rate": 0.0001, + "loss": 5.5572, + "loss/crossentropy": 2.523909568786621, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15918756276369095, + "step": 27808 + }, + { + "epoch": 0.8690625, + "grad_norm": 3.4375, + "grad_norm_var": 0.03961181640625, + "learning_rate": 0.0001, + "loss": 5.5009, + "loss/crossentropy": 2.442892074584961, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1581428349018097, + "step": 27810 + }, + { + "epoch": 0.869125, + "grad_norm": 2.9375, + "grad_norm_var": 0.0334136962890625, + "learning_rate": 0.0001, + "loss": 5.1772, + "loss/crossentropy": 2.2754937410354614, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.15032844245433807, + "step": 27812 + }, + { + "epoch": 0.8691875, + "grad_norm": 3.140625, + "grad_norm_var": 0.03062744140625, + "learning_rate": 0.0001, + "loss": 5.7346, + "loss/crossentropy": 2.6316630840301514, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1653677299618721, + "step": 27814 + }, + { + "epoch": 0.86925, + "grad_norm": 3.296875, + "grad_norm_var": 0.0335113525390625, + "learning_rate": 0.0001, + "loss": 5.5422, + "loss/crossentropy": 2.443118453025818, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1642000824213028, + "step": 27816 + }, + { + "epoch": 0.8693125, + "grad_norm": 3.21875, + "grad_norm_var": 0.03533528645833333, + "learning_rate": 0.0001, + "loss": 5.6963, + "loss/crossentropy": 2.551207184791565, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.16880378127098083, + "step": 27818 + }, + { + "epoch": 0.869375, + "grad_norm": 3.078125, + "grad_norm_var": 0.028107706705729166, + "learning_rate": 0.0001, + "loss": 5.5252, + "loss/crossentropy": 2.554909586906433, + "loss/hidden": 1.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.15718352049589157, + "step": 27820 + }, + { + "epoch": 0.8694375, + "grad_norm": 3.375, + "grad_norm_var": 0.04263916015625, + "learning_rate": 0.0001, + "loss": 5.7741, + "loss/crossentropy": 2.5711824893951416, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17302384972572327, + "step": 27822 + }, + { + "epoch": 0.8695, + "grad_norm": 2.875, + "grad_norm_var": 0.0422760009765625, + "learning_rate": 0.0001, + "loss": 5.6609, + "loss/crossentropy": 2.5747915506362915, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1636883169412613, + "step": 27824 + }, + { + "epoch": 0.8695625, + "grad_norm": 3.078125, + "grad_norm_var": 0.031769816080729166, + "learning_rate": 0.0001, + "loss": 5.5748, + "loss/crossentropy": 2.5290616750717163, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1553570255637169, + "step": 27826 + }, + { + "epoch": 0.869625, + "grad_norm": 3.015625, + "grad_norm_var": 0.026073201497395834, + "learning_rate": 0.0001, + "loss": 5.6205, + "loss/crossentropy": 2.5588048696517944, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.15890072286128998, + "step": 27828 + }, + { + "epoch": 0.8696875, + "grad_norm": 2.96875, + "grad_norm_var": 0.028023274739583333, + "learning_rate": 0.0001, + "loss": 5.5863, + "loss/crossentropy": 2.535132646560669, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.16097809374332428, + "step": 27830 + }, + { + "epoch": 0.86975, + "grad_norm": 3.359375, + "grad_norm_var": 0.045633951822916664, + "learning_rate": 0.0001, + "loss": 5.5593, + "loss/crossentropy": 2.5300434827804565, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15996113419532776, + "step": 27832 + }, + { + "epoch": 0.8698125, + "grad_norm": 2.796875, + "grad_norm_var": 0.0514556884765625, + "learning_rate": 0.0001, + "loss": 5.2111, + "loss/crossentropy": 2.2957112789154053, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1470032036304474, + "step": 27834 + }, + { + "epoch": 0.869875, + "grad_norm": 3.046875, + "grad_norm_var": 0.05061747233072917, + "learning_rate": 0.0001, + "loss": 5.7635, + "loss/crossentropy": 2.6749976873397827, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16276133060455322, + "step": 27836 + }, + { + "epoch": 0.8699375, + "grad_norm": 3.0625, + "grad_norm_var": 0.030223592122395834, + "learning_rate": 0.0001, + "loss": 5.5158, + "loss/crossentropy": 2.498887300491333, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1602827087044716, + "step": 27838 + }, + { + "epoch": 0.87, + "grad_norm": 3.125, + "grad_norm_var": 0.0294097900390625, + "learning_rate": 0.0001, + "loss": 5.7285, + "loss/crossentropy": 2.5993796586990356, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16447728872299194, + "step": 27840 + }, + { + "epoch": 0.8700625, + "grad_norm": 2.96875, + "grad_norm_var": 0.029588826497395835, + "learning_rate": 0.0001, + "loss": 5.9302, + "loss/crossentropy": 2.816034197807312, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1684448942542076, + "step": 27842 + }, + { + "epoch": 0.870125, + "grad_norm": 2.859375, + "grad_norm_var": 0.03131103515625, + "learning_rate": 0.0001, + "loss": 5.6262, + "loss/crossentropy": 2.6053075790405273, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1599065214395523, + "step": 27844 + }, + { + "epoch": 0.8701875, + "grad_norm": 2.9375, + "grad_norm_var": 0.029499308268229166, + "learning_rate": 0.0001, + "loss": 5.5842, + "loss/crossentropy": 2.5269936323165894, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.16119346767663956, + "step": 27846 + }, + { + "epoch": 0.87025, + "grad_norm": 3.0625, + "grad_norm_var": 0.021214803059895832, + "learning_rate": 0.0001, + "loss": 6.0234, + "loss/crossentropy": 2.9078108072280884, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1631210893392563, + "step": 27848 + }, + { + "epoch": 0.8703125, + "grad_norm": 3.046875, + "grad_norm_var": 0.018586222330729166, + "learning_rate": 0.0001, + "loss": 5.3611, + "loss/crossentropy": 2.36064875125885, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.15512390434741974, + "step": 27850 + }, + { + "epoch": 0.870375, + "grad_norm": 3.234375, + "grad_norm_var": 0.028522745768229166, + "learning_rate": 0.0001, + "loss": 5.4436, + "loss/crossentropy": 2.451321840286255, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.1582152545452118, + "step": 27852 + }, + { + "epoch": 0.8704375, + "grad_norm": 2.890625, + "grad_norm_var": 0.0332672119140625, + "learning_rate": 0.0001, + "loss": 5.163, + "loss/crossentropy": 2.3075605630874634, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.14335403591394424, + "step": 27854 + }, + { + "epoch": 0.8705, + "grad_norm": 2.875, + "grad_norm_var": 0.029637654622395832, + "learning_rate": 0.0001, + "loss": 5.7182, + "loss/crossentropy": 2.655929207801819, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.16521596908569336, + "step": 27856 + }, + { + "epoch": 0.8705625, + "grad_norm": 3.1875, + "grad_norm_var": 0.035237630208333336, + "learning_rate": 0.0001, + "loss": 5.5307, + "loss/crossentropy": 2.4572445154190063, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1659395396709442, + "step": 27858 + }, + { + "epoch": 0.870625, + "grad_norm": 2.953125, + "grad_norm_var": 0.03640034993489583, + "learning_rate": 0.0001, + "loss": 5.5832, + "loss/crossentropy": 2.559994101524353, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.16052212566137314, + "step": 27860 + }, + { + "epoch": 0.8706875, + "grad_norm": 3.40625, + "grad_norm_var": 0.0455718994140625, + "learning_rate": 0.0001, + "loss": 5.6618, + "loss/crossentropy": 2.6116913557052612, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.16203808784484863, + "step": 27862 + }, + { + "epoch": 0.87075, + "grad_norm": 3.0625, + "grad_norm_var": 0.039697265625, + "learning_rate": 0.0001, + "loss": 5.5123, + "loss/crossentropy": 2.436099648475647, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1607440710067749, + "step": 27864 + }, + { + "epoch": 0.8708125, + "grad_norm": 3.015625, + "grad_norm_var": 0.04202067057291667, + "learning_rate": 0.0001, + "loss": 5.4329, + "loss/crossentropy": 2.4671441316604614, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15438677370548248, + "step": 27866 + }, + { + "epoch": 0.870875, + "grad_norm": 3.140625, + "grad_norm_var": 0.031615193684895834, + "learning_rate": 0.0001, + "loss": 5.5803, + "loss/crossentropy": 2.4586938619613647, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16489533334970474, + "step": 27868 + }, + { + "epoch": 0.8709375, + "grad_norm": 3.1875, + "grad_norm_var": 0.027229817708333333, + "learning_rate": 0.0001, + "loss": 5.4877, + "loss/crossentropy": 2.4380314350128174, + "loss/hidden": 1.5, + "loss/jsd": 0.0, + "loss/logits": 0.15496237576007843, + "step": 27870 + }, + { + "epoch": 0.871, + "grad_norm": 3.25, + "grad_norm_var": 0.05241597493489583, + "learning_rate": 0.0001, + "loss": 5.6678, + "loss/crossentropy": 2.560866951942444, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16460144519805908, + "step": 27872 + }, + { + "epoch": 0.8710625, + "grad_norm": 3.0, + "grad_norm_var": 0.05629781087239583, + "learning_rate": 0.0001, + "loss": 5.0139, + "loss/crossentropy": 2.1779093146324158, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1390697881579399, + "step": 27874 + }, + { + "epoch": 0.871125, + "grad_norm": 2.953125, + "grad_norm_var": 0.0569976806640625, + "learning_rate": 0.0001, + "loss": 5.7047, + "loss/crossentropy": 2.626152992248535, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16058950126171112, + "step": 27876 + }, + { + "epoch": 0.8711875, + "grad_norm": 2.96875, + "grad_norm_var": 0.054539998372395836, + "learning_rate": 0.0001, + "loss": 5.8489, + "loss/crossentropy": 2.699462413787842, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.17041077464818954, + "step": 27878 + }, + { + "epoch": 0.87125, + "grad_norm": 3.234375, + "grad_norm_var": 0.05657145182291667, + "learning_rate": 0.0001, + "loss": 5.6879, + "loss/crossentropy": 2.6047691106796265, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1657308042049408, + "step": 27880 + }, + { + "epoch": 0.8713125, + "grad_norm": 2.90625, + "grad_norm_var": 0.05953369140625, + "learning_rate": 0.0001, + "loss": 5.5722, + "loss/crossentropy": 2.595220685005188, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1527717337012291, + "step": 27882 + }, + { + "epoch": 0.871375, + "grad_norm": 3.21875, + "grad_norm_var": 0.060498046875, + "learning_rate": 0.0001, + "loss": 5.7597, + "loss/crossentropy": 2.5783873796463013, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.16930709779262543, + "step": 27884 + }, + { + "epoch": 0.8714375, + "grad_norm": 3.109375, + "grad_norm_var": 0.053023274739583334, + "learning_rate": 0.0001, + "loss": 5.5359, + "loss/crossentropy": 2.4618382453918457, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.15936200320720673, + "step": 27886 + }, + { + "epoch": 0.8715, + "grad_norm": 3.125, + "grad_norm_var": 0.00953369140625, + "learning_rate": 0.0001, + "loss": 5.6499, + "loss/crossentropy": 2.5861200094223022, + "loss/hidden": 1.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.15754631161689758, + "step": 27888 + }, + { + "epoch": 0.8715625, + "grad_norm": 3.03125, + "grad_norm_var": 0.011083984375, + "learning_rate": 0.0001, + "loss": 5.389, + "loss/crossentropy": 2.3745484352111816, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1522216796875, + "step": 27890 + }, + { + "epoch": 0.871625, + "grad_norm": 2.828125, + "grad_norm_var": 0.014078776041666666, + "learning_rate": 0.0001, + "loss": 5.5995, + "loss/crossentropy": 2.5988909006118774, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15748737752437592, + "step": 27892 + }, + { + "epoch": 0.8716875, + "grad_norm": 2.921875, + "grad_norm_var": 0.014274088541666667, + "learning_rate": 0.0001, + "loss": 5.5922, + "loss/crossentropy": 2.5821398496627808, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1564764752984047, + "step": 27894 + }, + { + "epoch": 0.87175, + "grad_norm": 3.09375, + "grad_norm_var": 0.01207275390625, + "learning_rate": 0.0001, + "loss": 5.6847, + "loss/crossentropy": 2.625950574874878, + "loss/hidden": 1.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1613416001200676, + "step": 27896 + }, + { + "epoch": 0.8718125, + "grad_norm": 2.984375, + "grad_norm_var": 0.01207275390625, + "learning_rate": 0.0001, + "loss": 5.5418, + "loss/crossentropy": 2.5476146936416626, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1572282910346985, + "step": 27898 + }, + { + "epoch": 0.871875, + "grad_norm": 2.859375, + "grad_norm_var": 0.012272135416666666, + "learning_rate": 0.0001, + "loss": 5.416, + "loss/crossentropy": 2.4924851655960083, + "loss/hidden": 1.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.1513383388519287, + "step": 27900 + }, + { + "epoch": 0.8719375, + "grad_norm": 2.9375, + "grad_norm_var": 0.0104644775390625, + "learning_rate": 0.0001, + "loss": 5.4087, + "loss/crossentropy": 2.4145772457122803, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.1556660234928131, + "step": 27902 + }, + { + "epoch": 0.872, + "grad_norm": 2.984375, + "grad_norm_var": 0.048273722330729164, + "learning_rate": 0.0001, + "loss": 5.7284, + "loss/crossentropy": 2.542745590209961, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.17129983752965927, + "step": 27904 + }, + { + "epoch": 0.8720625, + "grad_norm": 3.046875, + "grad_norm_var": 0.04576416015625, + "learning_rate": 0.0001, + "loss": 5.5258, + "loss/crossentropy": 2.5590767860412598, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.1552676558494568, + "step": 27906 + }, + { + "epoch": 0.872125, + "grad_norm": 3.21875, + "grad_norm_var": 0.051854451497395836, + "learning_rate": 0.0001, + "loss": 5.6264, + "loss/crossentropy": 2.5522128343582153, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.1601487547159195, + "step": 27908 + }, + { + "epoch": 0.8721875, + "grad_norm": 2.828125, + "grad_norm_var": 0.055887858072916664, + "learning_rate": 0.0001, + "loss": 5.3312, + "loss/crossentropy": 2.476024866104126, + "loss/hidden": 1.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.14723709970712662, + "step": 27910 + }, + { + "epoch": 0.87225, + "grad_norm": 2.828125, + "grad_norm_var": 0.05878499348958333, + "learning_rate": 0.0001, + "loss": 5.571, + "loss/crossentropy": 2.498559832572937, + "loss/hidden": 1.5390625, + "loss/jsd": 0.0, + "loss/logits": 0.15333276242017746, + "step": 27912 + }, + { + "epoch": 0.8723125, + "grad_norm": 2.890625, + "grad_norm_var": 0.05969950358072917, + "learning_rate": 0.0001, + "loss": 5.4344, + "loss/crossentropy": 2.528977632522583, + "loss/hidden": 1.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.15030576288700104, + "step": 27914 + }, + { + "epoch": 0.872375, + "grad_norm": 3.09375, + "grad_norm_var": 0.05712483723958333, + "learning_rate": 0.0001, + "loss": 5.5819, + "loss/crossentropy": 2.4894860982894897, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16432037949562073, + "step": 27916 + }, + { + "epoch": 0.8724375, + "grad_norm": 2.921875, + "grad_norm_var": 0.0864410400390625, + "learning_rate": 0.0001, + "loss": 5.7793, + "loss/crossentropy": 2.604734182357788, + "loss/hidden": 1.484375, + "loss/jsd": 0.0, + "loss/logits": 0.16901922971010208, + "step": 27918 + }, + { + "epoch": 0.8725, + "grad_norm": 2.921875, + "grad_norm_var": 0.05397847493489583, + "learning_rate": 0.0001, + "loss": 5.0961, + "loss/crossentropy": 2.179744601249695, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.14632698148488998, + "step": 27920 + }, + { + "epoch": 0.8725625, + "grad_norm": 3.234375, + "grad_norm_var": 0.059407552083333336, + "learning_rate": 0.0001, + "loss": 5.4898, + "loss/crossentropy": 2.4999831914901733, + "loss/hidden": 1.421875, + "loss/jsd": 0.0, + "loss/logits": 0.15679404884576797, + "step": 27922 + }, + { + "epoch": 0.872625, + "grad_norm": 3.15625, + "grad_norm_var": 0.051569620768229164, + "learning_rate": 0.0001, + "loss": 5.3989, + "loss/crossentropy": 2.3952577114105225, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.15426691621541977, + "step": 27924 + }, + { + "epoch": 0.8726875, + "grad_norm": 3.484375, + "grad_norm_var": 0.061986287434895836, + "learning_rate": 0.0001, + "loss": 5.4991, + "loss/crossentropy": 2.388551950454712, + "loss/hidden": 1.50390625, + "loss/jsd": 0.0, + "loss/logits": 0.1606658473610878, + "step": 27926 + }, + { + "epoch": 0.87275, + "grad_norm": 3.3125, + "grad_norm_var": 0.0607330322265625, + "learning_rate": 0.0001, + "loss": 5.8942, + "loss/crossentropy": 2.658738136291504, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.17432327568531036, + "step": 27928 + }, + { + "epoch": 0.8728125, + "grad_norm": 3.34375, + "grad_norm_var": 0.060888671875, + "learning_rate": 0.0001, + "loss": 5.738, + "loss/crossentropy": 2.5245888233184814, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.17368977516889572, + "step": 27930 + }, + { + "epoch": 0.872875, + "grad_norm": 3.265625, + "grad_norm_var": 0.06109619140625, + "learning_rate": 0.0001, + "loss": 5.7505, + "loss/crossentropy": 2.649577260017395, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16087592393159866, + "step": 27932 + }, + { + "epoch": 0.8729375, + "grad_norm": 3.125, + "grad_norm_var": 0.03780924479166667, + "learning_rate": 0.0001, + "loss": 5.7636, + "loss/crossentropy": 2.5796053409576416, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.1719168797135353, + "step": 27934 + }, + { + "epoch": 0.873, + "grad_norm": 2.90625, + "grad_norm_var": 0.04420166015625, + "learning_rate": 0.0001, + "loss": 5.4642, + "loss/crossentropy": 2.502958655357361, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15550021082162857, + "step": 27936 + }, + { + "epoch": 0.8730625, + "grad_norm": 3.015625, + "grad_norm_var": 0.03916015625, + "learning_rate": 0.0001, + "loss": 5.7373, + "loss/crossentropy": 2.6349422931671143, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16297203302383423, + "step": 27938 + }, + { + "epoch": 0.873125, + "grad_norm": 3.0625, + "grad_norm_var": 0.041748046875, + "learning_rate": 0.0001, + "loss": 5.7272, + "loss/crossentropy": 2.6609092950820923, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16171126067638397, + "step": 27940 + }, + { + "epoch": 0.8731875, + "grad_norm": 2.921875, + "grad_norm_var": 0.02720947265625, + "learning_rate": 0.0001, + "loss": 5.6362, + "loss/crossentropy": 2.5398753881454468, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.16041860729455948, + "step": 27942 + }, + { + "epoch": 0.87325, + "grad_norm": 2.859375, + "grad_norm_var": 0.026414998372395835, + "learning_rate": 0.0001, + "loss": 5.7588, + "loss/crossentropy": 2.6977330446243286, + "loss/hidden": 1.453125, + "loss/jsd": 0.0, + "loss/logits": 0.16079582273960114, + "step": 27944 + }, + { + "epoch": 0.8733125, + "grad_norm": 2.859375, + "grad_norm_var": 0.019189453125, + "learning_rate": 0.0001, + "loss": 5.2738, + "loss/crossentropy": 2.329177975654602, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.15305107086896896, + "step": 27946 + }, + { + "epoch": 0.873375, + "grad_norm": 3.53125, + "grad_norm_var": 0.03835347493489583, + "learning_rate": 0.0001, + "loss": 5.7664, + "loss/crossentropy": 2.633378744125366, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16525045782327652, + "step": 27948 + }, + { + "epoch": 0.8734375, + "grad_norm": 3.125, + "grad_norm_var": 0.0434478759765625, + "learning_rate": 0.0001, + "loss": 5.8757, + "loss/crossentropy": 2.620681881904602, + "loss/hidden": 1.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.17902208864688873, + "step": 27950 + }, + { + "epoch": 0.8735, + "grad_norm": 2.875, + "grad_norm_var": 0.041413370768229166, + "learning_rate": 0.0001, + "loss": 5.3654, + "loss/crossentropy": 2.3401836156845093, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15682092308998108, + "step": 27952 + }, + { + "epoch": 0.8735625, + "grad_norm": 3.328125, + "grad_norm_var": 0.04488525390625, + "learning_rate": 0.0001, + "loss": 5.5194, + "loss/crossentropy": 2.421567916870117, + "loss/hidden": 1.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.16486230492591858, + "step": 27954 + }, + { + "epoch": 0.873625, + "grad_norm": 2.984375, + "grad_norm_var": 0.0427642822265625, + "learning_rate": 0.0001, + "loss": 5.7097, + "loss/crossentropy": 2.6570467948913574, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.16151063144207, + "step": 27956 + }, + { + "epoch": 0.8736875, + "grad_norm": 3.046875, + "grad_norm_var": 0.04407450358072917, + "learning_rate": 0.0001, + "loss": 5.437, + "loss/crossentropy": 2.421030282974243, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.159804105758667, + "step": 27958 + }, + { + "epoch": 0.87375, + "grad_norm": 2.890625, + "grad_norm_var": 0.043553670247395836, + "learning_rate": 0.0001, + "loss": 5.3645, + "loss/crossentropy": 2.3721309900283813, + "loss/hidden": 1.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.15665796399116516, + "step": 27960 + }, + { + "epoch": 0.8738125, + "grad_norm": 3.15625, + "grad_norm_var": 0.03753255208333333, + "learning_rate": 0.0001, + "loss": 5.6812, + "loss/crossentropy": 2.558328866958618, + "loss/hidden": 1.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.16619154810905457, + "step": 27962 + }, + { + "epoch": 0.873875, + "grad_norm": 3.3125, + "grad_norm_var": 57.6794179280599, + "learning_rate": 0.0001, + "loss": 6.8675, + "loss/crossentropy": 2.5392621755599976, + "loss/hidden": 2.33203125, + "loss/jsd": 0.0, + "loss/logits": 0.19961903989315033, + "step": 27964 + }, + { + "epoch": 0.8739375, + "grad_norm": 3.25, + "grad_norm_var": 57.57952067057292, + "learning_rate": 0.0001, + "loss": 6.1596, + "loss/crossentropy": 2.856844186782837, + "loss/hidden": 1.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1810523048043251, + "step": 27966 + }, + { + "epoch": 0.874, + "grad_norm": 3.15625, + "grad_norm_var": 57.439134724934895, + "learning_rate": 0.0001, + "loss": 5.7996, + "loss/crossentropy": 2.6578762531280518, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.16612619161605835, + "step": 27968 + }, + { + "epoch": 0.8740625, + "grad_norm": 2.984375, + "grad_norm_var": 57.474690755208336, + "learning_rate": 0.0001, + "loss": 5.4024, + "loss/crossentropy": 2.3677303791046143, + "loss/hidden": 1.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.15932901203632355, + "step": 27970 + }, + { + "epoch": 0.874125, + "grad_norm": 2.875, + "grad_norm_var": 57.57089436848958, + "learning_rate": 0.0001, + "loss": 5.2109, + "loss/crossentropy": 2.308212399482727, + "loss/hidden": 1.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.14886417984962463, + "step": 27972 + }, + { + "epoch": 0.8741875, + "grad_norm": 2.890625, + "grad_norm_var": 57.66669514973958, + "learning_rate": 0.0001, + "loss": 5.3797, + "loss/crossentropy": 2.4334518909454346, + "loss/hidden": 1.40625, + "loss/jsd": 0.0, + "loss/logits": 0.15399521589279175, + "step": 27974 + }, + { + "epoch": 0.87425, + "grad_norm": 3.5, + "grad_norm_var": 57.528706868489586, + "learning_rate": 0.0001, + "loss": 5.5566, + "loss/crossentropy": 2.4991366863250732, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1600421965122223, + "step": 27976 + }, + { + "epoch": 0.8743125, + "grad_norm": 3.375, + "grad_norm_var": 57.4717763264974, + "learning_rate": 0.0001, + "loss": 6.0771, + "loss/crossentropy": 2.8167072534561157, + "loss/hidden": 1.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.1783793643116951, + "step": 27978 + }, + { + "epoch": 0.874375, + "grad_norm": 3.21875, + "grad_norm_var": 0.06281636555989584, + "learning_rate": 0.0001, + "loss": 5.7711, + "loss/crossentropy": 2.637932062149048, + "loss/hidden": 1.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.16370397806167603, + "step": 27980 + }, + { + "epoch": 0.8744375, + "grad_norm": 2.796875, + "grad_norm_var": 0.04554036458333333, + "learning_rate": 0.0001, + "loss": 5.3857, + "loss/crossentropy": 2.453414559364319, + "loss/hidden": 1.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.15299443900585175, + "step": 27982 + }, + { + "epoch": 0.8745, + "grad_norm": 3.1875, + "grad_norm_var": 0.038623046875, + "learning_rate": 0.0001, + "loss": 5.8691, + "loss/crossentropy": 2.6978119611740112, + "loss/hidden": 1.46875, + "loss/jsd": 0.0, + "loss/logits": 0.17025170475244522, + "step": 27984 + }, + { + "epoch": 0.8745625, + "grad_norm": 3.15625, + "grad_norm_var": 0.98033447265625, + "learning_rate": 0.0001, + "loss": 5.6903, + "loss/crossentropy": 2.477673649787903, + "loss/hidden": 1.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.17321471869945526, + "step": 27986 + }, + { + "epoch": 0.874625, + "grad_norm": 3.109375, + "grad_norm_var": 0.9660308837890625, + "learning_rate": 0.0001, + "loss": 5.6387, + "loss/crossentropy": 2.6202985048294067, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15809043496847153, + "step": 27988 + }, + { + "epoch": 0.8746875, + "grad_norm": 3.34375, + "grad_norm_var": 0.9475870768229167, + "learning_rate": 0.0001, + "loss": 5.7237, + "loss/crossentropy": 2.6030184030532837, + "loss/hidden": 1.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.16871295869350433, + "step": 27990 + }, + { + "epoch": 0.87475, + "grad_norm": 2.984375, + "grad_norm_var": 0.9591135660807292, + "learning_rate": 0.0001, + "loss": 5.7788, + "loss/crossentropy": 2.6670650243759155, + "loss/hidden": 1.47265625, + "loss/jsd": 0.0, + "loss/logits": 0.16391193866729736, + "step": 27992 + }, + { + "epoch": 0.8748125, + "grad_norm": 3.078125, + "grad_norm_var": 0.9908437093098958, + "learning_rate": 0.0001, + "loss": 5.3174, + "loss/crossentropy": 2.3251583576202393, + "loss/hidden": 1.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.15352027863264084, + "step": 27994 + }, + { + "epoch": 0.874875, + "grad_norm": 2.71875, + "grad_norm_var": 1.0175120035807292, + "learning_rate": 0.0001, + "loss": 5.5982, + "loss/crossentropy": 2.5926759243011475, + "loss/hidden": 1.4375, + "loss/jsd": 0.0, + "loss/logits": 0.15680398046970367, + "step": 27996 + }, + { + "epoch": 0.8749375, + "grad_norm": 2.96875, + "grad_norm_var": 1.0045562744140626, + "learning_rate": 0.0001, + "loss": 5.5022, + "loss/crossentropy": 2.51913321018219, + "loss/hidden": 1.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.15533510595560074, + "step": 27998 + }, + { + "epoch": 0.875, + "grad_norm": 2.8125, + "grad_norm_var": 1.0251617431640625, + "learning_rate": 0.0001, + "loss": 5.7521, + "loss/crossentropy": 2.7920806407928467, + "loss/hidden": 1.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.1542009562253952, + "step": 28000 + } + ], + "logging_steps": 2, + "max_steps": 32000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 4000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.61518537375744e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}